Using `tables_io` to write to an hdf5 file from multiple places

# Standard imports
import os
from collections import OrderedDict
import numpy as np
import tables_io
#from ..tests.testUtils import make_test_data
import astropy.table as apTable
from mpi4py import MPI

---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
Cell In[1], line 7
      5 import tables_io
      6 #from ..tests.testUtils import make_test_data
----> 7 import astropy.table as apTable
      8 from mpi4py import MPI

ModuleNotFoundError: No module named 'astropy'

Set up

Some test data.

OK, let's make some test data and write it to an HDF5 file.

def make_test_data():
    """Build a randomly filled sample table plus a small companion table.

    Returns
    -------
    dict
        ``'data'``: an astropy ``Table`` with 1000 rows and the columns
        ``scalar`` (one value per row), ``vect`` (20 values per row) and
        ``matrix`` (a 5x5 block per row), all drawn from
        ``np.random.uniform``; its ``meta`` holds the keys ``a``, ``b``
        and ``c``.
        ``'md'``: an astropy ``Table`` with 21 rows, a column ``a`` of ones
        and a column ``b`` of zeros; its ``meta`` holds ``small=True``.
    """
    n_rows, vect_len, mat_dim = 1000, 20, 5

    # Draw order (scalar, vect, matrix) is kept so a seeded generator is
    # consumed in the same sequence.
    columns = dict(
        scalar=np.random.uniform(size=n_rows),
        vect=np.random.uniform(size=(n_rows, vect_len)),
        matrix=np.random.uniform(size=(n_rows, mat_dim, mat_dim)),
    )
    main_table = apTable.Table(columns)
    main_table.meta["a"] = 1
    main_table.meta["b"] = None
    main_table.meta["c"] = [3, 4, 5]

    md_table = apTable.Table(dict(a=np.ones(21), b=np.zeros(21)))
    md_table.meta["small"] = True

    return {"data": main_table, "md": md_table}

# One sample: a dict holding the main table under 'data' and the small table under 'md'.
data = make_test_data()

Some utility functions to get the output data shape

def get_shapes_and_type(aDict):
    """Map each key of ``aDict`` to the ``(shape, dtype)`` of its value.

    ``aDict`` must provide ``.items()``, and every value must expose
    ``.shape`` and ``.dtype``.  The result is an ``OrderedDict`` in the
    iteration order of ``aDict``.
    """
    summary = OrderedDict()
    for name, column in aDict.items():
        summary[name] = (column.shape, column.dtype)
    return summary

def modify_shape(shapeIn, length):
    """Return ``shapeIn`` as a tuple with its first entry replaced by ``length``.

    ``shapeIn`` itself is not modified.  An empty ``shapeIn`` raises
    ``IndexError`` because there is no first entry to replace.
    """
    dims = [*shapeIn]
    dims[0] = length
    return tuple(dims)

def set_lengths(aDict, length):
    """Return a copy of a ``{key: (shape, dtype)}`` mapping with resized shapes.

    For every entry, the first element (the shape) goes through
    ``modify_shape`` so that its leading entry becomes ``length``; the second
    element is carried over unchanged.  The result is an ``OrderedDict`` in
    the iteration order of ``aDict``.
    """
    resized = OrderedDict()
    for name, spec in aDict.items():
        resized[name] = (modify_shape(spec[0], length), spec[1])
    return resized

Create the file

Here we get the output shapes and data types

# Take the per-column shapes and dtypes from the sample table, then set the
# leading dimension of every column to 10000.
sample_layout = get_shapes_and_type(data['data'])
dout = {'data': set_lengths(sample_layout, 10000)}

print(dout)

{'data': OrderedDict({'scalar': ((10000,), dtype('float64')), 'vect': ((10000, 20), dtype('float64')), 'matrix': ((10000, 5, 5), dtype('float64'))})}

Here we allocate the file

from tables_io import hdf5

# Remove output left over from a previous run.  A missing file is not an
# error here (e.g. on the first run), so only that specific case is ignored.
try:
    os.unlink('test_multi_write.hdf5')
except FileNotFoundError:
    pass

# Allocate the datasets described by `dout`.  The path must be the same file
# that is removed above and read back in the "Test the file" section; the
# original wrote to 'test_mpi_write.hdf5', so the read-back never saw it.
groups, fout = hdf5.initialize_HDF5_write('test_multi_write.hdf5', comm=MPI.COMM_WORLD, **dout)

Write the file

Here we fill the file

# Fill the file in 10 consecutive slices of 1000 rows, each with a fresh sample.
for i in range(10):
    start, end = i * 1000, (i + 1) * 1000
    data = make_test_data()
    hdf5.write_dict_to_HDF5_chunk(groups, data, start, end)

Here we write the metadata and close the file

# `data` is the sample from the last pass of the write loop; the columns of its
# small 'md' table are passed as keyword arguments alongside the name 'md'.
# NOTE(review): presumably this stores them as the 'md' group and closes `fout` — confirm against tables_io.
hdf5.finalize_HDF5_write(fout, 'md', **data['md'])

Test the file

Here we read the file in chunks

# Iterate over the 'data' group in pieces of 1000 rows and print, for each
# piece, its first two entries followed by the shapes of the three columns.
for chunks in tables_io.io.iterHdf5ToDict('test_multi_write.hdf5', 1000, 'data'):
    first, last, arrays = chunks[0], chunks[1], chunks[2]
    shapes = (arrays['matrix'].shape, arrays['vect'].shape, arrays['scalar'].shape)
    print("%i:%i %s %s %s" % ((first, last) + shapes))