Using `tables_io` to iterate over hdf5 files

# Standard imports
import os
import numpy as np
import tables_io
import astropy.table as apTable
#from tables_io.testUtils import make_test_data

---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
Cell In[1], line 5
      3 import numpy as np
      4 import tables_io
----> 5 import astropy.table as apTable
      6 #from tables_io.testUtils import make_test_data

ModuleNotFoundError: No module named 'astropy'

Some test data.

OK, let's make some test data and write it to an HDF5 file.

def make_test_data(nrow=1000, vect_size=20, mat_size=5):
    """Build two astropy tables of example data for the I/O demonstrations.

    Parameters
    ----------
    nrow : int, optional
        Number of rows in the main table (default 1000).
    vect_size : int, optional
        Length of each row of the ``vect`` column (default 20).
    mat_size : int, optional
        Side length of the square per-row ``matrix`` column (default 5).

    Returns
    -------
    dict
        ``"data"``: table with uniformly random columns ``scalar`` of shape
        ``(nrow,)``, ``vect`` of shape ``(nrow, vect_size)`` and ``matrix``
        of shape ``(nrow, mat_size, mat_size)``, plus metadata entries
        ``a``, ``b`` and ``c``.
        ``"md"``: 21-row table with column ``a`` (all ones) and column ``b``
        (all zeros), with metadata entry ``small`` set to ``True``.
    """
    # Drawing directly into the target shape fills values in the same
    # (row-major) order as drawing a flat array and reshaping it.
    scalar = np.random.uniform(size=nrow)
    vect = np.random.uniform(size=(nrow, vect_size))
    matrix = np.random.uniform(size=(nrow, mat_size, mat_size))

    table = apTable.Table(dict(scalar=scalar, vect=vect, matrix=matrix))
    # Metadata of mixed kinds: an int, None, and a list.
    table.meta["a"] = 1
    table.meta["b"] = None
    table.meta["c"] = [3, 4, 5]

    small_table = apTable.Table(dict(a=np.ones(21), b=np.zeros(21)))
    small_table.meta["small"] = True

    return dict(data=table, md=small_table)

# Generate the example tables and hand them to tables_io for writing.
# NOTE(review): later cells read 'test_data.hdf5', so the 'hdf5' format
# presumably appends that suffix to the 'test_data' base name -- confirm.
data = make_test_data()
td = tables_io.write(data, 'test_data', 'hdf5')

# IPython shell escape: list the files whose names start with "test_data".
! ls test_data*

Let’s see how many rows the table has.

# Query the length of the 'data' group stored in test_data.hdf5 and print it.
nrow = tables_io.io.getInputDataLengthHdf5('test_data.hdf5', 'data')
print(nrow)

And let’s iterate over the data in chunks

# Step through the 'data' group piece by piece; each yielded item is unpacked
# into its first two integer fields and the dict of column arrays.
for start, end, chunk in tables_io.io.iterHdf5ToDict('test_data.hdf5', 100, 'data'):
    matrix_shape = chunk['matrix'].shape
    vect_shape = chunk['vect'].shape
    scalar_shape = chunk['scalar'].shape
    print("%i:%i %s %s %s" % (start, end, matrix_shape, vect_shape, scalar_shape))

If you want to use MPI, you can pass the rank of the current process and the total number of MPI processes (the `rank` and `parallel_size` arguments) so that each process iterates over only the data chunks assigned to it.

# Same walk as a single participant (rank 0) out of parallel_size=3; each
# yielded item is unpacked into its first two integer fields and the dict of
# column arrays.
for start, end, chunk in tables_io.io.iterHdf5ToDict('test_data.hdf5', 100, 'data',
                                                     rank=0, parallel_size=3):
    matrix_shape = chunk['matrix'].shape
    vect_shape = chunk['vect'].shape
    scalar_shape = chunk['scalar'].shape
    print("%i:%i %s %s %s" % (start, end, matrix_shape, vect_shape, scalar_shape))