Using tables_io to iterate over hdf5 files
# Standard imports
import os
import numpy as np
import tables_io
import astropy.table as apTable
#from tables_io.testUtils import make_test_data
---------------------------------------------------------------------------
ModuleNotFoundError Traceback (most recent call last)
Cell In[1], line 5
3 import numpy as np
4 import tables_io
----> 5 import astropy.table as apTable
6 #from tables_io.testUtils import make_test_data
ModuleNotFoundError: No module named 'astropy'
Some test data.
OK, let's make some test data and write it to an HDF5 file.
def make_test_data():
    """Build two small astropy tables to use as test input.

    Returns
    -------
    dict
        ``"data"``: a 1000-row table of uniform random numbers with columns
        ``scalar`` (one value per row), ``vect`` (20 values per row) and
        ``matrix`` (5 x 5 values per row), carrying metadata keys
        ``a``, ``b`` and ``c``.
        ``"md"``: a 21-row table with column ``a`` all ones and column ``b``
        all zeros, carrying metadata key ``small``.
    """
    n_rows, n_vect, n_mat = 1000, 20, 5

    # Columns are drawn in the order scalar, vect, matrix.
    columns = {
        "scalar": np.random.uniform(size=n_rows),
        "vect": np.random.uniform(size=(n_rows, n_vect)),
        "matrix": np.random.uniform(size=(n_rows, n_mat, n_mat)),
    }
    main_table = apTable.Table(columns)
    main_table.meta.update({"a": 1, "b": None, "c": [3, 4, 5]})

    side_table = apTable.Table({"a": np.ones(21), "b": np.zeros(21)})
    side_table.meta["small"] = True

    return {"data": main_table, "md": side_table}
# Build the test tables and hand them to tables_io.write with the base name
# 'test_data' and the format string 'hdf5'.
data = make_test_data()
# The return value is kept in `td`; it is not used in the cells shown here.
td = tables_io.write(data, 'test_data', 'hdf5')
# IPython shell escape: list the files whose names start with 'test_data'
# (the later cells read 'test_data.hdf5').
! ls test_data*
Let’s see how long the table is
# Ask tables_io for the length of the 'data' group in 'test_data.hdf5'
# (presumably its number of rows) and show it.
nrow = tables_io.io.getInputDataLengthHdf5('test_data.hdf5', 'data')
print(nrow)
And let’s iterate over the data in chunks
# Walk the 'data' group of the file with a chunk size of 100. Each yielded item
# holds two integers (presumably the start and end row of the chunk) followed by
# a mapping from column name to array.
for chunks in tables_io.io.iterHdf5ToDict('test_data.hdf5', 100, 'data'):
    row_start, row_stop, columns = chunks
    shapes = [columns[name].shape for name in ('matrix', 'vect', 'scalar')]
    print("%i:%i %s %s %s" % (row_start, row_stop, *shapes))
If you want to use MPI you can specify the rank and MPI size to only iterate through the data chunks that correspond to the current node.
# Same walk as a single-process run would do, but passing rank=0 and
# parallel_size=3 so that only the chunks assigned to that rank are yielded.
for chunks in tables_io.io.iterHdf5ToDict(
    'test_data.hdf5', 100, 'data', rank=0, parallel_size=3
):
    row_start, row_stop, columns = chunks
    shapes = [columns[name].shape for name in ('matrix', 'vect', 'scalar')]
    print("%i:%i %s %s %s" % (row_start, row_stop, *shapes))