Source code for imagine.tools.io_handler

"""
The io_handler class is designed for IMAGINE I/O with HDF5+MPI,
but parallel HDF5 is not required.

There are two types of data reading,
corresponding to the data types defined in the Observable class.

    1. for reading 'measured' data (including mask maps),
    each node reads the full data.
    'read_copy' is designed for this case.

    2. for reading 'covariance' data,
    each node reads a certain rows.
    'read_dist' is designed for this case.

We do not require writing in parallel since the output workload is not heavy.
And there are also two types of data writing out, corresponds to the reading,
i.e., 'write_copy' and 'write_dist'.

For the testing suits, please turn to "imagine/tests/tools_tests.py".
"""

# %% IMPORTS
# Built-in imports
import logging as log
import os

# Package imports
import h5py
from mpi4py import MPI
import numpy as np

# IMAGINE imports
from imagine.tools.mpi_helper import mpi_arrange

# Globals
comm = MPI.COMM_WORLD
mpisize = comm.Get_size()
mpirank = comm.Get_rank()

# All declaration
__all__ = ['IOHandler']


# %% CLASS DEFINITIONS
class IOHandler(object):
    """
    Handles the I/O.

    Parameters
    ----------
    wk_dir : str
        The absolute path of the working directory.
    """

    def __init__(self, wk_dir=None):
        if wk_dir is None:
            self.wk_dir = os.getcwd()
        else:
            self.wk_dir = wk_dir
        log.debug('set working directory at %s' % self._wk_dir)
        self.file_path = None

    @property
    def wk_dir(self):
        """
        String containing the absolute path of the working directory.
        """
        return self._wk_dir

    @wk_dir.setter
    def wk_dir(self, wk_dir):
        assert isinstance(wk_dir, str)
        self._wk_dir = wk_dir

    @property
    def file_path(self):
        """
        Absolute path of the HDF5 binary file.
        """
        return self._file_path

    @file_path.setter
    def file_path(self, file_path):
        if file_path is None:
            self._file_path = None
        else:
            assert isinstance(file_path, str)
            self._file_path = file_path
    def write_copy(self, data, file, key):
        """
        Writes a copied data-set into an HDF5 file.
        In practice, it writes out the data stored on the master node,
        by default assuming that all nodes hold identical copies.

        Parameters
        ----------
        data : numpy.ndarray
            Distributed/copied data.
        file : str
            String for filename.
        key : str
            String for HDF5 group and dataset names,
            e.g., 'group name/dataset name'.
        """
        log.debug('@ io_handler::write_copy')
        assert isinstance(data, np.ndarray)
        assert (len(data.shape) == 2)
        assert isinstance(file, str)
        assert isinstance(key, str)
        # combine wk_dir with filename
        self.file_path = os.path.join(self._wk_dir, file)
        # master node writing
        if not mpirank:
            # write permission, create if not exist
            with h5py.File(self._file_path, mode='a') as fh:
                # create group and dataset if missing
                if key not in fh:
                    dset = fh.create_dataset(key, data.shape,
                                             maxshape=(None, None),
                                             dtype=data.dtype)
                else:  # rewrite
                    dset = fh[key]
                    dset.resize(data.shape)
                # write the master node piece
                dset[:, :] = data
        comm.Barrier()
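
    # Usage sketch for ``write_copy`` (hypothetical file/key names): every
    # node holds the same (1, n) array, so only rank 0 touches the disk, e.g.
    #
    #     io.write_copy(measured, 'measurements.hdf5', 'mock/sync')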
    def write_dist(self, data, file, key):
        """
        Writes a distributed data-set into an HDF5 file.
        If the given file does not exist, it is created.
        The data shape must be (m, n) on each node;
        each node passes its content to the master node,
        which is in charge of sequential writing.

        Parameters
        ----------
        data : numpy.ndarray
            Distributed data.
        file : str
            String for filename.
        key : str
            String for HDF5 group and dataset names,
            e.g., 'group name/dataset name'.
        """
        log.debug('@ io_handler::write_dist')
        assert isinstance(data, np.ndarray)
        assert (len(data.shape) == 2)
        assert isinstance(file, str)
        assert isinstance(key, str)
        # gather each rank's local row/column counts
        # (np.uint buffers, matched with MPI.UNSIGNED_LONG)
        local_rows = np.empty(mpisize, dtype=np.uint)
        local_cols = np.empty(mpisize, dtype=np.uint)
        comm.Allgather([np.array(data.shape[0], dtype=np.uint),
                        MPI.UNSIGNED_LONG],
                       [local_rows, MPI.UNSIGNED_LONG])
        comm.Allgather([np.array(data.shape[1], dtype=np.uint),
                        MPI.UNSIGNED_LONG],
                       [local_cols, MPI.UNSIGNED_LONG])
        # all ranks must hold the same number of columns
        if np.any(local_cols - local_cols[0]):
            raise ValueError('unsupported data shape')
        global_shape = (np.sum(local_rows), local_cols[0])
        # combine wk_dir with filename
        self.file_path = os.path.join(self._wk_dir, file)
        # sequential writing
        if not mpirank:
            # write permission, create if not exist
            with h5py.File(self._file_path, mode='a') as fh:
                # create group and dataset if missing
                if key not in fh:
                    dset = fh.create_dataset(key, global_shape,
                                             maxshape=(None, None),
                                             dtype=data.dtype)
                else:  # rewrite
                    dset = fh[key]
                    dset.resize(global_shape)
                # write the master node piece
                dset[0:local_rows[0], :] = data
                # receive and write the remaining pieces in rank order
                for source in range(1, mpisize):
                    source_offset_begin = np.sum(local_rows[0:source])
                    source_offset_end = source_offset_begin + local_rows[source]
                    source_data = comm.recv(source=source, tag=source)
                    dset[source_offset_begin:source_offset_end, :] = source_data
        else:  # else to ``if not mpirank``
            # send the local piece to the master node
            comm.send(data, dest=0, tag=mpirank)
        comm.Barrier()
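
    # A worked illustration of the offsets above: with 3 nodes holding
    # blocks of 2, 3 and 2 rows, local_rows == [2, 3, 2] on every rank;
    # rank 0 writes its own piece into dset[0:2, :], while the pieces
    # received from ranks 1 and 2 land in dset[2:5, :] and dset[5:7, :].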
    def read_copy(self, file, key):
        """
        Reads from an HDF5 file identically on all nodes;
        by doing so, each node holds an identical copy
        of the data stored in the file.

        Parameters
        ----------
        file : str
            String for filename.
        key : str
            String for HDF5 group and dataset names,
            e.g., 'group name/dataset name'.

        Returns
        -------
        numpy.ndarray
            Copied data, in (1, n) shape on each node.
        """
        log.debug('@ io_handler::read_copy')
        assert isinstance(file, str)
        assert isinstance(key, str)
        # combine wk_dir with filename
        self.file_path = os.path.join(self._wk_dir, file)
        # read-only access
        with h5py.File(self._file_path, mode='r') as fh:
            assert (fh[key].shape[0] == 1)
            data = fh[key][:, :]
        comm.Barrier()
        return data
    def read_dist(self, file, key):
        """
        Reads from an HDF5 file and returns a distributed data-set.
        Note that the stored data must contain enough rows
        to be distributed over the available computing nodes,
        otherwise the mpi_arrange function will raise an error.

        Parameters
        ----------
        file : str
            String for filename.
        key : str
            String for HDF5 group and dataset names,
            e.g., 'group name/dataset name'.

        Returns
        -------
        numpy.ndarray
            Distributed data, in (m, n) shape (with m >= 1) on each node.
        """
        log.debug('@ io_handler::read_dist')
        assert isinstance(file, str)
        assert isinstance(key, str)
        # combine wk_dir with filename
        self.file_path = os.path.join(self._wk_dir, file)
        # read-only access
        with h5py.File(self._file_path, mode='r') as fh:
            global_shape = fh[key].shape
            offset_begin, offset_end = mpi_arrange(global_shape[0])
            data = fh[key][offset_begin:offset_end, :]
        comm.Barrier()
        return data
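

# A minimal round-trip sketch (an illustrative addition, not part of the
# original module; the file and key names are hypothetical): every rank
# writes a (2, 4) block with ``write_dist``, then reads its share of the
# global matrix back with ``read_dist``. Run under MPI, e.g.
# ``mpirun -n 2 python -m imagine.tools.io_handler``.
if __name__ == '__main__':
    io = IOHandler()  # working directory defaults to os.getcwd()
    # global shape becomes (2*mpisize, 4) after gathering all blocks
    block = np.arange(8, dtype=np.float64).reshape(2, 4) + 100*mpirank
    io.write_dist(block, 'io_demo.hdf5', 'demo/matrix')
    # rows are redistributed over ranks by mpi_arrange
    local = io.read_dist('io_demo.hdf5', 'demo/matrix')
    print('rank %d holds a local block of shape %s' % (mpirank, local.shape))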