import bitshuffle.h5
import h5py
import numcodecs
import zarr


class FileFormat:
    """Abstract base class for file formats supported by this module."""

    # Subclasses set this to the library module backing the format.
    module = None

    @staticmethod
    def open(*args, **kwargs):
        """
        Open a file of this format.

        Raises
        ------
        NotImplementedError
            Always; subclasses provide the implementation.
        """
        raise NotImplementedError

    @staticmethod
    def compression_kwargs(compression=None, compression_opts=None, compressor=None):
        """
        Sort compression arguments in a format expected by file format module.

        The base implementation only validates that the HDF5-style arguments
        (`compression`, `compression_opts`) and the zarr-style argument
        (`compressor`) are not mixed; it returns nothing itself.

        Parameters
        ----------
        compression : str or int
            Name or identifier of HDF5 compression filter.
        compression_opts
            See HDF5 documentation for compression filters.
        compressor : `numcodecs` compressor
            As required by `zarr`.

        Returns
        -------
        dict
            Compression arguments as required by the file format module.

        Raises
        ------
        ValueError
            If `compressor` is given together with `compression` or
            `compression_opts`.
        """
        if compressor and (compression or compression_opts):
            raise ValueError(
                f"Found more than one kind of compression args: compression ({compression}, {compression_opts}) "
                f"and compressor {compressor}."
            )


class HDF5(FileFormat):
    """HDF5 file format, backed by `h5py`."""

    module = h5py

    @staticmethod
    def open(*args, **kwargs):
        """Open an HDF5 file; all arguments are forwarded to `h5py.File`."""
        return h5py.File(*args, **kwargs)

    @staticmethod
    def compression_kwargs(compression=None, compression_opts=None, compressor=None):
        """
        Return the compression arguments as keyword arguments for `h5py`.

        Parameters
        ----------
        compression : str or int
            Name or identifier of HDF5 compression filter.
        compression_opts
            See HDF5 documentation for compression filters.
        compressor : `numcodecs` compressor
            Not supported for HDF5.

        Returns
        -------
        dict
            Dictionary with the keys "compression" and "compression_opts".

        Raises
        ------
        ValueError
            If both kinds of compression arguments are given.
        NotImplementedError
            If a `compressor` is given.
        """
        # Static methods have no implicit class, hence the explicit two-argument super().
        super(HDF5, HDF5).compression_kwargs(compression, compression_opts, compressor)
        if compressor:
            # Translating a numcodecs compressor into HDF5 filter arguments is not implemented.
            raise NotImplementedError
        return {"compression": compression, "compression_opts": compression_opts}


class Zarr(FileFormat):
    """Zarr file format, backed by `zarr`."""

    module = zarr

    @staticmethod
    def open(*args, **kwargs):
        """Open a zarr group; all arguments are forwarded to `zarr.open_group`."""
        return zarr.open_group(*args, **kwargs)

    @staticmethod
    def compression_kwargs(compression=None, compression_opts=None, compressor=None):
        """
        Return the compression arguments as keyword arguments for `zarr`.

        HDF5-style arguments are translated into a `numcodecs` compressor;
        a `compressor` given directly is passed through unchanged.

        Parameters
        ----------
        compression : str or int
            Name or identifier of HDF5 compression filter. Supported values
            are "gzip" and `bitshuffle.h5.H5FILTER`.
        compression_opts
            See HDF5 documentation for compression filters. For bitshuffle
            this is unpacked as a pair `(blocksize, compression id)`.
        compressor : `numcodecs` compressor
            As required by `zarr`.

        Returns
        -------
        dict
            Dictionary with the single key "compressor".

        Raises
        ------
        ValueError
            If both kinds of compression arguments are given, if the
            bitshuffle compression id is not `bitshuffle.h5.H5_COMPRESS_LZ4`,
            or if the compression filter is not supported.
        """
        # Static methods have no implicit class, hence the explicit two-argument super().
        super(Zarr, Zarr).compression_kwargs(compression, compression_opts, compressor)

        if not compression:
            # No HDF5-style filter requested: hand the compressor (possibly None) straight to zarr.
            return {"compressor": compressor}

        if compression == "gzip":
            # NOTE(review): compression_opts is used as the gzip level as-is, including
            # None -- confirm that callers always supply a level.
            return {"compressor": numcodecs.gzip.GZip(level=compression_opts)}

        if compression == bitshuffle.h5.H5FILTER:
            blocksize, c = compression_opts
            if c == bitshuffle.h5.H5_COMPRESS_LZ4:
                cname = "lz4"
            else:
                raise ValueError(
                    f"Unknown value for cname in HDF5 compression opts: {compression_opts[1]}"
                )
            return {
                "compressor": numcodecs.Blosc(
                    cname, shuffle=numcodecs.blosc.BITSHUFFLE, blocksize=blocksize
                )
            }

        # The exception was previously constructed but never raised, so an
        # unsupported filter made this method silently return None.
        raise ValueError(f"Compression filter not supported in zarr: {compression}")