From 469aebecfb0d139909631dd8724952e3f461a5f8 Mon Sep 17 00:00:00 2001 From: Brett Date: Fri, 7 Jul 2023 14:25:54 -0400 Subject: [PATCH] link FITS_rec instances to hdu extensions on save DataModel converts recarrays to FITS_rec on assignment these were not correctly linked to FITS extensions on save resulting in duplication of FITS_rec data on save. This adds a test for the duplication and some book keeping to allow linking the converted FITS_recs and extensions on save. Astropy creates a 'view' of the FITS_rec when assigned to a new FITS extension which makes a simple 'is' check complicated (so the strategy used for ndarray and NDArrayType could not be used). Instead, the links between FITS_rec and extensions are tracked (via the FitsContext) which allows creation of the links without having re-find the hdu created from the FITS_rec. (The existing strategy is left in place for ndarray to maintain compatibility with existing ASDF-in-FITS behavior which does not require the same schema structure as the DataModels). --- CHANGES.rst | 8 ++++++ src/stdatamodels/fits_support.py | 33 +++++++++++++++------- tests/test_fits.py | 48 ++++++++++++++++++++++++++++++++ 3 files changed, 79 insertions(+), 10 deletions(-) diff --git a/CHANGES.rst b/CHANGES.rst index b040086c..e7c0c206 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -1,3 +1,11 @@ +1.7.0 (unreleased) +================== + +Bug Fixes +--------- + +- Link FITS_rec instances to created HDU on save to avoid data duplication. [#178] + 1.7.0 (2023-06-29) ================== diff --git a/src/stdatamodels/fits_support.py b/src/stdatamodels/fits_support.py index b3a38ca2..203a954a 100644 --- a/src/stdatamodels/fits_support.py +++ b/src/stdatamodels/fits_support.py @@ -274,6 +274,8 @@ def _fits_array_writer(fits_context, validator, _, instance, schema): if instance is None: return + instance_id = id(instance) + instance = np.asanyarray(instance) if not len(instance.shape): @@ -297,6 +299,10 @@ def _fits_array_writer(fits_context, validator, _, instance, schema): index=index, hdu_type=hdu_type) hdu.data = instance + if instance_id in fits_context.extension_array_links: + if fits_context.extension_array_links[instance_id]() is not hdu: + raise ValueError("Linking one array to multiple hdus is not supported") + fits_context.extension_array_links[instance_id] = weakref.ref(hdu) hdu.ver = index + 1 @@ -331,6 +337,7 @@ def __init__(self, hdulist): self.hdulist = weakref.ref(hdulist) self.comment_stack = [] self.sequence_index = None + self.extension_array_links = {} def _get_validators(hdulist): @@ -350,7 +357,7 @@ def _get_validators(hdulist): 'type': partial(_fits_type, fits_context), }) - return validators + return validators, fits_context def _save_from_schema(hdulist, tree, schema): @@ -370,24 +377,30 @@ def datetime_callback(node, json_id): else: kwargs = {} - validator = asdf_schema.get_validator( - schema, None, _get_validators(hdulist), **kwargs) + validators, context = _get_validators(hdulist) + validator = asdf_schema.get_validator(schema, None, validators, **kwargs) # This actually kicks off the saving validator.validate(tree, _schema=schema) - # Replace arrays in the tree that are identical to HDU arrays - # with ndarray-1.0.0 tagged objects with special source values - # that represent links to the surrounding FITS file. - def ndarray_callback(node, json_id): - if (isinstance(node, (np.ndarray, NDArrayType))): + # Now link extensions to items in the tree + + def callback(node, json_id): + if id(node) in context.extension_array_links: + hdu = context.extension_array_links[id(node)]() + return _create_tagged_dict_for_fits_array(hdu, hdulist.index(hdu)) + elif isinstance(node, (np.ndarray, NDArrayType)): + # in addition to links generated during validation + # replace arrays in the tree that are identical to HDU arrays + # with ndarray-1.0.0 tagged objects with special source values + # that represent links to the surrounding FITS file. + # This is important for general ASDF-in-FITS support for hdu_index, hdu in enumerate(hdulist): if hdu.data is not None and node is hdu.data: return _create_tagged_dict_for_fits_array(hdu, hdu_index) - return node - tree = treeutil.walk_and_modify(tree, ndarray_callback) + tree = treeutil.walk_and_modify(tree, callback) return tree diff --git a/tests/test_fits.py b/tests/test_fits.py index c2df2d12..26ffe3fa 100644 --- a/tests/test_fits.py +++ b/tests/test_fits.py @@ -1,3 +1,5 @@ +import re + import pytest from astropy.io import fits import numpy as np @@ -627,3 +629,49 @@ def test_resave_duplication_bug(tmp_path): with fits.open(fn1) as ff1, fits.open(fn2) as ff2: assert ff1['ASDF'].size == ff2['ASDF'].size + + +def test_table_linking(tmp_path): + file_path = tmp_path / "test.fits" + + schema = { + 'title': 'Test data model', + '$schema': 'http://stsci.edu/schemas/fits-schema/fits-schema', + 'type': 'object', + 'properties': { + 'meta': { + 'type': 'object', + 'properties': {} + }, + 'test_table': { + 'title': 'Test table', + 'fits_hdu': 'TESTTABL', + 'datatype': [ + {'name': 'A_COL', 'datatype': 'int8'}, + {'name': 'B_COL', 'datatype': 'int8'} + ] + } + } + } + + with DataModel(schema=schema) as dm: + test_array = np.array([(1, 2), (3, 4)], dtype=[('A_COL', 'i1'), ('B_COL', 'i1')]) + + # assigning to the model will convert the array to a FITS_rec + dm.test_table = test_array + assert isinstance(dm.test_table, fits.FITS_rec) + + # save the model (with the table) + dm.save(file_path) + + # open the model and confirm that the table was linked to an hdu + with fits.open(file_path) as ff: + # read the bytes for the embedded ASDF content + asdf_bytes = ff['ASDF'].data.tobytes() + + # get only the bytes for the tree (not blocks) by splitting + # on the yaml end document marker '...' + # on the first block magic sequence + tree_string = asdf_bytes.split(b'...')[0].decode('ascii') + unlinked_arrays = re.findall(r'source:\s+[^f]', tree_string) + assert not len(unlinked_arrays), unlinked_arrays