link FITS_rec instances to hdu extensions on save

DataModel converts recarrays to FITS_rec on assignment these were not correctly linked to FITS extensions on save resulting in duplication of FITS_rec data on save. This adds a test for the duplication and some book keeping to allow linking the converted FITS_recs and extensions on save. Astropy creates a 'view' of the FITS_rec when assigned to a new FITS extension which makes a simple 'is' check complicated (so the strategy used for ndarray and NDArrayType could not be used). Instead, the links between FITS_rec and extensions are tracked (via the FitsContext) which allows creation of the links without having re-find the hdu created from the FITS_rec. (The existing strategy is left in place for ndarray to maintain compatibility with existing ASDF-in-FITS behavior which does not require the same schema structure as the DataModels).
spacetelescope · Jul 7, 2023 · 469aebe · 469aebe
1 parent 2ffdb84
commit 469aebe
Show file tree

Hide file tree

Showing 3 changed files with 79 additions and 10 deletions.
diff --git a/CHANGES.rst b/CHANGES.rst
@@ -1,3 +1,11 @@
+1.7.0 (unreleased)
+==================
+
+Bug Fixes
+---------
+
+- Link FITS_rec instances to created HDU on save to avoid data duplication. [#178]
+
 1.7.0 (2023-06-29)
 ==================
 

diff --git a/src/stdatamodels/fits_support.py b/src/stdatamodels/fits_support.py
@@ -274,6 +274,8 @@ def _fits_array_writer(fits_context, validator, _, instance, schema):
     if instance is None:
         return
 
+    instance_id = id(instance)
+
     instance = np.asanyarray(instance)
 
     if not len(instance.shape):
@@ -297,6 +299,10 @@ def _fits_array_writer(fits_context, validator, _, instance, schema):
                            index=index, hdu_type=hdu_type)
 
     hdu.data = instance
+    if instance_id in fits_context.extension_array_links:
+        if fits_context.extension_array_links[instance_id]() is not hdu:
+            raise ValueError("Linking one array to multiple hdus is not supported")
+    fits_context.extension_array_links[instance_id] = weakref.ref(hdu)
     hdu.ver = index + 1
 
 
@@ -331,6 +337,7 @@ def __init__(self, hdulist):
         self.hdulist = weakref.ref(hdulist)
         self.comment_stack = []
         self.sequence_index = None
+        self.extension_array_links = {}
 
 
 def _get_validators(hdulist):
@@ -350,7 +357,7 @@ def _get_validators(hdulist):
         'type': partial(_fits_type, fits_context),
     })
 
-    return validators
+    return validators, fits_context
 
 
 def _save_from_schema(hdulist, tree, schema):
@@ -370,24 +377,30 @@ def datetime_callback(node, json_id):
     else:
         kwargs = {}
 
-    validator = asdf_schema.get_validator(
-        schema, None, _get_validators(hdulist), **kwargs)
+    validators, context = _get_validators(hdulist)
+    validator = asdf_schema.get_validator(schema, None, validators, **kwargs)
 
     # This actually kicks off the saving
     validator.validate(tree, _schema=schema)
 
-    # Replace arrays in the tree that are identical to HDU arrays
-    # with ndarray-1.0.0 tagged objects with special source values
-    # that represent links to the surrounding FITS file.
-    def ndarray_callback(node, json_id):
-        if (isinstance(node, (np.ndarray, NDArrayType))):
+    # Now link extensions to items in the tree
+
+    def callback(node, json_id):
+        if id(node) in context.extension_array_links:
+            hdu = context.extension_array_links[id(node)]()
+            return _create_tagged_dict_for_fits_array(hdu, hdulist.index(hdu))
+        elif isinstance(node, (np.ndarray, NDArrayType)):
+            # in addition to links generated during validation
+            # replace arrays in the tree that are identical to HDU arrays
+            # with ndarray-1.0.0 tagged objects with special source values
+            # that represent links to the surrounding FITS file.
+            # This is important for general ASDF-in-FITS support
             for hdu_index, hdu in enumerate(hdulist):
                 if hdu.data is not None and node is hdu.data:
                     return _create_tagged_dict_for_fits_array(hdu, hdu_index)
-
         return node
 
-    tree = treeutil.walk_and_modify(tree, ndarray_callback)
+    tree = treeutil.walk_and_modify(tree, callback)
 
     return tree
 

diff --git a/tests/test_fits.py b/tests/test_fits.py
@@ -1,3 +1,5 @@
+import re
+
 import pytest
 from astropy.io import fits
 import numpy as np
@@ -627,3 +629,49 @@ def test_resave_duplication_bug(tmp_path):
 
     with fits.open(fn1) as ff1, fits.open(fn2) as ff2:
         assert ff1['ASDF'].size == ff2['ASDF'].size
+
+
+def test_table_linking(tmp_path):
+    file_path = tmp_path / "test.fits"
+
+    schema = {
+        'title': 'Test data model',
+        '$schema': 'http://stsci.edu/schemas/fits-schema/fits-schema',
+        'type': 'object',
+        'properties': {
+            'meta': {
+                'type': 'object',
+                'properties': {}
+            },
+            'test_table': {
+                'title': 'Test table',
+                'fits_hdu': 'TESTTABL',
+                'datatype': [
+                    {'name': 'A_COL', 'datatype': 'int8'},
+                    {'name': 'B_COL', 'datatype': 'int8'}
+                ]
+            }
+        }
+    }
+
+    with DataModel(schema=schema) as dm:
+        test_array = np.array([(1, 2), (3, 4)], dtype=[('A_COL', 'i1'), ('B_COL', 'i1')])
+
+        # assigning to the model will convert the array to a FITS_rec
+        dm.test_table = test_array
+        assert isinstance(dm.test_table, fits.FITS_rec)
+
+        # save the model (with the table)
+        dm.save(file_path)
+
+    # open the model and confirm that the table was linked to an hdu
+    with fits.open(file_path) as ff:
+        # read the bytes for the embedded ASDF content
+        asdf_bytes = ff['ASDF'].data.tobytes()
+
+        # get only the bytes for the tree (not blocks) by splitting
+        # on the yaml end document marker '...'
+        # on the first block magic sequence
+        tree_string = asdf_bytes.split(b'...')[0].decode('ascii')
+        unlinked_arrays = re.findall(r'source:\s+[^f]', tree_string)
+        assert not len(unlinked_arrays), unlinked_arrays