fix: support null metadata and double lookup for write datasets (#3968)
* fix: support null metadata and double lookup for write datasets

* PR changes

* split test into 2
ebezzi authored Jan 13, 2023
1 parent ceb0379 commit e14aa76
Showing 4 changed files with 87 additions and 43 deletions.
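In short: the write endpoints (DELETE/PUT on a dataset) previously accepted only version ids and assumed dataset metadata was always present. After this change, the collection id is resolved by trying the canonical id first and falling back to the version id, the dataset is matched against either of its ids, and the curation API tolerates datasets whose metadata is still null because processing has not finished. Below is a minimal, self-contained sketch of that lookup logic; the stub types and the toy stores are hypothetical stand-ins for the real business layer, and only the resolution order mirrors the diffs that follow.

```python
from dataclasses import dataclass, field
from typing import Dict, List, Tuple


@dataclass
class DatasetStub:
    dataset_id: str  # canonical id, stable across versions
    version_id: str  # id of one concrete version


@dataclass
class CollectionVersionStub:
    collection_id: str  # canonical id
    version_id: str
    datasets: List[DatasetStub] = field(default_factory=list)


# Toy stores keyed both ways, standing in for the two business-logic lookups.
BY_CANONICAL: Dict[str, CollectionVersionStub] = {}
BY_VERSION: Dict[str, CollectionVersionStub] = {}


def get_collection_and_dataset(
    collection_id: str, dataset_id: str
) -> Tuple[CollectionVersionStub, DatasetStub]:
    """Resolve ids that may be either canonical ids or version ids."""
    # Try the canonical id first, then fall back to treating it as a version id.
    collection = BY_CANONICAL.get(collection_id) or BY_VERSION.get(collection_id)
    if collection is None:
        raise LookupError("collection not found")  # the endpoint raises a 403 here
    # Match the dataset against either of its two ids.
    dataset = next(
        (d for d in collection.datasets if d.version_id == dataset_id or d.dataset_id == dataset_id),
        None,
    )
    if dataset is None:
        raise LookupError("dataset not found")  # likewise a 403
    return collection, dataset


# Both id styles resolve to the same pair.
ds = DatasetStub(dataset_id="ds-canonical", version_id="ds-v1")
cv = CollectionVersionStub("col-canonical", "col-v1", [ds])
BY_CANONICAL[cv.collection_id] = cv
BY_VERSION[cv.version_id] = cv
assert get_collection_and_dataset("col-canonical", "ds-v1") == (cv, ds)
assert get_collection_and_dataset("col-v1", "ds-canonical") == (cv, ds)
```

Centralizing the resolution in one helper keeps delete and put behaving identically for both id styles.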
7 changes: 5 additions & 2 deletions backend/layers/business/business_interface.py

@@ -10,6 +10,7 @@
     CollectionMetadata,
     CollectionVersion,
     CollectionVersionId,
+    CollectionVersionWithDatasets,
     DatasetArtifact,
     DatasetArtifactId,
     DatasetId,
@@ -29,13 +30,15 @@ def get_collections(self, filter: CollectionQueryFilter) -> Iterable[CollectionV
     def get_published_collection_version(self, collection_id: CollectionId) -> CollectionVersion:
         pass

-    def get_collection_version(self, version_id: CollectionVersionId) -> CollectionVersion:
+    def get_collection_version(self, version_id: CollectionVersionId) -> CollectionVersionWithDatasets:
         pass

     def get_collection_versions_from_canonical(self, collection_id: CollectionId) -> Iterable[CollectionVersion]:
         pass

-    def get_collection_version_from_canonical(self, collection_id: CollectionId) -> Optional[CollectionVersion]:
+    def get_collection_version_from_canonical(
+        self, collection_id: CollectionId
+    ) -> Optional[CollectionVersionWithDatasets]:
         pass

     def create_collection(
[second changed file — path not captured]

@@ -26,7 +26,6 @@
     CollectionVersionId,
     CollectionVersionWithDatasets,
     DatasetVersion,
-    DatasetVersionId,
 )
 from backend.portal.api.curation.v1.curation.collections.common import (
     get_infered_dataset_version,
@@ -63,26 +62,34 @@ def get(collection_id: str, dataset_id: str = None):


 def _get_collection_and_dataset(
-    business_logic: BusinessLogicInterface, collection_id: CollectionVersionId, dataset_id: DatasetVersionId
+    business_logic: BusinessLogicInterface, collection_id: str, dataset_id: str
 ) -> Tuple[CollectionVersionWithDatasets, DatasetVersion]:
-    dataset_version = business_logic.get_dataset_version(dataset_id)
-    if dataset_version is None:
-        raise ForbiddenHTTPException()
+    """
+    Get the collection and dataset for the given ids, looking up both the canonical and the version id for each.
+    """

-    collection_version = business_logic.get_collection_version(collection_id)
+    collection_version = business_logic.get_collection_version_from_canonical(CollectionId(collection_id))
+    if collection_version is None:
+        collection_version = business_logic.get_collection_version(CollectionVersionId(collection_id))
     if collection_version is None:
         raise ForbiddenHTTPException()

+    # Extract the dataset from the collection version's dataset list.
+    try:
+        dataset_version = next(
+            d for d in collection_version.datasets if d.version_id.id == dataset_id or d.dataset_id.id == dataset_id
+        )
+    except StopIteration:
+        raise ForbiddenHTTPException()
+
     return collection_version, dataset_version


-def delete(token_info: dict, collection_id: str, dataset_id: str = None):
+def delete(token_info: dict, collection_id: str, dataset_id: str):
     business_logic = get_business_logic()
     user_info = UserInfo(token_info)

-    collection_version, dataset_version = _get_collection_and_dataset(
-        business_logic, CollectionVersionId(collection_id), DatasetVersionId(dataset_id)
-    )
+    collection_version, dataset_version = _get_collection_and_dataset(business_logic, collection_id, dataset_id)

     if not user_info.is_user_owner_or_allowed(collection_version.owner):
         raise ForbiddenHTTPException("Unauthorized")
@@ -105,9 +112,7 @@ def put(collection_id: str, dataset_id: str, body: dict, token_info: dict):
     url = body.get("url", body.get("link"))
     business_logic = get_business_logic()

-    collection_version, _ = _get_collection_and_dataset(
-        business_logic, CollectionVersionId(collection_id), DatasetVersionId(dataset_id)
-    )
+    collection_version, dataset_version = _get_collection_and_dataset(business_logic, collection_id, dataset_id)

     if not UserInfo(token_info).is_user_owner_or_allowed(collection_version.owner):
         raise ForbiddenHTTPException()
@@ -117,7 +122,7 @@ def put(collection_id: str, dataset_id: str, body: dict, token_info: dict):
             collection_version.version_id,
             url,
             None,
-            None if dataset_id is None else DatasetVersionId(dataset_id),
+            None if dataset_id is None else dataset_version.version_id,
         )
         return Response(status=202)
     except CollectionNotFoundException:
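From a client's perspective, the write endpoints now accept either id form in each path segment. A hypothetical client-side sketch follows; the path shape comes from the tests further down, while the host, token, ids, and link are made-up placeholders.

```python
# Hypothetical client calls; the base URL, token, ids, and link are placeholders.
import requests

BASE = "https://<api-host>/curation/v1"
HEADERS = {"Authorization": "Bearer <access-token>"}

# Either id style in either path segment now resolves to the same dataset.
for collection_id, dataset_id in [
    ("<collection-version-id>", "<dataset-version-id>"),
    ("<collection-canonical-id>", "<dataset-canonical-id>"),
]:
    resp = requests.put(
        f"{BASE}/collections/{collection_id}/datasets/{dataset_id}",
        json={"link": "https://example.com/my-dataset.h5ad"},  # placeholder link
        headers=HEADERS,
    )
    print(resp.status_code)  # 202 on success, 403 if neither lookup matches
```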
18 changes: 10 additions & 8 deletions backend/portal/api/curation/v1/curation/collections/common.py

@@ -162,14 +162,16 @@ def reshape_dataset_for_curation_api(
     else:
         columns = EntityColumns.dataset_metadata_cols

-    # Get dataset metadata fields
-    for column in columns:
-        col = getattr(dataset_version.metadata, column)
-        if isinstance(col, OntologyTermId):
-            col = [asdict(col)]
-        elif isinstance(col, list) and len(col) != 0 and isinstance(col[0], OntologyTermId):
-            col = [asdict(i) for i in col]
-        ds[column] = col
+    # Get dataset metadata fields.
+    # Metadata can be None if the dataset is not yet fully processed, so account for that.
+    if dataset_version.metadata is not None:
+        for column in columns:
+            col = getattr(dataset_version.metadata, column)
+            if isinstance(col, OntologyTermId):
+                col = [asdict(col)]
+            elif isinstance(col, list) and len(col) != 0 and isinstance(col[0], OntologyTermId):
+                col = [asdict(i) for i in col]
+            ds[column] = col

     # Get non-preview-specific dataset fields
     if not preview:
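The effect of the new guard, in toy form: metadata columns are simply omitted from the response while a dataset is still processing. The stub type and field names below are invented for illustration; only the `is not None` check mirrors the commit.

```python
from dataclasses import dataclass
from typing import Optional


@dataclass
class MetadataStub:  # invented stand-in for the real dataset metadata type
    organism: str
    tissue: str


def reshape(metadata: Optional[MetadataStub]) -> dict:
    ds = {"id": "example-dataset"}
    # Mirrors the committed guard: add metadata columns only when metadata exists.
    if metadata is not None:
        for column in ("organism", "tissue"):
            ds[column] = getattr(metadata, column)
    return ds


print(reshape(MetadataStub("human", "lung")))  # {'id': 'example-dataset', 'organism': 'human', 'tissue': 'lung'}
print(reshape(None))  # {'id': 'example-dataset'} — a dataset still being processed
```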
72 changes: 53 additions & 19 deletions tests/unit/backend/layers/api/test_curation_api.py

@@ -952,26 +952,50 @@ def test__update_public_collection_owner__405(self):


 class TestDeleteDataset(BaseAPIPortalTest):
-    def test__delete_dataset(self):
-        auth_credentials = [
+    def setUp(self):
+        super().setUp()
+        self.auth_credentials = [
             (self.make_super_curator_header, "super", 202),
             (self.make_owner_header, "owner", 202),
             (None, "none", 401),
             (self.make_not_owner_header, "not_owner", 403),
         ]
-        for auth, auth_description, expected_status_code in auth_credentials:
-            with self.subTest(f"{auth_description} {expected_status_code}"):
-                dataset = self.generate_dataset(
-                    statuses=[DatasetStatusUpdate(DatasetStatusKey.UPLOAD, DatasetUploadStatus.UPLOADING)],
-                    publish=False,
-                )

-                test_url = (
-                    f"/curation/v1/collections/{dataset.collection_version_id}/datasets/"
-                    f"{dataset.dataset_version_id}"
-                )
-                headers = auth() if callable(auth) else auth
-                response = self.app.delete(test_url, headers=headers)
-                self.assertEqual(expected_status_code, response.status_code)
+    def _delete(self, auth, collection_id, dataset_id):
+        """
+        Helper method to call the delete endpoint.
+        """
+        test_url = f"/curation/v1/collections/{collection_id}/datasets/{dataset_id}"
+        headers = auth() if callable(auth) else auth
+        return self.app.delete(test_url, headers=headers)
+
+    def test__delete_dataset_by_version_id(self):
+        """
+        DELETE /collections/:collection_id/datasets/:dataset_id should respond according to the
+        auth token passed when versioned ids are used.
+        """
+        for auth, auth_description, expected_status_code in self.auth_credentials:
+            with self.subTest(f"{auth_description} {expected_status_code}"):
+                dataset = self.generate_dataset(
+                    statuses=[DatasetStatusUpdate(DatasetStatusKey.UPLOAD, DatasetUploadStatus.UPLOADING)],
+                    publish=False,
+                )
+                response = self._delete(auth, dataset.collection_version_id, dataset.dataset_version_id)
+                self.assertEqual(expected_status_code, response.status_code)
+
+    def test__delete_dataset_by_canonical_id(self):
+        """
+        DELETE /collections/:collection_id/datasets/:dataset_id should respond according to the
+        auth token passed when canonical ids are used. In this case, the unpublished collection
+        version is looked up and used for deletion.
+        """
+        for auth, auth_description, expected_status_code in self.auth_credentials:
+            with self.subTest(f"{auth_description} {expected_status_code}"):
+                dataset = self.generate_dataset(
+                    statuses=[DatasetStatusUpdate(DatasetStatusKey.UPLOAD, DatasetUploadStatus.UPLOADING)],
+                    publish=False,
+                )
+                response = self._delete(auth, dataset.collection_id, dataset.dataset_id)
+                self.assertEqual(expected_status_code, response.status_code)


@@ -1191,17 +1215,27 @@ def test__new_from_link__OK(self, *mocks):
         """
         Calling PUT /datasets/:dataset_id should succeed if a valid link is uploaded by the owner of the collection
         """

-        dataset = self.generate_dataset(
-            statuses=[DatasetStatusUpdate(DatasetStatusKey.PROCESSING, DatasetProcessingStatus.INITIALIZED)],
-        )
-        body = {"link": self.good_link}
-        headers = self.make_owner_header()
-        response = self.app.put(
-            f"/curation/v1/collections/{dataset.collection_version_id}/datasets/{dataset.dataset_version_id}",
-            json=body,
-            headers=headers,
-        )
-        self.assertEqual(202, response.status_code)
+        def _test_create(collection_id, dataset_id):
+            body = {"link": self.good_link}
+            headers = self.make_owner_header()
+            response = self.app.put(
+                f"/curation/v1/collections/{collection_id}/datasets/{dataset_id}",
+                json=body,
+                headers=headers,
+            )
+            self.assertEqual(202, response.status_code)
+
+        with self.subTest("with version_ids"):
+            dataset = self.generate_dataset(
+                statuses=[DatasetStatusUpdate(DatasetStatusKey.PROCESSING, DatasetProcessingStatus.INITIALIZED)],
+            )
+            _test_create(dataset.collection_version_id, dataset.dataset_version_id)
+
+        with self.subTest("with collection_ids"):
+            dataset = self.generate_dataset(
+                statuses=[DatasetStatusUpdate(DatasetStatusKey.PROCESSING, DatasetProcessingStatus.INITIALIZED)],
+            )
+            _test_create(dataset.collection_id, dataset.dataset_id)

     def test__new_from_link__Super_Curator(self, *mocks):
         """
