From e14aa7691e8f2530e14f692c16961779b05cc408 Mon Sep 17 00:00:00 2001
From: Emanuele Bezzi
Date: Fri, 13 Jan 2023 16:07:52 -0500
Subject: [PATCH 1/2] fix: support null metadata and double lookup for write
 datasets (#3968)

* fix: support null metadata and double lookup for write datasets

* PR changes

* split test into 2
---
 backend/layers/business/business_interface.py |  7 +-
 .../datasets/dataset_id/actions.py            | 33 +++++----
 .../v1/curation/collections/common.py         | 18 ++---
 .../backend/layers/api/test_curation_api.py   | 72 ++++++++++++++-----
 4 files changed, 87 insertions(+), 43 deletions(-)

diff --git a/backend/layers/business/business_interface.py b/backend/layers/business/business_interface.py
index a2fdbcd16f363..0d8fa37f41e74 100644
--- a/backend/layers/business/business_interface.py
+++ b/backend/layers/business/business_interface.py
@@ -10,6 +10,7 @@
     CollectionMetadata,
     CollectionVersion,
     CollectionVersionId,
+    CollectionVersionWithDatasets,
     DatasetArtifact,
     DatasetArtifactId,
     DatasetId,
@@ -29,13 +30,15 @@ def get_collections(self, filter: CollectionQueryFilter) -> Iterable[CollectionV
     def get_published_collection_version(self, collection_id: CollectionId) -> CollectionVersion:
         pass

-    def get_collection_version(self, version_id: CollectionVersionId) -> CollectionVersion:
+    def get_collection_version(self, version_id: CollectionVersionId) -> CollectionVersionWithDatasets:
         pass

     def get_collection_versions_from_canonical(self, collection_id: CollectionId) -> Iterable[CollectionVersion]:
         pass

-    def get_collection_version_from_canonical(self, collection_id: CollectionId) -> Optional[CollectionVersion]:
+    def get_collection_version_from_canonical(
+        self, collection_id: CollectionId
+    ) -> Optional[CollectionVersionWithDatasets]:
         pass

     def create_collection(
diff --git a/backend/portal/api/curation/v1/curation/collections/collection_id/datasets/dataset_id/actions.py b/backend/portal/api/curation/v1/curation/collections/collection_id/datasets/dataset_id/actions.py
index 5af5aff620f44..5f6d6492b3895 100644
--- a/backend/portal/api/curation/v1/curation/collections/collection_id/datasets/dataset_id/actions.py
+++ b/backend/portal/api/curation/v1/curation/collections/collection_id/datasets/dataset_id/actions.py
@@ -26,7 +26,6 @@
     CollectionVersionId,
     CollectionVersionWithDatasets,
     DatasetVersion,
-    DatasetVersionId,
 )
 from backend.portal.api.curation.v1.curation.collections.common import (
     get_infered_dataset_version,
@@ -63,26 +62,34 @@ def get(collection_id: str, dataset_id: str = None):


 def _get_collection_and_dataset(
-    business_logic: BusinessLogicInterface, collection_id: CollectionVersionId, dataset_id: DatasetVersionId
+    business_logic: BusinessLogicInterface, collection_id: str, dataset_id: str
 ) -> Tuple[CollectionVersionWithDatasets, DatasetVersion]:
-    dataset_version = business_logic.get_dataset_version(dataset_id)
-    if dataset_version is None:
-        raise ForbiddenHTTPException()
+    """
+    Get collection and dataset by their ids. Each id may be either a version id or a
+    canonical id; both lookups are attempted for each.
+    """

-    collection_version = business_logic.get_collection_version(collection_id)
+    collection_version = business_logic.get_collection_version_from_canonical(CollectionId(collection_id))
     if collection_version is None:
+        collection_version = business_logic.get_collection_version(CollectionVersionId(collection_id))
+        if collection_version is None:
+            raise ForbiddenHTTPException()
+
+    # Extract the dataset from the dataset list.
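+    # Match on either the version id or the canonical id, mirroring the double
+    # lookup performed for the collection above.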
+    try:
+        dataset_version = next(
+            d for d in collection_version.datasets if d.version_id.id == dataset_id or d.dataset_id.id == dataset_id
+        )
+    except StopIteration:
         raise ForbiddenHTTPException()

     return collection_version, dataset_version


-def delete(token_info: dict, collection_id: str, dataset_id: str = None):
+def delete(token_info: dict, collection_id: str, dataset_id: str):
     business_logic = get_business_logic()
     user_info = UserInfo(token_info)
-    collection_version, dataset_version = _get_collection_and_dataset(
-        business_logic, CollectionVersionId(collection_id), DatasetVersionId(dataset_id)
-    )
+    collection_version, dataset_version = _get_collection_and_dataset(business_logic, collection_id, dataset_id)

     if not user_info.is_user_owner_or_allowed(collection_version.owner):
         raise ForbiddenHTTPException("Unauthorized")
@@ -105,9 +112,7 @@ def put(collection_id: str, dataset_id: str, body: dict, token_info: dict):
     url = body.get("url", body.get("link"))
     business_logic = get_business_logic()

-    collection_version, _ = _get_collection_and_dataset(
-        business_logic, CollectionVersionId(collection_id), DatasetVersionId(dataset_id)
-    )
+    collection_version, dataset_version = _get_collection_and_dataset(business_logic, collection_id, dataset_id)

     if not UserInfo(token_info).is_user_owner_or_allowed(collection_version.owner):
         raise ForbiddenHTTPException()
@@ -117,7 +122,7 @@
             collection_version.version_id,
             url,
             None,
-            None if dataset_id is None else DatasetVersionId(dataset_id),
+            None if dataset_id is None else dataset_version.version_id,
         )
         return Response(status=202)
     except CollectionNotFoundException:
diff --git a/backend/portal/api/curation/v1/curation/collections/common.py b/backend/portal/api/curation/v1/curation/collections/common.py
index 0fde8dd0f8051..69b3aaff8ca7f 100644
--- a/backend/portal/api/curation/v1/curation/collections/common.py
+++ b/backend/portal/api/curation/v1/curation/collections/common.py
@@ -162,14 +162,16 @@ def reshape_dataset_for_curation_api(
     else:
         columns = EntityColumns.dataset_metadata_cols

-    # Get dataset metadata fields
-    for column in columns:
-        col = getattr(dataset_version.metadata, column)
-        if isinstance(col, OntologyTermId):
-            col = [asdict(col)]
-        elif isinstance(col, list) and len(col) != 0 and isinstance(col[0], OntologyTermId):
-            col = [asdict(i) for i in col]
-        ds[column] = col
+    # Get dataset metadata fields.
+    # Metadata can be None if the dataset isn't fully processed yet, so we account for that.
+    if dataset_version.metadata is not None:
+        for column in columns:
+            col = getattr(dataset_version.metadata, column)
+            if isinstance(col, OntologyTermId):
+                col = [asdict(col)]
+            elif isinstance(col, list) and len(col) != 0 and isinstance(col[0], OntologyTermId):
+                col = [asdict(i) for i in col]
+            ds[column] = col

     # Get non-preview specific dataset fields
     if not preview:
diff --git a/tests/unit/backend/layers/api/test_curation_api.py b/tests/unit/backend/layers/api/test_curation_api.py
index 73782ef7b5105..8db9d35cfbee0 100644
--- a/tests/unit/backend/layers/api/test_curation_api.py
+++ b/tests/unit/backend/layers/api/test_curation_api.py
@@ -952,26 +952,50 @@ def test__update_public_collection_owner__405(self):


 class TestDeleteDataset(BaseAPIPortalTest):
-    def test__delete_dataset(self):
-        auth_credentials = [
+    def setUp(self):
+        super().setUp()
+        self.auth_credentials = [
             (self.make_super_curator_header, "super", 202),
             (self.make_owner_header, "owner", 202),
             (None, "none", 401),
             (self.make_not_owner_header, "not_owner", 403),
         ]
-        for auth, auth_description, expected_status_code in auth_credentials:
+
+    def _delete(self, auth, collection_id, dataset_id):
+        """
+        Helper method to call the delete endpoint
+        """
+        test_url = f"/curation/v1/collections/{collection_id}/datasets/{dataset_id}"
+        headers = auth() if callable(auth) else auth
+        return self.app.delete(test_url, headers=headers)
+
+    def test__delete_dataset_by_version_id(self):
+        """
+        Calling DELETE /collections/:collection_id/datasets/:dataset_id should succeed or fail
+        according to the auth token passed, when using version ids
+        """
+        for auth, auth_description, expected_status_code in self.auth_credentials:
             with self.subTest(f"{auth_description} {expected_status_code}"):
                 dataset = self.generate_dataset(
                     statuses=[DatasetStatusUpdate(DatasetStatusKey.UPLOAD, DatasetUploadStatus.UPLOADING)],
                     publish=False,
                 )
+                response = self._delete(auth, dataset.collection_version_id, dataset.dataset_version_id)
+                self.assertEqual(expected_status_code, response.status_code)

-                test_url = (
-                    f"/curation/v1/collections/{dataset.collection_version_id}/datasets/"
-                    f"{dataset.dataset_version_id}"
+    def test__delete_dataset_by_canonical_id(self):
+        """
+        Calling DELETE /collections/:collection_id/datasets/:dataset_id should succeed or fail
+        according to the auth token passed, when using canonical ids. In this case, the unpublished
+        collection version will be looked up and used for deletion.
+        """
+        for auth, auth_description, expected_status_code in self.auth_credentials:
+            with self.subTest(f"{auth_description} {expected_status_code}"):
+                dataset = self.generate_dataset(
+                    statuses=[DatasetStatusUpdate(DatasetStatusKey.UPLOAD, DatasetUploadStatus.UPLOADING)],
+                    publish=False,
                 )
-                headers = auth() if callable(auth) else auth
-                response = self.app.delete(test_url, headers=headers)
+                response = self._delete(auth, dataset.collection_id, dataset.dataset_id)
                 self.assertEqual(expected_status_code, response.status_code)
@@ -1191,17 +1215,27 @@ def test__new_from_link__OK(self, *mocks):
         """
         Calling PUT /datasets/:dataset_id should succeed if a valid link is uploaded by the owner of the collection
         """
-        dataset = self.generate_dataset(
-            statuses=[DatasetStatusUpdate(DatasetStatusKey.PROCESSING, DatasetProcessingStatus.INITIALIZED)],
-        )
-        body = {"link": self.good_link}
-        headers = self.make_owner_header()
-        response = self.app.put(
-            f"/curation/v1/collections/{dataset.collection_version_id}/datasets/{dataset.dataset_version_id}",
-            json=body,
-            headers=headers,
-        )
-        self.assertEqual(202, response.status_code)
+
+        def _test_create(collection_id, dataset_id):
+            body = {"link": self.good_link}
+            headers = self.make_owner_header()
+            response = self.app.put(
+                f"/curation/v1/collections/{collection_id}/datasets/{dataset_id}",
+                json=body,
+                headers=headers,
+            )
+            self.assertEqual(202, response.status_code)
+
+        with self.subTest("with version_ids"):
+            dataset = self.generate_dataset(
+                statuses=[DatasetStatusUpdate(DatasetStatusKey.PROCESSING, DatasetProcessingStatus.INITIALIZED)],
+            )
+            _test_create(dataset.collection_version_id, dataset.dataset_version_id)
+
+        with self.subTest("with collection_ids"):
+            dataset = self.generate_dataset(
+                statuses=[DatasetStatusUpdate(DatasetStatusKey.PROCESSING, DatasetProcessingStatus.INITIALIZED)],
+            )
+            _test_create(dataset.collection_id, dataset.dataset_id)

     def test__new_from_link__Super_Curator(self, *mocks):
         """

From 8a52c649e1f5d881aca42f6a9cfbc3b807e37287 Mon Sep 17 00:00:00 2001
From: Emanuele Bezzi
Date: Fri, 13 Jan 2023 17:26:38 -0500
Subject: [PATCH 2/2] fix: improve dataset identifiers (#3967)

* fix: improve dataset identifiers

* Linter

* Test declutter

* new test+overhaul
---
 backend/layers/api/portal_api.py              |  22 ++-
 .../backend/layers/api/test_portal_api.py     | 156 +++++++++++++++++-
 tests/unit/backend/layers/common/base_test.py |   4 +-
 3 files changed, 174 insertions(+), 8 deletions(-)

diff --git a/backend/layers/api/portal_api.py b/backend/layers/api/portal_api.py
index 068b229f91f0b..dd57d847b3599 100644
--- a/backend/layers/api/portal_api.py
+++ b/backend/layers/api/portal_api.py
@@ -650,7 +650,7 @@ def delete_dataset(dataset_id: str, token_info: dict):

 def get_dataset_identifiers(url: str):
     """
-    a.k.a. the meta endpoint
+    Return a set of dataset identifiers. This endpoint is meant to be used by single-cell-explorer.
     """
     try:
         path = urlparse(url).path
@@ -665,19 +665,31 @@
     if dataset is None:
         raise NotFoundHTTPException()

+    # A dataset version can appear in multiple collection versions. This endpoint should:
+    # 1. Return the most recent published version that contains the dataset version (aka the mapped version)
+    # 2. If the version only appears in an unpublished version, return that one.
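+    # The canonical lookup below resolves case 1 (the mapped version); the fallback
+    # that follows handles case 2.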
+
     collection = get_business_logic().get_collection_version_from_canonical(dataset.collection_id)
-    if collection is None:  # orphaned datasets
+    if collection is None:  # orphaned datasets - shouldn't happen, but we should return 404 just in case
+        raise NotFoundHTTPException()
+
+    if dataset.version_id not in [d.version_id for d in collection.datasets]:
+        # If the dataset is not in the mapped collection version, it means the dataset belongs to the active
+        # unpublished version. We should return that one
+        collection = get_business_logic().get_unpublished_collection_version_from_canonical(dataset.collection_id)
+
+    if collection is None:  # again, orphaned datasets
         raise NotFoundHTTPException()

+    collection_id, dataset_id = collection.version_id.id, dataset.version_id.id
+
     # Retrieves the URI of the cxg artifact
     s3_uri = next(a.uri for a in dataset.artifacts if a.type == DatasetArtifactType.CXG)

-    dataset_id = dataset.version_id.id
-
     dataset_identifiers = {
         "s3_uri": s3_uri,
         "dataset_id": dataset_id,
-        "collection_id": dataset.collection_id.id,
+        "collection_id": collection_id,
         "collection_visibility": "PUBLIC" if collection.published_at is not None else "PRIVATE",
         "tombstoned": False,  # No longer applicable
     }
diff --git a/tests/unit/backend/layers/api/test_portal_api.py b/tests/unit/backend/layers/api/test_portal_api.py
index 02b92032a5d2d..1f2f5dd227751 100644
--- a/tests/unit/backend/layers/api/test_portal_api.py
+++ b/tests/unit/backend/layers/api/test_portal_api.py
@@ -7,6 +7,7 @@
 from unittest.mock import Mock, patch
 from backend.layers.business.entities import DatasetArtifactDownloadData
 from backend.layers.common.entities import (
+    CollectionVersionId,
     DatasetStatusKey,
 )
 from backend.layers.common.entities import (
@@ -1774,7 +1775,7 @@ def test__dataset_meta__ok(self):
         expected_identifiers = {
             "s3_uri": test_uri_0,
             "dataset_id": public_dataset.dataset_version_id,
-            "collection_id": public_dataset.collection_id,
+            "collection_id": public_dataset.collection_version_id,
             "collection_visibility": "PUBLIC",  # this is a published collection
             "tombstoned": False,
         }
@@ -1793,7 +1794,7 @@
         expected_identifiers = {
             "s3_uri": test_uri_1,
             "dataset_id": private_dataset.dataset_version_id,
-            "collection_id": private_dataset.collection_id,
+            "collection_id": private_dataset.collection_version_id,
             "collection_visibility": "PRIVATE",
             "tombstoned": False,
         }
@@ -1813,6 +1814,157 @@ def test__dataset_meta__404(self):
         response = self.app.get(test_url_404, headers)
         self.assertEqual(response.status_code, 404)

+    def test__explorer_portal_integration(self):
+        """
+        Tests the explorer <-> portal integration.
+        The steps carried out by this test are:
+        1. Generate the explorer_url
+        2. Call the `get_dataset_identifiers` endpoint, retrieve `collection_id` and `dataset_id` from there
+        3. Call the GET /collections/:collection_id endpoint, locate the dataset
+        """
+        headers = {"host": "localhost", "Content-Type": "application/json"}
+
+        def _call_meta_endpoint(explorer_url):
+            test_url = f"/dp/v1/datasets/meta?url={explorer_url}"
+            response = self.app.get(test_url, headers)
+            self.assertEqual(response.status_code, 200)
+            return json.loads(response.data)
+
+        def _call_collections_endpoint(collection_id):
+            test_url = f"/dp/v1/collections/{collection_id}"
+            response = self.app.get(test_url, headers)
+            self.assertEqual(response.status_code, 200)
+            return json.loads(response.data)
+
+        with self.subTest("Dataset belonging to an unpublished collection"):
+
+            test_uri = "some_uri_0"
+
+            dataset = self.generate_dataset(
+                artifacts=[DatasetArtifactUpdate(DatasetArtifactType.CXG, test_uri)],
+                publish=False,
+            )
+            # In this case, explorer_url points to the canonical link
+            explorer_url = f"http://base.url/{dataset.dataset_id}.cxg/"
+            meta_response = _call_meta_endpoint(explorer_url)
+
+            returned_collection_id = meta_response["collection_id"]
+            returned_dataset_id = meta_response["dataset_id"]
+
+            collections_response = _call_collections_endpoint(returned_collection_id)
+            datasets = collections_response["datasets"]
+            self.assertIn(returned_dataset_id, [dataset["id"] for dataset in datasets])
+
+        with self.subTest("Dataset belonging to a published collection"):
+
+            test_uri = "some_uri_1"
+
+            dataset = self.generate_dataset(
+                artifacts=[DatasetArtifactUpdate(DatasetArtifactType.CXG, test_uri)], publish=True
+            )
+            # In this case, explorer_url points to the canonical link
+            explorer_url = f"http://base.url/{dataset.dataset_id}.cxg/"
+            meta_response = _call_meta_endpoint(explorer_url)
+
+            returned_collection_id = meta_response["collection_id"]
+            returned_dataset_id = meta_response["dataset_id"]
+
+            collections_response = _call_collections_endpoint(returned_collection_id)
+            datasets = collections_response["datasets"]
+            self.assertIn(returned_dataset_id, [dataset["id"] for dataset in datasets])
+
+        with self.subTest("Dataset belonging to a revision of a published collection, not replaced"):
+
+            test_uri = "some_uri_2"
+
+            dataset = self.generate_dataset(
+                artifacts=[DatasetArtifactUpdate(DatasetArtifactType.CXG, test_uri)], publish=True
+            )
+            self.business_logic.create_collection_version(CollectionId(dataset.collection_id))
+
+            # In this case, explorer_url points to the versioned link
+            explorer_url = f"http://base.url/{dataset.dataset_version_id}.cxg/"
+            meta_response = _call_meta_endpoint(explorer_url)
+
+            returned_collection_id = meta_response["collection_id"]
+            returned_dataset_id = meta_response["dataset_id"]
+
+            collections_response = _call_collections_endpoint(returned_collection_id)
+            datasets = collections_response["datasets"]
+            self.assertIn(returned_dataset_id, [dataset["id"] for dataset in datasets])
+
+        with self.subTest("Dataset belonging to a revision of a published collection, replaced"):
+
+            test_uri = "some_uri_1"
+
+            dataset = self.generate_dataset(
+                artifacts=[DatasetArtifactUpdate(DatasetArtifactType.CXG, test_uri)], publish=True
+            )
+            revision = self.business_logic.create_collection_version(CollectionId(dataset.collection_id))
+            revised_dataset = self.generate_dataset(
+                artifacts=[DatasetArtifactUpdate(DatasetArtifactType.CXG, test_uri)],
+                collection_version=revision,
+                replace_dataset_version_id=DatasetVersionId(dataset.dataset_version_id),
+            )
+            self.assertEqual(revised_dataset.dataset_id, dataset.dataset_id)
+            self.assertNotEqual(revised_dataset.dataset_version_id, dataset.dataset_version_id)
+
+            # Retrieve the explorer url from the GET collections/:collection_id endpoint. This is the only way
+            # to force explorer_url to be exactly the same as the one the portal uses to open the explorer.
+            test_url = f"/dp/v1/collections/{revision.version_id}"
+            response = self.app.get(test_url, headers)
+            self.assertEqual(response.status_code, 200)
+            response_data = json.loads(response.data)
+            datasets = response_data["datasets"]
+            self.assertIn(revised_dataset.dataset_version_id, [dataset["id"] for dataset in datasets])
+            replaced_dataset = next(
+                dataset for dataset in datasets if dataset["id"] == revised_dataset.dataset_version_id
+            )
+
+            explorer_url = replaced_dataset["dataset_deployments"][0]["url"]
+            meta_response = _call_meta_endpoint(explorer_url)
+
+            returned_collection_id = meta_response["collection_id"]
+            returned_dataset_id = meta_response["dataset_id"]
+
+            collections_response = _call_collections_endpoint(returned_collection_id)
+            datasets = collections_response["datasets"]
+            self.assertIn(returned_dataset_id, [dataset["id"] for dataset in datasets])
+
+        with self.subTest("Dataset that appears in multiple published versions"):
+            """
+            If a dataset appears in multiple collection versions, the most recent one will be returned.
+            """
+            test_uri = "some_uri_1"
+
+            dataset = self.generate_dataset(
+                artifacts=[DatasetArtifactUpdate(DatasetArtifactType.CXG, test_uri)], publish=True
+            )
+            revision = self.business_logic.create_collection_version(CollectionId(dataset.collection_id))
+
+            self.business_logic.publish_collection_version(revision.version_id)
+
+            # Both versions are now published
+            original_version = self.business_logic.get_collection_version(
+                CollectionVersionId(dataset.collection_version_id)
+            )
+            revision_version = self.business_logic.get_collection_version(revision.version_id)
+
+            self.assertIsNotNone(original_version.published_at)
+            self.assertIsNotNone(revision_version.published_at)
+
+            explorer_url = f"http://base.url/{dataset.dataset_version_id}.cxg/"
+            meta_response = _call_meta_endpoint(explorer_url)
+
+            returned_collection_id = meta_response["collection_id"]
+            returned_dataset_id = meta_response["dataset_id"]
+
+            self.assertEqual(returned_collection_id, revision_version.version_id.id)
+
+            collections_response = _call_collections_endpoint(returned_collection_id)
+            datasets = collections_response["datasets"]
+            self.assertIn(returned_dataset_id, [dataset["id"] for dataset in datasets])
+

 class TestDatasetCurators(BaseAPIPortalTest):
     def setUp(self):
diff --git a/tests/unit/backend/layers/common/base_test.py b/tests/unit/backend/layers/common/base_test.py
index 451051cab627c..32145529138ee 100644
--- a/tests/unit/backend/layers/common/base_test.py
+++ b/tests/unit/backend/layers/common/base_test.py
@@ -16,6 +16,7 @@
     DatasetStatusGeneric,
     DatasetStatusKey,
     DatasetValidationStatus,
+    DatasetVersionId,
     Link,
     OntologyTermId,
 )
@@ -226,6 +227,7 @@ def generate_dataset(
         validation_message: str = None,
         artifacts: List[DatasetArtifactUpdate] = None,
         publish: bool = False,
+        replace_dataset_version_id: Optional[DatasetVersionId] = None,
     ) -> DatasetData:
         """
         Convenience method for generating a dataset. Also generates an unpublished collection if needed.
@@ -233,7 +235,7 @@
         if not collection_version:
             collection_version = self.generate_unpublished_collection(owner)
         dataset_version_id, dataset_id = self.business_logic.ingest_dataset(
-            collection_version.version_id, "http://fake.url", None, None
+            collection_version.version_id, "http://fake.url", None, replace_dataset_version_id
         )
         if not metadata:
             metadata = copy.deepcopy(self.sample_dataset_metadata)
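
---

A note on the lookup pattern (not part of the patches themselves): the double lookup introduced in PATCH 1/2 amounts to trying the canonical id first and falling back to the version id for the collection, then matching the dataset on either of its two ids within the resolved collection. The sketch below restates that logic in isolation; the dataclasses and the business_logic parameter are hypothetical stand-ins for the real interfaces in backend.layers.business, not code from this repository.

from dataclasses import dataclass, field
from typing import List, Optional


@dataclass
class DatasetVersion:
    dataset_id: str  # canonical id, stable across rewrites of the dataset
    version_id: str  # unique to this particular version


@dataclass
class CollectionVersion:
    version_id: str
    datasets: List[DatasetVersion] = field(default_factory=list)


class ForbiddenHTTPException(Exception):
    """Stand-in for the real 403 exception."""


def get_collection_and_dataset(business_logic, collection_id: str, dataset_id: str):
    # Double lookup for the collection: canonical id first, version id as fallback.
    collection: Optional[CollectionVersion] = business_logic.get_collection_version_from_canonical(collection_id)
    if collection is None:
        collection = business_logic.get_collection_version(collection_id)
        if collection is None:
            raise ForbiddenHTTPException()
    # Double lookup for the dataset: match either id within the resolved collection,
    # reporting a miss exactly like a missing collection.
    try:
        dataset = next(d for d in collection.datasets if dataset_id in (d.version_id, d.dataset_id))
    except StopIteration:
        raise ForbiddenHTTPException()
    return collection, dataset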