Commit

fix: improve dataset identifiers (#3967)
* fix: improve dataset identifiers

* Linter

* Test declutter

* new test+overhaul
ebezzi authored Jan 13, 2023
1 parent e14aa76 commit 8a52c64
Showing 3 changed files with 174 additions and 8 deletions.
22 changes: 17 additions & 5 deletions backend/layers/api/portal_api.py
@@ -650,7 +650,7 @@ def delete_dataset(dataset_id: str, token_info: dict):
 
 def get_dataset_identifiers(url: str):
     """
-    a.k.a. the meta endpoint
+    Return a set of dataset identifiers. This endpoint is meant to be used by single-cell-explorer.
     """
     try:
         path = urlparse(url).path
@@ -665,19 +665,31 @@ def get_dataset_identifiers(url: str):
     if dataset is None:
         raise NotFoundHTTPException()
 
+    # A dataset version can appear in multiple collection versions. This endpoint should:
+    # 1. Return the most recent published version that contains the dataset version (aka the mapped version)
+    # 2. If the version only appears in an unpublished version, return that one.
+
     collection = get_business_logic().get_collection_version_from_canonical(dataset.collection_id)
-    if collection is None:  # orphaned datasets
+    if collection is None:  # orphaned datasets - shouldn't happen, but we should return 404 just in case
         raise NotFoundHTTPException()
 
+    if dataset.version_id not in [d.version_id for d in collection.datasets]:
+        # If the dataset is not in the mapped collection version, it means the dataset belongs to the active
+        # unpublished version. We should return that one.
+        collection = get_business_logic().get_unpublished_collection_version_from_canonical(dataset.collection_id)
+
+        if collection is None:  # again, orphaned datasets
+            raise NotFoundHTTPException()
+
+    collection_id, dataset_id = collection.version_id.id, dataset.version_id.id
+
     # Retrieves the URI of the cxg artifact
     s3_uri = next(a.uri for a in dataset.artifacts if a.type == DatasetArtifactType.CXG)
 
-    dataset_id = dataset.version_id.id
-
     dataset_identifiers = {
         "s3_uri": s3_uri,
         "dataset_id": dataset_id,
-        "collection_id": dataset.collection_id.id,
+        "collection_id": collection_id,
         "collection_visibility": "PUBLIC" if collection.published_at is not None else "PRIVATE",
         "tombstoned": False,  # No longer applicable
     }
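For context, a minimal client-side sketch of the contract this endpoint exposes (not part of the commit: the localhost host, the placeholder explorer link, and the use of the requests library are assumptions; the path and response keys match test__explorer_portal_integration below):

    # Hypothetical client sketch; host, placeholder URL, and `requests` usage are assumptions.
    import requests

    explorer_url = "http://base.url/example-dataset-version-id.cxg/"  # placeholder explorer link
    response = requests.get("http://localhost/dp/v1/datasets/meta", params={"url": explorer_url})
    response.raise_for_status()
    identifiers = response.json()

    collection_version_id = identifiers["collection_id"]  # now a collection *version* id
    dataset_version_id = identifiers["dataset_id"]        # dataset *version* id
    cxg_s3_uri = identifiers["s3_uri"]                    # URI of the CXG artifact
    visibility = identifiers["collection_visibility"]     # "PUBLIC" or "PRIVATE"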
156 changes: 154 additions & 2 deletions tests/unit/backend/layers/api/test_portal_api.py
@@ -7,6 +7,7 @@
 from unittest.mock import Mock, patch
 from backend.layers.business.entities import DatasetArtifactDownloadData
 from backend.layers.common.entities import (
+    CollectionVersionId,
     DatasetStatusKey,
 )
 from backend.layers.common.entities import (
@@ -1774,7 +1775,7 @@ def test__dataset_meta__ok(self):
         expected_identifiers = {
             "s3_uri": test_uri_0,
             "dataset_id": public_dataset.dataset_version_id,
-            "collection_id": public_dataset.collection_id,
+            "collection_id": public_dataset.collection_version_id,
             "collection_visibility": "PUBLIC",  # this is a published collection
             "tombstoned": False,
         }
@@ -1793,7 +1794,7 @@ def test__dataset_meta__ok(self):
         expected_identifiers = {
             "s3_uri": test_uri_1,
             "dataset_id": private_dataset.dataset_version_id,
-            "collection_id": private_dataset.collection_id,
+            "collection_id": private_dataset.collection_version_id,
             "collection_visibility": "PRIVATE",
             "tombstoned": False,
         }
@@ -1813,6 +1814,157 @@ def test__dataset_meta__404(self):
         response = self.app.get(test_url_404, headers)
         self.assertEqual(response.status_code, 404)
 
+    def test__explorer_portal_integration(self):
+        """
+        Tests the explorer <-> portal integration.
+        The steps carried out by this test are:
+        1. Generate the explorer_url
+        2. Call the `get_dataset_identifiers` endpoint, retrieve `collection_id` and `dataset_id` from there
+        3. Call the GET /collections/:collection_id endpoint, locate the dataset
+        """
+        headers = {"host": "localhost", "Content-Type": "application/json"}
+
+        def _call_meta_endpoint(explorer_url):
+            test_url = f"/dp/v1/datasets/meta?url={explorer_url}"
+            response = self.app.get(test_url, headers)
+            self.assertEqual(response.status_code, 200)
+            return json.loads(response.data)
+
+        def _call_collections_endpoint(collection_id):
+            test_url = f"/dp/v1/collections/{collection_id}"
+            response = self.app.get(test_url, headers)
+            self.assertEqual(response.status_code, 200)
+            return json.loads(response.data)
+
+        with self.subTest("Dataset belonging to an unpublished collection"):
+
+            test_uri = "some_uri_0"
+
+            dataset = self.generate_dataset(
+                artifacts=[DatasetArtifactUpdate(DatasetArtifactType.CXG, test_uri)],
+                publish=False,
+            )
+            # In this case, explorer_url points to the canonical link
+            explorer_url = f"http://base.url/{dataset.dataset_id}.cxg/"
+            meta_response = _call_meta_endpoint(explorer_url)
+
+            returned_collection_id = meta_response["collection_id"]
+            returned_dataset_id = meta_response["dataset_id"]
+
+            collections_response = _call_collections_endpoint(returned_collection_id)
+            datasets = collections_response["datasets"]
+            self.assertIn(returned_dataset_id, [dataset["id"] for dataset in datasets])
+
+        with self.subTest("Dataset belonging to a published collection"):
+
+            test_uri = "some_uri_1"
+
+            dataset = self.generate_dataset(
+                artifacts=[DatasetArtifactUpdate(DatasetArtifactType.CXG, test_uri)], publish=True
+            )
+            # In this case, explorer_url points to the canonical link
+            explorer_url = f"http://base.url/{dataset.dataset_id}.cxg/"
+            meta_response = _call_meta_endpoint(explorer_url)
+
+            returned_collection_id = meta_response["collection_id"]
+            returned_dataset_id = meta_response["dataset_id"]
+
+            collections_response = _call_collections_endpoint(returned_collection_id)
+            datasets = collections_response["datasets"]
+            self.assertIn(returned_dataset_id, [dataset["id"] for dataset in datasets])
+
+        with self.subTest("Dataset belonging to a revision of a published collection, not replaced"):
+
+            test_uri = "some_uri_2"
+
+            dataset = self.generate_dataset(
+                artifacts=[DatasetArtifactUpdate(DatasetArtifactType.CXG, test_uri)], publish=True
+            )
+            self.business_logic.create_collection_version(CollectionId(dataset.collection_id))
+
+            # In this case, explorer_url points to the versioned link
+            explorer_url = f"http://base.url/{dataset.dataset_version_id}.cxg/"
+            meta_response = _call_meta_endpoint(explorer_url)
+
+            returned_collection_id = meta_response["collection_id"]
+            returned_dataset_id = meta_response["dataset_id"]
+
+            collections_response = _call_collections_endpoint(returned_collection_id)
+            datasets = collections_response["datasets"]
+            self.assertIn(returned_dataset_id, [dataset["id"] for dataset in datasets])
+
+        with self.subTest("Dataset belonging to a revision of a published collection, replaced"):
+
+            test_uri = "some_uri_1"
+
+            dataset = self.generate_dataset(
+                artifacts=[DatasetArtifactUpdate(DatasetArtifactType.CXG, test_uri)], publish=True
+            )
+            revision = self.business_logic.create_collection_version(CollectionId(dataset.collection_id))
+            revised_dataset = self.generate_dataset(
+                artifacts=[DatasetArtifactUpdate(DatasetArtifactType.CXG, test_uri)],
+                collection_version=revision,
+                replace_dataset_version_id=DatasetVersionId(dataset.dataset_version_id),
+            )
+            self.assertEqual(revised_dataset.dataset_id, dataset.dataset_id)
+            self.assertNotEqual(revised_dataset.dataset_version_id, dataset.dataset_version_id)
+
+            # Retrieve the explorer url from the GET /collections/:collection_id endpoint. This is the only way
+            # to force explorer_url to be exactly the same as the one the portal uses to open the explorer.
+            test_url = f"/dp/v1/collections/{revision.version_id}"
+            response = self.app.get(test_url, headers)
+            self.assertEqual(response.status_code, 200)
+            response_data = json.loads(response.data)
+            datasets = response_data["datasets"]
+            self.assertIn(revised_dataset.dataset_version_id, [dataset["id"] for dataset in datasets])
+            replaced_dataset = next(
+                dataset for dataset in datasets if dataset["id"] == revised_dataset.dataset_version_id
+            )
+
+            explorer_url = replaced_dataset["dataset_deployments"][0]["url"]
+            meta_response = _call_meta_endpoint(explorer_url)
+
+            returned_collection_id = meta_response["collection_id"]
+            returned_dataset_id = meta_response["dataset_id"]
+
+            collections_response = _call_collections_endpoint(returned_collection_id)
+            datasets = collections_response["datasets"]
+            self.assertIn(returned_dataset_id, [dataset["id"] for dataset in datasets])
+
+        with self.subTest("Dataset that appears in multiple published versions"):
+            """
+            If a dataset appears in multiple collection versions, the most recent one will be returned.
+            """
+            test_uri = "some_uri_1"
+
+            dataset = self.generate_dataset(
+                artifacts=[DatasetArtifactUpdate(DatasetArtifactType.CXG, test_uri)], publish=True
+            )
+            revision = self.business_logic.create_collection_version(CollectionId(dataset.collection_id))
+
+            self.business_logic.publish_collection_version(revision.version_id)
+
+            # Both versions are now published
+            original_version = self.business_logic.get_collection_version(
+                CollectionVersionId(dataset.collection_version_id)
+            )
+            revision_version = self.business_logic.get_collection_version(revision.version_id)
+
+            self.assertIsNotNone(original_version.published_at)
+            self.assertIsNotNone(revision_version.published_at)
+
+            explorer_url = f"http://base.url/{dataset.dataset_version_id}.cxg/"
+            meta_response = _call_meta_endpoint(explorer_url)
+
+            returned_collection_id = meta_response["collection_id"]
+            returned_dataset_id = meta_response["dataset_id"]
+
+            self.assertEqual(returned_collection_id, revision_version.version_id.id)
+
+            collections_response = _call_collections_endpoint(returned_collection_id)
+            datasets = collections_response["datasets"]
+            self.assertIn(returned_dataset_id, [dataset["id"] for dataset in datasets])
+
 
 class TestDatasetCurators(BaseAPIPortalTest):
     def setUp(self):
4 changes: 3 additions & 1 deletion tests/unit/backend/layers/common/base_test.py
@@ -16,6 +16,7 @@
     DatasetStatusGeneric,
     DatasetStatusKey,
     DatasetValidationStatus,
+    DatasetVersionId,
     Link,
     OntologyTermId,
 )
@@ -226,14 +227,15 @@ def generate_dataset(
         validation_message: str = None,
         artifacts: List[DatasetArtifactUpdate] = None,
         publish: bool = False,
+        replace_dataset_version_id: Optional[DatasetVersionId] = None,
    ) -> DatasetData:
         """
         Convenience method for generating a dataset. Also generates an unpublished collection if needed.
         """
         if not collection_version:
             collection_version = self.generate_unpublished_collection(owner)
         dataset_version_id, dataset_id = self.business_logic.ingest_dataset(
-            collection_version.version_id, "http://fake.url", None, None
+            collection_version.version_id, "http://fake.url", None, replace_dataset_version_id
         )
         if not metadata:
             metadata = copy.deepcopy(self.sample_dataset_metadata)
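For quick reference, a hypothetical sketch of the two calling modes generate_dataset now supports (names come from this diff; revision and existing_dataset stand in for fixtures inside a BaseAPIPortalTest method):

    # Append a brand-new dataset to a collection version (previous behavior, unchanged):
    new_dataset = self.generate_dataset(collection_version=revision)

    # Replace an existing dataset version in place (enabled by this commit); the
    # canonical dataset_id is preserved while the dataset_version_id changes:
    replacement = self.generate_dataset(
        collection_version=revision,
        replace_dataset_version_id=DatasetVersionId(existing_dataset.dataset_version_id),
    )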
