From e14aa7691e8f2530e14f692c16961779b05cc408 Mon Sep 17 00:00:00 2001
From: Emanuele Bezzi
Date: Fri, 13 Jan 2023 16:07:52 -0500
Subject: [PATCH 1/2] fix: support null metadata and double lookup for write
 datasets (#3968)

* fix: support null metadata and double lookup for write datasets

* PR changes

* split test into 2
---
 backend/layers/business/business_interface.py |  7 +-
 .../datasets/dataset_id/actions.py            | 33 +++++----
 .../v1/curation/collections/common.py         | 18 ++---
 .../backend/layers/api/test_curation_api.py   | 72 ++++++++++++++-----
 4 files changed, 87 insertions(+), 43 deletions(-)

diff --git a/backend/layers/business/business_interface.py b/backend/layers/business/business_interface.py
index a2fdbcd16f363..0d8fa37f41e74 100644
--- a/backend/layers/business/business_interface.py
+++ b/backend/layers/business/business_interface.py
@@ -10,6 +10,7 @@
     CollectionMetadata,
     CollectionVersion,
     CollectionVersionId,
+    CollectionVersionWithDatasets,
     DatasetArtifact,
     DatasetArtifactId,
     DatasetId,
@@ -29,13 +30,15 @@ def get_collections(self, filter: CollectionQueryFilter) -> Iterable[CollectionV
     def get_published_collection_version(self, collection_id: CollectionId) -> CollectionVersion:
         pass

-    def get_collection_version(self, version_id: CollectionVersionId) -> CollectionVersion:
+    def get_collection_version(self, version_id: CollectionVersionId) -> CollectionVersionWithDatasets:
         pass

     def get_collection_versions_from_canonical(self, collection_id: CollectionId) -> Iterable[CollectionVersion]:
         pass

-    def get_collection_version_from_canonical(self, collection_id: CollectionId) -> Optional[CollectionVersion]:
+    def get_collection_version_from_canonical(
+        self, collection_id: CollectionId
+    ) -> Optional[CollectionVersionWithDatasets]:
         pass

     def create_collection(
diff --git a/backend/portal/api/curation/v1/curation/collections/collection_id/datasets/dataset_id/actions.py b/backend/portal/api/curation/v1/curation/collections/collection_id/datasets/dataset_id/actions.py
index 5af5aff620f44..5f6d6492b3895 100644
--- a/backend/portal/api/curation/v1/curation/collections/collection_id/datasets/dataset_id/actions.py
+++ b/backend/portal/api/curation/v1/curation/collections/collection_id/datasets/dataset_id/actions.py
@@ -26,7 +26,6 @@
     CollectionVersionId,
     CollectionVersionWithDatasets,
     DatasetVersion,
-    DatasetVersionId,
 )
 from backend.portal.api.curation.v1.curation.collections.common import (
     get_infered_dataset_version,
@@ -63,26 +62,34 @@ def get(collection_id: str, dataset_id: str = None):


 def _get_collection_and_dataset(
-    business_logic: BusinessLogicInterface, collection_id: CollectionVersionId, dataset_id: DatasetVersionId
+    business_logic: BusinessLogicInterface, collection_id: str, dataset_id: str
 ) -> Tuple[CollectionVersionWithDatasets, DatasetVersion]:
-    dataset_version = business_logic.get_dataset_version(dataset_id)
-    if dataset_version is None:
-        raise ForbiddenHTTPException()
+    """
+    Get collection and dataset by their ids. Each id may be either a version id or a
+    canonical id; both lookups are attempted for each.
+    """

-    collection_version = business_logic.get_collection_version(collection_id)
+    collection_version = business_logic.get_collection_version_from_canonical(CollectionId(collection_id))
     if collection_version is None:
+        collection_version = business_logic.get_collection_version(CollectionVersionId(collection_id))
+        if collection_version is None:
+            raise ForbiddenHTTPException()
+
+    # Extract the dataset from the dataset list.
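+    # Match on either the version id or the canonical id, mirroring the double
+    # lookup performed for the collection above.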
+    try:
+        dataset_version = next(
+            d for d in collection_version.datasets if d.version_id.id == dataset_id or d.dataset_id.id == dataset_id
+        )
+    except StopIteration:
         raise ForbiddenHTTPException()

     return collection_version, dataset_version


-def delete(token_info: dict, collection_id: str, dataset_id: str = None):
+def delete(token_info: dict, collection_id: str, dataset_id: str):
     business_logic = get_business_logic()
     user_info = UserInfo(token_info)
-    collection_version, dataset_version = _get_collection_and_dataset(
-        business_logic, CollectionVersionId(collection_id), DatasetVersionId(dataset_id)
-    )
+    collection_version, dataset_version = _get_collection_and_dataset(business_logic, collection_id, dataset_id)

     if not user_info.is_user_owner_or_allowed(collection_version.owner):
         raise ForbiddenHTTPException("Unauthorized")
@@ -105,9 +112,7 @@ def put(collection_id: str, dataset_id: str, body: dict, token_info: dict):
     url = body.get("url", body.get("link"))
     business_logic = get_business_logic()

-    collection_version, _ = _get_collection_and_dataset(
-        business_logic, CollectionVersionId(collection_id), DatasetVersionId(dataset_id)
-    )
+    collection_version, dataset_version = _get_collection_and_dataset(business_logic, collection_id, dataset_id)

     if not UserInfo(token_info).is_user_owner_or_allowed(collection_version.owner):
         raise ForbiddenHTTPException()
@@ -117,7 +122,7 @@
             collection_version.version_id,
             url,
             None,
-            None if dataset_id is None else DatasetVersionId(dataset_id),
+            None if dataset_id is None else dataset_version.version_id,
         )
         return Response(status=202)
     except CollectionNotFoundException:
diff --git a/backend/portal/api/curation/v1/curation/collections/common.py b/backend/portal/api/curation/v1/curation/collections/common.py
index 0fde8dd0f8051..69b3aaff8ca7f 100644
--- a/backend/portal/api/curation/v1/curation/collections/common.py
+++ b/backend/portal/api/curation/v1/curation/collections/common.py
@@ -162,14 +162,16 @@ def reshape_dataset_for_curation_api(
     else:
         columns = EntityColumns.dataset_metadata_cols

-    # Get dataset metadata fields
-    for column in columns:
-        col = getattr(dataset_version.metadata, column)
-        if isinstance(col, OntologyTermId):
-            col = [asdict(col)]
-        elif isinstance(col, list) and len(col) != 0 and isinstance(col[0], OntologyTermId):
-            col = [asdict(i) for i in col]
-        ds[column] = col
+    # Get dataset metadata fields.
+    # Metadata can be None if the dataset isn't fully processed yet, so we account for that.
+    if dataset_version.metadata is not None:
+        for column in columns:
+            col = getattr(dataset_version.metadata, column)
+            if isinstance(col, OntologyTermId):
+                col = [asdict(col)]
+            elif isinstance(col, list) and len(col) != 0 and isinstance(col[0], OntologyTermId):
+                col = [asdict(i) for i in col]
+            ds[column] = col

     # Get non-preview specific dataset fields
     if not preview:
diff --git a/tests/unit/backend/layers/api/test_curation_api.py b/tests/unit/backend/layers/api/test_curation_api.py
index 73782ef7b5105..8db9d35cfbee0 100644
--- a/tests/unit/backend/layers/api/test_curation_api.py
+++ b/tests/unit/backend/layers/api/test_curation_api.py
@@ -952,26 +952,50 @@ def test__update_public_collection_owner__405(self):


 class TestDeleteDataset(BaseAPIPortalTest):
-    def test__delete_dataset(self):
-        auth_credentials = [
+    def setUp(self):
+        super().setUp()
+        self.auth_credentials = [
             (self.make_super_curator_header, "super", 202),
             (self.make_owner_header, "owner", 202),
             (None, "none", 401),
             (self.make_not_owner_header, "not_owner", 403),
         ]
-        for auth, auth_description, expected_status_code in auth_credentials:
+
+    def _delete(self, auth, collection_id, dataset_id):
+        """
+        Helper method to call the delete endpoint
+        """
+        test_url = f"/curation/v1/collections/{collection_id}/datasets/{dataset_id}"
+        headers = auth() if callable(auth) else auth
+        return self.app.delete(test_url, headers=headers)
+
+    def test__delete_dataset_by_version_id(self):
+        """
+        Calling DELETE /collections/:collection_id/datasets/:dataset_id should succeed or fail
+        according to the auth token passed, when using version ids
+        """
+        for auth, auth_description, expected_status_code in self.auth_credentials:
             with self.subTest(f"{auth_description} {expected_status_code}"):
                 dataset = self.generate_dataset(
                     statuses=[DatasetStatusUpdate(DatasetStatusKey.UPLOAD, DatasetUploadStatus.UPLOADING)],
                     publish=False,
                 )
+                response = self._delete(auth, dataset.collection_version_id, dataset.dataset_version_id)
+                self.assertEqual(expected_status_code, response.status_code)

-                test_url = (
-                    f"/curation/v1/collections/{dataset.collection_version_id}/datasets/"
-                    f"{dataset.dataset_version_id}"
+    def test__delete_dataset_by_canonical_id(self):
+        """
+        Calling DELETE /collections/:collection_id/datasets/:dataset_id should succeed or fail
+        according to the auth token passed, when using canonical ids. In this case, the unpublished
+        collection version will be looked up and used for deletion.
+        """
+        for auth, auth_description, expected_status_code in self.auth_credentials:
+            with self.subTest(f"{auth_description} {expected_status_code}"):
+                dataset = self.generate_dataset(
+                    statuses=[DatasetStatusUpdate(DatasetStatusKey.UPLOAD, DatasetUploadStatus.UPLOADING)],
+                    publish=False,
                 )
-                headers = auth() if callable(auth) else auth
-                response = self.app.delete(test_url, headers=headers)
+                response = self._delete(auth, dataset.collection_id, dataset.dataset_id)
                 self.assertEqual(expected_status_code, response.status_code)
@@ -1191,17 +1215,27 @@ def test__new_from_link__OK(self, *mocks):
         """
         Calling PUT /datasets/:dataset_id should succeed if a valid link is uploaded by the owner of the collection
         """
-        dataset = self.generate_dataset(
-            statuses=[DatasetStatusUpdate(DatasetStatusKey.PROCESSING, DatasetProcessingStatus.INITIALIZED)],
-        )
-        body = {"link": self.good_link}
-        headers = self.make_owner_header()
-        response = self.app.put(
-            f"/curation/v1/collections/{dataset.collection_version_id}/datasets/{dataset.dataset_version_id}",
-            json=body,
-            headers=headers,
-        )
-        self.assertEqual(202, response.status_code)
+
+        def _test_create(collection_id, dataset_id):
+            body = {"link": self.good_link}
+            headers = self.make_owner_header()
+            response = self.app.put(
+                f"/curation/v1/collections/{collection_id}/datasets/{dataset_id}",
+                json=body,
+                headers=headers,
+            )
+            self.assertEqual(202, response.status_code)
+
+        with self.subTest("with version_ids"):
+            dataset = self.generate_dataset(
+                statuses=[DatasetStatusUpdate(DatasetStatusKey.PROCESSING, DatasetProcessingStatus.INITIALIZED)],
+            )
+            _test_create(dataset.collection_version_id, dataset.dataset_version_id)
+
+        with self.subTest("with collection_ids"):
+            dataset = self.generate_dataset(
+                statuses=[DatasetStatusUpdate(DatasetStatusKey.PROCESSING, DatasetProcessingStatus.INITIALIZED)],
+            )
+            _test_create(dataset.collection_id, dataset.dataset_id)

     def test__new_from_link__Super_Curator(self, *mocks):
         """

From 8a52c649e1f5d881aca42f6a9cfbc3b807e37287 Mon Sep 17 00:00:00 2001
From: Emanuele Bezzi
Date: Fri, 13 Jan 2023 17:26:38 -0500
Subject: [PATCH 2/2] fix: improve dataset identifiers (#3967)

* fix: improve dataset identifiers

* Linter

* Test declutter

* new test+overhaul
---
 backend/layers/api/portal_api.py              |  22 ++-
 .../backend/layers/api/test_portal_api.py     | 156 +++++++++++++++++-
 tests/unit/backend/layers/common/base_test.py |   4 +-
 3 files changed, 174 insertions(+), 8 deletions(-)

diff --git a/backend/layers/api/portal_api.py b/backend/layers/api/portal_api.py
index 068b229f91f0b..dd57d847b3599 100644
--- a/backend/layers/api/portal_api.py
+++ b/backend/layers/api/portal_api.py
@@ -650,7 +650,7 @@ def delete_dataset(dataset_id: str, token_info: dict):

 def get_dataset_identifiers(url: str):
     """
-    a.k.a. the meta endpoint
+    Return a set of dataset identifiers. This endpoint is meant to be used by single-cell-explorer.
     """
     try:
         path = urlparse(url).path
@@ -665,19 +665,31 @@
     if dataset is None:
         raise NotFoundHTTPException()

+    # A dataset version can appear in multiple collection versions. This endpoint should:
+    # 1. Return the most recent published version that contains the dataset version (aka the mapped version)
+    # 2. If the version only appears in an unpublished version, return that one.
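+    # The canonical lookup below resolves case 1 (the mapped version); the fallback
+    # that follows handles case 2.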
+
     collection = get_business_logic().get_collection_version_from_canonical(dataset.collection_id)
-    if collection is None:  # orphaned datasets
+    if collection is None:  # orphaned datasets - shouldn't happen, but we should return 404 just in case
+        raise NotFoundHTTPException()
+
+    if dataset.version_id not in [d.version_id for d in collection.datasets]:
+        # If the dataset is not in the mapped collection version, it means the dataset belongs to the active
+        # unpublished version. We should return that one
+        collection = get_business_logic().get_unpublished_collection_version_from_canonical(dataset.collection_id)
+
+    if collection is None:  # again, orphaned datasets
         raise NotFoundHTTPException()

+    collection_id, dataset_id = collection.version_id.id, dataset.version_id.id
+
     # Retrieves the URI of the cxg artifact
     s3_uri = next(a.uri for a in dataset.artifacts if a.type == DatasetArtifactType.CXG)

-    dataset_id = dataset.version_id.id
-
     dataset_identifiers = {
         "s3_uri": s3_uri,
         "dataset_id": dataset_id,
-        "collection_id": dataset.collection_id.id,
+        "collection_id": collection_id,
         "collection_visibility": "PUBLIC" if collection.published_at is not None else "PRIVATE",
         "tombstoned": False,  # No longer applicable
     }
diff --git a/tests/unit/backend/layers/api/test_portal_api.py b/tests/unit/backend/layers/api/test_portal_api.py
index 02b92032a5d2d..1f2f5dd227751 100644
--- a/tests/unit/backend/layers/api/test_portal_api.py
+++ b/tests/unit/backend/layers/api/test_portal_api.py
@@ -7,6 +7,7 @@
 from unittest.mock import Mock, patch
 from backend.layers.business.entities import DatasetArtifactDownloadData
 from backend.layers.common.entities import (
+    CollectionVersionId,
     DatasetStatusKey,
 )
 from backend.layers.common.entities import (
@@ -1774,7 +1775,7 @@ def test__dataset_meta__ok(self):
         expected_identifiers = {
             "s3_uri": test_uri_0,
             "dataset_id": public_dataset.dataset_version_id,
-            "collection_id": public_dataset.collection_id,
+            "collection_id": public_dataset.collection_version_id,
             "collection_visibility": "PUBLIC",  # this is a published collection
             "tombstoned": False,
         }
@@ -1793,7 +1794,7 @@
         expected_identifiers = {
             "s3_uri": test_uri_1,
             "dataset_id": private_dataset.dataset_version_id,
-            "collection_id": private_dataset.collection_id,
+            "collection_id": private_dataset.collection_version_id,
             "collection_visibility": "PRIVATE",
             "tombstoned": False,
         }
@@ -1813,6 +1814,157 @@ def test__dataset_meta__404(self):
         response = self.app.get(test_url_404, headers)
         self.assertEqual(response.status_code, 404)

+    def test__explorer_portal_integration(self):
+        """
+        Tests the explorer <-> portal integration.
+        The steps carried out by this test are:
+        1. Generate the explorer_url
+        2. Call the `get_dataset_identifiers` endpoint, retrieve `collection_id` and `dataset_id` from there
+        3. Call the GET /collections/:collection_id endpoint, locate the dataset
+        """
+        headers = {"host": "localhost", "Content-Type": "application/json"}
+
+        def _call_meta_endpoint(explorer_url):
+            test_url = f"/dp/v1/datasets/meta?url={explorer_url}"
+            response = self.app.get(test_url, headers)
+            self.assertEqual(response.status_code, 200)
+            return json.loads(response.data)
+
+        def _call_collections_endpoint(collection_id):
+            test_url = f"/dp/v1/collections/{collection_id}"
+            response = self.app.get(test_url, headers)
+            self.assertEqual(response.status_code, 200)
+            return json.loads(response.data)
+
+        with self.subTest("Dataset belonging to an unpublished collection"):
+
+            test_uri = "some_uri_0"
+
+            dataset = self.generate_dataset(
+                artifacts=[DatasetArtifactUpdate(DatasetArtifactType.CXG, test_uri)],
+                publish=False,
+            )
+            # In this case, explorer_url points to the canonical link
+            explorer_url = f"http://base.url/{dataset.dataset_id}.cxg/"
+            meta_response = _call_meta_endpoint(explorer_url)
+
+            returned_collection_id = meta_response["collection_id"]
+            returned_dataset_id = meta_response["dataset_id"]
+
+            collections_response = _call_collections_endpoint(returned_collection_id)
+            datasets = collections_response["datasets"]
+            self.assertIn(returned_dataset_id, [dataset["id"] for dataset in datasets])
+
+        with self.subTest("Dataset belonging to a published collection"):
+
+            test_uri = "some_uri_1"
+
+            dataset = self.generate_dataset(
+                artifacts=[DatasetArtifactUpdate(DatasetArtifactType.CXG, test_uri)], publish=True
+            )
+            # In this case, explorer_url points to the canonical link
+            explorer_url = f"http://base.url/{dataset.dataset_id}.cxg/"
+            meta_response = _call_meta_endpoint(explorer_url)
+
+            returned_collection_id = meta_response["collection_id"]
+            returned_dataset_id = meta_response["dataset_id"]
+
+            collections_response = _call_collections_endpoint(returned_collection_id)
+            datasets = collections_response["datasets"]
+            self.assertIn(returned_dataset_id, [dataset["id"] for dataset in datasets])
+
+        with self.subTest("Dataset belonging to a revision of a published collection, not replaced"):
+
+            test_uri = "some_uri_2"
+
+            dataset = self.generate_dataset(
+                artifacts=[DatasetArtifactUpdate(DatasetArtifactType.CXG, test_uri)], publish=True
+            )
+            self.business_logic.create_collection_version(CollectionId(dataset.collection_id))
+
+            # In this case, explorer_url points to the versioned link
+            explorer_url = f"http://base.url/{dataset.dataset_version_id}.cxg/"
+            meta_response = _call_meta_endpoint(explorer_url)
+
+            returned_collection_id = meta_response["collection_id"]
+            returned_dataset_id = meta_response["dataset_id"]
+
+            collections_response = _call_collections_endpoint(returned_collection_id)
+            datasets = collections_response["datasets"]
+            self.assertIn(returned_dataset_id, [dataset["id"] for dataset in datasets])
+
+        with self.subTest("Dataset belonging to a revision of a published collection, replaced"):
+
+            test_uri = "some_uri_1"
+
+            dataset = self.generate_dataset(
+                artifacts=[DatasetArtifactUpdate(DatasetArtifactType.CXG, test_uri)], publish=True
+            )
+            revision = self.business_logic.create_collection_version(CollectionId(dataset.collection_id))
+            revised_dataset = self.generate_dataset(
+                artifacts=[DatasetArtifactUpdate(DatasetArtifactType.CXG, test_uri)],
+                collection_version=revision,
+                replace_dataset_version_id=DatasetVersionId(dataset.dataset_version_id),
+            )
+            self.assertEqual(revised_dataset.dataset_id, dataset.dataset_id)
+            self.assertNotEqual(revised_dataset.dataset_version_id, dataset.dataset_version_id)
+
+            # Retrieve the explorer url from the GET collections/:collection_id endpoint. This is the only way
+            # to force explorer_url to be exactly the same as the one the portal uses to open the explorer.
+            test_url = f"/dp/v1/collections/{revision.version_id}"
+            response = self.app.get(test_url, headers)
+            self.assertEqual(response.status_code, 200)
+            response_data = json.loads(response.data)
+            datasets = response_data["datasets"]
+            self.assertIn(revised_dataset.dataset_version_id, [dataset["id"] for dataset in datasets])
+            replaced_dataset = next(
+                dataset for dataset in datasets if dataset["id"] == revised_dataset.dataset_version_id
+            )
+
+            explorer_url = replaced_dataset["dataset_deployments"][0]["url"]
+            meta_response = _call_meta_endpoint(explorer_url)
+
+            returned_collection_id = meta_response["collection_id"]
+            returned_dataset_id = meta_response["dataset_id"]
+
+            collections_response = _call_collections_endpoint(returned_collection_id)
+            datasets = collections_response["datasets"]
+            self.assertIn(returned_dataset_id, [dataset["id"] for dataset in datasets])
+
+        with self.subTest("Dataset that appears in multiple published versions"):
+            """
+            If a dataset appears in multiple collection versions, the most recent one will be returned.
+            """
+            test_uri = "some_uri_1"
+
+            dataset = self.generate_dataset(
+                artifacts=[DatasetArtifactUpdate(DatasetArtifactType.CXG, test_uri)], publish=True
+            )
+            revision = self.business_logic.create_collection_version(CollectionId(dataset.collection_id))
+
+            self.business_logic.publish_collection_version(revision.version_id)
+
+            # Both versions are now published
+            original_version = self.business_logic.get_collection_version(
+                CollectionVersionId(dataset.collection_version_id)
+            )
+            revision_version = self.business_logic.get_collection_version(revision.version_id)
+
+            self.assertIsNotNone(original_version.published_at)
+            self.assertIsNotNone(revision_version.published_at)
+
+            explorer_url = f"http://base.url/{dataset.dataset_version_id}.cxg/"
+            meta_response = _call_meta_endpoint(explorer_url)
+
+            returned_collection_id = meta_response["collection_id"]
+            returned_dataset_id = meta_response["dataset_id"]
+
+            self.assertEqual(returned_collection_id, revision_version.version_id.id)
+
+            collections_response = _call_collections_endpoint(returned_collection_id)
+            datasets = collections_response["datasets"]
+            self.assertIn(returned_dataset_id, [dataset["id"] for dataset in datasets])
+

 class TestDatasetCurators(BaseAPIPortalTest):
     def setUp(self):
diff --git a/tests/unit/backend/layers/common/base_test.py b/tests/unit/backend/layers/common/base_test.py
index 451051cab627c..32145529138ee 100644
--- a/tests/unit/backend/layers/common/base_test.py
+++ b/tests/unit/backend/layers/common/base_test.py
@@ -16,6 +16,7 @@
     DatasetStatusGeneric,
     DatasetStatusKey,
     DatasetValidationStatus,
+    DatasetVersionId,
     Link,
     OntologyTermId,
 )
@@ -226,6 +227,7 @@ def generate_dataset(
         validation_message: str = None,
         artifacts: List[DatasetArtifactUpdate] = None,
         publish: bool = False,
+        replace_dataset_version_id: Optional[DatasetVersionId] = None,
     ) -> DatasetData:
         """
         Convenience method for generating a dataset. Also generates an unpublished collection if needed.
@@ -233,7 +235,7 @@
         if not collection_version:
             collection_version = self.generate_unpublished_collection(owner)
         dataset_version_id, dataset_id = self.business_logic.ingest_dataset(
-            collection_version.version_id, "http://fake.url", None, None
+            collection_version.version_id, "http://fake.url", None, replace_dataset_version_id
         )
         if not metadata:
             metadata = copy.deepcopy(self.sample_dataset_metadata)
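
---

A note on the lookup pattern (not part of the patches themselves): the double lookup introduced in PATCH 1/2 amounts to trying the canonical id first and falling back to the version id for the collection, then matching the dataset on either of its two ids within the resolved collection. The sketch below restates that logic in isolation; the dataclasses and the business_logic parameter are hypothetical stand-ins for the real interfaces in backend.layers.business, not code from this repository.

from dataclasses import dataclass, field
from typing import List, Optional


@dataclass
class DatasetVersion:
    dataset_id: str  # canonical id, stable across rewrites of the dataset
    version_id: str  # unique to this particular version


@dataclass
class CollectionVersion:
    version_id: str
    datasets: List[DatasetVersion] = field(default_factory=list)


class ForbiddenHTTPException(Exception):
    """Stand-in for the real 403 exception."""


def get_collection_and_dataset(business_logic, collection_id: str, dataset_id: str):
    # Double lookup for the collection: canonical id first, version id as fallback.
    collection: Optional[CollectionVersion] = business_logic.get_collection_version_from_canonical(collection_id)
    if collection is None:
        collection = business_logic.get_collection_version(collection_id)
        if collection is None:
            raise ForbiddenHTTPException()
    # Double lookup for the dataset: match either id within the resolved collection,
    # reporting a miss exactly like a missing collection.
    try:
        dataset = next(d for d in collection.datasets if dataset_id in (d.version_id, d.dataset_id))
    except StopIteration:
        raise ForbiddenHTTPException()
    return collection, dataset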