Commit

fix: improve dataset identifiers (#3967)
* fix: improve dataset identifiers

* Linter

* Test declutter

* new test+overhaul
ebezzi authored Jan 13, 2023
1 parent e14aa76 commit 8a52c64
Showing 3 changed files with 174 additions and 8 deletions.
22 changes: 17 additions & 5 deletions backend/layers/api/portal_api.py
@@ -650,7 +650,7 @@ def delete_dataset(dataset_id: str, token_info: dict):
 
 def get_dataset_identifiers(url: str):
     """
-    a.k.a. the meta endpoint
+    Return a set of dataset identifiers. This endpoint is meant to be used by single-cell-explorer.
     """
     try:
         path = urlparse(url).path
@@ -665,19 +665,31 @@ def get_dataset_identifiers(url: str):
     if dataset is None:
         raise NotFoundHTTPException()
 
+    # A dataset version can appear in multiple collection versions. This endpoint should:
+    # 1. Return the most recent published version that contains the dataset version (aka the mapped version)
+    # 2. If the version only appears in an unpublished version, return that one.
+
     collection = get_business_logic().get_collection_version_from_canonical(dataset.collection_id)
-    if collection is None:  # orphaned datasets
+    if collection is None:  # orphaned datasets - shouldn't happen, but we should return 404 just in case
         raise NotFoundHTTPException()
 
+    if dataset.version_id not in [d.version_id for d in collection.datasets]:
+        # If the dataset is not in the mapped collection version, it means the dataset belongs to the active
+        # unpublished version. We should return that one.
+        collection = get_business_logic().get_unpublished_collection_version_from_canonical(dataset.collection_id)
+
+        if collection is None:  # again, orphaned datasets
+            raise NotFoundHTTPException()
+
+    collection_id, dataset_id = collection.version_id.id, dataset.version_id.id
+
     # Retrieves the URI of the cxg artifact
     s3_uri = next(a.uri for a in dataset.artifacts if a.type == DatasetArtifactType.CXG)
 
-    dataset_id = dataset.version_id.id
-
     dataset_identifiers = {
         "s3_uri": s3_uri,
         "dataset_id": dataset_id,
-        "collection_id": dataset.collection_id.id,
+        "collection_id": collection_id,
         "collection_visibility": "PUBLIC" if collection.published_at is not None else "PRIVATE",
         "tombstoned": False,  # No longer applicable
     }
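For context, a minimal client-side sketch of the contract this endpoint exposes (not part of the commit: the localhost host, the placeholder explorer link, and the use of the requests library are assumptions; the path and response keys match test__explorer_portal_integration below):

    # Hypothetical client sketch; host, placeholder URL, and `requests` usage are assumptions.
    import requests

    explorer_url = "http://base.url/example-dataset-version-id.cxg/"  # placeholder explorer link
    response = requests.get("http://localhost/dp/v1/datasets/meta", params={"url": explorer_url})
    response.raise_for_status()
    identifiers = response.json()

    collection_version_id = identifiers["collection_id"]  # now a collection *version* id
    dataset_version_id = identifiers["dataset_id"]        # dataset *version* id
    cxg_s3_uri = identifiers["s3_uri"]                    # URI of the CXG artifact
    visibility = identifiers["collection_visibility"]     # "PUBLIC" or "PRIVATE"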
156 changes: 154 additions & 2 deletions tests/unit/backend/layers/api/test_portal_api.py
@@ -7,6 +7,7 @@
 from unittest.mock import Mock, patch
 from backend.layers.business.entities import DatasetArtifactDownloadData
 from backend.layers.common.entities import (
+    CollectionVersionId,
     DatasetStatusKey,
 )
 from backend.layers.common.entities import (
@@ -1774,7 +1775,7 @@ def test__dataset_meta__ok(self):
         expected_identifiers = {
             "s3_uri": test_uri_0,
             "dataset_id": public_dataset.dataset_version_id,
-            "collection_id": public_dataset.collection_id,
+            "collection_id": public_dataset.collection_version_id,
             "collection_visibility": "PUBLIC",  # this is a published collection
             "tombstoned": False,
         }
@@ -1793,7 +1794,7 @@ def test__dataset_meta__ok(self):
         expected_identifiers = {
             "s3_uri": test_uri_1,
             "dataset_id": private_dataset.dataset_version_id,
-            "collection_id": private_dataset.collection_id,
+            "collection_id": private_dataset.collection_version_id,
             "collection_visibility": "PRIVATE",
             "tombstoned": False,
         }
@@ -1813,6 +1814,157 @@ def test__dataset_meta__404(self):
         response = self.app.get(test_url_404, headers)
         self.assertEqual(response.status_code, 404)
 
+    def test__explorer_portal_integration(self):
+        """
+        Tests the explorer <-> portal integration.
+        The steps carried out by this test are:
+        1. Generate the explorer_url
+        2. Call the `get_dataset_identifiers` endpoint, retrieve `collection_id` and `dataset_id` from there
+        3. Call the GET /collections/:collection_id endpoint, locate the dataset
+        """
+        headers = {"host": "localhost", "Content-Type": "application/json"}
+
+        def _call_meta_endpoint(explorer_url):
+            test_url = f"/dp/v1/datasets/meta?url={explorer_url}"
+            response = self.app.get(test_url, headers)
+            self.assertEqual(response.status_code, 200)
+            return json.loads(response.data)
+
+        def _call_collections_endpoint(collection_id):
+            test_url = f"/dp/v1/collections/{collection_id}"
+            response = self.app.get(test_url, headers)
+            self.assertEqual(response.status_code, 200)
+            return json.loads(response.data)
+
+        with self.subTest("Dataset belonging to an unpublished collection"):
+
+            test_uri = "some_uri_0"
+
+            dataset = self.generate_dataset(
+                artifacts=[DatasetArtifactUpdate(DatasetArtifactType.CXG, test_uri)],
+                publish=False,
+            )
+            # In this case, explorer_url points to the canonical link
+            explorer_url = f"http://base.url/{dataset.dataset_id}.cxg/"
+            meta_response = _call_meta_endpoint(explorer_url)
+
+            returned_collection_id = meta_response["collection_id"]
+            returned_dataset_id = meta_response["dataset_id"]
+
+            collections_response = _call_collections_endpoint(returned_collection_id)
+            datasets = collections_response["datasets"]
+            self.assertIn(returned_dataset_id, [dataset["id"] for dataset in datasets])
+
+        with self.subTest("Dataset belonging to a published collection"):
+
+            test_uri = "some_uri_1"
+
+            dataset = self.generate_dataset(
+                artifacts=[DatasetArtifactUpdate(DatasetArtifactType.CXG, test_uri)], publish=True
+            )
+            # In this case, explorer_url points to the canonical link
+            explorer_url = f"http://base.url/{dataset.dataset_id}.cxg/"
+            meta_response = _call_meta_endpoint(explorer_url)
+
+            returned_collection_id = meta_response["collection_id"]
+            returned_dataset_id = meta_response["dataset_id"]
+
+            collections_response = _call_collections_endpoint(returned_collection_id)
+            datasets = collections_response["datasets"]
+            self.assertIn(returned_dataset_id, [dataset["id"] for dataset in datasets])
+
+        with self.subTest("Dataset belonging to a revision of a published collection, not replaced"):
+
+            test_uri = "some_uri_2"
+
+            dataset = self.generate_dataset(
+                artifacts=[DatasetArtifactUpdate(DatasetArtifactType.CXG, test_uri)], publish=True
+            )
+            self.business_logic.create_collection_version(CollectionId(dataset.collection_id))
+
+            # In this case, explorer_url points to the versioned link
+            explorer_url = f"http://base.url/{dataset.dataset_version_id}.cxg/"
+            meta_response = _call_meta_endpoint(explorer_url)
+
+            returned_collection_id = meta_response["collection_id"]
+            returned_dataset_id = meta_response["dataset_id"]
+
+            collections_response = _call_collections_endpoint(returned_collection_id)
+            datasets = collections_response["datasets"]
+            self.assertIn(returned_dataset_id, [dataset["id"] for dataset in datasets])
+
+        with self.subTest("Dataset belonging to a revision of a published collection, replaced"):
+
+            test_uri = "some_uri_1"
+
+            dataset = self.generate_dataset(
+                artifacts=[DatasetArtifactUpdate(DatasetArtifactType.CXG, test_uri)], publish=True
+            )
+            revision = self.business_logic.create_collection_version(CollectionId(dataset.collection_id))
+            revised_dataset = self.generate_dataset(
+                artifacts=[DatasetArtifactUpdate(DatasetArtifactType.CXG, test_uri)],
+                collection_version=revision,
+                replace_dataset_version_id=DatasetVersionId(dataset.dataset_version_id),
+            )
+            self.assertEqual(revised_dataset.dataset_id, dataset.dataset_id)
+            self.assertNotEqual(revised_dataset.dataset_version_id, dataset.dataset_version_id)
+
+            # Retrieve the explorer url from the GET /collections/:collection_id endpoint. This is the only way
+            # to force explorer_url to be exactly the same as the one the portal uses to open the explorer.
+            test_url = f"/dp/v1/collections/{revision.version_id}"
+            response = self.app.get(test_url, headers)
+            self.assertEqual(response.status_code, 200)
+            response_data = json.loads(response.data)
+            datasets = response_data["datasets"]
+            self.assertIn(revised_dataset.dataset_version_id, [dataset["id"] for dataset in datasets])
+            replaced_dataset = next(
+                dataset for dataset in datasets if dataset["id"] == revised_dataset.dataset_version_id
+            )
+
+            explorer_url = replaced_dataset["dataset_deployments"][0]["url"]
+            meta_response = _call_meta_endpoint(explorer_url)
+
+            returned_collection_id = meta_response["collection_id"]
+            returned_dataset_id = meta_response["dataset_id"]
+
+            collections_response = _call_collections_endpoint(returned_collection_id)
+            datasets = collections_response["datasets"]
+            self.assertIn(returned_dataset_id, [dataset["id"] for dataset in datasets])
+
+        with self.subTest("Dataset that appears in multiple published versions"):
+            """
+            If a dataset appears in multiple collection versions, the most recent one will be returned.
+            """
+            test_uri = "some_uri_1"
+
+            dataset = self.generate_dataset(
+                artifacts=[DatasetArtifactUpdate(DatasetArtifactType.CXG, test_uri)], publish=True
+            )
+            revision = self.business_logic.create_collection_version(CollectionId(dataset.collection_id))
+
+            self.business_logic.publish_collection_version(revision.version_id)
+
+            # Both versions are now published
+            original_version = self.business_logic.get_collection_version(
+                CollectionVersionId(dataset.collection_version_id)
+            )
+            revision_version = self.business_logic.get_collection_version(revision.version_id)
+
+            self.assertIsNotNone(original_version.published_at)
+            self.assertIsNotNone(revision_version.published_at)
+
+            explorer_url = f"http://base.url/{dataset.dataset_version_id}.cxg/"
+            meta_response = _call_meta_endpoint(explorer_url)
+
+            returned_collection_id = meta_response["collection_id"]
+            returned_dataset_id = meta_response["dataset_id"]
+
+            self.assertEqual(returned_collection_id, revision_version.version_id.id)
+
+            collections_response = _call_collections_endpoint(returned_collection_id)
+            datasets = collections_response["datasets"]
+            self.assertIn(returned_dataset_id, [dataset["id"] for dataset in datasets])
+
 
 class TestDatasetCurators(BaseAPIPortalTest):
     def setUp(self):
4 changes: 3 additions & 1 deletion tests/unit/backend/layers/common/base_test.py
@@ -16,6 +16,7 @@
     DatasetStatusGeneric,
     DatasetStatusKey,
     DatasetValidationStatus,
+    DatasetVersionId,
     Link,
     OntologyTermId,
 )
@@ -226,14 +227,15 @@ def generate_dataset(
         validation_message: str = None,
         artifacts: List[DatasetArtifactUpdate] = None,
         publish: bool = False,
+        replace_dataset_version_id: Optional[DatasetVersionId] = None,
    ) -> DatasetData:
         """
         Convenience method for generating a dataset. Also generates an unpublished collection if needed.
         """
         if not collection_version:
             collection_version = self.generate_unpublished_collection(owner)
         dataset_version_id, dataset_id = self.business_logic.ingest_dataset(
-            collection_version.version_id, "http://fake.url", None, None
+            collection_version.version_id, "http://fake.url", None, replace_dataset_version_id
         )
         if not metadata:
             metadata = copy.deepcopy(self.sample_dataset_metadata)
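For quick reference, a hypothetical sketch of the two calling modes generate_dataset now supports (names come from this diff; revision and existing_dataset stand in for fixtures inside a BaseAPIPortalTest method):

    # Append a brand-new dataset to a collection version (previous behavior, unchanged):
    new_dataset = self.generate_dataset(collection_version=revision)

    # Replace an existing dataset version in place (enabled by this commit); the
    # canonical dataset_id is preserved while the dataset_version_id changes:
    replacement = self.generate_dataset(
        collection_version=revision,
        replace_dataset_version_id=DatasetVersionId(existing_dataset.dataset_version_id),
    )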
