feat: curation API for datasets #3708

Merged: 12 commits, Dec 15, 2022
19 changes: 19 additions & 0 deletions backend/layers/business/business.py
@@ -257,6 +257,25 @@ def _assert_collection_version_unpublished(
             raise CollectionIsPublishedException([f"Collection version {collection_version_id.id} is published"])
         return collection_version

+    def create_empty_dataset(self, collection_version_id: CollectionVersionId) -> Tuple[DatasetVersionId, DatasetId]:
+        """
+        Creates an empty dataset that can later be used for ingestion.
+        """
+        self._assert_collection_version_unpublished(collection_version_id)
+
+        new_dataset_version = self.database_provider.create_canonical_dataset(collection_version_id)
+        # Add the new dataset version to the collection version
+        self.database_provider.add_dataset_to_collection_version_mapping(
+            collection_version_id, new_dataset_version.version_id
+        )
+
+        self.database_provider.update_dataset_upload_status(new_dataset_version.version_id, DatasetUploadStatus.WAITING)
+        self.database_provider.update_dataset_processing_status(
+            new_dataset_version.version_id, DatasetProcessingStatus.PENDING
+        )
+
+        return (new_dataset_version.version_id, new_dataset_version.dataset_id)

     # TODO: Alternatives: 1) return DatasetVersion 2) Return a new class
     def ingest_dataset(
         self,
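Below is a minimal usage sketch of the new method (not part of the diff): it assumes a wired-up BusinessLogic instance, and the helper name is hypothetical. Note that the tuple unpacks as (version id, canonical id), matching the return annotation.

# Illustrative sketch, not part of the diff; the helper is hypothetical.
from backend.layers.common.entities import CollectionVersionId, DatasetId

def create_placeholder_dataset(business_logic, collection_version_id: str) -> DatasetId:
    # create_empty_dataset returns (DatasetVersionId, DatasetId), in that order
    version_id, dataset_id = business_logic.create_empty_dataset(
        CollectionVersionId(collection_version_id)
    )
    # The new version starts with upload status WAITING and processing status
    # PENDING; a later ingest_dataset call can target version_id.
    return dataset_id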
2 changes: 1 addition & 1 deletion backend/layers/persistence/persistence_mock.py
@@ -357,7 +357,7 @@ def get_dataset_version_status(self, version_id: DatasetVersionId) -> DatasetStatus:
         return copy.deepcopy(self.datasets_versions[version_id.id].status)

     def get_dataset_mapped_version(self, dataset_id: DatasetId) -> Optional[DatasetVersion]:
-        cd = self.collections.get(dataset_id.id)
+        cd = self.datasets.get(dataset_id.id)
         if cd is not None:
             version = self.datasets_versions[cd.version_id.id]
             return self._update_dataset_version_with_canonical(version)
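The one-line fix above matters because get_dataset_mapped_version is keyed by canonical dataset id. A quick check of the intended behavior, as a sketch (the provider instance and the id are assumptions):

# Illustrative sketch, not part of the diff; the provider instance and id are assumed.
version = provider.get_dataset_mapped_version(DatasetId("11111111-aaaa-4bbb-8ccc-222222222222"))
# Before the fix the lookup read self.collections, so a valid canonical
# dataset id could never resolve; now it reads self.datasets.
assert version is None or version.dataset_id.id == "11111111-aaaa-4bbb-8ccc-222222222222"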
@@ -1,20 +1,24 @@
 from flask import g, make_response, jsonify

-from backend.api_server.db import dbconnect
-from backend.common.corpora_orm import CollectionVisibility, ProcessingStatus
-from backend.common.entities import Dataset
 from backend.common.utils.http_exceptions import MethodNotAllowedException
-from backend.portal.api.app.v1.authorization import owner_or_allowed
-from backend.portal.api.collections_common import get_collection_else_forbidden
+from backend.layers.api.router import get_business_logic
+from backend.layers.auth.user_info import UserInfo
+from backend.portal.api.curation.v1.curation.collections.common import get_infered_collection_version_else_forbidden, is_owner_or_allowed_else_forbidden
+from backend.layers.common.entities import CollectionVersionId


-@dbconnect
 def post(token_info: dict, collection_id: str):
-    db_session = g.db_session
-    collection = get_collection_else_forbidden(db_session, collection_id, owner=owner_or_allowed(token_info))
-    if collection.visibility != CollectionVisibility.PRIVATE:
+    user_info = UserInfo(token_info)
+    business_logic = get_business_logic()
+
+    collection_version = get_infered_collection_version_else_forbidden(collection_id)
+    is_owner_or_allowed_else_forbidden(collection_version, user_info)
+
+    if collection_version.published_at is not None:
         raise MethodNotAllowedException("Collection must be PRIVATE Collection, or a revision of a PUBLIC Collection.")
-    dataset = Dataset.create(
-        db_session, collection=collection, processing_status={"processing_status": ProcessingStatus.INITIALIZED}
-    )
-    return make_response(jsonify({"id": dataset.id}), 201)
+
+    dataset_version_id, dataset_id = business_logic.create_empty_dataset(collection_version.version_id)
+
+    return make_response(jsonify({"id": dataset_id.id}), 201)
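As a rough sketch of calling this endpoint (the route prefix and host are assumptions, not confirmed by the diff):

# Illustrative request, not part of the diff; the route and host are assumed.
import requests

resp = requests.post(
    "https://api.example.com/curation/v1/collections/<collection_id>/datasets",
    headers={"Authorization": "Bearer <access-token>"},
)
assert resp.status_code == 201
new_dataset_id = resp.json()["id"]  # canonical id of the new, empty dataset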
@@ -1,41 +1,202 @@
-from flask import g, make_response, jsonify
 
-from backend.api_server.db import dbconnect
-from backend.common.corpora_orm import DbCollection
+from typing import Tuple
+from flask import Response, jsonify, make_response
+from backend.common.corpora_orm import IsPrimaryData
+from backend.common.utils.exceptions import MaxFileSizeExceededException
+from backend.common.utils.http_exceptions import (
+    ForbiddenHTTPException,
+    InvalidParametersHTTPException,
+    MethodNotAllowedException,
+    NotFoundHTTPException,
+    TooLargeHTTPException,
+)
+from backend.layers.api.router import get_business_logic
+from backend.layers.api.transform import (
+    dataset_asset_to_response,
+    dataset_processing_status_to_response,
+    ontology_term_ids_to_response,
+)
+from backend.layers.auth.user_info import UserInfo
+from backend.layers.business.business_interface import BusinessLogicInterface
+from backend.layers.business.exceptions import (
+    CollectionIsPublishedException,
+    CollectionNotFoundException,
+    CollectionUpdateException,
+    DatasetInWrongStatusException,
+    DatasetNotFoundException,
+    InvalidURIException,
+)
+
+from backend.layers.common.entities import (
+    CollectionId,
+    CollectionVersionId,
+    CollectionVersionWithDatasets,
+    DatasetArtifact,
+    DatasetArtifactType,
+    DatasetProcessingStatus,
+    DatasetValidationStatus,
+    DatasetVersion,
+    DatasetVersionId,
+)
-from backend.portal.api.app.v1.collections.collection_id.upload_links import upload_from_link
-from backend.portal.api.collections_common import (
-    get_dataset_else_error,
-    delete_dataset_common,
+from backend.portal.api.curation.v1.curation.collections.common import (
+    DATASET_ONTOLOGY_ELEMENTS,
+    DATASET_ONTOLOGY_ELEMENTS_PREVIEW,
+    get_infered_dataset_version,
 )
-from backend.portal.api.curation.v1.curation.collections.common import EntityColumns, reshape_dataset_for_curation_api


-@dbconnect
+is_primary_data_mapping = {
+    "PRIMARY": [True],
+    "SECONDARY": [False],
+    "BOTH": [True, False],
+}


+def _reshape_dataset_for_curation_api(d: DatasetVersion, preview=False) -> dict:
+    artifacts = []
+    for artifact in d.artifacts:
+        if artifact.type in (DatasetArtifactType.H5AD, DatasetArtifactType.RDS):
+            artifacts.append(dataset_asset_to_response(artifact, d.dataset_id.id))
+
+    dataset = {
+        "assay": ontology_term_ids_to_response(d.metadata.assay),
+        "batch_condition": d.metadata.batch_condition,
+        "cell_count": d.metadata.cell_count,
+        "cell_type": ontology_term_ids_to_response(d.metadata.cell_type),
+        "dataset_assets": artifacts,
+        "development_stage": ontology_term_ids_to_response(d.metadata.development_stage),
+        "disease": ontology_term_ids_to_response(d.metadata.disease),
+        "donor_id": d.metadata.donor_id,
+        "explorer_url": "string",
+        "id": d.dataset_id.id,
+        "mean_genes_per_cell": d.metadata.mean_genes_per_cell,
+        "organism": ontology_term_ids_to_response(d.metadata.organism),
+        "processing_status": dataset_processing_status_to_response(d.status, d.dataset_id.id),
+        "processing_status_detail": "string",
+        "revised_at": "string",  # TODO
+        "revision": 0,
+        "schema_version": d.metadata.schema_version,
+        "self_reported_ethnicity": ontology_term_ids_to_response(d.metadata.self_reported_ethnicity),
+        "sex": ontology_term_ids_to_response(d.metadata.sex),
+        "suspension_type": d.metadata.suspension_type,
+        "tissue": ontology_term_ids_to_response(d.metadata.tissue),
+        "x_approximate_distribution": d.metadata.x_approximate_distribution.upper(),
+    }
+
+    if d.status:
+        if d.status.processing_status == DatasetProcessingStatus.FAILURE:
+            if d.status.validation_status == DatasetValidationStatus.INVALID:
+                dataset["processing_status_detail"] = d.status.validation_message
+                dataset["processing_status"] = "VALIDATION_FAILURE"
+            else:
+                dataset["processing_status"] = "PIPELINE_FAILURE"
+        else:
+            dataset["processing_status"] = d.status.processing_status
+
+    dataset_ontology_elements = DATASET_ONTOLOGY_ELEMENTS_PREVIEW if preview else DATASET_ONTOLOGY_ELEMENTS
+    for ontology_element in dataset_ontology_elements:
+        if dataset_ontology_element := dataset.get(ontology_element):
+            if not isinstance(dataset_ontology_element, list):
+                # Package in array
+                dataset[ontology_element] = [dataset_ontology_element]
+        else:
+            dataset[ontology_element] = []
+
+    if not preview:  # Add these fields only to the full (not preview) Dataset metadata response
+        dataset["revision_of"] = d.canonical_dataset.dataset_id.id
+        dataset["title"] = d.metadata.name
+        if d.metadata.is_primary_data is not None:
+            dataset["is_primary_data"] = is_primary_data_mapping.get(d.metadata.is_primary_data, [])
+
+    return dataset
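The status block above collapses pipeline state for the response. Here is the same rule as a standalone pure function, for illustration only (names invented):

# Illustrative sketch, not part of the diff: the collapse rule as a pure function.
def collapse_status(processing: str, validation: str, message: str):
    if processing == "FAILURE":
        if validation == "INVALID":
            return "VALIDATION_FAILURE", message
        return "PIPELINE_FAILURE", None
    return processing, None

assert collapse_status("FAILURE", "INVALID", "schema errors") == ("VALIDATION_FAILURE", "schema errors")
assert collapse_status("SUCCESS", "VALID", "") == ("SUCCESS", None)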


 def get(collection_id: str, dataset_id: str = None):
-    db_session = g.db_session
-    if not db_session.query(DbCollection.id).filter(DbCollection.id == collection_id).first():
+    business_logic = get_business_logic()
+
+    collection_version = business_logic.get_collection_version_from_canonical(CollectionId(collection_id))
+    if collection_version is None:
         raise NotFoundHTTPException("Collection not found!")
-    dataset = get_dataset_else_error(db_session, dataset_id, collection_id)
-    response_body = reshape_dataset_for_curation_api(dataset.to_dict_keep(EntityColumns.columns_for_dataset))
+
+    version = get_infered_dataset_version(dataset_id)
+    if version is None:
+        raise NotFoundHTTPException("Dataset not found")
+
+    response_body = _reshape_dataset_for_curation_api(version)
     return make_response(jsonify(response_body), 200)


-@dbconnect
+def _get_collection_and_dataset(
+    business_logic: BusinessLogicInterface, collection_id: CollectionVersionId, dataset_id: DatasetVersionId
+) -> Tuple[CollectionVersionWithDatasets, DatasetVersion]:
+    dataset_version = business_logic.get_dataset_version(dataset_id)
+    if dataset_version is None:
+        raise ForbiddenHTTPException()
+
+    collection_version = business_logic.get_collection_version(collection_id)
+    if collection_version is None:
+        raise ForbiddenHTTPException()
+
+    return collection_version, dataset_version


 def delete(token_info: dict, collection_id: str, dataset_id: str = None):
-    db_session = g.db_session
-    dataset = get_dataset_else_error(db_session, dataset_id, collection_id, include_tombstones=True)
-    delete_dataset_common(db_session, dataset, token_info)
-    return "", 202
+    business_logic = get_business_logic()
+    user_info = UserInfo(token_info)
+
+    collection_version, dataset_version = _get_collection_and_dataset(
+        business_logic, CollectionVersionId(collection_id), DatasetVersionId(dataset_id)
+    )
+
+    if not user_info.is_user_owner_or_allowed(collection_version.owner):
+        raise ForbiddenHTTPException("Unauthorized")
+    # End of duplicate block
+
+    # TODO: deduplicate from ApiCommon. We need to settle the class/module level debate before we can do that
+    if dataset_version.version_id not in [v.version_id for v in collection_version.datasets]:
+        raise ForbiddenHTTPException(f"Dataset {dataset_id} does not belong to a collection")
+
+    try:
+        business_logic.remove_dataset_version(collection_version.version_id, dataset_version.version_id)
+    except CollectionUpdateException:
+        raise MethodNotAllowedException(detail="Cannot delete a public Dataset")
+    return Response(status=202)
+    # End of duplicate block


 def put(collection_id: str, dataset_id: str, body: dict, token_info: dict):
-    upload_from_link(
-        collection_id,
-        token_info,
-        body.get("url", body.get("link")),
-        dataset_id,
+    # TODO: deduplicate from ApiCommon. We need to settle the class/module level debate before we can do that
+    url = body.get("url", body.get("link"))
+    business_logic = get_business_logic()
+
+    collection_version, _ = _get_collection_and_dataset(
+        business_logic, CollectionVersionId(collection_id), DatasetVersionId(dataset_id)
     )
-    return "", 202
+
+    if not UserInfo(token_info).is_user_owner_or_allowed(collection_version.owner):
+        raise ForbiddenHTTPException()
+
+    try:
+        business_logic.ingest_dataset(
+            collection_version.version_id,
+            url,
+            None if dataset_id is None else DatasetVersionId(dataset_id),
+        )
+        return Response(status=202)
+    except CollectionNotFoundException:
+        raise ForbiddenHTTPException()
+    except CollectionIsPublishedException:
+        raise ForbiddenHTTPException()
+    except DatasetNotFoundException:
+        raise NotFoundHTTPException()
+    except InvalidURIException:
+        raise InvalidParametersHTTPException(detail="The dropbox shared link is invalid.")
+    except MaxFileSizeExceededException:
+        raise TooLargeHTTPException()
+    except DatasetInWrongStatusException:
+        raise MethodNotAllowedException(
+            detail="Submission failed. A dataset cannot be updated while a previous update for the same dataset "
+            "is in progress. Please cancel the current submission by deleting the dataset, or wait until "
+            "the submission has finished processing."
+        )
+    # End of duplicate block
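A hypothetical invocation of the PUT handler above (route and host are assumptions; the body accepts either "url" or "link"):

# Illustrative request, not part of the diff; the route and host are assumed.
import requests

resp = requests.put(
    "https://api.example.com/curation/v1/collections/<collection_id>/datasets/<dataset_id>",
    headers={"Authorization": "Bearer <access-token>"},
    json={"url": "https://www.dropbox.com/s/<file-id>/example.h5ad?dl=0"},
)
# 202 accepted; 403 for a published or unowned collection; 404 for an unknown
# dataset; 400 for an invalid Dropbox link; 405 while a prior update is running.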
12 changes: 12 additions & 0 deletions backend/portal/api/curation/v1/curation/collections/common.py
@@ -23,6 +23,7 @@
     CollectionVersion,
     CollectionVersionId,
     CollectionVersionWithDatasets,
+    DatasetId,
     DatasetArtifactType,
     DatasetProcessingStatus,
     DatasetValidationStatus,
@@ -373,6 +374,17 @@ def get_infered_collection_version_else_forbidden(collection_id: str) -> Optional[CollectionVersionWithDatasets]:
         raise ForbiddenHTTPException()
     return version

+def get_infered_dataset_version(dataset_id: str) -> Optional[DatasetVersion]:
+    """
+    Infer the dataset version from either a DatasetId or a DatasetVersionId and return the DatasetVersion.
+    :param dataset_id: identifies the dataset version
+    :return: The DatasetVersion if it exists.
+    """
+    version = get_business_logic().get_dataset_version(DatasetVersionId(dataset_id))
+    if version is None:
+        version = get_business_logic().get_dataset_version_from_canonical(DatasetId(dataset_id))
+    return version


 def is_owner_or_allowed_else_forbidden(collection_version, user_info):
     if not user_info.is_user_owner_or_allowed(collection_version.owner):
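Callers of get_infered_dataset_version can therefore pass either identifier form. A sketch of the fallback behavior (ids are hypothetical, and the assertion assumes the canonical id maps to that same version):

# Illustrative sketch, not part of the diff; ids are hypothetical.
by_version = get_infered_dataset_version(dataset_version_uuid)      # resolved as a DatasetVersionId
by_canonical = get_infered_dataset_version(canonical_dataset_uuid)  # falls through to the DatasetId lookup
# Both calls should surface the same mapped version when the canonical id
# points at the version above.
assert by_version.version_id.id == by_canonical.version_id.id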
10 changes: 9 additions & 1 deletion tests/unit/backend/api_server/base_api_test.py
@@ -10,7 +10,7 @@
 from tests.unit.backend.api_server.config import TOKEN_EXPIRES
 from tests.unit.backend.api_server.mock_auth import MockOauthServer
 from tests.unit.backend.fixtures.environment_setup import EnvironmentSetup
-from unit.backend.layers.common.base_api_test import NewBaseTest
+from tests.unit.backend.layers.common.base_api_test import NewBaseTest


class BaseAPITest(NewBaseTest):
@@ -87,6 +87,14 @@ def get_cxguser_token(user="owner"):
 class BaseAuthAPITest(BaseAPITest):
     def setUp(self):
         super().setUp()
 
+        # TODO: this can be improved, but the current authorization method requires it
Member (Author) commented:

    @Bento007 do you know what's going on here? It looks like the mock below isn't enough, since this line:

        x = assert_authorized_token(token, CorporaAuthConfig().curation_audience)

    is calling the config in the parameters, which happens before the mock is applied.

Member (Author) commented:

    That should also exist in my function, but it throws an error. I'll double check.
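The pitfall described in this thread is general Python behavior: a default argument is evaluated once, when the function is defined, so a mock started later never affects it. A minimal standalone sketch (all names invented, not from the PR):

# Illustrative sketch, not part of the PR; all names are invented.
from unittest.mock import patch

class Config:
    def audience(self):
        return "real-audience"

def check(token, audience=Config().audience()):  # default baked in at definition time
    return audience

with patch.object(Config, "audience", return_value="mock-audience"):
    print(check("t"))                       # "real-audience": the patch came too late
    print(check("t", Config().audience()))  # "mock-audience": evaluated under the patch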

+        self.mock = patch(
+            "backend.common.corpora_config.CorporaAuthConfig.__getattr__",
+            return_value="mock_audience"
+        )
+        self.mock.start()
+
+        self.mock_assert_authorized_token = patch(
+            "backend.portal.api.app.v1.authentication.assert_authorized_token",
+            side_effect=mock_assert_authorized_token,