feat: curation API for datasets (#3708)
* init

* stuff

* baby steps

* new stuff

* end

* more stuff

* Remove ApiCommon

* Update backend/portal/api/curation/v1/curation/collections/collection_id/datasets/dataset_id/actions.py

Co-authored-by: Trent Smith <1429913+Bento007@users.noreply.github.com>

* Update backend/portal/api/curation/v1/curation/collections/collection_id/datasets/dataset_id/actions.py

Co-authored-by: Trent Smith <1429913+Bento007@users.noreply.github.com>

* PR comments

Co-authored-by: Trent Smith <1429913+Bento007@users.noreply.github.com>
ebezzi and Bento007 authored Dec 15, 2022
1 parent 418cc68 commit d1b5c45
Showing 10 changed files with 516 additions and 38 deletions.
19 changes: 19 additions & 0 deletions backend/layers/business/business.py
@@ -257,6 +257,25 @@ def _assert_collection_version_unpublished(
             raise CollectionIsPublishedException([f"Collection version {collection_version_id.id} is published"])
         return collection_version
 
+    def create_empty_dataset(self, collection_version_id: CollectionVersionId) -> Tuple[DatasetVersionId, DatasetId]:
+        """
+        Creates an empty dataset that can later be used for ingestion
+        """
+        self._assert_collection_version_unpublished(collection_version_id)
+
+        new_dataset_version = self.database_provider.create_canonical_dataset(collection_version_id)
+        # adds the new dataset version to the collection version
+        self.database_provider.add_dataset_to_collection_version_mapping(
+            collection_version_id, new_dataset_version.version_id
+        )
+
+        self.database_provider.update_dataset_upload_status(new_dataset_version.version_id, DatasetUploadStatus.WAITING)
+        self.database_provider.update_dataset_processing_status(
+            new_dataset_version.version_id, DatasetProcessingStatus.PENDING
+        )
+
+        return (new_dataset_version.version_id, new_dataset_version.dataset_id)
+
     # TODO: Alternatives: 1) return DatasetVersion 2) Return a new class
     def ingest_dataset(
         self,
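A minimal usage sketch of the new method, assuming an already-constructed BusinessLogic instance (business_logic) wired to a database provider; the id below is a placeholder:

from backend.layers.common.entities import CollectionVersionId

# Hypothetical wiring: business_logic is assumed to be a BusinessLogic instance.
collection_version_id = CollectionVersionId("placeholder-collection-version-uuid")

# Reserves a canonical dataset plus an initial version; per the diff above, the
# version starts with upload status WAITING and processing status PENDING.
version_id, dataset_id = business_logic.create_empty_dataset(collection_version_id)

# version_id can later be handed to ingest_dataset as the existing dataset
# version to ingest into (see the put handler further down).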
2 changes: 1 addition & 1 deletion backend/layers/persistence/persistence_mock.py
@@ -357,7 +357,7 @@ def get_dataset_version_status(self, version_id: DatasetVersionId) -> DatasetSta
         return copy.deepcopy(self.datasets_versions[version_id.id].status)
 
     def get_dataset_mapped_version(self, dataset_id: DatasetId) -> Optional[DatasetVersion]:
-        cd = self.collections.get(dataset_id.id)
+        cd = self.datasets.get(dataset_id.id)
         if cd is not None:
             version = self.datasets_versions[cd.version_id.id]
             return self._update_dataset_version_with_canonical(version)
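The one-line fix above is easiest to see in isolation: the mock keeps canonical collections and canonical datasets in separate dicts, so a canonical dataset id looked up in self.collections could never match. A standalone toy of the same pattern (all names illustrative):

collections = {"coll-1": "canonical collection record"}
datasets = {"dset-1": "canonical dataset record"}

dataset_id = "dset-1"
assert collections.get(dataset_id) is None   # old lookup: wrong dict, always a miss
assert datasets.get(dataset_id) is not None  # fixed lookup resolves the dataset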
backend/portal/api/curation/v1/curation/collections/collection_id/datasets/actions.py
@@ -1,20 +1,24 @@
 from flask import g, make_response, jsonify
 
-from backend.api_server.db import dbconnect
-from backend.common.corpora_orm import CollectionVisibility, ProcessingStatus
-from backend.common.entities import Dataset
 from backend.common.utils.http_exceptions import MethodNotAllowedException
-from backend.portal.api.app.v1.authorization import owner_or_allowed
-from backend.portal.api.collections_common import get_collection_else_forbidden
+from backend.layers.api.router import get_business_logic
+from backend.layers.auth.user_info import UserInfo
+from backend.portal.api.curation.v1.curation.collections.common import get_infered_collection_version_else_forbidden, is_owner_or_allowed_else_forbidden
+from backend.layers.common.entities import CollectionVersionId
 
 
-@dbconnect
 def post(token_info: dict, collection_id: str):
-    db_session = g.db_session
-    collection = get_collection_else_forbidden(db_session, collection_id, owner=owner_or_allowed(token_info))
-    if collection.visibility != CollectionVisibility.PRIVATE:
+    user_info = UserInfo(token_info)
+    business_logic = get_business_logic()
+
+    collection_version = get_infered_collection_version_else_forbidden(collection_id)
+    is_owner_or_allowed_else_forbidden(collection_version, user_info)
+
+    if collection_version.published_at is not None:
         raise MethodNotAllowedException("Collection must be PRIVATE Collection, or a revision of a PUBLIC Collection.")
-    dataset = Dataset.create(
-        db_session, collection=collection, processing_status={"processing_status": ProcessingStatus.INITIALIZED}
-    )
-    return make_response(jsonify({"id": dataset.id}), 201)
+
+    dataset_id, dataset_version_id = business_logic.create_empty_dataset(CollectionVersionId(collection_id))
+
+    return make_response(jsonify({"id": dataset_id.id}), 201)
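Assuming the route mirrors the handler's file path (POST /curation/v1/collections/{collection_id}/datasets — an inference from the path, not something this diff states), a hedged client-side sketch with placeholder host, id, and token:

import requests

API_BASE = "https://api.example.org"  # placeholder host
collection_id = "placeholder-collection-uuid"

resp = requests.post(
    f"{API_BASE}/curation/v1/collections/{collection_id}/datasets",
    headers={"Authorization": "Bearer <access-token>"},  # placeholder credentials
)
assert resp.status_code == 201
print(resp.json()["id"])  # id of the newly created empty dataset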
backend/portal/api/curation/v1/curation/collections/collection_id/datasets/dataset_id/actions.py
@@ -1,41 +1,202 @@
-from flask import g, make_response, jsonify
-
-from backend.api_server.db import dbconnect
-from backend.common.corpora_orm import DbCollection
+from typing import Tuple
+
+from flask import Response, jsonify, make_response
+
+from backend.common.corpora_orm import IsPrimaryData
+from backend.common.utils.exceptions import MaxFileSizeExceededException
 from backend.common.utils.http_exceptions import (
     ForbiddenHTTPException,
+    InvalidParametersHTTPException,
+    MethodNotAllowedException,
     NotFoundHTTPException,
+    TooLargeHTTPException,
 )
+from backend.layers.api.router import get_business_logic
+from backend.layers.api.transform import (
+    dataset_asset_to_response,
+    dataset_processing_status_to_response,
+    ontology_term_ids_to_response,
+)
+from backend.layers.auth.user_info import UserInfo
+from backend.layers.business.business_interface import BusinessLogicInterface
+from backend.layers.business.exceptions import (
+    CollectionIsPublishedException,
+    CollectionNotFoundException,
+    CollectionUpdateException,
+    DatasetInWrongStatusException,
+    DatasetNotFoundException,
+    InvalidURIException,
+)
+
+from backend.layers.common.entities import (
+    CollectionId,
+    CollectionVersionId,
+    CollectionVersionWithDatasets,
+    DatasetArtifact,
+    DatasetArtifactType,
+    DatasetProcessingStatus,
+    DatasetValidationStatus,
+    DatasetVersion,
+    DatasetVersionId,
+)
-from backend.portal.api.app.v1.collections.collection_id.upload_links import upload_from_link
-from backend.portal.api.collections_common import (
-    get_dataset_else_error,
-    delete_dataset_common,
+from backend.portal.api.curation.v1.curation.collections.common import (
+    DATASET_ONTOLOGY_ELEMENTS,
+    DATASET_ONTOLOGY_ELEMENTS_PREVIEW,
+    get_infered_dataset_version,
 )
-from backend.portal.api.curation.v1.curation.collections.common import EntityColumns, reshape_dataset_for_curation_api
 
 
-@dbconnect
+is_primary_data_mapping = {
+    "PRIMARY": [True],
+    "SECONDARY": [False],
+    "BOTH": [True, False],
+}
+
+
+def _reshape_dataset_for_curation_api(d: DatasetVersion, preview=False) -> dict:
+    artifacts = []
+    for artifact in d.artifacts:
+        if artifact.type in (DatasetArtifactType.H5AD, DatasetArtifactType.RDS):
+            artifacts.append(dataset_asset_to_response(artifact, d.dataset_id.id))
+
+    dataset = {
+        "assay": ontology_term_ids_to_response(d.metadata.assay),
+        "batch_condition": d.metadata.batch_condition,
+        "cell_count": d.metadata.cell_count,
+        "cell_type": ontology_term_ids_to_response(d.metadata.cell_type),
+        "dataset_assets": artifacts,
+        "development_stage": ontology_term_ids_to_response(d.metadata.development_stage),
+        "disease": ontology_term_ids_to_response(d.metadata.disease),
+        "donor_id": d.metadata.donor_id,
+        "explorer_url": "string",
+        "id": d.dataset_id.id,
+        "mean_genes_per_cell": d.metadata.mean_genes_per_cell,
+        "organism": ontology_term_ids_to_response(d.metadata.organism),
+        "processing_status": dataset_processing_status_to_response(d.status, d.dataset_id.id),
+        "processing_status_detail": "string",
+        "revised_at": "string",  # TODO
+        "revision": 0,
+        "schema_version": d.metadata.schema_version,
+        "self_reported_ethnicity": ontology_term_ids_to_response(d.metadata.self_reported_ethnicity),
+        "sex": ontology_term_ids_to_response(d.metadata.sex),
+        "suspension_type": d.metadata.suspension_type,
+        "tissue": ontology_term_ids_to_response(d.metadata.tissue),
+        "x_approximate_distribution": d.metadata.x_approximate_distribution.upper(),
+    }
+
+    if d.status:
+        if d.status.processing_status == DatasetProcessingStatus.FAILURE:
+            if d.status.validation_status == DatasetValidationStatus.INVALID:
+                dataset["processing_status_detail"] = d.status.validation_message
+                dataset["processing_status"] = "VALIDATION_FAILURE"
+            else:
+                dataset["processing_status"] = "PIPELINE_FAILURE"
+        else:
+            dataset["processing_status"] = d.status.processing_status
+
+    dataset_ontology_elements = DATASET_ONTOLOGY_ELEMENTS_PREVIEW if preview else DATASET_ONTOLOGY_ELEMENTS
+    for ontology_element in dataset_ontology_elements:
+        if dataset_ontology_element := dataset.get(ontology_element):
+            if not isinstance(dataset_ontology_element, list):
+                # Package in array
+                dataset[ontology_element] = [dataset_ontology_element]
+        else:
+            dataset[ontology_element] = []
+
+    if not preview:  # Add these fields only to full (and not preview) Dataset metadata response
+        dataset["revision_of"] = d.canonical_dataset.dataset_id.id
+        dataset["title"] = d.metadata.name
+        if d.metadata.is_primary_data is not None:
+            dataset["is_primary_data"] = is_primary_data_mapping.get(d.metadata.is_primary_data, [])
+
+    return dataset
+
+
 def get(collection_id: str, dataset_id: str = None):
-    db_session = g.db_session
-    if not db_session.query(DbCollection.id).filter(DbCollection.id == collection_id).first():
+    business_logic = get_business_logic()
+
+    collection_version = business_logic.get_collection_version_from_canonical(CollectionId(collection_id))
+    if collection_version is None:
         raise NotFoundHTTPException("Collection not found!")
-    dataset = get_dataset_else_error(db_session, dataset_id, collection_id)
-    response_body = reshape_dataset_for_curation_api(dataset.to_dict_keep(EntityColumns.columns_for_dataset))
+
+    version = get_infered_dataset_version(dataset_id)
+    if version is None:
+        raise NotFoundHTTPException("Dataset not found")
+
+    response_body = _reshape_dataset_for_curation_api(version)
     return make_response(jsonify(response_body), 200)
 
 
-@dbconnect
+def _get_collection_and_dataset(
+    business_logic: BusinessLogicInterface, collection_id: CollectionVersionId, dataset_id: DatasetVersionId
+) -> Tuple[CollectionVersionWithDatasets, DatasetVersion]:
+    dataset_version = business_logic.get_dataset_version(dataset_id)
+    if dataset_version is None:
+        raise ForbiddenHTTPException()
+
+    collection_version = business_logic.get_collection_version(collection_id)
+    if collection_version is None:
+        raise ForbiddenHTTPException()
+
+    return collection_version, dataset_version
+
+
 def delete(token_info: dict, collection_id: str, dataset_id: str = None):
-    db_session = g.db_session
-    dataset = get_dataset_else_error(db_session, dataset_id, collection_id, include_tombstones=True)
-    delete_dataset_common(db_session, dataset, token_info)
-    return "", 202
-
+    business_logic = get_business_logic()
+    user_info = UserInfo(token_info)
+
+    collection_version, dataset_version = _get_collection_and_dataset(
+        business_logic, CollectionVersionId(collection_id), DatasetVersionId(dataset_id)
+    )
+
+    if not user_info.is_user_owner_or_allowed(collection_version.owner):
+        raise ForbiddenHTTPException("Unauthorized")
+    # End of duplicate block
+
+    # TODO: deduplicate from ApiCommon. We need to settle the class/module level debate before we can do that
+    if dataset_version.version_id not in [v.version_id for v in collection_version.datasets]:
+        raise ForbiddenHTTPException(f"Dataset {dataset_id} does not belong to a collection")
+
+    try:
+        business_logic.remove_dataset_version(collection_version.version_id, dataset_version.version_id)
+    except CollectionUpdateException:
+        raise MethodNotAllowedException(detail="Cannot delete a public Dataset")
+    return Response(status=202)
+    # End of duplicate block
 
 
 def put(collection_id: str, dataset_id: str, body: dict, token_info: dict):
-    upload_from_link(
-        collection_id,
-        token_info,
-        body.get("url", body.get("link")),
-        dataset_id,
+    # TODO: deduplicate from ApiCommon. We need to settle the class/module level debate before we can do that
+    url = body.get("url", body.get("link"))
+    business_logic = get_business_logic()
+
+    collection_version, _ = _get_collection_and_dataset(
+        business_logic, CollectionVersionId(collection_id), DatasetVersionId(dataset_id)
     )
-    return "", 202
+
+    if not UserInfo(token_info).is_user_owner_or_allowed(collection_version.owner):
+        raise ForbiddenHTTPException()
+
+    try:
+        business_logic.ingest_dataset(
+            collection_version.version_id,
+            url,
+            None if dataset_id is None else DatasetVersionId(dataset_id),
+        )
+        return Response(status=202)
+    except CollectionNotFoundException:
+        raise ForbiddenHTTPException()
+    except CollectionIsPublishedException:
+        raise ForbiddenHTTPException()
+    except DatasetNotFoundException:
+        raise NotFoundHTTPException()
+    except InvalidURIException:
+        raise InvalidParametersHTTPException(detail="The dropbox shared link is invalid.")
+    except MaxFileSizeExceededException:
+        raise TooLargeHTTPException()
+    except DatasetInWrongStatusException:
+        raise MethodNotAllowedException(
+            detail="Submission failed. A dataset cannot be updated while a previous update for the same dataset "
+            "is in progress. Please cancel the current submission by deleting the dataset, or wait until "
+            "the submission has finished processing."
+        )
+    # End of duplicate block
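A matching client sketch for the put handler above; the body accepts either "url" or the legacy "link" key, and the route, host, ids, and token are again placeholders rather than confirmed values:

import requests

API_BASE = "https://api.example.org"  # placeholder host
collection_id = "placeholder-collection-uuid"
dataset_id = "placeholder-dataset-uuid"

resp = requests.put(
    f"{API_BASE}/curation/v1/collections/{collection_id}/datasets/{dataset_id}",
    json={"url": "https://www.dropbox.com/s/placeholder/example.h5ad?dl=0"},  # placeholder link
    headers={"Authorization": "Bearer <access-token>"},
)
# Per the handler: 202 = ingestion started; 400 = invalid shared link;
# 403 = collection published or caller not the owner; 405 = a previous update
# of this dataset is still processing.
print(resp.status_code)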
12 changes: 12 additions & 0 deletions backend/portal/api/curation/v1/curation/collections/common.py
@@ -23,6 +23,7 @@
     CollectionVersion,
     CollectionVersionId,
     CollectionVersionWithDatasets,
+    DatasetId,
     DatasetArtifactType,
     DatasetProcessingStatus,
     DatasetValidationStatus,
@@ -373,6 +374,17 @@ def get_infered_collection_version_else_forbidden(collection_id: str) -> Optiona
         raise ForbiddenHTTPException()
     return version
 
+
+def get_infered_dataset_version(dataset_id: str) -> Optional[DatasetVersion]:
+    """
+    Infer the dataset version from either a DatasetId or a DatasetVersionId and return the DatasetVersion.
+    :param dataset_id: identifies the dataset version
+    :return: The DatasetVersion if it exists.
+    """
+    version = get_business_logic().get_dataset_version(DatasetVersionId(dataset_id))
+    if version is None:
+        version = get_business_logic().get_dataset_version_from_canonical(DatasetId(dataset_id))
+    return version
+
 
 def is_owner_or_allowed_else_forbidden(collection_version, user_info):
     if not user_info.is_user_owner_or_allowed(collection_version.owner):
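The lookup order in get_infered_dataset_version — try the string as a version id first, then fall back to treating it as a canonical dataset id — reduces to this standalone toy (all names illustrative):

versions_by_version_id = {"ver-1": "DatasetVersion ver-1"}
versions_by_canonical_id = {"dset-1": "DatasetVersion ver-1"}

def infer_dataset_version(dataset_id: str):
    # Mirrors the helper: version-id lookup first, canonical-id lookup second.
    version = versions_by_version_id.get(dataset_id)
    if version is None:
        version = versions_by_canonical_id.get(dataset_id)
    return version

assert infer_dataset_version("ver-1") == infer_dataset_version("dset-1")
assert infer_dataset_version("missing") is None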
10 changes: 9 additions & 1 deletion tests/unit/backend/api_server/base_api_test.py
@@ -10,7 +10,7 @@
 from tests.unit.backend.api_server.config import TOKEN_EXPIRES
 from tests.unit.backend.api_server.mock_auth import MockOauthServer
 from tests.unit.backend.fixtures.environment_setup import EnvironmentSetup
-from unit.backend.layers.common.base_api_test import NewBaseTest
+from tests.unit.backend.layers.common.base_api_test import NewBaseTest
 
 
 class BaseAPITest(NewBaseTest):
@@ -87,6 +87,14 @@ def get_cxguser_token(user="owner"):
 class BaseAuthAPITest(BaseAPITest):
     def setUp(self):
         super().setUp()
+
+        # TODO: this can be improved, but the current authorization method requires it
+        self.mock = patch(
+            "backend.common.corpora_config.CorporaAuthConfig.__getattr__",
+            return_value="mock_audience",
+        )
+        self.mock.start()
+
         self.mock_assert_authorized_token = patch(
             "backend.portal.api.app.v1.authentication.assert_authorized_token",
             side_effect=mock_assert_authorized_token,
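The CorporaAuthConfig patch works because unittest.mock replaces the class's __getattr__, so every config attribute that is not found normally resolves to the same stub value. A self-contained illustration of the trick, using a toy class rather than the real config:

from unittest.mock import patch

class Config:
    pass

with patch.object(Config, "__getattr__", create=True, return_value="mock_audience"):
    cfg = Config()
    # Any missing attribute now falls through to the patched __getattr__.
    assert cfg.api_audience == "mock_audience"
    assert cfg.anything_else == "mock_audience"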