Commit

Merge pull request #3631 from chanzuckerberg/nayib/persistence-layer
feat: persistence layer
nayib-jose-gloria authored Dec 5, 2022
2 parents 7ca3661 + 8602a32 commit 7bd523a
Showing 35 changed files with 2,269 additions and 200 deletions.
.happy/terraform/modules/ecs-stack/main.tf (2 changes: 1 addition & 1 deletion)
@@ -21,7 +21,7 @@ locals {
deletion_cmd = ["make", "-C", "/single-cell-data-portal/backend", "db/delete_remote_dev"]
frontend_cmd = []
# TODO: Assess whether this is safe for Portal API as well. Trying 1 worker in rdev portal backend containers, to minimize use of memory by TileDB (allocates multi-GB per process)
backend_cmd = ["gunicorn", "--worker-class", "gevent", "--workers", "1", "--bind", "0.0.0.0:5000", "backend.api_server.app:app", "--max-requests", "10000", "--timeout", "180", "--keep-alive", "5", "--log-level", "info"]
backend_cmd = ["gunicorn", "--worker-class", "gevent", "--workers", "1", "--bind", "0.0.0.0:5000", "backend.api_server.app:app", "--max-requests", "10000", "--timeout", "180", "--keep-alive", "5", "--log-level", "info", "--preload"]
data_load_path = "s3://${local.secret["s3_buckets"]["env"]["name"]}/database/dev_data.sql"

vpc_id = local.secret["vpc_id"]
Dockerfile.processing_image (3 changes: 2 additions & 1 deletion)
@@ -23,6 +23,7 @@ ADD tests /single-cell-data-portal/tests
ADD backend/__init__.py backend/__init__.py
ADD backend/portal/pipeline/__init__.py backend/portal/pipeline/__init__.py
ADD backend/portal/pipeline/processing backend/portal/pipeline/processing
+ADD backend/layers backend/layers
ADD backend/common backend/common

ARG HAPPY_BRANCH="unknown"
@@ -32,4 +33,4 @@ LABEL commit=${HAPPY_COMMIT}
ENV COMMIT_SHA=${HAPPY_COMMIT}
ENV COMMIT_BRANCH=${HAPPY_BRANCH}

CMD ["python3", "-m", "backend.portal.pipeline.processing.process"]
CMD ["python3", "-m", "backend.layers.processing.process"]
Dockerfile.upload_failures (2 changes: 1 addition & 1 deletion)
@@ -1,6 +1,6 @@
FROM public.ecr.aws/lambda/python:3.8

-COPY backend/portal/pipeline/upload_failures .
+COPY backend/layers/processing/upload_failures .

RUN pip3 install -r requirements.txt

Dockerfile.upload_success (3 changes: 2 additions & 1 deletion)
@@ -1,6 +1,7 @@
FROM public.ecr.aws/lambda/python:3.8

-COPY backend/portal/pipeline/upload_success .
+COPY backend/layers/processing/upload_success .
+COPY backend/layers ./backend/layers

RUN pip3 install -r requirements.txt

backend/api_server/app.py (5 changes: 5 additions & 0 deletions)
@@ -1,6 +1,11 @@
import json
import os
import time
+
+# TODO: Add StackOverflow link that explains
+import gevent.monkey
+gevent.monkey.patch_all()
+
from urllib.parse import urlparse

import connexion
backend/api_server/requirements.txt (3 changes: 2 additions & 1 deletion)
@@ -1,5 +1,6 @@
Authlib==0.14.3
connexion[swagger-ui]==2.13.0
+dataclasses-json
Flask-Cors>=3.0.6
gunicorn[gevent] >=20.1.0, <21.0.0
numba # required for where's my gene
@@ -14,4 +15,4 @@ python-json-logger
rsa>=4.7 # not directly required, pinned by Snyk to avoid a vulnerability
scanpy
SQLAlchemy-Utils>=0.36.8
-SQLAlchemy>=1.3.17,<2
+SQLAlchemy>=1.4.0,<1.5
backend/layers/api/portal_api.py (51 changes: 26 additions & 25 deletions)
@@ -52,7 +52,7 @@ def get_collections_list(self, from_date: int = None, to_date: int = None, token
"visibility": "PRIVATE" if c.published_at is None else "PUBLIC",
"owner": c.owner,
"created_at": c.created_at,
"revision_of": "NA", # TODO: looks like this isn't returned right now
# "revision_of": "NA", # TODO: looks like this isn't returned right now
})

result = {"collections": collections}
@@ -90,7 +90,7 @@ def _dataset_asset_to_response(self, dataset_artifact: DatasetArtifact, dataset_
"dataset_id": dataset_id,
"filename": "TODO", # TODO: might need to get it from the url
"filetype": dataset_artifact.type,
"id": dataset_artifact.id,
"id": dataset_artifact.id.id,
"s3_uri": dataset_artifact.uri,
"updated_at": 0,
"user_submitted": True,
@@ -108,38 +108,39 @@ def _ontology_term_ids_to_response(self, ontology_term_ids: List[OntologyTermId]
def remove_none(self, body: dict):
return {k: v for k, v in body.items() if v is not None}

+# Note: `metadata` can be none while the dataset is uploading
def _dataset_to_response(self, dataset: DatasetVersion):
-return {
-"assay": self._ontology_term_ids_to_response(dataset.metadata.assay),
-"batch_condition": dataset.metadata.batch_condition,
-"cell_count": dataset.metadata.cell_count,
-"cell_type": self._ontology_term_ids_to_response(dataset.metadata.cell_type),
+return self.remove_none({
+"assay": None if dataset.metadata is None else self._ontology_term_ids_to_response(dataset.metadata.assay),
+"batch_condition": None if dataset.metadata is None else dataset.metadata.batch_condition,
+"cell_count": None if dataset.metadata is None else dataset.metadata.cell_count,
+"cell_type": None if dataset.metadata is None else self._ontology_term_ids_to_response(dataset.metadata.cell_type),
"collection_id": dataset.collection_id.id,
"created_at": dataset.created_at,
"dataset_assets": [self._dataset_asset_to_response(a, dataset.dataset_id.id) for a in dataset.artifacts],
"dataset_deployments": [{"url": "TODO"}], # TODO: dataset.metadata.explorer_url,
"development_stage": self._ontology_term_ids_to_response(dataset.metadata.development_stage),
"disease": self._ontology_term_ids_to_response(dataset.metadata.disease),
"donor_id": dataset.metadata.donor_id,
"id": dataset.dataset_id.id,
"is_primary_data": dataset.metadata.is_primary_data,
"development_stage": None if dataset.metadata is None else self._ontology_term_ids_to_response(dataset.metadata.development_stage),
"disease": None if dataset.metadata is None else self._ontology_term_ids_to_response(dataset.metadata.disease),
"donor_id": None if dataset.metadata is None else dataset.metadata.donor_id,
"id": dataset.version_id.id,
"is_primary_data": None if dataset.metadata is None else dataset.metadata.is_primary_data,
"is_valid": True, # why do we have this
"mean_genes_per_cell": dataset.metadata.mean_genes_per_cell,
"name": dataset.metadata.name,
"organism": self._ontology_term_ids_to_response(dataset.metadata.organism),
"mean_genes_per_cell": None if dataset.metadata is None else dataset.metadata.mean_genes_per_cell,
"name": "" if dataset.metadata is None else dataset.metadata.name,
"organism": None if dataset.metadata is None else self._ontology_term_ids_to_response(dataset.metadata.organism),
"processing_status": self._dataset_processing_status_to_response(dataset.status, dataset.dataset_id.id),
"published": True,
"published": True, # TODO
"published_at": dataset.canonical_dataset.published_at,
"revision": 0, # TODO this is the progressive revision number. I don't think we'll need this
"schema_version": dataset.metadata.schema_version,
"self_reported_ethnicity": self._ontology_term_ids_to_response(dataset.metadata.self_reported_ethnicity),
"sex": self._ontology_term_ids_to_response(dataset.metadata.sex),
"suspension_type": dataset.metadata.suspension_type,
"tissue": self._ontology_term_ids_to_response(dataset.metadata.tissue),
"tombstone": False,
"schema_version": None if dataset.metadata is None else dataset.metadata.schema_version,
"self_reported_ethnicity": None if dataset.metadata is None else self._ontology_term_ids_to_response(dataset.metadata.self_reported_ethnicity),
"sex": None if dataset.metadata is None else self._ontology_term_ids_to_response(dataset.metadata.sex),
"suspension_type": None if dataset.metadata is None else dataset.metadata.suspension_type,
"tissue": None if dataset.metadata is None else self._ontology_term_ids_to_response(dataset.metadata.tissue),
"tombstone": False, # TODO
"updated_at": dataset.created_at, # Legacy: datasets can't be updated anymore
"x_approximate_distribution": dataset.metadata.x_approximate_distribution,
}
"x_approximate_distribution": None if dataset.metadata is None else dataset.metadata.x_approximate_distribution,
})

def _collection_to_response(self, collection: CollectionVersion, access_type: str):
collection_id = collection.collection_id.id if collection.published_at is not None else collection.version_id.id
@@ -157,7 +157,7 @@ def _collection_to_response(self, collection: CollectionVersion, access_type: st
"name": collection.metadata.name,
"published_at": collection.published_at,
"publisher_metadata": collection.publisher_metadata, # TODO: convert
"updated_at": collection.published_at,
"updated_at": collection.published_at or collection.created_at,
"visibility": "PUBLIC" if collection.published_at is not None else "PRIVATE",
})

backend/layers/api/router.py (13 changes: 6 additions & 7 deletions)
@@ -6,12 +6,12 @@

from backend.layers.api.portal_api import PortalApi
from backend.layers.business.business import BusinessLogic
-from backend.layers.persistence.persistence import DatabaseProviderInterface
+from backend.layers.persistence.persistence import DatabaseProvider, DatabaseProviderInterface
from backend.layers.persistence.persistence_mock import DatabaseProviderMock
from backend.layers.thirdparty.crossref_provider import CrossrefProviderInterface
from backend.layers.thirdparty.s3_provider import S3Provider
-from backend.layers.thirdparty.step_function_provider import StepFunctionProviderInterface
-from backend.layers.thirdparty.uri_provider import UriProviderInterface
+from backend.layers.thirdparty.step_function_provider import StepFunctionProvider, StepFunctionProviderInterface
+from backend.layers.thirdparty.uri_provider import UriProvider, UriProviderInterface
# from tests.unit.backend.layers.persistence.persistence_mock import DatabaseProviderMock
# from backend.layers.api.portal_api import PortalApi

@@ -22,12 +22,11 @@
def portal_api():
global api
if api is None:
-database_provider = DatabaseProviderMock()
+database_provider = DatabaseProvider()
crossref_provider = CrossrefProviderInterface()
-step_function_provider = StepFunctionProviderInterface()
+step_function_provider = StepFunctionProvider()
s3_provider = S3Provider()
-uri_provider = UriProviderInterface()
-uri_provider.validate = Mock(return_value=True) # By default, every link should be valid
+uri_provider = UriProvider()

business_logic = BusinessLogic(
database_provider,
(Diffs for the remaining 27 changed files are not shown.)
