Merge pull request #1329 from AlexsLemonade/dev
Deploy S3 bucket lockdown, change SRA downloads to HTTP
kurtwheeler authored Jun 5, 2019
2 parents 09d60fe + f5fc426 commit e134437
Showing 25 changed files with 221 additions and 38 deletions.
1 change: 1 addition & 0 deletions api/environments/local
@@ -16,6 +16,7 @@ NOMAD_HOST=nomad
LOCAL_ROOT_DIR=/home/user/data_store
USE_S3=False
S3_BUCKET_NAME=data-refinery
S3_QN_TARGET_BUCKET_NAME=
DATABASE_TIMEOUT=5

RUNNING_IN_CLOUD=False
2 changes: 2 additions & 0 deletions api/environments/test
@@ -17,3 +17,5 @@ NOMAD_HOST=nomad
MAILCHIMP_USER=MC_FAKE_USER
MAILCHIMP_API_KEY=MC_FAKE_API_KEY
MAILCHIMP_LIST_ID=MC_FAKE_LIST_ID

S3_QN_TARGET_BUCKET_NAME=
52 changes: 52 additions & 0 deletions common/data_refinery_common/migrations/0020_update_qn_bucket.py
@@ -0,0 +1,52 @@
# Generated by Django 2.1.5 on 2019-04-04 14:27

import sys

from django.conf import settings
from django.db import migrations
from data_refinery_common.utils import get_env_variable


# We want this to throw if the variable can't be accessed; there's no point
# in running a migration that sets everything to a bad value.
S3_QN_TARGET_BUCKET_NAME = get_env_variable("S3_QN_TARGET_BUCKET_NAME")


def update_qn_bucket(apps, schema_editor):
'''Sets the s3_bucket for QN Targets to a bucket just for them.
Based off of:
https://simpleisbetterthancomplex.com/tutorial/2017/09/26/how-to-create-django-data-migrations.html
We can't import the ComputedFile model directly as it may be a newer
version than this migration expects. We use the historical version.
'''

if not settings.RUNNING_IN_CLOUD:
return

# Pagination isn't necessary here because we have very few QN targets.
ComputedFile = apps.get_model('data_refinery_common', 'ComputedFile')
for computed_file in ComputedFile.objects.filter(is_qn_target=True):
if not computed_file.s3_bucket or not computed_file.s3_key:
continue

if not computed_file.change_s3_location(S3_QN_TARGET_BUCKET_NAME, computed_file.s3_key):
# The call to change_s3_location already logs what went wrong,
# so we don't need to log here. If there's any kind of
# network/AWS issue going on, all we really need to do is stop
# the migration from finishing successfully, since once a
# migration completes successfully it won't ever be run again.
sys.exit(1)


class Migration(migrations.Migration):

dependencies = [
('data_refinery_common', '0019_sample_is_blacklisted'),
]

operations = [
migrations.RunPython(update_qn_bucket),
]
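Because `S3_QN_TARGET_BUCKET_NAME` is read at module level, the migration fails fast at import time when the variable is unset instead of writing a bad value to every record. For context, a minimal sketch of what an environment-variable helper with that behavior could look like (a hypothetical illustration, not the project's actual `get_env_variable` implementation):

```python
import os

from django.core.exceptions import ImproperlyConfigured


def get_required_env_variable(var_name: str) -> str:
    """Return the named environment variable, raising loudly if it is unset."""
    try:
        return os.environ[var_name]
    except KeyError:
        raise ImproperlyConfigured("Set the %s environment variable." % var_name)
```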
52 changes: 52 additions & 0 deletions common/data_refinery_common/models/models.py
@@ -924,6 +924,58 @@ def sync_from_s3(self, force=False, path=None):
logger.exception(e, computed_file_id=self.pk)
return None

def change_s3_location(self, new_bucket: str, new_key: str) -> bool:
"""Moves the file from its current location in S3.
The new location will be set based on `new_bucket` and
`new_key`. The s3_bucket and s3_key properties will be updated
to reflect this on a successful move.
"""
old_bucket = self.s3_bucket
old_key = self.s3_key
copy_source = {
'Bucket': old_bucket,
'Key': old_key
}
try:
response = S3.copy_object(Bucket=new_bucket,
CopySource=copy_source,
Key=new_key)
except:
logger.exception("Could not copy computed file within S3",
computed_file_id=self.id,
source_bucket=old_bucket,
source_key=old_key,
destination_bucket=new_bucket,
destination_key=new_key)
return False

try:
self.s3_bucket = new_bucket
self.s3_key = new_key
self.save()
except:
logger.exception("Could not save computed file after it was copied!!!",
computed_file_id=self.id,
source_bucket=old_bucket,
source_key=old_key,
destination_bucket=new_bucket,
destination_key=new_key)
return False

try:
response = S3.delete_object(Bucket=old_bucket, Key=old_key)
except:
logger.exception("Could not delete computed file after it was copied and saved!!!",
computed_file_id=self.id,
source_bucket=old_bucket,
source_key=old_key,
destination_bucket=new_bucket,
destination_key=new_key)
return False

return True

def calculate_sha1(self) -> None:
""" Calculate the SHA1 value of a given file.
"""
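Since S3 has no rename operation, `change_s3_location` implements a move as copy, record update, then delete; because the record is saved before the delete, a failed delete never leaves the database pointing at a missing object. A standalone sketch of the same copy-then-delete pattern in plain boto3 (bucket and key names are placeholders):

```python
import boto3

s3 = boto3.client("s3")


def move_s3_object(old_bucket: str, old_key: str, new_bucket: str, new_key: str) -> bool:
    """Copy an object to a new bucket/key, then delete the original."""
    try:
        s3.copy_object(
            Bucket=new_bucket,
            Key=new_key,
            CopySource={"Bucket": old_bucket, "Key": old_key},
        )
        # Only remove the source once the copy has succeeded.
        s3.delete_object(Bucket=old_bucket, Key=old_key)
    except Exception:
        return False
    return True
```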
31 changes: 24 additions & 7 deletions common/data_refinery_common/utils.py
@@ -265,22 +265,22 @@ def calculate_sha1(absolute_file_path):

return hash_object.hexdigest()

def get_fasp_sra_download(run_accession: str):
def get_sra_download_url(run_accession, protocol="fasp"):
"""Try getting the sra-download URL from CGI endpoint"""
#Ex: curl --data "acc=SRR6718414&accept-proto=fasp&version=2.0" https://www.ncbi.nlm.nih.gov/Traces/names/names.cgi
cgi_url = "https://www.ncbi.nlm.nih.gov/Traces/names/names.cgi"
data = "acc=" + run_accession + "&accept-proto=fasp&version=2.0"
data = "acc=" + run_accession + "&accept-proto=" + protocol + "&version=2.0"
try:
resp = requests.post(cgi_url, data=data)
except Exception as e:
# Our configured logger depends on this utils module, so we use the standard logging library just for this.
import logging
logger = logging.getLogger(__name__)
logger.exception("Bad FASP CGI request!: " + str(cgi_url) + ", " + str(data))
logger.exception("Bad CGI request!: " + str(cgi_url) + ", " + str(data))
return None

if resp.status_code != 200:
# This isn't on the new FASP servers
# This isn't on the new servers
return None
else:
try:
@@ -290,15 +290,32 @@ def get_fasp_sra_download(run_accession: str):
# Sometimes the response from names.cgi makes no sense at all for a particular accession code. This helps us handle that.
# $ curl --data "acc=SRR5818019&accept-proto=fasp&version=2.0" https://www.ncbi.nlm.nih.gov/Traces/names/names.cgi
# 2.0\nremote|SRR5818019|434259775|2017-07-11T21:32:08Z|a4bfc16dbab1d4f729c4552e3c9519d1|||400|Only 'https' protocol is allowed for this object
sra_url = resp.text.split('\n')[1].split('|')[6].split('fasp://')[1]
protocol_header = protocol + '://'
sra_url = resp.text.split('\n')[1].split('|')[6]
return sra_url
except Exception as e:
# Our configured logger depends on this utils module, so we use the standard logging library just for this.
import logging
logger = logging.getLogger(__name__)
logger.exception("Error parsing FASP CGI response: " + str(cgi_url) + " " + str(data) + " " + str(resp.text))
logger.exception("Error parsing CGI response: " + str(cgi_url) + " " + str(data) + " " + str(resp.text))
return None


def get_fasp_sra_download(run_accession: str):
"""Get an URL for SRA using the FASP protocol.
These URLs should not actually include the protcol."""
full_url = get_sra_download_url(run_accession, 'fasp')
if full_url:
sra_url = full_url.split('fasp://')[1]
return sra_url
else:
return None

def get_https_sra_download(run_accession: str):
"""Get an HTTPS URL for SRA."""
return get_sra_download_url(run_accession, 'https')

def load_blacklist(blacklist_csv: str="config/RNASeqRunBlackList.csv"):
""" Loads the SRA run blacklist """

@@ -313,4 +313,4 @@ def load_blacklist(blacklist_csv: str="config/RNASeqRunBlackList.csv"):

blacklisted_samples.append(line[0].strip())

return blacklisted_samples
return blacklisted_samples
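A quick usage sketch of the helpers above (the accession comes from the example in the comments; the exact URLs returned depend on NCBI's names.cgi service, and both helpers return `None` on failure):

```python
from data_refinery_common.utils import get_fasp_sra_download, get_https_sra_download

https_url = get_https_sra_download("SRR6718414")  # e.g. an https:// sra-download URL, or None
fasp_path = get_fasp_sra_download("SRR6718414")   # the same lookup with the fasp:// prefix stripped, or None
```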
1 change: 1 addition & 0 deletions common/environments/local
@@ -14,6 +14,7 @@ NOMAD_PORT=4646
LOCAL_ROOT_DIR=/home/user/data_store
USE_S3=False
S3_BUCKET_NAME=data-refinery
S3_QN_TARGET_BUCKET_NAME=data-refinery-s3-qn-target-bucket
DATABASE_TIMEOUT=5

RUNNING_IN_CLOUD=False
1 change: 1 addition & 0 deletions common/environments/test
@@ -17,4 +17,5 @@ SERVICE=common
LOCAL_ROOT_DIR=/home/user/data_store
USE_S3=False
S3_BUCKET_NAME=data-refinery
S3_QN_TARGET_BUCKET_NAME=data_refinery_qn_target_bucket
DATABASE_TIMEOUT=5
2 changes: 1 addition & 1 deletion common/run_tests.sh
@@ -31,5 +31,5 @@ docker run \
--add-host=database:"$DB_HOST_IP" \
--add-host=nomad:"$HOST_IP" \
--add-host=elasticsearch:"$ES_HOST_IP" \
--env-file api/environments/test \
--env-file common/environments/test \
-it ccdlstaging/dr_common_tests bash -c "$(run_tests_with_coverage $@)" --parallel
1 change: 1 addition & 0 deletions foreman/environments/local
@@ -24,6 +24,7 @@ LOCAL_ROOT_DIR=/home/user/data_store
RAVEN_DSN=
RAVEN_DSN_API=
S3_TRANSCRIPTOME_INDEX_BUCKET_NAME=
S3_QN_TARGET_BUCKET_NAME=

LOG_LEVEL=INFO

1 change: 1 addition & 0 deletions foreman/environments/test
@@ -24,6 +24,7 @@ LOCAL_ROOT_DIR=/home/user/data_store
RAVEN_DSN=
RAVEN_DSN_API=
S3_TRANSCRIPTOME_INDEX_BUCKET_NAME=
S3_QN_TARGET_BUCKET_NAME=

LOG_LEVEL=INFO

10 changes: 9 additions & 1 deletion infrastructure/deploy.sh
@@ -18,7 +18,7 @@ print_options() {
echo ' "-e dev" will deploy a dev stack which is appropriate for a single developer to use to test.'
echo '-d May be used to override the Dockerhub repo where the images will be pulled from.'
echo ' This may also be specified by setting the TF_VAR_dockerhub_repo environment variable.'
echo ' If unset, defaults to the value in `infrastructure/environments/$env`, which is "ccdlstaging"'
echo ' If unset, defaults to "ccdlstaging" if the version contains "-dev" and "ccdl" otherwise.'
echo ' for dev and staging environments and "ccdl" for prod.'
echo ' This option is useful for testing code changes. Images with the code to be tested can be pushed'
echo ' to your private Dockerhub repo and then the system will find them.'
@@ -80,6 +80,14 @@ if [[ -z $SYSTEM_VERSION ]]; then
exit 1
fi

if [[ -z $TF_VAR_dockerhub_repo ]]; then
if [[ $SYSTEM_VERSION == *"-dev" ]]; then
export TF_VAR_dockerhub_repo=ccdlstaging
else
export TF_VAR_dockerhub_repo=ccdl
fi
fi

if [[ -z $TF_VAR_region ]]; then
TF_VAR_region=us-east-1
fi
17 changes: 14 additions & 3 deletions infrastructure/disk.tf
@@ -2,7 +2,7 @@

##
# EBS
##
##

resource "aws_ebs_volume" "data_refinery_ebs" {
count = "${var.max_clients}"
@@ -36,7 +36,7 @@ resource "aws_s3_bucket" "data_refinery_bucket" {

resource "aws_s3_bucket" "data_refinery_results_bucket" {
bucket = "data-refinery-s3-results-${var.user}-${var.stage}"
acl = "public-read"
acl = "private"
force_destroy = "${var.static_bucket_prefix == "dev" ? true : false}"

tags {
@@ -92,9 +92,20 @@ resource "aws_s3_bucket" "data_refinery_transcriptome_index_bucket" {
}
}

resource "aws_s3_bucket" "data_refinery_qn_target_bucket" {
bucket = "data-refinery-s3-qn-target-${var.user}-${var.stage}"
acl = "public-read"
force_destroy = "${var.static_bucket_prefix == "dev" ? true : false}"

tags {
Name = "data-refinery-s3-qn-target-${var.user}-${var.stage}"
Environment = "${var.stage}"
}
}

resource "aws_s3_bucket" "data_refinery_compendia_bucket" {
bucket = "data-refinery-s3-compendia-${var.user}-${var.stage}"
acl = "public-read"
acl = "private"
force_destroy = "${var.static_bucket_prefix == "dev" ? true : false}"

tags {
Binary file modified infrastructure/environments/prod.tfvars
Binary file modified infrastructure/environments/staging.tfvars
3 changes: 2 additions & 1 deletion infrastructure/init_terraform.sh
@@ -21,4 +21,5 @@ rm -rf .terraform
terraform init \
-force-copy \
-backend-config="bucket=refinebio-tfstate-deploy-$STAGE" \
-backend-config="key=terraform-${TF_VAR_user}.tfstate"
-backend-config="key=terraform-${TF_VAR_user}.tfstate" \
-backend-config="dynamodb_table=refinebio-terraform-lock"
Changes to an additional file (name not shown):
@@ -55,7 +55,7 @@ echo "
# Cannot specify bip option in config file because it is hardcoded in
# the startup command because docker is run by clowns.
service docker stop
nohup /usr/bin/dockerd -s overlay2 --bip=172.17.77.1/22 --log-driver=json-file --log-opt max-size=100m --log-opt max-file=3 > /dev/null &
nohup /usr/bin/dockerd -s overlay2 --bip=172.17.77.1/22 --log-driver=json-file --log-opt max-size=100m --log-opt max-file=3 > /var/log/docker_daemon.log &

# Output the files we need to start up Nomad and register jobs:
# (Note that the lines starting with "$" are where
2 changes: 2 additions & 0 deletions infrastructure/variables.tf
@@ -241,6 +241,8 @@ output "environment_variables" {
value = "${aws_s3_bucket.data_refinery_results_bucket.id}"},
{name = "S3_TRANSCRIPTOME_INDEX_BUCKET_NAME"
value = "${aws_s3_bucket.data_refinery_transcriptome_index_bucket.id}"},
{name = "S3_QN_TARGET_BUCKET_NAME"
value = "${aws_s3_bucket.data_refinery_qn_target_bucket.id}"},
{name = "S3_COMPENDIA_BUCKET_NAME"
value = "${aws_s3_bucket.data_refinery_compendia_bucket.id}"},
{name = "LOCAL_ROOT_DIR"
2 changes: 2 additions & 0 deletions workers/README_DATASET.md
@@ -9,6 +9,8 @@ This download includes gene expression matrices and experiment and sample metadata

* The `aggregated_metadata.json` file contains information about the options you selected for download.
Specifically, the `aggregate_by` and `scale_by` fields note how the samples are grouped into gene expression matrices and how the gene expression data values were transformed, respectively.
The `quantile_normalized` field notes whether or not quantile normalization was performed (see the example below).
Currently, we only support skipping quantile normalization for RNA-seq experiments when aggregating by experiment on the web interface.

* Individual gene expression matrices and their corresponding sample metadata files are in their own directories.

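For example, a downstream analysis script could check these fields when loading the download (a minimal sketch; only the fields named above are assumed to be present):

```python
import json

with open("aggregated_metadata.json") as metadata_file:
    metadata = json.load(metadata_file)

# Fields described above: how samples were grouped, how values were scaled,
# and whether quantile normalization was applied.
print(metadata.get("aggregate_by"))
print(metadata.get("scale_by"))
print(metadata.get("quantile_normalized"))
```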
32 changes: 28 additions & 4 deletions workers/data_refinery_workers/downloaders/sra.py
@@ -13,7 +13,7 @@
DownloaderJobOriginalFileAssociation,
OriginalFile,
)
from data_refinery_common.utils import get_env_variable, get_fasp_sra_download
from data_refinery_common.utils import get_env_variable, get_https_sra_download
from data_refinery_workers.downloaders import utils

logger = get_and_configure_logger(__name__)
@@ -38,14 +38,14 @@ def _download_file(download_url: str,
elif "ncbi.nlm.nih.gov" in download_url and not force_ftp:
# Try to convert old-style endpoints into new-style endpoints if possible
try:
if 'anonftp' in download_url:
if 'anonftp' in download_url or 'dbtest' in download_url:
accession = download_url.split('/')[-1].split('.sra')[0]
new_url = get_fasp_sra_download(accession)
new_url = get_https_sra_download(accession)
if new_url:
download_url = new_url
except Exception:
pass
return _download_file_aspera(download_url, downloader_job, target_file_path, source="NCBI")
return _download_file_http(download_url, downloader_job, target_file_path)
else:
return _download_file_ftp(download_url, downloader_job, target_file_path)

@@ -77,6 +77,30 @@ def _download_file_ftp(download_url: str, downloader_job: DownloaderJob, target_
return True


def _download_file_http(download_url: str,
downloader_job: DownloaderJob,
target_file_path: str
) -> bool:
try:
target_file = open(target_file_path, "wb")
logger.debug("Downloading file from %s to %s using HTTP.",
download_url,
target_file_path,
downloader_job=downloader_job.id)

with closing(urllib.request.urlopen(download_url, timeout=60)) as request:
shutil.copyfileobj(request, target_file, CHUNK_SIZE)
except Exception:
logger.exception("Exception caught while downloading file.",
downloader_job=downloader_job.id)
downloader_job.failure_reason = "Exception caught while downloading file"
return False
finally:
target_file.close()

return True


def _download_file_aspera(download_url: str,
downloader_job: DownloaderJob,
target_file_path: str,
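As an illustration of the endpoint conversion in `_download_file` above, the run accession is recovered from the tail of an old-style URL and then handed to `get_https_sra_download` (the URL below is a hypothetical old-style path, shown only for the string handling):

```python
# Hypothetical old-style download URL; real values come from SRA metadata.
old_url = "https://ftp-trace.ncbi.nlm.nih.gov/sra/anonftp/SRR6718414/SRR6718414.sra"

accession = old_url.split('/')[-1].split('.sra')[0]
print(accession)  # -> SRR6718414
```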
(Remaining changed files not shown.)
