Merge pull request #1329 from AlexsLemonade/dev
Deploy S3 bucket lockdown, change SRA downloads to HTTP
kurtwheeler authored Jun 5, 2019
2 parents 09d60fe + f5fc426 commit e134437
Showing 25 changed files with 221 additions and 38 deletions.
1 change: 1 addition & 0 deletions api/environments/local
@@ -16,6 +16,7 @@ NOMAD_HOST=nomad
LOCAL_ROOT_DIR=/home/user/data_store
USE_S3=False
S3_BUCKET_NAME=data-refinery
S3_QN_TARGET_BUCKET_NAME=
DATABASE_TIMEOUT=5

RUNNING_IN_CLOUD=False
2 changes: 2 additions & 0 deletions api/environments/test
@@ -17,3 +17,5 @@ NOMAD_HOST=nomad
MAILCHIMP_USER=MC_FAKE_USER
MAILCHIMP_API_KEY=MC_FAKE_API_KEY
MAILCHIMP_LIST_ID=MC_FAKE_LIST_ID

S3_QN_TARGET_BUCKET_NAME=
52 changes: 52 additions & 0 deletions common/data_refinery_common/migrations/0020_update_qn_bucket.py
@@ -0,0 +1,52 @@
# Generated by Django 2.1.5 on 2019-04-04 14:27

import sys

from django.conf import settings
from django.db import migrations
from data_refinery_common.utils import get_env_variable


# We want this to throw if the variable can't be accessed; there's no point
# in running a migration that sets everything to a bad value.
S3_QN_TARGET_BUCKET_NAME = get_env_variable("S3_QN_TARGET_BUCKET_NAME")


def update_qn_bucket(apps, schema_editor):
'''Sets the s3_bucket for QN Targets to a bucket just for them.
Based off of:
https://simpleisbetterthancomplex.com/tutorial/2017/09/26/how-to-create-django-data-migrations.html
We can't import the ComputedFile model directly as it may be a newer
version than this migration expects. We use the historical version.
'''

if not settings.RUNNING_IN_CLOUD:
return

# Pagination isn't necessary here because we have very few QN targets.
ComputedFile = apps.get_model('data_refinery_common', 'ComputedFile')
for computed_file in ComputedFile.objects.filter(is_qn_target=True):
if not computed_file.s3_bucket or not computed_file.s3_key:
continue

if not computed_file.change_s3_location(S3_QN_TARGET_BUCKET_NAME, computed_file.s3_key):
# The call to change_s3_location already logs what went wrong,
# so we don't need to log here. If there's any kind of
# network/AWS issue going on, all we really need to do is stop
# the migration from finishing successfully, since once a
# migration completes successfully it won't ever be run again.
sys.exit(1)


class Migration(migrations.Migration):

dependencies = [
('data_refinery_common', '0019_sample_is_blacklisted'),
]

operations = [
migrations.RunPython(update_qn_bucket),
]
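Because `S3_QN_TARGET_BUCKET_NAME` is read at module level, the migration fails fast at import time when the variable is unset instead of writing a bad value to every record. For context, a minimal sketch of what an environment-variable helper with that behavior could look like (a hypothetical illustration, not the project's actual `get_env_variable` implementation):

```python
import os

from django.core.exceptions import ImproperlyConfigured


def get_required_env_variable(var_name: str) -> str:
    """Return the named environment variable, raising loudly if it is unset."""
    try:
        return os.environ[var_name]
    except KeyError:
        raise ImproperlyConfigured("Set the %s environment variable." % var_name)
```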
52 changes: 52 additions & 0 deletions common/data_refinery_common/models/models.py
@@ -924,6 +924,58 @@ def sync_from_s3(self, force=False, path=None):
logger.exception(e, computed_file_id=self.pk)
return None

def change_s3_location(self, new_bucket: str, new_key: str) -> bool:
"""Moves the file from its current location in S3.
The new location will be set based on `new_bucket` and
`new_key`. The s3_bucket and s3_key properties will be updated
to reflect this on a successful move.
"""
old_bucket = self.s3_bucket
old_key = self.s3_key
copy_source = {
'Bucket': old_bucket,
'Key': old_key
}
try:
response = S3.copy_object(Bucket=new_bucket,
CopySource=copy_source,
Key=new_key)
except:
logger.exception("Could not copy computed file within S3",
computed_file_id=self.id,
source_bucket=old_bucket,
source_key=old_key,
destination_bucket=new_bucket,
destination_key=new_key)
return False

try:
self.s3_bucket = new_bucket
self.s3_key = new_key
self.save()
except:
logger.exception("Could not save computed file after it was copied!!!",
computed_file_id=self.id,
source_bucket=old_bucket,
source_key=old_key,
destination_bucket=new_bucket,
destination_key=new_key)
return False

try:
response = S3.delete_object(Bucket=old_bucket, Key=old_key)
except:
logger.exception("Could not delete computed file after it was copied and saved!!!",
computed_file_id=self.id,
source_bucket=old_bucket,
source_key=old_key,
destination_bucket=new_bucket,
destination_key=new_key)
return False

return True

def calculate_sha1(self) -> None:
""" Calculate the SHA1 value of a given file.
"""
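Since S3 has no rename operation, `change_s3_location` implements a move as copy, record update, then delete; because the record is saved before the delete, a failed delete never leaves the database pointing at a missing object. A standalone sketch of the same copy-then-delete pattern in plain boto3 (bucket and key names are placeholders):

```python
import boto3

s3 = boto3.client("s3")


def move_s3_object(old_bucket: str, old_key: str, new_bucket: str, new_key: str) -> bool:
    """Copy an object to a new bucket/key, then delete the original."""
    try:
        s3.copy_object(
            Bucket=new_bucket,
            Key=new_key,
            CopySource={"Bucket": old_bucket, "Key": old_key},
        )
        # Only remove the source once the copy has succeeded.
        s3.delete_object(Bucket=old_bucket, Key=old_key)
    except Exception:
        return False
    return True
```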
31 changes: 24 additions & 7 deletions common/data_refinery_common/utils.py
@@ -265,22 +265,22 @@ def calculate_sha1(absolute_file_path):

return hash_object.hexdigest()

def get_fasp_sra_download(run_accession: str):
def get_sra_download_url(run_accession, protocol="fasp"):
"""Try getting the sra-download URL from CGI endpoint"""
#Ex: curl --data "acc=SRR6718414&accept-proto=fasp&version=2.0" https://www.ncbi.nlm.nih.gov/Traces/names/names.cgi
cgi_url = "https://www.ncbi.nlm.nih.gov/Traces/names/names.cgi"
data = "acc=" + run_accession + "&accept-proto=fasp&version=2.0"
data = "acc=" + run_accession + "&accept-proto=" + protocol + "&version=2.0"
try:
resp = requests.post(cgi_url, data=data)
except Exception as e:
# Our configured logger depends on this utils module, so we use the standard logging library just for this.
import logging
logger = logging.getLogger(__name__)
logger.exception("Bad FASP CGI request!: " + str(cgi_url) + ", " + str(data))
logger.exception("Bad CGI request!: " + str(cgi_url) + ", " + str(data))
return None

if resp.status_code != 200:
# This isn't on the new FASP servers
# This isn't on the new servers
return None
else:
try:
@@ -290,15 +290,32 @@ def get_fasp_sra_download(run_accession: str):
# Sometimes the response from names.cgi makes no sense at all for a particular accession code. This helps us handle that.
# $ curl --data "acc=SRR5818019&accept-proto=fasp&version=2.0" https://www.ncbi.nlm.nih.gov/Traces/names/names.cgi
# 2.0\nremote|SRR5818019|434259775|2017-07-11T21:32:08Z|a4bfc16dbab1d4f729c4552e3c9519d1|||400|Only 'https' protocol is allowed for this object
sra_url = resp.text.split('\n')[1].split('|')[6].split('fasp://')[1]
protocol_header = protocol + '://'
sra_url = resp.text.split('\n')[1].split('|')[6]
return sra_url
except Exception as e:
# Our configured logger depends on this utils module, so we use the standard logging library just for this.
import logging
logger = logging.getLogger(__name__)
logger.exception("Error parsing FASP CGI response: " + str(cgi_url) + " " + str(data) + " " + str(resp.text))
logger.exception("Error parsing CGI response: " + str(cgi_url) + " " + str(data) + " " + str(resp.text))
return None


def get_fasp_sra_download(run_accession: str):
"""Get an URL for SRA using the FASP protocol.
These URLs should not actually include the protcol."""
full_url = get_sra_download_url(run_accession, 'fasp')
if full_url:
sra_url = full_url.split('fasp://')[1]
return sra_url
else:
return None

def get_https_sra_download(run_accession: str):
"""Get an HTTPS URL for SRA."""
return get_sra_download_url(run_accession, 'https')

def load_blacklist(blacklist_csv: str="config/RNASeqRunBlackList.csv"):
""" Loads the SRA run blacklist """

@@ -313,4 +313,4 @@ def load_blacklist(blacklist_csv: str="config/RNASeqRunBlackList.csv"):

blacklisted_samples.append(line[0].strip())

return blacklisted_samples
return blacklisted_samples
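A quick usage sketch of the helpers above (the accession comes from the example in the comments; the exact URLs returned depend on NCBI's names.cgi service, and both helpers return `None` on failure):

```python
from data_refinery_common.utils import get_fasp_sra_download, get_https_sra_download

https_url = get_https_sra_download("SRR6718414")  # e.g. an https:// sra-download URL, or None
fasp_path = get_fasp_sra_download("SRR6718414")   # the same lookup with the fasp:// prefix stripped, or None
```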
1 change: 1 addition & 0 deletions common/environments/local
@@ -14,6 +14,7 @@ NOMAD_PORT=4646
LOCAL_ROOT_DIR=/home/user/data_store
USE_S3=False
S3_BUCKET_NAME=data-refinery
S3_QN_TARGET_BUCKET_NAME=data-refinery-s3-qn-target-bucket
DATABASE_TIMEOUT=5

RUNNING_IN_CLOUD=False
1 change: 1 addition & 0 deletions common/environments/test
@@ -17,4 +17,5 @@ SERVICE=common
LOCAL_ROOT_DIR=/home/user/data_store
USE_S3=False
S3_BUCKET_NAME=data-refinery
S3_QN_TARGET_BUCKET_NAME=data_refinery_qn_target_bucket
DATABASE_TIMEOUT=5
2 changes: 1 addition & 1 deletion common/run_tests.sh
@@ -31,5 +31,5 @@ docker run \
--add-host=database:"$DB_HOST_IP" \
--add-host=nomad:"$HOST_IP" \
--add-host=elasticsearch:"$ES_HOST_IP" \
--env-file api/environments/test \
--env-file common/environments/test \
-it ccdlstaging/dr_common_tests bash -c "$(run_tests_with_coverage $@)" --parallel
1 change: 1 addition & 0 deletions foreman/environments/local
@@ -24,6 +24,7 @@ LOCAL_ROOT_DIR=/home/user/data_store
RAVEN_DSN=
RAVEN_DSN_API=
S3_TRANSCRIPTOME_INDEX_BUCKET_NAME=
S3_QN_TARGET_BUCKET_NAME=

LOG_LEVEL=INFO

1 change: 1 addition & 0 deletions foreman/environments/test
@@ -24,6 +24,7 @@ LOCAL_ROOT_DIR=/home/user/data_store
RAVEN_DSN=
RAVEN_DSN_API=
S3_TRANSCRIPTOME_INDEX_BUCKET_NAME=
S3_QN_TARGET_BUCKET_NAME=

LOG_LEVEL=INFO

10 changes: 9 additions & 1 deletion infrastructure/deploy.sh
@@ -18,7 +18,7 @@ print_options() {
echo ' "-e dev" will deploy a dev stack which is appropriate for a single developer to use to test.'
echo '-d May be used to override the Dockerhub repo where the images will be pulled from.'
echo ' This may also be specified by setting the TF_VAR_dockerhub_repo environment variable.'
echo ' If unset, defaults to the value in `infrastructure/environments/$env`, which is "ccdlstaging"'
echo ' If unset, defaults to "ccdlstaging" if the version contains "-dev" and "ccdl" otherwise.'
echo ' for dev and staging environments and "ccdl" for prod.'
echo ' This option is useful for testing code changes. Images with the code to be tested can be pushed'
echo ' to your private Dockerhub repo and then the system will find them.'
@@ -80,6 +80,14 @@ if [[ -z $SYSTEM_VERSION ]]; then
exit 1
fi

if [[ -z $TF_VAR_dockerhub_repo ]]; then
if [[ $SYSTEM_VERSION == *"-dev" ]]; then
export TF_VAR_dockerhub_repo=ccdlstaging
else
export TF_VAR_dockerhub_repo=ccdl
fi
fi

if [[ -z $TF_VAR_region ]]; then
TF_VAR_region=us-east-1
fi
17 changes: 14 additions & 3 deletions infrastructure/disk.tf
@@ -2,7 +2,7 @@

##
# EBS
##
##

resource "aws_ebs_volume" "data_refinery_ebs" {
count = "${var.max_clients}"
@@ -36,7 +36,7 @@ resource "aws_s3_bucket" "data_refinery_bucket" {

resource "aws_s3_bucket" "data_refinery_results_bucket" {
bucket = "data-refinery-s3-results-${var.user}-${var.stage}"
acl = "public-read"
acl = "private"
force_destroy = "${var.static_bucket_prefix == "dev" ? true : false}"

tags {
@@ -92,9 +92,20 @@ resource "aws_s3_bucket" "data_refinery_transcriptome_index_bucket" {
}
}

resource "aws_s3_bucket" "data_refinery_qn_target_bucket" {
bucket = "data-refinery-s3-qn-target-${var.user}-${var.stage}"
acl = "public-read"
force_destroy = "${var.static_bucket_prefix == "dev" ? true : false}"

tags {
Name = "data-refinery-s3-qn-target-${var.user}-${var.stage}"
Environment = "${var.stage}"
}
}

resource "aws_s3_bucket" "data_refinery_compendia_bucket" {
bucket = "data-refinery-s3-compendia-${var.user}-${var.stage}"
acl = "public-read"
acl = "private"
force_destroy = "${var.static_bucket_prefix == "dev" ? true : false}"

tags {
Binary file modified infrastructure/environments/prod.tfvars
Binary file modified infrastructure/environments/staging.tfvars
3 changes: 2 additions & 1 deletion infrastructure/init_terraform.sh
@@ -21,4 +21,5 @@ rm -rf .terraform
terraform init \
-force-copy \
-backend-config="bucket=refinebio-tfstate-deploy-$STAGE" \
-backend-config="key=terraform-${TF_VAR_user}.tfstate"
-backend-config="key=terraform-${TF_VAR_user}.tfstate" \
-backend-config="dynamodb_table=refinebio-terraform-lock"
Changes to an additional file (name not shown):
@@ -55,7 +55,7 @@ echo "
# Cannot specify bip option in config file because it is hardcoded in
# the startup command because docker is run by clowns.
service docker stop
nohup /usr/bin/dockerd -s overlay2 --bip=172.17.77.1/22 --log-driver=json-file --log-opt max-size=100m --log-opt max-file=3 > /dev/null &
nohup /usr/bin/dockerd -s overlay2 --bip=172.17.77.1/22 --log-driver=json-file --log-opt max-size=100m --log-opt max-file=3 > /var/log/docker_daemon.log &

# Output the files we need to start up Nomad and register jobs:
# (Note that the lines starting with "$" are where
2 changes: 2 additions & 0 deletions infrastructure/variables.tf
@@ -241,6 +241,8 @@ output "environment_variables" {
value = "${aws_s3_bucket.data_refinery_results_bucket.id}"},
{name = "S3_TRANSCRIPTOME_INDEX_BUCKET_NAME"
value = "${aws_s3_bucket.data_refinery_transcriptome_index_bucket.id}"},
{name = "S3_QN_TARGET_BUCKET_NAME"
value = "${aws_s3_bucket.data_refinery_qn_target_bucket.id}"},
{name = "S3_COMPENDIA_BUCKET_NAME"
value = "${aws_s3_bucket.data_refinery_compendia_bucket.id}"},
{name = "LOCAL_ROOT_DIR"
2 changes: 2 additions & 0 deletions workers/README_DATASET.md
@@ -9,6 +9,8 @@ This download includes gene expression matrices and experiment and sample metadata

* The `aggregated_metadata.json` file contains information about the options you selected for download.
Specifically, the `aggregate_by` and `scale_by` fields note how the samples are grouped into gene expression matrices and how the gene expression data values were transformed, respectively.
The `quantile_normalized` field notes whether or not quantile normalization was performed (see the example below).
Currently, we only support skipping quantile normalization for RNA-seq experiments when aggregating by experiment on the web interface.

* Individual gene expression matrices and their corresponding sample metadata files are in their own directories.

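For example, a downstream analysis script could check these fields when loading the download (a minimal sketch; only the fields named above are assumed to be present):

```python
import json

with open("aggregated_metadata.json") as metadata_file:
    metadata = json.load(metadata_file)

# Fields described above: how samples were grouped, how values were scaled,
# and whether quantile normalization was applied.
print(metadata.get("aggregate_by"))
print(metadata.get("scale_by"))
print(metadata.get("quantile_normalized"))
```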
32 changes: 28 additions & 4 deletions workers/data_refinery_workers/downloaders/sra.py
@@ -13,7 +13,7 @@
DownloaderJobOriginalFileAssociation,
OriginalFile,
)
from data_refinery_common.utils import get_env_variable, get_fasp_sra_download
from data_refinery_common.utils import get_env_variable, get_https_sra_download
from data_refinery_workers.downloaders import utils

logger = get_and_configure_logger(__name__)
@@ -38,14 +38,14 @@ def _download_file(download_url: str,
elif "ncbi.nlm.nih.gov" in download_url and not force_ftp:
# Try to convert old-style endpoints into new-style endpoints if possible
try:
if 'anonftp' in download_url:
if 'anonftp' in download_url or 'dbtest' in download_url:
accession = download_url.split('/')[-1].split('.sra')[0]
new_url = get_fasp_sra_download(accession)
new_url = get_https_sra_download(accession)
if new_url:
download_url = new_url
except Exception:
pass
return _download_file_aspera(download_url, downloader_job, target_file_path, source="NCBI")
return _download_file_http(download_url, downloader_job, target_file_path)
else:
return _download_file_ftp(download_url, downloader_job, target_file_path)

@@ -77,6 +77,30 @@ def _download_file_ftp(download_url: str, downloader_job: DownloaderJob, target_
return True


def _download_file_http(download_url: str,
downloader_job: DownloaderJob,
target_file_path: str
) -> bool:
try:
target_file = open(target_file_path, "wb")
logger.debug("Downloading file from %s to %s using HTTP.",
download_url,
target_file_path,
downloader_job=downloader_job.id)

with closing(urllib.request.urlopen(download_url, timeout=60)) as request:
shutil.copyfileobj(request, target_file, CHUNK_SIZE)
except Exception:
logger.exception("Exception caught while downloading file.",
downloader_job=downloader_job.id)
downloader_job.failure_reason = "Exception caught while downloading file"
return False
finally:
target_file.close()

return True


def _download_file_aspera(download_url: str,
downloader_job: DownloaderJob,
target_file_path: str,
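As an illustration of the endpoint conversion in `_download_file` above, the run accession is recovered from the tail of an old-style URL and then handed to `get_https_sra_download` (the URL below is a hypothetical old-style path, shown only for the string handling):

```python
# Hypothetical old-style download URL; real values come from SRA metadata.
old_url = "https://ftp-trace.ncbi.nlm.nih.gov/sra/anonftp/SRR6718414/SRR6718414.sra"

accession = old_url.split('/')[-1].split('.sra')[0]
print(accession)  # -> SRR6718414
```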
(Remaining changed files not shown.)
