[HOTFIX] Deploy compendium fixes to production #1444

Merged
merged 13 commits into from Aug 1, 2019
Changes from all commits
32 changes: 16 additions & 16 deletions workers/data_refinery_workers/processors/create_compendia.py
@@ -27,7 +27,7 @@
Organism
)
from data_refinery_common.utils import get_env_variable
from data_refinery_workers.processors import utils, smasher, visualize
from data_refinery_workers.processors import utils, smasher#, visualize


S3_BUCKET_NAME = get_env_variable("S3_BUCKET_NAME", "data-refinery")
@@ -131,26 +131,26 @@ def _perform_imputation(job_context: Dict) -> Dict:
# Perform a full outer join of microarray_expression_matrix and log2_rnaseq_matrix; combined_matrix
combined_matrix = microarray_expression_matrix.merge(log2_rnaseq_matrix, how='outer', left_index=True, right_index=True)

# Visualize Prefiltered
output_path = job_context['output_dir'] + "pre_filtered_" + str(time.time()) + ".png"
visualized_prefilter = visualize.visualize(combined_matrix.copy(), output_path)
# # Visualize Prefiltered
# output_path = job_context['output_dir'] + "pre_filtered_" + str(time.time()) + ".png"
# visualized_prefilter = visualize.visualize(combined_matrix.copy(), output_path)

# Remove genes (rows) with <=70% present values in combined_matrix
thresh = combined_matrix.shape[1] * .7 # (Rows, Columns)
row_filtered_combined_matrix = combined_matrix.dropna(axis='index', thresh=thresh) # Everything below `thresh` is dropped

# Visualize Row Filtered
output_path = job_context['output_dir'] + "row_filtered_" + str(time.time()) + ".png"
visualized_rowfilter = visualize.visualize(row_filtered_combined_matrix.copy(), output_path)
# # Visualize Row Filtered
# output_path = job_context['output_dir'] + "row_filtered_" + str(time.time()) + ".png"
# visualized_rowfilter = visualize.visualize(row_filtered_combined_matrix.copy(), output_path)

# Remove samples (columns) with <50% present values in combined_matrix
# XXX: Find better test data for this!
col_thresh = row_filtered_combined_matrix.shape[0] * .5
row_col_filtered_combined_matrix_samples = row_filtered_combined_matrix.dropna(axis='columns', thresh=col_thresh)

# Visualize Row and Column Filtered
output_path = job_context['output_dir'] + "row_col_filtered_" + str(time.time()) + ".png"
visualized_rowcolfilter = visualize.visualize(row_col_filtered_combined_matrix_samples.copy(), output_path)
# # Visualize Row and Column Filtered
# output_path = job_context['output_dir'] + "row_col_filtered_" + str(time.time()) + ".png"
# visualized_rowcolfilter = visualize.visualize(row_col_filtered_combined_matrix_samples.copy(), output_path)

# "Reset" zero values that were set to NA in RNA-seq samples (i.e., make these zero again) in combined_matrix
for column in cached_zeroes.keys():
@@ -204,15 +204,15 @@ def _perform_imputation(job_context: Dict) -> Dict:
# XXX: Refactor QN target acquisition and application before doing this
job_context['organism'] = Organism.get_object_for_name(list(job_context['input_files'].keys())[0])
job_context['merged_no_qn'] = untransposed_imputed_matrix_df
output_path = job_context['output_dir'] + "compendia_no_qn_" + str(time.time()) + ".png"
visualized_merged_no_qn = visualize.visualize(untransposed_imputed_matrix_df.copy(), output_path)
# output_path = job_context['output_dir'] + "compendia_no_qn_" + str(time.time()) + ".png"
# visualized_merged_no_qn = visualize.visualize(untransposed_imputed_matrix_df.copy(), output_path)

# Perform the Quantile Normalization
job_context = smasher._quantile_normalize(job_context, ks_check=False)

# Visualize Final Compendia
output_path = job_context['output_dir'] + "compendia_with_qn_" + str(time.time()) + ".png"
visualized_merged_qn = visualize.visualize(job_context['merged_qn'].copy(), output_path)
# output_path = job_context['output_dir'] + "compendia_with_qn_" + str(time.time()) + ".png"
# visualized_merged_qn = visualize.visualize(job_context['merged_qn'].copy(), output_path)

job_context['time_end'] = timezone.now()
job_context['formatted_command'] = "create_compendia.py"
@@ -280,8 +280,8 @@ def _create_result_objects(job_context: Dict) -> Dict:
# Create the resulting archive
final_zip_base = "/home/user/data_store/smashed/" + str(job_context["dataset"].pk) + "_compendia"
# Copy LICENSE.txt and README.md files
shutil.copy("README_COMPENDIA.md", final_zip_base + "/README.md")
shutil.copy("LICENSE_DATASET.txt", final_zip_base + "/LICENSE.TXT")
shutil.copy("/home/user/README_COMPENDIUM.md", job_context["output_dir"] + "/README.md")
shutil.copy("/home/user/LICENSE_DATASET.txt", job_context["output_dir"] + "/LICENSE.TXT")
archive_path = shutil.make_archive(final_zip_base, 'zip', job_context["output_dir"])

# Save the related metadata file
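A note on the hunks above: the row/column filtering around the now-commented-out visualize calls is unchanged by this hotfix. Below is a minimal, self-contained sketch of that filtering (the toy matrix and its dimensions are invented; only pandas and numpy are assumed, and the thresholds mirror the diff):

import numpy as np
import pandas as pd

# Toy expression matrix: genes as rows, samples as columns, with roughly
# 40% of the values knocked out to simulate missing data.
combined_matrix = pd.DataFrame(
    np.random.rand(100, 10),
    index=["GENE%d" % i for i in range(100)],
    columns=["SAMPLE%d" % j for j in range(10)],
).mask(np.random.rand(100, 10) < 0.4)

# Drop genes (rows) below 70% present values. dropna keeps rows with at
# least `thresh` non-NA values, so the threshold is 0.7 * column count.
row_thresh = combined_matrix.shape[1] * 0.7
row_filtered = combined_matrix.dropna(axis='index', thresh=row_thresh)

# Drop samples (columns) below 50% present values, counted against the
# rows that survived the first pass.
col_thresh = row_filtered.shape[0] * 0.5
row_col_filtered = row_filtered.dropna(axis='columns', thresh=col_thresh)

The other fix in this file is the copy-destination change near the end: README_COMPENDIUM.md and LICENSE_DATASET.txt are now copied into job_context["output_dir"] before shutil.make_archive builds the zip from that directory, so both files end up inside the resulting archive (make_archive packages the contents of the directory passed as its root_dir argument).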
7 changes: 4 additions & 3 deletions workers/data_refinery_workers/processors/requirements.in
@@ -1,4 +1,4 @@
boto3
boto3>=1.9.199
coveralls
django-elasticsearch-dsl
django>=2.1.8
@@ -9,9 +9,10 @@ pandas
psycopg2-binary
python-nomad>=1.0.2
pyyaml>=4.2b1
requests>=2.20.0
requests>=2.22.0
retrying
scikit-learn
# Fancyimpute needs this version.
scikit-learn>=0.21.2
scipy
simplejson
sympy
22 changes: 11 additions & 11 deletions workers/data_refinery_workers/processors/requirements.txt
@@ -2,11 +2,11 @@
# This file is autogenerated by pip-compile
# To update, run:
#
# pip-compile --output-file=requirements.txt requirements.in
# pip-compile requirements.in
#
bokeh==1.0.4 # via datashader
boto3==1.9.16
botocore==1.12.16 # via boto3, s3transfer
boto3==1.9.199
botocore==1.12.199 # via boto3, s3transfer
certifi==2018.8.24 # via requests
chardet==3.0.4 # via requests
click==7.0 # via distributed
@@ -20,18 +20,18 @@ datashader==0.6.9
datashape==0.5.2
decorator==4.4.0 # via networkx
distributed==1.26.0 # via dask
django-elasticsearch-dsl==0.5.1
django-elasticsearch-dsl==6.4.2
django==2.1.8
docopt==0.6.2 # via coveralls
docutils==0.14 # via botocore
elasticsearch-dsl==6.1.0
elasticsearch==6.2.0 # via elasticsearch-dsl
elasticsearch-dsl==6.4.0
elasticsearch==6.4.0 # via elasticsearch-dsl
heapdict==1.0.0 # via zict
holoviews==1.11.3
idna==2.7 # via requests
ipaddress==1.0.22 # via elasticsearch-dsl
jinja2==2.10 # via bokeh
jmespath==0.9.3 # via boto3, botocore
joblib==0.13.2 # via scikit-learn
kiwisolver==1.0.1 # via matplotlib
llvmlite==0.28.0 # via numba
locket==0.2.0 # via partd
@@ -58,11 +58,11 @@ pytz==2018.5 # via django, pandas
pyviz-comms==0.7.2 # via holoviews
pywavelets==1.0.2 # via scikit-image
pyyaml==4.2b4
requests==2.20.0
requests==2.22.0
retrying==1.3.3
s3transfer==0.1.13 # via boto3
s3transfer==0.2.1 # via boto3
scikit-image==0.14.2 # via datashader
scikit-learn==0.20.0
scikit-learn==0.21.3
scipy==1.1.0
selenium==3.141.0
simplejson==3.16.0
@@ -75,7 +75,7 @@ toolz==0.9.0 # via dask, datashader, distributed, partd
tornado==6.0.2 # via bokeh, distributed
unicodecsv==0.14.1
untangle==1.1.1
urllib3==1.22 # via botocore, elasticsearch, requests, selenium
urllib3==1.25.3 # via botocore, elasticsearch, requests, selenium
xarray==0.12.3
zict==0.1.4 # via distributed

2 changes: 2 additions & 0 deletions workers/data_refinery_workers/processors/test_salmon.py
@@ -917,6 +917,7 @@ def run_tximport_at_progress_point(complete_accessions: List[str], incomplete_ac

job_context = salmon.get_tximport_inputs(job_context)
job_context = salmon.tximport(job_context)
job_context = utils.end_job(job_context)

return job_context

@@ -992,6 +993,7 @@ def test_early_tximport(self):
sample = tpm_file.samples.first()
SampleComputedFileAssociation.objects.get(sample=sample, computed_file=tpm_file)
SampleComputedFileAssociation.objects.get(sample=sample, computed_file=rds_file)
self.assertTrue(sample.is_processed)

# Make sure that these samples actually were ignored.
for accession_code in incomplete_accessions:
5 changes: 3 additions & 2 deletions workers/data_refinery_workers/processors/utils.py
@@ -11,7 +11,7 @@
from django.utils import timezone
from typing import List, Dict, Callable

from data_refinery_common.job_lookup import ProcessorEnum
from data_refinery_common.job_lookup import ProcessorEnum, ProcessorPipeline
from data_refinery_common.job_management import create_downloader_job
from data_refinery_common.logging import get_and_configure_logger
from data_refinery_common.models import (
@@ -197,7 +197,8 @@ def start_job(job_context: Dict):
raise Exception("processors.start_job called on job %s that has already been started!" % str(job.id))

original_file = job.original_files.first()
if original_file and not original_file.needs_processing(job_context["job_id"]):
if not job.pipeline_applied == ProcessorPipeline.TXIMPORT.value and original_file\
and not original_file.needs_processing(job_context["job_id"]):
failure_reason = ("Sample has a good computed file, it must have been processed, "
"so it doesn't need to be downloaded! Aborting!")
logger.error(failure_reason,
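For context on the start_job change above: the early-abort check (skip jobs whose original file already has a good computed file) previously applied to every processor job; this hotfix exempts tximport jobs from it, presumably because tximport legitimately runs over files that already have processed output. A rough sketch of the resulting condition (the helper name should_abort_early is hypothetical; the attribute and method names come from the diff):

from data_refinery_common.job_lookup import ProcessorPipeline

def should_abort_early(job, job_context) -> bool:
    # Tximport jobs are never aborted by this check.
    if job.pipeline_applied == ProcessorPipeline.TXIMPORT.value:
        return False
    # Other jobs still abort when the original file already has a good
    # computed file and therefore doesn't need to be processed again.
    original_file = job.original_files.first()
    return bool(original_file) and not original_file.needs_processing(job_context["job_id"])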
2 changes: 1 addition & 1 deletion workers/dockerfiles/Dockerfile.compendia
@@ -1,6 +1,6 @@
FROM miserlou/python3base:latest

# This is very similar to the `smasher` image,
# This is very similar to the `smasher` image,
# but comes from a base image with OpenBLAS, needed for fast impuation,
# and some of the other libraries required for fancyimpute.
