From 55acec49e9f215581f24a71125c503f0da73e1d5 Mon Sep 17 00:00:00 2001
From: Erika Salomon
Date: Wed, 6 Mar 2019 15:53:01 -0600
Subject: [PATCH 1/2] Faster train/test task generation

Generating train/test tasks can take hours. One culprit may be slow IO
of the matrices, which are loaded just to decide whether a job should be
processed. This commit makes the train-tester generate all of the tasks
without checking the matrices, and skips processing a task if its
matrices turn out to be no good.
---
 src/triage/component/catwalk/__init__.py | 53 +++++++++++++-----------
 1 file changed, 29 insertions(+), 24 deletions(-)

diff --git a/src/triage/component/catwalk/__init__.py b/src/triage/component/catwalk/__init__.py
index 5beba2438..6de6b41a8 100644
--- a/src/triage/component/catwalk/__init__.py
+++ b/src/triage/component/catwalk/__init__.py
@@ -31,22 +31,6 @@ def __init__(
     def generate_tasks(self, split, grid_config, model_comment=None):
         logging.info("Generating train/test tasks for split %s", split["train_uuid"])
         train_store = self.matrix_storage_engine.get_store(split["train_uuid"])
-        if train_store.empty:
-            logging.warning(
-                """Train matrix for split %s was empty,
-                no point in training this model. Skipping
-                """,
-                split["train_uuid"],
-            )
-            return []
-        if len(train_store.labels.unique()) == 1:
-            logging.warning(
-                """Train Matrix for split %s had only one
-                unique value, no point in training this model. Skipping
-                """,
-                split["train_uuid"],
-            )
-            return []
         train_tasks = self.model_trainer.generate_train_tasks(
             grid_config=grid_config,
             misc_db_parameters=dict(test=False, model_comment=model_comment),
@@ -59,14 +43,6 @@ def generate_tasks(self, split, grid_config, model_comment=None):
         ):
             test_store = self.matrix_storage_engine.get_store(test_uuid)
 
-            if test_store.empty:
-                logging.warning(
-                    """Test matrix for uuid %s
-                    was empty, no point in generating predictions. Not creating train/test task.
-                    """,
-                    test_uuid,
-                )
-                continue
             for train_task in train_tasks:
                 train_test_tasks.append(
                     {
@@ -83,6 +59,35 @@ def process_all_tasks(self, tasks):
 
     def process_task(self, test_store, train_store, train_kwargs):
         logging.info("Beginning train task %s", train_kwargs)
+
+        # If the train or test design matrix is empty, or if the train store
+        # only has one label value, skip training the model.
+        if train_store.empty:
+            logging.warning(
+                """Train matrix %s was empty,
+                no point in training this model. Skipping
+                """,
+                train_store.uuid,
+            )
+            return
+        if len(train_store.labels.unique()) == 1:
+            logging.warning(
+                """Train matrix %s had only one
+                unique value, no point in training this model. Skipping
+                """,
+                train_store.uuid,
+            )
+            return
+        if test_store.empty:
+            logging.warning(
+                """Test matrix %s
+                was empty, no point in generating predictions. Not processing train/test task.
+                """,
+                test_store.uuid,
+            )
+            return
+
+        # If the matrices and train labels are OK, train and test the model!
         with self.model_trainer.cache_models(), test_store.cache(), train_store.cache():
             # will cache any trained models until it goes out of scope (at the end of the task)
             # this way we avoid loading the model pickle again for predictions
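The change above boils down to one restructuring: build the task list cheaply up front, and defer the expensive matrix loading and validation until a task is actually processed. Below is a minimal, self-contained sketch of that control flow; FakeMatrixStore, generate_tasks, and process_task here are simplified, hypothetical stand-ins rather than catwalk's real MatrixStore and ModelTrainTester API.

    import logging


    class FakeMatrixStore:
        """Stand-in for a matrix store whose contents are expensive to load."""

        def __init__(self, uuid, rows):
            self.uuid = uuid
            self._rows = rows

        @property
        def empty(self):
            # In the real store, answering this can require loading the matrix from storage.
            logging.info("Loading matrix %s to check whether it is empty", self.uuid)
            return len(self._rows) == 0


    def generate_tasks(stores):
        # Cheap: no matrix IO here, just enumerate the work to be done.
        return [{"store": store} for store in stores]


    def process_task(task):
        # Expensive validation happens lazily, only when the task is processed.
        store = task["store"]
        if store.empty:
            logging.warning("Matrix %s is empty, skipping task", store.uuid)
            return
        logging.info("Training and testing against matrix %s", store.uuid)


    if __name__ == "__main__":
        logging.basicConfig(level=logging.INFO)
        for task in generate_tasks([FakeMatrixStore("a", [1, 2]), FakeMatrixStore("b", [])]):
            process_task(task)

Deferring the checks this way matches the motivation in the commit message: task generation itself no longer touches the matrices, so only the processing step pays for loading them.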
From 8e2089fb738fbc7f12f184162aba5da126732c27 Mon Sep 17 00:00:00 2001
From: Tristan Crockett
Date: Thu, 7 Mar 2019 10:36:08 -0600
Subject: [PATCH 2/2] Always cache metadata

---
 src/triage/component/catwalk/storage.py | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/src/triage/component/catwalk/storage.py b/src/triage/component/catwalk/storage.py
index 80e5b4a1d..e70462a30 100644
--- a/src/triage/component/catwalk/storage.py
+++ b/src/triage/component/catwalk/storage.py
@@ -398,11 +398,7 @@ def metadata(self):
         """The raw metadata. Will load from storage into memory if not already loaded"""
         if self.__metadata is not None:
             return self.__metadata
-        metadata = self.load_metadata()
-        if self.should_cache:
-            self.__metadata = metadata
-        else:
-            return metadata
+        self.__metadata = self.load_metadata()
         return self.__metadata
 
     @metadata.setter
@@ -540,7 +536,6 @@ def save(self):
 
     def clear_cache(self):
         self._matrix_label_tuple = None
-        self.metadata = None
 
     def __getstate__(self):
         """Remove object of a large size upon serialization.
         """
         state = self.__dict__.copy()
         state['_matrix_label_tuple'] = None
-        state['__metadata'] = None
         return state
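The second patch settles on unconditional memoization: metadata is loaded once, kept on the instance, and no longer cleared by clear_cache or stripped before pickling; only the large matrix/label state is still dropped. Below is a small generic sketch of that pattern, assuming an illustrative CachedMetadataStore class with a placeholder load_metadata, not catwalk's actual MatrixStore.

    import pickle


    class CachedMetadataStore:
        """Load small metadata once and keep it; drop only the heavy payload when pickling."""

        def __init__(self, path):
            self.path = path
            self._metadata = None
            self._matrix = None  # stands in for the large object that should not be pickled

        def load_metadata(self):
            # Placeholder for a read from disk or object storage.
            return {"path": self.path, "columns": ["feature_1", "feature_2"]}

        @property
        def metadata(self):
            # Memoize on first access; later accesses reuse the cached dict.
            if self._metadata is None:
                self._metadata = self.load_metadata()
            return self._metadata

        def __getstate__(self):
            # Keep the cached metadata but drop the heavy matrix before serialization.
            state = self.__dict__.copy()
            state["_matrix"] = None
            return state


    if __name__ == "__main__":
        store = CachedMetadataStore("matrices/abc123.csv")
        print(store.metadata["columns"])           # first access triggers the load
        print(store.metadata is store.metadata)    # True: the cached object is reused
        clone = pickle.loads(pickle.dumps(store))  # metadata survives pickling, the matrix does not
        print(clone.metadata["path"])

The metadata is presumably small compared to the matrix itself, so always keeping it avoids repeated loads without the memory cost that motivated dropping the large matrix from the pickled state in the first place.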