From 55acec49e9f215581f24a71125c503f0da73e1d5 Mon Sep 17 00:00:00 2001
From: Erika Salomon
Date: Wed, 6 Mar 2019 15:53:01 -0600
Subject: [PATCH 1/2] Faster train/test task generation

Generating train/test tasks can take hours. One culprit may be slow IO
of the matrices, which are loaded just to decide whether a job should be
processed. This commit makes the train-tester generate all of the tasks
without checking the matrices, and skips processing a task if its
matrices turn out to be no good.
---
 src/triage/component/catwalk/__init__.py | 53 +++++++++++++-----------
 1 file changed, 29 insertions(+), 24 deletions(-)

diff --git a/src/triage/component/catwalk/__init__.py b/src/triage/component/catwalk/__init__.py
index 5beba2438..6de6b41a8 100644
--- a/src/triage/component/catwalk/__init__.py
+++ b/src/triage/component/catwalk/__init__.py
@@ -31,22 +31,6 @@ def __init__(
     def generate_tasks(self, split, grid_config, model_comment=None):
         logging.info("Generating train/test tasks for split %s", split["train_uuid"])
         train_store = self.matrix_storage_engine.get_store(split["train_uuid"])
-        if train_store.empty:
-            logging.warning(
-                """Train matrix for split %s was empty,
-                no point in training this model. Skipping
-                """,
-                split["train_uuid"],
-            )
-            return []
-        if len(train_store.labels.unique()) == 1:
-            logging.warning(
-                """Train Matrix for split %s had only one
-                unique value, no point in training this model. Skipping
-                """,
-                split["train_uuid"],
-            )
-            return []
         train_tasks = self.model_trainer.generate_train_tasks(
             grid_config=grid_config,
             misc_db_parameters=dict(test=False, model_comment=model_comment),
@@ -59,14 +43,6 @@ def generate_tasks(self, split, grid_config, model_comment=None):
         ):
             test_store = self.matrix_storage_engine.get_store(test_uuid)
 
-            if test_store.empty:
-                logging.warning(
-                    """Test matrix for uuid %s
-                    was empty, no point in generating predictions. Not creating train/test task.
-                    """,
-                    test_uuid,
-                )
-                continue
             for train_task in train_tasks:
                 train_test_tasks.append(
                     {
@@ -83,6 +59,35 @@ def process_all_tasks(self, tasks):
 
     def process_task(self, test_store, train_store, train_kwargs):
         logging.info("Beginning train task %s", train_kwargs)
+
+        # If the train or test design matrix is empty, or if the train store
+        # only has one label value, skip training the model.
+        if train_store.empty:
+            logging.warning(
+                """Train matrix %s was empty,
+                no point in training this model. Skipping
+                """,
+                train_store.uuid,
+            )
+            return
+        if len(train_store.labels.unique()) == 1:
+            logging.warning(
+                """Train matrix %s had only one
+                unique value, no point in training this model. Skipping
+                """,
+                train_store.uuid,
+            )
+            return
+        if test_store.empty:
+            logging.warning(
+                """Test matrix %s
+                was empty, no point in generating predictions. Not processing train/test task.
+                """,
+                test_store.uuid,
+            )
+            return
+
+        # If the matrices and train labels are OK, train and test the model!
         with self.model_trainer.cache_models(), test_store.cache(), train_store.cache():
             # will cache any trained models until it goes out of scope (at the end of the task)
             # this way we avoid loading the model pickle again for predictions
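The change above boils down to one restructuring: build the task list cheaply up front, and defer the expensive matrix loading and validation until a task is actually processed. Below is a minimal, self-contained sketch of that control flow; FakeMatrixStore, generate_tasks, and process_task here are simplified, hypothetical stand-ins rather than catwalk's real MatrixStore and ModelTrainTester API.

    import logging


    class FakeMatrixStore:
        """Stand-in for a matrix store whose contents are expensive to load."""

        def __init__(self, uuid, rows):
            self.uuid = uuid
            self._rows = rows

        @property
        def empty(self):
            # In the real store, answering this can require loading the matrix from storage.
            logging.info("Loading matrix %s to check whether it is empty", self.uuid)
            return len(self._rows) == 0


    def generate_tasks(stores):
        # Cheap: no matrix IO here, just enumerate the work to be done.
        return [{"store": store} for store in stores]


    def process_task(task):
        # Expensive validation happens lazily, only when the task is processed.
        store = task["store"]
        if store.empty:
            logging.warning("Matrix %s is empty, skipping task", store.uuid)
            return
        logging.info("Training and testing against matrix %s", store.uuid)


    if __name__ == "__main__":
        logging.basicConfig(level=logging.INFO)
        for task in generate_tasks([FakeMatrixStore("a", [1, 2]), FakeMatrixStore("b", [])]):
            process_task(task)

Deferring the checks this way matches the motivation in the commit message: task generation itself no longer touches the matrices, so only the processing step pays for loading them.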
From 8e2089fb738fbc7f12f184162aba5da126732c27 Mon Sep 17 00:00:00 2001
From: Tristan Crockett
Date: Thu, 7 Mar 2019 10:36:08 -0600
Subject: [PATCH 2/2] Always cache metadata

---
 src/triage/component/catwalk/storage.py | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/src/triage/component/catwalk/storage.py b/src/triage/component/catwalk/storage.py
index 80e5b4a1d..e70462a30 100644
--- a/src/triage/component/catwalk/storage.py
+++ b/src/triage/component/catwalk/storage.py
@@ -398,11 +398,7 @@ def metadata(self):
         """The raw metadata. Will load from storage into memory if not already loaded"""
         if self.__metadata is not None:
             return self.__metadata
-        metadata = self.load_metadata()
-        if self.should_cache:
-            self.__metadata = metadata
-        else:
-            return metadata
+        self.__metadata = self.load_metadata()
         return self.__metadata
 
     @metadata.setter
@@ -540,7 +536,6 @@ def save(self):
 
     def clear_cache(self):
         self._matrix_label_tuple = None
-        self.metadata = None
 
     def __getstate__(self):
         """Remove object of a large size upon serialization.
         """
         state = self.__dict__.copy()
         state['_matrix_label_tuple'] = None
-        state['__metadata'] = None
         return state
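The second patch settles on unconditional memoization: metadata is loaded once, kept on the instance, and no longer cleared by clear_cache or stripped before pickling; only the large matrix/label state is still dropped. Below is a small generic sketch of that pattern, assuming an illustrative CachedMetadataStore class with a placeholder load_metadata, not catwalk's actual MatrixStore.

    import pickle


    class CachedMetadataStore:
        """Load small metadata once and keep it; drop only the heavy payload when pickling."""

        def __init__(self, path):
            self.path = path
            self._metadata = None
            self._matrix = None  # stands in for the large object that should not be pickled

        def load_metadata(self):
            # Placeholder for a read from disk or object storage.
            return {"path": self.path, "columns": ["feature_1", "feature_2"]}

        @property
        def metadata(self):
            # Memoize on first access; later accesses reuse the cached dict.
            if self._metadata is None:
                self._metadata = self.load_metadata()
            return self._metadata

        def __getstate__(self):
            # Keep the cached metadata but drop the heavy matrix before serialization.
            state = self.__dict__.copy()
            state["_matrix"] = None
            return state


    if __name__ == "__main__":
        store = CachedMetadataStore("matrices/abc123.csv")
        print(store.metadata["columns"])           # first access triggers the load
        print(store.metadata is store.metadata)    # True: the cached object is reused
        clone = pickle.loads(pickle.dumps(store))  # metadata survives pickling, the matrix does not
        print(clone.metadata["path"])

The metadata is presumably small compared to the matrix itself, so always keeping it avoids repeated loads without the memory cost that motivated dropping the large matrix from the pickled state in the first place.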