This repository has been archived by the owner on Mar 21, 2024. It is now read-only.

Commit

PR comments
ant0nsc committed Jan 14, 2022
1 parent c09cd55 commit 9f6cf7b
Showing 11 changed files with 86 additions and 54 deletions.
6 changes: 1 addition & 5 deletions InnerEye/ML/common.py
@@ -28,10 +28,6 @@
LAST_CHECKPOINT_FILE_NAME = "last"
LAST_CHECKPOINT_FILE_NAME_WITH_SUFFIX = LAST_CHECKPOINT_FILE_NAME + CHECKPOINT_SUFFIX

- # The file names for what is retained as the "best" checkpoint: The checkpoint at the end of training.
- BEST_CHECKPOINT_FILE_NAME = LAST_CHECKPOINT_FILE_NAME
- BEST_CHECKPOINT_FILE_NAME_WITH_SUFFIX = BEST_CHECKPOINT_FILE_NAME + CHECKPOINT_SUFFIX

FINAL_MODEL_FOLDER = "final_model"
FINAL_ENSEMBLE_MODEL_FOLDER = "final_ensemble_model"
CHECKPOINT_FOLDER = "checkpoints"
@@ -92,4 +88,4 @@ def get_best_checkpoint_path(path: Path) -> Path:
Given a path and checkpoint, formats a path based on the checkpoint file name format.
:param path to checkpoint folder
"""
- return path / BEST_CHECKPOINT_FILE_NAME_WITH_SUFFIX
+ return path / LAST_CHECKPOINT_FILE_NAME_WITH_SUFFIX
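
For illustration, a minimal sketch of how the surviving constants and get_best_checkpoint_path fit together after this change. This is an assumption-laden example rather than repository code: CHECKPOINT_SUFFIX is assumed to be ".ckpt", and the folder path is invented.

from pathlib import Path

CHECKPOINT_SUFFIX = ".ckpt"  # assumed value of the existing constant
LAST_CHECKPOINT_FILE_NAME = "last"
LAST_CHECKPOINT_FILE_NAME_WITH_SUFFIX = LAST_CHECKPOINT_FILE_NAME + CHECKPOINT_SUFFIX

def get_best_checkpoint_path(path: Path) -> Path:
    # With the BEST_* aliases removed, the "best" checkpoint is simply the last one written.
    return path / LAST_CHECKPOINT_FILE_NAME_WITH_SUFFIX

print(get_best_checkpoint_path(Path("outputs/checkpoints")))  # prints outputs/checkpoints/last.ckpt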
19 changes: 8 additions & 11 deletions InnerEye/ML/model_training.py
@@ -55,14 +55,6 @@ def write_args_file(config: Any, outputs_folder: Path) -> None:
logging.info(output)


- class AutoSaveCheckpointCallback(ModelCheckpoint):
-     def __init__(self, container: LightningContainer):
-         super().__init__(dirpath=str(container.checkpoint_folder),
-                          filename=AUTOSAVE_CHECKPOINT_FILE_NAME,
-                          every_n_val_epochs=container.autosave_every_n_val_epochs,
-                          save_last=False)


def create_lightning_trainer(container: LightningContainer,
resume_from_checkpoint: Optional[Path] = None,
num_nodes: int = 1) -> \
@@ -116,8 +108,13 @@ def create_lightning_trainer(container: LightningContainer,
# models, this still appears to be the best way of choosing them because validation loss on the relatively small
# training patches is not stable enough. Going by the validation loss somehow works for the Prostate model, but
# not for the HeadAndNeck model.
- last_checkpoint_callback = ModelCheckpoint(dirpath=str(container.checkpoint_folder), save_last=True, save_top_k=0)
- recovery_checkpoint_callback = AutoSaveCheckpointCallback(container)
+ last_checkpoint_callback = ModelCheckpoint(dirpath=str(container.checkpoint_folder),
+                                            save_last=True,
+                                            save_top_k=0)
+ recovery_checkpoint_callback = ModelCheckpoint(dirpath=str(container.checkpoint_folder),
+                                                filename=AUTOSAVE_CHECKPOINT_FILE_NAME,
+                                                every_n_val_epochs=container.autosave_every_n_val_epochs,
+                                                save_last=False)
callbacks: List[Callback] = [
last_checkpoint_callback,
recovery_checkpoint_callback,
@@ -270,7 +267,7 @@ def model_train(checkpoint_path: Optional[Path],
logging.info(f"Terminating training thread with rank {lightning_model.global_rank}.")
sys.exit()

logging.info("Choosing the best checkpoint and removing redundant files.")
logging.info("Removing redundant checkpoint files.")
cleanup_checkpoints(container.checkpoint_folder)
# Lightning modifies a ton of environment variables. If we first run training and then the test suite,
# those environment variables will mislead the training runs in the test suite, and make them crash.
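For context, a hedged sketch of how the two checkpoint callbacks configured above could be handed to a PyTorch Lightning Trainer. This is illustrative only: the folder, autosave interval, and epoch count are invented, and every_n_val_epochs matches the ModelCheckpoint signature of the Lightning versions this code targets (newer releases rename it to every_n_epochs).

from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import ModelCheckpoint

checkpoint_folder = "outputs/checkpoints"  # invented path for the sketch

# Writes "last.ckpt" at the end of each epoch; after training this file doubles as the "best" checkpoint.
last_checkpoint_callback = ModelCheckpoint(dirpath=checkpoint_folder,
                                           save_last=True,
                                           save_top_k=0)
# Writes an "autosave" checkpoint every validation epoch, used only to recover interrupted jobs.
recovery_checkpoint_callback = ModelCheckpoint(dirpath=checkpoint_folder,
                                               filename="autosave",
                                               every_n_val_epochs=1,
                                               save_last=False)

trainer = Trainer(max_epochs=10,
                  callbacks=[last_checkpoint_callback, recovery_checkpoint_callback])
# trainer.fit(model) would then populate checkpoint_folder with last.ckpt and autosave.ckpt.
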
15 changes: 6 additions & 9 deletions InnerEye/ML/utils/checkpoint_handling.py
@@ -20,8 +20,7 @@
fetch_child_runs, tag_values_all_distinct
from InnerEye.Common.common_util import OTHER_RUNS_SUBDIR_NAME
from InnerEye.Common.fixed_paths import DEFAULT_AML_UPLOAD_DIR, MODEL_INFERENCE_JSON_FILE_NAME
- from InnerEye.ML.common import (AUTOSAVE_CHECKPOINT_CANDIDATES,
-                                 BEST_CHECKPOINT_FILE_NAME_WITH_SUFFIX, CHECKPOINT_FOLDER,
+ from InnerEye.ML.common import (AUTOSAVE_CHECKPOINT_CANDIDATES, CHECKPOINT_FOLDER,
LAST_CHECKPOINT_FILE_NAME_WITH_SUFFIX, RECOVERY_CHECKPOINT_FILE_NAME)
from InnerEye.ML.deep_learning_config import OutputParams
from InnerEye.ML.lightning_container import LightningContainer
@@ -273,12 +272,10 @@ def download_folder_from_run_to_temp_folder(folder: str,

def find_recovery_checkpoint_on_disk_or_cloud(path: Path) -> Optional[Path]:
"""
- Looks at all the recovery files, extracts the epoch number for all of them and returns the most recent (latest
- epoch)
- checkpoint path along with the corresponding epoch number. If no recovery checkpoint are found, return None.
+ Looks at all the checkpoint files and returns the path to the one that should be used for recovery.
+ If no checkpoint files are found on disk, the function attempts to download from the current AzureML run.
:param path: The folder to start searching in.
- :return: None if there is no file matching the search pattern, or a Tuple with Path object and integer pointing to
- recovery checkpoint path and recovery epoch.
+ :return: None if there is no suitable recovery checkpoints, or else a full path to the checkpoint file.
"""
recovery_checkpoint = find_recovery_checkpoint(path)
if recovery_checkpoint is None and is_running_in_azure_ml():
@@ -316,7 +313,7 @@ def find_recovery_checkpoint(path: Path) -> Optional[Path]:
logging.warning(f"Found these legacy checkpoint files: {legacy_recovery_checkpoints}")
raise ValueError("The legacy recovery checkpoint setup is no longer supported. As a workaround, you can take "
f"one of the legacy checkpoints and upload as '{AUTOSAVE_CHECKPOINT_CANDIDATES[0]}'")
- candidates = [*AUTOSAVE_CHECKPOINT_CANDIDATES, BEST_CHECKPOINT_FILE_NAME_WITH_SUFFIX]
+ candidates = [*AUTOSAVE_CHECKPOINT_CANDIDATES, LAST_CHECKPOINT_FILE_NAME_WITH_SUFFIX]
for f in candidates:
full_path = path / f
if full_path.is_file():
@@ -369,7 +366,7 @@ def download_best_checkpoints_from_child_runs(config: OutputParams, run: Run) ->
subdir = str(child.tags[tag_to_use] if can_use_split_indices else child.number)
child_dst = config.checkpoint_folder / OTHER_RUNS_SUBDIR_NAME / subdir
download_run_output_file(
- blob_path=Path(CHECKPOINT_FOLDER) / BEST_CHECKPOINT_FILE_NAME_WITH_SUFFIX,
+ blob_path=Path(CHECKPOINT_FOLDER) / LAST_CHECKPOINT_FILE_NAME_WITH_SUFFIX,
destination=child_dst,
run=child
)
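As a self-contained sketch of the search order that find_recovery_checkpoint now follows, using invented stand-ins for AUTOSAVE_CHECKPOINT_CANDIDATES and LAST_CHECKPOINT_FILE_NAME_WITH_SUFFIX (the real candidate names live in InnerEye/ML/common.py):

from pathlib import Path
from typing import Optional

# Invented stand-in list: autosave checkpoints are tried first, the last checkpoint is the fallback.
CANDIDATES = ["autosave.ckpt", "autosave_v1.ckpt", "last.ckpt"]

def find_recovery_checkpoint(path: Path) -> Optional[Path]:
    """Returns the first existing candidate checkpoint in the given folder, or None if there is none."""
    for name in CANDIDATES:
        full_path = path / name
        if full_path.is_file():
            return full_path
    return None

# Example: find_recovery_checkpoint(Path("outputs/checkpoints")) returns the autosave checkpoint if present, otherwise last.ckpt or None.
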
6 changes: 3 additions & 3 deletions Tests/AfterTraining/test_after_training.py
@@ -36,7 +36,7 @@
from InnerEye.Common.fixed_paths_for_tests import full_ml_test_data_path
from InnerEye.Common.output_directories import OutputFolderForTests
from InnerEye.Common.spawn_subprocess import spawn_and_monitor_subprocess
- from InnerEye.ML.common import (BEST_CHECKPOINT_FILE_NAME_WITH_SUFFIX, CHECKPOINT_FOLDER, DATASET_CSV_FILE_NAME,
+ from InnerEye.ML.common import (LAST_CHECKPOINT_FILE_NAME_WITH_SUFFIX, CHECKPOINT_FOLDER, DATASET_CSV_FILE_NAME,
ModelExecutionMode)
from InnerEye.ML.configs.other.HelloContainer import HelloContainer
from InnerEye.ML.configs.segmentation.BasicModel2Epochs import BasicModel2Epochs
@@ -233,7 +233,7 @@ def test_download_checkpoints_from_aml(test_output_dirs: OutputFolderForTests) -
workspace=get_default_workspace())
files = list(temp_folder.glob("*"))
assert len(files) == 1
- assert (temp_folder / BEST_CHECKPOINT_FILE_NAME_WITH_SUFFIX).is_file()
+ assert (temp_folder / LAST_CHECKPOINT_FILE_NAME_WITH_SUFFIX).is_file()
# Test if what's in the folder are really files, not directories
for file in files:
assert file.is_file()
@@ -247,7 +247,7 @@ def test_download_checkpoints_from_aml(test_output_dirs: OutputFolderForTests) -
result = find_recovery_checkpoint_on_disk_or_cloud(test_output_dirs.root_dir)
download.assert_called_once_with(folder=checkpoint_folder)
assert result is not None
- assert result.name == BEST_CHECKPOINT_FILE_NAME_WITH_SUFFIX
+ assert result.name == LAST_CHECKPOINT_FILE_NAME_WITH_SUFFIX


@pytest.mark.inference
4 changes: 2 additions & 2 deletions Tests/ML/runners/test_runner.py
@@ -21,7 +21,7 @@
from InnerEye.Common.fixed_paths_for_tests import full_ml_test_data_path
from InnerEye.Common.output_directories import OutputFolderForTests
from InnerEye.Common.type_annotations import TupleInt3
- from InnerEye.ML.common import BEST_CHECKPOINT_FILE_NAME_WITH_SUFFIX, DATASET_CSV_FILE_NAME, ModelExecutionMode
+ from InnerEye.ML.common import LAST_CHECKPOINT_FILE_NAME_WITH_SUFFIX, DATASET_CSV_FILE_NAME, ModelExecutionMode
from InnerEye.ML.configs.unit_testing.passthrough_model import PassThroughModel
from InnerEye.ML.deep_learning_config import DeepLearningConfig
from InnerEye.ML.metrics import InferenceMetricsForSegmentation
@@ -282,7 +282,7 @@ def run_model_inference_train_and_test(test_output_dirs: OutputFolderForTests,
train_and_test_data_small_dir,
"data")

- checkpoint_path = config.checkpoint_folder / BEST_CHECKPOINT_FILE_NAME_WITH_SUFFIX
+ checkpoint_path = config.checkpoint_folder / LAST_CHECKPOINT_FILE_NAME_WITH_SUFFIX
create_model_and_store_checkpoint(config, checkpoint_path)
checkpoint_handler = get_default_checkpoint_handler(model_config=config,
project_root=test_output_dirs.root_dir)
4 changes: 2 additions & 2 deletions Tests/ML/test_model_testing.py
@@ -14,7 +14,7 @@
from InnerEye.Common.metrics_constants import MetricsFileColumns
from InnerEye.Common.output_directories import OutputFolderForTests
from InnerEye.ML import model_testing
- from InnerEye.ML.common import BEST_CHECKPOINT_FILE_NAME_WITH_SUFFIX, DATASET_CSV_FILE_NAME, ModelExecutionMode
+ from InnerEye.ML.common import LAST_CHECKPOINT_FILE_NAME_WITH_SUFFIX, DATASET_CSV_FILE_NAME, ModelExecutionMode
from InnerEye.ML.config import DATASET_ID_FILE, GROUND_TRUTH_IDS_FILE, ModelArchitectureConfig
from InnerEye.ML.dataset.full_image_dataset import FullImageDataset
from InnerEye.ML.model_config_base import ModelConfigBase
@@ -98,7 +98,7 @@ def test_model_test(
execution_mode = ModelExecutionMode.TEST
checkpoint_handler = get_default_checkpoint_handler(model_config=config, project_root=test_output_dirs.root_dir)
# Mimic the behaviour that checkpoints are downloaded from blob storage into the checkpoints folder.
- create_model_and_store_checkpoint(config, config.checkpoint_folder / BEST_CHECKPOINT_FILE_NAME_WITH_SUFFIX)
+ create_model_and_store_checkpoint(config, config.checkpoint_folder / LAST_CHECKPOINT_FILE_NAME_WITH_SUFFIX)
checkpoint_handler.additional_training_done()
inference_results = model_testing.segmentation_model_test(config,
execution_mode=execution_mode,
6 changes: 3 additions & 3 deletions Tests/ML/test_model_train_test_and_recovery.py
@@ -10,7 +10,7 @@

from InnerEye.Common.metrics_constants import MetricType
from InnerEye.Common.output_directories import OutputFolderForTests
- from InnerEye.ML.common import BEST_CHECKPOINT_FILE_NAME_WITH_SUFFIX, ModelExecutionMode
+ from InnerEye.ML.common import LAST_CHECKPOINT_FILE_NAME_WITH_SUFFIX, ModelExecutionMode
from InnerEye.ML.configs.classification.DummyClassification import DummyClassification
from InnerEye.ML.metrics import InferenceMetricsForClassification
from InnerEye.ML.model_testing import model_test
@@ -68,7 +68,7 @@ def test_recover_testing_from_run_recovery(mean_teacher_model: bool,
os.makedirs(str(config_local_weights.outputs_folder))

local_weights_path = test_output_dirs.root_dir / "local_weights_file.pth"
- shutil.copyfile(str(config.checkpoint_folder / BEST_CHECKPOINT_FILE_NAME_WITH_SUFFIX),
+ shutil.copyfile(str(config.checkpoint_folder / LAST_CHECKPOINT_FILE_NAME_WITH_SUFFIX),
local_weights_path)
config_local_weights.local_weights_path = [local_weights_path]

@@ -96,7 +96,7 @@ def test_autosave_checkpoints(test_output_dirs: OutputFolderForTests, num_epochs
config.num_epochs = num_epochs
model_train_unittest(config, dirs=test_output_dirs)
assert len(list(config.checkpoint_folder.glob("*.*"))) == 1
- assert (config.checkpoint_folder / BEST_CHECKPOINT_FILE_NAME_WITH_SUFFIX).is_file()
+ assert (config.checkpoint_folder / LAST_CHECKPOINT_FILE_NAME_WITH_SUFFIX).is_file()


def test_recovery_e2e(test_output_dirs: OutputFolderForTests) -> None:
6 changes: 3 additions & 3 deletions Tests/ML/test_model_training.py
@@ -19,7 +19,7 @@
from InnerEye.Common.fixed_paths_for_tests import full_ml_test_data_path
from InnerEye.Common.metrics_constants import MetricType, TrackedMetrics, VALIDATION_PREFIX
from InnerEye.Common.output_directories import OutputFolderForTests
- from InnerEye.ML.common import (BEST_CHECKPOINT_FILE_NAME_WITH_SUFFIX,
+ from InnerEye.ML.common import (LAST_CHECKPOINT_FILE_NAME_WITH_SUFFIX,
DATASET_CSV_FILE_NAME, ModelExecutionMode, STORED_CSV_FILE_NAMES)
from InnerEye.ML.config import MixtureLossComponent, SegmentationLoss
from InnerEye.ML.configs.classification.DummyClassification import DummyClassification
@@ -192,7 +192,7 @@ def assert_all_close(metric: str, expected: List[float], **kwargs: Any) -> None:
assert train_config.checkpoint_folder.is_dir()
actual_checkpoints = list(train_config.checkpoint_folder.rglob("*.ckpt"))
assert len(actual_checkpoints) == 1, f"Actual checkpoints: {actual_checkpoints}"
- assert (train_config.checkpoint_folder / BEST_CHECKPOINT_FILE_NAME_WITH_SUFFIX).is_file()
+ assert (train_config.checkpoint_folder / LAST_CHECKPOINT_FILE_NAME_WITH_SUFFIX).is_file()
assert (train_config.outputs_folder / DATASET_CSV_FILE_NAME).is_file()
assert (train_config.outputs_folder / STORED_CSV_FILE_NAMES[ModelExecutionMode.TRAIN]).is_file()
assert (train_config.outputs_folder / STORED_CSV_FILE_NAMES[ModelExecutionMode.VAL]).is_file()
@@ -330,7 +330,7 @@ def test_recover_training_mean_teacher_model(test_output_dirs: OutputFolderForTe
config.num_epochs = 4
model_train_unittest(config, dirs=test_output_dirs)
assert len(list(config.checkpoint_folder.glob("*.*"))) == 1
- assert (config.checkpoint_folder / BEST_CHECKPOINT_FILE_NAME_WITH_SUFFIX).is_file()
+ assert (config.checkpoint_folder / LAST_CHECKPOINT_FILE_NAME_WITH_SUFFIX).is_file()

# Restart training from previous run
config.num_epochs = 3
18 changes: 9 additions & 9 deletions Tests/ML/utils/test_checkpoint_handling.py
@@ -14,7 +14,7 @@
from InnerEye.Common.fixed_paths import MODEL_INFERENCE_JSON_FILE_NAME
from InnerEye.ML.utils.checkpoint_handling import MODEL_WEIGHTS_DIR_NAME, get_recovery_checkpoint_path
from InnerEye.Common.output_directories import OutputFolderForTests
- from InnerEye.ML.common import BEST_CHECKPOINT_FILE_NAME_WITH_SUFFIX, FINAL_ENSEMBLE_MODEL_FOLDER, FINAL_MODEL_FOLDER
+ from InnerEye.ML.common import LAST_CHECKPOINT_FILE_NAME_WITH_SUFFIX, FINAL_ENSEMBLE_MODEL_FOLDER, FINAL_MODEL_FOLDER
from InnerEye.ML.model_config_base import ModelConfigBase
from InnerEye.ML.model_inference_config import read_model_inference_config
from InnerEye.ML.utils.checkpoint_handling import CheckpointHandler
@@ -86,7 +86,7 @@ def test_download_recovery_checkpoints_from_single_run(test_output_dirs: OutputF

expected_checkpoint_root = config.checkpoint_folder
expected_paths = [get_recovery_checkpoint_path(path=expected_checkpoint_root),
- expected_checkpoint_root / BEST_CHECKPOINT_FILE_NAME_WITH_SUFFIX]
+ expected_checkpoint_root / LAST_CHECKPOINT_FILE_NAME_WITH_SUFFIX]
assert checkpoint_handler.run_recovery.checkpoints_roots == [expected_checkpoint_root]
for path in expected_paths:
assert path.is_file()
@@ -199,7 +199,7 @@ def test_get_best_checkpoint_single_run(test_output_dirs: OutputFolderForTests)
# in the run, into a subfolder of the checkpoint folder
checkpoint_handler.azure_config.run_recovery_id = run_recovery_id
checkpoint_handler.download_recovery_checkpoints_or_weights()
- expected_checkpoint = config.checkpoint_folder / f"{BEST_CHECKPOINT_FILE_NAME_WITH_SUFFIX}"
+ expected_checkpoint = config.checkpoint_folder / LAST_CHECKPOINT_FILE_NAME_WITH_SUFFIX
checkpoint_paths = checkpoint_handler.get_best_checkpoints()
assert checkpoint_paths
assert len(checkpoint_paths) == 1
@@ -215,13 +215,13 @@ def test_get_best_checkpoint_single_run(test_output_dirs: OutputFolderForTests)

# There is no checkpoint in the current run - use the one from run_recovery
checkpoint_paths = checkpoint_handler.get_best_checkpoints()
- expected_checkpoint = config.checkpoint_folder / BEST_CHECKPOINT_FILE_NAME_WITH_SUFFIX
+ expected_checkpoint = config.checkpoint_folder / LAST_CHECKPOINT_FILE_NAME_WITH_SUFFIX
assert checkpoint_paths
assert len(checkpoint_paths) == 1
assert checkpoint_paths[0] == expected_checkpoint

# Copy over checkpoints to make it look like training has happened and a better checkpoint written
- expected_checkpoint = config.checkpoint_folder / BEST_CHECKPOINT_FILE_NAME_WITH_SUFFIX
+ expected_checkpoint = config.checkpoint_folder / LAST_CHECKPOINT_FILE_NAME_WITH_SUFFIX
expected_checkpoint.touch()
checkpoint_paths = checkpoint_handler.get_best_checkpoints()
assert checkpoint_paths
@@ -238,7 +238,7 @@ def test_download_checkpoints_from_hyperdrive_child_runs(test_output_dirs: Outpu
hyperdrive_run = get_most_recent_run(fallback_run_id_for_local_execution=FALLBACK_ENSEMBLE_RUN)
checkpoint_handler.download_checkpoints_from_hyperdrive_child_runs(hyperdrive_run)
expected_checkpoints = [config.checkpoint_folder / OTHER_RUNS_SUBDIR_NAME / str(i)
- / BEST_CHECKPOINT_FILE_NAME_WITH_SUFFIX for i in range(2)]
+ / LAST_CHECKPOINT_FILE_NAME_WITH_SUFFIX for i in range(2)]
checkpoint_paths = checkpoint_handler.get_best_checkpoints()
assert checkpoint_paths
assert len(checkpoint_paths) == 2
@@ -266,7 +266,7 @@ def test_get_checkpoints_to_test(test_output_dirs: OutputFolderForTests) -> None
checkpoint_handler.container.checkpoint_folder.mkdir(parents=True)

# Copy checkpoint to make it seem like training has happened
- expected_checkpoint = config.checkpoint_folder / BEST_CHECKPOINT_FILE_NAME_WITH_SUFFIX
+ expected_checkpoint = config.checkpoint_folder / LAST_CHECKPOINT_FILE_NAME_WITH_SUFFIX
expected_checkpoint.touch()
checkpoint_and_paths = checkpoint_handler.get_checkpoints_to_test()

@@ -295,10 +295,10 @@ def test_get_checkpoints_to_test_single_run(test_output_dirs: OutputFolderForTes

assert checkpoint_and_paths
assert len(checkpoint_and_paths) == 1
- assert checkpoint_and_paths[0] == config.checkpoint_folder / BEST_CHECKPOINT_FILE_NAME_WITH_SUFFIX
+ assert checkpoint_and_paths[0] == config.checkpoint_folder / LAST_CHECKPOINT_FILE_NAME_WITH_SUFFIX

# Copy checkpoint to make it seem like training has happened
- expected_checkpoint = config.checkpoint_folder / BEST_CHECKPOINT_FILE_NAME_WITH_SUFFIX
+ expected_checkpoint = config.checkpoint_folder / LAST_CHECKPOINT_FILE_NAME_WITH_SUFFIX
expected_checkpoint.touch()
checkpoint_and_paths = checkpoint_handler.get_checkpoints_to_test()

(The remaining 2 changed files are not shown here.)
