This repository has been archived by the owner on Mar 21, 2024. It is now read-only.

Commit

PR comments
ant0nsc committed Jan 14, 2022
1 parent c09cd55 commit 9f6cf7b
Showing 11 changed files with 86 additions and 54 deletions.
6 changes: 1 addition & 5 deletions InnerEye/ML/common.py
@@ -28,10 +28,6 @@
LAST_CHECKPOINT_FILE_NAME = "last"
LAST_CHECKPOINT_FILE_NAME_WITH_SUFFIX = LAST_CHECKPOINT_FILE_NAME + CHECKPOINT_SUFFIX

- # The file names for what is retained as the "best" checkpoint: The checkpoint at the end of training.
- BEST_CHECKPOINT_FILE_NAME = LAST_CHECKPOINT_FILE_NAME
- BEST_CHECKPOINT_FILE_NAME_WITH_SUFFIX = BEST_CHECKPOINT_FILE_NAME + CHECKPOINT_SUFFIX

FINAL_MODEL_FOLDER = "final_model"
FINAL_ENSEMBLE_MODEL_FOLDER = "final_ensemble_model"
CHECKPOINT_FOLDER = "checkpoints"
@@ -92,4 +88,4 @@ def get_best_checkpoint_path(path: Path) -> Path:
Given a path and checkpoint, formats a path based on the checkpoint file name format.
:param path to checkpoint folder
"""
- return path / BEST_CHECKPOINT_FILE_NAME_WITH_SUFFIX
+ return path / LAST_CHECKPOINT_FILE_NAME_WITH_SUFFIX
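
For illustration, a minimal sketch of how the surviving constants and get_best_checkpoint_path fit together after this change. This is an assumption-laden example rather than repository code: CHECKPOINT_SUFFIX is assumed to be ".ckpt", and the folder path is invented.

from pathlib import Path

CHECKPOINT_SUFFIX = ".ckpt"  # assumed value of the existing constant
LAST_CHECKPOINT_FILE_NAME = "last"
LAST_CHECKPOINT_FILE_NAME_WITH_SUFFIX = LAST_CHECKPOINT_FILE_NAME + CHECKPOINT_SUFFIX

def get_best_checkpoint_path(path: Path) -> Path:
    # With the BEST_* aliases removed, the "best" checkpoint is simply the last one written.
    return path / LAST_CHECKPOINT_FILE_NAME_WITH_SUFFIX

print(get_best_checkpoint_path(Path("outputs/checkpoints")))  # prints outputs/checkpoints/last.ckpt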
19 changes: 8 additions & 11 deletions InnerEye/ML/model_training.py
@@ -55,14 +55,6 @@ def write_args_file(config: Any, outputs_folder: Path) -> None:
logging.info(output)


- class AutoSaveCheckpointCallback(ModelCheckpoint):
-     def __init__(self, container: LightningContainer):
-         super().__init__(dirpath=str(container.checkpoint_folder),
-                          filename=AUTOSAVE_CHECKPOINT_FILE_NAME,
-                          every_n_val_epochs=container.autosave_every_n_val_epochs,
-                          save_last=False)


def create_lightning_trainer(container: LightningContainer,
resume_from_checkpoint: Optional[Path] = None,
num_nodes: int = 1) -> \
@@ -116,8 +108,13 @@ def create_lightning_trainer(container: LightningContainer,
# models, this still appears to be the best way of choosing them because validation loss on the relatively small
# training patches is not stable enough. Going by the validation loss somehow works for the Prostate model, but
# not for the HeadAndNeck model.
- last_checkpoint_callback = ModelCheckpoint(dirpath=str(container.checkpoint_folder), save_last=True, save_top_k=0)
- recovery_checkpoint_callback = AutoSaveCheckpointCallback(container)
+ last_checkpoint_callback = ModelCheckpoint(dirpath=str(container.checkpoint_folder),
+                                            save_last=True,
+                                            save_top_k=0)
+ recovery_checkpoint_callback = ModelCheckpoint(dirpath=str(container.checkpoint_folder),
+                                                filename=AUTOSAVE_CHECKPOINT_FILE_NAME,
+                                                every_n_val_epochs=container.autosave_every_n_val_epochs,
+                                                save_last=False)
callbacks: List[Callback] = [
last_checkpoint_callback,
recovery_checkpoint_callback,
@@ -270,7 +267,7 @@ def model_train(checkpoint_path: Optional[Path],
logging.info(f"Terminating training thread with rank {lightning_model.global_rank}.")
sys.exit()

logging.info("Choosing the best checkpoint and removing redundant files.")
logging.info("Removing redundant checkpoint files.")
cleanup_checkpoints(container.checkpoint_folder)
# Lightning modifies a ton of environment variables. If we first run training and then the test suite,
# those environment variables will mislead the training runs in the test suite, and make them crash.
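For context, a hedged sketch of how the two checkpoint callbacks configured above could be handed to a PyTorch Lightning Trainer. This is illustrative only: the folder, autosave interval, and epoch count are invented, and every_n_val_epochs matches the ModelCheckpoint signature of the Lightning versions this code targets (newer releases rename it to every_n_epochs).

from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import ModelCheckpoint

checkpoint_folder = "outputs/checkpoints"  # invented path for the sketch

# Writes "last.ckpt" at the end of each epoch; after training this file doubles as the "best" checkpoint.
last_checkpoint_callback = ModelCheckpoint(dirpath=checkpoint_folder,
                                           save_last=True,
                                           save_top_k=0)
# Writes an "autosave" checkpoint every validation epoch, used only to recover interrupted jobs.
recovery_checkpoint_callback = ModelCheckpoint(dirpath=checkpoint_folder,
                                               filename="autosave",
                                               every_n_val_epochs=1,
                                               save_last=False)

trainer = Trainer(max_epochs=10,
                  callbacks=[last_checkpoint_callback, recovery_checkpoint_callback])
# trainer.fit(model) would then populate checkpoint_folder with last.ckpt and autosave.ckpt.
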
15 changes: 6 additions & 9 deletions InnerEye/ML/utils/checkpoint_handling.py
@@ -20,8 +20,7 @@
fetch_child_runs, tag_values_all_distinct
from InnerEye.Common.common_util import OTHER_RUNS_SUBDIR_NAME
from InnerEye.Common.fixed_paths import DEFAULT_AML_UPLOAD_DIR, MODEL_INFERENCE_JSON_FILE_NAME
- from InnerEye.ML.common import (AUTOSAVE_CHECKPOINT_CANDIDATES,
-                                 BEST_CHECKPOINT_FILE_NAME_WITH_SUFFIX, CHECKPOINT_FOLDER,
+ from InnerEye.ML.common import (AUTOSAVE_CHECKPOINT_CANDIDATES, CHECKPOINT_FOLDER,
LAST_CHECKPOINT_FILE_NAME_WITH_SUFFIX, RECOVERY_CHECKPOINT_FILE_NAME)
from InnerEye.ML.deep_learning_config import OutputParams
from InnerEye.ML.lightning_container import LightningContainer
@@ -273,12 +272,10 @@ def download_folder_from_run_to_temp_folder(folder: str,

def find_recovery_checkpoint_on_disk_or_cloud(path: Path) -> Optional[Path]:
"""
- Looks at all the recovery files, extracts the epoch number for all of them and returns the most recent (latest
- epoch)
- checkpoint path along with the corresponding epoch number. If no recovery checkpoint are found, return None.
+ Looks at all the checkpoint files and returns the path to the one that should be used for recovery.
+ If no checkpoint files are found on disk, the function attempts to download from the current AzureML run.
:param path: The folder to start searching in.
- :return: None if there is no file matching the search pattern, or a Tuple with Path object and integer pointing to
- recovery checkpoint path and recovery epoch.
+ :return: None if there is no suitable recovery checkpoints, or else a full path to the checkpoint file.
"""
recovery_checkpoint = find_recovery_checkpoint(path)
if recovery_checkpoint is None and is_running_in_azure_ml():
@@ -316,7 +313,7 @@ def find_recovery_checkpoint(path: Path) -> Optional[Path]:
logging.warning(f"Found these legacy checkpoint files: {legacy_recovery_checkpoints}")
raise ValueError("The legacy recovery checkpoint setup is no longer supported. As a workaround, you can take "
f"one of the legacy checkpoints and upload as '{AUTOSAVE_CHECKPOINT_CANDIDATES[0]}'")
- candidates = [*AUTOSAVE_CHECKPOINT_CANDIDATES, BEST_CHECKPOINT_FILE_NAME_WITH_SUFFIX]
+ candidates = [*AUTOSAVE_CHECKPOINT_CANDIDATES, LAST_CHECKPOINT_FILE_NAME_WITH_SUFFIX]
for f in candidates:
full_path = path / f
if full_path.is_file():
@@ -369,7 +366,7 @@ def download_best_checkpoints_from_child_runs(config: OutputParams, run: Run) ->
subdir = str(child.tags[tag_to_use] if can_use_split_indices else child.number)
child_dst = config.checkpoint_folder / OTHER_RUNS_SUBDIR_NAME / subdir
download_run_output_file(
- blob_path=Path(CHECKPOINT_FOLDER) / BEST_CHECKPOINT_FILE_NAME_WITH_SUFFIX,
+ blob_path=Path(CHECKPOINT_FOLDER) / LAST_CHECKPOINT_FILE_NAME_WITH_SUFFIX,
destination=child_dst,
run=child
)
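As a self-contained sketch of the search order that find_recovery_checkpoint now follows, using invented stand-ins for AUTOSAVE_CHECKPOINT_CANDIDATES and LAST_CHECKPOINT_FILE_NAME_WITH_SUFFIX (the real candidate names live in InnerEye/ML/common.py):

from pathlib import Path
from typing import Optional

# Invented stand-in list: autosave checkpoints are tried first, the last checkpoint is the fallback.
CANDIDATES = ["autosave.ckpt", "autosave_v1.ckpt", "last.ckpt"]

def find_recovery_checkpoint(path: Path) -> Optional[Path]:
    """Returns the first existing candidate checkpoint in the given folder, or None if there is none."""
    for name in CANDIDATES:
        full_path = path / name
        if full_path.is_file():
            return full_path
    return None

# Example: find_recovery_checkpoint(Path("outputs/checkpoints")) returns the autosave checkpoint if present, otherwise last.ckpt or None.
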
6 changes: 3 additions & 3 deletions Tests/AfterTraining/test_after_training.py
@@ -36,7 +36,7 @@
from InnerEye.Common.fixed_paths_for_tests import full_ml_test_data_path
from InnerEye.Common.output_directories import OutputFolderForTests
from InnerEye.Common.spawn_subprocess import spawn_and_monitor_subprocess
- from InnerEye.ML.common import (BEST_CHECKPOINT_FILE_NAME_WITH_SUFFIX, CHECKPOINT_FOLDER, DATASET_CSV_FILE_NAME,
+ from InnerEye.ML.common import (LAST_CHECKPOINT_FILE_NAME_WITH_SUFFIX, CHECKPOINT_FOLDER, DATASET_CSV_FILE_NAME,
ModelExecutionMode)
from InnerEye.ML.configs.other.HelloContainer import HelloContainer
from InnerEye.ML.configs.segmentation.BasicModel2Epochs import BasicModel2Epochs
@@ -233,7 +233,7 @@ def test_download_checkpoints_from_aml(test_output_dirs: OutputFolderForTests) -
workspace=get_default_workspace())
files = list(temp_folder.glob("*"))
assert len(files) == 1
- assert (temp_folder / BEST_CHECKPOINT_FILE_NAME_WITH_SUFFIX).is_file()
+ assert (temp_folder / LAST_CHECKPOINT_FILE_NAME_WITH_SUFFIX).is_file()
# Test if what's in the folder are really files, not directories
for file in files:
assert file.is_file()
@@ -247,7 +247,7 @@ def test_download_checkpoints_from_aml(test_output_dirs: OutputFolderForTests) -
result = find_recovery_checkpoint_on_disk_or_cloud(test_output_dirs.root_dir)
download.assert_called_once_with(folder=checkpoint_folder)
assert result is not None
- assert result.name == BEST_CHECKPOINT_FILE_NAME_WITH_SUFFIX
+ assert result.name == LAST_CHECKPOINT_FILE_NAME_WITH_SUFFIX


@pytest.mark.inference
4 changes: 2 additions & 2 deletions Tests/ML/runners/test_runner.py
@@ -21,7 +21,7 @@
from InnerEye.Common.fixed_paths_for_tests import full_ml_test_data_path
from InnerEye.Common.output_directories import OutputFolderForTests
from InnerEye.Common.type_annotations import TupleInt3
- from InnerEye.ML.common import BEST_CHECKPOINT_FILE_NAME_WITH_SUFFIX, DATASET_CSV_FILE_NAME, ModelExecutionMode
+ from InnerEye.ML.common import LAST_CHECKPOINT_FILE_NAME_WITH_SUFFIX, DATASET_CSV_FILE_NAME, ModelExecutionMode
from InnerEye.ML.configs.unit_testing.passthrough_model import PassThroughModel
from InnerEye.ML.deep_learning_config import DeepLearningConfig
from InnerEye.ML.metrics import InferenceMetricsForSegmentation
@@ -282,7 +282,7 @@ def run_model_inference_train_and_test(test_output_dirs: OutputFolderForTests,
train_and_test_data_small_dir,
"data")

- checkpoint_path = config.checkpoint_folder / BEST_CHECKPOINT_FILE_NAME_WITH_SUFFIX
+ checkpoint_path = config.checkpoint_folder / LAST_CHECKPOINT_FILE_NAME_WITH_SUFFIX
create_model_and_store_checkpoint(config, checkpoint_path)
checkpoint_handler = get_default_checkpoint_handler(model_config=config,
project_root=test_output_dirs.root_dir)
4 changes: 2 additions & 2 deletions Tests/ML/test_model_testing.py
@@ -14,7 +14,7 @@
from InnerEye.Common.metrics_constants import MetricsFileColumns
from InnerEye.Common.output_directories import OutputFolderForTests
from InnerEye.ML import model_testing
- from InnerEye.ML.common import BEST_CHECKPOINT_FILE_NAME_WITH_SUFFIX, DATASET_CSV_FILE_NAME, ModelExecutionMode
+ from InnerEye.ML.common import LAST_CHECKPOINT_FILE_NAME_WITH_SUFFIX, DATASET_CSV_FILE_NAME, ModelExecutionMode
from InnerEye.ML.config import DATASET_ID_FILE, GROUND_TRUTH_IDS_FILE, ModelArchitectureConfig
from InnerEye.ML.dataset.full_image_dataset import FullImageDataset
from InnerEye.ML.model_config_base import ModelConfigBase
@@ -98,7 +98,7 @@ def test_model_test(
execution_mode = ModelExecutionMode.TEST
checkpoint_handler = get_default_checkpoint_handler(model_config=config, project_root=test_output_dirs.root_dir)
# Mimic the behaviour that checkpoints are downloaded from blob storage into the checkpoints folder.
- create_model_and_store_checkpoint(config, config.checkpoint_folder / BEST_CHECKPOINT_FILE_NAME_WITH_SUFFIX)
+ create_model_and_store_checkpoint(config, config.checkpoint_folder / LAST_CHECKPOINT_FILE_NAME_WITH_SUFFIX)
checkpoint_handler.additional_training_done()
inference_results = model_testing.segmentation_model_test(config,
execution_mode=execution_mode,
6 changes: 3 additions & 3 deletions Tests/ML/test_model_train_test_and_recovery.py
@@ -10,7 +10,7 @@

from InnerEye.Common.metrics_constants import MetricType
from InnerEye.Common.output_directories import OutputFolderForTests
- from InnerEye.ML.common import BEST_CHECKPOINT_FILE_NAME_WITH_SUFFIX, ModelExecutionMode
+ from InnerEye.ML.common import LAST_CHECKPOINT_FILE_NAME_WITH_SUFFIX, ModelExecutionMode
from InnerEye.ML.configs.classification.DummyClassification import DummyClassification
from InnerEye.ML.metrics import InferenceMetricsForClassification
from InnerEye.ML.model_testing import model_test
@@ -68,7 +68,7 @@ def test_recover_testing_from_run_recovery(mean_teacher_model: bool,
os.makedirs(str(config_local_weights.outputs_folder))

local_weights_path = test_output_dirs.root_dir / "local_weights_file.pth"
- shutil.copyfile(str(config.checkpoint_folder / BEST_CHECKPOINT_FILE_NAME_WITH_SUFFIX),
+ shutil.copyfile(str(config.checkpoint_folder / LAST_CHECKPOINT_FILE_NAME_WITH_SUFFIX),
local_weights_path)
config_local_weights.local_weights_path = [local_weights_path]

@@ -96,7 +96,7 @@ def test_autosave_checkpoints(test_output_dirs: OutputFolderForTests, num_epochs
config.num_epochs = num_epochs
model_train_unittest(config, dirs=test_output_dirs)
assert len(list(config.checkpoint_folder.glob("*.*"))) == 1
- assert (config.checkpoint_folder / BEST_CHECKPOINT_FILE_NAME_WITH_SUFFIX).is_file()
+ assert (config.checkpoint_folder / LAST_CHECKPOINT_FILE_NAME_WITH_SUFFIX).is_file()


def test_recovery_e2e(test_output_dirs: OutputFolderForTests) -> None:
6 changes: 3 additions & 3 deletions Tests/ML/test_model_training.py
@@ -19,7 +19,7 @@
from InnerEye.Common.fixed_paths_for_tests import full_ml_test_data_path
from InnerEye.Common.metrics_constants import MetricType, TrackedMetrics, VALIDATION_PREFIX
from InnerEye.Common.output_directories import OutputFolderForTests
- from InnerEye.ML.common import (BEST_CHECKPOINT_FILE_NAME_WITH_SUFFIX,
+ from InnerEye.ML.common import (LAST_CHECKPOINT_FILE_NAME_WITH_SUFFIX,
DATASET_CSV_FILE_NAME, ModelExecutionMode, STORED_CSV_FILE_NAMES)
from InnerEye.ML.config import MixtureLossComponent, SegmentationLoss
from InnerEye.ML.configs.classification.DummyClassification import DummyClassification
@@ -192,7 +192,7 @@ def assert_all_close(metric: str, expected: List[float], **kwargs: Any) -> None:
assert train_config.checkpoint_folder.is_dir()
actual_checkpoints = list(train_config.checkpoint_folder.rglob("*.ckpt"))
assert len(actual_checkpoints) == 1, f"Actual checkpoints: {actual_checkpoints}"
- assert (train_config.checkpoint_folder / BEST_CHECKPOINT_FILE_NAME_WITH_SUFFIX).is_file()
+ assert (train_config.checkpoint_folder / LAST_CHECKPOINT_FILE_NAME_WITH_SUFFIX).is_file()
assert (train_config.outputs_folder / DATASET_CSV_FILE_NAME).is_file()
assert (train_config.outputs_folder / STORED_CSV_FILE_NAMES[ModelExecutionMode.TRAIN]).is_file()
assert (train_config.outputs_folder / STORED_CSV_FILE_NAMES[ModelExecutionMode.VAL]).is_file()
@@ -330,7 +330,7 @@ def test_recover_training_mean_teacher_model(test_output_dirs: OutputFolderForTe
config.num_epochs = 4
model_train_unittest(config, dirs=test_output_dirs)
assert len(list(config.checkpoint_folder.glob("*.*"))) == 1
- assert (config.checkpoint_folder / BEST_CHECKPOINT_FILE_NAME_WITH_SUFFIX).is_file()
+ assert (config.checkpoint_folder / LAST_CHECKPOINT_FILE_NAME_WITH_SUFFIX).is_file()

# Restart training from previous run
config.num_epochs = 3
18 changes: 9 additions & 9 deletions Tests/ML/utils/test_checkpoint_handling.py
@@ -14,7 +14,7 @@
from InnerEye.Common.fixed_paths import MODEL_INFERENCE_JSON_FILE_NAME
from InnerEye.ML.utils.checkpoint_handling import MODEL_WEIGHTS_DIR_NAME, get_recovery_checkpoint_path
from InnerEye.Common.output_directories import OutputFolderForTests
- from InnerEye.ML.common import BEST_CHECKPOINT_FILE_NAME_WITH_SUFFIX, FINAL_ENSEMBLE_MODEL_FOLDER, FINAL_MODEL_FOLDER
+ from InnerEye.ML.common import LAST_CHECKPOINT_FILE_NAME_WITH_SUFFIX, FINAL_ENSEMBLE_MODEL_FOLDER, FINAL_MODEL_FOLDER
from InnerEye.ML.model_config_base import ModelConfigBase
from InnerEye.ML.model_inference_config import read_model_inference_config
from InnerEye.ML.utils.checkpoint_handling import CheckpointHandler
@@ -86,7 +86,7 @@ def test_download_recovery_checkpoints_from_single_run(test_output_dirs: OutputF

expected_checkpoint_root = config.checkpoint_folder
expected_paths = [get_recovery_checkpoint_path(path=expected_checkpoint_root),
- expected_checkpoint_root / BEST_CHECKPOINT_FILE_NAME_WITH_SUFFIX]
+ expected_checkpoint_root / LAST_CHECKPOINT_FILE_NAME_WITH_SUFFIX]
assert checkpoint_handler.run_recovery.checkpoints_roots == [expected_checkpoint_root]
for path in expected_paths:
assert path.is_file()
@@ -199,7 +199,7 @@ def test_get_best_checkpoint_single_run(test_output_dirs: OutputFolderForTests)
# in the run, into a subfolder of the checkpoint folder
checkpoint_handler.azure_config.run_recovery_id = run_recovery_id
checkpoint_handler.download_recovery_checkpoints_or_weights()
- expected_checkpoint = config.checkpoint_folder / f"{BEST_CHECKPOINT_FILE_NAME_WITH_SUFFIX}"
+ expected_checkpoint = config.checkpoint_folder / LAST_CHECKPOINT_FILE_NAME_WITH_SUFFIX
checkpoint_paths = checkpoint_handler.get_best_checkpoints()
assert checkpoint_paths
assert len(checkpoint_paths) == 1
@@ -215,13 +215,13 @@ def test_get_best_checkpoint_single_run(test_output_dirs: OutputFolderForTests)

# There is no checkpoint in the current run - use the one from run_recovery
checkpoint_paths = checkpoint_handler.get_best_checkpoints()
- expected_checkpoint = config.checkpoint_folder / BEST_CHECKPOINT_FILE_NAME_WITH_SUFFIX
+ expected_checkpoint = config.checkpoint_folder / LAST_CHECKPOINT_FILE_NAME_WITH_SUFFIX
assert checkpoint_paths
assert len(checkpoint_paths) == 1
assert checkpoint_paths[0] == expected_checkpoint

# Copy over checkpoints to make it look like training has happened and a better checkpoint written
- expected_checkpoint = config.checkpoint_folder / BEST_CHECKPOINT_FILE_NAME_WITH_SUFFIX
+ expected_checkpoint = config.checkpoint_folder / LAST_CHECKPOINT_FILE_NAME_WITH_SUFFIX
expected_checkpoint.touch()
checkpoint_paths = checkpoint_handler.get_best_checkpoints()
assert checkpoint_paths
@@ -238,7 +238,7 @@ def test_download_checkpoints_from_hyperdrive_child_runs(test_output_dirs: Outpu
hyperdrive_run = get_most_recent_run(fallback_run_id_for_local_execution=FALLBACK_ENSEMBLE_RUN)
checkpoint_handler.download_checkpoints_from_hyperdrive_child_runs(hyperdrive_run)
expected_checkpoints = [config.checkpoint_folder / OTHER_RUNS_SUBDIR_NAME / str(i)
- / BEST_CHECKPOINT_FILE_NAME_WITH_SUFFIX for i in range(2)]
+ / LAST_CHECKPOINT_FILE_NAME_WITH_SUFFIX for i in range(2)]
checkpoint_paths = checkpoint_handler.get_best_checkpoints()
assert checkpoint_paths
assert len(checkpoint_paths) == 2
@@ -266,7 +266,7 @@ def test_get_checkpoints_to_test(test_output_dirs: OutputFolderForTests) -> None
checkpoint_handler.container.checkpoint_folder.mkdir(parents=True)

# Copy checkpoint to make it seem like training has happened
- expected_checkpoint = config.checkpoint_folder / BEST_CHECKPOINT_FILE_NAME_WITH_SUFFIX
+ expected_checkpoint = config.checkpoint_folder / LAST_CHECKPOINT_FILE_NAME_WITH_SUFFIX
expected_checkpoint.touch()
checkpoint_and_paths = checkpoint_handler.get_checkpoints_to_test()

@@ -295,10 +295,10 @@ def test_get_checkpoints_to_test_single_run(test_output_dirs: OutputFolderForTes

assert checkpoint_and_paths
assert len(checkpoint_and_paths) == 1
- assert checkpoint_and_paths[0] == config.checkpoint_folder / BEST_CHECKPOINT_FILE_NAME_WITH_SUFFIX
+ assert checkpoint_and_paths[0] == config.checkpoint_folder / LAST_CHECKPOINT_FILE_NAME_WITH_SUFFIX

# Copy checkpoint to make it seem like training has happened
- expected_checkpoint = config.checkpoint_folder / BEST_CHECKPOINT_FILE_NAME_WITH_SUFFIX
+ expected_checkpoint = config.checkpoint_folder / LAST_CHECKPOINT_FILE_NAME_WITH_SUFFIX
expected_checkpoint.touch()
checkpoint_and_paths = checkpoint_handler.get_checkpoints_to_test()

(The remaining 2 changed files are not shown here.)
