From ce31b99dee668908b23111cfc13b4863672a3609 Mon Sep 17 00:00:00 2001
From: Mannat Singh
Date: Thu, 5 Mar 2020 17:55:51 -0800
Subject: [PATCH] Fix distributed training test (#420)

Summary:
Pull Request resolved: https://github.com/facebookresearch/ClassyVision/pull/420

The distributed training test was running the local trainer on each
process. The `distributed_backend` argument needed to be added since
`classy_train.py` now runs the `LocalTrainer` by default.

Reviewed By: vreis

Differential Revision: D20296492

fbshipit-source-id: dbaee937f39d31e7c98db2836b98d7f97307783b
---
 classy_train.py                          | 5 +++++
 test/trainer_distributed_trainer_test.py | 3 ++-
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/classy_train.py b/classy_train.py
index dcccc9f66e..26e2eb53b8 100755
--- a/classy_train.py
+++ b/classy_train.py
@@ -43,6 +43,7 @@
 from pathlib import Path
 
 import torch
+from classy_vision.generic.distributed_util import get_rank, get_world_size
 from classy_vision.generic.opts import check_generic_args, parse_train_arguments
 from classy_vision.generic.registry_utils import import_all_packages_from_directory
 from classy_vision.generic.util import load_checkpoint, load_json
@@ -104,6 +105,10 @@ def main(args, config):
 
     trainer = trainer_class(use_gpu=use_gpu, num_dataloader_workers=args.num_workers)
 
+    logging.info(
+        f"Starting training on rank {get_rank()} worker. "
+        f"World size is {get_world_size()}"
+    )
     # That's it! When this call returns, training is done.
     trainer.train(task)
 
diff --git a/test/trainer_distributed_trainer_test.py b/test/trainer_distributed_trainer_test.py
index 3f99f84d87..e3950d74b3 100644
--- a/test/trainer_distributed_trainer_test.py
+++ b/test/trainer_distributed_trainer_test.py
@@ -57,7 +57,8 @@ def test_training(self):
                 --device={device} \
                 --config={self.config_files[config_key]} \
                 --num_workers=4 \
-                --log_freq=100
+                --log_freq=100 \
+                --distributed_backend=ddp
             """
             result = subprocess.run(cmd, shell=True)
             success = result.returncode == 0
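
For readers skimming the patch, here is a minimal sketch of the flag-to-trainer mapping the commit message describes: without `--distributed_backend=ddp`, `classy_train.py` falls back to a local single-process trainer, which is why each launched process in the test was training independently. The stub classes and the `select_trainer` helper below are hypothetical stand-ins for illustration only, not ClassyVision's actual implementation; only the flag name and the `LocalTrainer` default are confirmed by the commit message.

# Hypothetical sketch (not ClassyVision code): how a --distributed_backend
# flag can choose between a local and a distributed trainer.
import argparse


class LocalTrainerStub:
    """Stand-in for a single-process trainer (the default behavior)."""


class DistributedTrainerStub:
    """Stand-in for a multi-process distributed data-parallel trainer."""


def select_trainer(distributed_backend: str):
    # "none" keeps the default local trainer; "ddp" opts in to distributed
    # training, which is what the test now requests via the new flag.
    backends = {"none": LocalTrainerStub, "ddp": DistributedTrainerStub}
    return backends[distributed_backend]


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--distributed_backend", default="none", choices=["none", "ddp"]
    )
    args = parser.parse_args()
    print(f"Selected {select_trainer(args.distributed_backend).__name__}")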