From 606c61964d8dbaeefcbf721215ff7786031abfc5 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Mon, 8 Apr 2024 09:39:03 +0000
Subject: [PATCH] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 colossalai/nn/optimizer/README.md                     |  2 +-
 colossalai/shardformer/modeling/gpt2.py               |  1 -
 colossalai/shardformer/policies/gpt2.py               |  8 ++++++--
 colossalai/shardformer/policies/llama.py              | 10 ++++++++--
 examples/images/vit/vit_benchmark.py                  |  4 +---
 examples/language/llama2/finetune.py                  |  4 +---
 examples/language/llama2/pretrain.py                  |  4 +---
 examples/language/opt/opt_train_demo.py               |  4 +---
 .../test_hybrid_parallel_plugin_checkpoint_io.py      |  8 ++------
 tests/test_optimizer/test_dist_adafactor.py           |  3 ---
 tests/test_optimizer/test_nvme.py                     |  3 ++-
 tests/test_pipeline/test_schedule/test_interleaved.py |  4 +---
 tests/test_pipeline/test_schedule/test_oneF_oneB.py   |  4 +---
 13 files changed, 25 insertions(+), 34 deletions(-)

diff --git a/colossalai/nn/optimizer/README.md b/colossalai/nn/optimizer/README.md
index 07c95143c74c..d4edd36051eb 100644
--- a/colossalai/nn/optimizer/README.md
+++ b/colossalai/nn/optimizer/README.md
@@ -89,7 +89,7 @@ A series of optimizers have been optimized and integrated.
 
 ### Distributed Adafactor
 
-Distributed Adafactor supports tensor parallelism and ZerO optimization. 
+Distributed Adafactor supports tensor parallelism and ZerO optimization.
 
 ### Performance
 | Version | iter | Float Percision | Device Nums | weight shape | Avg runtime(ms) | Avg Speed Up Rate | Best Speed Up Rate |
diff --git a/colossalai/shardformer/modeling/gpt2.py b/colossalai/shardformer/modeling/gpt2.py
index 407338b162df..e3bf4b782f29 100644
--- a/colossalai/shardformer/modeling/gpt2.py
+++ b/colossalai/shardformer/modeling/gpt2.py
@@ -1084,7 +1084,6 @@ def forward(
                 shift_logits, shift_labels, process_group=shard_config.tensor_parallel_process_group
             )
 
-
         if not shard_config.parallel_output:
             lm_logits = gather_forward_split_backward(lm_logits, -1, shard_config.tensor_parallel_process_group)
 
diff --git a/colossalai/shardformer/policies/gpt2.py b/colossalai/shardformer/policies/gpt2.py
index 6a50d65ba1e6..4bb6c8225970 100644
--- a/colossalai/shardformer/policies/gpt2.py
+++ b/colossalai/shardformer/policies/gpt2.py
@@ -269,13 +269,17 @@ def module_policy(self):
             GPT2LMHeadModel: ModulePolicyDescription(
                 sub_module_replacement=[
                     SubModuleReplacementDescription(
-                        suffix="lm_head", target_module=col_nn.Linear1D_Col, kwargs={"gather_output": not self.shard_config.parallel_output}
+                        suffix="lm_head",
+                        target_module=col_nn.Linear1D_Col,
+                        kwargs={"gather_output": not self.shard_config.parallel_output},
                     )
                 ],
             )
         }
         if self.shard_config.parallel_output:
-            addon_module[GPT2LMHeadModel].method_replacement={"forward": get_lm_forward_with_dist_cross_entropy(self.shard_config)}
+            addon_module[GPT2LMHeadModel].method_replacement = {
+                "forward": get_lm_forward_with_dist_cross_entropy(self.shard_config)
+            }
         module_policy.update(addon_module)
 
         if self.pipeline_stage_manager is not None:
diff --git a/colossalai/shardformer/policies/llama.py b/colossalai/shardformer/policies/llama.py
index 4c454ac7f2cf..bcc825104f1d 100644
--- a/colossalai/shardformer/policies/llama.py
+++ b/colossalai/shardformer/policies/llama.py
@@ -255,12 +255,18 @@ def module_policy(self):
             new_item = {
                 LlamaForCausalLM: ModulePolicyDescription(
                     sub_module_replacement=[
-                        SubModuleReplacementDescription(suffix="lm_head", target_module=Linear1D_Col, kwargs={"gather_output": not self.shard_config.parallel_output})
+                        SubModuleReplacementDescription(
+                            suffix="lm_head",
+                            target_module=Linear1D_Col,
+                            kwargs={"gather_output": not self.shard_config.parallel_output},
+                        )
                     ],
                 )
             }
             if self.shard_config.parallel_output:
-                new_item[LlamaForCausalLM].method_replacement={"forward": get_lm_forward_with_dist_cross_entropy(self.shard_config)}
+                new_item[LlamaForCausalLM].method_replacement = {
+                    "forward": get_lm_forward_with_dist_cross_entropy(self.shard_config)
+                }
             policy.update(new_item)
 
         if self.pipeline_stage_manager:
diff --git a/examples/images/vit/vit_benchmark.py b/examples/images/vit/vit_benchmark.py
index 32b1ec803aec..fdae9ee01537 100644
--- a/examples/images/vit/vit_benchmark.py
+++ b/examples/images/vit/vit_benchmark.py
@@ -119,9 +119,7 @@ def criterion(outputs, inputs):
         if hasattr(booster.plugin, "stage_manager") and booster.plugin.stage_manager is not None:
             # run pipeline forward backward
             batch = iter([batch])
-            outputs = booster.execute_pipeline(
-                batch, model, criterion, optimizer, return_loss=True
-            )
+            outputs = booster.execute_pipeline(batch, model, criterion, optimizer, return_loss=True)
         else:
             outputs = model(**batch)
             loss = criterion(outputs, None)
diff --git a/examples/language/llama2/finetune.py b/examples/language/llama2/finetune.py
index 122186c30a58..69b4ebe42bf7 100644
--- a/examples/language/llama2/finetune.py
+++ b/examples/language/llama2/finetune.py
@@ -270,9 +270,7 @@ def main():
         ) as pbar:
             for step in pbar:
                 if use_pipeline:
-                    outputs = booster.execute_pipeline(
-                        dataloader_iter, model, _criterion, optimizer, return_loss=True
-                    )
+                    outputs = booster.execute_pipeline(dataloader_iter, model, _criterion, optimizer, return_loss=True)
                     loss = outputs["loss"]
                 else:
                     batch = next(dataloader_iter)
diff --git a/examples/language/llama2/pretrain.py b/examples/language/llama2/pretrain.py
index 7b5805b801a8..970cd5290f9f 100644
--- a/examples/language/llama2/pretrain.py
+++ b/examples/language/llama2/pretrain.py
@@ -285,9 +285,7 @@ def main():
         ) as pbar:
             for step in pbar:
                 if use_pipeline:
-                    outputs = booster.execute_pipeline(
-                        dataloader_iter, model, _criterion, optimizer, return_loss=True
-                    )
+                    outputs = booster.execute_pipeline(dataloader_iter, model, _criterion, optimizer, return_loss=True)
                     loss = outputs["loss"]
                 else:
                     batch = next(dataloader_iter)
diff --git a/examples/language/opt/opt_train_demo.py b/examples/language/opt/opt_train_demo.py
index 82dff1920fde..05336bec42c5 100644
--- a/examples/language/opt/opt_train_demo.py
+++ b/examples/language/opt/opt_train_demo.py
@@ -41,9 +41,7 @@ def train_epoch(epoch, model, optimizer, _criterion, lr_scheduler, dataloader, b
         # Forward pass
         for _ in pbar:
             if use_pipeline:
-                outputs = booster.execute_pipeline(
-                    dataloader, model, _criterion, optimizer, return_loss=True
-                )
+                outputs = booster.execute_pipeline(dataloader, model, _criterion, optimizer, return_loss=True)
                 # Backward and optimize
                 if is_pp_last_stage:
                     loss = outputs["loss"]
diff --git a/tests/test_checkpoint_io/test_hybrid_parallel_plugin_checkpoint_io.py b/tests/test_checkpoint_io/test_hybrid_parallel_plugin_checkpoint_io.py
index 557666a804e3..d8a625b98a66 100644
--- a/tests/test_checkpoint_io/test_hybrid_parallel_plugin_checkpoint_io.py
+++ b/tests/test_checkpoint_io/test_hybrid_parallel_plugin_checkpoint_io.py
@@ -74,9 +74,7 @@ def _preprocess_data(data):
     data = data_gen_fn()
     model.train()
     if booster.plugin.stage_manager is not None:
-        booster.execute_pipeline(
-            _preprocess_data(data), model, _criterion, optimizer, return_loss=True
-        )
+        booster.execute_pipeline(_preprocess_data(data), model, _criterion, optimizer, return_loss=True)
     else:
         output = model(**_preprocess_data(data))
         loss = criterion(output)
@@ -108,9 +106,7 @@ def _preprocess_data(data):
         data_for_shard = data_gen_fn()
         data_for_origin = data_gen_fn()
         if booster.plugin.stage_manager is not None:
-            booster.execute_pipeline(
-                _preprocess_data(data_for_shard), model, _criterion, optimizer, return_loss=True
-            )
+            booster.execute_pipeline(_preprocess_data(data_for_shard), model, _criterion, optimizer, return_loss=True)
             booster.execute_pipeline(
                 _preprocess_data(data_for_origin),
                 new_model,
diff --git a/tests/test_optimizer/test_dist_adafactor.py b/tests/test_optimizer/test_dist_adafactor.py
index f675f42301a8..3649e69f4f81 100644
--- a/tests/test_optimizer/test_dist_adafactor.py
+++ b/tests/test_optimizer/test_dist_adafactor.py
@@ -457,9 +457,6 @@ def exam_dist_adafactor_zero(dtype: torch.dtype, tp_zero_size: tuple[int, int]):
         print(f"Curr Param correct {correctness}")
         # print(f"device {local_rank} base_optim state dict {base_optim.optim.state_dict()['state'].items()} \n dist_optim state dict {dist_optim.optim.state_dict()['state'].items()} \n")
 
-
-
-
 @parameterize("dtype", [torch.bfloat16])  # torch.float32, torch.float16, torch.bfloat16
 @parameterize("tp_zero_size", [(4, 2)])  # (2, 2), (4, 1),(1, 4), (2, 4), (4, 2)
diff --git a/tests/test_optimizer/test_nvme.py b/tests/test_optimizer/test_nvme.py
index 3315b3256d02..603b7b6fa325 100644
--- a/tests/test_optimizer/test_nvme.py
+++ b/tests/test_optimizer/test_nvme.py
@@ -1,5 +1,5 @@
-import torch
 import pytest
+import torch
 
 from colossalai.nn.optimizer import CPUAdam, HybridAdam
 from colossalai.testing import clear_cache_before_run, parameterize
@@ -17,6 +17,7 @@ def check_params_equal(model, torch_model):
     for p, torch_p in zip(model.parameters(), torch_model.parameters()):
         assert torch.allclose(p, torch_p, atol=1e-3), f"diff: {torch.abs(p - torch_p)}"
 
+
 # TODO Something wrong with ci when running this test.
 @pytest.mark.skip(reason="skip because of something wrong with CI")
 @clear_cache_before_run()
diff --git a/tests/test_pipeline/test_schedule/test_interleaved.py b/tests/test_pipeline/test_schedule/test_interleaved.py
index 7aa4640553ca..f8820688e610 100644
--- a/tests/test_pipeline/test_schedule/test_interleaved.py
+++ b/tests/test_pipeline/test_schedule/test_interleaved.py
@@ -103,9 +103,7 @@ def criterion(x, *args, **kwargs):
     torch_loss = criterion(torch_output)
     torch_loss.backward()
 
-    pp_ret = schedule.forward_backward_step(
-        sharded_model, iter(input_list), criterion, pp_optimizer, return_loss=True
-    )
+    pp_ret = schedule.forward_backward_step(sharded_model, iter(input_list), criterion, pp_optimizer, return_loss=True)
 
     # check loss
     if stage_manager.is_last_stage(ignore_chunk=True):
diff --git a/tests/test_pipeline/test_schedule/test_oneF_oneB.py b/tests/test_pipeline/test_schedule/test_oneF_oneB.py
index e1a679890c8d..590800780ab4 100644
--- a/tests/test_pipeline/test_schedule/test_oneF_oneB.py
+++ b/tests/test_pipeline/test_schedule/test_oneF_oneB.py
@@ -99,9 +99,7 @@ def custom_fwd(self, x):
     torch_output = torch_model(input_list[0])
     torch_loss = criterion(torch_output)
     torch_loss.backward()
-    pp_ret = schedule.forward_backward_step(
-        sharded_model, iter(input_list), criterion, pp_optimizer, return_loss=True
-    )
+    pp_ret = schedule.forward_backward_step(sharded_model, iter(input_list), criterion, pp_optimizer, return_loss=True)
 
     # check loss
     if stage_manager.is_last_stage():