From 606c61964d8dbaeefcbf721215ff7786031abfc5 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Mon, 8 Apr 2024 09:39:03 +0000
Subject: [PATCH] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 colossalai/nn/optimizer/README.md                     |  2 +-
 colossalai/shardformer/modeling/gpt2.py               |  1 -
 colossalai/shardformer/policies/gpt2.py               |  8 ++++++--
 colossalai/shardformer/policies/llama.py              | 10 ++++++++--
 examples/images/vit/vit_benchmark.py                  |  4 +---
 examples/language/llama2/finetune.py                  |  4 +---
 examples/language/llama2/pretrain.py                  |  4 +---
 examples/language/opt/opt_train_demo.py               |  4 +---
 .../test_hybrid_parallel_plugin_checkpoint_io.py      |  8 ++------
 tests/test_optimizer/test_dist_adafactor.py           |  3 ---
 tests/test_optimizer/test_nvme.py                     |  3 ++-
 tests/test_pipeline/test_schedule/test_interleaved.py |  4 +---
 tests/test_pipeline/test_schedule/test_oneF_oneB.py   |  4 +---
 13 files changed, 25 insertions(+), 34 deletions(-)

diff --git a/colossalai/nn/optimizer/README.md b/colossalai/nn/optimizer/README.md
index 07c95143c74c..d4edd36051eb 100644
--- a/colossalai/nn/optimizer/README.md
+++ b/colossalai/nn/optimizer/README.md
@@ -89,7 +89,7 @@ A series of optimizers have been optimized and integrated.
 
 ### Distributed Adafactor
 
-Distributed Adafactor supports tensor parallelism and ZerO optimization. 
+Distributed Adafactor supports tensor parallelism and ZerO optimization.
 
 ### Performance
 | Version | iter | Float Percision | Device Nums | weight shape | Avg runtime(ms) | Avg Speed Up Rate | Best Speed Up Rate |
diff --git a/colossalai/shardformer/modeling/gpt2.py b/colossalai/shardformer/modeling/gpt2.py
index 407338b162df..e3bf4b782f29 100644
--- a/colossalai/shardformer/modeling/gpt2.py
+++ b/colossalai/shardformer/modeling/gpt2.py
@@ -1084,7 +1084,6 @@ def forward(
                 shift_logits, shift_labels, process_group=shard_config.tensor_parallel_process_group
             )
 
-
         if not shard_config.parallel_output:
             lm_logits = gather_forward_split_backward(lm_logits, -1, shard_config.tensor_parallel_process_group)
 
diff --git a/colossalai/shardformer/policies/gpt2.py b/colossalai/shardformer/policies/gpt2.py
index 6a50d65ba1e6..4bb6c8225970 100644
--- a/colossalai/shardformer/policies/gpt2.py
+++ b/colossalai/shardformer/policies/gpt2.py
@@ -269,13 +269,17 @@ def module_policy(self):
             GPT2LMHeadModel: ModulePolicyDescription(
                 sub_module_replacement=[
                     SubModuleReplacementDescription(
-                        suffix="lm_head", target_module=col_nn.Linear1D_Col, kwargs={"gather_output": not self.shard_config.parallel_output}
+                        suffix="lm_head",
+                        target_module=col_nn.Linear1D_Col,
+                        kwargs={"gather_output": not self.shard_config.parallel_output},
                     )
                 ],
             )
         }
         if self.shard_config.parallel_output:
-            addon_module[GPT2LMHeadModel].method_replacement={"forward": get_lm_forward_with_dist_cross_entropy(self.shard_config)}
+            addon_module[GPT2LMHeadModel].method_replacement = {
+                "forward": get_lm_forward_with_dist_cross_entropy(self.shard_config)
+            }
         module_policy.update(addon_module)
 
         if self.pipeline_stage_manager is not None:
diff --git a/colossalai/shardformer/policies/llama.py b/colossalai/shardformer/policies/llama.py
index 4c454ac7f2cf..bcc825104f1d 100644
--- a/colossalai/shardformer/policies/llama.py
+++ b/colossalai/shardformer/policies/llama.py
@@ -255,12 +255,18 @@ def module_policy(self):
             new_item = {
                 LlamaForCausalLM: ModulePolicyDescription(
                     sub_module_replacement=[
-                        SubModuleReplacementDescription(suffix="lm_head", target_module=Linear1D_Col, kwargs={"gather_output": not self.shard_config.parallel_output})
+                        SubModuleReplacementDescription(
+                            suffix="lm_head",
+                            target_module=Linear1D_Col,
+                            kwargs={"gather_output": not self.shard_config.parallel_output},
+                        )
                     ],
                 )
             }
             if self.shard_config.parallel_output:
-                new_item[LlamaForCausalLM].method_replacement={"forward": get_lm_forward_with_dist_cross_entropy(self.shard_config)}
+                new_item[LlamaForCausalLM].method_replacement = {
+                    "forward": get_lm_forward_with_dist_cross_entropy(self.shard_config)
+                }
             policy.update(new_item)
 
         if self.pipeline_stage_manager:
diff --git a/examples/images/vit/vit_benchmark.py b/examples/images/vit/vit_benchmark.py
index 32b1ec803aec..fdae9ee01537 100644
--- a/examples/images/vit/vit_benchmark.py
+++ b/examples/images/vit/vit_benchmark.py
@@ -119,9 +119,7 @@ def criterion(outputs, inputs):
         if hasattr(booster.plugin, "stage_manager") and booster.plugin.stage_manager is not None:
             # run pipeline forward backward
             batch = iter([batch])
-            outputs = booster.execute_pipeline(
-                batch, model, criterion, optimizer, return_loss=True
-            )
+            outputs = booster.execute_pipeline(batch, model, criterion, optimizer, return_loss=True)
         else:
             outputs = model(**batch)
             loss = criterion(outputs, None)
diff --git a/examples/language/llama2/finetune.py b/examples/language/llama2/finetune.py
index 122186c30a58..69b4ebe42bf7 100644
--- a/examples/language/llama2/finetune.py
+++ b/examples/language/llama2/finetune.py
@@ -270,9 +270,7 @@ def main():
         ) as pbar:
             for step in pbar:
                 if use_pipeline:
-                    outputs = booster.execute_pipeline(
-                        dataloader_iter, model, _criterion, optimizer, return_loss=True
-                    )
+                    outputs = booster.execute_pipeline(dataloader_iter, model, _criterion, optimizer, return_loss=True)
                     loss = outputs["loss"]
                 else:
                     batch = next(dataloader_iter)
diff --git a/examples/language/llama2/pretrain.py b/examples/language/llama2/pretrain.py
index 7b5805b801a8..970cd5290f9f 100644
--- a/examples/language/llama2/pretrain.py
+++ b/examples/language/llama2/pretrain.py
@@ -285,9 +285,7 @@ def main():
         ) as pbar:
             for step in pbar:
                 if use_pipeline:
-                    outputs = booster.execute_pipeline(
-                        dataloader_iter, model, _criterion, optimizer, return_loss=True
-                    )
+                    outputs = booster.execute_pipeline(dataloader_iter, model, _criterion, optimizer, return_loss=True)
                     loss = outputs["loss"]
                 else:
                     batch = next(dataloader_iter)
diff --git a/examples/language/opt/opt_train_demo.py b/examples/language/opt/opt_train_demo.py
index 82dff1920fde..05336bec42c5 100644
--- a/examples/language/opt/opt_train_demo.py
+++ b/examples/language/opt/opt_train_demo.py
@@ -41,9 +41,7 @@ def train_epoch(epoch, model, optimizer, _criterion, lr_scheduler, dataloader, b
         # Forward pass
         for _ in pbar:
             if use_pipeline:
-                outputs = booster.execute_pipeline(
-                    dataloader, model, _criterion, optimizer, return_loss=True
-                )
+                outputs = booster.execute_pipeline(dataloader, model, _criterion, optimizer, return_loss=True)
                 # Backward and optimize
                 if is_pp_last_stage:
                     loss = outputs["loss"]
diff --git a/tests/test_checkpoint_io/test_hybrid_parallel_plugin_checkpoint_io.py b/tests/test_checkpoint_io/test_hybrid_parallel_plugin_checkpoint_io.py
index 557666a804e3..d8a625b98a66 100644
--- a/tests/test_checkpoint_io/test_hybrid_parallel_plugin_checkpoint_io.py
+++ b/tests/test_checkpoint_io/test_hybrid_parallel_plugin_checkpoint_io.py
@@ -74,9 +74,7 @@ def _preprocess_data(data):
     data = data_gen_fn()
     model.train()
     if booster.plugin.stage_manager is not None:
-        booster.execute_pipeline(
-            _preprocess_data(data), model, _criterion, optimizer, return_loss=True
-        )
+        booster.execute_pipeline(_preprocess_data(data), model, _criterion, optimizer, return_loss=True)
     else:
         output = model(**_preprocess_data(data))
         loss = criterion(output)
@@ -108,9 +106,7 @@ def _preprocess_data(data):
         data_for_shard = data_gen_fn()
         data_for_origin = data_gen_fn()
         if booster.plugin.stage_manager is not None:
-            booster.execute_pipeline(
-                _preprocess_data(data_for_shard), model, _criterion, optimizer, return_loss=True
-            )
+            booster.execute_pipeline(_preprocess_data(data_for_shard), model, _criterion, optimizer, return_loss=True)
             booster.execute_pipeline(
                 _preprocess_data(data_for_origin),
                 new_model,
diff --git a/tests/test_optimizer/test_dist_adafactor.py b/tests/test_optimizer/test_dist_adafactor.py
index f675f42301a8..3649e69f4f81 100644
--- a/tests/test_optimizer/test_dist_adafactor.py
+++ b/tests/test_optimizer/test_dist_adafactor.py
@@ -457,9 +457,6 @@ def exam_dist_adafactor_zero(dtype: torch.dtype, tp_zero_size: tuple[int, int]):
         print(f"Curr Param correct {correctness}")
         # print(f"device {local_rank} base_optim state dict {base_optim.optim.state_dict()['state'].items()} \n dist_optim state dict {dist_optim.optim.state_dict()['state'].items()} \n")
 
-
-
-
 @parameterize("dtype", [torch.bfloat16])  # torch.float32, torch.float16, torch.bfloat16
 @parameterize("tp_zero_size", [(4, 2)])  # (2, 2), (4, 1),(1, 4), (2, 4), (4, 2)
diff --git a/tests/test_optimizer/test_nvme.py b/tests/test_optimizer/test_nvme.py
index 3315b3256d02..603b7b6fa325 100644
--- a/tests/test_optimizer/test_nvme.py
+++ b/tests/test_optimizer/test_nvme.py
@@ -1,5 +1,5 @@
-import torch
 import pytest
+import torch
 
 from colossalai.nn.optimizer import CPUAdam, HybridAdam
 from colossalai.testing import clear_cache_before_run, parameterize
@@ -17,6 +17,7 @@ def check_params_equal(model, torch_model):
     for p, torch_p in zip(model.parameters(), torch_model.parameters()):
         assert torch.allclose(p, torch_p, atol=1e-3), f"diff: {torch.abs(p - torch_p)}"
 
+
 # TODO Something wrong with ci when running this test.
 @pytest.mark.skip(reason="skip because of something wrong with CI")
 @clear_cache_before_run()
diff --git a/tests/test_pipeline/test_schedule/test_interleaved.py b/tests/test_pipeline/test_schedule/test_interleaved.py
index 7aa4640553ca..f8820688e610 100644
--- a/tests/test_pipeline/test_schedule/test_interleaved.py
+++ b/tests/test_pipeline/test_schedule/test_interleaved.py
@@ -103,9 +103,7 @@ def criterion(x, *args, **kwargs):
     torch_loss = criterion(torch_output)
     torch_loss.backward()
 
-    pp_ret = schedule.forward_backward_step(
-        sharded_model, iter(input_list), criterion, pp_optimizer, return_loss=True
-    )
+    pp_ret = schedule.forward_backward_step(sharded_model, iter(input_list), criterion, pp_optimizer, return_loss=True)
 
     # check loss
     if stage_manager.is_last_stage(ignore_chunk=True):
diff --git a/tests/test_pipeline/test_schedule/test_oneF_oneB.py b/tests/test_pipeline/test_schedule/test_oneF_oneB.py
index e1a679890c8d..590800780ab4 100644
--- a/tests/test_pipeline/test_schedule/test_oneF_oneB.py
+++ b/tests/test_pipeline/test_schedule/test_oneF_oneB.py
@@ -99,9 +99,7 @@ def custom_fwd(self, x):
     torch_output = torch_model(input_list[0])
     torch_loss = criterion(torch_output)
     torch_loss.backward()
-    pp_ret = schedule.forward_backward_step(
-        sharded_model, iter(input_list), criterion, pp_optimizer, return_loss=True
-    )
+    pp_ret = schedule.forward_backward_step(sharded_model, iter(input_list), criterion, pp_optimizer, return_loss=True)
 
     # check loss
     if stage_manager.is_last_stage():