[pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
pre-commit-ci[bot] committed Apr 8, 2024
1 parent b75ac58 commit 606c619
Showing 13 changed files with 25 additions and 34 deletions.
2 changes: 1 addition & 1 deletion colossalai/nn/optimizer/README.md
@@ -89,7 +89,7 @@ A series of optimizers have been optimized and integrated.

### Distributed Adafactor

-Distributed Adafactor supports tensor parallelism and ZerO optimization. 
+Distributed Adafactor supports tensor parallelism and ZerO optimization.

### Performance
| Version | iter | Float Percision | Device Nums | weight shape | Avg runtime(ms) | Avg Speed Up Rate | Best Speed Up Rate |
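For context on the README section touched above: the distributed Adafactor described there is meant to be used through the usual booster/plugin flow. A minimal sketch, assuming the optimizer is exported as DistributedAdaFactor from colossalai.nn.optimizer (the class name, constructor, and launch call are assumptions, not something this diff shows):

import colossalai
import torch
from colossalai.booster import Booster
from colossalai.booster.plugin import HybridParallelPlugin
from colossalai.nn.optimizer import DistributedAdaFactor  # assumed export name

# launch under torchrun so a process group exists for TP/ZeRO
colossalai.launch_from_torch(config={})

plugin = HybridParallelPlugin(tp_size=2, pp_size=1, zero_stage=1)  # illustrative sizes
booster = Booster(plugin=plugin)

model = torch.nn.Linear(1024, 1024).cuda()
optimizer = DistributedAdaFactor(model.parameters())  # assumed signature
model, optimizer, *_ = booster.boost(model, optimizer)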
1 change: 0 additions & 1 deletion colossalai/shardformer/modeling/gpt2.py
@@ -1084,7 +1084,6 @@ def forward(
shift_logits, shift_labels, process_group=shard_config.tensor_parallel_process_group
)

-
if not shard_config.parallel_output:
lm_logits = gather_forward_split_backward(lm_logits, -1, shard_config.tensor_parallel_process_group)

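For orientation, the hunk above sits in the branch where shard_config.parallel_output decides whether the loss is computed directly on the vocabulary-sharded lm_logits (the distributed cross entropy, with the TP process group passed in) or the logits are first gathered via gather_forward_split_backward. A rough, forward-only sketch of what a cross entropy over vocabulary-sharded logits involves; this illustrates the idea only and is not ColossalAI's implementation, which also handles autograd:

import torch
import torch.distributed as dist

def sharded_cross_entropy(logits_shard: torch.Tensor, labels: torch.Tensor,
                          vocab_start: int, group) -> torch.Tensor:
    # logits_shard: (N, V_local), this rank's vocab slice; labels: (N,) global ids
    local_max = logits_shard.max(dim=-1, keepdim=True).values
    dist.all_reduce(local_max, op=dist.ReduceOp.MAX, group=group)   # global max per row
    sum_exp = (logits_shard - local_max).exp().sum(dim=-1, keepdim=True)
    dist.all_reduce(sum_exp, op=dist.ReduceOp.SUM, group=group)     # global denominator
    log_z = sum_exp.log() + local_max                               # log partition function

    # the target logit lives on exactly one rank; the others contribute zero
    vocab_end = vocab_start + logits_shard.size(-1)
    in_shard = (labels >= vocab_start) & (labels < vocab_end)
    local_idx = (labels - vocab_start).clamp(0, logits_shard.size(-1) - 1)
    target = logits_shard.gather(-1, local_idx.unsqueeze(-1)).squeeze(-1)
    target = torch.where(in_shard, target, torch.zeros_like(target))
    dist.all_reduce(target, op=dist.ReduceOp.SUM, group=group)

    return (log_z.squeeze(-1) - target).mean()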
8 changes: 6 additions & 2 deletions colossalai/shardformer/policies/gpt2.py
@@ -269,13 +269,17 @@ def module_policy(self):
GPT2LMHeadModel: ModulePolicyDescription(
sub_module_replacement=[
SubModuleReplacementDescription(
-suffix="lm_head", target_module=col_nn.Linear1D_Col, kwargs={"gather_output": not self.shard_config.parallel_output}
+suffix="lm_head",
+target_module=col_nn.Linear1D_Col,
+kwargs={"gather_output": not self.shard_config.parallel_output},
)
],
)
}
if self.shard_config.parallel_output:
-addon_module[GPT2LMHeadModel].method_replacement={"forward": get_lm_forward_with_dist_cross_entropy(self.shard_config)}
+addon_module[GPT2LMHeadModel].method_replacement = {
+"forward": get_lm_forward_with_dist_cross_entropy(self.shard_config)
+}
module_policy.update(addon_module)

if self.pipeline_stage_manager is not None:
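Beyond the reflow, the logic of this hunk is that method_replacement is attached to the GPT2LMHeadModel policy entry only when shard_config.parallel_output is set, swapping the forward for one that uses the distributed cross entropy. A self-contained sketch of that conditional-replacement pattern; the classes below are simplified stand-ins, not ColossalAI's ModulePolicyDescription or ShardConfig:

from dataclasses import dataclass, field
from typing import Callable, Dict, List, Optional

@dataclass
class StubShardConfig:          # simplified stand-in for ShardConfig
    parallel_output: bool = False

@dataclass
class StubPolicyEntry:          # simplified stand-in for ModulePolicyDescription
    sub_module_replacement: List[str] = field(default_factory=list)
    method_replacement: Optional[Dict[str, Callable]] = None

def build_lm_policy(cfg: StubShardConfig) -> Dict[str, StubPolicyEntry]:
    policy = {
        "GPT2LMHeadModel": StubPolicyEntry(
            sub_module_replacement=["lm_head -> Linear1D_Col(gather_output=not parallel_output)"]
        )
    }
    if cfg.parallel_output:
        # only in this case is the forward swapped for the dist-cross-entropy version
        policy["GPT2LMHeadModel"].method_replacement = {
            "forward": lambda *a, **kw: "dist cross-entropy forward (placeholder)"
        }
    return policy

print(build_lm_policy(StubShardConfig(parallel_output=True)))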
10 changes: 8 additions & 2 deletions colossalai/shardformer/policies/llama.py
@@ -255,12 +255,18 @@ def module_policy(self):
new_item = {
LlamaForCausalLM: ModulePolicyDescription(
sub_module_replacement=[
-SubModuleReplacementDescription(suffix="lm_head", target_module=Linear1D_Col, kwargs={"gather_output": not self.shard_config.parallel_output})
+SubModuleReplacementDescription(
+suffix="lm_head",
+target_module=Linear1D_Col,
+kwargs={"gather_output": not self.shard_config.parallel_output},
+)
],
)
}
if self.shard_config.parallel_output:
-new_item[LlamaForCausalLM].method_replacement={"forward": get_lm_forward_with_dist_cross_entropy(self.shard_config)}
+new_item[LlamaForCausalLM].method_replacement = {
+"forward": get_lm_forward_with_dist_cross_entropy(self.shard_config)
+}
policy.update(new_item)

if self.pipeline_stage_manager:
4 changes: 1 addition & 3 deletions examples/images/vit/vit_benchmark.py
@@ -119,9 +119,7 @@ def criterion(outputs, inputs):
if hasattr(booster.plugin, "stage_manager") and booster.plugin.stage_manager is not None:
# run pipeline forward backward
batch = iter([batch])
-outputs = booster.execute_pipeline(
-batch, model, criterion, optimizer, return_loss=True
-)
+outputs = booster.execute_pipeline(batch, model, criterion, optimizer, return_loss=True)
else:
outputs = model(**batch)
loss = criterion(outputs, None)
4 changes: 1 addition & 3 deletions examples/language/llama2/finetune.py
@@ -270,9 +270,7 @@ def main():
) as pbar:
for step in pbar:
if use_pipeline:
-outputs = booster.execute_pipeline(
-dataloader_iter, model, _criterion, optimizer, return_loss=True
-)
+outputs = booster.execute_pipeline(dataloader_iter, model, _criterion, optimizer, return_loss=True)
loss = outputs["loss"]
else:
batch = next(dataloader_iter)
4 changes: 1 addition & 3 deletions examples/language/llama2/pretrain.py
@@ -285,9 +285,7 @@ def main():
) as pbar:
for step in pbar:
if use_pipeline:
-outputs = booster.execute_pipeline(
-dataloader_iter, model, _criterion, optimizer, return_loss=True
-)
+outputs = booster.execute_pipeline(dataloader_iter, model, _criterion, optimizer, return_loss=True)
loss = outputs["loss"]
else:
batch = next(dataloader_iter)
4 changes: 1 addition & 3 deletions examples/language/opt/opt_train_demo.py
@@ -41,9 +41,7 @@ def train_epoch(epoch, model, optimizer, _criterion, lr_scheduler, dataloader, b
# Forward pass
for _ in pbar:
if use_pipeline:
-outputs = booster.execute_pipeline(
-dataloader, model, _criterion, optimizer, return_loss=True
-)
+outputs = booster.execute_pipeline(dataloader, model, _criterion, optimizer, return_loss=True)
# Backward and optimize
if is_pp_last_stage:
loss = outputs["loss"]
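All four example-script hunks above are the same reflow of one booster.execute_pipeline(...) call. Condensed, the loop pattern they share looks roughly like the sketch below, built only from the calls visible in these diffs; booster, model, criterion, and optimizer are assumed to come from an earlier booster.boost(...) with a pipeline-capable plugin, and the criterion signature follows the ViT benchmark above:

def train_step(booster, model, criterion, optimizer, batch, use_pipeline):
    if use_pipeline:
        # execute_pipeline consumes an iterator of batches and runs the pipeline
        # schedule's forward and backward passes internally
        outputs = booster.execute_pipeline(iter([batch]), model, criterion, optimizer, return_loss=True)
        loss = outputs.get("loss")  # depending on the schedule, only the last stage sees a loss
    else:
        outputs = model(**batch)
        loss = criterion(outputs, None)
        booster.backward(loss, optimizer)
    optimizer.step()
    optimizer.zero_grad()
    return loss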
8 changes: 2 additions & 6 deletions (file path not shown)
@@ -74,9 +74,7 @@ def _preprocess_data(data):
data = data_gen_fn()
model.train()
if booster.plugin.stage_manager is not None:
-booster.execute_pipeline(
-_preprocess_data(data), model, _criterion, optimizer, return_loss=True
-)
+booster.execute_pipeline(_preprocess_data(data), model, _criterion, optimizer, return_loss=True)
else:
output = model(**_preprocess_data(data))
loss = criterion(output)
@@ -108,9 +106,7 @@ def _preprocess_data(data):
data_for_shard = data_gen_fn()
data_for_origin = data_gen_fn()
if booster.plugin.stage_manager is not None:
-booster.execute_pipeline(
-_preprocess_data(data_for_shard), model, _criterion, optimizer, return_loss=True
-)
+booster.execute_pipeline(_preprocess_data(data_for_shard), model, _criterion, optimizer, return_loss=True)
booster.execute_pipeline(
_preprocess_data(data_for_origin),
new_model,
3 changes: 0 additions & 3 deletions tests/test_optimizer/test_dist_adafactor.py
@@ -457,9 +457,6 @@ def exam_dist_adafactor_zero(dtype: torch.dtype, tp_zero_size: tuple[int, int]):
print(f"Curr Param correct {correctness}")
# print(f"device {local_rank} base_optim state dict {base_optim.optim.state_dict()['state'].items()} \n dist_optim state dict {dist_optim.optim.state_dict()['state'].items()} \n")


-
-
-
@parameterize("dtype", [torch.bfloat16])  # torch.float32, torch.float16, torch.bfloat16
@parameterize("tp_zero_size", [(4, 2)])  # (2, 2), (4, 1),(1, 4), (2, 4), (4, 2)
3 changes: 2 additions & 1 deletion tests/test_optimizer/test_nvme.py
@@ -1,5 +1,5 @@
-import torch
import pytest
+import torch

from colossalai.nn.optimizer import CPUAdam, HybridAdam
from colossalai.testing import clear_cache_before_run, parameterize
@@ -17,6 +17,7 @@ def check_params_equal(model, torch_model):
for p, torch_p in zip(model.parameters(), torch_model.parameters()):
assert torch.allclose(p, torch_p, atol=1e-3), f"diff: {torch.abs(p - torch_p)}"

+
# TODO Something wrong with ci when running this test.
@pytest.mark.skip(reason="skip because of something wrong with CI")
@clear_cache_before_run()
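The import reorder above touches the NVMe-offload test for CPUAdam/HybridAdam. As a hedged sketch of what that test exercises; the keyword names below (nvme_offload_fraction, nvme_offload_dir) are assumptions about the optimizer's offload knobs, not something this diff shows:

import torch
from colossalai.nn.optimizer import HybridAdam  # CPUAdam is used the same way here

model = torch.nn.Linear(64, 64)
optimizer = HybridAdam(
    model.parameters(),
    lr=1e-3,
    nvme_offload_fraction=1.0,     # assumed kwarg: share of optimizer states staged on NVMe
    nvme_offload_dir="./offload",  # assumed kwarg: directory backing the offloaded states
)

for _ in range(3):
    loss = model(torch.randn(8, 64)).sum()
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()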
4 changes: 1 addition & 3 deletions tests/test_pipeline/test_schedule/test_interleaved.py
@@ -103,9 +103,7 @@ def criterion(x, *args, **kwargs):
torch_loss = criterion(torch_output)
torch_loss.backward()

-pp_ret = schedule.forward_backward_step(
-sharded_model, iter(input_list), criterion, pp_optimizer, return_loss=True
-)
+pp_ret = schedule.forward_backward_step(sharded_model, iter(input_list), criterion, pp_optimizer, return_loss=True)

# check loss
if stage_manager.is_last_stage(ignore_chunk=True):
4 changes: 1 addition & 3 deletions tests/test_pipeline/test_schedule/test_oneF_oneB.py
@@ -99,9 +99,7 @@ def custom_fwd(self, x):
torch_output = torch_model(input_list[0])
torch_loss = criterion(torch_output)
torch_loss.backward()
-pp_ret = schedule.forward_backward_step(
-sharded_model, iter(input_list), criterion, pp_optimizer, return_loss=True
-)
+pp_ret = schedule.forward_backward_step(sharded_model, iter(input_list), criterion, pp_optimizer, return_loss=True)

# check loss
if stage_manager.is_last_stage():
