From 95c21e3950a86ad2f00e1b89690772bd87ae53ca Mon Sep 17 00:00:00 2001 From: Hongxin Liu Date: Mon, 26 Feb 2024 19:46:58 +0800 Subject: [PATCH 01/23] [extension] hotfix jit extension setup (#5402) --- setup.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/setup.py b/setup.py index 1244bfff0327..5f6d447ddc6b 100644 --- a/setup.py +++ b/setup.py @@ -121,8 +121,9 @@ def get_version() -> str: "tests", "scripts", "requirements", + "extensions", "*.egg-info", - ) + ), ), description="An integrated large-scale model training system with efficient parallelization techniques", long_description=fetch_readme(), @@ -153,10 +154,7 @@ def get_version() -> str: ], package_data={ "colossalai": [ - "_C/*.pyi", - "kernel/cuda_native/csrc/*", - "kernel/cuda_native/csrc/kernel/*", - "kernel/cuda_native/csrc/kernels/include/*", + "kernel/extensions/csrc/**/*", ] }, ) From d882d18c6544d024dd181c04fbb8c10893d3a653 Mon Sep 17 00:00:00 2001 From: Hongxin Liu Date: Tue, 27 Feb 2024 11:22:07 +0800 Subject: [PATCH 02/23] [example] reuse flash attn patch (#5400) --- examples/language/llama2/attn.py | 85 +-------------------------- examples/language/llama2/benchmark.py | 5 +- examples/language/llama2/finetune.py | 5 +- examples/language/llama2/pretrain.py | 5 +- 4 files changed, 7 insertions(+), 93 deletions(-) mode change 100644 => 120000 examples/language/llama2/attn.py diff --git a/examples/language/llama2/attn.py b/examples/language/llama2/attn.py deleted file mode 100644 index 2b2356b18b70..000000000000 --- a/examples/language/llama2/attn.py +++ /dev/null @@ -1,84 +0,0 @@ -from types import MethodType -from typing import Optional, Tuple - -import torch -import torch.nn as nn -from transformers.models.llama.modeling_llama import LlamaAttention, apply_rotary_pos_emb, repeat_kv - -SUPPORT_XFORMERS = False -SUPPORT_FLASH2 = False -try: - import xformers.ops as xops - - SUPPORT_XFORMERS = True -except ImportError: - pass - -try: - from flash_attn import flash_attn_func - - SUPPORT_FLASH2 = True -except ImportError: - pass - -SUPPORT_FLASH = SUPPORT_XFORMERS or SUPPORT_FLASH2 - - -def llama_flash_attention( - self: LlamaAttention, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - output_attentions: bool = False, - use_cache: bool = False, -) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - bsz, q_len, _ = hidden_states.size() - - query_states = self.q_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - key_states = self.k_proj(hidden_states).view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - value_states = self.v_proj(hidden_states).view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - kv_seq_len += past_key_value[0].shape[-2] - cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) - query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) - # [bsz, nh, t, hd] - - if past_key_value is not None: - # reuse k, v, self_attention - key_states = torch.cat([past_key_value[0], key_states], dim=2) - value_states = torch.cat([past_key_value[1], value_states], dim=2) - - past_key_value = (key_states, value_states) if use_cache else None - - # repeat k/v heads if n_kv_heads < n_heads - key_states = repeat_kv(key_states, 
self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) - - # q, k, v is [B, H, S, K] and xformers need [B, S, H, K]. returns [B, S, H, K] - query_states = query_states.transpose(1, 2) - key_states = key_states.transpose(1, 2) - value_states = value_states.transpose(1, 2) - if SUPPORT_FLASH2: - attn_output = flash_attn_func(query_states, key_states, value_states, causal=True) - else: - attn_output = xops.memory_efficient_attention( - query_states, key_states, value_states, attn_bias=xops.LowerTriangularMask() - ) - - attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) - - attn_output = self.o_proj(attn_output) - - if not output_attentions: - attn_weights = None - - return attn_output, attn_weights, past_key_value - - -def replace_xformers(model: nn.Module): - for module in model.modules(): - if isinstance(module, LlamaAttention): - module.forward = MethodType(llama_flash_attention, module) diff --git a/examples/language/llama2/attn.py b/examples/language/llama2/attn.py new file mode 120000 index 000000000000..4e95c7bfa519 --- /dev/null +++ b/examples/language/llama2/attn.py @@ -0,0 +1 @@ +../../../applications/Colossal-LLaMA-2/colossal_llama2/utils/flash_attention_patch.py \ No newline at end of file diff --git a/examples/language/llama2/benchmark.py b/examples/language/llama2/benchmark.py index b8f70ce9c9d8..54b023f64742 100644 --- a/examples/language/llama2/benchmark.py +++ b/examples/language/llama2/benchmark.py @@ -3,7 +3,7 @@ from contextlib import nullcontext import torch -from attn import SUPPORT_FLASH, replace_xformers +from attn import replace_with_flash_attention from data_utils import RandomDataset from model_utils import format_numel_str, get_model_numel from performance_evaluator import PerformanceEvaluator @@ -188,8 +188,7 @@ def empty_init(): model.gradient_checkpointing_enable() if args.xformers: - assert SUPPORT_FLASH, "Use flash attention while xfomers is not installed" - replace_xformers(model) + replace_with_flash_attention(model) model_numel = get_model_numel(model) coordinator.print_on_master(f"Model params: {format_numel_str(model_numel)}") diff --git a/examples/language/llama2/finetune.py b/examples/language/llama2/finetune.py index 66b5400765f7..3dbd0cf357b4 100644 --- a/examples/language/llama2/finetune.py +++ b/examples/language/llama2/finetune.py @@ -9,7 +9,7 @@ import torch import torch.distributed as dist import torch.nn as nn -from attn import SUPPORT_XFORMERS, replace_xformers +from attn import replace_with_flash_attention from data_utils import load_json, prepare_dataloader, save_json from datasets import load_dataset from torch.optim import Optimizer @@ -219,8 +219,7 @@ def main(): if args.grad_checkpoint: model.gradient_checkpointing_enable() if args.flash_attention: - assert SUPPORT_XFORMERS, "Use flash attention while xfomers is not installed" - replace_xformers(model) + replace_with_flash_attention(model) model_numel = get_model_numel(model) coordinator.print_on_master(f"Model params: {format_numel_str(model_numel)}") diff --git a/examples/language/llama2/pretrain.py b/examples/language/llama2/pretrain.py index 4cdf93e1914b..fe7d958307e9 100644 --- a/examples/language/llama2/pretrain.py +++ b/examples/language/llama2/pretrain.py @@ -8,7 +8,7 @@ import torch import torch.distributed as dist import torch.nn as nn -from attn import SUPPORT_XFORMERS, replace_xformers +from attn import replace_with_flash_attention from data_utils import load_json, prepare_dataloader, save_json from datasets import 
load_dataset from torch.optim import Optimizer @@ -238,8 +238,7 @@ def main(): if args.grad_checkpoint: model.gradient_checkpointing_enable() if args.flash_attention: - assert SUPPORT_XFORMERS, "Use flash attention while xfomers is not installed" - replace_xformers(model) + replace_with_flash_attention(model) model_numel = get_model_numel(model) coordinator.print_on_master(f"Model params: {format_numel_str(model_numel)}") From bf34c6fef6adb6f550bc3ff19443269e69bb7ad5 Mon Sep 17 00:00:00 2001 From: QinLuo Date: Tue, 27 Feb 2024 13:51:14 +0800 Subject: [PATCH 03/23] [fsdp] impl save/load shard model/optimizer (#5357) --- .../booster/plugin/torch_fsdp_plugin.py | 153 ++++++++++++++++-- .../test_torch_fsdp_checkpoint_io.py | 38 +++++ 2 files changed, 179 insertions(+), 12 deletions(-) diff --git a/colossalai/booster/plugin/torch_fsdp_plugin.py b/colossalai/booster/plugin/torch_fsdp_plugin.py index 2ea7593a5cc5..5445b4a6349d 100644 --- a/colossalai/booster/plugin/torch_fsdp_plugin.py +++ b/colossalai/booster/plugin/torch_fsdp_plugin.py @@ -1,3 +1,5 @@ +import logging +import os import warnings from pathlib import Path from typing import Callable, Iterable, Iterator, List, Optional, Tuple @@ -25,7 +27,7 @@ from torch.optim.lr_scheduler import _LRScheduler as LRScheduler from torch.utils.data import DataLoader -from colossalai.checkpoint_io import CheckpointIO, GeneralCheckpointIO, utils +from colossalai.checkpoint_io import CheckpointIO, GeneralCheckpointIO, utils, CheckpointIndexFile from colossalai.cluster import DistCoordinator from colossalai.interface import ModelWrapper, OptimizerWrapper @@ -74,17 +76,54 @@ def save_unsharded_optimizer(self, optimizer: OptimizerWrapper, checkpoint: str, def save_sharded_model( self, - model: nn.Module, - checkpoint: str, - gather_dtensor: bool, - prefix: Optional[str], - size_per_shard: int, - use_safetensors: bool, + model: ModelWrapper, + checkpoint_path: str, + gather_dtensor: bool = True, + prefix: Optional[str] = None, + size_per_shard: int = 1024, + use_safetensors: bool = False, ): """ Save model to checkpoint but only on master process. """ - raise NotImplementedError("Sharded model checkpoint is not supported yet.") + assert isinstance(model, TorchFSDPModel), "Please boost the model before saving!" + if os.path.isfile(checkpoint_path): + logging.error(f"Provided path ({checkpoint_path}) should be a directory, not a file") + return + + Path(checkpoint_path).mkdir(parents=True, exist_ok=True) + with FSDP.state_dict_type( + model.unwrap(), + StateDictType.FULL_STATE_DICT, + FullStateDictConfig(offload_to_cpu=True, rank0_only=True) + ): + state_dict = model.unwrap().state_dict() + + state_dict_shard = utils.shard_model_checkpoint(state_dict, max_shard_size=size_per_shard) + + weights_name, save_index_file = utils.get_model_base_filenames(prefix, use_safetensors) + index_file = CheckpointIndexFile(checkpoint_path) + + # In general cases, is_master is set to True to get the right behavior. + total_size = utils.save_state_dict_shards( + sharded_state_dict=state_dict_shard, + checkpoint=checkpoint_path, + index_file=index_file, + base_filename=weights_name, + is_master=self.coordinator.is_master(), + use_safetensors=use_safetensors, + ) + + # only save the index file on the master rank + if self.coordinator.is_master(): + index_file.append_meta_data("total_size", total_size) + index_file.write_index_file(save_index_file) + utils.save_config_file(model.unwrap(), checkpoint_path) + logging.info( + f"The model is split into checkpoint shards. 
" + f"You can find where each parameters has been saved in the " + f"index located at {save_index_file}." + ) def load_sharded_model( self, @@ -97,7 +136,24 @@ def load_sharded_model( """ Load model to checkpoint but only on master process. """ - raise NotImplementedError("Sharded model checkpoint is not supported yet.") + assert isinstance(model, TorchFSDPModel), "Please boost the model before loading!" + use_safetensors = False + if "safetensors" in checkpoint_index_file.name: + use_safetensors = True + + if use_safetensors and not utils.is_safetensors_available(): + raise ImportError("`safe_serialization` requires the `safetensors` library: `pip install safetensors`.") + + # read checkpoint index file + ckpt_index_file = CheckpointIndexFile.from_file(checkpoint_index_file) + checkpoint_files, _ = ckpt_index_file.get_checkpoint_filenames() + + fsdp_state_dict = {} + for shard_file in checkpoint_files: + fsdp_state_dict.update(utils.load_shard_state_dict(Path(shard_file), use_safetensors)) + + with FSDP.state_dict_type(model.unwrap(), StateDictType.FULL_STATE_DICT): + model.unwrap().load_state_dict(fsdp_state_dict, strict=False) def save_sharded_optimizer( self, optimizer: Optimizer, checkpoint: str, gather_dtensor: bool, prefix: str, size_per_shard: int @@ -105,13 +161,86 @@ def save_sharded_optimizer( """ Save optimizer to checkpoint but only on master process. """ - raise NotImplementedError("Sharded optimizer checkpoint is not supported yet.") + assert isinstance(optimizer, FSDPOptimizerWrapper), "Please boost the optimizer before saving!" + + if os.path.isfile(checkpoint): + logging.error(f"Provided path ({checkpoint}) should be a directory, not a file") + return + + Path(checkpoint).mkdir(parents=True, exist_ok=True) + + with FSDP.state_dict_type( + optimizer.unwrap_model().unwrap(), + StateDictType.FULL_STATE_DICT, + FullStateDictConfig(offload_to_cpu=True, rank0_only=True) + ): + fsdp_optim_state = FSDP.full_optim_state_dict( + optimizer.unwrap_model().unwrap(), optim=optimizer, rank0_only=True + ) + + if self.coordinator.is_master(): + # Preparing file paths and index file. + states_name, save_index_file, param_group_file = utils.get_optimizer_base_filenames(prefix) + index_file = CheckpointIndexFile(checkpoint) + + index_file.append_meta_data("param_groups", param_group_file) + group_file_path = os.path.join(checkpoint, param_group_file) + utils.save_param_groups(fsdp_optim_state, group_file_path) + + sharded_state = utils.shard_optimizer_checkpoint(fsdp_optim_state, max_shard_size=size_per_shard) + + # Save shards of optimizer states. + # In general cases, is_master is set to True to get the right behavior. + total_size = utils.save_state_dict_shards( + sharded_state_dict=sharded_state, + checkpoint=checkpoint, + index_file=index_file, + base_filename=states_name, + is_master=self.coordinator.is_master(), + use_safetensors=False, + ) + + index_file.append_meta_data("total_size", total_size) + index_file.write_index_file(save_index_file) + logging.info( + f"The optimizer is going to be split to checkpoint shards. " + f"You can find where each parameters has been saved in the " + f"index located at {save_index_file}." + ) def load_sharded_optimizer(self, optimizer: Optimizer, index_file_path: str, size_per_shard: int): """ Load optimizer to checkpoint but only on master process. """ - raise NotImplementedError("Sharded optimizer checkpoint is not supported yet.") + assert isinstance(optimizer, FSDPOptimizerWrapper), "Please boost the optimizer before saving!" 
+ + ckpt_index_file = CheckpointIndexFile.from_file(index_file_path) + + # Load param_groups + param_group_path = ckpt_index_file.get_param_group_filename() + if param_group_path is None: + raise RuntimeError( + f"Invalid index file path {index_file_path} for an optimizer. " + "Looking param group file under current directory." + ) + + saved_param_groups = torch.load(param_group_path) + + # Load param + fsdp_optim_state = {} + checkpoint_files, _ = ckpt_index_file.get_checkpoint_filenames() + for shard_file in checkpoint_files: + state_dict_shard = utils.load_shard_state_dict(Path(shard_file), use_safetensors=False) + fsdp_optim_state.update(state_dict_shard) + + fsdp_optim_dict = dict(state=fsdp_optim_state, param_groups=saved_param_groups) + + with FSDP.state_dict_type(optimizer.unwrap_model().unwrap(), StateDictType.FULL_STATE_DICT): + fsdp_state = FSDP.optim_state_dict_to_load( + model=optimizer.unwrap_model().unwrap(), optim=optimizer, optim_state_dict=fsdp_optim_dict + ) + optimizer.load_state_dict(fsdp_state) + def save_lr_scheduler(self, lr_scheduler: LRScheduler, checkpoint: str): """ @@ -190,7 +319,7 @@ def __init__( raise RuntimeError("FSDP is not supported while torch version under 1.12.0.") def support_no_sync(self) -> bool: - False + return False def no_sync(self, model: nn.Module, optimizer: OptimizerWrapper) -> Iterator[None]: raise NotImplementedError("Torch fsdp no_sync func not supported yet.") diff --git a/tests/test_checkpoint_io/test_torch_fsdp_checkpoint_io.py b/tests/test_checkpoint_io/test_torch_fsdp_checkpoint_io.py index dd41f8185c2b..dca562a3b837 100644 --- a/tests/test_checkpoint_io/test_torch_fsdp_checkpoint_io.py +++ b/tests/test_checkpoint_io/test_torch_fsdp_checkpoint_io.py @@ -10,6 +10,7 @@ if version.parse(torch.__version__) >= version.parse("1.12.0"): from colossalai.booster.plugin import TorchFSDPPlugin + from torch.distributed.fsdp import FullyShardedDataParallel as FSDP from colossalai.testing import rerun_if_address_is_in_use, spawn @@ -99,6 +100,43 @@ def run_model(): outputs_sec = fsdp_model(inputs) assert criterion(outputs_sec) == criterion(outputs) + with shared_tempdir() as tempdir: + model_ckpt_path = f"{tempdir}/model" + optim_ckpt_path = f"{tempdir}/optimizer" + + run_model() + + booster.save_model(fsdp_model, model_ckpt_path, shard=True) + booster.save_optimizer(optimizer, optim_ckpt_path, shard=True) + + full_msd = fsdp_model.unwrap().state_dict() + full_osd = FSDP.full_optim_state_dict(optimizer.unwrap_model().unwrap(), optim=optimizer) + + import copy + sharded_osd = copy.deepcopy(full_osd) + + run_model() + + full_msd_updated = fsdp_model.unwrap().state_dict() + full_osd_updated = FSDP.full_optim_state_dict(optimizer.unwrap_model().unwrap(), optim=optimizer) + + # cost much time led to timeout + # assert not compare_nested_dict(full_osd_updated, sharded_osd) + # assert not compare_nested_dict(full_msd_updated, full_msd) + outputs_first = fsdp_model(inputs) + assert criterion(outputs_first) != criterion(outputs) + + booster.load_model(fsdp_model, model_ckpt_path) + booster.load_optimizer(optimizer, optim_ckpt_path) + + full_msd_restore = fsdp_model.unwrap().state_dict() + sharded_osd_restore = FSDP.full_optim_state_dict(optimizer.unwrap_model().unwrap(), optim=optimizer) + + assert compare_nested_dict(sharded_osd, sharded_osd_restore) + assert compare_nested_dict(full_msd_restore, full_msd) + outputs_sec = fsdp_model(inputs) + assert criterion(outputs_sec) == criterion(outputs) + def run_dist(rank, world_size, port): # init dist env From 
dcdd8a5ef7ae450442a20c8021a333501b4922d7 Mon Sep 17 00:00:00 2001 From: Frank Lee Date: Tue, 27 Feb 2024 15:19:13 +0800 Subject: [PATCH 04/23] [setup] fixed nightly release (#5388) --- .../workflows/release_nightly_on_schedule.yml | 12 ++++--- .../scripts/update_setup_for_nightly.py | 34 +++++++++++++++++++ setup.py | 16 ++------- 3 files changed, 45 insertions(+), 17 deletions(-) create mode 100644 .github/workflows/scripts/update_setup_for_nightly.py diff --git a/.github/workflows/release_nightly_on_schedule.yml b/.github/workflows/release_nightly_on_schedule.yml index 4125f333f301..072a943aef19 100644 --- a/.github/workflows/release_nightly_on_schedule.yml +++ b/.github/workflows/release_nightly_on_schedule.yml @@ -6,11 +6,13 @@ on: - cron: '0 0 * * 6' # release on every Sunday 00:00 UTC time jobs: - build-n-publish: + publish: if: github.repository == 'hpcaitech/ColossalAI' name: Build and publish Python 🐍 distributions 📦 to PyPI runs-on: ubuntu-latest timeout-minutes: 20 + outputs: + status: ${{ steps.publish.outcome }} steps: - uses: actions/checkout@v2 @@ -18,7 +20,9 @@ jobs: with: python-version: '3.8.14' - - run: NIGHTLY=1 python setup.py sdist build + - run: | + python .github/workflows/scripts/update_setup_for_nightly.py + python setup.py sdist build # publish to PyPI if executed on the main branch - name: Publish package to PyPI @@ -31,7 +35,7 @@ jobs: notify: name: Notify Lark via webhook - needs: build-n-publish + needs: publish runs-on: ubuntu-latest if: ${{ always() }} && github.repository == 'hpcaitech/ColossalAI' steps: @@ -62,4 +66,4 @@ jobs: REPO: ${{ github.repository }} RUN_ID: ${{ github.run_id }} WEBHOOK_URL: ${{ secrets.LARK_NOTIFICATION_WEBHOOK_URL }} - STATUS: ${{ steps.publish.outcome }} + STATUS: ${{ needs.publish.outputs.status }} diff --git a/.github/workflows/scripts/update_setup_for_nightly.py b/.github/workflows/scripts/update_setup_for_nightly.py new file mode 100644 index 000000000000..d8a3087ef54e --- /dev/null +++ b/.github/workflows/scripts/update_setup_for_nightly.py @@ -0,0 +1,34 @@ +from datetime import datetime + + +def open_setup_file(): + with open("setup.py", "r") as f: + file_lines = f.readlines() + return file_lines + + +def replace_nightly_package_info(file_lines): + version = datetime.today().strftime("%Y.%m.%d") + package_name = "colossalai-nightly" + + for idx, line in enumerate(file_lines): + if "version = get_version()" in line: + file_lines[idx] = f'version = "{version}"\n' + if 'package_name = "colossalai"' in line: + file_lines[idx] = f'package_name = "{package_name}"\n' + return file_lines + + +def write_setup_file(file_lines): + with open("setup.py", "w") as f: + f.writelines(file_lines) + + +def main(): + file_lines = open_setup_file() + file_lines = replace_nightly_package_info(file_lines) + write_setup_file(file_lines) + + +if __name__ == "__main__": + main() diff --git a/setup.py b/setup.py index 5f6d447ddc6b..e54ec41ea9f8 100644 --- a/setup.py +++ b/setup.py @@ -1,6 +1,5 @@ import os import sys -from datetime import datetime from typing import List from setuptools import find_packages, setup @@ -15,7 +14,6 @@ THIS_DIR = os.path.dirname(os.path.abspath(__file__)) BUILD_EXT = int(os.environ.get("BUILD_EXT", "0")) == 1 -IS_NIGHTLY = int(os.environ.get("NIGHTLY", "0")) == 1 # we do not support windows currently if sys.platform == "win32": @@ -96,23 +94,15 @@ def get_version() -> str: else: ext_modules = [] -# always put not nightly branch as the if branch -# otherwise github will treat colossalai-nightly as the project name -# and 
it will mess up with the dependency graph insights -if not IS_NIGHTLY: - version = get_version() - package_name = "colossalai" -else: - # use date as the nightly version - version = datetime.today().strftime("%Y.%m.%d") - package_name = "colossalai-nightly" +version = get_version() +package_name = "colossalai" setup( name=package_name, version=version, packages=find_packages( exclude=( - "op_builder", + "extensions", "benchmark", "docker", "tests", From 0a25e16e4657b06958dd371410a136bc2f400552 Mon Sep 17 00:00:00 2001 From: flybird11111 <1829166702@qq.com> Date: Tue, 27 Feb 2024 22:44:07 +0800 Subject: [PATCH 05/23] [shardformer]gather llama logits (#5398) * gather llama logits * fix --- colossalai/shardformer/modeling/llama.py | 7 +++++++ colossalai/shardformer/shard/shard_config.py | 1 + 2 files changed, 8 insertions(+) diff --git a/colossalai/shardformer/modeling/llama.py b/colossalai/shardformer/modeling/llama.py index e10a7ed7da0c..92c709218a26 100644 --- a/colossalai/shardformer/modeling/llama.py +++ b/colossalai/shardformer/modeling/llama.py @@ -16,6 +16,7 @@ from colossalai.shardformer.shard import ShardConfig from ..layer import cross_entropy_1d +from ..layer._operation import _gather try: from transformers.models.llama.modeling_llama import _prepare_4d_causal_attention_mask @@ -288,6 +289,9 @@ def llama_for_causal_lm_forward( shift_logits = shift_logits.view(-1, self.config.vocab_size) loss = loss_fct(shift_logits, shift_labels) + if not shard_config.parallel_output: + logits = _gather(logits, -1, shard_config.tensor_parallel_process_group) + if not return_dict: output = (logits,) + outputs[1:] return (loss,) + output if loss is not None else output @@ -588,6 +592,9 @@ def forward( shift_logits = shift_logits.view(-1, self.config.vocab_size) loss = loss_fct(shift_logits, shift_labels) + if not shard_config.parallel_output: + logits = _gather(logits, -1, shard_config.tensor_parallel_process_group) + if not return_dict: output = (logits,) + outputs[1:] return (loss,) + output if loss is not None else output diff --git a/colossalai/shardformer/shard/shard_config.py b/colossalai/shardformer/shard/shard_config.py index b5c9e66e0b87..415fc6dd5f06 100644 --- a/colossalai/shardformer/shard/shard_config.py +++ b/colossalai/shardformer/shard/shard_config.py @@ -34,6 +34,7 @@ class ShardConfig: enable_all_optimization: bool = False enable_sequence_parallelism: bool = False enable_sequence_overlap: bool = False + parallel_output = True extra_kwargs: Dict[str, Any] = field(default_factory=dict) # pipeline_parallel_size: int # data_parallel_size: int From a28c971516133ca730623a9f36d27411e3414159 Mon Sep 17 00:00:00 2001 From: Tong Li Date: Wed, 28 Feb 2024 17:46:27 +0800 Subject: [PATCH 06/23] update requirements (#5407) --- applications/Colossal-LLaMA-2/requirements.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/applications/Colossal-LLaMA-2/requirements.txt b/applications/Colossal-LLaMA-2/requirements.txt index d8afee768c02..34afaf7e5cfd 100644 --- a/applications/Colossal-LLaMA-2/requirements.txt +++ b/applications/Colossal-LLaMA-2/requirements.txt @@ -1,9 +1,9 @@ torch<2.0.0, >=1.12.1 packaging==23.1 -colossalai==0.3.2 +colossalai==0.3.5 autoflake==2.2.1 black==23.9.1 -transformers +transformers==4.33.3 tensorboard==2.14.0 six==1.16.0 datasets From 2461f37886e5e23f12f4131d47e0b0f7eedd0154 Mon Sep 17 00:00:00 2001 From: Frank Lee Date: Thu, 29 Feb 2024 13:56:55 +0800 Subject: [PATCH 07/23] [workflow] added pypi channel (#5412) --- 
.github/workflows/release_test_pypi_before_merge.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/release_test_pypi_before_merge.yml b/.github/workflows/release_test_pypi_before_merge.yml index 284ab4d1afb0..7af641fc3056 100644 --- a/.github/workflows/release_test_pypi_before_merge.yml +++ b/.github/workflows/release_test_pypi_before_merge.yml @@ -49,6 +49,6 @@ jobs: # we need to install the requirements.txt first # as test-pypi may not contain the distributions for libs listed in the txt file pip install -r requirements/requirements.txt - pip install --index-url https://test.pypi.org/simple/ colossalai==$VERSION + pip install --index-url https://test.pypi.org/simple/ --extra-index-url https://pypi.python.org/pypi colossalai==$VERSION env: VERSION: ${{ steps.prep-version.outputs.version }} From 5de940de320b9c94c75060e1537f7e4a4df45faa Mon Sep 17 00:00:00 2001 From: binmakeswell Date: Thu, 29 Feb 2024 14:51:29 +0800 Subject: [PATCH 08/23] [doc] fix blog link --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 3963fe2fb5d6..442e6bbcd8cf 100644 --- a/README.md +++ b/README.md @@ -9,7 +9,7 @@ Documentation | Examples | Forum | - Blog + Blog [![GitHub Repo stars](https://img.shields.io/github/stars/hpcaitech/ColossalAI?style=social)](https://github.com/hpcaitech/ColossalAI/stargazers) [![Build](https://github.com/hpcaitech/ColossalAI/actions/workflows/build_on_schedule.yml/badge.svg)](https://github.com/hpcaitech/ColossalAI/actions/workflows/build_on_schedule.yml) From a1c6cdb1894e3ac02223a59aadf7957786cae300 Mon Sep 17 00:00:00 2001 From: binmakeswell Date: Thu, 29 Feb 2024 14:52:30 +0800 Subject: [PATCH 09/23] [doc] fix blog link --- docs/README-zh-Hans.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/README-zh-Hans.md b/docs/README-zh-Hans.md index 0c438c726baa..c25f19795a20 100644 --- a/docs/README-zh-Hans.md +++ b/docs/README-zh-Hans.md @@ -9,7 +9,7 @@ 文档 | 例程 | 论坛 | - 博客 + 博客 [![GitHub Repo stars](https://img.shields.io/github/stars/hpcaitech/ColossalAI?style=social)](https://github.com/hpcaitech/ColossalAI/stargazers) [![Build](https://github.com/hpcaitech/ColossalAI/actions/workflows/build_on_schedule.yml/badge.svg)](https://github.com/hpcaitech/ColossalAI/actions/workflows/build_on_schedule.yml) From 4b8312c08e8d05a5f41453d63c8671aab601ed1c Mon Sep 17 00:00:00 2001 From: Camille Zhong <44392324+Camille7777@users.noreply.github.com> Date: Fri, 1 Mar 2024 17:27:50 +0800 Subject: [PATCH 10/23] fix sft single turn inference example (#5416) --- applications/Colossal-LLaMA-2/inference_example.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/applications/Colossal-LLaMA-2/inference_example.py b/applications/Colossal-LLaMA-2/inference_example.py index 77e18d8b5939..63ce91e50432 100644 --- a/applications/Colossal-LLaMA-2/inference_example.py +++ b/applications/Colossal-LLaMA-2/inference_example.py @@ -15,7 +15,7 @@ def load_model(model_path, device="cuda", **kwargs): model.to(device) try: - tokenizer = AutoTokenizer.from_pretrained(model_path) + tokenizer = AutoTokenizer.from_pretrained(model_path, padding_side='left') except OSError: raise ImportError("Tokenizer not found. 
Please check if the tokenizer exists or the model path is correct.") @@ -29,6 +29,7 @@ def generate(args): if args.prompt_style == "sft": conversation = default_conversation.copy() conversation.append_message("Human", args.input_txt) + conversation.append_message("Assistant", None) input_txt = conversation.get_prompt() else: BASE_INFERENCE_SUFFIX = "\n\n->\n\n" @@ -46,7 +47,7 @@ def generate(args): num_return_sequences=1, ) response = tokenizer.decode(output.cpu()[0, num_input_tokens:], skip_special_tokens=True) - logger.info(f"Question: {input_txt} \n\n Answer: \n{response}") + logger.info(f"\nHuman: {args.input_txt} \n\nAssistant: \n{response}") return response From 29695cf70c2652e4017bd76ff6337572f5b05035 Mon Sep 17 00:00:00 2001 From: flybird11111 <1829166702@qq.com> Date: Mon, 4 Mar 2024 16:18:13 +0800 Subject: [PATCH 11/23] [example]add gpt2 benchmark example script. (#5295) * benchmark gpt2 * fix fix fix fix * [doc] fix typo in Colossal-LLaMA-2/README.md (#5247) * [workflow] fixed build CI (#5240) * [workflow] fixed build CI * polish * polish * polish * polish * polish * [ci] fixed booster test (#5251) * [ci] fixed booster test * [ci] fixed booster test * [ci] fixed booster test * [ci] fixed ddp test (#5254) * [ci] fixed ddp test * polish * fix typo in applications/ColossalEval/README.md (#5250) * [ci] fix shardformer tests. (#5255) * fix ci fix * revert: revert p2p * feat: add enable_metadata_cache option * revert: enable t5 tests --------- Co-authored-by: Wenhao Chen * [doc] fix doc typo (#5256) * [doc] fix annotation display * [doc] fix llama2 doc * [hotfix]: add pp sanity check and fix mbs arg (#5268) * fix: fix misleading mbs arg * feat: add pp sanity check * fix: fix 1f1b sanity check * [workflow] fixed incomplete bash command (#5272) * [workflow] fixed oom tests (#5275) * [workflow] fixed oom tests * polish * polish * polish * [ci] fix test_hybrid_parallel_plugin_checkpoint_io.py (#5276) * fix ci fix * fix test * revert: revert p2p * feat: add enable_metadata_cache option * revert: enable t5 tests * fix --------- Co-authored-by: Wenhao Chen * [shardformer] hybridparallelplugin support gradients accumulation. 
(#5246) * support gradients acc fix fix fix fix fix fix fix fix fix fix fix fix fix * fix fix * fix fix fix * [hotfix] Fix ShardFormer test execution path when using sequence parallelism (#5230) * fix auto loading gpt2 tokenizer (#5279) * [doc] add llama2-13B disyplay (#5285) * Update README.md * fix 13b typo --------- Co-authored-by: binmakeswell * fix llama pretrain (#5287) * fix * fix * fix fix * fix fix fix * fix fix * benchmark gpt2 * fix fix fix fix * [workflow] fixed build CI (#5240) * [workflow] fixed build CI * polish * polish * polish * polish * polish * [ci] fixed booster test (#5251) * [ci] fixed booster test * [ci] fixed booster test * [ci] fixed booster test * fix fix * fix fix fix * fix * fix fix fix fix fix * fix * Update shardformer.py --------- Co-authored-by: digger yu Co-authored-by: Frank Lee Co-authored-by: Wenhao Chen Co-authored-by: binmakeswell Co-authored-by: Zhongkai Zhao Co-authored-by: Michelle <97082656+MichelleMa8@users.noreply.github.com> Co-authored-by: Desperado-Jia <502205863@qq.com> --- .github/workflows/build_on_pr.yml | 2 +- .github/workflows/build_on_schedule.yml | 2 +- .../booster/plugin/hybrid_parallel_plugin.py | 3 + colossalai/shardformer/layer/_operation.py | 27 +-- colossalai/shardformer/modeling/gpt2.py | 93 ++++++- colossalai/shardformer/policies/gpt2.py | 82 +++++-- colossalai/shardformer/shard/shardformer.py | 4 + examples/__init__.py | 0 examples/language/__init__.py | 0 examples/language/{llama2 => }/data_utils.py | 2 +- .../gpt/hybridparallelism/benchmark.py | 228 ++++++++++++++++++ examples/language/llama2/benchmark.py | 3 + examples/language/{llama2 => }/model_utils.py | 0 .../{llama2 => }/performance_evaluator.py | 0 tests/kit/model_zoo/registry.py | 2 +- .../test_plugin/test_3d_plugin.py | 2 +- .../test_plugin/test_gemini_plugin.py | 2 +- .../test_gemini_checkpoint_io.py | 2 +- tests/test_lazy/test_models.py | 2 +- .../test_gpt2_qkv_fused_linear_1d.py | 5 +- .../test_layer/test_linear_1d.py | 3 + .../test_layer/test_qkv_fused_linear_1d.py | 5 +- 22 files changed, 421 insertions(+), 48 deletions(-) create mode 100644 examples/__init__.py create mode 100644 examples/language/__init__.py rename examples/language/{llama2 => }/data_utils.py (99%) create mode 100644 examples/language/gpt/hybridparallelism/benchmark.py rename examples/language/{llama2 => }/model_utils.py (100%) rename examples/language/{llama2 => }/performance_evaluator.py (100%) diff --git a/.github/workflows/build_on_pr.yml b/.github/workflows/build_on_pr.yml index 2cad504f3391..b01d15490e0f 100644 --- a/.github/workflows/build_on_pr.yml +++ b/.github/workflows/build_on_pr.yml @@ -201,4 +201,4 @@ jobs: uses: actions/upload-artifact@v3 with: name: report - path: report/ + path: report/ \ No newline at end of file diff --git a/.github/workflows/build_on_schedule.yml b/.github/workflows/build_on_schedule.yml index ae1a5275e5da..510665b46f4b 100644 --- a/.github/workflows/build_on_schedule.yml +++ b/.github/workflows/build_on_schedule.yml @@ -83,4 +83,4 @@ jobs: SERVER_URL: ${{github.server_url }} REPO: ${{ github.repository }} RUN_ID: ${{ github.run_id }} - WEBHOOK_URL: ${{ secrets.LARK_NOTIFICATION_WEBHOOK_URL }} + WEBHOOK_URL: ${{ secrets.LARK_NOTIFICATION_WEBHOOK_URL }} \ No newline at end of file diff --git a/colossalai/booster/plugin/hybrid_parallel_plugin.py b/colossalai/booster/plugin/hybrid_parallel_plugin.py index da67e6b41fbf..bf677e052f88 100644 --- a/colossalai/booster/plugin/hybrid_parallel_plugin.py +++ 
b/colossalai/booster/plugin/hybrid_parallel_plugin.py @@ -36,6 +36,8 @@ DP_AXIS, PP_AXIS, TP_AXIS = 0, 1, 2 +PRECISION_TORCH_TYPE = {"fp16": torch.float16, "fp32": torch.float32, "bf16": torch.bfloat16} + def _convert_floating_point(x, dtype: torch.dtype = torch.float16): if isinstance(x, torch.Tensor) and torch.is_floating_point(x): @@ -1059,6 +1061,7 @@ def __init__( overlap_communication=overlap_communication, cpu_offload=cpu_offload, partition_grad=(self.zero_stage == 2), + forced_dtype=PRECISION_TORCH_TYPE[precision], ) self.max_norm = max_norm diff --git a/colossalai/shardformer/layer/_operation.py b/colossalai/shardformer/layer/_operation.py index 4bca335c84d8..d4960c7e4bde 100644 --- a/colossalai/shardformer/layer/_operation.py +++ b/colossalai/shardformer/layer/_operation.py @@ -9,6 +9,7 @@ try: import fused_weight_gradient_mlp_cuda + _grad_accum_fusion_available = True except ImportError: _grad_accum_fusion_available = False @@ -78,7 +79,8 @@ def backward(ctx, grad_output): # In order to be hooked into Gemini's '__torch_function__', adding a view operation to weight and bias. weight = weight.view(weight.shape) - bias = bias.view(bias.shape) + if bias is not None: + bias = bias.view(bias.shape) total_input = input grad_input = grad_output.matmul(weight.T) @@ -91,9 +93,8 @@ def backward(ctx, grad_output): if ctx.async_grad_allreduce: # Asynchronous all-reduce handle = dist.all_reduce(grad_input, group=ctx.process_group, async_op=True) - # Delay the start of weight gradient computation shortly (3us) to have - # all-reduce scheduled first and have GPU resources allocated - _ = torch.empty(1, device=grad_output.device) + 1 + # Relay on CUDA_DEVICE_MAX_CONNECTIONS=1 to have + # all-reduce scheduled first and have GPU resources allocated, CUDA_DEVICE_MAX_CONNECTIONS=1 is set in shardformer.py grad_weight = total_input.t().matmul(grad_output) grad_bias = grad_output.sum(dim=0) if use_bias else None @@ -115,7 +116,6 @@ def forward(ctx, input_, weight, bias, process_group, async_grad_allreduce): ctx.use_bias = bias is not None ctx.process_group = process_group ctx.async_grad_allreduce = async_grad_allreduce - if bias is not None: output = F.linear(input_, weight, bias) else: @@ -143,9 +143,8 @@ def backward(ctx, grad_output): if ctx.async_grad_allreduce: # Asynchronous all-reduce handle = dist.all_reduce(grad_input, group=ctx.process_group, async_op=True) - # Delay the start of weight gradient computation shortly (3us) to have - # all-reduce scheduled first and have GPU resources allocated - _ = torch.empty(1, device=grad_output.device) + 1 + # Relay on CUDA_DEVICE_MAX_CONNECTIONS=1 to have + # all-reduce scheduled first and have GPU resources allocated, CUDA_DEVICE_MAX_CONNECTIONS=1 is set in shardformer.py if _grad_accum_fusion_available and weight.grad is not None: grad = weight.grad @@ -228,9 +227,8 @@ def backward(ctx, grad_output): input_.shape, dtype=input_parallel.dtype, device=input_parallel.device ).contiguous() handle = dist.reduce_scatter(output, input_list, group=process_group, async_op=True) - # Delay the start of weight gradient computation shortly (3us) to have - # reduce-scatter scheduled first and have GPU resources allocated - _ = torch.empty(1, device=grad_output.device) + 1 + # Relay on CUDA_DEVICE_MAX_CONNECTIONS=1 to have + # all-reduce scheduled first and have GPU resources allocated, CUDA_DEVICE_MAX_CONNECTIONS=1 is set in shardformer.py if _grad_accum_fusion_available and weight.grad is not None: grad = weight.grad @@ -394,9 +392,8 @@ def backward(ctx, 
grad_output): input_.shape, dtype=input_parallel.dtype, device=input_parallel.device ).contiguous() handle = dist.reduce_scatter(output, input_list, group=process_group, async_op=True) - # Delay the start of weight gradient computation shortly (3us) to have - # reduce-scatter scheduled first and have GPU resources allocated - _ = torch.empty(1, device=grad_output.device) + 1 + # Relay on CUDA_DEVICE_MAX_CONNECTIONS=1 to have + # all-reduce scheduled first and have GPU resources allocated, CUDA_DEVICE_MAX_CONNECTIONS=1 is set in shardformer.py grad_weight = total_input.t().matmul(grad_output) grad_bias = grad_output.sum(dim=0) if use_bias else None @@ -431,7 +428,7 @@ def backward(ctx, grad_output): input_parallel = torch.cat(tensor_list, dim=dim).contiguous() # calculate gradient if len(input_parallel.shape) > 2: - input_parallel = input_parallel.view(-1, input_parallel.shape[-1]) + input_parallel = input_parallel.view(-1, input_parallel.shape[-1]) grad_weight = input_parallel.t().matmul(grad_output) # wait until reduce-scatter finished reducescatter_handle.wait() diff --git a/colossalai/shardformer/modeling/gpt2.py b/colossalai/shardformer/modeling/gpt2.py index 055e3096d794..3e5cc6015adc 100644 --- a/colossalai/shardformer/modeling/gpt2.py +++ b/colossalai/shardformer/modeling/gpt2.py @@ -24,6 +24,8 @@ from colossalai.shardformer.layer._operation import gather_forward_split_backward, split_forward_gather_backward from colossalai.shardformer.shard import ShardConfig +from ..layer import cross_entropy_1d + class GPT2PipelineForwards: """ @@ -326,7 +328,15 @@ def gpt2_lmhead_model_forward( shift_labels = labels[..., 1:].contiguous() # Flatten the tokens loss_fct = CrossEntropyLoss() - loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) + shift_logits = shift_logits.view(-1, shift_logits.size(-1)) + shift_labels = shift_labels.view(-1) + if shard_config.enable_tensor_parallelism: + loss = cross_entropy_1d( + shift_logits, shift_labels, process_group=shard_config.tensor_parallel_process_group + ) + else: + loss = loss_fct(shift_logits, shift_labels) + if not return_dict: output = (lm_logits,) + outputs[1:] return ((loss,) + output) if loss is not None else output @@ -1006,3 +1016,84 @@ def custom_forward(*inputs): ) return forward + + +def get_lm_forward_with_dist_cross_entropy(shard_config: ShardConfig): + from transformers import GPT2LMHeadModel + + def forward( + self: GPT2LMHeadModel, + input_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, CausalLMOutputWithCrossAttentions]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. 
you can set + `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100` + are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]` + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + transformer_outputs = self.transformer( + input_ids, + past_key_values=past_key_values, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + hidden_states = transformer_outputs[0] + + lm_logits = self.lm_head(hidden_states) + + loss = None + if labels is not None: + # move labels to correct device to enable model parallelism + labels = labels.to(lm_logits.device) + # Shift so that tokens < n predict n + shift_logits = lm_logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + # Flatten the tokens + loss_fct = CrossEntropyLoss() + shift_logits = shift_logits.view(-1, shift_logits.size(-1)) + shift_labels = shift_labels.view(-1) + if shard_config.enable_tensor_parallelism: + loss = cross_entropy_1d( + shift_logits, shift_labels, process_group=shard_config.tensor_parallel_process_group + ) + else: + loss = loss_fct(shift_logits, shift_labels) + + if not return_dict: + output = (lm_logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return CausalLMOutputWithCrossAttentions( + loss=loss, + logits=lm_logits, + past_key_values=transformer_outputs.past_key_values, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + cross_attentions=transformer_outputs.cross_attentions, + ) + + return forward diff --git a/colossalai/shardformer/policies/gpt2.py b/colossalai/shardformer/policies/gpt2.py index 022e6ff5b32c..303766993e3d 100644 --- a/colossalai/shardformer/policies/gpt2.py +++ b/colossalai/shardformer/policies/gpt2.py @@ -5,7 +5,12 @@ import colossalai.shardformer.layer as col_nn -from ..modeling.gpt2 import GPT2PipelineForwards, get_gpt2_flash_attention_forward, gpt2_sequence_parallel_forward_fn +from ..modeling.gpt2 import ( + GPT2PipelineForwards, + get_gpt2_flash_attention_forward, + get_lm_forward_with_dist_cross_entropy, + gpt2_sequence_parallel_forward_fn, +) from .base_policy import ModulePolicyDescription, Policy, SubModuleReplacementDescription __all__ = [ @@ -87,9 +92,7 @@ def module_policy(self): SubModuleReplacementDescription( suffix="mlp.c_proj", target_module=col_nn.GPT2FusedLinearConv1D_Row, - kwargs={ - "seq_parallel": use_sequence_parallel, - }, + kwargs={"seq_parallel": use_sequence_parallel}, ), SubModuleReplacementDescription( suffix="attn.attn_dropout", @@ -167,15 +170,35 @@ def get_held_layers(self) -> List[nn.Module]: stage_manager = self.pipeline_stage_manager held_layers = [] - layers_per_stage = self.distribute_layers(len(module.h), stage_manager.num_stages) - if stage_manager.is_first_stage(): - held_layers.append(module.wte) - held_layers.append(module.wpe) - held_layers.append(module.drop) - start_idx, end_idx = self.get_stage_index(layers_per_stage, stage_manager.stage) - held_layers.extend(module.h[start_idx:end_idx]) - if stage_manager.is_last_stage(): - held_layers.append(module.ln_f) + if stage_manager.is_interleave: + 
assert stage_manager.num_model_chunks is not None + layers_per_stage = self.distribute_layers( + len(module.h), stage_manager.num_stages * stage_manager.num_model_chunks + ) + stage_indices = Policy.get_stage_index( + layers_per_stage, + stage_manager.stage, + num_model_chunks=stage_manager.num_model_chunks, + num_stages=stage_manager.num_stages, + ) + if stage_manager.is_first_stage(ignore_chunk=True): + held_layers.append(module.wte) + held_layers.append(module.wpe) + held_layers.append(module.drop) + for start_idx, end_idx in stage_indices: + held_layers.extend(module.h[start_idx:end_idx]) + if stage_manager.is_last_stage(ignore_chunk=True): + held_layers.append(module.ln_f) + else: + layers_per_stage = self.distribute_layers(len(module.h), stage_manager.num_stages) + if stage_manager.is_first_stage(): + held_layers.append(module.wte) + held_layers.append(module.wpe) + held_layers.append(module.drop) + start_idx, end_idx = self.get_stage_index(layers_per_stage, stage_manager.stage) + held_layers.extend(module.h[start_idx:end_idx]) + if stage_manager.is_last_stage(): + held_layers.append(module.ln_f) return held_layers def set_pipeline_forward(self, model_cls: nn.Module, new_forward: Callable, policy: Dict) -> None: @@ -189,13 +212,27 @@ def set_pipeline_forward(self, model_cls: nn.Module, new_forward: Callable, poli else: module = self.model.transformer - layers_per_stage = Policy.distribute_layers(len(module.h), stage_manager.num_stages) - stage_index = Policy.get_stage_index(layers_per_stage, stage_manager.stage) - method_replacement = { - "forward": partial( - new_forward, stage_manager=stage_manager, stage_index=stage_index, shard_config=self.shard_config + if stage_manager.is_interleave: + layers_per_stage = self.distribute_layers( + len(module.h), stage_manager.num_stages * stage_manager.num_model_chunks + ) + stage_manager.stage_indices = Policy.get_stage_index( + layers_per_stage, + stage_manager.stage, + num_model_chunks=stage_manager.num_model_chunks, + num_stages=stage_manager.num_stages, ) - } + method_replacement = { + "forward": partial(new_forward, stage_manager=stage_manager, shard_config=self.shard_config) + } + else: + layers_per_stage = Policy.distribute_layers(len(module.h), stage_manager.num_stages) + stage_index = Policy.get_stage_index(layers_per_stage, stage_manager.stage) + method_replacement = { + "forward": partial( + new_forward, stage_manager=stage_manager, stage_index=stage_index, shard_config=self.shard_config + ) + } self.append_or_create_method_replacement(description=method_replacement, policy=policy, target_key=model_cls) @@ -232,9 +269,10 @@ def module_policy(self): GPT2LMHeadModel: ModulePolicyDescription( sub_module_replacement=[ SubModuleReplacementDescription( - suffix="lm_head", target_module=col_nn.Linear1D_Col, kwargs={"gather_output": True} + suffix="lm_head", target_module=col_nn.Linear1D_Col, kwargs={"gather_output": False} ) - ] + ], + method_replacement={"forward": get_lm_forward_with_dist_cross_entropy(self.shard_config)}, ) } module_policy.update(addon_module) @@ -249,7 +287,7 @@ def module_policy(self): def get_held_layers(self) -> List[nn.Module]: held_layers = super().get_held_layers() - if self.pipeline_stage_manager.is_last_stage(): + if self.pipeline_stage_manager.is_last_stage(ignore_chunk=True): held_layers.append(self.model.lm_head) return held_layers diff --git a/colossalai/shardformer/shard/shardformer.py b/colossalai/shardformer/shard/shardformer.py index 7a0d75bf2f2a..b132f47fd810 100644 --- 
a/colossalai/shardformer/shard/shardformer.py +++ b/colossalai/shardformer/shard/shardformer.py @@ -1,3 +1,4 @@ +import os from typing import Dict, List, Tuple import torch.nn as nn @@ -9,6 +10,9 @@ from .shard_config import ShardConfig from .sharder import ModelSharder +# set CUDA_DEVICE_MAX_CONNECTIONS=1 to ensure that when communication and computation overlap, the order of core scheduling is correct +os.environ["CUDA_DEVICE_MAX_CONNECTIONS"] = "1" + class ShardFormer: """ diff --git a/examples/__init__.py b/examples/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/examples/language/__init__.py b/examples/language/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/examples/language/llama2/data_utils.py b/examples/language/data_utils.py similarity index 99% rename from examples/language/llama2/data_utils.py rename to examples/language/data_utils.py index 6b9e8ef28eb7..ec849ef9d1eb 100644 --- a/examples/language/llama2/data_utils.py +++ b/examples/language/data_utils.py @@ -121,4 +121,4 @@ def __getitem__(self, idx): "input_ids": self.input_ids[idx], "attention_mask": self.attention_mask[idx], "labels": self.input_ids[idx], - } + } \ No newline at end of file diff --git a/examples/language/gpt/hybridparallelism/benchmark.py b/examples/language/gpt/hybridparallelism/benchmark.py new file mode 100644 index 000000000000..1315deae6eb0 --- /dev/null +++ b/examples/language/gpt/hybridparallelism/benchmark.py @@ -0,0 +1,228 @@ +import argparse +import resource +from contextlib import nullcontext + +import torch +from torch.distributed.fsdp.fully_sharded_data_parallel import CPUOffload, MixedPrecision +from torch.optim import Adam +from tqdm import tqdm +from transformers.models.gpt2.configuration_gpt2 import GPT2Config +from transformers.models.gpt2.modeling_gpt2 import GPT2LMHeadModel + +import colossalai + +# import colossalai.utils.device as device_utils +from colossalai.booster import Booster +from colossalai.booster.plugin import GeminiPlugin, HybridParallelPlugin, TorchFSDPPlugin +from colossalai.cluster import DistCoordinator +from colossalai.lazy import LazyInitContext +from colossalai.utils import get_current_device +from examples.language.data_utils import RandomDataset +from examples.language.model_utils import format_numel_str, get_model_numel +from examples.language.performance_evaluator import PerformanceEvaluator + +# ============================== +# Constants +# ============================== +MODEL_CONFIGS = { + "118M": GPT2Config(activation_function="gelu"), + "338M": GPT2Config(n_embd=1024, n_head=16, n_layer=24, activation_function="gelu"), + "738M": GPT2Config(n_embd=1280, n_head=20, n_layer=36, activation_function="gelu"), + "6.21B": GPT2Config(n_embd=4096, n_head=32, n_layer=32, n_positions=4096, activation_function="gelu"), +} + + +def main(): + # ============================== + # Parse Arguments + # ============================== + parser = argparse.ArgumentParser() + parser.add_argument("-c", "--config", type=str, default="6.21B", help="Model configuration") + parser.add_argument( + "-p", + "--plugin", + choices=["gemini", "gemini_auto", "fsdp", "fsdp_cpu", "3d", "3d_cpu"], + default="gemini", + help="Choose which plugin to use", + ) + parser.add_argument("-b", "--batch_size", type=int, default=2, help="Batch size") + parser.add_argument("-s", "--num_steps", type=int, default=200, help="Number of steps to run") + parser.add_argument("-i", "--ignore_steps", type=int, default=3, help="Number of steps to ignore") 
+ parser.add_argument("-g", "--grad_checkpoint", action="store_true", help="Use gradient checkpointing") + parser.add_argument("-l", "--max_length", type=int, default=4096, help="Max sequence length") + parser.add_argument( + "-w", "--warmup_ratio", type=float, default=0.8, help="warm up ratio of non-model data. Only for gemini-auto" + ) + parser.add_argument("-m", "--memory_limit", type=int, help="Gemini memory limit in mb") + parser.add_argument("--shard_param_frac", type=float, default=1.0, help="Shard param fraction. Only for gemini") + parser.add_argument("--offload_optim_frac", type=float, default=0.0, help="Offload optim fraction. Only for gemini") + parser.add_argument("--offload_param_frac", type=float, default=0.0, help="Offload param fraction. Only for gemini") + parser.add_argument("--tp", type=int, default=1, help="Tensor parallel size") + parser.add_argument("--extra_dp", type=int, default=1, help="Extra data parallel size, used for Gemini") + parser.add_argument("--pp", type=int, default=1, help="Pipeline parallel size") + parser.add_argument("--mbs", type=int, default=1) + parser.add_argument("--zero", type=int, default=0) + parser.add_argument("--pp_style", type=str, default="1f1b") + parser.add_argument("--num_model_chunks", type=int, default=2) + parser.add_argument("--cpu_offload", action="store_true", help="Use gradient checkpointing") + args = parser.parse_args() + + colossalai.launch_from_torch({}) + coordinator = DistCoordinator() + + def empty_init(): + pass + + # ============================== + # Initialize Booster + # ============================== + use_empty_init = True + if args.plugin == "gemini": + plugin = GeminiPlugin( + precision="bf16", + shard_param_frac=args.shard_param_frac, + offload_optim_frac=args.offload_optim_frac, + offload_param_frac=args.offload_param_frac, + tp_size=args.tp, + extra_dp_size=args.extra_dp, + ) + elif args.plugin == "gemini_auto": + plugin = GeminiPlugin( + placement_policy="auto", + precision="bf16", + warmup_non_model_data_ratio=args.warmup_ratio, + tp_size=args.tp, + extra_dp_size=args.extra_dp, + ) + elif args.plugin == "fsdp": + if use_empty_init: + plugin = TorchFSDPPlugin( + mixed_precision=MixedPrecision( + param_dtype=torch.float16, reduce_dtype=torch.float16, buffer_dtype=torch.float16 + ), + param_init_fn=empty_init(), + ) + else: + plugin = TorchFSDPPlugin( + mixed_precision=MixedPrecision( + param_dtype=torch.float16, reduce_dtype=torch.float16, buffer_dtype=torch.float16 + ) + ) + elif args.plugin == "fsdp_cpu": + if use_empty_init: + plugin = TorchFSDPPlugin( + mixed_precision=MixedPrecision( + param_dtype=torch.float16, reduce_dtype=torch.float16, buffer_dtype=torch.float16 + ), + cpu_offload=CPUOffload(offload_params=True), + param_init_fn=empty_init(), + ) + else: + plugin = TorchFSDPPlugin( + mixed_precision=MixedPrecision( + param_dtype=torch.float16, reduce_dtype=torch.float16, buffer_dtype=torch.float16 + ), + cpu_offload=CPUOffload(offload_params=True), + ) + elif args.plugin == "3d": + plugin = HybridParallelPlugin( + tp_size=args.tp, + pp_size=args.pp, + pp_style=args.pp_style, + zero_stage=args.zero, + num_model_chunks=args.num_model_chunks, + enable_all_optimization=True, + num_microbatches=args.mbs, + cpu_offload=args.cpu_offload, + precision="bf16", + ) + elif args.plugin == "3d_cpu": + plugin = HybridParallelPlugin( + tp_size=args.tp, + pp_size=args.pp, + zero_stage=args.zero, + cpu_offload=True, + enable_fused_normalization=torch.cuda.is_available(), + num_microbatches=args.mbs, + 
initial_scale=2**8, + precision="bf16", + ) + else: + raise ValueError(f"Unknown plugin {args.plugin}") + + booster = Booster(plugin=plugin) + + # ============================== + # Initialize Dataset and Dataloader + # ============================== + dp_size = plugin.dp_size if isinstance(plugin, HybridParallelPlugin) else coordinator.world_size + + config = MODEL_CONFIGS[args.config] + dataset = RandomDataset( + num_samples=args.batch_size * args.num_steps * dp_size, max_length=args.max_length, vocab_size=config.vocab_size + ) + dataloader = plugin.prepare_dataloader(dataset, batch_size=args.batch_size, shuffle=True, drop_last=True) + + # ============================== + # Initialize Model and Optimizer + # ============================== + init_ctx = ( + LazyInitContext(default_device=get_current_device()) + if isinstance(plugin, (GeminiPlugin, HybridParallelPlugin)) + else nullcontext() + ) + + with init_ctx: + model = GPT2LMHeadModel(config) + + if args.grad_checkpoint: + model.gradient_checkpointing_enable() + + model_numel = get_model_numel(model) + coordinator.print_on_master(f"Model params: {format_numel_str(model_numel)}") + performance_evaluator = PerformanceEvaluator( + model_numel, + model.config.n_layer, + model.config.n_embd, + model.config.vocab_size, + args.grad_checkpoint, + args.ignore_steps, + dp_world_size=dp_size, + ) + + optimizer = Adam(model.parameters()) + torch.set_default_dtype(torch.bfloat16) + model, optimizer, _, dataloader, _ = booster.boost(model, optimizer, dataloader=dataloader) + torch.set_default_dtype(torch.float) + coordinator.print_on_master(f"Booster init max CUDA memory: {torch.cuda.max_memory_allocated()/1024**2:.2f} MB") + coordinator.print_on_master( + f"Booster init max CPU memory: {resource.getrusage(resource.RUSAGE_SELF).ru_maxrss/1024:.2f} MB" + ) + + if isinstance(plugin, HybridParallelPlugin) and args.pp > 1: + data_iter = iter(dataloader) + for step in tqdm(range(len(dataloader)), desc="Step", disable=not coordinator.is_master()): + performance_evaluator.on_step_start(step) + booster.execute_pipeline( + data_iter, model, criterion=lambda outputs, inputs: outputs[0], optimizer=optimizer, return_loss=False + ) + optimizer.step() + optimizer.zero_grad() + performance_evaluator.on_step_end(input_ids=torch.empty(args.batch_size, args.max_length)) + else: + for step, batch in enumerate(tqdm(dataloader, desc="Step", disable=not coordinator.is_master())): + performance_evaluator.on_step_start(step) + outputs = model(**batch) + loss = outputs[0] + booster.backward(loss, optimizer) + optimizer.step() + optimizer.zero_grad() + performance_evaluator.on_step_end(**batch) + coordinator.print_on_master(f"Max CUDA memory usage: {torch.cuda.max_memory_allocated()/1024**2:.2f} MB") + + performance_evaluator.on_fit_end() + coordinator.print_on_master(f"Max CUDA memory usage: {torch.cuda.max_memory_allocated()/1024**2:.2f} MB") + + +if __name__ == "__main__": + main() diff --git a/examples/language/llama2/benchmark.py b/examples/language/llama2/benchmark.py index 54b023f64742..832465490907 100644 --- a/examples/language/llama2/benchmark.py +++ b/examples/language/llama2/benchmark.py @@ -19,6 +19,9 @@ from colossalai.cluster import DistCoordinator from colossalai.lazy import LazyInitContext from colossalai.nn.optimizer import HybridAdam +from examples.language.data_utils import RandomDataset +from examples.language.model_utils import format_numel_str, get_model_numel +from examples.language.performance_evaluator import PerformanceEvaluator # 
============================== # Constants diff --git a/examples/language/llama2/model_utils.py b/examples/language/model_utils.py similarity index 100% rename from examples/language/llama2/model_utils.py rename to examples/language/model_utils.py diff --git a/examples/language/llama2/performance_evaluator.py b/examples/language/performance_evaluator.py similarity index 100% rename from examples/language/llama2/performance_evaluator.py rename to examples/language/performance_evaluator.py diff --git a/tests/kit/model_zoo/registry.py b/tests/kit/model_zoo/registry.py index a16b16ad6af7..fce81ab52c2b 100644 --- a/tests/kit/model_zoo/registry.py +++ b/tests/kit/model_zoo/registry.py @@ -102,4 +102,4 @@ def get_sub_registry( return new_dict -model_zoo = ModelZooRegistry() +model_zoo = ModelZooRegistry() \ No newline at end of file diff --git a/tests/test_booster/test_plugin/test_3d_plugin.py b/tests/test_booster/test_plugin/test_3d_plugin.py index d629e769d715..285c4866c441 100644 --- a/tests/test_booster/test_plugin/test_3d_plugin.py +++ b/tests/test_booster/test_plugin/test_3d_plugin.py @@ -276,4 +276,4 @@ def test_gemini_plugin(early_stop: bool = True): if __name__ == "__main__": - test_gemini_plugin(early_stop=False) + test_gemini_plugin(early_stop=False) \ No newline at end of file diff --git a/tests/test_booster/test_plugin/test_gemini_plugin.py b/tests/test_booster/test_plugin/test_gemini_plugin.py index 17dfa3a1860d..0f72d2bcd3e4 100644 --- a/tests/test_booster/test_plugin/test_gemini_plugin.py +++ b/tests/test_booster/test_plugin/test_gemini_plugin.py @@ -185,4 +185,4 @@ def test_gemini_plugin_3d(early_stop: bool = True): if __name__ == "__main__": - test_gemini_plugin(early_stop=False) + test_gemini_plugin(early_stop=False) \ No newline at end of file diff --git a/tests/test_checkpoint_io/test_gemini_checkpoint_io.py b/tests/test_checkpoint_io/test_gemini_checkpoint_io.py index 61cac1d8369b..daddf6dc7ca0 100644 --- a/tests/test_checkpoint_io/test_gemini_checkpoint_io.py +++ b/tests/test_checkpoint_io/test_gemini_checkpoint_io.py @@ -186,4 +186,4 @@ def test_gemini_ckpIO_3d(): if __name__ == "__main__": - test_gemini_ckpIO() + test_gemini_ckpIO() \ No newline at end of file diff --git a/tests/test_lazy/test_models.py b/tests/test_lazy/test_models.py index d0c4cd0a7c48..aeca5f21dc1d 100644 --- a/tests/test_lazy/test_models.py +++ b/tests/test_lazy/test_models.py @@ -24,4 +24,4 @@ def test_torchvision_models_lazy_init(subset, default_device): if __name__ == "__main__": - test_torchvision_models_lazy_init("transformers", "cpu") + test_torchvision_models_lazy_init("transformers", "cpu") \ No newline at end of file diff --git a/tests/test_shardformer/test_layer/test_gpt2_qkv_fused_linear_1d.py b/tests/test_shardformer/test_layer/test_gpt2_qkv_fused_linear_1d.py index 10ffdcd7138c..e056860ede57 100644 --- a/tests/test_shardformer/test_layer/test_gpt2_qkv_fused_linear_1d.py +++ b/tests/test_shardformer/test_layer/test_gpt2_qkv_fused_linear_1d.py @@ -1,3 +1,4 @@ +import os from contextlib import nullcontext import torch @@ -11,8 +12,10 @@ from colossalai.shardformer.layer.qkv_fused_linear import split_fused_qkv_in_gpt2_style from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn - # This code is copied from https://github.com/huggingface/transformers +os.environ["CUDA_DEVICE_MAX_CONNECTIONS"] = "1" + + class Conv1D(nn.Module): """ 1D-convolutional layer as defined by Radford et al. for OpenAI GPT (and also used in GPT-2). 
diff --git a/tests/test_shardformer/test_layer/test_linear_1d.py b/tests/test_shardformer/test_layer/test_linear_1d.py index 5bacf1865c48..defa4afb919b 100644 --- a/tests/test_shardformer/test_layer/test_linear_1d.py +++ b/tests/test_shardformer/test_layer/test_linear_1d.py @@ -1,3 +1,4 @@ +import os from contextlib import nullcontext import torch @@ -11,6 +12,8 @@ from colossalai.tensor.d_tensor import is_distributed_tensor from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn +os.environ["CUDA_DEVICE_MAX_CONNECTIONS"] = "1" + def check_linear_1d_col(lazy_init: bool, seq_parallel: bool, overlap: bool): ctx = LazyInitContext() if lazy_init else nullcontext() diff --git a/tests/test_shardformer/test_layer/test_qkv_fused_linear_1d.py b/tests/test_shardformer/test_layer/test_qkv_fused_linear_1d.py index b02d581810cd..5e996d2ba985 100644 --- a/tests/test_shardformer/test_layer/test_qkv_fused_linear_1d.py +++ b/tests/test_shardformer/test_layer/test_qkv_fused_linear_1d.py @@ -1,3 +1,4 @@ +import os from contextlib import nullcontext import torch @@ -11,8 +12,10 @@ from colossalai.shardformer.layer.qkv_fused_linear import split_fused_qkv_in_gpt2_style from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn - # This code is copied from https://github.com/huggingface/transformers +os.environ["CUDA_DEVICE_MAX_CONNECTIONS"] = "1" + + class Conv1D(nn.Module): """ 1D-convolutional layer as defined by Radford et al. for OpenAI GPT (and also used in GPT-2). From 822241a99cca799e1fca250ff2fb7f54ea0f8dcd Mon Sep 17 00:00:00 2001 From: binmakeswell Date: Tue, 5 Mar 2024 12:08:58 +0800 Subject: [PATCH 12/23] [doc] sora release (#5425) * [doc] sora release * [doc] sora release * [doc] sora release * [doc] sora release --- README.md | 20 ++++++++++++++++---- applications/README.md | 1 + docs/README-zh-Hans.md | 20 ++++++++++++++++---- 3 files changed, 33 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 442e6bbcd8cf..f045c56043be 100644 --- a/README.md +++ b/README.md @@ -25,16 +25,13 @@ ## Latest News +* [2024/03] [Open-Sora:Sora Replication Solution with 46% Cost Reduction, Sequence Expansion to Nearly a Million](https://hpc-ai.com/blog/open-sora) * [2024/01] [Inference Performance Improved by 46%, Open Source Solution Breaks the Length Limit of LLM for Multi-Round Conversations](https://hpc-ai.com/blog/Colossal-AI-SwiftInfer) * [2024/01] [Construct Refined 13B Private Model With Just $5000 USD, Upgraded Colossal-AI Llama-2 Open Source](https://hpc-ai.com/blog/colossal-llama-2-13b) * [2023/11] [Enhanced MoE Parallelism, Open-source MoE Model Training Can Be 9 Times More Efficient](https://www.hpc-ai.tech/blog/enhanced-moe-parallelism-open-source-moe-model-training-can-be-9-times-more-efficient) * [2023/09] [One Half-Day of Training Using a Few Hundred Dollars Yields Similar Results to Mainstream Large Models, Open-Source and Commercial-Free Domain-Specific LLM Solution](https://www.hpc-ai.tech/blog/one-half-day-of-training-using-a-few-hundred-dollars-yields-similar-results-to-mainstream-large-models-open-source-and-commercial-free-domain-specific-llm-solution) * [2023/09] [70 Billion Parameter LLaMA2 Model Training Accelerated by 195%](https://www.hpc-ai.tech/blog/70b-llama2-training) * [2023/07] [HPC-AI Tech Raises 22 Million USD in Series A Funding](https://www.hpc-ai.tech/blog/hpc-ai-tech-raises-22-million-usd-in-series-a-funding-to-fuel-team-expansion-and-business-growth) -* [2023/07] [65B Model Pretraining Accelerated by 38%, 
Best Practices for Building LLaMA-Like Base Models Open-Source](https://www.hpc-ai.tech/blog/large-model-pretraining) -* [2023/03] [ColossalChat: An Open-Source Solution for Cloning ChatGPT With a Complete RLHF Pipeline](https://medium.com/@yangyou_berkeley/colossalchat-an-open-source-solution-for-cloning-chatgpt-with-a-complete-rlhf-pipeline-5edf08fb538b) -* [2023/03] [Intel and Colossal-AI Partner to Deliver Cost-Efficient Open-Source Solution for Protein Folding Structure Prediction](https://www.hpc-ai.tech/blog/intel-habana) -* [2023/03] [AWS and Google Fund Colossal-AI with Startup Cloud Programs](https://www.hpc-ai.tech/blog/aws-and-google-fund-colossal-ai-with-startup-cloud-programs) ## Table of Contents
    @@ -43,6 +40,7 @@
  • Colossal-AI for Real World Applications
      +
    • Open-Sora: Sora Replication Solution with 46% Cost Reduction, Sequence Expansion to Nearly a Million
    • Colossal-LLaMA-2: One Half-Day of Training Using a Few Hundred Dollars Yields Similar Results to Mainstream Large Models, Open-Source and Commercial-Free Domain-Specific LLM Solution
    • ColossalChat: An Open-Source Solution for Cloning ChatGPT With a Complete RLHF Pipeline
    • AIGC: Acceleration of Stable Diffusion
    • @@ -126,6 +124,20 @@ distributed training and inference in a few lines.

      (back to top)

      ## Colossal-AI in the Real World +### Open-Sora + +[Open-Sora](https://github.com/hpcaitech/Open-Sora): Sora Replication Solution with 46% Cost Reduction, Sequence Expansion to Nearly a Million +[[code]](https://github.com/hpcaitech/Open-Sora) +[[blog]](https://hpc-ai.com/blog/open-sora) + +


      + ### Colossal-LLaMA-2 diff --git a/applications/README.md b/applications/README.md index 49a2900f1715..8abe1e52d96c 100644 --- a/applications/README.md +++ b/applications/README.md @@ -4,6 +4,7 @@ This directory contains the applications that are powered by Colossal-AI. The list of applications include: +- [X] [Open-Sora](https://github.com/hpcaitech/Open-Sora): Sora Replication Solution with 46% Cost Reduction, Sequence Expansion to Nearly a Million - [X] [Colossal-LLaMA-2](./Colossal-LLaMA-2/): Continual Pre-training of LLaMA-2. - [X] [ColossalEval](./ColossalEval): Evaluation Pipeline for LLMs. - [X] [ColossalChat](./Chat/README.md): Replication of ChatGPT with RLHF. diff --git a/docs/README-zh-Hans.md b/docs/README-zh-Hans.md index c25f19795a20..90ad5540ae83 100644 --- a/docs/README-zh-Hans.md +++ b/docs/README-zh-Hans.md @@ -24,16 +24,13 @@ ## 新闻 +* [2024/03] [Open-Sora:Sora Replication Solution with 46% Cost Reduction, Sequence Expansion to Nearly a Million](https://hpc-ai.com/blog/open-sora) * [2024/01] [Inference Performance Improved by 46%, Open Source Solution Breaks the Length Limit of LLM for Multi-Round Conversations](https://hpc-ai.com/blog/Colossal-AI-SwiftInfer) * [2024/01] [Construct Refined 13B Private Model With Just $5000 USD, Upgraded Colossal-AI Llama-2 Open Source](https://hpc-ai.com/blog/colossal-llama-2-13b) * [2023/11] [Enhanced MoE Parallelism, Open-source MoE Model Training Can Be 9 Times More Efficient](https://www.hpc-ai.tech/blog/enhanced-moe-parallelism-open-source-moe-model-training-can-be-9-times-more-efficient) * [2023/09] [One Half-Day of Training Using a Few Hundred Dollars Yields Similar Results to Mainstream Large Models, Open-Source and Commercial-Free Domain-Specific LLM Solution](https://www.hpc-ai.tech/blog/one-half-day-of-training-using-a-few-hundred-dollars-yields-similar-results-to-mainstream-large-models-open-source-and-commercial-free-domain-specific-llm-solution) * [2023/09] [70 Billion Parameter LLaMA2 Model Training Accelerated by 195%](https://www.hpc-ai.tech/blog/70b-llama2-training) * [2023/07] [HPC-AI Tech Raises 22 Million USD in Series A Funding](https://www.hpc-ai.tech/blog/hpc-ai-tech-raises-22-million-usd-in-series-a-funding-to-fuel-team-expansion-and-business-growth) -* [2023/07] [65B Model Pretraining Accelerated by 38%, Best Practices for Building LLaMA-Like Base Models Open-Source](https://www.hpc-ai.tech/blog/large-model-pretraining) -* [2023/03] [ColossalChat: An Open-Source Solution for Cloning ChatGPT With a Complete RLHF Pipeline](https://medium.com/@yangyou_berkeley/colossalchat-an-open-source-solution-for-cloning-chatgpt-with-a-complete-rlhf-pipeline-5edf08fb538b) -* [2023/03] [Intel and Colossal-AI Partner to Deliver Cost-Efficient Open-Source Solution for Protein Folding Structure Prediction](https://www.hpc-ai.tech/blog/intel-habana) -* [2023/03] [AWS and Google Fund Colossal-AI with Startup Cloud Programs](https://www.hpc-ai.tech/blog/aws-and-google-fund-colossal-ai-with-startup-cloud-programs) ## 目录
        @@ -42,6 +39,7 @@
      • Colossal-AI 成功案例
          +
        • Open-Sora:开源Sora复现方案,成本降低46%,序列扩充至近百万
        • Colossal-LLaMA-2: 千元预算半天训练,效果媲美主流大模型,开源可商用中文LLaMA-2
        • ColossalChat:完整RLHF流程0门槛克隆ChatGPT
        • AIGC: 加速 Stable Diffusion
        • @@ -121,6 +119,20 @@ Colossal-AI 为您提供了一系列并行组件。我们的目标是让您的

          (返回顶端)

          ## Colossal-AI 成功案例 +### Open-Sora + +[Open-Sora](https://github.com/hpcaitech/Open-Sora):开源Sora复现方案,成本降低46%,序列扩充至近百万 +[[代码]](https://github.com/hpcaitech/Open-Sora) +[[博客]](https://hpc-ai.com/blog/open-sora) + +


          + ### Colossal-LLaMA-2 - 7B:千元预算半天训练,效果媲美主流大模型,开源可商用中文LLaMA-2 From 070df689e627d07f28c8087ec85a4299c73145d9 Mon Sep 17 00:00:00 2001 From: Hongxin Liu Date: Tue, 5 Mar 2024 15:35:54 +0800 Subject: [PATCH 13/23] [devops] fix extention building (#5427) --- .cuda_ext.json | 12 ++--- .../compatiblity_test_on_dispatch.yml | 2 +- .github/workflows/compatiblity_test_on_pr.yml | 2 +- .../compatiblity_test_on_schedule.yml | 2 +- .../workflows/cuda_ext_check_before_merge.yml | 2 +- .github/workflows/doc_test_on_pr.yml | 2 +- .github/workflows/doc_test_on_schedule.yml | 2 +- .../workflows/example_check_on_dispatch.yml | 2 +- .github/workflows/example_check_on_pr.yml | 2 +- .../workflows/example_check_on_schedule.yml | 2 +- colossalai/cli/check/check_installation.py | 2 +- .../inference/serving/ray_serve/README.md | 2 +- .../inference/serving/torch_serve/README.md | 2 +- .../serving/torch_serve/docker/Dockerfile | 2 +- colossalai/nn/optimizer/cpu_adam.py | 2 +- docker/Dockerfile | 2 +- docs/README-zh-Hans.md | 44 +++++++++---------- docs/source/en/get_started/installation.md | 2 +- examples/images/diffusion/README.md | 2 +- examples/images/diffusion/test_ci.sh | 2 +- examples/language/llama2/README.md | 2 +- examples/language/openmoe/README.md | 2 +- extensions/utils.py | 2 +- setup.py | 2 +- 24 files changed, 50 insertions(+), 50 deletions(-) diff --git a/.cuda_ext.json b/.cuda_ext.json index eba19cf05e31..b8269f83786c 100644 --- a/.cuda_ext.json +++ b/.cuda_ext.json @@ -1,16 +1,16 @@ { "build": [ { - "torch_command": "pip install torch==1.12.1+cu102 torchvision==0.13.1+cu102 torchaudio==0.12.1 --extra-index-url https://download.pytorch.org/whl/cu102", - "cuda_image": "hpcaitech/cuda-conda:10.2" + "torch_command": "pip install torch==2.1.0 torchvision==0.16.0 torchaudio==2.1.0 --index-url https://download.pytorch.org/whl/cu121", + "cuda_image": "hpcaitech/cuda-conda:12.1" }, { - "torch_command": "pip install torch==1.12.1+cu113 torchvision==0.13.1+cu113 torchaudio==0.12.1 --extra-index-url https://download.pytorch.org/whl/cu113", - "cuda_image": "hpcaitech/cuda-conda:11.3" + "torch_command": "pip install torch==2.1.0 torchvision==0.16.0 torchaudio==2.1.0 --index-url https://download.pytorch.org/whl/cu118", + "cuda_image": "hpcaitech/cuda-conda:11.8" }, { - "torch_command": "pip install torch==1.12.1+cu116 torchvision==0.13.1+cu116 torchaudio==0.12.1 --extra-index-url https://download.pytorch.org/whl/cu116", - "cuda_image": "hpcaitech/cuda-conda:11.6" + "torch_command": "pip install torch==2.0.0 torchvision==0.15.1 torchaudio==2.0.1", + "cuda_image": "hpcaitech/cuda-conda:11.7" } ] } diff --git a/.github/workflows/compatiblity_test_on_dispatch.yml b/.github/workflows/compatiblity_test_on_dispatch.yml index 5083212993cc..a6f9582ac901 100644 --- a/.github/workflows/compatiblity_test_on_dispatch.yml +++ b/.github/workflows/compatiblity_test_on_dispatch.yml @@ -83,7 +83,7 @@ jobs: fi - name: Install Colossal-AI run: | - CUDA_EXT=1 pip install -v . + BUILD_EXT=1 pip install -v . pip install -r requirements/requirements-test.txt - name: Unit Testing run: | diff --git a/.github/workflows/compatiblity_test_on_pr.yml b/.github/workflows/compatiblity_test_on_pr.yml index cc17c66f9c3a..ede6c380a8ec 100644 --- a/.github/workflows/compatiblity_test_on_pr.yml +++ b/.github/workflows/compatiblity_test_on_pr.yml @@ -78,7 +78,7 @@ jobs: - name: Install Colossal-AI run: | - CUDA_EXT=1 pip install -v . + BUILD_EXT=1 pip install -v . 
pip install -r requirements/requirements-test.txt - name: Unit Testing run: | diff --git a/.github/workflows/compatiblity_test_on_schedule.yml b/.github/workflows/compatiblity_test_on_schedule.yml index 158fe751bf2e..1cf456ff62c1 100644 --- a/.github/workflows/compatiblity_test_on_schedule.yml +++ b/.github/workflows/compatiblity_test_on_schedule.yml @@ -75,7 +75,7 @@ jobs: - name: Install Colossal-AI run: | - CUDA_EXT=1 pip install -v . + BUILD_EXT=1 pip install -v . pip install -r requirements/requirements-test.txt - name: Unit Testing diff --git a/.github/workflows/cuda_ext_check_before_merge.yml b/.github/workflows/cuda_ext_check_before_merge.yml index 686f0f395c73..14f53bd69ef9 100644 --- a/.github/workflows/cuda_ext_check_before_merge.yml +++ b/.github/workflows/cuda_ext_check_before_merge.yml @@ -51,4 +51,4 @@ jobs: - name: Build run: | - CUDA_EXT=1 pip install -v . + BUILD_EXT=1 pip install -v . diff --git a/.github/workflows/doc_test_on_pr.yml b/.github/workflows/doc_test_on_pr.yml index 51238905e115..8afc46b87aa2 100644 --- a/.github/workflows/doc_test_on_pr.yml +++ b/.github/workflows/doc_test_on_pr.yml @@ -89,7 +89,7 @@ jobs: - name: Install ColossalAI run: | source activate pytorch - CUDA_EXT=1 pip install -v . + BUILD_EXT=1 pip install -v . - name: Test the Doc run: | diff --git a/.github/workflows/doc_test_on_schedule.yml b/.github/workflows/doc_test_on_schedule.yml index b3536184d78a..e2491e4607f5 100644 --- a/.github/workflows/doc_test_on_schedule.yml +++ b/.github/workflows/doc_test_on_schedule.yml @@ -32,7 +32,7 @@ jobs: - name: Install ColossalAI run: | - CUDA_EXT=1 pip install -v . + BUILD_EXT=1 pip install -v . - name: Install Doc Test Requirements run: | diff --git a/.github/workflows/example_check_on_dispatch.yml b/.github/workflows/example_check_on_dispatch.yml index bba321fd2d59..24e726b4f16d 100644 --- a/.github/workflows/example_check_on_dispatch.yml +++ b/.github/workflows/example_check_on_dispatch.yml @@ -53,7 +53,7 @@ jobs: uses: actions/checkout@v3 - name: Install Colossal-AI run: | - CUDA_EXT=1 pip install -v . + BUILD_EXT=1 pip install -v . - name: Test the example run: | dir=${{ matrix.directory }} diff --git a/.github/workflows/example_check_on_pr.yml b/.github/workflows/example_check_on_pr.yml index fcff8e569ff7..728f059c1bb3 100644 --- a/.github/workflows/example_check_on_pr.yml +++ b/.github/workflows/example_check_on_pr.yml @@ -88,7 +88,7 @@ jobs: - name: Install Colossal-AI run: | - CUDA_EXT=1 pip install -v . + BUILD_EXT=1 pip install -v . - name: Test the example run: | diff --git a/.github/workflows/example_check_on_schedule.yml b/.github/workflows/example_check_on_schedule.yml index abb9479492e7..efb131a864cb 100644 --- a/.github/workflows/example_check_on_schedule.yml +++ b/.github/workflows/example_check_on_schedule.yml @@ -42,7 +42,7 @@ jobs: - name: Install Colossal-AI run: | - CUDA_EXT=1 pip install -v . + BUILD_EXT=1 pip install -v . - name: Traverse all files run: | diff --git a/colossalai/cli/check/check_installation.py b/colossalai/cli/check/check_installation.py index 772c513ffa06..f5602bbe6155 100644 --- a/colossalai/cli/check/check_installation.py +++ b/colossalai/cli/check/check_installation.py @@ -76,7 +76,7 @@ def check_installation(): click.echo("") click.echo(f"Note:") click.echo( - f"1. AOT (ahead-of-time) compilation of the CUDA kernels occurs during installation when the environment variable CUDA_EXT=1 is set" + f"1. 
AOT (ahead-of-time) compilation of the CUDA kernels occurs during installation when the environment variable BUILD_EXT=1 is set" ) click.echo(f"2. If AOT compilation is not enabled, stay calm as the CUDA kernels can still be built during runtime") diff --git a/colossalai/legacy/inference/serving/ray_serve/README.md b/colossalai/legacy/inference/serving/ray_serve/README.md index 1d408238760b..888f04bb50f9 100644 --- a/colossalai/legacy/inference/serving/ray_serve/README.md +++ b/colossalai/legacy/inference/serving/ray_serve/README.md @@ -25,7 +25,7 @@ conda install -c conda-forge cupy cudnn cutensor nccl cuda-version=11.6 # install colossalai with PyTorch extensions cd -CUDA_EXT=1 pip install -e . +BUILD_EXT=1 pip install -e . # install other dependencies pip install triton==2.0.0.dev20221202 diff --git a/colossalai/legacy/inference/serving/torch_serve/README.md b/colossalai/legacy/inference/serving/torch_serve/README.md index 6bd145bc30ae..fcf2e36d23c5 100644 --- a/colossalai/legacy/inference/serving/torch_serve/README.md +++ b/colossalai/legacy/inference/serving/torch_serve/README.md @@ -25,7 +25,7 @@ conda install -c "nvidia/label/cuda-11.6.2" cuda-toolkit cd pip install -r requirements/requirements.txt pip install -r requirements/requirements-test.txt -CUDA_EXT=1 pip install -e . +BUILD_EXT=1 pip install -e . # install torchserve cd diff --git a/colossalai/legacy/inference/serving/torch_serve/docker/Dockerfile b/colossalai/legacy/inference/serving/torch_serve/docker/Dockerfile index 6d780a84747f..755812397932 100644 --- a/colossalai/legacy/inference/serving/torch_serve/docker/Dockerfile +++ b/colossalai/legacy/inference/serving/torch_serve/docker/Dockerfile @@ -38,7 +38,7 @@ ARG VERSION=main RUN git clone -b ${VERSION} https://github.com/hpcaitech/ColossalAI.git && \ cd ./ColossalAI && \ git checkout 3e05c07bb8921f2a8f9736b6f6673d4e9f1697d0 && \ - CUDA_EXT=1 pip install -v --no-cache-dir . + BUILD_EXT=1 pip install -v --no-cache-dir . # install titans RUN pip install --no-cache-dir titans diff --git a/colossalai/nn/optimizer/cpu_adam.py b/colossalai/nn/optimizer/cpu_adam.py index 5be629fb2045..68fb582e5d1f 100644 --- a/colossalai/nn/optimizer/cpu_adam.py +++ b/colossalai/nn/optimizer/cpu_adam.py @@ -78,7 +78,7 @@ def __init__( super(CPUAdam, self).__init__(model_params, default_args, nvme_offload_fraction, nvme_offload_dir) self.adamw_mode = adamw_mode cpu_adam = CPUAdamLoader().load() - # if you find yourself stuck here, make sure that you install colossalai with CUDA_EXT=1 specification + # if you find yourself stuck here, make sure that you install colossalai with BUILD_EXT=1 specification self.cpu_adam_op = cpu_adam.CPUAdamOptimizer(lr, betas[0], betas[1], eps, weight_decay, adamw_mode) def torch_adam_update( diff --git a/docker/Dockerfile b/docker/Dockerfile index 26d3fab1b6d7..0e796a9d4a95 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -37,7 +37,7 @@ RUN git clone https://github.com/NVIDIA/apex && \ ARG VERSION=main RUN git clone -b ${VERSION} https://github.com/hpcaitech/ColossalAI.git \ && cd ./ColossalAI \ - && CUDA_EXT=1 pip install -v --no-cache-dir . + && BUILD_EXT=1 pip install -v --no-cache-dir . 
# install titans RUN pip install --no-cache-dir titans diff --git a/docs/README-zh-Hans.md b/docs/README-zh-Hans.md index 90ad5540ae83..bc4106d12642 100644 --- a/docs/README-zh-Hans.md +++ b/docs/README-zh-Hans.md @@ -146,25 +146,25 @@ Colossal-AI 为您提供了一系列并行组件。我们的目标是让您的 [[HuggingFace model weights]](https://huggingface.co/hpcai-tech/Colossal-LLaMA-2-13b-base) [[Modelscope model weights]](https://www.modelscope.cn/models/colossalai/Colossal-LLaMA-2-13b-base/summary) -| Model | Backbone | Tokens Consumed | MMLU (5-shot) | CMMLU (5-shot)| AGIEval (5-shot) | GAOKAO (0-shot) | CEval (5-shot) | -| :----------------------------: | :--------: | :-------------: | :------------------: | :-----------: | :--------------: | :-------------: | :-------------: | -| Baichuan-7B | - | 1.2T | 42.32 (42.30) | 44.53 (44.02) | 38.72 | 36.74 | 42.80 | -| Baichuan-13B-Base | - | 1.4T | 50.51 (51.60) | 55.73 (55.30) | 47.20 | 51.41 | 53.60 | -| Baichuan2-7B-Base | - | 2.6T | 46.97 (54.16) | 57.67 (57.07) | 45.76 | 52.60 | 54.00 | -| Baichuan2-13B-Base | - | 2.6T | 54.84 (59.17) | 62.62 (61.97) | 52.08 | 58.25 | 58.10 | -| ChatGLM-6B | - | 1.0T | 39.67 (40.63) | 41.17 (-) | 40.10 | 36.53 | 38.90 | -| ChatGLM2-6B | - | 1.4T | 44.74 (45.46) | 49.40 (-) | 46.36 | 45.49 | 51.70 | -| InternLM-7B | - | 1.6T | 46.70 (51.00) | 52.00 (-) | 44.77 | 61.64 | 52.80 | -| Qwen-7B | - | 2.2T | 54.29 (56.70) | 56.03 (58.80) | 52.47 | 56.42 | 59.60 | -| Llama-2-7B | - | 2.0T | 44.47 (45.30) | 32.97 (-) | 32.60 | 25.46 | - | -| Linly-AI/Chinese-LLaMA-2-7B-hf | Llama-2-7B | 1.0T | 37.43 | 29.92 | 32.00 | 27.57 | - | -| wenge-research/yayi-7b-llama2 | Llama-2-7B | - | 38.56 | 31.52 | 30.99 | 25.95 | - | -| ziqingyang/chinese-llama-2-7b | Llama-2-7B | - | 33.86 | 34.69 | 34.52 | 25.18 | 34.2 | -| TigerResearch/tigerbot-7b-base | Llama-2-7B | 0.3T | 43.73 | 42.04 | 37.64 | 30.61 | - | -| LinkSoul/Chinese-Llama-2-7b | Llama-2-7B | - | 48.41 | 38.31 | 38.45 | 27.72 | - | -| FlagAlpha/Atom-7B | Llama-2-7B | 0.1T | 49.96 | 41.10 | 39.83 | 33.00 | - | -| IDEA-CCNL/Ziya-LLaMA-13B-v1.1 | Llama-13B | 0.11T | 50.25 | 40.99 | 40.04 | 30.54 | - | -| **Colossal-LLaMA-2-7b-base** | Llama-2-7B | **0.0085T** | 53.06 | 49.89 | 51.48 | 58.82 | 50.2 | +| Model | Backbone | Tokens Consumed | MMLU (5-shot) | CMMLU (5-shot) | AGIEval (5-shot) | GAOKAO (0-shot) | CEval (5-shot) | +|:------------------------------:|:----------:|:---------------:|:-------------:|:--------------:|:----------------:|:---------------:|:--------------:| +| Baichuan-7B | - | 1.2T | 42.32 (42.30) | 44.53 (44.02) | 38.72 | 36.74 | 42.80 | +| Baichuan-13B-Base | - | 1.4T | 50.51 (51.60) | 55.73 (55.30) | 47.20 | 51.41 | 53.60 | +| Baichuan2-7B-Base | - | 2.6T | 46.97 (54.16) | 57.67 (57.07) | 45.76 | 52.60 | 54.00 | +| Baichuan2-13B-Base | - | 2.6T | 54.84 (59.17) | 62.62 (61.97) | 52.08 | 58.25 | 58.10 | +| ChatGLM-6B | - | 1.0T | 39.67 (40.63) | 41.17 (-) | 40.10 | 36.53 | 38.90 | +| ChatGLM2-6B | - | 1.4T | 44.74 (45.46) | 49.40 (-) | 46.36 | 45.49 | 51.70 | +| InternLM-7B | - | 1.6T | 46.70 (51.00) | 52.00 (-) | 44.77 | 61.64 | 52.80 | +| Qwen-7B | - | 2.2T | 54.29 (56.70) | 56.03 (58.80) | 52.47 | 56.42 | 59.60 | +| Llama-2-7B | - | 2.0T | 44.47 (45.30) | 32.97 (-) | 32.60 | 25.46 | - | +| Linly-AI/Chinese-LLaMA-2-7B-hf | Llama-2-7B | 1.0T | 37.43 | 29.92 | 32.00 | 27.57 | - | +| wenge-research/yayi-7b-llama2 | Llama-2-7B | - | 38.56 | 31.52 | 30.99 | 25.95 | - | +| ziqingyang/chinese-llama-2-7b | Llama-2-7B | - | 33.86 | 34.69 | 34.52 | 25.18 | 34.2 | +| 
TigerResearch/tigerbot-7b-base | Llama-2-7B | 0.3T | 43.73 | 42.04 | 37.64 | 30.61 | - | +| LinkSoul/Chinese-Llama-2-7b | Llama-2-7B | - | 48.41 | 38.31 | 38.45 | 27.72 | - | +| FlagAlpha/Atom-7B | Llama-2-7B | 0.1T | 49.96 | 41.10 | 39.83 | 33.00 | - | +| IDEA-CCNL/Ziya-LLaMA-13B-v1.1 | Llama-13B | 0.11T | 50.25 | 40.99 | 40.04 | 30.54 | - | +| **Colossal-LLaMA-2-7b-base** | Llama-2-7B | **0.0085T** | 53.06 | 49.89 | 51.48 | 58.82 | 50.2 | ### ColossalChat @@ -406,10 +406,10 @@ pip install colossalai **注:目前只支持Linux。** -但是,如果你想在安装时就直接构建PyTorch扩展,您可以设置环境变量`CUDA_EXT=1`. +但是,如果你想在安装时就直接构建PyTorch扩展,您可以设置环境变量`BUILD_EXT=1`. ```bash -CUDA_EXT=1 pip install colossalai +BUILD_EXT=1 pip install colossalai ``` **否则,PyTorch扩展只会在你实际需要使用他们时在运行时里被构建。** @@ -438,7 +438,7 @@ pip install . 我们默认在`pip install`时不安装PyTorch扩展,而是在运行时临时编译,如果你想要提前安装这些扩展的话(在使用融合优化器时会用到),可以使用一下命令。 ```shell -CUDA_EXT=1 pip install . +BUILD_EXT=1 pip install . ```

          (返回顶端)

          diff --git a/docs/source/en/get_started/installation.md b/docs/source/en/get_started/installation.md index f9c8fe4758c8..50325462d522 100644 --- a/docs/source/en/get_started/installation.md +++ b/docs/source/en/get_started/installation.md @@ -42,7 +42,7 @@ pip install -r requirements/requirements.txt BUILD_EXT=1 pip install . ``` -If you don't want to install and enable CUDA kernel fusion (compulsory installation when using fused optimizer), just don't specify the `CUDA_EXT`: +If you don't want to install and enable CUDA kernel fusion (compulsory installation when using fused optimizer), just don't specify the `BUILD_EXT`: ```shell pip install . diff --git a/examples/images/diffusion/README.md b/examples/images/diffusion/README.md index d6a1c47d6b87..5434551f4fb4 100644 --- a/examples/images/diffusion/README.md +++ b/examples/images/diffusion/README.md @@ -77,7 +77,7 @@ git clone https://github.com/hpcaitech/ColossalAI.git cd ColossalAI # install colossalai -CUDA_EXT=1 pip install . +BUILD_EXT=1 pip install . ``` #### Step 3: Accelerate with flash attention by xformers (Optional) diff --git a/examples/images/diffusion/test_ci.sh b/examples/images/diffusion/test_ci.sh index 44cf47046684..652db5d3918a 100755 --- a/examples/images/diffusion/test_ci.sh +++ b/examples/images/diffusion/test_ci.sh @@ -8,7 +8,7 @@ conda activate ldm conda install pytorch==1.12.1 torchvision==0.13.1 torchaudio==0.12.1 cudatoolkit=11.3 -c pytorch pip install transformers diffusers invisible-watermark -CUDA_EXT=1 pip install colossalai +BUILD_EXT=1 pip install colossalai wget https://huggingface.co/stabilityai/stable-diffusion-2-base/resolve/main/512-base-ema.ckpt diff --git a/examples/language/llama2/README.md b/examples/language/llama2/README.md index 752453b5a7e3..068f15cbb041 100644 --- a/examples/language/llama2/README.md +++ b/examples/language/llama2/README.md @@ -53,7 +53,7 @@ We follow the hyperparameter settings from the original LLaMA paper. We use Adam Please install the latest ColossalAI from source. ```bash -CUDA_EXT=1 pip install -U git+https://github.com/hpcaitech/ColossalAI +BUILD_EXT=1 pip install -U git+https://github.com/hpcaitech/ColossalAI ``` Then install other dependencies. diff --git a/examples/language/openmoe/README.md b/examples/language/openmoe/README.md index 45657f192024..f62223c9319d 100644 --- a/examples/language/openmoe/README.md +++ b/examples/language/openmoe/README.md @@ -17,7 +17,7 @@ Please install the latest ColossalAI from source. ```bash -CUDA_EXT=1 pip install -U git+https://github.com/hpcaitech/ColossalAI +BUILD_EXT=1 pip install -U git+https://github.com/hpcaitech/ColossalAI ``` Then install dependencies. diff --git a/extensions/utils.py b/extensions/utils.py index 3f75f952d57b..d5d87a77a9c0 100644 --- a/extensions/utils.py +++ b/extensions/utils.py @@ -154,7 +154,7 @@ def check_cuda_availability(): def set_cuda_arch_list(cuda_dir): """ This function sets the PyTorch TORCH_CUDA_ARCH_LIST variable for ahead-of-time extension compilation. - Ahead-of-time compilation occurs when CUDA_EXT=1 is set when running 'pip install'. + Ahead-of-time compilation occurs when BUILD_EXT=1 is set when running 'pip install'. """ cuda_available = check_cuda_availability() diff --git a/setup.py b/setup.py index e54ec41ea9f8..ef89481e6b1e 100644 --- a/setup.py +++ b/setup.py @@ -70,7 +70,7 @@ def get_version() -> str: if BUILD_EXT: if not TORCH_AVAILABLE: raise ModuleNotFoundError( - "[extension] PyTorch is not found while CUDA_EXT=1. 
You need to install PyTorch first in order to build CUDA extensions" + "[extension] PyTorch is not found while BUILD_EXT=1. You need to install PyTorch first in order to build CUDA extensions" ) from extensions import ALL_EXTENSIONS From e304e4db354906493f9e22866b8fcee5f403a829 Mon Sep 17 00:00:00 2001 From: MickeyCHAN <76671016+danyow-cheung@users.noreply.github.com> Date: Tue, 5 Mar 2024 21:41:23 +0800 Subject: [PATCH 14/23] [hotfix] fix sd vit import error (#5420) * fix import error * Update dpt_depth.py --------- Co-authored-by: binmakeswell --- examples/images/diffusion/ldm/modules/midas/midas/dpt_depth.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/images/diffusion/ldm/modules/midas/midas/dpt_depth.py b/examples/images/diffusion/ldm/modules/midas/midas/dpt_depth.py index 74871e8b1fce..fbb0e0563b4f 100644 --- a/examples/images/diffusion/ldm/modules/midas/midas/dpt_depth.py +++ b/examples/images/diffusion/ldm/modules/midas/midas/dpt_depth.py @@ -2,7 +2,8 @@ import torch.nn as nn from .base_model import BaseModel -from .blocks import FeatureFusionBlock_custom, Interpolate, _make_encoder, forward_vit +from .blocks import FeatureFusionBlock_custom, Interpolate, _make_encoder +from .vit import forward_vit def _make_fusion_block(features, use_bn): From e239cf9060d90a22557d685c9d6ce350520c5337 Mon Sep 17 00:00:00 2001 From: Luo Yihang Date: Tue, 5 Mar 2024 21:44:38 +0800 Subject: [PATCH 15/23] [hotfix] fix typo of openmoe model source (#5403) --- examples/language/openmoe/benchmark/benchmark_cai.py | 2 +- examples/language/openmoe/benchmark/benchmark_fsdp.py | 2 +- examples/language/openmoe/infer.py | 6 +++--- examples/language/openmoe/train.py | 4 ++-- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/examples/language/openmoe/benchmark/benchmark_cai.py b/examples/language/openmoe/benchmark/benchmark_cai.py index 03b660ecf446..770c500d86bf 100644 --- a/examples/language/openmoe/benchmark/benchmark_cai.py +++ b/examples/language/openmoe/benchmark/benchmark_cai.py @@ -207,7 +207,7 @@ def main(): coordinator.print_on_master(f"Set plugin as {plugin}") # Build OpenMoe model - repo_name = "hpcaitech/openmoe-" + args.model_name + repo_name = "hpcai-tech/openmoe-" + args.model_name config = LlamaConfig.from_pretrained(repo_name) set_openmoe_args( config, diff --git a/examples/language/openmoe/benchmark/benchmark_fsdp.py b/examples/language/openmoe/benchmark/benchmark_fsdp.py index 7f438fc5acce..b00fbd001022 100644 --- a/examples/language/openmoe/benchmark/benchmark_fsdp.py +++ b/examples/language/openmoe/benchmark/benchmark_fsdp.py @@ -53,7 +53,7 @@ def fsdp_main(rank, world_size, args): train_loader = torch.utils.data.DataLoader(dataset, **train_kwargs) torch.cuda.set_device(rank) - config = LlamaConfig.from_pretrained("hpcaitech/openmoe-%s" % args.model_name) + config = LlamaConfig.from_pretrained("hpcai-tech/openmoe-%s" % args.model_name) set_openmoe_args( config, num_experts=config.num_experts, diff --git a/examples/language/openmoe/infer.py b/examples/language/openmoe/infer.py index db90c6e34507..04df64531937 100644 --- a/examples/language/openmoe/infer.py +++ b/examples/language/openmoe/infer.py @@ -15,19 +15,19 @@ def parse_args(): def inference(args): tokenizer = T5Tokenizer.from_pretrained("google/umt5-small") if args.model == "test": - config = LlamaConfig.from_pretrained("hpcaitech/openmoe-base") + config = LlamaConfig.from_pretrained("hpcai-tech/openmoe-base") set_openmoe_args(config, num_experts=config.num_experts, 
moe_layer_interval=config.moe_layer_interval, enable_kernel=True) model = OpenMoeForCausalLM(config) else: - config = LlamaConfig.from_pretrained(f"hpcaitech/openmoe-{args.model}") + config = LlamaConfig.from_pretrained(f"hpcai-tech/openmoe-{args.model}") set_openmoe_args(config, num_experts=config.num_experts, moe_layer_interval=config.moe_layer_interval, enable_kernel=False) - model = OpenMoeForCausalLM.from_pretrained(f"hpcaitech/openmoe-{args.model}", config=config) + model = OpenMoeForCausalLM.from_pretrained(f"hpcai-tech/openmoe-{args.model}", config=config) model = model.eval().bfloat16() model = model.to(torch.cuda.current_device()) diff --git a/examples/language/openmoe/train.py b/examples/language/openmoe/train.py index 1ae661f548b8..89c4d5420994 100644 --- a/examples/language/openmoe/train.py +++ b/examples/language/openmoe/train.py @@ -269,12 +269,12 @@ def main(): # Build OpenMoe model if test_mode: - config = LlamaConfig.from_pretrained("hpcaitech/openmoe-base") + config = LlamaConfig.from_pretrained("hpcai-tech/openmoe-base") config.hidden_size = 128 config.intermediate_size = 256 config.vocab_size = 32000 else: - repo_name = "hpcaitech/openmoe-" + args.model_name + repo_name = "hpcai-tech/openmoe-" + args.model_name config = LlamaConfig.from_pretrained(repo_name) set_openmoe_args( config, From 70cce5cbed8046f82389f76f0916fc060aef9243 Mon Sep 17 00:00:00 2001 From: digger yu Date: Tue, 5 Mar 2024 21:45:55 +0800 Subject: [PATCH 16/23] [doc] update some translations with README-zh-Hans.md (#5382) --- docs/README-zh-Hans.md | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/docs/README-zh-Hans.md b/docs/README-zh-Hans.md index bc4106d12642..110e1a502b0f 100644 --- a/docs/README-zh-Hans.md +++ b/docs/README-zh-Hans.md @@ -141,10 +141,10 @@ Colossal-AI 为您提供了一系列并行组件。我们的目标是让您的 [[模型权重]](https://huggingface.co/hpcai-tech/Colossal-LLaMA-2-7b-base) - 13B: 万元预算打造高质量13B私有模型 -[[code]](https://github.com/hpcaitech/ColossalAI/tree/main/applications/Colossal-LLaMA-2) -[[blog]](https://hpc-ai.com/blog/colossal-llama-2-13b) -[[HuggingFace model weights]](https://huggingface.co/hpcai-tech/Colossal-LLaMA-2-13b-base) -[[Modelscope model weights]](https://www.modelscope.cn/models/colossalai/Colossal-LLaMA-2-13b-base/summary) +[[代码]](https://github.com/hpcaitech/ColossalAI/tree/main/applications/Colossal-LLaMA-2) +[[博客]](https://hpc-ai.com/blog/colossal-llama-2-13b) +[[HuggingFace 模型权重]](https://huggingface.co/hpcai-tech/Colossal-LLaMA-2-13b-base) +[[Modelscope 模型权重]](https://www.modelscope.cn/models/colossalai/Colossal-LLaMA-2-13b-base/summary) | Model | Backbone | Tokens Consumed | MMLU (5-shot) | CMMLU (5-shot) | AGIEval (5-shot) | GAOKAO (0-shot) | CEval (5-shot) | |:------------------------------:|:----------:|:---------------:|:-------------:|:--------------:|:----------------:|:---------------:|:--------------:| @@ -264,8 +264,8 @@ Colossal-AI 为您提供了一系列并行组件。我们的目标是让您的

          - 700亿参数LLaMA2训练加速195% -[[code]](https://github.com/hpcaitech/ColossalAI/tree/main/examples/language/llama2) -[[blog]](https://www.hpc-ai.tech/blog/70b-llama2-training) +[[代码]](https://github.com/hpcaitech/ColossalAI/tree/main/examples/language/llama2) +[[博客]](https://www.hpc-ai.tech/blog/70b-llama2-training) ### LLaMA1

          @@ -361,7 +361,8 @@ Colossal-AI 为您提供了一系列并行组件。我们的目标是让您的

          -- [SwiftInfer](https://github.com/hpcaitech/SwiftInfer): Inference performance improved by 46%, open source solution breaks the length limit of LLM for multi-round conversations +- [SwiftInfer](https://github.com/hpcaitech/SwiftInfer): 开源解决方案打破了多轮对话的 LLM 长度限制,推理性能提高了46% +

          From 16c96d4d8cbe26b5ee32a35fd5ee809e035c9e96 Mon Sep 17 00:00:00 2001 From: digger yu Date: Tue, 5 Mar 2024 21:47:48 +0800 Subject: [PATCH 17/23] [hotfix] fix typo change _descrption to _description (#5331) --- .../inference/engine/microbatch_manager.py | 54 +++++++++---------- .../inference/pipeline/microbatch_manager.py | 54 +++++++++---------- colossalai/pipeline/schedule/generate.py | 12 ++--- 3 files changed, 60 insertions(+), 60 deletions(-) diff --git a/colossalai/inference/engine/microbatch_manager.py b/colossalai/inference/engine/microbatch_manager.py index d698c89f9936..7264b81e06a0 100644 --- a/colossalai/inference/engine/microbatch_manager.py +++ b/colossalai/inference/engine/microbatch_manager.py @@ -17,8 +17,8 @@ class Status(Enum): class MicroBatchDescription: """ - This is the class to record the infomation of each microbatch, and also do some update operation. - This clase is the base class of `HeadMicroBatchDescription` and `BodyMicroBatchDescription`, for more + This is the class to record the information of each microbatch, and also do some update operation. + This class is the base class of `HeadMicroBatchDescription` and `BodyMicroBatchDescription`, for more details, please refer to the doc of these two classes blow. Args: @@ -61,15 +61,15 @@ def state(self): @property def cur_length(self): """ - Return the current sequnence length of micro batch + Return the current sequence length of micro batch """ class HeadMicroBatchDescription(MicroBatchDescription): """ - This class is used to record the infomation of the first stage of pipeline, the first stage should have attributes `input_ids` and `attention_mask` - and `new_tokens`, and the `new_tokens` is the tokens generated by the first stage. Also due to the schdule of pipeline, the operation to update the + This class is used to record the information of the first stage of pipeline, the first stage should have attributes `input_ids` and `attention_mask` + and `new_tokens`, and the `new_tokens` is the tokens generated by the first stage. Also due to the schedule of pipeline, the operation to update the information and the condition to determine the state is different from other stages. Args: @@ -123,7 +123,7 @@ def cur_length(self): class BodyMicroBatchDescription(MicroBatchDescription): """ - This class is used to record the infomation of the stages except the first stage of pipeline, the stages should have attributes `hidden_states` and `past_key_values`, + This class is used to record the information of the stages except the first stage of pipeline, the stages should have attributes `hidden_states` and `past_key_values`, Args: inputs_dict (Dict[str, torch.Tensor]): will always be `None`. Other stages only receive hiddenstates from previous stage. 
@@ -173,76 +173,76 @@ def __init__( self.max_input_len = max_input_len self.max_output_len = max_output_len self.cache_manager_list = cache_manager_list - self.mb_descrption_buffer = {} + self.mb_description_buffer = {} self.new_tokens_buffer = {} self.idx = 0 - def add_descrption(self, inputs_dict: Dict[str, torch.Tensor]): + def add_description(self, inputs_dict: Dict[str, torch.Tensor]): if self.stage == 0: - self.mb_descrption_buffer[self.idx] = HeadMicroBatchDescription( + self.mb_description_buffer[self.idx] = HeadMicroBatchDescription( inputs_dict, self.max_input_len, self.max_output_len, self.cache_manager_list[self.idx] ) else: - self.mb_descrption_buffer[self.idx] = BodyMicroBatchDescription( + self.mb_description_buffer[self.idx] = BodyMicroBatchDescription( inputs_dict, self.max_input_len, self.max_output_len, self.cache_manager_list[self.idx] ) def step(self, new_token: torch.Tensor = None): """ Update the state if microbatch manager, 2 conditions. - 1. For first stage in PREFILL, receive inputs and outputs, `_add_descrption` will save its inputs. - 2. For other conditon, only receive the output of previous stage, and update the descrption. + 1. For first stage in PREFILL, receive inputs and outputs, `_add_description` will save its inputs. + 2. For other condition, only receive the output of previous stage, and update the description. Args: inputs_dict (Dict[str, torch.Tensor]): the inputs of current stage. The key should have `input_ids` and `attention_mask`. output_dict (Dict[str, torch.Tensor]): the outputs of previous stage. The key should have `hidden_states` and `past_key_values`. new_token (torch.Tensor): the new token generated by current stage. """ - # Add descrption first if the descrption is None - self.cur_descrption.update(new_token) + # Add description first if the description is None + self.cur_description.update(new_token) return self.cur_state def export_new_tokens(self): new_tokens_list = [] - for i in self.mb_descrption_buffer.values(): + for i in self.mb_description_buffer.values(): new_tokens_list.extend(i.new_tokens.tolist()) return new_tokens_list def is_micro_batch_done(self): - if len(self.mb_descrption_buffer) == 0: + if len(self.mb_description_buffer) == 0: return False - for mb in self.mb_descrption_buffer.values(): + for mb in self.mb_description_buffer.values(): if mb.state != Status.DONE: return False return True def clear(self): - self.mb_descrption_buffer.clear() + self.mb_description_buffer.clear() for cache in self.cache_manager_list: cache.free_all() def next(self): self.idx = (self.idx + 1) % self.buffer_size - def _remove_descrption(self): - self.mb_descrption_buffer.pop(self.idx) + def _remove_description(self): + self.mb_description_buffer.pop(self.idx) @property - def cur_descrption(self) -> MicroBatchDescription: - return self.mb_descrption_buffer.get(self.idx) + def cur_description(self) -> MicroBatchDescription: + return self.mb_description_buffer.get(self.idx) @property def cur_infer_state(self): - if self.cur_descrption is None: + if self.cur_description is None: return None - return self.cur_descrption.infer_state + return self.cur_description.infer_state @property def cur_state(self): """ - Return the state of current micro batch, when current descrption is None, the state is PREFILL + Return the state of current micro batch, when current description is None, the state is PREFILL """ - if self.cur_descrption is None: + if self.cur_description is None: return Status.PREFILL - return self.cur_descrption.state + return 
self.cur_description.state diff --git a/colossalai/legacy/inference/pipeline/microbatch_manager.py b/colossalai/legacy/inference/pipeline/microbatch_manager.py index 441cf603985c..cb0a8c1a9332 100644 --- a/colossalai/legacy/inference/pipeline/microbatch_manager.py +++ b/colossalai/legacy/inference/pipeline/microbatch_manager.py @@ -18,8 +18,8 @@ class Status(Enum): class MicroBatchDescription: """ - This is the class to record the infomation of each microbatch, and also do some update operation. - This clase is the base class of `HeadMicroBatchDescription` and `BodyMicroBatchDescription`, for more + This is the class to record the information of each microbatch, and also do some update operation. + This class is the base class of `HeadMicroBatchDescription` and `BodyMicroBatchDescription`, for more details, please refer to the doc of these two classes blow. Args: @@ -62,15 +62,15 @@ def state(self): @property def cur_length(self): """ - Return the current sequnence length of micro batch + Return the current sequence length of micro batch """ class HeadMicroBatchDescription(MicroBatchDescription): """ - This class is used to record the infomation of the first stage of pipeline, the first stage should have attributes `input_ids` and `attention_mask` - and `new_tokens`, and the `new_tokens` is the tokens generated by the first stage. Also due to the schdule of pipeline, the operation to update the + This class is used to record the information of the first stage of pipeline, the first stage should have attributes `input_ids` and `attention_mask` + and `new_tokens`, and the `new_tokens` is the tokens generated by the first stage. Also due to the schedule of pipeline, the operation to update the information and the condition to determine the state is different from other stages. Args: @@ -124,7 +124,7 @@ def cur_length(self): class BodyMicroBatchDescription(MicroBatchDescription): """ - This class is used to record the infomation of the stages except the first stage of pipeline, the stages should have attributes `hidden_states` and `past_key_values`, + This class is used to record the information of the stages except the first stage of pipeline, the stages should have attributes `hidden_states` and `past_key_values`, Args: inputs_dict (Dict[str, torch.Tensor]): will always be `None`. Other stages only receive hiddenstates from previous stage. @@ -174,76 +174,76 @@ def __init__( self.max_input_len = max_input_len self.max_output_len = max_output_len self.cache_manager_list = cache_manager_list - self.mb_descrption_buffer = {} + self.mb_description_buffer = {} self.new_tokens_buffer = {} self.idx = 0 - def add_descrption(self, inputs_dict: Dict[str, torch.Tensor]): + def add_description(self, inputs_dict: Dict[str, torch.Tensor]): if self.stage == 0: - self.mb_descrption_buffer[self.idx] = HeadMicroBatchDescription( + self.mb_description_buffer[self.idx] = HeadMicroBatchDescription( inputs_dict, self.max_input_len, self.max_output_len, self.cache_manager_list[self.idx] ) else: - self.mb_descrption_buffer[self.idx] = BodyMicroBatchDescription( + self.mb_description_buffer[self.idx] = BodyMicroBatchDescription( inputs_dict, self.max_input_len, self.max_output_len, self.cache_manager_list[self.idx] ) def step(self, new_token: torch.Tensor = None): """ Update the state if microbatch manager, 2 conditions. - 1. For first stage in PREFILL, receive inputs and outputs, `_add_descrption` will save its inputs. - 2. For other conditon, only receive the output of previous stage, and update the descrption. 
+ 1. For first stage in PREFILL, receive inputs and outputs, `_add_description` will save its inputs. + 2. For other condition, only receive the output of previous stage, and update the description. Args: inputs_dict (Dict[str, torch.Tensor]): the inputs of current stage. The key should have `input_ids` and `attention_mask`. output_dict (Dict[str, torch.Tensor]): the outputs of previous stage. The key should have `hidden_states` and `past_key_values`. new_token (torch.Tensor): the new token generated by current stage. """ - # Add descrption first if the descrption is None - self.cur_descrption.update(new_token) + # Add description first if the description is None + self.cur_description.update(new_token) return self.cur_state def export_new_tokens(self): new_tokens_list = [] - for i in self.mb_descrption_buffer.values(): + for i in self.mb_description_buffer.values(): new_tokens_list.extend(i.new_tokens.tolist()) return new_tokens_list def is_micro_batch_done(self): - if len(self.mb_descrption_buffer) == 0: + if len(self.mb_description_buffer) == 0: return False - for mb in self.mb_descrption_buffer.values(): + for mb in self.mb_description_buffer.values(): if mb.state != Status.DONE: return False return True def clear(self): - self.mb_descrption_buffer.clear() + self.mb_description_buffer.clear() for cache in self.cache_manager_list: cache.free_all() def next(self): self.idx = (self.idx + 1) % self.buffer_size - def _remove_descrption(self): - self.mb_descrption_buffer.pop(self.idx) + def _remove_description(self): + self.mb_description_buffer.pop(self.idx) @property - def cur_descrption(self) -> MicroBatchDescription: - return self.mb_descrption_buffer.get(self.idx) + def cur_description(self) -> MicroBatchDescription: + return self.mb_description_buffer.get(self.idx) @property def cur_infer_state(self): - if self.cur_descrption is None: + if self.cur_description is None: return None - return self.cur_descrption.infer_state + return self.cur_description.infer_state @property def cur_state(self): """ - Return the state of current micro batch, when current descrption is None, the state is PREFILL + Return the state of current micro batch, when current description is None, the state is PREFILL """ - if self.cur_descrption is None: + if self.cur_description is None: return Status.PREFILL - return self.cur_descrption.state + return self.cur_description.state diff --git a/colossalai/pipeline/schedule/generate.py b/colossalai/pipeline/schedule/generate.py index 20f316c2ae48..d6a6aec63a12 100644 --- a/colossalai/pipeline/schedule/generate.py +++ b/colossalai/pipeline/schedule/generate.py @@ -95,7 +95,7 @@ def _prepare_inputs_for_interval_stage(self): Returns: dict: inputs for interval stage, `{'past_key_values': torch.Tensor}` or `None` """ - model_inputs = {"infer_state": self.mb_manager.cur_descrption.infer_state} + model_inputs = {"infer_state": self.mb_manager.cur_description.infer_state} return model_inputs def _prepare_inputs_for_new_token(self, new_token: torch.Tensor): @@ -107,7 +107,7 @@ def _prepare_inputs_for_new_token(self, new_token: torch.Tensor): Returns: dict: inputs for new token, `{'input_ids': torch.Tensor, 'attention_mask': torch.Tensor, 'past_key_values': torch.Tensor}` """ - new_mask = self.mb_manager.cur_descrption.attn_mask + new_mask = self.mb_manager.cur_description.attn_mask return dict(input_ids=new_token, attention_mask=new_mask) @@ -133,7 +133,7 @@ def _init_infer_state_action(self) -> None: 1.Load micro_batch 2.Use the current micro_batch to init the current 
infer_state """ inputs_dict = self.load_micro_batch() - self.mb_manager.add_descrption(inputs_dict) + self.mb_manager.add_description(inputs_dict) def _load_stage_action(self, model: Module) -> None: """ @@ -141,7 +141,7 @@ def _load_stage_action(self, model: Module) -> None: 1.load micro_batch 2.do the forward 3.step to update """ inputs_dict = self.load_micro_batch() - self.mb_manager.add_descrption(inputs_dict) + self.mb_manager.add_description(inputs_dict) if self.verbose and self.stage_manager.is_first_stage(): torch.cuda.synchronize() self.timestamps[self.mb_manager.idx].append(time.time()) @@ -379,7 +379,7 @@ def generate_step_broadcast(self, model: Module, data_iter: Iterable) -> Union[t if self.verbose and self.stage_manager.is_first_stage(): torch.cuda.synchronize() self.timestamps[self.mb_manager.idx].append(time.time()) - self.mb_manager.add_descrption(inputs_dict) + self.mb_manager.add_description(inputs_dict) interval_inputs = {"infer_state": self.mb_manager.cur_infer_state} output_dict = model_forward(model, inputs_dict, interval_inputs) # In GENERATE phase @@ -415,7 +415,7 @@ def generate_step_broadcast(self, model: Module, data_iter: Iterable) -> Union[t inputs_dict = None if self.mb_manager.cur_state is Status.PREFILL: inputs_dict = self.load_micro_batch() - self.mb_manager.add_descrption(inputs_dict) + self.mb_manager.add_description(inputs_dict) interval_inputs = { "hidden_states": hidden_states["hidden_states"], "infer_state": self.mb_manager.cur_infer_state, From 049121d19d7ead4a4dcbeb091df9ff87ba991a63 Mon Sep 17 00:00:00 2001 From: digger yu Date: Tue, 5 Mar 2024 21:48:46 +0800 Subject: [PATCH 18/23] [hotfix] fix typo change enabel to enable under colossalai/shardformer/ (#5317) --- colossalai/shardformer/layer/_operation.py | 4 ++-- colossalai/shardformer/layer/normalization.py | 8 ++++---- colossalai/shardformer/modeling/bloom.py | 2 +- .../shardformer/modeling/chatglm2_6b/modeling_chatglm.py | 6 +++--- colossalai/shardformer/modeling/gptj.py | 4 ++-- colossalai/shardformer/modeling/llama.py | 4 ++-- colossalai/shardformer/modeling/opt.py | 2 +- colossalai/shardformer/modeling/t5.py | 2 +- 8 files changed, 16 insertions(+), 16 deletions(-) diff --git a/colossalai/shardformer/layer/_operation.py b/colossalai/shardformer/layer/_operation.py index d4960c7e4bde..241770901ed7 100644 --- a/colossalai/shardformer/layer/_operation.py +++ b/colossalai/shardformer/layer/_operation.py @@ -173,7 +173,7 @@ class _LinearWithGatherForwardReduceScatterBackward(torch.autograd.Function): Args: input_ (`torch.Tensor`): The input tensor from sequence parallel region. process_group (`torch.distributed.ProcessGroup`): The process group used for collective communication. - overlap (`bool`): Whther to overlap the all_gather op and gradient calculate in backward. + overlap (`bool`): Whether to overlap the all_gather op and gradient calculate in backward. 
""" @@ -534,7 +534,7 @@ def backward(ctx, grad_output): return grad_output, None, None -def hook_paramter_in_backward(input, weight=None, bias=None): +def hook_parameter_in_backward(input, weight=None, bias=None): return HookParameter.apply(input, weight, bias) diff --git a/colossalai/shardformer/layer/normalization.py b/colossalai/shardformer/layer/normalization.py index 4aa281290340..43dd153aff1a 100644 --- a/colossalai/shardformer/layer/normalization.py +++ b/colossalai/shardformer/layer/normalization.py @@ -7,7 +7,7 @@ from colossalai.lazy import LazyInitContext -from ._operation import hook_paramter_in_backward +from ._operation import hook_parameter_in_backward from .utils import SeqParallelUtils __all__ = ["FusedLayerNorm", "FusedRMSNorm", "LayerNorm", "RMSNorm", "BaseLayerNorm"] @@ -29,7 +29,7 @@ def __init__(self, normalized_shape, eps=0.00001, elementwise_affine=True): def forward(self, input): output = super().forward(input) - output = hook_paramter_in_backward(output, self.weight, self.bias) + output = hook_parameter_in_backward(output, self.weight, self.bias) return output class FusedRMSNormWithHook(ApexFusedRMSNorm): @@ -38,7 +38,7 @@ def __init__(self, normalized_shape, eps=0.00001, elementwise_affine=True): def forward(self, input): output = super().forward(input) - output = hook_paramter_in_backward(output, self.weight) + output = hook_parameter_in_backward(output, self.weight) return output except ImportError: @@ -79,7 +79,7 @@ def __init__(self, hidden_size, eps=0.00001): def forward(self, input): output = super().forward(input) - output = hook_paramter_in_backward(output, self.weight, self.bias) + output = hook_parameter_in_backward(output, self.weight, self.bias) return output diff --git a/colossalai/shardformer/modeling/bloom.py b/colossalai/shardformer/modeling/bloom.py index cd8a023306dc..d94c30d29e71 100644 --- a/colossalai/shardformer/modeling/bloom.py +++ b/colossalai/shardformer/modeling/bloom.py @@ -699,7 +699,7 @@ def bloom_for_question_answering_forward( return {"hidden_states": hidden_states} -def get_bloom_flash_attention_forward(enabel_jit_fused=False): +def get_bloom_flash_attention_forward(enable_jit_fused=False): try: from xformers.ops import memory_efficient_attention as me_attention except: diff --git a/colossalai/shardformer/modeling/chatglm2_6b/modeling_chatglm.py b/colossalai/shardformer/modeling/chatglm2_6b/modeling_chatglm.py index 71aa2296eb4c..bf581300a7b1 100644 --- a/colossalai/shardformer/modeling/chatglm2_6b/modeling_chatglm.py +++ b/colossalai/shardformer/modeling/chatglm2_6b/modeling_chatglm.py @@ -181,7 +181,7 @@ def forward_impl( cache = torch.stack([torch.cos(idx_theta), torch.sin(idx_theta)], dim=-1) - # this is to mimic the behaviour of complex32, else we will get different results + # this is to mimic the behavior of complex32, else we will get different results if dtype in (torch.float16, torch.bfloat16, torch.int8): cache = cache.bfloat16() if dtype == torch.bfloat16 else cache.half() return cache @@ -290,7 +290,7 @@ def forward(self, query_layer, key_layer, value_layer, attention_mask): # [sk, b, np, hn] -> [sk, b * np, hn] key_layer = key_layer.view(output_size[3], output_size[0] * output_size[1], -1) - # preallocting input tensor: [b * np, sq, sk] + # preallocating input tensor: [b * np, sq, sk] matmul_input_buffer = torch.empty( output_size[0] * output_size[1], output_size[2], @@ -1289,7 +1289,7 @@ def stream_generate( if has_default_max_length and generation_config.max_new_tokens is None: warnings.warn( f"Using `max_length`'s 
default ({generation_config.max_length}) to control the generation length. " - "This behaviour is deprecated and will be removed from the config in v5 of Transformers -- we" + "This behavior is deprecated and will be removed from the config in v5 of Transformers -- we" " recommend using `max_new_tokens` to control the maximum length of the generation.", UserWarning, ) diff --git a/colossalai/shardformer/modeling/gptj.py b/colossalai/shardformer/modeling/gptj.py index 22b0f7a90656..1990d7df3279 100644 --- a/colossalai/shardformer/modeling/gptj.py +++ b/colossalai/shardformer/modeling/gptj.py @@ -122,7 +122,7 @@ def gptj_model_forward( # head_mask has shape n_layer x batch x num_attention_heads x N x N head_mask = self.get_head_mask(head_mask, self.config.n_layer) - # position id to be asssigned not just for the first stage for attn input + # position id to be assigned not just for the first stage for attn input if position_ids is not None: position_ids = position_ids.view(-1, seq_length) else: @@ -593,7 +593,7 @@ def forward( # key = key.permute(0, 2, 1, 3) # query = query.permute(0, 2, 1, 3) - key = key.to(dtype=value.dtype) # fp16 compatability + key = key.to(dtype=value.dtype) # fp16 compatibility query = query.to(dtype=value.dtype) if layer_past is not None: diff --git a/colossalai/shardformer/modeling/llama.py b/colossalai/shardformer/modeling/llama.py index 92c709218a26..f20ceb2d6760 100644 --- a/colossalai/shardformer/modeling/llama.py +++ b/colossalai/shardformer/modeling/llama.py @@ -225,13 +225,13 @@ def llama_for_causal_lm_forward( >>> model = LlamaForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS) >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER) - >>> prompt = "Hey, are you consciours? Can you talk to me?" + >>> prompt = "Hey, are you conscious? Can you talk to me?" >>> inputs = tokenizer(prompt, return_tensors="pt") >>> # Generate >>> generate_ids = model.generate(inputs.input_ids, max_length=30) >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] - "Hey, are you consciours? Can you talk to me?\nI'm not consciours, but I can talk to you." + "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." 
```""" logger = logging.get_logger(__name__) output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions diff --git a/colossalai/shardformer/modeling/opt.py b/colossalai/shardformer/modeling/opt.py index 7f6cbbbcf4f3..d0e267eacd25 100644 --- a/colossalai/shardformer/modeling/opt.py +++ b/colossalai/shardformer/modeling/opt.py @@ -123,7 +123,7 @@ def opt_model_forward( else: if hidden_states is None: - raise ValueError("hidden_states shouln't be None for intermediate stages.") + raise ValueError("hidden_states shouldn't be None for intermediate stages.") input_shape = hidden_states.size()[:-1] batch_size, seq_length = input_shape[0], input_shape[1] device = hidden_states.device diff --git a/colossalai/shardformer/modeling/t5.py b/colossalai/shardformer/modeling/t5.py index dcb1785207eb..9c5ce3fb65c9 100644 --- a/colossalai/shardformer/modeling/t5.py +++ b/colossalai/shardformer/modeling/t5.py @@ -77,7 +77,7 @@ def t5_stack_forward( if in_decoder != (stage >= decoder_starting_stage): raise ValueError("Config in T5Stack is not aligned with pipeline setting.") - # at_first_stage: current stage is the first stage of encoder/decoder, taking input_ids/input_embedds + # at_first_stage: current stage is the first stage of encoder/decoder, taking input_ids/input_embeds # at_last_stage: current stage is the last stage of encoder/decoder, making outputs the same form as huggingface at_first_stage = (stage == 0) or (stage == decoder_starting_stage) at_last_stage = (stage == decoder_starting_stage - 1) or (stage == stage_manager.num_stages - 1) From a7ae2b5b4c05f194fc563c488ed885f4a225fbc8 Mon Sep 17 00:00:00 2001 From: Dongruixuan Li Date: Tue, 5 Mar 2024 08:48:55 -0500 Subject: [PATCH 19/23] [eval-hotfix] set few_shot_data to None when few shot is disabled (#5422) --- applications/ColossalEval/colossal_eval/dataset/agieval.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/applications/ColossalEval/colossal_eval/dataset/agieval.py b/applications/ColossalEval/colossal_eval/dataset/agieval.py index d018a2ba5652..32f8544e93df 100644 --- a/applications/ColossalEval/colossal_eval/dataset/agieval.py +++ b/applications/ColossalEval/colossal_eval/dataset/agieval.py @@ -201,7 +201,7 @@ def load( for file in files: dataset_name = os.path.basename(file)[0 : -len(".jsonl")] - few_shot_data = [] + few_shot_data = None if few_shot: # process demo once if it is few-shot-CoT few_shot_data = combine_prompt(prompt_path, dataset_name, load_explanation=False, chat_mode=False) From 5e1c93d732a3ee77657fe68c031dacdeae4d7807 Mon Sep 17 00:00:00 2001 From: digger yu Date: Tue, 5 Mar 2024 21:52:30 +0800 Subject: [PATCH 20/23] [hotfix] fix typo change MoECheckpintIO to MoECheckpointIO (#5335) Co-authored-by: binmakeswell --- colossalai/accelerator/api.py | 2 +- colossalai/booster/plugin/gemini_plugin.py | 2 +- colossalai/booster/plugin/hybrid_parallel_plugin.py | 2 +- colossalai/booster/plugin/moe_hybrid_parallel_plugin.py | 7 ++++--- colossalai/checkpoint_io/hybrid_parallel_checkpoint_io.py | 4 ++-- colossalai/moe/__init__.py | 4 ++-- colossalai/moe/checkpoint.py | 4 ++-- 7 files changed, 13 insertions(+), 12 deletions(-) diff --git a/colossalai/accelerator/api.py b/colossalai/accelerator/api.py index 02b3055d7380..85a19b6a61d3 100644 --- a/colossalai/accelerator/api.py +++ b/colossalai/accelerator/api.py @@ -40,7 +40,7 @@ def set_accelerator(accelerator: Union[str, BaseAccelerator]) -> None: def auto_set_accelerator() -> None: """ Automatically check if any 
accelerator is available. - If an accelerator is availabe, set it as the global accelerator. + If an accelerator is available, set it as the global accelerator. """ global _ACCELERATOR diff --git a/colossalai/booster/plugin/gemini_plugin.py b/colossalai/booster/plugin/gemini_plugin.py index 95b96bbfd9ed..6c503377326a 100644 --- a/colossalai/booster/plugin/gemini_plugin.py +++ b/colossalai/booster/plugin/gemini_plugin.py @@ -437,7 +437,7 @@ def __init__( ) def __del__(self): - """Destroy the prcess groups in ProcessGroupMesh""" + """Destroy the process groups in ProcessGroupMesh""" self.pg_mesh.destroy_mesh_process_groups() def support_no_sync(self) -> bool: diff --git a/colossalai/booster/plugin/hybrid_parallel_plugin.py b/colossalai/booster/plugin/hybrid_parallel_plugin.py index bf677e052f88..8cc76dd3e0f3 100644 --- a/colossalai/booster/plugin/hybrid_parallel_plugin.py +++ b/colossalai/booster/plugin/hybrid_parallel_plugin.py @@ -1067,7 +1067,7 @@ def __init__( self.max_norm = max_norm def __del__(self): - """Destroy the prcess groups in ProcessGroupMesh""" + """Destroy the process groups in ProcessGroupMesh""" self.pg_mesh.destroy_mesh_process_groups() @property diff --git a/colossalai/booster/plugin/moe_hybrid_parallel_plugin.py b/colossalai/booster/plugin/moe_hybrid_parallel_plugin.py index 45e5a23c1b22..454710fccaa7 100644 --- a/colossalai/booster/plugin/moe_hybrid_parallel_plugin.py +++ b/colossalai/booster/plugin/moe_hybrid_parallel_plugin.py @@ -22,7 +22,7 @@ ) from colossalai.cluster import ProcessGroupMesh from colossalai.interface import ModelWrapper, OptimizerWrapper -from colossalai.moe import MOE_MANAGER, MoECheckpintIO +from colossalai.moe import MOE_MANAGER, MoECheckpointIO from colossalai.pipeline.schedule import OneForwardOneBackwardSchedule from colossalai.pipeline.stage_manager import PipelineStageManager from colossalai.shardformer import ShardConfig @@ -341,9 +341,10 @@ def seed_worker(worker_id): **_kwargs, ) - def get_checkpoint_io(self) -> MoECheckpintIO: + + def get_checkpoint_io(self) -> MoECheckpointIO: if self.checkpoint_io is None: - self.checkpoint_io = MoECheckpintIO(self.dp_group, self.pp_group, self.tp_group, self.zero_stage) + self.checkpoint_io = MoECheckpointIO(self.dp_group, self.pp_group, self.tp_group, self.zero_stage) else: self.checkpoint_io = self.checkpoint_io(self.dp_group, self.pp_group, self.tp_group, self.zero_stage) return self.checkpoint_io diff --git a/colossalai/checkpoint_io/hybrid_parallel_checkpoint_io.py b/colossalai/checkpoint_io/hybrid_parallel_checkpoint_io.py index 36df30335dd7..80822724982e 100644 --- a/colossalai/checkpoint_io/hybrid_parallel_checkpoint_io.py +++ b/colossalai/checkpoint_io/hybrid_parallel_checkpoint_io.py @@ -51,7 +51,7 @@ class HybridParallelCheckpointIO(GeneralCheckpointIO): pp_group (ProcessGroup): Process group along pipeline parallel dimension. tp_group (ProcessGroup): Process group along tensor parallel dimension. zero_stage (int): The zero stage of plugin. Should be in [0, 1, 2]. - verbose (bool, optional): Whether to print logging massage when saving/loading has been succesfully executed. Defaults to True. + verbose (bool, optional): Whether to print logging massage when saving/loading has been successfully executed. Defaults to True. 
""" def __init__( @@ -574,7 +574,7 @@ def _get_param_id_from_optimizer_param( for old_pg, saved_pg in zip(optimizer.optim.param_groups, saved_groups): # obtain updated param group new_pg = copy.deepcopy(saved_pg) - new_pg["params"] = old_pg["params"] # The parameters in the same group shouln't change. + new_pg["params"] = old_pg["params"] # The parameters in the same group shouldn't change. updated_groups.append(new_pg) optimizer.optim.__dict__.update({"param_groups": updated_groups}) diff --git a/colossalai/moe/__init__.py b/colossalai/moe/__init__.py index 6dd0a5fc3c52..cc33c77f3eed 100644 --- a/colossalai/moe/__init__.py +++ b/colossalai/moe/__init__.py @@ -1,4 +1,4 @@ -from .checkpoint import MoECheckpintIO +from .checkpoint import MoECheckpointIO from .experts import MLPExperts from .layers import SparseMLP, apply_load_balance from .manager import MOE_MANAGER @@ -14,7 +14,7 @@ "NormalNoiseGenerator", "UniformNoiseGenerator", "SparseMLP", - "MoECheckpintIO", + "MoECheckpointIO", "MOE_MANAGER", "apply_load_balance", ] diff --git a/colossalai/moe/checkpoint.py b/colossalai/moe/checkpoint.py index b37ffabea41f..59a0ec3f0c39 100644 --- a/colossalai/moe/checkpoint.py +++ b/colossalai/moe/checkpoint.py @@ -40,7 +40,7 @@ ) -class MoECheckpintIO(HybridParallelCheckpointIO): +class MoECheckpointIO(HybridParallelCheckpointIO): def __init__( self, dp_group: ProcessGroup, @@ -373,7 +373,7 @@ def _get_param_id_from_optimizer_param( for old_pg, saved_pg in zip(optimizer.optim.param_groups, saved_groups): # obtain updated param group new_pg = copy.deepcopy(saved_pg) - new_pg["params"] = old_pg["params"] # The parameters in the same group shouln't change. + new_pg["params"] = old_pg["params"] # The parameters in the same group shouldn't change. updated_groups.append(new_pg) # ep param group if len(optimizer.optim.param_groups) > len(saved_groups): From c8003d463b7b0aa4d8c3318355ee43840b5f3cda Mon Sep 17 00:00:00 2001 From: hugo-syn <61210734+hugo-syn@users.noreply.github.com> Date: Tue, 5 Mar 2024 15:02:08 +0100 Subject: [PATCH 21/23] [doc] Fix typo s/infered/inferred/ (#5288) Signed-off-by: hugo-syn --- applications/ColossalQA/colossalqa/prompt/README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/applications/ColossalQA/colossalqa/prompt/README.md b/applications/ColossalQA/colossalqa/prompt/README.md index e5c74906b113..01f51fef7665 100644 --- a/applications/ColossalQA/colossalqa/prompt/README.md +++ b/applications/ColossalQA/colossalqa/prompt/README.md @@ -24,7 +24,7 @@ Assistant: ### English ``` [INST] <>Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature. -If the answer cannot be infered based on the given context, please don't share false information.<> +If the answer cannot be inferred based on the given context, please don't share false information.<> Use the context and chat history to respond to the human's input at the end or carry on the conversation. You should generate one response only. No following up is needed. context: @@ -51,7 +51,7 @@ Note that the "[INST] <>...<>" template is the specific prompt format #### Normal Length ``` [INST] <>Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. 
Please ensure that your responses are socially unbiased and positive in nature. -If the answer cannot be infered based on the given context, please don't share false information.<> +If the answer cannot be inferred based on the given context, please don't share false information.<> Use the context and chat history to respond to the human's input at the end or carry on the conversation. You should generate one response only. No following up is needed. context: @@ -73,7 +73,7 @@ Assistant: #### Overlength ``` [INST] <>Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature. -If the answer cannot be infered based on the given context, please don't share false information.<> +If the answer cannot be inferred based on the given context, please don't share false information.<> Use the context and chat history to respond to the human's input at the end or carry on the conversation. You should generate one response only. No following up is needed. context: From 68f55a709c639223c89423c18d55d73806d4f2f5 Mon Sep 17 00:00:00 2001 From: Youngon Date: Tue, 5 Mar 2024 22:03:40 +0800 Subject: [PATCH 22/23] [hotfix] fix stable diffusion inference bug. (#5289) * Update train_ddp.yaml delete "strategy" to fix DDP config loading bug in "main.py" * Update train_ddp.yaml fix inference with scripts/txt2img.py config file load bug. * Update README.md add pretrain model test code. --- examples/images/diffusion/README.md | 2 ++ examples/images/diffusion/configs/train_ddp.yaml | 3 +-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/examples/images/diffusion/README.md b/examples/images/diffusion/README.md index 5434551f4fb4..4975dc2e3922 100644 --- a/examples/images/diffusion/README.md +++ b/examples/images/diffusion/README.md @@ -202,6 +202,8 @@ python main.py --logdir /tmp/ -t -b configs/Teyvat/train_colossalai_teyvat.yaml ``` ## Inference +if you want to test with pretrain model,as bellow: +python scripts/txt2img.py --prompt "a photograph of an astronaut riding a horse" --plms --outdir ./output --ckpt 512-base-ema.ckpt --config configs/train_ddp.yaml You can get your training last.ckpt and train config.yaml in your `--logdir`, and run by ``` diff --git a/examples/images/diffusion/configs/train_ddp.yaml b/examples/images/diffusion/configs/train_ddp.yaml index 72dc05b649a4..56e1b5705d2d 100644 --- a/examples/images/diffusion/configs/train_ddp.yaml +++ b/examples/images/diffusion/configs/train_ddp.yaml @@ -1,5 +1,6 @@ model: base_learning_rate: 1.0e-4 + target: ldm.models.diffusion.ddpm.LatentDiffusion params: parameterization: "v" linear_start: 0.00085 @@ -86,8 +87,6 @@ lightning: max_epochs: 2 precision: 16 auto_select_gpus: False - strategy: - find_unused_parameters: False log_every_n_steps: 2 # max_steps: 6o logger: True From 743e7fad2f03ab0082d40c5da960aca29daed76e Mon Sep 17 00:00:00 2001 From: Camille Zhong <44392324+Camille7777@users.noreply.github.com> Date: Thu, 7 Mar 2024 14:58:56 +0800 Subject: [PATCH 23/23] [colossal-llama2] add stream chat examlple for chat version model (#5428) * add stream chat for chat version * remove os.system clear * modify function name --- .../utils/stream_chat_patch.py | 247 ++++++++++++++++++ .../Colossal-LLaMA-2/stream_chat_example.py | 55 ++++ 2 files changed, 302 insertions(+) create mode 100644 
applications/Colossal-LLaMA-2/colossal_llama2/utils/stream_chat_patch.py create mode 100644 applications/Colossal-LLaMA-2/stream_chat_example.py diff --git a/applications/Colossal-LLaMA-2/colossal_llama2/utils/stream_chat_patch.py b/applications/Colossal-LLaMA-2/colossal_llama2/utils/stream_chat_patch.py new file mode 100644 index 000000000000..8f8eecb18eb0 --- /dev/null +++ b/applications/Colossal-LLaMA-2/colossal_llama2/utils/stream_chat_patch.py @@ -0,0 +1,247 @@ +from copy import deepcopy +from typing import Optional, List, Dict, Tuple, Callable, Any + +import torch +from torch import nn + +from transformers import PreTrainedTokenizer +from transformers.utils import logging +from transformers.generation.utils import GenerationConfig, LogitsProcessorList, StoppingCriteriaList + +logger = logging.get_logger(__name__) + + +def get_prompt_template( + input_query:str, + history:List[Dict]= None, + roles:list = ["", "Human", "Assistant"], +) -> str: + """ + Generates a prompt template for chat models based on input and history. + + Args: + input_query (str): User's current input query. + history (List[Dict], optional): List of past conversations, each a dict with 'role' and 'message'. + roles (list): Specifies the roles in the conversation, defaults to ["", "Human", "Assistant"]. + + Returns: + str: A formatted prompt including the input query and history. + """ + prompt = "" + if history is None: + new_history = [] + else: + new_history = deepcopy(history) + + new_history.append({"role": roles[1], "message": input_query.strip()}) + new_history.append({"role": roles[2], "message": None}) + + for _, item in enumerate(new_history): + role = item.get("role") + message = item.get("message") + if role == roles[0]: + prompt += f"{message}\n\n" + else: + if message: + prompt += f"{role}: {message}" + else: + prompt += f"{role}: " + return prompt + +@torch.inference_mode() +def streaming_chat( + model: Any, + tokenizer: PreTrainedTokenizer, + input_query: str, + history: List[Dict] = None, + roles: list = ["", "Human", "Assistant"], + past_key_values: Tuple[Tuple[torch.FloatTensor, Any], Any] = None, + temperature: float = 0.8, + top_p: float = 0.95, + top_k: int = 50, + do_sample: bool = True, + length_penalty: float = 1.2, + max_new_tokens: int = 512, + logits_processor: LogitsProcessorList = None, + return_past_key_values: bool = False, + **kwargs, +): + """ + Streaming chat responses generation with a given model and tokenizer. + + Args: + model (Any): The language model to generate responses. + tokenizer (PreTrainedTokenizer): Tokenizer compatible with the model, used for encoding inputs and decoding responses. + input_query (str): The current user input to respond to. + history (List[Dict], optional): A list of past conversations, where each conversation is a dictionary with keys 'role' and 'message'. + roles (list): Roles involved in the conversation, defaults to ["", "Human", "Assistant"]. + past_key_values (Tuple[Tuple[torch.FloatTensor, Any], Any], optional): Past key values for incremental decoding. + temperature (float): The temperature value for token sampling, defaults to 0.8. + top_p (float): Nucleus sampling probability threshold, defaults to 0.95. + top_k (int): Top-K filtering threshold, defaults to 50. + do_sample (bool): Whether to sample responses, defaults to True. + length_penalty (float): Penalty for response length, defaults to 1.2. + max_new_tokens (int): Maximum number of new tokens to generate, defaults to 512. 
+ logits_processor (LogitsProcessorList, optional): Custom logits processors, defaults to None. + return_past_key_values (bool): Whether to return past key values for further incremental decoding, defaults to False. + **kwargs: Additional keyword arguments for generation. + + Yields: + Tuple[str, List[Dict], Optional[Tuple[Tuple[torch.FloatTensor, Any], Any]]]: A tuple containing the generated response, updated history, and + optionally the updated past key values if `return_past_key_values` is True. + + Ensures padding is on the left side for the tokenizer. + """ + assert tokenizer.padding_side == "left", "Current generation only supports left padding." + if history is None: + history = [] + if logits_processor is None: + logits_processor = LogitsProcessorList() + + generation_kwargs = { + 'temperature': temperature, + 'top_p': top_p, + 'top_k': top_k, + 'do_sample': do_sample, + 'max_new_tokens': max_new_tokens, + 'length_penalty': length_penalty, + 'use_cache': True, + **kwargs + } + + prompt_str = get_prompt_template(input_query, history=history, roles=roles) + + eos_token_id = [tokenizer.eos_token_id] + inputs = tokenizer(prompt_str, return_tensors="pt").to(model.device) + history.append({"role": roles[1], "message": input_query.strip()}) + history.append({"role": roles[2], "message": None}) + + for outputs in stream_generate(model, **inputs, past_key_values=past_key_values, + eos_token_id=eos_token_id, return_past_key_values=return_past_key_values, + **generation_kwargs): + if return_past_key_values: + outputs, past_key_values = outputs + + outputs = outputs.tolist()[0][len(inputs["input_ids"][0]):-1] + response = tokenizer.decode(outputs) + + history[-1]["message"] = response.strip() + if return_past_key_values: + yield response, history, past_key_values + else: + yield response, history + + +@torch.inference_mode() +def stream_generate( + model: Any, + input_ids: torch.Tensor, + generation_config: Optional[GenerationConfig] = None, + logits_processor: Optional[LogitsProcessorList] = None, + stopping_criteria: Optional[StoppingCriteriaList] = None, + prefix_allowed_tokens_fn: Optional[Callable[[int, torch.Tensor], List[int]]] = None, + return_past_key_values: bool = False, + **kwargs, +): + """ + Generates sequences of token ids using the specified model and generation parameters. + Adapted from https://huggingface.co/THUDM/chatglm3-6b/blob/main/modeling_chatglm.py + + Args: + model (Any): The model used for generating sequences of token ids. + input_ids (torch.Tensor): The sequence used as a prompt for the generation or as model inputs to the encoder. + generation_config (Optional[GenerationConfig]): The generation configuration to be used as base parametrization for the generation call. + logits_processor (Optional[LogitsProcessorList]): Custom logits processors that complement the default logits processors built from arguments + and generation config. + stopping_criteria (Optional[StoppingCriteriaList]): Custom stopping criteria that complement the default stopping criteria built from arguments + and a generation config. + prefix_allowed_tokens_fn (Optional[Callable[[int, torch.Tensor], List[int]]]): Function to constrain token generation. + return_past_key_values (bool): Whether to return past key values for further incremental decoding, defaults to False. + **kwargs: Additional parameters for model generation. + + Yields: + torch.Tensor: The generated token IDs, updated after each generation step. 
+ Optional[Tuple[Tuple[torch.FloatTensor, Any], Any]]: The past key values, returned if `return_past_key_values` is True, defaults to False. + """ + input_ids_len = input_ids.size(1) + + if generation_config is None: + generation_config = model.generation_config + generation_config = deepcopy(generation_config) + model_kwargs = generation_config.update(**kwargs) + + eos_token_id = generation_config.eos_token_id + if isinstance(eos_token_id, int): + eos_token_id = [eos_token_id] + eos_token_id_tensor = torch.tensor(eos_token_id).to(input_ids.device) if eos_token_id is not None else None + + if generation_config.max_new_tokens is not None: + generation_config.max_length = generation_config.max_new_tokens + input_ids_len + + if input_ids_len >= generation_config.max_length: + input_ids_string = "decoder_input_ids" if model.config.is_encoder_decoder else "input_ids" + logger.warning( + f"Input length of {input_ids_string} is {input_ids_len}, but `max_length` is set to" + f" {generation_config.max_length}. This can lead to unexpected behavior. You should consider" + " increasing `max_new_tokens`." + ) + logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList() + stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList() + + # prepare distribution pre_processing samplers + logits_processor = model._get_logits_processor( + generation_config=generation_config, + input_ids_seq_length=input_ids_len, + encoder_input_ids=input_ids, + prefix_allowed_tokens_fn=prefix_allowed_tokens_fn, + logits_processor=logits_processor, + ) + + # prepare stopping criteria + stopping_criteria = model._get_stopping_criteria( + generation_config=generation_config, stopping_criteria=stopping_criteria + ) + + logits_warper = model._get_logits_warper(generation_config) + unfinished_sequences = input_ids.new(input_ids.shape[0]).fill_(1) + scores = None + + while True: + model_inputs = model.prepare_inputs_for_generation(input_ids, **model_kwargs) + # forward pass to get next token + outputs = model( + **model_inputs, + return_dict=True, + output_attentions=False, + output_hidden_states=False, + ) + + # NOTE: this is correct only in left padding mode + # pre-process distribution + next_token_logits = outputs.logits[:, -1, :] + next_token_scores = logits_processor(input_ids, next_token_logits) + next_token_scores = logits_warper(input_ids, next_token_scores) + + # sample + probs = nn.functional.softmax(next_token_scores, dim=-1) + if generation_config.do_sample: + next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1) + else: + next_tokens = torch.argmax(probs, dim=-1) + + # update generated ids, model inputs, and length for next step + input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1) + model_kwargs = model._update_model_kwargs_for_generation( + outputs, model_kwargs, is_encoder_decoder=model.config.is_encoder_decoder + ) + unfinished_sequences = unfinished_sequences.mul( + next_tokens.tile(eos_token_id_tensor.shape[0], 1).ne(eos_token_id_tensor.unsqueeze(1)).prod(dim=0) + ) + + if return_past_key_values: + yield input_ids, outputs.past_key_values + else: + yield input_ids + # stop when each sentence is finished, or if exceed the maximum length + if unfinished_sequences.max() == 0 or stopping_criteria(input_ids, scores): + break \ No newline at end of file diff --git a/applications/Colossal-LLaMA-2/stream_chat_example.py b/applications/Colossal-LLaMA-2/stream_chat_example.py new file mode 100644 index 
000000000000..3e45c690f878 --- /dev/null +++ b/applications/Colossal-LLaMA-2/stream_chat_example.py @@ -0,0 +1,55 @@ +import os +import argparse + +from transformers import AutoTokenizer, AutoModelForCausalLM +from colossal_llama2.utils.stream_chat_patch import streaming_chat + +SYSTEM = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions." + +def main(args): + model = AutoModelForCausalLM.from_pretrained(args.model_path).cuda().eval() + tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_path) + + past_key_values, history = None, [] + roles = ["", "Human", "Assistant"] + + history = [] + history.append({"role": roles[0], "message": SYSTEM}) + + while True: + input_query = input(f"\n{roles[1]}: ") + if input_query.strip() == "exit": + break + if input_query.strip() == "clear": + past_key_values, history = None, [] + continue + + print(f"\n{roles[2]}: ", end="") + gen_len = 0 + for response, history, past_key_values in streaming_chat( + model, tokenizer, input_query, history=history, roles=roles, + temperature = args.temperature, + top_p = args.top_p, + top_k = args.top_k, + do_sample = args.do_sample, + length_penalty = args.length_penalty, + max_new_tokens = args.max_new_tokens, + past_key_values=past_key_values, + return_past_key_values=True): + + output = response[gen_len:] + print(output, end="", flush=True) + gen_len = len(response) + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument('--model_path', type=str, default=None, help="path to chat version model") + parser.add_argument('--tokenizer_path', type=str, default=None, help="path to chat version tokenizer") + parser.add_argument('--temperature', type=float, default=0.8, help="set temperature") + parser.add_argument('--top_p', type=float, default=0.95, help="set top p value") + parser.add_argument('--top_k', type=int, default=50, help="set top k value") + parser.add_argument('--do_sample', type=bool, default=True, help="whether turn on do_sample or not") + parser.add_argument('--length_penalty', type=float, default=1.2, help="set length penalty") + parser.add_argument('--max_new_tokens', type=int, default=512, help="set max new tokens") + args = parser.parse_args() + main(args) \ No newline at end of file
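
As a quick usage reference for the utilities introduced in the last patch, the sketch below drives `streaming_chat` non-interactively (unlike the bundled `stream_chat_example.py`, which reads queries from stdin). It is an illustrative sketch rather than part of the patch: the checkpoint and tokenizer paths are placeholders, and it assumes a CUDA device and a chat-version Colossal-LLaMA-2 model are available locally.

```python
# Minimal non-interactive sketch of the streaming chat API added above.
# Not part of the patch; paths are placeholders for a local chat-version checkpoint.
from transformers import AutoModelForCausalLM, AutoTokenizer

from colossal_llama2.utils.stream_chat_patch import streaming_chat

model = AutoModelForCausalLM.from_pretrained("/path/to/colossal-llama2-chat").cuda().eval()
tokenizer = AutoTokenizer.from_pretrained("/path/to/colossal-llama2-chat")
tokenizer.padding_side = "left"  # streaming_chat asserts left padding

# Seed the conversation with a system message using the default roles ["", "Human", "Assistant"].
history = [{"role": "", "message": "You are a helpful assistant."}]

printed = 0
for response, history in streaming_chat(
    model, tokenizer, "Hello, who are you?", history=history, max_new_tokens=128
):
    # Each yield returns the full decoded response so far; print only the newly generated part.
    print(response[printed:], end="", flush=True)
    printed = len(response)
print()
```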