[moe] clean legacy code
botbw committed Jul 19, 2024
1 parent 8d3d7f3 commit c8bf268
Showing 39 changed files with 163 additions and 173 deletions.
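Most of this commit is an import-path migration: the MoE manager, utilities, and related example/test code move under colossalai.legacy.moe, while the autograd/communication ops formerly in colossalai.moe._operation become the public module colossalai.moe.operators. As a quick orientation, here is a minimal before/after sketch of the change callers need to make (module and symbol names are taken from the diffs below; the grouping itself is illustrative):

# Before this commit
from colossalai.moe.manager import MOE_MANAGER
from colossalai.moe.utils import get_activation, skip_init
from colossalai.moe._operation import EPGradScalerIn, EPGradScalerOut

# After this commit
from colossalai.legacy.moe.manager import MOE_MANAGER
from colossalai.legacy.moe.utils import get_activation, skip_init
from colossalai.moe.operators import EPGradScalerIn, EPGradScalerOut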
File renamed without changes.
@@ -5,9 +5,9 @@
import torch.nn as nn

from colossalai.kernel.triton.llama_act_combine_kernel import HAS_TRITON
-from colossalai.moe._operation import EPGradScalerIn, EPGradScalerOut
-from colossalai.moe.manager import MOE_MANAGER
-from colossalai.moe.utils import get_activation
+from colossalai.legacy.moe.manager import MOE_MANAGER
+from colossalai.legacy.moe.utils import get_activation
+from colossalai.moe.operators import EPGradScalerIn, EPGradScalerOut
from colossalai.shardformer.layer.utils import Randomizer
from colossalai.tensor.moe_tensor.api import get_ep_rank, get_ep_size

@@ -7,9 +7,9 @@
import torch.nn as nn
import torch.nn.functional as F

-from colossalai.moe._operation import AllGather, AllToAll, HierarchicalAllToAll, MoeCombine, MoeDispatch, ReduceScatter
-from colossalai.moe.load_balance import LoadBalancer
-from colossalai.moe.utils import create_ep_hierarchical_group, get_noise_generator
+from colossalai.legacy.moe.load_balance import LoadBalancer
+from colossalai.legacy.moe.utils import create_ep_hierarchical_group, get_noise_generator
+from colossalai.moe.operators import AllGather, AllToAll, HierarchicalAllToAll, MoeCombine, MoeDispatch, ReduceScatter
from colossalai.shardformer.layer.moe import MLPExperts
from colossalai.tensor.moe_tensor.api import get_dp_group, get_ep_group, get_ep_group_ranks, get_ep_size

@@ -5,9 +5,9 @@
import torch.nn as nn

from colossalai.kernel.triton.llama_act_combine_kernel import HAS_TRITON
-from colossalai.moe._operation import EPGradScalerIn, EPGradScalerOut
-from colossalai.moe.manager import MOE_MANAGER
-from colossalai.moe.utils import get_activation
+from colossalai.legacy.moe.manager import MOE_MANAGER
+from colossalai.legacy.moe.utils import get_activation
+from colossalai.moe.operators import EPGradScalerIn, EPGradScalerOut
from colossalai.shardformer.layer.utils import Randomizer
from colossalai.tensor.moe_tensor.api import get_ep_rank, get_ep_size

@@ -7,7 +7,7 @@
from torch.distributed import ProcessGroup

from colossalai.cluster import ProcessGroupMesh
-from colossalai.moe.manager import MOE_MANAGER
+from colossalai.legacy.moe.manager import MOE_MANAGER
from colossalai.shardformer.layer.moe import MLPExperts
from colossalai.zero.low_level import LowLevelZeroOptimizer

File renamed without changes.
File renamed without changes.
@@ -18,9 +18,9 @@
from colossalai.booster import Booster
from colossalai.booster.plugin.moe_hybrid_parallel_plugin import MoeHybridParallelPlugin
from colossalai.cluster import DistCoordinator
+from colossalai.legacy.moe.manager import MOE_MANAGER
+from colossalai.legacy.moe.utils import skip_init
from colossalai.moe.layers import apply_load_balance
-from colossalai.moe.manager import MOE_MANAGER
-from colossalai.moe.utils import skip_init
from colossalai.nn.optimizer import HybridAdam


@@ -14,7 +14,7 @@
from transformers.models.llama import LlamaConfig
from utils import PerformanceEvaluator, get_model_numel

-from colossalai.moe.manager import MOE_MANAGER
+from colossalai.legacy.moe.manager import MOE_MANAGER


class RandomDataset(Dataset):
File renamed without changes.
File renamed without changes.
File renamed without changes.
@@ -50,8 +50,8 @@
except:
HAS_FLASH_ATTN = False
from colossalai.kernel.triton.llama_act_combine_kernel import HAS_TRITON
-from colossalai.moe.manager import MOE_MANAGER
-from colossalai.moe.utils import get_activation, set_moe_args
+from colossalai.legacy.moe.manager import MOE_MANAGER
+from colossalai.legacy.moe.utils import get_activation, set_moe_args
from colossalai.shardformer.layer.moe import SparseMLP

if HAS_TRITON:
@@ -9,7 +9,7 @@
from transformers.modeling_outputs import CausalLMOutputWithPast
from transformers.utils import logging

-from colossalai.moe.manager import MOE_MANAGER
+from colossalai.legacy.moe.manager import MOE_MANAGER
from colossalai.pipeline.stage_manager import PipelineStageManager
from colossalai.shardformer.layer import FusedRMSNorm, Linear1D_Col
from colossalai.shardformer.policies.base_policy import ModulePolicyDescription, Policy, SubModuleReplacementDescription
File renamed without changes.
File renamed without changes.
@@ -19,7 +19,7 @@
from colossalai.booster import Booster
from colossalai.booster.plugin.moe_hybrid_parallel_plugin import MoeHybridParallelPlugin
from colossalai.cluster import DistCoordinator
-from colossalai.moe.utils import skip_init
+from colossalai.legacy.moe.utils import skip_init
from colossalai.nn.optimizer import HybridAdam
from colossalai.shardformer.layer.moe import apply_load_balance

File renamed without changes.
2 changes: 1 addition & 1 deletion colossalai/moe/utils.py → colossalai/legacy/moe/utils.py
@@ -9,7 +9,7 @@
from torch.distributed.distributed_c10d import get_process_group_ranks

from colossalai.accelerator import get_accelerator
-from colossalai.moe.manager import MOE_MANAGER
+from colossalai.legacy.moe.manager import MOE_MANAGER
from colossalai.tensor.moe_tensor.api import is_moe_tensor


5 changes: 0 additions & 5 deletions colossalai/moe/__init__.py
@@ -1,5 +0,0 @@
from .manager import MOE_MANAGER

__all__ = [
"MOE_MANAGER",
]
2 changes: 2 additions & 0 deletions colossalai/moe/_operation.py → colossalai/moe/operators.py
@@ -469,6 +469,8 @@ def all_to_all_uneven(
# See the License for the specific language governing permissions and
# limitations under the License.

+# TODO: used when non-moe are tp but moe are not
+

def _gather_tokens(input_, dim: int, tp_group: ProcessGroup):
"""Gather tensors and concatenate them along a dimension"""
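The hunk above shows only the signature and docstring of _gather_tokens; its body is collapsed in this view. For readers unfamiliar with the pattern, a gather-and-concatenate over a tensor-parallel group typically looks like the sketch below (an illustration of the idiom, not the file's actual implementation):

import torch
import torch.distributed as dist
from torch.distributed import ProcessGroup


def _gather_tokens_sketch(input_: torch.Tensor, dim: int, tp_group: ProcessGroup) -> torch.Tensor:
    """Gather tensors from every rank in tp_group and concatenate them along `dim`."""
    world_size = dist.get_world_size(tp_group)
    if world_size == 1:
        return input_
    # One buffer per rank, filled by all_gather with each rank's local shard.
    gather_list = [torch.empty_like(input_) for _ in range(world_size)]
    dist.all_gather(gather_list, input_.contiguous(), group=tp_group)
    return torch.cat(gather_list, dim=dim)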
8 changes: 1 addition & 7 deletions colossalai/shardformer/modeling/mixtral.py
@@ -14,13 +14,7 @@
from transformers.utils import is_flash_attn_2_available, logging

from colossalai.lazy import LazyInitContext
-from colossalai.moe._operation import (
-    DPGradScalerIn,
-    DPGradScalerOut,
-    EPGradScalerIn,
-    EPGradScalerOut,
-    all_to_all_uneven,
-)
+from colossalai.moe.operators import DPGradScalerIn, DPGradScalerOut, EPGradScalerIn, EPGradScalerOut, all_to_all_uneven
from colossalai.pipeline.stage_manager import PipelineStageManager
from colossalai.shardformer.layer.linear import Linear1D_Col, Linear1D_Row
from colossalai.shardformer.shard import ShardConfig
136 changes: 136 additions & 0 deletions tests/test_legacy/test_moe/moe_utils.py
@@ -0,0 +1,136 @@
import torch
import torch.distributed as dist
import torch.nn as nn
from torch.distributed import ProcessGroup

from colossalai.booster.plugin.low_level_zero_plugin import LowLevelZeroModel
from colossalai.legacy.engine.gradient_handler._base_gradient_handler import BaseGradientHandler
from colossalai.legacy.engine.gradient_handler.utils import bucket_allreduce
from colossalai.legacy.moe.manager import MOE_MANAGER
from colossalai.legacy.moe.utils import get_moe_epsize_param_dict
from colossalai.legacy.registry import GRADIENT_HANDLER
from colossalai.tensor.moe_tensor.api import get_ep_group, get_ep_size, set_moe_tensor_ep_group


def delete_moe_info(model):
for _, param in model.named_parameters():
if hasattr(param, "ep_group"):
delattr(param, "ep_group")


class MoeModel(nn.Module):
def __init__(self, ep_group: ProcessGroup = None):
super().__init__()
self.test_embed = nn.Linear(4, 16, bias=False)
self.w1 = torch.nn.Parameter(torch.randn(16, 8))
if ep_group:
set_moe_tensor_ep_group(self.w1, ep_group)

def forward(self, x):
x = self.test_embed(x)
x = torch.matmul(x, self.w1)

return x


@GRADIENT_HANDLER.register_module
class MoeGradientHandler(BaseGradientHandler):
"""A helper class to handle all-reduce operations in a data parallel group and
moe model parallel. A all-reduce collective communication will be operated in
:func:`handle_gradient` among a data parallel group.
For better performance, it bucketizes the gradients of all parameters that are
the same type to improve the efficiency of communication.
Args:
model (Module): Model where the gradients accumulate.
optimizer (Optimizer): Optimizer for updating the parameters.
"""

def __init__(self, model, optimizer=None):
super().__init__(model, optimizer)

def handle_gradient(self):
"""A method running an all-reduce operation in a data parallel group.
Then running an all-reduce operation for all parameters in experts
across moe model parallel group
"""
if dist.get_world_size() > 1:
epsize_param_dict = get_moe_epsize_param_dict(self._model)

# epsize is 1, indicating the params are replicated among processes in data parallelism
# use the ParallelMode.DATA to get data parallel group
# reduce gradients for all parameters in data parallelism
if 1 in epsize_param_dict:
bucket_allreduce(param_list=epsize_param_dict[1])

for ep_size in epsize_param_dict:
if ep_size != 1 and ep_size != MOE_MANAGER.world_size:
bucket_allreduce(
param_list=epsize_param_dict[ep_size], group=MOE_MANAGER.parallel_info_dict[ep_size].dp_group
)


def assert_not_equal_in_group(tensor, process_group=None):
# all gather tensors from different ranks
world_size = dist.get_world_size(process_group)
tensor_list = [torch.empty_like(tensor) for _ in range(world_size)]
dist.all_gather(tensor_list, tensor, group=process_group)

# check if they are equal one by one
for i in range(world_size - 1):
a = tensor_list[i]
b = tensor_list[i + 1]
assert not torch.allclose(a, b), (
f"expected tensors on rank {i} and {i + 1} not to be equal " f"but they are, {a} vs {b}"
)


def run_fwd_bwd(model, data, label, criterion, optimizer, enable_autocast=False):
model.train()
with torch.cuda.amp.autocast(enabled=enable_autocast):
if criterion:
y = model(data)
loss = criterion(y, label)
else:
loss = model(data, label)
loss = loss.float()

if isinstance(model, LowLevelZeroModel):
optimizer.backward(loss)
else:
loss.backward()
return y


def sync_local_from_ep(local_model, ep_model, assert_grad_flag: bool = False) -> None:
"""Sync the parameters of tp model from ep model
Args:
local_model (MoeModule)
ep_model (MoeModule)
"""
for (local_name, local_param), (ep_name, ep_param) in zip(
local_model.named_parameters(), ep_model.named_parameters()
):
if "experts" not in local_name:
if assert_grad_flag:
assert torch.allclose(local_param, ep_param), f"local_param: {local_param}, ep_param: {ep_param}"
assert torch.allclose(local_param.grad, ep_param.grad)
else:
local_param.data.copy_(ep_param.data)
continue

# gather param from ep model
param_list = [torch.zeros_like(ep_param) for _ in range(get_ep_size(ep_param))]
dist.all_gather(param_list, ep_param, group=get_ep_group(ep_param))
all_param = torch.cat(param_list, dim=0)
if assert_grad_flag:
grad_list = [torch.zeros_like(ep_param) for _ in range(get_ep_size(ep_param))]
dist.all_gather(grad_list, ep_param.grad, group=get_ep_group(ep_param))
all_grad = torch.cat(grad_list, dim=0)

if assert_grad_flag:
assert torch.allclose(local_param, all_param)
assert torch.allclose(local_param.grad, all_grad)
else:
local_param.data.copy_(all_param.data)
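For context, here is a rough sketch of how the helpers in this new test module fit together in a test body. The module path is the one added by this commit; the model/optimizer setup and tensor shapes are illustrative, and a distributed environment with a CUDA device is assumed to have been initialized already (e.g. via colossalai.launch):

import torch

from tests.test_legacy.test_moe.moe_utils import MoeGradientHandler, MoeModel, run_fwd_bwd

model = MoeModel().cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
criterion = torch.nn.MSELoss()

data = torch.randn(2, 4, device="cuda")   # MoeModel.test_embed expects 4 input features
label = torch.randn(2, 8, device="cuda")  # output is (batch, 8) after the (16, 8) w1 matmul

run_fwd_bwd(model, data, label, criterion, optimizer)
MoeGradientHandler(model).handle_gradient()  # bucket all-reduce of replicated (non-expert) grads
optimizer.step()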
@@ -5,7 +5,7 @@

import colossalai
from colossalai.accelerator import get_accelerator
-from colossalai.moe.manager import MOE_MANAGER
+from colossalai.legacy.moe.manager import MOE_MANAGER

# from colossalai.shardformer.layer.moe.layers import SparseMLP
from colossalai.testing import assert_equal_in_group, rerun_if_address_is_in_use, spawn
File renamed without changes.
@@ -4,8 +4,8 @@

import colossalai
from colossalai.accelerator import get_accelerator
-from colossalai.moe.manager import MOE_MANAGER
-from colossalai.moe.utils import sync_moe_model_param
+from colossalai.legacy.moe.manager import MOE_MANAGER
+from colossalai.legacy.moe.utils import sync_moe_model_param

# from colossalai.shardformer.layer.moe import MLPExperts
from colossalai.testing import assert_equal_in_group, rerun_if_address_is_in_use, spawn
@@ -6,7 +6,7 @@
from colossalai.booster import Booster
from colossalai.booster.plugin import LowLevelZeroPlugin
from colossalai.booster.plugin.low_level_zero_plugin import LowLevelZeroModel
-from colossalai.moe.manager import MOE_MANAGER
+from colossalai.legacy.moe.manager import MOE_MANAGER
from colossalai.tensor.moe_tensor.api import is_moe_tensor
from colossalai.testing import rerun_if_address_is_in_use, spawn
from tests.test_moe.moe_utils import MoeModel
@@ -6,7 +6,7 @@
from colossalai.booster import Booster
from colossalai.booster.plugin import LowLevelZeroPlugin
from colossalai.booster.plugin.low_level_zero_plugin import LowLevelZeroModel
-from colossalai.moe.manager import MOE_MANAGER
+from colossalai.legacy.moe.manager import MOE_MANAGER

# from colossalai.shardformer.layer.moe import apply_load_balance
from colossalai.tensor.moe_tensor.api import is_moe_tensor