[moe] clean legacy code
botbw committed Jul 19, 2024
1 parent 8d3d7f3 commit c8bf268
Showing 39 changed files with 163 additions and 173 deletions.
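Most of this commit is an import-path migration: the MoE manager, utilities, and related example/test code move under colossalai.legacy.moe, while the autograd/communication ops formerly in colossalai.moe._operation become the public module colossalai.moe.operators. As a quick orientation, here is a minimal before/after sketch of the change callers need to make (module and symbol names are taken from the diffs below; the grouping itself is illustrative):

# Before this commit
from colossalai.moe.manager import MOE_MANAGER
from colossalai.moe.utils import get_activation, skip_init
from colossalai.moe._operation import EPGradScalerIn, EPGradScalerOut

# After this commit
from colossalai.legacy.moe.manager import MOE_MANAGER
from colossalai.legacy.moe.utils import get_activation, skip_init
from colossalai.moe.operators import EPGradScalerIn, EPGradScalerOut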
File renamed without changes.
@@ -5,9 +5,9 @@
import torch.nn as nn

from colossalai.kernel.triton.llama_act_combine_kernel import HAS_TRITON
-from colossalai.moe._operation import EPGradScalerIn, EPGradScalerOut
-from colossalai.moe.manager import MOE_MANAGER
-from colossalai.moe.utils import get_activation
+from colossalai.legacy.moe.manager import MOE_MANAGER
+from colossalai.legacy.moe.utils import get_activation
+from colossalai.moe.operators import EPGradScalerIn, EPGradScalerOut
from colossalai.shardformer.layer.utils import Randomizer
from colossalai.tensor.moe_tensor.api import get_ep_rank, get_ep_size

@@ -7,9 +7,9 @@
import torch.nn as nn
import torch.nn.functional as F

-from colossalai.moe._operation import AllGather, AllToAll, HierarchicalAllToAll, MoeCombine, MoeDispatch, ReduceScatter
-from colossalai.moe.load_balance import LoadBalancer
-from colossalai.moe.utils import create_ep_hierarchical_group, get_noise_generator
+from colossalai.legacy.moe.load_balance import LoadBalancer
+from colossalai.legacy.moe.utils import create_ep_hierarchical_group, get_noise_generator
+from colossalai.moe.operators import AllGather, AllToAll, HierarchicalAllToAll, MoeCombine, MoeDispatch, ReduceScatter
from colossalai.shardformer.layer.moe import MLPExperts
from colossalai.tensor.moe_tensor.api import get_dp_group, get_ep_group, get_ep_group_ranks, get_ep_size

@@ -5,9 +5,9 @@
import torch.nn as nn

from colossalai.kernel.triton.llama_act_combine_kernel import HAS_TRITON
-from colossalai.moe._operation import EPGradScalerIn, EPGradScalerOut
-from colossalai.moe.manager import MOE_MANAGER
-from colossalai.moe.utils import get_activation
+from colossalai.legacy.moe.manager import MOE_MANAGER
+from colossalai.legacy.moe.utils import get_activation
+from colossalai.moe.operators import EPGradScalerIn, EPGradScalerOut
from colossalai.shardformer.layer.utils import Randomizer
from colossalai.tensor.moe_tensor.api import get_ep_rank, get_ep_size

@@ -7,7 +7,7 @@
from torch.distributed import ProcessGroup

from colossalai.cluster import ProcessGroupMesh
-from colossalai.moe.manager import MOE_MANAGER
+from colossalai.legacy.moe.manager import MOE_MANAGER
from colossalai.shardformer.layer.moe import MLPExperts
from colossalai.zero.low_level import LowLevelZeroOptimizer

File renamed without changes.
File renamed without changes.
@@ -18,9 +18,9 @@
from colossalai.booster import Booster
from colossalai.booster.plugin.moe_hybrid_parallel_plugin import MoeHybridParallelPlugin
from colossalai.cluster import DistCoordinator
+from colossalai.legacy.moe.manager import MOE_MANAGER
+from colossalai.legacy.moe.utils import skip_init
from colossalai.moe.layers import apply_load_balance
-from colossalai.moe.manager import MOE_MANAGER
-from colossalai.moe.utils import skip_init
from colossalai.nn.optimizer import HybridAdam


@@ -14,7 +14,7 @@
from transformers.models.llama import LlamaConfig
from utils import PerformanceEvaluator, get_model_numel

-from colossalai.moe.manager import MOE_MANAGER
+from colossalai.legacy.moe.manager import MOE_MANAGER


class RandomDataset(Dataset):
File renamed without changes.
File renamed without changes.
File renamed without changes.
@@ -50,8 +50,8 @@
except:
HAS_FLASH_ATTN = False
from colossalai.kernel.triton.llama_act_combine_kernel import HAS_TRITON
-from colossalai.moe.manager import MOE_MANAGER
-from colossalai.moe.utils import get_activation, set_moe_args
+from colossalai.legacy.moe.manager import MOE_MANAGER
+from colossalai.legacy.moe.utils import get_activation, set_moe_args
from colossalai.shardformer.layer.moe import SparseMLP

if HAS_TRITON:
@@ -9,7 +9,7 @@
from transformers.modeling_outputs import CausalLMOutputWithPast
from transformers.utils import logging

-from colossalai.moe.manager import MOE_MANAGER
+from colossalai.legacy.moe.manager import MOE_MANAGER
from colossalai.pipeline.stage_manager import PipelineStageManager
from colossalai.shardformer.layer import FusedRMSNorm, Linear1D_Col
from colossalai.shardformer.policies.base_policy import ModulePolicyDescription, Policy, SubModuleReplacementDescription
File renamed without changes.
File renamed without changes.
@@ -19,7 +19,7 @@
from colossalai.booster import Booster
from colossalai.booster.plugin.moe_hybrid_parallel_plugin import MoeHybridParallelPlugin
from colossalai.cluster import DistCoordinator
-from colossalai.moe.utils import skip_init
+from colossalai.legacy.moe.utils import skip_init
from colossalai.nn.optimizer import HybridAdam
from colossalai.shardformer.layer.moe import apply_load_balance

File renamed without changes.
2 changes: 1 addition & 1 deletion colossalai/moe/utils.py → colossalai/legacy/moe/utils.py
@@ -9,7 +9,7 @@
from torch.distributed.distributed_c10d import get_process_group_ranks

from colossalai.accelerator import get_accelerator
-from colossalai.moe.manager import MOE_MANAGER
+from colossalai.legacy.moe.manager import MOE_MANAGER
from colossalai.tensor.moe_tensor.api import is_moe_tensor


5 changes: 0 additions & 5 deletions colossalai/moe/__init__.py
@@ -1,5 +0,0 @@
from .manager import MOE_MANAGER

__all__ = [
"MOE_MANAGER",
]
2 changes: 2 additions & 0 deletions colossalai/moe/_operation.py → colossalai/moe/operators.py
@@ -469,6 +469,8 @@ def all_to_all_uneven(
# See the License for the specific language governing permissions and
# limitations under the License.

+# TODO: used when non-moe are tp but moe are not
+

def _gather_tokens(input_, dim: int, tp_group: ProcessGroup):
"""Gather tensors and concatenate them along a dimension"""
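The hunk above shows only the signature and docstring of _gather_tokens; its body is collapsed in this view. For readers unfamiliar with the pattern, a gather-and-concatenate over a tensor-parallel group typically looks like the sketch below (an illustration of the idiom, not the file's actual implementation):

import torch
import torch.distributed as dist
from torch.distributed import ProcessGroup


def _gather_tokens_sketch(input_: torch.Tensor, dim: int, tp_group: ProcessGroup) -> torch.Tensor:
    """Gather tensors from every rank in tp_group and concatenate them along `dim`."""
    world_size = dist.get_world_size(tp_group)
    if world_size == 1:
        return input_
    # One buffer per rank, filled by all_gather with each rank's local shard.
    gather_list = [torch.empty_like(input_) for _ in range(world_size)]
    dist.all_gather(gather_list, input_.contiguous(), group=tp_group)
    return torch.cat(gather_list, dim=dim)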
8 changes: 1 addition & 7 deletions colossalai/shardformer/modeling/mixtral.py
@@ -14,13 +14,7 @@
from transformers.utils import is_flash_attn_2_available, logging

from colossalai.lazy import LazyInitContext
-from colossalai.moe._operation import (
-    DPGradScalerIn,
-    DPGradScalerOut,
-    EPGradScalerIn,
-    EPGradScalerOut,
-    all_to_all_uneven,
-)
+from colossalai.moe.operators import DPGradScalerIn, DPGradScalerOut, EPGradScalerIn, EPGradScalerOut, all_to_all_uneven
from colossalai.pipeline.stage_manager import PipelineStageManager
from colossalai.shardformer.layer.linear import Linear1D_Col, Linear1D_Row
from colossalai.shardformer.shard import ShardConfig
136 changes: 136 additions & 0 deletions tests/test_legacy/test_moe/moe_utils.py
@@ -0,0 +1,136 @@
import torch
import torch.distributed as dist
import torch.nn as nn
from torch.distributed import ProcessGroup

from colossalai.booster.plugin.low_level_zero_plugin import LowLevelZeroModel
from colossalai.legacy.engine.gradient_handler._base_gradient_handler import BaseGradientHandler
from colossalai.legacy.engine.gradient_handler.utils import bucket_allreduce
from colossalai.legacy.moe.manager import MOE_MANAGER
from colossalai.legacy.moe.utils import get_moe_epsize_param_dict
from colossalai.legacy.registry import GRADIENT_HANDLER
from colossalai.tensor.moe_tensor.api import get_ep_group, get_ep_size, set_moe_tensor_ep_group


def delete_moe_info(model):
for _, param in model.named_parameters():
if hasattr(param, "ep_group"):
delattr(param, "ep_group")


class MoeModel(nn.Module):
def __init__(self, ep_group: ProcessGroup = None):
super().__init__()
self.test_embed = nn.Linear(4, 16, bias=False)
self.w1 = torch.nn.Parameter(torch.randn(16, 8))
if ep_group:
set_moe_tensor_ep_group(self.w1, ep_group)

def forward(self, x):
x = self.test_embed(x)
x = torch.matmul(x, self.w1)

return x


@GRADIENT_HANDLER.register_module
class MoeGradientHandler(BaseGradientHandler):
"""A helper class to handle all-reduce operations in a data parallel group and
moe model parallel. A all-reduce collective communication will be operated in
:func:`handle_gradient` among a data parallel group.
For better performance, it bucketizes the gradients of all parameters that are
the same type to improve the efficiency of communication.
Args:
model (Module): Model where the gradients accumulate.
optimizer (Optimizer): Optimizer for updating the parameters.
"""

def __init__(self, model, optimizer=None):
super().__init__(model, optimizer)

def handle_gradient(self):
"""A method running an all-reduce operation in a data parallel group.
Then running an all-reduce operation for all parameters in experts
across moe model parallel group
"""
if dist.get_world_size() > 1:
epsize_param_dict = get_moe_epsize_param_dict(self._model)

# epsize is 1, indicating the params are replicated among processes in data parallelism
# use the ParallelMode.DATA to get data parallel group
# reduce gradients for all parameters in data parallelism
if 1 in epsize_param_dict:
bucket_allreduce(param_list=epsize_param_dict[1])

for ep_size in epsize_param_dict:
if ep_size != 1 and ep_size != MOE_MANAGER.world_size:
bucket_allreduce(
param_list=epsize_param_dict[ep_size], group=MOE_MANAGER.parallel_info_dict[ep_size].dp_group
)


def assert_not_equal_in_group(tensor, process_group=None):
# all gather tensors from different ranks
world_size = dist.get_world_size(process_group)
tensor_list = [torch.empty_like(tensor) for _ in range(world_size)]
dist.all_gather(tensor_list, tensor, group=process_group)

# check if they are equal one by one
for i in range(world_size - 1):
a = tensor_list[i]
b = tensor_list[i + 1]
assert not torch.allclose(a, b), (
f"expected tensors on rank {i} and {i + 1} not to be equal " f"but they are, {a} vs {b}"
)


def run_fwd_bwd(model, data, label, criterion, optimizer, enable_autocast=False):
model.train()
with torch.cuda.amp.autocast(enabled=enable_autocast):
if criterion:
y = model(data)
loss = criterion(y, label)
else:
loss = model(data, label)
loss = loss.float()

if isinstance(model, LowLevelZeroModel):
optimizer.backward(loss)
else:
loss.backward()
return y


def sync_local_from_ep(local_model, ep_model, assert_grad_flag: bool = False) -> None:
"""Sync the parameters of tp model from ep model
Args:
local_model (MoeModule)
ep_model (MoeModule)
"""
for (local_name, local_param), (ep_name, ep_param) in zip(
local_model.named_parameters(), ep_model.named_parameters()
):
if "experts" not in local_name:
if assert_grad_flag:
assert torch.allclose(local_param, ep_param), f"local_param: {local_param}, ep_param: {ep_param}"
assert torch.allclose(local_param.grad, ep_param.grad)
else:
local_param.data.copy_(ep_param.data)
continue

# gather param from ep model
param_list = [torch.zeros_like(ep_param) for _ in range(get_ep_size(ep_param))]
dist.all_gather(param_list, ep_param, group=get_ep_group(ep_param))
all_param = torch.cat(param_list, dim=0)
if assert_grad_flag:
grad_list = [torch.zeros_like(ep_param) for _ in range(get_ep_size(ep_param))]
dist.all_gather(grad_list, ep_param.grad, group=get_ep_group(ep_param))
all_grad = torch.cat(grad_list, dim=0)

if assert_grad_flag:
assert torch.allclose(local_param, all_param)
assert torch.allclose(local_param.grad, all_grad)
else:
local_param.data.copy_(all_param.data)
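For context, here is a rough sketch of how the helpers in this new test module fit together in a test body. The module path is the one added by this commit; the model/optimizer setup and tensor shapes are illustrative, and a distributed environment with a CUDA device is assumed to have been initialized already (e.g. via colossalai.launch):

import torch

from tests.test_legacy.test_moe.moe_utils import MoeGradientHandler, MoeModel, run_fwd_bwd

model = MoeModel().cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
criterion = torch.nn.MSELoss()

data = torch.randn(2, 4, device="cuda")   # MoeModel.test_embed expects 4 input features
label = torch.randn(2, 8, device="cuda")  # output is (batch, 8) after the (16, 8) w1 matmul

run_fwd_bwd(model, data, label, criterion, optimizer)
MoeGradientHandler(model).handle_gradient()  # bucket all-reduce of replicated (non-expert) grads
optimizer.step()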
@@ -5,7 +5,7 @@

import colossalai
from colossalai.accelerator import get_accelerator
-from colossalai.moe.manager import MOE_MANAGER
+from colossalai.legacy.moe.manager import MOE_MANAGER

# from colossalai.shardformer.layer.moe.layers import SparseMLP
from colossalai.testing import assert_equal_in_group, rerun_if_address_is_in_use, spawn
File renamed without changes.
@@ -4,8 +4,8 @@

import colossalai
from colossalai.accelerator import get_accelerator
-from colossalai.moe.manager import MOE_MANAGER
-from colossalai.moe.utils import sync_moe_model_param
+from colossalai.legacy.moe.manager import MOE_MANAGER
+from colossalai.legacy.moe.utils import sync_moe_model_param

# from colossalai.shardformer.layer.moe import MLPExperts
from colossalai.testing import assert_equal_in_group, rerun_if_address_is_in_use, spawn
@@ -6,7 +6,7 @@
from colossalai.booster import Booster
from colossalai.booster.plugin import LowLevelZeroPlugin
from colossalai.booster.plugin.low_level_zero_plugin import LowLevelZeroModel
-from colossalai.moe.manager import MOE_MANAGER
+from colossalai.legacy.moe.manager import MOE_MANAGER
from colossalai.tensor.moe_tensor.api import is_moe_tensor
from colossalai.testing import rerun_if_address_is_in_use, spawn
from tests.test_moe.moe_utils import MoeModel
@@ -6,7 +6,7 @@
from colossalai.booster import Booster
from colossalai.booster.plugin import LowLevelZeroPlugin
from colossalai.booster.plugin.low_level_zero_plugin import LowLevelZeroModel
-from colossalai.moe.manager import MOE_MANAGER
+from colossalai.legacy.moe.manager import MOE_MANAGER

# from colossalai.shardformer.layer.moe import apply_load_balance
from colossalai.tensor.moe_tensor.api import is_moe_tensor