Commit
Qwen-1.5-MoE router strategy alignment with huggingface implementation (#241)

Co-authored-by: 同润 <jerry.lp@alibaba-inc.com>
jerryli1981 and 同润 committed May 29, 2024
1 parent a002437 commit 3de1394
Showing 20 changed files with 273 additions and 96 deletions.
2 changes: 1 addition & 1 deletion examples/qwen1_5/README.md
@@ -404,7 +404,7 @@ sh hf2mcore_qwen1.5_dense_to_moe_convertor.sh \
```bash
cd /workspace/Pai-Megatron-Patch/toolkits/model_checkpoints_convertor/qwen \
bash hf2mcore_qwen1.5_moe_convertor.sh \
-2.7B \
+A2.7B \
/mnt/qwen-ckpts/Qwen1.5-MoE-A2.7B \
/mnt/qwen-ckpts/Qwen1.5-MoE-A2.7B-to-mcore-tp2-pp1-ep4 \
2 \
14 changes: 13 additions & 1 deletion megatron_patch/model/deepseek_v2/moe/router.py
@@ -1,4 +1,16 @@
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2024 Alibaba PAI and Nvidia Megatron-LM Team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import math
from abc import ABC, abstractmethod
14 changes: 13 additions & 1 deletion megatron_patch/model/deepseek_v2/moe/token_dispatcher.py
@@ -1,4 +1,16 @@
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2024 Alibaba PAI and Nvidia Megatron-LM Team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from abc import abstractmethod
from typing import List, Optional, Tuple
61 changes: 14 additions & 47 deletions megatron_patch/model/deepseek_v2/transformer/attention.py
@@ -1,4 +1,17 @@
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2024 Alibaba PAI and Nvidia Megatron-LM Team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import Union
@@ -54,7 +67,6 @@ def __init__(
# For normal attention without groups, num_query_groups == num_attention_heads,
# so these two will be the same
self.query_projection_size = self.config.v_head_dim * self.config.num_attention_heads
#self.query_projection_size = 196 * self.config.num_attention_heads
self.kv_projection_size = self.config.kv_channels * self.config.num_query_groups

# Per attention head and per partition values.
@@ -219,43 +231,6 @@ def forward(

core_attn_out = attn_output.reshape(q_len, bsz, self.num_heads * self.config.v_head_dim)

# ==================================
# core attention computation
# ==================================
#torch.Size([96, 1, 16384])
"""
attn_mask_type = self.attn_mask_type
if self.checkpoint_core_attention and self.training:
core_attn_out = self._checkpointed_attention_forward(
query,
key,
value,
attention_mask,
attn_mask_type=attn_mask_type,
packed_seq_params=packed_seq_params,
)
else:
core_attn_out = self.core_attention(
query,
key,
value,
attention_mask,
attn_mask_type=attn_mask_type,
packed_seq_params=packed_seq_params,
)
if packed_seq_params is not None:
# reshape to same output shape as unpacked case
# (t, np, hn) -> (t, b=1, h=np*hn)
# t is the pack size = sum (sq_i)
# note that batch is a dummy dimension in the packed case
core_attn_out = core_attn_out.reshape(core_attn_out.size(0), 1, -1)
"""
# =================
# Output. [sq, b, h]
# =================
# torch.Size([96, 1, 16384])

output, bias = self.linear_proj(core_attn_out)

return output, bias
@@ -436,14 +411,6 @@ def get_query_key_value_tensors(self, hidden_states, key_value_states=None, posi
key_states[:, :, :, : self.config.qk_nope_head_dim] = k_nope
key_states[:, :, :, self.config.qk_nope_head_dim :] = k_pe

#[1, 128, 96, 192] -> [96, 1, 128, 192]
#query = query_states.transpose(0, 2).transpose(1, 2)
#key = key_states.transpose(0, 2).transpose(1, 2)

#value = torch.zeros_like(value_states.new_empty(bsz, self.num_heads, q_len, self.q_head_dim))
#value[:, :, :, : self.config.qk_nope_head_dim] = value_states
#value = value.transpose(0, 2).transpose(1, 2)
#value = value_states.transpose(0, 2).transpose(1, 2)
return query_states, key_states, value_states
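For orientation, a toy sketch of the [sq, b, h] layout that the surviving reshape in forward produces. The dimensions are illustrative assumptions, chosen only to reproduce the torch.Size([96, 1, 16384]) noted in the deleted comments:

```python
import torch

# Illustrative dimensions only: q_len=96, bsz=1, and 128 heads x 128 v_head_dim
# reproduce the torch.Size([96, 1, 16384]) mentioned in the removed comments.
q_len, bsz, num_heads, v_head_dim = 96, 1, 128, 128

# Assuming attn_output is already laid out [q_len, bsz, num_heads, v_head_dim],
# a plain reshape (no transpose needed) collapses the head dimensions into
# Megatron's [sq, b, h] hidden layout.
attn_output = torch.randn(q_len, bsz, num_heads, v_head_dim)
core_attn_out = attn_output.reshape(q_len, bsz, num_heads * v_head_dim)
print(core_attn_out.shape)  # torch.Size([96, 1, 16384])
```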


14 changes: 13 additions & 1 deletion megatron_patch/model/deepseek_v2/transformer/mlp.py
@@ -1,4 +1,16 @@
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2024 Alibaba PAI and Nvidia Megatron-LM Team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from dataclasses import dataclass
from typing import Optional, Tuple, Union
14 changes: 13 additions & 1 deletion megatron_patch/model/deepseek_v2/transformer_block.py
@@ -1,4 +1,16 @@
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2024 Alibaba PAI and Nvidia Megatron-LM Team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import re
from contextlib import nullcontext
20 changes: 13 additions & 7 deletions megatron_patch/model/deepseek_v2/transformer_layer.py
@@ -1,4 +1,16 @@
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2024 Alibaba PAI and Nvidia Megatron-LM Team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from abc import ABC
from dataclasses import dataclass, field
@@ -126,12 +138,6 @@ def __init__(
## [Module 9: BiasDropoutFusion]
self.mlp_bda = build_module(submodules.mlp_bda)

# @jcasper how should we handle nvfuser?
# Set bias+dropout+add fusion grad_enable execution handler.
# TORCH_MAJOR = int(torch.__version__.split('.')[0])
# TORCH_MINOR = int(torch.__version__.split('.')[1])
# use_nvfuser = TORCH_MAJOR > 1 or (TORCH_MAJOR == 1 and TORCH_MINOR >= 10)
# self.bias_dropout_add_exec_handler = nullcontext if use_nvfuser else torch.enable_grad
self.bias_dropout_add_exec_handler = torch.enable_grad

def _get_layer_offset(self):
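An aside on the surviving line: the handler is now pinned to torch.enable_grad unconditionally. A minimal sketch of the context-manager-factory usage pattern this implies (the call site is assumed, not shown in this diff):

```python
import torch

# torch.enable_grad() re-enables autograd even inside an enclosing
# no_grad() region, which is what the fused bias+dropout+add relies on.
bias_dropout_add_exec_handler = torch.enable_grad

x = torch.ones(3, requires_grad=True)
with torch.no_grad():
    with bias_dropout_add_exec_handler():
        y = x * 2.0  # grad tracking is active again inside the handler
print(y.requires_grad)  # True
```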
14 changes: 13 additions & 1 deletion megatron_patch/model/llama3/gpt_model.py
@@ -1,4 +1,16 @@
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2024 Alibaba PAI and Nvidia Megatron-LM Team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""GPT-2 model."""

2 changes: 0 additions & 2 deletions megatron_patch/model/llama3/language_model.py
@@ -16,8 +16,6 @@

from .transformer_legacy import ParallelTransformer



def parallel_lm_logits(input_, word_embeddings_weight, parallel_output,
bias=None):
"""LM logits using word embedding weights."""
14 changes: 13 additions & 1 deletion megatron_patch/model/llama3/model.py
@@ -1,4 +1,16 @@
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2024 Alibaba PAI and Nvidia Megatron-LM Team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
from typing import Dict, Literal, Optional, Tuple, Union
15 changes: 14 additions & 1 deletion megatron_patch/model/llama3/transformer/attention.py
@@ -1,4 +1,17 @@
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2024 Alibaba PAI and Nvidia Megatron-LM Team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from abc import ABC, abstractmethod
from dataclasses import dataclass
from importlib.metadata import version
14 changes: 13 additions & 1 deletion megatron_patch/model/llama3/transformer/mlp.py
@@ -1,4 +1,16 @@
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2024 Alibaba PAI and Nvidia Megatron-LM Team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from dataclasses import dataclass
from typing import Optional, Tuple, Union
14 changes: 13 additions & 1 deletion megatron_patch/model/llama3/transformer_legacy.py
@@ -1,4 +1,16 @@
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2024 Alibaba PAI and Nvidia Megatron-LM Team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Transformer."""
from contextlib import nullcontext
14 changes: 13 additions & 1 deletion megatron_patch/model/qwen1_5/model.py
@@ -1,4 +1,16 @@
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2024 Alibaba PAI and Nvidia Megatron-LM Team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
from typing import Dict, Literal, Optional, Tuple, Union
22 changes: 19 additions & 3 deletions megatron_patch/model/qwen1_5/moe/router.py
@@ -1,4 +1,16 @@
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2024 Alibaba PAI and Nvidia Megatron-LM Team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import math
from abc import ABC, abstractmethod
@@ -145,8 +157,12 @@ def aux_loss_load_balancing(self, logits: torch.Tensor):
Returns:
Tuple[torch.Tensor, torch.Tensor]: The scores and the indices tensor after applying load balancing.
"""
-top_logits, indices = torch.topk(logits, k=self.topk, dim=1)
-scores = torch.softmax(top_logits, dim=-1, dtype=torch.float32).type_as(logits)
+#top_logits, indices = torch.topk(logits, k=self.topk, dim=1)
+#scores = torch.softmax(top_logits, dim=-1, dtype=torch.float32).type_as(logits)
+
+routing_weights = torch.softmax(logits, dim=1, dtype=torch.float)
+scores, indices = torch.topk(routing_weights, k=self.topk, dim=-1)

# Apply load balancing loss
probs = torch.softmax(logits, dim=-1, dtype=torch.float32)
scores = self.apply_load_balancing_loss(probs, indices, activation=scores)
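To make the router change concrete, here is a minimal standalone sketch (not the patched module itself) contrasting the two strategies; the token count, expert count, and k below are illustrative assumptions. Because softmax is monotonic within each row, both variants select the same experts, but the HuggingFace-aligned variant keeps the full-softmax probabilities as gate weights rather than renormalizing over the selected k:

```python
import torch

def topk_then_softmax(logits, k):
    # Previous behavior: pick the top-k logits, then softmax over only the
    # selected experts, so each token's gate weights sum to 1.
    top_logits, indices = torch.topk(logits, k=k, dim=-1)
    scores = torch.softmax(top_logits, dim=-1, dtype=torch.float32)
    return scores, indices

def softmax_then_topk(logits, k):
    # HuggingFace-aligned behavior: softmax over all experts first, then keep
    # the top-k probabilities unrenormalized, so gate weights sum to < 1.
    routing_weights = torch.softmax(logits, dim=-1, dtype=torch.float32)
    scores, indices = torch.topk(routing_weights, k=k, dim=-1)
    return scores, indices

torch.manual_seed(0)
logits = torch.randn(2, 8)  # 2 tokens, 8 experts (illustrative sizes)
s_old, i_old = topk_then_softmax(logits, k=4)
s_new, i_new = softmax_then_topk(logits, k=4)
assert torch.equal(i_old, i_new)         # same experts are selected either way
assert not torch.allclose(s_old, s_new)  # but the gate weights differ
print(s_old.sum(dim=-1))  # ~1.0 per token
print(s_new.sum(dim=-1))  # < 1.0 per token
```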
14 changes: 13 additions & 1 deletion megatron_patch/model/qwen1_5/moe/token_dispatcher.py
@@ -1,4 +1,16 @@
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2024 Alibaba PAI and Nvidia Megatron-LM Team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from abc import abstractmethod
from typing import List, Optional, Tuple
15 changes: 14 additions & 1 deletion megatron_patch/model/qwen1_5/transformer/attention.py
@@ -1,4 +1,17 @@
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2024 Alibaba PAI and Nvidia Megatron-LM Team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from abc import ABC, abstractmethod
from dataclasses import dataclass
from importlib.metadata import version