Commit
Qwen-1.5-MoE router strategy alignment with huggingface implementation (#241)

Co-authored-by: 同润 <jerry.lp@alibaba-inc.com>
jerryli1981 and 同润 committed May 29, 2024
1 parent a002437 commit 3de1394
Showing 20 changed files with 273 additions and 96 deletions.
2 changes: 1 addition & 1 deletion examples/qwen1_5/README.md
@@ -404,7 +404,7 @@ sh hf2mcore_qwen1.5_dense_to_moe_convertor.sh \
```bash
cd /workspace/Pai-Megatron-Patch/toolkits/model_checkpoints_convertor/qwen \
bash hf2mcore_qwen1.5_moe_convertor.sh \
-2.7B \
+A2.7B \
/mnt/qwen-ckpts/Qwen1.5-MoE-A2.7B \
/mnt/qwen-ckpts/Qwen1.5-MoE-A2.7B-to-mcore-tp2-pp1-ep4 \
2 \
14 changes: 13 additions & 1 deletion megatron_patch/model/deepseek_v2/moe/router.py
@@ -1,4 +1,16 @@
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2024 Alibaba PAI and Nvidia Megatron-LM Team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import math
from abc import ABC, abstractmethod
14 changes: 13 additions & 1 deletion megatron_patch/model/deepseek_v2/moe/token_dispatcher.py
@@ -1,4 +1,16 @@
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2024 Alibaba PAI and Nvidia Megatron-LM Team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from abc import abstractmethod
from typing import List, Optional, Tuple
61 changes: 14 additions & 47 deletions megatron_patch/model/deepseek_v2/transformer/attention.py
@@ -1,4 +1,17 @@
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2024 Alibaba PAI and Nvidia Megatron-LM Team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import Union
@@ -54,7 +67,6 @@ def __init__(
# For normal attention without groups, num_query_groups == num_attention_heads,
# so these two will be the same
self.query_projection_size = self.config.v_head_dim * self.config.num_attention_heads
#self.query_projection_size = 196 * self.config.num_attention_heads
self.kv_projection_size = self.config.kv_channels * self.config.num_query_groups

# Per attention head and per partition values.
@@ -219,43 +231,6 @@ def forward(

core_attn_out = attn_output.reshape(q_len, bsz, self.num_heads * self.config.v_head_dim)

# ==================================
# core attention computation
# ==================================
#torch.Size([96, 1, 16384])
"""
attn_mask_type = self.attn_mask_type
if self.checkpoint_core_attention and self.training:
core_attn_out = self._checkpointed_attention_forward(
query,
key,
value,
attention_mask,
attn_mask_type=attn_mask_type,
packed_seq_params=packed_seq_params,
)
else:
core_attn_out = self.core_attention(
query,
key,
value,
attention_mask,
attn_mask_type=attn_mask_type,
packed_seq_params=packed_seq_params,
)
if packed_seq_params is not None:
# reshape to same output shape as unpacked case
# (t, np, hn) -> (t, b=1, h=np*hn)
# t is the pack size = sum (sq_i)
# note that batch is a dummy dimension in the packed case
core_attn_out = core_attn_out.reshape(core_attn_out.size(0), 1, -1)
"""
# =================
# Output. [sq, b, h]
# =================
# torch.Size([96, 1, 16384])

output, bias = self.linear_proj(core_attn_out)

return output, bias
@@ -436,14 +411,6 @@ def get_query_key_value_tensors(self, hidden_states, key_value_states=None, posi
key_states[:, :, :, : self.config.qk_nope_head_dim] = k_nope
key_states[:, :, :, self.config.qk_nope_head_dim :] = k_pe

#[1, 128, 96, 192] -> [96, 1, 128, 192]
#query = query_states.transpose(0, 2).transpose(1, 2)
#key = key_states.transpose(0, 2).transpose(1, 2)

#value = torch.zeros_like(value_states.new_empty(bsz, self.num_heads, q_len, self.q_head_dim))
#value[:, :, :, : self.config.qk_nope_head_dim] = value_states
#value = value.transpose(0, 2).transpose(1, 2)
#value = value_states.transpose(0, 2).transpose(1, 2)
return query_states, key_states, value_states
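For orientation, a toy sketch of the [sq, b, h] layout that the surviving reshape in forward produces. The dimensions are illustrative assumptions, chosen only to reproduce the torch.Size([96, 1, 16384]) noted in the deleted comments:

```python
import torch

# Illustrative dimensions only: q_len=96, bsz=1, and 128 heads x 128 v_head_dim
# reproduce the torch.Size([96, 1, 16384]) mentioned in the removed comments.
q_len, bsz, num_heads, v_head_dim = 96, 1, 128, 128

# Assuming attn_output is already laid out [q_len, bsz, num_heads, v_head_dim],
# a plain reshape (no transpose needed) collapses the head dimensions into
# Megatron's [sq, b, h] hidden layout.
attn_output = torch.randn(q_len, bsz, num_heads, v_head_dim)
core_attn_out = attn_output.reshape(q_len, bsz, num_heads * v_head_dim)
print(core_attn_out.shape)  # torch.Size([96, 1, 16384])
```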


14 changes: 13 additions & 1 deletion megatron_patch/model/deepseek_v2/transformer/mlp.py
@@ -1,4 +1,16 @@
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2024 Alibaba PAI and Nvidia Megatron-LM Team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from dataclasses import dataclass
from typing import Optional, Tuple, Union
14 changes: 13 additions & 1 deletion megatron_patch/model/deepseek_v2/transformer_block.py
@@ -1,4 +1,16 @@
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2024 Alibaba PAI and Nvidia Megatron-LM Team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import re
from contextlib import nullcontext
20 changes: 13 additions & 7 deletions megatron_patch/model/deepseek_v2/transformer_layer.py
@@ -1,4 +1,16 @@
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2024 Alibaba PAI and Nvidia Megatron-LM Team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from abc import ABC
from dataclasses import dataclass, field
@@ -126,12 +138,6 @@ def __init__(
## [Module 9: BiasDropoutFusion]
self.mlp_bda = build_module(submodules.mlp_bda)

# @jcasper how should we handle nvfuser?
# Set bias+dropout+add fusion grad_enable execution handler.
# TORCH_MAJOR = int(torch.__version__.split('.')[0])
# TORCH_MINOR = int(torch.__version__.split('.')[1])
# use_nvfuser = TORCH_MAJOR > 1 or (TORCH_MAJOR == 1 and TORCH_MINOR >= 10)
# self.bias_dropout_add_exec_handler = nullcontext if use_nvfuser else torch.enable_grad
self.bias_dropout_add_exec_handler = torch.enable_grad

def _get_layer_offset(self):
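An aside on the surviving line: the handler is now pinned to torch.enable_grad unconditionally. A minimal sketch of the context-manager-factory usage pattern this implies (the call site is assumed, not shown in this diff):

```python
import torch

# torch.enable_grad() re-enables autograd even inside an enclosing
# no_grad() region, which is what the fused bias+dropout+add relies on.
bias_dropout_add_exec_handler = torch.enable_grad

x = torch.ones(3, requires_grad=True)
with torch.no_grad():
    with bias_dropout_add_exec_handler():
        y = x * 2.0  # grad tracking is active again inside the handler
print(y.requires_grad)  # True
```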
14 changes: 13 additions & 1 deletion megatron_patch/model/llama3/gpt_model.py
@@ -1,4 +1,16 @@
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2024 Alibaba PAI and Nvidia Megatron-LM Team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""GPT-2 model."""

2 changes: 0 additions & 2 deletions megatron_patch/model/llama3/language_model.py
@@ -16,8 +16,6 @@

from .transformer_legacy import ParallelTransformer



def parallel_lm_logits(input_, word_embeddings_weight, parallel_output,
bias=None):
"""LM logits using word embedding weights."""
14 changes: 13 additions & 1 deletion megatron_patch/model/llama3/model.py
@@ -1,4 +1,16 @@
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2024 Alibaba PAI and Nvidia Megatron-LM Team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
from typing import Dict, Literal, Optional, Tuple, Union
15 changes: 14 additions & 1 deletion megatron_patch/model/llama3/transformer/attention.py
@@ -1,4 +1,17 @@
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2024 Alibaba PAI and Nvidia Megatron-LM Team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from abc import ABC, abstractmethod
from dataclasses import dataclass
from importlib.metadata import version
14 changes: 13 additions & 1 deletion megatron_patch/model/llama3/transformer/mlp.py
@@ -1,4 +1,16 @@
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2024 Alibaba PAI and Nvidia Megatron-LM Team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from dataclasses import dataclass
from typing import Optional, Tuple, Union
14 changes: 13 additions & 1 deletion megatron_patch/model/llama3/transformer_legacy.py
@@ -1,4 +1,16 @@
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2024 Alibaba PAI and Nvidia Megatron-LM Team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Transformer."""
from contextlib import nullcontext
14 changes: 13 additions & 1 deletion megatron_patch/model/qwen1_5/model.py
@@ -1,4 +1,16 @@
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2024 Alibaba PAI and Nvidia Megatron-LM Team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
from typing import Dict, Literal, Optional, Tuple, Union
22 changes: 19 additions & 3 deletions megatron_patch/model/qwen1_5/moe/router.py
@@ -1,4 +1,16 @@
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2024 Alibaba PAI and Nvidia Megatron-LM Team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import math
from abc import ABC, abstractmethod
@@ -145,8 +157,12 @@ def aux_loss_load_balancing(self, logits: torch.Tensor):
Returns:
Tuple[torch.Tensor, torch.Tensor]: The scores and the indices tensor after applying load balancing.
"""
-top_logits, indices = torch.topk(logits, k=self.topk, dim=1)
-scores = torch.softmax(top_logits, dim=-1, dtype=torch.float32).type_as(logits)
+#top_logits, indices = torch.topk(logits, k=self.topk, dim=1)
+#scores = torch.softmax(top_logits, dim=-1, dtype=torch.float32).type_as(logits)
+
+routing_weights = torch.softmax(logits, dim=1, dtype=torch.float)
+scores, indices = torch.topk(routing_weights, k=self.topk, dim=-1)

# Apply load balancing loss
probs = torch.softmax(logits, dim=-1, dtype=torch.float32)
scores = self.apply_load_balancing_loss(probs, indices, activation=scores)
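To make the router change concrete, here is a minimal standalone sketch (not the patched module itself) contrasting the two strategies; the token count, expert count, and k below are illustrative assumptions. Because softmax is monotonic within each row, both variants select the same experts, but the HuggingFace-aligned variant keeps the full-softmax probabilities as gate weights rather than renormalizing over the selected k:

```python
import torch

def topk_then_softmax(logits, k):
    # Previous behavior: pick the top-k logits, then softmax over only the
    # selected experts, so each token's gate weights sum to 1.
    top_logits, indices = torch.topk(logits, k=k, dim=-1)
    scores = torch.softmax(top_logits, dim=-1, dtype=torch.float32)
    return scores, indices

def softmax_then_topk(logits, k):
    # HuggingFace-aligned behavior: softmax over all experts first, then keep
    # the top-k probabilities unrenormalized, so gate weights sum to < 1.
    routing_weights = torch.softmax(logits, dim=-1, dtype=torch.float32)
    scores, indices = torch.topk(routing_weights, k=k, dim=-1)
    return scores, indices

torch.manual_seed(0)
logits = torch.randn(2, 8)  # 2 tokens, 8 experts (illustrative sizes)
s_old, i_old = topk_then_softmax(logits, k=4)
s_new, i_new = softmax_then_topk(logits, k=4)
assert torch.equal(i_old, i_new)         # same experts are selected either way
assert not torch.allclose(s_old, s_new)  # but the gate weights differ
print(s_old.sum(dim=-1))  # ~1.0 per token
print(s_new.sum(dim=-1))  # < 1.0 per token
```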
14 changes: 13 additions & 1 deletion megatron_patch/model/qwen1_5/moe/token_dispatcher.py
@@ -1,4 +1,16 @@
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2024 Alibaba PAI and Nvidia Megatron-LM Team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from abc import abstractmethod
from typing import List, Optional, Tuple
15 changes: 14 additions & 1 deletion megatron_patch/model/qwen1_5/transformer/attention.py
@@ -1,4 +1,17 @@
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2024 Alibaba PAI and Nvidia Megatron-LM Team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from abc import ABC, abstractmethod
from dataclasses import dataclass
from importlib.metadata import version