[shardformer] fix gathering output when using tensor parallelism #5431

Merged: 15 commits, merged Mar 18, 2024
6 changes: 6 additions & 0 deletions colossalai/booster/plugin/hybrid_parallel_plugin.py
@@ -899,6 +899,7 @@ class HybridParallelPlugin(PipelinePluginBase):
enable_jit_fused (bool, optional): Whether to switch on JIT in Shardformer. Default to False.
enable_sequence_parallelism (bool): Whether to turn on sequence parallelism in Shardformer. Defaults to False.
enable_sequence_overlap (bool): Whether to turn on sequence overlap in Shardformer. Defaults to False.
+parallel_output (bool, optional): Whether to keep the model output sharded across tensor-parallel ranks instead of gathering it. Defaults to True.
num_microbatches (int, optional): Number of microbatches when using pipeline parallelism. Defaults to None.
microbatch_size (int, optional): Microbatch size when using pipeline parallelism.
Either ``num_microbatches`` or ``microbatch_size`` should be provided if using pipeline.
@@ -925,6 +926,7 @@ class HybridParallelPlugin(PipelinePluginBase):
pp_style (str, optional): The style for pipeline parallelism. Defaults to '1f1b'.
num_model_chunks (int, optional): The number of model chunks for interleaved pipeline parallelism. Defaults to 1.
enable_metadata_cache (bool, optional): Whether to enable metadata cache for pipeline parallelism. Defaults to True.
+make_vocab_size_divisible_by (int, optional): Pad the vocabulary size so that it is divisible by this value, which allows a faster CUDA kernel to be selected. Defaults to 128.
"""

def __init__(
@@ -939,6 +941,7 @@ def __init__(
enable_jit_fused: bool = False,
enable_sequence_parallelism: bool = False,
enable_sequence_overlap: bool = False,
+parallel_output: bool = True,
num_microbatches: Optional[int] = None,
microbatch_size: Optional[int] = None,
initial_scale: float = 2**16,
@@ -963,6 +966,7 @@ def __init__(
pp_style: str = "1f1b",
num_model_chunks: int = 1,
enable_metadata_cache: bool = True,
+make_vocab_size_divisible_by: int = 128,
) -> None:
super().__init__()
assert (
@@ -1035,6 +1039,8 @@ def __init__(
enable_jit_fused=self.enable_jit_fused,
enable_sequence_parallelism=enable_sequence_parallelism,
enable_sequence_overlap=enable_sequence_overlap,
+parallel_output=parallel_output,
+make_vocab_size_divisible_by=make_vocab_size_divisible_by,
)
self.amp_config = dict(
initial_scale=initial_scale,
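For reference, a minimal sketch of how the two new plugin options might be passed through in user code. The parallel sizes are arbitrary, and the surrounding Booster workflow is assumed from the usual ColossalAI setup rather than anything added by this PR:

```python
# Illustrative only: tp_size/pp_size are arbitrary, and the distributed environment
# is assumed to have been launched already (e.g. via colossalai.launch_from_torch).
from colossalai.booster import Booster
from colossalai.booster.plugin import HybridParallelPlugin

plugin = HybridParallelPlugin(
    tp_size=2,                         # shard the model over 2 tensor-parallel ranks
    pp_size=1,
    parallel_output=False,             # gather the vocab-parallel logits instead of keeping them sharded
    make_vocab_size_divisible_by=128,  # pad the vocabulary so TP shards stay kernel-friendly
)
booster = Booster(plugin=plugin)
# model, optimizer, criterion, dataloader would then be wrapped via booster.boost(...)
```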
9 changes: 5 additions & 4 deletions colossalai/shardformer/modeling/gpt2.py
@@ -793,11 +793,12 @@ def forward(
scale = scale * (1 / float(self.layer_idx + 1))

# use coloattention
-attention = ColoAttention(
-    embed_dim=self.embed_dim, num_heads=self.num_heads, dropout=self.attn_dropout.p, scale=scale
-)
+if not hasattr(self, "attention"):
+    self.attention = ColoAttention(
+        embed_dim=self.embed_dim, num_heads=self.num_heads, dropout=self.attn_dropout.p, scale=scale
+    )

-attn_output = attention(query, key, value, attn_mask=flash_attention_mask, attn_mask_type=attn_mask_type)
+attn_output = self.attention(query, key, value, attn_mask=flash_attention_mask, attn_mask_type=attn_mask_type)

attn_output = self.c_proj(attn_output)
attn_output = self.resid_dropout(attn_output)
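The change above (and the matching one in llama.py below) avoids rebuilding ColoAttention on every forward pass by caching the instance on the module the first time it is needed. A stripped-down sketch of that lazy-caching pattern, with a placeholder helper standing in for ColoAttention:

```python
# Placeholder sketch of the caching pattern; ExpensiveHelper is illustrative, not ColossalAI code.
import torch
import torch.nn as nn

class ExpensiveHelper:
    """Stands in for ColoAttention: costly to construct once, cheap to reuse."""
    def __init__(self, embed_dim: int):
        self.embed_dim = embed_dim

    def __call__(self, x: torch.Tensor) -> torch.Tensor:
        return x

class Block(nn.Module):
    def __init__(self, embed_dim: int):
        super().__init__()
        self.embed_dim = embed_dim

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Construct the helper on first use and reuse it afterwards,
        # rather than re-instantiating it on every forward pass.
        if not hasattr(self, "helper"):
            self.helper = ExpensiveHelper(self.embed_dim)
        return self.helper(x)
```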
5 changes: 3 additions & 2 deletions colossalai/shardformer/modeling/llama.py
@@ -485,8 +485,9 @@ def forward(
flash_attention_mask = ~(attention_mask[:, :, -1].squeeze(1).to(torch.bool)).contiguous()
attn_mask_type = AttnMaskType.paddedcausal

-attention = ColoAttention(embed_dim=self.hidden_size, num_heads=self.num_heads)
-attn_output = attention(
+if not hasattr(self, "attention"):
+    self.attention = ColoAttention(embed_dim=self.hidden_size, num_heads=self.num_heads)
+attn_output = self.attention(
query_states,
key_states,
value_states,
7 changes: 7 additions & 0 deletions colossalai/shardformer/policies/gpt2.py
@@ -38,6 +38,13 @@ def preprocess(self):
if vocab_size % world_size != 0:
new_vocab_size = vocab_size + world_size - vocab_size % world_size
self.model.resize_token_embeddings(new_vocab_size)
+else:
+    # Make vocab_size divisible by `make_vocab_size_divisible_by` to select a faster CUDA kernel operator.
+    new_vocab_size = vocab_size
+    multiple = self.shard_config.make_vocab_size_divisible_by
+    while (new_vocab_size % multiple) != 0:
+        new_vocab_size += 1
+    self.model.resize_token_embeddings(new_vocab_size)
return self.model

def module_policy(self):
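The loop above pads the vocabulary one token at a time until it reaches the next multiple of make_vocab_size_divisible_by, so the effect is simply rounding up. A small worked sketch of the equivalent closed form (the helper name is ours, and GPT-2's 50257-token vocabulary is used only as a familiar example):

```python
def pad_vocab_size(vocab_size: int, multiple: int = 128) -> int:
    # Round up to the next multiple so the TP-sharded embedding / LM-head
    # matmuls see nicely aligned shapes.
    return ((vocab_size + multiple - 1) // multiple) * multiple

assert pad_vocab_size(50257, 128) == 50304  # GPT-2: padded up by 47 tokens (393 * 128)
assert pad_vocab_size(50304, 128) == 50304  # already aligned, left unchanged
```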
3 changes: 2 additions & 1 deletion colossalai/shardformer/shard/shard_config.py
@@ -34,7 +34,8 @@ class ShardConfig:
enable_all_optimization: bool = False
enable_sequence_parallelism: bool = False
enable_sequence_overlap: bool = False
-parallel_output = True
+parallel_output: bool = True
+make_vocab_size_divisible_by: int = 128
extra_kwargs: Dict[str, Any] = field(default_factory=dict)
# pipeline_parallel_size: int
# data_parallel_size: int
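For context on what the new parallel_output field controls: with tensor parallelism the LM head is sharded along the vocabulary dimension, so each rank only holds a slice of the logits, and parallel_output=False asks Shardformer to gather those slices back into full logits. A conceptual sketch, not the library's actual code path (the function name and group handling are ours, and a real implementation also has to route gradients through the gather):

```python
import torch
import torch.distributed as dist

def gather_vocab_parallel_logits(local_logits: torch.Tensor, tp_group=None) -> torch.Tensor:
    """Reassemble full [batch, seq, vocab] logits from per-rank vocab shards (illustrative)."""
    world_size = dist.get_world_size(group=tp_group)
    if world_size == 1:
        return local_logits
    shards = [torch.empty_like(local_logits) for _ in range(world_size)]
    dist.all_gather(shards, local_logits.contiguous(), group=tp_group)
    return torch.cat(shards, dim=-1)  # concatenate along the vocabulary dimension
```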