From 52707c6328b25961b27be9af443e74f8e5883d87 Mon Sep 17 00:00:00 2001
From: "Zian(Andy) Zheng" <62330719+Orion-Zheng@users.noreply.github.com>
Date: Fri, 13 Oct 2023 16:46:33 +0800
Subject: [PATCH] Update flash_attention_patch.py

To be compatible with the new change in the Transformers library, where a new
argument 'padding_mask' was added to the forward function of the attention layer.
https://github.com/huggingface/transformers/pull/25598
---
 .../colossal_llama2/utils/flash_attention_patch.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/applications/Colossal-LLaMA-2/colossal_llama2/utils/flash_attention_patch.py b/applications/Colossal-LLaMA-2/colossal_llama2/utils/flash_attention_patch.py
index 6c58c59307a6..111659b2d928 100644
--- a/applications/Colossal-LLaMA-2/colossal_llama2/utils/flash_attention_patch.py
+++ b/applications/Colossal-LLaMA-2/colossal_llama2/utils/flash_attention_patch.py
@@ -65,6 +65,7 @@ def attention_forward(
     past_key_value: Optional[Tuple[torch.Tensor]] = None,
     output_attentions: bool = False,
     use_cache: bool = False,
+    **kwargs
 ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
     """
     Re-define LLaMA-2 `LlamaAttention` forward method using flash-attention.
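
Editor's note: the sketch below is not part of the patch. It is a minimal,
self-contained illustration of why adding a trailing **kwargs keeps the patched
forward compatible: newer Transformers versions pass an extra `padding_mask`
keyword to the attention layer, which a fixed signature rejects but **kwargs
silently absorbs. The function names and simplified signatures here are
hypothetical stand-ins for the real attention_forward.

    # Hypothetical simplified signatures, for illustration only.
    from typing import Optional

    import torch


    def attention_forward_old(hidden_states: torch.Tensor,
                              attention_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
        # Pre-patch style signature: unknown keyword arguments raise TypeError.
        return hidden_states


    def attention_forward_new(hidden_states: torch.Tensor,
                              attention_mask: Optional[torch.Tensor] = None,
                              **kwargs) -> torch.Tensor:
        # Post-patch style signature: `padding_mask` (and any future keyword
        # Transformers may add) is accepted and simply ignored.
        return hidden_states


    x = torch.zeros(1, 4, 8)
    attention_forward_new(x, padding_mask=None)   # works with the extra keyword
    try:
        attention_forward_old(x, padding_mask=None)
    except TypeError as exc:
        print("old signature fails:", exc)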