Update sentencetransformermodel.py

Signed-off-by: Thanawan Atchariyachanvanit <latchari@amazon.com>
opensearch-project · Aug 15, 2023 · f783502 · f783502
1 parent 1402bf4
commit f783502
Showing 1 changed file with 3 additions and 2 deletions.
diff --git a/opensearch_py_ml/ml_models/sentencetransformermodel.py b/opensearch_py_ml/ml_models/sentencetransformermodel.py
@@ -791,8 +791,9 @@ def save_as_pt(
             zip_file_name = str(model_id.split("/")[-1] + ".zip")
         zip_file_path = os.path.join(model_output_path, zip_file_name)
 
-        # handle undefined model_max_length in model's tokenizer (e.g. "intfloat/e5-small-v2" )
-        if model.tokenizer.model_max_length == 1000000000000000019884624838656:
+        # handle the case where model_max_length is improperly defined in the model's tokenizer (e.g. "intfloat/e5-small-v2")
+        # (See PR #219 and https://github.com/huggingface/transformers/issues/14561 for more context)
+        if model.tokenizer.model_max_length > model.get_max_seq_length():
             model.tokenizer.model_max_length = model.get_max_seq_length()
             print(
                 f"The model_max_length is not properly defined in tokenizer_config.json. Setting it to be {model.tokenizer.model_max_length}"