Skip to content

Commit

Permalink
Handle undefined model max length (#219) (#222)
Browse files Browse the repository at this point in the history
* Handle undefined model max length

Signed-off-by: Thanawan Atchariyachanvanit <latchari@amazon.com>

* Update CHANGELOG.md

Signed-off-by: Thanawan Atchariyachanvanit <latchari@amazon.com>

* Remove scratch notebook

Signed-off-by: Thanawan Atchariyachanvanit <latchari@amazon.com>

* Fix bug

Signed-off-by: Thanawan Atchariyachanvanit <latchari@amazon.com>

* Use compare instead

Signed-off-by: Thanawan Atchariyachanvanit <latchari@amazon.com>

* Update sentencetransformermodel.py

Signed-off-by: Thanawan Atchariyachanvanit <latchari@amazon.com>

---------

Signed-off-by: Thanawan Atchariyachanvanit <latchari@amazon.com>
(cherry picked from commit 9b558c7)

Co-authored-by: Thanawan Atchariyachanvanit <latchari@amazon.com>
  • Loading branch information
opensearch-trigger-bot[bot] and thanawan-atc committed Aug 15, 2023
1 parent 021008a commit 61c0907
Show file tree
Hide file tree
Showing 3 changed files with 77 additions and 0 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ Inspired from [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
### Fixed
- Enable make_model_config_json to add model description to model config file by @thanawan-atc in ([#203](https://github.com/opensearch-project/opensearch-py-ml/pull/203))
- Correct demo_ml_commons_integration.ipynb by @thanawan-atc in ([#208](https://github.com/opensearch-project/opensearch-py-ml/pull/208))
- Handle the case when the model max length is undefined in tokenizer by @thanawan-atc in ([#219](https://github.com/opensearch-project/opensearch-py-ml/pull/219))

## [1.1.0]

Expand Down
18 changes: 18 additions & 0 deletions opensearch_py_ml/ml_models/sentencetransformermodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -791,6 +791,14 @@ def save_as_pt(
zip_file_name = str(model_id.split("/")[-1] + ".zip")
zip_file_path = os.path.join(model_output_path, zip_file_name)

# handle when model_max_length is unproperly defined in model's tokenizer (e.g. "intfloat/e5-small-v2")
# (See PR #219 and https://github.com/huggingface/transformers/issues/14561 for more context)
if model.tokenizer.model_max_length > model.get_max_seq_length():
model.tokenizer.model_max_length = model.get_max_seq_length()
print(
f"The model_max_length is not properly defined in tokenizer_config.json. Setting it to be {model.tokenizer.model_max_length}"
)

# save tokenizer.json in save_json_folder_name
model.save(save_json_folder_path)
self._fill_null_truncation_field(
Expand Down Expand Up @@ -885,6 +893,14 @@ def save_as_onnx(

zip_file_path = os.path.join(model_output_path, zip_file_name)

# handle when model_max_length is unproperly defined in model's tokenizer (e.g. "intfloat/e5-small-v2")
# (See PR #219 and https://github.com/huggingface/transformers/issues/14561 for more context)
if model.tokenizer.model_max_length > model.get_max_seq_length():
model.tokenizer.model_max_length = model.get_max_seq_length()
print(
f"The model_max_length is not properly defined in tokenizer_config.json. Setting it to be {model.tokenizer.model_max_length}"
)

# save tokenizer.json in output_path
model.save(save_json_folder_path)
self._fill_null_truncation_field(
Expand All @@ -898,6 +914,8 @@ def save_as_onnx(
opset=15,
)

print("model file is saved to ", model_path)

# zip model file along with tokenizer.json as output
with ZipFile(str(zip_file_path), "w") as zipObj:
zipObj.write(
Expand Down
58 changes: 58 additions & 0 deletions tests/ml_models/test_sentencetransformermodel_pytest.py
Original file line number Diff line number Diff line change
Expand Up @@ -571,5 +571,63 @@ def test_truncation_parameter():
clean_test_folder(TEST_FOLDER)


def test_undefined_model_max_length_in_tokenizer_for_torch_script():
    """Verify save_as_pt handles a tokenizer with an undefined model_max_length.

    "intfloat/e5-small-v2" ships a tokenizer_config.json whose model_max_length
    is not properly set; the tracing code is expected to fall back to the
    model's max sequence length (512) when writing tokenizer.json.
    """
    model_id = "intfloat/e5-small-v2"
    expected_max_length = 512

    clean_test_folder(TEST_FOLDER)
    test_model14 = SentenceTransformerModel(
        folder_path=TEST_FOLDER,
        model_id=model_id,
    )

    test_model14.save_as_pt(model_id=model_id, sentences=["today is sunny"])

    tokenizer_json_file_path = os.path.join(TEST_FOLDER, "tokenizer.json")
    try:
        with open(tokenizer_json_file_path, "r") as json_file:
            tokenizer_json = json.load(json_file)
    # NOTE: the alias must not be `exec` — that shadows the builtin.
    except Exception as err:
        assert (
            False
        ), f"Creating tokenizer.json file for tracing raised an exception {err}"

    # The truncation max_length written to tokenizer.json must match the
    # fallback value derived from the model's max sequence length.
    assert (
        tokenizer_json["truncation"]["max_length"] == expected_max_length
    ), "max_length is not properly set"

    clean_test_folder(TEST_FOLDER)


def test_undefined_model_max_length_in_tokenizer_for_onnx():
    """Verify save_as_onnx handles a tokenizer with an undefined model_max_length.

    "intfloat/e5-small-v2" ships a tokenizer_config.json whose model_max_length
    is not properly set; the ONNX export path is expected to fall back to the
    model's max sequence length (512) when writing tokenizer.json.
    """
    model_id = "intfloat/e5-small-v2"
    expected_max_length = 512

    clean_test_folder(TEST_FOLDER)
    test_model14 = SentenceTransformerModel(
        folder_path=TEST_FOLDER,
        model_id=model_id,
    )

    test_model14.save_as_onnx(model_id=model_id)

    tokenizer_json_file_path = os.path.join(TEST_FOLDER, "tokenizer.json")
    try:
        with open(tokenizer_json_file_path, "r") as json_file:
            tokenizer_json = json.load(json_file)
    # NOTE: the alias must not be `exec` — that shadows the builtin.
    except Exception as err:
        assert (
            False
        ), f"Creating tokenizer.json file for tracing raised an exception {err}"

    # The truncation max_length written to tokenizer.json must match the
    # fallback value derived from the model's max sequence length.
    assert (
        tokenizer_json["truncation"]["max_length"] == expected_max_length
    ), "max_length is not properly set"

    clean_test_folder(TEST_FOLDER)


# Module-level cleanup: remove any test artifacts left on disk after the
# suite above has run (best-effort tidy-up of the shared test folders).
clean_test_folder(TEST_FOLDER)
clean_test_folder(TESTDATA_UNZIP_FOLDER)

0 comments on commit 61c0907

Please sign in to comment.