Skip to content

Commit

Permalink
Handle undefined model max length (#219) (#222)
Browse files Browse the repository at this point in the history
* Handle undefined model max length

Signed-off-by: Thanawan Atchariyachanvanit <latchari@amazon.com>

* Update CHANGELOG.md

Signed-off-by: Thanawan Atchariyachanvanit <latchari@amazon.com>

* Remove scratch notebook

Signed-off-by: Thanawan Atchariyachanvanit <latchari@amazon.com>

* Fix bug

Signed-off-by: Thanawan Atchariyachanvanit <latchari@amazon.com>

* Use compare instead

Signed-off-by: Thanawan Atchariyachanvanit <latchari@amazon.com>

* Update sentencetransformermodel.py

Signed-off-by: Thanawan Atchariyachanvanit <latchari@amazon.com>

---------

Signed-off-by: Thanawan Atchariyachanvanit <latchari@amazon.com>
(cherry picked from commit 9b558c7)

Co-authored-by: Thanawan Atchariyachanvanit <latchari@amazon.com>
  • Loading branch information
opensearch-trigger-bot[bot] and thanawan-atc committed Aug 15, 2023
1 parent 021008a commit 61c0907
Show file tree
Hide file tree
Showing 3 changed files with 77 additions and 0 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ Inspired from [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
### Fixed
- Enable make_model_config_json to add model description to model config file by @thanawan-atc in ([#203](https://github.com/opensearch-project/opensearch-py-ml/pull/203))
- Correct demo_ml_commons_integration.ipynb by @thanawan-atc in ([#208](https://github.com/opensearch-project/opensearch-py-ml/pull/208))
- Handle the case when the model max length is undefined in tokenizer by @thanawan-atc in ([#219](https://github.com/opensearch-project/opensearch-py-ml/pull/219))

## [1.1.0]

Expand Down
18 changes: 18 additions & 0 deletions opensearch_py_ml/ml_models/sentencetransformermodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -791,6 +791,14 @@ def save_as_pt(
zip_file_name = str(model_id.split("/")[-1] + ".zip")
zip_file_path = os.path.join(model_output_path, zip_file_name)

# handle when model_max_length is unproperly defined in model's tokenizer (e.g. "intfloat/e5-small-v2")
# (See PR #219 and https://github.com/huggingface/transformers/issues/14561 for more context)
if model.tokenizer.model_max_length > model.get_max_seq_length():
model.tokenizer.model_max_length = model.get_max_seq_length()
print(
f"The model_max_length is not properly defined in tokenizer_config.json. Setting it to be {model.tokenizer.model_max_length}"
)

# save tokenizer.json in save_json_folder_name
model.save(save_json_folder_path)
self._fill_null_truncation_field(
Expand Down Expand Up @@ -885,6 +893,14 @@ def save_as_onnx(

zip_file_path = os.path.join(model_output_path, zip_file_name)

# handle when model_max_length is unproperly defined in model's tokenizer (e.g. "intfloat/e5-small-v2")
# (See PR #219 and https://github.com/huggingface/transformers/issues/14561 for more context)
if model.tokenizer.model_max_length > model.get_max_seq_length():
model.tokenizer.model_max_length = model.get_max_seq_length()
print(
f"The model_max_length is not properly defined in tokenizer_config.json. Setting it to be {model.tokenizer.model_max_length}"
)

# save tokenizer.json in output_path
model.save(save_json_folder_path)
self._fill_null_truncation_field(
Expand All @@ -898,6 +914,8 @@ def save_as_onnx(
opset=15,
)

print("model file is saved to ", model_path)

# zip model file along with tokenizer.json as output
with ZipFile(str(zip_file_path), "w") as zipObj:
zipObj.write(
Expand Down
58 changes: 58 additions & 0 deletions tests/ml_models/test_sentencetransformermodel_pytest.py
Original file line number Diff line number Diff line change
Expand Up @@ -571,5 +571,63 @@ def test_truncation_parameter():
clean_test_folder(TEST_FOLDER)


def test_undefined_model_max_length_in_tokenizer_for_torch_script():
    """Verify save_as_pt handles a tokenizer with an undefined model_max_length.

    "intfloat/e5-small-v2" ships a tokenizer_config.json whose model_max_length
    is not properly set; the tracing code is expected to fall back to the
    model's max sequence length (512) when writing tokenizer.json.
    """
    model_id = "intfloat/e5-small-v2"
    expected_max_length = 512

    clean_test_folder(TEST_FOLDER)
    test_model14 = SentenceTransformerModel(
        folder_path=TEST_FOLDER,
        model_id=model_id,
    )

    test_model14.save_as_pt(model_id=model_id, sentences=["today is sunny"])

    tokenizer_json_file_path = os.path.join(TEST_FOLDER, "tokenizer.json")
    try:
        with open(tokenizer_json_file_path, "r") as json_file:
            tokenizer_json = json.load(json_file)
    # NOTE: the alias must not be `exec` — that shadows the builtin.
    except Exception as err:
        assert (
            False
        ), f"Creating tokenizer.json file for tracing raised an exception {err}"

    # The truncation max_length written to tokenizer.json must match the
    # fallback value derived from the model's max sequence length.
    assert (
        tokenizer_json["truncation"]["max_length"] == expected_max_length
    ), "max_length is not properly set"

    clean_test_folder(TEST_FOLDER)


def test_undefined_model_max_length_in_tokenizer_for_onnx():
    """Verify save_as_onnx handles a tokenizer with an undefined model_max_length.

    "intfloat/e5-small-v2" ships a tokenizer_config.json whose model_max_length
    is not properly set; the ONNX export path is expected to fall back to the
    model's max sequence length (512) when writing tokenizer.json.
    """
    model_id = "intfloat/e5-small-v2"
    expected_max_length = 512

    clean_test_folder(TEST_FOLDER)
    test_model14 = SentenceTransformerModel(
        folder_path=TEST_FOLDER,
        model_id=model_id,
    )

    test_model14.save_as_onnx(model_id=model_id)

    tokenizer_json_file_path = os.path.join(TEST_FOLDER, "tokenizer.json")
    try:
        with open(tokenizer_json_file_path, "r") as json_file:
            tokenizer_json = json.load(json_file)
    # NOTE: the alias must not be `exec` — that shadows the builtin.
    except Exception as err:
        assert (
            False
        ), f"Creating tokenizer.json file for tracing raised an exception {err}"

    # The truncation max_length written to tokenizer.json must match the
    # fallback value derived from the model's max sequence length.
    assert (
        tokenizer_json["truncation"]["max_length"] == expected_max_length
    ), "max_length is not properly set"

    clean_test_folder(TEST_FOLDER)


# Module-level cleanup: remove any test artifacts left on disk after the
# suite above has run (best-effort tidy-up of the shared test folders).
clean_test_folder(TEST_FOLDER)
clean_test_folder(TESTDATA_UNZIP_FOLDER)

0 comments on commit 61c0907

Please sign in to comment.