diff --git a/CHANGELOG.md b/CHANGELOG.md index da9401cb..5368d028 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,15 @@ # CHANGELOG Inspired from [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) +## [1.2.0] + +### Added + +### Changed + +### Fixed +- Enable make_model_config_json to add model description to model config file by @thanawan-atc in ([#203](https://github.com/opensearch-project/opensearch-py-ml/pull/203)) + ## [1.1.0] ### Added diff --git a/docs/requirements-docs.txt b/docs/requirements-docs.txt index 3b3943ae..d4e8a521 100644 --- a/docs/requirements-docs.txt +++ b/docs/requirements-docs.txt @@ -8,6 +8,7 @@ sphinx_rtd_theme nbsphinx pandoc deprecated + # using in SentenceTransformerModel torch pyyaml @@ -15,6 +16,7 @@ accelerate sentence_transformers transformers tqdm +mdutils # traitlets has been having all sorts of release problems lately. traitlets<5.1 diff --git a/opensearch_py_ml/ml_models/sentencetransformermodel.py b/opensearch_py_ml/ml_models/sentencetransformermodel.py index 8fbcb1a0..05db5270 100644 --- a/opensearch_py_ml/ml_models/sentencetransformermodel.py +++ b/opensearch_py_ml/ml_models/sentencetransformermodel.py @@ -10,6 +10,7 @@ import pickle import platform import random +import re import shutil import subprocess import time @@ -23,6 +24,7 @@ import torch import yaml from accelerate import Accelerator, notebook_launcher +from mdutils.fileutils import MarkDownFile from sentence_transformers import SentenceTransformer from sentence_transformers.models import Normalize, Pooling, Transformer from torch.utils.data import DataLoader @@ -1006,6 +1008,74 @@ def set_up_accelerate_config( "Failed to open config file for ml common upload: " + file_path + "\n" ) + def _get_model_description_from_readme_file(self, readme_file_path) -> str: + """ + Get description of the model from README.md file in the model folder + after the model is saved in local directory + + See example here: + 
https://huggingface.co/sentence-transformers/msmarco-distilbert-base-tas-b/blob/main/README.md + + This function assumes that the README.md has the following format: + + # sentence-transformers/msmarco-distilbert-base-tas-b + This is [ ... further description ... ] + + # [ ... Next section ...] + ... + + :param readme_file_path: Path to README.md file + :type readme_file_path: string + :return: Description of the model + :rtype: string + """ + readme_data = MarkDownFile.read_file(readme_file_path) + + # Find the description section + start_str = f"# {self.model_id}" + start = readme_data.find(start_str) + if start == -1: + model_name = self.model_id.split("/")[-1] + start_str = f"# {model_name}" + start = readme_data.find(start_str) + end = readme_data.find("\n#", start + len(start_str)) + + # If we cannot find the scope of description section, raise error. + if start == -1 or end == -1: + raise ValueError("Cannot find description in README.md file") + + # Parse out the description section + description = readme_data[start + len(start_str) + 1 : end].strip() + description = description.split("\n")[0] + + # Remove hyperlink and reformat text + description = re.sub(r"\(.*?\)", "", description) + description = re.sub(r"[\[\]]", "", description) + description = re.sub(r"\*", "", description) + + # Remove unnecessary part if exists (i.e. " For an introduction to ...") + # (Found in https://huggingface.co/sentence-transformers/multi-qa-mpnet-base-dot-v1/blob/main/README.md) + unnecessary_part = description.find(" For an introduction to") + if unnecessary_part != -1: + description = description[:unnecessary_part] + + return description + + def _generate_default_model_description(self, embedding_dimension) -> str: + """ + Generate default model description of the model based on embedding_dimension + + :param embedding_dimension: Embedding dimension of the model.
+ :type embedding_dimension: int + :return: Description of the model + :rtype: string + """ + print( + "Using default description from embedding_dimension instead (You can overwrite this by specifying description parameter in make_model_config_json function)" + ) + description = f"This is a sentence-transformers model: It maps sentences & paragraphs to a {embedding_dimension} dimensional dense vector space." + return description + def make_model_config_json( self, model_name: str = None, @@ -1014,6 +1084,7 @@ def make_model_config_json( embedding_dimension: int = None, pooling_mode: str = None, normalize_result: bool = None, + description: str = None, all_config: str = None, model_type: str = None, verbose: bool = False, @@ -1040,6 +1111,9 @@ def make_model_config_json( :param normalize_result: Optional, whether to normalize the result of the model. If None, check from the pre-trained hugging-face model object. :type normalize_result: bool + :param description: Optional, the description of the model. If None, get description from the README.md + file in the model folder; if that fails, fall back to a default description built from embedding_dimension. + :type description: str :param all_config: Optional, the all_config of the model.
If None, parse all contents from the config file of pre-trained hugging-face model @@ -1087,6 +1161,26 @@ def make_model_config_json( f"Raised exception while getting model data from pre-trained hugging-face model object: {e}" ) + if description is None: + readme_file_path = os.path.join(self.folder_path, "README.md") + if os.path.exists(readme_file_path): + try: + if verbose: + print("reading README.md file") + description = self._get_model_description_from_readme_file( + readme_file_path + ) + except Exception as e: + print(f"Cannot scrape model description from README.md file: {e}") + description = self._generate_default_model_description( + embedding_dimension + ) + else: + print("Cannot find README.md file to scrape model description") + description = self._generate_default_model_description( + embedding_dimension + ) + if all_config is None: if not os.path.exists(config_json_file_path): raise Exception( @@ -1114,6 +1208,7 @@ def make_model_config_json( model_config_content = { "name": model_name, "version": version_number, + "description": description, "model_format": model_format, "model_task_type": "TEXT_EMBEDDING", "model_config": { diff --git a/requirements-dev.txt b/requirements-dev.txt index 2734a6fa..0f74e77a 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -11,6 +11,7 @@ sentence_transformers tqdm transformers deprecated +mdutils # # Testing diff --git a/tests/conftest.py b/tests/conftest.py index 1e230f4a..93502fa0 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -109,7 +109,7 @@ def check_values(self, oml_obj, pd_obj): def check_exception(self, ed_exc, pd_exc): """Checks that either an exception was raised or not from both opensearch_py_ml and pandas""" - assert (ed_exc is None) == (pd_exc is None) and type(ed_exc) == type(pd_exc) + assert (ed_exc is None) == (pd_exc is None) and isinstance(ed_exc, type(pd_exc)) if pd_exc is not None: raise pd_exc diff --git a/tests/ml_commons/test_ml_commons_client.py 
b/tests/ml_commons/test_ml_commons_client.py index ae32edd9..10be2c16 100644 --- a/tests/ml_commons/test_ml_commons_client.py +++ b/tests/ml_commons/test_ml_commons_client.py @@ -68,8 +68,8 @@ def clean_test_folder(TEST_FOLDER): def test_init(): - assert type(ml_client._client) == OpenSearch - assert type(ml_client._model_uploader) == ModelUploader + assert isinstance(ml_client._client, OpenSearch) + assert isinstance(ml_client._model_uploader, ModelUploader) def test_execute(): diff --git a/tests/ml_models/test_sentencetransformermodel_pytest.py b/tests/ml_models/test_sentencetransformermodel_pytest.py index de76f1a7..7bf0c95b 100644 --- a/tests/ml_models/test_sentencetransformermodel_pytest.py +++ b/tests/ml_models/test_sentencetransformermodel_pytest.py @@ -204,7 +204,12 @@ def test_make_model_config_json_for_torch_script(): assert ( "model_format" in model_config_data_torch and model_config_data_torch["model_format"] == "TORCH_SCRIPT" - ) + ), "Missing or Wrong model_format in torch script model config file" + assert ( + "description" in model_config_data_torch + and model_config_data_torch["description"] + == "This is a sentence-transformers model: It maps sentences & paragraphs to a 384 dimensional dense vector space and was designed for semantic search. It has been trained on 215M pairs from diverse sources." 
+ ), "Missing or Wrong model description in torch script model config file" assert ( "model_config" in model_config_data_torch ), "Missing 'model_config' in torch script model config file" @@ -248,11 +253,16 @@ def test_make_model_config_json_for_onnx(): assert ( "name" in model_config_data_onnx and model_config_data_onnx["name"] == model_id - ), "Missing or Wrong model name in onnx model config file'" + ), "Missing or Wrong model name in onnx model config file" assert ( "model_format" in model_config_data_onnx and model_config_data_onnx["model_format"] == "ONNX" - ) + ), "Missing or Wrong model_format in onnx model config file" + assert ( + "description" in model_config_data_onnx + and model_config_data_onnx["description"] + == "This is a sentence-transformers model: It maps sentences & paragraphs to a 384 dimensional dense vector space and can be used for tasks like clustering or semantic search." + ), "Missing or Wrong model description in onnx model config file" assert ( "model_config" in model_config_data_onnx ), "Missing 'model_config' in onnx model config file" @@ -310,7 +320,7 @@ def test_overwrite_fields_in_model_config(): assert ( "model_format" in model_config_data_torch and model_config_data_torch["model_format"] == "TORCH_SCRIPT" - ) + ), "Missing or Wrong model_format in torch script model config file" assert ( "model_config" in model_config_data_torch ), "Missing 'model_config' in torch script model config file" @@ -354,7 +364,7 @@ def test_overwrite_fields_in_model_config(): assert ( "model_format" in model_config_data_torch and model_config_data_torch["model_format"] == "TORCH_SCRIPT" - ) + ), "Missing or Wrong model_format in torch script model config file" assert ( "model_config" in model_config_data_torch ), "Missing 'model_config' in torch script model config file" @@ -372,10 +382,42 @@ def test_overwrite_fields_in_model_config(): clean_test_folder(TEST_FOLDER) -def test_truncation_parameter(): +def test_missing_readme_md_file(): model_id = 
"sentence-transformers/msmarco-distilbert-base-tas-b" - MAX_LENGTH_TASB = 512 + clean_test_folder(TEST_FOLDER) + test_model9 = SentenceTransformerModel( + folder_path=TEST_FOLDER, + model_id=model_id, + ) + test_model9.save_as_pt(model_id=model_id, sentences=["today is sunny"]) + temp_path = os.path.join( + TEST_FOLDER, + "README.md", + ) + os.remove(temp_path) + model_config_path_torch = test_model9.make_model_config_json( + model_format="TORCH_SCRIPT" + ) + try: + with open(model_config_path_torch) as json_file: + model_config_data_torch = json.load(json_file) + except Exception as exec: + assert ( + False + ), f"Creating model config file for tracing in torch_script raised an exception {exec}" + + assert ( + "description" in model_config_data_torch + and model_config_data_torch["description"] + == "This is a sentence-transformers model: It maps sentences & paragraphs to a 768 dimensional dense vector space." + ), "Should use default model description when README.md file is missing" + + clean_test_folder(TEST_FOLDER) + + +def test_missing_expected_description_in_readme_file(): + model_id = "sentence-transformers/paraphrase-MiniLM-L3-v2" clean_test_folder(TEST_FOLDER) test_model10 = SentenceTransformerModel( folder_path=TEST_FOLDER, @@ -383,6 +425,100 @@ def test_truncation_parameter(): ) test_model10.save_as_pt(model_id=model_id, sentences=["today is sunny"]) + temp_path = os.path.join( + TEST_FOLDER, + "README.md", + ) + with open(temp_path, "w") as f: + f.write("No model description here") + model_config_path_torch = test_model10.make_model_config_json( + model_format="TORCH_SCRIPT" + ) + try: + with open(model_config_path_torch) as json_file: + model_config_data_torch = json.load(json_file) + except Exception as exec: + assert ( + False + ), f"Creating model config file for tracing in torch_script raised an exception {exec}" + + assert ( + "description" in model_config_data_torch + and model_config_data_torch["description"] + == "This is a 
sentence-transformers model: It maps sentences & paragraphs to a 384 dimensional dense vector space." + ), "Should use default model description when description is missing from README.md" + + clean_test_folder(TEST_FOLDER) + + +def test_overwrite_description(): + model_id = "sentence-transformers/msmarco-distilbert-base-tas-b" + clean_test_folder(TEST_FOLDER) + test_model11 = SentenceTransformerModel( + folder_path=TEST_FOLDER, + model_id=model_id, + ) + + test_model11.save_as_pt(model_id=model_id, sentences=["today is sunny"]) + model_config_path_torch = test_model11.make_model_config_json( + model_format="TORCH_SCRIPT", description="Expected Description" + ) + try: + with open(model_config_path_torch) as json_file: + model_config_data_torch = json.load(json_file) + except Exception as exec: + assert ( + False + ), f"Creating model config file for tracing in torch_script raised an exception {exec}" + + assert ( + "description" in model_config_data_torch + and model_config_data_torch["description"] == "Expected Description" + ), "Cannot overwrite description in model config file" + + clean_test_folder(TEST_FOLDER) + + +def test_long_description(): + model_id = "sentence-transformers/gtr-t5-base" + clean_test_folder(TEST_FOLDER) + test_model12 = SentenceTransformerModel( + folder_path=TEST_FOLDER, + model_id=model_id, + ) + + test_model12.save_as_pt(model_id=model_id, sentences=["today is sunny"]) + model_config_path_torch = test_model12.make_model_config_json( + model_format="TORCH_SCRIPT" + ) + try: + with open(model_config_path_torch) as json_file: + model_config_data_torch = json.load(json_file) + except Exception as exec: + assert ( + False + ), f"Creating model config file for tracing in torch_script raised an exception {exec}" + + assert ( + "description" in model_config_data_torch + and model_config_data_torch["description"] + == "This is a sentence-transformers model: It maps sentences & paragraphs to a 768 dimensional dense vector space. 
The model was specifically trained for the task of sematic search." + ), "Missing or Wrong model description in torch_script model config file" + + clean_test_folder(TEST_FOLDER) + + +def test_truncation_parameter(): + model_id = "sentence-transformers/msmarco-distilbert-base-tas-b" + MAX_LENGTH_TASB = 512 + + clean_test_folder(TEST_FOLDER) + test_model13 = SentenceTransformerModel( + folder_path=TEST_FOLDER, + model_id=model_id, + ) + + test_model13.save_as_pt(model_id=model_id, sentences=["today is sunny"]) tokenizer_json_file_path = os.path.join(TEST_FOLDER, "tokenizer.json") try: