diff --git a/.github/workflows/model_listing_uploader.yml b/.github/workflows/model_listing_uploader.yml
new file mode 100644
index 00000000..f08776ca
--- /dev/null
+++ b/.github/workflows/model_listing_uploader.yml
@@ -0,0 +1,49 @@
+name: Model Listing Uploading
+on:
+  push:
+    branches:
+      - main
+    paths:
+      - utils/model_uploader/model_listing/pretrained_model_listing.json
+  workflow_dispatch:
+
+jobs:
+  upload-model-listing:
+    runs-on: 'ubuntu-latest'
+    permissions:
+      id-token: write
+      contents: read
+    environment: opensearch-py-ml-cicd-env
+    env:
+      bucket_model_listing_file_path: ml-models/model_listing/pre_trained_models.json
+      repo_model_listing_path: ./utils/model_uploader/model_listing/pretrained_model_listing.json
+    steps:
+      - name: Fail if branch is not main
+        if: github.ref != 'refs/heads/main'
+        run: |
+          echo "This workflow should only be triggered on 'main' branch"
+          exit 1
+      - name: Checkout Repository
+        uses: actions/checkout@v3
+      - name: Configure AWS Credentials
+        uses: aws-actions/configure-aws-credentials@v2
+        with:
+          aws-region: ${{ secrets.MODEL_UPLOADER_AWS_REGION }}
+          role-to-assume: ${{ secrets.MODEL_UPLOADER_ROLE }}
+          role-session-name: upload-model-listing
+      - name: Update pre_trained_models.json in S3
+        run: aws s3 cp ${{ env.repo_model_listing_path }} s3://${{ secrets.MODEL_BUCKET }}/${{ env.bucket_model_listing_file_path }}
+
+  trigger-ml-models-release-workflow:
+    needs: upload-model-listing
+    runs-on: 'ubuntu-latest'
+    permissions:
+      contents: read
+    steps:
+      - name: Checkout Repository
+        uses: actions/checkout@v3
+      - name: Trigger Jenkins Workflow with Generic Webhook
+        run: |
+          jenkins_trigger_token=${{ secrets.JENKINS_ML_MODELS_RELEASE_GENERIC_WEBHOOK_TOKEN }}
+          jenkins_params="{\"BASE_DOWNLOAD_PATH\":\"ml-models/model_listing\"}"
+          sh utils/model_uploader/trigger_ml_models_release.sh $jenkins_trigger_token $jenkins_params
diff --git a/.github/workflows/update_model_listing.yml b/.github/workflows/update_model_listing.yml
new file mode 100644
index 00000000..ffc6396b
--- /dev/null
+++ b/.github/workflows/update_model_listing.yml
@@ -0,0 +1,145 @@
+name: Update Pretrained Model Listing
+on:
+  workflow_dispatch:
+
+jobs:
+  update-model-listing:
+    runs-on: 'ubuntu-latest'
+    permissions:
+      id-token: write
+      contents: write
+      pull-requests: write
+    environment: opensearch-py-ml-cicd-env
+    env:
+      bucket_model_listing_file_path: ml-models/model_listing/pre_trained_models.json
+      repo_model_listing_path: ./utils/model_uploader/model_listing/pretrained_model_listing.json
+      path_prefixes: "ml-models/huggingface/"
+      # To expand the model listing to encompass additional folders, adjust path_prefixes as indicated below:
+      # "ml-models/first_folder/ ml-models/second_folder/ ml-models/third_folder/" (Separate each folder with a space)
+    steps:
+      - name: Fail if branch is not main
+        if: github.ref != 'refs/heads/main'
+        run: |
+          echo "This workflow should only be triggered on 'main' branch"
+          exit 1
+      - name: Checkout Main Branch
+        uses: actions/checkout@v3
+      - name: Configure AWS Credentials
+        uses: aws-actions/configure-aws-credentials@v2
+        with:
+          aws-region: ${{ secrets.MODEL_UPLOADER_AWS_REGION }}
+          role-to-assume: ${{ secrets.MODEL_UPLOADER_ROLE }}
+          role-session-name: update-model-listing
+      - name: List Models
+        run: |
+          path_prefixes="${{ env.path_prefixes }}"
+          for prefix in $path_prefixes
+          do
+            if aws s3 ls s3://${{ secrets.MODEL_BUCKET }}/$prefix > /dev/null
+            then
+              aws s3api list-objects --bucket ${{ secrets.MODEL_BUCKET }} --prefix $prefix --query "Contents[].{Key: Key}" --output text | grep "/config.json$" >> config_paths.txt
+            else
+              echo "Folder with prefix $prefix does not exist."
+            fi
+          done
+          echo $(cat config_paths.txt)
+      - name: Download config files
+        run: |
+          mkdir config_folder
+          path_prefixes="${{ env.path_prefixes }}"
+          for prefix in $path_prefixes
+          do
+            aws s3 cp s3://${{ secrets.MODEL_BUCKET }}/$prefix config_folder/$prefix --recursive --exclude "*" --include "*/config.json"
+          done
+          echo $(ls config_folder)
+      - name: Set Up Python
+        uses: actions/setup-python@v2
+        with:
+          python-version: '3.x'
+      - name: Update pre_trained_models.json
+        run: |
+          python utils/model_uploader/update_pretrained_model_listing.py "config_paths.txt" "config_folder"
+      - name: Create PR Body
+        id: create_pr_body
+        run: |
+          update_time=$(TZ='America/Los_Angeles' date "+%Y-%m-%d %T")
+          echo "update_time=$update_time" >> $GITHUB_OUTPUT
+          pr_body="
+          - [ ] This PR makes changes to only these two files: pretrained_model_listing.json and CHANGELOG.md.
+          - [ ] CHANGELOG.md has been updated by the workflow or by you if the workflow fails to do so.
+          - [ ] Merge conflicts have been resolved.
+
+          ========= Workflow Details ==========
+          - Workflow Name: ${{ github.workflow }}
+          - Workflow Run ID: ${{ github.run_id }}
+          - Workflow Initiator: @${{ github.actor }}
+          - File Update Time: $update_time"
+
+          echo "pr_body<<EOF" >> $GITHUB_OUTPUT
+          echo "${pr_body@E}" >> $GITHUB_OUTPUT
+          echo "EOF" >> $GITHUB_OUTPUT
+          echo "${pr_body@E}"
+      - name: Create a Branch & Raise a PR
+        uses: peter-evans/create-pull-request@v5
+        id: create_pr
+        with:
+          committer: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
+          commit-message: 'GitHub Actions Workflow: Update Pretrained Model Listing'
+          signoff: true
+          title: 'Update Pretrained Model Listing - ${{ steps.create_pr_body.outputs.update_time }}'
+          body: ${{ steps.create_pr_body.outputs.pr_body }}
+          labels: ModelListingUploading
+          branch: model-listing-uploader/${{ github.run_id }}
+          delete-branch: true
+          add-paths: ${{ env.repo_model_listing_path }}
+      - name: Checkout PR Branch
+        id: checkout_pr_branch
+        continue-on-error: true
+        uses: actions/checkout@v3
+        with:
+          ref: model-listing-uploader/${{ github.run_id }}
+      - name: Create a line for updating CHANGELOG.md
+        id: create_changelog_line
+        if: steps.checkout_pr_branch.outcome == 'success'
+        continue-on-error: true
+        run: |
+          pr_ref="([#${{ steps.create_pr.outputs.pull-request-number }}](${{ steps.create_pr.outputs.pull-request-url }}))"
+          changelog_line="Update pretrained_model_listing.json (${{ steps.create_pr_body.outputs.update_time }}) by @${{ github.actor }} $pr_ref"
+          echo "changelog_line=$changelog_line" >> $GITHUB_OUTPUT
+      - name: Warning Comment on PR if create_changelog_line fails
+        if: steps.checkout_pr_branch.outcome == 'success' && steps.create_changelog_line.outcome == 'failure'
+        uses: thollander/actions-comment-pull-request@v2
+        with:
+          pr_number: ${{ steps.create_pr.outputs.pull-request-number }}
+          message: "Warning:exclamation:: The workflow failed to update CHANGELOG.md. Please update CHANGELOG.md manually."
+      - name: Update CHANGELOG.md
+        if: steps.checkout_pr_branch.outcome == 'success' && steps.create_changelog_line.outcome == 'success'
+        id: update_changelog
+        continue-on-error: true
+        run: |
+          python -m pip install mdutils
+          python utils/model_uploader/update_changelog_md.py "${{ steps.create_changelog_line.outputs.changelog_line }}"
+      - name: Commit Updates
+        if: steps.checkout_pr_branch.outcome == 'success' && steps.create_changelog_line.outcome == 'success' && steps.update_changelog.outcome == 'success'
+        uses: stefanzweifel/git-auto-commit-action@v4
+        id: commit
+        with:
+          branch: model-listing-uploader/${{ github.run_id }}
+          commit_user_email: "github-actions[bot]@users.noreply.github.com"
+          commit_message: 'GitHub Actions Workflow: Update CHANGELOG.md - ${{ steps.create_pr_body.outputs.update_time }}'
+          commit_options: '--signoff'
+          file_pattern: CHANGELOG.md
+      - name: Warning Comment on PR if update_changelog fails
+        if: steps.checkout_pr_branch.outcome == 'success' && steps.create_changelog_line.outcome == 'success' && steps.update_changelog.outcome == 'failure'
+        uses: thollander/actions-comment-pull-request@v2
+        with:
+          pr_number: ${{ steps.create_pr.outputs.pull-request-number }}
+          message: |
+            Warning:exclamation:: The workflow failed to update CHANGELOG.md. Please add the following line manually.
+            >>>
+            ${{ steps.create_changelog_line.outputs.changelog_line }}
+      - name: No Change in Model Listing
+        if: steps.checkout_pr_branch.outcome == 'failure'
+        run: |
+          echo "There is no change in model listing."
+          echo "Exiting the workflow"
diff --git a/CHANGELOG.md b/CHANGELOG.md
index f3f01e40..53270541 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,7 +5,9 @@ Inspired from [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
 
 ### Added
 - Add workflows and scripts for automating model tracing and uploading process by @thanawan-atc in ([#209](https://github.com/opensearch-project/opensearch-py-ml/pull/209))
+- Add workflow and scripts for automating model listing updating process by @thanawan-atc in ([#210](https://github.com/opensearch-project/opensearch-py-ml/pull/210))
+
 ### Changed
diff --git a/noxfile.py b/noxfile.py
index 03809192..6016583c 100644
--- a/noxfile.py
+++ b/noxfile.py
@@ -125,7 +125,7 @@ def test(session, pandas_version: str):
         "-m",
         "pytest",
         "--cov-report=term-missing",
-        "--cov=opensearch_py_ml/",
+        "--cov",
         "--cov-config=setup.cfg",
         "--doctest-modules",
         "--nbval",
diff --git a/setup.cfg b/setup.cfg
index a0cf9569..27eb772c 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -7,3 +7,7 @@ exclude_lines=
     @abstractmethod
     if TYPE_CHECKING:
     raise NotImplementedError*
+[coverage:run]
+include=
+    opensearch_py_ml/*
+    utils/model_uploader/update_pretrained_model_listing.py
diff --git a/tests/ml_model_listing/samples/config_folder/ml-models/huggingface/intfloat/e5-small-v2/1.0.1/onnx/config.json b/tests/ml_model_listing/samples/config_folder/ml-models/huggingface/intfloat/e5-small-v2/1.0.1/onnx/config.json
new file mode 100644
index 00000000..0784b70f
--- /dev/null
+++ b/tests/ml_model_listing/samples/config_folder/ml-models/huggingface/intfloat/e5-small-v2/1.0.1/onnx/config.json
@@ -0,0 +1 @@
+{"name": "intfloat/e5-small-v2", "version": "1.0.1", "description": "This is a sentence-transformers model: It maps sentences & paragraphs to a 384 dimensional dense vector space.", "model_format": "ONNX", "model_task_type": "TEXT_EMBEDDING", "model_config": {"model_type": "bert", "embedding_dimension": 384, "framework_type": "sentence_transformers", "pooling_mode": "MEAN", "normalize_result": true, "all_config": "{\"_name_or_path\": \"/root/.cache/torch/sentence_transformers/intfloat_e5-small-v2/\", \"architectures\": [\"BertModel\"], \"attention_probs_dropout_prob\": 0.1, \"classifier_dropout\": null, \"hidden_act\": \"gelu\", \"hidden_dropout_prob\": 0.1, \"hidden_size\": 384, \"initializer_range\": 0.02, \"intermediate_size\": 1536, \"layer_norm_eps\": 1e-12, \"max_position_embeddings\": 512, \"model_type\": \"bert\", \"num_attention_heads\": 12, \"num_hidden_layers\": 12, \"pad_token_id\": 0, \"position_embedding_type\": \"absolute\", \"torch_dtype\": \"float32\", \"transformers_version\": \"4.31.0\", \"type_vocab_size\": 2, \"use_cache\": true, \"vocab_size\": 30522}"}}
diff --git a/tests/ml_model_listing/samples/config_folder/ml-models/huggingface/sentence-transformers/clip-ViT-B-32-multilingual-v1/1.0.1/torch_script/config.json b/tests/ml_model_listing/samples/config_folder/ml-models/huggingface/sentence-transformers/clip-ViT-B-32-multilingual-v1/1.0.1/torch_script/config.json
new file mode 100644
index 00000000..d4bfbddc
--- /dev/null
+++ b/tests/ml_model_listing/samples/config_folder/ml-models/huggingface/sentence-transformers/clip-ViT-B-32-multilingual-v1/1.0.1/torch_script/config.json
@@ -0,0 +1 @@
+{"name": "sentence-transformers/clip-ViT-B-32-multilingual-v1", "version": "1.0.1", "description": "This is a multi-lingual version of the OpenAI CLIP-ViT-B32 model. You can map text and images to a common dense vector space such that images and the matching texts are close. This model can be used for image search and for multi-lingual zero-shot image classification .", "model_format": "TORCH_SCRIPT", "model_task_type": "TEXT_EMBEDDING", "model_config": {"model_type": "distilbert", "embedding_dimension": 512, "framework_type": "sentence_transformers", "pooling_mode": "MEAN", "normalize_result": false, "all_config": "{\"_name_or_path\": \"/root/.cache/torch/sentence_transformers/sentence-transformers_clip-ViT-B-32-multilingual-v1/\", \"activation\": \"gelu\", \"architectures\": [\"DistilBertModel\"], \"attention_dropout\": 0.1, \"dim\": 768, \"dropout\": 0.1, \"hidden_dim\": 3072, \"initializer_range\": 0.02, \"max_position_embeddings\": 512, \"model_type\": \"distilbert\", \"n_heads\": 12, \"n_layers\": 6, \"output_past\": true, \"pad_token_id\": 0, \"qa_dropout\": 0.1, \"seq_classif_dropout\": 0.2, \"sinusoidal_pos_embds\": false, \"tie_weights_\": true, \"torch_dtype\": \"float32\", \"transformers_version\": \"4.31.0\", \"vocab_size\": 119547}"}}
diff --git a/tests/ml_model_listing/samples/config_folder/ml-models/huggingface/sentence-transformers/multi-qa-mpnet-base-cos-v1/1.0.1/onnx/config.json b/tests/ml_model_listing/samples/config_folder/ml-models/huggingface/sentence-transformers/multi-qa-mpnet-base-cos-v1/1.0.1/onnx/config.json
new file mode 100644
index 00000000..bfa5fa37
--- /dev/null
+++ b/tests/ml_model_listing/samples/config_folder/ml-models/huggingface/sentence-transformers/multi-qa-mpnet-base-cos-v1/1.0.1/onnx/config.json
@@ -0,0 +1 @@
+{"name": "sentence-transformers/multi-qa-mpnet-base-cos-v1", "version": "1.0.1", "description": "This is a sentence-transformers model: It maps sentences & paragraphs to a 768 dimensional dense vector space and was designed for semantic search. It has been trained on 215M pairs from diverse sources.", "model_format": "ONNX", "model_task_type": "TEXT_EMBEDDING", "model_config": {"model_type": "mpnet", "embedding_dimension": 768, "framework_type": "sentence_transformers", "pooling_mode": "MEAN", "normalize_result": true, "all_config": "{\"_name_or_path\": \"/root/.cache/torch/sentence_transformers/sentence-transformers_multi-qa-mpnet-base-cos-v1/\", \"architectures\": [\"MPNetModel\"], \"attention_probs_dropout_prob\": 0.1, \"bos_token_id\": 0, \"eos_token_id\": 2, \"hidden_act\": \"gelu\", \"hidden_dropout_prob\": 0.1, \"hidden_size\": 768, \"initializer_range\": 0.02, \"intermediate_size\": 3072, \"layer_norm_eps\": 1e-05, \"max_position_embeddings\": 514, \"model_type\": \"mpnet\", \"num_attention_heads\": 12, \"num_hidden_layers\": 12, \"pad_token_id\": 1, \"relative_attention_num_buckets\": 32, \"torch_dtype\": \"float32\", \"transformers_version\": \"4.31.0\", \"vocab_size\": 30527}"}}
diff --git a/tests/ml_model_listing/samples/config_folder/ml-models/huggingface/sentence-transformers/multi-qa-mpnet-base-cos-v1/1.0.1/torch_script/config.json b/tests/ml_model_listing/samples/config_folder/ml-models/huggingface/sentence-transformers/multi-qa-mpnet-base-cos-v1/1.0.1/torch_script/config.json
new file mode 100644
index 00000000..cbbb2145
--- /dev/null
+++ b/tests/ml_model_listing/samples/config_folder/ml-models/huggingface/sentence-transformers/multi-qa-mpnet-base-cos-v1/1.0.1/torch_script/config.json
@@ -0,0 +1 @@
+{"name": "sentence-transformers/multi-qa-mpnet-base-cos-v1", "version": "1.0.1", "description": "This is a sentence-transformers model: It maps sentences & paragraphs to a 768 dimensional dense vector space and was designed for semantic search. It has been trained on 215M pairs from diverse sources.", "model_format": "TORCH_SCRIPT", "model_task_type": "TEXT_EMBEDDING", "model_config": {"model_type": "mpnet", "embedding_dimension": 768, "framework_type": "sentence_transformers", "pooling_mode": "MEAN", "normalize_result": true, "all_config": "{\"_name_or_path\": \"/root/.cache/torch/sentence_transformers/sentence-transformers_multi-qa-mpnet-base-cos-v1/\", \"architectures\": [\"MPNetModel\"], \"attention_probs_dropout_prob\": 0.1, \"bos_token_id\": 0, \"eos_token_id\": 2, \"hidden_act\": \"gelu\", \"hidden_dropout_prob\": 0.1, \"hidden_size\": 768, \"initializer_range\": 0.02, \"intermediate_size\": 3072, \"layer_norm_eps\": 1e-05, \"max_position_embeddings\": 514, \"model_type\": \"mpnet\", \"num_attention_heads\": 12, \"num_hidden_layers\": 12, \"pad_token_id\": 1, \"relative_attention_num_buckets\": 32, \"torch_dtype\": \"float32\", \"transformers_version\": \"4.31.0\", \"vocab_size\": 30527}"}}
diff --git a/tests/ml_model_listing/samples/config_folder/ml-models/huggingface/sentence-transformers/multi-qa-mpnet-base-cos-v1/2.0.0/torch_script/config.json b/tests/ml_model_listing/samples/config_folder/ml-models/huggingface/sentence-transformers/multi-qa-mpnet-base-cos-v1/2.0.0/torch_script/config.json
new file mode 100644
index 00000000..118dd2a2
--- /dev/null
+++ b/tests/ml_model_listing/samples/config_folder/ml-models/huggingface/sentence-transformers/multi-qa-mpnet-base-cos-v1/2.0.0/torch_script/config.json
@@ -0,0 +1 @@
+{"name": "sentence-transformers/multi-qa-mpnet-base-cos-v1", "version": "2.0.0", "description": "This is a sentence-transformers model: It maps sentences & paragraphs to a 768 dimensional dense vector space and was designed for semantic search. It has been trained on 215M pairs from diverse sources. (New Version)", "model_format": "TORCH_SCRIPT", "model_task_type": "TEXT_EMBEDDING", "model_config": {"model_type": "mpnet", "embedding_dimension": 768, "framework_type": "sentence_transformers", "pooling_mode": "MEAN", "normalize_result": true, "all_config": "{\"_name_or_path\": \"/root/.cache/torch/sentence_transformers/sentence-transformers_multi-qa-mpnet-base-cos-v1/\", \"architectures\": [\"MPNetModel\"], \"attention_probs_dropout_prob\": 0.1, \"bos_token_id\": 0, \"eos_token_id\": 2, \"hidden_act\": \"gelu\", \"hidden_dropout_prob\": 0.1, \"hidden_size\": 768, \"initializer_range\": 0.02, \"intermediate_size\": 3072, \"layer_norm_eps\": 1e-05, \"max_position_embeddings\": 514, \"model_type\": \"mpnet\", \"num_attention_heads\": 12, \"num_hidden_layers\": 12, \"pad_token_id\": 1, \"relative_attention_num_buckets\": 32, \"torch_dtype\": \"float32\", \"transformers_version\": \"4.31.0\", \"vocab_size\": 30527}"}}
diff --git a/tests/ml_model_listing/samples/config_folder/ml-models/other_source/jhgan/ko-sroberta-multitask/1.0.1/torch_script/config.json b/tests/ml_model_listing/samples/config_folder/ml-models/other_source/jhgan/ko-sroberta-multitask/1.0.1/torch_script/config.json
new file mode 100644
index 00000000..bc543aec
--- /dev/null
+++ b/tests/ml_model_listing/samples/config_folder/ml-models/other_source/jhgan/ko-sroberta-multitask/1.0.1/torch_script/config.json
@@ -0,0 +1 @@
+{"name": "jhgan/ko-sroberta-multitask", "version": "1.0.1", "description": "This is a sentence-transformers model: It maps sentences & paragraphs to a 768 dimensional dense vector space and can be used for tasks like clustering or semantic search.", "model_format": "TORCH_SCRIPT", "model_task_type": "TEXT_EMBEDDING", "model_config": {"model_type": "roberta", "embedding_dimension": 768, "framework_type": "sentence_transformers", "pooling_mode": "MEAN", "normalize_result": false, "all_config": "{\"_name_or_path\": \"/root/.cache/torch/sentence_transformers/jhgan_ko-sroberta-multitask/\", \"architectures\": [\"RobertaModel\"], \"attention_probs_dropout_prob\": 0.1, \"bos_token_id\": 0, \"classifier_dropout\": null, \"eos_token_id\": 2, \"gradient_checkpointing\": false, \"hidden_act\": \"gelu\", \"hidden_dropout_prob\": 0.1, \"hidden_size\": 768, \"initializer_range\": 0.02, \"intermediate_size\": 3072, \"layer_norm_eps\": 1e-05, \"max_position_embeddings\": 514, \"model_type\": \"roberta\", \"num_attention_heads\": 12, \"num_hidden_layers\": 12, \"pad_token_id\": 1, \"position_embedding_type\": \"absolute\", \"tokenizer_class\": \"BertTokenizer\", \"torch_dtype\": \"float32\", \"transformers_version\": \"4.31.0\", \"type_vocab_size\": 1, \"use_cache\": true, \"vocab_size\": 32000}"}}
diff --git a/tests/ml_model_listing/samples/config_paths.txt b/tests/ml_model_listing/samples/config_paths.txt
new file mode 100644
index 00000000..3705f396
--- /dev/null
+++ b/tests/ml_model_listing/samples/config_paths.txt
@@ -0,0 +1 @@
+ml-models/huggingface/intfloat/e5-small-v2/1.0.1/onnx/config.json ml-models/other_source/jhgan/ko-sroberta-multitask/1.0.1/torch_script/config.json ml-models/huggingface/sentence-transformers/clip-ViT-B-32-multilingual-v1/1.0.1/torch_script/config.json ml-models/huggingface/sentence-transformers/multi-qa-mpnet-base-cos-v1/1.0.1/onnx/config.json ml-models/huggingface/sentence-transformers/multi-qa-mpnet-base-cos-v1/1.0.1/torch_script/config.json ml-models/huggingface/sentence-transformers/multi-qa-mpnet-base-cos-v1/2.0.0/torch_script/config.json
\ No newline at end of file
diff --git a/tests/ml_model_listing/samples/pretrained_model_listing.json b/tests/ml_model_listing/samples/pretrained_model_listing.json
new file mode 100644
index 00000000..89e92945
--- /dev/null
+++ b/tests/ml_model_listing/samples/pretrained_model_listing.json
@@ -0,0 +1,53 @@
+[
+  {
+    "name": "huggingface/intfloat/e5-small-v2",
+    "versions": {
+      "1.0.1": {
+        "format": [
+          "onnx"
+        ],
+        "description": "This is a sentence-transformers model: It maps sentences & paragraphs to a 384 dimensional dense vector space."
+      }
+    }
+  },
+  {
+    "name": "huggingface/sentence-transformers/clip-ViT-B-32-multilingual-v1",
+    "versions": {
+      "1.0.1": {
+        "format": [
+          "torch_script"
+        ],
+        "description": "This is a multi-lingual version of the OpenAI CLIP-ViT-B32 model. You can map text and images to a common dense vector space such that images and the matching texts are close. This model can be used for image search and for multi-lingual zero-shot image classification ."
+      }
+    }
+  },
+  {
+    "name": "huggingface/sentence-transformers/multi-qa-mpnet-base-cos-v1",
+    "versions": {
+      "1.0.1": {
+        "format": [
+          "onnx",
+          "torch_script"
+        ],
+        "description": "This is a sentence-transformers model: It maps sentences & paragraphs to a 768 dimensional dense vector space and was designed for semantic search. It has been trained on 215M pairs from diverse sources."
+      },
+      "2.0.0": {
+        "format": [
+          "torch_script"
+        ],
+        "description": "This is a sentence-transformers model: It maps sentences & paragraphs to a 768 dimensional dense vector space and was designed for semantic search. It has been trained on 215M pairs from diverse sources. (New Version)"
+      }
+    }
+  },
+  {
+    "name": "other_source/jhgan/ko-sroberta-multitask",
+    "versions": {
+      "1.0.1": {
+        "format": [
+          "torch_script"
+        ],
+        "description": "This is a sentence-transformers model: It maps sentences & paragraphs to a 768 dimensional dense vector space and can be used for tasks like clustering or semantic search."
+      }
+    }
+  }
+]
diff --git a/tests/ml_model_listing/test_update_pretrained_model_listing.py b/tests/ml_model_listing/test_update_pretrained_model_listing.py
new file mode 100644
index 00000000..bceae131
--- /dev/null
+++ b/tests/ml_model_listing/test_update_pretrained_model_listing.py
@@ -0,0 +1,132 @@
+# SPDX-License-Identifier: Apache-2.0
+# The OpenSearch Contributors require contributions made to
+# this file be licensed under the Apache-2.0 license or a
+# compatible open source license.
+# Any modifications Copyright OpenSearch Contributors. See
+# GitHub history for details.
+
+# We need to append UTILS_MODEL_UPLOADER_DIR path so that we can import
+# functions from update_pretrained_model_listing.py
+# since this python script is not in the root directory.
+
+import json
+import os
+import shutil
+import sys
+
+import pytest
+
+THIS_DIR = os.path.dirname(__file__)
+UTILS_MODEL_UPLOADER_DIR = os.path.join(THIS_DIR, "../../utils/model_uploader")
+sys.path.append(UTILS_MODEL_UPLOADER_DIR)
+
+SAMPLE_FOLDER = os.path.join(THIS_DIR, "samples")
+CONFIG_PATHS_TXT_FILENAME = "config_paths.txt"
+CONFIG_FOLDERNAME = "config_folder"
+SAMPLE_PRETRAINED_MODEL_LISTING = os.path.join(
+    SAMPLE_FOLDER, "pretrained_model_listing.json"
+)
+SAMPLE_FOLDER_COPY = os.path.join(THIS_DIR, "samples_copy")
+SAMPLE_MISSING_CONFIG_SUBFOLDERNAME = "ml-models/huggingface/sentence-transformers"
+TEST_FILE = os.path.join(THIS_DIR, "test_pretrained_model_listing.json")
+
+from update_pretrained_model_listing import main as update_pretrained_model_listing_main
+
+
+def clean_test_file():
+    if os.path.isfile(TEST_FILE):
+        os.remove(TEST_FILE)
+
+
+def copy_samples_folder():
+    shutil.copytree(SAMPLE_FOLDER, SAMPLE_FOLDER_COPY)
+
+
+def clean_samples_folder_copy():
+    if os.path.exists(SAMPLE_FOLDER_COPY):
+        for files in os.listdir(SAMPLE_FOLDER_COPY):
+            sub_path = os.path.join(SAMPLE_FOLDER_COPY, files)
+            if os.path.isfile(sub_path):
+                os.remove(sub_path)
+            else:
+                try:
+                    shutil.rmtree(sub_path)
+                except OSError as err:
+                    print(
+                        "Failed to delete files, please delete all files in "
+                        + str(SAMPLE_FOLDER_COPY)
+                        + " "
+                        + str(err)
+                    )
+
+        shutil.rmtree(SAMPLE_FOLDER_COPY)
+
+
+clean_samples_folder_copy()
+clean_test_file()
+
+
+def test_create_new_pretrained_model_listing():
+    clean_test_file()
+    try:
+        update_pretrained_model_listing_main(
+            [
+                os.path.join(SAMPLE_FOLDER, CONFIG_PATHS_TXT_FILENAME),
+                os.path.join(SAMPLE_FOLDER, CONFIG_FOLDERNAME),
+                "--pretrained_model_listing_json_filepath",
+                TEST_FILE,
+            ]
+        )
+    except Exception as e:
+        assert False, print(f"Failed while creating new pretrained model listing: {e}")
+
+    try:
+        with open(SAMPLE_PRETRAINED_MODEL_LISTING, "r") as f:
+            sample_pretrained_model_listing = json.load(f)
+    except Exception as e:
+        assert False, print(
+            f"Cannot open {SAMPLE_PRETRAINED_MODEL_LISTING} to use it for verification: {e}"
+        )
+
+    try:
+        with open(TEST_FILE, "r") as f:
+            test_pretrained_model_listing = json.load(f)
+    except Exception as e:
+        assert False, print(f"Cannot open {TEST_FILE} to verify its content: {e}")
+
+    assert test_pretrained_model_listing == sample_pretrained_model_listing, print(
+        "Incorrect pretrained model listing"
+    )
+
+    clean_test_file()
+
+
+def test_missing_config_file():
+    clean_test_file()
+    clean_samples_folder_copy()
+
+    copy_samples_folder()
+    shutil.rmtree(
+        os.path.join(
+            SAMPLE_FOLDER_COPY, CONFIG_FOLDERNAME, SAMPLE_MISSING_CONFIG_SUBFOLDERNAME
+        )
+    )
+
+    with pytest.raises(Exception) as exc_info:
+        update_pretrained_model_listing_main(
+            [
+                os.path.join(SAMPLE_FOLDER_COPY, CONFIG_PATHS_TXT_FILENAME),
+                os.path.join(SAMPLE_FOLDER_COPY, CONFIG_FOLDERNAME),
+                "--pretrained_model_listing_json_filepath",
+                TEST_FILE,
+            ]
+        )
+    assert exc_info.type is Exception
+    assert "Cannot open" in str(exc_info.value)
+
+    clean_test_file()
+    clean_samples_folder_copy()
+
+
+clean_samples_folder_copy()
+clean_test_file()
diff --git a/utils/__init__.py b/utils/__init__.py
new file mode 100644
index 00000000..8d89f258
--- /dev/null
+++ b/utils/__init__.py
@@ -0,0 +1,6 @@
+# SPDX-License-Identifier: Apache-2.0
+# The OpenSearch Contributors require contributions made to
+# this file be licensed under the Apache-2.0 license or a
+# compatible open source license.
+# Any modifications Copyright OpenSearch Contributors. See
+# GitHub history for details.
diff --git a/utils/model_uploader/__init__.py b/utils/model_uploader/__init__.py
new file mode 100644
index 00000000..8d89f258
--- /dev/null
+++ b/utils/model_uploader/__init__.py
@@ -0,0 +1,6 @@
+# SPDX-License-Identifier: Apache-2.0
+# The OpenSearch Contributors require contributions made to
+# this file be licensed under the Apache-2.0 license or a
+# compatible open source license.
+# Any modifications Copyright OpenSearch Contributors. See
+# GitHub history for details.
diff --git a/utils/model_uploader/model_autotracing.py b/utils/model_uploader/model_autotracing.py
index 299daf5b..3ffd3e56 100644
--- a/utils/model_uploader/model_autotracing.py
+++ b/utils/model_uploader/model_autotracing.py
@@ -53,7 +53,6 @@
 ]
 RTOL_TEST = 1e-03
 ATOL_TEST = 1e-05
-ML_BASE_URI = "/_plugins/_ml"
 
 
 def verify_license_in_md_file() -> bool:
diff --git a/utils/model_uploader/update_pretrained_model_listing.py b/utils/model_uploader/update_pretrained_model_listing.py
new file mode 100644
index 00000000..db56da7a
--- /dev/null
+++ b/utils/model_uploader/update_pretrained_model_listing.py
@@ -0,0 +1,147 @@
+# SPDX-License-Identifier: Apache-2.0
+# The OpenSearch Contributors require contributions made to
+# this file be licensed under the Apache-2.0 license or a
+# compatible open source license.
+# Any modifications Copyright OpenSearch Contributors. See
+# GitHub history for details.
+
+# This program is run by the "Update Pretrained Model Listing" workflow
+# (see update_model_listing.yml) to update pretrained_model_listing.json.
+
+import argparse
+import json
+import os
+import sys
+from typing import Optional
+
+JSON_FILENAME = "pretrained_model_listing.json"
+JSON_DIRNAME = "utils/model_uploader/model_listing"
+PRETRAINED_MODEL_LISTING_JSON_FILEPATH = os.path.join(JSON_DIRNAME, JSON_FILENAME)
+
+
+def get_sentence_transformer_model_description(
+    config_folderpath: str, config_filepath: str
+) -> Optional[str]:
+    """
+    Get description of the pretrained sentence transformer model from its config file
+
+    :param config_folderpath: Path to the folder that stores copies of config files from S3 (e.g. 'config_folder')
+    :type config_folderpath: string
+    :param config_filepath: Path to the local config file, relative to config_folderpath
+        (e.g. 'ml-models/huggingface/sentence-transformers/all-MiniLM-L12-v2/2.0.0/onnx/config.json')
+    :type config_filepath: string
+    :return: Description of the model
+    :rtype: string or None
+    """
+    filepath = os.path.join(config_folderpath, config_filepath)
+    try:
+        with open(filepath, "r") as f:
+            model_config = json.load(f)
+    except Exception as e:
+        raise Exception(f"Cannot open {filepath} to get model description: {e}")
+    if "description" in model_config:
+        return model_config["description"]
+    else:
+        return None
+
+
+def create_new_pretrained_model_listing(
+    config_paths_txt_filepath: str,
+    config_folderpath: str,
+    pretrained_model_listing_json_filepath: str = PRETRAINED_MODEL_LISTING_JSON_FILEPATH,
+):
+    """
+    Create a new pretrained model listing and store it at pretrained_model_listing_json_filepath,
+    based on the current models in config_paths_txt_filepath and their config files in config_folderpath
+
+    :param config_paths_txt_filepath: Path to the txt file that stores a list of config paths from S3
+        in the ml-models/huggingface/ folder of the S3 bucket
+    :type config_paths_txt_filepath: string
+    :param config_folderpath: Path to the folder that stores copies of config files from S3
+    :type config_folderpath: string
+    :param pretrained_model_listing_json_filepath: Path to the json file that stores the new model listing
+    :type pretrained_model_listing_json_filepath: string
+    :return: No return value expected
+    :rtype: None
+    """
+    print("\n=== Begin running update_pretrained_model_listing.py ===")
+    print(f"--- Reading {config_paths_txt_filepath} ---")
+    with open(config_paths_txt_filepath, "r") as f:
+        config_paths_lst = f.read().split()
+
+    print("\n--- Creating New Model Listing --- ")
+    new_model_listing_dict = {}
+    for config_filepath in config_paths_lst:
+        # (e.g. 'ml-models/huggingface/sentence-transformers/all-MiniLM-L12-v2/2.0.0/onnx/config.json')
+        model_parts = config_filepath.split("/")
+        model_name = "/".join(model_parts[1:4])
+        model_version = model_parts[4]
+        model_format = model_parts[5]
+        if model_name not in new_model_listing_dict:
+            new_model_listing_dict[model_name] = {
+                "name": model_name,
+                "versions": {},
+            }
+        versions_content = new_model_listing_dict[model_name]["versions"]
+        if model_version not in versions_content:
+            versions_content[model_version] = {
+                "format": [],
+            }
+        versions_content[model_version]["format"].append(model_format)
+        if "description" not in versions_content[model_version]:
+            description = get_sentence_transformer_model_description(
+                config_folderpath, config_filepath
+            )
+            if description is not None:
+                versions_content[model_version]["description"] = description
+
+    new_model_listing_lst = list(new_model_listing_dict.values())
+    new_model_listing_lst = sorted(new_model_listing_lst, key=lambda d: d["name"])
+    for model_dict in new_model_listing_lst:
+        model_dict["versions"] = dict(sorted(model_dict["versions"].items()))
+
+    print(
+        f"\n--- Dumping New Model Listing in {pretrained_model_listing_json_filepath} --- "
+    )
+    if not os.path.isdir(JSON_DIRNAME):
+        os.makedirs(JSON_DIRNAME)
+    with open(pretrained_model_listing_json_filepath, "w") as f:
+        json.dump(new_model_listing_lst, f, indent=2)
+    print("\n=== Finished running update_pretrained_model_listing.py ===")
+
+
+def main(args):
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument(
+        "config_paths_txt_filepath",
+        type=str,
+        help="Path to the txt file that stores a list of config paths from S3",
+    )
+    parser.add_argument(
+        "config_folderpath",
+        type=str,
+        help="Path to the folder that stores copies of config files from S3",
+    )
+    parser.add_argument(
+        "-fp",
+        "--pretrained_model_listing_json_filepath",
+        type=str,
+        default=PRETRAINED_MODEL_LISTING_JSON_FILEPATH,
+        help="Path to the json file that stores new model listing",
+    )
+
+    parsed_args = parser.parse_args(args)
+
+    if not parsed_args.config_paths_txt_filepath.endswith(".txt"):
+        raise Exception(
+            "Invalid argument: config_paths_txt_filepath should be .txt file"
+        )
+
+    create_new_pretrained_model_listing(
+        parsed_args.config_paths_txt_filepath,
+        parsed_args.config_folderpath,
+        parsed_args.pretrained_model_listing_json_filepath,
+    )
+
+
+if __name__ == "__main__":
+    main(sys.argv[1:])
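
Note for reviewers: the listing structure falls directly out of the S3 key layout ml-models/<model_source>/<org>/<model_name>/<version>/<model_format>/config.json. A minimal sketch of the decomposition that create_new_pretrained_model_listing performs, using the example key from the script's own inline comment (the sketch is illustrative, not part of the PR):

# Sketch: how update_pretrained_model_listing.py derives listing fields from one S3 key.
config_filepath = (
    "ml-models/huggingface/sentence-transformers/all-MiniLM-L12-v2/2.0.0/onnx/config.json"
)
model_parts = config_filepath.split("/")
model_name = "/".join(model_parts[1:4])  # 'huggingface/sentence-transformers/all-MiniLM-L12-v2'
model_version = model_parts[4]  # '2.0.0'
model_format = model_parts[5]  # 'onnx'
print(model_name, model_version, model_format)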
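To sanity-check the script locally, it can be driven the same way test_create_new_pretrained_model_listing drives it, against the sample fixtures added in this PR. A minimal sketch, assuming it is run from the repository root (the package import works because this PR adds the __init__.py files; the /tmp output path is an arbitrary choice for illustration, not something the workflow uses):

# Sketch: regenerate a model listing from the test fixtures; mirrors the new unit test.
from utils.model_uploader.update_pretrained_model_listing import main

main(
    [
        "tests/ml_model_listing/samples/config_paths.txt",  # space-separated S3 config keys
        "tests/ml_model_listing/samples/config_folder",  # local copies of the config files
        "--pretrained_model_listing_json_filepath",
        "/tmp/pre_trained_models.json",  # illustrative output path
    ]
)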