From 20223e6dfcc3395c19180099a6af52ce9a76a77a Mon Sep 17 00:00:00 2001 From: Thanawan Atchariyachanvanit Date: Fri, 18 Aug 2023 17:04:54 -0700 Subject: [PATCH] [Automating Model Tracing and Uploading] PR 1 Model Auto-tracing & Uploading (#209) * Initiate PR #1 Model Auto-tracing & Uploading Signed-off-by: Thanawan Atchariyachanvanit * Improve update_changelog_md.py & sentencetransformermodel.py scraping Signed-off-by: Thanawan Atchariyachanvanit * Change default to N/A Signed-off-by: Thanawan Atchariyachanvanit * Update update_models_upload_history_md.py Signed-off-by: Thanawan Atchariyachanvanit * Update delete_model_uploader_branch.yml Signed-off-by: Thanawan Atchariyachanvanit * Correct linting & Update CHANGELOG.md Signed-off-by: Thanawan Atchariyachanvanit * Revert "Correct demo_ml_commons_integration.ipynb (#208)" This reverts commit 664e447190a38b18cffdd9d1009b4ba06f70e283. Signed-off-by: Thanawan Atchariyachanvanit * Update CHANGELOG.md Signed-off-by: Thanawan Atchariyachanvanit * Small fix Signed-off-by: Thanawan Atchariyachanvanit * Add model_type Signed-off-by: Thanawan Atchariyachanvanit * Removed a modified file from pull request Signed-off-by: Thanawan Atchariyachanvanit * Removed a modified file from pull request Signed-off-by: Thanawan Atchariyachanvanit * Update model_autotracing.py Signed-off-by: Thanawan Atchariyachanvanit * Update model_autotracing.py Signed-off-by: Thanawan Atchariyachanvanit * Update model_autotracing.py Signed-off-by: Thanawan Atchariyachanvanit --------- Signed-off-by: Thanawan Atchariyachanvanit --- .ci/run-repository.sh | 40 +- .ci/run-tests | 2 +- .github/workflows/build_deploy_doc.yml | 2 +- .../delete_model_uploader_branch.yml | 18 + .github/workflows/integration.yml | 2 +- .github/workflows/model_uploader.yml | 412 ++++++++++++ CHANGELOG.md | 2 + noxfile.py | 25 +- .../ml_models/sentencetransformermodel.py | 4 +- utils/{ => lint}/license-headers.py | 0 utils/model_uploader/model_autotracing.py | 599 ++++++++++++++++++ 
.../save_model_file_path_to_env.py | 88 +++ utils/model_uploader/update_changelog_md.py | 123 ++++ .../update_models_upload_history_md.py | 283 +++++++++ 14 files changed, 1585 insertions(+), 15 deletions(-) create mode 100644 .github/workflows/delete_model_uploader_branch.yml create mode 100644 .github/workflows/model_uploader.yml rename utils/{ => lint}/license-headers.py (100%) create mode 100644 utils/model_uploader/model_autotracing.py create mode 100644 utils/model_uploader/save_model_file_path_to_env.py create mode 100644 utils/model_uploader/update_changelog_md.py create mode 100644 utils/model_uploader/update_models_upload_history_md.py diff --git a/.ci/run-repository.sh b/.ci/run-repository.sh index ba0cea2b..6725e47d 100755 --- a/.ci/run-repository.sh +++ b/.ci/run-repository.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -# Called by entry point `run-test` use this script to add your repository specific test commands +# Called by entry point `run-test` use this script to add your repository specific task commands # Once called opensearch is up and running and the following parameters are available to this script # OPENSEARCH_VERSION -- version e.g Major.Minor.Patch(-Prelease) @@ -16,7 +16,7 @@ set -e echo -e "\033[34;1mINFO:\033[0m URL ${opensearch_url}\033[0m" echo -e "\033[34;1mINFO:\033[0m EXTERNAL OS URL ${external_opensearch_url}\033[0m" echo -e "\033[34;1mINFO:\033[0m VERSION ${OPENSEARCH_VERSION}\033[0m" -echo -e "\033[34;1mINFO:\033[0m IS_DOC: ${IS_DOC}\033[0m" +echo -e "\033[34;1mINFO:\033[0m TASK_TYPE: ${TASK_TYPE}\033[0m" echo -e "\033[34;1mINFO:\033[0m TEST_SUITE ${TEST_SUITE}\033[0m" echo -e "\033[34;1mINFO:\033[0m PYTHON_VERSION ${PYTHON_VERSION}\033[0m" echo -e "\033[34;1mINFO:\033[0m PYTHON_CONNECTION_CLASS ${PYTHON_CONNECTION_CLASS}\033[0m" @@ -33,7 +33,8 @@ docker build \ echo -e "\033[1m>>>>> Run [opensearch-project/opensearch-py-ml container] >>>>>>>>>>>>>>>>>>>>>>>>>>>>>\033[0m" -if [[ "$IS_DOC" == "false" ]]; then +if [[ "$TASK_TYPE" == 
"test" ]]; then + # Set up OpenSearch cluster & Run integration and unit tests (Invoked by integration.yml workflow) docker run \ --network=${network_name} \ --env "STACK_VERSION=${STACK_VERSION}" \ @@ -45,10 +46,11 @@ if [[ "$IS_DOC" == "false" ]]; then --name opensearch-py-ml-test-runner \ opensearch-project/opensearch-py-ml \ nox -s "test-${PYTHON_VERSION}(pandas_version='${PANDAS_VERSION}')" + docker cp opensearch-py-ml-test-runner:/code/opensearch-py-ml/junit/ ./junit/ - docker rm opensearch-py-ml-test-runner -else +elif [[ "$TASK_TYPE" == "doc" ]]; then + # Set up OpenSearch cluster & Run docs (Invoked by build_deploy_doc.yml workflow) docker run \ --network=${network_name} \ --env "STACK_VERSION=${STACK_VERSION}" \ @@ -60,7 +62,31 @@ else --name opensearch-py-ml-doc-runner \ opensearch-project/opensearch-py-ml \ nox -s docs + docker cp opensearch-py-ml-doc-runner:/code/opensearch-py-ml/docs/build/ ./docs/ - docker rm opensearch-py-ml-doc-runner -fi \ No newline at end of file +elif [[ "$TASK_TYPE" == "trace" ]]; then + # Set up OpenSearch cluster & Run model autotracing (Invoked by model_uploader.yml workflow) + echo -e "\033[34;1mINFO:\033[0m MODEL_ID: ${MODEL_ID}\033[0m" + echo -e "\033[34;1mINFO:\033[0m MODEL_VERSION: ${MODEL_VERSION}\033[0m" + echo -e "\033[34;1mINFO:\033[0m TRACING_FORMAT: ${TRACING_FORMAT}\033[0m" + echo -e "\033[34;1mINFO:\033[0m EMBEDDING_DIMENSION: ${EMBEDDING_DIMENSION:-N/A}\033[0m" + echo -e "\033[34;1mINFO:\033[0m POOLING_MODE: ${POOLING_MODE:-N/A}\033[0m" + echo -e "\033[34;1mINFO:\033[0m MODEL_DESCRIPTION: ${MODEL_DESCRIPTION:-N/A}\033[0m" + + docker run \ + --network=${network_name} \ + --env "STACK_VERSION=${STACK_VERSION}" \ + --env "OPENSEARCH_URL=${opensearch_url}" \ + --env "OPENSEARCH_VERSION=${OPENSEARCH_VERSION}" \ + --env "TEST_SUITE=${TEST_SUITE}" \ + --env "PYTHON_CONNECTION_CLASS=${PYTHON_CONNECTION_CLASS}" \ + --env "TEST_TYPE=server" \ + --name opensearch-py-ml-trace-runner \ + opensearch-project/opensearch-py-ml 
\ + nox -s trace -- ${MODEL_ID} ${MODEL_VERSION} ${TRACING_FORMAT} -ed ${EMBEDDING_DIMENSION} -pm ${POOLING_MODE} -md ${MODEL_DESCRIPTION:+"$MODEL_DESCRIPTION"} + + docker cp opensearch-py-ml-trace-runner:/code/opensearch-py-ml/upload/ ./upload/ + docker cp opensearch-py-ml-trace-runner:/code/opensearch-py-ml/trace_output/ ./trace_output/ + docker rm opensearch-py-ml-trace-runner +fi diff --git a/.ci/run-tests b/.ci/run-tests index abfeac34..258da8a5 100755 --- a/.ci/run-tests +++ b/.ci/run-tests @@ -10,7 +10,7 @@ export PYTHON_CONNECTION_CLASS="${PYTHON_CONNECTION_CLASS:=Urllib3HttpConnection export CLUSTER="${1:-opensearch}" export SECURE_INTEGRATION="${2:-true}" export OPENSEARCH_VERSION="${3:-latest}" -export IS_DOC="${4:-false}" +export TASK_TYPE="${4:-test}" if [[ "$SECURE_INTEGRATION" == "true" ]]; then export OPENSEARCH_URL_EXTENSION="https" else diff --git a/.github/workflows/build_deploy_doc.yml b/.github/workflows/build_deploy_doc.yml index 9321b32e..876a73ea 100644 --- a/.github/workflows/build_deploy_doc.yml +++ b/.github/workflows/build_deploy_doc.yml @@ -20,7 +20,7 @@ jobs: - name: Checkout Repository uses: actions/checkout@v2 - name: Integ ${{ matrix.cluster }} secured=${{ matrix.secured }} version=${{matrix.entry.opensearch_version}} - run: "./.ci/run-tests ${{ matrix.cluster }} ${{ matrix.secured }} ${{ matrix.entry.opensearch_version }} true" + run: "./.ci/run-tests ${{ matrix.cluster }} ${{ matrix.secured }} ${{ matrix.entry.opensearch_version }} doc" - name: Deploy uses: peaceiris/actions-gh-pages@v3 with: diff --git a/.github/workflows/delete_model_uploader_branch.yml b/.github/workflows/delete_model_uploader_branch.yml new file mode 100644 index 00000000..68aa2259 --- /dev/null +++ b/.github/workflows/delete_model_uploader_branch.yml @@ -0,0 +1,18 @@ +name: Delete merged branch for model-uploader & model-listing-uploader +on: + pull_request: + types: + - closed + +jobs: + delete-branch: + runs-on: ubuntu-latest + if: 
startsWith(github.event.pull_request.head.ref,'model-uploader/') || startsWith(github.event.pull_request.head.ref,'model-listing-uploader/') + steps: + # Compared to backport.yml, this GitHub action will delete branch + # of a PR that is closed, but not merged as well. + - name: Delete closed PR branch + uses: dawidd6/action-delete-branch@v3 + with: + GITHUB_TOKEN: ${{github.token}} + numbers: ${{github.event.pull_request.number}} diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index 607bd356..e36c7735 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -18,7 +18,7 @@ jobs: - name: Checkout uses: actions/checkout@v2 - name: Integ ${{ matrix.cluster }} secured=${{ matrix.secured }} version=${{matrix.entry.opensearch_version}} - run: "./.ci/run-tests ${{ matrix.cluster }} ${{ matrix.secured }} ${{ matrix.entry.opensearch_version }}" + run: "./.ci/run-tests ${{ matrix.cluster }} ${{ matrix.secured }} ${{ matrix.entry.opensearch_version }} test" - name: Upload coverage to Codecov uses: codecov/codecov-action@v2 with: diff --git a/.github/workflows/model_uploader.yml b/.github/workflows/model_uploader.yml new file mode 100644 index 00000000..751cd7b6 --- /dev/null +++ b/.github/workflows/model_uploader.yml @@ -0,0 +1,412 @@ +name: Model Auto-tracing & Uploading +on: + # Step 1: Initiate the workflow + workflow_dispatch: + inputs: + model_source: + description: "Model source (e.g. huggingface)" + required: true + type: string + default: "huggingface" + model_id: + description: "Model ID for auto-tracing and uploading (e.g. sentence-transformers/msmarco-distilbert-base-tas-b)" + required: true + type: string + model_version: + description: "Model version number (e.g. 
1.0.1)" + required: true + type: string + tracing_format: + description: "Model format for auto-tracing (torch_script/onnx)" + required: true + type: choice + options: + - "BOTH" + - "TORCH_SCRIPT" + - "ONNX" + embedding_dimension: + description: "(Optional) Embedding Dimension (Specify here if it does not exist in original config.json file, or you want to overwrite it.)" + required: false + type: int + pooling_mode: + description: "(Optional) Pooling Mode (Specify here if it does not exist in original config.json file or you want to overwrite it.)" + required: false + type: choice + options: + - "" + - "CLS" + - "MEAN" + - "MAX" + - "MEAN_SQRT_LEN" + model_description: + description: "(Optional) Description (Specify here if you want to overwrite the default model description)" + required: false + type: string + +jobs: + # Step 2: Initiate workflow variable + init-workflow-var: + runs-on: 'ubuntu-latest' + steps: + - name: Fail if branch is not main + if: github.ref == 'refs/heads/main' + run: | + echo "This workflow should only be triggered on 'main' branch" + exit 1 + - name: Initiate folders + id: init_folders + run: | + model_id=${{ github.event.inputs.model_id }} + echo "model_folder=ml-models/${{github.event.inputs.model_source}}/${model_id}" >> $GITHUB_OUTPUT + echo "sentence_transformer_folder=ml-models/${{github.event.inputs.model_source}}/${model_id%%/*}/" >> $GITHUB_OUTPUT + - name: Initiate workflow_info + id: init_workflow_info + run: | + embedding_dimension=${{ github.event.inputs.embedding_dimension }} + pooling_mode=${{ github.event.inputs.pooling_mode }} + model_description="${{ github.event.inputs.model_description }}" + + workflow_info=" + ============= Workflow Details ============== + - Workflow Name: ${{ github.workflow }} + - Workflow Run ID: ${{ github.run_id }} + - Workflow Initiator: @${{ github.actor }} + + ========= Workflow Input Information ========= + - Model ID: ${{ github.event.inputs.model_id }} + - Model Version: ${{ 
github.event.inputs.model_version }} + - Tracing Format: ${{ github.event.inputs.tracing_format }} + - Embedding Dimension: ${embedding_dimension:-N/A} + - Pooling Mode: ${pooling_mode:-N/A} + - Model Description: ${model_description:-N/A} + + ======== Workflow Output Information ========= + - Embedding Verification: Passed" + + echo "workflow_info<<EOF" >> $GITHUB_OUTPUT + echo "${workflow_info@E}" >> $GITHUB_OUTPUT + echo "EOF" >> $GITHUB_OUTPUT + echo "${workflow_info@E}" + - name: Initiate license_line + id: init_license_line + run: | + echo "verified=:white_check_mark: — It is verified that this model is licensed under Apache 2.0" >> $GITHUB_OUTPUT + echo "unverified=- [ ] :warning: The license cannot be verified. Please confirm by yourself that the model is licensed under Apache 2.0 :warning:" >> $GITHUB_OUTPUT + outputs: + model_folder: ${{ steps.init_folders.outputs.model_folder }} + sentence_transformer_folder: ${{ steps.init_folders.outputs.sentence_transformer_folder }} + workflow_info: ${{ steps.init_workflow_info.outputs.workflow_info }} + verified_license_line: ${{ steps.init_license_line.outputs.verified }} + unverified_license_line: ${{ steps.init_license_line.outputs.unverified }} + + # Step 3: Check if the model already exists in the model hub + checking-out-model-hub: + needs: init-workflow-var + runs-on: 'ubuntu-latest' + permissions: + id-token: write + contents: read + environment: opensearch-py-ml-cicd-env + steps: + - name: Checkout Repository + uses: actions/checkout@v3 + - name: Set Up Python + uses: actions/setup-python@v2 + with: + python-version: '3.x' + - name: Configure AWS Credentials + uses: aws-actions/configure-aws-credentials@v2 + with: + aws-region: ${{ secrets.MODEL_UPLOADER_AWS_REGION }} + role-to-assume: ${{ secrets.MODEL_UPLOADER_ROLE }} + role-session-name: checking-out-model-hub + - name: Check if TORCH_SCRIPT Model Exists + if: github.event.inputs.tracing_format == 'TORCH_SCRIPT' || github.event.inputs.tracing_format == 'BOTH' +
run: | + TORCH_FILE_PATH=$(python utils/model_uploader/save_model_file_path_to_env.py \ + ${{ needs.init-workflow-var.outputs.sentence_transformer_folder }} ${{ github.event.inputs.model_id }} \ + ${{ github.event.inputs.model_version }} TORCH_SCRIPT) + aws s3api head-object --bucket ${{ secrets.MODEL_BUCKET }} --key $TORCH_FILE_PATH > /dev/null 2>&1 || TORCH_MODEL_NOT_EXIST=true + if [[ -z $TORCH_MODEL_NOT_EXIST ]] + then + echo "${{ github.event.inputs.model_id }} already exists on model hub for TORCH_SCRIPT format and ${{ github.event.inputs.model_version }} version." + exit 1 + fi + - name: Check if ONNX Model Exists + if: github.event.inputs.tracing_format == 'ONNX' || github.event.inputs.tracing_format == 'BOTH' + run: | + ONNX_FILE_PATH=$(python utils/model_uploader/save_model_file_path_to_env.py \ + ${{ needs.init-workflow-var.outputs.sentence_transformer_folder }} ${{ github.event.inputs.model_id }} \ + ${{ github.event.inputs.model_version }} ONNX) + aws s3api head-object --bucket ${{ secrets.MODEL_BUCKET }} --key $ONNX_FILE_PATH > /dev/null 2>&1 || ONNX_MODEL_NOT_EXIST=true + if [[ -z $ONNX_MODEL_NOT_EXIST ]] + then + echo "${{ github.event.inputs.model_id }} already exists on model hub for ONNX format and ${{ github.event.inputs.model_version }} version." 
+ exit 1 + fi + + # Step 4: Trace the model, Verify the embeddings & Upload the model files as artifacts + model-auto-tracing: + needs: [init-workflow-var, checking-out-model-hub] + name: model-auto-tracing + runs-on: ubuntu-latest + permissions: + id-token: write + contents: read + environment: opensearch-py-ml-cicd-env + strategy: + matrix: + cluster: ["opensearch"] + secured: ["true"] + entry: + - { opensearch_version: 2.7.0 } + steps: + - name: Checkout + uses: actions/checkout@v3 + - name: Export Arguments + run: | + echo "MODEL_ID=${{ github.event.inputs.model_id }}" >> $GITHUB_ENV + echo "MODEL_VERSION=${{ github.event.inputs.model_version }}" >> $GITHUB_ENV + echo "TRACING_FORMAT=${{ github.event.inputs.tracing_format }}" >> $GITHUB_ENV + echo "EMBEDDING_DIMENSION=${{ github.event.inputs.embedding_dimension }}" >> $GITHUB_ENV + echo "POOLING_MODE=${{ github.event.inputs.pooling_mode }}" >> $GITHUB_ENV + echo "MODEL_DESCRIPTION=${{ github.event.inputs.model_description }}" >> $GITHUB_ENV + - name: Autotracing ${{ matrix.cluster }} secured=${{ matrix.secured }} version=${{matrix.entry.opensearch_version}} + run: "./.ci/run-tests ${{ matrix.cluster }} ${{ matrix.secured }} ${{ matrix.entry.opensearch_version }} trace" + - name: License Verification + id: license_verification + run: | + apache_verified=$(> $GITHUB_OUTPUT + echo "license_info=Automatically Verified" >> $GITHUB_OUTPUT + else + echo "license_line=${{ needs.init-workflow-var.outputs.unverified_license_line }}" >> $GITHUB_OUTPUT + echo "license_info=Manually Verified" >> $GITHUB_OUTPUT + fi + - name: Model Description Info + id: model_description_info + run: | + model_description_info="$(> $GITHUB_OUTPUT + echo "$model_description_info" + - name: Upload Artifact + uses: actions/upload-artifact@v3 + with: + name: upload + path: ./upload/ + retention-days: 5 + if-no-files-found: error + - name: Configure AWS Credentials + uses: aws-actions/configure-aws-credentials@v2 + with: + aws-region: ${{ 
secrets.MODEL_UPLOADER_AWS_REGION }} + role-to-assume: ${{ secrets.MODEL_UPLOADER_ROLE }} + role-session-name: model-auto-tracing + - name: Dryrun model uploading + id: dryrun_model_uploading + run: | + dryrun_output=$(aws s3 sync ./upload/ s3://${{ secrets.MODEL_BUCKET }}/${{ needs.init-workflow-var.outputs.sentence_transformer_folder }} --dryrun \ + | sed 's|s3://${{ secrets.MODEL_BUCKET }}/|s3://(MODEL_BUCKET)/|' + ) + echo "dryrun_output<<EOF" >> $GITHUB_OUTPUT + echo "${dryrun_output@E}" >> $GITHUB_OUTPUT + echo "EOF" >> $GITHUB_OUTPUT + echo "${dryrun_output@E}" + outputs: + license_line: ${{ steps.license_verification.outputs.license_line }} + license_info: ${{ steps.license_verification.outputs.license_info }} + model_description_info: ${{ steps.model_description_info.outputs.model_description_info }} + dryrun_output: ${{ steps.dryrun_model_uploading.outputs.dryrun_output }} + + # Step 5: Ask for manual approval from the CODEOWNERS + manual-approval: + needs: [init-workflow-var, model-auto-tracing] + runs-on: 'ubuntu-latest' + permissions: + issues: write + steps: + - name: Checkout Repository + uses: actions/checkout@v3 + - name: Get Approvers + id: get_approvers + run: | + echo "approvers=$(cat .github/CODEOWNERS | grep @ | tr -d '* ' | sed 's/@/,/g' | sed 's/,//1')" >> $GITHUB_OUTPUT + - name: Create Issue Body + id: create_issue_body + run: | + issue_body="Please approve or deny opensearch-py-ml model uploading: + + ${{ needs.model-auto-tracing.outputs.license_line }} + + ${{ needs.init-workflow-var.outputs.workflow_info }} + ${{ needs.model-auto-tracing.outputs.model_description_info }} + + ===== Dry Run of Model Uploading ===== + ${{ needs.model-auto-tracing.outputs.dryrun_output }}" + + echo "issue_body<<EOF" >> $GITHUB_OUTPUT + echo "${issue_body@E}" >> $GITHUB_OUTPUT + echo "EOF" >> $GITHUB_OUTPUT + echo "${issue_body@E}" + - uses: trstringer/manual-approval@v1 + with: + secret: ${{ github.TOKEN }} + approvers: ${{ steps.get_approvers.outputs.approvers }} +
minimum-approvals: 1 + issue-title: "Upload Model to OpenSearch Model Hub (${{ github.event.inputs.model_id }})" + issue-body: ${{ steps.create_issue_body.outputs.issue_body }} + exclude-workflow-initiator-as-approver: false + + # Step 6: Download the artifacts & Upload it to the S3 bucket + model-uploading: + needs: [init-workflow-var, manual-approval] + runs-on: 'ubuntu-latest' + permissions: + id-token: write + contents: read + environment: opensearch-py-ml-cicd-env + steps: + - name: Download Artifact + uses: actions/download-artifact@v2 + with: + name: upload + path: ./upload/ + - name: Configure AWS Credentials + uses: aws-actions/configure-aws-credentials@v2 + with: + aws-region: ${{ secrets.MODEL_UPLOADER_AWS_REGION }} + role-to-assume: ${{ secrets.MODEL_UPLOADER_ROLE }} + role-session-name: model-uploading + - name: Copy Files to the Bucket + id: copying_to_bucket + run: | + aws s3 sync ./upload/ s3://${{ secrets.MODEL_BUCKET }}/${{ needs.init-workflow-var.outputs.sentence_transformer_folder }} + echo "upload_time=$(TZ='America/Los_Angeles' date "+%Y-%m-%d %T")" >> $GITHUB_OUTPUT + outputs: + upload_time: ${{ steps.copying_to_bucket.outputs.upload_time }} + + # Step 7: Update MODEL_UPLOAD_HISTORY.md & supported_models.json + history-update: + needs: [init-workflow-var, model-auto-tracing, model-uploading] + runs-on: 'ubuntu-latest' + permissions: + id-token: write + contents: write + pull-requests: write + env: + model_info: ${{ github.event.inputs.model_id }} (v.${{ github.event.inputs.model_version }})(${{ github.event.inputs.tracing_format }}) + steps: + - name: Checkout Repository + uses: actions/checkout@v3 + - name: Set Up Python + uses: actions/setup-python@v2 + with: + python-version: '3.x' + - name: Install Packages + run: + python -m pip install mdutils + - name: Update Model Upload History + run: | + model_description="${{ github.event.inputs.model_description }}" + python utils/model_uploader/update_models_upload_history_md.py \ + ${{ 
github.event.inputs.model_id }} \ + ${{ github.event.inputs.model_version }} \ + ${{ github.event.inputs.tracing_format }} \ + -ed ${{ github.event.inputs.embedding_dimension }} \ + -pm ${{ github.event.inputs.pooling_mode }} \ + -id ${{ github.run_id }} -u ${{ github.actor }} \ + -t "${{ needs.model-uploading.outputs.upload_time }}" + - name: Create PR Body + id: create_pr_body + run: | + pr_body=" + - [ ] This PR made commit to only these three files: MODEL_UPLOAD_HISTORY.md, supported_models.json, and CHANGELOG.md. + - [ ] CHANGELOG.md has been updated by the workflow or by you if the workflow fails to do so. + - [ ] Merge conflicts have been resolved. + + ${{ needs.init-workflow-var.outputs.workflow_info }} + ${{ needs.model-auto-tracing.outputs.license_info }} + ${{ needs.model-auto-tracing.outputs.model_description_info }}" + + echo "pr_body<<EOF" >> $GITHUB_OUTPUT + echo "${pr_body@E}" >> $GITHUB_OUTPUT + echo "EOF" >> $GITHUB_OUTPUT + echo "${pr_body@E}" + - name: Create a Branch & Raise a PR + uses: peter-evans/create-pull-request@v5 + id: create_pr + with: + committer: github-actions[bot] + commit-message: 'GitHub Actions Workflow: Update Model Upload History - ${{ env.model_info }}' + signoff: true + title: 'Update Model Upload History - ${{ env.model_info }}' + body: ${{ steps.create_pr_body.outputs.pr_body }} + labels: ModelUploading + branch: model-uploader/${{ github.run_id }} + delete-branch: true + add-paths: | + ./utils/model_uploader/upload_history/MODEL_UPLOAD_HISTORY.md + ./utils/model_uploader/upload_history/supported_models.json + - name: Checkout Repository + uses: actions/checkout@v3 + with: + ref: model-uploader/${{ github.run_id }} + - name: Create a line for updating CHANGELOG.md + id: create_changelog_line + continue-on-error: true + run: | + pr_ref="([#${{ steps.create_pr.outputs.pull-request-number }}](${{ steps.create_pr.outputs.pull-request-url }}))" + changelog_line="Update model upload history - ${{ env.model_info }} by @${{ github.actor
}} $pr_ref" + echo "changelog_line=$changelog_line" >> $GITHUB_OUTPUT + - name: Warning Comment on PR if create_changelog_line fails + if: steps.create_changelog_line.outcome == 'failure' + uses: thollander/actions-comment-pull-request@v2 + with: + pr_number: ${{ steps.create_pr.outputs.pull-request-number }} + message: | + Warning:exclamation:: The workflow failed to update CHANGELOG.md. Please update CHANGELOG.md manually. + - name: Update CHANGELOG.md + if: steps.create_changelog_line.outcome == 'success' + id: update_changelog + continue-on-error: true + run: | + python utils/model_uploader/update_changelog_md.py "${{ steps.create_changelog_line.outputs.changelog_line }}" + - name: Commit Updates + if: steps.create_changelog_line.outcome == 'success' && steps.update_changelog.outcome == 'success' + uses: stefanzweifel/git-auto-commit-action@v4 + id: commit + with: + branch: model-uploader/${{ github.run_id }} + commit_user_email: "github-actions[bot]@users.noreply.github.com" + commit_message: 'GitHub Actions Workflow: Update CHANGELOG.md - ${{ env.model_info }}' + commit_options: '--signoff' + file_pattern: CHANGELOG.md + - name: Warning Comment on PR if update_changelog fails + if: steps.create_changelog_line.outcome == 'success' && steps.update_changelog.outcome == 'failure' + uses: thollander/actions-comment-pull-request@v2 + with: + pr_number: ${{ steps.create_pr.outputs.pull-request-number }} + message: | + Warning:exclamation:: The workflow failed to update CHANGELOG.md. Please add the following line manually. 
+ >>> + ${{ steps.create_changelog_line.outputs.changelog_line }} + + # Step 8: Trigger Jenkins ml-models workflow + trigger-ml-models-release-workflow: + needs: [init-workflow-var, history-update] + runs-on: 'ubuntu-latest' + permissions: + contents: read + steps: + - name: Checkout Repository + uses: actions/checkout@v3 + - name: Trigger Jenkins Workflow with Generic Webhook + run: | + jenkins_trigger_token=${{ secrets.JENKINS_ML_MODELS_RELEASE_GENERIC_WEBHOOK_TOKEN }} + base_download_path=${{ needs.init-workflow-var.outputs.model_folder }} + version=${{ github.event.inputs.model_version }} + format=${{ github.event.inputs.tracing_format }} + jenkins_params="{\"BASE_DOWNLOAD_PATH\":\"$base_download_path\", \"VERSION\":\"$version\", \"FORMAT\":\"$format\"}" + sh utils/model_uploader/trigger_ml_models_release.sh $jenkins_trigger_token "$jenkins_params" diff --git a/CHANGELOG.md b/CHANGELOG.md index d5c1db52..d1570d86 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,9 +4,11 @@ Inspired from [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) ## [1.2.0] ### Added +- Add workflows and scripts for automating model tracing and uploading process by @thanawan-atc in ([#209](https://github.com/opensearch-project/opensearch-py-ml/pull/209)) ### Changed + ### Fixed - Enable make_model_config_json to add model description to model config file by @thanawan-atc in ([#203](https://github.com/opensearch-project/opensearch-py-ml/pull/203)) - Correct demo_ml_commons_integration.ipynb by @thanawan-atc in ([#208](https://github.com/opensearch-project/opensearch-py-ml/pull/208)) diff --git a/noxfile.py b/noxfile.py index 448c3990..03809192 100644 --- a/noxfile.py +++ b/noxfile.py @@ -61,7 +61,7 @@ @nox.session(reuse_venv=True) def format(session): session.install("black", "isort", "flynt") - session.run("python", "utils/license-headers.py", "fix", *SOURCE_FILES) + session.run("python", "utils/lint/license-headers.py", "fix", *SOURCE_FILES) 
session.run("flynt", *SOURCE_FILES) session.run("black", "--target-version=py38", *SOURCE_FILES) session.run("isort", "--profile=black", *SOURCE_FILES) @@ -73,7 +73,7 @@ def lint(session): # Install numpy to use its mypy plugin # https://numpy.org/devdocs/reference/typing.html#mypy-plugin session.install("black", "flake8", "mypy", "isort", "numpy") - session.run("python", "utils/license-headers.py", "check", *SOURCE_FILES) + session.run("python", "utils/lint/license-headers.py", "check", *SOURCE_FILES) session.run("black", "--check", "--target-version=py38", *SOURCE_FILES) session.run("isort", "--check", "--profile=black", *SOURCE_FILES) session.run("flake8", "--ignore=E501,W503,E402,E712,E203", *SOURCE_FILES) @@ -142,10 +142,29 @@ def test(session, pandas_version: str): @nox.session(reuse_venv=True) def docs(session): # Run this so users get an error if they don't have Pandoc installed. - session.install("-r", "docs/requirements-docs.txt") session.install(".") session.cd("docs") session.run("make", "clean", external=True) session.run("make", "html", external=True) + + +# While nox is typically used for automating testing, in this case, we utilize it +# to automate the action workflow, leveraging its ability to set up the environment +# required for model autotracing. 
+@nox.session +def trace(session): + session.install( + "-r", + "requirements-dev.txt", + "--timeout", + "1500", + ) + session.install(".") + + session.run( + "python", + "utils/model_uploader/model_autotracing.py", + *(session.posargs), + ) diff --git a/opensearch_py_ml/ml_models/sentencetransformermodel.py b/opensearch_py_ml/ml_models/sentencetransformermodel.py index 85b9e885..3fbb334d 100644 --- a/opensearch_py_ml/ml_models/sentencetransformermodel.py +++ b/opensearch_py_ml/ml_models/sentencetransformermodel.py @@ -1050,11 +1050,11 @@ def _get_model_description_from_readme_file(self, readme_file_path) -> str: readme_data = MarkDownFile.read_file(readme_file_path) # Find the description section - start_str = f"# {self.model_id}" + start_str = f"\n# {self.model_id}" start = readme_data.find(start_str) if start == -1: model_name = self.model_id.split("/")[1] - start_str = f"# {model_name}" + start_str = f"\n# {model_name}" start = readme_data.find(start_str) end = readme_data.find("\n#", start + len(start_str)) diff --git a/utils/license-headers.py b/utils/lint/license-headers.py similarity index 100% rename from utils/license-headers.py rename to utils/lint/license-headers.py diff --git a/utils/model_uploader/model_autotracing.py b/utils/model_uploader/model_autotracing.py new file mode 100644 index 00000000..299daf5b --- /dev/null +++ b/utils/model_uploader/model_autotracing.py @@ -0,0 +1,599 @@ +# SPDX-License-Identifier: Apache-2.0 +# The OpenSearch Contributors require contributions made to +# this file be licensed under the Apache-2.0 license or a +# compatible open source license. +# Any modifications Copyright OpenSearch Contributors. See +# GitHub history for details. + +# This program is run by "Model Auto-tracing & Uploading" workflow +# (See model_uploader.yml) to perform model auto-tracing and prepare +# files for uploading to OpenSearch model hub. 
import argparse
import json
import os
import shutil
import sys
import warnings
from typing import List, Optional, Tuple
from zipfile import ZipFile

import numpy as np
from mdutils.fileutils import MarkDownFile
from numpy.typing import DTypeLike
from sentence_transformers import SentenceTransformer

# We need to append ROOT_DIR path so that we can import
# OPENSEARCH_TEST_CLIENT and opensearch_py_ml since this
# python script is not in the root directory.
THIS_DIR = os.path.dirname(__file__)
ROOT_DIR = os.path.join(THIS_DIR, "../..")
sys.path.append(ROOT_DIR)

LICENSE_PATH = "LICENSE"
from opensearch_py_ml.ml_commons import MLCommonClient
from opensearch_py_ml.ml_models.sentencetransformermodel import SentenceTransformerModel
from tests import OPENSEARCH_TEST_CLIENT

BOTH_FORMAT = "BOTH"
TORCH_SCRIPT_FORMAT = "TORCH_SCRIPT"
ONNX_FORMAT = "ONNX"

TEMP_MODEL_PATH = "temp_model_path"
TORCHSCRIPT_FOLDER_PATH = "model-torchscript/"
ONNX_FOLDER_PATH = "model-onnx/"
UPLOAD_FOLDER_PATH = "upload/"
MODEL_CONFIG_FILE_NAME = "ml-commons_model_config.json"
OUTPUT_DIR = "trace_output/"
LICENSE_VAR_FILE = "apache_verified.txt"
DESCRIPTION_VAR_FILE = "description.txt"
TEST_SENTENCES = [
    "First test sentence",
    "This is a very long sentence used for testing model embedding outputs.",
]
RTOL_TEST = 1e-03
ATOL_TEST = 1e-05
ML_BASE_URI = "/_plugins/_ml"


def verify_license_in_md_file() -> bool:
    """
    Verify that the model is licensed under Apache 2.0
    by looking at metadata in README.md file of the model

    TODO: Support other open source licenses in future

    :return: Whether the model is licensed under Apache 2.0
    :rtype: Bool
    """
    try:
        readme_data = MarkDownFile.read_file(TEMP_MODEL_PATH + "/" + "README.md")
    except Exception as e:
        print(f"Cannot verify the license: {e}")
        return False

    # HuggingFace model cards carry their metadata as YAML front matter
    # delimited by a pair of "---" markers at the top of README.md.
    start = readme_data.find("---")
    end = readme_data.find("---", start + 3)
    if start == -1 or end == -1:
        return False

    metadata_info = readme_data[start + 3 : end]
    if "apache-2.0" in metadata_info.lower():
        # Fix: include the "/" separator so the printed path matches the
        # file that was actually read (was TEMP_MODEL_PATH + "README.md").
        print("\nFound apache-2.0 license at " + TEMP_MODEL_PATH + "/README.md")
        return True
    else:
        print(
            "\nDid not find apache-2.0 license at " + TEMP_MODEL_PATH + "/README.md"
        )
        return False


def trace_sentence_transformer_model(
    model_id: str,
    model_version: str,
    model_format: str,
    embedding_dimension: Optional[int] = None,
    pooling_mode: Optional[str] = None,
    model_description: Optional[str] = None,
) -> Tuple[str, str]:
    """
    Trace the pretrained sentence transformer model, create a model config file,
    and return a path to the model file and a path to the model config file required for model registration

    :param model_id: Model ID of the pretrained model
    :type model_id: string
    :param model_version: Version of the pretrained model for registration
    :type model_version: string
    :param model_format: Model format ("TORCH_SCRIPT" or "ONNX")
    :type model_format: string
    :param embedding_dimension: Embedding dimension input
    :type embedding_dimension: int
    :param pooling_mode: Pooling mode input ("CLS", "MEAN", "MAX", "MEAN_SQRT_LEN" or None)
    :type pooling_mode: string
    :param model_description: Model description input
    :type model_description: string
    :return: Tuple of model_path (path to model zip file) and model_config_path (path to model config json file)
    :rtype: Tuple[str, str]
    """
    folder_path = (
        TORCHSCRIPT_FOLDER_PATH
        if model_format == TORCH_SCRIPT_FORMAT
        else ONNX_FOLDER_PATH
    )

    # 1.) Initiate a sentence transformer model class object
    pre_trained_model = None
    try:
        pre_trained_model = SentenceTransformerModel(
            model_id=model_id, folder_path=folder_path, overwrite=True
        )
    except Exception as e:
        assert (
            False
        ), f"Raised Exception in tracing {model_format} model\
 during initiating a sentence transformer model class object: {e}"

    # 2.) Save the model in the specified format
    model_path = None
    try:
        if model_format == TORCH_SCRIPT_FORMAT:
            model_path = pre_trained_model.save_as_pt(
                model_id=model_id, sentences=TEST_SENTENCES
            )
        else:
            model_path = pre_trained_model.save_as_onnx(model_id=model_id)
    except Exception as e:
        assert False, f"Raised Exception during saving model as {model_format}: {e}"

    # 3.) Create a model config json file
    try:
        pre_trained_model.make_model_config_json(
            version_number=model_version,
            model_format=model_format,
            embedding_dimension=embedding_dimension,
            pooling_mode=pooling_mode,
            description=model_description,
        )
    except Exception as e:
        assert (
            False
        ), f"Raised Exception during making model config file for {model_format} model: {e}"

    # 4.) Return model_path & model_config_path for model registration
    model_config_path = folder_path + MODEL_CONFIG_FILE_NAME

    return model_path, model_config_path


def register_and_deploy_sentence_transformer_model(
    ml_client: "MLCommonClient",
    model_path: str,
    model_config_path: str,
    model_format: str,
) -> List["DTypeLike"]:
    """
    Register the pretrained sentence transformer model by using the model file and the model config file,
    deploy the model to generate embeddings for the TEST_SENTENCES,
    and return the embeddings for model verification

    :param ml_client: A client that communicates to the ml-common plugin for OpenSearch
    :type ml_client: MLCommonClient
    :param model_path: Path to model file
    :type model_path: string
    :param model_config_path: Path to model config file
    :type model_config_path: string
    :param model_format: Model format ("TORCH_SCRIPT" or "ONNX")
    :type model_format: string
    :return: List of embedding data for TEST_SENTENCES
    :rtype: List["DTypeLike"]
    """
    embedding_data = None

    # 1.) Register & Deploy the model
    model_id = ""
    try:
        model_id = ml_client.register_model(
            model_path=model_path,
            model_config_path=model_config_path,
            deploy_model=True,
            isVerbose=True,
        )
        print(f"\n{model_format}_model_id:", model_id)
        # Fix: the original `model_id != "" or model_id is not None` was a
        # tautology (always True for model_id == ""); both conditions must hold.
        assert model_id != "" and model_id is not None
    except Exception as e:
        assert (
            False
        ), f"Raised Exception in {model_format} model registration/deployment: {e}"

    # 2.) Check model status
    try:
        ml_model_status = ml_client.get_model_info(model_id)
        print("\nModel Status:")
        print(ml_model_status)
        assert ml_model_status.get("model_state") == "DEPLOYED"
        assert ml_model_status.get("model_format") == model_format
        assert ml_model_status.get("algorithm") == "TEXT_EMBEDDING"
    except Exception as e:
        assert False, f"Raised Exception in getting {model_format} model info: {e}"

    # 3.) Generate embeddings
    try:
        embedding_output = ml_client.generate_embedding(model_id, TEST_SENTENCES)
        assert len(embedding_output.get("inference_results")) == 2
        embedding_data = [
            embedding_output["inference_results"][i]["output"][0]["data"]
            for i in range(len(TEST_SENTENCES))
        ]
    except Exception as e:
        assert (
            False
        ), f"Raised Exception in generating sentence embedding with {model_format} model: {e}"

    # 4.) Undeploy the model
    try:
        ml_client.undeploy_model(model_id)
        ml_model_status = ml_client.get_model_info(model_id)
        assert ml_model_status.get("model_state") == "UNDEPLOYED"
    except Exception as e:
        assert False, f"Raised Exception in {model_format} model undeployment: {e}"

    # 5.) Delete the model
    try:
        delete_model_obj = ml_client.delete_model(model_id)
        assert delete_model_obj.get("result") == "deleted"
    except Exception as e:
        assert False, f"Raised Exception in deleting {model_format} model: {e}"

    # 6.) Return embedding outputs for model verification
    return embedding_data


def verify_embedding_data(
    original_embedding_data: List["DTypeLike"],
    tracing_embedding_data: List["DTypeLike"],
) -> bool:
    """
    Verify the embeddings generated by the traced model with those of original model

    :param original_embedding_data: Embedding outputs of TEST_SENTENCES generated by the original model
    :type original_embedding_data: List['DTypeLike']
    :param tracing_embedding_data: Embedding outputs of TEST_SENTENCES generated by the traced model
    :type tracing_embedding_data: List['DTypeLike']
    :return: Whether the embeddings generated by the traced model match with those of original model
    :rtype: bool
    """
    failed_cases = []
    for i in range(len(TEST_SENTENCES)):
        try:
            # Element-wise comparison within RTOL_TEST/ATOL_TEST tolerances;
            # raises AssertionError on mismatch.
            np.testing.assert_allclose(
                original_embedding_data[i],
                tracing_embedding_data[i],
                rtol=RTOL_TEST,
                atol=ATOL_TEST,
            )
        except Exception as e:
            failed_cases.append((TEST_SENTENCES[i], e))

    if len(failed_cases):
        print(
            "\nOriginal embeddings DOES NOT matches the embeddings in the following case(s):"
        )
        for sentence, e in failed_cases:
            print(sentence)
            print(e)
        return False
    else:
        return True


def prepare_files_for_uploading(
    model_id: str,
    model_version: str,
    model_format: str,
    src_model_path: str,
    src_model_config_path: str,
) -> Tuple[str, str]:
    # Fix: return annotation was `-> None`, but the function returns the
    # destination paths (and the docstring already documented a tuple).
    """
    Prepare files for uploading by storing them in UPLOAD_FOLDER_PATH

    :param model_id: Model ID of the pretrained model
    :type model_id: string
    :param model_version: Version of the pretrained model for registration
    :type model_version: string
    :param model_format: Model format ("TORCH_SCRIPT" or "ONNX")
    :type model_format: string
    :param src_model_path: Path to model files for uploading
    :type src_model_path: string
    :param src_model_config_path: Path to model config files for uploading
    :type src_model_config_path: string
    :return: Tuple of dst_model_path (path to model zip file) and dst_model_config_path
        (path to model config json file) in the UPLOAD_FOLDER_PATH
    :rtype: Tuple[str, str]
    """
    model_type, model_name = model_id.split("/")
    model_format = model_format.lower()
    folder_to_delete = (
        TORCHSCRIPT_FOLDER_PATH if model_format == "torch_script" else ONNX_FOLDER_PATH
    )

    # Store to be uploaded files in UPLOAD_FOLDER_PATH
    try:
        dst_model_dir = (
            f"{UPLOAD_FOLDER_PATH}{model_name}/{model_version}/{model_format}"
        )
        os.makedirs(dst_model_dir, exist_ok=True)
        dst_model_filename = (
            f"{model_type}_{model_name}-{model_version}-{model_format}.zip"
        )
        dst_model_path = dst_model_dir + "/" + dst_model_filename
        # Bundle the repository LICENSE file into the model zip before copying.
        with ZipFile(src_model_path, "a") as zipObj:
            zipObj.write(filename=LICENSE_PATH, arcname="LICENSE")
        shutil.copy(src_model_path, dst_model_path)
        print(f"\nCopied {src_model_path} to {dst_model_path}")

        dst_model_config_dir = (
            f"{UPLOAD_FOLDER_PATH}{model_name}/{model_version}/{model_format}"
        )
        os.makedirs(dst_model_config_dir, exist_ok=True)
        dst_model_config_filename = "config.json"
        dst_model_config_path = dst_model_config_dir + "/" + dst_model_config_filename
        shutil.copy(src_model_config_path, dst_model_config_path)
        print(f"Copied {src_model_config_path} to {dst_model_config_path}")
    except Exception as e:
        assert (
            False
        ), f"Raised Exception during preparing {model_format} files for uploading: {e}"

    # Delete model folder downloaded from HuggingFace during model tracing
    try:
        shutil.rmtree(folder_to_delete)
    except Exception as e:
        assert False, f"Raised Exception while deleting {folder_to_delete}: {e}"

    return dst_model_path, dst_model_config_path


def store_license_verified_variable(license_verified: bool) -> None:
    """
    Store whether the model is licensed under Apache 2.0 in OUTPUT_DIR/LICENSE_VAR_FILE
    to be used to generate issue body for manual approval

    :param license_verified: Whether the model is licensed under Apache 2.0
    :type license_verified: bool
    :return: No return value expected
    :rtype: None
    """
    # Fix: compute the path before the try block so the except clause can
    # always reference it (a failure in makedirs previously raised NameError).
    license_var_filepath = OUTPUT_DIR + "/" + LICENSE_VAR_FILE
    try:
        os.makedirs(OUTPUT_DIR, exist_ok=True)
        with open(license_var_filepath, "w") as f:
            f.write(str(license_verified))
    except Exception as e:
        print(
            f"Cannot store license_verified ({license_verified}) in {license_var_filepath}: {e}"
        )


def store_description_variable(config_path_for_checking_description: str) -> None:
    """
    Store model description in OUTPUT_DIR/DESCRIPTION_VAR_FILE
    to be used to generate issue body for manual approval

    :param config_path_for_checking_description: Path to config json file
    :type config_path_for_checking_description: str
    :return: No return value expected
    :rtype: None
    """
    # Fix: pre-bind the names used in the except clause; previously a failure
    # before `description` was assigned raised NameError instead of the
    # intended diagnostic message.
    description_var_filepath = OUTPUT_DIR + "/" + DESCRIPTION_VAR_FILE
    description = "-"
    try:
        os.makedirs(OUTPUT_DIR, exist_ok=True)
        with open(config_path_for_checking_description, "r") as f:
            config_dict = json.load(f)
        description = (
            config_dict["description"] if "description" in config_dict else "-"
        )
        print(f"Storing the following description at {description_var_filepath}")
        print(description)
        with open(description_var_filepath, "w") as f:
            f.write(description)
    except Exception as e:
        print(
            f"Cannot store description ({description}) in {description_var_filepath}: {e}"
        )


def main(
    model_id: str,
    model_version: str,
    tracing_format: str,
    embedding_dimension: Optional[int] = None,
    pooling_mode: Optional[str] = None,
    model_description: Optional[str] = None,
) -> None:
    """
    Perform model auto-tracing and prepare files for uploading to OpenSearch model hub

    :param model_id: Model ID of the pretrained model
    :type model_id: string
    :param model_version: Version of the pretrained model for registration
    :type model_version: string
    :param tracing_format: Tracing format ("TORCH_SCRIPT", "ONNX", or "BOTH")
    :type tracing_format: string
    :param embedding_dimension: Embedding dimension input
    :type embedding_dimension: int
    :param pooling_mode: Pooling mode input ("CLS", "MEAN", "MAX", "MEAN_SQRT_LEN" or None)
    :type pooling_mode: string
    :param model_description: Model description input
    :type model_description: string
    :return: No return value expected
    :rtype: None
    """

    print("\n=== Begin running model_autotracing.py ===")
    print("Model ID: ", model_id)
    print("Model Version: ", model_version)
    print("Tracing Format: ", tracing_format)
    print(
        "Embedding Dimension: ",
        embedding_dimension if embedding_dimension is not None else "N/A",
    )
    print("Pooling Mode: ", pooling_mode if pooling_mode is not None else "N/A")
    print(
        "Model Description: ",
        model_description if model_description is not None else "N/A",
    )
    print("==========================================")

    ml_client = MLCommonClient(OPENSEARCH_TEST_CLIENT)

    # Reference embeddings from the original (untraced) model; the traced
    # model's output must match these within RTOL_TEST/ATOL_TEST.
    pre_trained_model = SentenceTransformer(model_id)
    original_embedding_data = list(
        pre_trained_model.encode(TEST_SENTENCES, convert_to_numpy=True)
    )

    # Save the model locally only to inspect its README.md for the license,
    # then clean up immediately.
    pre_trained_model.save(path=TEMP_MODEL_PATH)
    license_verified = verify_license_in_md_file()
    try:
        shutil.rmtree(TEMP_MODEL_PATH)
    except Exception as e:
        assert False, f"Raised Exception while deleting {TEMP_MODEL_PATH}: {e}"

    if tracing_format in [TORCH_SCRIPT_FORMAT, BOTH_FORMAT]:
        print("--- Begin tracing a model in TORCH_SCRIPT ---")
        (
            torchscript_model_path,
            torchscript_model_config_path,
        ) = trace_sentence_transformer_model(
            model_id,
            model_version,
            TORCH_SCRIPT_FORMAT,
            embedding_dimension,
            pooling_mode,
            model_description,
        )

        torchscript_embedding_data = register_and_deploy_sentence_transformer_model(
            ml_client,
            torchscript_model_path,
            torchscript_model_config_path,
            TORCH_SCRIPT_FORMAT,
        )
        pass_test = verify_embedding_data(
            original_embedding_data, torchscript_embedding_data
        )
        assert (
            pass_test
        ), f"Failed while verifying embeddings of {model_id} model in TORCH_SCRIPT format"

        (
            torchscript_dst_model_path,
            torchscript_dst_model_config_path,
        ) = prepare_files_for_uploading(
            model_id,
            model_version,
            TORCH_SCRIPT_FORMAT,
            torchscript_model_path,
            torchscript_model_config_path,
        )

        config_path_for_checking_description = torchscript_dst_model_config_path
        print("--- Finished tracing a model in TORCH_SCRIPT ---")

    if tracing_format in [ONNX_FORMAT, BOTH_FORMAT]:
        print("--- Begin tracing a model in ONNX ---")
        (
            onnx_model_path,
            onnx_model_config_path,
        ) = trace_sentence_transformer_model(
            model_id,
            model_version,
            ONNX_FORMAT,
            embedding_dimension,
            pooling_mode,
            model_description,
        )

        onnx_embedding_data = register_and_deploy_sentence_transformer_model(
            ml_client, onnx_model_path, onnx_model_config_path, ONNX_FORMAT
        )

        pass_test = verify_embedding_data(original_embedding_data, onnx_embedding_data)
        assert (
            pass_test
        ), f"Failed while verifying embeddings of {model_id} model in ONNX format"

        onnx_dst_model_path, onnx_dst_model_config_path = prepare_files_for_uploading(
            model_id,
            model_version,
            ONNX_FORMAT,
            onnx_model_path,
            onnx_model_config_path,
        )

        config_path_for_checking_description = onnx_dst_model_config_path
        print("--- Finished tracing a model in ONNX ---")

    store_license_verified_variable(license_verified)
    store_description_variable(config_path_for_checking_description)

    print("\n=== Finished running model_autotracing.py ===")


if __name__ == "__main__":
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    warnings.filterwarnings("ignore", category=FutureWarning)
    warnings.filterwarnings("ignore", message="Unverified HTTPS request")
    warnings.filterwarnings("ignore", message="TracerWarning: torch.tensor")
    warnings.filterwarnings(
        "ignore", message="using SSL with verify_certs=False is insecure."
    )

    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "model_id",
        type=str,
        help="Model ID for auto-tracing and uploading (e.g. sentence-transformers/msmarco-distilbert-base-tas-b)",
    )
    parser.add_argument(
        "model_version", type=str, help="Model version number (e.g. 1.0.1)"
    )
    parser.add_argument(
        "tracing_format",
        choices=["BOTH", "TORCH_SCRIPT", "ONNX"],
        help="Model format for auto-tracing",
    )
    parser.add_argument(
        "-ed",
        "--embedding_dimension",
        type=int,
        nargs="?",
        default=None,
        const=None,
        help="Embedding dimension of the model to use if it does not exist in original config.json",
    )
    parser.add_argument(
        "-pm",
        "--pooling_mode",
        type=str,
        nargs="?",
        default=None,
        const=None,
        choices=["CLS", "MEAN", "MAX", "MEAN_SQRT_LEN"],
        help="Pooling mode if it does not exist in original config.json",
    )
    parser.add_argument(
        "-md",
        "--model_description",
        type=str,
        nargs="?",
        default=None,
        const=None,
        help="Model description if you want to overwrite the default description",
    )
    args = parser.parse_args()

    main(
        args.model_id,
        args.model_version,
        args.tracing_format,
        args.embedding_dimension,
        args.pooling_mode,
        args.model_description,
    )


# ===========================================================================
# Patch boundary (was: diff --git a/utils/model_uploader/save_model_file_path_to_env.py
# b/utils/model_uploader/save_model_file_path_to_env.py, new file mode 100644,
# index 00000000..537d5127)
# ===========================================================================

# SPDX-License-Identifier: Apache-2.0
# The OpenSearch Contributors require contributions made to
# this file be licensed under the Apache-2.0 license or a
# compatible open source license.
# Any modifications Copyright OpenSearch Contributors. See
# GitHub history for details.

# This program is run by "Model Auto-tracing & Uploading" workflow
# (See model_uploader.yml) to verify if the model already exists in
# model hub before continuing the workflow.
import argparse
import re

# Accepts 1–4 dot-separated numeric components without leading zeros
# (e.g. "1", "1.0", "1.0.1", "1.0.1.0").
VERSION_PATTERN = r"^([1-9]\d*|0)(\.(([1-9]\d*)|0)){0,3}$"


def verify_inputs(model_id: str, model_version: str) -> None:
    """
    Verify the format of model_id and model_version

    :param model_id: Model ID of the pretrained model
    :type model_id: string
    :param model_version: Version of the pretrained model for registration
    :type model_version: string
    :return: No return value expected
    :rtype: None
    :raises ValueError: If model_id or model_version has an invalid format
    """
    # Fix: raise instead of assert so validation is not stripped when the
    # script runs under `python -O`.
    if model_id.count("/") != 1:
        raise ValueError(f"Invalid Model ID: {model_id}")
    if re.fullmatch(VERSION_PATTERN, model_version) is None:
        raise ValueError(f"Invalid Model Version: {model_version}")


def get_model_file_path(
    model_folder: str, model_id: str, model_version: str, model_format: str
) -> str:
    """
    Construct the expected model file path on model hub

    :param model_folder: Model folder for uploading
    :type model_folder: string
    :param model_id: Model ID of the pretrained model
    :type model_id: string
    :param model_version: Version of the pretrained model for registration
    :type model_version: string
    :param model_format: Model format ("TORCH_SCRIPT" or "ONNX")
    :type model_format: string
    :return: Expected model file path on model hub
    :rtype: string
    """
    # model_id is "<type>/<name>" (validated by verify_inputs); the hub layout
    # is <folder><name>/<version>/<format>/<type>_<name>-<version>-<format>.zip
    model_type, model_name = model_id.split("/")
    model_format = model_format.lower()
    model_dirname = f"{model_folder}{model_name}/{model_version}/{model_format}"
    model_filename = f"{model_type}_{model_name}-{model_version}-{model_format}.zip"
    model_file_path = model_dirname + "/" + model_filename
    return model_file_path


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "model_folder",
        type=str,
        help="Model folder for uploading (e.g. ml-models/huggingface/sentence-transformers/)",
    )
    parser.add_argument(
        "model_id",
        type=str,
        help="Model ID for auto-tracing and uploading (e.g. sentence-transformers/msmarco-distilbert-base-tas-b)",
    )
    parser.add_argument(
        "model_version", type=str, help="Model version number (e.g. 1.0.1)"
    )
    parser.add_argument(
        "model_format",
        choices=["TORCH_SCRIPT", "ONNX"],
        help="Model format for auto-tracing",
    )

    args = parser.parse_args()
    verify_inputs(args.model_id, args.model_version)
    model_file_path = get_model_file_path(
        args.model_folder, args.model_id, args.model_version, args.model_format
    )

    # Print the model file path so that the workflow can store it in the variable (See model_uploader.yml)
    print(model_file_path)


# ===========================================================================
# Patch boundary (was: diff --git a/utils/model_uploader/update_changelog_md.py
# b/utils/model_uploader/update_changelog_md.py, new file mode 100644,
# index 00000000..0b1cbba6)
# ===========================================================================

# SPDX-License-Identifier: Apache-2.0
# The OpenSearch Contributors require contributions made to
# this file be licensed under the Apache-2.0 license or a
# compatible open source license.
# Any modifications Copyright OpenSearch Contributors. See
# GitHub history for details.

# This program is run by "Model Auto-tracing & Uploading"
# & "Model Listing Uploading" workflow (See model_uploader.yml
# & model_listing_uploader.yml) to update CHANGELOG.md after
# uploading the model to our model hub.

import argparse

from mdutils.fileutils import MarkDownFile

CHANGELOG_DIRNAME = "."
CHANGELOG_FILENAME = "CHANGELOG.md"
SUBSECTION_NAME = "Changed"
PREV_SUBSECTION_NAME = "Added"


def update_changelog_file(
    changelog_line: str,
) -> None:
    """
    Add changelog_line to the most recent version section of CHANGELOG.md,
    creating the "Changed" sub-section if it does not exist yet.

    Note on the pointer idiom used throughout: `str.find` returns -1 when the
    needle is missing, so `find(...) + 1` yields 0 as the "not found" sentinel
    and a 1-shifted index otherwise. The shifted indices still work as slice
    boundaries because every needle starts with "\\n".

    :param changelog_line: Line to be added to CHANGELOG.md
    :type changelog_line: string
    :return: No return value expected
    :rtype: None
    """
    changelog_data = MarkDownFile.read_file(f"{CHANGELOG_DIRNAME}/{CHANGELOG_FILENAME}")

    # Find the most recent version section and pull it out
    this_version_ptr = changelog_data.find("\n## ") + 1
    assert this_version_ptr != 0, "Cannot find a version section in the CHANGELOG.md"
    next_version_ptr = changelog_data.find("\n## ", this_version_ptr + 1) + 1
    if next_version_ptr == 0:
        # No later version section: slice to -1 (drops the file's final char,
        # which is re-appended below via changelog_data[next_version_ptr:]).
        next_version_ptr = -1
    this_version_section = changelog_data[this_version_ptr:next_version_ptr]

    # Find the sub-section SUBSECTION_NAME
    this_subsection_ptr = this_version_section.find(f"\n### {SUBSECTION_NAME}") + 1
    if this_subsection_ptr != 0:
        # Case 1: Section SUBSECTION_NAME exists
        # Append a change_log line to the end of that subsection if it exists
        next_subsection_ptr = (
            this_version_section.find("\n### ", this_subsection_ptr + 1) + 1
        )
        if next_subsection_ptr == 0:
            next_subsection_ptr = -1
        this_subsection = this_version_section[
            this_subsection_ptr:next_subsection_ptr
        ].strip()
        this_subsection += "\n- " + changelog_line + "\n\n"
        new_version_section = (
            this_version_section[:this_subsection_ptr]
            + this_subsection
            + this_version_section[next_subsection_ptr:]
        )
    else:
        # Case 2: Sub-section SUBSECTION_NAME does not exist
        # Create sub-section SUBSECTION_NAME and add a change_log line
        this_subsection = f"### {SUBSECTION_NAME}\n- {changelog_line}\n\n"
        prev_subsection_ptr = (
            this_version_section.find(f"\n### {PREV_SUBSECTION_NAME}") + 1
        )
        if prev_subsection_ptr != 0:
            # Case 2.1: Sub-section PREV_SUBSECTION_NAME exist
            # Add a sub-section SUBSECTION_NAME after PREV_SUBSECTION_NAME if PREV_SUBSECTION_NAME exists
            next_subsection_ptr = (
                this_version_section.find("\n### ", prev_subsection_ptr + 1) + 1
            )
            prev_subsection = this_version_section[
                prev_subsection_ptr:next_subsection_ptr
            ].strip()
            new_version_section = (
                this_version_section[:prev_subsection_ptr]
                + prev_subsection
                + "\n\n"
                + this_subsection
                + this_version_section[next_subsection_ptr:]
            )
        else:
            # Case 2.2: Sub-section PREV_SUBSECTION_NAME does not exist
            next_subsection_ptr = this_version_section.find("\n### ") + 1
            if next_subsection_ptr != 0:
                # Case 2.2.1: There exists other sub-section in this version section
                # Add a sub-section SECTION_NAME before other sub-sections
                new_version_section = (
                    this_version_section[:next_subsection_ptr]
                    + this_subsection
                    + this_version_section[next_subsection_ptr:]
                )
            else:
                # Case 2.2.2: There isn't any other sub-section in this version section
                # Add a sub-section SECTION_NAME after version headline
                new_version_section = (
                    this_version_section.strip() + "\n\n" + this_subsection
                )

    # Insert new_version_section back to the document
    new_changelog_data = (
        changelog_data[:this_version_ptr]
        + new_version_section
        + changelog_data[next_version_ptr:]
    )

    mdFile = MarkDownFile(CHANGELOG_FILENAME, dirname=CHANGELOG_DIRNAME)
    mdFile.rewrite_all_file(data=new_changelog_data)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "changelog_line",
        type=str,
        help="Line to be added to CHANGELOG.md",
    )
    args = parser.parse_args()
    update_changelog_file(args.changelog_line)


# ===========================================================================
# Patch boundary (was: diff --git a/utils/model_uploader/update_models_upload_history_md.py
# b/utils/model_uploader/update_models_upload_history_md.py, new file mode 100644,
# index 00000000..c625309a)
# ===========================================================================

# SPDX-License-Identifier: Apache-2.0
# The OpenSearch Contributors require contributions made to
#
# this file be licensed under the Apache-2.0 license or a
# compatible open source license.
# Any modifications Copyright OpenSearch Contributors. See
# GitHub history for details.

# This program is run by "Model Auto-tracing & Uploading" workflow
# (See model_uploader.yml) to update MODEL_UPLOAD_HISTORY.md & supported_models.json
# after uploading the model to our model hub.

import argparse
import json
import os
from typing import Dict, List, Optional

from mdutils.fileutils import MarkDownFile
from mdutils.tools.Table import Table

BOTH_FORMAT = "BOTH"
TORCH_SCRIPT_FORMAT = "TORCH_SCRIPT"
ONNX_FORMAT = "ONNX"

MD_FILENAME = "MODEL_UPLOAD_HISTORY.md"
JSON_FILENAME = "supported_models.json"
DIRNAME = "utils/model_uploader/upload_history"
MODEL_JSON_FILEPATH = os.path.join(DIRNAME, JSON_FILENAME)
# Column order of the markdown table in MODEL_UPLOAD_HISTORY.md.
KEYS = [
    "Upload Time",
    "Model Uploader",
    "Model ID",
    "Model Version",
    "Model Format",
    "Embedding Dimension",
    "Pooling Mode",
    "Workflow Run ID",
]
MD_HEADER = "# Pretrained Model Upload History\n\nThe model-serving framework supports a variety of open-source pretrained models that can assist with a range of machine learning (ML) search and analytics use cases. \n\n\n## Uploaded Pretrained Models\n\n\n### Sentence transformers\n\nSentence transformer models map sentences and paragraphs across a dimensional dense vector space. The number of vectors depends on the model. Use these models for use cases such as clustering and semantic search. \n\nThe following table shows sentence transformer model upload history.\n\n[//]: # (This may be the most platform independent comment)\n"


def create_model_json_obj(
    model_id: str,
    model_version: str,
    model_format: str,
    embedding_dimension: Optional[int] = None,
    pooling_mode: Optional[str] = None,
    workflow_id: Optional[str] = None,
    model_uploader: Optional[str] = None,
    upload_time: Optional[str] = None,
) -> Dict:
    """
    Create a model dict obj to be added to supported_models.json

    :param model_id: Model ID of the pretrained model
    :type model_id: string
    :param model_version: Version of the pretrained model for registration
    :type model_version: string
    :param model_format: Model format ("TORCH_SCRIPT" or "ONNX")
    :type model_format: string
    :param embedding_dimension: Embedding dimension input
    :type embedding_dimension: int
    :param pooling_mode: Pooling mode input ("CLS", "MEAN", "MAX", "MEAN_SQRT_LEN" or None)
    :type pooling_mode: string
    :param workflow_id: Workflow run id
    :type workflow_id: string
    :param model_uploader: Model uploader input
    :type model_uploader: string
    :param upload_time: Upload time input
    :type upload_time: string
    :return: Model dictionary object to be added to supported_models.json
    :rtype: dict
    """
    model_obj = {
        "Model Uploader": "@" + model_uploader if model_uploader is not None else "-",
        "Upload Time": upload_time if upload_time is not None else "-",
        "Model ID": model_id,
        "Model Version": model_version,
        "Model Format": model_format,
        "Embedding Dimension": str(embedding_dimension)
        if embedding_dimension is not None
        else "N/A",
        "Pooling Mode": pooling_mode if pooling_mode is not None else "N/A",
        "Workflow Run ID": workflow_id if workflow_id is not None else "-",
    }
    return model_obj


def sort_models(models: List[Dict]) -> List[Dict]:
    """
    Sort models by upload time, then version, ID, and format

    :param models: List of model dictionary objects to be sorted
    :type models: list[dict]
    :return: Sorted list of model dictionary objects
    :rtype: list[dict]
    """
    models = sorted(
        models,
        key=lambda d: (
            d["Upload Time"],
            d["Model Version"],
            d["Model ID"],
            d["Model Format"],
        ),
    )
    return models


def update_model_json_file(
    model_id: str,
    model_version: str,
    tracing_format: str,
    embedding_dimension: Optional[int] = None,
    pooling_mode: Optional[str] = None,
    workflow_id: Optional[str] = None,
    model_uploader: Optional[str] = None,
    upload_time: Optional[str] = None,
) -> None:
    """
    Update supported_models.json

    :param model_id: Model ID of the pretrained model
    :type model_id: string
    :param model_version: Version of the pretrained model for registration
    :type model_version: string
    :param tracing_format: Tracing format ("TORCH_SCRIPT", "ONNX", or "BOTH")
    :type tracing_format: string
    :param embedding_dimension: Embedding dimension input
    :type embedding_dimension: int
    :param pooling_mode: Pooling mode input ("CLS", "MEAN", "MAX", "MEAN_SQRT_LEN" or None)
    :type pooling_mode: string
    :param workflow_id: Workflow run id
    :type workflow_id: string
    :param model_uploader: Model uploader input
    :type model_uploader: string
    :param upload_time: Upload time input
    :type upload_time: string
    :return: No return value expected
    :rtype: None
    """
    models = []
    if os.path.isfile(MODEL_JSON_FILEPATH):
        with open(MODEL_JSON_FILEPATH, "r") as f:
            models = json.load(f)
    elif not os.path.isdir(DIRNAME):
        os.makedirs(DIRNAME)

    if tracing_format == TORCH_SCRIPT_FORMAT or tracing_format == BOTH_FORMAT:
        model_obj = create_model_json_obj(
            model_id,
            model_version,
            TORCH_SCRIPT_FORMAT,
            embedding_dimension,
            pooling_mode,
            workflow_id,
            model_uploader,
            upload_time,
        )
        models.append(model_obj)

    if tracing_format == ONNX_FORMAT or tracing_format == BOTH_FORMAT:
        model_obj = create_model_json_obj(
            model_id,
            model_version,
            ONNX_FORMAT,
            embedding_dimension,
            pooling_mode,
            workflow_id,
            model_uploader,
            upload_time,
        )
        models.append(model_obj)

    # De-duplicate by converting each dict to a hashable tuple of items,
    # then sort for a stable on-disk order.
    models = [dict(t) for t in {tuple(m.items()) for m in models}]
    models = sort_models(models)
    with open(MODEL_JSON_FILEPATH, "w") as f:
        json.dump(models, f, indent=4)


def update_md_file() -> None:
    """
    Update MODEL_UPLOAD_HISTORY.md

    :return: No return value expected
    :rtype: None
    """
    models = []
    if os.path.exists(MODEL_JSON_FILEPATH):
        with open(MODEL_JSON_FILEPATH, "r") as f:
            models = json.load(f)
    models = sort_models(models)
    # Flat cell list: header row (KEYS) followed by one row per model.
    table_data = KEYS[:]
    for m in models:
        for k in KEYS:
            if k == "Model ID":
                table_data.append(f"`{m[k]}`")
            else:
                table_data.append(m[k])

    table = Table().create_table(
        columns=len(KEYS), rows=len(models) + 1, text=table_data, text_align="center"
    )

    mdFile = MarkDownFile(MD_FILENAME, dirname=DIRNAME)
    mdFile.rewrite_all_file(data=MD_HEADER + table)
    print(f"Finished updating {MD_FILENAME}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "model_id",
        type=str,
        help="Model ID for auto-tracing and uploading (e.g. sentence-transformers/msmarco-distilbert-base-tas-b)",
    )
    parser.add_argument(
        "model_version", type=str, help="Model version number (e.g. 1.0.1)"
    )
    parser.add_argument(
        "tracing_format",
        choices=["BOTH", "TORCH_SCRIPT", "ONNX"],
        help="Model format for auto-tracing",
    )
    parser.add_argument(
        "-ed",
        "--embedding_dimension",
        type=int,
        nargs="?",
        default=None,
        const=None,
        help="Embedding dimension of the model to use if it does not exist in original config.json",
    )
    parser.add_argument(
        "-pm",
        "--pooling_mode",
        type=str,
        nargs="?",
        default=None,
        const=None,
        choices=["CLS", "MEAN", "MAX", "MEAN_SQRT_LEN"],
        help="Pooling mode if it does not exist in original config.json",
    )
    parser.add_argument(
        "-id",
        "--workflow_id",
        type=str,
        nargs="?",
        default=None,
        const=None,
        help="Workflow Run ID",
    )
    parser.add_argument(
        "-u",
        "--model_uploader",
        type=str,
        nargs="?",
        default=None,
        const=None,
        help="Model Uploader",
    )
    parser.add_argument(
        "-t",
        "--upload_time",
        type=str,
        nargs="?",
        default=None,
        const=None,
        help="Upload Time",
    )
    args = parser.parse_args()

    update_model_json_file(
        args.model_id,
        args.model_version,
        args.tracing_format,
        args.embedding_dimension,
        args.pooling_mode,
        args.workflow_id,
        args.model_uploader,
        args.upload_time,
    )

    update_md_file()