# Skip to content
#
# Model Upload Workflow: Tracing-Uploading-Releasing #471
#
# Model Upload Workflow: Tracing-Uploading-Releasing
#
# Model Upload Workflow: Tracing-Uploading-Releasing #471

name: "Model Upload Workflow: Tracing-Uploading-Releasing"
on:
  # Step 1: Initiate the workflow.
  # The workflow is started manually; every value below is read by the jobs
  # through `github.event.inputs.<name>`.
  workflow_dispatch:
    inputs:
      model_source:
        description: "Model source (e.g. huggingface)"
        required: true
        type: string
        default: "huggingface"
      model_id:
        description: "Model ID for auto-tracing and uploading (e.g. sentence-transformers/msmarco-distilbert-base-tas-b)"
        required: true
        type: string
      model_version:
        description: "Model version number (e.g. 1.0.1)"
        required: true
        type: string
      tracing_format:
        description: "Model format for auto-tracing (torch_script/onnx)"
        required: true
        type: choice
        options:
          - "BOTH"
          - "TORCH_SCRIPT"
          - "ONNX"
      embedding_dimension:
        description: "(Optional) Embedding Dimension (Specify here if it does not exist in original config.json file, or you want to overwrite it.)"
        required: false
        # `number` is the documented numeric input type for workflow_dispatch;
        # `int` is not among the supported values.
        type: number
      pooling_mode:
        description: "(Optional) Pooling Mode (Specify here if it does not exist in original config.json file or you want to overwrite it.)"
        required: false
        type: choice
        options:
          # The empty option lets the initiator leave the pooling mode unset.
          - ""
          - "CLS"
          - "MEAN"
          - "MAX"
          - "MEAN_SQRT_LEN"
      model_description:
        description: "(Optional) Description (Specify here if you want to overwrite the default model description)"
        required: false
        type: string
      allow_overwrite:
        description: "Allow the workflow to overwrite model in model hub"
        required: true
        type: choice
        options:
          # Quoted so that YAML 1.1 parsers do not read these as booleans.
          - "NO"
          - "YES"
jobs:
# Step 2: Initiate workflow variable
  # Computes values shared by the later jobs and exposes them as job outputs:
  # the target folders, a human-readable summary of the run, and the two
  # license status lines.
  init-workflow-var:
    runs-on: 'ubuntu-latest'
    steps:
      # - name: Fail if branch is not main
      #   if: github.ref != 'refs/heads/main'
      #   run: |
      #     echo "This workflow should only be triggered on 'main' branch"
      #     exit 1
      - name: Initiate folders
        id: init_folders
        # Inputs are handed to the script through environment variables rather
        # than being expanded into the script text, so that quotes, spaces or
        # shell metacharacters in an input cannot alter the commands.
        env:
          MODEL_SOURCE: ${{ github.event.inputs.model_source }}
          MODEL_ID: ${{ github.event.inputs.model_id }}
        run: |
          echo "model_folder=ml-models/${MODEL_SOURCE}/${MODEL_ID}" >> "$GITHUB_OUTPUT"
          # ${MODEL_ID%%/*} keeps only the part of the model ID before the first '/'.
          echo "sentence_transformer_folder=ml-models/${MODEL_SOURCE}/${MODEL_ID%%/*}/" >> "$GITHUB_OUTPUT"
      - name: Initiate workflow_info
        id: init_workflow_info
        env:
          ALLOW_OVERWRITE: ${{ github.event.inputs.allow_overwrite }}
          MODEL_ID: ${{ github.event.inputs.model_id }}
          MODEL_VERSION: ${{ github.event.inputs.model_version }}
          TRACING_FORMAT: ${{ github.event.inputs.tracing_format }}
          EMBEDDING_DIMENSION: ${{ github.event.inputs.embedding_dimension }}
          POOLING_MODE: ${{ github.event.inputs.pooling_mode }}
          MODEL_DESCRIPTION: ${{ github.event.inputs.model_description }}
        run: |
          # Optional inputs fall back to "N/A" when they are empty.
          workflow_info="
          ============= Workflow Details ==============
          - Workflow Name: ${{ github.workflow }}
          - Workflow Run ID: ${{ github.run_id }}
          - Workflow Initiator: @${{ github.actor }}
          - Allow Overwrite: ${ALLOW_OVERWRITE}
          ========= Workflow Input Information =========
          - Model ID: ${MODEL_ID}
          - Model Version: ${MODEL_VERSION}
          - Tracing Format: ${TRACING_FORMAT}
          - Embedding Dimension: ${EMBEDDING_DIMENSION:-N/A}
          - Pooling Mode: ${POOLING_MODE:-N/A}
          - Model Description: ${MODEL_DESCRIPTION:-N/A}
          ======== Workflow Output Information =========
          - Embedding Verification: Passed"
          # Multi-line values are written to GITHUB_OUTPUT with the
          # name<<DELIMITER ... DELIMITER form.
          echo "workflow_info<<EOF" >> "$GITHUB_OUTPUT"
          echo "${workflow_info@E}" >> "$GITHUB_OUTPUT"
          echo "EOF" >> "$GITHUB_OUTPUT"
          echo "${workflow_info@E}"
      - name: Initiate license_line
        id: init_license_line
        run: |
          echo "verified=:white_check_mark: — It is verified that this model is licensed under Apache 2.0" >> "$GITHUB_OUTPUT"
          echo "unverified=- [ ] :warning: The license cannot be verified. Please confirm by yourself that the model is licensed under Apache 2.0 :warning:" >> "$GITHUB_OUTPUT"
    outputs:
      model_folder: ${{ steps.init_folders.outputs.model_folder }}
      sentence_transformer_folder: ${{ steps.init_folders.outputs.sentence_transformer_folder }}
      workflow_info: ${{ steps.init_workflow_info.outputs.workflow_info }}
      verified_license_line: ${{ steps.init_license_line.outputs.verified }}
      unverified_license_line: ${{ steps.init_license_line.outputs.unverified }}
# Step 3: Check if the model already exists in the model hub
  # Runs only the checks matching the requested tracing format, and only when
  # allow_overwrite is "NO". A check fails the job when `head-object` succeeds
  # for the computed key.
  checking-out-model-hub:
    needs: init-workflow-var
    runs-on: 'ubuntu-latest'
    permissions:
      id-token: write
      contents: read
    environment: opensearch-py-ml-cicd-env
    steps:
      - name: Checkout Repository
        uses: actions/checkout@v3
      - name: Set Up Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.x'
      # NOTE(review): the credentials step below is commented out. Because the
      # checks treat ANY `head-object` failure as "model does not exist", they
      # will presumably always pass while no AWS credentials are configured.
      # Confirm whether this step should be re-enabled.
      # - name: Configure AWS Credentials
      #   uses: aws-actions/configure-aws-credentials@v2
      #   with:
      #     aws-region: ${{ secrets.MODEL_UPLOADER_AWS_REGION }}
      #     role-to-assume: ${{ secrets.MODEL_UPLOADER_ROLE }}
      #     role-session-name: checking-out-model-hub
      - name: Check if TORCH_SCRIPT Model Exists
        if: github.event.inputs.allow_overwrite == 'NO' && (github.event.inputs.tracing_format == 'TORCH_SCRIPT' || github.event.inputs.tracing_format == 'BOTH')
        # Inputs and secrets reach the script as environment variables so they
        # are never expanded into the shell source itself.
        env:
          MODEL_FOLDER: ${{ needs.init-workflow-var.outputs.sentence_transformer_folder }}
          MODEL_ID: ${{ github.event.inputs.model_id }}
          MODEL_VERSION: ${{ github.event.inputs.model_version }}
          MODEL_BUCKET: ${{ secrets.MODEL_BUCKET }}
        run: |
          TORCH_FILE_PATH=$(python utils/model_uploader/save_model_file_path_to_env.py \
            "$MODEL_FOLDER" "$MODEL_ID" \
            "$MODEL_VERSION" TORCH_SCRIPT)
          # The flag is set whenever head-object exits non-zero.
          aws s3api head-object --bucket "$MODEL_BUCKET" --key "$TORCH_FILE_PATH" > /dev/null 2>&1 || TORCH_MODEL_NOT_EXIST=true
          if [[ -z $TORCH_MODEL_NOT_EXIST ]]
          then
            echo "${MODEL_ID} already exists on model hub for TORCH_SCRIPT format and ${MODEL_VERSION} version."
            exit 1
          fi
      - name: Check if ONNX Model Exists
        if: github.event.inputs.allow_overwrite == 'NO' && (github.event.inputs.tracing_format == 'ONNX' || github.event.inputs.tracing_format == 'BOTH')
        env:
          MODEL_FOLDER: ${{ needs.init-workflow-var.outputs.sentence_transformer_folder }}
          MODEL_ID: ${{ github.event.inputs.model_id }}
          MODEL_VERSION: ${{ github.event.inputs.model_version }}
          MODEL_BUCKET: ${{ secrets.MODEL_BUCKET }}
        run: |
          ONNX_FILE_PATH=$(python utils/model_uploader/save_model_file_path_to_env.py \
            "$MODEL_FOLDER" "$MODEL_ID" \
            "$MODEL_VERSION" ONNX)
          # The flag is set whenever head-object exits non-zero.
          aws s3api head-object --bucket "$MODEL_BUCKET" --key "$ONNX_FILE_PATH" > /dev/null 2>&1 || ONNX_MODEL_NOT_EXIST=true
          if [[ -z $ONNX_MODEL_NOT_EXIST ]]
          then
            echo "${MODEL_ID} already exists on model hub for ONNX format and ${MODEL_VERSION} version."
            exit 1
          fi
# Step 4: Trace the model, Verify the embeddings & Upload the model files as artifacts
  # Exports the inputs for the tracing run, invokes .ci/run-tests in "trace"
  # mode, enforces a size limit on ./upload/, derives the license and
  # description lines from trace_output/, and stores ./upload/ as an artifact.
  model-auto-tracing:
    needs: [init-workflow-var, checking-out-model-hub]
    name: model-auto-tracing
    runs-on: ubuntu-latest
    permissions:
      id-token: write
      contents: read
    environment: opensearch-py-ml-cicd-env
    strategy:
      matrix:
        cluster: ["opensearch"]
        secured: ["true"]
        entry:
          - opensearch_version: "2.7.0"
    steps:
      - name: Checkout
        uses: actions/checkout@v3
      - name: Export Arguments
        # The inputs are read from INPUT_* environment variables instead of
        # being expanded into the script text, so quotes or shell
        # metacharacters in an input cannot change the commands that run.
        env:
          INPUT_MODEL_ID: ${{ github.event.inputs.model_id }}
          INPUT_MODEL_VERSION: ${{ github.event.inputs.model_version }}
          INPUT_TRACING_FORMAT: ${{ github.event.inputs.tracing_format }}
          INPUT_EMBEDDING_DIMENSION: ${{ github.event.inputs.embedding_dimension }}
          INPUT_POOLING_MODE: ${{ github.event.inputs.pooling_mode }}
          INPUT_MODEL_DESCRIPTION: ${{ github.event.inputs.model_description }}
        run: |
          # Values appended to GITHUB_ENV become environment variables for the
          # following steps of this job.
          echo "MODEL_ID=${INPUT_MODEL_ID}" >> "$GITHUB_ENV"
          echo "MODEL_VERSION=${INPUT_MODEL_VERSION}" >> "$GITHUB_ENV"
          echo "TRACING_FORMAT=${INPUT_TRACING_FORMAT}" >> "$GITHUB_ENV"
          echo "EMBEDDING_DIMENSION=${INPUT_EMBEDDING_DIMENSION}" >> "$GITHUB_ENV"
          echo "POOLING_MODE=${INPUT_POOLING_MODE}" >> "$GITHUB_ENV"
          echo "MODEL_DESCRIPTION=${INPUT_MODEL_DESCRIPTION}" >> "$GITHUB_ENV"
      - name: Autotracing ${{ matrix.cluster }} secured=${{ matrix.secured }} version=${{matrix.entry.opensearch_version}}
        run: "./.ci/run-tests ${{ matrix.cluster }} ${{ matrix.secured }} ${{ matrix.entry.opensearch_version }} trace"
      - name: Limit Model Size to 2GB
        run: |
          # Sum the size column of the recursive listing. "+ 0" makes awk print
          # 0 instead of an empty string when there is nothing to sum, so the
          # numeric comparison below always receives an integer.
          upload_size_in_binary_bytes=$(ls -lR ./upload/ | awk '{ SUM += $5} END {print SUM + 0}')
          size_limit_in_binary_bytes="2147483648"
          echo "Model Artifact Size: $upload_size_in_binary_bytes binary bytes"
          if [ "$upload_size_in_binary_bytes" -ge "$size_limit_in_binary_bytes" ]
          then
            echo "The workflow cannot upload the model artifact that is larger than 2GB."
            exit 1
          fi
      - name: License Verification
        id: license_verification
        run: |
          # trace_output/apache_verified.txt is expected to contain "True" when
          # the license was verified; anything else selects the manual path.
          apache_verified=$(<trace_output/apache_verified.txt)
          if [[ $apache_verified == "True" ]]
          then
            echo "license_line=${{ needs.init-workflow-var.outputs.verified_license_line }}" >> "$GITHUB_OUTPUT"
            echo "license_info=Automatically Verified" >> "$GITHUB_OUTPUT"
          else
            echo "license_line=${{ needs.init-workflow-var.outputs.unverified_license_line }}" >> "$GITHUB_OUTPUT"
            echo "license_info=Manually Verified" >> "$GITHUB_OUTPUT"
          fi
      - name: Model Description Info
        id: model_description_info
        run: |
          model_description_info="$(<trace_output/description.txt)"
          echo "model_description_info=- Model Description: $model_description_info" >> "$GITHUB_OUTPUT"
          echo "$model_description_info"
      - name: Upload Artifact
        uses: actions/upload-artifact@v3
        with:
          name: upload
          path: ./upload/
          retention-days: 5
          if-no-files-found: error
      # - name: Configure AWS Credentials
      #   uses: aws-actions/configure-aws-credentials@v2
      #   with:
      #     aws-region: ${{ secrets.MODEL_UPLOADER_AWS_REGION }}
      #     role-to-assume: ${{ secrets.MODEL_UPLOADER_ROLE }}
      #     role-session-name: model-auto-tracing
      # - name: Dryrun model uploading
      #   id: dryrun_model_uploading
      #   run: |
      #     dryrun_output=$(aws s3 sync ./upload/ s3://${{ secrets.MODEL_BUCKET }}/${{ needs.init-workflow-var.outputs.sentence_transformer_folder }} --dryrun \
      #       | sed 's|s3://${{ secrets.MODEL_BUCKET }}/|s3://(MODEL_BUCKET)/|'
      #     )
      #     echo "dryrun_output<<EOF" >> $GITHUB_OUTPUT
      #     echo "${dryrun_output@E}" >> $GITHUB_OUTPUT
      #     echo "EOF" >> $GITHUB_OUTPUT
      #     echo "${dryrun_output@E}"
    outputs:
      license_line: ${{ steps.license_verification.outputs.license_line }}
      license_info: ${{ steps.license_verification.outputs.license_info }}
      model_description_info: ${{ steps.model_description_info.outputs.model_description_info }}
      # NOTE(review): the step id `dryrun_model_uploading` exists only in the
      # commented-out step above, so this output is empty until that step is
      # restored. It is kept because the manual-approval job reads it.
      dryrun_output: ${{ steps.dryrun_model_uploading.outputs.dryrun_output }}
# Step 5: Ask for manual approval from the CODEOWNERS
  # Builds the approver list from .github/CODEOWNERS, assembles the issue body
  # from the outputs of the earlier jobs, and passes both to the
  # trstringer/manual-approval action.
  manual-approval:
    needs: [init-workflow-var, model-auto-tracing]
    runs-on: 'ubuntu-latest'
    permissions:
      issues: write
    steps:
      - name: Checkout Repository
        uses: actions/checkout@v3
      - name: Get Approvers
        id: get_approvers
        run: |
          # Take the CODEOWNERS lines containing '@', drop '*' and spaces, turn
          # every '@' into ',' and remove the first ',' to obtain a
          # comma-separated list of user names.
          echo "approvers=$(grep @ .github/CODEOWNERS | tr -d '* ' | sed 's/@/,/g' | sed 's/,//1')" >> "$GITHUB_OUTPUT"
      - name: Create Issue Body
        id: create_issue_body
        # The upstream outputs contain user-supplied text (model ID,
        # description, ...). They are passed as environment variables so that
        # text is never expanded into the shell source.
        env:
          LICENSE_LINE: ${{ needs.model-auto-tracing.outputs.license_line }}
          WORKFLOW_INFO: ${{ needs.init-workflow-var.outputs.workflow_info }}
          MODEL_DESCRIPTION_INFO: ${{ needs.model-auto-tracing.outputs.model_description_info }}
          DRYRUN_OUTPUT: ${{ needs.model-auto-tracing.outputs.dryrun_output }}
        run: |
          issue_body="Please approve or deny opensearch-py-ml model uploading:
          ${LICENSE_LINE}
          ${WORKFLOW_INFO}
          ${MODEL_DESCRIPTION_INFO}
          ===== Dry Run of Model Uploading =====
          ${DRYRUN_OUTPUT}"
          echo "issue_body<<EOF" >> "$GITHUB_OUTPUT"
          echo "${issue_body@E}" >> "$GITHUB_OUTPUT"
          echo "EOF" >> "$GITHUB_OUTPUT"
          echo "${issue_body@E}"
      - uses: trstringer/manual-approval@v1
        with:
          secret: ${{ github.TOKEN }}
          approvers: ${{ steps.get_approvers.outputs.approvers }}
          minimum-approvals: 2
          issue-title: "Upload Model to OpenSearch Model Hub (${{ github.event.inputs.model_id }})"
          issue-body: ${{ steps.create_issue_body.outputs.issue_body }}
          exclude-workflow-initiator-as-approver: false
# Step 6: Download the artifacts & Upload it to the S3 bucket
  # Fetches the "upload" artifact, assumes the uploader role, syncs ./upload/
  # into the model bucket and records the upload time as a job output.
  model-uploading:
    needs: [init-workflow-var, manual-approval]
    runs-on: 'ubuntu-latest'
    permissions:
      id-token: write
      contents: read
    environment: opensearch-py-ml-cicd-env
    steps:
      - name: Download Artifact
        # Same major version as the actions/upload-artifact@v3 step that
        # produces this artifact.
        uses: actions/download-artifact@v3
        with:
          name: upload
          path: ./upload/
      - name: Configure AWS Credentials
        uses: aws-actions/configure-aws-credentials@v2
        with:
          aws-region: ${{ secrets.MODEL_UPLOADER_AWS_REGION }}
          role-to-assume: ${{ secrets.MODEL_UPLOADER_ROLE }}
          role-session-name: model-uploading
      - name: Copy Files to the Bucket
        id: copying_to_bucket
        env:
          MODEL_BUCKET: ${{ secrets.MODEL_BUCKET }}
          MODEL_FOLDER: ${{ needs.init-workflow-var.outputs.sentence_transformer_folder }}
        run: |
          # The destination is quoted so the URL is always passed to the CLI as
          # a single argument.
          aws s3 sync ./upload/ "s3://${MODEL_BUCKET}/${MODEL_FOLDER}"
          # Timestamp is taken in the America/Los_Angeles time zone.
          echo "upload_time=$(TZ='America/Los_Angeles' date "+%Y-%m-%d %T")" >> "$GITHUB_OUTPUT"
    outputs:
      upload_time: ${{ steps.copying_to_bucket.outputs.upload_time }}
# Step 7: Update MODEL_UPLOAD_HISTORY.md & supported_models.json
  # Runs the history-update script, opens a pull request with the two updated
  # files, then tries to add a CHANGELOG.md entry on the PR branch. If either
  # changelog step fails, a warning comment is posted on the PR instead.
  history-update:
    needs: [init-workflow-var, model-auto-tracing, model-uploading]
    runs-on: 'ubuntu-latest'
    permissions:
      id-token: write
      contents: write
      pull-requests: write
    env:
      # Short label reused in the PR title, commit messages and changelog line.
      model_info: ${{ github.event.inputs.model_id }} (v.${{ github.event.inputs.model_version }})(${{ github.event.inputs.tracing_format }})
    steps:
      - name: Checkout Repository
        uses: actions/checkout@v3
      - name: Set Up Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.x'
      - name: Install Packages
        run: python -m pip install mdutils
      - name: Update Model Upload History
        # Inputs reach the script as environment variables so they are never
        # expanded into the shell source.
        env:
          MODEL_ID: ${{ github.event.inputs.model_id }}
          MODEL_VERSION: ${{ github.event.inputs.model_version }}
          TRACING_FORMAT: ${{ github.event.inputs.tracing_format }}
          EMBEDDING_DIMENSION: ${{ github.event.inputs.embedding_dimension }}
          POOLING_MODE: ${{ github.event.inputs.pooling_mode }}
          UPLOAD_TIME: ${{ needs.model-uploading.outputs.upload_time }}
        run: |
          # EMBEDDING_DIMENSION and POOLING_MODE are intentionally unquoted:
          # when an optional input is empty the variable expands to nothing, so
          # -ed / -pm are passed without a value, as before.
          python utils/model_uploader/update_models_upload_history_md.py \
            "$MODEL_ID" \
            "$MODEL_VERSION" \
            "$TRACING_FORMAT" \
            -ed $EMBEDDING_DIMENSION \
            -pm $POOLING_MODE \
            -id ${{ github.run_id }} -u ${{ github.actor }} \
            -t "$UPLOAD_TIME"
      - name: Create PR Body
        id: create_pr_body
        env:
          WORKFLOW_INFO: ${{ needs.init-workflow-var.outputs.workflow_info }}
          LICENSE_INFO: ${{ needs.model-auto-tracing.outputs.license_info }}
          MODEL_DESCRIPTION_INFO: ${{ needs.model-auto-tracing.outputs.model_description_info }}
        run: |
          pr_body="
          - [ ] This PR made commit to only these three files: MODEL_UPLOAD_HISTORY.md, supported_models.json, and CHANGELOG.md.
          - [ ] CHANGELOG.md has been updated by the workflow or by you if the workflow fails to do so.
          - [ ] Merge conflicts have been resolved.
          ${WORKFLOW_INFO}
          ${LICENSE_INFO}
          ${MODEL_DESCRIPTION_INFO}"
          echo "pr_body<<EOF" >> "$GITHUB_OUTPUT"
          echo "${pr_body@E}" >> "$GITHUB_OUTPUT"
          echo "EOF" >> "$GITHUB_OUTPUT"
          echo "${pr_body@E}"
      - name: Create a Branch & Raise a PR
        uses: peter-evans/create-pull-request@v5
        id: create_pr
        with:
          committer: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
          commit-message: 'GitHub Actions Workflow: Update Model Upload History - ${{ env.model_info }}'
          signoff: true
          title: 'Update Model Upload History - ${{ env.model_info }}'
          body: ${{ steps.create_pr_body.outputs.pr_body }}
          labels: ModelUploading
          branch: model-uploader/${{ github.run_id }}
          delete-branch: true
          add-paths: |
            ./utils/model_uploader/upload_history/MODEL_UPLOAD_HISTORY.md
            ./utils/model_uploader/upload_history/supported_models.json
      # Switch the working tree to the PR branch before editing CHANGELOG.md.
      - name: Checkout Repository
        uses: actions/checkout@v3
        with:
          ref: model-uploader/${{ github.run_id }}
      - name: Create a line for updating CHANGELOG.md
        id: create_changelog_line
        continue-on-error: true
        run: |
          pr_ref="([#${{ steps.create_pr.outputs.pull-request-number }}](${{ steps.create_pr.outputs.pull-request-url }}))"
          # model_info comes from the job-level env block; reading it as a
          # shell variable keeps the user-supplied model ID out of the script text.
          changelog_line="Update model upload history - ${model_info} by @${{ github.actor }} $pr_ref"
          echo "changelog_line=$changelog_line" >> "$GITHUB_OUTPUT"
      - name: Warning Comment on PR if create_changelog_line fails
        if: steps.create_changelog_line.outcome == 'failure'
        uses: thollander/actions-comment-pull-request@v2
        with:
          pr_number: ${{ steps.create_pr.outputs.pull-request-number }}
          message: |
            Warning:exclamation:: The workflow failed to update CHANGELOG.md. Please update CHANGELOG.md manually.
      - name: Update CHANGELOG.md
        if: steps.create_changelog_line.outcome == 'success'
        id: update_changelog
        continue-on-error: true
        env:
          CHANGELOG_LINE: ${{ steps.create_changelog_line.outputs.changelog_line }}
        run: |
          python utils/model_uploader/update_changelog_md.py "$CHANGELOG_LINE"
      - name: Commit Updates
        if: steps.create_changelog_line.outcome == 'success' && steps.update_changelog.outcome == 'success'
        uses: stefanzweifel/git-auto-commit-action@v4
        id: commit
        with:
          branch: model-uploader/${{ github.run_id }}
          commit_user_email: "github-actions[bot]@users.noreply.github.com"
          commit_message: 'GitHub Actions Workflow: Update CHANGELOG.md - ${{ env.model_info }}'
          commit_options: '--signoff'
          file_pattern: CHANGELOG.md
      - name: Warning Comment on PR if update_changelog fails
        if: steps.create_changelog_line.outcome == 'success' && steps.update_changelog.outcome == 'failure'
        uses: thollander/actions-comment-pull-request@v2
        with:
          pr_number: ${{ steps.create_pr.outputs.pull-request-number }}
          message: |
            Warning:exclamation:: The workflow failed to update CHANGELOG.md. Please add the following line manually.
            >>>
            ${{ steps.create_changelog_line.outputs.changelog_line }}
# Step 8: Trigger Jenkins ml-models workflow
  # Builds a JSON parameter object from the model folder, version and format
  # and hands it, together with the webhook token, to
  # utils/model_uploader/trigger_ml_models_release.sh.
  trigger-ml-models-release-workflow:
    needs: [init-workflow-var, history-update]
    runs-on: 'ubuntu-latest'
    permissions:
      contents: read
    steps:
      - name: Checkout Repository
        uses: actions/checkout@v3
      - name: Trigger Jenkins Workflow with Generic Webhook
        # The secret and the inputs are supplied as environment variables
        # instead of being expanded into the script text. Names are prefixed
        # to avoid clashing with variables the called script may use itself.
        env:
          ML_JENKINS_TRIGGER_TOKEN: ${{ secrets.JENKINS_ML_MODELS_RELEASE_GENERIC_WEBHOOK_TOKEN }}
          ML_BASE_DOWNLOAD_PATH: ${{ needs.init-workflow-var.outputs.model_folder }}
          ML_MODEL_VERSION: ${{ github.event.inputs.model_version }}
          ML_TRACING_FORMAT: ${{ github.event.inputs.tracing_format }}
        run: |
          jenkins_params="{\"BASE_DOWNLOAD_PATH\":\"$ML_BASE_DOWNLOAD_PATH\", \"VERSION\":\"$ML_MODEL_VERSION\", \"FORMAT\":\"$ML_TRACING_FORMAT\"}"
          sh utils/model_uploader/trigger_ml_models_release.sh "$ML_JENKINS_TRIGGER_TOKEN" "$jenkins_params"