Skip to content

Commit

Permalink
Allow changing ping behavior based on env variable in SageMaker and e…
Browse files Browse the repository at this point in the history
…ntrypoint updates (#5910)

* Allow changing ping behavior based on env variable in SageMaker

* Add option for additional args

* Make ping further configurable

* Allow further configuration of grpc and http ports

* Update docker/sagemaker/serve

* Update docker/sagemaker/serve

---------

Co-authored-by: GuanLuo <41310872+GuanLuo@users.noreply.github.com>
  • Loading branch information
nikhil-sk and GuanLuo committed Jun 21, 2023
1 parent cbae8c2 commit 7fb4858
Show file tree
Hide file tree
Showing 4 changed files with 39 additions and 7 deletions.
36 changes: 34 additions & 2 deletions docker/sagemaker/serve
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,14 @@

SAGEMAKER_SINGLE_MODEL_REPO=/opt/ml/model/

# Use 'ready' for ping check in single-model endpoint mode, and use 'live' for ping check in multi-model endpoint mode
# https://github.com/kserve/kserve/blob/master/docs/predict-api/v2/rest_predict_v2.yaml#L10-L26
# Resolve the default ping mode for single-model endpoints only.
# A value supplied by the caller always wins. When SAGEMAKER_MULTI_MODEL is
# "true" the variable is deliberately left empty here so that the multi-model
# branch further down can still apply its own 'live' default; assigning 'ready'
# unconditionally would make that branch's emptiness check always fail.
# The variable is marked for export because tritonserver reads
# SAGEMAKER_TRITON_PING_MODE from its environment; the export attribute also
# covers any assignment made to the variable later in this script.
export SAGEMAKER_TRITON_PING_MODE
if [ -z "$SAGEMAKER_TRITON_PING_MODE" ] && [ "$SAGEMAKER_MULTI_MODEL" != "true" ]; then
    SAGEMAKER_TRITON_PING_MODE="ready"
fi

# Note: in Triton on SageMaker, each model url is registered as a separate repository
# e.g., /opt/ml/models/<hash>/model. Specifying MME model repo path as /opt/ml/models causes Triton
# to treat it as an additional empty repository and changes
Expand All @@ -42,8 +50,13 @@ if [ -n "$SAGEMAKER_MULTI_MODEL" ]; then
# Multi-model endpoint (MME) mode is enabled only by the exact value "true".
if [ "$SAGEMAKER_MULTI_MODEL" == "true" ]; then
# Make sure the MME repository directory exists and serve models from it.
mkdir -p ${SAGEMAKER_MULTI_MODEL_REPO}
SAGEMAKER_MODEL_REPO=${SAGEMAKER_MULTI_MODEL_REPO}
# Keep a ping mode that is already set; otherwise fall back to 'live' for MME.
# NOTE(review): the 'live' fallback is reachable only if nothing earlier in the
# script has already given SAGEMAKER_TRITON_PING_MODE a non-empty value - confirm.
if [ -n "$SAGEMAKER_TRITON_PING_MODE" ]; then
SAGEMAKER_TRITON_PING_MODE=${SAGEMAKER_TRITON_PING_MODE}
else
SAGEMAKER_TRITON_PING_MODE="live"
fi
# Flag consulted later in the script to skip single-model-only handling.
is_mme_mode=true
echo "Triton is running in SageMaker MME mode."
echo -e "Triton is running in SageMaker MME mode. Using Triton ping mode: \"${SAGEMAKER_TRITON_PING_MODE}\""
fi
fi

Expand All @@ -60,6 +73,22 @@ fi
# Translate optional SAGEMAKER_* environment settings into tritonserver flags.
# Flags are appended to SAGEMAKER_ARGS in a fixed order; a setting that is unset
# or empty contributes nothing, except the two --allow-* switches, which are
# always emitted and fall back to "false".
[ -z "$SAGEMAKER_SAFE_PORT_RANGE" ] ||
    SAGEMAKER_ARGS="${SAGEMAKER_ARGS} --sagemaker-safe-port-range=${SAGEMAKER_SAFE_PORT_RANGE}"

# gRPC and metrics endpoints stay disabled unless explicitly overridden.
SAGEMAKER_ARGS="${SAGEMAKER_ARGS} --allow-grpc=${SAGEMAKER_TRITON_ALLOW_GRPC:-false}"
SAGEMAKER_ARGS="${SAGEMAKER_ARGS} --allow-metrics=${SAGEMAKER_TRITON_ALLOW_METRICS:-false}"

# Port overrides are only passed through when provided.
[ -z "$SAGEMAKER_TRITON_METRICS_PORT" ] ||
    SAGEMAKER_ARGS="${SAGEMAKER_ARGS} --metrics-port=${SAGEMAKER_TRITON_METRICS_PORT}"
[ -z "$SAGEMAKER_TRITON_GRPC_PORT" ] ||
    SAGEMAKER_ARGS="${SAGEMAKER_ARGS} --grpc-port=${SAGEMAKER_TRITON_GRPC_PORT}"

[ -z "$SAGEMAKER_TRITON_BUFFER_MANAGER_THREAD_COUNT" ] ||
    SAGEMAKER_ARGS="${SAGEMAKER_ARGS} --buffer-manager-thread-count=${SAGEMAKER_TRITON_BUFFER_MANAGER_THREAD_COUNT}"
Expand Down Expand Up @@ -100,6 +129,9 @@ if [ -n "$SAGEMAKER_TRITON_MODEL_LOAD_GPU_LIMIT" ]; then
SAGEMAKER_ARGS="${SAGEMAKER_ARGS} --model-load-gpu-limit ${i}:${SAGEMAKER_TRITON_MODEL_LOAD_GPU_LIMIT}"
done
fi
# Append caller-supplied extra tritonserver flags verbatim; nothing is added
# when SAGEMAKER_TRITON_ADDITIONAL_ARGS is unset or empty.
[ -z "$SAGEMAKER_TRITON_ADDITIONAL_ARGS" ] ||
    SAGEMAKER_ARGS="${SAGEMAKER_ARGS} ${SAGEMAKER_TRITON_ADDITIONAL_ARGS}"


if [ "${is_mme_mode}" = false ] && [ -f "${SAGEMAKER_MODEL_REPO}/config.pbtxt" ]; then
Expand Down Expand Up @@ -134,4 +166,4 @@ elif [ "${is_mme_mode}" = false ]; then
SAGEMAKER_ARGS="${SAGEMAKER_ARGS} --load-model=${SAGEMAKER_TRITON_DEFAULT_MODEL_NAME}"
fi

tritonserver --allow-sagemaker=true --allow-grpc=false --allow-http=false --allow-metrics=false --model-control-mode=explicit $SAGEMAKER_ARGS
tritonserver --allow-sagemaker=true --allow-http=false --model-control-mode=explicit $SAGEMAKER_ARGS
5 changes: 2 additions & 3 deletions qa/L0_sagemaker/test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -353,12 +353,11 @@ if [ "$SERVER_PID" == "0" ]; then
exit 1
fi

# Ping and expect server to still be running (using 'live' instead of 'ready')
# https://github.com/kserve/kserve/blob/master/docs/predict-api/v2/rest_predict_v2.yaml#L10-L26
# Ping and expect error code in SME mode.
set +e
code=`curl -s -w %{http_code} -o ./ping.out localhost:8080/ping`
set -e
if [ "$code" != "200" ]; then
if [ "$code" == "200" ]; then
cat ./ping.out
echo -e "\n***\n*** Test Failed\n***"
RET=1
Expand Down
3 changes: 2 additions & 1 deletion src/sagemaker_server.cc
Original file line number Diff line number Diff line change
Expand Up @@ -904,7 +904,8 @@ SagemakerAPIServer::SageMakerMMECheckOOMError(TRITONSERVER_Error* err)
"CUBLAS_STATUS_ALLOC_FAILED",
"CUBLAS_STATUS_NOT_INITIALIZED",
"Failed to allocate memory",
"failed to allocate memory"};
"failed to allocate memory",
"No space left on device"};

/*
TODO: Improve the search to do pattern match on whole words only
Expand Down
2 changes: 1 addition & 1 deletion src/sagemaker_server.h
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ class SagemakerAPIServer : public HTTPAPIServer {
model_path_regex_(
R"((\/opt\/ml\/models\/[0-9A-Za-z._]+)\/(model)\/?([0-9A-Za-z._]+)?)"),
platform_ensemble_regex_(R"(platform:(\s)*\"ensemble\")"),
ping_mode_("live"),
ping_mode_(GetEnvironmentVariableOrDefault("SAGEMAKER_TRITON_PING_MODE", "ready")),
model_name_(GetEnvironmentVariableOrDefault(
"SAGEMAKER_TRITON_DEFAULT_MODEL_NAME",
"unspecified_SAGEMAKER_TRITON_DEFAULT_MODEL_NAME")),
Expand Down

0 comments on commit 7fb4858

Please sign in to comment.