Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Allow changing ping behavior based on env variable in SageMaker and entrypoint updates #5910

Merged
merged 6 commits into from
Jun 21, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 34 additions & 2 deletions docker/sagemaker/serve
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,14 @@

SAGEMAKER_SINGLE_MODEL_REPO=/opt/ml/model/

# Use 'ready' for ping check in single-model endpoint mode, and use 'live' for ping check in multi-model endpoint mode
# https://github.com/kserve/kserve/blob/master/docs/predict-api/v2/rest_predict_v2.yaml#L10-L26
if [ -n "$SAGEMAKER_TRITON_PING_MODE" ]; then
SAGEMAKER_TRITON_PING_MODE=${SAGEMAKER_TRITON_PING_MODE}
else
SAGEMAKER_TRITON_PING_MODE="ready"
fi

# Note: in Triton on SageMaker, each model url is registered as a separate repository
# e.g., /opt/ml/models/<hash>/model. Specifying MME model repo path as /opt/ml/models causes Triton
# to treat it as an additional empty repository and changes
Expand All @@ -42,8 +50,13 @@ if [ -n "$SAGEMAKER_MULTI_MODEL" ]; then
if [ "$SAGEMAKER_MULTI_MODEL" == "true" ]; then
mkdir -p ${SAGEMAKER_MULTI_MODEL_REPO}
SAGEMAKER_MODEL_REPO=${SAGEMAKER_MULTI_MODEL_REPO}
if [ -n "$SAGEMAKER_TRITON_PING_MODE" ]; then
SAGEMAKER_TRITON_PING_MODE=${SAGEMAKER_TRITON_PING_MODE}
else
SAGEMAKER_TRITON_PING_MODE="live"
fi
is_mme_mode=true
echo "Triton is running in SageMaker MME mode."
echo -e "Triton is running in SageMaker MME mode. Using Triton ping mode: \"${SAGEMAKER_TRITON_PING_MODE}\""
fi
fi

Expand All @@ -60,6 +73,22 @@ fi
if [ -n "$SAGEMAKER_SAFE_PORT_RANGE" ]; then
SAGEMAKER_ARGS="${SAGEMAKER_ARGS} --sagemaker-safe-port-range=${SAGEMAKER_SAFE_PORT_RANGE}"
fi
if [ -n "$SAGEMAKER_TRITON_ALLOW_GRPC" ]; then
SAGEMAKER_ARGS="${SAGEMAKER_ARGS} --allow-grpc=${SAGEMAKER_TRITON_ALLOW_GRPC}"
else
SAGEMAKER_ARGS="${SAGEMAKER_ARGS} --allow-grpc=false"
fi
if [ -n "$SAGEMAKER_TRITON_ALLOW_METRICS" ]; then
SAGEMAKER_ARGS="${SAGEMAKER_ARGS} --allow-metrics=${SAGEMAKER_TRITON_ALLOW_METRICS}"
else
SAGEMAKER_ARGS="${SAGEMAKER_ARGS} --allow-metrics=false"
fi
if [ -n "$SAGEMAKER_TRITON_METRICS_PORT" ]; then
SAGEMAKER_ARGS="${SAGEMAKER_ARGS} --metrics-port=${SAGEMAKER_TRITON_METRICS_PORT}"
fi
if [ -n "$SAGEMAKER_TRITON_GRPC_PORT" ]; then
SAGEMAKER_ARGS="${SAGEMAKER_ARGS} --grpc-port=${SAGEMAKER_TRITON_GRPC_PORT}"
fi
if [ -n "$SAGEMAKER_TRITON_BUFFER_MANAGER_THREAD_COUNT" ]; then
SAGEMAKER_ARGS="${SAGEMAKER_ARGS} --buffer-manager-thread-count=${SAGEMAKER_TRITON_BUFFER_MANAGER_THREAD_COUNT}"
fi
Expand Down Expand Up @@ -100,6 +129,9 @@ if [ -n "$SAGEMAKER_TRITON_MODEL_LOAD_GPU_LIMIT" ]; then
SAGEMAKER_ARGS="${SAGEMAKER_ARGS} --model-load-gpu-limit ${i}:${SAGEMAKER_TRITON_MODEL_LOAD_GPU_LIMIT}"
done
fi
if [ -n "$SAGEMAKER_TRITON_ADDITIONAL_ARGS" ]; then
SAGEMAKER_ARGS="${SAGEMAKER_ARGS} ${SAGEMAKER_TRITON_ADDITIONAL_ARGS}"
fi


if [ "${is_mme_mode}" = false ] && [ -f "${SAGEMAKER_MODEL_REPO}/config.pbtxt" ]; then
Expand Down Expand Up @@ -134,4 +166,4 @@ elif [ "${is_mme_mode}" = false ]; then
SAGEMAKER_ARGS="${SAGEMAKER_ARGS} --load-model=${SAGEMAKER_TRITON_DEFAULT_MODEL_NAME}"
fi

tritonserver --allow-sagemaker=true --allow-grpc=false --allow-http=false --allow-metrics=false --model-control-mode=explicit $SAGEMAKER_ARGS
tritonserver --allow-sagemaker=true --allow-http=false --model-control-mode=explicit $SAGEMAKER_ARGS
5 changes: 2 additions & 3 deletions qa/L0_sagemaker/test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -353,12 +353,11 @@ if [ "$SERVER_PID" == "0" ]; then
exit 1
fi

# Ping and expect server to still be running (using 'live' instead of 'ready')
# https://github.com/kserve/kserve/blob/master/docs/predict-api/v2/rest_predict_v2.yaml#L10-L26
# Ping and expect error code in SME mode.
set +e
code=`curl -s -w %{http_code} -o ./ping.out localhost:8080/ping`
set -e
if [ "$code" != "200" ]; then
if [ "$code" == "200" ]; then
cat ./ping.out
echo -e "\n***\n*** Test Failed\n***"
RET=1
Expand Down
3 changes: 2 additions & 1 deletion src/sagemaker_server.cc
Original file line number Diff line number Diff line change
Expand Up @@ -904,7 +904,8 @@ SagemakerAPIServer::SageMakerMMECheckOOMError(TRITONSERVER_Error* err)
"CUBLAS_STATUS_ALLOC_FAILED",
"CUBLAS_STATUS_NOT_INITIALIZED",
"Failed to allocate memory",
"failed to allocate memory"};
"failed to allocate memory",
"No space left on device"};

/*
TODO: Improve the search to do pattern match on whole words only
Expand Down
2 changes: 1 addition & 1 deletion src/sagemaker_server.h
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ class SagemakerAPIServer : public HTTPAPIServer {
model_path_regex_(
R"((\/opt\/ml\/models\/[0-9A-Za-z._]+)\/(model)\/?([0-9A-Za-z._]+)?)"),
platform_ensemble_regex_(R"(platform:(\s)*\"ensemble\")"),
ping_mode_("live"),
ping_mode_(GetEnvironmentVariableOrDefault("SAGEMAKER_TRITON_PING_MODE", "ready")),
model_name_(GetEnvironmentVariableOrDefault(
"SAGEMAKER_TRITON_DEFAULT_MODEL_NAME",
"unspecified_SAGEMAKER_TRITON_DEFAULT_MODEL_NAME")),
Expand Down