triton-inference-server · GuanLuo · Jun 21, 2023 · Jun 7, 2023 · Jun 7, 2023 · Jun 12, 2023
diff --git a/docker/sagemaker/serve b/docker/sagemaker/serve
@@ -27,6 +27,14 @@
 
 SAGEMAKER_SINGLE_MODEL_REPO=/opt/ml/model/
 
+# Ping check defaults to 'ready' in single-model endpoint mode and 'live' in multi-model endpoint mode; an explicitly set SAGEMAKER_TRITON_PING_MODE always wins.
+# https://github.com/kserve/kserve/blob/master/docs/predict-api/v2/rest_predict_v2.yaml#L10-L26
+if [ -z "$SAGEMAKER_TRITON_PING_MODE" ] && [ "$SAGEMAKER_MULTI_MODEL" == "true" ]; then
+    export SAGEMAKER_TRITON_PING_MODE="live"  # decided here: once a default is set, the later MME '-n' check can no longer apply "live"
+else
+    export SAGEMAKER_TRITON_PING_MODE="${SAGEMAKER_TRITON_PING_MODE:-ready}"  # exported so the tritonserver process can read it from its environment
+fi
+
 # Note: in Triton on SageMaker, each model url is registered as a separate repository
 # e.g., /opt/ml/models/<hash>/model. Specifying MME model repo path as /opt/ml/models causes Triton
 # to treat it as an additional empty repository and changes 
@@ -42,8 +50,13 @@ if [ -n "$SAGEMAKER_MULTI_MODEL" ]; then
     if [ "$SAGEMAKER_MULTI_MODEL" == "true" ]; then
         mkdir -p ${SAGEMAKER_MULTI_MODEL_REPO}
         SAGEMAKER_MODEL_REPO=${SAGEMAKER_MULTI_MODEL_REPO}
+        if [ -n "$SAGEMAKER_TRITON_PING_MODE" ]; then
+            SAGEMAKER_TRITON_PING_MODE=${SAGEMAKER_TRITON_PING_MODE}
+        else
+            SAGEMAKER_TRITON_PING_MODE="live" 
+        fi
         is_mme_mode=true
-        echo "Triton is running in SageMaker MME mode." 
+        echo -e "Triton is running in SageMaker MME mode. Using Triton ping mode: \"${SAGEMAKER_TRITON_PING_MODE}\"" 
     fi
 fi
 
@@ -60,6 +73,22 @@ fi
 if [ -n "$SAGEMAKER_SAFE_PORT_RANGE" ]; then
     SAGEMAKER_ARGS="${SAGEMAKER_ARGS} --sagemaker-safe-port-range=${SAGEMAKER_SAFE_PORT_RANGE}"
 fi
+if [ -n "$SAGEMAKER_TRITON_ALLOW_GRPC" ]; then
+    SAGEMAKER_ARGS="${SAGEMAKER_ARGS} --allow-grpc=${SAGEMAKER_TRITON_ALLOW_GRPC}"
+else
+    SAGEMAKER_ARGS="${SAGEMAKER_ARGS} --allow-grpc=false"
+fi
+if [ -n "$SAGEMAKER_TRITON_ALLOW_METRICS" ]; then
+    SAGEMAKER_ARGS="${SAGEMAKER_ARGS} --allow-metrics=${SAGEMAKER_TRITON_ALLOW_METRICS}"
+else
+    SAGEMAKER_ARGS="${SAGEMAKER_ARGS} --allow-metrics=false"
+fi
+if [ -n "$SAGEMAKER_TRITON_METRICS_PORT" ]; then
+    SAGEMAKER_ARGS="${SAGEMAKER_ARGS} --metrics-port=${SAGEMAKER_TRITON_METRICS_PORT}"
+fi
+if [ -n "$SAGEMAKER_TRITON_GRPC_PORT" ]; then
+    SAGEMAKER_ARGS="${SAGEMAKER_ARGS} --grpc-port=${SAGEMAKER_TRITON_GRPC_PORT}"
+fi
 if [ -n "$SAGEMAKER_TRITON_BUFFER_MANAGER_THREAD_COUNT" ]; then
     SAGEMAKER_ARGS="${SAGEMAKER_ARGS} --buffer-manager-thread-count=${SAGEMAKER_TRITON_BUFFER_MANAGER_THREAD_COUNT}"
 fi
@@ -100,6 +129,9 @@ if [ -n "$SAGEMAKER_TRITON_MODEL_LOAD_GPU_LIMIT" ]; then
         SAGEMAKER_ARGS="${SAGEMAKER_ARGS} --model-load-gpu-limit ${i}:${SAGEMAKER_TRITON_MODEL_LOAD_GPU_LIMIT}"
     done
 fi
+if [ -n "$SAGEMAKER_TRITON_ADDITIONAL_ARGS" ]; then
+    SAGEMAKER_ARGS="${SAGEMAKER_ARGS} ${SAGEMAKER_TRITON_ADDITIONAL_ARGS}"
+fi
 
 
 if [ "${is_mme_mode}" = false ] && [ -f "${SAGEMAKER_MODEL_REPO}/config.pbtxt" ]; then
@@ -134,4 +166,4 @@ elif [ "${is_mme_mode}" = false ]; then
     SAGEMAKER_ARGS="${SAGEMAKER_ARGS} --load-model=${SAGEMAKER_TRITON_DEFAULT_MODEL_NAME}"
 fi
 
-tritonserver --allow-sagemaker=true --allow-grpc=false --allow-http=false --allow-metrics=false --model-control-mode=explicit $SAGEMAKER_ARGS
+tritonserver --allow-sagemaker=true --allow-http=false --model-control-mode=explicit $SAGEMAKER_ARGS
diff --git a/qa/L0_sagemaker/test.sh b/qa/L0_sagemaker/test.sh
@@ -353,12 +353,11 @@ if [ "$SERVER_PID" == "0" ]; then
     exit 1
 fi
 
-# Ping and expect server to still be running (using 'live' instead of 'ready')
-# https://github.com/kserve/kserve/blob/master/docs/predict-api/v2/rest_predict_v2.yaml#L10-L26
+# Ping and expect a non-200 (error) status code in SME mode, where the ping mode defaults to 'ready' instead of 'live'.
 set +e
 code=`curl -s -w %{http_code} -o ./ping.out localhost:8080/ping`
 set -e
-if [ "$code" != "200" ]; then
+if [ "$code" == "200" ]; then
     cat ./ping.out
     echo -e "\n***\n*** Test Failed\n***"
     RET=1

diff --git a/src/sagemaker_server.cc b/src/sagemaker_server.cc
@@ -904,7 +904,8 @@ SagemakerAPIServer::SageMakerMMECheckOOMError(TRITONSERVER_Error* err)
       "CUBLAS_STATUS_ALLOC_FAILED",
       "CUBLAS_STATUS_NOT_INITIALIZED",
       "Failed to allocate memory",
-      "failed to allocate memory"};
+      "failed to allocate memory",
+      "No space left on device"};
 
   /*
     TODO: Improve the search to do pattern match on whole words only

diff --git a/src/sagemaker_server.h b/src/sagemaker_server.h
@@ -78,7 +78,7 @@ class SagemakerAPIServer : public HTTPAPIServer {
         model_path_regex_(
             R"((\/opt\/ml\/models\/[0-9A-Za-z._]+)\/(model)\/?([0-9A-Za-z._]+)?)"),
         platform_ensemble_regex_(R"(platform:(\s)*\"ensemble\")"),
-        ping_mode_("live"),
+        ping_mode_(GetEnvironmentVariableOrDefault("SAGEMAKER_TRITON_PING_MODE", "ready")),
         model_name_(GetEnvironmentVariableOrDefault(
             "SAGEMAKER_TRITON_DEFAULT_MODEL_NAME",
             "unspecified_SAGEMAKER_TRITON_DEFAULT_MODEL_NAME")),