Allow changing ping behavior based on env variable in SageMaker

nikhil-sk · nikhil-sk · commit 731109edf135 · 2023-06-07T00:11:22.000Z
diff --git a/docker/sagemaker/serve b/docker/sagemaker/serve
@@ -27,6 +27,10 @@
 
 SAGEMAKER_SINGLE_MODEL_REPO=/opt/ml/model/
 
+# Use 'ready' for ping check in single-model endpoint mode, and use 'live' for ping check in multi-model endpoint mode
+# https://github.com/kserve/kserve/blob/master/docs/predict-api/v2/rest_predict_v2.yaml#L10-L26
+SAGEMAKER_TRITON_PING_MODE="ready"
+
 # Note: in Triton on SageMaker, each model url is registered as a separate repository
 # e.g., /opt/ml/models/<hash>/model. Specifying MME model repo path as /opt/ml/models causes Triton
 # to treat it as an additional empty repository and changes 
@@ -42,8 +46,9 @@ if [ -n "$SAGEMAKER_MULTI_MODEL" ]; then
     if [ "$SAGEMAKER_MULTI_MODEL" == "true" ]; then
         mkdir -p ${SAGEMAKER_MULTI_MODEL_REPO}
         SAGEMAKER_MODEL_REPO=${SAGEMAKER_MULTI_MODEL_REPO}
+        SAGEMAKER_TRITON_PING_MODE="live"
         is_mme_mode=true
-        echo "Triton is running in SageMaker MME mode." 
+        echo -e "Triton is running in SageMaker MME mode. Using Triton ping mode: \"${SAGEMAKER_TRITON_PING_MODE}\"" 
     fi
 fi
 
@@ -134,4 +139,4 @@ elif [ "${is_mme_mode}" = false ]; then
     SAGEMAKER_ARGS="${SAGEMAKER_ARGS} --load-model=${SAGEMAKER_TRITON_DEFAULT_MODEL_NAME}"
 fi
 
-tritonserver --allow-sagemaker=true --allow-grpc=false --allow-http=false --allow-metrics=false --model-control-mode=explicit $SAGEMAKER_ARGS
+tritonserver --allow-sagemaker=true --allow-grpc=true --allow-http=false --allow-metrics=true --model-control-mode=explicit $SAGEMAKER_ARGS
diff --git a/qa/L0_sagemaker/test.sh b/qa/L0_sagemaker/test.sh
@@ -353,12 +353,11 @@ if [ "$SERVER_PID" == "0" ]; then
     exit 1
 fi
 
-# Ping and expect server to still be running (using 'live' instead of 'ready')
-# https://github.com/kserve/kserve/blob/master/docs/predict-api/v2/rest_predict_v2.yaml#L10-L26
+# Ping and expect a non-200 (error) status code in SME (single-model endpoint) mode.
 set +e
 code=`curl -s -w %{http_code} -o ./ping.out localhost:8080/ping`
 set -e
-if [ "$code" != "200" ]; then
+if [ "$code" == "200" ]; then
     cat ./ping.out
     echo -e "\n***\n*** Test Failed\n***"
     RET=1
diff --git a/src/sagemaker_server.cc b/src/sagemaker_server.cc
@@ -904,7 +904,8 @@ SagemakerAPIServer::SageMakerMMECheckOOMError(TRITONSERVER_Error* err)
       "CUBLAS_STATUS_ALLOC_FAILED",
       "CUBLAS_STATUS_NOT_INITIALIZED",
       "Failed to allocate memory",
-      "failed to allocate memory"};
+      "failed to allocate memory",
+      "No space left on device"};
 
   /*
     TODO: Improve the search to do pattern match on whole words only
diff --git a/src/sagemaker_server.h b/src/sagemaker_server.h
@@ -78,7 +78,7 @@ class SagemakerAPIServer : public HTTPAPIServer {
         model_path_regex_(
             R"((\/opt\/ml\/models\/[0-9A-Za-z._]+)\/(model)\/?([0-9A-Za-z._]+)?)"),
         platform_ensemble_regex_(R"(platform:(\s)*\"ensemble\")"),
-        ping_mode_("live"),
+        ping_mode_(GetEnvironmentVariableOrDefault("SAGEMAKER_TRITON_PING_MODE", "ready")),
         model_name_(GetEnvironmentVariableOrDefault(
             "SAGEMAKER_TRITON_DEFAULT_MODEL_NAME",
             "unspecified_SAGEMAKER_TRITON_DEFAULT_MODEL_NAME")),