From 99a3f44077d7f8f3934a08054c63c506ddfd1f32 Mon Sep 17 00:00:00 2001
From: Indrajit Bhosale <iamindrajitb@gmail.com>
Date: Mon, 3 Jun 2024 09:08:07 -0700
Subject: [PATCH] Added new flag for GPU peer access API control (#7261)

Co-authored-by: Iman Tabrizian <iman.tabrizian@gmail.com>
---
 qa/L0_metrics/test.sh      | 28 ++++++++++++++++++++++++++++
 qa/L0_trace/test.sh        |  1 +
 src/command_line_parser.cc | 18 +++++++++++++++++-
 src/command_line_parser.h  |  1 +
 4 files changed, 47 insertions(+), 1 deletion(-)

diff --git a/qa/L0_metrics/test.sh b/qa/L0_metrics/test.sh
index 61d8bbea3b..4b244f1ba2 100755
--- a/qa/L0_metrics/test.sh
+++ b/qa/L0_metrics/test.sh
@@ -140,6 +140,33 @@ kill $SERVER_PID
 wait $SERVER_PID
 set -e
 
+# Peer access GPU memory utilization test: start the server with peer access
+# disabled and zero-byte pinned/CUDA pools, then check it holds no GPU memory.
+export CUSTOM_PINNED_MEMORY_POOL_SIZE=0 # bytes
+export CUDA_VISIBLE_DEVICES=0
+SERVER_LOG="gpu_peer_memory_test_server.log"
+CLIENT_LOG="gpu_peer_memory_test_client.log"
+
+SERVER_ARGS="$BASE_SERVER_ARGS --model-control-mode=explicit --log-verbose=1 --pinned-memory-pool-byte-size=$CUSTOM_PINNED_MEMORY_POOL_SIZE --enable-peer-access=FALSE --cuda-memory-pool-byte-size 0:0"
+run_and_check_server
+
+# Sum the GPU memory nvidia-smi attributes to the server. The PID column is
+# compared exactly so other PIDs or the memory column cannot match by accident.
+# nvidia-smi only lists processes that use GPU memory, so the server may be
+# absent from the output; awk then prints 0.
+set +e
+memory_size_without_peering=$(nvidia-smi --query-compute-apps=pid,process_name,used_memory --format=csv,noheader,nounits | awk -F', ' -v pid="$SERVER_PID" '$1 == pid {sum += $NF} END {print sum + 0}')
+
+kill $SERVER_PID
+wait $SERVER_PID
+set -e
+
+# Fail if the server still held GPU memory with --enable-peer-access=FALSE
+if [ "$memory_size_without_peering" -ne 0 ]; then
+  echo "Disabling PEERING does not reduce GPU memory usage to ZERO"
+  echo -e "\n***\n*** GPU peer access disable test failed.\n***"
+  RET=1
+fi
 
 ### GPU Metrics
 set +e
@@ -411,3 +438,4 @@ else
 fi
 
 exit $RET
+
diff --git a/qa/L0_trace/test.sh b/qa/L0_trace/test.sh
index 8a9172b02f..7d67afb3ba 100755
--- a/qa/L0_trace/test.sh
+++ b/qa/L0_trace/test.sh
@@ -777,6 +777,7 @@ SERVER_ARGS="--allow-sagemaker=true --model-control-mode=explicit \
                 --load-model=simple --load-model=ensemble_add_sub_int32_int32_int32 \
                 --load-model=repeat_int32 \
                 --load-model=input_all_required \
+                --load-model=dynamic_batch \
                 --load-model=bls_simple --trace-config=level=TIMESTAMPS \
                 --load-model=trace_context --trace-config=rate=1 \
                 --trace-config=count=-1 --trace-config=mode=opentelemetry \
diff --git a/src/command_line_parser.cc b/src/command_line_parser.cc
index 608d0bab03..53a103d33b 100644
--- a/src/command_line_parser.cc
+++ b/src/command_line_parser.cc
@@ -373,7 +373,8 @@ enum TritonOptionId {
   OPTION_BACKEND_CONFIG,
   OPTION_HOST_POLICY,
   OPTION_MODEL_LOAD_GPU_LIMIT,
-  OPTION_MODEL_NAMESPACING
+  OPTION_MODEL_NAMESPACING,
+  OPTION_ENABLE_PEER_ACCESS
 };
 
 void
@@ -461,6 +462,13 @@ TritonParser::SetupOptions()
       {OPTION_MODEL_NAMESPACING, "model-namespacing", Option::ArgBool,
        "Whether model namespacing is enable or not. If true, models with the "
        "same name can be served if they are in different namespace."});
+  model_repo_options_.push_back(
+      {OPTION_ENABLE_PEER_ACCESS, "enable-peer-access", Option::ArgBool,
+       "Whether the server tries to enable peer access or not. Even when this "
+       "options is set to true,  "
+       "peer access could still be not enabled because the underlying system "
+       "doesn't support it."
+       " The server will log a warning in this case. Default is true."});
 
 #if defined(TRITON_ENABLE_HTTP)
   http_options_.push_back(
@@ -1100,6 +1108,11 @@ TritonServerParameters::BuildTritonServerOptions()
       TRITONSERVER_ServerOptionsSetModelNamespacing(
           loptions, enable_model_namespacing_),
       "setting model namespacing");
+  THROW_IF_ERR(
+      ParseException,
+      TRITONSERVER_ServerOptionsSetEnablePeerAccess(
+          loptions, enable_peer_access_),
+      "setting peer access");
 
 #ifdef TRITON_ENABLE_LOGGING
   TRITONSERVER_ServerOptionsSetLogFile(loptions, log_file_.c_str());
@@ -1722,6 +1735,9 @@ TritonParser::Parse(int argc, char** argv)
         case OPTION_MODEL_NAMESPACING:
           lparams.enable_model_namespacing_ = ParseOption<bool>(optarg);
           break;
+        case OPTION_ENABLE_PEER_ACCESS:
+          lparams.enable_peer_access_ = ParseOption<bool>(optarg);
+          break;
       }
     }
     catch (const ParseException& pe) {
diff --git a/src/command_line_parser.h b/src/command_line_parser.h
index bf0cb72d3e..762ee87b6d 100644
--- a/src/command_line_parser.h
+++ b/src/command_line_parser.h
@@ -125,6 +125,7 @@ struct TritonServerParameters {
 
   // Model repository manager configuration
   bool enable_model_namespacing_{false};
+  bool enable_peer_access_{true};
   std::set<std::string> model_repository_paths_{};
   TRITONSERVER_ModelControlMode control_mode_{TRITONSERVER_MODEL_CONTROL_NONE};
   std::set<std::string> startup_models_{};