Add tests/docs for Pinned Memory metrics #6754

Merged · 16 commits · Jan 12, 2024 · Changes from 13 commits
10 changes: 10 additions & 0 deletions docs/user_guide/metrics.md
@@ -56,6 +56,7 @@ To change the interval at which metrics are polled/updated, see the `--metrics-interval-ms` flag
- [Inference Request Metrics](#inference-request-metrics)
- [GPU Metrics](#gpu-metrics)
- [CPU Metrics](#cpu-metrics)
- [Pinned Memory Metrics](#pinned-memory-metrics)
- [Response Cache Metrics](#response-cache-metrics)
- [Custom Metrics](#custom-metrics)

@@ -282,6 +283,15 @@ If building Triton locally, the `TRITON_ENABLE_METRICS_CPU` CMake build flag can
|CPU Memory | CPU Total Memory | `nv_cpu_memory_total_bytes` | Total CPU memory (RAM), in bytes | System-wide | Per interval |
| | CPU Used Memory | `nv_cpu_memory_used_bytes` | Used CPU memory (RAM), in bytes | System-wide | Per interval |

## Pinned Memory Metrics

Starting in 24.01, Triton offers Pinned Memory metrics to monitor the utilization of the Pinned Memory pool.

|Category |Metric |Metric Name |Description |Granularity|Frequency |
|----------------|------------------|----------------------------|-------------------------------------------------------|-----------|-------------|
|Pinned Memory |Total Pinned memory |`nv_pinned_memory_pool_total_bytes` |Total Pinned memory, in bytes |All models |Per interval |
| |Used Pinned memory |`nv_pinned_memory_pool_used_bytes` |Used Pinned memory, in bytes |All models |Per interval |
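
As a quick illustration, these gauges can be inspected by scraping the metrics endpoint (port 8002 by default). The sample below is a sketch: the metric names match this change, while the `HELP`/`TYPE` text and the values (a default 256 MB pool on an idle server) are illustrative assumptions:

```bash
# Illustrative output; HELP/TYPE text and values are assumptions.
curl -s localhost:8002/metrics | grep nv_pinned_memory_pool
# HELP nv_pinned_memory_pool_total_bytes Pinned memory pool total bytes
# TYPE nv_pinned_memory_pool_total_bytes gauge
nv_pinned_memory_pool_total_bytes 268435456
# HELP nv_pinned_memory_pool_used_bytes Pinned memory pool used bytes
# TYPE nv_pinned_memory_pool_used_bytes gauge
nv_pinned_memory_pool_used_bytes 0
```

The pool size itself can be adjusted with the server's `--pinned-memory-pool-byte-size` flag, as exercised in the tests below.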

## Response Cache Metrics

Cache metrics can be reported in two ways:
9 changes: 9 additions & 0 deletions qa/L0_metrics/metrics_config_test.py
@@ -55,6 +55,10 @@
"nv_cache_hit_duration_per_model",
"nv_cache_miss_duration_per_model",
]
PINNED_MEMORY_PATTERNS = [
"nv_pinned_memory_pool_total_bytes",
"nv_pinned_memory_pool_used_bytes",
]
CACHE_SUMMARY_PATTERNS = ["nv_cache_hit_summary", "nv_cache_miss_summary"]


@@ -65,6 +69,11 @@ def _get_metrics(self):
r.raise_for_status()
return r.text

def test_pinned_memory_metrics_exist(self):
metrics = self._get_metrics()
for metric in PINNED_MEMORY_PATTERNS:
self.assertIn(metric, metrics)

# Counters
def test_inf_counters_exist(self):
metrics = self._get_metrics()
177 changes: 177 additions & 0 deletions qa/L0_metrics/pinned_memory_metrics_test.py
@@ -0,0 +1,177 @@
#!/usr/bin/env python3
# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

import os
import re
import threading
import time
import unittest

import numpy as np
import requests
import tritonclient.http as httpclient
from tritonclient.utils import *

custom_pinned_memory_pool_size = os.environ.get("CUSTOM_PINNED_MEMORY_POOL_SIZE")
if custom_pinned_memory_pool_size is not None:
TOTAL_PINNED_MEMORY_SIZE = int(custom_pinned_memory_pool_size)
else:
# Triton server reserves 256 MB for pinned memory by default.
TOTAL_PINNED_MEMORY_SIZE = 2**28 # bytes, Equivalent to 256 MB
print(f"TOTAL_PINNED_MEMORY_SIZE: {TOTAL_PINNED_MEMORY_SIZE} bytes")

# Pinned memory usage when server is idle (no inference)
DEFAULT_USED_PINNED_MEMORY_SIZE = 0 # bytes


# Extract the gauge values from the Prometheus text exposition, e.g.
# "nv_pinned_memory_pool_total_bytes 268435456".
total_bytes_pattern = re.compile(r"pool_total_bytes (\d+)")
used_bytes_pattern = re.compile(r"pool_used_bytes (\d+)")


def _get_metrics():
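    """Scrape the metrics endpoint and return the pinned memory gauge values as strings."""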
r = requests.get("http://localhost:8002/metrics")
r.raise_for_status()

total_bytes_match = total_bytes_pattern.search(r.text)
total_bytes_value = total_bytes_match.group(1)

used_bytes_match = used_bytes_pattern.search(r.text)
used_bytes_value = used_bytes_match.group(1)

return total_bytes_value, used_bytes_value


class TestPinnedMemoryMetrics(unittest.TestCase):
def setUp(self):
self.inference_completed = threading.Event()

shape = [1, 16]
self.model_name = "libtorch_float32_float32_float32"
input0_data = np.random.rand(*shape).astype(np.float32)
input1_data = np.random.rand(*shape).astype(np.float32)

self.inputs = [
httpclient.InferInput(
"INPUT0", input0_data.shape, "FP32"
).set_data_from_numpy(input0_data),
httpclient.InferInput(
"INPUT1", input1_data.shape, "FP32"
).set_data_from_numpy(input1_data),
]

self.outputs = [
httpclient.InferRequestedOutput("OUTPUT__0"),
httpclient.InferRequestedOutput("OUTPUT__1"),
]

# Before loading the model
self._assert_pinned_memory_utilization()

def _assert_pinned_memory_utilization(self):
total_bytes_value, used_bytes_value = _get_metrics()
self.assertEqual(int(total_bytes_value), TOTAL_PINNED_MEMORY_SIZE)
self.assertEqual(int(used_bytes_value), DEFAULT_USED_PINNED_MEMORY_SIZE)

def _collect_metrics(self):
while not self.inference_completed.is_set():
total_bytes_value, used_bytes_value = _get_metrics()
self.assertEqual(int(total_bytes_value), TOTAL_PINNED_MEMORY_SIZE)
            # Assert pinned memory usage is within anticipated values: each of
            # this model's I/O tensors is 1x16 FP32 = 64 bytes, so usage should
            # be a multiple of 64, up to the 256 bytes of one request's four
            # tensors (an assumption based on the configured shapes).
            self.assertIn(int(used_bytes_value), [0, 64, 128, 192, 256])

def test_pinned_memory_metrics_asynchronous_requests(self):
with httpclient.InferenceServerClient(
url="localhost:8000", concurrency=10
) as client:
if not client.is_model_ready(self.model_name):
client.load_model(self.model_name)

# Before starting the inference
self._assert_pinned_memory_utilization()

# Start a thread to collect metrics asynchronously
metrics_thread = threading.Thread(target=self._collect_metrics)
metrics_thread.start()

# Asynchronous inference requests
async_requests = []
for _ in range(100):
async_requests.append(
client.async_infer(
model_name=self.model_name,
inputs=self.inputs,
outputs=self.outputs,
)
)

time.sleep(1)

# Set the event to indicate that inference is completed
self.inference_completed.set()

# Wait for all inference requests to complete
for async_request in async_requests:
async_request.get_result()

# Wait for the metrics thread to complete
metrics_thread.join()

        # After inference completes, used_bytes_value should come down to 0
self._assert_pinned_memory_utilization()

def test_pinned_memory_metrics_synchronous_requests(self):
with httpclient.InferenceServerClient(url="localhost:8000") as client:
if not client.is_model_ready(self.model_name):
client.load_model(self.model_name)

# Before starting the inference
self._assert_pinned_memory_utilization()

# Start a thread to collect metrics asynchronously
metrics_thread = threading.Thread(target=self._collect_metrics)
metrics_thread.start()

# Synchronous inference requests
for _ in range(100):
response = client.infer(
model_name=self.model_name, inputs=self.inputs, outputs=self.outputs
)

response.get_response()

# Set the event to indicate that inference is completed
self.inference_completed.set()

# Wait for the metrics thread to complete
metrics_thread.join()

        # After inference completes, used_bytes_value should come down to 0
self._assert_pinned_memory_utilization()


if __name__ == "__main__":
unittest.main()
39 changes: 35 additions & 4 deletions qa/L0_metrics/test.sh
@@ -45,12 +45,13 @@ SERVER=${TRITON_DIR}/bin/tritonserver
BASE_SERVER_ARGS="--model-repository=${MODELDIR}"
SERVER_ARGS="${BASE_SERVER_ARGS}"
SERVER_LOG="./inference_server.log"
PYTHON_TEST="metrics_config_test.py"
source ../common/util.sh

CLIENT_LOG="client.log"
TEST_RESULT_FILE="test_results.txt"
function check_unit_test() {
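    # The test output is piped through tee, so check the test's exit code via
    # PIPESTATUS[0] rather than $?, which would reflect tee's status.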
    if [ "${PIPESTATUS[0]}" -ne 0 ]; then
cat $CLIENT_LOG
echo -e "\n***\n*** Test Failed\n***"
RET=1
@@ -100,8 +101,6 @@ if [ $? -ne 0 ]; then
fi
set -e

# Prepare a libtorch float32 model with basic config
rm -rf $MODELDIR
model=libtorch_float32_float32_float32
@@ -112,8 +111,41 @@ mkdir -p $MODELDIR/${model}/1 && \
sed -i "s/label_filename:.*//" config.pbtxt && \
echo "instance_group [{ kind: KIND_GPU }]" >> config.pbtxt)

### Pinned memory metrics tests
set +e
CLIENT_PY="./pinned_memory_metrics_test.py"
SERVER_LOG="pinned_memory_metrics_test_server.log"
SERVER_ARGS="$BASE_SERVER_ARGS --metrics-interval-ms=1 --model-control-mode=explicit --log-verbose=1"
run_and_check_server
python3 ${PYTHON_TEST} MetricsConfigTest.test_pinned_memory_metrics_exist -v 2>&1 | tee ${CLIENT_LOG}
check_unit_test

CLIENT_LOG="pinned_memory_metrics_test_client.log"
python3 ${CLIENT_PY} -v 2>&1 | tee ${CLIENT_LOG}
check_unit_test

kill $SERVER_PID
wait $SERVER_PID

# Custom Pinned memory pool size
export CUSTOM_PINNED_MEMORY_POOL_SIZE=1024 # bytes
SERVER_LOG="custom_pinned_memory_test_server.log"
CLIENT_LOG="custom_pinned_memory_test_client.log"
SERVER_ARGS="$BASE_SERVER_ARGS --metrics-interval-ms=1 --model-control-mode=explicit --log-verbose=1 --pinned-memory-pool-byte-size=$CUSTOM_PINNED_MEMORY_POOL_SIZE"
run_and_check_server
python3 ${CLIENT_PY} -v 2>&1 | tee ${CLIENT_LOG}
check_unit_test

kill $SERVER_PID
wait $SERVER_PID
set -e


### GPU Metrics
set +e
export CUDA_VISIBLE_DEVICES=0,1,2
SERVER_LOG="./inference_server.log"
CLIENT_LOG="client.log"
run_and_check_server

num_gpus=`curl -s localhost:8002/metrics | grep "nv_gpu_utilization{" | wc -l`
@@ -227,7 +259,6 @@ MODELDIR="${PWD}/unit_test_models"
mkdir -p "${MODELDIR}/identity_cache_on/1"
mkdir -p "${MODELDIR}/identity_cache_off/1"
BASE_SERVER_ARGS="--model-repository=${MODELDIR} --model-control-mode=explicit"

# Check default settings: Counters should be enabled, summaries should be disabled
SERVER_ARGS="${BASE_SERVER_ARGS} --load-model=identity_cache_off"