
test: Refactor cpu metrics tests to make L0_metrics more stable #7476

Merged
merged 12 commits into from
Jul 29, 2024
187 changes: 187 additions & 0 deletions qa/L0_metrics/cpu_metrics_test.py
@@ -0,0 +1,187 @@
#!/usr/bin/python
# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

import re
import threading
import time
import unittest
from collections import defaultdict

import numpy as np
import requests
import tritonclient.http as httpclient

CPU_UTILIZATION = "nv_cpu_utilization"
CPU_USED_MEMORY = "nv_cpu_memory_used_bytes"
CPU_TOTAL_MEMORY = "nv_cpu_memory_total_bytes"


def get_metrics():
utilization_pattern = re.compile(rf"{CPU_UTILIZATION} (\d+\.?\d*)")
used_bytes_pattern = re.compile(rf"{CPU_USED_MEMORY} (\d+)")
total_bytes_pattern = re.compile(rf"{CPU_TOTAL_MEMORY} (\d+)")
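    # For reference, each pattern is expected to match a single line of the
    # Prometheus-style text returned by the /metrics endpoint, e.g.
    # (values below are illustrative only):
    #   nv_cpu_utilization 0.25
    #   nv_cpu_memory_used_bytes 8589934592
    #   nv_cpu_memory_total_bytes 34359738368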

r = requests.get("http://localhost:8002/metrics")
r.raise_for_status()

utilization_match = utilization_pattern.search(r.text)
utilization_value = float(utilization_match.group(1))

used_bytes_match = used_bytes_pattern.search(r.text)
used_bytes_value = int(used_bytes_match.group(1))

total_bytes_match = total_bytes_pattern.search(r.text)
total_bytes_value = int(total_bytes_match.group(1))

return utilization_value, used_bytes_value, total_bytes_value


class TestCpuMetrics(unittest.TestCase):
def setUp(self):
self.inference_completed = threading.Event()

shape = [1, 16]
self.model_name = "libtorch_float32_float32_float32"
input0_data = np.random.rand(*shape).astype(np.float32)
input1_data = np.random.rand(*shape).astype(np.float32)

self.inputs = [
httpclient.InferInput(
"INPUT0", input0_data.shape, "FP32"
).set_data_from_numpy(input0_data),
httpclient.InferInput(
"INPUT1", input1_data.shape, "FP32"
).set_data_from_numpy(input1_data),
]

def _validate_metric_variance(self, observed_metrics: dict):
dupe_value_tolerance = 5
for metric in [CPU_UTILIZATION, CPU_USED_MEMORY]:
observed_values = observed_metrics[metric]
observed_count = len(observed_values)
print(
f"Observed {metric} count: {observed_count}, values: {observed_values}"
)

            # Must have collected at least 1 more sample than the duplicate
            # tolerance for the sequential-duplicate check below to be meaningful.
            self.assertGreater(
                observed_count,
                dupe_value_tolerance,
                f"Not enough observations of {metric} were collected. Double check the server-side --metrics-interval-ms and observation interval in this test, or consider tuning the duplicate tolerance.",
            )

# Don't allow observed metric values to be repeated sequentially
# more than a certain tolerance. The expectation is that these metrics
# will vary while the server is processing requests in the background,
# provided the server was configured with a small metrics update interval.
sequential_dupes = 0
max_sequential_dupes = 0
prev_value = observed_values[0]
for value in observed_values[1:]:
if value == prev_value:
sequential_dupes += 1
else:
# If unique value found, reset counter
sequential_dupes = 0

                # Track the longest duplicate streak for observability, to help tune the tolerance
if sequential_dupes > max_sequential_dupes:
max_sequential_dupes = sequential_dupes

                self.assertLess(
                    sequential_dupes,
                    dupe_value_tolerance,
                    f"Found too many sequential duplicate values for {metric}.",
                )
prev_value = value

print(
f"Max sequential duplicate values found for {metric}: {max_sequential_dupes}"
)

def _collect_metrics(self, observed_metrics, interval_secs=1):
"""
Collects metrics at provided 'interval_secs' and stores them in the
provided 'observed_metrics' dictionary for postprocessing.
"""
# Give the test and server some time to begin processing requests
# before beginning observation loop.
time.sleep(1)

while not self.inference_completed.is_set():
util_value, used_memory_value, _ = get_metrics()
observed_metrics[CPU_UTILIZATION].append(util_value)
observed_metrics[CPU_USED_MEMORY].append(used_memory_value)
time.sleep(interval_secs)

def test_cpu_metrics_during_inference(self):
with httpclient.InferenceServerClient(
url="localhost:8000", concurrency=10
) as client:
# Start a thread to collect metrics asynchronously while inferences are
# executing, store them in a dictionary for postprocessing validation.
observed_metrics = defaultdict(list)
metrics_thread = threading.Thread(
target=self._collect_metrics, args=(observed_metrics,)
)
metrics_thread.start()

            # Fire off many asynchronous inference requests to keep the server
            # busy while monitoring the CPU metrics. The ideal target is about
            # 20-30 seconds of inference to get a good number of metric samples.
async_requests = []
for _ in range(2000):
async_requests.append(
client.async_infer(
model_name=self.model_name,
inputs=self.inputs,
)
)

# Wait for all inference requests to complete
for async_request in async_requests:
async_request.get_result()

# Set the event to indicate that inference is completed
self.inference_completed.set()

# Wait for the metrics thread to complete
metrics_thread.join()

self._validate_metric_variance(observed_metrics)

def test_cpu_metrics_ranges(self):
        # Run some simple sanity checks on the expected ranges of values
        # for the CPU-related metrics.
utilization, used_memory, total_memory = get_metrics()
self.assertGreaterEqual(utilization, 0)
self.assertLessEqual(utilization, 1.0)
self.assertGreater(used_memory, 0)
self.assertLessEqual(used_memory, total_memory)
        # NOTE: Could be improved in the future to compare the upper bound against
        # psutil's reported system memory if we introduce that dependency into the
        # test/container.
self.assertGreater(total_memory, 0)


if __name__ == "__main__":
unittest.main()
59 changes: 17 additions & 42 deletions qa/L0_metrics/test.sh
@@ -111,6 +111,23 @@ mkdir -p $MODELDIR/${model}/1 && \
sed -i "s/label_filename:.*//" config.pbtxt && \
echo "instance_group [{ kind: KIND_GPU }]" >> config.pbtxt)

### CPU / RAM metrics tests
set +e
CLIENT_PY="./cpu_metrics_test.py"
SERVER_LOG="cpu_metrics_test_server.log"
# NOTE: CPU utilization is computed based on the metrics interval, so having
# too small of an interval can skew the results.
SERVER_ARGS="$BASE_SERVER_ARGS --metrics-interval-ms=1000 --log-verbose=1"
run_and_check_server

CLIENT_LOG="cpu_metrics_test_client.log"
python3 ${CLIENT_PY} -v 2>&1 | tee ${CLIENT_LOG}
check_unit_test
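
# For a quick manual sanity check (illustrative only, not part of the test),
# the raw CPU metric values can be inspected directly while the server is
# still running, e.g.:
#   curl -s localhost:8002/metrics | grep "^nv_cpu_"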

kill $SERVER_PID
wait $SERVER_PID
set -e

### Pinned memory metrics tests
set +e
CLIENT_PY="./pinned_memory_metrics_test.py"
@@ -236,48 +253,6 @@ for (( i = 0; i < $num_iterations; ++i )); do
prev_energy=$current_energy
done

### CPU / RAM Metrics

# The underlying values for these metrics do not always update frequently,
# so give ample WAIT time to make sure they change and are being updated.
CPU_METRICS="nv_cpu_utilization nv_cpu_memory_used_bytes"
WAIT_INTERVAL_SECS=2.0
for metric in ${CPU_METRICS}; do
echo -e "\n=== Checking Metric: ${metric} ===\n"
prev_value=`curl -s localhost:8002/metrics | grep ${metric} | grep -v "HELP\|TYPE" | awk '{print $2}'`

num_not_updated=0
num_not_updated_threshold=3
for (( i = 0; i < $num_iterations; ++i )); do
sleep $WAIT_INTERVAL_SECS
current_value=`curl -s localhost:8002/metrics | grep ${metric} | grep -v "HELP\|TYPE" | awk '{print $2}'`
if [ $current_value == $prev_value ]; then
num_not_updated=$((num_not_updated+1))
fi
prev_value=$current_value
done

# Give CPU metrics some tolerance to not update, up to a threshold
# DLIS-4304: An alternative may be to run some busy work on CPU in the
# background rather than allowing a tolerance threshold
if [[ ${num_not_updated} -gt ${num_not_updated_threshold} ]]; then
cat $SERVER_LOG
echo "Metrics were not updated ${num_not_updated}/${num_iterations} times for interval of ${METRICS_INTERVAL_MS} milliseconds for metric: ${metric}"
echo -e "\n***\n*** Metric Interval test failed. \n***"
RET=1
break
fi
done

# Verify reported total memory is non-zero
total_memory=`curl -s localhost:8002/metrics | grep "nv_cpu_memory_total_bytes" | grep -v "HELP\|TYPE" | awk '{print $2}'`
test -z "${total_memory}" && total_memory=0
if [ ${total_memory} -eq 0 ]; then
echo "Found nv_cpu_memory_total_bytes had a value of zero, this should not happen."
echo -e "\n***\n*** CPU total memory test failed. \n***"
RET=1
fi

kill $SERVER_PID
wait $SERVER_PID
