Add response statistics #325

Merged: 10 commits on Feb 17, 2024

Changes from 3 commits
49 changes: 47 additions & 2 deletions include/triton/core/tritonbackend.h
@@ -1,4 +1,4 @@
// Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
@@ -94,7 +94,7 @@ struct TRITONBACKEND_Batcher;
/// }
///
#define TRITONBACKEND_API_VERSION_MAJOR 1
#define TRITONBACKEND_API_VERSION_MINOR 18
#define TRITONBACKEND_API_VERSION_MINOR 19

/// Get the TRITONBACKEND API version supported by Triton. This value
/// can be compared against the TRITONBACKEND_API_VERSION_MAJOR and
@@ -1356,6 +1356,51 @@ TRITONBACKEND_ModelInstanceReportStatistics(
const uint64_t compute_start_ns, const uint64_t compute_end_ns,
const uint64_t exec_end_ns);

/// Record statistics for a decoupled inference response.
///
/// All timestamps should be reported in nanoseconds and collected using
/// std::chrono::steady_clock::now().time_since_epoch() or the equivalent.
///
/// Call this function after calling TRITONBACKEND_ResponseSend, and pass the
/// same send_flags and error object passed to the TRITONBACKEND_ResponseSend.
///
/// For consistency of measurement across different backends, the
/// timestamps should be collected at the following points during
/// TRITONBACKEND_ModelInstanceExecute.
///
/// TRITONBACKEND_ModelInstanceExecute()
/// < start of this response >
/// CAPTURE TIMESPACE (response_start)
/// < generate this response >
/// CAPTURE TIMESPACE (compute_output_start)
/// < allocate output buffers and extract output tensors, including copying
/// the tensors to/from GPU if necessary >
/// CAPTURE TIMESPACE (response_end)
/// < end of this response >
/// return
///
/// \param instance The model instance.
/// \param response_factory The response factory associated with the inference
/// request.
/// \param response_start Timestamp for the start of execution for this
/// response.
/// \param compute_output_start Timestamp for the start of extracting output
/// tensors for this response. Set this to 0 when reporting an empty response.
/// \param response_end Timestamp for the end of extracting output tensors for
/// this response.
/// \param send_flags Flags associated with the response. \see
/// TRITONBACKEND_ResponseSend
/// \param error The TRITONSERVER_Error to send if the response is an error, or
/// nullptr if the response is successful. \see TRITONBACKEND_ResponseSend
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceReportResponseStatistics(
TRITONBACKEND_ModelInstance* instance,
TRITONBACKEND_ResponseFactory* response_factory,
const uint64_t response_start, const uint64_t compute_output_start,
const uint64_t response_end, const uint32_t send_flags,
TRITONSERVER_Error* error);

/// Record statistics for the execution of an entire batch of
/// inference requests.
///
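For context (not part of this diff), a minimal sketch of how a decoupled backend's TRITONBACKEND_ModelInstanceExecute path might use the new call. The `instance`, `factory`, `response`, and `send_flags` variables are assumed to come from the backend's normal response flow, `<chrono>` is assumed to be included, and error handling is omitted:

```c++
// Sketch only: capture the three timestamps in nanoseconds, send the
// response, then report its statistics with the same send_flags and error
// object (nullptr here, since this response is successful).
auto now_ns = [] {
  return static_cast<uint64_t>(
      std::chrono::duration_cast<std::chrono::nanoseconds>(
          std::chrono::steady_clock::now().time_since_epoch())
          .count());
};

const uint64_t response_start = now_ns();
// ... run inference for this response ...
const uint64_t compute_output_start = now_ns();
// ... allocate output buffers and copy output tensors into `response` ...
const uint64_t response_end = now_ns();

TRITONBACKEND_ResponseSend(
    response, send_flags, nullptr /* success */);
TRITONBACKEND_ModelInstanceReportResponseStatistics(
    instance, factory, response_start, compute_output_start, response_end,
    send_flags, nullptr /* success */);
```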
37 changes: 36 additions & 1 deletion src/backend_model_instance.cc
@@ -1,4 +1,4 @@
// Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
@@ -953,6 +953,41 @@ TRITONBACKEND_ModelInstanceReportStatistics(
return nullptr; // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceReportResponseStatistics(
TRITONBACKEND_ModelInstance* instance,
TRITONBACKEND_ResponseFactory* response_factory,
const uint64_t response_start, const uint64_t compute_output_start,
const uint64_t response_end, const uint32_t send_flags,
TRITONSERVER_Error* error)
{
#ifdef TRITON_ENABLE_STATS
TritonModelInstance* ti = reinterpret_cast<TritonModelInstance*>(instance);
std::shared_ptr<InferenceResponseFactory>* rf =
reinterpret_cast<std::shared_ptr<InferenceResponseFactory>*>(
response_factory);
std::string key = std::to_string((*rf)->ResponseStatsIndex());

if (error == nullptr) {
if (compute_output_start > 0) {
RETURN_TRITONSERVER_ERROR_IF_ERROR(
ti->Model()->MutableStatsAggregator()->UpdateResponseSuccess(
key, response_start, compute_output_start, response_end));
} else {
RETURN_TRITONSERVER_ERROR_IF_ERROR(
ti->Model()->MutableStatsAggregator()->UpdateResponseEmpty(
key, response_start, response_end));
}
} else {
RETURN_TRITONSERVER_ERROR_IF_ERROR(
ti->Model()->MutableStatsAggregator()->UpdateResponseFail(
key, response_start, compute_output_start, response_end));
}
#endif // TRITON_ENABLE_STATS

return nullptr; // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceReportBatchStatistics(
TRITONBACKEND_ModelInstance* instance, const uint64_t batch_size,
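To summarize the dispatch in the implementation above (a restatement of the code, not new behavior):

```c++
// error == nullptr && compute_output_start >  0  -> UpdateResponseSuccess
// error == nullptr && compute_output_start == 0  -> UpdateResponseEmpty
// error != nullptr                               -> UpdateResponseFail
// With TRITON_ENABLE_STATS undefined, the function is a no-op and returns
// nullptr (success).
```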
16 changes: 15 additions & 1 deletion src/infer_response.h
@@ -1,4 +1,4 @@
// Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
@@ -61,6 +61,10 @@ class InferenceResponseFactory {
alloc_userp_(alloc_userp), response_fn_(response_fn),
response_userp_(response_userp), response_delegator_(delegator),
is_cancelled_(false)
#ifdef TRITON_ENABLE_STATS
,
response_stats_index_(0)
#endif // TRITON_ENABLE_STATS
{
}

@@ -94,6 +98,11 @@ class InferenceResponseFactory {
void ReleaseTrace() { trace_ = nullptr; }
#endif // TRITON_ENABLE_TRACING

#ifdef TRITON_ENABLE_STATS
// Return the current response statistics index and increment it.
uint64_t ResponseStatsIndex() { return response_stats_index_++; };
#endif // TRITON_ENABLE_STATS

private:
// The model associated with this factory. For normal
// requests/responses this will always be defined and acts to keep
@@ -129,6 +138,11 @@ class InferenceResponseFactory {
// Inference trace associated with this response.
std::shared_ptr<InferenceTraceProxy> trace_;
#endif // TRITON_ENABLE_TRACING

#ifdef TRITON_ENABLE_STATS
// Number of response statistics reported.
std::atomic<uint64_t> response_stats_index_;
#endif // TRITON_ENABLE_STATS
};

//
128 changes: 127 additions & 1 deletion src/infer_stats.cc
@@ -1,4 +1,4 @@
// Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
@@ -210,6 +210,132 @@ InferenceStatsAggregator::UpdateSuccessCacheMiss(
#endif // TRITON_ENABLE_METRICS
}

Status
InferenceStatsAggregator::UpdateResponseSuccess(
const std::string& key, const uint64_t response_start_ns,
const uint64_t compute_output_start_ns, const uint64_t response_end_ns)
{
if (response_start_ns > compute_output_start_ns) {
return Status(
Status::Code::INVALID_ARG,
"Response start cannot happen after compute output start");
}
if (compute_output_start_ns > response_end_ns) {
return Status(
Status::Code::INVALID_ARG,
"Compute output start cannot happen after response end");
}
const uint64_t compute_infer_duration_ns =
compute_output_start_ns - response_start_ns;
const uint64_t compute_output_duration_ns =
response_end_ns - compute_output_start_ns;
const uint64_t total_duration_ns = response_end_ns - response_start_ns;

{
std::lock_guard<std::mutex> lock(mu_);

auto it = response_stats_.find(key);
if (it == response_stats_.end()) {
it = response_stats_.emplace(key, InferResponseStats()).first;
}

it->second.compute_infer_count++;
it->second.compute_infer_duration_ns += compute_infer_duration_ns;
it->second.compute_output_count++;
it->second.compute_output_duration_ns += compute_output_duration_ns;
it->second.success_count++;
it->second.success_duration_ns += total_duration_ns;
}

return Status::Success;
}

Status
InferenceStatsAggregator::UpdateResponseFail(
const std::string& key, const uint64_t response_start_ns,
const uint64_t compute_output_start_ns, const uint64_t response_end_ns)
{
uint64_t compute_infer_duration_ns, compute_output_duration_ns,
total_duration_ns;
if (compute_output_start_ns > 0) {
// output tensors copied
if (response_start_ns > compute_output_start_ns) {
return Status(
Status::Code::INVALID_ARG,
"Response start cannot happen after compute output start");
}
if (compute_output_start_ns > response_end_ns) {
return Status(
Status::Code::INVALID_ARG,
"Compute output start cannot happen after response end");
}
compute_infer_duration_ns = compute_output_start_ns - response_start_ns;
compute_output_duration_ns = response_end_ns - compute_output_start_ns;
total_duration_ns = response_end_ns - response_start_ns;
} else {
// no output tensors copied
if (response_start_ns > response_end_ns) {
return Status(
Status::Code::INVALID_ARG,
"Response start cannot happen after response end");
}
compute_infer_duration_ns = response_end_ns - response_start_ns;
compute_output_duration_ns = 0;
total_duration_ns = response_end_ns - response_start_ns;
}

{
std::lock_guard<std::mutex> lock(mu_);

auto it = response_stats_.find(key);
if (it == response_stats_.end()) {
it = response_stats_.emplace(key, InferResponseStats()).first;
}

it->second.compute_infer_count++;
it->second.compute_infer_duration_ns += compute_infer_duration_ns;
if (compute_output_duration_ns > 0) {
it->second.compute_output_count++;
it->second.compute_output_duration_ns += compute_output_duration_ns;
}
it->second.fail_count++;
it->second.fail_duration_ns += total_duration_ns;
}

return Status::Success;
}

Status
InferenceStatsAggregator::UpdateResponseEmpty(
const std::string& key, const uint64_t response_start_ns,
const uint64_t response_end_ns)
{
if (response_start_ns > response_end_ns) {
return Status(
Status::Code::INVALID_ARG,
"Response start cannot happen after response end");
}
const uint64_t compute_infer_duration_ns =
response_end_ns - response_start_ns;
const uint64_t total_duration_ns = response_end_ns - response_start_ns;

{
std::lock_guard<std::mutex> lock(mu_);

auto it = response_stats_.find(key);
if (it == response_stats_.end()) {
it = response_stats_.emplace(key, InferResponseStats()).first;
}

it->second.compute_infer_count++;
it->second.compute_infer_duration_ns += compute_infer_duration_ns;
it->second.empty_response_count++;
it->second.empty_response_duration_ns += total_duration_ns;
}

return Status::Success;
}

void
InferenceStatsAggregator::UpdateInferBatchStats(
MetricModelReporter* metric_reporter, const size_t batch_size,
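As an illustration of how a single response's timestamps are split by these updates (hypothetical nanosecond values, not taken from the diff):

```c++
// Suppose one successful decoupled response reports:
//   response_start_ns       = 1'000
//   compute_output_start_ns = 1'600
//   response_end_ns         = 2'000
// UpdateResponseSuccess then accumulates, under that response's key:
//   compute_infer_duration_ns  += 600    // 1'600 - 1'000
//   compute_output_duration_ns += 400    // 2'000 - 1'600
//   success_duration_ns        += 1'000  // 2'000 - 1'000
// and increments compute_infer_count, compute_output_count, and
// success_count by one each.
```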
44 changes: 43 additions & 1 deletion src/infer_stats.h
@@ -1,4 +1,4 @@
// Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
@@ -78,6 +78,27 @@ class InferenceStatsAggregator {
uint64_t cache_miss_duration_ns_;
};

struct InferResponseStats {
InferResponseStats()
: compute_infer_count(0), compute_infer_duration_ns(0),
compute_output_count(0), compute_output_duration_ns(0),
success_count(0), success_duration_ns(0), fail_count(0),
fail_duration_ns(0), empty_response_count(0),
empty_response_duration_ns(0)
{
}
uint64_t compute_infer_count;
uint64_t compute_infer_duration_ns;
uint64_t compute_output_count;
uint64_t compute_output_duration_ns;
uint64_t success_count;
uint64_t success_duration_ns;
uint64_t fail_count;
uint64_t fail_duration_ns;
uint64_t empty_response_count;
uint64_t empty_response_duration_ns;
};

struct InferBatchStats {
InferBatchStats()
: count_(0), compute_input_duration_ns_(0),
@@ -100,6 +121,11 @@ class InferenceStatsAggregator {
uint64_t InferenceCount() const { return inference_count_; }
uint64_t ExecutionCount() const { return execution_count_; }
const InferStats& ImmutableInferStats() const { return infer_stats_; }
const std::map<std::string, InferResponseStats>& ImmutableInferResponseStats()
const
{
return response_stats_;
}
const std::map<size_t, InferBatchStats>& ImmutableInferBatchStats() const
{
return batch_stats_;
@@ -140,6 +166,21 @@ class InferenceStatsAggregator {
MetricModelReporter* metric_reporter,
const uint64_t cache_miss_duration_ns);

// Add durations to response stats for a successful response.
Status UpdateResponseSuccess(
const std::string& key, const uint64_t response_start_ns,
const uint64_t compute_output_start_ns, const uint64_t response_end_ns);

// Add durations to response stats for a failed response.
Status UpdateResponseFail(
const std::string& key, const uint64_t response_start_ns,
const uint64_t compute_output_start_ns, const uint64_t response_end_ns);

// Add durations to response stats for an empty response.
Status UpdateResponseEmpty(
const std::string& key, const uint64_t response_start_ns,
const uint64_t response_end_ns);

// Add durations to batch infer stats for a batch execution.
// 'success_request_count' is the number of success requests in the
// batch that have infer_stats attached.
@@ -163,6 +204,7 @@
uint64_t inference_count_;
uint64_t execution_count_;
InferStats infer_stats_;
std::map<std::string, InferResponseStats> response_stats_;
std::map<size_t, InferBatchStats> batch_stats_;
#endif // TRITON_ENABLE_STATS
};
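A minimal sketch (the `aggregator` variable is assumed, not part of this diff) of how a consumer, such as code that serializes model statistics, could read the new per-response map through the accessor added above:

```c++
// `aggregator` is an InferenceStatsAggregator owned by the model (assumed).
const auto& response_stats = aggregator.ImmutableInferResponseStats();
for (const auto& entry : response_stats) {
  const std::string& key = entry.first;  // response index within a request
  const auto& rstats = entry.second;     // InferResponseStats counters
  // e.g. rstats.success_count, rstats.success_duration_ns,
  //      rstats.fail_count, rstats.empty_response_count, ...
}
```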