From 2b1c330111c121e52746e87a8f41985805f02b76 Mon Sep 17 00:00:00 2001 From: kthui <18255193+kthui@users.noreply.github.com> Date: Fri, 2 Feb 2024 18:58:29 -0800 Subject: [PATCH 01/10] Add response statistics --- include/triton/core/tritonbackend.h | 47 +++++++++- src/backend_model_instance.cc | 37 +++++++- src/infer_response.h | 16 +++- src/infer_stats.cc | 128 +++++++++++++++++++++++++++- src/infer_stats.h | 44 +++++++++- src/tritonserver.cc | 35 +++++++- src/tritonserver_stub.cc | 6 +- 7 files changed, 306 insertions(+), 7 deletions(-) diff --git a/include/triton/core/tritonbackend.h b/include/triton/core/tritonbackend.h index ef3af71b5..82e8f75af 100644 --- a/include/triton/core/tritonbackend.h +++ b/include/triton/core/tritonbackend.h @@ -1,4 +1,4 @@ -// Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -1356,6 +1356,51 @@ TRITONBACKEND_ModelInstanceReportStatistics( const uint64_t compute_start_ns, const uint64_t compute_end_ns, const uint64_t exec_end_ns); +/// Record statistics for a decoupled inference response. +/// +/// All timestamps should be reported in nanonseconds and collected using +/// std::chrono::steady_clock::now().time_since_epoch() or the equivalent. +/// +/// Call this function after calling TRITONBACKEND_ResponseSend, and pass the +/// same send_flags and error object passed to the TRITONBACKEND_ResponseSend. +/// +/// For consistency of measurement across different backends, the +/// timestamps should be collected at the following points during +/// TRITONBACKEND_ModelInstanceExecute. +/// +/// TRITONBACKEND_ModelInstanceExecute() +/// < start of this response > +/// CAPTURE TIMESPACE (response_start) +/// < generate this response > +/// CAPTURE TIMESPACE (compute_output_start) +/// < allocate output buffers and extract output tensors, including copying +/// the tensors to/from GPU if necessary > +/// CAPTURE TIMESPACE (response_end) +/// < end of this response > +/// return +/// +/// \param instance The model instance. +/// \param response_factory The response factory associated with the inference +/// request. +/// \param response_start Timestamp for the start of execution for this +/// response. +/// \param compute_output_start Timestamp for the start of extracting output +/// tensors for this response. Set this to 0 for reporting empty response. +/// \param response_end Timestamp for the end of extracting output tensors for +/// this response. +/// \param send_flags Flags associated with the response. \see +/// TRITONBACKEND_ResponseSend \see +/// \param error The TRITONSERVER_Error to send if the response is an error, or +/// nullptr if the response is successful. \see TRITONBACKEND_ResponseSend \see +/// \return a TRITONSERVER_Error indicating success or failure. +TRITONBACKEND_DECLSPEC TRITONSERVER_Error* +TRITONBACKEND_ModelInstanceReportResponseStatistics( + TRITONBACKEND_ModelInstance* instance, + TRITONBACKEND_ResponseFactory* response_factory, + const uint64_t response_start, const uint64_t compute_output_start, + const uint64_t response_end, const uint32_t send_flags, + TRITONSERVER_Error* error); + /// Record statistics for the execution of an entire batch of /// inference requests. 
 ///
diff --git a/src/backend_model_instance.cc b/src/backend_model_instance.cc
index a4a9bba2e..d303a88c4 100644
--- a/src/backend_model_instance.cc
+++ b/src/backend_model_instance.cc
@@ -1,4 +1,4 @@
-// Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions
@@ -953,6 +953,41 @@ TRITONBACKEND_ModelInstanceReportStatistics(
   return nullptr;  // success
 }
 
+TRITONAPI_DECLSPEC TRITONSERVER_Error*
+TRITONBACKEND_ModelInstanceReportResponseStatistics(
+    TRITONBACKEND_ModelInstance* instance,
+    TRITONBACKEND_ResponseFactory* response_factory,
+    const uint64_t response_start, const uint64_t compute_output_start,
+    const uint64_t response_end, const uint32_t send_flags,
+    TRITONSERVER_Error* error)
+{
+#ifdef TRITON_ENABLE_STATS
+  TritonModelInstance* ti = reinterpret_cast<TritonModelInstance*>(instance);
+  std::shared_ptr<InferenceResponseFactory>* rf =
+      reinterpret_cast<std::shared_ptr<InferenceResponseFactory>*>(
+          response_factory);
+  std::string key = std::to_string((*rf)->ResponseStatsIndex());
+
+  if (error == nullptr) {
+    if (compute_output_start > 0) {
+      RETURN_TRITONSERVER_ERROR_IF_ERROR(
+          ti->Model()->MutableStatsAggregator()->UpdateResponseSuccess(
+              key, response_start, compute_output_start, response_end));
+    } else {
+      RETURN_TRITONSERVER_ERROR_IF_ERROR(
+          ti->Model()->MutableStatsAggregator()->UpdateResponseEmpty(
+              key, response_start, response_end));
+    }
+  } else {
+    RETURN_TRITONSERVER_ERROR_IF_ERROR(
+        ti->Model()->MutableStatsAggregator()->UpdateResponseFail(
+            key, response_start, compute_output_start, response_end));
+  }
+#endif  // TRITON_ENABLE_STATS
+
+  return nullptr;  // success
+}
+
 TRITONAPI_DECLSPEC TRITONSERVER_Error*
 TRITONBACKEND_ModelInstanceReportBatchStatistics(
     TRITONBACKEND_ModelInstance* instance, const uint64_t batch_size,
diff --git a/src/infer_response.h b/src/infer_response.h
index 75d5f9d48..3dbaa7ee8 100644
--- a/src/infer_response.h
+++ b/src/infer_response.h
@@ -1,4 +1,4 @@
-// Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions
@@ -61,6 +61,10 @@ class InferenceResponseFactory {
         alloc_userp_(alloc_userp), response_fn_(response_fn),
         response_userp_(response_userp), response_delegator_(delegator),
         is_cancelled_(false)
+#ifdef TRITON_ENABLE_STATS
+        ,
+        response_stats_index_(0)
+#endif  // TRITON_ENABLE_STATS
   {
   }
 
@@ -94,6 +98,11 @@ class InferenceResponseFactory {
   void ReleaseTrace() { trace_ = nullptr; }
 #endif  // TRITON_ENABLE_TRACING
 
+#ifdef TRITON_ENABLE_STATS
+  // Return the current response statistics index and increment it.
+  uint64_t ResponseStatsIndex() { return response_stats_index_++; };
+#endif  // TRITON_ENABLE_STATS
+
  private:
   // The model associated with this factory. For normal
   // requests/responses this will always be defined and acts to keep
@@ -129,6 +138,11 @@ class InferenceResponseFactory {
   // Inference trace associated with this response.
   std::shared_ptr<InferenceTraceProxy> trace_;
 #endif  // TRITON_ENABLE_TRACING
+
+#ifdef TRITON_ENABLE_STATS
+  // Number of response statistics reported.
+  std::atomic<uint64_t> response_stats_index_;
+#endif  // TRITON_ENABLE_STATS
 };
 
 //
diff --git a/src/infer_stats.cc b/src/infer_stats.cc
index a1af3b9f2..0f47485c2 100644
--- a/src/infer_stats.cc
+++ b/src/infer_stats.cc
@@ -1,4 +1,4 @@
-// Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions
@@ -210,6 +210,132 @@ InferenceStatsAggregator::UpdateSuccessCacheMiss(
 #endif  // TRITON_ENABLE_METRICS
 }
 
+Status
+InferenceStatsAggregator::UpdateResponseSuccess(
+    const std::string& key, const uint64_t response_start_ns,
+    const uint64_t compute_output_start_ns, const uint64_t response_end_ns)
+{
+  if (response_start_ns > compute_output_start_ns) {
+    return Status(
+        Status::Code::INVALID_ARG,
+        "Response start cannot happen after compute output start");
+  }
+  if (compute_output_start_ns > response_end_ns) {
+    return Status(
+        Status::Code::INVALID_ARG,
+        "Compute output start cannot happen after response end");
+  }
+  const uint64_t compute_infer_duration_ns =
+      compute_output_start_ns - response_start_ns;
+  const uint64_t compute_output_duration_ns =
+      response_end_ns - compute_output_start_ns;
+  const uint64_t total_duration_ns = response_end_ns - response_start_ns;
+
+  {
+    std::lock_guard<std::mutex> lock(mu_);
+
+    auto it = response_stats_.find(key);
+    if (it == response_stats_.end()) {
+      it = response_stats_.emplace(key, InferResponseStats()).first;
+    }
+
+    it->second.compute_infer_count++;
+    it->second.compute_infer_duration_ns += compute_infer_duration_ns;
+    it->second.compute_output_count++;
+    it->second.compute_output_duration_ns += compute_output_duration_ns;
+    it->second.success_count++;
+    it->second.success_duration_ns += total_duration_ns;
+  }
+
+  return Status::Success;
+}
+
+Status
+InferenceStatsAggregator::UpdateResponseFail(
+    const std::string& key, const uint64_t response_start_ns,
+    const uint64_t compute_output_start_ns, const uint64_t response_end_ns)
+{
+  uint64_t compute_infer_duration_ns, compute_output_duration_ns,
+      total_duration_ns;
+  if (compute_output_start_ns > 0) {
+    // output tensors copied
+    if (response_start_ns > compute_output_start_ns) {
+      return Status(
+          Status::Code::INVALID_ARG,
+          "Response start cannot happen after compute output start");
+    }
+    if (compute_output_start_ns > response_end_ns) {
+      return Status(
+          Status::Code::INVALID_ARG,
+          "Compute output start cannot happen after response end");
+    }
+    compute_infer_duration_ns = compute_output_start_ns - response_start_ns;
+    compute_output_duration_ns = response_end_ns - compute_output_start_ns;
+    total_duration_ns = response_end_ns - response_start_ns;
+  } else {
+    // no output tensors copied
+    if (response_start_ns > response_end_ns) {
+      return Status(
+          Status::Code::INVALID_ARG,
+          "Response start cannot happen after response end");
+    }
+    compute_infer_duration_ns = response_end_ns - response_start_ns;
+    compute_output_duration_ns = 0;
+    total_duration_ns = response_end_ns - response_start_ns;
+  }
+
+  {
+    std::lock_guard<std::mutex> lock(mu_);
+
+    auto it = response_stats_.find(key);
+    if (it == response_stats_.end()) {
+      it = response_stats_.emplace(key, InferResponseStats()).first;
+    }
+
+    it->second.compute_infer_count++;
+    it->second.compute_infer_duration_ns += compute_infer_duration_ns;
+    if (compute_output_duration_ns > 0) {
+      it->second.compute_output_count++;
+      it->second.compute_output_duration_ns += compute_output_duration_ns;
+    }
+    it->second.fail_count++;
+    it->second.fail_duration_ns += total_duration_ns;
+  }
+
+  return Status::Success;
+}
+
+Status
+InferenceStatsAggregator::UpdateResponseEmpty(
+    const std::string& key, const uint64_t response_start_ns,
+    const uint64_t response_end_ns)
+{
+  if (response_start_ns > response_end_ns) {
+    return Status(
+        Status::Code::INVALID_ARG,
+        "Response start cannot happen after response end");
+  }
+  const uint64_t compute_infer_duration_ns =
+      response_end_ns - response_start_ns;
+  const uint64_t total_duration_ns = response_end_ns - response_start_ns;
+
+  {
+    std::lock_guard<std::mutex> lock(mu_);
+
+    auto it = response_stats_.find(key);
+    if (it == response_stats_.end()) {
+      it = response_stats_.emplace(key, InferResponseStats()).first;
+    }
+
+    it->second.compute_infer_count++;
+    it->second.compute_infer_duration_ns += compute_infer_duration_ns;
+    it->second.empty_response_count++;
+    it->second.empty_response_duration_ns += total_duration_ns;
+  }
+
+  return Status::Success;
+}
+
 void
 InferenceStatsAggregator::UpdateInferBatchStats(
     MetricModelReporter* metric_reporter, const size_t batch_size,
diff --git a/src/infer_stats.h b/src/infer_stats.h
index 86ad9a654..2ae2bc226 100644
--- a/src/infer_stats.h
+++ b/src/infer_stats.h
@@ -1,4 +1,4 @@
-// Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions
@@ -78,6 +78,27 @@ class InferenceStatsAggregator {
     uint64_t cache_miss_duration_ns_;
   };
 
+  struct InferResponseStats {
+    InferResponseStats()
+        : compute_infer_count(0), compute_infer_duration_ns(0),
+          compute_output_count(0), compute_output_duration_ns(0),
+          success_count(0), success_duration_ns(0), fail_count(0),
+          fail_duration_ns(0), empty_response_count(0),
+          empty_response_duration_ns(0)
+    {
+    }
+    uint64_t compute_infer_count;
+    uint64_t compute_infer_duration_ns;
+    uint64_t compute_output_count;
+    uint64_t compute_output_duration_ns;
+    uint64_t success_count;
+    uint64_t success_duration_ns;
+    uint64_t fail_count;
+    uint64_t fail_duration_ns;
+    uint64_t empty_response_count;
+    uint64_t empty_response_duration_ns;
+  };
+
   struct InferBatchStats {
     InferBatchStats()
         : count_(0), compute_input_duration_ns_(0),
@@ -100,6 +121,11 @@ class InferenceStatsAggregator {
   uint64_t InferenceCount() const { return inference_count_; }
   uint64_t ExecutionCount() const { return execution_count_; }
   const InferStats& ImmutableInferStats() const { return infer_stats_; }
+  const std::map<std::string, InferResponseStats>& ImmutableInferResponseStats()
+      const
+  {
+    return response_stats_;
+  }
   const std::map<std::string, InferBatchStats>& ImmutableInferBatchStats() const
   {
     return batch_stats_;
@@ -140,6 +166,21 @@ class InferenceStatsAggregator {
       MetricModelReporter* metric_reporter,
       const uint64_t cache_miss_duration_ns);
 
+  // Add durations to response stats for a successful response.
+  Status UpdateResponseSuccess(
+      const std::string& key, const uint64_t response_start_ns,
+      const uint64_t compute_output_start_ns, const uint64_t response_end_ns);
+
+  // Add durations to response stats for a failed response.
+  Status UpdateResponseFail(
+      const std::string& key, const uint64_t response_start_ns,
+      const uint64_t compute_output_start_ns, const uint64_t response_end_ns);
+
+  // Add durations to response stats for an empty response.
+  Status UpdateResponseEmpty(
+      const std::string& key, const uint64_t response_start_ns,
+      const uint64_t response_end_ns);
+
   // Add durations to batch infer stats for a batch execution.
   // 'success_request_count' is the number of success requests in the
   // batch that have infer_stats attached.
@@ -163,6 +204,7 @@ class InferenceStatsAggregator {
   uint64_t inference_count_;
   uint64_t execution_count_;
   InferStats infer_stats_;
+  std::map<std::string, InferResponseStats> response_stats_;
   std::map<std::string, InferBatchStats> batch_stats_;
 #endif  // TRITON_ENABLE_STATS
 };
diff --git a/src/tritonserver.cc b/src/tritonserver.cc
index 70bdddb75..f2d3c2624 100644
--- a/src/tritonserver.cc
+++ b/src/tritonserver.cc
@@ -1,4 +1,4 @@
-// Copyright 2019-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// Copyright 2019-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions
@@ -2897,6 +2897,8 @@ TRITONSERVER_ServerModelStatistics(
 
       // Add infer statistic
       const auto& infer_stats = model->StatsAggregator().ImmutableInferStats();
+      const auto& infer_response_stats =
+          model->StatsAggregator().ImmutableInferResponseStats();
       const auto& infer_batch_stats =
           model->StatsAggregator().ImmutableInferBatchStats();
 
@@ -2933,6 +2935,35 @@ TRITONSERVER_ServerModelStatistics(
           metadata, inference_stats, "cache_miss", infer_stats.cache_miss_count_,
           infer_stats.cache_miss_duration_ns_);
 
+      // Add response statistics
+      triton::common::TritonJson::Value response_stats(
+          metadata, triton::common::TritonJson::ValueType::OBJECT);
+      for (const auto& res_pair : infer_response_stats) {
+        triton::common::TritonJson::Value res_stat(
+            metadata, triton::common::TritonJson::ValueType::OBJECT);
+        SetDurationStat(
+            metadata, res_stat, "compute_infer",
+            res_pair.second.compute_infer_count,
+            res_pair.second.compute_infer_duration_ns);
+        SetDurationStat(
+            metadata, res_stat, "compute_output",
+            res_pair.second.compute_output_count,
+            res_pair.second.compute_output_duration_ns);
+        SetDurationStat(
+            metadata, res_stat, "success", res_pair.second.success_count,
+            res_pair.second.success_duration_ns);
+        SetDurationStat(
+            metadata, res_stat, "fail", res_pair.second.fail_count,
+            res_pair.second.fail_duration_ns);
+        SetDurationStat(
+            metadata, res_stat, "empty_response",
+            res_pair.second.empty_response_count,
+            res_pair.second.empty_response_duration_ns);
+        RETURN_IF_STATUS_ERROR(
+            response_stats.Add(res_pair.first.c_str(), std::move(res_stat)));
+      }
+
+      // Add batch statistics
       triton::common::TritonJson::Value batch_stats(
           metadata, triton::common::TritonJson::ValueType::ARRAY);
       for (const auto& batch : infer_batch_stats) {
@@ -2967,6 +2998,8 @@ TRITONSERVER_ServerModelStatistics(
 
       RETURN_IF_STATUS_ERROR(
           model_stat.Add("inference_stats", std::move(inference_stats)));
+      RETURN_IF_STATUS_ERROR(
+          model_stat.Add("response_stats", std::move(response_stats)));
       RETURN_IF_STATUS_ERROR(
           model_stat.Add("batch_stats", std::move(batch_stats)));
       RETURN_IF_STATUS_ERROR(
diff --git a/src/tritonserver_stub.cc b/src/tritonserver_stub.cc
index e6efcda12..f3bafe4b0 100644
--- a/src/tritonserver_stub.cc
+++ b/src/tritonserver_stub.cc
@@ -1,4 +1,4 @@
-// Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -947,6 +947,10 @@ TRITONBACKEND_ModelInstanceReportStatistics() { } TRITONAPI_DECLSPEC void +TRITONBACKEND_ModelInstanceReportResponseStatistics() +{ +} +TRITONAPI_DECLSPEC void TRITONBACKEND_ModelInstanceReportBatchStatistics() { } From c920b3bc7fe80aa55011622deb24abce277c996b Mon Sep 17 00:00:00 2001 From: kthui <18255193+kthui@users.noreply.github.com> Date: Wed, 7 Feb 2024 18:31:02 -0800 Subject: [PATCH 02/10] Backend api version 1.19 --- include/triton/core/tritonbackend.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/triton/core/tritonbackend.h b/include/triton/core/tritonbackend.h index 82e8f75af..560270f9d 100644 --- a/include/triton/core/tritonbackend.h +++ b/include/triton/core/tritonbackend.h @@ -94,7 +94,7 @@ struct TRITONBACKEND_Batcher; /// } /// #define TRITONBACKEND_API_VERSION_MAJOR 1 -#define TRITONBACKEND_API_VERSION_MINOR 18 +#define TRITONBACKEND_API_VERSION_MINOR 19 /// Get the TRITONBACKEND API version supported by Triton. This value /// can be compared against the TRITONBACKEND_API_VERSION_MAJOR and From 021204e8baac2686af7967b5efc49424c1f35c14 Mon Sep 17 00:00:00 2001 From: kthui <18255193+kthui@users.noreply.github.com> Date: Thu, 8 Feb 2024 11:53:23 -0800 Subject: [PATCH 03/10] Fix see comment --- include/triton/core/tritonbackend.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/triton/core/tritonbackend.h b/include/triton/core/tritonbackend.h index 560270f9d..8b0c4f3d9 100644 --- a/include/triton/core/tritonbackend.h +++ b/include/triton/core/tritonbackend.h @@ -1389,9 +1389,9 @@ TRITONBACKEND_ModelInstanceReportStatistics( /// \param response_end Timestamp for the end of extracting output tensors for /// this response. /// \param send_flags Flags associated with the response. \see -/// TRITONBACKEND_ResponseSend \see +/// TRITONBACKEND_ResponseSend /// \param error The TRITONSERVER_Error to send if the response is an error, or -/// nullptr if the response is successful. \see TRITONBACKEND_ResponseSend \see +/// nullptr if the response is successful. \see TRITONBACKEND_ResponseSend /// \return a TRITONSERVER_Error indicating success or failure. 
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_ModelInstanceReportResponseStatistics( From 629063e283d53d7ad442e27e53ed75fad82fbef7 Mon Sep 17 00:00:00 2001 From: kthui <18255193+kthui@users.noreply.github.com> Date: Thu, 8 Feb 2024 14:58:54 -0800 Subject: [PATCH 04/10] Rename ResponseStatsIndex() to GetAndIncrementResponseIndex() --- src/backend_model_instance.cc | 2 +- src/infer_response.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/backend_model_instance.cc b/src/backend_model_instance.cc index d303a88c4..00a204fcf 100644 --- a/src/backend_model_instance.cc +++ b/src/backend_model_instance.cc @@ -966,7 +966,7 @@ TRITONBACKEND_ModelInstanceReportResponseStatistics( std::shared_ptr* rf = reinterpret_cast*>( response_factory); - std::string key = std::to_string((*rf)->ResponseStatsIndex()); + std::string key = std::to_string((*rf)->GetAndIncrementResponseIndex()); if (error == nullptr) { if (compute_output_start > 0) { diff --git a/src/infer_response.h b/src/infer_response.h index 3dbaa7ee8..612f8c1fe 100644 --- a/src/infer_response.h +++ b/src/infer_response.h @@ -100,7 +100,7 @@ class InferenceResponseFactory { #ifdef TRITON_ENABLE_STATS // Return the current response statistics index and increment it. - uint64_t ResponseStatsIndex() { return response_stats_index_++; }; + uint64_t GetAndIncrementResponseIndex() { return response_stats_index_++; }; #endif // TRITON_ENABLE_STATS private: From 3336bf5dcdb639d4b8af3c0bd441d3c2cd0828ce Mon Sep 17 00:00:00 2001 From: kthui <18255193+kthui@users.noreply.github.com> Date: Thu, 8 Feb 2024 15:35:32 -0800 Subject: [PATCH 05/10] Clarify error object ownership --- include/triton/core/tritonbackend.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/include/triton/core/tritonbackend.h b/include/triton/core/tritonbackend.h index 8b0c4f3d9..2b8192e2c 100644 --- a/include/triton/core/tritonbackend.h +++ b/include/triton/core/tritonbackend.h @@ -761,8 +761,9 @@ TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_ResponseOutput( /// \param send_flags Flags associated with the response. \see /// TRITONSERVER_ResponseCompleteFlag. \see /// TRITONSERVER_InferenceResponseCompleteFn_t. -/// \param error The TRITONSERVER_Error to send if the response is an -/// error, or nullptr if the response is successful. +/// \param error The TRITONSERVER_Error to send if the response is an error, or +/// nullptr if the response is successful. The caller retains ownership to the +/// error object and must free it with TRITONSERVER_ErrorDelete. /// \return a TRITONSERVER_Error indicating success or failure. TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_ResponseSend( TRITONBACKEND_Response* response, const uint32_t send_flags, @@ -1392,6 +1393,8 @@ TRITONBACKEND_ModelInstanceReportStatistics( /// TRITONBACKEND_ResponseSend /// \param error The TRITONSERVER_Error to send if the response is an error, or /// nullptr if the response is successful. \see TRITONBACKEND_ResponseSend +/// The caller retains ownership to the error object and must free it with +/// TRITONSERVER_ErrorDelete. /// \return a TRITONSERVER_Error indicating success or failure. 
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_ModelInstanceReportResponseStatistics( From 527e036ea2c10922307c1dbd22562b9081a409d1 Mon Sep 17 00:00:00 2001 From: kthui <18255193+kthui@users.noreply.github.com> Date: Tue, 13 Feb 2024 15:48:56 -0800 Subject: [PATCH 06/10] Move API parameters into a struct --- include/triton/core/tritonbackend.h | 71 +++++++++++++++++++---------- src/backend_model_instance.cc | 24 +++++----- 2 files changed, 58 insertions(+), 37 deletions(-) diff --git a/include/triton/core/tritonbackend.h b/include/triton/core/tritonbackend.h index 2b8192e2c..54661ed35 100644 --- a/include/triton/core/tritonbackend.h +++ b/include/triton/core/tritonbackend.h @@ -68,6 +68,23 @@ struct TRITONBACKEND_ModelInstance; struct TRITONBACKEND_BackendAttribute; struct TRITONBACKEND_Batcher; +struct TRITONBACKEND_ModelInstanceResponseStatistics { +#ifdef TRITON_ENABLE_STATS + TRITONBACKEND_ModelInstanceResponseStatistics() + : model_instance(nullptr), response_factory(nullptr), response_start(0), + compute_output_start(0), response_end(0), send_flags(0), error(nullptr) + { + } + TRITONBACKEND_ModelInstance* model_instance; + TRITONBACKEND_ResponseFactory* response_factory; + uint64_t response_start; + uint64_t compute_output_start; + uint64_t response_end; + uint32_t send_flags; + TRITONSERVER_Error* error; +#endif // TRITON_ENABLE_STATS +}; + /// /// TRITONBACKEND API Version /// @@ -1362,11 +1379,8 @@ TRITONBACKEND_ModelInstanceReportStatistics( /// All timestamps should be reported in nanonseconds and collected using /// std::chrono::steady_clock::now().time_since_epoch() or the equivalent. /// -/// Call this function after calling TRITONBACKEND_ResponseSend, and pass the -/// same send_flags and error object passed to the TRITONBACKEND_ResponseSend. -/// -/// For consistency of measurement across different backends, the -/// timestamps should be collected at the following points during +/// For consistency of measurement across different backends, the timestamps +/// should be collected at the following points during /// TRITONBACKEND_ModelInstanceExecute. /// /// TRITONBACKEND_ModelInstanceExecute() @@ -1380,29 +1394,36 @@ TRITONBACKEND_ModelInstanceReportStatistics( /// < end of this response > /// return /// -/// \param instance The model instance. -/// \param response_factory The response factory associated with the inference -/// request. -/// \param response_start Timestamp for the start of execution for this -/// response. -/// \param compute_output_start Timestamp for the start of extracting output -/// tensors for this response. Set this to 0 for reporting empty response. -/// \param response_end Timestamp for the end of extracting output tensors for -/// this response. -/// \param send_flags Flags associated with the response. \see -/// TRITONBACKEND_ResponseSend -/// \param error The TRITONSERVER_Error to send if the response is an error, or -/// nullptr if the response is successful. \see TRITONBACKEND_ResponseSend -/// The caller retains ownership to the error object and must free it with -/// TRITONSERVER_ErrorDelete. +/// Create the statistics object after calling TRITONBACKEND_ResponseSend. Use +/// the same send_flags and error object passed to the +/// TRITONBACKEND_ResponseSend. +/// +/// The statistics object is a struct consists of the following fields, subject +/// to addition in the future. +/// +/// TRITONBACKEND_ModelInstance* model_instance; +/// The model instance. 
+/// TRITONBACKEND_ResponseFactory* response_factory; +/// The response factory associated with the inference request. +/// const uint64_t response_start; +/// Timestamp for the start of execution for this response. +/// const uint64_t compute_output_start; +/// Timestamp for the start of extracting output tensors for this response. +/// Set this to 0 for reporting empty response. +/// const uint64_t response_end; +/// Timestamp for the end of extracting output tensors for this response. +/// const uint32_t send_flags; +/// Flags associated with the response. \see TRITONBACKEND_ResponseSend +/// TRITONSERVER_Error* error; +/// The TRITONSERVER_Error to send if the response is an error, or nullptr +/// if the response is successful. \see TRITONBACKEND_ResponseSend. +/// +/// \param response_statistics The statistics to be recorded. The caller retains +/// ownership to the object and must free it after this function returns. /// \return a TRITONSERVER_Error indicating success or failure. TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_ModelInstanceReportResponseStatistics( - TRITONBACKEND_ModelInstance* instance, - TRITONBACKEND_ResponseFactory* response_factory, - const uint64_t response_start, const uint64_t compute_output_start, - const uint64_t response_end, const uint32_t send_flags, - TRITONSERVER_Error* error); + TRITONBACKEND_ModelInstanceResponseStatistics* response_statistics); /// Record statistics for the execution of an entire batch of /// inference requests. diff --git a/src/backend_model_instance.cc b/src/backend_model_instance.cc index 00a204fcf..802ba3510 100644 --- a/src/backend_model_instance.cc +++ b/src/backend_model_instance.cc @@ -955,33 +955,33 @@ TRITONBACKEND_ModelInstanceReportStatistics( TRITONAPI_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_ModelInstanceReportResponseStatistics( - TRITONBACKEND_ModelInstance* instance, - TRITONBACKEND_ResponseFactory* response_factory, - const uint64_t response_start, const uint64_t compute_output_start, - const uint64_t response_end, const uint32_t send_flags, - TRITONSERVER_Error* error) + TRITONBACKEND_ModelInstanceResponseStatistics* response_statistics) { #ifdef TRITON_ENABLE_STATS - TritonModelInstance* ti = reinterpret_cast(instance); + TRITONBACKEND_ModelInstanceResponseStatistics* rs = response_statistics; + TritonModelInstance* ti = + reinterpret_cast(rs->model_instance); std::shared_ptr* rf = reinterpret_cast*>( - response_factory); + rs->response_factory); std::string key = std::to_string((*rf)->GetAndIncrementResponseIndex()); - if (error == nullptr) { - if (compute_output_start > 0) { + if (rs->error == nullptr) { + if (rs->compute_output_start > 0) { RETURN_TRITONSERVER_ERROR_IF_ERROR( ti->Model()->MutableStatsAggregator()->UpdateResponseSuccess( - key, response_start, compute_output_start, response_end)); + key, rs->response_start, rs->compute_output_start, + rs->response_end)); } else { RETURN_TRITONSERVER_ERROR_IF_ERROR( ti->Model()->MutableStatsAggregator()->UpdateResponseEmpty( - key, response_start, response_end)); + key, rs->response_start, rs->response_end)); } } else { RETURN_TRITONSERVER_ERROR_IF_ERROR( ti->Model()->MutableStatsAggregator()->UpdateResponseFail( - key, response_start, compute_output_start, response_end)); + key, rs->response_start, rs->compute_output_start, + rs->response_end)); } #endif // TRITON_ENABLE_STATS From f18659000e627e98774a32481d23304e73678d0c Mon Sep 17 00:00:00 2001 From: kthui <18255193+kthui@users.noreply.github.com> Date: Tue, 13 Feb 2024 16:16:12 -0800 Subject: 
[PATCH 07/10] Fix typo --- include/triton/core/tritonbackend.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/include/triton/core/tritonbackend.h b/include/triton/core/tritonbackend.h index 54661ed35..a11cb740c 100644 --- a/include/triton/core/tritonbackend.h +++ b/include/triton/core/tritonbackend.h @@ -1337,16 +1337,16 @@ TRITONBACKEND_ModelInstanceReportMemoryUsage( /// TRITONBACKEND_ModelInstanceExecute. /// /// TRITONBACKEND_ModelInstanceExecute() -/// CAPTURE TIMESPACE (exec_start_ns) +/// CAPTURE TIMESTAMP (exec_start_ns) /// < process input tensors to prepare them for inference /// execution, including copying the tensors to/from GPU if /// necessary> -/// CAPTURE TIMESPACE (compute_start_ns) +/// CAPTURE TIMESTAMP (compute_start_ns) /// < perform inference computations to produce outputs > -/// CAPTURE TIMESPACE (compute_end_ns) +/// CAPTURE TIMESTAMP (compute_end_ns) /// < allocate output buffers and extract output tensors, including /// copying the tensors to/from GPU if necessary> -/// CAPTURE TIMESPACE (exec_end_ns) +/// CAPTURE TIMESTAMP (exec_end_ns) /// return /// /// Note that these statistics are associated with a valid @@ -1385,12 +1385,12 @@ TRITONBACKEND_ModelInstanceReportStatistics( /// /// TRITONBACKEND_ModelInstanceExecute() /// < start of this response > -/// CAPTURE TIMESPACE (response_start) +/// CAPTURE TIMESTAMP (response_start) /// < generate this response > -/// CAPTURE TIMESPACE (compute_output_start) +/// CAPTURE TIMESTAMP (compute_output_start) /// < allocate output buffers and extract output tensors, including copying /// the tensors to/from GPU if necessary > -/// CAPTURE TIMESPACE (response_end) +/// CAPTURE TIMESTAMP (response_end) /// < end of this response > /// return /// From 18f4a5002e5827bbc4f7cefb57f71c9bd9aee480 Mon Sep 17 00:00:00 2001 From: kthui <18255193+kthui@users.noreply.github.com> Date: Wed, 14 Feb 2024 17:35:54 -0800 Subject: [PATCH 08/10] Make API parameters struct opaque --- include/triton/core/tritonbackend.h | 165 +++++++++++++++++++++------- src/backend_model_instance.cc | 158 +++++++++++++++++++++++--- src/tritonserver_stub.cc | 32 ++++++ 3 files changed, 296 insertions(+), 59 deletions(-) diff --git a/include/triton/core/tritonbackend.h b/include/triton/core/tritonbackend.h index a11cb740c..ef0f704a4 100644 --- a/include/triton/core/tritonbackend.h +++ b/include/triton/core/tritonbackend.h @@ -65,26 +65,10 @@ struct TRITONBACKEND_Response; struct TRITONBACKEND_Backend; struct TRITONBACKEND_Model; struct TRITONBACKEND_ModelInstance; +struct TRITONBACKEND_ModelInstanceResponseStatistics; struct TRITONBACKEND_BackendAttribute; struct TRITONBACKEND_Batcher; -struct TRITONBACKEND_ModelInstanceResponseStatistics { -#ifdef TRITON_ENABLE_STATS - TRITONBACKEND_ModelInstanceResponseStatistics() - : model_instance(nullptr), response_factory(nullptr), response_start(0), - compute_output_start(0), response_end(0), send_flags(0), error(nullptr) - { - } - TRITONBACKEND_ModelInstance* model_instance; - TRITONBACKEND_ResponseFactory* response_factory; - uint64_t response_start; - uint64_t compute_output_start; - uint64_t response_end; - uint32_t send_flags; - TRITONSERVER_Error* error; -#endif // TRITON_ENABLE_STATS -}; - /// /// TRITONBACKEND API Version /// @@ -1374,7 +1358,46 @@ TRITONBACKEND_ModelInstanceReportStatistics( const uint64_t compute_start_ns, const uint64_t compute_end_ns, const uint64_t exec_end_ns); -/// Record statistics for a decoupled inference response. 
+/// Create a new decoupled inference response statistics object. +/// +/// \param response_statistics The new response statistics object to be created. +/// \return a TRITONSERVER_Error indicating success or failure. +TRITONBACKEND_DECLSPEC TRITONSERVER_Error* +TRITONBACKEND_ModelInstanceResponseStatisticsNew( + TRITONBACKEND_ModelInstanceResponseStatistics** response_statistics); + +/// Delete a decoupled inference response statistics object. +/// +/// The caller retains ownership to the objects set on the deleted response +/// statistics object and must free them separately. +/// +/// \param response_statistics The response statistics object to be deleted. +/// \return a TRITONSERVER_Error indicating success or failure. +TRITONBACKEND_DECLSPEC TRITONSERVER_Error* +TRITONBACKEND_ModelInstanceResponseStatisticsDelete( + TRITONBACKEND_ModelInstanceResponseStatistics* response_statistics); + +/// Set model instance to a decoupled inference response statistics object. +/// +/// \param response_statistics The response statistics object. +/// \param model_instance The model instance. +/// \return a TRITONSERVER_Error indicating success or failure. +TRITONBACKEND_DECLSPEC TRITONSERVER_Error* +TRITONBACKEND_ModelInstanceResponseStatisticsSetModelInstance( + TRITONBACKEND_ModelInstanceResponseStatistics* response_statistics, + TRITONBACKEND_ModelInstance* model_instance); + +/// Set response factory to a decoupled inference response statistics object. +/// +/// \param response_statistics The response statistics object. +/// \param response_factory The response factory. +/// \return a TRITONSERVER_Error indicating success or failure. +TRITONBACKEND_DECLSPEC TRITONSERVER_Error* +TRITONBACKEND_ModelInstanceResponseStatisticsSetResponseFactory( + TRITONBACKEND_ModelInstanceResponseStatistics* response_statistics, + TRITONBACKEND_ResponseFactory* response_factory); + +/// Set response start time to a decoupled inference response statistics object. /// /// All timestamps should be reported in nanonseconds and collected using /// std::chrono::steady_clock::now().time_since_epoch() or the equivalent. @@ -1394,32 +1417,90 @@ TRITONBACKEND_ModelInstanceReportStatistics( /// < end of this response > /// return /// -/// Create the statistics object after calling TRITONBACKEND_ResponseSend. Use -/// the same send_flags and error object passed to the +/// \param response_statistics The response statistics object. +/// \param response_start The response start time. +/// \return a TRITONSERVER_Error indicating success or failure. +TRITONBACKEND_DECLSPEC TRITONSERVER_Error* +TRITONBACKEND_ModelInstanceResponseStatisticsSetResponseStart( + TRITONBACKEND_ModelInstanceResponseStatistics* response_statistics, + uint64_t response_start); + +/// Set compute output start time to a decoupled inference response statistics +/// object. Set this to 0 for reporting an empty response. +/// +/// All timestamps should be reported in nanonseconds and collected using +/// std::chrono::steady_clock::now().time_since_epoch() or the equivalent. +/// +/// For consistency of measurement across different backends, the timestamps +/// should be collected at the following points during +/// TRITONBACKEND_ModelInstanceExecute. 
+/// +/// TRITONBACKEND_ModelInstanceExecute() +/// < start of this response > +/// CAPTURE TIMESTAMP (response_start) +/// < generate this response > +/// CAPTURE TIMESTAMP (compute_output_start) +/// < allocate output buffers and extract output tensors, including copying +/// the tensors to/from GPU if necessary > +/// CAPTURE TIMESTAMP (response_end) +/// < end of this response > +/// return +/// +/// \param response_statistics The response statistics object. +/// \param compute_output_start The compute output start time. +/// \return a TRITONSERVER_Error indicating success or failure. +TRITONBACKEND_DECLSPEC TRITONSERVER_Error* +TRITONBACKEND_ModelInstanceResponseStatisticsSetComputeOutputStart( + TRITONBACKEND_ModelInstanceResponseStatistics* response_statistics, + uint64_t compute_output_start); + +/// Set response end time to a decoupled inference response statistics object. +/// +/// All timestamps should be reported in nanonseconds and collected using +/// std::chrono::steady_clock::now().time_since_epoch() or the equivalent. +/// +/// For consistency of measurement across different backends, the timestamps +/// should be collected at the following points during +/// TRITONBACKEND_ModelInstanceExecute. +/// +/// TRITONBACKEND_ModelInstanceExecute() +/// < start of this response > +/// CAPTURE TIMESTAMP (response_start) +/// < generate this response > +/// CAPTURE TIMESTAMP (compute_output_start) +/// < allocate output buffers and extract output tensors, including copying +/// the tensors to/from GPU if necessary > +/// CAPTURE TIMESTAMP (response_end) +/// < end of this response > +/// return +/// +/// \param response_statistics The response statistics object. +/// \param response_end The response end time. +/// \return a TRITONSERVER_Error indicating success or failure. +TRITONBACKEND_DECLSPEC TRITONSERVER_Error* +TRITONBACKEND_ModelInstanceResponseStatisticsSetResponseEnd( + TRITONBACKEND_ModelInstanceResponseStatistics* response_statistics, + uint64_t response_end); + +/// Set error to a decoupled inference response statistics object. +/// +/// Use the same error object passed to the TRITONBACKEND_ResponseSend. \see /// TRITONBACKEND_ResponseSend. /// -/// The statistics object is a struct consists of the following fields, subject -/// to addition in the future. -/// -/// TRITONBACKEND_ModelInstance* model_instance; -/// The model instance. -/// TRITONBACKEND_ResponseFactory* response_factory; -/// The response factory associated with the inference request. -/// const uint64_t response_start; -/// Timestamp for the start of execution for this response. -/// const uint64_t compute_output_start; -/// Timestamp for the start of extracting output tensors for this response. -/// Set this to 0 for reporting empty response. -/// const uint64_t response_end; -/// Timestamp for the end of extracting output tensors for this response. -/// const uint32_t send_flags; -/// Flags associated with the response. \see TRITONBACKEND_ResponseSend -/// TRITONSERVER_Error* error; -/// The TRITONSERVER_Error to send if the response is an error, or nullptr -/// if the response is successful. \see TRITONBACKEND_ResponseSend. -/// -/// \param response_statistics The statistics to be recorded. The caller retains -/// ownership to the object and must free it after this function returns. +/// \param response_statistics The response statistics object. +/// \param error The error object. +/// \return a TRITONSERVER_Error indicating success or failure. 
+TRITONBACKEND_DECLSPEC TRITONSERVER_Error* +TRITONBACKEND_ModelInstanceResponseStatisticsSetError( + TRITONBACKEND_ModelInstanceResponseStatistics* response_statistics, + TRITONSERVER_Error* error); + +/// Record statistics for a decoupled inference response. +/// +/// The caller retains ownership to the response statistics and must free it +/// after this function returns. +/// +/// \param response_statistics The statistics to be recorded. /// \return a TRITONSERVER_Error indicating success or failure. TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_ModelInstanceReportResponseStatistics( diff --git a/src/backend_model_instance.cc b/src/backend_model_instance.cc index 802ba3510..5e43e093d 100644 --- a/src/backend_model_instance.cc +++ b/src/backend_model_instance.cc @@ -795,6 +795,23 @@ TritonModelInstance::TritonBackendThread::BackendThread() LOG_VERBOSE(1) << "Stopping backend thread for " << name_ << "..."; } +// Opaque object for the response statistics C-API +struct ModelInstanceResponseStatistics { +#ifdef TRITON_ENABLE_STATS + ModelInstanceResponseStatistics() + : model_instance(nullptr), response_factory(nullptr), response_start(0), + compute_output_start(0), response_end(0), error(nullptr) + { + } + TritonModelInstance* model_instance; + std::shared_ptr* response_factory; + uint64_t response_start; + uint64_t compute_output_start; + uint64_t response_end; + TRITONSERVER_Error* error; +#endif // TRITON_ENABLE_STATS +}; + extern "C" { TRITONAPI_DECLSPEC TRITONSERVER_Error* @@ -954,34 +971,141 @@ TRITONBACKEND_ModelInstanceReportStatistics( } TRITONAPI_DECLSPEC TRITONSERVER_Error* -TRITONBACKEND_ModelInstanceReportResponseStatistics( +TRITONBACKEND_ModelInstanceResponseStatisticsNew( + TRITONBACKEND_ModelInstanceResponseStatistics** response_statistics) +{ +#ifdef TRITON_ENABLE_STATS + *response_statistics = + reinterpret_cast( + new ModelInstanceResponseStatistics()); +#endif // TRITON_ENABLE_STATS + + return nullptr; // success +} + +TRITONAPI_DECLSPEC TRITONSERVER_Error* +TRITONBACKEND_ModelInstanceResponseStatisticsDelete( TRITONBACKEND_ModelInstanceResponseStatistics* response_statistics) { #ifdef TRITON_ENABLE_STATS - TRITONBACKEND_ModelInstanceResponseStatistics* rs = response_statistics; - TritonModelInstance* ti = - reinterpret_cast(rs->model_instance); - std::shared_ptr* rf = + delete reinterpret_cast( + response_statistics); +#endif // TRITON_ENABLE_STATS + + return nullptr; // success +} + +TRITONAPI_DECLSPEC TRITONSERVER_Error* +TRITONBACKEND_ModelInstanceResponseStatisticsSetModelInstance( + TRITONBACKEND_ModelInstanceResponseStatistics* response_statistics, + TRITONBACKEND_ModelInstance* model_instance) +{ +#ifdef TRITON_ENABLE_STATS + ModelInstanceResponseStatistics* rs = + reinterpret_cast(response_statistics); + rs->model_instance = reinterpret_cast(model_instance); +#endif // TRITON_ENABLE_STATS + + return nullptr; // success +} + +TRITONAPI_DECLSPEC TRITONSERVER_Error* +TRITONBACKEND_ModelInstanceResponseStatisticsSetResponseFactory( + TRITONBACKEND_ModelInstanceResponseStatistics* response_statistics, + TRITONBACKEND_ResponseFactory* response_factory) +{ +#ifdef TRITON_ENABLE_STATS + ModelInstanceResponseStatistics* rs = + reinterpret_cast(response_statistics); + rs->response_factory = reinterpret_cast*>( - rs->response_factory); - std::string key = std::to_string((*rf)->GetAndIncrementResponseIndex()); + response_factory); + ; +#endif // TRITON_ENABLE_STATS + + return nullptr; // success +} + +TRITONAPI_DECLSPEC TRITONSERVER_Error* 
+TRITONBACKEND_ModelInstanceResponseStatisticsSetResponseStart( + TRITONBACKEND_ModelInstanceResponseStatistics* response_statistics, + uint64_t response_start) +{ +#ifdef TRITON_ENABLE_STATS + ModelInstanceResponseStatistics* rs = + reinterpret_cast(response_statistics); + rs->response_start = response_start; +#endif // TRITON_ENABLE_STATS + + return nullptr; // success +} + +TRITONAPI_DECLSPEC TRITONSERVER_Error* +TRITONBACKEND_ModelInstanceResponseStatisticsSetComputeOutputStart( + TRITONBACKEND_ModelInstanceResponseStatistics* response_statistics, + uint64_t compute_output_start) +{ +#ifdef TRITON_ENABLE_STATS + ModelInstanceResponseStatistics* rs = + reinterpret_cast(response_statistics); + rs->compute_output_start = compute_output_start; +#endif // TRITON_ENABLE_STATS + + return nullptr; // success +} + +TRITONAPI_DECLSPEC TRITONSERVER_Error* +TRITONBACKEND_ModelInstanceResponseStatisticsSetResponseEnd( + TRITONBACKEND_ModelInstanceResponseStatistics* response_statistics, + uint64_t response_end) +{ +#ifdef TRITON_ENABLE_STATS + ModelInstanceResponseStatistics* rs = + reinterpret_cast(response_statistics); + rs->response_end = response_end; +#endif // TRITON_ENABLE_STATS + + return nullptr; // success +} + +TRITONAPI_DECLSPEC TRITONSERVER_Error* +TRITONBACKEND_ModelInstanceResponseStatisticsSetError( + TRITONBACKEND_ModelInstanceResponseStatistics* response_statistics, + TRITONSERVER_Error* error) +{ +#ifdef TRITON_ENABLE_STATS + ModelInstanceResponseStatistics* rs = + reinterpret_cast(response_statistics); + rs->error = error; +#endif // TRITON_ENABLE_STATS + + return nullptr; // success +} + +TRITONAPI_DECLSPEC TRITONSERVER_Error* +TRITONBACKEND_ModelInstanceReportResponseStatistics( + TRITONBACKEND_ModelInstanceResponseStatistics* response_statistics) +{ +#ifdef TRITON_ENABLE_STATS + ModelInstanceResponseStatistics* rs = + reinterpret_cast(response_statistics); + + InferenceStatsAggregator* sa = + rs->model_instance->Model()->MutableStatsAggregator(); + std::string key = + std::to_string((*rs->response_factory)->GetAndIncrementResponseIndex()); if (rs->error == nullptr) { if (rs->compute_output_start > 0) { - RETURN_TRITONSERVER_ERROR_IF_ERROR( - ti->Model()->MutableStatsAggregator()->UpdateResponseSuccess( - key, rs->response_start, rs->compute_output_start, - rs->response_end)); + RETURN_TRITONSERVER_ERROR_IF_ERROR(sa->UpdateResponseSuccess( + key, rs->response_start, rs->compute_output_start, rs->response_end)); } else { RETURN_TRITONSERVER_ERROR_IF_ERROR( - ti->Model()->MutableStatsAggregator()->UpdateResponseEmpty( - key, rs->response_start, rs->response_end)); + sa->UpdateResponseEmpty(key, rs->response_start, rs->response_end)); } } else { - RETURN_TRITONSERVER_ERROR_IF_ERROR( - ti->Model()->MutableStatsAggregator()->UpdateResponseFail( - key, rs->response_start, rs->compute_output_start, - rs->response_end)); + RETURN_TRITONSERVER_ERROR_IF_ERROR(sa->UpdateResponseFail( + key, rs->response_start, rs->compute_output_start, rs->response_end)); } #endif // TRITON_ENABLE_STATS diff --git a/src/tritonserver_stub.cc b/src/tritonserver_stub.cc index f3bafe4b0..dc9bcca52 100644 --- a/src/tritonserver_stub.cc +++ b/src/tritonserver_stub.cc @@ -947,6 +947,38 @@ TRITONBACKEND_ModelInstanceReportStatistics() { } TRITONAPI_DECLSPEC void +TRITONBACKEND_ModelInstanceResponseStatisticsNew() +{ +} +TRITONAPI_DECLSPEC void +TRITONBACKEND_ModelInstanceResponseStatisticsSetModelInstance() +{ +} +TRITONAPI_DECLSPEC void +TRITONBACKEND_ModelInstanceResponseStatisticsSetResponseFactory() +{ +} 
+TRITONAPI_DECLSPEC void +TRITONBACKEND_ModelInstanceResponseStatisticsSetResponseStart() +{ +} +TRITONAPI_DECLSPEC void +TRITONBACKEND_ModelInstanceResponseStatisticsSetComputeOutputStart() +{ +} +TRITONAPI_DECLSPEC void +TRITONBACKEND_ModelInstanceResponseStatisticsSetResponseEnd() +{ +} +TRITONAPI_DECLSPEC void +TRITONBACKEND_ModelInstanceResponseStatisticsSetError() +{ +} +TRITONAPI_DECLSPEC void +TRITONBACKEND_ModelInstanceResponseStatisticsDelete() +{ +} +TRITONAPI_DECLSPEC void TRITONBACKEND_ModelInstanceReportResponseStatistics() { } From f295284c23ac21d7193da9ad438a2f0d25a48a87 Mon Sep 17 00:00:00 2001 From: kthui <18255193+kthui@users.noreply.github.com> Date: Thu, 15 Feb 2024 13:03:31 -0800 Subject: [PATCH 09/10] Update comment --- include/triton/core/tritonbackend.h | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/include/triton/core/tritonbackend.h b/include/triton/core/tritonbackend.h index ef0f704a4..69379ca0c 100644 --- a/include/triton/core/tritonbackend.h +++ b/include/triton/core/tritonbackend.h @@ -1358,7 +1358,7 @@ TRITONBACKEND_ModelInstanceReportStatistics( const uint64_t compute_start_ns, const uint64_t compute_end_ns, const uint64_t exec_end_ns); -/// Create a new decoupled inference response statistics object. +/// Create a new inference response statistics object. /// /// \param response_statistics The new response statistics object to be created. /// \return a TRITONSERVER_Error indicating success or failure. @@ -1366,7 +1366,7 @@ TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_ModelInstanceResponseStatisticsNew( TRITONBACKEND_ModelInstanceResponseStatistics** response_statistics); -/// Delete a decoupled inference response statistics object. +/// Delete an inference response statistics object. /// /// The caller retains ownership to the objects set on the deleted response /// statistics object and must free them separately. @@ -1377,7 +1377,7 @@ TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_ModelInstanceResponseStatisticsDelete( TRITONBACKEND_ModelInstanceResponseStatistics* response_statistics); -/// Set model instance to a decoupled inference response statistics object. +/// Set model instance to an inference response statistics object. /// /// \param response_statistics The response statistics object. /// \param model_instance The model instance. @@ -1387,7 +1387,7 @@ TRITONBACKEND_ModelInstanceResponseStatisticsSetModelInstance( TRITONBACKEND_ModelInstanceResponseStatistics* response_statistics, TRITONBACKEND_ModelInstance* model_instance); -/// Set response factory to a decoupled inference response statistics object. +/// Set response factory to an inference response statistics object. /// /// \param response_statistics The response statistics object. /// \param response_factory The response factory. @@ -1397,7 +1397,7 @@ TRITONBACKEND_ModelInstanceResponseStatisticsSetResponseFactory( TRITONBACKEND_ModelInstanceResponseStatistics* response_statistics, TRITONBACKEND_ResponseFactory* response_factory); -/// Set response start time to a decoupled inference response statistics object. +/// Set response start time to an inference response statistics object. /// /// All timestamps should be reported in nanonseconds and collected using /// std::chrono::steady_clock::now().time_since_epoch() or the equivalent. 
@@ -1425,8 +1425,8 @@ TRITONBACKEND_ModelInstanceResponseStatisticsSetResponseStart( TRITONBACKEND_ModelInstanceResponseStatistics* response_statistics, uint64_t response_start); -/// Set compute output start time to a decoupled inference response statistics -/// object. Set this to 0 for reporting an empty response. +/// Set compute output start time to an inference response statistics object. +/// Set this to 0 for reporting an empty response. /// /// All timestamps should be reported in nanonseconds and collected using /// std::chrono::steady_clock::now().time_since_epoch() or the equivalent. @@ -1454,7 +1454,7 @@ TRITONBACKEND_ModelInstanceResponseStatisticsSetComputeOutputStart( TRITONBACKEND_ModelInstanceResponseStatistics* response_statistics, uint64_t compute_output_start); -/// Set response end time to a decoupled inference response statistics object. +/// Set response end time to an inference response statistics object. /// /// All timestamps should be reported in nanonseconds and collected using /// std::chrono::steady_clock::now().time_since_epoch() or the equivalent. @@ -1482,7 +1482,7 @@ TRITONBACKEND_ModelInstanceResponseStatisticsSetResponseEnd( TRITONBACKEND_ModelInstanceResponseStatistics* response_statistics, uint64_t response_end); -/// Set error to a decoupled inference response statistics object. +/// Set error to an inference response statistics object. /// /// Use the same error object passed to the TRITONBACKEND_ResponseSend. \see /// TRITONBACKEND_ResponseSend. @@ -1495,7 +1495,7 @@ TRITONBACKEND_ModelInstanceResponseStatisticsSetError( TRITONBACKEND_ModelInstanceResponseStatistics* response_statistics, TRITONSERVER_Error* error); -/// Record statistics for a decoupled inference response. +/// Record statistics for an inference response. /// /// The caller retains ownership to the response statistics and must free it /// after this function returns. From 68c25ab05e57ddcbaaa875c016144bfed1509eca Mon Sep 17 00:00:00 2001 From: kthui <18255193+kthui@users.noreply.github.com> Date: Fri, 16 Feb 2024 12:13:40 -0800 Subject: [PATCH 10/10] [Continue] Update comment --- include/triton/core/tritonbackend.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/include/triton/core/tritonbackend.h b/include/triton/core/tritonbackend.h index 69379ca0c..ad04d57f8 100644 --- a/include/triton/core/tritonbackend.h +++ b/include/triton/core/tritonbackend.h @@ -1426,7 +1426,9 @@ TRITONBACKEND_ModelInstanceResponseStatisticsSetResponseStart( uint64_t response_start); /// Set compute output start time to an inference response statistics object. -/// Set this to 0 for reporting an empty response. +/// +/// Do NOT set this compute output start time (or set it to 0), if reporting an +/// empty response. /// /// All timestamps should be reported in nanonseconds and collected using /// std::chrono::steady_clock::now().time_since_epoch() or the equivalent.