From 2b1c330111c121e52746e87a8f41985805f02b76 Mon Sep 17 00:00:00 2001 From: kthui <18255193+kthui@users.noreply.github.com> Date: Fri, 2 Feb 2024 18:58:29 -0800 Subject: [PATCH 01/10] Add response statistics --- include/triton/core/tritonbackend.h | 47 +++++++++- src/backend_model_instance.cc | 37 +++++++- src/infer_response.h | 16 +++- src/infer_stats.cc | 128 +++++++++++++++++++++++++++- src/infer_stats.h | 44 +++++++++- src/tritonserver.cc | 35 +++++++- src/tritonserver_stub.cc | 6 +- 7 files changed, 306 insertions(+), 7 deletions(-) diff --git a/include/triton/core/tritonbackend.h b/include/triton/core/tritonbackend.h index ef3af71b5..82e8f75af 100644 --- a/include/triton/core/tritonbackend.h +++ b/include/triton/core/tritonbackend.h @@ -1,4 +1,4 @@ -// Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -1356,6 +1356,51 @@ TRITONBACKEND_ModelInstanceReportStatistics( const uint64_t compute_start_ns, const uint64_t compute_end_ns, const uint64_t exec_end_ns); +/// Record statistics for a decoupled inference response. +/// +/// All timestamps should be reported in nanonseconds and collected using +/// std::chrono::steady_clock::now().time_since_epoch() or the equivalent. +/// +/// Call this function after calling TRITONBACKEND_ResponseSend, and pass the +/// same send_flags and error object passed to the TRITONBACKEND_ResponseSend. +/// +/// For consistency of measurement across different backends, the +/// timestamps should be collected at the following points during +/// TRITONBACKEND_ModelInstanceExecute. +/// +/// TRITONBACKEND_ModelInstanceExecute() +/// < start of this response > +/// CAPTURE TIMESPACE (response_start) +/// < generate this response > +/// CAPTURE TIMESPACE (compute_output_start) +/// < allocate output buffers and extract output tensors, including copying +/// the tensors to/from GPU if necessary > +/// CAPTURE TIMESPACE (response_end) +/// < end of this response > +/// return +/// +/// \param instance The model instance. +/// \param response_factory The response factory associated with the inference +/// request. +/// \param response_start Timestamp for the start of execution for this +/// response. +/// \param compute_output_start Timestamp for the start of extracting output +/// tensors for this response. Set this to 0 for reporting empty response. +/// \param response_end Timestamp for the end of extracting output tensors for +/// this response. +/// \param send_flags Flags associated with the response. \see +/// TRITONBACKEND_ResponseSend \see +/// \param error The TRITONSERVER_Error to send if the response is an error, or +/// nullptr if the response is successful. \see TRITONBACKEND_ResponseSend \see +/// \return a TRITONSERVER_Error indicating success or failure. +TRITONBACKEND_DECLSPEC TRITONSERVER_Error* +TRITONBACKEND_ModelInstanceReportResponseStatistics( + TRITONBACKEND_ModelInstance* instance, + TRITONBACKEND_ResponseFactory* response_factory, + const uint64_t response_start, const uint64_t compute_output_start, + const uint64_t response_end, const uint32_t send_flags, + TRITONSERVER_Error* error); + /// Record statistics for the execution of an entire batch of /// inference requests. 
 ///
diff --git a/src/backend_model_instance.cc b/src/backend_model_instance.cc
index a4a9bba2e..d303a88c4 100644
--- a/src/backend_model_instance.cc
+++ b/src/backend_model_instance.cc
@@ -1,4 +1,4 @@
-// Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions
@@ -953,6 +953,41 @@ TRITONBACKEND_ModelInstanceReportStatistics(
   return nullptr;  // success
 }
 
+TRITONAPI_DECLSPEC TRITONSERVER_Error*
+TRITONBACKEND_ModelInstanceReportResponseStatistics(
+    TRITONBACKEND_ModelInstance* instance,
+    TRITONBACKEND_ResponseFactory* response_factory,
+    const uint64_t response_start, const uint64_t compute_output_start,
+    const uint64_t response_end, const uint32_t send_flags,
+    TRITONSERVER_Error* error)
+{
+#ifdef TRITON_ENABLE_STATS
+  TritonModelInstance* ti = reinterpret_cast<TritonModelInstance*>(instance);
+  std::shared_ptr<InferenceResponseFactory>* rf =
+      reinterpret_cast<std::shared_ptr<InferenceResponseFactory>*>(
+          response_factory);
+  std::string key = std::to_string((*rf)->ResponseStatsIndex());
+
+  if (error == nullptr) {
+    if (compute_output_start > 0) {
+      RETURN_TRITONSERVER_ERROR_IF_ERROR(
+          ti->Model()->MutableStatsAggregator()->UpdateResponseSuccess(
+              key, response_start, compute_output_start, response_end));
+    } else {
+      RETURN_TRITONSERVER_ERROR_IF_ERROR(
+          ti->Model()->MutableStatsAggregator()->UpdateResponseEmpty(
+              key, response_start, response_end));
+    }
+  } else {
+    RETURN_TRITONSERVER_ERROR_IF_ERROR(
+        ti->Model()->MutableStatsAggregator()->UpdateResponseFail(
+            key, response_start, compute_output_start, response_end));
+  }
+#endif  // TRITON_ENABLE_STATS
+
+  return nullptr;  // success
+}
+
 TRITONAPI_DECLSPEC TRITONSERVER_Error*
 TRITONBACKEND_ModelInstanceReportBatchStatistics(
     TRITONBACKEND_ModelInstance* instance, const uint64_t batch_size,
diff --git a/src/infer_response.h b/src/infer_response.h
index 75d5f9d48..3dbaa7ee8 100644
--- a/src/infer_response.h
+++ b/src/infer_response.h
@@ -1,4 +1,4 @@
-// Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions
@@ -61,6 +61,10 @@ class InferenceResponseFactory {
         alloc_userp_(alloc_userp), response_fn_(response_fn),
         response_userp_(response_userp), response_delegator_(delegator),
         is_cancelled_(false)
+#ifdef TRITON_ENABLE_STATS
+        ,
+        response_stats_index_(0)
+#endif  // TRITON_ENABLE_STATS
   {
   }
 
@@ -94,6 +98,11 @@ class InferenceResponseFactory {
   void ReleaseTrace() { trace_ = nullptr; }
 #endif  // TRITON_ENABLE_TRACING
 
+#ifdef TRITON_ENABLE_STATS
+  // Return the current response statistics index and increment it.
+  uint64_t ResponseStatsIndex() { return response_stats_index_++; };
+#endif  // TRITON_ENABLE_STATS
+
  private:
   // The model associated with this factory. For normal
   // requests/responses this will always be defined and acts to keep
@@ -129,6 +138,11 @@ class InferenceResponseFactory {
   // Inference trace associated with this response.
   std::shared_ptr<InferenceTraceProxy> trace_;
 #endif  // TRITON_ENABLE_TRACING
+
+#ifdef TRITON_ENABLE_STATS
+  // Number of response statistics reported.
+  std::atomic<uint64_t> response_stats_index_;
+#endif  // TRITON_ENABLE_STATS
 };
 
 //
diff --git a/src/infer_stats.cc b/src/infer_stats.cc
index a1af3b9f2..0f47485c2 100644
--- a/src/infer_stats.cc
+++ b/src/infer_stats.cc
@@ -1,4 +1,4 @@
-// Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions
@@ -210,6 +210,132 @@ InferenceStatsAggregator::UpdateSuccessCacheMiss(
 #endif  // TRITON_ENABLE_METRICS
 }
 
+Status
+InferenceStatsAggregator::UpdateResponseSuccess(
+    const std::string& key, const uint64_t response_start_ns,
+    const uint64_t compute_output_start_ns, const uint64_t response_end_ns)
+{
+  if (response_start_ns > compute_output_start_ns) {
+    return Status(
+        Status::Code::INVALID_ARG,
+        "Response start cannot happen after compute output start");
+  }
+  if (compute_output_start_ns > response_end_ns) {
+    return Status(
+        Status::Code::INVALID_ARG,
+        "Compute output start cannot happen after response end");
+  }
+  const uint64_t compute_infer_duration_ns =
+      compute_output_start_ns - response_start_ns;
+  const uint64_t compute_output_duration_ns =
+      response_end_ns - compute_output_start_ns;
+  const uint64_t total_duration_ns = response_end_ns - response_start_ns;
+
+  {
+    std::lock_guard<std::mutex> lock(mu_);
+
+    auto it = response_stats_.find(key);
+    if (it == response_stats_.end()) {
+      it = response_stats_.emplace(key, InferResponseStats()).first;
+    }
+
+    it->second.compute_infer_count++;
+    it->second.compute_infer_duration_ns += compute_infer_duration_ns;
+    it->second.compute_output_count++;
+    it->second.compute_output_duration_ns += compute_output_duration_ns;
+    it->second.success_count++;
+    it->second.success_duration_ns += total_duration_ns;
+  }
+
+  return Status::Success;
+}
+
+Status
+InferenceStatsAggregator::UpdateResponseFail(
+    const std::string& key, const uint64_t response_start_ns,
+    const uint64_t compute_output_start_ns, const uint64_t response_end_ns)
+{
+  uint64_t compute_infer_duration_ns, compute_output_duration_ns,
+      total_duration_ns;
+  if (compute_output_start_ns > 0) {
+    // output tensors copied
+    if (response_start_ns > compute_output_start_ns) {
+      return Status(
+          Status::Code::INVALID_ARG,
+          "Response start cannot happen after compute output start");
+    }
+    if (compute_output_start_ns > response_end_ns) {
+      return Status(
+          Status::Code::INVALID_ARG,
+          "Compute output start cannot happen after response end");
+    }
+    compute_infer_duration_ns = compute_output_start_ns - response_start_ns;
+    compute_output_duration_ns = response_end_ns - compute_output_start_ns;
+    total_duration_ns = response_end_ns - response_start_ns;
+  } else {
+    // no output tensors copied
+    if (response_start_ns > response_end_ns) {
+      return Status(
+          Status::Code::INVALID_ARG,
+          "Response start cannot happen after response end");
+    }
+    compute_infer_duration_ns = response_end_ns - response_start_ns;
+    compute_output_duration_ns = 0;
+    total_duration_ns = response_end_ns - response_start_ns;
+  }
+
+  {
+    std::lock_guard<std::mutex> lock(mu_);
+
+    auto it = response_stats_.find(key);
+    if (it == response_stats_.end()) {
+      it = response_stats_.emplace(key, InferResponseStats()).first;
+    }
+
+    it->second.compute_infer_count++;
+    it->second.compute_infer_duration_ns += compute_infer_duration_ns;
+    if (compute_output_duration_ns > 0) {
+      it->second.compute_output_count++;
+      it->second.compute_output_duration_ns += compute_output_duration_ns;
+    }
+    it->second.fail_count++;
+    it->second.fail_duration_ns += total_duration_ns;
+  }
+
+  return Status::Success;
+}
+
+Status
+InferenceStatsAggregator::UpdateResponseEmpty(
+    const std::string& key, const uint64_t response_start_ns,
+    const uint64_t response_end_ns)
+{
+  if (response_start_ns > response_end_ns) {
+    return Status(
+        Status::Code::INVALID_ARG,
+        "Response start cannot happen after response end");
+  }
+  const uint64_t compute_infer_duration_ns =
+      response_end_ns - response_start_ns;
+  const uint64_t total_duration_ns = response_end_ns - response_start_ns;
+
+  {
+    std::lock_guard<std::mutex> lock(mu_);
+
+    auto it = response_stats_.find(key);
+    if (it == response_stats_.end()) {
+      it = response_stats_.emplace(key, InferResponseStats()).first;
+    }
+
+    it->second.compute_infer_count++;
+    it->second.compute_infer_duration_ns += compute_infer_duration_ns;
+    it->second.empty_response_count++;
+    it->second.empty_response_duration_ns += total_duration_ns;
+  }
+
+  return Status::Success;
+}
+
 void
 InferenceStatsAggregator::UpdateInferBatchStats(
     MetricModelReporter* metric_reporter, const size_t batch_size,
diff --git a/src/infer_stats.h b/src/infer_stats.h
index 86ad9a654..2ae2bc226 100644
--- a/src/infer_stats.h
+++ b/src/infer_stats.h
@@ -1,4 +1,4 @@
-// Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions
@@ -78,6 +78,27 @@ class InferenceStatsAggregator {
     uint64_t cache_miss_duration_ns_;
   };
 
+  struct InferResponseStats {
+    InferResponseStats()
+        : compute_infer_count(0), compute_infer_duration_ns(0),
+          compute_output_count(0), compute_output_duration_ns(0),
+          success_count(0), success_duration_ns(0), fail_count(0),
+          fail_duration_ns(0), empty_response_count(0),
+          empty_response_duration_ns(0)
+    {
+    }
+    uint64_t compute_infer_count;
+    uint64_t compute_infer_duration_ns;
+    uint64_t compute_output_count;
+    uint64_t compute_output_duration_ns;
+    uint64_t success_count;
+    uint64_t success_duration_ns;
+    uint64_t fail_count;
+    uint64_t fail_duration_ns;
+    uint64_t empty_response_count;
+    uint64_t empty_response_duration_ns;
+  };
+
   struct InferBatchStats {
     InferBatchStats()
         : count_(0), compute_input_duration_ns_(0),
@@ -100,6 +121,11 @@ class InferenceStatsAggregator {
   uint64_t InferenceCount() const { return inference_count_; }
   uint64_t ExecutionCount() const { return execution_count_; }
   const InferStats& ImmutableInferStats() const { return infer_stats_; }
+  const std::map<std::string, InferResponseStats>& ImmutableInferResponseStats()
+      const
+  {
+    return response_stats_;
+  }
   const std::map<std::string, InferBatchStats>& ImmutableInferBatchStats() const
   {
     return batch_stats_;
@@ -140,6 +166,21 @@ class InferenceStatsAggregator {
       MetricModelReporter* metric_reporter,
       const uint64_t cache_miss_duration_ns);
 
+  // Add durations to response stats for a successful response.
+  Status UpdateResponseSuccess(
+      const std::string& key, const uint64_t response_start_ns,
+      const uint64_t compute_output_start_ns, const uint64_t response_end_ns);
+
+  // Add durations to response stats for a failed response.
+  Status UpdateResponseFail(
+      const std::string& key, const uint64_t response_start_ns,
+      const uint64_t compute_output_start_ns, const uint64_t response_end_ns);
+
+  // Add durations to response stats for an empty response.
+  Status UpdateResponseEmpty(
+      const std::string& key, const uint64_t response_start_ns,
+      const uint64_t response_end_ns);
+
   // Add durations to batch infer stats for a batch execution.
   // 'success_request_count' is the number of success requests in the
   // batch that have infer_stats attached.
@@ -163,6 +204,7 @@ class InferenceStatsAggregator {
   uint64_t inference_count_;
   uint64_t execution_count_;
   InferStats infer_stats_;
+  std::map<std::string, InferResponseStats> response_stats_;
   std::map<std::string, InferBatchStats> batch_stats_;
 #endif  // TRITON_ENABLE_STATS
 };
diff --git a/src/tritonserver.cc b/src/tritonserver.cc
index 70bdddb75..f2d3c2624 100644
--- a/src/tritonserver.cc
+++ b/src/tritonserver.cc
@@ -1,4 +1,4 @@
-// Copyright 2019-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// Copyright 2019-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions
@@ -2897,6 +2897,8 @@ TRITONSERVER_ServerModelStatistics(
 
       // Add infer statistic
       const auto& infer_stats = model->StatsAggregator().ImmutableInferStats();
+      const auto& infer_response_stats =
+          model->StatsAggregator().ImmutableInferResponseStats();
       const auto& infer_batch_stats =
           model->StatsAggregator().ImmutableInferBatchStats();
 
@@ -2933,6 +2935,35 @@ TRITONSERVER_ServerModelStatistics(
           metadata, inference_stats, "cache_miss", infer_stats.cache_miss_count_,
           infer_stats.cache_miss_duration_ns_);
 
+      // Add response statistics
+      triton::common::TritonJson::Value response_stats(
+          metadata, triton::common::TritonJson::ValueType::OBJECT);
+      for (const auto& res_pair : infer_response_stats) {
+        triton::common::TritonJson::Value res_stat(
+            metadata, triton::common::TritonJson::ValueType::OBJECT);
+        SetDurationStat(
+            metadata, res_stat, "compute_infer",
+            res_pair.second.compute_infer_count,
+            res_pair.second.compute_infer_duration_ns);
+        SetDurationStat(
+            metadata, res_stat, "compute_output",
+            res_pair.second.compute_output_count,
+            res_pair.second.compute_output_duration_ns);
+        SetDurationStat(
+            metadata, res_stat, "success", res_pair.second.success_count,
+            res_pair.second.success_duration_ns);
+        SetDurationStat(
+            metadata, res_stat, "fail", res_pair.second.fail_count,
+            res_pair.second.fail_duration_ns);
+        SetDurationStat(
+            metadata, res_stat, "empty_response",
+            res_pair.second.empty_response_count,
+            res_pair.second.empty_response_duration_ns);
+        RETURN_IF_STATUS_ERROR(
+            response_stats.Add(res_pair.first.c_str(), std::move(res_stat)));
+      }
+
+      // Add batch statistics
       triton::common::TritonJson::Value batch_stats(
           metadata, triton::common::TritonJson::ValueType::ARRAY);
       for (const auto& batch : infer_batch_stats) {
@@ -2967,6 +2998,8 @@ TRITONSERVER_ServerModelStatistics(
 
       RETURN_IF_STATUS_ERROR(
           model_stat.Add("inference_stats", std::move(inference_stats)));
+      RETURN_IF_STATUS_ERROR(
+          model_stat.Add("response_stats", std::move(response_stats)));
       RETURN_IF_STATUS_ERROR(
           model_stat.Add("batch_stats", std::move(batch_stats)));
       RETURN_IF_STATUS_ERROR(
diff --git a/src/tritonserver_stub.cc b/src/tritonserver_stub.cc
index e6efcda12..f3bafe4b0 100644
--- a/src/tritonserver_stub.cc
+++ b/src/tritonserver_stub.cc
@@ -1,4 +1,4 @@
-// Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -947,6 +947,10 @@ TRITONBACKEND_ModelInstanceReportStatistics() { } TRITONAPI_DECLSPEC void +TRITONBACKEND_ModelInstanceReportResponseStatistics() +{ +} +TRITONAPI_DECLSPEC void TRITONBACKEND_ModelInstanceReportBatchStatistics() { } From c920b3bc7fe80aa55011622deb24abce277c996b Mon Sep 17 00:00:00 2001 From: kthui <18255193+kthui@users.noreply.github.com> Date: Wed, 7 Feb 2024 18:31:02 -0800 Subject: [PATCH 02/10] Backend api version 1.19 --- include/triton/core/tritonbackend.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/triton/core/tritonbackend.h b/include/triton/core/tritonbackend.h index 82e8f75af..560270f9d 100644 --- a/include/triton/core/tritonbackend.h +++ b/include/triton/core/tritonbackend.h @@ -94,7 +94,7 @@ struct TRITONBACKEND_Batcher; /// } /// #define TRITONBACKEND_API_VERSION_MAJOR 1 -#define TRITONBACKEND_API_VERSION_MINOR 18 +#define TRITONBACKEND_API_VERSION_MINOR 19 /// Get the TRITONBACKEND API version supported by Triton. This value /// can be compared against the TRITONBACKEND_API_VERSION_MAJOR and From 021204e8baac2686af7967b5efc49424c1f35c14 Mon Sep 17 00:00:00 2001 From: kthui <18255193+kthui@users.noreply.github.com> Date: Thu, 8 Feb 2024 11:53:23 -0800 Subject: [PATCH 03/10] Fix see comment --- include/triton/core/tritonbackend.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/triton/core/tritonbackend.h b/include/triton/core/tritonbackend.h index 560270f9d..8b0c4f3d9 100644 --- a/include/triton/core/tritonbackend.h +++ b/include/triton/core/tritonbackend.h @@ -1389,9 +1389,9 @@ TRITONBACKEND_ModelInstanceReportStatistics( /// \param response_end Timestamp for the end of extracting output tensors for /// this response. /// \param send_flags Flags associated with the response. \see -/// TRITONBACKEND_ResponseSend \see +/// TRITONBACKEND_ResponseSend /// \param error The TRITONSERVER_Error to send if the response is an error, or -/// nullptr if the response is successful. \see TRITONBACKEND_ResponseSend \see +/// nullptr if the response is successful. \see TRITONBACKEND_ResponseSend /// \return a TRITONSERVER_Error indicating success or failure. 
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_ModelInstanceReportResponseStatistics( From 629063e283d53d7ad442e27e53ed75fad82fbef7 Mon Sep 17 00:00:00 2001 From: kthui <18255193+kthui@users.noreply.github.com> Date: Thu, 8 Feb 2024 14:58:54 -0800 Subject: [PATCH 04/10] Rename ResponseStatsIndex() to GetAndIncrementResponseIndex() --- src/backend_model_instance.cc | 2 +- src/infer_response.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/backend_model_instance.cc b/src/backend_model_instance.cc index d303a88c4..00a204fcf 100644 --- a/src/backend_model_instance.cc +++ b/src/backend_model_instance.cc @@ -966,7 +966,7 @@ TRITONBACKEND_ModelInstanceReportResponseStatistics( std::shared_ptr* rf = reinterpret_cast*>( response_factory); - std::string key = std::to_string((*rf)->ResponseStatsIndex()); + std::string key = std::to_string((*rf)->GetAndIncrementResponseIndex()); if (error == nullptr) { if (compute_output_start > 0) { diff --git a/src/infer_response.h b/src/infer_response.h index 3dbaa7ee8..612f8c1fe 100644 --- a/src/infer_response.h +++ b/src/infer_response.h @@ -100,7 +100,7 @@ class InferenceResponseFactory { #ifdef TRITON_ENABLE_STATS // Return the current response statistics index and increment it. - uint64_t ResponseStatsIndex() { return response_stats_index_++; }; + uint64_t GetAndIncrementResponseIndex() { return response_stats_index_++; }; #endif // TRITON_ENABLE_STATS private: From 3336bf5dcdb639d4b8af3c0bd441d3c2cd0828ce Mon Sep 17 00:00:00 2001 From: kthui <18255193+kthui@users.noreply.github.com> Date: Thu, 8 Feb 2024 15:35:32 -0800 Subject: [PATCH 05/10] Clarify error object ownership --- include/triton/core/tritonbackend.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/include/triton/core/tritonbackend.h b/include/triton/core/tritonbackend.h index 8b0c4f3d9..2b8192e2c 100644 --- a/include/triton/core/tritonbackend.h +++ b/include/triton/core/tritonbackend.h @@ -761,8 +761,9 @@ TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_ResponseOutput( /// \param send_flags Flags associated with the response. \see /// TRITONSERVER_ResponseCompleteFlag. \see /// TRITONSERVER_InferenceResponseCompleteFn_t. -/// \param error The TRITONSERVER_Error to send if the response is an -/// error, or nullptr if the response is successful. +/// \param error The TRITONSERVER_Error to send if the response is an error, or +/// nullptr if the response is successful. The caller retains ownership to the +/// error object and must free it with TRITONSERVER_ErrorDelete. /// \return a TRITONSERVER_Error indicating success or failure. TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_ResponseSend( TRITONBACKEND_Response* response, const uint32_t send_flags, @@ -1392,6 +1393,8 @@ TRITONBACKEND_ModelInstanceReportStatistics( /// TRITONBACKEND_ResponseSend /// \param error The TRITONSERVER_Error to send if the response is an error, or /// nullptr if the response is successful. \see TRITONBACKEND_ResponseSend +/// The caller retains ownership to the error object and must free it with +/// TRITONSERVER_ErrorDelete. /// \return a TRITONSERVER_Error indicating success or failure. 
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_ModelInstanceReportResponseStatistics( From 527e036ea2c10922307c1dbd22562b9081a409d1 Mon Sep 17 00:00:00 2001 From: kthui <18255193+kthui@users.noreply.github.com> Date: Tue, 13 Feb 2024 15:48:56 -0800 Subject: [PATCH 06/10] Move API parameters into a struct --- include/triton/core/tritonbackend.h | 71 +++++++++++++++++++---------- src/backend_model_instance.cc | 24 +++++----- 2 files changed, 58 insertions(+), 37 deletions(-) diff --git a/include/triton/core/tritonbackend.h b/include/triton/core/tritonbackend.h index 2b8192e2c..54661ed35 100644 --- a/include/triton/core/tritonbackend.h +++ b/include/triton/core/tritonbackend.h @@ -68,6 +68,23 @@ struct TRITONBACKEND_ModelInstance; struct TRITONBACKEND_BackendAttribute; struct TRITONBACKEND_Batcher; +struct TRITONBACKEND_ModelInstanceResponseStatistics { +#ifdef TRITON_ENABLE_STATS + TRITONBACKEND_ModelInstanceResponseStatistics() + : model_instance(nullptr), response_factory(nullptr), response_start(0), + compute_output_start(0), response_end(0), send_flags(0), error(nullptr) + { + } + TRITONBACKEND_ModelInstance* model_instance; + TRITONBACKEND_ResponseFactory* response_factory; + uint64_t response_start; + uint64_t compute_output_start; + uint64_t response_end; + uint32_t send_flags; + TRITONSERVER_Error* error; +#endif // TRITON_ENABLE_STATS +}; + /// /// TRITONBACKEND API Version /// @@ -1362,11 +1379,8 @@ TRITONBACKEND_ModelInstanceReportStatistics( /// All timestamps should be reported in nanonseconds and collected using /// std::chrono::steady_clock::now().time_since_epoch() or the equivalent. /// -/// Call this function after calling TRITONBACKEND_ResponseSend, and pass the -/// same send_flags and error object passed to the TRITONBACKEND_ResponseSend. -/// -/// For consistency of measurement across different backends, the -/// timestamps should be collected at the following points during +/// For consistency of measurement across different backends, the timestamps +/// should be collected at the following points during /// TRITONBACKEND_ModelInstanceExecute. /// /// TRITONBACKEND_ModelInstanceExecute() @@ -1380,29 +1394,36 @@ TRITONBACKEND_ModelInstanceReportStatistics( /// < end of this response > /// return /// -/// \param instance The model instance. -/// \param response_factory The response factory associated with the inference -/// request. -/// \param response_start Timestamp for the start of execution for this -/// response. -/// \param compute_output_start Timestamp for the start of extracting output -/// tensors for this response. Set this to 0 for reporting empty response. -/// \param response_end Timestamp for the end of extracting output tensors for -/// this response. -/// \param send_flags Flags associated with the response. \see -/// TRITONBACKEND_ResponseSend -/// \param error The TRITONSERVER_Error to send if the response is an error, or -/// nullptr if the response is successful. \see TRITONBACKEND_ResponseSend -/// The caller retains ownership to the error object and must free it with -/// TRITONSERVER_ErrorDelete. +/// Create the statistics object after calling TRITONBACKEND_ResponseSend. Use +/// the same send_flags and error object passed to the +/// TRITONBACKEND_ResponseSend. +/// +/// The statistics object is a struct consists of the following fields, subject +/// to addition in the future. +/// +/// TRITONBACKEND_ModelInstance* model_instance; +/// The model instance. 
+/// TRITONBACKEND_ResponseFactory* response_factory; +/// The response factory associated with the inference request. +/// const uint64_t response_start; +/// Timestamp for the start of execution for this response. +/// const uint64_t compute_output_start; +/// Timestamp for the start of extracting output tensors for this response. +/// Set this to 0 for reporting empty response. +/// const uint64_t response_end; +/// Timestamp for the end of extracting output tensors for this response. +/// const uint32_t send_flags; +/// Flags associated with the response. \see TRITONBACKEND_ResponseSend +/// TRITONSERVER_Error* error; +/// The TRITONSERVER_Error to send if the response is an error, or nullptr +/// if the response is successful. \see TRITONBACKEND_ResponseSend. +/// +/// \param response_statistics The statistics to be recorded. The caller retains +/// ownership to the object and must free it after this function returns. /// \return a TRITONSERVER_Error indicating success or failure. TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_ModelInstanceReportResponseStatistics( - TRITONBACKEND_ModelInstance* instance, - TRITONBACKEND_ResponseFactory* response_factory, - const uint64_t response_start, const uint64_t compute_output_start, - const uint64_t response_end, const uint32_t send_flags, - TRITONSERVER_Error* error); + TRITONBACKEND_ModelInstanceResponseStatistics* response_statistics); /// Record statistics for the execution of an entire batch of /// inference requests. diff --git a/src/backend_model_instance.cc b/src/backend_model_instance.cc index 00a204fcf..802ba3510 100644 --- a/src/backend_model_instance.cc +++ b/src/backend_model_instance.cc @@ -955,33 +955,33 @@ TRITONBACKEND_ModelInstanceReportStatistics( TRITONAPI_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_ModelInstanceReportResponseStatistics( - TRITONBACKEND_ModelInstance* instance, - TRITONBACKEND_ResponseFactory* response_factory, - const uint64_t response_start, const uint64_t compute_output_start, - const uint64_t response_end, const uint32_t send_flags, - TRITONSERVER_Error* error) + TRITONBACKEND_ModelInstanceResponseStatistics* response_statistics) { #ifdef TRITON_ENABLE_STATS - TritonModelInstance* ti = reinterpret_cast(instance); + TRITONBACKEND_ModelInstanceResponseStatistics* rs = response_statistics; + TritonModelInstance* ti = + reinterpret_cast(rs->model_instance); std::shared_ptr* rf = reinterpret_cast*>( - response_factory); + rs->response_factory); std::string key = std::to_string((*rf)->GetAndIncrementResponseIndex()); - if (error == nullptr) { - if (compute_output_start > 0) { + if (rs->error == nullptr) { + if (rs->compute_output_start > 0) { RETURN_TRITONSERVER_ERROR_IF_ERROR( ti->Model()->MutableStatsAggregator()->UpdateResponseSuccess( - key, response_start, compute_output_start, response_end)); + key, rs->response_start, rs->compute_output_start, + rs->response_end)); } else { RETURN_TRITONSERVER_ERROR_IF_ERROR( ti->Model()->MutableStatsAggregator()->UpdateResponseEmpty( - key, response_start, response_end)); + key, rs->response_start, rs->response_end)); } } else { RETURN_TRITONSERVER_ERROR_IF_ERROR( ti->Model()->MutableStatsAggregator()->UpdateResponseFail( - key, response_start, compute_output_start, response_end)); + key, rs->response_start, rs->compute_output_start, + rs->response_end)); } #endif // TRITON_ENABLE_STATS From f18659000e627e98774a32481d23304e73678d0c Mon Sep 17 00:00:00 2001 From: kthui <18255193+kthui@users.noreply.github.com> Date: Tue, 13 Feb 2024 16:16:12 -0800 Subject: 
[PATCH 07/10] Fix typo --- include/triton/core/tritonbackend.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/include/triton/core/tritonbackend.h b/include/triton/core/tritonbackend.h index 54661ed35..a11cb740c 100644 --- a/include/triton/core/tritonbackend.h +++ b/include/triton/core/tritonbackend.h @@ -1337,16 +1337,16 @@ TRITONBACKEND_ModelInstanceReportMemoryUsage( /// TRITONBACKEND_ModelInstanceExecute. /// /// TRITONBACKEND_ModelInstanceExecute() -/// CAPTURE TIMESPACE (exec_start_ns) +/// CAPTURE TIMESTAMP (exec_start_ns) /// < process input tensors to prepare them for inference /// execution, including copying the tensors to/from GPU if /// necessary> -/// CAPTURE TIMESPACE (compute_start_ns) +/// CAPTURE TIMESTAMP (compute_start_ns) /// < perform inference computations to produce outputs > -/// CAPTURE TIMESPACE (compute_end_ns) +/// CAPTURE TIMESTAMP (compute_end_ns) /// < allocate output buffers and extract output tensors, including /// copying the tensors to/from GPU if necessary> -/// CAPTURE TIMESPACE (exec_end_ns) +/// CAPTURE TIMESTAMP (exec_end_ns) /// return /// /// Note that these statistics are associated with a valid @@ -1385,12 +1385,12 @@ TRITONBACKEND_ModelInstanceReportStatistics( /// /// TRITONBACKEND_ModelInstanceExecute() /// < start of this response > -/// CAPTURE TIMESPACE (response_start) +/// CAPTURE TIMESTAMP (response_start) /// < generate this response > -/// CAPTURE TIMESPACE (compute_output_start) +/// CAPTURE TIMESTAMP (compute_output_start) /// < allocate output buffers and extract output tensors, including copying /// the tensors to/from GPU if necessary > -/// CAPTURE TIMESPACE (response_end) +/// CAPTURE TIMESTAMP (response_end) /// < end of this response > /// return /// From 18f4a5002e5827bbc4f7cefb57f71c9bd9aee480 Mon Sep 17 00:00:00 2001 From: kthui <18255193+kthui@users.noreply.github.com> Date: Wed, 14 Feb 2024 17:35:54 -0800 Subject: [PATCH 08/10] Make API parameters struct opaque --- include/triton/core/tritonbackend.h | 165 +++++++++++++++++++++------- src/backend_model_instance.cc | 158 +++++++++++++++++++++++--- src/tritonserver_stub.cc | 32 ++++++ 3 files changed, 296 insertions(+), 59 deletions(-) diff --git a/include/triton/core/tritonbackend.h b/include/triton/core/tritonbackend.h index a11cb740c..ef0f704a4 100644 --- a/include/triton/core/tritonbackend.h +++ b/include/triton/core/tritonbackend.h @@ -65,26 +65,10 @@ struct TRITONBACKEND_Response; struct TRITONBACKEND_Backend; struct TRITONBACKEND_Model; struct TRITONBACKEND_ModelInstance; +struct TRITONBACKEND_ModelInstanceResponseStatistics; struct TRITONBACKEND_BackendAttribute; struct TRITONBACKEND_Batcher; -struct TRITONBACKEND_ModelInstanceResponseStatistics { -#ifdef TRITON_ENABLE_STATS - TRITONBACKEND_ModelInstanceResponseStatistics() - : model_instance(nullptr), response_factory(nullptr), response_start(0), - compute_output_start(0), response_end(0), send_flags(0), error(nullptr) - { - } - TRITONBACKEND_ModelInstance* model_instance; - TRITONBACKEND_ResponseFactory* response_factory; - uint64_t response_start; - uint64_t compute_output_start; - uint64_t response_end; - uint32_t send_flags; - TRITONSERVER_Error* error; -#endif // TRITON_ENABLE_STATS -}; - /// /// TRITONBACKEND API Version /// @@ -1374,7 +1358,46 @@ TRITONBACKEND_ModelInstanceReportStatistics( const uint64_t compute_start_ns, const uint64_t compute_end_ns, const uint64_t exec_end_ns); -/// Record statistics for a decoupled inference response. 
+/// Create a new decoupled inference response statistics object. +/// +/// \param response_statistics The new response statistics object to be created. +/// \return a TRITONSERVER_Error indicating success or failure. +TRITONBACKEND_DECLSPEC TRITONSERVER_Error* +TRITONBACKEND_ModelInstanceResponseStatisticsNew( + TRITONBACKEND_ModelInstanceResponseStatistics** response_statistics); + +/// Delete a decoupled inference response statistics object. +/// +/// The caller retains ownership to the objects set on the deleted response +/// statistics object and must free them separately. +/// +/// \param response_statistics The response statistics object to be deleted. +/// \return a TRITONSERVER_Error indicating success or failure. +TRITONBACKEND_DECLSPEC TRITONSERVER_Error* +TRITONBACKEND_ModelInstanceResponseStatisticsDelete( + TRITONBACKEND_ModelInstanceResponseStatistics* response_statistics); + +/// Set model instance to a decoupled inference response statistics object. +/// +/// \param response_statistics The response statistics object. +/// \param model_instance The model instance. +/// \return a TRITONSERVER_Error indicating success or failure. +TRITONBACKEND_DECLSPEC TRITONSERVER_Error* +TRITONBACKEND_ModelInstanceResponseStatisticsSetModelInstance( + TRITONBACKEND_ModelInstanceResponseStatistics* response_statistics, + TRITONBACKEND_ModelInstance* model_instance); + +/// Set response factory to a decoupled inference response statistics object. +/// +/// \param response_statistics The response statistics object. +/// \param response_factory The response factory. +/// \return a TRITONSERVER_Error indicating success or failure. +TRITONBACKEND_DECLSPEC TRITONSERVER_Error* +TRITONBACKEND_ModelInstanceResponseStatisticsSetResponseFactory( + TRITONBACKEND_ModelInstanceResponseStatistics* response_statistics, + TRITONBACKEND_ResponseFactory* response_factory); + +/// Set response start time to a decoupled inference response statistics object. /// /// All timestamps should be reported in nanonseconds and collected using /// std::chrono::steady_clock::now().time_since_epoch() or the equivalent. @@ -1394,32 +1417,90 @@ TRITONBACKEND_ModelInstanceReportStatistics( /// < end of this response > /// return /// -/// Create the statistics object after calling TRITONBACKEND_ResponseSend. Use -/// the same send_flags and error object passed to the +/// \param response_statistics The response statistics object. +/// \param response_start The response start time. +/// \return a TRITONSERVER_Error indicating success or failure. +TRITONBACKEND_DECLSPEC TRITONSERVER_Error* +TRITONBACKEND_ModelInstanceResponseStatisticsSetResponseStart( + TRITONBACKEND_ModelInstanceResponseStatistics* response_statistics, + uint64_t response_start); + +/// Set compute output start time to a decoupled inference response statistics +/// object. Set this to 0 for reporting an empty response. +/// +/// All timestamps should be reported in nanonseconds and collected using +/// std::chrono::steady_clock::now().time_since_epoch() or the equivalent. +/// +/// For consistency of measurement across different backends, the timestamps +/// should be collected at the following points during +/// TRITONBACKEND_ModelInstanceExecute. 
+/// +/// TRITONBACKEND_ModelInstanceExecute() +/// < start of this response > +/// CAPTURE TIMESTAMP (response_start) +/// < generate this response > +/// CAPTURE TIMESTAMP (compute_output_start) +/// < allocate output buffers and extract output tensors, including copying +/// the tensors to/from GPU if necessary > +/// CAPTURE TIMESTAMP (response_end) +/// < end of this response > +/// return +/// +/// \param response_statistics The response statistics object. +/// \param compute_output_start The compute output start time. +/// \return a TRITONSERVER_Error indicating success or failure. +TRITONBACKEND_DECLSPEC TRITONSERVER_Error* +TRITONBACKEND_ModelInstanceResponseStatisticsSetComputeOutputStart( + TRITONBACKEND_ModelInstanceResponseStatistics* response_statistics, + uint64_t compute_output_start); + +/// Set response end time to a decoupled inference response statistics object. +/// +/// All timestamps should be reported in nanonseconds and collected using +/// std::chrono::steady_clock::now().time_since_epoch() or the equivalent. +/// +/// For consistency of measurement across different backends, the timestamps +/// should be collected at the following points during +/// TRITONBACKEND_ModelInstanceExecute. +/// +/// TRITONBACKEND_ModelInstanceExecute() +/// < start of this response > +/// CAPTURE TIMESTAMP (response_start) +/// < generate this response > +/// CAPTURE TIMESTAMP (compute_output_start) +/// < allocate output buffers and extract output tensors, including copying +/// the tensors to/from GPU if necessary > +/// CAPTURE TIMESTAMP (response_end) +/// < end of this response > +/// return +/// +/// \param response_statistics The response statistics object. +/// \param response_end The response end time. +/// \return a TRITONSERVER_Error indicating success or failure. +TRITONBACKEND_DECLSPEC TRITONSERVER_Error* +TRITONBACKEND_ModelInstanceResponseStatisticsSetResponseEnd( + TRITONBACKEND_ModelInstanceResponseStatistics* response_statistics, + uint64_t response_end); + +/// Set error to a decoupled inference response statistics object. +/// +/// Use the same error object passed to the TRITONBACKEND_ResponseSend. \see /// TRITONBACKEND_ResponseSend. /// -/// The statistics object is a struct consists of the following fields, subject -/// to addition in the future. -/// -/// TRITONBACKEND_ModelInstance* model_instance; -/// The model instance. -/// TRITONBACKEND_ResponseFactory* response_factory; -/// The response factory associated with the inference request. -/// const uint64_t response_start; -/// Timestamp for the start of execution for this response. -/// const uint64_t compute_output_start; -/// Timestamp for the start of extracting output tensors for this response. -/// Set this to 0 for reporting empty response. -/// const uint64_t response_end; -/// Timestamp for the end of extracting output tensors for this response. -/// const uint32_t send_flags; -/// Flags associated with the response. \see TRITONBACKEND_ResponseSend -/// TRITONSERVER_Error* error; -/// The TRITONSERVER_Error to send if the response is an error, or nullptr -/// if the response is successful. \see TRITONBACKEND_ResponseSend. -/// -/// \param response_statistics The statistics to be recorded. The caller retains -/// ownership to the object and must free it after this function returns. +/// \param response_statistics The response statistics object. +/// \param error The error object. +/// \return a TRITONSERVER_Error indicating success or failure. 
+TRITONBACKEND_DECLSPEC TRITONSERVER_Error* +TRITONBACKEND_ModelInstanceResponseStatisticsSetError( + TRITONBACKEND_ModelInstanceResponseStatistics* response_statistics, + TRITONSERVER_Error* error); + +/// Record statistics for a decoupled inference response. +/// +/// The caller retains ownership to the response statistics and must free it +/// after this function returns. +/// +/// \param response_statistics The statistics to be recorded. /// \return a TRITONSERVER_Error indicating success or failure. TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_ModelInstanceReportResponseStatistics( diff --git a/src/backend_model_instance.cc b/src/backend_model_instance.cc index 802ba3510..5e43e093d 100644 --- a/src/backend_model_instance.cc +++ b/src/backend_model_instance.cc @@ -795,6 +795,23 @@ TritonModelInstance::TritonBackendThread::BackendThread() LOG_VERBOSE(1) << "Stopping backend thread for " << name_ << "..."; } +// Opaque object for the response statistics C-API +struct ModelInstanceResponseStatistics { +#ifdef TRITON_ENABLE_STATS + ModelInstanceResponseStatistics() + : model_instance(nullptr), response_factory(nullptr), response_start(0), + compute_output_start(0), response_end(0), error(nullptr) + { + } + TritonModelInstance* model_instance; + std::shared_ptr* response_factory; + uint64_t response_start; + uint64_t compute_output_start; + uint64_t response_end; + TRITONSERVER_Error* error; +#endif // TRITON_ENABLE_STATS +}; + extern "C" { TRITONAPI_DECLSPEC TRITONSERVER_Error* @@ -954,34 +971,141 @@ TRITONBACKEND_ModelInstanceReportStatistics( } TRITONAPI_DECLSPEC TRITONSERVER_Error* -TRITONBACKEND_ModelInstanceReportResponseStatistics( +TRITONBACKEND_ModelInstanceResponseStatisticsNew( + TRITONBACKEND_ModelInstanceResponseStatistics** response_statistics) +{ +#ifdef TRITON_ENABLE_STATS + *response_statistics = + reinterpret_cast( + new ModelInstanceResponseStatistics()); +#endif // TRITON_ENABLE_STATS + + return nullptr; // success +} + +TRITONAPI_DECLSPEC TRITONSERVER_Error* +TRITONBACKEND_ModelInstanceResponseStatisticsDelete( TRITONBACKEND_ModelInstanceResponseStatistics* response_statistics) { #ifdef TRITON_ENABLE_STATS - TRITONBACKEND_ModelInstanceResponseStatistics* rs = response_statistics; - TritonModelInstance* ti = - reinterpret_cast(rs->model_instance); - std::shared_ptr* rf = + delete reinterpret_cast( + response_statistics); +#endif // TRITON_ENABLE_STATS + + return nullptr; // success +} + +TRITONAPI_DECLSPEC TRITONSERVER_Error* +TRITONBACKEND_ModelInstanceResponseStatisticsSetModelInstance( + TRITONBACKEND_ModelInstanceResponseStatistics* response_statistics, + TRITONBACKEND_ModelInstance* model_instance) +{ +#ifdef TRITON_ENABLE_STATS + ModelInstanceResponseStatistics* rs = + reinterpret_cast(response_statistics); + rs->model_instance = reinterpret_cast(model_instance); +#endif // TRITON_ENABLE_STATS + + return nullptr; // success +} + +TRITONAPI_DECLSPEC TRITONSERVER_Error* +TRITONBACKEND_ModelInstanceResponseStatisticsSetResponseFactory( + TRITONBACKEND_ModelInstanceResponseStatistics* response_statistics, + TRITONBACKEND_ResponseFactory* response_factory) +{ +#ifdef TRITON_ENABLE_STATS + ModelInstanceResponseStatistics* rs = + reinterpret_cast(response_statistics); + rs->response_factory = reinterpret_cast*>( - rs->response_factory); - std::string key = std::to_string((*rf)->GetAndIncrementResponseIndex()); + response_factory); + ; +#endif // TRITON_ENABLE_STATS + + return nullptr; // success +} + +TRITONAPI_DECLSPEC TRITONSERVER_Error* 
+TRITONBACKEND_ModelInstanceResponseStatisticsSetResponseStart( + TRITONBACKEND_ModelInstanceResponseStatistics* response_statistics, + uint64_t response_start) +{ +#ifdef TRITON_ENABLE_STATS + ModelInstanceResponseStatistics* rs = + reinterpret_cast(response_statistics); + rs->response_start = response_start; +#endif // TRITON_ENABLE_STATS + + return nullptr; // success +} + +TRITONAPI_DECLSPEC TRITONSERVER_Error* +TRITONBACKEND_ModelInstanceResponseStatisticsSetComputeOutputStart( + TRITONBACKEND_ModelInstanceResponseStatistics* response_statistics, + uint64_t compute_output_start) +{ +#ifdef TRITON_ENABLE_STATS + ModelInstanceResponseStatistics* rs = + reinterpret_cast(response_statistics); + rs->compute_output_start = compute_output_start; +#endif // TRITON_ENABLE_STATS + + return nullptr; // success +} + +TRITONAPI_DECLSPEC TRITONSERVER_Error* +TRITONBACKEND_ModelInstanceResponseStatisticsSetResponseEnd( + TRITONBACKEND_ModelInstanceResponseStatistics* response_statistics, + uint64_t response_end) +{ +#ifdef TRITON_ENABLE_STATS + ModelInstanceResponseStatistics* rs = + reinterpret_cast(response_statistics); + rs->response_end = response_end; +#endif // TRITON_ENABLE_STATS + + return nullptr; // success +} + +TRITONAPI_DECLSPEC TRITONSERVER_Error* +TRITONBACKEND_ModelInstanceResponseStatisticsSetError( + TRITONBACKEND_ModelInstanceResponseStatistics* response_statistics, + TRITONSERVER_Error* error) +{ +#ifdef TRITON_ENABLE_STATS + ModelInstanceResponseStatistics* rs = + reinterpret_cast(response_statistics); + rs->error = error; +#endif // TRITON_ENABLE_STATS + + return nullptr; // success +} + +TRITONAPI_DECLSPEC TRITONSERVER_Error* +TRITONBACKEND_ModelInstanceReportResponseStatistics( + TRITONBACKEND_ModelInstanceResponseStatistics* response_statistics) +{ +#ifdef TRITON_ENABLE_STATS + ModelInstanceResponseStatistics* rs = + reinterpret_cast(response_statistics); + + InferenceStatsAggregator* sa = + rs->model_instance->Model()->MutableStatsAggregator(); + std::string key = + std::to_string((*rs->response_factory)->GetAndIncrementResponseIndex()); if (rs->error == nullptr) { if (rs->compute_output_start > 0) { - RETURN_TRITONSERVER_ERROR_IF_ERROR( - ti->Model()->MutableStatsAggregator()->UpdateResponseSuccess( - key, rs->response_start, rs->compute_output_start, - rs->response_end)); + RETURN_TRITONSERVER_ERROR_IF_ERROR(sa->UpdateResponseSuccess( + key, rs->response_start, rs->compute_output_start, rs->response_end)); } else { RETURN_TRITONSERVER_ERROR_IF_ERROR( - ti->Model()->MutableStatsAggregator()->UpdateResponseEmpty( - key, rs->response_start, rs->response_end)); + sa->UpdateResponseEmpty(key, rs->response_start, rs->response_end)); } } else { - RETURN_TRITONSERVER_ERROR_IF_ERROR( - ti->Model()->MutableStatsAggregator()->UpdateResponseFail( - key, rs->response_start, rs->compute_output_start, - rs->response_end)); + RETURN_TRITONSERVER_ERROR_IF_ERROR(sa->UpdateResponseFail( + key, rs->response_start, rs->compute_output_start, rs->response_end)); } #endif // TRITON_ENABLE_STATS diff --git a/src/tritonserver_stub.cc b/src/tritonserver_stub.cc index f3bafe4b0..dc9bcca52 100644 --- a/src/tritonserver_stub.cc +++ b/src/tritonserver_stub.cc @@ -947,6 +947,38 @@ TRITONBACKEND_ModelInstanceReportStatistics() { } TRITONAPI_DECLSPEC void +TRITONBACKEND_ModelInstanceResponseStatisticsNew() +{ +} +TRITONAPI_DECLSPEC void +TRITONBACKEND_ModelInstanceResponseStatisticsSetModelInstance() +{ +} +TRITONAPI_DECLSPEC void +TRITONBACKEND_ModelInstanceResponseStatisticsSetResponseFactory() +{ +} 
+TRITONAPI_DECLSPEC void +TRITONBACKEND_ModelInstanceResponseStatisticsSetResponseStart() +{ +} +TRITONAPI_DECLSPEC void +TRITONBACKEND_ModelInstanceResponseStatisticsSetComputeOutputStart() +{ +} +TRITONAPI_DECLSPEC void +TRITONBACKEND_ModelInstanceResponseStatisticsSetResponseEnd() +{ +} +TRITONAPI_DECLSPEC void +TRITONBACKEND_ModelInstanceResponseStatisticsSetError() +{ +} +TRITONAPI_DECLSPEC void +TRITONBACKEND_ModelInstanceResponseStatisticsDelete() +{ +} +TRITONAPI_DECLSPEC void TRITONBACKEND_ModelInstanceReportResponseStatistics() { } From f295284c23ac21d7193da9ad438a2f0d25a48a87 Mon Sep 17 00:00:00 2001 From: kthui <18255193+kthui@users.noreply.github.com> Date: Thu, 15 Feb 2024 13:03:31 -0800 Subject: [PATCH 09/10] Update comment --- include/triton/core/tritonbackend.h | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/include/triton/core/tritonbackend.h b/include/triton/core/tritonbackend.h index ef0f704a4..69379ca0c 100644 --- a/include/triton/core/tritonbackend.h +++ b/include/triton/core/tritonbackend.h @@ -1358,7 +1358,7 @@ TRITONBACKEND_ModelInstanceReportStatistics( const uint64_t compute_start_ns, const uint64_t compute_end_ns, const uint64_t exec_end_ns); -/// Create a new decoupled inference response statistics object. +/// Create a new inference response statistics object. /// /// \param response_statistics The new response statistics object to be created. /// \return a TRITONSERVER_Error indicating success or failure. @@ -1366,7 +1366,7 @@ TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_ModelInstanceResponseStatisticsNew( TRITONBACKEND_ModelInstanceResponseStatistics** response_statistics); -/// Delete a decoupled inference response statistics object. +/// Delete an inference response statistics object. /// /// The caller retains ownership to the objects set on the deleted response /// statistics object and must free them separately. @@ -1377,7 +1377,7 @@ TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_ModelInstanceResponseStatisticsDelete( TRITONBACKEND_ModelInstanceResponseStatistics* response_statistics); -/// Set model instance to a decoupled inference response statistics object. +/// Set model instance to an inference response statistics object. /// /// \param response_statistics The response statistics object. /// \param model_instance The model instance. @@ -1387,7 +1387,7 @@ TRITONBACKEND_ModelInstanceResponseStatisticsSetModelInstance( TRITONBACKEND_ModelInstanceResponseStatistics* response_statistics, TRITONBACKEND_ModelInstance* model_instance); -/// Set response factory to a decoupled inference response statistics object. +/// Set response factory to an inference response statistics object. /// /// \param response_statistics The response statistics object. /// \param response_factory The response factory. @@ -1397,7 +1397,7 @@ TRITONBACKEND_ModelInstanceResponseStatisticsSetResponseFactory( TRITONBACKEND_ModelInstanceResponseStatistics* response_statistics, TRITONBACKEND_ResponseFactory* response_factory); -/// Set response start time to a decoupled inference response statistics object. +/// Set response start time to an inference response statistics object. /// /// All timestamps should be reported in nanonseconds and collected using /// std::chrono::steady_clock::now().time_since_epoch() or the equivalent. 
@@ -1425,8 +1425,8 @@ TRITONBACKEND_ModelInstanceResponseStatisticsSetResponseStart( TRITONBACKEND_ModelInstanceResponseStatistics* response_statistics, uint64_t response_start); -/// Set compute output start time to a decoupled inference response statistics -/// object. Set this to 0 for reporting an empty response. +/// Set compute output start time to an inference response statistics object. +/// Set this to 0 for reporting an empty response. /// /// All timestamps should be reported in nanonseconds and collected using /// std::chrono::steady_clock::now().time_since_epoch() or the equivalent. @@ -1454,7 +1454,7 @@ TRITONBACKEND_ModelInstanceResponseStatisticsSetComputeOutputStart( TRITONBACKEND_ModelInstanceResponseStatistics* response_statistics, uint64_t compute_output_start); -/// Set response end time to a decoupled inference response statistics object. +/// Set response end time to an inference response statistics object. /// /// All timestamps should be reported in nanonseconds and collected using /// std::chrono::steady_clock::now().time_since_epoch() or the equivalent. @@ -1482,7 +1482,7 @@ TRITONBACKEND_ModelInstanceResponseStatisticsSetResponseEnd( TRITONBACKEND_ModelInstanceResponseStatistics* response_statistics, uint64_t response_end); -/// Set error to a decoupled inference response statistics object. +/// Set error to an inference response statistics object. /// /// Use the same error object passed to the TRITONBACKEND_ResponseSend. \see /// TRITONBACKEND_ResponseSend. @@ -1495,7 +1495,7 @@ TRITONBACKEND_ModelInstanceResponseStatisticsSetError( TRITONBACKEND_ModelInstanceResponseStatistics* response_statistics, TRITONSERVER_Error* error); -/// Record statistics for a decoupled inference response. +/// Record statistics for an inference response. /// /// The caller retains ownership to the response statistics and must free it /// after this function returns. From 68c25ab05e57ddcbaaa875c016144bfed1509eca Mon Sep 17 00:00:00 2001 From: kthui <18255193+kthui@users.noreply.github.com> Date: Fri, 16 Feb 2024 12:13:40 -0800 Subject: [PATCH 10/10] [Continue] Update comment --- include/triton/core/tritonbackend.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/include/triton/core/tritonbackend.h b/include/triton/core/tritonbackend.h index 69379ca0c..ad04d57f8 100644 --- a/include/triton/core/tritonbackend.h +++ b/include/triton/core/tritonbackend.h @@ -1426,7 +1426,9 @@ TRITONBACKEND_ModelInstanceResponseStatisticsSetResponseStart( uint64_t response_start); /// Set compute output start time to an inference response statistics object. -/// Set this to 0 for reporting an empty response. +/// +/// Do NOT set this compute output start time (or set it to 0), if reporting an +/// empty response. /// /// All timestamps should be reported in nanonseconds and collected using /// std::chrono::steady_clock::now().time_since_epoch() or the equivalent.