Add response statistics #325

Merged: 10 commits on Feb 17, 2024

Changes from 3 commits
49 changes: 47 additions & 2 deletions include/triton/core/tritonbackend.h
@@ -1,4 +1,4 @@
// Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
@@ -94,7 +94,7 @@ struct TRITONBACKEND_Batcher;
/// }
///
#define TRITONBACKEND_API_VERSION_MAJOR 1
#define TRITONBACKEND_API_VERSION_MINOR 18
#define TRITONBACKEND_API_VERSION_MINOR 19

/// Get the TRITONBACKEND API version supported by Triton. This value
/// can be compared against the TRITONBACKEND_API_VERSION_MAJOR and
@@ -1356,6 +1356,51 @@ TRITONBACKEND_ModelInstanceReportStatistics(
const uint64_t compute_start_ns, const uint64_t compute_end_ns,
const uint64_t exec_end_ns);

/// Record statistics for a decoupled inference response.
///
/// All timestamps should be reported in nanoseconds and collected using
/// std::chrono::steady_clock::now().time_since_epoch() or the equivalent.
///
/// Call this function after calling TRITONBACKEND_ResponseSend, and pass the
/// same send_flags and error object passed to the TRITONBACKEND_ResponseSend.
///
/// For consistency of measurement across different backends, the
/// timestamps should be collected at the following points during
/// TRITONBACKEND_ModelInstanceExecute.
///
/// TRITONBACKEND_ModelInstanceExecute()
/// < start of this response >
/// CAPTURE TIMESPACE (response_start)
/// < generate this response >
/// CAPTURE TIMESPACE (compute_output_start)
/// < allocate output buffers and extract output tensors, including copying
/// the tensors to/from GPU if necessary >
/// CAPTURE TIMESPACE (response_end)
/// < end of this response >
/// return
///
/// \param instance The model instance.
/// \param response_factory The response factory associated with the inference
/// request.
/// \param response_start Timestamp for the start of execution for this
/// response.
/// \param compute_output_start Timestamp for the start of extracting output
/// tensors for this response. Set this to 0 when reporting an empty response.
/// \param response_end Timestamp for the end of extracting output tensors for
/// this response.
/// \param send_flags Flags associated with the response. \see
/// TRITONBACKEND_ResponseSend
/// \param error The TRITONSERVER_Error to send if the response is an error, or
/// nullptr if the response is successful. \see TRITONBACKEND_ResponseSend
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceReportResponseStatistics(
TRITONBACKEND_ModelInstance* instance,
TRITONBACKEND_ResponseFactory* response_factory,
const uint64_t response_start, const uint64_t compute_output_start,
const uint64_t response_end, const uint32_t send_flags,
TRITONSERVER_Error* error);

/// Record statistics for the execution of an entire batch of
/// inference requests.
///
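For context (not part of this diff), a minimal sketch of how a decoupled backend's TRITONBACKEND_ModelInstanceExecute path might use the new call. The `instance`, `factory`, `response`, and `send_flags` variables are assumed to come from the backend's normal response flow, `<chrono>` is assumed to be included, and error handling is omitted:

```c++
// Sketch only: capture the three timestamps in nanoseconds, send the
// response, then report its statistics with the same send_flags and error
// object (nullptr here, since this response is successful).
auto now_ns = [] {
  return static_cast<uint64_t>(
      std::chrono::duration_cast<std::chrono::nanoseconds>(
          std::chrono::steady_clock::now().time_since_epoch())
          .count());
};

const uint64_t response_start = now_ns();
// ... run inference for this response ...
const uint64_t compute_output_start = now_ns();
// ... allocate output buffers and copy output tensors into `response` ...
const uint64_t response_end = now_ns();

TRITONBACKEND_ResponseSend(
    response, send_flags, nullptr /* success */);
TRITONBACKEND_ModelInstanceReportResponseStatistics(
    instance, factory, response_start, compute_output_start, response_end,
    send_flags, nullptr /* success */);
```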
37 changes: 36 additions & 1 deletion src/backend_model_instance.cc
@@ -1,4 +1,4 @@
// Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
@@ -953,6 +953,41 @@ TRITONBACKEND_ModelInstanceReportStatistics(
return nullptr; // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceReportResponseStatistics(
TRITONBACKEND_ModelInstance* instance,
TRITONBACKEND_ResponseFactory* response_factory,
const uint64_t response_start, const uint64_t compute_output_start,
const uint64_t response_end, const uint32_t send_flags,
TRITONSERVER_Error* error)
{
#ifdef TRITON_ENABLE_STATS
TritonModelInstance* ti = reinterpret_cast<TritonModelInstance*>(instance);
std::shared_ptr<InferenceResponseFactory>* rf =
reinterpret_cast<std::shared_ptr<InferenceResponseFactory>*>(
response_factory);
std::string key = std::to_string((*rf)->ResponseStatsIndex());

if (error == nullptr) {
if (compute_output_start > 0) {
RETURN_TRITONSERVER_ERROR_IF_ERROR(
ti->Model()->MutableStatsAggregator()->UpdateResponseSuccess(
key, response_start, compute_output_start, response_end));
} else {
RETURN_TRITONSERVER_ERROR_IF_ERROR(
ti->Model()->MutableStatsAggregator()->UpdateResponseEmpty(
key, response_start, response_end));
}
} else {
RETURN_TRITONSERVER_ERROR_IF_ERROR(
ti->Model()->MutableStatsAggregator()->UpdateResponseFail(
key, response_start, compute_output_start, response_end));
}
#endif // TRITON_ENABLE_STATS

return nullptr; // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceReportBatchStatistics(
TRITONBACKEND_ModelInstance* instance, const uint64_t batch_size,
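To summarize the dispatch in the implementation above (a restatement of the code, not new behavior):

```c++
// error == nullptr && compute_output_start >  0  -> UpdateResponseSuccess
// error == nullptr && compute_output_start == 0  -> UpdateResponseEmpty
// error != nullptr                               -> UpdateResponseFail
// With TRITON_ENABLE_STATS undefined, the function is a no-op and returns
// nullptr (success).
```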
16 changes: 15 additions & 1 deletion src/infer_response.h
@@ -1,4 +1,4 @@
// Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
@@ -61,6 +61,10 @@ class InferenceResponseFactory {
alloc_userp_(alloc_userp), response_fn_(response_fn),
response_userp_(response_userp), response_delegator_(delegator),
is_cancelled_(false)
#ifdef TRITON_ENABLE_STATS
,
response_stats_index_(0)
#endif // TRITON_ENABLE_STATS
{
}

@@ -94,6 +98,11 @@ class InferenceResponseFactory {
void ReleaseTrace() { trace_ = nullptr; }
#endif // TRITON_ENABLE_TRACING

#ifdef TRITON_ENABLE_STATS
// Return the current response statistics index and increment it.
uint64_t ResponseStatsIndex() { return response_stats_index_++; };
#endif // TRITON_ENABLE_STATS

private:
// The model associated with this factory. For normal
// requests/responses this will always be defined and acts to keep
@@ -129,6 +138,11 @@ class InferenceResponseFactory {
// Inference trace associated with this response.
std::shared_ptr<InferenceTraceProxy> trace_;
#endif // TRITON_ENABLE_TRACING

#ifdef TRITON_ENABLE_STATS
// Number of response statistics reported.
std::atomic<uint64_t> response_stats_index_;
#endif // TRITON_ENABLE_STATS
};

//
128 changes: 127 additions & 1 deletion src/infer_stats.cc
@@ -1,4 +1,4 @@
// Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
@@ -210,6 +210,132 @@ InferenceStatsAggregator::UpdateSuccessCacheMiss(
#endif // TRITON_ENABLE_METRICS
}

Status
InferenceStatsAggregator::UpdateResponseSuccess(
const std::string& key, const uint64_t response_start_ns,
const uint64_t compute_output_start_ns, const uint64_t response_end_ns)
{
if (response_start_ns > compute_output_start_ns) {
return Status(
Status::Code::INVALID_ARG,
"Response start cannot happen after compute output start");
}
if (compute_output_start_ns > response_end_ns) {
return Status(
Status::Code::INVALID_ARG,
"Compute output start cannot happen after response end");
}
const uint64_t compute_infer_duration_ns =
compute_output_start_ns - response_start_ns;
const uint64_t compute_output_duration_ns =
response_end_ns - compute_output_start_ns;
const uint64_t total_duration_ns = response_end_ns - response_start_ns;

{
std::lock_guard<std::mutex> lock(mu_);

auto it = response_stats_.find(key);
if (it == response_stats_.end()) {
it = response_stats_.emplace(key, InferResponseStats()).first;
}

it->second.compute_infer_count++;
it->second.compute_infer_duration_ns += compute_infer_duration_ns;
it->second.compute_output_count++;
it->second.compute_output_duration_ns += compute_output_duration_ns;
it->second.success_count++;
it->second.success_duration_ns += total_duration_ns;
}

return Status::Success;
}

Status
InferenceStatsAggregator::UpdateResponseFail(
const std::string& key, const uint64_t response_start_ns,
const uint64_t compute_output_start_ns, const uint64_t response_end_ns)
{
uint64_t compute_infer_duration_ns, compute_output_duration_ns,
total_duration_ns;
if (compute_output_start_ns > 0) {
// output tensors copied
if (response_start_ns > compute_output_start_ns) {
return Status(
Status::Code::INVALID_ARG,
"Response start cannot happen after compute output start");
}
if (compute_output_start_ns > response_end_ns) {
return Status(
Status::Code::INVALID_ARG,
"Compute output start cannot happen after response end");
}
compute_infer_duration_ns = compute_output_start_ns - response_start_ns;
compute_output_duration_ns = response_end_ns - compute_output_start_ns;
total_duration_ns = response_end_ns - response_start_ns;
} else {
// no output tensors copied
if (response_start_ns > response_end_ns) {
return Status(
Status::Code::INVALID_ARG,
"Response start cannot happen after response end");
}
compute_infer_duration_ns = response_end_ns - response_start_ns;
compute_output_duration_ns = 0;
total_duration_ns = response_end_ns - response_start_ns;
}

{
std::lock_guard<std::mutex> lock(mu_);

auto it = response_stats_.find(key);
if (it == response_stats_.end()) {
it = response_stats_.emplace(key, InferResponseStats()).first;
}

it->second.compute_infer_count++;
it->second.compute_infer_duration_ns += compute_infer_duration_ns;
if (compute_output_duration_ns > 0) {
it->second.compute_output_count++;
it->second.compute_output_duration_ns += compute_output_duration_ns;
}
it->second.fail_count++;
it->second.fail_duration_ns += total_duration_ns;
}

return Status::Success;
}

Status
InferenceStatsAggregator::UpdateResponseEmpty(
const std::string& key, const uint64_t response_start_ns,
const uint64_t response_end_ns)
{
if (response_start_ns > response_end_ns) {
return Status(
Status::Code::INVALID_ARG,
"Response start cannot happen after response end");
}
const uint64_t compute_infer_duration_ns =
response_end_ns - response_start_ns;
const uint64_t total_duration_ns = response_end_ns - response_start_ns;

{
std::lock_guard<std::mutex> lock(mu_);

auto it = response_stats_.find(key);
if (it == response_stats_.end()) {
it = response_stats_.emplace(key, InferResponseStats()).first;
}

it->second.compute_infer_count++;
it->second.compute_infer_duration_ns += compute_infer_duration_ns;
it->second.empty_response_count++;
it->second.empty_response_duration_ns += total_duration_ns;
}

return Status::Success;
}

void
InferenceStatsAggregator::UpdateInferBatchStats(
MetricModelReporter* metric_reporter, const size_t batch_size,
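As an illustration of how a single response's timestamps are split by these updates (hypothetical nanosecond values, not taken from the diff):

```c++
// Suppose one successful decoupled response reports:
//   response_start_ns       = 1'000
//   compute_output_start_ns = 1'600
//   response_end_ns         = 2'000
// UpdateResponseSuccess then accumulates, under that response's key:
//   compute_infer_duration_ns  += 600    // 1'600 - 1'000
//   compute_output_duration_ns += 400    // 2'000 - 1'600
//   success_duration_ns        += 1'000  // 2'000 - 1'000
// and increments compute_infer_count, compute_output_count, and
// success_count by one each.
```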
44 changes: 43 additions & 1 deletion src/infer_stats.h
@@ -1,4 +1,4 @@
// Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
@@ -78,6 +78,27 @@ class InferenceStatsAggregator {
uint64_t cache_miss_duration_ns_;
};

struct InferResponseStats {
InferResponseStats()
: compute_infer_count(0), compute_infer_duration_ns(0),
compute_output_count(0), compute_output_duration_ns(0),
success_count(0), success_duration_ns(0), fail_count(0),
fail_duration_ns(0), empty_response_count(0),
empty_response_duration_ns(0)
{
}
uint64_t compute_infer_count;
uint64_t compute_infer_duration_ns;
uint64_t compute_output_count;
uint64_t compute_output_duration_ns;
uint64_t success_count;
uint64_t success_duration_ns;
uint64_t fail_count;
uint64_t fail_duration_ns;
uint64_t empty_response_count;
uint64_t empty_response_duration_ns;
};

struct InferBatchStats {
InferBatchStats()
: count_(0), compute_input_duration_ns_(0),
@@ -100,6 +121,11 @@ class InferenceStatsAggregator {
uint64_t InferenceCount() const { return inference_count_; }
uint64_t ExecutionCount() const { return execution_count_; }
const InferStats& ImmutableInferStats() const { return infer_stats_; }
const std::map<std::string, InferResponseStats>& ImmutableInferResponseStats()
const
{
return response_stats_;
}
const std::map<size_t, InferBatchStats>& ImmutableInferBatchStats() const
{
return batch_stats_;
@@ -140,6 +166,21 @@ class InferenceStatsAggregator {
MetricModelReporter* metric_reporter,
const uint64_t cache_miss_duration_ns);

// Add durations to response stats for a successful response.
Status UpdateResponseSuccess(
const std::string& key, const uint64_t response_start_ns,
const uint64_t compute_output_start_ns, const uint64_t response_end_ns);

// Add durations to response stats for a failed response.
Status UpdateResponseFail(
const std::string& key, const uint64_t response_start_ns,
const uint64_t compute_output_start_ns, const uint64_t response_end_ns);

// Add durations to response stats for an empty response.
Status UpdateResponseEmpty(
const std::string& key, const uint64_t response_start_ns,
const uint64_t response_end_ns);

// Add durations to batch infer stats for a batch execution.
// 'success_request_count' is the number of success requests in the
// batch that have infer_stats attached.
@@ -163,6 +204,7 @@
uint64_t inference_count_;
uint64_t execution_count_;
InferStats infer_stats_;
std::map<std::string, InferResponseStats> response_stats_;
std::map<size_t, InferBatchStats> batch_stats_;
#endif // TRITON_ENABLE_STATS
};
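A minimal sketch (the `aggregator` variable is assumed, not part of this diff) of how a consumer, such as code that serializes model statistics, could read the new per-response map through the accessor added above:

```c++
// `aggregator` is an InferenceStatsAggregator owned by the model (assumed).
const auto& response_stats = aggregator.ImmutableInferResponseStats();
for (const auto& entry : response_stats) {
  const std::string& key = entry.first;  // response index within a request
  const auto& rstats = entry.second;     // InferResponseStats counters
  // e.g. rstats.success_count, rstats.success_duration_ns,
  //      rstats.fail_count, rstats.empty_response_count, ...
}
```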