From 33c1e935dc7260239abcec1bc35a007363be736c Mon Sep 17 00:00:00 2001 From: Indrajit Bhosale Date: Wed, 24 Jul 2024 22:59:37 -0700 Subject: [PATCH 01/32] Park --- src/grpc/infer_handler.h | 56 ++++++++++++++++++++++++++++++-- src/grpc/stream_infer_handler.cc | 50 ++++++++++++++++++++++++++-- 2 files changed, 101 insertions(+), 5 deletions(-) diff --git a/src/grpc/infer_handler.h b/src/grpc/infer_handler.h index 6ef03807a2..84ec9c0963 100644 --- a/src/grpc/infer_handler.h +++ b/src/grpc/infer_handler.h @@ -642,7 +642,7 @@ class InferHandlerState { ::grpc::ServerCompletionQueue* cq, const uint64_t unique_id = 0) : cq_(cq), unique_id_(unique_id), ongoing_requests_(0), step_(Steps::START), finish_ok_(true), ongoing_write_(false), - received_notification_(false) + received_notification_(false), grpc_strict_(false) { ctx_.reset(new ::grpc::ServerContext()); responder_.reset(new ServerResponderType(ctx_.get())); @@ -669,6 +669,44 @@ class InferHandlerState { return received_notification_ ? ctx_->IsCancelled() : false; } + // Extracts headers from GRPC request and updates state + void ExtractStateFromHeaders(InferHandlerStateType* state) + { + // Probably need to lock + LOG_VERBOSE(1) << "GRPC ExtractStateFromHeaders called" << std::endl; + const auto& metadata = state->context_->ctx_->client_metadata(); + for (const auto& pair : metadata) { + auto& key = pair.first; + std::string param_key = std::string(key.begin(), key.end()); + std::string grpc_strict_key = "grpc_strict"; + if (param_key.compare(grpc_strict_key) == 0) { + // They are equal + state->context_->grpc_strict_ = true; + LOG_VERBOSE(1) << "GRPC streaming strict flag detected" << std::endl; + } + } + } + + void sendGRPCStrictResponse(InferHandlerStateType* state) + { + // Check if streaming error detected AND grpc_mode is strict + if (state->context_->grpc_strict_) { + if (state->IsStreamError()) { + ::grpc::Status dummy_status = ::grpc::Status( + ::grpc::StatusCode::UNAVAILABLE, std::string("Dummy Status")); + // state->context_->responder_->Finish(state->status_, state); + state->context_->step_ = Steps::COMPLETE; + state->step_ = Steps::PARTIAL_COMPLETION; + + state->context_->responder_->Finish(dummy_status, state); + LOG_VERBOSE(1) << "GRPC streaming error detected inside finish: " + << state->status_.error_code() << std::endl; + IssueRequestCancellation(); + } + } else { + LOG_VERBOSE(1) << "GRPC mode NOT strict in Finish" << std::endl; + } + } // Increments the ongoing request counter void IncrementRequestCounter() { ongoing_requests_++; } @@ -996,6 +1034,9 @@ class InferHandlerState { // Tracks whether the async notification has been delivered by // completion queue. bool received_notification_; + + // True if there is an ongoing write to the grpc stream + std::atomic grpc_strict_; }; // This constructor is used to build a wrapper state object @@ -1003,8 +1044,11 @@ class InferHandlerState { // object is used to distinguish a tag from AsyncNotifyWhenDone() // signal. 
explicit InferHandlerState(Steps start_step, InferHandlerState* state) - : step_(start_step), state_ptr_(state), async_notify_state_(false) + : step_(start_step), state_ptr_(state), async_notify_state_(false), + grpc_stream_error_state_(false) { + LOG_VERBOSE(1) + << "grpc_stream_error_state_ init called in InferHandlerState \n"; state->MarkAsAsyncNotifyState(); } @@ -1013,6 +1057,8 @@ class InferHandlerState { const std::shared_ptr& context, Steps start_step = Steps::START) : tritonserver_(tritonserver), async_notify_state_(false) { + LOG_VERBOSE(1) + << "grpc_stream_error_state_ init called in InferHandlerState 2 \n"; // For debugging and testing const char* dstr = getenv("TRITONSERVER_DELAY_GRPC_RESPONSE"); delay_response_ms_ = 0; @@ -1058,6 +1104,7 @@ class InferHandlerState { // wrapper state object in WAITING_NOTIFICATION step. state_ptr_ = nullptr; async_notify_state_ = false; + grpc_stream_error_state_ = false; } void Release() @@ -1087,7 +1134,8 @@ class InferHandlerState { void MarkAsAsyncNotifyState() { async_notify_state_ = true; } bool IsAsyncNotifyState() { return async_notify_state_; } - + void MarkIsStreamError() { grpc_stream_error_state_ = true; } + bool IsStreamError() { return grpc_stream_error_state_; } // Needed in the response handle for classification outputs. TRITONSERVER_Server* tritonserver_; @@ -1098,6 +1146,7 @@ class InferHandlerState { std::shared_ptr context_; Steps step_; std::recursive_mutex step_mtx_; + std::recursive_mutex grpc_strict_mtx_; // Shared pointer to the inference request object. The lifetime of // inference request object is extended till all the responses from @@ -1139,6 +1188,7 @@ class InferHandlerState { // Tracks whether this state object has been wrapped and send to // AsyncNotifyWhenDone() function as a tag. bool async_notify_state_; + bool grpc_stream_error_state_; }; diff --git a/src/grpc/stream_infer_handler.cc b/src/grpc/stream_infer_handler.cc index 269808c78a..7dcac8ea6d 100644 --- a/src/grpc/stream_infer_handler.cc +++ b/src/grpc/stream_infer_handler.cc @@ -140,6 +140,13 @@ ModelStreamInferHandler::Process(InferHandler::State* state, bool rpc_ok) // This means that we only need to take care of // synchronizing this thread and the ResponseComplete // threads. + { + std::lock_guard lock(state->grpc_strict_mtx_); + if (state->IsStreamError() && state->context_->grpc_strict_) { + LOG_VERBOSE(1) << "Ignoring new client request in strict mode \n"; + return false; + } + } if (state->context_->ReceivedNotification()) { std::lock_guard lock(state->step_mtx_); if (state->IsGrpcContextCancelled()) { @@ -189,7 +196,7 @@ ModelStreamInferHandler::Process(InferHandler::State* state, bool rpc_ok) state->context_->responder_->Finish(status, state); return !finished; } - + state->context_->ExtractStateFromHeaders(state); } else if (state->step_ == Steps::READ) { TRITONSERVER_Error* err = nullptr; const inference::ModelInferRequest& request = state->request_; @@ -328,7 +335,13 @@ ModelStreamInferHandler::Process(InferHandler::State* state, bool rpc_ok) // initiated... the completion callback will transition to // WRITEREADY or WRITTEN or CANCELLED. Recording the state and the // irequest to handle gRPC stream cancellation. + // std::time_t currentTime = std::time(nullptr); + + // // Divide the current time by 2 + // std::time_t modTime = currentTime % 2; + if (err == nullptr) { + // if (modTime) { state->context_->InsertInflightState(state); // The payload will be cleaned in request release callback. 
request_release_payload.release(); @@ -355,6 +368,17 @@ ModelStreamInferHandler::Process(InferHandler::State* state, bool rpc_ok) GrpcStatusUtil::Create(&status, err); TRITONSERVER_ErrorDelete(err); response->set_error_message(status.error_message()); + // if(state->context_->grpc_strict_) { + // // Set to finish + // state->status_ = status; + // state->MarkIsStreamError(); + // LOG_VERBOSE(1) << "GRPC streaming error detected : " << + // status.error_code() << std::endl; + // } else { + // LOG_VERBOSE(1) << "GRPC streaming error detected BUT mode NOT strict: + // " << status.error_code() << std::endl; + // } + response->mutable_infer_response()->Clear(); // repopulate the id so that client knows which request failed. @@ -633,12 +657,34 @@ ModelStreamInferHandler::StreamInferResponseComplete( LOG_ERROR << "expected the response allocator to have added the response"; } - if (err != nullptr) { + std::time_t currentTime = std::time(nullptr); + + // Divide the current time by 2 + std::time_t modTime = currentTime % 2; + + if (modTime) { + LOG_VERBOSE(1) << "Generating fake error" << std::endl; failed = true; ::grpc::Status status; GrpcStatusUtil::Create(&status, err); response->mutable_infer_response()->Clear(); response->set_error_message(status.error_message()); + if (state->context_->grpc_strict_) { + // Set to finish + std::lock_guard lock(state->grpc_strict_mtx_); + state->status_ = status; + if (!state->IsStreamError()) { + // Finish only once, if backend ignores cancellation + state->MarkIsStreamError(); + LOG_VERBOSE(1) << "GRPC streaming error detected (Only once)" + << status.error_code() << std::endl; + state->context_->sendGRPCStrictResponse(state); + } + return; + } else { + LOG_VERBOSE(1) << "GRPC streaming error detected BUT mode NOT strict: " + << status.error_code() << std::endl; + } LOG_VERBOSE(1) << "Failed for ID: " << log_request_id << std::endl; } From 3affd3a676705fcff09db95d4e0f72966cdb32b3 Mon Sep 17 00:00:00 2001 From: Indrajit Bhosale Date: Tue, 30 Jul 2024 12:31:10 -0700 Subject: [PATCH 02/32] Park --- src/grpc/infer_handler.h | 53 ++++++++++++++++++-------------- src/grpc/stream_infer_handler.cc | 26 ++++++---------- 2 files changed, 39 insertions(+), 40 deletions(-) diff --git a/src/grpc/infer_handler.h b/src/grpc/infer_handler.h index 84ec9c0963..5b519d0b80 100644 --- a/src/grpc/infer_handler.h +++ b/src/grpc/infer_handler.h @@ -691,17 +691,21 @@ class InferHandlerState { { // Check if streaming error detected AND grpc_mode is strict if (state->context_->grpc_strict_) { - if (state->IsStreamError()) { - ::grpc::Status dummy_status = ::grpc::Status( - ::grpc::StatusCode::UNAVAILABLE, std::string("Dummy Status")); - // state->context_->responder_->Finish(state->status_, state); - state->context_->step_ = Steps::COMPLETE; - state->step_ = Steps::PARTIAL_COMPLETION; - - state->context_->responder_->Finish(dummy_status, state); - LOG_VERBOSE(1) << "GRPC streaming error detected inside finish: " - << state->status_.error_code() << std::endl; - IssueRequestCancellation(); + { + std::lock_guard lock(grpc_strict_mu_); + if (!state->context_->IsStreamError()) { + ::grpc::Status dummy_status = ::grpc::Status( + ::grpc::StatusCode::UNAVAILABLE, std::string("Dummy Status")); + // state->context_->responder_->Finish(state->status_, state); + state->context_->step_ = Steps::COMPLETE; + state->step_ = Steps::PARTIAL_COMPLETION; + + state->context_->responder_->Finish(dummy_status, state); + LOG_VERBOSE(1) << "GRPC streaming error detected inside finish: " + << 
state->status_.error_code() << std::endl; + state->context_->MarkIsStreamError(); + IssueRequestCancellation(false); + } } } else { LOG_VERBOSE(1) << "GRPC mode NOT strict in Finish" << std::endl; @@ -784,7 +788,7 @@ class InferHandlerState { // Issues the cancellation for all inflight requests // being tracked by this context. - void IssueRequestCancellation() + void IssueRequestCancellation(bool grpc_strict) { { std::lock_guard lock(mu_); @@ -817,7 +821,9 @@ class InferHandlerState { // The RPC is complete and no callback will be invoked to retrieve // the object. Hence, need to explicitly place the state on the // completion queue. - PutTaskBackToQueue(state); + if(!grpc_strict) { + PutTaskBackToQueue(state); + } } } } @@ -851,7 +857,7 @@ class InferHandlerState { // issue cancellation request to all the inflight // states belonging to the context. if (state->context_->step_ != Steps::CANCELLED) { - IssueRequestCancellation(); + IssueRequestCancellation(false); // Mark the context as cancelled state->context_->step_ = Steps::CANCELLED; @@ -979,6 +985,11 @@ class InferHandlerState { return false; } + void MarkIsStreamError() { + grpc_stream_error_state_ = true; } + bool IsStreamError() { + return grpc_stream_error_state_; } + // Return true if this context has completed all reads and writes. bool IsRequestsCompleted() { @@ -1006,6 +1017,7 @@ class InferHandlerState { // orders. A state enters this queue when it has successfully read // a request and exits the queue when it is written. std::recursive_mutex mu_; + std::recursive_mutex grpc_strict_mu_; std::queue states_; std::atomic ongoing_requests_; @@ -1037,6 +1049,8 @@ class InferHandlerState { // True if there is an ongoing write to the grpc stream std::atomic grpc_strict_; + + bool grpc_stream_error_state_; }; // This constructor is used to build a wrapper state object @@ -1044,11 +1058,8 @@ class InferHandlerState { // object is used to distinguish a tag from AsyncNotifyWhenDone() // signal. explicit InferHandlerState(Steps start_step, InferHandlerState* state) - : step_(start_step), state_ptr_(state), async_notify_state_(false), - grpc_stream_error_state_(false) + : step_(start_step), state_ptr_(state), async_notify_state_(false) { - LOG_VERBOSE(1) - << "grpc_stream_error_state_ init called in InferHandlerState \n"; state->MarkAsAsyncNotifyState(); } @@ -1104,7 +1115,6 @@ class InferHandlerState { // wrapper state object in WAITING_NOTIFICATION step. state_ptr_ = nullptr; async_notify_state_ = false; - grpc_stream_error_state_ = false; } void Release() @@ -1134,8 +1144,6 @@ class InferHandlerState { void MarkAsAsyncNotifyState() { async_notify_state_ = true; } bool IsAsyncNotifyState() { return async_notify_state_; } - void MarkIsStreamError() { grpc_stream_error_state_ = true; } - bool IsStreamError() { return grpc_stream_error_state_; } // Needed in the response handle for classification outputs. TRITONSERVER_Server* tritonserver_; @@ -1146,7 +1154,7 @@ class InferHandlerState { std::shared_ptr context_; Steps step_; std::recursive_mutex step_mtx_; - std::recursive_mutex grpc_strict_mtx_; + // std::recursive_mutex grpc_strict_mtx_; // Shared pointer to the inference request object. The lifetime of // inference request object is extended till all the responses from @@ -1188,7 +1196,6 @@ class InferHandlerState { // Tracks whether this state object has been wrapped and send to // AsyncNotifyWhenDone() function as a tag. 
bool async_notify_state_; - bool grpc_stream_error_state_; }; diff --git a/src/grpc/stream_infer_handler.cc b/src/grpc/stream_infer_handler.cc index 7dcac8ea6d..a702cc6261 100644 --- a/src/grpc/stream_infer_handler.cc +++ b/src/grpc/stream_infer_handler.cc @@ -140,12 +140,13 @@ ModelStreamInferHandler::Process(InferHandler::State* state, bool rpc_ok) // This means that we only need to take care of // synchronizing this thread and the ResponseComplete // threads. - { - std::lock_guard lock(state->grpc_strict_mtx_); - if (state->IsStreamError() && state->context_->grpc_strict_) { - LOG_VERBOSE(1) << "Ignoring new client request in strict mode \n"; - return false; - } + // We need an explicit finish indicator. Can't use 'state->step_' + // because we launch an async thread that could update 'state's + // step_ to be FINISH before this thread exits this function. + bool finished = false; + + if(state->context_ == nullptr) { + return !finished; } if (state->context_->ReceivedNotification()) { std::lock_guard lock(state->step_mtx_); @@ -163,11 +164,6 @@ ModelStreamInferHandler::Process(InferHandler::State* state, bool rpc_ok) << ", context " << state->context_->unique_id_ << ", " << state->unique_id_ << " step " << state->step_; - // We need an explicit finish indicator. Can't use 'state->step_' - // because we launch an async thread that could update 'state's - // step_ to be FINISH before this thread exits this function. - bool finished = false; - if (state->step_ == Steps::START) { // A new stream connection... If RPC failed on a new request then // the server is shutting down and so we should do nothing. @@ -671,22 +667,18 @@ ModelStreamInferHandler::StreamInferResponseComplete( response->set_error_message(status.error_message()); if (state->context_->grpc_strict_) { // Set to finish - std::lock_guard lock(state->grpc_strict_mtx_); + // std::lock_guard lock(state->grpc_strict_mtx_); state->status_ = status; - if (!state->IsStreamError()) { // Finish only once, if backend ignores cancellation - state->MarkIsStreamError(); LOG_VERBOSE(1) << "GRPC streaming error detected (Only once)" << status.error_code() << std::endl; state->context_->sendGRPCStrictResponse(state); } return; } else { - LOG_VERBOSE(1) << "GRPC streaming error detected BUT mode NOT strict: " - << status.error_code() << std::endl; + LOG_VERBOSE(1) << "GRPC streaming error detected BUT mode NOT strict"; } LOG_VERBOSE(1) << "Failed for ID: " << log_request_id << std::endl; - } TRITONSERVER_ErrorDelete(err); LOG_TRITONSERVER_ERROR( From 7f86c6a76666aa54bb66aa0b85c0c225d0557d2b Mon Sep 17 00:00:00 2001 From: Indrajit Bhosale Date: Wed, 31 Jul 2024 14:58:38 -0700 Subject: [PATCH 03/32] Working Set --- src/grpc/infer_handler.h | 15 +++++++------- src/grpc/stream_infer_handler.cc | 34 +++++++++++++++++++++----------- 2 files changed, 29 insertions(+), 20 deletions(-) diff --git a/src/grpc/infer_handler.h b/src/grpc/infer_handler.h index 5b519d0b80..c839e8c7fc 100644 --- a/src/grpc/infer_handler.h +++ b/src/grpc/infer_handler.h @@ -642,7 +642,8 @@ class InferHandlerState { ::grpc::ServerCompletionQueue* cq, const uint64_t unique_id = 0) : cq_(cq), unique_id_(unique_id), ongoing_requests_(0), step_(Steps::START), finish_ok_(true), ongoing_write_(false), - received_notification_(false), grpc_strict_(false) + received_notification_(false), grpc_strict_(false), + grpc_stream_error_state_(false) { ctx_.reset(new ::grpc::ServerContext()); responder_.reset(new ServerResponderType(ctx_.get())); @@ -702,7 +703,7 @@ class InferHandlerState { 
state->context_->responder_->Finish(dummy_status, state); LOG_VERBOSE(1) << "GRPC streaming error detected inside finish: " - << state->status_.error_code() << std::endl; + << state->status_.error_code() << std::endl; state->context_->MarkIsStreamError(); IssueRequestCancellation(false); } @@ -821,7 +822,7 @@ class InferHandlerState { // The RPC is complete and no callback will be invoked to retrieve // the object. Hence, need to explicitly place the state on the // completion queue. - if(!grpc_strict) { + if (!grpc_strict) { PutTaskBackToQueue(state); } } @@ -985,10 +986,8 @@ class InferHandlerState { return false; } - void MarkIsStreamError() { - grpc_stream_error_state_ = true; } - bool IsStreamError() { - return grpc_stream_error_state_; } + void MarkIsStreamError() { grpc_stream_error_state_ = true; } + bool IsStreamError() { return grpc_stream_error_state_; } // Return true if this context has completed all reads and writes. bool IsRequestsCompleted() @@ -1050,7 +1049,7 @@ class InferHandlerState { // True if there is an ongoing write to the grpc stream std::atomic grpc_strict_; - bool grpc_stream_error_state_; + std::atomic grpc_stream_error_state_; }; // This constructor is used to build a wrapper state object diff --git a/src/grpc/stream_infer_handler.cc b/src/grpc/stream_infer_handler.cc index a702cc6261..42eb48c84e 100644 --- a/src/grpc/stream_infer_handler.cc +++ b/src/grpc/stream_infer_handler.cc @@ -144,10 +144,14 @@ ModelStreamInferHandler::Process(InferHandler::State* state, bool rpc_ok) // because we launch an async thread that could update 'state's // step_ to be FINISH before this thread exits this function. bool finished = false; - - if(state->context_ == nullptr) { + if (state->context_ == nullptr) { + return !finished; + } + std::lock_guard lock(state->context_->mu_); + if (state->context_->IsStreamError()) { return !finished; } + if (state->context_->ReceivedNotification()) { std::lock_guard lock(state->step_mtx_); if (state->IsGrpcContextCancelled()) { @@ -577,7 +581,12 @@ ModelStreamInferHandler::StreamInferResponseComplete( void* userp) { State* state = reinterpret_cast(userp); - + // Ignore Response from CORE in case GRPC Strict as we dont care about + LOG_VERBOSE(1) << "Dead Response from CORE"; + std::lock_guard lock(state->context_->mu_); + if (state->context_->IsStreamError()) { + return; + } // Increment the callback index uint32_t response_index = state->cb_count_++; @@ -662,6 +671,7 @@ ModelStreamInferHandler::StreamInferResponseComplete( LOG_VERBOSE(1) << "Generating fake error" << std::endl; failed = true; ::grpc::Status status; + // Converts CORE errors to GRPC error codes GrpcStatusUtil::Create(&status, err); response->mutable_infer_response()->Clear(); response->set_error_message(status.error_message()); @@ -669,16 +679,16 @@ ModelStreamInferHandler::StreamInferResponseComplete( // Set to finish // std::lock_guard lock(state->grpc_strict_mtx_); state->status_ = status; - // Finish only once, if backend ignores cancellation - LOG_VERBOSE(1) << "GRPC streaming error detected (Only once)" - << status.error_code() << std::endl; - state->context_->sendGRPCStrictResponse(state); - } - return; - } else { - LOG_VERBOSE(1) << "GRPC streaming error detected BUT mode NOT strict"; + // Finish only once, if backend ignores cancellation + LOG_VERBOSE(1) << "GRPC streaming error detected (Only once)" + << status.error_code() << std::endl; + state->context_->sendGRPCStrictResponse(state); } - LOG_VERBOSE(1) << "Failed for ID: " << log_request_id << std::endl; + 
return; + } else { + LOG_VERBOSE(1) << "GRPC streaming error detected BUT mode NOT strict"; + } + LOG_VERBOSE(1) << "Failed for ID: " << log_request_id << std::endl; TRITONSERVER_ErrorDelete(err); LOG_TRITONSERVER_ERROR( From b65fd742dd16f915e2693759a417a8e44b69e0d1 Mon Sep 17 00:00:00 2001 From: Indrajit Bhosale Date: Wed, 31 Jul 2024 18:34:47 -0700 Subject: [PATCH 04/32] Working Set --- src/grpc/infer_handler.h | 32 +++++++++++++++---------------- src/grpc/stream_infer_handler.cc | 33 +++++--------------------------- 2 files changed, 20 insertions(+), 45 deletions(-) diff --git a/src/grpc/infer_handler.h b/src/grpc/infer_handler.h index c839e8c7fc..4fb5c3f31c 100644 --- a/src/grpc/infer_handler.h +++ b/src/grpc/infer_handler.h @@ -694,22 +694,19 @@ class InferHandlerState { if (state->context_->grpc_strict_) { { std::lock_guard lock(grpc_strict_mu_); - if (!state->context_->IsStreamError()) { - ::grpc::Status dummy_status = ::grpc::Status( - ::grpc::StatusCode::UNAVAILABLE, std::string("Dummy Status")); - // state->context_->responder_->Finish(state->status_, state); + // Check if Error not responded previously + // Avoid closing connection twice on multiple errors from core + if (!state->context_->IsGRPCStrictError()) { state->context_->step_ = Steps::COMPLETE; state->step_ = Steps::PARTIAL_COMPLETION; - - state->context_->responder_->Finish(dummy_status, state); + state->context_->responder_->Finish(state->status_, state); LOG_VERBOSE(1) << "GRPC streaming error detected inside finish: " << state->status_.error_code() << std::endl; - state->context_->MarkIsStreamError(); - IssueRequestCancellation(false); + // Mark error for this stream + state->context_->MarkGRPCStrictError(); + IssueRequestCancellation(); } } - } else { - LOG_VERBOSE(1) << "GRPC mode NOT strict in Finish" << std::endl; } } // Increments the ongoing request counter @@ -789,7 +786,7 @@ class InferHandlerState { // Issues the cancellation for all inflight requests // being tracked by this context. - void IssueRequestCancellation(bool grpc_strict) + void IssueRequestCancellation() { { std::lock_guard lock(mu_); @@ -822,9 +819,7 @@ class InferHandlerState { // The RPC is complete and no callback will be invoked to retrieve // the object. Hence, need to explicitly place the state on the // completion queue. - if (!grpc_strict) { - PutTaskBackToQueue(state); - } + PutTaskBackToQueue(state); } } } @@ -858,7 +853,7 @@ class InferHandlerState { // issue cancellation request to all the inflight // states belonging to the context. if (state->context_->step_ != Steps::CANCELLED) { - IssueRequestCancellation(false); + IssueRequestCancellation(); // Mark the context as cancelled state->context_->step_ = Steps::CANCELLED; @@ -986,8 +981,11 @@ class InferHandlerState { return false; } - void MarkIsStreamError() { grpc_stream_error_state_ = true; } - bool IsStreamError() { return grpc_stream_error_state_; } + // Marks error after it has been responded to + void MarkGRPCStrictError() { grpc_stream_error_state_ = true; } + + // Checks if error already responded to in grpc_strict mode + bool IsGRPCStrictError() { return grpc_stream_error_state_; } // Return true if this context has completed all reads and writes. 
bool IsRequestsCompleted() diff --git a/src/grpc/stream_infer_handler.cc b/src/grpc/stream_infer_handler.cc index 42eb48c84e..42ff740098 100644 --- a/src/grpc/stream_infer_handler.cc +++ b/src/grpc/stream_infer_handler.cc @@ -144,11 +144,9 @@ ModelStreamInferHandler::Process(InferHandler::State* state, bool rpc_ok) // because we launch an async thread that could update 'state's // step_ to be FINISH before this thread exits this function. bool finished = false; - if (state->context_ == nullptr) { - return !finished; - } std::lock_guard lock(state->context_->mu_); - if (state->context_->IsStreamError()) { + // Check if stream error detected and already connection ended + if (state->context_->IsGRPCStrictError()) { return !finished; } @@ -368,18 +366,6 @@ ModelStreamInferHandler::Process(InferHandler::State* state, bool rpc_ok) GrpcStatusUtil::Create(&status, err); TRITONSERVER_ErrorDelete(err); response->set_error_message(status.error_message()); - // if(state->context_->grpc_strict_) { - // // Set to finish - // state->status_ = status; - // state->MarkIsStreamError(); - // LOG_VERBOSE(1) << "GRPC streaming error detected : " << - // status.error_code() << std::endl; - // } else { - // LOG_VERBOSE(1) << "GRPC streaming error detected BUT mode NOT strict: - // " << status.error_code() << std::endl; - // } - - response->mutable_infer_response()->Clear(); // repopulate the id so that client knows which request failed. response->mutable_infer_response()->set_id(request.id()); @@ -584,7 +570,7 @@ ModelStreamInferHandler::StreamInferResponseComplete( // Ignore Response from CORE in case GRPC Strict as we dont care about LOG_VERBOSE(1) << "Dead Response from CORE"; std::lock_guard lock(state->context_->mu_); - if (state->context_->IsStreamError()) { + if (state->context_->IsGRPCStrictError()) { return; } // Increment the callback index @@ -661,14 +647,7 @@ ModelStreamInferHandler::StreamInferResponseComplete( } else { LOG_ERROR << "expected the response allocator to have added the response"; } - - std::time_t currentTime = std::time(nullptr); - - // Divide the current time by 2 - std::time_t modTime = currentTime % 2; - - if (modTime) { - LOG_VERBOSE(1) << "Generating fake error" << std::endl; + if (err != nullptr) { failed = true; ::grpc::Status status; // Converts CORE errors to GRPC error codes @@ -680,13 +659,11 @@ ModelStreamInferHandler::StreamInferResponseComplete( // std::lock_guard lock(state->grpc_strict_mtx_); state->status_ = status; // Finish only once, if backend ignores cancellation - LOG_VERBOSE(1) << "GRPC streaming error detected (Only once)" + LOG_VERBOSE(1) << "GRPC streaming error detected: " << status.error_code() << std::endl; state->context_->sendGRPCStrictResponse(state); } return; - } else { - LOG_VERBOSE(1) << "GRPC streaming error detected BUT mode NOT strict"; } LOG_VERBOSE(1) << "Failed for ID: " << log_request_id << std::endl; From 36e461f3245fb01e37c7cbfe631a43a6d76e7fbc Mon Sep 17 00:00:00 2001 From: Indrajit Bhosale Date: Thu, 1 Aug 2024 11:43:40 -0700 Subject: [PATCH 05/32] Working Set --- src/grpc/infer_handler.h | 30 +++++++++++++----------------- src/grpc/stream_infer_handler.cc | 22 ++++++++++++---------- 2 files changed, 25 insertions(+), 27 deletions(-) diff --git a/src/grpc/infer_handler.h b/src/grpc/infer_handler.h index 4fb5c3f31c..7bd4330ec0 100644 --- a/src/grpc/infer_handler.h +++ b/src/grpc/infer_handler.h @@ -645,6 +645,7 @@ class InferHandlerState { received_notification_(false), grpc_strict_(false), grpc_stream_error_state_(false) { + 
LOG_VERBOSE(1) << "Context Constructor getting called"; ctx_.reset(new ::grpc::ServerContext()); responder_.reset(new ServerResponderType(ctx_.get())); } @@ -690,23 +691,18 @@ class InferHandlerState { void sendGRPCStrictResponse(InferHandlerStateType* state) { - // Check if streaming error detected AND grpc_mode is strict - if (state->context_->grpc_strict_) { - { - std::lock_guard lock(grpc_strict_mu_); - // Check if Error not responded previously - // Avoid closing connection twice on multiple errors from core - if (!state->context_->IsGRPCStrictError()) { - state->context_->step_ = Steps::COMPLETE; - state->step_ = Steps::PARTIAL_COMPLETION; - state->context_->responder_->Finish(state->status_, state); - LOG_VERBOSE(1) << "GRPC streaming error detected inside finish: " - << state->status_.error_code() << std::endl; - // Mark error for this stream - state->context_->MarkGRPCStrictError(); - IssueRequestCancellation(); - } - } + std::lock_guard lock(state->context_->mu_); + // Check if Error not responded previously + // Avoid closing connection twice on multiple errors from core + if (!state->context_->IsGRPCStrictError()) { + state->context_->step_ = Steps::COMPLETE; + state->step_ = Steps::PARTIAL_COMPLETION; + state->context_->responder_->Finish(state->status_, state); + LOG_VERBOSE(1) << "GRPC streaming error detected inside finish: " + << state->status_.error_code() << std::endl; + // Mark error for this stream + state->context_->MarkGRPCStrictError(); + IssueRequestCancellation(); } } // Increments the ongoing request counter diff --git a/src/grpc/stream_infer_handler.cc b/src/grpc/stream_infer_handler.cc index 42ff740098..df196640b1 100644 --- a/src/grpc/stream_infer_handler.cc +++ b/src/grpc/stream_infer_handler.cc @@ -144,12 +144,13 @@ ModelStreamInferHandler::Process(InferHandler::State* state, bool rpc_ok) // because we launch an async thread that could update 'state's // step_ to be FINISH before this thread exits this function. 
bool finished = false; - std::lock_guard lock(state->context_->mu_); - // Check if stream error detected and already connection ended - if (state->context_->IsGRPCStrictError()) { - return !finished; + if (state->context_->grpc_strict_) { + std::lock_guard lock(state->context_->mu_); + // Check if stream error detected and already connection ended + if (state->context_->IsGRPCStrictError()) { + return !finished; + } } - if (state->context_->ReceivedNotification()) { std::lock_guard lock(state->step_mtx_); if (state->IsGrpcContextCancelled()) { @@ -568,10 +569,11 @@ ModelStreamInferHandler::StreamInferResponseComplete( { State* state = reinterpret_cast(userp); // Ignore Response from CORE in case GRPC Strict as we dont care about - LOG_VERBOSE(1) << "Dead Response from CORE"; - std::lock_guard lock(state->context_->mu_); - if (state->context_->IsGRPCStrictError()) { - return; + if (state->context_->grpc_strict_) { + std::lock_guard lock(state->context_->mu_); + if (state->context_->IsGRPCStrictError()) { + return; + } } // Increment the callback index uint32_t response_index = state->cb_count_++; @@ -648,6 +650,7 @@ ModelStreamInferHandler::StreamInferResponseComplete( LOG_ERROR << "expected the response allocator to have added the response"; } if (err != nullptr) { + LOG_VERBOSE(1) << "Error in CORE Response"; failed = true; ::grpc::Status status; // Converts CORE errors to GRPC error codes @@ -663,7 +666,6 @@ ModelStreamInferHandler::StreamInferResponseComplete( << status.error_code() << std::endl; state->context_->sendGRPCStrictResponse(state); } - return; } LOG_VERBOSE(1) << "Failed for ID: " << log_request_id << std::endl; From bd549b16284299c9f56ae6a3ea035d333e616a84 Mon Sep 17 00:00:00 2001 From: Indrajit Bhosale Date: Thu, 1 Aug 2024 12:03:59 -0700 Subject: [PATCH 06/32] Working Set --- src/grpc/stream_infer_handler.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/grpc/stream_infer_handler.cc b/src/grpc/stream_infer_handler.cc index df196640b1..9c62a1a301 100644 --- a/src/grpc/stream_infer_handler.cc +++ b/src/grpc/stream_infer_handler.cc @@ -657,6 +657,7 @@ ModelStreamInferHandler::StreamInferResponseComplete( GrpcStatusUtil::Create(&status, err); response->mutable_infer_response()->Clear(); response->set_error_message(status.error_message()); + LOG_VERBOSE(1) << "Failed for ID: " << log_request_id << std::endl; if (state->context_->grpc_strict_) { // Set to finish // std::lock_guard lock(state->grpc_strict_mtx_); @@ -665,9 +666,9 @@ ModelStreamInferHandler::StreamInferResponseComplete( LOG_VERBOSE(1) << "GRPC streaming error detected: " << status.error_code() << std::endl; state->context_->sendGRPCStrictResponse(state); + return; } } - LOG_VERBOSE(1) << "Failed for ID: " << log_request_id << std::endl; TRITONSERVER_ErrorDelete(err); LOG_TRITONSERVER_ERROR( From 85ccd726a667df19c63a152837e5cb61891605ba Mon Sep 17 00:00:00 2001 From: Indrajit Bhosale Date: Fri, 2 Aug 2024 02:05:39 -0700 Subject: [PATCH 07/32] Tests Added --- .../lifecycle/lifecycle_test.py | 94 ++++++++++++++++++- qa/L0_backend_python/lifecycle/my_test.py | 53 +++++++++++ qa/L0_backend_python/lifecycle/test.sh | 8 ++ .../execute_grpc_error/config.pbtxt | 52 ++++++++++ qa/python_models/execute_grpc_error/model.py | 52 ++++++++++ 5 files changed, 258 insertions(+), 1 deletion(-) create mode 100644 qa/L0_backend_python/lifecycle/my_test.py create mode 100644 qa/python_models/execute_grpc_error/config.pbtxt create mode 100644 qa/python_models/execute_grpc_error/model.py diff --git 
a/qa/L0_backend_python/lifecycle/lifecycle_test.py b/qa/L0_backend_python/lifecycle/lifecycle_test.py index cea94a1dad..f2e52c8c91 100755 --- a/qa/L0_backend_python/lifecycle/lifecycle_test.py +++ b/qa/L0_backend_python/lifecycle/lifecycle_test.py @@ -27,8 +27,11 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import os +import re import sys +import requests + sys.path.append("../../common") import queue @@ -63,6 +66,29 @@ class LifecycleTest(unittest.TestCase): def setUp(self): self._shm_leak_detector = shm_util.ShmLeakDetector() + def _get_metrics(self): + metrics_url = "http://localhost:8002/metrics" + r = requests.get(metrics_url) + r.raise_for_status() + return r.text + + def _metrics_before_test(self, model, reason): + pattern = rf'nv_inference_request_failure\{{model="{model}",reason="{reason}",version="1"\}} (\d+)' + metrics = self._get_metrics() + match = re.search(pattern, metrics) + if match: + return int(match.group(1)) + else: + raise Exception(f"Failure metrics for model='{model}' not found") + + def _assert_metrics( + self, model_name, reason, expected_count_increase, initial_count + ): + metrics = self._get_metrics() + # Add initial count + expected count for the the test + expected_metric = f'nv_inference_request_failure{{model="{model_name}",reason="{reason}",version="1"}} {expected_count_increase + initial_count}' + self.assertIn(expected_metric, metrics) + def test_error_code(self): model_name = "error_code" shape = [1, 1] @@ -181,7 +207,7 @@ def test_batch_error(self): def test_infer_pymodel_error(self): model_name = "wrong_model" shape = [2, 2] - + initial_metrics_value = self._metrics_before_test(model_name, "BACKEND") with self._shm_leak_detector.Probe() as shm_probe: with httpclient.InferenceServerClient( f"{_tritonserver_ipaddr}:8000" @@ -207,6 +233,72 @@ def test_infer_pymodel_error(self): self.assertTrue( False, "Wrong exception raised or did not raise an exception" ) + expected_count_increase = 1 + self._assert_metrics( + model_name, + "BACKEND", + expected_count_increase, + initial_metrics_value, + ) + + def test_grpc_strict_error_on(self): + model_name = "execute_grpc_error" + shape = [2, 2] + number_of_requests = 3 + user_data = UserData() + triton_client = grpcclient.InferenceServerClient(f"{_tritonserver_ipaddr}:8001") + metadata = {"grpc_strict": "true"} + triton_client.start_stream( + callback=partial(callback, user_data), headers=metadata + ) + + with self._shm_leak_detector.Probe() as shm_probe: + input_datas = [] + for i in range(number_of_requests): + input_data = np.random.randn(*shape).astype(np.float32) + input_datas.append(input_data) + inputs = [ + grpcclient.InferInput( + "IN", input_data.shape, np_to_triton_dtype(input_data.dtype) + ) + ] + inputs[0].set_data_from_numpy(input_data) + triton_client.async_stream_infer(model_name=model_name, inputs=inputs) + result = user_data._completed_requests.get() + if i == 1: + # execute_grpc_error intentionally returns error with StatusCode.INTERNAL status on 2nd request + self.assertIsInstance(result, InferenceServerException) + self.assertEqual(str(result.status()), "StatusCode.INTERNAL") + + def test_grpc_strict_error_off(self): + model_name = "execute_grpc_error" + shape = [2, 2] + number_of_requests = 3 + user_data = UserData() + triton_client = grpcclient.InferenceServerClient(f"{_tritonserver_ipaddr}:8001") + triton_client.start_stream(callback=partial(callback, user_data)) + + with self._shm_leak_detector.Probe() as shm_probe: + input_datas = [] + for i in 
range(number_of_requests): + input_data = np.random.randn(*shape).astype(np.float32) + input_datas.append(input_data) + inputs = [ + grpcclient.InferInput( + "IN", input_data.shape, np_to_triton_dtype(input_data.dtype) + ) + ] + inputs[0].set_data_from_numpy(input_data) + triton_client.async_stream_infer(model_name=model_name, inputs=inputs) + result = user_data._completed_requests.get() + if i == 1: + # execute_grpc_error intentionally returns error with StatusCode.INTERNAL status on 2nd request + self.assertIsInstance(result, InferenceServerException) + # Existing Behaviour + self.assertEqual(str(result.status()), "NONE") + if i == 2: + # Stream is not killed + self.assertIsInstance(result, InferResult) if __name__ == "__main__": diff --git a/qa/L0_backend_python/lifecycle/my_test.py b/qa/L0_backend_python/lifecycle/my_test.py new file mode 100644 index 0000000000..25d76910e4 --- /dev/null +++ b/qa/L0_backend_python/lifecycle/my_test.py @@ -0,0 +1,53 @@ +import numpy as np +import tritonclient.grpc as grpcclient +from functools import partial +import queue +from tritonclient.utils import * + +class UserData: + def __init__(self): + self._completed_requests = queue.Queue() + +def callback(user_data, result, error): + if error: + user_data._completed_requests.put(error) + else: + user_data._completed_requests.put(result) + +def grpc_strict_error(): + model_name = "execute_error" + shape = [2, 2] + number_of_requests = 3 + user_data = UserData() + triton_server_url = "localhost:8001" # Replace with your Triton server address + + try: + triton_client = grpcclient.InferenceServerClient(triton_server_url) + metadata = {"grpc_strict": "true"} + + triton_client.start_stream(callback=partial(callback, user_data), headers=metadata) + + input_datas = [] + for i in range(number_of_requests): + input_data = np.random.randn(*shape).astype(np.float32) + input_datas.append(input_data) + inputs = [ + grpcclient.InferInput( + "IN", input_data.shape, np_to_triton_dtype(input_data.dtype) + ) + ] + inputs[0].set_data_from_numpy(input_data) + triton_client.async_stream_infer(model_name=model_name, inputs=inputs) + result = user_data._completed_requests.get() + print(f"Request {i + 1} result:") + print(type(result)) + if type(result) == InferenceServerException: + print(result.status()) + + except Exception as e: + print(f"Error occurred: {str(e)}") + finally: + triton_client.stop_stream() + +if __name__ == "__main__": + grpc_strict_error() \ No newline at end of file diff --git a/qa/L0_backend_python/lifecycle/test.sh b/qa/L0_backend_python/lifecycle/test.sh index dba4581ddd..59b846f56b 100755 --- a/qa/L0_backend_python/lifecycle/test.sh +++ b/qa/L0_backend_python/lifecycle/test.sh @@ -52,6 +52,14 @@ cp ../../python_models/execute_error/config.pbtxt ./models/execute_error/ sed -i "s/^max_batch_size:.*/max_batch_size: 8/" config.pbtxt && \ echo "dynamic_batching { preferred_batch_size: [8], max_queue_delay_microseconds: 12000000 }" >> config.pbtxt) +mkdir -p models/execute_grpc_error/1/ +cp ../../python_models/execute_grpc_error/model.py ./models/execute_grpc_error/1/ +cp ../../python_models/execute_grpc_error/config.pbtxt ./models/execute_grpc_error/ +(cd models/execute_grpc_error && \ + sed -i "s/^name:.*/name: \"execute_grpc_error\"/" config.pbtxt && \ + sed -i "s/^max_batch_size:.*/max_batch_size: 8/" config.pbtxt && \ + echo "dynamic_batching { preferred_batch_size: [8], max_queue_delay_microseconds: 1200000 }" >> config.pbtxt) + mkdir -p models/execute_return_error/1/ cp 
../../python_models/execute_return_error/model.py ./models/execute_return_error/1/ cp ../../python_models/execute_return_error/config.pbtxt ./models/execute_return_error/ diff --git a/qa/python_models/execute_grpc_error/config.pbtxt b/qa/python_models/execute_grpc_error/config.pbtxt new file mode 100644 index 0000000000..3d364f3cc5 --- /dev/null +++ b/qa/python_models/execute_grpc_error/config.pbtxt @@ -0,0 +1,52 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +name: "execute_grpc_error" +backend: "python" +max_batch_size: 64 + +input [ + { + name: "IN" + data_type: TYPE_FP32 + dims: [ -1 ] + } +] + +output [ + { + name: "OUT" + data_type: TYPE_FP32 + dims: [ -1 ] + } +] + +instance_group [ + { + count: 1 + kind : KIND_CPU + } +] diff --git a/qa/python_models/execute_grpc_error/model.py b/qa/python_models/execute_grpc_error/model.py new file mode 100644 index 0000000000..ee74e710f8 --- /dev/null +++ b/qa/python_models/execute_grpc_error/model.py @@ -0,0 +1,52 @@ +# Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import triton_python_backend_utils as pb_utils + + +class TritonPythonModel: + def __init__(self): + # Maintain total inference count, so as to return error on 2nd request, all of this to simulate model failure + self.inf_count = 0 + + def execute(self, requests): + """This function is called on inference request.""" + responses = [] + + # Generate the error for the second request + for request in requests: + input_tensor = pb_utils.get_input_tensor_by_name(request, "IN") + out_tensor = pb_utils.Tensor("OUT", input_tensor.as_numpy()) + if self.inf_count == 0: + responses.append(pb_utils.InferenceResponse([out_tensor])) + elif self.inf_count == 1: + error = pb_utils.TritonError("An error occurred during execution") + responses.append(pb_utils.InferenceResponse([out_tensor], error)) + elif self.inf_count == 2: + responses.append(pb_utils.InferenceResponse([out_tensor])) + self.inf_count += 1 + + return responses From 37b15e83e952adfe59d94b59fc3711fadd1d5d8d Mon Sep 17 00:00:00 2001 From: Indrajit Bhosale Date: Fri, 2 Aug 2024 02:19:03 -0700 Subject: [PATCH 08/32] Clean up --- qa/L0_backend_python/lifecycle/my_test.py | 15 +++++++++++---- src/grpc/infer_handler.h | 15 +++------------ src/grpc/stream_infer_handler.cc | 11 +++-------- 3 files changed, 17 insertions(+), 24 deletions(-) diff --git a/qa/L0_backend_python/lifecycle/my_test.py b/qa/L0_backend_python/lifecycle/my_test.py index 25d76910e4..c4c1899e8f 100644 --- a/qa/L0_backend_python/lifecycle/my_test.py +++ b/qa/L0_backend_python/lifecycle/my_test.py @@ -1,19 +1,23 @@ +import queue +from functools import partial + import numpy as np import tritonclient.grpc as grpcclient -from functools import partial -import queue from tritonclient.utils import * + class UserData: def __init__(self): self._completed_requests = queue.Queue() + def callback(user_data, result, error): if error: user_data._completed_requests.put(error) else: user_data._completed_requests.put(result) + def grpc_strict_error(): model_name = "execute_error" shape = [2, 2] @@ -25,7 +29,9 @@ def grpc_strict_error(): triton_client = grpcclient.InferenceServerClient(triton_server_url) metadata = {"grpc_strict": "true"} - triton_client.start_stream(callback=partial(callback, user_data), headers=metadata) + triton_client.start_stream( + callback=partial(callback, user_data), headers=metadata + ) input_datas = [] for i in range(number_of_requests): @@ -49,5 +55,6 @@ def grpc_strict_error(): finally: triton_client.stop_stream() + if __name__ == "__main__": - grpc_strict_error() \ No newline at end of file + grpc_strict_error() diff --git a/src/grpc/infer_handler.h b/src/grpc/infer_handler.h index 0b2ee94510..9ea4677fd5 100644 --- a/src/grpc/infer_handler.h +++ b/src/grpc/infer_handler.h @@ -645,7 +645,6 @@ class InferHandlerState { received_notification_(false), grpc_strict_(false), grpc_stream_error_state_(false) { - LOG_VERBOSE(1) << "Context Constructor getting called"; ctx_.reset(new ::grpc::ServerContext()); responder_.reset(new 
ServerResponderType(ctx_.get())); } @@ -674,17 +673,13 @@ class InferHandlerState { // Extracts headers from GRPC request and updates state void ExtractStateFromHeaders(InferHandlerStateType* state) { - // Probably need to lock - LOG_VERBOSE(1) << "GRPC ExtractStateFromHeaders called" << std::endl; const auto& metadata = state->context_->ctx_->client_metadata(); for (const auto& pair : metadata) { auto& key = pair.first; std::string param_key = std::string(key.begin(), key.end()); std::string grpc_strict_key = "grpc_strict"; if (param_key.compare(grpc_strict_key) == 0) { - // They are equal state->context_->grpc_strict_ = true; - LOG_VERBOSE(1) << "GRPC streaming strict flag detected" << std::endl; } } } @@ -698,8 +693,6 @@ class InferHandlerState { state->context_->step_ = Steps::COMPLETE; state->step_ = Steps::PARTIAL_COMPLETION; state->context_->responder_->Finish(state->status_, state); - LOG_VERBOSE(1) << "GRPC streaming error detected inside finish: " - << state->status_.error_code() << std::endl; // Mark error for this stream state->context_->MarkGRPCStrictError(); IssueRequestCancellation(); @@ -1010,7 +1003,6 @@ class InferHandlerState { // orders. A state enters this queue when it has successfully read // a request and exits the queue when it is written. std::recursive_mutex mu_; - std::recursive_mutex grpc_strict_mu_; std::queue states_; std::atomic ongoing_requests_; @@ -1043,9 +1035,11 @@ class InferHandlerState { // completion queue. bool received_notification_; - // True if there is an ongoing write to the grpc stream + // True if set by user via header std::atomic grpc_strict_; + // True if stream already encountered error and closed connection + // State maintained to avoid writes on closed stream std::atomic grpc_stream_error_state_; }; @@ -1064,8 +1058,6 @@ class InferHandlerState { const std::shared_ptr& context, Steps start_step = Steps::START) : tritonserver_(tritonserver), async_notify_state_(false) { - LOG_VERBOSE(1) - << "grpc_stream_error_state_ init called in InferHandlerState 2 \n"; // For debugging and testing const char* dstr = getenv("TRITONSERVER_DELAY_GRPC_RESPONSE"); delay_response_ms_ = 0; @@ -1150,7 +1142,6 @@ class InferHandlerState { std::shared_ptr context_; Steps step_; std::recursive_mutex step_mtx_; - // std::recursive_mutex grpc_strict_mtx_; // Shared pointer to the inference request object. The lifetime of // inference request object is extended till all the responses from diff --git a/src/grpc/stream_infer_handler.cc b/src/grpc/stream_infer_handler.cc index 1a7a33998b..85ec4d2c32 100644 --- a/src/grpc/stream_infer_handler.cc +++ b/src/grpc/stream_infer_handler.cc @@ -334,13 +334,7 @@ ModelStreamInferHandler::Process(InferHandler::State* state, bool rpc_ok) // initiated... the completion callback will transition to // WRITEREADY or WRITTEN or CANCELLED. Recording the state and the // irequest to handle gRPC stream cancellation. - // std::time_t currentTime = std::time(nullptr); - - // // Divide the current time by 2 - // std::time_t modTime = currentTime % 2; - if (err == nullptr) { - // if (modTime) { state->context_->InsertInflightState(state); // The payload will be cleaned in request release callback. 
request_release_payload.release(); @@ -702,8 +696,9 @@ ModelStreamInferHandler::StreamInferResponseComplete( // std::lock_guard lock(state->grpc_strict_mtx_); state->status_ = status; // Finish only once, if backend ignores cancellation - LOG_VERBOSE(1) << "GRPC streaming error detected: " - << status.error_code() << std::endl; + LOG_VERBOSE(1) << "GRPC streaming error detected with status: " + << status.error_code() << "Closing stream connection." + << std::endl; state->context_->sendGRPCStrictResponse(state); return; } From a65f8c30f3d63c0a54aad3be90cd857e134c032a Mon Sep 17 00:00:00 2001 From: Indrajit Bhosale Date: Mon, 5 Aug 2024 10:42:40 -0700 Subject: [PATCH 09/32] Tests updated --- .../lifecycle/lifecycle_test.py | 19 ++++++++++++------- qa/python_models/execute_grpc_error/model.py | 10 +++++----- 2 files changed, 17 insertions(+), 12 deletions(-) diff --git a/qa/L0_backend_python/lifecycle/lifecycle_test.py b/qa/L0_backend_python/lifecycle/lifecycle_test.py index f2e52c8c91..6997666090 100755 --- a/qa/L0_backend_python/lifecycle/lifecycle_test.py +++ b/qa/L0_backend_python/lifecycle/lifecycle_test.py @@ -244,7 +244,7 @@ def test_infer_pymodel_error(self): def test_grpc_strict_error_on(self): model_name = "execute_grpc_error" shape = [2, 2] - number_of_requests = 3 + number_of_requests = 2 user_data = UserData() triton_client = grpcclient.InferenceServerClient(f"{_tritonserver_ipaddr}:8001") metadata = {"grpc_strict": "true"} @@ -265,7 +265,11 @@ def test_grpc_strict_error_on(self): inputs[0].set_data_from_numpy(input_data) triton_client.async_stream_infer(model_name=model_name, inputs=inputs) result = user_data._completed_requests.get() - if i == 1: + if i == 0: + # Stream is not killed + output_data = result.as_numpy("OUT") + self.assertIsNotNone(output_data, "error: expected 'OUT'") + elif i == 1: # execute_grpc_error intentionally returns error with StatusCode.INTERNAL status on 2nd request self.assertIsInstance(result, InferenceServerException) self.assertEqual(str(result.status()), "StatusCode.INTERNAL") @@ -273,7 +277,7 @@ def test_grpc_strict_error_on(self): def test_grpc_strict_error_off(self): model_name = "execute_grpc_error" shape = [2, 2] - number_of_requests = 3 + number_of_requests = 4 user_data = UserData() triton_client = grpcclient.InferenceServerClient(f"{_tritonserver_ipaddr}:8001") triton_client.start_stream(callback=partial(callback, user_data)) @@ -291,14 +295,15 @@ def test_grpc_strict_error_off(self): inputs[0].set_data_from_numpy(input_data) triton_client.async_stream_infer(model_name=model_name, inputs=inputs) result = user_data._completed_requests.get() - if i == 1: + if i == 1 or i == 3: # execute_grpc_error intentionally returns error with StatusCode.INTERNAL status on 2nd request self.assertIsInstance(result, InferenceServerException) # Existing Behaviour - self.assertEqual(str(result.status()), "NONE") - if i == 2: + self.assertEqual(str(result.status()), "None") + elif i == 0 or i == 2: # Stream is not killed - self.assertIsInstance(result, InferResult) + output_data = result.as_numpy("OUT") + self.assertIsNotNone(output_data, "error: expected 'OUT'") if __name__ == "__main__": diff --git a/qa/python_models/execute_grpc_error/model.py b/qa/python_models/execute_grpc_error/model.py index ee74e710f8..d5087a49ec 100644 --- a/qa/python_models/execute_grpc_error/model.py +++ b/qa/python_models/execute_grpc_error/model.py @@ -30,7 +30,7 @@ class TritonPythonModel: def __init__(self): # Maintain total inference count, so as to return error on 2nd request, 
all of this to simulate model failure - self.inf_count = 0 + self.inf_count = 1 def execute(self, requests): """This function is called on inference request.""" @@ -40,13 +40,13 @@ def execute(self, requests): for request in requests: input_tensor = pb_utils.get_input_tensor_by_name(request, "IN") out_tensor = pb_utils.Tensor("OUT", input_tensor.as_numpy()) - if self.inf_count == 0: + if self.inf_count % 2: + # Every odd request is success responses.append(pb_utils.InferenceResponse([out_tensor])) - elif self.inf_count == 1: + else: + # Every even request is failure error = pb_utils.TritonError("An error occurred during execution") responses.append(pb_utils.InferenceResponse([out_tensor], error)) - elif self.inf_count == 2: - responses.append(pb_utils.InferenceResponse([out_tensor])) self.inf_count += 1 return responses From 080985c0b82ecf1aa482a387cf60031a3c7b8017 Mon Sep 17 00:00:00 2001 From: Indrajit Bhosale Date: Mon, 5 Aug 2024 23:27:49 -0700 Subject: [PATCH 10/32] Zombie request fixed --- src/grpc/infer_handler.h | 9 +++++---- src/grpc/stream_infer_handler.cc | 11 +++++++---- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/src/grpc/infer_handler.h b/src/grpc/infer_handler.h index 9ea4677fd5..cb10efbdd2 100644 --- a/src/grpc/infer_handler.h +++ b/src/grpc/infer_handler.h @@ -690,12 +690,11 @@ class InferHandlerState { // Check if Error not responded previously // Avoid closing connection twice on multiple errors from core if (!state->context_->IsGRPCStrictError()) { - state->context_->step_ = Steps::COMPLETE; - state->step_ = Steps::PARTIAL_COMPLETION; state->context_->responder_->Finish(state->status_, state); // Mark error for this stream state->context_->MarkGRPCStrictError(); - IssueRequestCancellation(); + // Fix Me : Last argument not sure for HandleCancellation + state->context_->HandleCancellation(state, true, "grpc_strict_name"); } } // Increments the ongoing request counter @@ -731,6 +730,7 @@ class InferHandlerState { } else { state->step_ = Steps::FINISH; } + LOG_VERBOSE(1) << "PutTaskBackToQueue inside HandleCompletion for " << state->unique_id_; PutTaskBackToQueue(state); } step_ = Steps::FINISH; @@ -808,6 +808,7 @@ class InferHandlerState { // The RPC is complete and no callback will be invoked to retrieve // the object. Hence, need to explicitly place the state on the // completion queue. + LOG_VERBOSE(1) << "PutTaskBackToQueue inside IssueRequestCancellation for " << state->unique_id_; PutTaskBackToQueue(state); } } @@ -845,7 +846,6 @@ class InferHandlerState { IssueRequestCancellation(); // Mark the context as cancelled state->context_->step_ = Steps::CANCELLED; - // The state returns true because the CancelExecution // call above would have raised alarm objects on all // pending inflight states objects. 
This state will @@ -1355,6 +1355,7 @@ InferHandler< LOG_VERBOSE(1) << "Received notification for " << Name() << ", " << state->unique_id_; } + LOG_VERBOSE(2) << "Inside Next " << state->unique_id_; LOG_VERBOSE(2) << "Grpc::CQ::Next() " << state->context_->DebugString(state); if (!Process(state, ok)) { diff --git a/src/grpc/stream_infer_handler.cc b/src/grpc/stream_infer_handler.cc index 85ec4d2c32..a3e79b749d 100644 --- a/src/grpc/stream_infer_handler.cc +++ b/src/grpc/stream_infer_handler.cc @@ -148,7 +148,7 @@ ModelStreamInferHandler::Process(InferHandler::State* state, bool rpc_ok) std::lock_guard lock(state->context_->mu_); // Check if stream error detected and already connection ended if (state->context_->IsGRPCStrictError()) { - return !finished; + return finished; } } if (state->context_->ReceivedNotification()) { @@ -653,7 +653,7 @@ ModelStreamInferHandler::StreamInferResponseComplete( // that state object can be released. if (is_complete) { state->step_ = Steps::CANCELLED; - state->context_->PutTaskBackToQueue(state); + state->context_->PutTaskBackToQueue(state); } state->complete_ = is_complete; @@ -683,7 +683,6 @@ ModelStreamInferHandler::StreamInferResponseComplete( LOG_ERROR << "expected the response allocator to have added the response"; } if (err != nullptr) { - LOG_VERBOSE(1) << "Error in CORE Response"; failed = true; ::grpc::Status status; // Converts CORE errors to GRPC error codes @@ -700,6 +699,10 @@ ModelStreamInferHandler::StreamInferResponseComplete( << status.error_code() << "Closing stream connection." << std::endl; state->context_->sendGRPCStrictResponse(state); + TRITONSERVER_ErrorDelete(err); + LOG_TRITONSERVER_ERROR( + TRITONSERVER_InferenceResponseDelete(iresponse), + "deleting GRPC inference response"); return; } } @@ -778,7 +781,7 @@ ModelStreamInferHandler::StreamInferResponseComplete( // that state object can be released. if (is_complete) { state->step_ = Steps::CANCELLED; - state->context_->PutTaskBackToQueue(state); + state->context_->PutTaskBackToQueue(state); } state->complete_ = is_complete; From cc34d418e4c97043c64fa7b3c4dc2e3ed001bb51 Mon Sep 17 00:00:00 2001 From: Indrajit Bhosale Date: Mon, 5 Aug 2024 23:31:25 -0700 Subject: [PATCH 11/32] Pre Commit fixed --- src/grpc/infer_handler.h | 7 ++++--- src/grpc/stream_infer_handler.cc | 4 ++-- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/src/grpc/infer_handler.h b/src/grpc/infer_handler.h index cb10efbdd2..dd17635f6d 100644 --- a/src/grpc/infer_handler.h +++ b/src/grpc/infer_handler.h @@ -693,7 +693,7 @@ class InferHandlerState { state->context_->responder_->Finish(state->status_, state); // Mark error for this stream state->context_->MarkGRPCStrictError(); - // Fix Me : Last argument not sure for HandleCancellation + // Fix Me : Last argument not sure for HandleCancellation state->context_->HandleCancellation(state, true, "grpc_strict_name"); } } @@ -730,7 +730,6 @@ class InferHandlerState { } else { state->step_ = Steps::FINISH; } - LOG_VERBOSE(1) << "PutTaskBackToQueue inside HandleCompletion for " << state->unique_id_; PutTaskBackToQueue(state); } step_ = Steps::FINISH; @@ -808,7 +807,9 @@ class InferHandlerState { // The RPC is complete and no callback will be invoked to retrieve // the object. Hence, need to explicitly place the state on the // completion queue. 
- LOG_VERBOSE(1) << "PutTaskBackToQueue inside IssueRequestCancellation for " << state->unique_id_; + LOG_VERBOSE(1) + << "PutTaskBackToQueue inside IssueRequestCancellation for " + << state->unique_id_; PutTaskBackToQueue(state); } } diff --git a/src/grpc/stream_infer_handler.cc b/src/grpc/stream_infer_handler.cc index a3e79b749d..8eed87cd16 100644 --- a/src/grpc/stream_infer_handler.cc +++ b/src/grpc/stream_infer_handler.cc @@ -653,7 +653,7 @@ ModelStreamInferHandler::StreamInferResponseComplete( // that state object can be released. if (is_complete) { state->step_ = Steps::CANCELLED; - state->context_->PutTaskBackToQueue(state); + state->context_->PutTaskBackToQueue(state); } state->complete_ = is_complete; @@ -781,7 +781,7 @@ ModelStreamInferHandler::StreamInferResponseComplete( // that state object can be released. if (is_complete) { state->step_ = Steps::CANCELLED; - state->context_->PutTaskBackToQueue(state); + state->context_->PutTaskBackToQueue(state); } state->complete_ = is_complete; From 40344d58dcbae207a9ff8a625521ddf84302571b Mon Sep 17 00:00:00 2001 From: Indrajit Bhosale Date: Wed, 7 Aug 2024 00:26:54 -0700 Subject: [PATCH 12/32] Review Comments cleaned up, crash fixed in multi threading --- qa/L0_backend_python/lifecycle/my_test.py | 60 ------------------- .../execute_grpc_error/config.pbtxt | 1 - src/grpc/infer_handler.cc | 3 +- src/grpc/infer_handler.h | 52 ++++++++-------- src/grpc/stream_infer_handler.cc | 8 +-- 5 files changed, 34 insertions(+), 90 deletions(-) delete mode 100644 qa/L0_backend_python/lifecycle/my_test.py diff --git a/qa/L0_backend_python/lifecycle/my_test.py b/qa/L0_backend_python/lifecycle/my_test.py deleted file mode 100644 index c4c1899e8f..0000000000 --- a/qa/L0_backend_python/lifecycle/my_test.py +++ /dev/null @@ -1,60 +0,0 @@ -import queue -from functools import partial - -import numpy as np -import tritonclient.grpc as grpcclient -from tritonclient.utils import * - - -class UserData: - def __init__(self): - self._completed_requests = queue.Queue() - - -def callback(user_data, result, error): - if error: - user_data._completed_requests.put(error) - else: - user_data._completed_requests.put(result) - - -def grpc_strict_error(): - model_name = "execute_error" - shape = [2, 2] - number_of_requests = 3 - user_data = UserData() - triton_server_url = "localhost:8001" # Replace with your Triton server address - - try: - triton_client = grpcclient.InferenceServerClient(triton_server_url) - metadata = {"grpc_strict": "true"} - - triton_client.start_stream( - callback=partial(callback, user_data), headers=metadata - ) - - input_datas = [] - for i in range(number_of_requests): - input_data = np.random.randn(*shape).astype(np.float32) - input_datas.append(input_data) - inputs = [ - grpcclient.InferInput( - "IN", input_data.shape, np_to_triton_dtype(input_data.dtype) - ) - ] - inputs[0].set_data_from_numpy(input_data) - triton_client.async_stream_infer(model_name=model_name, inputs=inputs) - result = user_data._completed_requests.get() - print(f"Request {i + 1} result:") - print(type(result)) - if type(result) == InferenceServerException: - print(result.status()) - - except Exception as e: - print(f"Error occurred: {str(e)}") - finally: - triton_client.stop_stream() - - -if __name__ == "__main__": - grpc_strict_error() diff --git a/qa/python_models/execute_grpc_error/config.pbtxt b/qa/python_models/execute_grpc_error/config.pbtxt index 3d364f3cc5..70e247148a 100644 --- a/qa/python_models/execute_grpc_error/config.pbtxt +++ 
b/qa/python_models/execute_grpc_error/config.pbtxt @@ -24,7 +24,6 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -name: "execute_grpc_error" backend: "python" max_batch_size: 64 diff --git a/src/grpc/infer_handler.cc b/src/grpc/infer_handler.cc index 35659f4900..d01e88ce94 100644 --- a/src/grpc/infer_handler.cc +++ b/src/grpc/infer_handler.cc @@ -720,7 +720,8 @@ ModelInferHandler::Process(InferHandler::State* state, bool rpc_ok) // single thread scenario. StartNewRequest(); } - bool resume = state->context_->HandleCancellation(state, rpc_ok, Name()); + bool resume = state->context_->HandleCancellation( + state, rpc_ok, Name(), false /* is_grpc_strict */); return resume; } diff --git a/src/grpc/infer_handler.h b/src/grpc/infer_handler.h index dd17635f6d..adf5bb53c4 100644 --- a/src/grpc/infer_handler.h +++ b/src/grpc/infer_handler.h @@ -684,7 +684,7 @@ class InferHandlerState { } } - void sendGRPCStrictResponse(InferHandlerStateType* state) + void SendGRPCStrictResponse(InferHandlerStateType* state) { std::lock_guard lock(state->context_->mu_); // Check if Error not responded previously @@ -694,7 +694,9 @@ class InferHandlerState { // Mark error for this stream state->context_->MarkGRPCStrictError(); // Fix Me : Last argument not sure for HandleCancellation - state->context_->HandleCancellation(state, true, "grpc_strict_name"); + state->context_->HandleCancellation( + state, true /* rpc_ok */, "grpc_strict_name", + true /* is_grpc_strict */); } } // Increments the ongoing request counter @@ -807,9 +809,6 @@ class InferHandlerState { // The RPC is complete and no callback will be invoked to retrieve // the object. Hence, need to explicitly place the state on the // completion queue. - LOG_VERBOSE(1) - << "PutTaskBackToQueue inside IssueRequestCancellation for " - << state->unique_id_; PutTaskBackToQueue(state); } } @@ -822,9 +821,11 @@ class InferHandlerState { // Returns whether or not to continue cycling through the gRPC // completion queue or not. 
bool HandleCancellation( - InferHandlerStateType* state, bool rpc_ok, const std::string& name) + InferHandlerStateType* state, bool rpc_ok, const std::string& name, + bool is_grpc_strict) { - if (!IsCancelled()) { + // Check to avoid early exit in case of grpc_strict + if (!IsCancelled() && !(is_grpc_strict)) { LOG_ERROR << "[INTERNAL] HandleCancellation called even when the context was " "not cancelled for " @@ -1349,23 +1350,26 @@ InferHandler< while (cq_->Next(&tag, &ok)) { State* state = static_cast(tag); - if (state->step_ == Steps::WAITING_NOTIFICATION) { - State* state_wrapper = state; - state = state_wrapper->state_ptr_; - state->context_->SetReceivedNotification(true); - LOG_VERBOSE(1) << "Received notification for " << Name() << ", " - << state->unique_id_; - } - LOG_VERBOSE(2) << "Inside Next " << state->unique_id_; - LOG_VERBOSE(2) << "Grpc::CQ::Next() " - << state->context_->DebugString(state); - if (!Process(state, ok)) { - LOG_VERBOSE(1) << "Done for " << Name() << ", " << state->unique_id_; - state->context_->EraseState(state); - StateRelease(state); - } else { - LOG_VERBOSE(2) << "Returning from " << Name() << ", " - << state->unique_id_ << ", " << state->step_; + // FIX ME : Ideally should not need this nullptr check, added to resolve + // crash is grpc_strict mode + if (state->context_ != nullptr) { + if (state->step_ == Steps::WAITING_NOTIFICATION) { + State* state_wrapper = state; + state = state_wrapper->state_ptr_; + state->context_->SetReceivedNotification(true); + LOG_VERBOSE(1) << "Received notification for " << Name() << ", " + << state->unique_id_; + } + LOG_VERBOSE(2) << "Grpc::CQ::Next() " + << state->context_->DebugString(state); + if (!Process(state, ok)) { + LOG_VERBOSE(1) << "Done for " << Name() << ", " << state->unique_id_; + state->context_->EraseState(state); + StateRelease(state); + } else { + LOG_VERBOSE(2) << "Returning from " << Name() << ", " + << state->unique_id_ << ", " << state->step_; + } } } })); diff --git a/src/grpc/stream_infer_handler.cc b/src/grpc/stream_infer_handler.cc index 8eed87cd16..80005f479c 100644 --- a/src/grpc/stream_infer_handler.cc +++ b/src/grpc/stream_infer_handler.cc @@ -148,13 +148,15 @@ ModelStreamInferHandler::Process(InferHandler::State* state, bool rpc_ok) std::lock_guard lock(state->context_->mu_); // Check if stream error detected and already connection ended if (state->context_->IsGRPCStrictError()) { + state->step_ = Steps::FINISH; return finished; } } if (state->context_->ReceivedNotification()) { std::lock_guard lock(state->step_mtx_); if (state->IsGrpcContextCancelled()) { - bool resume = state->context_->HandleCancellation(state, rpc_ok, Name()); + bool resume = state->context_->HandleCancellation( + state, rpc_ok, Name(), false /* is_grpc_strict */); return resume; } else { if (state->context_->HandleCompletion()) { @@ -691,14 +693,12 @@ ModelStreamInferHandler::StreamInferResponseComplete( response->set_error_message(status.error_message()); LOG_VERBOSE(1) << "Failed for ID: " << log_request_id << std::endl; if (state->context_->grpc_strict_) { - // Set to finish - // std::lock_guard lock(state->grpc_strict_mtx_); state->status_ = status; // Finish only once, if backend ignores cancellation LOG_VERBOSE(1) << "GRPC streaming error detected with status: " << status.error_code() << "Closing stream connection." 
<< std::endl; - state->context_->sendGRPCStrictResponse(state); + state->context_->SendGRPCStrictResponse(state); TRITONSERVER_ErrorDelete(err); LOG_TRITONSERVER_ERROR( TRITONSERVER_InferenceResponseDelete(iresponse), From f40f695e67e554b58153d392516062bd00c7bcda Mon Sep 17 00:00:00 2001 From: Indrajit Bhosale Date: Wed, 7 Aug 2024 10:58:19 -0700 Subject: [PATCH 13/32] Review Comments fixed --- .../lifecycle/lifecycle_test.py | 6 ++-- src/grpc/infer_handler.cc | 2 +- src/grpc/infer_handler.h | 31 ++++++++++++------- src/grpc/stream_infer_handler.cc | 8 ++--- 4 files changed, 27 insertions(+), 20 deletions(-) diff --git a/qa/L0_backend_python/lifecycle/lifecycle_test.py b/qa/L0_backend_python/lifecycle/lifecycle_test.py index 6997666090..0ea1e6d4e6 100755 --- a/qa/L0_backend_python/lifecycle/lifecycle_test.py +++ b/qa/L0_backend_python/lifecycle/lifecycle_test.py @@ -241,13 +241,13 @@ def test_infer_pymodel_error(self): initial_metrics_value, ) - def test_grpc_strict_error_on(self): + def test_triton_grpc_error_error_on(self): model_name = "execute_grpc_error" shape = [2, 2] number_of_requests = 2 user_data = UserData() triton_client = grpcclient.InferenceServerClient(f"{_tritonserver_ipaddr}:8001") - metadata = {"grpc_strict": "true"} + metadata = {"triton_grpc_error": "true"} triton_client.start_stream( callback=partial(callback, user_data), headers=metadata ) @@ -274,7 +274,7 @@ def test_grpc_strict_error_on(self): self.assertIsInstance(result, InferenceServerException) self.assertEqual(str(result.status()), "StatusCode.INTERNAL") - def test_grpc_strict_error_off(self): + def test_triton_grpc_error_error_off(self): model_name = "execute_grpc_error" shape = [2, 2] number_of_requests = 4 diff --git a/src/grpc/infer_handler.cc b/src/grpc/infer_handler.cc index d01e88ce94..c45b565a88 100644 --- a/src/grpc/infer_handler.cc +++ b/src/grpc/infer_handler.cc @@ -721,7 +721,7 @@ ModelInferHandler::Process(InferHandler::State* state, bool rpc_ok) StartNewRequest(); } bool resume = state->context_->HandleCancellation( - state, rpc_ok, Name(), false /* is_grpc_strict */); + state, rpc_ok, Name(), false /* is_triton_grpc_error */); return resume; } diff --git a/src/grpc/infer_handler.h b/src/grpc/infer_handler.h index adf5bb53c4..8192ab568c 100644 --- a/src/grpc/infer_handler.h +++ b/src/grpc/infer_handler.h @@ -642,7 +642,7 @@ class InferHandlerState { ::grpc::ServerCompletionQueue* cq, const uint64_t unique_id = 0) : cq_(cq), unique_id_(unique_id), ongoing_requests_(0), step_(Steps::START), finish_ok_(true), ongoing_write_(false), - received_notification_(false), grpc_strict_(false), + received_notification_(false), triton_grpc_error_(false), grpc_stream_error_state_(false) { ctx_.reset(new ::grpc::ServerContext()); @@ -676,10 +676,15 @@ class InferHandlerState { const auto& metadata = state->context_->ctx_->client_metadata(); for (const auto& pair : metadata) { auto& key = pair.first; + auto& value = pair.second; std::string param_key = std::string(key.begin(), key.end()); - std::string grpc_strict_key = "grpc_strict"; - if (param_key.compare(grpc_strict_key) == 0) { - state->context_->grpc_strict_ = true; + std::string value_key = std::string(value.begin(), value.end()); + std::string triton_grpc_error_key = "triton_grpc_error"; + if(param_key == triton_grpc_error_key) { + if(value_key == "true") { + LOG_VERBOSE(2) << "GRPC: triton_grpc_error mode detected in new grpc stream"; + state->context_->triton_grpc_error_ = true; + } } } } @@ -695,8 +700,8 @@ class InferHandlerState { 
state->context_->MarkGRPCStrictError(); // Fix Me : Last argument not sure for HandleCancellation state->context_->HandleCancellation( - state, true /* rpc_ok */, "grpc_strict_name", - true /* is_grpc_strict */); + state, true /* rpc_ok */, "triton_grpc_error_name", + true /* is_triton_grpc_error */); } } // Increments the ongoing request counter @@ -822,10 +827,10 @@ class InferHandlerState { // completion queue or not. bool HandleCancellation( InferHandlerStateType* state, bool rpc_ok, const std::string& name, - bool is_grpc_strict) + bool is_triton_grpc_error) { - // Check to avoid early exit in case of grpc_strict - if (!IsCancelled() && !(is_grpc_strict)) { + // Check to avoid early exit in case of triton_grpc_error + if (!IsCancelled() && !(is_triton_grpc_error)) { LOG_ERROR << "[INTERNAL] HandleCancellation called even when the context was " "not cancelled for " @@ -975,7 +980,7 @@ class InferHandlerState { // Marks error after it has been responded to void MarkGRPCStrictError() { grpc_stream_error_state_ = true; } - // Checks if error already responded to in grpc_strict mode + // Checks if error already responded to in triton_grpc_error mode bool IsGRPCStrictError() { return grpc_stream_error_state_; } // Return true if this context has completed all reads and writes. @@ -1038,10 +1043,12 @@ class InferHandlerState { bool received_notification_; // True if set by user via header - std::atomic grpc_strict_; + // Can be accessed without a lock, as set only once in startstream + std::atomic triton_grpc_error_; // True if stream already encountered error and closed connection // State maintained to avoid writes on closed stream + // Need to acquire lock before access std::atomic grpc_stream_error_state_; }; @@ -1351,7 +1358,7 @@ InferHandler< while (cq_->Next(&tag, &ok)) { State* state = static_cast(tag); // FIX ME : Ideally should not need this nullptr check, added to resolve - // crash is grpc_strict mode + // crash is triton_grpc_error mode if (state->context_ != nullptr) { if (state->step_ == Steps::WAITING_NOTIFICATION) { State* state_wrapper = state; diff --git a/src/grpc/stream_infer_handler.cc b/src/grpc/stream_infer_handler.cc index 80005f479c..5e1fc89d89 100644 --- a/src/grpc/stream_infer_handler.cc +++ b/src/grpc/stream_infer_handler.cc @@ -144,7 +144,7 @@ ModelStreamInferHandler::Process(InferHandler::State* state, bool rpc_ok) // because we launch an async thread that could update 'state's // step_ to be FINISH before this thread exits this function. 
bool finished = false; - if (state->context_->grpc_strict_) { + if (state->context_->triton_grpc_error_) { std::lock_guard lock(state->context_->mu_); // Check if stream error detected and already connection ended if (state->context_->IsGRPCStrictError()) { @@ -156,7 +156,7 @@ ModelStreamInferHandler::Process(InferHandler::State* state, bool rpc_ok) std::lock_guard lock(state->step_mtx_); if (state->IsGrpcContextCancelled()) { bool resume = state->context_->HandleCancellation( - state, rpc_ok, Name(), false /* is_grpc_strict */); + state, rpc_ok, Name(), false /* is_triton_grpc_error */); return resume; } else { if (state->context_->HandleCompletion()) { @@ -604,7 +604,7 @@ ModelStreamInferHandler::StreamInferResponseComplete( { State* state = reinterpret_cast(userp); // Ignore Response from CORE in case GRPC Strict as we dont care about - if (state->context_->grpc_strict_) { + if (state->context_->triton_grpc_error_) { std::lock_guard lock(state->context_->mu_); if (state->context_->IsGRPCStrictError()) { return; @@ -692,7 +692,7 @@ ModelStreamInferHandler::StreamInferResponseComplete( response->mutable_infer_response()->Clear(); response->set_error_message(status.error_message()); LOG_VERBOSE(1) << "Failed for ID: " << log_request_id << std::endl; - if (state->context_->grpc_strict_) { + if (state->context_->triton_grpc_error_) { state->status_ = status; // Finish only once, if backend ignores cancellation LOG_VERBOSE(1) << "GRPC streaming error detected with status: " From 0792bc1d1d0e8d143c322cdabd4a3f7f01a92c81 Mon Sep 17 00:00:00 2001 From: Indrajit Bhosale Date: Wed, 7 Aug 2024 11:25:49 -0700 Subject: [PATCH 14/32] Pre-Commit fixed --- src/grpc/infer_handler.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/grpc/infer_handler.h b/src/grpc/infer_handler.h index 8192ab568c..acd20609b8 100644 --- a/src/grpc/infer_handler.h +++ b/src/grpc/infer_handler.h @@ -680,9 +680,10 @@ class InferHandlerState { std::string param_key = std::string(key.begin(), key.end()); std::string value_key = std::string(value.begin(), value.end()); std::string triton_grpc_error_key = "triton_grpc_error"; - if(param_key == triton_grpc_error_key) { - if(value_key == "true") { - LOG_VERBOSE(2) << "GRPC: triton_grpc_error mode detected in new grpc stream"; + if (param_key == triton_grpc_error_key) { + if (value_key == "true") { + LOG_VERBOSE(2) + << "GRPC: triton_grpc_error mode detected in new grpc stream"; state->context_->triton_grpc_error_ = true; } } From cdd60bf38a52e3a4f0de7f7f1ee7144cb9e81a67 Mon Sep 17 00:00:00 2001 From: Indrajit Bhosale Date: Thu, 8 Aug 2024 10:35:29 -0700 Subject: [PATCH 15/32] Park --- src/grpc/infer_handler.h | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/src/grpc/infer_handler.h b/src/grpc/infer_handler.h index acd20609b8..adcabdfc78 100644 --- a/src/grpc/infer_handler.h +++ b/src/grpc/infer_handler.h @@ -696,6 +696,9 @@ class InferHandlerState { // Check if Error not responded previously // Avoid closing connection twice on multiple errors from core if (!state->context_->IsGRPCStrictError()) { + // check state object + // state->context_->step_ = Steps::COMPLETE; + // state->step_ = Steps::COMPLETE; state->context_->responder_->Finish(state->status_, state); // Mark error for this stream state->context_->MarkGRPCStrictError(); @@ -782,7 +785,7 @@ class InferHandlerState { // Issues the cancellation for all inflight requests // being tracked by this context. 
- void IssueRequestCancellation() + void IssueRequestCancellation(bool is_triton_grpc_error) { { std::lock_guard lock(mu_); @@ -815,7 +818,10 @@ class InferHandlerState { // The RPC is complete and no callback will be invoked to retrieve // the object. Hence, need to explicitly place the state on the // completion queue. - PutTaskBackToQueue(state); + // CHeck for writeready + if(!is_triton_grpc_error) { + PutTaskBackToQueue(state); + } } } } @@ -851,7 +857,7 @@ class InferHandlerState { // issue cancellation request to all the inflight // states belonging to the context. if (state->context_->step_ != Steps::CANCELLED) { - IssueRequestCancellation(); + IssueRequestCancellation(is_triton_grpc_error); // Mark the context as cancelled state->context_->step_ = Steps::CANCELLED; // The state returns true because the CancelExecution @@ -1378,7 +1384,7 @@ InferHandler< LOG_VERBOSE(2) << "Returning from " << Name() << ", " << state->unique_id_ << ", " << state->step_; } - } + } } })); From 6661f21ded4c16039bc972e47bf07bd8f6a67f88 Mon Sep 17 00:00:00 2001 From: Indrajit Bhosale Date: Thu, 8 Aug 2024 20:24:56 -0700 Subject: [PATCH 16/32] Simpler design piggyback on NotifywhenDone() --- src/grpc/grpc_utils.h | 11 +++++ src/grpc/infer_handler.h | 82 ++++++++++++++++++-------------- src/grpc/stream_infer_handler.cc | 8 ---- 3 files changed, 57 insertions(+), 44 deletions(-) diff --git a/src/grpc/grpc_utils.h b/src/grpc/grpc_utils.h index 898e4acb4f..9f1c54527a 100644 --- a/src/grpc/grpc_utils.h +++ b/src/grpc/grpc_utils.h @@ -76,6 +76,17 @@ typedef enum { PARTIAL_COMPLETION } Steps; +typedef enum { + // No error from CORE seen yet + NONE, + // Error from CORE encountered, waiting to be picked up by completion queue to + // initiate cancellation + ERROR_WAITING, + // Error from CORE encountered, stream closed + // This state is added to avoid double cancellation + ERROR_CANCELED +} Triton_grpc_error_steps; + // Debugging helper std::ostream& operator<<(std::ostream& out, const Steps& step); diff --git a/src/grpc/infer_handler.h b/src/grpc/infer_handler.h index adcabdfc78..1f87d4e62f 100644 --- a/src/grpc/infer_handler.h +++ b/src/grpc/infer_handler.h @@ -643,7 +643,7 @@ class InferHandlerState { : cq_(cq), unique_id_(unique_id), ongoing_requests_(0), step_(Steps::START), finish_ok_(true), ongoing_write_(false), received_notification_(false), triton_grpc_error_(false), - grpc_stream_error_state_(false) + grpc_stream_error_state_(Triton_grpc_error_steps::NONE) { ctx_.reset(new ::grpc::ServerContext()); responder_.reset(new ServerResponderType(ctx_.get())); @@ -665,9 +665,22 @@ class InferHandlerState { bool ReceivedNotification() { return received_notification_; } + // Returns true ONLY when GRPC_ERROR from CORE is waiting to be processed. + bool IsGRPCError() + { + if (grpc_stream_error_state_ == Triton_grpc_error_steps::ERROR_WAITING) { + // Change the state to ERROR_CANCELED as we have called + // HandleCancellation + grpc_stream_error_state_ = Triton_grpc_error_steps::ERROR_CANCELED; + return true; + } + return false; + } + bool IsCancelled() { - return received_notification_ ? ctx_->IsCancelled() : false; + return received_notification_ ? 
(ctx_->IsCancelled() || IsGRPCError()) + : false; } // Extracts headers from GRPC request and updates state @@ -696,16 +709,10 @@ class InferHandlerState { // Check if Error not responded previously // Avoid closing connection twice on multiple errors from core if (!state->context_->IsGRPCStrictError()) { - // check state object - // state->context_->step_ = Steps::COMPLETE; - // state->step_ = Steps::COMPLETE; + state->step_ = Steps::COMPLETE; state->context_->responder_->Finish(state->status_, state); // Mark error for this stream state->context_->MarkGRPCStrictError(); - // Fix Me : Last argument not sure for HandleCancellation - state->context_->HandleCancellation( - state, true /* rpc_ok */, "triton_grpc_error_name", - true /* is_triton_grpc_error */); } } // Increments the ongoing request counter @@ -819,9 +826,7 @@ class InferHandlerState { // the object. Hence, need to explicitly place the state on the // completion queue. // CHeck for writeready - if(!is_triton_grpc_error) { - PutTaskBackToQueue(state); - } + PutTaskBackToQueue(state); } } } @@ -985,10 +990,19 @@ class InferHandlerState { } // Marks error after it has been responded to - void MarkGRPCStrictError() { grpc_stream_error_state_ = true; } + void MarkGRPCStrictError() + { + grpc_stream_error_state_ = Triton_grpc_error_steps::ERROR_WAITING; + } // Checks if error already responded to in triton_grpc_error mode - bool IsGRPCStrictError() { return grpc_stream_error_state_; } + bool IsGRPCStrictError() + { + if (grpc_stream_error_state_ == Triton_grpc_error_steps::NONE) { + return false; + } + return true; + } // Return true if this context has completed all reads and writes. bool IsRequestsCompleted() @@ -1056,7 +1070,7 @@ class InferHandlerState { // True if stream already encountered error and closed connection // State maintained to avoid writes on closed stream // Need to acquire lock before access - std::atomic grpc_stream_error_state_; + int grpc_stream_error_state_; }; // This constructor is used to build a wrapper state object @@ -1364,27 +1378,23 @@ InferHandler< while (cq_->Next(&tag, &ok)) { State* state = static_cast(tag); - // FIX ME : Ideally should not need this nullptr check, added to resolve - // crash is triton_grpc_error mode - if (state->context_ != nullptr) { - if (state->step_ == Steps::WAITING_NOTIFICATION) { - State* state_wrapper = state; - state = state_wrapper->state_ptr_; - state->context_->SetReceivedNotification(true); - LOG_VERBOSE(1) << "Received notification for " << Name() << ", " - << state->unique_id_; - } - LOG_VERBOSE(2) << "Grpc::CQ::Next() " - << state->context_->DebugString(state); - if (!Process(state, ok)) { - LOG_VERBOSE(1) << "Done for " << Name() << ", " << state->unique_id_; - state->context_->EraseState(state); - StateRelease(state); - } else { - LOG_VERBOSE(2) << "Returning from " << Name() << ", " - << state->unique_id_ << ", " << state->step_; - } - } + if (state->step_ == Steps::WAITING_NOTIFICATION) { + State* state_wrapper = state; + state = state_wrapper->state_ptr_; + state->context_->SetReceivedNotification(true); + LOG_VERBOSE(1) << "Received notification for " << Name() << ", " + << state->unique_id_; + } + LOG_VERBOSE(2) << "Grpc::CQ::Next() " + << state->context_->DebugString(state); + if (!Process(state, ok)) { + LOG_VERBOSE(1) << "Done for " << Name() << ", " << state->unique_id_; + state->context_->EraseState(state); + StateRelease(state); + } else { + LOG_VERBOSE(2) << "Returning from " << Name() << ", " + << state->unique_id_ << ", " << state->step_; + } } 
})); diff --git a/src/grpc/stream_infer_handler.cc b/src/grpc/stream_infer_handler.cc index 5e1fc89d89..0fa2715841 100644 --- a/src/grpc/stream_infer_handler.cc +++ b/src/grpc/stream_infer_handler.cc @@ -144,14 +144,6 @@ ModelStreamInferHandler::Process(InferHandler::State* state, bool rpc_ok) // because we launch an async thread that could update 'state's // step_ to be FINISH before this thread exits this function. bool finished = false; - if (state->context_->triton_grpc_error_) { - std::lock_guard lock(state->context_->mu_); - // Check if stream error detected and already connection ended - if (state->context_->IsGRPCStrictError()) { - state->step_ = Steps::FINISH; - return finished; - } - } if (state->context_->ReceivedNotification()) { std::lock_guard lock(state->step_mtx_); if (state->IsGrpcContextCancelled()) { From 22e53592f45bb82a58b53c8ba285e6031ae68c92 Mon Sep 17 00:00:00 2001 From: Indrajit Bhosale Date: Fri, 9 Aug 2024 12:05:12 -0700 Subject: [PATCH 17/32] Cleanup unwanted states from old design --- src/grpc/infer_handler.cc | 3 +-- src/grpc/infer_handler.h | 32 ++++++++++++++++++-------------- src/grpc/stream_infer_handler.cc | 12 ++++++------ 3 files changed, 25 insertions(+), 22 deletions(-) diff --git a/src/grpc/infer_handler.cc b/src/grpc/infer_handler.cc index c45b565a88..35659f4900 100644 --- a/src/grpc/infer_handler.cc +++ b/src/grpc/infer_handler.cc @@ -720,8 +720,7 @@ ModelInferHandler::Process(InferHandler::State* state, bool rpc_ok) // single thread scenario. StartNewRequest(); } - bool resume = state->context_->HandleCancellation( - state, rpc_ok, Name(), false /* is_triton_grpc_error */); + bool resume = state->context_->HandleCancellation(state, rpc_ok, Name()); return resume; } diff --git a/src/grpc/infer_handler.h b/src/grpc/infer_handler.h index 1f87d4e62f..c93c3ba7eb 100644 --- a/src/grpc/infer_handler.h +++ b/src/grpc/infer_handler.h @@ -665,13 +665,19 @@ class InferHandlerState { bool ReceivedNotification() { return received_notification_; } + // Changes the state of grpc_stream_error_state_ to ERROR_CANCELED, + // indicating we have closed the stream and initiated the cancel flow + void SetGRPCErrorCancelled() + { + grpc_stream_error_state_ = Triton_grpc_error_steps::ERROR_CANCELED; + } // Returns true ONLY when GRPC_ERROR from CORE is waiting to be processed. - bool IsGRPCError() + bool CheckAndUpdateGRPCError() { if (grpc_stream_error_state_ == Triton_grpc_error_steps::ERROR_WAITING) { // Change the state to ERROR_CANCELED as we have called // HandleCancellation - grpc_stream_error_state_ = Triton_grpc_error_steps::ERROR_CANCELED; + SetGRPCErrorCancelled(); return true; } return false; @@ -679,8 +685,9 @@ class InferHandlerState { bool IsCancelled() { - return received_notification_ ? (ctx_->IsCancelled() || IsGRPCError()) - : false; + return received_notification_ + ? (ctx_->IsCancelled() || CheckAndUpdateGRPCError()) + : false; } // Extracts headers from GRPC request and updates state @@ -792,7 +799,7 @@ class InferHandlerState { // Issues the cancellation for all inflight requests // being tracked by this context. - void IssueRequestCancellation(bool is_triton_grpc_error) + void IssueRequestCancellation() { { std::lock_guard lock(mu_); @@ -825,7 +832,6 @@ class InferHandlerState { // The RPC is complete and no callback will be invoked to retrieve // the object. Hence, need to explicitly place the state on the // completion queue. 
- // CHeck for writeready PutTaskBackToQueue(state); } } @@ -838,11 +844,10 @@ class InferHandlerState { // Returns whether or not to continue cycling through the gRPC // completion queue or not. bool HandleCancellation( - InferHandlerStateType* state, bool rpc_ok, const std::string& name, - bool is_triton_grpc_error) + InferHandlerStateType* state, bool rpc_ok, const std::string& name) { // Check to avoid early exit in case of triton_grpc_error - if (!IsCancelled() && !(is_triton_grpc_error)) { + if (!IsCancelled()) { LOG_ERROR << "[INTERNAL] HandleCancellation called even when the context was " "not cancelled for " @@ -862,7 +867,7 @@ class InferHandlerState { // issue cancellation request to all the inflight // states belonging to the context. if (state->context_->step_ != Steps::CANCELLED) { - IssueRequestCancellation(is_triton_grpc_error); + IssueRequestCancellation(); // Mark the context as cancelled state->context_->step_ = Steps::CANCELLED; // The state returns true because the CancelExecution @@ -1067,10 +1072,9 @@ class InferHandlerState { // Can be accessed without a lock, as set only once in startstream std::atomic triton_grpc_error_; - // True if stream already encountered error and closed connection - // State maintained to avoid writes on closed stream - // Need to acquire lock before access - int grpc_stream_error_state_; + // Indicates the state of triton_grpc_error, only relevant if special + // triton_grpc_error feature set to true by client + Triton_grpc_error_steps grpc_stream_error_state_; }; // This constructor is used to build a wrapper state object diff --git a/src/grpc/stream_infer_handler.cc b/src/grpc/stream_infer_handler.cc index 0fa2715841..6a14ed9d4c 100644 --- a/src/grpc/stream_infer_handler.cc +++ b/src/grpc/stream_infer_handler.cc @@ -140,15 +140,10 @@ ModelStreamInferHandler::Process(InferHandler::State* state, bool rpc_ok) // This means that we only need to take care of // synchronizing this thread and the ResponseComplete // threads. - // We need an explicit finish indicator. Can't use 'state->step_' - // because we launch an async thread that could update 'state's - // step_ to be FINISH before this thread exits this function. - bool finished = false; if (state->context_->ReceivedNotification()) { std::lock_guard lock(state->step_mtx_); if (state->IsGrpcContextCancelled()) { - bool resume = state->context_->HandleCancellation( - state, rpc_ok, Name(), false /* is_triton_grpc_error */); + bool resume = state->context_->HandleCancellation(state, rpc_ok, Name()); return resume; } else { if (state->context_->HandleCompletion()) { @@ -161,6 +156,11 @@ ModelStreamInferHandler::Process(InferHandler::State* state, bool rpc_ok) << ", context " << state->context_->unique_id_ << ", " << state->unique_id_ << " step " << state->step_; + // We need an explicit finish indicator. Can't use 'state->step_' + // because we launch an async thread that could update 'state's + // step_ to be FINISH before this thread exits this function. + bool finished = false; + if (state->step_ == Steps::START) { // A new stream connection... If RPC failed on a new request then // the server is shutting down and so we should do nothing. 
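As a usage illustration (an editorial sketch, not part of any commit in this series): the commits above implement the triton_grpc_error streaming mode, and the next commit adds QA tests for it. A minimal client-side sketch of that flow follows; the model name execute_grpc_error, the localhost:8001 address, and the IN/OUT tensor names are assumptions carried over from the QA tests in this series, and the tritonclient calls mirror the ones those tests already use.

import queue
from functools import partial

import numpy as np
import tritonclient.grpc as grpcclient
from tritonclient.utils import InferenceServerException, np_to_triton_dtype


def callback(q, result, error):
    # The stream callback receives either a result or an InferenceServerException.
    q.put(error if error is not None else result)


results = queue.Queue()
client = grpcclient.InferenceServerClient("localhost:8001")

# Opting in to triton_grpc_error mode: on the first error returned by the model,
# the server finishes the stream with a real gRPC status code instead of an
# in-band error message, and further writes on the closed stream are suppressed.
client.start_stream(
    callback=partial(callback, results), headers={"triton_grpc_error": "true"}
)

data = np.random.randn(2, 2).astype(np.float32)
infer_input = grpcclient.InferInput("IN", data.shape, np_to_triton_dtype(data.dtype))
infer_input.set_data_from_numpy(data)
client.async_stream_infer(model_name="execute_grpc_error", inputs=[infer_input])

reply = results.get()
if isinstance(reply, InferenceServerException):
    # The server ended the stream; status() carries the mapped gRPC code,
    # e.g. StatusCode.INTERNAL for the intentional failure in execute_grpc_error.
    print(reply.status())
else:
    print(reply.as_numpy("OUT"))

client.stop_stream()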
From a31ba09119ebe645d0838cdac79c8beac4b6a4f4 Mon Sep 17 00:00:00 2001 From: Indrajit Bhosale Date: Mon, 12 Aug 2024 15:25:28 -0700 Subject: [PATCH 18/32] Improved tests around triton_grpc_error mode --- .../lifecycle/lifecycle_test.py | 110 +++++++++++++++--- 1 file changed, 93 insertions(+), 17 deletions(-) diff --git a/qa/L0_backend_python/lifecycle/lifecycle_test.py b/qa/L0_backend_python/lifecycle/lifecycle_test.py index 0ea1e6d4e6..6fda441767 100755 --- a/qa/L0_backend_python/lifecycle/lifecycle_test.py +++ b/qa/L0_backend_python/lifecycle/lifecycle_test.py @@ -35,6 +35,7 @@ sys.path.append("../../common") import queue +import threading import time import unittest from functools import partial @@ -241,6 +242,8 @@ def test_infer_pymodel_error(self): initial_metrics_value, ) + # Test grpc stream behavior when triton_grpc_error is set to true. + # Expected to close stream and return GRPC error when model returns error. def test_triton_grpc_error_error_on(self): model_name = "execute_grpc_error" shape = [2, 2] @@ -251,7 +254,7 @@ def test_triton_grpc_error_error_on(self): triton_client.start_stream( callback=partial(callback, user_data), headers=metadata ) - + stream_end = False with self._shm_leak_detector.Probe() as shm_probe: input_datas = [] for i in range(number_of_requests): @@ -263,21 +266,100 @@ def test_triton_grpc_error_error_on(self): ) ] inputs[0].set_data_from_numpy(input_data) + try: + triton_client.async_stream_infer( + model_name=model_name, inputs=inputs + ) + result = user_data._completed_requests.get() + if type(result) == InferenceServerException: + # execute_grpc_error intentionally returns error with StatusCode.INTERNAL status on 2nd request + self.assertEqual(str(result.status()), "StatusCode.INTERNAL") + stream_end = True + else: + # Stream is not killed + output_data = result.as_numpy("OUT") + self.assertIsNotNone(output_data, "error: expected 'OUT'") + except Exception as e: + if stream_end == True: + # We expect the stream to have closed + self.assertTrue( + True, + "This should always pass as cancellation should succeed", + ) + else: + self.assertFalse( + True, "Unexpected Stream killed without Error from CORE" + ) + + # Test grpc stream behavior when triton_grpc_error is set to true in multiple open streams. + # Expected to close stream and return GRPC error when model returns error. + def test_triton_grpc_error_multithreaded(self): + thread1 = threading.Thread(target=self.test_triton_grpc_error_error_on) + thread2 = threading.Thread(target=self.test_triton_grpc_error_error_on) + # Start the threads + thread1.start() + thread2.start() + # Wait for both threads to finish + thread1.join() + thread2.join() + + # Test grpc stream behavior when triton_grpc_error is set to true and subsequent stream is cancelled. + # Expected cancellation is successful. 
+ def test_triton_grpc_error_cancel(self): + model_name = "execute_grpc_error" + shape = [2, 2] + number_of_requests = 1 + user_data = UserData() + triton_server_url = "localhost:8001" # Replace with your Triton server address + stream_end = False + triton_client = grpcclient.InferenceServerClient(triton_server_url) + + metadata = {"triton_grpc_error": "true"} + + triton_client.start_stream( + callback=partial(callback, user_data), headers=metadata + ) + + input_datas = [] + for i in range(number_of_requests): + input_data = np.random.randn(*shape).astype(np.float32) + input_datas.append(input_data) + inputs = [ + grpcclient.InferInput( + "IN", input_data.shape, np_to_triton_dtype(input_data.dtype) + ) + ] + inputs[0].set_data_from_numpy(input_data) + try: triton_client.async_stream_infer(model_name=model_name, inputs=inputs) result = user_data._completed_requests.get() + if type(result) == InferenceServerException: + stream_end = True if i == 0: - # Stream is not killed - output_data = result.as_numpy("OUT") - self.assertIsNotNone(output_data, "error: expected 'OUT'") - elif i == 1: - # execute_grpc_error intentionally returns error with StatusCode.INTERNAL status on 2nd request - self.assertIsInstance(result, InferenceServerException) - self.assertEqual(str(result.status()), "StatusCode.INTERNAL") + triton_client.stop_stream(cancel_requests=True) + except Exception as e: + if stream_end == True: + # We expect the stream to have closed + self.assertTrue( + True, + "This should always pass as cancellation should succeed", + ) + else: + self.assertFalse( + True, "Unexpected Stream killed without Error from CORE" + ) + self.assertTrue( + True, + "This should always pass as cancellation should succeed without any exception", + ) + # Test grpc stream behavior when triton_grpc_error is set to false + # and subsequent stream is NOT closed when error is reported from CORE def test_triton_grpc_error_error_off(self): model_name = "execute_grpc_error" shape = [2, 2] number_of_requests = 4 + response_counter = 0 user_data = UserData() triton_client = grpcclient.InferenceServerClient(f"{_tritonserver_ipaddr}:8001") triton_client.start_stream(callback=partial(callback, user_data)) @@ -295,15 +377,9 @@ def test_triton_grpc_error_error_off(self): inputs[0].set_data_from_numpy(input_data) triton_client.async_stream_infer(model_name=model_name, inputs=inputs) result = user_data._completed_requests.get() - if i == 1 or i == 3: - # execute_grpc_error intentionally returns error with StatusCode.INTERNAL status on 2nd request - self.assertIsInstance(result, InferenceServerException) - # Existing Behaviour - self.assertEqual(str(result.status()), "None") - elif i == 0 or i == 2: - # Stream is not killed - output_data = result.as_numpy("OUT") - self.assertIsNotNone(output_data, "error: expected 'OUT'") + response_counter += 1 + # Expect stream to not CLOSE as NOT in triton_grpc_error mode + self.assertEqual(response_counter, number_of_requests) if __name__ == "__main__": From af451a2db7eb0611c654e480ef4021b059c6b37d Mon Sep 17 00:00:00 2001 From: Indrajit Bhosale Date: Wed, 14 Aug 2024 00:10:03 -0700 Subject: [PATCH 19/32] Comments resolved --- .../lifecycle/lifecycle_test.py | 94 +++++++++---------- src/grpc/grpc_utils.h | 6 +- src/grpc/infer_handler.h | 88 +++++++++-------- src/grpc/stream_infer_handler.cc | 4 +- 4 files changed, 93 insertions(+), 99 deletions(-) diff --git a/qa/L0_backend_python/lifecycle/lifecycle_test.py b/qa/L0_backend_python/lifecycle/lifecycle_test.py index 6fda441767..bd543d784e 100755 
--- a/qa/L0_backend_python/lifecycle/lifecycle_test.py +++ b/qa/L0_backend_python/lifecycle/lifecycle_test.py @@ -255,41 +255,38 @@ def test_triton_grpc_error_error_on(self): callback=partial(callback, user_data), headers=metadata ) stream_end = False - with self._shm_leak_detector.Probe() as shm_probe: - input_datas = [] - for i in range(number_of_requests): - input_data = np.random.randn(*shape).astype(np.float32) - input_datas.append(input_data) - inputs = [ - grpcclient.InferInput( - "IN", input_data.shape, np_to_triton_dtype(input_data.dtype) + input_datas = [] + for i in range(number_of_requests): + input_data = np.random.randn(*shape).astype(np.float32) + input_datas.append(input_data) + inputs = [ + grpcclient.InferInput( + "IN", input_data.shape, np_to_triton_dtype(input_data.dtype) + ) + ] + inputs[0].set_data_from_numpy(input_data) + try: + triton_client.async_stream_infer(model_name=model_name, inputs=inputs) + result = user_data._completed_requests.get() + if type(result) == InferenceServerException: + # execute_grpc_error intentionally returns error with StatusCode.INTERNAL status on 2nd request + self.assertEqual(str(result.status()), "StatusCode.INTERNAL") + stream_end = True + else: + # Stream is not killed + output_data = result.as_numpy("OUT") + self.assertIsNotNone(output_data, "error: expected 'OUT'") + except Exception as e: + if stream_end == True: + # We expect the stream to have closed + self.assertTrue( + True, + "This should always pass as cancellation should succeed", ) - ] - inputs[0].set_data_from_numpy(input_data) - try: - triton_client.async_stream_infer( - model_name=model_name, inputs=inputs + else: + self.assertFalse( + True, "Unexpected Stream killed without Error from CORE" ) - result = user_data._completed_requests.get() - if type(result) == InferenceServerException: - # execute_grpc_error intentionally returns error with StatusCode.INTERNAL status on 2nd request - self.assertEqual(str(result.status()), "StatusCode.INTERNAL") - stream_end = True - else: - # Stream is not killed - output_data = result.as_numpy("OUT") - self.assertIsNotNone(output_data, "error: expected 'OUT'") - except Exception as e: - if stream_end == True: - # We expect the stream to have closed - self.assertTrue( - True, - "This should always pass as cancellation should succeed", - ) - else: - self.assertFalse( - True, "Unexpected Stream killed without Error from CORE" - ) # Test grpc stream behavior when triton_grpc_error is set to true in multiple open streams. # Expected to close stream and return GRPC error when model returns error. 
@@ -363,22 +360,21 @@ def test_triton_grpc_error_error_off(self): user_data = UserData() triton_client = grpcclient.InferenceServerClient(f"{_tritonserver_ipaddr}:8001") triton_client.start_stream(callback=partial(callback, user_data)) - - with self._shm_leak_detector.Probe() as shm_probe: - input_datas = [] - for i in range(number_of_requests): - input_data = np.random.randn(*shape).astype(np.float32) - input_datas.append(input_data) - inputs = [ - grpcclient.InferInput( - "IN", input_data.shape, np_to_triton_dtype(input_data.dtype) - ) - ] - inputs[0].set_data_from_numpy(input_data) - triton_client.async_stream_infer(model_name=model_name, inputs=inputs) - result = user_data._completed_requests.get() - response_counter += 1 - # Expect stream to not CLOSE as NOT in triton_grpc_error mode + input_datas = [] + for i in range(number_of_requests): + input_data = np.random.randn(*shape).astype(np.float32) + input_datas.append(input_data) + inputs = [ + grpcclient.InferInput( + "IN", input_data.shape, np_to_triton_dtype(input_data.dtype) + ) + ] + inputs[0].set_data_from_numpy(input_data) + triton_client.async_stream_infer(model_name=model_name, inputs=inputs) + result = user_data._completed_requests.get() + response_counter += 1 + # we expect response_counter == number_of_requests, + # which indicates that after the first reported grpc error stream did NOT close and mode != triton_grpc_error self.assertEqual(response_counter, number_of_requests) diff --git a/src/grpc/grpc_utils.h b/src/grpc/grpc_utils.h index 9f1c54527a..17cea206d3 100644 --- a/src/grpc/grpc_utils.h +++ b/src/grpc/grpc_utils.h @@ -81,11 +81,11 @@ typedef enum { NONE, // Error from CORE encountered, waiting to be picked up by completion queue to // initiate cancellation - ERROR_WAITING, + ERROR_ENCOUNTERED, // Error from CORE encountered, stream closed // This state is added to avoid double cancellation - ERROR_CANCELED -} Triton_grpc_error_steps; + ERROR_HANDLING_COMPLETE +} TritonGRPCErrorSteps; // Debugging helper std::ostream& operator<<(std::ostream& out, const Steps& step); diff --git a/src/grpc/infer_handler.h b/src/grpc/infer_handler.h index c93c3ba7eb..9cfa9822df 100644 --- a/src/grpc/infer_handler.h +++ b/src/grpc/infer_handler.h @@ -643,7 +643,7 @@ class InferHandlerState { : cq_(cq), unique_id_(unique_id), ongoing_requests_(0), step_(Steps::START), finish_ok_(true), ongoing_write_(false), received_notification_(false), triton_grpc_error_(false), - grpc_stream_error_state_(Triton_grpc_error_steps::NONE) + grpc_stream_error_state_(TritonGRPCErrorSteps::NONE) { ctx_.reset(new ::grpc::ServerContext()); responder_.reset(new ServerResponderType(ctx_.get())); @@ -665,19 +665,19 @@ class InferHandlerState { bool ReceivedNotification() { return received_notification_; } - // Changes the state of grpc_stream_error_state_ to ERROR_CANCELED, + // Changes the state of grpc_stream_error_state_ to ERROR_HANDLING_COMPLETE, // indicating we have closed the stream and initiated the cancel flow - void SetGRPCErrorCancelled() + void MarkGRPCErrorHandlingComplete() { - grpc_stream_error_state_ = Triton_grpc_error_steps::ERROR_CANCELED; + grpc_stream_error_state_ = TritonGRPCErrorSteps::ERROR_HANDLING_COMPLETE; } // Returns true ONLY when GRPC_ERROR from CORE is waiting to be processed. 
bool CheckAndUpdateGRPCError() { - if (grpc_stream_error_state_ == Triton_grpc_error_steps::ERROR_WAITING) { - // Change the state to ERROR_CANCELED as we have called + if (grpc_stream_error_state_ == TritonGRPCErrorSteps::ERROR_ENCOUNTERED) { + // Change the state to ERROR_HANDLING_COMPLETE as we have called // HandleCancellation - SetGRPCErrorCancelled(); + MarkGRPCErrorHandlingComplete(); return true; } return false; @@ -689,39 +689,6 @@ class InferHandlerState { ? (ctx_->IsCancelled() || CheckAndUpdateGRPCError()) : false; } - - // Extracts headers from GRPC request and updates state - void ExtractStateFromHeaders(InferHandlerStateType* state) - { - const auto& metadata = state->context_->ctx_->client_metadata(); - for (const auto& pair : metadata) { - auto& key = pair.first; - auto& value = pair.second; - std::string param_key = std::string(key.begin(), key.end()); - std::string value_key = std::string(value.begin(), value.end()); - std::string triton_grpc_error_key = "triton_grpc_error"; - if (param_key == triton_grpc_error_key) { - if (value_key == "true") { - LOG_VERBOSE(2) - << "GRPC: triton_grpc_error mode detected in new grpc stream"; - state->context_->triton_grpc_error_ = true; - } - } - } - } - - void SendGRPCStrictResponse(InferHandlerStateType* state) - { - std::lock_guard lock(state->context_->mu_); - // Check if Error not responded previously - // Avoid closing connection twice on multiple errors from core - if (!state->context_->IsGRPCStrictError()) { - state->step_ = Steps::COMPLETE; - state->context_->responder_->Finish(state->status_, state); - // Mark error for this stream - state->context_->MarkGRPCStrictError(); - } - } // Increments the ongoing request counter void IncrementRequestCounter() { ongoing_requests_++; } @@ -763,6 +730,37 @@ class InferHandlerState { return false; } + // Extracts headers from GRPC request and updates state + void ExtractStateFromHeaders(InferHandlerStateType* state) + { + const auto& metadata = state->context_->ctx_->client_metadata(); + std::string triton_grpc_error_key = "triton_grpc_error"; + + auto it = metadata.find( + {triton_grpc_error_key.data(), triton_grpc_error_key.size()}); + + if (it != metadata.end()) { + if (it->second == "true") { + LOG_VERBOSE(2) + << "GRPC: triton_grpc_error mode detected in new grpc stream"; + triton_grpc_error_ = true; + } + } + } + + void WriteGRPCErrorResponse(InferHandlerStateType* state) + { + std::lock_guard lock(state->context_->mu_); + // Check if Error not responded previously + // Avoid closing connection twice on multiple errors from core + if (!state->context_->GRPCErrorEncountered()) { + state->step_ = Steps::COMPLETE; + state->context_->responder_->Finish(state->status_, state); + // Mark error for this stream + state->context_->MarkGRPCErrorEncountered(); + } + } + const std::string DebugString(InferHandlerStateType* state) { std::string debug_string(""); @@ -995,15 +993,15 @@ class InferHandlerState { } // Marks error after it has been responded to - void MarkGRPCStrictError() + void MarkGRPCErrorEncountered() { - grpc_stream_error_state_ = Triton_grpc_error_steps::ERROR_WAITING; + grpc_stream_error_state_ = TritonGRPCErrorSteps::ERROR_ENCOUNTERED; } // Checks if error already responded to in triton_grpc_error mode - bool IsGRPCStrictError() + bool GRPCErrorEncountered() { - if (grpc_stream_error_state_ == Triton_grpc_error_steps::NONE) { + if (grpc_stream_error_state_ == TritonGRPCErrorSteps::NONE) { return false; } return true; @@ -1074,7 +1072,7 @@ class InferHandlerState { // 
Indicates the state of triton_grpc_error, only relevant if special // triton_grpc_error feature set to true by client - Triton_grpc_error_steps grpc_stream_error_state_; + TritonGRPCErrorSteps grpc_stream_error_state_; }; // This constructor is used to build a wrapper state object diff --git a/src/grpc/stream_infer_handler.cc b/src/grpc/stream_infer_handler.cc index 6a14ed9d4c..836282060a 100644 --- a/src/grpc/stream_infer_handler.cc +++ b/src/grpc/stream_infer_handler.cc @@ -598,7 +598,7 @@ ModelStreamInferHandler::StreamInferResponseComplete( // Ignore Response from CORE in case GRPC Strict as we dont care about if (state->context_->triton_grpc_error_) { std::lock_guard lock(state->context_->mu_); - if (state->context_->IsGRPCStrictError()) { + if (state->context_->GRPCErrorEncountered()) { return; } } @@ -690,7 +690,7 @@ ModelStreamInferHandler::StreamInferResponseComplete( LOG_VERBOSE(1) << "GRPC streaming error detected with status: " << status.error_code() << "Closing stream connection." << std::endl; - state->context_->SendGRPCStrictResponse(state); + state->context_->WriteGRPCErrorResponse(state); TRITONSERVER_ErrorDelete(err); LOG_TRITONSERVER_ERROR( TRITONSERVER_InferenceResponseDelete(iresponse), From 0cb7db0ac2f397eee9ab9b1ec8352efc193226d5 Mon Sep 17 00:00:00 2001 From: Indrajit Bhosale Date: Wed, 14 Aug 2024 00:11:50 -0700 Subject: [PATCH 20/32] Comments resolved --- qa/L0_backend_python/lifecycle/lifecycle_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/qa/L0_backend_python/lifecycle/lifecycle_test.py b/qa/L0_backend_python/lifecycle/lifecycle_test.py index bd543d784e..3f1e3d62bb 100755 --- a/qa/L0_backend_python/lifecycle/lifecycle_test.py +++ b/qa/L0_backend_python/lifecycle/lifecycle_test.py @@ -373,7 +373,7 @@ def test_triton_grpc_error_error_off(self): triton_client.async_stream_infer(model_name=model_name, inputs=inputs) result = user_data._completed_requests.get() response_counter += 1 - # we expect response_counter == number_of_requests, + # we expect response_counter == number_of_requests, # which indicates that after the first reported grpc error stream did NOT close and mode != triton_grpc_error self.assertEqual(response_counter, number_of_requests) From 8ea26471441dcdabb9fc1b334469dafa8f90ce60 Mon Sep 17 00:00:00 2001 From: Indrajit Bhosale Date: Wed, 14 Aug 2024 10:32:54 -0700 Subject: [PATCH 21/32] New class gRPCErrorTracker created --- src/grpc/grpc_utils.h | 30 +++++++++++++++++- src/grpc/infer_handler.h | 54 +++++--------------------------- src/grpc/stream_infer_handler.cc | 44 ++++++++++++++++++++++++-- 3 files changed, 78 insertions(+), 50 deletions(-) diff --git a/src/grpc/grpc_utils.h b/src/grpc/grpc_utils.h index 17cea206d3..032dec3ad9 100644 --- a/src/grpc/grpc_utils.h +++ b/src/grpc/grpc_utils.h @@ -87,6 +87,35 @@ typedef enum { ERROR_HANDLING_COMPLETE } TritonGRPCErrorSteps; +class gRPCErrorTracker { + public: + // True if set by user via header + // Can be accessed without a lock, as set only once in startstream + std::atomic triton_grpc_error_; + + // Indicates the state of triton_grpc_error, only relevant if special + // triton_grpc_error feature set to true by client + TritonGRPCErrorSteps grpc_stream_error_state_; + + // Constructor + gRPCErrorTracker() + : triton_grpc_error_(false), + grpc_stream_error_state_(TritonGRPCErrorSteps::NONE) + { + } + // Changes the state of grpc_stream_error_state_ to ERROR_HANDLING_COMPLETE, + // indicating we have closed the stream and initiated the cancel flow + void 
MarkGRPCErrorHandlingComplete(); + + // Returns true ONLY when GRPC_ERROR from CORE is waiting to be processed. + bool CheckAndUpdateGRPCError(); + + // Marks error after it has been responded to + void MarkGRPCErrorEncountered(); + + // Checks if error already responded to in triton_grpc_error mode + bool GRPCErrorEncountered(); +}; // Debugging helper std::ostream& operator<<(std::ostream& out, const Steps& step); @@ -194,5 +223,4 @@ TRITONSERVER_Error* ParseClassificationParams( void ReadFile(const std::string& filename, std::string& data); - }}} // namespace triton::server::grpc diff --git a/src/grpc/infer_handler.h b/src/grpc/infer_handler.h index 9cfa9822df..6382c96c3c 100644 --- a/src/grpc/infer_handler.h +++ b/src/grpc/infer_handler.h @@ -642,11 +642,11 @@ class InferHandlerState { ::grpc::ServerCompletionQueue* cq, const uint64_t unique_id = 0) : cq_(cq), unique_id_(unique_id), ongoing_requests_(0), step_(Steps::START), finish_ok_(true), ongoing_write_(false), - received_notification_(false), triton_grpc_error_(false), - grpc_stream_error_state_(TritonGRPCErrorSteps::NONE) + received_notification_(false) { ctx_.reset(new ::grpc::ServerContext()); responder_.reset(new ServerResponderType(ctx_.get())); + gRPCErrorTracker_ = std::make_unique(); } void SetCompressionLevel(grpc_compression_level compression_level) @@ -665,28 +665,11 @@ class InferHandlerState { bool ReceivedNotification() { return received_notification_; } - // Changes the state of grpc_stream_error_state_ to ERROR_HANDLING_COMPLETE, - // indicating we have closed the stream and initiated the cancel flow - void MarkGRPCErrorHandlingComplete() - { - grpc_stream_error_state_ = TritonGRPCErrorSteps::ERROR_HANDLING_COMPLETE; - } - // Returns true ONLY when GRPC_ERROR from CORE is waiting to be processed. - bool CheckAndUpdateGRPCError() - { - if (grpc_stream_error_state_ == TritonGRPCErrorSteps::ERROR_ENCOUNTERED) { - // Change the state to ERROR_HANDLING_COMPLETE as we have called - // HandleCancellation - MarkGRPCErrorHandlingComplete(); - return true; - } - return false; - } - bool IsCancelled() { return received_notification_ - ? (ctx_->IsCancelled() || CheckAndUpdateGRPCError()) + ? 
(ctx_->IsCancelled() || + gRPCErrorTracker_->CheckAndUpdateGRPCError()) : false; } // Increments the ongoing request counter @@ -743,7 +726,7 @@ class InferHandlerState { if (it->second == "true") { LOG_VERBOSE(2) << "GRPC: triton_grpc_error mode detected in new grpc stream"; - triton_grpc_error_ = true; + state->context_->gRPCErrorTracker_->triton_grpc_error_ = true; } } } @@ -753,11 +736,11 @@ class InferHandlerState { std::lock_guard lock(state->context_->mu_); // Check if Error not responded previously // Avoid closing connection twice on multiple errors from core - if (!state->context_->GRPCErrorEncountered()) { + if (!state->context_->gRPCErrorTracker_->GRPCErrorEncountered()) { state->step_ = Steps::COMPLETE; state->context_->responder_->Finish(state->status_, state); // Mark error for this stream - state->context_->MarkGRPCErrorEncountered(); + state->context_->gRPCErrorTracker_->MarkGRPCErrorEncountered(); } } @@ -992,21 +975,6 @@ class InferHandlerState { return false; } - // Marks error after it has been responded to - void MarkGRPCErrorEncountered() - { - grpc_stream_error_state_ = TritonGRPCErrorSteps::ERROR_ENCOUNTERED; - } - - // Checks if error already responded to in triton_grpc_error mode - bool GRPCErrorEncountered() - { - if (grpc_stream_error_state_ == TritonGRPCErrorSteps::NONE) { - return false; - } - return true; - } - // Return true if this context has completed all reads and writes. bool IsRequestsCompleted() { @@ -1066,13 +1034,7 @@ class InferHandlerState { // completion queue. bool received_notification_; - // True if set by user via header - // Can be accessed without a lock, as set only once in startstream - std::atomic triton_grpc_error_; - - // Indicates the state of triton_grpc_error, only relevant if special - // triton_grpc_error feature set to true by client - TritonGRPCErrorSteps grpc_stream_error_state_; + std::unique_ptr gRPCErrorTracker_; }; // This constructor is used to build a wrapper state object diff --git a/src/grpc/stream_infer_handler.cc b/src/grpc/stream_infer_handler.cc index 836282060a..6651eca813 100644 --- a/src/grpc/stream_infer_handler.cc +++ b/src/grpc/stream_infer_handler.cc @@ -596,9 +596,9 @@ ModelStreamInferHandler::StreamInferResponseComplete( { State* state = reinterpret_cast(userp); // Ignore Response from CORE in case GRPC Strict as we dont care about - if (state->context_->triton_grpc_error_) { + if (state->context_->gRPCErrorTracker_->triton_grpc_error_) { std::lock_guard lock(state->context_->mu_); - if (state->context_->GRPCErrorEncountered()) { + if (state->context_->gRPCErrorTracker_->GRPCErrorEncountered()) { return; } } @@ -684,7 +684,7 @@ ModelStreamInferHandler::StreamInferResponseComplete( response->mutable_infer_response()->Clear(); response->set_error_message(status.error_message()); LOG_VERBOSE(1) << "Failed for ID: " << log_request_id << std::endl; - if (state->context_->triton_grpc_error_) { + if (state->context_->gRPCErrorTracker_->triton_grpc_error_) { state->status_ = status; // Finish only once, if backend ignores cancellation LOG_VERBOSE(1) << "GRPC streaming error detected with status: " @@ -820,4 +820,42 @@ ModelStreamInferHandler::StreamInferResponseComplete( } } +// Changes the state of grpc_stream_error_state_ to ERROR_HANDLING_COMPLETE, +// indicating we have closed the stream and initiated the cancel flow +void +gRPCErrorTracker::MarkGRPCErrorHandlingComplete() +{ + grpc_stream_error_state_ = TritonGRPCErrorSteps::ERROR_HANDLING_COMPLETE; +} + +// Returns true ONLY when GRPC_ERROR from CORE is 
waiting to be processed. +bool +gRPCErrorTracker::CheckAndUpdateGRPCError() +{ + if (grpc_stream_error_state_ == TritonGRPCErrorSteps::ERROR_ENCOUNTERED) { + // Change the state to ERROR_HANDLING_COMPLETE as we have called + // HandleCancellation + MarkGRPCErrorHandlingComplete(); + return true; + } + return false; +} + +// Marks error after it has been responded to +void +gRPCErrorTracker::MarkGRPCErrorEncountered() +{ + grpc_stream_error_state_ = TritonGRPCErrorSteps::ERROR_ENCOUNTERED; +} + +// Checks if error already responded to in triton_grpc_error mode +bool +gRPCErrorTracker::GRPCErrorEncountered() +{ + if (grpc_stream_error_state_ == TritonGRPCErrorSteps::NONE) { + return false; + } + return true; +} + }}} // namespace triton::server::grpc From e8c3242b5e902ab983bbac8d9fc2fef4f2fa16a0 Mon Sep 17 00:00:00 2001 From: Indrajit Bhosale Date: Wed, 14 Aug 2024 11:51:04 -0700 Subject: [PATCH 22/32] Docs Updated --- docs/customization_guide/inference_protocols.md | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/docs/customization_guide/inference_protocols.md b/docs/customization_guide/inference_protocols.md index 592f26e7d1..a241f097da 100644 --- a/docs/customization_guide/inference_protocols.md +++ b/docs/customization_guide/inference_protocols.md @@ -115,6 +115,16 @@ These options can be used to configure the KeepAlive settings: For client-side documentation, see [Client-Side GRPC KeepAlive](https://github.com/triton-inference-server/client/blob/main/README.md#grpc-keepalive). +#### GRPC Status Codes + +Triton implements GRPC error handling for streaming requests when a specific flag is enabled through headers. Upon encountering an error, Triton returns the appropriate GRPC error code and subsequently closes the stream. + +* `triton_grpc_error` : The header value needs to be set to true while starting the stream. + +GRPC status codes can be used for better visibility and monitoring. 
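+The snippet below is a minimal sketch of a streaming client that opts in to this
+behavior by passing the header when the stream is started. It assumes a server
+listening on `localhost:8001` and uses a placeholder model name (`repeat_int32`)
+and input tensor; substitute the inputs your own decoupled model expects.
+
+```python
+import queue
+from functools import partial
+
+import numpy as np
+import tritonclient.grpc as grpcclient
+from tritonclient.utils import InferenceServerException
+
+responses = queue.Queue()
+
+
+def callback(result_queue, result, error):
+    # On failure, `error` carries the gRPC status reported by Triton.
+    result_queue.put(error if error is not None else result)
+
+
+with grpcclient.InferenceServerClient(url="localhost:8001") as client:
+    # Opt in to gRPC error handling for this stream via the header.
+    client.start_stream(
+        callback=partial(callback, responses),
+        headers={"triton_grpc_error": "true"},
+    )
+
+    # Placeholder request; replace with the inputs your model expects.
+    inputs = [grpcclient.InferInput("IN", [1], "INT32")]
+    inputs[0].set_data_from_numpy(np.array([1], dtype=np.int32))
+    client.async_stream_infer(model_name="repeat_int32", inputs=inputs)
+
+    result = responses.get()
+    if isinstance(result, InferenceServerException):
+        # After the first error Triton closes the stream; no further
+        # responses are delivered for in-flight requests.
+        print("Stream closed with error:", result)
+```
+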
For more details, see [gRPC Status Codes](https://grpc.io/docs/guides/status-codes/) + +For client-side documentation, see [Client-Side GRPC Status Codes](https://github.com/triton-inference-server/client/tree/main#GRPC-Status-Codes) + ### Limit Endpoint Access (BETA) Triton users may want to restrict access to protocols or APIs that are From 72097e3ba52113f215207461493e905a7de75260 Mon Sep 17 00:00:00 2001 From: Indrajit Bhosale Date: Wed, 14 Aug 2024 17:05:33 -0700 Subject: [PATCH 23/32] Pipeline test --- qa/L0_decoupled/decoupled_test.py | 16 +++++- qa/L0_decoupled/test.sh | 91 +++++++++++++++++-------------- 2 files changed, 64 insertions(+), 43 deletions(-) diff --git a/qa/L0_decoupled/decoupled_test.py b/qa/L0_decoupled/decoupled_test.py index 1f76f4845b..d7bc59f5c7 100755 --- a/qa/L0_decoupled/decoupled_test.py +++ b/qa/L0_decoupled/decoupled_test.py @@ -116,7 +116,13 @@ def _stream_infer_with_params( url="localhost:8001", verbose=True ) as triton_client: # Establish stream - triton_client.start_stream(callback=partial(callback, user_data)) + if "TRITONSERVER_GRPC_STATUS_FLAG" in os.environ: + metadata = {"triton_grpc_error": "true"} + triton_client.start_stream( + callback=partial(callback, user_data), headers=metadata + ) + else: + triton_client.start_stream(callback=partial(callback, user_data)) # Send specified many requests in parallel for i in range(request_count): time.sleep((request_delay / 1000)) @@ -175,7 +181,13 @@ def _stream_infer( url="localhost:8001", verbose=True ) as triton_client: # Establish stream - triton_client.start_stream(callback=partial(callback, user_data)) + if "TRITONSERVER_GRPC_STATUS_FLAG" in os.environ: + metadata = {"triton_grpc_error": "true"} + triton_client.start_stream( + callback=partial(callback, user_data), headers=metadata + ) + else: + triton_client.start_stream(callback=partial(callback, user_data)) # Send specified many requests in parallel for i in range(request_count): time.sleep((request_delay / 1000)) diff --git a/qa/L0_decoupled/test.sh b/qa/L0_decoupled/test.sh index 98ad134d8b..649e8e4545 100755 --- a/qa/L0_decoupled/test.sh +++ b/qa/L0_decoupled/test.sh @@ -55,51 +55,60 @@ source ../common/util.sh TRIALS="python custom" +GRPC_TRIALS="triton_grpc_error_true triton_grpc_error_false" + +for grpc_trial in $GRPC_TRIALS; do + for trial in $TRIALS; do + if [ $trial == "python" ]; then + MODELDIR=`pwd`/python_models + else + MODELDIR=`pwd`/models + fi -for trial in $TRIALS; do - if [ $trial == "python" ]; then - MODELDIR=`pwd`/python_models - else - MODELDIR=`pwd`/models - fi + if [ $grpc_trial == "triton_grpc_error_true" ]; then + export TRITONSERVER_GRPC_STATUS_FLAG=true + else + unset TRITONSERVER_GRPC_STATUS_FLAG + fi - SERVER_ARGS="--model-repository=$MODELDIR" - cp -r $DATADIR/libtorch_nobatch_int32_int32_int32 $MODELDIR/. - (cd $MODELDIR/libtorch_nobatch_int32_int32_int32 && \ - sed -i "s/dims:.*\[.*\]/dims: \[ 1 \]/g" config.pbtxt) + SERVER_ARGS="--model-repository=$MODELDIR" + cp -r $DATADIR/libtorch_nobatch_int32_int32_int32 $MODELDIR/. 
+ (cd $MODELDIR/libtorch_nobatch_int32_int32_int32 && \ + sed -i "s/dims:.*\[.*\]/dims: \[ 1 \]/g" config.pbtxt) - run_server - if [ "$SERVER_PID" == "0" ]; then - echo -e "\n***\n*** Failed to start $SERVER\n***" - cat $SERVER_LOG - exit 1 - fi + run_server + if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 + fi - for i in \ - test_one_to_none \ - test_one_to_one \ - test_one_to_many \ - test_no_streaming \ - test_response_order \ - test_wrong_shape; do - - echo "Test: $i" >>$CLIENT_LOG - set +e - python $DECOUPLED_TEST DecoupledTest.$i >>$CLIENT_LOG 2>&1 - if [ $? -ne 0 ]; then - echo -e "\n***\n*** Test $i Failed\n***" >>$CLIENT_LOG - echo -e "\n***\n*** Test $i Failed\n***" - RET=1 - else - check_test_results $TEST_RESULT_FILE 1 - if [ $? -ne 0 ]; then - cat $CLIENT_LOG - echo -e "\n***\n*** Test Result Verification Failed\n***" - RET=1 - fi - fi - set -e - done + for i in \ + test_one_to_none \ + test_one_to_one \ + test_one_to_many \ + test_no_streaming \ + test_response_order \ + test_wrong_shape; do + + echo "Test: $i" >>$CLIENT_LOG + set +e + python $DECOUPLED_TEST DecoupledTest.$i >>$CLIENT_LOG 2>&1 + if [ $? -ne 0 ]; then + echo -e "\n***\n*** Test $i Failed\n***" >>$CLIENT_LOG + echo -e "\n***\n*** Test $i Failed\n***" + RET=1 + else + check_test_results $TEST_RESULT_FILE 1 + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi + fi + set -e + done +done # Will delay the writing of each response by the specified many milliseconds. # This will ensure that there are multiple responses available to be written. From 350af2593b62f9f7fe9a3a54ba6615a6bf200cfe Mon Sep 17 00:00:00 2001 From: Indrajit Bhosale Date: Thu, 15 Aug 2024 16:16:51 -0700 Subject: [PATCH 24/32] Resolve Unused local variable warning --- qa/L0_backend_python/lifecycle/lifecycle_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/qa/L0_backend_python/lifecycle/lifecycle_test.py b/qa/L0_backend_python/lifecycle/lifecycle_test.py index 3f1e3d62bb..607726b961 100755 --- a/qa/L0_backend_python/lifecycle/lifecycle_test.py +++ b/qa/L0_backend_python/lifecycle/lifecycle_test.py @@ -371,7 +371,7 @@ def test_triton_grpc_error_error_off(self): ] inputs[0].set_data_from_numpy(input_data) triton_client.async_stream_infer(model_name=model_name, inputs=inputs) - result = user_data._completed_requests.get() + _ = user_data._completed_requests.get() response_counter += 1 # we expect response_counter == number_of_requests, # which indicates that after the first reported grpc error stream did NOT close and mode != triton_grpc_error From e473f29c1b917cfb8a5a0787bd7f56d09d545c29 Mon Sep 17 00:00:00 2001 From: Indrajit Bhosale Date: Thu, 15 Aug 2024 18:24:26 -0700 Subject: [PATCH 25/32] GRPC Cleanup tests updated for triton grpc error --- qa/L0_grpc_state_cleanup/cleanup_test.py | 36 +++- qa/L0_grpc_state_cleanup/test.sh | 239 ++++++++++++----------- 2 files changed, 151 insertions(+), 124 deletions(-) diff --git a/qa/L0_grpc_state_cleanup/cleanup_test.py b/qa/L0_grpc_state_cleanup/cleanup_test.py index 431eeb1720..01c04cc66b 100755 --- a/qa/L0_grpc_state_cleanup/cleanup_test.py +++ b/qa/L0_grpc_state_cleanup/cleanup_test.py @@ -161,9 +161,15 @@ def _stream_infer_with_params( url="localhost:8001", verbose=True ) as triton_client: # Establish stream - triton_client.start_stream( - callback=partial(callback, user_data), stream_timeout=stream_timeout - ) + if "TRITONSERVER_GRPC_STATUS_FLAG" in 
os.environ: + metadata = {"triton_grpc_error": "true"} + triton_client.start_stream( + callback=partial(callback, user_data), stream_timeout=stream_timeout, headers=metadata + ) + else: + triton_client.start_stream( + callback=partial(callback, user_data), stream_timeout=stream_timeout + ) # Send specified many requests in parallel for i in range(request_count): time.sleep((request_delay / 1000)) @@ -229,9 +235,15 @@ def _stream_infer( url="localhost:8001", verbose=True ) as triton_client: # Establish stream - triton_client.start_stream( - callback=partial(callback, user_data), stream_timeout=stream_timeout - ) + if "TRITONSERVER_GRPC_STATUS_FLAG" in os.environ: + metadata = {"triton_grpc_error": "true"} + triton_client.start_stream( + callback=partial(callback, user_data), stream_timeout=stream_timeout, headers=metadata + ) + else: + triton_client.start_stream( + callback=partial(callback, user_data), stream_timeout=stream_timeout + ) # Send specified many requests in parallel for i in range(request_count): time.sleep((request_delay / 1000)) @@ -608,9 +620,15 @@ def test_non_decoupled_streaming_multi_response(self): url="localhost:8001", verbose=True ) as client: # Establish stream - client.start_stream( - callback=partial(callback, user_data), stream_timeout=16 - ) + if "TRITONSERVER_GRPC_STATUS_FLAG" in os.environ: + metadata = {"triton_grpc_error": "true"} + client.start_stream( + callback=partial(callback, user_data), stream_timeout=16, headers=metadata + ) + else: + client.start_stream( + callback=partial(callback, user_data), stream_timeout=16 + ) # Send a request client.async_stream_infer( model_name=self.repeat_non_decoupled_model_name, diff --git a/qa/L0_grpc_state_cleanup/test.sh b/qa/L0_grpc_state_cleanup/test.sh index df302d5ed1..49e11b47a8 100755 --- a/qa/L0_grpc_state_cleanup/test.sh +++ b/qa/L0_grpc_state_cleanup/test.sh @@ -79,89 +79,131 @@ rm -rf models/repeat_int32_non_decoupled && \ sed -i "/model_transaction_policy/,+2d" config.pbtxt && \ sed -i "s/repeat_int32/repeat_int32_non_decoupled/" config.pbtxt) -for i in test_simple_infer \ - test_simple_infer_cancellation \ - test_simple_infer_timeout \ - test_streaming_infer \ - test_streaming_timeout \ - test_streaming_cancellation \ - test_decoupled_infer \ - test_decoupled_cancellation \ - test_decoupled_timeout \ - test_non_decoupled_streaming_multi_response; do - SERVER_LOG="./inference_server.$i.log" - SERVER_ARGS="--model-repository=`pwd`/models --log-verbose=2" - run_server - if [ "$SERVER_PID" == "0" ]; then - echo -e "\n***\n*** Failed to start $SERVER\n***" - cat $SERVER_LOG - exit 1 - fi - - echo "Test: $i" >>$CLIENT_LOG +GRPC_TRIALS="triton_grpc_error_true triton_grpc_error_true" - set +e - python $CLEANUP_TEST CleanUpTest.$i >>$CLIENT_LOG 2>&1 - if [ $? -ne 0 ]; then - echo -e "\n***\n*** Test $i Failed\n***" >>$CLIENT_LOG - echo -e "\n***\n*** Test $i Failed\n***" - RET=1 +for grpc_trial in $GRPC_TRIALS; do + if [ $grpc_trial == "triton_grpc_error_true" ]; then + export TRITONSERVER_GRPC_STATUS_FLAG=true + else + unset TRITONSERVER_GRPC_STATUS_FLAG fi - - kill $SERVER_PID - wait $SERVER_PID - - check_state_release $SERVER_LOG - if [ $? 
-ne 0 ]; then - cat $SERVER_LOG - echo -e "\n***\n*** State Verification Failed for $i\n***" + for i in test_simple_infer \ + test_simple_infer_cancellation \ + test_simple_infer_timeout \ + test_streaming_infer \ + test_streaming_timeout \ + test_streaming_cancellation \ + test_decoupled_infer \ + test_decoupled_cancellation \ + test_decoupled_timeout \ + test_non_decoupled_streaming_multi_response; do + SERVER_LOG="./inference_server.$i.log" + SERVER_ARGS="--model-repository=`pwd`/models --log-verbose=2" + run_server + if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 + fi + + echo "Test: $i" >>$CLIENT_LOG + + set +e + python $CLEANUP_TEST CleanUpTest.$i >>$CLIENT_LOG 2>&1 + if [ $? -ne 0 ]; then + echo -e "\n***\n*** Test $i Failed\n***" >>$CLIENT_LOG + echo -e "\n***\n*** Test $i Failed\n***" RET=1 - fi - set -e -done - - -for i in test_simple_infer_error_status \ - test_streaming_error_status \ - test_decoupled_error_status; do - SERVER_LOG="./inference_server.$i.log" - SERVER_ARGS="--model-repository=`pwd`/models --log-verbose=2 --grpc-restricted-protocol=inference:infer-key=infer-value" - run_server - if [ "$SERVER_PID" == "0" ]; then - echo -e "\n***\n*** Failed to start $SERVER\n***" - cat $SERVER_LOG - exit 1 - fi - - echo "Test: $i" >>$CLIENT_LOG + fi + + kill $SERVER_PID + wait $SERVER_PID + + check_state_release $SERVER_LOG + if [ $? -ne 0 ]; then + cat $SERVER_LOG + echo -e "\n***\n*** State Verification Failed for $i\n***" + RET=1 + fi + set -e + done + + + for i in test_simple_infer_error_status \ + test_streaming_error_status \ + test_decoupled_error_status; do + SERVER_LOG="./inference_server.$i.log" + SERVER_ARGS="--model-repository=`pwd`/models --log-verbose=2 --grpc-restricted-protocol=inference:infer-key=infer-value" + run_server + if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 + fi + + echo "Test: $i" >>$CLIENT_LOG + + set +e + python $CLEANUP_TEST CleanUpTest.$i >>$CLIENT_LOG 2>&1 + if [ $? -ne 0 ]; then + echo -e "\n***\n*** Test $i Failed\n***" >>$CLIENT_LOG + echo -e "\n***\n*** Test $i Failed\n***" + RET=1 + fi + + kill $SERVER_PID + wait $SERVER_PID + + check_state_release $SERVER_LOG + if [ $? -ne 0 ]; then + cat $SERVER_LOG + echo -e "\n***\n*** State Verification Failed for $i\n***" + RET=1 + fi + + set -e + done + + for i in test_simple_infer_shutdownserver \ + test_streaming_infer_shutdownserver \ + test_decoupled_infer_shutdownserver \ + test_decoupled_infer_with_params_shutdownserver; do + SERVER_ARGS="--model-repository=`pwd`/models --log-verbose=2" + SERVER_LOG="./inference_server.$i.log" + run_server + if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 + fi + + echo "Test: $i" >>$CLIENT_LOG + + set +e + SERVER_PID=$SERVER_PID python $CLEANUP_TEST CleanUpTest.$i >>$CLIENT_LOG 2>&1 + if [ $? -ne 0 ]; then + echo -e "\n***\n*** Test $i Failed\n***" >>$CLIENT_LOG + echo -e "\n***\n*** Test $i Failed\n***" + RET=1 + fi - set +e - python $CLEANUP_TEST CleanUpTest.$i >>$CLIENT_LOG 2>&1 - if [ $? -ne 0 ]; then - echo -e "\n***\n*** Test $i Failed\n***" >>$CLIENT_LOG - echo -e "\n***\n*** Test $i Failed\n***" - RET=1 - fi + wait $SERVER_PID - kill $SERVER_PID - wait $SERVER_PID + check_state_release $SERVER_LOG + if [ $? -ne 0 ]; then + cat $SERVER_LOG + echo -e "\n***\n*** State Verification Failed for $i\n***" + RET=1 + fi - check_state_release $SERVER_LOG - if [ $? 
-ne 0 ]; then - cat $SERVER_LOG - echo -e "\n***\n*** State Verification Failed for $i\n***" - RET=1 - fi + set -e + done - set -e -done + TEST_NAME=test_decoupled_infer_complete + export TRITONSERVER_DELAY_GRPC_COMPLETE=2000 -for i in test_simple_infer_shutdownserver \ - test_streaming_infer_shutdownserver \ - test_decoupled_infer_shutdownserver \ - test_decoupled_infer_with_params_shutdownserver; do + SERVER_LOG="./inference_server.$TEST_NAME.log" SERVER_ARGS="--model-repository=`pwd`/models --log-verbose=2" - SERVER_LOG="./inference_server.$i.log" run_server if [ "$SERVER_PID" == "0" ]; then echo -e "\n***\n*** Failed to start $SERVER\n***" @@ -169,63 +211,30 @@ for i in test_simple_infer_shutdownserver \ exit 1 fi - echo "Test: $i" >>$CLIENT_LOG + echo "Test: $TEST_NAME" >>$CLIENT_LOG set +e - SERVER_PID=$SERVER_PID python $CLEANUP_TEST CleanUpTest.$i >>$CLIENT_LOG 2>&1 + + SERVER_LOG=$SERVER_LOG python $CLEANUP_TEST CleanUpTest.$TEST_NAME >>$CLIENT_LOG 2>&1 if [ $? -ne 0 ]; then - echo -e "\n***\n*** Test $i Failed\n***" >>$CLIENT_LOG - echo -e "\n***\n*** Test $i Failed\n***" + cat $CLIENT_LOG + echo -e "\n***\n*** Test $TEST_NAME Failed\n***" RET=1 fi + kill $SERVER_PID wait $SERVER_PID check_state_release $SERVER_LOG if [ $? -ne 0 ]; then cat $SERVER_LOG - echo -e "\n***\n*** State Verification Failed for $i\n***" - RET=1 + echo -e "\n***\n*** State Verification Failed for $TEST_NAME\n***" + RET=1 fi set -e done -TEST_NAME=test_decoupled_infer_complete -export TRITONSERVER_DELAY_GRPC_COMPLETE=2000 - -SERVER_LOG="./inference_server.$TEST_NAME.log" -SERVER_ARGS="--model-repository=`pwd`/models --log-verbose=2" -run_server -if [ "$SERVER_PID" == "0" ]; then - echo -e "\n***\n*** Failed to start $SERVER\n***" - cat $SERVER_LOG - exit 1 -fi - -echo "Test: $TEST_NAME" >>$CLIENT_LOG - -set +e - -SERVER_LOG=$SERVER_LOG python $CLEANUP_TEST CleanUpTest.$TEST_NAME >>$CLIENT_LOG 2>&1 -if [ $? -ne 0 ]; then - cat $CLIENT_LOG - echo -e "\n***\n*** Test $TEST_NAME Failed\n***" - RET=1 -fi - -kill $SERVER_PID -wait $SERVER_PID - -check_state_release $SERVER_LOG -if [ $? -ne 0 ]; then - cat $SERVER_LOG - echo -e "\n***\n*** State Verification Failed for $TEST_NAME\n***" - RET=1 -fi - -set -e - if [ $RET -eq 0 ]; then echo -e "\n***\n*** Test Passed\n***" else From 370c4493f2026d58601662f1feccf638ad70777a Mon Sep 17 00:00:00 2001 From: Indrajit Bhosale Date: Thu, 15 Aug 2024 23:24:17 -0700 Subject: [PATCH 26/32] Revert "GRPC Cleanup tests updated for triton grpc error" This reverts commit e473f29c1b917cfb8a5a0787bd7f56d09d545c29. 
--- qa/L0_grpc_state_cleanup/cleanup_test.py | 36 +--- qa/L0_grpc_state_cleanup/test.sh | 239 +++++++++++------------ 2 files changed, 124 insertions(+), 151 deletions(-) diff --git a/qa/L0_grpc_state_cleanup/cleanup_test.py b/qa/L0_grpc_state_cleanup/cleanup_test.py index 01c04cc66b..431eeb1720 100755 --- a/qa/L0_grpc_state_cleanup/cleanup_test.py +++ b/qa/L0_grpc_state_cleanup/cleanup_test.py @@ -161,15 +161,9 @@ def _stream_infer_with_params( url="localhost:8001", verbose=True ) as triton_client: # Establish stream - if "TRITONSERVER_GRPC_STATUS_FLAG" in os.environ: - metadata = {"triton_grpc_error": "true"} - triton_client.start_stream( - callback=partial(callback, user_data), stream_timeout=stream_timeout, headers=metadata - ) - else: - triton_client.start_stream( - callback=partial(callback, user_data), stream_timeout=stream_timeout - ) + triton_client.start_stream( + callback=partial(callback, user_data), stream_timeout=stream_timeout + ) # Send specified many requests in parallel for i in range(request_count): time.sleep((request_delay / 1000)) @@ -235,15 +229,9 @@ def _stream_infer( url="localhost:8001", verbose=True ) as triton_client: # Establish stream - if "TRITONSERVER_GRPC_STATUS_FLAG" in os.environ: - metadata = {"triton_grpc_error": "true"} - triton_client.start_stream( - callback=partial(callback, user_data), stream_timeout=stream_timeout, headers=metadata - ) - else: - triton_client.start_stream( - callback=partial(callback, user_data), stream_timeout=stream_timeout - ) + triton_client.start_stream( + callback=partial(callback, user_data), stream_timeout=stream_timeout + ) # Send specified many requests in parallel for i in range(request_count): time.sleep((request_delay / 1000)) @@ -620,15 +608,9 @@ def test_non_decoupled_streaming_multi_response(self): url="localhost:8001", verbose=True ) as client: # Establish stream - if "TRITONSERVER_GRPC_STATUS_FLAG" in os.environ: - metadata = {"triton_grpc_error": "true"} - client.start_stream( - callback=partial(callback, user_data), stream_timeout=16, headers=metadata - ) - else: - client.start_stream( - callback=partial(callback, user_data), stream_timeout=16 - ) + client.start_stream( + callback=partial(callback, user_data), stream_timeout=16 + ) # Send a request client.async_stream_infer( model_name=self.repeat_non_decoupled_model_name, diff --git a/qa/L0_grpc_state_cleanup/test.sh b/qa/L0_grpc_state_cleanup/test.sh index 49e11b47a8..df302d5ed1 100755 --- a/qa/L0_grpc_state_cleanup/test.sh +++ b/qa/L0_grpc_state_cleanup/test.sh @@ -79,131 +79,89 @@ rm -rf models/repeat_int32_non_decoupled && \ sed -i "/model_transaction_policy/,+2d" config.pbtxt && \ sed -i "s/repeat_int32/repeat_int32_non_decoupled/" config.pbtxt) -GRPC_TRIALS="triton_grpc_error_true triton_grpc_error_true" +for i in test_simple_infer \ + test_simple_infer_cancellation \ + test_simple_infer_timeout \ + test_streaming_infer \ + test_streaming_timeout \ + test_streaming_cancellation \ + test_decoupled_infer \ + test_decoupled_cancellation \ + test_decoupled_timeout \ + test_non_decoupled_streaming_multi_response; do + SERVER_LOG="./inference_server.$i.log" + SERVER_ARGS="--model-repository=`pwd`/models --log-verbose=2" + run_server + if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 + fi + + echo "Test: $i" >>$CLIENT_LOG -for grpc_trial in $GRPC_TRIALS; do - if [ $grpc_trial == "triton_grpc_error_true" ]; then - export TRITONSERVER_GRPC_STATUS_FLAG=true - else - unset TRITONSERVER_GRPC_STATUS_FLAG 
+ set +e + python $CLEANUP_TEST CleanUpTest.$i >>$CLIENT_LOG 2>&1 + if [ $? -ne 0 ]; then + echo -e "\n***\n*** Test $i Failed\n***" >>$CLIENT_LOG + echo -e "\n***\n*** Test $i Failed\n***" + RET=1 fi - for i in test_simple_infer \ - test_simple_infer_cancellation \ - test_simple_infer_timeout \ - test_streaming_infer \ - test_streaming_timeout \ - test_streaming_cancellation \ - test_decoupled_infer \ - test_decoupled_cancellation \ - test_decoupled_timeout \ - test_non_decoupled_streaming_multi_response; do - SERVER_LOG="./inference_server.$i.log" - SERVER_ARGS="--model-repository=`pwd`/models --log-verbose=2" - run_server - if [ "$SERVER_PID" == "0" ]; then - echo -e "\n***\n*** Failed to start $SERVER\n***" - cat $SERVER_LOG - exit 1 - fi - - echo "Test: $i" >>$CLIENT_LOG - - set +e - python $CLEANUP_TEST CleanUpTest.$i >>$CLIENT_LOG 2>&1 - if [ $? -ne 0 ]; then - echo -e "\n***\n*** Test $i Failed\n***" >>$CLIENT_LOG - echo -e "\n***\n*** Test $i Failed\n***" - RET=1 - fi - - kill $SERVER_PID - wait $SERVER_PID - - check_state_release $SERVER_LOG - if [ $? -ne 0 ]; then - cat $SERVER_LOG - echo -e "\n***\n*** State Verification Failed for $i\n***" - RET=1 - fi - set -e - done - - - for i in test_simple_infer_error_status \ - test_streaming_error_status \ - test_decoupled_error_status; do - SERVER_LOG="./inference_server.$i.log" - SERVER_ARGS="--model-repository=`pwd`/models --log-verbose=2 --grpc-restricted-protocol=inference:infer-key=infer-value" - run_server - if [ "$SERVER_PID" == "0" ]; then - echo -e "\n***\n*** Failed to start $SERVER\n***" - cat $SERVER_LOG - exit 1 - fi - - echo "Test: $i" >>$CLIENT_LOG - - set +e - python $CLEANUP_TEST CleanUpTest.$i >>$CLIENT_LOG 2>&1 - if [ $? -ne 0 ]; then - echo -e "\n***\n*** Test $i Failed\n***" >>$CLIENT_LOG - echo -e "\n***\n*** Test $i Failed\n***" - RET=1 - fi - - kill $SERVER_PID - wait $SERVER_PID - - check_state_release $SERVER_LOG - if [ $? -ne 0 ]; then - cat $SERVER_LOG - echo -e "\n***\n*** State Verification Failed for $i\n***" - RET=1 - fi - - set -e - done - - for i in test_simple_infer_shutdownserver \ - test_streaming_infer_shutdownserver \ - test_decoupled_infer_shutdownserver \ - test_decoupled_infer_with_params_shutdownserver; do - SERVER_ARGS="--model-repository=`pwd`/models --log-verbose=2" - SERVER_LOG="./inference_server.$i.log" - run_server - if [ "$SERVER_PID" == "0" ]; then - echo -e "\n***\n*** Failed to start $SERVER\n***" - cat $SERVER_LOG - exit 1 - fi - - echo "Test: $i" >>$CLIENT_LOG - - set +e - SERVER_PID=$SERVER_PID python $CLEANUP_TEST CleanUpTest.$i >>$CLIENT_LOG 2>&1 - if [ $? -ne 0 ]; then - echo -e "\n***\n*** Test $i Failed\n***" >>$CLIENT_LOG - echo -e "\n***\n*** Test $i Failed\n***" + + kill $SERVER_PID + wait $SERVER_PID + + check_state_release $SERVER_LOG + if [ $? -ne 0 ]; then + cat $SERVER_LOG + echo -e "\n***\n*** State Verification Failed for $i\n***" RET=1 - fi + fi + set -e +done + + +for i in test_simple_infer_error_status \ + test_streaming_error_status \ + test_decoupled_error_status; do + SERVER_LOG="./inference_server.$i.log" + SERVER_ARGS="--model-repository=`pwd`/models --log-verbose=2 --grpc-restricted-protocol=inference:infer-key=infer-value" + run_server + if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 + fi - wait $SERVER_PID + echo "Test: $i" >>$CLIENT_LOG - check_state_release $SERVER_LOG - if [ $? 
-ne 0 ]; then - cat $SERVER_LOG - echo -e "\n***\n*** State Verification Failed for $i\n***" - RET=1 - fi + set +e + python $CLEANUP_TEST CleanUpTest.$i >>$CLIENT_LOG 2>&1 + if [ $? -ne 0 ]; then + echo -e "\n***\n*** Test $i Failed\n***" >>$CLIENT_LOG + echo -e "\n***\n*** Test $i Failed\n***" + RET=1 + fi - set -e - done + kill $SERVER_PID + wait $SERVER_PID - TEST_NAME=test_decoupled_infer_complete - export TRITONSERVER_DELAY_GRPC_COMPLETE=2000 + check_state_release $SERVER_LOG + if [ $? -ne 0 ]; then + cat $SERVER_LOG + echo -e "\n***\n*** State Verification Failed for $i\n***" + RET=1 + fi + + set -e +done - SERVER_LOG="./inference_server.$TEST_NAME.log" +for i in test_simple_infer_shutdownserver \ + test_streaming_infer_shutdownserver \ + test_decoupled_infer_shutdownserver \ + test_decoupled_infer_with_params_shutdownserver; do SERVER_ARGS="--model-repository=`pwd`/models --log-verbose=2" + SERVER_LOG="./inference_server.$i.log" run_server if [ "$SERVER_PID" == "0" ]; then echo -e "\n***\n*** Failed to start $SERVER\n***" @@ -211,30 +169,63 @@ for grpc_trial in $GRPC_TRIALS; do exit 1 fi - echo "Test: $TEST_NAME" >>$CLIENT_LOG + echo "Test: $i" >>$CLIENT_LOG set +e - - SERVER_LOG=$SERVER_LOG python $CLEANUP_TEST CleanUpTest.$TEST_NAME >>$CLIENT_LOG 2>&1 + SERVER_PID=$SERVER_PID python $CLEANUP_TEST CleanUpTest.$i >>$CLIENT_LOG 2>&1 if [ $? -ne 0 ]; then - cat $CLIENT_LOG - echo -e "\n***\n*** Test $TEST_NAME Failed\n***" + echo -e "\n***\n*** Test $i Failed\n***" >>$CLIENT_LOG + echo -e "\n***\n*** Test $i Failed\n***" RET=1 fi - kill $SERVER_PID wait $SERVER_PID check_state_release $SERVER_LOG if [ $? -ne 0 ]; then cat $SERVER_LOG - echo -e "\n***\n*** State Verification Failed for $TEST_NAME\n***" - RET=1 + echo -e "\n***\n*** State Verification Failed for $i\n***" + RET=1 fi set -e done +TEST_NAME=test_decoupled_infer_complete +export TRITONSERVER_DELAY_GRPC_COMPLETE=2000 + +SERVER_LOG="./inference_server.$TEST_NAME.log" +SERVER_ARGS="--model-repository=`pwd`/models --log-verbose=2" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +echo "Test: $TEST_NAME" >>$CLIENT_LOG + +set +e + +SERVER_LOG=$SERVER_LOG python $CLEANUP_TEST CleanUpTest.$TEST_NAME >>$CLIENT_LOG 2>&1 +if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test $TEST_NAME Failed\n***" + RET=1 +fi + +kill $SERVER_PID +wait $SERVER_PID + +check_state_release $SERVER_LOG +if [ $? 
-ne 0 ]; then + cat $SERVER_LOG + echo -e "\n***\n*** State Verification Failed for $TEST_NAME\n***" + RET=1 +fi + +set -e + if [ $RET -eq 0 ]; then echo -e "\n***\n*** Test Passed\n***" else From b87c3fc00c921c18333de2b0ee0f7c217d830701 Mon Sep 17 00:00:00 2001 From: Indrajit Bhosale Date: Fri, 16 Aug 2024 01:51:14 -0700 Subject: [PATCH 27/32] GRPC Cleanup tests updated for triton grpc error --- Dockerfile.QA | 4 + qa/L0_decoupled/decoupled_test.py | 18 +- qa/L0_decoupled/test.sh | 93 ++- qa/L0_decoupled_grpc_error/decoupled_test.py | 649 ++++++++++++++++++ qa/L0_decoupled_grpc_error/test.sh | 179 +++++ .../cleanup_test.py | 642 +++++++++++++++++ qa/L0_grpc_error_state_cleanup/test.sh | 235 +++++++ 7 files changed, 1754 insertions(+), 66 deletions(-) create mode 100755 qa/L0_decoupled_grpc_error/decoupled_test.py create mode 100755 qa/L0_decoupled_grpc_error/test.sh create mode 100755 qa/L0_grpc_error_state_cleanup/cleanup_test.py create mode 100755 qa/L0_grpc_error_state_cleanup/test.sh diff --git a/Dockerfile.QA b/Dockerfile.QA index 2c43f735a5..a3073948c5 100644 --- a/Dockerfile.QA +++ b/Dockerfile.QA @@ -113,6 +113,8 @@ RUN mkdir -p qa/common && \ cp -r docs/examples/model_repository/inception_graphdef qa/L0_grpc/models && \ mkdir qa/L0_grpc_state_cleanup/models && \ cp -r /workspace/src/test/models/repeat_int32 qa/L0_grpc_state_cleanup/models/ && \ + mkdir qa/L0_grpc_error_state_cleanup/models && \ + cp -r /workspace/src/test/models/repeat_int32 qa/L0_grpc_error_state_cleanup/models/ && \ mkdir qa/L0_http/models && \ cp -r docs/examples/model_repository/simple qa/L0_http/models && \ cp -r docs/examples/model_repository/simple_dyna_sequence qa/L0_http/models && \ @@ -249,11 +251,13 @@ RUN mkdir -p qa/L0_decoupled/models/repeat_int32/1 && \ mkdir -p qa/L0_decoupled/models/repeat_square/1 && \ mkdir -p qa/L0_decoupled/models/nested_square/1 && \ mkdir -p qa/L0_grpc_state_cleanup/models/repeat_int32/1 + mkdir -p qa/L0_grpc_error_state_cleanup/models/repeat_int32/1 RUN if [ "$IGPU_BUILD" == "0" ]; then \ cp backends/repeat/libtriton_repeat.so qa/L0_model_config && \ cp backends/repeat/libtriton_repeat.so qa/L0_decoupled/models/repeat_int32/1 && \ cp backends/repeat/libtriton_repeat.so qa/L0_grpc_state_cleanup/models/repeat_int32/1/. && \ + cp backends/repeat/libtriton_repeat.so qa/L0_grpc_error_state_cleanup/models/repeat_int32/1/. 
&& \ cp backends/square/libtriton_square.so qa/L0_decoupled/models/square_int32/1; \ fi diff --git a/qa/L0_decoupled/decoupled_test.py b/qa/L0_decoupled/decoupled_test.py index d7bc59f5c7..d0f09deaf9 100755 --- a/qa/L0_decoupled/decoupled_test.py +++ b/qa/L0_decoupled/decoupled_test.py @@ -116,13 +116,7 @@ def _stream_infer_with_params( url="localhost:8001", verbose=True ) as triton_client: # Establish stream - if "TRITONSERVER_GRPC_STATUS_FLAG" in os.environ: - metadata = {"triton_grpc_error": "true"} - triton_client.start_stream( - callback=partial(callback, user_data), headers=metadata - ) - else: - triton_client.start_stream(callback=partial(callback, user_data)) + triton_client.start_stream(callback=partial(callback, user_data)) # Send specified many requests in parallel for i in range(request_count): time.sleep((request_delay / 1000)) @@ -181,13 +175,7 @@ def _stream_infer( url="localhost:8001", verbose=True ) as triton_client: # Establish stream - if "TRITONSERVER_GRPC_STATUS_FLAG" in os.environ: - metadata = {"triton_grpc_error": "true"} - triton_client.start_stream( - callback=partial(callback, user_data), headers=metadata - ) - else: - triton_client.start_stream(callback=partial(callback, user_data)) + triton_client.start_stream(callback=partial(callback, user_data)) # Send specified many requests in parallel for i in range(request_count): time.sleep((request_delay / 1000)) @@ -656,4 +644,4 @@ def test_http(self): if __name__ == "__main__": - unittest.main() + unittest.main() \ No newline at end of file diff --git a/qa/L0_decoupled/test.sh b/qa/L0_decoupled/test.sh index 649e8e4545..22c37dff49 100755 --- a/qa/L0_decoupled/test.sh +++ b/qa/L0_decoupled/test.sh @@ -55,60 +55,51 @@ source ../common/util.sh TRIALS="python custom" -GRPC_TRIALS="triton_grpc_error_true triton_grpc_error_false" - -for grpc_trial in $GRPC_TRIALS; do - for trial in $TRIALS; do - if [ $trial == "python" ]; then - MODELDIR=`pwd`/python_models - else - MODELDIR=`pwd`/models - fi - if [ $grpc_trial == "triton_grpc_error_true" ]; then - export TRITONSERVER_GRPC_STATUS_FLAG=true - else - unset TRITONSERVER_GRPC_STATUS_FLAG - fi +for trial in $TRIALS; do + if [ $trial == "python" ]; then + MODELDIR=`pwd`/python_models + else + MODELDIR=`pwd`/models + fi - SERVER_ARGS="--model-repository=$MODELDIR" - cp -r $DATADIR/libtorch_nobatch_int32_int32_int32 $MODELDIR/. - (cd $MODELDIR/libtorch_nobatch_int32_int32_int32 && \ - sed -i "s/dims:.*\[.*\]/dims: \[ 1 \]/g" config.pbtxt) + SERVER_ARGS="--model-repository=$MODELDIR" + cp -r $DATADIR/libtorch_nobatch_int32_int32_int32 $MODELDIR/. + (cd $MODELDIR/libtorch_nobatch_int32_int32_int32 && \ + sed -i "s/dims:.*\[.*\]/dims: \[ 1 \]/g" config.pbtxt) - run_server - if [ "$SERVER_PID" == "0" ]; then - echo -e "\n***\n*** Failed to start $SERVER\n***" - cat $SERVER_LOG - exit 1 - fi + run_server + if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 + fi - for i in \ - test_one_to_none \ - test_one_to_one \ - test_one_to_many \ - test_no_streaming \ - test_response_order \ - test_wrong_shape; do - - echo "Test: $i" >>$CLIENT_LOG - set +e - python $DECOUPLED_TEST DecoupledTest.$i >>$CLIENT_LOG 2>&1 - if [ $? -ne 0 ]; then - echo -e "\n***\n*** Test $i Failed\n***" >>$CLIENT_LOG - echo -e "\n***\n*** Test $i Failed\n***" - RET=1 - else - check_test_results $TEST_RESULT_FILE 1 - if [ $? 
-ne 0 ]; then - cat $CLIENT_LOG - echo -e "\n***\n*** Test Result Verification Failed\n***" - RET=1 - fi - fi - set -e - done -done + for i in \ + test_one_to_none \ + test_one_to_one \ + test_one_to_many \ + test_no_streaming \ + test_response_order \ + test_wrong_shape; do + + echo "Test: $i" >>$CLIENT_LOG + set +e + python $DECOUPLED_TEST DecoupledTest.$i >>$CLIENT_LOG 2>&1 + if [ $? -ne 0 ]; then + echo -e "\n***\n*** Test $i Failed\n***" >>$CLIENT_LOG + echo -e "\n***\n*** Test $i Failed\n***" + RET=1 + else + check_test_results $TEST_RESULT_FILE 1 + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi + fi + set -e + done # Will delay the writing of each response by the specified many milliseconds. # This will ensure that there are multiple responses available to be written. @@ -185,4 +176,4 @@ else echo -e "\n***\n*** Test Failed\n***" fi -exit $RET +exit $RET \ No newline at end of file diff --git a/qa/L0_decoupled_grpc_error/decoupled_test.py b/qa/L0_decoupled_grpc_error/decoupled_test.py new file mode 100755 index 0000000000..fc606e3cca --- /dev/null +++ b/qa/L0_decoupled_grpc_error/decoupled_test.py @@ -0,0 +1,649 @@ +#!/usr/bin/env python3 + +# Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
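+
+# This suite mirrors qa/L0_decoupled/decoupled_test.py, but every stream here is
+# opened with the "triton_grpc_error": "true" header, so server-side errors are
+# surfaced as gRPC status codes and the stream is closed on the first error.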
+ +import sys + +sys.path.append("../common") + +import os +import queue +import time +import unittest +from functools import partial + +import numpy as np +import test_util as tu +import tritonclient.grpc as grpcclient +import tritonclient.http as httpclient +from tritonclient.utils import InferenceServerException + + +class UserData: + def __init__(self): + self._response_queue = queue.Queue() + + +def callback(user_data, result, error): + if error: + user_data._response_queue.put(error) + else: + user_data._response_queue.put(result) + + +class DecoupledTest(tu.TestResultCollector): + def setUp(self): + self.trials_ = [ + ("repeat_int32", None), + ("simple_repeat", None), + ("sequence_repeat", None), + ("fan_repeat", self._fan_validate), + ("repeat_square", self._nested_validate), + ("nested_square", self._nested_validate), + ] + self.model_name_ = "repeat_int32" + + self.inputs_ = [] + self.inputs_.append(grpcclient.InferInput("IN", [1], "INT32")) + self.inputs_.append(grpcclient.InferInput("DELAY", [1], "UINT32")) + self.inputs_.append(grpcclient.InferInput("WAIT", [1], "UINT32")) + + self.outputs_ = [] + self.outputs_.append(grpcclient.InferRequestedOutput("OUT")) + self.outputs_.append(grpcclient.InferRequestedOutput("IDX")) + # Some trials only expect a subset of outputs + self.requested_outputs_ = self.outputs_ + + # Client can receive a "triton_final_response" response parameter + # from Triton server that indicates when a response is the final response for + # its request. + # + # For non-decoupled models, there is a 1:1 request:response ratio, so every + # response is the final response, and this parameter is unnecessary. + # + # For decoupled models, there is a 1:N request:response ratio, so there may be + # more one response before receiving the "final" response. + # + # However, decoupled models have the unique property in that they can return + # a flags-only response to the server to indicate completion, which is not + # returned to the client by default (See TRITONBACKEND_ResponseFactorySendFlags). + # + # To forward this flags-only response to the client, users must opt-in to this + # behavior by adding the following argument: + # client.async_stream_infer(..., enable_empty_final_response=True). + # + # If the decoupled backend/model always sends the final response flag along + # with a non-null response, no opt-in is needed. + # + # With this behavior, the client can programmatically detect when all responses + # for an individual request have been received without knowing the expected + # number of responses in advance and without closing the stream. + def _stream_infer_with_params( + self, + request_count, + request_delay, + _, + delay_data, + delay_factor, + user_data, + result_dict, + ): + with grpcclient.InferenceServerClient( + url="localhost:8001", verbose=True + ) as triton_client: + # Establish stream + metadata = {"triton_grpc_error": "true"} + triton_client.start_stream(callback=partial(callback, user_data), headers=metadata) + # Send specified many requests in parallel + for i in range(request_count): + time.sleep((request_delay / 1000)) + self.inputs_[1].set_data_from_numpy(delay_data) + triton_client.async_stream_infer( + model_name=self.model_name_, + inputs=self.inputs_, + request_id=str(i), + outputs=self.requested_outputs_, + # Opt-in to receiving flags-only responses from model/backend + # to help detect final responses for decoupled models. 
+ enable_empty_final_response=True, + ) + # Update delay input in accordance with the scaling factor + delay_data = delay_data * delay_factor + delay_data = delay_data.astype(np.uint32) + + # Retrieve results... + recv_count = 0 + completed_requests = 0 + while completed_requests < request_count: + data_item = user_data._response_queue.get() + if type(data_item) == InferenceServerException: + raise data_item + else: + response = data_item.get_response() + # Request IDs should generally be provided with each request + # to associate decoupled responses with their requests. + if not response.id: + raise ValueError( + "No response id found. Was a request_id provided?" + ) + + # Detect final response. Parameters are oneof and we expect bool_param + if response.parameters.get("triton_final_response").bool_param: + completed_requests += 1 + + # Only process non-empty response, ignore if empty (no outputs) + if response.outputs: + if response.id not in result_dict: + result_dict[response.id] = [] + result_dict[response.id].append((recv_count, data_item)) + recv_count += 1 + + def _stream_infer( + self, + request_count, + request_delay, + expected_count, + delay_data, + delay_factor, + user_data, + result_dict, + ): + with grpcclient.InferenceServerClient( + url="localhost:8001", verbose=True + ) as triton_client: + # Establish stream + metadata = {"triton_grpc_error": "true"} + triton_client.start_stream(callback=partial(callback, user_data), headers=metadata) + # Send specified many requests in parallel + for i in range(request_count): + time.sleep((request_delay / 1000)) + self.inputs_[1].set_data_from_numpy(delay_data) + triton_client.async_stream_infer( + model_name=self.model_name_, + inputs=self.inputs_, + request_id=str(i), + outputs=self.requested_outputs_, + ) + # Update delay input in accordance with the scaling factor + delay_data = delay_data * delay_factor + delay_data = delay_data.astype(np.uint32) + + # Retrieve results... 
+ recv_count = 0 + while recv_count < expected_count: + data_item = user_data._response_queue.get() + if type(data_item) == InferenceServerException: + raise data_item + else: + this_id = data_item.get_response().id + if this_id not in result_dict: + result_dict[this_id] = [] + result_dict[this_id].append((recv_count, data_item)) + + recv_count += 1 + + def _fan_validate(self, result_list, data_offset, repeat_count): + # fan_repeat returns "2 * data_offset" as result + self.assertEqual(len(result_list), repeat_count) + expected_data = 2 * data_offset + for j in range(len(result_list)): + this_data = result_list[j][1].as_numpy("OUT") + self.assertEqual(len(this_data), 1) + self.assertEqual(this_data[0], expected_data) + expected_data += 2 + + def _nested_validate(self, result_list, data_offset, repeat_count): + # if repeat model returns repeat result n, repeat_square-like model + # will return the same result n times + expected_len = sum(x for x in range(data_offset, data_offset + repeat_count)) + self.assertEqual(len(result_list), expected_len) + expected_data = data_offset + expected_count = expected_data + for j in range(len(result_list)): + this_data = result_list[j][1].as_numpy("OUT") + self.assertEqual(len(this_data), 1) + self.assertEqual(this_data[0], expected_data) + expected_count -= 1 + if expected_count == 0: + expected_data += 1 + expected_count = expected_data + + def _decoupled_infer( + self, + request_count, + request_delay=0, + repeat_count=1, + data_offset=100, + delay_time=1000, + delay_factor=1, + wait_time=500, + order_sequence=None, + validate_fn=None, + ): + # Initialize data for IN + input_data = np.arange( + start=data_offset, stop=data_offset + repeat_count, dtype=np.int32 + ) + self.inputs_[0].set_shape([repeat_count]) + self.inputs_[0].set_data_from_numpy(input_data) + + # Initialize data for DELAY + delay_data = (np.ones([repeat_count], dtype=np.uint32)) * delay_time + self.inputs_[1].set_shape([repeat_count]) + + # Initialize data for WAIT + wait_data = np.array([wait_time], dtype=np.uint32) + self.inputs_[2].set_data_from_numpy(wait_data) + + # use validate_fn to differentiate requested outputs + self.requested_outputs_ = ( + self.outputs_ if validate_fn is None else self.outputs_[0:1] + ) + + for infer_helper in [self._stream_infer, self._stream_infer_with_params]: + user_data = UserData() + result_dict = {} + + try: + if "square" not in self.model_name_: + expected_count = repeat_count * request_count + else: + expected_count = ( + sum(x for x in range(data_offset, data_offset + repeat_count)) + * request_count + ) + infer_helper( + request_count, + request_delay, + expected_count, + delay_data, + delay_factor, + user_data, + result_dict, + ) + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + + # Validate the results.. 
+ for i in range(request_count): + this_id = str(i) + if repeat_count != 0 and this_id not in result_dict.keys(): + self.assertTrue( + False, "response for request id {} not received".format(this_id) + ) + elif repeat_count == 0 and this_id in result_dict.keys(): + self.assertTrue( + False, + "received unexpected response for request id {}".format( + this_id + ), + ) + if repeat_count != 0: + if validate_fn is None: + self.assertEqual(len(result_dict[this_id]), repeat_count) + expected_data = data_offset + result_list = result_dict[this_id] + for j in range(len(result_list)): + if order_sequence is not None: + self.assertEqual( + result_list[j][0], order_sequence[i][j] + ) + this_data = result_list[j][1].as_numpy("OUT") + self.assertEqual(len(this_data), 1) + self.assertEqual(this_data[0], expected_data) + this_idx = result_list[j][1].as_numpy("IDX") + self.assertEqual(len(this_idx), 1) + self.assertEqual(this_idx[0], j) + expected_data += 1 + else: + validate_fn(result_dict[this_id], data_offset, repeat_count) + + def test_one_to_none(self): + # Test cases where each request generates no response. + # Note the name of the test one_to_none implies the + # mapping between requests and responses. + + for trial in self.trials_: + self.model_name_ = trial[0] + # Single request case + self._decoupled_infer(request_count=1, repeat_count=0, validate_fn=trial[1]) + # Multiple request case + self._decoupled_infer(request_count=5, repeat_count=0, validate_fn=trial[1]) + + def test_one_to_one(self): + # Test cases where each request generates single response. + # Note the name of the test one_to_one implies the + # mapping between requests and responses. + + for trial in self.trials_: + self.model_name_ = trial[0] + # Single request case + # Release request before the response is delivered + self._decoupled_infer(request_count=1, wait_time=500, validate_fn=trial[1]) + # Release request after the response is delivered + self._decoupled_infer(request_count=1, wait_time=2000, validate_fn=trial[1]) + + # Multiple request case + # Release request before the response is delivered + self._decoupled_infer(request_count=5, wait_time=500, validate_fn=trial[1]) + # Release request after the response is delivered + self._decoupled_infer(request_count=5, wait_time=2000, validate_fn=trial[1]) + + def test_one_to_many(self): + # Test cases where each request generates multiple response. + # Note the name of the test one_to_many implies the + # mapping between requests and responses. 
+ + self.assertFalse("TRITONSERVER_DELAY_GRPC_RESPONSE" in os.environ) + + for trial in self.trials_: + self.model_name_ = trial[0] + # Single request case + # Release request before the first response is delivered + self._decoupled_infer( + request_count=1, repeat_count=5, wait_time=500, validate_fn=trial[1] + ) + # Release request when the responses are getting delivered + self._decoupled_infer( + request_count=1, repeat_count=5, wait_time=2000, validate_fn=trial[1] + ) + # Release request after all the responses are delivered + self._decoupled_infer( + request_count=1, repeat_count=5, wait_time=10000, validate_fn=trial[1] + ) + + # Multiple request case + # Release request before the first response is delivered + self._decoupled_infer( + request_count=5, repeat_count=5, wait_time=500, validate_fn=trial[1] + ) + # Release request when the responses are getting delivered + self._decoupled_infer( + request_count=5, repeat_count=5, wait_time=2000, validate_fn=trial[1] + ) + # Release request after all the responses are delivered + self._decoupled_infer( + request_count=5, repeat_count=5, wait_time=10000, validate_fn=trial[1] + ) + + def test_one_to_multi_many(self): + # Test cases where each request generates multiple response but the + # responses are delayed so as to stress the control path handling the + # queued responses. + + self.assertTrue("TRITONSERVER_DELAY_GRPC_RESPONSE" in os.environ) + + for trial in self.trials_: + self.model_name_ = trial[0] + # Single request case + # Release request before the first response is delivered + self._decoupled_infer( + request_count=1, repeat_count=5, wait_time=500, validate_fn=trial[1] + ) + # Release request when the responses are getting delivered + self._decoupled_infer( + request_count=1, repeat_count=5, wait_time=8000, validate_fn=trial[1] + ) + # Release request after all the responses are delivered + self._decoupled_infer( + request_count=1, repeat_count=5, wait_time=20000, validate_fn=trial[1] + ) + + # Multiple request case + # Release request before the first response is delivered + self._decoupled_infer( + request_count=5, repeat_count=5, wait_time=500, validate_fn=trial[1] + ) + # Release request when the responses are getting delivered + self._decoupled_infer( + request_count=5, repeat_count=5, wait_time=3000, validate_fn=trial[1] + ) + # Release request after all the responses are delivered + self._decoupled_infer( + request_count=5, repeat_count=5, wait_time=10000, validate_fn=trial[1] + ) + + def test_response_order(self): + # Test the expected response order for different cases + + self.assertFalse("TRITONSERVER_DELAY_GRPC_RESPONSE" in os.environ) + + for trial in self.trials_: + self.model_name_ = trial[0] + + # Case 1: Interleaved responses + self._decoupled_infer( + request_count=2, + request_delay=500, + repeat_count=4, + order_sequence=[[0, 2, 4, 6], [1, 3, 5, 7]], + validate_fn=trial[1], + ) + + # Case 2: All responses of second request delivered before any + # response from the first + self._decoupled_infer( + request_count=2, + request_delay=500, + repeat_count=4, + delay_time=2000, + delay_factor=0.1, + order_sequence=[[4, 5, 6, 7], [0, 1, 2, 3]], + validate_fn=trial[1], + ) + + # Case 3: Similar to Case 2, but the second request is generated + # after the first response from first request is received + self._decoupled_infer( + request_count=2, + request_delay=2500, + repeat_count=4, + delay_time=2000, + delay_factor=0.1, + order_sequence=[[0, 5, 6, 7], [1, 2, 3, 4]], + validate_fn=trial[1], + ) + + # Case 4: All the 
responses of second requests are dleivered after + # all the responses from first requests are received + self._decoupled_infer( + request_count=2, + request_delay=100, + repeat_count=4, + delay_time=500, + delay_factor=10, + order_sequence=[[0, 1, 2, 3], [4, 5, 6, 7]], + validate_fn=trial[1], + ) + + # Case 5: Similar to Case 4, but the second request is generated + # after the first response from the first request is received + self._decoupled_infer( + request_count=2, + request_delay=750, + repeat_count=4, + delay_time=500, + delay_factor=10, + order_sequence=[[0, 1, 2, 3], [4, 5, 6, 7]], + validate_fn=trial[1], + ) + + def _no_streaming_helper(self, protocol): + data_offset = 100 + repeat_count = 1 + delay_time = 1000 + wait_time = 2000 + + input_data = np.arange( + start=data_offset, stop=data_offset + repeat_count, dtype=np.int32 + ) + delay_data = (np.ones([repeat_count], dtype=np.uint32)) * delay_time + wait_data = np.array([wait_time], dtype=np.uint32) + + if protocol == "grpc": + # Use the inputs and outputs from the setUp + this_inputs = self.inputs_ + this_outputs = self.outputs_ + else: + this_inputs = [] + this_inputs.append(httpclient.InferInput("IN", [repeat_count], "INT32")) + this_inputs.append(httpclient.InferInput("DELAY", [1], "UINT32")) + this_inputs.append(httpclient.InferInput("WAIT", [1], "UINT32")) + this_outputs = [] + this_outputs.append(httpclient.InferRequestedOutput("OUT")) + + # Initialize data for IN + this_inputs[0].set_shape([repeat_count]) + this_inputs[0].set_data_from_numpy(input_data) + + # Initialize data for DELAY + this_inputs[1].set_shape([repeat_count]) + this_inputs[1].set_data_from_numpy(delay_data) + + # Initialize data for WAIT + this_inputs[2].set_data_from_numpy(wait_data) + + if protocol == "grpc": + triton_client = grpcclient.InferenceServerClient( + url="localhost:8001", verbose=True + ) + else: + triton_client = httpclient.InferenceServerClient( + url="localhost:8000", verbose=True + ) + + with self.assertRaises(InferenceServerException) as cm: + triton_client.infer( + model_name=self.model_name_, inputs=this_inputs, outputs=this_outputs + ) + + self.assertIn( + "doesn't support models with decoupled transaction policy", + str(cm.exception), + ) + + def test_no_streaming(self): + # Test cases with no streaming inference. Server should give + # appropriate error in such cases. + + self._no_streaming_helper("grpc") + self._no_streaming_helper("http") + + def test_wrong_shape(self): + # Sends mismatching shapes for IN and DELAY. Server should return + # appropriate error message. The shape of IN is [repeat_count], + # where as shape of DELAY is [repeat_count + 1]. 
+ + data_offset = 100 + repeat_count = 1 + delay_time = 1000 + wait_time = 2000 + + input_data = np.arange( + start=data_offset, stop=data_offset + repeat_count, dtype=np.int32 + ) + delay_data = (np.ones([repeat_count + 1], dtype=np.uint32)) * delay_time + wait_data = np.array([wait_time], dtype=np.uint32) + + # Initialize data for IN + self.inputs_[0].set_shape([repeat_count]) + self.inputs_[0].set_data_from_numpy(input_data) + + # Initialize data for DELAY + self.inputs_[1].set_shape([repeat_count + 1]) + self.inputs_[1].set_data_from_numpy(delay_data) + + # Initialize data for WAIT + self.inputs_[2].set_data_from_numpy(wait_data) + + user_data = UserData() + result_dict = {} + + with self.assertRaises(InferenceServerException) as cm: + self._stream_infer( + 1, 0, repeat_count, delay_data, 1, user_data, result_dict + ) + + self.assertIn( + "expected IN and DELAY shape to match, got [1] and [2]", str(cm.exception) + ) + + +class NonDecoupledTest(tu.TestResultCollector): + def setUp(self): + self.model_name_ = "repeat_int32" + self.input_data = { + "IN": np.array([1], dtype=np.int32), + "DELAY": np.array([0], dtype=np.uint32), + "WAIT": np.array([0], dtype=np.uint32), + } + + def test_grpc(self): + inputs = [ + grpcclient.InferInput("IN", [1], "INT32").set_data_from_numpy( + self.input_data["IN"] + ), + grpcclient.InferInput("DELAY", [1], "UINT32").set_data_from_numpy( + self.input_data["DELAY"] + ), + grpcclient.InferInput("WAIT", [1], "UINT32").set_data_from_numpy( + self.input_data["WAIT"] + ), + ] + + triton_client = grpcclient.InferenceServerClient( + url="localhost:8001", verbose=True + ) + # Expect the inference is successful + res = triton_client.infer(model_name=self.model_name_, inputs=inputs) + self.assertEqual(1, res.as_numpy("OUT")[0]) + self.assertEqual(0, res.as_numpy("IDX")[0]) + + def test_http(self): + inputs = [ + httpclient.InferInput("IN", [1], "INT32").set_data_from_numpy( + self.input_data["IN"] + ), + httpclient.InferInput("DELAY", [1], "UINT32").set_data_from_numpy( + self.input_data["DELAY"] + ), + httpclient.InferInput("WAIT", [1], "UINT32").set_data_from_numpy( + self.input_data["WAIT"] + ), + ] + + triton_client = httpclient.InferenceServerClient( + url="localhost:8000", verbose=True + ) + # Expect the inference is successful + res = triton_client.infer(model_name=self.model_name_, inputs=inputs) + self.assertEqual(1, res.as_numpy("OUT")[0]) + self.assertEqual(0, res.as_numpy("IDX")[0]) + + +if __name__ == "__main__": + unittest.main() \ No newline at end of file diff --git a/qa/L0_decoupled_grpc_error/test.sh b/qa/L0_decoupled_grpc_error/test.sh new file mode 100755 index 0000000000..4fba476b1d --- /dev/null +++ b/qa/L0_decoupled_grpc_error/test.sh @@ -0,0 +1,179 @@ +#!/bin/bash +# Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION} +if [ "$#" -ge 1 ]; then + REPO_VERSION=$1 +fi +if [ -z "$REPO_VERSION" ]; then + echo -e "Repository version must be specified" + echo -e "\n***\n*** Test Failed\n***" + exit 1 +fi +if [ ! -z "$TEST_REPO_ARCH" ]; then + REPO_VERSION=${REPO_VERSION}_${TEST_REPO_ARCH} +fi + +export CUDA_VISIBLE_DEVICES=0 + +RET=0 +TEST_RESULT_FILE='test_results.txt' +DECOUPLED_TEST=decoupled_test.py + +rm -f *.log + +CLIENT_LOG=`pwd`/client.log +DATADIR=/data/inferenceserver/${REPO_VERSION}/qa_model_repository +SERVER=/opt/tritonserver/bin/tritonserver +SERVER_ARGS="--model-repository=../L0_decoupled/models" +SERVER_LOG="./inference_server.log" +source ../common/util.sh + + +TRIALS="python custom" + +for trial in $TRIALS; do + if [ $trial == "python" ]; then + MODELDIR=../L0_decoupled/python_models + else + MODELDIR=../L0_decoupled/models + fi + + SERVER_ARGS="--model-repository=$MODELDIR" + cp -r $DATADIR/libtorch_nobatch_int32_int32_int32 $MODELDIR/. + (cd $MODELDIR/libtorch_nobatch_int32_int32_int32 && \ + sed -i "s/dims:.*\[.*\]/dims: \[ 1 \]/g" config.pbtxt) + + run_server + if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 + fi + + for i in \ + test_one_to_none \ + test_one_to_one \ + test_one_to_many \ + test_no_streaming \ + test_response_order \ + test_wrong_shape; do + + echo "Test: $i" >>$CLIENT_LOG + set +e + python $DECOUPLED_TEST DecoupledTest.$i >>$CLIENT_LOG 2>&1 + if [ $? -ne 0 ]; then + echo -e "\n***\n*** Test $i Failed\n***" >>$CLIENT_LOG + echo -e "\n***\n*** Test $i Failed\n***" + RET=1 + else + check_test_results $TEST_RESULT_FILE 1 + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi + fi + set -e + done + + # Will delay the writing of each response by the specified many milliseconds. + # This will ensure that there are multiple responses available to be written. + export TRITONSERVER_DELAY_GRPC_RESPONSE=2000 + + echo "Test: test_one_to_multi_many" >>$CLIENT_LOG + set +e + python $DECOUPLED_TEST DecoupledTest.test_one_to_multi_many >>$CLIENT_LOG 2>&1 + if [ $? -ne 0 ]; then + echo -e "\n***\n*** Test test_one_to_multi_many Failed\n***" >>$CLIENT_LOG + echo -e "\n***\n*** Test test_one_to_multi_many Failed\n***" + RET=1 + else + check_test_results $TEST_RESULT_FILE 1 + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi + fi + + set -e + + unset TRITONSERVER_DELAY_GRPC_RESPONSE + + kill $SERVER_PID + wait $SERVER_PID +done + +# Test the server frontend can merge the responses of non-decoupled model that +# sends inference response and COMPLETE flag separately. 
In other words, from +# the client's perspective there will still be one response. +NON_DECOUPLED_DIR=`pwd`/non_decoupled_models +rm -rf ${NON_DECOUPLED_DIR} && mkdir -p ${NON_DECOUPLED_DIR} +cp -r ../L0_decoupled/models/repeat_int32 ${NON_DECOUPLED_DIR}/. && \ + (cd ${NON_DECOUPLED_DIR}/repeat_int32 && \ + sed -i "s/decoupled: True/decoupled: False/" config.pbtxt) + +SERVER_ARGS="--model-repository=${NON_DECOUPLED_DIR}" +SERVER_LOG="./non_decoupled_inference_server.log" + +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +CLIENT_LOG=`pwd`/non_decoupled_client.log +echo "Test: NonDecoupledTest" >>$CLIENT_LOG +set +e +python $DECOUPLED_TEST NonDecoupledTest >>$CLIENT_LOG 2>&1 +if [ $? -ne 0 ]; then + echo -e "\n***\n*** Test NonDecoupledTest Failed\n***" >>$CLIENT_LOG + echo -e "\n***\n*** Test NonDecoupledTest Failed\n***" + RET=1 +else + check_test_results $TEST_RESULT_FILE 2 + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi +fi + +set -e + +kill $SERVER_PID +wait $SERVER_PID + +if [ $RET -eq 0 ]; then + echo -e "\n***\n*** Test Passed\n***" +else + echo -e "\n***\n*** Test Failed\n***" +fi + +exit $RET \ No newline at end of file diff --git a/qa/L0_grpc_error_state_cleanup/cleanup_test.py b/qa/L0_grpc_error_state_cleanup/cleanup_test.py new file mode 100755 index 0000000000..07537013aa --- /dev/null +++ b/qa/L0_grpc_error_state_cleanup/cleanup_test.py @@ -0,0 +1,642 @@ +#!/usr/bin/env python3 + +# Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
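The tests in this file do not assert state cleanup directly; the companion test.sh starts the server with --log-verbose=2 and, after each case, compares the number of "StateNew" and "StateRelease" entries in the server log. A rough Python equivalent of that check, with an illustrative log path, might look like:

    def states_released(log_path="./inference_server.log"):
        # Every state object created by the gRPC frontend should eventually
        # be released, so the two counters must match.
        with open(log_path) as f:
            log = f.read()
        return log.count("StateNew") == log.count("StateRelease")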
+
+import sys
+
+sys.path.append("../common")
+
+import os
+import queue
+import signal
+import time
+import unittest
+from functools import partial
+
+import numpy as np
+import test_util as tu
+import tritonclient.grpc as grpcclient
+from tritonclient.utils import InferenceServerException
+
+
+class UserData:
+    def __init__(self):
+        self._response_queue = queue.Queue()
+
+
+def callback(user_data, result, error):
+    if error:
+        user_data._response_queue.put(error)
+    else:
+        user_data._response_queue.put(result)
+
+
+# These state cleanup tests rely on test.sh to check whether all the
+# created request objects were properly deleted by the server.
+# The purpose of these unit tests is to exercise different portions
+# of the gRPC frontend and track the state objects.
+class CleanUpTest(tu.TestResultCollector):
+    SERVER_PID = None
+
+    def setUp(self):
+        self.decoupled_model_name_ = "repeat_int32"
+        self.identity_model_name_ = "custom_zero_1_float32"
+        self.repeat_non_decoupled_model_name = "repeat_int32_non_decoupled"
+
+    def _prepare_inputs_and_outputs(self, kind):
+        if kind in ("decoupled_streaming", "non_decoupled_streaming"):
+            self.inputs_ = []
+            self.inputs_.append(grpcclient.InferInput("IN", [1], "INT32"))
+            self.inputs_.append(grpcclient.InferInput("DELAY", [1], "UINT32"))
+            self.inputs_.append(grpcclient.InferInput("WAIT", [1], "UINT32"))
+
+            self.outputs_ = []
+            self.outputs_.append(grpcclient.InferRequestedOutput("OUT"))
+            self.outputs_.append(grpcclient.InferRequestedOutput("IDX"))
+            self.requested_outputs_ = self.outputs_
+        elif kind in ("simple", "streaming"):
+            self.inputs_ = []
+            self.inputs_.append(grpcclient.InferInput("INPUT0", [1, 1], "FP32"))
+
+            self.outputs_ = []
+            self.outputs_.append(grpcclient.InferRequestedOutput("OUTPUT0"))
+            self.requested_outputs_ = self.outputs_
+        else:
+            raise ValueError("Unsupported kind specified to prepare inputs/outputs")
+
+    def _simple_infer(
+        self,
+        request_count,
+        cancel_response_idx=None,
+        client_timeout_pair=None,
+        kill_server=None,
+    ):
+        with grpcclient.InferenceServerClient(
+            url="localhost:8001", verbose=True
+        ) as triton_client:
+            self._prepare_inputs_and_outputs("simple")
+
+            input_data = np.array([[1.0]], dtype=np.float32)
+            self.inputs_[0].set_data_from_numpy(input_data)
+
+            user_data = UserData()
+
+            futures = []
+            timeout_idx = None
+            timeout_value = None
+            if client_timeout_pair:
+                timeout_idx, timeout_value = client_timeout_pair
+            for i in range(request_count):
+                if kill_server == i:
+                    os.kill(int(self.SERVER_PID), signal.SIGINT)
+                this_timeout = None
+                if timeout_idx == i:
+                    this_timeout = timeout_value
+                futures.append(
+                    triton_client.async_infer(
+                        model_name=self.identity_model_name_,
+                        inputs=self.inputs_,
+                        request_id=str(i),
+                        callback=partial(callback, user_data),
+                        outputs=self.requested_outputs_,
+                        client_timeout=this_timeout,
+                    )
+                )
+
+            if cancel_response_idx is not None:
+                futures[cancel_response_idx].cancel()
+
+            responses = []
+            while len(responses) < len(futures):
+                data_item = user_data._response_queue.get()
+                if type(data_item) == InferenceServerException:
+                    raise data_item
+                else:
+                    responses.append(data_item)
+
+            for response in responses:
+                output0_data = response.as_numpy("OUTPUT0")
+                self.assertTrue(np.array_equal(input_data, output0_data))
+
+    def _stream_infer_with_params(
+        self,
+        request_count,
+        request_delay,
+        _,
+        user_data,
+        result_dict,
+        delay_data=None,
+        delay_factor=None,
+        cancel_response_idx=None,
+        stream_timeout=None,
+        kill_server=None,
+    ):
+        with 
grpcclient.InferenceServerClient( + url="localhost:8001", verbose=True + ) as triton_client: + # Establish stream + metadata = {"triton_grpc_error": "true"} + triton_client.start_stream( + callback=partial(callback, user_data), stream_timeout=stream_timeout, headers=metadata + ) + # Send specified many requests in parallel + for i in range(request_count): + time.sleep((request_delay / 1000)) + self.inputs_[1].set_data_from_numpy(delay_data) + if kill_server == i: + os.kill(int(self.SERVER_PID), signal.SIGINT) + triton_client.async_stream_infer( + model_name=self.decoupled_model_name_, + inputs=self.inputs_, + request_id=str(i), + outputs=self.requested_outputs_, + # Opt-in to receiving flags-only responses from model/backend + # to help detect final responses for decoupled models. + enable_empty_final_response=True, + ) + # Update delay input in accordance with the scaling factor + delay_data = delay_data * delay_factor + delay_data = delay_data.astype(np.uint32) + + # Retrieve results... + recv_count = 0 + completed_requests = 0 + while completed_requests < request_count: + if cancel_response_idx == recv_count: + triton_client.stop_stream(cancel_requests=True) + data_item = user_data._response_queue.get() + if type(data_item) == InferenceServerException: + raise data_item + else: + response = data_item.get_response() + # Request IDs should generally be provided with each request + # to associate decoupled responses with their requests. + if not response.id: + raise ValueError( + "No response id found. Was a request_id provided?" + ) + + # Detect final response. Parameters are oneof and we expect bool_param + if response.parameters.get("triton_final_response").bool_param: + completed_requests += 1 + + # Only process non-empty response, ignore if empty (no outputs) + if response.outputs: + if response.id not in result_dict: + result_dict[response.id] = [] + result_dict[response.id].append((recv_count, data_item)) + recv_count += 1 + + def _stream_infer( + self, + request_count, + request_delay, + expected_count, + user_data, + result_dict, + delay_data=None, + delay_factor=None, + cancel_response_idx=None, + stream_timeout=None, + kill_server=None, + ): + with grpcclient.InferenceServerClient( + url="localhost:8001", verbose=True + ) as triton_client: + # Establish stream + metadata = {"triton_grpc_error": "true"} + triton_client.start_stream( + callback=partial(callback, user_data), stream_timeout=stream_timeout, headers=metadata + ) + # Send specified many requests in parallel + for i in range(request_count): + time.sleep((request_delay / 1000)) + model_name = self.identity_model_name_ + if delay_data is not None: + model_name = self.decoupled_model_name_ + self.inputs_[1].set_data_from_numpy(delay_data) + if kill_server == i: + os.kill(int(self.SERVER_PID), signal.SIGINT) + triton_client.async_stream_infer( + model_name=model_name, + inputs=self.inputs_, + request_id=str(i), + outputs=self.requested_outputs_, + ) + if (delay_data is not None) and (delay_factor is not None): + # Update delay input in accordance with the scaling factor + delay_data = delay_data * delay_factor + delay_data = delay_data.astype(np.uint32) + + # Retrieve results... 
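_stream_infer_with_params above detects completion through the "triton_final_response" parameter (requested via enable_empty_final_response=True), whereas the receive loop that follows counts a caller-supplied expected number of responses. A standalone sketch of the parameter-based drain, using only names already defined in this file:

    def drain_stream(response_queue, request_count):
        # Collect results until every request has delivered its final response.
        results = []
        completed = 0
        while completed < request_count:
            item = response_queue.get()
            if isinstance(item, InferenceServerException):
                raise item
            response = item.get_response()
            if response.parameters.get("triton_final_response").bool_param:
                completed += 1
            if response.outputs:
                results.append(item)
        return results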
+ recv_count = 0 + while recv_count < expected_count: + if cancel_response_idx == recv_count: + triton_client.stop_stream(cancel_requests=True) + data_item = user_data._response_queue.get() + if type(data_item) == InferenceServerException: + raise data_item + else: + this_id = data_item.get_response().id + if this_id not in result_dict: + result_dict[this_id] = [] + result_dict[this_id].append((recv_count, data_item)) + + recv_count += 1 + + def _streaming_infer( + self, + request_count, + request_delay=0, + cancel_response_idx=None, + stream_timeout=None, + kill_server=None, + should_error=True, + ): + self._prepare_inputs_and_outputs("streaming") + + input_data = np.array([[1.0]], dtype=np.float32) + self.inputs_[0].set_data_from_numpy(input_data) + + user_data = UserData() + result_dict = {} + + try: + expected_count = request_count + self._stream_infer( + request_count, + request_delay, + expected_count, + user_data, + result_dict, + cancel_response_idx=cancel_response_idx, + stream_timeout=stream_timeout, + kill_server=kill_server, + ) + except Exception as ex: + if cancel_response_idx or stream_timeout or should_error: + raise ex + self.assertTrue(False, "unexpected error {}".format(ex)) + + # Validate the results.. + for i in range(request_count): + this_id = str(i) + if this_id not in result_dict.keys(): + self.assertTrue( + False, "response for request id {} not received".format(this_id) + ) + self.assertEqual(len(result_dict[this_id]), 1) + result = result_dict[this_id][0][1] + output0_data = result.as_numpy("OUTPUT0") + self.assertTrue(np.array_equal(input_data, output0_data)) + + def _decoupled_infer( + self, + request_count, + request_delay=0, + repeat_count=1, + data_offset=100, + delay_time=1000, + delay_factor=1, + wait_time=500, + cancel_response_idx=None, + stream_timeout=None, + kill_server=None, + should_error=True, + infer_helper_map=[True, True], + ): + self._prepare_inputs_and_outputs(kind="decoupled_streaming") + + # Initialize data for IN + input_data = np.arange( + start=data_offset, stop=data_offset + repeat_count, dtype=np.int32 + ) + self.inputs_[0].set_shape([repeat_count]) + self.inputs_[0].set_data_from_numpy(input_data) + + # Initialize data for DELAY + delay_data = (np.ones([repeat_count], dtype=np.uint32)) * delay_time + self.inputs_[1].set_shape([repeat_count]) + + # Initialize data for WAIT + wait_data = np.array([wait_time], dtype=np.uint32) + self.inputs_[2].set_data_from_numpy(wait_data) + + infer_helpers = [] + if infer_helper_map[0]: + infer_helpers.append(self._stream_infer) + if infer_helper_map[1]: + infer_helpers.append(self._stream_infer_with_params) + + for infer_helper in infer_helpers: + user_data = UserData() + result_dict = {} + + try: + expected_count = repeat_count * request_count + infer_helper( + request_count, + request_delay, + expected_count, + user_data, + result_dict, + delay_data, + delay_factor, + cancel_response_idx, + stream_timeout, + kill_server, + ) + except Exception as ex: + if cancel_response_idx or stream_timeout or should_error: + raise ex + self.assertTrue(False, "unexpected error {}".format(ex)) + + # Validate the results.. 
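The decoupled helpers drive the repeat_int32 model, whose inputs the tests use as follows: IN supplies the values echoed back one response per element, DELAY the per-response delay in milliseconds, and WAIT how long the model holds the request before releasing it. A minimal request built against that contract (the values are illustrative) looks like:

    # Three responses, ~200 ms apart; request held for 1 s before release.
    repeat_count, delay_ms, wait_ms = 3, 200, 1000
    inputs = [
        grpcclient.InferInput("IN", [repeat_count], "INT32"),
        grpcclient.InferInput("DELAY", [repeat_count], "UINT32"),
        grpcclient.InferInput("WAIT", [1], "UINT32"),
    ]
    inputs[0].set_data_from_numpy(np.arange(repeat_count, dtype=np.int32))
    inputs[1].set_data_from_numpy(np.full([repeat_count], delay_ms, dtype=np.uint32))
    inputs[2].set_data_from_numpy(np.array([wait_ms], dtype=np.uint32))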
+ for i in range(request_count): + this_id = str(i) + if repeat_count != 0 and this_id not in result_dict.keys(): + self.assertTrue( + False, "response for request id {} not received".format(this_id) + ) + elif repeat_count == 0 and this_id in result_dict.keys(): + self.assertTrue( + False, + "received unexpected response for request id {}".format( + this_id + ), + ) + if repeat_count != 0: + self.assertEqual(len(result_dict[this_id]), repeat_count) + expected_data = data_offset + result_list = result_dict[this_id] + for j in range(len(result_list)): + this_data = result_list[j][1].as_numpy("OUT") + self.assertEqual(len(this_data), 1) + self.assertEqual(this_data[0], expected_data) + this_idx = result_list[j][1].as_numpy("IDX") + self.assertEqual(len(this_idx), 1) + self.assertEqual(this_idx[0], j) + expected_data += 1 + + ### + ### Non-Streaming Tests + ### + def test_simple_infer(self): + # This test case sends 10 asynchronous requests and validates + # the response. + self._simple_infer(request_count=10) + + def test_simple_infer_cancellation(self): + # This test case is used to check whether all the states are + # correctly released when one of the request is cancelled from + # the client side. + with self.assertRaises(InferenceServerException) as cm: + self._simple_infer(request_count=10, cancel_response_idx=5) + self.assertIn("Locally cancelled by application!", str(cm.exception)) + + def test_simple_infer_timeout(self): + # This test case is used to check whether all the states are + # correctly released when the request gets timed-out on the client. + with self.assertRaises(InferenceServerException) as cm: + self._simple_infer(request_count=10, client_timeout_pair=[5, 0.1]) + self.assertIn("Deadline Exceeded", str(cm.exception)) + + def test_simple_infer_error_status(self): + # This test case is used to check whether all the state objects are + # released when RPC runs into error. + with self.assertRaises(InferenceServerException) as cm: + self._simple_infer(request_count=10) + self.assertIn( + "This protocol is restricted, expecting header 'triton-grpc-protocol-infer-key'", + str(cm.exception), + ) + + def test_simple_infer_shutdownserver(self): + # This test case is used to check whether all the state objects are + # released when the server is interrupted to shutdown in the beginning + # of inference run with final parameters being returned. + with self.assertRaises(InferenceServerException) as cm: + self._simple_infer(request_count=20, kill_server=5) + + ### + ### Streaming Tests + ### + def test_streaming_infer(self): + # Sanity test to check whether all the state objects + # are correctly released. Sends 10 requests in a single + # gRPC bidirectional stream. + self._streaming_infer(request_count=10) + + def test_streaming_cancellation(self): + # This test case is used to check whether all the states are + # correctly released when the stream is closed when fifth + # response is received. + with self.assertRaises(InferenceServerException) as cm: + self._streaming_infer(request_count=10, cancel_response_idx=5) + self.assertIn("Locally cancelled by application!", str(cm.exception)) + + def test_streaming_timeout(self): + # This test case is used to check whether all the states are + # released when some of the requests timeouts. 
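Both timeout flavors exercised in this file surface as "Deadline Exceeded": a per-request client_timeout on plain inference and a stream_timeout on the whole bidirectional stream. A minimal non-streaming sketch against the delayed identity model used by these tests (the timeout value is illustrative):

    inp = grpcclient.InferInput("INPUT0", [1, 1], "FP32")
    inp.set_data_from_numpy(np.array([[1.0]], dtype=np.float32))
    try:
        with grpcclient.InferenceServerClient(url="localhost:8001") as client:
            # The companion test.sh configures custom_zero_1_float32 with a
            # 1000 ms execute delay, so a 0.1 s deadline cannot be met.
            client.infer(
                model_name="custom_zero_1_float32",
                inputs=[inp],
                client_timeout=0.1,
            )
    except InferenceServerException as e:
        assert "Deadline Exceeded" in str(e)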
+ with self.assertRaises(InferenceServerException) as cm: + self._streaming_infer(request_count=10, request_delay=1, stream_timeout=2) + self.assertIn("Deadline Exceeded", str(cm.exception)) + + def test_streaming_error_status(self): + # This test case is used to check whether all the state objects are + # released when RPC runs into error. + expected_exceptions = [ + "This protocol is restricted, expecting header 'triton-grpc-protocol-infer-key'", + "The stream is no longer in valid state, the error detail is reported through provided callback. A new stream should be started after stopping the current stream.", + ] + with self.assertRaises(InferenceServerException) as cm: + self._streaming_infer(request_count=10, should_error=True) + + exception_match = False + for expected_exception in expected_exceptions: + exception_match |= expected_exception in str(cm.exception) + self.assertTrue( + exception_match, "Raised unexpected exception {}".format(str(cm.exception)) + ) + + def test_streaming_infer_shutdownserver(self): + # This test case is used to check whether all the state objects are + # released when the server is interrupted to shutdown in middle of + # inference run. + with self.assertRaises(InferenceServerException) as cm: + self._streaming_infer( + request_count=10, + request_delay=1, + kill_server=5, + should_error=True, + ) + + ### + ### Decoupled Streaming Tests + ### + def test_decoupled_infer(self): + # Sanity test to check whether all the state objects + # are correctly released. Sends 10 requests in a single + # gRPC bidirectional stream and expects each of these + # requests to generate 10 responses. + self._decoupled_infer(request_count=10, repeat_count=10) + + def test_decoupled_cancellation(self): + # This test case is used to check whether all the states are + # correctly released when the stream is closed when fifth + # response is received. + with self.assertRaises(InferenceServerException) as cm: + self._decoupled_infer( + request_count=10, repeat_count=10, cancel_response_idx=5 + ) + self.assertIn("Locally cancelled by application!", str(cm.exception)) + + def test_decoupled_timeout(self): + # This test case is used to check whether all the states are + # released when some of the requests timeouts. + with self.assertRaises(InferenceServerException) as cm: + self._decoupled_infer( + request_count=10, repeat_count=10, request_delay=1, stream_timeout=2 + ) + self.assertIn("Deadline Exceeded", str(cm.exception)) + + def test_decoupled_error_status(self): + # This test case is used to check whether all the state objects are + # released when RPC runs into error. + expected_exceptions = [ + "This protocol is restricted, expecting header 'triton-grpc-protocol-infer-key'", + "The stream is no longer in valid state, the error detail is reported through provided callback. A new stream should be started after stopping the current stream.", + ] + with self.assertRaises(InferenceServerException) as cm: + self._decoupled_infer(request_count=10, repeat_count=10, should_error=True) + + exception_match = False + for expected_exception in expected_exceptions: + exception_match |= expected_exception in str(cm.exception) + self.assertTrue( + exception_match, "Raised unexpected exception {}".format(str(cm.exception)) + ) + + def test_decoupled_infer_shutdownserver(self): + # This test case is used to check whether all the state objects are + # released when the server is interrupted to shutdown in middle of + # inference run. 
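Every streaming helper in this file opts in to strict gRPC error reporting by sending the triton_grpc_error header when the stream is opened; once an error is reported this way the stream is no longer usable and a new one must be started, which is why the error tests above also accept the "stream is no longer in valid state" message. A minimal opt-in sketch (tensor values are illustrative):

    ins = [
        grpcclient.InferInput("IN", [1], "INT32"),
        grpcclient.InferInput("DELAY", [1], "UINT32"),
        grpcclient.InferInput("WAIT", [1], "UINT32"),
    ]
    ins[0].set_data_from_numpy(np.array([1], dtype=np.int32))
    ins[1].set_data_from_numpy(np.array([0], dtype=np.uint32))
    ins[2].set_data_from_numpy(np.array([0], dtype=np.uint32))
    user_data = UserData()
    with grpcclient.InferenceServerClient(url="localhost:8001") as client:
        # Opt in to strict gRPC error reporting for this stream.
        client.start_stream(
            callback=partial(callback, user_data),
            headers={"triton_grpc_error": "true"},
        )
        client.async_stream_infer(
            model_name="repeat_int32", inputs=ins, request_id="0"
        )
        client.stop_stream()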
+ with self.assertRaises(InferenceServerException) as cm: + self._decoupled_infer( + request_count=10, + repeat_count=10, + request_delay=1, + kill_server=5, + should_error=True, + infer_helper_map=[True, False], + ) + + def test_decoupled_infer_with_params_shutdownserver(self): + # This test case is used to check whether all the state objects are + # released when the server is interrupted to shutdown in middle of + # inference run with final parameters being returned. + with self.assertRaises(InferenceServerException) as cm: + self._decoupled_infer( + request_count=10, + repeat_count=10, + request_delay=1, + kill_server=5, + should_error=True, + infer_helper_map=[False, True], + ) + + def test_decoupled_infer_complete(self): + # Test if the Process() thread could release the state object before + # the StreamInferResponseComplete() thread is done accessing it. + self._decoupled_infer(request_count=1, repeat_count=1, stream_timeout=16) + # Check no error is printed to the log. + with open(os.environ["SERVER_LOG"]) as f: + server_log = f.read() + self.assertNotIn("Should not print this", server_log) + + def test_non_decoupled_streaming_multi_response(self): + # Test non-decoupled streaming infer with more than one response should return + # the first response. + response_count = 4 + expected_response_count = 1 + expected_response_index = 0 + + # Prepare input data + self._prepare_inputs_and_outputs("non_decoupled_streaming") + # Initialize data for IN + data_offset = 100 + input_data = np.arange( + start=data_offset, stop=data_offset + response_count, dtype=np.int32 + ) + self.inputs_[0].set_shape([response_count]) + self.inputs_[0].set_data_from_numpy(input_data) + # Initialize data for DELAY + delay_data = np.zeros([response_count], dtype=np.uint32) + self.inputs_[1].set_shape([response_count]) + self.inputs_[1].set_data_from_numpy(delay_data) + # Initialize data for WAIT + wait_data = np.array([0], dtype=np.uint32) + self.inputs_[2].set_data_from_numpy(wait_data) + + # Infer + user_data = UserData() + with grpcclient.InferenceServerClient( + url="localhost:8001", verbose=True + ) as client: + # Establish stream + metadata = {"triton_grpc_error": "true"} + client.start_stream( + callback=partial(callback, user_data), stream_timeout=16, headers=metadata + ) + # Send a request + client.async_stream_infer( + model_name=self.repeat_non_decoupled_model_name, + inputs=self.inputs_, + request_id="0", + outputs=self.requested_outputs_, + ) + # Wait for all results and stop stream + client.stop_stream() + + # Check infer output + actual_response_count = 0 + while not user_data._response_queue.empty(): + actual_response_count += 1 + data_item = user_data._response_queue.get() + if type(data_item) == InferenceServerException: + raise data_item + else: + response_idx = data_item.as_numpy("IDX")[0] + self.assertEqual(response_idx, expected_response_index) + self.assertEqual(actual_response_count, expected_response_count) + + +if __name__ == "__main__": + CleanUpTest.SERVER_PID = os.environ.get("SERVER_PID", CleanUpTest.SERVER_PID) + unittest.main() diff --git a/qa/L0_grpc_error_state_cleanup/test.sh b/qa/L0_grpc_error_state_cleanup/test.sh new file mode 100755 index 0000000000..df302d5ed1 --- /dev/null +++ b/qa/L0_grpc_error_state_cleanup/test.sh @@ -0,0 +1,235 @@ +#!/bin/bash +# Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION} +if [ "$#" -ge 1 ]; then + REPO_VERSION=$1 +fi +if [ -z "$REPO_VERSION" ]; then + echo -e "Repository version must be specified" + echo -e "\n***\n*** Test Failed\n***" + exit 1 +fi +if [ ! -z "$TEST_REPO_ARCH" ]; then + REPO_VERSION=${REPO_VERSION}_${TEST_REPO_ARCH} +fi + +export CUDA_VISIBLE_DEVICES=0 + +RET=0 +CLEANUP_TEST=cleanup_test.py + +rm -f *.log + +CLIENT_LOG=`pwd`/client.log +SERVER=/opt/tritonserver/bin/tritonserver +source ../common/util.sh + +function check_state_release() { + local log_file=$1 + + num_state_release=`cat $log_file | grep "StateRelease" | wc -l` + num_state_new=`cat $log_file | grep "StateNew" | wc -l` + + if [ $num_state_release -ne $num_state_new ]; then + cat $log_file + echo -e "\n***\n*** Test Failed: Mismatch detected, $num_state_new state(s) created, $num_state_release state(s) released. \n***" >> $log_file + return 1 + fi + + return 0 +} + +rm -fr ./models/custom_zero_1_float32 && \ + cp -r ../custom_models/custom_zero_1_float32 ./models/. 
&& \ + mkdir -p ./models/custom_zero_1_float32/1 + +(cd models/custom_zero_1_float32 && \ + echo "parameters [" >> config.pbtxt && \ + echo "{ key: \"execute_delay_ms\"; value: { string_value: \"1000\" }}" >> config.pbtxt && \ + echo "]" >> config.pbtxt) + +rm -rf models/repeat_int32_non_decoupled && \ + cp -r models/repeat_int32 models/repeat_int32_non_decoupled && \ + (cd models/repeat_int32_non_decoupled && \ + sed -i "/model_transaction_policy/,+2d" config.pbtxt && \ + sed -i "s/repeat_int32/repeat_int32_non_decoupled/" config.pbtxt) + +for i in test_simple_infer \ + test_simple_infer_cancellation \ + test_simple_infer_timeout \ + test_streaming_infer \ + test_streaming_timeout \ + test_streaming_cancellation \ + test_decoupled_infer \ + test_decoupled_cancellation \ + test_decoupled_timeout \ + test_non_decoupled_streaming_multi_response; do + SERVER_LOG="./inference_server.$i.log" + SERVER_ARGS="--model-repository=`pwd`/models --log-verbose=2" + run_server + if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 + fi + + echo "Test: $i" >>$CLIENT_LOG + + set +e + python $CLEANUP_TEST CleanUpTest.$i >>$CLIENT_LOG 2>&1 + if [ $? -ne 0 ]; then + echo -e "\n***\n*** Test $i Failed\n***" >>$CLIENT_LOG + echo -e "\n***\n*** Test $i Failed\n***" + RET=1 + fi + + kill $SERVER_PID + wait $SERVER_PID + + check_state_release $SERVER_LOG + if [ $? -ne 0 ]; then + cat $SERVER_LOG + echo -e "\n***\n*** State Verification Failed for $i\n***" + RET=1 + fi + set -e +done + + +for i in test_simple_infer_error_status \ + test_streaming_error_status \ + test_decoupled_error_status; do + SERVER_LOG="./inference_server.$i.log" + SERVER_ARGS="--model-repository=`pwd`/models --log-verbose=2 --grpc-restricted-protocol=inference:infer-key=infer-value" + run_server + if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 + fi + + echo "Test: $i" >>$CLIENT_LOG + + set +e + python $CLEANUP_TEST CleanUpTest.$i >>$CLIENT_LOG 2>&1 + if [ $? -ne 0 ]; then + echo -e "\n***\n*** Test $i Failed\n***" >>$CLIENT_LOG + echo -e "\n***\n*** Test $i Failed\n***" + RET=1 + fi + + kill $SERVER_PID + wait $SERVER_PID + + check_state_release $SERVER_LOG + if [ $? -ne 0 ]; then + cat $SERVER_LOG + echo -e "\n***\n*** State Verification Failed for $i\n***" + RET=1 + fi + + set -e +done + +for i in test_simple_infer_shutdownserver \ + test_streaming_infer_shutdownserver \ + test_decoupled_infer_shutdownserver \ + test_decoupled_infer_with_params_shutdownserver; do + SERVER_ARGS="--model-repository=`pwd`/models --log-verbose=2" + SERVER_LOG="./inference_server.$i.log" + run_server + if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 + fi + + echo "Test: $i" >>$CLIENT_LOG + + set +e + SERVER_PID=$SERVER_PID python $CLEANUP_TEST CleanUpTest.$i >>$CLIENT_LOG 2>&1 + if [ $? -ne 0 ]; then + echo -e "\n***\n*** Test $i Failed\n***" >>$CLIENT_LOG + echo -e "\n***\n*** Test $i Failed\n***" + RET=1 + fi + + wait $SERVER_PID + + check_state_release $SERVER_LOG + if [ $? 
-ne 0 ]; then + cat $SERVER_LOG + echo -e "\n***\n*** State Verification Failed for $i\n***" + RET=1 + fi + + set -e +done + +TEST_NAME=test_decoupled_infer_complete +export TRITONSERVER_DELAY_GRPC_COMPLETE=2000 + +SERVER_LOG="./inference_server.$TEST_NAME.log" +SERVER_ARGS="--model-repository=`pwd`/models --log-verbose=2" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +echo "Test: $TEST_NAME" >>$CLIENT_LOG + +set +e + +SERVER_LOG=$SERVER_LOG python $CLEANUP_TEST CleanUpTest.$TEST_NAME >>$CLIENT_LOG 2>&1 +if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test $TEST_NAME Failed\n***" + RET=1 +fi + +kill $SERVER_PID +wait $SERVER_PID + +check_state_release $SERVER_LOG +if [ $? -ne 0 ]; then + cat $SERVER_LOG + echo -e "\n***\n*** State Verification Failed for $TEST_NAME\n***" + RET=1 +fi + +set -e + +if [ $RET -eq 0 ]; then + echo -e "\n***\n*** Test Passed\n***" +else + echo -e "\n***\n*** Test Failed\n***" +fi + +exit $RET From cb548ff53e22aafe6e729cc365141bb3d6f90c74 Mon Sep 17 00:00:00 2001 From: Indrajit Bhosale Date: Fri, 16 Aug 2024 01:53:47 -0700 Subject: [PATCH 28/32] Pre-Commit format --- qa/L0_decoupled/decoupled_test.py | 2 +- qa/L0_decoupled_grpc_error/decoupled_test.py | 10 +++++++--- qa/L0_grpc_error_state_cleanup/cleanup_test.py | 12 +++++++++--- 3 files changed, 17 insertions(+), 7 deletions(-) diff --git a/qa/L0_decoupled/decoupled_test.py b/qa/L0_decoupled/decoupled_test.py index d0f09deaf9..1f76f4845b 100755 --- a/qa/L0_decoupled/decoupled_test.py +++ b/qa/L0_decoupled/decoupled_test.py @@ -644,4 +644,4 @@ def test_http(self): if __name__ == "__main__": - unittest.main() \ No newline at end of file + unittest.main() diff --git a/qa/L0_decoupled_grpc_error/decoupled_test.py b/qa/L0_decoupled_grpc_error/decoupled_test.py index fc606e3cca..1d6e2f7029 100755 --- a/qa/L0_decoupled_grpc_error/decoupled_test.py +++ b/qa/L0_decoupled_grpc_error/decoupled_test.py @@ -117,7 +117,9 @@ def _stream_infer_with_params( ) as triton_client: # Establish stream metadata = {"triton_grpc_error": "true"} - triton_client.start_stream(callback=partial(callback, user_data), headers=metadata) + triton_client.start_stream( + callback=partial(callback, user_data), headers=metadata + ) # Send specified many requests in parallel for i in range(request_count): time.sleep((request_delay / 1000)) @@ -177,7 +179,9 @@ def _stream_infer( ) as triton_client: # Establish stream metadata = {"triton_grpc_error": "true"} - triton_client.start_stream(callback=partial(callback, user_data), headers=metadata) + triton_client.start_stream( + callback=partial(callback, user_data), headers=metadata + ) # Send specified many requests in parallel for i in range(request_count): time.sleep((request_delay / 1000)) @@ -646,4 +650,4 @@ def test_http(self): if __name__ == "__main__": - unittest.main() \ No newline at end of file + unittest.main() diff --git a/qa/L0_grpc_error_state_cleanup/cleanup_test.py b/qa/L0_grpc_error_state_cleanup/cleanup_test.py index 07537013aa..4425c5c667 100755 --- a/qa/L0_grpc_error_state_cleanup/cleanup_test.py +++ b/qa/L0_grpc_error_state_cleanup/cleanup_test.py @@ -163,7 +163,9 @@ def _stream_infer_with_params( # Establish stream metadata = {"triton_grpc_error": "true"} triton_client.start_stream( - callback=partial(callback, user_data), stream_timeout=stream_timeout, headers=metadata + callback=partial(callback, user_data), + stream_timeout=stream_timeout, + headers=metadata, ) # Send specified many 
requests in parallel for i in range(request_count): @@ -232,7 +234,9 @@ def _stream_infer( # Establish stream metadata = {"triton_grpc_error": "true"} triton_client.start_stream( - callback=partial(callback, user_data), stream_timeout=stream_timeout, headers=metadata + callback=partial(callback, user_data), + stream_timeout=stream_timeout, + headers=metadata, ) # Send specified many requests in parallel for i in range(request_count): @@ -612,7 +616,9 @@ def test_non_decoupled_streaming_multi_response(self): # Establish stream metadata = {"triton_grpc_error": "true"} client.start_stream( - callback=partial(callback, user_data), stream_timeout=16, headers=metadata + callback=partial(callback, user_data), + stream_timeout=16, + headers=metadata, ) # Send a request client.async_stream_infer( From 1b6b3a7ecd4b36cc54e845477118dee1935c92a3 Mon Sep 17 00:00:00 2001 From: Indrajit Bhosale Date: Fri, 16 Aug 2024 08:44:21 -0700 Subject: [PATCH 29/32] Devel build fix --- Dockerfile.QA | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile.QA b/Dockerfile.QA index a3073948c5..1417515c42 100644 --- a/Dockerfile.QA +++ b/Dockerfile.QA @@ -250,7 +250,7 @@ RUN mkdir -p qa/L0_decoupled/models/repeat_int32/1 && \ mkdir -p qa/L0_decoupled/models/sequence_repeat/1 && \ mkdir -p qa/L0_decoupled/models/repeat_square/1 && \ mkdir -p qa/L0_decoupled/models/nested_square/1 && \ - mkdir -p qa/L0_grpc_state_cleanup/models/repeat_int32/1 + mkdir -p qa/L0_grpc_state_cleanup/models/repeat_int32/1 \ mkdir -p qa/L0_grpc_error_state_cleanup/models/repeat_int32/1 RUN if [ "$IGPU_BUILD" == "0" ]; then \ From 70ce2790bbc8e69fcaaf14c510d33db00b1e7bf9 Mon Sep 17 00:00:00 2001 From: Indrajit Bhosale Date: Fri, 16 Aug 2024 11:41:33 -0700 Subject: [PATCH 30/32] Streamline new tests --- Dockerfile.QA | 12 +- qa/L0_decoupled/decoupled_test.py | 16 +- qa/L0_decoupled_grpc_error/decoupled_test.py | 653 ------------------ qa/L0_decoupled_grpc_error/test.sh | 179 ----- .../cleanup_test.py | 648 ----------------- qa/L0_grpc_error_state_cleanup/test.sh | 235 ------- qa/L0_grpc_state_cleanup/cleanup_test.py | 42 +- 7 files changed, 54 insertions(+), 1731 deletions(-) delete mode 100755 qa/L0_decoupled_grpc_error/decoupled_test.py delete mode 100755 qa/L0_decoupled_grpc_error/test.sh delete mode 100755 qa/L0_grpc_error_state_cleanup/cleanup_test.py delete mode 100755 qa/L0_grpc_error_state_cleanup/test.sh diff --git a/Dockerfile.QA b/Dockerfile.QA index 1417515c42..22f312b930 100644 --- a/Dockerfile.QA +++ b/Dockerfile.QA @@ -113,8 +113,6 @@ RUN mkdir -p qa/common && \ cp -r docs/examples/model_repository/inception_graphdef qa/L0_grpc/models && \ mkdir qa/L0_grpc_state_cleanup/models && \ cp -r /workspace/src/test/models/repeat_int32 qa/L0_grpc_state_cleanup/models/ && \ - mkdir qa/L0_grpc_error_state_cleanup/models && \ - cp -r /workspace/src/test/models/repeat_int32 qa/L0_grpc_error_state_cleanup/models/ && \ mkdir qa/L0_http/models && \ cp -r docs/examples/model_repository/simple qa/L0_http/models && \ cp -r docs/examples/model_repository/simple_dyna_sequence qa/L0_http/models && \ @@ -250,14 +248,12 @@ RUN mkdir -p qa/L0_decoupled/models/repeat_int32/1 && \ mkdir -p qa/L0_decoupled/models/sequence_repeat/1 && \ mkdir -p qa/L0_decoupled/models/repeat_square/1 && \ mkdir -p qa/L0_decoupled/models/nested_square/1 && \ - mkdir -p qa/L0_grpc_state_cleanup/models/repeat_int32/1 \ - mkdir -p qa/L0_grpc_error_state_cleanup/models/repeat_int32/1 + mkdir -p qa/L0_grpc_state_cleanup/models/repeat_int32/1 RUN if [ 
"$IGPU_BUILD" == "0" ]; then \ cp backends/repeat/libtriton_repeat.so qa/L0_model_config && \ cp backends/repeat/libtriton_repeat.so qa/L0_decoupled/models/repeat_int32/1 && \ cp backends/repeat/libtriton_repeat.so qa/L0_grpc_state_cleanup/models/repeat_int32/1/. && \ - cp backends/repeat/libtriton_repeat.so qa/L0_grpc_error_state_cleanup/models/repeat_int32/1/. && \ cp backends/square/libtriton_square.so qa/L0_decoupled/models/square_int32/1; \ fi @@ -271,6 +267,12 @@ RUN cp -r qa/L0_decoupled/models qa/L0_decoupled/python_models/ && \ cp /workspace/tritonbuild/python/examples/decoupled/square_config.pbtxt \ qa/L0_decoupled/python_models/square_int32/. +RUN mkdir -p qa/L0_decoupled_grpc_error && \ + cp -r qa/L0_decoupled/. qa/L0_decoupled_grpc_error && \ + +RUN mkdir -p qa/L0_grpc_error_state_cleanup && \ + cp -r qa/L0_grpc_state_cleanup/. qa/L0_grpc_error_state_cleanup + RUN mkdir -p qa/L0_repoagent_checksum/models/identity_int32/1 && \ cp tritonbuild/identity/install/backends/identity/libtriton_identity.so \ qa/L0_repoagent_checksum/models/identity_int32/1/. diff --git a/qa/L0_decoupled/decoupled_test.py b/qa/L0_decoupled/decoupled_test.py index 1f76f4845b..d7bc59f5c7 100755 --- a/qa/L0_decoupled/decoupled_test.py +++ b/qa/L0_decoupled/decoupled_test.py @@ -116,7 +116,13 @@ def _stream_infer_with_params( url="localhost:8001", verbose=True ) as triton_client: # Establish stream - triton_client.start_stream(callback=partial(callback, user_data)) + if "TRITONSERVER_GRPC_STATUS_FLAG" in os.environ: + metadata = {"triton_grpc_error": "true"} + triton_client.start_stream( + callback=partial(callback, user_data), headers=metadata + ) + else: + triton_client.start_stream(callback=partial(callback, user_data)) # Send specified many requests in parallel for i in range(request_count): time.sleep((request_delay / 1000)) @@ -175,7 +181,13 @@ def _stream_infer( url="localhost:8001", verbose=True ) as triton_client: # Establish stream - triton_client.start_stream(callback=partial(callback, user_data)) + if "TRITONSERVER_GRPC_STATUS_FLAG" in os.environ: + metadata = {"triton_grpc_error": "true"} + triton_client.start_stream( + callback=partial(callback, user_data), headers=metadata + ) + else: + triton_client.start_stream(callback=partial(callback, user_data)) # Send specified many requests in parallel for i in range(request_count): time.sleep((request_delay / 1000)) diff --git a/qa/L0_decoupled_grpc_error/decoupled_test.py b/qa/L0_decoupled_grpc_error/decoupled_test.py deleted file mode 100755 index 1d6e2f7029..0000000000 --- a/qa/L0_decoupled_grpc_error/decoupled_test.py +++ /dev/null @@ -1,653 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. 
-# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -import sys - -sys.path.append("../common") - -import os -import queue -import time -import unittest -from functools import partial - -import numpy as np -import test_util as tu -import tritonclient.grpc as grpcclient -import tritonclient.http as httpclient -from tritonclient.utils import InferenceServerException - - -class UserData: - def __init__(self): - self._response_queue = queue.Queue() - - -def callback(user_data, result, error): - if error: - user_data._response_queue.put(error) - else: - user_data._response_queue.put(result) - - -class DecoupledTest(tu.TestResultCollector): - def setUp(self): - self.trials_ = [ - ("repeat_int32", None), - ("simple_repeat", None), - ("sequence_repeat", None), - ("fan_repeat", self._fan_validate), - ("repeat_square", self._nested_validate), - ("nested_square", self._nested_validate), - ] - self.model_name_ = "repeat_int32" - - self.inputs_ = [] - self.inputs_.append(grpcclient.InferInput("IN", [1], "INT32")) - self.inputs_.append(grpcclient.InferInput("DELAY", [1], "UINT32")) - self.inputs_.append(grpcclient.InferInput("WAIT", [1], "UINT32")) - - self.outputs_ = [] - self.outputs_.append(grpcclient.InferRequestedOutput("OUT")) - self.outputs_.append(grpcclient.InferRequestedOutput("IDX")) - # Some trials only expect a subset of outputs - self.requested_outputs_ = self.outputs_ - - # Client can receive a "triton_final_response" response parameter - # from Triton server that indicates when a response is the final response for - # its request. - # - # For non-decoupled models, there is a 1:1 request:response ratio, so every - # response is the final response, and this parameter is unnecessary. - # - # For decoupled models, there is a 1:N request:response ratio, so there may be - # more one response before receiving the "final" response. - # - # However, decoupled models have the unique property in that they can return - # a flags-only response to the server to indicate completion, which is not - # returned to the client by default (See TRITONBACKEND_ResponseFactorySendFlags). - # - # To forward this flags-only response to the client, users must opt-in to this - # behavior by adding the following argument: - # client.async_stream_infer(..., enable_empty_final_response=True). - # - # If the decoupled backend/model always sends the final response flag along - # with a non-null response, no opt-in is needed. - # - # With this behavior, the client can programmatically detect when all responses - # for an individual request have been received without knowing the expected - # number of responses in advance and without closing the stream. 
- def _stream_infer_with_params( - self, - request_count, - request_delay, - _, - delay_data, - delay_factor, - user_data, - result_dict, - ): - with grpcclient.InferenceServerClient( - url="localhost:8001", verbose=True - ) as triton_client: - # Establish stream - metadata = {"triton_grpc_error": "true"} - triton_client.start_stream( - callback=partial(callback, user_data), headers=metadata - ) - # Send specified many requests in parallel - for i in range(request_count): - time.sleep((request_delay / 1000)) - self.inputs_[1].set_data_from_numpy(delay_data) - triton_client.async_stream_infer( - model_name=self.model_name_, - inputs=self.inputs_, - request_id=str(i), - outputs=self.requested_outputs_, - # Opt-in to receiving flags-only responses from model/backend - # to help detect final responses for decoupled models. - enable_empty_final_response=True, - ) - # Update delay input in accordance with the scaling factor - delay_data = delay_data * delay_factor - delay_data = delay_data.astype(np.uint32) - - # Retrieve results... - recv_count = 0 - completed_requests = 0 - while completed_requests < request_count: - data_item = user_data._response_queue.get() - if type(data_item) == InferenceServerException: - raise data_item - else: - response = data_item.get_response() - # Request IDs should generally be provided with each request - # to associate decoupled responses with their requests. - if not response.id: - raise ValueError( - "No response id found. Was a request_id provided?" - ) - - # Detect final response. Parameters are oneof and we expect bool_param - if response.parameters.get("triton_final_response").bool_param: - completed_requests += 1 - - # Only process non-empty response, ignore if empty (no outputs) - if response.outputs: - if response.id not in result_dict: - result_dict[response.id] = [] - result_dict[response.id].append((recv_count, data_item)) - recv_count += 1 - - def _stream_infer( - self, - request_count, - request_delay, - expected_count, - delay_data, - delay_factor, - user_data, - result_dict, - ): - with grpcclient.InferenceServerClient( - url="localhost:8001", verbose=True - ) as triton_client: - # Establish stream - metadata = {"triton_grpc_error": "true"} - triton_client.start_stream( - callback=partial(callback, user_data), headers=metadata - ) - # Send specified many requests in parallel - for i in range(request_count): - time.sleep((request_delay / 1000)) - self.inputs_[1].set_data_from_numpy(delay_data) - triton_client.async_stream_infer( - model_name=self.model_name_, - inputs=self.inputs_, - request_id=str(i), - outputs=self.requested_outputs_, - ) - # Update delay input in accordance with the scaling factor - delay_data = delay_data * delay_factor - delay_data = delay_data.astype(np.uint32) - - # Retrieve results... 
- recv_count = 0 - while recv_count < expected_count: - data_item = user_data._response_queue.get() - if type(data_item) == InferenceServerException: - raise data_item - else: - this_id = data_item.get_response().id - if this_id not in result_dict: - result_dict[this_id] = [] - result_dict[this_id].append((recv_count, data_item)) - - recv_count += 1 - - def _fan_validate(self, result_list, data_offset, repeat_count): - # fan_repeat returns "2 * data_offset" as result - self.assertEqual(len(result_list), repeat_count) - expected_data = 2 * data_offset - for j in range(len(result_list)): - this_data = result_list[j][1].as_numpy("OUT") - self.assertEqual(len(this_data), 1) - self.assertEqual(this_data[0], expected_data) - expected_data += 2 - - def _nested_validate(self, result_list, data_offset, repeat_count): - # if repeat model returns repeat result n, repeat_square-like model - # will return the same result n times - expected_len = sum(x for x in range(data_offset, data_offset + repeat_count)) - self.assertEqual(len(result_list), expected_len) - expected_data = data_offset - expected_count = expected_data - for j in range(len(result_list)): - this_data = result_list[j][1].as_numpy("OUT") - self.assertEqual(len(this_data), 1) - self.assertEqual(this_data[0], expected_data) - expected_count -= 1 - if expected_count == 0: - expected_data += 1 - expected_count = expected_data - - def _decoupled_infer( - self, - request_count, - request_delay=0, - repeat_count=1, - data_offset=100, - delay_time=1000, - delay_factor=1, - wait_time=500, - order_sequence=None, - validate_fn=None, - ): - # Initialize data for IN - input_data = np.arange( - start=data_offset, stop=data_offset + repeat_count, dtype=np.int32 - ) - self.inputs_[0].set_shape([repeat_count]) - self.inputs_[0].set_data_from_numpy(input_data) - - # Initialize data for DELAY - delay_data = (np.ones([repeat_count], dtype=np.uint32)) * delay_time - self.inputs_[1].set_shape([repeat_count]) - - # Initialize data for WAIT - wait_data = np.array([wait_time], dtype=np.uint32) - self.inputs_[2].set_data_from_numpy(wait_data) - - # use validate_fn to differentiate requested outputs - self.requested_outputs_ = ( - self.outputs_ if validate_fn is None else self.outputs_[0:1] - ) - - for infer_helper in [self._stream_infer, self._stream_infer_with_params]: - user_data = UserData() - result_dict = {} - - try: - if "square" not in self.model_name_: - expected_count = repeat_count * request_count - else: - expected_count = ( - sum(x for x in range(data_offset, data_offset + repeat_count)) - * request_count - ) - infer_helper( - request_count, - request_delay, - expected_count, - delay_data, - delay_factor, - user_data, - result_dict, - ) - except Exception as ex: - self.assertTrue(False, "unexpected error {}".format(ex)) - - # Validate the results.. 
- for i in range(request_count): - this_id = str(i) - if repeat_count != 0 and this_id not in result_dict.keys(): - self.assertTrue( - False, "response for request id {} not received".format(this_id) - ) - elif repeat_count == 0 and this_id in result_dict.keys(): - self.assertTrue( - False, - "received unexpected response for request id {}".format( - this_id - ), - ) - if repeat_count != 0: - if validate_fn is None: - self.assertEqual(len(result_dict[this_id]), repeat_count) - expected_data = data_offset - result_list = result_dict[this_id] - for j in range(len(result_list)): - if order_sequence is not None: - self.assertEqual( - result_list[j][0], order_sequence[i][j] - ) - this_data = result_list[j][1].as_numpy("OUT") - self.assertEqual(len(this_data), 1) - self.assertEqual(this_data[0], expected_data) - this_idx = result_list[j][1].as_numpy("IDX") - self.assertEqual(len(this_idx), 1) - self.assertEqual(this_idx[0], j) - expected_data += 1 - else: - validate_fn(result_dict[this_id], data_offset, repeat_count) - - def test_one_to_none(self): - # Test cases where each request generates no response. - # Note the name of the test one_to_none implies the - # mapping between requests and responses. - - for trial in self.trials_: - self.model_name_ = trial[0] - # Single request case - self._decoupled_infer(request_count=1, repeat_count=0, validate_fn=trial[1]) - # Multiple request case - self._decoupled_infer(request_count=5, repeat_count=0, validate_fn=trial[1]) - - def test_one_to_one(self): - # Test cases where each request generates single response. - # Note the name of the test one_to_one implies the - # mapping between requests and responses. - - for trial in self.trials_: - self.model_name_ = trial[0] - # Single request case - # Release request before the response is delivered - self._decoupled_infer(request_count=1, wait_time=500, validate_fn=trial[1]) - # Release request after the response is delivered - self._decoupled_infer(request_count=1, wait_time=2000, validate_fn=trial[1]) - - # Multiple request case - # Release request before the response is delivered - self._decoupled_infer(request_count=5, wait_time=500, validate_fn=trial[1]) - # Release request after the response is delivered - self._decoupled_infer(request_count=5, wait_time=2000, validate_fn=trial[1]) - - def test_one_to_many(self): - # Test cases where each request generates multiple response. - # Note the name of the test one_to_many implies the - # mapping between requests and responses. 
- - self.assertFalse("TRITONSERVER_DELAY_GRPC_RESPONSE" in os.environ) - - for trial in self.trials_: - self.model_name_ = trial[0] - # Single request case - # Release request before the first response is delivered - self._decoupled_infer( - request_count=1, repeat_count=5, wait_time=500, validate_fn=trial[1] - ) - # Release request when the responses are getting delivered - self._decoupled_infer( - request_count=1, repeat_count=5, wait_time=2000, validate_fn=trial[1] - ) - # Release request after all the responses are delivered - self._decoupled_infer( - request_count=1, repeat_count=5, wait_time=10000, validate_fn=trial[1] - ) - - # Multiple request case - # Release request before the first response is delivered - self._decoupled_infer( - request_count=5, repeat_count=5, wait_time=500, validate_fn=trial[1] - ) - # Release request when the responses are getting delivered - self._decoupled_infer( - request_count=5, repeat_count=5, wait_time=2000, validate_fn=trial[1] - ) - # Release request after all the responses are delivered - self._decoupled_infer( - request_count=5, repeat_count=5, wait_time=10000, validate_fn=trial[1] - ) - - def test_one_to_multi_many(self): - # Test cases where each request generates multiple response but the - # responses are delayed so as to stress the control path handling the - # queued responses. - - self.assertTrue("TRITONSERVER_DELAY_GRPC_RESPONSE" in os.environ) - - for trial in self.trials_: - self.model_name_ = trial[0] - # Single request case - # Release request before the first response is delivered - self._decoupled_infer( - request_count=1, repeat_count=5, wait_time=500, validate_fn=trial[1] - ) - # Release request when the responses are getting delivered - self._decoupled_infer( - request_count=1, repeat_count=5, wait_time=8000, validate_fn=trial[1] - ) - # Release request after all the responses are delivered - self._decoupled_infer( - request_count=1, repeat_count=5, wait_time=20000, validate_fn=trial[1] - ) - - # Multiple request case - # Release request before the first response is delivered - self._decoupled_infer( - request_count=5, repeat_count=5, wait_time=500, validate_fn=trial[1] - ) - # Release request when the responses are getting delivered - self._decoupled_infer( - request_count=5, repeat_count=5, wait_time=3000, validate_fn=trial[1] - ) - # Release request after all the responses are delivered - self._decoupled_infer( - request_count=5, repeat_count=5, wait_time=10000, validate_fn=trial[1] - ) - - def test_response_order(self): - # Test the expected response order for different cases - - self.assertFalse("TRITONSERVER_DELAY_GRPC_RESPONSE" in os.environ) - - for trial in self.trials_: - self.model_name_ = trial[0] - - # Case 1: Interleaved responses - self._decoupled_infer( - request_count=2, - request_delay=500, - repeat_count=4, - order_sequence=[[0, 2, 4, 6], [1, 3, 5, 7]], - validate_fn=trial[1], - ) - - # Case 2: All responses of second request delivered before any - # response from the first - self._decoupled_infer( - request_count=2, - request_delay=500, - repeat_count=4, - delay_time=2000, - delay_factor=0.1, - order_sequence=[[4, 5, 6, 7], [0, 1, 2, 3]], - validate_fn=trial[1], - ) - - # Case 3: Similar to Case 2, but the second request is generated - # after the first response from first request is received - self._decoupled_infer( - request_count=2, - request_delay=2500, - repeat_count=4, - delay_time=2000, - delay_factor=0.1, - order_sequence=[[0, 5, 6, 7], [1, 2, 3, 4]], - validate_fn=trial[1], - ) - - # Case 4: All the 
responses of second requests are dleivered after - # all the responses from first requests are received - self._decoupled_infer( - request_count=2, - request_delay=100, - repeat_count=4, - delay_time=500, - delay_factor=10, - order_sequence=[[0, 1, 2, 3], [4, 5, 6, 7]], - validate_fn=trial[1], - ) - - # Case 5: Similar to Case 4, but the second request is generated - # after the first response from the first request is received - self._decoupled_infer( - request_count=2, - request_delay=750, - repeat_count=4, - delay_time=500, - delay_factor=10, - order_sequence=[[0, 1, 2, 3], [4, 5, 6, 7]], - validate_fn=trial[1], - ) - - def _no_streaming_helper(self, protocol): - data_offset = 100 - repeat_count = 1 - delay_time = 1000 - wait_time = 2000 - - input_data = np.arange( - start=data_offset, stop=data_offset + repeat_count, dtype=np.int32 - ) - delay_data = (np.ones([repeat_count], dtype=np.uint32)) * delay_time - wait_data = np.array([wait_time], dtype=np.uint32) - - if protocol == "grpc": - # Use the inputs and outputs from the setUp - this_inputs = self.inputs_ - this_outputs = self.outputs_ - else: - this_inputs = [] - this_inputs.append(httpclient.InferInput("IN", [repeat_count], "INT32")) - this_inputs.append(httpclient.InferInput("DELAY", [1], "UINT32")) - this_inputs.append(httpclient.InferInput("WAIT", [1], "UINT32")) - this_outputs = [] - this_outputs.append(httpclient.InferRequestedOutput("OUT")) - - # Initialize data for IN - this_inputs[0].set_shape([repeat_count]) - this_inputs[0].set_data_from_numpy(input_data) - - # Initialize data for DELAY - this_inputs[1].set_shape([repeat_count]) - this_inputs[1].set_data_from_numpy(delay_data) - - # Initialize data for WAIT - this_inputs[2].set_data_from_numpy(wait_data) - - if protocol == "grpc": - triton_client = grpcclient.InferenceServerClient( - url="localhost:8001", verbose=True - ) - else: - triton_client = httpclient.InferenceServerClient( - url="localhost:8000", verbose=True - ) - - with self.assertRaises(InferenceServerException) as cm: - triton_client.infer( - model_name=self.model_name_, inputs=this_inputs, outputs=this_outputs - ) - - self.assertIn( - "doesn't support models with decoupled transaction policy", - str(cm.exception), - ) - - def test_no_streaming(self): - # Test cases with no streaming inference. Server should give - # appropriate error in such cases. - - self._no_streaming_helper("grpc") - self._no_streaming_helper("http") - - def test_wrong_shape(self): - # Sends mismatching shapes for IN and DELAY. Server should return - # appropriate error message. The shape of IN is [repeat_count], - # where as shape of DELAY is [repeat_count + 1]. 
- - data_offset = 100 - repeat_count = 1 - delay_time = 1000 - wait_time = 2000 - - input_data = np.arange( - start=data_offset, stop=data_offset + repeat_count, dtype=np.int32 - ) - delay_data = (np.ones([repeat_count + 1], dtype=np.uint32)) * delay_time - wait_data = np.array([wait_time], dtype=np.uint32) - - # Initialize data for IN - self.inputs_[0].set_shape([repeat_count]) - self.inputs_[0].set_data_from_numpy(input_data) - - # Initialize data for DELAY - self.inputs_[1].set_shape([repeat_count + 1]) - self.inputs_[1].set_data_from_numpy(delay_data) - - # Initialize data for WAIT - self.inputs_[2].set_data_from_numpy(wait_data) - - user_data = UserData() - result_dict = {} - - with self.assertRaises(InferenceServerException) as cm: - self._stream_infer( - 1, 0, repeat_count, delay_data, 1, user_data, result_dict - ) - - self.assertIn( - "expected IN and DELAY shape to match, got [1] and [2]", str(cm.exception) - ) - - -class NonDecoupledTest(tu.TestResultCollector): - def setUp(self): - self.model_name_ = "repeat_int32" - self.input_data = { - "IN": np.array([1], dtype=np.int32), - "DELAY": np.array([0], dtype=np.uint32), - "WAIT": np.array([0], dtype=np.uint32), - } - - def test_grpc(self): - inputs = [ - grpcclient.InferInput("IN", [1], "INT32").set_data_from_numpy( - self.input_data["IN"] - ), - grpcclient.InferInput("DELAY", [1], "UINT32").set_data_from_numpy( - self.input_data["DELAY"] - ), - grpcclient.InferInput("WAIT", [1], "UINT32").set_data_from_numpy( - self.input_data["WAIT"] - ), - ] - - triton_client = grpcclient.InferenceServerClient( - url="localhost:8001", verbose=True - ) - # Expect the inference is successful - res = triton_client.infer(model_name=self.model_name_, inputs=inputs) - self.assertEqual(1, res.as_numpy("OUT")[0]) - self.assertEqual(0, res.as_numpy("IDX")[0]) - - def test_http(self): - inputs = [ - httpclient.InferInput("IN", [1], "INT32").set_data_from_numpy( - self.input_data["IN"] - ), - httpclient.InferInput("DELAY", [1], "UINT32").set_data_from_numpy( - self.input_data["DELAY"] - ), - httpclient.InferInput("WAIT", [1], "UINT32").set_data_from_numpy( - self.input_data["WAIT"] - ), - ] - - triton_client = httpclient.InferenceServerClient( - url="localhost:8000", verbose=True - ) - # Expect the inference is successful - res = triton_client.infer(model_name=self.model_name_, inputs=inputs) - self.assertEqual(1, res.as_numpy("OUT")[0]) - self.assertEqual(0, res.as_numpy("IDX")[0]) - - -if __name__ == "__main__": - unittest.main() diff --git a/qa/L0_decoupled_grpc_error/test.sh b/qa/L0_decoupled_grpc_error/test.sh deleted file mode 100755 index 4fba476b1d..0000000000 --- a/qa/L0_decoupled_grpc_error/test.sh +++ /dev/null @@ -1,179 +0,0 @@ -#!/bin/bash -# Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. 
-# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION} -if [ "$#" -ge 1 ]; then - REPO_VERSION=$1 -fi -if [ -z "$REPO_VERSION" ]; then - echo -e "Repository version must be specified" - echo -e "\n***\n*** Test Failed\n***" - exit 1 -fi -if [ ! -z "$TEST_REPO_ARCH" ]; then - REPO_VERSION=${REPO_VERSION}_${TEST_REPO_ARCH} -fi - -export CUDA_VISIBLE_DEVICES=0 - -RET=0 -TEST_RESULT_FILE='test_results.txt' -DECOUPLED_TEST=decoupled_test.py - -rm -f *.log - -CLIENT_LOG=`pwd`/client.log -DATADIR=/data/inferenceserver/${REPO_VERSION}/qa_model_repository -SERVER=/opt/tritonserver/bin/tritonserver -SERVER_ARGS="--model-repository=../L0_decoupled/models" -SERVER_LOG="./inference_server.log" -source ../common/util.sh - - -TRIALS="python custom" - -for trial in $TRIALS; do - if [ $trial == "python" ]; then - MODELDIR=../L0_decoupled/python_models - else - MODELDIR=../L0_decoupled/models - fi - - SERVER_ARGS="--model-repository=$MODELDIR" - cp -r $DATADIR/libtorch_nobatch_int32_int32_int32 $MODELDIR/. - (cd $MODELDIR/libtorch_nobatch_int32_int32_int32 && \ - sed -i "s/dims:.*\[.*\]/dims: \[ 1 \]/g" config.pbtxt) - - run_server - if [ "$SERVER_PID" == "0" ]; then - echo -e "\n***\n*** Failed to start $SERVER\n***" - cat $SERVER_LOG - exit 1 - fi - - for i in \ - test_one_to_none \ - test_one_to_one \ - test_one_to_many \ - test_no_streaming \ - test_response_order \ - test_wrong_shape; do - - echo "Test: $i" >>$CLIENT_LOG - set +e - python $DECOUPLED_TEST DecoupledTest.$i >>$CLIENT_LOG 2>&1 - if [ $? -ne 0 ]; then - echo -e "\n***\n*** Test $i Failed\n***" >>$CLIENT_LOG - echo -e "\n***\n*** Test $i Failed\n***" - RET=1 - else - check_test_results $TEST_RESULT_FILE 1 - if [ $? -ne 0 ]; then - cat $CLIENT_LOG - echo -e "\n***\n*** Test Result Verification Failed\n***" - RET=1 - fi - fi - set -e - done - - # Will delay the writing of each response by the specified many milliseconds. - # This will ensure that there are multiple responses available to be written. - export TRITONSERVER_DELAY_GRPC_RESPONSE=2000 - - echo "Test: test_one_to_multi_many" >>$CLIENT_LOG - set +e - python $DECOUPLED_TEST DecoupledTest.test_one_to_multi_many >>$CLIENT_LOG 2>&1 - if [ $? -ne 0 ]; then - echo -e "\n***\n*** Test test_one_to_multi_many Failed\n***" >>$CLIENT_LOG - echo -e "\n***\n*** Test test_one_to_multi_many Failed\n***" - RET=1 - else - check_test_results $TEST_RESULT_FILE 1 - if [ $? -ne 0 ]; then - cat $CLIENT_LOG - echo -e "\n***\n*** Test Result Verification Failed\n***" - RET=1 - fi - fi - - set -e - - unset TRITONSERVER_DELAY_GRPC_RESPONSE - - kill $SERVER_PID - wait $SERVER_PID -done - -# Test the server frontend can merge the responses of non-decoupled model that -# sends inference response and COMPLETE flag separately. 
In other words, from -# the client's perspective there will still be one response. -NON_DECOUPLED_DIR=`pwd`/non_decoupled_models -rm -rf ${NON_DECOUPLED_DIR} && mkdir -p ${NON_DECOUPLED_DIR} -cp -r ../L0_decoupled/models/repeat_int32 ${NON_DECOUPLED_DIR}/. && \ - (cd ${NON_DECOUPLED_DIR}/repeat_int32 && \ - sed -i "s/decoupled: True/decoupled: False/" config.pbtxt) - -SERVER_ARGS="--model-repository=${NON_DECOUPLED_DIR}" -SERVER_LOG="./non_decoupled_inference_server.log" - -run_server -if [ "$SERVER_PID" == "0" ]; then - echo -e "\n***\n*** Failed to start $SERVER\n***" - cat $SERVER_LOG - exit 1 -fi - -CLIENT_LOG=`pwd`/non_decoupled_client.log -echo "Test: NonDecoupledTest" >>$CLIENT_LOG -set +e -python $DECOUPLED_TEST NonDecoupledTest >>$CLIENT_LOG 2>&1 -if [ $? -ne 0 ]; then - echo -e "\n***\n*** Test NonDecoupledTest Failed\n***" >>$CLIENT_LOG - echo -e "\n***\n*** Test NonDecoupledTest Failed\n***" - RET=1 -else - check_test_results $TEST_RESULT_FILE 2 - if [ $? -ne 0 ]; then - cat $CLIENT_LOG - echo -e "\n***\n*** Test Result Verification Failed\n***" - RET=1 - fi -fi - -set -e - -kill $SERVER_PID -wait $SERVER_PID - -if [ $RET -eq 0 ]; then - echo -e "\n***\n*** Test Passed\n***" -else - echo -e "\n***\n*** Test Failed\n***" -fi - -exit $RET \ No newline at end of file diff --git a/qa/L0_grpc_error_state_cleanup/cleanup_test.py b/qa/L0_grpc_error_state_cleanup/cleanup_test.py deleted file mode 100755 index 4425c5c667..0000000000 --- a/qa/L0_grpc_error_state_cleanup/cleanup_test.py +++ /dev/null @@ -1,648 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -import sys - -sys.path.append("../common") - -import os -import queue -import signal -import time -import unittest -from functools import partial - -import numpy as np -import test_util as tu -import tritonclient.grpc as grpcclient -from tritonclient.utils import InferenceServerException - - -class UserData: - def __init__(self): - self._response_queue = queue.Queue() - - -def callback(user_data, result, error): - if error: - user_data._response_queue.put(error) - else: - user_data._response_queue.put(result) - - -# These state cleanup tests relies on the test.sh -# to check whether all the created request objects -# were properly deleted by the sever. -# The purpose on these unittest is to exercise -# different portions of the gRPC frontend and -# and track the state objects. -class CleanUpTest(tu.TestResultCollector): - SERVER_PID = None - - def setUp(self): - self.decoupled_model_name_ = "repeat_int32" - self.identity_model_name_ = "custom_zero_1_float32" - self.repeat_non_decoupled_model_name = "repeat_int32_non_decoupled" - - def _prepare_inputs_and_outputs(self, kind): - if kind in ("decoupled_streaming", "non_decoupled_streaming"): - self.inputs_ = [] - self.inputs_.append(grpcclient.InferInput("IN", [1], "INT32")) - self.inputs_.append(grpcclient.InferInput("DELAY", [1], "UINT32")) - self.inputs_.append(grpcclient.InferInput("WAIT", [1], "UINT32")) - - self.outputs_ = [] - self.outputs_.append(grpcclient.InferRequestedOutput("OUT")) - self.outputs_.append(grpcclient.InferRequestedOutput("IDX")) - self.requested_outputs_ = self.outputs_ - elif kind in ("simple", "streaming"): - self.inputs_ = [] - self.inputs_.append(grpcclient.InferInput("INPUT0", [1, 1], "FP32")) - - self.outputs_ = [] - self.outputs_.append(grpcclient.InferRequestedOutput("OUTPUT0")) - self.requested_outputs_ = self.outputs_ - else: - raise ValueError("Unsupported kind specified to prepare inputs/outputs") - - def _simple_infer( - self, - request_count, - cancel_response_idx=None, - client_timeout_pair=None, - kill_server=None, - ): - with grpcclient.InferenceServerClient( - url="localhost:8001", verbose=True - ) as triton_client: - self._prepare_inputs_and_outputs("simple") - - input_data = np.array([[1.0]], dtype=np.float32) - self.inputs_[0].set_data_from_numpy(input_data) - - user_data = UserData() - - futures = [] - timeout_idx = None - timeout_value = None - if client_timeout_pair: - timeout_idx, timeout_value = client_timeout_pair - for i in range(request_count): - if kill_server == i: - os.kill(int(self.SERVER_PID), signal.SIGINT) - this_timeout = None - if timeout_idx == i: - this_timeout = timeout_value - futures.append( - triton_client.async_infer( - model_name=self.identity_model_name_, - inputs=self.inputs_, - request_id=str(i), - callback=partial(callback, user_data), - outputs=self.requested_outputs_, - client_timeout=this_timeout, - ) - ) - - if cancel_response_idx is not None: - futures[cancel_response_idx].cancel() - - responses = [] - while len(responses) < len(futures): - data_item = user_data._response_queue.get() - if type(data_item) == InferenceServerException: - raise data_item - else: - responses.append(data_item) - - for response in responses: - output0_data = response.as_numpy("OUTPUT0") - self.assertTrue(np.array_equal(input_data, output0_data)) - - def _stream_infer_with_params( - self, - request_count, - request_delay, - _, - user_data, - result_dict, - delay_data=None, - delay_factor=None, - cancel_response_idx=None, - stream_timeout=None, - kill_server=None, - ): - with 
grpcclient.InferenceServerClient( - url="localhost:8001", verbose=True - ) as triton_client: - # Establish stream - metadata = {"triton_grpc_error": "true"} - triton_client.start_stream( - callback=partial(callback, user_data), - stream_timeout=stream_timeout, - headers=metadata, - ) - # Send specified many requests in parallel - for i in range(request_count): - time.sleep((request_delay / 1000)) - self.inputs_[1].set_data_from_numpy(delay_data) - if kill_server == i: - os.kill(int(self.SERVER_PID), signal.SIGINT) - triton_client.async_stream_infer( - model_name=self.decoupled_model_name_, - inputs=self.inputs_, - request_id=str(i), - outputs=self.requested_outputs_, - # Opt-in to receiving flags-only responses from model/backend - # to help detect final responses for decoupled models. - enable_empty_final_response=True, - ) - # Update delay input in accordance with the scaling factor - delay_data = delay_data * delay_factor - delay_data = delay_data.astype(np.uint32) - - # Retrieve results... - recv_count = 0 - completed_requests = 0 - while completed_requests < request_count: - if cancel_response_idx == recv_count: - triton_client.stop_stream(cancel_requests=True) - data_item = user_data._response_queue.get() - if type(data_item) == InferenceServerException: - raise data_item - else: - response = data_item.get_response() - # Request IDs should generally be provided with each request - # to associate decoupled responses with their requests. - if not response.id: - raise ValueError( - "No response id found. Was a request_id provided?" - ) - - # Detect final response. Parameters are oneof and we expect bool_param - if response.parameters.get("triton_final_response").bool_param: - completed_requests += 1 - - # Only process non-empty response, ignore if empty (no outputs) - if response.outputs: - if response.id not in result_dict: - result_dict[response.id] = [] - result_dict[response.id].append((recv_count, data_item)) - recv_count += 1 - - def _stream_infer( - self, - request_count, - request_delay, - expected_count, - user_data, - result_dict, - delay_data=None, - delay_factor=None, - cancel_response_idx=None, - stream_timeout=None, - kill_server=None, - ): - with grpcclient.InferenceServerClient( - url="localhost:8001", verbose=True - ) as triton_client: - # Establish stream - metadata = {"triton_grpc_error": "true"} - triton_client.start_stream( - callback=partial(callback, user_data), - stream_timeout=stream_timeout, - headers=metadata, - ) - # Send specified many requests in parallel - for i in range(request_count): - time.sleep((request_delay / 1000)) - model_name = self.identity_model_name_ - if delay_data is not None: - model_name = self.decoupled_model_name_ - self.inputs_[1].set_data_from_numpy(delay_data) - if kill_server == i: - os.kill(int(self.SERVER_PID), signal.SIGINT) - triton_client.async_stream_infer( - model_name=model_name, - inputs=self.inputs_, - request_id=str(i), - outputs=self.requested_outputs_, - ) - if (delay_data is not None) and (delay_factor is not None): - # Update delay input in accordance with the scaling factor - delay_data = delay_data * delay_factor - delay_data = delay_data.astype(np.uint32) - - # Retrieve results... 
- recv_count = 0 - while recv_count < expected_count: - if cancel_response_idx == recv_count: - triton_client.stop_stream(cancel_requests=True) - data_item = user_data._response_queue.get() - if type(data_item) == InferenceServerException: - raise data_item - else: - this_id = data_item.get_response().id - if this_id not in result_dict: - result_dict[this_id] = [] - result_dict[this_id].append((recv_count, data_item)) - - recv_count += 1 - - def _streaming_infer( - self, - request_count, - request_delay=0, - cancel_response_idx=None, - stream_timeout=None, - kill_server=None, - should_error=True, - ): - self._prepare_inputs_and_outputs("streaming") - - input_data = np.array([[1.0]], dtype=np.float32) - self.inputs_[0].set_data_from_numpy(input_data) - - user_data = UserData() - result_dict = {} - - try: - expected_count = request_count - self._stream_infer( - request_count, - request_delay, - expected_count, - user_data, - result_dict, - cancel_response_idx=cancel_response_idx, - stream_timeout=stream_timeout, - kill_server=kill_server, - ) - except Exception as ex: - if cancel_response_idx or stream_timeout or should_error: - raise ex - self.assertTrue(False, "unexpected error {}".format(ex)) - - # Validate the results.. - for i in range(request_count): - this_id = str(i) - if this_id not in result_dict.keys(): - self.assertTrue( - False, "response for request id {} not received".format(this_id) - ) - self.assertEqual(len(result_dict[this_id]), 1) - result = result_dict[this_id][0][1] - output0_data = result.as_numpy("OUTPUT0") - self.assertTrue(np.array_equal(input_data, output0_data)) - - def _decoupled_infer( - self, - request_count, - request_delay=0, - repeat_count=1, - data_offset=100, - delay_time=1000, - delay_factor=1, - wait_time=500, - cancel_response_idx=None, - stream_timeout=None, - kill_server=None, - should_error=True, - infer_helper_map=[True, True], - ): - self._prepare_inputs_and_outputs(kind="decoupled_streaming") - - # Initialize data for IN - input_data = np.arange( - start=data_offset, stop=data_offset + repeat_count, dtype=np.int32 - ) - self.inputs_[0].set_shape([repeat_count]) - self.inputs_[0].set_data_from_numpy(input_data) - - # Initialize data for DELAY - delay_data = (np.ones([repeat_count], dtype=np.uint32)) * delay_time - self.inputs_[1].set_shape([repeat_count]) - - # Initialize data for WAIT - wait_data = np.array([wait_time], dtype=np.uint32) - self.inputs_[2].set_data_from_numpy(wait_data) - - infer_helpers = [] - if infer_helper_map[0]: - infer_helpers.append(self._stream_infer) - if infer_helper_map[1]: - infer_helpers.append(self._stream_infer_with_params) - - for infer_helper in infer_helpers: - user_data = UserData() - result_dict = {} - - try: - expected_count = repeat_count * request_count - infer_helper( - request_count, - request_delay, - expected_count, - user_data, - result_dict, - delay_data, - delay_factor, - cancel_response_idx, - stream_timeout, - kill_server, - ) - except Exception as ex: - if cancel_response_idx or stream_timeout or should_error: - raise ex - self.assertTrue(False, "unexpected error {}".format(ex)) - - # Validate the results.. 
- for i in range(request_count): - this_id = str(i) - if repeat_count != 0 and this_id not in result_dict.keys(): - self.assertTrue( - False, "response for request id {} not received".format(this_id) - ) - elif repeat_count == 0 and this_id in result_dict.keys(): - self.assertTrue( - False, - "received unexpected response for request id {}".format( - this_id - ), - ) - if repeat_count != 0: - self.assertEqual(len(result_dict[this_id]), repeat_count) - expected_data = data_offset - result_list = result_dict[this_id] - for j in range(len(result_list)): - this_data = result_list[j][1].as_numpy("OUT") - self.assertEqual(len(this_data), 1) - self.assertEqual(this_data[0], expected_data) - this_idx = result_list[j][1].as_numpy("IDX") - self.assertEqual(len(this_idx), 1) - self.assertEqual(this_idx[0], j) - expected_data += 1 - - ### - ### Non-Streaming Tests - ### - def test_simple_infer(self): - # This test case sends 10 asynchronous requests and validates - # the response. - self._simple_infer(request_count=10) - - def test_simple_infer_cancellation(self): - # This test case is used to check whether all the states are - # correctly released when one of the request is cancelled from - # the client side. - with self.assertRaises(InferenceServerException) as cm: - self._simple_infer(request_count=10, cancel_response_idx=5) - self.assertIn("Locally cancelled by application!", str(cm.exception)) - - def test_simple_infer_timeout(self): - # This test case is used to check whether all the states are - # correctly released when the request gets timed-out on the client. - with self.assertRaises(InferenceServerException) as cm: - self._simple_infer(request_count=10, client_timeout_pair=[5, 0.1]) - self.assertIn("Deadline Exceeded", str(cm.exception)) - - def test_simple_infer_error_status(self): - # This test case is used to check whether all the state objects are - # released when RPC runs into error. - with self.assertRaises(InferenceServerException) as cm: - self._simple_infer(request_count=10) - self.assertIn( - "This protocol is restricted, expecting header 'triton-grpc-protocol-infer-key'", - str(cm.exception), - ) - - def test_simple_infer_shutdownserver(self): - # This test case is used to check whether all the state objects are - # released when the server is interrupted to shutdown in the beginning - # of inference run with final parameters being returned. - with self.assertRaises(InferenceServerException) as cm: - self._simple_infer(request_count=20, kill_server=5) - - ### - ### Streaming Tests - ### - def test_streaming_infer(self): - # Sanity test to check whether all the state objects - # are correctly released. Sends 10 requests in a single - # gRPC bidirectional stream. - self._streaming_infer(request_count=10) - - def test_streaming_cancellation(self): - # This test case is used to check whether all the states are - # correctly released when the stream is closed when fifth - # response is received. - with self.assertRaises(InferenceServerException) as cm: - self._streaming_infer(request_count=10, cancel_response_idx=5) - self.assertIn("Locally cancelled by application!", str(cm.exception)) - - def test_streaming_timeout(self): - # This test case is used to check whether all the states are - # released when some of the requests timeouts. 
- with self.assertRaises(InferenceServerException) as cm: - self._streaming_infer(request_count=10, request_delay=1, stream_timeout=2) - self.assertIn("Deadline Exceeded", str(cm.exception)) - - def test_streaming_error_status(self): - # This test case is used to check whether all the state objects are - # released when RPC runs into error. - expected_exceptions = [ - "This protocol is restricted, expecting header 'triton-grpc-protocol-infer-key'", - "The stream is no longer in valid state, the error detail is reported through provided callback. A new stream should be started after stopping the current stream.", - ] - with self.assertRaises(InferenceServerException) as cm: - self._streaming_infer(request_count=10, should_error=True) - - exception_match = False - for expected_exception in expected_exceptions: - exception_match |= expected_exception in str(cm.exception) - self.assertTrue( - exception_match, "Raised unexpected exception {}".format(str(cm.exception)) - ) - - def test_streaming_infer_shutdownserver(self): - # This test case is used to check whether all the state objects are - # released when the server is interrupted to shutdown in middle of - # inference run. - with self.assertRaises(InferenceServerException) as cm: - self._streaming_infer( - request_count=10, - request_delay=1, - kill_server=5, - should_error=True, - ) - - ### - ### Decoupled Streaming Tests - ### - def test_decoupled_infer(self): - # Sanity test to check whether all the state objects - # are correctly released. Sends 10 requests in a single - # gRPC bidirectional stream and expects each of these - # requests to generate 10 responses. - self._decoupled_infer(request_count=10, repeat_count=10) - - def test_decoupled_cancellation(self): - # This test case is used to check whether all the states are - # correctly released when the stream is closed when fifth - # response is received. - with self.assertRaises(InferenceServerException) as cm: - self._decoupled_infer( - request_count=10, repeat_count=10, cancel_response_idx=5 - ) - self.assertIn("Locally cancelled by application!", str(cm.exception)) - - def test_decoupled_timeout(self): - # This test case is used to check whether all the states are - # released when some of the requests timeouts. - with self.assertRaises(InferenceServerException) as cm: - self._decoupled_infer( - request_count=10, repeat_count=10, request_delay=1, stream_timeout=2 - ) - self.assertIn("Deadline Exceeded", str(cm.exception)) - - def test_decoupled_error_status(self): - # This test case is used to check whether all the state objects are - # released when RPC runs into error. - expected_exceptions = [ - "This protocol is restricted, expecting header 'triton-grpc-protocol-infer-key'", - "The stream is no longer in valid state, the error detail is reported through provided callback. A new stream should be started after stopping the current stream.", - ] - with self.assertRaises(InferenceServerException) as cm: - self._decoupled_infer(request_count=10, repeat_count=10, should_error=True) - - exception_match = False - for expected_exception in expected_exceptions: - exception_match |= expected_exception in str(cm.exception) - self.assertTrue( - exception_match, "Raised unexpected exception {}".format(str(cm.exception)) - ) - - def test_decoupled_infer_shutdownserver(self): - # This test case is used to check whether all the state objects are - # released when the server is interrupted to shutdown in middle of - # inference run. 
- with self.assertRaises(InferenceServerException) as cm: - self._decoupled_infer( - request_count=10, - repeat_count=10, - request_delay=1, - kill_server=5, - should_error=True, - infer_helper_map=[True, False], - ) - - def test_decoupled_infer_with_params_shutdownserver(self): - # This test case is used to check whether all the state objects are - # released when the server is interrupted to shutdown in middle of - # inference run with final parameters being returned. - with self.assertRaises(InferenceServerException) as cm: - self._decoupled_infer( - request_count=10, - repeat_count=10, - request_delay=1, - kill_server=5, - should_error=True, - infer_helper_map=[False, True], - ) - - def test_decoupled_infer_complete(self): - # Test if the Process() thread could release the state object before - # the StreamInferResponseComplete() thread is done accessing it. - self._decoupled_infer(request_count=1, repeat_count=1, stream_timeout=16) - # Check no error is printed to the log. - with open(os.environ["SERVER_LOG"]) as f: - server_log = f.read() - self.assertNotIn("Should not print this", server_log) - - def test_non_decoupled_streaming_multi_response(self): - # Test non-decoupled streaming infer with more than one response should return - # the first response. - response_count = 4 - expected_response_count = 1 - expected_response_index = 0 - - # Prepare input data - self._prepare_inputs_and_outputs("non_decoupled_streaming") - # Initialize data for IN - data_offset = 100 - input_data = np.arange( - start=data_offset, stop=data_offset + response_count, dtype=np.int32 - ) - self.inputs_[0].set_shape([response_count]) - self.inputs_[0].set_data_from_numpy(input_data) - # Initialize data for DELAY - delay_data = np.zeros([response_count], dtype=np.uint32) - self.inputs_[1].set_shape([response_count]) - self.inputs_[1].set_data_from_numpy(delay_data) - # Initialize data for WAIT - wait_data = np.array([0], dtype=np.uint32) - self.inputs_[2].set_data_from_numpy(wait_data) - - # Infer - user_data = UserData() - with grpcclient.InferenceServerClient( - url="localhost:8001", verbose=True - ) as client: - # Establish stream - metadata = {"triton_grpc_error": "true"} - client.start_stream( - callback=partial(callback, user_data), - stream_timeout=16, - headers=metadata, - ) - # Send a request - client.async_stream_infer( - model_name=self.repeat_non_decoupled_model_name, - inputs=self.inputs_, - request_id="0", - outputs=self.requested_outputs_, - ) - # Wait for all results and stop stream - client.stop_stream() - - # Check infer output - actual_response_count = 0 - while not user_data._response_queue.empty(): - actual_response_count += 1 - data_item = user_data._response_queue.get() - if type(data_item) == InferenceServerException: - raise data_item - else: - response_idx = data_item.as_numpy("IDX")[0] - self.assertEqual(response_idx, expected_response_index) - self.assertEqual(actual_response_count, expected_response_count) - - -if __name__ == "__main__": - CleanUpTest.SERVER_PID = os.environ.get("SERVER_PID", CleanUpTest.SERVER_PID) - unittest.main() diff --git a/qa/L0_grpc_error_state_cleanup/test.sh b/qa/L0_grpc_error_state_cleanup/test.sh deleted file mode 100755 index df302d5ed1..0000000000 --- a/qa/L0_grpc_error_state_cleanup/test.sh +++ /dev/null @@ -1,235 +0,0 @@ -#!/bin/bash -# Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
-# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION} -if [ "$#" -ge 1 ]; then - REPO_VERSION=$1 -fi -if [ -z "$REPO_VERSION" ]; then - echo -e "Repository version must be specified" - echo -e "\n***\n*** Test Failed\n***" - exit 1 -fi -if [ ! -z "$TEST_REPO_ARCH" ]; then - REPO_VERSION=${REPO_VERSION}_${TEST_REPO_ARCH} -fi - -export CUDA_VISIBLE_DEVICES=0 - -RET=0 -CLEANUP_TEST=cleanup_test.py - -rm -f *.log - -CLIENT_LOG=`pwd`/client.log -SERVER=/opt/tritonserver/bin/tritonserver -source ../common/util.sh - -function check_state_release() { - local log_file=$1 - - num_state_release=`cat $log_file | grep "StateRelease" | wc -l` - num_state_new=`cat $log_file | grep "StateNew" | wc -l` - - if [ $num_state_release -ne $num_state_new ]; then - cat $log_file - echo -e "\n***\n*** Test Failed: Mismatch detected, $num_state_new state(s) created, $num_state_release state(s) released. \n***" >> $log_file - return 1 - fi - - return 0 -} - -rm -fr ./models/custom_zero_1_float32 && \ - cp -r ../custom_models/custom_zero_1_float32 ./models/. 
&& \ - mkdir -p ./models/custom_zero_1_float32/1 - -(cd models/custom_zero_1_float32 && \ - echo "parameters [" >> config.pbtxt && \ - echo "{ key: \"execute_delay_ms\"; value: { string_value: \"1000\" }}" >> config.pbtxt && \ - echo "]" >> config.pbtxt) - -rm -rf models/repeat_int32_non_decoupled && \ - cp -r models/repeat_int32 models/repeat_int32_non_decoupled && \ - (cd models/repeat_int32_non_decoupled && \ - sed -i "/model_transaction_policy/,+2d" config.pbtxt && \ - sed -i "s/repeat_int32/repeat_int32_non_decoupled/" config.pbtxt) - -for i in test_simple_infer \ - test_simple_infer_cancellation \ - test_simple_infer_timeout \ - test_streaming_infer \ - test_streaming_timeout \ - test_streaming_cancellation \ - test_decoupled_infer \ - test_decoupled_cancellation \ - test_decoupled_timeout \ - test_non_decoupled_streaming_multi_response; do - SERVER_LOG="./inference_server.$i.log" - SERVER_ARGS="--model-repository=`pwd`/models --log-verbose=2" - run_server - if [ "$SERVER_PID" == "0" ]; then - echo -e "\n***\n*** Failed to start $SERVER\n***" - cat $SERVER_LOG - exit 1 - fi - - echo "Test: $i" >>$CLIENT_LOG - - set +e - python $CLEANUP_TEST CleanUpTest.$i >>$CLIENT_LOG 2>&1 - if [ $? -ne 0 ]; then - echo -e "\n***\n*** Test $i Failed\n***" >>$CLIENT_LOG - echo -e "\n***\n*** Test $i Failed\n***" - RET=1 - fi - - kill $SERVER_PID - wait $SERVER_PID - - check_state_release $SERVER_LOG - if [ $? -ne 0 ]; then - cat $SERVER_LOG - echo -e "\n***\n*** State Verification Failed for $i\n***" - RET=1 - fi - set -e -done - - -for i in test_simple_infer_error_status \ - test_streaming_error_status \ - test_decoupled_error_status; do - SERVER_LOG="./inference_server.$i.log" - SERVER_ARGS="--model-repository=`pwd`/models --log-verbose=2 --grpc-restricted-protocol=inference:infer-key=infer-value" - run_server - if [ "$SERVER_PID" == "0" ]; then - echo -e "\n***\n*** Failed to start $SERVER\n***" - cat $SERVER_LOG - exit 1 - fi - - echo "Test: $i" >>$CLIENT_LOG - - set +e - python $CLEANUP_TEST CleanUpTest.$i >>$CLIENT_LOG 2>&1 - if [ $? -ne 0 ]; then - echo -e "\n***\n*** Test $i Failed\n***" >>$CLIENT_LOG - echo -e "\n***\n*** Test $i Failed\n***" - RET=1 - fi - - kill $SERVER_PID - wait $SERVER_PID - - check_state_release $SERVER_LOG - if [ $? -ne 0 ]; then - cat $SERVER_LOG - echo -e "\n***\n*** State Verification Failed for $i\n***" - RET=1 - fi - - set -e -done - -for i in test_simple_infer_shutdownserver \ - test_streaming_infer_shutdownserver \ - test_decoupled_infer_shutdownserver \ - test_decoupled_infer_with_params_shutdownserver; do - SERVER_ARGS="--model-repository=`pwd`/models --log-verbose=2" - SERVER_LOG="./inference_server.$i.log" - run_server - if [ "$SERVER_PID" == "0" ]; then - echo -e "\n***\n*** Failed to start $SERVER\n***" - cat $SERVER_LOG - exit 1 - fi - - echo "Test: $i" >>$CLIENT_LOG - - set +e - SERVER_PID=$SERVER_PID python $CLEANUP_TEST CleanUpTest.$i >>$CLIENT_LOG 2>&1 - if [ $? -ne 0 ]; then - echo -e "\n***\n*** Test $i Failed\n***" >>$CLIENT_LOG - echo -e "\n***\n*** Test $i Failed\n***" - RET=1 - fi - - wait $SERVER_PID - - check_state_release $SERVER_LOG - if [ $? 
-ne 0 ]; then - cat $SERVER_LOG - echo -e "\n***\n*** State Verification Failed for $i\n***" - RET=1 - fi - - set -e -done - -TEST_NAME=test_decoupled_infer_complete -export TRITONSERVER_DELAY_GRPC_COMPLETE=2000 - -SERVER_LOG="./inference_server.$TEST_NAME.log" -SERVER_ARGS="--model-repository=`pwd`/models --log-verbose=2" -run_server -if [ "$SERVER_PID" == "0" ]; then - echo -e "\n***\n*** Failed to start $SERVER\n***" - cat $SERVER_LOG - exit 1 -fi - -echo "Test: $TEST_NAME" >>$CLIENT_LOG - -set +e - -SERVER_LOG=$SERVER_LOG python $CLEANUP_TEST CleanUpTest.$TEST_NAME >>$CLIENT_LOG 2>&1 -if [ $? -ne 0 ]; then - cat $CLIENT_LOG - echo -e "\n***\n*** Test $TEST_NAME Failed\n***" - RET=1 -fi - -kill $SERVER_PID -wait $SERVER_PID - -check_state_release $SERVER_LOG -if [ $? -ne 0 ]; then - cat $SERVER_LOG - echo -e "\n***\n*** State Verification Failed for $TEST_NAME\n***" - RET=1 -fi - -set -e - -if [ $RET -eq 0 ]; then - echo -e "\n***\n*** Test Passed\n***" -else - echo -e "\n***\n*** Test Failed\n***" -fi - -exit $RET diff --git a/qa/L0_grpc_state_cleanup/cleanup_test.py b/qa/L0_grpc_state_cleanup/cleanup_test.py index 431eeb1720..f7507747e9 100755 --- a/qa/L0_grpc_state_cleanup/cleanup_test.py +++ b/qa/L0_grpc_state_cleanup/cleanup_test.py @@ -161,9 +161,17 @@ def _stream_infer_with_params( url="localhost:8001", verbose=True ) as triton_client: # Establish stream - triton_client.start_stream( - callback=partial(callback, user_data), stream_timeout=stream_timeout - ) + if "TRITONSERVER_GRPC_STATUS_FLAG" in os.environ: + metadata = {"triton_grpc_error": "true"} + triton_client.start_stream( + callback=partial(callback, user_data), + stream_timeout=stream_timeout, + headers=metadata, + ) + else: + triton_client.start_stream( + callback=partial(callback, user_data), stream_timeout=stream_timeout + ) # Send specified many requests in parallel for i in range(request_count): time.sleep((request_delay / 1000)) @@ -229,9 +237,17 @@ def _stream_infer( url="localhost:8001", verbose=True ) as triton_client: # Establish stream - triton_client.start_stream( - callback=partial(callback, user_data), stream_timeout=stream_timeout - ) + if "TRITONSERVER_GRPC_STATUS_FLAG" in os.environ: + metadata = {"triton_grpc_error": "true"} + triton_client.start_stream( + callback=partial(callback, user_data), + stream_timeout=stream_timeout, + headers=metadata, + ) + else: + triton_client.start_stream( + callback=partial(callback, user_data), stream_timeout=stream_timeout + ) # Send specified many requests in parallel for i in range(request_count): time.sleep((request_delay / 1000)) @@ -608,9 +624,17 @@ def test_non_decoupled_streaming_multi_response(self): url="localhost:8001", verbose=True ) as client: # Establish stream - client.start_stream( - callback=partial(callback, user_data), stream_timeout=16 - ) + if "TRITONSERVER_GRPC_STATUS_FLAG" in os.environ: + metadata = {"triton_grpc_error": "true"} + client.start_stream( + callback=partial(callback, user_data), + stream_timeout=16, + headers=metadata, + ) + else: + client.start_stream( + callback=partial(callback, user_data), stream_timeout=16 + ) # Send a request client.async_stream_infer( model_name=self.repeat_non_decoupled_model_name, From 887aaa237c63dc4954c747209f5958969e3132cc Mon Sep 17 00:00:00 2001 From: Indrajit Bhosale Date: Fri, 16 Aug 2024 12:06:35 -0700 Subject: [PATCH 31/32] PR comments fixed and main merged --- qa/L0_backend_python/lifecycle/lifecycle_test.py | 6 ------ 1 file changed, 6 deletions(-) diff --git 
a/qa/L0_backend_python/lifecycle/lifecycle_test.py b/qa/L0_backend_python/lifecycle/lifecycle_test.py index 607726b961..d6eb2a8f53 100755 --- a/qa/L0_backend_python/lifecycle/lifecycle_test.py +++ b/qa/L0_backend_python/lifecycle/lifecycle_test.py @@ -255,10 +255,8 @@ def test_triton_grpc_error_error_on(self): callback=partial(callback, user_data), headers=metadata ) stream_end = False - input_datas = [] for i in range(number_of_requests): input_data = np.random.randn(*shape).astype(np.float32) - input_datas.append(input_data) inputs = [ grpcclient.InferInput( "IN", input_data.shape, np_to_triton_dtype(input_data.dtype) @@ -317,10 +315,8 @@ def test_triton_grpc_error_cancel(self): callback=partial(callback, user_data), headers=metadata ) - input_datas = [] for i in range(number_of_requests): input_data = np.random.randn(*shape).astype(np.float32) - input_datas.append(input_data) inputs = [ grpcclient.InferInput( "IN", input_data.shape, np_to_triton_dtype(input_data.dtype) @@ -360,10 +356,8 @@ def test_triton_grpc_error_error_off(self): user_data = UserData() triton_client = grpcclient.InferenceServerClient(f"{_tritonserver_ipaddr}:8001") triton_client.start_stream(callback=partial(callback, user_data)) - input_datas = [] for i in range(number_of_requests): input_data = np.random.randn(*shape).astype(np.float32) - input_datas.append(input_data) inputs = [ grpcclient.InferInput( "IN", input_data.shape, np_to_triton_dtype(input_data.dtype) From 0e7670ca7716083b10882622e235ec432419086b Mon Sep 17 00:00:00 2001 From: Indrajit Bhosale Date: Fri, 16 Aug 2024 13:58:21 -0700 Subject: [PATCH 32/32] DockerFile fixed --- Dockerfile.QA | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile.QA b/Dockerfile.QA index 22f312b930..b381abfaaf 100644 --- a/Dockerfile.QA +++ b/Dockerfile.QA @@ -268,7 +268,7 @@ RUN cp -r qa/L0_decoupled/models qa/L0_decoupled/python_models/ && \ qa/L0_decoupled/python_models/square_int32/. RUN mkdir -p qa/L0_decoupled_grpc_error && \ - cp -r qa/L0_decoupled/. qa/L0_decoupled_grpc_error && \ + cp -r qa/L0_decoupled/. qa/L0_decoupled_grpc_error RUN mkdir -p qa/L0_grpc_error_state_cleanup && \ cp -r qa/L0_grpc_state_cleanup/. qa/L0_grpc_error_state_cleanup
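Note (illustrative, not part of the patch series): the streaming tests touched above opt into gRPC error reporting by passing the "triton_grpc_error" header when the stream is established, so that stream-level failures are delivered through the registered callback (the expected client message in the error-status tests reads "the error detail is reported through provided callback"). The sketch below is a minimal, self-contained client showing that pattern. It assumes a running Triton server on localhost:8001 serving the decoupled repeat_int32 model, mirroring the fixtures used in these tests; it is a usage sketch under those assumptions, not a definitive reference implementation.

from functools import partial
import queue

import numpy as np
import tritonclient.grpc as grpcclient
from tritonclient.utils import InferenceServerException


def callback(response_queue, result, error):
    # With the triton_grpc_error header set on the stream, errors arrive here
    # as InferenceServerException objects rather than only closing the stream.
    response_queue.put(error if error is not None else result)


responses = queue.Queue()

# Assumed environment: Triton listening on localhost:8001 with the decoupled
# repeat_int32 model loaded, as in the test fixtures above.
with grpcclient.InferenceServerClient(url="localhost:8001") as client:
    inputs = [
        grpcclient.InferInput("IN", [1], "INT32"),
        grpcclient.InferInput("DELAY", [1], "UINT32"),
        grpcclient.InferInput("WAIT", [1], "UINT32"),
    ]
    inputs[0].set_data_from_numpy(np.array([100], dtype=np.int32))
    inputs[1].set_data_from_numpy(np.array([0], dtype=np.uint32))
    inputs[2].set_data_from_numpy(np.array([0], dtype=np.uint32))

    # Opt in to stream error reporting via the request header.
    client.start_stream(
        callback=partial(callback, responses),
        headers={"triton_grpc_error": "true"},
    )
    client.async_stream_infer(
        model_name="repeat_int32", inputs=inputs, request_id="0"
    )
    client.stop_stream()

item = responses.get(timeout=10)
if isinstance(item, InferenceServerException):
    print("stream error:", item)
else:
    print("OUT:", item.as_numpy("OUT"))

The TRITONSERVER_GRPC_STATUS_FLAG environment variable referenced in the updated cleanup tests only controls whether the tests themselves send this header; the sketch above sends it unconditionally.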