From 33c1e935dc7260239abcec1bc35a007363be736c Mon Sep 17 00:00:00 2001 From: Indrajit Bhosale Date: Wed, 24 Jul 2024 22:59:37 -0700 Subject: [PATCH 01/32] Park --- src/grpc/infer_handler.h | 56 ++++++++++++++++++++++++++++++-- src/grpc/stream_infer_handler.cc | 50 ++++++++++++++++++++++++++-- 2 files changed, 101 insertions(+), 5 deletions(-) diff --git a/src/grpc/infer_handler.h b/src/grpc/infer_handler.h index 6ef03807a2..84ec9c0963 100644 --- a/src/grpc/infer_handler.h +++ b/src/grpc/infer_handler.h @@ -642,7 +642,7 @@ class InferHandlerState { ::grpc::ServerCompletionQueue* cq, const uint64_t unique_id = 0) : cq_(cq), unique_id_(unique_id), ongoing_requests_(0), step_(Steps::START), finish_ok_(true), ongoing_write_(false), - received_notification_(false) + received_notification_(false), grpc_strict_(false) { ctx_.reset(new ::grpc::ServerContext()); responder_.reset(new ServerResponderType(ctx_.get())); @@ -669,6 +669,44 @@ class InferHandlerState { return received_notification_ ? ctx_->IsCancelled() : false; } + // Extracts headers from GRPC request and updates state + void ExtractStateFromHeaders(InferHandlerStateType* state) + { + // Probably need to lock + LOG_VERBOSE(1) << "GRPC ExtractStateFromHeaders called" << std::endl; + const auto& metadata = state->context_->ctx_->client_metadata(); + for (const auto& pair : metadata) { + auto& key = pair.first; + std::string param_key = std::string(key.begin(), key.end()); + std::string grpc_strict_key = "grpc_strict"; + if (param_key.compare(grpc_strict_key) == 0) { + // They are equal + state->context_->grpc_strict_ = true; + LOG_VERBOSE(1) << "GRPC streaming strict flag detected" << std::endl; + } + } + } + + void sendGRPCStrictResponse(InferHandlerStateType* state) + { + // Check if streaming error detected AND grpc_mode is strict + if (state->context_->grpc_strict_) { + if (state->IsStreamError()) { + ::grpc::Status dummy_status = ::grpc::Status( + ::grpc::StatusCode::UNAVAILABLE, std::string("Dummy Status")); + // state->context_->responder_->Finish(state->status_, state); + state->context_->step_ = Steps::COMPLETE; + state->step_ = Steps::PARTIAL_COMPLETION; + + state->context_->responder_->Finish(dummy_status, state); + LOG_VERBOSE(1) << "GRPC streaming error detected inside finish: " + << state->status_.error_code() << std::endl; + IssueRequestCancellation(); + } + } else { + LOG_VERBOSE(1) << "GRPC mode NOT strict in Finish" << std::endl; + } + } // Increments the ongoing request counter void IncrementRequestCounter() { ongoing_requests_++; } @@ -996,6 +1034,9 @@ class InferHandlerState { // Tracks whether the async notification has been delivered by // completion queue. bool received_notification_; + + // True if there is an ongoing write to the grpc stream + std::atomic grpc_strict_; }; // This constructor is used to build a wrapper state object @@ -1003,8 +1044,11 @@ class InferHandlerState { // object is used to distinguish a tag from AsyncNotifyWhenDone() // signal. 
explicit InferHandlerState(Steps start_step, InferHandlerState* state) - : step_(start_step), state_ptr_(state), async_notify_state_(false) + : step_(start_step), state_ptr_(state), async_notify_state_(false), + grpc_stream_error_state_(false) { + LOG_VERBOSE(1) + << "grpc_stream_error_state_ init called in InferHandlerState \n"; state->MarkAsAsyncNotifyState(); } @@ -1013,6 +1057,8 @@ class InferHandlerState { const std::shared_ptr& context, Steps start_step = Steps::START) : tritonserver_(tritonserver), async_notify_state_(false) { + LOG_VERBOSE(1) + << "grpc_stream_error_state_ init called in InferHandlerState 2 \n"; // For debugging and testing const char* dstr = getenv("TRITONSERVER_DELAY_GRPC_RESPONSE"); delay_response_ms_ = 0; @@ -1058,6 +1104,7 @@ class InferHandlerState { // wrapper state object in WAITING_NOTIFICATION step. state_ptr_ = nullptr; async_notify_state_ = false; + grpc_stream_error_state_ = false; } void Release() @@ -1087,7 +1134,8 @@ class InferHandlerState { void MarkAsAsyncNotifyState() { async_notify_state_ = true; } bool IsAsyncNotifyState() { return async_notify_state_; } - + void MarkIsStreamError() { grpc_stream_error_state_ = true; } + bool IsStreamError() { return grpc_stream_error_state_; } // Needed in the response handle for classification outputs. TRITONSERVER_Server* tritonserver_; @@ -1098,6 +1146,7 @@ class InferHandlerState { std::shared_ptr context_; Steps step_; std::recursive_mutex step_mtx_; + std::recursive_mutex grpc_strict_mtx_; // Shared pointer to the inference request object. The lifetime of // inference request object is extended till all the responses from @@ -1139,6 +1188,7 @@ class InferHandlerState { // Tracks whether this state object has been wrapped and send to // AsyncNotifyWhenDone() function as a tag. bool async_notify_state_; + bool grpc_stream_error_state_; }; diff --git a/src/grpc/stream_infer_handler.cc b/src/grpc/stream_infer_handler.cc index 269808c78a..7dcac8ea6d 100644 --- a/src/grpc/stream_infer_handler.cc +++ b/src/grpc/stream_infer_handler.cc @@ -140,6 +140,13 @@ ModelStreamInferHandler::Process(InferHandler::State* state, bool rpc_ok) // This means that we only need to take care of // synchronizing this thread and the ResponseComplete // threads. + { + std::lock_guard lock(state->grpc_strict_mtx_); + if (state->IsStreamError() && state->context_->grpc_strict_) { + LOG_VERBOSE(1) << "Ignoring new client request in strict mode \n"; + return false; + } + } if (state->context_->ReceivedNotification()) { std::lock_guard lock(state->step_mtx_); if (state->IsGrpcContextCancelled()) { @@ -189,7 +196,7 @@ ModelStreamInferHandler::Process(InferHandler::State* state, bool rpc_ok) state->context_->responder_->Finish(status, state); return !finished; } - + state->context_->ExtractStateFromHeaders(state); } else if (state->step_ == Steps::READ) { TRITONSERVER_Error* err = nullptr; const inference::ModelInferRequest& request = state->request_; @@ -328,7 +335,13 @@ ModelStreamInferHandler::Process(InferHandler::State* state, bool rpc_ok) // initiated... the completion callback will transition to // WRITEREADY or WRITTEN or CANCELLED. Recording the state and the // irequest to handle gRPC stream cancellation. + // std::time_t currentTime = std::time(nullptr); + + // // Divide the current time by 2 + // std::time_t modTime = currentTime % 2; + if (err == nullptr) { + // if (modTime) { state->context_->InsertInflightState(state); // The payload will be cleaned in request release callback. 
request_release_payload.release(); @@ -355,6 +368,17 @@ ModelStreamInferHandler::Process(InferHandler::State* state, bool rpc_ok) GrpcStatusUtil::Create(&status, err); TRITONSERVER_ErrorDelete(err); response->set_error_message(status.error_message()); + // if(state->context_->grpc_strict_) { + // // Set to finish + // state->status_ = status; + // state->MarkIsStreamError(); + // LOG_VERBOSE(1) << "GRPC streaming error detected : " << + // status.error_code() << std::endl; + // } else { + // LOG_VERBOSE(1) << "GRPC streaming error detected BUT mode NOT strict: + // " << status.error_code() << std::endl; + // } + response->mutable_infer_response()->Clear(); // repopulate the id so that client knows which request failed. @@ -633,12 +657,34 @@ ModelStreamInferHandler::StreamInferResponseComplete( LOG_ERROR << "expected the response allocator to have added the response"; } - if (err != nullptr) { + std::time_t currentTime = std::time(nullptr); + + // Divide the current time by 2 + std::time_t modTime = currentTime % 2; + + if (modTime) { + LOG_VERBOSE(1) << "Generating fake error" << std::endl; failed = true; ::grpc::Status status; GrpcStatusUtil::Create(&status, err); response->mutable_infer_response()->Clear(); response->set_error_message(status.error_message()); + if (state->context_->grpc_strict_) { + // Set to finish + std::lock_guard lock(state->grpc_strict_mtx_); + state->status_ = status; + if (!state->IsStreamError()) { + // Finish only once, if backend ignores cancellation + state->MarkIsStreamError(); + LOG_VERBOSE(1) << "GRPC streaming error detected (Only once)" + << status.error_code() << std::endl; + state->context_->sendGRPCStrictResponse(state); + } + return; + } else { + LOG_VERBOSE(1) << "GRPC streaming error detected BUT mode NOT strict: " + << status.error_code() << std::endl; + } LOG_VERBOSE(1) << "Failed for ID: " << log_request_id << std::endl; } From 3affd3a676705fcff09db95d4e0f72966cdb32b3 Mon Sep 17 00:00:00 2001 From: Indrajit Bhosale Date: Tue, 30 Jul 2024 12:31:10 -0700 Subject: [PATCH 02/32] Park --- src/grpc/infer_handler.h | 53 ++++++++++++++++++-------------- src/grpc/stream_infer_handler.cc | 26 ++++++---------- 2 files changed, 39 insertions(+), 40 deletions(-) diff --git a/src/grpc/infer_handler.h b/src/grpc/infer_handler.h index 84ec9c0963..5b519d0b80 100644 --- a/src/grpc/infer_handler.h +++ b/src/grpc/infer_handler.h @@ -691,17 +691,21 @@ class InferHandlerState { { // Check if streaming error detected AND grpc_mode is strict if (state->context_->grpc_strict_) { - if (state->IsStreamError()) { - ::grpc::Status dummy_status = ::grpc::Status( - ::grpc::StatusCode::UNAVAILABLE, std::string("Dummy Status")); - // state->context_->responder_->Finish(state->status_, state); - state->context_->step_ = Steps::COMPLETE; - state->step_ = Steps::PARTIAL_COMPLETION; - - state->context_->responder_->Finish(dummy_status, state); - LOG_VERBOSE(1) << "GRPC streaming error detected inside finish: " - << state->status_.error_code() << std::endl; - IssueRequestCancellation(); + { + std::lock_guard lock(grpc_strict_mu_); + if (!state->context_->IsStreamError()) { + ::grpc::Status dummy_status = ::grpc::Status( + ::grpc::StatusCode::UNAVAILABLE, std::string("Dummy Status")); + // state->context_->responder_->Finish(state->status_, state); + state->context_->step_ = Steps::COMPLETE; + state->step_ = Steps::PARTIAL_COMPLETION; + + state->context_->responder_->Finish(dummy_status, state); + LOG_VERBOSE(1) << "GRPC streaming error detected inside finish: " + << 
state->status_.error_code() << std::endl; + state->context_->MarkIsStreamError(); + IssueRequestCancellation(false); + } } } else { LOG_VERBOSE(1) << "GRPC mode NOT strict in Finish" << std::endl; @@ -784,7 +788,7 @@ class InferHandlerState { // Issues the cancellation for all inflight requests // being tracked by this context. - void IssueRequestCancellation() + void IssueRequestCancellation(bool grpc_strict) { { std::lock_guard lock(mu_); @@ -817,7 +821,9 @@ class InferHandlerState { // The RPC is complete and no callback will be invoked to retrieve // the object. Hence, need to explicitly place the state on the // completion queue. - PutTaskBackToQueue(state); + if(!grpc_strict) { + PutTaskBackToQueue(state); + } } } } @@ -851,7 +857,7 @@ class InferHandlerState { // issue cancellation request to all the inflight // states belonging to the context. if (state->context_->step_ != Steps::CANCELLED) { - IssueRequestCancellation(); + IssueRequestCancellation(false); // Mark the context as cancelled state->context_->step_ = Steps::CANCELLED; @@ -979,6 +985,11 @@ class InferHandlerState { return false; } + void MarkIsStreamError() { + grpc_stream_error_state_ = true; } + bool IsStreamError() { + return grpc_stream_error_state_; } + // Return true if this context has completed all reads and writes. bool IsRequestsCompleted() { @@ -1006,6 +1017,7 @@ class InferHandlerState { // orders. A state enters this queue when it has successfully read // a request and exits the queue when it is written. std::recursive_mutex mu_; + std::recursive_mutex grpc_strict_mu_; std::queue states_; std::atomic ongoing_requests_; @@ -1037,6 +1049,8 @@ class InferHandlerState { // True if there is an ongoing write to the grpc stream std::atomic grpc_strict_; + + bool grpc_stream_error_state_; }; // This constructor is used to build a wrapper state object @@ -1044,11 +1058,8 @@ class InferHandlerState { // object is used to distinguish a tag from AsyncNotifyWhenDone() // signal. explicit InferHandlerState(Steps start_step, InferHandlerState* state) - : step_(start_step), state_ptr_(state), async_notify_state_(false), - grpc_stream_error_state_(false) + : step_(start_step), state_ptr_(state), async_notify_state_(false) { - LOG_VERBOSE(1) - << "grpc_stream_error_state_ init called in InferHandlerState \n"; state->MarkAsAsyncNotifyState(); } @@ -1104,7 +1115,6 @@ class InferHandlerState { // wrapper state object in WAITING_NOTIFICATION step. state_ptr_ = nullptr; async_notify_state_ = false; - grpc_stream_error_state_ = false; } void Release() @@ -1134,8 +1144,6 @@ class InferHandlerState { void MarkAsAsyncNotifyState() { async_notify_state_ = true; } bool IsAsyncNotifyState() { return async_notify_state_; } - void MarkIsStreamError() { grpc_stream_error_state_ = true; } - bool IsStreamError() { return grpc_stream_error_state_; } // Needed in the response handle for classification outputs. TRITONSERVER_Server* tritonserver_; @@ -1146,7 +1154,7 @@ class InferHandlerState { std::shared_ptr context_; Steps step_; std::recursive_mutex step_mtx_; - std::recursive_mutex grpc_strict_mtx_; + // std::recursive_mutex grpc_strict_mtx_; // Shared pointer to the inference request object. The lifetime of // inference request object is extended till all the responses from @@ -1188,7 +1196,6 @@ class InferHandlerState { // Tracks whether this state object has been wrapped and send to // AsyncNotifyWhenDone() function as a tag. 
bool async_notify_state_; - bool grpc_stream_error_state_; }; diff --git a/src/grpc/stream_infer_handler.cc b/src/grpc/stream_infer_handler.cc index 7dcac8ea6d..a702cc6261 100644 --- a/src/grpc/stream_infer_handler.cc +++ b/src/grpc/stream_infer_handler.cc @@ -140,12 +140,13 @@ ModelStreamInferHandler::Process(InferHandler::State* state, bool rpc_ok) // This means that we only need to take care of // synchronizing this thread and the ResponseComplete // threads. - { - std::lock_guard lock(state->grpc_strict_mtx_); - if (state->IsStreamError() && state->context_->grpc_strict_) { - LOG_VERBOSE(1) << "Ignoring new client request in strict mode \n"; - return false; - } + // We need an explicit finish indicator. Can't use 'state->step_' + // because we launch an async thread that could update 'state's + // step_ to be FINISH before this thread exits this function. + bool finished = false; + + if(state->context_ == nullptr) { + return !finished; } if (state->context_->ReceivedNotification()) { std::lock_guard lock(state->step_mtx_); @@ -163,11 +164,6 @@ ModelStreamInferHandler::Process(InferHandler::State* state, bool rpc_ok) << ", context " << state->context_->unique_id_ << ", " << state->unique_id_ << " step " << state->step_; - // We need an explicit finish indicator. Can't use 'state->step_' - // because we launch an async thread that could update 'state's - // step_ to be FINISH before this thread exits this function. - bool finished = false; - if (state->step_ == Steps::START) { // A new stream connection... If RPC failed on a new request then // the server is shutting down and so we should do nothing. @@ -671,22 +667,18 @@ ModelStreamInferHandler::StreamInferResponseComplete( response->set_error_message(status.error_message()); if (state->context_->grpc_strict_) { // Set to finish - std::lock_guard lock(state->grpc_strict_mtx_); + // std::lock_guard lock(state->grpc_strict_mtx_); state->status_ = status; - if (!state->IsStreamError()) { // Finish only once, if backend ignores cancellation - state->MarkIsStreamError(); LOG_VERBOSE(1) << "GRPC streaming error detected (Only once)" << status.error_code() << std::endl; state->context_->sendGRPCStrictResponse(state); } return; } else { - LOG_VERBOSE(1) << "GRPC streaming error detected BUT mode NOT strict: " - << status.error_code() << std::endl; + LOG_VERBOSE(1) << "GRPC streaming error detected BUT mode NOT strict"; } LOG_VERBOSE(1) << "Failed for ID: " << log_request_id << std::endl; - } TRITONSERVER_ErrorDelete(err); LOG_TRITONSERVER_ERROR( From 7f86c6a76666aa54bb66aa0b85c0c225d0557d2b Mon Sep 17 00:00:00 2001 From: Indrajit Bhosale Date: Wed, 31 Jul 2024 14:58:38 -0700 Subject: [PATCH 03/32] Working Set --- src/grpc/infer_handler.h | 15 +++++++------- src/grpc/stream_infer_handler.cc | 34 +++++++++++++++++++++----------- 2 files changed, 29 insertions(+), 20 deletions(-) diff --git a/src/grpc/infer_handler.h b/src/grpc/infer_handler.h index 5b519d0b80..c839e8c7fc 100644 --- a/src/grpc/infer_handler.h +++ b/src/grpc/infer_handler.h @@ -642,7 +642,8 @@ class InferHandlerState { ::grpc::ServerCompletionQueue* cq, const uint64_t unique_id = 0) : cq_(cq), unique_id_(unique_id), ongoing_requests_(0), step_(Steps::START), finish_ok_(true), ongoing_write_(false), - received_notification_(false), grpc_strict_(false) + received_notification_(false), grpc_strict_(false), + grpc_stream_error_state_(false) { ctx_.reset(new ::grpc::ServerContext()); responder_.reset(new ServerResponderType(ctx_.get())); @@ -702,7 +703,7 @@ class InferHandlerState { 
state->context_->responder_->Finish(dummy_status, state); LOG_VERBOSE(1) << "GRPC streaming error detected inside finish: " - << state->status_.error_code() << std::endl; + << state->status_.error_code() << std::endl; state->context_->MarkIsStreamError(); IssueRequestCancellation(false); } @@ -821,7 +822,7 @@ class InferHandlerState { // The RPC is complete and no callback will be invoked to retrieve // the object. Hence, need to explicitly place the state on the // completion queue. - if(!grpc_strict) { + if (!grpc_strict) { PutTaskBackToQueue(state); } } @@ -985,10 +986,8 @@ class InferHandlerState { return false; } - void MarkIsStreamError() { - grpc_stream_error_state_ = true; } - bool IsStreamError() { - return grpc_stream_error_state_; } + void MarkIsStreamError() { grpc_stream_error_state_ = true; } + bool IsStreamError() { return grpc_stream_error_state_; } // Return true if this context has completed all reads and writes. bool IsRequestsCompleted() @@ -1050,7 +1049,7 @@ class InferHandlerState { // True if there is an ongoing write to the grpc stream std::atomic grpc_strict_; - bool grpc_stream_error_state_; + std::atomic grpc_stream_error_state_; }; // This constructor is used to build a wrapper state object diff --git a/src/grpc/stream_infer_handler.cc b/src/grpc/stream_infer_handler.cc index a702cc6261..42eb48c84e 100644 --- a/src/grpc/stream_infer_handler.cc +++ b/src/grpc/stream_infer_handler.cc @@ -144,10 +144,14 @@ ModelStreamInferHandler::Process(InferHandler::State* state, bool rpc_ok) // because we launch an async thread that could update 'state's // step_ to be FINISH before this thread exits this function. bool finished = false; - - if(state->context_ == nullptr) { + if (state->context_ == nullptr) { + return !finished; + } + std::lock_guard lock(state->context_->mu_); + if (state->context_->IsStreamError()) { return !finished; } + if (state->context_->ReceivedNotification()) { std::lock_guard lock(state->step_mtx_); if (state->IsGrpcContextCancelled()) { @@ -577,7 +581,12 @@ ModelStreamInferHandler::StreamInferResponseComplete( void* userp) { State* state = reinterpret_cast(userp); - + // Ignore Response from CORE in case GRPC Strict as we dont care about + LOG_VERBOSE(1) << "Dead Response from CORE"; + std::lock_guard lock(state->context_->mu_); + if (state->context_->IsStreamError()) { + return; + } // Increment the callback index uint32_t response_index = state->cb_count_++; @@ -662,6 +671,7 @@ ModelStreamInferHandler::StreamInferResponseComplete( LOG_VERBOSE(1) << "Generating fake error" << std::endl; failed = true; ::grpc::Status status; + // Converts CORE errors to GRPC error codes GrpcStatusUtil::Create(&status, err); response->mutable_infer_response()->Clear(); response->set_error_message(status.error_message()); @@ -669,16 +679,16 @@ ModelStreamInferHandler::StreamInferResponseComplete( // Set to finish // std::lock_guard lock(state->grpc_strict_mtx_); state->status_ = status; - // Finish only once, if backend ignores cancellation - LOG_VERBOSE(1) << "GRPC streaming error detected (Only once)" - << status.error_code() << std::endl; - state->context_->sendGRPCStrictResponse(state); - } - return; - } else { - LOG_VERBOSE(1) << "GRPC streaming error detected BUT mode NOT strict"; + // Finish only once, if backend ignores cancellation + LOG_VERBOSE(1) << "GRPC streaming error detected (Only once)" + << status.error_code() << std::endl; + state->context_->sendGRPCStrictResponse(state); } - LOG_VERBOSE(1) << "Failed for ID: " << log_request_id << std::endl; + 
return; + } else { + LOG_VERBOSE(1) << "GRPC streaming error detected BUT mode NOT strict"; + } + LOG_VERBOSE(1) << "Failed for ID: " << log_request_id << std::endl; TRITONSERVER_ErrorDelete(err); LOG_TRITONSERVER_ERROR( From b65fd742dd16f915e2693759a417a8e44b69e0d1 Mon Sep 17 00:00:00 2001 From: Indrajit Bhosale Date: Wed, 31 Jul 2024 18:34:47 -0700 Subject: [PATCH 04/32] Working Set --- src/grpc/infer_handler.h | 32 +++++++++++++++---------------- src/grpc/stream_infer_handler.cc | 33 +++++--------------------------- 2 files changed, 20 insertions(+), 45 deletions(-) diff --git a/src/grpc/infer_handler.h b/src/grpc/infer_handler.h index c839e8c7fc..4fb5c3f31c 100644 --- a/src/grpc/infer_handler.h +++ b/src/grpc/infer_handler.h @@ -694,22 +694,19 @@ class InferHandlerState { if (state->context_->grpc_strict_) { { std::lock_guard lock(grpc_strict_mu_); - if (!state->context_->IsStreamError()) { - ::grpc::Status dummy_status = ::grpc::Status( - ::grpc::StatusCode::UNAVAILABLE, std::string("Dummy Status")); - // state->context_->responder_->Finish(state->status_, state); + // Check if Error not responded previously + // Avoid closing connection twice on multiple errors from core + if (!state->context_->IsGRPCStrictError()) { state->context_->step_ = Steps::COMPLETE; state->step_ = Steps::PARTIAL_COMPLETION; - - state->context_->responder_->Finish(dummy_status, state); + state->context_->responder_->Finish(state->status_, state); LOG_VERBOSE(1) << "GRPC streaming error detected inside finish: " << state->status_.error_code() << std::endl; - state->context_->MarkIsStreamError(); - IssueRequestCancellation(false); + // Mark error for this stream + state->context_->MarkGRPCStrictError(); + IssueRequestCancellation(); } } - } else { - LOG_VERBOSE(1) << "GRPC mode NOT strict in Finish" << std::endl; } } // Increments the ongoing request counter @@ -789,7 +786,7 @@ class InferHandlerState { // Issues the cancellation for all inflight requests // being tracked by this context. - void IssueRequestCancellation(bool grpc_strict) + void IssueRequestCancellation() { { std::lock_guard lock(mu_); @@ -822,9 +819,7 @@ class InferHandlerState { // The RPC is complete and no callback will be invoked to retrieve // the object. Hence, need to explicitly place the state on the // completion queue. - if (!grpc_strict) { - PutTaskBackToQueue(state); - } + PutTaskBackToQueue(state); } } } @@ -858,7 +853,7 @@ class InferHandlerState { // issue cancellation request to all the inflight // states belonging to the context. if (state->context_->step_ != Steps::CANCELLED) { - IssueRequestCancellation(false); + IssueRequestCancellation(); // Mark the context as cancelled state->context_->step_ = Steps::CANCELLED; @@ -986,8 +981,11 @@ class InferHandlerState { return false; } - void MarkIsStreamError() { grpc_stream_error_state_ = true; } - bool IsStreamError() { return grpc_stream_error_state_; } + // Marks error after it has been responded to + void MarkGRPCStrictError() { grpc_stream_error_state_ = true; } + + // Checks if error already responded to in grpc_strict mode + bool IsGRPCStrictError() { return grpc_stream_error_state_; } // Return true if this context has completed all reads and writes. 
bool IsRequestsCompleted() diff --git a/src/grpc/stream_infer_handler.cc b/src/grpc/stream_infer_handler.cc index 42eb48c84e..42ff740098 100644 --- a/src/grpc/stream_infer_handler.cc +++ b/src/grpc/stream_infer_handler.cc @@ -144,11 +144,9 @@ ModelStreamInferHandler::Process(InferHandler::State* state, bool rpc_ok) // because we launch an async thread that could update 'state's // step_ to be FINISH before this thread exits this function. bool finished = false; - if (state->context_ == nullptr) { - return !finished; - } std::lock_guard lock(state->context_->mu_); - if (state->context_->IsStreamError()) { + // Check if stream error detected and already connection ended + if (state->context_->IsGRPCStrictError()) { return !finished; } @@ -368,18 +366,6 @@ ModelStreamInferHandler::Process(InferHandler::State* state, bool rpc_ok) GrpcStatusUtil::Create(&status, err); TRITONSERVER_ErrorDelete(err); response->set_error_message(status.error_message()); - // if(state->context_->grpc_strict_) { - // // Set to finish - // state->status_ = status; - // state->MarkIsStreamError(); - // LOG_VERBOSE(1) << "GRPC streaming error detected : " << - // status.error_code() << std::endl; - // } else { - // LOG_VERBOSE(1) << "GRPC streaming error detected BUT mode NOT strict: - // " << status.error_code() << std::endl; - // } - - response->mutable_infer_response()->Clear(); // repopulate the id so that client knows which request failed. response->mutable_infer_response()->set_id(request.id()); @@ -584,7 +570,7 @@ ModelStreamInferHandler::StreamInferResponseComplete( // Ignore Response from CORE in case GRPC Strict as we dont care about LOG_VERBOSE(1) << "Dead Response from CORE"; std::lock_guard lock(state->context_->mu_); - if (state->context_->IsStreamError()) { + if (state->context_->IsGRPCStrictError()) { return; } // Increment the callback index @@ -661,14 +647,7 @@ ModelStreamInferHandler::StreamInferResponseComplete( } else { LOG_ERROR << "expected the response allocator to have added the response"; } - - std::time_t currentTime = std::time(nullptr); - - // Divide the current time by 2 - std::time_t modTime = currentTime % 2; - - if (modTime) { - LOG_VERBOSE(1) << "Generating fake error" << std::endl; + if (err != nullptr) { failed = true; ::grpc::Status status; // Converts CORE errors to GRPC error codes @@ -680,13 +659,11 @@ ModelStreamInferHandler::StreamInferResponseComplete( // std::lock_guard lock(state->grpc_strict_mtx_); state->status_ = status; // Finish only once, if backend ignores cancellation - LOG_VERBOSE(1) << "GRPC streaming error detected (Only once)" + LOG_VERBOSE(1) << "GRPC streaming error detected: " << status.error_code() << std::endl; state->context_->sendGRPCStrictResponse(state); } return; - } else { - LOG_VERBOSE(1) << "GRPC streaming error detected BUT mode NOT strict"; } LOG_VERBOSE(1) << "Failed for ID: " << log_request_id << std::endl; From 36e461f3245fb01e37c7cbfe631a43a6d76e7fbc Mon Sep 17 00:00:00 2001 From: Indrajit Bhosale Date: Thu, 1 Aug 2024 11:43:40 -0700 Subject: [PATCH 05/32] Working Set --- src/grpc/infer_handler.h | 30 +++++++++++++----------------- src/grpc/stream_infer_handler.cc | 22 ++++++++++++---------- 2 files changed, 25 insertions(+), 27 deletions(-) diff --git a/src/grpc/infer_handler.h b/src/grpc/infer_handler.h index 4fb5c3f31c..7bd4330ec0 100644 --- a/src/grpc/infer_handler.h +++ b/src/grpc/infer_handler.h @@ -645,6 +645,7 @@ class InferHandlerState { received_notification_(false), grpc_strict_(false), grpc_stream_error_state_(false) { + 
LOG_VERBOSE(1) << "Context Constructor getting called"; ctx_.reset(new ::grpc::ServerContext()); responder_.reset(new ServerResponderType(ctx_.get())); } @@ -690,23 +691,18 @@ class InferHandlerState { void sendGRPCStrictResponse(InferHandlerStateType* state) { - // Check if streaming error detected AND grpc_mode is strict - if (state->context_->grpc_strict_) { - { - std::lock_guard lock(grpc_strict_mu_); - // Check if Error not responded previously - // Avoid closing connection twice on multiple errors from core - if (!state->context_->IsGRPCStrictError()) { - state->context_->step_ = Steps::COMPLETE; - state->step_ = Steps::PARTIAL_COMPLETION; - state->context_->responder_->Finish(state->status_, state); - LOG_VERBOSE(1) << "GRPC streaming error detected inside finish: " - << state->status_.error_code() << std::endl; - // Mark error for this stream - state->context_->MarkGRPCStrictError(); - IssueRequestCancellation(); - } - } + std::lock_guard lock(state->context_->mu_); + // Check if Error not responded previously + // Avoid closing connection twice on multiple errors from core + if (!state->context_->IsGRPCStrictError()) { + state->context_->step_ = Steps::COMPLETE; + state->step_ = Steps::PARTIAL_COMPLETION; + state->context_->responder_->Finish(state->status_, state); + LOG_VERBOSE(1) << "GRPC streaming error detected inside finish: " + << state->status_.error_code() << std::endl; + // Mark error for this stream + state->context_->MarkGRPCStrictError(); + IssueRequestCancellation(); } } // Increments the ongoing request counter diff --git a/src/grpc/stream_infer_handler.cc b/src/grpc/stream_infer_handler.cc index 42ff740098..df196640b1 100644 --- a/src/grpc/stream_infer_handler.cc +++ b/src/grpc/stream_infer_handler.cc @@ -144,12 +144,13 @@ ModelStreamInferHandler::Process(InferHandler::State* state, bool rpc_ok) // because we launch an async thread that could update 'state's // step_ to be FINISH before this thread exits this function. 
bool finished = false; - std::lock_guard lock(state->context_->mu_); - // Check if stream error detected and already connection ended - if (state->context_->IsGRPCStrictError()) { - return !finished; + if (state->context_->grpc_strict_) { + std::lock_guard lock(state->context_->mu_); + // Check if stream error detected and already connection ended + if (state->context_->IsGRPCStrictError()) { + return !finished; + } } - if (state->context_->ReceivedNotification()) { std::lock_guard lock(state->step_mtx_); if (state->IsGrpcContextCancelled()) { @@ -568,10 +569,11 @@ ModelStreamInferHandler::StreamInferResponseComplete( { State* state = reinterpret_cast(userp); // Ignore Response from CORE in case GRPC Strict as we dont care about - LOG_VERBOSE(1) << "Dead Response from CORE"; - std::lock_guard lock(state->context_->mu_); - if (state->context_->IsGRPCStrictError()) { - return; + if (state->context_->grpc_strict_) { + std::lock_guard lock(state->context_->mu_); + if (state->context_->IsGRPCStrictError()) { + return; + } } // Increment the callback index uint32_t response_index = state->cb_count_++; @@ -648,6 +650,7 @@ ModelStreamInferHandler::StreamInferResponseComplete( LOG_ERROR << "expected the response allocator to have added the response"; } if (err != nullptr) { + LOG_VERBOSE(1) << "Error in CORE Response"; failed = true; ::grpc::Status status; // Converts CORE errors to GRPC error codes @@ -663,7 +666,6 @@ ModelStreamInferHandler::StreamInferResponseComplete( << status.error_code() << std::endl; state->context_->sendGRPCStrictResponse(state); } - return; } LOG_VERBOSE(1) << "Failed for ID: " << log_request_id << std::endl; From bd549b16284299c9f56ae6a3ea035d333e616a84 Mon Sep 17 00:00:00 2001 From: Indrajit Bhosale Date: Thu, 1 Aug 2024 12:03:59 -0700 Subject: [PATCH 06/32] Working Set --- src/grpc/stream_infer_handler.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/grpc/stream_infer_handler.cc b/src/grpc/stream_infer_handler.cc index df196640b1..9c62a1a301 100644 --- a/src/grpc/stream_infer_handler.cc +++ b/src/grpc/stream_infer_handler.cc @@ -657,6 +657,7 @@ ModelStreamInferHandler::StreamInferResponseComplete( GrpcStatusUtil::Create(&status, err); response->mutable_infer_response()->Clear(); response->set_error_message(status.error_message()); + LOG_VERBOSE(1) << "Failed for ID: " << log_request_id << std::endl; if (state->context_->grpc_strict_) { // Set to finish // std::lock_guard lock(state->grpc_strict_mtx_); @@ -665,9 +666,9 @@ ModelStreamInferHandler::StreamInferResponseComplete( LOG_VERBOSE(1) << "GRPC streaming error detected: " << status.error_code() << std::endl; state->context_->sendGRPCStrictResponse(state); + return; } } - LOG_VERBOSE(1) << "Failed for ID: " << log_request_id << std::endl; TRITONSERVER_ErrorDelete(err); LOG_TRITONSERVER_ERROR( From 85ccd726a667df19c63a152837e5cb61891605ba Mon Sep 17 00:00:00 2001 From: Indrajit Bhosale Date: Fri, 2 Aug 2024 02:05:39 -0700 Subject: [PATCH 07/32] Tests Added --- .../lifecycle/lifecycle_test.py | 94 ++++++++++++++++++- qa/L0_backend_python/lifecycle/my_test.py | 53 +++++++++++ qa/L0_backend_python/lifecycle/test.sh | 8 ++ .../execute_grpc_error/config.pbtxt | 52 ++++++++++ qa/python_models/execute_grpc_error/model.py | 52 ++++++++++ 5 files changed, 258 insertions(+), 1 deletion(-) create mode 100644 qa/L0_backend_python/lifecycle/my_test.py create mode 100644 qa/python_models/execute_grpc_error/config.pbtxt create mode 100644 qa/python_models/execute_grpc_error/model.py diff --git 
a/qa/L0_backend_python/lifecycle/lifecycle_test.py b/qa/L0_backend_python/lifecycle/lifecycle_test.py index cea94a1dad..f2e52c8c91 100755 --- a/qa/L0_backend_python/lifecycle/lifecycle_test.py +++ b/qa/L0_backend_python/lifecycle/lifecycle_test.py @@ -27,8 +27,11 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import os +import re import sys +import requests + sys.path.append("../../common") import queue @@ -63,6 +66,29 @@ class LifecycleTest(unittest.TestCase): def setUp(self): self._shm_leak_detector = shm_util.ShmLeakDetector() + def _get_metrics(self): + metrics_url = "http://localhost:8002/metrics" + r = requests.get(metrics_url) + r.raise_for_status() + return r.text + + def _metrics_before_test(self, model, reason): + pattern = rf'nv_inference_request_failure\{{model="{model}",reason="{reason}",version="1"\}} (\d+)' + metrics = self._get_metrics() + match = re.search(pattern, metrics) + if match: + return int(match.group(1)) + else: + raise Exception(f"Failure metrics for model='{model}' not found") + + def _assert_metrics( + self, model_name, reason, expected_count_increase, initial_count + ): + metrics = self._get_metrics() + # Add initial count + expected count for the the test + expected_metric = f'nv_inference_request_failure{{model="{model_name}",reason="{reason}",version="1"}} {expected_count_increase + initial_count}' + self.assertIn(expected_metric, metrics) + def test_error_code(self): model_name = "error_code" shape = [1, 1] @@ -181,7 +207,7 @@ def test_batch_error(self): def test_infer_pymodel_error(self): model_name = "wrong_model" shape = [2, 2] - + initial_metrics_value = self._metrics_before_test(model_name, "BACKEND") with self._shm_leak_detector.Probe() as shm_probe: with httpclient.InferenceServerClient( f"{_tritonserver_ipaddr}:8000" @@ -207,6 +233,72 @@ def test_infer_pymodel_error(self): self.assertTrue( False, "Wrong exception raised or did not raise an exception" ) + expected_count_increase = 1 + self._assert_metrics( + model_name, + "BACKEND", + expected_count_increase, + initial_metrics_value, + ) + + def test_grpc_strict_error_on(self): + model_name = "execute_grpc_error" + shape = [2, 2] + number_of_requests = 3 + user_data = UserData() + triton_client = grpcclient.InferenceServerClient(f"{_tritonserver_ipaddr}:8001") + metadata = {"grpc_strict": "true"} + triton_client.start_stream( + callback=partial(callback, user_data), headers=metadata + ) + + with self._shm_leak_detector.Probe() as shm_probe: + input_datas = [] + for i in range(number_of_requests): + input_data = np.random.randn(*shape).astype(np.float32) + input_datas.append(input_data) + inputs = [ + grpcclient.InferInput( + "IN", input_data.shape, np_to_triton_dtype(input_data.dtype) + ) + ] + inputs[0].set_data_from_numpy(input_data) + triton_client.async_stream_infer(model_name=model_name, inputs=inputs) + result = user_data._completed_requests.get() + if i == 1: + # execute_grpc_error intentionally returns error with StatusCode.INTERNAL status on 2nd request + self.assertIsInstance(result, InferenceServerException) + self.assertEqual(str(result.status()), "StatusCode.INTERNAL") + + def test_grpc_strict_error_off(self): + model_name = "execute_grpc_error" + shape = [2, 2] + number_of_requests = 3 + user_data = UserData() + triton_client = grpcclient.InferenceServerClient(f"{_tritonserver_ipaddr}:8001") + triton_client.start_stream(callback=partial(callback, user_data)) + + with self._shm_leak_detector.Probe() as shm_probe: + input_datas = [] + for i in 
range(number_of_requests): + input_data = np.random.randn(*shape).astype(np.float32) + input_datas.append(input_data) + inputs = [ + grpcclient.InferInput( + "IN", input_data.shape, np_to_triton_dtype(input_data.dtype) + ) + ] + inputs[0].set_data_from_numpy(input_data) + triton_client.async_stream_infer(model_name=model_name, inputs=inputs) + result = user_data._completed_requests.get() + if i == 1: + # execute_grpc_error intentionally returns error with StatusCode.INTERNAL status on 2nd request + self.assertIsInstance(result, InferenceServerException) + # Existing Behaviour + self.assertEqual(str(result.status()), "NONE") + if i == 2: + # Stream is not killed + self.assertIsInstance(result, InferResult) if __name__ == "__main__": diff --git a/qa/L0_backend_python/lifecycle/my_test.py b/qa/L0_backend_python/lifecycle/my_test.py new file mode 100644 index 0000000000..25d76910e4 --- /dev/null +++ b/qa/L0_backend_python/lifecycle/my_test.py @@ -0,0 +1,53 @@ +import numpy as np +import tritonclient.grpc as grpcclient +from functools import partial +import queue +from tritonclient.utils import * + +class UserData: + def __init__(self): + self._completed_requests = queue.Queue() + +def callback(user_data, result, error): + if error: + user_data._completed_requests.put(error) + else: + user_data._completed_requests.put(result) + +def grpc_strict_error(): + model_name = "execute_error" + shape = [2, 2] + number_of_requests = 3 + user_data = UserData() + triton_server_url = "localhost:8001" # Replace with your Triton server address + + try: + triton_client = grpcclient.InferenceServerClient(triton_server_url) + metadata = {"grpc_strict": "true"} + + triton_client.start_stream(callback=partial(callback, user_data), headers=metadata) + + input_datas = [] + for i in range(number_of_requests): + input_data = np.random.randn(*shape).astype(np.float32) + input_datas.append(input_data) + inputs = [ + grpcclient.InferInput( + "IN", input_data.shape, np_to_triton_dtype(input_data.dtype) + ) + ] + inputs[0].set_data_from_numpy(input_data) + triton_client.async_stream_infer(model_name=model_name, inputs=inputs) + result = user_data._completed_requests.get() + print(f"Request {i + 1} result:") + print(type(result)) + if type(result) == InferenceServerException: + print(result.status()) + + except Exception as e: + print(f"Error occurred: {str(e)}") + finally: + triton_client.stop_stream() + +if __name__ == "__main__": + grpc_strict_error() \ No newline at end of file diff --git a/qa/L0_backend_python/lifecycle/test.sh b/qa/L0_backend_python/lifecycle/test.sh index dba4581ddd..59b846f56b 100755 --- a/qa/L0_backend_python/lifecycle/test.sh +++ b/qa/L0_backend_python/lifecycle/test.sh @@ -52,6 +52,14 @@ cp ../../python_models/execute_error/config.pbtxt ./models/execute_error/ sed -i "s/^max_batch_size:.*/max_batch_size: 8/" config.pbtxt && \ echo "dynamic_batching { preferred_batch_size: [8], max_queue_delay_microseconds: 12000000 }" >> config.pbtxt) +mkdir -p models/execute_grpc_error/1/ +cp ../../python_models/execute_grpc_error/model.py ./models/execute_grpc_error/1/ +cp ../../python_models/execute_grpc_error/config.pbtxt ./models/execute_grpc_error/ +(cd models/execute_grpc_error && \ + sed -i "s/^name:.*/name: \"execute_grpc_error\"/" config.pbtxt && \ + sed -i "s/^max_batch_size:.*/max_batch_size: 8/" config.pbtxt && \ + echo "dynamic_batching { preferred_batch_size: [8], max_queue_delay_microseconds: 1200000 }" >> config.pbtxt) + mkdir -p models/execute_return_error/1/ cp 
../../python_models/execute_return_error/model.py ./models/execute_return_error/1/ cp ../../python_models/execute_return_error/config.pbtxt ./models/execute_return_error/ diff --git a/qa/python_models/execute_grpc_error/config.pbtxt b/qa/python_models/execute_grpc_error/config.pbtxt new file mode 100644 index 0000000000..3d364f3cc5 --- /dev/null +++ b/qa/python_models/execute_grpc_error/config.pbtxt @@ -0,0 +1,52 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +name: "execute_grpc_error" +backend: "python" +max_batch_size: 64 + +input [ + { + name: "IN" + data_type: TYPE_FP32 + dims: [ -1 ] + } +] + +output [ + { + name: "OUT" + data_type: TYPE_FP32 + dims: [ -1 ] + } +] + +instance_group [ + { + count: 1 + kind : KIND_CPU + } +] diff --git a/qa/python_models/execute_grpc_error/model.py b/qa/python_models/execute_grpc_error/model.py new file mode 100644 index 0000000000..ee74e710f8 --- /dev/null +++ b/qa/python_models/execute_grpc_error/model.py @@ -0,0 +1,52 @@ +# Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import triton_python_backend_utils as pb_utils + + +class TritonPythonModel: + def __init__(self): + # Maintain total inference count, so as to return error on 2nd request, all of this to simulate model failure + self.inf_count = 0 + + def execute(self, requests): + """This function is called on inference request.""" + responses = [] + + # Generate the error for the second request + for request in requests: + input_tensor = pb_utils.get_input_tensor_by_name(request, "IN") + out_tensor = pb_utils.Tensor("OUT", input_tensor.as_numpy()) + if self.inf_count == 0: + responses.append(pb_utils.InferenceResponse([out_tensor])) + elif self.inf_count == 1: + error = pb_utils.TritonError("An error occurred during execution") + responses.append(pb_utils.InferenceResponse([out_tensor], error)) + elif self.inf_count == 2: + responses.append(pb_utils.InferenceResponse([out_tensor])) + self.inf_count += 1 + + return responses From 37b15e83e952adfe59d94b59fc3711fadd1d5d8d Mon Sep 17 00:00:00 2001 From: Indrajit Bhosale Date: Fri, 2 Aug 2024 02:19:03 -0700 Subject: [PATCH 08/32] Clean up --- qa/L0_backend_python/lifecycle/my_test.py | 15 +++++++++++---- src/grpc/infer_handler.h | 15 +++------------ src/grpc/stream_infer_handler.cc | 11 +++-------- 3 files changed, 17 insertions(+), 24 deletions(-) diff --git a/qa/L0_backend_python/lifecycle/my_test.py b/qa/L0_backend_python/lifecycle/my_test.py index 25d76910e4..c4c1899e8f 100644 --- a/qa/L0_backend_python/lifecycle/my_test.py +++ b/qa/L0_backend_python/lifecycle/my_test.py @@ -1,19 +1,23 @@ +import queue +from functools import partial + import numpy as np import tritonclient.grpc as grpcclient -from functools import partial -import queue from tritonclient.utils import * + class UserData: def __init__(self): self._completed_requests = queue.Queue() + def callback(user_data, result, error): if error: user_data._completed_requests.put(error) else: user_data._completed_requests.put(result) + def grpc_strict_error(): model_name = "execute_error" shape = [2, 2] @@ -25,7 +29,9 @@ def grpc_strict_error(): triton_client = grpcclient.InferenceServerClient(triton_server_url) metadata = {"grpc_strict": "true"} - triton_client.start_stream(callback=partial(callback, user_data), headers=metadata) + triton_client.start_stream( + callback=partial(callback, user_data), headers=metadata + ) input_datas = [] for i in range(number_of_requests): @@ -49,5 +55,6 @@ def grpc_strict_error(): finally: triton_client.stop_stream() + if __name__ == "__main__": - grpc_strict_error() \ No newline at end of file + grpc_strict_error() diff --git a/src/grpc/infer_handler.h b/src/grpc/infer_handler.h index 0b2ee94510..9ea4677fd5 100644 --- a/src/grpc/infer_handler.h +++ b/src/grpc/infer_handler.h @@ -645,7 +645,6 @@ class InferHandlerState { received_notification_(false), grpc_strict_(false), grpc_stream_error_state_(false) { - LOG_VERBOSE(1) << "Context Constructor getting called"; ctx_.reset(new ::grpc::ServerContext()); responder_.reset(new 
ServerResponderType(ctx_.get())); } @@ -674,17 +673,13 @@ class InferHandlerState { // Extracts headers from GRPC request and updates state void ExtractStateFromHeaders(InferHandlerStateType* state) { - // Probably need to lock - LOG_VERBOSE(1) << "GRPC ExtractStateFromHeaders called" << std::endl; const auto& metadata = state->context_->ctx_->client_metadata(); for (const auto& pair : metadata) { auto& key = pair.first; std::string param_key = std::string(key.begin(), key.end()); std::string grpc_strict_key = "grpc_strict"; if (param_key.compare(grpc_strict_key) == 0) { - // They are equal state->context_->grpc_strict_ = true; - LOG_VERBOSE(1) << "GRPC streaming strict flag detected" << std::endl; } } } @@ -698,8 +693,6 @@ class InferHandlerState { state->context_->step_ = Steps::COMPLETE; state->step_ = Steps::PARTIAL_COMPLETION; state->context_->responder_->Finish(state->status_, state); - LOG_VERBOSE(1) << "GRPC streaming error detected inside finish: " - << state->status_.error_code() << std::endl; // Mark error for this stream state->context_->MarkGRPCStrictError(); IssueRequestCancellation(); @@ -1010,7 +1003,6 @@ class InferHandlerState { // orders. A state enters this queue when it has successfully read // a request and exits the queue when it is written. std::recursive_mutex mu_; - std::recursive_mutex grpc_strict_mu_; std::queue states_; std::atomic ongoing_requests_; @@ -1043,9 +1035,11 @@ class InferHandlerState { // completion queue. bool received_notification_; - // True if there is an ongoing write to the grpc stream + // True if set by user via header std::atomic grpc_strict_; + // True if stream already encountered error and closed connection + // State maintained to avoid writes on closed stream std::atomic grpc_stream_error_state_; }; @@ -1064,8 +1058,6 @@ class InferHandlerState { const std::shared_ptr& context, Steps start_step = Steps::START) : tritonserver_(tritonserver), async_notify_state_(false) { - LOG_VERBOSE(1) - << "grpc_stream_error_state_ init called in InferHandlerState 2 \n"; // For debugging and testing const char* dstr = getenv("TRITONSERVER_DELAY_GRPC_RESPONSE"); delay_response_ms_ = 0; @@ -1150,7 +1142,6 @@ class InferHandlerState { std::shared_ptr context_; Steps step_; std::recursive_mutex step_mtx_; - // std::recursive_mutex grpc_strict_mtx_; // Shared pointer to the inference request object. The lifetime of // inference request object is extended till all the responses from diff --git a/src/grpc/stream_infer_handler.cc b/src/grpc/stream_infer_handler.cc index 1a7a33998b..85ec4d2c32 100644 --- a/src/grpc/stream_infer_handler.cc +++ b/src/grpc/stream_infer_handler.cc @@ -334,13 +334,7 @@ ModelStreamInferHandler::Process(InferHandler::State* state, bool rpc_ok) // initiated... the completion callback will transition to // WRITEREADY or WRITTEN or CANCELLED. Recording the state and the // irequest to handle gRPC stream cancellation. - // std::time_t currentTime = std::time(nullptr); - - // // Divide the current time by 2 - // std::time_t modTime = currentTime % 2; - if (err == nullptr) { - // if (modTime) { state->context_->InsertInflightState(state); // The payload will be cleaned in request release callback. 
request_release_payload.release(); @@ -702,8 +696,9 @@ ModelStreamInferHandler::StreamInferResponseComplete( // std::lock_guard lock(state->grpc_strict_mtx_); state->status_ = status; // Finish only once, if backend ignores cancellation - LOG_VERBOSE(1) << "GRPC streaming error detected: " - << status.error_code() << std::endl; + LOG_VERBOSE(1) << "GRPC streaming error detected with status: " + << status.error_code() << "Closing stream connection." + << std::endl; state->context_->sendGRPCStrictResponse(state); return; } From a65f8c30f3d63c0a54aad3be90cd857e134c032a Mon Sep 17 00:00:00 2001 From: Indrajit Bhosale Date: Mon, 5 Aug 2024 10:42:40 -0700 Subject: [PATCH 09/32] Tests updated --- .../lifecycle/lifecycle_test.py | 19 ++++++++++++------- qa/python_models/execute_grpc_error/model.py | 10 +++++----- 2 files changed, 17 insertions(+), 12 deletions(-) diff --git a/qa/L0_backend_python/lifecycle/lifecycle_test.py b/qa/L0_backend_python/lifecycle/lifecycle_test.py index f2e52c8c91..6997666090 100755 --- a/qa/L0_backend_python/lifecycle/lifecycle_test.py +++ b/qa/L0_backend_python/lifecycle/lifecycle_test.py @@ -244,7 +244,7 @@ def test_infer_pymodel_error(self): def test_grpc_strict_error_on(self): model_name = "execute_grpc_error" shape = [2, 2] - number_of_requests = 3 + number_of_requests = 2 user_data = UserData() triton_client = grpcclient.InferenceServerClient(f"{_tritonserver_ipaddr}:8001") metadata = {"grpc_strict": "true"} @@ -265,7 +265,11 @@ def test_grpc_strict_error_on(self): inputs[0].set_data_from_numpy(input_data) triton_client.async_stream_infer(model_name=model_name, inputs=inputs) result = user_data._completed_requests.get() - if i == 1: + if i == 0: + # Stream is not killed + output_data = result.as_numpy("OUT") + self.assertIsNotNone(output_data, "error: expected 'OUT'") + elif i == 1: # execute_grpc_error intentionally returns error with StatusCode.INTERNAL status on 2nd request self.assertIsInstance(result, InferenceServerException) self.assertEqual(str(result.status()), "StatusCode.INTERNAL") @@ -273,7 +277,7 @@ def test_grpc_strict_error_on(self): def test_grpc_strict_error_off(self): model_name = "execute_grpc_error" shape = [2, 2] - number_of_requests = 3 + number_of_requests = 4 user_data = UserData() triton_client = grpcclient.InferenceServerClient(f"{_tritonserver_ipaddr}:8001") triton_client.start_stream(callback=partial(callback, user_data)) @@ -291,14 +295,15 @@ def test_grpc_strict_error_off(self): inputs[0].set_data_from_numpy(input_data) triton_client.async_stream_infer(model_name=model_name, inputs=inputs) result = user_data._completed_requests.get() - if i == 1: + if i == 1 or i == 3: # execute_grpc_error intentionally returns error with StatusCode.INTERNAL status on 2nd request self.assertIsInstance(result, InferenceServerException) # Existing Behaviour - self.assertEqual(str(result.status()), "NONE") - if i == 2: + self.assertEqual(str(result.status()), "None") + elif i == 0 or i == 2: # Stream is not killed - self.assertIsInstance(result, InferResult) + output_data = result.as_numpy("OUT") + self.assertIsNotNone(output_data, "error: expected 'OUT'") if __name__ == "__main__": diff --git a/qa/python_models/execute_grpc_error/model.py b/qa/python_models/execute_grpc_error/model.py index ee74e710f8..d5087a49ec 100644 --- a/qa/python_models/execute_grpc_error/model.py +++ b/qa/python_models/execute_grpc_error/model.py @@ -30,7 +30,7 @@ class TritonPythonModel: def __init__(self): # Maintain total inference count, so as to return error on 2nd request, 
all of this to simulate model failure - self.inf_count = 0 + self.inf_count = 1 def execute(self, requests): """This function is called on inference request.""" @@ -40,13 +40,13 @@ def execute(self, requests): for request in requests: input_tensor = pb_utils.get_input_tensor_by_name(request, "IN") out_tensor = pb_utils.Tensor("OUT", input_tensor.as_numpy()) - if self.inf_count == 0: + if self.inf_count % 2: + # Every odd request is success responses.append(pb_utils.InferenceResponse([out_tensor])) - elif self.inf_count == 1: + else: + # Every even request is failure error = pb_utils.TritonError("An error occurred during execution") responses.append(pb_utils.InferenceResponse([out_tensor], error)) - elif self.inf_count == 2: - responses.append(pb_utils.InferenceResponse([out_tensor])) self.inf_count += 1 return responses From 080985c0b82ecf1aa482a387cf60031a3c7b8017 Mon Sep 17 00:00:00 2001 From: Indrajit Bhosale Date: Mon, 5 Aug 2024 23:27:49 -0700 Subject: [PATCH 10/32] Zombie request fixed --- src/grpc/infer_handler.h | 9 +++++---- src/grpc/stream_infer_handler.cc | 11 +++++++---- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/src/grpc/infer_handler.h b/src/grpc/infer_handler.h index 9ea4677fd5..cb10efbdd2 100644 --- a/src/grpc/infer_handler.h +++ b/src/grpc/infer_handler.h @@ -690,12 +690,11 @@ class InferHandlerState { // Check if Error not responded previously // Avoid closing connection twice on multiple errors from core if (!state->context_->IsGRPCStrictError()) { - state->context_->step_ = Steps::COMPLETE; - state->step_ = Steps::PARTIAL_COMPLETION; state->context_->responder_->Finish(state->status_, state); // Mark error for this stream state->context_->MarkGRPCStrictError(); - IssueRequestCancellation(); + // Fix Me : Last argument not sure for HandleCancellation + state->context_->HandleCancellation(state, true, "grpc_strict_name"); } } // Increments the ongoing request counter @@ -731,6 +730,7 @@ class InferHandlerState { } else { state->step_ = Steps::FINISH; } + LOG_VERBOSE(1) << "PutTaskBackToQueue inside HandleCompletion for " << state->unique_id_; PutTaskBackToQueue(state); } step_ = Steps::FINISH; @@ -808,6 +808,7 @@ class InferHandlerState { // The RPC is complete and no callback will be invoked to retrieve // the object. Hence, need to explicitly place the state on the // completion queue. + LOG_VERBOSE(1) << "PutTaskBackToQueue inside IssueRequestCancellation for " << state->unique_id_; PutTaskBackToQueue(state); } } @@ -845,7 +846,6 @@ class InferHandlerState { IssueRequestCancellation(); // Mark the context as cancelled state->context_->step_ = Steps::CANCELLED; - // The state returns true because the CancelExecution // call above would have raised alarm objects on all // pending inflight states objects. 
This state will @@ -1355,6 +1355,7 @@ InferHandler< LOG_VERBOSE(1) << "Received notification for " << Name() << ", " << state->unique_id_; } + LOG_VERBOSE(2) << "Inside Next " << state->unique_id_; LOG_VERBOSE(2) << "Grpc::CQ::Next() " << state->context_->DebugString(state); if (!Process(state, ok)) { diff --git a/src/grpc/stream_infer_handler.cc b/src/grpc/stream_infer_handler.cc index 85ec4d2c32..a3e79b749d 100644 --- a/src/grpc/stream_infer_handler.cc +++ b/src/grpc/stream_infer_handler.cc @@ -148,7 +148,7 @@ ModelStreamInferHandler::Process(InferHandler::State* state, bool rpc_ok) std::lock_guard lock(state->context_->mu_); // Check if stream error detected and already connection ended if (state->context_->IsGRPCStrictError()) { - return !finished; + return finished; } } if (state->context_->ReceivedNotification()) { @@ -653,7 +653,7 @@ ModelStreamInferHandler::StreamInferResponseComplete( // that state object can be released. if (is_complete) { state->step_ = Steps::CANCELLED; - state->context_->PutTaskBackToQueue(state); + state->context_->PutTaskBackToQueue(state); } state->complete_ = is_complete; @@ -683,7 +683,6 @@ ModelStreamInferHandler::StreamInferResponseComplete( LOG_ERROR << "expected the response allocator to have added the response"; } if (err != nullptr) { - LOG_VERBOSE(1) << "Error in CORE Response"; failed = true; ::grpc::Status status; // Converts CORE errors to GRPC error codes @@ -700,6 +699,10 @@ ModelStreamInferHandler::StreamInferResponseComplete( << status.error_code() << "Closing stream connection." << std::endl; state->context_->sendGRPCStrictResponse(state); + TRITONSERVER_ErrorDelete(err); + LOG_TRITONSERVER_ERROR( + TRITONSERVER_InferenceResponseDelete(iresponse), + "deleting GRPC inference response"); return; } } @@ -778,7 +781,7 @@ ModelStreamInferHandler::StreamInferResponseComplete( // that state object can be released. if (is_complete) { state->step_ = Steps::CANCELLED; - state->context_->PutTaskBackToQueue(state); + state->context_->PutTaskBackToQueue(state); } state->complete_ = is_complete; From cc34d418e4c97043c64fa7b3c4dc2e3ed001bb51 Mon Sep 17 00:00:00 2001 From: Indrajit Bhosale Date: Mon, 5 Aug 2024 23:31:25 -0700 Subject: [PATCH 11/32] Pre Commit fixed --- src/grpc/infer_handler.h | 7 ++++--- src/grpc/stream_infer_handler.cc | 4 ++-- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/src/grpc/infer_handler.h b/src/grpc/infer_handler.h index cb10efbdd2..dd17635f6d 100644 --- a/src/grpc/infer_handler.h +++ b/src/grpc/infer_handler.h @@ -693,7 +693,7 @@ class InferHandlerState { state->context_->responder_->Finish(state->status_, state); // Mark error for this stream state->context_->MarkGRPCStrictError(); - // Fix Me : Last argument not sure for HandleCancellation + // Fix Me : Last argument not sure for HandleCancellation state->context_->HandleCancellation(state, true, "grpc_strict_name"); } } @@ -730,7 +730,6 @@ class InferHandlerState { } else { state->step_ = Steps::FINISH; } - LOG_VERBOSE(1) << "PutTaskBackToQueue inside HandleCompletion for " << state->unique_id_; PutTaskBackToQueue(state); } step_ = Steps::FINISH; @@ -808,7 +807,9 @@ class InferHandlerState { // The RPC is complete and no callback will be invoked to retrieve // the object. Hence, need to explicitly place the state on the // completion queue. 
- LOG_VERBOSE(1) << "PutTaskBackToQueue inside IssueRequestCancellation for " << state->unique_id_; + LOG_VERBOSE(1) + << "PutTaskBackToQueue inside IssueRequestCancellation for " + << state->unique_id_; PutTaskBackToQueue(state); } } diff --git a/src/grpc/stream_infer_handler.cc b/src/grpc/stream_infer_handler.cc index a3e79b749d..8eed87cd16 100644 --- a/src/grpc/stream_infer_handler.cc +++ b/src/grpc/stream_infer_handler.cc @@ -653,7 +653,7 @@ ModelStreamInferHandler::StreamInferResponseComplete( // that state object can be released. if (is_complete) { state->step_ = Steps::CANCELLED; - state->context_->PutTaskBackToQueue(state); + state->context_->PutTaskBackToQueue(state); } state->complete_ = is_complete; @@ -781,7 +781,7 @@ ModelStreamInferHandler::StreamInferResponseComplete( // that state object can be released. if (is_complete) { state->step_ = Steps::CANCELLED; - state->context_->PutTaskBackToQueue(state); + state->context_->PutTaskBackToQueue(state); } state->complete_ = is_complete; From 40344d58dcbae207a9ff8a625521ddf84302571b Mon Sep 17 00:00:00 2001 From: Indrajit Bhosale Date: Wed, 7 Aug 2024 00:26:54 -0700 Subject: [PATCH 12/32] Review Comments cleaned up, crash fixed in multi threading --- qa/L0_backend_python/lifecycle/my_test.py | 60 ------------------- .../execute_grpc_error/config.pbtxt | 1 - src/grpc/infer_handler.cc | 3 +- src/grpc/infer_handler.h | 52 ++++++++-------- src/grpc/stream_infer_handler.cc | 8 +-- 5 files changed, 34 insertions(+), 90 deletions(-) delete mode 100644 qa/L0_backend_python/lifecycle/my_test.py diff --git a/qa/L0_backend_python/lifecycle/my_test.py b/qa/L0_backend_python/lifecycle/my_test.py deleted file mode 100644 index c4c1899e8f..0000000000 --- a/qa/L0_backend_python/lifecycle/my_test.py +++ /dev/null @@ -1,60 +0,0 @@ -import queue -from functools import partial - -import numpy as np -import tritonclient.grpc as grpcclient -from tritonclient.utils import * - - -class UserData: - def __init__(self): - self._completed_requests = queue.Queue() - - -def callback(user_data, result, error): - if error: - user_data._completed_requests.put(error) - else: - user_data._completed_requests.put(result) - - -def grpc_strict_error(): - model_name = "execute_error" - shape = [2, 2] - number_of_requests = 3 - user_data = UserData() - triton_server_url = "localhost:8001" # Replace with your Triton server address - - try: - triton_client = grpcclient.InferenceServerClient(triton_server_url) - metadata = {"grpc_strict": "true"} - - triton_client.start_stream( - callback=partial(callback, user_data), headers=metadata - ) - - input_datas = [] - for i in range(number_of_requests): - input_data = np.random.randn(*shape).astype(np.float32) - input_datas.append(input_data) - inputs = [ - grpcclient.InferInput( - "IN", input_data.shape, np_to_triton_dtype(input_data.dtype) - ) - ] - inputs[0].set_data_from_numpy(input_data) - triton_client.async_stream_infer(model_name=model_name, inputs=inputs) - result = user_data._completed_requests.get() - print(f"Request {i + 1} result:") - print(type(result)) - if type(result) == InferenceServerException: - print(result.status()) - - except Exception as e: - print(f"Error occurred: {str(e)}") - finally: - triton_client.stop_stream() - - -if __name__ == "__main__": - grpc_strict_error() diff --git a/qa/python_models/execute_grpc_error/config.pbtxt b/qa/python_models/execute_grpc_error/config.pbtxt index 3d364f3cc5..70e247148a 100644 --- a/qa/python_models/execute_grpc_error/config.pbtxt +++ 
b/qa/python_models/execute_grpc_error/config.pbtxt @@ -24,7 +24,6 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -name: "execute_grpc_error" backend: "python" max_batch_size: 64 diff --git a/src/grpc/infer_handler.cc b/src/grpc/infer_handler.cc index 35659f4900..d01e88ce94 100644 --- a/src/grpc/infer_handler.cc +++ b/src/grpc/infer_handler.cc @@ -720,7 +720,8 @@ ModelInferHandler::Process(InferHandler::State* state, bool rpc_ok) // single thread scenario. StartNewRequest(); } - bool resume = state->context_->HandleCancellation(state, rpc_ok, Name()); + bool resume = state->context_->HandleCancellation( + state, rpc_ok, Name(), false /* is_grpc_strict */); return resume; } diff --git a/src/grpc/infer_handler.h b/src/grpc/infer_handler.h index dd17635f6d..adf5bb53c4 100644 --- a/src/grpc/infer_handler.h +++ b/src/grpc/infer_handler.h @@ -684,7 +684,7 @@ class InferHandlerState { } } - void sendGRPCStrictResponse(InferHandlerStateType* state) + void SendGRPCStrictResponse(InferHandlerStateType* state) { std::lock_guard lock(state->context_->mu_); // Check if Error not responded previously @@ -694,7 +694,9 @@ class InferHandlerState { // Mark error for this stream state->context_->MarkGRPCStrictError(); // Fix Me : Last argument not sure for HandleCancellation - state->context_->HandleCancellation(state, true, "grpc_strict_name"); + state->context_->HandleCancellation( + state, true /* rpc_ok */, "grpc_strict_name", + true /* is_grpc_strict */); } } // Increments the ongoing request counter @@ -807,9 +809,6 @@ class InferHandlerState { // The RPC is complete and no callback will be invoked to retrieve // the object. Hence, need to explicitly place the state on the // completion queue. - LOG_VERBOSE(1) - << "PutTaskBackToQueue inside IssueRequestCancellation for " - << state->unique_id_; PutTaskBackToQueue(state); } } @@ -822,9 +821,11 @@ class InferHandlerState { // Returns whether or not to continue cycling through the gRPC // completion queue or not. 
bool HandleCancellation( - InferHandlerStateType* state, bool rpc_ok, const std::string& name) + InferHandlerStateType* state, bool rpc_ok, const std::string& name, + bool is_grpc_strict) { - if (!IsCancelled()) { + // Check to avoid early exit in case of grpc_strict + if (!IsCancelled() && !(is_grpc_strict)) { LOG_ERROR << "[INTERNAL] HandleCancellation called even when the context was " "not cancelled for " @@ -1349,23 +1350,26 @@ InferHandler< while (cq_->Next(&tag, &ok)) { State* state = static_cast(tag); - if (state->step_ == Steps::WAITING_NOTIFICATION) { - State* state_wrapper = state; - state = state_wrapper->state_ptr_; - state->context_->SetReceivedNotification(true); - LOG_VERBOSE(1) << "Received notification for " << Name() << ", " - << state->unique_id_; - } - LOG_VERBOSE(2) << "Inside Next " << state->unique_id_; - LOG_VERBOSE(2) << "Grpc::CQ::Next() " - << state->context_->DebugString(state); - if (!Process(state, ok)) { - LOG_VERBOSE(1) << "Done for " << Name() << ", " << state->unique_id_; - state->context_->EraseState(state); - StateRelease(state); - } else { - LOG_VERBOSE(2) << "Returning from " << Name() << ", " - << state->unique_id_ << ", " << state->step_; + // FIX ME : Ideally should not need this nullptr check, added to resolve + // crash is grpc_strict mode + if (state->context_ != nullptr) { + if (state->step_ == Steps::WAITING_NOTIFICATION) { + State* state_wrapper = state; + state = state_wrapper->state_ptr_; + state->context_->SetReceivedNotification(true); + LOG_VERBOSE(1) << "Received notification for " << Name() << ", " + << state->unique_id_; + } + LOG_VERBOSE(2) << "Grpc::CQ::Next() " + << state->context_->DebugString(state); + if (!Process(state, ok)) { + LOG_VERBOSE(1) << "Done for " << Name() << ", " << state->unique_id_; + state->context_->EraseState(state); + StateRelease(state); + } else { + LOG_VERBOSE(2) << "Returning from " << Name() << ", " + << state->unique_id_ << ", " << state->step_; + } } } })); diff --git a/src/grpc/stream_infer_handler.cc b/src/grpc/stream_infer_handler.cc index 8eed87cd16..80005f479c 100644 --- a/src/grpc/stream_infer_handler.cc +++ b/src/grpc/stream_infer_handler.cc @@ -148,13 +148,15 @@ ModelStreamInferHandler::Process(InferHandler::State* state, bool rpc_ok) std::lock_guard lock(state->context_->mu_); // Check if stream error detected and already connection ended if (state->context_->IsGRPCStrictError()) { + state->step_ = Steps::FINISH; return finished; } } if (state->context_->ReceivedNotification()) { std::lock_guard lock(state->step_mtx_); if (state->IsGrpcContextCancelled()) { - bool resume = state->context_->HandleCancellation(state, rpc_ok, Name()); + bool resume = state->context_->HandleCancellation( + state, rpc_ok, Name(), false /* is_grpc_strict */); return resume; } else { if (state->context_->HandleCompletion()) { @@ -691,14 +693,12 @@ ModelStreamInferHandler::StreamInferResponseComplete( response->set_error_message(status.error_message()); LOG_VERBOSE(1) << "Failed for ID: " << log_request_id << std::endl; if (state->context_->grpc_strict_) { - // Set to finish - // std::lock_guard lock(state->grpc_strict_mtx_); state->status_ = status; // Finish only once, if backend ignores cancellation LOG_VERBOSE(1) << "GRPC streaming error detected with status: " << status.error_code() << "Closing stream connection." 
<< std::endl; - state->context_->sendGRPCStrictResponse(state); + state->context_->SendGRPCStrictResponse(state); TRITONSERVER_ErrorDelete(err); LOG_TRITONSERVER_ERROR( TRITONSERVER_InferenceResponseDelete(iresponse), From f40f695e67e554b58153d392516062bd00c7bcda Mon Sep 17 00:00:00 2001 From: Indrajit Bhosale Date: Wed, 7 Aug 2024 10:58:19 -0700 Subject: [PATCH 13/32] Review Comments fixed --- .../lifecycle/lifecycle_test.py | 6 ++-- src/grpc/infer_handler.cc | 2 +- src/grpc/infer_handler.h | 31 ++++++++++++------- src/grpc/stream_infer_handler.cc | 8 ++--- 4 files changed, 27 insertions(+), 20 deletions(-) diff --git a/qa/L0_backend_python/lifecycle/lifecycle_test.py b/qa/L0_backend_python/lifecycle/lifecycle_test.py index 6997666090..0ea1e6d4e6 100755 --- a/qa/L0_backend_python/lifecycle/lifecycle_test.py +++ b/qa/L0_backend_python/lifecycle/lifecycle_test.py @@ -241,13 +241,13 @@ def test_infer_pymodel_error(self): initial_metrics_value, ) - def test_grpc_strict_error_on(self): + def test_triton_grpc_error_error_on(self): model_name = "execute_grpc_error" shape = [2, 2] number_of_requests = 2 user_data = UserData() triton_client = grpcclient.InferenceServerClient(f"{_tritonserver_ipaddr}:8001") - metadata = {"grpc_strict": "true"} + metadata = {"triton_grpc_error": "true"} triton_client.start_stream( callback=partial(callback, user_data), headers=metadata ) @@ -274,7 +274,7 @@ def test_grpc_strict_error_on(self): self.assertIsInstance(result, InferenceServerException) self.assertEqual(str(result.status()), "StatusCode.INTERNAL") - def test_grpc_strict_error_off(self): + def test_triton_grpc_error_error_off(self): model_name = "execute_grpc_error" shape = [2, 2] number_of_requests = 4 diff --git a/src/grpc/infer_handler.cc b/src/grpc/infer_handler.cc index d01e88ce94..c45b565a88 100644 --- a/src/grpc/infer_handler.cc +++ b/src/grpc/infer_handler.cc @@ -721,7 +721,7 @@ ModelInferHandler::Process(InferHandler::State* state, bool rpc_ok) StartNewRequest(); } bool resume = state->context_->HandleCancellation( - state, rpc_ok, Name(), false /* is_grpc_strict */); + state, rpc_ok, Name(), false /* is_triton_grpc_error */); return resume; } diff --git a/src/grpc/infer_handler.h b/src/grpc/infer_handler.h index adf5bb53c4..8192ab568c 100644 --- a/src/grpc/infer_handler.h +++ b/src/grpc/infer_handler.h @@ -642,7 +642,7 @@ class InferHandlerState { ::grpc::ServerCompletionQueue* cq, const uint64_t unique_id = 0) : cq_(cq), unique_id_(unique_id), ongoing_requests_(0), step_(Steps::START), finish_ok_(true), ongoing_write_(false), - received_notification_(false), grpc_strict_(false), + received_notification_(false), triton_grpc_error_(false), grpc_stream_error_state_(false) { ctx_.reset(new ::grpc::ServerContext()); @@ -676,10 +676,15 @@ class InferHandlerState { const auto& metadata = state->context_->ctx_->client_metadata(); for (const auto& pair : metadata) { auto& key = pair.first; + auto& value = pair.second; std::string param_key = std::string(key.begin(), key.end()); - std::string grpc_strict_key = "grpc_strict"; - if (param_key.compare(grpc_strict_key) == 0) { - state->context_->grpc_strict_ = true; + std::string value_key = std::string(value.begin(), value.end()); + std::string triton_grpc_error_key = "triton_grpc_error"; + if(param_key == triton_grpc_error_key) { + if(value_key == "true") { + LOG_VERBOSE(2) << "GRPC: triton_grpc_error mode detected in new grpc stream"; + state->context_->triton_grpc_error_ = true; + } } } } @@ -695,8 +700,8 @@ class InferHandlerState { 
state->context_->MarkGRPCStrictError(); // Fix Me : Last argument not sure for HandleCancellation state->context_->HandleCancellation( - state, true /* rpc_ok */, "grpc_strict_name", - true /* is_grpc_strict */); + state, true /* rpc_ok */, "triton_grpc_error_name", + true /* is_triton_grpc_error */); } } // Increments the ongoing request counter @@ -822,10 +827,10 @@ class InferHandlerState { // completion queue or not. bool HandleCancellation( InferHandlerStateType* state, bool rpc_ok, const std::string& name, - bool is_grpc_strict) + bool is_triton_grpc_error) { - // Check to avoid early exit in case of grpc_strict - if (!IsCancelled() && !(is_grpc_strict)) { + // Check to avoid early exit in case of triton_grpc_error + if (!IsCancelled() && !(is_triton_grpc_error)) { LOG_ERROR << "[INTERNAL] HandleCancellation called even when the context was " "not cancelled for " @@ -975,7 +980,7 @@ class InferHandlerState { // Marks error after it has been responded to void MarkGRPCStrictError() { grpc_stream_error_state_ = true; } - // Checks if error already responded to in grpc_strict mode + // Checks if error already responded to in triton_grpc_error mode bool IsGRPCStrictError() { return grpc_stream_error_state_; } // Return true if this context has completed all reads and writes. @@ -1038,10 +1043,12 @@ class InferHandlerState { bool received_notification_; // True if set by user via header - std::atomic grpc_strict_; + // Can be accessed without a lock, as set only once in startstream + std::atomic triton_grpc_error_; // True if stream already encountered error and closed connection // State maintained to avoid writes on closed stream + // Need to acquire lock before access std::atomic grpc_stream_error_state_; }; @@ -1351,7 +1358,7 @@ InferHandler< while (cq_->Next(&tag, &ok)) { State* state = static_cast(tag); // FIX ME : Ideally should not need this nullptr check, added to resolve - // crash is grpc_strict mode + // crash is triton_grpc_error mode if (state->context_ != nullptr) { if (state->step_ == Steps::WAITING_NOTIFICATION) { State* state_wrapper = state; diff --git a/src/grpc/stream_infer_handler.cc b/src/grpc/stream_infer_handler.cc index 80005f479c..5e1fc89d89 100644 --- a/src/grpc/stream_infer_handler.cc +++ b/src/grpc/stream_infer_handler.cc @@ -144,7 +144,7 @@ ModelStreamInferHandler::Process(InferHandler::State* state, bool rpc_ok) // because we launch an async thread that could update 'state's // step_ to be FINISH before this thread exits this function. 
bool finished = false; - if (state->context_->grpc_strict_) { + if (state->context_->triton_grpc_error_) { std::lock_guard lock(state->context_->mu_); // Check if stream error detected and already connection ended if (state->context_->IsGRPCStrictError()) { @@ -156,7 +156,7 @@ ModelStreamInferHandler::Process(InferHandler::State* state, bool rpc_ok) std::lock_guard lock(state->step_mtx_); if (state->IsGrpcContextCancelled()) { bool resume = state->context_->HandleCancellation( - state, rpc_ok, Name(), false /* is_grpc_strict */); + state, rpc_ok, Name(), false /* is_triton_grpc_error */); return resume; } else { if (state->context_->HandleCompletion()) { @@ -604,7 +604,7 @@ ModelStreamInferHandler::StreamInferResponseComplete( { State* state = reinterpret_cast(userp); // Ignore Response from CORE in case GRPC Strict as we dont care about - if (state->context_->grpc_strict_) { + if (state->context_->triton_grpc_error_) { std::lock_guard lock(state->context_->mu_); if (state->context_->IsGRPCStrictError()) { return; @@ -692,7 +692,7 @@ ModelStreamInferHandler::StreamInferResponseComplete( response->mutable_infer_response()->Clear(); response->set_error_message(status.error_message()); LOG_VERBOSE(1) << "Failed for ID: " << log_request_id << std::endl; - if (state->context_->grpc_strict_) { + if (state->context_->triton_grpc_error_) { state->status_ = status; // Finish only once, if backend ignores cancellation LOG_VERBOSE(1) << "GRPC streaming error detected with status: " From 0792bc1d1d0e8d143c322cdabd4a3f7f01a92c81 Mon Sep 17 00:00:00 2001 From: Indrajit Bhosale Date: Wed, 7 Aug 2024 11:25:49 -0700 Subject: [PATCH 14/32] Pre-Commit fixed --- src/grpc/infer_handler.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/grpc/infer_handler.h b/src/grpc/infer_handler.h index 8192ab568c..acd20609b8 100644 --- a/src/grpc/infer_handler.h +++ b/src/grpc/infer_handler.h @@ -680,9 +680,10 @@ class InferHandlerState { std::string param_key = std::string(key.begin(), key.end()); std::string value_key = std::string(value.begin(), value.end()); std::string triton_grpc_error_key = "triton_grpc_error"; - if(param_key == triton_grpc_error_key) { - if(value_key == "true") { - LOG_VERBOSE(2) << "GRPC: triton_grpc_error mode detected in new grpc stream"; + if (param_key == triton_grpc_error_key) { + if (value_key == "true") { + LOG_VERBOSE(2) + << "GRPC: triton_grpc_error mode detected in new grpc stream"; state->context_->triton_grpc_error_ = true; } } From cdd60bf38a52e3a4f0de7f7f1ee7144cb9e81a67 Mon Sep 17 00:00:00 2001 From: Indrajit Bhosale Date: Thu, 8 Aug 2024 10:35:29 -0700 Subject: [PATCH 15/32] Park --- src/grpc/infer_handler.h | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/src/grpc/infer_handler.h b/src/grpc/infer_handler.h index acd20609b8..adcabdfc78 100644 --- a/src/grpc/infer_handler.h +++ b/src/grpc/infer_handler.h @@ -696,6 +696,9 @@ class InferHandlerState { // Check if Error not responded previously // Avoid closing connection twice on multiple errors from core if (!state->context_->IsGRPCStrictError()) { + // check state object + // state->context_->step_ = Steps::COMPLETE; + // state->step_ = Steps::COMPLETE; state->context_->responder_->Finish(state->status_, state); // Mark error for this stream state->context_->MarkGRPCStrictError(); @@ -782,7 +785,7 @@ class InferHandlerState { // Issues the cancellation for all inflight requests // being tracked by this context. 
- void IssueRequestCancellation() + void IssueRequestCancellation(bool is_triton_grpc_error) { { std::lock_guard lock(mu_); @@ -815,7 +818,10 @@ class InferHandlerState { // The RPC is complete and no callback will be invoked to retrieve // the object. Hence, need to explicitly place the state on the // completion queue. - PutTaskBackToQueue(state); + // CHeck for writeready + if(!is_triton_grpc_error) { + PutTaskBackToQueue(state); + } } } } @@ -851,7 +857,7 @@ class InferHandlerState { // issue cancellation request to all the inflight // states belonging to the context. if (state->context_->step_ != Steps::CANCELLED) { - IssueRequestCancellation(); + IssueRequestCancellation(is_triton_grpc_error); // Mark the context as cancelled state->context_->step_ = Steps::CANCELLED; // The state returns true because the CancelExecution @@ -1378,7 +1384,7 @@ InferHandler< LOG_VERBOSE(2) << "Returning from " << Name() << ", " << state->unique_id_ << ", " << state->step_; } - } + } } })); From 6661f21ded4c16039bc972e47bf07bd8f6a67f88 Mon Sep 17 00:00:00 2001 From: Indrajit Bhosale Date: Thu, 8 Aug 2024 20:24:56 -0700 Subject: [PATCH 16/32] Simpler design piggyback on NotifywhenDone() --- src/grpc/grpc_utils.h | 11 +++++ src/grpc/infer_handler.h | 82 ++++++++++++++++++-------------- src/grpc/stream_infer_handler.cc | 8 ---- 3 files changed, 57 insertions(+), 44 deletions(-) diff --git a/src/grpc/grpc_utils.h b/src/grpc/grpc_utils.h index 898e4acb4f..9f1c54527a 100644 --- a/src/grpc/grpc_utils.h +++ b/src/grpc/grpc_utils.h @@ -76,6 +76,17 @@ typedef enum { PARTIAL_COMPLETION } Steps; +typedef enum { + // No error from CORE seen yet + NONE, + // Error from CORE encountered, waiting to be picked up by completion queue to + // initiate cancellation + ERROR_WAITING, + // Error from CORE encountered, stream closed + // This state is added to avoid double cancellation + ERROR_CANCELED +} Triton_grpc_error_steps; + // Debugging helper std::ostream& operator<<(std::ostream& out, const Steps& step); diff --git a/src/grpc/infer_handler.h b/src/grpc/infer_handler.h index adcabdfc78..1f87d4e62f 100644 --- a/src/grpc/infer_handler.h +++ b/src/grpc/infer_handler.h @@ -643,7 +643,7 @@ class InferHandlerState { : cq_(cq), unique_id_(unique_id), ongoing_requests_(0), step_(Steps::START), finish_ok_(true), ongoing_write_(false), received_notification_(false), triton_grpc_error_(false), - grpc_stream_error_state_(false) + grpc_stream_error_state_(Triton_grpc_error_steps::NONE) { ctx_.reset(new ::grpc::ServerContext()); responder_.reset(new ServerResponderType(ctx_.get())); @@ -665,9 +665,22 @@ class InferHandlerState { bool ReceivedNotification() { return received_notification_; } + // Returns true ONLY when GRPC_ERROR from CORE is waiting to be processed. + bool IsGRPCError() + { + if (grpc_stream_error_state_ == Triton_grpc_error_steps::ERROR_WAITING) { + // Change the state to ERROR_CANCELED as we have called + // HandleCancellation + grpc_stream_error_state_ = Triton_grpc_error_steps::ERROR_CANCELED; + return true; + } + return false; + } + bool IsCancelled() { - return received_notification_ ? ctx_->IsCancelled() : false; + return received_notification_ ? 
(ctx_->IsCancelled() || IsGRPCError()) + : false; } // Extracts headers from GRPC request and updates state @@ -696,16 +709,10 @@ class InferHandlerState { // Check if Error not responded previously // Avoid closing connection twice on multiple errors from core if (!state->context_->IsGRPCStrictError()) { - // check state object - // state->context_->step_ = Steps::COMPLETE; - // state->step_ = Steps::COMPLETE; + state->step_ = Steps::COMPLETE; state->context_->responder_->Finish(state->status_, state); // Mark error for this stream state->context_->MarkGRPCStrictError(); - // Fix Me : Last argument not sure for HandleCancellation - state->context_->HandleCancellation( - state, true /* rpc_ok */, "triton_grpc_error_name", - true /* is_triton_grpc_error */); } } // Increments the ongoing request counter @@ -819,9 +826,7 @@ class InferHandlerState { // the object. Hence, need to explicitly place the state on the // completion queue. // CHeck for writeready - if(!is_triton_grpc_error) { - PutTaskBackToQueue(state); - } + PutTaskBackToQueue(state); } } } @@ -985,10 +990,19 @@ class InferHandlerState { } // Marks error after it has been responded to - void MarkGRPCStrictError() { grpc_stream_error_state_ = true; } + void MarkGRPCStrictError() + { + grpc_stream_error_state_ = Triton_grpc_error_steps::ERROR_WAITING; + } // Checks if error already responded to in triton_grpc_error mode - bool IsGRPCStrictError() { return grpc_stream_error_state_; } + bool IsGRPCStrictError() + { + if (grpc_stream_error_state_ == Triton_grpc_error_steps::NONE) { + return false; + } + return true; + } // Return true if this context has completed all reads and writes. bool IsRequestsCompleted() @@ -1056,7 +1070,7 @@ class InferHandlerState { // True if stream already encountered error and closed connection // State maintained to avoid writes on closed stream // Need to acquire lock before access - std::atomic grpc_stream_error_state_; + int grpc_stream_error_state_; }; // This constructor is used to build a wrapper state object @@ -1364,27 +1378,23 @@ InferHandler< while (cq_->Next(&tag, &ok)) { State* state = static_cast(tag); - // FIX ME : Ideally should not need this nullptr check, added to resolve - // crash is triton_grpc_error mode - if (state->context_ != nullptr) { - if (state->step_ == Steps::WAITING_NOTIFICATION) { - State* state_wrapper = state; - state = state_wrapper->state_ptr_; - state->context_->SetReceivedNotification(true); - LOG_VERBOSE(1) << "Received notification for " << Name() << ", " - << state->unique_id_; - } - LOG_VERBOSE(2) << "Grpc::CQ::Next() " - << state->context_->DebugString(state); - if (!Process(state, ok)) { - LOG_VERBOSE(1) << "Done for " << Name() << ", " << state->unique_id_; - state->context_->EraseState(state); - StateRelease(state); - } else { - LOG_VERBOSE(2) << "Returning from " << Name() << ", " - << state->unique_id_ << ", " << state->step_; - } - } + if (state->step_ == Steps::WAITING_NOTIFICATION) { + State* state_wrapper = state; + state = state_wrapper->state_ptr_; + state->context_->SetReceivedNotification(true); + LOG_VERBOSE(1) << "Received notification for " << Name() << ", " + << state->unique_id_; + } + LOG_VERBOSE(2) << "Grpc::CQ::Next() " + << state->context_->DebugString(state); + if (!Process(state, ok)) { + LOG_VERBOSE(1) << "Done for " << Name() << ", " << state->unique_id_; + state->context_->EraseState(state); + StateRelease(state); + } else { + LOG_VERBOSE(2) << "Returning from " << Name() << ", " + << state->unique_id_ << ", " << state->step_; + } } 
})); diff --git a/src/grpc/stream_infer_handler.cc b/src/grpc/stream_infer_handler.cc index 5e1fc89d89..0fa2715841 100644 --- a/src/grpc/stream_infer_handler.cc +++ b/src/grpc/stream_infer_handler.cc @@ -144,14 +144,6 @@ ModelStreamInferHandler::Process(InferHandler::State* state, bool rpc_ok) // because we launch an async thread that could update 'state's // step_ to be FINISH before this thread exits this function. bool finished = false; - if (state->context_->triton_grpc_error_) { - std::lock_guard lock(state->context_->mu_); - // Check if stream error detected and already connection ended - if (state->context_->IsGRPCStrictError()) { - state->step_ = Steps::FINISH; - return finished; - } - } if (state->context_->ReceivedNotification()) { std::lock_guard lock(state->step_mtx_); if (state->IsGrpcContextCancelled()) { From 22e53592f45bb82a58b53c8ba285e6031ae68c92 Mon Sep 17 00:00:00 2001 From: Indrajit Bhosale Date: Fri, 9 Aug 2024 12:05:12 -0700 Subject: [PATCH 17/32] Cleanup unwanted states from old design --- src/grpc/infer_handler.cc | 3 +-- src/grpc/infer_handler.h | 32 ++++++++++++++++++-------------- src/grpc/stream_infer_handler.cc | 12 ++++++------ 3 files changed, 25 insertions(+), 22 deletions(-) diff --git a/src/grpc/infer_handler.cc b/src/grpc/infer_handler.cc index c45b565a88..35659f4900 100644 --- a/src/grpc/infer_handler.cc +++ b/src/grpc/infer_handler.cc @@ -720,8 +720,7 @@ ModelInferHandler::Process(InferHandler::State* state, bool rpc_ok) // single thread scenario. StartNewRequest(); } - bool resume = state->context_->HandleCancellation( - state, rpc_ok, Name(), false /* is_triton_grpc_error */); + bool resume = state->context_->HandleCancellation(state, rpc_ok, Name()); return resume; } diff --git a/src/grpc/infer_handler.h b/src/grpc/infer_handler.h index 1f87d4e62f..c93c3ba7eb 100644 --- a/src/grpc/infer_handler.h +++ b/src/grpc/infer_handler.h @@ -665,13 +665,19 @@ class InferHandlerState { bool ReceivedNotification() { return received_notification_; } + // Changes the state of grpc_stream_error_state_ to ERROR_CANCELED, + // indicating we have closed the stream and initiated the cancel flow + void SetGRPCErrorCancelled() + { + grpc_stream_error_state_ = Triton_grpc_error_steps::ERROR_CANCELED; + } // Returns true ONLY when GRPC_ERROR from CORE is waiting to be processed. - bool IsGRPCError() + bool CheckAndUpdateGRPCError() { if (grpc_stream_error_state_ == Triton_grpc_error_steps::ERROR_WAITING) { // Change the state to ERROR_CANCELED as we have called // HandleCancellation - grpc_stream_error_state_ = Triton_grpc_error_steps::ERROR_CANCELED; + SetGRPCErrorCancelled(); return true; } return false; @@ -679,8 +685,9 @@ class InferHandlerState { bool IsCancelled() { - return received_notification_ ? (ctx_->IsCancelled() || IsGRPCError()) - : false; + return received_notification_ + ? (ctx_->IsCancelled() || CheckAndUpdateGRPCError()) + : false; } // Extracts headers from GRPC request and updates state @@ -792,7 +799,7 @@ class InferHandlerState { // Issues the cancellation for all inflight requests // being tracked by this context. - void IssueRequestCancellation(bool is_triton_grpc_error) + void IssueRequestCancellation() { { std::lock_guard lock(mu_); @@ -825,7 +832,6 @@ class InferHandlerState { // The RPC is complete and no callback will be invoked to retrieve // the object. Hence, need to explicitly place the state on the // completion queue. 
- // CHeck for writeready PutTaskBackToQueue(state); } } @@ -838,11 +844,10 @@ class InferHandlerState { // Returns whether or not to continue cycling through the gRPC // completion queue or not. bool HandleCancellation( - InferHandlerStateType* state, bool rpc_ok, const std::string& name, - bool is_triton_grpc_error) + InferHandlerStateType* state, bool rpc_ok, const std::string& name) { // Check to avoid early exit in case of triton_grpc_error - if (!IsCancelled() && !(is_triton_grpc_error)) { + if (!IsCancelled()) { LOG_ERROR << "[INTERNAL] HandleCancellation called even when the context was " "not cancelled for " @@ -862,7 +867,7 @@ class InferHandlerState { // issue cancellation request to all the inflight // states belonging to the context. if (state->context_->step_ != Steps::CANCELLED) { - IssueRequestCancellation(is_triton_grpc_error); + IssueRequestCancellation(); // Mark the context as cancelled state->context_->step_ = Steps::CANCELLED; // The state returns true because the CancelExecution @@ -1067,10 +1072,9 @@ class InferHandlerState { // Can be accessed without a lock, as set only once in startstream std::atomic triton_grpc_error_; - // True if stream already encountered error and closed connection - // State maintained to avoid writes on closed stream - // Need to acquire lock before access - int grpc_stream_error_state_; + // Indicates the state of triton_grpc_error, only relevant if special + // triton_grpc_error feature set to true by client + Triton_grpc_error_steps grpc_stream_error_state_; }; // This constructor is used to build a wrapper state object diff --git a/src/grpc/stream_infer_handler.cc b/src/grpc/stream_infer_handler.cc index 0fa2715841..6a14ed9d4c 100644 --- a/src/grpc/stream_infer_handler.cc +++ b/src/grpc/stream_infer_handler.cc @@ -140,15 +140,10 @@ ModelStreamInferHandler::Process(InferHandler::State* state, bool rpc_ok) // This means that we only need to take care of // synchronizing this thread and the ResponseComplete // threads. - // We need an explicit finish indicator. Can't use 'state->step_' - // because we launch an async thread that could update 'state's - // step_ to be FINISH before this thread exits this function. - bool finished = false; if (state->context_->ReceivedNotification()) { std::lock_guard lock(state->step_mtx_); if (state->IsGrpcContextCancelled()) { - bool resume = state->context_->HandleCancellation( - state, rpc_ok, Name(), false /* is_triton_grpc_error */); + bool resume = state->context_->HandleCancellation(state, rpc_ok, Name()); return resume; } else { if (state->context_->HandleCompletion()) { @@ -161,6 +156,11 @@ ModelStreamInferHandler::Process(InferHandler::State* state, bool rpc_ok) << ", context " << state->context_->unique_id_ << ", " << state->unique_id_ << " step " << state->step_; + // We need an explicit finish indicator. Can't use 'state->step_' + // because we launch an async thread that could update 'state's + // step_ to be FINISH before this thread exits this function. + bool finished = false; + if (state->step_ == Steps::START) { // A new stream connection... If RPC failed on a new request then // the server is shutting down and so we should do nothing. 
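As a usage illustration (an editorial sketch, not part of any commit in this series): the commits above implement the triton_grpc_error streaming mode, and the next commit adds QA tests for it. A minimal client-side sketch of that flow follows; the model name execute_grpc_error, the localhost:8001 address, and the IN/OUT tensor names are assumptions carried over from the QA tests in this series, and the tritonclient calls mirror the ones those tests already use.

import queue
from functools import partial

import numpy as np
import tritonclient.grpc as grpcclient
from tritonclient.utils import InferenceServerException, np_to_triton_dtype


def callback(q, result, error):
    # The stream callback receives either a result or an InferenceServerException.
    q.put(error if error is not None else result)


results = queue.Queue()
client = grpcclient.InferenceServerClient("localhost:8001")

# Opting in to triton_grpc_error mode: on the first error returned by the model,
# the server finishes the stream with a real gRPC status code instead of an
# in-band error message, and further writes on the closed stream are suppressed.
client.start_stream(
    callback=partial(callback, results), headers={"triton_grpc_error": "true"}
)

data = np.random.randn(2, 2).astype(np.float32)
infer_input = grpcclient.InferInput("IN", data.shape, np_to_triton_dtype(data.dtype))
infer_input.set_data_from_numpy(data)
client.async_stream_infer(model_name="execute_grpc_error", inputs=[infer_input])

reply = results.get()
if isinstance(reply, InferenceServerException):
    # The server ended the stream; status() carries the mapped gRPC code,
    # e.g. StatusCode.INTERNAL for the intentional failure in execute_grpc_error.
    print(reply.status())
else:
    print(reply.as_numpy("OUT"))

client.stop_stream()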
From a31ba09119ebe645d0838cdac79c8beac4b6a4f4 Mon Sep 17 00:00:00 2001 From: Indrajit Bhosale Date: Mon, 12 Aug 2024 15:25:28 -0700 Subject: [PATCH 18/32] Improved tests around triton_grpc_error mode --- .../lifecycle/lifecycle_test.py | 110 +++++++++++++++--- 1 file changed, 93 insertions(+), 17 deletions(-) diff --git a/qa/L0_backend_python/lifecycle/lifecycle_test.py b/qa/L0_backend_python/lifecycle/lifecycle_test.py index 0ea1e6d4e6..6fda441767 100755 --- a/qa/L0_backend_python/lifecycle/lifecycle_test.py +++ b/qa/L0_backend_python/lifecycle/lifecycle_test.py @@ -35,6 +35,7 @@ sys.path.append("../../common") import queue +import threading import time import unittest from functools import partial @@ -241,6 +242,8 @@ def test_infer_pymodel_error(self): initial_metrics_value, ) + # Test grpc stream behavior when triton_grpc_error is set to true. + # Expected to close stream and return GRPC error when model returns error. def test_triton_grpc_error_error_on(self): model_name = "execute_grpc_error" shape = [2, 2] @@ -251,7 +254,7 @@ def test_triton_grpc_error_error_on(self): triton_client.start_stream( callback=partial(callback, user_data), headers=metadata ) - + stream_end = False with self._shm_leak_detector.Probe() as shm_probe: input_datas = [] for i in range(number_of_requests): @@ -263,21 +266,100 @@ def test_triton_grpc_error_error_on(self): ) ] inputs[0].set_data_from_numpy(input_data) + try: + triton_client.async_stream_infer( + model_name=model_name, inputs=inputs + ) + result = user_data._completed_requests.get() + if type(result) == InferenceServerException: + # execute_grpc_error intentionally returns error with StatusCode.INTERNAL status on 2nd request + self.assertEqual(str(result.status()), "StatusCode.INTERNAL") + stream_end = True + else: + # Stream is not killed + output_data = result.as_numpy("OUT") + self.assertIsNotNone(output_data, "error: expected 'OUT'") + except Exception as e: + if stream_end == True: + # We expect the stream to have closed + self.assertTrue( + True, + "This should always pass as cancellation should succeed", + ) + else: + self.assertFalse( + True, "Unexpected Stream killed without Error from CORE" + ) + + # Test grpc stream behavior when triton_grpc_error is set to true in multiple open streams. + # Expected to close stream and return GRPC error when model returns error. + def test_triton_grpc_error_multithreaded(self): + thread1 = threading.Thread(target=self.test_triton_grpc_error_error_on) + thread2 = threading.Thread(target=self.test_triton_grpc_error_error_on) + # Start the threads + thread1.start() + thread2.start() + # Wait for both threads to finish + thread1.join() + thread2.join() + + # Test grpc stream behavior when triton_grpc_error is set to true and subsequent stream is cancelled. + # Expected cancellation is successful. 
+ def test_triton_grpc_error_cancel(self): + model_name = "execute_grpc_error" + shape = [2, 2] + number_of_requests = 1 + user_data = UserData() + triton_server_url = "localhost:8001" # Replace with your Triton server address + stream_end = False + triton_client = grpcclient.InferenceServerClient(triton_server_url) + + metadata = {"triton_grpc_error": "true"} + + triton_client.start_stream( + callback=partial(callback, user_data), headers=metadata + ) + + input_datas = [] + for i in range(number_of_requests): + input_data = np.random.randn(*shape).astype(np.float32) + input_datas.append(input_data) + inputs = [ + grpcclient.InferInput( + "IN", input_data.shape, np_to_triton_dtype(input_data.dtype) + ) + ] + inputs[0].set_data_from_numpy(input_data) + try: triton_client.async_stream_infer(model_name=model_name, inputs=inputs) result = user_data._completed_requests.get() + if type(result) == InferenceServerException: + stream_end = True if i == 0: - # Stream is not killed - output_data = result.as_numpy("OUT") - self.assertIsNotNone(output_data, "error: expected 'OUT'") - elif i == 1: - # execute_grpc_error intentionally returns error with StatusCode.INTERNAL status on 2nd request - self.assertIsInstance(result, InferenceServerException) - self.assertEqual(str(result.status()), "StatusCode.INTERNAL") + triton_client.stop_stream(cancel_requests=True) + except Exception as e: + if stream_end == True: + # We expect the stream to have closed + self.assertTrue( + True, + "This should always pass as cancellation should succeed", + ) + else: + self.assertFalse( + True, "Unexpected Stream killed without Error from CORE" + ) + self.assertTrue( + True, + "This should always pass as cancellation should succeed without any exception", + ) + # Test grpc stream behavior when triton_grpc_error is set to false + # and subsequent stream is NOT closed when error is reported from CORE def test_triton_grpc_error_error_off(self): model_name = "execute_grpc_error" shape = [2, 2] number_of_requests = 4 + response_counter = 0 user_data = UserData() triton_client = grpcclient.InferenceServerClient(f"{_tritonserver_ipaddr}:8001") triton_client.start_stream(callback=partial(callback, user_data)) @@ -295,15 +377,9 @@ def test_triton_grpc_error_error_off(self): inputs[0].set_data_from_numpy(input_data) triton_client.async_stream_infer(model_name=model_name, inputs=inputs) result = user_data._completed_requests.get() - if i == 1 or i == 3: - # execute_grpc_error intentionally returns error with StatusCode.INTERNAL status on 2nd request - self.assertIsInstance(result, InferenceServerException) - # Existing Behaviour - self.assertEqual(str(result.status()), "None") - elif i == 0 or i == 2: - # Stream is not killed - output_data = result.as_numpy("OUT") - self.assertIsNotNone(output_data, "error: expected 'OUT'") + response_counter += 1 + # Expect stream to not CLOSE as NOT in triton_grpc_error mode + self.assertEqual(response_counter, number_of_requests) if __name__ == "__main__": From af451a2db7eb0611c654e480ef4021b059c6b37d Mon Sep 17 00:00:00 2001 From: Indrajit Bhosale Date: Wed, 14 Aug 2024 00:10:03 -0700 Subject: [PATCH 19/32] Comments resolved --- .../lifecycle/lifecycle_test.py | 94 +++++++++---------- src/grpc/grpc_utils.h | 6 +- src/grpc/infer_handler.h | 88 +++++++++-------- src/grpc/stream_infer_handler.cc | 4 +- 4 files changed, 93 insertions(+), 99 deletions(-) diff --git a/qa/L0_backend_python/lifecycle/lifecycle_test.py b/qa/L0_backend_python/lifecycle/lifecycle_test.py index 6fda441767..bd543d784e 100755 
--- a/qa/L0_backend_python/lifecycle/lifecycle_test.py +++ b/qa/L0_backend_python/lifecycle/lifecycle_test.py @@ -255,41 +255,38 @@ def test_triton_grpc_error_error_on(self): callback=partial(callback, user_data), headers=metadata ) stream_end = False - with self._shm_leak_detector.Probe() as shm_probe: - input_datas = [] - for i in range(number_of_requests): - input_data = np.random.randn(*shape).astype(np.float32) - input_datas.append(input_data) - inputs = [ - grpcclient.InferInput( - "IN", input_data.shape, np_to_triton_dtype(input_data.dtype) + input_datas = [] + for i in range(number_of_requests): + input_data = np.random.randn(*shape).astype(np.float32) + input_datas.append(input_data) + inputs = [ + grpcclient.InferInput( + "IN", input_data.shape, np_to_triton_dtype(input_data.dtype) + ) + ] + inputs[0].set_data_from_numpy(input_data) + try: + triton_client.async_stream_infer(model_name=model_name, inputs=inputs) + result = user_data._completed_requests.get() + if type(result) == InferenceServerException: + # execute_grpc_error intentionally returns error with StatusCode.INTERNAL status on 2nd request + self.assertEqual(str(result.status()), "StatusCode.INTERNAL") + stream_end = True + else: + # Stream is not killed + output_data = result.as_numpy("OUT") + self.assertIsNotNone(output_data, "error: expected 'OUT'") + except Exception as e: + if stream_end == True: + # We expect the stream to have closed + self.assertTrue( + True, + "This should always pass as cancellation should succeed", ) - ] - inputs[0].set_data_from_numpy(input_data) - try: - triton_client.async_stream_infer( - model_name=model_name, inputs=inputs + else: + self.assertFalse( + True, "Unexpected Stream killed without Error from CORE" ) - result = user_data._completed_requests.get() - if type(result) == InferenceServerException: - # execute_grpc_error intentionally returns error with StatusCode.INTERNAL status on 2nd request - self.assertEqual(str(result.status()), "StatusCode.INTERNAL") - stream_end = True - else: - # Stream is not killed - output_data = result.as_numpy("OUT") - self.assertIsNotNone(output_data, "error: expected 'OUT'") - except Exception as e: - if stream_end == True: - # We expect the stream to have closed - self.assertTrue( - True, - "This should always pass as cancellation should succeed", - ) - else: - self.assertFalse( - True, "Unexpected Stream killed without Error from CORE" - ) # Test grpc stream behavior when triton_grpc_error is set to true in multiple open streams. # Expected to close stream and return GRPC error when model returns error. 
@@ -363,22 +360,21 @@ def test_triton_grpc_error_error_off(self): user_data = UserData() triton_client = grpcclient.InferenceServerClient(f"{_tritonserver_ipaddr}:8001") triton_client.start_stream(callback=partial(callback, user_data)) - - with self._shm_leak_detector.Probe() as shm_probe: - input_datas = [] - for i in range(number_of_requests): - input_data = np.random.randn(*shape).astype(np.float32) - input_datas.append(input_data) - inputs = [ - grpcclient.InferInput( - "IN", input_data.shape, np_to_triton_dtype(input_data.dtype) - ) - ] - inputs[0].set_data_from_numpy(input_data) - triton_client.async_stream_infer(model_name=model_name, inputs=inputs) - result = user_data._completed_requests.get() - response_counter += 1 - # Expect stream to not CLOSE as NOT in triton_grpc_error mode + input_datas = [] + for i in range(number_of_requests): + input_data = np.random.randn(*shape).astype(np.float32) + input_datas.append(input_data) + inputs = [ + grpcclient.InferInput( + "IN", input_data.shape, np_to_triton_dtype(input_data.dtype) + ) + ] + inputs[0].set_data_from_numpy(input_data) + triton_client.async_stream_infer(model_name=model_name, inputs=inputs) + result = user_data._completed_requests.get() + response_counter += 1 + # we expect response_counter == number_of_requests, + # which indicates that after the first reported grpc error stream did NOT close and mode != triton_grpc_error self.assertEqual(response_counter, number_of_requests) diff --git a/src/grpc/grpc_utils.h b/src/grpc/grpc_utils.h index 9f1c54527a..17cea206d3 100644 --- a/src/grpc/grpc_utils.h +++ b/src/grpc/grpc_utils.h @@ -81,11 +81,11 @@ typedef enum { NONE, // Error from CORE encountered, waiting to be picked up by completion queue to // initiate cancellation - ERROR_WAITING, + ERROR_ENCOUNTERED, // Error from CORE encountered, stream closed // This state is added to avoid double cancellation - ERROR_CANCELED -} Triton_grpc_error_steps; + ERROR_HANDLING_COMPLETE +} TritonGRPCErrorSteps; // Debugging helper std::ostream& operator<<(std::ostream& out, const Steps& step); diff --git a/src/grpc/infer_handler.h b/src/grpc/infer_handler.h index c93c3ba7eb..9cfa9822df 100644 --- a/src/grpc/infer_handler.h +++ b/src/grpc/infer_handler.h @@ -643,7 +643,7 @@ class InferHandlerState { : cq_(cq), unique_id_(unique_id), ongoing_requests_(0), step_(Steps::START), finish_ok_(true), ongoing_write_(false), received_notification_(false), triton_grpc_error_(false), - grpc_stream_error_state_(Triton_grpc_error_steps::NONE) + grpc_stream_error_state_(TritonGRPCErrorSteps::NONE) { ctx_.reset(new ::grpc::ServerContext()); responder_.reset(new ServerResponderType(ctx_.get())); @@ -665,19 +665,19 @@ class InferHandlerState { bool ReceivedNotification() { return received_notification_; } - // Changes the state of grpc_stream_error_state_ to ERROR_CANCELED, + // Changes the state of grpc_stream_error_state_ to ERROR_HANDLING_COMPLETE, // indicating we have closed the stream and initiated the cancel flow - void SetGRPCErrorCancelled() + void MarkGRPCErrorHandlingComplete() { - grpc_stream_error_state_ = Triton_grpc_error_steps::ERROR_CANCELED; + grpc_stream_error_state_ = TritonGRPCErrorSteps::ERROR_HANDLING_COMPLETE; } // Returns true ONLY when GRPC_ERROR from CORE is waiting to be processed. 
bool CheckAndUpdateGRPCError() { - if (grpc_stream_error_state_ == Triton_grpc_error_steps::ERROR_WAITING) { - // Change the state to ERROR_CANCELED as we have called + if (grpc_stream_error_state_ == TritonGRPCErrorSteps::ERROR_ENCOUNTERED) { + // Change the state to ERROR_HANDLING_COMPLETE as we have called // HandleCancellation - SetGRPCErrorCancelled(); + MarkGRPCErrorHandlingComplete(); return true; } return false; @@ -689,39 +689,6 @@ class InferHandlerState { ? (ctx_->IsCancelled() || CheckAndUpdateGRPCError()) : false; } - - // Extracts headers from GRPC request and updates state - void ExtractStateFromHeaders(InferHandlerStateType* state) - { - const auto& metadata = state->context_->ctx_->client_metadata(); - for (const auto& pair : metadata) { - auto& key = pair.first; - auto& value = pair.second; - std::string param_key = std::string(key.begin(), key.end()); - std::string value_key = std::string(value.begin(), value.end()); - std::string triton_grpc_error_key = "triton_grpc_error"; - if (param_key == triton_grpc_error_key) { - if (value_key == "true") { - LOG_VERBOSE(2) - << "GRPC: triton_grpc_error mode detected in new grpc stream"; - state->context_->triton_grpc_error_ = true; - } - } - } - } - - void SendGRPCStrictResponse(InferHandlerStateType* state) - { - std::lock_guard lock(state->context_->mu_); - // Check if Error not responded previously - // Avoid closing connection twice on multiple errors from core - if (!state->context_->IsGRPCStrictError()) { - state->step_ = Steps::COMPLETE; - state->context_->responder_->Finish(state->status_, state); - // Mark error for this stream - state->context_->MarkGRPCStrictError(); - } - } // Increments the ongoing request counter void IncrementRequestCounter() { ongoing_requests_++; } @@ -763,6 +730,37 @@ class InferHandlerState { return false; } + // Extracts headers from GRPC request and updates state + void ExtractStateFromHeaders(InferHandlerStateType* state) + { + const auto& metadata = state->context_->ctx_->client_metadata(); + std::string triton_grpc_error_key = "triton_grpc_error"; + + auto it = metadata.find( + {triton_grpc_error_key.data(), triton_grpc_error_key.size()}); + + if (it != metadata.end()) { + if (it->second == "true") { + LOG_VERBOSE(2) + << "GRPC: triton_grpc_error mode detected in new grpc stream"; + triton_grpc_error_ = true; + } + } + } + + void WriteGRPCErrorResponse(InferHandlerStateType* state) + { + std::lock_guard lock(state->context_->mu_); + // Check if Error not responded previously + // Avoid closing connection twice on multiple errors from core + if (!state->context_->GRPCErrorEncountered()) { + state->step_ = Steps::COMPLETE; + state->context_->responder_->Finish(state->status_, state); + // Mark error for this stream + state->context_->MarkGRPCErrorEncountered(); + } + } + const std::string DebugString(InferHandlerStateType* state) { std::string debug_string(""); @@ -995,15 +993,15 @@ class InferHandlerState { } // Marks error after it has been responded to - void MarkGRPCStrictError() + void MarkGRPCErrorEncountered() { - grpc_stream_error_state_ = Triton_grpc_error_steps::ERROR_WAITING; + grpc_stream_error_state_ = TritonGRPCErrorSteps::ERROR_ENCOUNTERED; } // Checks if error already responded to in triton_grpc_error mode - bool IsGRPCStrictError() + bool GRPCErrorEncountered() { - if (grpc_stream_error_state_ == Triton_grpc_error_steps::NONE) { + if (grpc_stream_error_state_ == TritonGRPCErrorSteps::NONE) { return false; } return true; @@ -1074,7 +1072,7 @@ class InferHandlerState { // 
Indicates the state of triton_grpc_error, only relevant if special // triton_grpc_error feature set to true by client - Triton_grpc_error_steps grpc_stream_error_state_; + TritonGRPCErrorSteps grpc_stream_error_state_; }; // This constructor is used to build a wrapper state object diff --git a/src/grpc/stream_infer_handler.cc b/src/grpc/stream_infer_handler.cc index 6a14ed9d4c..836282060a 100644 --- a/src/grpc/stream_infer_handler.cc +++ b/src/grpc/stream_infer_handler.cc @@ -598,7 +598,7 @@ ModelStreamInferHandler::StreamInferResponseComplete( // Ignore Response from CORE in case GRPC Strict as we dont care about if (state->context_->triton_grpc_error_) { std::lock_guard lock(state->context_->mu_); - if (state->context_->IsGRPCStrictError()) { + if (state->context_->GRPCErrorEncountered()) { return; } } @@ -690,7 +690,7 @@ ModelStreamInferHandler::StreamInferResponseComplete( LOG_VERBOSE(1) << "GRPC streaming error detected with status: " << status.error_code() << "Closing stream connection." << std::endl; - state->context_->SendGRPCStrictResponse(state); + state->context_->WriteGRPCErrorResponse(state); TRITONSERVER_ErrorDelete(err); LOG_TRITONSERVER_ERROR( TRITONSERVER_InferenceResponseDelete(iresponse), From 0cb7db0ac2f397eee9ab9b1ec8352efc193226d5 Mon Sep 17 00:00:00 2001 From: Indrajit Bhosale Date: Wed, 14 Aug 2024 00:11:50 -0700 Subject: [PATCH 20/32] Comments resolved --- qa/L0_backend_python/lifecycle/lifecycle_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/qa/L0_backend_python/lifecycle/lifecycle_test.py b/qa/L0_backend_python/lifecycle/lifecycle_test.py index bd543d784e..3f1e3d62bb 100755 --- a/qa/L0_backend_python/lifecycle/lifecycle_test.py +++ b/qa/L0_backend_python/lifecycle/lifecycle_test.py @@ -373,7 +373,7 @@ def test_triton_grpc_error_error_off(self): triton_client.async_stream_infer(model_name=model_name, inputs=inputs) result = user_data._completed_requests.get() response_counter += 1 - # we expect response_counter == number_of_requests, + # we expect response_counter == number_of_requests, # which indicates that after the first reported grpc error stream did NOT close and mode != triton_grpc_error self.assertEqual(response_counter, number_of_requests) From 8ea26471441dcdabb9fc1b334469dafa8f90ce60 Mon Sep 17 00:00:00 2001 From: Indrajit Bhosale Date: Wed, 14 Aug 2024 10:32:54 -0700 Subject: [PATCH 21/32] New class gRPCErrorTracker created --- src/grpc/grpc_utils.h | 30 +++++++++++++++++- src/grpc/infer_handler.h | 54 +++++--------------------------- src/grpc/stream_infer_handler.cc | 44 ++++++++++++++++++++++++-- 3 files changed, 78 insertions(+), 50 deletions(-) diff --git a/src/grpc/grpc_utils.h b/src/grpc/grpc_utils.h index 17cea206d3..032dec3ad9 100644 --- a/src/grpc/grpc_utils.h +++ b/src/grpc/grpc_utils.h @@ -87,6 +87,35 @@ typedef enum { ERROR_HANDLING_COMPLETE } TritonGRPCErrorSteps; +class gRPCErrorTracker { + public: + // True if set by user via header + // Can be accessed without a lock, as set only once in startstream + std::atomic triton_grpc_error_; + + // Indicates the state of triton_grpc_error, only relevant if special + // triton_grpc_error feature set to true by client + TritonGRPCErrorSteps grpc_stream_error_state_; + + // Constructor + gRPCErrorTracker() + : triton_grpc_error_(false), + grpc_stream_error_state_(TritonGRPCErrorSteps::NONE) + { + } + // Changes the state of grpc_stream_error_state_ to ERROR_HANDLING_COMPLETE, + // indicating we have closed the stream and initiated the cancel flow + void 
MarkGRPCErrorHandlingComplete(); + + // Returns true ONLY when GRPC_ERROR from CORE is waiting to be processed. + bool CheckAndUpdateGRPCError(); + + // Marks error after it has been responded to + void MarkGRPCErrorEncountered(); + + // Checks if error already responded to in triton_grpc_error mode + bool GRPCErrorEncountered(); +}; // Debugging helper std::ostream& operator<<(std::ostream& out, const Steps& step); @@ -194,5 +223,4 @@ TRITONSERVER_Error* ParseClassificationParams( void ReadFile(const std::string& filename, std::string& data); - }}} // namespace triton::server::grpc diff --git a/src/grpc/infer_handler.h b/src/grpc/infer_handler.h index 9cfa9822df..6382c96c3c 100644 --- a/src/grpc/infer_handler.h +++ b/src/grpc/infer_handler.h @@ -642,11 +642,11 @@ class InferHandlerState { ::grpc::ServerCompletionQueue* cq, const uint64_t unique_id = 0) : cq_(cq), unique_id_(unique_id), ongoing_requests_(0), step_(Steps::START), finish_ok_(true), ongoing_write_(false), - received_notification_(false), triton_grpc_error_(false), - grpc_stream_error_state_(TritonGRPCErrorSteps::NONE) + received_notification_(false) { ctx_.reset(new ::grpc::ServerContext()); responder_.reset(new ServerResponderType(ctx_.get())); + gRPCErrorTracker_ = std::make_unique(); } void SetCompressionLevel(grpc_compression_level compression_level) @@ -665,28 +665,11 @@ class InferHandlerState { bool ReceivedNotification() { return received_notification_; } - // Changes the state of grpc_stream_error_state_ to ERROR_HANDLING_COMPLETE, - // indicating we have closed the stream and initiated the cancel flow - void MarkGRPCErrorHandlingComplete() - { - grpc_stream_error_state_ = TritonGRPCErrorSteps::ERROR_HANDLING_COMPLETE; - } - // Returns true ONLY when GRPC_ERROR from CORE is waiting to be processed. - bool CheckAndUpdateGRPCError() - { - if (grpc_stream_error_state_ == TritonGRPCErrorSteps::ERROR_ENCOUNTERED) { - // Change the state to ERROR_HANDLING_COMPLETE as we have called - // HandleCancellation - MarkGRPCErrorHandlingComplete(); - return true; - } - return false; - } - bool IsCancelled() { return received_notification_ - ? (ctx_->IsCancelled() || CheckAndUpdateGRPCError()) + ? 
(ctx_->IsCancelled() || + gRPCErrorTracker_->CheckAndUpdateGRPCError()) : false; } // Increments the ongoing request counter @@ -743,7 +726,7 @@ class InferHandlerState { if (it->second == "true") { LOG_VERBOSE(2) << "GRPC: triton_grpc_error mode detected in new grpc stream"; - triton_grpc_error_ = true; + state->context_->gRPCErrorTracker_->triton_grpc_error_ = true; } } } @@ -753,11 +736,11 @@ class InferHandlerState { std::lock_guard lock(state->context_->mu_); // Check if Error not responded previously // Avoid closing connection twice on multiple errors from core - if (!state->context_->GRPCErrorEncountered()) { + if (!state->context_->gRPCErrorTracker_->GRPCErrorEncountered()) { state->step_ = Steps::COMPLETE; state->context_->responder_->Finish(state->status_, state); // Mark error for this stream - state->context_->MarkGRPCErrorEncountered(); + state->context_->gRPCErrorTracker_->MarkGRPCErrorEncountered(); } } @@ -992,21 +975,6 @@ class InferHandlerState { return false; } - // Marks error after it has been responded to - void MarkGRPCErrorEncountered() - { - grpc_stream_error_state_ = TritonGRPCErrorSteps::ERROR_ENCOUNTERED; - } - - // Checks if error already responded to in triton_grpc_error mode - bool GRPCErrorEncountered() - { - if (grpc_stream_error_state_ == TritonGRPCErrorSteps::NONE) { - return false; - } - return true; - } - // Return true if this context has completed all reads and writes. bool IsRequestsCompleted() { @@ -1066,13 +1034,7 @@ class InferHandlerState { // completion queue. bool received_notification_; - // True if set by user via header - // Can be accessed without a lock, as set only once in startstream - std::atomic triton_grpc_error_; - - // Indicates the state of triton_grpc_error, only relevant if special - // triton_grpc_error feature set to true by client - TritonGRPCErrorSteps grpc_stream_error_state_; + std::unique_ptr gRPCErrorTracker_; }; // This constructor is used to build a wrapper state object diff --git a/src/grpc/stream_infer_handler.cc b/src/grpc/stream_infer_handler.cc index 836282060a..6651eca813 100644 --- a/src/grpc/stream_infer_handler.cc +++ b/src/grpc/stream_infer_handler.cc @@ -596,9 +596,9 @@ ModelStreamInferHandler::StreamInferResponseComplete( { State* state = reinterpret_cast(userp); // Ignore Response from CORE in case GRPC Strict as we dont care about - if (state->context_->triton_grpc_error_) { + if (state->context_->gRPCErrorTracker_->triton_grpc_error_) { std::lock_guard lock(state->context_->mu_); - if (state->context_->GRPCErrorEncountered()) { + if (state->context_->gRPCErrorTracker_->GRPCErrorEncountered()) { return; } } @@ -684,7 +684,7 @@ ModelStreamInferHandler::StreamInferResponseComplete( response->mutable_infer_response()->Clear(); response->set_error_message(status.error_message()); LOG_VERBOSE(1) << "Failed for ID: " << log_request_id << std::endl; - if (state->context_->triton_grpc_error_) { + if (state->context_->gRPCErrorTracker_->triton_grpc_error_) { state->status_ = status; // Finish only once, if backend ignores cancellation LOG_VERBOSE(1) << "GRPC streaming error detected with status: " @@ -820,4 +820,42 @@ ModelStreamInferHandler::StreamInferResponseComplete( } } +// Changes the state of grpc_stream_error_state_ to ERROR_HANDLING_COMPLETE, +// indicating we have closed the stream and initiated the cancel flow +void +gRPCErrorTracker::MarkGRPCErrorHandlingComplete() +{ + grpc_stream_error_state_ = TritonGRPCErrorSteps::ERROR_HANDLING_COMPLETE; +} + +// Returns true ONLY when GRPC_ERROR from CORE is 
waiting to be processed. +bool +gRPCErrorTracker::CheckAndUpdateGRPCError() +{ + if (grpc_stream_error_state_ == TritonGRPCErrorSteps::ERROR_ENCOUNTERED) { + // Change the state to ERROR_HANDLING_COMPLETE as we have called + // HandleCancellation + MarkGRPCErrorHandlingComplete(); + return true; + } + return false; +} + +// Marks error after it has been responded to +void +gRPCErrorTracker::MarkGRPCErrorEncountered() +{ + grpc_stream_error_state_ = TritonGRPCErrorSteps::ERROR_ENCOUNTERED; +} + +// Checks if error already responded to in triton_grpc_error mode +bool +gRPCErrorTracker::GRPCErrorEncountered() +{ + if (grpc_stream_error_state_ == TritonGRPCErrorSteps::NONE) { + return false; + } + return true; +} + }}} // namespace triton::server::grpc From e8c3242b5e902ab983bbac8d9fc2fef4f2fa16a0 Mon Sep 17 00:00:00 2001 From: Indrajit Bhosale Date: Wed, 14 Aug 2024 11:51:04 -0700 Subject: [PATCH 22/32] Docs Updated --- docs/customization_guide/inference_protocols.md | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/docs/customization_guide/inference_protocols.md b/docs/customization_guide/inference_protocols.md index 592f26e7d1..a241f097da 100644 --- a/docs/customization_guide/inference_protocols.md +++ b/docs/customization_guide/inference_protocols.md @@ -115,6 +115,16 @@ These options can be used to configure the KeepAlive settings: For client-side documentation, see [Client-Side GRPC KeepAlive](https://github.com/triton-inference-server/client/blob/main/README.md#grpc-keepalive). +#### GRPC Status Codes + +Triton implements GRPC error handling for streaming requests when a specific flag is enabled through headers. Upon encountering an error, Triton returns the appropriate GRPC error code and subsequently closes the stream. + +* `triton_grpc_error` : The header value needs to be set to true while starting the stream. + +GRPC status codes can be used for better visibility and monitoring. 
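+The snippet below is a minimal sketch of a streaming client that opts in to this
+behavior by passing the header when the stream is started. It assumes a server
+listening on `localhost:8001` and uses a placeholder model name (`repeat_int32`)
+and input tensor; substitute the inputs your own decoupled model expects.
+
+```python
+import queue
+from functools import partial
+
+import numpy as np
+import tritonclient.grpc as grpcclient
+from tritonclient.utils import InferenceServerException
+
+responses = queue.Queue()
+
+
+def callback(result_queue, result, error):
+    # On failure, `error` carries the gRPC status reported by Triton.
+    result_queue.put(error if error is not None else result)
+
+
+with grpcclient.InferenceServerClient(url="localhost:8001") as client:
+    # Opt in to gRPC error handling for this stream via the header.
+    client.start_stream(
+        callback=partial(callback, responses),
+        headers={"triton_grpc_error": "true"},
+    )
+
+    # Placeholder request; replace with the inputs your model expects.
+    inputs = [grpcclient.InferInput("IN", [1], "INT32")]
+    inputs[0].set_data_from_numpy(np.array([1], dtype=np.int32))
+    client.async_stream_infer(model_name="repeat_int32", inputs=inputs)
+
+    result = responses.get()
+    if isinstance(result, InferenceServerException):
+        # After the first error Triton closes the stream; no further
+        # responses are delivered for in-flight requests.
+        print("Stream closed with error:", result)
+```
+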
For more details, see [gRPC Status Codes](https://grpc.io/docs/guides/status-codes/) + +For client-side documentation, see [Client-Side GRPC Status Codes](https://github.com/triton-inference-server/client/tree/main#GRPC-Status-Codes) + ### Limit Endpoint Access (BETA) Triton users may want to restrict access to protocols or APIs that are From 72097e3ba52113f215207461493e905a7de75260 Mon Sep 17 00:00:00 2001 From: Indrajit Bhosale Date: Wed, 14 Aug 2024 17:05:33 -0700 Subject: [PATCH 23/32] Pipeline test --- qa/L0_decoupled/decoupled_test.py | 16 +++++- qa/L0_decoupled/test.sh | 91 +++++++++++++++++-------------- 2 files changed, 64 insertions(+), 43 deletions(-) diff --git a/qa/L0_decoupled/decoupled_test.py b/qa/L0_decoupled/decoupled_test.py index 1f76f4845b..d7bc59f5c7 100755 --- a/qa/L0_decoupled/decoupled_test.py +++ b/qa/L0_decoupled/decoupled_test.py @@ -116,7 +116,13 @@ def _stream_infer_with_params( url="localhost:8001", verbose=True ) as triton_client: # Establish stream - triton_client.start_stream(callback=partial(callback, user_data)) + if "TRITONSERVER_GRPC_STATUS_FLAG" in os.environ: + metadata = {"triton_grpc_error": "true"} + triton_client.start_stream( + callback=partial(callback, user_data), headers=metadata + ) + else: + triton_client.start_stream(callback=partial(callback, user_data)) # Send specified many requests in parallel for i in range(request_count): time.sleep((request_delay / 1000)) @@ -175,7 +181,13 @@ def _stream_infer( url="localhost:8001", verbose=True ) as triton_client: # Establish stream - triton_client.start_stream(callback=partial(callback, user_data)) + if "TRITONSERVER_GRPC_STATUS_FLAG" in os.environ: + metadata = {"triton_grpc_error": "true"} + triton_client.start_stream( + callback=partial(callback, user_data), headers=metadata + ) + else: + triton_client.start_stream(callback=partial(callback, user_data)) # Send specified many requests in parallel for i in range(request_count): time.sleep((request_delay / 1000)) diff --git a/qa/L0_decoupled/test.sh b/qa/L0_decoupled/test.sh index 98ad134d8b..649e8e4545 100755 --- a/qa/L0_decoupled/test.sh +++ b/qa/L0_decoupled/test.sh @@ -55,51 +55,60 @@ source ../common/util.sh TRIALS="python custom" +GRPC_TRIALS="triton_grpc_error_true triton_grpc_error_false" + +for grpc_trial in $GRPC_TRIALS; do + for trial in $TRIALS; do + if [ $trial == "python" ]; then + MODELDIR=`pwd`/python_models + else + MODELDIR=`pwd`/models + fi -for trial in $TRIALS; do - if [ $trial == "python" ]; then - MODELDIR=`pwd`/python_models - else - MODELDIR=`pwd`/models - fi + if [ $grpc_trial == "triton_grpc_error_true" ]; then + export TRITONSERVER_GRPC_STATUS_FLAG=true + else + unset TRITONSERVER_GRPC_STATUS_FLAG + fi - SERVER_ARGS="--model-repository=$MODELDIR" - cp -r $DATADIR/libtorch_nobatch_int32_int32_int32 $MODELDIR/. - (cd $MODELDIR/libtorch_nobatch_int32_int32_int32 && \ - sed -i "s/dims:.*\[.*\]/dims: \[ 1 \]/g" config.pbtxt) + SERVER_ARGS="--model-repository=$MODELDIR" + cp -r $DATADIR/libtorch_nobatch_int32_int32_int32 $MODELDIR/. 
+ (cd $MODELDIR/libtorch_nobatch_int32_int32_int32 && \ + sed -i "s/dims:.*\[.*\]/dims: \[ 1 \]/g" config.pbtxt) - run_server - if [ "$SERVER_PID" == "0" ]; then - echo -e "\n***\n*** Failed to start $SERVER\n***" - cat $SERVER_LOG - exit 1 - fi + run_server + if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 + fi - for i in \ - test_one_to_none \ - test_one_to_one \ - test_one_to_many \ - test_no_streaming \ - test_response_order \ - test_wrong_shape; do - - echo "Test: $i" >>$CLIENT_LOG - set +e - python $DECOUPLED_TEST DecoupledTest.$i >>$CLIENT_LOG 2>&1 - if [ $? -ne 0 ]; then - echo -e "\n***\n*** Test $i Failed\n***" >>$CLIENT_LOG - echo -e "\n***\n*** Test $i Failed\n***" - RET=1 - else - check_test_results $TEST_RESULT_FILE 1 - if [ $? -ne 0 ]; then - cat $CLIENT_LOG - echo -e "\n***\n*** Test Result Verification Failed\n***" - RET=1 - fi - fi - set -e - done + for i in \ + test_one_to_none \ + test_one_to_one \ + test_one_to_many \ + test_no_streaming \ + test_response_order \ + test_wrong_shape; do + + echo "Test: $i" >>$CLIENT_LOG + set +e + python $DECOUPLED_TEST DecoupledTest.$i >>$CLIENT_LOG 2>&1 + if [ $? -ne 0 ]; then + echo -e "\n***\n*** Test $i Failed\n***" >>$CLIENT_LOG + echo -e "\n***\n*** Test $i Failed\n***" + RET=1 + else + check_test_results $TEST_RESULT_FILE 1 + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi + fi + set -e + done +done # Will delay the writing of each response by the specified many milliseconds. # This will ensure that there are multiple responses available to be written. From 350af2593b62f9f7fe9a3a54ba6615a6bf200cfe Mon Sep 17 00:00:00 2001 From: Indrajit Bhosale Date: Thu, 15 Aug 2024 16:16:51 -0700 Subject: [PATCH 24/32] Resolve Unused local variable warning --- qa/L0_backend_python/lifecycle/lifecycle_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/qa/L0_backend_python/lifecycle/lifecycle_test.py b/qa/L0_backend_python/lifecycle/lifecycle_test.py index 3f1e3d62bb..607726b961 100755 --- a/qa/L0_backend_python/lifecycle/lifecycle_test.py +++ b/qa/L0_backend_python/lifecycle/lifecycle_test.py @@ -371,7 +371,7 @@ def test_triton_grpc_error_error_off(self): ] inputs[0].set_data_from_numpy(input_data) triton_client.async_stream_infer(model_name=model_name, inputs=inputs) - result = user_data._completed_requests.get() + _ = user_data._completed_requests.get() response_counter += 1 # we expect response_counter == number_of_requests, # which indicates that after the first reported grpc error stream did NOT close and mode != triton_grpc_error From e473f29c1b917cfb8a5a0787bd7f56d09d545c29 Mon Sep 17 00:00:00 2001 From: Indrajit Bhosale Date: Thu, 15 Aug 2024 18:24:26 -0700 Subject: [PATCH 25/32] GRPC Cleanup tests updated for triton grpc error --- qa/L0_grpc_state_cleanup/cleanup_test.py | 36 +++- qa/L0_grpc_state_cleanup/test.sh | 239 ++++++++++++----------- 2 files changed, 151 insertions(+), 124 deletions(-) diff --git a/qa/L0_grpc_state_cleanup/cleanup_test.py b/qa/L0_grpc_state_cleanup/cleanup_test.py index 431eeb1720..01c04cc66b 100755 --- a/qa/L0_grpc_state_cleanup/cleanup_test.py +++ b/qa/L0_grpc_state_cleanup/cleanup_test.py @@ -161,9 +161,15 @@ def _stream_infer_with_params( url="localhost:8001", verbose=True ) as triton_client: # Establish stream - triton_client.start_stream( - callback=partial(callback, user_data), stream_timeout=stream_timeout - ) + if "TRITONSERVER_GRPC_STATUS_FLAG" in 
os.environ: + metadata = {"triton_grpc_error": "true"} + triton_client.start_stream( + callback=partial(callback, user_data), stream_timeout=stream_timeout, headers=metadata + ) + else: + triton_client.start_stream( + callback=partial(callback, user_data), stream_timeout=stream_timeout + ) # Send specified many requests in parallel for i in range(request_count): time.sleep((request_delay / 1000)) @@ -229,9 +235,15 @@ def _stream_infer( url="localhost:8001", verbose=True ) as triton_client: # Establish stream - triton_client.start_stream( - callback=partial(callback, user_data), stream_timeout=stream_timeout - ) + if "TRITONSERVER_GRPC_STATUS_FLAG" in os.environ: + metadata = {"triton_grpc_error": "true"} + triton_client.start_stream( + callback=partial(callback, user_data), stream_timeout=stream_timeout, headers=metadata + ) + else: + triton_client.start_stream( + callback=partial(callback, user_data), stream_timeout=stream_timeout + ) # Send specified many requests in parallel for i in range(request_count): time.sleep((request_delay / 1000)) @@ -608,9 +620,15 @@ def test_non_decoupled_streaming_multi_response(self): url="localhost:8001", verbose=True ) as client: # Establish stream - client.start_stream( - callback=partial(callback, user_data), stream_timeout=16 - ) + if "TRITONSERVER_GRPC_STATUS_FLAG" in os.environ: + metadata = {"triton_grpc_error": "true"} + client.start_stream( + callback=partial(callback, user_data), stream_timeout=16, headers=metadata + ) + else: + client.start_stream( + callback=partial(callback, user_data), stream_timeout=16 + ) # Send a request client.async_stream_infer( model_name=self.repeat_non_decoupled_model_name, diff --git a/qa/L0_grpc_state_cleanup/test.sh b/qa/L0_grpc_state_cleanup/test.sh index df302d5ed1..49e11b47a8 100755 --- a/qa/L0_grpc_state_cleanup/test.sh +++ b/qa/L0_grpc_state_cleanup/test.sh @@ -79,89 +79,131 @@ rm -rf models/repeat_int32_non_decoupled && \ sed -i "/model_transaction_policy/,+2d" config.pbtxt && \ sed -i "s/repeat_int32/repeat_int32_non_decoupled/" config.pbtxt) -for i in test_simple_infer \ - test_simple_infer_cancellation \ - test_simple_infer_timeout \ - test_streaming_infer \ - test_streaming_timeout \ - test_streaming_cancellation \ - test_decoupled_infer \ - test_decoupled_cancellation \ - test_decoupled_timeout \ - test_non_decoupled_streaming_multi_response; do - SERVER_LOG="./inference_server.$i.log" - SERVER_ARGS="--model-repository=`pwd`/models --log-verbose=2" - run_server - if [ "$SERVER_PID" == "0" ]; then - echo -e "\n***\n*** Failed to start $SERVER\n***" - cat $SERVER_LOG - exit 1 - fi - - echo "Test: $i" >>$CLIENT_LOG +GRPC_TRIALS="triton_grpc_error_true triton_grpc_error_true" - set +e - python $CLEANUP_TEST CleanUpTest.$i >>$CLIENT_LOG 2>&1 - if [ $? -ne 0 ]; then - echo -e "\n***\n*** Test $i Failed\n***" >>$CLIENT_LOG - echo -e "\n***\n*** Test $i Failed\n***" - RET=1 +for grpc_trial in $GRPC_TRIALS; do + if [ $grpc_trial == "triton_grpc_error_true" ]; then + export TRITONSERVER_GRPC_STATUS_FLAG=true + else + unset TRITONSERVER_GRPC_STATUS_FLAG fi - - kill $SERVER_PID - wait $SERVER_PID - - check_state_release $SERVER_LOG - if [ $? 
-ne 0 ]; then - cat $SERVER_LOG - echo -e "\n***\n*** State Verification Failed for $i\n***" + for i in test_simple_infer \ + test_simple_infer_cancellation \ + test_simple_infer_timeout \ + test_streaming_infer \ + test_streaming_timeout \ + test_streaming_cancellation \ + test_decoupled_infer \ + test_decoupled_cancellation \ + test_decoupled_timeout \ + test_non_decoupled_streaming_multi_response; do + SERVER_LOG="./inference_server.$i.log" + SERVER_ARGS="--model-repository=`pwd`/models --log-verbose=2" + run_server + if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 + fi + + echo "Test: $i" >>$CLIENT_LOG + + set +e + python $CLEANUP_TEST CleanUpTest.$i >>$CLIENT_LOG 2>&1 + if [ $? -ne 0 ]; then + echo -e "\n***\n*** Test $i Failed\n***" >>$CLIENT_LOG + echo -e "\n***\n*** Test $i Failed\n***" RET=1 - fi - set -e -done - - -for i in test_simple_infer_error_status \ - test_streaming_error_status \ - test_decoupled_error_status; do - SERVER_LOG="./inference_server.$i.log" - SERVER_ARGS="--model-repository=`pwd`/models --log-verbose=2 --grpc-restricted-protocol=inference:infer-key=infer-value" - run_server - if [ "$SERVER_PID" == "0" ]; then - echo -e "\n***\n*** Failed to start $SERVER\n***" - cat $SERVER_LOG - exit 1 - fi - - echo "Test: $i" >>$CLIENT_LOG + fi + + kill $SERVER_PID + wait $SERVER_PID + + check_state_release $SERVER_LOG + if [ $? -ne 0 ]; then + cat $SERVER_LOG + echo -e "\n***\n*** State Verification Failed for $i\n***" + RET=1 + fi + set -e + done + + + for i in test_simple_infer_error_status \ + test_streaming_error_status \ + test_decoupled_error_status; do + SERVER_LOG="./inference_server.$i.log" + SERVER_ARGS="--model-repository=`pwd`/models --log-verbose=2 --grpc-restricted-protocol=inference:infer-key=infer-value" + run_server + if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 + fi + + echo "Test: $i" >>$CLIENT_LOG + + set +e + python $CLEANUP_TEST CleanUpTest.$i >>$CLIENT_LOG 2>&1 + if [ $? -ne 0 ]; then + echo -e "\n***\n*** Test $i Failed\n***" >>$CLIENT_LOG + echo -e "\n***\n*** Test $i Failed\n***" + RET=1 + fi + + kill $SERVER_PID + wait $SERVER_PID + + check_state_release $SERVER_LOG + if [ $? -ne 0 ]; then + cat $SERVER_LOG + echo -e "\n***\n*** State Verification Failed for $i\n***" + RET=1 + fi + + set -e + done + + for i in test_simple_infer_shutdownserver \ + test_streaming_infer_shutdownserver \ + test_decoupled_infer_shutdownserver \ + test_decoupled_infer_with_params_shutdownserver; do + SERVER_ARGS="--model-repository=`pwd`/models --log-verbose=2" + SERVER_LOG="./inference_server.$i.log" + run_server + if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 + fi + + echo "Test: $i" >>$CLIENT_LOG + + set +e + SERVER_PID=$SERVER_PID python $CLEANUP_TEST CleanUpTest.$i >>$CLIENT_LOG 2>&1 + if [ $? -ne 0 ]; then + echo -e "\n***\n*** Test $i Failed\n***" >>$CLIENT_LOG + echo -e "\n***\n*** Test $i Failed\n***" + RET=1 + fi - set +e - python $CLEANUP_TEST CleanUpTest.$i >>$CLIENT_LOG 2>&1 - if [ $? -ne 0 ]; then - echo -e "\n***\n*** Test $i Failed\n***" >>$CLIENT_LOG - echo -e "\n***\n*** Test $i Failed\n***" - RET=1 - fi + wait $SERVER_PID - kill $SERVER_PID - wait $SERVER_PID + check_state_release $SERVER_LOG + if [ $? -ne 0 ]; then + cat $SERVER_LOG + echo -e "\n***\n*** State Verification Failed for $i\n***" + RET=1 + fi - check_state_release $SERVER_LOG - if [ $? 
-ne 0 ]; then - cat $SERVER_LOG - echo -e "\n***\n*** State Verification Failed for $i\n***" - RET=1 - fi + set -e + done - set -e -done + TEST_NAME=test_decoupled_infer_complete + export TRITONSERVER_DELAY_GRPC_COMPLETE=2000 -for i in test_simple_infer_shutdownserver \ - test_streaming_infer_shutdownserver \ - test_decoupled_infer_shutdownserver \ - test_decoupled_infer_with_params_shutdownserver; do + SERVER_LOG="./inference_server.$TEST_NAME.log" SERVER_ARGS="--model-repository=`pwd`/models --log-verbose=2" - SERVER_LOG="./inference_server.$i.log" run_server if [ "$SERVER_PID" == "0" ]; then echo -e "\n***\n*** Failed to start $SERVER\n***" @@ -169,63 +211,30 @@ for i in test_simple_infer_shutdownserver \ exit 1 fi - echo "Test: $i" >>$CLIENT_LOG + echo "Test: $TEST_NAME" >>$CLIENT_LOG set +e - SERVER_PID=$SERVER_PID python $CLEANUP_TEST CleanUpTest.$i >>$CLIENT_LOG 2>&1 + + SERVER_LOG=$SERVER_LOG python $CLEANUP_TEST CleanUpTest.$TEST_NAME >>$CLIENT_LOG 2>&1 if [ $? -ne 0 ]; then - echo -e "\n***\n*** Test $i Failed\n***" >>$CLIENT_LOG - echo -e "\n***\n*** Test $i Failed\n***" + cat $CLIENT_LOG + echo -e "\n***\n*** Test $TEST_NAME Failed\n***" RET=1 fi + kill $SERVER_PID wait $SERVER_PID check_state_release $SERVER_LOG if [ $? -ne 0 ]; then cat $SERVER_LOG - echo -e "\n***\n*** State Verification Failed for $i\n***" - RET=1 + echo -e "\n***\n*** State Verification Failed for $TEST_NAME\n***" + RET=1 fi set -e done -TEST_NAME=test_decoupled_infer_complete -export TRITONSERVER_DELAY_GRPC_COMPLETE=2000 - -SERVER_LOG="./inference_server.$TEST_NAME.log" -SERVER_ARGS="--model-repository=`pwd`/models --log-verbose=2" -run_server -if [ "$SERVER_PID" == "0" ]; then - echo -e "\n***\n*** Failed to start $SERVER\n***" - cat $SERVER_LOG - exit 1 -fi - -echo "Test: $TEST_NAME" >>$CLIENT_LOG - -set +e - -SERVER_LOG=$SERVER_LOG python $CLEANUP_TEST CleanUpTest.$TEST_NAME >>$CLIENT_LOG 2>&1 -if [ $? -ne 0 ]; then - cat $CLIENT_LOG - echo -e "\n***\n*** Test $TEST_NAME Failed\n***" - RET=1 -fi - -kill $SERVER_PID -wait $SERVER_PID - -check_state_release $SERVER_LOG -if [ $? -ne 0 ]; then - cat $SERVER_LOG - echo -e "\n***\n*** State Verification Failed for $TEST_NAME\n***" - RET=1 -fi - -set -e - if [ $RET -eq 0 ]; then echo -e "\n***\n*** Test Passed\n***" else From 370c4493f2026d58601662f1feccf638ad70777a Mon Sep 17 00:00:00 2001 From: Indrajit Bhosale Date: Thu, 15 Aug 2024 23:24:17 -0700 Subject: [PATCH 26/32] Revert "GRPC Cleanup tests updated for triton grpc error" This reverts commit e473f29c1b917cfb8a5a0787bd7f56d09d545c29. 
--- qa/L0_grpc_state_cleanup/cleanup_test.py | 36 +--- qa/L0_grpc_state_cleanup/test.sh | 239 +++++++++++------------ 2 files changed, 124 insertions(+), 151 deletions(-) diff --git a/qa/L0_grpc_state_cleanup/cleanup_test.py b/qa/L0_grpc_state_cleanup/cleanup_test.py index 01c04cc66b..431eeb1720 100755 --- a/qa/L0_grpc_state_cleanup/cleanup_test.py +++ b/qa/L0_grpc_state_cleanup/cleanup_test.py @@ -161,15 +161,9 @@ def _stream_infer_with_params( url="localhost:8001", verbose=True ) as triton_client: # Establish stream - if "TRITONSERVER_GRPC_STATUS_FLAG" in os.environ: - metadata = {"triton_grpc_error": "true"} - triton_client.start_stream( - callback=partial(callback, user_data), stream_timeout=stream_timeout, headers=metadata - ) - else: - triton_client.start_stream( - callback=partial(callback, user_data), stream_timeout=stream_timeout - ) + triton_client.start_stream( + callback=partial(callback, user_data), stream_timeout=stream_timeout + ) # Send specified many requests in parallel for i in range(request_count): time.sleep((request_delay / 1000)) @@ -235,15 +229,9 @@ def _stream_infer( url="localhost:8001", verbose=True ) as triton_client: # Establish stream - if "TRITONSERVER_GRPC_STATUS_FLAG" in os.environ: - metadata = {"triton_grpc_error": "true"} - triton_client.start_stream( - callback=partial(callback, user_data), stream_timeout=stream_timeout, headers=metadata - ) - else: - triton_client.start_stream( - callback=partial(callback, user_data), stream_timeout=stream_timeout - ) + triton_client.start_stream( + callback=partial(callback, user_data), stream_timeout=stream_timeout + ) # Send specified many requests in parallel for i in range(request_count): time.sleep((request_delay / 1000)) @@ -620,15 +608,9 @@ def test_non_decoupled_streaming_multi_response(self): url="localhost:8001", verbose=True ) as client: # Establish stream - if "TRITONSERVER_GRPC_STATUS_FLAG" in os.environ: - metadata = {"triton_grpc_error": "true"} - client.start_stream( - callback=partial(callback, user_data), stream_timeout=16, headers=metadata - ) - else: - client.start_stream( - callback=partial(callback, user_data), stream_timeout=16 - ) + client.start_stream( + callback=partial(callback, user_data), stream_timeout=16 + ) # Send a request client.async_stream_infer( model_name=self.repeat_non_decoupled_model_name, diff --git a/qa/L0_grpc_state_cleanup/test.sh b/qa/L0_grpc_state_cleanup/test.sh index 49e11b47a8..df302d5ed1 100755 --- a/qa/L0_grpc_state_cleanup/test.sh +++ b/qa/L0_grpc_state_cleanup/test.sh @@ -79,131 +79,89 @@ rm -rf models/repeat_int32_non_decoupled && \ sed -i "/model_transaction_policy/,+2d" config.pbtxt && \ sed -i "s/repeat_int32/repeat_int32_non_decoupled/" config.pbtxt) -GRPC_TRIALS="triton_grpc_error_true triton_grpc_error_true" +for i in test_simple_infer \ + test_simple_infer_cancellation \ + test_simple_infer_timeout \ + test_streaming_infer \ + test_streaming_timeout \ + test_streaming_cancellation \ + test_decoupled_infer \ + test_decoupled_cancellation \ + test_decoupled_timeout \ + test_non_decoupled_streaming_multi_response; do + SERVER_LOG="./inference_server.$i.log" + SERVER_ARGS="--model-repository=`pwd`/models --log-verbose=2" + run_server + if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 + fi + + echo "Test: $i" >>$CLIENT_LOG -for grpc_trial in $GRPC_TRIALS; do - if [ $grpc_trial == "triton_grpc_error_true" ]; then - export TRITONSERVER_GRPC_STATUS_FLAG=true - else - unset TRITONSERVER_GRPC_STATUS_FLAG 
+ set +e + python $CLEANUP_TEST CleanUpTest.$i >>$CLIENT_LOG 2>&1 + if [ $? -ne 0 ]; then + echo -e "\n***\n*** Test $i Failed\n***" >>$CLIENT_LOG + echo -e "\n***\n*** Test $i Failed\n***" + RET=1 fi - for i in test_simple_infer \ - test_simple_infer_cancellation \ - test_simple_infer_timeout \ - test_streaming_infer \ - test_streaming_timeout \ - test_streaming_cancellation \ - test_decoupled_infer \ - test_decoupled_cancellation \ - test_decoupled_timeout \ - test_non_decoupled_streaming_multi_response; do - SERVER_LOG="./inference_server.$i.log" - SERVER_ARGS="--model-repository=`pwd`/models --log-verbose=2" - run_server - if [ "$SERVER_PID" == "0" ]; then - echo -e "\n***\n*** Failed to start $SERVER\n***" - cat $SERVER_LOG - exit 1 - fi - - echo "Test: $i" >>$CLIENT_LOG - - set +e - python $CLEANUP_TEST CleanUpTest.$i >>$CLIENT_LOG 2>&1 - if [ $? -ne 0 ]; then - echo -e "\n***\n*** Test $i Failed\n***" >>$CLIENT_LOG - echo -e "\n***\n*** Test $i Failed\n***" - RET=1 - fi - - kill $SERVER_PID - wait $SERVER_PID - - check_state_release $SERVER_LOG - if [ $? -ne 0 ]; then - cat $SERVER_LOG - echo -e "\n***\n*** State Verification Failed for $i\n***" - RET=1 - fi - set -e - done - - - for i in test_simple_infer_error_status \ - test_streaming_error_status \ - test_decoupled_error_status; do - SERVER_LOG="./inference_server.$i.log" - SERVER_ARGS="--model-repository=`pwd`/models --log-verbose=2 --grpc-restricted-protocol=inference:infer-key=infer-value" - run_server - if [ "$SERVER_PID" == "0" ]; then - echo -e "\n***\n*** Failed to start $SERVER\n***" - cat $SERVER_LOG - exit 1 - fi - - echo "Test: $i" >>$CLIENT_LOG - - set +e - python $CLEANUP_TEST CleanUpTest.$i >>$CLIENT_LOG 2>&1 - if [ $? -ne 0 ]; then - echo -e "\n***\n*** Test $i Failed\n***" >>$CLIENT_LOG - echo -e "\n***\n*** Test $i Failed\n***" - RET=1 - fi - - kill $SERVER_PID - wait $SERVER_PID - - check_state_release $SERVER_LOG - if [ $? -ne 0 ]; then - cat $SERVER_LOG - echo -e "\n***\n*** State Verification Failed for $i\n***" - RET=1 - fi - - set -e - done - - for i in test_simple_infer_shutdownserver \ - test_streaming_infer_shutdownserver \ - test_decoupled_infer_shutdownserver \ - test_decoupled_infer_with_params_shutdownserver; do - SERVER_ARGS="--model-repository=`pwd`/models --log-verbose=2" - SERVER_LOG="./inference_server.$i.log" - run_server - if [ "$SERVER_PID" == "0" ]; then - echo -e "\n***\n*** Failed to start $SERVER\n***" - cat $SERVER_LOG - exit 1 - fi - - echo "Test: $i" >>$CLIENT_LOG - - set +e - SERVER_PID=$SERVER_PID python $CLEANUP_TEST CleanUpTest.$i >>$CLIENT_LOG 2>&1 - if [ $? -ne 0 ]; then - echo -e "\n***\n*** Test $i Failed\n***" >>$CLIENT_LOG - echo -e "\n***\n*** Test $i Failed\n***" + + kill $SERVER_PID + wait $SERVER_PID + + check_state_release $SERVER_LOG + if [ $? -ne 0 ]; then + cat $SERVER_LOG + echo -e "\n***\n*** State Verification Failed for $i\n***" RET=1 - fi + fi + set -e +done + + +for i in test_simple_infer_error_status \ + test_streaming_error_status \ + test_decoupled_error_status; do + SERVER_LOG="./inference_server.$i.log" + SERVER_ARGS="--model-repository=`pwd`/models --log-verbose=2 --grpc-restricted-protocol=inference:infer-key=infer-value" + run_server + if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 + fi - wait $SERVER_PID + echo "Test: $i" >>$CLIENT_LOG - check_state_release $SERVER_LOG - if [ $? 
-ne 0 ]; then - cat $SERVER_LOG - echo -e "\n***\n*** State Verification Failed for $i\n***" - RET=1 - fi + set +e + python $CLEANUP_TEST CleanUpTest.$i >>$CLIENT_LOG 2>&1 + if [ $? -ne 0 ]; then + echo -e "\n***\n*** Test $i Failed\n***" >>$CLIENT_LOG + echo -e "\n***\n*** Test $i Failed\n***" + RET=1 + fi - set -e - done + kill $SERVER_PID + wait $SERVER_PID - TEST_NAME=test_decoupled_infer_complete - export TRITONSERVER_DELAY_GRPC_COMPLETE=2000 + check_state_release $SERVER_LOG + if [ $? -ne 0 ]; then + cat $SERVER_LOG + echo -e "\n***\n*** State Verification Failed for $i\n***" + RET=1 + fi + + set -e +done - SERVER_LOG="./inference_server.$TEST_NAME.log" +for i in test_simple_infer_shutdownserver \ + test_streaming_infer_shutdownserver \ + test_decoupled_infer_shutdownserver \ + test_decoupled_infer_with_params_shutdownserver; do SERVER_ARGS="--model-repository=`pwd`/models --log-verbose=2" + SERVER_LOG="./inference_server.$i.log" run_server if [ "$SERVER_PID" == "0" ]; then echo -e "\n***\n*** Failed to start $SERVER\n***" @@ -211,30 +169,63 @@ for grpc_trial in $GRPC_TRIALS; do exit 1 fi - echo "Test: $TEST_NAME" >>$CLIENT_LOG + echo "Test: $i" >>$CLIENT_LOG set +e - - SERVER_LOG=$SERVER_LOG python $CLEANUP_TEST CleanUpTest.$TEST_NAME >>$CLIENT_LOG 2>&1 + SERVER_PID=$SERVER_PID python $CLEANUP_TEST CleanUpTest.$i >>$CLIENT_LOG 2>&1 if [ $? -ne 0 ]; then - cat $CLIENT_LOG - echo -e "\n***\n*** Test $TEST_NAME Failed\n***" + echo -e "\n***\n*** Test $i Failed\n***" >>$CLIENT_LOG + echo -e "\n***\n*** Test $i Failed\n***" RET=1 fi - kill $SERVER_PID wait $SERVER_PID check_state_release $SERVER_LOG if [ $? -ne 0 ]; then cat $SERVER_LOG - echo -e "\n***\n*** State Verification Failed for $TEST_NAME\n***" - RET=1 + echo -e "\n***\n*** State Verification Failed for $i\n***" + RET=1 fi set -e done +TEST_NAME=test_decoupled_infer_complete +export TRITONSERVER_DELAY_GRPC_COMPLETE=2000 + +SERVER_LOG="./inference_server.$TEST_NAME.log" +SERVER_ARGS="--model-repository=`pwd`/models --log-verbose=2" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +echo "Test: $TEST_NAME" >>$CLIENT_LOG + +set +e + +SERVER_LOG=$SERVER_LOG python $CLEANUP_TEST CleanUpTest.$TEST_NAME >>$CLIENT_LOG 2>&1 +if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test $TEST_NAME Failed\n***" + RET=1 +fi + +kill $SERVER_PID +wait $SERVER_PID + +check_state_release $SERVER_LOG +if [ $? 
-ne 0 ]; then + cat $SERVER_LOG + echo -e "\n***\n*** State Verification Failed for $TEST_NAME\n***" + RET=1 +fi + +set -e + if [ $RET -eq 0 ]; then echo -e "\n***\n*** Test Passed\n***" else From b87c3fc00c921c18333de2b0ee0f7c217d830701 Mon Sep 17 00:00:00 2001 From: Indrajit Bhosale Date: Fri, 16 Aug 2024 01:51:14 -0700 Subject: [PATCH 27/32] GRPC Cleanup tests updated for triton grpc error --- Dockerfile.QA | 4 + qa/L0_decoupled/decoupled_test.py | 18 +- qa/L0_decoupled/test.sh | 93 ++- qa/L0_decoupled_grpc_error/decoupled_test.py | 649 ++++++++++++++++++ qa/L0_decoupled_grpc_error/test.sh | 179 +++++ .../cleanup_test.py | 642 +++++++++++++++++ qa/L0_grpc_error_state_cleanup/test.sh | 235 +++++++ 7 files changed, 1754 insertions(+), 66 deletions(-) create mode 100755 qa/L0_decoupled_grpc_error/decoupled_test.py create mode 100755 qa/L0_decoupled_grpc_error/test.sh create mode 100755 qa/L0_grpc_error_state_cleanup/cleanup_test.py create mode 100755 qa/L0_grpc_error_state_cleanup/test.sh diff --git a/Dockerfile.QA b/Dockerfile.QA index 2c43f735a5..a3073948c5 100644 --- a/Dockerfile.QA +++ b/Dockerfile.QA @@ -113,6 +113,8 @@ RUN mkdir -p qa/common && \ cp -r docs/examples/model_repository/inception_graphdef qa/L0_grpc/models && \ mkdir qa/L0_grpc_state_cleanup/models && \ cp -r /workspace/src/test/models/repeat_int32 qa/L0_grpc_state_cleanup/models/ && \ + mkdir qa/L0_grpc_error_state_cleanup/models && \ + cp -r /workspace/src/test/models/repeat_int32 qa/L0_grpc_error_state_cleanup/models/ && \ mkdir qa/L0_http/models && \ cp -r docs/examples/model_repository/simple qa/L0_http/models && \ cp -r docs/examples/model_repository/simple_dyna_sequence qa/L0_http/models && \ @@ -249,11 +251,13 @@ RUN mkdir -p qa/L0_decoupled/models/repeat_int32/1 && \ mkdir -p qa/L0_decoupled/models/repeat_square/1 && \ mkdir -p qa/L0_decoupled/models/nested_square/1 && \ mkdir -p qa/L0_grpc_state_cleanup/models/repeat_int32/1 + mkdir -p qa/L0_grpc_error_state_cleanup/models/repeat_int32/1 RUN if [ "$IGPU_BUILD" == "0" ]; then \ cp backends/repeat/libtriton_repeat.so qa/L0_model_config && \ cp backends/repeat/libtriton_repeat.so qa/L0_decoupled/models/repeat_int32/1 && \ cp backends/repeat/libtriton_repeat.so qa/L0_grpc_state_cleanup/models/repeat_int32/1/. && \ + cp backends/repeat/libtriton_repeat.so qa/L0_grpc_error_state_cleanup/models/repeat_int32/1/. 
&& \ cp backends/square/libtriton_square.so qa/L0_decoupled/models/square_int32/1; \ fi diff --git a/qa/L0_decoupled/decoupled_test.py b/qa/L0_decoupled/decoupled_test.py index d7bc59f5c7..d0f09deaf9 100755 --- a/qa/L0_decoupled/decoupled_test.py +++ b/qa/L0_decoupled/decoupled_test.py @@ -116,13 +116,7 @@ def _stream_infer_with_params( url="localhost:8001", verbose=True ) as triton_client: # Establish stream - if "TRITONSERVER_GRPC_STATUS_FLAG" in os.environ: - metadata = {"triton_grpc_error": "true"} - triton_client.start_stream( - callback=partial(callback, user_data), headers=metadata - ) - else: - triton_client.start_stream(callback=partial(callback, user_data)) + triton_client.start_stream(callback=partial(callback, user_data)) # Send specified many requests in parallel for i in range(request_count): time.sleep((request_delay / 1000)) @@ -181,13 +175,7 @@ def _stream_infer( url="localhost:8001", verbose=True ) as triton_client: # Establish stream - if "TRITONSERVER_GRPC_STATUS_FLAG" in os.environ: - metadata = {"triton_grpc_error": "true"} - triton_client.start_stream( - callback=partial(callback, user_data), headers=metadata - ) - else: - triton_client.start_stream(callback=partial(callback, user_data)) + triton_client.start_stream(callback=partial(callback, user_data)) # Send specified many requests in parallel for i in range(request_count): time.sleep((request_delay / 1000)) @@ -656,4 +644,4 @@ def test_http(self): if __name__ == "__main__": - unittest.main() + unittest.main() \ No newline at end of file diff --git a/qa/L0_decoupled/test.sh b/qa/L0_decoupled/test.sh index 649e8e4545..22c37dff49 100755 --- a/qa/L0_decoupled/test.sh +++ b/qa/L0_decoupled/test.sh @@ -55,60 +55,51 @@ source ../common/util.sh TRIALS="python custom" -GRPC_TRIALS="triton_grpc_error_true triton_grpc_error_false" - -for grpc_trial in $GRPC_TRIALS; do - for trial in $TRIALS; do - if [ $trial == "python" ]; then - MODELDIR=`pwd`/python_models - else - MODELDIR=`pwd`/models - fi - if [ $grpc_trial == "triton_grpc_error_true" ]; then - export TRITONSERVER_GRPC_STATUS_FLAG=true - else - unset TRITONSERVER_GRPC_STATUS_FLAG - fi +for trial in $TRIALS; do + if [ $trial == "python" ]; then + MODELDIR=`pwd`/python_models + else + MODELDIR=`pwd`/models + fi - SERVER_ARGS="--model-repository=$MODELDIR" - cp -r $DATADIR/libtorch_nobatch_int32_int32_int32 $MODELDIR/. - (cd $MODELDIR/libtorch_nobatch_int32_int32_int32 && \ - sed -i "s/dims:.*\[.*\]/dims: \[ 1 \]/g" config.pbtxt) + SERVER_ARGS="--model-repository=$MODELDIR" + cp -r $DATADIR/libtorch_nobatch_int32_int32_int32 $MODELDIR/. + (cd $MODELDIR/libtorch_nobatch_int32_int32_int32 && \ + sed -i "s/dims:.*\[.*\]/dims: \[ 1 \]/g" config.pbtxt) - run_server - if [ "$SERVER_PID" == "0" ]; then - echo -e "\n***\n*** Failed to start $SERVER\n***" - cat $SERVER_LOG - exit 1 - fi + run_server + if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 + fi - for i in \ - test_one_to_none \ - test_one_to_one \ - test_one_to_many \ - test_no_streaming \ - test_response_order \ - test_wrong_shape; do - - echo "Test: $i" >>$CLIENT_LOG - set +e - python $DECOUPLED_TEST DecoupledTest.$i >>$CLIENT_LOG 2>&1 - if [ $? -ne 0 ]; then - echo -e "\n***\n*** Test $i Failed\n***" >>$CLIENT_LOG - echo -e "\n***\n*** Test $i Failed\n***" - RET=1 - else - check_test_results $TEST_RESULT_FILE 1 - if [ $? 
-ne 0 ]; then - cat $CLIENT_LOG - echo -e "\n***\n*** Test Result Verification Failed\n***" - RET=1 - fi - fi - set -e - done -done + for i in \ + test_one_to_none \ + test_one_to_one \ + test_one_to_many \ + test_no_streaming \ + test_response_order \ + test_wrong_shape; do + + echo "Test: $i" >>$CLIENT_LOG + set +e + python $DECOUPLED_TEST DecoupledTest.$i >>$CLIENT_LOG 2>&1 + if [ $? -ne 0 ]; then + echo -e "\n***\n*** Test $i Failed\n***" >>$CLIENT_LOG + echo -e "\n***\n*** Test $i Failed\n***" + RET=1 + else + check_test_results $TEST_RESULT_FILE 1 + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi + fi + set -e + done # Will delay the writing of each response by the specified many milliseconds. # This will ensure that there are multiple responses available to be written. @@ -185,4 +176,4 @@ else echo -e "\n***\n*** Test Failed\n***" fi -exit $RET +exit $RET \ No newline at end of file diff --git a/qa/L0_decoupled_grpc_error/decoupled_test.py b/qa/L0_decoupled_grpc_error/decoupled_test.py new file mode 100755 index 0000000000..fc606e3cca --- /dev/null +++ b/qa/L0_decoupled_grpc_error/decoupled_test.py @@ -0,0 +1,649 @@ +#!/usr/bin/env python3 + +# Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
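+
+# This suite mirrors qa/L0_decoupled/decoupled_test.py, but every stream here is
+# opened with the "triton_grpc_error": "true" header, so server-side errors are
+# surfaced as gRPC status codes and the stream is closed on the first error.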
+ +import sys + +sys.path.append("../common") + +import os +import queue +import time +import unittest +from functools import partial + +import numpy as np +import test_util as tu +import tritonclient.grpc as grpcclient +import tritonclient.http as httpclient +from tritonclient.utils import InferenceServerException + + +class UserData: + def __init__(self): + self._response_queue = queue.Queue() + + +def callback(user_data, result, error): + if error: + user_data._response_queue.put(error) + else: + user_data._response_queue.put(result) + + +class DecoupledTest(tu.TestResultCollector): + def setUp(self): + self.trials_ = [ + ("repeat_int32", None), + ("simple_repeat", None), + ("sequence_repeat", None), + ("fan_repeat", self._fan_validate), + ("repeat_square", self._nested_validate), + ("nested_square", self._nested_validate), + ] + self.model_name_ = "repeat_int32" + + self.inputs_ = [] + self.inputs_.append(grpcclient.InferInput("IN", [1], "INT32")) + self.inputs_.append(grpcclient.InferInput("DELAY", [1], "UINT32")) + self.inputs_.append(grpcclient.InferInput("WAIT", [1], "UINT32")) + + self.outputs_ = [] + self.outputs_.append(grpcclient.InferRequestedOutput("OUT")) + self.outputs_.append(grpcclient.InferRequestedOutput("IDX")) + # Some trials only expect a subset of outputs + self.requested_outputs_ = self.outputs_ + + # Client can receive a "triton_final_response" response parameter + # from Triton server that indicates when a response is the final response for + # its request. + # + # For non-decoupled models, there is a 1:1 request:response ratio, so every + # response is the final response, and this parameter is unnecessary. + # + # For decoupled models, there is a 1:N request:response ratio, so there may be + # more one response before receiving the "final" response. + # + # However, decoupled models have the unique property in that they can return + # a flags-only response to the server to indicate completion, which is not + # returned to the client by default (See TRITONBACKEND_ResponseFactorySendFlags). + # + # To forward this flags-only response to the client, users must opt-in to this + # behavior by adding the following argument: + # client.async_stream_infer(..., enable_empty_final_response=True). + # + # If the decoupled backend/model always sends the final response flag along + # with a non-null response, no opt-in is needed. + # + # With this behavior, the client can programmatically detect when all responses + # for an individual request have been received without knowing the expected + # number of responses in advance and without closing the stream. + def _stream_infer_with_params( + self, + request_count, + request_delay, + _, + delay_data, + delay_factor, + user_data, + result_dict, + ): + with grpcclient.InferenceServerClient( + url="localhost:8001", verbose=True + ) as triton_client: + # Establish stream + metadata = {"triton_grpc_error": "true"} + triton_client.start_stream(callback=partial(callback, user_data), headers=metadata) + # Send specified many requests in parallel + for i in range(request_count): + time.sleep((request_delay / 1000)) + self.inputs_[1].set_data_from_numpy(delay_data) + triton_client.async_stream_infer( + model_name=self.model_name_, + inputs=self.inputs_, + request_id=str(i), + outputs=self.requested_outputs_, + # Opt-in to receiving flags-only responses from model/backend + # to help detect final responses for decoupled models. 
+ enable_empty_final_response=True, + ) + # Update delay input in accordance with the scaling factor + delay_data = delay_data * delay_factor + delay_data = delay_data.astype(np.uint32) + + # Retrieve results... + recv_count = 0 + completed_requests = 0 + while completed_requests < request_count: + data_item = user_data._response_queue.get() + if type(data_item) == InferenceServerException: + raise data_item + else: + response = data_item.get_response() + # Request IDs should generally be provided with each request + # to associate decoupled responses with their requests. + if not response.id: + raise ValueError( + "No response id found. Was a request_id provided?" + ) + + # Detect final response. Parameters are oneof and we expect bool_param + if response.parameters.get("triton_final_response").bool_param: + completed_requests += 1 + + # Only process non-empty response, ignore if empty (no outputs) + if response.outputs: + if response.id not in result_dict: + result_dict[response.id] = [] + result_dict[response.id].append((recv_count, data_item)) + recv_count += 1 + + def _stream_infer( + self, + request_count, + request_delay, + expected_count, + delay_data, + delay_factor, + user_data, + result_dict, + ): + with grpcclient.InferenceServerClient( + url="localhost:8001", verbose=True + ) as triton_client: + # Establish stream + metadata = {"triton_grpc_error": "true"} + triton_client.start_stream(callback=partial(callback, user_data), headers=metadata) + # Send specified many requests in parallel + for i in range(request_count): + time.sleep((request_delay / 1000)) + self.inputs_[1].set_data_from_numpy(delay_data) + triton_client.async_stream_infer( + model_name=self.model_name_, + inputs=self.inputs_, + request_id=str(i), + outputs=self.requested_outputs_, + ) + # Update delay input in accordance with the scaling factor + delay_data = delay_data * delay_factor + delay_data = delay_data.astype(np.uint32) + + # Retrieve results... 
+ recv_count = 0 + while recv_count < expected_count: + data_item = user_data._response_queue.get() + if type(data_item) == InferenceServerException: + raise data_item + else: + this_id = data_item.get_response().id + if this_id not in result_dict: + result_dict[this_id] = [] + result_dict[this_id].append((recv_count, data_item)) + + recv_count += 1 + + def _fan_validate(self, result_list, data_offset, repeat_count): + # fan_repeat returns "2 * data_offset" as result + self.assertEqual(len(result_list), repeat_count) + expected_data = 2 * data_offset + for j in range(len(result_list)): + this_data = result_list[j][1].as_numpy("OUT") + self.assertEqual(len(this_data), 1) + self.assertEqual(this_data[0], expected_data) + expected_data += 2 + + def _nested_validate(self, result_list, data_offset, repeat_count): + # if repeat model returns repeat result n, repeat_square-like model + # will return the same result n times + expected_len = sum(x for x in range(data_offset, data_offset + repeat_count)) + self.assertEqual(len(result_list), expected_len) + expected_data = data_offset + expected_count = expected_data + for j in range(len(result_list)): + this_data = result_list[j][1].as_numpy("OUT") + self.assertEqual(len(this_data), 1) + self.assertEqual(this_data[0], expected_data) + expected_count -= 1 + if expected_count == 0: + expected_data += 1 + expected_count = expected_data + + def _decoupled_infer( + self, + request_count, + request_delay=0, + repeat_count=1, + data_offset=100, + delay_time=1000, + delay_factor=1, + wait_time=500, + order_sequence=None, + validate_fn=None, + ): + # Initialize data for IN + input_data = np.arange( + start=data_offset, stop=data_offset + repeat_count, dtype=np.int32 + ) + self.inputs_[0].set_shape([repeat_count]) + self.inputs_[0].set_data_from_numpy(input_data) + + # Initialize data for DELAY + delay_data = (np.ones([repeat_count], dtype=np.uint32)) * delay_time + self.inputs_[1].set_shape([repeat_count]) + + # Initialize data for WAIT + wait_data = np.array([wait_time], dtype=np.uint32) + self.inputs_[2].set_data_from_numpy(wait_data) + + # use validate_fn to differentiate requested outputs + self.requested_outputs_ = ( + self.outputs_ if validate_fn is None else self.outputs_[0:1] + ) + + for infer_helper in [self._stream_infer, self._stream_infer_with_params]: + user_data = UserData() + result_dict = {} + + try: + if "square" not in self.model_name_: + expected_count = repeat_count * request_count + else: + expected_count = ( + sum(x for x in range(data_offset, data_offset + repeat_count)) + * request_count + ) + infer_helper( + request_count, + request_delay, + expected_count, + delay_data, + delay_factor, + user_data, + result_dict, + ) + except Exception as ex: + self.assertTrue(False, "unexpected error {}".format(ex)) + + # Validate the results.. 
+ for i in range(request_count): + this_id = str(i) + if repeat_count != 0 and this_id not in result_dict.keys(): + self.assertTrue( + False, "response for request id {} not received".format(this_id) + ) + elif repeat_count == 0 and this_id in result_dict.keys(): + self.assertTrue( + False, + "received unexpected response for request id {}".format( + this_id + ), + ) + if repeat_count != 0: + if validate_fn is None: + self.assertEqual(len(result_dict[this_id]), repeat_count) + expected_data = data_offset + result_list = result_dict[this_id] + for j in range(len(result_list)): + if order_sequence is not None: + self.assertEqual( + result_list[j][0], order_sequence[i][j] + ) + this_data = result_list[j][1].as_numpy("OUT") + self.assertEqual(len(this_data), 1) + self.assertEqual(this_data[0], expected_data) + this_idx = result_list[j][1].as_numpy("IDX") + self.assertEqual(len(this_idx), 1) + self.assertEqual(this_idx[0], j) + expected_data += 1 + else: + validate_fn(result_dict[this_id], data_offset, repeat_count) + + def test_one_to_none(self): + # Test cases where each request generates no response. + # Note the name of the test one_to_none implies the + # mapping between requests and responses. + + for trial in self.trials_: + self.model_name_ = trial[0] + # Single request case + self._decoupled_infer(request_count=1, repeat_count=0, validate_fn=trial[1]) + # Multiple request case + self._decoupled_infer(request_count=5, repeat_count=0, validate_fn=trial[1]) + + def test_one_to_one(self): + # Test cases where each request generates single response. + # Note the name of the test one_to_one implies the + # mapping between requests and responses. + + for trial in self.trials_: + self.model_name_ = trial[0] + # Single request case + # Release request before the response is delivered + self._decoupled_infer(request_count=1, wait_time=500, validate_fn=trial[1]) + # Release request after the response is delivered + self._decoupled_infer(request_count=1, wait_time=2000, validate_fn=trial[1]) + + # Multiple request case + # Release request before the response is delivered + self._decoupled_infer(request_count=5, wait_time=500, validate_fn=trial[1]) + # Release request after the response is delivered + self._decoupled_infer(request_count=5, wait_time=2000, validate_fn=trial[1]) + + def test_one_to_many(self): + # Test cases where each request generates multiple response. + # Note the name of the test one_to_many implies the + # mapping between requests and responses. 
+ + self.assertFalse("TRITONSERVER_DELAY_GRPC_RESPONSE" in os.environ) + + for trial in self.trials_: + self.model_name_ = trial[0] + # Single request case + # Release request before the first response is delivered + self._decoupled_infer( + request_count=1, repeat_count=5, wait_time=500, validate_fn=trial[1] + ) + # Release request when the responses are getting delivered + self._decoupled_infer( + request_count=1, repeat_count=5, wait_time=2000, validate_fn=trial[1] + ) + # Release request after all the responses are delivered + self._decoupled_infer( + request_count=1, repeat_count=5, wait_time=10000, validate_fn=trial[1] + ) + + # Multiple request case + # Release request before the first response is delivered + self._decoupled_infer( + request_count=5, repeat_count=5, wait_time=500, validate_fn=trial[1] + ) + # Release request when the responses are getting delivered + self._decoupled_infer( + request_count=5, repeat_count=5, wait_time=2000, validate_fn=trial[1] + ) + # Release request after all the responses are delivered + self._decoupled_infer( + request_count=5, repeat_count=5, wait_time=10000, validate_fn=trial[1] + ) + + def test_one_to_multi_many(self): + # Test cases where each request generates multiple response but the + # responses are delayed so as to stress the control path handling the + # queued responses. + + self.assertTrue("TRITONSERVER_DELAY_GRPC_RESPONSE" in os.environ) + + for trial in self.trials_: + self.model_name_ = trial[0] + # Single request case + # Release request before the first response is delivered + self._decoupled_infer( + request_count=1, repeat_count=5, wait_time=500, validate_fn=trial[1] + ) + # Release request when the responses are getting delivered + self._decoupled_infer( + request_count=1, repeat_count=5, wait_time=8000, validate_fn=trial[1] + ) + # Release request after all the responses are delivered + self._decoupled_infer( + request_count=1, repeat_count=5, wait_time=20000, validate_fn=trial[1] + ) + + # Multiple request case + # Release request before the first response is delivered + self._decoupled_infer( + request_count=5, repeat_count=5, wait_time=500, validate_fn=trial[1] + ) + # Release request when the responses are getting delivered + self._decoupled_infer( + request_count=5, repeat_count=5, wait_time=3000, validate_fn=trial[1] + ) + # Release request after all the responses are delivered + self._decoupled_infer( + request_count=5, repeat_count=5, wait_time=10000, validate_fn=trial[1] + ) + + def test_response_order(self): + # Test the expected response order for different cases + + self.assertFalse("TRITONSERVER_DELAY_GRPC_RESPONSE" in os.environ) + + for trial in self.trials_: + self.model_name_ = trial[0] + + # Case 1: Interleaved responses + self._decoupled_infer( + request_count=2, + request_delay=500, + repeat_count=4, + order_sequence=[[0, 2, 4, 6], [1, 3, 5, 7]], + validate_fn=trial[1], + ) + + # Case 2: All responses of second request delivered before any + # response from the first + self._decoupled_infer( + request_count=2, + request_delay=500, + repeat_count=4, + delay_time=2000, + delay_factor=0.1, + order_sequence=[[4, 5, 6, 7], [0, 1, 2, 3]], + validate_fn=trial[1], + ) + + # Case 3: Similar to Case 2, but the second request is generated + # after the first response from first request is received + self._decoupled_infer( + request_count=2, + request_delay=2500, + repeat_count=4, + delay_time=2000, + delay_factor=0.1, + order_sequence=[[0, 5, 6, 7], [1, 2, 3, 4]], + validate_fn=trial[1], + ) + + # Case 4: All the 
responses of second requests are dleivered after + # all the responses from first requests are received + self._decoupled_infer( + request_count=2, + request_delay=100, + repeat_count=4, + delay_time=500, + delay_factor=10, + order_sequence=[[0, 1, 2, 3], [4, 5, 6, 7]], + validate_fn=trial[1], + ) + + # Case 5: Similar to Case 4, but the second request is generated + # after the first response from the first request is received + self._decoupled_infer( + request_count=2, + request_delay=750, + repeat_count=4, + delay_time=500, + delay_factor=10, + order_sequence=[[0, 1, 2, 3], [4, 5, 6, 7]], + validate_fn=trial[1], + ) + + def _no_streaming_helper(self, protocol): + data_offset = 100 + repeat_count = 1 + delay_time = 1000 + wait_time = 2000 + + input_data = np.arange( + start=data_offset, stop=data_offset + repeat_count, dtype=np.int32 + ) + delay_data = (np.ones([repeat_count], dtype=np.uint32)) * delay_time + wait_data = np.array([wait_time], dtype=np.uint32) + + if protocol == "grpc": + # Use the inputs and outputs from the setUp + this_inputs = self.inputs_ + this_outputs = self.outputs_ + else: + this_inputs = [] + this_inputs.append(httpclient.InferInput("IN", [repeat_count], "INT32")) + this_inputs.append(httpclient.InferInput("DELAY", [1], "UINT32")) + this_inputs.append(httpclient.InferInput("WAIT", [1], "UINT32")) + this_outputs = [] + this_outputs.append(httpclient.InferRequestedOutput("OUT")) + + # Initialize data for IN + this_inputs[0].set_shape([repeat_count]) + this_inputs[0].set_data_from_numpy(input_data) + + # Initialize data for DELAY + this_inputs[1].set_shape([repeat_count]) + this_inputs[1].set_data_from_numpy(delay_data) + + # Initialize data for WAIT + this_inputs[2].set_data_from_numpy(wait_data) + + if protocol == "grpc": + triton_client = grpcclient.InferenceServerClient( + url="localhost:8001", verbose=True + ) + else: + triton_client = httpclient.InferenceServerClient( + url="localhost:8000", verbose=True + ) + + with self.assertRaises(InferenceServerException) as cm: + triton_client.infer( + model_name=self.model_name_, inputs=this_inputs, outputs=this_outputs + ) + + self.assertIn( + "doesn't support models with decoupled transaction policy", + str(cm.exception), + ) + + def test_no_streaming(self): + # Test cases with no streaming inference. Server should give + # appropriate error in such cases. + + self._no_streaming_helper("grpc") + self._no_streaming_helper("http") + + def test_wrong_shape(self): + # Sends mismatching shapes for IN and DELAY. Server should return + # appropriate error message. The shape of IN is [repeat_count], + # where as shape of DELAY is [repeat_count + 1]. 
+ + data_offset = 100 + repeat_count = 1 + delay_time = 1000 + wait_time = 2000 + + input_data = np.arange( + start=data_offset, stop=data_offset + repeat_count, dtype=np.int32 + ) + delay_data = (np.ones([repeat_count + 1], dtype=np.uint32)) * delay_time + wait_data = np.array([wait_time], dtype=np.uint32) + + # Initialize data for IN + self.inputs_[0].set_shape([repeat_count]) + self.inputs_[0].set_data_from_numpy(input_data) + + # Initialize data for DELAY + self.inputs_[1].set_shape([repeat_count + 1]) + self.inputs_[1].set_data_from_numpy(delay_data) + + # Initialize data for WAIT + self.inputs_[2].set_data_from_numpy(wait_data) + + user_data = UserData() + result_dict = {} + + with self.assertRaises(InferenceServerException) as cm: + self._stream_infer( + 1, 0, repeat_count, delay_data, 1, user_data, result_dict + ) + + self.assertIn( + "expected IN and DELAY shape to match, got [1] and [2]", str(cm.exception) + ) + + +class NonDecoupledTest(tu.TestResultCollector): + def setUp(self): + self.model_name_ = "repeat_int32" + self.input_data = { + "IN": np.array([1], dtype=np.int32), + "DELAY": np.array([0], dtype=np.uint32), + "WAIT": np.array([0], dtype=np.uint32), + } + + def test_grpc(self): + inputs = [ + grpcclient.InferInput("IN", [1], "INT32").set_data_from_numpy( + self.input_data["IN"] + ), + grpcclient.InferInput("DELAY", [1], "UINT32").set_data_from_numpy( + self.input_data["DELAY"] + ), + grpcclient.InferInput("WAIT", [1], "UINT32").set_data_from_numpy( + self.input_data["WAIT"] + ), + ] + + triton_client = grpcclient.InferenceServerClient( + url="localhost:8001", verbose=True + ) + # Expect the inference is successful + res = triton_client.infer(model_name=self.model_name_, inputs=inputs) + self.assertEqual(1, res.as_numpy("OUT")[0]) + self.assertEqual(0, res.as_numpy("IDX")[0]) + + def test_http(self): + inputs = [ + httpclient.InferInput("IN", [1], "INT32").set_data_from_numpy( + self.input_data["IN"] + ), + httpclient.InferInput("DELAY", [1], "UINT32").set_data_from_numpy( + self.input_data["DELAY"] + ), + httpclient.InferInput("WAIT", [1], "UINT32").set_data_from_numpy( + self.input_data["WAIT"] + ), + ] + + triton_client = httpclient.InferenceServerClient( + url="localhost:8000", verbose=True + ) + # Expect the inference is successful + res = triton_client.infer(model_name=self.model_name_, inputs=inputs) + self.assertEqual(1, res.as_numpy("OUT")[0]) + self.assertEqual(0, res.as_numpy("IDX")[0]) + + +if __name__ == "__main__": + unittest.main() \ No newline at end of file diff --git a/qa/L0_decoupled_grpc_error/test.sh b/qa/L0_decoupled_grpc_error/test.sh new file mode 100755 index 0000000000..4fba476b1d --- /dev/null +++ b/qa/L0_decoupled_grpc_error/test.sh @@ -0,0 +1,179 @@ +#!/bin/bash +# Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION} +if [ "$#" -ge 1 ]; then + REPO_VERSION=$1 +fi +if [ -z "$REPO_VERSION" ]; then + echo -e "Repository version must be specified" + echo -e "\n***\n*** Test Failed\n***" + exit 1 +fi +if [ ! -z "$TEST_REPO_ARCH" ]; then + REPO_VERSION=${REPO_VERSION}_${TEST_REPO_ARCH} +fi + +export CUDA_VISIBLE_DEVICES=0 + +RET=0 +TEST_RESULT_FILE='test_results.txt' +DECOUPLED_TEST=decoupled_test.py + +rm -f *.log + +CLIENT_LOG=`pwd`/client.log +DATADIR=/data/inferenceserver/${REPO_VERSION}/qa_model_repository +SERVER=/opt/tritonserver/bin/tritonserver +SERVER_ARGS="--model-repository=../L0_decoupled/models" +SERVER_LOG="./inference_server.log" +source ../common/util.sh + + +TRIALS="python custom" + +for trial in $TRIALS; do + if [ $trial == "python" ]; then + MODELDIR=../L0_decoupled/python_models + else + MODELDIR=../L0_decoupled/models + fi + + SERVER_ARGS="--model-repository=$MODELDIR" + cp -r $DATADIR/libtorch_nobatch_int32_int32_int32 $MODELDIR/. + (cd $MODELDIR/libtorch_nobatch_int32_int32_int32 && \ + sed -i "s/dims:.*\[.*\]/dims: \[ 1 \]/g" config.pbtxt) + + run_server + if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 + fi + + for i in \ + test_one_to_none \ + test_one_to_one \ + test_one_to_many \ + test_no_streaming \ + test_response_order \ + test_wrong_shape; do + + echo "Test: $i" >>$CLIENT_LOG + set +e + python $DECOUPLED_TEST DecoupledTest.$i >>$CLIENT_LOG 2>&1 + if [ $? -ne 0 ]; then + echo -e "\n***\n*** Test $i Failed\n***" >>$CLIENT_LOG + echo -e "\n***\n*** Test $i Failed\n***" + RET=1 + else + check_test_results $TEST_RESULT_FILE 1 + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi + fi + set -e + done + + # Will delay the writing of each response by the specified many milliseconds. + # This will ensure that there are multiple responses available to be written. + export TRITONSERVER_DELAY_GRPC_RESPONSE=2000 + + echo "Test: test_one_to_multi_many" >>$CLIENT_LOG + set +e + python $DECOUPLED_TEST DecoupledTest.test_one_to_multi_many >>$CLIENT_LOG 2>&1 + if [ $? -ne 0 ]; then + echo -e "\n***\n*** Test test_one_to_multi_many Failed\n***" >>$CLIENT_LOG + echo -e "\n***\n*** Test test_one_to_multi_many Failed\n***" + RET=1 + else + check_test_results $TEST_RESULT_FILE 1 + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi + fi + + set -e + + unset TRITONSERVER_DELAY_GRPC_RESPONSE + + kill $SERVER_PID + wait $SERVER_PID +done + +# Test the server frontend can merge the responses of non-decoupled model that +# sends inference response and COMPLETE flag separately. 
In other words, from +# the client's perspective there will still be one response. +NON_DECOUPLED_DIR=`pwd`/non_decoupled_models +rm -rf ${NON_DECOUPLED_DIR} && mkdir -p ${NON_DECOUPLED_DIR} +cp -r ../L0_decoupled/models/repeat_int32 ${NON_DECOUPLED_DIR}/. && \ + (cd ${NON_DECOUPLED_DIR}/repeat_int32 && \ + sed -i "s/decoupled: True/decoupled: False/" config.pbtxt) + +SERVER_ARGS="--model-repository=${NON_DECOUPLED_DIR}" +SERVER_LOG="./non_decoupled_inference_server.log" + +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +CLIENT_LOG=`pwd`/non_decoupled_client.log +echo "Test: NonDecoupledTest" >>$CLIENT_LOG +set +e +python $DECOUPLED_TEST NonDecoupledTest >>$CLIENT_LOG 2>&1 +if [ $? -ne 0 ]; then + echo -e "\n***\n*** Test NonDecoupledTest Failed\n***" >>$CLIENT_LOG + echo -e "\n***\n*** Test NonDecoupledTest Failed\n***" + RET=1 +else + check_test_results $TEST_RESULT_FILE 2 + if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test Result Verification Failed\n***" + RET=1 + fi +fi + +set -e + +kill $SERVER_PID +wait $SERVER_PID + +if [ $RET -eq 0 ]; then + echo -e "\n***\n*** Test Passed\n***" +else + echo -e "\n***\n*** Test Failed\n***" +fi + +exit $RET \ No newline at end of file diff --git a/qa/L0_grpc_error_state_cleanup/cleanup_test.py b/qa/L0_grpc_error_state_cleanup/cleanup_test.py new file mode 100755 index 0000000000..07537013aa --- /dev/null +++ b/qa/L0_grpc_error_state_cleanup/cleanup_test.py @@ -0,0 +1,642 @@ +#!/usr/bin/env python3 + +# Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
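The tests in this file do not assert state cleanup directly; the companion test.sh starts the server with --log-verbose=2 and, after each case, compares the number of "StateNew" and "StateRelease" entries in the server log. A rough Python equivalent of that check, with an illustrative log path, might look like:

    def states_released(log_path="./inference_server.log"):
        # Every state object created by the gRPC frontend should eventually
        # be released, so the two counters must match.
        with open(log_path) as f:
            log = f.read()
        return log.count("StateNew") == log.count("StateRelease")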
+
+import sys
+
+sys.path.append("../common")
+
+import os
+import queue
+import signal
+import time
+import unittest
+from functools import partial
+
+import numpy as np
+import test_util as tu
+import tritonclient.grpc as grpcclient
+from tritonclient.utils import InferenceServerException
+
+
+class UserData:
+    def __init__(self):
+        self._response_queue = queue.Queue()
+
+
+def callback(user_data, result, error):
+    if error:
+        user_data._response_queue.put(error)
+    else:
+        user_data._response_queue.put(result)
+
+
+# These state cleanup tests rely on test.sh to check whether all the
+# created request objects were properly deleted by the server.
+# The purpose of these unit tests is to exercise different portions
+# of the gRPC frontend and track the state objects.
+class CleanUpTest(tu.TestResultCollector):
+    SERVER_PID = None
+
+    def setUp(self):
+        self.decoupled_model_name_ = "repeat_int32"
+        self.identity_model_name_ = "custom_zero_1_float32"
+        self.repeat_non_decoupled_model_name = "repeat_int32_non_decoupled"
+
+    def _prepare_inputs_and_outputs(self, kind):
+        if kind in ("decoupled_streaming", "non_decoupled_streaming"):
+            self.inputs_ = []
+            self.inputs_.append(grpcclient.InferInput("IN", [1], "INT32"))
+            self.inputs_.append(grpcclient.InferInput("DELAY", [1], "UINT32"))
+            self.inputs_.append(grpcclient.InferInput("WAIT", [1], "UINT32"))
+
+            self.outputs_ = []
+            self.outputs_.append(grpcclient.InferRequestedOutput("OUT"))
+            self.outputs_.append(grpcclient.InferRequestedOutput("IDX"))
+            self.requested_outputs_ = self.outputs_
+        elif kind in ("simple", "streaming"):
+            self.inputs_ = []
+            self.inputs_.append(grpcclient.InferInput("INPUT0", [1, 1], "FP32"))
+
+            self.outputs_ = []
+            self.outputs_.append(grpcclient.InferRequestedOutput("OUTPUT0"))
+            self.requested_outputs_ = self.outputs_
+        else:
+            raise ValueError("Unsupported kind specified to prepare inputs/outputs")
+
+    def _simple_infer(
+        self,
+        request_count,
+        cancel_response_idx=None,
+        client_timeout_pair=None,
+        kill_server=None,
+    ):
+        with grpcclient.InferenceServerClient(
+            url="localhost:8001", verbose=True
+        ) as triton_client:
+            self._prepare_inputs_and_outputs("simple")
+
+            input_data = np.array([[1.0]], dtype=np.float32)
+            self.inputs_[0].set_data_from_numpy(input_data)
+
+            user_data = UserData()
+
+            futures = []
+            timeout_idx = None
+            timeout_value = None
+            if client_timeout_pair:
+                timeout_idx, timeout_value = client_timeout_pair
+            for i in range(request_count):
+                if kill_server == i:
+                    os.kill(int(self.SERVER_PID), signal.SIGINT)
+                this_timeout = None
+                if timeout_idx == i:
+                    this_timeout = timeout_value
+                futures.append(
+                    triton_client.async_infer(
+                        model_name=self.identity_model_name_,
+                        inputs=self.inputs_,
+                        request_id=str(i),
+                        callback=partial(callback, user_data),
+                        outputs=self.requested_outputs_,
+                        client_timeout=this_timeout,
+                    )
+                )
+
+            if cancel_response_idx is not None:
+                futures[cancel_response_idx].cancel()
+
+            responses = []
+            while len(responses) < len(futures):
+                data_item = user_data._response_queue.get()
+                if type(data_item) == InferenceServerException:
+                    raise data_item
+                else:
+                    responses.append(data_item)
+
+            for response in responses:
+                output0_data = response.as_numpy("OUTPUT0")
+                self.assertTrue(np.array_equal(input_data, output0_data))
+
+    def _stream_infer_with_params(
+        self,
+        request_count,
+        request_delay,
+        _,
+        user_data,
+        result_dict,
+        delay_data=None,
+        delay_factor=None,
+        cancel_response_idx=None,
+        stream_timeout=None,
+        kill_server=None,
+    ):
+        with 
grpcclient.InferenceServerClient( + url="localhost:8001", verbose=True + ) as triton_client: + # Establish stream + metadata = {"triton_grpc_error": "true"} + triton_client.start_stream( + callback=partial(callback, user_data), stream_timeout=stream_timeout, headers=metadata + ) + # Send specified many requests in parallel + for i in range(request_count): + time.sleep((request_delay / 1000)) + self.inputs_[1].set_data_from_numpy(delay_data) + if kill_server == i: + os.kill(int(self.SERVER_PID), signal.SIGINT) + triton_client.async_stream_infer( + model_name=self.decoupled_model_name_, + inputs=self.inputs_, + request_id=str(i), + outputs=self.requested_outputs_, + # Opt-in to receiving flags-only responses from model/backend + # to help detect final responses for decoupled models. + enable_empty_final_response=True, + ) + # Update delay input in accordance with the scaling factor + delay_data = delay_data * delay_factor + delay_data = delay_data.astype(np.uint32) + + # Retrieve results... + recv_count = 0 + completed_requests = 0 + while completed_requests < request_count: + if cancel_response_idx == recv_count: + triton_client.stop_stream(cancel_requests=True) + data_item = user_data._response_queue.get() + if type(data_item) == InferenceServerException: + raise data_item + else: + response = data_item.get_response() + # Request IDs should generally be provided with each request + # to associate decoupled responses with their requests. + if not response.id: + raise ValueError( + "No response id found. Was a request_id provided?" + ) + + # Detect final response. Parameters are oneof and we expect bool_param + if response.parameters.get("triton_final_response").bool_param: + completed_requests += 1 + + # Only process non-empty response, ignore if empty (no outputs) + if response.outputs: + if response.id not in result_dict: + result_dict[response.id] = [] + result_dict[response.id].append((recv_count, data_item)) + recv_count += 1 + + def _stream_infer( + self, + request_count, + request_delay, + expected_count, + user_data, + result_dict, + delay_data=None, + delay_factor=None, + cancel_response_idx=None, + stream_timeout=None, + kill_server=None, + ): + with grpcclient.InferenceServerClient( + url="localhost:8001", verbose=True + ) as triton_client: + # Establish stream + metadata = {"triton_grpc_error": "true"} + triton_client.start_stream( + callback=partial(callback, user_data), stream_timeout=stream_timeout, headers=metadata + ) + # Send specified many requests in parallel + for i in range(request_count): + time.sleep((request_delay / 1000)) + model_name = self.identity_model_name_ + if delay_data is not None: + model_name = self.decoupled_model_name_ + self.inputs_[1].set_data_from_numpy(delay_data) + if kill_server == i: + os.kill(int(self.SERVER_PID), signal.SIGINT) + triton_client.async_stream_infer( + model_name=model_name, + inputs=self.inputs_, + request_id=str(i), + outputs=self.requested_outputs_, + ) + if (delay_data is not None) and (delay_factor is not None): + # Update delay input in accordance with the scaling factor + delay_data = delay_data * delay_factor + delay_data = delay_data.astype(np.uint32) + + # Retrieve results... 
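_stream_infer_with_params above detects completion through the "triton_final_response" parameter (requested via enable_empty_final_response=True), whereas the receive loop that follows counts a caller-supplied expected number of responses. A standalone sketch of the parameter-based drain, using only names already defined in this file:

    def drain_stream(response_queue, request_count):
        # Collect results until every request has delivered its final response.
        results = []
        completed = 0
        while completed < request_count:
            item = response_queue.get()
            if isinstance(item, InferenceServerException):
                raise item
            response = item.get_response()
            if response.parameters.get("triton_final_response").bool_param:
                completed += 1
            if response.outputs:
                results.append(item)
        return results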
+ recv_count = 0 + while recv_count < expected_count: + if cancel_response_idx == recv_count: + triton_client.stop_stream(cancel_requests=True) + data_item = user_data._response_queue.get() + if type(data_item) == InferenceServerException: + raise data_item + else: + this_id = data_item.get_response().id + if this_id not in result_dict: + result_dict[this_id] = [] + result_dict[this_id].append((recv_count, data_item)) + + recv_count += 1 + + def _streaming_infer( + self, + request_count, + request_delay=0, + cancel_response_idx=None, + stream_timeout=None, + kill_server=None, + should_error=True, + ): + self._prepare_inputs_and_outputs("streaming") + + input_data = np.array([[1.0]], dtype=np.float32) + self.inputs_[0].set_data_from_numpy(input_data) + + user_data = UserData() + result_dict = {} + + try: + expected_count = request_count + self._stream_infer( + request_count, + request_delay, + expected_count, + user_data, + result_dict, + cancel_response_idx=cancel_response_idx, + stream_timeout=stream_timeout, + kill_server=kill_server, + ) + except Exception as ex: + if cancel_response_idx or stream_timeout or should_error: + raise ex + self.assertTrue(False, "unexpected error {}".format(ex)) + + # Validate the results.. + for i in range(request_count): + this_id = str(i) + if this_id not in result_dict.keys(): + self.assertTrue( + False, "response for request id {} not received".format(this_id) + ) + self.assertEqual(len(result_dict[this_id]), 1) + result = result_dict[this_id][0][1] + output0_data = result.as_numpy("OUTPUT0") + self.assertTrue(np.array_equal(input_data, output0_data)) + + def _decoupled_infer( + self, + request_count, + request_delay=0, + repeat_count=1, + data_offset=100, + delay_time=1000, + delay_factor=1, + wait_time=500, + cancel_response_idx=None, + stream_timeout=None, + kill_server=None, + should_error=True, + infer_helper_map=[True, True], + ): + self._prepare_inputs_and_outputs(kind="decoupled_streaming") + + # Initialize data for IN + input_data = np.arange( + start=data_offset, stop=data_offset + repeat_count, dtype=np.int32 + ) + self.inputs_[0].set_shape([repeat_count]) + self.inputs_[0].set_data_from_numpy(input_data) + + # Initialize data for DELAY + delay_data = (np.ones([repeat_count], dtype=np.uint32)) * delay_time + self.inputs_[1].set_shape([repeat_count]) + + # Initialize data for WAIT + wait_data = np.array([wait_time], dtype=np.uint32) + self.inputs_[2].set_data_from_numpy(wait_data) + + infer_helpers = [] + if infer_helper_map[0]: + infer_helpers.append(self._stream_infer) + if infer_helper_map[1]: + infer_helpers.append(self._stream_infer_with_params) + + for infer_helper in infer_helpers: + user_data = UserData() + result_dict = {} + + try: + expected_count = repeat_count * request_count + infer_helper( + request_count, + request_delay, + expected_count, + user_data, + result_dict, + delay_data, + delay_factor, + cancel_response_idx, + stream_timeout, + kill_server, + ) + except Exception as ex: + if cancel_response_idx or stream_timeout or should_error: + raise ex + self.assertTrue(False, "unexpected error {}".format(ex)) + + # Validate the results.. 
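The decoupled helpers drive the repeat_int32 model, whose inputs the tests use as follows: IN supplies the values echoed back one response per element, DELAY the per-response delay in milliseconds, and WAIT how long the model holds the request before releasing it. A minimal request built against that contract (the values are illustrative) looks like:

    # Three responses, ~200 ms apart; request held for 1 s before release.
    repeat_count, delay_ms, wait_ms = 3, 200, 1000
    inputs = [
        grpcclient.InferInput("IN", [repeat_count], "INT32"),
        grpcclient.InferInput("DELAY", [repeat_count], "UINT32"),
        grpcclient.InferInput("WAIT", [1], "UINT32"),
    ]
    inputs[0].set_data_from_numpy(np.arange(repeat_count, dtype=np.int32))
    inputs[1].set_data_from_numpy(np.full([repeat_count], delay_ms, dtype=np.uint32))
    inputs[2].set_data_from_numpy(np.array([wait_ms], dtype=np.uint32))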
+ for i in range(request_count): + this_id = str(i) + if repeat_count != 0 and this_id not in result_dict.keys(): + self.assertTrue( + False, "response for request id {} not received".format(this_id) + ) + elif repeat_count == 0 and this_id in result_dict.keys(): + self.assertTrue( + False, + "received unexpected response for request id {}".format( + this_id + ), + ) + if repeat_count != 0: + self.assertEqual(len(result_dict[this_id]), repeat_count) + expected_data = data_offset + result_list = result_dict[this_id] + for j in range(len(result_list)): + this_data = result_list[j][1].as_numpy("OUT") + self.assertEqual(len(this_data), 1) + self.assertEqual(this_data[0], expected_data) + this_idx = result_list[j][1].as_numpy("IDX") + self.assertEqual(len(this_idx), 1) + self.assertEqual(this_idx[0], j) + expected_data += 1 + + ### + ### Non-Streaming Tests + ### + def test_simple_infer(self): + # This test case sends 10 asynchronous requests and validates + # the response. + self._simple_infer(request_count=10) + + def test_simple_infer_cancellation(self): + # This test case is used to check whether all the states are + # correctly released when one of the request is cancelled from + # the client side. + with self.assertRaises(InferenceServerException) as cm: + self._simple_infer(request_count=10, cancel_response_idx=5) + self.assertIn("Locally cancelled by application!", str(cm.exception)) + + def test_simple_infer_timeout(self): + # This test case is used to check whether all the states are + # correctly released when the request gets timed-out on the client. + with self.assertRaises(InferenceServerException) as cm: + self._simple_infer(request_count=10, client_timeout_pair=[5, 0.1]) + self.assertIn("Deadline Exceeded", str(cm.exception)) + + def test_simple_infer_error_status(self): + # This test case is used to check whether all the state objects are + # released when RPC runs into error. + with self.assertRaises(InferenceServerException) as cm: + self._simple_infer(request_count=10) + self.assertIn( + "This protocol is restricted, expecting header 'triton-grpc-protocol-infer-key'", + str(cm.exception), + ) + + def test_simple_infer_shutdownserver(self): + # This test case is used to check whether all the state objects are + # released when the server is interrupted to shutdown in the beginning + # of inference run with final parameters being returned. + with self.assertRaises(InferenceServerException) as cm: + self._simple_infer(request_count=20, kill_server=5) + + ### + ### Streaming Tests + ### + def test_streaming_infer(self): + # Sanity test to check whether all the state objects + # are correctly released. Sends 10 requests in a single + # gRPC bidirectional stream. + self._streaming_infer(request_count=10) + + def test_streaming_cancellation(self): + # This test case is used to check whether all the states are + # correctly released when the stream is closed when fifth + # response is received. + with self.assertRaises(InferenceServerException) as cm: + self._streaming_infer(request_count=10, cancel_response_idx=5) + self.assertIn("Locally cancelled by application!", str(cm.exception)) + + def test_streaming_timeout(self): + # This test case is used to check whether all the states are + # released when some of the requests timeouts. 
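Both timeout flavors exercised in this file surface as "Deadline Exceeded": a per-request client_timeout on plain inference and a stream_timeout on the whole bidirectional stream. A minimal non-streaming sketch against the delayed identity model used by these tests (the timeout value is illustrative):

    inp = grpcclient.InferInput("INPUT0", [1, 1], "FP32")
    inp.set_data_from_numpy(np.array([[1.0]], dtype=np.float32))
    try:
        with grpcclient.InferenceServerClient(url="localhost:8001") as client:
            # The companion test.sh configures custom_zero_1_float32 with a
            # 1000 ms execute delay, so a 0.1 s deadline cannot be met.
            client.infer(
                model_name="custom_zero_1_float32",
                inputs=[inp],
                client_timeout=0.1,
            )
    except InferenceServerException as e:
        assert "Deadline Exceeded" in str(e)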
+ with self.assertRaises(InferenceServerException) as cm: + self._streaming_infer(request_count=10, request_delay=1, stream_timeout=2) + self.assertIn("Deadline Exceeded", str(cm.exception)) + + def test_streaming_error_status(self): + # This test case is used to check whether all the state objects are + # released when RPC runs into error. + expected_exceptions = [ + "This protocol is restricted, expecting header 'triton-grpc-protocol-infer-key'", + "The stream is no longer in valid state, the error detail is reported through provided callback. A new stream should be started after stopping the current stream.", + ] + with self.assertRaises(InferenceServerException) as cm: + self._streaming_infer(request_count=10, should_error=True) + + exception_match = False + for expected_exception in expected_exceptions: + exception_match |= expected_exception in str(cm.exception) + self.assertTrue( + exception_match, "Raised unexpected exception {}".format(str(cm.exception)) + ) + + def test_streaming_infer_shutdownserver(self): + # This test case is used to check whether all the state objects are + # released when the server is interrupted to shutdown in middle of + # inference run. + with self.assertRaises(InferenceServerException) as cm: + self._streaming_infer( + request_count=10, + request_delay=1, + kill_server=5, + should_error=True, + ) + + ### + ### Decoupled Streaming Tests + ### + def test_decoupled_infer(self): + # Sanity test to check whether all the state objects + # are correctly released. Sends 10 requests in a single + # gRPC bidirectional stream and expects each of these + # requests to generate 10 responses. + self._decoupled_infer(request_count=10, repeat_count=10) + + def test_decoupled_cancellation(self): + # This test case is used to check whether all the states are + # correctly released when the stream is closed when fifth + # response is received. + with self.assertRaises(InferenceServerException) as cm: + self._decoupled_infer( + request_count=10, repeat_count=10, cancel_response_idx=5 + ) + self.assertIn("Locally cancelled by application!", str(cm.exception)) + + def test_decoupled_timeout(self): + # This test case is used to check whether all the states are + # released when some of the requests timeouts. + with self.assertRaises(InferenceServerException) as cm: + self._decoupled_infer( + request_count=10, repeat_count=10, request_delay=1, stream_timeout=2 + ) + self.assertIn("Deadline Exceeded", str(cm.exception)) + + def test_decoupled_error_status(self): + # This test case is used to check whether all the state objects are + # released when RPC runs into error. + expected_exceptions = [ + "This protocol is restricted, expecting header 'triton-grpc-protocol-infer-key'", + "The stream is no longer in valid state, the error detail is reported through provided callback. A new stream should be started after stopping the current stream.", + ] + with self.assertRaises(InferenceServerException) as cm: + self._decoupled_infer(request_count=10, repeat_count=10, should_error=True) + + exception_match = False + for expected_exception in expected_exceptions: + exception_match |= expected_exception in str(cm.exception) + self.assertTrue( + exception_match, "Raised unexpected exception {}".format(str(cm.exception)) + ) + + def test_decoupled_infer_shutdownserver(self): + # This test case is used to check whether all the state objects are + # released when the server is interrupted to shutdown in middle of + # inference run. 
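Every streaming helper in this file opts in to strict gRPC error reporting by sending the triton_grpc_error header when the stream is opened; once an error is reported this way the stream is no longer usable and a new one must be started, which is why the error tests above also accept the "stream is no longer in valid state" message. A minimal opt-in sketch (tensor values are illustrative):

    ins = [
        grpcclient.InferInput("IN", [1], "INT32"),
        grpcclient.InferInput("DELAY", [1], "UINT32"),
        grpcclient.InferInput("WAIT", [1], "UINT32"),
    ]
    ins[0].set_data_from_numpy(np.array([1], dtype=np.int32))
    ins[1].set_data_from_numpy(np.array([0], dtype=np.uint32))
    ins[2].set_data_from_numpy(np.array([0], dtype=np.uint32))
    user_data = UserData()
    with grpcclient.InferenceServerClient(url="localhost:8001") as client:
        # Opt in to strict gRPC error reporting for this stream.
        client.start_stream(
            callback=partial(callback, user_data),
            headers={"triton_grpc_error": "true"},
        )
        client.async_stream_infer(
            model_name="repeat_int32", inputs=ins, request_id="0"
        )
        client.stop_stream()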
+ with self.assertRaises(InferenceServerException) as cm: + self._decoupled_infer( + request_count=10, + repeat_count=10, + request_delay=1, + kill_server=5, + should_error=True, + infer_helper_map=[True, False], + ) + + def test_decoupled_infer_with_params_shutdownserver(self): + # This test case is used to check whether all the state objects are + # released when the server is interrupted to shutdown in middle of + # inference run with final parameters being returned. + with self.assertRaises(InferenceServerException) as cm: + self._decoupled_infer( + request_count=10, + repeat_count=10, + request_delay=1, + kill_server=5, + should_error=True, + infer_helper_map=[False, True], + ) + + def test_decoupled_infer_complete(self): + # Test if the Process() thread could release the state object before + # the StreamInferResponseComplete() thread is done accessing it. + self._decoupled_infer(request_count=1, repeat_count=1, stream_timeout=16) + # Check no error is printed to the log. + with open(os.environ["SERVER_LOG"]) as f: + server_log = f.read() + self.assertNotIn("Should not print this", server_log) + + def test_non_decoupled_streaming_multi_response(self): + # Test non-decoupled streaming infer with more than one response should return + # the first response. + response_count = 4 + expected_response_count = 1 + expected_response_index = 0 + + # Prepare input data + self._prepare_inputs_and_outputs("non_decoupled_streaming") + # Initialize data for IN + data_offset = 100 + input_data = np.arange( + start=data_offset, stop=data_offset + response_count, dtype=np.int32 + ) + self.inputs_[0].set_shape([response_count]) + self.inputs_[0].set_data_from_numpy(input_data) + # Initialize data for DELAY + delay_data = np.zeros([response_count], dtype=np.uint32) + self.inputs_[1].set_shape([response_count]) + self.inputs_[1].set_data_from_numpy(delay_data) + # Initialize data for WAIT + wait_data = np.array([0], dtype=np.uint32) + self.inputs_[2].set_data_from_numpy(wait_data) + + # Infer + user_data = UserData() + with grpcclient.InferenceServerClient( + url="localhost:8001", verbose=True + ) as client: + # Establish stream + metadata = {"triton_grpc_error": "true"} + client.start_stream( + callback=partial(callback, user_data), stream_timeout=16, headers=metadata + ) + # Send a request + client.async_stream_infer( + model_name=self.repeat_non_decoupled_model_name, + inputs=self.inputs_, + request_id="0", + outputs=self.requested_outputs_, + ) + # Wait for all results and stop stream + client.stop_stream() + + # Check infer output + actual_response_count = 0 + while not user_data._response_queue.empty(): + actual_response_count += 1 + data_item = user_data._response_queue.get() + if type(data_item) == InferenceServerException: + raise data_item + else: + response_idx = data_item.as_numpy("IDX")[0] + self.assertEqual(response_idx, expected_response_index) + self.assertEqual(actual_response_count, expected_response_count) + + +if __name__ == "__main__": + CleanUpTest.SERVER_PID = os.environ.get("SERVER_PID", CleanUpTest.SERVER_PID) + unittest.main() diff --git a/qa/L0_grpc_error_state_cleanup/test.sh b/qa/L0_grpc_error_state_cleanup/test.sh new file mode 100755 index 0000000000..df302d5ed1 --- /dev/null +++ b/qa/L0_grpc_error_state_cleanup/test.sh @@ -0,0 +1,235 @@ +#!/bin/bash +# Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION} +if [ "$#" -ge 1 ]; then + REPO_VERSION=$1 +fi +if [ -z "$REPO_VERSION" ]; then + echo -e "Repository version must be specified" + echo -e "\n***\n*** Test Failed\n***" + exit 1 +fi +if [ ! -z "$TEST_REPO_ARCH" ]; then + REPO_VERSION=${REPO_VERSION}_${TEST_REPO_ARCH} +fi + +export CUDA_VISIBLE_DEVICES=0 + +RET=0 +CLEANUP_TEST=cleanup_test.py + +rm -f *.log + +CLIENT_LOG=`pwd`/client.log +SERVER=/opt/tritonserver/bin/tritonserver +source ../common/util.sh + +function check_state_release() { + local log_file=$1 + + num_state_release=`cat $log_file | grep "StateRelease" | wc -l` + num_state_new=`cat $log_file | grep "StateNew" | wc -l` + + if [ $num_state_release -ne $num_state_new ]; then + cat $log_file + echo -e "\n***\n*** Test Failed: Mismatch detected, $num_state_new state(s) created, $num_state_release state(s) released. \n***" >> $log_file + return 1 + fi + + return 0 +} + +rm -fr ./models/custom_zero_1_float32 && \ + cp -r ../custom_models/custom_zero_1_float32 ./models/. 
&& \ + mkdir -p ./models/custom_zero_1_float32/1 + +(cd models/custom_zero_1_float32 && \ + echo "parameters [" >> config.pbtxt && \ + echo "{ key: \"execute_delay_ms\"; value: { string_value: \"1000\" }}" >> config.pbtxt && \ + echo "]" >> config.pbtxt) + +rm -rf models/repeat_int32_non_decoupled && \ + cp -r models/repeat_int32 models/repeat_int32_non_decoupled && \ + (cd models/repeat_int32_non_decoupled && \ + sed -i "/model_transaction_policy/,+2d" config.pbtxt && \ + sed -i "s/repeat_int32/repeat_int32_non_decoupled/" config.pbtxt) + +for i in test_simple_infer \ + test_simple_infer_cancellation \ + test_simple_infer_timeout \ + test_streaming_infer \ + test_streaming_timeout \ + test_streaming_cancellation \ + test_decoupled_infer \ + test_decoupled_cancellation \ + test_decoupled_timeout \ + test_non_decoupled_streaming_multi_response; do + SERVER_LOG="./inference_server.$i.log" + SERVER_ARGS="--model-repository=`pwd`/models --log-verbose=2" + run_server + if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 + fi + + echo "Test: $i" >>$CLIENT_LOG + + set +e + python $CLEANUP_TEST CleanUpTest.$i >>$CLIENT_LOG 2>&1 + if [ $? -ne 0 ]; then + echo -e "\n***\n*** Test $i Failed\n***" >>$CLIENT_LOG + echo -e "\n***\n*** Test $i Failed\n***" + RET=1 + fi + + kill $SERVER_PID + wait $SERVER_PID + + check_state_release $SERVER_LOG + if [ $? -ne 0 ]; then + cat $SERVER_LOG + echo -e "\n***\n*** State Verification Failed for $i\n***" + RET=1 + fi + set -e +done + + +for i in test_simple_infer_error_status \ + test_streaming_error_status \ + test_decoupled_error_status; do + SERVER_LOG="./inference_server.$i.log" + SERVER_ARGS="--model-repository=`pwd`/models --log-verbose=2 --grpc-restricted-protocol=inference:infer-key=infer-value" + run_server + if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 + fi + + echo "Test: $i" >>$CLIENT_LOG + + set +e + python $CLEANUP_TEST CleanUpTest.$i >>$CLIENT_LOG 2>&1 + if [ $? -ne 0 ]; then + echo -e "\n***\n*** Test $i Failed\n***" >>$CLIENT_LOG + echo -e "\n***\n*** Test $i Failed\n***" + RET=1 + fi + + kill $SERVER_PID + wait $SERVER_PID + + check_state_release $SERVER_LOG + if [ $? -ne 0 ]; then + cat $SERVER_LOG + echo -e "\n***\n*** State Verification Failed for $i\n***" + RET=1 + fi + + set -e +done + +for i in test_simple_infer_shutdownserver \ + test_streaming_infer_shutdownserver \ + test_decoupled_infer_shutdownserver \ + test_decoupled_infer_with_params_shutdownserver; do + SERVER_ARGS="--model-repository=`pwd`/models --log-verbose=2" + SERVER_LOG="./inference_server.$i.log" + run_server + if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 + fi + + echo "Test: $i" >>$CLIENT_LOG + + set +e + SERVER_PID=$SERVER_PID python $CLEANUP_TEST CleanUpTest.$i >>$CLIENT_LOG 2>&1 + if [ $? -ne 0 ]; then + echo -e "\n***\n*** Test $i Failed\n***" >>$CLIENT_LOG + echo -e "\n***\n*** Test $i Failed\n***" + RET=1 + fi + + wait $SERVER_PID + + check_state_release $SERVER_LOG + if [ $? 
-ne 0 ]; then + cat $SERVER_LOG + echo -e "\n***\n*** State Verification Failed for $i\n***" + RET=1 + fi + + set -e +done + +TEST_NAME=test_decoupled_infer_complete +export TRITONSERVER_DELAY_GRPC_COMPLETE=2000 + +SERVER_LOG="./inference_server.$TEST_NAME.log" +SERVER_ARGS="--model-repository=`pwd`/models --log-verbose=2" +run_server +if [ "$SERVER_PID" == "0" ]; then + echo -e "\n***\n*** Failed to start $SERVER\n***" + cat $SERVER_LOG + exit 1 +fi + +echo "Test: $TEST_NAME" >>$CLIENT_LOG + +set +e + +SERVER_LOG=$SERVER_LOG python $CLEANUP_TEST CleanUpTest.$TEST_NAME >>$CLIENT_LOG 2>&1 +if [ $? -ne 0 ]; then + cat $CLIENT_LOG + echo -e "\n***\n*** Test $TEST_NAME Failed\n***" + RET=1 +fi + +kill $SERVER_PID +wait $SERVER_PID + +check_state_release $SERVER_LOG +if [ $? -ne 0 ]; then + cat $SERVER_LOG + echo -e "\n***\n*** State Verification Failed for $TEST_NAME\n***" + RET=1 +fi + +set -e + +if [ $RET -eq 0 ]; then + echo -e "\n***\n*** Test Passed\n***" +else + echo -e "\n***\n*** Test Failed\n***" +fi + +exit $RET From cb548ff53e22aafe6e729cc365141bb3d6f90c74 Mon Sep 17 00:00:00 2001 From: Indrajit Bhosale Date: Fri, 16 Aug 2024 01:53:47 -0700 Subject: [PATCH 28/32] Pre-Commit format --- qa/L0_decoupled/decoupled_test.py | 2 +- qa/L0_decoupled_grpc_error/decoupled_test.py | 10 +++++++--- qa/L0_grpc_error_state_cleanup/cleanup_test.py | 12 +++++++++--- 3 files changed, 17 insertions(+), 7 deletions(-) diff --git a/qa/L0_decoupled/decoupled_test.py b/qa/L0_decoupled/decoupled_test.py index d0f09deaf9..1f76f4845b 100755 --- a/qa/L0_decoupled/decoupled_test.py +++ b/qa/L0_decoupled/decoupled_test.py @@ -644,4 +644,4 @@ def test_http(self): if __name__ == "__main__": - unittest.main() \ No newline at end of file + unittest.main() diff --git a/qa/L0_decoupled_grpc_error/decoupled_test.py b/qa/L0_decoupled_grpc_error/decoupled_test.py index fc606e3cca..1d6e2f7029 100755 --- a/qa/L0_decoupled_grpc_error/decoupled_test.py +++ b/qa/L0_decoupled_grpc_error/decoupled_test.py @@ -117,7 +117,9 @@ def _stream_infer_with_params( ) as triton_client: # Establish stream metadata = {"triton_grpc_error": "true"} - triton_client.start_stream(callback=partial(callback, user_data), headers=metadata) + triton_client.start_stream( + callback=partial(callback, user_data), headers=metadata + ) # Send specified many requests in parallel for i in range(request_count): time.sleep((request_delay / 1000)) @@ -177,7 +179,9 @@ def _stream_infer( ) as triton_client: # Establish stream metadata = {"triton_grpc_error": "true"} - triton_client.start_stream(callback=partial(callback, user_data), headers=metadata) + triton_client.start_stream( + callback=partial(callback, user_data), headers=metadata + ) # Send specified many requests in parallel for i in range(request_count): time.sleep((request_delay / 1000)) @@ -646,4 +650,4 @@ def test_http(self): if __name__ == "__main__": - unittest.main() \ No newline at end of file + unittest.main() diff --git a/qa/L0_grpc_error_state_cleanup/cleanup_test.py b/qa/L0_grpc_error_state_cleanup/cleanup_test.py index 07537013aa..4425c5c667 100755 --- a/qa/L0_grpc_error_state_cleanup/cleanup_test.py +++ b/qa/L0_grpc_error_state_cleanup/cleanup_test.py @@ -163,7 +163,9 @@ def _stream_infer_with_params( # Establish stream metadata = {"triton_grpc_error": "true"} triton_client.start_stream( - callback=partial(callback, user_data), stream_timeout=stream_timeout, headers=metadata + callback=partial(callback, user_data), + stream_timeout=stream_timeout, + headers=metadata, ) # Send specified many 
requests in parallel for i in range(request_count): @@ -232,7 +234,9 @@ def _stream_infer( # Establish stream metadata = {"triton_grpc_error": "true"} triton_client.start_stream( - callback=partial(callback, user_data), stream_timeout=stream_timeout, headers=metadata + callback=partial(callback, user_data), + stream_timeout=stream_timeout, + headers=metadata, ) # Send specified many requests in parallel for i in range(request_count): @@ -612,7 +616,9 @@ def test_non_decoupled_streaming_multi_response(self): # Establish stream metadata = {"triton_grpc_error": "true"} client.start_stream( - callback=partial(callback, user_data), stream_timeout=16, headers=metadata + callback=partial(callback, user_data), + stream_timeout=16, + headers=metadata, ) # Send a request client.async_stream_infer( From 1b6b3a7ecd4b36cc54e845477118dee1935c92a3 Mon Sep 17 00:00:00 2001 From: Indrajit Bhosale Date: Fri, 16 Aug 2024 08:44:21 -0700 Subject: [PATCH 29/32] Devel build fix --- Dockerfile.QA | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile.QA b/Dockerfile.QA index a3073948c5..1417515c42 100644 --- a/Dockerfile.QA +++ b/Dockerfile.QA @@ -250,7 +250,7 @@ RUN mkdir -p qa/L0_decoupled/models/repeat_int32/1 && \ mkdir -p qa/L0_decoupled/models/sequence_repeat/1 && \ mkdir -p qa/L0_decoupled/models/repeat_square/1 && \ mkdir -p qa/L0_decoupled/models/nested_square/1 && \ - mkdir -p qa/L0_grpc_state_cleanup/models/repeat_int32/1 + mkdir -p qa/L0_grpc_state_cleanup/models/repeat_int32/1 \ mkdir -p qa/L0_grpc_error_state_cleanup/models/repeat_int32/1 RUN if [ "$IGPU_BUILD" == "0" ]; then \ From 70ce2790bbc8e69fcaaf14c510d33db00b1e7bf9 Mon Sep 17 00:00:00 2001 From: Indrajit Bhosale Date: Fri, 16 Aug 2024 11:41:33 -0700 Subject: [PATCH 30/32] Streamline new tests --- Dockerfile.QA | 12 +- qa/L0_decoupled/decoupled_test.py | 16 +- qa/L0_decoupled_grpc_error/decoupled_test.py | 653 ------------------ qa/L0_decoupled_grpc_error/test.sh | 179 ----- .../cleanup_test.py | 648 ----------------- qa/L0_grpc_error_state_cleanup/test.sh | 235 ------- qa/L0_grpc_state_cleanup/cleanup_test.py | 42 +- 7 files changed, 54 insertions(+), 1731 deletions(-) delete mode 100755 qa/L0_decoupled_grpc_error/decoupled_test.py delete mode 100755 qa/L0_decoupled_grpc_error/test.sh delete mode 100755 qa/L0_grpc_error_state_cleanup/cleanup_test.py delete mode 100755 qa/L0_grpc_error_state_cleanup/test.sh diff --git a/Dockerfile.QA b/Dockerfile.QA index 1417515c42..22f312b930 100644 --- a/Dockerfile.QA +++ b/Dockerfile.QA @@ -113,8 +113,6 @@ RUN mkdir -p qa/common && \ cp -r docs/examples/model_repository/inception_graphdef qa/L0_grpc/models && \ mkdir qa/L0_grpc_state_cleanup/models && \ cp -r /workspace/src/test/models/repeat_int32 qa/L0_grpc_state_cleanup/models/ && \ - mkdir qa/L0_grpc_error_state_cleanup/models && \ - cp -r /workspace/src/test/models/repeat_int32 qa/L0_grpc_error_state_cleanup/models/ && \ mkdir qa/L0_http/models && \ cp -r docs/examples/model_repository/simple qa/L0_http/models && \ cp -r docs/examples/model_repository/simple_dyna_sequence qa/L0_http/models && \ @@ -250,14 +248,12 @@ RUN mkdir -p qa/L0_decoupled/models/repeat_int32/1 && \ mkdir -p qa/L0_decoupled/models/sequence_repeat/1 && \ mkdir -p qa/L0_decoupled/models/repeat_square/1 && \ mkdir -p qa/L0_decoupled/models/nested_square/1 && \ - mkdir -p qa/L0_grpc_state_cleanup/models/repeat_int32/1 \ - mkdir -p qa/L0_grpc_error_state_cleanup/models/repeat_int32/1 + mkdir -p qa/L0_grpc_state_cleanup/models/repeat_int32/1 RUN if [ 
"$IGPU_BUILD" == "0" ]; then \ cp backends/repeat/libtriton_repeat.so qa/L0_model_config && \ cp backends/repeat/libtriton_repeat.so qa/L0_decoupled/models/repeat_int32/1 && \ cp backends/repeat/libtriton_repeat.so qa/L0_grpc_state_cleanup/models/repeat_int32/1/. && \ - cp backends/repeat/libtriton_repeat.so qa/L0_grpc_error_state_cleanup/models/repeat_int32/1/. && \ cp backends/square/libtriton_square.so qa/L0_decoupled/models/square_int32/1; \ fi @@ -271,6 +267,12 @@ RUN cp -r qa/L0_decoupled/models qa/L0_decoupled/python_models/ && \ cp /workspace/tritonbuild/python/examples/decoupled/square_config.pbtxt \ qa/L0_decoupled/python_models/square_int32/. +RUN mkdir -p qa/L0_decoupled_grpc_error && \ + cp -r qa/L0_decoupled/. qa/L0_decoupled_grpc_error && \ + +RUN mkdir -p qa/L0_grpc_error_state_cleanup && \ + cp -r qa/L0_grpc_state_cleanup/. qa/L0_grpc_error_state_cleanup + RUN mkdir -p qa/L0_repoagent_checksum/models/identity_int32/1 && \ cp tritonbuild/identity/install/backends/identity/libtriton_identity.so \ qa/L0_repoagent_checksum/models/identity_int32/1/. diff --git a/qa/L0_decoupled/decoupled_test.py b/qa/L0_decoupled/decoupled_test.py index 1f76f4845b..d7bc59f5c7 100755 --- a/qa/L0_decoupled/decoupled_test.py +++ b/qa/L0_decoupled/decoupled_test.py @@ -116,7 +116,13 @@ def _stream_infer_with_params( url="localhost:8001", verbose=True ) as triton_client: # Establish stream - triton_client.start_stream(callback=partial(callback, user_data)) + if "TRITONSERVER_GRPC_STATUS_FLAG" in os.environ: + metadata = {"triton_grpc_error": "true"} + triton_client.start_stream( + callback=partial(callback, user_data), headers=metadata + ) + else: + triton_client.start_stream(callback=partial(callback, user_data)) # Send specified many requests in parallel for i in range(request_count): time.sleep((request_delay / 1000)) @@ -175,7 +181,13 @@ def _stream_infer( url="localhost:8001", verbose=True ) as triton_client: # Establish stream - triton_client.start_stream(callback=partial(callback, user_data)) + if "TRITONSERVER_GRPC_STATUS_FLAG" in os.environ: + metadata = {"triton_grpc_error": "true"} + triton_client.start_stream( + callback=partial(callback, user_data), headers=metadata + ) + else: + triton_client.start_stream(callback=partial(callback, user_data)) # Send specified many requests in parallel for i in range(request_count): time.sleep((request_delay / 1000)) diff --git a/qa/L0_decoupled_grpc_error/decoupled_test.py b/qa/L0_decoupled_grpc_error/decoupled_test.py deleted file mode 100755 index 1d6e2f7029..0000000000 --- a/qa/L0_decoupled_grpc_error/decoupled_test.py +++ /dev/null @@ -1,653 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. 
-# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -import sys - -sys.path.append("../common") - -import os -import queue -import time -import unittest -from functools import partial - -import numpy as np -import test_util as tu -import tritonclient.grpc as grpcclient -import tritonclient.http as httpclient -from tritonclient.utils import InferenceServerException - - -class UserData: - def __init__(self): - self._response_queue = queue.Queue() - - -def callback(user_data, result, error): - if error: - user_data._response_queue.put(error) - else: - user_data._response_queue.put(result) - - -class DecoupledTest(tu.TestResultCollector): - def setUp(self): - self.trials_ = [ - ("repeat_int32", None), - ("simple_repeat", None), - ("sequence_repeat", None), - ("fan_repeat", self._fan_validate), - ("repeat_square", self._nested_validate), - ("nested_square", self._nested_validate), - ] - self.model_name_ = "repeat_int32" - - self.inputs_ = [] - self.inputs_.append(grpcclient.InferInput("IN", [1], "INT32")) - self.inputs_.append(grpcclient.InferInput("DELAY", [1], "UINT32")) - self.inputs_.append(grpcclient.InferInput("WAIT", [1], "UINT32")) - - self.outputs_ = [] - self.outputs_.append(grpcclient.InferRequestedOutput("OUT")) - self.outputs_.append(grpcclient.InferRequestedOutput("IDX")) - # Some trials only expect a subset of outputs - self.requested_outputs_ = self.outputs_ - - # Client can receive a "triton_final_response" response parameter - # from Triton server that indicates when a response is the final response for - # its request. - # - # For non-decoupled models, there is a 1:1 request:response ratio, so every - # response is the final response, and this parameter is unnecessary. - # - # For decoupled models, there is a 1:N request:response ratio, so there may be - # more one response before receiving the "final" response. - # - # However, decoupled models have the unique property in that they can return - # a flags-only response to the server to indicate completion, which is not - # returned to the client by default (See TRITONBACKEND_ResponseFactorySendFlags). - # - # To forward this flags-only response to the client, users must opt-in to this - # behavior by adding the following argument: - # client.async_stream_infer(..., enable_empty_final_response=True). - # - # If the decoupled backend/model always sends the final response flag along - # with a non-null response, no opt-in is needed. - # - # With this behavior, the client can programmatically detect when all responses - # for an individual request have been received without knowing the expected - # number of responses in advance and without closing the stream. 
- def _stream_infer_with_params( - self, - request_count, - request_delay, - _, - delay_data, - delay_factor, - user_data, - result_dict, - ): - with grpcclient.InferenceServerClient( - url="localhost:8001", verbose=True - ) as triton_client: - # Establish stream - metadata = {"triton_grpc_error": "true"} - triton_client.start_stream( - callback=partial(callback, user_data), headers=metadata - ) - # Send specified many requests in parallel - for i in range(request_count): - time.sleep((request_delay / 1000)) - self.inputs_[1].set_data_from_numpy(delay_data) - triton_client.async_stream_infer( - model_name=self.model_name_, - inputs=self.inputs_, - request_id=str(i), - outputs=self.requested_outputs_, - # Opt-in to receiving flags-only responses from model/backend - # to help detect final responses for decoupled models. - enable_empty_final_response=True, - ) - # Update delay input in accordance with the scaling factor - delay_data = delay_data * delay_factor - delay_data = delay_data.astype(np.uint32) - - # Retrieve results... - recv_count = 0 - completed_requests = 0 - while completed_requests < request_count: - data_item = user_data._response_queue.get() - if type(data_item) == InferenceServerException: - raise data_item - else: - response = data_item.get_response() - # Request IDs should generally be provided with each request - # to associate decoupled responses with their requests. - if not response.id: - raise ValueError( - "No response id found. Was a request_id provided?" - ) - - # Detect final response. Parameters are oneof and we expect bool_param - if response.parameters.get("triton_final_response").bool_param: - completed_requests += 1 - - # Only process non-empty response, ignore if empty (no outputs) - if response.outputs: - if response.id not in result_dict: - result_dict[response.id] = [] - result_dict[response.id].append((recv_count, data_item)) - recv_count += 1 - - def _stream_infer( - self, - request_count, - request_delay, - expected_count, - delay_data, - delay_factor, - user_data, - result_dict, - ): - with grpcclient.InferenceServerClient( - url="localhost:8001", verbose=True - ) as triton_client: - # Establish stream - metadata = {"triton_grpc_error": "true"} - triton_client.start_stream( - callback=partial(callback, user_data), headers=metadata - ) - # Send specified many requests in parallel - for i in range(request_count): - time.sleep((request_delay / 1000)) - self.inputs_[1].set_data_from_numpy(delay_data) - triton_client.async_stream_infer( - model_name=self.model_name_, - inputs=self.inputs_, - request_id=str(i), - outputs=self.requested_outputs_, - ) - # Update delay input in accordance with the scaling factor - delay_data = delay_data * delay_factor - delay_data = delay_data.astype(np.uint32) - - # Retrieve results... 
- recv_count = 0 - while recv_count < expected_count: - data_item = user_data._response_queue.get() - if type(data_item) == InferenceServerException: - raise data_item - else: - this_id = data_item.get_response().id - if this_id not in result_dict: - result_dict[this_id] = [] - result_dict[this_id].append((recv_count, data_item)) - - recv_count += 1 - - def _fan_validate(self, result_list, data_offset, repeat_count): - # fan_repeat returns "2 * data_offset" as result - self.assertEqual(len(result_list), repeat_count) - expected_data = 2 * data_offset - for j in range(len(result_list)): - this_data = result_list[j][1].as_numpy("OUT") - self.assertEqual(len(this_data), 1) - self.assertEqual(this_data[0], expected_data) - expected_data += 2 - - def _nested_validate(self, result_list, data_offset, repeat_count): - # if repeat model returns repeat result n, repeat_square-like model - # will return the same result n times - expected_len = sum(x for x in range(data_offset, data_offset + repeat_count)) - self.assertEqual(len(result_list), expected_len) - expected_data = data_offset - expected_count = expected_data - for j in range(len(result_list)): - this_data = result_list[j][1].as_numpy("OUT") - self.assertEqual(len(this_data), 1) - self.assertEqual(this_data[0], expected_data) - expected_count -= 1 - if expected_count == 0: - expected_data += 1 - expected_count = expected_data - - def _decoupled_infer( - self, - request_count, - request_delay=0, - repeat_count=1, - data_offset=100, - delay_time=1000, - delay_factor=1, - wait_time=500, - order_sequence=None, - validate_fn=None, - ): - # Initialize data for IN - input_data = np.arange( - start=data_offset, stop=data_offset + repeat_count, dtype=np.int32 - ) - self.inputs_[0].set_shape([repeat_count]) - self.inputs_[0].set_data_from_numpy(input_data) - - # Initialize data for DELAY - delay_data = (np.ones([repeat_count], dtype=np.uint32)) * delay_time - self.inputs_[1].set_shape([repeat_count]) - - # Initialize data for WAIT - wait_data = np.array([wait_time], dtype=np.uint32) - self.inputs_[2].set_data_from_numpy(wait_data) - - # use validate_fn to differentiate requested outputs - self.requested_outputs_ = ( - self.outputs_ if validate_fn is None else self.outputs_[0:1] - ) - - for infer_helper in [self._stream_infer, self._stream_infer_with_params]: - user_data = UserData() - result_dict = {} - - try: - if "square" not in self.model_name_: - expected_count = repeat_count * request_count - else: - expected_count = ( - sum(x for x in range(data_offset, data_offset + repeat_count)) - * request_count - ) - infer_helper( - request_count, - request_delay, - expected_count, - delay_data, - delay_factor, - user_data, - result_dict, - ) - except Exception as ex: - self.assertTrue(False, "unexpected error {}".format(ex)) - - # Validate the results.. 
- for i in range(request_count): - this_id = str(i) - if repeat_count != 0 and this_id not in result_dict.keys(): - self.assertTrue( - False, "response for request id {} not received".format(this_id) - ) - elif repeat_count == 0 and this_id in result_dict.keys(): - self.assertTrue( - False, - "received unexpected response for request id {}".format( - this_id - ), - ) - if repeat_count != 0: - if validate_fn is None: - self.assertEqual(len(result_dict[this_id]), repeat_count) - expected_data = data_offset - result_list = result_dict[this_id] - for j in range(len(result_list)): - if order_sequence is not None: - self.assertEqual( - result_list[j][0], order_sequence[i][j] - ) - this_data = result_list[j][1].as_numpy("OUT") - self.assertEqual(len(this_data), 1) - self.assertEqual(this_data[0], expected_data) - this_idx = result_list[j][1].as_numpy("IDX") - self.assertEqual(len(this_idx), 1) - self.assertEqual(this_idx[0], j) - expected_data += 1 - else: - validate_fn(result_dict[this_id], data_offset, repeat_count) - - def test_one_to_none(self): - # Test cases where each request generates no response. - # Note the name of the test one_to_none implies the - # mapping between requests and responses. - - for trial in self.trials_: - self.model_name_ = trial[0] - # Single request case - self._decoupled_infer(request_count=1, repeat_count=0, validate_fn=trial[1]) - # Multiple request case - self._decoupled_infer(request_count=5, repeat_count=0, validate_fn=trial[1]) - - def test_one_to_one(self): - # Test cases where each request generates single response. - # Note the name of the test one_to_one implies the - # mapping between requests and responses. - - for trial in self.trials_: - self.model_name_ = trial[0] - # Single request case - # Release request before the response is delivered - self._decoupled_infer(request_count=1, wait_time=500, validate_fn=trial[1]) - # Release request after the response is delivered - self._decoupled_infer(request_count=1, wait_time=2000, validate_fn=trial[1]) - - # Multiple request case - # Release request before the response is delivered - self._decoupled_infer(request_count=5, wait_time=500, validate_fn=trial[1]) - # Release request after the response is delivered - self._decoupled_infer(request_count=5, wait_time=2000, validate_fn=trial[1]) - - def test_one_to_many(self): - # Test cases where each request generates multiple response. - # Note the name of the test one_to_many implies the - # mapping between requests and responses. 
- - self.assertFalse("TRITONSERVER_DELAY_GRPC_RESPONSE" in os.environ) - - for trial in self.trials_: - self.model_name_ = trial[0] - # Single request case - # Release request before the first response is delivered - self._decoupled_infer( - request_count=1, repeat_count=5, wait_time=500, validate_fn=trial[1] - ) - # Release request when the responses are getting delivered - self._decoupled_infer( - request_count=1, repeat_count=5, wait_time=2000, validate_fn=trial[1] - ) - # Release request after all the responses are delivered - self._decoupled_infer( - request_count=1, repeat_count=5, wait_time=10000, validate_fn=trial[1] - ) - - # Multiple request case - # Release request before the first response is delivered - self._decoupled_infer( - request_count=5, repeat_count=5, wait_time=500, validate_fn=trial[1] - ) - # Release request when the responses are getting delivered - self._decoupled_infer( - request_count=5, repeat_count=5, wait_time=2000, validate_fn=trial[1] - ) - # Release request after all the responses are delivered - self._decoupled_infer( - request_count=5, repeat_count=5, wait_time=10000, validate_fn=trial[1] - ) - - def test_one_to_multi_many(self): - # Test cases where each request generates multiple response but the - # responses are delayed so as to stress the control path handling the - # queued responses. - - self.assertTrue("TRITONSERVER_DELAY_GRPC_RESPONSE" in os.environ) - - for trial in self.trials_: - self.model_name_ = trial[0] - # Single request case - # Release request before the first response is delivered - self._decoupled_infer( - request_count=1, repeat_count=5, wait_time=500, validate_fn=trial[1] - ) - # Release request when the responses are getting delivered - self._decoupled_infer( - request_count=1, repeat_count=5, wait_time=8000, validate_fn=trial[1] - ) - # Release request after all the responses are delivered - self._decoupled_infer( - request_count=1, repeat_count=5, wait_time=20000, validate_fn=trial[1] - ) - - # Multiple request case - # Release request before the first response is delivered - self._decoupled_infer( - request_count=5, repeat_count=5, wait_time=500, validate_fn=trial[1] - ) - # Release request when the responses are getting delivered - self._decoupled_infer( - request_count=5, repeat_count=5, wait_time=3000, validate_fn=trial[1] - ) - # Release request after all the responses are delivered - self._decoupled_infer( - request_count=5, repeat_count=5, wait_time=10000, validate_fn=trial[1] - ) - - def test_response_order(self): - # Test the expected response order for different cases - - self.assertFalse("TRITONSERVER_DELAY_GRPC_RESPONSE" in os.environ) - - for trial in self.trials_: - self.model_name_ = trial[0] - - # Case 1: Interleaved responses - self._decoupled_infer( - request_count=2, - request_delay=500, - repeat_count=4, - order_sequence=[[0, 2, 4, 6], [1, 3, 5, 7]], - validate_fn=trial[1], - ) - - # Case 2: All responses of second request delivered before any - # response from the first - self._decoupled_infer( - request_count=2, - request_delay=500, - repeat_count=4, - delay_time=2000, - delay_factor=0.1, - order_sequence=[[4, 5, 6, 7], [0, 1, 2, 3]], - validate_fn=trial[1], - ) - - # Case 3: Similar to Case 2, but the second request is generated - # after the first response from first request is received - self._decoupled_infer( - request_count=2, - request_delay=2500, - repeat_count=4, - delay_time=2000, - delay_factor=0.1, - order_sequence=[[0, 5, 6, 7], [1, 2, 3, 4]], - validate_fn=trial[1], - ) - - # Case 4: All the 
responses of second requests are dleivered after - # all the responses from first requests are received - self._decoupled_infer( - request_count=2, - request_delay=100, - repeat_count=4, - delay_time=500, - delay_factor=10, - order_sequence=[[0, 1, 2, 3], [4, 5, 6, 7]], - validate_fn=trial[1], - ) - - # Case 5: Similar to Case 4, but the second request is generated - # after the first response from the first request is received - self._decoupled_infer( - request_count=2, - request_delay=750, - repeat_count=4, - delay_time=500, - delay_factor=10, - order_sequence=[[0, 1, 2, 3], [4, 5, 6, 7]], - validate_fn=trial[1], - ) - - def _no_streaming_helper(self, protocol): - data_offset = 100 - repeat_count = 1 - delay_time = 1000 - wait_time = 2000 - - input_data = np.arange( - start=data_offset, stop=data_offset + repeat_count, dtype=np.int32 - ) - delay_data = (np.ones([repeat_count], dtype=np.uint32)) * delay_time - wait_data = np.array([wait_time], dtype=np.uint32) - - if protocol == "grpc": - # Use the inputs and outputs from the setUp - this_inputs = self.inputs_ - this_outputs = self.outputs_ - else: - this_inputs = [] - this_inputs.append(httpclient.InferInput("IN", [repeat_count], "INT32")) - this_inputs.append(httpclient.InferInput("DELAY", [1], "UINT32")) - this_inputs.append(httpclient.InferInput("WAIT", [1], "UINT32")) - this_outputs = [] - this_outputs.append(httpclient.InferRequestedOutput("OUT")) - - # Initialize data for IN - this_inputs[0].set_shape([repeat_count]) - this_inputs[0].set_data_from_numpy(input_data) - - # Initialize data for DELAY - this_inputs[1].set_shape([repeat_count]) - this_inputs[1].set_data_from_numpy(delay_data) - - # Initialize data for WAIT - this_inputs[2].set_data_from_numpy(wait_data) - - if protocol == "grpc": - triton_client = grpcclient.InferenceServerClient( - url="localhost:8001", verbose=True - ) - else: - triton_client = httpclient.InferenceServerClient( - url="localhost:8000", verbose=True - ) - - with self.assertRaises(InferenceServerException) as cm: - triton_client.infer( - model_name=self.model_name_, inputs=this_inputs, outputs=this_outputs - ) - - self.assertIn( - "doesn't support models with decoupled transaction policy", - str(cm.exception), - ) - - def test_no_streaming(self): - # Test cases with no streaming inference. Server should give - # appropriate error in such cases. - - self._no_streaming_helper("grpc") - self._no_streaming_helper("http") - - def test_wrong_shape(self): - # Sends mismatching shapes for IN and DELAY. Server should return - # appropriate error message. The shape of IN is [repeat_count], - # where as shape of DELAY is [repeat_count + 1]. 
- - data_offset = 100 - repeat_count = 1 - delay_time = 1000 - wait_time = 2000 - - input_data = np.arange( - start=data_offset, stop=data_offset + repeat_count, dtype=np.int32 - ) - delay_data = (np.ones([repeat_count + 1], dtype=np.uint32)) * delay_time - wait_data = np.array([wait_time], dtype=np.uint32) - - # Initialize data for IN - self.inputs_[0].set_shape([repeat_count]) - self.inputs_[0].set_data_from_numpy(input_data) - - # Initialize data for DELAY - self.inputs_[1].set_shape([repeat_count + 1]) - self.inputs_[1].set_data_from_numpy(delay_data) - - # Initialize data for WAIT - self.inputs_[2].set_data_from_numpy(wait_data) - - user_data = UserData() - result_dict = {} - - with self.assertRaises(InferenceServerException) as cm: - self._stream_infer( - 1, 0, repeat_count, delay_data, 1, user_data, result_dict - ) - - self.assertIn( - "expected IN and DELAY shape to match, got [1] and [2]", str(cm.exception) - ) - - -class NonDecoupledTest(tu.TestResultCollector): - def setUp(self): - self.model_name_ = "repeat_int32" - self.input_data = { - "IN": np.array([1], dtype=np.int32), - "DELAY": np.array([0], dtype=np.uint32), - "WAIT": np.array([0], dtype=np.uint32), - } - - def test_grpc(self): - inputs = [ - grpcclient.InferInput("IN", [1], "INT32").set_data_from_numpy( - self.input_data["IN"] - ), - grpcclient.InferInput("DELAY", [1], "UINT32").set_data_from_numpy( - self.input_data["DELAY"] - ), - grpcclient.InferInput("WAIT", [1], "UINT32").set_data_from_numpy( - self.input_data["WAIT"] - ), - ] - - triton_client = grpcclient.InferenceServerClient( - url="localhost:8001", verbose=True - ) - # Expect the inference is successful - res = triton_client.infer(model_name=self.model_name_, inputs=inputs) - self.assertEqual(1, res.as_numpy("OUT")[0]) - self.assertEqual(0, res.as_numpy("IDX")[0]) - - def test_http(self): - inputs = [ - httpclient.InferInput("IN", [1], "INT32").set_data_from_numpy( - self.input_data["IN"] - ), - httpclient.InferInput("DELAY", [1], "UINT32").set_data_from_numpy( - self.input_data["DELAY"] - ), - httpclient.InferInput("WAIT", [1], "UINT32").set_data_from_numpy( - self.input_data["WAIT"] - ), - ] - - triton_client = httpclient.InferenceServerClient( - url="localhost:8000", verbose=True - ) - # Expect the inference is successful - res = triton_client.infer(model_name=self.model_name_, inputs=inputs) - self.assertEqual(1, res.as_numpy("OUT")[0]) - self.assertEqual(0, res.as_numpy("IDX")[0]) - - -if __name__ == "__main__": - unittest.main() diff --git a/qa/L0_decoupled_grpc_error/test.sh b/qa/L0_decoupled_grpc_error/test.sh deleted file mode 100755 index 4fba476b1d..0000000000 --- a/qa/L0_decoupled_grpc_error/test.sh +++ /dev/null @@ -1,179 +0,0 @@ -#!/bin/bash -# Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. 
-# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION} -if [ "$#" -ge 1 ]; then - REPO_VERSION=$1 -fi -if [ -z "$REPO_VERSION" ]; then - echo -e "Repository version must be specified" - echo -e "\n***\n*** Test Failed\n***" - exit 1 -fi -if [ ! -z "$TEST_REPO_ARCH" ]; then - REPO_VERSION=${REPO_VERSION}_${TEST_REPO_ARCH} -fi - -export CUDA_VISIBLE_DEVICES=0 - -RET=0 -TEST_RESULT_FILE='test_results.txt' -DECOUPLED_TEST=decoupled_test.py - -rm -f *.log - -CLIENT_LOG=`pwd`/client.log -DATADIR=/data/inferenceserver/${REPO_VERSION}/qa_model_repository -SERVER=/opt/tritonserver/bin/tritonserver -SERVER_ARGS="--model-repository=../L0_decoupled/models" -SERVER_LOG="./inference_server.log" -source ../common/util.sh - - -TRIALS="python custom" - -for trial in $TRIALS; do - if [ $trial == "python" ]; then - MODELDIR=../L0_decoupled/python_models - else - MODELDIR=../L0_decoupled/models - fi - - SERVER_ARGS="--model-repository=$MODELDIR" - cp -r $DATADIR/libtorch_nobatch_int32_int32_int32 $MODELDIR/. - (cd $MODELDIR/libtorch_nobatch_int32_int32_int32 && \ - sed -i "s/dims:.*\[.*\]/dims: \[ 1 \]/g" config.pbtxt) - - run_server - if [ "$SERVER_PID" == "0" ]; then - echo -e "\n***\n*** Failed to start $SERVER\n***" - cat $SERVER_LOG - exit 1 - fi - - for i in \ - test_one_to_none \ - test_one_to_one \ - test_one_to_many \ - test_no_streaming \ - test_response_order \ - test_wrong_shape; do - - echo "Test: $i" >>$CLIENT_LOG - set +e - python $DECOUPLED_TEST DecoupledTest.$i >>$CLIENT_LOG 2>&1 - if [ $? -ne 0 ]; then - echo -e "\n***\n*** Test $i Failed\n***" >>$CLIENT_LOG - echo -e "\n***\n*** Test $i Failed\n***" - RET=1 - else - check_test_results $TEST_RESULT_FILE 1 - if [ $? -ne 0 ]; then - cat $CLIENT_LOG - echo -e "\n***\n*** Test Result Verification Failed\n***" - RET=1 - fi - fi - set -e - done - - # Will delay the writing of each response by the specified many milliseconds. - # This will ensure that there are multiple responses available to be written. - export TRITONSERVER_DELAY_GRPC_RESPONSE=2000 - - echo "Test: test_one_to_multi_many" >>$CLIENT_LOG - set +e - python $DECOUPLED_TEST DecoupledTest.test_one_to_multi_many >>$CLIENT_LOG 2>&1 - if [ $? -ne 0 ]; then - echo -e "\n***\n*** Test test_one_to_multi_many Failed\n***" >>$CLIENT_LOG - echo -e "\n***\n*** Test test_one_to_multi_many Failed\n***" - RET=1 - else - check_test_results $TEST_RESULT_FILE 1 - if [ $? -ne 0 ]; then - cat $CLIENT_LOG - echo -e "\n***\n*** Test Result Verification Failed\n***" - RET=1 - fi - fi - - set -e - - unset TRITONSERVER_DELAY_GRPC_RESPONSE - - kill $SERVER_PID - wait $SERVER_PID -done - -# Test the server frontend can merge the responses of non-decoupled model that -# sends inference response and COMPLETE flag separately. 
In other words, from -# the client's perspective there will still be one response. -NON_DECOUPLED_DIR=`pwd`/non_decoupled_models -rm -rf ${NON_DECOUPLED_DIR} && mkdir -p ${NON_DECOUPLED_DIR} -cp -r ../L0_decoupled/models/repeat_int32 ${NON_DECOUPLED_DIR}/. && \ - (cd ${NON_DECOUPLED_DIR}/repeat_int32 && \ - sed -i "s/decoupled: True/decoupled: False/" config.pbtxt) - -SERVER_ARGS="--model-repository=${NON_DECOUPLED_DIR}" -SERVER_LOG="./non_decoupled_inference_server.log" - -run_server -if [ "$SERVER_PID" == "0" ]; then - echo -e "\n***\n*** Failed to start $SERVER\n***" - cat $SERVER_LOG - exit 1 -fi - -CLIENT_LOG=`pwd`/non_decoupled_client.log -echo "Test: NonDecoupledTest" >>$CLIENT_LOG -set +e -python $DECOUPLED_TEST NonDecoupledTest >>$CLIENT_LOG 2>&1 -if [ $? -ne 0 ]; then - echo -e "\n***\n*** Test NonDecoupledTest Failed\n***" >>$CLIENT_LOG - echo -e "\n***\n*** Test NonDecoupledTest Failed\n***" - RET=1 -else - check_test_results $TEST_RESULT_FILE 2 - if [ $? -ne 0 ]; then - cat $CLIENT_LOG - echo -e "\n***\n*** Test Result Verification Failed\n***" - RET=1 - fi -fi - -set -e - -kill $SERVER_PID -wait $SERVER_PID - -if [ $RET -eq 0 ]; then - echo -e "\n***\n*** Test Passed\n***" -else - echo -e "\n***\n*** Test Failed\n***" -fi - -exit $RET \ No newline at end of file diff --git a/qa/L0_grpc_error_state_cleanup/cleanup_test.py b/qa/L0_grpc_error_state_cleanup/cleanup_test.py deleted file mode 100755 index 4425c5c667..0000000000 --- a/qa/L0_grpc_error_state_cleanup/cleanup_test.py +++ /dev/null @@ -1,648 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -import sys - -sys.path.append("../common") - -import os -import queue -import signal -import time -import unittest -from functools import partial - -import numpy as np -import test_util as tu -import tritonclient.grpc as grpcclient -from tritonclient.utils import InferenceServerException - - -class UserData: - def __init__(self): - self._response_queue = queue.Queue() - - -def callback(user_data, result, error): - if error: - user_data._response_queue.put(error) - else: - user_data._response_queue.put(result) - - -# These state cleanup tests relies on the test.sh -# to check whether all the created request objects -# were properly deleted by the sever. -# The purpose on these unittest is to exercise -# different portions of the gRPC frontend and -# and track the state objects. -class CleanUpTest(tu.TestResultCollector): - SERVER_PID = None - - def setUp(self): - self.decoupled_model_name_ = "repeat_int32" - self.identity_model_name_ = "custom_zero_1_float32" - self.repeat_non_decoupled_model_name = "repeat_int32_non_decoupled" - - def _prepare_inputs_and_outputs(self, kind): - if kind in ("decoupled_streaming", "non_decoupled_streaming"): - self.inputs_ = [] - self.inputs_.append(grpcclient.InferInput("IN", [1], "INT32")) - self.inputs_.append(grpcclient.InferInput("DELAY", [1], "UINT32")) - self.inputs_.append(grpcclient.InferInput("WAIT", [1], "UINT32")) - - self.outputs_ = [] - self.outputs_.append(grpcclient.InferRequestedOutput("OUT")) - self.outputs_.append(grpcclient.InferRequestedOutput("IDX")) - self.requested_outputs_ = self.outputs_ - elif kind in ("simple", "streaming"): - self.inputs_ = [] - self.inputs_.append(grpcclient.InferInput("INPUT0", [1, 1], "FP32")) - - self.outputs_ = [] - self.outputs_.append(grpcclient.InferRequestedOutput("OUTPUT0")) - self.requested_outputs_ = self.outputs_ - else: - raise ValueError("Unsupported kind specified to prepare inputs/outputs") - - def _simple_infer( - self, - request_count, - cancel_response_idx=None, - client_timeout_pair=None, - kill_server=None, - ): - with grpcclient.InferenceServerClient( - url="localhost:8001", verbose=True - ) as triton_client: - self._prepare_inputs_and_outputs("simple") - - input_data = np.array([[1.0]], dtype=np.float32) - self.inputs_[0].set_data_from_numpy(input_data) - - user_data = UserData() - - futures = [] - timeout_idx = None - timeout_value = None - if client_timeout_pair: - timeout_idx, timeout_value = client_timeout_pair - for i in range(request_count): - if kill_server == i: - os.kill(int(self.SERVER_PID), signal.SIGINT) - this_timeout = None - if timeout_idx == i: - this_timeout = timeout_value - futures.append( - triton_client.async_infer( - model_name=self.identity_model_name_, - inputs=self.inputs_, - request_id=str(i), - callback=partial(callback, user_data), - outputs=self.requested_outputs_, - client_timeout=this_timeout, - ) - ) - - if cancel_response_idx is not None: - futures[cancel_response_idx].cancel() - - responses = [] - while len(responses) < len(futures): - data_item = user_data._response_queue.get() - if type(data_item) == InferenceServerException: - raise data_item - else: - responses.append(data_item) - - for response in responses: - output0_data = response.as_numpy("OUTPUT0") - self.assertTrue(np.array_equal(input_data, output0_data)) - - def _stream_infer_with_params( - self, - request_count, - request_delay, - _, - user_data, - result_dict, - delay_data=None, - delay_factor=None, - cancel_response_idx=None, - stream_timeout=None, - kill_server=None, - ): - with 
grpcclient.InferenceServerClient( - url="localhost:8001", verbose=True - ) as triton_client: - # Establish stream - metadata = {"triton_grpc_error": "true"} - triton_client.start_stream( - callback=partial(callback, user_data), - stream_timeout=stream_timeout, - headers=metadata, - ) - # Send specified many requests in parallel - for i in range(request_count): - time.sleep((request_delay / 1000)) - self.inputs_[1].set_data_from_numpy(delay_data) - if kill_server == i: - os.kill(int(self.SERVER_PID), signal.SIGINT) - triton_client.async_stream_infer( - model_name=self.decoupled_model_name_, - inputs=self.inputs_, - request_id=str(i), - outputs=self.requested_outputs_, - # Opt-in to receiving flags-only responses from model/backend - # to help detect final responses for decoupled models. - enable_empty_final_response=True, - ) - # Update delay input in accordance with the scaling factor - delay_data = delay_data * delay_factor - delay_data = delay_data.astype(np.uint32) - - # Retrieve results... - recv_count = 0 - completed_requests = 0 - while completed_requests < request_count: - if cancel_response_idx == recv_count: - triton_client.stop_stream(cancel_requests=True) - data_item = user_data._response_queue.get() - if type(data_item) == InferenceServerException: - raise data_item - else: - response = data_item.get_response() - # Request IDs should generally be provided with each request - # to associate decoupled responses with their requests. - if not response.id: - raise ValueError( - "No response id found. Was a request_id provided?" - ) - - # Detect final response. Parameters are oneof and we expect bool_param - if response.parameters.get("triton_final_response").bool_param: - completed_requests += 1 - - # Only process non-empty response, ignore if empty (no outputs) - if response.outputs: - if response.id not in result_dict: - result_dict[response.id] = [] - result_dict[response.id].append((recv_count, data_item)) - recv_count += 1 - - def _stream_infer( - self, - request_count, - request_delay, - expected_count, - user_data, - result_dict, - delay_data=None, - delay_factor=None, - cancel_response_idx=None, - stream_timeout=None, - kill_server=None, - ): - with grpcclient.InferenceServerClient( - url="localhost:8001", verbose=True - ) as triton_client: - # Establish stream - metadata = {"triton_grpc_error": "true"} - triton_client.start_stream( - callback=partial(callback, user_data), - stream_timeout=stream_timeout, - headers=metadata, - ) - # Send specified many requests in parallel - for i in range(request_count): - time.sleep((request_delay / 1000)) - model_name = self.identity_model_name_ - if delay_data is not None: - model_name = self.decoupled_model_name_ - self.inputs_[1].set_data_from_numpy(delay_data) - if kill_server == i: - os.kill(int(self.SERVER_PID), signal.SIGINT) - triton_client.async_stream_infer( - model_name=model_name, - inputs=self.inputs_, - request_id=str(i), - outputs=self.requested_outputs_, - ) - if (delay_data is not None) and (delay_factor is not None): - # Update delay input in accordance with the scaling factor - delay_data = delay_data * delay_factor - delay_data = delay_data.astype(np.uint32) - - # Retrieve results... 
- recv_count = 0 - while recv_count < expected_count: - if cancel_response_idx == recv_count: - triton_client.stop_stream(cancel_requests=True) - data_item = user_data._response_queue.get() - if type(data_item) == InferenceServerException: - raise data_item - else: - this_id = data_item.get_response().id - if this_id not in result_dict: - result_dict[this_id] = [] - result_dict[this_id].append((recv_count, data_item)) - - recv_count += 1 - - def _streaming_infer( - self, - request_count, - request_delay=0, - cancel_response_idx=None, - stream_timeout=None, - kill_server=None, - should_error=True, - ): - self._prepare_inputs_and_outputs("streaming") - - input_data = np.array([[1.0]], dtype=np.float32) - self.inputs_[0].set_data_from_numpy(input_data) - - user_data = UserData() - result_dict = {} - - try: - expected_count = request_count - self._stream_infer( - request_count, - request_delay, - expected_count, - user_data, - result_dict, - cancel_response_idx=cancel_response_idx, - stream_timeout=stream_timeout, - kill_server=kill_server, - ) - except Exception as ex: - if cancel_response_idx or stream_timeout or should_error: - raise ex - self.assertTrue(False, "unexpected error {}".format(ex)) - - # Validate the results.. - for i in range(request_count): - this_id = str(i) - if this_id not in result_dict.keys(): - self.assertTrue( - False, "response for request id {} not received".format(this_id) - ) - self.assertEqual(len(result_dict[this_id]), 1) - result = result_dict[this_id][0][1] - output0_data = result.as_numpy("OUTPUT0") - self.assertTrue(np.array_equal(input_data, output0_data)) - - def _decoupled_infer( - self, - request_count, - request_delay=0, - repeat_count=1, - data_offset=100, - delay_time=1000, - delay_factor=1, - wait_time=500, - cancel_response_idx=None, - stream_timeout=None, - kill_server=None, - should_error=True, - infer_helper_map=[True, True], - ): - self._prepare_inputs_and_outputs(kind="decoupled_streaming") - - # Initialize data for IN - input_data = np.arange( - start=data_offset, stop=data_offset + repeat_count, dtype=np.int32 - ) - self.inputs_[0].set_shape([repeat_count]) - self.inputs_[0].set_data_from_numpy(input_data) - - # Initialize data for DELAY - delay_data = (np.ones([repeat_count], dtype=np.uint32)) * delay_time - self.inputs_[1].set_shape([repeat_count]) - - # Initialize data for WAIT - wait_data = np.array([wait_time], dtype=np.uint32) - self.inputs_[2].set_data_from_numpy(wait_data) - - infer_helpers = [] - if infer_helper_map[0]: - infer_helpers.append(self._stream_infer) - if infer_helper_map[1]: - infer_helpers.append(self._stream_infer_with_params) - - for infer_helper in infer_helpers: - user_data = UserData() - result_dict = {} - - try: - expected_count = repeat_count * request_count - infer_helper( - request_count, - request_delay, - expected_count, - user_data, - result_dict, - delay_data, - delay_factor, - cancel_response_idx, - stream_timeout, - kill_server, - ) - except Exception as ex: - if cancel_response_idx or stream_timeout or should_error: - raise ex - self.assertTrue(False, "unexpected error {}".format(ex)) - - # Validate the results.. 
- for i in range(request_count): - this_id = str(i) - if repeat_count != 0 and this_id not in result_dict.keys(): - self.assertTrue( - False, "response for request id {} not received".format(this_id) - ) - elif repeat_count == 0 and this_id in result_dict.keys(): - self.assertTrue( - False, - "received unexpected response for request id {}".format( - this_id - ), - ) - if repeat_count != 0: - self.assertEqual(len(result_dict[this_id]), repeat_count) - expected_data = data_offset - result_list = result_dict[this_id] - for j in range(len(result_list)): - this_data = result_list[j][1].as_numpy("OUT") - self.assertEqual(len(this_data), 1) - self.assertEqual(this_data[0], expected_data) - this_idx = result_list[j][1].as_numpy("IDX") - self.assertEqual(len(this_idx), 1) - self.assertEqual(this_idx[0], j) - expected_data += 1 - - ### - ### Non-Streaming Tests - ### - def test_simple_infer(self): - # This test case sends 10 asynchronous requests and validates - # the response. - self._simple_infer(request_count=10) - - def test_simple_infer_cancellation(self): - # This test case is used to check whether all the states are - # correctly released when one of the request is cancelled from - # the client side. - with self.assertRaises(InferenceServerException) as cm: - self._simple_infer(request_count=10, cancel_response_idx=5) - self.assertIn("Locally cancelled by application!", str(cm.exception)) - - def test_simple_infer_timeout(self): - # This test case is used to check whether all the states are - # correctly released when the request gets timed-out on the client. - with self.assertRaises(InferenceServerException) as cm: - self._simple_infer(request_count=10, client_timeout_pair=[5, 0.1]) - self.assertIn("Deadline Exceeded", str(cm.exception)) - - def test_simple_infer_error_status(self): - # This test case is used to check whether all the state objects are - # released when RPC runs into error. - with self.assertRaises(InferenceServerException) as cm: - self._simple_infer(request_count=10) - self.assertIn( - "This protocol is restricted, expecting header 'triton-grpc-protocol-infer-key'", - str(cm.exception), - ) - - def test_simple_infer_shutdownserver(self): - # This test case is used to check whether all the state objects are - # released when the server is interrupted to shutdown in the beginning - # of inference run with final parameters being returned. - with self.assertRaises(InferenceServerException) as cm: - self._simple_infer(request_count=20, kill_server=5) - - ### - ### Streaming Tests - ### - def test_streaming_infer(self): - # Sanity test to check whether all the state objects - # are correctly released. Sends 10 requests in a single - # gRPC bidirectional stream. - self._streaming_infer(request_count=10) - - def test_streaming_cancellation(self): - # This test case is used to check whether all the states are - # correctly released when the stream is closed when fifth - # response is received. - with self.assertRaises(InferenceServerException) as cm: - self._streaming_infer(request_count=10, cancel_response_idx=5) - self.assertIn("Locally cancelled by application!", str(cm.exception)) - - def test_streaming_timeout(self): - # This test case is used to check whether all the states are - # released when some of the requests timeouts. 
- with self.assertRaises(InferenceServerException) as cm: - self._streaming_infer(request_count=10, request_delay=1, stream_timeout=2) - self.assertIn("Deadline Exceeded", str(cm.exception)) - - def test_streaming_error_status(self): - # This test case is used to check whether all the state objects are - # released when RPC runs into error. - expected_exceptions = [ - "This protocol is restricted, expecting header 'triton-grpc-protocol-infer-key'", - "The stream is no longer in valid state, the error detail is reported through provided callback. A new stream should be started after stopping the current stream.", - ] - with self.assertRaises(InferenceServerException) as cm: - self._streaming_infer(request_count=10, should_error=True) - - exception_match = False - for expected_exception in expected_exceptions: - exception_match |= expected_exception in str(cm.exception) - self.assertTrue( - exception_match, "Raised unexpected exception {}".format(str(cm.exception)) - ) - - def test_streaming_infer_shutdownserver(self): - # This test case is used to check whether all the state objects are - # released when the server is interrupted to shutdown in middle of - # inference run. - with self.assertRaises(InferenceServerException) as cm: - self._streaming_infer( - request_count=10, - request_delay=1, - kill_server=5, - should_error=True, - ) - - ### - ### Decoupled Streaming Tests - ### - def test_decoupled_infer(self): - # Sanity test to check whether all the state objects - # are correctly released. Sends 10 requests in a single - # gRPC bidirectional stream and expects each of these - # requests to generate 10 responses. - self._decoupled_infer(request_count=10, repeat_count=10) - - def test_decoupled_cancellation(self): - # This test case is used to check whether all the states are - # correctly released when the stream is closed when fifth - # response is received. - with self.assertRaises(InferenceServerException) as cm: - self._decoupled_infer( - request_count=10, repeat_count=10, cancel_response_idx=5 - ) - self.assertIn("Locally cancelled by application!", str(cm.exception)) - - def test_decoupled_timeout(self): - # This test case is used to check whether all the states are - # released when some of the requests timeouts. - with self.assertRaises(InferenceServerException) as cm: - self._decoupled_infer( - request_count=10, repeat_count=10, request_delay=1, stream_timeout=2 - ) - self.assertIn("Deadline Exceeded", str(cm.exception)) - - def test_decoupled_error_status(self): - # This test case is used to check whether all the state objects are - # released when RPC runs into error. - expected_exceptions = [ - "This protocol is restricted, expecting header 'triton-grpc-protocol-infer-key'", - "The stream is no longer in valid state, the error detail is reported through provided callback. A new stream should be started after stopping the current stream.", - ] - with self.assertRaises(InferenceServerException) as cm: - self._decoupled_infer(request_count=10, repeat_count=10, should_error=True) - - exception_match = False - for expected_exception in expected_exceptions: - exception_match |= expected_exception in str(cm.exception) - self.assertTrue( - exception_match, "Raised unexpected exception {}".format(str(cm.exception)) - ) - - def test_decoupled_infer_shutdownserver(self): - # This test case is used to check whether all the state objects are - # released when the server is interrupted to shutdown in middle of - # inference run. 
- with self.assertRaises(InferenceServerException) as cm: - self._decoupled_infer( - request_count=10, - repeat_count=10, - request_delay=1, - kill_server=5, - should_error=True, - infer_helper_map=[True, False], - ) - - def test_decoupled_infer_with_params_shutdownserver(self): - # This test case is used to check whether all the state objects are - # released when the server is interrupted to shutdown in middle of - # inference run with final parameters being returned. - with self.assertRaises(InferenceServerException) as cm: - self._decoupled_infer( - request_count=10, - repeat_count=10, - request_delay=1, - kill_server=5, - should_error=True, - infer_helper_map=[False, True], - ) - - def test_decoupled_infer_complete(self): - # Test if the Process() thread could release the state object before - # the StreamInferResponseComplete() thread is done accessing it. - self._decoupled_infer(request_count=1, repeat_count=1, stream_timeout=16) - # Check no error is printed to the log. - with open(os.environ["SERVER_LOG"]) as f: - server_log = f.read() - self.assertNotIn("Should not print this", server_log) - - def test_non_decoupled_streaming_multi_response(self): - # Test non-decoupled streaming infer with more than one response should return - # the first response. - response_count = 4 - expected_response_count = 1 - expected_response_index = 0 - - # Prepare input data - self._prepare_inputs_and_outputs("non_decoupled_streaming") - # Initialize data for IN - data_offset = 100 - input_data = np.arange( - start=data_offset, stop=data_offset + response_count, dtype=np.int32 - ) - self.inputs_[0].set_shape([response_count]) - self.inputs_[0].set_data_from_numpy(input_data) - # Initialize data for DELAY - delay_data = np.zeros([response_count], dtype=np.uint32) - self.inputs_[1].set_shape([response_count]) - self.inputs_[1].set_data_from_numpy(delay_data) - # Initialize data for WAIT - wait_data = np.array([0], dtype=np.uint32) - self.inputs_[2].set_data_from_numpy(wait_data) - - # Infer - user_data = UserData() - with grpcclient.InferenceServerClient( - url="localhost:8001", verbose=True - ) as client: - # Establish stream - metadata = {"triton_grpc_error": "true"} - client.start_stream( - callback=partial(callback, user_data), - stream_timeout=16, - headers=metadata, - ) - # Send a request - client.async_stream_infer( - model_name=self.repeat_non_decoupled_model_name, - inputs=self.inputs_, - request_id="0", - outputs=self.requested_outputs_, - ) - # Wait for all results and stop stream - client.stop_stream() - - # Check infer output - actual_response_count = 0 - while not user_data._response_queue.empty(): - actual_response_count += 1 - data_item = user_data._response_queue.get() - if type(data_item) == InferenceServerException: - raise data_item - else: - response_idx = data_item.as_numpy("IDX")[0] - self.assertEqual(response_idx, expected_response_index) - self.assertEqual(actual_response_count, expected_response_count) - - -if __name__ == "__main__": - CleanUpTest.SERVER_PID = os.environ.get("SERVER_PID", CleanUpTest.SERVER_PID) - unittest.main() diff --git a/qa/L0_grpc_error_state_cleanup/test.sh b/qa/L0_grpc_error_state_cleanup/test.sh deleted file mode 100755 index df302d5ed1..0000000000 --- a/qa/L0_grpc_error_state_cleanup/test.sh +++ /dev/null @@ -1,235 +0,0 @@ -#!/bin/bash -# Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
-# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION} -if [ "$#" -ge 1 ]; then - REPO_VERSION=$1 -fi -if [ -z "$REPO_VERSION" ]; then - echo -e "Repository version must be specified" - echo -e "\n***\n*** Test Failed\n***" - exit 1 -fi -if [ ! -z "$TEST_REPO_ARCH" ]; then - REPO_VERSION=${REPO_VERSION}_${TEST_REPO_ARCH} -fi - -export CUDA_VISIBLE_DEVICES=0 - -RET=0 -CLEANUP_TEST=cleanup_test.py - -rm -f *.log - -CLIENT_LOG=`pwd`/client.log -SERVER=/opt/tritonserver/bin/tritonserver -source ../common/util.sh - -function check_state_release() { - local log_file=$1 - - num_state_release=`cat $log_file | grep "StateRelease" | wc -l` - num_state_new=`cat $log_file | grep "StateNew" | wc -l` - - if [ $num_state_release -ne $num_state_new ]; then - cat $log_file - echo -e "\n***\n*** Test Failed: Mismatch detected, $num_state_new state(s) created, $num_state_release state(s) released. \n***" >> $log_file - return 1 - fi - - return 0 -} - -rm -fr ./models/custom_zero_1_float32 && \ - cp -r ../custom_models/custom_zero_1_float32 ./models/. 
&& \ - mkdir -p ./models/custom_zero_1_float32/1 - -(cd models/custom_zero_1_float32 && \ - echo "parameters [" >> config.pbtxt && \ - echo "{ key: \"execute_delay_ms\"; value: { string_value: \"1000\" }}" >> config.pbtxt && \ - echo "]" >> config.pbtxt) - -rm -rf models/repeat_int32_non_decoupled && \ - cp -r models/repeat_int32 models/repeat_int32_non_decoupled && \ - (cd models/repeat_int32_non_decoupled && \ - sed -i "/model_transaction_policy/,+2d" config.pbtxt && \ - sed -i "s/repeat_int32/repeat_int32_non_decoupled/" config.pbtxt) - -for i in test_simple_infer \ - test_simple_infer_cancellation \ - test_simple_infer_timeout \ - test_streaming_infer \ - test_streaming_timeout \ - test_streaming_cancellation \ - test_decoupled_infer \ - test_decoupled_cancellation \ - test_decoupled_timeout \ - test_non_decoupled_streaming_multi_response; do - SERVER_LOG="./inference_server.$i.log" - SERVER_ARGS="--model-repository=`pwd`/models --log-verbose=2" - run_server - if [ "$SERVER_PID" == "0" ]; then - echo -e "\n***\n*** Failed to start $SERVER\n***" - cat $SERVER_LOG - exit 1 - fi - - echo "Test: $i" >>$CLIENT_LOG - - set +e - python $CLEANUP_TEST CleanUpTest.$i >>$CLIENT_LOG 2>&1 - if [ $? -ne 0 ]; then - echo -e "\n***\n*** Test $i Failed\n***" >>$CLIENT_LOG - echo -e "\n***\n*** Test $i Failed\n***" - RET=1 - fi - - kill $SERVER_PID - wait $SERVER_PID - - check_state_release $SERVER_LOG - if [ $? -ne 0 ]; then - cat $SERVER_LOG - echo -e "\n***\n*** State Verification Failed for $i\n***" - RET=1 - fi - set -e -done - - -for i in test_simple_infer_error_status \ - test_streaming_error_status \ - test_decoupled_error_status; do - SERVER_LOG="./inference_server.$i.log" - SERVER_ARGS="--model-repository=`pwd`/models --log-verbose=2 --grpc-restricted-protocol=inference:infer-key=infer-value" - run_server - if [ "$SERVER_PID" == "0" ]; then - echo -e "\n***\n*** Failed to start $SERVER\n***" - cat $SERVER_LOG - exit 1 - fi - - echo "Test: $i" >>$CLIENT_LOG - - set +e - python $CLEANUP_TEST CleanUpTest.$i >>$CLIENT_LOG 2>&1 - if [ $? -ne 0 ]; then - echo -e "\n***\n*** Test $i Failed\n***" >>$CLIENT_LOG - echo -e "\n***\n*** Test $i Failed\n***" - RET=1 - fi - - kill $SERVER_PID - wait $SERVER_PID - - check_state_release $SERVER_LOG - if [ $? -ne 0 ]; then - cat $SERVER_LOG - echo -e "\n***\n*** State Verification Failed for $i\n***" - RET=1 - fi - - set -e -done - -for i in test_simple_infer_shutdownserver \ - test_streaming_infer_shutdownserver \ - test_decoupled_infer_shutdownserver \ - test_decoupled_infer_with_params_shutdownserver; do - SERVER_ARGS="--model-repository=`pwd`/models --log-verbose=2" - SERVER_LOG="./inference_server.$i.log" - run_server - if [ "$SERVER_PID" == "0" ]; then - echo -e "\n***\n*** Failed to start $SERVER\n***" - cat $SERVER_LOG - exit 1 - fi - - echo "Test: $i" >>$CLIENT_LOG - - set +e - SERVER_PID=$SERVER_PID python $CLEANUP_TEST CleanUpTest.$i >>$CLIENT_LOG 2>&1 - if [ $? -ne 0 ]; then - echo -e "\n***\n*** Test $i Failed\n***" >>$CLIENT_LOG - echo -e "\n***\n*** Test $i Failed\n***" - RET=1 - fi - - wait $SERVER_PID - - check_state_release $SERVER_LOG - if [ $? 
-ne 0 ]; then - cat $SERVER_LOG - echo -e "\n***\n*** State Verification Failed for $i\n***" - RET=1 - fi - - set -e -done - -TEST_NAME=test_decoupled_infer_complete -export TRITONSERVER_DELAY_GRPC_COMPLETE=2000 - -SERVER_LOG="./inference_server.$TEST_NAME.log" -SERVER_ARGS="--model-repository=`pwd`/models --log-verbose=2" -run_server -if [ "$SERVER_PID" == "0" ]; then - echo -e "\n***\n*** Failed to start $SERVER\n***" - cat $SERVER_LOG - exit 1 -fi - -echo "Test: $TEST_NAME" >>$CLIENT_LOG - -set +e - -SERVER_LOG=$SERVER_LOG python $CLEANUP_TEST CleanUpTest.$TEST_NAME >>$CLIENT_LOG 2>&1 -if [ $? -ne 0 ]; then - cat $CLIENT_LOG - echo -e "\n***\n*** Test $TEST_NAME Failed\n***" - RET=1 -fi - -kill $SERVER_PID -wait $SERVER_PID - -check_state_release $SERVER_LOG -if [ $? -ne 0 ]; then - cat $SERVER_LOG - echo -e "\n***\n*** State Verification Failed for $TEST_NAME\n***" - RET=1 -fi - -set -e - -if [ $RET -eq 0 ]; then - echo -e "\n***\n*** Test Passed\n***" -else - echo -e "\n***\n*** Test Failed\n***" -fi - -exit $RET diff --git a/qa/L0_grpc_state_cleanup/cleanup_test.py b/qa/L0_grpc_state_cleanup/cleanup_test.py index 431eeb1720..f7507747e9 100755 --- a/qa/L0_grpc_state_cleanup/cleanup_test.py +++ b/qa/L0_grpc_state_cleanup/cleanup_test.py @@ -161,9 +161,17 @@ def _stream_infer_with_params( url="localhost:8001", verbose=True ) as triton_client: # Establish stream - triton_client.start_stream( - callback=partial(callback, user_data), stream_timeout=stream_timeout - ) + if "TRITONSERVER_GRPC_STATUS_FLAG" in os.environ: + metadata = {"triton_grpc_error": "true"} + triton_client.start_stream( + callback=partial(callback, user_data), + stream_timeout=stream_timeout, + headers=metadata, + ) + else: + triton_client.start_stream( + callback=partial(callback, user_data), stream_timeout=stream_timeout + ) # Send specified many requests in parallel for i in range(request_count): time.sleep((request_delay / 1000)) @@ -229,9 +237,17 @@ def _stream_infer( url="localhost:8001", verbose=True ) as triton_client: # Establish stream - triton_client.start_stream( - callback=partial(callback, user_data), stream_timeout=stream_timeout - ) + if "TRITONSERVER_GRPC_STATUS_FLAG" in os.environ: + metadata = {"triton_grpc_error": "true"} + triton_client.start_stream( + callback=partial(callback, user_data), + stream_timeout=stream_timeout, + headers=metadata, + ) + else: + triton_client.start_stream( + callback=partial(callback, user_data), stream_timeout=stream_timeout + ) # Send specified many requests in parallel for i in range(request_count): time.sleep((request_delay / 1000)) @@ -608,9 +624,17 @@ def test_non_decoupled_streaming_multi_response(self): url="localhost:8001", verbose=True ) as client: # Establish stream - client.start_stream( - callback=partial(callback, user_data), stream_timeout=16 - ) + if "TRITONSERVER_GRPC_STATUS_FLAG" in os.environ: + metadata = {"triton_grpc_error": "true"} + client.start_stream( + callback=partial(callback, user_data), + stream_timeout=16, + headers=metadata, + ) + else: + client.start_stream( + callback=partial(callback, user_data), stream_timeout=16 + ) # Send a request client.async_stream_infer( model_name=self.repeat_non_decoupled_model_name, From 887aaa237c63dc4954c747209f5958969e3132cc Mon Sep 17 00:00:00 2001 From: Indrajit Bhosale Date: Fri, 16 Aug 2024 12:06:35 -0700 Subject: [PATCH 31/32] PR comments fixed and main merged --- qa/L0_backend_python/lifecycle/lifecycle_test.py | 6 ------ 1 file changed, 6 deletions(-) diff --git 
a/qa/L0_backend_python/lifecycle/lifecycle_test.py b/qa/L0_backend_python/lifecycle/lifecycle_test.py index 607726b961..d6eb2a8f53 100755 --- a/qa/L0_backend_python/lifecycle/lifecycle_test.py +++ b/qa/L0_backend_python/lifecycle/lifecycle_test.py @@ -255,10 +255,8 @@ def test_triton_grpc_error_error_on(self): callback=partial(callback, user_data), headers=metadata ) stream_end = False - input_datas = [] for i in range(number_of_requests): input_data = np.random.randn(*shape).astype(np.float32) - input_datas.append(input_data) inputs = [ grpcclient.InferInput( "IN", input_data.shape, np_to_triton_dtype(input_data.dtype) @@ -317,10 +315,8 @@ def test_triton_grpc_error_cancel(self): callback=partial(callback, user_data), headers=metadata ) - input_datas = [] for i in range(number_of_requests): input_data = np.random.randn(*shape).astype(np.float32) - input_datas.append(input_data) inputs = [ grpcclient.InferInput( "IN", input_data.shape, np_to_triton_dtype(input_data.dtype) @@ -360,10 +356,8 @@ def test_triton_grpc_error_error_off(self): user_data = UserData() triton_client = grpcclient.InferenceServerClient(f"{_tritonserver_ipaddr}:8001") triton_client.start_stream(callback=partial(callback, user_data)) - input_datas = [] for i in range(number_of_requests): input_data = np.random.randn(*shape).astype(np.float32) - input_datas.append(input_data) inputs = [ grpcclient.InferInput( "IN", input_data.shape, np_to_triton_dtype(input_data.dtype) From 0e7670ca7716083b10882622e235ec432419086b Mon Sep 17 00:00:00 2001 From: Indrajit Bhosale Date: Fri, 16 Aug 2024 13:58:21 -0700 Subject: [PATCH 32/32] DockerFile fixed --- Dockerfile.QA | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile.QA b/Dockerfile.QA index 22f312b930..b381abfaaf 100644 --- a/Dockerfile.QA +++ b/Dockerfile.QA @@ -268,7 +268,7 @@ RUN cp -r qa/L0_decoupled/models qa/L0_decoupled/python_models/ && \ qa/L0_decoupled/python_models/square_int32/. RUN mkdir -p qa/L0_decoupled_grpc_error && \ - cp -r qa/L0_decoupled/. qa/L0_decoupled_grpc_error && \ + cp -r qa/L0_decoupled/. qa/L0_decoupled_grpc_error RUN mkdir -p qa/L0_grpc_error_state_cleanup && \ cp -r qa/L0_grpc_state_cleanup/. qa/L0_grpc_error_state_cleanup
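Note (illustrative, not part of the patch series): the streaming tests touched above opt into gRPC error reporting by passing the "triton_grpc_error" header when the stream is established, so that stream-level failures are delivered through the registered callback (the expected client message in the error-status tests reads "the error detail is reported through provided callback"). The sketch below is a minimal, self-contained client showing that pattern. It assumes a running Triton server on localhost:8001 serving the decoupled repeat_int32 model, mirroring the fixtures used in these tests; it is a usage sketch under those assumptions, not a definitive reference implementation.

from functools import partial
import queue

import numpy as np
import tritonclient.grpc as grpcclient
from tritonclient.utils import InferenceServerException


def callback(response_queue, result, error):
    # With the triton_grpc_error header set on the stream, errors arrive here
    # as InferenceServerException objects rather than only closing the stream.
    response_queue.put(error if error is not None else result)


responses = queue.Queue()

# Assumed environment: Triton listening on localhost:8001 with the decoupled
# repeat_int32 model loaded, as in the test fixtures above.
with grpcclient.InferenceServerClient(url="localhost:8001") as client:
    inputs = [
        grpcclient.InferInput("IN", [1], "INT32"),
        grpcclient.InferInput("DELAY", [1], "UINT32"),
        grpcclient.InferInput("WAIT", [1], "UINT32"),
    ]
    inputs[0].set_data_from_numpy(np.array([100], dtype=np.int32))
    inputs[1].set_data_from_numpy(np.array([0], dtype=np.uint32))
    inputs[2].set_data_from_numpy(np.array([0], dtype=np.uint32))

    # Opt in to stream error reporting via the request header.
    client.start_stream(
        callback=partial(callback, responses),
        headers={"triton_grpc_error": "true"},
    )
    client.async_stream_infer(
        model_name="repeat_int32", inputs=inputs, request_id="0"
    )
    client.stop_stream()

item = responses.get(timeout=10)
if isinstance(item, InferenceServerException):
    print("stream error:", item)
else:
    print("OUT:", item.as_numpy("OUT"))

The TRITONSERVER_GRPC_STATUS_FLAG environment variable referenced in the updated cleanup tests only controls whether the tests themselves send this header; the sketch above sends it unconditionally.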