triton-inference-server · oandreeva-nv · Jun 6, 2024 · Jun 5, 2024 · Jun 5, 2024 · Jun 5, 2024
diff --git a/qa/L0_request_cancellation/grpc_cancellation_test.py b/qa/L0_request_cancellation/grpc_cancellation_test.py
@@ -50,6 +50,29 @@ def callback(user_data, result, error):
         user_data._completed_requests.put(result)
 
 
+def prepare_inputs_outputs():
+    """Return ([INPUT0], [OUTPUT0]); INPUT0 is a [1, 1] INT32 tensor holding 5."""
+    input0 = grpcclient.InferInput("INPUT0", [1, 1], "INT32")
+    output0 = grpcclient.InferRequestedOutput("OUTPUT0")
+    input0.set_data_from_numpy(np.array([[5]], dtype=np.int32))
+    return [input0], [output0]
+
+
+def grpc_async_infer_request_with_instant_cancellation(model_name):
+    """Send async inferences to localhost:8001 for 60 s, cancelling each at once."""
+    request_inputs, request_outputs = prepare_inputs_outputs()
+    on_complete = partial(callback, UserData())
+    with grpcclient.InferenceServerClient(url="localhost:8001") as client:
+        deadline = time.time() + 60
+        while time.time() < deadline:
+            client.async_infer(
+                model_name=model_name,
+                inputs=request_inputs,
+                callback=on_complete,
+                outputs=request_outputs,
+            ).cancel()
+
+
 class GrpcCancellationTest(unittest.IsolatedAsyncioTestCase):
     _model_name = "custom_identity_int32"
     _model_delay = 10.0  # seconds
@@ -68,11 +91,7 @@ def tearDown(self):
         self._assert_max_duration()
 
     def _prepare_request(self):
-        self._inputs = []
-        self._inputs.append(grpcclient.InferInput("INPUT0", [1, 1], "INT32"))
-        self._outputs = []
-        self._outputs.append(grpcclient.InferRequestedOutput("OUTPUT0"))
-        self._inputs[0].set_data_from_numpy(np.array([[10]], dtype=np.int32))
+        self._inputs, self._outputs = prepare_inputs_outputs()
 
     def _assert_max_duration(self):
         max_duration = self._model_delay * 0.5  # seconds

diff --git a/qa/L0_request_cancellation/test.sh b/qa/L0_request_cancellation/test.sh
@@ -111,8 +111,74 @@ for TEST_CASE in "test_grpc_async_infer" "test_grpc_stream_infer" "test_aio_grpc
 done
 
 #
-# End-to-end scheduler tests
+# gRPC cancellation stress test
 #
+rm -rf models && mkdir models
+mkdir -p models/custom_identity_int32/1 && (cd models/custom_identity_int32 && \
+    echo 'name: "custom_identity_int32"' >> config.pbtxt && \
+    echo 'backend: "identity"' >> config.pbtxt && \
+    echo 'max_batch_size: 1024' >> config.pbtxt && \
+    echo -e 'input [{ name: "INPUT0" \n data_type: TYPE_INT32 \n dims: [ -1 ] }]' >> config.pbtxt && \
+    echo -e 'output [{ name: "OUTPUT0" \n data_type: TYPE_INT32 \n dims: [ -1 ] }]' >> config.pbtxt && \
+    echo 'instance_group [{ kind: KIND_CPU }]' >> config.pbtxt && \
+    echo -e 'parameters [{ key: "execute_delay_ms" \n value: { string_value: "500" } }]' >> config.pbtxt)
+
+TEST_LOG="./grpc_cancellation_stress_test.log"  # receives the Python stress client's stdout/stderr
+SERVER_LOG="grpc_cancellation_stress_test.server.log"  # grepped later for handler/cancellation messages
+
+SERVER_ARGS="--model-repository=`pwd`/models --log-verbose=2"  # NOTE(review): verbose level presumably required for the grepped messages - confirm
+run_server
+if [ "$SERVER_PID" == "0" ]; then  # a SERVER_PID of 0 is treated as a failed server start
+    echo -e "\n***\n*** Failed to start $SERVER\n***"
+    cat $SERVER_LOG
+    exit 1
+fi
+
+set +e  # do not abort the script on non-zero exit codes in this section
+python -c "import grpc_cancellation_test; grpc_cancellation_test.grpc_async_infer_request_with_instant_cancellation(model_name=\"custom_identity_int32\")" > $TEST_LOG 2>&1 &
+PYTHON_CLIENT_PID=$!
+PREV_NEW_REQ_HANDL_COUNT=-1  # -1 so the first observed count always registers as progress
+NUMBER_RUNS=10  # remaining client restarts before the test ends successfully
+while true; do  # monitor loop: keep one cancelling client alive and watch the server log for progress
+    if ! ps -p $PYTHON_CLIENT_PID > /dev/null; then  # ps -p fails once the client has exited
+        echo "Python process stopped. Restarting..."
+        python -c "import grpc_cancellation_test; grpc_cancellation_test.grpc_async_infer_request_with_instant_cancellation(model_name=\"custom_identity_int32\")" > $TEST_LOG 2>&1 &
+        PYTHON_CLIENT_PID=$!
+        (( NUMBER_RUNS -= 1 ))
+    fi
+    CUR_NEW_REQ_HANDL_COUNT=$(grep -c "New request handler for ModelInferHandler" $SERVER_LOG)
+    echo $CUR_NEW_REQ_HANDL_COUNT
+    sleep 1
+    if [[ $CUR_NEW_REQ_HANDL_COUNT -gt $PREV_NEW_REQ_HANDL_COUNT ]]; then
+        # Handler count grew since the last check; record it as the new baseline
+        PREV_NEW_REQ_HANDL_COUNT=$CUR_NEW_REQ_HANDL_COUNT
+    else
+        # Count stalled: stop the client, report the counters and flag failure via RET=1
+        kill $PYTHON_CLIENT_PID
+        wait $PYTHON_CLIENT_PID
+        echo "Python process killed. Final 'New request handler' count: $CUR_NEW_REQ_HANDL_COUNT"
+        echo "Cancellation notification received count: $(grep -c 'Cancellation notification received for ModelInferHandler, rpc_ok=1, context 0, [0-9]* step' $SERVER_LOG)"
+        echo "Cancellation notification received for START count: $(grep -c 'Cancellation notification received for ModelInferHandler, rpc_ok=1, context 0, [0-9]* step START' $SERVER_LOG)"
+        RET=1
+        break
+    fi
+    if [ "$NUMBER_RUNS" -le 0 ]; then  # all restarts used up: stop the client and finish without failure
+        kill $PYTHON_CLIENT_PID
+        wait $PYTHON_CLIENT_PID
+        echo "Python process killed. Final 'New request handler' count: $CUR_NEW_REQ_HANDL_COUNT"
+        echo "Cancellation notification received count: $(grep -c 'Cancellation notification received for ModelInferHandler, rpc_ok=1, context 0, [0-9]* step' $SERVER_LOG)"
+        echo "Cancellation notification received for START count: $(grep -c 'Cancellation notification received for ModelInferHandler, rpc_ok=1, context 0, [0-9]* step START' $SERVER_LOG)"
+        break
+    fi
+    sleep 20
+done
+
+set -e
+kill $SERVER_PID
+wait $SERVER_PID
+#
+# End-to-end scheduler tests
+#
 rm -rf models && mkdir models
 mkdir -p models/dynamic_batch/1 && (cd models/dynamic_batch && \
     echo 'name: "dynamic_batch"' >> config.pbtxt && \
@@ -174,7 +240,6 @@ set -e
 
 kill $SERVER_PID
 wait $SERVER_PID
-
 #
 # Implicit state tests
 #

diff --git a/src/grpc/infer_handler.cc b/src/grpc/infer_handler.cc
@@ -694,6 +694,16 @@ ModelInferHandler::Process(InferHandler::State* state, bool rpc_ok)
   // Handle notification for cancellation which can be raised
   // asynchronously if detected on the network.
   if (state->IsGrpcContextCancelled()) {
+    if (rpc_ok && (state->step_ == Steps::START) &&
+        (state->context_->step_ != Steps::CANCELLED)) {
+#ifdef TRITON_ENABLE_TRACING
+      // Can't create trace as we don't know the model to be requested,
+      // track timestamps in 'state'
+      state->trace_timestamps_.emplace_back(std::make_pair(
+          "GRPC_WAITREAD_END", TraceManager::CaptureTimestamp()));
+#endif  // TRITON_ENABLE_TRACING
+      StartNewRequest();
+    }
     bool resume = state->context_->HandleCancellation(state, rpc_ok, Name());
     return resume;
   }