Update trace summary script #6758

Merged
merged 8 commits into from Jan 25, 2024
Changes from 6 commits
55 changes: 21 additions & 34 deletions docs/user_guide/trace.md
@@ -274,29 +274,27 @@ GRPC inference requests are reported separately.
```
 File: trace.json
 Summary for simple (-1): trace count = 1
-HTTP infer request (avg): 378us
-    Receive (avg): 21us
-    Send (avg): 7us
-    Overhead (avg): 79us
-    Handler (avg): 269us
-        Overhead (avg): 11us
-        Queue (avg): 15us
-        Compute (avg): 242us
-            Input (avg): 18us
-            Infer (avg): 208us
-            Output (avg): 15us
+HTTP infer request (avg): 403.578us
+    Receive (avg): 20.555us
+    Send (avg): 4.52us
+    Overhead (avg): 24.592us
+    Handler (avg): 353.911us
+        Overhead (avg): 23.675us
+        Queue (avg): 18.019us
+        Compute (avg): 312.217us
+            Input (avg): 24.151us
+            Infer (avg): 244.186us
+            Output (avg): 43.88us
 Summary for simple (-1): trace count = 1
-GRPC infer request (avg): 21441us
-    Wait/Read (avg): 20923us
-    Send (avg): 74us
-    Overhead (avg): 46us
-    Handler (avg): 395us
-        Overhead (avg): 16us
-        Queue (avg): 47us
-        Compute (avg): 331us
-            Input (avg): 30us
-            Infer (avg): 286us
-            Output (avg): 14us
+GRPC infer request (avg): 383.601us
+    Send (avg): 62.816us
+    Handler (avg): 392.924us
+        Overhead (avg): 51.968us
+        Queue (avg): 21.45us
+        Compute (avg): 319.506us
+            Input (avg): 27.76us
+            Infer (avg): 227.844us
+            Output (avg): 63.902us
```
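A quick arithmetic check on the new sample output: in the HTTP summary each level is the sum of its children, and the endpoint Overhead is whatever remains of the total request time. A minimal sketch in Python, using only the HTTP numbers copied from the sample above (microseconds):

```
# HTTP values from the sample summary above, in microseconds.
total, receive, send, handler = 403.578, 20.555, 4.52, 353.911
handler_overhead, queue, compute = 23.675, 18.019, 312.217
inp, infer, out = 24.151, 244.186, 43.88

# Endpoint overhead is the part of the request not covered by the other spans.
assert abs((total - receive - send - handler) - 24.592) < 1e-6
# Handler time breaks down into its own overhead, queue, and compute.
assert abs(handler - (handler_overhead + queue + compute)) < 1e-6
# Compute time breaks down into input, infer, and output.
assert abs(compute - (inp + infer + out)) < 1e-6
```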

Use the -t option to get a summary for each trace in the file. This
@@ -309,10 +307,6 @@ the request was enqueued in the scheduling queue.
 $ trace_summary.py -t <trace file>
 ...
 simple (-1):
-    grpc wait/read start
-        26529us
-    grpc wait/read end
-        39us
     request handler start
         15us
     queue start
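The per-trace view prints the elapsed time between consecutive timestamps. A rough sketch of that computation; the timestamp names and nanosecond values below are invented for illustration, and the loop is a simplification rather than the script's exact code:

```
# Invented (name, nanoseconds) pairs standing in for one trace's timestamps.
ordered_timestamps = [
    ("request handler start", 100_000),
    ("queue start", 115_000),
    ("compute start", 160_000),
]
ordered_timestamps.sort(key=lambda tup: tup[1])

prev_ns = None
for name, ns in ordered_timestamps:
    if prev_ns is not None:
        # Gap between this timestamp and the previous one, in microseconds.
        print("\t\t{}us".format((ns - prev_ns) / 1000))
    print("\t{}".format(name))
    prev_ns = ns
```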
@@ -375,20 +369,13 @@ Data Flow:

The meaning of the trace timestamps is:

-* GRPC Request Wait/Read: Collected only for inference requests that use the
-  GRPC protocol. The time spent waiting for a request to arrive at the
-  server and for that request to be read. Because wait time is
-  included in the time it is not a useful measure of how much time is
-  spent reading a request from the network. Tracing an HTTP request
-  will provide an accurate measure of the read time.
-
 * HTTP Request Receive: Collected only for inference requests that use the
   HTTP protocol. The time required to read the inference request from
   the network.
 
 * Send: The time required to send the inference response.
 
-* Overhead: Additional time required in the HTTP or GRPC endpoint to
+* Overhead: Additional time required in the HTTP endpoint to
   process the inference request and response.
 
 * Handler: The total time spent handling the inference request, not
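As a rough illustration of how these spans relate to the raw trace timestamps, the sketch below derives Receive, Send, Handler, and Overhead for a single HTTP request. The timestamp names are assumed to match the HTTP names keyed on by trace_summary.py (HTTP_RECV_*, HTTP_SEND_*, REQUEST_*), the span_us helper is hypothetical, and the nanosecond values are invented:

```
# Invented timestamps (nanoseconds) for one HTTP inference request.
ts = {
    "HTTP_RECV_START": 0,
    "HTTP_RECV_END": 20_555,     # Receive: reading the request from the network
    "REQUEST_START": 25_000,
    "REQUEST_END": 378_911,      # Handler: request handling inside the server
    "HTTP_SEND_START": 380_000,
    "HTTP_SEND_END": 384_520,    # Send: writing the response
}

def span_us(start, end):
    """Length of a span in microseconds, from two named timestamps."""
    return (ts[end] - ts[start]) / 1000.0

receive = span_us("HTTP_RECV_START", "HTTP_RECV_END")
handler = span_us("REQUEST_START", "REQUEST_END")
send = span_us("HTTP_SEND_START", "HTTP_SEND_END")
total = span_us("HTTP_RECV_START", "HTTP_SEND_END")
# Overhead is the part of the total request time not covered by the other spans.
overhead = total - receive - handler - send
```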
27 changes: 5 additions & 22 deletions qa/common/trace_summary.py
@@ -106,20 +106,13 @@ def filter_timestamp(self):
return "GRPC_WAITREAD_START"

def add_frontend_span(self, span_map, timestamps):
if ("GRPC_WAITREAD_START" in timestamps) and ("GRPC_SEND_END" in timestamps):
if ("GRPC_WAITREAD_END" in timestamps) and ("GRPC_SEND_END" in timestamps):
add_span(
span_map,
timestamps,
"GRPC_INFER",
"GRPC_WAITREAD_START",
"GRPC_SEND_END",
)
add_span(
span_map,
timestamps,
"GRPC_WAITREAD",
"GRPC_WAITREAD_START",
"GRPC_WAITREAD_END",
"GRPC_SEND_END",
)
add_span(
span_map, timestamps, "GRPC_SEND", "GRPC_SEND_START", "GRPC_SEND_END"
@@ -130,19 +123,7 @@ def summarize_frontend_span(self, span_map, cnt):
res = "GRPC infer request (avg): {}us\n".format(
span_map["GRPC_INFER"] / (cnt * 1000)
)
res += "\tWait/Read (avg): {}us\n".format(
span_map["GRPC_WAITREAD"] / (cnt * 1000)
)
res += "\tSend (avg): {}us\n".format(span_map["GRPC_SEND"] / (cnt * 1000))
res += "\tOverhead (avg): {}us\n".format(
(
span_map["GRPC_INFER"]
- span_map["REQUEST"]
- span_map["GRPC_WAITREAD"]
- span_map["GRPC_SEND"]
)
/ (cnt * 1000)
)
return res
else:
return None
@@ -269,7 +250,9 @@ def summarize(frontend, traces):
print("\tparent id: {}".format(trace["parent_id"]))
ordered_timestamps = list()
for ts in trace["timestamps"]:
ordered_timestamps.append((ts["name"], ts["ns"]))
# skip GRPC_WAITREAD
if not ts["name"].startswith("GRPC_WAITREAD"):
ordered_timestamps.append((ts["name"], ts["ns"]))
ordered_timestamps.sort(key=lambda tup: tup[1])

now = None
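With the Wait/Read span and the top-level GRPC overhead removed, the updated GRPC summary is driven by two spans: GRPC_INFER, which now runs from GRPC_WAITREAD_END to GRPC_SEND_END, and GRPC_SEND. The sketch below shows that accumulation in isolation; the add_span helper here is a simplified stand-in for the one in trace_summary.py, and the timestamp values are invented:

```
# Simplified stand-in for trace_summary.py's span accumulation.
def add_span(span_map, timestamps, span_name, ts_start, ts_end):
    # Accumulate (end - start) nanoseconds into span_map[span_name].
    span_map[span_name] = span_map.get(span_name, 0) + (
        timestamps[ts_end] - timestamps[ts_start]
    )

# Invented GRPC timestamps for a single trace, in nanoseconds.
timestamps = {
    "GRPC_WAITREAD_START": 0,  # no longer used by the summary
    "GRPC_WAITREAD_END": 10_000,
    "GRPC_SEND_START": 330_000,
    "GRPC_SEND_END": 393_601,
}

span_map = {}
if ("GRPC_WAITREAD_END" in timestamps) and ("GRPC_SEND_END" in timestamps):
    add_span(span_map, timestamps, "GRPC_INFER", "GRPC_WAITREAD_END", "GRPC_SEND_END")
    add_span(span_map, timestamps, "GRPC_SEND", "GRPC_SEND_START", "GRPC_SEND_END")

cnt = 1  # number of traces in the file
print("GRPC infer request (avg): {}us".format(span_map["GRPC_INFER"] / (cnt * 1000)))
print("\tSend (avg): {}us".format(span_map["GRPC_SEND"] / (cnt * 1000)))
```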