
Commit

tensorflow-lite: use new yolov9s model with separate outputs to fix quantization accuracy loss
koush committed Dec 28, 2024
1 parent 5f7ecc0 commit e33a793
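
Context for the change: with a single fused detection output, every value in the tensor shares one int8 scale and zero point, so components with small numeric ranges (class confidences) get far fewer usable quantization levels than components with large ranges (box values in pixels). Exporting the head as separate output tensors gives each one its own quantization parameters, and the box decoding (DFL, anchors, strides, sigmoid) moves into float numpy in yolo_separate_outputs.py. Below is a minimal numpy sketch of the resolution problem; the quantize/dequantize helpers and the value ranges are illustrative assumptions, not taken from this commit or from the model:

import numpy as np

def quantize_int8(x):
    # one scale/zero point per tensor, as TFLite full-integer quantization uses for activations
    scale = (x.max() - x.min()) / 255.0
    zero_point = round(-128 - x.min() / scale)
    q = np.clip(np.round(x / scale) + zero_point, -128, 127).astype(np.int8)
    return q, scale, zero_point

def dequantize(q, scale, zero_point):
    return (q.astype(np.float32) - zero_point) * scale

scores = np.random.uniform(0, 1, 1000).astype(np.float32)    # class confidences
boxes = np.random.uniform(0, 320, 1000).astype(np.float32)   # pixel-space box values

# fused: scores share the box-dominated ~1.25-per-step scale -> worst-case error around 0.6
q, s, zp = quantize_int8(np.concatenate([boxes, scores]))
print(np.abs(dequantize(q, s, zp)[1000:] - scores).max())

# separate: scores get their own ~1/255 scale -> worst-case error around 0.002
q, s, zp = quantize_int8(scores)
print(np.abs(dequantize(q, s, zp) - scores).max())
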
Showing 4 changed files with 111 additions and 34 deletions.
4 changes: 2 additions & 2 deletions plugins/tensorflow-lite/package-lock.json

Some generated files are not rendered by default.

2 changes: 1 addition & 1 deletion plugins/tensorflow-lite/package.json
@@ -58,5 +58,5 @@
"devDependencies": {
"@scrypted/sdk": "file:../../sdk"
},
"version": "0.1.72"
"version": "0.1.73"
}
70 changes: 39 additions & 31 deletions plugins/tensorflow-lite/src/tflite/__init__.py
@@ -19,19 +19,20 @@
pass
import asyncio
import concurrent.futures
import queue
import re
from typing import Any, Tuple

import scrypted_sdk
import tflite_runtime.interpreter as tflite
from . import yolo_separate_outputs
from scrypted_sdk.types import Setting, SettingValue

from common import yolo
from predict import PredictPlugin

availableModels = [
"Default",
"scrypted_yolov9s_relu_sep_320",
"scrypted_yolov9t_relu_320",
"scrypted_yolov9s_relu_320",
"ssd_mobilenet_v2_coco_quant_postprocess",
@@ -51,6 +52,7 @@
"efficientdet_lite3x_640_ptq",
]


def parse_label_contents(contents: str):
lines = contents.splitlines()
lines = [line for line in lines if line.strip()]
@@ -96,17 +98,12 @@ def configureModel():
nonlocal model

if defaultModel:
model = "scrypted_yolov9t_relu_320"
# if edge_tpus and next(
# (obj for obj in edge_tpus if obj["type"] == "usb"), None
# ):
# model = "ssdlite_mobiledet_coco_qat_postprocess"
# else:
# model = "efficientdet_lite0_320_ptq"
model = "scrypted_yolov9s_relu_sep_320"
self.yolo = "yolo" in model
self.yolov9 = "yolov9" in model
self.scrypted_model = "scrypted" in model
self.scrypted_yolov10 = "scrypted_yolov10" in model
self.scrypted_yolo_sep = "_sep" in model
self.modelName = model

print(f"model: {model}")
@@ -184,8 +181,7 @@ def executor_initializer():
thread_name = threading.current_thread().name
interpreter = available_interpreters.pop()
self.interpreters[thread_name] = interpreter
print('Interpreter initialized on thread {}'.format(thread_name))

print("Interpreter initialized on thread {}".format(thread_name))

self.executor = concurrent.futures.ThreadPoolExecutor(
initializer=executor_initializer,
@@ -247,29 +243,41 @@ def predict():
interpreter.set_tensor(tensor_index, im)
interpreter.invoke()
output_details = interpreter.get_output_details()
output = output_details[0]
x = interpreter.get_tensor(output["index"])
input_scale = self.get_input_details()[0]
if x.dtype == np.int8:
scale, zero_point = output["quantization"]
combined_scale = scale * input_scale
if self.scrypted_yolov10:
objs = yolo.parse_yolov10(
x[0],
scale=lambda v: (v - zero_point) * combined_scale,
confidence_scale=lambda v: (v - zero_point) * scale,
threshold_scale=lambda v: (v - zero_point) * scale,
)
else:
objs = yolo.parse_yolov9(
x[0],
scale=lambda v: (v - zero_point) * combined_scale,
confidence_scale=lambda v: (v - zero_point) * scale,
threshold_scale=lambda v: (v - zero_point) * scale,
)
if self.scrypted_yolo_sep:
outputs = []
for index, output in enumerate(output_details):
o = interpreter.get_tensor(output["index"]).astype(np.float32)
scale, zero_point = output["quantization"]
o -= zero_point
o *= scale
outputs.append(o)

output = yolo_separate_outputs.decode_bbox(outputs, [input.width, input.height])
objs = yolo.parse_yolov9(output[0])
else:
# this code path is unused.
objs = yolo.parse_yolov9(x[0], scale=lambda v: v * input_scale)
output = output_details[0]
x = interpreter.get_tensor(output["index"])
if x.dtype == np.int8:
scale, zero_point = output["quantization"]
combined_scale = scale * input_scale
if self.scrypted_yolov10:
objs = yolo.parse_yolov10(
x[0],
scale=lambda v: (v - zero_point) * combined_scale,
confidence_scale=lambda v: (v - zero_point) * scale,
threshold_scale=lambda v: (v - zero_point) * scale,
)
else:
objs = yolo.parse_yolov9(
x[0],
scale=lambda v: (v - zero_point) * combined_scale,
confidence_scale=lambda v: (v - zero_point) * scale,
threshold_scale=lambda v: (v - zero_point) * scale,
)
else:
# this code path is unused.
objs = yolo.parse_yolov9(x[0], scale=lambda v: v * input_scale)
else:
tflite_common.set_input(interpreter, input)
interpreter.invoke()
69 changes: 69 additions & 0 deletions plugins/tensorflow-lite/src/tflite/yolo_separate_outputs.py
@@ -0,0 +1,69 @@
import numpy as np

class DFL:
    # Distribution Focal Loss decoder: each box edge is predicted as a discrete
    # distribution over c1 bins; the softmax-weighted expectation of the bin
    # indices gives the edge distance.
    def __init__(self, c1=16):
        self.c1 = c1
        self.conv_weights = np.arange(c1).reshape(1, c1, 1, 1)

    def forward(self, x):
        b, _, a = x.shape  # batch, channels, anchors
        x = x.reshape(b, 4, self.c1, a).transpose(0, 2, 1, 3)
        x = softmax(x, axis=1)
        x = np.sum(self.conv_weights * x, axis=1)
        return x.reshape(b, 4, a)

def softmax(x, axis=-1):
    e_x = np.exp(x - np.max(x, axis=axis, keepdims=True))
    return e_x / np.sum(e_x, axis=axis, keepdims=True)

def make_anchors(feats, strides, grid_cell_offset=0.5):
    # build cell-center anchor points and matching per-anchor strides for each feature map
    anchor_points, stride_tensor = [], []
    assert feats is not None
    dtype = feats[0].dtype
    for i, stride in enumerate(strides):
        _, _, h, w = feats[i].shape
        sx = np.arange(w, dtype=dtype) + grid_cell_offset  # shift x
        sy = np.arange(h, dtype=dtype) + grid_cell_offset  # shift y
        sy, sx = np.meshgrid(sy, sx, indexing="ij")
        anchor_points.append(np.stack((sx, sy), axis=-1).reshape(-1, 2))
        stride_tensor.append(np.full((h * w, 1), stride, dtype=dtype))
    return np.concatenate(anchor_points), np.concatenate(stride_tensor)

def dist2bbox(distance, anchor_points, xywh=True, dim=-1):
    # convert (left, top, right, bottom) distances from each anchor point into boxes
    lt, rb = np.split(distance, 2, axis=dim)

    anchor_points = anchor_points.transpose(0, 2, 1)

    x1y1 = anchor_points - lt
    x2y2 = anchor_points + rb
    if xywh:
        c_xy = (x1y1 + x2y2) / 2
        wh = x2y2 - x1y1
        return np.concatenate((c_xy, wh), axis=dim)  # xywh bbox
    return np.concatenate((x1y1, x2y2), axis=dim)  # xyxy bbox

def decode_bbox(preds, img_shape):
    # preds holds one 64-channel box-distribution tensor and one class tensor per detection scale
    num_classes = next((o.shape[2] for o in preds if o.shape[2] != 64), -1)
    assert num_classes != -1, 'cannot infer postprocessor inputs via output shape if there are 64 classes'
    # order the outputs: box tensors first, each group sorted from largest feature map to smallest
    pos = [
        i for i, _ in sorted(enumerate(preds),
            key=lambda x: (x[1].shape[2] if num_classes > 64 else -x[1].shape[2], -x[1].shape[1]))]
    # concatenate across scales and transpose to (batch, channels, anchors)
    x = np.transpose(
        np.concatenate([
            np.concatenate([preds[i] for i in pos[:len(pos) // 2]], axis=1),
            np.concatenate([preds[i] for i in pos[len(pos) // 2:]], axis=1)], axis=2), (0, 2, 1))
    reg_max = (x.shape[1] - num_classes) // 4
    dfl = DFL(reg_max) if reg_max > 1 else lambda x: x
    img_h, img_w = img_shape[-2], img_shape[-1]
    # each scale's stride is inferred from its anchor count relative to the input size
    strides = [
        int(np.sqrt(img_shape[-2] * img_shape[-1] / preds[p].shape[1])) for p in pos if preds[p].shape[2] != 64]
    dims = [(img_h // s, img_w // s) for s in strides]
    fake_feats = [np.zeros((1, 1, h, w), dtype=preds[0].dtype) for h, w in dims]
    anchors, strides = [x.transpose(0, 1) for x in make_anchors(fake_feats, strides, 0.5)]  # generate anchors and strides

    strides_tensor = strides.transpose(1, 0)
    strides_tensor = np.expand_dims(strides_tensor, 0)

    # decode DFL distances to pixel-space xywh boxes and append sigmoid class scores
    dbox = dist2bbox(dfl.forward(x[:, :-num_classes, :]), anchors[None, ...], xywh=True, dim=1) * strides_tensor

    return np.concatenate((dbox, 1 / (1 + np.exp(-x[:, -num_classes:, :]))), axis=1)
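
For reference, a standalone sketch of calling decode_bbox with dummy dequantized outputs shaped like a 320x320 yolov9s head (three scales with strides 8/16/32, 64 DFL box channels and 80 class channels per scale); the shapes and the flat import are illustrative assumptions, not part of the plugin:

import numpy as np
from yolo_separate_outputs import decode_bbox  # assumes the module is on the path

num_classes = 80
anchors_per_scale = [40 * 40, 20 * 20, 10 * 10]  # strides 8, 16, 32 at 320x320

preds = []
for n in anchors_per_scale:
    preds.append(np.random.randn(1, n, 64).astype(np.float32))           # DFL box distributions (4 * reg_max)
    preds.append(np.random.randn(1, n, num_classes).astype(np.float32))  # class logits

out = decode_bbox(preds, [320, 320])
print(out.shape)  # (1, 84, 2100): xywh boxes in pixels plus sigmoid class scores, ready for yolo.parse_yolov9
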
