[RetinaNet] Image Converter and ObjectDetector (#1906)

* Rebased phase 1 changes * Rebased phase 1 changes * nit * Retina Phase 2 * nit * Expose Anchor Generator as layer, docstring correction and test correction * nit * Add missing args for prediction heads * - Use FeaturePyramidBackbone cls for RetinaNet backbone. - Correct test cases. * fix decoding error * - Add ground truth arg for RetinaNet model and remove source and target format from preprocessor * nit * Subclass Imageconverter and overload call method for object detection method * Revert "Subclass Imageconverter and overload call method for object detection method" This reverts commit 3b26d3a. * add names to layers * correct fpn coarser level as per torch retinanet model * nit * Polish Prediction head and fpn layers to include flags and norm layers * nit * nit * add prior probability flag for prediction head to use it for classification head and user friendly * compute_shape seems redudant here and correct layers for channels_first * keep compute_output_shape for fpn * nit * Change AnchorGen Implementation as per torch * correct the source format of anchors format * use plain rescaling and normalization no resizing for od models as it can effect the bounding boxes and the ops i backend framework dependent * use single bbox format for model * - Add arg for encoding format - Add required docstrings - Use `center_xywh` encoding for retinanet as per torch weights * make anchor generator optional * init as layers for anchor generator and label encoder and as one more arg for prediction head configuration * nit * - only consider levels from min level to backbone maxlevel fro feature extraction from image encoder * nit * nit * update resizing as per new keras3 resizing layer for bboxes * Revert "update resizing as per new keras3 resizing layer for bboxes" This reverts commit eb555ca. 
* Add TODO's for keras bounding box ops * Use keras layers to rescale and normalize * check with plain values * use convert_preprocessing_inputs function for basic operations as backend cause some gpu misplacement * use keras for init variables * modify task test for cases when test runs on gpu * modify the order of steps * fix tensor device placement error for torch backend * this should fix error while image size is give and not given cases * use numpy arrays * make `yxyx` as default bbox format and some nit * use image_size argument so that we dont break presets * Add retinanet_resnet50_fpn_coco preset * register retinanet presets
keras-team · Nov 11, 2024 · 5d97d1a · 5d97d1a
1 parent 5a7ecb6
commit 5d97d1a
Show file tree

Hide file tree

Showing 24 changed files with 1,504 additions and 230 deletions.
diff --git a/keras_hub/api/layers/__init__.py b/keras_hub/api/layers/__init__.py
@@ -51,6 +51,10 @@
 from keras_hub.src.models.resnet.resnet_image_converter import (
     ResNetImageConverter,
 )
+from keras_hub.src.models.retinanet.anchor_generator import AnchorGenerator
+from keras_hub.src.models.retinanet.retinanet_image_converter import (
+    RetinaNetImageConverter,
+)
 from keras_hub.src.models.sam.sam_image_converter import SAMImageConverter
 from keras_hub.src.models.sam.sam_mask_decoder import SAMMaskDecoder
 from keras_hub.src.models.sam.sam_prompt_encoder import SAMPromptEncoder

diff --git a/keras_hub/api/models/__init__.py b/keras_hub/api/models/__init__.py
@@ -185,6 +185,10 @@
 from keras_hub.src.models.image_classifier_preprocessor import (
     ImageClassifierPreprocessor,
 )
+from keras_hub.src.models.image_object_detector import ImageObjectDetector
+from keras_hub.src.models.image_object_detector_preprocessor import (
+    ImageObjectDetectorPreprocessor,
+)
 from keras_hub.src.models.image_segmenter import ImageSegmenter
 from keras_hub.src.models.image_segmenter_preprocessor import (
     ImageSegmenterPreprocessor,
@@ -252,6 +256,13 @@
 from keras_hub.src.models.resnet.resnet_image_classifier_preprocessor import (
     ResNetImageClassifierPreprocessor,
 )
+from keras_hub.src.models.retinanet.retinanet_backbone import RetinaNetBackbone
+from keras_hub.src.models.retinanet.retinanet_object_detector import (
+    RetinaNetObjectDetector,
+)
+from keras_hub.src.models.retinanet.retinanet_object_detector_preprocessor import (
+    RetinaNetObjectDetectorPreprocessor,
+)
 from keras_hub.src.models.roberta.roberta_backbone import RobertaBackbone
 from keras_hub.src.models.roberta.roberta_masked_lm import RobertaMaskedLM
 from keras_hub.src.models.roberta.roberta_masked_lm_preprocessor import (

diff --git a/keras_hub/src/bounding_box/__init__.py b/keras_hub/src/bounding_box/__init__.py
@@ -0,0 +1,2 @@
+# TODO: Once all bounding boxes are moved to keras repository remove the
+# bounding box folder.
diff --git a/keras_hub/src/bounding_box/converters.py b/keras_hub/src/bounding_box/converters.py
@@ -20,29 +20,74 @@ class RequiresImagesException(Exception):
 ALL_AXES = 4
 
 
-def _encode_box_to_deltas(
+def encode_box_to_deltas(
     anchors,
     boxes,
-    anchor_format: str,
-    box_format: str,
+    anchor_format,
+    box_format,
+    encoding_format="center_yxhw",
     variance=None,
     image_shape=None,
 ):
-    """Converts bounding_boxes from `center_yxhw` to delta format."""
+    """Encodes bounding boxes relative to anchors as deltas.
+
+    This function calculates the deltas that represent the difference between
+    bounding boxes and provided anchors. Deltas encode the offsets and scaling
+    factors to apply to anchors to obtain the target boxes.
+
+    Boxes and anchors are first converted to the specified `encoding_format`
+    (defaulting to `center_yxhw`) for consistent delta representation.
+
+    Args:
+        anchors: `Tensors`. Anchor boxes with shape of `(N, 4)` where N is the
+            number of anchors.
+        boxes: `Tensors`. Bounding boxes to encode. Boxes can be of shape
+            `(B, N, 4)` or `(N, 4)`.
+        anchor_format: str. The format of the input `anchors`
+            (e.g., "xyxy", "xywh", etc.).
+        box_format: str. The format of the input `boxes`
+            (e.g., "xyxy", "xywh", etc.).
+        encoding_format: str. The intermediate format to which boxes and anchors
+            are converted before delta calculation. Defaults to "center_yxhw".
+        variance: `List[float]`. A 4-element array/tensor representing variance
+            factors to scale the box deltas. If provided, the calculated deltas
+            are divided by the variance. Defaults to None.
+        image_shape: `Tuple[int]`. The shape of the image (height, width, 3).
+            When using relative bounding box format for `box_format` the
+            `image_shape` is used for normalization.
+    Returns:
+        Encoded box deltas. The return type matches the `encoding_format`.
+
+    Raises:
+        ValueError: If `variance` is not None and its length is not 4.
+        ValueError: If `encoding_format` is not `"center_xywh"` or
+            `"center_yxhw"`.
+
+    """
     if variance is not None:
         variance = ops.convert_to_tensor(variance, "float32")
         var_len = variance.shape[-1]
 
         if var_len != 4:
             raise ValueError(f"`variance` must be length 4, got {variance}")
+
+    if encoding_format not in ["center_xywh", "center_yxhw"]:
+        raise ValueError(
+            "`encoding_format` should be one of 'center_xywh' or 'center_yxhw', "
+            f"got {encoding_format}"
+        )
+
     encoded_anchors = convert_format(
         anchors,
         source=anchor_format,
-        target="center_yxhw",
+        target=encoding_format,
         image_shape=image_shape,
     )
     boxes = convert_format(
-        boxes, source=box_format, target="center_yxhw", image_shape=image_shape
+        boxes,
+        source=box_format,
+        target=encoding_format,
+        image_shape=image_shape,
     )
     anchor_dimensions = ops.maximum(
         encoded_anchors[..., 2:], keras.backend.epsilon()
@@ -61,27 +106,72 @@ def _encode_box_to_deltas(
     return boxes_delta
 
 
-def _decode_deltas_to_boxes(
+def decode_deltas_to_boxes(
     anchors,
     boxes_delta,
-    anchor_format: str,
-    box_format: str,
+    anchor_format,
+    box_format,
+    encoded_format="center_yxhw",
     variance=None,
     image_shape=None,
 ):
-    """Converts bounding_boxes from delta format to `center_yxhw`."""
+    """Converts bounding boxes from delta format to the specified `box_format`.
+
+    This function decodes bounding box deltas relative to anchors to obtain the
+    final bounding box coordinates. The boxes are encoded in a specific
+    `encoded_format` (center_yxhw by default) during the decoding process.
+    This allows flexibility in how the deltas are applied to the anchors.
+
+    Args:
+        anchors: Can be `Tensors` or `Dict[Tensors]` where keys are level
+            indices and values are corresponding anchor boxes.
+            The shape of the array/tensor should be `(N, 4)` where N is the
+            number of anchors.
+        boxes_delta: Can be `Tensors` or `Dict[Tensors]`. Bounding box deltas
+            must have the same type and structure as `anchors`.  The
+            shape of the array/tensor can be `(N, 4)` or `(B, N, 4)` where N is
+            the number of boxes.
+        anchor_format: str. The format of the input `anchors`.
+            (e.g., `"xyxy"`, `"xywh"`, etc.)
+        box_format: str. The desired format for the output boxes.
+            (e.g., `"xyxy"`, `"xywh"`, etc.)
+        encoded_format: str. Raw output format from regression head. Defaults
+            to `"center_yxhw"`.
+        variance: `List[floats]`. A 4-element array/tensor representing
+            variance factors to scale the box deltas. If provided, the deltas
+            are multiplied by the variance before being applied to the anchors.
+            Defaults to None.
+        image_shape:  The shape of the image (height, width).  This is needed
+            if normalization to image size is required when converting between
+            formats. Defaults to None.
+
+    Returns:
+        Decoded box coordinates. The return type matches the `box_format`.
+
+    Raises:
+        ValueError: If `variance` is not None and its length is not 4.
+        ValueError: If `encoded_format` is not `"center_xywh"` or
+            `"center_yxhw"`.
+
+    """
     if variance is not None:
         variance = ops.convert_to_tensor(variance, "float32")
         var_len = variance.shape[-1]
 
         if var_len != 4:
             raise ValueError(f"`variance` must be length 4, got {variance}")
 
+    if encoded_format not in ["center_xywh", "center_yxhw"]:
+        raise ValueError(
+            f"`encoded_format` should be 'center_xywh' or 'center_yxhw', "
+            f"but got '{encoded_format}'."
+        )
+
     def decode_single_level(anchor, box_delta):
         encoded_anchor = convert_format(
             anchor,
             source=anchor_format,
-            target="center_yxhw",
+            target=encoded_format,
             image_shape=image_shape,
         )
         if variance is not None:
@@ -97,7 +187,7 @@ def decode_single_level(anchor, box_delta):
         )
         box = convert_format(
             box,
-            source="center_yxhw",
+            source=encoded_format,
             target=box_format,
             image_shape=image_shape,
         )

diff --git a/keras_hub/src/layers/preprocessing/image_converter.py b/keras_hub/src/layers/preprocessing/image_converter.py
@@ -164,6 +164,11 @@ def _expand_non_channel_dims(self, value, inputs):
         # If inputs are not a tensor type, return a numpy array.
         # This might happen when running under tf.data.
         if ops.is_tensor(inputs):
+            # In the torch backend, the preprocessing decorator moves tensors
+            # to the CPU, processes them there, and converts them back to the
+            # appropriate device (potentially GPU) after preprocessing.
+            if keras.backend.backend() == "torch" and self.image_size is None:
+                return ops.expand_dims(value, broadcast_dims).cpu()
             return ops.expand_dims(value, broadcast_dims)
         else:
             return np.expand_dims(value, broadcast_dims)

diff --git a/keras_hub/src/models/image_object_detector.py b/keras_hub/src/models/image_object_detector.py
@@ -0,0 +1,87 @@
+import keras
+
+from keras_hub.src.api_export import keras_hub_export
+from keras_hub.src.models.task import Task
+
+
+@keras_hub_export("keras_hub.models.ImageObjectDetector")
+class ImageObjectDetector(Task):
+    """Base class for all image object detection tasks.
+
+    The `ImageObjectDetector` tasks wrap a `keras_hub.models.Backbone` and
+    a `keras_hub.models.Preprocessor` to create a model that can be used for
+    object detection. `ImageObjectDetector` tasks take an additional
+    `num_classes` argument, controlling the number of predicted output classes.
+
+    To fine-tune with `fit()`, pass a dataset containing tuples of `(x, y)`
+    labels where `x` is an image and `y` is a dictionary with `boxes` and
+    `classes`.
+
+    All `ImageObjectDetector` tasks include a `from_preset()` constructor which
+    can be used to load a pre-trained config and weights.
+    """
+
+    def compile(
+        self,
+        optimizer="auto",
+        box_loss="auto",
+        classification_loss="auto",
+        metrics=None,
+        **kwargs,
+    ):
+        """Configures the `ImageObjectDetector` task for training.
+
+        The `ImageObjectDetector` task extends the default compilation signature of
+        `keras.Model.compile` with defaults for `optimizer`, `loss`, and
+        `metrics`. To override these defaults, pass any value
+        to these arguments during compilation.
+
+        Args:
+            optimizer: `"auto"`, an optimizer name, or a `keras.Optimizer`
+                instance. Defaults to `"auto"`, which uses the default optimizer
+                for the given model and task. See `keras.Model.compile` and
+                `keras.optimizers` for more info on possible `optimizer` values.
+            box_loss: `"auto"`, a loss name, or a `keras.losses.Loss` instance.
+                Defaults to `"auto"`, where a
+                `keras.losses.Huber` loss will be
+                applied for the object detector task. See
+                `keras.Model.compile` and `keras.losses` for more info on
+                possible `loss` values.
+            classification_loss: `"auto"`, a loss name, or a `keras.losses.Loss`
+                instance. Defaults to `"auto"`, where a
+                `keras.losses.BinaryFocalCrossentropy` loss will be
+                applied for the object detector task. See
+                `keras.Model.compile` and `keras.losses` for more info on
+                possible `loss` values.
+            metrics: a list of metrics to be evaluated by
+                the model during training and testing. Defaults to `None`.
+                See `keras.Model.compile` and `keras.metrics` for
+                more info on possible `metrics` values.
+            **kwargs: See `keras.Model.compile` for a full list of arguments
+                supported by the compile method.
+        """
+        if optimizer == "auto":
+            optimizer = keras.optimizers.Adam(5e-5)
+        if box_loss == "auto":
+            box_loss = keras.losses.Huber(reduction="sum")
+        if classification_loss == "auto":
+            activation = getattr(self, "activation", None)
+            activation = keras.activations.get(activation)
+            from_logits = activation != keras.activations.sigmoid
+            classification_loss = keras.losses.BinaryFocalCrossentropy(
+                from_logits=from_logits, reduction="sum"
+            )
+        if metrics is not None:
+            raise ValueError("User metrics not yet supported")
+
+        losses = {
+            "bbox_regression": box_loss,
+            "cls_logits": classification_loss,
+        }
+
+        super().compile(
+            optimizer=optimizer,
+            loss=losses,
+            metrics=metrics,
+            **kwargs,
+        )
diff --git a/keras_hub/src/models/image_object_detector_preprocessor.py b/keras_hub/src/models/image_object_detector_preprocessor.py
@@ -0,0 +1,57 @@
+import keras
+
+from keras_hub.src.api_export import keras_hub_export
+from keras_hub.src.models.preprocessor import Preprocessor
+from keras_hub.src.utils.tensor_utils import preprocessing_function
+
+
+@keras_hub_export("keras_hub.models.ImageObjectDetectorPreprocessor")
+class ImageObjectDetectorPreprocessor(Preprocessor):
+    """Base class for object detector preprocessing layers.
+
+    `ImageObjectDetectorPreprocessor` wraps a
+    `keras_hub.layers.Preprocessor` to create a preprocessing layer for
+    object detection tasks. It is intended to be paired with a
+    `keras_hub.models.ImageObjectDetector` task.
+
+    All `ImageObjectDetectorPreprocessor` take three inputs, `x`, `y`, and
+    `sample_weight`. `x`, the first input, should always be included. It can
+    be an image or batch of images. See examples below. `y` and `sample_weight`
+    are optional inputs that will be passed through unaltered. Usually, `y` will
+    be a dict of `{"boxes": Tensor(batch_size, num_boxes, 4),
+    "classes": (batch_size, num_boxes)}`.
+
+    The layer will return either `x`, an `(x, y)` tuple if labels were provided,
+    or an `(x, y, sample_weight)` tuple if labels and sample weight were
+    provided. `x` will be the input images after all model preprocessing has
+    been applied.
+
+    All `ImageObjectDetectorPreprocessor` tasks include a `from_preset()`
+    constructor which can be used to load a pre-trained config and vocabularies.
+    You can call the `from_preset()` constructor directly on this base class, in
+    which case the correct class for your model will be automatically
+    instantiated.
+
+    Args:
+        image_converter: Preprocessing pipeline for images.
+
+    Examples.
+    ```python
+    preprocessor = keras_hub.models.ImageObjectDetectorPreprocessor.from_preset(
+        "retinanet_resnet50",
+    )
+    """
+
+    def __init__(
+        self,
+        image_converter=None,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.image_converter = image_converter
+
+    @preprocessing_function
+    def call(self, x, y=None, sample_weight=None):
+        if self.image_converter:
+            x = self.image_converter(x)
+        return keras.utils.pack_x_y_sample_weight(x, y, sample_weight)
diff --git a/keras_hub/src/models/retinanet/__init__.py b/keras_hub/src/models/retinanet/__init__.py
@@ -0,0 +1,5 @@
+from keras_hub.src.models.retinanet.retinanet_backbone import RetinaNetBackbone
+from keras_hub.src.models.retinanet.retinanet_presets import backbone_presets
+from keras_hub.src.utils.preset_utils import register_presets
+
+register_presets(backbone_presets, RetinaNetBackbone)
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		# TODO: Once all bounding boxes are moved to keras repository remove the
		# bounding box folder.