Deci-AI · shanibenbaruch · Aug 31, 2023 · Aug 30, 2023 · Aug 30, 2023 · Aug 30, 2023
diff --git a/src/data_gradients/feature_extractors/object_detection/bounding_boxes_area.py b/src/data_gradients/feature_extractors/object_detection/bounding_boxes_area.py
@@ -1,3 +1,6 @@
+import math
+
+import numpy as np
 import pandas as pd
 
 from data_gradients.common.registry.registry import register_feature_extractor
@@ -25,6 +28,10 @@ def __init__(self, topk: int = 30, prioritization_mode: str = "train_val_diff"):
         self.value_extractor = MostImportantValuesSelector(topk=topk, prioritization_mode=prioritization_mode)
         self.data = []
 
+        self.hist_transform_name = 'sqrt'
+        transforms = {'sqrt': lambda bbox_area: int(math.sqrt(bbox_area))}
+        self.hist_transform = transforms[self.hist_transform_name]
+
     def update(self, sample: DetectionSample):
         image_area = sample.image.shape[0] * sample.image.shape[1]
         for class_id, bbox_xyxy in zip(sample.class_ids, sample.bboxes_xyxy):
@@ -36,12 +43,15 @@ def update(self, sample: DetectionSample):
                     "class_id": class_id,
                     "class_name": class_name,
                     "relative_bbox_area": 100 * (bbox_area / image_area),
+                    f"bbox_area_{self.hist_transform_name}": self.hist_transform(bbox_area),
                 }
             )
 
     def aggregate(self) -> Feature:
         df = pd.DataFrame(self.data)
 
+        dict_bincount = self._compute_histogram(df=df, transform_name=self.hist_transform_name)
+
         df = self.value_extractor.select(df=df, id_col="class_id", split_col="split", value_col="relative_bbox_area")
 
         # Height of the plot is proportional to the number of classes
@@ -66,7 +76,10 @@ def aggregate(self) -> Feature:
             tight_layout=True,
         )
 
-        json = {split: dict(df[df["split"] == split]["relative_bbox_area"].describe()) for split in df["split"].unique()}
+        json = {}
+        for split in df["split"].unique():
+            basic_stats = dict(df[df["split"] == split]["relative_bbox_area"].describe())
+            json[split] = {**basic_stats, "histogram_per_class": dict_bincount[split]}
 
         feature = Feature(
             data=df,
@@ -75,6 +88,56 @@ def aggregate(self) -> Feature:
         )
         return feature
 
+    @staticmethod
+    def _compute_histogram(df: pd.DataFrame, transform_name: str, min_bin_val: int = 1) -> dict:
+        """
+        Compute, for every split, a unit-width histogram of transformed bounding box areas per class.
+
+        Bins cover the integer values ``min_bin_val`` .. ``max_value - 1``; rows whose transformed
+        area is below ``min_bin_val`` are not counted in any bin.
+
+        :param df: DataFrame with the columns 'split', 'class_name' and f'bbox_area_{transform_name}'
+            (non-negative integers, as required by ``np.bincount``).
+        :param transform_name: Name of the transformation applied to the areas (like 'sqrt').
+        :param min_bin_val: Value of the first histogram bin.
+        :return: A dictionary mapping each split to its histogram information.
+            Example:
+            {
+                'train': {
+                    'transform': 'sqrt',
+                    'bin_width': width between histogram bins (always 1),
+                    'min_value': value of the first bin (min_bin_val),
+                    'max_value': exclusive upper bound, i.e. largest value over all splits + 1,
+                    'histograms': a dictionary of class name and its matching list of bin counts
+                }
+                ...
+            }
+        :raises ValueError: If the largest transformed area is smaller than ``min_bin_val``.
+        """
+        area_col = f'bbox_area_{transform_name}'
+        # Shared exclusive upper bound so that every class/split histogram has the same length.
+        max_bin_val = int(df[area_col].max() + 1)
+
+        # Explicit raise instead of `assert`, which is stripped when Python runs with -O.
+        if max_bin_val <= min_bin_val:
+            raise ValueError("Maximum bin value must be greater than the minimum bin value for computing the histogram.")
+
+        dict_bincount = {}
+        for split in df['split'].unique():
+            split_data = df[df['split'] == split]
+            histograms = {}
+            for class_label in split_data['class_name'].unique():
+                class_areas = split_data.loc[split_data['class_name'] == class_label, area_col]
+                # minlength pads with zeros up to max_bin_val; values below min_bin_val are sliced off.
+                histograms[class_label] = np.bincount(class_areas, minlength=max_bin_val)[min_bin_val:].tolist()
+
+            dict_bincount[split] = {
+                'transform': transform_name, 'bin_width': 1,
+                'min_value': min_bin_val, 'max_value': max_bin_val,
+                'histograms': histograms,
+            }
+        return dict_bincount
+
     @property
     def title(self) -> str:
         return "Distribution of Bounding Box Area"
@@ -87,3 +150,4 @@ def description(self) -> str:
             "Another thing to keep in mind is that having too many very small objects may indicate that your are downsizing your original image to a "
             "low resolution that is not appropriate for your objects."
         )
+
diff --git a/tests/deci_core_unit_test_suite_runner.py b/tests/deci_core_unit_test_suite_runner.py
@@ -2,6 +2,7 @@
 import unittest
 
 from tests.unit_tests.average_brightness_test import AverageBrightnessTest
+from tests.unit_tests.feature_extractors.detection.test_bounding_boxes_area import TestComputeHistogram
 
 
 class CoreUnitTestSuiteRunner:
@@ -19,6 +20,7 @@ def _add_modules_to_unit_tests_suite(self):
             :return:
         """
         self.unit_tests_suite.addTest(self.test_loader.loadTestsFromModule(AverageBrightnessTest))
+        self.unit_tests_suite.addTest(self.test_loader.loadTestsFromModule(TestComputeHistogram))
 
 
 if __name__ == "__main__":

diff --git a/tests/unit_tests/feature_extractors/detection/test_bounding_boxes_area.py b/tests/unit_tests/feature_extractors/detection/test_bounding_boxes_area.py
@@ -0,0 +1,108 @@
+import unittest
+
+import pandas as pd
+
+from data_gradients.feature_extractors.object_detection.bounding_boxes_area import DetectionBoundingBoxArea
+
+
+class TestComputeHistogram(unittest.TestCase):
+    """Checks the per-split, per-class histograms built by `DetectionBoundingBoxArea._compute_histogram`."""
+
+    @staticmethod
+    def _make_df(areas, splits, class_names):
+        """Assemble the three-column DataFrame passed to `_compute_histogram` with transform 'sqrt'."""
+        return pd.DataFrame({'bbox_area_sqrt': areas, 'split': splits, 'class_name': class_names})
+
+    @staticmethod
+    def _split_info(min_value, max_value, histograms):
+        """Build the expected entry of a single split from its bin range and class histograms."""
+        return {
+            'transform': 'sqrt',
+            'bin_width': 1,
+            'min_value': min_value,
+            'max_value': max_value,
+            'histograms': histograms,
+        }
+
+    def test_compute_histogram(self):
+        """Two splits with several classes each are reported over the same bin range."""
+        frame = self._make_df(
+            areas=[1, 2, 3, 3, 3, 2, 3],
+            splits=['train', 'train', 'train', 'train', 'val', 'val', 'val'],
+            class_names=['A', 'B', 'A', 'A', 'B', 'A', 'C'],
+        )
+
+        actual = DetectionBoundingBoxArea._compute_histogram(frame, transform_name='sqrt', min_bin_val=1)
+
+        expected = {
+            'train': self._split_info(
+                min_value=1,
+                max_value=4,
+                histograms={'A': [1, 0, 2], 'B': [0, 1, 0]},
+            ),
+            'val': self._split_info(
+                min_value=1,
+                max_value=4,
+                histograms={'A': [0, 1, 0], 'B': [0, 0, 1], 'C': [0, 0, 1]},
+            ),
+        }
+        self.assertEqual(actual, expected)
+
+    def test_single_data_point(self):
+        """A single row produces a one-bin histogram."""
+        frame = self._make_df(areas=[1], splits=['train'], class_names=['A'])
+
+        actual = DetectionBoundingBoxArea._compute_histogram(frame, transform_name='sqrt', min_bin_val=1)
+
+        expected = {
+            'train': self._split_info(
+                min_value=1,
+                max_value=2,
+                histograms={'A': [1]},
+            ),
+        }
+        self.assertEqual(actual, expected)
+
+    def test_minimum_maximum_values(self):
+        """Widely separated values produce a long histogram that is empty in between."""
+        frame = self._make_df(
+            areas=[1, 100],
+            splits=['val', 'val'],
+            class_names=['A', 'A'],
+        )
+
+        actual = DetectionBoundingBoxArea._compute_histogram(frame, transform_name='sqrt', min_bin_val=1)
+
+        # First and last bins hold one box each; the 98 bins in between are empty.
+        sparse_histogram = [1] + [0] * 98 + [1]
+        expected = {
+            'val': self._split_info(
+                min_value=1,
+                max_value=101,
+                histograms={'A': sparse_histogram},
+            ),
+        }
+        self.assertEqual(actual, expected)
+
+    def test_min_bin_val(self):
+        """With bins starting at 2, three boxes of size 3 all land in the second bin."""
+        frame = self._make_df(
+            areas=[3, 3, 3],
+            splits=['val', 'val', 'val'],
+            class_names=['A', 'A', 'A'],
+        )
+
+        actual = DetectionBoundingBoxArea._compute_histogram(frame, transform_name='sqrt', min_bin_val=2)
+
+        expected = {
+            'val': self._split_info(
+                min_value=2,
+                max_value=4,
+                histograms={'A': [0, 3]},
+            ),
+        }
+        self.assertEqual(actual, expected)
+
+
+if __name__ == '__main__':  # only when this test file is executed directly as a script
+    unittest.main()  # discover and run the TestCase classes defined in this module