Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/alg 1639 add proxy information #180

Merged
merged 7 commits into from
Aug 31, 2023
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
import math

import numpy as np
import pandas as pd

from data_gradients.common.registry.registry import register_feature_extractor
Expand Down Expand Up @@ -25,6 +28,10 @@ def __init__(self, topk: int = 30, prioritization_mode: str = "train_val_diff"):
self.value_extractor = MostImportantValuesSelector(topk=topk, prioritization_mode=prioritization_mode)
self.data = []

self.hist_transform_name = 'sqrt'
transforms = {'sqrt': lambda bbox_area: int(math.sqrt(bbox_area))}
self.hist_transform = transforms[self.hist_transform_name]

def update(self, sample: DetectionSample):
image_area = sample.image.shape[0] * sample.image.shape[1]
for class_id, bbox_xyxy in zip(sample.class_ids, sample.bboxes_xyxy):
Expand All @@ -36,12 +43,15 @@ def update(self, sample: DetectionSample):
"class_id": class_id,
"class_name": class_name,
"relative_bbox_area": 100 * (bbox_area / image_area),
f"bbox_area_{self.hist_transform_name}": self.hist_transform(bbox_area),
}
)

def aggregate(self) -> Feature:
df = pd.DataFrame(self.data)

dict_bincount = self._compute_histogram(df=df, transform_name=self.hist_transform_name)

df = self.value_extractor.select(df=df, id_col="class_id", split_col="split", value_col="relative_bbox_area")

# Height of the plot is proportional to the number of classes
Expand All @@ -66,7 +76,10 @@ def aggregate(self) -> Feature:
tight_layout=True,
)

json = {split: dict(df[df["split"] == split]["relative_bbox_area"].describe()) for split in df["split"].unique()}
json = {}
for split in df["split"].unique():
basic_stats = dict(df[df["split"] == split]["relative_bbox_area"].describe())
json[split] = {**basic_stats, "histogram_per_class": dict_bincount[split]}

feature = Feature(
data=df,
Expand All @@ -75,6 +88,56 @@ def aggregate(self) -> Feature:
)
return feature

@staticmethod
def _compute_histogram(df: pd.DataFrame, transform_name: str, min_bin_val: int = 1) -> dict:
    """
    Compute, for every split, a per-class histogram of the transformed bounding box sizes.

    Sizes are read from the ``bbox_area_{transform_name}`` column and counted with ``np.bincount``,
    i.e. one bin per integer value. Sizes smaller than ``min_bin_val`` are not reported.

    :param df: DataFrame with the columns ``split``, ``class_name`` and ``bbox_area_{transform_name}``.
               The size column must hold values accepted by ``np.bincount`` (non-negative integers).
    :param transform_name: Name of the transformation applied to the areas (like 'sqrt'); selects the size column.
    :param min_bin_val: Smallest size value reported in the histograms.
    :return: A dictionary mapping each split to its histogram information.
        Example:
        {
            'train': {
                'transform': 'sqrt',
                'bin_width': 1 (histogram bins always have unit width),
                'min_value': size value counted by the first bin of every histogram (``min_bin_val``),
                'max_value': largest size value found in ``df`` plus one (exclusive upper bound of the bins),
                'histograms': a dictionary of class name and its matching histogram, where entry ``i``
                              counts the boxes whose size equals ``min_value + i``
            }
            ...
        }
    :raises AssertionError: If the largest size value plus one is not greater than ``min_bin_val``.
    """
    size_column = f"bbox_area_{transform_name}"

    # Exclusive upper bound, computed over the whole DataFrame so that every split and class shares the same bins.
    max_bin_val = int(df[size_column].max() + 1)

    # Raised explicitly (instead of through an `assert` statement) so that the check is still performed when
    # Python runs with `-O`. The exception type and message are the same as before.
    if max_bin_val <= min_bin_val:
        raise AssertionError("Maximum bin value must be greater than the minimum bin value for computing the histogram.")

    dict_bincount = {}
    for split in df["split"].unique():
        split_data = df[df["split"] == split]

        histograms = {}
        for class_label in split_data["class_name"].unique():
            class_sizes = split_data.loc[split_data["class_name"] == class_label, size_column]

            # `minlength` pads the counts up to the shared upper bound; the slice drops the bins below `min_bin_val`.
            bin_counts = np.bincount(class_sizes, minlength=max_bin_val)
            histograms[class_label] = bin_counts[min_bin_val:].tolist()

        dict_bincount[split] = {
            "transform": transform_name,
            "bin_width": 1,
            "min_value": min_bin_val,
            "max_value": max_bin_val,
            "histograms": histograms,
        }

    return dict_bincount

@property
def title(self) -> str:
    """Title string of this feature."""
    feature_title = "Distribution of Bounding Box Area"
    return feature_title
Expand All @@ -87,3 +150,4 @@ def description(self) -> str:
"Another thing to keep in mind is that having too many very small objects may indicate that your are downsizing your original image to a "
"low resolution that is not appropriate for your objects."
)

2 changes: 2 additions & 0 deletions tests/deci_core_unit_test_suite_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import unittest

from tests.unit_tests.average_brightness_test import AverageBrightnessTest
from tests.unit_tests.feature_extractors.detection.test_bounding_boxes_area import TestComputeHistogram


class CoreUnitTestSuiteRunner:
Expand All @@ -19,6 +20,7 @@ def _add_modules_to_unit_tests_suite(self):
:return:
"""
self.unit_tests_suite.addTest(self.test_loader.loadTestsFromModule(AverageBrightnessTest))
self.unit_tests_suite.addTest(self.test_loader.loadTestsFromModule(TestComputeHistogram))


if __name__ == "__main__":
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
import unittest

import pandas as pd

from data_gradients.feature_extractors.object_detection.bounding_boxes_area import DetectionBoundingBoxArea


class TestComputeHistogram(unittest.TestCase):
    """Unit tests for ``DetectionBoundingBoxArea._compute_histogram``."""

    @staticmethod
    def _make_df(sizes, splits, class_names):
        """Build a DataFrame with the three columns read by ``_compute_histogram``."""
        return pd.DataFrame({"bbox_area_sqrt": sizes, "split": splits, "class_name": class_names})

    @staticmethod
    def _split_entry(min_value, max_value, histograms):
        """Build the expected per-split entry for the 'sqrt' transform with unit-width bins."""
        return {
            "transform": "sqrt",
            "bin_width": 1,
            "min_value": min_value,
            "max_value": max_value,
            "histograms": histograms,
        }

    def test_compute_histogram(self):
        # Two splits, with classes that are partly shared between them.
        test_df = self._make_df(
            sizes=[1, 2, 3, 3, 3, 2, 3],
            splits=["train", "train", "train", "train", "val", "val", "val"],
            class_names=["A", "B", "A", "A", "B", "A", "C"],
        )

        result = DetectionBoundingBoxArea._compute_histogram(test_df, transform_name="sqrt", min_bin_val=1)

        expected_result = {
            "train": self._split_entry(1, 4, {"A": [1, 0, 2], "B": [0, 1, 0]}),
            "val": self._split_entry(1, 4, {"A": [0, 1, 0], "B": [0, 0, 1], "C": [0, 0, 1]}),
        }
        self.assertEqual(result, expected_result)

    def test_single_data_point(self):
        # A single box yields a single-bin histogram.
        test_df = self._make_df(sizes=[1], splits=["train"], class_names=["A"])

        result = DetectionBoundingBoxArea._compute_histogram(test_df, transform_name="sqrt", min_bin_val=1)

        expected_result = {"train": self._split_entry(1, 2, {"A": [1]})}
        self.assertEqual(result, expected_result)

    def test_minimum_maximum_values(self):
        # Values at both ends of the range: only the first and the last bins are populated.
        test_df = self._make_df(sizes=[1, 100], splits=["val", "val"], class_names=["A", "A"])

        result = DetectionBoundingBoxArea._compute_histogram(test_df, transform_name="sqrt", min_bin_val=1)

        sparse_histogram = [1] + [0] * 98 + [1]
        expected_result = {"val": self._split_entry(1, 101, {"A": sparse_histogram})}
        self.assertEqual(result, expected_result)

    def test_min_bin_val(self):
        # With min_bin_val=2 the histogram starts at the bin of value 2.
        test_df = self._make_df(sizes=[3, 3, 3], splits=["val", "val", "val"], class_names=["A", "A", "A"])

        result = DetectionBoundingBoxArea._compute_histogram(test_df, transform_name="sqrt", min_bin_val=2)

        expected_result = {"val": self._split_entry(2, 4, {"A": [0, 3]})}
        self.assertEqual(result, expected_result)


# Run the tests of this module when the file is executed directly.
if __name__ == "__main__":
    unittest.main()