Skip to content

Commit

Permalink
Feature/sg 890 add filter option for multiclass (#129)
Browse files Browse the repository at this point in the history
* wip

* proposal

* wip

* wip

* first draft

* rename

* remove unwanted change

* add doc

* improve doc

* cleanup

* cleanup

* add min_max

* Rename "image_count" to "num_samples" and "annotation_count" to "num_annotations" for better clarity (#133)

Plus save them as ints in json not strings

* Drop-in replacement of appdirs to platformdirs (#134)

Co-authored-by: Ofri Masad <[email protected]>

* fix visualization of normalized images (#137)

* generalize

* add eps

* fix case when dataset to filter is too small

* generalize for many splits

* fix

---------

Co-authored-by: Eugene Khvedchenya <[email protected]>
Co-authored-by: Ofri Masad <[email protected]>
  • Loading branch information
3 people authored Jul 10, 2023
1 parent ef811da commit d5de9dc
Show file tree
Hide file tree
Showing 9 changed files with 176 additions and 65 deletions.
12 changes: 9 additions & 3 deletions src/data_gradients/config/detection.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,17 @@ report_sections:
n_rows: 6
n_cols: 2
heatmap_shape: [200, 200]
- DetectionBoundingBoxArea
- DetectionBoundingBoxArea:
topk: 30
prioritization_mode: train_val_diff
- DetectionBoundingBoxPerImageCount
- DetectionBoundingBoxSize
- DetectionClassFrequency
- DetectionClassesPerImageCount
- DetectionClassFrequency:
topk: 30
prioritization_mode: train_val_diff
- DetectionClassesPerImageCount:
topk: 30
prioritization_mode: train_val_diff
- DetectionBoundingBoxIoU:
num_bins: 10
class_agnostic: true
12 changes: 9 additions & 3 deletions src/data_gradients/config/segmentation.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,16 @@ report_sections:
n_rows: 6
n_cols: 2
heatmap_shape: [200, 200]
- SegmentationClassFrequency
- SegmentationClassesPerImageCount
- SegmentationClassFrequency:
topk: 30
prioritization_mode: train_val_diff
- SegmentationClassesPerImageCount:
topk: 30
prioritization_mode: train_val_diff
- SegmentationComponentsPerImageCount
- SegmentationBoundingBoxResolution
- SegmentationBoundingBoxArea
- SegmentationBoundingBoxArea:
topk: 30
prioritization_mode: train_val_diff
- SegmentationComponentsConvexity
- SegmentationComponentsErosion
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,24 @@
from data_gradients.utils.data_classes import DetectionSample
from data_gradients.visualize.seaborn_renderer import ViolinPlotOptions
from data_gradients.feature_extractors.abstract_feature_extractor import AbstractFeatureExtractor
from data_gradients.feature_extractors.utils import MostImportantValuesSelector


@register_feature_extractor()
class DetectionBoundingBoxArea(AbstractFeatureExtractor):
"""Feature Extractor to compute the area covered Bounding Boxes."""

def __init__(self):
def __init__(self, topk: int = 30, prioritization_mode: str = "train_val_diff"):
    """Configure which classes this extractor will report on.

    :param topk:                How many rows (per split) to show.
    :param prioritization_mode: Strategy used to choose which classes are prioritized; only the topk are shown.
        - 'train_val_diff': top k rows with the biggest difference between 'train' and 'val' split values.
        - 'outliers':       top k rows with the most extreme average values.
        - 'max':            top k rows with the highest average values.
        - 'min':            top k rows with the lowest average values.
        - 'min_max':        (top k)/2 rows with the biggest average values plus (top k)/2 with the smallest.
    """
    # Raw per-box records accumulated by `update`; turned into a DataFrame in `aggregate`.
    self.data = []
    # Picks the subset of classes worth plotting according to the chosen strategy.
    self.value_extractor = MostImportantValuesSelector(topk=topk, prioritization_mode=prioritization_mode)

def update(self, sample: DetectionSample):
Expand All @@ -31,6 +42,8 @@ def update(self, sample: DetectionSample):
def aggregate(self) -> Feature:
df = pd.DataFrame(self.data)

df = self.value_extractor.select(df=df, id_col="class_id", split_col="split", value_col="relative_bbox_area")

# Height of the plot is proportional to the number of classes
n_unique = len(df["class_name"].unique())
figsize_x = 10
Expand All @@ -53,9 +66,7 @@ def aggregate(self) -> Feature:
tight_layout=True,
)

json = dict(
train=dict(df[df["split"] == "train"]["relative_bbox_area"].describe()), val=dict(df[df["split"] == "val"]["relative_bbox_area"].describe())
)
json = {split: dict(df[df["split"] == split]["relative_bbox_area"].describe()) for split in df["split"].unique()}

feature = Feature(
data=df,
Expand Down
Original file line number Diff line number Diff line change
@@ -1,17 +1,27 @@
import pandas as pd

from data_gradients.common.registry.registry import register_feature_extractor
from data_gradients.feature_extractors.abstract_feature_extractor import Feature
from data_gradients.utils.data_classes import DetectionSample
from data_gradients.visualize.seaborn_renderer import BarPlotOptions
from data_gradients.feature_extractors.abstract_feature_extractor import AbstractFeatureExtractor
from data_gradients.feature_extractors.utils import MostImportantValuesSelector


@register_feature_extractor()
class DetectionClassFrequency(AbstractFeatureExtractor):
"""Feature Extractor to count the number of instance of each class."""

def __init__(self):
def __init__(self, topk: int = 30, prioritization_mode: str = "train_val_diff"):
    """Configure which classes this extractor will report on.

    :param topk:                How many rows (per split) to show.
    :param prioritization_mode: Strategy used to choose which classes are prioritized; only the topk are shown.
        - 'train_val_diff': top k rows with the biggest difference between 'train' and 'val' split values.
        - 'outliers':       top k rows with the most extreme average values.
        - 'max':            top k rows with the highest average values.
        - 'min':            top k rows with the lowest average values.
        - 'min_max':        (top k)/2 rows with the biggest average values plus (top k)/2 with the smallest.
    """
    # Raw per-instance records accumulated by `update`; aggregated into counts later.
    self.data = []
    # Picks the subset of classes worth plotting according to the chosen strategy.
    self.value_extractor = MostImportantValuesSelector(topk=topk, prioritization_mode=prioritization_mode)

def update(self, sample: DetectionSample):
Expand All @@ -34,6 +44,8 @@ def aggregate(self) -> Feature:
split_sums = df_class_count.groupby("split")["n_appearance"].sum()
df_class_count["frequency"] = 100 * (df_class_count["n_appearance"] / df_class_count["split"].map(split_sums))

df_class_count = self.value_extractor.select(df=df_class_count, id_col="class_id", split_col="split", value_col="frequency")

# Height of the plot is proportional to the number of classes
n_unique = len(df_class_count["class_name"].unique())
figsize_x = 10
Expand All @@ -53,10 +65,7 @@ def aggregate(self) -> Feature:
tight_layout=True,
)

json = dict(
train=dict(df_class_count[df_class_count["split"] == "train"]["n_appearance"].describe()),
val=dict(df_class_count[df_class_count["split"] == "val"]["n_appearance"].describe()),
)
json = {split: dict(df_class_count[df_class_count["split"] == split]["n_appearance"].describe()) for split in df_class_count["split"].unique()}

feature = Feature(
data=df_class_count,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,25 @@
from data_gradients.utils.data_classes import DetectionSample
from data_gradients.visualize.plot_options import ViolinPlotOptions
from data_gradients.feature_extractors.abstract_feature_extractor import AbstractFeatureExtractor
from data_gradients.feature_extractors.utils import MostImportantValuesSelector


@register_feature_extractor()
class DetectionClassesPerImageCount(AbstractFeatureExtractor):
"""Feature Extractor to show the distribution of number of instance of each class per image.
This gives information like "The class 'Human' usually appears 2 to 20 times per image."""

def __init__(self):
def __init__(self, topk: int = 30, prioritization_mode: str = "train_val_diff"):
    """Configure which classes this extractor will report on.

    :param topk:                How many rows (per split) to show.
    :param prioritization_mode: Strategy used to choose which classes are prioritized; only the topk are shown.
        - 'train_val_diff': top k rows with the biggest difference between 'train' and 'val' split values.
        - 'outliers':       top k rows with the most extreme average values.
        - 'max':            top k rows with the highest average values.
        - 'min':            top k rows with the lowest average values.
        - 'min_max':        (top k)/2 rows with the biggest average values plus (top k)/2 with the smallest.
    """
    # Raw per-instance records accumulated by `update`; grouped per image in `aggregate`.
    self.data = []
    # Picks the subset of classes worth plotting according to the chosen strategy.
    self.value_extractor = MostImportantValuesSelector(topk=topk, prioritization_mode=prioritization_mode)

def update(self, sample: DetectionSample):
Expand All @@ -33,8 +44,11 @@ def aggregate(self) -> Feature:
# Include ("class_name", "class_id", "split", "n_appearance")
# For each class, image, split, I want to know how many bbox I have
# TODO: check this

df_class_count = df.groupby(["class_name", "class_id", "sample_id", "split"]).size().reset_index(name="n_appearance")

df_class_count = self.value_extractor.select(df=df_class_count, id_col="class_id", split_col="split", value_col="n_appearance")

# Height of the plot is proportional to the number of classes
n_unique = len(df_class_count["class_name"].unique())
figsize_x = 10
Expand All @@ -55,10 +69,7 @@ def aggregate(self) -> Feature:
tight_layout=True,
)

json = dict(
train=dict(df_class_count[df_class_count["split"] == "train"]["n_appearance"].describe()),
val=dict(df_class_count[df_class_count["split"] == "val"]["n_appearance"].describe()),
)
json = {split: dict(df_class_count[df_class_count["split"] == split]["n_appearance"].describe()) for split in df_class_count["split"].unique()}

feature = Feature(
data=df_class_count,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from data_gradients.utils.data_classes import SegmentationSample
from data_gradients.visualize.seaborn_renderer import ViolinPlotOptions
from data_gradients.feature_extractors.abstract_feature_extractor import AbstractFeatureExtractor
from data_gradients.feature_extractors.utils import MostImportantValuesSelector


@register_feature_extractor()
Expand All @@ -14,7 +15,17 @@ class SegmentationBoundingBoxArea(AbstractFeatureExtractor):
Get all Bounding Boxes areas and plot them as a percentage of the whole image.
"""

def __init__(self):
def __init__(self, topk: int = 30, prioritization_mode: str = "train_val_diff"):
    """Configure which classes this extractor will report on.

    :param topk:                How many rows (per split) to show.
    :param prioritization_mode: Strategy used to choose which classes are prioritized; only the topk are shown.
        - 'train_val_diff': top k rows with the biggest difference between 'train' and 'val' split values.
        - 'outliers':       top k rows with the most extreme average values.
        - 'max':            top k rows with the highest average values.
        - 'min':            top k rows with the lowest average values.
        - 'min_max':        (top k)/2 rows with the biggest average values plus (top k)/2 with the smallest.
    """
    # Raw per-contour records accumulated by `update`; turned into a DataFrame in `aggregate`.
    self.data = []
    # Picks the subset of classes worth plotting according to the chosen strategy.
    self.value_extractor = MostImportantValuesSelector(topk=topk, prioritization_mode=prioritization_mode)

def update(self, sample: SegmentationSample):
Expand All @@ -28,21 +39,23 @@ def update(self, sample: SegmentationSample):
"split": sample.split,
"class_name": class_name,
"class_id": class_id,
"bbox_area": 100 * (contour.bbox_area / image_area),
"relative_bbox_area": 100 * (contour.bbox_area / image_area),
}
)

def aggregate(self) -> Feature:
df = pd.DataFrame(self.data)

df = self.value_extractor.select(df=df, id_col="class_id", split_col="split", value_col="relative_bbox_area")

# Height of the plot is proportional to the number of classes
n_unique = len(df["class_name"].unique())
figsize_x = 10
figsize_y = min(max(6, int(n_unique * 0.3)), 175)

max_area = min(100, df["bbox_area"].max())
max_area = min(100, df["relative_bbox_area"].max())
plot_options = ViolinPlotOptions(
x_label_key="bbox_area",
x_label_key="relative_bbox_area",
x_label_name="Object Area (in % of image)",
y_label_key="class_name",
y_label_name="Class",
Expand All @@ -55,7 +68,7 @@ def aggregate(self) -> Feature:
bandwidth=0.4,
tight_layout=True,
)
json = dict(train=dict(df[df["split"] == "train"]["bbox_area"].describe()), val=dict(df[df["split"] == "val"]["bbox_area"].describe()))
json = {split: dict(df[df["split"] == split]["relative_bbox_area"].describe()) for split in df["split"].unique()}

feature = Feature(
data=df,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,22 @@
from data_gradients.utils.data_classes import SegmentationSample
from data_gradients.visualize.seaborn_renderer import BarPlotOptions
from data_gradients.feature_extractors.abstract_feature_extractor import AbstractFeatureExtractor
from data_gradients.feature_extractors.utils import MostImportantValuesSelector


@register_feature_extractor()
class SegmentationClassFrequency(AbstractFeatureExtractor):
def __init__(self):
def __init__(self, topk: int = 30, prioritization_mode: str = "train_val_diff"):
    """Configure which classes this extractor will report on.

    :param topk:                How many rows (per split) to show.
    :param prioritization_mode: Strategy used to choose which classes are prioritized; only the topk are shown.
        - 'train_val_diff': top k rows with the biggest difference between 'train' and 'val' split values.
        - 'outliers':       top k rows with the most extreme average values.
        - 'max':            top k rows with the highest average values.
        - 'min':            top k rows with the lowest average values.
        - 'min_max':        (top k)/2 rows with the biggest average values plus (top k)/2 with the smallest.
    """
    # Raw per-instance records accumulated by `update`; aggregated into counts later.
    self.data = []
    # Picks the subset of classes worth plotting according to the chosen strategy.
    self.value_extractor = MostImportantValuesSelector(topk=topk, prioritization_mode=prioritization_mode)

def update(self, sample: SegmentationSample):
Expand All @@ -34,6 +45,8 @@ def aggregate(self) -> Feature:
split_sums = df_class_count.groupby("split")["n_appearance"].sum()
df_class_count["frequency"] = 100 * (df_class_count["n_appearance"] / df_class_count["split"].map(split_sums))

df_class_count = self.value_extractor.select(df=df_class_count, id_col="class_id", split_col="split", value_col="frequency")

# Height of the plot is proportional to the number of classes
n_unique = len(df_class_count["class_name"].unique())
figsize_x = 10
Expand All @@ -53,10 +66,7 @@ def aggregate(self) -> Feature:
tight_layout=True,
)

json = dict(
train=dict(df_class_count[df_class_count["split"] == "train"]["n_appearance"].describe()),
val=dict(df_class_count[df_class_count["split"] == "val"]["n_appearance"].describe()),
)
json = {split: dict(df_class_count[df_class_count["split"] == split]["n_appearance"].describe()) for split in df_class_count["split"].unique()}

feature = Feature(
data=df_class_count,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,22 @@
from data_gradients.utils.data_classes import SegmentationSample
from data_gradients.visualize.plot_options import ViolinPlotOptions
from data_gradients.feature_extractors.abstract_feature_extractor import AbstractFeatureExtractor
from data_gradients.feature_extractors.utils import MostImportantValuesSelector


@register_feature_extractor()
class SegmentationClassesPerImageCount(AbstractFeatureExtractor):
def __init__(self):
def __init__(self, topk: int = 30, prioritization_mode: str = "train_val_diff"):
    """Configure which classes this extractor will report on.

    :param topk:                How many rows (per split) to show.
    :param prioritization_mode: Strategy used to choose which classes are prioritized; only the topk are shown.
        - 'train_val_diff': top k rows with the biggest difference between 'train' and 'val' split values.
        - 'outliers':       top k rows with the most extreme average values.
        - 'max':            top k rows with the highest average values.
        - 'min':            top k rows with the lowest average values.
        - 'min_max':        (top k)/2 rows with the biggest average values plus (top k)/2 with the smallest.
    """
    # Raw per-instance records accumulated by `update`; grouped per image in `aggregate`.
    self.data = []
    # Picks the subset of classes worth plotting according to the chosen strategy.
    self.value_extractor = MostImportantValuesSelector(topk=topk, prioritization_mode=prioritization_mode)

def update(self, sample: SegmentationSample):
Expand All @@ -34,6 +45,8 @@ def aggregate(self) -> Feature:
# For each class, image, split, I want to know how many bbox I have
df_class_count = df.groupby(["class_name", "class_id", "sample_id", "split"]).size().reset_index(name="n_appearance")

df_class_count = self.value_extractor.select(df=df_class_count, id_col="class_id", split_col="split", value_col="n_appearance")

max_n_appearance = df_class_count["n_appearance"].max()

# Height of the plot is proportional to the number of classes
Expand All @@ -56,10 +69,7 @@ def aggregate(self) -> Feature:
tight_layout=True,
)

json = dict(
train=dict(df_class_count[df_class_count["split"] == "train"]["n_appearance"].describe()),
val=dict(df_class_count[df_class_count["split"] == "val"]["n_appearance"].describe()),
)
json = {split: dict(df_class_count[df_class_count["split"] == split]["n_appearance"].describe()) for split in df_class_count["split"].unique()}

feature = Feature(
data=df_class_count,
Expand Down
Loading

0 comments on commit d5de9dc

Please sign in to comment.