Skip to content

Commit

Permalink
Feature/sg 890 add filter option for multiclass (#129)
Browse files Browse the repository at this point in the history
* wip

* proposal

* wip

* wip

* first draft

* rename

* remove unwanted change

* add doc

* improve doc

* cleanup

* cleanup

* add min_max

* Rename "image_count" to "num_samples" and "annotation_count" to "num_annotations" for better clarity (#133)

Plus save them as ints in json not strings

* Drop-in replacement of appdirs to platformdirs (#134)

Co-authored-by: Ofri Masad <[email protected]>

* fix visualization of normalized images (#137)

* generalize

* add eps

* fix case when dataset to filter is too small

* generalize for many splits

* fix

---------

Co-authored-by: Eugene Khvedchenya <[email protected]>
Co-authored-by: Ofri Masad <[email protected]>
  • Loading branch information
3 people authored Jul 10, 2023
1 parent ef811da commit d5de9dc
Show file tree
Hide file tree
Showing 9 changed files with 176 additions and 65 deletions.
12 changes: 9 additions & 3 deletions src/data_gradients/config/detection.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,17 @@ report_sections:
n_rows: 6
n_cols: 2
heatmap_shape: [200, 200]
- DetectionBoundingBoxArea
- DetectionBoundingBoxArea:
topk: 30
prioritization_mode: train_val_diff
- DetectionBoundingBoxPerImageCount
- DetectionBoundingBoxSize
- DetectionClassFrequency
- DetectionClassesPerImageCount
- DetectionClassFrequency:
topk: 30
prioritization_mode: train_val_diff
- DetectionClassesPerImageCount:
topk: 30
prioritization_mode: train_val_diff
- DetectionBoundingBoxIoU:
num_bins: 10
class_agnostic: true
12 changes: 9 additions & 3 deletions src/data_gradients/config/segmentation.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,16 @@ report_sections:
n_rows: 6
n_cols: 2
heatmap_shape: [200, 200]
- SegmentationClassFrequency
- SegmentationClassesPerImageCount
- SegmentationClassFrequency:
topk: 30
prioritization_mode: train_val_diff
- SegmentationClassesPerImageCount:
topk: 30
prioritization_mode: train_val_diff
- SegmentationComponentsPerImageCount
- SegmentationBoundingBoxResolution
- SegmentationBoundingBoxArea
- SegmentationBoundingBoxArea:
topk: 30
prioritization_mode: train_val_diff
- SegmentationComponentsConvexity
- SegmentationComponentsErosion
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,24 @@
from data_gradients.utils.data_classes import DetectionSample
from data_gradients.visualize.seaborn_renderer import ViolinPlotOptions
from data_gradients.feature_extractors.abstract_feature_extractor import AbstractFeatureExtractor
from data_gradients.feature_extractors.utils import MostImportantValuesSelector


@register_feature_extractor()
class DetectionBoundingBoxArea(AbstractFeatureExtractor):
"""Feature Extractor to compute the area covered Bounding Boxes."""

def __init__(self):
def __init__(self, topk: int = 30, prioritization_mode: str = "train_val_diff"):
    """Configure which classes this extractor will report on.

    :param topk:                How many rows (per split) to show.
    :param prioritization_mode: Strategy used to choose which classes are prioritized; only the topk are shown.
        - 'train_val_diff': top k rows with the biggest difference between 'train' and 'val' split values.
        - 'outliers':       top k rows with the most extreme average values.
        - 'max':            top k rows with the highest average values.
        - 'min':            top k rows with the lowest average values.
        - 'min_max':        (top k)/2 rows with the biggest average values plus (top k)/2 with the smallest.
    """
    # Raw per-box records accumulated by `update`; turned into a DataFrame in `aggregate`.
    self.data = []
    # Picks the subset of classes worth plotting according to the chosen strategy.
    self.value_extractor = MostImportantValuesSelector(topk=topk, prioritization_mode=prioritization_mode)

def update(self, sample: DetectionSample):
Expand All @@ -31,6 +42,8 @@ def update(self, sample: DetectionSample):
def aggregate(self) -> Feature:
df = pd.DataFrame(self.data)

df = self.value_extractor.select(df=df, id_col="class_id", split_col="split", value_col="relative_bbox_area")

# Height of the plot is proportional to the number of classes
n_unique = len(df["class_name"].unique())
figsize_x = 10
Expand All @@ -53,9 +66,7 @@ def aggregate(self) -> Feature:
tight_layout=True,
)

json = dict(
train=dict(df[df["split"] == "train"]["relative_bbox_area"].describe()), val=dict(df[df["split"] == "val"]["relative_bbox_area"].describe())
)
json = {split: dict(df[df["split"] == split]["relative_bbox_area"].describe()) for split in df["split"].unique()}

feature = Feature(
data=df,
Expand Down
Original file line number Diff line number Diff line change
@@ -1,17 +1,27 @@
import pandas as pd

from data_gradients.common.registry.registry import register_feature_extractor
from data_gradients.feature_extractors.abstract_feature_extractor import Feature
from data_gradients.utils.data_classes import DetectionSample
from data_gradients.visualize.seaborn_renderer import BarPlotOptions
from data_gradients.feature_extractors.abstract_feature_extractor import AbstractFeatureExtractor
from data_gradients.feature_extractors.utils import MostImportantValuesSelector


@register_feature_extractor()
class DetectionClassFrequency(AbstractFeatureExtractor):
"""Feature Extractor to count the number of instance of each class."""

def __init__(self):
def __init__(self, topk: int = 30, prioritization_mode: str = "train_val_diff"):
    """Configure which classes this extractor will report on.

    :param topk:                How many rows (per split) to show.
    :param prioritization_mode: Strategy used to choose which classes are prioritized; only the topk are shown.
        - 'train_val_diff': top k rows with the biggest difference between 'train' and 'val' split values.
        - 'outliers':       top k rows with the most extreme average values.
        - 'max':            top k rows with the highest average values.
        - 'min':            top k rows with the lowest average values.
        - 'min_max':        (top k)/2 rows with the biggest average values plus (top k)/2 with the smallest.
    """
    # Raw per-instance records accumulated by `update`; aggregated into counts later.
    self.data = []
    # Picks the subset of classes worth plotting according to the chosen strategy.
    self.value_extractor = MostImportantValuesSelector(topk=topk, prioritization_mode=prioritization_mode)

def update(self, sample: DetectionSample):
Expand All @@ -34,6 +44,8 @@ def aggregate(self) -> Feature:
split_sums = df_class_count.groupby("split")["n_appearance"].sum()
df_class_count["frequency"] = 100 * (df_class_count["n_appearance"] / df_class_count["split"].map(split_sums))

df_class_count = self.value_extractor.select(df=df_class_count, id_col="class_id", split_col="split", value_col="frequency")

# Height of the plot is proportional to the number of classes
n_unique = len(df_class_count["class_name"].unique())
figsize_x = 10
Expand All @@ -53,10 +65,7 @@ def aggregate(self) -> Feature:
tight_layout=True,
)

json = dict(
train=dict(df_class_count[df_class_count["split"] == "train"]["n_appearance"].describe()),
val=dict(df_class_count[df_class_count["split"] == "val"]["n_appearance"].describe()),
)
json = {split: dict(df_class_count[df_class_count["split"] == split]["n_appearance"].describe()) for split in df_class_count["split"].unique()}

feature = Feature(
data=df_class_count,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,25 @@
from data_gradients.utils.data_classes import DetectionSample
from data_gradients.visualize.plot_options import ViolinPlotOptions
from data_gradients.feature_extractors.abstract_feature_extractor import AbstractFeatureExtractor
from data_gradients.feature_extractors.utils import MostImportantValuesSelector


@register_feature_extractor()
class DetectionClassesPerImageCount(AbstractFeatureExtractor):
"""Feature Extractor to show the distribution of number of instance of each class per image.
This gives information like "The class 'Human' usually appears 2 to 20 times per image."""

def __init__(self):
def __init__(self, topk: int = 30, prioritization_mode: str = "train_val_diff"):
    """Configure which classes this extractor will report on.

    :param topk:                How many rows (per split) to show.
    :param prioritization_mode: Strategy used to choose which classes are prioritized; only the topk are shown.
        - 'train_val_diff': top k rows with the biggest difference between 'train' and 'val' split values.
        - 'outliers':       top k rows with the most extreme average values.
        - 'max':            top k rows with the highest average values.
        - 'min':            top k rows with the lowest average values.
        - 'min_max':        (top k)/2 rows with the biggest average values plus (top k)/2 with the smallest.
    """
    # Raw per-instance records accumulated by `update`; grouped per image in `aggregate`.
    self.data = []
    # Picks the subset of classes worth plotting according to the chosen strategy.
    self.value_extractor = MostImportantValuesSelector(topk=topk, prioritization_mode=prioritization_mode)

def update(self, sample: DetectionSample):
Expand All @@ -33,8 +44,11 @@ def aggregate(self) -> Feature:
# Include ("class_name", "class_id", "split", "n_appearance")
# For each class, image, split, I want to know how many bbox I have
# TODO: check this

df_class_count = df.groupby(["class_name", "class_id", "sample_id", "split"]).size().reset_index(name="n_appearance")

df_class_count = self.value_extractor.select(df=df_class_count, id_col="class_id", split_col="split", value_col="n_appearance")

# Height of the plot is proportional to the number of classes
n_unique = len(df_class_count["class_name"].unique())
figsize_x = 10
Expand All @@ -55,10 +69,7 @@ def aggregate(self) -> Feature:
tight_layout=True,
)

json = dict(
train=dict(df_class_count[df_class_count["split"] == "train"]["n_appearance"].describe()),
val=dict(df_class_count[df_class_count["split"] == "val"]["n_appearance"].describe()),
)
json = {split: dict(df_class_count[df_class_count["split"] == split]["n_appearance"].describe()) for split in df_class_count["split"].unique()}

feature = Feature(
data=df_class_count,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from data_gradients.utils.data_classes import SegmentationSample
from data_gradients.visualize.seaborn_renderer import ViolinPlotOptions
from data_gradients.feature_extractors.abstract_feature_extractor import AbstractFeatureExtractor
from data_gradients.feature_extractors.utils import MostImportantValuesSelector


@register_feature_extractor()
Expand All @@ -14,7 +15,17 @@ class SegmentationBoundingBoxArea(AbstractFeatureExtractor):
Get all Bounding Boxes areas and plot them as a percentage of the whole image.
"""

def __init__(self):
def __init__(self, topk: int = 30, prioritization_mode: str = "train_val_diff"):
    """Configure which classes this extractor will report on.

    :param topk:                How many rows (per split) to show.
    :param prioritization_mode: Strategy used to choose which classes are prioritized; only the topk are shown.
        - 'train_val_diff': top k rows with the biggest difference between 'train' and 'val' split values.
        - 'outliers':       top k rows with the most extreme average values.
        - 'max':            top k rows with the highest average values.
        - 'min':            top k rows with the lowest average values.
        - 'min_max':        (top k)/2 rows with the biggest average values plus (top k)/2 with the smallest.
    """
    # Raw per-contour records accumulated by `update`; turned into a DataFrame in `aggregate`.
    self.data = []
    # Picks the subset of classes worth plotting according to the chosen strategy.
    self.value_extractor = MostImportantValuesSelector(topk=topk, prioritization_mode=prioritization_mode)

def update(self, sample: SegmentationSample):
Expand All @@ -28,21 +39,23 @@ def update(self, sample: SegmentationSample):
"split": sample.split,
"class_name": class_name,
"class_id": class_id,
"bbox_area": 100 * (contour.bbox_area / image_area),
"relative_bbox_area": 100 * (contour.bbox_area / image_area),
}
)

def aggregate(self) -> Feature:
df = pd.DataFrame(self.data)

df = self.value_extractor.select(df=df, id_col="class_id", split_col="split", value_col="relative_bbox_area")

# Height of the plot is proportional to the number of classes
n_unique = len(df["class_name"].unique())
figsize_x = 10
figsize_y = min(max(6, int(n_unique * 0.3)), 175)

max_area = min(100, df["bbox_area"].max())
max_area = min(100, df["relative_bbox_area"].max())
plot_options = ViolinPlotOptions(
x_label_key="bbox_area",
x_label_key="relative_bbox_area",
x_label_name="Object Area (in % of image)",
y_label_key="class_name",
y_label_name="Class",
Expand All @@ -55,7 +68,7 @@ def aggregate(self) -> Feature:
bandwidth=0.4,
tight_layout=True,
)
json = dict(train=dict(df[df["split"] == "train"]["bbox_area"].describe()), val=dict(df[df["split"] == "val"]["bbox_area"].describe()))
json = {split: dict(df[df["split"] == split]["relative_bbox_area"].describe()) for split in df["split"].unique()}

feature = Feature(
data=df,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,22 @@
from data_gradients.utils.data_classes import SegmentationSample
from data_gradients.visualize.seaborn_renderer import BarPlotOptions
from data_gradients.feature_extractors.abstract_feature_extractor import AbstractFeatureExtractor
from data_gradients.feature_extractors.utils import MostImportantValuesSelector


@register_feature_extractor()
class SegmentationClassFrequency(AbstractFeatureExtractor):
def __init__(self):
def __init__(self, topk: int = 30, prioritization_mode: str = "train_val_diff"):
    """Configure which classes this extractor will report on.

    :param topk:                How many rows (per split) to show.
    :param prioritization_mode: Strategy used to choose which classes are prioritized; only the topk are shown.
        - 'train_val_diff': top k rows with the biggest difference between 'train' and 'val' split values.
        - 'outliers':       top k rows with the most extreme average values.
        - 'max':            top k rows with the highest average values.
        - 'min':            top k rows with the lowest average values.
        - 'min_max':        (top k)/2 rows with the biggest average values plus (top k)/2 with the smallest.
    """
    # Raw per-instance records accumulated by `update`; aggregated into counts later.
    self.data = []
    # Picks the subset of classes worth plotting according to the chosen strategy.
    self.value_extractor = MostImportantValuesSelector(topk=topk, prioritization_mode=prioritization_mode)

def update(self, sample: SegmentationSample):
Expand All @@ -34,6 +45,8 @@ def aggregate(self) -> Feature:
split_sums = df_class_count.groupby("split")["n_appearance"].sum()
df_class_count["frequency"] = 100 * (df_class_count["n_appearance"] / df_class_count["split"].map(split_sums))

df_class_count = self.value_extractor.select(df=df_class_count, id_col="class_id", split_col="split", value_col="frequency")

# Height of the plot is proportional to the number of classes
n_unique = len(df_class_count["class_name"].unique())
figsize_x = 10
Expand All @@ -53,10 +66,7 @@ def aggregate(self) -> Feature:
tight_layout=True,
)

json = dict(
train=dict(df_class_count[df_class_count["split"] == "train"]["n_appearance"].describe()),
val=dict(df_class_count[df_class_count["split"] == "val"]["n_appearance"].describe()),
)
json = {split: dict(df_class_count[df_class_count["split"] == split]["n_appearance"].describe()) for split in df_class_count["split"].unique()}

feature = Feature(
data=df_class_count,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,22 @@
from data_gradients.utils.data_classes import SegmentationSample
from data_gradients.visualize.plot_options import ViolinPlotOptions
from data_gradients.feature_extractors.abstract_feature_extractor import AbstractFeatureExtractor
from data_gradients.feature_extractors.utils import MostImportantValuesSelector


@register_feature_extractor()
class SegmentationClassesPerImageCount(AbstractFeatureExtractor):
def __init__(self):
def __init__(self, topk: int = 30, prioritization_mode: str = "train_val_diff"):
    """Configure which classes this extractor will report on.

    :param topk:                How many rows (per split) to show.
    :param prioritization_mode: Strategy used to choose which classes are prioritized; only the topk are shown.
        - 'train_val_diff': top k rows with the biggest difference between 'train' and 'val' split values.
        - 'outliers':       top k rows with the most extreme average values.
        - 'max':            top k rows with the highest average values.
        - 'min':            top k rows with the lowest average values.
        - 'min_max':        (top k)/2 rows with the biggest average values plus (top k)/2 with the smallest.
    """
    # Raw per-instance records accumulated by `update`; grouped per image in `aggregate`.
    self.data = []
    # Picks the subset of classes worth plotting according to the chosen strategy.
    self.value_extractor = MostImportantValuesSelector(topk=topk, prioritization_mode=prioritization_mode)

def update(self, sample: SegmentationSample):
Expand All @@ -34,6 +45,8 @@ def aggregate(self) -> Feature:
# For each class, image, split, I want to know how many bbox I have
df_class_count = df.groupby(["class_name", "class_id", "sample_id", "split"]).size().reset_index(name="n_appearance")

df_class_count = self.value_extractor.select(df=df_class_count, id_col="class_id", split_col="split", value_col="n_appearance")

max_n_appearance = df_class_count["n_appearance"].max()

# Height of the plot is proportional to the number of classes
Expand All @@ -56,10 +69,7 @@ def aggregate(self) -> Feature:
tight_layout=True,
)

json = dict(
train=dict(df_class_count[df_class_count["split"] == "train"]["n_appearance"].describe()),
val=dict(df_class_count[df_class_count["split"] == "val"]["n_appearance"].describe()),
)
json = {split: dict(df_class_count[df_class_count["split"] == split]["n_appearance"].describe()) for split in df_class_count["split"].unique()}

feature = Feature(
data=df_class_count,
Expand Down
Loading

0 comments on commit d5de9dc

Please sign in to comment.