From 37678a04e5367f375500b746099705bca51dcba8 Mon Sep 17 00:00:00 2001 From: Louis Dupont Date: Thu, 29 Jun 2023 17:10:21 +0300 Subject: [PATCH 01/20] wip --- .../batch_processors/formatters/detection.py | 7 +++++++ .../object_detection/classes_frequency_per_image.py | 7 +++++++ 2 files changed, 14 insertions(+) diff --git a/src/data_gradients/batch_processors/formatters/detection.py b/src/data_gradients/batch_processors/formatters/detection.py index 0b8336f0..efb7b184 100644 --- a/src/data_gradients/batch_processors/formatters/detection.py +++ b/src/data_gradients/batch_processors/formatters/detection.py @@ -57,6 +57,10 @@ def format(self, images: Tensor, labels: Tensor) -> Tuple[Tensor, List[Tensor]]: - labels: List of bounding boxes, each of shape (N_i, 5 [label_xyxy]) with N_i being the number of bounding boxes with class_id in class_ids """ + # Might happen if the user passes tensors as [N, 5] with N=1; If poorly coded, the Dataset may instead return a [5] tensor + if labels.numel() == 0: + labels = torch.zeros((0, 5)) + # If the label is of shape [N, 5] we can assume that it represents the targets of a single sample (class_name + 4 bbox coordinates) if labels.ndim == 2 and labels.shape[1] == 5: images = images.unsqueeze(0) @@ -76,6 +80,9 @@ def format(self, images: Tensor, labels: Tensor) -> Tuple[Tensor, List[Tensor]]: images *= 255 images = images.to(torch.uint8) + if labels.numel() == 0: + return images, labels + labels = self.convert_to_label_xyxy( annotated_bboxes=labels, image_shape=images.shape[-2:], diff --git a/src/data_gradients/feature_extractors/object_detection/classes_frequency_per_image.py b/src/data_gradients/feature_extractors/object_detection/classes_frequency_per_image.py index 3ba7c474..b5a8d3a3 100644 --- a/src/data_gradients/feature_extractors/object_detection/classes_frequency_per_image.py +++ b/src/data_gradients/feature_extractors/object_detection/classes_frequency_per_image.py @@ -35,6 +35,12 @@ def aggregate(self) -> Feature: # TODO: check this df_class_count = df.groupby(["class_name", "class_id", "sample_id", "split"]).size().reset_index(name="n_appearance") + n_unique = len(df_class_count["class_name"].unique()) + factor = max(1.0, n_unique / 10.0) + figsize_x = min(max(10, int(10 * factor)), 30) + figsize_y = min(max(6, int(6 * factor / 2)), 9) + + print("factor: ", figsize_x, figsize_y) plot_options = ViolinPlotOptions( x_label_key="n_appearance", x_label_name="Number of class instance per Image", @@ -44,6 +50,7 @@ def aggregate(self) -> Feature: title=self.title, x_lim=(0, df_class_count["n_appearance"].max() * 1.2), bandwidth=0.4, + figsize=(figsize_x, figsize_y), x_ticks_rotation=None, labels_key="split", ) From 3240de5c4c46e30351532262138edf1119839e5f Mon Sep 17 00:00:00 2001 From: Louis Dupont Date: Sun, 2 Jul 2023 15:49:44 +0300 Subject: [PATCH 02/20] proposal --- .../object_detection/bounding_boxes_area.py | 7 +++++++ .../object_detection/bounding_boxes_iou.py | 10 +++++++--- .../object_detection/classes_frequency.py | 6 ++++++ .../object_detection/classes_frequency_per_image.py | 7 +++---- src/data_gradients/visualize/seaborn_renderer.py | 5 +++-- 5 files changed, 26 insertions(+), 9 deletions(-) diff --git a/src/data_gradients/feature_extractors/object_detection/bounding_boxes_area.py b/src/data_gradients/feature_extractors/object_detection/bounding_boxes_area.py index 73c5c272..e8bdbe2b 100644 --- a/src/data_gradients/feature_extractors/object_detection/bounding_boxes_area.py +++ b/src/data_gradients/feature_extractors/object_detection/bounding_boxes_area.py @@ -31,7 +31,13 @@ def update(self, sample: DetectionSample): def aggregate(self) -> Feature: df = pd.DataFrame(self.data) + # Height of the plot is proportional to the number of classes + n_unique = len(df["class_name"].unique()) + figsize_x = 10 + figsize_y = min(max(6, int(n_unique * 0.3)), 90) + max_area = min(100, df["relative_bbox_area"].max()) + plot_options = ViolinPlotOptions( x_label_key="relative_bbox_area", x_label_name="Bounding Box Area (in % of image)", @@ -42,6 +48,7 @@ def aggregate(self) -> Feature: x_ticks_rotation=None, labels_key="split", x_lim=(0, max_area), + figsize=(figsize_x, figsize_y), bandwidth=0.4, ) diff --git a/src/data_gradients/feature_extractors/object_detection/bounding_boxes_iou.py b/src/data_gradients/feature_extractors/object_detection/bounding_boxes_iou.py index c571f7db..31993210 100644 --- a/src/data_gradients/feature_extractors/object_detection/bounding_boxes_iou.py +++ b/src/data_gradients/feature_extractors/object_detection/bounding_boxes_iou.py @@ -74,7 +74,7 @@ def aggregate(self) -> Feature: data = {} json = {} - splits = df["split"].unique() + splits = sorted(df["split"].unique()) for split in splits: counts = self._compute_cumulative_counts_at_thresholds(df[df["split"] == split], class_names, self.num_bins) @@ -95,6 +95,11 @@ def aggregate(self) -> Feature: self._show_plot = False return Feature(data=None, plot_options=None, json={}) + # Height of the plot is proportional to the number of classes + figsize_x = min(max(10, len(bins)), 25) + figsize_y = int(num_classes * 0.3) + 4 + figsize_y = min(max(6, figsize_y), 90) + plot_options = HeatmapOptions( xticklabels=xticklabels, yticklabels=class_names + ["All classes"], @@ -106,8 +111,7 @@ def aggregate(self) -> Feature: annot=True, title=self.title, square=True, - # Height of the plot is proportional to the number of classes - figsize=(10, (int(num_classes * 0.3) + 4) * len(splits)), + figsize=(figsize_x, figsize_y), tight_layout=True, x_ticks_rotation=90, ) diff --git a/src/data_gradients/feature_extractors/object_detection/classes_frequency.py b/src/data_gradients/feature_extractors/object_detection/classes_frequency.py index e1b5225e..dabadff5 100644 --- a/src/data_gradients/feature_extractors/object_detection/classes_frequency.py +++ b/src/data_gradients/feature_extractors/object_detection/classes_frequency.py @@ -34,6 +34,11 @@ def aggregate(self) -> Feature: split_sums = df_class_count.groupby("split")["n_appearance"].sum() df_class_count["frequency"] = 100 * (df_class_count["n_appearance"] / df_class_count["split"].map(split_sums)) + # Height of the plot is proportional to the number of classes + n_unique = len(df_class_count["class_name"].unique()) + figsize_x = 10 + figsize_y = min(max(6, int(n_unique * 0.3)), 90) + plot_options = BarPlotOptions( x_label_key="frequency", x_label_name="Frequency", @@ -41,6 +46,7 @@ def aggregate(self) -> Feature: y_label_name="Class", order_key="class_id", title=self.title, + figsize=(figsize_x, figsize_y), x_ticks_rotation=None, labels_key="split", orient="h", diff --git a/src/data_gradients/feature_extractors/object_detection/classes_frequency_per_image.py b/src/data_gradients/feature_extractors/object_detection/classes_frequency_per_image.py index b5a8d3a3..8f33c030 100644 --- a/src/data_gradients/feature_extractors/object_detection/classes_frequency_per_image.py +++ b/src/data_gradients/feature_extractors/object_detection/classes_frequency_per_image.py @@ -35,12 +35,11 @@ def aggregate(self) -> Feature: # TODO: check this df_class_count = df.groupby(["class_name", "class_id", "sample_id", "split"]).size().reset_index(name="n_appearance") + # Height of the plot is proportional to the number of classes n_unique = len(df_class_count["class_name"].unique()) - factor = max(1.0, n_unique / 10.0) - figsize_x = min(max(10, int(10 * factor)), 30) - figsize_y = min(max(6, int(6 * factor / 2)), 9) + figsize_x = 10 + figsize_y = min(max(6, int(n_unique * 0.3)), 90) - print("factor: ", figsize_x, figsize_y) plot_options = ViolinPlotOptions( x_label_key="n_appearance", x_label_name="Number of class instance per Image", diff --git a/src/data_gradients/visualize/seaborn_renderer.py b/src/data_gradients/visualize/seaborn_renderer.py index 71d743d8..233204bf 100644 --- a/src/data_gradients/visualize/seaborn_renderer.py +++ b/src/data_gradients/visualize/seaborn_renderer.py @@ -378,17 +378,18 @@ def _render_barplot(self, df, options: BarPlotOptions) -> plt.Figure: def _render_heatmap(self, data: Mapping[str, np.ndarray], options: HeatmapOptions) -> plt.Figure: - fig, axes = plt.subplots(nrows=len(data), ncols=1, figsize=options.figsize, tight_layout=options.tight_layout) + fig, axes = plt.subplots(nrows=1, ncols=len(data), figsize=options.figsize, tight_layout=options.tight_layout) fig.subplots_adjust() for i, (key, heatmap) in enumerate(data.items()): ax = axes[i] if len(data) > 1 else axes + cbar = options.cbar if i + 1 == len(data) else False heatmap_args = dict( data=heatmap, xticklabels=options.xticklabels, yticklabels=options.yticklabels, annot=options.annot, - cbar=options.cbar, + cbar=cbar, cbar_kws={"shrink": 0.5}, square=options.square, cmap=options.cmap, From 6a9c8748387eec3a86b8e71a6a9f9a7cf4cf46d6 Mon Sep 17 00:00:00 2001 From: Louis Dupont Date: Mon, 3 Jul 2023 10:02:22 +0300 Subject: [PATCH 03/20] wip --- .../object_detection/classes_frequency.py | 40 ++++++++++++++++++- 1 file changed, 39 insertions(+), 1 deletion(-) diff --git a/src/data_gradients/feature_extractors/object_detection/classes_frequency.py b/src/data_gradients/feature_extractors/object_detection/classes_frequency.py index dabadff5..a345efd8 100644 --- a/src/data_gradients/feature_extractors/object_detection/classes_frequency.py +++ b/src/data_gradients/feature_extractors/object_detection/classes_frequency.py @@ -1,5 +1,5 @@ import pandas as pd - +from abc import ABC, abstractmethod from data_gradients.common.registry.registry import register_feature_extractor from data_gradients.feature_extractors.abstract_feature_extractor import Feature from data_gradients.utils.data_classes import DetectionSample @@ -75,3 +75,41 @@ def description(self) -> str: "For instance, if one of the class only appears in the validation set, you know in advance that your model won't be able to " "learn to predict that class." ) + + +class DataframeExtractor(ABC): + def __init__(self, topk: int): + self.topk = topk + + @abstractmethod + def extract(self, df: pd.DataFrame, column: str) -> pd.DataFrame: + pass + + +class OutliersExtractor(DataframeExtractor): + def extract(self, df: pd.DataFrame, column: str) -> pd.DataFrame: + values = df[column] + values_normalized = (values - values.mean()) / values.var() + outliers_index = values_normalized.abs().sort_values(ascending=False).index[: self.topk] + return df[outliers_index] + + +class HighestValuesExtractor(DataframeExtractor): + def extract(self, df: pd.DataFrame, column: str) -> pd.DataFrame: + return df.sort_values(by=column, ascending=False)[: self.topk] + + +class LowestValuesExtractor(DataframeExtractor): + def extract(self, df: pd.DataFrame, column: str) -> pd.DataFrame: + return df.sort_values(by=column, ascending=True)[: self.topk] + + +def get_dataframe_extractor_per_frequency(extractor_name: str, topk: int) -> DataframeExtractor: + available_extractors = { + "outliers": OutliersExtractor(topk=topk), + "most_frequent": HighestValuesExtractor(topk=topk), + "least_frequent": LowestValuesExtractor(topk=topk), + } + if extractor_name not in available_extractors.keys(): + raise ValueError + return available_extractors[extractor_name] From 92c3b1a43f23938ecd5ae5f7ab15229c46a70c10 Mon Sep 17 00:00:00 2001 From: Louis Dupont Date: Wed, 5 Jul 2023 15:52:46 +0300 Subject: [PATCH 04/20] wip --- .../object_detection/bounding_boxes_area.py | 3 + .../object_detection/classes_frequency.py | 3 + .../classes_frequency_per_image.py | 3 + .../feature_extractors/utils.py | 77 ++++++++++--------- 4 files changed, 51 insertions(+), 35 deletions(-) diff --git a/src/data_gradients/feature_extractors/object_detection/bounding_boxes_area.py b/src/data_gradients/feature_extractors/object_detection/bounding_boxes_area.py index e8bdbe2b..cc3561b6 100644 --- a/src/data_gradients/feature_extractors/object_detection/bounding_boxes_area.py +++ b/src/data_gradients/feature_extractors/object_detection/bounding_boxes_area.py @@ -5,6 +5,7 @@ from data_gradients.utils.data_classes import DetectionSample from data_gradients.visualize.seaborn_renderer import ViolinPlotOptions from data_gradients.feature_extractors.abstract_feature_extractor import AbstractFeatureExtractor +from data_gradients.feature_extractors.utils import get_top_values @register_feature_extractor() @@ -31,6 +32,8 @@ def update(self, sample: DetectionSample): def aggregate(self) -> Feature: df = pd.DataFrame(self.data) + df = get_top_values(df=df, id_col="class_id", split_col="split", val_col="relative_bbox_area", mode="outliers") + # Height of the plot is proportional to the number of classes n_unique = len(df["class_name"].unique()) figsize_x = 10 diff --git a/src/data_gradients/feature_extractors/object_detection/classes_frequency.py b/src/data_gradients/feature_extractors/object_detection/classes_frequency.py index a345efd8..9818bef8 100644 --- a/src/data_gradients/feature_extractors/object_detection/classes_frequency.py +++ b/src/data_gradients/feature_extractors/object_detection/classes_frequency.py @@ -5,6 +5,7 @@ from data_gradients.utils.data_classes import DetectionSample from data_gradients.visualize.seaborn_renderer import BarPlotOptions from data_gradients.feature_extractors.abstract_feature_extractor import AbstractFeatureExtractor +from data_gradients.feature_extractors.utils import get_top_values @register_feature_extractor() @@ -34,6 +35,8 @@ def aggregate(self) -> Feature: split_sums = df_class_count.groupby("split")["n_appearance"].sum() df_class_count["frequency"] = 100 * (df_class_count["n_appearance"] / df_class_count["split"].map(split_sums)) + df_class_count = get_top_values(df=df_class_count, id_col="class_id", split_col="split", val_col="frequency", mode="outliers") + # Height of the plot is proportional to the number of classes n_unique = len(df_class_count["class_name"].unique()) figsize_x = 10 diff --git a/src/data_gradients/feature_extractors/object_detection/classes_frequency_per_image.py b/src/data_gradients/feature_extractors/object_detection/classes_frequency_per_image.py index 8f33c030..813705a0 100644 --- a/src/data_gradients/feature_extractors/object_detection/classes_frequency_per_image.py +++ b/src/data_gradients/feature_extractors/object_detection/classes_frequency_per_image.py @@ -5,6 +5,7 @@ from data_gradients.utils.data_classes import DetectionSample from data_gradients.visualize.plot_options import ViolinPlotOptions from data_gradients.feature_extractors.abstract_feature_extractor import AbstractFeatureExtractor +from data_gradients.feature_extractors.utils import get_top_values @register_feature_extractor() @@ -35,6 +36,8 @@ def aggregate(self) -> Feature: # TODO: check this df_class_count = df.groupby(["class_name", "class_id", "sample_id", "split"]).size().reset_index(name="n_appearance") + df_class_count = get_top_values(df=df_class_count, id_col="class_id", split_col="split", val_col="n_appearance", mode="outliers") + # Height of the plot is proportional to the number of classes n_unique = len(df_class_count["class_name"].unique()) figsize_x = 10 diff --git a/src/data_gradients/feature_extractors/utils.py b/src/data_gradients/feature_extractors/utils.py index f80425e6..379b3d8e 100644 --- a/src/data_gradients/feature_extractors/utils.py +++ b/src/data_gradients/feature_extractors/utils.py @@ -1,40 +1,47 @@ -from typing import Dict, List, Any, Tuple - import numpy as np -def align_histogram_keys(train_histogram: Dict[str, Any], val_histogram: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any]]: - """Enforces the keys of training and validation histograms to be the same. - If one of the keys is missing, the histogram will filled with defaults value (0, 0.0, "") depending on the situation - - :param train_histogram: Histogram representing metrics from training split. - :param val_histogram: Histogram representing metrics from validation split. - :return: A merged dictionary containing key-value pairs from both "train" and "val" splits. - """ - keys = set(train_histogram.keys()) | set(val_histogram.keys()) - - aligned_train_histogram, aligned_val_histogram = {}, {} - for key in keys: - train_value = train_histogram.get(key) - val_value = val_histogram.get(key) - - value_type = type(train_value) if train_value is not None else type(val_value) - default_value = value_type() - - aligned_train_histogram[key] = train_value or default_value - aligned_val_histogram[key] = val_value or default_value - - return aligned_train_histogram, aligned_val_histogram - - -def normalize_values_to_percentages(counters: List[float], total_count: float) -> List[float]: +def get_top_values(df, id_col, split_col, val_col, mode): """ - Normalize a list of count to percentages relative to a total value. - - :param counters: Values to normalize. - :param total_count: Total number of values, which will be used to calculate percentages. - :return: Values representing the percentages of each input value. + Returns the top 5 rows of the DataFrame based on the mode. + The DataFrame is expected to have three columns: id_col, split_col, val_col. + + Modes: + 'gap' - Returns rows with the biggest gap between 'train' and 'val' split values. + 'outliers' - Returns rows with the most extreme average split values. + 'max' - Returns rows with the highest average split values. + 'min' - Returns rows with the lowest average split values. """ - if total_count == 0: - total_count = 1 - return [np.round(((100 * count) / total_count), 3) for count in counters] + # Verify inputs + for col in [id_col, split_col, val_col]: + if col not in df.columns: + raise ValueError(f"{col} is not a column in the DataFrame") + print(id_col, split_col, val_col) + + # the mean of val_col for each id_col/split_col + df_mean = df.groupby([id_col, split_col])[val_col].mean().reset_index() + + # Pivot DataFrame to have 'train' and 'val' as columns + df_pivot = df_mean.pivot(index=id_col, columns=split_col, values=val_col) + + # Calculate the relative difference or average based on the mode + if mode == "gap": + df_pivot["metric"] = np.abs((df_pivot["train"] - df_pivot["val"]) / ((df_pivot["train"] + df_pivot["val"]) / 2)) + elif mode in ["outliers", "max", "min"]: + df_pivot["metric"] = (df_pivot["train"] + df_pivot["val"]) / 2 + + # Calculate the z-score if mode is 'outliers' + if mode == "outliers": + mean, std = df_pivot["metric"].mean(), df_pivot["metric"].std() + df_pivot["metric"] = (df_pivot["metric"] - mean).abs() / std + + # Get the top 5 class_ids based on the metric + if mode in ["gap", "outliers", "max"]: + top_ids = df_pivot.nlargest(5, "metric").index + elif mode == "min": + top_ids = df_pivot.nsmallest(5, "metric").index + else: + raise ValueError("Invalid mode. Expected one of: gap, outliers, max, min") + + # Filter the original DataFrame to only include rows with the top class_ids + return df[df[id_col].isin(top_ids)] From 8b46be9ef209983997d9392b84f3bf89ee80c150 Mon Sep 17 00:00:00 2001 From: Louis Dupont Date: Wed, 5 Jul 2023 18:25:07 +0300 Subject: [PATCH 05/20] first draft --- src/data_gradients/config/detection.yaml | 12 +- src/data_gradients/config/segmentation.yaml | 12 +- .../object_detection/bounding_boxes_area.py | 7 +- .../object_detection/classes_frequency.py | 7 +- .../classes_frequency_per_image.py | 7 +- .../segmentation/bounding_boxes_area.py | 16 ++- .../segmentation/classes_frequency.py | 6 +- .../classes_frequency_per_image.py | 6 +- .../feature_extractors/utils.py | 105 ++++++++++-------- 9 files changed, 110 insertions(+), 68 deletions(-) diff --git a/src/data_gradients/config/detection.yaml b/src/data_gradients/config/detection.yaml index 2632fc0f..068068bc 100644 --- a/src/data_gradients/config/detection.yaml +++ b/src/data_gradients/config/detection.yaml @@ -15,11 +15,17 @@ report_sections: n_rows: 6 n_cols: 2 heatmap_shape: [200, 200] - - DetectionBoundingBoxArea + - DetectionBoundingBoxArea: + topk: 4 + mode: gap - DetectionBoundingBoxPerImageCount - DetectionBoundingBoxSize - - DetectionClassFrequency - - DetectionClassesPerImageCount + - DetectionClassFrequency: + topk: 4 + mode: gap + - DetectionClassesPerImageCount: + topk: 4 + mode: gap - DetectionBoundingBoxIoU: num_bins: 10 class_agnostic: true diff --git a/src/data_gradients/config/segmentation.yaml b/src/data_gradients/config/segmentation.yaml index aa3b50da..76314607 100644 --- a/src/data_gradients/config/segmentation.yaml +++ b/src/data_gradients/config/segmentation.yaml @@ -16,10 +16,16 @@ report_sections: n_rows: 6 n_cols: 2 heatmap_shape: [200, 200] - - SegmentationClassFrequency - - SegmentationClassesPerImageCount + - SegmentationClassFrequency: + topk: 30 + mode: gap + - SegmentationClassesPerImageCount: + topk: 30 + mode: gap - SegmentationComponentsPerImageCount - SegmentationBoundingBoxResolution - - SegmentationBoundingBoxArea + - SegmentationBoundingBoxArea: + topk: 30 + mode: gap - SegmentationComponentsConvexity - SegmentationComponentsErosion diff --git a/src/data_gradients/feature_extractors/object_detection/bounding_boxes_area.py b/src/data_gradients/feature_extractors/object_detection/bounding_boxes_area.py index 301ac728..a9b86124 100644 --- a/src/data_gradients/feature_extractors/object_detection/bounding_boxes_area.py +++ b/src/data_gradients/feature_extractors/object_detection/bounding_boxes_area.py @@ -5,14 +5,15 @@ from data_gradients.utils.data_classes import DetectionSample from data_gradients.visualize.seaborn_renderer import ViolinPlotOptions from data_gradients.feature_extractors.abstract_feature_extractor import AbstractFeatureExtractor -from data_gradients.feature_extractors.utils import get_top_values +from data_gradients.feature_extractors.utils import MostImportantValuesSelector @register_feature_extractor() class DetectionBoundingBoxArea(AbstractFeatureExtractor): """Feature Extractor to compute the area covered Bounding Boxes.""" - def __init__(self): + def __init__(self, topk: int = 40, mode: str = "gap"): + self.value_extractor = MostImportantValuesSelector(topk=topk, mode=mode) self.data = [] def update(self, sample: DetectionSample): @@ -32,7 +33,7 @@ def update(self, sample: DetectionSample): def aggregate(self) -> Feature: df = pd.DataFrame(self.data) - df = get_top_values(df=df, id_col="class_id", split_col="split", val_col="relative_bbox_area", mode="outliers") + df = self.value_extractor.select(df=df, id_col="class_id", split_col="split", value_col="relative_bbox_area") # Height of the plot is proportional to the number of classes n_unique = len(df["class_name"].unique()) diff --git a/src/data_gradients/feature_extractors/object_detection/classes_frequency.py b/src/data_gradients/feature_extractors/object_detection/classes_frequency.py index 5905e3aa..b156d68f 100644 --- a/src/data_gradients/feature_extractors/object_detection/classes_frequency.py +++ b/src/data_gradients/feature_extractors/object_detection/classes_frequency.py @@ -5,14 +5,15 @@ from data_gradients.utils.data_classes import DetectionSample from data_gradients.visualize.seaborn_renderer import BarPlotOptions from data_gradients.feature_extractors.abstract_feature_extractor import AbstractFeatureExtractor -from data_gradients.feature_extractors.utils import get_top_values +from data_gradients.feature_extractors.utils import MostImportantValuesSelector @register_feature_extractor() class DetectionClassFrequency(AbstractFeatureExtractor): """Feature Extractor to count the number of instance of each class.""" - def __init__(self): + def __init__(self, topk: int = 40, mode: str = "gap"): + self.value_extractor = MostImportantValuesSelector(topk=topk, mode=mode) self.data = [] def update(self, sample: DetectionSample): @@ -35,7 +36,7 @@ def aggregate(self) -> Feature: split_sums = df_class_count.groupby("split")["n_appearance"].sum() df_class_count["frequency"] = 100 * (df_class_count["n_appearance"] / df_class_count["split"].map(split_sums)) - df_class_count = get_top_values(df=df_class_count, id_col="class_id", split_col="split", val_col="frequency", mode="outliers") + df_class_count = self.value_extractor.select(df=df_class_count, id_col="class_id", split_col="split", value_col="frequency") # Height of the plot is proportional to the number of classes n_unique = len(df_class_count["class_name"].unique()) diff --git a/src/data_gradients/feature_extractors/object_detection/classes_frequency_per_image.py b/src/data_gradients/feature_extractors/object_detection/classes_frequency_per_image.py index eb39701f..7cb8b427 100644 --- a/src/data_gradients/feature_extractors/object_detection/classes_frequency_per_image.py +++ b/src/data_gradients/feature_extractors/object_detection/classes_frequency_per_image.py @@ -5,7 +5,7 @@ from data_gradients.utils.data_classes import DetectionSample from data_gradients.visualize.plot_options import ViolinPlotOptions from data_gradients.feature_extractors.abstract_feature_extractor import AbstractFeatureExtractor -from data_gradients.feature_extractors.utils import get_top_values +from data_gradients.feature_extractors.utils import MostImportantValuesSelector @register_feature_extractor() @@ -13,7 +13,8 @@ class DetectionClassesPerImageCount(AbstractFeatureExtractor): """Feature Extractor to show the distribution of number of instance of each class per image. This gives information like "The class 'Human' usually appears 2 to 20 times per image.""" - def __init__(self): + def __init__(self, topk: int = 40, mode: str = "gap"): + self.value_extractor = MostImportantValuesSelector(topk=topk, mode=mode) self.data = [] def update(self, sample: DetectionSample): @@ -36,7 +37,7 @@ def aggregate(self) -> Feature: # TODO: check this df_class_count = df.groupby(["class_name", "class_id", "sample_id", "split"]).size().reset_index(name="n_appearance") - df_class_count = get_top_values(df=df_class_count, id_col="class_id", split_col="split", val_col="n_appearance", mode="outliers") + df_class_count = self.value_extractor.select(df=df_class_count, id_col="class_id", split_col="split", value_col="n_appearance") # Height of the plot is proportional to the number of classes n_unique = len(df_class_count["class_name"].unique()) diff --git a/src/data_gradients/feature_extractors/segmentation/bounding_boxes_area.py b/src/data_gradients/feature_extractors/segmentation/bounding_boxes_area.py index 5e5f455b..5d8852d2 100644 --- a/src/data_gradients/feature_extractors/segmentation/bounding_boxes_area.py +++ b/src/data_gradients/feature_extractors/segmentation/bounding_boxes_area.py @@ -5,6 +5,7 @@ from data_gradients.utils.data_classes import SegmentationSample from data_gradients.visualize.seaborn_renderer import ViolinPlotOptions from data_gradients.feature_extractors.abstract_feature_extractor import AbstractFeatureExtractor +from data_gradients.feature_extractors.utils import MostImportantValuesSelector @register_feature_extractor() @@ -14,7 +15,8 @@ class SegmentationBoundingBoxArea(AbstractFeatureExtractor): Get all Bounding Boxes areas and plot them as a percentage of the whole image. """ - def __init__(self): + def __init__(self, topk: int = 40, mode: str = "gap"): + self.value_extractor = MostImportantValuesSelector(topk=topk, mode=mode) self.data = [] def update(self, sample: SegmentationSample): @@ -28,21 +30,23 @@ def update(self, sample: SegmentationSample): "split": sample.split, "class_name": class_name, "class_id": class_id, - "bbox_area": 100 * (contour.bbox_area / image_area), + "relative_bbox_area": 100 * (contour.bbox_area / image_area), } ) def aggregate(self) -> Feature: df = pd.DataFrame(self.data) + df = self.value_extractor.select(df=df, id_col="class_id", split_col="split", value_col="relative_bbox_area") + # Height of the plot is proportional to the number of classes n_unique = len(df["class_name"].unique()) figsize_x = 10 figsize_y = min(max(6, int(n_unique * 0.3)), 175) - max_area = min(100, df["bbox_area"].max()) + max_area = min(100, df["relative_bbox_area"].max()) plot_options = ViolinPlotOptions( - x_label_key="bbox_area", + x_label_key="relative_bbox_area", x_label_name="Object Area (in % of image)", y_label_key="class_name", y_label_name="Class", @@ -54,7 +58,9 @@ def aggregate(self) -> Feature: labels_key="split", bandwidth=0.4, ) - json = dict(train=dict(df[df["split"] == "train"]["bbox_area"].describe()), val=dict(df[df["split"] == "val"]["bbox_area"].describe())) + json = dict( + train=dict(df[df["split"] == "train"]["relative_bbox_area"].describe()), val=dict(df[df["split"] == "val"]["relative_bbox_area"].describe()) + ) feature = Feature( data=df, diff --git a/src/data_gradients/feature_extractors/segmentation/classes_frequency.py b/src/data_gradients/feature_extractors/segmentation/classes_frequency.py index 171158fc..759f7c48 100644 --- a/src/data_gradients/feature_extractors/segmentation/classes_frequency.py +++ b/src/data_gradients/feature_extractors/segmentation/classes_frequency.py @@ -5,11 +5,13 @@ from data_gradients.utils.data_classes import SegmentationSample from data_gradients.visualize.seaborn_renderer import BarPlotOptions from data_gradients.feature_extractors.abstract_feature_extractor import AbstractFeatureExtractor +from data_gradients.feature_extractors.utils import MostImportantValuesSelector @register_feature_extractor() class SegmentationClassFrequency(AbstractFeatureExtractor): - def __init__(self): + def __init__(self, topk: int = 40, mode: str = "gap"): + self.value_extractor = MostImportantValuesSelector(topk=topk, mode=mode) self.data = [] def update(self, sample: SegmentationSample): @@ -34,6 +36,8 @@ def aggregate(self) -> Feature: split_sums = df_class_count.groupby("split")["n_appearance"].sum() df_class_count["frequency"] = 100 * (df_class_count["n_appearance"] / df_class_count["split"].map(split_sums)) + df_class_count = self.value_extractor.select(df=df_class_count, id_col="class_id", split_col="split", value_col="frequency") + # Height of the plot is proportional to the number of classes n_unique = len(df_class_count["class_name"].unique()) figsize_x = 10 diff --git a/src/data_gradients/feature_extractors/segmentation/classes_frequency_per_image.py b/src/data_gradients/feature_extractors/segmentation/classes_frequency_per_image.py index 1b514a63..e81cd918 100644 --- a/src/data_gradients/feature_extractors/segmentation/classes_frequency_per_image.py +++ b/src/data_gradients/feature_extractors/segmentation/classes_frequency_per_image.py @@ -5,11 +5,13 @@ from data_gradients.utils.data_classes import SegmentationSample from data_gradients.visualize.plot_options import ViolinPlotOptions from data_gradients.feature_extractors.abstract_feature_extractor import AbstractFeatureExtractor +from data_gradients.feature_extractors.utils import MostImportantValuesSelector @register_feature_extractor() class SegmentationClassesPerImageCount(AbstractFeatureExtractor): - def __init__(self): + def __init__(self, topk: int = 40, mode: str = "gap"): + self.value_extractor = MostImportantValuesSelector(topk=topk, mode=mode) self.data = [] def update(self, sample: SegmentationSample): @@ -34,6 +36,8 @@ def aggregate(self) -> Feature: # For each class, image, split, I want to know how many bbox I have df_class_count = df.groupby(["class_name", "class_id", "sample_id", "split"]).size().reset_index(name="n_appearance") + df_class_count = self.value_extractor.select(df=df_class_count, id_col="class_id", split_col="split", value_col="n_appearance") + max_n_appearance = df_class_count["n_appearance"].max() # Height of the plot is proportional to the number of classes diff --git a/src/data_gradients/feature_extractors/utils.py b/src/data_gradients/feature_extractors/utils.py index 379b3d8e..f9429336 100644 --- a/src/data_gradients/feature_extractors/utils.py +++ b/src/data_gradients/feature_extractors/utils.py @@ -1,47 +1,60 @@ import numpy as np - - -def get_top_values(df, id_col, split_col, val_col, mode): - """ - Returns the top 5 rows of the DataFrame based on the mode. - The DataFrame is expected to have three columns: id_col, split_col, val_col. - - Modes: - 'gap' - Returns rows with the biggest gap between 'train' and 'val' split values. - 'outliers' - Returns rows with the most extreme average split values. - 'max' - Returns rows with the highest average split values. - 'min' - Returns rows with the lowest average split values. - """ - # Verify inputs - for col in [id_col, split_col, val_col]: - if col not in df.columns: - raise ValueError(f"{col} is not a column in the DataFrame") - print(id_col, split_col, val_col) - - # the mean of val_col for each id_col/split_col - df_mean = df.groupby([id_col, split_col])[val_col].mean().reset_index() - - # Pivot DataFrame to have 'train' and 'val' as columns - df_pivot = df_mean.pivot(index=id_col, columns=split_col, values=val_col) - - # Calculate the relative difference or average based on the mode - if mode == "gap": - df_pivot["metric"] = np.abs((df_pivot["train"] - df_pivot["val"]) / ((df_pivot["train"] + df_pivot["val"]) / 2)) - elif mode in ["outliers", "max", "min"]: - df_pivot["metric"] = (df_pivot["train"] + df_pivot["val"]) / 2 - - # Calculate the z-score if mode is 'outliers' - if mode == "outliers": - mean, std = df_pivot["metric"].mean(), df_pivot["metric"].std() - df_pivot["metric"] = (df_pivot["metric"] - mean).abs() / std - - # Get the top 5 class_ids based on the metric - if mode in ["gap", "outliers", "max"]: - top_ids = df_pivot.nlargest(5, "metric").index - elif mode == "min": - top_ids = df_pivot.nsmallest(5, "metric").index - else: - raise ValueError("Invalid mode. Expected one of: gap, outliers, max, min") - - # Filter the original DataFrame to only include rows with the top class_ids - return df[df[id_col].isin(top_ids)] +import pandas as pd + + +class MostImportantValuesSelector: + def __init__(self, topk: int, mode: str): + """ + :param topk: How many rows (per split) to return. + :param mode: The mode to get the top values for. One of: + - 'gap': Returns rows with the biggest gap between 'train' and 'val' split values. + - 'outliers': Returns rows with the most extreme average values. + - 'max': Returns rows with the highest average values. + - 'min': Returns rows with the lowest average values. + """ + valid_modes = ("gap", "outliers", "max", "min") + if mode not in valid_modes: + raise ValueError(f"Invalid mode '{mode}'. Must be one of: {valid_modes}.") + self.topk = topk + self.mode = mode + + def select(self, df: pd.DataFrame, id_col: str, split_col: str, value_col: str): + """ + Returns the top 5 rows of the DataFrame based on the mode. + The DataFrame is expected to have three columns: id_col, split_col, val_col. + + :param df: The DataFrame to get the top values from. + :param id_col: The name of the id column. + :param split_col: The name of the split column. + :param value_col: The name of the value column. + """ + # Verify inputs + for col in [id_col, split_col, value_col]: + if col not in df.columns: + raise ValueError(f"{col} is not a column in the DataFrame") + + # the mean of val_col for each id_col/split_col + df_mean = df.groupby([id_col, split_col])[value_col].mean().reset_index() + + # Pivot DataFrame to have 'train' and 'val' as columns + df_pivot = df_mean.pivot(index=id_col, columns=split_col, values=value_col) + + # Calculate the relative difference or average based on the mode + if self.mode == "gap": + df_pivot["metric"] = np.abs((df_pivot["train"] - df_pivot["val"]) / ((df_pivot["train"] + df_pivot["val"]) / 2)) + elif self.mode in ["outliers", "max", "min"]: + df_pivot["metric"] = (df_pivot["train"] + df_pivot["val"]) / 2 + + if self.mode == "outliers": + mean, std = df_pivot["metric"].mean(), df_pivot["metric"].std() + df_pivot["metric"] = (df_pivot["metric"] - mean).abs() / std + + # Only return the top k. + if self.mode in ["gap", "outliers", "max"]: + top_ids = df_pivot.nlargest(self.topk, "metric").index + return df[df[id_col].isin(top_ids)] + elif self.mode == "min": + top_ids = df_pivot.nsmallest(self.topk, "metric").index + return df[df[id_col].isin(top_ids)] + else: + raise NotImplementedError(f"Mode {self.mode} is not implemented") From 47713074bb6734152ec0f5a9b3bb7b266f1a2c58 Mon Sep 17 00:00:00 2001 From: Louis Dupont Date: Wed, 5 Jul 2023 18:29:07 +0300 Subject: [PATCH 06/20] rename --- src/data_gradients/config/detection.yaml | 6 ++-- src/data_gradients/config/segmentation.yaml | 6 ++-- .../object_detection/bounding_boxes_area.py | 4 +-- .../object_detection/classes_frequency.py | 4 +-- .../classes_frequency_per_image.py | 4 +-- .../segmentation/bounding_boxes_area.py | 4 +-- .../segmentation/classes_frequency.py | 4 +-- .../classes_frequency_per_image.py | 4 +-- .../feature_extractors/utils.py | 30 +++++++++---------- 9 files changed, 33 insertions(+), 33 deletions(-) diff --git a/src/data_gradients/config/detection.yaml b/src/data_gradients/config/detection.yaml index 068068bc..eb4aaacd 100644 --- a/src/data_gradients/config/detection.yaml +++ b/src/data_gradients/config/detection.yaml @@ -17,15 +17,15 @@ report_sections: heatmap_shape: [200, 200] - DetectionBoundingBoxArea: topk: 4 - mode: gap + prioritization_mode: train_val_diff - DetectionBoundingBoxPerImageCount - DetectionBoundingBoxSize - DetectionClassFrequency: topk: 4 - mode: gap + prioritization_mode: train_val_diff - DetectionClassesPerImageCount: topk: 4 - mode: gap + prioritization_mode: train_val_diff - DetectionBoundingBoxIoU: num_bins: 10 class_agnostic: true diff --git a/src/data_gradients/config/segmentation.yaml b/src/data_gradients/config/segmentation.yaml index 76314607..a5959544 100644 --- a/src/data_gradients/config/segmentation.yaml +++ b/src/data_gradients/config/segmentation.yaml @@ -18,14 +18,14 @@ report_sections: heatmap_shape: [200, 200] - SegmentationClassFrequency: topk: 30 - mode: gap + prioritization_mode: train_val_diff - SegmentationClassesPerImageCount: topk: 30 - mode: gap + prioritization_mode: train_val_diff - SegmentationComponentsPerImageCount - SegmentationBoundingBoxResolution - SegmentationBoundingBoxArea: topk: 30 - mode: gap + prioritization_mode: train_val_diff - SegmentationComponentsConvexity - SegmentationComponentsErosion diff --git a/src/data_gradients/feature_extractors/object_detection/bounding_boxes_area.py b/src/data_gradients/feature_extractors/object_detection/bounding_boxes_area.py index a9b86124..d885ea42 100644 --- a/src/data_gradients/feature_extractors/object_detection/bounding_boxes_area.py +++ b/src/data_gradients/feature_extractors/object_detection/bounding_boxes_area.py @@ -12,8 +12,8 @@ class DetectionBoundingBoxArea(AbstractFeatureExtractor): """Feature Extractor to compute the area covered Bounding Boxes.""" - def __init__(self, topk: int = 40, mode: str = "gap"): - self.value_extractor = MostImportantValuesSelector(topk=topk, mode=mode) + def __init__(self, topk: int = 40, prioritization_mode: str = "train_val_diff"): + self.value_extractor = MostImportantValuesSelector(topk=topk, prioritization_mode=prioritization_mode) self.data = [] def update(self, sample: DetectionSample): diff --git a/src/data_gradients/feature_extractors/object_detection/classes_frequency.py b/src/data_gradients/feature_extractors/object_detection/classes_frequency.py index b156d68f..306a3a31 100644 --- a/src/data_gradients/feature_extractors/object_detection/classes_frequency.py +++ b/src/data_gradients/feature_extractors/object_detection/classes_frequency.py @@ -12,8 +12,8 @@ class DetectionClassFrequency(AbstractFeatureExtractor): """Feature Extractor to count the number of instance of each class.""" - def __init__(self, topk: int = 40, mode: str = "gap"): - self.value_extractor = MostImportantValuesSelector(topk=topk, mode=mode) + def __init__(self, topk: int = 40, prioritization_mode: str = "train_val_diff"): + self.value_extractor = MostImportantValuesSelector(topk=topk, prioritization_mode=prioritization_mode) self.data = [] def update(self, sample: DetectionSample): diff --git a/src/data_gradients/feature_extractors/object_detection/classes_frequency_per_image.py b/src/data_gradients/feature_extractors/object_detection/classes_frequency_per_image.py index 7cb8b427..dc3eb122 100644 --- a/src/data_gradients/feature_extractors/object_detection/classes_frequency_per_image.py +++ b/src/data_gradients/feature_extractors/object_detection/classes_frequency_per_image.py @@ -13,8 +13,8 @@ class DetectionClassesPerImageCount(AbstractFeatureExtractor): """Feature Extractor to show the distribution of number of instance of each class per image. This gives information like "The class 'Human' usually appears 2 to 20 times per image.""" - def __init__(self, topk: int = 40, mode: str = "gap"): - self.value_extractor = MostImportantValuesSelector(topk=topk, mode=mode) + def __init__(self, topk: int = 40, prioritization_mode: str = "train_val_diff"): + self.value_extractor = MostImportantValuesSelector(topk=topk, prioritization_mode=prioritization_mode) self.data = [] def update(self, sample: DetectionSample): diff --git a/src/data_gradients/feature_extractors/segmentation/bounding_boxes_area.py b/src/data_gradients/feature_extractors/segmentation/bounding_boxes_area.py index 5d8852d2..f7c26a2d 100644 --- a/src/data_gradients/feature_extractors/segmentation/bounding_boxes_area.py +++ b/src/data_gradients/feature_extractors/segmentation/bounding_boxes_area.py @@ -15,8 +15,8 @@ class SegmentationBoundingBoxArea(AbstractFeatureExtractor): Get all Bounding Boxes areas and plot them as a percentage of the whole image. """ - def __init__(self, topk: int = 40, mode: str = "gap"): - self.value_extractor = MostImportantValuesSelector(topk=topk, mode=mode) + def __init__(self, topk: int = 40, prioritization_mode: str = "train_val_diff"): + self.value_extractor = MostImportantValuesSelector(topk=topk, prioritization_mode=prioritization_mode) self.data = [] def update(self, sample: SegmentationSample): diff --git a/src/data_gradients/feature_extractors/segmentation/classes_frequency.py b/src/data_gradients/feature_extractors/segmentation/classes_frequency.py index 759f7c48..d7114961 100644 --- a/src/data_gradients/feature_extractors/segmentation/classes_frequency.py +++ b/src/data_gradients/feature_extractors/segmentation/classes_frequency.py @@ -10,8 +10,8 @@ @register_feature_extractor() class SegmentationClassFrequency(AbstractFeatureExtractor): - def __init__(self, topk: int = 40, mode: str = "gap"): - self.value_extractor = MostImportantValuesSelector(topk=topk, mode=mode) + def __init__(self, topk: int = 40, prioritization_mode: str = "train_val_diff"): + self.value_extractor = MostImportantValuesSelector(topk=topk, prioritization_mode=prioritization_mode) self.data = [] def update(self, sample: SegmentationSample): diff --git a/src/data_gradients/feature_extractors/segmentation/classes_frequency_per_image.py b/src/data_gradients/feature_extractors/segmentation/classes_frequency_per_image.py index e81cd918..7850e598 100644 --- a/src/data_gradients/feature_extractors/segmentation/classes_frequency_per_image.py +++ b/src/data_gradients/feature_extractors/segmentation/classes_frequency_per_image.py @@ -10,8 +10,8 @@ @register_feature_extractor() class SegmentationClassesPerImageCount(AbstractFeatureExtractor): - def __init__(self, topk: int = 40, mode: str = "gap"): - self.value_extractor = MostImportantValuesSelector(topk=topk, mode=mode) + def __init__(self, topk: int = 40, prioritization_mode: str = "train_val_diff"): + self.value_extractor = MostImportantValuesSelector(topk=topk, prioritization_mode=prioritization_mode) self.data = [] def update(self, sample: SegmentationSample): diff --git a/src/data_gradients/feature_extractors/utils.py b/src/data_gradients/feature_extractors/utils.py index f9429336..47ccaad5 100644 --- a/src/data_gradients/feature_extractors/utils.py +++ b/src/data_gradients/feature_extractors/utils.py @@ -3,24 +3,24 @@ class MostImportantValuesSelector: - def __init__(self, topk: int, mode: str): + def __init__(self, topk: int, prioritization_mode: str): """ :param topk: How many rows (per split) to return. - :param mode: The mode to get the top values for. One of: - - 'gap': Returns rows with the biggest gap between 'train' and 'val' split values. + :param prioritization_mode: The prioritization_mode to get the top values for. One of: + - 'train_val_diff': Returns rows with the biggest train_val_diff between 'train' and 'val' split values. - 'outliers': Returns rows with the most extreme average values. - 'max': Returns rows with the highest average values. - 'min': Returns rows with the lowest average values. """ - valid_modes = ("gap", "outliers", "max", "min") - if mode not in valid_modes: - raise ValueError(f"Invalid mode '{mode}'. Must be one of: {valid_modes}.") + valid_modes = ("train_val_diff", "outliers", "max", "min") + if prioritization_mode not in valid_modes: + raise ValueError(f"Invalid `prioritization_mode={prioritization_mode}'. Must be one of: {valid_modes}.") self.topk = topk - self.mode = mode + self.prioritization_mode = prioritization_mode def select(self, df: pd.DataFrame, id_col: str, split_col: str, value_col: str): """ - Returns the top 5 rows of the DataFrame based on the mode. + Returns the top 5 rows of the DataFrame based on the prioritization_mode. The DataFrame is expected to have three columns: id_col, split_col, val_col. :param df: The DataFrame to get the top values from. @@ -39,22 +39,22 @@ def select(self, df: pd.DataFrame, id_col: str, split_col: str, value_col: str): # Pivot DataFrame to have 'train' and 'val' as columns df_pivot = df_mean.pivot(index=id_col, columns=split_col, values=value_col) - # Calculate the relative difference or average based on the mode - if self.mode == "gap": + # Calculate the relative difference or average based on the prioritization_mode + if self.prioritization_mode == "train_val_diff": df_pivot["metric"] = np.abs((df_pivot["train"] - df_pivot["val"]) / ((df_pivot["train"] + df_pivot["val"]) / 2)) - elif self.mode in ["outliers", "max", "min"]: + elif self.prioritization_mode in ["outliers", "max", "min"]: df_pivot["metric"] = (df_pivot["train"] + df_pivot["val"]) / 2 - if self.mode == "outliers": + if self.prioritization_mode == "outliers": mean, std = df_pivot["metric"].mean(), df_pivot["metric"].std() df_pivot["metric"] = (df_pivot["metric"] - mean).abs() / std # Only return the top k. - if self.mode in ["gap", "outliers", "max"]: + if self.prioritization_mode in ["train_val_diff", "outliers", "max"]: top_ids = df_pivot.nlargest(self.topk, "metric").index return df[df[id_col].isin(top_ids)] - elif self.mode == "min": + elif self.prioritization_mode == "min": top_ids = df_pivot.nsmallest(self.topk, "metric").index return df[df[id_col].isin(top_ids)] else: - raise NotImplementedError(f"Mode {self.mode} is not implemented") + raise NotImplementedError(f"Mode {self.prioritization_mode} is not implemented") From 92adca234b3ed9f075e3c116b0e50e6bdb945419 Mon Sep 17 00:00:00 2001 From: Louis Dupont Date: Wed, 5 Jul 2023 18:30:25 +0300 Subject: [PATCH 07/20] remove unwanted change --- .../object_detection/classes_frequency.py | 39 ------------------- 1 file changed, 39 deletions(-) diff --git a/src/data_gradients/feature_extractors/object_detection/classes_frequency.py b/src/data_gradients/feature_extractors/object_detection/classes_frequency.py index 306a3a31..16a03883 100644 --- a/src/data_gradients/feature_extractors/object_detection/classes_frequency.py +++ b/src/data_gradients/feature_extractors/object_detection/classes_frequency.py @@ -1,5 +1,4 @@ import pandas as pd -from abc import ABC, abstractmethod from data_gradients.common.registry.registry import register_feature_extractor from data_gradients.feature_extractors.abstract_feature_extractor import Feature from data_gradients.utils.data_classes import DetectionSample @@ -79,41 +78,3 @@ def description(self) -> str: "For instance, if one of the class only appears in the validation set, you know in advance that your model won't be able to " "learn to predict that class." ) - - -class DataframeExtractor(ABC): - def __init__(self, topk: int): - self.topk = topk - - @abstractmethod - def extract(self, df: pd.DataFrame, column: str) -> pd.DataFrame: - pass - - -class OutliersExtractor(DataframeExtractor): - def extract(self, df: pd.DataFrame, column: str) -> pd.DataFrame: - values = df[column] - values_normalized = (values - values.mean()) / values.var() - outliers_index = values_normalized.abs().sort_values(ascending=False).index[: self.topk] - return df[outliers_index] - - -class HighestValuesExtractor(DataframeExtractor): - def extract(self, df: pd.DataFrame, column: str) -> pd.DataFrame: - return df.sort_values(by=column, ascending=False)[: self.topk] - - -class LowestValuesExtractor(DataframeExtractor): - def extract(self, df: pd.DataFrame, column: str) -> pd.DataFrame: - return df.sort_values(by=column, ascending=True)[: self.topk] - - -def get_dataframe_extractor_per_frequency(extractor_name: str, topk: int) -> DataframeExtractor: - available_extractors = { - "outliers": OutliersExtractor(topk=topk), - "most_frequent": HighestValuesExtractor(topk=topk), - "least_frequent": LowestValuesExtractor(topk=topk), - } - if extractor_name not in available_extractors.keys(): - raise ValueError - return available_extractors[extractor_name] From d1e2b2c3aff16a5c160ec46d12bf8f450c688efc Mon Sep 17 00:00:00 2001 From: Louis Dupont Date: Wed, 5 Jul 2023 18:32:18 +0300 Subject: [PATCH 08/20] add doc --- .../object_detection/bounding_boxes_area.py | 8 ++++++++ .../object_detection/classes_frequency.py | 8 ++++++++ .../object_detection/classes_frequency_per_image.py | 8 ++++++++ .../segmentation/bounding_boxes_area.py | 8 ++++++++ .../feature_extractors/segmentation/classes_frequency.py | 8 ++++++++ .../segmentation/classes_frequency_per_image.py | 8 ++++++++ 6 files changed, 48 insertions(+) diff --git a/src/data_gradients/feature_extractors/object_detection/bounding_boxes_area.py b/src/data_gradients/feature_extractors/object_detection/bounding_boxes_area.py index d885ea42..90b388e8 100644 --- a/src/data_gradients/feature_extractors/object_detection/bounding_boxes_area.py +++ b/src/data_gradients/feature_extractors/object_detection/bounding_boxes_area.py @@ -13,6 +13,14 @@ class DetectionBoundingBoxArea(AbstractFeatureExtractor): """Feature Extractor to compute the area covered Bounding Boxes.""" def __init__(self, topk: int = 40, prioritization_mode: str = "train_val_diff"): + """ + :param topk: How many rows (per split) to return. + :param prioritization_mode: The prioritization_mode to get the top values for. One of: + - 'train_val_diff': Returns rows with the biggest train_val_diff between 'train' and 'val' split values. + - 'outliers': Returns rows with the most extreme average values. + - 'max': Returns rows with the highest average values. + - 'min': Returns rows with the lowest average values. + """ self.value_extractor = MostImportantValuesSelector(topk=topk, prioritization_mode=prioritization_mode) self.data = [] diff --git a/src/data_gradients/feature_extractors/object_detection/classes_frequency.py b/src/data_gradients/feature_extractors/object_detection/classes_frequency.py index 16a03883..5f9f5f9a 100644 --- a/src/data_gradients/feature_extractors/object_detection/classes_frequency.py +++ b/src/data_gradients/feature_extractors/object_detection/classes_frequency.py @@ -12,6 +12,14 @@ class DetectionClassFrequency(AbstractFeatureExtractor): """Feature Extractor to count the number of instance of each class.""" def __init__(self, topk: int = 40, prioritization_mode: str = "train_val_diff"): + """ + :param topk: How many rows (per split) to return. + :param prioritization_mode: The prioritization_mode to get the top values for. One of: + - 'train_val_diff': Returns rows with the biggest train_val_diff between 'train' and 'val' split values. + - 'outliers': Returns rows with the most extreme average values. + - 'max': Returns rows with the highest average values. + - 'min': Returns rows with the lowest average values. + """ self.value_extractor = MostImportantValuesSelector(topk=topk, prioritization_mode=prioritization_mode) self.data = [] diff --git a/src/data_gradients/feature_extractors/object_detection/classes_frequency_per_image.py b/src/data_gradients/feature_extractors/object_detection/classes_frequency_per_image.py index dc3eb122..d09fdcf4 100644 --- a/src/data_gradients/feature_extractors/object_detection/classes_frequency_per_image.py +++ b/src/data_gradients/feature_extractors/object_detection/classes_frequency_per_image.py @@ -14,6 +14,14 @@ class DetectionClassesPerImageCount(AbstractFeatureExtractor): This gives information like "The class 'Human' usually appears 2 to 20 times per image.""" def __init__(self, topk: int = 40, prioritization_mode: str = "train_val_diff"): + """ + :param topk: How many rows (per split) to return. + :param prioritization_mode: The prioritization_mode to get the top values for. One of: + - 'train_val_diff': Returns rows with the biggest train_val_diff between 'train' and 'val' split values. + - 'outliers': Returns rows with the most extreme average values. + - 'max': Returns rows with the highest average values. + - 'min': Returns rows with the lowest average values. + """ self.value_extractor = MostImportantValuesSelector(topk=topk, prioritization_mode=prioritization_mode) self.data = [] diff --git a/src/data_gradients/feature_extractors/segmentation/bounding_boxes_area.py b/src/data_gradients/feature_extractors/segmentation/bounding_boxes_area.py index f7c26a2d..574bc8db 100644 --- a/src/data_gradients/feature_extractors/segmentation/bounding_boxes_area.py +++ b/src/data_gradients/feature_extractors/segmentation/bounding_boxes_area.py @@ -16,6 +16,14 @@ class SegmentationBoundingBoxArea(AbstractFeatureExtractor): """ def __init__(self, topk: int = 40, prioritization_mode: str = "train_val_diff"): + """ + :param topk: How many rows (per split) to return. + :param prioritization_mode: The prioritization_mode to get the top values for. One of: + - 'train_val_diff': Returns rows with the biggest train_val_diff between 'train' and 'val' split values. + - 'outliers': Returns rows with the most extreme average values. + - 'max': Returns rows with the highest average values. + - 'min': Returns rows with the lowest average values. + """ self.value_extractor = MostImportantValuesSelector(topk=topk, prioritization_mode=prioritization_mode) self.data = [] diff --git a/src/data_gradients/feature_extractors/segmentation/classes_frequency.py b/src/data_gradients/feature_extractors/segmentation/classes_frequency.py index d7114961..e68eaca9 100644 --- a/src/data_gradients/feature_extractors/segmentation/classes_frequency.py +++ b/src/data_gradients/feature_extractors/segmentation/classes_frequency.py @@ -11,6 +11,14 @@ @register_feature_extractor() class SegmentationClassFrequency(AbstractFeatureExtractor): def __init__(self, topk: int = 40, prioritization_mode: str = "train_val_diff"): + """ + :param topk: How many rows (per split) to return. + :param prioritization_mode: The prioritization_mode to get the top values for. One of: + - 'train_val_diff': Returns rows with the biggest train_val_diff between 'train' and 'val' split values. + - 'outliers': Returns rows with the most extreme average values. + - 'max': Returns rows with the highest average values. + - 'min': Returns rows with the lowest average values. + """ self.value_extractor = MostImportantValuesSelector(topk=topk, prioritization_mode=prioritization_mode) self.data = [] diff --git a/src/data_gradients/feature_extractors/segmentation/classes_frequency_per_image.py b/src/data_gradients/feature_extractors/segmentation/classes_frequency_per_image.py index 7850e598..4780b6ef 100644 --- a/src/data_gradients/feature_extractors/segmentation/classes_frequency_per_image.py +++ b/src/data_gradients/feature_extractors/segmentation/classes_frequency_per_image.py @@ -11,6 +11,14 @@ @register_feature_extractor() class SegmentationClassesPerImageCount(AbstractFeatureExtractor): def __init__(self, topk: int = 40, prioritization_mode: str = "train_val_diff"): + """ + :param topk: How many rows (per split) to return. + :param prioritization_mode: The prioritization_mode to get the top values for. One of: + - 'train_val_diff': Returns rows with the biggest train_val_diff between 'train' and 'val' split values. + - 'outliers': Returns rows with the most extreme average values. + - 'max': Returns rows with the highest average values. + - 'min': Returns rows with the lowest average values. + """ self.value_extractor = MostImportantValuesSelector(topk=topk, prioritization_mode=prioritization_mode) self.data = [] From 125b0c5bf634eb0784d25deaa854227ccee18964 Mon Sep 17 00:00:00 2001 From: Louis Dupont Date: Wed, 5 Jul 2023 18:35:58 +0300 Subject: [PATCH 09/20] improve doc --- .../object_detection/bounding_boxes_area.py | 12 ++++++------ .../object_detection/classes_frequency.py | 12 ++++++------ .../object_detection/classes_frequency_per_image.py | 12 ++++++------ .../segmentation/bounding_boxes_area.py | 12 ++++++------ .../segmentation/classes_frequency.py | 12 ++++++------ .../segmentation/classes_frequency_per_image.py | 12 ++++++------ 6 files changed, 36 insertions(+), 36 deletions(-) diff --git a/src/data_gradients/feature_extractors/object_detection/bounding_boxes_area.py b/src/data_gradients/feature_extractors/object_detection/bounding_boxes_area.py index 90b388e8..17d172b9 100644 --- a/src/data_gradients/feature_extractors/object_detection/bounding_boxes_area.py +++ b/src/data_gradients/feature_extractors/object_detection/bounding_boxes_area.py @@ -14,12 +14,12 @@ class DetectionBoundingBoxArea(AbstractFeatureExtractor): def __init__(self, topk: int = 40, prioritization_mode: str = "train_val_diff"): """ - :param topk: How many rows (per split) to return. - :param prioritization_mode: The prioritization_mode to get the top values for. One of: - - 'train_val_diff': Returns rows with the biggest train_val_diff between 'train' and 'val' split values. - - 'outliers': Returns rows with the most extreme average values. - - 'max': Returns rows with the highest average values. - - 'min': Returns rows with the lowest average values. + :param topk: How many rows (per split) to show. + :param prioritization_mode: Strategy to use to chose which class will be prioritized. Only the topk will be shown + - 'train_val_diff': Returns rows with the biggest train_val_diff between 'train' and 'val' split values. + - 'outliers': Returns rows with the most extreme average values. + - 'max': Returns rows with the highest average values. + - 'min': Returns rows with the lowest average values. """ self.value_extractor = MostImportantValuesSelector(topk=topk, prioritization_mode=prioritization_mode) self.data = [] diff --git a/src/data_gradients/feature_extractors/object_detection/classes_frequency.py b/src/data_gradients/feature_extractors/object_detection/classes_frequency.py index 5f9f5f9a..d62278aa 100644 --- a/src/data_gradients/feature_extractors/object_detection/classes_frequency.py +++ b/src/data_gradients/feature_extractors/object_detection/classes_frequency.py @@ -13,12 +13,12 @@ class DetectionClassFrequency(AbstractFeatureExtractor): def __init__(self, topk: int = 40, prioritization_mode: str = "train_val_diff"): """ - :param topk: How many rows (per split) to return. - :param prioritization_mode: The prioritization_mode to get the top values for. One of: - - 'train_val_diff': Returns rows with the biggest train_val_diff between 'train' and 'val' split values. - - 'outliers': Returns rows with the most extreme average values. - - 'max': Returns rows with the highest average values. - - 'min': Returns rows with the lowest average values. + :param topk: How many rows (per split) to show. + :param prioritization_mode: Strategy to use to chose which class will be prioritized. Only the topk will be shown + - 'train_val_diff': Returns rows with the biggest train_val_diff between 'train' and 'val' split values. + - 'outliers': Returns rows with the most extreme average values. + - 'max': Returns rows with the highest average values. + - 'min': Returns rows with the lowest average values. """ self.value_extractor = MostImportantValuesSelector(topk=topk, prioritization_mode=prioritization_mode) self.data = [] diff --git a/src/data_gradients/feature_extractors/object_detection/classes_frequency_per_image.py b/src/data_gradients/feature_extractors/object_detection/classes_frequency_per_image.py index d09fdcf4..d0d48bb2 100644 --- a/src/data_gradients/feature_extractors/object_detection/classes_frequency_per_image.py +++ b/src/data_gradients/feature_extractors/object_detection/classes_frequency_per_image.py @@ -15,12 +15,12 @@ class DetectionClassesPerImageCount(AbstractFeatureExtractor): def __init__(self, topk: int = 40, prioritization_mode: str = "train_val_diff"): """ - :param topk: How many rows (per split) to return. - :param prioritization_mode: The prioritization_mode to get the top values for. One of: - - 'train_val_diff': Returns rows with the biggest train_val_diff between 'train' and 'val' split values. - - 'outliers': Returns rows with the most extreme average values. - - 'max': Returns rows with the highest average values. - - 'min': Returns rows with the lowest average values. + :param topk: How many rows (per split) to show. + :param prioritization_mode: Strategy to use to chose which class will be prioritized. Only the topk will be shown + - 'train_val_diff': Returns rows with the biggest train_val_diff between 'train' and 'val' split values. + - 'outliers': Returns rows with the most extreme average values. + - 'max': Returns rows with the highest average values. + - 'min': Returns rows with the lowest average values. """ self.value_extractor = MostImportantValuesSelector(topk=topk, prioritization_mode=prioritization_mode) self.data = [] diff --git a/src/data_gradients/feature_extractors/segmentation/bounding_boxes_area.py b/src/data_gradients/feature_extractors/segmentation/bounding_boxes_area.py index 574bc8db..d7b606a0 100644 --- a/src/data_gradients/feature_extractors/segmentation/bounding_boxes_area.py +++ b/src/data_gradients/feature_extractors/segmentation/bounding_boxes_area.py @@ -17,12 +17,12 @@ class SegmentationBoundingBoxArea(AbstractFeatureExtractor): def __init__(self, topk: int = 40, prioritization_mode: str = "train_val_diff"): """ - :param topk: How many rows (per split) to return. - :param prioritization_mode: The prioritization_mode to get the top values for. One of: - - 'train_val_diff': Returns rows with the biggest train_val_diff between 'train' and 'val' split values. - - 'outliers': Returns rows with the most extreme average values. - - 'max': Returns rows with the highest average values. - - 'min': Returns rows with the lowest average values. + :param topk: How many rows (per split) to show. + :param prioritization_mode: Strategy to use to chose which class will be prioritized. Only the topk will be shown + - 'train_val_diff': Returns rows with the biggest train_val_diff between 'train' and 'val' split values. + - 'outliers': Returns rows with the most extreme average values. + - 'max': Returns rows with the highest average values. + - 'min': Returns rows with the lowest average values. """ self.value_extractor = MostImportantValuesSelector(topk=topk, prioritization_mode=prioritization_mode) self.data = [] diff --git a/src/data_gradients/feature_extractors/segmentation/classes_frequency.py b/src/data_gradients/feature_extractors/segmentation/classes_frequency.py index e68eaca9..85eb3bbe 100644 --- a/src/data_gradients/feature_extractors/segmentation/classes_frequency.py +++ b/src/data_gradients/feature_extractors/segmentation/classes_frequency.py @@ -12,12 +12,12 @@ class SegmentationClassFrequency(AbstractFeatureExtractor): def __init__(self, topk: int = 40, prioritization_mode: str = "train_val_diff"): """ - :param topk: How many rows (per split) to return. - :param prioritization_mode: The prioritization_mode to get the top values for. One of: - - 'train_val_diff': Returns rows with the biggest train_val_diff between 'train' and 'val' split values. - - 'outliers': Returns rows with the most extreme average values. - - 'max': Returns rows with the highest average values. - - 'min': Returns rows with the lowest average values. + :param topk: How many rows (per split) to show. + :param prioritization_mode: Strategy to use to chose which class will be prioritized. Only the topk will be shown + - 'train_val_diff': Returns rows with the biggest train_val_diff between 'train' and 'val' split values. + - 'outliers': Returns rows with the most extreme average values. + - 'max': Returns rows with the highest average values. + - 'min': Returns rows with the lowest average values. """ self.value_extractor = MostImportantValuesSelector(topk=topk, prioritization_mode=prioritization_mode) self.data = [] diff --git a/src/data_gradients/feature_extractors/segmentation/classes_frequency_per_image.py b/src/data_gradients/feature_extractors/segmentation/classes_frequency_per_image.py index 4780b6ef..45616def 100644 --- a/src/data_gradients/feature_extractors/segmentation/classes_frequency_per_image.py +++ b/src/data_gradients/feature_extractors/segmentation/classes_frequency_per_image.py @@ -12,12 +12,12 @@ class SegmentationClassesPerImageCount(AbstractFeatureExtractor): def __init__(self, topk: int = 40, prioritization_mode: str = "train_val_diff"): """ - :param topk: How many rows (per split) to return. - :param prioritization_mode: The prioritization_mode to get the top values for. One of: - - 'train_val_diff': Returns rows with the biggest train_val_diff between 'train' and 'val' split values. - - 'outliers': Returns rows with the most extreme average values. - - 'max': Returns rows with the highest average values. - - 'min': Returns rows with the lowest average values. + :param topk: How many rows (per split) to show. + :param prioritization_mode: Strategy to use to chose which class will be prioritized. Only the topk will be shown + - 'train_val_diff': Returns rows with the biggest train_val_diff between 'train' and 'val' split values. + - 'outliers': Returns rows with the most extreme average values. + - 'max': Returns rows with the highest average values. + - 'min': Returns rows with the lowest average values. """ self.value_extractor = MostImportantValuesSelector(topk=topk, prioritization_mode=prioritization_mode) self.data = [] From 6e4f7208f5fd0bbf349984d9eec904e6e579f7e7 Mon Sep 17 00:00:00 2001 From: Louis Dupont Date: Wed, 5 Jul 2023 19:19:31 +0300 Subject: [PATCH 10/20] cleanup --- src/data_gradients/config/detection.yaml | 6 +-- .../object_detection/bounding_boxes_area.py | 2 +- .../object_detection/classes_frequency.py | 2 +- .../classes_frequency_per_image.py | 42 +++++++++++++++++-- .../segmentation/bounding_boxes_area.py | 2 +- .../segmentation/classes_frequency.py | 2 +- .../classes_frequency_per_image.py | 2 +- 7 files changed, 47 insertions(+), 11 deletions(-) diff --git a/src/data_gradients/config/detection.yaml b/src/data_gradients/config/detection.yaml index eb4aaacd..0d9d0c64 100644 --- a/src/data_gradients/config/detection.yaml +++ b/src/data_gradients/config/detection.yaml @@ -16,15 +16,15 @@ report_sections: n_cols: 2 heatmap_shape: [200, 200] - DetectionBoundingBoxArea: - topk: 4 + topk: 30 prioritization_mode: train_val_diff - DetectionBoundingBoxPerImageCount - DetectionBoundingBoxSize - DetectionClassFrequency: - topk: 4 + topk: 30 prioritization_mode: train_val_diff - DetectionClassesPerImageCount: - topk: 4 + topk: 30 prioritization_mode: train_val_diff - DetectionBoundingBoxIoU: num_bins: 10 diff --git a/src/data_gradients/feature_extractors/object_detection/bounding_boxes_area.py b/src/data_gradients/feature_extractors/object_detection/bounding_boxes_area.py index 17d172b9..d736739b 100644 --- a/src/data_gradients/feature_extractors/object_detection/bounding_boxes_area.py +++ b/src/data_gradients/feature_extractors/object_detection/bounding_boxes_area.py @@ -12,7 +12,7 @@ class DetectionBoundingBoxArea(AbstractFeatureExtractor): """Feature Extractor to compute the area covered Bounding Boxes.""" - def __init__(self, topk: int = 40, prioritization_mode: str = "train_val_diff"): + def __init__(self, topk: int = 30, prioritization_mode: str = "train_val_diff"): """ :param topk: How many rows (per split) to show. :param prioritization_mode: Strategy to use to chose which class will be prioritized. Only the topk will be shown diff --git a/src/data_gradients/feature_extractors/object_detection/classes_frequency.py b/src/data_gradients/feature_extractors/object_detection/classes_frequency.py index d62278aa..b0872cf3 100644 --- a/src/data_gradients/feature_extractors/object_detection/classes_frequency.py +++ b/src/data_gradients/feature_extractors/object_detection/classes_frequency.py @@ -11,7 +11,7 @@ class DetectionClassFrequency(AbstractFeatureExtractor): """Feature Extractor to count the number of instance of each class.""" - def __init__(self, topk: int = 40, prioritization_mode: str = "train_val_diff"): + def __init__(self, topk: int = 30, prioritization_mode: str = "train_val_diff"): """ :param topk: How many rows (per split) to show. :param prioritization_mode: Strategy to use to chose which class will be prioritized. Only the topk will be shown diff --git a/src/data_gradients/feature_extractors/object_detection/classes_frequency_per_image.py b/src/data_gradients/feature_extractors/object_detection/classes_frequency_per_image.py index d0d48bb2..abc71399 100644 --- a/src/data_gradients/feature_extractors/object_detection/classes_frequency_per_image.py +++ b/src/data_gradients/feature_extractors/object_detection/classes_frequency_per_image.py @@ -13,7 +13,7 @@ class DetectionClassesPerImageCount(AbstractFeatureExtractor): """Feature Extractor to show the distribution of number of instance of each class per image. This gives information like "The class 'Human' usually appears 2 to 20 times per image.""" - def __init__(self, topk: int = 40, prioritization_mode: str = "train_val_diff"): + def __init__(self, topk: int = 30, prioritization_mode: str = "train_val_diff"): """ :param topk: How many rows (per split) to show. :param prioritization_mode: Strategy to use to chose which class will be prioritized. Only the topk will be shown @@ -39,14 +39,50 @@ def update(self, sample: DetectionSample): def aggregate(self) -> Feature: df = pd.DataFrame(self.data) - + print(df.describe()) # Include ("class_name", "class_id", "split", "n_appearance") # For each class, image, split, I want to know how many bbox I have # TODO: check this - df_class_count = df.groupby(["class_name", "class_id", "sample_id", "split"]).size().reset_index(name="n_appearance") + df_class_count = df.groupby(["class_name", "class_id", "sample_id", "split"]).size().reset_index(name="n_appearance") + print(df_class_count.groupby("class_name").sum()) + print(self.value_extractor.select(df=df_class_count.copy(), id_col="class_id", split_col="split", value_col="n_appearance").groupby("class_name").sum()) + print(self.value_extractor.select(df=df_class_count.copy(), id_col="class_id", split_col="split", value_col="n_appearance").groupby("class_name").sum()) df_class_count = self.value_extractor.select(df=df_class_count, id_col="class_id", split_col="split", value_col="n_appearance") + # """ class_id n_appearance + # class_name + # big bus 0 89 + # big truck 126 355 + # bus-l- 46 29 + # bus-s- 39 13 + # car 1540 3310 + # mid truck 185 70 + # small bus 162 30 + # small truck 1337 668 + # truck-l- 976 193 + # truck-m- 1458 321 + # truck-s- 870 126 + # truck-xl- 737 78 + # """ + + # """class_name + # big bus 0 116 + # big truck 156 457 + # bus-l- 38 26 + # bus-s- 51 19 + # car 1552 3303 + # mid truck 215 85 + # small bus 120 22 + # small truck 1295 619 + # truck-l- 960 185 + # truck-m- 1485 336 + # truck-s- 750 115 + # truck-xl- 770 76""" + # """bus-l- 40 25 + # car 1540 3169 + # mid truck 245 90 + # small truck 1281 572""" # Height of the plot is proportional to the number of classes n_unique = len(df_class_count["class_name"].unique()) figsize_x = 10 diff --git a/src/data_gradients/feature_extractors/segmentation/bounding_boxes_area.py b/src/data_gradients/feature_extractors/segmentation/bounding_boxes_area.py index d7b606a0..a2f74dd4 100644 --- a/src/data_gradients/feature_extractors/segmentation/bounding_boxes_area.py +++ b/src/data_gradients/feature_extractors/segmentation/bounding_boxes_area.py @@ -15,7 +15,7 @@ class SegmentationBoundingBoxArea(AbstractFeatureExtractor): Get all Bounding Boxes areas and plot them as a percentage of the whole image. """ - def __init__(self, topk: int = 40, prioritization_mode: str = "train_val_diff"): + def __init__(self, topk: int = 30, prioritization_mode: str = "train_val_diff"): """ :param topk: How many rows (per split) to show. :param prioritization_mode: Strategy to use to chose which class will be prioritized. Only the topk will be shown diff --git a/src/data_gradients/feature_extractors/segmentation/classes_frequency.py b/src/data_gradients/feature_extractors/segmentation/classes_frequency.py index 85eb3bbe..c7b406c0 100644 --- a/src/data_gradients/feature_extractors/segmentation/classes_frequency.py +++ b/src/data_gradients/feature_extractors/segmentation/classes_frequency.py @@ -10,7 +10,7 @@ @register_feature_extractor() class SegmentationClassFrequency(AbstractFeatureExtractor): - def __init__(self, topk: int = 40, prioritization_mode: str = "train_val_diff"): + def __init__(self, topk: int = 30, prioritization_mode: str = "train_val_diff"): """ :param topk: How many rows (per split) to show. :param prioritization_mode: Strategy to use to chose which class will be prioritized. Only the topk will be shown diff --git a/src/data_gradients/feature_extractors/segmentation/classes_frequency_per_image.py b/src/data_gradients/feature_extractors/segmentation/classes_frequency_per_image.py index 45616def..de176643 100644 --- a/src/data_gradients/feature_extractors/segmentation/classes_frequency_per_image.py +++ b/src/data_gradients/feature_extractors/segmentation/classes_frequency_per_image.py @@ -10,7 +10,7 @@ @register_feature_extractor() class SegmentationClassesPerImageCount(AbstractFeatureExtractor): - def __init__(self, topk: int = 40, prioritization_mode: str = "train_val_diff"): + def __init__(self, topk: int = 30, prioritization_mode: str = "train_val_diff"): """ :param topk: How many rows (per split) to show. :param prioritization_mode: Strategy to use to chose which class will be prioritized. Only the topk will be shown From c4c3e932098f9741ddafc709a3799ff37013f752 Mon Sep 17 00:00:00 2001 From: Louis Dupont Date: Thu, 6 Jul 2023 11:32:27 +0300 Subject: [PATCH 11/20] cleanup --- .../classes_frequency_per_image.py | 39 +------------------ .../feature_extractors/utils.py | 9 +++-- 2 files changed, 7 insertions(+), 41 deletions(-) diff --git a/src/data_gradients/feature_extractors/object_detection/classes_frequency_per_image.py b/src/data_gradients/feature_extractors/object_detection/classes_frequency_per_image.py index abc71399..7459fbe6 100644 --- a/src/data_gradients/feature_extractors/object_detection/classes_frequency_per_image.py +++ b/src/data_gradients/feature_extractors/object_detection/classes_frequency_per_image.py @@ -39,50 +39,15 @@ def update(self, sample: DetectionSample): def aggregate(self) -> Feature: df = pd.DataFrame(self.data) - print(df.describe()) + # Include ("class_name", "class_id", "split", "n_appearance") # For each class, image, split, I want to know how many bbox I have # TODO: check this df_class_count = df.groupby(["class_name", "class_id", "sample_id", "split"]).size().reset_index(name="n_appearance") - print(df_class_count.groupby("class_name").sum()) - print(self.value_extractor.select(df=df_class_count.copy(), id_col="class_id", split_col="split", value_col="n_appearance").groupby("class_name").sum()) - print(self.value_extractor.select(df=df_class_count.copy(), id_col="class_id", split_col="split", value_col="n_appearance").groupby("class_name").sum()) - df_class_count = self.value_extractor.select(df=df_class_count, id_col="class_id", split_col="split", value_col="n_appearance") - # """ class_id n_appearance - # class_name - # big bus 0 89 - # big truck 126 355 - # bus-l- 46 29 - # bus-s- 39 13 - # car 1540 3310 - # mid truck 185 70 - # small bus 162 30 - # small truck 1337 668 - # truck-l- 976 193 - # truck-m- 1458 321 - # truck-s- 870 126 - # truck-xl- 737 78 - # """ - # """class_name - # big bus 0 116 - # big truck 156 457 - # bus-l- 38 26 - # bus-s- 51 19 - # car 1552 3303 - # mid truck 215 85 - # small bus 120 22 - # small truck 1295 619 - # truck-l- 960 185 - # truck-m- 1485 336 - # truck-s- 750 115 - # truck-xl- 770 76""" + df_class_count = self.value_extractor.select(df=df_class_count, id_col="class_id", split_col="split", value_col="n_appearance") - # """bus-l- 40 25 - # car 1540 3169 - # mid truck 245 90 - # small truck 1281 572""" # Height of the plot is proportional to the number of classes n_unique = len(df_class_count["class_name"].unique()) figsize_x = 10 diff --git a/src/data_gradients/feature_extractors/utils.py b/src/data_gradients/feature_extractors/utils.py index 47ccaad5..8907f060 100644 --- a/src/data_gradients/feature_extractors/utils.py +++ b/src/data_gradients/feature_extractors/utils.py @@ -20,13 +20,14 @@ def __init__(self, topk: int, prioritization_mode: str): def select(self, df: pd.DataFrame, id_col: str, split_col: str, value_col: str): """ - Returns the top 5 rows of the DataFrame based on the prioritization_mode. - The DataFrame is expected to have three columns: id_col, split_col, val_col. + Returns the top k rows of the DataFrame based on the prioritization_mode. + The DataFrame is expected to have at least three columns: id_col, split_col, val_col. :param df: The DataFrame to get the top values from. :param id_col: The name of the id column. - :param split_col: The name of the split column. - :param value_col: The name of the value column. + :param split_col: The name of the split column. (Usually 'split') + :param value_col: The name of column that will be used to calculate the metric. + :return: The Dataframe with only the rows associated to the most important values. """ # Verify inputs for col in [id_col, split_col, value_col]: From 354008454bcb85ad457c0117c163b433833b8f2e Mon Sep 17 00:00:00 2001 From: Louis Dupont Date: Thu, 6 Jul 2023 11:41:17 +0300 Subject: [PATCH 12/20] add min_max --- .../object_detection/bounding_boxes_area.py | 9 +++++---- .../object_detection/classes_frequency.py | 9 +++++---- .../classes_frequency_per_image.py | 9 +++++---- .../segmentation/bounding_boxes_area.py | 9 +++++---- .../segmentation/classes_frequency.py | 9 +++++---- .../classes_frequency_per_image.py | 9 +++++---- src/data_gradients/feature_extractors/utils.py | 18 ++++++++++++------ 7 files changed, 42 insertions(+), 30 deletions(-) diff --git a/src/data_gradients/feature_extractors/object_detection/bounding_boxes_area.py b/src/data_gradients/feature_extractors/object_detection/bounding_boxes_area.py index 114a2b65..35007edf 100644 --- a/src/data_gradients/feature_extractors/object_detection/bounding_boxes_area.py +++ b/src/data_gradients/feature_extractors/object_detection/bounding_boxes_area.py @@ -16,10 +16,11 @@ def __init__(self, topk: int = 30, prioritization_mode: str = "train_val_diff"): """ :param topk: How many rows (per split) to show. :param prioritization_mode: Strategy to use to chose which class will be prioritized. Only the topk will be shown - - 'train_val_diff': Returns rows with the biggest train_val_diff between 'train' and 'val' split values. - - 'outliers': Returns rows with the most extreme average values. - - 'max': Returns rows with the highest average values. - - 'min': Returns rows with the lowest average values. + - 'train_val_diff': Returns the top k rows with the biggest train_val_diff between 'train' and 'val' split values. + - 'outliers': Returns the top k rows with the most extreme average values. + - 'max': Returns the top k rows with the highest average values. + - 'min': Returns the top k rows with the lowest average values. + - 'min_max': Returns the (top k)/2 rows with the biggest average values, and the (top k)/2 with the smallest average values. """ self.value_extractor = MostImportantValuesSelector(topk=topk, prioritization_mode=prioritization_mode) self.data = [] diff --git a/src/data_gradients/feature_extractors/object_detection/classes_frequency.py b/src/data_gradients/feature_extractors/object_detection/classes_frequency.py index ac2d7139..0a692394 100644 --- a/src/data_gradients/feature_extractors/object_detection/classes_frequency.py +++ b/src/data_gradients/feature_extractors/object_detection/classes_frequency.py @@ -15,10 +15,11 @@ def __init__(self, topk: int = 30, prioritization_mode: str = "train_val_diff"): """ :param topk: How many rows (per split) to show. :param prioritization_mode: Strategy to use to chose which class will be prioritized. Only the topk will be shown - - 'train_val_diff': Returns rows with the biggest train_val_diff between 'train' and 'val' split values. - - 'outliers': Returns rows with the most extreme average values. - - 'max': Returns rows with the highest average values. - - 'min': Returns rows with the lowest average values. + - 'train_val_diff': Returns the top k rows with the biggest train_val_diff between 'train' and 'val' split values. + - 'outliers': Returns the top k rows with the most extreme average values. + - 'max': Returns the top k rows with the highest average values. + - 'min': Returns the top k rows with the lowest average values. + - 'min_max': Returns the (top k)/2 rows with the biggest average values, and the (top k)/2 with the smallest average values. """ self.value_extractor = MostImportantValuesSelector(topk=topk, prioritization_mode=prioritization_mode) self.data = [] diff --git a/src/data_gradients/feature_extractors/object_detection/classes_frequency_per_image.py b/src/data_gradients/feature_extractors/object_detection/classes_frequency_per_image.py index 696012b2..9cab0ea5 100644 --- a/src/data_gradients/feature_extractors/object_detection/classes_frequency_per_image.py +++ b/src/data_gradients/feature_extractors/object_detection/classes_frequency_per_image.py @@ -17,10 +17,11 @@ def __init__(self, topk: int = 30, prioritization_mode: str = "train_val_diff"): """ :param topk: How many rows (per split) to show. :param prioritization_mode: Strategy to use to chose which class will be prioritized. Only the topk will be shown - - 'train_val_diff': Returns rows with the biggest train_val_diff between 'train' and 'val' split values. - - 'outliers': Returns rows with the most extreme average values. - - 'max': Returns rows with the highest average values. - - 'min': Returns rows with the lowest average values. + - 'train_val_diff': Returns the top k rows with the biggest train_val_diff between 'train' and 'val' split values. + - 'outliers': Returns the top k rows with the most extreme average values. + - 'max': Returns the top k rows with the highest average values. + - 'min': Returns the top k rows with the lowest average values. + - 'min_max': Returns the (top k)/2 rows with the biggest average values, and the (top k)/2 with the smallest average values. """ self.value_extractor = MostImportantValuesSelector(topk=topk, prioritization_mode=prioritization_mode) self.data = [] diff --git a/src/data_gradients/feature_extractors/segmentation/bounding_boxes_area.py b/src/data_gradients/feature_extractors/segmentation/bounding_boxes_area.py index 3293e972..910583d6 100644 --- a/src/data_gradients/feature_extractors/segmentation/bounding_boxes_area.py +++ b/src/data_gradients/feature_extractors/segmentation/bounding_boxes_area.py @@ -19,10 +19,11 @@ def __init__(self, topk: int = 30, prioritization_mode: str = "train_val_diff"): """ :param topk: How many rows (per split) to show. :param prioritization_mode: Strategy to use to chose which class will be prioritized. Only the topk will be shown - - 'train_val_diff': Returns rows with the biggest train_val_diff between 'train' and 'val' split values. - - 'outliers': Returns rows with the most extreme average values. - - 'max': Returns rows with the highest average values. - - 'min': Returns rows with the lowest average values. + - 'train_val_diff': Returns the top k rows with the biggest train_val_diff between 'train' and 'val' split values. + - 'outliers': Returns the top k rows with the most extreme average values. + - 'max': Returns the top k rows with the highest average values. + - 'min': Returns the top k rows with the lowest average values. + - 'min_max': Returns the (top k)/2 rows with the biggest average values, and the (top k)/2 with the smallest average values. """ self.value_extractor = MostImportantValuesSelector(topk=topk, prioritization_mode=prioritization_mode) self.data = [] diff --git a/src/data_gradients/feature_extractors/segmentation/classes_frequency.py b/src/data_gradients/feature_extractors/segmentation/classes_frequency.py index 319e25df..5d603d25 100644 --- a/src/data_gradients/feature_extractors/segmentation/classes_frequency.py +++ b/src/data_gradients/feature_extractors/segmentation/classes_frequency.py @@ -14,10 +14,11 @@ def __init__(self, topk: int = 30, prioritization_mode: str = "train_val_diff"): """ :param topk: How many rows (per split) to show. :param prioritization_mode: Strategy to use to chose which class will be prioritized. Only the topk will be shown - - 'train_val_diff': Returns rows with the biggest train_val_diff between 'train' and 'val' split values. - - 'outliers': Returns rows with the most extreme average values. - - 'max': Returns rows with the highest average values. - - 'min': Returns rows with the lowest average values. + - 'train_val_diff': Returns the top k rows with the biggest train_val_diff between 'train' and 'val' split values. + - 'outliers': Returns the top k rows with the most extreme average values. + - 'max': Returns the top k rows with the highest average values. + - 'min': Returns the top k rows with the lowest average values. + - 'min_max': Returns the (top k)/2 rows with the biggest average values, and the (top k)/2 with the smallest average values. """ self.value_extractor = MostImportantValuesSelector(topk=topk, prioritization_mode=prioritization_mode) self.data = [] diff --git a/src/data_gradients/feature_extractors/segmentation/classes_frequency_per_image.py b/src/data_gradients/feature_extractors/segmentation/classes_frequency_per_image.py index 62bcda1f..900849c1 100644 --- a/src/data_gradients/feature_extractors/segmentation/classes_frequency_per_image.py +++ b/src/data_gradients/feature_extractors/segmentation/classes_frequency_per_image.py @@ -14,10 +14,11 @@ def __init__(self, topk: int = 30, prioritization_mode: str = "train_val_diff"): """ :param topk: How many rows (per split) to show. :param prioritization_mode: Strategy to use to chose which class will be prioritized. Only the topk will be shown - - 'train_val_diff': Returns rows with the biggest train_val_diff between 'train' and 'val' split values. - - 'outliers': Returns rows with the most extreme average values. - - 'max': Returns rows with the highest average values. - - 'min': Returns rows with the lowest average values. + - 'train_val_diff': Returns the top k rows with the biggest train_val_diff between 'train' and 'val' split values. + - 'outliers': Returns the top k rows with the most extreme average values. + - 'max': Returns the top k rows with the highest average values. + - 'min': Returns the top k rows with the lowest average values. + - 'min_max': Returns the (top k)/2 rows with the biggest average values, and the (top k)/2 with the smallest average values. """ self.value_extractor = MostImportantValuesSelector(topk=topk, prioritization_mode=prioritization_mode) self.data = [] diff --git a/src/data_gradients/feature_extractors/utils.py b/src/data_gradients/feature_extractors/utils.py index 8907f060..0c532730 100644 --- a/src/data_gradients/feature_extractors/utils.py +++ b/src/data_gradients/feature_extractors/utils.py @@ -7,12 +7,13 @@ def __init__(self, topk: int, prioritization_mode: str): """ :param topk: How many rows (per split) to return. :param prioritization_mode: The prioritization_mode to get the top values for. One of: - - 'train_val_diff': Returns rows with the biggest train_val_diff between 'train' and 'val' split values. - - 'outliers': Returns rows with the most extreme average values. - - 'max': Returns rows with the highest average values. - - 'min': Returns rows with the lowest average values. + - 'train_val_diff': Returns the top k rows with the biggest train_val_diff between 'train' and 'val' split values. + - 'outliers': Returns the top k rows with the most extreme average values. + - 'max': Returns the top k rows with the highest average values. + - 'min': Returns the top k rows with the lowest average values. + - 'min_max': Returns the (top k)/2 rows with the biggest average values, and the (top k)/2 with the smallest average values. """ - valid_modes = ("train_val_diff", "outliers", "max", "min") + valid_modes = ("train_val_diff", "outliers", "max", "min", "min_max") if prioritization_mode not in valid_modes: raise ValueError(f"Invalid `prioritization_mode={prioritization_mode}'. Must be one of: {valid_modes}.") self.topk = topk @@ -43,7 +44,7 @@ def select(self, df: pd.DataFrame, id_col: str, split_col: str, value_col: str): # Calculate the relative difference or average based on the prioritization_mode if self.prioritization_mode == "train_val_diff": df_pivot["metric"] = np.abs((df_pivot["train"] - df_pivot["val"]) / ((df_pivot["train"] + df_pivot["val"]) / 2)) - elif self.prioritization_mode in ["outliers", "max", "min"]: + elif self.prioritization_mode in ["outliers", "max", "min", "min_max"]: df_pivot["metric"] = (df_pivot["train"] + df_pivot["val"]) / 2 if self.prioritization_mode == "outliers": @@ -57,5 +58,10 @@ def select(self, df: pd.DataFrame, id_col: str, split_col: str, value_col: str): elif self.prioritization_mode == "min": top_ids = df_pivot.nsmallest(self.topk, "metric").index return df[df[id_col].isin(top_ids)] + elif self.prioritization_mode == "min_max": + n_min_results = self.topk // 2 + bottom_ids = df_pivot.nlargest(n_min_results, "metric").index + top_ids = df_pivot.nsmallest(self.topk - n_min_results, "metric").index + return pd.concat([df[df[id_col].isin(top_ids)], df[df[id_col].isin(bottom_ids)]]) else: raise NotImplementedError(f"Mode {self.prioritization_mode} is not implemented") From ce7cd1e11cf960c97c6969afb9fd1faea4795517 Mon Sep 17 00:00:00 2001 From: Eugene Khvedchenya Date: Sun, 9 Jul 2023 10:31:46 +0300 Subject: [PATCH 13/20] Rename "image_count" to "num_samples" and "annotation_count" to "num_annotations" for better clarity (#133) Plus save them as ints in json not strings --- .../assets/html/basic_info_fe.html | 10 +++++----- .../feature_extractors/common/summary.py | 16 ++++++++-------- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/src/data_gradients/assets/html/basic_info_fe.html b/src/data_gradients/assets/html/basic_info_fe.html index a113550d..a79e862a 100644 --- a/src/data_gradients/assets/html/basic_info_fe.html +++ b/src/data_gradients/assets/html/basic_info_fe.html @@ -15,8 +15,8 @@

 

Images - {{train.image_count}} - {{val.image_count}} + {{train.num_samples}} + {{val.num_samples}} Classes @@ -30,8 +30,8 @@

 

Annotations - {{train.annotation_count}} - {{val.annotation_count}} + {{train.num_annotations}} + {{val.num_annotations}} Annotations per images @@ -70,4 +70,4 @@

 

- \ No newline at end of file + diff --git a/src/data_gradients/feature_extractors/common/summary.py b/src/data_gradients/feature_extractors/common/summary.py index 880e1663..1b3412f0 100644 --- a/src/data_gradients/feature_extractors/common/summary.py +++ b/src/data_gradients/feature_extractors/common/summary.py @@ -14,11 +14,11 @@ @dataclasses.dataclass class BasicStatistics: - image_count: int = 0 + num_samples: int = 0 classes_count: int = 0 classes_in_use: int = 0 classes: List[int] = dataclasses.field(default_factory=list) - annotation_count: int = 0 + num_annotations: int = 0 images_without_annotation: int = 0 images_resolutions: List[int] = dataclasses.field(default_factory=list) annotations_sizes: List[int] = dataclasses.field(default_factory=list) @@ -47,7 +47,7 @@ def update(self, sample: ImageSample): height, width = sample.image.shape[:2] basic_stats.images_resolutions.append([height, width]) - basic_stats.image_count += 1 + basic_stats.num_samples += 1 if isinstance(sample, SegmentationSample): contours = [contour for sublist in sample.contours for contour in sublist] @@ -71,14 +71,14 @@ def update(self, sample: ImageSample): def aggregate(self) -> Feature: for basic_stats in self.stats.values(): - if basic_stats.image_count > 0: + if basic_stats.num_samples > 0: basic_stats.classes_in_use = len(set(basic_stats.classes)) basic_stats.classes = np.array(basic_stats.classes) basic_stats.annotations_per_image = np.array(basic_stats.annotations_per_image) basic_stats.annotations_sizes = np.array(basic_stats.annotations_sizes) - basic_stats.annotation_count = int(np.sum(basic_stats.annotations_per_image)) + basic_stats.num_annotations = int(np.sum(basic_stats.annotations_per_image)) basic_stats.images_without_annotation = np.count_nonzero(basic_stats.annotations_per_image == 0) basic_stats.images_resolutions = np.array(basic_stats.images_resolutions) @@ -92,9 +92,9 @@ def aggregate(self) -> Feature: index_of_med = np.argsort(areas)[len(areas) // 2] basic_stats.med_image_resolution = self.format_resolution(basic_stats.images_resolutions[index_of_med][0]) - basic_stats.annotations_per_image = f"{basic_stats.annotation_count / basic_stats.image_count:.2f}" - basic_stats.image_count = f"{basic_stats.image_count:,}" - basic_stats.annotation_count = f"{basic_stats.annotation_count:,}" + basic_stats.annotations_per_image = f"{basic_stats.num_annotations / basic_stats.num_samples:.2f}" + basic_stats.num_samples = int(basic_stats.num_samples) + basic_stats.num_annotations = int(basic_stats.num_annotations) # To support JSON - delete arrays basic_stats.classes = None From 306ed2eb66f3c708f8b401437caa3a23606b21e2 Mon Sep 17 00:00:00 2001 From: Eugene Khvedchenya Date: Sun, 9 Jul 2023 12:15:19 +0300 Subject: [PATCH 14/20] Drop-in replacement of appdirs to platformdirs (#134) Co-authored-by: Ofri Masad --- requirements.txt | 2 +- src/data_gradients/config/data/data_config.py | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/requirements.txt b/requirements.txt index e3d4682d..badf6e0d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,7 +2,7 @@ hydra-core>=1.2.0 omegaconf>=2.2.3 pygments>=2.13.0 tqdm>=4.64.1 -appdirs>=1.4.4 +platformdirs>=2.5.2 opencv-python Pillow tensorboard diff --git a/src/data_gradients/config/data/data_config.py b/src/data_gradients/config/data/data_config.py index e96e596e..fe95b695 100644 --- a/src/data_gradients/config/data/data_config.py +++ b/src/data_gradients/config/data/data_config.py @@ -1,7 +1,8 @@ import os import logging + +import platformdirs import torch -import appdirs from abc import ABC from dataclasses import dataclass, field from typing import Dict, Optional, Callable, Union @@ -31,7 +32,7 @@ class DataConfig(ABC): images_extractor: Union[None, str, Callable[[SupportedDataType], torch.Tensor]] = None labels_extractor: Union[None, str, Callable[[SupportedDataType], torch.Tensor]] = None - DEFAULT_CACHE_DIR: str = field(default=appdirs.user_cache_dir("DataGradients", "Deci"), init=False) + DEFAULT_CACHE_DIR: str = field(default=platformdirs.user_cache_dir("DataGradients", "Deci"), init=False) @classmethod def load_from_json(cls, filename: str, dir_path: Optional[str] = None) -> "DataConfig": From 3282cad68a0d33399423b6a7c68c387b39f6a484 Mon Sep 17 00:00:00 2001 From: Ofri Masad Date: Sun, 9 Jul 2023 13:55:37 +0300 Subject: [PATCH 15/20] fix visualization of normalized images (#137) --- .../batch_processors/formatters/segmentation.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/data_gradients/batch_processors/formatters/segmentation.py b/src/data_gradients/batch_processors/formatters/segmentation.py index a894428b..051023cb 100644 --- a/src/data_gradients/batch_processors/formatters/segmentation.py +++ b/src/data_gradients/batch_processors/formatters/segmentation.py @@ -93,6 +93,11 @@ def format(self, images: Tensor, labels: Tensor) -> Tuple[Tensor, Tensor]: if 0 <= images.min() and images.max() <= 1: images *= 255 images = images.to(torch.uint8) + elif 0 >= images.min(): # images were normalized with some unknown mean and std + images -= images.min() + images /= images.max() + images *= 255 + images = images.to(torch.uint8) return images, labels From 84dac60b18e62b7f80366b949bd88e832ee97a2f Mon Sep 17 00:00:00 2001 From: Louis Dupont Date: Sun, 9 Jul 2023 14:29:26 +0300 Subject: [PATCH 16/20] generalize --- src/data_gradients/feature_extractors/utils.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/data_gradients/feature_extractors/utils.py b/src/data_gradients/feature_extractors/utils.py index 0c532730..4757a36c 100644 --- a/src/data_gradients/feature_extractors/utils.py +++ b/src/data_gradients/feature_extractors/utils.py @@ -1,4 +1,3 @@ -import numpy as np import pandas as pd @@ -43,9 +42,14 @@ def select(self, df: pd.DataFrame, id_col: str, split_col: str, value_col: str): # Calculate the relative difference or average based on the prioritization_mode if self.prioritization_mode == "train_val_diff": - df_pivot["metric"] = np.abs((df_pivot["train"] - df_pivot["val"]) / ((df_pivot["train"] + df_pivot["val"]) / 2)) + # `train_val_diff` only defined when working with 2 sets. + if len(df_pivot.columns) != 2: + raise ValueError(f'`prioritization_mode"train_val_diff"` is only supported when working with 2 sets. Found {len(df_pivot.columns)}.') + delta = (df_pivot.iloc[:, 0] - df_pivot.iloc[:, 1]).abs() + average = (df_pivot.iloc[:, 0] + df_pivot.iloc[:, 1] + 1e-6).abs() / 2 + df_pivot["metric"] = delta / average elif self.prioritization_mode in ["outliers", "max", "min", "min_max"]: - df_pivot["metric"] = (df_pivot["train"] + df_pivot["val"]) / 2 + df_pivot["metric"] = df_pivot.mean(1) if self.prioritization_mode == "outliers": mean, std = df_pivot["metric"].mean(), df_pivot["metric"].std() From 82207d36b1d35736c8b4f33821c9edce0e3e7413 Mon Sep 17 00:00:00 2001 From: Louis Dupont Date: Sun, 9 Jul 2023 14:33:59 +0300 Subject: [PATCH 17/20] add eps --- src/data_gradients/feature_extractors/utils.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/data_gradients/feature_extractors/utils.py b/src/data_gradients/feature_extractors/utils.py index 4757a36c..9118c2df 100644 --- a/src/data_gradients/feature_extractors/utils.py +++ b/src/data_gradients/feature_extractors/utils.py @@ -46,14 +46,14 @@ def select(self, df: pd.DataFrame, id_col: str, split_col: str, value_col: str): if len(df_pivot.columns) != 2: raise ValueError(f'`prioritization_mode"train_val_diff"` is only supported when working with 2 sets. Found {len(df_pivot.columns)}.') delta = (df_pivot.iloc[:, 0] - df_pivot.iloc[:, 1]).abs() - average = (df_pivot.iloc[:, 0] + df_pivot.iloc[:, 1] + 1e-6).abs() / 2 - df_pivot["metric"] = delta / average + average = (df_pivot.iloc[:, 0] + df_pivot.iloc[:, 1]).abs() / 2 + df_pivot["metric"] = delta / (average + 1e-6) elif self.prioritization_mode in ["outliers", "max", "min", "min_max"]: df_pivot["metric"] = df_pivot.mean(1) if self.prioritization_mode == "outliers": mean, std = df_pivot["metric"].mean(), df_pivot["metric"].std() - df_pivot["metric"] = (df_pivot["metric"] - mean).abs() / std + df_pivot["metric"] = (df_pivot["metric"] - mean).abs() / (std + 1e-6) # Only return the top k. if self.prioritization_mode in ["train_val_diff", "outliers", "max"]: From 01bde99fa4a099d88bb2c31c142532a183fb3fcb Mon Sep 17 00:00:00 2001 From: Louis Dupont Date: Sun, 9 Jul 2023 15:05:47 +0300 Subject: [PATCH 18/20] fix case when dataset to filter is too small --- src/data_gradients/feature_extractors/utils.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/data_gradients/feature_extractors/utils.py b/src/data_gradients/feature_extractors/utils.py index 9118c2df..5638046a 100644 --- a/src/data_gradients/feature_extractors/utils.py +++ b/src/data_gradients/feature_extractors/utils.py @@ -63,9 +63,13 @@ def select(self, df: pd.DataFrame, id_col: str, split_col: str, value_col: str): top_ids = df_pivot.nsmallest(self.topk, "metric").index return df[df[id_col].isin(top_ids)] elif self.prioritization_mode == "min_max": - n_min_results = self.topk // 2 - bottom_ids = df_pivot.nlargest(n_min_results, "metric").index - top_ids = df_pivot.nsmallest(self.topk - n_min_results, "metric").index + n_max_results = self.topk // 2 + n_min_results = self.topk - n_max_results + + top_ids = df_pivot.nlargest(n_max_results, "metric").index + + n_rows_available = len(df_pivot) - len(top_ids) + bottom_ids = df_pivot.nsmallest(min(n_min_results, n_rows_available), "metric").index return pd.concat([df[df[id_col].isin(top_ids)], df[df[id_col].isin(bottom_ids)]]) else: raise NotImplementedError(f"Mode {self.prioritization_mode} is not implemented") From 00faa28b39abe2bdb072b6712015ce3ec006ff72 Mon Sep 17 00:00:00 2001 From: Louis Dupont Date: Sun, 9 Jul 2023 17:10:15 +0300 Subject: [PATCH 19/20] generalize for many splits --- .../object_detection/bounding_boxes_area.py | 4 +--- .../feature_extractors/object_detection/classes_frequency.py | 5 +---- .../object_detection/classes_frequency_per_image.py | 5 +---- .../feature_extractors/segmentation/bounding_boxes_area.py | 4 +--- .../feature_extractors/segmentation/classes_frequency.py | 5 +---- .../segmentation/classes_frequency_per_image.py | 5 +---- 6 files changed, 6 insertions(+), 22 deletions(-) diff --git a/src/data_gradients/feature_extractors/object_detection/bounding_boxes_area.py b/src/data_gradients/feature_extractors/object_detection/bounding_boxes_area.py index 35007edf..84e57e27 100644 --- a/src/data_gradients/feature_extractors/object_detection/bounding_boxes_area.py +++ b/src/data_gradients/feature_extractors/object_detection/bounding_boxes_area.py @@ -66,9 +66,7 @@ def aggregate(self) -> Feature: tight_layout=True, ) - json = dict( - train=dict(df[df["split"] == "train"]["relative_bbox_area"].describe()), val=dict(df[df["split"] == "val"]["relative_bbox_area"].describe()) - ) + json = {split: dict(df[df["split"] == split]["relative_bbox_area"].describe()) for split in df["split"].unique()} feature = Feature( data=df, diff --git a/src/data_gradients/feature_extractors/object_detection/classes_frequency.py b/src/data_gradients/feature_extractors/object_detection/classes_frequency.py index 0a692394..9d60db53 100644 --- a/src/data_gradients/feature_extractors/object_detection/classes_frequency.py +++ b/src/data_gradients/feature_extractors/object_detection/classes_frequency.py @@ -65,10 +65,7 @@ def aggregate(self) -> Feature: tight_layout=True, ) - json = dict( - train=dict(df_class_count[df_class_count["split"] == "train"]["n_appearance"].describe()), - val=dict(df_class_count[df_class_count["split"] == "val"]["n_appearance"].describe()), - ) + json = {split: dict(df[df["split"] == split]["n_appearance"].describe()) for split in df["split"].unique()} feature = Feature( data=df_class_count, diff --git a/src/data_gradients/feature_extractors/object_detection/classes_frequency_per_image.py b/src/data_gradients/feature_extractors/object_detection/classes_frequency_per_image.py index 9cab0ea5..c7a69967 100644 --- a/src/data_gradients/feature_extractors/object_detection/classes_frequency_per_image.py +++ b/src/data_gradients/feature_extractors/object_detection/classes_frequency_per_image.py @@ -69,10 +69,7 @@ def aggregate(self) -> Feature: tight_layout=True, ) - json = dict( - train=dict(df_class_count[df_class_count["split"] == "train"]["n_appearance"].describe()), - val=dict(df_class_count[df_class_count["split"] == "val"]["n_appearance"].describe()), - ) + json = {split: dict(df[df["split"] == split]["n_appearance"].describe()) for split in df["split"].unique()} feature = Feature( data=df_class_count, diff --git a/src/data_gradients/feature_extractors/segmentation/bounding_boxes_area.py b/src/data_gradients/feature_extractors/segmentation/bounding_boxes_area.py index 910583d6..49ced541 100644 --- a/src/data_gradients/feature_extractors/segmentation/bounding_boxes_area.py +++ b/src/data_gradients/feature_extractors/segmentation/bounding_boxes_area.py @@ -68,9 +68,7 @@ def aggregate(self) -> Feature: bandwidth=0.4, tight_layout=True, ) - json = dict( - train=dict(df[df["split"] == "train"]["relative_bbox_area"].describe()), val=dict(df[df["split"] == "val"]["relative_bbox_area"].describe()) - ) + json = {split: dict(df[df["split"] == split]["relative_bbox_area"].describe()) for split in df["split"].unique()} feature = Feature( data=df, diff --git a/src/data_gradients/feature_extractors/segmentation/classes_frequency.py b/src/data_gradients/feature_extractors/segmentation/classes_frequency.py index 5d603d25..262ec4bc 100644 --- a/src/data_gradients/feature_extractors/segmentation/classes_frequency.py +++ b/src/data_gradients/feature_extractors/segmentation/classes_frequency.py @@ -66,10 +66,7 @@ def aggregate(self) -> Feature: tight_layout=True, ) - json = dict( - train=dict(df_class_count[df_class_count["split"] == "train"]["n_appearance"].describe()), - val=dict(df_class_count[df_class_count["split"] == "val"]["n_appearance"].describe()), - ) + json = {split: dict(df[df["split"] == split]["n_appearance"].describe()) for split in df["split"].unique()} feature = Feature( data=df_class_count, diff --git a/src/data_gradients/feature_extractors/segmentation/classes_frequency_per_image.py b/src/data_gradients/feature_extractors/segmentation/classes_frequency_per_image.py index 900849c1..07d23e72 100644 --- a/src/data_gradients/feature_extractors/segmentation/classes_frequency_per_image.py +++ b/src/data_gradients/feature_extractors/segmentation/classes_frequency_per_image.py @@ -69,10 +69,7 @@ def aggregate(self) -> Feature: tight_layout=True, ) - json = dict( - train=dict(df_class_count[df_class_count["split"] == "train"]["n_appearance"].describe()), - val=dict(df_class_count[df_class_count["split"] == "val"]["n_appearance"].describe()), - ) + json = {split: dict(df[df["split"] == split]["n_appearance"].describe()) for split in df["split"].unique()} feature = Feature( data=df_class_count, From 46a9b5e4be5523f1026e83fcd68e1c9748e84022 Mon Sep 17 00:00:00 2001 From: Louis Dupont Date: Sun, 9 Jul 2023 17:14:46 +0300 Subject: [PATCH 20/20] fix --- .../feature_extractors/object_detection/classes_frequency.py | 2 +- .../object_detection/classes_frequency_per_image.py | 2 +- .../feature_extractors/segmentation/classes_frequency.py | 2 +- .../segmentation/classes_frequency_per_image.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/data_gradients/feature_extractors/object_detection/classes_frequency.py b/src/data_gradients/feature_extractors/object_detection/classes_frequency.py index 9d60db53..6926c954 100644 --- a/src/data_gradients/feature_extractors/object_detection/classes_frequency.py +++ b/src/data_gradients/feature_extractors/object_detection/classes_frequency.py @@ -65,7 +65,7 @@ def aggregate(self) -> Feature: tight_layout=True, ) - json = {split: dict(df[df["split"] == split]["n_appearance"].describe()) for split in df["split"].unique()} + json = {split: dict(df_class_count[df_class_count["split"] == split]["n_appearance"].describe()) for split in df_class_count["split"].unique()} feature = Feature( data=df_class_count, diff --git a/src/data_gradients/feature_extractors/object_detection/classes_frequency_per_image.py b/src/data_gradients/feature_extractors/object_detection/classes_frequency_per_image.py index c7a69967..a3da58d4 100644 --- a/src/data_gradients/feature_extractors/object_detection/classes_frequency_per_image.py +++ b/src/data_gradients/feature_extractors/object_detection/classes_frequency_per_image.py @@ -69,7 +69,7 @@ def aggregate(self) -> Feature: tight_layout=True, ) - json = {split: dict(df[df["split"] == split]["n_appearance"].describe()) for split in df["split"].unique()} + json = {split: dict(df_class_count[df_class_count["split"] == split]["n_appearance"].describe()) for split in df_class_count["split"].unique()} feature = Feature( data=df_class_count, diff --git a/src/data_gradients/feature_extractors/segmentation/classes_frequency.py b/src/data_gradients/feature_extractors/segmentation/classes_frequency.py index 262ec4bc..1bc237c8 100644 --- a/src/data_gradients/feature_extractors/segmentation/classes_frequency.py +++ b/src/data_gradients/feature_extractors/segmentation/classes_frequency.py @@ -66,7 +66,7 @@ def aggregate(self) -> Feature: tight_layout=True, ) - json = {split: dict(df[df["split"] == split]["n_appearance"].describe()) for split in df["split"].unique()} + json = {split: dict(df_class_count[df_class_count["split"] == split]["n_appearance"].describe()) for split in df_class_count["split"].unique()} feature = Feature( data=df_class_count, diff --git a/src/data_gradients/feature_extractors/segmentation/classes_frequency_per_image.py b/src/data_gradients/feature_extractors/segmentation/classes_frequency_per_image.py index 07d23e72..fc8b9f8b 100644 --- a/src/data_gradients/feature_extractors/segmentation/classes_frequency_per_image.py +++ b/src/data_gradients/feature_extractors/segmentation/classes_frequency_per_image.py @@ -69,7 +69,7 @@ def aggregate(self) -> Feature: tight_layout=True, ) - json = {split: dict(df[df["split"] == split]["n_appearance"].describe()) for split in df["split"].unique()} + json = {split: dict(df_class_count[df_class_count["split"] == split]["n_appearance"].describe()) for split in df_class_count["split"].unique()} feature = Feature( data=df_class_count,