Deci-AI · Louis-Dupont · Nov 8, 2023 · Aug 29, 2023 · Nov 6, 2023 · Nov 7, 2023
diff --git a/documentation/feature_description.md b/documentation/feature_description.md
diff --git a/src/data_gradients/feature_extractors/abstract_feature_extractor.py b/src/data_gradients/feature_extractors/abstract_feature_extractor.py
@@ -19,6 +19,11 @@ class Feature:
 
     json: Union[dict, list]
 
+    title: str
+    description: str
+    notice: Optional[str] = None
+    warning: Optional[str] = None
+
 
 class AbstractFeatureExtractor(ABC):
     @abstractmethod
@@ -30,22 +35,6 @@ def update(self, sample: ImageSample):
     def aggregate(self) -> Feature:
         raise NotImplementedError()
 
-    @property
-    def description(self) -> str:
-        raise NotImplementedError()
-
-    @property
-    def title(self) -> str:
-        raise NotImplementedError()
-
-    @property
-    def notice(self) -> Optional[str]:
-        return None
-
-    @property
-    def warning(self) -> Optional[str]:
-        return None
-
     def setup_data_sources(self, tran_data: Iterable, val_data: Iterable):
         """
         Called in AnalysisManagerAbstract.__init__ for the purpose of exposing tran_data and val_data

diff --git a/src/data_gradients/feature_extractors/classification/class_distribution_vs_area.py b/src/data_gradients/feature_extractors/classification/class_distribution_vs_area.py
@@ -10,7 +10,19 @@
 
 @register_feature_extractor()
 class ClassificationClassDistributionVsArea(AbstractFeatureExtractor):
-    """Feature Extractor to show image area vs image class violin plot."""
+    """
+    Summarizes how average image dimensions vary among classes and data splits.
+
+    This feature extractor calculates the mean image size (width and height) for each label within the provided splits of the dataset.
+    It highlights potential discrepancies in image resolutions across different classes and dataset splits, which could impact model performance.
+    Disparities in image sizes could indicate a need for more uniform data collection or preprocessing to avoid model biases and ensure consistent
+    performance across all classes and splits.
+
+    Key Uses:
+
+    - Pinpointing classes with significant variations in image resolution to inform data collection and preprocessing.
+    - Assessing the consistency of image resolutions across dataset splits to guide training strategies and augmentation techniques.
+    """
 
     def __init__(self):
         self.data = []
@@ -43,12 +55,9 @@ def aggregate(self) -> Feature:
             y_label_key="class_name",
             y_label_name="Class",
             order_key="class_id",
-            title=self.title,
             figsize=(figsize_x, figsize_y),
-            # x_lim=(0, df_class_count["n_appearance"].max() * 1.2),
             x_ticks_rotation=None,
             labels_key="split" if num_splits > 1 else None,
-            # orient="h",
             tight_layout=True,
         )
 
@@ -60,20 +69,14 @@ def aggregate(self) -> Feature:
             data=df,
             plot_options=plot_options,
             json=json,
+            title="Image size distribution per class",
+            description=(
+                "Distribution of image size (mean value of image width & height) with respect to assigned image label and (when possible) a split.\n"
+                "This may highlight issues when classes in train/val has different image resolution which may negatively affect the accuracy of the model.\n"
+                "If you see a large difference in image size between classes and splits - you may need to adjust data collection process or training regime:\n"
+                " - When splitting data into train/val/test - make sure that the image size distribution is similar between splits.\n"
+                " - If size distribution overlap between splits to too big - "
+                "you can address this (to some extent) by using more agressize values for zoom-in/zoo-out augmentation at training time.\n"
+            ),
         )
         return feature
-
-    @property
-    def title(self) -> str:
-        return "Image size distribution per class"
-
-    @property
-    def description(self) -> str:
-        return (
-            "Distribution of image size (mean value of image width & height) with respect to assigned image label and (when possible) a split.\n"
-            "This may highlight issues when classes in train/val has different image resolution which may negatively affect the accuracy of the model.\n"
-            "If you see a large difference in image size between classes and splits - you may need to adjust data collection process or training regime:\n"
-            " - When splitting data into train/val/test - make sure that the image size distribution is similar between splits.\n"
-            " - If size distribution overlap between splits to too big - "
-            "you can address this (to some extent) by using more agressize values for zoom-in/zoo-out augmentation at training time.\n"
-        )
diff --git a/src/data_gradients/feature_extractors/classification/class_distribution_vs_area_scatter.py b/src/data_gradients/feature_extractors/classification/class_distribution_vs_area_scatter.py
@@ -9,8 +9,20 @@
 
 @register_feature_extractor()
 class ClassificationClassDistributionVsAreaPlot(AbstractFeatureExtractor):
-    """Feature Extractor to show scatter plot of width & height distribution
-    with breakdown along image class and split."""
+    """
+    Visualizes the spread of image widths and heights within each class and across data splits.
+
+    This feature extractor creates a scatter plot to graphically represent the diversity of image dimensions associated with each class label and split
+    in the dataset.
+    By visualizing this data, users can quickly assess whether certain classes or splits contain images that are consistently larger or smaller than others,
+    potentially indicating a need for data preprocessing or augmentation strategies to ensure model robustness.
+
+    Key Uses:
+
+    - Identifying classes with notably different average image sizes that may influence model training.
+    - Detecting splits in the dataset where image size distribution is uneven, prompting the need for more careful split strategies or
+      tailored data augmentation.
+    """
 
     def __init__(self):
         self.data = []
@@ -35,7 +47,6 @@ def aggregate(self) -> Feature:
             x_label_name="Image width (px)",
             y_label_key="image_rows",
             y_label_name="Image height (px)",
-            title=self.title,
             figsize=(10, 10),
             x_ticks_rotation=None,
             labels_key="class_name",
@@ -57,20 +68,14 @@ def aggregate(self) -> Feature:
             data=df,
             plot_options=plot_options,
             json=json,
+            title="Image size distribution per class",
+            description=(
+                "Distribution of image size (mean value of image width & height) with respect to assigned image label and (when possible) a split.\n"
+                "This may highlight issues when classes in train/val has different image resolution which may negatively affect the accuracy of the model.\n"
+                "If you see a large difference in image size between classes and splits - you may need to adjust data collection process or training regime:\n"
+                " - When splitting data into train/val/test - make sure that the image size distribution is similar between splits.\n"
+                " - If size distribution overlap between splits to too big - "
+                "you can address this (to some extent) by using more agressize values for zoom-in/zoo-out augmentation at training time.\n"
+            ),
         )
         return feature
-
-    @property
-    def title(self) -> str:
-        return "Image size distribution per class"
-
-    @property
-    def description(self) -> str:
-        return (
-            "Distribution of image size (mean value of image width & height) with respect to assigned image label and (when possible) a split.\n"
-            "This may highlight issues when classes in train/val has different image resolution which may negatively affect the accuracy of the model.\n"
-            "If you see a large difference in image size between classes and splits - you may need to adjust data collection process or training regime:\n"
-            " - When splitting data into train/val/test - make sure that the image size distribution is similar between splits.\n"
-            " - If size distribution overlap between splits to too big - "
-            "you can address this (to some extent) by using more agressize values for zoom-in/zoo-out augmentation at training time.\n"
-        )
diff --git a/src/data_gradients/feature_extractors/classification/class_frequency.py b/src/data_gradients/feature_extractors/classification/class_frequency.py
@@ -12,7 +12,12 @@
 
 @register_feature_extractor()
 class ClassificationClassFrequency(AbstractFeatureExtractor):
-    """Feature Extractor to count the number of labels of each class."""
+    """
+    Analyzes and visualizes the frequency of each class label across different dataset splits.
+
+    This feature extractor computes the frequency of occurrence for each class label in the dataset, providing insights into the
+    balance or imbalance of class distribution across training and validation.
+    """
 
     def __init__(self, topk: Optional[int] = None, prioritization_mode: str = "train_val_diff"):
         """
@@ -66,7 +71,6 @@ def aggregate(self) -> Feature:
             y_label_key="class_name",
             y_label_name="Class",
             order_key="class_id",
-            title=self.title,
             figsize=(figsize_x, figsize_y),
             x_ticks_rotation=None,
             labels_key="split",
@@ -80,18 +84,12 @@ def aggregate(self) -> Feature:
             data=df_class_count,
             plot_options=plot_options,
             json=json,
+            title="Class Frequency",
+            description=(
+                "This bar plot represents the frequency of appearance of each class. "
+                "This may highlight class distribution gap between training and validation splits. "
+                "For instance, if one of the class only appears in the validation set, you know in advance that your model won't be able to "
+                "learn to predict that class."
+            ),
         )
         return feature
-
-    @property
-    def title(self) -> str:
-        return "Class Frequency"
-
-    @property
-    def description(self) -> str:
-        return (
-            "This bar plot represents the frequency of appearance of each class. "
-            "This may highlight class distribution gap between training and validation splits. "
-            "For instance, if one of the class only appears in the validation set, you know in advance that your model won't be able to "
-            "learn to predict that class."
-        )
diff --git a/src/data_gradients/feature_extractors/classification/summary.py b/src/data_gradients/feature_extractors/classification/summary.py
@@ -24,7 +24,13 @@ class ClassificationBasicStatistics:
 
 @register_feature_extractor()
 class ClassificationSummaryStats(AbstractFeatureExtractor):
-    """Extracts general summary statistics from images."""
+    """
+    Gathers basic statistical data from the dataset.
+
+    This extractor compiles essential statistics from the image samples. It counts the number of images, annotations, and classes,
+    assesses the diversity of image resolutions, and measures the size of annotations. This data is crucial for getting a high-level
+    overview of the dataset's characteristics and composition.
+    """
 
     def __init__(self):
         super().__init__()
@@ -62,17 +68,11 @@ def aggregate(self) -> Feature:
             data=None,
             plot_options=None,
             json=json_res,
+            title="General Statistics",
+            description=self.template.render(**self.stats),
         )
         return feature
 
-    @property
-    def title(self) -> str:
-        return "General Statistics"
-
-    @property
-    def description(self) -> str:
-        return self.template.render(**self.stats)
-
     @staticmethod
     def format_resolution(array: np.ndarray) -> str:
         return "x".join([str(int(x)) for x in array])
diff --git a/src/data_gradients/feature_extractors/common/heatmap.py b/src/data_gradients/feature_extractors/common/heatmap.py
@@ -41,7 +41,27 @@ def aggregate(self) -> Feature:
                     normalized_heatmaps_per_split_per_cls[class_name][split] = (255 * (heatmap / (heatmap.max() + 1e-6))).astype(np.uint8)
 
         fig = combine_images_per_split_per_class(images_per_split_per_class=normalized_heatmaps_per_split_per_cls, n_cols=self.n_cols)
-        plot_options = FigureRenderer(title=self.title)
+        plot_options = FigureRenderer()
         json = {class_name: "No Data" for class_name in normalized_heatmaps_per_split_per_cls.keys()}
 
-        return Feature(data=fig, plot_options=plot_options, json=json)
+        feature = Feature(
+            data=fig,
+            plot_options=plot_options,
+            json=json,
+            title=self._generate_title(),
+            description=self._generate_description(),
+            notice=self._generate_notice(),
+        )
+        return feature
+
+    @abstractmethod
+    def _generate_title(self) -> str:
+        ...
+
+    @abstractmethod
+    def _generate_description(self) -> str:
+        ...
+
+    @abstractmethod
+    def _generate_notice(self) -> str:
+        ...
diff --git a/src/data_gradients/feature_extractors/common/image_average_brightness.py b/src/data_gradients/feature_extractors/common/image_average_brightness.py
@@ -10,7 +10,13 @@
 
 @register_feature_extractor()
 class ImagesAverageBrightness(AbstractFeatureExtractor):
-    """Extracts the distribution of the image 'brightness'."""
+    """
+    Provides a graphical representation of image brightness distribution.
+
+    This feature quantifies the brightness of images and plots the distribution per data split, aiding in the detection of
+    discrepancies such as non-uniform lighting conditions. Useful for comparing training and validation sets to ensure model robustness
+    against varying brightness levels.
+    """
 
     def __init__(self):
         self.image_channels = None
@@ -30,7 +36,6 @@ def aggregate(self) -> Feature:
                 x_label_name="Split",
                 y_label_key="brightness",
                 y_label_name="Average Brightness",
-                title=self.title,
                 x_ticks_rotation=None,
                 orient="v",
                 show_values=False,
@@ -39,7 +44,6 @@ def aggregate(self) -> Feature:
             plot_options = KDEPlotOptions(
                 x_label_key="brightness",
                 x_label_name="Average Brightness of Images",
-                title=self.title,
                 x_lim=(0, 255),
                 x_ticks_rotation=None,
                 labels_key="split",
@@ -54,17 +58,11 @@ def aggregate(self) -> Feature:
             data=df,
             plot_options=plot_options,
             json=json,
+            title="Image Brightness Distribution",
+            description=(
+                "This graph shows the distribution of the brightness levels across all images. \n"
+                "This may for instance uncover differences between the training and validation sets, "
+                "such as the presence of exclusively daytime images in the training set and nighttime images in the validation set."
+            ),
         )
         return feature
-
-    @property
-    def title(self) -> str:
-        return "Image Brightness Distribution"
-
-    @property
-    def description(self) -> str:
-        return (
-            "This graph shows the distribution of the brightness levels across all images. \n"
-            "This may for instance uncover differences between the training and validation sets, "
-            "such as the presence of exclusively daytime images in the training set and nighttime images in the validation set."
-        )
diff --git a/src/data_gradients/feature_extractors/common/image_color_distribution.py b/src/data_gradients/feature_extractors/common/image_color_distribution.py
@@ -10,7 +10,13 @@
 
 @register_feature_extractor()
 class ImageColorDistribution(AbstractFeatureExtractor):
-    """Extracts the distribution of the image 'brightness'."""
+    """
+    Analyzes and presents the color intensity distribution across image datasets.
+
+    This feature assesses the distribution of color intensities in images and provides detailed visualizations for each
+    color channel. It is designed to highlight differences and consistencies in color usage between training and
+    validation datasets, which can be critical for adjusting image preprocessing parameters or for enhancing data augmentation techniques.
+    """
 
     def __init__(self):
         self.image_channels = None
@@ -52,7 +58,6 @@ def aggregate(self) -> Feature:
             x_label_key="pixel_value",
             x_label_name="Color Intensity",
             weights="n",
-            title=self.title,
             x_lim=(0, 255),
             x_ticks_rotation=None,
             labels_key="Color",
@@ -74,17 +79,12 @@ def aggregate(self) -> Feature:
             data=df,
             plot_options=plot_options,
             json=json,
+            title="Color Distribution",
+            description=(
+                "Visualize the spread of color intensities with a frequency distribution for each channel, delineated from darkest (0) to brightest (255). "
+                "By comparing these distributions between training and validation sets, you can identify any significant variations that might affect model "
+                "performance. "
+                "For instance, if one dataset shows a higher concentration of darker values, it could suggest a need for lighting correction in preprocessing."
+            ),
         )
         return feature
-
-    @property
-    def title(self) -> str:
-        return "Color Distribution"
-
-    @property
-    def description(self) -> str:
-        return (
-            "Here's a comparison of image channel intensity (scaled 0-255) distributions across the entire dataset. \n"
-            "It can reveal discrepancies in the image characteristics between the two datasets, as well as potential flaws in the augmentation process. \n"
-            "E.g., a notable difference in the mean value of a specific color between the two datasets may indicate an issue with the augmentation process."
-        )