Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/sg 1070 Remove description property -> include it in update instead + add docstrings #206

Merged
merged 3 commits into from
Nov 8, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
343 changes: 264 additions & 79 deletions documentation/feature_description.md

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,11 @@ class Feature:

json: Union[dict, list]

title: str
description: str
notice: Optional[str] = None
warning: Optional[str] = None


class AbstractFeatureExtractor(ABC):
@abstractmethod
Expand All @@ -30,22 +35,6 @@ def update(self, sample: ImageSample):
def aggregate(self) -> Feature:
raise NotImplementedError()

@property
def description(self) -> str:
raise NotImplementedError()

@property
def title(self) -> str:
raise NotImplementedError()

@property
def notice(self) -> Optional[str]:
return None

@property
def warning(self) -> Optional[str]:
return None

def setup_data_sources(self, tran_data: Iterable, val_data: Iterable):
"""
Called in AnalysisManagerAbstract.__init__ for the purpose of exposing tran_data and val_data
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,19 @@

@register_feature_extractor()
class ClassificationClassDistributionVsArea(AbstractFeatureExtractor):
"""Feature Extractor to show image area vs image class violin plot."""
"""
Summarizes how average image dimensions vary among classes and data splits.
This feature extractor calculates the mean image size (width and height) for each label within the provided splits of the dataset.
It highlights potential discrepancies in image resolutions across different classes and dataset splits, which could impact model performance.
Disparities in image sizes could indicate a need for more uniform data collection or preprocessing to avoid model biases and ensure consistent
performance across all classes and splits.
Key Uses:
- Pinpointing classes with significant variations in image resolution to inform data collection and preprocessing.
- Assessing the consistency of image resolutions across dataset splits to guide training strategies and augmentation techniques.
"""

def __init__(self):
self.data = []
Expand Down Expand Up @@ -43,12 +55,9 @@ def aggregate(self) -> Feature:
y_label_key="class_name",
y_label_name="Class",
order_key="class_id",
title=self.title,
figsize=(figsize_x, figsize_y),
# x_lim=(0, df_class_count["n_appearance"].max() * 1.2),
x_ticks_rotation=None,
labels_key="split" if num_splits > 1 else None,
# orient="h",
tight_layout=True,
)

Expand All @@ -60,20 +69,14 @@ def aggregate(self) -> Feature:
data=df,
plot_options=plot_options,
json=json,
title="Image size distribution per class",
description=(
"Distribution of image size (mean value of image width & height) with respect to assigned image label and (when possible) a split.\n"
"This may highlight issues when classes in train/val has different image resolution which may negatively affect the accuracy of the model.\n"
"If you see a large difference in image size between classes and splits - you may need to adjust data collection process or training regime:\n"
" - When splitting data into train/val/test - make sure that the image size distribution is similar between splits.\n"
" - If size distribution overlap between splits to too big - "
"you can address this (to some extent) by using more agressize values for zoom-in/zoo-out augmentation at training time.\n"
),
)
return feature

@property
def title(self) -> str:
return "Image size distribution per class"

@property
def description(self) -> str:
return (
"Distribution of image size (mean value of image width & height) with respect to assigned image label and (when possible) a split.\n"
"This may highlight issues when classes in train/val has different image resolution which may negatively affect the accuracy of the model.\n"
"If you see a large difference in image size between classes and splits - you may need to adjust data collection process or training regime:\n"
" - When splitting data into train/val/test - make sure that the image size distribution is similar between splits.\n"
" - If size distribution overlap between splits to too big - "
"you can address this (to some extent) by using more agressize values for zoom-in/zoo-out augmentation at training time.\n"
)
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,20 @@

@register_feature_extractor()
class ClassificationClassDistributionVsAreaPlot(AbstractFeatureExtractor):
"""Feature Extractor to show scatter plot of width & height distribution
with breakdown along image class and split."""
"""
Visualizes the spread of image widths and heights within each class and across data splits.

This feature extractor creates a scatter plot to graphically represent the diversity of image dimensions associated with each class label and split
in the dataset.
By visualizing this data, users can quickly assess whether certain classes or splits contain images that are consistently larger or smaller than others,
potentially indicating a need for data preprocessing or augmentation strategies to ensure model robustness.

Key Uses:

- Identifying classes with notably different average image sizes that may influence model training.
- Detecting splits in the dataset where image size distribution is uneven, prompting the need for more careful split strategies or
tailored data augmentation.
"""

def __init__(self):
self.data = []
Expand All @@ -35,7 +47,6 @@ def aggregate(self) -> Feature:
x_label_name="Image width (px)",
y_label_key="image_rows",
y_label_name="Image height (px)",
title=self.title,
figsize=(10, 10),
x_ticks_rotation=None,
labels_key="class_name",
Expand All @@ -57,20 +68,14 @@ def aggregate(self) -> Feature:
data=df,
plot_options=plot_options,
json=json,
title="Image size distribution per class",
description=(
"Distribution of image size (mean value of image width & height) with respect to assigned image label and (when possible) a split.\n"
"This may highlight issues when classes in train/val has different image resolution which may negatively affect the accuracy of the model.\n"
"If you see a large difference in image size between classes and splits - you may need to adjust data collection process or training regime:\n"
" - When splitting data into train/val/test - make sure that the image size distribution is similar between splits.\n"
" - If size distribution overlap between splits to too big - "
"you can address this (to some extent) by using more agressize values for zoom-in/zoo-out augmentation at training time.\n"
),
)
return feature

@property
def title(self) -> str:
return "Image size distribution per class"

@property
def description(self) -> str:
return (
"Distribution of image size (mean value of image width & height) with respect to assigned image label and (when possible) a split.\n"
"This may highlight issues when classes in train/val has different image resolution which may negatively affect the accuracy of the model.\n"
"If you see a large difference in image size between classes and splits - you may need to adjust data collection process or training regime:\n"
" - When splitting data into train/val/test - make sure that the image size distribution is similar between splits.\n"
" - If size distribution overlap between splits to too big - "
"you can address this (to some extent) by using more agressize values for zoom-in/zoo-out augmentation at training time.\n"
)
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,12 @@

@register_feature_extractor()
class ClassificationClassFrequency(AbstractFeatureExtractor):
"""Feature Extractor to count the number of labels of each class."""
"""
Analyzes and visualizes the frequency of each class label across different dataset splits.

This feature extractor computes the frequency of occurrence for each class label in the dataset, providing insights into the
balance or imbalance of class distribution across training and validation.
"""

def __init__(self, topk: Optional[int] = None, prioritization_mode: str = "train_val_diff"):
"""
Expand Down Expand Up @@ -66,7 +71,6 @@ def aggregate(self) -> Feature:
y_label_key="class_name",
y_label_name="Class",
order_key="class_id",
title=self.title,
figsize=(figsize_x, figsize_y),
x_ticks_rotation=None,
labels_key="split",
Expand All @@ -80,18 +84,12 @@ def aggregate(self) -> Feature:
data=df_class_count,
plot_options=plot_options,
json=json,
title="Class Frequency",
description=(
"This bar plot represents the frequency of appearance of each class. "
"This may highlight class distribution gap between training and validation splits. "
"For instance, if one of the class only appears in the validation set, you know in advance that your model won't be able to "
"learn to predict that class."
),
)
return feature

@property
def title(self) -> str:
return "Class Frequency"

@property
def description(self) -> str:
return (
"This bar plot represents the frequency of appearance of each class. "
"This may highlight class distribution gap between training and validation splits. "
"For instance, if one of the class only appears in the validation set, you know in advance that your model won't be able to "
"learn to predict that class."
)
18 changes: 9 additions & 9 deletions src/data_gradients/feature_extractors/classification/summary.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,13 @@ class ClassificationBasicStatistics:

@register_feature_extractor()
class ClassificationSummaryStats(AbstractFeatureExtractor):
"""Extracts general summary statistics from images."""
"""
Gathers basic statistical data from the dataset.

This extractor compiles essential statistics from the image samples. It counts the number of images, annotations, and classes,
assesses the diversity of image resolutions, and measures the size of annotations. This data is crucial for getting a high-level
overview of the dataset's characteristics and composition.
"""

def __init__(self):
super().__init__()
Expand Down Expand Up @@ -62,17 +68,11 @@ def aggregate(self) -> Feature:
data=None,
plot_options=None,
json=json_res,
title="General Statistics",
description=self.template.render(**self.stats),
)
return feature

@property
def title(self) -> str:
return "General Statistics"

@property
def description(self) -> str:
return self.template.render(**self.stats)

@staticmethod
def format_resolution(array: np.ndarray) -> str:
return "x".join([str(int(x)) for x in array])
24 changes: 22 additions & 2 deletions src/data_gradients/feature_extractors/common/heatmap.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,27 @@ def aggregate(self) -> Feature:
normalized_heatmaps_per_split_per_cls[class_name][split] = (255 * (heatmap / (heatmap.max() + 1e-6))).astype(np.uint8)

fig = combine_images_per_split_per_class(images_per_split_per_class=normalized_heatmaps_per_split_per_cls, n_cols=self.n_cols)
plot_options = FigureRenderer(title=self.title)
plot_options = FigureRenderer()
json = {class_name: "No Data" for class_name in normalized_heatmaps_per_split_per_cls.keys()}

return Feature(data=fig, plot_options=plot_options, json=json)
feature = Feature(
data=fig,
plot_options=plot_options,
json=json,
title=self._generate_title(),
description=self._generate_description(),
notice=self._generate_notice(),
)
return feature

@abstractmethod
def _generate_title(self) -> str:
    """Return the title for the heatmap feature.

    Called by ``aggregate``, which passes the result as ``title`` when building the ``Feature``.
    Concrete subclasses must override this hook.

    :return: Title text of the feature.
    """
    ...

@abstractmethod
def _generate_description(self) -> str:
    """Return the description for the heatmap feature.

    Called by ``aggregate``, which passes the result as ``description`` when building the ``Feature``.
    Concrete subclasses must override this hook.

    :return: Description text of the feature.
    """
    ...

@abstractmethod
def _generate_notice(self) -> str:
    """Return the notice for the heatmap feature.

    Called by ``aggregate``, which passes the result as ``notice`` when building the ``Feature``.
    Concrete subclasses must override this hook.

    NOTE(review): ``Feature.notice`` is declared ``Optional[str]``; whether subclasses may return
    ``None`` here despite the ``str`` annotation is not shown - confirm against the implementations.

    :return: Notice text of the feature.
    """
    ...
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,13 @@

@register_feature_extractor()
class ImagesAverageBrightness(AbstractFeatureExtractor):
"""Extracts the distribution of the image 'brightness'."""
"""
Provides a graphical representation of image brightness distribution.
This feature quantifies the brightness of images and plots the distribution per data split, aiding in the detection of
variations in lighting conditions. Useful for comparing training and validation sets to ensure model robustness
against varying brightness levels.
"""

def __init__(self):
self.image_channels = None
Expand All @@ -30,7 +36,6 @@ def aggregate(self) -> Feature:
x_label_name="Split",
y_label_key="brightness",
y_label_name="Average Brightness",
title=self.title,
x_ticks_rotation=None,
orient="v",
show_values=False,
Expand All @@ -39,7 +44,6 @@ def aggregate(self) -> Feature:
plot_options = KDEPlotOptions(
x_label_key="brightness",
x_label_name="Average Brightness of Images",
title=self.title,
x_lim=(0, 255),
x_ticks_rotation=None,
labels_key="split",
Expand All @@ -54,17 +58,11 @@ def aggregate(self) -> Feature:
data=df,
plot_options=plot_options,
json=json,
title="Image Brightness Distribution",
description=(
"This graph shows the distribution of the brightness levels across all images. \n"
"This may for instance uncover differences between the training and validation sets, "
"such as the presence of exclusively daytime images in the training set and nighttime images in the validation set."
),
)
return feature

@property
def title(self) -> str:
return "Image Brightness Distribution"

@property
def description(self) -> str:
return (
"This graph shows the distribution of the brightness levels across all images. \n"
"This may for instance uncover differences between the training and validation sets, "
"such as the presence of exclusively daytime images in the training set and nighttime images in the validation set."
)
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,13 @@

@register_feature_extractor()
class ImageColorDistribution(AbstractFeatureExtractor):
"""Extracts the distribution of the image 'brightness'."""
"""
Analyzes and presents the color intensity distribution across image datasets.
This feature assesses the distribution of color intensities in images and provides detailed visualizations for each
color channel. It is designed to highlight differences and consistencies in color usage between training and
validation datasets, which can be critical for adjusting image preprocessing parameters or for enhancing data augmentation techniques.
"""

def __init__(self):
self.image_channels = None
Expand Down Expand Up @@ -52,7 +58,6 @@ def aggregate(self) -> Feature:
x_label_key="pixel_value",
x_label_name="Color Intensity",
weights="n",
title=self.title,
x_lim=(0, 255),
x_ticks_rotation=None,
labels_key="Color",
Expand All @@ -74,17 +79,12 @@ def aggregate(self) -> Feature:
data=df,
plot_options=plot_options,
json=json,
title="Color Distribution",
description=(
"Visualize the spread of color intensities with a frequency distribution for each channel, delineated from darkest (0) to brightest (255). "
"By comparing these distributions between training and validation sets, you can identify any significant variations that might affect model "
"performance. "
"For instance, if one dataset shows a higher concentration of darker values, it could suggest a need for lighting correction in preprocessing."
),
)
return feature

@property
def title(self) -> str:
return "Color Distribution"

@property
def description(self) -> str:
return (
"Here's a comparison of image channel intensity (scaled 0-255) distributions across the entire dataset. \n"
"It can reveal discrepancies in the image characteristics between the two datasets, as well as potential flaws in the augmentation process. \n"
"E.g., a notable difference in the mean value of a specific color between the two datasets may indicate an issue with the augmentation process."
)
Loading