This repository has been archived by the owner on Oct 9, 2023. It is now read-only.

Commit

Merge branch 'master' into bugfix/speech_backbone
krshrimali authored Apr 7, 2022
2 parents e9966c9 + 1d926b5 commit 890f183
Showing 42 changed files with 288 additions and 421 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
@@ -12,6 +12,10 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).

- Added `available_outputs` method to the `Task` ([#1206](https://github.com/PyTorchLightning/lightning-flash/pull/1206))

- Added support for using the `ImageEmbedder` SSL training for all image classifier backbones ([#1264](https://github.com/PyTorchLightning/lightning-flash/pull/1264))

- Added support for audio file formats to `AudioClassificationData` ([#1085](https://github.com/PyTorchLightning/lightning-flash/pull/1085))

### Changed

### Deprecated
9 changes: 5 additions & 4 deletions docs/extensions/stability.py
@@ -22,7 +22,7 @@
<p class="admonition-title">{title}</p>
<p>
-    This {scope} is currently in Beta. The API and functionality may change without warning in future
+    {message} The API and functionality may change without warning in future
releases. :ref:`More details <stability>`.
.. raw:: html
@@ -33,15 +33,16 @@


class Beta(Directive):
-    has_content = True
+    has_content = False
required_arguments = 1
optional_arguments = 0
final_argument_whitespace = True

def run(self):

-        scope = self.arguments[0]
+        message = self.arguments[-1].strip()

-        admonition_rst = ADMONITION_TEMPLATE.format(type="beta", title="Beta", scope=scope)
+        admonition_rst = ADMONITION_TEMPLATE.format(type="beta", title="Beta", message=message)
admonition_list = StringList(admonition_rst.split("\n"))
admonition = nodes.paragraph()
self.state.nested_parse(admonition_list, self.content_offset, admonition)
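The reworked directive now receives a free-form message as its final argument. A minimal sketch of the data flow (the template below is abridged from the diff, and `render_beta` is a hypothetical helper; the Sphinx plumbing is omitted):

```python
# Sketch of how the updated Beta directive formats its message argument.
# ADMONITION_TEMPLATE is abridged; the real one lives in docs/extensions/stability.py.
ADMONITION_TEMPLATE = """.. raw:: html

    <div class="admonition warning {type}">
        <p class="admonition-title">{title}</p>
        <p>

{message} The API and functionality may change without warning in future
releases. :ref:`More details <stability>`.
"""


def render_beta(arguments):
    # final_argument_whitespace=True means the whole sentence arrives as one argument.
    message = arguments[-1].strip()
    return ADMONITION_TEMPLATE.format(type="beta", title="Beta", message=message)


print(render_beta(["Flash Zero is currently in Beta."]))
```

This is why each `.. beta::` line in the docs below can carry its own sentence instead of a fixed `{scope}` slot.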
2 changes: 2 additions & 0 deletions docs/source/general/flash_zero.rst
@@ -1,3 +1,5 @@
.. beta:: Flash Zero is currently in Beta.

.. _flash_zero:

**********
2 changes: 2 additions & 0 deletions docs/source/integrations/baal.rst
@@ -1,3 +1,5 @@
.. beta:: The BaaL integration is currently in Beta.

.. _baal:

####
2 changes: 2 additions & 0 deletions docs/source/integrations/learn2learn.rst
@@ -1,3 +1,5 @@
.. beta:: The Learn2Learn integration is currently in Beta.

.. _learn2learn:

###########
2 changes: 2 additions & 0 deletions docs/source/integrations/vissl.rst
@@ -1,3 +1,5 @@
.. beta:: The VISSL integration is currently in Beta.

.. _vissl:

#####
8 changes: 3 additions & 5 deletions docs/source/reference/image_embedder.rst
@@ -3,6 +3,9 @@
:card_description: Learn to generate embeddings from images with Flash.
:image: https://pl-flash-data.s3.amazonaws.com/assets/thumbnails/image_embedder.svg
:tags: Image,Embedding
:beta:

.. beta:: The VISSL integration is currently in Beta.

.. warning::

@@ -44,10 +47,6 @@ Here's the full example:
To learn how to view the available backbones / heads for this task, see :ref:`backbones_heads`.
You can view the available training strategies with the :meth:`~flash.image.embedding.model.ImageEmbedder.available_training_strategies` method.

-.. note::
-
-    The ``"dino"`` training strategy only supports single GPU training with ``strategy="ddp"``.

The ``head`` and ``pretraining_transform`` arguments should match the choice of ``training_strategy`` following this table:

===================== ===================== ==========================
@@ -56,5 +55,4 @@
``simclr`` ``simclr_head`` ``simclr_transform``
``barlow_twins`` ``barlow_twins_head`` ``barlow_twins_transform``
``swav`` ``swav_head`` ``swav_transform``
-``dino``              ``dino_head``         ``dino_transform``
===================== ===================== ==========================
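After this commit the pairing reduces to three strategies (``dino`` is dropped). A hedged sketch of the table above as a plain lookup (``components_for`` is a hypothetical helper, not part of the Flash API):

```python
# The training_strategy -> (head, pretraining_transform) pairing from the
# table above, post-commit (dino removed). Illustrative helper only.
STRATEGY_COMPONENTS = {
    "simclr": ("simclr_head", "simclr_transform"),
    "barlow_twins": ("barlow_twins_head", "barlow_twins_transform"),
    "swav": ("swav_head", "swav_transform"),
}


def components_for(training_strategy: str) -> tuple:
    try:
        return STRATEGY_COMPONENTS[training_strategy]
    except KeyError:
        raise ValueError(f"Unknown training_strategy: {training_strategy!r}")


head, transform = components_for("swav")
print(head, transform)  # swav_head swav_transform
```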
2 changes: 1 addition & 1 deletion docs/source/reference/pointcloud_object_detection.rst
@@ -5,7 +5,7 @@
:tags: Point-Cloud,Detection
:beta:

-.. beta:: task
+.. beta:: Point cloud object detection is currently in Beta.

.. _pointcloud_object_detection:

2 changes: 1 addition & 1 deletion docs/source/reference/pointcloud_segmentation.rst
@@ -5,7 +5,7 @@
:tags: Point-Cloud,Segmentation
:beta:

-.. beta:: task
+.. beta:: Point cloud segmentation is currently in Beta.

.. _pointcloud_segmentation:

3 changes: 3 additions & 0 deletions docs/source/reference/style_transfer.rst
@@ -3,6 +3,9 @@
:card_description: Learn about image style transfer with Flash and build an example which transfers style from The Starry Night to images from the COCO data set.
:image: https://pl-flash-data.s3.amazonaws.com/assets/thumbnails/style_transfer.svg
:tags: Image,Style-Transfer
:beta:

.. beta:: Style transfer is currently in Beta.

.. _style_transfer:

48 changes: 40 additions & 8 deletions flash/audio/classification/data.py
@@ -57,6 +57,8 @@ def from_files(
test_files: Optional[Sequence[str]] = None,
test_targets: Optional[Sequence[Any]] = None,
predict_files: Optional[Sequence[str]] = None,
sampling_rate: int = 16000,
n_fft: int = 400,
input_cls: Type[Input] = AudioClassificationFilesInput,
transform: INPUT_TRANSFORM_TYPE = AudioClassificationInputTransform,
transform_kwargs: Optional[Dict] = None,
@@ -66,8 +68,10 @@
"""Load the :class:`~flash.audio.classification.data.AudioClassificationData` from lists of files and
corresponding lists of targets.
-        The supported file extensions are: ``.jpg``, ``.jpeg``, ``.png``, ``.ppm``, ``.bmp``, ``.pgm``, ``.tif``,
-        ``.tiff``, ``.webp``, and ``.npy``.
+        The supported file extensions for precomputed spectrograms are: ``.jpg``, ``.jpeg``, ``.png``, ``.ppm``,
+        ``.bmp``, ``.pgm``, ``.tif``, ``.tiff``, ``.webp``, and ``.npy``.
+        The supported file extensions for raw audio (where spectrograms will be computed automatically) are: ``.wav``,
+        ``.ogg``, ``.flac``, ``.mat``, and ``.mp3``.
The targets can be in any of our
:ref:`supported classification target formats <formatting_classification_targets>`.
To learn how to customize the transforms applied for each stage, read our
@@ -81,6 +85,8 @@
test_files: The list of spectrogram image files to use when testing.
test_targets: The list of targets to use when testing.
predict_files: The list of spectrogram image files to use when predicting.
sampling_rate: Sampling rate to use when loading raw audio files.
n_fft: The size of the FFT to use when creating spectrograms from raw audio.
target_formatter: Optionally provide a :class:`~flash.core.data.utilities.classification.TargetFormatter` to
control how targets are handled. See :ref:`formatting_classification_targets` for more details.
input_cls: The :class:`~flash.core.data.io.input.Input` type to use for loading the data.
@@ -137,6 +143,8 @@ def from_files(
"""

ds_kw = dict(
sampling_rate=sampling_rate,
n_fft=n_fft,
target_formatter=target_formatter,
)
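To illustrate what the new ``sampling_rate`` and ``n_fft`` arguments control when raw audio is converted, here is a hedged NumPy sketch of a magnitude spectrogram (defaults mirror the signature above; Flash's actual transform may differ in windowing and hop size):

```python
import numpy as np


def spectrogram(waveform: np.ndarray, n_fft: int = 400, hop: int = 200) -> np.ndarray:
    """Magnitude spectrogram via a basic Hann-windowed STFT (illustrative only)."""
    window = np.hanning(n_fft)
    frames = [
        waveform[start:start + n_fft] * window
        for start in range(0, len(waveform) - n_fft + 1, hop)
    ]
    # rfft gives n_fft // 2 + 1 frequency bins per frame.
    return np.abs(np.fft.rfft(np.stack(frames), axis=1)).T


sampling_rate = 16000  # matches the new default above
one_second = np.sin(2 * np.pi * 440 * np.arange(sampling_rate) / sampling_rate)
spec = spectrogram(one_second)
print(spec.shape)  # (201, 79): n_fft // 2 + 1 bins by 79 frames
```

A larger ``n_fft`` buys finer frequency resolution at the cost of coarser time resolution; ``sampling_rate`` fixes which physical frequency each bin maps to.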

@@ -160,6 +168,8 @@ def from_folders(
val_folder: Optional[str] = None,
test_folder: Optional[str] = None,
predict_folder: Optional[str] = None,
sampling_rate: int = 16000,
n_fft: int = 400,
input_cls: Type[Input] = AudioClassificationFolderInput,
transform: INPUT_TRANSFORM_TYPE = AudioClassificationInputTransform,
transform_kwargs: Optional[Dict] = None,
@@ -169,8 +179,10 @@
"""Load the :class:`~flash.audio.classification.data.AudioClassificationData` from folders containing
spectrogram images.
-        The supported file extensions are: ``.jpg``, ``.jpeg``, ``.png``, ``.ppm``, ``.bmp``, ``.pgm``, ``.tif``,
-        ``.tiff``, ``.webp``, and ``.npy``.
+        The supported file extensions for precomputed spectrograms are: ``.jpg``, ``.jpeg``, ``.png``, ``.ppm``,
+        ``.bmp``, ``.pgm``, ``.tif``, ``.tiff``, ``.webp``, and ``.npy``.
+        The supported file extensions for raw audio (where spectrograms will be computed automatically) are: ``.wav``,
+        ``.ogg``, ``.flac``, ``.mat``, and ``.mp3``.
For train, test, and validation data, the folders are expected to contain a sub-folder for each class.
Here's the required structure:
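(The tree diagram itself is collapsed in this view.) As a hedged illustration of the one-sub-folder-per-class convention, with hypothetical class names, targets can be read straight from the parent folder:

```python
import tempfile
from pathlib import Path

# Build a tiny train/<class>/<file> layout (illustration only, not Flash's loader).
root = Path(tempfile.mkdtemp())
for cls in ("dog", "cat"):
    (root / "train" / cls).mkdir(parents=True)
    (root / "train" / cls / "sample_0.npy").touch()

files, targets = [], []
for path in sorted((root / "train").rglob("*.npy")):
    files.append(path)
    targets.append(path.parent.name)  # class label = parent folder name

print(targets)  # ['cat', 'dog']
```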
@@ -203,6 +215,8 @@
val_folder: The folder containing spectrogram images to use when validating.
test_folder: The folder containing spectrogram images to use when testing.
predict_folder: The folder containing spectrogram images to use when predicting.
sampling_rate: Sampling rate to use when loading raw audio files.
n_fft: The size of the FFT to use when creating spectrograms from raw audio.
target_formatter: Optionally provide a :class:`~flash.core.data.utilities.classification.TargetFormatter` to
control how targets are handled. See :ref:`formatting_classification_targets` for more details.
input_cls: The :class:`~flash.core.data.io.input.Input` type to use for loading the data.
@@ -262,6 +276,8 @@ def from_folders(
"""

ds_kw = dict(
sampling_rate=sampling_rate,
n_fft=n_fft,
target_formatter=target_formatter,
)

@@ -471,6 +487,8 @@ def from_data_frame(
predict_data_frame: Optional[pd.DataFrame] = None,
predict_images_root: Optional[str] = None,
predict_resolver: Optional[Callable[[str, str], str]] = None,
sampling_rate: int = 16000,
n_fft: int = 400,
input_cls: Type[Input] = AudioClassificationDataFrameInput,
transform: INPUT_TRANSFORM_TYPE = AudioClassificationInputTransform,
transform_kwargs: Optional[Dict] = None,
@@ -481,8 +499,10 @@
containing spectrogram image file paths and their corresponding targets.
Input spectrogram image paths will be extracted from the ``input_field`` in the DataFrame.
-        The supported file extensions are: ``.jpg``, ``.jpeg``, ``.png``, ``.ppm``, ``.bmp``, ``.pgm``, ``.tif``,
-        ``.tiff``, ``.webp``, and ``.npy``.
+        The supported file extensions for precomputed spectrograms are: ``.jpg``, ``.jpeg``, ``.png``, ``.ppm``,
+        ``.bmp``, ``.pgm``, ``.tif``, ``.tiff``, ``.webp``, and ``.npy``.
+        The supported file extensions for raw audio (where spectrograms will be computed automatically) are: ``.wav``,
+        ``.ogg``, ``.flac``, ``.mat``, and ``.mp3``.
The targets will be extracted from the ``target_fields`` in the DataFrame and can be in any of our
:ref:`supported classification target formats <formatting_classification_targets>`.
To learn how to customize the transforms applied for each stage, read our
@@ -507,6 +527,8 @@
predict_images_root: The root directory containing predict spectrogram images.
predict_resolver: Optionally provide a function which converts an entry from the ``input_field`` into a
spectrogram image file path.
sampling_rate: Sampling rate to use when loading raw audio files.
n_fft: The size of the FFT to use when creating spectrograms from raw audio.
target_formatter: Optionally provide a :class:`~flash.core.data.utilities.classification.TargetFormatter` to
control how targets are handled. See :ref:`formatting_classification_targets` for more details.
input_cls: The :class:`~flash.core.data.io.input.Input` type to use for loading the data.
@@ -585,6 +607,8 @@ def from_data_frame(
"""

ds_kw = dict(
sampling_rate=sampling_rate,
n_fft=n_fft,
target_formatter=target_formatter,
)
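As a hedged sketch of the DataFrame shape ``from_data_frame`` expects (column names here are illustrative, and ``resolver`` mimics the role of the ``*_resolver`` arguments):

```python
import pandas as pd

# Hypothetical input_field ("file") and target_fields ("target") columns.
df = pd.DataFrame(
    {
        "file": ["yes_01.wav", "no_01.wav", "yes_02.png"],
        "target": ["yes", "no", "yes"],
    }
)


def resolver(root: str, file_name: str) -> str:
    # A resolver maps an input_field entry to a full file path.
    return f"{root}/{file_name}"


paths = [resolver("data/clips", name) for name in df["file"]]
print(paths[0])  # data/clips/yes_01.wav
```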

@@ -623,6 +647,8 @@ def from_csv(
predict_file: Optional[str] = None,
predict_images_root: Optional[str] = None,
predict_resolver: Optional[Callable[[PATH_TYPE, Any], PATH_TYPE]] = None,
sampling_rate: int = 16000,
n_fft: int = 400,
input_cls: Type[Input] = AudioClassificationCSVInput,
transform: INPUT_TRANSFORM_TYPE = AudioClassificationInputTransform,
transform_kwargs: Optional[Dict] = None,
@@ -633,8 +659,10 @@
spectrogram image file paths and their corresponding targets.
Input spectrogram images will be extracted from the ``input_field`` column in the CSV files.
-        The supported file extensions are: ``.jpg``, ``.jpeg``, ``.png``, ``.ppm``, ``.bmp``, ``.pgm``, ``.tif``,
-        ``.tiff``, ``.webp``, and ``.npy``.
+        The supported file extensions for precomputed spectrograms are: ``.jpg``, ``.jpeg``, ``.png``, ``.ppm``,
+        ``.bmp``, ``.pgm``, ``.tif``, ``.tiff``, ``.webp``, and ``.npy``.
+        The supported file extensions for raw audio (where spectrograms will be computed automatically) are: ``.wav``,
+        ``.ogg``, ``.flac``, ``.mat``, and ``.mp3``.
The targets will be extracted from the ``target_fields`` in the CSV files and can be in any of our
:ref:`supported classification target formats <formatting_classification_targets>`.
To learn how to customize the transforms applied for each stage, read our
@@ -659,6 +687,8 @@
predict_images_root: The root directory containing predict spectrogram images.
predict_resolver: Optionally provide a function which converts an entry from the ``input_field`` into a
spectrogram image file path.
sampling_rate: Sampling rate to use when loading raw audio files.
n_fft: The size of the FFT to use when creating spectrograms from raw audio.
target_formatter: Optionally provide a :class:`~flash.core.data.utilities.classification.TargetFormatter` to
control how targets are handled. See :ref:`formatting_classification_targets` for more details.
input_cls: The :class:`~flash.core.data.io.input.Input` type to use for loading the data.
@@ -747,6 +777,8 @@ def from_csv(
"""

ds_kw = dict(
sampling_rate=sampling_rate,
n_fft=n_fft,
target_formatter=target_formatter,
)
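A hedged sketch of the CSV layout ``from_csv`` consumes, using only the standard library (the "file" and "target" column names are illustrative stand-ins for ``input_field`` and ``target_fields``):

```python
import csv
import io

# Paths may point at precomputed spectrogram images or raw audio files.
raw = """file,target
clips/yes_01.wav,yes
clips/no_01.wav,no
spectrograms/yes_02.png,yes
"""

rows = list(csv.DictReader(io.StringIO(raw)))
files = [row["file"] for row in rows]
targets = [row["target"] for row in rows]
print(files[0], targets)  # clips/yes_01.wav ['yes', 'no', 'yes']
```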
