From 983a888f498131911a1296400448b004a53c1717 Mon Sep 17 00:00:00 2001 From: Ethan Harris Date: Fri, 19 Mar 2021 16:26:58 +0000 Subject: [PATCH 01/25] Fix all_gather for tpu_cores=8 (#6587) --- CHANGELOG.md | 3 +++ pytorch_lightning/accelerators/tpu.py | 8 ++++---- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e59d48fbc8665..d2a0d3641b40b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -164,6 +164,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Fixed when Train loop config was run during `Trainer.predict` ([#6541](https://github.com/PyTorchLightning/pytorch-lightning/pull/6541)) +- Fixed a bug where `all_gather` would not work correctly with `tpu_cores=8` ([#6587](https://github.com/PyTorchLightning/pytorch-lightning/pull/6587)) + + ## [1.2.3] - 2021-03-09 ### Fixed diff --git a/pytorch_lightning/accelerators/tpu.py b/pytorch_lightning/accelerators/tpu.py index 57e65a62f6783..5c4fb2815aa6d 100644 --- a/pytorch_lightning/accelerators/tpu.py +++ b/pytorch_lightning/accelerators/tpu.py @@ -46,12 +46,12 @@ def all_gather(self, tensor: torch.Tensor, group: Optional[Any] = None, sync_gra Function to gather a tensor from several distributed processes Args: tensor: tensor of shape (batch, ...) - group: the process group to gather results from. Defaults to all processes (world) - sync_grads: flag that allows users to synchronize gradients for all_gather op + group: not available with TPUs + sync_grads: not available with TPUs Return: A tensor of shape (world_size, batch, ...) """ # todo: Add support for backward with all_gather - if torch.distributed.is_initialized(): - return xm.all_gather(tensor, group=group, sync_grads=sync_grads) + if isinstance(self.training_type_plugin, TPUSpawnPlugin) and self.training_type_plugin.is_distributed: + return xm.all_gather(tensor).view(-1, *tensor.shape) return tensor From 87c03b10389bb88d1b1a4e5fbc40e8e02091fd04 Mon Sep 17 00:00:00 2001 From: Kaushik B <45285388+kaushikb11@users.noreply.github.com> Date: Sat, 20 Mar 2021 01:02:57 +0530 Subject: [PATCH 02/25] Update Gradient Clipping for TPU Accelerator (#6576) --- CHANGELOG.md | 3 ++ pytorch_lightning/accelerators/tpu.py | 18 +++++++++++- .../plugins/precision/precision_plugin.py | 1 - tests/models/test_tpu.py | 28 +++++++++++++++++++ 4 files changed, 48 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d2a0d3641b40b..bd8f5e31770d2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -167,6 +167,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Fixed a bug where `all_gather` would not work correctly with `tpu_cores=8` ([#6587](https://github.com/PyTorchLightning/pytorch-lightning/pull/6587)) +- Update Gradient Clipping for the TPU Accelerator ([#6576](https://github.com/PyTorchLightning/pytorch-lightning/pull/6576)) + + ## [1.2.3] - 2021-03-09 ### Fixed diff --git a/pytorch_lightning/accelerators/tpu.py b/pytorch_lightning/accelerators/tpu.py index 5c4fb2815aa6d..fb4af24c93505 100644 --- a/pytorch_lightning/accelerators/tpu.py +++ b/pytorch_lightning/accelerators/tpu.py @@ -1,4 +1,4 @@ -from typing import Any, Callable, Optional, TYPE_CHECKING +from typing import Any, Callable, Optional, TYPE_CHECKING, Union import torch from torch.optim import Optimizer @@ -12,6 +12,9 @@ if _XLA_AVAILABLE: import torch_xla.core.xla_model as xm + from torch_xla._patched_functions import clip_grad_norm_ + + xla_clip_grad_norm_ = clip_grad_norm_ if TYPE_CHECKING: from pytorch_lightning.core.lightning import LightningModule @@ -55,3 +58,16 @@ def all_gather(self, tensor: torch.Tensor, group: Optional[Any] = None, sync_gra if isinstance(self.training_type_plugin, TPUSpawnPlugin) and self.training_type_plugin.is_distributed: return xm.all_gather(tensor).view(-1, *tensor.shape) return tensor + + def clip_gradients(self, optimizer: Optimizer, clip_val: Union[float, int], norm_type: float = 2.0): + + model = self.lightning_module + parameters = model.parameters() + + grad_clip_val = float(clip_val) + if grad_clip_val <= 0: + return + + max_norm = grad_clip_val + + xla_clip_grad_norm_(parameters, max_norm, norm_type) diff --git a/pytorch_lightning/plugins/precision/precision_plugin.py b/pytorch_lightning/plugins/precision/precision_plugin.py index 2b1579cf497c0..7172d82391bd3 100644 --- a/pytorch_lightning/plugins/precision/precision_plugin.py +++ b/pytorch_lightning/plugins/precision/precision_plugin.py @@ -100,7 +100,6 @@ def post_optimizer_step(self, optimizer: 'Optimizer', optimizer_idx: int) -> Non def clip_gradients(self, optimizer: 'Optimizer', clip_val: Union[int, float], norm_type: float = 2.0) -> None: """Clips the gradients to a specific value""" - # TODO: separate TPU case from here if clip_val is None: return diff --git a/tests/models/test_tpu.py b/tests/models/test_tpu.py index 0c922c99149fa..5358b9f881048 100644 --- a/tests/models/test_tpu.py +++ b/tests/models/test_tpu.py @@ -355,3 +355,31 @@ def test_reduce(rank): assert result.item() == 8 xmp.spawn(test_reduce, nprocs=8, start_method='fork') + + +@pytest.mark.parametrize("clip_val", [0, 10]) +@RunIf(tpu=True) +@pl_multi_process_test +@mock.patch("pytorch_lightning.accelerators.tpu.xla_clip_grad_norm_") +def test_tpu_precision_16_clip_gradients(mock_clip_grad_norm, clip_val, tmpdir): + """ + Ensure that clip gradients is only called if the value is greater than 0. + """ + tutils.reset_seed() + trainer_options = dict( + default_root_dir=tmpdir, + progress_bar_refresh_rate=0, + max_epochs=1, + tpu_cores=1, + precision=16, + limit_train_batches=4, + limit_val_batches=4, + gradient_clip_val=clip_val, + ) + model = BoringModel() + tpipes.run_model_test(trainer_options, model, on_gpu=False, with_hpc=False) + + if clip_val > 0: + mock_clip_grad_norm.assert_called() + else: + mock_clip_grad_norm.assert_not_called() From 57807969310b618c307a1509c1a9b186f6487754 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Fri, 19 Mar 2021 22:25:46 +0100 Subject: [PATCH 03/25] NGC container PoC (#6187) * add NVIDIA flows * push * pull * ... * extras * ci prune * fix * tag * . 
* list --- .github/workflows/ci_dockers.yml | 18 ++---------------- .github/workflows/events-nightly.yml | 23 +++++++++++++++++++++++ .github/workflows/release-docker.yml | 26 +++++++++++++++++++++++++- dockers/nvidia/Dockerfile | 19 ++++++++++--------- dockers/release/Dockerfile | 1 - 5 files changed, 60 insertions(+), 27 deletions(-) diff --git a/.github/workflows/ci_dockers.yml b/.github/workflows/ci_dockers.yml index 9f77fb76aa593..897e16a12d44f 100644 --- a/.github/workflows/ci_dockers.yml +++ b/.github/workflows/ci_dockers.yml @@ -29,9 +29,6 @@ jobs: - name: Checkout uses: actions/checkout@v2 - # https://github.com/docker/setup-buildx-action - # Set up Docker Buildx - to use cache-from and cache-to argument of buildx command - - uses: docker/setup-buildx-action@v1 - name: Build PL Docker # publish master/release uses: docker/build-push-action@v2 @@ -54,9 +51,6 @@ jobs: - name: Checkout uses: actions/checkout@v2 - # https://github.com/docker/setup-buildx-action - # Set up Docker Buildx - to use cache-from and cache-to argument of buildx command - - uses: docker/setup-buildx-action@v1 - name: Build XLA Docker # publish master/release uses: docker/build-push-action@v2 @@ -93,9 +87,6 @@ jobs: echo "::set-output name=CUDA::$cuda" id: extend - # https://github.com/docker/setup-buildx-action - # Set up Docker Buildx - to use cache-from and cache-to argument of buildx command - - uses: docker/setup-buildx-action@v1 - name: Build CUDA Docker # publish master/release uses: docker/build-push-action@v2 @@ -130,9 +121,6 @@ jobs: echo "::set-output name=CUDA::$cuda" id: extend - # https://github.com/docker/setup-buildx-action - # Set up Docker Buildx - to use cache-from and cache-to argument of buildx command - - uses: docker/setup-buildx-action@v1 - name: Build CUDA Docker # publish master/release uses: docker/build-push-action@v2 @@ -150,10 +138,8 @@ jobs: steps: - name: Checkout uses: actions/checkout@v2 - # https://github.com/docker/setup-buildx-action - # Set up Docker Buildx - to use cache-from and cache-to argument of buildx command - - uses: docker/setup-buildx-action@v1 - - name: Build CUDA Docker + + - name: Build NVIDIA Docker uses: docker/build-push-action@v2 with: file: dockers/nvidia/Dockerfile diff --git a/.github/workflows/events-nightly.yml b/.github/workflows/events-nightly.yml index 24d8ce4002e5d..5ad4396a006f7 100644 --- a/.github/workflows/events-nightly.yml +++ b/.github/workflows/events-nightly.yml @@ -126,3 +126,26 @@ jobs: push: true tags: pytorchlightning/pytorch_lightning:base-conda-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }} timeout-minutes: 55 + +# docker-nvidia: +# runs-on: ubuntu-20.04 +# steps: +# - name: Checkout +# uses: actions/checkout@v2 +# +# # https://github.com/docker/setup-buildx-action +# # Set up Docker Buildx - to use cache-from and cache-to argument of buildx command +# - uses: docker/setup-buildx-action@v1 +# - name: Login to DockerHub +# uses: docker/login-action@v1 +# with: +# username: ${{ secrets.DOCKER_USERNAME }} +# password: ${{ secrets.DOCKER_PASSWORD }} +# +# - name: Publish NVIDIA to Docker Hub +# uses: docker/build-push-action@v2 +# with: +# file: dockers/nvidia/Dockerfile +# push: true +# tags: nvcr.io/pytorchlightning/pytorch_lightning:nvidia +# timeout-minutes: 55 diff --git a/.github/workflows/release-docker.yml b/.github/workflows/release-docker.yml index f285794cbc33b..36ecbe229ac7c 100644 --- a/.github/workflows/release-docker.yml +++ b/.github/workflows/release-docker.yml @@ -8,7 +8,7 @@ on: types: [created] 
jobs: - build-PL: + cuda-PL: runs-on: ubuntu-20.04 strategy: fail-fast: false @@ -36,3 +36,27 @@ jobs: build_args: PYTHON_VERSION=${{ matrix.python_version }},PYTORCH_VERSION=${{ matrix.pytorch_version }},LIGHTNING_VERSION=${{ steps.get_version.outputs.RELEASE_VERSION }} tags: "${{ steps.get_version.outputs.RELEASE_VERSION }}-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }},latest-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }}" timeout-minutes: 55 + +# nvidia-PL: +# runs-on: ubuntu-20.04 +# steps: +# - name: Checkout +# uses: actions/checkout@v2 +# +# - name: Get release version +# if: startsWith(github.ref, 'refs/tags/') || github.event_name == 'release' +# id: get_version +# run: echo "::set-output name=RELEASE_VERSION::$(echo ${GITHUB_REF##*/})" +# +# - name: Publish Releases to Docker +# # only on releases +# uses: docker/build-push-action@v1.1.0 +# if: startsWith(github.ref, 'refs/tags/') || github.event_name == 'release' +# with: +# repository: nvcr.io/pytorchlightning/pytorch_lightning +# username: ${{ secrets.DOCKER_USERNAME }} +# password: ${{ secrets.DOCKER_PASSWORD }} +# dockerfile: dockers/nvidia/Dockerfile +# build_args: LIGHTNING_VERSION=${{ steps.get_version.outputs.RELEASE_VERSION }} +# tags: "${{ steps.get_version.outputs.RELEASE_VERSION }}-nvidia" +# timeout-minutes: 55 diff --git a/dockers/nvidia/Dockerfile b/dockers/nvidia/Dockerfile index ea567a5306eed..4b04bc9426d4d 100644 --- a/dockers/nvidia/Dockerfile +++ b/dockers/nvidia/Dockerfile @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -FROM nvcr.io/nvidia/pytorch:20.12-py3 +FROM nvcr.io/nvidia/pytorch:21.02-py3 MAINTAINER PyTorchLightning @@ -22,16 +22,17 @@ COPY ./ ./pytorch-lightning/ # install dependencies RUN \ - # Disable cache #conda install "pip>20.1" && \ - #pip config set global.cache-dir false && \ - if [ -z $LIGHTNING_VERSION ] ; then \ - pip install ./pytorch-lightning --no-cache-dir ; \ + pip list | grep torch && \ + if [ ! -z "$LIGHTNING_VERSION" ] ; then \ rm -rf pytorch-lightning ; \ - else \ - rm -rf pytorch-lightning ; \ - pip install https://github.com/PyTorchLightning/pytorch-lightning/archive/${LIGHTNING_VERSION}.zip --no-cache-dir ; \ - fi + wget https://github.com/PyTorchLightning/pytorch-lightning/archive/${LIGHTNING_VERSION}.zip --progress=bar:force:noscroll ; \ + unzip ${LIGHTNING_VERSION}.zip ; \ + mv pytorch-lightning-*/ pytorch-lightning ; \ + rm *.zip ; \ + fi && \ + pip install ./pytorch-lightning["extra"] --no-cache-dir && \ + rm -rf pytorch-lightning RUN python --version && \ pip --version && \ diff --git a/dockers/release/Dockerfile b/dockers/release/Dockerfile index 3584ee02746e3..0eec1e41a5a3f 100644 --- a/dockers/release/Dockerfile +++ b/dockers/release/Dockerfile @@ -25,7 +25,6 @@ COPY ./ ./pytorch-lightning/ # install dependencies RUN \ - # Disable cache #conda install "pip>20.1" && \ if [ ! 
-z "$LIGHTNING_VERSION" ] ; then \
        rm -rf pytorch-lightning ; \

From 3b72bccdf2999a46a34fcc9c98b9bed879ad32d4 Mon Sep 17 00:00:00 2001
From: Amog Kamsetty
Date: Fri, 19 Mar 2021 14:38:49 -0700
Subject: [PATCH 04/25] Automatically set sync_batchnorm for training_type_plugin (#6536)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Carlos Mocholí
Co-authored-by: Roger Shieh
Co-authored-by: Kaushik Bokka
---
 .../connectors/accelerator_connector.py | 5 +++
 tests/plugins/test_custom_plugin.py | 41 +++++++++++++++++++
 2 files changed, 46 insertions(+)
 create mode 100644 tests/plugins/test_custom_plugin.py

diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py
index 7ff288282259a..a7160c4bb2d3c 100644
--- a/pytorch_lightning/trainer/connectors/accelerator_connector.py
+++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py
@@ -430,6 +430,11 @@ def resolve_training_type_plugin(self, training_type: TrainingTypePlugin) -> Tra
         if hasattr(training_type, 'num_nodes') and getattr(training_type, 'num_nodes') is None:
             training_type.num_nodes = self.num_nodes

+        # Automatically set sync_batchnorm if None.
+        # Useful for custom plugins.
+        if hasattr(training_type, 'sync_batchnorm') and getattr(training_type, 'sync_batchnorm') is None:
+            training_type.sync_batchnorm = self.sync_batchnorm
+
         return training_type

     def select_accelerator(self) -> Accelerator:
diff --git a/tests/plugins/test_custom_plugin.py b/tests/plugins/test_custom_plugin.py
new file mode 100644
index 0000000000000..872b49ef48635
--- /dev/null
+++ b/tests/plugins/test_custom_plugin.py
@@ -0,0 +1,41 @@
+# Copyright The PyTorch Lightning team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from pytorch_lightning import Trainer
+from pytorch_lightning.plugins import DDPPlugin
+from tests.helpers import BoringModel
+from tests.helpers.runif import RunIf
+
+
+class CustomParallelPlugin(DDPPlugin):
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        # Set to None so it will be overwritten by the accelerator connector.
+ self.sync_batchnorm = None + + +@RunIf(skip_windows=True) +def test_sync_batchnorm_set(tmpdir): + """Tests if sync_batchnorm is automatically set for custom plugin.""" + model = BoringModel() + plugin = CustomParallelPlugin() + assert plugin.sync_batchnorm is None + trainer = Trainer( + max_epochs=1, + plugins=[plugin], + default_root_dir=tmpdir, + sync_batchnorm=True, + ) + trainer.fit(model) + assert plugin.sync_batchnorm is True From 3a56a6024e0d8b239801cce558381807c24ba3d0 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Fri, 19 Mar 2021 22:48:52 +0100 Subject: [PATCH 05/25] Prune metrics: other classification 7/n (#6584) * confusion_matrix * iou * f_beta * hamming_distance * stat_scores * tests * flake8 * chlog --- CHANGELOG.md | 2 + .../classification/confusion_matrix.py | 90 +---- .../metrics/classification/f_beta.py | 180 +--------- .../classification/hamming_distance.py | 85 +---- .../metrics/classification/iou.py | 83 +---- .../metrics/classification/stat_scores.py | 239 +------------- .../metrics/functional/confusion_matrix.py | 75 +---- .../metrics/functional/f_beta.py | 120 +------ .../metrics/functional/hamming_distance.py | 58 +--- pytorch_lightning/metrics/functional/iou.py | 88 +---- .../metrics/functional/stat_scores.py | 271 +-------------- tests/metrics/classification/__init__.py | 0 tests/metrics/classification/inputs.py | 66 ---- .../classification/test_confusion_matrix.py | 128 ------- tests/metrics/classification/test_f_beta.py | 153 --------- .../classification/test_hamming_distance.py | 80 ----- tests/metrics/classification/test_inputs.py | 312 ------------------ tests/metrics/classification/test_iou.py | 216 ------------ .../classification/test_stat_scores.py | 255 -------------- tests/metrics/test_remove_1-5_metrics.py | 75 +++++ 20 files changed, 155 insertions(+), 2421 deletions(-) delete mode 100644 tests/metrics/classification/__init__.py delete mode 100644 tests/metrics/classification/inputs.py delete mode 100644 tests/metrics/classification/test_confusion_matrix.py delete mode 100644 tests/metrics/classification/test_f_beta.py delete mode 100644 tests/metrics/classification/test_hamming_distance.py delete mode 100644 tests/metrics/classification/test_inputs.py delete mode 100644 tests/metrics/classification/test_iou.py delete mode 100644 tests/metrics/classification/test_stat_scores.py diff --git a/CHANGELOG.md b/CHANGELOG.md index bd8f5e31770d2..01c7ae193555a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -78,6 +78,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). [#6573](https://github.com/PyTorchLightning/pytorch-lightning/pull/6573), + [#6584](https://github.com/PyTorchLightning/pytorch-lightning/pull/6584), + ) diff --git a/pytorch_lightning/metrics/classification/confusion_matrix.py b/pytorch_lightning/metrics/classification/confusion_matrix.py index 112fb4940e6e2..aacd8dcf3b498 100644 --- a/pytorch_lightning/metrics/classification/confusion_matrix.py +++ b/pytorch_lightning/metrics/classification/confusion_matrix.py @@ -13,64 +13,14 @@ # limitations under the License. from typing import Any, Optional -import torch -from torchmetrics import Metric +from torchmetrics import ConfusionMatrix as _ConfusionMatrix -from pytorch_lightning.metrics.functional.confusion_matrix import _confusion_matrix_compute, _confusion_matrix_update +from pytorch_lightning.utilities.deprecation import deprecated -class ConfusionMatrix(Metric): - """ - Computes the `confusion matrix - `_. 
Works with binary, - multiclass, and multilabel data. Accepts probabilities from a model output or - integer class values in prediction. Works with multi-dimensional preds and - target. - - Note: - This metric produces a multi-dimensional output, so it can not be directly logged. - - Forward accepts - - - ``preds`` (float or long tensor): ``(N, ...)`` or ``(N, C, ...)`` where C is the number of classes - - ``target`` (long tensor): ``(N, ...)`` - - If preds and target are the same shape and preds is a float tensor, we use the ``self.threshold`` argument - to convert into integer labels. This is the case for binary and multi-label probabilities. - - If preds has an extra dimension as in the case of multi-class scores we perform an argmax on ``dim=1``. - - Args: - num_classes: Number of classes in the dataset. - normalize: Normalization mode for confusion matrix. Choose from - - - ``None`` or ``'none'``: no normalization (default) - - ``'true'``: normalization over the targets (most commonly used) - - ``'pred'``: normalization over the predictions - - ``'all'``: normalization over the whole matrix - - threshold: - Threshold value for binary or multi-label probabilites. default: 0.5 - compute_on_step: - Forward only calls ``update()`` and return None if this is set to False. default: True - dist_sync_on_step: - Synchronize metric state across processes at each ``forward()`` - before returning the value at the step. default: False - process_group: - Specify the process group on which synchronization is called. default: None (which selects the entire world) - - Example: - - >>> from pytorch_lightning.metrics import ConfusionMatrix - >>> target = torch.tensor([1, 1, 0, 0]) - >>> preds = torch.tensor([0, 1, 0, 0]) - >>> confmat = ConfusionMatrix(num_classes=2) - >>> confmat(preds, target) - tensor([[2., 0.], - [1., 1.]]) - - """ +class ConfusionMatrix(_ConfusionMatrix): + @deprecated(target=_ConfusionMatrix, ver_deprecate="1.3.0", ver_remove="1.5.0") def __init__( self, num_classes: int, @@ -80,35 +30,9 @@ def __init__( dist_sync_on_step: bool = False, process_group: Optional[Any] = None, ): - - super().__init__( - compute_on_step=compute_on_step, - dist_sync_on_step=dist_sync_on_step, - process_group=process_group, - ) - self.num_classes = num_classes - self.normalize = normalize - self.threshold = threshold - - allowed_normalize = ('true', 'pred', 'all', 'none', None) - assert self.normalize in allowed_normalize, \ - f"Argument average needs to one of the following: {allowed_normalize}" - - self.add_state("confmat", default=torch.zeros(num_classes, num_classes), dist_reduce_fx="sum") - - def update(self, preds: torch.Tensor, target: torch.Tensor): - """ - Update state with predictions and targets. - - Args: - preds: Predictions from model - target: Ground truth values """ - confmat = _confusion_matrix_update(preds, target, self.num_classes, self.threshold) - self.confmat += confmat + This implementation refers to :class:`~torchmetrics.ConfusionMatrix`. - def compute(self) -> torch.Tensor: - """ - Computes confusion matrix + .. deprecated:: + Use :class:`~torchmetrics.ConfusionMatrix`. Will be removed in v1.5.0. 
""" - return _confusion_matrix_compute(self.confmat, self.normalize) diff --git a/pytorch_lightning/metrics/classification/f_beta.py b/pytorch_lightning/metrics/classification/f_beta.py index a46b01a1aa8b7..bac3cc3e99c4e 100644 --- a/pytorch_lightning/metrics/classification/f_beta.py +++ b/pytorch_lightning/metrics/classification/f_beta.py @@ -13,72 +13,15 @@ # limitations under the License. from typing import Any, Optional -import torch -from torchmetrics import Metric +from torchmetrics import F1 as _F1 +from torchmetrics import FBeta as _FBeta -from pytorch_lightning.metrics.functional.f_beta import _fbeta_compute, _fbeta_update -from pytorch_lightning.utilities import rank_zero_warn +from pytorch_lightning.utilities.deprecation import deprecated -class FBeta(Metric): - r""" - Computes `F-score `_, specifically: - - .. math:: - F_\beta = (1 + \beta^2) * \frac{\text{precision} * \text{recall}} - {(\beta^2 * \text{precision}) + \text{recall}} - - Where :math:`\beta` is some positive real factor. Works with binary, multiclass, and multilabel data. - Accepts probabilities from a model output or integer class values in prediction. - Works with multi-dimensional preds and target. - - Forward accepts - - - ``preds`` (float or long tensor): ``(N, ...)`` or ``(N, C, ...)`` where C is the number of classes - - ``target`` (long tensor): ``(N, ...)`` - - If preds and target are the same shape and preds is a float tensor, we use the ``self.threshold`` argument - to convert into integer labels. This is the case for binary and multi-label probabilities. - - If preds has an extra dimension as in the case of multi-class scores we perform an argmax on ``dim=1``. - - Args: - num_classes: Number of classes in the dataset. - beta: Beta coefficient in the F measure. - threshold: - Threshold value for binary or multi-label probabilities. default: 0.5 - - average: - - ``'micro'`` computes metric globally - - ``'macro'`` computes metric for each class and uniformly averages them - - ``'weighted'`` computes metric for each class and does a weighted-average, - where each class is weighted by their support (accounts for class imbalance) - - ``'none'`` or ``None`` computes and returns the metric per class - - multilabel: If predictions are from multilabel classification. - compute_on_step: - Forward only calls ``update()`` and return None if this is set to False. default: True - dist_sync_on_step: - Synchronize metric state across processes at each ``forward()`` - before returning the value at the step. default: False - process_group: - Specify the process group on which synchronization is called. default: None (which selects the entire world) - - Raises: - ValueError: - If ``average`` is none of ``"micro"``, ``"macro"``, ``"weighted"``, ``"none"``, ``None``. 
- - Example: - - >>> from pytorch_lightning.metrics import FBeta - >>> target = torch.tensor([0, 1, 2, 0, 1, 2]) - >>> preds = torch.tensor([0, 2, 1, 0, 0, 1]) - >>> f_beta = FBeta(num_classes=3, beta=0.5) - >>> f_beta(preds, target) - tensor(0.3333) - - """ +class FBeta(_FBeta): + @deprecated(target=_FBeta, ver_deprecate="1.3.0", ver_remove="1.5.0") def __init__( self, num_classes: int, @@ -90,103 +33,17 @@ def __init__( dist_sync_on_step: bool = False, process_group: Optional[Any] = None, ): - super().__init__( - compute_on_step=compute_on_step, - dist_sync_on_step=dist_sync_on_step, - process_group=process_group, - ) - - self.num_classes = num_classes - self.beta = beta - self.threshold = threshold - self.average = average - self.multilabel = multilabel - - allowed_average = ("micro", "macro", "weighted", "none", None) - if self.average not in allowed_average: - raise ValueError( - 'Argument `average` expected to be one of the following:' - f' {allowed_average} but got {self.average}' - ) - - self.add_state("true_positives", default=torch.zeros(num_classes), dist_reduce_fx="sum") - self.add_state("predicted_positives", default=torch.zeros(num_classes), dist_reduce_fx="sum") - self.add_state("actual_positives", default=torch.zeros(num_classes), dist_reduce_fx="sum") - - def update(self, preds: torch.Tensor, target: torch.Tensor): - """ - Update state with predictions and targets. - - Args: - preds: Predictions from model - target: Ground truth values """ - true_positives, predicted_positives, actual_positives = _fbeta_update( - preds, target, self.num_classes, self.threshold, self.multilabel - ) - - self.true_positives += true_positives - self.predicted_positives += predicted_positives - self.actual_positives += actual_positives + This implementation refers to :class:`~torchmetrics.FBeta`. - def compute(self) -> torch.Tensor: + .. deprecated:: + Use :class:`~torchmetrics.FBeta`. Will be removed in v1.5.0. """ - Computes fbeta over state. - """ - return _fbeta_compute( - self.true_positives, self.predicted_positives, self.actual_positives, self.beta, self.average - ) - - -class F1(FBeta): - """ - Computes F1 metric. F1 metrics correspond to a harmonic mean of the - precision and recall scores. - - Works with binary, multiclass, and multilabel data. - Accepts logits from a model output or integer class values in prediction. - Works with multi-dimensional preds and target. - Forward accepts - - ``preds`` (float or long tensor): ``(N, ...)`` or ``(N, C, ...)`` where C is the number of classes - - ``target`` (long tensor): ``(N, ...)`` - - If preds and target are the same shape and preds is a float tensor, we use the ``self.threshold`` argument. - This is the case for binary and multi-label logits. - - If preds has an extra dimension as in the case of multi-class scores we perform an argmax on ``dim=1``. - - Args: - num_classes: Number of classes in the dataset. - threshold: - Threshold value for binary or multi-label logits. default: 0.5 - - average: - - ``'micro'`` computes metric globally - - ``'macro'`` computes metric for each class and uniformly averages them - - ``'weighted'`` computes metric for each class and does a weighted-average, - where each class is weighted by their support (accounts for class imbalance) - - ``'none'`` or ``None`` computes and returns the metric per class - - multilabel: If predictions are from multilabel classification. - compute_on_step: - Forward only calls ``update()`` and returns None if this is set to False. 
default: True - dist_sync_on_step: - Synchronize metric state across processes at each ``forward()`` - before returning the value at the step. default: False - process_group: - Specify the process group on which synchronization is called. default: None (which selects the entire world) - - Example: - >>> from pytorch_lightning.metrics import F1 - >>> target = torch.tensor([0, 1, 2, 0, 1, 2]) - >>> preds = torch.tensor([0, 2, 1, 0, 0, 1]) - >>> f1 = F1(num_classes=3) - >>> f1(preds, target) - tensor(0.3333) - """ +class F1(_F1): + @deprecated(target=_F1, ver_deprecate="1.3.0", ver_remove="1.5.0") def __init__( self, num_classes: int, @@ -197,16 +54,9 @@ def __init__( dist_sync_on_step: bool = False, process_group: Optional[Any] = None, ): - if multilabel is not False: - rank_zero_warn(f'The `multilabel={multilabel}` parameter is unused and will not have any effect.') + """ + This implementation refers to :class:`~torchmetrics.F1`. - super().__init__( - num_classes=num_classes, - beta=1.0, - threshold=threshold, - average=average, - multilabel=multilabel, - compute_on_step=compute_on_step, - dist_sync_on_step=dist_sync_on_step, - process_group=process_group, - ) + .. deprecated:: + Use :class:`~torchmetrics.F1`. Will be removed in v1.5.0. + """ diff --git a/pytorch_lightning/metrics/classification/hamming_distance.py b/pytorch_lightning/metrics/classification/hamming_distance.py index dceb90c0a4ca9..b59c3e1053ab8 100644 --- a/pytorch_lightning/metrics/classification/hamming_distance.py +++ b/pytorch_lightning/metrics/classification/hamming_distance.py @@ -13,59 +13,14 @@ # limitations under the License. from typing import Any, Callable, Optional -import torch -from torchmetrics import Metric +from torchmetrics import HammingDistance as _HammingDistance -from pytorch_lightning.metrics.functional.hamming_distance import _hamming_distance_compute, _hamming_distance_update +from pytorch_lightning.utilities.deprecation import deprecated -class HammingDistance(Metric): - r""" - Computes the average `Hamming distance `_ (also - known as Hamming loss) between targets and predictions: - - .. math:: - \text{Hamming distance} = \frac{1}{N \cdot L}\sum_i^N \sum_l^L 1(y_{il} \neq \hat{y_{il}}) - - Where :math:`y` is a tensor of target values, :math:`\hat{y}` is a tensor of predictions, - and :math:`\bullet_{il}` refers to the :math:`l`-th label of the :math:`i`-th sample of that - tensor. - - This is the same as ``1-accuracy`` for binary data, while for all other types of inputs it - treats each possible label separately - meaning that, for example, multi-class data is - treated as if it were multi-label. - - Args: - threshold: - Threshold probability value for transforming probability predictions to binary - (0 or 1) predictions, in the case of binary or multi-label inputs. - compute_on_step: - Forward only calls ``update()`` and return ``None`` if this is set to ``False``. - dist_sync_on_step: - Synchronize metric state across processes at each ``forward()`` - before returning the value at the step. - process_group: - Specify the process group on which synchronization is called. - default: ``None`` (which selects the entire world) - dist_sync_fn: - Callback that performs the allgather operation on the metric state. When ``None``, DDP - will be used to perform the all gather. - - Raises: - ValueError: - If ``threshold`` is not between ``0`` and ``1``. 
- - Example: - - >>> from pytorch_lightning.metrics import HammingDistance - >>> target = torch.tensor([[0, 1], [1, 1]]) - >>> preds = torch.tensor([[0, 1], [0, 1]]) - >>> hamming_distance = HammingDistance() - >>> hamming_distance(preds, target) - tensor(0.2500) - - """ +class HammingDistance(_HammingDistance): + @deprecated(target=_HammingDistance, ver_deprecate="1.3.0", ver_remove="1.5.0") def __init__( self, threshold: float = 0.5, @@ -74,35 +29,9 @@ def __init__( process_group: Optional[Any] = None, dist_sync_fn: Callable = None, ): - super().__init__( - compute_on_step=compute_on_step, - dist_sync_on_step=dist_sync_on_step, - process_group=process_group, - dist_sync_fn=dist_sync_fn, - ) - - self.add_state("correct", default=torch.tensor(0), dist_reduce_fx="sum") - self.add_state("total", default=torch.tensor(0), dist_reduce_fx="sum") - - if not 0 < threshold < 1: - raise ValueError("The `threshold` should lie in the (0,1) interval.") - self.threshold = threshold - - def update(self, preds: torch.Tensor, target: torch.Tensor): """ - Update state with predictions and targets. + This implementation refers to :class:`~torchmetrics.HammingDistance`. - Args: - preds: Predictions from model (probabilities, or labels) - target: Ground truth labels - """ - correct, total = _hamming_distance_update(preds, target, self.threshold) - - self.correct += correct - self.total += total - - def compute(self) -> torch.Tensor: - """ - Computes hamming distance based on inputs passed in to ``update`` previously. + .. deprecated:: + Use :class:`~torchmetrics.HammingDistance`. Will be removed in v1.5.0. """ - return _hamming_distance_compute(self.correct, self.total) diff --git a/pytorch_lightning/metrics/classification/iou.py b/pytorch_lightning/metrics/classification/iou.py index a261b767a8190..d5b5d8eeb47e2 100644 --- a/pytorch_lightning/metrics/classification/iou.py +++ b/pytorch_lightning/metrics/classification/iou.py @@ -13,70 +13,14 @@ # limitations under the License. from typing import Any, Optional -import torch +from torchmetrics import IoU as _IoU -from pytorch_lightning.metrics.classification.confusion_matrix import ConfusionMatrix -from pytorch_lightning.metrics.functional.iou import _iou_from_confmat +from pytorch_lightning.utilities.deprecation import deprecated -class IoU(ConfusionMatrix): - r""" - Computes `Intersection over union, or Jaccard index calculation `_: - - .. math:: J(A,B) = \frac{|A\cap B|}{|A\cup B|} - - Where: :math:`A` and :math:`B` are both tensors of the same size, containing integer class values. - They may be subject to conversion from input data (see description below). Note that it is different from box IoU. - - Works with binary, multiclass and multi-label data. - Accepts probabilities from a model output or integer class values in prediction. - Works with multi-dimensional preds and target. - - Forward accepts - - - ``preds`` (float or long tensor): ``(N, ...)`` or ``(N, C, ...)`` where C is the number of classes - - ``target`` (long tensor): ``(N, ...)`` - - If preds and target are the same shape and preds is a float tensor, we use the ``self.threshold`` argument - to convert into integer labels. This is the case for binary and multi-label probabilities. - - If preds has an extra dimension as in the case of multi-class scores we perform an argmax on ``dim=1``. - - Args: - num_classes: Number of classes in the dataset. - ignore_index: optional int specifying a target class to ignore. 
If given, this class index does not contribute - to the returned score, regardless of reduction method. Has no effect if given an int that is not in the - range [0, num_classes-1]. By default, no index is ignored, and all classes are used. - absent_score: score to use for an individual class, if no instances of the class index were present in - `pred` AND no instances of the class index were present in `target`. For example, if we have 3 classes, - [0, 0] for `pred`, and [0, 2] for `target`, then class 1 would be assigned the `absent_score`. - threshold: - Threshold value for binary or multi-label probabilities. - reduction: a method to reduce metric score over labels. - - - ``'elementwise_mean'``: takes the mean (default) - - ``'sum'``: takes the sum - - ``'none'``: no reduction will be applied - - compute_on_step: - Forward only calls ``update()`` and return None if this is set to False. - dist_sync_on_step: - Synchronize metric state across processes at each ``forward()`` - before returning the value at the step. - process_group: - Specify the process group on which synchronization is called. default: None (which selects the entire world) - - Example: - >>> from pytorch_lightning.metrics import IoU - >>> target = torch.randint(0, 2, (10, 25, 25)) - >>> pred = torch.tensor(target) - >>> pred[2:5, 7:13, 9:15] = 1 - pred[2:5, 7:13, 9:15] - >>> iou = IoU(num_classes=2) - >>> iou(pred, target) - tensor(0.9660) - - """ +class IoU(_IoU): + @deprecated(target=_IoU, ver_deprecate="1.3.0", ver_remove="1.5.0") def __init__( self, num_classes: int, @@ -88,20 +32,9 @@ def __init__( dist_sync_on_step: bool = False, process_group: Optional[Any] = None, ): - super().__init__( - num_classes=num_classes, - normalize=None, - threshold=threshold, - compute_on_step=compute_on_step, - dist_sync_on_step=dist_sync_on_step, - process_group=process_group, - ) - self.reduction = reduction - self.ignore_index = ignore_index - self.absent_score = absent_score - - def compute(self) -> torch.Tensor: """ - Computes intersection over union (IoU) + This implementation refers to :class:`~torchmetrics.IoU`. + + .. deprecated:: + Use :class:`~torchmetrics.IoU`. Will be removed in v1.5.0. """ - return _iou_from_confmat(self.confmat, self.num_classes, self.ignore_index, self.absent_score, self.reduction) diff --git a/pytorch_lightning/metrics/classification/stat_scores.py b/pytorch_lightning/metrics/classification/stat_scores.py index 672b0f41c6fc5..2c4764477b262 100644 --- a/pytorch_lightning/metrics/classification/stat_scores.py +++ b/pytorch_lightning/metrics/classification/stat_scores.py @@ -11,120 +11,16 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from typing import Any, Callable, Optional, Tuple +from typing import Any, Callable, Optional -import torch -from torchmetrics import Metric +from torchmetrics import StatScores as _StatScores -from pytorch_lightning.metrics.functional.stat_scores import _stat_scores_compute, _stat_scores_update +from pytorch_lightning.utilities.deprecation import deprecated -class StatScores(Metric): - """Computes the number of true positives, false positives, true negatives, false negatives. - Related to `Type I and Type II errors `__ - and the `confusion matrix `__. 
- - The reduction method (how the statistics are aggregated) is controlled by the - ``reduce`` parameter, and additionally by the ``mdmc_reduce`` parameter in the - multi-dimensional multi-class case. - - Args: - threshold: - Threshold probability value for transforming probability predictions to binary - (0 or 1) predictions, in the case of binary or multi-label inputs. - - top_k: - Number of highest probability entries for each sample to convert to 1s - relevant - only for inputs with probability predictions. If this parameter is set for multi-label - inputs, it will take precedence over ``threshold``. For (multi-dim) multi-class inputs, - this parameter defaults to 1. - - Should be left unset (``None``) for inputs with label predictions. - - reduce: - Defines the reduction that is applied. Should be one of the following: - - - ``'micro'`` [default]: Counts the statistics by summing over all [sample, class] - combinations (globally). Each statistic is represented by a single integer. - - ``'macro'``: Counts the statistics for each class separately (over all samples). - Each statistic is represented by a ``(C,)`` tensor. Requires ``num_classes`` - to be set. - - ``'samples'``: Counts the statistics for each sample separately (over all classes). - Each statistic is represented by a ``(N, )`` 1d tensor. - - Note that what is considered a sample in the multi-dimensional multi-class case - depends on the value of ``mdmc_reduce``. - - num_classes: - Number of classes. Necessary for (multi-dimensional) multi-class or multi-label data. - - ignore_index: - Specify a class (label) to ignore. If given, this class index does not contribute - to the returned score, regardless of reduction method. If an index is ignored, and - ``reduce='macro'``, the class statistics for the ignored class will all be returned - as ``-1``. - - mdmc_reduce: - Defines how the multi-dimensional multi-class inputs are handeled. Should be - one of the following: - - - ``None`` [default]: Should be left unchanged if your data is not multi-dimensional multi-class. - - - ``'samplewise'``: In this case, the statistics are computed separately for each - sample on the ``N`` axis, and then the outputs are concatenated together. In each - sample the extra axes ``...`` are flattened to become the sub-sample axis, and - statistics for each sample are computed by treating the sub-sample axis as the - ``N`` axis for that sample. - - - ``'global'``: In this case the ``N`` and ``...`` dimensions of the inputs are - flattened into a new ``N_X`` sample axis, i.e. the inputs are treated as if they - were ``(N_X, C)``. From here on the ``reduce`` parameter applies as usual. - - is_multiclass: - Used only in certain special cases, where you want to treat inputs as a different type - than what they appear to be. - - compute_on_step: - Forward only calls ``update()`` and return ``None`` if this is set to ``False``. - dist_sync_on_step: - Synchronize metric state across processes at each ``forward()`` - before returning the value at the step - process_group: - Specify the process group on which synchronization is called. - default: ``None`` (which selects the entire world) - dist_sync_fn: - Callback that performs the allgather operation on the metric state. When ``None``, DDP - will be used to perform the allgather. - - Raises: - ValueError: - If ``threshold`` is not a ``float`` between ``0`` and ``1``. - ValueError: - If ``reduce`` is none of ``"micro"``, ``"macro"`` or ``"samples"``. 
- ValueError: - If ``mdmc_reduce`` is none of ``None``, ``"samplewise"``, ``"global"``. - ValueError: - If ``reduce`` is set to ``"macro"`` and ``num_classes`` is not provided. - ValueError: - If ``num_classes`` is set - and ``ignore_index`` is not in the range ``0`` <= ``ignore_index`` < ``num_classes``. - - Example: - - >>> from pytorch_lightning.metrics.classification import StatScores - >>> preds = torch.tensor([1, 0, 2, 1]) - >>> target = torch.tensor([1, 1, 2, 0]) - >>> stat_scores = StatScores(reduce='macro', num_classes=3) - >>> stat_scores(preds, target) - tensor([[0, 1, 2, 1, 1], - [1, 1, 1, 1, 2], - [1, 0, 3, 0, 1]]) - >>> stat_scores = StatScores(reduce='micro') - >>> stat_scores(preds, target) - tensor([2, 2, 6, 2, 4]) - - """ +class StatScores(_StatScores): + @deprecated(target=_StatScores, ver_deprecate="1.3.0", ver_remove="1.5.0") def __init__( self, threshold: float = 0.5, @@ -139,128 +35,9 @@ def __init__( process_group: Optional[Any] = None, dist_sync_fn: Callable = None, ): - super().__init__( - compute_on_step=compute_on_step, - dist_sync_on_step=dist_sync_on_step, - process_group=process_group, - dist_sync_fn=dist_sync_fn, - ) - - self.reduce = reduce - self.mdmc_reduce = mdmc_reduce - self.num_classes = num_classes - self.threshold = threshold - self.is_multiclass = is_multiclass - self.ignore_index = ignore_index - self.top_k = top_k - - if not 0 < threshold < 1: - raise ValueError(f"The `threshold` should be a float in the (0,1) interval, got {threshold}") - - if reduce not in ["micro", "macro", "samples"]: - raise ValueError(f"The `reduce` {reduce} is not valid.") - - if mdmc_reduce not in [None, "samplewise", "global"]: - raise ValueError(f"The `mdmc_reduce` {mdmc_reduce} is not valid.") - - if reduce == "macro" and (not num_classes or num_classes < 1): - raise ValueError("When you set `reduce` as 'macro', you have to provide the number of classes.") - - if num_classes and ignore_index is not None and (not 0 <= ignore_index < num_classes or num_classes == 1): - raise ValueError(f"The `ignore_index` {ignore_index} is not valid for inputs with {num_classes} classes") - - if mdmc_reduce != "samplewise" and reduce != "samples": - if reduce == "micro": - zeros_shape = [] - elif reduce == "macro": - zeros_shape = (num_classes, ) - default, reduce_fn = lambda: torch.zeros(zeros_shape, dtype=torch.long), "sum" - else: - default, reduce_fn = lambda: [], None - - for s in ("tp", "fp", "tn", "fn"): - self.add_state(s, default=default(), dist_reduce_fx=reduce_fn) - - def update(self, preds: torch.Tensor, target: torch.Tensor): - """ - Update state with predictions and targets. - - Args: - preds: Predictions from model (probabilities or labels) - target: Ground truth values - """ - - tp, fp, tn, fn = _stat_scores_update( - preds, - target, - reduce=self.reduce, - mdmc_reduce=self.mdmc_reduce, - threshold=self.threshold, - num_classes=self.num_classes, - top_k=self.top_k, - is_multiclass=self.is_multiclass, - ignore_index=self.ignore_index, - ) - - # Update states - if self.reduce != "samples" and self.mdmc_reduce != "samplewise": - self.tp += tp - self.fp += fp - self.tn += tn - self.fn += fn - else: - self.tp.append(tp) - self.fp.append(fp) - self.tn.append(tn) - self.fn.append(fn) - - def _get_final_stats(self) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - """Performs concatenation on the stat scores if neccesary, - before passing them to a compute function. """ + This implementation refers to :class:`~torchmetrics.StatScores`. 
- if isinstance(self.tp, list): - tp = torch.cat(self.tp) - fp = torch.cat(self.fp) - tn = torch.cat(self.tn) - fn = torch.cat(self.fn) - else: - tp, fp, tn, fn = self.tp, self.fp, self.tn, self.fn - - return tp, fp, tn, fn - - def compute(self) -> torch.Tensor: - """ - Computes the stat scores based on inputs passed in to ``update`` previously. - - Return: - The metric returns a tensor of shape ``(..., 5)``, where the last dimension corresponds - to ``[tp, fp, tn, fn, sup]`` (``sup`` stands for support and equals ``tp + fn``). The - shape depends on the ``reduce`` and ``mdmc_reduce`` (in case of multi-dimensional - multi-class data) parameters: - - - If the data is not multi-dimensional multi-class, then - - - If ``reduce='micro'``, the shape will be ``(5, )`` - - If ``reduce='macro'``, the shape will be ``(C, 5)``, - where ``C`` stands for the number of classes - - If ``reduce='samples'``, the shape will be ``(N, 5)``, where ``N`` stands for - the number of samples - - - If the data is multi-dimensional multi-class and ``mdmc_reduce='global'``, then - - - If ``reduce='micro'``, the shape will be ``(5, )`` - - If ``reduce='macro'``, the shape will be ``(C, 5)`` - - If ``reduce='samples'``, the shape will be ``(N*X, 5)``, where ``X`` stands for - the product of sizes of all "extra" dimensions of the data (i.e. all dimensions - except for ``C`` and ``N``) - - - If the data is multi-dimensional multi-class and ``mdmc_reduce='samplewise'``, then - - - If ``reduce='micro'``, the shape will be ``(N, 5)`` - - If ``reduce='macro'``, the shape will be ``(N, C, 5)`` - - If ``reduce='samples'``, the shape will be ``(N, X, 5)`` - + .. deprecated:: + Use :class:`~torchmetrics.StatScores`. Will be removed in v1.5.0. """ - tp, fp, tn, fn = self._get_final_stats() - return _stat_scores_compute(tp, fp, tn, fn) diff --git a/pytorch_lightning/metrics/functional/confusion_matrix.py b/pytorch_lightning/metrics/functional/confusion_matrix.py index e77fc4224d25e..5cf8818176696 100644 --- a/pytorch_lightning/metrics/functional/confusion_matrix.py +++ b/pytorch_lightning/metrics/functional/confusion_matrix.py @@ -14,45 +14,12 @@ from typing import Optional import torch -from torchmetrics.classification.checks import _input_format_classification -from torchmetrics.utilities.enums import DataType +from torchmetrics.functional import confusion_matrix as _confusion_matrix -from pytorch_lightning.utilities import rank_zero_warn - - -def _confusion_matrix_update( - preds: torch.Tensor, target: torch.Tensor, num_classes: int, threshold: float = 0.5 -) -> torch.Tensor: - preds, target, mode = _input_format_classification(preds, target, threshold) - if mode not in (DataType.BINARY, DataType.MULTILABEL): - preds = preds.argmax(dim=1) - target = target.argmax(dim=1) - unique_mapping = (target.view(-1) * num_classes + preds.view(-1)).to(torch.long) - bins = torch.bincount(unique_mapping, minlength=num_classes**2) - confmat = bins.reshape(num_classes, num_classes) - return confmat - - -def _confusion_matrix_compute(confmat: torch.Tensor, normalize: Optional[str] = None) -> torch.Tensor: - allowed_normalize = ('true', 'pred', 'all', 'none', None) - assert normalize in allowed_normalize, \ - f"Argument average needs to one of the following: {allowed_normalize}" - confmat = confmat.float() - if normalize is not None and normalize != 'none': - if normalize == 'true': - cm = confmat / confmat.sum(axis=1, keepdim=True) - elif normalize == 'pred': - cm = confmat / confmat.sum(axis=0, keepdim=True) - elif normalize == 'all': - cm = 
confmat / confmat.sum() - nan_elements = cm[torch.isnan(cm)].nelement() - if nan_elements != 0: - cm[torch.isnan(cm)] = 0 - rank_zero_warn(f'{nan_elements} nan values found in confusion matrix have been replaced with zeros.') - return cm - return confmat +from pytorch_lightning.utilities.deprecation import deprecated +@deprecated(target=_confusion_matrix, ver_deprecate="1.3.0", ver_remove="1.5.0") def confusion_matrix( preds: torch.Tensor, target: torch.Tensor, @@ -61,38 +28,6 @@ def confusion_matrix( threshold: float = 0.5 ) -> torch.Tensor: """ - Computes the confusion matrix. Works with binary, multiclass, and multilabel data. - Accepts probabilities from a model output or integer class values in prediction. - Works with multi-dimensional preds and target. - - If preds and target are the same shape and preds is a float tensor, we use the ``self.threshold`` argument - to convert into integer labels. This is the case for binary and multi-label probabilities. - - If preds has an extra dimension as in the case of multi-class scores we perform an argmax on ``dim=1``. - - Args: - preds: (float or long tensor), Either a ``(N, ...)`` tensor with labels or - ``(N, C, ...)`` where C is the number of classes, tensor with labels/probabilities - target: ``target`` (long tensor), tensor with shape ``(N, ...)`` with ground true labels - num_classes: Number of classes in the dataset. - normalize: Normalization mode for confusion matrix. Choose from - - - ``None`` or ``'none'``: no normalization (default) - - ``'true'``: normalization over the targets (most commonly used) - - ``'pred'``: normalization over the predictions - - ``'all'``: normalization over the whole matrix - - threshold: - Threshold value for binary or multi-label probabilities. default: 0.5 - - Example: - - >>> from pytorch_lightning.metrics.functional import confusion_matrix - >>> target = torch.tensor([1, 1, 0, 0]) - >>> preds = torch.tensor([0, 1, 0, 0]) - >>> confusion_matrix(preds, target, num_classes=2) - tensor([[2., 0.], - [1., 1.]]) + .. deprecated:: + Use :func:`torchmetrics.functional.confusion_matrix`. Will be removed in v1.5.0. """ - confmat = _confusion_matrix_update(preds, target, num_classes, threshold) - return _confusion_matrix_compute(confmat, normalize) diff --git a/pytorch_lightning/metrics/functional/f_beta.py b/pytorch_lightning/metrics/functional/f_beta.py index 5be4786297b65..e4d926e0ab8bf 100644 --- a/pytorch_lightning/metrics/functional/f_beta.py +++ b/pytorch_lightning/metrics/functional/f_beta.py @@ -11,46 +11,14 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-from typing import Tuple - import torch -from torchmetrics.utilities import class_reduce -from torchmetrics.utilities.checks import _input_format_classification_one_hot - - -def _fbeta_update( - preds: torch.Tensor, - target: torch.Tensor, - num_classes: int, - threshold: float = 0.5, - multilabel: bool = False -) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - preds, target = _input_format_classification_one_hot(num_classes, preds, target, threshold, multilabel) - true_positives = torch.sum(preds * target, dim=1) - predicted_positives = torch.sum(preds, dim=1) - actual_positives = torch.sum(target, dim=1) - return true_positives, predicted_positives, actual_positives - +from torchmetrics.functional import f1 as _f1 +from torchmetrics.functional import fbeta as _fbeta -def _fbeta_compute( - true_positives: torch.Tensor, - predicted_positives: torch.Tensor, - actual_positives: torch.Tensor, - beta: float = 1.0, - average: str = "micro" -) -> torch.Tensor: - if average == "micro": - precision = true_positives.sum().float() / predicted_positives.sum() - recall = true_positives.sum().float() / actual_positives.sum() - else: - precision = true_positives.float() / predicted_positives - recall = true_positives.float() / actual_positives - - num = (1 + beta**2) * precision * recall - denom = beta**2 * precision + recall - return class_reduce(num, denom, weights=actual_positives, class_reduction=average) +from pytorch_lightning.utilities.deprecation import deprecated +@deprecated(target=_fbeta, ver_deprecate="1.3.0", ver_remove="1.5.0") def fbeta( preds: torch.Tensor, target: torch.Tensor, @@ -61,49 +29,12 @@ def fbeta( multilabel: bool = False ) -> torch.Tensor: """ - Computes f_beta metric. - - Works with binary, multiclass, and multilabel data. - Accepts probabilities from a model output or integer class values in prediction. - Works with multi-dimensional preds and target. - - If preds and target are the same shape and preds is a float tensor, we use the ``self.threshold`` argument - to convert into integer labels. This is the case for binary and multi-label probabilities. - - If preds has an extra dimension as in the case of multi-class scores we perform an argmax on ``dim=1``. - - Args: - preds: predictions from model (probabilities, or labels) - target: ground truth labels - num_classes: Number of classes in the dataset. - beta: Beta coefficient in the F measure. - threshold: - Threshold value for binary or multi-label probabilities. default: 0.5 - - average: - - ``'micro'`` computes metric globally - - ``'macro'`` computes metric for each class and uniformly averages them - - ``'weighted'`` computes metric for each class and does a weighted-average, - where each class is weighted by their support (accounts for class imbalance) - - ``'none'`` or ``None`` computes and returns the metric per class - - multilabel: If predictions are from multilabel classification. - - Example: - - >>> from pytorch_lightning.metrics.functional import fbeta - >>> target = torch.tensor([0, 1, 2, 0, 1, 2]) - >>> preds = torch.tensor([0, 2, 1, 0, 0, 1]) - >>> fbeta(preds, target, num_classes=3, beta=0.5) - tensor(0.3333) - + .. deprecated:: + Use :func:`torchmetrics.functional.accuracy`. Will be removed in v1.5.0. 
""" - true_positives, predicted_positives, actual_positives = _fbeta_update( - preds, target, num_classes, threshold, multilabel - ) - return _fbeta_compute(true_positives, predicted_positives, actual_positives, beta, average) +@deprecated(target=_f1, ver_deprecate="1.3.0", ver_remove="1.5.0") def f1( preds: torch.Tensor, target: torch.Tensor, @@ -113,39 +44,6 @@ def f1( multilabel: bool = False ) -> torch.Tensor: """ - Computes F1 metric. F1 metrics correspond to a equally weighted average of the - precision and recall scores. - - Works with binary, multiclass, and multilabel data. - Accepts probabilities from a model output or integer class values in prediction. - Works with multi-dimensional preds and target. - - If preds and target are the same shape and preds is a float tensor, we use the ``self.threshold`` argument - to convert into integer labels. This is the case for binary and multi-label probabilities. - - If preds has an extra dimension as in the case of multi-class scores we perform an argmax on ``dim=1``. - - Args: - preds: predictions from model (probabilities, or labels) - target: ground truth labels - num_classes: Number of classes in the dataset. - threshold: - Threshold value for binary or multi-label probabilities. default: 0.5 - - average: - - ``'micro'`` computes metric globally - - ``'macro'`` computes metric for each class and uniformly averages them - - ``'weighted'`` computes metric for each class and does a weighted-average, - where each class is weighted by their support (accounts for class imbalance) - - ``'none'`` or ``None`` computes and returns the metric per class - - multilabel: If predictions are from multilabel classification. - - Example: - >>> from pytorch_lightning.metrics.functional import f1 - >>> target = torch.tensor([0, 1, 2, 0, 1, 2]) - >>> preds = torch.tensor([0, 2, 1, 0, 0, 1]) - >>> f1(preds, target, num_classes=3) - tensor(0.3333) + .. deprecated:: + Use :func:`torchmetrics.functional.f1`. Will be removed in v1.5.0. """ - return fbeta(preds, target, num_classes, 1.0, threshold, average, multilabel) diff --git a/pytorch_lightning/metrics/functional/hamming_distance.py b/pytorch_lightning/metrics/functional/hamming_distance.py index 3254dcbf8badb..ef6bb3277fef2 100644 --- a/pytorch_lightning/metrics/functional/hamming_distance.py +++ b/pytorch_lightning/metrics/functional/hamming_distance.py @@ -11,61 +11,15 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-from typing import Tuple, Union - import torch -from torchmetrics.classification.checks import _input_format_classification - - -def _hamming_distance_update( - preds: torch.Tensor, - target: torch.Tensor, - threshold: float = 0.5, -) -> Tuple[torch.Tensor, int]: - preds, target, _ = _input_format_classification(preds, target, threshold=threshold) - - correct = (preds == target).sum() - total = preds.numel() +from torchmetrics.functional import hamming_distance as _hamming_distance - return correct, total - - -def _hamming_distance_compute(correct: torch.Tensor, total: Union[int, torch.Tensor]) -> torch.Tensor: - return 1 - correct.float() / total +from pytorch_lightning.utilities.deprecation import deprecated +@deprecated(target=_hamming_distance, ver_deprecate="1.3.0", ver_remove="1.5.0") def hamming_distance(preds: torch.Tensor, target: torch.Tensor, threshold: float = 0.5) -> torch.Tensor: - r""" - Computes the average `Hamming distance `_ (also - known as Hamming loss) between targets and predictions: - - .. math:: - \text{Hamming distance} = \frac{1}{N \cdot L} \sum_i^N \sum_l^L 1(y_{il} \neq \hat{y}_{il}) - - Where :math:`y` is a tensor of target values, :math:`\hat{y}` is a tensor of predictions, - and :math:`\bullet_{il}` refers to the :math:`l`-th label of the :math:`i`-th sample of that - tensor. - - This is the same as ``1-accuracy`` for binary data, while for all other types of inputs it - treats each possible label separately - meaning that, for example, multi-class data is - treated as if it were multi-label. - - Args: - preds: Predictions from model - target: Ground truth - threshold: - Threshold probability value for transforming probability predictions to binary - (0 or 1) predictions, in the case of binary or multi-label inputs. - - Example: - - >>> from pytorch_lightning.metrics.functional import hamming_distance - >>> target = torch.tensor([[0, 1], [1, 1]]) - >>> preds = torch.tensor([[0, 1], [0, 1]]) - >>> hamming_distance(preds, target) - tensor(0.2500) - """ - - correct, total = _hamming_distance_update(preds, target, threshold) - return _hamming_distance_compute(correct, total) + .. deprecated:: + Use :func:`torchmetrics.functional.hamming_distance`. Will be removed in v1.5.0. + """ diff --git a/pytorch_lightning/metrics/functional/iou.py b/pytorch_lightning/metrics/functional/iou.py index 0f8152d314848..7ae520eb25dee 100644 --- a/pytorch_lightning/metrics/functional/iou.py +++ b/pytorch_lightning/metrics/functional/iou.py @@ -14,35 +14,12 @@ from typing import Optional import torch -from torchmetrics.utilities import reduce -from torchmetrics.utilities.data import get_num_classes +from torchmetrics.functional import iou as _iou -from pytorch_lightning.metrics.functional.confusion_matrix import _confusion_matrix_update - - -def _iou_from_confmat( - confmat: torch.Tensor, - num_classes: int, - ignore_index: Optional[int] = None, - absent_score: float = 0.0, - reduction: str = 'elementwise_mean', -): - intersection = torch.diag(confmat) - union = confmat.sum(0) + confmat.sum(1) - intersection - - # If this class is absent in both target AND pred (union == 0), then use the absent_score for this class. - scores = intersection.float() / union.float() - scores[union == 0] = absent_score - - # Remove the ignored class index from the scores. 
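The same pattern applies to ``hamming_distance``. A minimal sketch of calling the torchmetrics replacement directly, using the values from the removed docstring example (which reported ``tensor(0.2500)``)::

    import torch
    from torchmetrics.functional import hamming_distance

    target = torch.tensor([[0, 1], [1, 1]])
    preds = torch.tensor([[0, 1], [0, 1]])

    # One of the four entries differs, hence the 0.25 reported by the removed example.
    hamming_distance(preds, target)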
- if ignore_index is not None and ignore_index >= 0 and ignore_index < num_classes: - scores = torch.cat([ - scores[:ignore_index], - scores[ignore_index + 1:], - ]) - return reduce(scores, reduction=reduction) +from pytorch_lightning.utilities.deprecation import deprecated +@deprecated(target=_iou, ver_deprecate="1.3.0", ver_remove="1.5.0") def iou( pred: torch.Tensor, target: torch.Tensor, @@ -52,60 +29,7 @@ def iou( num_classes: Optional[int] = None, reduction: str = 'elementwise_mean', ) -> torch.Tensor: - r""" - Computes `Intersection over union, or Jaccard index calculation `_: - - .. math:: J(A,B) = \frac{|A\cap B|}{|A\cup B|} - - Where: :math:`A` and :math:`B` are both tensors of the same size, - containing integer class values. They may be subject to conversion from - input data (see description below). - - Note that it is different from box IoU. - - If preds and target are the same shape and preds is a float tensor, we use the ``self.threshold`` argument - to convert into integer labels. This is the case for binary and multi-label probabilities. - - If pred has an extra dimension as in the case of multi-class scores we - perform an argmax on ``dim=1``. - - Args: - preds: tensor containing predictions from model (probabilities, or labels) with shape ``[N, d1, d2, ...]`` - target: tensor containing ground truth labels with shape ``[N, d1, d2, ...]`` - ignore_index: optional int specifying a target class to ignore. If given, - this class index does not contribute to the returned score, regardless - of reduction method. Has no effect if given an int that is not in the - range [0, num_classes-1], where num_classes is either given or derived - from pred and target. By default, no index is ignored, and all classes are used. - absent_score: score to use for an individual class, if no instances of - the class index were present in `pred` AND no instances of the class - index were present in `target`. For example, if we have 3 classes, - [0, 0] for `pred`, and [0, 2] for `target`, then class 1 would be - assigned the `absent_score`. - threshold: - Threshold value for binary or multi-label probabilities. default: 0.5 - num_classes: - Optionally specify the number of classes - reduction: a method to reduce metric score over labels. - - - ``'elementwise_mean'``: takes the mean (default) - - ``'sum'``: takes the sum - - ``'none'``: no reduction will be applied - - Return: - IoU score : Tensor containing single value if reduction is - 'elementwise_mean', or number of classes if reduction is 'none' - - Example: - - >>> from pytorch_lightning.metrics.functional import iou - >>> target = torch.randint(0, 2, (10, 25, 25)) - >>> pred = torch.tensor(target) - >>> pred[2:5, 7:13, 9:15] = 1 - pred[2:5, 7:13, 9:15] - >>> iou(pred, target) - tensor(0.9660) """ - - num_classes = get_num_classes(pred=pred, target=target, num_classes=num_classes) - confmat = _confusion_matrix_update(pred, target, num_classes, threshold) - return _iou_from_confmat(confmat, num_classes, ignore_index, absent_score, reduction) + .. deprecated:: + Use :func:`torchmetrics.functional.iou`. Will be removed in v1.5.0. + """ diff --git a/pytorch_lightning/metrics/functional/stat_scores.py b/pytorch_lightning/metrics/functional/stat_scores.py index fb1849d3805b2..6f234e84d9aab 100644 --- a/pytorch_lightning/metrics/functional/stat_scores.py +++ b/pytorch_lightning/metrics/functional/stat_scores.py @@ -11,130 +11,15 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
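For ``iou``, a small deterministic sketch taken from the removed test data, assuming the torchmetrics function accepts the same ``num_classes`` and ``reduction`` arguments that the deprecated wrapper exposes::

    import torch
    from torchmetrics.functional import iou

    pred = torch.tensor([0, 1, 1, 2, 2])
    target = torch.tensor([0, 1, 2, 2, 2])

    # Per-class scores; the removed tests expected [1, 1/2, 2/3] for these inputs.
    iou(pred, target, num_classes=3, reduction='none')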
# See the License for the specific language governing permissions and # limitations under the License. -from typing import Optional, Tuple +from typing import Optional import torch -from torchmetrics.classification.checks import _input_format_classification +from torchmetrics.functional import stat_scores as _stat_scores - -def _del_column(tensor: torch.Tensor, index: int): - """ Delete the column at index.""" - - return torch.cat([tensor[:, :index], tensor[:, (index + 1):]], 1) - - -def _stat_scores( - preds: torch.Tensor, - target: torch.Tensor, - reduce: str = "micro", -) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - """Calculate the number of tp, fp, tn, fn. - - Args: - preds: - An ``(N, C)`` or ``(N, C, X)`` tensor of predictions (0 or 1) - target: - An ``(N, C)`` or ``(N, C, X)`` tensor of true labels (0 or 1) - reduce: - One of ``'micro'``, ``'macro'``, ``'samples'`` - - Return: - Returns a list of 4 tensors; tp, fp, tn, fn. - The shape of the returned tensors depnds on the shape of the inputs - and the ``reduce`` parameter: - - If inputs are of the shape ``(N, C)``, then - - If ``reduce='micro'``, the returned tensors are 1 element tensors - - If ``reduce='macro'``, the returned tensors are ``(C,)`` tensors - - If ``reduce'samples'``, the returned tensors are ``(N,)`` tensors - - If inputs are of the shape ``(N, C, X)``, then - - If ``reduce='micro'``, the returned tensors are ``(N,)`` tensors - - If ``reduce='macro'``, the returned tensors are ``(N,C)`` tensors - - If ``reduce='samples'``, the returned tensors are ``(N,X)`` tensors - """ - if reduce == "micro": - dim = [0, 1] if preds.ndim == 2 else [1, 2] - elif reduce == "macro": - dim = 0 if preds.ndim == 2 else 2 - elif reduce == "samples": - dim = 1 - - true_pred, false_pred = target == preds, target != preds - pos_pred, neg_pred = preds == 1, preds == 0 - - tp = (true_pred * pos_pred).sum(dim=dim) - fp = (false_pred * pos_pred).sum(dim=dim) - - tn = (true_pred * neg_pred).sum(dim=dim) - fn = (false_pred * neg_pred).sum(dim=dim) - - return tp.long(), fp.long(), tn.long(), fn.long() - - -def _stat_scores_update( - preds: torch.Tensor, - target: torch.Tensor, - reduce: str = "micro", - mdmc_reduce: Optional[str] = None, - num_classes: Optional[int] = None, - top_k: Optional[int] = None, - threshold: float = 0.5, - is_multiclass: Optional[bool] = None, - ignore_index: Optional[int] = None, -) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - - preds, target, _ = _input_format_classification( - preds, target, threshold=threshold, num_classes=num_classes, is_multiclass=is_multiclass, top_k=top_k - ) - - if ignore_index is not None and not 0 <= ignore_index < preds.shape[1]: - raise ValueError(f"The `ignore_index` {ignore_index} is not valid for inputs with {preds.shape[0]} classes") - - if ignore_index is not None and preds.shape[1] == 1: - raise ValueError("You can not use `ignore_index` with binary data.") - - if preds.ndim == 3: - if not mdmc_reduce: - raise ValueError( - "When your inputs are multi-dimensional multi-class, you have to set the `mdmc_reduce` parameter" - ) - if mdmc_reduce == "global": - preds = torch.transpose(preds, 1, 2).reshape(-1, preds.shape[1]) - target = torch.transpose(target, 1, 2).reshape(-1, target.shape[1]) - - # Delete what is in ignore_index, if applicable (and classes don't matter): - if ignore_index is not None and reduce != "macro": - preds = _del_column(preds, ignore_index) - target = _del_column(target, ignore_index) - - tp, fp, tn, fn = 
_stat_scores(preds, target, reduce=reduce) - - # Take care of ignore_index - if ignore_index is not None and reduce == "macro": - tp[..., ignore_index] = -1 - fp[..., ignore_index] = -1 - tn[..., ignore_index] = -1 - fn[..., ignore_index] = -1 - - return tp, fp, tn, fn - - -def _stat_scores_compute(tp: torch.Tensor, fp: torch.Tensor, tn: torch.Tensor, fn: torch.Tensor) -> torch.Tensor: - - outputs = [ - tp.unsqueeze(-1), - fp.unsqueeze(-1), - tn.unsqueeze(-1), - fn.unsqueeze(-1), - tp.unsqueeze(-1) + fn.unsqueeze(-1), # support - ] - outputs = torch.cat(outputs, -1) - outputs = torch.where(outputs < 0, torch.tensor(-1, device=outputs.device), outputs) - - return outputs +from pytorch_lightning.utilities.deprecation import deprecated +@deprecated(target=_stat_scores, ver_deprecate="1.3.0", ver_remove="1.5.0") def stat_scores( preds: torch.Tensor, target: torch.Tensor, @@ -146,149 +31,7 @@ def stat_scores( is_multiclass: Optional[bool] = None, ignore_index: Optional[int] = None, ) -> torch.Tensor: - """Computes the number of true positives, false positives, true negatives, false negatives. - Related to `Type I and Type II errors `__ - and the `confusion matrix `__. - - The reduction method (how the statistics are aggregated) is controlled by the - ``reduce`` parameter, and additionally by the ``mdmc_reduce`` parameter in the - multi-dimensional multi-class case. - - Args: - preds: Predictions from model (probabilities or labels) - target: Ground truth values - threshold: - Threshold probability value for transforming probability predictions to binary - (0 or 1) predictions, in the case of binary or multi-label inputs. - - top_k: - Number of highest probability entries for each sample to convert to 1s - relevant - only for inputs with probability predictions. If this parameter is set for multi-label - inputs, it will take precedence over ``threshold``. For (multi-dim) multi-class inputs, - this parameter defaults to 1. - - Should be left unset (``None``) for inputs with label predictions. - - reduce: - Defines the reduction that is applied. Should be one of the following: - - - ``'micro'`` [default]: Counts the statistics by summing over all [sample, class] - combinations (globally). Each statistic is represented by a single integer. - - ``'macro'``: Counts the statistics for each class separately (over all samples). - Each statistic is represented by a ``(C,)`` tensor. Requires ``num_classes`` - to be set. - - ``'samples'``: Counts the statistics for each sample separately (over all classes). - Each statistic is represented by a ``(N, )`` 1d tensor. - - Note that what is considered a sample in the multi-dimensional multi-class case - depends on the value of ``mdmc_reduce``. - - num_classes: - Number of classes. Necessary for (multi-dimensional) multi-class or multi-label data. - - ignore_index: - Specify a class (label) to ignore. If given, this class index does not contribute - to the returned score, regardless of reduction method. If an index is ignored, and - ``reduce='macro'``, the class statistics for the ignored class will all be returned - as ``-1``. - - mdmc_reduce: - Defines how the multi-dimensional multi-class inputs are handeled. Should be - one of the following: - - - ``None`` [default]: Should be left unchanged if your data is not multi-dimensional multi-class. - - - ``'samplewise'``: In this case, the statistics are computed separately for each - sample on the ``N`` axis, and then the outputs are concatenated together. 
In each - sample the extra axes ``...`` are flattened to become the sub-sample axis, and - statistics for each sample are computed by treating the sub-sample axis as the - ``N`` axis for that sample. - - - ``'global'``: In this case the ``N`` and ``...`` dimensions of the inputs are - flattened into a new ``N_X`` sample axis, i.e. the inputs are treated as if they - were ``(N_X, C)``. From here on the ``reduce`` parameter applies as usual. - - is_multiclass: - Used only in certain special cases, where you want to treat inputs as a different type - than what they appear to be. - - Return: - The metric returns a tensor of shape ``(..., 5)``, where the last dimension corresponds - to ``[tp, fp, tn, fn, sup]`` (``sup`` stands for support and equals ``tp + fn``). The - shape depends on the ``reduce`` and ``mdmc_reduce`` (in case of multi-dimensional - multi-class data) parameters: - - - If the data is not multi-dimensional multi-class, then - - - If ``reduce='micro'``, the shape will be ``(5, )`` - - If ``reduce='macro'``, the shape will be ``(C, 5)``, - where ``C`` stands for the number of classes - - If ``reduce='samples'``, the shape will be ``(N, 5)``, where ``N`` stands for - the number of samples - - - If the data is multi-dimensional multi-class and ``mdmc_reduce='global'``, then - - - If ``reduce='micro'``, the shape will be ``(5, )`` - - If ``reduce='macro'``, the shape will be ``(C, 5)`` - - If ``reduce='samples'``, the shape will be ``(N*X, 5)``, where ``X`` stands for - the product of sizes of all "extra" dimensions of the data (i.e. all dimensions - except for ``C`` and ``N``) - - - If the data is multi-dimensional multi-class and ``mdmc_reduce='samplewise'``, then - - - If ``reduce='micro'``, the shape will be ``(N, 5)`` - - If ``reduce='macro'``, the shape will be ``(N, C, 5)`` - - If ``reduce='samples'``, the shape will be ``(N, X, 5)`` - - Raises: - ValueError: - If ``reduce`` is none of ``"micro"``, ``"macro"`` or ``"samples"``. - ValueError: - If ``mdmc_reduce`` is none of ``None``, ``"samplewise"``, ``"global"``. - ValueError: - If ``reduce`` is set to ``"macro"`` and ``num_classes`` is not provided. - ValueError: - If ``num_classes`` is set - and ``ignore_index`` is not in the range ``[0, num_classes)``. - ValueError: - If ``ignore_index`` is used with ``binary data``. - ValueError: - If inputs are ``multi-dimensional multi-class`` and ``mdmc_reduce`` is not provided. 
- - Example: - - >>> from pytorch_lightning.metrics.functional import stat_scores - >>> preds = torch.tensor([1, 0, 2, 1]) - >>> target = torch.tensor([1, 1, 2, 0]) - >>> stat_scores(preds, target, reduce='macro', num_classes=3) - tensor([[0, 1, 2, 1, 1], - [1, 1, 1, 1, 2], - [1, 0, 3, 0, 1]]) - >>> stat_scores(preds, target, reduce='micro') - tensor([2, 2, 6, 2, 4]) """ - - if reduce not in ["micro", "macro", "samples"]: - raise ValueError(f"The `reduce` {reduce} is not valid.") - - if mdmc_reduce not in [None, "samplewise", "global"]: - raise ValueError(f"The `mdmc_reduce` {mdmc_reduce} is not valid.") - - if reduce == "macro" and (not num_classes or num_classes < 1): - raise ValueError("When you set `reduce` as 'macro', you have to provide the number of classes.") - - if num_classes and ignore_index is not None and (not 0 <= ignore_index < num_classes or num_classes == 1): - raise ValueError(f"The `ignore_index` {ignore_index} is not valid for inputs with {num_classes} classes") - - tp, fp, tn, fn = _stat_scores_update( - preds, - target, - reduce=reduce, - mdmc_reduce=mdmc_reduce, - top_k=top_k, - threshold=threshold, - num_classes=num_classes, - is_multiclass=is_multiclass, - ignore_index=ignore_index, - ) - return _stat_scores_compute(tp, fp, tn, fn) + .. deprecated:: + Use :func:`torchmetrics.functional.stat_scores`. Will be removed in v1.5.0. + """ diff --git a/tests/metrics/classification/__init__.py b/tests/metrics/classification/__init__.py deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/tests/metrics/classification/inputs.py b/tests/metrics/classification/inputs.py deleted file mode 100644 index 7f2ac450385fe..0000000000000 --- a/tests/metrics/classification/inputs.py +++ /dev/null @@ -1,66 +0,0 @@ -from collections import namedtuple - -import torch - -from tests.metrics.utils import BATCH_SIZE, EXTRA_DIM, NUM_BATCHES, NUM_CLASSES - -Input = namedtuple('Input', ["preds", "target"]) - -_input_binary_prob = Input( - preds=torch.rand(NUM_BATCHES, BATCH_SIZE), target=torch.randint(high=2, size=(NUM_BATCHES, BATCH_SIZE)) -) - -_input_binary = Input( - preds=torch.randint(high=2, size=(NUM_BATCHES, BATCH_SIZE)), - target=torch.randint(high=2, size=(NUM_BATCHES, BATCH_SIZE)) -) - -_input_multilabel_prob = Input( - preds=torch.rand(NUM_BATCHES, BATCH_SIZE, NUM_CLASSES), - target=torch.randint(high=2, size=(NUM_BATCHES, BATCH_SIZE, NUM_CLASSES)) -) - -_input_multilabel_multidim_prob = Input( - preds=torch.rand(NUM_BATCHES, BATCH_SIZE, NUM_CLASSES, EXTRA_DIM), - target=torch.randint(high=2, size=(NUM_BATCHES, BATCH_SIZE, NUM_CLASSES, EXTRA_DIM)) -) - -_input_multilabel = Input( - preds=torch.randint(high=2, size=(NUM_BATCHES, BATCH_SIZE, NUM_CLASSES)), - target=torch.randint(high=2, size=(NUM_BATCHES, BATCH_SIZE, NUM_CLASSES)) -) - -_input_multilabel_multidim = Input( - preds=torch.randint(high=2, size=(NUM_BATCHES, BATCH_SIZE, NUM_CLASSES, EXTRA_DIM)), - target=torch.randint(high=2, size=(NUM_BATCHES, BATCH_SIZE, NUM_CLASSES, EXTRA_DIM)) -) - -# Generate edge multilabel edge case, where nothing matches (scores are undefined) -__temp_preds = torch.randint(high=2, size=(NUM_BATCHES, BATCH_SIZE, NUM_CLASSES)) -__temp_target = abs(__temp_preds - 1) - -_input_multilabel_no_match = Input(preds=__temp_preds, target=__temp_target) - -__mc_prob_preds = torch.rand(NUM_BATCHES, BATCH_SIZE, NUM_CLASSES) -__mc_prob_preds = __mc_prob_preds / __mc_prob_preds.sum(dim=2, keepdim=True) - -_input_multiclass_prob = Input( - preds=__mc_prob_preds, 
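Rounding out the functional deprecations above, a sketch for ``stat_scores`` mirroring the removed docstring example (columns are ``[tp, fp, tn, fn, support]``), again assuming the torchmetrics replacement keeps the keyword names shown in the wrapper signature::

    import torch
    from torchmetrics.functional import stat_scores

    preds = torch.tensor([1, 0, 2, 1])
    target = torch.tensor([1, 1, 2, 0])

    # The removed example reported tensor([[0, 1, 2, 1, 1],
    #                                      [1, 1, 1, 1, 2],
    #                                      [1, 0, 3, 0, 1]]) for the macro reduction
    # and tensor([2, 2, 6, 2, 4]) for the micro reduction.
    stat_scores(preds, target, reduce='macro', num_classes=3)
    stat_scores(preds, target, reduce='micro')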
target=torch.randint(high=NUM_CLASSES, size=(NUM_BATCHES, BATCH_SIZE)) -) - -_input_multiclass = Input( - preds=torch.randint(high=NUM_CLASSES, size=(NUM_BATCHES, BATCH_SIZE)), - target=torch.randint(high=NUM_CLASSES, size=(NUM_BATCHES, BATCH_SIZE)) -) - -__mdmc_prob_preds = torch.rand(NUM_BATCHES, BATCH_SIZE, NUM_CLASSES, EXTRA_DIM) -__mdmc_prob_preds = __mdmc_prob_preds / __mdmc_prob_preds.sum(dim=2, keepdim=True) - -_input_multidim_multiclass_prob = Input( - preds=__mdmc_prob_preds, target=torch.randint(high=NUM_CLASSES, size=(NUM_BATCHES, BATCH_SIZE, EXTRA_DIM)) -) - -_input_multidim_multiclass = Input( - preds=torch.randint(high=NUM_CLASSES, size=(NUM_BATCHES, BATCH_SIZE, EXTRA_DIM)), - target=torch.randint(high=NUM_CLASSES, size=(NUM_BATCHES, BATCH_SIZE, EXTRA_DIM)) -) diff --git a/tests/metrics/classification/test_confusion_matrix.py b/tests/metrics/classification/test_confusion_matrix.py deleted file mode 100644 index 5371044d6d4b0..0000000000000 --- a/tests/metrics/classification/test_confusion_matrix.py +++ /dev/null @@ -1,128 +0,0 @@ -from functools import partial - -import numpy as np -import pytest -import torch -from sklearn.metrics import confusion_matrix as sk_confusion_matrix - -from pytorch_lightning.metrics.classification.confusion_matrix import ConfusionMatrix -from pytorch_lightning.metrics.functional.confusion_matrix import confusion_matrix -from tests.metrics.classification.inputs import _input_binary, _input_binary_prob -from tests.metrics.classification.inputs import _input_multiclass as _input_mcls -from tests.metrics.classification.inputs import _input_multiclass_prob as _input_mcls_prob -from tests.metrics.classification.inputs import _input_multidim_multiclass as _input_mdmc -from tests.metrics.classification.inputs import _input_multidim_multiclass_prob as _input_mdmc_prob -from tests.metrics.classification.inputs import _input_multilabel as _input_mlb -from tests.metrics.classification.inputs import _input_multilabel_prob as _input_mlb_prob -from tests.metrics.utils import MetricTester, NUM_CLASSES, THRESHOLD - -torch.manual_seed(42) - - -def _sk_cm_binary_prob(preds, target, normalize=None): - sk_preds = (preds.view(-1).numpy() >= THRESHOLD).astype(np.uint8) - sk_target = target.view(-1).numpy() - - return sk_confusion_matrix(y_true=sk_target, y_pred=sk_preds, normalize=normalize) - - -def _sk_cm_binary(preds, target, normalize=None): - sk_preds = preds.view(-1).numpy() - sk_target = target.view(-1).numpy() - - return sk_confusion_matrix(y_true=sk_target, y_pred=sk_preds, normalize=normalize) - - -def _sk_cm_multilabel_prob(preds, target, normalize=None): - sk_preds = (preds.view(-1).numpy() >= THRESHOLD).astype(np.uint8) - sk_target = target.view(-1).numpy() - - return sk_confusion_matrix(y_true=sk_target, y_pred=sk_preds, normalize=normalize) - - -def _sk_cm_multilabel(preds, target, normalize=None): - sk_preds = preds.view(-1).numpy() - sk_target = target.view(-1).numpy() - - return sk_confusion_matrix(y_true=sk_target, y_pred=sk_preds, normalize=normalize) - - -def _sk_cm_multiclass_prob(preds, target, normalize=None): - sk_preds = torch.argmax(preds, dim=len(preds.shape) - 1).view(-1).numpy() - sk_target = target.view(-1).numpy() - - return sk_confusion_matrix(y_true=sk_target, y_pred=sk_preds, normalize=normalize) - - -def _sk_cm_multiclass(preds, target, normalize=None): - sk_preds = preds.view(-1).numpy() - sk_target = target.view(-1).numpy() - - return sk_confusion_matrix(y_true=sk_target, y_pred=sk_preds, normalize=normalize) - - -def 
_sk_cm_multidim_multiclass_prob(preds, target, normalize=None): - sk_preds = torch.argmax(preds, dim=len(preds.shape) - 2).view(-1).numpy() - sk_target = target.view(-1).numpy() - - return sk_confusion_matrix(y_true=sk_target, y_pred=sk_preds, normalize=normalize) - - -def _sk_cm_multidim_multiclass(preds, target, normalize=None): - sk_preds = preds.view(-1).numpy() - sk_target = target.view(-1).numpy() - - return sk_confusion_matrix(y_true=sk_target, y_pred=sk_preds, normalize=normalize) - - -@pytest.mark.parametrize("normalize", ['true', 'pred', 'all', None]) -@pytest.mark.parametrize( - "preds, target, sk_metric, num_classes", - [(_input_binary_prob.preds, _input_binary_prob.target, _sk_cm_binary_prob, 2), - (_input_binary.preds, _input_binary.target, _sk_cm_binary, 2), - (_input_mlb_prob.preds, _input_mlb_prob.target, _sk_cm_multilabel_prob, 2), - (_input_mlb.preds, _input_mlb.target, _sk_cm_multilabel, 2), - (_input_mcls_prob.preds, _input_mcls_prob.target, _sk_cm_multiclass_prob, NUM_CLASSES), - (_input_mcls.preds, _input_mcls.target, _sk_cm_multiclass, NUM_CLASSES), - (_input_mdmc_prob.preds, _input_mdmc_prob.target, _sk_cm_multidim_multiclass_prob, NUM_CLASSES), - (_input_mdmc.preds, _input_mdmc.target, _sk_cm_multidim_multiclass, NUM_CLASSES)] -) -class TestConfusionMatrix(MetricTester): - - @pytest.mark.parametrize("ddp", [True, False]) - @pytest.mark.parametrize("dist_sync_on_step", [True, False]) - def test_confusion_matrix(self, normalize, preds, target, sk_metric, num_classes, ddp, dist_sync_on_step): - self.run_class_metric_test( - ddp=ddp, - preds=preds, - target=target, - metric_class=ConfusionMatrix, - sk_metric=partial(sk_metric, normalize=normalize), - dist_sync_on_step=dist_sync_on_step, - metric_args={ - "num_classes": num_classes, - "threshold": THRESHOLD, - "normalize": normalize - } - ) - - def test_confusion_matrix_functional(self, normalize, preds, target, sk_metric, num_classes): - self.run_functional_metric_test( - preds, - target, - metric_functional=confusion_matrix, - sk_metric=partial(sk_metric, normalize=normalize), - metric_args={ - "num_classes": num_classes, - "threshold": THRESHOLD, - "normalize": normalize - } - ) - - -def test_warning_on_nan(tmpdir): - preds = torch.randint(3, size=(20, )) - target = torch.randint(3, size=(20, )) - - with pytest.warns(UserWarning, match='.* nan values found in confusion matrix have been replaced with zeros.'): - confusion_matrix(preds, target, num_classes=5, normalize='true') diff --git a/tests/metrics/classification/test_f_beta.py b/tests/metrics/classification/test_f_beta.py deleted file mode 100644 index b9458fb6c530c..0000000000000 --- a/tests/metrics/classification/test_f_beta.py +++ /dev/null @@ -1,153 +0,0 @@ -from functools import partial - -import numpy as np -import pytest -import torch -from sklearn.metrics import fbeta_score - -from pytorch_lightning.metrics import F1, FBeta -from pytorch_lightning.metrics.functional import f1, fbeta -from tests.metrics.classification.inputs import _input_binary, _input_binary_prob -from tests.metrics.classification.inputs import _input_multiclass as _input_mcls -from tests.metrics.classification.inputs import _input_multiclass_prob as _input_mcls_prob -from tests.metrics.classification.inputs import _input_multidim_multiclass as _input_mdmc -from tests.metrics.classification.inputs import _input_multidim_multiclass_prob as _input_mdmc_prob -from tests.metrics.classification.inputs import _input_multilabel as _input_mlb -from tests.metrics.classification.inputs import 
_input_multilabel_no_match as _input_mlb_nomatch -from tests.metrics.classification.inputs import _input_multilabel_prob as _mlb_prob_inputs -from tests.metrics.utils import MetricTester, NUM_CLASSES, THRESHOLD - -torch.manual_seed(42) - - -def _sk_fbeta_binary_prob(preds, target, average='micro', beta=1.0): - sk_preds = (preds.view(-1).numpy() >= THRESHOLD).astype(np.uint8) - sk_target = target.view(-1).numpy() - - return fbeta_score(y_true=sk_target, y_pred=sk_preds, average='binary', beta=beta) - - -def _sk_fbeta_binary(preds, target, average='micro', beta=1.0): - sk_preds = preds.view(-1).numpy() - sk_target = target.view(-1).numpy() - - return fbeta_score(y_true=sk_target, y_pred=sk_preds, average='binary', beta=beta) - - -def _sk_fbeta_multilabel_prob(preds, target, average='micro', beta=1.0): - sk_preds = (preds.view(-1, NUM_CLASSES).numpy() >= THRESHOLD).astype(np.uint8) - sk_target = target.view(-1, NUM_CLASSES).numpy() - - return fbeta_score(y_true=sk_target, y_pred=sk_preds, average=average, beta=beta) - - -def _sk_fbeta_multilabel(preds, target, average='micro', beta=1.0): - sk_preds = preds.view(-1, NUM_CLASSES).numpy() - sk_target = target.view(-1, NUM_CLASSES).numpy() - - return fbeta_score(y_true=sk_target, y_pred=sk_preds, average=average, beta=beta) - - -def _sk_fbeta_multiclass_prob(preds, target, average='micro', beta=1.0): - sk_preds = torch.argmax(preds, dim=len(preds.shape) - 1).view(-1).numpy() - sk_target = target.view(-1).numpy() - - return fbeta_score(y_true=sk_target, y_pred=sk_preds, average=average, beta=beta) - - -def _sk_fbeta_multiclass(preds, target, average='micro', beta=1.0): - sk_preds = preds.view(-1).numpy() - sk_target = target.view(-1).numpy() - - return fbeta_score(y_true=sk_target, y_pred=sk_preds, average=average, beta=beta) - - -def _sk_fbeta_multidim_multiclass_prob(preds, target, average='micro', beta=1.0): - sk_preds = torch.argmax(preds, dim=len(preds.shape) - 2).view(-1).numpy() - sk_target = target.view(-1).numpy() - - return fbeta_score(y_true=sk_target, y_pred=sk_preds, average=average, beta=beta) - - -def _sk_fbeta_multidim_multiclass(preds, target, average='micro', beta=1.0): - sk_preds = preds.view(-1).numpy() - sk_target = target.view(-1).numpy() - - return fbeta_score(y_true=sk_target, y_pred=sk_preds, average=average, beta=beta) - - -@pytest.mark.parametrize( - "preds, target, sk_metric, num_classes, multilabel", - [ - (_input_binary_prob.preds, _input_binary_prob.target, _sk_fbeta_binary_prob, 1, False), - (_input_binary.preds, _input_binary.target, _sk_fbeta_binary, 1, False), - (_mlb_prob_inputs.preds, _mlb_prob_inputs.target, _sk_fbeta_multilabel_prob, NUM_CLASSES, True), - (_input_mlb.preds, _input_mlb.target, _sk_fbeta_multilabel, NUM_CLASSES, True), - (_input_mlb_nomatch.preds, _input_mlb_nomatch.target, _sk_fbeta_multilabel, NUM_CLASSES, True), - (_input_mcls_prob.preds, _input_mcls_prob.target, _sk_fbeta_multiclass_prob, NUM_CLASSES, False), - (_input_mcls.preds, _input_mcls.target, _sk_fbeta_multiclass, NUM_CLASSES, False), - (_input_mdmc_prob.preds, _input_mdmc_prob.target, _sk_fbeta_multidim_multiclass_prob, NUM_CLASSES, False), - (_input_mdmc.preds, _input_mdmc.target, _sk_fbeta_multidim_multiclass, NUM_CLASSES, False), - ], -) -@pytest.mark.parametrize("average", ['micro', 'macro', 'weighted', None]) -@pytest.mark.parametrize("beta", [0.5, 1.0, 2.0]) -class TestFBeta(MetricTester): - - @pytest.mark.parametrize("ddp", [True, False]) - @pytest.mark.parametrize("dist_sync_on_step", [True, False]) - def test_fbeta(self, 
preds, target, sk_metric, num_classes, multilabel, average, beta, ddp, dist_sync_on_step): - metric_class = F1 if beta == 1.0 else partial(FBeta, beta=beta) - - self.run_class_metric_test( - ddp=ddp, - preds=preds, - target=target, - metric_class=metric_class, - sk_metric=partial(sk_metric, average=average, beta=beta), - dist_sync_on_step=dist_sync_on_step, - metric_args={ - "num_classes": num_classes, - "average": average, - "multilabel": multilabel, - "threshold": THRESHOLD, - }, - check_dist_sync_on_step=False, - check_batch=False, - ) - - def test_fbeta_functional(self, preds, target, sk_metric, num_classes, multilabel, average, beta): - metric_functional = f1 if beta == 1.0 else partial(fbeta, beta=beta) - - self.run_functional_metric_test( - preds=preds, - target=target, - metric_functional=metric_functional, - sk_metric=partial(sk_metric, average=average, beta=beta), - metric_args={ - "num_classes": num_classes, - "average": average, - "multilabel": multilabel, - "threshold": THRESHOLD - } - ) - - -@pytest.mark.parametrize(['pred', 'target', 'beta', 'exp_score'], [ - pytest.param([1., 0., 1., 0.], [0., 1., 1., 0.], 0.5, [0.5, 0.5]), - pytest.param([1., 0., 1., 0.], [0., 1., 1., 0.], 1, [0.5, 0.5]), - pytest.param([1., 0., 1., 0.], [0., 1., 1., 0.], 2, [0.5, 0.5]), -]) -def test_fbeta_score(pred, target, beta, exp_score): - score = fbeta(torch.tensor(pred), torch.tensor(target), num_classes=1, beta=beta, average='none') - assert torch.allclose(score, torch.tensor(exp_score)) - - -@pytest.mark.parametrize(['pred', 'target', 'exp_score'], [ - pytest.param([0., 0., 0., 0.], [1., 1., 1., 1.], [0.0, 0.0]), - pytest.param([1., 0., 1., 0.], [0., 1., 1., 0.], [0.5, 0.5]), - pytest.param([1., 0., 1., 0.], [1., 0., 1., 0.], [1.0, 1.0]), -]) -def test_f1_score(pred, target, exp_score): - score = f1(torch.tensor(pred), torch.tensor(target), num_classes=1, average='none') - assert torch.allclose(score, torch.tensor(exp_score)) diff --git a/tests/metrics/classification/test_hamming_distance.py b/tests/metrics/classification/test_hamming_distance.py deleted file mode 100644 index a4db9c7f339b2..0000000000000 --- a/tests/metrics/classification/test_hamming_distance.py +++ /dev/null @@ -1,80 +0,0 @@ -import pytest -import torch -from sklearn.metrics import hamming_loss as sk_hamming_loss -from torchmetrics.classification.checks import _input_format_classification - -from pytorch_lightning.metrics import HammingDistance -from pytorch_lightning.metrics.functional import hamming_distance -from tests.metrics.classification.inputs import _input_binary, _input_binary_prob -from tests.metrics.classification.inputs import _input_multiclass as _input_mcls -from tests.metrics.classification.inputs import _input_multiclass_prob as _input_mcls_prob -from tests.metrics.classification.inputs import _input_multidim_multiclass as _input_mdmc -from tests.metrics.classification.inputs import _input_multidim_multiclass_prob as _input_mdmc_prob -from tests.metrics.classification.inputs import _input_multilabel as _input_mlb -from tests.metrics.classification.inputs import _input_multilabel_multidim as _input_mlmd -from tests.metrics.classification.inputs import _input_multilabel_multidim_prob as _input_mlmd_prob -from tests.metrics.classification.inputs import _input_multilabel_prob as _input_mlb_prob -from tests.metrics.utils import MetricTester, THRESHOLD - -torch.manual_seed(42) - - -def _sk_hamming_loss(preds, target): - sk_preds, sk_target, _ = _input_format_classification(preds, target, threshold=THRESHOLD) - 
sk_preds, sk_target = sk_preds.numpy(), sk_target.numpy() - sk_preds, sk_target = sk_preds.reshape(sk_preds.shape[0], -1), sk_target.reshape(sk_target.shape[0], -1) - - return sk_hamming_loss(y_true=sk_target, y_pred=sk_preds) - - -@pytest.mark.parametrize( - "preds, target", - [ - (_input_binary_prob.preds, _input_binary_prob.target), - (_input_binary.preds, _input_binary.target), - (_input_mlb_prob.preds, _input_mlb_prob.target), - (_input_mlb.preds, _input_mlb.target), - (_input_mcls_prob.preds, _input_mcls_prob.target), - (_input_mcls.preds, _input_mcls.target), - (_input_mdmc_prob.preds, _input_mdmc_prob.target), - (_input_mdmc.preds, _input_mdmc.target), - (_input_mlmd_prob.preds, _input_mlmd_prob.target), - (_input_mlmd.preds, _input_mlmd.target), - ], -) -class TestHammingDistance(MetricTester): - - @pytest.mark.parametrize("ddp", [True, False]) - @pytest.mark.parametrize("dist_sync_on_step", [False, True]) - def test_hamming_distance_class(self, ddp, dist_sync_on_step, preds, target): - self.run_class_metric_test( - ddp=ddp, - preds=preds, - target=target, - metric_class=HammingDistance, - sk_metric=_sk_hamming_loss, - dist_sync_on_step=dist_sync_on_step, - metric_args={"threshold": THRESHOLD}, - ) - - def test_hamming_distance_fn(self, preds, target): - self.run_functional_metric_test( - preds, - target, - metric_functional=hamming_distance, - sk_metric=_sk_hamming_loss, - metric_args={"threshold": THRESHOLD}, - ) - - -@pytest.mark.parametrize("threshold", [1.5]) -def test_wrong_params(threshold): - preds, target = _input_mcls_prob.preds, _input_mcls_prob.target - - with pytest.raises(ValueError): - ham_dist = HammingDistance(threshold=threshold) - ham_dist(preds, target) - ham_dist.compute() - - with pytest.raises(ValueError): - hamming_distance(preds, target, threshold=threshold) diff --git a/tests/metrics/classification/test_inputs.py b/tests/metrics/classification/test_inputs.py deleted file mode 100644 index f07a9c2821f56..0000000000000 --- a/tests/metrics/classification/test_inputs.py +++ /dev/null @@ -1,312 +0,0 @@ -import pytest -import torch -from torch import rand, randint -from torchmetrics.classification.checks import _input_format_classification -from torchmetrics.utilities.data import select_topk, to_onehot -from torchmetrics.utilities.enums import DataType - -from tests.metrics.classification.inputs import _input_binary as _bin -from tests.metrics.classification.inputs import _input_binary_prob as _bin_prob -from tests.metrics.classification.inputs import _input_multiclass as _mc -from tests.metrics.classification.inputs import _input_multiclass_prob as _mc_prob -from tests.metrics.classification.inputs import _input_multidim_multiclass as _mdmc -from tests.metrics.classification.inputs import _input_multidim_multiclass_prob as _mdmc_prob -from tests.metrics.classification.inputs import _input_multilabel as _ml -from tests.metrics.classification.inputs import _input_multilabel_multidim as _mlmd -from tests.metrics.classification.inputs import _input_multilabel_multidim_prob as _mlmd_prob -from tests.metrics.classification.inputs import _input_multilabel_prob as _ml_prob -from tests.metrics.classification.inputs import Input -from tests.metrics.utils import BATCH_SIZE, EXTRA_DIM, NUM_BATCHES, NUM_CLASSES, THRESHOLD - -torch.manual_seed(42) - -# Some additional inputs to test on -_ml_prob_half = Input(_ml_prob.preds.half(), _ml_prob.target) - -_mc_prob_2cls_preds = rand(NUM_BATCHES, BATCH_SIZE, 2) -_mc_prob_2cls_preds /= _mc_prob_2cls_preds.sum(dim=2, keepdim=True) 
-_mc_prob_2cls = Input(_mc_prob_2cls_preds, randint(high=2, size=(NUM_BATCHES, BATCH_SIZE))) - -_mdmc_prob_many_dims_preds = rand(NUM_BATCHES, BATCH_SIZE, NUM_CLASSES, EXTRA_DIM, EXTRA_DIM) -_mdmc_prob_many_dims_preds /= _mdmc_prob_many_dims_preds.sum(dim=2, keepdim=True) -_mdmc_prob_many_dims = Input( - _mdmc_prob_many_dims_preds, - randint(high=2, size=(NUM_BATCHES, BATCH_SIZE, EXTRA_DIM, EXTRA_DIM)), -) - -_mdmc_prob_2cls_preds = rand(NUM_BATCHES, BATCH_SIZE, 2, EXTRA_DIM) -_mdmc_prob_2cls_preds /= _mdmc_prob_2cls_preds.sum(dim=2, keepdim=True) -_mdmc_prob_2cls = Input(_mdmc_prob_2cls_preds, randint(high=2, size=(NUM_BATCHES, BATCH_SIZE, EXTRA_DIM))) - -# Some utils -T = torch.Tensor - - -def _idn(x): - return x - - -def _usq(x): - return x.unsqueeze(-1) - - -def _thrs(x): - return x >= THRESHOLD - - -def _rshp1(x): - return x.reshape(x.shape[0], -1) - - -def _rshp2(x): - return x.reshape(x.shape[0], x.shape[1], -1) - - -def _onehot(x): - return to_onehot(x, NUM_CLASSES) - - -def _onehot2(x): - return to_onehot(x, 2) - - -def _top1(x): - return select_topk(x, 1) - - -def _top2(x): - return select_topk(x, 2) - - -# To avoid ugly black line wrapping -def _ml_preds_tr(x): - return _rshp1(_thrs(x)) - - -def _onehot_rshp1(x): - return _onehot(_rshp1(x)) - - -def _onehot2_rshp1(x): - return _onehot2(_rshp1(x)) - - -def _top1_rshp2(x): - return _top1(_rshp2(x)) - - -def _top2_rshp2(x): - return _top2(_rshp2(x)) - - -def _probs_to_mc_preds_tr(x): - return _onehot2(_thrs(x)) - - -def _mlmd_prob_to_mc_preds_tr(x): - return _onehot2(_rshp1(_thrs(x))) - - -######################## -# Test correct inputs -######################## - - -@pytest.mark.parametrize( - "inputs, num_classes, is_multiclass, top_k, exp_mode, post_preds, post_target", - [ - ############################# - # Test usual expected cases - (_bin, None, False, None, "multi-class", _usq, _usq), - (_bin, 1, False, None, "multi-class", _usq, _usq), - (_bin_prob, None, None, None, "binary", lambda x: _usq(_thrs(x)), _usq), - (_ml_prob, None, None, None, "multi-label", _thrs, _idn), - (_ml, None, False, None, "multi-dim multi-class", _idn, _idn), - (_ml_prob, None, None, None, "multi-label", _ml_preds_tr, _rshp1), - (_ml_prob, None, None, 2, "multi-label", _top2, _rshp1), - (_mlmd, None, False, None, "multi-dim multi-class", _rshp1, _rshp1), - (_mc, NUM_CLASSES, None, None, "multi-class", _onehot, _onehot), - (_mc_prob, None, None, None, "multi-class", _top1, _onehot), - (_mc_prob, None, None, 2, "multi-class", _top2, _onehot), - (_mdmc, NUM_CLASSES, None, None, "multi-dim multi-class", _onehot, _onehot), - (_mdmc_prob, None, None, None, "multi-dim multi-class", _top1_rshp2, _onehot), - (_mdmc_prob, None, None, 2, "multi-dim multi-class", _top2_rshp2, _onehot), - (_mdmc_prob_many_dims, None, None, None, "multi-dim multi-class", _top1_rshp2, _onehot_rshp1), - (_mdmc_prob_many_dims, None, None, 2, "multi-dim multi-class", _top2_rshp2, _onehot_rshp1), - ########################### - # Test some special cases - # Make sure that half precision works, i.e. 
is converted to full precision - (_ml_prob_half, None, None, None, "multi-label", lambda x: _ml_preds_tr(x.float()), _rshp1), - # Binary as multiclass - (_bin, None, None, None, "multi-class", _onehot2, _onehot2), - # Binary probs as multiclass - (_bin_prob, None, True, None, "binary", _probs_to_mc_preds_tr, _onehot2), - # Multilabel as multiclass - (_ml, None, True, None, "multi-dim multi-class", _onehot2, _onehot2), - # Multilabel probs as multiclass - (_ml_prob, None, True, None, "multi-label", _probs_to_mc_preds_tr, _onehot2), - # Multidim multilabel as multiclass - (_mlmd, None, True, None, "multi-dim multi-class", _onehot2_rshp1, _onehot2_rshp1), - # Multidim multilabel probs as multiclass - (_mlmd_prob, None, True, None, "multi-label", _mlmd_prob_to_mc_preds_tr, _onehot2_rshp1), - # Multiclass prob with 2 classes as binary - (_mc_prob_2cls, None, False, None, "multi-class", lambda x: _top1(x)[:, [1]], _usq), - # Multi-dim multi-class with 2 classes as multi-label - (_mdmc_prob_2cls, None, False, None, "multi-dim multi-class", lambda x: _top1(x)[:, 1], _idn), - ], -) -def test_usual_cases(inputs, num_classes, is_multiclass, top_k, exp_mode, post_preds, post_target): - - def __get_data_type_enum(str_exp_mode): - return next(DataType[n] for n in dir(DataType) if DataType[n] == str_exp_mode) - - for exp_mode in (exp_mode, __get_data_type_enum(exp_mode)): - preds_out, target_out, mode = _input_format_classification( - preds=inputs.preds[0], - target=inputs.target[0], - threshold=THRESHOLD, - num_classes=num_classes, - is_multiclass=is_multiclass, - top_k=top_k, - ) - - assert mode == exp_mode - assert torch.equal(preds_out, post_preds(inputs.preds[0]).int()) - assert torch.equal(target_out, post_target(inputs.target[0]).int()) - - # Test that things work when batch_size = 1 - preds_out, target_out, mode = _input_format_classification( - preds=inputs.preds[0][[0], ...], - target=inputs.target[0][[0], ...], - threshold=THRESHOLD, - num_classes=num_classes, - is_multiclass=is_multiclass, - top_k=top_k, - ) - - assert mode == exp_mode - assert torch.equal(preds_out, post_preds(inputs.preds[0][[0], ...]).int()) - assert torch.equal(target_out, post_target(inputs.target[0][[0], ...]).int()) - - -# Test that threshold is correctly applied -def test_threshold(): - target = T([1, 1, 1]).int() - preds_probs = T([0.5 - 1e-5, 0.5, 0.5 + 1e-5]) - - preds_probs_out, _, _ = _input_format_classification(preds_probs, target, threshold=0.5) - - assert torch.equal(torch.tensor([0, 1, 1], dtype=torch.int), preds_probs_out.squeeze().int()) - - -######################################################################## -# Test incorrect inputs -######################################################################## - - -@pytest.mark.parametrize("threshold", [-0.5, 0.0, 1.0, 1.5]) -def test_incorrect_threshold(threshold): - preds, target = rand(size=(7, )), randint(high=2, size=(7, )) - with pytest.raises(ValueError): - _input_format_classification(preds, target, threshold=threshold) - - -@pytest.mark.parametrize( - "preds, target, num_classes, is_multiclass", - [ - # Target not integer - (randint(high=2, size=(7, )), randint(high=2, size=(7, )).float(), None, None), - # Target negative - (randint(high=2, size=(7, )), -randint(high=2, size=(7, )), None, None), - # Preds negative integers - (-randint(high=2, size=(7, )), randint(high=2, size=(7, )), None, None), - # Negative probabilities - (-rand(size=(7, )), randint(high=2, size=(7, )), None, None), - # is_multiclass=False and target > 1 - (rand(size=(7, )), 
randint(low=2, high=4, size=(7, )), None, False), - # is_multiclass=False and preds integers with > 1 - (randint(low=2, high=4, size=(7, )), randint(high=2, size=(7, )), None, False), - # Wrong batch size - (randint(high=2, size=(8, )), randint(high=2, size=(7, )), None, None), - # Completely wrong shape - (randint(high=2, size=(7, )), randint(high=2, size=(7, 4)), None, None), - # Same #dims, different shape - (randint(high=2, size=(7, 3)), randint(high=2, size=(7, 4)), None, None), - # Same shape and preds floats, target not binary - (rand(size=(7, 3)), randint(low=2, high=4, size=(7, 3)), None, None), - # #dims in preds = 1 + #dims in target, C shape not second or last - (rand(size=(7, 3, 4, 3)), randint(high=4, size=(7, 3, 3)), None, None), - # #dims in preds = 1 + #dims in target, preds not float - (randint(high=2, size=(7, 3, 3, 4)), randint(high=4, size=(7, 3, 3)), None, None), - # is_multiclass=False, with C dimension > 2 - (_mc_prob.preds[0], randint(high=2, size=(BATCH_SIZE, )), None, False), - # Probs of multiclass preds do not sum up to 1 - (rand(size=(7, 3, 5)), randint(high=2, size=(7, 5)), None, None), - # Max target larger or equal to C dimension - (_mc_prob.preds[0], randint(low=NUM_CLASSES + 1, high=100, size=(BATCH_SIZE, )), None, None), - # C dimension not equal to num_classes - (_mc_prob.preds[0], _mc_prob.target[0], NUM_CLASSES + 1, None), - # Max target larger than num_classes (with #dim preds = 1 + #dims target) - (_mc_prob.preds[0], randint(low=NUM_CLASSES + 1, high=100, size=(BATCH_SIZE, NUM_CLASSES)), 4, None), - # Max target larger than num_classes (with #dim preds = #dims target) - (randint(high=4, size=(7, 3)), randint(low=5, high=7, size=(7, 3)), 4, None), - # Max preds larger than num_classes (with #dim preds = #dims target) - (randint(low=5, high=7, size=(7, 3)), randint(high=4, size=(7, 3)), 4, None), - # Num_classes=1, but is_multiclass not false - (randint(high=2, size=(7, )), randint(high=2, size=(7, )), 1, None), - # is_multiclass=False, but implied class dimension (for multi-label, from shape) != num_classes - (randint(high=2, size=(7, 3, 3)), randint(high=2, size=(7, 3, 3)), 4, False), - # Multilabel input with implied class dimension != num_classes - (rand(size=(7, 3, 3)), randint(high=2, size=(7, 3, 3)), 4, False), - # Multilabel input with is_multiclass=True, but num_classes != 2 (or None) - (rand(size=(7, 3)), randint(high=2, size=(7, 3)), 4, True), - # Binary input, num_classes > 2 - (rand(size=(7, )), randint(high=2, size=(7, )), 4, None), - # Binary input, num_classes == 2 and is_multiclass not True - (rand(size=(7, )), randint(high=2, size=(7, )), 2, None), - (rand(size=(7, )), randint(high=2, size=(7, )), 2, False), - # Binary input, num_classes == 1 and is_multiclass=True - (rand(size=(7, )), randint(high=2, size=(7, )), 1, True), - ], -) -def test_incorrect_inputs(preds, target, num_classes, is_multiclass): - with pytest.raises(ValueError): - _input_format_classification( - preds=preds, target=target, threshold=THRESHOLD, num_classes=num_classes, is_multiclass=is_multiclass - ) - - -@pytest.mark.parametrize( - "preds, target, num_classes, is_multiclass, top_k", - [ - # Topk set with non (md)mc or ml prob data - (_bin.preds[0], _bin.target[0], None, None, 2), - (_bin_prob.preds[0], _bin_prob.target[0], None, None, 2), - (_mc.preds[0], _mc.target[0], None, None, 2), - (_ml.preds[0], _ml.target[0], None, None, 2), - (_mlmd.preds[0], _mlmd.target[0], None, None, 2), - (_mdmc.preds[0], _mdmc.target[0], None, None, 2), - # top_k = 0 - 
(_mc_prob_2cls.preds[0], _mc_prob_2cls.target[0], None, None, 0), - # top_k = float - (_mc_prob_2cls.preds[0], _mc_prob_2cls.target[0], None, None, 0.123), - # top_k =2 with 2 classes, is_multiclass=False - (_mc_prob_2cls.preds[0], _mc_prob_2cls.target[0], None, False, 2), - # top_k = number of classes (C dimension) - (_mc_prob.preds[0], _mc_prob.target[0], None, None, NUM_CLASSES), - # is_multiclass = True for ml prob inputs, top_k set - (_ml_prob.preds[0], _ml_prob.target[0], None, True, 2), - # top_k = num_classes for ml prob inputs - (_ml_prob.preds[0], _ml_prob.target[0], None, True, NUM_CLASSES), - ], -) -def test_incorrect_inputs_topk(preds, target, num_classes, is_multiclass, top_k): - with pytest.raises(ValueError): - _input_format_classification( - preds=preds, - target=target, - threshold=THRESHOLD, - num_classes=num_classes, - is_multiclass=is_multiclass, - top_k=top_k, - ) diff --git a/tests/metrics/classification/test_iou.py b/tests/metrics/classification/test_iou.py deleted file mode 100644 index 6bb100f68165a..0000000000000 --- a/tests/metrics/classification/test_iou.py +++ /dev/null @@ -1,216 +0,0 @@ -from functools import partial - -import numpy as np -import pytest -import torch -from sklearn.metrics import jaccard_score as sk_jaccard_score - -from pytorch_lightning.metrics.classification.iou import IoU -from pytorch_lightning.metrics.functional.iou import iou -from tests.metrics.classification.inputs import _input_binary, _input_binary_prob -from tests.metrics.classification.inputs import _input_multiclass as _input_mcls -from tests.metrics.classification.inputs import _input_multiclass_prob as _input_mcls_prob -from tests.metrics.classification.inputs import _input_multidim_multiclass as _input_mdmc -from tests.metrics.classification.inputs import _input_multidim_multiclass_prob as _input_mdmc_prob -from tests.metrics.classification.inputs import _input_multilabel as _input_mlb -from tests.metrics.classification.inputs import _input_multilabel_prob as _input_mlb_prob -from tests.metrics.utils import MetricTester, NUM_CLASSES, THRESHOLD - - -def _sk_iou_binary_prob(preds, target, average=None): - sk_preds = (preds.view(-1).numpy() >= THRESHOLD).astype(np.uint8) - sk_target = target.view(-1).numpy() - - return sk_jaccard_score(y_true=sk_target, y_pred=sk_preds, average=average) - - -def _sk_iou_binary(preds, target, average=None): - sk_preds = preds.view(-1).numpy() - sk_target = target.view(-1).numpy() - - return sk_jaccard_score(y_true=sk_target, y_pred=sk_preds, average=average) - - -def _sk_iou_multilabel_prob(preds, target, average=None): - sk_preds = (preds.view(-1).numpy() >= THRESHOLD).astype(np.uint8) - sk_target = target.view(-1).numpy() - - return sk_jaccard_score(y_true=sk_target, y_pred=sk_preds, average=average) - - -def _sk_iou_multilabel(preds, target, average=None): - sk_preds = preds.view(-1).numpy() - sk_target = target.view(-1).numpy() - - return sk_jaccard_score(y_true=sk_target, y_pred=sk_preds, average=average) - - -def _sk_iou_multiclass_prob(preds, target, average=None): - sk_preds = torch.argmax(preds, dim=len(preds.shape) - 1).view(-1).numpy() - sk_target = target.view(-1).numpy() - - return sk_jaccard_score(y_true=sk_target, y_pred=sk_preds, average=average) - - -def _sk_iou_multiclass(preds, target, average=None): - sk_preds = preds.view(-1).numpy() - sk_target = target.view(-1).numpy() - - return sk_jaccard_score(y_true=sk_target, y_pred=sk_preds, average=average) - - -def _sk_iou_multidim_multiclass_prob(preds, target, average=None): - 
sk_preds = torch.argmax(preds, dim=len(preds.shape) - 2).view(-1).numpy() - sk_target = target.view(-1).numpy() - - return sk_jaccard_score(y_true=sk_target, y_pred=sk_preds, average=average) - - -def _sk_iou_multidim_multiclass(preds, target, average=None): - sk_preds = preds.view(-1).numpy() - sk_target = target.view(-1).numpy() - - return sk_jaccard_score(y_true=sk_target, y_pred=sk_preds, average=average) - - -@pytest.mark.parametrize("reduction", ['elementwise_mean', 'none']) -@pytest.mark.parametrize( - "preds, target, sk_metric, num_classes", - [(_input_binary_prob.preds, _input_binary_prob.target, _sk_iou_binary_prob, 2), - (_input_binary.preds, _input_binary.target, _sk_iou_binary, 2), - (_input_mlb_prob.preds, _input_mlb_prob.target, _sk_iou_multilabel_prob, 2), - (_input_mlb.preds, _input_mlb.target, _sk_iou_multilabel, 2), - (_input_mcls_prob.preds, _input_mcls_prob.target, _sk_iou_multiclass_prob, NUM_CLASSES), - (_input_mcls.preds, _input_mcls.target, _sk_iou_multiclass, NUM_CLASSES), - (_input_mdmc_prob.preds, _input_mdmc_prob.target, _sk_iou_multidim_multiclass_prob, NUM_CLASSES), - (_input_mdmc.preds, _input_mdmc.target, _sk_iou_multidim_multiclass, NUM_CLASSES)] -) -class TestIoU(MetricTester): - - @pytest.mark.parametrize("ddp", [True, False]) - @pytest.mark.parametrize("dist_sync_on_step", [True, False]) - def test_confusion_matrix(self, reduction, preds, target, sk_metric, num_classes, ddp, dist_sync_on_step): - average = 'macro' if reduction == 'elementwise_mean' else None # convert tags - self.run_class_metric_test( - ddp=ddp, - preds=preds, - target=target, - metric_class=IoU, - sk_metric=partial(sk_metric, average=average), - dist_sync_on_step=dist_sync_on_step, - metric_args={ - "num_classes": num_classes, - "threshold": THRESHOLD, - "reduction": reduction - } - ) - - def test_confusion_matrix_functional(self, reduction, preds, target, sk_metric, num_classes): - average = 'macro' if reduction == 'elementwise_mean' else None # convert tags - self.run_functional_metric_test( - preds, - target, - metric_functional=iou, - sk_metric=partial(sk_metric, average=average), - metric_args={ - "num_classes": num_classes, - "threshold": THRESHOLD, - "reduction": reduction - } - ) - - -@pytest.mark.parametrize(['half_ones', 'reduction', 'ignore_index', 'expected'], [ - pytest.param(False, 'none', None, torch.Tensor([1, 1, 1])), - pytest.param(False, 'elementwise_mean', None, torch.Tensor([1])), - pytest.param(False, 'none', 0, torch.Tensor([1, 1])), - pytest.param(True, 'none', None, torch.Tensor([0.5, 0.5, 0.5])), - pytest.param(True, 'elementwise_mean', None, torch.Tensor([0.5])), - pytest.param(True, 'none', 0, torch.Tensor([0.5, 0.5])), -]) -def test_iou(half_ones, reduction, ignore_index, expected): - pred = (torch.arange(120) % 3).view(-1, 1) - target = (torch.arange(120) % 3).view(-1, 1) - if half_ones: - pred[:60] = 1 - iou_val = iou( - pred=pred, - target=target, - ignore_index=ignore_index, - reduction=reduction, - ) - assert torch.allclose(iou_val, expected, atol=1e-9) - - -# test `absent_score` -@pytest.mark.parametrize( - ['pred', 'target', 'ignore_index', 'absent_score', 'num_classes', 'expected'], - [ - # Note that -1 is used as the absent_score in almost all tests here to distinguish it from the range of valid - # scores the function can return ([0., 1.] range, inclusive). - # 2 classes, class 0 is correct everywhere, class 1 is absent. 
- pytest.param([0], [0], None, -1., 2, [1., -1.]), - pytest.param([0, 0], [0, 0], None, -1., 2, [1., -1.]), - # absent_score not applied if only class 0 is present and it's the only class. - pytest.param([0], [0], None, -1., 1, [1.]), - # 2 classes, class 1 is correct everywhere, class 0 is absent. - pytest.param([1], [1], None, -1., 2, [-1., 1.]), - pytest.param([1, 1], [1, 1], None, -1., 2, [-1., 1.]), - # When 0 index ignored, class 0 does not get a score (not even the absent_score). - pytest.param([1], [1], 0, -1., 2, [1.0]), - # 3 classes. Only 0 and 2 are present, and are perfectly predicted. 1 should get absent_score. - pytest.param([0, 2], [0, 2], None, -1., 3, [1., -1., 1.]), - pytest.param([2, 0], [2, 0], None, -1., 3, [1., -1., 1.]), - # 3 classes. Only 0 and 1 are present, and are perfectly predicted. 2 should get absent_score. - pytest.param([0, 1], [0, 1], None, -1., 3, [1., 1., -1.]), - pytest.param([1, 0], [1, 0], None, -1., 3, [1., 1., -1.]), - # 3 classes, class 0 is 0.5 IoU, class 1 is 0 IoU (in pred but not target; should not get absent_score), class - # 2 is absent. - pytest.param([0, 1], [0, 0], None, -1., 3, [0.5, 0., -1.]), - # 3 classes, class 0 is 0.5 IoU, class 1 is 0 IoU (in target but not pred; should not get absent_score), class - # 2 is absent. - pytest.param([0, 0], [0, 1], None, -1., 3, [0.5, 0., -1.]), - # Sanity checks with absent_score of 1.0. - pytest.param([0, 2], [0, 2], None, 1.0, 3, [1., 1., 1.]), - pytest.param([0, 2], [0, 2], 0, 1.0, 3, [1., 1.]), - ] -) -def test_iou_absent_score(pred, target, ignore_index, absent_score, num_classes, expected): - iou_val = iou( - pred=torch.tensor(pred), - target=torch.tensor(target), - ignore_index=ignore_index, - absent_score=absent_score, - num_classes=num_classes, - reduction='none', - ) - assert torch.allclose(iou_val, torch.tensor(expected).to(iou_val)) - - -# example data taken from -# https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/metrics/tests/test_ranking.py -@pytest.mark.parametrize( - ['pred', 'target', 'ignore_index', 'num_classes', 'reduction', 'expected'], - [ - # Ignoring an index outside of [0, num_classes-1] should have no effect. - pytest.param([0, 1, 1, 2, 2], [0, 1, 2, 2, 2], None, 3, 'none', [1, 1 / 2, 2 / 3]), - pytest.param([0, 1, 1, 2, 2], [0, 1, 2, 2, 2], -1, 3, 'none', [1, 1 / 2, 2 / 3]), - pytest.param([0, 1, 1, 2, 2], [0, 1, 2, 2, 2], 255, 3, 'none', [1, 1 / 2, 2 / 3]), - # Ignoring a valid index drops only that index from the result. - pytest.param([0, 1, 1, 2, 2], [0, 1, 2, 2, 2], 0, 3, 'none', [1 / 2, 2 / 3]), - pytest.param([0, 1, 1, 2, 2], [0, 1, 2, 2, 2], 1, 3, 'none', [1, 2 / 3]), - pytest.param([0, 1, 1, 2, 2], [0, 1, 2, 2, 2], 2, 3, 'none', [1, 1 / 2]), - # When reducing to mean or sum, the ignored index does not contribute to the output. 
- pytest.param([0, 1, 1, 2, 2], [0, 1, 2, 2, 2], 0, 3, 'elementwise_mean', [7 / 12]), - pytest.param([0, 1, 1, 2, 2], [0, 1, 2, 2, 2], 0, 3, 'sum', [7 / 6]), - ] -) -def test_iou_ignore_index(pred, target, ignore_index, num_classes, reduction, expected): - iou_val = iou( - pred=torch.tensor(pred), - target=torch.tensor(target), - ignore_index=ignore_index, - num_classes=num_classes, - reduction=reduction, - ) - assert torch.allclose(iou_val, torch.tensor(expected).to(iou_val)) diff --git a/tests/metrics/classification/test_stat_scores.py b/tests/metrics/classification/test_stat_scores.py deleted file mode 100644 index 6ccb5abed6711..0000000000000 --- a/tests/metrics/classification/test_stat_scores.py +++ /dev/null @@ -1,255 +0,0 @@ -from functools import partial -from typing import Callable, Optional - -import numpy as np -import pytest -import torch -from sklearn.metrics import multilabel_confusion_matrix -from torchmetrics.classification.checks import _input_format_classification - -from pytorch_lightning.metrics import StatScores -from pytorch_lightning.metrics.functional import stat_scores -from tests.metrics.classification.inputs import _input_binary, _input_binary_prob, _input_multiclass -from tests.metrics.classification.inputs import _input_multiclass_prob as _input_mccls_prob -from tests.metrics.classification.inputs import _input_multidim_multiclass as _input_mdmc -from tests.metrics.classification.inputs import _input_multidim_multiclass_prob as _input_mdmc_prob -from tests.metrics.classification.inputs import _input_multilabel as _input_mcls -from tests.metrics.classification.inputs import _input_multilabel_prob as _input_mlb_prob -from tests.metrics.utils import MetricTester, NUM_CLASSES, THRESHOLD - -torch.manual_seed(42) - - -def _sk_stat_scores(preds, target, reduce, num_classes, is_multiclass, ignore_index, top_k, mdmc_reduce=None): - preds, target, _ = _input_format_classification( - preds, target, threshold=THRESHOLD, num_classes=num_classes, is_multiclass=is_multiclass, top_k=top_k - ) - sk_preds, sk_target = preds.numpy(), target.numpy() - - if reduce != "macro" and ignore_index is not None and preds.shape[1] > 1: - sk_preds = np.delete(sk_preds, ignore_index, 1) - sk_target = np.delete(sk_target, ignore_index, 1) - - if preds.shape[1] == 1 and reduce == "samples": - sk_target = sk_target.T - sk_preds = sk_preds.T - - sk_stats = multilabel_confusion_matrix( - sk_target, sk_preds, samplewise=(reduce == "samples") and preds.shape[1] != 1 - ) - - if preds.shape[1] == 1 and reduce != "samples": - sk_stats = sk_stats[[1]].reshape(-1, 4)[:, [3, 1, 0, 2]] - else: - sk_stats = sk_stats.reshape(-1, 4)[:, [3, 1, 0, 2]] - - if reduce == "micro": - sk_stats = sk_stats.sum(axis=0, keepdims=True) - - sk_stats = np.concatenate([sk_stats, sk_stats[:, [3]] + sk_stats[:, [0]]], 1) - - if reduce == "micro": - sk_stats = sk_stats[0] - - if reduce == "macro" and ignore_index is not None and preds.shape[1]: - sk_stats[ignore_index, :] = -1 - - return sk_stats - - -def _sk_stat_scores_mdim_mcls(preds, target, reduce, mdmc_reduce, num_classes, is_multiclass, ignore_index, top_k): - preds, target, _ = _input_format_classification( - preds, target, threshold=THRESHOLD, num_classes=num_classes, is_multiclass=is_multiclass, top_k=top_k - ) - - if mdmc_reduce == "global": - preds = torch.transpose(preds, 1, 2).reshape(-1, preds.shape[1]) - target = torch.transpose(target, 1, 2).reshape(-1, target.shape[1]) - - return _sk_stat_scores(preds, target, reduce, None, False, ignore_index, top_k) - elif 
mdmc_reduce == "samplewise": - scores = [] - - for i in range(preds.shape[0]): - pred_i = preds[i, ...].T - target_i = target[i, ...].T - scores_i = _sk_stat_scores(pred_i, target_i, reduce, None, False, ignore_index, top_k) - - scores.append(np.expand_dims(scores_i, 0)) - - return np.concatenate(scores) - - -@pytest.mark.parametrize( - "reduce, mdmc_reduce, num_classes, inputs, ignore_index", - [ - ["unknown", None, None, _input_binary, None], - ["micro", "unknown", None, _input_binary, None], - ["macro", None, None, _input_binary, None], - ["micro", None, None, _input_mdmc_prob, None], - ["micro", None, None, _input_binary_prob, 0], - ["micro", None, None, _input_mccls_prob, NUM_CLASSES], - ["micro", None, NUM_CLASSES, _input_mccls_prob, NUM_CLASSES], - ], -) -def test_wrong_params(reduce, mdmc_reduce, num_classes, inputs, ignore_index): - """Test a combination of parameters that are invalid and should raise an error. - - This includes invalid ``reduce`` and ``mdmc_reduce`` parameter values, not setting - ``num_classes`` when ``reduce='macro'`, not setting ``mdmc_reduce`` when inputs - are multi-dim multi-class``, setting ``ignore_index`` when inputs are binary, as well - as setting ``ignore_index`` to a value higher than the number of classes. - """ - with pytest.raises(ValueError): - stat_scores( - inputs.preds[0], inputs.target[0], reduce, mdmc_reduce, num_classes=num_classes, ignore_index=ignore_index - ) - - with pytest.raises(ValueError): - sts = StatScores(reduce=reduce, mdmc_reduce=mdmc_reduce, num_classes=num_classes, ignore_index=ignore_index) - sts(inputs.preds[0], inputs.target[0]) - - -def test_wrong_threshold(): - with pytest.raises(ValueError): - StatScores(threshold=1.5) - - -@pytest.mark.parametrize("ignore_index", [None, 0]) -@pytest.mark.parametrize("reduce", ["micro", "macro", "samples"]) -@pytest.mark.parametrize( - "preds, target, sk_fn, mdmc_reduce, num_classes, is_multiclass, top_k", - [ - (_input_binary_prob.preds, _input_binary_prob.target, _sk_stat_scores, None, 1, None, None), - (_input_binary.preds, _input_binary.target, _sk_stat_scores, None, 1, False, None), - (_input_mlb_prob.preds, _input_mlb_prob.target, _sk_stat_scores, None, NUM_CLASSES, None, None), - (_input_mlb_prob.preds, _input_mlb_prob.target, _sk_stat_scores, None, NUM_CLASSES, None, 2), - (_input_mcls.preds, _input_mcls.target, _sk_stat_scores, None, NUM_CLASSES, False, None), - (_input_mccls_prob.preds, _input_mccls_prob.target, _sk_stat_scores, None, NUM_CLASSES, None, None), - (_input_mccls_prob.preds, _input_mccls_prob.target, _sk_stat_scores, None, NUM_CLASSES, None, 2), - (_input_multiclass.preds, _input_multiclass.target, _sk_stat_scores, None, NUM_CLASSES, None, None), - (_input_mdmc.preds, _input_mdmc.target, _sk_stat_scores_mdim_mcls, "samplewise", NUM_CLASSES, None, None), - ( - _input_mdmc_prob.preds, _input_mdmc_prob.target, _sk_stat_scores_mdim_mcls, "samplewise", NUM_CLASSES, None, - None - ), - (_input_mdmc.preds, _input_mdmc.target, _sk_stat_scores_mdim_mcls, "global", NUM_CLASSES, None, None), - (_input_mdmc_prob.preds, _input_mdmc_prob.target, _sk_stat_scores_mdim_mcls, "global", NUM_CLASSES, None, None), - ], -) -class TestStatScores(MetricTester): - # DDP tests temporarily disabled due to hanging issues - @pytest.mark.parametrize("ddp", [False]) - @pytest.mark.parametrize("dist_sync_on_step", [True, False]) - def test_stat_scores_class( - self, - ddp: bool, - dist_sync_on_step: bool, - sk_fn: Callable, - preds: torch.Tensor, - target: torch.Tensor, - reduce: str, - 
mdmc_reduce: Optional[str], - num_classes: Optional[int], - is_multiclass: Optional[bool], - ignore_index: Optional[int], - top_k: Optional[int], - ): - if ignore_index is not None and preds.ndim == 2: - pytest.skip("Skipping ignore_index test with binary inputs.") - - self.run_class_metric_test( - ddp=ddp, - preds=preds, - target=target, - metric_class=StatScores, - sk_metric=partial( - sk_fn, - reduce=reduce, - mdmc_reduce=mdmc_reduce, - num_classes=num_classes, - is_multiclass=is_multiclass, - ignore_index=ignore_index, - top_k=top_k, - ), - dist_sync_on_step=dist_sync_on_step, - metric_args={ - "num_classes": num_classes, - "reduce": reduce, - "mdmc_reduce": mdmc_reduce, - "threshold": THRESHOLD, - "is_multiclass": is_multiclass, - "ignore_index": ignore_index, - "top_k": top_k, - }, - check_dist_sync_on_step=True, - check_batch=True, - ) - - def test_stat_scores_fn( - self, - sk_fn: Callable, - preds: torch.Tensor, - target: torch.Tensor, - reduce: str, - mdmc_reduce: Optional[str], - num_classes: Optional[int], - is_multiclass: Optional[bool], - ignore_index: Optional[int], - top_k: Optional[int], - ): - if ignore_index is not None and preds.ndim == 2: - pytest.skip("Skipping ignore_index test with binary inputs.") - - self.run_functional_metric_test( - preds, - target, - metric_functional=stat_scores, - sk_metric=partial( - sk_fn, - reduce=reduce, - mdmc_reduce=mdmc_reduce, - num_classes=num_classes, - is_multiclass=is_multiclass, - ignore_index=ignore_index, - top_k=top_k, - ), - metric_args={ - "num_classes": num_classes, - "reduce": reduce, - "mdmc_reduce": mdmc_reduce, - "threshold": THRESHOLD, - "is_multiclass": is_multiclass, - "ignore_index": ignore_index, - "top_k": top_k, - }, - ) - - -_mc_k_target = torch.tensor([0, 1, 2]) -_mc_k_preds = torch.tensor([[0.35, 0.4, 0.25], [0.1, 0.5, 0.4], [0.2, 0.1, 0.7]]) -_ml_k_target = torch.tensor([[0, 1, 0], [1, 1, 0], [0, 0, 0]]) -_ml_k_preds = torch.tensor([[0.9, 0.2, 0.75], [0.1, 0.7, 0.8], [0.6, 0.1, 0.7]]) - - -@pytest.mark.parametrize( - "k, preds, target, reduce, expected", - [ - (1, _mc_k_preds, _mc_k_target, "micro", torch.tensor([2, 1, 5, 1, 3])), - (2, _mc_k_preds, _mc_k_target, "micro", torch.tensor([3, 3, 3, 0, 3])), - (1, _ml_k_preds, _ml_k_target, "micro", torch.tensor([0, 3, 3, 3, 3])), - (2, _ml_k_preds, _ml_k_target, "micro", torch.tensor([1, 5, 1, 2, 3])), - (1, _mc_k_preds, _mc_k_target, "macro", torch.tensor([[0, 1, 1], [0, 1, 0], [2, 1, 2], [1, 0, 0], [1, 1, 1]])), - (2, _mc_k_preds, _mc_k_target, "macro", torch.tensor([[1, 1, 1], [1, 1, 1], [1, 1, 1], [0, 0, 0], [1, 1, 1]])), - (1, _ml_k_preds, _ml_k_target, "macro", torch.tensor([[0, 0, 0], [1, 0, 2], [1, 1, 1], [1, 2, 0], [1, 2, 0]])), - (2, _ml_k_preds, _ml_k_target, "macro", torch.tensor([[0, 1, 0], [2, 0, 3], [0, 1, 0], [1, 1, 0], [1, 2, 0]])), - ], -) -def test_top_k(k: int, preds: torch.Tensor, target: torch.Tensor, reduce: str, expected: torch.Tensor): - """ A simple test to check that top_k works as expected """ - - class_metric = StatScores(top_k=k, reduce=reduce, num_classes=3) - class_metric.update(preds, target) - - assert torch.equal(class_metric.compute(), expected.T) - assert torch.equal(stat_scores(preds, target, top_k=k, reduce=reduce, num_classes=3), expected.T) diff --git a/tests/metrics/test_remove_1-5_metrics.py b/tests/metrics/test_remove_1-5_metrics.py index 41ccfb6da8015..339d07b163632 100644 --- a/tests/metrics/test_remove_1-5_metrics.py +++ b/tests/metrics/test_remove_1-5_metrics.py @@ -21,21 +21,33 @@ AUC, AUROC, AveragePrecision, + 
ConfusionMatrix, + F1, + FBeta, + HammingDistance, + IoU, MetricCollection, Precision, PrecisionRecallCurve, Recall, ROC, + StatScores, ) from pytorch_lightning.metrics.functional import ( auc, auroc, average_precision, + confusion_matrix, + f1, + fbeta, + hamming_distance, + iou, precision, precision_recall, precision_recall_curve, recall, roc, + stat_scores, ) from pytorch_lightning.metrics.functional.accuracy import accuracy from pytorch_lightning.metrics.utils import get_num_classes, select_topk, to_categorical, to_onehot @@ -162,3 +174,66 @@ def test_v1_5_metric_precision_recall(): assert torch.equal(prec, torch.tensor([1., 1., 1., 1.])) assert torch.allclose(rc, torch.tensor([1., 0.6667, 0.3333, 0.]), atol=1e-4) assert torch.equal(thrs, torch.tensor([1, 2, 3])) + + +def test_v1_5_metric_classif_mix(): + ConfusionMatrix.__init__.warned = False + with pytest.deprecated_call(match='It will be removed in v1.5.0'): + ConfusionMatrix(num_classes=1) + + FBeta.__init__.warned = False + with pytest.deprecated_call(match='It will be removed in v1.5.0'): + FBeta(num_classes=1) + + F1.__init__.warned = False + with pytest.deprecated_call(match='It will be removed in v1.5.0'): + F1(num_classes=1) + + HammingDistance.__init__.warned = False + with pytest.deprecated_call(match='It will be removed in v1.5.0'): + HammingDistance() + + StatScores.__init__.warned = False + with pytest.deprecated_call(match='It will be removed in v1.5.0'): + StatScores() + + target = torch.tensor([1, 1, 0, 0]) + preds = torch.tensor([0, 1, 0, 0]) + confusion_matrix.warned = False + with pytest.deprecated_call(match='It will be removed in v1.5.0'): + assert torch.equal(confusion_matrix(preds, target, num_classes=2), torch.tensor([[2., 0.], [1., 1.]])) + + target = torch.tensor([0, 1, 2, 0, 1, 2]) + preds = torch.tensor([0, 2, 1, 0, 0, 1]) + fbeta.warned = False + with pytest.deprecated_call(match='It will be removed in v1.5.0'): + assert torch.allclose(fbeta(preds, target, num_classes=3, beta=0.5), torch.tensor(0.3333), atol=1e-4) + + f1.warned = False + with pytest.deprecated_call(match='It will be removed in v1.5.0'): + assert torch.allclose(f1(preds, target, num_classes=3), torch.tensor(0.3333), atol=1e-4) + + target = torch.tensor([[0, 1], [1, 1]]) + preds = torch.tensor([[0, 1], [0, 1]]) + hamming_distance.warned = False + with pytest.deprecated_call(match='It will be removed in v1.5.0'): + assert hamming_distance(preds, target) == torch.tensor(0.25) + + preds = torch.tensor([1, 0, 2, 1]) + target = torch.tensor([1, 1, 2, 0]) + stat_scores.warned = False + with pytest.deprecated_call(match='It will be removed in v1.5.0'): + assert torch.equal(stat_scores(preds, target, reduce='micro'), torch.tensor([2, 2, 6, 2, 4])) + + +def test_v1_5_metric_detect(): + IoU.__init__.warned = False + with pytest.deprecated_call(match='It will be removed in v1.5.0'): + IoU(num_classes=1) + + target = torch.randint(0, 2, (10, 25, 25)) + pred = torch.tensor(target) + pred[2:5, 7:13, 9:15] = 1 - pred[2:5, 7:13, 9:15] + iou.warned = False + with pytest.deprecated_call(match='It will be removed in v1.5.0'): + assert torch.allclose(iou(pred, target), torch.tensor(0.9660), atol=1e-4) From cb590392880aefcb0830bf00ec08e4beef6d4f7e Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Sat, 20 Mar 2021 19:58:59 +0100 Subject: [PATCH 06/25] fixing examples (#6600) * try Azure * -e * path --- azure-pipelines.yml | 10 +++++----- pl_examples/basic_examples/submit_ddp2_job.sh | 2 +- pl_examples/basic_examples/submit_ddp_job.sh | 2 +- tests/__init__.py | 
4 ++-- tests/base/model_template.py | 3 ++- tests/checkpointing/test_legacy_checkpoints.py | 4 ++-- tests/helpers/advanced_models.py | 4 +++- tests/helpers/datasets.py | 15 +++++---------- tests/helpers/test_datasets.py | 11 ++++++++--- 9 files changed, 29 insertions(+), 26 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 6d9c4fd6dc3af..9e2ff77563fa0 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -113,12 +113,12 @@ jobs: python -m pytest benchmarks -v --maxfail=2 --durations=0 displayName: 'Testing: benchmarks' - - bash: | + - script: | + set -e python -m pytest pl_examples -v --maxfail=2 --durations=0 python setup.py install --user --quiet bash pl_examples/run_ddp-example.sh - cd pl_examples/basic_examples - bash submit_ddp_job.sh - bash submit_ddp2_job.sh - pip uninstall -y pytorch-lightning + # cd pl_examples/basic_examples + # bash submit_ddp_job.sh + # bash submit_ddp2_job.sh displayName: 'Examples' diff --git a/pl_examples/basic_examples/submit_ddp2_job.sh b/pl_examples/basic_examples/submit_ddp2_job.sh index 6fed6afef0d1c..026589a604c36 100755 --- a/pl_examples/basic_examples/submit_ddp2_job.sh +++ b/pl_examples/basic_examples/submit_ddp2_job.sh @@ -24,4 +24,4 @@ source activate $1 # ------------------------- # run script from above -srun python3 image_classifier.py --accelerator 'ddp2' --gpus 2 --num_nodes 2 +srun python3 simple_image_classifier.py --accelerator 'ddp2' --gpus 2 --num_nodes 2 --max_epochs 5 diff --git a/pl_examples/basic_examples/submit_ddp_job.sh b/pl_examples/basic_examples/submit_ddp_job.sh index 383579c4346b6..b4f5ff0a64d92 100755 --- a/pl_examples/basic_examples/submit_ddp_job.sh +++ b/pl_examples/basic_examples/submit_ddp_job.sh @@ -24,4 +24,4 @@ source activate $1 # ------------------------- # run script from above -srun python3 image_classifier.py --accelerator 'ddp' --gpus 2 --num_nodes 2 +srun python3 simple_image_classifier.py --accelerator 'ddp' --gpus 2 --num_nodes 2 --max_epochs 5 diff --git a/tests/__init__.py b/tests/__init__.py index 433f183896dee..fc634e6b73fec 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -19,8 +19,8 @@ _TEST_ROOT = os.path.dirname(__file__) _PROJECT_ROOT = os.path.dirname(_TEST_ROOT) _TEMP_PATH = os.path.join(_PROJECT_ROOT, 'test_temp') -DATASETS_PATH = os.path.join(_PROJECT_ROOT, 'Datasets') -LEGACY_PATH = os.path.join(_PROJECT_ROOT, 'legacy') +PATH_DATASETS = os.path.join(_PROJECT_ROOT, 'Datasets') +PATH_LEGACY = os.path.join(_PROJECT_ROOT, 'legacy') # todo: this setting `PYTHONPATH` may not be used by other evns like Conda for import packages if _PROJECT_ROOT not in os.getenv('PYTHONPATH', ""): diff --git a/tests/base/model_template.py b/tests/base/model_template.py index 1ec2df7865caa..86578fef4c699 100644 --- a/tests/base/model_template.py +++ b/tests/base/model_template.py @@ -18,6 +18,7 @@ import torch.nn.functional as F from pytorch_lightning.core.lightning import LightningModule +from tests import PATH_DATASETS from tests.base.model_optimizers import ConfigureOptimizersPool from tests.base.model_test_dataloaders import TestDataloaderVariations from tests.base.model_test_epoch_ends import TestEpochEndVariations @@ -28,7 +29,7 @@ from tests.base.model_valid_dataloaders import ValDataloaderVariations from tests.base.model_valid_epoch_ends import ValidationEpochEndVariations from tests.base.model_valid_steps import ValidationStepVariations -from tests.helpers.datasets import PATH_DATASETS, TrialMNIST +from tests.helpers.datasets import TrialMNIST class EvalModelTemplate( 
diff --git a/tests/checkpointing/test_legacy_checkpoints.py b/tests/checkpointing/test_legacy_checkpoints.py index 0b47e25c7ce23..325cc4925f4f4 100644 --- a/tests/checkpointing/test_legacy_checkpoints.py +++ b/tests/checkpointing/test_legacy_checkpoints.py @@ -18,9 +18,9 @@ import pytest from pytorch_lightning import Trainer -from tests import LEGACY_PATH +from tests import PATH_LEGACY -LEGACY_CHECKPOINTS_PATH = os.path.join(LEGACY_PATH, 'checkpoints') +LEGACY_CHECKPOINTS_PATH = os.path.join(PATH_LEGACY, 'checkpoints') CHECKPOINT_EXTENSION = ".ckpt" diff --git a/tests/helpers/advanced_models.py b/tests/helpers/advanced_models.py index 7ad678b3046fd..2b0146e1ee099 100644 --- a/tests/helpers/advanced_models.py +++ b/tests/helpers/advanced_models.py @@ -20,6 +20,7 @@ from torch.utils.data import DataLoader from pytorch_lightning.core.lightning import LightningModule +from tests import PATH_DATASETS from tests.helpers.datasets import AverageDataset, MNIST, TrialMNIST @@ -165,7 +166,7 @@ def configure_optimizers(self): return [opt_g, opt_d], [] def train_dataloader(self): - return DataLoader(TrialMNIST(train=True, download=True), batch_size=16) + return DataLoader(TrialMNIST(root=PATH_DATASETS, train=True, download=True), batch_size=16) class ParityModuleRNN(LightningModule): @@ -223,6 +224,7 @@ def configure_optimizers(self): def train_dataloader(self): return DataLoader(MNIST( + root=PATH_DATASETS, train=True, download=True, ), batch_size=128, num_workers=1) diff --git a/tests/helpers/datasets.py b/tests/helpers/datasets.py index e7bdad0f1538c..77035796ca3b1 100644 --- a/tests/helpers/datasets.py +++ b/tests/helpers/datasets.py @@ -22,11 +22,6 @@ from torch import Tensor from torch.utils.data import Dataset -from tests import _PROJECT_ROOT - -#: local path to test datasets -PATH_DATASETS = os.path.join(_PROJECT_ROOT, 'Datasets') - class MNIST(Dataset): """ @@ -47,7 +42,7 @@ class MNIST(Dataset): downloaded again. 
Examples: - >>> dataset = MNIST(download=True) + >>> dataset = MNIST(".", download=True) >>> len(dataset) 60000 >>> torch.bincount(dataset.targets) @@ -65,7 +60,7 @@ class MNIST(Dataset): def __init__( self, - root: str = PATH_DATASETS, + root: str, train: bool = True, normalize: tuple = (0.1307, 0.3081), download: bool = True, @@ -152,7 +147,7 @@ class TrialMNIST(MNIST): kwargs: Same as MNIST Examples: - >>> dataset = TrialMNIST(download=True) + >>> dataset = TrialMNIST(".", download=True) >>> len(dataset) 300 >>> sorted(set([d.item() for d in dataset.targets])) @@ -161,7 +156,7 @@ class TrialMNIST(MNIST): tensor([100, 100, 100]) """ - def __init__(self, num_samples: int = 100, digits: Optional[Sequence] = (0, 1, 2), **kwargs): + def __init__(self, root: str, num_samples: int = 100, digits: Optional[Sequence] = (0, 1, 2), **kwargs): # number of examples per class self.num_samples = num_samples # take just a subset of MNIST dataset @@ -169,7 +164,7 @@ def __init__(self, num_samples: int = 100, digits: Optional[Sequence] = (0, 1, 2 self.cache_folder_name = f"digits-{'-'.join(str(d) for d in self.digits)}_nb-{self.num_samples}" - super().__init__(normalize=(0.5, 1.0), **kwargs) + super().__init__(root, normalize=(0.5, 1.0), **kwargs) @staticmethod def _prepare_subset(full_data: torch.Tensor, full_targets: torch.Tensor, num_samples: int, digits: Sequence): diff --git a/tests/helpers/test_datasets.py b/tests/helpers/test_datasets.py index 6319fdb562504..42b5df0ff91a4 100644 --- a/tests/helpers/test_datasets.py +++ b/tests/helpers/test_datasets.py @@ -16,12 +16,17 @@ import cloudpickle import pytest +from tests import PATH_DATASETS from tests.helpers.datasets import AverageDataset, MNIST, TrialMNIST -@pytest.mark.parametrize('dataset_cls', [MNIST, TrialMNIST, AverageDataset]) -def test_pickling_dataset_mnist(tmpdir, dataset_cls): - mnist = dataset_cls() +@pytest.mark.parametrize('dataset_cls,args', [ + (MNIST, dict(root=PATH_DATASETS)), + (TrialMNIST, dict(root=PATH_DATASETS)), + (AverageDataset, dict()), +]) +def test_pickling_dataset_mnist(tmpdir, dataset_cls, args): + mnist = dataset_cls(**args) mnist_pickled = pickle.dumps(mnist) pickle.loads(mnist_pickled) From 634d83134fea4bb701c24abd5a4a38adb0eddbcd Mon Sep 17 00:00:00 2001 From: Justus Schock <12886177+justusschock@users.noreply.github.com> Date: Sun, 21 Mar 2021 00:15:49 +0100 Subject: [PATCH 07/25] Add AMP for validation, prediction and testing (#6565) * Add Tests for val and test-steps * Add native AMP * pep8 tests * pep8 plugin * changelog --- CHANGELOG.md | 2 ++ .../plugins/precision/native_amp.py | 18 +++++++++++ tests/models/test_amp.py | 31 +++++++++++++++++-- 3 files changed, 48 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 01c7ae193555a..d696535311c9d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -116,6 +116,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
### Fixed +- Added Autocast in validation, test and predict modes for Native AMP ([#6565](https://github.com/PyTorchLightning/pytorch-lightning/pull/6565)) + - Made the `Plugin.reduce` method more consistent across all Plugins to reflect a mean-reduction by default ([#6011](https://github.com/PyTorchLightning/pytorch-lightning/pull/6011)) diff --git a/pytorch_lightning/plugins/precision/native_amp.py b/pytorch_lightning/plugins/precision/native_amp.py index dc822680bcbda..3c83945c8a1b7 100644 --- a/pytorch_lightning/plugins/precision/native_amp.py +++ b/pytorch_lightning/plugins/precision/native_amp.py @@ -103,3 +103,21 @@ def train_step_context(self) -> Generator[None, None, None]: """Enable autocast context""" with torch.cuda.amp.autocast(): yield + + @contextmanager + def val_step_context(self) -> Generator[None, None, None]: + """Enable autocast context""" + with torch.cuda.amp.autocast(): + yield + + @contextmanager + def test_step_context(self) -> Generator[None, None, None]: + """Enable autocast context""" + with torch.cuda.amp.autocast(): + yield + + @contextmanager + def predict_context(self) -> Generator[None, None, None]: + """Enable autocast context""" + with torch.cuda.amp.autocast(): + yield diff --git a/tests/models/test_amp.py b/tests/models/test_amp.py index 9853db342436b..0b9d6776c1aaa 100644 --- a/tests/models/test_amp.py +++ b/tests/models/test_amp.py @@ -17,24 +17,43 @@ import pytest import torch from torch import optim +from torch.utils.data import DataLoader import tests.helpers.utils as tutils from pytorch_lightning import Trainer from pytorch_lightning.plugins.environments import SLURMEnvironment from pytorch_lightning.trainer.states import TrainerState from pytorch_lightning.utilities.exceptions import MisconfigurationException -from tests.helpers import BoringModel +from tests.helpers import BoringModel, RandomDataset from tests.helpers.runif import RunIf class AMPTestModel(BoringModel): - def training_step(self, batch, batch_idx): + def _step(self, batch, batch_idx): assert torch.is_autocast_enabled() output = self(batch) assert output.dtype == torch.float16 loss = self.loss(batch, output) - return {"loss": loss} + return loss + + def training_step(self, batch, batch_idx): + output = self._step(batch, batch_idx) + return {"loss": output} + + def validation_step(self, batch, batch_idx): + output = self._step(batch, batch_idx) + return {"x": output} + + def test_step(self, batch, batch_idx): + output = self._step(batch, batch_idx) + return {"y": output} + + def predict(self, batch, batch_idx, dataloader_idx=None): + assert torch.is_autocast_enabled() + output = self(batch) + assert output.dtype == torch.float16 + return output @pytest.mark.skip(reason='dp + amp not supported currently') # TODO @@ -54,6 +73,8 @@ def test_amp_single_gpu_dp(tmpdir): model = AMPTestModel() # tutils.run_model_test(trainer_options, model) trainer.fit(model) + trainer.test(model) + trainer.predict(model, DataLoader(RandomDataset(32, 64))) assert trainer.state == TrainerState.FINISHED, f"Training failed with {trainer.state}" @@ -73,6 +94,8 @@ def test_amp_single_gpu_ddp_spawn(tmpdir): model = AMPTestModel() # tutils.run_model_test(trainer_options, model) trainer.fit(model) + trainer.test(model) + trainer.predict(model, DataLoader(RandomDataset(32, 64))) assert trainer.state == TrainerState.FINISHED, f"Training failed with {trainer.state}" @@ -112,6 +135,8 @@ def test_amp_multi_gpu_ddp_spawn(tmpdir): model = AMPTestModel() # tutils.run_model_test(trainer_options, model) trainer.fit(model) 
+ trainer.test(model) + trainer.predict(model, DataLoader(RandomDataset(32, 64))) assert trainer.state == TrainerState.FINISHED, f"Training failed with {trainer.state}" From 37f22c99ffc16ae4010ba7f2ff42f0b86cd1f0ad Mon Sep 17 00:00:00 2001 From: Kaushik B <45285388+kaushikb11@users.noreply.github.com> Date: Mon, 22 Mar 2021 02:37:54 +0530 Subject: [PATCH 08/25] Add trainer.predict config validation (#6543) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Carlos MocholĂ­ --- CHANGELOG.md | 4 +- .../trainer/configuration_validator.py | 9 +++- tests/trainer/test_config_validator.py | 50 ++++++++++++++++++- 3 files changed, 59 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d696535311c9d..6004a28dd0829 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -40,8 +40,10 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Added no return warning to predict ([#6139](https://github.com/PyTorchLightning/pytorch-lightning/pull/6139)) -- Added `outputs` parameter to callback's `on_validation_epoch_end` & `on_test_epoch_end` hooks ([#6120](https://github.com/PyTorchLightning/pytorch-lightning/pull/6120)) +- Added `Trainer.predict` config validation ([#6543](https://github.com/PyTorchLightning/pytorch-lightning/pull/6543)) + +- Added `outputs` parameter to callback's `on_validation_epoch_end` & `on_test_epoch_end` hooks ([#6120](https://github.com/PyTorchLightning/pytorch-lightning/pull/6120)) ### Changed diff --git a/pytorch_lightning/trainer/configuration_validator.py b/pytorch_lightning/trainer/configuration_validator.py index 8c539b5ff478d..a7ba2b1c40123 100644 --- a/pytorch_lightning/trainer/configuration_validator.py +++ b/pytorch_lightning/trainer/configuration_validator.py @@ -40,7 +40,8 @@ def verify_loop_configurations(self, model: LightningModule) -> None: self.__verify_eval_loop_configuration(model, 'val') elif self.trainer.state == TrainerState.TESTING: self.__verify_eval_loop_configuration(model, 'test') - # TODO: add predict + elif self.trainer.state == TrainerState.PREDICTING: + self.__verify_predict_loop_configuration(model) def __verify_train_loop_configuration(self, model): # ----------------------------------- @@ -99,3 +100,9 @@ def __verify_eval_loop_configuration(self, model: LightningModule, stage: str) - rank_zero_warn(f'you passed in a {loader_name} but have no {step_name}. Skipping {stage} loop') if has_step and not has_loader: rank_zero_warn(f'you defined a {step_name} but have no {loader_name}. Skipping {stage} loop') + + def __verify_predict_loop_configuration(self, model: LightningModule) -> None: + + has_predict_dataloader = is_overridden('predict_dataloader', model) + if not has_predict_dataloader: + raise MisconfigurationException('Dataloader not found for `Trainer.predict`') diff --git a/tests/trainer/test_config_validator.py b/tests/trainer/test_config_validator.py index 59e10480a485e..9fccd9b36440a 100644 --- a/tests/trainer/test_config_validator.py +++ b/tests/trainer/test_config_validator.py @@ -12,10 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. 
import pytest +import torch -from pytorch_lightning import Trainer +from pytorch_lightning import LightningDataModule, LightningModule, Trainer from pytorch_lightning.utilities.exceptions import MisconfigurationException -from tests.helpers import BoringModel +from tests.helpers import BoringModel, RandomDataset def test_wrong_train_setting(tmpdir): @@ -101,3 +102,48 @@ def test_val_loop_config(tmpdir): model = BoringModel() model.validation_step = None trainer.validate(model) + + +@pytest.mark.parametrize("datamodule", [False, True]) +def test_trainer_predict_verify_config(tmpdir, datamodule): + + class TestModel(LightningModule): + + def __init__(self): + super().__init__() + self.layer = torch.nn.Linear(32, 2) + + def forward(self, x): + return self.layer(x) + + class TestLightningDataModule(LightningDataModule): + + def __init__(self, dataloaders): + super().__init__() + self._dataloaders = dataloaders + + def test_dataloader(self): + return self._dataloaders + + def predict_dataloader(self): + return self._dataloaders + + dataloaders = [torch.utils.data.DataLoader(RandomDataset(32, 2)), torch.utils.data.DataLoader(RandomDataset(32, 2))] + + model = TestModel() + + trainer = Trainer(default_root_dir=tmpdir) + + if datamodule: + datamodule = TestLightningDataModule(dataloaders) + results = trainer.predict(model, datamodule=datamodule) + else: + results = trainer.predict(model, dataloaders=dataloaders) + + assert len(results) == 2 + assert results[0][0].shape == torch.Size([1, 2]) + + model.predict_dataloader = None + + with pytest.raises(MisconfigurationException, match="Dataloader not found for `Trainer.predict`"): + trainer.predict(model) From 42a7b7058573bc659eb1fd6a64035ae80e270211 Mon Sep 17 00:00:00 2001 From: Kaushik B <45285388+kaushikb11@users.noreply.github.com> Date: Mon, 22 Mar 2021 02:40:54 +0530 Subject: [PATCH 09/25] Add DDP Spawn being default for Multi GPUs (#6292) --- docs/source/advanced/multi_gpu.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/advanced/multi_gpu.rst b/docs/source/advanced/multi_gpu.rst index a2a74c7587ae3..5cdb0b377f2b7 100644 --- a/docs/source/advanced/multi_gpu.rst +++ b/docs/source/advanced/multi_gpu.rst @@ -267,7 +267,7 @@ Lightning allows multiple ways of training - TPUs (``tpu_cores=8|x``) (tpu or TPU pod) .. note:: - If you request multiple GPUs or nodes without setting a mode, DDP will be automatically used. + If you request multiple GPUs or nodes without setting a mode, DDP Spawn will be automatically used. For a deeper understanding of what Lightning is doing, feel free to read this `guide `_. 
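For illustration, a minimal sketch of what the documentation change above describes — picking a distributed mode explicitly instead of relying on the new DDP Spawn default. It assumes the 1.2-era ``Trainer`` arguments that already appear elsewhere in this series (``gpus``, ``accelerator``, ``max_epochs``) and the repository's own ``BoringModel`` test helper; it is not part of any patch in the series.

    from pytorch_lightning import Trainer
    from tests.helpers import BoringModel

    model = BoringModel()

    # No mode given: with multiple GPUs requested, DDP Spawn is now used automatically.
    trainer = Trainer(gpus=2, max_epochs=1)

    # Explicit mode: request plain DDP instead of the spawn-based default.
    trainer = Trainer(gpus=2, accelerator="ddp", max_epochs=1)
    trainer.fit(model)

The explicit form mirrors arguments the patches themselves exercise (the DDP profiler test below runs with ``accelerator="ddp", gpus=2``).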
From 51c9260fad5b1ed3b4e41a9ebf460bf4c609fe2a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Mon, 22 Mar 2021 00:39:55 +0100 Subject: [PATCH 10/25] Move profiler tests (#6619) --- tests/special_tests.sh | 4 +- tests/test_profiler.py | 143 ++++++++++++++++++++++++++++++++-- tests/trainer/test_trainer.py | 125 ----------------------------- 3 files changed, 138 insertions(+), 134 deletions(-) diff --git a/tests/special_tests.sh b/tests/special_tests.sh index dd67af470c4ec..3fe9d6c0e277c 100644 --- a/tests/special_tests.sh +++ b/tests/special_tests.sh @@ -34,9 +34,9 @@ python ${DEFAULTS} tests/trainer/test_trainer.py::test_trainer_predict_ddp python ${DEFAULTS} tests/trainer/test_trainer.py::test_trainer_predict_dp python ${DEFAULTS} tests/trainer/logging_/test_train_loop_logging_1_0.py::test_logging_sync_dist_true_ddp python ${DEFAULTS} tests/callbacks/test_pruning.py::test_pruning_callback_ddp -python ${DEFAULTS} tests/trainer/test_trainer.py::test_pytorch_profiler_trainer_ddp +python ${DEFAULTS} tests/test_profiler.py::test_pytorch_profiler_trainer_ddp python ${DEFAULTS} tests/models/test_hooks.py::test_transfer_batch_hook_ddp python ${DEFAULTS} tests/trainer/test_data_loading.py::test_replace_distrubuted_sampler_custom_dataloader_custom_batch_sampler python ${DEFAULTS} tests/trainer/optimization/test_manual_optimization.py::test_step_with_optimizer_closure_with_different_frequencies_ddp_with_toggle_model python ${DEFAULTS} tests/checkpointing/test_checkpoint_callback_frequency.py::test_top_k_ddp -nvprof --profile-from-start off -o trace_name.prof -- python ${DEFAULTS} tests/trainer/test_trainer.py::test_pytorch_profiler_nested_emit_nvtx +nvprof --profile-from-start off -o trace_name.prof -- python ${DEFAULTS} tests/test_profiler.py::test_pytorch_profiler_nested_emit_nvtx diff --git a/tests/test_profiler.py b/tests/test_profiler.py index 9b51ca7f7c6d2..5221c0cbf7bf6 100644 --- a/tests/test_profiler.py +++ b/tests/test_profiler.py @@ -14,12 +14,17 @@ import logging import os import time +from distutils.version import LooseVersion from pathlib import Path import numpy as np import pytest +import torch -from pytorch_lightning.profiler import AdvancedProfiler, SimpleProfiler +from pytorch_lightning import Trainer +from pytorch_lightning.profiler import AdvancedProfiler, SimpleProfiler, PyTorchProfiler +from tests.helpers import BoringModel +from tests.helpers.runif import RunIf PROFILER_OVERHEAD_MAX_TOLERANCE = 0.0005 @@ -44,12 +49,6 @@ def simple_profiler(): return profiler -@pytest.fixture -def advanced_profiler(tmpdir): - profiler = AdvancedProfiler(output_filename=os.path.join(tmpdir, "profiler.txt")) - return profiler - - @pytest.mark.parametrize(["action", "expected"], [ pytest.param("a", [3, 1]), pytest.param("b", [2]), @@ -116,6 +115,12 @@ def test_simple_profiler_value_errors(simple_profiler): simple_profiler.stop(action) +@pytest.fixture +def advanced_profiler(tmpdir): + profiler = AdvancedProfiler(output_filename=os.path.join(tmpdir, "profiler.txt")) + return profiler + + @pytest.mark.parametrize(["action", "expected"], [ pytest.param("a", [3, 1]), pytest.param("b", [2]), @@ -187,3 +192,127 @@ def test_advanced_profiler_value_errors(advanced_profiler): advanced_profiler.start(action) advanced_profiler.stop(action) + + +@pytest.fixture +def pytorch_profiler(tmpdir): + profiler = PyTorchProfiler(output_filename=os.path.join(tmpdir, "profiler.txt"), local_rank=0) + return profiler + + +def test_pytorch_profiler_describe(pytorch_profiler): + """Ensure the profiler 
won't fail when reporting the summary.""" + with pytorch_profiler.profile("test_step"): + pass + + # log to stdout and print to file + pytorch_profiler.describe() + data = Path(pytorch_profiler.output_fname).read_text() + assert len(data) > 0 + + +def test_pytorch_profiler_value_errors(pytorch_profiler): + """Ensure errors are raised where expected.""" + + action = "test_step" + with pytest.raises(ValueError): + pytorch_profiler.stop(action) + + pytorch_profiler.start(action) + pytorch_profiler.stop(action) + + +@RunIf(min_gpus=2, special=True) +@pytest.mark.parametrize("use_output_filename", [False, True]) +def test_pytorch_profiler_trainer_ddp(tmpdir, use_output_filename): + """Ensure that the profiler can be given to the training and default step are properly recorded. """ + + if use_output_filename: + output_filename = os.path.join(tmpdir, "profiler.txt") + else: + output_filename = None + + profiler = PyTorchProfiler(output_filename=output_filename) + + model = BoringModel() + trainer = Trainer( + fast_dev_run=True, + profiler=profiler, + accelerator="ddp", + gpus=2, + ) + trainer.fit(model) + + enabled = use_output_filename or not use_output_filename and profiler.local_rank == 0 + + if enabled: + assert len(profiler.summary()) > 0 + assert set(profiler.profiled_actions.keys()) == {'training_step_and_backward', 'validation_step'} + else: + assert profiler.summary() is None + assert set(profiler.profiled_actions.keys()) == set() + + if use_output_filename: + profiler.describe() + data = Path(profiler.output_fname).read_text() + assert len(data) > 0 + + +def test_pytorch_profiler_nested(tmpdir): + """Ensure that the profiler handles nested context""" + + pytorch_profiler = PyTorchProfiler( + profiled_functions=["a", "b", "c"], use_cuda=False, output_filename=os.path.join(tmpdir, "profiler.txt") + ) + + with pytorch_profiler.profile("a"): + a = torch.ones(42) + with pytorch_profiler.profile("b"): + b = torch.zeros(42) + with pytorch_profiler.profile("c"): + _ = a + b + + pa = pytorch_profiler.profiled_actions + + # From PyTorch 1.8.0, less operation are being traced. + if LooseVersion(torch.__version__) >= LooseVersion("1.8.0"): + expected_ = { + 'a': ['ones', 'empty', 'fill_', 'zeros', 'empty', 'zero_', 'add'], + 'b': ['zeros', 'empty', 'zero_'], + 'c': ['add'], + } + # From PyTorch 1.6.0, more operation are being traced. 
+ elif LooseVersion(torch.__version__) >= LooseVersion("1.6.0"): + expected_ = { + 'a': ['ones', 'empty', 'fill_', 'zeros', 'empty', 'zero_', 'fill_', 'add', 'empty'], + 'b': ['zeros', 'empty', 'zero_', 'fill_'], + 'c': ['add', 'empty'], + } + else: + expected_ = { + 'a': ['add'], + 'b': [], + 'c': ['add'], + } + + for n in ('a', 'b', 'c'): + pa[n] = [e.name for e in pa[n]] + if LooseVersion(torch.__version__) >= LooseVersion("1.7.1"): + pa[n] = [e.replace("aten::", "") for e in pa[n]] + assert pa[n] == expected_[n] + + +@RunIf(min_gpus=1, special=True) +def test_pytorch_profiler_nested_emit_nvtx(tmpdir): + """ + This test check emit_nvtx is correctly supported + """ + profiler = PyTorchProfiler(use_cuda=True, emit_nvtx=True) + + model = BoringModel() + trainer = Trainer( + fast_dev_run=True, + profiler=profiler, + gpus=1, + ) + trainer.fit(model) diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py index 3375b02c5496b..66889bb7e1139 100644 --- a/tests/trainer/test_trainer.py +++ b/tests/trainer/test_trainer.py @@ -17,7 +17,6 @@ import sys from argparse import Namespace from copy import deepcopy -from distutils.version import LooseVersion from pathlib import Path from unittest.mock import ANY, call, patch @@ -43,12 +42,6 @@ from tests.helpers.runif import RunIf -@pytest.fixture -def pytorch_profiler(tmpdir): - profiler = PyTorchProfiler(output_filename=os.path.join(tmpdir, "profiler.txt"), local_rank=0) - return profiler - - @pytest.mark.parametrize("url_ckpt", [True, False]) def test_no_val_module(monkeypatch, tmpdir, tmpdir_server, url_ckpt): """Tests use case where trainer saves the model, and user loads it from tags independently.""" @@ -1488,124 +1481,6 @@ def test_trainer_predict_ddp_cpu(tmpdir): predict(tmpdir, "ddp_cpu", 0, 2) -def test_pytorch_profiler_describe(pytorch_profiler): - """Ensure the profiler won't fail when reporting the summary.""" - with pytorch_profiler.profile("test_step"): - pass - - # log to stdout and print to file - pytorch_profiler.describe() - data = Path(pytorch_profiler.output_fname).read_text() - assert len(data) > 0 - - -def test_pytorch_profiler_value_errors(pytorch_profiler): - """Ensure errors are raised where expected.""" - - action = "test_step" - with pytest.raises(ValueError): - pytorch_profiler.stop(action) - - pytorch_profiler.start(action) - pytorch_profiler.stop(action) - - -@RunIf(min_gpus=2, special=True) -@pytest.mark.parametrize("use_output_filename", [False, True]) -def test_pytorch_profiler_trainer_ddp(tmpdir, use_output_filename): - """Ensure that the profiler can be given to the training and default step are properly recorded. 
""" - - if use_output_filename: - output_filename = os.path.join(tmpdir, "profiler.txt") - else: - output_filename = None - - profiler = PyTorchProfiler(output_filename=output_filename) - - model = BoringModel() - trainer = Trainer( - fast_dev_run=True, - profiler=profiler, - accelerator="ddp", - gpus=2, - ) - trainer.fit(model) - - enabled = use_output_filename or not use_output_filename and profiler.local_rank == 0 - - if enabled: - assert len(profiler.summary()) > 0 - assert set(profiler.profiled_actions.keys()) == {'training_step_and_backward', 'validation_step'} - else: - assert profiler.summary() is None - assert set(profiler.profiled_actions.keys()) == set() - - if use_output_filename: - profiler.describe() - data = Path(profiler.output_fname).read_text() - assert len(data) > 0 - - -def test_pytorch_profiler_nested(tmpdir): - """Ensure that the profiler handles nested context""" - - pytorch_profiler = PyTorchProfiler( - profiled_functions=["a", "b", "c"], use_cuda=False, output_filename=os.path.join(tmpdir, "profiler.txt") - ) - - with pytorch_profiler.profile("a"): - a = torch.ones(42) - with pytorch_profiler.profile("b"): - b = torch.zeros(42) - with pytorch_profiler.profile("c"): - _ = a + b - - pa = pytorch_profiler.profiled_actions - - # From PyTorch 1.8.0, less operation are being traced. - if LooseVersion(torch.__version__) >= LooseVersion("1.8.0"): - expected_ = { - 'a': ['ones', 'empty', 'fill_', 'zeros', 'empty', 'zero_', 'add'], - 'b': ['zeros', 'empty', 'zero_'], - 'c': ['add'], - } - # From PyTorch 1.6.0, more operation are being traced. - elif LooseVersion(torch.__version__) >= LooseVersion("1.6.0"): - expected_ = { - 'a': ['ones', 'empty', 'fill_', 'zeros', 'empty', 'zero_', 'fill_', 'add', 'empty'], - 'b': ['zeros', 'empty', 'zero_', 'fill_'], - 'c': ['add', 'empty'], - } - else: - expected_ = { - 'a': ['add'], - 'b': [], - 'c': ['add'], - } - - for n in ('a', 'b', 'c'): - pa[n] = [e.name for e in pa[n]] - if LooseVersion(torch.__version__) >= LooseVersion("1.7.1"): - pa[n] = [e.replace("aten::", "") for e in pa[n]] - assert pa[n] == expected_[n] - - -@RunIf(min_gpus=1, special=True) -def test_pytorch_profiler_nested_emit_nvtx(tmpdir): - """ - This test check emit_nvtx is correctly supported - """ - profiler = PyTorchProfiler(use_cuda=True, emit_nvtx=True) - - model = BoringModel() - trainer = Trainer( - fast_dev_run=True, - profiler=profiler, - gpus=1, - ) - trainer.fit(model) - - @pytest.mark.parametrize( ["limit_train_batches", "global_step", "num_training_batches", "current_epoch", "should_train"], [(0.2, 0, 0, 0, False), (0.5, 10, 2, 4, True)], From 870247ffe6a5ce819cf7e0a22b997a765d2f6675 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Mon, 22 Mar 2021 01:38:10 +0100 Subject: [PATCH 11/25] drop mypy from .pre-commit-config.yaml (#6542) --- .pre-commit-config.yaml | 5 ----- 1 file changed, 5 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 21c52539a890d..45eca43de93ac 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -33,8 +33,3 @@ repos: hooks: - id: yapf args: [--parallel, --in-place] - - - repo: https://github.com/pre-commit/mirrors-mypy - rev: v0.790 - hooks: - - id: mypy From 853523ee643fe0f0cc30d40d9e85a8869e7edfd8 Mon Sep 17 00:00:00 2001 From: Ethan Harris Date: Mon, 22 Mar 2021 08:53:51 +0000 Subject: [PATCH 12/25] Clean utilities/argparse and add missing tests (#6607) --- pytorch_lightning/utilities/argparse.py | 10 +--- ...est_argparse_utils.py => test_argparse.py} | 46 
++++++++++++++++++- 2 files changed, 47 insertions(+), 9 deletions(-) rename tests/utilities/{test_argparse_utils.py => test_argparse.py} (80%) diff --git a/pytorch_lightning/utilities/argparse.py b/pytorch_lightning/utilities/argparse.py index 49cbaf3c6bdcf..46d88184ee190 100644 --- a/pytorch_lightning/utilities/argparse.py +++ b/pytorch_lightning/utilities/argparse.py @@ -67,7 +67,7 @@ def parse_argparser(cls, arg_parser: Union[ArgumentParser, Namespace]) -> Namesp # Value has been passed as a flag => It is currently None, so we need to set it to True # We always set to True, regardless of the default value. # Users must pass False directly, but when passing nothing True is assumed. - # i.e. the only way to disable somthing that defaults to True is to use the long form: + # i.e. the only way to disable something that defaults to True is to use the long form: # "--a_default_true_arg False" becomes False, while "--a_default_false_arg" becomes None, # which then becomes True here. @@ -242,9 +242,6 @@ def add_argparse_args( if arg == 'track_grad_norm': use_type = float - if arg_default is inspect._empty: - arg_default = None - parser.add_argument( f'--{arg}', dest=arg, @@ -291,10 +288,7 @@ def _gpus_allowed_type(x) -> Union[int, str]: def _gpus_arg_default(x) -> Union[int, str]: - if ',' in x: - return str(x) - else: - return int(x) + return _gpus_allowed_type(x) def _int_or_float_type(x) -> Union[int, float]: diff --git a/tests/utilities/test_argparse_utils.py b/tests/utilities/test_argparse.py similarity index 80% rename from tests/utilities/test_argparse_utils.py rename to tests/utilities/test_argparse.py index b2eac514941e6..fdf5ae0cafe65 100644 --- a/tests/utilities/test_argparse_utils.py +++ b/tests/utilities/test_argparse.py @@ -1,17 +1,51 @@ import io -from argparse import ArgumentParser +from argparse import ArgumentParser, Namespace from typing import List +from unittest.mock import MagicMock import pytest from pytorch_lightning import Trainer from pytorch_lightning.utilities.argparse import ( add_argparse_args, + from_argparse_args, get_abbrev_qualified_cls_name, + parse_argparser, parse_args_from_docstring, + _gpus_arg_default, + _int_or_float_type ) +class ArgparseExample: + def __init__(self, a: int = 0, b: str = '', c: bool = False): + self.a = a + self.b = b + self.c = c + + +def test_from_argparse_args(): + args = Namespace(a=1, b='test', c=True, d='not valid') + my_instance = from_argparse_args(ArgparseExample, args) + assert my_instance.a == 1 + assert my_instance.b == 'test' + assert my_instance.c + + parser = ArgumentParser() + mock_trainer = MagicMock() + _ = from_argparse_args(mock_trainer, parser) + mock_trainer.parse_argparser.assert_called_once_with(parser) + + +def test_parse_argparser(): + args = Namespace(a=1, b='test', c=None, d='not valid') + new_args = parse_argparser(ArgparseExample, args) + assert new_args.a == 1 + assert new_args.b == 'test' + assert new_args.c + assert new_args.d == 'not valid' + + def test_parse_args_from_docstring_normal(): args_help = parse_args_from_docstring( """Constrain image dataset @@ -168,3 +202,13 @@ def test_add_argparse_args_no_argument_group(): args = parser.parse_args(fake_argv) assert args.main_arg == "abc" assert args.my_parameter == 2 + + +def test_gpus_arg_default(): + assert _gpus_arg_default('1,2') == '1,2' + assert _gpus_arg_default('1') == 1 + + +def test_int_or_float_type(): + assert isinstance(_int_or_float_type('0.0'), float) + assert isinstance(_int_or_float_type('0'), int) From 
58c9fa7edbb40dd3fbfc544ee042e8b23693db08 Mon Sep 17 00:00:00 2001 From: Sean Naren Date: Mon, 22 Mar 2021 11:43:53 +0000 Subject: [PATCH 13/25] Allow training type plugin to delay optimizer creation (FSDP 2/n) (#6331) * Allow training_type_plugin to delay optimizer configure * Add missing references to trainer, add a CPU accelerator based test --- pytorch_lightning/accelerators/accelerator.py | 9 +++-- .../training_type/training_type_plugin.py | 10 ++++++ pytorch_lightning/trainer/trainer.py | 4 +-- tests/accelerators/test_cpu.py | 35 ++++++++++++++++++- 4 files changed, 52 insertions(+), 6 deletions(-) diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py index ceb9d98505acc..60e6ea88b4250 100644 --- a/pytorch_lightning/accelerators/accelerator.py +++ b/pytorch_lightning/accelerators/accelerator.py @@ -85,7 +85,8 @@ def setup(self, trainer: 'Trainer', model: LightningModule) -> None: model: the LightningModule """ self.setup_training_type_plugin(self.training_type_plugin, model) - self.setup_optimizers(trainer) + if not self.training_type_plugin.setup_optimizers_in_pre_dispatch: + self.setup_optimizers(trainer) self.setup_precision_plugin(self.precision_plugin) def start_training(self, trainer: 'Trainer') -> None: @@ -97,12 +98,14 @@ def start_evaluating(self, trainer: 'Trainer') -> None: def start_predicting(self, trainer: 'Trainer') -> None: self.training_type_plugin.start_predicting(trainer) - def pre_dispatch(self) -> None: + def pre_dispatch(self, trainer: 'Trainer') -> None: """Hook to do something before the training/evaluation/prediction starts.""" self.training_type_plugin.pre_dispatch() + if self.training_type_plugin.setup_optimizers_in_pre_dispatch: + self.setup_optimizers(trainer) self.precision_plugin.pre_dispatch() - def post_dispatch(self) -> None: + def post_dispatch(self, trainer: 'Trainer') -> None: """Hook to do something before the training/evaluation/prediction starts.""" self.training_type_plugin.post_dispatch() self.precision_plugin.post_dispatch() diff --git a/pytorch_lightning/plugins/training_type/training_type_plugin.py b/pytorch_lightning/plugins/training_type/training_type_plugin.py index 6a87792c7bd03..b6f1be359bbf2 100644 --- a/pytorch_lightning/plugins/training_type/training_type_plugin.py +++ b/pytorch_lightning/plugins/training_type/training_type_plugin.py @@ -182,3 +182,13 @@ def init_optimizers(self, trainer: "Trainer", model: LightningModule): def optimizer_step(self, optimizer: torch.optim.Optimizer, lambda_closure: Callable, **kwargs): optimizer.step(closure=lambda_closure, **kwargs) + + @property + def setup_optimizers_in_pre_dispatch(self) -> bool: + """ + Override to delay setting optimizers and schedulers till after dispatch. + This is useful when the `TrainingTypePlugin` requires operating on the wrapped accelerator model. + However this may break certain precision plugins such as APEX which require optimizers to be set. + Returns: If True, delay setup optimizers till pre_dispatch, else call within setup. 
+ """ + return False diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 53b4920bd85ef..0e9e28c9996f2 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -495,7 +495,7 @@ def fit( return self.accelerator.results or 1 def pre_dispatch(self): - self.accelerator.pre_dispatch() + self.accelerator.pre_dispatch(self) # log hyper-parameters if self.logger is not None: @@ -505,7 +505,7 @@ def pre_dispatch(self): self.logger.save() def post_dispatch(self): - self.accelerator.post_dispatch() + self.accelerator.post_dispatch(self) self.accelerator.teardown() def dispatch(self): diff --git a/tests/accelerators/test_cpu.py b/tests/accelerators/test_cpu.py index 81a5132e47356..349e4175a7444 100644 --- a/tests/accelerators/test_cpu.py +++ b/tests/accelerators/test_cpu.py @@ -2,11 +2,12 @@ import pytest import torch - +from pytorch_lightning import Trainer from pytorch_lightning.accelerators import CPUAccelerator from pytorch_lightning.plugins import SingleDevicePlugin from pytorch_lightning.plugins.precision import MixedPrecisionPlugin from pytorch_lightning.utilities.exceptions import MisconfigurationException +from tests.helpers.boring_model import BoringModel def test_unsupported_precision_plugins(): @@ -18,3 +19,35 @@ def test_unsupported_precision_plugins(): ) with pytest.raises(MisconfigurationException, match=r"amp \+ cpu is not supported."): accelerator.setup(trainer=trainer, model=model) + + +@pytest.mark.parametrize("delay_dispatch", [True, False]) +def test_plugin_setup_optimizers_in_pre_dispatch(tmpdir, delay_dispatch): + """ + Test when using a custom training type plugin that delays setup optimizers, + we do not call setup optimizers till ``pre_dispatch``. + """ + + class TestModel(BoringModel): + def on_fit_start(self): + if delay_dispatch: + # Ensure we haven't setup optimizers if we've delayed dispatch + assert len(self.trainer.optimizers) == 0 + else: + assert len(self.trainer.optimizers) > 0 + + def on_fit_end(self): + assert len(self.trainer.optimizers) > 0 + + class CustomPlugin(SingleDevicePlugin): + @property + def setup_optimizers_in_pre_dispatch(self) -> bool: + return delay_dispatch + + model = TestModel() + trainer = Trainer( + default_root_dir=tmpdir, + fast_dev_run=True, + plugins=CustomPlugin(device=torch.device("cpu")) + ) + trainer.fit(model) From e2e1de0fb73e6ba69fb26b7ade4371c5ee6a1845 Mon Sep 17 00:00:00 2001 From: camruta <79558951+camruta@users.noreply.github.com> Date: Mon, 22 Mar 2021 04:49:06 -0700 Subject: [PATCH 14/25] Add teardown method to BaseProfiler. (#6370) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Carlos MocholĂ­ Co-authored-by: ananthsub --- .gitignore | 1 + CHANGELOG.md | 18 ++++++++++++------ pytorch_lightning/profiler/profilers.py | 20 ++++++++++++++------ pytorch_lightning/profiler/pytorch.py | 8 +++++--- pytorch_lightning/trainer/trainer.py | 1 + pytorch_lightning/trainer/training_loop.py | 1 + tests/test_profiler.py | 22 ++++++++++++++++++++-- 7 files changed, 54 insertions(+), 17 deletions(-) diff --git a/.gitignore b/.gitignore index cd0ba22453512..c007140257188 100644 --- a/.gitignore +++ b/.gitignore @@ -157,3 +157,4 @@ tags data MNIST runs +*traces* diff --git a/CHANGELOG.md b/CHANGELOG.md index 6004a28dd0829..5f005f583c5ed 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,8 +14,10 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Added a way to print to terminal without breaking up the progress bar ([#5470](https://github.com/PyTorchLightning/pytorch-lightning/pull/5470)) + - Added support to checkpoint after training steps in `ModelCheckpoint` callback ([#6146](https://github.com/PyTorchLightning/pytorch-lightning/pull/6146)) + - Added `checkpoint` parameter to callback's `on_save_checkpoint` hook ([#6072](https://github.com/PyTorchLightning/pytorch-lightning/pull/6072)) @@ -37,6 +39,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Added arg to `self.log` that enables users to give custom names when dealing with multiple dataloaders ([#6274](https://github.com/PyTorchLightning/pytorch-lightning/pull/6274)) +- Added `teardown` method to `BaseProfiler` to enable subclasses defining post-profiling steps outside of `__del__` ([#6370](https://github.com/PyTorchLightning/pytorch-lightning/pull/6370)) + + - Added no return warning to predict ([#6139](https://github.com/PyTorchLightning/pytorch-lightning/pull/6139)) @@ -120,6 +125,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Added Autocast in validation, test and predict modes for Native AMP ([#6565](https://github.com/PyTorchLightning/pytorch-lightning/pull/6565)) + - Made the `Plugin.reduce` method more consistent across all Plugins to reflect a mean-reduction by default ([#6011](https://github.com/PyTorchLightning/pytorch-lightning/pull/6011)) @@ -147,6 +153,12 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Fixed LightningModule `all_gather` on cpu tensors ([#6416](https://github.com/PyTorchLightning/pytorch-lightning/pull/6416)) +- Fixed a bug where `all_gather` would not work correctly with `tpu_cores=8` ([#6587](https://github.com/PyTorchLightning/pytorch-lightning/pull/6587)) + + +- Update Gradient Clipping for the TPU Accelerator ([#6576](https://github.com/PyTorchLightning/pytorch-lightning/pull/6576)) + + - Fixed torch distributed not available in setup hook for DDP ([#6506](https://github.com/PyTorchLightning/pytorch-lightning/pull/6506)) @@ -170,12 +182,6 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Fixed when Train loop config was run during `Trainer.predict` ([#6541](https://github.com/PyTorchLightning/pytorch-lightning/pull/6541)) -- Fixed a bug where `all_gather` would not work correctly with `tpu_cores=8` ([#6587](https://github.com/PyTorchLightning/pytorch-lightning/pull/6587)) - - -- Update Gradient Clipping for the TPU Accelerator ([#6576](https://github.com/PyTorchLightning/pytorch-lightning/pull/6576)) - - ## [1.2.3] - 2021-03-09 ### Fixed diff --git a/pytorch_lightning/profiler/profilers.py b/pytorch_lightning/profiler/profilers.py index d704ba83236c1..55898dc2ee4e1 100644 --- a/pytorch_lightning/profiler/profilers.py +++ b/pytorch_lightning/profiler/profilers.py @@ -55,6 +55,10 @@ def start(self, action_name: str) -> None: def stop(self, action_name: str) -> None: """Defines how to record the duration once an action is complete.""" + def teardown(self) -> None: + """Execute arbitrary post-profiling tear-down steps as defined by subclass.""" + pass + @contextmanager def profile(self, action_name: str) -> None: """ @@ -211,14 +215,16 @@ def log_row(action, mean, total): def describe(self): """Logs a profile report after the conclusion of the training run.""" super().describe() - if self.output_file: - self.output_file.flush() + self.teardown() - def __del__(self): + def teardown(self) -> None: """Close profiler's stream.""" if self.output_file: self.output_file.close() + def __del__(self): + self.teardown() + class AdvancedProfiler(BaseProfiler): """ @@ -283,10 +289,12 @@ def summary(self) -> str: def describe(self): """Logs a profile report after the conclusion of the training run.""" super().describe() - if self.output_file: - self.output_file.flush() + self.teardown() - def __del__(self): + def teardown(self) -> None: """Close profiler's stream.""" if self.output_file: self.output_file.close() + + def __del__(self): + self.teardown() diff --git a/pytorch_lightning/profiler/pytorch.py b/pytorch_lightning/profiler/pytorch.py index 88a33a3d367f8..fdde80589acf3 100644 --- a/pytorch_lightning/profiler/pytorch.py +++ b/pytorch_lightning/profiler/pytorch.py @@ -294,10 +294,12 @@ def summary(self) -> str: def describe(self): """Logs a profile report after the conclusion of the training run.""" super().describe() - if self.output_file: - self.output_file.flush() + self.teardown() - def __del__(self): + def teardown(self) -> None: """Close profiler's stream.""" if self.output_file: self.output_file.close() + + def __del__(self): + self.teardown() diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 0e9e28c9996f2..a5b99871d55f9 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -1077,6 +1077,7 @@ def call_teardown_hook(self, model: LightningModule) -> None: else: state = None + self.profiler.teardown() self.teardown(stage=state) model.teardown(stage=state) diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index 7e737c424ff26..a77d91a7402b4 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -140,6 +140,7 @@ def on_train_end(self): self.trainer.logger.finalize("success") # summarize profile results + # todo (tchaton) All ranks should call describe. 
if self.trainer.global_rank == 0: self.trainer.profiler.describe() diff --git a/tests/test_profiler.py b/tests/test_profiler.py index 5221c0cbf7bf6..ccdd8a569c9a8 100644 --- a/tests/test_profiler.py +++ b/tests/test_profiler.py @@ -252,8 +252,8 @@ def test_pytorch_profiler_trainer_ddp(tmpdir, use_output_filename): assert profiler.summary() is None assert set(profiler.profiled_actions.keys()) == set() - if use_output_filename: - profiler.describe() + # todo (tchaton) add support for all ranks + if use_output_filename and os.getenv("LOCAL_RANK") == "0": data = Path(profiler.output_fname).read_text() assert len(data) > 0 @@ -316,3 +316,21 @@ def test_pytorch_profiler_nested_emit_nvtx(tmpdir): gpus=1, ) trainer.fit(model) + + +@pytest.mark.parametrize("cls", (SimpleProfiler, AdvancedProfiler, PyTorchProfiler)) +def test_profiler_teardown(tmpdir, cls): + """ + This test checks if profiler teardown method is called when trainer is exiting. + """ + profiler = cls(output_filename=os.path.join(tmpdir, "profiler.txt")) + + model = BoringModel() + trainer = Trainer( + default_root_dir=tmpdir, + fast_dev_run=True, + profiler=profiler, + ) + trainer.fit(model) + + assert profiler.output_file.closed From 1fae10a2dc8224379eac84d6242e0847c2685565 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Mon, 22 Mar 2021 13:39:19 +0100 Subject: [PATCH 15/25] refactoring setup (#6590) * refactoring setup * . * docs * flake8 --- docs/source/conf.py | 23 ++++--- pytorch_lightning/__init__.py | 81 +++++++------------------ pytorch_lightning/callbacks/progress.py | 3 +- pytorch_lightning/info.py | 35 +++++++++++ pytorch_lightning/setup_tools.py | 6 +- setup.py | 46 ++++++++------ 6 files changed, 101 insertions(+), 93 deletions(-) create mode 100644 pytorch_lightning/info.py diff --git a/docs/source/conf.py b/docs/source/conf.py index ccf824bb37d9b..11a0d2a0538bb 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -13,7 +13,6 @@ # documentation root, use os.path.abspath to make it absolute, like shown here. # import m2r -import builtins import glob import os import shutil @@ -27,10 +26,13 @@ FOLDER_GENERATED = 'generated' SPHINX_MOCK_REQUIREMENTS = int(os.environ.get('SPHINX_MOCK_REQUIREMENTS', True)) -if SPHINX_MOCK_REQUIREMENTS: - builtins.__LIGHTNING_SETUP__ = True -import pytorch_lightning # noqa: E402 +try: + from pytorch_lightning import info +except ImportError: + # alternative https://stackoverflow.com/a/67692/4521646 + sys.path.append(os.path.join(PATH_ROOT, "pytorch_lightning")) + import info # -- Project documents ------------------------------------------------------- @@ -79,13 +81,13 @@ def _transform_changelog(path_in: str, path_out: str) -> None: # -- Project information ----------------------------------------------------- project = 'PyTorch Lightning' -copyright = pytorch_lightning.__copyright__ -author = pytorch_lightning.__author__ +copyright = info.__copyright__ +author = info.__author__ # The short X.Y version -version = pytorch_lightning.__version__ +version = info.__version__ # The full version, including alpha/beta/rc tags -release = pytorch_lightning.__version__ +release = info.__version__ # -- General configuration --------------------------------------------------- @@ -176,8 +178,8 @@ def _transform_changelog(path_in: str, path_out: str) -> None: # documentation. 
html_theme_options = { - 'pytorch_project': pytorch_lightning.__homepage__, - 'canonical_url': pytorch_lightning.__homepage__, + 'pytorch_project': info.__homepage__, + 'canonical_url': info.__homepage__, 'collapse_navigation': False, 'display_version': True, 'logo_only': False, @@ -279,6 +281,7 @@ def _transform_changelog(path_in: str, path_out: str) -> None: 'torch': ('https://pytorch.org/docs/stable/', None), 'numpy': ('https://numpy.org/doc/stable/', None), 'PIL': ('https://pillow.readthedocs.io/en/stable/', None), + 'torchmetrics': ('https://torchmetrics.readthedocs.io/en/stable/', None), } # -- Options for todo extension ---------------------------------------------- diff --git a/pytorch_lightning/__init__.py b/pytorch_lightning/__init__.py index 569078c994ba4..b9660475bf2f7 100644 --- a/pytorch_lightning/__init__.py +++ b/pytorch_lightning/__init__.py @@ -2,42 +2,17 @@ import logging import os -import sys -import time -_this_year = time.strftime("%Y") -__version__ = '1.3.0dev' -__author__ = 'William Falcon et al.' -__author_email__ = 'waf2107@columbia.edu' -__license__ = 'Apache-2.0' -__copyright__ = f'Copyright (c) 2018-{_this_year}, {__author__}.' -__homepage__ = 'https://github.com/PyTorchLightning/pytorch-lightning' -# this has to be simple string, see: https://github.com/pypa/twine/issues/522 -__docs__ = ( - "PyTorch Lightning is the lightweight PyTorch wrapper for ML researchers." - " Scale your models. Write less boilerplate." +from pytorch_lightning.info import ( # noqa: F401 + __author__, + __author_email__, + __copyright__, + __docs__, + __homepage__, + __license__, + __version__, ) -__long_docs__ = """ -Lightning is a way to organize your PyTorch code to decouple the science code from the engineering. - It's more of a style-guide than a framework. -In Lightning, you organize your code into 3 distinct categories: - -1. Research code (goes in the LightningModule). -2. Engineering code (you delete, and is handled by the Trainer). -3. Non-essential research code (logging, etc. this goes in Callbacks). - -Although your research/production project might start simple, once you add things like GPU AND TPU training, - 16-bit precision, etc, you end up spending more time engineering than researching. - Lightning automates AND rigorously tests those parts for you. - -Overall, Lightning guarantees rigorously tested, correct, modern best practices for the automated parts. - -Documentation -------------- -- https://pytorch-lightning.readthedocs.io/en/latest -- https://pytorch-lightning.readthedocs.io/en/stable -""" _root_logger = logging.getLogger() _logger = logging.getLogger(__name__) _logger.setLevel(logging.INFO) @@ -50,32 +25,20 @@ _PACKAGE_ROOT = os.path.dirname(__file__) _PROJECT_ROOT = os.path.dirname(_PACKAGE_ROOT) -try: - # This variable is injected in the __builtins__ by the build - # process. 
It used to enable importing subpackages of skimage when - # the binaries are not built - _ = None if __LIGHTNING_SETUP__ else None -except NameError: - __LIGHTNING_SETUP__: bool = False - -if __LIGHTNING_SETUP__: # pragma: no-cover - sys.stdout.write(f'Partial import of `{__name__}` during the build process.\n') # pragma: no-cover - # We are not importing the rest of the lightning during the build process, as it may not be compiled yet -else: - from pytorch_lightning import metrics - from pytorch_lightning.callbacks import Callback - from pytorch_lightning.core import LightningDataModule, LightningModule - from pytorch_lightning.trainer import Trainer - from pytorch_lightning.utilities.seed import seed_everything - - __all__ = [ - 'Trainer', - 'LightningDataModule', - 'LightningModule', - 'Callback', - 'seed_everything', - 'metrics', - ] +from pytorch_lightning import metrics # noqa: E402 +from pytorch_lightning.callbacks import Callback # noqa: E402 +from pytorch_lightning.core import LightningDataModule, LightningModule # noqa: E402 +from pytorch_lightning.trainer import Trainer # noqa: E402 +from pytorch_lightning.utilities.seed import seed_everything # noqa: E402 + +__all__ = [ + 'Trainer', + 'LightningDataModule', + 'LightningModule', + 'Callback', + 'seed_everything', + 'metrics', +] # for compatibility with namespace packages __import__('pkg_resources').declare_namespace(__name__) diff --git a/pytorch_lightning/callbacks/progress.py b/pytorch_lightning/callbacks/progress.py index 74e57e2b5642e..78db9a7dba12e 100644 --- a/pytorch_lightning/callbacks/progress.py +++ b/pytorch_lightning/callbacks/progress.py @@ -39,8 +39,7 @@ class tqdm(_tqdm): """ - Custom tqdm progressbar where we append 0 to floating points/strings to - prevent the progress bar from flickering + Custom tqdm progressbar where we append 0 to floating points/strings to prevent the progress bar from flickering """ @staticmethod diff --git a/pytorch_lightning/info.py b/pytorch_lightning/info.py new file mode 100644 index 0000000000000..0e7a1c25a74f1 --- /dev/null +++ b/pytorch_lightning/info.py @@ -0,0 +1,35 @@ +import time + +_this_year = time.strftime("%Y") +__version__ = '1.3.0dev' +__author__ = 'William Falcon et al.' +__author_email__ = 'waf2107@columbia.edu' +__license__ = 'Apache-2.0' +__copyright__ = f'Copyright (c) 2018-{_this_year}, {__author__}.' +__homepage__ = 'https://github.com/PyTorchLightning/pytorch-lightning' +# this has to be simple string, see: https://github.com/pypa/twine/issues/522 +__docs__ = ( + "PyTorch Lightning is the lightweight PyTorch wrapper for ML researchers." + " Scale your models. Write less boilerplate." +) +__long_docs__ = """ +Lightning is a way to organize your PyTorch code to decouple the science code from the engineering. + It's more of a style-guide than a framework. + +In Lightning, you organize your code into 3 distinct categories: + +1. Research code (goes in the LightningModule). +2. Engineering code (you delete, and is handled by the Trainer). +3. Non-essential research code (logging, etc. this goes in Callbacks). + +Although your research/production project might start simple, once you add things like GPU AND TPU training, + 16-bit precision, etc, you end up spending more time engineering than researching. + Lightning automates AND rigorously tests those parts for you. + +Overall, Lightning guarantees rigorously tested, correct, modern best practices for the automated parts. 
+ +Documentation +------------- +- https://pytorch-lightning.readthedocs.io/en/latest +- https://pytorch-lightning.readthedocs.io/en/stable +""" diff --git a/pytorch_lightning/setup_tools.py b/pytorch_lightning/setup_tools.py index f5aed2608635e..3362ccb479895 100644 --- a/pytorch_lightning/setup_tools.py +++ b/pytorch_lightning/setup_tools.py @@ -16,7 +16,7 @@ import re from typing import List -from pytorch_lightning import __homepage__, __version__, _PROJECT_ROOT +_PROJECT_ROOT = os.path.dirname(os.path.dirname(__file__)) def _load_requirements(path_dir: str, file_name: str = 'requirements.txt', comment_char: str = '#') -> List[str]: @@ -40,10 +40,10 @@ def _load_requirements(path_dir: str, file_name: str = 'requirements.txt', comme return reqs -def _load_readme_description(path_dir: str, homepage: str = __homepage__, version: str = __version__) -> str: +def _load_readme_description(path_dir: str, homepage: str, version: str) -> str: """Load readme as decribtion - >>> _load_readme_description(_PROJECT_ROOT) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE + >>> _load_readme_description(_PROJECT_ROOT, "", "") # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE '
...' """ path_readme = os.path.join(path_dir, "README.md") diff --git a/setup.py b/setup.py index 5d619d51977b2..e53e24ebf0702 100755 --- a/setup.py +++ b/setup.py @@ -16,20 +16,22 @@ import os # Always prefer setuptools over distutils +import sys + from setuptools import find_packages, setup try: - import builtins + from pytorch_lightning import info, setup_tools except ImportError: - import __builtin__ as builtins + # alternative https://stackoverflow.com/a/67692/4521646 + sys.path.append("pytorch_lightning") + import info + import setup_tools # https://packaging.python.org/guides/single-sourcing-package-version/ # http://blog.ionelmc.ro/2014/05/25/python-packaging/ -PATH_ROOT = os.path.dirname(__file__) -builtins.__LIGHTNING_SETUP__ = True - -import pytorch_lightning # noqa: E402 -from pytorch_lightning.setup_tools import _load_readme_description, _load_requirements # noqa: E402 +_PATH_ROOT = os.path.dirname(__file__) +_PATH_REQUIRE = os.path.join(_PATH_ROOT, 'requirements') # https://setuptools.readthedocs.io/en/latest/setuptools.html#declaring-extras # Define package extras. These are only installed if you specify them. @@ -37,10 +39,10 @@ # From local copy of repo, use like `pip install ".[dev, docs]"` extras = { # 'docs': load_requirements(file_name='docs.txt'), - 'examples': _load_requirements(path_dir=os.path.join(PATH_ROOT, 'requirements'), file_name='examples.txt'), - 'loggers': _load_requirements(path_dir=os.path.join(PATH_ROOT, 'requirements'), file_name='loggers.txt'), - 'extra': _load_requirements(path_dir=os.path.join(PATH_ROOT, 'requirements'), file_name='extra.txt'), - 'test': _load_requirements(path_dir=os.path.join(PATH_ROOT, 'requirements'), file_name='test.txt') + 'examples': setup_tools._load_requirements(path_dir=_PATH_REQUIRE, file_name='examples.txt'), + 'loggers': setup_tools._load_requirements(path_dir=_PATH_REQUIRE, file_name='loggers.txt'), + 'extra': setup_tools._load_requirements(path_dir=_PATH_REQUIRE, file_name='extra.txt'), + 'test': setup_tools._load_requirements(path_dir=_PATH_REQUIRE, file_name='test.txt') } extras['dev'] = extras['extra'] + extras['loggers'] + extras['test'] extras['all'] = extras['dev'] + extras['examples'] # + extras['docs'] @@ -53,6 +55,12 @@ # filter cpu only packages extras[ex] = [pkg for pkg in extras[kw] if not any(pgpu.lower() in pkg.lower() for pgpu in PACKAGES_GPU_ONLY)] +long_description = setup_tools._load_readme_description( + _PATH_ROOT, + homepage=info.__homepage__, + version=info.__version__, +) + # https://packaging.python.org/discussions/install-requires-vs-requirements / # keep the meta-data here for simplicity in reading this file... it's not obvious # what happens and to non-engineers they won't know to look in init ... 
@@ -60,22 +68,22 @@ # engineer specific practices setup( name="pytorch-lightning", - version=pytorch_lightning.__version__, - description=pytorch_lightning.__docs__, - author=pytorch_lightning.__author__, - author_email=pytorch_lightning.__author_email__, - url=pytorch_lightning.__homepage__, + version=info.__version__, + description=info.__docs__, + author=info.__author__, + author_email=info.__author_email__, + url=info.__homepage__, download_url='https://github.com/PyTorchLightning/pytorch-lightning', - license=pytorch_lightning.__license__, + license=info.__license__, packages=find_packages(exclude=['tests', 'tests/*', 'benchmarks', 'legacy', 'legacy/*']), - long_description=_load_readme_description(PATH_ROOT), + long_description=long_description, long_description_content_type='text/markdown', include_package_data=True, zip_safe=False, keywords=['deep learning', 'pytorch', 'AI'], python_requires='>=3.6', setup_requires=[], - install_requires=_load_requirements(PATH_ROOT), + install_requires=setup_tools._load_requirements(_PATH_ROOT), extras_require=extras, project_urls={ "Bug Tracker": "https://github.com/PyTorchLightning/pytorch-lightning/issues", From e62c7c7839beea9be336fe9f30873d005f9cdc5e Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Mon, 22 Mar 2021 17:49:01 +0100 Subject: [PATCH 16/25] hotfix: mock examples (#6632) * mock examples * drop from GA --- azure-pipelines.yml | 2 ++ pl_examples/__init__.py | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 9e2ff77563fa0..b7a2d851052ed 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -121,4 +121,6 @@ jobs: # cd pl_examples/basic_examples # bash submit_ddp_job.sh # bash submit_ddp2_job.sh + env: + PL_USE_MOCKED_MNIST: "1" displayName: 'Examples' diff --git a/pl_examples/__init__.py b/pl_examples/__init__.py index ffd60f9ed71af..150ac309ddceb 100644 --- a/pl_examples/__init__.py +++ b/pl_examples/__init__.py @@ -15,10 +15,10 @@ _DATASETS_PATH = os.path.join(_PACKAGE_ROOT, 'Datasets') _TORCHVISION_AVAILABLE = _module_available("torchvision") -_TORCHVISION_MNIST_AVAILABLE = True +_TORCHVISION_MNIST_AVAILABLE = not bool(os.environ.get("PL_USE_MOCKED_MNIST", False)) _DALI_AVAILABLE = _module_available("nvidia.dali") -if _TORCHVISION_AVAILABLE: +if _TORCHVISION_MNIST_AVAILABLE: try: from torchvision.datasets.mnist import MNIST MNIST(_DATASETS_PATH, download=True) From 2064ece5825dfa07c339ed8c6e8ea59183e5938e Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Mon, 22 Mar 2021 18:32:31 +0000 Subject: [PATCH 17/25] [refactor] Add setup to profilers + _run_stage_setup to trainer 2/5 (#6633) * add setup * update * updates on comment * Minor changes * Extra import * Docs Co-authored-by: Carlos Mocholi --- CHANGELOG.md | 3 ++ .../plugins/training_type/horovod.py | 6 +-- .../training_type/training_type_plugin.py | 6 +-- pytorch_lightning/profiler/profilers.py | 53 ++++++++----------- pytorch_lightning/profiler/pytorch.py | 21 ++------ .../trainer/connectors/profiler_connector.py | 5 +- pytorch_lightning/trainer/properties.py | 10 ++++ pytorch_lightning/trainer/trainer.py | 28 +++++----- pytorch_lightning/trainer/training_loop.py | 3 -- tests/test_profiler.py | 17 +++--- 10 files changed, 72 insertions(+), 80 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5f005f583c5ed..51ad97decd867 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -42,6 +42,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Added `teardown` method to `BaseProfiler` to enable subclasses defining post-profiling steps outside of `__del__` ([#6370](https://github.com/PyTorchLightning/pytorch-lightning/pull/6370)) +- Added `setup` method to `BaseProfiler` to enable subclasses defining pre-profiling steps for every process ([#6633](https://github.com/PyTorchLightning/pytorch-lightning/pull/6633)) + + - Added no return warning to predict ([#6139](https://github.com/PyTorchLightning/pytorch-lightning/pull/6139)) diff --git a/pytorch_lightning/plugins/training_type/horovod.py b/pytorch_lightning/plugins/training_type/horovod.py index 9f1bafe309f89..8d0add27cbb29 100644 --- a/pytorch_lightning/plugins/training_type/horovod.py +++ b/pytorch_lightning/plugins/training_type/horovod.py @@ -96,14 +96,14 @@ def start_training(self, trainer): stack.enter_context(optimizer.skip_synchronize()) # set up training routine - self._results = trainer.run_train() + self._results = trainer.run_stage() # Make sure all workers have finished training before returning to the user hvd.join() def start_evaluating(self, trainer): with ExitStack(): - self._results = trainer.run_evaluate() + self._results = trainer.run_stage() # Make sure all workers have finished training before returning to the user hvd.join() @@ -111,7 +111,7 @@ def start_evaluating(self, trainer): def start_predicting(self, trainer): with ExitStack(): # set up training routine - self._results = trainer.run_predict() + self._results = trainer.run_stage() # Make sure all workers have finished training before returning to the user hvd.join() diff --git a/pytorch_lightning/plugins/training_type/training_type_plugin.py b/pytorch_lightning/plugins/training_type/training_type_plugin.py index b6f1be359bbf2..89f27963caadf 100644 --- a/pytorch_lightning/plugins/training_type/training_type_plugin.py +++ b/pytorch_lightning/plugins/training_type/training_type_plugin.py @@ -132,15 +132,15 @@ def rpc_enabled(self) -> bool: def start_training(self, trainer: 'Trainer') -> None: # double dispatch to initiate the training loop - self._results = trainer.run_train() + self._results = trainer.run_stage() def start_evaluating(self, trainer: 'Trainer') -> None: # double dispatch to initiate the test loop - self._results = trainer.run_evaluate() + self._results = trainer.run_stage() def start_predicting(self, trainer: 'Trainer') -> None: # double dispatch to initiate the predicting loop - self._results = trainer.run_predict() + self._results = trainer.run_stage() def training_step(self, *args, **kwargs): return self.lightning_module.training_step(*args, **kwargs) diff --git a/pytorch_lightning/profiler/profilers.py b/pytorch_lightning/profiler/profilers.py index 55898dc2ee4e1..5668fd6654b2f 100644 --- a/pytorch_lightning/profiler/profilers.py +++ b/pytorch_lightning/profiler/profilers.py @@ -55,9 +55,23 @@ def start(self, action_name: str) -> None: def stop(self, action_name: str) -> None: """Defines how to record the duration once an action is complete.""" - def teardown(self) -> None: - """Execute arbitrary post-profiling tear-down steps as defined by subclass.""" - pass + def setup( + self, + stage: Optional[str] = None, + local_rank: Optional[int] = None, + log_dir: Optional[str] = None + ) -> None: + """Execute arbitrary pre-profiling set-up steps.""" + self.stage = stage + self.local_rank = local_rank + self.log_dir = log_dir + + def teardown(self, stage: Optional[str] = None) -> None: + """Execute arbitrary post-profiling tear-down steps.""" + self.stage = stage + if self.output_file: + 
self.output_file.close() + self.output_file = None @contextmanager def profile(self, action_name: str) -> None: @@ -94,13 +108,15 @@ def describe(self) -> None: """Logs a profile report after the conclusion of the training run.""" for write in self.write_streams: write(self.summary()) + if self.output_file is not None: + self.output_file.flush() @abstractmethod def summary(self) -> str: """Create profiler summary in text format.""" - def on_train_start(self, local_rank: Optional[int] = None): - self.local_rank = local_rank + def __del__(self): + self.teardown(None) class PassThroughProfiler(BaseProfiler): @@ -110,6 +126,7 @@ class PassThroughProfiler(BaseProfiler): """ def __init__(self): + self.output_file = None super().__init__(output_streams=None) def start(self, action_name: str) -> None: @@ -212,19 +229,6 @@ def log_row(action, mean, total): output_string += os.linesep return output_string - def describe(self): - """Logs a profile report after the conclusion of the training run.""" - super().describe() - self.teardown() - - def teardown(self) -> None: - """Close profiler's stream.""" - if self.output_file: - self.output_file.close() - - def __del__(self): - self.teardown() - class AdvancedProfiler(BaseProfiler): """ @@ -285,16 +289,3 @@ def summary(self) -> str: output_string += f"{os.linesep}Profile stats for: {action}{os.linesep}{stats}" return output_string - - def describe(self): - """Logs a profile report after the conclusion of the training run.""" - super().describe() - self.teardown() - - def teardown(self) -> None: - """Close profiler's stream.""" - if self.output_file: - self.output_file.close() - - def __del__(self): - self.teardown() diff --git a/pytorch_lightning/profiler/pytorch.py b/pytorch_lightning/profiler/pytorch.py index fdde80589acf3..c35979fa918af 100644 --- a/pytorch_lightning/profiler/pytorch.py +++ b/pytorch_lightning/profiler/pytorch.py @@ -162,11 +162,11 @@ def __init__( self.output_fname = output_filename self.output_file = None if local_rank is not None: - self.on_train_start(local_rank=local_rank) - self.on_train_start = super().on_train_start + self.setup(local_rank=local_rank) + self.setup = super().setup - def on_train_start(self, local_rank: Optional[str] = None): - self.local_rank = local_rank + def setup(self, stage: Optional[str] = None, local_rank: Optional[int] = None, log_dir: Optional[str] = None): + super().setup(stage=stage, local_rank=local_rank, log_dir=log_dir) # when logging to `log.info`, only perform profiling on rank 0 if local_rank != 0 and self.output_fname is None: @@ -290,16 +290,3 @@ def summary(self) -> str: output_string += (f"{os.linesep}Profile stats for: {action} rank: {local_rank} {os.linesep}{stats}") return output_string - - def describe(self): - """Logs a profile report after the conclusion of the training run.""" - super().describe() - self.teardown() - - def teardown(self) -> None: - """Close profiler's stream.""" - if self.output_file: - self.output_file.close() - - def __del__(self): - self.teardown() diff --git a/pytorch_lightning/trainer/connectors/profiler_connector.py b/pytorch_lightning/trainer/connectors/profiler_connector.py index 98d65c1285ff7..e628d6d96bd19 100644 --- a/pytorch_lightning/trainer/connectors/profiler_connector.py +++ b/pytorch_lightning/trainer/connectors/profiler_connector.py @@ -54,6 +54,7 @@ def on_trainer_init(self, profiler: Union[BaseProfiler, str]): ) self.trainer.profiler = profiler or PassThroughProfiler() - def on_train_start(self, trainer): + def setup(self) -> None: + trainer = 
self.trainer local_rank = trainer.local_rank if trainer.world_size > 1 else None - self.trainer.profiler.on_train_start(local_rank) + trainer.profiler.setup(stage=trainer._setup_state, local_rank=local_rank, log_dir=trainer.log_dir) diff --git a/pytorch_lightning/trainer/properties.py b/pytorch_lightning/trainer/properties.py index b5654b148afc6..315e3c60c0557 100644 --- a/pytorch_lightning/trainer/properties.py +++ b/pytorch_lightning/trainer/properties.py @@ -491,6 +491,16 @@ def sanity_checking(self, val: bool) -> None: elif self.sanity_checking: self._running_stage = None + @property + def _setup_state(self) -> TrainerState: + # 'fit' is passed for `trainer.tune()` as there aren't "tune_dataloaders" + return TrainerState.FITTING if self.state == TrainerState.TUNING else self.state + + @property + def _teardown_state(self) -> Optional[TrainerState]: + if self.state.running: + return self._setup_state + # Used to represent the concrete type TrainerProperties class methods are called on. _T = TypeVar('_T', bound=TrainerProperties) diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index a5b99871d55f9..f7bd1757b9bc2 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -445,13 +445,15 @@ def fit( | || {self.dispatch} || | || LIGHTNING - {self.accelerator.start_training} or || - {self.accelerator.start_evaluating} or || FLOW - {self.accelerator.start_predicting} || + {self.accelerator.start_training} || + or {self.accelerator.start_evaluating} || + or {self.accelerator.start_predicting} || FLOW + | || + {self.run_stage} || | || DIRECTION - {self.run_train} or || - {self.run_evaluation} or || - {self.run_predict} || + {self.run_train} || + or {self.run_evaluation} || + or {self.run_predict} || | || results \/ This is used to guide readers to the core loops: train, test, predict. 
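A minimal sketch of what the new hook surface enables: a custom profiler that relies only on the `setup`/`start`/`stop`/`summary`/`teardown` signatures introduced by this patch. The `TimingProfiler` name and its timing bookkeeping are illustrative assumptions, not part of the changeset; the `self.output_file = None` / `super().__init__(output_streams=None)` pattern mirrors the `PassThroughProfiler` shown above.

# illustrative sketch only -- the class below is hypothetical, not introduced by this patch
import time
from typing import Optional

from pytorch_lightning.profiler.profilers import BaseProfiler


class TimingProfiler(BaseProfiler):
    """Hypothetical custom profiler built on the setup/teardown hooks added above."""

    def __init__(self):
        self.output_file = None  # expected by the new BaseProfiler.teardown()
        self._starts = {}
        self._totals = {}
        super().__init__(output_streams=None)

    def setup(self, stage: Optional[str] = None, local_rank: Optional[int] = None, log_dir: Optional[str] = None):
        # runs once per process before profiling starts; the profiler connector passes stage, rank and log_dir
        super().setup(stage=stage, local_rank=local_rank, log_dir=log_dir)

    def start(self, action_name: str) -> None:
        self._starts[action_name] = time.monotonic()

    def stop(self, action_name: str) -> None:
        started = self._starts.pop(action_name, None)
        if started is not None:
            self._totals[action_name] = self._totals.get(action_name, 0.0) + time.monotonic() - started

    def summary(self) -> str:
        return "\n".join(f"{name}: {total:.3f}s" for name, total in sorted(self._totals.items()))

# used as usual, e.g. Trainer(fast_dev_run=True, profiler=TimingProfiler());
# with this patch, teardown() is invoked from Trainer.call_teardown_hook() rather than __del__.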
@@ -518,6 +520,9 @@ def dispatch(self): def run_stage(self): results = None + + self.profile_connector.setup() + if self.evaluating: results = self.run_evaluate() elif self.predicting: @@ -1060,8 +1065,7 @@ def tune( def call_setup_hook(self, model: LightningModule) -> None: assert self.state.running, f"TrainerState: {self.state}" - # 'fit' is passed for `trainer.tune()` as there aren't "tune_dataloaders" - state = TrainerState.FITTING if self.state == TrainerState.TUNING else self.state + state = self._setup_state if self.datamodule is not None: called = getattr(self.datamodule, f'has_setup_{state}') @@ -1072,12 +1076,8 @@ def call_setup_hook(self, model: LightningModule) -> None: model.setup(stage=state) def call_teardown_hook(self, model: LightningModule) -> None: - if self.state.running: - state = TrainerState.FITTING if self.state == TrainerState.TUNING else self.state - else: - state = None - - self.profiler.teardown() + state = self._teardown_state + self.profiler.teardown(stage=state) self.teardown(stage=state) model.teardown(stage=state) diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index a77d91a7402b4..384a1b67a64f8 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -102,9 +102,6 @@ def on_train_start(self): # hook self.trainer.call_hook("on_train_start") - # provide rank to profiler - self.trainer.profile_connector.on_train_start(self.trainer) - def setup_fit(self, model, train_dataloader=None, val_dataloaders=None, datamodule=None): # clean hparams if hasattr(model, "hparams"): diff --git a/tests/test_profiler.py b/tests/test_profiler.py index ccdd8a569c9a8..cc4fff3b7ede4 100644 --- a/tests/test_profiler.py +++ b/tests/test_profiler.py @@ -22,7 +22,8 @@ import torch from pytorch_lightning import Trainer -from pytorch_lightning.profiler import AdvancedProfiler, SimpleProfiler, PyTorchProfiler +from pytorch_lightning.callbacks import Callback +from pytorch_lightning.profiler import AdvancedProfiler, PyTorchProfiler, SimpleProfiler from tests.helpers import BoringModel from tests.helpers.runif import RunIf @@ -323,14 +324,16 @@ def test_profiler_teardown(tmpdir, cls): """ This test checks if profiler teardown method is called when trainer is exiting. """ + + class TestCallback(Callback): + + def on_fit_end(self, trainer, pl_module) -> None: + assert trainer.profiler.output_file is not None + profiler = cls(output_filename=os.path.join(tmpdir, "profiler.txt")) model = BoringModel() - trainer = Trainer( - default_root_dir=tmpdir, - fast_dev_run=True, - profiler=profiler, - ) + trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True, profiler=profiler, callbacks=[TestCallback()]) trainer.fit(model) - assert profiler.output_file.closed + assert profiler.output_file is None From 8cd75a4dd51939881da265752c2d81307cbe4d9e Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Tue, 23 Mar 2021 08:51:45 +0100 Subject: [PATCH 18/25] fix comparing versions (#6434) * fix comparing versions * chlog * . * ... 
* datasets --- .github/workflows/docs-checks.yml | 2 +- CHANGELOG.md | 3 +++ Makefile | 2 +- docs/source/conf.py | 1 + pytorch_lightning/utilities/imports.py | 22 ++++++++++++++++++---- requirements/extra.txt | 1 + 6 files changed, 25 insertions(+), 6 deletions(-) diff --git a/.github/workflows/docs-checks.yml b/.github/workflows/docs-checks.yml index 5ee4f23b4b3cc..4488c598c8ac7 100644 --- a/.github/workflows/docs-checks.yml +++ b/.github/workflows/docs-checks.yml @@ -98,7 +98,7 @@ jobs: # First run the same pipeline as Read-The-Docs cd docs make clean - make html --debug --jobs $(nproc) SPHINXOPTS="-W" + make html --debug --jobs $(nproc) SPHINXOPTS="-W --keep-going" - name: Upload built docs uses: actions/upload-artifact@v2 diff --git a/CHANGELOG.md b/CHANGELOG.md index 51ad97decd867..c542b854af104 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -165,6 +165,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Fixed torch distributed not available in setup hook for DDP ([#6506](https://github.com/PyTorchLightning/pytorch-lightning/pull/6506)) +- Fixed comparing required versions ([#6434](https://github.com/PyTorchLightning/pytorch-lightning/pull/6434)) + + ## [1.2.4] - 2021-03-16 ### Changed diff --git a/Makefile b/Makefile index d35e0b77f8429..04b08fa2d27d1 100644 --- a/Makefile +++ b/Makefile @@ -29,4 +29,4 @@ test: clean docs: clean pip install --quiet -r requirements/docs.txt - python -m sphinx -b html -W docs/source docs/build + python -m sphinx -b html -W --keep-going docs/source docs/build diff --git a/docs/source/conf.py b/docs/source/conf.py index 11a0d2a0538bb..6163de976da40 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -334,6 +334,7 @@ def package_list_from_file(file): } MOCK_PACKAGES = [] if SPHINX_MOCK_REQUIREMENTS: + MOCK_PACKAGES += ['fairscale'] # mock also base packages when we are on RTD since we don't install them there MOCK_PACKAGES += package_list_from_file(os.path.join(PATH_ROOT, 'requirements.txt')) MOCK_PACKAGES += package_list_from_file(os.path.join(PATH_ROOT, 'requirements', 'extra.txt')) diff --git a/pytorch_lightning/utilities/imports.py b/pytorch_lightning/utilities/imports.py index 41a13d6c678a0..8090c4ed6590f 100644 --- a/pytorch_lightning/utilities/imports.py +++ b/pytorch_lightning/utilities/imports.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
"""General utilities""" +import importlib import operator import platform import sys @@ -19,7 +20,7 @@ from importlib.util import find_spec import torch -from pkg_resources import DistributionNotFound, get_distribution +from pkg_resources import DistributionNotFound def _module_available(module_path: str) -> bool: @@ -42,11 +43,24 @@ def _module_available(module_path: str) -> bool: def _compare_version(package: str, op, version) -> bool: + """ + Compare package version with some requirements + + >>> _compare_version("torch", operator.ge, "0.1") + True + """ try: - pkg_version = LooseVersion(get_distribution(package).version) - return op(pkg_version, LooseVersion(version)) - except DistributionNotFound: + pkg = importlib.import_module(package) + except (ModuleNotFoundError, DistributionNotFound): + return False + try: + pkg_version = LooseVersion(pkg.__version__) + except AttributeError: return False + if not (hasattr(pkg_version, "vstring") and hasattr(pkg_version, "version")): + # this is mock by sphinx, so it shall return True ro generate all summaries + return True + return op(pkg_version, LooseVersion(version)) _IS_WINDOWS = platform.system() == "Windows" diff --git a/requirements/extra.txt b/requirements/extra.txt index a05c4971ac450..715916c4e36ac 100644 --- a/requirements/extra.txt +++ b/requirements/extra.txt @@ -7,4 +7,5 @@ torchtext>=0.5 # onnx>=1.7.0 onnxruntime>=1.3.0 hydra-core>=1.0 +# todo: when switch to standard package stream, drop `fairscale` from hard mocked docs libs https://github.com/PyTorchLightning/fairscale/archive/pl_1.2.0.zip From efce2b77779467884df9a3d9c16c3176ea81a650 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Tue, 23 Mar 2021 09:35:51 +0100 Subject: [PATCH 19/25] Prune metrics: regression 8/n (#6636) * explained_variance * tests * mean_absolute_error * mean_squared_error * mean_relative_error * mean_squared_log_error * chlog --- CHANGELOG.md | 2 + .../metrics/functional/explained_variance.py | 68 +---------- .../metrics/functional/mean_absolute_error.py | 34 +----- .../metrics/functional/mean_relative_error.py | 37 +----- .../metrics/functional/mean_squared_error.py | 34 +----- .../functional/mean_squared_log_error.py | 34 +----- .../metrics/regression/explained_variance.py | 106 ++---------------- .../metrics/regression/mean_absolute_error.py | 64 ++--------- .../metrics/regression/mean_squared_error.py | 65 ++--------- .../regression/mean_squared_log_error.py | 67 ++--------- tests/accelerators/test_cpu.py | 1 + .../regression/test_explained_variance.py | 77 ------------- tests/metrics/regression/test_mean_error.py | 87 -------------- tests/metrics/test_remove_1-5_metrics.py | 59 +++++++++- tests/utilities/test_argparse.py | 4 +- 15 files changed, 115 insertions(+), 624 deletions(-) delete mode 100644 tests/metrics/regression/test_explained_variance.py delete mode 100644 tests/metrics/regression/test_mean_error.py diff --git a/CHANGELOG.md b/CHANGELOG.md index c542b854af104..57a071bff297a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -90,6 +90,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
[#6584](https://github.com/PyTorchLightning/pytorch-lightning/pull/6584), + [#6636](https://github.com/PyTorchLightning/pytorch-lightning/pull/6636), + ) diff --git a/pytorch_lightning/metrics/functional/explained_variance.py b/pytorch_lightning/metrics/functional/explained_variance.py index fa8d43c06c7ef..bcfe698bf4c5e 100644 --- a/pytorch_lightning/metrics/functional/explained_variance.py +++ b/pytorch_lightning/metrics/functional/explained_variance.py @@ -11,77 +11,21 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from typing import Sequence, Tuple, Union +from typing import Sequence, Union import torch -from torchmetrics.utilities.checks import _check_same_shape +from torchmetrics.functional import explained_variance as _explained_variance - -def _explained_variance_update(preds: torch.Tensor, target: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: - _check_same_shape(preds, target) - return preds, target - - -def _explained_variance_compute( - preds: torch.Tensor, - target: torch.Tensor, - multioutput: str = 'uniform_average', -) -> Union[torch.Tensor, Sequence[torch.Tensor]]: - diff_avg = torch.mean(target - preds, dim=0) - numerator = torch.mean((target - preds - diff_avg)**2, dim=0) - - target_avg = torch.mean(target, dim=0) - denominator = torch.mean((target - target_avg)**2, dim=0) - - # Take care of division by zero - nonzero_numerator = numerator != 0 - nonzero_denominator = denominator != 0 - valid_score = nonzero_numerator & nonzero_denominator - output_scores = torch.ones_like(diff_avg) - output_scores[valid_score] = 1.0 - (numerator[valid_score] / denominator[valid_score]) - output_scores[nonzero_numerator & ~nonzero_denominator] = 0. - - # Decide what to do in multioutput case - # Todo: allow user to pass in tensor with weights - if multioutput == 'raw_values': - return output_scores - if multioutput == 'uniform_average': - return torch.mean(output_scores) - if multioutput == 'variance_weighted': - denom_sum = torch.sum(denominator) - return torch.sum(denominator / denom_sum * output_scores) +from pytorch_lightning.utilities.deprecation import deprecated +@deprecated(target=_explained_variance, ver_deprecate="1.3.0", ver_remove="1.5.0") def explained_variance( preds: torch.Tensor, target: torch.Tensor, multioutput: str = 'uniform_average', ) -> Union[torch.Tensor, Sequence[torch.Tensor]]: """ - Computes explained variance. - - Args: - preds: estimated labels - target: ground truth labels - multioutput: Defines aggregation in the case of multiple output scores. Can be one - of the following strings (default is `'uniform_average'`.): - - * `'raw_values'` returns full set of scores - * `'uniform_average'` scores are uniformly averaged - * `'variance_weighted'` scores are weighted by their individual variances - - Example: - - >>> from pytorch_lightning.metrics.functional import explained_variance - >>> target = torch.tensor([3, -0.5, 2, 7]) - >>> preds = torch.tensor([2.5, 0.0, 2, 8]) - >>> explained_variance(preds, target) - tensor(0.9572) - - >>> target = torch.tensor([[0.5, 1], [-1, 1], [7, -6]]) - >>> preds = torch.tensor([[0, 2], [-1, 2], [8, -5]]) - >>> explained_variance(preds, target, multioutput='raw_values') - tensor([0.9677, 1.0000]) + .. deprecated:: + Use :func:`torchmetrics.functional.explained_variance`. Will be removed in v1.5.0. 
""" - preds, target = _explained_variance_update(preds, target) - return _explained_variance_compute(preds, target, multioutput) diff --git a/pytorch_lightning/metrics/functional/mean_absolute_error.py b/pytorch_lightning/metrics/functional/mean_absolute_error.py index 2bd8f125ecb9e..85aa07c802eca 100644 --- a/pytorch_lightning/metrics/functional/mean_absolute_error.py +++ b/pytorch_lightning/metrics/functional/mean_absolute_error.py @@ -11,40 +11,16 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from typing import Tuple import torch -from torchmetrics.utilities.checks import _check_same_shape +from torchmetrics.functional import mean_absolute_error as _mean_absolute_error - -def _mean_absolute_error_update(preds: torch.Tensor, target: torch.Tensor) -> Tuple[torch.Tensor, int]: - _check_same_shape(preds, target) - sum_abs_error = torch.sum(torch.abs(preds - target)) - n_obs = target.numel() - return sum_abs_error, n_obs - - -def _mean_absolute_error_compute(sum_abs_error: torch.Tensor, n_obs: int) -> torch.Tensor: - return sum_abs_error / n_obs +from pytorch_lightning.utilities.deprecation import deprecated +@deprecated(target=_mean_absolute_error, ver_deprecate="1.3.0", ver_remove="1.5.0") def mean_absolute_error(preds: torch.Tensor, target: torch.Tensor) -> torch.Tensor: """ - Computes mean absolute error - - Args: - pred: estimated labels - target: ground truth labels - - Return: - Tensor with MAE - - Example: - >>> from pytorch_lightning.metrics.functional import mean_absolute_error - >>> x = torch.tensor([0., 1, 2, 3]) - >>> y = torch.tensor([0., 1, 2, 2]) - >>> mean_absolute_error(x, y) - tensor(0.2500) + .. deprecated:: + Use :func:`torchmetrics.functional.mean_absolute_error`. Will be removed in v1.5.0. """ - sum_abs_error, n_obs = _mean_absolute_error_update(preds, target) - return _mean_absolute_error_compute(sum_abs_error, n_obs) diff --git a/pytorch_lightning/metrics/functional/mean_relative_error.py b/pytorch_lightning/metrics/functional/mean_relative_error.py index bfe5eb6b847d7..be21371bdc91a 100644 --- a/pytorch_lightning/metrics/functional/mean_relative_error.py +++ b/pytorch_lightning/metrics/functional/mean_relative_error.py @@ -11,43 +11,16 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-from typing import Tuple import torch -from torchmetrics.utilities.checks import _check_same_shape +from torchmetrics.functional.regression.mean_relative_error import mean_relative_error as _mean_relative_error - -def _mean_relative_error_update(preds: torch.Tensor, target: torch.Tensor) -> Tuple[torch.Tensor, int]: - _check_same_shape(preds, target) - target_nz = target.clone() - target_nz[target == 0] = 1 - sum_rltv_error = torch.sum(torch.abs((preds - target) / target_nz)) - n_obs = target.numel() - return sum_rltv_error, n_obs - - -def _mean_relative_error_compute(sum_rltv_error: torch.Tensor, n_obs: int) -> torch.Tensor: - return sum_rltv_error / n_obs +from pytorch_lightning.utilities.deprecation import deprecated +@deprecated(target=_mean_relative_error, ver_deprecate="1.3.0", ver_remove="1.5.0") def mean_relative_error(preds: torch.Tensor, target: torch.Tensor) -> torch.Tensor: """ - Computes mean relative error - - Args: - pred: estimated labels - target: ground truth labels - - Return: - Tensor with mean relative error - - Example: - - >>> x = torch.tensor([0., 1, 2, 3]) - >>> y = torch.tensor([0., 1, 2, 2]) - >>> mean_relative_error(x, y) - tensor(0.1250) - + .. deprecated:: + Use :func:`torchmetrics.functional.regression.mean_relative_error`. Will be removed in v1.5.0. """ - sum_rltv_error, n_obs = _mean_relative_error_update(preds, target) - return _mean_relative_error_compute(sum_rltv_error, n_obs) diff --git a/pytorch_lightning/metrics/functional/mean_squared_error.py b/pytorch_lightning/metrics/functional/mean_squared_error.py index 66c0aadef0651..9d1850dcd8689 100644 --- a/pytorch_lightning/metrics/functional/mean_squared_error.py +++ b/pytorch_lightning/metrics/functional/mean_squared_error.py @@ -11,40 +11,16 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from typing import Tuple import torch -from torchmetrics.utilities.checks import _check_same_shape +from torchmetrics.functional import mean_squared_error as _mean_squared_error - -def _mean_squared_error_update(preds: torch.Tensor, target: torch.Tensor) -> Tuple[torch.Tensor, int]: - _check_same_shape(preds, target) - sum_squared_error = torch.sum(torch.pow(preds - target, 2)) - n_obs = target.numel() - return sum_squared_error, n_obs - - -def _mean_squared_error_compute(sum_squared_error: torch.Tensor, n_obs: int) -> torch.Tensor: - return sum_squared_error / n_obs +from pytorch_lightning.utilities.deprecation import deprecated +@deprecated(target=_mean_squared_error, ver_deprecate="1.3.0", ver_remove="1.5.0") def mean_squared_error(preds: torch.Tensor, target: torch.Tensor) -> torch.Tensor: """ - Computes mean squared error - - Args: - preds: estimated labels - target: ground truth labels - - Return: - Tensor with MSE - - Example: - >>> from pytorch_lightning.metrics.functional import mean_squared_error - >>> x = torch.tensor([0., 1, 2, 3]) - >>> y = torch.tensor([0., 1, 2, 2]) - >>> mean_squared_error(x, y) - tensor(0.2500) + .. deprecated:: + Use :func:`torchmetrics.functional.mean_squared_error`. Will be removed in v1.5.0. 
""" - sum_squared_error, n_obs = _mean_squared_error_update(preds, target) - return _mean_squared_error_compute(sum_squared_error, n_obs) diff --git a/pytorch_lightning/metrics/functional/mean_squared_log_error.py b/pytorch_lightning/metrics/functional/mean_squared_log_error.py index baec63c7248f2..56654ea47daf2 100644 --- a/pytorch_lightning/metrics/functional/mean_squared_log_error.py +++ b/pytorch_lightning/metrics/functional/mean_squared_log_error.py @@ -11,40 +11,16 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from typing import Tuple import torch -from torchmetrics.utilities.checks import _check_same_shape +from torchmetrics.functional import mean_squared_log_error as _mean_squared_log_error - -def _mean_squared_log_error_update(preds: torch.Tensor, target: torch.Tensor) -> Tuple[torch.Tensor, int]: - _check_same_shape(preds, target) - sum_squared_log_error = torch.sum(torch.pow(torch.log1p(preds) - torch.log1p(target), 2)) - n_obs = target.numel() - return sum_squared_log_error, n_obs - - -def _mean_squared_log_error_compute(sum_squared_log_error: torch.Tensor, n_obs: int) -> torch.Tensor: - return sum_squared_log_error / n_obs +from pytorch_lightning.utilities.deprecation import deprecated +@deprecated(target=_mean_squared_log_error, ver_deprecate="1.3.0", ver_remove="1.5.0") def mean_squared_log_error(preds: torch.Tensor, target: torch.Tensor) -> torch.Tensor: """ - Computes mean squared log error - - Args: - preds: estimated labels - target: ground truth labels - - Return: - Tensor with RMSLE - - Example: - >>> from pytorch_lightning.metrics.functional import mean_squared_log_error - >>> x = torch.tensor([0., 1, 2, 3]) - >>> y = torch.tensor([0., 1, 2, 2]) - >>> mean_squared_log_error(x, y) - tensor(0.0207) + .. deprecated:: + Use :func:`torchmetrics.functional.mean_squared_log_error`. Will be removed in v1.5.0. """ - sum_squared_log_error, n_obs = _mean_squared_log_error_update(preds, target) - return _mean_squared_log_error_compute(sum_squared_log_error, n_obs) diff --git a/pytorch_lightning/metrics/regression/explained_variance.py b/pytorch_lightning/metrics/regression/explained_variance.py index 8b0259694ef4c..4f820718545cb 100644 --- a/pytorch_lightning/metrics/regression/explained_variance.py +++ b/pytorch_lightning/metrics/regression/explained_variance.py @@ -13,72 +13,14 @@ # limitations under the License. from typing import Any, Callable, Optional -import torch -from torchmetrics import Metric +from torchmetrics import ExplainedVariance as _ExplainedVariance -from pytorch_lightning.metrics.functional.explained_variance import ( - _explained_variance_compute, - _explained_variance_update, -) -from pytorch_lightning.utilities import rank_zero_warn +from pytorch_lightning.utilities.deprecation import deprecated -class ExplainedVariance(Metric): - r""" - Computes `explained variance - `_: - - .. math:: \text{ExplainedVariance} = 1 - \frac{\text{Var}(y - \hat{y})}{\text{Var}(y)} - - Where :math:`y` is a tensor of target values, and :math:`\hat{y}` is a - tensor of predictions. - - Forward accepts - - - ``preds`` (float tensor): ``(N,)`` or ``(N, ...)`` (multioutput) - - ``target`` (long tensor): ``(N,)`` or ``(N, ...)`` (multioutput) - - In the case of multioutput, as default the variances will be uniformly - averaged over the additional dimensions. Please see argument `multioutput` - for changing this behavior. 
- - Args: - multioutput: - Defines aggregation in the case of multiple output scores. Can be one - of the following strings (default is `'uniform_average'`.): - - * `'raw_values'` returns full set of scores - * `'uniform_average'` scores are uniformly averaged - * `'variance_weighted'` scores are weighted by their individual variances - - compute_on_step: - Forward only calls ``update()`` and return None if this is set to False. default: True - dist_sync_on_step: - Synchronize metric state across processes at each ``forward()`` - before returning the value at the step. default: False - process_group: - Specify the process group on which synchronization is called. default: None (which selects the entire world) - - Raises: - ValueError: - If ``multioutput`` is not one of ``"raw_values"``, ``"uniform_average"`` or ``"variance_weighted"``. - - Example: - - >>> from pytorch_lightning.metrics import ExplainedVariance - >>> target = torch.tensor([3, -0.5, 2, 7]) - >>> preds = torch.tensor([2.5, 0.0, 2, 8]) - >>> explained_variance = ExplainedVariance() - >>> explained_variance(preds, target) - tensor(0.9572) - - >>> target = torch.tensor([[0.5, 1], [-1, 1], [7, -6]]) - >>> preds = torch.tensor([[0, 2], [-1, 2], [8, -5]]) - >>> explained_variance = ExplainedVariance(multioutput='raw_values') - >>> explained_variance(preds, target) - tensor([0.9677, 1.0000]) - """ +class ExplainedVariance(_ExplainedVariance): + @deprecated(target=_ExplainedVariance, ver_deprecate="1.3.0", ver_remove="1.5.0") def __init__( self, multioutput: str = 'uniform_average', @@ -87,43 +29,9 @@ def __init__( process_group: Optional[Any] = None, dist_sync_fn: Callable = None, ): - super().__init__( - compute_on_step=compute_on_step, - dist_sync_on_step=dist_sync_on_step, - process_group=process_group, - dist_sync_fn=dist_sync_fn, - ) - allowed_multioutput = ('raw_values', 'uniform_average', 'variance_weighted') - if multioutput not in allowed_multioutput: - raise ValueError( - f'Invalid input to argument `multioutput`. Choose one of the following: {allowed_multioutput}' - ) - self.multioutput = multioutput - self.add_state("y", default=[], dist_reduce_fx=None) - self.add_state("y_pred", default=[], dist_reduce_fx=None) - - rank_zero_warn( - 'Metric `ExplainedVariance` will save all targets and' - ' predictions in buffer. For large datasets this may lead' - ' to large memory footprint.' - ) - - def update(self, preds: torch.Tensor, target: torch.Tensor): - """ - Update state with predictions and targets. - - Args: - preds: Predictions from model - target: Ground truth values """ - preds, target = _explained_variance_update(preds, target) - self.y_pred.append(preds) - self.y.append(target) + This implementation refers to :class:`~torchmetrics.ExplainedVariance`. - def compute(self): - """ - Computes explained variance over state. + .. deprecated:: + Use :class:`~torchmetrics.ExplainedVariance`. Will be removed in v1.5.0. """ - preds = torch.cat(self.y_pred, dim=0) - target = torch.cat(self.y, dim=0) - return _explained_variance_compute(preds, target, self.multioutput) diff --git a/pytorch_lightning/metrics/regression/mean_absolute_error.py b/pytorch_lightning/metrics/regression/mean_absolute_error.py index 484ccbe83284e..8510275c127d7 100644 --- a/pytorch_lightning/metrics/regression/mean_absolute_error.py +++ b/pytorch_lightning/metrics/regression/mean_absolute_error.py @@ -13,42 +13,14 @@ # limitations under the License. 
from typing import Any, Callable, Optional -import torch -from torchmetrics import Metric +from torchmetrics import MeanAbsoluteError as _MeanAbsoluteError -from pytorch_lightning.metrics.functional.mean_absolute_error import ( - _mean_absolute_error_compute, - _mean_absolute_error_update, -) +from pytorch_lightning.utilities.deprecation import deprecated -class MeanAbsoluteError(Metric): - r""" - Computes `mean absolute error `_ (MAE): - - .. math:: \text{MAE} = \frac{1}{N}\sum_i^N | y_i - \hat{y_i} | - - Where :math:`y` is a tensor of target values, and :math:`\hat{y}` is a tensor of predictions. - - Args: - compute_on_step: - Forward only calls ``update()`` and return None if this is set to False. default: True - dist_sync_on_step: - Synchronize metric state across processes at each ``forward()`` - before returning the value at the step. default: False - process_group: - Specify the process group on which synchronization is called. default: None (which selects the entire world) - - Example: - - >>> from pytorch_lightning.metrics import MeanAbsoluteError - >>> target = torch.tensor([3.0, -0.5, 2.0, 7.0]) - >>> preds = torch.tensor([2.5, 0.0, 2.0, 8.0]) - >>> mean_absolute_error = MeanAbsoluteError() - >>> mean_absolute_error(preds, target) - tensor(0.5000) - """ +class MeanAbsoluteError(_MeanAbsoluteError): + @deprecated(target=_MeanAbsoluteError, ver_deprecate="1.3.0", ver_remove="1.5.0") def __init__( self, compute_on_step: bool = True, @@ -56,31 +28,9 @@ def __init__( process_group: Optional[Any] = None, dist_sync_fn: Callable = None, ): - super().__init__( - compute_on_step=compute_on_step, - dist_sync_on_step=dist_sync_on_step, - process_group=process_group, - dist_sync_fn=dist_sync_fn, - ) - - self.add_state("sum_abs_error", default=torch.tensor(0.0), dist_reduce_fx="sum") - self.add_state("total", default=torch.tensor(0), dist_reduce_fx="sum") - - def update(self, preds: torch.Tensor, target: torch.Tensor): - """ - Update state with predictions and targets. - - Args: - preds: Predictions from model - target: Ground truth values """ - sum_abs_error, n_obs = _mean_absolute_error_update(preds, target) + This implementation refers to :class:`~torchmetrics.MeanAbsoluteError`. - self.sum_abs_error += sum_abs_error - self.total += n_obs - - def compute(self): - """ - Computes mean absolute error over state. + .. deprecated:: + Use :class:`~torchmetrics.MeanAbsoluteError`. Will be removed in v1.5.0. """ - return _mean_absolute_error_compute(self.sum_abs_error, self.total) diff --git a/pytorch_lightning/metrics/regression/mean_squared_error.py b/pytorch_lightning/metrics/regression/mean_squared_error.py index c26371514e7cd..cbe09faf0046c 100644 --- a/pytorch_lightning/metrics/regression/mean_squared_error.py +++ b/pytorch_lightning/metrics/regression/mean_squared_error.py @@ -13,43 +13,14 @@ # limitations under the License. from typing import Any, Callable, Optional -import torch -from torchmetrics import Metric +from torchmetrics import MeanSquaredError as _MeanSquaredError -from pytorch_lightning.metrics.functional.mean_squared_error import ( - _mean_squared_error_compute, - _mean_squared_error_update, -) +from pytorch_lightning.utilities.deprecation import deprecated -class MeanSquaredError(Metric): - r""" - Computes `mean squared error `_ (MSE): - - .. math:: \text{MSE} = \frac{1}{N}\sum_i^N(y_i - \hat{y_i})^2 - - Where :math:`y` is a tensor of target values, and :math:`\hat{y}` is a tensor of predictions. 
- - Args: - compute_on_step: - Forward only calls ``update()`` and return None if this is set to False. default: True - dist_sync_on_step: - Synchronize metric state across processes at each ``forward()`` - before returning the value at the step. default: False - process_group: - Specify the process group on which synchronization is called. default: None (which selects the entire world) - - Example: - - >>> from pytorch_lightning.metrics import MeanSquaredError - >>> target = torch.tensor([2.5, 5.0, 4.0, 8.0]) - >>> preds = torch.tensor([3.0, 5.0, 2.5, 7.0]) - >>> mean_squared_error = MeanSquaredError() - >>> mean_squared_error(preds, target) - tensor(0.8750) - - """ +class MeanSquaredError(_MeanSquaredError): + @deprecated(target=_MeanSquaredError, ver_deprecate="1.3.0", ver_remove="1.5.0") def __init__( self, compute_on_step: bool = True, @@ -57,31 +28,9 @@ def __init__( process_group: Optional[Any] = None, dist_sync_fn: Callable = None, ): - super().__init__( - compute_on_step=compute_on_step, - dist_sync_on_step=dist_sync_on_step, - process_group=process_group, - dist_sync_fn=dist_sync_fn, - ) - - self.add_state("sum_squared_error", default=torch.tensor(0.0), dist_reduce_fx="sum") - self.add_state("total", default=torch.tensor(0), dist_reduce_fx="sum") - - def update(self, preds: torch.Tensor, target: torch.Tensor): - """ - Update state with predictions and targets. - - Args: - preds: Predictions from model - target: Ground truth values """ - sum_squared_error, n_obs = _mean_squared_error_update(preds, target) - - self.sum_squared_error += sum_squared_error - self.total += n_obs + This implementation refers to :class:`~torchmetrics.MeanSquaredError`. - def compute(self): - """ - Computes mean squared error over state. + .. deprecated:: + Use :class:`~torchmetrics.MeanSquaredError`. Will be removed in v1.5.0. """ - return _mean_squared_error_compute(self.sum_squared_error, self.total) diff --git a/pytorch_lightning/metrics/regression/mean_squared_log_error.py b/pytorch_lightning/metrics/regression/mean_squared_log_error.py index caaf09a3663ff..795d6f5409abf 100644 --- a/pytorch_lightning/metrics/regression/mean_squared_log_error.py +++ b/pytorch_lightning/metrics/regression/mean_squared_log_error.py @@ -13,45 +13,14 @@ # limitations under the License. from typing import Any, Callable, Optional -import torch -from torchmetrics import Metric +from torchmetrics import MeanSquaredLogError as _MeanSquaredLogError -from pytorch_lightning.metrics.functional.mean_squared_log_error import ( - _mean_squared_log_error_compute, - _mean_squared_log_error_update, -) +from pytorch_lightning.utilities.deprecation import deprecated -class MeanSquaredLogError(Metric): - r""" - Computes `mean squared logarithmic error - `_ - (MSLE): - - .. math:: \text{MSLE} = \frac{1}{N}\sum_i^N (\log_e(1 + y_i) - \log_e(1 + \hat{y_i}))^2 - - Where :math:`y` is a tensor of target values, and :math:`\hat{y}` is a tensor of predictions. - - Args: - compute_on_step: - Forward only calls ``update()`` and return None if this is set to False. default: True - dist_sync_on_step: - Synchronize metric state across processes at each ``forward()`` - before returning the value at the step. default: False - process_group: - Specify the process group on which synchronization is called. 
default: None (which selects the entire world) - - Example: - - >>> from pytorch_lightning.metrics import MeanSquaredLogError - >>> target = torch.tensor([2.5, 5, 4, 8]) - >>> preds = torch.tensor([3, 5, 2.5, 7]) - >>> mean_squared_log_error = MeanSquaredLogError() - >>> mean_squared_log_error(preds, target) - tensor(0.0397) - - """ +class MeanSquaredLogError(_MeanSquaredLogError): + @deprecated(target=_MeanSquaredLogError, ver_deprecate="1.3.0", ver_remove="1.5.0") def __init__( self, compute_on_step: bool = True, @@ -59,31 +28,9 @@ def __init__( process_group: Optional[Any] = None, dist_sync_fn: Callable = None, ): - super().__init__( - compute_on_step=compute_on_step, - dist_sync_on_step=dist_sync_on_step, - process_group=process_group, - dist_sync_fn=dist_sync_fn, - ) - - self.add_state("sum_squared_log_error", default=torch.tensor(0.0), dist_reduce_fx="sum") - self.add_state("total", default=torch.tensor(0), dist_reduce_fx="sum") - - def update(self, preds: torch.Tensor, target: torch.Tensor): - """ - Update state with predictions and targets. - - Args: - preds: Predictions from model - target: Ground truth values """ - sum_squared_log_error, n_obs = _mean_squared_log_error_update(preds, target) - - self.sum_squared_log_error += sum_squared_log_error - self.total += n_obs + This implementation refers to :class:`~torchmetrics.MeanSquaredLogError`. - def compute(self): - """ - Compute mean squared logarithmic error over state. + .. deprecated:: + Use :class:`~torchmetrics.MeanSquaredLogError`. Will be removed in v1.5.0. """ - return _mean_squared_log_error_compute(self.sum_squared_log_error, self.total) diff --git a/tests/accelerators/test_cpu.py b/tests/accelerators/test_cpu.py index 349e4175a7444..bcb351984a175 100644 --- a/tests/accelerators/test_cpu.py +++ b/tests/accelerators/test_cpu.py @@ -2,6 +2,7 @@ import pytest import torch + from pytorch_lightning import Trainer from pytorch_lightning.accelerators import CPUAccelerator from pytorch_lightning.plugins import SingleDevicePlugin diff --git a/tests/metrics/regression/test_explained_variance.py b/tests/metrics/regression/test_explained_variance.py deleted file mode 100644 index adab562ac6055..0000000000000 --- a/tests/metrics/regression/test_explained_variance.py +++ /dev/null @@ -1,77 +0,0 @@ -from collections import namedtuple -from functools import partial - -import pytest -import torch -from sklearn.metrics import explained_variance_score - -from pytorch_lightning.metrics.functional import explained_variance -from pytorch_lightning.metrics.regression import ExplainedVariance -from tests.metrics.utils import BATCH_SIZE, MetricTester, NUM_BATCHES - -torch.manual_seed(42) - -num_targets = 5 - -Input = namedtuple('Input', ["preds", "target"]) - -_single_target_inputs = Input( - preds=torch.rand(NUM_BATCHES, BATCH_SIZE), - target=torch.rand(NUM_BATCHES, BATCH_SIZE), -) - -_multi_target_inputs = Input( - preds=torch.rand(NUM_BATCHES, BATCH_SIZE, num_targets), - target=torch.rand(NUM_BATCHES, BATCH_SIZE, num_targets), -) - - -def _single_target_sk_metric(preds, target, sk_fn=explained_variance_score): - sk_preds = preds.view(-1).numpy() - sk_target = target.view(-1).numpy() - return sk_fn(sk_target, sk_preds) - - -def _multi_target_sk_metric(preds, target, sk_fn=explained_variance_score): - sk_preds = preds.view(-1, num_targets).numpy() - sk_target = target.view(-1, num_targets).numpy() - return sk_fn(sk_target, sk_preds) - - -@pytest.mark.parametrize("multioutput", ['raw_values', 'uniform_average', 'variance_weighted']) 
-@pytest.mark.parametrize( - "preds, target, sk_metric", - [ - (_single_target_inputs.preds, _single_target_inputs.target, _single_target_sk_metric), - (_multi_target_inputs.preds, _multi_target_inputs.target, _multi_target_sk_metric), - ], -) -class TestExplainedVariance(MetricTester): - - @pytest.mark.parametrize("ddp", [True, False]) - @pytest.mark.parametrize("dist_sync_on_step", [True, False]) - def test_explained_variance(self, multioutput, preds, target, sk_metric, ddp, dist_sync_on_step): - self.run_class_metric_test( - ddp, - preds, - target, - ExplainedVariance, - partial(sk_metric, sk_fn=partial(explained_variance_score, multioutput=multioutput)), - dist_sync_on_step, - metric_args=dict(multioutput=multioutput), - ) - - def test_explained_variance_functional(self, multioutput, preds, target, sk_metric): - self.run_functional_metric_test( - preds, - target, - explained_variance, - partial(sk_metric, sk_fn=partial(explained_variance_score, multioutput=multioutput)), - metric_args=dict(multioutput=multioutput), - ) - - -def test_error_on_different_shape(metric_class=ExplainedVariance): - metric = metric_class() - with pytest.raises(RuntimeError, match='Predictions and targets are expected to have the same shape'): - metric(torch.randn(100, ), torch.randn(50, )) diff --git a/tests/metrics/regression/test_mean_error.py b/tests/metrics/regression/test_mean_error.py deleted file mode 100644 index 041ce12f11164..0000000000000 --- a/tests/metrics/regression/test_mean_error.py +++ /dev/null @@ -1,87 +0,0 @@ -from collections import namedtuple -from functools import partial - -import pytest -import torch -from sklearn.metrics import mean_absolute_error as sk_mean_absolute_error -from sklearn.metrics import mean_squared_error as sk_mean_squared_error -from sklearn.metrics import mean_squared_log_error as sk_mean_squared_log_error - -from pytorch_lightning.metrics.functional import mean_absolute_error, mean_squared_error, mean_squared_log_error -from pytorch_lightning.metrics.regression import MeanAbsoluteError, MeanSquaredError, MeanSquaredLogError -from tests.metrics.utils import BATCH_SIZE, MetricTester, NUM_BATCHES - -torch.manual_seed(42) - -num_targets = 5 - -Input = namedtuple('Input', ["preds", "target"]) - -_single_target_inputs = Input( - preds=torch.rand(NUM_BATCHES, BATCH_SIZE), - target=torch.rand(NUM_BATCHES, BATCH_SIZE), -) - -_multi_target_inputs = Input( - preds=torch.rand(NUM_BATCHES, BATCH_SIZE, num_targets), - target=torch.rand(NUM_BATCHES, BATCH_SIZE, num_targets), -) - - -def _single_target_sk_metric(preds, target, sk_fn=mean_squared_error): - sk_preds = preds.view(-1).numpy() - sk_target = target.view(-1).numpy() - return sk_fn(sk_preds, sk_target) - - -def _multi_target_sk_metric(preds, target, sk_fn=mean_squared_error): - sk_preds = preds.view(-1, num_targets).numpy() - sk_target = target.view(-1, num_targets).numpy() - return sk_fn(sk_preds, sk_target) - - -@pytest.mark.parametrize( - "preds, target, sk_metric", - [ - (_single_target_inputs.preds, _single_target_inputs.target, _single_target_sk_metric), - (_multi_target_inputs.preds, _multi_target_inputs.target, _multi_target_sk_metric), - ], -) -@pytest.mark.parametrize( - "metric_class, metric_functional, sk_fn", - [ - (MeanSquaredError, mean_squared_error, sk_mean_squared_error), - (MeanAbsoluteError, mean_absolute_error, sk_mean_absolute_error), - (MeanSquaredLogError, mean_squared_log_error, sk_mean_squared_log_error), - ], -) -class TestMeanError(MetricTester): - - @pytest.mark.parametrize("ddp", [True, 
False]) - @pytest.mark.parametrize("dist_sync_on_step", [True, False]) - def test_mean_error_class( - self, preds, target, sk_metric, metric_class, metric_functional, sk_fn, ddp, dist_sync_on_step - ): - self.run_class_metric_test( - ddp=ddp, - preds=preds, - target=target, - metric_class=metric_class, - sk_metric=partial(sk_metric, sk_fn=sk_fn), - dist_sync_on_step=dist_sync_on_step, - ) - - def test_mean_error_functional(self, preds, target, sk_metric, metric_class, metric_functional, sk_fn): - self.run_functional_metric_test( - preds=preds, - target=target, - metric_functional=metric_functional, - sk_metric=partial(sk_metric, sk_fn=sk_fn), - ) - - -@pytest.mark.parametrize("metric_class", [MeanSquaredError, MeanAbsoluteError, MeanSquaredLogError]) -def test_error_on_different_shape(metric_class): - metric = metric_class() - with pytest.raises(RuntimeError, match='Predictions and targets are expected to have the same shape'): - metric(torch.randn(100, ), torch.randn(50, )) diff --git a/tests/metrics/test_remove_1-5_metrics.py b/tests/metrics/test_remove_1-5_metrics.py index 339d07b163632..eaf17ec0792da 100644 --- a/tests/metrics/test_remove_1-5_metrics.py +++ b/tests/metrics/test_remove_1-5_metrics.py @@ -22,10 +22,14 @@ AUROC, AveragePrecision, ConfusionMatrix, + ExplainedVariance, F1, FBeta, HammingDistance, IoU, + MeanAbsoluteError, + MeanSquaredError, + MeanSquaredLogError, MetricCollection, Precision, PrecisionRecallCurve, @@ -38,10 +42,14 @@ auroc, average_precision, confusion_matrix, + explained_variance, f1, fbeta, hamming_distance, iou, + mean_absolute_error, + mean_squared_error, + mean_squared_log_error, precision, precision_recall, precision_recall_curve, @@ -50,6 +58,7 @@ stat_scores, ) from pytorch_lightning.metrics.functional.accuracy import accuracy +from pytorch_lightning.metrics.functional.mean_relative_error import mean_relative_error from pytorch_lightning.metrics.utils import get_num_classes, select_topk, to_categorical, to_onehot @@ -232,8 +241,52 @@ def test_v1_5_metric_detect(): IoU(num_classes=1) target = torch.randint(0, 2, (10, 25, 25)) - pred = torch.tensor(target) - pred[2:5, 7:13, 9:15] = 1 - pred[2:5, 7:13, 9:15] + preds = torch.tensor(target) + preds[2:5, 7:13, 9:15] = 1 - preds[2:5, 7:13, 9:15] iou.warned = False with pytest.deprecated_call(match='It will be removed in v1.5.0'): - assert torch.allclose(iou(pred, target), torch.tensor(0.9660), atol=1e-4) + assert torch.allclose(iou(preds, target), torch.tensor(0.9660), atol=1e-4) + + +def test_v1_5_metric_regress(): + ExplainedVariance.__init__.warned = False + with pytest.deprecated_call(match='It will be removed in v1.5.0'): + ExplainedVariance() + + MeanAbsoluteError.__init__.warned = False + with pytest.deprecated_call(match='It will be removed in v1.5.0'): + MeanAbsoluteError() + + MeanSquaredError.__init__.warned = False + with pytest.deprecated_call(match='It will be removed in v1.5.0'): + MeanSquaredError() + + MeanSquaredLogError.__init__.warned = False + with pytest.deprecated_call(match='It will be removed in v1.5.0'): + MeanSquaredLogError() + + target = torch.tensor([3, -0.5, 2, 7]) + preds = torch.tensor([2.5, 0.0, 2, 8]) + explained_variance.warned = False + with pytest.deprecated_call(match='It will be removed in v1.5.0'): + res = explained_variance(preds, target) + assert torch.allclose(res, torch.tensor(0.9572), atol=1e-4) + + x = torch.tensor([0., 1, 2, 3]) + y = torch.tensor([0., 1, 2, 2]) + mean_absolute_error.warned = False + with pytest.deprecated_call(match='It will be removed in 
v1.5.0'):
+        assert mean_absolute_error(x, y) == 0.25
+
+    mean_relative_error.warned = False
+    with pytest.deprecated_call(match='It will be removed in v1.5.0'):
+        assert mean_relative_error(x, y) == 0.125
+
+    mean_squared_error.warned = False
+    with pytest.deprecated_call(match='It will be removed in v1.5.0'):
+        assert mean_squared_error(x, y) == 0.25
+
+    mean_squared_log_error.warned = False
+    with pytest.deprecated_call(match='It will be removed in v1.5.0'):
+        res = mean_squared_log_error(x, y)
+        assert torch.allclose(res, torch.tensor(0.0207), atol=1e-4)
diff --git a/tests/utilities/test_argparse.py b/tests/utilities/test_argparse.py
index fdf5ae0cafe65..aef266d639b4a 100644
--- a/tests/utilities/test_argparse.py
+++ b/tests/utilities/test_argparse.py
@@ -7,13 +7,13 @@
 from pytorch_lightning import Trainer
 from pytorch_lightning.utilities.argparse import (
+    _gpus_arg_default,
+    _int_or_float_type,
     add_argparse_args,
     from_argparse_args,
     get_abbrev_qualified_cls_name,
     parse_argparser,
     parse_args_from_docstring,
-    _gpus_arg_default,
-    _int_or_float_type
 )

From f93414d085784e177e75a143bafefa5ffdadd0c8 Mon Sep 17 00:00:00 2001
From: Jirka Borovec
Date: Tue, 23 Mar 2021 11:01:25 +0100
Subject: [PATCH 20/25] Prune metrics: regression 9/n (#6637)

* psnr
* r2score
* ssim
* chlog
---
 CHANGELOG.md | 2 +
 pytorch_lightning/metrics/functional/psnr.py | 88 +----------
 .../metrics/functional/r2score.py | 124 +--------------
 pytorch_lightning/metrics/functional/ssim.py | 142 +-----------------
 pytorch_lightning/metrics/regression/psnr.py | 123 +--------------
 .../metrics/regression/r2score.py | 122 +--------------
 pytorch_lightning/metrics/regression/ssim.py | 78 +---------
 tests/metrics/regression/test_psnr.py | 133 ----------------
 tests/metrics/regression/test_r2score.py | 114 --------------
 tests/metrics/regression/test_ssim.py | 104 -------------
 tests/metrics/test_remove_1-5_metrics.py | 39 +++++
 11 files changed, 80 insertions(+), 989 deletions(-)
 delete mode 100644 tests/metrics/regression/test_psnr.py
 delete mode 100644 tests/metrics/regression/test_r2score.py
 delete mode 100644 tests/metrics/regression/test_ssim.py

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 57a071bff297a..4cf3e0f1fd326 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -92,6 +92,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
[#6636](https://github.com/PyTorchLightning/pytorch-lightning/pull/6636), + [#6637](https://github.com/PyTorchLightning/pytorch-lightning/pull/6637), + ) diff --git a/pytorch_lightning/metrics/functional/psnr.py b/pytorch_lightning/metrics/functional/psnr.py index 0b50ea092b7fa..dd7aa44ae628e 100644 --- a/pytorch_lightning/metrics/functional/psnr.py +++ b/pytorch_lightning/metrics/functional/psnr.py @@ -14,46 +14,12 @@ from typing import Optional, Tuple, Union import torch -from torchmetrics.utilities import reduce +from torchmetrics.functional import psnr as _psnr -from pytorch_lightning.utilities import rank_zero_warn - - -def _psnr_compute( - sum_squared_error: torch.Tensor, - n_obs: torch.Tensor, - data_range: torch.Tensor, - base: float = 10.0, - reduction: str = 'elementwise_mean', -) -> torch.Tensor: - psnr_base_e = 2 * torch.log(data_range) - torch.log(sum_squared_error / n_obs) - psnr = psnr_base_e * (10 / torch.log(torch.tensor(base))) - return reduce(psnr, reduction=reduction) - - -def _psnr_update(preds: torch.Tensor, - target: torch.Tensor, - dim: Optional[Union[int, Tuple[int, ...]]] = None) -> Tuple[torch.Tensor, torch.Tensor]: - if dim is None: - sum_squared_error = torch.sum(torch.pow(preds - target, 2)) - n_obs = torch.tensor(target.numel(), device=target.device) - return sum_squared_error, n_obs - - sum_squared_error = torch.sum(torch.pow(preds - target, 2), dim=dim) - - if isinstance(dim, int): - dim_list = [dim] - else: - dim_list = list(dim) - if not dim_list: - n_obs = torch.tensor(target.numel(), device=target.device) - else: - n_obs = torch.tensor(target.size(), device=target.device)[dim_list].prod() - n_obs = n_obs.expand_as(sum_squared_error) - - return sum_squared_error, n_obs +from pytorch_lightning.utilities.deprecation import deprecated +@deprecated(target=_psnr, ver_deprecate="1.3.0", ver_remove="1.5.0") def psnr( preds: torch.Tensor, target: torch.Tensor, @@ -63,50 +29,6 @@ def psnr( dim: Optional[Union[int, Tuple[int, ...]]] = None, ) -> torch.Tensor: """ - Computes the peak signal-to-noise ratio - - Args: - preds: estimated signal - target: groun truth signal - data_range: - the range of the data. If None, it is determined from the data (max - min). ``data_range`` must be given - when ``dim`` is not None. - base: a base of a logarithm to use (default: 10) - reduction: a method to reduce metric score over labels. - - - ``'elementwise_mean'``: takes the mean (default) - - ``'sum'``: takes the sum - - ``'none'``: no reduction will be applied - - dim: - Dimensions to reduce PSNR scores over provided as either an integer or a list of integers. Default is - None meaning scores will be reduced across all dimensions. - Return: - Tensor with PSNR score - - Raises: - ValueError: - If ``dim`` is not ``None`` and ``data_range`` is not provided. - - Example: - >>> from pytorch_lightning.metrics.functional import psnr - >>> pred = torch.tensor([[0.0, 1.0], [2.0, 3.0]]) - >>> target = torch.tensor([[3.0, 2.0], [1.0, 0.0]]) - >>> psnr(pred, target) - tensor(2.5527) - + .. deprecated:: + Use :func:`torchmetrics.functional.psnr`. Will be removed in v1.5.0. """ - if dim is None and reduction != 'elementwise_mean': - rank_zero_warn(f'The `reduction={reduction}` will not have any effect when `dim` is None.') - - if data_range is None: - if dim is not None: - # Maybe we could use `torch.amax(target, dim=dim) - torch.amin(target, dim=dim)` in PyTorch 1.7 to calculate - # `data_range` in the future. 
- raise ValueError("The `data_range` must be given when `dim` is not None.") - - data_range = target.max() - target.min() - else: - data_range = torch.tensor(float(data_range)) - sum_squared_error, n_obs = _psnr_update(preds, target, dim=dim) - return _psnr_compute(sum_squared_error, n_obs, data_range, base=base, reduction=reduction) diff --git a/pytorch_lightning/metrics/functional/r2score.py b/pytorch_lightning/metrics/functional/r2score.py index d3f1090564a88..49273d9cefaed 100644 --- a/pytorch_lightning/metrics/functional/r2score.py +++ b/pytorch_lightning/metrics/functional/r2score.py @@ -11,133 +11,21 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from typing import Tuple import torch -from torchmetrics.utilities.checks import _check_same_shape +from torchmetrics.functional import r2score as _r2score -from pytorch_lightning.utilities import rank_zero_warn - - -def _r2score_update( - preds: torch.tensor, - target: torch.Tensor, -) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - _check_same_shape(preds, target) - if preds.ndim > 2: - raise ValueError( - 'Expected both prediction and target to be 1D or 2D tensors,' - f' but recevied tensors with dimension {preds.shape}' - ) - if len(preds) < 2: - raise ValueError('Needs atleast two samples to calculate r2 score.') - - sum_error = torch.sum(target, dim=0) - sum_squared_error = torch.sum(torch.pow(target, 2.0), dim=0) - residual = torch.sum(torch.pow(target - preds, 2.0), dim=0) - total = target.size(0) - - return sum_squared_error, sum_error, residual, total - - -def _r2score_compute( - sum_squared_error: torch.Tensor, - sum_error: torch.Tensor, - residual: torch.Tensor, - total: torch.Tensor, - adjusted: int = 0, - multioutput: str = "uniform_average" -) -> torch.Tensor: - mean_error = sum_error / total - diff = sum_squared_error - sum_error * mean_error - raw_scores = 1 - (residual / diff) - - if multioutput == "raw_values": - r2score = raw_scores - elif multioutput == "uniform_average": - r2score = torch.mean(raw_scores) - elif multioutput == "variance_weighted": - diff_sum = torch.sum(diff) - r2score = torch.sum(diff / diff_sum * raw_scores) - else: - raise ValueError( - 'Argument `multioutput` must be either `raw_values`,' - f' `uniform_average` or `variance_weighted`. Received {multioutput}.' - ) - - if adjusted < 0 or not isinstance(adjusted, int): - raise ValueError('`adjusted` parameter should be an integer larger or' ' equal to 0.') - - if adjusted != 0: - if adjusted > total - 1: - rank_zero_warn( - "More independent regressions than datapoints in" - " adjusted r2 score. Falls back to standard r2 score.", UserWarning - ) - elif adjusted == total - 1: - rank_zero_warn("Division by zero in adjusted r2 score. Falls back to" " standard r2 score.", UserWarning) - else: - r2score = 1 - (1 - r2score) * (total - 1) / (total - adjusted - 1) - return r2score +from pytorch_lightning.utilities.deprecation import deprecated +@deprecated(target=_r2score, ver_deprecate="1.3.0", ver_remove="1.5.0") def r2score( preds: torch.Tensor, target: torch.Tensor, adjusted: int = 0, multioutput: str = "uniform_average", ) -> torch.Tensor: - r""" - Computes r2 score also known as `coefficient of determination - `_: - - .. 
math:: R^2 = 1 - \frac{SS_res}{SS_tot} - - where :math:`SS_res=\sum_i (y_i - f(x_i))^2` is the sum of residual squares, and - :math:`SS_tot=\sum_i (y_i - \bar{y})^2` is total sum of squares. Can also calculate - adjusted r2 score given by - - .. math:: R^2_adj = 1 - \frac{(1-R^2)(n-1)}{n-k-1} - - where the parameter :math:`k` (the number of independent regressors) should - be provided as the ``adjusted`` argument. - - Args: - preds: estimated labels - target: ground truth labels - adjusted: number of independent regressors for calculating adjusted r2 score. - Default 0 (standard r2 score). - multioutput: Defines aggregation in the case of multiple output scores. Can be one - of the following strings (default is ``'uniform_average'``.): - - * ``'raw_values'`` returns full set of scores - * ``'uniform_average'`` scores are uniformly averaged - * ``'variance_weighted'`` scores are weighted by their individual variances - - Raises: - ValueError: - If both ``preds`` and ``targets`` are not ``1D`` or ``2D`` tensors. - ValueError: - If ``len(preds)`` is less than ``2`` - since at least ``2`` sampels are needed to calculate r2 score. - ValueError: - If ``multioutput`` is not one of ``raw_values``, - ``uniform_average`` or ``variance_weighted``. - ValueError: - If ``adjusted`` is not an ``integer`` greater than ``0``. - - Example: - - >>> from pytorch_lightning.metrics.functional import r2score - >>> target = torch.tensor([3, -0.5, 2, 7]) - >>> preds = torch.tensor([2.5, 0.0, 2, 8]) - >>> r2score(preds, target) - tensor(0.9486) - - >>> target = torch.tensor([[0.5, 1], [-1, 1], [7, -6]]) - >>> preds = torch.tensor([[0, 2], [-1, 2], [8, -5]]) - >>> r2score(preds, target, multioutput='raw_values') - tensor([0.9654, 0.9082]) """ - sum_squared_error, sum_error, residual, total = _r2score_update(preds, target) - return _r2score_compute(sum_squared_error, sum_error, residual, total, adjusted, multioutput) + .. deprecated:: + Use :func:`torchmetrics.functional.r2score`. Will be removed in v1.5.0. + """ diff --git a/pytorch_lightning/metrics/functional/ssim.py b/pytorch_lightning/metrics/functional/ssim.py index 4899a3ad3be4d..8809fec8d8ff1 100644 --- a/pytorch_lightning/metrics/functional/ssim.py +++ b/pytorch_lightning/metrics/functional/ssim.py @@ -11,107 +11,15 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-from typing import Optional, Sequence, Tuple +from typing import Optional, Sequence import torch -from torch.nn import functional as F -from torchmetrics.utilities import reduce -from torchmetrics.utilities.checks import _check_same_shape +from torchmetrics.functional import ssim as _ssim - -def _gaussian(kernel_size: int, sigma: int, dtype: torch.dtype, device: torch.device): - dist = torch.arange(start=(1 - kernel_size) / 2, end=(1 + kernel_size) / 2, step=1, dtype=dtype, device=device) - gauss = torch.exp(-torch.pow(dist / sigma, 2) / 2) - return (gauss / gauss.sum()).unsqueeze(dim=0) # (1, kernel_size) - - -def _gaussian_kernel( - channel: int, kernel_size: Sequence[int], sigma: Sequence[float], dtype: torch.dtype, device: torch.device -): - gaussian_kernel_x = _gaussian(kernel_size[0], sigma[0], dtype, device) - gaussian_kernel_y = _gaussian(kernel_size[1], sigma[1], dtype, device) - kernel = torch.matmul(gaussian_kernel_x.t(), gaussian_kernel_y) # (kernel_size, 1) * (1, kernel_size) - - return kernel.expand(channel, 1, kernel_size[0], kernel_size[1]) - - -def _ssim_update( - preds: torch.Tensor, - target: torch.Tensor, -) -> Tuple[torch.Tensor, torch.Tensor]: - if preds.dtype != target.dtype: - raise TypeError( - "Expected `preds` and `target` to have the same data type." - f" Got pred: {preds.dtype} and target: {target.dtype}." - ) - _check_same_shape(preds, target) - if len(preds.shape) != 4: - raise ValueError( - "Expected `preds` and `target` to have BxCxHxW shape." - f" Got pred: {preds.shape} and target: {target.shape}." - ) - return preds, target - - -def _ssim_compute( - preds: torch.Tensor, - target: torch.Tensor, - kernel_size: Sequence[int] = (11, 11), - sigma: Sequence[float] = (1.5, 1.5), - reduction: str = "elementwise_mean", - data_range: Optional[float] = None, - k1: float = 0.01, - k2: float = 0.03, -): - if len(kernel_size) != 2 or len(sigma) != 2: - raise ValueError( - "Expected `kernel_size` and `sigma` to have the length of two." - f" Got kernel_size: {len(kernel_size)} and sigma: {len(sigma)}." - ) - - if any(x % 2 == 0 or x <= 0 for x in kernel_size): - raise ValueError(f"Expected `kernel_size` to have odd positive number. Got {kernel_size}.") - - if any(y <= 0 for y in sigma): - raise ValueError(f"Expected `sigma` to have positive number. 
Got {sigma}.") - - if data_range is None: - data_range = max(preds.max() - preds.min(), target.max() - target.min()) - - c1 = pow(k1 * data_range, 2) - c2 = pow(k2 * data_range, 2) - device = preds.device - - channel = preds.size(1) - dtype = preds.dtype - kernel = _gaussian_kernel(channel, kernel_size, sigma, dtype, device) - pad_w = (kernel_size[0] - 1) // 2 - pad_h = (kernel_size[1] - 1) // 2 - - preds = F.pad(preds, (pad_w, pad_w, pad_h, pad_h), mode='reflect') - target = F.pad(target, (pad_w, pad_w, pad_h, pad_h), mode='reflect') - - input_list = torch.cat((preds, target, preds * preds, target * target, preds * target)) # (5 * B, C, H, W) - outputs = F.conv2d(input_list, kernel, groups=channel) - output_list = [outputs[x * preds.size(0):(x + 1) * preds.size(0)] for x in range(len(outputs))] - - mu_pred_sq = output_list[0].pow(2) - mu_target_sq = output_list[1].pow(2) - mu_pred_target = output_list[0] * output_list[1] - - sigma_pred_sq = output_list[2] - mu_pred_sq - sigma_target_sq = output_list[3] - mu_target_sq - sigma_pred_target = output_list[4] - mu_pred_target - - upper = 2 * sigma_pred_target + c2 - lower = sigma_pred_sq + sigma_target_sq + c2 - - ssim_idx = ((2 * mu_pred_target + c1) * upper) / ((mu_pred_sq + mu_target_sq + c1) * lower) - ssim_idx = ssim_idx[..., pad_h:-pad_h, pad_w:-pad_w] - - return reduce(ssim_idx, reduction) +from pytorch_lightning.utilities.deprecation import deprecated +@deprecated(target=_ssim, ver_deprecate="1.3.0", ver_remove="1.5.0") def ssim( preds: torch.Tensor, target: torch.Tensor, @@ -123,44 +31,6 @@ def ssim( k2: float = 0.03, ) -> torch.Tensor: """ - Computes Structual Similarity Index Measure - - Args: - preds: estimated image - target: ground truth image - kernel_size: size of the gaussian kernel (default: (11, 11)) - sigma: Standard deviation of the gaussian kernel (default: (1.5, 1.5)) - reduction: a method to reduce metric score over labels. - - - ``'elementwise_mean'``: takes the mean (default) - - ``'sum'``: takes the sum - - ``'none'``: no reduction will be applied - - data_range: Range of the image. If ``None``, it is determined from the image (max - min) - k1: Parameter of SSIM. Default: 0.01 - k2: Parameter of SSIM. Default: 0.03 - - Return: - Tensor with SSIM score - - Raises: - TypeError: - If ``preds`` and ``target`` don't have the same data type. - ValueError: - If ``preds`` and ``target`` don't have ``BxCxHxW shape``. - ValueError: - If the length of ``kernel_size`` or ``sigma`` is not ``2``. - ValueError: - If one of the elements of ``kernel_size`` is not an ``odd positive number``. - ValueError: - If one of the elements of ``sigma`` is not a ``positive number``. - - Example: - >>> from pytorch_lightning.metrics.functional import ssim - >>> preds = torch.rand([16, 1, 16, 16]) - >>> target = preds * 0.75 - >>> ssim(preds, target) - tensor(0.9219) + .. deprecated:: + Use :func:`torchmetrics.functional.ssim`. Will be removed in v1.5.0. """ - preds, target = _ssim_update(preds, target) - return _ssim_compute(preds, target, kernel_size, sigma, reduction, data_range, k1, k2) diff --git a/pytorch_lightning/metrics/regression/psnr.py b/pytorch_lightning/metrics/regression/psnr.py index 746ff1e52d574..85b8eceaa24c5 100644 --- a/pytorch_lightning/metrics/regression/psnr.py +++ b/pytorch_lightning/metrics/regression/psnr.py @@ -11,61 +11,16 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-from typing import Any, Optional, Sequence, Tuple, Union +from typing import Any, Optional, Tuple, Union -import torch -from torchmetrics import Metric +from torchmetrics import PSNR as _PSNR -from pytorch_lightning import utilities -from pytorch_lightning.metrics.functional.psnr import _psnr_compute, _psnr_update +from pytorch_lightning.utilities.deprecation import deprecated -class PSNR(Metric): - r""" - Computes `peak signal-to-noise ratio `_ (PSNR): - - .. math:: \text{PSNR}(I, J) = 10 * \log_{10} \left(\frac{\max(I)^2}{\text{MSE}(I, J)}\right) - - Where :math:`\text{MSE}` denotes the `mean-squared-error - `_ function. - - Args: - data_range: - the range of the data. If None, it is determined from the data (max - min). - The ``data_range`` must be given when ``dim`` is not None. - base: a base of a logarithm to use (default: 10) - reduction: a method to reduce metric score over labels. - - - ``'elementwise_mean'``: takes the mean (default) - - ``'sum'``: takes the sum - - ``'none'``: no reduction will be applied - - dim: - Dimensions to reduce PSNR scores over, provided as either an integer or a list of integers. Default is - None meaning scores will be reduced across all dimensions and all batches. - compute_on_step: - Forward only calls ``update()`` and return None if this is set to False. default: True - dist_sync_on_step: - Synchronize metric state across processes at each ``forward()`` - before returning the value at the step. default: False - process_group: - Specify the process group on which synchronization is called. default: None (which selects the entire world) - - Raises: - ValueError: - If ``dim`` is not ``None`` and ``data_range`` is not given. - - Example: - - >>> from pytorch_lightning.metrics import PSNR - >>> psnr = PSNR() - >>> preds = torch.tensor([[0.0, 1.0], [2.0, 3.0]]) - >>> target = torch.tensor([[3.0, 2.0], [1.0, 0.0]]) - >>> psnr(preds, target) - tensor(2.5527) - - """ +class PSNR(_PSNR): + @deprecated(target=_PSNR, ver_deprecate="1.3.0", ver_remove="1.5.0") def __init__( self, data_range: Optional[float] = None, @@ -76,71 +31,9 @@ def __init__( dist_sync_on_step: bool = False, process_group: Optional[Any] = None, ): - super().__init__( - compute_on_step=compute_on_step, - dist_sync_on_step=dist_sync_on_step, - process_group=process_group, - ) - - if dim is None and reduction != 'elementwise_mean': - utilities.rank_zero_warn(f'The `reduction={reduction}` will not have any effect when `dim` is None.') - - if dim is None: - self.add_state("sum_squared_error", default=torch.tensor(0.0), dist_reduce_fx="sum") - self.add_state("total", default=torch.tensor(0), dist_reduce_fx="sum") - else: - self.add_state("sum_squared_error", default=[]) - self.add_state("total", default=[]) - - if data_range is None: - if dim is not None: - # Maybe we could use `torch.amax(target, dim=dim) - torch.amin(target, dim=dim)` in PyTorch 1.7 to - # calculate `data_range` in the future. - raise ValueError("The `data_range` must be given when `dim` is not None.") - - self.data_range = None - self.add_state("min_target", default=torch.tensor(0.0), dist_reduce_fx=torch.min) - self.add_state("max_target", default=torch.tensor(0.0), dist_reduce_fx=torch.max) - else: - self.register_buffer("data_range", torch.tensor(float(data_range))) - self.base = base - self.reduction = reduction - self.dim = tuple(dim) if isinstance(dim, Sequence) else dim - - def update(self, preds: torch.Tensor, target: torch.Tensor): """ - Update state with predictions and targets. 
+ This implementation refers to :class:`~torchmetrics.PSNR`. - Args: - preds: Predictions from model - target: Ground truth values + .. deprecated:: + Use :class:`~torchmetrics.PSNR`. Will be removed in v1.5.0. """ - sum_squared_error, n_obs = _psnr_update(preds, target, dim=self.dim) - if self.dim is None: - if self.data_range is None: - # keep track of min and max target values - self.min_target = min(target.min(), self.min_target) - self.max_target = max(target.max(), self.max_target) - - self.sum_squared_error += sum_squared_error - self.total += n_obs - else: - self.sum_squared_error.append(sum_squared_error) - self.total.append(n_obs) - - def compute(self): - """ - Compute peak signal-to-noise ratio over state. - """ - if self.data_range is not None: - data_range = self.data_range - else: - data_range = self.max_target - self.min_target - - if self.dim is None: - sum_squared_error = self.sum_squared_error - total = self.total - else: - sum_squared_error = torch.cat([values.flatten() for values in self.sum_squared_error]) - total = torch.cat([values.flatten() for values in self.total]) - return _psnr_compute(sum_squared_error, total, data_range, base=self.base, reduction=self.reduction) diff --git a/pytorch_lightning/metrics/regression/r2score.py b/pytorch_lightning/metrics/regression/r2score.py index 8156b8bc72d48..52621d6df7c28 100644 --- a/pytorch_lightning/metrics/regression/r2score.py +++ b/pytorch_lightning/metrics/regression/r2score.py @@ -13,81 +13,14 @@ # limitations under the License. from typing import Any, Callable, Optional -import torch -from torchmetrics import Metric +from torchmetrics import R2Score as _R2Score -from pytorch_lightning.metrics.functional.r2score import _r2score_compute, _r2score_update +from pytorch_lightning.utilities.deprecation import deprecated -class R2Score(Metric): - r""" - Computes r2 score also known as `coefficient of determination - `_: - - .. math:: R^2 = 1 - \frac{SS_res}{SS_tot} - - where :math:`SS_res=\sum_i (y_i - f(x_i))^2` is the sum of residual squares, and - :math:`SS_tot=\sum_i (y_i - \bar{y})^2` is total sum of squares. Can also calculate - adjusted r2 score given by - - .. math:: R^2_adj = 1 - \frac{(1-R^2)(n-1)}{n-k-1} - - where the parameter :math:`k` (the number of independent regressors) should - be provided as the `adjusted` argument. - - Forward accepts - - - ``preds`` (float tensor): ``(N,)`` or ``(N, M)`` (multioutput) - - ``target`` (float tensor): ``(N,)`` or ``(N, M)`` (multioutput) - - In the case of multioutput, as default the variances will be uniformly - averaged over the additional dimensions. Please see argument `multioutput` - for changing this behavior. - - Args: - num_outputs: - Number of outputs in multioutput setting (default is 1) - adjusted: - number of independent regressors for calculating adjusted r2 score. - Default 0 (standard r2 score). - multioutput: - Defines aggregation in the case of multiple output scores. Can be one - of the following strings (default is ``'uniform_average'``.): - - * ``'raw_values'`` returns full set of scores - * ``'uniform_average'`` scores are uniformly averaged - * ``'variance_weighted'`` scores are weighted by their individual variances - - compute_on_step: - Forward only calls ``update()`` and return None if this is set to False. default: True - dist_sync_on_step: - Synchronize metric state across processes at each ``forward()`` - before returning the value at the step. default: False - process_group: - Specify the process group on which synchronization is called. 
default: None (which selects the entire world) - - Raises: - ValueError: - If ``adjusted`` parameter is not an integer larger or equal to 0. - ValueError: - If ``multioutput`` is not one of ``"raw_values"``, ``"uniform_average"`` or ``"variance_weighted"``. - - Example: - - >>> from pytorch_lightning.metrics import R2Score - >>> target = torch.tensor([3, -0.5, 2, 7]) - >>> preds = torch.tensor([2.5, 0.0, 2, 8]) - >>> r2score = R2Score() - >>> r2score(preds, target) - tensor(0.9486) - - >>> target = torch.tensor([[0.5, 1], [-1, 1], [7, -6]]) - >>> preds = torch.tensor([[0, 2], [-1, 2], [8, -5]]) - >>> r2score = R2Score(num_outputs=2, multioutput='raw_values') - >>> r2score(preds, target) - tensor([0.9654, 0.9082]) - """ +class R2Score(_R2Score): + @deprecated(target=_R2Score, ver_deprecate="1.3.0", ver_remove="1.5.0") def __init__( self, num_outputs: int = 1, @@ -98,50 +31,9 @@ def __init__( process_group: Optional[Any] = None, dist_sync_fn: Callable = None, ): - super().__init__( - compute_on_step=compute_on_step, - dist_sync_on_step=dist_sync_on_step, - process_group=process_group, - dist_sync_fn=dist_sync_fn, - ) - - self.num_outputs = num_outputs - - if adjusted < 0 or not isinstance(adjusted, int): - raise ValueError('`adjusted` parameter should be an integer larger or equal to 0.') - self.adjusted = adjusted - - allowed_multioutput = ('raw_values', 'uniform_average', 'variance_weighted') - if multioutput not in allowed_multioutput: - raise ValueError( - f'Invalid input to argument `multioutput`. Choose one of the following: {allowed_multioutput}' - ) - self.multioutput = multioutput - - self.add_state("sum_squared_error", default=torch.zeros(self.num_outputs), dist_reduce_fx="sum") - self.add_state("sum_error", default=torch.zeros(self.num_outputs), dist_reduce_fx="sum") - self.add_state("residual", default=torch.zeros(self.num_outputs), dist_reduce_fx="sum") - self.add_state("total", default=torch.tensor(0), dist_reduce_fx="sum") - - def update(self, preds: torch.Tensor, target: torch.Tensor): - """ - Update state with predictions and targets. - - Args: - preds: Predictions from model - target: Ground truth values """ - sum_squared_error, sum_error, residual, total = _r2score_update(preds, target) + This implementation refers to :class:`~torchmetrics.R2Score`. - self.sum_squared_error += sum_squared_error - self.sum_error += sum_error - self.residual += residual - self.total += total - - def compute(self) -> torch.Tensor: - """ - Computes r2 score over the metric states. + .. deprecated:: + Use :class:`~torchmetrics.R2Score`. Will be removed in v1.5.0. """ - return _r2score_compute( - self.sum_squared_error, self.sum_error, self.residual, self.total, self.adjusted, self.multioutput - ) diff --git a/pytorch_lightning/metrics/regression/ssim.py b/pytorch_lightning/metrics/regression/ssim.py index a3bbab938ffad..b290808c6fa5e 100644 --- a/pytorch_lightning/metrics/regression/ssim.py +++ b/pytorch_lightning/metrics/regression/ssim.py @@ -13,43 +13,14 @@ # limitations under the License. from typing import Any, Optional, Sequence -import torch -from torchmetrics import Metric +from torchmetrics import SSIM as _SSIM -from pytorch_lightning.metrics.functional.ssim import _ssim_compute, _ssim_update -from pytorch_lightning.utilities import rank_zero_warn +from pytorch_lightning.utilities.deprecation import deprecated -class SSIM(Metric): - """ - Computes `Structual Similarity Index Measure - `_ (SSIM). 
- - Args: - kernel_size: size of the gaussian kernel (default: (11, 11)) - sigma: Standard deviation of the gaussian kernel (default: (1.5, 1.5)) - reduction: a method to reduce metric score over labels. - - - ``'elementwise_mean'``: takes the mean (default) - - ``'sum'``: takes the sum - - ``'none'``: no reduction will be applied - - data_range: Range of the image. If ``None``, it is determined from the image (max - min) - k1: Parameter of SSIM. Default: 0.01 - k2: Parameter of SSIM. Default: 0.03 - - Return: - Tensor with SSIM score - - Example: - >>> from pytorch_lightning.metrics import SSIM - >>> preds = torch.rand([16, 1, 16, 16]) - >>> target = preds * 0.75 - >>> ssim = SSIM() - >>> ssim(preds, target) - tensor(0.9219) - """ +class SSIM(_SSIM): + @deprecated(target=_SSIM, ver_deprecate="1.3.0", ver_remove="1.5.0") def __init__( self, kernel_size: Sequence[int] = (11, 11), @@ -62,44 +33,9 @@ def __init__( dist_sync_on_step: bool = False, process_group: Optional[Any] = None, ): - super().__init__( - compute_on_step=compute_on_step, - dist_sync_on_step=dist_sync_on_step, - process_group=process_group, - ) - rank_zero_warn( - 'Metric `SSIM` will save all targets and' - ' predictions in buffer. For large datasets this may lead' - ' to large memory footprint.' - ) - - self.add_state("y", default=[], dist_reduce_fx=None) - self.add_state("y_pred", default=[], dist_reduce_fx=None) - self.kernel_size = kernel_size - self.sigma = sigma - self.data_range = data_range - self.k1 = k1 - self.k2 = k2 - self.reduction = reduction - - def update(self, preds: torch.Tensor, target: torch.Tensor): """ - Update state with predictions and targets. + This implementation refers to :class:`~torchmetrics.SSIM`. - Args: - preds: Predictions from model - target: Ground truth values - """ - preds, target = _ssim_update(preds, target) - self.y_pred.append(preds) - self.y.append(target) - - def compute(self): - """ - Computes explained variance over state. + .. deprecated:: + Use :class:`~torchmetrics.SSIM`. Will be removed in v1.5.0. """ - preds = torch.cat(self.y_pred, dim=0) - target = torch.cat(self.y, dim=0) - return _ssim_compute( - preds, target, self.kernel_size, self.sigma, self.reduction, self.data_range, self.k1, self.k2 - ) diff --git a/tests/metrics/regression/test_psnr.py b/tests/metrics/regression/test_psnr.py deleted file mode 100644 index eb07fffb9d55c..0000000000000 --- a/tests/metrics/regression/test_psnr.py +++ /dev/null @@ -1,133 +0,0 @@ -# Copyright The PyTorch Lightning team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from collections import namedtuple -from functools import partial - -import numpy as np -import pytest -import torch -from skimage.metrics import peak_signal_noise_ratio - -from pytorch_lightning.metrics.functional import psnr -from pytorch_lightning.metrics.regression import PSNR -from tests.metrics.utils import BATCH_SIZE, MetricTester, NUM_BATCHES - -torch.manual_seed(42) - -Input = namedtuple('Input', ["preds", "target"]) - -_input_size = (NUM_BATCHES, BATCH_SIZE, 32, 32) -_inputs = [ - Input( - preds=torch.randint(n_cls_pred, _input_size, dtype=torch.float), - target=torch.randint(n_cls_target, _input_size, dtype=torch.float), - ) for n_cls_pred, n_cls_target in [(10, 10), (5, 10), (10, 5)] -] - - -def _to_sk_peak_signal_noise_ratio_inputs(value, dim): - value = value.numpy() - batches = value[None] if value.ndim == len(_input_size) - 1 else value - - if dim is None: - return [batches] - - num_dims = np.size(dim) - if not num_dims: - return batches - - inputs = [] - for batch in batches: - batch = np.moveaxis(batch, dim, np.arange(-num_dims, 0)) - psnr_input_shape = batch.shape[-num_dims:] - inputs.extend(batch.reshape(-1, *psnr_input_shape)) - return inputs - - -def _sk_psnr(preds, target, data_range, reduction, dim): - sk_preds_lists = _to_sk_peak_signal_noise_ratio_inputs(preds, dim=dim) - sk_target_lists = _to_sk_peak_signal_noise_ratio_inputs(target, dim=dim) - np_reduce_map = {"elementwise_mean": np.mean, "none": np.array, "sum": np.sum} - return np_reduce_map[reduction]([ - peak_signal_noise_ratio(sk_target, sk_preds, data_range=data_range) - for sk_target, sk_preds in zip(sk_target_lists, sk_preds_lists) - ]) - - -def _base_e_sk_psnr(preds, target, data_range, reduction, dim): - return _sk_psnr(preds, target, data_range, reduction, dim) * np.log(10) - - -@pytest.mark.parametrize( - "preds, target, data_range, reduction, dim", - [ - (_inputs[0].preds, _inputs[0].target, 10, "elementwise_mean", None), - (_inputs[1].preds, _inputs[1].target, 10, "elementwise_mean", None), - (_inputs[2].preds, _inputs[2].target, 5, "elementwise_mean", None), - (_inputs[2].preds, _inputs[2].target, 5, "elementwise_mean", 1), - (_inputs[2].preds, _inputs[2].target, 5, "elementwise_mean", (1, 2)), - (_inputs[2].preds, _inputs[2].target, 5, "sum", (1, 2)), - ], -) -@pytest.mark.parametrize( - "base, sk_metric", - [ - (10.0, _sk_psnr), - (2.718281828459045, _base_e_sk_psnr), - ], -) -class TestPSNR(MetricTester): - - @pytest.mark.parametrize("ddp", [True, False]) - @pytest.mark.parametrize("dist_sync_on_step", [True, False]) - def test_psnr(self, preds, target, data_range, base, reduction, dim, sk_metric, ddp, dist_sync_on_step): - _args = {"data_range": data_range, "base": base, "reduction": reduction, "dim": dim} - self.run_class_metric_test( - ddp, - preds, - target, - PSNR, - partial(sk_metric, data_range=data_range, reduction=reduction, dim=dim), - metric_args=_args, - dist_sync_on_step=dist_sync_on_step, - ) - - def test_psnr_functional(self, preds, target, sk_metric, data_range, base, reduction, dim): - _args = {"data_range": data_range, "base": base, "reduction": reduction, "dim": dim} - self.run_functional_metric_test( - preds, - target, - psnr, - partial(sk_metric, data_range=data_range, reduction=reduction, dim=dim), - metric_args=_args, - ) - - -@pytest.mark.parametrize("reduction", ["none", "sum"]) -def test_reduction_for_dim_none(reduction): - match = f"The `reduction={reduction}` will not have any effect when `dim` is None." 
- with pytest.warns(UserWarning, match=match): - PSNR(reduction=reduction, dim=None) - - with pytest.warns(UserWarning, match=match): - psnr(_inputs[0].preds, _inputs[0].target, reduction=reduction, dim=None) - - -def test_missing_data_range(): - with pytest.raises(ValueError): - PSNR(data_range=None, dim=0) - - with pytest.raises(ValueError): - psnr(_inputs[0].preds, _inputs[0].target, data_range=None, dim=0) diff --git a/tests/metrics/regression/test_r2score.py b/tests/metrics/regression/test_r2score.py deleted file mode 100644 index 232b003e6116a..0000000000000 --- a/tests/metrics/regression/test_r2score.py +++ /dev/null @@ -1,114 +0,0 @@ -from collections import namedtuple -from functools import partial - -import pytest -import torch -from sklearn.metrics import r2_score as sk_r2score - -from pytorch_lightning.metrics.functional import r2score -from pytorch_lightning.metrics.regression import R2Score -from tests.metrics.utils import BATCH_SIZE, MetricTester, NUM_BATCHES - -torch.manual_seed(42) - -num_targets = 5 - -Input = namedtuple('Input', ["preds", "target"]) - -_single_target_inputs = Input( - preds=torch.rand(NUM_BATCHES, BATCH_SIZE), - target=torch.rand(NUM_BATCHES, BATCH_SIZE), -) - -_multi_target_inputs = Input( - preds=torch.rand(NUM_BATCHES, BATCH_SIZE, num_targets), - target=torch.rand(NUM_BATCHES, BATCH_SIZE, num_targets), -) - - -def _single_target_sk_metric(preds, target, adjusted, multioutput): - sk_preds = preds.view(-1).numpy() - sk_target = target.view(-1).numpy() - r2_score = sk_r2score(sk_target, sk_preds, multioutput=multioutput) - if adjusted != 0: - r2_score = 1 - (1 - r2_score) * (sk_preds.shape[0] - 1) / (sk_preds.shape[0] - adjusted - 1) - return r2_score - - -def _multi_target_sk_metric(preds, target, adjusted, multioutput): - sk_preds = preds.view(-1, num_targets).numpy() - sk_target = target.view(-1, num_targets).numpy() - r2_score = sk_r2score(sk_target, sk_preds, multioutput=multioutput) - if adjusted != 0: - r2_score = 1 - (1 - r2_score) * (sk_preds.shape[0] - 1) / (sk_preds.shape[0] - adjusted - 1) - return r2_score - - -@pytest.mark.parametrize("adjusted", [0, 5, 10]) -@pytest.mark.parametrize("multioutput", ['raw_values', 'uniform_average', 'variance_weighted']) -@pytest.mark.parametrize( - "preds, target, sk_metric, num_outputs", - [ - (_single_target_inputs.preds, _single_target_inputs.target, _single_target_sk_metric, 1), - (_multi_target_inputs.preds, _multi_target_inputs.target, _multi_target_sk_metric, num_targets), - ], -) -class TestR2Score(MetricTester): - - @pytest.mark.parametrize("ddp", [True, False]) - @pytest.mark.parametrize("dist_sync_on_step", [True, False]) - def test_r2(self, adjusted, multioutput, preds, target, sk_metric, num_outputs, ddp, dist_sync_on_step): - self.run_class_metric_test( - ddp, - preds, - target, - R2Score, - partial(sk_metric, adjusted=adjusted, multioutput=multioutput), - dist_sync_on_step, - metric_args=dict(adjusted=adjusted, multioutput=multioutput, num_outputs=num_outputs), - ) - - def test_r2_functional(self, adjusted, multioutput, preds, target, sk_metric, num_outputs): - self.run_functional_metric_test( - preds, - target, - r2score, - partial(sk_metric, adjusted=adjusted, multioutput=multioutput), - metric_args=dict(adjusted=adjusted, multioutput=multioutput), - ) - - -def test_error_on_different_shape(metric_class=R2Score): - metric = metric_class() - with pytest.raises(RuntimeError, match='Predictions and targets are expected to have the same shape'): - metric(torch.randn(100, ), torch.randn(50, )) - 
- -def test_error_on_multidim_tensors(metric_class=R2Score): - metric = metric_class() - with pytest.raises( - ValueError, - match=r'Expected both prediction and target to be 1D or 2D tensors,' - r' but recevied tensors with dimension .' - ): - metric(torch.randn(10, 20, 5), torch.randn(10, 20, 5)) - - -def test_error_on_too_few_samples(metric_class=R2Score): - metric = metric_class() - with pytest.raises(ValueError, match='Needs atleast two samples to calculate r2 score.'): - metric(torch.randn(1, ), torch.randn(1, )) - - -def test_warning_on_too_large_adjusted(metric_class=R2Score): - metric = metric_class(adjusted=10) - - with pytest.warns( - UserWarning, - match="More independent regressions than datapoints in" - " adjusted r2 score. Falls back to standard r2 score." - ): - metric(torch.randn(10, ), torch.randn(10, )) - - with pytest.warns(UserWarning, match="Division by zero in adjusted r2 score. Falls back to" " standard r2 score."): - metric(torch.randn(11, ), torch.randn(11, )) diff --git a/tests/metrics/regression/test_ssim.py b/tests/metrics/regression/test_ssim.py deleted file mode 100644 index f7e4b7a58e001..0000000000000 --- a/tests/metrics/regression/test_ssim.py +++ /dev/null @@ -1,104 +0,0 @@ -from collections import namedtuple -from functools import partial - -import pytest -import torch -from skimage.metrics import structural_similarity - -from pytorch_lightning.metrics.functional import ssim -from pytorch_lightning.metrics.regression import SSIM -from tests.metrics.utils import BATCH_SIZE, MetricTester, NUM_BATCHES - -torch.manual_seed(42) - -Input = namedtuple('Input', ["preds", "target", "multichannel"]) - -_inputs = [] -for size, channel, coef, multichannel, dtype in [ - (12, 3, 0.9, True, torch.float), - (13, 1, 0.8, False, torch.float32), - (14, 1, 0.7, False, torch.double), - (15, 3, 0.6, True, torch.float64), -]: - preds = torch.rand(NUM_BATCHES, BATCH_SIZE, channel, size, size, dtype=dtype) - _inputs.append(Input( - preds=preds, - target=preds * coef, - multichannel=multichannel, - )) - - -def _sk_metric(preds, target, data_range, multichannel): - c, h, w = preds.shape[-3:] - sk_preds = preds.view(-1, c, h, w).permute(0, 2, 3, 1).numpy() - sk_target = target.view(-1, c, h, w).permute(0, 2, 3, 1).numpy() - if not multichannel: - sk_preds = sk_preds[:, :, :, 0] - sk_target = sk_target[:, :, :, 0] - - return structural_similarity( - sk_target, - sk_preds, - data_range=data_range, - multichannel=multichannel, - gaussian_weights=True, - win_size=11, - sigma=1.5, - use_sample_covariance=False - ) - - -@pytest.mark.parametrize( - "preds, target, multichannel", - [(i.preds, i.target, i.multichannel) for i in _inputs], -) -class TestSSIM(MetricTester): - atol = 6e-5 - - @pytest.mark.parametrize("ddp", [True, False]) - @pytest.mark.parametrize("dist_sync_on_step", [True, False]) - def test_ssim(self, preds, target, multichannel, ddp, dist_sync_on_step): - self.run_class_metric_test( - ddp, - preds, - target, - SSIM, - partial(_sk_metric, data_range=1.0, multichannel=multichannel), - metric_args={"data_range": 1.0}, - dist_sync_on_step=dist_sync_on_step, - ) - - def test_ssim_functional(self, preds, target, multichannel): - self.run_functional_metric_test( - preds, - target, - ssim, - partial(_sk_metric, data_range=1.0, multichannel=multichannel), - metric_args={"data_range": 1.0}, - ) - - -@pytest.mark.parametrize( - ['pred', 'target', 'kernel', 'sigma'], - [ - pytest.param([1, 16, 16], [1, 16, 16], [11, 11], [1.5, 1.5]), # len(shape) - pytest.param([1, 1, 16, 16], [1, 1, 
16, 16], [11, 11], [1.5]), # len(kernel), len(sigma) - pytest.param([1, 1, 16, 16], [1, 1, 16, 16], [11], [1.5, 1.5]), # len(kernel), len(sigma) - pytest.param([1, 1, 16, 16], [1, 1, 16, 16], [11], [1.5]), # len(kernel), len(sigma) - pytest.param([1, 1, 16, 16], [1, 1, 16, 16], [11, 0], [1.5, 1.5]), # invalid kernel input - pytest.param([1, 1, 16, 16], [1, 1, 16, 16], [11, 10], [1.5, 1.5]), # invalid kernel input - pytest.param([1, 1, 16, 16], [1, 1, 16, 16], [11, -11], [1.5, 1.5]), # invalid kernel input - pytest.param([1, 1, 16, 16], [1, 1, 16, 16], [11, 11], [1.5, 0]), # invalid sigma input - pytest.param([1, 1, 16, 16], [1, 1, 16, 16], [11, 0], [1.5, -1.5]), # invalid sigma input - ], -) -def test_ssim_invalid_inputs(pred, target, kernel, sigma): - pred_t = torch.rand(pred) - target_t = torch.rand(target, dtype=torch.float64) - with pytest.raises(TypeError): - ssim(pred_t, target_t) - - pred = torch.rand(pred) - target = torch.rand(target) - with pytest.raises(ValueError): - ssim(pred, target, kernel, sigma) diff --git a/tests/metrics/test_remove_1-5_metrics.py b/tests/metrics/test_remove_1-5_metrics.py index eaf17ec0792da..43dd330bcfcbe 100644 --- a/tests/metrics/test_remove_1-5_metrics.py +++ b/tests/metrics/test_remove_1-5_metrics.py @@ -33,8 +33,11 @@ MetricCollection, Precision, PrecisionRecallCurve, + PSNR, + R2Score, Recall, ROC, + SSIM, StatScores, ) from pytorch_lightning.metrics.functional import ( @@ -53,8 +56,11 @@ precision, precision_recall, precision_recall_curve, + psnr, + r2score, recall, roc, + ssim, stat_scores, ) from pytorch_lightning.metrics.functional.accuracy import accuracy @@ -290,3 +296,36 @@ def test_v1_5_metric_regress(): with pytest.deprecated_call(match='It will be removed in v1.5.0'): res = mean_squared_log_error(x, y) assert torch.allclose(res, torch.tensor(0.0207), atol=1e-4) + + PSNR.__init__.warned = False + with pytest.deprecated_call(match='It will be removed in v1.5.0'): + PSNR() + + R2Score.__init__.warned = False + with pytest.deprecated_call(match='It will be removed in v1.5.0'): + R2Score() + + SSIM.__init__.warned = False + with pytest.deprecated_call(match='It will be removed in v1.5.0'): + SSIM() + + preds = torch.tensor([[0.0, 1.0], [2.0, 3.0]]) + target = torch.tensor([[3.0, 2.0], [1.0, 0.0]]) + psnr.warned = False + with pytest.deprecated_call(match='It will be removed in v1.5.0'): + res = psnr(preds, target) + assert torch.allclose(res, torch.tensor(2.5527), atol=1e-4) + + target = torch.tensor([3, -0.5, 2, 7]) + preds = torch.tensor([2.5, 0.0, 2, 8]) + r2score.warned = False + with pytest.deprecated_call(match='It will be removed in v1.5.0'): + res = r2score(preds, target) + assert torch.allclose(res, torch.tensor(0.9486), atol=1e-4) + + preds = torch.rand([16, 1, 16, 16]) + target = preds * 0.75 + ssim.warned = False + with pytest.deprecated_call(match='It will be removed in v1.5.0'): + res = ssim(preds, target) + assert torch.allclose(res, torch.tensor(0.9219), atol=1e-4) From 36d180e53271295359c1ca7da1d222bbd2ed7940 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Tue, 23 Mar 2021 11:07:35 +0100 Subject: [PATCH 21/25] Refactor base profilers 3/5 (#6621) Co-authored-by: tchaton --- .gitignore | 2 +- CHANGELOG.md | 9 + pytorch_lightning/profiler/profilers.py | 270 +++++++++++++------ pytorch_lightning/profiler/pytorch.py | 76 ++---- pytorch_lightning/trainer/evaluation_loop.py | 5 + pytorch_lightning/trainer/training_loop.py | 4 +- tests/deprecated_api/test_remove_1-5.py | 10 + tests/test_profiler.py | 167 
++++++++---- tests/trainer/properties/test_get_model.py | 5 +- 9 files changed, 355 insertions(+), 193 deletions(-) diff --git a/.gitignore b/.gitignore index c007140257188..99939ff7fce0c 100644 --- a/.gitignore +++ b/.gitignore @@ -157,4 +157,4 @@ tags data MNIST runs -*traces* +*trace* diff --git a/CHANGELOG.md b/CHANGELOG.md index 4cf3e0f1fd326..32cf9122efe34 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -51,6 +51,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Added `Trainer.predict` config validation ([#6543](https://github.com/PyTorchLightning/pytorch-lightning/pull/6543)) +- Added `AbstractProfiler` interface ([#6621](https://github.com/PyTorchLightning/pytorch-lightning/pull/6621)) + + - Added `outputs` parameter to callback's `on_validation_epoch_end` & `on_test_epoch_end` hooks ([#6120](https://github.com/PyTorchLightning/pytorch-lightning/pull/6120)) @@ -68,6 +71,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Changed `setup()` and `teardown()` stage argument to take any of `{fit,validate,test,predict}` ([#6386](https://github.com/PyTorchLightning/pytorch-lightning/pull/6386)) +- Changed profilers to save separate report files per state and rank ([#6621](https://github.com/PyTorchLightning/pytorch-lightning/pull/6621)) + + ### Deprecated - `period` has been deprecated in favor of `every_n_val_epochs` in the `ModelCheckpoint` callback ([#6146](https://github.com/PyTorchLightning/pytorch-lightning/pull/6146)) @@ -76,6 +82,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Deprecated `trainer.running_sanity_check` in favor of `trainer.sanity_checking` ([#4945](https://github.com/PyTorchLightning/pytorch-lightning/pull/4945)) +- Deprecated `Profiler(output_filename)` in favor of `dirpath` and `filename` ([#6621](https://github.com/PyTorchLightning/pytorch-lightning/pull/6621)) + + - Deprecated metrics in favor of `torchmetrics` ([#6505](https://github.com/PyTorchLightning/pytorch-lightning/pull/6505), [#6530](https://github.com/PyTorchLightning/pytorch-lightning/pull/6530), diff --git a/pytorch_lightning/profiler/profilers.py b/pytorch_lightning/profiler/profilers.py index 5668fd6654b2f..54bc5cdf0122c 100644 --- a/pytorch_lightning/profiler/profilers.py +++ b/pytorch_lightning/profiler/profilers.py @@ -21,31 +21,19 @@ from abc import ABC, abstractmethod from collections import defaultdict from contextlib import contextmanager -from typing import Optional, Union +from pathlib import Path +from typing import Any, Callable, Dict, Optional, TextIO, Tuple, Union import numpy as np +from pytorch_lightning.utilities import rank_zero_warn from pytorch_lightning.utilities.cloud_io import get_filesystem log = logging.getLogger(__name__) -class BaseProfiler(ABC): - """ - If you wish to write a custom profiler, you should inhereit from this class. 
- """ - - def __init__(self, output_streams: Optional[Union[list, tuple]] = None): - """ - Args: - output_streams: callable - """ - if output_streams: - if not isinstance(output_streams, (list, tuple)): - output_streams = [output_streams] - else: - output_streams = [] - self.write_streams = output_streams +class AbstractProfiler(ABC): + """Specification of a profiler.""" @abstractmethod def start(self, action_name: str) -> None: @@ -55,23 +43,47 @@ def start(self, action_name: str) -> None: def stop(self, action_name: str) -> None: """Defines how to record the duration once an action is complete.""" - def setup( + @abstractmethod + def summary(self) -> str: + """Create profiler summary in text format.""" + + @abstractmethod + def setup(self, **kwargs: Any) -> None: + """Execute arbitrary pre-profiling set-up steps as defined by subclass.""" + + @abstractmethod + def teardown(self, **kwargs: Any) -> None: + """Execute arbitrary post-profiling tear-down steps as defined by subclass.""" + + +class BaseProfiler(AbstractProfiler): + """ + If you wish to write a custom profiler, you should inherit from this class. + """ + + def __init__( self, - stage: Optional[str] = None, - local_rank: Optional[int] = None, - log_dir: Optional[str] = None + dirpath: Optional[Union[str, Path]] = None, + filename: Optional[str] = None, + output_filename: Optional[str] = None, ) -> None: - """Execute arbitrary pre-profiling set-up steps.""" - self.stage = stage - self.local_rank = local_rank - self.log_dir = log_dir + self.dirpath = dirpath + self.filename = filename + if output_filename is not None: + rank_zero_warn( + "`Profiler` signature has changed in v1.3. The `output_filename` parameter has been removed in" + " favor of `dirpath` and `filename`. Support for the old signature will be removed in v1.5", + DeprecationWarning + ) + filepath = Path(output_filename) + self.dirpath = filepath.parent + self.filename = filepath.stem - def teardown(self, stage: Optional[str] = None) -> None: - """Execute arbitrary post-profiling tear-down steps.""" - self.stage = stage - if self.output_file: - self.output_file.close() - self.output_file = None + self._output_file: Optional[TextIO] = None + self._write_stream: Optional[Callable] = None + self._local_rank: Optional[int] = None + self._log_dir: Optional[str] = None + self._stage: Optional[str] = None @contextmanager def profile(self, action_name: str) -> None: @@ -104,19 +116,94 @@ def profile_iterable(self, iterable, action_name: str) -> None: self.stop(action_name) break + def _rank_zero_info(self, *args, **kwargs) -> None: + if self._local_rank in (None, 0): + log.info(*args, **kwargs) + + def _prepare_filename(self) -> str: + filename = "" + if self._stage is not None: + filename += f"{self._stage}-" + filename += str(self.filename) + if self._local_rank is not None: + filename += f"-{self.local_rank}" + filename += ".txt" + return filename + + def _prepare_streams(self) -> None: + if self._write_stream is not None: + return + if self.filename: + dirpath = self.dirpath or self._log_dir + filepath = os.path.join(dirpath, self._prepare_filename()) + fs = get_filesystem(filepath) + file = fs.open(filepath, "a") + self._output_file = file + self._write_stream = file.write + else: + self._write_stream = self._rank_zero_info + def describe(self) -> None: - """Logs a profile report after the conclusion of the training run.""" - for write in self.write_streams: - write(self.summary()) - if self.output_file is not None: - self.output_file.flush() + """Logs a profile report 
after the conclusion of run.""" + # there are pickling issues with open file handles in Python 3.6 + # so to avoid them, we open and close the files within this function + # by calling `_prepare_streams` and `teardown` + self._prepare_streams() + self._write_stream(self.summary()) + if self._output_file is not None: + self._output_file.flush() + self.teardown(stage=self._stage) + + def _stats_to_str(self, stats: Dict[str, str]) -> str: + stage = f"{self._stage.upper()} " if self._stage is not None else "" + output = [stage + "Profiler Report"] + for action, value in stats.items(): + header = f"Profile stats for: {action}" + if self._local_rank is not None: + header += f" rank: {self._local_rank}" + output.append(header) + output.append(value) + return os.linesep.join(output) + + def setup( + self, + stage: Optional[str] = None, + local_rank: Optional[int] = None, + log_dir: Optional[str] = None, + ) -> None: + """Execute arbitrary pre-profiling set-up steps.""" + self._stage = stage + self._local_rank = local_rank + self._log_dir = log_dir + if self.dirpath is None: + self.dirpath = self._log_dir + + def teardown(self, stage: Optional[str] = None) -> None: + """ + Execute arbitrary post-profiling tear-down steps. + + Closes the currently open file and stream. + """ + self._write_stream = None + if self._output_file is not None: + self._output_file.close() + self._output_file = None # can't pickle TextIOWrapper + + def __del__(self) -> None: + self.teardown(stage=self._stage) + + def start(self, action_name: str) -> None: + raise NotImplementedError + + def stop(self, action_name: str) -> None: + raise NotImplementedError - @abstractmethod def summary(self) -> str: - """Create profiler summary in text format.""" + raise NotImplementedError - def __del__(self): - self.teardown(None) + @property + def local_rank(self): + return '0' if self._local_rank is None else self._local_rank class PassThroughProfiler(BaseProfiler): @@ -125,10 +212,6 @@ class PassThroughProfiler(BaseProfiler): The Trainer uses this class by default. """ - def __init__(self): - self.output_file = None - super().__init__(output_streams=None) - def start(self, action_name: str) -> None: pass @@ -145,30 +228,32 @@ class SimpleProfiler(BaseProfiler): the mean duration of each action and the total time spent over the entire training run. """ - def __init__(self, output_filename: Optional[str] = None, extended=True): + def __init__( + self, + dirpath: Optional[Union[str, Path]] = None, + filename: Optional[str] = None, + extended: bool = True, + output_filename: Optional[str] = None, + ) -> None: """ Args: - output_filename: optionally save profile results to file instead of printing - to std out when training is finished. + dirpath: Directory path for the ``filename``. If ``dirpath`` is ``None`` but ``filename`` is present, the + ``trainer.log_dir`` (from :class:`~pytorch_lightning.loggers.tensorboard.TensorBoardLogger`) + will be used. + + filename: If present, filename where the profiler results will be saved instead of printing to stdout. + The ``.txt`` extension will be used automatically. Raises: ValueError: If you attempt to start an action which has already started, or if you attempt to stop recording an action which was never started. 
""" - self.current_actions = {} + super().__init__(dirpath=dirpath, filename=filename, output_filename=output_filename) + self.current_actions: Dict[str, float] = {} self.recorded_durations = defaultdict(list) self.extended = extended - - self.output_fname = output_filename - self.output_file = None - if self.output_fname: - fs = get_filesystem(self.output_fname) - self.output_file = fs.open(self.output_fname, "w") - - streaming_out = [self.output_file.write] if self.output_file else [log.info] self.start_time = time.monotonic() - super().__init__(output_streams=streaming_out) def start(self, action_name: str) -> None: if action_name in self.current_actions: @@ -183,14 +268,18 @@ def stop(self, action_name: str) -> None: duration = end_time - start_time self.recorded_durations[action_name].append(duration) - def make_report(self): + def _make_report(self) -> Tuple[list, float]: total_duration = time.monotonic() - self.start_time report = [[a, d, 100. * np.sum(d) / total_duration] for a, d in self.recorded_durations.items()] report.sort(key=lambda x: x[2], reverse=True) return report, total_duration def summary(self) -> str: - output_string = "\n\nProfiler Report\n" + sep = os.linesep + output_string = "" + if self._stage is not None: + output_string += f"{self._stage.upper()} " + output_string += f"Profiler Report{sep}" if self.extended: @@ -198,16 +287,16 @@ def summary(self) -> str: max_key = np.max([len(k) for k in self.recorded_durations.keys()]) def log_row(action, mean, num_calls, total, per): - row = f"{os.linesep}{action:<{max_key}s}\t| {mean:<15}\t|" + row = f"{sep}{action:<{max_key}s}\t| {mean:<15}\t|" row += f"{num_calls:<15}\t| {total:<15}\t| {per:<15}\t|" return row output_string += log_row("Action", "Mean duration (s)", "Num calls", "Total time (s)", "Percentage %") output_string_len = len(output_string) - output_string += f"{os.linesep}{'-' * output_string_len}" - report, total_duration = self.make_report() + output_string += f"{sep}{'-' * output_string_len}" + report, total_duration = self._make_report() output_string += log_row("Total", "-", "_", f"{total_duration:.5}", "100 %") - output_string += f"{os.linesep}{'-' * output_string_len}" + output_string += f"{sep}{'-' * output_string_len}" for action, durations, duration_per in report: output_string += log_row( action, @@ -219,14 +308,14 @@ def log_row(action, mean, num_calls, total, per): else: def log_row(action, mean, total): - return f"{os.linesep}{action:<20s}\t| {mean:<15}\t| {total:<15}" + return f"{sep}{action:<20s}\t| {mean:<15}\t| {total:<15}" output_string += log_row("Action", "Mean duration (s)", "Total time (s)") - output_string += f"{os.linesep}{'-' * 65}" + output_string += f"{sep}{'-' * 65}" for action, durations in self.recorded_durations.items(): output_string += log_row(action, f"{np.mean(durations):.5}", f"{np.sum(durations):.5}") - output_string += os.linesep + output_string += sep return output_string @@ -237,11 +326,22 @@ class AdvancedProfiler(BaseProfiler): verbose and you should only use this if you want very detailed reports. """ - def __init__(self, output_filename: Optional[str] = None, line_count_restriction: float = 1.0): + def __init__( + self, + dirpath: Optional[Union[str, Path]] = None, + filename: Optional[str] = None, + line_count_restriction: float = 1.0, + output_filename: Optional[str] = None, + ) -> None: """ Args: - output_filename: optionally save profile results to file instead of printing - to std out when training is finished. + dirpath: Directory path for the ``filename``. 
If ``dirpath`` is ``None`` but ``filename`` is present, the + ``trainer.log_dir`` (from :class:`~pytorch_lightning.loggers.tensorboard.TensorBoardLogger`) + will be used. + + filename: If present, filename where the profiler results will be saved instead of printing to stdout. + The ``.txt`` extension will be used automatically. + line_count_restriction: this can be used to limit the number of functions reported for each action. either an integer (to select a count of lines), or a decimal fraction between 0.0 and 1.0 inclusive (to select a percentage of lines) @@ -250,18 +350,10 @@ def __init__(self, output_filename: Optional[str] = None, line_count_restriction ValueError: If you attempt to stop recording an action which was never started. """ - self.profiled_actions = {} + super().__init__(dirpath=dirpath, filename=filename, output_filename=output_filename) + self.profiled_actions: Dict[str, cProfile.Profile] = {} self.line_count_restriction = line_count_restriction - self.output_fname = output_filename - self.output_file = None - if self.output_fname: - fs = get_filesystem(self.output_fname) - self.output_file = fs.open(self.output_fname, "w") - - streaming_out = [self.output_file.write] if self.output_file else [log.info] - super().__init__(output_streams=streaming_out) - def start(self, action_name: str) -> None: if action_name not in self.profiled_actions: self.profiled_actions[action_name] = cProfile.Profile() @@ -270,9 +362,7 @@ def start(self, action_name: str) -> None: def stop(self, action_name: str) -> None: pr = self.profiled_actions.get(action_name) if pr is None: - raise ValueError( # pragma: no-cover - f"Attempting to stop recording an action ({action_name}) which was never started." - ) + raise ValueError(f"Attempting to stop recording an action ({action_name}) which was never started.") pr.disable() def summary(self) -> str: @@ -282,10 +372,16 @@ def summary(self) -> str: ps = pstats.Stats(pr, stream=s).strip_dirs().sort_stats('cumulative') ps.print_stats(self.line_count_restriction) recorded_stats[action_name] = s.getvalue() + return self._stats_to_str(recorded_stats) - # log to standard out - output_string = f"{os.linesep}Profiler Report{os.linesep}" - for action, stats in recorded_stats.items(): - output_string += f"{os.linesep}Profile stats for: {action}{os.linesep}{stats}" + def teardown(self, stage: Optional[str] = None) -> None: + super().teardown(stage=stage) + self.profiled_actions = {} - return output_string + def __reduce__(self): + # avoids `TypeError: cannot pickle 'cProfile.Profile' object` + return ( + self.__class__, + tuple(), + dict(dirpath=self.dirpath, filename=self.filename, line_count_restriction=self.line_count_restriction), + ) diff --git a/pytorch_lightning/profiler/pytorch.py b/pytorch_lightning/profiler/pytorch.py index c35979fa918af..55b1c286789f4 100644 --- a/pytorch_lightning/profiler/pytorch.py +++ b/pytorch_lightning/profiler/pytorch.py @@ -16,13 +16,12 @@ import inspect import logging import os -from typing import List, Optional +from pathlib import Path +from typing import List, Optional, Union import torch from pytorch_lightning.profiler.profilers import BaseProfiler -from pytorch_lightning.utilities import rank_zero_only -from pytorch_lightning.utilities.cloud_io import get_filesystem from pytorch_lightning.utilities.distributed import rank_zero_warn from pytorch_lightning.utilities.exceptions import MisconfigurationException @@ -46,7 +45,8 @@ class PyTorchProfiler(BaseProfiler): def __init__( self, - output_filename: Optional[str] = 
None, + dirpath: Optional[Union[str, Path]] = None, + filename: Optional[str] = None, enabled: bool = True, use_cuda: bool = False, record_shapes: bool = False, @@ -61,18 +61,19 @@ def __init__( row_limit: int = 20, sort_by_key: Optional[str] = None, profiled_functions: Optional[List] = None, - local_rank: Optional[int] = None, + output_filename: Optional[str] = None, ): """ This profiler uses PyTorch's Autograd Profiler and lets you inspect the cost of different operators inside your model - both on the CPU and GPU Args: + dirpath: Directory path for the ``filename``. If ``dirpath`` is ``None`` but ``filename`` is present, the + ``trainer.log_dir`` (from :class:`~pytorch_lightning.loggers.tensorboard.TensorBoardLogger`) + will be used. - output_filename: optionally save profile results to file instead of printing - to std out when training is finished. When using ``ddp``, - each rank will stream the profiled operation to their own file - with the extension ``_{rank}.txt`` + filename: If present, filename where the profiler results will be saved instead of printing to stdout. + The ``.txt`` extension will be used automatically. enabled: Setting this to False makes this context manager a no-op. @@ -116,13 +117,9 @@ def __init__( profiled_functions: list of profiled functions which will create a context manager on. Any other will be pass through. - local_rank: When running in distributed setting, local_rank is used for each process - to write to their own file if `output_fname` is provided. - Raises: MisconfigurationException: - If arg ``sort_by_key`` is not present in ``AVAILABLE_SORT_KEYS``, or - if log file is not a ``.txt`` file. + If arg ``sort_by_key`` is not present in ``AVAILABLE_SORT_KEYS``. ValueError: If you attempt to stop recording an action which was never started. 
""" @@ -159,37 +156,20 @@ def __init__( self.running_stack = [] self.profiler = None - self.output_fname = output_filename - self.output_file = None - if local_rank is not None: - self.setup(local_rank=local_rank) - self.setup = super().setup + super().__init__(dirpath=dirpath, filename=filename, output_filename=output_filename) - def setup(self, stage: Optional[str] = None, local_rank: Optional[int] = None, log_dir: Optional[str] = None): + def setup( + self, + stage: Optional[str] = None, + local_rank: Optional[int] = None, + log_dir: Optional[str] = None + ) -> None: super().setup(stage=stage, local_rank=local_rank, log_dir=log_dir) - # when logging to `log.info`, only perform profiling on rank 0 - if local_rank != 0 and self.output_fname is None: - self.wrap_functions_into_rank_zero_only() - - if self.output_fname: - if local_rank is not None: - if '.txt' not in self.output_fname: - raise MisconfigurationException("Log file should be .txt file.") - - self.output_fname = self.output_fname.replace(".txt", f"_{self.local_rank}.txt") - - fs = get_filesystem(self.output_fname) - self.output_file = fs.open(self.output_fname, "w") - - streaming_out = [self.output_file.write] if self.output_file else [log.info] - super().__init__(output_streams=streaming_out) - - def wrap_functions_into_rank_zero_only(self): - self.start = rank_zero_only(self.start) - self.stop = rank_zero_only(self.stop) - self.summary = rank_zero_only(self.summary) - self.describe = rank_zero_only(self.describe) + # if the user didn't provide `path_to_export_trace`, + # set it as TensorBoardLogger log_dir if exists + if self.path_to_export_trace is None: + self.path_to_export_trace = log_dir def start(self, action_name: str) -> None: if action_name not in self.profiled_functions: @@ -231,6 +211,7 @@ def _stop(self, action_name: str) -> None: # when running ``emit_nvtx``, PyTorch requires 2 context manager. # The parent_profiler is being closed too. 
self._parent_profiler.__exit__(None, None, None) + self._parent_profiler = None return function_events = self.profiler.function_events @@ -258,7 +239,6 @@ def stop(self, action_name: str) -> None: def summary(self) -> str: recorded_stats = {} output_string = '' - local_rank = '0' if self.local_rank is None else self.local_rank if not self.enabled: return output_string @@ -271,7 +251,7 @@ def summary(self) -> str: function_events.populate_cpu_children = lambda: None if self.export_to_chrome: - filename = f"{action_name}_{local_rank}_trace.json" + filename = f"{action_name}_{self.local_rank}_trace.json" path_to_trace = filename if self.path_to_export_trace is None \ else os.path.join(self.path_to_export_trace, filename) function_events.export_chrome_trace(path_to_trace) @@ -283,10 +263,4 @@ def summary(self) -> str: data = function_events.key_averages(group_by_input_shapes=self.group_by_input_shapes) table = data.table(sort_by=self.sort_by_key, row_limit=self.row_limit) recorded_stats[action_name] = table - - # log to standard out - output_string = f"{os.linesep}Profiler Report{os.linesep}" - for action, stats in recorded_stats.items(): - output_string += (f"{os.linesep}Profile stats for: {action} rank: {local_rank} {os.linesep}{stats}") - - return output_string + return self._stats_to_str(recorded_stats) diff --git a/pytorch_lightning/trainer/evaluation_loop.py b/pytorch_lightning/trainer/evaluation_loop.py index 20c842939fe17..da41b9855b44a 100644 --- a/pytorch_lightning/trainer/evaluation_loop.py +++ b/pytorch_lightning/trainer/evaluation_loop.py @@ -15,6 +15,7 @@ import torch from pytorch_lightning.core.step_result import Result +from pytorch_lightning.trainer.states import TrainerState from pytorch_lightning.trainer.supporters import PredictionCollection from pytorch_lightning.utilities.apply_func import apply_to_collection from pytorch_lightning.utilities.model_helpers import is_overridden @@ -99,6 +100,10 @@ def on_evaluation_end(self, *args, **kwargs): else: self.trainer.call_hook('on_validation_end', *args, **kwargs) + if self.trainer.state != TrainerState.FITTING: + # summarize profile results + self.trainer.profiler.describe() + def reload_evaluation_dataloaders(self): model = self.trainer.lightning_module if self.trainer.testing: diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index 384a1b67a64f8..cc471f76b6033 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -137,9 +137,7 @@ def on_train_end(self): self.trainer.logger.finalize("success") # summarize profile results - # todo (tchaton) All ranks should call describe. 
- if self.trainer.global_rank == 0: - self.trainer.profiler.describe() + self.trainer.profiler.describe() # give accelerators a chance to finish self.trainer.accelerator.on_train_end() diff --git a/tests/deprecated_api/test_remove_1-5.py b/tests/deprecated_api/test_remove_1-5.py index f449a37e33c25..0c5f581d7775c 100644 --- a/tests/deprecated_api/test_remove_1-5.py +++ b/tests/deprecated_api/test_remove_1-5.py @@ -20,6 +20,7 @@ from pytorch_lightning import Callback, Trainer from pytorch_lightning.callbacks import ModelCheckpoint from pytorch_lightning.loggers import WandbLogger +from pytorch_lightning.profiler import BaseProfiler, SimpleProfiler, AdvancedProfiler, PyTorchProfiler from pytorch_lightning.trainer.callback_hook import warning_cache as callback_warning_cache from tests.deprecated_api import no_deprecated_call from tests.helpers import BoringModel @@ -203,3 +204,12 @@ def on_test_epoch_end(self, outputs): model = NewSignatureModel() with no_deprecated_call(match="`ModelHooks.on_test_epoch_end` signature has changed in v1.3."): trainer.test(model) + + +@pytest.mark.parametrize("cls", (BaseProfiler, SimpleProfiler, AdvancedProfiler, PyTorchProfiler)) +def test_v1_5_0_profiler_output_filename(tmpdir, cls): + filepath = str(tmpdir / "test.txt") + with pytest.deprecated_call(match="`output_filename` parameter has been removed"): + profiler = cls(output_filename=filepath) + assert profiler.dirpath == tmpdir + assert profiler.filename == "test" diff --git a/tests/test_profiler.py b/tests/test_profiler.py index cc4fff3b7ede4..cf6afcc9b626c 100644 --- a/tests/test_profiler.py +++ b/tests/test_profiler.py @@ -14,6 +14,7 @@ import logging import os import time +from copy import deepcopy from distutils.version import LooseVersion from pathlib import Path @@ -21,8 +22,7 @@ import pytest import torch -from pytorch_lightning import Trainer -from pytorch_lightning.callbacks import Callback +from pytorch_lightning import Callback, Trainer from pytorch_lightning.profiler import AdvancedProfiler, PyTorchProfiler, SimpleProfiler from tests.helpers import BoringModel from tests.helpers.runif import RunIf @@ -46,8 +46,7 @@ def _sleep_generator(durations): @pytest.fixture def simple_profiler(): - profiler = SimpleProfiler() - return profiler + return SimpleProfiler() @pytest.mark.parametrize(["action", "expected"], [ @@ -93,14 +92,6 @@ def test_simple_profiler_overhead(simple_profiler, n_iter=5): assert all(durations < PROFILER_OVERHEAD_MAX_TOLERANCE) -def test_simple_profiler_describe(caplog, simple_profiler): - """Ensure the profiler won't fail when reporting the summary.""" - with caplog.at_level(logging.INFO): - simple_profiler.describe() - - assert "Profiler Report" in caplog.text - - def test_simple_profiler_value_errors(simple_profiler): """Ensure errors are raised where expected.""" @@ -116,10 +107,75 @@ def test_simple_profiler_value_errors(simple_profiler): simple_profiler.stop(action) +def test_simple_profiler_deepcopy(tmpdir): + simple_profiler = SimpleProfiler(dirpath=tmpdir, filename="test") + simple_profiler.describe() + assert deepcopy(simple_profiler) + + +def test_simple_profiler_log_dir(tmpdir): + """Ensure the profiler dirpath defaults to `trainer.log_dir` when not present""" + profiler = SimpleProfiler(filename="profiler") + assert profiler._log_dir is None + + model = BoringModel() + trainer = Trainer( + default_root_dir=tmpdir, + max_epochs=1, + profiler=profiler, + ) + trainer.fit(model) + + expected = profiler.dirpath + assert trainer.log_dir == expected + assert 
profiler._log_dir == trainer.log_dir + assert Path(os.path.join(profiler.dirpath, "fit-profiler.txt")).exists() + + +@RunIf(skip_windows=True) +def test_simple_profiler_distributed_files(tmpdir): + """Ensure the proper files are saved in distributed""" + profiler = SimpleProfiler(dirpath=tmpdir, filename='profiler') + model = BoringModel() + trainer = Trainer( + default_root_dir=tmpdir, + fast_dev_run=2, + accelerator="ddp_cpu", + num_processes=2, + profiler=profiler, + logger=False, + ) + trainer.fit(model) + trainer.validate(model) + trainer.test(model) + + actual = set(os.listdir(profiler.dirpath)) + expected = {f"{stage}-profiler-{rank}.txt" for stage in ("fit", "validate", "test") for rank in (0, 1)} + assert actual == expected + + for f in profiler.dirpath.listdir(): + assert f.read_text('utf-8') + + +def test_simple_profiler_logs(tmpdir, caplog, simple_profiler): + """Ensure that the number of printed logs is correct""" + model = BoringModel() + trainer = Trainer( + default_root_dir=tmpdir, + fast_dev_run=2, + profiler=simple_profiler, + logger=False, + ) + with caplog.at_level(logging.INFO, logger="pytorch_lightning.profiler.profilers"): + trainer.fit(model) + trainer.test(model) + + assert caplog.text.count("Profiler Report") == 2 + + @pytest.fixture def advanced_profiler(tmpdir): - profiler = AdvancedProfiler(output_filename=os.path.join(tmpdir, "profiler.txt")) - return profiler + return AdvancedProfiler(dirpath=tmpdir, filename="profiler") @pytest.mark.parametrize(["action", "expected"], [ @@ -180,7 +236,8 @@ def test_advanced_profiler_describe(tmpdir, advanced_profiler): pass # log to stdout and print to file advanced_profiler.describe() - data = Path(advanced_profiler.output_fname).read_text() + path = advanced_profiler.dirpath / f"{advanced_profiler.filename}.txt" + data = path.read_text("utf-8") assert len(data) > 0 @@ -195,10 +252,14 @@ def test_advanced_profiler_value_errors(advanced_profiler): advanced_profiler.stop(action) +def test_advanced_profiler_deepcopy(advanced_profiler): + advanced_profiler.describe() + assert deepcopy(advanced_profiler) + + @pytest.fixture def pytorch_profiler(tmpdir): - profiler = PyTorchProfiler(output_filename=os.path.join(tmpdir, "profiler.txt"), local_rank=0) - return profiler + return PyTorchProfiler(dirpath=tmpdir, filename="profiler") def test_pytorch_profiler_describe(pytorch_profiler): @@ -208,7 +269,8 @@ def test_pytorch_profiler_describe(pytorch_profiler): # log to stdout and print to file pytorch_profiler.describe() - data = Path(pytorch_profiler.output_fname).read_text() + path = pytorch_profiler.dirpath / f"{pytorch_profiler.filename}.txt" + data = path.read_text("utf-8") assert len(data) > 0 @@ -223,47 +285,53 @@ def test_pytorch_profiler_value_errors(pytorch_profiler): pytorch_profiler.stop(action) -@RunIf(min_gpus=2, special=True) -@pytest.mark.parametrize("use_output_filename", [False, True]) -def test_pytorch_profiler_trainer_ddp(tmpdir, use_output_filename): - """Ensure that the profiler can be given to the training and default step are properly recorded. 
""" - - if use_output_filename: - output_filename = os.path.join(tmpdir, "profiler.txt") - else: - output_filename = None +@RunIf(min_torch="1.6.0") +def test_advanced_profiler_cprofile_deepcopy(tmpdir): + """Checks for pickle issue reported in #6522""" + model = BoringModel() + trainer = Trainer( + default_root_dir=tmpdir, + fast_dev_run=True, + profiler="advanced", + stochastic_weight_avg=True, + ) + trainer.fit(model) - profiler = PyTorchProfiler(output_filename=output_filename) +@RunIf(min_gpus=2, special=True) +def test_pytorch_profiler_trainer_ddp(tmpdir): + """Ensure that the profiler can be given to the training and default step are properly recorded. """ + pytorch_profiler = PyTorchProfiler(dirpath=None, filename="profiler") model = BoringModel() trainer = Trainer( - fast_dev_run=True, - profiler=profiler, + max_epochs=1, + default_root_dir=tmpdir, + limit_train_batches=2, + limit_val_batches=2, + profiler=pytorch_profiler, accelerator="ddp", gpus=2, ) trainer.fit(model) - enabled = use_output_filename or not use_output_filename and profiler.local_rank == 0 + assert len(pytorch_profiler.summary()) > 0 + assert set(pytorch_profiler.profiled_actions) == {'training_step_and_backward', 'validation_step'} - if enabled: - assert len(profiler.summary()) > 0 - assert set(profiler.profiled_actions.keys()) == {'training_step_and_backward', 'validation_step'} - else: - assert profiler.summary() is None - assert set(profiler.profiled_actions.keys()) == set() + files = sorted(f for f in os.listdir(pytorch_profiler.dirpath) if "fit" in f) + rank = int(os.getenv("LOCAL_RANK", "0")) + expected = f"fit-profiler-{rank}.txt" + assert files[rank] == expected - # todo (tchaton) add support for all ranks - if use_output_filename and os.getenv("LOCAL_RANK") == "0": - data = Path(profiler.output_fname).read_text() - assert len(data) > 0 + path = os.path.join(pytorch_profiler.dirpath, expected) + data = Path(path).read_text("utf-8") + assert len(data) > 0 def test_pytorch_profiler_nested(tmpdir): """Ensure that the profiler handles nested context""" pytorch_profiler = PyTorchProfiler( - profiled_functions=["a", "b", "c"], use_cuda=False, output_filename=os.path.join(tmpdir, "profiler.txt") + profiled_functions=["a", "b", "c"], use_cuda=False, dirpath=tmpdir, filename="profiler" ) with pytorch_profiler.profile("a"): @@ -327,13 +395,18 @@ def test_profiler_teardown(tmpdir, cls): class TestCallback(Callback): - def on_fit_end(self, trainer, pl_module) -> None: - assert trainer.profiler.output_file is not None - - profiler = cls(output_filename=os.path.join(tmpdir, "profiler.txt")) + def on_fit_end(self, trainer, *args, **kwargs) -> None: + # describe sets it to None + assert trainer.profiler._output_file is None + profiler = cls(dirpath=tmpdir, filename="profiler") model = BoringModel() trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True, profiler=profiler, callbacks=[TestCallback()]) trainer.fit(model) - assert profiler.output_file is None + assert profiler._output_file is None + + +def test_pytorch_profiler_deepcopy(pytorch_profiler): + pytorch_profiler.describe() + assert deepcopy(pytorch_profiler) diff --git a/tests/trainer/properties/test_get_model.py b/tests/trainer/properties/test_get_model.py index 4dc5b5f34b50c..3eb0596b55fc4 100644 --- a/tests/trainer/properties/test_get_model.py +++ b/tests/trainer/properties/test_get_model.py @@ -13,7 +13,6 @@ # limitations under the License. 
from pytorch_lightning import Trainer -from tests.accelerators import DDPLauncher from tests.helpers.boring_model import BoringModel from tests.helpers.runif import RunIf @@ -84,8 +83,7 @@ def test_get_model_gpu(tmpdir): @RunIf(min_gpus=1, skip_windows=True) -@DDPLauncher.run("--accelerator [accelerator]", max_epochs=["1"], accelerator=["ddp", "ddp_spawn"]) -def test_get_model_ddp_gpu(tmpdir, args=None): +def test_get_model_ddp_gpu(tmpdir): """ Tests that `trainer.lightning_module` extracts the model correctly when using GPU + ddp accelerators """ @@ -99,7 +97,6 @@ def test_get_model_ddp_gpu(tmpdir, args=None): limit_val_batches=2, max_epochs=1, gpus=1, - accelerator=args.accelerator ) trainer.fit(model) return 1 From a74909affa0535da02e64b94f6d5f9b2da03c08f Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Tue, 23 Mar 2021 16:05:32 +0100 Subject: [PATCH 22/25] prune metrics: info retrieval (#6649) --- CHANGELOG.md | 2 - pytorch_lightning/metrics/__init__.py | 1 - .../metrics/functional/__init__.py | 1 - .../functional/ir_average_precision.py | 54 ------- .../metrics/retrieval/__init__.py | 15 -- .../retrieval/mean_average_precision.py | 61 -------- .../metrics/retrieval/retrieval_metric.py | 140 ------------------ tests/metrics/functional/test_retrieval.py | 36 ----- tests/metrics/retrieval/__init__.py | 0 tests/metrics/retrieval/test_map.py | 119 --------------- 10 files changed, 429 deletions(-) delete mode 100644 pytorch_lightning/metrics/functional/ir_average_precision.py delete mode 100644 pytorch_lightning/metrics/retrieval/__init__.py delete mode 100644 pytorch_lightning/metrics/retrieval/mean_average_precision.py delete mode 100644 pytorch_lightning/metrics/retrieval/retrieval_metric.py delete mode 100644 tests/metrics/functional/test_retrieval.py delete mode 100644 tests/metrics/retrieval/__init__.py delete mode 100644 tests/metrics/retrieval/test_map.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 32cf9122efe34..81bfa85cc073f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,8 +9,6 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
### Added -- Added `RetrievalMAP` metric, the corresponding functional version `retrieval_average_precision` and a generic superclass for retrieval metrics `RetrievalMetric` ([#5032](https://github.com/PyTorchLightning/pytorch-lightning/pull/5032)) - - Added a way to print to terminal without breaking up the progress bar ([#5470](https://github.com/PyTorchLightning/pytorch-lightning/pull/5470)) diff --git a/pytorch_lightning/metrics/__init__.py b/pytorch_lightning/metrics/__init__.py index 500689f3182fb..1da24737a3752 100644 --- a/pytorch_lightning/metrics/__init__.py +++ b/pytorch_lightning/metrics/__init__.py @@ -39,7 +39,6 @@ R2Score, SSIM, ) -from pytorch_lightning.metrics.retrieval import RetrievalMAP # noqa: F401 warn( "`pytorch_lightning.metrics.*` module has been renamed to `torchmetrics.*` and split off to its own package" diff --git a/pytorch_lightning/metrics/functional/__init__.py b/pytorch_lightning/metrics/functional/__init__.py index 1701389cd1c64..3b31dad5d3411 100644 --- a/pytorch_lightning/metrics/functional/__init__.py +++ b/pytorch_lightning/metrics/functional/__init__.py @@ -28,7 +28,6 @@ from pytorch_lightning.metrics.functional.hamming_distance import hamming_distance # noqa: F401 from pytorch_lightning.metrics.functional.image_gradients import image_gradients # noqa: F401 from pytorch_lightning.metrics.functional.iou import iou # noqa: F401 -from pytorch_lightning.metrics.functional.ir_average_precision import retrieval_average_precision # noqa: F401 from pytorch_lightning.metrics.functional.mean_absolute_error import mean_absolute_error # noqa: F401 from pytorch_lightning.metrics.functional.mean_squared_error import mean_squared_error # noqa: F401 from pytorch_lightning.metrics.functional.mean_squared_log_error import mean_squared_log_error # noqa: F401 diff --git a/pytorch_lightning/metrics/functional/ir_average_precision.py b/pytorch_lightning/metrics/functional/ir_average_precision.py deleted file mode 100644 index 83b14a21c5553..0000000000000 --- a/pytorch_lightning/metrics/functional/ir_average_precision.py +++ /dev/null @@ -1,54 +0,0 @@ -# Copyright The PyTorch Lightning team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import torch - - -def retrieval_average_precision(preds: torch.Tensor, target: torch.Tensor) -> torch.Tensor: - r""" - Computes average precision (for information retrieval), as explained - `here `_. - - `preds` and `target` should be of the same shape and live on the same device. If no `target` is ``True``, - 0 is returned. Target must be of type `bool` or `int`, otherwise an error is raised. - - Args: - preds: estimated probabilities of each document to be relevant. - target: ground truth about each document being relevant or not. Requires `bool` or `int` tensor. - - Return: - a single-value tensor with the average precision (AP) of the predictions `preds` wrt the labels `target`. 
- - Example: - >>> preds = torch.tensor([0.2, 0.3, 0.5]) - >>> target = torch.tensor([True, False, True]) - >>> retrieval_average_precision(preds, target) - tensor(0.8333) - """ - - if preds.shape != target.shape or preds.device != target.device: - raise ValueError("`preds` and `target` must have the same shape and live on the same device") - - if target.dtype not in (torch.bool, torch.int16, torch.int32, torch.int64): - raise ValueError("`target` must be a tensor of booleans or integers") - - if target.dtype is not torch.bool: - target = target.bool() - - if target.sum() == 0: - return torch.tensor(0, device=preds.device) - - target = target[torch.argsort(preds, dim=-1, descending=True)] - positions = torch.arange(1, len(target) + 1, device=target.device, dtype=torch.float32)[target > 0] - res = torch.div((torch.arange(len(positions), device=positions.device, dtype=torch.float32) + 1), positions).mean() - return res diff --git a/pytorch_lightning/metrics/retrieval/__init__.py b/pytorch_lightning/metrics/retrieval/__init__.py deleted file mode 100644 index c5c12b3b6643c..0000000000000 --- a/pytorch_lightning/metrics/retrieval/__init__.py +++ /dev/null @@ -1,15 +0,0 @@ -# Copyright The PyTorch Lightning team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from pytorch_lightning.metrics.retrieval.mean_average_precision import RetrievalMAP # noqa: F401 -from pytorch_lightning.metrics.retrieval.retrieval_metric import RetrievalMetric # noqa: F401 diff --git a/pytorch_lightning/metrics/retrieval/mean_average_precision.py b/pytorch_lightning/metrics/retrieval/mean_average_precision.py deleted file mode 100644 index 956a53cca2e77..0000000000000 --- a/pytorch_lightning/metrics/retrieval/mean_average_precision.py +++ /dev/null @@ -1,61 +0,0 @@ -import torch - -from pytorch_lightning.metrics.functional.ir_average_precision import retrieval_average_precision -from pytorch_lightning.metrics.retrieval.retrieval_metric import RetrievalMetric - - -class RetrievalMAP(RetrievalMetric): - r""" - Computes `Mean Average Precision - `_. - - Works with binary data. Accepts integer or float predictions from a model output. - - Forward accepts - - ``indexes`` (long tensor): ``(N, ...)`` - - ``preds`` (float tensor): ``(N, ...)`` - - ``target`` (long or bool tensor): ``(N, ...)`` - - `indexes`, `preds` and `target` must have the same dimension. - `indexes` indicate to which query a prediction belongs. - Predictions will be first grouped by indexes and then MAP will be computed as the mean - of the Average Precisions over each query. - - Args: - query_without_relevant_docs: - Specify what to do with queries that do not have at least a positive target. Choose from: - - - ``'skip'``: skip those queries (default); if all queries are skipped, ``0.0`` is returned - - ``'error'``: raise a ``ValueError`` - - ``'pos'``: score on those queries is counted as ``1.0`` - - ``'neg'``: score on those queries is counted as ``0.0`` - exclude: - Do not take into account predictions where the target is equal to this value. 
default `-100` - compute_on_step: - Forward only calls ``update()`` and return None if this is set to False. default: True - dist_sync_on_step: - Synchronize metric state across processes at each ``forward()`` - before returning the value at the step. default: False - process_group: - Specify the process group on which synchronization is called. default: None (which selects - the entire world) - dist_sync_fn: - Callback that performs the allgather operation on the metric state. When `None`, DDP - will be used to perform the allgather. default: None - - Example: - >>> from pytorch_lightning.metrics import RetrievalMAP - >>> indexes = torch.tensor([0, 0, 0, 1, 1, 1, 1]) - >>> preds = torch.tensor([0.2, 0.3, 0.5, 0.1, 0.3, 0.5, 0.2]) - >>> target = torch.tensor([False, False, True, False, True, False, False]) - - >>> map = RetrievalMAP() - >>> map(indexes, preds, target) - tensor(0.7500) - >>> map.compute() - tensor(0.7500) - """ - - def _metric(self, preds: torch.Tensor, target: torch.Tensor) -> torch.Tensor: - valid_indexes = target != self.exclude - return retrieval_average_precision(preds[valid_indexes], target[valid_indexes]) diff --git a/pytorch_lightning/metrics/retrieval/retrieval_metric.py b/pytorch_lightning/metrics/retrieval/retrieval_metric.py deleted file mode 100644 index 6f9088d00083c..0000000000000 --- a/pytorch_lightning/metrics/retrieval/retrieval_metric.py +++ /dev/null @@ -1,140 +0,0 @@ -from abc import ABC, abstractmethod -from typing import Any, Callable, Optional - -import torch -from torchmetrics import Metric - -from pytorch_lightning.metrics.utils import get_group_indexes - -#: get_group_indexes is used to group predictions belonging to the same query - -IGNORE_IDX = -100 - - -class RetrievalMetric(Metric, ABC): - r""" - Works with binary data. Accepts integer or float predictions from a model output. - - Forward accepts - - ``indexes`` (long tensor): ``(N, ...)`` - - ``preds`` (float or int tensor): ``(N, ...)`` - - ``target`` (long or bool tensor): ``(N, ...)`` - - `indexes`, `preds` and `target` must have the same dimension and will be flatten - to single dimension once provided. - - `indexes` indicate to which query a prediction belongs. - Predictions will be first grouped by indexes. Then the - real metric, defined by overriding the `_metric` method, - will be computed as the mean of the scores over each query. - - Args: - query_without_relevant_docs: - Specify what to do with queries that do not have at least a positive target. Choose from: - - - ``'skip'``: skip those queries (default); if all queries are skipped, ``0.0`` is returned - - ``'error'``: raise a ``ValueError`` - - ``'pos'``: score on those queries is counted as ``1.0`` - - ``'neg'``: score on those queries is counted as ``0.0`` - exclude: - Do not take into account predictions where the target is equal to this value. default `-100` - compute_on_step: - Forward only calls ``update()`` and return None if this is set to False. default: True - dist_sync_on_step: - Synchronize metric state across processes at each ``forward()`` - before returning the value at the step. default: False - process_group: - Specify the process group on which synchronization is called. default: None (which selects - the entire world) - dist_sync_fn: - Callback that performs the allgather operation on the metric state. When `None`, DDP - will be used to perform the allgather. 
default: None - - """ - - def __init__( - self, - query_without_relevant_docs: str = 'skip', - exclude: int = IGNORE_IDX, - compute_on_step: bool = True, - dist_sync_on_step: bool = False, - process_group: Optional[Any] = None, - dist_sync_fn: Callable = None - ): - super().__init__( - compute_on_step=compute_on_step, - dist_sync_on_step=dist_sync_on_step, - process_group=process_group, - dist_sync_fn=dist_sync_fn - ) - - query_without_relevant_docs_options = ('error', 'skip', 'pos', 'neg') - if query_without_relevant_docs not in query_without_relevant_docs_options: - raise ValueError( - f"`query_without_relevant_docs` received a wrong value {query_without_relevant_docs}. " - f"Allowed values are {query_without_relevant_docs_options}" - ) - - self.query_without_relevant_docs = query_without_relevant_docs - self.exclude = exclude - - self.add_state("idx", default=[], dist_reduce_fx=None) - self.add_state("preds", default=[], dist_reduce_fx=None) - self.add_state("target", default=[], dist_reduce_fx=None) - - def update(self, idx: torch.Tensor, preds: torch.Tensor, target: torch.Tensor) -> None: - if not (idx.shape == target.shape == preds.shape): - raise ValueError("`idx`, `preds` and `target` must be of the same shape") - - idx = idx.to(dtype=torch.int64).flatten() - preds = preds.to(dtype=torch.float32).flatten() - target = target.to(dtype=torch.int64).flatten() - - self.idx.append(idx) - self.preds.append(preds) - self.target.append(target) - - def compute(self) -> torch.Tensor: - r""" - First concat state `idx`, `preds` and `target` since they were stored as lists. After that, - compute list of groups that will help in keeping together predictions about the same query. - Finally, for each group compute the `_metric` if the number of positive targets is at least - 1, otherwise behave as specified by `self.query_without_relevant_docs`. - """ - - idx = torch.cat(self.idx, dim=0) - preds = torch.cat(self.preds, dim=0) - target = torch.cat(self.target, dim=0) - - res = [] - kwargs = {'device': idx.device, 'dtype': torch.float32} - - groups = get_group_indexes(idx) - for group in groups: - - mini_preds = preds[group] - mini_target = target[group] - - if not mini_target.sum(): - if self.query_without_relevant_docs == 'error': - raise ValueError( - f"`{self.__class__.__name__}.compute()` was provided with " - f"a query without positive targets, indexes: {group}" - ) - if self.query_without_relevant_docs == 'pos': - res.append(torch.tensor(1.0, **kwargs)) - elif self.query_without_relevant_docs == 'neg': - res.append(torch.tensor(0.0, **kwargs)) - else: - res.append(self._metric(mini_preds, mini_target)) - - if len(res) > 0: - return torch.stack(res).mean() - return torch.tensor(0.0, **kwargs) - - @abstractmethod - def _metric(self, preds: torch.Tensor, target: torch.Tensor) -> torch.Tensor: - r""" - Compute a metric over a predictions and target of a single group. - This method should be overridden by subclasses. 
- """ diff --git a/tests/metrics/functional/test_retrieval.py b/tests/metrics/functional/test_retrieval.py deleted file mode 100644 index a0573cba1d27e..0000000000000 --- a/tests/metrics/functional/test_retrieval.py +++ /dev/null @@ -1,36 +0,0 @@ -import math - -import numpy as np -import pytest -import torch -from sklearn.metrics import average_precision_score as sk_average_precision - -from pytorch_lightning import seed_everything -from pytorch_lightning.metrics.functional.ir_average_precision import retrieval_average_precision - - -@pytest.mark.parametrize(['sklearn_metric', 'torch_metric'], [ - pytest.param(sk_average_precision, retrieval_average_precision), -]) -def test_against_sklearn(sklearn_metric, torch_metric): - """Compare PL metrics to sklearn version. """ - device = 'cuda' if torch.cuda.is_available() else 'cpu' - seed_everything(0) - - rounds = 25 - sizes = [1, 4, 10, 100] - - for size in sizes: - for _ in range(rounds): - a = np.random.randn(size) - b = np.random.randn(size) > 0 - - sk = torch.tensor(sklearn_metric(b, a), device=device) - pl = torch_metric(torch.tensor(a, device=device), torch.tensor(b, device=device)) - - # `torch_metric`s return 0 when no label is True - # while `sklearn.average_precision_score` returns NaN - if math.isnan(sk): - assert pl == 0 - else: - assert torch.allclose(sk.float(), pl.float()) diff --git a/tests/metrics/retrieval/__init__.py b/tests/metrics/retrieval/__init__.py deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/tests/metrics/retrieval/test_map.py b/tests/metrics/retrieval/test_map.py deleted file mode 100644 index fe43f19b20eb6..0000000000000 --- a/tests/metrics/retrieval/test_map.py +++ /dev/null @@ -1,119 +0,0 @@ -import math -import random -from typing import Callable, List - -import numpy as np -import pytest -import torch -from sklearn.metrics import average_precision_score as sk_average_precision -from torchmetrics import Metric - -from pytorch_lightning import seed_everything -from pytorch_lightning.metrics.retrieval.mean_average_precision import RetrievalMAP - - -@pytest.mark.parametrize(['sklearn_metric', 'torch_class_metric'], [ - [sk_average_precision, RetrievalMAP], -]) -def test_against_sklearn(sklearn_metric: Callable, torch_class_metric: Metric) -> None: - """Compare PL metrics to sklearn version. """ - device = 'cuda' if torch.cuda.is_available() else 'cpu' - seed_everything(0) - - rounds = 20 - sizes = [1, 4, 10, 100] - batch_sizes = [1, 4, 10] - query_without_relevant_docs_options = ['skip', 'pos', 'neg'] - - def compute_sklearn_metric(target: List[np.ndarray], preds: List[np.ndarray], behaviour: str) -> torch.Tensor: - """ Compute sk metric with multiple iterations using the base `sklearn_metric`. """ - sk_results = [] - kwargs = {'device': device, 'dtype': torch.float32} - - for b, a in zip(target, preds): - res = sklearn_metric(b, a) - - if math.isnan(res): - if behaviour == 'skip': - pass - elif behaviour == 'pos': - sk_results.append(torch.tensor(1.0, **kwargs)) - else: - sk_results.append(torch.tensor(0.0, **kwargs)) - else: - sk_results.append(torch.tensor(res, **kwargs)) - if len(sk_results) > 0: - sk_results = torch.stack(sk_results).mean() - else: - sk_results = torch.tensor(0.0, **kwargs) - - return sk_results - - def do_test(batch_size: int, size: int) -> None: - """ For each possible behaviour of the metric, check results are correct. 
""" - for behaviour in query_without_relevant_docs_options: - metric = torch_class_metric(query_without_relevant_docs=behaviour) - shape = (size, ) - - indexes = [] - preds = [] - target = [] - - for i in range(batch_size): - indexes.append(np.ones(shape, dtype=int) * i) - preds.append(np.random.randn(*shape)) - target.append(np.random.randn(*shape) > 0) - - sk_results = compute_sklearn_metric(target, preds, behaviour) - - indexes_tensor = torch.cat([torch.tensor(i) for i in indexes]) - preds_tensor = torch.cat([torch.tensor(p) for p in preds]) - target_tensor = torch.cat([torch.tensor(t) for t in target]) - - # lets assume data are not ordered - perm = torch.randperm(indexes_tensor.nelement()) - indexes_tensor = indexes_tensor.view(-1)[perm].view(indexes_tensor.size()) - preds_tensor = preds_tensor.view(-1)[perm].view(preds_tensor.size()) - target_tensor = target_tensor.view(-1)[perm].view(target_tensor.size()) - - # shuffle ids to require also sorting of documents ability from the lightning metric - pl_result = metric(indexes_tensor, preds_tensor, target_tensor) - - assert torch.allclose(sk_results.float(), pl_result.float(), equal_nan=True) - - for batch_size in batch_sizes: - for size in sizes: - for _ in range(rounds): - do_test(batch_size, size) - - -@pytest.mark.parametrize(['torch_class_metric'], [ - [RetrievalMAP], -]) -def test_input_data(torch_class_metric: Metric) -> None: - """Check PL metrics inputs are controlled correctly. """ - - device = 'cuda' if torch.cuda.is_available() else 'cpu' - seed_everything(0) - - for _ in range(10): - - length = random.randint(0, 20) - - # check error when `query_without_relevant_docs='error'` is raised correctly - indexes = torch.tensor([0] * length, device=device, dtype=torch.int64) - preds = torch.rand(size=(length, ), device=device, dtype=torch.float32) - target = torch.tensor([False] * length, device=device, dtype=torch.bool) - - metric = torch_class_metric(query_without_relevant_docs='error') - - try: - metric(indexes, preds, target) - except Exception as e: - assert isinstance(e, ValueError) - - # check ValueError with non-accepted argument - try: - metric = torch_class_metric(query_without_relevant_docs='casual_argument') - except Exception as e: - assert isinstance(e, ValueError) From 0995d30fab0590d155895a77535663794118b5f6 Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Tue, 23 Mar 2021 15:13:13 +0000 Subject: [PATCH 23/25] Flash predict step (#6577) * add predict_step * Update predict_loop.py * Update trainer.py * Update trainer.py * resolve bugs * update * update * update * resolve bug * resolve some failing tests * udpate tests * update * resolve tests * add a test * remove typo * add a test for attachement * update * changed to on_train_dataloader * remove __flash_special_attr__ * resolve tests * update * update * update * update on comments * Update pytorch_lightning/trainer/data_loading.py Co-authored-by: Jirka Borovec Co-authored-by: Justus Schock <12886177+justusschock@users.noreply.github.com> Co-authored-by: Jirka Borovec --- docs/source/starter/introduction_guide.rst | 6 +- pytorch_lightning/accelerators/accelerator.py | 11 ++- pytorch_lightning/core/hooks.py | 24 +++++++ pytorch_lightning/core/lightning.py | 2 +- pytorch_lightning/overrides/base.py | 2 +- .../plugins/training_type/ddp.py | 2 +- .../plugins/training_type/ddp_spawn.py | 2 +- pytorch_lightning/plugins/training_type/dp.py | 2 +- .../plugins/training_type/tpu_spawn.py | 4 +- .../training_type/training_type_plugin.py | 4 +- 
.../trainer/connectors/data_connector.py | 4 ++ pytorch_lightning/trainer/data_loading.py | 15 ++-- pytorch_lightning/trainer/predict_loop.py | 12 +++- pytorch_lightning/trainer/trainer.py | 8 ++- tests/overrides/test_data_parallel.py | 2 +- tests/trainer/test_dataloaders.py | 68 +++++++++++++++++++ tests/trainer/test_trainer.py | 36 +++++++++- 17 files changed, 174 insertions(+), 30 deletions(-) diff --git a/docs/source/starter/introduction_guide.rst b/docs/source/starter/introduction_guide.rst index c65894367a39e..551b8182caa7d 100644 --- a/docs/source/starter/introduction_guide.rst +++ b/docs/source/starter/introduction_guide.rst @@ -882,8 +882,8 @@ Or maybe we have a model that we use to do generation generated_imgs = model(z) -To perform inference at scale, it is possible to use ``trainer.predict`` with LightningModule ``predict`` function -By default, LightningModule ``predict`` calls forward, but it can be overriden to add any processing logic. +To perform inference at scale, it is possible to use ``trainer.predict`` with the LightningModule ``predict_step`` function. +By default, LightningModule ``predict_step`` calls forward, but it can be overridden to add any processing logic. .. code-block:: python @@ -893,7 +893,7 @@ By default, LightningModule ``predict`` calls forward, but it can be overriden t imgs = self.decoder(z) return imgs - def predict(self, batch, batch_idx: int , dataloader_idx: int = None): + def predict_step(self, batch, batch_idx: int , dataloader_idx: int = None): return self(batch) diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py index 60e6ea88b4250..9ea2cec491d2c 100644 --- a/pytorch_lightning/accelerators/accelerator.py +++ b/pytorch_lightning/accelerators/accelerator.py @@ -219,7 +219,7 @@ def test_step(self, args: List[Union[Any, int]]) -> _STEP_OUTPUT_TYPE: with self.precision_plugin.test_step_context(), self.training_type_plugin.test_step_context(): return self.training_type_plugin.test_step(*args) - def predict(self, args: List[Union[Any, int]]) -> _STEP_OUTPUT_TYPE: + def predict_step(self, args: List[Union[Any, int]]) -> _STEP_OUTPUT_TYPE: """The actual predict step. Args: @@ -235,7 +235,7 @@ def predict(self, args: List[Union[Any, int]]) -> _STEP_OUTPUT_TYPE: args[0] = batch with self.precision_plugin.predict_context(), self.training_type_plugin.predict_context(): - return self.training_type_plugin.predict(*args) + return self.training_type_plugin.predict_step(*args) def training_step_end(self, output: _STEP_OUTPUT_TYPE) -> _STEP_OUTPUT_TYPE: """A hook to do something at the end of the training step @@ -359,7 +359,12 @@ def setup_precision_plugin(self, plugin: PrecisionPlugin) -> None: def to_device(self, batch: Any) -> Any: """Pushes the batch to the root device""" - return self.batch_to_device(batch, self.root_device) + # Todo (tchaton) Better fix + is_dict = isinstance(batch, dict) + if is_dict: + batch = [batch] + batch = self.batch_to_device(batch, self.root_device) + return batch[0] if is_dict else batch @property def amp_backend(self) -> Optional[LightningEnum]: diff --git a/pytorch_lightning/core/hooks.py b/pytorch_lightning/core/hooks.py index 9624f94652713..8c68cc96eabc2 100644 --- a/pytorch_lightning/core/hooks.py +++ b/pytorch_lightning/core/hooks.py @@ -282,6 +282,18 @@ def on_test_end(self) -> None: """ # do something at the end of testing + def on_predict_start(self) -> None: + """ + Called at the beginning of predicting.
+ """ + # do something at the start of predicting + + def on_predict_end(self) -> None: + """ + Called at the end of predicting. + """ + # do something at the end of predicting + def on_before_zero_grad(self, optimizer: Optimizer) -> None: """ Called after optimizer.step() and before optimizer.zero_grad(). @@ -594,6 +606,18 @@ def predict_dataloader(self) -> Union[DataLoader, List[DataLoader]]: will have an argument ``dataloader_idx`` which matches the order here. """ + def on_train_dataloader(self) -> None: + """Called before requesting the train dataloader.""" + + def on_val_dataloader(self) -> None: + """Called before requesting the val dataloader.""" + + def on_test_dataloader(self) -> None: + """Called before requesting the test dataloader.""" + + def on_predict_dataloader(self) -> None: + """Called before requesting the predict dataloader.""" + def transfer_batch_to_device(self, batch: Any, device: Optional[torch.device] = None) -> Any: """ Override this hook if your :class:`~torch.utils.data.DataLoader` returns tensors diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py index 044dd95f3b8c6..4d36fe48448dc 100644 --- a/pytorch_lightning/core/lightning.py +++ b/pytorch_lightning/core/lightning.py @@ -1054,7 +1054,7 @@ def test_epoch_end(self, outputs): self.log('final_metric', final_value) """ - def predict(self, batch: Any, batch_idx: int, dataloader_idx: Optional[int] = None): + def predict_step(self, batch: Any, batch_idx: int, dataloader_idx: Optional[int] = None): """ Use this function with trainer.predict(...). Override if you need to add any processing logic. """ diff --git a/pytorch_lightning/overrides/base.py b/pytorch_lightning/overrides/base.py index 1d6f4e93b5779..0c1ac7b359fd0 100644 --- a/pytorch_lightning/overrides/base.py +++ b/pytorch_lightning/overrides/base.py @@ -53,7 +53,7 @@ def forward(self, *inputs, **kwargs): elif trainer and (trainer.sanity_checking or trainer.validating): output = self.module.validation_step(*inputs, **kwargs) elif trainer and trainer.predicting: - output = self.module.predict(*inputs, **kwargs) + output = self.module.predict_step(*inputs, **kwargs) else: output = self.module(*inputs, **kwargs) diff --git a/pytorch_lightning/plugins/training_type/ddp.py b/pytorch_lightning/plugins/training_type/ddp.py index bcadf16607b4f..58e26e7db32d8 100644 --- a/pytorch_lightning/plugins/training_type/ddp.py +++ b/pytorch_lightning/plugins/training_type/ddp.py @@ -298,7 +298,7 @@ def validation_step(self, *args, **kwargs): def test_step(self, *args, **kwargs): return self.model(*args, **kwargs) - def predict(self, *args, **kwargs): + def predict_step(self, *args, **kwargs): return self.model(*args, **kwargs) def post_training_step(self): diff --git a/pytorch_lightning/plugins/training_type/ddp_spawn.py b/pytorch_lightning/plugins/training_type/ddp_spawn.py index ea1efd6e15873..87d7fa5faecac 100644 --- a/pytorch_lightning/plugins/training_type/ddp_spawn.py +++ b/pytorch_lightning/plugins/training_type/ddp_spawn.py @@ -282,7 +282,7 @@ def validation_step(self, *args, **kwargs): def test_step(self, *args, **kwargs): return self.model(*args, **kwargs) - def predict(self, *args, **kwargs): + def predict_step(self, *args, **kwargs): return self.model(*args, **kwargs) def post_training_step(self): diff --git a/pytorch_lightning/plugins/training_type/dp.py b/pytorch_lightning/plugins/training_type/dp.py index b96b7097d07c7..a8e42e0fa747a 100644 --- a/pytorch_lightning/plugins/training_type/dp.py +++ 
b/pytorch_lightning/plugins/training_type/dp.py @@ -83,7 +83,7 @@ def validation_step(self, *args, **kwargs): def test_step(self, *args, **kwargs): return self.model(*args, **kwargs) - def predict(self, *args, **kwargs): + def predict_step(self, *args, **kwargs): return self.model(*args, **kwargs) def training_step_end(self, output): diff --git a/pytorch_lightning/plugins/training_type/tpu_spawn.py b/pytorch_lightning/plugins/training_type/tpu_spawn.py index c883ff504f24d..3887e0cd98908 100644 --- a/pytorch_lightning/plugins/training_type/tpu_spawn.py +++ b/pytorch_lightning/plugins/training_type/tpu_spawn.py @@ -294,8 +294,8 @@ def validation_step(self, *args, **kwargs): def test_step(self, *args, **kwargs): return self.lightning_module.test_step(*args, **kwargs) - def predict(self, *args, **kwargs): - return self.lightning_module.predict(*args, **kwargs) + def predict_step(self, *args, **kwargs): + return self.lightning_module.predict_step(*args, **kwargs) def save_checkpoint(self, filepath, weights_only: bool = False): """Save model/training states as a checkpoint file through state-dump and file-write. diff --git a/pytorch_lightning/plugins/training_type/training_type_plugin.py b/pytorch_lightning/plugins/training_type/training_type_plugin.py index 89f27963caadf..08dca63a7c925 100644 --- a/pytorch_lightning/plugins/training_type/training_type_plugin.py +++ b/pytorch_lightning/plugins/training_type/training_type_plugin.py @@ -154,8 +154,8 @@ def validation_step(self, *args, **kwargs): def test_step(self, *args, **kwargs): return self.lightning_module.test_step(*args, **kwargs) - def predict(self, *args, **kwargs): - return self.lightning_module.predict(*args, **kwargs) + def predict_step(self, *args, **kwargs): + return self.lightning_module.predict_step(*args, **kwargs) def training_step_end(self, output): return output diff --git a/pytorch_lightning/trainer/connectors/data_connector.py b/pytorch_lightning/trainer/connectors/data_connector.py index b3fc0b4eb7b29..5d2f141dc64a8 100644 --- a/pytorch_lightning/trainer/connectors/data_connector.py +++ b/pytorch_lightning/trainer/connectors/data_connector.py @@ -150,6 +150,10 @@ def attach_datamodule(self, model, datamodule: Optional[LightningDataModule] = N self.trainer.datamodule = datamodule datamodule.trainer = self.trainer + # experimental feature for Flash + if hasattr(datamodule, "data_pipeline"): + model.data_pipeline = datamodule.data_pipeline + class _PatchDataLoader(object): r""" diff --git a/pytorch_lightning/trainer/data_loading.py b/pytorch_lightning/trainer/data_loading.py index 56da7039bbca7..1a9c69d107b97 100644 --- a/pytorch_lightning/trainer/data_loading.py +++ b/pytorch_lightning/trainer/data_loading.py @@ -16,7 +16,7 @@ import platform from abc import ABC from copy import deepcopy -from typing import Callable, Iterable, List, Tuple, Union +from typing import Iterable, List, Tuple, Union from torch.utils.data import BatchSampler, DataLoader, RandomSampler, SequentialSampler from torch.utils.data.distributed import DistributedSampler @@ -191,7 +191,7 @@ def reset_train_dataloader(self, model: LightningModule) -> None: Args: model: The current `LightningModule` """ - self.train_dataloader = self.request_dataloader(model.train_dataloader) + self.train_dataloader = self.request_dataloader(model, "train") if self.overfit_batches > 0: if hasattr(self.train_dataloader, 'sampler') and isinstance(self.train_dataloader.sampler, RandomSampler): @@ -271,7 +271,7 @@ def _reset_eval_dataloader( """ # always get the loaders first so we 
can count how many there are loader_name = f'{mode}_dataloader' - dataloaders = self.request_dataloader(getattr(model, loader_name)) + dataloaders = self.request_dataloader(model, mode) if not isinstance(dataloaders, list): dataloaders = [dataloaders] @@ -280,7 +280,7 @@ def _reset_eval_dataloader( # duplicate it the numb of times needed to match the train loaders if self.overfit_batches > 0: num_loaders = len(dataloaders) - train_dataloader = self.request_dataloader(getattr(model, 'train_dataloader')) + train_dataloader = self.request_dataloader(model, 'train') dataloaders = [deepcopy(train_dataloader) for _ in range(num_loaders)] self.dev_debugger.track_load_dataloader_call(loader_name, dataloaders=dataloaders) @@ -380,7 +380,7 @@ def reset_predict_dataloader(self, model) -> None: if has_loader: self.num_predict_batches, self.predict_dataloaders = self._reset_eval_dataloader(model, 'predict') - def request_dataloader(self, dataloader_fx: Callable) -> DataLoader: + def request_dataloader(self, model: LightningModule, stage: str) -> DataLoader: """Handles downloading data in the GPU or TPU case. Args: @@ -389,9 +389,10 @@ def request_dataloader(self, dataloader_fx: Callable) -> DataLoader: Returns: The dataloader """ - dataloader = dataloader_fx() + if model.trainer is not None: + model.trainer.call_hook(f"on_{stage}_dataloader") + dataloader: DataLoader = getattr(model, f'{stage}_dataloader')() dataloader = self._flatten_dl_only(dataloader) - self.accelerator.barrier('get_dataloaders') return dataloader diff --git a/pytorch_lightning/trainer/predict_loop.py b/pytorch_lightning/trainer/predict_loop.py index 70329b4fdf514..53e82fd3f62b3 100644 --- a/pytorch_lightning/trainer/predict_loop.py +++ b/pytorch_lightning/trainer/predict_loop.py @@ -65,7 +65,7 @@ def _get_num_dataloaders(self, dataloaders): length = len(dataloaders[0]) return length - def predict(self, batch, batch_idx, dataloader_idx): + def predict_step(self, batch, batch_idx, dataloader_idx): # configure args args = [batch, batch_idx] if self.num_dataloaders: @@ -74,7 +74,7 @@ def predict(self, batch, batch_idx, dataloader_idx): model_ref = self.trainer.lightning_module model_ref._current_fx_name = "predict" - predictions = self.trainer.accelerator.predict(args) + predictions = self.trainer.accelerator.predict_step(args) if predictions is None: self.warning_cache.warn("predict returned None if it was on purpose, ignore this warning...") @@ -99,3 +99,11 @@ def _convert_to_numpy(v): return results[0] return results + + def on_predict_start(self): + # hook + self.trainer.call_hook("on_predict_start") + + def on_predict_end(self): + # hook + self.trainer.call_hook("on_predict_end") diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index f7bd1757b9bc2..bb5d6919964e5 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -762,6 +762,8 @@ def run_evaluate(self): return eval_loop_results def run_predict(self): + self.predict_loop.on_predict_start() + # prepare dataloaders dataloaders, max_batches = self.predict_loop.get_predict_dataloaders() @@ -784,7 +786,6 @@ def run_predict(self): for dataloader_idx, dataloader in enumerate(dataloaders): dataloader = self.accelerator.process_dataloader(dataloader) dl_max_batches = self.predict_loop.max_batches[dataloader_idx] - for batch_idx, batch in enumerate(dataloader): if batch is None: continue @@ -794,10 +795,11 @@ def run_predict(self): break # lightning module methods - with self.profiler.profile("predict"): - 
self.predict_loop.predict(batch, batch_idx, dataloader_idx) + with self.profiler.profile("predict_step"): + self.predict_loop.predict_step(batch, batch_idx, dataloader_idx) results = self.predict_loop.on_predict_epoch_end() + self.predict_loop.on_predict_end() return results def run_sanity_check(self, ref_model): diff --git a/tests/overrides/test_data_parallel.py b/tests/overrides/test_data_parallel.py index 3921e7ef33b8e..aaf47c82d5f08 100644 --- a/tests/overrides/test_data_parallel.py +++ b/tests/overrides/test_data_parallel.py @@ -24,7 +24,7 @@ ("training", "training_step"), ("testing", "test_step"), ("validating", "validation_step"), - ("predicting", "predict"), + ("predicting", "predict_step"), ] ) def test_lightning_wrapper_module_methods(wrapper_class, stage): diff --git a/tests/trainer/test_dataloaders.py b/tests/trainer/test_dataloaders.py index 52c51777e2a89..505af173b7910 100644 --- a/tests/trainer/test_dataloaders.py +++ b/tests/trainer/test_dataloaders.py @@ -1159,3 +1159,71 @@ def test_replace_sampler_with_multiprocessing_context(tmpdir): new_data_loader = trainer.replace_sampler(train, SequentialSampler(train.dataset)) assert (new_data_loader.multiprocessing_context == train.multiprocessing_context) + + +def test_request_dataloader(tmpdir): + """ + This test asserts dataloader can be modified and properly set to the trainer. + """ + + class DataLoaderWrapper: + + def __init__(self, loader): + self.loader = loader + self._iter = iter(self.loader) + + def __iter__(self): + self._iter = iter(self.loader) + return self._iter + + def __next__(self): + return next(self._iter) + + class DataLoaderFunc: + + def __init__(self, loader): + self.loader = loader + + def __call__(self): + return self.loader + + class TestModel(BoringModel): + + def __init__(self): + super().__init__() + self.on_train_dataloader_called = False + self.on_train_batch_start_called = False + self.on_val_dataloader_called = False + self.on_val_batch_start_called = False + + def on_train_dataloader(self) -> None: + loader = self.train_dataloader() + self.train_dataloader = DataLoaderFunc(DataLoaderWrapper(loader)) + self.on_train_dataloader_called = True + + def on_train_batch_start(self, batch, batch_idx: int, dataloader_idx: int) -> None: + assert isinstance(self.trainer.train_dataloader.loaders, DataLoaderWrapper) + self.on_train_batch_start_called = True + + def on_val_dataloader(self) -> None: + loader = self.val_dataloader() + self.val_dataloader = DataLoaderFunc(DataLoaderWrapper(loader)) + self.on_val_dataloader_called = True + + def on_validation_batch_start(self, batch, batch_idx: int, dataloader_idx: int) -> None: + assert isinstance(self.trainer.val_dataloaders[0], DataLoaderWrapper) + self.on_val_batch_start_called = True + + trainer = Trainer( + default_root_dir=tmpdir, + limit_train_batches=2, + limit_val_batches=2, + max_epochs=1, + ) + model = TestModel() + trainer.fit(model) + trainer.test(model) + assert model.on_train_dataloader_called + assert model.on_train_batch_start_called + assert model.on_val_dataloader_called + assert model.on_val_batch_start_called diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py index 66889bb7e1139..d461d9d152e74 100644 --- a/tests/trainer/test_trainer.py +++ b/tests/trainer/test_trainer.py @@ -1440,11 +1440,11 @@ def test_trainer_predict_no_return(tmpdir): class CustomBoringModel(BoringModel): - def predict(self, batch, batch_idx, dataloader_idx=None): + def predict_step(self, batch, batch_idx, dataloader_idx=None): if (batch_idx + 1) % 2 
== 0: return - return super().predict(batch, batch_idx, dataloader_idx) + return super().predict_step(batch, batch_idx, dataloader_idx) with pytest.warns(UserWarning, match='predict returned None'): predict(tmpdir, None, None, 1, model=CustomBoringModel()) @@ -1731,3 +1731,35 @@ def test_check_val_every_n_epoch_exception(tmpdir): max_epochs=1, check_val_every_n_epoch=1.2, ) + + +def test_trainer_attach_data_pipeline_to_model(tmpdir): + + class DataPipeline: + + pass + + class TestDataModule(LightningDataModule): + + data_pipeline = DataPipeline() + + def train_dataloader(self): + return DataLoader(RandomDataset(32, 64)) + + def val_dataloader(self): + return DataLoader(RandomDataset(32, 64)) + + def test_dataloader(self): + return DataLoader(RandomDataset(32, 64)) + + class TestCallback(Callback): + + def on_fit_start(self, trainer, pl_module: LightningModule) -> None: + """Called when fit begins""" + assert isinstance(pl_module.data_pipeline, DataPipeline) + + model = BoringModel() + dm = TestDataModule() + + trainer = Trainer(default_root_dir=tmpdir, max_epochs=1, callbacks=[TestCallback()]) + trainer.fit(model, datamodule=dm) From 3cf0c3117a6c0ddff9bef5a216cad1cb4af5b6e6 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Tue, 23 Mar 2021 17:41:36 +0100 Subject: [PATCH 24/25] fix back-compatibility for Accel (#6655) --- pytorch_lightning/accelerators/accelerator.py | 25 +++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py index 9ea2cec491d2c..4aa5fedf2b210 100644 --- a/pytorch_lightning/accelerators/accelerator.py +++ b/pytorch_lightning/accelerators/accelerator.py @@ -21,6 +21,7 @@ from pytorch_lightning.plugins.precision import ApexMixedPrecisionPlugin, NativeMixedPrecisionPlugin, PrecisionPlugin from pytorch_lightning.plugins.training_type import TrainingTypePlugin from pytorch_lightning.trainer.states import TrainerState +from pytorch_lightning.utilities import rank_zero_warn from pytorch_lightning.utilities.apply_func import move_data_to_device from pytorch_lightning.utilities.enums import AMPType, LightningEnum @@ -437,3 +438,27 @@ def results(self) -> Any: In distributed training, we make sure to transfer the results to the appropriate master process. """ return self.training_type_plugin.results + + # todo: remove in v1.5 + def connect_training_type_plugin(self, plugin: TrainingTypePlugin, model: LightningModule) -> None: + """ + Attaches the training type plugin to the accelerator. + Also transfers ownership of the model to this plugin + + .. deprecated::v1.3 + Will be removed in v1.5.0. + """ + rank_zero_warn('Accelerator method `connect_training_type_plugin` was deprecated in v1.3.' + ' It will be removed in v1.5.') + self.setup_training_type_plugin(plugin, model) + + # todo: remove in v1.5 + def connect_precision_plugin(self, plugin: PrecisionPlugin) -> None: + """Attaches the precision plugin to the accelerator + + .. deprecated::v1.3 + Will be removed in v1.5.0. + """ + rank_zero_warn('Accelerator method `connect_precision_plugin` was deprecated in v1.3.' 
+ ' It will be removed in v1.5.') + self.setup_precision_plugin(plugin) From 51b10f78f4b4c4b704219c619dc5e73784aca57b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Tue, 23 Mar 2021 18:13:29 +0100 Subject: [PATCH 25/25] Refactor PyTorch profiler 4/5 (#6349) Co-authored-by: thomas chaton --- CHANGELOG.md | 9 + pytorch_lightning/profiler/profilers.py | 12 +- pytorch_lightning/profiler/pytorch.py | 363 +++++++++++------- .../trainer/connectors/profiler_connector.py | 3 +- pytorch_lightning/trainer/predict_loop.py | 4 + pytorch_lightning/trainer/training_loop.py | 2 +- pytorch_lightning/utilities/imports.py | 1 + tests/checkpointing/test_torch_saving.py | 1 + tests/deprecated_api/test_remove_1-5.py | 5 + tests/test_profiler.py | 176 ++++++--- tests/trainer/properties/test_get_model.py | 20 - 11 files changed, 377 insertions(+), 219 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 81bfa85cc073f..e1106189e0c17 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -52,6 +52,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Added `AbstractProfiler` interface ([#6621](https://github.com/PyTorchLightning/pytorch-lightning/pull/6621)) +- Added support for including module names for forward in the autograd trace of `PyTorchProfiler` ([#6349](https://github.com/PyTorchLightning/pytorch-lightning/pull/6349)) + + - Added `outputs` parameter to callback's `on_validation_epoch_end` & `on_test_epoch_end` hooks ([#6120](https://github.com/PyTorchLightning/pytorch-lightning/pull/6120)) @@ -72,6 +75,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Changed profilers to save separate report files per state and rank ([#6621](https://github.com/PyTorchLightning/pytorch-lightning/pull/6621)) +- Changed `PyTorchProfiler` to use `torch.autograd.profiler.record_function` to record functions ([#6349](https://github.com/PyTorchLightning/pytorch-lightning/pull/6349)) + + ### Deprecated - `period` has been deprecated in favor of `every_n_val_epochs` in the `ModelCheckpoint` callback ([#6146](https://github.com/PyTorchLightning/pytorch-lightning/pull/6146)) @@ -83,6 +89,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Deprecated `Profiler(output_filename)` in favor of `dirpath` and `filename` ([#6621](https://github.com/PyTorchLightning/pytorch-lightning/pull/6621)) +- Deprecated `PytorchProfiler(profiled_functions)` in favor of `record_functions` ([#6349](https://github.com/PyTorchLightning/pytorch-lightning/pull/6349)) + + - Deprecated metrics in favor of `torchmetrics` ([#6505](https://github.com/PyTorchLightning/pytorch-lightning/pull/6505), [#6530](https://github.com/PyTorchLightning/pytorch-lightning/pull/6530), diff --git a/pytorch_lightning/profiler/profilers.py b/pytorch_lightning/profiler/profilers.py index 54bc5cdf0122c..46d72583fb466 100644 --- a/pytorch_lightning/profiler/profilers.py +++ b/pytorch_lightning/profiler/profilers.py @@ -126,7 +126,7 @@ def _prepare_filename(self) -> str: filename += f"{self._stage}-" filename += str(self.filename) if self._local_rank is not None: - filename += f"-{self.local_rank}" + filename += f"-{self._local_rank}" filename += ".txt" return filename @@ -134,8 +134,7 @@ def _prepare_streams(self) -> None: if self._write_stream is not None: return if self.filename: - dirpath = self.dirpath or self._log_dir - filepath = os.path.join(dirpath, self._prepare_filename()) + filepath = os.path.join(self.dirpath, self._prepare_filename()) fs = get_filesystem(filepath) file = fs.open(filepath, "a") self._output_file = file @@ -175,8 +174,7 @@ def setup( self._stage = stage self._local_rank = local_rank self._log_dir = log_dir - if self.dirpath is None: - self.dirpath = self._log_dir + self.dirpath = self.dirpath or log_dir def teardown(self, stage: Optional[str] = None) -> None: """ @@ -202,8 +200,8 @@ def summary(self) -> str: raise NotImplementedError @property - def local_rank(self): - return '0' if self._local_rank is None else self._local_rank + def local_rank(self) -> int: + return 0 if self._local_rank is None else self._local_rank class PassThroughProfiler(BaseProfiler): diff --git a/pytorch_lightning/profiler/pytorch.py b/pytorch_lightning/profiler/pytorch.py index 55b1c286789f4..974883a4724c6 100644 --- a/pytorch_lightning/profiler/pytorch.py +++ b/pytorch_lightning/profiler/pytorch.py @@ -12,25 +12,92 @@ # See the License for the specific language governing permissions and # limitations under the License. """Profiler to check if there are any bottlenecks in your code.""" - import inspect import logging import os +from functools import partial from pathlib import Path -from typing import List, Optional, Union +from typing import Any, Dict, List, Optional, Type, TYPE_CHECKING, Union import torch +from torch import nn, Tensor +from torch.autograd.profiler import record_function from pytorch_lightning.profiler.profilers import BaseProfiler from pytorch_lightning.utilities.distributed import rank_zero_warn from pytorch_lightning.utilities.exceptions import MisconfigurationException +if TYPE_CHECKING: + from torch.autograd.profiler import EventList + from torch.utils.hooks import RemovableHandle + + from pytorch_lightning.core.lightning import LightningModule + log = logging.getLogger(__name__) +_PROFILER = Union[torch.autograd.profiler.profile, torch.cuda.profiler.profile, torch.autograd.profiler.emit_nvtx] + + +class RegisterRecordFunction: + """ + While profiling autograd operations, this class will add labels for module names around the forward function. + + The Lightning PyTorch Profiler will activate this feature automatically. 
It can be deactivated as follows: + + Example:: + from pytorch_lightning.profilers import PyTorchProfiler + profiler = PyTorchProfiler(record_module_names=False) + Trainer(profiler=profiler) + + It can be used outside of Lightning as follows: + + Example:: + from pytorch_lightning import Trainer, seed_everything + with RegisterRecordFunction(model): + out = model(batch) + """ + + def __init__(self, model: nn.Module) -> None: + self._model = model + self._records: Dict[str, record_function] = {} + self._handles: Dict[str, List['RemovableHandle']] = {} + + def _start_recording_forward(self, _: nn.Module, input: Tensor, record_name: str) -> Tensor: + record = record_function(record_name) + record.__enter__() + self._records[record_name] = record + return input + + def _stop_recording_forward(self, _: nn.Module, __: Tensor, output: Tensor, record_name: str) -> Tensor: + self._records[record_name].__exit__(None, None, None) + return output + + def __enter__(self) -> None: + for module_name, module in self._model.named_modules(): + if module_name: + full_name = f"{type(module).__module__}.{type(module).__name__}" + record_name = f"{full_name}: {module_name}" + pre_forward_handle = module.register_forward_pre_hook( + partial(self._start_recording_forward, record_name=record_name) + ) + post_forward_handle = module.register_forward_hook( + partial(self._stop_recording_forward, record_name=record_name) + ) + + self._handles[module_name] = [pre_forward_handle, post_forward_handle] + + def __exit__(self, exc_type: Any, exc_value: Any, traceback: Any) -> None: + for handles in self._handles.values(): + for h in handles: + h.remove() + self._handles = {} + class PyTorchProfiler(BaseProfiler): - PROFILED_FUNCTIONS = ("training_step_and_backward", "validation_step", "test_step") + RECORD_FUNCTIONS = ( + "training_step_and_backward", "training_step", "backward", "validation_step", "test_step", "predict_step" + ) AVAILABLE_SORT_KEYS = ( "cpu_time", "cuda_time", @@ -42,27 +109,24 @@ class PyTorchProfiler(BaseProfiler): "self_cuda_memory_usage", "count", ) + START_RECORD_FUNCTIONS = ('on_train_start', 'on_validation_start', 'on_test_start', 'on_predict_start') def __init__( self, dirpath: Optional[Union[str, Path]] = None, filename: Optional[str] = None, - enabled: bool = True, - use_cuda: bool = False, - record_shapes: bool = False, - profile_memory: bool = False, group_by_input_shapes: bool = False, - with_stack: bool = False, - use_kineto: bool = False, - use_cpu: bool = True, emit_nvtx: bool = False, - export_to_chrome: bool = False, - path_to_export_trace: str = None, + export_to_chrome: bool = True, + path_to_export_trace: Optional[str] = None, row_limit: int = 20, sort_by_key: Optional[str] = None, + record_functions: List[str] = None, + record_module_names: bool = True, profiled_functions: Optional[List] = None, output_filename: Optional[str] = None, - ): + **profiler_kwargs: Any, + ) -> None: """ This profiler uses PyTorch's Autograd Profiler and lets you inspect the cost of different operators inside your model - both on the CPU and GPU @@ -75,24 +139,8 @@ def __init__( filename: If present, filename where the profiler results will be saved instead of printing to stdout. The ``.txt`` extension will be used automatically. - enabled: Setting this to False makes this context manager a no-op. - - use_cuda: Enables timing of CUDA events as well using the cudaEvent API. - Adds approximately 4us of overhead to each tensor operation. 
- - record_shapes: If shapes recording is set, information about input dimensions will be collected. - - profile_memory: Whether to report memory usage, default: True (Introduced in PyTorch 1.6.0) - group_by_input_shapes: Include operator input shapes and group calls by shape. - with_stack: record source information (file and line number) for the ops (Introduced in PyTorch 1.7.0) - - use_kineto: experimental support for Kineto profiler (Introduced in PyTorch 1.8.0) - - use_cpu: use_kineto=True and can be used to lower the overhead - for GPU-only profiling (Introduced in PyTorch 1.8.0) - emit_nvtx: Context manager that makes every autograd operation emit an NVTX range Run:: @@ -103,164 +151,189 @@ def __init__( nvvp trace_name.prof torch.autograd.profiler.load_nvprof(path) - export_to_chrome: Wether to export the sequence of profiled operators for Chrome. + export_to_chrome: Whether to export the sequence of profiled operators for Chrome. It will generate a ``.json`` file which can be read by Chrome. path_to_export_trace: Directory path to export ``.json`` traces when using ``export_to_chrome=True``. By default, it will be save where the file being is being run. - row_limit: Limit the number of rows in a table, `0` is a special value that + row_limit: Limit the number of rows in a table, ``-1`` is a special value that removes the limit completely. - sort_by_key: Keys to sort out profiled table + sort_by_key: Attribute used to sort entries. By default + they are printed in the same order as they were registered. + Valid keys include: ``cpu_time``, ``cuda_time``, ``cpu_time_total``, + ``cuda_time_total``, ``cpu_memory_usage``, ``cuda_memory_usage``, + ``self_cpu_memory_usage``, ``self_cuda_memory_usage``, ``count``. - profiled_functions: list of profiled functions which will create a context manager on. + record_functions: list of profiled functions which will create a context manager on. Any other will be pass through. + record_module_names: Whether to add module names while recording autograd operation. + + profiler_kwargs: Keyword arguments for the PyTorch profiler. This depends on your PyTorch version + Raises: MisconfigurationException: If arg ``sort_by_key`` is not present in ``AVAILABLE_SORT_KEYS``. - ValueError: - If you attempt to stop recording an action which was never started. 
""" + super().__init__(dirpath=dirpath, filename=filename, output_filename=output_filename) - self.profiled_actions = {} - self.enabled = enabled - self.profiled_functions = profiled_functions or self.PROFILED_FUNCTIONS - self.use_cuda = use_cuda - self.record_shapes = record_shapes - self.profile_memory = profile_memory - self.sort_by_key = sort_by_key or ("cuda_time_total" if self.use_cuda else "cpu_time_total") - self.with_stack = with_stack - self.group_by_input_shapes = group_by_input_shapes and record_shapes - self.use_kineto = use_kineto - self.use_cpu = use_cpu - self.row_limit = row_limit - self.emit_nvtx = emit_nvtx - self.export_to_chrome = export_to_chrome - self.path_to_export_trace = path_to_export_trace - - if export_to_chrome and path_to_export_trace is None: + record_functions = self.__deprecation_check(profiled_functions, record_functions) + + self._group_by_input_shapes = group_by_input_shapes and profiler_kwargs.get("record_shapes", False) + self._emit_nvtx = emit_nvtx + self._export_to_chrome = export_to_chrome + self._path_to_export_trace = path_to_export_trace + self._row_limit = row_limit + self._sort_by_key = sort_by_key or f"{'cuda' if profiler_kwargs.get('use_cuda', False) else 'cpu'}_time_total" + self._record_functions_start = set(record_functions + list(self.START_RECORD_FUNCTIONS)) + self._record_functions = set(record_functions + list(self.RECORD_FUNCTIONS)) + self._record_module_names = record_module_names + self._profiler_kwargs = profiler_kwargs + + self.profiler: Optional[_PROFILER] = None + self.function_events: Optional['EventList'] = None + self._lightning_module: Optional['LightningModule'] = None # set by ProfilerConnector + self._register: Optional[RegisterRecordFunction] = None + self._parent_profiler: Optional[_PROFILER] = None + self._recording_map: Dict[str, record_function] = {} + + if self._export_to_chrome and self._path_to_export_trace is None: rank_zero_warn( - "The exported trace would be save locally as `path_to_export_trace` is empty." + "The exported trace would be saved locally as `path_to_export_trace` is None." " Note: Each functions will generate its own traced file." ) - if self.sort_by_key not in self.AVAILABLE_SORT_KEYS: + if self._sort_by_key not in self.AVAILABLE_SORT_KEYS: raise MisconfigurationException( - f"Found sort_by_key: {sort_by_key}. Should be within {self.AVAILABLE_SORT_KEYS}. " + f"Found sort_by_key: {self._sort_by_key}. Should be within {self.AVAILABLE_SORT_KEYS}. " ) - self.profiled_actions = {} - self.context_names = {} - self.running_stack = [] - self.profiler = None + def __deprecation_check( + self, + profiled_functions: Optional[List[str]], + record_functions: Optional[List[str]], + ) -> List[str]: + if record_functions is None: + record_functions = [] - super().__init__(dirpath=dirpath, filename=filename, output_filename=output_filename) + if profiled_functions is not None: + rank_zero_warn( + "`PyTorchProfiler.profiled_functions` has been renamed to" + " `record_functions` in v1.3 and will be removed in v1.5", DeprecationWarning + ) + if not record_functions: + record_functions += profiled_functions + else: + raise MisconfigurationException( + "You set `PytorchProfiler.profiled_functions` and `PyTorchProfiler.record_functions`." + " Please use only the later." 
+ ) + + return record_functions def setup( - self, - stage: Optional[str] = None, - local_rank: Optional[int] = None, - log_dir: Optional[str] = None + self, stage: Optional[str] = None, local_rank: Optional[int] = None, log_dir: Optional[str] = None ) -> None: super().setup(stage=stage, local_rank=local_rank, log_dir=log_dir) # if the user didn't provide `path_to_export_trace`, # set it as TensorBoardLogger log_dir if exists - if self.path_to_export_trace is None: - self.path_to_export_trace = log_dir + if self._path_to_export_trace is None: + self._path_to_export_trace = log_dir def start(self, action_name: str) -> None: - if action_name not in self.profiled_functions: - return - - if len(self.running_stack) > 0: - self._stop(self.running_stack[-1]) - self.running_stack.append(action_name) + if self.profiler is None and action_name in self._record_functions_start: + + # close profiler if it is already opened. might happen if 2 profilers + # are created and the first one did not call `describe` + try: + torch.autograd._disable_profiler() # noqa + except (AttributeError, RuntimeError): + pass + + self._create_profilers() + + self.profiler.__enter__() + if self._parent_profiler is not None: + self._parent_profiler.__enter__() + if self._register is not None: + self._register.__enter__() + + if ( + self.profiler is not None and action_name in self._record_functions + and action_name not in self._recording_map + ): + recording = record_function(action_name) + recording.__enter__() + self._recording_map[action_name] = recording - self.context_names[action_name] = "/".join(self.running_stack) - - self._start(action_name) + def stop(self, action_name: str) -> None: + if action_name in self._recording_map: + self._recording_map[action_name].__exit__(None, None, None) + del self._recording_map[action_name] - def _start(self, action_name: str) -> None: - if self.emit_nvtx: - self._parent_profiler = self._create_profiler(action_name, torch.cuda.profiler.profile, enter=True) - self._create_profiler(action_name, torch.autograd.profiler.emit_nvtx) - else: - self._create_profiler(action_name, torch.autograd.profiler.profile) - - def _create_profiler(self, action_name, profiler, enter=True): - init_args = inspect.signature(profiler.__init__).parameters - profiler_args = {k: v for k, v in vars(self).items() if k in init_args} - pr = profiler(**profiler_args) - if enter: - out_pr = pr.__enter__() - if out_pr is not None: - pr = out_pr - self.profiler = pr - return self.profiler - - def _stop(self, action_name: str) -> None: - if self.profiler is None: - return - - self.profiler.__exit__(exc_type=None, exc_val=None, exc_tb=None) - - if isinstance(self.profiler, torch.autograd.profiler.emit_nvtx): - # when running ``emit_nvtx``, PyTorch requires 2 context manager. - # The parent_profiler is being closed too. 
- self._parent_profiler.__exit__(None, None, None) - self._parent_profiler = None - return + def summary(self) -> str: + if not self._profiler_kwargs.get("enabled", True) or self._emit_nvtx: + return "" - function_events = self.profiler.function_events - self.profiler = None - for name in self.running_stack: - if name not in self.profiled_actions: - self.profiled_actions[name] = function_events - else: - self.profiled_actions[name] += function_events + self._delete_profilers() - def stop(self, action_name: str) -> None: - if action_name not in self.profiled_functions: - return + if not self.function_events: + return "" - if len(self.running_stack) == 0 or self.running_stack[-1] != action_name: - raise ValueError( # pragma: no-cover - f"Attempting to stop recording an action ({action_name}) which was never started." + if self._export_to_chrome: + filename = f"{self.local_rank}_trace.json" + path_to_trace = ( + filename if self._path_to_export_trace is None else os.path.join(self._path_to_export_trace, filename) ) - self._stop(action_name) - self.running_stack.pop() - # restore running profiler - if len(self.running_stack) > 0: - self._start(self.running_stack[-1]) + self.function_events.export_chrome_trace(path_to_trace) - def summary(self) -> str: - recorded_stats = {} - output_string = '' + data = self.function_events.key_averages(group_by_input_shapes=self._group_by_input_shapes) + table = data.table(sort_by=self._sort_by_key, row_limit=self._row_limit) - if not self.enabled: - return output_string + recorded_stats = {"records": table} + return self._stats_to_str(recorded_stats) - for action_name, function_events in self.profiled_actions.items(): + def _create_profilers(self) -> None: + if self._emit_nvtx: + self._parent_profiler = self._create_profiler(torch.cuda.profiler.profile) + self.profiler = self._create_profiler(torch.autograd.profiler.emit_nvtx) + else: + self._parent_profiler = None + self.profiler = self._create_profiler(torch.autograd.profiler.profile) + if self._record_module_names and self._lightning_module is not None: + self._register = RegisterRecordFunction(self._lightning_module) + + def _create_profiler(self, profiler: Type[_PROFILER]) -> _PROFILER: + init_parameters = inspect.signature(profiler.__init__).parameters + kwargs = {k: v for k, v in self._profiler_kwargs.items() if k in init_parameters} + return profiler(**kwargs) + + def _cache_functions_events(self): + if not self._emit_nvtx: + self.function_events = self.profiler.function_events + + def _delete_profilers(self) -> None: + if self.profiler is not None: + self.profiler.__exit__(None, None, None) + self._cache_functions_events() + self.profiler = None + + if self._parent_profiler is not None: + self._parent_profiler.__exit__(None, None, None) + self._parent_profiler = None - # next line is a workaround for a pytorch issue (fixed on master, still present - # on 1.7). 
Without it the code fails with `AssertionError: There is already a CPU - # parent event for detach` - function_events.populate_cpu_children = lambda: None + if self._register is not None: + self._register.__exit__(None, None, None) + self._register = None - if self.export_to_chrome: - filename = f"{action_name}_{self.local_rank}_trace.json" - path_to_trace = filename if self.path_to_export_trace is None \ - else os.path.join(self.path_to_export_trace, filename) - function_events.export_chrome_trace(path_to_trace) + def teardown(self, stage: Optional[str] = None) -> None: + self._delete_profilers() - if self.emit_nvtx: - return output_string + for k in self._recording_map: + self.stop(k) + self._recording_map = {} - else: - data = function_events.key_averages(group_by_input_shapes=self.group_by_input_shapes) - table = data.table(sort_by=self.sort_by_key, row_limit=self.row_limit) - recorded_stats[action_name] = table - return self._stats_to_str(recorded_stats) + super().teardown(stage=stage) diff --git a/pytorch_lightning/trainer/connectors/profiler_connector.py b/pytorch_lightning/trainer/connectors/profiler_connector.py index e628d6d96bd19..191e8711463ab 100644 --- a/pytorch_lightning/trainer/connectors/profiler_connector.py +++ b/pytorch_lightning/trainer/connectors/profiler_connector.py @@ -11,8 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License - from typing import Union +from weakref import proxy from pytorch_lightning.profiler import ( AdvancedProfiler, @@ -57,4 +57,5 @@ def on_trainer_init(self, profiler: Union[BaseProfiler, str]): def setup(self) -> None: trainer = self.trainer local_rank = trainer.local_rank if trainer.world_size > 1 else None + trainer.profiler.lightning_module = proxy(trainer.lightning_module) trainer.profiler.setup(stage=trainer._setup_state, local_rank=local_rank, log_dir=trainer.log_dir) diff --git a/pytorch_lightning/trainer/predict_loop.py b/pytorch_lightning/trainer/predict_loop.py index 53e82fd3f62b3..b33f41cb2ea48 100644 --- a/pytorch_lightning/trainer/predict_loop.py +++ b/pytorch_lightning/trainer/predict_loop.py @@ -44,6 +44,8 @@ def on_predict_model_eval(self, *_, **__): model_ref.on_predict_model_eval() def setup(self, model, max_batches, dataloaders): + self.trainer.call_hook("on_predict_start") + # copy properties for forward overrides self.trainer.model_connector.copy_trainer_model_properties(model) @@ -86,6 +88,8 @@ def predict_step(self, batch, batch_idx, dataloader_idx): return def on_predict_epoch_end(self): + self.trainer.profiler.describe() + self.trainer._progress_bar_callback.on_predict_end(self.trainer, self.trainer.lightning_module) results = self._predictions diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index cc471f76b6033..c3ba34ca66d2d 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -743,7 +743,7 @@ def training_step_and_backward(self, split_batch, batch_idx, opt_idx, optimizer, # backward pass if result is not None: - with self.trainer.profiler.profile("model_backward"): + with self.trainer.profiler.profile("backward"): self.backward(result, optimizer, opt_idx) # hook - call this hook only diff --git a/pytorch_lightning/utilities/imports.py b/pytorch_lightning/utilities/imports.py index 8090c4ed6590f..5a780660a0a99 100644 --- a/pytorch_lightning/utilities/imports.py +++ 
b/pytorch_lightning/utilities/imports.py @@ -68,6 +68,7 @@ def _compare_version(package: str, op, version) -> bool: _TORCH_LOWER_EQUAL_1_4 = _compare_version("torch", operator.le, "1.5.0") _TORCH_GREATER_EQUAL_1_6 = _compare_version("torch", operator.ge, "1.6.0") _TORCH_GREATER_EQUAL_1_7 = _compare_version("torch", operator.ge, "1.7.0") +_TORCH_GREATER_EQUAL_1_8 = _compare_version("torch", operator.ge, "1.8.0") _APEX_AVAILABLE = _module_available("apex.amp") _BOLTS_AVAILABLE = _module_available('pl_bolts') diff --git a/tests/checkpointing/test_torch_saving.py b/tests/checkpointing/test_torch_saving.py index c8b1e96aeaf0a..8eabc4640046f 100644 --- a/tests/checkpointing/test_torch_saving.py +++ b/tests/checkpointing/test_torch_saving.py @@ -47,6 +47,7 @@ def test_model_torch_save_ddp_cpu(tmpdir): max_epochs=num_epochs, accelerator="ddp_cpu", num_processes=2, + logger=False, ) temp_path = os.path.join(tmpdir, 'temp.pt') trainer.fit(model) diff --git a/tests/deprecated_api/test_remove_1-5.py b/tests/deprecated_api/test_remove_1-5.py index 0c5f581d7775c..725db1180d9e8 100644 --- a/tests/deprecated_api/test_remove_1-5.py +++ b/tests/deprecated_api/test_remove_1-5.py @@ -81,6 +81,11 @@ def on_save_checkpoint(self, *args): trainer.save_checkpoint(filepath) +def test_v1_5_0_legacy_profiler_argument(): + with pytest.deprecated_call(match="renamed to `record_functions` in v1.3"): + PyTorchProfiler(profiled_functions=[]) + + def test_v1_5_0_running_sanity_check(): trainer = Trainer() with pytest.deprecated_call(match='has been renamed to `Trainer.sanity_checking`'): diff --git a/tests/test_profiler.py b/tests/test_profiler.py index cf6afcc9b626c..5d144aef36573 100644 --- a/tests/test_profiler.py +++ b/tests/test_profiler.py @@ -13,6 +13,7 @@ # limitations under the License. 
import logging import os +import platform import time from copy import deepcopy from distutils.version import LooseVersion @@ -24,6 +25,9 @@ from pytorch_lightning import Callback, Trainer from pytorch_lightning.profiler import AdvancedProfiler, PyTorchProfiler, SimpleProfiler +from pytorch_lightning.profiler.pytorch import RegisterRecordFunction +from pytorch_lightning.utilities.exceptions import MisconfigurationException +from pytorch_lightning.utilities.imports import _TORCH_GREATER_EQUAL_1_8 from tests.helpers import BoringModel from tests.helpers.runif import RunIf @@ -126,10 +130,10 @@ def test_simple_profiler_log_dir(tmpdir): ) trainer.fit(model) - expected = profiler.dirpath + expected = tmpdir / "lightning_logs" / "version_0" assert trainer.log_dir == expected assert profiler._log_dir == trainer.log_dir - assert Path(os.path.join(profiler.dirpath, "fit-profiler.txt")).exists() + assert expected.join("fit-profiler.txt").exists() @RunIf(skip_windows=True) @@ -264,8 +268,8 @@ def pytorch_profiler(tmpdir): def test_pytorch_profiler_describe(pytorch_profiler): """Ensure the profiler won't fail when reporting the summary.""" - with pytorch_profiler.profile("test_step"): - pass + with pytorch_profiler.profile("on_test_start"): + torch.tensor(0) # log to stdout and print to file pytorch_profiler.describe() @@ -274,15 +278,10 @@ def test_pytorch_profiler_describe(pytorch_profiler): assert len(data) > 0 -def test_pytorch_profiler_value_errors(pytorch_profiler): +def test_pytorch_profiler_raises(pytorch_profiler): """Ensure errors are raised where expected.""" - - action = "test_step" - with pytest.raises(ValueError): - pytorch_profiler.stop(action) - - pytorch_profiler.start(action) - pytorch_profiler.stop(action) + with pytest.raises(MisconfigurationException, match="profiled_functions` and `PyTorchProfiler.record"): + PyTorchProfiler(profiled_functions=["a"], record_functions=["b"]) @RunIf(min_torch="1.6.0") @@ -299,9 +298,8 @@ def test_advanced_profiler_cprofile_deepcopy(tmpdir): @RunIf(min_gpus=2, special=True) -def test_pytorch_profiler_trainer_ddp(tmpdir): +def test_pytorch_profiler_trainer_ddp(tmpdir, pytorch_profiler): """Ensure that the profiler can be given to the training and default step are properly recorded. """ - pytorch_profiler = PyTorchProfiler(dirpath=None, filename="profiler") model = BoringModel() trainer = Trainer( max_epochs=1, @@ -314,17 +312,68 @@ def test_pytorch_profiler_trainer_ddp(tmpdir): ) trainer.fit(model) - assert len(pytorch_profiler.summary()) > 0 - assert set(pytorch_profiler.profiled_actions) == {'training_step_and_backward', 'validation_step'} + expected = ('validation_step', 'training_step_and_backward', 'training_step', 'backward') + for name in expected: + assert sum(e.name == name for e in pytorch_profiler.function_events) - files = sorted(f for f in os.listdir(pytorch_profiler.dirpath) if "fit" in f) - rank = int(os.getenv("LOCAL_RANK", "0")) - expected = f"fit-profiler-{rank}.txt" - assert files[rank] == expected + files = set(os.listdir(pytorch_profiler.dirpath)) + expected = f"fit-profiler-{trainer.local_rank}.txt" + assert expected in files path = os.path.join(pytorch_profiler.dirpath, expected) - data = Path(path).read_text("utf-8") - assert len(data) > 0 + assert Path(path).read_text() + + +def test_pytorch_profiler_trainer_test(tmpdir, pytorch_profiler): + """Ensure that the profiler can be given to the trainer and test step are properly recorded. 
""" + model = BoringModel() + trainer = Trainer( + default_root_dir=tmpdir, + max_epochs=1, + limit_test_batches=2, + profiler=pytorch_profiler, + ) + trainer.test(model) + + assert sum(e.name == 'test_step' for e in pytorch_profiler.function_events) + + path = pytorch_profiler.dirpath / f"test-{pytorch_profiler.filename}.txt" + assert path.read_text("utf-8") + + +def test_pytorch_profiler_trainer_predict(tmpdir, pytorch_profiler): + """Ensure that the profiler can be given to the trainer and predict function are properly recorded. """ + model = BoringModel() + model.predict_dataloader = model.train_dataloader + trainer = Trainer( + default_root_dir=tmpdir, + max_epochs=1, + limit_test_batches=2, + profiler=pytorch_profiler, + ) + trainer.predict(model) + + assert sum(e.name == 'predict_step' for e in pytorch_profiler.function_events) + + path = pytorch_profiler.dirpath / f"predict-{pytorch_profiler.filename}.txt" + assert path.read_text("utf-8") + + +def test_pytorch_profiler_trainer_validate(tmpdir, pytorch_profiler): + """Ensure that the profiler can be given to the trainer and validate function are properly recorded. """ + model = BoringModel() + trainer = Trainer( + default_root_dir=tmpdir, + max_epochs=1, + limit_val_batches=2, + profiler=pytorch_profiler, + ) + trainer.validate(model) + + assert sum(e.name == 'validation_step' for e in pytorch_profiler.function_events) + + path = pytorch_profiler.dirpath / f"validate-{pytorch_profiler.filename}.txt" + assert path.read_text("utf-8") def test_pytorch_profiler_nested(tmpdir): @@ -341,34 +390,31 @@ def test_pytorch_profiler_nested(tmpdir): with pytorch_profiler.profile("c"): _ = a + b - pa = pytorch_profiler.profiled_actions + pytorch_profiler.describe() - # From PyTorch 1.8.0, less operation are being traced. - if LooseVersion(torch.__version__) >= LooseVersion("1.8.0"): - expected_ = { - 'a': ['ones', 'empty', 'fill_', 'zeros', 'empty', 'zero_', 'add'], - 'b': ['zeros', 'empty', 'zero_'], - 'c': ['add'], + events_name = {e.name for e in pytorch_profiler.function_events} + + if platform.system() == "Windows": + expected = {'a', 'add', 'b', 'c', 'profiler::_record_function_enter', 'profiler::_record_function_exit'} + else: + expected = { + 'signed char', 'add', 'profiler::_record_function_exit', 'bool', 'char', 'profiler::_record_function_enter' } - # From PyTorch 1.6.0, more operation are being traced. 
- elif LooseVersion(torch.__version__) >= LooseVersion("1.6.0"): - expected_ = { - 'a': ['ones', 'empty', 'fill_', 'zeros', 'empty', 'zero_', 'fill_', 'add', 'empty'], - 'b': ['zeros', 'empty', 'zero_', 'fill_'], - 'c': ['add', 'empty'], + + if LooseVersion(torch.__version__) >= LooseVersion("1.6.0"): + expected = {'add', 'zeros', 'ones', 'zero_', 'b', 'fill_', 'c', 'a', 'empty'} + + if LooseVersion(torch.__version__) >= LooseVersion("1.7.0"): + expected = { + 'aten::zeros', 'aten::add', 'aten::zero_', 'c', 'b', 'a', 'aten::fill_', 'aten::empty', 'aten::ones' } - else: - expected_ = { - 'a': ['add'], - 'b': [], - 'c': ['add'], + + if LooseVersion(torch.__version__) >= LooseVersion("1.8.0"): + expected = { + 'aten::ones', 'a', 'aten::add', 'aten::empty', 'aten::zero_', 'b', 'c', 'aten::zeros', 'aten::fill_' } - for n in ('a', 'b', 'c'): - pa[n] = [e.name for e in pa[n]] - if LooseVersion(torch.__version__) >= LooseVersion("1.7.1"): - pa[n] = [e.replace("aten::", "") for e in pa[n]] - assert pa[n] == expected_[n] + assert events_name == expected, (events_name, torch.__version__, platform.system()) @RunIf(min_gpus=1, special=True) @@ -387,6 +433,43 @@ def test_pytorch_profiler_nested_emit_nvtx(tmpdir): trainer.fit(model) +@RunIf(min_torch="1.5.0") +def test_register_record_function(tmpdir): + + use_cuda = torch.cuda.is_available() + pytorch_profiler = PyTorchProfiler( + export_to_chrome=False, + record_functions=["a"], + use_cuda=use_cuda, + dirpath=tmpdir, + filename="profiler", + ) + + class TestModel(BoringModel): + + def __init__(self): + super().__init__() + self.layer = torch.nn.Sequential(torch.nn.Linear(8, 8), torch.nn.ReLU(), torch.nn.Linear(8, 2)) + + model = TestModel() + input = torch.rand((1, 8)) + + if use_cuda: + model = model.cuda() + input = input.cuda() + + with pytorch_profiler.profile("a"): + with RegisterRecordFunction(model): + model(input) + + pytorch_profiler.describe() + event_names = [e.name for e in pytorch_profiler.function_events] + assert 'torch.nn.modules.container.Sequential: layer' in event_names + assert 'torch.nn.modules.linear.Linear: layer.0' in event_names + assert 'torch.nn.modules.activation.ReLU: layer.1' in event_names + assert 'torch.nn.modules.linear.Linear: layer.2' in event_names + + @pytest.mark.parametrize("cls", (SimpleProfiler, AdvancedProfiler, PyTorchProfiler)) def test_profiler_teardown(tmpdir, cls): """ @@ -407,6 +490,9 @@ def on_fit_end(self, trainer, *args, **kwargs) -> None: assert profiler._output_file is None +@pytest.mark.skipif(_TORCH_GREATER_EQUAL_1_8, reason="currently not supported for PyTorch 1.8") def test_pytorch_profiler_deepcopy(pytorch_profiler): + pytorch_profiler.start("on_train_start") + torch.tensor(1) pytorch_profiler.describe() assert deepcopy(pytorch_profiler) diff --git a/tests/trainer/properties/test_get_model.py b/tests/trainer/properties/test_get_model.py index 3eb0596b55fc4..5dc1ea5de4e8a 100644 --- a/tests/trainer/properties/test_get_model.py +++ b/tests/trainer/properties/test_get_model.py @@ -80,23 +80,3 @@ def test_get_model_gpu(tmpdir): gpus=1, ) trainer.fit(model) - - -@RunIf(min_gpus=1, skip_windows=True) -def test_get_model_ddp_gpu(tmpdir): - """ - Tests that `trainer.lightning_module` extracts the model correctly when using GPU + ddp accelerators - """ - - model = TrainerGetModel() - - limit_train_batches = 2 - trainer = Trainer( - default_root_dir=tmpdir, - limit_train_batches=limit_train_batches, - limit_val_batches=2, - max_epochs=1, - gpus=1, - ) - trainer.fit(model) - return 1