Merge branch 'master' into amp-plugin-config-validate
Borda authored May 29, 2023
2 parents bb7ae8c + bd53b03 commit 1b10e30
Showing 38 changed files with 223 additions and 138 deletions.
32 changes: 24 additions & 8 deletions .azure/gpu-benchmark.yml → .azure/gpu-benchmarks.yml
@@ -5,8 +5,7 @@

trigger:
tags:
include:
- '*'
include: ['*']
branches:
include:
- "master"
@@ -20,9 +19,13 @@ pr:
- "release/*"
paths:
include:
- ".azure/gpu-benchmark.yml"
- "tests/tests_pytorch/benchmarks/**"
- ".azure/gpu-benchmarks.yml"
- "requirements/fabric/**"
- "requirements/pytorch/**"
- "src/lightning/fabric/**"
- "src/lightning/pytorch/**"
- "tests/parity_fabric/**"
- "tests/parity_pytorch/**"
exclude:
- "requirements/*/docs.txt"
- "*.md"
@@ -45,6 +48,12 @@ jobs:
container:
image: "pytorchlightning/pytorch_lightning:base-cuda-py3.10-torch2.0-cuda11.7.1"
options: "--gpus=all --shm-size=32g"
strategy:
matrix:
'pkg: Fabric':
PACKAGE_NAME: "fabric"
'pkg: Pytorch':
PACKAGE_NAME: "pytorch"
workspace:
clean: all

@@ -69,7 +78,6 @@ jobs:
- bash: pip install -e .[dev] --find-links ${TORCH_URL}
env:
PACKAGE_NAME: "pytorch"
FREEZE_REQUIREMENTS: "1"
displayName: 'Install package'

@@ -86,9 +94,17 @@
--target_import="lightning_fabric,pytorch_lightning"
displayName: 'Adjust tests'
- bash: python -m pytest benchmarks -v --durations=0
- bash: python -m pytest parity_$(PACKAGE_NAME) -v --durations=0
env:
PL_RUNNING_BENCHMARKS: "1"
PL_RUN_CUDA_TESTS: "1"
workingDirectory: tests/tests_pytorch
displayName: 'Testing: PyTorch benchmarks'
workingDirectory: tests/
displayName: 'Testing: benchmarks'

- bash: bash run_standalone_tasks.sh
workingDirectory: tests/parity_fabric
condition: eq(variables['PACKAGE_NAME'], 'fabric')
env:
PL_RUN_CUDA_TESTS: "1"
displayName: 'Testing: fabric standalone tasks'
timeoutInMinutes: "10"
9 changes: 1 addition & 8 deletions .azure/gpu-tests-fabric.yml
@@ -123,7 +123,7 @@ jobs:
condition: eq(variables['PACKAGE_NAME'], 'fabric')
displayName: 'Adjust tests & examples'
- bash: python -m coverage run --source ${COVERAGE_SOURCE} -m pytest --ignore benchmarks -v --durations=50
- bash: python -m coverage run --source ${COVERAGE_SOURCE} -m pytest -v --durations=50
workingDirectory: tests/tests_fabric
env:
PL_RUN_CUDA_TESTS: "1"
@@ -138,13 +138,6 @@
displayName: 'Testing: fabric standalone tests'
timeoutInMinutes: "10"

- bash: bash run_standalone_tasks.sh
workingDirectory: tests/tests_fabric
env:
PL_RUN_CUDA_TESTS: "1"
displayName: 'Testing: fabric standalone tasks'
timeoutInMinutes: "10"

- bash: |
python -m coverage report
python -m coverage xml
8 changes: 1 addition & 7 deletions .azure/gpu-tests-pytorch.yml
@@ -150,7 +150,7 @@ jobs:
ls -l checkpoints/
displayName: 'Get legacy checkpoints'
- bash: python -m coverage run --source ${COVERAGE_SOURCE} -m pytest --ignore benchmarks -v --durations=50
- bash: python -m coverage run --source ${COVERAGE_SOURCE} -m pytest -v --durations=50
workingDirectory: tests/tests_pytorch
env:
PL_RUN_CUDA_TESTS: "1"
@@ -197,9 +197,3 @@ jobs:
env:
PL_USE_MOCKED_MNIST: "1"
displayName: 'Testing: PyTorch examples'
- bash: python -m pytest benchmarks -v --maxfail=2 --durations=0
workingDirectory: tests/tests_pytorch
env:
PL_RUN_CUDA_TESTS: "1"
displayName: 'Testing: PyTorch benchmarks'
4 changes: 2 additions & 2 deletions .github/checkgroup.yml
@@ -70,14 +70,14 @@ subprojects:

- id: "pytorch_lightning: Benchmarks"
paths:
- ".azure/gpu-benchmark.yml"
- ".azure/gpu-benchmarks.yml"
- "tests/tests_pytorch/benchmarks/**"
- "requirements/pytorch/**"
- "!requirements/pytorch/docs.txt"
- "!*.md"
- "!**/*.md"
checks:
- "pytorch-lightning.Benchmark"
- "lightning.Benchmarks"

- id: "pytorch-lightning: TPU workflow"
paths:
2 changes: 1 addition & 1 deletion .github/workflows/README.md
@@ -9,7 +9,7 @@
| .github/workflows/ci-tests-pytorch.yml | Run all tests except for accelerator-specific, standalone and slow tests. | CPU |
| .azure-pipelines/ipu-tests.yml | Run only IPU-specific tests. | IPU |
| .azure-pipelines/gpu-tests-pytorch.yml | Run all CPU and GPU-specific tests, standalone, and examples. Each standalone test needs to be run in separate processes to avoid unwanted interactions between test cases. | GPU |
| .azure-pipelines/gpu-benchmark.yml | Run speed/memory benchmarks for parity with pure PyTorch. | GPU |
| .azure-pipelines/gpu-benchmarks.yml | Run speed/memory benchmarks for parity with pure PyTorch. | GPU |
| .github/workflows/tpu-tests.yml | Run only TPU-specific tests. Requires that the PR title contains '\[TPU\]' | TPU |

- \*Accelerators used in CI
2 changes: 2 additions & 0 deletions README.md
@@ -364,6 +364,7 @@ Fabric is designed for the most complex models like foundation model scaling, LL
- loss.backward()
+ fabric.backward(loss)
optimizer.step()
print(loss.data)
```

</sub>
@@ -397,6 +398,7 @@ for epoch in range(num_epochs):
loss = torch.nn.functional.cross_entropy(outputs, labels)
fabric.backward(loss)
optimizer.step()
print(loss.data)
```

</sub>
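For readers skimming this commit, the two README hunks above are fragments of Fabric's minimal training-loop example; the added `print(loss.data)` line simply logs the loss each step. A self-contained sketch of the loop those fragments come from might look like this (the model, data, and hyperparameters below are placeholders, not part of this commit):

```python
import torch
from torch.utils.data import DataLoader, TensorDataset
from lightning.fabric import Fabric

fabric = Fabric(accelerator="auto", devices=1)
fabric.launch()

model = torch.nn.Linear(32, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
model, optimizer = fabric.setup(model, optimizer)

dataset = TensorDataset(torch.randn(64, 32), torch.randint(0, 2, (64,)))
dataloader = fabric.setup_dataloaders(DataLoader(dataset, batch_size=8))

num_epochs = 2
for epoch in range(num_epochs):
    for inputs, labels in dataloader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = torch.nn.functional.cross_entropy(outputs, labels)
        fabric.backward(loss)  # replaces loss.backward(), as the diff above shows
        optimizer.step()
        print(loss.data)
```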
2 changes: 2 additions & 0 deletions pyproject.toml
@@ -31,6 +31,8 @@ known_first_party = [
"lightning_app",
"lightning_fabric",
"pytorch_lightning",
"parity_fabric",
"parity_pytorch",
"integrations_app",
"tests_app",
"tests_cloud",
2 changes: 1 addition & 1 deletion requirements/app/base.txt
@@ -1,7 +1,7 @@
lightning-cloud >=0.5.34
packaging
typing-extensions >=4.0.0, <=4.4.0
deepdiff >=5.7.0, <6.2.4
deepdiff >=5.7.0, <6.3.1
starsessions >=1.2.1, <2.0 # strict
fsspec >=2022.5.0, <=2022.7.1
croniter >=1.3.0, <1.4.0 # strict; TODO: for now until we find something more robust.
2 changes: 1 addition & 1 deletion requirements/app/cloud.txt
@@ -1,4 +1,4 @@
redis >=4.0.1, <=4.2.4
docker >=5.0.0, <6.1.2
docker >=5.0.0, <6.1.3
s3fs >=2022.5.0, <2022.11.1
# setuptools==59.5.0
2 changes: 1 addition & 1 deletion requirements/app/test.txt
@@ -1,4 +1,4 @@
coverage ==6.5.0
coverage ==7.2.5
pytest ==7.3.1
pytest-timeout ==2.1.0
pytest-cov ==4.0.0
2 changes: 1 addition & 1 deletion requirements/fabric/examples.txt
@@ -1,5 +1,5 @@
# NOTE: the upper bound for the package version is only set for CI stability, and it is dropped while installing this package
# in case you want to preserve/enforce restrictions on the latest compatible version, add "strict" as an in-line comment
torchvision >=0.12.0, <=0.15.1
torchvision >=0.12.0, <=0.15.2
torchmetrics >=0.10.0, <0.12.0
lightning-utilities >=0.8.0, <0.9.0
2 changes: 1 addition & 1 deletion requirements/fabric/test.txt
@@ -1,4 +1,4 @@
coverage ==6.5.0
coverage ==7.2.5
pytest ==7.3.1
pytest-cov ==4.0.0
pytest-rerunfailures ==10.3
2 changes: 1 addition & 1 deletion requirements/pytorch/examples.txt
@@ -1,6 +1,6 @@
# NOTE: the upper bound for the package version is only set for CI stability, and it is dropped while installing this package
# in case you want to preserve/enforce restrictions on the latest compatible version, add "strict" as an in-line comment
torchvision >=0.12.0, <=0.15.1
torchvision >=0.12.0, <=0.15.2
gym[classic_control] >=0.17.0, <0.26.3
ipython[all] <8.7.1
torchmetrics >=0.10.0, <0.12.0
2 changes: 1 addition & 1 deletion requirements/pytorch/test.txt
@@ -1,4 +1,4 @@
coverage ==6.5.0
coverage ==7.2.5
pytest ==7.3.1
pytest-cov ==4.0.0
pytest-forked ==1.4.0
4 changes: 2 additions & 2 deletions src/lightning/fabric/utilities/testing/__init__.py
@@ -1,3 +1,3 @@
from lightning.fabric.utilities.testing._runif import _RunIf
from lightning.fabric.utilities.testing._runif import _runif_reasons

__all__ = ["_RunIf"]
__all__ = ["_runif_reasons"]
5 changes: 3 additions & 2 deletions src/lightning/fabric/utilities/testing/_runif.py
@@ -27,7 +27,7 @@
from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_1


def _RunIf(
def _runif_reasons(
*,
min_cuda_gpus: int = 0,
min_torch: Optional[str] = None,
@@ -41,7 +41,8 @@ def _RunIf(
deepspeed: bool = False,
dynamo: bool = False,
) -> Tuple[List[str], Dict[str, bool]]:
"""
"""Construct reasons for pytest skipif.
Args:
min_cuda_gpus: Require this number of gpus and that the ``PL_RUN_CUDA_TESTS=1`` environment variable is set.
min_torch: Require that PyTorch is greater or equal than this version.
2 changes: 1 addition & 1 deletion src/lightning/pytorch/callbacks/batch_size_finder.py
@@ -41,7 +41,7 @@ class BatchSizeFinder(Callback):
- ``'power'``: Keep multiplying the batch size by 2, until we get an OOM error.
- ``'binsearch'``: Initially keep multiplying by 2 and after encountering an OOM error
do a binary search between the last successful batch size and the batch size that failed.
do a binary search between the last successful batch size and the batch size that failed.
steps_per_trial: number of steps to run with a given batch size.
Ideally 1 should be enough to test if an OOM error occurs,
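The hunk above only fixes the indentation of the `'binsearch'` description in the `BatchSizeFinder` docstring. As a hedged usage sketch (not part of this commit; the LightningModule and datamodule are assumed to be supplied by the user), the callback is attached to a `Trainer` like so:

```python
from lightning.pytorch import Trainer
from lightning.pytorch.callbacks import BatchSizeFinder

# 'binsearch' keeps doubling the batch size until an OOM error occurs, then
# binary-searches between the last successful size and the size that failed.
finder = BatchSizeFinder(mode="binsearch", steps_per_trial=3)
trainer = Trainer(callbacks=[finder], max_epochs=1)
# trainer.fit(model, datamodule=datamodule)  # user-provided model and datamodule
```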
4 changes: 2 additions & 2 deletions src/lightning/pytorch/utilities/testing/__init__.py
@@ -1,3 +1,3 @@
from lightning.pytorch.utilities.testing._runif import _RunIf
from lightning.pytorch.utilities.testing._runif import _runif_reasons

__all__ = ["_RunIf"]
__all__ = ["_runif_reasons"]
7 changes: 4 additions & 3 deletions src/lightning/pytorch/utilities/testing/_runif.py
@@ -16,7 +16,7 @@
from lightning_utilities.core.imports import RequirementCache

from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_0
from lightning.fabric.utilities.testing import _RunIf as FabricRunIf
from lightning.fabric.utilities.testing import _runif_reasons as FabricRunIf
from lightning.pytorch.accelerators.cpu import _PSUTIL_AVAILABLE
from lightning.pytorch.callbacks.progress.rich_progress import _RICH_AVAILABLE
from lightning.pytorch.core.module import _ONNX_AVAILABLE
@@ -25,7 +25,7 @@
_SKLEARN_AVAILABLE = RequirementCache("scikit-learn")


def _RunIf(
def _runif_reasons(
*,
min_cuda_gpus: int = 0,
min_torch: Optional[str] = None,
@@ -44,7 +44,8 @@ def _RunIf(
sklearn: bool = False,
onnx: bool = False,
) -> Tuple[List[str], Dict[str, bool]]:
"""
"""Construct reasons for pytest skipif.
Args:
min_cuda_gpus: Require this number of gpus and that the ``PL_RUN_CUDA_TESTS=1`` environment variable is set.
min_torch: Require that PyTorch is greater or equal than this version.
File renamed without changes.
32 changes: 32 additions & 0 deletions tests/parity_fabric/conftest.py
@@ -0,0 +1,32 @@
# Copyright The Lightning AI team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os

import pytest
import torch.distributed


@pytest.fixture()
def reset_deterministic_algorithm():
"""Ensures that torch determinism settings are reset before the next test runs."""
yield
os.environ.pop("CUBLAS_WORKSPACE_CONFIG", None)
torch.use_deterministic_algorithms(False)


@pytest.fixture()
def reset_cudnn_benchmark():
"""Ensures that the `torch.backends.cudnn.benchmark` setting gets reset before the next test runs."""
yield
torch.backends.cudnn.benchmark = False
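As a hypothetical illustration (not part of this commit), a parity test in this new package could enable determinism for its own run and rely on the `reset_deterministic_algorithm` fixture above to restore the global torch state afterwards:

```python
import os

import torch


def test_parity_is_reproducible(reset_deterministic_algorithm):
    # The conftest.py fixture undoes these global settings after the test,
    # so they cannot leak into unrelated tests in the same session.
    os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
    torch.use_deterministic_algorithms(True)

    torch.manual_seed(1)
    out1 = torch.nn.Linear(8, 4)(torch.randn(2, 8))
    torch.manual_seed(1)
    out2 = torch.nn.Linear(8, 4)(torch.randn(2, 8))
    assert torch.equal(out1, out2)
```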
File renamed without changes.
@@ -39,5 +39,5 @@ retry_command() {
return $exit_code
}

retry_command "python -m parity.test_parity_ddp --accelerator="cpu" --devices=2 --tolerance=0.02"
retry_command "python -m parity.test_parity_ddp --accelerator="cuda" --devices=2 --tolerance=0.01"
retry_command "python -m test_parity_ddp --accelerator="cpu" --devices=2 --tolerance=0.02"
retry_command "python -m test_parity_ddp --accelerator="cuda" --devices=2 --tolerance=0.01"
@@ -23,8 +23,8 @@
from torch.utils.data.distributed import DistributedSampler

from lightning.fabric.fabric import Fabric
from tests_fabric.parity.models import ConvNet
from tests_fabric.parity.utils import (
from parity_fabric.models import ConvNet
from parity_fabric.utils import (
cuda_reset,
is_cuda_memory_close,
is_state_dict_equal,
@@ -21,16 +21,16 @@
import torch.nn.functional

from lightning.fabric.fabric import Fabric
from tests_fabric.helpers.runif import RunIf
from tests_fabric.parity.models import ConvNet
from tests_fabric.parity.utils import (
from parity_fabric.models import ConvNet
from parity_fabric.utils import (
cuda_reset,
get_model_input_dtype,
is_cuda_memory_close,
is_state_dict_equal,
is_timing_close,
make_deterministic,
)
from tests_fabric.helpers.runif import RunIf


def train_torch(
File renamed without changes.
8 changes: 8 additions & 0 deletions tests/parity_pytorch/__init__.py
@@ -0,0 +1,8 @@
import pytest

from lightning.pytorch.utilities.testing import _runif_reasons


def RunIf(**kwargs):
reasons, marker_kwargs = _runif_reasons(**kwargs)
return pytest.mark.skipif(condition=len(reasons) > 0, reason=f"Requires: [{' + '.join(reasons)}]", **marker_kwargs)
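This new `RunIf` wrapper converts the `(reasons, marker_kwargs)` pair returned by `_runif_reasons` into a single `pytest.mark.skipif` marker. A hypothetical parity test using it (the test body is illustrative only, not from this commit) would be decorated like this:

```python
import torch

from parity_pytorch import RunIf


@RunIf(min_cuda_gpus=1)  # skipped unless a CUDA GPU is present and PL_RUN_CUDA_TESTS=1 is set
def test_cuda_linear_smoke():
    model = torch.nn.Linear(4, 4).cuda()
    out = model(torch.randn(2, 4, device="cuda"))
    assert out.shape == (2, 4)
```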
@@ -13,7 +13,7 @@
# limitations under the License.
import os

from tests_pytorch.benchmarks.test_basic_parity import measure_loops
from parity_pytorch.measure import measure_loops
from tests_pytorch.helpers.advanced_models import ParityModuleMNIST, ParityModuleRNN

NUM_EPOCHS = 20