Merge branch 'master' into amp-plugin-config-validate
Borda authored May 29, 2023
2 parents bb7ae8c + bd53b03 commit 1b10e30
Showing 38 changed files with 223 additions and 138 deletions.
32 changes: 24 additions & 8 deletions .azure/gpu-benchmark.yml → .azure/gpu-benchmarks.yml
@@ -5,8 +5,7 @@

trigger:
tags:
include:
- '*'
include: ['*']
branches:
include:
- "master"
@@ -20,9 +19,13 @@ pr:
- "release/*"
paths:
include:
- ".azure/gpu-benchmark.yml"
- "tests/tests_pytorch/benchmarks/**"
- ".azure/gpu-benchmarks.yml"
- "requirements/fabric/**"
- "requirements/pytorch/**"
- "src/lightning/fabric/**"
- "src/lightning/pytorch/**"
- "tests/parity_fabric/**"
- "tests/parity_pytorch/**"
exclude:
- "requirements/*/docs.txt"
- "*.md"
@@ -45,6 +48,12 @@ jobs:
container:
image: "pytorchlightning/pytorch_lightning:base-cuda-py3.10-torch2.0-cuda11.7.1"
options: "--gpus=all --shm-size=32g"
strategy:
matrix:
'pkg: Fabric':
PACKAGE_NAME: "fabric"
'pkg: Pytorch':
PACKAGE_NAME: "pytorch"
workspace:
clean: all

@@ -69,7 +78,6 @@ jobs:
- bash: pip install -e .[dev] --find-links ${TORCH_URL}
env:
PACKAGE_NAME: "pytorch"
FREEZE_REQUIREMENTS: "1"
displayName: 'Install package'

@@ -86,9 +94,17 @@
--target_import="lightning_fabric,pytorch_lightning"
displayName: 'Adjust tests'
- bash: python -m pytest benchmarks -v --durations=0
- bash: python -m pytest parity_$(PACKAGE_NAME) -v --durations=0
env:
PL_RUNNING_BENCHMARKS: "1"
PL_RUN_CUDA_TESTS: "1"
workingDirectory: tests/tests_pytorch
displayName: 'Testing: PyTorch benchmarks'
workingDirectory: tests/
displayName: 'Testing: benchmarks'

- bash: bash run_standalone_tasks.sh
workingDirectory: tests/parity_fabric
condition: eq(variables['PACKAGE_NAME'], 'fabric')
env:
PL_RUN_CUDA_TESTS: "1"
displayName: 'Testing: fabric standalone tasks'
timeoutInMinutes: "10"
9 changes: 1 addition & 8 deletions .azure/gpu-tests-fabric.yml
@@ -123,7 +123,7 @@ jobs:
condition: eq(variables['PACKAGE_NAME'], 'fabric')
displayName: 'Adjust tests & examples'
- bash: python -m coverage run --source ${COVERAGE_SOURCE} -m pytest --ignore benchmarks -v --durations=50
- bash: python -m coverage run --source ${COVERAGE_SOURCE} -m pytest -v --durations=50
workingDirectory: tests/tests_fabric
env:
PL_RUN_CUDA_TESTS: "1"
@@ -138,13 +138,6 @@
displayName: 'Testing: fabric standalone tests'
timeoutInMinutes: "10"

- bash: bash run_standalone_tasks.sh
workingDirectory: tests/tests_fabric
env:
PL_RUN_CUDA_TESTS: "1"
displayName: 'Testing: fabric standalone tasks'
timeoutInMinutes: "10"

- bash: |
python -m coverage report
python -m coverage xml
8 changes: 1 addition & 7 deletions .azure/gpu-tests-pytorch.yml
@@ -150,7 +150,7 @@ jobs:
ls -l checkpoints/
displayName: 'Get legacy checkpoints'
- bash: python -m coverage run --source ${COVERAGE_SOURCE} -m pytest --ignore benchmarks -v --durations=50
- bash: python -m coverage run --source ${COVERAGE_SOURCE} -m pytest -v --durations=50
workingDirectory: tests/tests_pytorch
env:
PL_RUN_CUDA_TESTS: "1"
@@ -197,9 +197,3 @@ jobs:
env:
PL_USE_MOCKED_MNIST: "1"
displayName: 'Testing: PyTorch examples'
- bash: python -m pytest benchmarks -v --maxfail=2 --durations=0
workingDirectory: tests/tests_pytorch
env:
PL_RUN_CUDA_TESTS: "1"
displayName: 'Testing: PyTorch benchmarks'
4 changes: 2 additions & 2 deletions .github/checkgroup.yml
@@ -70,14 +70,14 @@ subprojects:

- id: "pytorch_lightning: Benchmarks"
paths:
- ".azure/gpu-benchmark.yml"
- ".azure/gpu-benchmarks.yml"
- "tests/tests_pytorch/benchmarks/**"
- "requirements/pytorch/**"
- "!requirements/pytorch/docs.txt"
- "!*.md"
- "!**/*.md"
checks:
- "pytorch-lightning.Benchmark"
- "lightning.Benchmarks"

- id: "pytorch-lightning: TPU workflow"
paths:
2 changes: 1 addition & 1 deletion .github/workflows/README.md
@@ -9,7 +9,7 @@
| .github/workflows/ci-tests-pytorch.yml | Run all tests except for accelerator-specific, standalone and slow tests. | CPU |
| .azure-pipelines/ipu-tests.yml | Run only IPU-specific tests. | IPU |
| .azure-pipelines/gpu-tests-pytorch.yml | Run all CPU and GPU-specific tests, standalone, and examples. Each standalone test needs to be run in separate processes to avoid unwanted interactions between test cases. | GPU |
| .azure-pipelines/gpu-benchmark.yml | Run speed/memory benchmarks for parity with pure PyTorch. | GPU |
| .azure-pipelines/gpu-benchmarks.yml | Run speed/memory benchmarks for parity with pure PyTorch. | GPU |
| .github/workflows/tpu-tests.yml | Run only TPU-specific tests. Requires that the PR title contains '\[TPU\]' | TPU |

- \*Accelerators used in CI
2 changes: 2 additions & 0 deletions README.md
@@ -364,6 +364,7 @@ Fabric is designed for the most complex models like foundation model scaling, LL
- loss.backward()
+ fabric.backward(loss)
optimizer.step()
print(loss.data)
```

</sub>
@@ -397,6 +398,7 @@ for epoch in range(num_epochs):
loss = torch.nn.functional.cross_entropy(outputs, labels)
fabric.backward(loss)
optimizer.step()
print(loss.data)
```

</sub>
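For readers skimming this commit, the two README hunks above are fragments of Fabric's minimal training-loop example; the added `print(loss.data)` line simply logs the loss each step. A self-contained sketch of the loop those fragments come from might look like this (the model, data, and hyperparameters below are placeholders, not part of this commit):

```python
import torch
from torch.utils.data import DataLoader, TensorDataset
from lightning.fabric import Fabric

fabric = Fabric(accelerator="auto", devices=1)
fabric.launch()

model = torch.nn.Linear(32, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
model, optimizer = fabric.setup(model, optimizer)

dataset = TensorDataset(torch.randn(64, 32), torch.randint(0, 2, (64,)))
dataloader = fabric.setup_dataloaders(DataLoader(dataset, batch_size=8))

num_epochs = 2
for epoch in range(num_epochs):
    for inputs, labels in dataloader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = torch.nn.functional.cross_entropy(outputs, labels)
        fabric.backward(loss)  # replaces loss.backward(), as the diff above shows
        optimizer.step()
        print(loss.data)
```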
2 changes: 2 additions & 0 deletions pyproject.toml
@@ -31,6 +31,8 @@ known_first_party = [
"lightning_app",
"lightning_fabric",
"pytorch_lightning",
"parity_fabric",
"parity_pytorch",
"integrations_app",
"tests_app",
"tests_cloud",
2 changes: 1 addition & 1 deletion requirements/app/base.txt
@@ -1,7 +1,7 @@
lightning-cloud >=0.5.34
packaging
typing-extensions >=4.0.0, <=4.4.0
deepdiff >=5.7.0, <6.2.4
deepdiff >=5.7.0, <6.3.1
starsessions >=1.2.1, <2.0 # strict
fsspec >=2022.5.0, <=2022.7.1
croniter >=1.3.0, <1.4.0 # strict; TODO: for now until we find something more robust.
2 changes: 1 addition & 1 deletion requirements/app/cloud.txt
@@ -1,4 +1,4 @@
redis >=4.0.1, <=4.2.4
docker >=5.0.0, <6.1.2
docker >=5.0.0, <6.1.3
s3fs >=2022.5.0, <2022.11.1
# setuptools==59.5.0
2 changes: 1 addition & 1 deletion requirements/app/test.txt
@@ -1,4 +1,4 @@
coverage ==6.5.0
coverage ==7.2.5
pytest ==7.3.1
pytest-timeout ==2.1.0
pytest-cov ==4.0.0
2 changes: 1 addition & 1 deletion requirements/fabric/examples.txt
@@ -1,5 +1,5 @@
# NOTE: the upper bound for the package version is only set for CI stability, and it is dropped while installing this package
# in case you want to preserve/enforce restrictions on the latest compatible version, add "strict" as an in-line comment
torchvision >=0.12.0, <=0.15.1
torchvision >=0.12.0, <=0.15.2
torchmetrics >=0.10.0, <0.12.0
lightning-utilities >=0.8.0, <0.9.0
2 changes: 1 addition & 1 deletion requirements/fabric/test.txt
@@ -1,4 +1,4 @@
coverage ==6.5.0
coverage ==7.2.5
pytest ==7.3.1
pytest-cov ==4.0.0
pytest-rerunfailures ==10.3
2 changes: 1 addition & 1 deletion requirements/pytorch/examples.txt
@@ -1,6 +1,6 @@
# NOTE: the upper bound for the package version is only set for CI stability, and it is dropped while installing this package
# in case you want to preserve/enforce restrictions on the latest compatible version, add "strict" as an in-line comment
torchvision >=0.12.0, <=0.15.1
torchvision >=0.12.0, <=0.15.2
gym[classic_control] >=0.17.0, <0.26.3
ipython[all] <8.7.1
torchmetrics >=0.10.0, <0.12.0
2 changes: 1 addition & 1 deletion requirements/pytorch/test.txt
@@ -1,4 +1,4 @@
coverage ==6.5.0
coverage ==7.2.5
pytest ==7.3.1
pytest-cov ==4.0.0
pytest-forked ==1.4.0
4 changes: 2 additions & 2 deletions src/lightning/fabric/utilities/testing/__init__.py
@@ -1,3 +1,3 @@
from lightning.fabric.utilities.testing._runif import _RunIf
from lightning.fabric.utilities.testing._runif import _runif_reasons

__all__ = ["_RunIf"]
__all__ = ["_runif_reasons"]
5 changes: 3 additions & 2 deletions src/lightning/fabric/utilities/testing/_runif.py
@@ -27,7 +27,7 @@
from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_1


def _RunIf(
def _runif_reasons(
*,
min_cuda_gpus: int = 0,
min_torch: Optional[str] = None,
@@ -41,7 +41,8 @@ def _RunIf(
deepspeed: bool = False,
dynamo: bool = False,
) -> Tuple[List[str], Dict[str, bool]]:
"""
"""Construct reasons for pytest skipif.
Args:
min_cuda_gpus: Require this number of gpus and that the ``PL_RUN_CUDA_TESTS=1`` environment variable is set.
min_torch: Require that PyTorch is greater or equal than this version.
2 changes: 1 addition & 1 deletion src/lightning/pytorch/callbacks/batch_size_finder.py
@@ -41,7 +41,7 @@ class BatchSizeFinder(Callback):
- ``'power'``: Keep multiplying the batch size by 2, until we get an OOM error.
- ``'binsearch'``: Initially keep multiplying by 2 and after encountering an OOM error
do a binary search between the last successful batch size and the batch size that failed.
do a binary search between the last successful batch size and the batch size that failed.
steps_per_trial: number of steps to run with a given batch size.
Ideally 1 should be enough to test if an OOM error occurs,
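The hunk above only fixes the indentation of the `'binsearch'` description in the `BatchSizeFinder` docstring. As a hedged usage sketch (not part of this commit; the LightningModule and datamodule are assumed to be supplied by the user), the callback is attached to a `Trainer` like so:

```python
from lightning.pytorch import Trainer
from lightning.pytorch.callbacks import BatchSizeFinder

# 'binsearch' keeps doubling the batch size until an OOM error occurs, then
# binary-searches between the last successful size and the size that failed.
finder = BatchSizeFinder(mode="binsearch", steps_per_trial=3)
trainer = Trainer(callbacks=[finder], max_epochs=1)
# trainer.fit(model, datamodule=datamodule)  # user-provided model and datamodule
```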
4 changes: 2 additions & 2 deletions src/lightning/pytorch/utilities/testing/__init__.py
@@ -1,3 +1,3 @@
from lightning.pytorch.utilities.testing._runif import _RunIf
from lightning.pytorch.utilities.testing._runif import _runif_reasons

__all__ = ["_RunIf"]
__all__ = ["_runif_reasons"]
7 changes: 4 additions & 3 deletions src/lightning/pytorch/utilities/testing/_runif.py
@@ -16,7 +16,7 @@
from lightning_utilities.core.imports import RequirementCache

from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_0
from lightning.fabric.utilities.testing import _RunIf as FabricRunIf
from lightning.fabric.utilities.testing import _runif_reasons as FabricRunIf
from lightning.pytorch.accelerators.cpu import _PSUTIL_AVAILABLE
from lightning.pytorch.callbacks.progress.rich_progress import _RICH_AVAILABLE
from lightning.pytorch.core.module import _ONNX_AVAILABLE
@@ -25,7 +25,7 @@
_SKLEARN_AVAILABLE = RequirementCache("scikit-learn")


def _RunIf(
def _runif_reasons(
*,
min_cuda_gpus: int = 0,
min_torch: Optional[str] = None,
@@ -44,7 +44,8 @@ def _RunIf(
sklearn: bool = False,
onnx: bool = False,
) -> Tuple[List[str], Dict[str, bool]]:
"""
"""Construct reasons for pytest skipif.
Args:
min_cuda_gpus: Require this number of gpus and that the ``PL_RUN_CUDA_TESTS=1`` environment variable is set.
min_torch: Require that PyTorch is greater or equal than this version.
File renamed without changes.
32 changes: 32 additions & 0 deletions tests/parity_fabric/conftest.py
@@ -0,0 +1,32 @@
# Copyright The Lightning AI team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os

import pytest
import torch.distributed


@pytest.fixture()
def reset_deterministic_algorithm():
"""Ensures that torch determinism settings are reset before the next test runs."""
yield
os.environ.pop("CUBLAS_WORKSPACE_CONFIG", None)
torch.use_deterministic_algorithms(False)


@pytest.fixture()
def reset_cudnn_benchmark():
"""Ensures that the `torch.backends.cudnn.benchmark` setting gets reset before the next test runs."""
yield
torch.backends.cudnn.benchmark = False
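As a hypothetical illustration (not part of this commit), a parity test in this new package could enable determinism for its own run and rely on the `reset_deterministic_algorithm` fixture above to restore the global torch state afterwards:

```python
import os

import torch


def test_parity_is_reproducible(reset_deterministic_algorithm):
    # The conftest.py fixture undoes these global settings after the test,
    # so they cannot leak into unrelated tests in the same session.
    os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
    torch.use_deterministic_algorithms(True)

    torch.manual_seed(1)
    out1 = torch.nn.Linear(8, 4)(torch.randn(2, 8))
    torch.manual_seed(1)
    out2 = torch.nn.Linear(8, 4)(torch.randn(2, 8))
    assert torch.equal(out1, out2)
```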
File renamed without changes.
@@ -39,5 +39,5 @@ retry_command() {
return $exit_code
}

retry_command "python -m parity.test_parity_ddp --accelerator="cpu" --devices=2 --tolerance=0.02"
retry_command "python -m parity.test_parity_ddp --accelerator="cuda" --devices=2 --tolerance=0.01"
retry_command "python -m test_parity_ddp --accelerator="cpu" --devices=2 --tolerance=0.02"
retry_command "python -m test_parity_ddp --accelerator="cuda" --devices=2 --tolerance=0.01"
@@ -23,8 +23,8 @@
from torch.utils.data.distributed import DistributedSampler

from lightning.fabric.fabric import Fabric
from tests_fabric.parity.models import ConvNet
from tests_fabric.parity.utils import (
from parity_fabric.models import ConvNet
from parity_fabric.utils import (
cuda_reset,
is_cuda_memory_close,
is_state_dict_equal,
@@ -21,16 +21,16 @@
import torch.nn.functional

from lightning.fabric.fabric import Fabric
from tests_fabric.helpers.runif import RunIf
from tests_fabric.parity.models import ConvNet
from tests_fabric.parity.utils import (
from parity_fabric.models import ConvNet
from parity_fabric.utils import (
cuda_reset,
get_model_input_dtype,
is_cuda_memory_close,
is_state_dict_equal,
is_timing_close,
make_deterministic,
)
from tests_fabric.helpers.runif import RunIf


def train_torch(
File renamed without changes.
8 changes: 8 additions & 0 deletions tests/parity_pytorch/__init__.py
@@ -0,0 +1,8 @@
import pytest

from lightning.pytorch.utilities.testing import _runif_reasons


def RunIf(**kwargs):
reasons, marker_kwargs = _runif_reasons(**kwargs)
return pytest.mark.skipif(condition=len(reasons) > 0, reason=f"Requires: [{' + '.join(reasons)}]", **marker_kwargs)
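This new `RunIf` wrapper converts the `(reasons, marker_kwargs)` pair returned by `_runif_reasons` into a single `pytest.mark.skipif` marker. A hypothetical parity test using it (the test body is illustrative only, not from this commit) would be decorated like this:

```python
import torch

from parity_pytorch import RunIf


@RunIf(min_cuda_gpus=1)  # skipped unless a CUDA GPU is present and PL_RUN_CUDA_TESTS=1 is set
def test_cuda_linear_smoke():
    model = torch.nn.Linear(4, 4).cuda()
    out = model(torch.randn(2, 4, device="cuda"))
    assert out.shape == (2, 4)
```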
@@ -13,7 +13,7 @@
# limitations under the License.
import os

from tests_pytorch.benchmarks.test_basic_parity import measure_loops
from parity_pytorch.measure import measure_loops
from tests_pytorch.helpers.advanced_models import ParityModuleMNIST, ParityModuleRNN

NUM_EPOCHS = 20