Commit
Merge branch 'master' into amp-plugin-config-validate
awaelchli committed Jun 6, 2023
2 parents 3e20ef9 + 420eb6f commit 017d8f0
Showing 74 changed files with 1,251 additions and 540 deletions.
4 changes: 3 additions & 1 deletion .azure/gpu-tests-fabric.yml
@@ -91,8 +91,10 @@ jobs:
- bash: |
PYTORCH_VERSION=$(python -c "import torch; print(torch.__version__.split('+')[0])")
pip install -q wget packaging
python -m wget https://raw.githubusercontent.com/Lightning-AI/utilities/main/scripts/adjust-torch-versions.py
for fpath in `ls requirements/**/*.txt`; do \
python ./requirements/pytorch/adjust-versions.py $fpath ${PYTORCH_VERSION}; \
python ./adjust-torch-versions.py $fpath ${PYTORCH_VERSION}; \
done
displayName: 'Adjust dependencies'
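
For context, the adjust-torch-versions.py script downloaded above rewrites the torch-family pins in each requirements file so they match the installed PyTorch version. The real script lives in the Lightning-AI/utilities repository; the snippet below is only a rough sketch of the idea, with a hypothetical compatibility table rather than the actual implementation.

import re
import sys

# Hypothetical compatibility table; the real script in Lightning-AI/utilities carries the full mapping.
TORCHVISION_FOR_TORCH = {"2.0.1": "0.15.2", "1.13.1": "0.14.1", "1.12.1": "0.13.1"}

def adjust(requirements_path: str, torch_version: str) -> None:
    out = []
    for line in open(requirements_path):
        pkg = re.split(r"[ <>=!~;#]", line.strip(), maxsplit=1)[0]
        if pkg == "torch":
            line = f"torch =={torch_version}\n"
        elif pkg == "torchvision" and torch_version in TORCHVISION_FOR_TORCH:
            line = f"torchvision =={TORCHVISION_FOR_TORCH[torch_version]}\n"
        out.append(line)
    with open(requirements_path, "w") as fh:
        fh.writelines(out)

if __name__ == "__main__":
    # e.g. python adjust-torch-versions-sketch.py requirements/pytorch/base.txt 2.0.1
    adjust(sys.argv[1], sys.argv[2])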
4 changes: 3 additions & 1 deletion .azure/gpu-tests-pytorch.yml
@@ -98,8 +98,10 @@ jobs:
- bash: |
PYTORCH_VERSION=$(python -c "import torch; print(torch.__version__.split('+')[0])")
pip install -q wget packaging
python -m wget https://raw.githubusercontent.com/Lightning-AI/utilities/main/scripts/adjust-torch-versions.py
for fpath in `ls requirements/**/*.txt`; do \
python ./requirements/pytorch/adjust-versions.py $fpath ${PYTORCH_VERSION}; \
python ./adjust-torch-versions.py $fpath ${PYTORCH_VERSION}; \
done
# prune packages with installation issues
pip install -q -r .actions/requirements.txt
9 changes: 7 additions & 2 deletions .github/checkgroup.yml
@@ -71,8 +71,13 @@ subprojects:
- id: "pytorch_lightning: Benchmarks"
paths:
- ".azure/gpu-benchmarks.yml"
- "tests/tests_pytorch/benchmarks/**"
- "requirements/fabric/**"
- "requirements/pytorch/**"
- "src/lightning/fabric/**"
- "src/lightning/pytorch/**"
- "tests/parity_fabric/**"
- "tests/parity_pytorch/**"
- "!requirements/fabric/docs.txt"
- "!requirements/pytorch/docs.txt"
- "!*.md"
- "!**/*.md"
@@ -139,7 +144,7 @@ subprojects:
- "build-cuda (3.9, 1.12, 11.6.1)"
- "build-cuda (3.9, 1.13, 11.7.1)"
- "build-cuda (3.10, 2.0, 11.7.1)"
- "build-NGC"
#- "build-NGC"
- "build-pl (3.9, 1.11, 11.3.1)"
- "build-pl (3.9, 1.12, 11.6.1)"
- "build-pl (3.9, 1.13, 11.7.1)"
4 changes: 3 additions & 1 deletion .github/workflows/ci-tests-fabric.yml
@@ -84,8 +84,10 @@ jobs:
- name: Adjust PyTorch versions in requirements files
if: ${{ matrix.requires != 'oldest' && matrix.release != 'pre' }}
run: |
pip install -q wget packaging
python -m wget https://raw.githubusercontent.com/Lightning-AI/utilities/main/scripts/adjust-torch-versions.py
for fpath in `ls requirements/**/*.txt`; do \
python ./requirements/pytorch/adjust-versions.py $fpath ${{ matrix.pytorch-version }}; \
python ./adjust-torch-versions.py $fpath ${{ matrix.pytorch-version }}; \
done
cat requirements/fabric/base.txt
4 changes: 3 additions & 1 deletion .github/workflows/ci-tests-pytorch.yml
@@ -88,8 +88,10 @@ jobs:
- name: Adjust PyTorch versions in requirements files
if: ${{ matrix.requires != 'oldest' && matrix.release != 'pre' }}
run: |
pip install -q wget packaging
python -m wget https://raw.githubusercontent.com/Lightning-AI/utilities/main/scripts/adjust-torch-versions.py
for fpath in `ls requirements/**/*.txt`; do \
python ./requirements/pytorch/adjust-versions.py $fpath ${{ matrix.pytorch-version }}; \
python ./adjust-torch-versions.py $fpath ${{ matrix.pytorch-version }}; \
done
cat requirements/pytorch/base.txt
9 changes: 6 additions & 3 deletions .github/workflows/tpu-tests.yml
@@ -5,6 +5,7 @@ on:
branches: [master, "release/*"]
pull_request_target:
branches: [master, "release/*"]
types: [ opened, reopened, edited, ready_for_review, synchronize ]

concurrency:
group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref }}
@@ -25,8 +26,11 @@ env:
jobs:
test-on-tpus:
runs-on: ubuntu-22.04
# run only when the PR title contains '[TPU]' or is a merge to master
if: ${{ startsWith(github.event_name, 'pull_request') && contains(github.event.pull_request.title, '[TPU]') || (github.event_name == 'push' && github.ref == 'refs/heads/master') }}
# run only when the PR title contains 'TPU' or is a merge to master
if: |
(github.event_name == 'push' && github.ref == 'refs/heads/master') ||
(startsWith(github.event_name, 'pull_request') && contains(github.event.pull_request.title, 'TPU'))
strategy:
fail-fast: false
matrix:
@@ -153,7 +157,6 @@ jobs:
- name: Upload coverage to Codecov
uses: codecov/codecov-action@v3
# see: https://github.com/actions/toolkit/issues/399
continue-on-error: true
with:
token: ${{ secrets.CODECOV_TOKEN }}
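
The reworked if: expression near the top of this file gates the job on either a push to master or a pull-request event whose title mentions "TPU". Restated in plain Python for illustration (case handling in the actual GitHub expression may differ):

def should_run_tpu_job(event_name: str, ref: str, pr_title: str = "") -> bool:
    # a push to master always runs the job
    if event_name == "push" and ref == "refs/heads/master":
        return True
    # pull_request / pull_request_target events run only when the title mentions TPU
    return event_name.startswith("pull_request") and "TPU" in pr_title

assert should_run_tpu_job("push", "refs/heads/master")
assert should_run_tpu_job("pull_request_target", "refs/pull/1/merge", "Fix [TPU] tests")
assert not should_run_tpu_job("pull_request", "refs/pull/2/merge", "Fix GPU tests")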
14 changes: 8 additions & 6 deletions README.md
@@ -337,6 +337,10 @@ Fabric is designed for the most complex models like foundation model scaling, LL
+ import lightning as L
import torch; import torchvision as tv

dataset = tv.datasets.CIFAR10("data", download=True,
train=True,
transform=tv.transforms.ToTensor())

+ fabric = L.Fabric()
+ fabric.launch()

@@ -346,9 +350,6 @@ Fabric is designed for the most complex models like foundation model scaling, LL
- model.to(device)
+ model, optimizer = fabric.setup(model, optimizer)

dataset = tv.datasets.CIFAR10("data", download=True,
train=True,
transform=tv.transforms.ToTensor())
dataloader = torch.utils.data.DataLoader(dataset, batch_size=8)
+ dataloader = fabric.setup_dataloaders(dataloader)

@@ -375,16 +376,17 @@ Fabric is designed for the most complex models like foundation model scaling, LL
import lightning as L
import torch; import torchvision as tv

dataset = tv.datasets.CIFAR10("data", download=True,
train=True,
transform=tv.transforms.ToTensor())

fabric = L.Fabric()
fabric.launch()

model = tv.models.resnet18()
optimizer = torch.optim.SGD(model.parameters(), lr=0.001)
model, optimizer = fabric.setup(model, optimizer)

dataset = tv.datasets.CIFAR10("data", download=True,
train=True,
transform=tv.transforms.ToTensor())
dataloader = torch.utils.data.DataLoader(dataset, batch_size=8)
dataloader = fabric.setup_dataloaders(dataloader)
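
The README hunk is cut off here; for completeness, a typical Fabric training loop continuing from the setup above looks roughly like the following (a sketch based on Fabric's documented usage, not text from this README):

model.train()
for epoch in range(2):
    for inputs, labels in dataloader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = torch.nn.functional.cross_entropy(outputs, labels)
        fabric.backward(loss)  # replaces loss.backward(); batches are already on the right device
        optimizer.step()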

4 changes: 3 additions & 1 deletion dockers/base-cuda/Dockerfile
@@ -88,8 +88,10 @@ RUN \
# Disable cache \
pip config set global.cache-dir false && \
# set particular PyTorch version \
pip install -q wget packaging && \
python -m wget https://raw.githubusercontent.com/Lightning-AI/utilities/main/scripts/adjust-torch-versions.py && \
for fpath in `ls requirements/**/*.txt`; do \
python ./requirements/pytorch/adjust-versions.py $fpath ${PYTORCH_VERSION}; \
python ./adjust-torch-versions.py $fpath ${PYTORCH_VERSION}; \
done && \
CUDA_VERSION_MM=${CUDA_VERSION%.*} && \
pip install \
2 changes: 2 additions & 0 deletions dockers/nvidia/Dockerfile
@@ -38,6 +38,8 @@ RUN \
fi && \
# save the examples \
ls -lh lightning/ && \
rm -rf lightning/.git && \
rm -rf lightning/_notebooks/.git && \
mv lightning/_notebooks/.notebooks/ notebooks && \
cp -r lightning/*examples . && \

4 changes: 0 additions & 4 deletions docs/source-fabric/api/fabric_methods.rst
@@ -157,10 +157,6 @@ This eliminates the waiting time to transfer the model parameters from the CPU t
For strategies that handle large sharded models (FSDP, DeepSpeed), the :meth:`~lightning.fabric.fabric.Fabric.init_module` method will allocate the model parameters on the meta device first before sharding.
This makes it possible to work with models that are larger than the memory of a single device.
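
For reference, a minimal usage sketch of the init_module method discussed above, assuming the documented context-manager form (illustrative, not part of this diff):

import lightning as L
import torch

fabric = L.Fabric(accelerator="cuda", devices=1)
fabric.launch()

# Parameters are created directly on the target device (or on the meta device first
# for sharded strategies such as FSDP/DeepSpeed), skipping the intermediate CPU copy.
with fabric.init_module():
    model = torch.nn.Linear(4096, 4096)

model = fabric.setup(model)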

.. tip::

This is a wrapper over :meth:`~lightning.fabric.fabric.Fabric.init` and :meth:`~lightning.fabric.fabric.Fabric.sharded_model` which implement the features described above.
Using these separately can provide more control for expert users.

autocast
========
2 changes: 1 addition & 1 deletion requirements/_integrations/accelerators.txt
@@ -1,3 +1,3 @@
# validation HPU connectors
lightning-habana >=0.1.0
lightning-graphcore >=0.1.0.rc0
lightning-graphcore >=0.1.0.rc3
2 changes: 1 addition & 1 deletion requirements/fabric/strategies.txt
@@ -1,3 +1,3 @@
# NOTE: the upper bound for the package version is only set for CI stability, and it is dropped while installing this package
# in case you want to preserve/enforce restrictions on the latest compatible version, add "strict" as an in-line comment
deepspeed >=0.8.2, <=0.9.1; platform_system != "Windows"
deepspeed >=0.8.2, <=0.9.3; platform_system != "Windows"
2 changes: 1 addition & 1 deletion requirements/fabric/test.txt
@@ -4,4 +4,4 @@ pytest-cov ==4.0.0
pytest-rerunfailures ==10.3
pytest-random-order ==1.1.0
click ==8.1.3
tensorboardX >=2.2, <=2.5.1 # min version is set by torch.onnx missing attribute
tensorboardX >=2.2, <=2.6 # min version is set by torch.onnx missing attribute
63 changes: 0 additions & 63 deletions requirements/pytorch/adjust-versions.py

This file was deleted.

2 changes: 1 addition & 1 deletion requirements/pytorch/extra.txt
@@ -7,4 +7,4 @@ omegaconf >=2.0.5, <2.4.0
hydra-core >=1.0.5, <1.4.0
jsonargparse[signatures] >=4.18.0, <4.22.0
rich >=12.3.0, <=13.0.1
tensorboardX >=2.2, <=2.5.1 # min version is set by torch.onnx missing attribute
tensorboardX >=2.2, <=2.6 # min version is set by torch.onnx missing attribute
2 changes: 1 addition & 1 deletion requirements/pytorch/strategies.txt
@@ -1,3 +1,3 @@
# NOTE: the upper bound for the package version is only set for CI stability, and it is dropped while installing this package
# in case you want to preserve/enforce restrictions on the latest compatible version, add "strict" as an in-line comment
deepspeed >=0.8.2, <=0.9.1; platform_system != "Windows"
deepspeed >=0.8.2, <=0.9.3; platform_system != "Windows"
4 changes: 3 additions & 1 deletion src/lightning/app/CHANGELOG.md
@@ -11,6 +11,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).

- Allow customize `gradio` components with lightning colors ([#17054](https://github.com/Lightning-AI/lightning/pull/17054))

- Added the property `LightningWork.public_ip` that exposes the public IP of the `LightningWork` instance ([#17742](https://github.com/Lightning-AI/lightning/pull/17742))


### Changed

@@ -29,7 +31,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).

### Fixed

-
- Fixed `LightningWork.internal_ip` that was mistakenly exposing the public IP instead; now exposes the private/internal IP address ([#17742](https://github.com/Lightning-AI/lightning/pull/17742))


## [2.0.1.post0] - 2023-04-11
7 changes: 4 additions & 3 deletions src/lightning/app/components/database/server.py
@@ -231,9 +231,10 @@ def db_url(self) -> Optional[str]:
use_localhost = "LIGHTNING_APP_STATE_URL" not in os.environ
if use_localhost:
return self.url
if self.internal_ip != "":
return f"http://{self.internal_ip}:{self.port}"
return self.internal_ip
ip_addr = self.public_ip or self.internal_ip
if ip_addr != "":
return f"http://{ip_addr}:{self.port}"
return ip_addr

def on_exit(self):
self._exit_event.set()
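
The new fallback order in db_url prefers the public address and only falls back to the internal one; a quick illustration with hypothetical values (this helper is not part of the codebase):

def build_db_url(public_ip: str, internal_ip: str, port: int = 8080) -> str:
    ip_addr = public_ip or internal_ip
    return f"http://{ip_addr}:{port}" if ip_addr else ip_addr

print(build_db_url("34.123.45.67", "10.0.0.12"))  # http://34.123.45.67:8080
print(build_db_url("", "10.0.0.12"))              # http://10.0.0.12:8080
print(build_db_url("", ""))                       # empty string, like the original method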
8 changes: 4 additions & 4 deletions src/lightning/app/components/serve/auto_scaler.py
@@ -180,9 +180,9 @@ def __init__(
raise ValueError("cold_start_proxy must be of type ColdStartProxy or str")

def get_internal_url(self) -> str:
if not self._internal_ip:
raise ValueError("Internal IP not set")
return f"http://{self._internal_ip}:{self._port}"
if not self._public_ip:
raise ValueError("Public IP not set")
return f"http://{self._public_ip}:{self._port}"

async def send_batch(self, batch: List[Tuple[str, _BatchRequestModel]], server_url: str):
request_data: List[_LoadBalancer._input_type] = [b[1] for b in batch]
@@ -386,7 +386,7 @@ def update_servers(self, server_works: List[LightningWork]):
"""
old_server_urls = set(self.servers)
current_server_urls = {
f"http://{server._internal_ip}:{server.port}" for server in server_works if server._internal_ip
f"http://{server._public_ip}:{server.port}" for server in server_works if server._internal_ip
}

# doing nothing if no server work has been added/removed
12 changes: 12 additions & 0 deletions src/lightning/app/core/work.py
@@ -60,6 +60,7 @@ class LightningWork:
"_url",
"_restarting",
"_internal_ip",
"_public_ip",
)

_run_executor_cls: Type[WorkRunExecutor] = WorkRunExecutor
@@ -138,6 +139,7 @@ def __init__(
"_url",
"_future_url",
"_internal_ip",
"_public_ip",
"_restarting",
"_cloud_compute",
"_display_name",
@@ -148,6 +150,7 @@
self._url: str = ""
self._future_url: str = "" # The cache URL is meant to defer resolving the url values.
self._internal_ip: str = ""
self._public_ip: str = ""
# setattr_replacement is used by the multiprocessing runtime to send the latest changes to the main coordinator
self._setattr_replacement: Optional[Callable[[str, Any], None]] = None
self._name: str = ""
@@ -212,6 +215,15 @@ def internal_ip(self) -> str:
"""
return self._internal_ip

@property
def public_ip(self) -> str:
"""The public ip address of this LightningWork, reachable from the internet.
By default, this attribute returns the empty string and the ip address will only be returned once the work runs.
Locally, this address is undefined (empty string) and in the cloud it will be determined by the cluster.
"""
return self._public_ip

def _on_init_end(self) -> None:
self._local_build_config.on_work_init(self)
self._cloud_build_config.on_work_init(self, self._cloud_compute)
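
A hypothetical work showing how the new property can be read alongside the existing internal_ip (a sketch only; both values stay empty until the work actually runs, and locally the public address remains undefined):

from lightning.app import LightningApp, LightningFlow, LightningWork

class AddressReporter(LightningWork):
    def run(self):
        # Empty strings locally; populated by the cluster when running in the cloud.
        print(f"internal ip: {self.internal_ip!r}")
        print(f"public ip:   {self.public_ip!r}")

class Root(LightningFlow):
    def __init__(self):
        super().__init__()
        self.reporter = AddressReporter()

    def run(self):
        self.reporter.run()

app = LightningApp(Root())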
3 changes: 2 additions & 1 deletion src/lightning/app/utilities/proxies.py
@@ -494,7 +494,8 @@ def run_once(self):
# Set this here after the state observer is initialized, since it needs to record it as a change and send
# it back to the flow
default_internal_ip = "127.0.0.1" if constants.LIGHTNING_CLOUDSPACE_HOST is None else "0.0.0.0" # noqa: S104
self.work._internal_ip = os.environ.get("LIGHTNING_NODE_IP", default_internal_ip)
self.work._internal_ip = os.environ.get("LIGHTNING_NODE_PRIVATE_IP", default_internal_ip)
self.work._public_ip = os.environ.get("LIGHTNING_NODE_IP", "")

# 8. Patch the setattr method of the work. This needs to be done after step 4, so we don't
# send delta while calling `set_state`.
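
The runtime now reads the private address from LIGHTNING_NODE_PRIVATE_IP and the public one from LIGHTNING_NODE_IP. For local experiments one could stub them before launching, e.g. with hypothetical values:

import os

# Hypothetical addresses mimicking what the cloud runtime would inject
os.environ.setdefault("LIGHTNING_NODE_PRIVATE_IP", "10.0.0.12")  # surfaces as work.internal_ip
os.environ.setdefault("LIGHTNING_NODE_IP", "34.123.45.67")       # surfaces as work.public_ip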