Commit
Merge branch 'master' into amp-plugin-config-validate
awaelchli committed Jun 6, 2023
2 parents 3e20ef9 + 420eb6f commit 017d8f0
Showing 74 changed files with 1,251 additions and 540 deletions.
4 changes: 3 additions & 1 deletion .azure/gpu-tests-fabric.yml
@@ -91,8 +91,10 @@ jobs:
- bash: |
PYTORCH_VERSION=$(python -c "import torch; print(torch.__version__.split('+')[0])")
pip install -q wget packaging
python -m wget https://raw.githubusercontent.com/Lightning-AI/utilities/main/scripts/adjust-torch-versions.py
for fpath in `ls requirements/**/*.txt`; do \
python ./requirements/pytorch/adjust-versions.py $fpath ${PYTORCH_VERSION}; \
python ./adjust-torch-versions.py $fpath ${PYTORCH_VERSION}; \
done
displayName: 'Adjust dependencies'
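
For context, the adjust-torch-versions.py script downloaded above rewrites the torch-family pins in each requirements file so they match the installed PyTorch version. The real script lives in the Lightning-AI/utilities repository; the snippet below is only a rough sketch of the idea, with a hypothetical compatibility table rather than the actual implementation.

import re
import sys

# Hypothetical compatibility table; the real script in Lightning-AI/utilities carries the full mapping.
TORCHVISION_FOR_TORCH = {"2.0.1": "0.15.2", "1.13.1": "0.14.1", "1.12.1": "0.13.1"}

def adjust(requirements_path: str, torch_version: str) -> None:
    out = []
    for line in open(requirements_path):
        pkg = re.split(r"[ <>=!~;#]", line.strip(), maxsplit=1)[0]
        if pkg == "torch":
            line = f"torch =={torch_version}\n"
        elif pkg == "torchvision" and torch_version in TORCHVISION_FOR_TORCH:
            line = f"torchvision =={TORCHVISION_FOR_TORCH[torch_version]}\n"
        out.append(line)
    with open(requirements_path, "w") as fh:
        fh.writelines(out)

if __name__ == "__main__":
    # e.g. python adjust-torch-versions-sketch.py requirements/pytorch/base.txt 2.0.1
    adjust(sys.argv[1], sys.argv[2])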
4 changes: 3 additions & 1 deletion .azure/gpu-tests-pytorch.yml
@@ -98,8 +98,10 @@ jobs:
- bash: |
PYTORCH_VERSION=$(python -c "import torch; print(torch.__version__.split('+')[0])")
pip install -q wget packaging
python -m wget https://raw.githubusercontent.com/Lightning-AI/utilities/main/scripts/adjust-torch-versions.py
for fpath in `ls requirements/**/*.txt`; do \
python ./requirements/pytorch/adjust-versions.py $fpath ${PYTORCH_VERSION}; \
python ./adjust-torch-versions.py $fpath ${PYTORCH_VERSION}; \
done
# prune packages with installation issues
pip install -q -r .actions/requirements.txt
9 changes: 7 additions & 2 deletions .github/checkgroup.yml
@@ -71,8 +71,13 @@ subprojects:
- id: "pytorch_lightning: Benchmarks"
paths:
- ".azure/gpu-benchmarks.yml"
- "tests/tests_pytorch/benchmarks/**"
- "requirements/fabric/**"
- "requirements/pytorch/**"
- "src/lightning/fabric/**"
- "src/lightning/pytorch/**"
- "tests/parity_fabric/**"
- "tests/parity_pytorch/**"
- "!requirements/fabric/docs.txt"
- "!requirements/pytorch/docs.txt"
- "!*.md"
- "!**/*.md"
@@ -139,7 +144,7 @@ subprojects:
- "build-cuda (3.9, 1.12, 11.6.1)"
- "build-cuda (3.9, 1.13, 11.7.1)"
- "build-cuda (3.10, 2.0, 11.7.1)"
- "build-NGC"
#- "build-NGC"
- "build-pl (3.9, 1.11, 11.3.1)"
- "build-pl (3.9, 1.12, 11.6.1)"
- "build-pl (3.9, 1.13, 11.7.1)"
4 changes: 3 additions & 1 deletion .github/workflows/ci-tests-fabric.yml
@@ -84,8 +84,10 @@ jobs:
- name: Adjust PyTorch versions in requirements files
if: ${{ matrix.requires != 'oldest' && matrix.release != 'pre' }}
run: |
pip install -q wget packaging
python -m wget https://raw.githubusercontent.com/Lightning-AI/utilities/main/scripts/adjust-torch-versions.py
for fpath in `ls requirements/**/*.txt`; do \
python ./requirements/pytorch/adjust-versions.py $fpath ${{ matrix.pytorch-version }}; \
python ./adjust-torch-versions.py $fpath ${{ matrix.pytorch-version }}; \
done
cat requirements/fabric/base.txt
4 changes: 3 additions & 1 deletion .github/workflows/ci-tests-pytorch.yml
@@ -88,8 +88,10 @@ jobs:
- name: Adjust PyTorch versions in requirements files
if: ${{ matrix.requires != 'oldest' && matrix.release != 'pre' }}
run: |
pip install -q wget packaging
python -m wget https://raw.githubusercontent.com/Lightning-AI/utilities/main/scripts/adjust-torch-versions.py
for fpath in `ls requirements/**/*.txt`; do \
python ./requirements/pytorch/adjust-versions.py $fpath ${{ matrix.pytorch-version }}; \
python ./adjust-torch-versions.py $fpath ${{ matrix.pytorch-version }}; \
done
cat requirements/pytorch/base.txt
9 changes: 6 additions & 3 deletions .github/workflows/tpu-tests.yml
@@ -5,6 +5,7 @@ on:
branches: [master, "release/*"]
pull_request_target:
branches: [master, "release/*"]
types: [ opened, reopened, edited, ready_for_review, synchronize ]

concurrency:
group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref }}
@@ -25,8 +26,11 @@ env:
jobs:
test-on-tpus:
runs-on: ubuntu-22.04
# run only when the PR title contains '[TPU]' or is a merge to master
if: ${{ startsWith(github.event_name, 'pull_request') && contains(github.event.pull_request.title, '[TPU]') || (github.event_name == 'push' && github.ref == 'refs/heads/master') }}
# run only when the PR title contains 'TPU' or is a merge to master
if: |
(github.event_name == 'push' && github.ref == 'refs/heads/master') ||
(startsWith(github.event_name, 'pull_request') && contains(github.event.pull_request.title, 'TPU'))
strategy:
fail-fast: false
matrix:
@@ -153,7 +157,6 @@ jobs:
- name: Upload coverage to Codecov
uses: codecov/codecov-action@v3
# see: https://github.com/actions/toolkit/issues/399
continue-on-error: true
with:
token: ${{ secrets.CODECOV_TOKEN }}
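
The reworked if: expression near the top of this file gates the job on either a push to master or a pull-request event whose title mentions "TPU". Restated in plain Python for illustration (case handling in the actual GitHub expression may differ):

def should_run_tpu_job(event_name: str, ref: str, pr_title: str = "") -> bool:
    # a push to master always runs the job
    if event_name == "push" and ref == "refs/heads/master":
        return True
    # pull_request / pull_request_target events run only when the title mentions TPU
    return event_name.startswith("pull_request") and "TPU" in pr_title

assert should_run_tpu_job("push", "refs/heads/master")
assert should_run_tpu_job("pull_request_target", "refs/pull/1/merge", "Fix [TPU] tests")
assert not should_run_tpu_job("pull_request", "refs/pull/2/merge", "Fix GPU tests")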
14 changes: 8 additions & 6 deletions README.md
@@ -337,6 +337,10 @@ Fabric is designed for the most complex models like foundation model scaling, LL
+ import lightning as L
import torch; import torchvision as tv

dataset = tv.datasets.CIFAR10("data", download=True,
train=True,
transform=tv.transforms.ToTensor())

+ fabric = L.Fabric()
+ fabric.launch()

@@ -346,9 +350,6 @@ Fabric is designed for the most complex models like foundation model scaling, LL
- model.to(device)
+ model, optimizer = fabric.setup(model, optimizer)

dataset = tv.datasets.CIFAR10("data", download=True,
train=True,
transform=tv.transforms.ToTensor())
dataloader = torch.utils.data.DataLoader(dataset, batch_size=8)
+ dataloader = fabric.setup_dataloaders(dataloader)

@@ -375,16 +376,17 @@ Fabric is designed for the most complex models like foundation model scaling, LL
import lightning as L
import torch; import torchvision as tv

dataset = tv.datasets.CIFAR10("data", download=True,
train=True,
transform=tv.transforms.ToTensor())

fabric = L.Fabric()
fabric.launch()

model = tv.models.resnet18()
optimizer = torch.optim.SGD(model.parameters(), lr=0.001)
model, optimizer = fabric.setup(model, optimizer)

dataset = tv.datasets.CIFAR10("data", download=True,
train=True,
transform=tv.transforms.ToTensor())
dataloader = torch.utils.data.DataLoader(dataset, batch_size=8)
dataloader = fabric.setup_dataloaders(dataloader)
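
The README hunk is cut off here; for completeness, a typical Fabric training loop continuing from the setup above looks roughly like the following (a sketch based on Fabric's documented usage, not text from this README):

model.train()
for epoch in range(2):
    for inputs, labels in dataloader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = torch.nn.functional.cross_entropy(outputs, labels)
        fabric.backward(loss)  # replaces loss.backward(); batches are already on the right device
        optimizer.step()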

4 changes: 3 additions & 1 deletion dockers/base-cuda/Dockerfile
@@ -88,8 +88,10 @@ RUN \
# Disable cache \
pip config set global.cache-dir false && \
# set particular PyTorch version \
pip install -q wget packaging && \
python -m wget https://raw.githubusercontent.com/Lightning-AI/utilities/main/scripts/adjust-torch-versions.py && \
for fpath in `ls requirements/**/*.txt`; do \
python ./requirements/pytorch/adjust-versions.py $fpath ${PYTORCH_VERSION}; \
python ./adjust-torch-versions.py $fpath ${PYTORCH_VERSION}; \
done && \
CUDA_VERSION_MM=${CUDA_VERSION%.*} && \
pip install \
2 changes: 2 additions & 0 deletions dockers/nvidia/Dockerfile
@@ -38,6 +38,8 @@ RUN \
fi && \
# save the examples \
ls -lh lightning/ && \
rm -rf lightning/.git && \
rm -rf lightning/_notebooks/.git && \
mv lightning/_notebooks/.notebooks/ notebooks && \
cp -r lightning/*examples . && \

4 changes: 0 additions & 4 deletions docs/source-fabric/api/fabric_methods.rst
@@ -157,10 +157,6 @@ This eliminates the waiting time to transfer the model parameters from the CPU t
For strategies that handle large sharded models (FSDP, DeepSpeed), the :meth:`~lightning.fabric.fabric.Fabric.init_module` method will allocate the model parameters on the meta device first before sharding.
This makes it possible to work with models that are larger than the memory of a single device.
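
For reference, a minimal usage sketch of the init_module method discussed above, assuming the documented context-manager form (illustrative, not part of this diff):

import lightning as L
import torch

fabric = L.Fabric(accelerator="cuda", devices=1)
fabric.launch()

# Parameters are created directly on the target device (or on the meta device first
# for sharded strategies such as FSDP/DeepSpeed), skipping the intermediate CPU copy.
with fabric.init_module():
    model = torch.nn.Linear(4096, 4096)

model = fabric.setup(model)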

.. tip::

This is a wrapper over :meth:`~lightning.fabric.fabric.Fabric.init` and :meth:`~lightning.fabric.fabric.Fabric.sharded_model` which implement the features described above.
Using these separately can provide more control for expert users.

autocast
========
2 changes: 1 addition & 1 deletion requirements/_integrations/accelerators.txt
@@ -1,3 +1,3 @@
# validation HPU connectors
lightning-habana >=0.1.0
lightning-graphcore >=0.1.0.rc0
lightning-graphcore >=0.1.0.rc3
2 changes: 1 addition & 1 deletion requirements/fabric/strategies.txt
@@ -1,3 +1,3 @@
# NOTE: the upper bound for the package version is only set for CI stability, and it is dropped while installing this package
# in case you want to preserve/enforce restrictions on the latest compatible version, add "strict" as an in-line comment
deepspeed >=0.8.2, <=0.9.1; platform_system != "Windows"
deepspeed >=0.8.2, <=0.9.3; platform_system != "Windows"
2 changes: 1 addition & 1 deletion requirements/fabric/test.txt
@@ -4,4 +4,4 @@ pytest-cov ==4.0.0
pytest-rerunfailures ==10.3
pytest-random-order ==1.1.0
click ==8.1.3
tensorboardX >=2.2, <=2.5.1 # min version is set by torch.onnx missing attribute
tensorboardX >=2.2, <=2.6 # min version is set by torch.onnx missing attribute
63 changes: 0 additions & 63 deletions requirements/pytorch/adjust-versions.py

This file was deleted.

2 changes: 1 addition & 1 deletion requirements/pytorch/extra.txt
@@ -7,4 +7,4 @@ omegaconf >=2.0.5, <2.4.0
hydra-core >=1.0.5, <1.4.0
jsonargparse[signatures] >=4.18.0, <4.22.0
rich >=12.3.0, <=13.0.1
tensorboardX >=2.2, <=2.5.1 # min version is set by torch.onnx missing attribute
tensorboardX >=2.2, <=2.6 # min version is set by torch.onnx missing attribute
2 changes: 1 addition & 1 deletion requirements/pytorch/strategies.txt
@@ -1,3 +1,3 @@
# NOTE: the upper bound for the package version is only set for CI stability, and it is dropped while installing this package
# in case you want to preserve/enforce restrictions on the latest compatible version, add "strict" as an in-line comment
deepspeed >=0.8.2, <=0.9.1; platform_system != "Windows"
deepspeed >=0.8.2, <=0.9.3; platform_system != "Windows"
4 changes: 3 additions & 1 deletion src/lightning/app/CHANGELOG.md
@@ -11,6 +11,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).

- Allow customize `gradio` components with lightning colors ([#17054](https://github.com/Lightning-AI/lightning/pull/17054))

- Added the property `LightningWork.public_ip` that exposes the public IP of the `LightningWork` instance ([#17742](https://github.com/Lightning-AI/lightning/pull/17742))


### Changed

@@ -29,7 +31,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).

### Fixed

-
- Fixed `LightningWork.internal_ip` that was mistakenly exposing the public IP instead; now exposes the private/internal IP address ([#17742](https://github.com/Lightning-AI/lightning/pull/17742))


## [2.0.1.post0] - 2023-04-11
7 changes: 4 additions & 3 deletions src/lightning/app/components/database/server.py
@@ -231,9 +231,10 @@ def db_url(self) -> Optional[str]:
use_localhost = "LIGHTNING_APP_STATE_URL" not in os.environ
if use_localhost:
return self.url
if self.internal_ip != "":
return f"http://{self.internal_ip}:{self.port}"
return self.internal_ip
ip_addr = self.public_ip or self.internal_ip
if ip_addr != "":
return f"http://{ip_addr}:{self.port}"
return ip_addr

def on_exit(self):
self._exit_event.set()
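
The new fallback order in db_url prefers the public address and only falls back to the internal one; a quick illustration with hypothetical values (this helper is not part of the codebase):

def build_db_url(public_ip: str, internal_ip: str, port: int = 8080) -> str:
    ip_addr = public_ip or internal_ip
    return f"http://{ip_addr}:{port}" if ip_addr else ip_addr

print(build_db_url("34.123.45.67", "10.0.0.12"))  # http://34.123.45.67:8080
print(build_db_url("", "10.0.0.12"))              # http://10.0.0.12:8080
print(build_db_url("", ""))                       # empty string, like the original method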
8 changes: 4 additions & 4 deletions src/lightning/app/components/serve/auto_scaler.py
@@ -180,9 +180,9 @@ def __init__(
raise ValueError("cold_start_proxy must be of type ColdStartProxy or str")

def get_internal_url(self) -> str:
if not self._internal_ip:
raise ValueError("Internal IP not set")
return f"http://{self._internal_ip}:{self._port}"
if not self._public_ip:
raise ValueError("Public IP not set")
return f"http://{self._public_ip}:{self._port}"

async def send_batch(self, batch: List[Tuple[str, _BatchRequestModel]], server_url: str):
request_data: List[_LoadBalancer._input_type] = [b[1] for b in batch]
@@ -386,7 +386,7 @@ def update_servers(self, server_works: List[LightningWork]):
"""
old_server_urls = set(self.servers)
current_server_urls = {
f"http://{server._internal_ip}:{server.port}" for server in server_works if server._internal_ip
f"http://{server._public_ip}:{server.port}" for server in server_works if server._internal_ip
}

# doing nothing if no server work has been added/removed
12 changes: 12 additions & 0 deletions src/lightning/app/core/work.py
@@ -60,6 +60,7 @@ class LightningWork:
"_url",
"_restarting",
"_internal_ip",
"_public_ip",
)

_run_executor_cls: Type[WorkRunExecutor] = WorkRunExecutor
@@ -138,6 +139,7 @@ def __init__(
"_url",
"_future_url",
"_internal_ip",
"_public_ip",
"_restarting",
"_cloud_compute",
"_display_name",
@@ -148,6 +150,7 @@
self._url: str = ""
self._future_url: str = "" # The cache URL is meant to defer resolving the url values.
self._internal_ip: str = ""
self._public_ip: str = ""
# setattr_replacement is used by the multiprocessing runtime to send the latest changes to the main coordinator
self._setattr_replacement: Optional[Callable[[str, Any], None]] = None
self._name: str = ""
@@ -212,6 +215,15 @@ def internal_ip(self) -> str:
"""
return self._internal_ip

@property
def public_ip(self) -> str:
"""The public ip address of this LightningWork, reachable from the internet.
By default, this attribute returns the empty string and the ip address will only be returned once the work runs.
Locally, this address is undefined (empty string) and in the cloud it will be determined by the cluster.
"""
return self._public_ip

def _on_init_end(self) -> None:
self._local_build_config.on_work_init(self)
self._cloud_build_config.on_work_init(self, self._cloud_compute)
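
A hypothetical work showing how the new property can be read alongside the existing internal_ip (a sketch only; both values stay empty until the work actually runs, and locally the public address remains undefined):

from lightning.app import LightningApp, LightningFlow, LightningWork

class AddressReporter(LightningWork):
    def run(self):
        # Empty strings locally; populated by the cluster when running in the cloud.
        print(f"internal ip: {self.internal_ip!r}")
        print(f"public ip:   {self.public_ip!r}")

class Root(LightningFlow):
    def __init__(self):
        super().__init__()
        self.reporter = AddressReporter()

    def run(self):
        self.reporter.run()

app = LightningApp(Root())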
3 changes: 2 additions & 1 deletion src/lightning/app/utilities/proxies.py
@@ -494,7 +494,8 @@ def run_once(self):
# Set this here after the state observer is initialized, since it needs to record it as a change and send
# it back to the flow
default_internal_ip = "127.0.0.1" if constants.LIGHTNING_CLOUDSPACE_HOST is None else "0.0.0.0" # noqa: S104
self.work._internal_ip = os.environ.get("LIGHTNING_NODE_IP", default_internal_ip)
self.work._internal_ip = os.environ.get("LIGHTNING_NODE_PRIVATE_IP", default_internal_ip)
self.work._public_ip = os.environ.get("LIGHTNING_NODE_IP", "")

# 8. Patch the setattr method of the work. This needs to be done after step 4, so we don't
# send delta while calling `set_state`.
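
The runtime now reads the private address from LIGHTNING_NODE_PRIVATE_IP and the public one from LIGHTNING_NODE_IP. For local experiments one could stub them before launching, e.g. with hypothetical values:

import os

# Hypothetical addresses mimicking what the cloud runtime would inject
os.environ.setdefault("LIGHTNING_NODE_PRIVATE_IP", "10.0.0.12")  # surfaces as work.internal_ip
os.environ.setdefault("LIGHTNING_NODE_IP", "34.123.45.67")       # surfaces as work.public_ip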