Merge pull request #3477 from akiki-liang0/feat/a3ultra-preview-add-nemo
Add NeMo framework example to a3-Ultra
tpdownes authored Jan 10, 2025
2 parents c6cf753 + 9b9bd8c commit ef16361
Showing 3 changed files with 150 additions and 0 deletions.
39 changes: 39 additions & 0 deletions examples/machine-learning/a3-ultragpu-8g/nemo-framework/Dockerfile
@@ -0,0 +1,39 @@
# Copyright 2024 "Google LLC"
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

ARG NEMOFW_VERSION=24.07
FROM nvcr.io/nvidia/nemo:$NEMOFW_VERSION

# NCCL logging verbosity and subsystems to report
ENV NCCL_DEBUG=INFO,WARN
ENV NCCL_DEBUG_SUBSYS=NET,COLL,GPU,INIT,ENV
# Pin NCCL and Gloo socket traffic to the host's data-plane NICs
ENV NCCL_SOCKET_IFNAME=enp0s19,enp192s20
ENV GLOO_SOCKET_IFNAME=enp0s19,enp192s20
# Topology and chunk-size tuning for collective performance
ENV NCCL_CROSS_NIC=0
ENV NCCL_NET_GDR_LEVEL=PIX
#ENV PMIX_MCA_gds=^ds12
ENV NCCL_P2P_NET_CHUNKSIZE=131072
ENV NCCL_P2P_PCI_CHUNKSIZE=131072
ENV NCCL_P2P_NVL_CHUNKSIZE=524288
ENV NCCL_NVLS_CHUNKSIZE=524288
# RDMA transport settings (GID index, adaptive routing, QPs, traffic classes)
ENV NCCL_IB_GID_INDEX=3
ENV NCCL_IB_ADAPTIVE_ROUTING=1
ENV NCCL_IB_QPS_PER_CONNECTION=4
ENV NCCL_IB_TC=52
ENV NCCL_IB_FIFO_TC=84
# Use the gIB NCCL network plugin and its bundled configuration files
ENV NCCL_SHIMNET_GUEST_CONFIG_CHECKER_CONFIG_FILE=/usr/local/gib/configs/guest_config.txtpb
ENV NCCL_TUNER_CONFIG_PATH=/usr/local/gib/configs/tuner_config.txtpb
ENV NCCL_NET=gIB

RUN echo "/usr/local/gib/lib64" >> /etc/ld.so.conf.d/gib.conf && ldconfig
ENV LD_LIBRARY_PATH=/usr/local/gib/lib64:$LD_LIBRARY_PATH
81 changes: 81 additions & 0 deletions examples/machine-learning/a3-ultragpu-8g/nemo-framework/README.md
@@ -0,0 +1,81 @@
README
======

1. Set up NeMo Framework Container

The setup script makes a few environment-variable modifications to the [nvcr.io/nvidia/nemo:24.07](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/nemo)
container and submits a Slurm job that copies the framework launcher scripts and
a few other auxiliary files into your working directory.

```shell
sbatch setup_nemo.sh
```
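
You can watch the setup job with standard Slurm tooling. A minimal check, assuming the job was submitted from your working directory:

```shell
squeue -u $USER  # the setup job should appear here until it completes
# after it finishes, confirm the files it copies are present
ls launcher_scripts auto_configurator requirements.txt
```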

2. Install NeMo Framework Requirements

We suggest using a virtual environment. The following installs the components
necessary to submit jobs using the NeMo Framework launcher.

```shell
python3 -m venv env
source env/bin/activate
pip install -r requirements.txt  # copied from the NeMo Framework container earlier
# Upgrading hydra-core is needed for NeMo 24.07 to work with Python 3.11,
# the version shipped with Debian 12
pip install -U hydra-core
```
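
As an optional sanity check, you can confirm the upgraded hydra-core imports cleanly from the virtual environment:

```shell
python -c "import hydra; print(hydra.__version__)"
```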

3. Run an example NeMo Framework Pre-Training

First, prepare the cache. This downloads several files, needed to load the
tokenizer for training, into the `~/.cache/huggingface` folder.

```shell
pip install transformers
python -c "from transformers import AutoTokenizer; \
AutoTokenizer.from_pretrained('gpt2')"
```
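
To confirm the cache will satisfy tokenizer loads on nodes without external network access, you can repeat the load in offline mode; `TRANSFORMERS_OFFLINE=1` forces `transformers` to read only from the local cache:

```shell
TRANSFORMERS_OFFLINE=1 python -c "from transformers import AutoTokenizer; \
AutoTokenizer.from_pretrained('gpt2')"
```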

The following runs an example that trains a 5B-parameter GPT-3 model for 10
steps using mock data as input.

```shell
cd launcher_scripts
mkdir data

MAX_STEPS=10
NUM_NODES=8

python main.py \
launcher_scripts_path=${PWD} \
stages=[training] \
training=gpt3/5b \
env_vars.TRANSFORMERS_OFFLINE=0 \
container=../nemo-24.07.sqsh \
container_mounts=[${HOME}/.cache,/usr/local/gib] \
cluster.srun_args=["--container-writable"] \
training.model.data.data_impl=mock \
training.model.data.data_prefix=[] \
training.trainer.max_steps=${MAX_STEPS} \
training.trainer.val_check_interval=${MAX_STEPS} \
training.trainer.limit_val_batches=0.0 \
training.exp_manager.create_checkpoint_callback=False \
training.exp_manager.resume_if_exists=False \
training.trainer.num_nodes=${NUM_NODES}
```

This will submit a pre-training job to your Slurm cluster. Once it starts, you
will see results appearing in `results/gpt3_5b/`. For this example, the job
should only take a few minutes.
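
While the job runs, you can follow progress with standard Slurm commands and by tailing the training logs; the exact log file names below are an assumption about the results layout:

```shell
squeue -u $USER                # confirm the job is queued or running
tail -f results/gpt3_5b/*.out  # stream training output (names may vary)
```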

Next Steps
----------

Now that you've run an example training workload, you may find it preferable to
customize `conf/cluster/bcm.yaml`, `conf/config.yaml`, and the training
configuration file of your choosing, rather than passing command-line arguments.
For real training workloads you'll also want to use real data in place of the
mock datasets used here, and to explore all of the tuning and configuration
parameters the NeMo Framework offers for your use case.
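
For example, you could copy the stock training config and point Hydra at your copy instead of overriding values on the command line (the file names here are illustrative; any config under `conf/training/` can be selected the same way):

```shell
cp conf/training/gpt3/5b.yaml conf/training/gpt3/5b_custom.yaml
# edit conf/training/gpt3/5b_custom.yaml, then select it by config group:
python main.py launcher_scripts_path=${PWD} stages=[training] training=gpt3/5b_custom
```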
30 changes: 30 additions & 0 deletions examples/machine-learning/a3-ultragpu-8g/nemo-framework/setup_nemo.sh
@@ -0,0 +1,30 @@
#!/bin/bash
# Copyright 2024 "Google LLC"
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
#SBATCH --partition=a3ultra
#SBATCH --exclusive

: "${NEMOFW_VERSION:=24.07}"

srun docker build --build-arg="NEMOFW_VERSION=${NEMOFW_VERSION}" -t nemo-"${NEMOFW_VERSION}" .
srun rm -f nemo-"${NEMOFW_VERSION}".sqsh
srun enroot import dockerd://nemo-"${NEMOFW_VERSION}"

srun \
--container-mounts="${PWD}":/workspace/mount_dir,/var/tmp:/var/tmp \
--container-image=./nemo-"${NEMOFW_VERSION}".sqsh \
bash -c "cp -r /opt/NeMo-Framework-Launcher/requirements.txt /opt/NeMo-Framework-Launcher/launcher_scripts /opt/NeMo-Framework-Launcher/auto_configurator /workspace/mount_dir/"
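
# Usage note: because NEMOFW_VERSION is defaulted above with a shell parameter
# expansion, you can build a different NeMo release by setting the variable at
# submission time (sbatch propagates the submitting shell's environment by
# default). Replace the tag with any published nvcr.io/nvidia/nemo release:
#   NEMOFW_VERSION=24.07 sbatch setup_nemo.sh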
