[fix] Fix OOM when Megatron loads a large model by having only rank 0 load weights #445
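For context on the fix named in the title: the usual pattern for this kind of OOM is to have only rank 0 materialize the checkpoint on CPU and then broadcast the tensors to the other ranks, so host memory holds one copy of the weights instead of one per rank. Below is a minimal sketch of that pattern using torch.distributed; the function name and checkpoint path are illustrative, not verl's actual API.

import torch
import torch.distributed as dist

def load_on_rank0_and_broadcast(model, checkpoint_path):
    # Illustrative sketch, not verl's implementation. Assumes the process
    # group is already initialized and the model object is already
    # constructed (e.g. with random weights) on every rank.
    if dist.get_rank() == 0:
        # Only rank 0 pays the host-memory cost of the full checkpoint.
        state_dict = torch.load(checkpoint_path, map_location="cpu")
        model.load_state_dict(state_dict)
    # Every rank participates in the broadcast; ranks != 0 receive
    # rank 0's tensors. With the NCCL backend the tensors must live on
    # the GPU; with gloo, CPU tensors are fine.
    for param in model.parameters():
        dist.broadcast(param.data, src=0)
    for buf in model.buffers():
        dist.broadcast(buf, src=0)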
Workflow file for this run
name: e2e_lora

on:
  # Trigger the workflow on push or pull request,
  # but only for the main and v0.2.x branches
  push:
    branches:
      - main
      - v0.2.x
    paths:
      - "**/*.py"
      - .github/workflows/e2e_lora.yml
  pull_request:
    branches:
      - main
      - v0.2.x
    paths:
      - "**/*.py"
      - .github/workflows/e2e_lora.yml
      - "tests/e2e/*.sh"
# Declare read-only permission for repository contents.
permissions:
  contents: read
jobs:
  e2e_lora:
    runs-on: [self-hosted, l20-1]
    env:
      HTTP_PROXY: ${{ secrets.PROXY_HTTP }}
      HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }}
      NO_PROXY: "localhost,127.0.0.1"
      HF_HUB_ENABLE_HF_TRANSFER: 1
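    # Pinned verl CI image; the tag reads as PyTorch 2.4.0, CUDA 12.4,
    # vLLM 0.6.3, Ray 2.10, TransformerEngine 1.7 (inferred from the tag).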
    container:
      image: verlai/verl:vemlp-th2.4.0-cu124-vllm0.6.3-ray2.10-te1.7-v0.0.3
      options: --gpus all --shm-size=10g
    steps:
      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          fetch-depth: 0
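      # hf_transfer speeds up Hugging Face Hub downloads (activated by the
      # HF_HUB_ENABLE_HF_TRANSFER env var above); peft provides the LoRA
      # support exercised by this test.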
      - name: Install the current repository
        run: |
          pip3 install hf_transfer peft
          pip3 install -e .[test]
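      # ray stop --force tears down any Ray processes left over from a
      # previous run on this self-hosted runner before each Ray-backed step.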
      - name: Prepare gsm8k dataset
        run: |
          ray stop --force
          python3 examples/data_preprocess/gsm8k.py
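      # Run the LoRA SFT test, then delete its checkpoints so the shared
      # self-hosted runner's disk does not fill up with CI artifacts.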
      - name: Run gsm8k e2e training tests with LoRA
        run: |
          ray stop --force
          bash tests/sft/run_sft_qwen05_peft.sh 8 $HOME/ckpts/
          rm -rf $HOME/ckpts/*