From fbab1e9e4174b7bf5395ff25dedcb7deb7e1f0c5 Mon Sep 17 00:00:00 2001
From: Guang Yang
Date: Thu, 11 Apr 2024 21:46:24 -0700
Subject: [PATCH] Fix CI and validation scripts

---
 .ci/scripts/gather_test_models.py |  65 ++++++++++--
 .ci/scripts/validate.sh           |  25 ++---
 .github/workflows/periodic.yml    | 168 ++++++++++++++++++++++++++++++
 .github/workflows/pull.yml        | 157 ++++++++++++++++++++++++----
 scripts/install_et.sh             |   3 +-
 5 files changed, 380 insertions(+), 38 deletions(-)

diff --git a/.ci/scripts/gather_test_models.py b/.ci/scripts/gather_test_models.py
index 51401d0c5..26dd5254a 100644
--- a/.ci/scripts/gather_test_models.py
+++ b/.ci/scripts/gather_test_models.py
@@ -15,18 +15,63 @@
     "tinyllamas/stories15M": "https://huggingface.co/karpathy/tinyllamas/resolve/main/stories15M.pt,https://github.com/karpathy/llama2.c/raw/master/tokenizer.model,https://github.com/karpathy/llama2.c/raw/master/tokenizer.bin",
     # "tinyllamas/stories42M": "https://huggingface.co/karpathy/tinyllamas/resolve/main/stories42M.pt,https://github.com/karpathy/llama2.c/raw/master/tokenizer.model,https://github.com/karpathy/llama2.c/raw/master/tokenizer.bin",
     "tinyllamas/stories110M": "https://huggingface.co/karpathy/tinyllamas/resolve/main/stories110M.pt,https://github.com/karpathy/llama2.c/raw/master/tokenizer.model,https://github.com/karpathy/llama2.c/raw/master/tokenizer.bin",
+    "openlm-research/open_llama_7b": "https://huggingface.co/openlm-research/open_llama_7b/resolve/main/config.json,https://huggingface.co/openlm-research/open_llama_7b/resolve/main/generation_config.json,https://huggingface.co/openlm-research/open_llama_7b/resolve/main/pytorch_model-00001-of-00002.bin,https://huggingface.co/openlm-research/open_llama_7b/resolve/main/pytorch_model-00002-of-00002.bin,https://huggingface.co/openlm-research/open_llama_7b/resolve/main/pytorch_model.bin.index.json,https://huggingface.co/openlm-research/open_llama_7b/resolve/main/special_tokens_map.json,https://huggingface.co/openlm-research/open_llama_7b/resolve/main/tokenizer.model,https://huggingface.co/openlm-research/open_llama_7b/resolve/main/tokenizer.model,https://huggingface.co/openlm-research/open_llama_7b/resolve/main/tokenizer_config.json",
     "mistralai/Mistral-7B-v0.1": "https://huggingface.co/mistralai/Mistral-7B-v0.1/resolve/main/config.json,https://huggingface.co/mistralai/Mistral-7B-v0.1/resolve/main/generation_config.json,https://huggingface.co/mistralai/Mistral-7B-v0.1/resolve/main/pytorch_model-00001-of-00002.bin,https://huggingface.co/mistralai/Mistral-7B-v0.1/resolve/main/pytorch_model-00002-of-00002.bin,https://huggingface.co/mistralai/Mistral-7B-v0.1/resolve/main/pytorch_model.bin.index.json,https://huggingface.co/mistralai/Mistral-7B-v0.1/resolve/main/special_tokens_map.json,https://huggingface.co/mistralai/Mistral-7B-v0.1/resolve/main/tokenizer.json,https://huggingface.co/mistralai/Mistral-7B-v0.1/resolve/main/tokenizer.model,https://huggingface.co/mistralai/Mistral-7B-v0.1/resolve/main/tokenizer_config.json",
     "mistralai/Mistral-7B-Instruct-v0.1": "https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1/resolve/main/config.json,https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1/resolve/main/generation_config.json,https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1/resolve/main/pytorch_model-00001-of-00002.bin,https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1/resolve/main/pytorch_model-00002-of-00002.bin,https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1/resolve/main/pytorch_model.bin.index.json,https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1/resolve/main/special_tokens_map.json,https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1/resolve/main/tokenizer.json,https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1/resolve/main/tokenizer.model,https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1/resolve/main/tokenizer_config.json",
     "mistralai/Mistral-7B-Instruct-v0.2": "https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2/resolve/main/config.json,https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2/resolve/main/generation_config.json,https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2/resolve/main/pytorch_model-00001-of-00003.bin,https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2/resolve/main/pytorch_model-00002-of-00003.bin,https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2/resolve/main/pytorch_model-00003-of-00003.bin,https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2/resolve/main/pytorch_model.bin.index.json,https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2/resolve/main/special_tokens_map.json,https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2/resolve/main/tokenizer.json,https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2/resolve/main/tokenizer.model,https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2/resolve/main/tokenizer_config.json",
 }

 JOB_RUNNERS = {
-    "32-core-ubuntu": "linux x86",
-    # "macos-13": "macos x86",  # not working for ExecuTorch yet
-    "macos-14": "macos M1",
+    "cpu": {
+        "32-core-ubuntu": "x86_64",
+        # "macos-12": "x86_64",  # not working for compile and ExecuTorch yet
+        "macos-14": "aarch64",
+    },
+    "gpu": {
+        "linux.g5.4xlarge.nvidia.gpu": "cuda",
+    },
 }


+def parse_args() -> Any:
+    from argparse import ArgumentParser
+
+    parser = ArgumentParser("Gather all models to test on CI for the target OS")
+    parser.add_argument(
+        "-e",
+        "--event",
+        type=str,
+        choices=["pull_request", "push", "periodic"],
+        required=True,
+        help="GitHub CI Event. See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#on",
+    )
+    parser.add_argument(
+        "-b",
+        "--backend",
+        type=str,
+        choices=["cpu", "gpu"],
+        required=True,
+        help="Supported backends to run. ['cpu', 'gpu']",
+    )
+
+    return parser.parse_args()
+
+
+def model_should_run_on_event(model: str, event: str) -> bool:
+    """
+    A helper function to decide whether a model should be tested on a given event (pull_request/push/periodic).
+    We run higher-priority, fast models on pull_request and the rest on push.
+ """ + if event == "pull_request": + return model in ["tinyllamas/stories15M"] + elif event == "push": + return model in [] + elif event == "periodic": + return model in ["mistralai/Mistral-7B-v0.1", "openlm-research/open_llama_7b"] + else: + return False + + def set_output(name: str, val: Any) -> None: """ Set the GitHb output so that it can be accessed by other jobs @@ -45,19 +90,27 @@ def export_models_for_ci() -> dict[str, dict]: This gathers all the models that we want to test on GitHub OSS CI """ + args = parse_args() + event = args.event + backend = args.backend + # This is the JSON syntax for configuration matrix used by GitHub # https://docs.github.com/en/actions/using-jobs/using-a-matrix-for-your-jobs models = {"include": []} for repo_name, runner in itertools.product( MODEL_REPOS.keys(), - JOB_RUNNERS.keys(), + JOB_RUNNERS[backend].items(), ): + if not model_should_run_on_event(repo_name, event): + continue + record = { "repo_name": repo_name, + "model_name": repo_name.split("/")[-1], "resources": MODEL_REPOS[repo_name], - "runner": runner, - "platform": JOB_RUNNERS[runner], + "runner": runner[0], + "platform": runner[1], "timeout": 90, } diff --git a/.ci/scripts/validate.sh b/.ci/scripts/validate.sh index 4e3fba65c..d4bc75da5 100644 --- a/.ci/scripts/validate.sh +++ b/.ci/scripts/validate.sh @@ -15,7 +15,7 @@ function generate_eager_model_output() { local MODEL_DIR="${CHECKPOINT_PATH%/*}" local MODEL_NAME=$(basename "$CHECKPOINT_PATH" | sed 's/\.[^.]*$//') echo "Run inference with eager model for $MODEL_NAME" - python -W ignore generate.py --checkpoint-path "$CHECKPOINT_PATH" --prompt "$PROMPT" --device "$TARGET_DEVICE" > "$MODEL_DIR/output_eager" + python -W ignore generate.py --checkpoint-path "$CHECKPOINT_PATH" --prompt "$PROMPT" --device "$TARGET_DEVICE" > "$MODEL_DIR/output_eager" || exit 1 cat "$MODEL_DIR/output_eager" } @@ -25,7 +25,7 @@ function generate_compiled_model_output() { local MODEL_DIR="${CHECKPOINT_PATH%/*}" local MODEL_NAME=$(basename "$CHECKPOINT_PATH" | sed 's/\.[^.]*$//') echo ""############### Run inference with torch.compile for $MODEL_NAME "###############" - python -W ignore generate.py --compile --checkpoint-path "$CHECKPOINT_PATH" --prompt "$PROMPT" --device "$TARGET_DEVICE" > "$MODEL_DIR/output_compiled" + python -W ignore generate.py --compile --checkpoint-path "$CHECKPOINT_PATH" --prompt "$PROMPT" --device "$TARGET_DEVICE" > "$MODEL_DIR/output_compiled" || exit 1 cat "$MODEL_DIR/output_compiled" } @@ -36,7 +36,7 @@ function generate_aoti_model_output() { local MODEL_NAME=$(basename "$CHECKPOINT_PATH" | sed 's/\.[^.]*$//') echo ""############### Run inference with AOTInductor for $MODEL_NAME "###############" python -W ignore export.py --checkpoint-path "$CHECKPOINT_PATH" --output-dso-path "${MODEL_DIR}/${MODEL_NAME}.so" --device "$TARGET_DEVICE" - python -W ignore generate.py --checkpoint-path "$CHECKPOINT_PATH" --dso-path "$MODEL_DIR/${MODEL_NAME}.so" --prompt "$PROMPT" --device "$TARGET_DEVICE" > "$MODEL_DIR/output_aoti" + python -W ignore generate.py --checkpoint-path "$CHECKPOINT_PATH" --dso-path "$MODEL_DIR/${MODEL_NAME}.so" --prompt "$PROMPT" --device "$TARGET_DEVICE" > "$MODEL_DIR/output_aoti" || exit 1 cat "$MODEL_DIR/output_aoti" } @@ -46,24 +46,24 @@ function generate_executorch_model_output() { local MODEL_DIR="${CHECKPOINT_PATH%/*}" local MODEL_NAME=$(basename "$CHECKPOINT_PATH" | sed 's/\.[^.]*$//') echo ""############### Run inference with ExecuTorch using XNNPACK for $MODEL_NAME "###############" - python -W ignore export.py 
--checkpoint-path "$CHECKPOINT_PATH" --output-pte-path "$MODEL_DIR/${MODEL_NAME}.pte" -d "fp32" - python -W ignore generate.py --checkpoint-path "$CHECKPOINT_PATH" --prompt "$PROMPT" --device "$TARGET_DEVICE" --pte-path "$MODEL_DIR/${MODEL_NAME}.pte" > "$MODEL_DIR/output_et" + python -W ignore export.py --checkpoint-path "$CHECKPOINT_PATH" --output-pte-path "$MODEL_DIR/${MODEL_NAME}.pte" -d "fp32" || exit 1 + python -W ignore generate.py --checkpoint-path "$CHECKPOINT_PATH" --prompt "$PROMPT" --device "$TARGET_DEVICE" --pte-path "$MODEL_DIR/${MODEL_NAME}.pte" > "$MODEL_DIR/output_et" || exit 1 cat "$MODEL_DIR/output_et" } function run_compile() { - generate_compiled_model_output "$CHECKPOINT_PATH" "$TARGET_DEVICE" + generate_compiled_model_output "$CHECKPOINT_PATH" "$TARGET_DEVICE" || exit 1 } function run_aoti() { - generate_aoti_model_output "$CHECKPOINT_PATH" "$TARGET_DEVICE" + generate_aoti_model_output "$CHECKPOINT_PATH" "$TARGET_DEVICE" || exit 1 } function run_executorch() { if [ "$TARGET_DEVICE" = "cpu" ]; then - generate_executorch_model_output "$CHECKPOINT_PATH" "$TARGET_DEVICE" + generate_executorch_model_output "$CHECKPOINT_PATH" "$TARGET_DEVICE" || exit 1 else - echo "Error: Executorch doesn't run on ${TARGET_DEVICE}" + echo "Skipped: Executorch doesn't run on ${TARGET_DEVICE}" fi } @@ -78,16 +78,17 @@ if [ "$#" -gt 2 ]; then for arg in "${@:3}"; do case "$arg" in "compile") - run_compile + run_compile || exit 1 ;; "aoti") - run_aoti + run_aoti || exit 1 ;; "executorch") - run_executorch + run_executorch || exit 1 ;; *) echo "Unknown argument: $arg" >&2 + exit 1 ;; esac done diff --git a/.github/workflows/periodic.yml b/.github/workflows/periodic.yml index b0bb8d8ab..8fd0bb5be 100644 --- a/.github/workflows/periodic.yml +++ b/.github/workflows/periodic.yml @@ -7,3 +7,171 @@ on: tags: - ciflow/periodic/* workflow_dispatch: + +jobs: + gather-models-cpu: + runs-on: ubuntu-22.04 + outputs: + models: ${{ steps.gather-models-cpu.outputs.models }} + steps: + - uses: actions/checkout@v3 + with: + submodules: 'false' + - uses: actions/setup-python@v4 + with: + python-version: '3.11' + - name: Extract the list of models to run on CPU + id: gather-models-cpu + run: | + set -eux + PYTHONPATH="${PWD}" python .ci/scripts/gather_test_models.py --event "periodic" --backend "cpu" + test-cpu-compile: + name: test-cpu-compile (${{ matrix.platform }}, ${{ matrix.model_name }}) + needs: gather-models-cpu + strategy: + matrix: ${{ fromJSON(needs.gather-models-cpu.outputs.models) }} + fail-fast: false + runs-on: ${{ matrix.runner }} + env: + TORCHCHAT_ROOT: ${{ github.workspace }} + REPO_NAME: ${{ matrix.repo_name }} + steps: + - name: Checkout repo + uses: actions/checkout@v3 + - name: Setup Python + uses: actions/setup-python@v4 + with: + python-version: '3.11' + - name: Print machine info + run: | + echo "$(uname -a)" + - name: Install dependencies + run: | + pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cpu + pip install -r requirements.txt + pip list + - name: Download checkpoints + run: | + bash ${TORCHCHAT_ROOT}/.ci/scripts/wget_checkpoint.sh ${{ matrix.repo_name }} "${{ matrix.resources }}" + - name: Run validation + run: | + pushd ${TORCHCHAT_ROOT} + bash .ci/scripts/convert_checkpoint.sh ${REPO_NAME} + bash .ci/scripts/validate.sh "./checkpoints/${REPO_NAME}/model.pth" "cpu" "compile" + test-cpu-aoti: + name: test-cpu-aoti (${{ matrix.platform }}, ${{ matrix.model_name }}) + needs: gather-models-cpu + strategy: + matrix: ${{ 
fromJSON(needs.gather-models-cpu.outputs.models) }} + fail-fast: false + runs-on: ${{ matrix.runner }} + env: + TORCHCHAT_ROOT: ${{ github.workspace }} + REPO_NAME: ${{ matrix.repo_name }} + steps: + - name: Checkout repo + uses: actions/checkout@v3 + - name: Setup Python + uses: actions/setup-python@v4 + with: + python-version: '3.11' + - name: Print machine info + run: | + echo "$(uname -a)" + - name: Install dependencies + run: | + pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cpu + pip install -r requirements.txt + pip list + - name: Download checkpoints + run: | + bash ${TORCHCHAT_ROOT}/.ci/scripts/wget_checkpoint.sh ${{ matrix.repo_name }} "${{ matrix.resources }}" + - name: Run validation + run: | + pushd ${TORCHCHAT_ROOT} + bash .ci/scripts/convert_checkpoint.sh ${REPO_NAME} + bash .ci/scripts/validate.sh "./checkpoints/${REPO_NAME}/model.pth" "cpu" "aoti" + gather-models-gpu: + runs-on: ubuntu-22.04 + outputs: + models: ${{ steps.gather-models-gpu.outputs.models }} + steps: + - uses: actions/checkout@v3 + with: + submodules: 'false' + - uses: actions/setup-python@v4 + with: + python-version: '3.11' + - name: Extract the list of models to run on GPU + id: gather-models-gpu + run: | + set -eux + PYTHONPATH="${PWD}" python .ci/scripts/gather_test_models.py --event "periodic" --backend "gpu" + test-gpu-compile: + uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + name: test-gpu-compile (${{ matrix.platform }}, ${{ matrix.model_name }}) + needs: gather-models-gpu + strategy: + matrix: ${{ fromJSON(needs.gather-models-gpu.outputs.models) }} + fail-fast: false + with: + runner: linux.g5.4xlarge.nvidia.gpu + gpu-arch-type: cuda + gpu-arch-version: "12.1" + script: | + echo "::group::Print machine info" + nvidia-smi + echo "::endgroup::" + + echo "::group::Install required packages" + pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu121 + pip install -r ./requirements.txt + pip list + echo "::endgroup::" + + echo "::group::Download checkpoint" + export REPO_NAME=${{ matrix.repo_name }} + bash .ci/scripts/wget_checkpoint.sh ${REPO_NAME} ${{ matrix.resources }} + echo "::endgroup::" + + echo "::group::Convert checkpoint" + bash .ci/scripts/convert_checkpoint.sh ${REPO_NAME} + echo "::endgroup::" + + echo "::group::Run inference" + bash .ci/scripts/validate.sh "./checkpoints/${REPO_NAME}/model.pth" "cuda" "compile" + echo "::endgroup::" + test-gpu-aoti: + uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + name: test-gpu-aoti (${{ matrix.platform }}, ${{ matrix.model_name }}) + needs: gather-models-gpu + strategy: + matrix: ${{ fromJSON(needs.gather-models-gpu.outputs.models) }} + fail-fast: false + with: + runner: linux.g5.4xlarge.nvidia.gpu + gpu-arch-type: cuda + gpu-arch-version: "12.1" + script: | + echo "::group::Print machine info" + nvidia-smi + echo "::endgroup::" + + echo "::group::Install required packages" + pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu121 + pip install -r ./requirements.txt + pip list + echo "::endgroup::" + + echo "::group::Download checkpoint" + export REPO_NAME=${{ matrix.repo_name }} + bash .ci/scripts/wget_checkpoint.sh ${REPO_NAME} ${{ matrix.resources }} + echo "::endgroup::" + + echo "::group::Convert checkpoint" + bash .ci/scripts/convert_checkpoint.sh ${REPO_NAME} + echo "::endgroup::" + + echo "::group::Run inference" + bash .ci/scripts/validate.sh 
"./checkpoints/${REPO_NAME}/model.pth" "cuda" "aoti" + echo "::endgroup::" diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index cf1e38550..193e5c7bd 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -1,15 +1,17 @@ name: pull on: - schedule: - - cron: '0,6,12,18 0 * * *' # Runs at midnight UTC and every 6 hours + pull_request: + push: + branches: + - main workflow_dispatch: jobs: - gather-models: + gather-models-cpu: runs-on: ubuntu-22.04 outputs: - models: ${{ steps.gather-models.outputs.models }} + models: ${{ steps.gather-models-cpu.outputs.models }} steps: - uses: actions/checkout@v3 with: @@ -17,22 +19,21 @@ jobs: - uses: actions/setup-python@v4 with: python-version: '3.11' - - name: Extract the list of models to test - id: gather-models + - name: Extract the list of models to run on CPU + id: gather-models-cpu run: | set -eux - PYTHONPATH="${PWD}" python .ci/scripts/gather_test_models.py - test-cpu: - name: test-cpu (${{ matrix.platform }}, ${{ matrix.repo_name }}) - needs: gather-models + PYTHONPATH="${PWD}" python .ci/scripts/gather_test_models.py --event "pull_request" --backend "cpu" + test-cpu-compile: + name: test-cpu-compile (${{ matrix.platform }}, ${{ matrix.model_name }}) + needs: gather-models-cpu strategy: - matrix: ${{ fromJSON(needs.gather-models.outputs.models) }} + matrix: ${{ fromJSON(needs.gather-models-cpu.outputs.models) }} fail-fast: false runs-on: ${{ matrix.runner }} env: - TORCHAT_ROOT: ${{ github.workspace }} + TORCHCHAT_ROOT: ${{ github.workspace }} REPO_NAME: ${{ matrix.repo_name }} - ENABKE_ET_PYBIND: ${{ matrix.runner == 'macos-14' && 'false' || 'true' }} steps: - name: Checkout repo uses: actions/checkout@v3 @@ -45,13 +46,131 @@ jobs: echo "$(uname -a)" - name: Install dependencies run: | - bash ${TORCHAT_ROOT}/scripts/install_et.sh $ENABKE_ET_PYBIND + pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cpu + pip install -r requirements.txt + pip list - name: Download checkpoints run: | - bash ${TORCHAT_ROOT}/.ci/scripts/wget_checkpoint.sh ${{ matrix.repo_name }} "${{ matrix.resources }}" + bash ${TORCHCHAT_ROOT}/.ci/scripts/wget_checkpoint.sh ${{ matrix.repo_name }} "${{ matrix.resources }}" - name: Run validation run: | - pushd ${TORCHAT_ROOT} - export CHECKPOINT_PATH=${TORCHAT_ROOT}/checkpoints/${REPO_NAME}/model.pth - bash ${TORCHAT_ROOT}/.ci/scripts/convert_checkpoint.sh ${REPO_NAME} - bash ${TORCHAT_ROOT}/.ci/scripts/validate.sh ${CHECKPOINT_PATH} + pushd ${TORCHCHAT_ROOT} + bash .ci/scripts/convert_checkpoint.sh ${REPO_NAME} + bash .ci/scripts/validate.sh "./checkpoints/${REPO_NAME}/model.pth" "cpu" "compile" + test-cpu-aoti: + name: test-cpu-aoti (${{ matrix.platform }}, ${{ matrix.model_name }}) + needs: gather-models-cpu + strategy: + matrix: ${{ fromJSON(needs.gather-models-cpu.outputs.models) }} + fail-fast: false + runs-on: ${{ matrix.runner }} + env: + TORCHCHAT_ROOT: ${{ github.workspace }} + REPO_NAME: ${{ matrix.repo_name }} + steps: + - name: Checkout repo + uses: actions/checkout@v3 + - name: Setup Python + uses: actions/setup-python@v4 + with: + python-version: '3.11' + - name: Print machine info + run: | + echo "$(uname -a)" + - name: Install dependencies + run: | + pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cpu + pip install -r requirements.txt + pip list + - name: Download checkpoints + run: | + bash ${TORCHCHAT_ROOT}/.ci/scripts/wget_checkpoint.sh ${{ matrix.repo_name }} "${{ 
matrix.resources }}" + - name: Run validation + run: | + pushd ${TORCHCHAT_ROOT} + bash .ci/scripts/convert_checkpoint.sh ${REPO_NAME} + bash .ci/scripts/validate.sh "./checkpoints/${REPO_NAME}/model.pth" "cpu" "aoti" + gather-models-gpu: + runs-on: ubuntu-22.04 + outputs: + models: ${{ steps.gather-models-gpu.outputs.models }} + steps: + - uses: actions/checkout@v3 + with: + submodules: 'false' + - uses: actions/setup-python@v4 + with: + python-version: '3.11' + - name: Extract the list of models to run on GPU + id: gather-models-gpu + run: | + set -eux + PYTHONPATH="${PWD}" python .ci/scripts/gather_test_models.py --event "pull_request" --backend "gpu" + test-gpu-compile: + uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + name: test-gpu-compile (${{ matrix.platform }}, ${{ matrix.model_name }}) + needs: gather-models-gpu + strategy: + matrix: ${{ fromJSON(needs.gather-models-gpu.outputs.models) }} + fail-fast: false + with: + runner: linux.g5.4xlarge.nvidia.gpu + gpu-arch-type: cuda + gpu-arch-version: "12.1" + script: | + echo "::group::Print machine info" + nvidia-smi + echo "::endgroup::" + + echo "::group::Install required packages" + pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu121 + pip install -r ./requirements.txt + pip list + echo "::endgroup::" + + echo "::group::Download checkpoint" + export REPO_NAME=${{ matrix.repo_name }} + bash .ci/scripts/wget_checkpoint.sh ${REPO_NAME} ${{ matrix.resources }} + echo "::endgroup::" + + echo "::group::Convert checkpoint" + bash .ci/scripts/convert_checkpoint.sh ${REPO_NAME} + echo "::endgroup::" + + echo "::group::Run inference" + bash .ci/scripts/validate.sh "./checkpoints/${REPO_NAME}/model.pth" "cuda" "compile" + echo "::endgroup::" + test-gpu-aoti: + uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + name: test-gpu-aoti (${{ matrix.platform }}, ${{ matrix.model_name }}) + needs: gather-models-gpu + strategy: + matrix: ${{ fromJSON(needs.gather-models-gpu.outputs.models) }} + fail-fast: false + with: + runner: linux.g5.4xlarge.nvidia.gpu + gpu-arch-type: cuda + gpu-arch-version: "12.1" + script: | + echo "::group::Print machine info" + nvidia-smi + echo "::endgroup::" + + echo "::group::Install required packages" + pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu121 + pip install -r ./requirements.txt + pip list + echo "::endgroup::" + + echo "::group::Download checkpoint" + export REPO_NAME=${{ matrix.repo_name }} + bash .ci/scripts/wget_checkpoint.sh ${REPO_NAME} ${{ matrix.resources }} + echo "::endgroup::" + + echo "::group::Convert checkpoint" + bash .ci/scripts/convert_checkpoint.sh ${REPO_NAME} + echo "::endgroup::" + + echo "::group::Run inference" + bash .ci/scripts/validate.sh "./checkpoints/${REPO_NAME}/model.pth" "cuda" "aoti" + echo "::endgroup::" diff --git a/scripts/install_et.sh b/scripts/install_et.sh index d230736dd..b7247ca64 100755 --- a/scripts/install_et.sh +++ b/scripts/install_et.sh @@ -11,7 +11,7 @@ install_pip_dependencies() { echo "Intalling common pip packages" pip install wheel - pip install cmake + pip install "cmake>=3.19" pip install ninja pip install zstd pushd ${TORCHCHAT_ROOT} @@ -26,6 +26,7 @@ install_executorch() { pushd ${TORCHCHAT_ROOT}/build/src git clone https://github.com/pytorch/executorch.git cd executorch + git checkout viable/strict echo "Install executorch: submodule update" git submodule sync git submodule update --init
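
Note (not part of the patch): a rough sketch of the configuration matrix that export_models_for_ci() is expected to emit for a pull_request/cpu run, derived from the MODEL_REPOS and JOB_RUNNERS tables above. The "resources" value is the full comma-separated URL list and is truncated here with "..."; the exact wire format depends on the existing set_output() helper, which this patch does not change.

  $ PYTHONPATH="${PWD}" python .ci/scripts/gather_test_models.py --event "pull_request" --backend "cpu"
  {
    "include": [
      {
        "repo_name": "tinyllamas/stories15M",
        "model_name": "stories15M",
        "resources": "https://huggingface.co/karpathy/tinyllamas/resolve/main/stories15M.pt,...",
        "runner": "32-core-ubuntu",
        "platform": "x86_64",
        "timeout": 90
      },
      {
        "repo_name": "tinyllamas/stories15M",
        "model_name": "stories15M",
        "resources": "https://huggingface.co/karpathy/tinyllamas/resolve/main/stories15M.pt,...",
        "runner": "macos-14",
        "platform": "aarch64",
        "timeout": 90
      }
    ]
  }

Each downstream job consumes this via "matrix: ${{ fromJSON(needs.gather-models-cpu.outputs.models) }}", so one validation job is spawned per (model, runner) pair.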