Fix CI and validation scripts #154

Merged: 1 commit merged on Apr 17, 2024
65 changes: 59 additions & 6 deletions .ci/scripts/gather_test_models.py
@@ -15,18 +15,63 @@
"tinyllamas/stories15M": "https://huggingface.co/karpathy/tinyllamas/resolve/main/stories15M.pt,https://github.com/karpathy/llama2.c/raw/master/tokenizer.model,https://github.com/karpathy/llama2.c/raw/master/tokenizer.bin",
# "tinyllamas/stories42M": "https://huggingface.co/karpathy/tinyllamas/resolve/main/stories42M.pt,https://github.com/karpathy/llama2.c/raw/master/tokenizer.model,https://github.com/karpathy/llama2.c/raw/master/tokenizer.bin",
"tinyllamas/stories110M": "https://huggingface.co/karpathy/tinyllamas/resolve/main/stories110M.pt,https://github.com/karpathy/llama2.c/raw/master/tokenizer.model,https://github.com/karpathy/llama2.c/raw/master/tokenizer.bin",
"openlm-research/open_llama_7b": "https://huggingface.co/openlm-research/open_llama_7b/resolve/main/config.json,https://huggingface.co/openlm-research/open_llama_7b/resolve/main/generation_config.json,https://huggingface.co/openlm-research/open_llama_7b/resolve/main/pytorch_model-00001-of-00002.bin,https://huggingface.co/openlm-research/open_llama_7b/resolve/main/pytorch_model-00002-of-00002.bin,https://huggingface.co/openlm-research/open_llama_7b/resolve/main/pytorch_model.bin.index.json,https://huggingface.co/openlm-research/open_llama_7b/resolve/main/special_tokens_map.json,https://huggingface.co/openlm-research/open_llama_7b/resolve/main/tokenizer.model,https://huggingface.co/openlm-research/open_llama_7b/resolve/main/tokenizer.model,https://huggingface.co/openlm-research/open_llama_7b/resolve/main/tokenizer_config.json",
"mistralai/Mistral-7B-v0.1": "https://huggingface.co/mistralai/Mistral-7B-v0.1/resolve/main/config.json,https://huggingface.co/mistralai/Mistral-7B-v0.1/resolve/main/generation_config.json,https://huggingface.co/mistralai/Mistral-7B-v0.1/resolve/main/pytorch_model-00001-of-00002.bin,https://huggingface.co/mistralai/Mistral-7B-v0.1/resolve/main/pytorch_model-00002-of-00002.bin,https://huggingface.co/mistralai/Mistral-7B-v0.1/resolve/main/pytorch_model.bin.index.json,https://huggingface.co/mistralai/Mistral-7B-v0.1/resolve/main/special_tokens_map.json,https://huggingface.co/mistralai/Mistral-7B-v0.1/resolve/main/tokenizer.json,https://huggingface.co/mistralai/Mistral-7B-v0.1/resolve/main/tokenizer.model,https://huggingface.co/mistralai/Mistral-7B-v0.1/resolve/main/tokenizer_config.json",
"mistralai/Mistral-7B-Instruct-v0.1": "https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1/resolve/main/config.json,https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1/resolve/main/generation_config.json,https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1/resolve/main/pytorch_model-00001-of-00002.bin,https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1/resolve/main/pytorch_model-00002-of-00002.bin,https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1/resolve/main/pytorch_model.bin.index.json,https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1/resolve/main/special_tokens_map.json,https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1/resolve/main/tokenizer.json,https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1/resolve/main/tokenizer.model,https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1/resolve/main/tokenizer_config.json",
"mistralai/Mistral-7B-Instruct-v0.2": "https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2/resolve/main/config.json,https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2/resolve/main/generation_config.json,https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2/resolve/main/pytorch_model-00001-of-00003.bin,https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2/resolve/main/pytorch_model-00002-of-00003.bin,https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2/resolve/main/pytorch_model-00003-of-00003.bin,https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2/resolve/main/pytorch_model.bin.index.json,https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2/resolve/main/special_tokens_map.json,https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2/resolve/main/tokenizer.json,https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2/resolve/main/tokenizer.model,https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2/resolve/main/tokenizer_config.json",
}

JOB_RUNNERS = {
-    "32-core-ubuntu": "linux x86",
-    # "macos-13": "macos x86", # not working for ExecuTorch yet
-    "macos-14": "macos M1",
+    "cpu": {
+        "32-core-ubuntu": "x86_64",
+        # "macos-12": "x86_64", # not working for compile and ExecuTorch yet
+        "macos-14": "aarch64",
+    },
+    "gpu": {
+        "linux.g5.4xlarge.nvidia.gpu": "cuda",
+    },
}


def parse_args() -> Any:
    from argparse import ArgumentParser

    parser = ArgumentParser("Gather all models to test on CI for the target OS")
    parser.add_argument(
        "-e",
        "--event",
        type=str,
        choices=["pull_request", "push", "periodic"],
        required=True,
        help="GitHub CI Event. See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#on",
    )
    parser.add_argument(
        "-b",
        "--backend",
        type=str,
        choices=["cpu", "gpu"],
        required=True,
        help="Supported backends to run. ['cpu', 'gpu']",
    )

    return parser.parse_args()


def model_should_run_on_event(model: str, event: str) -> bool:
    """
    A helper function to decide whether a model should be tested on a given
    event (pull_request/push/periodic). Fast, high-priority models run on
    pull_request; larger models are deferred to push or periodic runs.
    """
    if event == "pull_request":
        return model in ["tinyllamas/stories15M"]
    elif event == "push":
        return model in []
    elif event == "periodic":
        return model in ["mistralai/Mistral-7B-v0.1", "openlm-research/open_llama_7b"]
    else:
        return False


def set_output(name: str, val: Any) -> None:
    """
    Set the GitHub output so that it can be accessed by other jobs
@@ -45,19 +90,27 @@ def export_models_for_ci() -> dict[str, dict]:
    This gathers all the models that we want to test on GitHub OSS CI
    """

    args = parse_args()
    event = args.event
    backend = args.backend

    # This is the JSON syntax for the configuration matrix used by GitHub
    # https://docs.github.com/en/actions/using-jobs/using-a-matrix-for-your-jobs
    models = {"include": []}

    for repo_name, runner in itertools.product(
        MODEL_REPOS.keys(),
-        JOB_RUNNERS.keys(),
+        JOB_RUNNERS[backend].items(),
    ):
        if not model_should_run_on_event(repo_name, event):
            continue

        record = {
            "repo_name": repo_name,
            "model_name": repo_name.split("/")[-1],
            "resources": MODEL_REPOS[repo_name],
-            "runner": runner,
-            "platform": JOB_RUNNERS[runner],
+            "runner": runner[0],
+            "platform": runner[1],
            "timeout": 90,
        }

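To make the effect of the reworked script concrete, here is a minimal sketch of how it is invoked by the periodic workflow below and roughly what it emits; the invocation is copied from the workflow, while the sample output and the $GITHUB_OUTPUT note are illustrative assumptions based on the hunks above.

# Hedged sketch: run the gatherer the same way the periodic workflow does.
PYTHONPATH="${PWD}" python .ci/scripts/gather_test_models.py --event "periodic" --backend "cpu"

# Given MODEL_REPOS, JOB_RUNNERS["cpu"], and model_should_run_on_event above,
# the emitted matrix should look roughly like this (illustrative, not verbatim):
#
# {"include": [
#   {"repo_name": "openlm-research/open_llama_7b",
#    "model_name": "open_llama_7b",
#    "resources": "<comma-separated checkpoint/tokenizer URLs>",
#    "runner": "32-core-ubuntu", "platform": "x86_64", "timeout": 90},
#   ...one record per (periodic model, cpu runner) pair...
# ]}
#
# set_output (body elided in this diff) presumably appends "models=<that JSON>"
# to the file named by $GITHUB_OUTPUT, which is how the workflow reads it back
# via ${{ steps.gather-models-cpu.outputs.models }}.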
25 changes: 13 additions & 12 deletions .ci/scripts/validate.sh
@@ -15,7 +15,7 @@ function generate_eager_model_output() {
    local MODEL_DIR="${CHECKPOINT_PATH%/*}"
    local MODEL_NAME=$(basename "$CHECKPOINT_PATH" | sed 's/\.[^.]*$//')
    echo "Run inference with eager model for $MODEL_NAME"
-    python -W ignore generate.py --checkpoint-path "$CHECKPOINT_PATH" --prompt "$PROMPT" --device "$TARGET_DEVICE" > "$MODEL_DIR/output_eager"
+    python -W ignore generate.py --checkpoint-path "$CHECKPOINT_PATH" --prompt "$PROMPT" --device "$TARGET_DEVICE" > "$MODEL_DIR/output_eager" || exit 1
    cat "$MODEL_DIR/output_eager"
}

@@ -25,7 +25,7 @@ function generate_compiled_model_output() {
    local MODEL_DIR="${CHECKPOINT_PATH%/*}"
    local MODEL_NAME=$(basename "$CHECKPOINT_PATH" | sed 's/\.[^.]*$//')
    echo ""############### Run inference with torch.compile for $MODEL_NAME "###############"
-    python -W ignore generate.py --compile --checkpoint-path "$CHECKPOINT_PATH" --prompt "$PROMPT" --device "$TARGET_DEVICE" > "$MODEL_DIR/output_compiled"
+    python -W ignore generate.py --compile --checkpoint-path "$CHECKPOINT_PATH" --prompt "$PROMPT" --device "$TARGET_DEVICE" > "$MODEL_DIR/output_compiled" || exit 1
    cat "$MODEL_DIR/output_compiled"
}

@@ -36,7 +36,7 @@ function generate_aoti_model_output() {
    local MODEL_NAME=$(basename "$CHECKPOINT_PATH" | sed 's/\.[^.]*$//')
    echo ""############### Run inference with AOTInductor for $MODEL_NAME "###############"
    python -W ignore export.py --checkpoint-path "$CHECKPOINT_PATH" --output-dso-path "${MODEL_DIR}/${MODEL_NAME}.so" --device "$TARGET_DEVICE"
-    python -W ignore generate.py --checkpoint-path "$CHECKPOINT_PATH" --dso-path "$MODEL_DIR/${MODEL_NAME}.so" --prompt "$PROMPT" --device "$TARGET_DEVICE" > "$MODEL_DIR/output_aoti"
+    python -W ignore generate.py --checkpoint-path "$CHECKPOINT_PATH" --dso-path "$MODEL_DIR/${MODEL_NAME}.so" --prompt "$PROMPT" --device "$TARGET_DEVICE" > "$MODEL_DIR/output_aoti" || exit 1
    cat "$MODEL_DIR/output_aoti"
}

@@ -46,24 +46,24 @@ function generate_executorch_model_output() {
    local MODEL_DIR="${CHECKPOINT_PATH%/*}"
    local MODEL_NAME=$(basename "$CHECKPOINT_PATH" | sed 's/\.[^.]*$//')
    echo ""############### Run inference with ExecuTorch using XNNPACK for $MODEL_NAME "###############"
-    python -W ignore export.py --checkpoint-path "$CHECKPOINT_PATH" --output-pte-path "$MODEL_DIR/${MODEL_NAME}.pte" -d "fp32"
-    python -W ignore generate.py --checkpoint-path "$CHECKPOINT_PATH" --prompt "$PROMPT" --device "$TARGET_DEVICE" --pte-path "$MODEL_DIR/${MODEL_NAME}.pte" > "$MODEL_DIR/output_et"
+    python -W ignore export.py --checkpoint-path "$CHECKPOINT_PATH" --output-pte-path "$MODEL_DIR/${MODEL_NAME}.pte" -d "fp32" || exit 1
+    python -W ignore generate.py --checkpoint-path "$CHECKPOINT_PATH" --prompt "$PROMPT" --device "$TARGET_DEVICE" --pte-path "$MODEL_DIR/${MODEL_NAME}.pte" > "$MODEL_DIR/output_et" || exit 1
    cat "$MODEL_DIR/output_et"
}

function run_compile() {
-    generate_compiled_model_output "$CHECKPOINT_PATH" "$TARGET_DEVICE"
+    generate_compiled_model_output "$CHECKPOINT_PATH" "$TARGET_DEVICE" || exit 1
}

function run_aoti() {
-    generate_aoti_model_output "$CHECKPOINT_PATH" "$TARGET_DEVICE"
+    generate_aoti_model_output "$CHECKPOINT_PATH" "$TARGET_DEVICE" || exit 1
}

function run_executorch() {
    if [ "$TARGET_DEVICE" = "cpu" ]; then
-        generate_executorch_model_output "$CHECKPOINT_PATH" "$TARGET_DEVICE"
+        generate_executorch_model_output "$CHECKPOINT_PATH" "$TARGET_DEVICE" || exit 1
    else
-        echo "Error: Executorch doesn't run on ${TARGET_DEVICE}"
+        echo "Skipped: Executorch doesn't run on ${TARGET_DEVICE}"
    fi
}

@@ -78,16 +78,17 @@ if [ "$#" -gt 2 ]; then
    for arg in "${@:3}"; do
        case "$arg" in
            "compile")
-                run_compile
+                run_compile || exit 1
                ;;
            "aoti")
-                run_aoti
+                run_aoti || exit 1
                ;;
            "executorch")
-                run_executorch
+                run_executorch || exit 1
                ;;
            *)
                echo "Unknown argument: $arg" >&2
+                exit 1
                ;;
        esac
    done
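A note on the repeated `|| exit 1` additions: a command that fails inside a shell function does not by itself abort the surrounding script unless errexit is in effect, so previously a broken export/generate step could be silently ignored. A minimal sketch of the failure mode (assuming validate.sh does not run under `set -e`, which is not visible in this hunk):

#!/usr/bin/env bash
# Minimal illustration of the failure mode addressed above
# (assumption: the script does not rely on `set -e`).

generate_output() {
    false    # stand-in for a generate.py/export.py call that fails
}

generate_output                 # the failure is silently swallowed ...
echo "validation keeps going"   # ... so a broken step could still look green

generate_output || exit 1       # with the fix, the script aborts here
echo "never reached"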
168 changes: 168 additions & 0 deletions .github/workflows/periodic.yml
@@ -7,3 +7,171 @@ on:
    tags:
      - ciflow/periodic/*
  workflow_dispatch:

jobs:
  gather-models-cpu:
    runs-on: ubuntu-22.04
    outputs:
      models: ${{ steps.gather-models-cpu.outputs.models }}
    steps:
      - uses: actions/checkout@v3
        with:
          submodules: 'false'
      - uses: actions/setup-python@v4
        with:
          python-version: '3.11'
      - name: Extract the list of models to run on CPU
        id: gather-models-cpu
        run: |
          set -eux
          PYTHONPATH="${PWD}" python .ci/scripts/gather_test_models.py --event "periodic" --backend "cpu"
  test-cpu-compile:
    name: test-cpu-compile (${{ matrix.platform }}, ${{ matrix.model_name }})
    needs: gather-models-cpu
    strategy:
      matrix: ${{ fromJSON(needs.gather-models-cpu.outputs.models) }}
      fail-fast: false
    runs-on: ${{ matrix.runner }}
    env:
      TORCHCHAT_ROOT: ${{ github.workspace }}
      REPO_NAME: ${{ matrix.repo_name }}
    steps:
      - name: Checkout repo
        uses: actions/checkout@v3
      - name: Setup Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.11'
      - name: Print machine info
        run: |
          echo "$(uname -a)"
      - name: Install dependencies
        run: |
          pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cpu
          pip install -r requirements.txt
          pip list
      - name: Download checkpoints
        run: |
          bash ${TORCHCHAT_ROOT}/.ci/scripts/wget_checkpoint.sh ${{ matrix.repo_name }} "${{ matrix.resources }}"
      - name: Run validation
        run: |
          pushd ${TORCHCHAT_ROOT}
          bash .ci/scripts/convert_checkpoint.sh ${REPO_NAME}
          bash .ci/scripts/validate.sh "./checkpoints/${REPO_NAME}/model.pth" "cpu" "compile"
  test-cpu-aoti:
    name: test-cpu-aoti (${{ matrix.platform }}, ${{ matrix.model_name }})
    needs: gather-models-cpu
    strategy:
      matrix: ${{ fromJSON(needs.gather-models-cpu.outputs.models) }}
      fail-fast: false
    runs-on: ${{ matrix.runner }}
    env:
      TORCHCHAT_ROOT: ${{ github.workspace }}
      REPO_NAME: ${{ matrix.repo_name }}
    steps:
      - name: Checkout repo
        uses: actions/checkout@v3
      - name: Setup Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.11'
      - name: Print machine info
        run: |
          echo "$(uname -a)"
      - name: Install dependencies
        run: |
          pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cpu
          pip install -r requirements.txt
          pip list
      - name: Download checkpoints
        run: |
          bash ${TORCHCHAT_ROOT}/.ci/scripts/wget_checkpoint.sh ${{ matrix.repo_name }} "${{ matrix.resources }}"
      - name: Run validation
        run: |
          pushd ${TORCHCHAT_ROOT}
          bash .ci/scripts/convert_checkpoint.sh ${REPO_NAME}
          bash .ci/scripts/validate.sh "./checkpoints/${REPO_NAME}/model.pth" "cpu" "aoti"
  gather-models-gpu:
    runs-on: ubuntu-22.04
    outputs:
      models: ${{ steps.gather-models-gpu.outputs.models }}
    steps:
      - uses: actions/checkout@v3
        with:
          submodules: 'false'
      - uses: actions/setup-python@v4
        with:
          python-version: '3.11'
      - name: Extract the list of models to run on GPU
        id: gather-models-gpu
        run: |
          set -eux
          PYTHONPATH="${PWD}" python .ci/scripts/gather_test_models.py --event "periodic" --backend "gpu"
  test-gpu-compile:
    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
    name: test-gpu-compile (${{ matrix.platform }}, ${{ matrix.model_name }})
    needs: gather-models-gpu
    strategy:
      matrix: ${{ fromJSON(needs.gather-models-gpu.outputs.models) }}
      fail-fast: false
    with:
      runner: linux.g5.4xlarge.nvidia.gpu
      gpu-arch-type: cuda
      gpu-arch-version: "12.1"
      script: |
        echo "::group::Print machine info"
        nvidia-smi
        echo "::endgroup::"

        echo "::group::Install required packages"
        pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu121
        pip install -r ./requirements.txt
        pip list
        echo "::endgroup::"

        echo "::group::Download checkpoint"
        export REPO_NAME=${{ matrix.repo_name }}
        bash .ci/scripts/wget_checkpoint.sh ${REPO_NAME} ${{ matrix.resources }}
        echo "::endgroup::"

        echo "::group::Convert checkpoint"
        bash .ci/scripts/convert_checkpoint.sh ${REPO_NAME}
        echo "::endgroup::"

        echo "::group::Run inference"
        bash .ci/scripts/validate.sh "./checkpoints/${REPO_NAME}/model.pth" "cuda" "compile"
        echo "::endgroup::"
  test-gpu-aoti:
    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
    name: test-gpu-aoti (${{ matrix.platform }}, ${{ matrix.model_name }})
    needs: gather-models-gpu
    strategy:
      matrix: ${{ fromJSON(needs.gather-models-gpu.outputs.models) }}
      fail-fast: false
    with:
      runner: linux.g5.4xlarge.nvidia.gpu
      gpu-arch-type: cuda
      gpu-arch-version: "12.1"
      script: |
        echo "::group::Print machine info"
        nvidia-smi
        echo "::endgroup::"

        echo "::group::Install required packages"
        pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu121
        pip install -r ./requirements.txt
        pip list
        echo "::endgroup::"

        echo "::group::Download checkpoint"
        export REPO_NAME=${{ matrix.repo_name }}
        bash .ci/scripts/wget_checkpoint.sh ${REPO_NAME} ${{ matrix.resources }}
        echo "::endgroup::"

        echo "::group::Convert checkpoint"
        bash .ci/scripts/convert_checkpoint.sh ${REPO_NAME}
        echo "::endgroup::"

        echo "::group::Run inference"
        bash .ci/scripts/validate.sh "./checkpoints/${REPO_NAME}/model.pth" "cuda" "aoti"
        echo "::endgroup::"
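For reference, two ways this workflow could be triggered on demand, given the `ciflow/periodic/*` tag filter and `workflow_dispatch` shown above; the tag name and the `gh` invocation are illustrative, not part of this PR:

# Option 1: push a tag matching the ciflow/periodic/* trigger
git tag ciflow/periodic/manual-run
git push origin ciflow/periodic/manual-run

# Option 2: fire the workflow_dispatch trigger with the GitHub CLI
gh workflow run periodic.yml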