Fix CI and validation scripts #154

Merged: 1 commit merged on Apr 17, 2024
65 changes: 59 additions & 6 deletions .ci/scripts/gather_test_models.py
@@ -15,18 +15,63 @@
"tinyllamas/stories15M": "https://huggingface.co/karpathy/tinyllamas/resolve/main/stories15M.pt,https://github.com/karpathy/llama2.c/raw/master/tokenizer.model,https://github.com/karpathy/llama2.c/raw/master/tokenizer.bin",
# "tinyllamas/stories42M": "https://huggingface.co/karpathy/tinyllamas/resolve/main/stories42M.pt,https://github.com/karpathy/llama2.c/raw/master/tokenizer.model,https://github.com/karpathy/llama2.c/raw/master/tokenizer.bin",
"tinyllamas/stories110M": "https://huggingface.co/karpathy/tinyllamas/resolve/main/stories110M.pt,https://github.com/karpathy/llama2.c/raw/master/tokenizer.model,https://github.com/karpathy/llama2.c/raw/master/tokenizer.bin",
"openlm-research/open_llama_7b": "https://huggingface.co/openlm-research/open_llama_7b/resolve/main/config.json,https://huggingface.co/openlm-research/open_llama_7b/resolve/main/generation_config.json,https://huggingface.co/openlm-research/open_llama_7b/resolve/main/pytorch_model-00001-of-00002.bin,https://huggingface.co/openlm-research/open_llama_7b/resolve/main/pytorch_model-00002-of-00002.bin,https://huggingface.co/openlm-research/open_llama_7b/resolve/main/pytorch_model.bin.index.json,https://huggingface.co/openlm-research/open_llama_7b/resolve/main/special_tokens_map.json,https://huggingface.co/openlm-research/open_llama_7b/resolve/main/tokenizer.model,https://huggingface.co/openlm-research/open_llama_7b/resolve/main/tokenizer.model,https://huggingface.co/openlm-research/open_llama_7b/resolve/main/tokenizer_config.json",
"mistralai/Mistral-7B-v0.1": "https://huggingface.co/mistralai/Mistral-7B-v0.1/resolve/main/config.json,https://huggingface.co/mistralai/Mistral-7B-v0.1/resolve/main/generation_config.json,https://huggingface.co/mistralai/Mistral-7B-v0.1/resolve/main/pytorch_model-00001-of-00002.bin,https://huggingface.co/mistralai/Mistral-7B-v0.1/resolve/main/pytorch_model-00002-of-00002.bin,https://huggingface.co/mistralai/Mistral-7B-v0.1/resolve/main/pytorch_model.bin.index.json,https://huggingface.co/mistralai/Mistral-7B-v0.1/resolve/main/special_tokens_map.json,https://huggingface.co/mistralai/Mistral-7B-v0.1/resolve/main/tokenizer.json,https://huggingface.co/mistralai/Mistral-7B-v0.1/resolve/main/tokenizer.model,https://huggingface.co/mistralai/Mistral-7B-v0.1/resolve/main/tokenizer_config.json",
"mistralai/Mistral-7B-Instruct-v0.1": "https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1/resolve/main/config.json,https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1/resolve/main/generation_config.json,https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1/resolve/main/pytorch_model-00001-of-00002.bin,https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1/resolve/main/pytorch_model-00002-of-00002.bin,https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1/resolve/main/pytorch_model.bin.index.json,https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1/resolve/main/special_tokens_map.json,https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1/resolve/main/tokenizer.json,https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1/resolve/main/tokenizer.model,https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1/resolve/main/tokenizer_config.json",
"mistralai/Mistral-7B-Instruct-v0.2": "https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2/resolve/main/config.json,https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2/resolve/main/generation_config.json,https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2/resolve/main/pytorch_model-00001-of-00003.bin,https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2/resolve/main/pytorch_model-00002-of-00003.bin,https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2/resolve/main/pytorch_model-00003-of-00003.bin,https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2/resolve/main/pytorch_model.bin.index.json,https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2/resolve/main/special_tokens_map.json,https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2/resolve/main/tokenizer.json,https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2/resolve/main/tokenizer.model,https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2/resolve/main/tokenizer_config.json",
}

JOB_RUNNERS = {
-    "32-core-ubuntu": "linux x86",
-    # "macos-13": "macos x86", # not working for ExecuTorch yet
-    "macos-14": "macos M1",
+    "cpu": {
+        "32-core-ubuntu": "x86_64",
+        # "macos-12": "x86_64", # not working for compile and ExecuTorch yet
+        "macos-14": "aarch64",
+    },
+    "gpu": {
+        "linux.g5.4xlarge.nvidia.gpu": "cuda",
+    },
}


def parse_args() -> Any:
    from argparse import ArgumentParser

    parser = ArgumentParser("Gather all models to test on CI for the target OS")
    parser.add_argument(
        "-e",
        "--event",
        type=str,
        choices=["pull_request", "push", "periodic"],
        required=True,
        help="GitHub CI Event. See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#on",
    )
    parser.add_argument(
        "-b",
        "--backend",
        type=str,
        choices=["cpu", "gpu"],
        required=True,
        help="Supported backends to run. ['cpu', 'gpu']",
    )

    return parser.parse_args()


def model_should_run_on_event(model: str, event: str) -> bool:
    """
    A helper function to decide whether a model should be tested on a given
    event (pull_request/push/periodic). Fast, high-priority models run on
    pull_request; larger models are deferred to push or periodic runs.
    """
    if event == "pull_request":
        return model in ["tinyllamas/stories15M"]
    elif event == "push":
        return model in []
    elif event == "periodic":
        return model in ["mistralai/Mistral-7B-v0.1", "openlm-research/open_llama_7b"]
    else:
        return False


def set_output(name: str, val: Any) -> None:
    """
    Set the GitHub output so that it can be accessed by other jobs
@@ -45,19 +90,27 @@ def export_models_for_ci() -> dict[str, dict]:
    This gathers all the models that we want to test on GitHub OSS CI
    """

    args = parse_args()
    event = args.event
    backend = args.backend

    # This is the JSON syntax for the configuration matrix used by GitHub
    # https://docs.github.com/en/actions/using-jobs/using-a-matrix-for-your-jobs
    models = {"include": []}

    for repo_name, runner in itertools.product(
        MODEL_REPOS.keys(),
-        JOB_RUNNERS.keys(),
+        JOB_RUNNERS[backend].items(),
    ):
        if not model_should_run_on_event(repo_name, event):
            continue

        record = {
            "repo_name": repo_name,
            "model_name": repo_name.split("/")[-1],
            "resources": MODEL_REPOS[repo_name],
-            "runner": runner,
-            "platform": JOB_RUNNERS[runner],
+            "runner": runner[0],
+            "platform": runner[1],
            "timeout": 90,
        }

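To make the effect of the reworked script concrete, here is a minimal sketch of how it is invoked by the periodic workflow below and roughly what it emits; the invocation is copied from the workflow, while the sample output and the $GITHUB_OUTPUT note are illustrative assumptions based on the hunks above.

# Hedged sketch: run the gatherer the same way the periodic workflow does.
PYTHONPATH="${PWD}" python .ci/scripts/gather_test_models.py --event "periodic" --backend "cpu"

# Given MODEL_REPOS, JOB_RUNNERS["cpu"], and model_should_run_on_event above,
# the emitted matrix should look roughly like this (illustrative, not verbatim):
#
# {"include": [
#   {"repo_name": "openlm-research/open_llama_7b",
#    "model_name": "open_llama_7b",
#    "resources": "<comma-separated checkpoint/tokenizer URLs>",
#    "runner": "32-core-ubuntu", "platform": "x86_64", "timeout": 90},
#   ...one record per (periodic model, cpu runner) pair...
# ]}
#
# set_output (body elided in this diff) presumably appends "models=<that JSON>"
# to the file named by $GITHUB_OUTPUT, which is how the workflow reads it back
# via ${{ steps.gather-models-cpu.outputs.models }}.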
25 changes: 13 additions & 12 deletions .ci/scripts/validate.sh
@@ -15,7 +15,7 @@ function generate_eager_model_output() {
    local MODEL_DIR="${CHECKPOINT_PATH%/*}"
    local MODEL_NAME=$(basename "$CHECKPOINT_PATH" | sed 's/\.[^.]*$//')
    echo "Run inference with eager model for $MODEL_NAME"
-    python -W ignore generate.py --checkpoint-path "$CHECKPOINT_PATH" --prompt "$PROMPT" --device "$TARGET_DEVICE" > "$MODEL_DIR/output_eager"
+    python -W ignore generate.py --checkpoint-path "$CHECKPOINT_PATH" --prompt "$PROMPT" --device "$TARGET_DEVICE" > "$MODEL_DIR/output_eager" || exit 1
    cat "$MODEL_DIR/output_eager"
}

@@ -25,7 +25,7 @@ function generate_compiled_model_output() {
    local MODEL_DIR="${CHECKPOINT_PATH%/*}"
    local MODEL_NAME=$(basename "$CHECKPOINT_PATH" | sed 's/\.[^.]*$//')
    echo ""############### Run inference with torch.compile for $MODEL_NAME "###############"
-    python -W ignore generate.py --compile --checkpoint-path "$CHECKPOINT_PATH" --prompt "$PROMPT" --device "$TARGET_DEVICE" > "$MODEL_DIR/output_compiled"
+    python -W ignore generate.py --compile --checkpoint-path "$CHECKPOINT_PATH" --prompt "$PROMPT" --device "$TARGET_DEVICE" > "$MODEL_DIR/output_compiled" || exit 1
    cat "$MODEL_DIR/output_compiled"
}

@@ -36,7 +36,7 @@ function generate_aoti_model_output() {
    local MODEL_NAME=$(basename "$CHECKPOINT_PATH" | sed 's/\.[^.]*$//')
    echo ""############### Run inference with AOTInductor for $MODEL_NAME "###############"
    python -W ignore export.py --checkpoint-path "$CHECKPOINT_PATH" --output-dso-path "${MODEL_DIR}/${MODEL_NAME}.so" --device "$TARGET_DEVICE"
-    python -W ignore generate.py --checkpoint-path "$CHECKPOINT_PATH" --dso-path "$MODEL_DIR/${MODEL_NAME}.so" --prompt "$PROMPT" --device "$TARGET_DEVICE" > "$MODEL_DIR/output_aoti"
+    python -W ignore generate.py --checkpoint-path "$CHECKPOINT_PATH" --dso-path "$MODEL_DIR/${MODEL_NAME}.so" --prompt "$PROMPT" --device "$TARGET_DEVICE" > "$MODEL_DIR/output_aoti" || exit 1
    cat "$MODEL_DIR/output_aoti"
}

@@ -46,24 +46,24 @@ function generate_executorch_model_output() {
    local MODEL_DIR="${CHECKPOINT_PATH%/*}"
    local MODEL_NAME=$(basename "$CHECKPOINT_PATH" | sed 's/\.[^.]*$//')
    echo ""############### Run inference with ExecuTorch using XNNPACK for $MODEL_NAME "###############"
-    python -W ignore export.py --checkpoint-path "$CHECKPOINT_PATH" --output-pte-path "$MODEL_DIR/${MODEL_NAME}.pte" -d "fp32"
-    python -W ignore generate.py --checkpoint-path "$CHECKPOINT_PATH" --prompt "$PROMPT" --device "$TARGET_DEVICE" --pte-path "$MODEL_DIR/${MODEL_NAME}.pte" > "$MODEL_DIR/output_et"
+    python -W ignore export.py --checkpoint-path "$CHECKPOINT_PATH" --output-pte-path "$MODEL_DIR/${MODEL_NAME}.pte" -d "fp32" || exit 1
+    python -W ignore generate.py --checkpoint-path "$CHECKPOINT_PATH" --prompt "$PROMPT" --device "$TARGET_DEVICE" --pte-path "$MODEL_DIR/${MODEL_NAME}.pte" > "$MODEL_DIR/output_et" || exit 1
    cat "$MODEL_DIR/output_et"
}

function run_compile() {
-    generate_compiled_model_output "$CHECKPOINT_PATH" "$TARGET_DEVICE"
+    generate_compiled_model_output "$CHECKPOINT_PATH" "$TARGET_DEVICE" || exit 1
}

function run_aoti() {
-    generate_aoti_model_output "$CHECKPOINT_PATH" "$TARGET_DEVICE"
+    generate_aoti_model_output "$CHECKPOINT_PATH" "$TARGET_DEVICE" || exit 1
}

function run_executorch() {
    if [ "$TARGET_DEVICE" = "cpu" ]; then
-        generate_executorch_model_output "$CHECKPOINT_PATH" "$TARGET_DEVICE"
+        generate_executorch_model_output "$CHECKPOINT_PATH" "$TARGET_DEVICE" || exit 1
    else
-        echo "Error: Executorch doesn't run on ${TARGET_DEVICE}"
+        echo "Skipped: Executorch doesn't run on ${TARGET_DEVICE}"
    fi
}

@@ -78,16 +78,17 @@ if [ "$#" -gt 2 ]; then
    for arg in "${@:3}"; do
        case "$arg" in
            "compile")
-                run_compile
+                run_compile || exit 1
                ;;
            "aoti")
-                run_aoti
+                run_aoti || exit 1
                ;;
            "executorch")
-                run_executorch
+                run_executorch || exit 1
                ;;
            *)
                echo "Unknown argument: $arg" >&2
+                exit 1
                ;;
        esac
    done
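A note on the repeated `|| exit 1` additions: a command that fails inside a shell function does not by itself abort the surrounding script unless errexit is in effect, so previously a broken export/generate step could be silently ignored. A minimal sketch of the failure mode (assuming validate.sh does not run under `set -e`, which is not visible in this hunk):

#!/usr/bin/env bash
# Minimal illustration of the failure mode addressed above
# (assumption: the script does not rely on `set -e`).

generate_output() {
    false    # stand-in for a generate.py/export.py call that fails
}

generate_output                 # the failure is silently swallowed ...
echo "validation keeps going"   # ... so a broken step could still look green

generate_output || exit 1       # with the fix, the script aborts here
echo "never reached"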
168 changes: 168 additions & 0 deletions .github/workflows/periodic.yml
@@ -7,3 +7,171 @@ on:
    tags:
      - ciflow/periodic/*
  workflow_dispatch:

jobs:
  gather-models-cpu:
    runs-on: ubuntu-22.04
    outputs:
      models: ${{ steps.gather-models-cpu.outputs.models }}
    steps:
      - uses: actions/checkout@v3
        with:
          submodules: 'false'
      - uses: actions/setup-python@v4
        with:
          python-version: '3.11'
      - name: Extract the list of models to run on CPU
        id: gather-models-cpu
        run: |
          set -eux
          PYTHONPATH="${PWD}" python .ci/scripts/gather_test_models.py --event "periodic" --backend "cpu"
  test-cpu-compile:
    name: test-cpu-compile (${{ matrix.platform }}, ${{ matrix.model_name }})
    needs: gather-models-cpu
    strategy:
      matrix: ${{ fromJSON(needs.gather-models-cpu.outputs.models) }}
      fail-fast: false
    runs-on: ${{ matrix.runner }}
    env:
      TORCHCHAT_ROOT: ${{ github.workspace }}
      REPO_NAME: ${{ matrix.repo_name }}
    steps:
      - name: Checkout repo
        uses: actions/checkout@v3
      - name: Setup Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.11'
      - name: Print machine info
        run: |
          echo "$(uname -a)"
      - name: Install dependencies
        run: |
          pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cpu
          pip install -r requirements.txt
          pip list
      - name: Download checkpoints
        run: |
          bash ${TORCHCHAT_ROOT}/.ci/scripts/wget_checkpoint.sh ${{ matrix.repo_name }} "${{ matrix.resources }}"
      - name: Run validation
        run: |
          pushd ${TORCHCHAT_ROOT}
          bash .ci/scripts/convert_checkpoint.sh ${REPO_NAME}
          bash .ci/scripts/validate.sh "./checkpoints/${REPO_NAME}/model.pth" "cpu" "compile"
  test-cpu-aoti:
    name: test-cpu-aoti (${{ matrix.platform }}, ${{ matrix.model_name }})
    needs: gather-models-cpu
    strategy:
      matrix: ${{ fromJSON(needs.gather-models-cpu.outputs.models) }}
      fail-fast: false
    runs-on: ${{ matrix.runner }}
    env:
      TORCHCHAT_ROOT: ${{ github.workspace }}
      REPO_NAME: ${{ matrix.repo_name }}
    steps:
      - name: Checkout repo
        uses: actions/checkout@v3
      - name: Setup Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.11'
      - name: Print machine info
        run: |
          echo "$(uname -a)"
      - name: Install dependencies
        run: |
          pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cpu
          pip install -r requirements.txt
          pip list
      - name: Download checkpoints
        run: |
          bash ${TORCHCHAT_ROOT}/.ci/scripts/wget_checkpoint.sh ${{ matrix.repo_name }} "${{ matrix.resources }}"
      - name: Run validation
        run: |
          pushd ${TORCHCHAT_ROOT}
          bash .ci/scripts/convert_checkpoint.sh ${REPO_NAME}
          bash .ci/scripts/validate.sh "./checkpoints/${REPO_NAME}/model.pth" "cpu" "aoti"
  gather-models-gpu:
    runs-on: ubuntu-22.04
    outputs:
      models: ${{ steps.gather-models-gpu.outputs.models }}
    steps:
      - uses: actions/checkout@v3
        with:
          submodules: 'false'
      - uses: actions/setup-python@v4
        with:
          python-version: '3.11'
      - name: Extract the list of models to run on GPU
        id: gather-models-gpu
        run: |
          set -eux
          PYTHONPATH="${PWD}" python .ci/scripts/gather_test_models.py --event "periodic" --backend "gpu"
  test-gpu-compile:
    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
    name: test-gpu-compile (${{ matrix.platform }}, ${{ matrix.model_name }})
    needs: gather-models-gpu
    strategy:
      matrix: ${{ fromJSON(needs.gather-models-gpu.outputs.models) }}
      fail-fast: false
    with:
      runner: linux.g5.4xlarge.nvidia.gpu
      gpu-arch-type: cuda
      gpu-arch-version: "12.1"
      script: |
        echo "::group::Print machine info"
        nvidia-smi
        echo "::endgroup::"

        echo "::group::Install required packages"
        pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu121
        pip install -r ./requirements.txt
        pip list
        echo "::endgroup::"

        echo "::group::Download checkpoint"
        export REPO_NAME=${{ matrix.repo_name }}
        bash .ci/scripts/wget_checkpoint.sh ${REPO_NAME} ${{ matrix.resources }}
        echo "::endgroup::"

        echo "::group::Convert checkpoint"
        bash .ci/scripts/convert_checkpoint.sh ${REPO_NAME}
        echo "::endgroup::"

        echo "::group::Run inference"
        bash .ci/scripts/validate.sh "./checkpoints/${REPO_NAME}/model.pth" "cuda" "compile"
        echo "::endgroup::"
  test-gpu-aoti:
    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
    name: test-gpu-aoti (${{ matrix.platform }}, ${{ matrix.model_name }})
    needs: gather-models-gpu
    strategy:
      matrix: ${{ fromJSON(needs.gather-models-gpu.outputs.models) }}
      fail-fast: false
    with:
      runner: linux.g5.4xlarge.nvidia.gpu
      gpu-arch-type: cuda
      gpu-arch-version: "12.1"
      script: |
        echo "::group::Print machine info"
        nvidia-smi
        echo "::endgroup::"

        echo "::group::Install required packages"
        pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu121
        pip install -r ./requirements.txt
        pip list
        echo "::endgroup::"

        echo "::group::Download checkpoint"
        export REPO_NAME=${{ matrix.repo_name }}
        bash .ci/scripts/wget_checkpoint.sh ${REPO_NAME} ${{ matrix.resources }}
        echo "::endgroup::"

        echo "::group::Convert checkpoint"
        bash .ci/scripts/convert_checkpoint.sh ${REPO_NAME}
        echo "::endgroup::"

        echo "::group::Run inference"
        bash .ci/scripts/validate.sh "./checkpoints/${REPO_NAME}/model.pth" "cuda" "aoti"
        echo "::endgroup::"
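For reference, two ways this workflow could be triggered on demand, given the `ciflow/periodic/*` tag filter and `workflow_dispatch` shown above; the tag name and the `gh` invocation are illustrative, not part of this PR:

# Option 1: push a tag matching the ciflow/periodic/* trigger
git tag ciflow/periodic/manual-run
git push origin ciflow/periodic/manual-run

# Option 2: fire the workflow_dispatch trigger with the GitHub CLI
gh workflow run periodic.yml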