Skip to content

Commit

Permalink
[fbgemm_gpu] Add benchmark workflow
Browse files Browse the repository at this point in the history
- Add benchmark workflow for AMD TBE
  • Loading branch information
q10 committed Feb 20, 2025
1 parent 853e97c commit f95f0c3
Show file tree
Hide file tree
Showing 7 changed files with 507 additions and 30 deletions.
78 changes: 78 additions & 0 deletions .github/scripts/fbgemm_gpu_benchmarks.bash
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
#!/bin/bash
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.


# shellcheck disable=SC1091,SC2128
. "$( dirname -- "$BASH_SOURCE"; )/utils_base.bash"

################################################################################
# FBGEMM_GPU Test Helper Functions
################################################################################

################################################################################
# Run the TBE (split table batched embeddings) microbenchmark for AMD.
#
# Arguments:
#   $1 - name of (or prefix flag for) the Conda environment to run in
# Outputs:
#   Benchmark banners and results to stdout; errors to stderr
# Returns:
#   0 if every benchmark run succeeds, non-zero otherwise
################################################################################
run_tbe_microbench_for_amd () {
  local env_name="$1"

  # Run one (cache_type, embedding_location) benchmark combination.
  __single_run() {
    local cache_type="$1"
    local embedding_location="$2"

    echo "################################################################################"
    echo "# Running Benchmark: (${cache_type}, ${embedding_location})"
    echo "#"
    echo "# [$(date --utc +%FT%T.%3NZ)] + ${FUNCNAME[0]} ${*}"
    echo "################################################################################"
    echo ""

    # shellcheck disable=SC2155
    local env_prefix=$(env_name_or_prefix "${env_name}")

    # Map the embedding location to the benchmark's --managed flag value.
    # Fail fast on an unrecognized location instead of silently passing an
    # empty --managed= to the benchmark (the original left `managed` unset).
    local managed
    if [ "$embedding_location" == "hbm" ]; then
      managed="device"
    elif [ "$embedding_location" == "uvm" ]; then
      managed="managed"
    else
      echo "[BENCHMARK] Unknown embedding location: ${embedding_location}" >&2
      return 1
    fi

    # NOTE: ${env_prefix} is intentionally unquoted so it word-splits into
    # separate `conda run` arguments (it may be empty or multiple tokens).
    # shellcheck disable=SC2086
    print_exec conda run --no-capture-output ${env_prefix} python split_table_batched_embeddings_benchmark.py device \
      --batch-size 131072 \
      --embedding-dim 256 \
      --iters 400 \
      --warmup-runs 50 \
      --alpha 1.15 \
      --bag-size 55 \
      --weights-precision fp16 \
      --cache-precision "${cache_type}" \
      --output-dtype bf16 \
      --managed="${managed}" \
      --num-embeddings 10000000 \
      --num-tables 1 \
      --row-wise \
      --num-requests 10 \
      --pooling=none
  }

  pushd fbgemm_gpu/bench || return 1

  local cache_types=(
    fp16
    fp32
  )

  local embedding_locations=(
    hbm
    # uvm
  )

  # Sweep the cross product of cache types and embedding locations,
  # bailing out on the first failing combination.
  for cache_type in "${cache_types[@]}"; do
    for embedding_location in "${embedding_locations[@]}"; do
      __single_run "${cache_type}" "${embedding_location}" || return 1
      echo ""
      echo ""
    done
  done

  popd || return 1
}
2 changes: 2 additions & 0 deletions .github/scripts/setup_env.bash
Original file line number Diff line number Diff line change
Expand Up @@ -37,3 +37,5 @@
. "$( dirname -- "$BASH_SOURCE"; )/fbgemm_gpu_lint.bash"
# shellcheck disable=SC1091,SC2128
. "$( dirname -- "$BASH_SOURCE"; )/fbgemm_gpu_test.bash"
# shellcheck disable=SC1091,SC2128
. "$( dirname -- "$BASH_SOURCE"; )/fbgemm_gpu_benchmarks.bash"
26 changes: 13 additions & 13 deletions .github/scripts/utils_system.bash
Original file line number Diff line number Diff line change
Expand Up @@ -165,22 +165,22 @@ print_gpu_info () {
if [[ "${ENFORCE_ROCM_DEVICE}" ]]; then
# Ensure that rocm-smi is available and returns GPU entries
if ! rocm-smi; then
echo "[CHECK] ROCm drivers and ROCm device are required for this workflow, but does not appear to be installed or available!"
echo "[CHECK] ROCm drivers and ROCm device(s) are required for this workflow, but does not appear to be installed or available!"
return 1
fi
else
if which rocminfo; then
# If rocminfo is installed on a machine without GPUs, this will return error
(print_exec rocminfo) || true
else
echo "[CHECK] rocminfo not found"
fi
if which rocm-smi; then
# If rocm-smi is installed on a machine without GPUs, this will return error
(print_exec rocm-smi) || true
else
echo "[CHECK] rocm-smi not found"
fi
local smi_programs=( rocminfo rocm-smi )

for smi_program in "${smi_programs[@]}"; do
# shellcheck disable=SC2086
if which $smi_program; then
# If the program is installed on a machine without GPUs, invoking it will return error
# shellcheck disable=SC2086
(print_exec $smi_program) || true
else
echo "[CHECK] $smi_program not found"
fi
done
fi
}

Expand Down
209 changes: 209 additions & 0 deletions .github/workflows/fbgemm_gpu_benchmark_cuda.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,209 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# This workflow is used for FBGEMM_GPU-CUDA Benchmarking
name: FBGEMM_GPU-CUDA Benchmark

on:
  # PR Trigger (enabled for regression checks and debugging)
  #
  pull_request:
    branches:
      - main

  # Push Trigger (enable to catch errors coming out of multiple merges)
  #
  push:
    branches:
      - main

  # Manual Trigger
  #
  workflow_dispatch:
    inputs:
      pytorch_channel_version:
        description: Package Channel + Version to Use for PyTorch Installation, in `<channel>[/<version>]` Format
        type: string
        required: false
        default: ""

concurrency:
  # Cancel previous runs in the PR if a new commit is pushed
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
  cancel-in-progress: true

jobs:
  # Build on CPU hosts and upload to GHA
  build_artifact:
    if: ${{ github.repository_owner == 'pytorch' }}
    runs-on: ${{ matrix.host-machine.instance }}
    container:
      image: amazonlinux:2023
      options: --user root
    defaults:
      run:
        shell: bash
    env:
      PRELUDE: .github/scripts/setup_env.bash
      BUILD_ENV: build_binary
      BUILD_VARIANT: cuda
      BUILD_CUDA_VERSION: ${{ matrix.cuda-version }}
    continue-on-error: true
    strategy:
      # Don't fast-fail all the other builds if one of them fails
      fail-fast: false
      matrix:
        host-machine: [
          { arch: x86, instance: "linux.24xlarge" },
        ]
        python-version: [ "3.13" ]
        cuda-version: [ "11.8.0", "12.4.1", "12.6.3", "12.8.0" ]
        compiler: [ "gcc", "clang" ]

    steps:
      - name: Setup Build Container
        run: yum update -y; yum install -y binutils findutils git pciutils sudo tar wget which

      - name: Checkout the Repository
        uses: actions/checkout@v4
        with:
          submodules: true

      - name: Display System Info
        run: . $PRELUDE; print_system_info

      - name: Display GPU Info
        run: . $PRELUDE; print_gpu_info

      - name: Setup Miniconda
        run: . $PRELUDE; setup_miniconda $HOME/miniconda

      - name: Create Conda Environment
        run: . $PRELUDE; create_conda_environment $BUILD_ENV ${{ matrix.python-version }}

      - name: Install C/C++ Compilers
        run: . $PRELUDE; install_cxx_compiler $BUILD_ENV ${{ matrix.compiler }}

      - name: Install Build Tools
        run: . $PRELUDE; install_build_tools $BUILD_ENV

      - name: Install CUDA
        run: . $PRELUDE; install_cuda $BUILD_ENV ${{ matrix.cuda-version }}

      # Install via PIP to avoid defaulting to the CPU variant if the GPU variant of the day is not ready
      - name: Install PyTorch Nightly
        run: . $PRELUDE; install_pytorch_pip $BUILD_ENV ${{ (github.event_name == 'workflow_dispatch' && github.event.inputs.pytorch_channel_version) || 'nightly' }} cuda/${{ matrix.cuda-version }}

      - name: Collect PyTorch Environment Info
        if: ${{ success() || failure() }}
        run: if . $PRELUDE && which conda; then collect_pytorch_env_info $BUILD_ENV; fi

      - name: Install cuDNN
        run: . $PRELUDE; install_cudnn $BUILD_ENV "$(pwd)/build_only/cudnn" ${{ matrix.cuda-version }}

      - name: Prepare FBGEMM_GPU Build
        run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV

      - name: Build FBGEMM_GPU Wheel
        run: . $PRELUDE; cd fbgemm_gpu; build_fbgemm_gpu_package $BUILD_ENV nightly cuda

      - name: Upload Built Wheel as GHA Artifact
        uses: actions/upload-artifact@v4
        with:
          name: fbgemm_gpu_nightly_cuda_${{ matrix.host-machine.arch }}_${{ matrix.compiler }}_py${{ matrix.python-version }}_cu${{ matrix.cuda-version }}.whl
          path: fbgemm_gpu/dist/*.whl
          if-no-files-found: error


  # Download the built wheel from GHA and run the benchmark on a GPU instance
  benchmark_artifact:
    if: ${{ github.repository_owner == 'pytorch' }}
    # runs-on: linux.4xlarge.nvidia.gpu
    # Use available instance types - https://github.com/pytorch/test-infra/blob/main/.github/scale-config.yml
    runs-on: ${{ matrix.host-machine.instance }}
    defaults:
      run:
        shell: bash
    env:
      PRELUDE: .github/scripts/setup_env.bash
      BUILD_ENV: build_binary
      BUILD_VARIANT: cuda
      BUILD_CUDA_VERSION: ${{ matrix.cuda-version }}
      ENFORCE_CUDA_DEVICE: 1
    strategy:
      fail-fast: false
      matrix:
        host-machine: [
          { arch: x86, instance: "linux.g5.4xlarge.nvidia.gpu" },
          # TODO: Enable when A100 machine queues are reasonably small enough for doing per-PR CI
          # https://hud.pytorch.org/metrics
          # { arch: x86, instance: "linux.gcp.a100" },
        ]
        python-version: [ "3.13" ]
        cuda-version: [ "11.8.0", "12.4.1", "12.6.3", "12.8.0" ]
        # Specify exactly ONE CUDA version for artifact publish
        # NOTE(review): cuda-version-publish does not appear to be referenced
        # by any step in this benchmark workflow — likely copied from a
        # release workflow; confirm whether it can be removed
        cuda-version-publish: [ "12.4.1" ]
        compiler: [ "gcc", "clang" ]
    needs: build_artifact

    steps:
      - name: Checkout the Repository
        uses: actions/checkout@v4
        with:
          submodules: true

      - name: Download Wheel Artifact from GHA
        uses: actions/download-artifact@v4
        with:
          name: fbgemm_gpu_nightly_cuda_${{ matrix.host-machine.arch }}_${{ matrix.compiler }}_py${{ matrix.python-version }}_cu${{ matrix.cuda-version }}.whl

      # Use PyTorch test infrastructure action - https://github.com/pytorch/test-infra/blob/main/.github/actions/setup-nvidia/action.yml
      - name: Install NVIDIA Drivers and NVIDIA-Docker Runtime
        uses: pytorch/test-infra/.github/actions/setup-nvidia@main

      - name: Display System Info
        run: . $PRELUDE; print_system_info; print_ec2_info

      - name: Display GPU Info
        run: . $PRELUDE; print_gpu_info

      - name: Setup Miniconda
        run: . $PRELUDE; setup_miniconda $HOME/miniconda

      - name: Create Conda Environment
        run: . $PRELUDE; create_conda_environment $BUILD_ENV ${{ matrix.python-version }}

      - name: Install Build Tools
        run: . $PRELUDE; install_build_tools $BUILD_ENV

      - name: Install C/C++ Compilers for Updated LIBGCC
        # NOTE: gcc is required for torch dynamo to work properly, as some of
        # the compilation flags used by torch dynamo are gcc-specific:
        #
        #   clang-16: error: unknown argument: '-fno-tree-loop-vectorize'
        run: . $PRELUDE; install_cxx_compiler $BUILD_ENV gcc

      - name: Install CUDA
        run: . $PRELUDE; install_cuda $BUILD_ENV ${{ matrix.cuda-version }}

      # Install via PIP to avoid defaulting to the CPU variant if the GPU variant of the day is not ready
      - name: Install PyTorch Nightly
        run: . $PRELUDE; install_pytorch_pip $BUILD_ENV ${{ (github.event_name == 'workflow_dispatch' && github.event.inputs.pytorch_channel_version) || 'nightly' }} cuda/${{ matrix.cuda-version }}

      - name: Collect PyTorch Environment Info
        if: ${{ success() || failure() }}
        run: if . $PRELUDE && which conda; then collect_pytorch_env_info $BUILD_ENV; fi

      - name: Prepare FBGEMM_GPU Build
        run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV

      - name: Install FBGEMM_GPU Wheel
        run: . $PRELUDE; install_fbgemm_gpu_wheel $BUILD_ENV *.whl

      - name: Run FBGEMM_GPU Benchmark
        timeout-minutes: 40
        # NOTE(review): this CUDA workflow invokes the AMD-named helper
        # run_tbe_microbench_for_amd — confirm this is intentional (same
        # benchmark shared across vendors) or rename the helper
        run: . $PRELUDE; run_tbe_microbench_for_amd $BUILD_ENV
Loading

0 comments on commit f95f0c3

Please sign in to comment.