Skip to content

Commit

Permalink
[fbgemm_gpu] Add benchmark workflow
Browse files Browse the repository at this point in the history
- Add benchmark workflow for AMD TBE
  • Loading branch information
q10 committed Feb 20, 2025
1 parent 853e97c commit f95f0c3
Show file tree
Hide file tree
Showing 7 changed files with 507 additions and 30 deletions.
78 changes: 78 additions & 0 deletions .github/scripts/fbgemm_gpu_benchmarks.bash
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
#!/bin/bash
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.


# shellcheck disable=SC1091,SC2128
. "$( dirname -- "$BASH_SOURCE"; )/utils_base.bash"

################################################################################
# FBGEMM_GPU Test Helper Functions
################################################################################

################################################################################
# Run the TBE (split table batched embeddings) microbenchmark for AMD.
#
# Arguments:
#   $1 - name of (or prefix flag for) the Conda environment to run in
# Outputs:
#   Benchmark banners and results to stdout; errors to stderr
# Returns:
#   0 if every benchmark run succeeds, non-zero otherwise
################################################################################
run_tbe_microbench_for_amd () {
  local env_name="$1"

  # Run one (cache_type, embedding_location) benchmark combination.
  __single_run() {
    local cache_type="$1"
    local embedding_location="$2"

    echo "################################################################################"
    echo "# Running Benchmark: (${cache_type}, ${embedding_location})"
    echo "#"
    echo "# [$(date --utc +%FT%T.%3NZ)] + ${FUNCNAME[0]} ${*}"
    echo "################################################################################"
    echo ""

    # shellcheck disable=SC2155
    local env_prefix=$(env_name_or_prefix "${env_name}")

    # Map the embedding location to the benchmark's --managed flag value.
    # Fail fast on an unrecognized location instead of silently passing an
    # empty --managed= to the benchmark (the original left `managed` unset).
    local managed
    if [ "$embedding_location" == "hbm" ]; then
      managed="device"
    elif [ "$embedding_location" == "uvm" ]; then
      managed="managed"
    else
      echo "[BENCHMARK] Unknown embedding location: ${embedding_location}" >&2
      return 1
    fi

    # NOTE: ${env_prefix} is intentionally unquoted so it word-splits into
    # separate `conda run` arguments (it may be empty or multiple tokens).
    # shellcheck disable=SC2086
    print_exec conda run --no-capture-output ${env_prefix} python split_table_batched_embeddings_benchmark.py device \
      --batch-size 131072 \
      --embedding-dim 256 \
      --iters 400 \
      --warmup-runs 50 \
      --alpha 1.15 \
      --bag-size 55 \
      --weights-precision fp16 \
      --cache-precision "${cache_type}" \
      --output-dtype bf16 \
      --managed="${managed}" \
      --num-embeddings 10000000 \
      --num-tables 1 \
      --row-wise \
      --num-requests 10 \
      --pooling=none
  }

  pushd fbgemm_gpu/bench || return 1

  local cache_types=(
    fp16
    fp32
  )

  local embedding_locations=(
    hbm
    # uvm
  )

  # Sweep the cross product of cache types and embedding locations,
  # bailing out on the first failing combination.
  for cache_type in "${cache_types[@]}"; do
    for embedding_location in "${embedding_locations[@]}"; do
      __single_run "${cache_type}" "${embedding_location}" || return 1
      echo ""
      echo ""
    done
  done

  popd || return 1
}
2 changes: 2 additions & 0 deletions .github/scripts/setup_env.bash
Original file line number Diff line number Diff line change
Expand Up @@ -37,3 +37,5 @@
. "$( dirname -- "$BASH_SOURCE"; )/fbgemm_gpu_lint.bash"
# shellcheck disable=SC1091,SC2128
. "$( dirname -- "$BASH_SOURCE"; )/fbgemm_gpu_test.bash"
# shellcheck disable=SC1091,SC2128
. "$( dirname -- "$BASH_SOURCE"; )/fbgemm_gpu_benchmarks.bash"
26 changes: 13 additions & 13 deletions .github/scripts/utils_system.bash
Original file line number Diff line number Diff line change
Expand Up @@ -165,22 +165,22 @@ print_gpu_info () {
if [[ "${ENFORCE_ROCM_DEVICE}" ]]; then
# Ensure that rocm-smi is available and returns GPU entries
if ! rocm-smi; then
echo "[CHECK] ROCm drivers and ROCm device are required for this workflow, but does not appear to be installed or available!"
echo "[CHECK] ROCm drivers and ROCm device(s) are required for this workflow, but does not appear to be installed or available!"
return 1
fi
else
if which rocminfo; then
# If rocminfo is installed on a machine without GPUs, this will return error
(print_exec rocminfo) || true
else
echo "[CHECK] rocminfo not found"
fi
if which rocm-smi; then
# If rocm-smi is installed on a machine without GPUs, this will return error
(print_exec rocm-smi) || true
else
echo "[CHECK] rocm-smi not found"
fi
local smi_programs=( rocminfo rocm-smi )

for smi_program in "${smi_programs[@]}"; do
# shellcheck disable=SC2086
if which $smi_program; then
# If the program is installed on a machine without GPUs, invoking it will return error
# shellcheck disable=SC2086
(print_exec $smi_program) || true
else
echo "[CHECK] $smi_program not found"
fi
done
fi
}

Expand Down
209 changes: 209 additions & 0 deletions .github/workflows/fbgemm_gpu_benchmark_cuda.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,209 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# This workflow is used for FBGEMM_GPU-CUDA Benchmarking
name: FBGEMM_GPU-CUDA Benchmark

on:
  # PR Trigger (enabled for regression checks and debugging)
  #
  pull_request:
    branches:
      - main

  # Push Trigger (enable to catch errors coming out of multiple merges)
  #
  push:
    branches:
      - main

  # Manual Trigger
  #
  workflow_dispatch:
    inputs:
      pytorch_channel_version:
        description: Package Channel + Version to Use for PyTorch Installation, in `<channel>[/<version>]` Format
        type: string
        required: false
        default: ""

concurrency:
  # Cancel previous runs in the PR if a new commit is pushed
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
  cancel-in-progress: true

jobs:
  # Build on CPU hosts and upload to GHA
  build_artifact:
    if: ${{ github.repository_owner == 'pytorch' }}
    runs-on: ${{ matrix.host-machine.instance }}
    container:
      image: amazonlinux:2023
      options: --user root
    defaults:
      run:
        shell: bash
    env:
      PRELUDE: .github/scripts/setup_env.bash
      BUILD_ENV: build_binary
      BUILD_VARIANT: cuda
      BUILD_CUDA_VERSION: ${{ matrix.cuda-version }}
    continue-on-error: true
    strategy:
      # Don't fast-fail all the other builds if one of them fails
      fail-fast: false
      matrix:
        host-machine: [
          { arch: x86, instance: "linux.24xlarge" },
        ]
        python-version: [ "3.13" ]
        cuda-version: [ "11.8.0", "12.4.1", "12.6.3", "12.8.0" ]
        compiler: [ "gcc", "clang" ]

    steps:
      - name: Setup Build Container
        run: yum update -y; yum install -y binutils findutils git pciutils sudo tar wget which

      - name: Checkout the Repository
        uses: actions/checkout@v4
        with:
          submodules: true

      - name: Display System Info
        run: . $PRELUDE; print_system_info

      - name: Display GPU Info
        run: . $PRELUDE; print_gpu_info

      - name: Setup Miniconda
        run: . $PRELUDE; setup_miniconda $HOME/miniconda

      - name: Create Conda Environment
        run: . $PRELUDE; create_conda_environment $BUILD_ENV ${{ matrix.python-version }}

      - name: Install C/C++ Compilers
        run: . $PRELUDE; install_cxx_compiler $BUILD_ENV ${{ matrix.compiler }}

      - name: Install Build Tools
        run: . $PRELUDE; install_build_tools $BUILD_ENV

      - name: Install CUDA
        run: . $PRELUDE; install_cuda $BUILD_ENV ${{ matrix.cuda-version }}

      # Install via PIP to avoid defaulting to the CPU variant if the GPU variant of the day is not ready
      - name: Install PyTorch Nightly
        run: . $PRELUDE; install_pytorch_pip $BUILD_ENV ${{ (github.event_name == 'workflow_dispatch' && github.event.inputs.pytorch_channel_version) || 'nightly' }} cuda/${{ matrix.cuda-version }}

      - name: Collect PyTorch Environment Info
        if: ${{ success() || failure() }}
        run: if . $PRELUDE && which conda; then collect_pytorch_env_info $BUILD_ENV; fi

      - name: Install cuDNN
        run: . $PRELUDE; install_cudnn $BUILD_ENV "$(pwd)/build_only/cudnn" ${{ matrix.cuda-version }}

      - name: Prepare FBGEMM_GPU Build
        run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV

      - name: Build FBGEMM_GPU Wheel
        run: . $PRELUDE; cd fbgemm_gpu; build_fbgemm_gpu_package $BUILD_ENV nightly cuda

      - name: Upload Built Wheel as GHA Artifact
        uses: actions/upload-artifact@v4
        with:
          name: fbgemm_gpu_nightly_cuda_${{ matrix.host-machine.arch }}_${{ matrix.compiler }}_py${{ matrix.python-version }}_cu${{ matrix.cuda-version }}.whl
          path: fbgemm_gpu/dist/*.whl
          if-no-files-found: error


  # Download the built wheel from GHA and run the benchmark on a GPU instance
  benchmark_artifact:
    if: ${{ github.repository_owner == 'pytorch' }}
    # runs-on: linux.4xlarge.nvidia.gpu
    # Use available instance types - https://github.com/pytorch/test-infra/blob/main/.github/scale-config.yml
    runs-on: ${{ matrix.host-machine.instance }}
    defaults:
      run:
        shell: bash
    env:
      PRELUDE: .github/scripts/setup_env.bash
      BUILD_ENV: build_binary
      BUILD_VARIANT: cuda
      BUILD_CUDA_VERSION: ${{ matrix.cuda-version }}
      ENFORCE_CUDA_DEVICE: 1
    strategy:
      fail-fast: false
      matrix:
        host-machine: [
          { arch: x86, instance: "linux.g5.4xlarge.nvidia.gpu" },
          # TODO: Enable when A100 machine queues are reasonably small enough for doing per-PR CI
          # https://hud.pytorch.org/metrics
          # { arch: x86, instance: "linux.gcp.a100" },
        ]
        python-version: [ "3.13" ]
        cuda-version: [ "11.8.0", "12.4.1", "12.6.3", "12.8.0" ]
        # Specify exactly ONE CUDA version for artifact publish
        # NOTE(review): cuda-version-publish does not appear to be referenced
        # by any step in this benchmark workflow — likely copied from a
        # release workflow; confirm whether it can be removed
        cuda-version-publish: [ "12.4.1" ]
        compiler: [ "gcc", "clang" ]
    needs: build_artifact

    steps:
      - name: Checkout the Repository
        uses: actions/checkout@v4
        with:
          submodules: true

      - name: Download Wheel Artifact from GHA
        uses: actions/download-artifact@v4
        with:
          name: fbgemm_gpu_nightly_cuda_${{ matrix.host-machine.arch }}_${{ matrix.compiler }}_py${{ matrix.python-version }}_cu${{ matrix.cuda-version }}.whl

      # Use PyTorch test infrastructure action - https://github.com/pytorch/test-infra/blob/main/.github/actions/setup-nvidia/action.yml
      - name: Install NVIDIA Drivers and NVIDIA-Docker Runtime
        uses: pytorch/test-infra/.github/actions/setup-nvidia@main

      - name: Display System Info
        run: . $PRELUDE; print_system_info; print_ec2_info

      - name: Display GPU Info
        run: . $PRELUDE; print_gpu_info

      - name: Setup Miniconda
        run: . $PRELUDE; setup_miniconda $HOME/miniconda

      - name: Create Conda Environment
        run: . $PRELUDE; create_conda_environment $BUILD_ENV ${{ matrix.python-version }}

      - name: Install Build Tools
        run: . $PRELUDE; install_build_tools $BUILD_ENV

      - name: Install C/C++ Compilers for Updated LIBGCC
        # NOTE: gcc is required for torch dynamo to work properly, as some of
        # the compilation flags used by torch dynamo are gcc-specific:
        #
        #   clang-16: error: unknown argument: '-fno-tree-loop-vectorize'
        run: . $PRELUDE; install_cxx_compiler $BUILD_ENV gcc

      - name: Install CUDA
        run: . $PRELUDE; install_cuda $BUILD_ENV ${{ matrix.cuda-version }}

      # Install via PIP to avoid defaulting to the CPU variant if the GPU variant of the day is not ready
      - name: Install PyTorch Nightly
        run: . $PRELUDE; install_pytorch_pip $BUILD_ENV ${{ (github.event_name == 'workflow_dispatch' && github.event.inputs.pytorch_channel_version) || 'nightly' }} cuda/${{ matrix.cuda-version }}

      - name: Collect PyTorch Environment Info
        if: ${{ success() || failure() }}
        run: if . $PRELUDE && which conda; then collect_pytorch_env_info $BUILD_ENV; fi

      - name: Prepare FBGEMM_GPU Build
        run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV

      - name: Install FBGEMM_GPU Wheel
        run: . $PRELUDE; install_fbgemm_gpu_wheel $BUILD_ENV *.whl

      - name: Run FBGEMM_GPU Benchmark
        timeout-minutes: 40
        # NOTE(review): this CUDA workflow invokes the AMD-named helper
        # run_tbe_microbench_for_amd — confirm this is intentional (same
        # benchmark shared across vendors) or rename the helper
        run: . $PRELUDE; run_tbe_microbench_for_amd $BUILD_ENV
Loading

0 comments on commit f95f0c3

Please sign in to comment.