This repository has been archived by the owner on Oct 11, 2024. It is now read-only.

Benchmark : Fix remote push job #129

Merged · 1 commit · Mar 26, 2024
13 changes: 10 additions & 3 deletions .github/actions/nm-github-action-benchmark/action.yml
@@ -25,6 +25,13 @@ inputs:
- 'true'
- 'false'
required: true
reporting_enabled:
description: "When set to true, if there is a regression, do 3 things. 1. Mark the workflow as failed. 2. Add commit comments"
type: choice
options:
- 'true'
- 'false'
required: true
github_token:
description: "secrets.GITHUB_TOKEN from the caller"
required: true
@@ -44,12 +51,12 @@ runs:
# Push and deploy to Github pages automatically
auto-push: ${{ inputs.auto_push == 'true' }}
# Add a commit comment comparing the current benchmark with the previous.
comment-always: true
comment-always: ${{ inputs.reporting_enabled == 'true' }}
# Create an alert when some value has regressed more than 10%
alert-threshold: "110%"
# Mark the workflow as a failure when some alert is triggered
fail-on-alert: true
fail-on-alert: ${{ inputs.reporting_enabled == 'true' }}
# Add a commit comment describing what triggered the alert
comment-on-alert: true
comment-on-alert: ${{ inputs.reporting_enabled == 'true' }}
# TODO (varun): Is this a reasonable number ?
max-items-in-chart: 50
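
For context: composite-action inputs arrive as strings, so the new `reporting_enabled` choice is folded into a boolean by the string comparison `inputs.reporting_enabled == 'true'` used above. A minimal Python sketch of the same conversion (illustration only; the real comparison happens in GitHub's expression syntax):

```python
def to_flag(raw: str) -> bool:
    # Mirrors `${{ inputs.reporting_enabled == 'true' }}`: only the exact
    # string 'true' enables reporting; anything else disables it.
    return raw == "true"

assert to_flag("true") is True
assert to_flag("false") is False
assert to_flag("True") is False  # the comparison is case-sensitive
```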
4 changes: 3 additions & 1 deletion .github/actions/nm-produce-gha-benchmark-json/action.yml
@@ -10,6 +10,8 @@ inputs:
smaller_is_better_output_file_path:
description: 'Path to a file where the GHA CustomSmallerIsBetter JSON is to be stored'
required: true
observation_metrics_output_file_path:
description: 'Path to a file where metrics that we only want to observe are stored'
python:
description: 'python version, e.g. 3.10.12'
required: true
@@ -25,7 +27,7 @@ runs:
VENV="${{ inputs.venv }}-${COMMIT:0:7}"
source $(pyenv root)/versions/${{ inputs.python }}/envs/${VENV}/bin/activate
SUCCESS=0
python3 -m neuralmagic.benchmarks.scripts.logging.gha_benchmark_logging -i ${{inputs.vllm_benchmark_jsons_path}} --bigger-is-better-output-file-path ${{ inputs.bigger_is_better_output_file_path }} --smaller-is-better-output-file-path ${{ inputs.smaller_is_better_output_file_path }} || SUCCESS=$?
python3 -m neuralmagic.benchmarks.scripts.logging.gha_benchmark_logging -i ${{inputs.vllm_benchmark_jsons_path}} --bigger-is-better-metrics-output-file-path ${{ inputs.bigger_is_better_output_file_path }} --smaller-is-better-metrics-output-file-path ${{ inputs.smaller_is_better_output_file_path }} --observation-metrics-output-file-path ${{ inputs.observation_metrics_output_file_path }} || SUCCESS=$?
echo "test=${SUCCESS}" >> "$GITHUB_OUTPUT"
exit ${SUCCESS}
shell: bash
27 changes: 25 additions & 2 deletions .github/workflows/nm-benchmark.yml
@@ -145,6 +145,8 @@ jobs:
bigger_is_better_output_file_path: gh-action-benchmark-jsons/bigger_is_better.json
# Metrics that are "better" when the value is smaller are stored here
smaller_is_better_output_file_path: gh-action-benchmark-jsons/smaller_is_better.json
# Metrics that we only want to observe are stored here
observation_metrics_output_file_path: gh-action-benchmark-jsons/observation_metrics.json
python: ${{ inputs.python }}
venv: TEST

@@ -189,23 +191,44 @@ jobs:
run: ls -R ./downloads

- name: nm-github-action-benchmark(bigger_is_better.json)
# Absence of the file indicates that there were no "bigger_is_better" metrics
if: ${{ hashFiles('downloads/bigger_is_better.json') != '' }}
uses: ./.github/actions/nm-github-action-benchmark
if: success() || failure()
with:
gh_action_benchmark_name: "bigger_is_better"
gh_action_benchmark_json_file_path: "downloads/bigger_is_better.json"
gh_action_benchmark_tool: "customBiggerIsBetter"
gh_pages_branch: "nm-gh-pages"
auto_push: ${{ inputs.push_benchmark_results_to_gh_pages }}
reporting_enabled: "true"
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

- name: nm-github-action-benchmark(smaller_is_better.json)
# Absence of the file indicates that there were no "smaller_is_better" metrics
if: ${{ hashFiles('downloads/smaller_is_better.json') != '' }}
uses: ./.github/actions/nm-github-action-benchmark
if: success() || failure()
with:
gh_action_benchmark_name: "smaller_is_better"
gh_action_benchmark_json_file_path: "downloads/smaller_is_better.json"
gh_action_benchmark_tool: "customSmallerIsBetter"
gh_pages_branch: "nm-gh-pages"
auto_push: ${{ inputs.push_benchmark_results_to_gh_pages }}
reporting_enabled: "true"
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

- name: nm-github-action-benchmark(observation_metrics.json)
# Absence of the file indicates that there were no "observation" metrics
if: ${{ hashFiles('downloads/observation_metrics.json') != '' }}
uses: ./.github/actions/nm-github-action-benchmark
with:
gh_action_benchmark_name: "observation_metrics"
gh_action_benchmark_json_file_path: "downloads/observation_metrics.json"
# `github-action-benchmark` expects a tool name that is either
# "customBiggerIsBetter" or "customSmallerIsBetter". This is a hack to
# work around that. Since we mark the action to not report failures, this
# is fine.
gh_action_benchmark_tool: "customBiggerIsBetter"
gh_pages_branch: "nm-gh-pages"
auto_push: ${{ inputs.push_benchmark_results_to_gh_pages }}
reporting_enabled: "false"
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
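
Each of the three steps above is skipped via its `hashFiles(...)` guard when the JSON is absent, and otherwise hands `github-action-benchmark` a custom-tool JSON file: a list of records with `name`, `unit`, `value`, and `extra` fields (the field names mirror `GHARecord` in the logging script further down). A sketch of what `observation_metrics.json` might contain, with a made-up metric name and value:

```python
import json

# Hypothetical content for downloads/observation_metrics.json; the real file
# is produced by gha_benchmark_logging.py from BenchmarkResult JSONs.
observation_metrics = [
    {
        "name": "example_observed_metric",
        "unit": "ms",
        "value": 123.4,
        "extra": json.dumps({"description": "made-up benchmark context"}, indent=2),
    },
]

with open("observation_metrics.json", "w") as f:
    json.dump(observation_metrics, f, indent=4)
```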
24 changes: 12 additions & 12 deletions .github/workflows/remote-push.yml
@@ -30,15 +30,15 @@ jobs:
secrets: inherit

# Benchmarks
#AWS-AVX2-32G-A10G-24G-Benchmark:
# uses: ./.github/workflows/nm-benchmark.yml
# with:
# label: aws-avx2-32G-a10g-24G
# benchmark_config_list_file: ./.github/data/nm_benchmark_remote_push_configs_list.txt
# timeout: 60
# gitref: '${{ github.ref }}'
# Gi_per_thread: 12
# nvcc_threads: 1
# python: "3.10.12"
# push_benchmark_results_to_gh_pages: "false"
# secrets: inherit
AWS-AVX2-32G-A10G-24G-Benchmark:
uses: ./.github/workflows/nm-benchmark.yml
with:
label: aws-avx2-32G-a10g-24G
benchmark_config_list_file: ./.github/data/nm_benchmark_remote_push_configs_list.txt
timeout: 60
gitref: '${{ github.ref }}'
Gi_per_thread: 12
nvcc_threads: 1
python: "3.10.12"
push_benchmark_results_to_gh_pages: "false"
secrets: inherit
47 changes: 25 additions & 22 deletions neuralmagic/benchmarks/scripts/logging/benchmark_result.py
@@ -22,17 +22,21 @@
BENCHMARK_RESULTS_SCHEMA_VERSION = "0.0.0"


class GHABenchmarkToolName(str, Enum):
BiggerIsBetter = "CustomBiggerIsBetter"
SmallerIsBetter = "CustomSmallerIsBetter"
class BenchmarkMetricType(str, Enum):
# Metrics that are "better" when the value is greater e.g. throughput.
BiggerIsBetter = "BiggerIsBetter"
# Metrics that are "better" when the value is smaller e.g. latency.
SmallerIsBetter = "SmallerIsBetter"
# Metrics that are too volatile and we primarily use for observation.
Observation = "Observation"


@dataclass
class MetricTemplate:
key: str = field(default=None)
unit: str = field(default=None)
value: float = field(default=None)
tool: GHABenchmarkToolName = field(default=None)
type: BenchmarkMetricType = field(default=None)

def from_dict(d: dict):
template: MetricTemplate = MetricTemplate()
@@ -51,40 +55,39 @@ def from_dict(d: dict):

BenchmarkServingResultMetricTemplates = SimpleNamespace(
request_throughput=MetricTemplate("request_throughput", "prompts/s", None,
GHABenchmarkToolName.BiggerIsBetter),
BenchmarkMetricType.BiggerIsBetter),
input_throughput=MetricTemplate("input_throughput", "tokens/s", None,
GHABenchmarkToolName.BiggerIsBetter),
BenchmarkMetricType.BiggerIsBetter),
output_throughput=MetricTemplate("output_throughput", "tokens/s", None,
GHABenchmarkToolName.BiggerIsBetter),
median_request_latency=MetricTemplate(
"median_request_latency", "ms", None,
GHABenchmarkToolName.SmallerIsBetter),
BenchmarkMetricType.BiggerIsBetter),
median_request_latency=MetricTemplate("median_request_latency", "ms", None,
BenchmarkMetricType.SmallerIsBetter),
p90_request_latency=MetricTemplate("p90_request_latency", "ms", None,
GHABenchmarkToolName.SmallerIsBetter),
BenchmarkMetricType.SmallerIsBetter),
p99_request_latency=MetricTemplate("p99_request_latency", "ms", None,
GHABenchmarkToolName.SmallerIsBetter),
BenchmarkMetricType.SmallerIsBetter),
mean_ttft_ms=MetricTemplate("mean_ttft_ms", "ms", None,
GHABenchmarkToolName.SmallerIsBetter),
BenchmarkMetricType.SmallerIsBetter),
median_ttft_ms=MetricTemplate("median_ttft_ms", "ms", None,
GHABenchmarkToolName.SmallerIsBetter),
BenchmarkMetricType.SmallerIsBetter),
p90_ttft_ms=MetricTemplate("p90_ttft_ms", "ms", None,
GHABenchmarkToolName.SmallerIsBetter),
BenchmarkMetricType.SmallerIsBetter),
p99_ttft_ms=MetricTemplate("p99_ttft_ms", "ms", None,
GHABenchmarkToolName.SmallerIsBetter),
BenchmarkMetricType.SmallerIsBetter),
mean_tpot_ms=MetricTemplate("mean_tpot_ms", "ms", None,
GHABenchmarkToolName.SmallerIsBetter),
BenchmarkMetricType.SmallerIsBetter),
median_tpot_ms=MetricTemplate("median_tpot_ms", "ms", None,
GHABenchmarkToolName.SmallerIsBetter),
BenchmarkMetricType.SmallerIsBetter),
p90_tpot_ms=MetricTemplate("p90_tpot_ms", "ms", None,
GHABenchmarkToolName.SmallerIsBetter),
BenchmarkMetricType.SmallerIsBetter),
p99_tpot_ms=MetricTemplate("p99_tpot_ms", "ms", None,
GHABenchmarkToolName.SmallerIsBetter))
BenchmarkMetricType.SmallerIsBetter))

BenchmarkThroughputResultMetricTemplates = SimpleNamespace(
request_throughput=MetricTemplate("request_throughput", "prompts/s", None,
GHABenchmarkToolName.BiggerIsBetter),
BenchmarkMetricType.BiggerIsBetter),
token_throughput=MetricTemplate("token_throughput", "tokens/s", None,
GHABenchmarkToolName.BiggerIsBetter))
BenchmarkMetricType.BiggerIsBetter))


class BenchmarkResult:
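
To make the rename from `GHABenchmarkToolName` to `BenchmarkMetricType` concrete, here is a small sketch (with made-up values) showing how metric templates are tagged and then grouped by type — the same partitioning the logging script below performs when routing each type to its own output file:

```python
from neuralmagic.benchmarks.scripts.logging.benchmark_result import (
    BenchmarkMetricType, MetricTemplate)

metrics = [
    MetricTemplate("request_throughput", "prompts/s", 42.0,
                   BenchmarkMetricType.BiggerIsBetter),
    MetricTemplate("median_request_latency", "ms", 17.3,
                   BenchmarkMetricType.SmallerIsBetter),
    MetricTemplate("example_observed_metric", "ms", 5.0,
                   BenchmarkMetricType.Observation),
]

# Group metrics by type, as gha_benchmark_logging.py does before dumping
# each group to its own JSON file.
by_type = {t: [m for m in metrics if m.type is t] for t in BenchmarkMetricType}
assert len(by_type[BenchmarkMetricType.Observation]) == 1
```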
117 changes: 69 additions & 48 deletions neuralmagic/benchmarks/scripts/logging/gha_benchmark_logging.py
@@ -10,7 +10,7 @@
from dataclasses import dataclass
from typing import List, Iterable, NamedTuple

from .benchmark_result import (GHABenchmarkToolName, BenchmarkResult,
from .benchmark_result import (BenchmarkMetricType, BenchmarkResult,
MetricTemplate)


@@ -79,12 +79,12 @@ def from_metric_template(metric_template: MetricTemplate, extra: dict):
extra=f"{json.dumps(extra, indent=2)}")


class Tool_Record_T(NamedTuple):
tool: GHABenchmarkToolName
class Type_Record_T(NamedTuple):
type: BenchmarkMetricType
record: GHARecord


def process(json_file_path: Path) -> Iterable[Tool_Record_T]:
def process(json_file_path: Path) -> Iterable[Type_Record_T]:

assert json_file_path.exists()

@@ -101,80 +101,101 @@ def process(json_file_path: Path) -> Iterable[Tool_Record_T]:
lambda md: MetricTemplate.from_dict(md), metrics.values())

return map(
lambda metric: Tool_Record_T(
metric.tool,
lambda metric: Type_Record_T(
metric.type,
GHARecord.from_metric_template(metric, extra=hover_data)), metrics)


def main(input_directory: Path, bigger_is_better_output_json_file_name: Path,
smaller_is_better_output_json_file_name: Path) -> None:
def main(args: argparse.Namespace) -> None:
input_directory = Path(args.input_directory)

def dump_to_json(gha_records: List[GHARecord], output_path: Path):
json_file_paths = input_directory.glob('*.json')

type_records: List[Type_Record_T] = list(
reduce(lambda whole, part: whole + part,
(map(lambda json_file_path: list(process(json_file_path)),
json_file_paths))))

def filter_and_dump_if_non_empty(type_records: List[Type_Record_T],
type: BenchmarkMetricType,
output_path: Path):
"""
Given a list of type_record tuples, filter the records with the given
type.
If there are no records after filtering, don't dump JSON; otherwise,
dump all records as JSON.
"""
# Make output directory if it doesn't exist
output_path.parent.mkdir(parents=True, exist_ok=True)

gha_records: List[GHARecord] = list(
map(
lambda type_record: type_record.record,
filter(lambda type_record: type_record.type == type,
type_records)))

if len(gha_records) == 0:
return

# Make data JSON serializable
gha_record_dicts = list(map(lambda x: x.__dict__, gha_records))
with open(output_path, 'w+') as f:
json.dump(gha_record_dicts, f, indent=4)

json_file_paths = input_directory.glob('*.json')
tool_records: List[Tool_Record_T] = list(
reduce(lambda whole, part: whole + part,
(map(lambda json_file_path: list(process(json_file_path)),
json_file_paths))))

bigger_is_better: List[GHARecord] = list(
map(
lambda tool_record: tool_record.record,
filter(
lambda tool_record: tool_record.tool == GHABenchmarkToolName.
BiggerIsBetter, tool_records)))

smaller_is_better: List[GHARecord] = list(
map(
lambda tool_record: tool_record.record,
filter(
lambda tool_record: tool_record.tool == GHABenchmarkToolName.
SmallerIsBetter, tool_records)))

dump_to_json(bigger_is_better, bigger_is_better_output_json_file_name)
dump_to_json(smaller_is_better, smaller_is_better_output_json_file_name)
filter_and_dump_if_non_empty(
type_records, BenchmarkMetricType.BiggerIsBetter,
Path(args.bigger_is_better_metrics_output_file_path))
filter_and_dump_if_non_empty(
type_records, BenchmarkMetricType.SmallerIsBetter,
Path(args.smaller_is_better_metrics_output_file_path))
filter_and_dump_if_non_empty(
type_records, BenchmarkMetricType.Observation,
Path(args.observation_metrics_output_file_path))


if __name__ == '__main__':
parser = argparse.ArgumentParser(description="""
Process the benchmark JSONs produced by BenchmarkResult and output JSONs
that could be consumed by `github-action-benchmark`
that could be consumed by `github-action-benchmark`.
The JSONs are not produced if there are no metrics to report for some
BenchmarkMetricType.
Reference : https://github.com/benchmark-action/github-action-benchmark
""")

parser.add_argument(
"-i",
"--input-json-directory",
"--input-directory",
required=True,
type=str,
help="""Path to the directory containing BenchmarkResult
jsons. This is typically the output directory passed
to the benchmark runner scripts like
neuralmagic/benchmarks/run_benchmarks.py.""")

parser.add_argument(
"--bigger-is-better-output-file-path",
type=str,
required=True,
help="""An output file path, where the GHABenchmarkToolName
BiggerIsBetter metrics are to be stored.""")
parser.add_argument("--bigger-is-better-metrics-output-file-path",
required=True,
type=str,
help="""
An output file path, where the BenchmarkMetricType
BiggerIsBetter metrics are stored.
""")

parser.add_argument(
"--smaller-is-better-output-file-path",
type=str,
required=True,
help="""An output file path, where the GHABenchmarkToolName
SmallerIsBetter metrics are to be stored""")
parser.add_argument("--smaller-is-better-metrics-output-file-path",
required=True,
type=str,
help="""
An output file path, where the BenchmarkMetricType
SmallerIsBetter metrics are stored.
""")

parser.add_argument("--observation-metrics-output-file-path",
required=True,
type=str,
help="""
An output file path, where the BenchmarkMetricType
Observation metrics are stored.
""")

args = parser.parse_args()

main(Path(args.input_json_directory),
Path(args.bigger_is_better_output_file_path),
Path(args.smaller_is_better_output_file_path))
main(args)
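
A compact restatement of the new dump behaviour with made-up records and file names (the real helper, `filter_and_dump_if_non_empty`, is nested inside `main` and keyed on `BenchmarkMetricType`): when a type has no records, no file is written, which is exactly the condition the `hashFiles()` guards in nm-benchmark.yml test for.

```python
import json
from pathlib import Path

records = [
    ("BiggerIsBetter", {"name": "request_throughput", "unit": "prompts/s",
                        "value": 42.0, "extra": "{}"}),
    ("SmallerIsBetter", {"name": "median_request_latency", "unit": "ms",
                         "value": 17.3, "extra": "{}"}),
    # No "Observation" records in this made-up run.
]

def dump_if_non_empty(records, wanted_type: str, output_path: Path) -> None:
    selected = [rec for typ, rec in records if typ == wanted_type]
    if not selected:
        return  # no file written => the corresponding GHA step is skipped
    output_path.parent.mkdir(parents=True, exist_ok=True)
    output_path.write_text(json.dumps(selected, indent=4))

for typ in ("BiggerIsBetter", "SmallerIsBetter", "Observation"):
    dump_if_non_empty(records, typ, Path(f"jsons/{typ.lower()}.json"))
```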