Skip to content

Commit

Permalink
Merge branch 'master' into directives
Browse files Browse the repository at this point in the history
  • Loading branch information
isazi committed Nov 28, 2024
2 parents 760462c + 6ebf773 commit b207e06
Show file tree
Hide file tree
Showing 8 changed files with 506 additions and 78 deletions.
46 changes: 46 additions & 0 deletions examples/cuda/vector_add_tegra_observer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
#!/usr/bin/env python
"""This is the minimal example from the README"""

import json

import numpy
from kernel_tuner import tune_kernel
from kernel_tuner.observers.tegra import TegraObserver

def tune():
    """Tune the vector_add CUDA kernel while observing the Tegra core frequency.

    Builds random input vectors, defines the tunable search space for
    block_size_x, attaches a TegraObserver that records "core_freq", and
    exposes it as the user-defined metric "f".

    Returns the list of benchmark results produced by tune_kernel.
    """
    kernel_string = """
    __global__ void vector_add(float *c, float *a, float *b, int n) {
        int i = blockIdx.x * block_size_x + threadIdx.x;
        if (i<n) {
            c[i] = a[i] + b[i];
        }
    }
    """

    size = 800000

    # Input vectors, output vector, and the problem size as a 32-bit int,
    # matching the kernel's (float *c, float *a, float *b, int n) signature.
    a = numpy.random.randn(size).astype(numpy.float32)
    b = numpy.random.randn(size).astype(numpy.float32)
    c = numpy.zeros_like(b)
    n = numpy.int32(size)
    args = [c, a, b, n]

    # Search space: thread-block sizes 128, 192, ..., up to 1024.
    tune_params = {"block_size_x": [128 + 64 * i for i in range(15)]}

    # Observe the GPU core frequency on Tegra devices; expose it as metric "f".
    observer = TegraObserver(["core_freq"])
    metrics = {"f": lambda p: p["core_freq"]}

    results, env = tune_kernel(
        "vector_add",
        kernel_string,
        size,
        args,
        tune_params,
        observers=[observer],
        metrics=metrics,
    )

    print(results)

    return results


# Run the tuning example only when executed as a script, not on import.
if __name__ == "__main__":
    tune()
1 change: 1 addition & 0 deletions kernel_tuner/backends/hip.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
"bool": ctypes.c_bool,
"int8": ctypes.c_int8,
"int16": ctypes.c_int16,
"float16": ctypes.c_int16,
"int32": ctypes.c_int32,
"int64": ctypes.c_int64,
"uint8": ctypes.c_uint8,
Expand Down
16 changes: 13 additions & 3 deletions kernel_tuner/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,8 @@
from kernel_tuner.backends.nvcuda import CudaFunctions
from kernel_tuner.backends.opencl import OpenCLFunctions
from kernel_tuner.backends.compiler import CompilerFunctions
from kernel_tuner.backends.opencl import OpenCLFunctions
from kernel_tuner.backends.hip import HipFunctions
from kernel_tuner.observers.nvml import NVMLObserver
from kernel_tuner.observers.tegra import TegraObserver
from kernel_tuner.observers.observer import ContinuousObserver, OutputObserver, PrologueObserver

try:
Expand Down Expand Up @@ -307,8 +306,9 @@ def __init__(
)
self.dev = dev

# look for NVMLObserver in observers, if present, enable special tunable parameters through nvml
# look for NVMLObserver and TegraObserver in observers, if present, enable special tunable parameters through nvml/tegra
self.use_nvml = False
self.use_tegra = False
self.continuous_observers = []
self.output_observers = []
self.prologue_observers = []
Expand All @@ -317,6 +317,9 @@ def __init__(
if isinstance(obs, NVMLObserver):
self.nvml = obs.nvml
self.use_nvml = True
if isinstance(obs, TegraObserver):
self.tegra = obs.tegra
self.use_tegra = True
if hasattr(obs, "continuous_observer"):
self.continuous_observers.append(obs.continuous_observer)
if isinstance(obs, OutputObserver):
Expand Down Expand Up @@ -373,6 +376,7 @@ def benchmark_default(self, func, gpu_args, threads, grid, result):
for obs in self.benchmark_observers:
result.update(obs.get_results())


def benchmark_continuous(self, func, gpu_args, threads, grid, result, duration):
"""Benchmark continuously for at least 'duration' seconds"""
iterations = int(np.ceil(duration / (result["time"] / 1000)))
Expand All @@ -396,6 +400,7 @@ def benchmark_continuous(self, func, gpu_args, threads, grid, result, duration):
for obs in self.continuous_observers:
result.update(obs.get_results())


def set_nvml_parameters(self, instance):
"""Set the NVML parameters. Avoids setting time leaking into benchmark time."""
if self.use_nvml:
Expand All @@ -410,6 +415,11 @@ def set_nvml_parameters(self, instance):
if "nvml_mem_clock" in instance.params:
self.nvml.mem_clock = instance.params["nvml_mem_clock"]

if self.use_tegra:
if "tegra_gr_clock" in instance.params:
self.tegra.gr_clock = instance.params["tegra_gr_clock"]


def benchmark(self, func, gpu_args, instance, verbose, objective, skip_nvml_setting=False):
"""Benchmark the kernel instance."""
logging.debug("benchmark " + instance.name)
Expand Down
77 changes: 6 additions & 71 deletions kernel_tuner/observers/nvml.py
Original file line number Diff line number Diff line change
Expand Up @@ -323,7 +323,7 @@ def __init__(
save_all=False,
nvidia_smi_fallback=None,
use_locked_clocks=False,
continous_duration=1,
continuous_duration=1,
):
"""Create an NVMLObserver."""
if nvidia_smi_fallback:
Expand Down Expand Up @@ -355,7 +355,7 @@ def __init__(
if any([obs in self.needs_power for obs in observables]):
self.measure_power = True
power_observables = [obs for obs in observables if obs in self.needs_power]
self.continuous_observer = NVMLPowerObserver(power_observables, self, self.nvml, continous_duration)
self.continuous_observer = ContinuousObserver("nvml", power_observables, self, continuous_duration=continuous_duration)

# remove power observables
self.observables = [obs for obs in observables if obs not in self.needs_power]
Expand All @@ -373,6 +373,10 @@ def __init__(
self.during_obs = [obs for obs in observables if obs in ["core_freq", "mem_freq", "temperature"]]
self.iteration = {obs: [] for obs in self.during_obs}

def read_power(self):
    """Return the current GPU power draw in Watt.

    NVML's pwr_usage() reports milliwatt, so the value is scaled down by
    1000 to satisfy the ContinuousObserver read_power() contract (Watt).
    """
    milliwatt = self.nvml.pwr_usage()
    return milliwatt / 1000.0

def before_start(self):
# clear results of the observables for next measurement
self.iteration = {obs: [] for obs in self.during_obs}
Expand Down Expand Up @@ -428,75 +432,6 @@ def get_results(self):
return averaged_results


class NVMLPowerObserver(ContinuousObserver):
    """Observer that measures power using NVML and continuous benchmarking."""

    def __init__(self, observables, parent, nvml_instance, continous_duration=1):
        # NOTE(review): parameter name 'continous_duration' is misspelled
        # ('continuous'); kept as-is since callers may pass it by keyword.
        self.parent = parent
        self.nvml = nvml_instance

        # Only power-related observables make sense for this observer.
        supported = ["power_readings", "nvml_power", "nvml_energy"]
        for obs in observables:
            if obs not in supported:
                raise ValueError(f"Observable {obs} not in supported: {supported}")
        self.observables = observables

        # duration in seconds
        self.continuous_duration = continous_duration

        self.power = 0
        self.energy = 0
        self.power_readings = []
        self.t0 = 0

        # results from the last iteration-based benchmark
        self.results = None

    def before_start(self):
        # Reset accumulated measurements before a new continuous run.
        self.parent.before_start()
        self.power = 0
        self.energy = 0
        self.power_readings = []

    def after_start(self):
        self.parent.after_start()
        # Timestamps recorded in during() are relative to this start time.
        self.t0 = time.perf_counter()

    def during(self):
        self.parent.during()
        power_usage = self.nvml.pwr_usage()
        timestamp = time.perf_counter() - self.t0
        # only store the result if we get a new measurement from NVML
        if len(self.power_readings) == 0 or (
            self.power_readings[-1][1] != power_usage or timestamp - self.power_readings[-1][0] > 0.01
        ):
            self.power_readings.append([timestamp, power_usage])

    def after_finish(self):
        self.parent.after_finish()
        # safeguard in case we have no measurements, perhaps the kernel was too short to measure anything
        if not self.power_readings:
            return

        # convert to seconds from milliseconds
        execution_time = self.results["time"] / 1e3
        # Median of the readings, converted from milliwatt to Watt; energy
        # is then power (W) times kernel execution time (s).
        self.power = np.median([d[1] / 1e3 for d in self.power_readings])
        self.energy = self.power * execution_time

    def get_results(self):
        """Return the parent's results (keys prefixed with 'pwr_') plus the requested power/energy observables."""
        results = self.parent.get_results()
        keys = list(results.keys())
        for key in keys:
            results["pwr_" + key] = results.pop(key)
        if "nvml_energy" in self.observables:
            results["nvml_energy"] = self.energy
        if "nvml_power" in self.observables:
            results["nvml_power"] = self.power
        if "power_readings" in self.observables:
            results["power_readings"] = self.power_readings
        return results


# High-level Helper functions


Expand Down
75 changes: 73 additions & 2 deletions kernel_tuner/observers/observer.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from abc import ABC, abstractmethod

import time
import numpy as np

class BenchmarkObserver(ABC):
"""Base class for Benchmark Observers"""
Expand Down Expand Up @@ -44,8 +45,78 @@ class IterationObserver(BenchmarkObserver):


class ContinuousObserver(BenchmarkObserver):
    """Generic observer that measures power during continuous benchmarking.

    To support continuous benchmarking, the parent Observer must provide
    a .read_power() method, which the ContinuousObserver calls to read
    the current power draw in Watt.
    """

    def __init__(self, name, observables, parent, continuous_duration=1):
        # The "real" observer (e.g. NVMLObserver) that knows how to read power.
        self.parent = parent
        # Prefix for the power/energy observable names, e.g. "nvml_power".
        self.name = name

        supported = [self.name + "_power", self.name + "_energy", "power_readings"]
        for obs in observables:
            if obs not in supported:
                raise ValueError(f"Observable {obs} not in supported: {supported}")
        self.observables = observables

        # duration in seconds
        self.continuous_duration = continuous_duration

        self.power = 0
        self.energy = 0
        self.power_readings = []
        self.t0 = 0

        # results from the last iteration-based benchmark
        # these are set by the benchmarking function of Kernel Tuner before
        # the continuous observer is called.
        self.results = None

    def before_start(self):
        # Reset accumulated measurements before a new continuous run.
        self.parent.before_start()
        self.power = 0
        self.energy = 0
        self.power_readings = []

    def after_start(self):
        self.parent.after_start()
        # Timestamps recorded in during() are relative to this start time.
        self.t0 = time.perf_counter()

    def during(self):
        self.parent.during()
        power_usage = self.parent.read_power()
        timestamp = time.perf_counter() - self.t0
        # only store the result if we get a new measurement from the GPU
        if len(self.power_readings) == 0 or (
            self.power_readings[-1][1] != power_usage
            or timestamp - self.power_readings[-1][0] > 0.01
        ):
            self.power_readings.append([timestamp, power_usage])

    def after_finish(self):
        self.parent.after_finish()
        # safeguard in case we have no measurements, perhaps the kernel was too short to measure anything
        if not self.power_readings:
            return

        # convert to seconds from milliseconds
        execution_time = self.results["time"] / 1e3
        # Median power is already in Watt (read_power converts); times
        # execution time in seconds this yields energy in Joule.
        self.power = np.median([d[1] for d in self.power_readings])
        self.energy = self.power * execution_time

    def get_results(self):
        """Return the parent's results (keys prefixed with 'pwr_') plus the requested power/energy observables."""
        results = self.parent.get_results()
        keys = list(results.keys())
        for key in keys:
            results["pwr_" + key] = results.pop(key)
        if self.name + "_power" in self.observables:
            results[self.name + "_power"] = self.power
        if self.name + "_energy" in self.observables:
            results[self.name + "_energy"] = self.energy
        if "power_readings" in self.observables:
            results["power_readings"] = self.power_readings
        return results

class OutputObserver(BenchmarkObserver):
"""Observer that can verify or measure something about the output produced by a kernel."""
Expand Down
59 changes: 57 additions & 2 deletions kernel_tuner/observers/pmt.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import numpy as np

from kernel_tuner.observers.observer import BenchmarkObserver
from kernel_tuner.observers.observer import BenchmarkObserver, ContinuousObserver

# check if pmt is installed
try:
Expand Down Expand Up @@ -28,9 +28,25 @@ class PMTObserver(BenchmarkObserver):
:type observables: string,list/dictionary
:param use_continuous_observer:
Boolean to control whether or not to measure power/energy using
Kernel Tuner's continuous benchmarking mode. This improves measurement
accuracy when using internal power sensors, such as NVML or ROCM,
which have limited sampling frequency and might return averages
instead of instantaneous power readings. Default value: False.
:type use_continuous_observer: boolean
:param continuous_duration:
Number of seconds to measure continuously for.
:type continuous_duration: scalar
"""

def __init__(self, observable=None):
def __init__(self, observable=None, use_continuous_observer=False, continuous_duration=1):
if not pmt:
raise ImportError("could not import pmt")

Expand All @@ -54,6 +70,9 @@ def __init__(self, observable=None):
self.begin_states = [None] * len(self.pms)
self.initialize_results(self.pm_names)

if use_continuous_observer:
self.continuous_observer = PMTContinuousObserver("pmt", [], self, continuous_duration=continuous_duration)

def initialize_results(self, pm_names):
self.results = dict()
for pm_name in pm_names:
Expand Down Expand Up @@ -82,3 +101,39 @@ def get_results(self):
averages = {key: np.average(values) for key, values in self.results.items()}
self.initialize_results(self.pm_names)
return averages


class PMTContinuousObserver(ContinuousObserver):
    """ContinuousObserver variant for PMTObserver.

    Unlike the generic ContinuousObserver, it does not poll power in
    during(): before_start() and during() are deliberate no-ops, and the
    measurement comes from the parent's own begin/end readings. In
    get_results() the energy numbers are rescaled from the full continuous
    duration down to a single kernel execution.
    """

    def before_start(self):
        """ Override default method in ContinuousObserver """
        pass

    def after_start(self):
        self.parent.after_start()

    def during(self):
        """ Override default method in ContinuousObserver """
        pass

    def after_finish(self):
        self.parent.after_finish()

    def get_results(self):
        """Average the parent's measurements and rescale energy to one kernel launch."""
        # self.results is set by Kernel Tuner's benchmarking code before this
        # observer runs; "time" is the average kernel time in milliseconds.
        average_kernel_execution_time_ms = self.results["time"]

        averages = {key: np.average(values) for key, values in self.results.items()}
        # Reset the parent's accumulators for the next measurement.
        self.parent.initialize_results(self.parent.pm_names)

        # correct energy measurement, because current _energy number is collected over the entire duration
        # we estimate energy as the average power over the continuous duration times the kernel execution time
        for pm_name in self.parent.pm_names:
            energy_result_name = f"{pm_name}_energy"
            power_result_name = f"{pm_name}_power"
            averages[energy_result_name] = averages[power_result_name] * (average_kernel_execution_time_ms / 1e3)

        return averages
Loading

0 comments on commit b207e06

Please sign in to comment.