Skip to content

Commit

Permalink
Use instant power draw explicitly when available (#65)
Browse files: browse the repository at this point in the history
  • Loading branch information
jaywonchung authored May 7, 2024
1 parent 370e640 commit 5048322
Show file tree
Hide file tree
Showing 3 changed files with 30 additions and 15 deletions.
26 changes: 16 additions & 10 deletions zeus/device/gpu.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -268,8 +268,8 @@ def resetGpuLockedClocks(self) -> None:
pass

@abc.abstractmethod
def getPowerUsage(self) -> int:
"""Return the current power usage of the GPU. Units: mW."""
def getInstantPowerUsage(self) -> int:
"""Returns the current power usage of the GPU. Units: mW."""
pass

@abc.abstractmethod
Expand Down Expand Up @@ -411,9 +411,14 @@ def resetGpuLockedClocks(self) -> None:
pynvml.nvmlDeviceResetGpuLockedClocks(self.handle)

@_handle_nvml_errors
def getPowerUsage(self) -> int:
"""Returns the power usage of the specified GPU. Units: mW."""
return pynvml.nvmlDeviceGetPowerUsage(self.handle)
def getInstantPowerUsage(self) -> int:
"""Returns the current power usage of the specified GPU. Units: mW."""
metric = pynvml.nvmlDeviceGetFieldValues(
self.handle, [pynvml.NVML_FI_DEV_POWER_INSTANT]
)[0]
if (ret := metric.nvmlReturn) != pynvml.NVML_SUCCESS:
raise pynvml.NVMLError(ret)
return metric.value.siVal

@_handle_nvml_errors
def supportsGetTotalEnergyConsumption(self) -> bool:
Expand Down Expand Up @@ -602,11 +607,12 @@ def resetGpuLockedClocks(self) -> None:
) # expects MHz

@_handle_amdsmi_errors
def getPowerUsage(self) -> int:
"""Returns the power usage of the specified GPU. Units: mW."""
def getInstantPowerUsage(self) -> int:
"""Returns the current power usage of the specified GPU. Units: mW."""
# returns in W, convert to mW
return int(
amdsmi.amdsmi_get_power_info(self.handle)["average_socket_power"] * 1000
) # returns in W, convert to mW
)

@_handle_amdsmi_errors
def supportsGetTotalEnergyConsumption(self) -> bool:
Expand Down Expand Up @@ -723,9 +729,9 @@ def resetGpuLockedClocks(self, index: int) -> None:
"""Resets the GPU locked clocks of the specified GPU to their default values."""
self.gpus[index].resetGpuLockedClocks()

def getPowerUsage(self, index: int) -> int:
def getInstantPowerUsage(self, index: int) -> int:
"""Returns the power usage of the specified GPU. Units: mW."""
return self.gpus[index].getPowerUsage()
return self.gpus[index].getInstantPowerUsage()

def supportsGetTotalEnergyConsumption(self, index: int) -> bool:
"""Returns True if the specified GPU supports retrieving the total energy consumption."""
Expand Down
4 changes: 3 additions & 1 deletion zeus/monitor/energy.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -172,7 +172,9 @@ def __init__(
def _get_instant_power(self) -> tuple[dict[int, float], float]:
"""Measure the power consumption of all GPUs at the current time."""
power_measurement_start_time: float = time()
power = {i: self.gpus.getPowerUsage(i) / 1000.0 for i in self.gpu_indices}
power = {
i: self.gpus.getInstantPowerUsage(i) / 1000.0 for i in self.gpu_indices
}
power_measurement_time = time() - power_measurement_start_time
return power, power_measurement_time

Expand Down
15 changes: 11 additions & 4 deletions zeus/monitor/power.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -83,10 +83,11 @@ def _infer_counter_update_period_single(gpu_index: int) -> float:
for i in range(len(time_power_samples)):
time_power_samples[i] = (
time(),
gpus.getPowerUsage(gpu_index),
gpus.getInstantPowerUsage(gpu_index),
)

# Find the timestamps when the power readings changed.
time_power_samples = time_power_samples[10:]
changed_times = []
prev_power = time_power_samples[0][1]
for t, p in time_power_samples:
Expand All @@ -95,7 +96,12 @@ def _infer_counter_update_period_single(gpu_index: int) -> float:
prev_power = p

# Compute the minimum time difference between power change timestamps.
return min(time2 - time1 for time1, time2 in zip(changed_times, changed_times[1:]))
intervals = [
time2 - time1 for time1, time2 in zip(changed_times, changed_times[1:])
]
if len(intervals) == 0:
return 0.1
return min(intervals)


class PowerMonitor:
Expand All @@ -120,7 +126,8 @@ def __init__(
"""Initialize the power monitor.
Initialization should not be done in global scope due to python's protection.
Refer to the "Safe importing of main module" section in https://docs.python.org/3/library/multiprocessing.html for more detail.
Refer to the "Safe importing of main module" section in
https://docs.python.org/3/library/multiprocessing.html for more details.
Args:
gpu_indices: Indices of the GPUs to monitor. If None, monitor all GPUs.
Expand Down Expand Up @@ -257,7 +264,7 @@ def _polling_process(
power: list[float] = []
now = time()
for index in gpu_indices:
power.append(gpus.getPowerUsage(index))
power.append(gpus.getInstantPowerUsage(index))
power_str = ",".join(map(lambda p: str(p / 1000), power))
power_f.write(f"{now},{power_str}\n")
if (sleep_time := update_period - (time() - now)) > 0:
Expand Down

0 comments on commit 5048322

Please sign in to comment.