Skip to content
This repository has been archived by the owner on Mar 27, 2024. It is now read-only.

feat(nvidia-smi): collect Voltage (XML only) #1048

Merged
merged 1 commit into from
Jan 4, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions modules/nvidia_smi/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ Labels per scope:
| gpu_frame_buffer_memory_usage | gpu | free, used, reserved | B | yes | yes |
| gpu_bar1_memory_usage | gpu | free, used | B | yes | no |
| gpu_temperature | gpu | temperature | Celsius | yes | yes |
| gpu_voltage | gpu | voltage | V | yes | no |
| gpu_clock_freq | gpu | graphics, video, sm, mem | MHz | yes | yes |
| gpu_power_draw | gpu | power_draw | Watts | yes | yes |
| gpu_performance_state | gpu | P0-P15 | state | yes | yes |
Expand Down
16 changes: 16 additions & 0 deletions modules/nvidia_smi/charts.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ const (
prioGPUFBMemoryUsage
prioGPUBAR1MemoryUsage
prioGPUTemperatureChart
prioGPUVoltageChart
prioGPUClockFreq
prioGPUPowerDraw
prioGPUPerformanceState
Expand All @@ -34,6 +35,7 @@ var (
gpuEncoderUtilizationChartTmpl.Copy(),
gpuFrameBufferMemoryUsageChartTmpl.Copy(),
gpuBAR1MemoryUsageChartTmpl.Copy(),
gpuVoltageChartTmpl.Copy(),
gpuTemperatureChartTmpl.Copy(),
gpuClockFreqChartTmpl.Copy(),
gpuPowerDrawChartTmpl.Copy(),
Expand Down Expand Up @@ -158,6 +160,17 @@ var (
{ID: "gpu_%s_temperature", Name: "temperature"},
},
}
// gpuVoltageChartTmpl is the chart template for the per-GPU voltage chart
// (context "nvidia_smi.gpu_voltage").
//
// The "%s" placeholder in the chart ID is filled in by addGPUXMLCharts with
// the lower-cased GPU UUID. addGPUXMLCharts also removes this chart when the
// XML report does not carry a valid graphics_volt value, so the chart only
// exists for GPUs that report voltage.
// NOTE(review): the dimension ID carries the same "%s" placeholder; presumably
// it is expanded alongside the chart ID — confirm in addGPUXMLCharts.
gpuVoltageChartTmpl = module.Chart{
ID: "gpu_%s_voltage",
Title: "Voltage",
Units: "V",
Fam: "voltage",
Ctx: "nvidia_smi.gpu_voltage",
Priority: prioGPUVoltageChart,
Dims: module.Dims{
// The collected value is divided by 1000 for display, so the chart is
// rendered in the volts declared in Units.
{ID: "gpu_%s_voltage", Name: "voltage", Div: 1000}, // mV => V
},
}
gpuClockFreqChartTmpl = module.Chart{
ID: "gpu_%s_clock_freq",
Title: "Clock current frequency",
Expand Down Expand Up @@ -219,6 +232,9 @@ func (nv *NvidiaSMI) addGPUXMLCharts(gpu xmlGPUInfo) {
if !isValidValue(gpu.PowerReadings.PowerDraw) {
_ = charts.Remove(gpuPowerDrawChartTmpl.ID)
}
if !isValidValue(gpu.Voltage.GraphicsVolt) {
_ = charts.Remove(gpuVoltageChartTmpl.ID)
}

for _, c := range *charts {
c.ID = fmt.Sprintf(c.ID, strings.ToLower(gpu.UUID))
Expand Down
4 changes: 4 additions & 0 deletions modules/nvidia_smi/collect_xml.go
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ func (nv *NvidiaSMI) collectGPUInfoXML(mx map[string]int64) error {
addMetric(mx, px+"sm_clock", gpu.Clocks.SmClock, 0)
addMetric(mx, px+"mem_clock", gpu.Clocks.MemClock, 0)
addMetric(mx, px+"power_draw", gpu.PowerReadings.PowerDraw, 0)
addMetric(mx, px+"voltage", gpu.Voltage.GraphicsVolt, 0)
for i := 0; i < 16; i++ {
if s := "P" + strconv.Itoa(i); gpu.PerformanceState == s {
mx[px+"performance_state_"+s] = 1
Expand Down Expand Up @@ -130,6 +131,9 @@ type (
MinPowerLimit string `xml:"min_power_limit"`
MaxPowerLimit string `xml:"max_power_limit"`
} `xml:"power_readings"`
Voltage struct {
GraphicsVolt string `xml:"graphics_volt"`
} `xml:"voltage"`
Processes struct {
ProcessInfo []struct {
PID string `xml:"pid"`
Expand Down
1 change: 1 addition & 0 deletions modules/nvidia_smi/nvidia_smi_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,7 @@ func TestNvidiaSMI_Collect(t *testing.T) {
"gpu_GPU-473d8d0f-d462-185c-6b36-6fc23e23e571_sm_clock": 210,
"gpu_GPU-473d8d0f-d462-185c-6b36-6fc23e23e571_temperature": 45,
"gpu_GPU-473d8d0f-d462-185c-6b36-6fc23e23e571_video_clock": 555,
"gpu_GPU-473d8d0f-d462-185c-6b36-6fc23e23e571_voltage": 631,
}

assert.Equal(t, expected, mx)
Expand Down