Skip to content
This repository has been archived by the owner on Mar 27, 2024. It is now read-only.

feat(nvidia_smi): collect MIG metrics (XML only) #1067

Merged
merged 3 commits into from
Jan 30, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 23 additions & 18 deletions modules/nvidia_smi/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,24 +21,29 @@ All metrics have "nvidia_smi." prefix.

Labels per scope:

- gpu: product_name.

| Metric | Scope | Dimensions | Units | XML | CSV |
|--------------------------------|:-----:|:------------------------:|:-------:|:---:|:---:|
| gpu_pcie_bandwidth_usage | gpu | rx, tx | B/s | yes | no |
| gpu_pcie_bandwidth_utilization | gpu | rx, tx | % | yes | no |
| gpu_fan_speed_perc | gpu | fan_speed | % | yes | yes |
| gpu_utilization | gpu | gpu | % | yes | yes |
| gpu_memory_utilization | gpu | memory | % | yes | yes |
| gpu_decoder_utilization | gpu | decoder | % | yes | no |
| gpu_encoder_utilization | gpu | encoder | % | yes | no |
| gpu_frame_buffer_memory_usage | gpu | free, used, reserved | B | yes | yes |
| gpu_bar1_memory_usage | gpu | free, used | B | yes | no |
| gpu_temperature | gpu | temperature | Celsius | yes | yes |
| gpu_voltage | gpu | voltage | V | yes | no |
| gpu_clock_freq | gpu | graphics, video, sm, mem | MHz | yes | yes |
| gpu_power_draw | gpu | power_draw | Watts | yes | yes |
| gpu_performance_state | gpu | P0-P15 | state | yes | yes |
- gpu: uuid, product_name.
- mig: gpu_uuid, gpu_product_name, gpu_instance_id.

| Metric | Scope | Dimensions | Units | XML | CSV |
|-----------------------------------|:-----:|:------------------------:|:-------:|:---:|:---:|
| gpu_pcie_bandwidth_usage | gpu | rx, tx | B/s | yes | no |
| gpu_pcie_bandwidth_utilization | gpu | rx, tx | % | yes | no |
| gpu_fan_speed_perc | gpu | fan_speed | % | yes | yes |
| gpu_utilization | gpu | gpu | % | yes | yes |
| gpu_memory_utilization | gpu | memory | % | yes | yes |
| gpu_decoder_utilization | gpu | decoder | % | yes | no |
| gpu_encoder_utilization | gpu | encoder | % | yes | no |
| gpu_frame_buffer_memory_usage | gpu | free, used, reserved | B | yes | yes |
| gpu_bar1_memory_usage | gpu | free, used | B | yes | no |
| gpu_temperature | gpu | temperature | Celsius | yes | yes |
| gpu_voltage | gpu | voltage | V | yes | no |
| gpu_clock_freq | gpu | graphics, video, sm, mem | MHz | yes | yes |
| gpu_power_draw | gpu | power_draw | Watts | yes | yes |
| gpu_performance_state | gpu | P0-P15 | state | yes | yes |
| gpu_mig_mode_current_status | gpu | enabled, disabled | status | yes | no |
| gpu_mig_devices_count | gpu | mig | devices | yes | no |
| gpu_mig_frame_buffer_memory_usage | mig | free, used, reserved | B | yes | no |
| gpu_mig_bar1_memory_usage | mig | free, used | B | yes | no |

## Configuration

Expand Down
111 changes: 109 additions & 2 deletions modules/nvidia_smi/charts.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,12 @@ const (
prioGPUMemUtilization
prioGPUDecoderUtilization
prioGPUEncoderUtilization
prioGPUMIGModeStatus
prioGPUMIGDevicesCount
prioGPUFBMemoryUsage
prioGPUMIGFBMemoryUsage
prioGPUBAR1MemoryUsage
prioGPUMIGBAR1MemoryUsage
prioGPUTemperatureChart
prioGPUVoltageChart
prioGPUClockFreq
Expand All @@ -35,6 +39,8 @@ var (
gpuMemUtilizationChartTmpl.Copy(),
gpuDecoderUtilizationChartTmpl.Copy(),
gpuEncoderUtilizationChartTmpl.Copy(),
gpuMIGModeCurrentStatusChartTmpl.Copy(),
gpuMIGDevicesCountChartTmpl.Copy(),
gpuFrameBufferMemoryUsageChartTmpl.Copy(),
gpuBAR1MemoryUsageChartTmpl.Copy(),
gpuVoltageChartTmpl.Copy(),
Expand All @@ -43,6 +49,10 @@ var (
gpuPowerDrawChartTmpl.Copy(),
gpuPerformanceStateChartTmpl.Copy(),
}
// migDeviceXMLCharts is the set of chart templates created for a single MIG
// device. addMIGDeviceXMLCharts copies this set and fills in the ID
// placeholders with the GPU instance ID and the GPU UUID.
migDeviceXMLCharts = module.Charts{
migDeviceFrameBufferMemoryUsageChartTmpl.Copy(),
migDeviceBAR1MemoryUsageChartTmpl.Copy(),
}
gpuCSVCharts = module.Charts{
gpuFanSpeedPercChartTmpl.Copy(),
gpuUtilizationChartTmpl.Copy(),
Expand Down Expand Up @@ -136,6 +146,29 @@ var (
{ID: "gpu_%s_encoder_utilization", Name: "encoder"},
},
}
// gpuMIGModeCurrentStatusChartTmpl is the per-GPU chart template for the
// current MIG mode, with one dimension per state ("enabled", "disabled").
// The %s placeholder in the chart ID is filled with the lowercased GPU UUID by
// addGPUXMLCharts, which also drops this chart when the GPU does not report a
// valid current MIG mode value.
gpuMIGModeCurrentStatusChartTmpl = module.Chart{
ID: "gpu_%s_mig_mode_current_status",
Title: "MIG current mode",
Units: "status",
Fam: "mig",
Ctx: "nvidia_smi.gpu_mig_mode_current_status",
Priority: prioGPUMIGModeStatus,
Dims: module.Dims{
// NOTE(review): dimension IDs use "mig_current_mode" while the chart ID
// uses "mig_mode_current"; presumably they mirror the metric keys emitted
// by the XML collector — confirm before renaming either.
{ID: "gpu_%s_mig_current_mode_enabled", Name: "enabled"},
{ID: "gpu_%s_mig_current_mode_disabled", Name: "disabled"},
},
}
// gpuMIGDevicesCountChartTmpl is the per-GPU chart template for the number of
// MIG devices, exposed as a single "mig" dimension.
// The %s placeholder in the chart ID is filled with the lowercased GPU UUID by
// addGPUXMLCharts, which removes this chart together with the MIG mode chart
// when the GPU does not report a valid current MIG mode value.
gpuMIGDevicesCountChartTmpl = module.Chart{
ID: "gpu_%s_mig_devices_count",
Title: "MIG devices",
Units: "devices",
Fam: "mig",
Ctx: "nvidia_smi.gpu_mig_devices_count",
Priority: prioGPUMIGDevicesCount,
Dims: module.Dims{
{ID: "gpu_%s_mig_devices_count", Name: "mig"},
},
}
gpuFrameBufferMemoryUsageChartTmpl = module.Chart{
ID: "gpu_%s_frame_buffer_memory_usage",
Title: "Frame buffer memory usage",
Expand Down Expand Up @@ -240,6 +273,23 @@ var (

func (nv *NvidiaSMI) addGPUXMLCharts(gpu xmlGPUInfo) {
charts := gpuXMLCharts.Copy()

if !isValidValue(gpu.Utilization.GpuUtil) {
_ = charts.Remove(gpuUtilizationChartTmpl.ID)
}
if !isValidValue(gpu.Utilization.MemoryUtil) {
_ = charts.Remove(gpuMemUtilizationChartTmpl.ID)
}
if !isValidValue(gpu.Utilization.DecoderUtil) {
_ = charts.Remove(gpuDecoderUtilizationChartTmpl.ID)
}
if !isValidValue(gpu.Utilization.EncoderUtil) {
_ = charts.Remove(gpuEncoderUtilizationChartTmpl.ID)
}
if !isValidValue(gpu.MIGMode.CurrentMIG) {
_ = charts.Remove(gpuMIGModeCurrentStatusChartTmpl.ID)
_ = charts.Remove(gpuMIGDevicesCountChartTmpl.ID)
}
if !isValidValue(gpu.FanSpeed) {
_ = charts.Remove(gpuFanSpeedPercChartTmpl.ID)
}
Expand All @@ -254,6 +304,7 @@ func (nv *NvidiaSMI) addGPUXMLCharts(gpu xmlGPUInfo) {
c.ID = fmt.Sprintf(c.ID, strings.ToLower(gpu.UUID))
c.Labels = []module.Label{
// csv output has no 'product_brand'
{Key: "uuid", Value: gpu.UUID},
{Key: "product_name", Value: gpu.ProductName},
}
for _, d := range c.Dims {
Expand All @@ -269,6 +320,12 @@ func (nv *NvidiaSMI) addGPUXMLCharts(gpu xmlGPUInfo) {
func (nv *NvidiaSMI) addGPUCSVCharts(gpu csvGPUInfo) {
charts := gpuCSVCharts.Copy()

if !isValidValue(gpu.utilizationGPU) {
_ = charts.Remove(gpuUtilizationChartTmpl.ID)
}
if !isValidValue(gpu.utilizationMemory) {
_ = charts.Remove(gpuMemUtilizationChartTmpl.ID)
}
if !isValidValue(gpu.fanSpeed) {
_ = charts.Remove(gpuFanSpeedPercChartTmpl.ID)
}
Expand All @@ -291,8 +348,58 @@ func (nv *NvidiaSMI) addGPUCSVCharts(gpu csvGPUInfo) {
}
}

func (nv *NvidiaSMI) removeGPUCharts(uuid string) {
prefix := "gpu_" + strings.ToLower(uuid)
var (
// migDeviceFrameBufferMemoryUsageChartTmpl is the per-MIG-device chart template
// for frame buffer memory usage (stacked: free, used, reserved), in bytes.
// Every ID carries two %s placeholders, filled by addMIGDeviceXMLCharts in this
// order: GPU instance ID, then GPU UUID. The chart ID receives lowercased
// values; the dimension IDs receive the values as reported.
migDeviceFrameBufferMemoryUsageChartTmpl = module.Chart{
ID: "mig_instance_%s_gpu_%s_frame_buffer_memory_usage",
Title: "MIG Frame buffer memory usage",
Units: "B",
Fam: "fb mem usage",
Ctx: "nvidia_smi.gpu_mig_frame_buffer_memory_usage",
Type: module.Stacked,
Priority: prioGPUMIGFBMemoryUsage,
Dims: module.Dims{
{ID: "mig_instance_%s_gpu_%s_frame_buffer_memory_usage_free", Name: "free"},
{ID: "mig_instance_%s_gpu_%s_frame_buffer_memory_usage_used", Name: "used"},
{ID: "mig_instance_%s_gpu_%s_frame_buffer_memory_usage_reserved", Name: "reserved"},
},
}
// migDeviceBAR1MemoryUsageChartTmpl is the per-MIG-device chart template for
// BAR1 memory usage (stacked: free, used), in bytes.
// Every ID carries two %s placeholders, filled by addMIGDeviceXMLCharts in this
// order: GPU instance ID, then GPU UUID. The chart ID receives lowercased
// values; the dimension IDs receive the values as reported.
migDeviceBAR1MemoryUsageChartTmpl = module.Chart{
ID: "mig_instance_%s_gpu_%s_bar1_memory_usage",
Title: "MIG BAR1 memory usage",
Units: "B",
Fam: "bar1 mem usage",
Ctx: "nvidia_smi.gpu_mig_bar1_memory_usage",
Type: module.Stacked,
Priority: prioGPUMIGBAR1MemoryUsage,
Dims: module.Dims{
{ID: "mig_instance_%s_gpu_%s_bar1_memory_usage_free", Name: "free"},
{ID: "mig_instance_%s_gpu_%s_bar1_memory_usage_used", Name: "used"},
},
}
)

// addMIGDeviceXMLCharts instantiates the MIG device chart templates for one
// MIG device of the given GPU and registers them with the module.
//
// Chart IDs are built from the lowercased GPU instance ID and GPU UUID, while
// dimension IDs are built from the same two values exactly as reported. Each
// chart is labelled with the GPU UUID, the GPU product name and the GPU
// instance ID. A failure to register the charts is logged as a warning and
// not returned.
func (nv *NvidiaSMI) addMIGDeviceXMLCharts(gpu xmlGPUInfo, mig xmlMIGDeviceInfo) {
	instanceID, gpuUUID := mig.GPUInstanceID, gpu.UUID
	// Only chart IDs are lowercased; dimension IDs keep the reported casing.
	lowerInstanceID, lowerUUID := strings.ToLower(instanceID), strings.ToLower(gpuUUID)

	migCharts := migDeviceXMLCharts.Copy()

	for _, chart := range *migCharts {
		chart.ID = fmt.Sprintf(chart.ID, lowerInstanceID, lowerUUID)
		chart.Labels = []module.Label{
			{Key: "gpu_uuid", Value: gpuUUID},
			{Key: "gpu_product_name", Value: gpu.ProductName},
			{Key: "gpu_instance_id", Value: instanceID},
		}
		for _, dim := range chart.Dims {
			dim.ID = fmt.Sprintf(dim.ID, instanceID, gpuUUID)
		}
	}

	err := nv.Charts().Add(*migCharts...)
	if err != nil {
		nv.Warning(err)
	}
}

func (nv *NvidiaSMI) removeCharts(prefix string) {
prefix = strings.ToLower(prefix)

for _, c := range *nv.Charts() {
if strings.HasPrefix(c.ID, prefix) {
Expand Down
7 changes: 7 additions & 0 deletions modules/nvidia_smi/collect.go
Original file line number Diff line number Diff line change
Expand Up @@ -63,3 +63,10 @@ func removeUnits(s string) string {
}
return s
}

// boolToInt converts a boolean into a metric value: 1 for true, 0 for false.
func boolToInt(v bool) int64 {
	var n int64 // zero value covers the false case
	if v {
		n = 1
	}
	return n
}
18 changes: 9 additions & 9 deletions modules/nvidia_smi/collect_csv.go
Original file line number Diff line number Diff line change
Expand Up @@ -137,15 +137,15 @@ func (nv *NvidiaSMI) collectGPUInfoCSV(mx map[string]int64) error {
continue
}

seen[gpu.uuid] = true
px := "gpu_" + gpu.uuid + "_"

seen[px] = true

if !nv.gpus[gpu.uuid] {
nv.gpus[gpu.uuid] = true
if !nv.gpus[px] {
nv.gpus[px] = true
nv.addGPUCSVCharts(gpu)
}

px := "gpu_" + gpu.uuid + "_"

addMetric(mx, px+"fan_speed_perc", gpu.fanSpeed, 0)
addMetric(mx, px+"gpu_utilization", gpu.utilizationGPU, 0)
addMetric(mx, px+"mem_utilization", gpu.utilizationMemory, 0)
Expand All @@ -167,10 +167,10 @@ func (nv *NvidiaSMI) collectGPUInfoCSV(mx map[string]int64) error {
}
}

for uuid := range nv.gpus {
if !seen[uuid] {
delete(nv.gpus, uuid)
nv.removeGPUCharts(uuid)
for px := range nv.gpus {
if !seen[px] {
delete(nv.gpus, px)
nv.removeCharts(px)
}
}

Expand Down
Loading