diff --git a/modules/nvidia_smi/README.md b/modules/nvidia_smi/README.md index cc2098ecd..0f697af1d 100644 --- a/modules/nvidia_smi/README.md +++ b/modules/nvidia_smi/README.md @@ -21,24 +21,29 @@ All metrics have "nvidia_smi." prefix. Labels per scope: -- gpu: product_name. - -| Metric | Scope | Dimensions | Units | XML | CSV | -|--------------------------------|:-----:|:------------------------:|:-------:|:---:|:---:| -| gpu_pcie_bandwidth_usage | gpu | rx, tx | B/s | yes | no | -| gpu_pcie_bandwidth_utilization | gpu | rx, tx | % | yes | no | -| gpu_fan_speed_perc | gpu | fan_speed | % | yes | yes | -| gpu_utilization | gpu | gpu | % | yes | yes | -| gpu_memory_utilization | gpu | memory | % | yes | yes | -| gpu_decoder_utilization | gpu | decoder | % | yes | no | -| gpu_encoder_utilization | gpu | encoder | % | yes | no | -| gpu_frame_buffer_memory_usage | gpu | free, used, reserved | B | yes | yes | -| gpu_bar1_memory_usage | gpu | free, used | B | yes | no | -| gpu_temperature | gpu | temperature | Celsius | yes | yes | -| gpu_voltage | gpu | voltage | V | yes | no | -| gpu_clock_freq | gpu | graphics, video, sm, mem | MHz | yes | yes | -| gpu_power_draw | gpu | power_draw | Watts | yes | yes | -| gpu_performance_state | gpu | P0-P15 | state | yes | yes | +- gpu: uuid, product_name. +- mig: gpu_uuid, gpu_product_name, gpu_instance_id + +| Metric | Scope | Dimensions | Units | XML | CSV | +|-----------------------------------|:-----:|:------------------------:|:-------:|:---:|:---:| +| gpu_pcie_bandwidth_usage | gpu | rx, tx | B/s | yes | no | +| gpu_pcie_bandwidth_utilization | gpu | rx, tx | % | yes | no | +| gpu_fan_speed_perc | gpu | fan_speed | % | yes | yes | +| gpu_utilization | gpu | gpu | % | yes | yes | +| gpu_memory_utilization | gpu | memory | % | yes | yes | +| gpu_decoder_utilization | gpu | decoder | % | yes | no | +| gpu_encoder_utilization | gpu | encoder | % | yes | no | +| gpu_frame_buffer_memory_usage | gpu | free, used, reserved | B | yes | yes | +| gpu_bar1_memory_usage | gpu | free, used | B | yes | no | +| gpu_temperature | gpu | temperature | Celsius | yes | yes | +| gpu_voltage | gpu | voltage | V | yes | no | +| gpu_clock_freq | gpu | graphics, video, sm, mem | MHz | yes | yes | +| gpu_power_draw | gpu | power_draw | Watts | yes | yes | +| gpu_performance_state | gpu | P0-P15 | state | yes | yes | +| gpu_mig_mode_current_status | gpu | enabled, disabled | status | yes | no | +| gpu_mig_devices_count | gpu | mig | devices | yes | no | +| gpu_mig_frame_buffer_memory_usage | mig | free, used, reserved | B | yes | no | +| gpu_mig_bar1_memory_usage | mig | free, used | B | yes | no | ## Configuration diff --git a/modules/nvidia_smi/charts.go b/modules/nvidia_smi/charts.go index 3d6f4641d..c6ab80e57 100644 --- a/modules/nvidia_smi/charts.go +++ b/modules/nvidia_smi/charts.go @@ -17,8 +17,12 @@ const ( prioGPUMemUtilization prioGPUDecoderUtilization prioGPUEncoderUtilization + prioGPUMIGModeStatus + prioGPUMIGDevicesCount prioGPUFBMemoryUsage + prioGPUMIGFBMemoryUsage prioGPUBAR1MemoryUsage + prioGPUMIGBAR1MemoryUsage prioGPUTemperatureChart prioGPUVoltageChart prioGPUClockFreq @@ -35,6 +39,8 @@ var ( gpuMemUtilizationChartTmpl.Copy(), gpuDecoderUtilizationChartTmpl.Copy(), gpuEncoderUtilizationChartTmpl.Copy(), + gpuMIGModeCurrentStatusChartTmpl.Copy(), + gpuMIGDevicesCountChartTmpl.Copy(), gpuFrameBufferMemoryUsageChartTmpl.Copy(), gpuBAR1MemoryUsageChartTmpl.Copy(), gpuVoltageChartTmpl.Copy(), @@ -43,6 +49,10 @@ var ( gpuPowerDrawChartTmpl.Copy(), gpuPerformanceStateChartTmpl.Copy(), } + migDeviceXMLCharts = module.Charts{ + migDeviceFrameBufferMemoryUsageChartTmpl.Copy(), + migDeviceBAR1MemoryUsageChartTmpl.Copy(), + } gpuCSVCharts = module.Charts{ gpuFanSpeedPercChartTmpl.Copy(), gpuUtilizationChartTmpl.Copy(), @@ -136,6 +146,29 @@ var ( {ID: "gpu_%s_encoder_utilization", Name: "encoder"}, }, } + gpuMIGModeCurrentStatusChartTmpl = module.Chart{ + ID: "gpu_%s_mig_mode_current_status", + Title: "MIG current mode", + Units: "status", + Fam: "mig", + Ctx: "nvidia_smi.gpu_mig_mode_current_status", + Priority: prioGPUMIGModeStatus, + Dims: module.Dims{ + {ID: "gpu_%s_mig_current_mode_enabled", Name: "enabled"}, + {ID: "gpu_%s_mig_current_mode_disabled", Name: "disabled"}, + }, + } + gpuMIGDevicesCountChartTmpl = module.Chart{ + ID: "gpu_%s_mig_devices_count", + Title: "MIG devices", + Units: "devices", + Fam: "mig", + Ctx: "nvidia_smi.gpu_mig_devices_count", + Priority: prioGPUMIGDevicesCount, + Dims: module.Dims{ + {ID: "gpu_%s_mig_devices_count", Name: "mig"}, + }, + } gpuFrameBufferMemoryUsageChartTmpl = module.Chart{ ID: "gpu_%s_frame_buffer_memory_usage", Title: "Frame buffer memory usage", @@ -240,6 +273,23 @@ var ( func (nv *NvidiaSMI) addGPUXMLCharts(gpu xmlGPUInfo) { charts := gpuXMLCharts.Copy() + + if !isValidValue(gpu.Utilization.GpuUtil) { + _ = charts.Remove(gpuUtilizationChartTmpl.ID) + } + if !isValidValue(gpu.Utilization.MemoryUtil) { + _ = charts.Remove(gpuMemUtilizationChartTmpl.ID) + } + if !isValidValue(gpu.Utilization.DecoderUtil) { + _ = charts.Remove(gpuDecoderUtilizationChartTmpl.ID) + } + if !isValidValue(gpu.Utilization.EncoderUtil) { + _ = charts.Remove(gpuEncoderUtilizationChartTmpl.ID) + } + if !isValidValue(gpu.MIGMode.CurrentMIG) { + _ = charts.Remove(gpuMIGModeCurrentStatusChartTmpl.ID) + _ = charts.Remove(gpuMIGDevicesCountChartTmpl.ID) + } if !isValidValue(gpu.FanSpeed) { _ = charts.Remove(gpuFanSpeedPercChartTmpl.ID) } @@ -254,6 +304,7 @@ func (nv *NvidiaSMI) addGPUXMLCharts(gpu xmlGPUInfo) { c.ID = fmt.Sprintf(c.ID, strings.ToLower(gpu.UUID)) c.Labels = []module.Label{ // csv output has no 'product_brand' + {Key: "uuid", Value: gpu.UUID}, {Key: "product_name", Value: gpu.ProductName}, } for _, d := range c.Dims { @@ -269,6 +320,12 @@ func (nv *NvidiaSMI) addGPUXMLCharts(gpu xmlGPUInfo) { func (nv *NvidiaSMI) addGPUCSVCharts(gpu csvGPUInfo) { charts := gpuCSVCharts.Copy() + if !isValidValue(gpu.utilizationGPU) { + _ = charts.Remove(gpuUtilizationChartTmpl.ID) + } + if !isValidValue(gpu.utilizationMemory) { + _ = charts.Remove(gpuMemUtilizationChartTmpl.ID) + } if !isValidValue(gpu.fanSpeed) { _ = charts.Remove(gpuFanSpeedPercChartTmpl.ID) } @@ -291,8 +348,58 @@ func (nv *NvidiaSMI) addGPUCSVCharts(gpu csvGPUInfo) { } } -func (nv *NvidiaSMI) removeGPUCharts(uuid string) { - prefix := "gpu_" + strings.ToLower(uuid) +var ( + migDeviceFrameBufferMemoryUsageChartTmpl = module.Chart{ + ID: "mig_instance_%s_gpu_%s_frame_buffer_memory_usage", + Title: "MIG Frame buffer memory usage", + Units: "B", + Fam: "fb mem usage", + Ctx: "nvidia_smi.gpu_mig_frame_buffer_memory_usage", + Type: module.Stacked, + Priority: prioGPUMIGFBMemoryUsage, + Dims: module.Dims{ + {ID: "mig_instance_%s_gpu_%s_frame_buffer_memory_usage_free", Name: "free"}, + {ID: "mig_instance_%s_gpu_%s_frame_buffer_memory_usage_used", Name: "used"}, + {ID: "mig_instance_%s_gpu_%s_frame_buffer_memory_usage_reserved", Name: "reserved"}, + }, + } + migDeviceBAR1MemoryUsageChartTmpl = module.Chart{ + ID: "mig_instance_%s_gpu_%s_bar1_memory_usage", + Title: "MIG BAR1 memory usage", + Units: "B", + Fam: "bar1 mem usage", + Ctx: "nvidia_smi.gpu_mig_bar1_memory_usage", + Type: module.Stacked, + Priority: prioGPUMIGBAR1MemoryUsage, + Dims: module.Dims{ + {ID: "mig_instance_%s_gpu_%s_bar1_memory_usage_free", Name: "free"}, + {ID: "mig_instance_%s_gpu_%s_bar1_memory_usage_used", Name: "used"}, + }, + } +) + +func (nv *NvidiaSMI) addMIGDeviceXMLCharts(gpu xmlGPUInfo, mig xmlMIGDeviceInfo) { + charts := migDeviceXMLCharts.Copy() + + for _, c := range *charts { + c.ID = fmt.Sprintf(c.ID, strings.ToLower(mig.GPUInstanceID), strings.ToLower(gpu.UUID)) + c.Labels = []module.Label{ + {Key: "gpu_uuid", Value: gpu.UUID}, + {Key: "gpu_product_name", Value: gpu.ProductName}, + {Key: "gpu_instance_id", Value: mig.GPUInstanceID}, + } + for _, d := range c.Dims { + d.ID = fmt.Sprintf(d.ID, mig.GPUInstanceID, gpu.UUID) + } + } + + if err := nv.Charts().Add(*charts...); err != nil { + nv.Warning(err) + } +} + +func (nv *NvidiaSMI) removeCharts(prefix string) { + prefix = strings.ToLower(prefix) for _, c := range *nv.Charts() { if strings.HasPrefix(c.ID, prefix) { diff --git a/modules/nvidia_smi/collect.go b/modules/nvidia_smi/collect.go index 2353f3c7e..0830b54a3 100644 --- a/modules/nvidia_smi/collect.go +++ b/modules/nvidia_smi/collect.go @@ -63,3 +63,10 @@ func removeUnits(s string) string { } return s } + +func boolToInt(v bool) int64 { + if v { + return 1 + } + return 0 +} diff --git a/modules/nvidia_smi/collect_csv.go b/modules/nvidia_smi/collect_csv.go index bcaf8580e..2584aaffe 100644 --- a/modules/nvidia_smi/collect_csv.go +++ b/modules/nvidia_smi/collect_csv.go @@ -137,15 +137,15 @@ func (nv *NvidiaSMI) collectGPUInfoCSV(mx map[string]int64) error { continue } - seen[gpu.uuid] = true + px := "gpu_" + gpu.uuid + "_" + + seen[px] = true - if !nv.gpus[gpu.uuid] { - nv.gpus[gpu.uuid] = true + if !nv.gpus[px] { + nv.gpus[px] = true nv.addGPUCSVCharts(gpu) } - px := "gpu_" + gpu.uuid + "_" - addMetric(mx, px+"fan_speed_perc", gpu.fanSpeed, 0) addMetric(mx, px+"gpu_utilization", gpu.utilizationGPU, 0) addMetric(mx, px+"mem_utilization", gpu.utilizationMemory, 0) @@ -167,10 +167,10 @@ func (nv *NvidiaSMI) collectGPUInfoCSV(mx map[string]int64) error { } } - for uuid := range nv.gpus { - if !seen[uuid] { - delete(nv.gpus, uuid) - nv.removeGPUCharts(uuid) + for px := range nv.gpus { + if !seen[px] { + delete(nv.gpus, px) + nv.removeCharts(px) } } diff --git a/modules/nvidia_smi/collect_xml.go b/modules/nvidia_smi/collect_xml.go index 965aff321..06c109e43 100644 --- a/modules/nvidia_smi/collect_xml.go +++ b/modules/nvidia_smi/collect_xml.go @@ -20,22 +20,23 @@ func (nv *NvidiaSMI) collectGPUInfoXML(mx map[string]int64) error { return fmt.Errorf("error on unmarshaling XML GPU info response: %v", err) } - seen := make(map[string]bool) + seenGPU := make(map[string]bool) + seenMIG := make(map[string]bool) for _, gpu := range info.GPUs { if !isValidValue(gpu.UUID) { continue } - seen[gpu.UUID] = true + px := "gpu_" + gpu.UUID + "_" + + seenGPU[px] = true - if !nv.gpus[gpu.UUID] { - nv.gpus[gpu.UUID] = true + if !nv.gpus[px] { + nv.gpus[px] = true nv.addGPUXMLCharts(gpu) } - px := "gpu_" + gpu.UUID + "_" - addMetric(mx, px+"pcie_bandwidth_usage_rx", gpu.PCI.RxUtil, 1024) // KB => bytes addMetric(mx, px+"pcie_bandwidth_usage_tx", gpu.PCI.TxUtil, 1024) // KB => bytes if max := calcMaxPCIEBandwidth(gpu); max > 0 { @@ -62,19 +63,50 @@ func (nv *NvidiaSMI) collectGPUInfoXML(mx map[string]int64) error { addMetric(mx, px+"power_draw", gpu.PowerReadings.PowerDraw, 0) addMetric(mx, px+"voltage", gpu.Voltage.GraphicsVolt, 0) for i := 0; i < 16; i++ { - if s := "P" + strconv.Itoa(i); gpu.PerformanceState == s { - mx[px+"performance_state_"+s] = 1 - } else { - mx[px+"performance_state_"+s] = 0 + s := "P" + strconv.Itoa(i) + mx[px+"performance_state_"+s] = boolToInt(gpu.PerformanceState == s) + } + if isValidValue(gpu.MIGMode.CurrentMIG) { + mode := strings.ToLower(gpu.MIGMode.CurrentMIG) + mx[px+"mig_current_mode_enabled"] = boolToInt(mode == "enabled") + mx[px+"mig_current_mode_disabled"] = boolToInt(mode == "disabled") + mx[px+"mig_devices_count"] = int64(len(gpu.MIGDevices.MIGDevice)) + } + + for _, mig := range gpu.MIGDevices.MIGDevice { + if !isValidValue(mig.GPUInstanceID) { + continue + } + + px := "mig_instance_" + mig.GPUInstanceID + "_" + px + + seenMIG[px] = true + + if !nv.migs[px] { + nv.migs[px] = true + nv.addMIGDeviceXMLCharts(gpu, mig) } + + addMetric(mx, px+"ecc_error_sram_uncorrectable", mig.ECCErrorCount.VolatileCount.SRAMUncorrectable, 0) + addMetric(mx, px+"frame_buffer_memory_usage_free", mig.FBMemoryUsage.Free, 1024*1024) // MiB => bytes + addMetric(mx, px+"frame_buffer_memory_usage_used", mig.FBMemoryUsage.Used, 1024*1024) // MiB => bytes + addMetric(mx, px+"frame_buffer_memory_usage_reserved", mig.FBMemoryUsage.Reserved, 1024*1024) // MiB => bytes + addMetric(mx, px+"bar1_memory_usage_free", mig.BAR1MemoryUsage.Free, 1024*1024) // MiB => bytes + addMetric(mx, px+"bar1_memory_usage_used", mig.BAR1MemoryUsage.Used, 1024*1024) // MiB => bytes } + } + for px := range nv.gpus { + if !seenGPU[px] { + delete(nv.gpus, px) + nv.removeCharts(px) + } } - for uuid := range nv.gpus { - if !seen[uuid] { - delete(nv.gpus, uuid) - nv.removeGPUCharts(uuid) + for px := range nv.migs { + if !seenMIG[px] { + delete(nv.migs, px) + nv.removeCharts(px) } } @@ -122,7 +154,13 @@ type ( UUID string `xml:"uuid"` FanSpeed string `xml:"fan_speed"` PerformanceState string `xml:"performance_state"` - PCI struct { + MIGMode struct { + CurrentMIG string `xml:"current_mig"` + } `xml:"mig_mode"` + MIGDevices struct { + MIGDevice []xmlMIGDeviceInfo `xml:"mig_device"` + } `xml:"mig_devices"` + PCI struct { TxUtil string `xml:"tx_util"` RxUtil string `xml:"rx_util"` PCIGPULinkInfo struct { @@ -187,4 +225,34 @@ type ( } `sml:"process_info"` } `xml:"processes"` } + + xmlMIGDeviceInfo struct { + Index string `xml:"index"` + GPUInstanceID string `xml:"gpu_instance_id"` + ComputeInstanceID string `xml:"compute_instance_id"` + DeviceAttributes struct { + Shared struct { + MultiprocessorCount string `xml:"multiprocessor_count"` + CopyEngineCount string `xml:"copy_engine_count"` + EncoderCount string `xml:"encoder_count"` + DecoderCount string `xml:"decoder_count"` + OFACount string `xml:"ofa_count"` + JPGCount string `xml:"jpg_count"` + } `xml:"shared"` + } `xml:"device_attributes"` + ECCErrorCount struct { + VolatileCount struct { + SRAMUncorrectable string `xml:"sram_uncorrectable"` + } `xml:"volatile_count"` + } `xml:"ecc_error_count"` + FBMemoryUsage struct { + Free string `xml:"free"` + Used string `xml:"used"` + Reserved string `xml:"reserved"` + } `xml:"fb_memory_usage"` + BAR1MemoryUsage struct { + Free string `xml:"free"` + Used string `xml:"used"` + } `xml:"bar1_memory_usage"` + } ) diff --git a/modules/nvidia_smi/nvidia_smi.go b/modules/nvidia_smi/nvidia_smi.go index 541fa81d0..c8a2d1359 100644 --- a/modules/nvidia_smi/nvidia_smi.go +++ b/modules/nvidia_smi/nvidia_smi.go @@ -28,6 +28,7 @@ func New() *NvidiaSMI { binName: "nvidia-smi", charts: &module.Charts{}, gpus: make(map[string]bool), + migs: make(map[string]bool), } } @@ -51,6 +52,7 @@ type ( gpuQueryProperties []string gpus map[string]bool + migs map[string]bool } nvidiaSMI interface { queryGPUInfoXML() ([]byte, error) diff --git a/modules/nvidia_smi/nvidia_smi_test.go b/modules/nvidia_smi/nvidia_smi_test.go index f08e9fbc4..4176b13ce 100644 --- a/modules/nvidia_smi/nvidia_smi_test.go +++ b/modules/nvidia_smi/nvidia_smi_test.go @@ -17,6 +17,8 @@ var ( dataXMLRTX3060, _ = os.ReadFile("testdata/rtx-3060.xml") dataXMLTeslaP100, _ = os.ReadFile("testdata/tesla-p100.xml") + dataXMLA100SXM4MIG, _ = os.ReadFile("testdata/a100-sxm4-mig.xml") + dataHelpQueryGPU, _ = os.ReadFile("testdata/help-query-gpu.txt") dataCSVTeslaP100, _ = os.ReadFile("testdata/tesla-p100.csv") ) @@ -26,8 +28,11 @@ func Test_testDataIsValid(t *testing.T) { "dataXMLRTX2080Win": dataXMLRTX2080Win, "dataXMLRTX3060": dataXMLRTX3060, "dataXMLTeslaP100": dataXMLTeslaP100, - "dataHelpQueryGPU": dataHelpQueryGPU, - "dataCSVTeslaP100": dataCSVTeslaP100, + + "dataXMLA100SXM4MIG": dataXMLA100SXM4MIG, + + "dataHelpQueryGPU": dataHelpQueryGPU, + "dataCSVTeslaP100": dataCSVTeslaP100, } { require.NotNilf(t, data, name) } @@ -70,6 +75,10 @@ func TestNvidiaSMI_Check(t *testing.T) { prepare func(nv *NvidiaSMI) wantFail bool }{ + "success A100-SXM4 MIG [XML]": { + wantFail: false, + prepare: prepareCaseMIGA100formatXML, + }, "success RTX 3060 [XML]": { wantFail: false, prepare: prepareCaseRTX3060formatXML, @@ -121,6 +130,66 @@ func TestNvidiaSMI_Collect(t *testing.T) { check func(t *testing.T, nv *NvidiaSMI) } tests := map[string][]testCaseStep{ + "success A100-SXM4 MIG [XML]": { + { + prepare: prepareCaseMIGA100formatXML, + check: func(t *testing.T, nv *NvidiaSMI) { + mx := nv.Collect() + + expected := map[string]int64{ + "gpu_GPU-27b94a00-ed54-5c24-b1fd-1054085de32a_bar1_memory_usage_free": 68718428160, + "gpu_GPU-27b94a00-ed54-5c24-b1fd-1054085de32a_bar1_memory_usage_used": 1048576, + "gpu_GPU-27b94a00-ed54-5c24-b1fd-1054085de32a_frame_buffer_memory_usage_free": 42273341440, + "gpu_GPU-27b94a00-ed54-5c24-b1fd-1054085de32a_frame_buffer_memory_usage_reserved": 634388480, + "gpu_GPU-27b94a00-ed54-5c24-b1fd-1054085de32a_frame_buffer_memory_usage_used": 39845888, + "gpu_GPU-27b94a00-ed54-5c24-b1fd-1054085de32a_graphics_clock": 1410, + "gpu_GPU-27b94a00-ed54-5c24-b1fd-1054085de32a_mem_clock": 1215, + "gpu_GPU-27b94a00-ed54-5c24-b1fd-1054085de32a_mig_current_mode_disabled": 0, + "gpu_GPU-27b94a00-ed54-5c24-b1fd-1054085de32a_mig_current_mode_enabled": 1, + "gpu_GPU-27b94a00-ed54-5c24-b1fd-1054085de32a_mig_devices_count": 2, + "gpu_GPU-27b94a00-ed54-5c24-b1fd-1054085de32a_pcie_bandwidth_usage_rx": 0, + "gpu_GPU-27b94a00-ed54-5c24-b1fd-1054085de32a_pcie_bandwidth_usage_tx": 0, + "gpu_GPU-27b94a00-ed54-5c24-b1fd-1054085de32a_pcie_bandwidth_utilization_rx": 0, + "gpu_GPU-27b94a00-ed54-5c24-b1fd-1054085de32a_pcie_bandwidth_utilization_tx": 0, + "gpu_GPU-27b94a00-ed54-5c24-b1fd-1054085de32a_performance_state_P0": 1, + "gpu_GPU-27b94a00-ed54-5c24-b1fd-1054085de32a_performance_state_P1": 0, + "gpu_GPU-27b94a00-ed54-5c24-b1fd-1054085de32a_performance_state_P10": 0, + "gpu_GPU-27b94a00-ed54-5c24-b1fd-1054085de32a_performance_state_P11": 0, + "gpu_GPU-27b94a00-ed54-5c24-b1fd-1054085de32a_performance_state_P12": 0, + "gpu_GPU-27b94a00-ed54-5c24-b1fd-1054085de32a_performance_state_P13": 0, + "gpu_GPU-27b94a00-ed54-5c24-b1fd-1054085de32a_performance_state_P14": 0, + "gpu_GPU-27b94a00-ed54-5c24-b1fd-1054085de32a_performance_state_P15": 0, + "gpu_GPU-27b94a00-ed54-5c24-b1fd-1054085de32a_performance_state_P2": 0, + "gpu_GPU-27b94a00-ed54-5c24-b1fd-1054085de32a_performance_state_P3": 0, + "gpu_GPU-27b94a00-ed54-5c24-b1fd-1054085de32a_performance_state_P4": 0, + "gpu_GPU-27b94a00-ed54-5c24-b1fd-1054085de32a_performance_state_P5": 0, + "gpu_GPU-27b94a00-ed54-5c24-b1fd-1054085de32a_performance_state_P6": 0, + "gpu_GPU-27b94a00-ed54-5c24-b1fd-1054085de32a_performance_state_P7": 0, + "gpu_GPU-27b94a00-ed54-5c24-b1fd-1054085de32a_performance_state_P8": 0, + "gpu_GPU-27b94a00-ed54-5c24-b1fd-1054085de32a_performance_state_P9": 0, + "gpu_GPU-27b94a00-ed54-5c24-b1fd-1054085de32a_power_draw": 66, + "gpu_GPU-27b94a00-ed54-5c24-b1fd-1054085de32a_sm_clock": 1410, + "gpu_GPU-27b94a00-ed54-5c24-b1fd-1054085de32a_temperature": 36, + "gpu_GPU-27b94a00-ed54-5c24-b1fd-1054085de32a_video_clock": 1275, + "gpu_GPU-27b94a00-ed54-5c24-b1fd-1054085de32a_voltage": 881, + "mig_instance_1_gpu_GPU-27b94a00-ed54-5c24-b1fd-1054085de32a_bar1_memory_usage_free": 34358689792, + "mig_instance_1_gpu_GPU-27b94a00-ed54-5c24-b1fd-1054085de32a_bar1_memory_usage_used": 0, + "mig_instance_1_gpu_GPU-27b94a00-ed54-5c24-b1fd-1054085de32a_ecc_error_sram_uncorrectable": 0, + "mig_instance_1_gpu_GPU-27b94a00-ed54-5c24-b1fd-1054085de32a_frame_buffer_memory_usage_free": 20916994048, + "mig_instance_1_gpu_GPU-27b94a00-ed54-5c24-b1fd-1054085de32a_frame_buffer_memory_usage_reserved": 0, + "mig_instance_1_gpu_GPU-27b94a00-ed54-5c24-b1fd-1054085de32a_frame_buffer_memory_usage_used": 19922944, + "mig_instance_2_gpu_GPU-27b94a00-ed54-5c24-b1fd-1054085de32a_bar1_memory_usage_free": 34358689792, + "mig_instance_2_gpu_GPU-27b94a00-ed54-5c24-b1fd-1054085de32a_bar1_memory_usage_used": 0, + "mig_instance_2_gpu_GPU-27b94a00-ed54-5c24-b1fd-1054085de32a_ecc_error_sram_uncorrectable": 0, + "mig_instance_2_gpu_GPU-27b94a00-ed54-5c24-b1fd-1054085de32a_frame_buffer_memory_usage_free": 20916994048, + "mig_instance_2_gpu_GPU-27b94a00-ed54-5c24-b1fd-1054085de32a_frame_buffer_memory_usage_reserved": 0, + "mig_instance_2_gpu_GPU-27b94a00-ed54-5c24-b1fd-1054085de32a_frame_buffer_memory_usage_used": 19922944, + } + + assert.Equal(t, expected, mx) + }, + }, + }, "success RTX 3060 [XML]": { { prepare: prepareCaseRTX3060formatXML, @@ -385,6 +454,11 @@ func (m *mockNvidiaSMI) queryHelpQueryGPU() ([]byte, error) { return m.helpQueryGPU, nil } +func prepareCaseMIGA100formatXML(nv *NvidiaSMI) { + nv.UseCSVFormat = false + nv.exec = &mockNvidiaSMI{gpuInfoXML: dataXMLA100SXM4MIG} +} + func prepareCaseRTX3060formatXML(nv *NvidiaSMI) { nv.UseCSVFormat = false nv.exec = &mockNvidiaSMI{gpuInfoXML: dataXMLRTX3060} diff --git a/modules/nvidia_smi/testdata/a100-sxm4-mig.xml b/modules/nvidia_smi/testdata/a100-sxm4-mig.xml new file mode 100644 index 000000000..74146ac78 --- /dev/null +++ b/modules/nvidia_smi/testdata/a100-sxm4-mig.xml @@ -0,0 +1,359 @@ + + + + Fri Jan 27 11:32:31 2023 + 510.47.03 + 11.6 + 1 + + NVIDIA A100-SXM4-40GB + NVIDIA + Ampere + Enabled + Disabled + Disabled + + Enabled + Enabled + + + + 0 + 1 + 0 + + + 42 + 3 + 0 + 2 + 0 + 0 + + + + + 0 + + + + 19968 MiB + 0 MiB + 19 MiB + 19948 MiB + + + 32767 MiB + 0 MiB + 32767 MiB + + + + 1 + 2 + 0 + + + 42 + 3 + 0 + 2 + 0 + 0 + + + + + 0 + + + + 19968 MiB + 0 MiB + 19 MiB + 19948 MiB + + + 32767 MiB + 0 MiB + 32767 MiB + + + + Disabled + 4000 + + N/A + N/A + + 1324321002473 + GPU-27b94a00-ed54-5c24-b1fd-1054085de32a + 0 + 92.00.45.00.03 + No + 0x4 + 692-2G506-0200-003 + 3 + + G506.0200.00.04 + 2.0 + 6.16 + N/A + + + N/A + N/A + + 510.47.03 + + Pass-Through + N/A + + + N/A + + + 00 + 04 + 0000 + 20B010DE + 00000000:00:04.0 + 134F10DE + + + 4 + 4 + + + 16x + 16x + + + + N/A + N/A + + 0 + 0 + 0 KB/s + 0 KB/s + + N/A + P0 + + Not Active + Not Active + + Not Active + Not Active + Not Active + Not Active + Not Active + Not Active + Not Active + + + 40960 MiB + 605 MiB + 38 MiB + 40315 MiB + + + 65536 MiB + 1 MiB + 65535 MiB + + Default + + N/A + N/A + N/A + N/A + + + 0 + 0 + 0 + + + 0 + 0 + 0 + + + Enabled + Enabled + + + + N/A + N/A + N/A + N/A + + + 0 + 0 + 0 + 0 + + + + + N/A + N/A + + + N/A + N/A + + N/A + N/A + + N/A + + 36 C + 92 C + 89 C + 85 C + N/A + 44 C + 95 C + + + N/A + N/A + + + P0 + Supported + 66.92 W + 400.00 W + 400.00 W + 400.00 W + 100.00 W + 400.00 W + + + 1410 MHz + 1410 MHz + 1215 MHz + 1275 MHz + + + 1095 MHz + 1215 MHz + + + 1095 MHz + 1215 MHz + + + 1410 MHz + 1410 MHz + 1215 MHz + 1290 MHz + + + 1410 MHz + + + N/A + N/A + + + 881.250 mV + + + + 1215 MHz + 1410 MHz + 1395 MHz + 1380 MHz + 1365 MHz + 1350 MHz + 1335 MHz + 1320 MHz + 1305 MHz + 1290 MHz + 1275 MHz + 1260 MHz + 1245 MHz + 1230 MHz + 1215 MHz + 1200 MHz + 1185 MHz + 1170 MHz + 1155 MHz + 1140 MHz + 1125 MHz + 1110 MHz + 1095 MHz + 1080 MHz + 1065 MHz + 1050 MHz + 1035 MHz + 1020 MHz + 1005 MHz + 990 MHz + 975 MHz + 960 MHz + 945 MHz + 930 MHz + 915 MHz + 900 MHz + 885 MHz + 870 MHz + 855 MHz + 840 MHz + 825 MHz + 810 MHz + 795 MHz + 780 MHz + 765 MHz + 750 MHz + 735 MHz + 720 MHz + 705 MHz + 690 MHz + 675 MHz + 660 MHz + 645 MHz + 630 MHz + 615 MHz + 600 MHz + 585 MHz + 570 MHz + 555 MHz + 540 MHz + 525 MHz + 510 MHz + 495 MHz + 480 MHz + 465 MHz + 450 MHz + 435 MHz + 420 MHz + 405 MHz + 390 MHz + 375 MHz + 360 MHz + 345 MHz + 330 MHz + 315 MHz + 300 MHz + 285 MHz + 270 MHz + 255 MHz + 240 MHz + 225 MHz + 210 MHz + + + + + + + + +