Skip to content
This repository has been archived by the owner on Mar 27, 2024. It is now read-only.

Commit

Permalink
feat(nvidia_smi): collect PCIe bandwidth utilization (XML only) (#1066)
Browse files Browse the repository at this point in the history
  • Loading branch information
ilyam8 authored Jan 27, 2023
1 parent 6e526e1 commit eddf28e
Show file tree
Hide file tree
Showing 5 changed files with 96 additions and 21 deletions.
31 changes: 16 additions & 15 deletions modules/nvidia_smi/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,21 +23,22 @@ Labels per scope:

- gpu: product_name.

| Metric | Scope | Dimensions | Units | XML | CSV |
|-------------------------------|:-----:|:------------------------:|:-------:|:---:|:---:|
| gpu_pcie_bandwidth_usage | gpu | rx, tx | B/s | yes | no |
| gpu_fan_speed_perc | gpu | fan_speed | % | yes | yes |
| gpu_utilization | gpu | gpu | % | yes | yes |
| gpu_memory_utilization | gpu | memory | % | yes | yes |
| gpu_decoder_utilization | gpu | decoder | % | yes | no |
| gpu_encoder_utilization | gpu | encoder | % | yes | no |
| gpu_frame_buffer_memory_usage | gpu | free, used, reserved | B | yes | yes |
| gpu_bar1_memory_usage | gpu | free, used | B | yes | no |
| gpu_temperature | gpu | temperature | Celsius | yes | yes |
| gpu_voltage | gpu | voltage | V | yes | no |
| gpu_clock_freq | gpu | graphics, video, sm, mem | MHz | yes | yes |
| gpu_power_draw | gpu | power_draw | Watts | yes | yes |
| gpu_performance_state | gpu | P0-P15 | state | yes | yes |
| Metric | Scope | Dimensions | Units | XML | CSV |
|--------------------------------|:-----:|:------------------------:|:-------:|:---:|:---:|
| gpu_pcie_bandwidth_usage | gpu | rx, tx | B/s | yes | no |
| gpu_pcie_bandwidth_utilization | gpu | rx, tx | % | yes | no |
| gpu_fan_speed_perc | gpu | fan_speed | % | yes | yes |
| gpu_utilization | gpu | gpu | % | yes | yes |
| gpu_memory_utilization | gpu | memory | % | yes | yes |
| gpu_decoder_utilization | gpu | decoder | % | yes | no |
| gpu_encoder_utilization | gpu | encoder | % | yes | no |
| gpu_frame_buffer_memory_usage | gpu | free, used, reserved | B | yes | yes |
| gpu_bar1_memory_usage | gpu | free, used | B | yes | no |
| gpu_temperature | gpu | temperature | Celsius | yes | yes |
| gpu_voltage | gpu | voltage | V | yes | no |
| gpu_clock_freq | gpu | graphics, video, sm, mem | MHz | yes | yes |
| gpu_power_draw | gpu | power_draw | Watts | yes | yes |
| gpu_performance_state | gpu | P0-P15 | state | yes | yes |

## Configuration

Expand Down
14 changes: 14 additions & 0 deletions modules/nvidia_smi/charts.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ import (

const (
prioGPUPCIBandwidthUsage = module.Priority + iota
prioGPUPCIBandwidthUtilization
prioGPUFanSpeed
prioGPUUtilization
prioGPUMemUtilization
Expand All @@ -28,6 +29,7 @@ const (
var (
gpuXMLCharts = module.Charts{
gpuPCIBandwidthUsageChartTmpl.Copy(),
gpuPCIBandwidthUtilizationChartTmpl.Copy(),
gpuFanSpeedPercChartTmpl.Copy(),
gpuUtilizationChartTmpl.Copy(),
gpuMemUtilizationChartTmpl.Copy(),
Expand Down Expand Up @@ -67,6 +69,18 @@ var (
{ID: "gpu_%s_pcie_bandwidth_usage_tx", Name: "tx", Mul: -1},
},
}
// gpuPCIBandwidthUtilizationChartTmpl is the chart template for per-GPU PCIe
// bandwidth utilization (rx/tx), expressed as a percentage.
// NOTE(review): the "%s" placeholders are presumably filled with the GPU UUID
// when the template is instantiated for a GPU - confirm against the code that
// copies this template.
gpuPCIBandwidthUtilizationChartTmpl = module.Chart{
ID: "gpu_%s_pcie_bandwidth_utilization",
Title: "PCI Express Bandwidth Utilization",
Units: "percentage",
Fam: "pcie bandwidth",
Ctx: "nvidia_smi.gpu_pcie_bandwidth_utilization",
Priority: prioGPUPCIBandwidthUtilization,
Dims: module.Dims{
// The XML collector stores these metrics as the percentage multiplied by
// 100 (to keep two decimal digits in an int64); Div: 100 presumably scales
// the value back to a percentage for display.
{ID: "gpu_%s_pcie_bandwidth_utilization_rx", Name: "rx", Div: 100},
{ID: "gpu_%s_pcie_bandwidth_utilization_tx", Name: "tx", Div: 100},
},
}
gpuFanSpeedPercChartTmpl = module.Chart{
ID: "gpu_%s_fan_speed_perc",
Title: "Fan speed",
Expand Down
17 changes: 13 additions & 4 deletions modules/nvidia_smi/collect.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,10 +34,7 @@ func addMetric(mx map[string]int64, key, value string, mul int) {
return
}

// remove units
if i := strings.IndexByte(value, ' '); i != -1 {
value = value[:i]
}
value = removeUnits(value)

v, err := strconv.ParseFloat(value, 64)
if err != nil {
Expand All @@ -54,3 +51,15 @@ func addMetric(mx map[string]int64, key, value string, mul int) {
// isValidValue reports whether v holds an actual reading, i.e. it is neither
// empty nor one of the "not available" placeholders ("N/A", "[N/A]").
// The comparison is exact: case and surrounding whitespace matter.
func isValidValue(v string) bool {
	switch v {
	case "", "N/A", "[N/A]":
		return false
	}
	return true
}

// parseFloat strips a trailing unit suffix from s (see removeUnits) and parses
// the remainder as a 64-bit float.
//
// The parse error is intentionally discarded: the caller receives whatever
// value strconv.ParseFloat returned alongside it, which lets call sites treat
// unparsable input as "no value" without extra error plumbing.
func parseFloat(s string) float64 {
	number := removeUnits(s)
	parsed, _ := strconv.ParseFloat(number, 64)
	return parsed
}

// removeUnits returns the part of s that precedes the first space, dropping a
// trailing unit such as " KB/s" or " MHz". Strings without a space are
// returned unchanged.
func removeUnits(s string) string {
	cut := strings.IndexByte(s, ' ')
	if cut < 0 {
		return s
	}
	return s[:cut]
}
49 changes: 47 additions & 2 deletions modules/nvidia_smi/collect_xml.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import (
"encoding/xml"
"fmt"
"strconv"
"strings"
)

func (nv *NvidiaSMI) collectGPUInfoXML(mx map[string]int64) error {
Expand Down Expand Up @@ -37,6 +38,12 @@ func (nv *NvidiaSMI) collectGPUInfoXML(mx map[string]int64) error {

addMetric(mx, px+"pcie_bandwidth_usage_rx", gpu.PCI.RxUtil, 1024) // KB => bytes
addMetric(mx, px+"pcie_bandwidth_usage_tx", gpu.PCI.TxUtil, 1024) // KB => bytes
if max := calcMaxPCIEBandwidth(gpu); max > 0 {
rx := parseFloat(gpu.PCI.RxUtil) * 1024 // KB => bytes
tx := parseFloat(gpu.PCI.TxUtil) * 1024 // KB => bytes
mx[px+"pcie_bandwidth_utilization_rx"] = int64((rx * 100 / max) * 100)
mx[px+"pcie_bandwidth_utilization_tx"] = int64((tx * 100 / max) * 100)
}
addMetric(mx, px+"fan_speed_perc", gpu.FanSpeed, 0)
addMetric(mx, px+"gpu_utilization", gpu.Utilization.GpuUtil, 0)
addMetric(mx, px+"mem_utilization", gpu.Utilization.MemoryUtil, 0)
Expand All @@ -61,6 +68,7 @@ func (nv *NvidiaSMI) collectGPUInfoXML(mx map[string]int64) error {
mx[px+"performance_state_"+s] = 0
}
}

}

for uuid := range nv.gpus {
Expand All @@ -73,6 +81,35 @@ func (nv *NvidiaSMI) collectGPUInfoXML(mx map[string]int64) error {
return nil
}

// calcMaxPCIEBandwidth returns the maximum PCIe bandwidth of the GPU link in
// bytes per second, computed from the maximum link generation and the maximum
// link width reported in the nvidia-smi XML output.
//
// It returns 0 when the generation or width is missing/"N/A", or when the
// generation is not one of "1".."5"; callers use a positive result as the
// signal that utilization can be calculated.
func calcMaxPCIEBandwidth(gpu xmlGPUInfo) float64 {
	gen := gpu.PCI.PCIGPULinkInfo.PCIEGen.MaxLinkGen
	// The width is reported with an "x" suffix (e.g. "16x").
	width := strings.TrimSuffix(gpu.PCI.PCIGPULinkInfo.LinkWidths.MaxLinkWidth, "x")

	if !isValidValue(gen) || !isValidValue(width) {
		return 0
	}

	// https://enterprise-support.nvidia.com/s/article/understanding-pcie-configuration-for-maximum-performance
	//
	// speed is the per-lane rate used by the formula below; enc is the
	// fraction of the raw rate lost to line encoding (1/5 for gen 1-2,
	// 2/130 for gen 3-5).
	//
	// The fractions must be written as floating-point constants: `1/5` and
	// `2/130` are untyped integer constant expressions and evaluate to 0,
	// which would silently drop the encoding overhead from the result.
	var speed, enc float64
	switch gen {
	case "1":
		speed, enc = 2.5, 1.0/5
	case "2":
		speed, enc = 5, 1.0/5
	case "3":
		speed, enc = 8, 2.0/130
	case "4":
		speed, enc = 16, 2.0/130
	case "5":
		speed, enc = 32, 2.0/130
	default:
		return 0
	}

	// Maximum PCIe Bandwidth = SPEED * WIDTH * (1 - ENCODING) - 1Gb/s
	return (speed*parseFloat(width)*(1-enc) - 1) * 1e9 / 8 // Gb/s => bytes
}

type (
xmlInfo struct {
GPUs []xmlGPUInfo `xml:"gpu"`
Expand All @@ -86,8 +123,16 @@ type (
FanSpeed string `xml:"fan_speed"`
PerformanceState string `xml:"performance_state"`
PCI struct {
TxUtil string `xml:"tx_util"`
RxUtil string `xml:"rx_util"`
TxUtil string `xml:"tx_util"`
RxUtil string `xml:"rx_util"`
PCIGPULinkInfo struct {
PCIEGen struct {
MaxLinkGen string `xml:"max_link_gen"`
} `xml:"pcie_gen"`
LinkWidths struct {
MaxLinkWidth string `xml:"max_link_width"`
} `xml:"link_widths"`
} `xml:"pci_gpu_link_info"`
} `xml:"pci"`
Utilization struct {
GpuUtil string `xml:"gpu_util"`
Expand Down
6 changes: 6 additions & 0 deletions modules/nvidia_smi/nvidia_smi_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,8 @@ func TestNvidiaSMI_Collect(t *testing.T) {
"gpu_GPU-473d8d0f-d462-185c-6b36-6fc23e23e571_mem_utilization": 0,
"gpu_GPU-473d8d0f-d462-185c-6b36-6fc23e23e571_pcie_bandwidth_usage_rx": 0,
"gpu_GPU-473d8d0f-d462-185c-6b36-6fc23e23e571_pcie_bandwidth_usage_tx": 0,
"gpu_GPU-473d8d0f-d462-185c-6b36-6fc23e23e571_pcie_bandwidth_utilization_rx": 0,
"gpu_GPU-473d8d0f-d462-185c-6b36-6fc23e23e571_pcie_bandwidth_utilization_tx": 0,
"gpu_GPU-473d8d0f-d462-185c-6b36-6fc23e23e571_performance_state_P0": 0,
"gpu_GPU-473d8d0f-d462-185c-6b36-6fc23e23e571_performance_state_P1": 0,
"gpu_GPU-473d8d0f-d462-185c-6b36-6fc23e23e571_performance_state_P10": 0,
Expand Down Expand Up @@ -188,6 +190,8 @@ func TestNvidiaSMI_Collect(t *testing.T) {
"gpu_GPU-d3da8716-eaab-75db-efc1-60e88e1cd55e_mem_utilization": 0,
"gpu_GPU-d3da8716-eaab-75db-efc1-60e88e1cd55e_pcie_bandwidth_usage_rx": 0,
"gpu_GPU-d3da8716-eaab-75db-efc1-60e88e1cd55e_pcie_bandwidth_usage_tx": 0,
"gpu_GPU-d3da8716-eaab-75db-efc1-60e88e1cd55e_pcie_bandwidth_utilization_rx": 0,
"gpu_GPU-d3da8716-eaab-75db-efc1-60e88e1cd55e_pcie_bandwidth_utilization_tx": 0,
"gpu_GPU-d3da8716-eaab-75db-efc1-60e88e1cd55e_performance_state_P0": 1,
"gpu_GPU-d3da8716-eaab-75db-efc1-60e88e1cd55e_performance_state_P1": 0,
"gpu_GPU-d3da8716-eaab-75db-efc1-60e88e1cd55e_performance_state_P10": 0,
Expand Down Expand Up @@ -275,6 +279,8 @@ func TestNvidiaSMI_Collect(t *testing.T) {
"gpu_GPU-fbd55ed4-1eec-4423-0a47-ad594b4333e3_mem_utilization": 7,
"gpu_GPU-fbd55ed4-1eec-4423-0a47-ad594b4333e3_pcie_bandwidth_usage_rx": 93184000,
"gpu_GPU-fbd55ed4-1eec-4423-0a47-ad594b4333e3_pcie_bandwidth_usage_tx": 13312000,
"gpu_GPU-fbd55ed4-1eec-4423-0a47-ad594b4333e3_pcie_bandwidth_utilization_rx": 58,
"gpu_GPU-fbd55ed4-1eec-4423-0a47-ad594b4333e3_pcie_bandwidth_utilization_tx": 8,
"gpu_GPU-fbd55ed4-1eec-4423-0a47-ad594b4333e3_performance_state_P0": 0,
"gpu_GPU-fbd55ed4-1eec-4423-0a47-ad594b4333e3_performance_state_P1": 0,
"gpu_GPU-fbd55ed4-1eec-4423-0a47-ad594b4333e3_performance_state_P10": 0,
Expand Down

0 comments on commit eddf28e

Please sign in to comment.