Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Issue#29][Can nvidiagpubeat be made to also export the process runni… #30

Merged
merged 1 commit into from
Jan 3, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
87 changes: 69 additions & 18 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -133,43 +133,94 @@ export PATH=$PATH:.
./nvidiagpubeat -c nvidiagpubeat.yml -e -d "*" -E seccomp.enabled=false
```

localnvidiasmi executable built for macOS and is a mock GPU event generator.
The localnvidiasmi executable is built for macOS and is a mock GPU event generator that supports events for `--query-compute-apps` and `--query-gpu`. The executable is generated from the nvidiasmilocal/localnvidiasmi.go file.


### Sample event
Below is a sample event emitted by nvidiagpubeat.
The file nvidiagpubeat.yml configures the `nvidiagpubeat` beat and supports multiple options for `query`. For example, `query: "--query-gpu=..."` provides information about the GPUs, and `query: "--query-compute-apps=..."` lists the currently active compute processes.

With a `--query-gpu` query, nvidiagpubeat emits an event like the one below.

```
Publish event: {
"@timestamp": "2019-03-25T15:34:17.739Z",
Publish event: {
"@timestamp": "2021-01-03T07:27:16.080Z",
"@metadata": {
"beat": "nvidiagpubeat",
"type": "doc",
"version": "6.5.5"
},
"type": "nvidiagpubeat",
"gpu_uuid": "GPU-b884db58-6340-7969-a79f-b937f3583884",
"driver_version": "418.87.01",
"index": 3,
"gpu_serial": 3.20218176911e+11,
"memory": {
"used": 3256,
"total": 16280
},
"name": "Tesla100-PCIE-16GB",
"host": {
"name": "AB-SJC-11111111"
},
"utilization": {
"gpu": 4,
"memory": 40
"memory": 50,
"gpu": 50
},
"memory": {
"used": 0,
"total": 6082,
"free": 6082
"beat": {
"name": "AB-SJC-11111111",
"hostname": "AB-SJC-11111111",
"version": "6.5.5"
},
"pstate": 0,
"gpu_bus_id": "00000000:19:00.0",
"count": 4,
"fan": {
"speed": "[NotSupported]"
},
"gpuIndex": 3,
"power": {
"draw": 25.28,
"limit": 250
},
"temperature": {
"gpu": 27
"gpu": 24
},
"pstate": 8,
"beat": {
"name": "hostname.company.com",
"hostname": "hostname.company.com",
"clocks": {
"gr": 405,
"sm": 405,
"mem": 715
}
}
```

With a `--query-compute-apps` query, nvidiagpubeat emits an event like the one below.

```
Publish event: {
"@timestamp": "2021-01-03T07:29:53.633Z",
"@metadata": {
"beat": "nvidiagpubeat",
"type": "doc",
"version": "6.5.5"
},
"host": {
"name": "hostname.company.com"
"pid": 222414,
"process_name": "[NotFound]",
"used_gpu_memory": 10,
"gpu_bus_id": "00000000:19:00.0",
"gpu_serial": 3.20218176911e+11,
"beat": {
"name": "AB-SJC-11111111",
"hostname": "AB-SJC-11111111",
"version": "6.5.5"
},
"gpu_name": "Tesla100-PCIE-16GB",
"used_memory": 15,
"gpuIndex": 3,
"type": "nvidiagpubeat"
"type": "nvidiagpubeat",
"gpu_uuid": "GPU-b884db58-6340-7969-a79f-b937f3583884",
"host": {
"name": "LM-SJC-11004865"
}
}
```

Expand Down
2 changes: 1 addition & 1 deletion config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,6 @@ type Config struct {

// DefaultConfig is the default beat configuration: a one-second period, a
// --query-gpu query covering utilization, memory, temperature and pstate
// fields, and the "local" environment.
var DefaultConfig = Config{
	Period: 1 * time.Second,
	// The query names its nvidia-smi option explicitly; the field must be set
	// only once, since a struct literal may not repeat a field.
	Query: "--query-gpu=utilization.gpu,utilization.memory,memory.total,memory.free,memory.used,temperature.gpu,pstate",
	Env:   "local",
}
Binary file modified localnvidiasmi
Binary file not shown.
30 changes: 25 additions & 5 deletions nvidia/gpu.go
Original file line number Diff line number Diff line change
Expand Up @@ -46,26 +46,38 @@ func newUtilization() Utilization {

// command builds the external command that produces the CSV report for query.
//
// When env is "test" the mock binary localnvidiasmi is invoked instead of
// nvidia-smi; it receives the single argument "query-compute-apps" when the
// query mentions that option and no arguments otherwise. For any other env,
// nvidia-smi is invoked with "--format=csv". A query that already contains
// "=" (for example "--query-compute-apps=pid,process_name") is passed through
// unchanged; a bare field list is prefixed with "--query-gpu=".
func (g Utilization) command(env string, query string) *exec.Cmd {
	if env == "test" {
		if strings.Contains(query, "query-compute-apps") {
			return exec.Command("localnvidiasmi", "query-compute-apps")
		}
		return exec.Command("localnvidiasmi")
	}

	// The query already names its own option, so hand it to nvidia-smi as is.
	if strings.Contains(query, "=") {
		return exec.Command("nvidia-smi", query, "--format=csv")
	}

	// For backward compatibility, a bare field list defaults to --query-gpu.
	return exec.Command("nvidia-smi", "--query-gpu="+query, "--format=csv")
}

//Run the nvidiasmi command to collect GPU metrics
//Parse output and return events.
func (g Utilization) run(cmd *exec.Cmd, gpuCount int, query string, action Action) ([]common.MapStr, error) {
logp.Info("Running query: %s with gpuCount %d", query, gpuCount)
logp.Info("Running command %s for query: %s with gpuCount %d", cmd, query, gpuCount)
reader := action.start(cmd)
gpuIndex := 0
events := make([]common.MapStr, gpuCount, 2*gpuCount)

for {
line, err := reader.ReadString('\n')

if err == io.EOF {
break
}
// Ignore header
if strings.Contains(line, "utilization") {
if strings.Contains(line, "utilization") || strings.Contains(line, "gpu_name") || strings.Contains(line, "gpu_uuid") {
continue
}
if len(line) == 0 {
Expand All @@ -85,7 +97,15 @@ func (g Utilization) run(cmd *exec.Cmd, gpuCount int, query string, action Actio
if err == io.EOF {
break
}
headers := strings.Split(query, ",")

var headers []string
if strings.Contains(query, "=") {
rawHeaders := strings.Split(query, "=")
headers = strings.Split(rawHeaders[1], ",")
} else {
headers = strings.Split(query, ",")
}

event := common.MapStr{
"gpuIndex": gpuIndex,
"type": "nvidiagpubeat",
Expand Down
8 changes: 7 additions & 1 deletion nvidiagpubeat.yml
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,13 @@
nvidiagpubeat:
# Defines how often an event is sent to the output
period: 1s
query: "name,driver_version,count,index,fan.speed,memory.total,memory.used,utilization.gpu,utilization.memory,temperature.gpu,power.draw,power.limit,clocks.gr,clocks.sm,clocks.mem,pstate"
# By default the query of type query-gpu is executed to support backward compatibility
#query: "name,gpu_bus_id,gpu_serial,gpu_uuid,driver_version,count,index,fan.speed,memory.total,memory.used,utilization.gpu,utilization.memory,temperature.gpu,power.draw,power.limit,clocks.gr,clocks.sm,clocks.mem,pstate"
# A generic version of query is supported by nvidiagpubeat for query options like --query-gpu,--query-compute-apps and others.
# --query-gpu will provide information about GPU.
#query: "--query-gpu=name,gpu_bus_id,gpu_serial,gpu_uuid,driver_version,count,index,fan.speed,memory.total,memory.used,utilization.gpu,utilization.memory,temperature.gpu,power.draw,power.limit,clocks.gr,clocks.sm,clocks.mem,pstate"
# --query-compute-apps will list currently active compute processes.
query: "--query-compute-apps=gpu_name,gpu_bus_id,gpu_serial,gpu_uuid,pid,process_name,used_gpu_memory,used_memory"
env: "test"
# env can be test or production. test is for test purposes to evaluate functionality of this beat. Switch to production
# when you want to run this beat on a Nvidia GPU machine with SMI driver installed.
Expand Down
35 changes: 30 additions & 5 deletions nvidiasmilocal/localnvidiasmi.go
Original file line number Diff line number Diff line change
@@ -1,11 +1,36 @@
package main

import "fmt"
import (
"fmt"
"os"
)

// main prints one of the two mock reports. With no command-line argument, or
// with "utilization" as the first argument, it prints the GPU utilization
// report; any other first argument selects the compute-process report.
func main() {
	// Only the first argument is inspected; absent means "utilization".
	mode := "utilization"
	if len(os.Args) > 1 {
		mode = os.Args[1]
	}

	switch mode {
	case "utilization":
		utilization()
	default:
		process()
	}
}

// utilization prints a mock per-GPU CSV report to standard output: one header
// line followed by one row for each of four GPUs (index 0 through 3).
//
// Every line carries the same 19 columns, in the order used by the sample
// query in nvidiagpubeat.yml (name, bus id, serial, uuid, driver version, ...),
// so that values line up with the requested field names by position.
func utilization() {
	// Header line; it contains "utilization", the marker the beat's parser
	// uses to skip header lines.
	fmt.Println("name, pci.bus_id, serial, uuid, driver_version, count, index, fan.speed [%], memory.total [MiB], memory.used [MiB], utilization.gpu [%], utilization.memory [%], temperature.gpu, power.draw [W], power.limit [W], clocks.current.graphics [MHz], clocks.current.sm [MHz], clocks.current.memory [MHz], pstate")
	fmt.Println("Tesla P100-PCIE-16GB, 00000000:08:00.0, 0320218165889, GPU-78f90e78-39a0-4f40-fcbc-0adf3598c166, 418.87.01, 4, 0, [Not Supported], 16280 MiB, 1628 MiB, 10 %, 10 %, 25, 24.80 W, 250.00 W, 405 MHz, 405 MHz, 715 MHz, P0")
	fmt.Println("Tesla P100-PCIE-16GB, 00000000:0B:00.0, 0320218176885, GPU-d1229c61-babc-aebe-ff8f-6dc94386640c, 418.87.01, 4, 1, [Not Supported], 16280 MiB, 3256 MiB, 30 %, 20 %, 25, 25.05 W, 250.00 W, 405 MHz, 405 MHz, 715 MHz, P0")
	fmt.Println("Tesla P100-PCIE-16GB, 00000000:16:00.0, 0320218166179, GPU-eb5e8723-4a49-98f5-8e77-21b06537da8a, 418.87.01, 4, 2, [Not Supported], 16280 MiB, 1628 MiB, 20 %, 10 %, 24, 26.26 W, 250.00 W, 405 MHz, 405 MHz, 715 MHz, P0")
	fmt.Println("Tesla P100-PCIE-16GB, 00000000:19:00.0, 0320218176911, GPU-b884db58-6340-7969-a79f-b937f3583884, 418.87.01, 4, 3, [Not Supported], 16280 MiB, 3256 MiB, 50 %, 50 %, 24, 25.28 W, 250.00 W, 405 MHz, 405 MHz, 715 MHz, P0")
}

// process prints a mock compute-process CSV report to standard output: one
// header line followed by four process rows, each on its own line.
func process() {
	report := []string{
		"gpu_name, gpu_bus_id, gpu_serial, gpu_uuid, pid, process_name, used_gpu_memory [MiB], used_gpu_memory [MiB]",
		"Tesla P100-PCIE-16GB, 00000000:16:00.0, 0320218176947, GPU-bb7f65ee-acdb-7efd-0f32-73699400b86e, 240930, python, 10 MiB, 15 MiB",
		"Tesla P100-PCIE-16GB, 00000000:0B:00.0, 0320218176885, GPU-d1229c61-babc-aebe-ff8f-6dc94386640c, 65808, python, 10 MiB, 15 MiB",
		"Tesla P100-PCIE-16GB, 00000000:16:00.0, 0320218166179, GPU-eb5e8723-4a49-98f5-8e77-21b06537da8a, 267414, python, 10 MiB, 15 MiB",
		"Tesla P100-PCIE-16GB, 00000000:19:00.0, 0320218176911, GPU-b884db58-6340-7969-a79f-b937f3583884, 222414, [Not Found], 10 MiB, 15 MiB",
	}
	for _, row := range report {
		fmt.Println(row)
	}
}