Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Gpu support #1973

Merged
merged 15 commits into from
Jun 19, 2023
1 change: 1 addition & 0 deletions .github/workflows/publish-development.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ jobs:

- name: Build binaries
run: |
go generate ./pkg/capacity/...
cd cmds
make
env:
Expand Down
1 change: 1 addition & 0 deletions .github/workflows/publish-pre-release.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ jobs:

- name: Build binaries
run: |
go generate ./pkg/capacity/...
cd cmds
make
env:
Expand Down
1 change: 1 addition & 0 deletions .github/workflows/publish-release.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ jobs:

- name: Build binaries
run: |
go generate ./pkg/capacity/...
cd cmds
make
env:
Expand Down
13 changes: 13 additions & 0 deletions client/node.go
Original file line number Diff line number Diff line change
Expand Up @@ -190,6 +190,13 @@ type UsersCounters struct {
Workloads int `json:"workloads"`
}

// GPU is one entry of the list returned by NodeClient.GPUs. All three
// fields are plain strings and are (de)serialized as JSON under the keys
// given in their tags.
type GPU struct {
	// ID is carried under the "id" JSON key.
	ID string `json:"id"`
	// Vendor is carried under the "vendor" JSON key.
	Vendor string `json:"vendor"`
	// Device is carried under the "device" JSON key.
	Device string `json:"device"`
}

// Counters returns some node statistics. Including total and available cpu, memory, storage, etc...
func (n *NodeClient) Counters(ctx context.Context) (counters Counters, err error) {
const cmd = "zos.statistics.get"
Expand All @@ -205,6 +212,12 @@ func (n *NodeClient) Pools(ctx context.Context) (pools []pkg.PoolMetrics, err er
return
}

func (n *NodeClient) GPUs(ctx context.Context) (gpus []GPU, err error) {
xmonader marked this conversation as resolved.
Show resolved Hide resolved
const cmd = "zos.gpu.list"
err = n.bus.Call(ctx, n.nodeTwin, cmd, nil, &gpus)
return
}

// NetworkListWGPorts return a list of all "taken" ports on the node. A new deployment
// should be careful to use a free port for its network setup.
func (n *NodeClient) NetworkListWGPorts(ctx context.Context) ([]uint16, error) {
Expand Down
16 changes: 16 additions & 0 deletions cmds/modules/noded/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -135,8 +135,24 @@ func action(cli *cli.Context) error {
if err != nil {
return errors.Wrap(err, "failed to get hypervisors")
}
gpus, err := oracle.GPUs()
if err != nil {
return errors.Wrap(err, "failed to list gpus")
}

var info registrar.RegistrationInfo
for _, gpu := range gpus {
// log info about the GPU here ?
vendor, device, ok := gpu.GetDevice()
if ok {
log.Info().Str("vendor", vendor.Name).Str("device", device.Name).Msg("found GPU")
} else {
log.Info().Uint16("vendor", gpu.Vendor).Uint16("device", device.ID).Msg("found GPU (can't look up device name)")
}

info = info.WithGPU(gpu.ShortID())
}

info = info.WithCapacity(cap).
WithSerialNumber(dmi.BoardVersion()).
WithSecureBoot(secureBoot).
Expand Down
35 changes: 35 additions & 0 deletions cmds/modules/provisiond/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -449,6 +449,7 @@ func action(cli *cli.Context) error {
}

setupStorageRmb(zosRouter, cl)
setupGPURmb(zosRouter, cl)

_ = mbus.NewDeploymentMessageBus(zosRouter, engine)

Expand Down Expand Up @@ -491,3 +492,37 @@ func setupStorageRmb(router rmb.Router, cl zbus.Client) {
return stub.Metrics(ctx)
})
}

// setupGPURmb registers a "gpu" sub-route on router and attaches a "list"
// handler to it. The handler lists the PCI devices selected by capacity.GPU
// and returns one Info entry per device.
//
// The cl argument is accepted for symmetry with the other setup helpers but
// is not used by this function.
func setupGPURmb(router rmb.Router, cl zbus.Client) {
	// Info is the shape of a single entry in the "list" reply.
	type Info struct {
		ID     string `json:"id"`
		Vendor string `json:"vendor"`
		Device string `json:"device"`
	}

	gpus := router.Subroute("gpu")
	gpus.WithHandler("list", func(ctx context.Context, payload []byte) (interface{}, error) {
		devices, err := capacity.ListPCI(capacity.GPU)
		if err != nil {
			return nil, errors.Wrap(err, "failed to list available devices")
		}

		// Sized up front and never nil: a nil slice is encoded by
		// encoding/json as `null`, whereas an empty one is encoded as `[]`.
		// NOTE(review): assumes the router JSON-encodes the reply — confirm.
		list := make([]Info, 0, len(devices))
		for _, pci := range devices {
			// The names stay "unknown" unless the lookup below succeeds.
			info := Info{
				ID:     pci.ShortID(),
				Vendor: "unknown",
				Device: "unknown",
			}

			// The loop variable is named pci so that the looked-up device
			// does not shadow it.
			if vendor, device, ok := pci.GetDevice(); ok {
				info.Vendor = vendor.Name
				info.Device = device.Name
			}

			list = append(list, info)
		}

		return list, nil
	})
}
3 changes: 1 addition & 2 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ require (
github.com/shirou/gopsutil v3.21.11+incompatible
github.com/stretchr/testify v1.8.2
github.com/threefoldtech/0-fs v1.3.1-0.20201203163303-d963de9adea7
github.com/threefoldtech/tfchain/clients/tfchain-client-go v0.0.0-20230614080449-33e2dc5b38f2
github.com/threefoldtech/tfchain/clients/tfchain-client-go v0.0.0-20230616110830-e0aae0ad6d60
github.com/threefoldtech/tfgrid-sdk-go/rmb-sdk-go v0.4.0
github.com/threefoldtech/zbus v1.0.1
github.com/tyler-smith/go-bip39 v1.1.0
Expand Down Expand Up @@ -133,7 +133,6 @@ require (
github.com/safchain/ethtool v0.0.0-20201023143004-874930cb3ce0 // indirect
github.com/sirupsen/logrus v1.8.1 // indirect
github.com/stretchr/objx v0.5.0 // indirect
github.com/threefoldtech/substrate-client v0.1.5 // indirect
github.com/tinylib/msgp v1.1.5 // indirect
github.com/tklauser/go-sysconf v0.3.9 // indirect
github.com/tklauser/numcpus v0.6.0 // indirect
Expand Down
11 changes: 4 additions & 7 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -222,7 +222,6 @@ github.com/golang/snappy v0.0.1/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEW
github.com/golang/snappy v0.0.5-0.20220116011046-fa5810519dcb h1:PBC98N2aIaM3XXiurYmW7fx4GZkL8feAMVq7nEjURHk=
github.com/golang/snappy v0.0.5-0.20220116011046-fa5810519dcb/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q=
github.com/gologme/log v1.2.0/go.mod h1:gq31gQ8wEHkR+WekdWsqDuf8pXTUZA9BnnzTuPz1Y9U=
github.com/gomodule/redigo v1.8.9 h1:Sl3u+2BI/kk+VEatbj0scLdrFhjPmbxOc1myhDP41ws=
github.com/gomodule/redigo v1.8.9/go.mod h1:7ArFNvsTjH8GMMzB4uy1snslv2BwmginuMs06a1uzZE=
github.com/gomodule/redigo v2.0.0+incompatible h1:K/R+8tc58AaqLkqG2Ol3Qk+DR/TlNuhuh457pBFPtt0=
github.com/gomodule/redigo v2.0.0+incompatible/go.mod h1:B4C85qUVwatsJoIUNIfCRsp7qO0iAmpGFZ4EELWSbC4=
Expand Down Expand Up @@ -531,14 +530,12 @@ github.com/stripe/safesql v0.2.0/go.mod h1:q7b2n0JmzM1mVGfcYpanfVb2j23cXZeWFxcIL
github.com/syndtr/gocapability v0.0.0-20200815063812-42c35b437635/go.mod h1:hkRG7XYTFWNJGYcbNJQlaLq0fg1yr4J4t/NcTQtrfww=
github.com/threefoldtech/0-fs v1.3.1-0.20201203163303-d963de9adea7 h1:64QIPSO1Acx7ENdMwQ0Q4tnE94By1BljA/2R2NbY51Y=
github.com/threefoldtech/0-fs v1.3.1-0.20201203163303-d963de9adea7/go.mod h1:OPPZiE/GthPR2IepjKSc8wa+t/7wl3dtHQyEdUcftZI=
github.com/threefoldtech/substrate-client v0.1.5 h1:yYmWaJCMoJO0ieaO6eu3vIK0agehSw4OhSXlGydmv/E=
github.com/threefoldtech/substrate-client v0.1.5/go.mod h1:ys/GJLeLmNX8E36UFMeR0yZC5Lo4PufYtfwsVX8H/AM=
github.com/threefoldtech/tfchain/clients/tfchain-client-go v0.0.0-20230509101146-8e43c43597cd h1:aermHEtCwO/gzn8OiwDzhvsBHOuxHFgHymcoFsRVNvQ=
github.com/threefoldtech/tfchain/clients/tfchain-client-go v0.0.0-20230509101146-8e43c43597cd/go.mod h1:dtDKAPiUDxAwIkfHV7xcAFZcOm+xwNIuOI1MLFS+MeQ=
github.com/threefoldtech/tfchain/clients/tfchain-client-go v0.0.0-20230614080449-33e2dc5b38f2 h1:WZUNb8xu/Zxs9gwNDXL3FG2gwox1rgJudMCpsHCksJM=
github.com/threefoldtech/tfchain/clients/tfchain-client-go v0.0.0-20230614080449-33e2dc5b38f2/go.mod h1:dtDKAPiUDxAwIkfHV7xcAFZcOm+xwNIuOI1MLFS+MeQ=
github.com/threefoldtech/tfgrid-sdk-go/rmb-sdk-go v0.2.0 h1:dUPLbUFYBTfsM31UpIyPVyv18kDMLczHYIINPK/mFKI=
github.com/threefoldtech/tfgrid-sdk-go/rmb-sdk-go v0.2.0/go.mod h1:DDQ6ktnCli7z1+zpSR/j2twWwe6FUGxPO+WnEkRsnxw=
github.com/threefoldtech/tfchain/clients/tfchain-client-go v0.0.0-20230615114630-b98807b623e2 h1:6yESej3aqBdGvynqJ4yarhyMlr65ck122JnRfNi3qcs=
github.com/threefoldtech/tfchain/clients/tfchain-client-go v0.0.0-20230615114630-b98807b623e2/go.mod h1:dtDKAPiUDxAwIkfHV7xcAFZcOm+xwNIuOI1MLFS+MeQ=
github.com/threefoldtech/tfchain/clients/tfchain-client-go v0.0.0-20230616110830-e0aae0ad6d60 h1:HIKhyBKgXbDU9QmkcpZp2lk8A2U2zjFUtfh7Tc93ka0=
github.com/threefoldtech/tfchain/clients/tfchain-client-go v0.0.0-20230616110830-e0aae0ad6d60/go.mod h1:dtDKAPiUDxAwIkfHV7xcAFZcOm+xwNIuOI1MLFS+MeQ=
github.com/threefoldtech/tfgrid-sdk-go/rmb-sdk-go v0.4.0 h1:fmCYshC+u0/1nqlu10TnZjZeywioXOSWpm6n8zPKgkI=
github.com/threefoldtech/tfgrid-sdk-go/rmb-sdk-go v0.4.0/go.mod h1:tttnc/l4F2lx1SyETTiLrMaiQEgMeGCvlVEIW6nUeJA=
github.com/threefoldtech/zbus v1.0.1 h1:3KaEpyOiDYAw+lrAyoQUGIvY9BcjVRXlQ1beBRqhRNk=
Expand Down
2 changes: 1 addition & 1 deletion pkg/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ deadcode:
@${GOPATH}/bin/deadcode -test $(shell go list ./...) || true

spelling:
@${GOPATH}/bin/misspell -i monitord -error `find .`
@${GOPATH}/bin/misspell -i "monitord,forumla,etherent" -error `find .`

static:
@${GOPATH}/bin/staticcheck -- ./...
Expand Down
5 changes: 5 additions & 0 deletions pkg/capacity/capacity.go
Original file line number Diff line number Diff line change
Expand Up @@ -173,3 +173,8 @@ func (r *ResourceOracle) GetHypervisor() (string, error) {

return "", nil
}

// GPUs returns the list of available GPUs as PCI devices
func (r *ResourceOracle) GPUs() ([]PCI, error) {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As an exported function I believe it should have a comment

return ListPCI(GPU)
}
Loading