Skip to content

Commit

Permalink
Gpu support (#1973)
Browse files Browse the repository at this point in the history
* WIP: lspci in zos

These are helper utilities to list (and find) GPU devices

* list gpu only test

* Collect gpu info

Preparation for the node information update. This has to wait
until the chain has the changes needed for it

* List GPU over node api

This is for users to look up available GPU types on nodes

Also include the SLOT in the GPU ID. The GPU ID is what will
be used by the user to specify which GPU to attach to their VM

* fix short id

* update docs to match implementation

* initialize gpu devices for usability

- this includes loading the correct modules
- and binding to the correct devices

* handling of iommu groups

We need to make sure all devices inside the same GPU IOMMU group
are bound to the vfio driver

* Setting up and validating all the devices

Everything seems to be working except the CH process fails with this
error:

Could not mmap sparse area (offset = 0x0, size = 0x10000000): Resource busy (os error 16)
Error booting VM: VmBoot(DeviceManager(VfioMapRegion(MmapArea)))

Investigating what could be causing the error, but no luck yet

* Add missing file gpu

* fix ci

* unbinding boot vga

* iommu skip bridge devices

* use latest client to set gpu status

* Apply review comments and CI
  • Loading branch information
muhamadazmy authored Jun 19, 2023
1 parent decd6ff commit df9b1fd
Show file tree
Hide file tree
Showing 22 changed files with 37,136 additions and 33 deletions.
1 change: 1 addition & 0 deletions .github/workflows/publish-development.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ jobs:

- name: Build binaries
run: |
go generate ./pkg/capacity/...
cd cmds
make
env:
Expand Down
1 change: 1 addition & 0 deletions .github/workflows/publish-pre-release.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ jobs:

- name: Build binaries
run: |
go generate ./pkg/capacity/...
cd cmds
make
env:
Expand Down
1 change: 1 addition & 0 deletions .github/workflows/publish-release.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ jobs:

- name: Build binaries
run: |
go generate ./pkg/capacity/...
cd cmds
make
env:
Expand Down
13 changes: 13 additions & 0 deletions client/node.go
Original file line number Diff line number Diff line change
Expand Up @@ -190,6 +190,13 @@ type UsersCounters struct {
Workloads int `json:"workloads"`
}

// GPU describes a single GPU entry as decoded from the node's
// "zos.gpu.list" response (see NodeClient.GPUs).
type GPU struct {
// ID is the identifier of the GPU on the node.
// NOTE(review): per the commit description this ID includes the PCI slot and
// is the value a user passes to attach the GPU to a VM — confirm against the
// node-side implementation.
ID string `json:"id"`
// Vendor is the vendor name reported by the node.
// NOTE(review): the node-side handler appears to report "unknown" when the
// name cannot be looked up — confirm.
Vendor string `json:"vendor"`
// Device is the device name reported by the node (same caveat as Vendor).
Device string `json:"device"`
}

// Counters returns some node statistics. Including total and available cpu, memory, storage, etc...
func (n *NodeClient) Counters(ctx context.Context) (counters Counters, err error) {
const cmd = "zos.statistics.get"
Expand All @@ -205,6 +212,12 @@ func (n *NodeClient) Pools(ctx context.Context) (pools []pkg.PoolMetrics, err er
return
}

// GPUs asks the node for its GPUs by calling the "zos.gpu.list" command on
// the node twin and decoding the reply into a slice of GPU.
//
// The error returned by the underlying bus call is passed through unchanged.
func (n *NodeClient) GPUs(ctx context.Context) (gpus []GPU, err error) {
	const cmd = "zos.gpu.list"

	// The command takes no arguments, hence the nil payload.
	err = n.bus.Call(ctx, n.nodeTwin, cmd, nil, &gpus)

	return gpus, err
}

// NetworkListWGPorts return a list of all "taken" ports on the node. A new deployment
// should be careful to use a free port for its network setup.
func (n *NodeClient) NetworkListWGPorts(ctx context.Context) ([]uint16, error) {
Expand Down
16 changes: 16 additions & 0 deletions cmds/modules/noded/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -135,8 +135,24 @@ func action(cli *cli.Context) error {
if err != nil {
return errors.Wrap(err, "failed to get hypervisors")
}
gpus, err := oracle.GPUs()
if err != nil {
return errors.Wrap(err, "failed to list gpus")
}

var info registrar.RegistrationInfo
for _, gpu := range gpus {
// log info about the GPU here ?
vendor, device, ok := gpu.GetDevice()
if ok {
log.Info().Str("vendor", vendor.Name).Str("device", device.Name).Msg("found GPU")
} else {
log.Info().Uint16("vendor", gpu.Vendor).Uint16("device", device.ID).Msg("found GPU (can't look up device name)")
}

info = info.WithGPU(gpu.ShortID())
}

info = info.WithCapacity(cap).
WithSerialNumber(dmi.BoardVersion()).
WithSecureBoot(secureBoot).
Expand Down
35 changes: 35 additions & 0 deletions cmds/modules/provisiond/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -449,6 +449,7 @@ func action(cli *cli.Context) error {
}

setupStorageRmb(zosRouter, cl)
setupGPURmb(zosRouter, cl)

_ = mbus.NewDeploymentMessageBus(zosRouter, engine)

Expand Down Expand Up @@ -491,3 +492,37 @@ func setupStorageRmb(router rmb.Router, cl zbus.Client) {
return stub.Metrics(ctx)
})
}

// setupGPURmb registers the GPU related handlers under the "gpu" subroute of
// the given router.
//
// It currently exposes a single "list" handler which enumerates the GPU PCI
// devices via capacity.ListPCI(capacity.GPU) and returns one Info entry per
// device. The vendor and device names are resolved with GetDevice; when the
// lookup fails both are reported as "unknown".
//
// The cl argument is not used by this function at the moment; it is kept so
// the signature stays in line with the other setup*Rmb helpers.
func setupGPURmb(router rmb.Router, cl zbus.Client) {
	// Info is the wire representation of one GPU in the "list" response.
	type Info struct {
		ID     string `json:"id"`
		Vendor string `json:"vendor"`
		Device string `json:"device"`
	}

	gpus := router.Subroute("gpu")
	gpus.WithHandler("list", func(ctx context.Context, payload []byte) (interface{}, error) {
		devices, err := capacity.ListPCI(capacity.GPU)
		if err != nil {
			return nil, errors.Wrap(err, "failed to list available devices")
		}

		// Always hand back a non-nil slice: a nil slice is encoded by
		// encoding/json as `null`, while callers of a "list" command expect
		// an (possibly empty) array when the node has no GPUs.
		list := make([]Info, 0, len(devices))
		for _, pci := range devices {
			info := Info{
				ID:     pci.ShortID(),
				Vendor: "unknown",
				Device: "unknown",
			}

			// Keep the lookup results scoped to this statement so they do not
			// shadow the loop variable.
			if vendor, device, ok := pci.GetDevice(); ok {
				info.Vendor = vendor.Name
				info.Device = device.Name
			}

			list = append(list, info)
		}

		return list, nil
	})
}
3 changes: 1 addition & 2 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ require (
github.com/shirou/gopsutil v3.21.11+incompatible
github.com/stretchr/testify v1.8.2
github.com/threefoldtech/0-fs v1.3.1-0.20201203163303-d963de9adea7
github.com/threefoldtech/tfchain/clients/tfchain-client-go v0.0.0-20230614080449-33e2dc5b38f2
github.com/threefoldtech/tfchain/clients/tfchain-client-go v0.0.0-20230616110830-e0aae0ad6d60
github.com/threefoldtech/tfgrid-sdk-go/rmb-sdk-go v0.4.0
github.com/threefoldtech/zbus v1.0.1
github.com/tyler-smith/go-bip39 v1.1.0
Expand Down Expand Up @@ -133,7 +133,6 @@ require (
github.com/safchain/ethtool v0.0.0-20201023143004-874930cb3ce0 // indirect
github.com/sirupsen/logrus v1.8.1 // indirect
github.com/stretchr/objx v0.5.0 // indirect
github.com/threefoldtech/substrate-client v0.1.5 // indirect
github.com/tinylib/msgp v1.1.5 // indirect
github.com/tklauser/go-sysconf v0.3.9 // indirect
github.com/tklauser/numcpus v0.6.0 // indirect
Expand Down
11 changes: 4 additions & 7 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -222,7 +222,6 @@ github.com/golang/snappy v0.0.1/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEW
github.com/golang/snappy v0.0.5-0.20220116011046-fa5810519dcb h1:PBC98N2aIaM3XXiurYmW7fx4GZkL8feAMVq7nEjURHk=
github.com/golang/snappy v0.0.5-0.20220116011046-fa5810519dcb/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q=
github.com/gologme/log v1.2.0/go.mod h1:gq31gQ8wEHkR+WekdWsqDuf8pXTUZA9BnnzTuPz1Y9U=
github.com/gomodule/redigo v1.8.9 h1:Sl3u+2BI/kk+VEatbj0scLdrFhjPmbxOc1myhDP41ws=
github.com/gomodule/redigo v1.8.9/go.mod h1:7ArFNvsTjH8GMMzB4uy1snslv2BwmginuMs06a1uzZE=
github.com/gomodule/redigo v2.0.0+incompatible h1:K/R+8tc58AaqLkqG2Ol3Qk+DR/TlNuhuh457pBFPtt0=
github.com/gomodule/redigo v2.0.0+incompatible/go.mod h1:B4C85qUVwatsJoIUNIfCRsp7qO0iAmpGFZ4EELWSbC4=
Expand Down Expand Up @@ -531,14 +530,12 @@ github.com/stripe/safesql v0.2.0/go.mod h1:q7b2n0JmzM1mVGfcYpanfVb2j23cXZeWFxcIL
github.com/syndtr/gocapability v0.0.0-20200815063812-42c35b437635/go.mod h1:hkRG7XYTFWNJGYcbNJQlaLq0fg1yr4J4t/NcTQtrfww=
github.com/threefoldtech/0-fs v1.3.1-0.20201203163303-d963de9adea7 h1:64QIPSO1Acx7ENdMwQ0Q4tnE94By1BljA/2R2NbY51Y=
github.com/threefoldtech/0-fs v1.3.1-0.20201203163303-d963de9adea7/go.mod h1:OPPZiE/GthPR2IepjKSc8wa+t/7wl3dtHQyEdUcftZI=
github.com/threefoldtech/substrate-client v0.1.5 h1:yYmWaJCMoJO0ieaO6eu3vIK0agehSw4OhSXlGydmv/E=
github.com/threefoldtech/substrate-client v0.1.5/go.mod h1:ys/GJLeLmNX8E36UFMeR0yZC5Lo4PufYtfwsVX8H/AM=
github.com/threefoldtech/tfchain/clients/tfchain-client-go v0.0.0-20230509101146-8e43c43597cd h1:aermHEtCwO/gzn8OiwDzhvsBHOuxHFgHymcoFsRVNvQ=
github.com/threefoldtech/tfchain/clients/tfchain-client-go v0.0.0-20230509101146-8e43c43597cd/go.mod h1:dtDKAPiUDxAwIkfHV7xcAFZcOm+xwNIuOI1MLFS+MeQ=
github.com/threefoldtech/tfchain/clients/tfchain-client-go v0.0.0-20230614080449-33e2dc5b38f2 h1:WZUNb8xu/Zxs9gwNDXL3FG2gwox1rgJudMCpsHCksJM=
github.com/threefoldtech/tfchain/clients/tfchain-client-go v0.0.0-20230614080449-33e2dc5b38f2/go.mod h1:dtDKAPiUDxAwIkfHV7xcAFZcOm+xwNIuOI1MLFS+MeQ=
github.com/threefoldtech/tfgrid-sdk-go/rmb-sdk-go v0.2.0 h1:dUPLbUFYBTfsM31UpIyPVyv18kDMLczHYIINPK/mFKI=
github.com/threefoldtech/tfgrid-sdk-go/rmb-sdk-go v0.2.0/go.mod h1:DDQ6ktnCli7z1+zpSR/j2twWwe6FUGxPO+WnEkRsnxw=
github.com/threefoldtech/tfchain/clients/tfchain-client-go v0.0.0-20230615114630-b98807b623e2 h1:6yESej3aqBdGvynqJ4yarhyMlr65ck122JnRfNi3qcs=
github.com/threefoldtech/tfchain/clients/tfchain-client-go v0.0.0-20230615114630-b98807b623e2/go.mod h1:dtDKAPiUDxAwIkfHV7xcAFZcOm+xwNIuOI1MLFS+MeQ=
github.com/threefoldtech/tfchain/clients/tfchain-client-go v0.0.0-20230616110830-e0aae0ad6d60 h1:HIKhyBKgXbDU9QmkcpZp2lk8A2U2zjFUtfh7Tc93ka0=
github.com/threefoldtech/tfchain/clients/tfchain-client-go v0.0.0-20230616110830-e0aae0ad6d60/go.mod h1:dtDKAPiUDxAwIkfHV7xcAFZcOm+xwNIuOI1MLFS+MeQ=
github.com/threefoldtech/tfgrid-sdk-go/rmb-sdk-go v0.4.0 h1:fmCYshC+u0/1nqlu10TnZjZeywioXOSWpm6n8zPKgkI=
github.com/threefoldtech/tfgrid-sdk-go/rmb-sdk-go v0.4.0/go.mod h1:tttnc/l4F2lx1SyETTiLrMaiQEgMeGCvlVEIW6nUeJA=
github.com/threefoldtech/zbus v1.0.1 h1:3KaEpyOiDYAw+lrAyoQUGIvY9BcjVRXlQ1beBRqhRNk=
Expand Down
2 changes: 1 addition & 1 deletion pkg/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ deadcode:
@${GOPATH}/bin/deadcode -test $(shell go list ./...) || true

spelling:
@${GOPATH}/bin/misspell -i monitord -error `find .`
@${GOPATH}/bin/misspell -i "monitord,forumla,etherent" -error `find .`

static:
@${GOPATH}/bin/staticcheck -- ./...
Expand Down
5 changes: 5 additions & 0 deletions pkg/capacity/capacity.go
Original file line number Diff line number Diff line change
Expand Up @@ -173,3 +173,8 @@ func (r *ResourceOracle) GetHypervisor() (string, error) {

return "", nil
}

// GPUs lists the PCI devices selected by the GPU filter.
//
// It is a thin convenience wrapper around ListPCI(GPU); both the resulting
// devices and any error are returned exactly as ListPCI produced them.
func (r *ResourceOracle) GPUs() ([]PCI, error) {
	gpus, err := ListPCI(GPU)
	return gpus, err
}
Loading

0 comments on commit df9b1fd

Please sign in to comment.