From a42eb589df13728a1e91a374d6734a70aedf3d8f Mon Sep 17 00:00:00 2001 From: Webber Huang <webber.huang@suse.com> Date: Sat, 17 Feb 2024 21:15:08 +0800 Subject: [PATCH] Fixing pcidevice device plugin stop deadlock Signed-off-by: Webber Huang <webber.huang@suse.com> Fixing codeFactor "Complex Method" in pcidevice plugin healthcheck() --- pkg/controller/nodes/node_controller.go | 3 +-- pkg/deviceplugins/device_manager.go | 28 ++++++++++++++----------- pkg/deviceplugins/deviceplugin.go | 6 ++++++ 3 files changed, 23 insertions(+), 14 deletions(-) diff --git a/pkg/controller/nodes/node_controller.go b/pkg/controller/nodes/node_controller.go index 3711dff7..05d92742 100644 --- a/pkg/controller/nodes/node_controller.go +++ b/pkg/controller/nodes/node_controller.go @@ -7,8 +7,6 @@ import ( "reflect" "time" - "github.com/harvester/pcidevices/pkg/controller/gpudevice" - ctlnetworkv1beta1 "github.com/harvester/harvester-network-controller/pkg/generated/controllers/network.harvesterhci.io/v1beta1" "github.com/jaypipes/ghw" ctlcorev1 "github.com/rancher/wrangler/pkg/generated/controllers/core/v1" @@ -17,6 +15,7 @@ import ( "k8s.io/apimachinery/pkg/labels" "github.com/harvester/pcidevices/pkg/apis/devices.harvesterhci.io/v1beta1" + "github.com/harvester/pcidevices/pkg/controller/gpudevice" "github.com/harvester/pcidevices/pkg/controller/pcidevice" "github.com/harvester/pcidevices/pkg/controller/sriovdevice" ctl "github.com/harvester/pcidevices/pkg/generated/controllers/devices.harvesterhci.io/v1beta1" diff --git a/pkg/deviceplugins/device_manager.go b/pkg/deviceplugins/device_manager.go index b45434d3..89cc71d6 100644 --- a/pkg/deviceplugins/device_manager.go +++ b/pkg/deviceplugins/device_manager.go @@ -45,6 +45,7 @@ const ( pciBasePath = "/sys/bus/pci/devices" connectionTimeout = 120 * time.Second // Google gRPC default timeout PCIResourcePrefix = "PCI_RESOURCE" + tickerTimeout = 30 * time.Second ) type PCIDevice struct { @@ -292,7 +293,6 @@ func (dp *PCIDevicePlugin) Allocate(_ context.Context, r *pluginapi.AllocateRequ } func (dp *PCIDevicePlugin) healthCheck() error { - logger := log.DefaultLogger() monitoredDevices := make(map[string]string) watcher, err := fsnotify.NewWatcher() if err != nil { @@ -348,31 +348,37 @@ func (dp *PCIDevicePlugin) healthCheck() error { return fmt.Errorf("failed to watch device-plugin socket: %v", err) } + return dp.performCheck(monitoredDevices, watcher) +} + +func (dp *PCIDevicePlugin) performCheck(monitoredDevices map[string]string, watcher *fsnotify.Watcher) error { for { select { case <-dp.stop: return nil + case <-dp.done: + return nil case err := <-watcher.Errors: - logger.Reason(err).Errorf("error watching devices and device plugin directory") + logrus.Errorf("error watching devices and device plugin directory: %v", err) case event := <-watcher.Events: - logger.V(4).Infof("health Event: %v", event) + logrus.Infof("health Event: %v", event) if monDevID, exist := monitoredDevices[event.Name]; exist { // Health in this case is if the device path actually exists if event.Op == fsnotify.Create { - logger.Infof("monitored device %s appeared", dp.resourceName) + logrus.Infof("monitored device %s appeared", dp.resourceName) dp.health <- deviceHealth{ DevID: monDevID, Health: pluginapi.Healthy, } } else if (event.Op == fsnotify.Remove) || (event.Op == fsnotify.Rename) { - logger.Infof("monitored device %s disappeared", dp.resourceName) + logrus.Infof("monitored device %s disappeared", dp.resourceName) dp.health <- deviceHealth{ DevID: monDevID, Health: pluginapi.Unhealthy, } } } else if event.Name == dp.socketPath && event.Op == fsnotify.Remove { - logger.Infof("device socket file for device %s was removed, kubelet probably restarted.", dp.resourceName) + logrus.Infof("device socket file for device %s was removed, kubelet probably restarted.", dp.resourceName) return nil } } @@ -389,14 +395,12 @@ func (dp *PCIDevicePlugin) GetDeviceName() string { // Stop stops the gRPC server func (dp *PCIDevicePlugin) stopDevicePlugin() error { - defer func() { - if !IsChanClosed(dp.done) { - close(dp.done) - } - }() + if !IsChanClosed(dp.done) { + close(dp.done) + } // Give the device plugin one second to properly deregister - ticker := time.NewTicker(1 * time.Second) + ticker := time.NewTicker(tickerTimeout) defer ticker.Stop() select { case <-dp.deregistered: diff --git a/pkg/deviceplugins/deviceplugin.go b/pkg/deviceplugins/deviceplugin.go index 89d4c432..834ebd7b 100644 --- a/pkg/deviceplugins/deviceplugin.go +++ b/pkg/deviceplugins/deviceplugin.go @@ -92,5 +92,11 @@ func (dp *PCIDevicePlugin) RemoveDevice(pd *v1beta1.PCIDevice, pdc *v1beta1.PCID logrus.Infof("Removing %s from device plugin", resourceName) dp.MarkPCIDeviceAsUnhealthy(pdc.Spec.Address) } + + for i, dev := range dp.devs { + if dev.ID == pdc.Spec.Address { + dp.devs[i].Health = pluginapi.Unhealthy + } + } return nil }