From a42eb589df13728a1e91a374d6734a70aedf3d8f Mon Sep 17 00:00:00 2001
From: Webber Huang <webber.huang@suse.com>
Date: Sat, 17 Feb 2024 21:15:08 +0800
Subject: [PATCH] Fixing pcidevice device plugin stop deadlock

Signed-off-by: Webber Huang <webber.huang@suse.com>

Fixing codeFactor "Complex Method" in pcidevice plugin healthcheck()
---
 pkg/controller/nodes/node_controller.go |  3 +--
 pkg/deviceplugins/device_manager.go     | 28 ++++++++++++++-----------
 pkg/deviceplugins/deviceplugin.go       |  6 ++++++
 3 files changed, 23 insertions(+), 14 deletions(-)

diff --git a/pkg/controller/nodes/node_controller.go b/pkg/controller/nodes/node_controller.go
index 3711dff7..05d92742 100644
--- a/pkg/controller/nodes/node_controller.go
+++ b/pkg/controller/nodes/node_controller.go
@@ -7,8 +7,6 @@ import (
 	"reflect"
 	"time"
 
-	"github.com/harvester/pcidevices/pkg/controller/gpudevice"
-
 	ctlnetworkv1beta1 "github.com/harvester/harvester-network-controller/pkg/generated/controllers/network.harvesterhci.io/v1beta1"
 	"github.com/jaypipes/ghw"
 	ctlcorev1 "github.com/rancher/wrangler/pkg/generated/controllers/core/v1"
@@ -17,6 +15,7 @@ import (
 	"k8s.io/apimachinery/pkg/labels"
 
 	"github.com/harvester/pcidevices/pkg/apis/devices.harvesterhci.io/v1beta1"
+	"github.com/harvester/pcidevices/pkg/controller/gpudevice"
 	"github.com/harvester/pcidevices/pkg/controller/pcidevice"
 	"github.com/harvester/pcidevices/pkg/controller/sriovdevice"
 	ctl "github.com/harvester/pcidevices/pkg/generated/controllers/devices.harvesterhci.io/v1beta1"
diff --git a/pkg/deviceplugins/device_manager.go b/pkg/deviceplugins/device_manager.go
index b45434d3..89cc71d6 100644
--- a/pkg/deviceplugins/device_manager.go
+++ b/pkg/deviceplugins/device_manager.go
@@ -45,6 +45,7 @@ const (
 	pciBasePath       = "/sys/bus/pci/devices"
 	connectionTimeout = 120 * time.Second // Google gRPC default timeout
 	PCIResourcePrefix = "PCI_RESOURCE"
+	tickerTimeout     = 30 * time.Second
 )
 
 type PCIDevice struct {
@@ -292,7 +293,6 @@ func (dp *PCIDevicePlugin) Allocate(_ context.Context, r *pluginapi.AllocateRequ
 }
 
 func (dp *PCIDevicePlugin) healthCheck() error {
-	logger := log.DefaultLogger()
 	monitoredDevices := make(map[string]string)
 	watcher, err := fsnotify.NewWatcher()
 	if err != nil {
@@ -348,31 +348,37 @@ func (dp *PCIDevicePlugin) healthCheck() error {
 		return fmt.Errorf("failed to watch device-plugin socket: %v", err)
 	}
 
+	return dp.performCheck(monitoredDevices, watcher)
+}
+
+func (dp *PCIDevicePlugin) performCheck(monitoredDevices map[string]string, watcher *fsnotify.Watcher) error {
 	for {
 		select {
 		case <-dp.stop:
 			return nil
+		case <-dp.done:
+			return nil
 		case err := <-watcher.Errors:
-			logger.Reason(err).Errorf("error watching devices and device plugin directory")
+			logrus.Errorf("error watching devices and device plugin directory: %v", err)
 		case event := <-watcher.Events:
-			logger.V(4).Infof("health Event: %v", event)
+			logrus.Infof("health Event: %v", event)
 			if monDevID, exist := monitoredDevices[event.Name]; exist {
 				// Health in this case is if the device path actually exists
 				if event.Op == fsnotify.Create {
-					logger.Infof("monitored device %s appeared", dp.resourceName)
+					logrus.Infof("monitored device %s appeared", dp.resourceName)
 					dp.health <- deviceHealth{
 						DevID:  monDevID,
 						Health: pluginapi.Healthy,
 					}
 				} else if (event.Op == fsnotify.Remove) || (event.Op == fsnotify.Rename) {
-					logger.Infof("monitored device %s disappeared", dp.resourceName)
+					logrus.Infof("monitored device %s disappeared", dp.resourceName)
 					dp.health <- deviceHealth{
 						DevID:  monDevID,
 						Health: pluginapi.Unhealthy,
 					}
 				}
 			} else if event.Name == dp.socketPath && event.Op == fsnotify.Remove {
-				logger.Infof("device socket file for device %s was removed, kubelet probably restarted.", dp.resourceName)
+				logrus.Infof("device socket file for device %s was removed, kubelet probably restarted.", dp.resourceName)
 				return nil
 			}
 		}
@@ -389,14 +395,12 @@ func (dp *PCIDevicePlugin) GetDeviceName() string {
 
 // Stop stops the gRPC server
 func (dp *PCIDevicePlugin) stopDevicePlugin() error {
-	defer func() {
-		if !IsChanClosed(dp.done) {
-			close(dp.done)
-		}
-	}()
+	if !IsChanClosed(dp.done) {
+		close(dp.done)
+	}
 
 	// Give the device plugin one second to properly deregister
-	ticker := time.NewTicker(1 * time.Second)
+	ticker := time.NewTicker(tickerTimeout)
 	defer ticker.Stop()
 	select {
 	case <-dp.deregistered:
diff --git a/pkg/deviceplugins/deviceplugin.go b/pkg/deviceplugins/deviceplugin.go
index 89d4c432..834ebd7b 100644
--- a/pkg/deviceplugins/deviceplugin.go
+++ b/pkg/deviceplugins/deviceplugin.go
@@ -92,5 +92,11 @@ func (dp *PCIDevicePlugin) RemoveDevice(pd *v1beta1.PCIDevice, pdc *v1beta1.PCID
 		logrus.Infof("Removing %s from device plugin", resourceName)
 		dp.MarkPCIDeviceAsUnhealthy(pdc.Spec.Address)
 	}
+
+	for i, dev := range dp.devs {
+		if dev.ID == pdc.Spec.Address {
+			dp.devs[i].Health = pluginapi.Unhealthy
+		}
+	}
 	return nil
 }