node-group-auto-discovery support for oci #7403

Merged: 8 commits, Oct 25, 2024
Changes from 6 commits
10 changes: 9 additions & 1 deletion cluster-autoscaler/cloudprovider/oci/README.md
@@ -166,6 +166,13 @@ use-instance-principals = true

n/a

### Node Group Auto Discovery
`--node-group-auto-discovery` can be given in the pattern below. It discovers the node pools under the given compartment by matching the node pool tags (whether they are Freeform or Defined tags).
```
clusterId:<clusterId>,compartmentId:<compartmentId>,nodepoolTags:<tagKey1>=<tagValue1>&<tagKey2>=<tagValue2>,min:<min>,max:<max>
```

Contributor: Are all of the fields in this required, or are any optional? Can we specify that in the comment string above?

Contributor (author): All of the fields are mandatory. I've added a statement to the README to make that clear.
Auto discovery cannot be used together with static discovery (the `node` parameter), to prevent conflicts. An illustrative flag value is shown below.
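
For illustration, a complete flag value might look like the following (the OCIDs and tag names here are placeholders, not real resources):

```
--node-group-auto-discovery=clusterId:ocid1.cluster.oc1.test.aaa,compartmentId:ocid1.compartment.oc1.test.bbb,nodepoolTags:ca-managed=true&team=platform,min:1,max:10
```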

## Deployment

### Create OCI config secret (only if _not_ using Instance Principals)
@@ -271,7 +278,8 @@ kubectl apply -f ./cloudprovider/oci/examples/oci-nodepool-cluster-autoscaler-w-
correctly (`oci-cloud-controller-manager`).
- Avoid manually changing pools that are managed by the Cluster Autoscaler. For example, do not add or remove nodes
using kubectl, or using the Console (or the Oracle Cloud Infrastructure CLI or API).
- `--node-group-auto-discovery` and `--node-autoprovisioning-enabled=true` are not supported.
- `--node-autoprovisioning-enabled=true` is not supported.
- `--node-group-auto-discovery` and `node` parameters cannot be used together, as that can cause conflicts.
- We set a `nvidia.com/gpu:NoSchedule` taint on nodes in GPU-enabled pools.

## Helpful links
18 changes: 18 additions & 0 deletions cluster-autoscaler/cloudprovider/oci/common/oci_util.go
@@ -205,3 +205,21 @@ func GetAllPoolTypes(groups []string) (string, error) {
}
return ocidType, nil
}

// HasNodeGroupTags checks if nodepoolTags is provided
func HasNodeGroupTags(nodeGroupAutoDiscoveryList []string) (bool, bool, error) {
instancePoolTagsFound := false
nodePoolTagsFound := false
for _, arg := range nodeGroupAutoDiscoveryList {
if strings.Contains(arg, "nodepoolTags") {
nodePoolTagsFound = true
}
if strings.Contains(arg, "instancepoolTags") {
instancePoolTagsFound = true
}
}
if instancePoolTagsFound && nodePoolTagsFound {
return instancePoolTagsFound, nodePoolTagsFound, fmt.Errorf("can not use both instancepoolTags and nodepoolTags")
}
return instancePoolTagsFound, nodePoolTagsFound, nil
}
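
A quick sketch of the intended behavior (the discovery spec string below is made up for illustration):

```go
// Hypothetical --node-group-auto-discovery values passed to HasNodeGroupTags.
specs := []string{
	"clusterId:ocid1.cluster.oc1.test.aaa,compartmentId:ocid1.compartment.oc1.test.bbb,nodepoolTags:ca-managed=true,min:1,max:5",
}
instancePoolTagsFound, nodePoolTagsFound, err := HasNodeGroupTags(specs)
// instancePoolTagsFound == false, nodePoolTagsFound == true, err == nil.
// Passing one spec with nodepoolTags and another with instancepoolTags
// would instead return a non-nil error.
```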
@@ -153,8 +153,14 @@ func BuildOCI(opts config.AutoscalingOptions, do cloudprovider.NodeGroupDiscover
if err != nil {
klog.Fatalf("Failed to get pool type: %v", err)
}
if strings.HasPrefix(ocidType, npconsts.OciNodePoolResourceIdent) {
manager, err := nodepools.CreateNodePoolManager(opts.CloudConfig, do, createKubeClient(opts))
_, nodepoolTagsFound, err := ocicommon.HasNodeGroupTags(opts.NodeGroupAutoDiscovery)
if err != nil {
klog.Fatalf("Failed to get auto discovery tags: %v", err)
}
if strings.HasPrefix(ocidType, npconsts.OciNodePoolResourceIdent) && nodepoolTagsFound {
klog.Fatalf("-nodes and -node-group-auto-discovery parameters can not be used together.")
} else if strings.HasPrefix(ocidType, npconsts.OciNodePoolResourceIdent) || nodepoolTagsFound {
manager, err := nodepools.CreateNodePoolManager(opts.CloudConfig, opts.NodeGroupAutoDiscovery, do, createKubeClient(opts))
if err != nil {
klog.Fatalf("Could not create OCI OKE cloud provider: %v", err)
}
147 changes: 146 additions & 1 deletion cluster-autoscaler/cloudprovider/oci/nodepools/oci_manager.go
@@ -9,6 +9,7 @@ import (
"fmt"
"math"
"os"
"regexp"
"strconv"
"strings"
"time"
@@ -34,6 +35,11 @@ import (
const (
maxAddTaintRetries = 5
maxGetNodepoolRetries = 3
clusterId = "clusterId"
compartmentId = "compartmentId"
nodepoolTags = "nodepoolTags"
min = "min"
max = "max"
)

var (
@@ -75,10 +81,11 @@ type okeClient interface {
GetNodePool(context.Context, oke.GetNodePoolRequest) (oke.GetNodePoolResponse, error)
UpdateNodePool(context.Context, oke.UpdateNodePoolRequest) (oke.UpdateNodePoolResponse, error)
DeleteNode(context.Context, oke.DeleteNodeRequest) (oke.DeleteNodeResponse, error)
ListNodePools(ctx context.Context, request oke.ListNodePoolsRequest) (oke.ListNodePoolsResponse, error)
}

// CreateNodePoolManager creates a NodePoolManager that can manage autoscaling node pools
func CreateNodePoolManager(cloudConfigPath string, discoveryOpts cloudprovider.NodeGroupDiscoveryOptions, kubeClient kubernetes.Interface) (NodePoolManager, error) {
func CreateNodePoolManager(cloudConfigPath string, nodeGroupAutoDiscoveryList []string, discoveryOpts cloudprovider.NodeGroupDiscoveryOptions, kubeClient kubernetes.Interface) (NodePoolManager, error) {

var err error
var configProvider common.ConfigurationProvider
@@ -151,6 +158,20 @@ func CreateNodePoolManager(cloudConfigPath string, discoveryOpts cloudprovider.N
nodePoolCache: newNodePoolCache(&okeClient),
}

// auto discover nodepools from compartments with nodeGroupAutoDiscovery parameter
klog.Infof("checking node groups for autodiscovery ... ")
for _, arg := range nodeGroupAutoDiscoveryList {
nodeGroup, err := nodeGroupFromArg(arg)
if err != nil {
return nil, fmt.Errorf("unable to construct node group auto discovery from argument: %v", err)
}
nodeGroup.manager = manager
nodeGroup.kubeClient = kubeClient

manager.nodeGroups = append(manager.nodeGroups, *nodeGroup)
autoDiscoverNodeGroups(manager, manager.okeClient, *nodeGroup)
Contributor: Given that the auto discovery happens in CreateNodePoolManager, I assume that auto discovery only happens during startup. Is that right?

Contributor: Never mind; below in the forceRefresh function, we also call autoDiscoverNodeGroups.
}

Contributor (@jlamillan, Oct 16, 2024): It seems like node pools that were explicitly configured via --nodes should be added before (and take precedence over) node pools that were discovered via --node-group-auto-discovery.

Do you agree? That also raises the question of the expected behavior of, say, the max or min node setting when a pool is specified via --nodes=2:5:ocid1.nodepool.oc1.np-a and also discovered via --node-group-auto-discovery=clusterId:ocid1.cluster.oc1.c-1,compartmentId:ocid1.compartment.oc1..c1,nodepoolTags:cluster-autoscaler-also/enabled=true,min:0,max:10?

Contributor (author): nodeGroupAutoDiscovery actually overrides the nodes parameter with this implementation, which means the nodes parameter is ignored if nodeGroupAutoDiscovery is provided.

What I can think of as a solution:

  1. We could force the user to provide only one of them in the config, so the CA would fail on startup if both parameters were provided, and we would also log an error line stating the reason for the end user to see. The end user should then fix the configuration by removing one of them.
  2. If we want both parameters to work together, we need to decide which one has higher priority over the other. I would say the nodes parameter should override the nodeGroupAutoDiscovery min/max values.

Please let me know your thoughts and I will proceed accordingly.

Contributor: The convention seems to be to warn against using it in the docs [1,2], and/or disallow [1] it in the code.

I'm fine with either documenting it and/or erroring out. As you mentioned, currently the code quietly overrides any static node pools while also logging messages as it processes each static node pool, which could cause confusion.

Contributor (author): I added extra checks to prevent using both parameters together, and also documented it in oci/README.

// Contains all the specs from the args that give us the pools.
for _, arg := range discoveryOpts.NodeGroupSpecs {
np, err := nodePoolFromArg(arg)
@@ -180,6 +201,48 @@ func CreateNodePoolManager(cloudConfigPath string, discoveryOpts cloudprovider.N
return manager, nil
}

func autoDiscoverNodeGroups(m *ociManagerImpl, okeClient okeClient, nodeGroup nodeGroupAutoDiscovery) (bool, error) {
resp, reqErr := okeClient.ListNodePools(context.Background(), oke.ListNodePoolsRequest{
ClusterId: common.String(nodeGroup.clusterId),
CompartmentId: common.String(nodeGroup.compartmentId),
})
if reqErr != nil {
klog.Errorf("failed to fetch the nodepool list with clusterId: %s, compartmentId: %s. Error: %v", nodeGroup.clusterId, nodeGroup.compartmentId, reqErr)
return false, reqErr
}
for _, nodePoolSummary := range resp.Items {
if validateNodepoolTags(nodeGroup.tags, nodePoolSummary.FreeformTags, nodePoolSummary.DefinedTags) {
Contributor: There are a few types of tags, including defined tags and free-form tags.

As I understand it, user-defined tags on a Node Pool resource would appear in the free-form tags (i.e. nodePoolSummary.FreeformTags, not nodePoolSummary.DefinedTags). Is there a reason we're not checking all the tag namespaces for a match?

Contributor (author): It is not only Freeform tags. Users can also create their own namespace and defined tags. We check both of them to make sure we don't miss a tag applied by the user.

A Defined tag holds a namespace, but a Freeform tag does not:

  • Defined tag: namespace.tagKey=tagValue
  • Freeform tag: tagKey=tagValue

When we query the node pool through the API, the response returns them in separate fields:

  • FreeformTags is a map[string]string
  • DefinedTags is a map[string]map[string]interface{}

Contributor: I see.
nodepool := &nodePool{}
nodepool.id = *nodePoolSummary.Id
nodepool.minSize = nodeGroup.minSize
nodepool.maxSize = nodeGroup.maxSize

nodepool.manager = nodeGroup.manager
nodepool.kubeClient = nodeGroup.kubeClient

m.staticNodePools[nodepool.id] = nodepool
klog.V(5).Infof("auto discovered nodepool in compartment : %s , nodepoolid: %s", nodeGroup.compartmentId, nodepool.id)
} else {
klog.Warningf("nodepool ignored as the tags do not satisfy the requirement : %s , %v, %v", *nodePoolSummary.Id, nodePoolSummary.FreeformTags, nodePoolSummary.DefinedTags)
}
}
return true, nil
}

func validateNodepoolTags(nodeGroupTags map[string]string, freeFormTags map[string]string, definedTags map[string]map[string]interface{}) bool {
if nodeGroupTags != nil {
for tagKey, tagValue := range nodeGroupTags {
namespacedTagKey := strings.Split(tagKey, ".")
if len(namespacedTagKey) == 2 && tagValue != definedTags[namespacedTagKey[0]][namespacedTagKey[1]] {
return false
} else if len(namespacedTagKey) != 2 && tagValue != freeFormTags[tagKey] {
return false
}
}
}
return true
}
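
To make the matching rule concrete, here is a hypothetical tag set (not from the PR): a key containing a dot is looked up as a namespaced defined tag, and a plain key as a freeform tag.

```go
nodeGroupTags := map[string]string{"ns.team": "platform", "env": "prod"}
freeFormTags := map[string]string{"env": "prod"}
definedTags := map[string]map[string]interface{}{"ns": {"team": "platform"}}

// "ns.team" is checked against definedTags["ns"]["team"], and "env" against
// freeFormTags["env"]; both match, so this returns true.
ok := validateNodepoolTags(nodeGroupTags, freeFormTags, definedTags)
```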

// nodePoolFromArg parses a node group spec represented in the form of `<minSize>:<maxSize>:<ocid>` and produces a node group spec object
func nodePoolFromArg(value string) (*nodePool, error) {
tokens := strings.SplitN(value, ":", 3)
@@ -207,6 +270,78 @@ func nodePoolFromArg(value string) (*nodePool, error) {
return spec, nil
}

// nodeGroupFromArg parses a node group spec represented in the form of
// `clusterId:<clusterId>,compartmentId:<compartmentId>,nodepoolTags:<tagKey1>=<tagValue1>&<tagKey2>=<tagValue2>,min:<min>,max:<max>`
// and produces a node group auto discovery object,
// nodepoolTags are optional and CA will capture all nodes if no tags are provided.
func nodeGroupFromArg(value string) (*nodeGroupAutoDiscovery, error) {
// this regex will find the key-value pairs in any given order if separated with a colon
regexPattern := `(?:` + compartmentId + `:(?P<` + compartmentId + `>[^,]+)`
regexPattern = regexPattern + `|` + nodepoolTags + `:(?P<` + nodepoolTags + `>[^,]+)`
regexPattern = regexPattern + `|` + max + `:(?P<` + max + `>[^,]+)`
regexPattern = regexPattern + `|` + min + `:(?P<` + min + `>[^,]+)`
regexPattern = regexPattern + `|` + clusterId + `:(?P<` + clusterId + `>[^,]+)`
regexPattern = regexPattern + `)(?:,|$)`
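
// For readability, the assembled pattern expands to:
// (?:compartmentId:(?P<compartmentId>[^,]+)|nodepoolTags:(?P<nodepoolTags>[^,]+)|max:(?P<max>[^,]+)|min:(?P<min>[^,]+)|clusterId:(?P<clusterId>[^,]+))(?:,|$)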

re := regexp.MustCompile(regexPattern)

parametersMap := make(map[string]string)

// push key-value pairs into a map
for _, match := range re.FindAllStringSubmatch(value, -1) {
for i, name := range re.SubexpNames() {
if i != 0 && match[i] != "" {
parametersMap[name] = match[i]
}
}
}

spec := &nodeGroupAutoDiscovery{}

if parametersMap[clusterId] != "" {
spec.clusterId = parametersMap[clusterId]
} else {
return nil, fmt.Errorf("failed to set %s, it is missing in node-group-auto-discovery parameter", clusterId)
}

if parametersMap[compartmentId] != "" {
spec.compartmentId = parametersMap[compartmentId]
} else {
return nil, fmt.Errorf("failed to set %s, it is missing in node-group-auto-discovery parameter", compartmentId)
}

if size, err := strconv.Atoi(parametersMap[min]); err == nil {
spec.minSize = size
} else {
return nil, fmt.Errorf("failed to set %s size: %s, expected integer", min, parametersMap[min])
}

if size, err := strconv.Atoi(parametersMap[max]); err == nil {
spec.maxSize = size
} else {
return nil, fmt.Errorf("failed to set %s size: %s, expected integer", max, parametersMap[max])
}

if parametersMap[nodepoolTags] != "" {
Contributor: Is it valid not to specify node pool tags, or are they required? This will silently continue on if no nodepoolTags are specified.

Contributor (author): I had made nodepoolTags optional at the beginning, but then I noticed we may also need to support instancepoolTags after @jlamillan's feedback. In the final state, nodepool tags are not optional, since I already do a validation to decide whether an instance-pool manager or a node-pool manager will be initiated. In short, no, it's not optional, and I added a failure case to address your feedback.

nodepoolTags := parametersMap[nodepoolTags]

spec.tags = make(map[string]string)

pairs := strings.Split(nodepoolTags, "&")

for _, pair := range pairs {
parts := strings.Split(pair, "=")
if len(parts) == 2 {
spec.tags[parts[0]] = parts[1]
}
Contributor: Should we return an error if the length is not 2, or is that a valid use case? Right now we just silently continue.

Contributor (author, @gvnc, Oct 24, 2024): Yes, this would actually be a formatting error; I've added an else statement to fix it.
}
}

klog.Infof("node group auto discovery spec constructed: %+v", spec)

return spec, nil
}
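
As a usage note, a spec missing one of the mandatory fields fails fast (hypothetical input shown):

```go
// min is absent, so nodeGroupFromArg returns an error:
// "failed to set min size: , expected integer"
_, err := nodeGroupFromArg("clusterId:ocid1.cluster.oc1.test.aaa,compartmentId:ocid1.compartment.oc1.test.bbb,max:5")
```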

type ociManagerImpl struct {
cfg *ocicommon.CloudConfig
okeClient okeClient
@@ -215,6 +350,7 @@ type ociManagerImpl struct {
ociTagsGetter ocicommon.TagsGetter
registeredTaintsGetter RegisteredTaintsGetter
staticNodePools map[string]NodePool
nodeGroups []nodeGroupAutoDiscovery

lastRefresh time.Time

@@ -253,6 +389,15 @@ func (m *ociManagerImpl) TaintToPreventFurtherSchedulingOnRestart(nodes []*apiv1
}

func (m *ociManagerImpl) forceRefresh() error {
// auto discover node groups
if m.nodeGroups != nil {
// empty previous nodepool map to do an auto discovery
m.staticNodePools = make(map[string]NodePool)
for _, nodeGroup := range m.nodeGroups {
autoDiscoverNodeGroups(m, m.okeClient, nodeGroup)
}
}
// rebuild nodepool cache
err := m.nodePoolCache.rebuild(m.staticNodePools, maxGetNodepoolRetries)
if err != nil {
return err
71 changes: 71 additions & 0 deletions cluster-autoscaler/cloudprovider/oci/nodepools/oci_manager_test.go
@@ -336,6 +336,10 @@ func (c mockOKEClient) DeleteNode(context.Context, oke.DeleteNodeRequest) (oke.D
}, nil
}

func (c mockOKEClient) ListNodePools(context.Context, oke.ListNodePoolsRequest) (oke.ListNodePoolsResponse, error) {
return oke.ListNodePoolsResponse{}, nil
}

func TestRemoveInstance(t *testing.T) {
instanceId1 := "instance1"
instanceId2 := "instance2"
@@ -384,3 +388,70 @@
}
}
}

func TestNodeGroupFromArg(t *testing.T) {
var nodeGroupArg = "clusterId:testClusterId,compartmentId:testCompartmentId,nodepoolTags:ca-managed=true&namespace.foo=bar,min:1,max:5"
Contributor: Minor nit, but can we update the IDs to look like real OCIDs, just in case there are weird parsing bugs? For example:

ocid1.cluster.oc1.test-region.test
ocid1.compartment.oc1.test-region.test

Contributor (author): Done.

nodeGroupAutoDiscovery, err := nodeGroupFromArg(nodeGroupArg)
if err != nil {
t.Errorf("Error: #{err}")
}
if nodeGroupAutoDiscovery.clusterId != "testClusterId" {
t.Errorf("Error: clusterId should be testClusterId")
}
if nodeGroupAutoDiscovery.compartmentId != "testCompartmentId" {
t.Errorf("Error: compartmentId should be testCompartmentId")
}
if nodeGroupAutoDiscovery.minSize != 1 {
t.Errorf("Error: minSize should be 1")
}
if nodeGroupAutoDiscovery.maxSize != 5 {
t.Errorf("Error: maxSize should be 5")
}
if nodeGroupAutoDiscovery.tags["ca-managed"] != "true" {
t.Errorf("Error: ca-managed:true is missing in tags.")
}
if nodeGroupAutoDiscovery.tags["namespace.foo"] != "bar" {
t.Errorf("Error: namespace.foo:bar is missing in tags.")
}
}

func TestValidateNodePoolTags(t *testing.T) {

var nodeGroupTags map[string]string = nil
var nodePoolTags map[string]string = nil
var definedTags map[string]map[string]interface{} = nil

if validateNodepoolTags(nodeGroupTags, nodePoolTags, definedTags) == false {
t.Errorf("validateNodepoolTags shouldn't return false for empty tags map")
}

nodeGroupTags = make(map[string]string)
nodeGroupTags["test"] = "test"

if validateNodepoolTags(nodeGroupTags, nodePoolTags, definedTags) == true {
t.Errorf("validateNodepoolTags shouldn't return true for tags missing")
}

nodePoolTags = make(map[string]string)
nodePoolTags["foo"] = "bar"

if validateNodepoolTags(nodeGroupTags, nodePoolTags, definedTags) == true {
t.Errorf("validateNodepoolTags shouldn't return true for not matching tags")
}
Contributor: Any reason not to do something like a traditional table-driven test?

https://go.dev/wiki/TableDrivenTests

My main concern with the current test is that if someone adds a new test case in the middle of it, it will mess up every test that comes after it.

Contributor (author): I refactored this test to meet the table-driven test requirements; it now runs with test cases given in a map (see the sketch after this test function).


nodePoolTags["test"] = "test"

if validateNodepoolTags(nodeGroupTags, nodePoolTags, definedTags) == false {
t.Errorf("validateNodepoolTags shouldn't return false for matching tags")
}

nodeGroupTags["ns.tag1"] = "tag2"
definedTagsMap := make(map[string]interface{})
definedTagsMap["tag1"] = "tag2"
definedTags = make(map[string]map[string]interface{})
definedTags["ns"] = definedTagsMap

if validateNodepoolTags(nodeGroupTags, nodePoolTags, definedTags) == false {
t.Errorf("validateNodepoolTags shouldn't return false for namespaced tags")
}
}
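
For reference, a minimal table-driven sketch of the refactor discussed above might look like this (case names and structure are illustrative, not the PR's final code):

```go
func TestValidateNodepoolTagsTable(t *testing.T) {
	cases := map[string]struct {
		groupTags    map[string]string
		freeformTags map[string]string
		definedTags  map[string]map[string]interface{}
		want         bool
	}{
		"nil tags always match":  {want: true},
		"required tag missing":   {groupTags: map[string]string{"test": "test"}, want: false},
		"freeform tag matches":   {groupTags: map[string]string{"test": "test"}, freeformTags: map[string]string{"test": "test"}, want: true},
		"namespaced defined tag": {groupTags: map[string]string{"ns.tag1": "tag2"}, definedTags: map[string]map[string]interface{}{"ns": {"tag1": "tag2"}}, want: true},
	}
	for name, tc := range cases {
		t.Run(name, func(t *testing.T) {
			if got := validateNodepoolTags(tc.groupTags, tc.freeformTags, tc.definedTags); got != tc.want {
				t.Errorf("got %v, want %v", got, tc.want)
			}
		})
	}
}
```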
11 changes: 11 additions & 0 deletions cluster-autoscaler/cloudprovider/oci/nodepools/oci_node_pool.go
@@ -46,6 +46,17 @@ type nodePool struct {
maxSize int
}

type nodeGroupAutoDiscovery struct {
manager NodePoolManager
kubeClient kubernetes.Interface

clusterId string
compartmentId string
tags map[string]string
minSize int
maxSize int
}
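
For orientation, a spec parsed from the README's pattern would populate this struct roughly as follows (values are illustrative):

```go
// Hypothetical result of nodeGroupFromArg for
// "clusterId:ocid1.cluster.oc1.test.aaa,compartmentId:ocid1.compartment.oc1.test.bbb,nodepoolTags:ca-managed=true,min:1,max:5"
spec := nodeGroupAutoDiscovery{
	clusterId:     "ocid1.cluster.oc1.test.aaa",
	compartmentId: "ocid1.compartment.oc1.test.bbb",
	tags:          map[string]string{"ca-managed": "true"},
	minSize:       1,
	maxSize:       5,
}
```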

// MaxSize returns maximum size of the node group.
func (np *nodePool) MaxSize() int {
return np.maxSize