diff --git a/cluster-autoscaler/cloudprovider/azure/azure_cache.go b/cluster-autoscaler/cloudprovider/azure/azure_cache.go index 534d1ce2ba06..bb68567e8f40 100644 --- a/cluster-autoscaler/cloudprovider/azure/azure_cache.go +++ b/cluster-autoscaler/cloudprovider/azure/azure_cache.go @@ -207,26 +207,32 @@ func (m *azureCache) regenerate() error { return nil } +// fetchAzureResources retrieves and updates the cached Azure resources. +// +// This function performs the following: +// - Fetches and updates the list of Virtual Machine Scale Sets (VMSS) in the specified resource group. +// - Fetches and updates the list of Virtual Machines (VMs) and identifies the node pools they belong to. +// - Maintains a set of VMs pools and VMSS resources which helps the Cluster Autoscaler (CAS) operate on mixed node pools. +// +// Returns an error if any of the Azure API calls fail. func (m *azureCache) fetchAzureResources() error { m.mutex.Lock() defer m.mutex.Unlock() - // fetch all the resources since CAS may be operating on mixed nodepools - // including both VMSS and VMs pools + // NOTE: this lists virtual machine scale sets, not virtual machine + // scale set instances vmssResult, err := m.fetchScaleSets() - if err == nil { - m.scaleSets = vmssResult - } else { + if err != nil { return err } - + m.scaleSets = vmssResult vmResult, vmsPoolSet, err := m.fetchVirtualMachines() - if err == nil { - m.virtualMachines = vmResult - m.vmsPoolSet = vmsPoolSet - } else { + if err != nil { return err } + // we fetch both sets of resources since CAS may operate on mixed nodepools + m.virtualMachines = vmResult + m.vmsPoolSet = vmsPoolSet return nil } @@ -263,7 +269,6 @@ func (m *azureCache) fetchVirtualMachines() (map[string][]compute.VirtualMachine if vmPoolName == nil { vmPoolName = tags[legacyAgentpoolNameTag] } - if vmPoolName == nil { continue } @@ -276,8 +281,8 @@ func (m *azureCache) fetchVirtualMachines() (map[string][]compute.VirtualMachine } // nodes from vms pool will have tag 
"aks-managed-agentpool-type" set to "VirtualMachines" - if agnetpoolType := tags[agentpoolTypeTag]; agnetpoolType != nil { - if strings.EqualFold(to.String(agnetpoolType), vmsPoolType) { + if agentpoolType := tags[agentpoolTypeTag]; agentpoolType != nil { + if strings.EqualFold(to.String(agentpoolType), vmsPoolType) { vmsPoolSet[to.String(vmPoolName)] = struct{}{} } } @@ -314,7 +319,6 @@ func (m *azureCache) Register(nodeGroup cloudprovider.NodeGroup) bool { // Node group is already registered and min/max size haven't changed, no action required. return false } - m.registeredNodeGroups[i] = nodeGroup klog.V(4).Infof("Node group %q updated", nodeGroup.Id()) m.invalidateUnownedInstanceCache() @@ -323,6 +327,7 @@ func (m *azureCache) Register(nodeGroup cloudprovider.NodeGroup) bool { } klog.V(4).Infof("Registering Node Group %q", nodeGroup.Id()) + m.registeredNodeGroups = append(m.registeredNodeGroups, nodeGroup) m.invalidateUnownedInstanceCache() return true @@ -391,6 +396,25 @@ func (m *azureCache) getAutoscalingOptions(ref azureRef) map[string]string { return m.autoscalingOptions[ref] } +// HasInstance returns if a given instance exists in the azure cache +func (m *azureCache) HasInstance(providerID string) (bool, error) { + m.mutex.Lock() + defer m.mutex.Unlock() + resourceID, err := convertResourceGroupNameToLower(providerID) + if err != nil { + // Most likely an invalid resource id, we should return an error + // most of these shouldn't make it here do to higher level + // validation in the HasInstance azure.cloudprovider function + return false, err + } + + if m.getInstanceFromCache(resourceID) != nil { + return true, nil + } + // couldn't find instance in the cache, assume it's deleted + return false, cloudprovider.ErrNotImplemented +} + // FindForInstance returns node group of the given Instance func (m *azureCache) FindForInstance(instance *azureRef, vmType string) (cloudprovider.NodeGroup, error) { vmsPoolSet := m.getVMsPoolSet() diff --git 
a/cluster-autoscaler/cloudprovider/azure/azure_client.go b/cluster-autoscaler/cloudprovider/azure/azure_client.go index 2e0522a2e45e..2bf337a4e8d4 100644 --- a/cluster-autoscaler/cloudprovider/azure/azure_client.go +++ b/cluster-autoscaler/cloudprovider/azure/azure_client.go @@ -236,7 +236,7 @@ func newAgentpoolClientWithConfig(subscriptionID string, cred azcore.TokenCreden return nil, fmt.Errorf("failed to init cluster agent pools client: %w", err) } - klog.V(10).Infof("Successfully created agent pool client with ARMBaseURLForAPClient") + klog.V(10).Infof("Successfully created agent pool client with ARMBaseURL") return agentPoolsClient, nil } diff --git a/cluster-autoscaler/cloudprovider/azure/azure_cloud_provider.go b/cluster-autoscaler/cloudprovider/azure/azure_cloud_provider.go index 114cd90a4ee2..53bdc4de9ad6 100644 --- a/cluster-autoscaler/cloudprovider/azure/azure_cloud_provider.go +++ b/cluster-autoscaler/cloudprovider/azure/azure_cloud_provider.go @@ -17,8 +17,10 @@ limitations under the License. 
package azure import ( + "fmt" "io" "os" + "strings" apiv1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/resource" @@ -106,6 +108,12 @@ func (azure *AzureCloudProvider) NodeGroupForNode(node *apiv1.Node) (cloudprovid klog.V(6).Infof("Skipping the search for node group for the node '%s' because it has no spec.ProviderID", node.ObjectMeta.Name) return nil, nil } + + if !strings.HasPrefix(node.Spec.ProviderID, "azure://") { + klog.V(6).Infof("Wrong azure ProviderID for node %v, skipped", node.Name) + return nil, nil + } + klog.V(6).Infof("Searching for node group for the node: %s\n", node.Spec.ProviderID) ref := &azureRef{ Name: node.Spec.ProviderID, @@ -115,9 +123,27 @@ func (azure *AzureCloudProvider) NodeGroupForNode(node *apiv1.Node) (cloudprovid return azure.azureManager.GetNodeGroupForInstance(ref) } -// HasInstance returns whether a given node has a corresponding instance in this cloud provider -func (azure *AzureCloudProvider) HasInstance(*apiv1.Node) (bool, error) { - return true, cloudprovider.ErrNotImplemented +// HasInstance returns whether a given node has a corresponding instance in this cloud provider. +// +// Used to prevent undercount of existing VMs (taint-based overcount of deleted VMs), +// and so should not return false, nil (no instance) if uncertain; return error instead. +// (Think "has instance for sure, else error".) Returning an error causes fallback to taint-based +// determination; use ErrNotImplemented for silent fallback, any other error will be logged. 
+// +// Expected behavior (should work for VMSS Uniform/Flex, and VMs): +// - exists : return true, nil +// - !exists : return *, ErrNotImplemented (could use custom error for autoscaled nodes) +// - unimplemented case : return *, ErrNotImplemented +// - any other error : return *, error +func (azure *AzureCloudProvider) HasInstance(node *apiv1.Node) (bool, error) { + if node.Spec.ProviderID == "" { + return false, fmt.Errorf("ProviderID for node: %s is empty, skipped", node.Name) + } + + if !strings.HasPrefix(node.Spec.ProviderID, "azure://") { + return false, fmt.Errorf("invalid azure ProviderID prefix for node: %s, skipped", node.Name) + } + return azure.azureManager.azureCache.HasInstance(node.Spec.ProviderID) } // Pricing returns pricing model for this cloud provider or error if not available. diff --git a/cluster-autoscaler/cloudprovider/azure/azure_cloud_provider_test.go b/cluster-autoscaler/cloudprovider/azure/azure_cloud_provider_test.go index 0b064477fdb6..da37ed8492da 100644 --- a/cluster-autoscaler/cloudprovider/azure/azure_cloud_provider_test.go +++ b/cluster-autoscaler/cloudprovider/azure/azure_cloud_provider_test.go @@ -17,6 +17,7 @@ limitations under the License. 
package azure import ( + "fmt" "testing" "github.com/Azure/azure-sdk-for-go/services/compute/mgmt/2022-08-01/compute" @@ -24,6 +25,8 @@ import ( "github.com/Azure/go-autorest/autorest/to" apiv1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/autoscaler/cluster-autoscaler/cloudprovider" "sigs.k8s.io/cloud-provider-azure/pkg/azureclients/vmclient/mockvmclient" "sigs.k8s.io/cloud-provider-azure/pkg/azureclients/vmssclient/mockvmssclient" @@ -56,6 +59,7 @@ func newTestAzureManager(t *testing.T) *AzureManager { VMType: vmTypeVMSS, MaxDeploymentsCount: 2, Deployment: "deployment", + EnableForceDelete: true, Location: "eastus", }, azClient: &azClient{ @@ -130,6 +134,126 @@ func TestNodeGroups(t *testing.T) { assert.Equal(t, len(provider.NodeGroups()), 2) } +func TestHasInstance(t *testing.T) { + ctrl := gomock.NewController(t) + defer ctrl.Finish() + + provider := newTestProvider(t) + mockVMSSClient := mockvmssclient.NewMockInterface(ctrl) + mockVMClient := mockvmclient.NewMockInterface(ctrl) + mockVMSSVMClient := mockvmssvmclient.NewMockInterface(ctrl) + provider.azureManager.azClient.virtualMachinesClient = mockVMClient + provider.azureManager.azClient.virtualMachineScaleSetsClient = mockVMSSClient + provider.azureManager.azClient.virtualMachineScaleSetVMsClient = mockVMSSVMClient + + // Simulate node groups and instances + expectedScaleSets := newTestVMSSList(3, "test-asg", "eastus", compute.Uniform) + expectedVMsPoolVMs := newTestVMsPoolVMList(3) + expectedVMSSVMs := newTestVMSSVMList(3) + + mockVMSSClient.EXPECT().List(gomock.Any(), provider.azureManager.config.ResourceGroup).Return(expectedScaleSets, nil).AnyTimes() + mockVMClient.EXPECT().List(gomock.Any(), provider.azureManager.config.ResourceGroup).Return(expectedVMsPoolVMs, nil).AnyTimes() + mockVMSSVMClient.EXPECT().List(gomock.Any(), provider.azureManager.config.ResourceGroup, "test-asg", gomock.Any()).Return(expectedVMSSVMs, nil).AnyTimes() + + // Register node groups + 
assert.Equal(t, len(provider.NodeGroups()), 0) + registered := provider.azureManager.RegisterNodeGroup( + newTestScaleSet(provider.azureManager, "test-asg"), + ) + provider.azureManager.explicitlyConfigured["test-asg"] = true + assert.True(t, registered) + + registered = provider.azureManager.RegisterNodeGroup( + newTestVMsPool(provider.azureManager, "test-vms-pool"), + ) + provider.azureManager.explicitlyConfigured["test-vms-pool"] = true + assert.True(t, registered) + assert.Equal(t, len(provider.NodeGroups()), 2) + + // Refresh cache + provider.azureManager.forceRefresh() + + // Test HasInstance for a node from the VMSS pool + node := newApiNode(compute.Uniform, 0) + hasInstance, err := provider.azureManager.azureCache.HasInstance(node.Spec.ProviderID) + assert.True(t, hasInstance) + assert.NoError(t, err) + + // Test HasInstance for a node from the VMs pool + vmsPoolNode := newVMsNode(0) + hasInstance, err = provider.azureManager.azureCache.HasInstance(vmsPoolNode.Spec.ProviderID) + assert.True(t, hasInstance) + assert.NoError(t, err) +} + +func TestUnownedInstancesFallbackToDeletionTaint(t *testing.T) { + // VMSS Instances that belong to a VMSS on the cluster but do not belong to a registered ASG + // should return err unimplemented for HasInstance + ctrl := gomock.NewController(t) + defer ctrl.Finish() + provider := newTestProvider(t) + mockVMSSClient := mockvmssclient.NewMockInterface(ctrl) + mockVMClient := mockvmclient.NewMockInterface(ctrl) + mockVMSSVMClient := mockvmssvmclient.NewMockInterface(ctrl) + provider.azureManager.azClient.virtualMachinesClient = mockVMClient + provider.azureManager.azClient.virtualMachineScaleSetsClient = mockVMSSClient + provider.azureManager.azClient.virtualMachineScaleSetVMsClient = mockVMSSVMClient + + // Simulate VMSS instances + unregisteredVMSSInstance := &apiv1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: "unregistered-vmss-node", + }, + Spec: apiv1.NodeSpec{ + ProviderID: 
"azure:///subscriptions/sub/resourceGroups/rg/providers/Microsoft.Compute/virtualMachineScaleSets/unregistered-vmss-instance-id/virtualMachines/0", + }, + } + // Mock responses to simulate that the instance belongs to a VMSS not in any registered ASG + expectedVMSSVMs := newTestVMSSVMList(1) + mockVMSSVMClient.EXPECT().List(gomock.Any(), provider.azureManager.config.ResourceGroup, "unregistered-vmss-instance-id", gomock.Any()).Return(expectedVMSSVMs, nil).AnyTimes() + + // Call HasInstance and check the result + hasInstance, err := provider.azureManager.azureCache.HasInstance(unregisteredVMSSInstance.Spec.ProviderID) + assert.False(t, hasInstance) + assert.Equal(t, cloudprovider.ErrNotImplemented, err) +} + +func TestHasInstanceProviderIDErrorValidation(t *testing.T) { + provider := newTestProvider(t) + // Test case: Node with an empty ProviderID + nodeWithoutValidProviderID := &apiv1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-node", + }, + Spec: apiv1.NodeSpec{ + ProviderID: "", + }, + } + _, err := provider.HasInstance(nodeWithoutValidProviderID) + assert.Equal(t, "ProviderID for node: test-node is empty, skipped", err.Error()) + + // Test cases: Nodes with invalid ProviderID prefixes + invalidProviderIDs := []string{ + "aazure://", + "kubemark://", + "kwok://", + "incorrect!", + } + + for _, providerID := range invalidProviderIDs { + invalidProviderIDNode := &apiv1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-node", + }, + Spec: apiv1.NodeSpec{ + ProviderID: providerID, + }, + } + _, err := provider.HasInstance(invalidProviderIDNode) + assert.Equal(t, "invalid azure ProviderID prefix for node: test-node, skipped", err.Error()) + } +} + func TestMixedNodeGroups(t *testing.T) { ctrl := gomock.NewController(t) provider := newTestProvider(t) @@ -187,57 +311,66 @@ func TestMixedNodeGroups(t *testing.T) { func TestNodeGroupForNode(t *testing.T) { ctrl := gomock.NewController(t) defer ctrl.Finish() - orchestrationModes := 
[2]compute.OrchestrationMode{compute.Uniform, compute.Flexible} + orchestrationModes := []compute.OrchestrationMode{compute.Uniform, compute.Flexible} expectedVMSSVMs := newTestVMSSVMList(3) expectedVMs := newTestVMList(3) for _, orchMode := range orchestrationModes { - expectedScaleSets := newTestVMSSList(3, "test-asg", "eastus", orchMode) - provider := newTestProvider(t) - mockVMSSClient := mockvmssclient.NewMockInterface(ctrl) - mockVMSSClient.EXPECT().List(gomock.Any(), provider.azureManager.config.ResourceGroup).Return(expectedScaleSets, nil) - provider.azureManager.azClient.virtualMachineScaleSetsClient = mockVMSSClient - mockVMClient := mockvmclient.NewMockInterface(ctrl) - provider.azureManager.azClient.virtualMachinesClient = mockVMClient - mockVMClient.EXPECT().List(gomock.Any(), provider.azureManager.config.ResourceGroup).Return(expectedVMs, nil).AnyTimes() - - if orchMode == compute.Uniform { - mockVMSSVMClient := mockvmssvmclient.NewMockInterface(ctrl) - mockVMSSVMClient.EXPECT().List(gomock.Any(), provider.azureManager.config.ResourceGroup, "test-asg", gomock.Any()).Return(expectedVMSSVMs, nil).AnyTimes() - provider.azureManager.azClient.virtualMachineScaleSetVMsClient = mockVMSSVMClient - } else { - - provider.azureManager.config.EnableVmssFlex = true - mockVMClient.EXPECT().ListVmssFlexVMsWithoutInstanceView(gomock.Any(), "test-asg").Return(expectedVMs, nil).AnyTimes() + t.Run(fmt.Sprintf("OrchestrationMode_%v", orchMode), func(t *testing.T) { + expectedScaleSets := newTestVMSSList(3, "test-asg", "eastus", orchMode) + provider := newTestProvider(t) + mockVMSSClient := mockvmssclient.NewMockInterface(ctrl) + mockVMSSClient.EXPECT().List(gomock.Any(), provider.azureManager.config.ResourceGroup).Return(expectedScaleSets, nil) + provider.azureManager.azClient.virtualMachineScaleSetsClient = mockVMSSClient + mockVMClient := mockvmclient.NewMockInterface(ctrl) provider.azureManager.azClient.virtualMachinesClient = mockVMClient - } - - registered := 
provider.azureManager.RegisterNodeGroup( - newTestScaleSet(provider.azureManager, testASG)) - provider.azureManager.explicitlyConfigured[testASG] = true - assert.True(t, registered) - assert.Equal(t, len(provider.NodeGroups()), 1) - - node := newApiNode(orchMode, 0) - // refresh cache - provider.azureManager.forceRefresh() - group, err := provider.NodeGroupForNode(node) - assert.NoError(t, err) - assert.NotNil(t, group, "Group should not be nil") - assert.Equal(t, group.Id(), testASG) - assert.Equal(t, group.MinSize(), 1) - assert.Equal(t, group.MaxSize(), 5) - - // test node in cluster that is not in a group managed by cluster autoscaler - nodeNotInGroup := &apiv1.Node{ - Spec: apiv1.NodeSpec{ - ProviderID: azurePrefix + "/subscriptions/subscripion/resourceGroups/test-resource-group/providers/Microsoft.Compute/virtualMachines/test-instance-id-not-in-group", - }, - } - group, err = provider.NodeGroupForNode(nodeNotInGroup) - assert.NoError(t, err) - assert.Nil(t, group) + mockVMClient.EXPECT().List(gomock.Any(), provider.azureManager.config.ResourceGroup).Return(expectedVMs, nil).AnyTimes() + + if orchMode == compute.Uniform { + mockVMSSVMClient := mockvmssvmclient.NewMockInterface(ctrl) + mockVMSSVMClient.EXPECT().List(gomock.Any(), provider.azureManager.config.ResourceGroup, "test-asg", gomock.Any()).Return(expectedVMSSVMs, nil).AnyTimes() + provider.azureManager.azClient.virtualMachineScaleSetVMsClient = mockVMSSVMClient + } else { + provider.azureManager.config.EnableVmssFlex = true + mockVMClient.EXPECT().ListVmssFlexVMsWithoutInstanceView(gomock.Any(), "test-asg").Return(expectedVMs, nil).AnyTimes() + } + + registered := provider.azureManager.RegisterNodeGroup( + newTestScaleSet(provider.azureManager, "test-asg")) + provider.azureManager.explicitlyConfigured["test-asg"] = true + assert.True(t, registered) + assert.Equal(t, len(provider.NodeGroups()), 1) + + node := newApiNode(orchMode, 0) + // refresh cache + provider.azureManager.forceRefresh() + group, err 
:= provider.NodeGroupForNode(node) + assert.NoError(t, err) + assert.NotNil(t, group, "Group should not be nil") + assert.Equal(t, group.Id(), "test-asg") + assert.Equal(t, group.MinSize(), 1) + assert.Equal(t, group.MaxSize(), 5) + + hasInstance, err := provider.HasInstance(node) + assert.True(t, hasInstance) + assert.NoError(t, err) + + // test node in cluster that is not in a group managed by cluster autoscaler + nodeNotInGroup := &apiv1.Node{ + Spec: apiv1.NodeSpec{ + ProviderID: "azure:///subscriptions/subscription/resourceGroups/test-resource-group/providers/Microsoft.Compute/virtualMachineScaleSets/test/virtualMachines/test-instance-id-not-in-group", + }, + } + group, err = provider.NodeGroupForNode(nodeNotInGroup) + assert.NoError(t, err) + assert.Nil(t, group) + + hasInstance, err = provider.HasInstance(nodeNotInGroup) + assert.False(t, hasInstance) + assert.Error(t, err) + assert.Equal(t, err, cloudprovider.ErrNotImplemented) + }) } } diff --git a/cluster-autoscaler/cloudprovider/azure/azure_config.go b/cluster-autoscaler/cloudprovider/azure/azure_config.go index 2f1e3b6023f6..6c354c2a23e4 100644 --- a/cluster-autoscaler/cloudprovider/azure/azure_config.go +++ b/cluster-autoscaler/cloudprovider/azure/azure_config.go @@ -56,6 +56,9 @@ const ( rateLimitWriteQPSEnvVar = "RATE_LIMIT_WRITE_QPS" rateLimitWriteBucketsEnvVar = "RATE_LIMIT_WRITE_BUCKETS" + // VmssSizeRefreshPeriodDefault in seconds + VmssSizeRefreshPeriodDefault = 30 + // auth methods authMethodPrincipal = "principal" authMethodCLI = "cli" @@ -138,20 +141,20 @@ type Config struct { CloudProviderBackoffDuration int `json:"cloudProviderBackoffDuration,omitempty" yaml:"cloudProviderBackoffDuration,omitempty"` CloudProviderBackoffJitter float64 `json:"cloudProviderBackoffJitter,omitempty" yaml:"cloudProviderBackoffJitter,omitempty"` + // EnableForceDelete defines whether to enable force deletion on the APIs + EnableForceDelete bool `json:"enableForceDelete,omitempty" 
yaml:"enableForceDelete,omitempty"` + // EnableDynamicInstanceList defines whether to enable dynamic instance workflow for instance information check EnableDynamicInstanceList bool `json:"enableDynamicInstanceList,omitempty" yaml:"enableDynamicInstanceList,omitempty"` // EnableVmssFlex defines whether to enable Vmss Flex support or not EnableVmssFlex bool `json:"enableVmssFlex,omitempty" yaml:"enableVmssFlex,omitempty"` - // (DEPRECATED, DO NOT USE) EnableForceDelete defines whether to enable force deletion on the APIs - EnableForceDelete bool `json:"enableForceDelete,omitempty" yaml:"enableForceDelete,omitempty"` - // (DEPRECATED, DO NOT USE) EnableDetailedCSEMessage defines whether to emit error messages in the CSE error body info EnableDetailedCSEMessage bool `json:"enableDetailedCSEMessage,omitempty" yaml:"enableDetailedCSEMessage,omitempty"` - // (DEPRECATED, DO NOT USE) GetVmssSizeRefreshPeriod defines how frequently to call GET VMSS API to fetch VMSS info per nodegroup instance - GetVmssSizeRefreshPeriod time.Duration `json:"getVmssSizeRefreshPeriod,omitempty" yaml:"getVmssSizeRefreshPeriod,omitempty"` + // (DEPRECATED, DO NOT USE) GetVmssSizeRefreshPeriod (seconds) defines how frequently to call GET VMSS API to fetch VMSS info per nodegroup instance + GetVmssSizeRefreshPeriod int `json:"getVmssSizeRefreshPeriod,omitempty" yaml:"getVmssSizeRefreshPeriod,omitempty"` } // BuildAzureConfig returns a Config object for the Azure clients @@ -262,6 +265,15 @@ func BuildAzureConfig(configReader io.Reader) (*Config, error) { cfg.EnableDynamicInstanceList = dynamicInstanceListDefault } + if getVmssSizeRefreshPeriod := os.Getenv("AZURE_GET_VMSS_SIZE_REFRESH_PERIOD"); getVmssSizeRefreshPeriod != "" { + cfg.GetVmssSizeRefreshPeriod, err = strconv.Atoi(getVmssSizeRefreshPeriod) + if err != nil { + return nil, fmt.Errorf("failed to parse AZURE_GET_VMSS_SIZE_REFRESH_PERIOD %q: %v", getVmssSizeRefreshPeriod, err) + } + } else { + cfg.GetVmssSizeRefreshPeriod = 
VmssSizeRefreshPeriodDefault + } + if enableVmssFlex := os.Getenv("AZURE_ENABLE_VMSS_FLEX"); enableVmssFlex != "" { cfg.EnableVmssFlex, err = strconv.ParseBool(enableVmssFlex) if err != nil { @@ -326,6 +338,13 @@ func BuildAzureConfig(configReader io.Reader) (*Config, error) { } } + if enableForceDelete := os.Getenv("AZURE_ENABLE_FORCE_DELETE"); enableForceDelete != "" { + cfg.EnableForceDelete, err = strconv.ParseBool(enableForceDelete) + if err != nil { + return nil, fmt.Errorf("failed to parse AZURE_ENABLE_FORCE_DELETE: %q, %v", enableForceDelete, err) + } + } + err = initializeCloudProviderRateLimitConfig(&cfg.CloudProviderRateLimitConfig) if err != nil { return nil, err diff --git a/cluster-autoscaler/cloudprovider/azure/azure_manager.go b/cluster-autoscaler/cloudprovider/azure/azure_manager.go index 1b8f704072b7..717ec86a402d 100644 --- a/cluster-autoscaler/cloudprovider/azure/azure_manager.go +++ b/cluster-autoscaler/cloudprovider/azure/azure_manager.go @@ -130,14 +130,11 @@ func createAzureManagerInternal(configReader io.Reader, discoveryOpts cloudprovi Cap: 10 * time.Minute, } - if err := manager.forceRefresh(); err != nil { - err = kretry.OnError(retryBackoff, retry.IsErrorRetriable, func() (err error) { - return manager.forceRefresh() - }) - if err != nil { - return nil, err - } - return manager, nil + err = kretry.OnError(retryBackoff, retry.IsErrorRetriable, func() (err error) { + return manager.forceRefresh() + }) + if err != nil { + return nil, err } return manager, nil @@ -176,7 +173,6 @@ func (m *AzureManager) buildNodeGroupFromSpec(spec string) (cloudprovider.NodeGr if err != nil { return nil, fmt.Errorf("failed to parse node group spec: %v", err) } - vmsPoolSet := m.azureCache.getVMsPoolSet() if _, ok := vmsPoolSet[s.Name]; ok { return NewVMsPool(s, m), nil diff --git a/cluster-autoscaler/cloudprovider/azure/azure_manager_test.go b/cluster-autoscaler/cloudprovider/azure/azure_manager_test.go index a17f619385f9..8c7fd01df4b8 100644 --- 
a/cluster-autoscaler/cloudprovider/azure/azure_manager_test.go +++ b/cluster-autoscaler/cloudprovider/azure/azure_manager_test.go @@ -369,6 +369,7 @@ func TestCreateAzureManagerWithNilConfig(t *testing.T) { VmssCacheTTL: 100, VmssVmsCacheTTL: 110, VmssVmsCacheJitter: 90, + GetVmssSizeRefreshPeriod: 30, MaxDeploymentsCount: 8, CloudProviderBackoff: true, CloudProviderBackoffRetries: 1, @@ -483,6 +484,14 @@ func TestCreateAzureManagerWithNilConfig(t *testing.T) { assert.Equal(t, expectedErr, err, "Return err does not match, expected: %v, actual: %v", expectedErr, err) }) + t.Run("invalid int for AZURE_GET_VMSS_SIZE_REFRESH_PERIOD", func(t *testing.T) { + t.Setenv("AZURE_GET_VMSS_SIZE_REFRESH_PERIOD", "invalidint") + manager, err := createAzureManagerInternal(nil, cloudprovider.NodeGroupDiscoveryOptions{}, mockAzClient) + expectedErr := fmt.Errorf("failed to parse AZURE_GET_VMSS_SIZE_REFRESH_PERIOD \"invalidint\": strconv.Atoi: parsing \"invalidint\": invalid syntax") + assert.Nil(t, manager) + assert.Equal(t, expectedErr, err, "Return err does not match, expected: %v, actual: %v", expectedErr, err) + }) + t.Run("invalid int for AZURE_MAX_DEPLOYMENT_COUNT", func(t *testing.T) { t.Setenv("AZURE_MAX_DEPLOYMENT_COUNT", "invalidint") manager, err := createAzureManagerInternal(nil, cloudprovider.NodeGroupDiscoveryOptions{}, mockAzClient) @@ -675,10 +684,7 @@ func TestGetFilteredAutoscalingGroupsVmss(t *testing.T) { expectedScaleSets := []compute.VirtualMachineScaleSet{fakeVMSSWithTags(vmssName, map[string]*string{vmssTag: &vmssTagValue, "min": &min, "max": &max})} mockVMSSClient := mockvmssclient.NewMockInterface(ctrl) mockVMSSClient.EXPECT().List(gomock.Any(), manager.config.ResourceGroup).Return(expectedScaleSets, nil).AnyTimes() - mockVMClient := mockvmclient.NewMockInterface(ctrl) - mockVMClient.EXPECT().List(gomock.Any(), manager.config.ResourceGroup).Return([]compute.VirtualMachine{}, nil).AnyTimes() manager.azClient.virtualMachineScaleSetsClient = mockVMSSClient - 
manager.azClient.virtualMachinesClient = mockVMClient err := manager.forceRefresh() assert.NoError(t, err) @@ -694,9 +700,10 @@ func TestGetFilteredAutoscalingGroupsVmss(t *testing.T) { minSize: minVal, maxSize: maxVal, manager: manager, + enableForceDelete: manager.config.EnableForceDelete, curSize: 3, sizeRefreshPeriod: manager.azureCache.refreshInterval, - getVmssSizeRefreshPeriod: manager.azureCache.refreshInterval, + getVmssSizeRefreshPeriod: time.Duration(manager.azureCache.refreshInterval) * time.Second, InstanceCache: InstanceCache{instancesRefreshPeriod: defaultVmssInstancesRefreshPeriod}, }} assert.True(t, assert.ObjectsAreEqualValues(expectedAsgs, asgs), "expected %#v, but found: %#v", expectedAsgs, asgs) diff --git a/cluster-autoscaler/cloudprovider/azure/azure_scale_set.go b/cluster-autoscaler/cloudprovider/azure/azure_scale_set.go index e205414bff9f..71a8dfd0043c 100644 --- a/cluster-autoscaler/cloudprovider/azure/azure_scale_set.go +++ b/cluster-autoscaler/cloudprovider/azure/azure_scale_set.go @@ -119,9 +119,9 @@ func NewScaleSet(spec *dynamic.NodeGroupSpec, az *AzureManager, curSize int64, d } if az.config.GetVmssSizeRefreshPeriod != 0 { - scaleSet.getVmssSizeRefreshPeriod = az.config.GetVmssSizeRefreshPeriod + scaleSet.getVmssSizeRefreshPeriod = time.Duration(az.config.GetVmssSizeRefreshPeriod) * time.Second } else { - scaleSet.getVmssSizeRefreshPeriod = az.azureCache.refreshInterval + scaleSet.getVmssSizeRefreshPeriod = time.Duration(az.azureCache.refreshInterval) * time.Second } if az.config.EnableDetailedCSEMessage { diff --git a/cluster-autoscaler/cloudprovider/azure/azure_scale_set_test.go b/cluster-autoscaler/cloudprovider/azure/azure_scale_set_test.go index 853054cbb354..f3aa0dc356d8 100644 --- a/cluster-autoscaler/cloudprovider/azure/azure_scale_set_test.go +++ b/cluster-autoscaler/cloudprovider/azure/azure_scale_set_test.go @@ -34,6 +34,10 @@ import ( "sigs.k8s.io/cloud-provider-azure/pkg/azureclients/vmssvmclient/mockvmssvmclient" ) 
+const ( + testLocation = "eastus" +) + func newTestScaleSet(manager *AzureManager, name string) *ScaleSet { return &ScaleSet{ azureRef: azureRef{ @@ -83,7 +87,7 @@ func newTestVMSSListForEdgeZones(capacity int64, name string) *compute.VirtualMa Name: to.StringPtr("Standard_D4_v2"), }, VirtualMachineScaleSetProperties: &compute.VirtualMachineScaleSetProperties{}, - Location: to.StringPtr("eastus"), + Location: to.StringPtr(testLocation), ExtendedLocation: &compute.ExtendedLocation{ Name: to.StringPtr("losangeles"), Type: compute.ExtendedLocationTypes("EdgeZone"), @@ -249,10 +253,20 @@ func TestIncreaseSize(t *testing.T) { expectedScaleSets = append(expectedScaleSets, *expectedEdgeZoneScaleSets, *expectedEdgeZoneMinZeroScaleSets) provider := newTestProvider(t) + // expectedScaleSets := newTestVMSSList(3, testASG, testLocation, orchMode) + + // // Include Edge Zone scenario here, testing scale from 3 to 5 and scale from zero cases. + // expectedEdgeZoneScaleSets := newTestVMSSListForEdgeZones(3, "edgezone-vmss") + // expectedEdgeZoneMinZeroScaleSets := newTestVMSSListForEdgeZones(0, "edgezone-minzero-vmss") + // expectedScaleSets = append(expectedScaleSets, *expectedEdgeZoneScaleSets, *expectedEdgeZoneMinZeroScaleSets) mockVMSSClient := mockvmssclient.NewMockInterface(ctrl) mockVMSSClient.EXPECT().List(gomock.Any(), provider.azureManager.config.ResourceGroup).Return(expectedScaleSets, nil).AnyTimes() - mockVMSSClient.EXPECT().CreateOrUpdateAsync(gomock.Any(), provider.azureManager.config.ResourceGroup, "test-asg", gomock.Any()).Return(nil, nil) + mockVMSSClient.EXPECT().CreateOrUpdateAsync(gomock.Any(), provider.azureManager.config.ResourceGroup, testASG, gomock.Any()).Return(nil, nil) + // This should be AnyTimes() because the parent function of this call - updateVMSSCapacity() is a goroutine + // and this test doesn't wait on goroutine, hence, it is difficult to write exact expected number (which is 3 here) + // before we return from this test. 
+ // This is a future TODO: sync.WaitGroup should be used in actual code and make code easily testable mockVMSSClient.EXPECT().WaitForCreateOrUpdateResult(gomock.Any(), gomock.Any(), provider.azureManager.config.ResourceGroup).Return(&http.Response{StatusCode: http.StatusOK}, nil).AnyTimes() provider.azureManager.azClient.virtualMachineScaleSetsClient = mockVMSSClient mockVMClient := mockvmclient.NewMockInterface(ctrl) @@ -337,28 +351,34 @@ func TestIncreaseSize(t *testing.T) { } // TestIncreaseSizeOnVMProvisioningFailed has been tweeked only for Uniform Orchestration mode. -// If ProvisioningState == failed, Status.State == InstanceFailed for all the cases. -// Expected results would be different for Flexible orchestration mode but that is not getting tested in AKS. +// If ProvisioningState == failed and power state is not running, Status.State == InstanceCreating with errorInfo populated. func TestIncreaseSizeOnVMProvisioningFailed(t *testing.T) { testCases := map[string]struct { - expectInstanceRunning bool - isMissingInstanceView bool - statuses []compute.InstanceViewStatus + expectInstanceRunning bool + isMissingInstanceView bool + statuses []compute.InstanceViewStatus + expectErrorInfoPopulated bool }{ - "out of resources when no power state exists": {}, + "out of resources when no power state exists": { + expectErrorInfoPopulated: true, + }, "out of resources when VM is stopped": { - statuses: []compute.InstanceViewStatus{{Code: to.StringPtr(vmPowerStateStopped)}}, + statuses: []compute.InstanceViewStatus{{Code: to.StringPtr(vmPowerStateStopped)}}, + expectErrorInfoPopulated: true, }, "out of resources when VM reports invalid power state": { - statuses: []compute.InstanceViewStatus{{Code: to.StringPtr("PowerState/invalid")}}, + statuses: []compute.InstanceViewStatus{{Code: to.StringPtr("PowerState/invalid")}}, + expectErrorInfoPopulated: true, }, "instance running when power state is running": { - expectInstanceRunning: true, - statuses: 
[]compute.InstanceViewStatus{{Code: to.StringPtr(vmPowerStateRunning)}}, + expectInstanceRunning: true, + statuses: []compute.InstanceViewStatus{{Code: to.StringPtr(vmPowerStateRunning)}}, + expectErrorInfoPopulated: false, }, "instance running if instance view cannot be retrieved": { - expectInstanceRunning: true, - isMissingInstanceView: true, + expectInstanceRunning: true, + isMissingInstanceView: true, + expectErrorInfoPopulated: false, }, } for testName, testCase := range testCases { @@ -410,11 +430,12 @@ func TestIncreaseSizeOnVMProvisioningFailed(t *testing.T) { assert.NoError(t, err) assert.Equal(t, 3, len(nodes)) - if testCase.expectInstanceRunning { - assert.Equal(t, cloudprovider.InstanceRunning, nodes[2].Status.State) - } else { + + assert.Equal(t, testCase.expectErrorInfoPopulated, nodes[2].Status.ErrorInfo != nil) + if testCase.expectErrorInfoPopulated { assert.Equal(t, cloudprovider.InstanceCreating, nodes[2].Status.State) - assert.Equal(t, cloudprovider.OutOfResourcesErrorClass, nodes[2].Status.ErrorInfo.ErrorClass) + } else { + assert.Equal(t, cloudprovider.InstanceRunning, nodes[2].Status.State) } }) } @@ -528,18 +549,48 @@ func TestDeleteNodes(t *testing.T) { vmssName := "test-asg" var vmssCapacity int64 = 3 - orchestrationModes := [2]compute.OrchestrationMode{compute.Uniform, compute.Flexible} - expectedVMSSVMs := newTestVMSSVMList(3) - expectedVMs := newTestVMList(3) + cases := []struct { + name string + orchestrationMode compute.OrchestrationMode + enableForceDelete bool + }{ + { + name: "uniform, force delete enabled", + orchestrationMode: compute.Uniform, + enableForceDelete: true, + }, + { + name: "uniform, force delete disabled", + orchestrationMode: compute.Uniform, + enableForceDelete: false, + }, + { + name: "flexible, force delete enabled", + orchestrationMode: compute.Flexible, + enableForceDelete: true, + }, + { + name: "flexible, force delete disabled", + orchestrationMode: compute.Flexible, + enableForceDelete: false, + }, + } - 
for _, orchMode := range orchestrationModes { + for _, tc := range cases { + orchMode := tc.orchestrationMode + enableForceDelete := tc.enableForceDelete + + expectedVMSSVMs := newTestVMSSVMList(3) + expectedVMs := newTestVMList(3) manager := newTestAzureManager(t) + manager.config.EnableForceDelete = enableForceDelete expectedScaleSets := newTestVMSSList(vmssCapacity, vmssName, "eastus", orchMode) + fmt.Printf("orchMode: %s, enableForceDelete: %t\n", orchMode, enableForceDelete) mockVMSSClient := mockvmssclient.NewMockInterface(ctrl) mockVMSSClient.EXPECT().List(gomock.Any(), manager.config.ResourceGroup).Return(expectedScaleSets, nil).Times(2) - mockVMSSClient.EXPECT().DeleteInstancesAsync(gomock.Any(), manager.config.ResourceGroup, gomock.Any(), gomock.Any(), false).Return(nil, nil) + mockVMSSClient.EXPECT().DeleteInstancesAsync(gomock.Any(), manager.config.ResourceGroup, gomock.Any(), gomock.Any(), enableForceDelete).Return(nil, nil) mockVMSSClient.EXPECT().WaitForDeleteInstancesResult(gomock.Any(), gomock.Any(), manager.config.ResourceGroup).Return(&http.Response{StatusCode: http.StatusOK}, nil).AnyTimes() manager.azClient.virtualMachineScaleSetsClient = mockVMSSClient @@ -634,20 +685,49 @@ func TestDeleteNodeUnregistered(t *testing.T) { ctrl := gomock.NewController(t) defer ctrl.Finish() - orchestrationModes := [2]compute.OrchestrationMode{compute.Uniform, compute.Flexible} - vmssName := "test-asg" var vmssCapacity int64 = 2 - expectedVMSSVMs := newTestVMSSVMList(2) - expectedVMs := newTestVMList(2) - for _, orchMode := range orchestrationModes { + cases := []struct { + name string + orchestrationMode compute.OrchestrationMode + enableForceDelete bool + }{ + { + name: "uniform, force delete enabled", + orchestrationMode: compute.Uniform, + enableForceDelete: true, + }, + { + name: "uniform, force delete disabled", + orchestrationMode: compute.Uniform, + enableForceDelete: false, + }, + { + name: "flexible, force delete enabled", + orchestrationMode: 
compute.Flexible, + enableForceDelete: true, + }, + { + name: "flexible, force delete disabled", + orchestrationMode: compute.Flexible, + enableForceDelete: false, + }, + } + + for _, tc := range cases { + orchMode := tc.orchestrationMode + enableForceDelete := tc.enableForceDelete + expectedVMSSVMs := newTestVMSSVMList(2) + expectedVMs := newTestVMList(2) + manager := newTestAzureManager(t) + manager.config.EnableForceDelete = enableForceDelete expectedScaleSets := newTestVMSSList(vmssCapacity, vmssName, "eastus", orchMode) mockVMSSClient := mockvmssclient.NewMockInterface(ctrl) mockVMSSClient.EXPECT().List(gomock.Any(), manager.config.ResourceGroup).Return(expectedScaleSets, nil).Times(2) - mockVMSSClient.EXPECT().DeleteInstancesAsync(gomock.Any(), manager.config.ResourceGroup, gomock.Any(), gomock.Any(), false).Return(nil, nil) + mockVMSSClient.EXPECT().DeleteInstancesAsync(gomock.Any(), manager.config.ResourceGroup, gomock.Any(), gomock.Any(), enableForceDelete).Return(nil, nil) mockVMSSClient.EXPECT().WaitForDeleteInstancesResult(gomock.Any(), gomock.Any(), manager.config.ResourceGroup).Return(&http.Response{StatusCode: http.StatusOK}, nil).AnyTimes() manager.azClient.virtualMachineScaleSetsClient = mockVMSSClient mockVMClient := mockvmclient.NewMockInterface(ctrl) @@ -994,7 +1074,7 @@ func TestEnableVmssFlexFlag(t *testing.T) { mockVMClient := mockvmclient.NewMockInterface(ctrl) mockVMClient.EXPECT().List(gomock.Any(), provider.azureManager.config.ResourceGroup).Return(expectedVMs, nil).AnyTimes() - mockVMClient.EXPECT().ListVmssFlexVMsWithoutInstanceView(gomock.Any(), "test-asg").Return(expectedVMs, nil).AnyTimes() + mockVMClient.EXPECT().ListVmssFlexVMsWithoutInstanceView(gomock.Any(), testASG).Return(expectedVMs, nil).AnyTimes() provider.azureManager.azClient.virtualMachinesClient = mockVMClient provider.azureManager.RegisterNodeGroup( @@ -1028,19 +1108,29 @@ func TestTemplateNodeInfo(t *testing.T) { assert.Equal(t, len(provider.NodeGroups()), 1) asg := 
ScaleSet{ - manager: provider.azureManager, - minSize: 1, - maxSize: 5, - enableDynamicInstanceList: true, + manager: provider.azureManager, + minSize: 1, + maxSize: 5, } asg.Name = "test-asg" - nodeInfo, err := asg.TemplateNodeInfo() - assert.NoError(t, err) - assert.NotNil(t, nodeInfo) - assert.NotEmpty(t, nodeInfo.Pods) + t.Run("Checking fallback to static because dynamic list is empty", func(t *testing.T) { + asg.enableDynamicInstanceList = true + + nodeInfo, err := asg.TemplateNodeInfo() + assert.NoError(t, err) + assert.NotNil(t, nodeInfo) + assert.NotEmpty(t, nodeInfo.Pods) + }) + + // Properly testing dynamic SKU list through skewer is not possible, + // because there are no Resource API mocks included yet. + // Instead, the rest of the (consumer side) tests here + // override GetVMSSTypeDynamically and GetVMSSTypeStatically functions. t.Run("Checking dynamic workflow", func(t *testing.T) { + asg.enableDynamicInstanceList = true + GetVMSSTypeDynamically = func(template compute.VirtualMachineScaleSet, azCache *azureCache) (InstanceType, error) { vmssType := InstanceType{} vmssType.VCPU = 1 @@ -1057,6 +1147,8 @@ func TestTemplateNodeInfo(t *testing.T) { }) t.Run("Checking static workflow if dynamic fails", func(t *testing.T) { + asg.enableDynamicInstanceList = true + GetVMSSTypeDynamically = func(template compute.VirtualMachineScaleSet, azCache *azureCache) (InstanceType, error) { return InstanceType{}, fmt.Errorf("dynamic error exists") } @@ -1076,6 +1168,8 @@ func TestTemplateNodeInfo(t *testing.T) { }) t.Run("Fails to find vmss instance information using static and dynamic workflow, instance not supported", func(t *testing.T) { + asg.enableDynamicInstanceList = true + GetVMSSTypeDynamically = func(template compute.VirtualMachineScaleSet, azCache *azureCache) (InstanceType, error) { return InstanceType{}, fmt.Errorf("dynamic error exists") } @@ -1087,8 +1181,9 @@ func TestTemplateNodeInfo(t *testing.T) { assert.Equal(t, err, fmt.Errorf("static error 
exists")) }) - // Note: This test should be removed once enableDynamicInstanceList toggled is removed and the feature is completely enabled. - t.Run("Checking static workflow if enableDynamicInstanceList Toggle is false", func(t *testing.T) { + // Note: static-only workflow tests can be removed once support for dynamic is always on + + t.Run("Checking static-only workflow", func(t *testing.T) { asg.enableDynamicInstanceList = false GetVMSSTypeStatically = func(template compute.VirtualMachineScaleSet) (*InstanceType, error) { @@ -1105,6 +1200,16 @@ func TestTemplateNodeInfo(t *testing.T) { assert.NotNil(t, nodeInfo) assert.NotEmpty(t, nodeInfo.Pods) }) + + t.Run("Checking static-only workflow with built-in SKU list", func(t *testing.T) { + asg.enableDynamicInstanceList = false + + nodeInfo, err := asg.TemplateNodeInfo() + assert.NoError(t, err) + assert.NotNil(t, nodeInfo) + assert.NotEmpty(t, nodeInfo.Pods) + }) + } func TestCseErrors(t *testing.T) { errorMessage := to.StringPtr("Error Message Test") diff --git a/cluster-autoscaler/cloudprovider/azure/azure_template.go b/cluster-autoscaler/cloudprovider/azure/azure_template.go index ef15e99c0838..39afb09af222 100644 --- a/cluster-autoscaler/cloudprovider/azure/azure_template.go +++ b/cluster-autoscaler/cloudprovider/azure/azure_template.go @@ -113,7 +113,6 @@ func buildNodeFromTemplate(nodeGroupName string, template compute.VirtualMachine // GenericLabels node.Labels = cloudprovider.JoinStringMaps(node.Labels, buildGenericLabels(template, nodeName)) - // Labels from the Scale Set's Tags node.Labels = cloudprovider.JoinStringMaps(node.Labels, extractLabelsFromScaleSet(template.Tags)) diff --git a/cluster-autoscaler/cloudprovider/azure/azure_util.go b/cluster-autoscaler/cloudprovider/azure/azure_util.go index a912f286bcdb..9f4a44284ed2 100644 --- a/cluster-autoscaler/cloudprovider/azure/azure_util.go +++ b/cluster-autoscaler/cloudprovider/azure/azure_util.go @@ -35,6 +35,7 @@ import ( 
"github.com/Azure/go-autorest/autorest" "github.com/Azure/go-autorest/autorest/to" + "k8s.io/autoscaler/cluster-autoscaler/cloudprovider" "k8s.io/autoscaler/cluster-autoscaler/version" klog "k8s.io/klog/v2" "sigs.k8s.io/cloud-provider-azure/pkg/retry" @@ -63,6 +64,8 @@ const ( // CSE Extension checks vmssCSEExtensionName = "vmssCSE" vmssExtensionProvisioningFailed = "VMExtensionProvisioningFailed" + // vmExtensionProvisioningErrorClass represents a Vm extension provisioning error + vmExtensionProvisioningErrorClass cloudprovider.InstanceErrorClass = 103 // resource ids nsgID = "nsgID" diff --git a/cluster-autoscaler/cloudprovider/azure/testdata/test.pfx b/cluster-autoscaler/cloudprovider/azure/testdata/test.pfx old mode 100755 new mode 100644 diff --git a/cluster-autoscaler/cloudprovider/azure/testdata/testnopassword.pfx b/cluster-autoscaler/cloudprovider/azure/testdata/testnopassword.pfx old mode 100755 new mode 100644