diff --git a/cluster-autoscaler/cloudprovider/azure/azure_scale_set.go b/cluster-autoscaler/cloudprovider/azure/azure_scale_set.go index 7cff4e1af63c..3b7978c5868b 100644 --- a/cluster-autoscaler/cloudprovider/azure/azure_scale_set.go +++ b/cluster-autoscaler/cloudprovider/azure/azure_scale_set.go @@ -295,7 +295,7 @@ func (scaleSet *ScaleSet) GetScaleSetVms() ([]compute.VirtualMachineScaleSetVM, defer cancel() resourceGroup := scaleSet.manager.config.ResourceGroup - vmList, rerr := scaleSet.manager.azClient.virtualMachineScaleSetVMsClient.List(ctx, resourceGroup, scaleSet.Name, "") + vmList, rerr := scaleSet.manager.azClient.virtualMachineScaleSetVMsClient.List(ctx, resourceGroup, scaleSet.Name, "instanceView") klog.V(4).Infof("GetScaleSetVms: scaleSet.Name: %s, vmList: %v", scaleSet.Name, vmList) if rerr != nil { klog.Errorf("VirtualMachineScaleSetVMsClient.List failed for %s: %v", scaleSet.Name, rerr) @@ -612,18 +612,26 @@ func buildInstanceCache(vmList interface{}) []cloudprovider.Instance { switch vms := vmList.(type) { case []compute.VirtualMachineScaleSetVM: for _, vm := range vms { - addInstanceToCache(&instances, vm.ID, vm.ProvisioningState) + powerState := vmPowerStateRunning + if vm.InstanceView != nil && vm.InstanceView.Statuses != nil { + powerState = vmPowerStateFromStatuses(*vm.InstanceView.Statuses) + } + addInstanceToCache(&instances, vm.ID, vm.ProvisioningState, powerState) } case []compute.VirtualMachine: for _, vm := range vms { - addInstanceToCache(&instances, vm.ID, vm.ProvisioningState) + powerState := vmPowerStateRunning + if vm.InstanceView != nil && vm.InstanceView.Statuses != nil { + powerState = vmPowerStateFromStatuses(*vm.InstanceView.Statuses) + } + addInstanceToCache(&instances, vm.ID, vm.ProvisioningState, powerState) } } return instances } -func addInstanceToCache(instances *[]cloudprovider.Instance, id *string, provisioningState *string) { +func addInstanceToCache(instances *[]cloudprovider.Instance, id *string, provisioningState *string, powerState string) { // The resource ID is empty string, which indicates the instance may be in deleting state. if len(*id) == 0 { return @@ -638,7 +646,7 @@ func addInstanceToCache(instances *[]cloudprovider.Instance, id *string, provisi *instances = append(*instances, cloudprovider.Instance{ Id: "azure://" + resourceID, - Status: instanceStatusFromProvisioningState(provisioningState), + Status: instanceStatusFromProvisioningStateAndPowerState(resourceID, provisioningState, powerState), }) } @@ -665,13 +673,13 @@ func (scaleSet *ScaleSet) setInstanceStatusByProviderID(providerID string, statu scaleSet.lastInstanceRefresh = time.Now() } -// instanceStatusFromProvisioningState converts the VM provisioning state to cloudprovider.InstanceStatus -func instanceStatusFromProvisioningState(provisioningState *string) *cloudprovider.InstanceStatus { +// instanceStatusFromProvisioningStateAndPowerState converts the VM provisioning state and power state to cloudprovider.InstanceStatus +func instanceStatusFromProvisioningStateAndPowerState(resourceId string, provisioningState *string, powerState string) *cloudprovider.InstanceStatus { if provisioningState == nil { return nil } - klog.V(5).Infof("Getting vm instance provisioning state %s", *provisioningState) + klog.V(5).Infof("Getting vm instance provisioning state %s for %s", *provisioningState, resourceId) status := &cloudprovider.InstanceStatus{} switch *provisioningState { @@ -680,11 +688,21 @@ func instanceStatusFromProvisioningState(provisioningState *string) *cloudprovid case string(compute.ProvisioningStateCreating): status.State = cloudprovider.InstanceCreating case string(compute.ProvisioningStateFailed): - status.State = cloudprovider.InstanceCreating - status.ErrorInfo = &cloudprovider.InstanceErrorInfo{ - ErrorClass: cloudprovider.OutOfResourcesErrorClass, - ErrorCode: "provisioning-state-failed", - ErrorMessage: "Azure failed to provision a node for this node group", + // Provisioning can fail both during instance creation or after the instance is running. + // Per https://learn.microsoft.com/en-us/azure/virtual-machines/states-billing#provisioning-states, + // ProvisioningState represents the most recent provisioning state, therefore only report + // InstanceCreating errors when the power state indicates the instance has not yet started running + if !isRunningVmPowerState(powerState) { + klog.V(4).Infof("VM %s reports failed provisioning state with non-running power state: %s", resourceId, powerState) + status.State = cloudprovider.InstanceCreating + status.ErrorInfo = &cloudprovider.InstanceErrorInfo{ + ErrorClass: cloudprovider.OutOfResourcesErrorClass, + ErrorCode: "provisioning-state-failed", + ErrorMessage: "Azure failed to provision a node for this node group", + } + } else { + klog.V(5).Infof("VM %s reports a failed provisioning state but is running (%s)", resourceId, powerState) + status.State = cloudprovider.InstanceRunning } default: status.State = cloudprovider.InstanceRunning diff --git a/cluster-autoscaler/cloudprovider/azure/azure_scale_set_test.go b/cluster-autoscaler/cloudprovider/azure/azure_scale_set_test.go index 0897fe25b273..0f14a75a5b93 100644 --- a/cluster-autoscaler/cloudprovider/azure/azure_scale_set_test.go +++ b/cluster-autoscaler/cloudprovider/azure/azure_scale_set_test.go @@ -226,45 +226,77 @@ func TestIncreaseSize(t *testing.T) { } func TestIncreaseSizeOnVMProvisioningFailed(t *testing.T) { - ctrl := gomock.NewController(t) - defer ctrl.Finish() - - manager := newTestAzureManager(t) - vmssName := "vmss-failed-upscale" - - expectedScaleSets := newTestVMSSList(3, "vmss-failed-upscale", "eastus", compute.Uniform) - expectedVMSSVMs := newTestVMSSVMList(3) - expectedVMSSVMs[2].ProvisioningState = to.StringPtr(string(compute.ProvisioningStateFailed)) - - mockVMSSClient := mockvmssclient.NewMockInterface(ctrl) - mockVMSSClient.EXPECT().List(gomock.Any(), manager.config.ResourceGroup).Return(expectedScaleSets, nil) - mockVMSSClient.EXPECT().CreateOrUpdateAsync(gomock.Any(), manager.config.ResourceGroup, vmssName, gomock.Any()).Return(nil, nil) - mockVMSSClient.EXPECT().WaitForCreateOrUpdateResult(gomock.Any(), gomock.Any(), manager.config.ResourceGroup).Return(&http.Response{StatusCode: http.StatusOK}, nil).AnyTimes() - manager.azClient.virtualMachineScaleSetsClient = mockVMSSClient - mockVMSSVMClient := mockvmssvmclient.NewMockInterface(ctrl) - mockVMSSVMClient.EXPECT().List(gomock.Any(), manager.config.ResourceGroup, "vmss-failed-upscale", gomock.Any()).Return(expectedVMSSVMs, nil).AnyTimes() - manager.azClient.virtualMachineScaleSetVMsClient = mockVMSSVMClient - manager.explicitlyConfigured["vmss-failed-upscale"] = true - registered := manager.RegisterNodeGroup(newTestScaleSet(manager, vmssName)) - assert.True(t, registered) - manager.Refresh() - - provider, err := BuildAzureCloudProvider(manager, nil) - assert.NoError(t, err) - - scaleSet, ok := provider.NodeGroups()[0].(*ScaleSet) - assert.True(t, ok) - - // Increase size by one, but the new node fails provisioning - err = scaleSet.IncreaseSize(1) - assert.NoError(t, err) - - nodes, err := scaleSet.Nodes() - assert.NoError(t, err) - - assert.Equal(t, 3, len(nodes)) - assert.Equal(t, cloudprovider.InstanceCreating, nodes[2].Status.State) - assert.Equal(t, cloudprovider.OutOfResourcesErrorClass, nodes[2].Status.ErrorInfo.ErrorClass) + testCases := map[string]struct { + expectInstanceRunning bool + isMissingInstanceView bool + statuses []compute.InstanceViewStatus + }{ + "out of resources when no power state exists": {}, + "out of resources when VM is stopped": { + statuses: []compute.InstanceViewStatus{{Code: to.StringPtr(vmPowerStateStopped)}}, + }, + "out of resources when VM reports invalid power state": { + statuses: []compute.InstanceViewStatus{{Code: to.StringPtr("PowerState/invalid")}}, + }, + "instance running when power state is running": { + expectInstanceRunning: true, + statuses: []compute.InstanceViewStatus{{Code: to.StringPtr(vmPowerStateRunning)}}, + }, + "instance running if instance view cannot be retrieved": { + expectInstanceRunning: true, + isMissingInstanceView: true, + }, + } + for testName, testCase := range testCases { + t.Run(testName, func(t *testing.T) { + ctrl := gomock.NewController(t) + defer ctrl.Finish() + + manager := newTestAzureManager(t) + vmssName := "vmss-failed-upscale" + + expectedScaleSets := newTestVMSSList(3, "vmss-failed-upscale", "eastus", compute.Uniform) + expectedVMSSVMs := newTestVMSSVMList(3) + expectedVMSSVMs[2].ProvisioningState = to.StringPtr(string(compute.ProvisioningStateFailed)) + if !testCase.isMissingInstanceView { + expectedVMSSVMs[2].InstanceView = &compute.VirtualMachineScaleSetVMInstanceView{Statuses: &testCase.statuses} + } + + mockVMSSClient := mockvmssclient.NewMockInterface(ctrl) + mockVMSSClient.EXPECT().List(gomock.Any(), manager.config.ResourceGroup).Return(expectedScaleSets, nil) + mockVMSSClient.EXPECT().CreateOrUpdateAsync(gomock.Any(), manager.config.ResourceGroup, vmssName, gomock.Any()).Return(nil, nil) + mockVMSSClient.EXPECT().WaitForCreateOrUpdateResult(gomock.Any(), gomock.Any(), manager.config.ResourceGroup).Return(&http.Response{StatusCode: http.StatusOK}, nil).AnyTimes() + manager.azClient.virtualMachineScaleSetsClient = mockVMSSClient + mockVMSSVMClient := mockvmssvmclient.NewMockInterface(ctrl) + mockVMSSVMClient.EXPECT().List(gomock.Any(), manager.config.ResourceGroup, "vmss-failed-upscale", gomock.Any()).Return(expectedVMSSVMs, nil).AnyTimes() + manager.azClient.virtualMachineScaleSetVMsClient = mockVMSSVMClient + manager.explicitlyConfigured["vmss-failed-upscale"] = true + registered := manager.RegisterNodeGroup(newTestScaleSet(manager, vmssName)) + assert.True(t, registered) + manager.Refresh() + + provider, err := BuildAzureCloudProvider(manager, nil) + assert.NoError(t, err) + + scaleSet, ok := provider.NodeGroups()[0].(*ScaleSet) + assert.True(t, ok) + + // Increase size by one, but the new node fails provisioning + err = scaleSet.IncreaseSize(1) + assert.NoError(t, err) + + nodes, err := scaleSet.Nodes() + assert.NoError(t, err) + + assert.Equal(t, 3, len(nodes)) + if testCase.expectInstanceRunning { + assert.Equal(t, cloudprovider.InstanceRunning, nodes[2].Status.State) + } else { + assert.Equal(t, cloudprovider.InstanceCreating, nodes[2].Status.State) + assert.Equal(t, cloudprovider.OutOfResourcesErrorClass, nodes[2].Status.ErrorInfo.ErrorClass) + } + }) + } } func TestIncreaseSizeOnVMSSUpdating(t *testing.T) { diff --git a/cluster-autoscaler/cloudprovider/azure/azure_util.go b/cluster-autoscaler/cloudprovider/azure/azure_util.go index 08d8c749a88d..5a389b738c63 100644 --- a/cluster-autoscaler/cloudprovider/azure/azure_util.go +++ b/cluster-autoscaler/cloudprovider/azure/azure_util.go @@ -82,6 +82,16 @@ const ( nodeTaintTagName = "k8s.io_cluster-autoscaler_node-template_taint_" nodeResourcesTagName = "k8s.io_cluster-autoscaler_node-template_resources_" nodeOptionsTagName = "k8s.io_cluster-autoscaler_node-template_autoscaling-options_" + + // PowerStates reflect the operational state of a VM + // From https://learn.microsoft.com/en-us/java/api/com.microsoft.azure.management.compute.powerstate?view=azure-java-stable + vmPowerStateStarting = "PowerState/starting" + vmPowerStateRunning = "PowerState/running" + vmPowerStateStopping = "PowerState/stopping" + vmPowerStateStopped = "PowerState/stopped" + vmPowerStateDeallocating = "PowerState/deallocating" + vmPowerStateDeallocated = "PowerState/deallocated" + vmPowerStateUnknown = "PowerState/unknown" ) var ( @@ -608,3 +618,32 @@ func isAzureRequestsThrottled(rerr *retry.Error) bool { return rerr.HTTPStatusCode == http.StatusTooManyRequests } + +func isRunningVmPowerState(powerState string) bool { + return powerState == vmPowerStateRunning || powerState == vmPowerStateStarting +} + +func isKnownVmPowerState(powerState string) bool { + knownPowerStates := map[string]bool{ + vmPowerStateStarting: true, + vmPowerStateRunning: true, + vmPowerStateStopping: true, + vmPowerStateStopped: true, + vmPowerStateDeallocating: true, + vmPowerStateDeallocated: true, + vmPowerStateUnknown: true, + } + return knownPowerStates[powerState] +} + +func vmPowerStateFromStatuses(statuses []compute.InstanceViewStatus) string { + for _, status := range statuses { + if status.Code == nil || !isKnownVmPowerState(*status.Code) { + continue + } + return *status.Code + } + + // PowerState is not set if the VM is still creating (or has failed creation) + return vmPowerStateUnknown +}