From 1612ef548b0aad47202c37ee1bedc49d46f4ca1d Mon Sep 17 00:00:00 2001 From: Rachel Gregory Date: Wed, 4 Sep 2024 20:23:06 +0000 Subject: [PATCH 1/3] Dynamic listing of SKUs (master) --- .../cloudprovider/azure/README.md | 3 - .../cloudprovider/azure/azure_cache.go | 58 ++++++++++--------- .../cloudprovider/azure/azure_config.go | 7 --- .../cloudprovider/azure/azure_instance.go | 8 +++ .../cloudprovider/azure/azure_manager.go | 3 + .../cloudprovider/azure/azure_scale_set.go | 14 ++--- .../azure/azure_scale_set_test.go | 12 ---- .../cloudprovider/azure/azure_template.go | 28 ++++----- 8 files changed, 58 insertions(+), 75 deletions(-) diff --git a/cluster-autoscaler/cloudprovider/azure/README.md b/cluster-autoscaler/cloudprovider/azure/README.md index bfe964fe176f..9fd1325b26e0 100644 --- a/cluster-autoscaler/cloudprovider/azure/README.md +++ b/cluster-autoscaler/cloudprovider/azure/README.md @@ -185,11 +185,8 @@ A configurable jitter (`AZURE_VMSS_VMS_CACHE_JITTER` environment variable, defau | vmssVmsCacheTTL | 300 | AZURE_VMSS_VMS_CACHE_TTL | vmssVmsCacheTTL | | vmssVmsCacheJitter | 0 | AZURE_VMSS_VMS_CACHE_JITTER | vmssVmsCacheJitter | -The `AZURE_ENABLE_DYNAMIC_INSTANCE_LIST` environment variable enables workflow that fetched SKU information dynamically using SKU API calls. By default, it uses static list of SKUs. - | Config Name | Default | Environment Variable | Cloud Config File | |---------------------------|---------|------------------------------------|---------------------------| -| enableDynamicInstanceList | false | AZURE_ENABLE_DYNAMIC_INSTANCE_LIST | enableDynamicInstanceList | The `AZURE_ENABLE_VMSS_FLEX` environment variable enables VMSS Flex support. By default, support is disabled. diff --git a/cluster-autoscaler/cloudprovider/azure/azure_cache.go b/cluster-autoscaler/cloudprovider/azure/azure_cache.go index c8334ac78771..0b9d932131f5 100644 --- a/cluster-autoscaler/cloudprovider/azure/azure_cache.go +++ b/cluster-autoscaler/cloudprovider/azure/azure_cache.go @@ -18,6 +18,7 @@ package azure import ( "context" + "errors" "reflect" "regexp" "strings" @@ -96,7 +97,7 @@ type azureCache struct { unownedInstances map[azureRef]bool autoscalingOptions map[azureRef]map[string]string - skus map[string]*skewer.Cache + skus *skewer.Cache } func newAzureCache(client *azClient, cacheTTL time.Duration, config Config) (*azureCache, error) { @@ -113,17 +114,17 @@ func newAzureCache(client *azClient, cacheTTL time.Duration, config Config) (*az instanceToNodeGroup: make(map[azureRef]cloudprovider.NodeGroup), unownedInstances: make(map[azureRef]bool), autoscalingOptions: make(map[azureRef]map[string]string), - skus: make(map[string]*skewer.Cache), - } - - if config.EnableDynamicInstanceList { - cache.skus[config.Location] = &skewer.Cache{} + skus: &skewer.Cache{}, } if err := cache.regenerate(); err != nil { klog.Errorf("Error while regenerating Azure cache: %v", err) } + if err := cache.fetchSKUCache(config.Location); err != nil { + klog.Errorf("Error while populating SKU list: %v", err) + } + return cache, nil } @@ -186,21 +187,11 @@ func (m *azureCache) regenerate() error { newAutoscalingOptions[ref] = options } - newSkuCache := make(map[string]*skewer.Cache) - for location := range m.skus { - cache, err := m.fetchSKUs(context.Background(), location) - if err != nil { - return err - } - newSkuCache[location] = cache - } - m.mutex.Lock() defer m.mutex.Unlock() m.instanceToNodeGroup = newInstanceToNodeGroupCache m.autoscalingOptions = newAutoscalingOptions - m.skus = newSkuCache // Reset unowned instances cache. m.unownedInstances = make(map[azureRef]bool) @@ -208,6 +199,18 @@ func (m *azureCache) regenerate() error { return nil } +func (m *azureCache) fetchSKUCache(location string) error { + m.mutex.Lock() + defer m.mutex.Unlock() + + cache, err := m.fetchSKUs(context.Background(), location) + if err != nil { + return err + } + m.skus = cache + return nil +} + // fetchAzureResources retrieves and updates the cached Azure resources. // // This function performs the following: @@ -359,27 +362,26 @@ func (m *azureCache) Unregister(nodeGroup cloudprovider.NodeGroup) bool { } func (m *azureCache) fetchSKUs(ctx context.Context, location string) (*skewer.Cache, error) { + if location == "" { + return nil, errors.New("location not specified") + } + return skewer.NewCache(ctx, skewer.WithLocation(location), skewer.WithResourceClient(m.azClient.skuClient), ) } +// HasVMSKUS returns true if the cache has any VM SKUs. Can be used to judge success of loading. +func (m *azureCache) HasVMSKUs() bool { + // not nil or empty (using the only exposed semi-efficient way to check) + return !(m.skus == nil || m.skus.Equal(&skewer.Cache{})) +} + func (m *azureCache) GetSKU(ctx context.Context, skuName, location string) (skewer.SKU, error) { m.mutex.Lock() defer m.mutex.Unlock() - - cache, ok := m.skus[location] - if !ok { - var err error - cache, err = m.fetchSKUs(ctx, location) - if err != nil { - klog.V(1).Infof("Failed to instantiate cache, err: %v", err) - return skewer.SKU{}, err - } - m.skus[location] = cache - } - + cache := m.skus return cache.Get(ctx, skuName, skewer.VirtualMachines, location) } diff --git a/cluster-autoscaler/cloudprovider/azure/azure_config.go b/cluster-autoscaler/cloudprovider/azure/azure_config.go index 49d0bdf32d4d..bb4ed339f44b 100644 --- a/cluster-autoscaler/cloudprovider/azure/azure_config.go +++ b/cluster-autoscaler/cloudprovider/azure/azure_config.go @@ -86,9 +86,6 @@ type Config struct { // EnableForceDelete defines whether to enable force deletion on the APIs EnableForceDelete bool `json:"enableForceDelete,omitempty" yaml:"enableForceDelete,omitempty"` - // EnableDynamicInstanceList defines whether to enable dynamic instance workflow for instance information check - EnableDynamicInstanceList bool `json:"enableDynamicInstanceList,omitempty" yaml:"enableDynamicInstanceList,omitempty"` - // (DEPRECATED, DO NOT USE) EnableDetailedCSEMessage defines whether to emit error messages in the CSE error body info EnableDetailedCSEMessage bool `json:"enableDetailedCSEMessage,omitempty" yaml:"enableDetailedCSEMessage,omitempty"` @@ -114,7 +111,6 @@ func BuildAzureConfig(configReader io.Reader) (*Config, error) { cfg := &Config{} // Static defaults - cfg.EnableDynamicInstanceList = false cfg.EnableVmssFlexNodes = false cfg.CloudProviderBackoffRetries = providerazureconsts.BackoffRetriesDefault cfg.CloudProviderBackoffExponent = providerazureconsts.BackoffExponentDefault @@ -247,9 +243,6 @@ func BuildAzureConfig(configReader io.Reader) (*Config, error) { if _, err = assignBoolFromEnvIfExists(&cfg.EnableForceDelete, "AZURE_ENABLE_FORCE_DELETE"); err != nil { return nil, err } - if _, err = assignBoolFromEnvIfExists(&cfg.EnableDynamicInstanceList, "AZURE_ENABLE_DYNAMIC_INSTANCE_LIST"); err != nil { - return nil, err - } if _, err = assignBoolFromEnvIfExists(&cfg.EnableVmssFlexNodes, "AZURE_ENABLE_VMSS_FLEX_NODES"); err != nil { return nil, err } diff --git a/cluster-autoscaler/cloudprovider/azure/azure_instance.go b/cluster-autoscaler/cloudprovider/azure/azure_instance.go index 9a04c441b51f..b8a79648c7c1 100644 --- a/cluster-autoscaler/cloudprovider/azure/azure_instance.go +++ b/cluster-autoscaler/cloudprovider/azure/azure_instance.go @@ -18,6 +18,7 @@ package azure import ( "context" + "errors" "fmt" "regexp" "strings" @@ -64,6 +65,13 @@ var GetVMSSTypeDynamically = func(template compute.VirtualMachineScaleSet, azCac ctx := context.Background() var vmssType InstanceType + // We will only use static SKU list if dynamic listing fails + if !azCache.HasVMSKUs() { + msg := "no vm sku info loaded, using only static sku list" + klog.Warning(msg) + return vmssType, errors.New(msg) + } + sku, err := azCache.GetSKU(ctx, *template.Sku.Name, *template.Location) if err != nil { // We didn't find an exact match but this is a promo type, check for matching standard diff --git a/cluster-autoscaler/cloudprovider/azure/azure_manager.go b/cluster-autoscaler/cloudprovider/azure/azure_manager.go index 4a60051ffe18..f4d90473ac51 100644 --- a/cluster-autoscaler/cloudprovider/azure/azure_manager.go +++ b/cluster-autoscaler/cloudprovider/azure/azure_manager.go @@ -104,6 +104,8 @@ func createAzureManagerInternal(configReader io.Reader, discoveryOpts cloudprovi if cfg.VmssCacheTTLInSeconds != 0 { cacheTTL = time.Duration(cfg.VmssCacheTTLInSeconds) * time.Second } + + // We always want to generate the cache dynamically cache, err := newAzureCache(azClient, cacheTTL, *cfg) if err != nil { return nil, err @@ -128,6 +130,7 @@ func createAzureManagerInternal(configReader io.Reader, discoveryOpts cloudprovi Cap: 10 * time.Minute, } + // skuCache will already be created at this step by newAzureCache() err = kretry.OnError(retryBackoff, retry.IsErrorRetriable, func() (err error) { return manager.forceRefresh() }) diff --git a/cluster-autoscaler/cloudprovider/azure/azure_scale_set.go b/cluster-autoscaler/cloudprovider/azure/azure_scale_set.go index 68a64c672035..e9963ff5b818 100644 --- a/cluster-autoscaler/cloudprovider/azure/azure_scale_set.go +++ b/cluster-autoscaler/cloudprovider/azure/azure_scale_set.go @@ -60,9 +60,8 @@ type ScaleSet struct { minSize int maxSize int - enableForceDelete bool - enableDynamicInstanceList bool - enableDetailedCSEMessage bool + enableForceDelete bool + enableDetailedCSEMessage bool // Current Size (Number of VMs) @@ -106,10 +105,9 @@ func NewScaleSet(spec *dynamic.NodeGroupSpec, az *AzureManager, curSize int64, d instancesRefreshJitter: az.config.VmssVmsCacheJitter, }, - enableForceDelete: az.config.EnableForceDelete, - enableDynamicInstanceList: az.config.EnableDynamicInstanceList, - enableDetailedCSEMessage: az.config.EnableDetailedCSEMessage, - dedicatedHost: dedicatedHost, + enableForceDelete: az.config.EnableForceDelete, + enableDetailedCSEMessage: az.config.EnableDetailedCSEMessage, + dedicatedHost: dedicatedHost, } if az.config.VmssVirtualMachinesCacheTTLInSeconds != 0 { @@ -635,7 +633,7 @@ func (scaleSet *ScaleSet) TemplateNodeInfo() (*schedulerframework.NodeInfo, erro inputLabels := map[string]string{} inputTaints := "" - node, err := buildNodeFromTemplate(scaleSet.Name, inputLabels, inputTaints, template, scaleSet.manager, scaleSet.enableDynamicInstanceList) + node, err := buildNodeFromTemplate(scaleSet.Name, inputLabels, inputTaints, template, scaleSet.manager) if err != nil { return nil, err diff --git a/cluster-autoscaler/cloudprovider/azure/azure_scale_set_test.go b/cluster-autoscaler/cloudprovider/azure/azure_scale_set_test.go index b3bd5fca5b5c..9795691da834 100644 --- a/cluster-autoscaler/cloudprovider/azure/azure_scale_set_test.go +++ b/cluster-autoscaler/cloudprovider/azure/azure_scale_set_test.go @@ -1115,8 +1115,6 @@ func TestTemplateNodeInfo(t *testing.T) { asg.Name = "test-asg" t.Run("Checking fallback to static because dynamic list is empty", func(t *testing.T) { - asg.enableDynamicInstanceList = true - nodeInfo, err := asg.TemplateNodeInfo() assert.NoError(t, err) assert.NotNil(t, nodeInfo) @@ -1129,8 +1127,6 @@ func TestTemplateNodeInfo(t *testing.T) { // override GetVMSSTypeDynamically and GetVMSSTypeStatically functions. t.Run("Checking dynamic workflow", func(t *testing.T) { - asg.enableDynamicInstanceList = true - GetVMSSTypeDynamically = func(template compute.VirtualMachineScaleSet, azCache *azureCache) (InstanceType, error) { vmssType := InstanceType{} vmssType.VCPU = 1 @@ -1147,8 +1143,6 @@ func TestTemplateNodeInfo(t *testing.T) { }) t.Run("Checking static workflow if dynamic fails", func(t *testing.T) { - asg.enableDynamicInstanceList = true - GetVMSSTypeDynamically = func(template compute.VirtualMachineScaleSet, azCache *azureCache) (InstanceType, error) { return InstanceType{}, fmt.Errorf("dynamic error exists") } @@ -1168,8 +1162,6 @@ func TestTemplateNodeInfo(t *testing.T) { }) t.Run("Fails to find vmss instance information using static and dynamic workflow, instance not supported", func(t *testing.T) { - asg.enableDynamicInstanceList = true - GetVMSSTypeDynamically = func(template compute.VirtualMachineScaleSet, azCache *azureCache) (InstanceType, error) { return InstanceType{}, fmt.Errorf("dynamic error exists") } @@ -1184,8 +1176,6 @@ func TestTemplateNodeInfo(t *testing.T) { // Note: static-only workflow tests can be removed once support for dynamic is always on t.Run("Checking static-only workflow", func(t *testing.T) { - asg.enableDynamicInstanceList = false - GetVMSSTypeStatically = func(template compute.VirtualMachineScaleSet) (*InstanceType, error) { vmssType := InstanceType{} vmssType.VCPU = 1 @@ -1202,8 +1192,6 @@ func TestTemplateNodeInfo(t *testing.T) { }) t.Run("Checking static-only workflow with built-in SKU list", func(t *testing.T) { - asg.enableDynamicInstanceList = false - nodeInfo, err := asg.TemplateNodeInfo() assert.NoError(t, err) assert.NotNil(t, nodeInfo) diff --git a/cluster-autoscaler/cloudprovider/azure/azure_template.go b/cluster-autoscaler/cloudprovider/azure/azure_template.go index 9411354be35b..3bc0c23e7526 100644 --- a/cluster-autoscaler/cloudprovider/azure/azure_template.go +++ b/cluster-autoscaler/cloudprovider/azure/azure_template.go @@ -85,7 +85,7 @@ const ( ) func buildNodeFromTemplate(nodeGroupName string, inputLabels map[string]string, inputTaints string, - template compute.VirtualMachineScaleSet, manager *AzureManager, enableDynamicInstanceList bool) (*apiv1.Node, error) { + template compute.VirtualMachineScaleSet, manager *AzureManager) (*apiv1.Node, error) { node := apiv1.Node{} nodeName := fmt.Sprintf("%s-asg-%d", nodeGroupName, rand.Int63()) @@ -101,23 +101,17 @@ func buildNodeFromTemplate(nodeGroupName string, inputLabels map[string]string, var vcpu, gpuCount, memoryMb int64 - // Fetching SKU information from SKU API if enableDynamicInstanceList is true. - var dynamicErr error - if enableDynamicInstanceList { - var vmssTypeDynamic InstanceType - klog.V(1).Infof("Fetching instance information for SKU: %s from SKU API", *template.Sku.Name) - vmssTypeDynamic, dynamicErr = GetVMSSTypeDynamically(template, manager.azureCache) - if dynamicErr == nil { - vcpu = vmssTypeDynamic.VCPU - gpuCount = vmssTypeDynamic.GPU - memoryMb = vmssTypeDynamic.MemoryMb - } else { - klog.Errorf("Dynamically fetching of instance information from SKU api failed with error: %v", dynamicErr) - } - } - if !enableDynamicInstanceList || dynamicErr != nil { - klog.V(1).Infof("Falling back to static SKU list for SKU: %s", *template.Sku.Name) + // Fetching SKU information from SKU API first. + klog.V(1).Infof("Fetching instance information for SKU: %s from SKU API", *template.Sku.Name) + vmssTypeDynamic, dynamicErr := GetVMSSTypeDynamically(template, manager.azureCache) + if dynamicErr == nil { + vcpu = vmssTypeDynamic.VCPU + gpuCount = vmssTypeDynamic.GPU + memoryMb = vmssTypeDynamic.MemoryMb + } else { // fall-back on static list of vmss if dynamic workflow fails. + klog.Errorf("Dynamically fetching of instance information from SKU api failed with error: %v", dynamicErr) + klog.V(1).Infof("Falling back to static SKU list for SKU: %s", *template.Sku.Name) vmssTypeStatic, staticErr := GetVMSSTypeStatically(template) if staticErr == nil { vcpu = vmssTypeStatic.VCPU From 5ba8aecb54f971e9b5bbc59fd12f6e7c26cd57a0 Mon Sep 17 00:00:00 2001 From: Rachel Gregory Date: Thu, 5 Sep 2024 21:18:51 +0000 Subject: [PATCH 2/3] re-add enableDynamicInstanceList --- .../cloudprovider/azure/README.md | 3 ++ .../cloudprovider/azure/azure_cache.go | 8 ++++-- .../cloudprovider/azure/azure_config.go | 7 +++++ .../cloudprovider/azure/azure_instance.go | 8 ------ .../cloudprovider/azure/azure_manager.go | 7 +++-- .../cloudprovider/azure/azure_scale_set.go | 14 ++++++---- .../azure/azure_scale_set_test.go | 12 ++++++++ .../cloudprovider/azure/azure_template.go | 28 +++++++++++-------- 8 files changed, 57 insertions(+), 30 deletions(-) diff --git a/cluster-autoscaler/cloudprovider/azure/README.md b/cluster-autoscaler/cloudprovider/azure/README.md index 9fd1325b26e0..de4b5095dbcb 100644 --- a/cluster-autoscaler/cloudprovider/azure/README.md +++ b/cluster-autoscaler/cloudprovider/azure/README.md @@ -185,8 +185,11 @@ A configurable jitter (`AZURE_VMSS_VMS_CACHE_JITTER` environment variable, defau | vmssVmsCacheTTL | 300 | AZURE_VMSS_VMS_CACHE_TTL | vmssVmsCacheTTL | | vmssVmsCacheJitter | 0 | AZURE_VMSS_VMS_CACHE_JITTER | vmssVmsCacheJitter | +The `AZURE_ENABLE_DYNAMIC_INSTANCE_LIST` environment variable enables workflow that fetched SKU information dynamically using SKU API calls. By default, it uses static list of SKUs. + | Config Name | Default | Environment Variable | Cloud Config File | |---------------------------|---------|------------------------------------|---------------------------| +| enableDynamicInstanceList | false | AZURE_ENABLE_DYNAMIC_INSTANCE_LIST enableDynamicInstanceList | The `AZURE_ENABLE_VMSS_FLEX` environment variable enables VMSS Flex support. By default, support is disabled. diff --git a/cluster-autoscaler/cloudprovider/azure/azure_cache.go b/cluster-autoscaler/cloudprovider/azure/azure_cache.go index 0b9d932131f5..14ce9254e2c5 100644 --- a/cluster-autoscaler/cloudprovider/azure/azure_cache.go +++ b/cluster-autoscaler/cloudprovider/azure/azure_cache.go @@ -114,15 +114,17 @@ func newAzureCache(client *azClient, cacheTTL time.Duration, config Config) (*az instanceToNodeGroup: make(map[azureRef]cloudprovider.NodeGroup), unownedInstances: make(map[azureRef]bool), autoscalingOptions: make(map[azureRef]map[string]string), - skus: &skewer.Cache{}, + skus: &skewer.Cache{}, // populated iff config.EnableDynamicInstanceList } if err := cache.regenerate(); err != nil { klog.Errorf("Error while regenerating Azure cache: %v", err) } - if err := cache.fetchSKUCache(config.Location); err != nil { - klog.Errorf("Error while populating SKU list: %v", err) + if config.EnableDynamicInstanceList { + if err := cache.fetchSKUCache(config.Location); err != nil { + klog.Errorf("Error while populating SKU list: %v", err) + } } return cache, nil diff --git a/cluster-autoscaler/cloudprovider/azure/azure_config.go b/cluster-autoscaler/cloudprovider/azure/azure_config.go index bb4ed339f44b..98a416ed98af 100644 --- a/cluster-autoscaler/cloudprovider/azure/azure_config.go +++ b/cluster-autoscaler/cloudprovider/azure/azure_config.go @@ -86,6 +86,9 @@ type Config struct { // EnableForceDelete defines whether to enable force deletion on the APIs EnableForceDelete bool `json:"enableForceDelete,omitempty" yaml:"enableForceDelete,omitempty"` + // (DEPRECATED, DO NOT USE) EnableDynamicInstanceList defines whether to enable dynamic instance workflow for instance information check + EnableDynamicInstanceList bool `json:"enableDynamicInstanceList,omitempty" yaml:"enableDynamicInstanceList,omitempty"` + // (DEPRECATED, DO NOT USE) EnableDetailedCSEMessage defines whether to emit error messages in the CSE error body info EnableDetailedCSEMessage bool `json:"enableDetailedCSEMessage,omitempty" yaml:"enableDetailedCSEMessage,omitempty"` @@ -111,6 +114,7 @@ func BuildAzureConfig(configReader io.Reader) (*Config, error) { cfg := &Config{} // Static defaults + cfg.EnableDynamicInstanceList = false cfg.EnableVmssFlexNodes = false cfg.CloudProviderBackoffRetries = providerazureconsts.BackoffRetriesDefault cfg.CloudProviderBackoffExponent = providerazureconsts.BackoffExponentDefault @@ -243,6 +247,9 @@ func BuildAzureConfig(configReader io.Reader) (*Config, error) { if _, err = assignBoolFromEnvIfExists(&cfg.EnableForceDelete, "AZURE_ENABLE_FORCE_DELETE"); err != nil { return nil, err } + if _, err = assignBoolFromEnvIfExists(&cfg.EnableDynamicInstanceList, "AZURE_ENABLE_DYNAMIC_INSTANCE_LIST"); err != nil { + return nil, err + } if _, err = assignBoolFromEnvIfExists(&cfg.EnableVmssFlexNodes, "AZURE_ENABLE_VMSS_FLEX_NODES"); err != nil { return nil, err } diff --git a/cluster-autoscaler/cloudprovider/azure/azure_instance.go b/cluster-autoscaler/cloudprovider/azure/azure_instance.go index b8a79648c7c1..9a04c441b51f 100644 --- a/cluster-autoscaler/cloudprovider/azure/azure_instance.go +++ b/cluster-autoscaler/cloudprovider/azure/azure_instance.go @@ -18,7 +18,6 @@ package azure import ( "context" - "errors" "fmt" "regexp" "strings" @@ -65,13 +64,6 @@ var GetVMSSTypeDynamically = func(template compute.VirtualMachineScaleSet, azCac ctx := context.Background() var vmssType InstanceType - // We will only use static SKU list if dynamic listing fails - if !azCache.HasVMSKUs() { - msg := "no vm sku info loaded, using only static sku list" - klog.Warning(msg) - return vmssType, errors.New(msg) - } - sku, err := azCache.GetSKU(ctx, *template.Sku.Name, *template.Location) if err != nil { // We didn't find an exact match but this is a promo type, check for matching standard diff --git a/cluster-autoscaler/cloudprovider/azure/azure_manager.go b/cluster-autoscaler/cloudprovider/azure/azure_manager.go index f4d90473ac51..61af67d27c31 100644 --- a/cluster-autoscaler/cloudprovider/azure/azure_manager.go +++ b/cluster-autoscaler/cloudprovider/azure/azure_manager.go @@ -104,14 +104,17 @@ func createAzureManagerInternal(configReader io.Reader, discoveryOpts cloudprovi if cfg.VmssCacheTTLInSeconds != 0 { cacheTTL = time.Duration(cfg.VmssCacheTTLInSeconds) * time.Second } - - // We always want to generate the cache dynamically cache, err := newAzureCache(azClient, cacheTTL, *cfg) if err != nil { return nil, err } manager.azureCache = cache + if !manager.azureCache.HasVMSKUs() { + klog.Warning("No VM SKU info loaded, using only static SKU list") + cfg.EnableDynamicInstanceList = false + } + specs, err := ParseLabelAutoDiscoverySpecs(discoveryOpts) if err != nil { return nil, err diff --git a/cluster-autoscaler/cloudprovider/azure/azure_scale_set.go b/cluster-autoscaler/cloudprovider/azure/azure_scale_set.go index e9963ff5b818..68a64c672035 100644 --- a/cluster-autoscaler/cloudprovider/azure/azure_scale_set.go +++ b/cluster-autoscaler/cloudprovider/azure/azure_scale_set.go @@ -60,8 +60,9 @@ type ScaleSet struct { minSize int maxSize int - enableForceDelete bool - enableDetailedCSEMessage bool + enableForceDelete bool + enableDynamicInstanceList bool + enableDetailedCSEMessage bool // Current Size (Number of VMs) @@ -105,9 +106,10 @@ func NewScaleSet(spec *dynamic.NodeGroupSpec, az *AzureManager, curSize int64, d instancesRefreshJitter: az.config.VmssVmsCacheJitter, }, - enableForceDelete: az.config.EnableForceDelete, - enableDetailedCSEMessage: az.config.EnableDetailedCSEMessage, - dedicatedHost: dedicatedHost, + enableForceDelete: az.config.EnableForceDelete, + enableDynamicInstanceList: az.config.EnableDynamicInstanceList, + enableDetailedCSEMessage: az.config.EnableDetailedCSEMessage, + dedicatedHost: dedicatedHost, } if az.config.VmssVirtualMachinesCacheTTLInSeconds != 0 { @@ -633,7 +635,7 @@ func (scaleSet *ScaleSet) TemplateNodeInfo() (*schedulerframework.NodeInfo, erro inputLabels := map[string]string{} inputTaints := "" - node, err := buildNodeFromTemplate(scaleSet.Name, inputLabels, inputTaints, template, scaleSet.manager) + node, err := buildNodeFromTemplate(scaleSet.Name, inputLabels, inputTaints, template, scaleSet.manager, scaleSet.enableDynamicInstanceList) if err != nil { return nil, err diff --git a/cluster-autoscaler/cloudprovider/azure/azure_scale_set_test.go b/cluster-autoscaler/cloudprovider/azure/azure_scale_set_test.go index 9795691da834..b3bd5fca5b5c 100644 --- a/cluster-autoscaler/cloudprovider/azure/azure_scale_set_test.go +++ b/cluster-autoscaler/cloudprovider/azure/azure_scale_set_test.go @@ -1115,6 +1115,8 @@ func TestTemplateNodeInfo(t *testing.T) { asg.Name = "test-asg" t.Run("Checking fallback to static because dynamic list is empty", func(t *testing.T) { + asg.enableDynamicInstanceList = true + nodeInfo, err := asg.TemplateNodeInfo() assert.NoError(t, err) assert.NotNil(t, nodeInfo) @@ -1127,6 +1129,8 @@ func TestTemplateNodeInfo(t *testing.T) { // override GetVMSSTypeDynamically and GetVMSSTypeStatically functions. t.Run("Checking dynamic workflow", func(t *testing.T) { + asg.enableDynamicInstanceList = true + GetVMSSTypeDynamically = func(template compute.VirtualMachineScaleSet, azCache *azureCache) (InstanceType, error) { vmssType := InstanceType{} vmssType.VCPU = 1 @@ -1143,6 +1147,8 @@ func TestTemplateNodeInfo(t *testing.T) { }) t.Run("Checking static workflow if dynamic fails", func(t *testing.T) { + asg.enableDynamicInstanceList = true + GetVMSSTypeDynamically = func(template compute.VirtualMachineScaleSet, azCache *azureCache) (InstanceType, error) { return InstanceType{}, fmt.Errorf("dynamic error exists") } @@ -1162,6 +1168,8 @@ func TestTemplateNodeInfo(t *testing.T) { }) t.Run("Fails to find vmss instance information using static and dynamic workflow, instance not supported", func(t *testing.T) { + asg.enableDynamicInstanceList = true + GetVMSSTypeDynamically = func(template compute.VirtualMachineScaleSet, azCache *azureCache) (InstanceType, error) { return InstanceType{}, fmt.Errorf("dynamic error exists") } @@ -1176,6 +1184,8 @@ func TestTemplateNodeInfo(t *testing.T) { // Note: static-only workflow tests can be removed once support for dynamic is always on t.Run("Checking static-only workflow", func(t *testing.T) { + asg.enableDynamicInstanceList = false + GetVMSSTypeStatically = func(template compute.VirtualMachineScaleSet) (*InstanceType, error) { vmssType := InstanceType{} vmssType.VCPU = 1 @@ -1192,6 +1202,8 @@ func TestTemplateNodeInfo(t *testing.T) { }) t.Run("Checking static-only workflow with built-in SKU list", func(t *testing.T) { + asg.enableDynamicInstanceList = false + nodeInfo, err := asg.TemplateNodeInfo() assert.NoError(t, err) assert.NotNil(t, nodeInfo) diff --git a/cluster-autoscaler/cloudprovider/azure/azure_template.go b/cluster-autoscaler/cloudprovider/azure/azure_template.go index 3bc0c23e7526..9411354be35b 100644 --- a/cluster-autoscaler/cloudprovider/azure/azure_template.go +++ b/cluster-autoscaler/cloudprovider/azure/azure_template.go @@ -85,7 +85,7 @@ const ( ) func buildNodeFromTemplate(nodeGroupName string, inputLabels map[string]string, inputTaints string, - template compute.VirtualMachineScaleSet, manager *AzureManager) (*apiv1.Node, error) { + template compute.VirtualMachineScaleSet, manager *AzureManager, enableDynamicInstanceList bool) (*apiv1.Node, error) { node := apiv1.Node{} nodeName := fmt.Sprintf("%s-asg-%d", nodeGroupName, rand.Int63()) @@ -101,17 +101,23 @@ func buildNodeFromTemplate(nodeGroupName string, inputLabels map[string]string, var vcpu, gpuCount, memoryMb int64 - // Fetching SKU information from SKU API first. - klog.V(1).Infof("Fetching instance information for SKU: %s from SKU API", *template.Sku.Name) - vmssTypeDynamic, dynamicErr := GetVMSSTypeDynamically(template, manager.azureCache) - if dynamicErr == nil { - vcpu = vmssTypeDynamic.VCPU - gpuCount = vmssTypeDynamic.GPU - memoryMb = vmssTypeDynamic.MemoryMb - } else { - // fall-back on static list of vmss if dynamic workflow fails. - klog.Errorf("Dynamically fetching of instance information from SKU api failed with error: %v", dynamicErr) + // Fetching SKU information from SKU API if enableDynamicInstanceList is true. + var dynamicErr error + if enableDynamicInstanceList { + var vmssTypeDynamic InstanceType + klog.V(1).Infof("Fetching instance information for SKU: %s from SKU API", *template.Sku.Name) + vmssTypeDynamic, dynamicErr = GetVMSSTypeDynamically(template, manager.azureCache) + if dynamicErr == nil { + vcpu = vmssTypeDynamic.VCPU + gpuCount = vmssTypeDynamic.GPU + memoryMb = vmssTypeDynamic.MemoryMb + } else { + klog.Errorf("Dynamically fetching of instance information from SKU api failed with error: %v", dynamicErr) + } + } + if !enableDynamicInstanceList || dynamicErr != nil { klog.V(1).Infof("Falling back to static SKU list for SKU: %s", *template.Sku.Name) + // fall-back on static list of vmss if dynamic workflow fails. vmssTypeStatic, staticErr := GetVMSSTypeStatically(template) if staticErr == nil { vcpu = vmssTypeStatic.VCPU From 35304f43bfd8a55d182bf5a1f41038fbcacdf315 Mon Sep 17 00:00:00 2001 From: Rachel Gregory Date: Thu, 5 Sep 2024 21:21:15 +0000 Subject: [PATCH 3/3] update README.md --- cluster-autoscaler/cloudprovider/azure/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cluster-autoscaler/cloudprovider/azure/README.md b/cluster-autoscaler/cloudprovider/azure/README.md index de4b5095dbcb..bfe964fe176f 100644 --- a/cluster-autoscaler/cloudprovider/azure/README.md +++ b/cluster-autoscaler/cloudprovider/azure/README.md @@ -189,7 +189,7 @@ The `AZURE_ENABLE_DYNAMIC_INSTANCE_LIST` environment variable enables workflow t | Config Name | Default | Environment Variable | Cloud Config File | |---------------------------|---------|------------------------------------|---------------------------| -| enableDynamicInstanceList | false | AZURE_ENABLE_DYNAMIC_INSTANCE_LIST enableDynamicInstanceList | +| enableDynamicInstanceList | false | AZURE_ENABLE_DYNAMIC_INSTANCE_LIST | enableDynamicInstanceList | The `AZURE_ENABLE_VMSS_FLEX` environment variable enables VMSS Flex support. By default, support is disabled.