Skip to content

Commit

Permalink
kubernetes_cluster_node_pool: Fix race condition with virtual network…
Browse files Browse the repository at this point in the history
… status when creating node pool (#25888)

* Fix race condition when creating multiple node pools.

Related to #13105

```
❯  make acctests SERVICE='containers' TESTARGS='-run=TestAccKubernetesClusterNodePool_virtualNetworkOwnershipRaceCondition' TESTTIMEOUT='60m'
==> Checking that code complies with gofmt requirements...
==> Checking that Custom Timeouts are used...
==> Checking that acceptance test packages are used...
TF_ACC=1 go test -v ./internal/services/containers -run=TestAccKubernetesClusterNodePool_virtualNetworkOwnershipRaceCondition -timeout 60m -ldflags="-X=github.com/hashicorp/terraform-provider-azurerm/version.ProviderVersion=acc"
=== RUN   TestAccKubernetesClusterNodePool_virtualNetworkOwnershipRaceCondition
=== PAUSE TestAccKubernetesClusterNodePool_virtualNetworkOwnershipRaceCondition
=== CONT  TestAccKubernetesClusterNodePool_virtualNetworkOwnershipRaceCondition
--- PASS: TestAccKubernetesClusterNodePool_virtualNetworkOwnershipRaceCondition (1395.32s)
PASS
ok  	github.com/hashicorp/terraform-provider-azurerm/internal/services/containers	1398.094s
```

blah

* kubernetes_cluster_node_pool: Address PR reviews

Co-authored-by: Tom Harvey <[email protected]>
Co-authored-by: stephybun <[email protected]>

---------

Co-authored-by: Tom Harvey <[email protected]>
Co-authored-by: stephybun <[email protected]>
  • Loading branch information
3 people authored May 16, 2024
1 parent d53b100 commit 9857f2e
Show file tree
Hide file tree
Showing 2 changed files with 167 additions and 3 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -22,14 +22,17 @@ import (
"github.com/hashicorp/go-azure-sdk/resource-manager/containerservice/2023-06-02-preview/agentpools"
"github.com/hashicorp/go-azure-sdk/resource-manager/containerservice/2023-06-02-preview/managedclusters"
"github.com/hashicorp/go-azure-sdk/resource-manager/containerservice/2023-06-02-preview/snapshots"
"github.com/hashicorp/go-azure-sdk/resource-manager/network/2023-09-01/subnets"
"github.com/hashicorp/terraform-plugin-sdk/v2/helper/schema"
"github.com/hashicorp/terraform-provider-azurerm/helpers/tf"
"github.com/hashicorp/terraform-provider-azurerm/internal/clients"
"github.com/hashicorp/terraform-provider-azurerm/internal/features"
"github.com/hashicorp/terraform-provider-azurerm/internal/locks"
computeValidate "github.com/hashicorp/terraform-provider-azurerm/internal/services/compute/validate"
"github.com/hashicorp/terraform-provider-azurerm/internal/services/containers/migration"
"github.com/hashicorp/terraform-provider-azurerm/internal/services/containers/parse"
containerValidate "github.com/hashicorp/terraform-provider-azurerm/internal/services/containers/validate"
"github.com/hashicorp/terraform-provider-azurerm/internal/services/network"
"github.com/hashicorp/terraform-provider-azurerm/internal/tf/pluginsdk"
"github.com/hashicorp/terraform-provider-azurerm/internal/tf/validation"
"github.com/hashicorp/terraform-provider-azurerm/internal/timeouts"
Expand Down Expand Up @@ -423,10 +426,14 @@ func resourceKubernetesClusterNodePoolSchema() map[string]*pluginsdk.Schema {

return s
}

func resourceKubernetesClusterNodePoolCreate(d *pluginsdk.ResourceData, meta interface{}) error {
containersClient := meta.(*clients.Client).Containers
clustersClient := containersClient.KubernetesClustersClient
poolsClient := containersClient.AgentPoolsClient
subnetClient := meta.(*clients.Client).Network.Client.Subnets
vnetClient := meta.(*clients.Client).Network.VnetClient

ctx, cancel := timeouts.ForCreate(meta.(*clients.Client).StopContext, d)
defer cancel()

Expand All @@ -435,6 +442,20 @@ func resourceKubernetesClusterNodePoolCreate(d *pluginsdk.ResourceData, meta int
return err
}

var subnetID *commonids.SubnetId
if subnetIDValue, ok := d.GetOk("vnet_subnet_id"); ok {
subnetID, err = commonids.ParseSubnetID(subnetIDValue.(string))
if err != nil {
return err
}

locks.ByName(subnetID.VirtualNetworkName, network.VirtualNetworkResourceName)
defer locks.UnlockByName(subnetID.VirtualNetworkName, network.VirtualNetworkResourceName)

locks.ByName(subnetID.SubnetName, network.SubnetResourceName)
defer locks.UnlockByName(subnetID.SubnetName, network.SubnetResourceName)
}

id := agentpools.NewAgentPoolID(clusterId.SubscriptionId, clusterId.ResourceGroupName, clusterId.ManagedClusterName, d.Get("name").(string))

log.Printf("[DEBUG] Retrieving %s...", *clusterId)
Expand Down Expand Up @@ -602,8 +623,8 @@ func resourceKubernetesClusterNodePoolCreate(d *pluginsdk.ResourceData, meta int
profile.PodSubnetID = utils.String(podSubnetID)
}

if vnetSubnetID := d.Get("vnet_subnet_id").(string); vnetSubnetID != "" {
profile.VnetSubnetID = utils.String(vnetSubnetID)
if subnetID != nil {
profile.VnetSubnetID = utils.String(subnetID.ID())
}

if hostGroupID := d.Get("host_group_id").(string); hostGroupID != "" {
Expand Down Expand Up @@ -677,6 +698,38 @@ func resourceKubernetesClusterNodePoolCreate(d *pluginsdk.ResourceData, meta int
return fmt.Errorf("creating %s: %+v", id, err)
}

if subnetID != nil {
// Wait for vnet to come back to Succeeded before releasing any locks
timeout, ok := ctx.Deadline()
if !ok {
return fmt.Errorf("internal-error: context had no deadline")
}

// TODO: refactor this into a `custompoller` within the `network` package
stateConf := &pluginsdk.StateChangeConf{
Pending: []string{string(subnets.ProvisioningStateUpdating)},
Target: []string{string(subnets.ProvisioningStateSucceeded)},
Refresh: network.SubnetProvisioningStateRefreshFunc(ctx, subnetClient, *subnetID),
MinTimeout: 1 * time.Minute,
Timeout: time.Until(timeout),
}
if _, err = stateConf.WaitForStateContext(ctx); err != nil {
return fmt.Errorf("waiting for provisioning state of subnet for AKS Node Pool creation %s: %+v", *subnetID, err)
}

vnetId := commonids.NewVirtualNetworkID(subnetID.SubscriptionId, subnetID.ResourceGroupName, subnetID.VirtualNetworkName)
vnetStateConf := &pluginsdk.StateChangeConf{
Pending: []string{string(subnets.ProvisioningStateUpdating)},
Target: []string{string(subnets.ProvisioningStateSucceeded)},
Refresh: network.VirtualNetworkProvisioningStateRefreshFunc(ctx, vnetClient, vnetId),
MinTimeout: 1 * time.Minute,
Timeout: time.Until(timeout),
}
if _, err = vnetStateConf.WaitForStateContext(ctx); err != nil {
return fmt.Errorf("waiting for provisioning state of virtual network for AKS Node Pool creation %s: %+v", vnetId, err)
}
}

d.SetId(id.ID())
return resourceKubernetesClusterNodePoolRead(d, meta)
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1131,7 +1131,6 @@ func (KubernetesClusterNodePoolResource) scaleNodePool(nodeCount int) acceptance
nodePoolName := state.Attributes["name"]
kubernetesClusterId := state.Attributes["kubernetes_cluster_id"]
parsedK8sId, err := commonids.ParseKubernetesClusterID(kubernetesClusterId)

if err != nil {
return fmt.Errorf("parsing kubernetes cluster id: %+v", err)
}
Expand Down Expand Up @@ -1167,6 +1166,21 @@ func (KubernetesClusterNodePoolResource) scaleNodePool(nodeCount int) acceptance
}
}

func TestAccKubernetesClusterNodePool_virtualNetworkOwnershipRaceCondition(t *testing.T) {
data := acceptance.BuildTestData(t, "azurerm_kubernetes_cluster_node_pool", "test1")
r := KubernetesClusterNodePoolResource{}

data.ResourceTest(t, r, []acceptance.TestStep{
{
Config: r.virtualNetworkOwnershipRaceCondition(data),
Check: acceptance.ComposeTestCheckFunc(
check.That("azurerm_kubernetes_cluster_node_pool.test2").ExistsInAzure(r),
check.That("azurerm_kubernetes_cluster_node_pool.test3").ExistsInAzure(r),
),
},
})
}

func (r KubernetesClusterNodePoolResource) autoScaleConfig(data acceptance.TestData) string {
return fmt.Sprintf(`
provider "azurerm" {
Expand Down Expand Up @@ -2945,3 +2959,100 @@ resource "azurerm_kubernetes_cluster_node_pool" "test" {
}
`, data.Locations.Primary, data.RandomInteger)
}

func (KubernetesClusterNodePoolResource) virtualNetworkOwnershipRaceCondition(data acceptance.TestData) string {
return fmt.Sprintf(`
provider "azurerm" {
features {}
}
resource "azurerm_resource_group" "test" {
name = "acctestRG-aks-%d"
location = "%s"
}
resource "azurerm_virtual_network" "test" {
name = "acctestvirtnet%d"
address_space = ["10.1.0.0/16"]
location = azurerm_resource_group.test.location
resource_group_name = azurerm_resource_group.test.name
}
resource "azurerm_subnet" "test" {
count = 8
name = "acctestsubnet%d${count.index}"
resource_group_name = azurerm_resource_group.test.name
virtual_network_name = azurerm_virtual_network.test.name
address_prefixes = ["10.1.${count.index}.0/24"]
service_endpoints = [
"Microsoft.Storage.Global",
"Microsoft.AzureActiveDirectory",
"Microsoft.KeyVault"
]
lifecycle {
# AKS automatically configures subnet delegations when the subnets are assigned
# to node pools. We ignore changes so the terraform refresh run by Terraform's plugin-sdk,
# at the end of the test, returns empty and leaves the test succeed.
ignore_changes = [delegation]
}
}
resource "azurerm_kubernetes_cluster" "test" {
name = "acctestaks%d"
location = azurerm_resource_group.test.location
resource_group_name = azurerm_resource_group.test.name
dns_prefix = "acctestaks%d"
default_node_pool {
name = "default"
node_count = 1
vm_size = "Standard_DS2_v2"
vnet_subnet_id = azurerm_subnet.test["6"].id
pod_subnet_id = azurerm_subnet.test["7"].id
upgrade_settings {
max_surge = "10%%"
}
}
identity {
type = "SystemAssigned"
}
network_profile {
network_plugin = "azure"
load_balancer_sku = "standard"
}
}
resource "azurerm_kubernetes_cluster_node_pool" "test1" {
name = "internal1"
kubernetes_cluster_id = azurerm_kubernetes_cluster.test.id
vm_size = "Standard_L8s_v3"
node_count = 1
vnet_subnet_id = azurerm_subnet.test[0].id
pod_subnet_id = azurerm_subnet.test[1].id
zones = ["1"]
}
resource "azurerm_kubernetes_cluster_node_pool" "test2" {
name = "internal2"
kubernetes_cluster_id = azurerm_kubernetes_cluster.test.id
vm_size = "Standard_L8s_v3"
node_count = 1
vnet_subnet_id = azurerm_subnet.test[2].id
pod_subnet_id = azurerm_subnet.test[3].id
zones = ["1"]
}
resource "azurerm_kubernetes_cluster_node_pool" "test3" {
name = "internal3"
kubernetes_cluster_id = azurerm_kubernetes_cluster.test.id
vm_size = "Standard_L8s_v3"
node_count = 1
vnet_subnet_id = azurerm_subnet.test[4].id
pod_subnet_id = azurerm_subnet.test[5].id
zones = ["1"]
}
`, data.RandomInteger, data.Locations.Primary, data.RandomInteger, data.RandomInteger, data.RandomInteger, data.RandomInteger)
}

0 comments on commit 9857f2e

Please sign in to comment.