diff --git a/.changelog/12916.txt b/.changelog/12916.txt
new file mode 100644
index 0000000000..d0af3a6aa7
--- /dev/null
+++ b/.changelog/12916.txt
@@ -0,0 +1,3 @@
+```release-note:bug
+container: reverted the locking behavior that caused a regression, starting in `v6.15`, where operation apply times spiked
+```
\ No newline at end of file
diff --git a/google-beta/services/container/node_config.go b/google-beta/services/container/node_config.go
index a24a83e1b7..4942f15e10 100644
--- a/google-beta/services/container/node_config.go
+++ b/google-beta/services/container/node_config.go
@@ -2027,11 +2027,7 @@ func flattenHostMaintenancePolicy(c *container.HostMaintenancePolicy) []map[stri
 // node pool updates in `resource_container_cluster`
 func nodePoolNodeConfigUpdate(d *schema.ResourceData, config *transport_tpg.Config, nodePoolInfo *NodePoolInformation, prefix, name string, timeout time.Duration) error {
-	// Cluster write-lock will be acquired when createOpF is called to make operation creations sequential within
-	// the cluster, and read-lock will be acquired when waitOpF is called to allow concurrent operation.
-	// This is to resolve the bottleneck of large number of operations being created at the same time.
-	clusterLockKey := nodePoolInfo.clusterLockKey()
-	// Nodepool write-lock will be acquired when calling creaetOpF and waitOpF.
+	// Nodepool write-lock will be acquired when the update function is called.
 	npLockKey := nodePoolInfo.nodePoolLockKey(name)
 
 	userAgent, err := tpgresource.GenerateUserAgentString(d, config.UserAgent)
@@ -2053,15 +2049,17 @@ func nodePoolNodeConfigUpdate(d *schema.ResourceData, config *transport_tpg.Conf
 		},
 	}
 
-	createOpF := func() (*container.Operation, error) {
+	updateF := func() error {
 		clusterNodePoolsUpdateCall := config.NewContainerClient(userAgent).Projects.Locations.Clusters.NodePools.Update(nodePoolInfo.fullyQualifiedName(name), req)
 		if config.UserProjectOverride {
 			clusterNodePoolsUpdateCall.Header().Add("X-Goog-User-Project", nodePoolInfo.project)
 		}
-		return clusterNodePoolsUpdateCall.Do()
-	}
+		op, err := clusterNodePoolsUpdateCall.Do()
+		if err != nil {
+			return err
+		}
 
-	waitOpF := func(op *container.Operation) error {
+		// Wait until it's updated
 		return ContainerOperationWait(config, op,
 			nodePoolInfo.project,
 			nodePoolInfo.location,
@@ -2069,7 +2067,7 @@ func nodePoolNodeConfigUpdate(d *schema.ResourceData, config *transport_tpg.Conf
 			timeout)
 	}
 
-	if err := retryWhileIncompatibleOperation(timeout, npLockKey, clusterLockKey, createOpF, waitOpF); err != nil {
+	if err := retryWhileIncompatibleOperation(timeout, npLockKey, updateF); err != nil {
 		return err
 	}
 
@@ -2087,15 +2085,17 @@ func nodePoolNodeConfigUpdate(d *schema.ResourceData, config *transport_tpg.Conf
 			req.ContainerdConfig = &container.ContainerdConfig{}
 			req.ForceSendFields = []string{"ContainerdConfig"}
 		}
-		createOpF := func() (*container.Operation, error) {
+		updateF := func() error {
 			clusterNodePoolsUpdateCall := config.NewContainerClient(userAgent).Projects.Locations.Clusters.NodePools.Update(nodePoolInfo.fullyQualifiedName(name), req)
 			if config.UserProjectOverride {
 				clusterNodePoolsUpdateCall.Header().Add("X-Goog-User-Project", nodePoolInfo.project)
 			}
-			return clusterNodePoolsUpdateCall.Do()
-		}
+			op, err := clusterNodePoolsUpdateCall.Do()
+			if err != nil {
+				return err
+			}
 
-		waitOpF := func(op *container.Operation) error {
+			// Wait until it's updated
 			return ContainerOperationWait(config, op,
 				nodePoolInfo.project,
 				nodePoolInfo.location,
@@ -2103,7 +2103,7 @@ func nodePoolNodeConfigUpdate(d *schema.ResourceData, config *transport_tpg.Conf
 				timeout)
 		}
 
-		if err := retryWhileIncompatibleOperation(timeout, npLockKey, clusterLockKey, createOpF, waitOpF); err != nil {
+		if err := retryWhileIncompatibleOperation(timeout, npLockKey, updateF); err != nil {
 			return err
 		}
 
@@ -2132,15 +2132,17 @@ func nodePoolNodeConfigUpdate(d *schema.ResourceData, config *transport_tpg.Conf
 			req.StoragePools = storagePools
 		}
 
-		createOpF := func() (*container.Operation, error) {
+		updateF := func() error {
 			clusterNodePoolsUpdateCall := config.NewContainerClient(userAgent).Projects.Locations.Clusters.NodePools.Update(nodePoolInfo.fullyQualifiedName(name), req)
 			if config.UserProjectOverride {
 				clusterNodePoolsUpdateCall.Header().Add("X-Goog-User-Project", nodePoolInfo.project)
 			}
-			return clusterNodePoolsUpdateCall.Do()
-		}
+			op, err := clusterNodePoolsUpdateCall.Do()
+			if err != nil {
+				return err
+			}
 
-		waitOpF := func(op *container.Operation) error {
+			// Wait until it's updated
 			return ContainerOperationWait(config, op,
 				nodePoolInfo.project,
 				nodePoolInfo.location,
@@ -2148,7 +2150,7 @@ func nodePoolNodeConfigUpdate(d *schema.ResourceData, config *transport_tpg.Conf
 				timeout)
 		}
 
-		if err := retryWhileIncompatibleOperation(timeout, npLockKey, clusterLockKey, createOpF, waitOpF); err != nil {
+		if err := retryWhileIncompatibleOperation(timeout, npLockKey, updateF); err != nil {
 			return err
 		}
 		log.Printf("[INFO] Updated disk disk_size_gb/disk_type/machine_type/storage_pools for Node Pool %s", d.Id())
 
@@ -2186,15 +2188,17 @@ func nodePoolNodeConfigUpdate(d *schema.ResourceData, config *transport_tpg.Conf
 			req.Taints = ntaints
 		}
 
-		createOpF := func() (*container.Operation, error) {
+		updateF := func() error {
 			clusterNodePoolsUpdateCall := config.NewContainerClient(userAgent).Projects.Locations.Clusters.NodePools.Update(nodePoolInfo.fullyQualifiedName(name), req)
 			if config.UserProjectOverride {
 				clusterNodePoolsUpdateCall.Header().Add("X-Goog-User-Project", nodePoolInfo.project)
 			}
-			return clusterNodePoolsUpdateCall.Do()
-		}
+			op, err := clusterNodePoolsUpdateCall.Do()
+			if err != nil {
+				return err
+			}
 
-		waitOpF := func(op *container.Operation) error {
+			// Wait until it's updated
 			return ContainerOperationWait(config, op,
 				nodePoolInfo.project,
 				nodePoolInfo.location,
@@ -2202,7 +2206,7 @@ func nodePoolNodeConfigUpdate(d *schema.ResourceData, config *transport_tpg.Conf
 				timeout)
 		}
 
-		if err := retryWhileIncompatibleOperation(timeout, npLockKey, clusterLockKey, createOpF, waitOpF); err != nil {
+		if err := retryWhileIncompatibleOperation(timeout, npLockKey, updateF); err != nil {
 			return err
 		}
 		log.Printf("[INFO] Updated taints for Node Pool %s", d.Id())
 
@@ -2236,15 +2240,17 @@ func nodePoolNodeConfigUpdate(d *schema.ResourceData, config *transport_tpg.Conf
 			req.Tags = ntags
 		}
 
-		createOpF := func() (*container.Operation, error) {
+		updateF := func() error {
 			clusterNodePoolsUpdateCall := config.NewContainerClient(userAgent).Projects.Locations.Clusters.NodePools.Update(nodePoolInfo.fullyQualifiedName(name), req)
 			if config.UserProjectOverride {
 				clusterNodePoolsUpdateCall.Header().Add("X-Goog-User-Project", nodePoolInfo.project)
 			}
-			return clusterNodePoolsUpdateCall.Do()
-		}
+			op, err := clusterNodePoolsUpdateCall.Do()
+			if err != nil {
+				return err
+			}
 
-		waitOpF := func(op *container.Operation) error {
+			// Wait until it's updated
 			return ContainerOperationWait(config, op,
 				nodePoolInfo.project,
 				nodePoolInfo.location,
@@ -2252,7 +2258,7 @@ func nodePoolNodeConfigUpdate(d *schema.ResourceData, config *transport_tpg.Conf
 				timeout)
 		}
 
-		if err := retryWhileIncompatibleOperation(timeout, npLockKey, clusterLockKey, createOpF, waitOpF); err != nil {
+		if err := retryWhileIncompatibleOperation(timeout, npLockKey, updateF); err != nil {
 			return err
 		}
 		log.Printf("[INFO] Updated tags for node pool %s", name)
 
@@ -2276,15 +2282,17 @@ func nodePoolNodeConfigUpdate(d *schema.ResourceData, config *transport_tpg.Conf
 			req.ResourceManagerTags = rmTags
 		}
 
-		createOpF := func() (*container.Operation, error) {
+		updateF := func() error {
 			clusterNodePoolsUpdateCall := config.NewContainerClient(userAgent).Projects.Locations.Clusters.NodePools.Update(nodePoolInfo.fullyQualifiedName(name), req)
 			if config.UserProjectOverride {
 				clusterNodePoolsUpdateCall.Header().Add("X-Goog-User-Project", nodePoolInfo.project)
 			}
-			return clusterNodePoolsUpdateCall.Do()
-		}
+			op, err := clusterNodePoolsUpdateCall.Do()
+			if err != nil {
+				return err
+			}
 
-		waitOpF := func(op *container.Operation) error {
+			// Wait until it's updated
 			return ContainerOperationWait(config, op,
 				nodePoolInfo.project,
 				nodePoolInfo.location,
@@ -2292,7 +2300,7 @@ func nodePoolNodeConfigUpdate(d *schema.ResourceData, config *transport_tpg.Conf
 				timeout)
 		}
 
-		if err := retryWhileIncompatibleOperation(timeout, npLockKey, clusterLockKey, createOpF, waitOpF); err != nil {
+		if err := retryWhileIncompatibleOperation(timeout, npLockKey, updateF); err != nil {
 			return err
 		}
 		log.Printf("[INFO] Updated resource manager tags for node pool %s", name)
 
@@ -2310,15 +2318,17 @@ func nodePoolNodeConfigUpdate(d *schema.ResourceData, config *transport_tpg.Conf
 			}
 		}
 
-		createOpF := func() (*container.Operation, error) {
+		updateF := func() error {
 			clusterNodePoolsUpdateCall := config.NewContainerClient(userAgent).Projects.Locations.Clusters.NodePools.Update(nodePoolInfo.fullyQualifiedName(name), req)
 			if config.UserProjectOverride {
 				clusterNodePoolsUpdateCall.Header().Add("X-Goog-User-Project", nodePoolInfo.project)
 			}
-			return clusterNodePoolsUpdateCall.Do()
-		}
+			op, err := clusterNodePoolsUpdateCall.Do()
+			if err != nil {
+				return err
+			}
 
-		waitOpF := func(op *container.Operation) error {
+			// Wait until it's updated
 			return ContainerOperationWait(config, op,
 				nodePoolInfo.project,
 				nodePoolInfo.location,
@@ -2327,7 +2337,7 @@ func nodePoolNodeConfigUpdate(d *schema.ResourceData, config *transport_tpg.Conf
 		}
 
 		// Call update serially.
-		if err := retryWhileIncompatibleOperation(timeout, npLockKey, clusterLockKey, createOpF, waitOpF); err != nil {
+		if err := retryWhileIncompatibleOperation(timeout, npLockKey, updateF); err != nil {
 			return err
 		}
 
@@ -2346,15 +2356,17 @@ func nodePoolNodeConfigUpdate(d *schema.ResourceData, config *transport_tpg.Conf
 			}
 		}
 
-		createOpF := func() (*container.Operation, error) {
+		updateF := func() error {
 			clusterNodePoolsUpdateCall := config.NewContainerClient(userAgent).Projects.Locations.Clusters.NodePools.Update(nodePoolInfo.fullyQualifiedName(name), req)
 			if config.UserProjectOverride {
 				clusterNodePoolsUpdateCall.Header().Add("X-Goog-User-Project", nodePoolInfo.project)
 			}
-			return clusterNodePoolsUpdateCall.Do()
-		}
+			op, err := clusterNodePoolsUpdateCall.Do()
+			if err != nil {
+				return err
+			}
 
-		waitOpF := func(op *container.Operation) error {
+			// Wait until it's updated
 			return ContainerOperationWait(config, op,
 				nodePoolInfo.project,
 				nodePoolInfo.location,
@@ -2363,7 +2375,7 @@ func nodePoolNodeConfigUpdate(d *schema.ResourceData, config *transport_tpg.Conf
 		}
 
 		// Call update serially.
-		if err := retryWhileIncompatibleOperation(timeout, npLockKey, clusterLockKey, createOpF, waitOpF); err != nil {
+		if err := retryWhileIncompatibleOperation(timeout, npLockKey, updateF); err != nil {
 			return err
 		}
 
@@ -2378,22 +2390,24 @@ func nodePoolNodeConfigUpdate(d *schema.ResourceData, config *transport_tpg.Conf
 			},
 		}
 
-		createOpF := func() (*container.Operation, error) {
+		updateF := func() error {
 			clusterUpdateCall := config.NewContainerClient(userAgent).Projects.Locations.Clusters.Update(nodePoolInfo.parent(), req)
 			if config.UserProjectOverride {
 				clusterUpdateCall.Header().Add("X-Goog-User-Project", nodePoolInfo.project)
 			}
-			return clusterUpdateCall.Do()
-		}
+			op, err := clusterUpdateCall.Do()
+			if err != nil {
+				return err
+			}
 
-		waitOpF := func(op *container.Operation) error {
+			// Wait until it's updated
 			return ContainerOperationWait(config, op,
 				nodePoolInfo.project,
 				nodePoolInfo.location, "updating GKE node pool", userAgent,
 				timeout)
 		}
 
-		if err := retryWhileIncompatibleOperation(timeout, npLockKey, clusterLockKey, createOpF, waitOpF); err != nil {
+		if err := retryWhileIncompatibleOperation(timeout, npLockKey, updateF); err != nil {
 			return err
 		}
 		log.Printf("[INFO] Updated image type in Node Pool %s", d.Id())
 
@@ -2409,15 +2423,18 @@ func nodePoolNodeConfigUpdate(d *schema.ResourceData, config *transport_tpg.Conf
 			req.WorkloadMetadataConfig = &container.WorkloadMetadataConfig{}
 			req.ForceSendFields = []string{"WorkloadMetadataConfig"}
 		}
-		createOpF := func() (*container.Operation, error) {
+		updateF := func() error {
 			clusterNodePoolsUpdateCall := config.NewContainerClient(userAgent).Projects.Locations.Clusters.NodePools.Update(nodePoolInfo.fullyQualifiedName(name), req)
 			if config.UserProjectOverride {
 				clusterNodePoolsUpdateCall.Header().Add("X-Goog-User-Project", nodePoolInfo.project)
 			}
-			return clusterNodePoolsUpdateCall.Do()
-		}
+			op, err := clusterNodePoolsUpdateCall.Do()
 
-		waitOpF := func(op *container.Operation) error {
+			if err != nil {
+				return err
+			}
+
+			// Wait until it's updated
 			return ContainerOperationWait(config, op,
 				nodePoolInfo.project,
 				nodePoolInfo.location,
@@ -2425,7 +2442,7 @@ func nodePoolNodeConfigUpdate(d *schema.ResourceData, config *transport_tpg.Conf
 				timeout)
 		}
 
-		if err := retryWhileIncompatibleOperation(timeout, npLockKey, clusterLockKey, createOpF, waitOpF); err != nil {
+		if err := retryWhileIncompatibleOperation(timeout, npLockKey, updateF); err != nil {
 			return err
 		}
 		log.Printf("[INFO] Updated workload_metadata_config for node pool %s", name)
 
@@ -2439,15 +2456,17 @@ func nodePoolNodeConfigUpdate(d *schema.ResourceData, config *transport_tpg.Conf
 				Enabled: gcfsEnabled,
 			},
 		}
-		createOpF := func() (*container.Operation, error) {
+		updateF := func() error {
 			clusterNodePoolsUpdateCall := config.NewContainerClient(userAgent).Projects.Locations.Clusters.NodePools.Update(nodePoolInfo.fullyQualifiedName(name), req)
 			if config.UserProjectOverride {
 				clusterNodePoolsUpdateCall.Header().Add("X-Goog-User-Project", nodePoolInfo.project)
 			}
-			return clusterNodePoolsUpdateCall.Do()
-		}
+			op, err := clusterNodePoolsUpdateCall.Do()
+			if err != nil {
+				return err
+			}
 
-		waitOpF := func(op *container.Operation) error {
+			// Wait until it's updated
 			return ContainerOperationWait(config, op,
 				nodePoolInfo.project,
 				nodePoolInfo.location,
@@ -2455,7 +2474,7 @@ func nodePoolNodeConfigUpdate(d *schema.ResourceData, config *transport_tpg.Conf
 				timeout)
 		}
 
-		if err := retryWhileIncompatibleOperation(timeout, npLockKey, clusterLockKey, createOpF, waitOpF); err != nil {
+		if err := retryWhileIncompatibleOperation(timeout, npLockKey, updateF); err != nil {
 			return err
 		}
 
@@ -2472,15 +2491,17 @@ func nodePoolNodeConfigUpdate(d *schema.ResourceData, config *transport_tpg.Conf
 			req.KubeletConfig = &container.NodeKubeletConfig{}
 			req.ForceSendFields = []string{"KubeletConfig"}
 		}
-		createOpF := func() (*container.Operation, error) {
+		updateF := func() error {
 			clusterNodePoolsUpdateCall := config.NewContainerClient(userAgent).Projects.Locations.Clusters.NodePools.Update(nodePoolInfo.fullyQualifiedName(name), req)
 			if config.UserProjectOverride {
 				clusterNodePoolsUpdateCall.Header().Add("X-Goog-User-Project", nodePoolInfo.project)
 			}
-			return clusterNodePoolsUpdateCall.Do()
-		}
+			op, err := clusterNodePoolsUpdateCall.Do()
+			if err != nil {
+				return err
+			}
 
-		waitOpF := func(op *container.Operation) error {
+			// Wait until it's updated
 			return ContainerOperationWait(config, op,
 				nodePoolInfo.project,
 				nodePoolInfo.location,
@@ -2488,7 +2509,7 @@ func nodePoolNodeConfigUpdate(d *schema.ResourceData, config *transport_tpg.Conf
 				timeout)
 		}
 
-		if err := retryWhileIncompatibleOperation(timeout, npLockKey, clusterLockKey, createOpF, waitOpF); err != nil {
+		if err := retryWhileIncompatibleOperation(timeout, npLockKey, updateF); err != nil {
 			return err
 		}
 
@@ -2504,15 +2525,17 @@ func nodePoolNodeConfigUpdate(d *schema.ResourceData, config *transport_tpg.Conf
 			req.LinuxNodeConfig = &container.LinuxNodeConfig{}
 			req.ForceSendFields = []string{"LinuxNodeConfig"}
 		}
-		createOpF := func() (*container.Operation, error) {
+		updateF := func() error {
 			clusterNodePoolsUpdateCall := config.NewContainerClient(userAgent).Projects.Locations.Clusters.NodePools.Update(nodePoolInfo.fullyQualifiedName(name), req)
 			if config.UserProjectOverride {
 				clusterNodePoolsUpdateCall.Header().Add("X-Goog-User-Project", nodePoolInfo.project)
 			}
-			return clusterNodePoolsUpdateCall.Do()
-		}
+			op, err := clusterNodePoolsUpdateCall.Do()
+			if err != nil {
+				return err
+			}
 
-		waitOpF := func(op *container.Operation) error {
+			// Wait until it's updated
 			return ContainerOperationWait(config, op,
 				nodePoolInfo.project,
 				nodePoolInfo.location,
@@ -2520,7 +2543,7 @@ func nodePoolNodeConfigUpdate(d *schema.ResourceData, config *transport_tpg.Conf
 				timeout)
 		}
 
-		if err := retryWhileIncompatibleOperation(timeout, npLockKey, clusterLockKey, createOpF, waitOpF); err != nil {
+		if err := retryWhileIncompatibleOperation(timeout, npLockKey, updateF); err != nil {
 			return err
 		}
 
@@ -2537,15 +2560,17 @@ func nodePoolNodeConfigUpdate(d *schema.ResourceData, config *transport_tpg.Conf
 				Enabled: fastSocket["enabled"].(bool),
 			}
 		}
-		createOpF := func() (*container.Operation, error) {
+		updateF := func() error {
 			clusterNodePoolsUpdateCall := config.NewContainerClient(userAgent).Projects.Locations.Clusters.NodePools.Update(nodePoolInfo.fullyQualifiedName(name), req)
 			if config.UserProjectOverride {
 				clusterNodePoolsUpdateCall.Header().Add("X-Goog-User-Project", nodePoolInfo.project)
 			}
-			return clusterNodePoolsUpdateCall.Do()
-		}
+			op, err := clusterNodePoolsUpdateCall.Do()
+			if err != nil {
+				return err
+			}
 
-		waitOpF := func(op *container.Operation) error {
+			// Wait until it's updated
 			return ContainerOperationWait(config, op,
 				nodePoolInfo.project,
 				nodePoolInfo.location,
@@ -2553,7 +2578,7 @@ func nodePoolNodeConfigUpdate(d *schema.ResourceData, config *transport_tpg.Conf
 				timeout)
 		}
 
-		if err := retryWhileIncompatibleOperation(timeout, npLockKey, clusterLockKey, createOpF, waitOpF); err != nil {
+		if err := retryWhileIncompatibleOperation(timeout, npLockKey, updateF); err != nil {
 			return err
 		}
 
diff --git a/google-beta/services/container/resource_container_node_pool.go b/google-beta/services/container/resource_container_node_pool.go
index 59be87938f..c7c013bd6e 100644
--- a/google-beta/services/container/resource_container_node_pool.go
+++ b/google-beta/services/container/resource_container_node_pool.go
@@ -565,6 +565,11 @@ func resourceContainerNodePoolCreate(d *schema.ResourceData, meta interface{}) e
 		return err
 	}
 
+	// Acquire read-lock on cluster.
+	clusterLockKey := nodePoolInfo.clusterLockKey()
+	transport_tpg.MutexStore.RLock(clusterLockKey)
+	defer transport_tpg.MutexStore.RUnlock(clusterLockKey)
+
 	// Acquire write-lock on nodepool.
 	npLockKey := nodePoolInfo.nodePoolLockKey(nodePool.Name)
 	transport_tpg.MutexStore.Lock(npLockKey)
@@ -577,9 +582,6 @@ func resourceContainerNodePoolCreate(d *schema.ResourceData, meta interface{}) e
 	timeout := d.Timeout(schema.TimeoutCreate)
 	startTime := time.Now()
 
-	clusterLockKey := nodePoolInfo.clusterLockKey()
-	transport_tpg.MutexStore.RLock(clusterLockKey)
-
 	// we attempt to prefetch the node pool to make sure it doesn't exist before creation
 	var id = fmt.Sprintf("projects/%s/locations/%s/clusters/%s/nodePools/%s", nodePoolInfo.project, nodePoolInfo.location, nodePoolInfo.cluster, nodePool.Name)
 	name := getNodePoolName(id)
@@ -594,16 +596,11 @@ func resourceContainerNodePoolCreate(d *schema.ResourceData, meta interface{}) e
 		// refreshed on the next call to apply.
 		d.SetId(id)
 	} else if err == nil {
-		transport_tpg.MutexStore.RUnlock(clusterLockKey)
 		return fmt.Errorf("resource - %s - already exists", id)
 	}
-	transport_tpg.MutexStore.RUnlock(clusterLockKey)
 
 	var operation *container.Operation
 	err = retry.Retry(timeout, func() *retry.RetryError {
-		transport_tpg.MutexStore.Lock(clusterLockKey)
-		defer transport_tpg.MutexStore.Unlock(clusterLockKey)
-
 		clusterNodePoolsCreateCall := config.NewContainerClient(userAgent).Projects.Locations.Clusters.NodePools.Create(nodePoolInfo.parent(), req)
 		if config.UserProjectOverride {
 			clusterNodePoolsCreateCall.Header().Add("X-Goog-User-Project", nodePoolInfo.project)
@@ -622,8 +619,6 @@ func resourceContainerNodePoolCreate(d *schema.ResourceData, meta interface{}) e
 		}
 		return nil
 	})
-	transport_tpg.MutexStore.RLock(clusterLockKey)
-	defer transport_tpg.MutexStore.RUnlock(clusterLockKey)
 	if err != nil {
 		return fmt.Errorf("error creating NodePool: %s", err)
 	}
@@ -798,7 +793,10 @@ func resourceContainerNodePoolDelete(d *schema.ResourceData, meta interface{}) e
 		}
 	}
 
+	// Acquire read-lock on cluster.
 	clusterLockKey := nodePoolInfo.clusterLockKey()
+	transport_tpg.MutexStore.RLock(clusterLockKey)
+	defer transport_tpg.MutexStore.RUnlock(clusterLockKey)
 
 	// Acquire write-lock on nodepool.
 	npLockKey := nodePoolInfo.nodePoolLockKey(name)
 
@@ -810,8 +808,6 @@ func resourceContainerNodePoolDelete(d *schema.ResourceData, meta interface{}) e
 
 	var operation *container.Operation
 	err = retry.Retry(timeout, func() *retry.RetryError {
-		transport_tpg.MutexStore.Lock(clusterLockKey)
-		defer transport_tpg.MutexStore.Unlock(clusterLockKey)
 		clusterNodePoolsDeleteCall := config.NewContainerClient(userAgent).Projects.Locations.Clusters.NodePools.Delete(nodePoolInfo.fullyQualifiedName(name))
 		if config.UserProjectOverride {
 			clusterNodePoolsDeleteCall.Header().Add("X-Goog-User-Project", nodePoolInfo.project)
@@ -831,8 +827,6 @@ func resourceContainerNodePoolDelete(d *schema.ResourceData, meta interface{}) e
 		return nil
 	})
-	transport_tpg.MutexStore.RLock(clusterLockKey)
-	defer transport_tpg.MutexStore.RUnlock(clusterLockKey)
 
 	if err != nil {
 		return fmt.Errorf("Error deleting NodePool: %s", err)
@@ -1358,11 +1352,12 @@ func nodePoolUpdate(d *schema.ResourceData, meta interface{}, nodePoolInfo *Node
 		return err
 	}
 
-	// Cluster write-lock will be acquired when createOpF is called, and read-lock will be acquired when waitOpF is
-	// called.
+	// Acquire read-lock on cluster.
 	clusterLockKey := nodePoolInfo.clusterLockKey()
+	transport_tpg.MutexStore.RLock(clusterLockKey)
+	defer transport_tpg.MutexStore.RUnlock(clusterLockKey)
 
-	// Nodepool write-lock will be acquired when calling createOpF and waitOpF.
+	// Nodepool write-lock will be acquired when the update function is called.
 	npLockKey := nodePoolInfo.nodePoolLockKey(name)
 
 	if d.HasChange(prefix + "autoscaling") {
@@ -1390,22 +1385,24 @@ func nodePoolUpdate(d *schema.ResourceData, meta interface{}, nodePoolInfo *Node
 			Update: update,
 		}
 
-		createOpF := func() (*container.Operation, error) {
+		updateF := func() error {
 			clusterUpdateCall := config.NewContainerClient(userAgent).Projects.Locations.Clusters.Update(nodePoolInfo.parent(), req)
 			if config.UserProjectOverride {
 				clusterUpdateCall.Header().Add("X-Goog-User-Project", nodePoolInfo.project)
 			}
-			return clusterUpdateCall.Do()
-		}
+			op, err := clusterUpdateCall.Do()
+			if err != nil {
+				return err
+			}
 
-		waitOpF := func(op *container.Operation) error {
+			// Wait until it's updated
 			return ContainerOperationWait(config, op,
 				nodePoolInfo.project,
 				nodePoolInfo.location, "updating GKE node pool", userAgent,
 				timeout)
 		}
 
-		if err := retryWhileIncompatibleOperation(timeout, npLockKey, clusterLockKey, createOpF, waitOpF); err != nil {
+		if err := retryWhileIncompatibleOperation(timeout, npLockKey, updateF); err != nil {
 			return err
 		}
 		log.Printf("[INFO] Updated autoscaling in Node Pool %s", d.Id())
 
@@ -1420,21 +1417,24 @@ func nodePoolUpdate(d *schema.ResourceData, meta interface{}, nodePoolInfo *Node
 		req := &container.SetNodePoolSizeRequest{
 			NodeCount: newSize,
 		}
-		createOpF := func() (*container.Operation, error) {
+		updateF := func() error {
 			clusterNodePoolsSetSizeCall := config.NewContainerClient(userAgent).Projects.Locations.Clusters.NodePools.SetSize(nodePoolInfo.fullyQualifiedName(name), req)
 			if config.UserProjectOverride {
 				clusterNodePoolsSetSizeCall.Header().Add("X-Goog-User-Project", nodePoolInfo.project)
 			}
-			return clusterNodePoolsSetSizeCall.Do()
-		}
+			op, err := clusterNodePoolsSetSizeCall.Do()
+
+			if err != nil {
+				return err
+			}
 
-		waitOpF := func(op *container.Operation) error {
+			// Wait until it's updated
 			return ContainerOperationWait(config, op,
 				nodePoolInfo.project,
 				nodePoolInfo.location, "updating GKE node pool size", userAgent,
 				timeout)
 		}
 
-		if err := retryWhileIncompatibleOperation(timeout, npLockKey, clusterLockKey, createOpF, waitOpF); err != nil {
+		if err := retryWhileIncompatibleOperation(timeout, npLockKey, updateF); err != nil {
 			return err
 		}
 		log.Printf("[INFO] GKE node pool %s size has been updated to %d", name, newSize)
 
@@ -1452,21 +1452,24 @@ func nodePoolUpdate(d *schema.ResourceData, meta interface{}, nodePoolInfo *Node
 			Management: management,
 		}
 
-		createOpF := func() (*container.Operation, error) {
+		updateF := func() error {
 			clusterNodePoolsSetManagementCall := config.NewContainerClient(userAgent).Projects.Locations.Clusters.NodePools.SetManagement(nodePoolInfo.fullyQualifiedName(name), req)
 			if config.UserProjectOverride {
 				clusterNodePoolsSetManagementCall.Header().Add("X-Goog-User-Project", nodePoolInfo.project)
 			}
-			return clusterNodePoolsSetManagementCall.Do()
-		}
+			op, err := clusterNodePoolsSetManagementCall.Do()
+
+			if err != nil {
+				return err
+			}
 
-		waitOpF := func(op *container.Operation) error {
+			// Wait until it's updated
 			return ContainerOperationWait(config, op,
 				nodePoolInfo.project,
 				nodePoolInfo.location, "updating GKE node pool management", userAgent,
 				timeout)
 		}
 
-		if err := retryWhileIncompatibleOperation(timeout, npLockKey, clusterLockKey, createOpF, waitOpF); err != nil {
+		if err := retryWhileIncompatibleOperation(timeout, npLockKey, updateF); err != nil {
 			return err
 		}
 		log.Printf("[INFO] Updated management in Node Pool %s", name)
 
@@ -1477,20 +1480,23 @@ func nodePoolUpdate(d *schema.ResourceData, meta interface{}, nodePoolInfo *Node
 			NodePoolId:  name,
 			NodeVersion: d.Get(prefix + "version").(string),
 		}
-		createOpF := func() (*container.Operation, error) {
+		updateF := func() error {
 			clusterNodePoolsUpdateCall := config.NewContainerClient(userAgent).Projects.Locations.Clusters.NodePools.Update(nodePoolInfo.fullyQualifiedName(name), req)
 			if config.UserProjectOverride {
 				clusterNodePoolsUpdateCall.Header().Add("X-Goog-User-Project", nodePoolInfo.project)
 			}
-			return clusterNodePoolsUpdateCall.Do()
-		}
+			op, err := clusterNodePoolsUpdateCall.Do()
+
+			if err != nil {
+				return err
+			}
 
-		waitOpF := func(op *container.Operation) error {
+			// Wait until it's updated
 			return ContainerOperationWait(config, op,
 				nodePoolInfo.project,
 				nodePoolInfo.location, "updating GKE node pool version", userAgent,
 				timeout)
 		}
 
-		if err := retryWhileIncompatibleOperation(timeout, npLockKey, clusterLockKey, createOpF, waitOpF); err != nil {
+		if err := retryWhileIncompatibleOperation(timeout, npLockKey, updateF); err != nil {
 			return err
 		}
 		log.Printf("[INFO] Updated version in Node Pool %s", name)
 
@@ -1500,19 +1506,22 @@ func nodePoolUpdate(d *schema.ResourceData, meta interface{}, nodePoolInfo *Node
 		req := &container.UpdateNodePoolRequest{
 			Locations: tpgresource.ConvertStringSet(d.Get(prefix + "node_locations").(*schema.Set)),
 		}
-		createOpF := func() (*container.Operation, error) {
+		updateF := func() error {
 			clusterNodePoolsUpdateCall := config.NewContainerClient(userAgent).Projects.Locations.Clusters.NodePools.Update(nodePoolInfo.fullyQualifiedName(name), req)
 			if config.UserProjectOverride {
 				clusterNodePoolsUpdateCall.Header().Add("X-Goog-User-Project", nodePoolInfo.project)
 			}
-			return clusterNodePoolsUpdateCall.Do()
-		}
+			op, err := clusterNodePoolsUpdateCall.Do()
+
+			if err != nil {
+				return err
+			}
 
-		waitOpF := func(op *container.Operation) error {
+			// Wait until it's updated
 			return ContainerOperationWait(config, op,
 				nodePoolInfo.project,
 				nodePoolInfo.location, "updating GKE node pool node locations", userAgent,
 				timeout)
 		}
 
-		if err := retryWhileIncompatibleOperation(timeout, npLockKey, clusterLockKey, createOpF, waitOpF); err != nil {
+		if err := retryWhileIncompatibleOperation(timeout, npLockKey, updateF); err != nil {
 			return err
 		}
 		log.Printf("[INFO] Updated node locations in Node Pool %s", name)
 
@@ -1578,18 +1587,21 @@ func nodePoolUpdate(d *schema.ResourceData, meta interface{}, nodePoolInfo *Node
 		req := &container.UpdateNodePoolRequest{
 			UpgradeSettings: upgradeSettings,
 		}
-		createOpF := func() (*container.Operation, error) {
+		updateF := func() error {
 			clusterNodePoolsUpdateCall := config.NewContainerClient(userAgent).Projects.Locations.Clusters.NodePools.Update(nodePoolInfo.fullyQualifiedName(name), req)
 			if config.UserProjectOverride {
 				clusterNodePoolsUpdateCall.Header().Add("X-Goog-User-Project", nodePoolInfo.project)
 			}
-			return clusterNodePoolsUpdateCall.Do()
-		}
+			op, err := clusterNodePoolsUpdateCall.Do()
+
+			if err != nil {
+				return err
+			}
 
-		waitOpF := func(op *container.Operation) error {
+			// Wait until it's updated
 			return ContainerOperationWait(config, op,
 				nodePoolInfo.project,
 				nodePoolInfo.location, "updating GKE node pool upgrade settings", userAgent,
 				timeout)
 		}
 
-		if err := retryWhileIncompatibleOperation(timeout, npLockKey, clusterLockKey, createOpF, waitOpF); err != nil {
+		if err := retryWhileIncompatibleOperation(timeout, npLockKey, updateF); err != nil {
 			return err
 		}
 		log.Printf("[INFO] Updated upgrade settings in Node Pool %s", name)
 
@@ -1601,15 +1613,18 @@ func nodePoolUpdate(d *schema.ResourceData, meta interface{}, nodePoolInfo *Node
 			NodePoolId:        name,
 			NodeNetworkConfig: expandNodeNetworkConfig(d.Get(prefix + "network_config")),
 		}
-		createOpF := func() (*container.Operation, error) {
+		updateF := func() error {
 			clusterNodePoolsUpdateCall := config.NewContainerClient(userAgent).Projects.Locations.Clusters.NodePools.Update(nodePoolInfo.fullyQualifiedName(name), req)
 			if config.UserProjectOverride {
 				clusterNodePoolsUpdateCall.Header().Add("X-Goog-User-Project", nodePoolInfo.project)
 			}
-			return clusterNodePoolsUpdateCall.Do()
-		}
+			op, err := clusterNodePoolsUpdateCall.Do()
 
-		waitOpF := func(op *container.Operation) error {
+			if err != nil {
+				return err
+			}
+
+			// Wait until it's updated
 			return ContainerOperationWait(config, op,
 				nodePoolInfo.project,
 				nodePoolInfo.location,
@@ -1617,7 +1632,7 @@ func nodePoolUpdate(d *schema.ResourceData, meta interface{}, nodePoolInfo *Node
 				timeout)
 		}
 
-		if err := retryWhileIncompatibleOperation(timeout, npLockKey, clusterLockKey, createOpF, waitOpF); err != nil {
+		if err := retryWhileIncompatibleOperation(timeout, npLockKey, updateF); err != nil {
 			return err
 		}
 
@@ -1669,27 +1684,15 @@ func containerNodePoolAwaitRestingState(config *transport_tpg.Config, name, proj
 	return state, err
 }
 
-// Retries an operation while the canonical error code is FAILED_PRECONDTION or RESOURCE_EXHAUSTED which indicates
-// there is an incompatible operation already running on the cluster or there are the number of allowed concurrent
-// operations running on the cluster. These errors can be safely retried until the incompatible operation completes,
-// and the newly requested operation can begin.
-// The npLockKey is held throughout createOpFunc and waitOpFunc.
-// The clusterLockKey write-lock is held during createOpFunc to make operation creations sequential within the cluster,
-// and clusterLockKey read-lock is held during waitOpFunc to allow concurrency on a cluster.
-func retryWhileIncompatibleOperation(timeout time.Duration, npLockKey string, clusterLockKey string, createOpFunc func() (*container.Operation, error), waitOpFunc func(*container.Operation) error) error {
-	f := func() error {
-		transport_tpg.MutexStore.Lock(clusterLockKey)
-		op, err := createOpFunc()
-		transport_tpg.MutexStore.Unlock(clusterLockKey)
-		if err != nil {
-			return err
-		}
-		transport_tpg.MutexStore.RLock(clusterLockKey)
-		defer transport_tpg.MutexStore.RUnlock(clusterLockKey)
-		return waitOpFunc(op)
-	}
+// Retries an operation while the canonical error code is FAILED_PRECONDITION
+// or RESOURCE_EXHAUSTED, which indicates that an incompatible operation is
+// already running on the cluster or that the allowed number of concurrent
+// operations on the cluster has been reached. These errors can be safely
+// retried until the incompatible operation completes, and the newly
+// requested operation can begin.
+func retryWhileIncompatibleOperation(timeout time.Duration, lockKey string, f func() error) error {
 	return retry.Retry(timeout, func() *retry.RetryError {
-		if err := transport_tpg.LockedCall(npLockKey, f); err != nil {
+		if err := transport_tpg.LockedCall(lockKey, f); err != nil {
 			if tpgresource.IsFailedPreconditionError(err) || tpgresource.IsQuotaError(err) {
 				return retry.RetryableError(err)
 			}
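A minimal, self-contained sketch of the locking scheme these changes restore: the resource function holds a cluster read-lock for its whole duration, and each update holds the node-pool lock across the request and the wait, retrying while an incompatible operation is in flight. Plain `sync` primitives and the names below stand in for the provider's `transport_tpg.MutexStore`, `LockedCall`, and `retry` helpers; they are illustrative, not the provider's API.

```go
package main

import (
	"errors"
	"fmt"
	"sync"
	"time"
)

// errIncompatibleOperation stands in for a FAILED_PRECONDITION/RESOURCE_EXHAUSTED API error.
var errIncompatibleOperation = errors.New("FAILED_PRECONDITION: incompatible operation in progress")

// retryWhileIncompatible retries f while it reports an incompatible-operation error,
// holding the node-pool lock for the full request-and-wait of each attempt.
func retryWhileIncompatible(timeout time.Duration, npLock *sync.Mutex, f func() error) error {
	deadline := time.Now().Add(timeout)
	for {
		npLock.Lock()
		err := f()
		npLock.Unlock()
		if err == nil || !errors.Is(err, errIncompatibleOperation) {
			return err
		}
		if time.Now().After(deadline) {
			return fmt.Errorf("timed out waiting for compatible operation: %w", err)
		}
		time.Sleep(100 * time.Millisecond)
	}
}

func main() {
	var clusterLock sync.RWMutex // cluster-level operations would take the write side
	var npLock sync.Mutex

	// Hold the cluster read-lock for the whole resource function, so node-pool
	// updates on one cluster can run concurrently while cluster-wide changes are excluded.
	clusterLock.RLock()
	defer clusterLock.RUnlock()

	attempts := 0
	update := func() error {
		attempts++
		if attempts < 3 {
			return errIncompatibleOperation // pretend another operation is still running
		}
		// In the provider, this is where the Update call is issued and
		// ContainerOperationWait blocks until the operation finishes.
		return nil
	}

	if err := retryWhileIncompatible(2*time.Second, &npLock, update); err != nil {
		fmt.Println("update failed:", err)
		return
	}
	fmt.Printf("update succeeded after %d attempts\n", attempts)
}
```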