Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Cancel instance refresh on any relevant change to ASG instead of blocking until previous one is finished (which may have led to failing nodes due to outdated join token) #598

Merged
merged 1 commit into from
Jul 3, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -186,6 +186,7 @@ func (t Template) ControllersPolicy() *iamv1.PolicyDocument {
"arn:*:autoscaling:*:*:autoScalingGroup:*:autoScalingGroupName/*",
},
Action: iamv1.Actions{
"autoscaling:CancelInstanceRefresh",
"autoscaling:CreateAutoScalingGroup",
"autoscaling:UpdateAutoScalingGroup",
"autoscaling:CreateOrUpdateTags",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -241,6 +241,7 @@ Resources:
Resource:
- '*'
- Action:
- autoscaling:CancelInstanceRefresh
- autoscaling:CreateAutoScalingGroup
- autoscaling:UpdateAutoScalingGroup
- autoscaling:CreateOrUpdateTags
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -241,6 +241,7 @@ Resources:
Resource:
- '*'
- Action:
- autoscaling:CancelInstanceRefresh
- autoscaling:CreateAutoScalingGroup
- autoscaling:UpdateAutoScalingGroup
- autoscaling:CreateOrUpdateTags
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -247,6 +247,7 @@ Resources:
Resource:
- '*'
- Action:
- autoscaling:CancelInstanceRefresh
- autoscaling:CreateAutoScalingGroup
- autoscaling:UpdateAutoScalingGroup
- autoscaling:CreateOrUpdateTags
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -241,6 +241,7 @@ Resources:
Resource:
- '*'
- Action:
- autoscaling:CancelInstanceRefresh
- autoscaling:CreateAutoScalingGroup
- autoscaling:UpdateAutoScalingGroup
- autoscaling:CreateOrUpdateTags
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -247,6 +247,7 @@ Resources:
Resource:
- '*'
- Action:
- autoscaling:CancelInstanceRefresh
- autoscaling:CreateAutoScalingGroup
- autoscaling:UpdateAutoScalingGroup
- autoscaling:CreateOrUpdateTags
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -247,6 +247,7 @@ Resources:
Resource:
- '*'
- Action:
- autoscaling:CancelInstanceRefresh
- autoscaling:CreateAutoScalingGroup
- autoscaling:UpdateAutoScalingGroup
- autoscaling:CreateOrUpdateTags
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -241,6 +241,7 @@ Resources:
Resource:
- '*'
- Action:
- autoscaling:CancelInstanceRefresh
- autoscaling:CreateAutoScalingGroup
- autoscaling:UpdateAutoScalingGroup
- autoscaling:CreateOrUpdateTags
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -241,6 +241,7 @@ Resources:
Resource:
- '*'
- Action:
- autoscaling:CancelInstanceRefresh
- autoscaling:CreateAutoScalingGroup
- autoscaling:UpdateAutoScalingGroup
- autoscaling:CreateOrUpdateTags
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -241,6 +241,7 @@ Resources:
Resource:
- '*'
- Action:
- autoscaling:CancelInstanceRefresh
- autoscaling:CreateAutoScalingGroup
- autoscaling:UpdateAutoScalingGroup
- autoscaling:CreateOrUpdateTags
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -241,6 +241,7 @@ Resources:
Resource:
- '*'
- Action:
- autoscaling:CancelInstanceRefresh
- autoscaling:CreateAutoScalingGroup
- autoscaling:UpdateAutoScalingGroup
- autoscaling:CreateOrUpdateTags
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -241,6 +241,7 @@ Resources:
Resource:
- '*'
- Action:
- autoscaling:CancelInstanceRefresh
- autoscaling:CreateAutoScalingGroup
- autoscaling:UpdateAutoScalingGroup
- autoscaling:CreateOrUpdateTags
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -247,6 +247,7 @@ Resources:
Resource:
- '*'
- Action:
- autoscaling:CancelInstanceRefresh
- autoscaling:CreateAutoScalingGroup
- autoscaling:UpdateAutoScalingGroup
- autoscaling:CreateOrUpdateTags
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -241,6 +241,7 @@ Resources:
Resource:
- '*'
- Action:
- autoscaling:CancelInstanceRefresh
- autoscaling:CreateAutoScalingGroup
- autoscaling:UpdateAutoScalingGroup
- autoscaling:CreateOrUpdateTags
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -241,6 +241,7 @@ Resources:
Resource:
- '*'
- Action:
- autoscaling:CancelInstanceRefresh
- autoscaling:CreateAutoScalingGroup
- autoscaling:UpdateAutoScalingGroup
- autoscaling:CreateOrUpdateTags
Expand Down
44 changes: 26 additions & 18 deletions exp/controllers/awsmachinepool_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -192,13 +192,13 @@ func (r *AWSMachinePoolReconciler) Reconcile(ctx context.Context, req ctrl.Reque
return ctrl.Result{}, r.reconcileDelete(machinePoolScope, infraScope, infraScope)
}

return ctrl.Result{}, r.reconcileNormal(ctx, machinePoolScope, infraScope, infraScope, s3Scope)
return r.reconcileNormal(ctx, machinePoolScope, infraScope, infraScope, s3Scope)
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

reconcileNormal now returns a Result object as well, since it may return "please wait 30 seconds until the previous ASG instance refresh is cancelled"

case *scope.ClusterScope:
if !awsMachinePool.ObjectMeta.DeletionTimestamp.IsZero() {
return ctrl.Result{}, r.reconcileDelete(machinePoolScope, infraScope, infraScope)
}

return ctrl.Result{}, r.reconcileNormal(ctx, machinePoolScope, infraScope, infraScope, s3Scope)
return r.reconcileNormal(ctx, machinePoolScope, infraScope, infraScope, s3Scope)
default:
return ctrl.Result{}, errors.New("infraCluster has unknown type")
}
Expand All @@ -216,7 +216,7 @@ func (r *AWSMachinePoolReconciler) SetupWithManager(ctx context.Context, mgr ctr
Complete(r)
}

func (r *AWSMachinePoolReconciler) reconcileNormal(ctx context.Context, machinePoolScope *scope.MachinePoolScope, clusterScope cloud.ClusterScoper, ec2Scope scope.EC2Scope, s3Scope scope.S3Scope) error {
func (r *AWSMachinePoolReconciler) reconcileNormal(ctx context.Context, machinePoolScope *scope.MachinePoolScope, clusterScope cloud.ClusterScoper, ec2Scope scope.EC2Scope, s3Scope scope.S3Scope) (ctrl.Result, error) {
clusterScope.Info("Reconciling AWSMachinePool")

// If the AWSMachine is in an error state, return early.
Expand All @@ -225,28 +225,28 @@ func (r *AWSMachinePoolReconciler) reconcileNormal(ctx context.Context, machineP

// TODO: If we are in a failed state, delete the secret regardless of instance state

return nil
return ctrl.Result{}, nil
}

// If the AWSMachinepool doesn't have our finalizer, add it
if controllerutil.AddFinalizer(machinePoolScope.AWSMachinePool, expinfrav1.MachinePoolFinalizer) {
// Register finalizer immediately to avoid orphaning AWS resources
if err := machinePoolScope.PatchObject(); err != nil {
return err
return ctrl.Result{}, err
}
}

if !machinePoolScope.Cluster.Status.InfrastructureReady {
machinePoolScope.Info("Cluster infrastructure is not ready yet")
conditions.MarkFalse(machinePoolScope.AWSMachinePool, expinfrav1.ASGReadyCondition, infrav1.WaitingForClusterInfrastructureReason, clusterv1.ConditionSeverityInfo, "")
return nil
return ctrl.Result{}, nil
}

// Make sure bootstrap data is available and populated
if machinePoolScope.MachinePool.Spec.Template.Spec.Bootstrap.DataSecretName == nil {
machinePoolScope.Info("Bootstrap data secret reference is not yet available")
conditions.MarkFalse(machinePoolScope.AWSMachinePool, expinfrav1.ASGReadyCondition, infrav1.WaitingForBootstrapDataReason, clusterv1.ConditionSeverityInfo, "")
return nil
return ctrl.Result{}, nil
}

ec2Svc := r.getEC2Service(ec2Scope)
Expand All @@ -258,20 +258,24 @@ func (r *AWSMachinePoolReconciler) reconcileNormal(ctx context.Context, machineP
asg, err := r.findASG(machinePoolScope, asgsvc)
if err != nil {
conditions.MarkUnknown(machinePoolScope.AWSMachinePool, expinfrav1.ASGReadyCondition, expinfrav1.ASGNotFoundReason, err.Error())
return err
return ctrl.Result{}, err
}

canUpdateLaunchTemplate := func() (bool, error) {
canStartInstanceRefresh := func() (bool, *string, error) {
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've renamed this. The name was much too generic. There's only one usage of these functions, and that's for ASGs + instance refresh. No need to give it a name with broad meaning.

This now additionally returns the status of the previous instance refresh, if any. With that, we can decide whether CancelInstanceRefresh needs to be called (e.g. status InProgress), or not (already in status Cancelling).

// If there is a change: before changing the template, check if there exist an ongoing instance refresh,
// because only 1 instance refresh can be "InProgress". If template is updated when refresh cannot be started,
// that change will not trigger a refresh. Do not start an instance refresh if only userdata changed.
if asg == nil {
// If the ASG hasn't been created yet, there is no need to check if we can start the instance refresh.
// But we want to update the LaunchTemplate because an error in the LaunchTemplate may be blocking the ASG creation.
return true, nil
return true, nil, nil
}
return asgsvc.CanStartASGInstanceRefresh(machinePoolScope)
}
cancelInstanceRefresh := func() error {
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This callback function is new, and gets passed to the launch template / ASG reconciliation code which decides whether to cancel the previous instance refresh (because there are changes that need a new refresh = roll all nodes again).

machinePoolScope.Info("cancelling instance refresh")
return asgsvc.CancelASGInstanceRefresh(machinePoolScope)
}
runPostLaunchTemplateUpdateOperation := func() error {
// skip instance refresh if ASG is not created yet
if asg == nil {
Expand All @@ -295,10 +299,14 @@ func (r *AWSMachinePoolReconciler) reconcileNormal(ctx context.Context, machineP
machinePoolScope.Info("starting instance refresh", "number of instances", machinePoolScope.MachinePool.Spec.Replicas)
return asgsvc.StartASGInstanceRefresh(machinePoolScope)
}
if err := reconSvc.ReconcileLaunchTemplate(machinePoolScope, machinePoolScope, s3Scope, ec2Svc, objectStoreSvc, canUpdateLaunchTemplate, runPostLaunchTemplateUpdateOperation); err != nil {
res, err := reconSvc.ReconcileLaunchTemplate(machinePoolScope, machinePoolScope, s3Scope, ec2Svc, objectStoreSvc, canStartInstanceRefresh, cancelInstanceRefresh, runPostLaunchTemplateUpdateOperation)
if err != nil {
r.Recorder.Eventf(machinePoolScope.AWSMachinePool, corev1.EventTypeWarning, "FailedLaunchTemplateReconcile", "Failed to reconcile launch template: %v", err)
machinePoolScope.Error(err, "failed to reconcile launch template")
return err
return ctrl.Result{}, err
}
if res != nil {
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As above: this function can now optionally return a Result. In that case, return early (it delays the reconciliation with RequeueAfter: ... seconds).

return *res, nil
}

// set the LaunchTemplateReady condition
Expand All @@ -308,9 +316,9 @@ func (r *AWSMachinePoolReconciler) reconcileNormal(ctx context.Context, machineP
// Create new ASG
if err := r.createPool(machinePoolScope, clusterScope); err != nil {
conditions.MarkFalse(machinePoolScope.AWSMachinePool, expinfrav1.ASGReadyCondition, expinfrav1.ASGProvisionFailedReason, clusterv1.ConditionSeverityError, err.Error())
return err
return ctrl.Result{}, err
}
return nil
return ctrl.Result{}, nil
}

if annotations.ReplicasManagedByExternalAutoscaler(machinePoolScope.MachinePool) {
Expand All @@ -321,14 +329,14 @@ func (r *AWSMachinePoolReconciler) reconcileNormal(ctx context.Context, machineP
"external", asg.DesiredCapacity)
machinePoolScope.MachinePool.Spec.Replicas = asg.DesiredCapacity
if err := machinePoolScope.PatchCAPIMachinePoolObject(ctx); err != nil {
return err
return ctrl.Result{}, err
}
}
}

if err := r.updatePool(machinePoolScope, clusterScope, asg); err != nil {
machinePoolScope.Error(err, "error updating AWSMachinePool")
return err
return ctrl.Result{}, err
}

launchTemplateID := machinePoolScope.GetLaunchTemplateIDStatus()
Expand All @@ -345,7 +353,7 @@ func (r *AWSMachinePoolReconciler) reconcileNormal(ctx context.Context, machineP
}
err = reconSvc.ReconcileTags(machinePoolScope, resourceServiceToUpdate)
if err != nil {
return errors.Wrap(err, "error updating tags")
return ctrl.Result{}, errors.Wrap(err, "error updating tags")
}

// Make sure Spec.ProviderID is always set.
Expand All @@ -368,7 +376,7 @@ func (r *AWSMachinePoolReconciler) reconcileNormal(ctx context.Context, machineP
machinePoolScope.Error(err, "failed updating instances", "instances", asg.Instances)
}

return nil
return ctrl.Result{}, nil
}

func (r *AWSMachinePoolReconciler) reconcileDelete(machinePoolScope *scope.MachinePoolScope, clusterScope cloud.ClusterScoper, ec2Scope scope.EC2Scope) error {
Expand Down
Loading
Loading