-
Notifications
You must be signed in to change notification settings - Fork 1
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Cancel instance refresh on any relevant change to ASG instead of blocking until previous one is finished (which may have led to failing nodes due to outdated join token) #598
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -192,13 +192,13 @@ func (r *AWSMachinePoolReconciler) Reconcile(ctx context.Context, req ctrl.Reque | |
return ctrl.Result{}, r.reconcileDelete(machinePoolScope, infraScope, infraScope) | ||
} | ||
|
||
return ctrl.Result{}, r.reconcileNormal(ctx, machinePoolScope, infraScope, infraScope, s3Scope) | ||
return r.reconcileNormal(ctx, machinePoolScope, infraScope, infraScope, s3Scope) | ||
case *scope.ClusterScope: | ||
if !awsMachinePool.ObjectMeta.DeletionTimestamp.IsZero() { | ||
return ctrl.Result{}, r.reconcileDelete(machinePoolScope, infraScope, infraScope) | ||
} | ||
|
||
return ctrl.Result{}, r.reconcileNormal(ctx, machinePoolScope, infraScope, infraScope, s3Scope) | ||
return r.reconcileNormal(ctx, machinePoolScope, infraScope, infraScope, s3Scope) | ||
default: | ||
return ctrl.Result{}, errors.New("infraCluster has unknown type") | ||
} | ||
|
@@ -216,7 +216,7 @@ func (r *AWSMachinePoolReconciler) SetupWithManager(ctx context.Context, mgr ctr | |
Complete(r) | ||
} | ||
|
||
func (r *AWSMachinePoolReconciler) reconcileNormal(ctx context.Context, machinePoolScope *scope.MachinePoolScope, clusterScope cloud.ClusterScoper, ec2Scope scope.EC2Scope, s3Scope scope.S3Scope) error { | ||
func (r *AWSMachinePoolReconciler) reconcileNormal(ctx context.Context, machinePoolScope *scope.MachinePoolScope, clusterScope cloud.ClusterScoper, ec2Scope scope.EC2Scope, s3Scope scope.S3Scope) (ctrl.Result, error) { | ||
clusterScope.Info("Reconciling AWSMachinePool") | ||
|
||
// If the AWSMachine is in an error state, return early. | ||
|
@@ -225,28 +225,28 @@ func (r *AWSMachinePoolReconciler) reconcileNormal(ctx context.Context, machineP | |
|
||
// TODO: If we are in a failed state, delete the secret regardless of instance state | ||
|
||
return nil | ||
return ctrl.Result{}, nil | ||
} | ||
|
||
// If the AWSMachinepool doesn't have our finalizer, add it | ||
if controllerutil.AddFinalizer(machinePoolScope.AWSMachinePool, expinfrav1.MachinePoolFinalizer) { | ||
// Register finalizer immediately to avoid orphaning AWS resources | ||
if err := machinePoolScope.PatchObject(); err != nil { | ||
return err | ||
return ctrl.Result{}, err | ||
} | ||
} | ||
|
||
if !machinePoolScope.Cluster.Status.InfrastructureReady { | ||
machinePoolScope.Info("Cluster infrastructure is not ready yet") | ||
conditions.MarkFalse(machinePoolScope.AWSMachinePool, expinfrav1.ASGReadyCondition, infrav1.WaitingForClusterInfrastructureReason, clusterv1.ConditionSeverityInfo, "") | ||
return nil | ||
return ctrl.Result{}, nil | ||
} | ||
|
||
// Make sure bootstrap data is available and populated | ||
if machinePoolScope.MachinePool.Spec.Template.Spec.Bootstrap.DataSecretName == nil { | ||
machinePoolScope.Info("Bootstrap data secret reference is not yet available") | ||
conditions.MarkFalse(machinePoolScope.AWSMachinePool, expinfrav1.ASGReadyCondition, infrav1.WaitingForBootstrapDataReason, clusterv1.ConditionSeverityInfo, "") | ||
return nil | ||
return ctrl.Result{}, nil | ||
} | ||
|
||
ec2Svc := r.getEC2Service(ec2Scope) | ||
|
@@ -258,20 +258,24 @@ func (r *AWSMachinePoolReconciler) reconcileNormal(ctx context.Context, machineP | |
asg, err := r.findASG(machinePoolScope, asgsvc) | ||
if err != nil { | ||
conditions.MarkUnknown(machinePoolScope.AWSMachinePool, expinfrav1.ASGReadyCondition, expinfrav1.ASGNotFoundReason, err.Error()) | ||
return err | ||
return ctrl.Result{}, err | ||
} | ||
|
||
canUpdateLaunchTemplate := func() (bool, error) { | ||
canStartInstanceRefresh := func() (bool, *string, error) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I've renamed this. The name was much too generic. There's only one usage of these functions, and that's for ASGs + instance refresh. No need to give it a name with broad meaning. This now additionally returns the status of the previous instance refresh, if any. With that, we can decide whether |
||
// If there is a change: before changing the template, check if there exist an ongoing instance refresh, | ||
// because only 1 instance refresh can be "InProgress". If template is updated when refresh cannot be started, | ||
// that change will not trigger a refresh. Do not start an instance refresh if only userdata changed. | ||
if asg == nil { | ||
// If the ASG hasn't been created yet, there is no need to check if we can start the instance refresh. | ||
// But we want to update the LaunchTemplate because an error in the LaunchTemplate may be blocking the ASG creation. | ||
return true, nil | ||
return true, nil, nil | ||
} | ||
return asgsvc.CanStartASGInstanceRefresh(machinePoolScope) | ||
} | ||
cancelInstanceRefresh := func() error { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This callback function is new, and gets passed to the launch template / ASG reconciliation code which decides whether to cancel the previous instance refresh (because there are changes that need a new refresh = roll all nodes again). |
||
machinePoolScope.Info("cancelling instance refresh") | ||
return asgsvc.CancelASGInstanceRefresh(machinePoolScope) | ||
} | ||
runPostLaunchTemplateUpdateOperation := func() error { | ||
// skip instance refresh if ASG is not created yet | ||
if asg == nil { | ||
|
@@ -295,10 +299,14 @@ func (r *AWSMachinePoolReconciler) reconcileNormal(ctx context.Context, machineP | |
machinePoolScope.Info("starting instance refresh", "number of instances", machinePoolScope.MachinePool.Spec.Replicas) | ||
return asgsvc.StartASGInstanceRefresh(machinePoolScope) | ||
} | ||
if err := reconSvc.ReconcileLaunchTemplate(machinePoolScope, machinePoolScope, s3Scope, ec2Svc, objectStoreSvc, canUpdateLaunchTemplate, runPostLaunchTemplateUpdateOperation); err != nil { | ||
res, err := reconSvc.ReconcileLaunchTemplate(machinePoolScope, machinePoolScope, s3Scope, ec2Svc, objectStoreSvc, canStartInstanceRefresh, cancelInstanceRefresh, runPostLaunchTemplateUpdateOperation) | ||
if err != nil { | ||
r.Recorder.Eventf(machinePoolScope.AWSMachinePool, corev1.EventTypeWarning, "FailedLaunchTemplateReconcile", "Failed to reconcile launch template: %v", err) | ||
machinePoolScope.Error(err, "failed to reconcile launch template") | ||
return err | ||
return ctrl.Result{}, err | ||
} | ||
if res != nil { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. As above: this function can now optionally return a |
||
return *res, nil | ||
} | ||
|
||
// set the LaunchTemplateReady condition | ||
|
@@ -308,9 +316,9 @@ func (r *AWSMachinePoolReconciler) reconcileNormal(ctx context.Context, machineP | |
// Create new ASG | ||
if err := r.createPool(machinePoolScope, clusterScope); err != nil { | ||
conditions.MarkFalse(machinePoolScope.AWSMachinePool, expinfrav1.ASGReadyCondition, expinfrav1.ASGProvisionFailedReason, clusterv1.ConditionSeverityError, err.Error()) | ||
return err | ||
return ctrl.Result{}, err | ||
} | ||
return nil | ||
return ctrl.Result{}, nil | ||
} | ||
|
||
if annotations.ReplicasManagedByExternalAutoscaler(machinePoolScope.MachinePool) { | ||
|
@@ -321,14 +329,14 @@ func (r *AWSMachinePoolReconciler) reconcileNormal(ctx context.Context, machineP | |
"external", asg.DesiredCapacity) | ||
machinePoolScope.MachinePool.Spec.Replicas = asg.DesiredCapacity | ||
if err := machinePoolScope.PatchCAPIMachinePoolObject(ctx); err != nil { | ||
return err | ||
return ctrl.Result{}, err | ||
} | ||
} | ||
} | ||
|
||
if err := r.updatePool(machinePoolScope, clusterScope, asg); err != nil { | ||
machinePoolScope.Error(err, "error updating AWSMachinePool") | ||
return err | ||
return ctrl.Result{}, err | ||
} | ||
|
||
launchTemplateID := machinePoolScope.GetLaunchTemplateIDStatus() | ||
|
@@ -345,7 +353,7 @@ func (r *AWSMachinePoolReconciler) reconcileNormal(ctx context.Context, machineP | |
} | ||
err = reconSvc.ReconcileTags(machinePoolScope, resourceServiceToUpdate) | ||
if err != nil { | ||
return errors.Wrap(err, "error updating tags") | ||
return ctrl.Result{}, errors.Wrap(err, "error updating tags") | ||
} | ||
|
||
// Make sure Spec.ProviderID is always set. | ||
|
@@ -368,7 +376,7 @@ func (r *AWSMachinePoolReconciler) reconcileNormal(ctx context.Context, machineP | |
machinePoolScope.Error(err, "failed updating instances", "instances", asg.Instances) | ||
} | ||
|
||
return nil | ||
return ctrl.Result{}, nil | ||
} | ||
|
||
func (r *AWSMachinePoolReconciler) reconcileDelete(machinePoolScope *scope.MachinePoolScope, clusterScope cloud.ClusterScoper, ec2Scope scope.EC2Scope) error { | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
reconcileNormal
now returns aResult
object as well, since it may return "please wait 30 seconds until the previous ASG instance refresh is cancelled"