From bc7fb25d27789e431c0f21155b39295d98c71ae8 Mon Sep 17 00:00:00 2001 From: Hidde Beydals Date: Thu, 30 Nov 2023 12:41:45 +0100 Subject: [PATCH 1/7] api: limit maximum number of snapshots in history This ensures that on repetitive failures, the number of snapshots does not grow indefinitely due to there not being any in a superseded or deployed state. Signed-off-by: Hidde Beydals --- api/v2beta2/helmrelease_types.go | 7 ++++++- api/v2beta2/snapshot_types.go | 11 ++++++++++- api/v2beta2/snapshot_types_test.go | 18 ++++++++++++++++++ 3 files changed, 34 insertions(+), 2 deletions(-) diff --git a/api/v2beta2/helmrelease_types.go b/api/v2beta2/helmrelease_types.go index 56cf9709d..eaea9c01c 100644 --- a/api/v2beta2/helmrelease_types.go +++ b/api/v2beta2/helmrelease_types.go @@ -37,6 +37,11 @@ const ( HelmReleaseFinalizer = "finalizers.fluxcd.io" ) +const ( + // defaultMaxHistory is the default number of Helm release versions to keep. + defaultMaxHistory = 5 +) + // Kustomize Helm PostRenderer specification. type Kustomize struct { // Strategic merge and JSON patches, defined as inline YAML objects, @@ -1200,7 +1205,7 @@ func (in HelmRelease) GetTimeout() metav1.Duration { // GetMaxHistory returns the configured MaxHistory, or the default of 5. func (in HelmRelease) GetMaxHistory() int { if in.Spec.MaxHistory == nil { - return 5 + return defaultMaxHistory } return *in.Spec.MaxHistory } diff --git a/api/v2beta2/snapshot_types.go b/api/v2beta2/snapshot_types.go index 2258acc58..587667665 100644 --- a/api/v2beta2/snapshot_types.go +++ b/api/v2beta2/snapshot_types.go @@ -81,11 +81,13 @@ func (in Snapshots) Previous(ignoreTests bool) *Snapshot { } // Truncate removes all Snapshots up to the Previous deployed Snapshot. -// If there is no previous-deployed Snapshot, no Snapshots are removed. +// If there is no previous-deployed Snapshot, the most recent 5 Snapshots are +// retained. func (in *Snapshots) Truncate(ignoreTests bool) { if in.Len() < 2 { return } + in.SortByVersion() for i := range (*in)[1:] { s := (*in)[i+1] @@ -96,6 +98,13 @@ func (in *Snapshots) Truncate(ignoreTests bool) { } } } + + if in.Len() > defaultMaxHistory { + // If none of the Snapshots are deployed or superseded, and there + // are more than the defaultMaxHistory, truncate to the most recent + // Snapshots. + *in = (*in)[:defaultMaxHistory] + } } // Snapshot captures a point-in-time copy of the status information for a Helm release, diff --git a/api/v2beta2/snapshot_types_test.go b/api/v2beta2/snapshot_types_test.go index 8447868b9..32911877f 100644 --- a/api/v2beta2/snapshot_types_test.go +++ b/api/v2beta2/snapshot_types_test.go @@ -258,6 +258,24 @@ func TestSnapshots_Truncate(t *testing.T) { }}, }, }, + { + name: "retains most recent snapshots when all have failed", + in: Snapshots{ + {Version: 6, Status: "deployed"}, + {Version: 5, Status: "failed"}, + {Version: 4, Status: "failed"}, + {Version: 3, Status: "failed"}, + {Version: 2, Status: "failed"}, + {Version: 1, Status: "failed"}, + }, + want: Snapshots{ + {Version: 6, Status: "deployed"}, + {Version: 5, Status: "failed"}, + {Version: 4, Status: "failed"}, + {Version: 3, Status: "failed"}, + {Version: 2, Status: "failed"}, + }, + }, { name: "without previous snapshot", in: Snapshots{ From 48cad6838633344b70e50670c791ea3399097dd5 Mon Sep 17 00:00:00 2001 From: Hidde Beydals Date: Thu, 30 Nov 2023 12:59:32 +0100 Subject: [PATCH 2/7] controller: unready dep should not bump obs gen This ensures that any unfulfilled dependencies for which we requeue do not prematurely bump the observed generation by introducing typed errors. These typed errors ensure that the logic to bump the observed generation can continue to be the same, while ignoring them just in time before returning the final error. Signed-off-by: Hidde Beydals --- internal/controller/helmrelease_controller.go | 21 ++++++++-- .../controller/helmrelease_controller_test.go | 10 ++--- internal/errors/ignore.go | 25 ++++++++++++ internal/errors/ignore_test.go | 40 +++++++++++++++++++ 4 files changed, 87 insertions(+), 9 deletions(-) create mode 100644 internal/errors/ignore.go create mode 100644 internal/errors/ignore_test.go diff --git a/internal/controller/helmrelease_controller.go b/internal/controller/helmrelease_controller.go index d10911653..ad2ed11c2 100644 --- a/internal/controller/helmrelease_controller.go +++ b/internal/controller/helmrelease_controller.go @@ -59,6 +59,7 @@ import ( "github.com/fluxcd/helm-controller/internal/action" "github.com/fluxcd/helm-controller/internal/chartutil" "github.com/fluxcd/helm-controller/internal/digest" + interrors "github.com/fluxcd/helm-controller/internal/errors" "github.com/fluxcd/helm-controller/internal/features" "github.com/fluxcd/helm-controller/internal/kube" "github.com/fluxcd/helm-controller/internal/loader" @@ -97,6 +98,11 @@ type HelmReleaseReconcilerOptions struct { RateLimiter ratelimiter.RateLimiter } +var ( + errWaitForDependency = errors.New("must wait for dependency") + errWaitForChart = errors.New("must wait for chart") +) + func (r *HelmReleaseReconciler) SetupWithManager(ctx context.Context, mgr ctrl.Manager, opts HelmReleaseReconcilerOptions) error { // Index the HelmRelease by the HelmChart references they point at if err := mgr.GetFieldIndexer().IndexField(ctx, &v2.HelmRelease{}, v2.SourceIndexKey, @@ -167,6 +173,13 @@ func (r *HelmReleaseReconciler) Reconcile(ctx context.Context, req ctrl.Request) patchOpts = append(patchOpts, patch.WithStatusObservedGeneration{}) } + // We do not want to return these errors, but rather wait for the + // designated RequeueAfter to expire and try again. + // However, not returning an error will cause the patch helper to + // patch the observed generation, which we do not want. So we ignore + // these errors here after patching. + retErr = interrors.Ignore(retErr, errWaitForDependency, errWaitForChart) + if err := patchHelper.Patch(ctx, obj, patchOpts...); err != nil { if !obj.DeletionTimestamp.IsZero() { err = apierrutil.FilterOut(err, func(e error) bool { return apierrors.IsNotFound(e) }) @@ -230,7 +243,7 @@ func (r *HelmReleaseReconciler) reconcileRelease(ctx context.Context, patchHelpe // Exponential backoff would cause execution to be prolonged too much, // instead we requeue on a fixed interval. - return ctrl.Result{RequeueAfter: r.requeueDependency}, nil + return ctrl.Result{RequeueAfter: r.requeueDependency}, errWaitForDependency } log.Info("all dependencies are ready") @@ -262,7 +275,7 @@ func (r *HelmReleaseReconciler) reconcileRelease(ctx context.Context, patchHelpe conditions.MarkFalse(obj, meta.ReadyCondition, "HelmChartNotReady", msg) // Do not requeue immediately, when the artifact is created // the watcher should trigger a reconciliation. - return jitter.JitteredRequeueInterval(ctrl.Result{RequeueAfter: obj.GetRequeueAfter()}), nil + return jitter.JitteredRequeueInterval(ctrl.Result{RequeueAfter: obj.GetRequeueAfter()}), errWaitForChart } // Compose values based from the spec and references. @@ -276,10 +289,10 @@ func (r *HelmReleaseReconciler) reconcileRelease(ctx context.Context, patchHelpe loadedChart, err := loader.SecureLoadChartFromURL(r.httpClient, hc.GetArtifact().URL, hc.GetArtifact().Digest) if err != nil { if errors.Is(err, loader.ErrFileNotFound) { - msg := fmt.Sprintf("Chart not ready: artifact not found. Retrying in %s", r.requeueDependency) + msg := fmt.Sprintf("Chart not ready: artifact not found. Retrying in %s", r.requeueDependency.String()) conditions.MarkFalse(obj, meta.ReadyCondition, v2.ArtifactFailedReason, msg) log.Info(msg) - return ctrl.Result{RequeueAfter: r.requeueDependency}, nil + return ctrl.Result{RequeueAfter: r.requeueDependency}, errWaitForDependency } conditions.MarkFalse(obj, meta.ReadyCondition, v2.ArtifactFailedReason, fmt.Sprintf("Could not load chart: %s", err.Error())) diff --git a/internal/controller/helmrelease_controller_test.go b/internal/controller/helmrelease_controller_test.go index e432a210b..e5c31ffa5 100644 --- a/internal/controller/helmrelease_controller_test.go +++ b/internal/controller/helmrelease_controller_test.go @@ -108,7 +108,7 @@ func TestHelmReleaseReconciler_reconcileRelease(t *testing.T) { } res, err := r.reconcileRelease(context.TODO(), patch.NewSerialPatcher(obj, r.Client), obj) - g.Expect(err).ToNot(HaveOccurred()) + g.Expect(err).To(Equal(errWaitForDependency)) g.Expect(res.RequeueAfter).To(Equal(r.requeueDependency)) g.Expect(obj.Status.Conditions).To(conditions.MatchConditions([]metav1.Condition{ @@ -222,7 +222,7 @@ func TestHelmReleaseReconciler_reconcileRelease(t *testing.T) { } res, err := r.reconcileRelease(context.TODO(), patch.NewSerialPatcher(obj, r.Client), obj) - g.Expect(err).ToNot(HaveOccurred()) + g.Expect(err).To(Equal(errWaitForChart)) g.Expect(res.RequeueAfter).To(Equal(obj.Spec.Interval.Duration)) g.Expect(obj.Status.Conditions).To(conditions.MatchConditions([]metav1.Condition{ @@ -274,7 +274,7 @@ func TestHelmReleaseReconciler_reconcileRelease(t *testing.T) { } res, err := r.reconcileRelease(context.TODO(), patch.NewSerialPatcher(obj, r.Client), obj) - g.Expect(err).ToNot(HaveOccurred()) + g.Expect(err).To(Equal(errWaitForChart)) g.Expect(res.RequeueAfter).To(Equal(obj.Spec.Interval.Duration)) g.Expect(obj.Status.Conditions).To(conditions.MatchConditions([]metav1.Condition{ @@ -326,7 +326,7 @@ func TestHelmReleaseReconciler_reconcileRelease(t *testing.T) { } res, err := r.reconcileRelease(context.TODO(), patch.NewSerialPatcher(obj, r.Client), obj) - g.Expect(err).ToNot(HaveOccurred()) + g.Expect(err).To(Equal(errWaitForChart)) g.Expect(res.RequeueAfter).To(Equal(obj.Spec.Interval.Duration)) g.Expect(obj.Status.Conditions).To(conditions.MatchConditions([]metav1.Condition{ @@ -438,7 +438,7 @@ func TestHelmReleaseReconciler_reconcileRelease(t *testing.T) { } res, err := r.reconcileRelease(context.TODO(), patch.NewSerialPatcher(obj, r.Client), obj) - g.Expect(err).ToNot(HaveOccurred()) + g.Expect(err).To(Equal(errWaitForDependency)) g.Expect(res.RequeueAfter).To(Equal(r.requeueDependency)) g.Expect(obj.Status.Conditions).To(conditions.MatchConditions([]metav1.Condition{ diff --git a/internal/errors/ignore.go b/internal/errors/ignore.go new file mode 100644 index 000000000..8cd247c92 --- /dev/null +++ b/internal/errors/ignore.go @@ -0,0 +1,25 @@ +/* +Copyright 2023 The Flux authors + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package errors + +// Ignore returns nil if err is equal to any of the errs. +func Ignore(err error, errs ...error) error { + if IsOneOf(err, errs...) { + return nil + } + return err +} diff --git a/internal/errors/ignore_test.go b/internal/errors/ignore_test.go new file mode 100644 index 000000000..bcf204ea3 --- /dev/null +++ b/internal/errors/ignore_test.go @@ -0,0 +1,40 @@ +/* +Copyright 2023 The Flux authors + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package errors + +import ( + "errors" + "testing" +) + +func TestIgnore(t *testing.T) { + err1 := errors.New("error1") + err2 := errors.New("error2") + + if err := Ignore(err1, err1, err2); err != nil { + t.Errorf("Expected Ignore to return nil when the error is in the list, but got %v", err) + } + + err3 := errors.New("error3") + if err := Ignore(err3, err1, err2); !errors.Is(err, err3) { + t.Errorf("Expected Ignore to return the error when it is not in the list, but got %v", err) + } + + if err := Ignore(err1); !errors.Is(err, err1) { + t.Errorf("Expected Ignore to return the error with an empty list of errors, but got %v", err) + } +} From 0a2041c3385b2e15e5424d32b554746132d19d85 Mon Sep 17 00:00:00 2001 From: Hidde Beydals Date: Thu, 30 Nov 2023 22:07:00 +0100 Subject: [PATCH 3/7] controller: ensure object in cache before requeue Signed-off-by: Hidde Beydals --- internal/controller/helmrelease_controller.go | 30 ++++- .../controller/helmrelease_controller_test.go | 110 ++++++++++++++++++ internal/reconcile/atomic_release.go | 2 +- 3 files changed, 140 insertions(+), 2 deletions(-) diff --git a/internal/controller/helmrelease_controller.go b/internal/controller/helmrelease_controller.go index ad2ed11c2..660cf30ab 100644 --- a/internal/controller/helmrelease_controller.go +++ b/internal/controller/helmrelease_controller.go @@ -25,9 +25,11 @@ import ( "github.com/hashicorp/go-retryablehttp" corev1 "k8s.io/api/core/v1" + apiequality "k8s.io/apimachinery/pkg/api/equality" apierrors "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/types" apierrutil "k8s.io/apimachinery/pkg/util/errors" + "k8s.io/apimachinery/pkg/util/wait" "k8s.io/cli-runtime/pkg/genericclioptions" "k8s.io/client-go/rest" kuberecorder "k8s.io/client-go/tools/record" @@ -187,6 +189,14 @@ func (r *HelmReleaseReconciler) Reconcile(ctx context.Context, req ctrl.Request) retErr = apierrutil.Reduce(apierrutil.NewAggregate([]error{retErr, err})) } + // Wait for the object to have synced in-cache after patching. + // This is required to ensure that the next reconciliation will + // operate on the patched object when an immediate reconcile is + // requested. + if err := wait.PollUntilContextTimeout(context.Background(), 1*time.Second, 10*time.Second, true, r.waitForHistoryCacheSync(obj)); err != nil { + log.Error(err, "failed to wait for object to sync in-cache after patching") + } + // Always record suspend, readiness and duration metrics. r.Metrics.RecordSuspend(ctx, obj, obj.Spec.Suspend) r.Metrics.RecordReadiness(ctx, obj) @@ -226,7 +236,7 @@ func (r *HelmReleaseReconciler) reconcileRelease(ctx context.Context, patchHelpe // Mark the resource as under reconciliation. conditions.MarkReconciling(obj, meta.ProgressingReason, "Fulfilling prerequisites") - if err := patchHelper.Patch(ctx, obj); err != nil { + if err := patchHelper.Patch(ctx, obj, patch.WithOwnedConditions{Conditions: intreconcile.OwnedConditions}, patch.WithFieldOwner(r.FieldManager)); err != nil { return ctrl.Result{}, err } @@ -646,6 +656,24 @@ func (r *HelmReleaseReconciler) getHelmChart(ctx context.Context, obj *v2.HelmRe return &hc, nil } +// waitForHistoryCacheSync returns a function that can be used to wait for the +// cache backing the Kubernetes client to be in sync with the current state of +// the v2beta2.HelmRelease. +// This is a trade-off between not caching at all, and introducing a slight +// delay to ensure we always have the latest history state. +func (r *HelmReleaseReconciler) waitForHistoryCacheSync(obj *v2.HelmRelease) wait.ConditionWithContextFunc { + newObj := &v2.HelmRelease{} + return func(ctx context.Context) (bool, error) { + if err := r.Get(ctx, client.ObjectKeyFromObject(obj), newObj); err != nil { + if apierrors.IsNotFound(err) { + return true, nil + } + return false, err + } + return apiequality.Semantic.DeepEqual(obj.Status.History, newObj.Status.History), nil + } +} + func (r *HelmReleaseReconciler) requestsForHelmChartChange(ctx context.Context, o client.Object) []reconcile.Request { hc, ok := o.(*sourcev1.HelmChart) if !ok { diff --git a/internal/controller/helmrelease_controller_test.go b/internal/controller/helmrelease_controller_test.go index e5c31ffa5..1376770c4 100644 --- a/internal/controller/helmrelease_controller_test.go +++ b/internal/controller/helmrelease_controller_test.go @@ -1931,6 +1931,116 @@ func TestHelmReleaseReconciler_getHelmChart(t *testing.T) { } } +func Test_waitForHistoryCacheSync(t *testing.T) { + tests := []struct { + name string + rel *v2.HelmRelease + cacheRel *v2.HelmRelease + want bool + }{ + { + name: "different history", + rel: &v2.HelmRelease{ + ObjectMeta: metav1.ObjectMeta{ + Name: "some-name", + }, + Status: v2.HelmReleaseStatus{ + History: v2.Snapshots{ + { + Version: 2, + Status: "deployed", + }, + { + Version: 1, + Status: "failed", + }, + }, + }, + }, + cacheRel: &v2.HelmRelease{ + ObjectMeta: metav1.ObjectMeta{ + Name: "some-name", + }, + Status: v2.HelmReleaseStatus{ + History: v2.Snapshots{ + { + Version: 1, + Status: "deployed", + }, + }, + }, + }, + want: false, + }, + { + name: "same history", + rel: &v2.HelmRelease{ + ObjectMeta: metav1.ObjectMeta{ + Name: "some-name", + }, + Status: v2.HelmReleaseStatus{ + History: v2.Snapshots{ + { + Version: 2, + Status: "deployed", + }, + { + Version: 1, + Status: "failed", + }, + }, + }, + }, + cacheRel: &v2.HelmRelease{ + ObjectMeta: metav1.ObjectMeta{ + Name: "some-name", + }, + Status: v2.HelmReleaseStatus{ + History: v2.Snapshots{ + { + Version: 2, + Status: "deployed", + }, + { + Version: 1, + Status: "failed", + }, + }, + }, + }, + want: true, + }, + { + name: "does not exist", + rel: &v2.HelmRelease{ + ObjectMeta: metav1.ObjectMeta{ + Name: "some-name", + }, + }, + cacheRel: nil, + want: true, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + g := NewWithT(t) + + c := fake.NewClientBuilder() + c.WithScheme(NewTestScheme()) + if tt.cacheRel != nil { + c.WithObjects(tt.cacheRel) + } + r := &HelmReleaseReconciler{ + Client: c.Build(), + } + + got, err := r.waitForHistoryCacheSync(tt.rel)(context.Background()) + g.Expect(err).ToNot(HaveOccurred()) + g.Expect(got == tt.want).To(BeTrue()) + }) + } +} + func TestValuesReferenceValidation(t *testing.T) { tests := []struct { name string diff --git a/internal/reconcile/atomic_release.go b/internal/reconcile/atomic_release.go index 1e6a9e8af..61bba38ae 100644 --- a/internal/reconcile/atomic_release.go +++ b/internal/reconcile/atomic_release.go @@ -168,7 +168,7 @@ func (r *AtomicRelease) Reconcile(ctx context.Context, req *Request) error { // last observation before returning. If the patch fails, we // log the error and return the original context cancellation // error. - if err := r.patchHelper.Patch(ctx, req.Object); err != nil { + if err := r.patchHelper.Patch(ctx, req.Object, patch.WithOwnedConditions{Conditions: OwnedConditions}, patch.WithFieldOwner(r.fieldManager)); err != nil { log.Error(err, "failed to patch HelmRelease after context cancellation") } cancel() From 51563d601296a50f0d6c96097d8b829b22a9e391 Mon Sep 17 00:00:00 2001 From: Hidde Beydals Date: Fri, 1 Dec 2023 14:57:58 +0100 Subject: [PATCH 4/7] reconcile: stall without rollback target This ensures that if there is no target to roll back to due to all of them being in a failed state, the controller stalls instead of ending up in a loop of upgrade attempts. Signed-off-by: Hidde Beydals --- internal/controller/helmrelease_controller.go | 2 +- internal/reconcile/atomic_release.go | 21 +++++++++--- internal/reconcile/atomic_release_test.go | 33 ++++++++++++++++++- internal/reconcile/release.go | 3 -- internal/reconcile/rollback_remediation.go | 6 ++-- .../reconcile/rollback_remediation_test.go | 4 +-- 6 files changed, 55 insertions(+), 14 deletions(-) diff --git a/internal/controller/helmrelease_controller.go b/internal/controller/helmrelease_controller.go index 660cf30ab..fae653565 100644 --- a/internal/controller/helmrelease_controller.go +++ b/internal/controller/helmrelease_controller.go @@ -375,7 +375,7 @@ func (r *HelmReleaseReconciler) reconcileRelease(ctx context.Context, patchHelpe if errors.Is(err, intreconcile.ErrMustRequeue) { return ctrl.Result{Requeue: true}, nil } - if errors.Is(err, intreconcile.ErrExceededMaxRetries) { + if interrors.IsOneOf(err, intreconcile.ErrExceededMaxRetries, intreconcile.ErrMissingRollbackTarget) { err = reconcile.TerminalError(err) } return ctrl.Result{}, err diff --git a/internal/reconcile/atomic_release.go b/internal/reconcile/atomic_release.go index 61bba38ae..d83dde129 100644 --- a/internal/reconcile/atomic_release.go +++ b/internal/reconcile/atomic_release.go @@ -59,6 +59,9 @@ var ( // to continue the reconciliation process. ErrMustRequeue = errors.New("must requeue") + // ErrMissingRollbackTarget is returned when the rollback target is missing. + ErrMissingRollbackTarget = errors.New("missing target release for rollback") + // ErrUnknownReleaseStatus is returned when the release status is unknown // and cannot be acted upon. ErrUnknownReleaseStatus = errors.New("unknown release status") @@ -189,6 +192,11 @@ func (r *AtomicRelease) Reconcile(ctx context.Context, req *Request) error { if errors.Is(err, ErrExceededMaxRetries) { conditions.MarkStalled(req.Object, "RetriesExceeded", "Failed to %s after %d attempt(s)", req.Object.Status.LastAttemptedReleaseAction, req.Object.GetActiveRemediation().GetFailureCount(req.Object)) + return err + } + if errors.Is(err, ErrMissingRollbackTarget) { + conditions.MarkStalled(req.Object, "MissingRollbackTarget", "Failed to perform remediation: %s", err.Error()) + return err } return err } @@ -412,10 +420,15 @@ func (r *AtomicRelease) actionForState(ctx context.Context, req *Request, state // before instructing to roll back to it. prev := req.Object.Status.History.Previous(remediation.MustIgnoreTestFailures(req.Object.GetTest().IgnoreFailures)) if _, err := action.VerifySnapshot(r.configFactory.Build(nil), prev); err != nil { - if interrors.IsOneOf(err, action.ErrReleaseNotFound, action.ErrReleaseDisappeared, action.ErrReleaseNotObserved, action.ErrReleaseDigest) { - // If the rollback target is not found or is in any other - // way corrupt, the most correct remediation is to - // reattempt the upgrade. + if errors.Is(err, action.ErrReleaseNotFound) { + // If the rollback target is missing, we cannot roll back + // to it and must fail. + return nil, fmt.Errorf("%w: cannot remediate failed release", ErrMissingRollbackTarget) + } + + if interrors.IsOneOf(err, action.ErrReleaseDisappeared, action.ErrReleaseNotObserved, action.ErrReleaseDigest) { + // If the rollback target is in any way corrupt, + // the most correct remediation is to reattempt the upgrade. log.Info(msgWithReason("unable to verify previous release in storage to roll back to", err.Error())) return NewUpgrade(r.configFactory, r.eventRecorder), nil } diff --git a/internal/reconcile/atomic_release_test.go b/internal/reconcile/atomic_release_test.go index d2050e425..4e7fba159 100644 --- a/internal/reconcile/atomic_release_test.go +++ b/internal/reconcile/atomic_release_test.go @@ -1349,6 +1349,36 @@ func TestAtomicRelease_actionForState(t *testing.T) { }, want: &RollbackRemediation{}, }, + { + name: "failed release with active upgrade remediation and no previous release triggers error", + state: ReleaseState{Status: ReleaseStatusFailed}, + releases: []*helmrelease.Release{ + testutil.BuildRelease(&helmrelease.MockReleaseOptions{ + Name: mockReleaseName, + Namespace: mockReleaseNamespace, + Version: 2, + Status: helmrelease.StatusFailed, + Chart: testutil.BuildChart(), + }), + }, + spec: func(spec *v2.HelmReleaseSpec) { + spec.Upgrade = &v2.Upgrade{ + Remediation: &v2.UpgradeRemediation{ + Retries: 2, + }, + } + }, + status: func(releases []*helmrelease.Release) v2.HelmReleaseStatus { + return v2.HelmReleaseStatus{ + History: v2.Snapshots{ + release.ObservedToSnapshot(release.ObserveRelease(releases[0])), + }, + LastAttemptedReleaseAction: v2.ReleaseActionUpgrade, + UpgradeFailures: 1, + } + }, + wantErr: ErrMissingRollbackTarget, + }, { name: "failed release with active upgrade remediation and unverified previous triggers upgrade", state: ReleaseState{Status: ReleaseStatusFailed}, @@ -1357,7 +1387,7 @@ func TestAtomicRelease_actionForState(t *testing.T) { Name: mockReleaseName, Namespace: mockReleaseNamespace, Version: 2, - Status: helmrelease.StatusSuperseded, + Status: helmrelease.StatusFailed, Chart: testutil.BuildChart(), }), }, @@ -1371,6 +1401,7 @@ func TestAtomicRelease_actionForState(t *testing.T) { status: func(releases []*helmrelease.Release) v2.HelmReleaseStatus { return v2.HelmReleaseStatus{ History: v2.Snapshots{ + release.ObservedToSnapshot(release.ObserveRelease(releases[0])), release.ObservedToSnapshot(release.ObserveRelease( testutil.BuildRelease(&helmrelease.MockReleaseOptions{ Name: mockReleaseName, diff --git a/internal/reconcile/release.go b/internal/reconcile/release.go index 5e89aeab2..249aafbaa 100644 --- a/internal/reconcile/release.go +++ b/internal/reconcile/release.go @@ -36,9 +36,6 @@ var ( // ErrNoLatest is returned when the HelmRelease has no latest release // but this is required by the ActionReconciler. ErrNoLatest = errors.New("no latest release") - // ErrNoPrevious is returned when the HelmRelease has no previous release - // but this is required by the ActionReconciler. - ErrNoPrevious = errors.New("no previous release") // ErrReleaseMismatch is returned when the resulting release after running // an action does not match the expected latest and/or previous release. // This can happen for actions where targeting a release by version is not diff --git a/internal/reconcile/rollback_remediation.go b/internal/reconcile/rollback_remediation.go index 755bf7434..e614f5a30 100644 --- a/internal/reconcile/rollback_remediation.go +++ b/internal/reconcile/rollback_remediation.go @@ -49,8 +49,8 @@ import ( // Remediated=False and a warning event is emitted. // // When the Request.Object does not have a (successful) previous deployed -// release, it returns an error of type ErrNoPrevious. In addition, it -// returns ErrReleaseMismatch if the name and/or namespace of the latest and +// release, it returns an error of type ErrMissingRollbackTarget. In addition, +// it returns ErrReleaseMismatch if the name and/or namespace of the latest and // previous release do not match. Any other returned error indicates the caller // should retry as it did not cause a change to the Helm storage. // @@ -85,7 +85,7 @@ func (r *RollbackRemediation) Reconcile(ctx context.Context, req *Request) error // Previous is required to determine what version to roll back to. prev := req.Object.Status.History.Previous(req.Object.GetUpgrade().GetRemediation().MustIgnoreTestFailures(req.Object.GetTest().IgnoreFailures)) if prev == nil { - return fmt.Errorf("%w: required to rollback", ErrNoPrevious) + return fmt.Errorf("%w: required to rollback", ErrMissingRollbackTarget) } // Confirm previous and current point to the same release. diff --git a/internal/reconcile/rollback_remediation_test.go b/internal/reconcile/rollback_remediation_test.go index cf8e23906..2300aefc3 100644 --- a/internal/reconcile/rollback_remediation_test.go +++ b/internal/reconcile/rollback_remediation_test.go @@ -121,7 +121,7 @@ func TestRollbackRemediation_Reconcile(t *testing.T) { }, }, { - name: "rollback without previous", + name: "rollback without previous target release", releases: func(namespace string) []*helmrelease.Release { return []*helmrelease.Release{ testutil.BuildRelease(&helmrelease.MockReleaseOptions{ @@ -147,7 +147,7 @@ func TestRollbackRemediation_Reconcile(t *testing.T) { }, } }, - wantErr: ErrNoPrevious, + wantErr: ErrMissingRollbackTarget, expectHistory: func(releases []*helmrelease.Release) v2.Snapshots { return v2.Snapshots{ release.ObservedToSnapshot(release.ObserveRelease(releases[1])), From 0919fb4c2447afdc6012f4423df676e2e0c9ed9f Mon Sep 17 00:00:00 2001 From: Hidde Beydals Date: Fri, 1 Dec 2023 17:23:52 +0100 Subject: [PATCH 5/7] controller: remove deprecated metrics Signed-off-by: Hidde Beydals --- internal/controller/helmrelease_controller.go | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/internal/controller/helmrelease_controller.go b/internal/controller/helmrelease_controller.go index fae653565..0df4d8dd7 100644 --- a/internal/controller/helmrelease_controller.go +++ b/internal/controller/helmrelease_controller.go @@ -197,9 +197,7 @@ func (r *HelmReleaseReconciler) Reconcile(ctx context.Context, req ctrl.Request) log.Error(err, "failed to wait for object to sync in-cache after patching") } - // Always record suspend, readiness and duration metrics. - r.Metrics.RecordSuspend(ctx, obj, obj.Spec.Suspend) - r.Metrics.RecordReadiness(ctx, obj) + // Record the duration of the reconciliation. r.Metrics.RecordDuration(ctx, obj, start) }() From 05bc368de723632476fa2ddb0109bc2e754676e7 Mon Sep 17 00:00:00 2001 From: Hidde Beydals Date: Fri, 1 Dec 2023 17:36:14 +0100 Subject: [PATCH 6/7] reconcile: add `ProgressingWithRetry` on retry Signed-off-by: Hidde Beydals --- internal/reconcile/atomic_release.go | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/internal/reconcile/atomic_release.go b/internal/reconcile/atomic_release.go index d83dde129..893dc317b 100644 --- a/internal/reconcile/atomic_release.go +++ b/internal/reconcile/atomic_release.go @@ -217,11 +217,13 @@ func (r *AtomicRelease) Reconcile(ctx context.Context, req *Request) error { log.V(logger.DebugLevel).Info( fmt.Sprintf("instructed to stop before running %s action reconciler %s", next.Type(), next.Name()), ) - conditions.Delete(req.Object, meta.ReconcilingCondition) if remediation := req.Object.GetActiveRemediation(); remediation == nil || !remediation.RetriesExhausted(req.Object) { + conditions.MarkReconciling(req.Object, meta.ProgressingWithRetryReason, conditions.GetMessage(req.Object, meta.ReadyCondition)) return ErrMustRequeue } + + conditions.Delete(req.Object, meta.ReconcilingCondition) return nil } @@ -229,14 +231,14 @@ func (r *AtomicRelease) Reconcile(ctx context.Context, req *Request) error { // This to show continuous progress, as Helm actions can be long-running. reconcilingMsg := fmt.Sprintf("Running '%s' action with timeout of %s", next.Name(), timeoutForAction(next, req.Object).String()) - conditions.MarkTrue(req.Object, meta.ReconcilingCondition, "Progressing", reconcilingMsg) + conditions.MarkReconciling(req.Object, meta.ProgressingReason, reconcilingMsg) // If the next action is a release action, we can mark the release // as progressing in terms of readiness as well. Doing this for any // other action type is not useful, as it would potentially // overwrite more important failure state from an earlier action. if next.Type() == ReconcilerTypeRelease { - conditions.MarkUnknown(req.Object, meta.ReadyCondition, "Progressing", reconcilingMsg) + conditions.MarkUnknown(req.Object, meta.ReadyCondition, meta.ProgressingReason, reconcilingMsg) } // Patch the object to reflect the new condition. @@ -258,11 +260,13 @@ func (r *AtomicRelease) Reconcile(ctx context.Context, req *Request) error { log.V(logger.DebugLevel).Info(fmt.Sprintf( "instructed to stop after running %s action reconciler %s", next.Type(), next.Name()), ) - conditions.Delete(req.Object, meta.ReconcilingCondition) if remediation := req.Object.GetActiveRemediation(); remediation == nil || !remediation.RetriesExhausted(req.Object) { + conditions.MarkReconciling(req.Object, meta.ProgressingWithRetryReason, conditions.GetMessage(req.Object, meta.ReadyCondition)) return ErrMustRequeue } + + conditions.Delete(req.Object, meta.ReconcilingCondition) return nil } From 67fd6fb724c3c6b91198383dd2c80a3daa0fa8e4 Mon Sep 17 00:00:00 2001 From: Hidde Beydals Date: Fri, 1 Dec 2023 18:49:06 +0100 Subject: [PATCH 7/7] reconcile: remove Remediated condition on release This avoids having a confusing "stale" Remediated condition when a new release has been attempted. Signed-off-by: Hidde Beydals --- internal/reconcile/install.go | 3 + internal/reconcile/release.go | 49 -------- internal/reconcile/release_test.go | 193 ----------------------------- internal/reconcile/upgrade.go | 3 + 4 files changed, 6 insertions(+), 242 deletions(-) diff --git a/internal/reconcile/install.go b/internal/reconcile/install.go index 02adae7f9..a390b2a5e 100644 --- a/internal/reconcile/install.go +++ b/internal/reconcile/install.go @@ -84,6 +84,9 @@ func (r *Install) Reconcile(ctx context.Context, req *Request) error { // before the install. req.Object.Status.ClearHistory() + // If we are installing, we are no longer remediated. + conditions.Delete(req.Object, v2.RemediatedCondition) + // Run the Helm install action. _, err := action.Install(ctx, cfg, req.Object, req.Chart, req.Values) diff --git a/internal/reconcile/release.go b/internal/reconcile/release.go index 249aafbaa..ce0a74d50 100644 --- a/internal/reconcile/release.go +++ b/internal/reconcile/release.go @@ -125,10 +125,6 @@ func summarize(req *Request) { conditions.Delete(req.Object, v2.TestSuccessCondition) } - // Remove any stale Remediation observation as soon as the release is - // Released and (optionally) has TestSuccess. - conditionallyDeleteRemediated(req) - conds := req.Object.Status.Conditions if len(conds) == 0 { // Nothing to summarize if there are no conditions. @@ -169,51 +165,6 @@ func summarize(req *Request) { }) } -// conditionallyDeleteRemediated removes the Remediated condition if the -// release is Released and (optionally) has TestSuccess. But only if -// the observed generation of these conditions is equal or higher than -// the generation of the Remediated condition. -func conditionallyDeleteRemediated(req *Request) { - remediated := conditions.Get(req.Object, v2.RemediatedCondition) - if remediated == nil { - // If the object is not marked as Remediated, there is nothing to - // remove. - return - } - - released := conditions.Get(req.Object, v2.ReleasedCondition) - if released == nil || released.Status != metav1.ConditionTrue { - // If the release is not marked as Released, we must still be - // Remediated. - return - } - - if !req.Object.GetTest().Enable || req.Object.GetTest().IgnoreFailures { - // If tests are not enabled, or failures are ignored, and the - // generation is equal or higher than the generation of the - // Remediated condition, we are not in a Remediated state anymore. - if released.Status == metav1.ConditionTrue && released.ObservedGeneration >= remediated.ObservedGeneration { - conditions.Delete(req.Object, v2.RemediatedCondition) - } - return - } - - testSuccess := conditions.Get(req.Object, v2.TestSuccessCondition) - if testSuccess == nil || testSuccess.Status != metav1.ConditionTrue { - // If the release is not marked as TestSuccess, we must still be - // Remediated. - return - } - - if testSuccess.Status == metav1.ConditionTrue && testSuccess.ObservedGeneration >= remediated.ObservedGeneration { - // If the release is marked as TestSuccess, and the generation of - // the TestSuccess condition is equal or higher than the generation - // of the Remediated condition, we are not in a Remediated state. - conditions.Delete(req.Object, v2.RemediatedCondition) - return - } -} - // eventMessageWithLog returns an event message composed out of the given // message and any log messages by appending them to the message. func eventMessageWithLog(msg string, log *action.LogBuffer) string { diff --git a/internal/reconcile/release_test.go b/internal/reconcile/release_test.go index 117820323..167b8df72 100644 --- a/internal/reconcile/release_test.go +++ b/internal/reconcile/release_test.go @@ -424,60 +424,6 @@ func Test_summarize(t *testing.T) { }, }, }, - { - name: "with stale remediation", - spec: &v2.HelmReleaseSpec{ - Test: &v2.Test{ - Enable: true, - }, - }, - conditions: []metav1.Condition{ - { - Type: v2.RemediatedCondition, - Status: metav1.ConditionTrue, - Reason: v2.RollbackSucceededReason, - Message: "Rollback finished", - ObservedGeneration: 2, - }, - { - Type: v2.ReleasedCondition, - Status: metav1.ConditionTrue, - Reason: v2.UpgradeSucceededReason, - Message: "Upgrade finished", - ObservedGeneration: 2, - }, - { - Type: v2.TestSuccessCondition, - Status: metav1.ConditionTrue, - Reason: v2.TestSucceededReason, - Message: "test hooks succeeded", - ObservedGeneration: 2, - }, - }, - expect: []metav1.Condition{ - { - Type: meta.ReadyCondition, - Status: metav1.ConditionTrue, - Reason: v2.TestSucceededReason, - Message: "test hooks succeeded", - ObservedGeneration: 2, - }, - { - Type: v2.ReleasedCondition, - Status: metav1.ConditionTrue, - Reason: v2.UpgradeSucceededReason, - Message: "Upgrade finished", - ObservedGeneration: 2, - }, - { - Type: v2.TestSuccessCondition, - Status: metav1.ConditionTrue, - Reason: v2.TestSucceededReason, - Message: "test hooks succeeded", - ObservedGeneration: 2, - }, - }, - }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { @@ -501,145 +447,6 @@ func Test_summarize(t *testing.T) { } } -func Test_conditionallyDeleteRemediated(t *testing.T) { - tests := []struct { - name string - spec v2.HelmReleaseSpec - conditions []metav1.Condition - expectDelete bool - }{ - { - name: "no Remediated condition", - conditions: []metav1.Condition{ - *conditions.TrueCondition(v2.ReleasedCondition, v2.InstallSucceededReason, "Install finished"), - }, - expectDelete: false, - }, - { - name: "no Released condition", - conditions: []metav1.Condition{ - *conditions.TrueCondition(v2.RemediatedCondition, v2.RollbackSucceededReason, "Rollback finished"), - }, - expectDelete: false, - }, - { - name: "Released=True without tests enabled", - conditions: []metav1.Condition{ - *conditions.TrueCondition(v2.RemediatedCondition, v2.RollbackSucceededReason, "Rollback finished"), - *conditions.TrueCondition(v2.ReleasedCondition, v2.UpgradeSucceededReason, "Upgrade finished"), - }, - expectDelete: true, - }, - { - name: "Stale Released=True with newer Remediated", - conditions: []metav1.Condition{ - *conditions.TrueCondition(v2.ReleasedCondition, v2.UpgradeSucceededReason, "Upgrade finished"), - { - Type: v2.RemediatedCondition, - Status: metav1.ConditionTrue, - Reason: v2.RollbackSucceededReason, - Message: "Rollback finished", - ObservedGeneration: 2, - }, - }, - expectDelete: false, - }, - { - name: "Released=False", - conditions: []metav1.Condition{ - *conditions.TrueCondition(v2.RemediatedCondition, v2.RollbackSucceededReason, "Rollback finished"), - *conditions.FalseCondition(v2.ReleasedCondition, v2.UpgradeFailedReason, "Upgrade failed"), - }, - expectDelete: false, - }, - { - name: "TestSuccess=True with tests enabled", - spec: v2.HelmReleaseSpec{ - Test: &v2.Test{ - Enable: true, - }, - }, - conditions: []metav1.Condition{ - *conditions.TrueCondition(v2.RemediatedCondition, v2.RollbackSucceededReason, "Rollback finished"), - *conditions.TrueCondition(v2.ReleasedCondition, v2.UpgradeSucceededReason, "Upgrade finished"), - *conditions.TrueCondition(v2.TestSuccessCondition, v2.TestSucceededReason, "Test hooks succeeded"), - }, - expectDelete: true, - }, - { - name: "TestSuccess=False with tests enabled", - spec: v2.HelmReleaseSpec{ - Test: &v2.Test{ - Enable: true, - }, - }, - conditions: []metav1.Condition{ - *conditions.TrueCondition(v2.RemediatedCondition, v2.RollbackSucceededReason, "Rollback finished"), - *conditions.TrueCondition(v2.ReleasedCondition, v2.UpgradeSucceededReason, "Upgrade finished"), - *conditions.FalseCondition(v2.TestSuccessCondition, v2.TestSucceededReason, "Test hooks succeeded"), - }, - expectDelete: false, - }, - { - name: "TestSuccess=False with tests ignored", - spec: v2.HelmReleaseSpec{ - Test: &v2.Test{ - Enable: true, - IgnoreFailures: true, - }, - }, - conditions: []metav1.Condition{ - *conditions.TrueCondition(v2.RemediatedCondition, v2.RollbackSucceededReason, "Rollback finished"), - *conditions.TrueCondition(v2.ReleasedCondition, v2.UpgradeSucceededReason, "Upgrade finished"), - *conditions.FalseCondition(v2.TestSuccessCondition, v2.TestFailedReason, "Test hooks failed"), - }, - expectDelete: true, - }, - { - name: "Stale TestSuccess=True with newer Remediated", - spec: v2.HelmReleaseSpec{ - Test: &v2.Test{ - Enable: true, - }, - }, - conditions: []metav1.Condition{ - *conditions.TrueCondition(v2.ReleasedCondition, v2.UpgradeSucceededReason, "Upgrade finished"), - *conditions.TrueCondition(v2.TestSuccessCondition, v2.TestSucceededReason, "Test hooks succeeded"), - { - Type: v2.RemediatedCondition, - Status: metav1.ConditionTrue, - Reason: v2.RollbackSucceededReason, - Message: "Rollback finished", - ObservedGeneration: 2, - }, - }, - expectDelete: false, - }, - } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - g := NewWithT(t) - - obj := &v2.HelmRelease{ - Spec: tt.spec, - Status: v2.HelmReleaseStatus{ - Conditions: tt.conditions, - }, - } - isRemediated := conditions.Has(obj, v2.RemediatedCondition) - - conditionallyDeleteRemediated(&Request{Object: obj}) - - if tt.expectDelete { - g.Expect(isRemediated).ToNot(Equal(conditions.Has(obj, v2.RemediatedCondition))) - return - } - - g.Expect(conditions.Has(obj, v2.RemediatedCondition)).To(Equal(isRemediated)) - }) - } -} - func mockLogBuffer(size int, lines int) *action.LogBuffer { log := action.NewLogBuffer(action.NewDebugLog(logr.Discard()), size) for i := 0; i < lines; i++ { diff --git a/internal/reconcile/upgrade.go b/internal/reconcile/upgrade.go index 06e617491..95b345913 100644 --- a/internal/reconcile/upgrade.go +++ b/internal/reconcile/upgrade.go @@ -75,6 +75,9 @@ func (r *Upgrade) Reconcile(ctx context.Context, req *Request) error { // Mark upgrade attempt on object. req.Object.Status.LastAttemptedReleaseAction = v2.ReleaseActionUpgrade + // If we are upgrading, we are no longer remediated. + conditions.Delete(req.Object, v2.RemediatedCondition) + // Run the Helm upgrade action. _, err := action.Upgrade(ctx, cfg, req.Object, req.Chart, req.Values)