From ce30a35357a3129ba57b907b34e6b5052ecc155e Mon Sep 17 00:00:00 2001
From: Kevin Earls <kearls@redhat.com>
Date: Thu, 1 Dec 2022 10:48:51 +0100
Subject: [PATCH] Ignore reconcile errors that occur because a pod is being
 terminated (#1233)

* Ignore reconcile errors that occur because a pod is being terminated

Signed-off-by: Kevin Earls <kearls@redhat.com>

* Appease the all-powerful linter

Signed-off-by: Kevin Earls <kearls@redhat.com>

* Change behavior to end reconcile loop if pod has been terminated

Signed-off-by: Kevin Earls <kearls@redhat.com>

* Print a log message if we exit the reconciler loop

Signed-off-by: Kevin Earls <kearls@redhat.com>

* Look for NamespaceTerminatingCause

Signed-off-by: Kevin Earls <kearls@redhat.com>

* Appease the almighty linter

Signed-off-by: Kevin Earls <kearls@redhat.com>

* Fix log message

Signed-off-by: Kevin Earls <kearls@redhat.com>

* Skip flaky test

Signed-off-by: Kevin Earls <kearls@redhat.com>

Signed-off-by: Kevin Earls <kearls@redhat.com>
Co-authored-by: Ben B <bongartz@klimlive.de>
---
 cmd/otel-allocator/allocation/least_weighted_test.go | 1 +
 controllers/opentelemetrycollector_controller.go     | 5 +++++
 2 files changed, 6 insertions(+)

diff --git a/cmd/otel-allocator/allocation/least_weighted_test.go b/cmd/otel-allocator/allocation/least_weighted_test.go
index 2812541966..f70d5025fb 100644
--- a/cmd/otel-allocator/allocation/least_weighted_test.go
+++ b/cmd/otel-allocator/allocation/least_weighted_test.go
@@ -181,6 +181,7 @@ func TestNoCollectorReassignment(t *testing.T) {
 }
 
 func TestSmartCollectorReassignment(t *testing.T) {
+	t.Skip("This test is flaky and fails frequently, see issue 1291")
 	s, _ := New("least-weighted", logger)
 
 	cols := makeNCollectors(4, 0)
diff --git a/controllers/opentelemetrycollector_controller.go b/controllers/opentelemetrycollector_controller.go
index 8e986c4695..6d3bb96a67 100644
--- a/controllers/opentelemetrycollector_controller.go
+++ b/controllers/opentelemetrycollector_controller.go
@@ -168,6 +168,11 @@ func (r *OpenTelemetryCollectorReconciler) Reconcile(ctx context.Context, req ct
 func (r *OpenTelemetryCollectorReconciler) RunTasks(ctx context.Context, params reconcile.Params) error {
 	for _, task := range r.tasks {
 		if err := task.Do(ctx, params); err != nil {
+			// If the error is a Forbidden caused by the namespace being terminated, stop running tasks and report success
+			if apierrors.IsForbidden(err) && apierrors.HasStatusCause(err, corev1.NamespaceTerminatingCause) {
+				r.log.V(2).Info("Exiting reconcile loop because namespace is being terminated", "namespace", params.Instance.Namespace)
+				return nil
+			}
 			r.log.Error(err, fmt.Sprintf("failed to reconcile %s", task.Name))
 			if task.BailOnError {
 				return err