From 90b04bb285d7b7e18a6c37a9fc9d6c061af78fc4 Mon Sep 17 00:00:00 2001 From: Kanika Rana Date: Wed, 19 Feb 2025 20:26:50 -0500 Subject: [PATCH 01/12] check restartCounts and idle crashlooping pods --- controllers/idler/idler_controller.go | 105 ++++++++++++++------- controllers/idler/idler_controller_test.go | 41 ++++++-- 2 files changed, 103 insertions(+), 43 deletions(-) diff --git a/controllers/idler/idler_controller.go b/controllers/idler/idler_controller.go index eb4bdb75..d4d7a82d 100644 --- a/controllers/idler/idler_controller.go +++ b/controllers/idler/idler_controller.go @@ -34,6 +34,11 @@ import ( "sigs.k8s.io/controller-runtime/pkg/reconcile" ) +const ( + RestartThreshold = 50 + RequeueTimeThreshold = 300 +) + var SupportedScaleResources = map[schema.GroupVersionKind]schema.GroupVersionResource{ schema.GroupVersion{Group: "camel.apache.org", Version: "v1"}.WithKind("Integration"): schema.GroupVersion{Group: "camel.apache.org", Version: "v1"}.WithResource("integrations"), schema.GroupVersion{Group: "camel.apache.org", Version: "v1alpha1"}.WithKind("KameletBinding"): schema.GroupVersion{Group: "camel.apache.org", Version: "v1alpha1"}.WithResource("kameletbindings"), @@ -110,7 +115,7 @@ func (r *Reconciler) Reconcile(ctx context.Context, request ctrl.Request) (ctrl. // Find the earlier pod to kill and requeue. Otherwise, use the idler timeoutSeconds to requeue. nextTime := nextPodToBeKilledAfter(logger, idler) if nextTime == nil { - after := time.Duration(idler.Spec.TimeoutSeconds) * time.Second + after := time.Duration(RequeueTimeThreshold) * time.Second logger.Info("requeueing for next pod to check", "after_seconds", after.Seconds()) return reconcile.Result{ Requeue: true, @@ -133,9 +138,19 @@ func (r *Reconciler) ensureIdling(ctx context.Context, idler *toolchainv1alpha1. newStatusPods := make([]toolchainv1alpha1.Pod, 0, 10) for _, pod := range podList.Items { pod := pod // TODO We won't need it after upgrading to go 1.22: https://go.dev/blog/loopvar-preview - logger := log.FromContext(ctx) - podLogger := logger.WithValues("pod_name", pod.Name, "pod_phase", pod.Status.Phase) + podLogger := log.FromContext(ctx).WithValues("pod_name", pod.Name, "pod_phase", pod.Status.Phase) if trackedPod := findPodByName(idler, pod.Name); trackedPod != nil { + // check the restart count for the trackedPod + crashlooping, restartCount := isRestartingOften(pod.Status) + if crashlooping { + podLogger.Info("Pod is restarting too often. Killing the pod", "restart_count", restartCount) + // Check if it belongs to a controller (Deployment, DeploymentConfig, etc) and scale it down to zero. + err := deletePodsAndCreateNotification(ctx, podLogger, pod, true, r, idler) + if err != nil { + return err + } + continue + } timeoutSeconds := idler.Spec.TimeoutSeconds if isOwnedByVM(pod.ObjectMeta) { // use 1/12th of the timeout for VMs @@ -144,43 +159,11 @@ func (r *Reconciler) ensureIdling(ctx context.Context, idler *toolchainv1alpha1. // Already tracking this pod. Check the timeout. if time.Now().After(trackedPod.StartTime.Add(time.Duration(timeoutSeconds) * time.Second)) { podLogger.Info("Pod running for too long. Killing the pod.", "start_time", trackedPod.StartTime.Format("2006-01-02T15:04:05Z"), "timeout_seconds", timeoutSeconds) - var podreason string - podCondition := pod.Status.Conditions - for _, podCond := range podCondition { - if podCond.Type == "Ready" { - podreason = podCond.Reason - } - } - // Check if it belongs to a controller (Deployment, DeploymentConfig, etc) and scale it down to zero. 
- lctx := log.IntoContext(ctx, podLogger) - appType, appName, deletedByController, err := r.scaleControllerToZero(lctx, pod.ObjectMeta) + err := deletePodsAndCreateNotification(ctx, podLogger, pod, false, r, idler) if err != nil { return err } - if !deletedByController { // Pod not managed by a controller. We can just delete the pod. - logger.Info("Deleting pod without controller") - if err := r.AllNamespacesClient.Delete(ctx, &pod); err != nil { - return err - } - podLogger.Info("Pod deleted") - } - - if appName == "" { - appName = pod.Name - appType = "Pod" - } - // Send notification if the deleted pod was managed by a controller or was a standalone pod that was not completed - // eg. If a build pod is in "PodCompleted" status then it was not running so there's no reason to send an idler notification - if podreason != "PodCompleted" || deletedByController { - // By now either a pod has been deleted or scaled to zero by controller, idler Triggered notification should be sent - if err := r.createNotification(ctx, idler, appName, appType); err != nil { - logger.Error(err, "failed to create Notification") - if err = r.setStatusIdlerNotificationCreationFailed(ctx, idler, err.Error()); err != nil { - logger.Error(err, "failed to set status IdlerNotificationCreationFailed") - } // not returning error to continue tracking remaining pods - } - } } else { newStatusPods = append(newStatusPods, *trackedPod) // keep tracking @@ -198,6 +181,56 @@ func (r *Reconciler) ensureIdling(ctx context.Context, idler *toolchainv1alpha1. return r.updateStatusPods(ctx, idler, newStatusPods) } +// Check if the pod belongs to a controller (Deployment, DeploymentConfig, etc) and scale it down to zero. +// if it is a standalone pod, delete it. +// Send notification if the deleted pod was managed by a controller, was a standalone pod that was not completed or was crashlooping +func deletePodsAndCreateNotification(ctx context.Context, podLogger logr.Logger, pod corev1.Pod, isCrashlooping bool, r *Reconciler, idler *toolchainv1alpha1.Idler) error { + lctx := log.IntoContext(ctx, podLogger) + logger := log.FromContext(ctx) + var podreason string + podCondition := pod.Status.Conditions + for _, podCond := range podCondition { + if podCond.Type == "Ready" { + podreason = podCond.Reason + } + } + appType, appName, deletedByController, err := r.scaleControllerToZero(lctx, pod.ObjectMeta) + if err != nil { + return err + } + if !deletedByController { // Pod not managed by a controller. We can just delete the pod. 
+ log.FromContext(ctx).Info("Deleting pod without controller") + if err := r.AllNamespacesClient.Delete(ctx, &pod); err != nil { + return err + } + podLogger.Info("Pod deleted") + } + if appName == "" { + appName = pod.Name + appType = "Pod" + } + + // If a build pod is in "PodCompleted" status then it was not running so there's no reason to send an idler notification + if podreason != "PodCompleted" || deletedByController || isCrashlooping { + // By now either a pod has been deleted or scaled to zero by controller, idler Triggered notification should be sent + if err := r.createNotification(ctx, idler, appName, appType); err != nil { + logger.Error(err, "failed to create Notification") + if err = r.setStatusIdlerNotificationCreationFailed(ctx, idler, err.Error()); err != nil { + logger.Error(err, "failed to set status IdlerNotificationCreationFailed") + } // not returning error to continue tracking remaining pods + } + } + return nil +} + +func isRestartingOften(podstatus corev1.PodStatus) (bool, int32) { + var restartCount int32 + for _, status := range podstatus.ContainerStatuses { + restartCount += status.RestartCount + } + return restartCount > RestartThreshold, restartCount +} + func (r *Reconciler) createNotification(ctx context.Context, idler *toolchainv1alpha1.Idler, appName string, appType string) error { log.FromContext(ctx).Info("Create Notification") //Get the HostClient diff --git a/controllers/idler/idler_controller_test.go b/controllers/idler/idler_controller_test.go index e0dc5a19..fbd15578 100644 --- a/controllers/idler/idler_controller_test.go +++ b/controllers/idler/idler_controller_test.go @@ -117,10 +117,10 @@ func TestEnsureIdling(t *testing.T) { // then require.NoError(t, err) - // no pods found - the controller will requeue after the idler's timeout + // no pods found - the controller will requeue after 5 mins assert.Equal(t, reconcile.Result{ Requeue: true, - RequeueAfter: 30 * time.Second, + RequeueAfter: RequeueTimeThreshold * time.Second, }, res) memberoperatortest.AssertThatIdler(t, idler.Name, cl).HasConditions(memberoperatortest.Running()) }) @@ -143,8 +143,9 @@ func TestEnsureIdling(t *testing.T) { reconciler, req, cl, allCl, dynamicClient := prepareReconcile(t, idler.Name, getHostCluster, idler, nsTmplSet, mur) podsTooEarlyToKill := preparePayloads(t, reconciler, idler.Name, "", freshStartTimes(idler)) - + podCrashLooping := preparePayloadCrashloopingPod(t, reconciler, idler.Name, "restartCount-") podsRunningForTooLong := preparePayloads(t, reconciler, idler.Name, "todelete-", expiredStartTimes(idler)) + noise := preparePayloads(t, reconciler, "another-namespace", "", expiredStartTimes(idler)) t.Run("First reconcile. Start tracking.", func(t *testing.T) { @@ -158,6 +159,7 @@ func TestEnsureIdling(t *testing.T) { PodsExist(podsRunningForTooLong.standalonePods). PodsExist(podsTooEarlyToKill.standalonePods). PodsExist(noise.standalonePods). + PodsExist(podCrashLooping.standalonePods). DaemonSetExists(podsRunningForTooLong.daemonSet). DaemonSetExists(podsTooEarlyToKill.daemonSet). DaemonSetExists(noise.daemonSet). @@ -191,24 +193,26 @@ func TestEnsureIdling(t *testing.T) { // Tracked pods memberoperatortest.AssertThatIdler(t, idler.Name, cl). - TracksPods(append(podsTooEarlyToKill.allPods, podsRunningForTooLong.allPods...)). + TracksPods(append(append(podsTooEarlyToKill.allPods, podsRunningForTooLong.allPods...), podCrashLooping.allPods...)). 
HasConditions(memberoperatortest.Running()) assert.True(t, res.Requeue) assert.Equal(t, 0, int(res.RequeueAfter)) // pods running for too long should be killed immediately - t.Run("Second Reconcile. Delete long running pods.", func(t *testing.T) { + t.Run("Second Reconcile. Delete long running and crashlooping pods.", func(t *testing.T) { //when res, err := reconciler.Reconcile(context.TODO(), req) // then require.NoError(t, err) // Too long running pods are gone. All long running controllers are scaled down. + // Crashlooping pods are gone. // The rest of the pods are still there and controllers are scaled up. memberoperatortest.AssertThatInIdleableCluster(t, allCl, dynamicClient). PodsDoNotExist(podsRunningForTooLong.standalonePods). PodsExist(podsTooEarlyToKill.standalonePods). PodsExist(noise.standalonePods). + PodsDoNotExist(podCrashLooping.standalonePods). DaemonSetDoesNotExist(podsRunningForTooLong.daemonSet). DaemonSetExists(podsTooEarlyToKill.daemonSet). DaemonSetExists(noise.daemonSet). @@ -284,7 +288,7 @@ func TestEnsureIdling(t *testing.T) { // requeue after the idler timeout assert.Equal(t, reconcile.Result{ Requeue: true, - RequeueAfter: 60 * time.Second, + RequeueAfter: RequeueTimeThreshold * time.Second, }, res) }) }) @@ -522,7 +526,7 @@ func TestEnsureIdlingFailed(t *testing.T) { require.NoError(t, err) // 'NotFound' errors are ignored! assert.Equal(t, reconcile.Result{ Requeue: true, - RequeueAfter: 60 * time.Second, + RequeueAfter: RequeueTimeThreshold * time.Second, }, res) memberoperatortest.AssertThatIdler(t, idler.Name, cl).ContainsCondition(memberoperatortest.Running()) } @@ -1317,6 +1321,29 @@ func preparePayloadsSinglePod(t *testing.T, r *Reconciler, namespace, namePrefix } } +func preparePayloadCrashloopingPod(t *testing.T, r *Reconciler, namespace, namePrefix string) payloads { + standalonePods := make([]*corev1.Pod, 0, 1) + startTime := metav1.Now() + // Create a standalone pod with no owner which has restart count sum > 50 + pod := &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: fmt.Sprintf("%s%s-pod-fail", namePrefix, namespace), + Namespace: namespace, + }, + Status: corev1.PodStatus{StartTime: &startTime, ContainerStatuses: []corev1.ContainerStatus{ + {RestartCount: 30}, + {RestartCount: 24}, + }}, + } + err := r.AllNamespacesClient.Create(context.TODO(), pod) + require.NoError(t, err) + standalonePods = append(standalonePods, pod) + return payloads{ + standalonePods: standalonePods, + allPods: standalonePods, + } +} + func createPods(t *testing.T, r *Reconciler, owner metav1.Object, startTime metav1.Time, podsToTrack []*corev1.Pod, conditions ...corev1.PodCondition) []*corev1.Pod { for i := 0; i < 3; i++ { pod := &corev1.Pod{ From a9b27504aaa54ce3fd7aa66dad69abca3e0bf8f9 Mon Sep 17 00:00:00 2001 From: Kanika Rana Date: Wed, 19 Feb 2025 21:25:56 -0500 Subject: [PATCH 02/12] rename const to RequeueTimeThresholdSeconds, max requeue time to 5 mins --- controllers/idler/idler_controller.go | 8 ++++---- controllers/idler/idler_controller_test.go | 6 +++--- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/controllers/idler/idler_controller.go b/controllers/idler/idler_controller.go index d4d7a82d..50ae7643 100644 --- a/controllers/idler/idler_controller.go +++ b/controllers/idler/idler_controller.go @@ -35,8 +35,8 @@ import ( ) const ( - RestartThreshold = 50 - RequeueTimeThreshold = 300 + RestartThreshold = 50 + RequeueTimeThresholdSeconds = 300 ) var SupportedScaleResources = map[schema.GroupVersionKind]schema.GroupVersionResource{ @@ -114,8 
+114,8 @@ func (r *Reconciler) Reconcile(ctx context.Context, request ctrl.Request) (ctrl. } // Find the earlier pod to kill and requeue. Otherwise, use the idler timeoutSeconds to requeue. nextTime := nextPodToBeKilledAfter(logger, idler) - if nextTime == nil { - after := time.Duration(RequeueTimeThreshold) * time.Second + after := time.Duration(RequeueTimeThresholdSeconds) * time.Second + if nextTime == nil || *nextTime > after { logger.Info("requeueing for next pod to check", "after_seconds", after.Seconds()) return reconcile.Result{ Requeue: true, diff --git a/controllers/idler/idler_controller_test.go b/controllers/idler/idler_controller_test.go index fbd15578..3eb8f207 100644 --- a/controllers/idler/idler_controller_test.go +++ b/controllers/idler/idler_controller_test.go @@ -120,7 +120,7 @@ func TestEnsureIdling(t *testing.T) { // no pods found - the controller will requeue after 5 mins assert.Equal(t, reconcile.Result{ Requeue: true, - RequeueAfter: RequeueTimeThreshold * time.Second, + RequeueAfter: RequeueTimeThresholdSeconds * time.Second, }, res) memberoperatortest.AssertThatIdler(t, idler.Name, cl).HasConditions(memberoperatortest.Running()) }) @@ -288,7 +288,7 @@ func TestEnsureIdling(t *testing.T) { // requeue after the idler timeout assert.Equal(t, reconcile.Result{ Requeue: true, - RequeueAfter: RequeueTimeThreshold * time.Second, + RequeueAfter: RequeueTimeThresholdSeconds * time.Second, }, res) }) }) @@ -526,7 +526,7 @@ func TestEnsureIdlingFailed(t *testing.T) { require.NoError(t, err) // 'NotFound' errors are ignored! assert.Equal(t, reconcile.Result{ Requeue: true, - RequeueAfter: RequeueTimeThreshold * time.Second, + RequeueAfter: RequeueTimeThresholdSeconds * time.Second, }, res) memberoperatortest.AssertThatIdler(t, idler.Name, cl).ContainsCondition(memberoperatortest.Running()) } From d40b75bd609843b384e6dd0b6397ca9cda4f15ca Mon Sep 17 00:00:00 2001 From: Kanika Rana Date: Wed, 19 Feb 2025 21:52:57 -0500 Subject: [PATCH 03/12] fix condition --- controllers/idler/idler_controller.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/controllers/idler/idler_controller.go b/controllers/idler/idler_controller.go index 50ae7643..b634ec69 100644 --- a/controllers/idler/idler_controller.go +++ b/controllers/idler/idler_controller.go @@ -115,7 +115,7 @@ func (r *Reconciler) Reconcile(ctx context.Context, request ctrl.Request) (ctrl. // Find the earlier pod to kill and requeue. Otherwise, use the idler timeoutSeconds to requeue. 
nextTime := nextPodToBeKilledAfter(logger, idler) after := time.Duration(RequeueTimeThresholdSeconds) * time.Second - if nextTime == nil || *nextTime > after { + if nextTime == nil || (nextTime != nil && *nextTime > after) { logger.Info("requeueing for next pod to check", "after_seconds", after.Seconds()) return reconcile.Result{ Requeue: true, From 576b44c40a3fbd763c7792142a84d8a245b83b23 Mon Sep 17 00:00:00 2001 From: Kanika Rana Date: Mon, 24 Feb 2025 17:09:21 -0500 Subject: [PATCH 04/12] add more scenarios for crashlooping pods but within threshold --- controllers/idler/idler_controller.go | 4 +- controllers/idler/idler_controller_test.go | 109 ++++++++++++++++----- 2 files changed, 90 insertions(+), 23 deletions(-) diff --git a/controllers/idler/idler_controller.go b/controllers/idler/idler_controller.go index b634ec69..b06ade45 100644 --- a/controllers/idler/idler_controller.go +++ b/controllers/idler/idler_controller.go @@ -226,7 +226,9 @@ func deletePodsAndCreateNotification(ctx context.Context, podLogger logr.Logger, func isRestartingOften(podstatus corev1.PodStatus) (bool, int32) { var restartCount int32 for _, status := range podstatus.ContainerStatuses { - restartCount += status.RestartCount + if restartCount < status.RestartCount { + restartCount = status.RestartCount + } } return restartCount > RestartThreshold, restartCount } diff --git a/controllers/idler/idler_controller_test.go b/controllers/idler/idler_controller_test.go index 3eb8f207..d4a84dad 100644 --- a/controllers/idler/idler_controller_test.go +++ b/controllers/idler/idler_controller_test.go @@ -143,7 +143,8 @@ func TestEnsureIdling(t *testing.T) { reconciler, req, cl, allCl, dynamicClient := prepareReconcile(t, idler.Name, getHostCluster, idler, nsTmplSet, mur) podsTooEarlyToKill := preparePayloads(t, reconciler, idler.Name, "", freshStartTimes(idler)) - podCrashLooping := preparePayloadCrashloopingPod(t, reconciler, idler.Name, "restartCount-") + podsCrashLoopingWithinThreshold := preparePayloadCrashloopingPodsWithinThreshold(t, reconciler, idler.Name, "inThreshRestarts-", freshStartTimes(idler)) + podsCrashLooping := preparePayloadCrashloopingPod(t, reconciler, idler.Name, "restartCount-") podsRunningForTooLong := preparePayloads(t, reconciler, idler.Name, "todelete-", expiredStartTimes(idler)) noise := preparePayloads(t, reconciler, "another-namespace", "", expiredStartTimes(idler)) @@ -159,7 +160,7 @@ func TestEnsureIdling(t *testing.T) { PodsExist(podsRunningForTooLong.standalonePods). PodsExist(podsTooEarlyToKill.standalonePods). PodsExist(noise.standalonePods). - PodsExist(podCrashLooping.standalonePods). + PodsExist(podsCrashLooping.standalonePods). DaemonSetExists(podsRunningForTooLong.daemonSet). DaemonSetExists(podsTooEarlyToKill.daemonSet). DaemonSetExists(noise.daemonSet). @@ -175,6 +176,7 @@ func TestEnsureIdling(t *testing.T) { DeploymentScaledUp(noise.deployment). DeploymentScaledUp(noise.integration). DeploymentScaledUp(noise.kameletBinding). + DeploymentScaledUp(podsCrashLooping.deployment). ReplicaSetScaledUp(podsRunningForTooLong.replicaSet). ReplicaSetScaledUp(podsTooEarlyToKill.replicaSet). ReplicaSetScaledUp(noise.replicaSet). @@ -187,13 +189,15 @@ func TestEnsureIdling(t *testing.T) { StatefulSetScaledUp(podsRunningForTooLong.statefulSet). StatefulSetScaledUp(podsTooEarlyToKill.statefulSet). StatefulSetScaledUp(noise.statefulSet). + StatefulSetScaledUp(podsCrashLoopingWithinThreshold.statefulSet). VMRunning(podsRunningForTooLong.virtualmachine). 
VMRunning(podsTooEarlyToKill.virtualmachine). VMRunning(noise.virtualmachine) + // after golang 1.22 upgrade can use slices.Concat(podsTooEarlyToKill.allPods, podsRunningForTooLong.allPods, podsCrashLooping.allPods, podsCrashLoopingWithinThreshold.allPods) // Tracked pods memberoperatortest.AssertThatIdler(t, idler.Name, cl). - TracksPods(append(append(podsTooEarlyToKill.allPods, podsRunningForTooLong.allPods...), podCrashLooping.allPods...)). + TracksPods(append(append(append(podsTooEarlyToKill.allPods, podsRunningForTooLong.allPods...), podsCrashLooping.allPods...), podsCrashLoopingWithinThreshold.allPods...)). HasConditions(memberoperatortest.Running()) assert.True(t, res.Requeue) @@ -212,7 +216,7 @@ func TestEnsureIdling(t *testing.T) { PodsDoNotExist(podsRunningForTooLong.standalonePods). PodsExist(podsTooEarlyToKill.standalonePods). PodsExist(noise.standalonePods). - PodsDoNotExist(podCrashLooping.standalonePods). + PodsDoNotExist(podsCrashLooping.standalonePods). DaemonSetDoesNotExist(podsRunningForTooLong.daemonSet). DaemonSetExists(podsTooEarlyToKill.daemonSet). DaemonSetExists(noise.daemonSet). @@ -222,6 +226,7 @@ func TestEnsureIdling(t *testing.T) { DeploymentScaledDown(podsRunningForTooLong.deployment). DeploymentScaledDown(podsRunningForTooLong.integration). DeploymentScaledDown(podsRunningForTooLong.kameletBinding). + DeploymentScaledDown(podsCrashLooping.deployment). DeploymentScaledUp(podsTooEarlyToKill.deployment). DeploymentScaledUp(podsTooEarlyToKill.integration). DeploymentScaledUp(podsTooEarlyToKill.kameletBinding). @@ -240,13 +245,14 @@ func TestEnsureIdling(t *testing.T) { StatefulSetScaledDown(podsRunningForTooLong.statefulSet). StatefulSetScaledUp(podsTooEarlyToKill.statefulSet). StatefulSetScaledUp(noise.statefulSet). + StatefulSetScaledUp(podsCrashLoopingWithinThreshold.statefulSet). VMStopped(podsRunningForTooLong.virtualmachine). VMRunning(podsTooEarlyToKill.virtualmachine). VMRunning(noise.virtualmachine) - // Still tracking all pods. Even deleted ones. + // Only tracks pods that have not been processed in this reconcile memberoperatortest.AssertThatIdler(t, idler.Name, cl). - TracksPods(podsTooEarlyToKill.allPods). + TracksPods(append(podsTooEarlyToKill.allPods, podsCrashLoopingWithinThreshold.allPods...)). HasConditions(memberoperatortest.Running(), memberoperatortest.IdlerNotificationCreated()) assert.True(t, res.Requeue) @@ -260,7 +266,7 @@ func TestEnsureIdling(t *testing.T) { require.NoError(t, err) // Tracking existing pods only. memberoperatortest.AssertThatIdler(t, idler.Name, cl). - TracksPods(append(podsTooEarlyToKill.allPods, podsRunningForTooLong.controlledPods...)). + TracksPods(append(append(append(podsTooEarlyToKill.allPods, podsRunningForTooLong.controlledPods...), podsCrashLoopingWithinThreshold.allPods...), podsCrashLooping.controlledPods...)). HasConditions(memberoperatortest.Running(), memberoperatortest.IdlerNotificationCreated()) assert.True(t, res.Requeue) @@ -269,7 +275,7 @@ func TestEnsureIdling(t *testing.T) { t.Run("No pods. No requeue.", func(t *testing.T) { //given // cleanup remaining pods - pods := append(podsTooEarlyToKill.allPods, podsRunningForTooLong.controlledPods...) + pods := append(append(append(podsTooEarlyToKill.allPods, podsRunningForTooLong.controlledPods...), podsCrashLoopingWithinThreshold.allPods...), podsCrashLooping.controlledPods...) 
for _, pod := range pods { err := allCl.Delete(context.TODO(), pod) require.NoError(t, err) @@ -1122,7 +1128,7 @@ func preparePayloads(t *testing.T, r *Reconciler, namespace, namePrefix string, require.NoError(t, err) err = r.AllNamespacesClient.Create(context.TODO(), rs) require.NoError(t, err) - controlledPods := createPods(t, r, rs, sTime, make([]*corev1.Pod, 0, 3), conditions...) + controlledPods := createPods(t, r, rs, sTime, make([]*corev1.Pod, 0, 3), false, conditions...) // Deployment with Camel K integration as an owner reference and a scale sub resource integration := &appsv1.Deployment{ @@ -1149,7 +1155,7 @@ func preparePayloads(t *testing.T, r *Reconciler, namespace, namePrefix string, require.NoError(t, err) err = r.AllNamespacesClient.Create(context.TODO(), integrationRS) require.NoError(t, err) - controlledPods = createPods(t, r, integrationRS, sTime, controlledPods) + controlledPods = createPods(t, r, integrationRS, sTime, controlledPods, false) // Deployment with Camel K integration as an owner reference and a scale sub resource binding := &appsv1.Deployment{ @@ -1176,7 +1182,7 @@ func preparePayloads(t *testing.T, r *Reconciler, namespace, namePrefix string, require.NoError(t, err) err = r.AllNamespacesClient.Create(context.TODO(), bindingRS) require.NoError(t, err) - controlledPods = createPods(t, r, bindingRS, sTime, controlledPods) + controlledPods = createPods(t, r, bindingRS, sTime, controlledPods, false) // Standalone ReplicaSet standaloneRs := &appsv1.ReplicaSet{ @@ -1185,7 +1191,7 @@ func preparePayloads(t *testing.T, r *Reconciler, namespace, namePrefix string, } err = r.AllNamespacesClient.Create(context.TODO(), standaloneRs) require.NoError(t, err) - controlledPods = createPods(t, r, standaloneRs, sTime, controlledPods) + controlledPods = createPods(t, r, standaloneRs, sTime, controlledPods, false) // DaemonSet ds := &appsv1.DaemonSet{ @@ -1193,7 +1199,7 @@ func preparePayloads(t *testing.T, r *Reconciler, namespace, namePrefix string, } err = r.AllNamespacesClient.Create(context.TODO(), ds) require.NoError(t, err) - controlledPods = createPods(t, r, ds, sTime, controlledPods) + controlledPods = createPods(t, r, ds, sTime, controlledPods, false) // Job job := &batchv1.Job{ @@ -1201,7 +1207,7 @@ func preparePayloads(t *testing.T, r *Reconciler, namespace, namePrefix string, } err = r.AllNamespacesClient.Create(context.TODO(), job) require.NoError(t, err) - controlledPods = createPods(t, r, job, sTime, controlledPods) + controlledPods = createPods(t, r, job, sTime, controlledPods, false) // StatefulSet sts := &appsv1.StatefulSet{ @@ -1210,7 +1216,7 @@ func preparePayloads(t *testing.T, r *Reconciler, namespace, namePrefix string, } err = r.AllNamespacesClient.Create(context.TODO(), sts) require.NoError(t, err) - controlledPods = createPods(t, r, sts, sTime, controlledPods) + controlledPods = createPods(t, r, sts, sTime, controlledPods, false) // DeploymentConfig dc := &openshiftappsv1.DeploymentConfig{ @@ -1227,7 +1233,7 @@ func preparePayloads(t *testing.T, r *Reconciler, namespace, namePrefix string, require.NoError(t, err) err = r.AllNamespacesClient.Create(context.TODO(), rc) require.NoError(t, err) - controlledPods = createPods(t, r, rc, sTime, controlledPods) + controlledPods = createPods(t, r, rc, sTime, controlledPods, false) // VirtualMachine vm := &unstructured.Unstructured{} @@ -1249,7 +1255,7 @@ func preparePayloads(t *testing.T, r *Reconciler, namespace, namePrefix string, require.NoError(t, err) _, err = 
r.DynamicClient.Resource(vmInstanceGVR).Namespace(namespace).Create(context.TODO(), vmi, metav1.CreateOptions{}) require.NoError(t, err) - controlledPods = createPods(t, r, vmi, vmstartTime, controlledPods) // vmi controls pod + controlledPods = createPods(t, r, vmi, vmstartTime, controlledPods, false) // vmi controls pod // Standalone ReplicationController standaloneRC := &corev1.ReplicationController{ @@ -1258,7 +1264,7 @@ func preparePayloads(t *testing.T, r *Reconciler, namespace, namePrefix string, } err = r.AllNamespacesClient.Create(context.TODO(), standaloneRC) require.NoError(t, err) - controlledPods = createPods(t, r, standaloneRC, sTime, controlledPods) + controlledPods = createPods(t, r, standaloneRC, sTime, controlledPods, false) // Pods with unknown owner. They are subject of direct management by the Idler. // It doesn't have to be Idler. We just need any object as the owner of the pods @@ -1270,7 +1276,7 @@ func preparePayloads(t *testing.T, r *Reconciler, namespace, namePrefix string, }, Spec: toolchainv1alpha1.IdlerSpec{TimeoutSeconds: 30}, } - standalonePods := createPods(t, r, idler, sTime, make([]*corev1.Pod, 0, 3)) + standalonePods := createPods(t, r, idler, sTime, make([]*corev1.Pod, 0, 3), false) // Pods with no owner. for i := 0; i < 3; i++ { @@ -1324,6 +1330,7 @@ func preparePayloadsSinglePod(t *testing.T, r *Reconciler, namespace, namePrefix func preparePayloadCrashloopingPod(t *testing.T, r *Reconciler, namespace, namePrefix string) payloads { standalonePods := make([]*corev1.Pod, 0, 1) startTime := metav1.Now() + replicas := int32(3) // Create a standalone pod with no owner which has restart count sum > 50 pod := &corev1.Pod{ ObjectMeta: metav1.ObjectMeta{ @@ -1331,25 +1338,83 @@ func preparePayloadCrashloopingPod(t *testing.T, r *Reconciler, namespace, nameP Namespace: namespace, }, Status: corev1.PodStatus{StartTime: &startTime, ContainerStatuses: []corev1.ContainerStatus{ - {RestartCount: 30}, + {RestartCount: 52}, {RestartCount: 24}, }}, } err := r.AllNamespacesClient.Create(context.TODO(), pod) require.NoError(t, err) standalonePods = append(standalonePods, pod) + // Deployment + d := &appsv1.Deployment{ + ObjectMeta: metav1.ObjectMeta{Name: fmt.Sprintf("%s%s-deployment", namePrefix, namespace), Namespace: namespace}, + Spec: appsv1.DeploymentSpec{Replicas: &replicas}, + } + err = r.AllNamespacesClient.Create(context.TODO(), d) + require.NoError(t, err) + rs := &appsv1.ReplicaSet{ + ObjectMeta: metav1.ObjectMeta{Name: fmt.Sprintf("%s-replicaset", d.Name), Namespace: namespace}, + Spec: appsv1.ReplicaSetSpec{Replicas: &replicas}, + } + err = controllerutil.SetControllerReference(d, rs, r.Scheme) + require.NoError(t, err) + err = r.AllNamespacesClient.Create(context.TODO(), rs) + require.NoError(t, err) + controlledPods := createPods(t, r, rs, startTime, make([]*corev1.Pod, 0, 3), true) + + allPods := append(standalonePods, controlledPods...) 
return payloads{ standalonePods: standalonePods, - allPods: standalonePods, + allPods: allPods, + controlledPods: controlledPods, + deployment: d, } } -func createPods(t *testing.T, r *Reconciler, owner metav1.Object, startTime metav1.Time, podsToTrack []*corev1.Pod, conditions ...corev1.PodCondition) []*corev1.Pod { +func preparePayloadCrashloopingPodsWithinThreshold(t *testing.T, r *Reconciler, namespace, namePrefix string, times payloadStartTimes) payloads { + startTime := metav1.NewTime(times.defaultStartTime) + replicas := int32(3) + controlledPods := make([]*corev1.Pod, 0, 3) + // Create a StatefulSet with Crashlooping pods less than threshold + sts := &appsv1.StatefulSet{ + ObjectMeta: metav1.ObjectMeta{Name: fmt.Sprintf("%s%s-statefulset", namePrefix, namespace), Namespace: namespace}, + Spec: appsv1.StatefulSetSpec{Replicas: &replicas}, + } + err := r.AllNamespacesClient.Create(context.TODO(), sts) + require.NoError(t, err) + for i := 0; i < int(replicas); i++ { + pod := &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{Name: fmt.Sprintf("%s-pod-%d", sts.Name, i), Namespace: sts.Namespace}, + Status: corev1.PodStatus{StartTime: &startTime, ContainerStatuses: []corev1.ContainerStatus{ + {RestartCount: 30}, + {RestartCount: 24}, + }}, + } + err := controllerutil.SetControllerReference(sts, pod, r.Scheme) + require.NoError(t, err) + controlledPods = append(controlledPods, pod) + err = r.AllNamespacesClient.Create(context.TODO(), pod) + require.NoError(t, err) + } + return payloads{ + controlledPods: controlledPods, + statefulSet: sts, + allPods: controlledPods, + } +} + +func createPods(t *testing.T, r *Reconciler, owner metav1.Object, startTime metav1.Time, podsToTrack []*corev1.Pod, isCrashlooping bool, conditions ...corev1.PodCondition) []*corev1.Pod { for i := 0; i < 3; i++ { pod := &corev1.Pod{ ObjectMeta: metav1.ObjectMeta{Name: fmt.Sprintf("%s-pod-%d", owner.GetName(), i), Namespace: owner.GetNamespace()}, Status: corev1.PodStatus{StartTime: &startTime, Conditions: conditions}, } + if isCrashlooping { + pod.Status = corev1.PodStatus{StartTime: &startTime, Conditions: conditions, ContainerStatuses: []corev1.ContainerStatus{ + {RestartCount: 52}, + {RestartCount: 24}, + }} + } err := controllerutil.SetControllerReference(owner, pod, r.Scheme) require.NoError(t, err) podsToTrack = append(podsToTrack, pod) From c2300653882c00715be041bc59c395a5db195305 Mon Sep 17 00:00:00 2001 From: Kanika Rana Date: Mon, 24 Feb 2025 17:10:29 -0500 Subject: [PATCH 05/12] revert condition fix --- controllers/idler/idler_controller.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/controllers/idler/idler_controller.go b/controllers/idler/idler_controller.go index b06ade45..dda376c9 100644 --- a/controllers/idler/idler_controller.go +++ b/controllers/idler/idler_controller.go @@ -115,7 +115,7 @@ func (r *Reconciler) Reconcile(ctx context.Context, request ctrl.Request) (ctrl. // Find the earlier pod to kill and requeue. Otherwise, use the idler timeoutSeconds to requeue. 
nextTime := nextPodToBeKilledAfter(logger, idler) after := time.Duration(RequeueTimeThresholdSeconds) * time.Second - if nextTime == nil || (nextTime != nil && *nextTime > after) { + if nextTime == nil || *nextTime > after { logger.Info("requeueing for next pod to check", "after_seconds", after.Seconds()) return reconcile.Result{ Requeue: true, From 82f8118a99f81e4c34d39486a26b46c419193af3 Mon Sep 17 00:00:00 2001 From: Kanika Rana Date: Mon, 24 Feb 2025 17:52:56 -0500 Subject: [PATCH 06/12] fix comment, add comment when tracking controlled pods again, use constant for tests --- controllers/idler/idler_controller.go | 7 +++++-- controllers/idler/idler_controller_test.go | 12 ++++++------ 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/controllers/idler/idler_controller.go b/controllers/idler/idler_controller.go index dda376c9..2bca04fd 100644 --- a/controllers/idler/idler_controller.go +++ b/controllers/idler/idler_controller.go @@ -35,8 +35,11 @@ import ( ) const ( - RestartThreshold = 50 - RequeueTimeThresholdSeconds = 300 + RestartThreshold = 50 + RequeueTimeThresholdSeconds = 300 + RestartCountWithinThresholdContainer1 = 30 + RestartCountWithinThresholdContainer2 = 24 + RestartCountOverThreshold = 52 ) var SupportedScaleResources = map[schema.GroupVersionKind]schema.GroupVersionResource{ diff --git a/controllers/idler/idler_controller_test.go b/controllers/idler/idler_controller_test.go index d4a84dad..b0b8c0f6 100644 --- a/controllers/idler/idler_controller_test.go +++ b/controllers/idler/idler_controller_test.go @@ -250,7 +250,7 @@ func TestEnsureIdling(t *testing.T) { VMRunning(podsTooEarlyToKill.virtualmachine). VMRunning(noise.virtualmachine) - // Only tracks pods that have not been processed in this reconcile + // Only tracks pods that have not been deleted memberoperatortest.AssertThatIdler(t, idler.Name, cl). TracksPods(append(podsTooEarlyToKill.allPods, podsCrashLoopingWithinThreshold.allPods...)). HasConditions(memberoperatortest.Running(), memberoperatortest.IdlerNotificationCreated()) @@ -266,7 +266,7 @@ func TestEnsureIdling(t *testing.T) { require.NoError(t, err) // Tracking existing pods only. memberoperatortest.AssertThatIdler(t, idler.Name, cl). - TracksPods(append(append(append(podsTooEarlyToKill.allPods, podsRunningForTooLong.controlledPods...), podsCrashLoopingWithinThreshold.allPods...), podsCrashLooping.controlledPods...)). + TracksPods(append(append(append(podsTooEarlyToKill.allPods, podsRunningForTooLong.controlledPods...), podsCrashLoopingWithinThreshold.allPods...), podsCrashLooping.controlledPods...)). 
// controlledPods are being tracked again because in unit tests scaling down doesn't delete pods HasConditions(memberoperatortest.Running(), memberoperatortest.IdlerNotificationCreated()) assert.True(t, res.Requeue) @@ -1338,8 +1338,8 @@ func preparePayloadCrashloopingPod(t *testing.T, r *Reconciler, namespace, nameP Namespace: namespace, }, Status: corev1.PodStatus{StartTime: &startTime, ContainerStatuses: []corev1.ContainerStatus{ - {RestartCount: 52}, - {RestartCount: 24}, + {RestartCount: RestartCountOverThreshold}, + {RestartCount: RestartCountWithinThresholdContainer2}, }}, } err := r.AllNamespacesClient.Create(context.TODO(), pod) @@ -1386,8 +1386,8 @@ func preparePayloadCrashloopingPodsWithinThreshold(t *testing.T, r *Reconciler, pod := &corev1.Pod{ ObjectMeta: metav1.ObjectMeta{Name: fmt.Sprintf("%s-pod-%d", sts.Name, i), Namespace: sts.Namespace}, Status: corev1.PodStatus{StartTime: &startTime, ContainerStatuses: []corev1.ContainerStatus{ - {RestartCount: 30}, - {RestartCount: 24}, + {RestartCount: RestartCountWithinThresholdContainer1}, + {RestartCount: RestartCountWithinThresholdContainer2}, }}, } err := controllerutil.SetControllerReference(sts, pod, r.Scheme) From b38555a279fa6b54b980b2ab98707159af6690b2 Mon Sep 17 00:00:00 2001 From: Kanika Rana Date: Mon, 24 Feb 2025 18:39:56 -0500 Subject: [PATCH 07/12] move test's consts in test file --- controllers/idler/idler_controller.go | 7 ++----- controllers/idler/idler_controller_test.go | 6 ++++++ 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/controllers/idler/idler_controller.go b/controllers/idler/idler_controller.go index 2bca04fd..dda376c9 100644 --- a/controllers/idler/idler_controller.go +++ b/controllers/idler/idler_controller.go @@ -35,11 +35,8 @@ import ( ) const ( - RestartThreshold = 50 - RequeueTimeThresholdSeconds = 300 - RestartCountWithinThresholdContainer1 = 30 - RestartCountWithinThresholdContainer2 = 24 - RestartCountOverThreshold = 52 + RestartThreshold = 50 + RequeueTimeThresholdSeconds = 300 ) var SupportedScaleResources = map[schema.GroupVersionKind]schema.GroupVersionResource{ diff --git a/controllers/idler/idler_controller_test.go b/controllers/idler/idler_controller_test.go index b0b8c0f6..fbd9a1d7 100644 --- a/controllers/idler/idler_controller_test.go +++ b/controllers/idler/idler_controller_test.go @@ -41,6 +41,12 @@ import ( "sigs.k8s.io/controller-runtime/pkg/reconcile" ) +const ( + RestartCountWithinThresholdContainer1 = 30 + RestartCountWithinThresholdContainer2 = 24 + RestartCountOverThreshold = 52 +) + func TestReconcile(t *testing.T) { t.Run("No Idler resource found", func(t *testing.T) { From a147e636496395427a349188b8cd8a91bfc020ea Mon Sep 17 00:00:00 2001 From: Kanika Rana Date: Mon, 24 Feb 2025 19:22:35 -0500 Subject: [PATCH 08/12] add rajiv's suggestions from review --- controllers/idler/idler_controller.go | 10 +++++----- controllers/idler/idler_controller_test.go | 6 +++--- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/controllers/idler/idler_controller.go b/controllers/idler/idler_controller.go index dda376c9..5beba12b 100644 --- a/controllers/idler/idler_controller.go +++ b/controllers/idler/idler_controller.go @@ -112,7 +112,7 @@ func (r *Reconciler) Reconcile(ctx context.Context, request ctrl.Request) (ctrl. return reconcile.Result{}, r.wrapErrorWithStatusUpdate(ctx, idler, r.setStatusFailed, err, "failed to ensure idling '%s'", idler.Name) } - // Find the earlier pod to kill and requeue. Otherwise, use the idler timeoutSeconds to requeue. 
+ // Find the earlier pod to kill and requeue. Otherwise, use the RequeueTimeThresholdSeconds to requeue. nextTime := nextPodToBeKilledAfter(logger, idler) after := time.Duration(RequeueTimeThresholdSeconds) * time.Second if nextTime == nil || *nextTime > after { @@ -145,7 +145,7 @@ func (r *Reconciler) ensureIdling(ctx context.Context, idler *toolchainv1alpha1. if crashlooping { podLogger.Info("Pod is restarting too often. Killing the pod", "restart_count", restartCount) // Check if it belongs to a controller (Deployment, DeploymentConfig, etc) and scale it down to zero. - err := deletePodsAndCreateNotification(ctx, podLogger, pod, true, r, idler) + err := deletePodsAndCreateNotification(ctx, podLogger, pod, r, idler) if err != nil { return err } @@ -160,7 +160,7 @@ func (r *Reconciler) ensureIdling(ctx context.Context, idler *toolchainv1alpha1. if time.Now().After(trackedPod.StartTime.Add(time.Duration(timeoutSeconds) * time.Second)) { podLogger.Info("Pod running for too long. Killing the pod.", "start_time", trackedPod.StartTime.Format("2006-01-02T15:04:05Z"), "timeout_seconds", timeoutSeconds) // Check if it belongs to a controller (Deployment, DeploymentConfig, etc) and scale it down to zero. - err := deletePodsAndCreateNotification(ctx, podLogger, pod, false, r, idler) + err := deletePodsAndCreateNotification(ctx, podLogger, pod, r, idler) if err != nil { return err } @@ -184,7 +184,7 @@ func (r *Reconciler) ensureIdling(ctx context.Context, idler *toolchainv1alpha1. // Check if the pod belongs to a controller (Deployment, DeploymentConfig, etc) and scale it down to zero. // if it is a standalone pod, delete it. // Send notification if the deleted pod was managed by a controller, was a standalone pod that was not completed or was crashlooping -func deletePodsAndCreateNotification(ctx context.Context, podLogger logr.Logger, pod corev1.Pod, isCrashlooping bool, r *Reconciler, idler *toolchainv1alpha1.Idler) error { +func deletePodsAndCreateNotification(ctx context.Context, podLogger logr.Logger, pod corev1.Pod, r *Reconciler, idler *toolchainv1alpha1.Idler) error { lctx := log.IntoContext(ctx, podLogger) logger := log.FromContext(ctx) var podreason string @@ -211,7 +211,7 @@ func deletePodsAndCreateNotification(ctx context.Context, podLogger logr.Logger, } // If a build pod is in "PodCompleted" status then it was not running so there's no reason to send an idler notification - if podreason != "PodCompleted" || deletedByController || isCrashlooping { + if podreason != "PodCompleted" || deletedByController { // By now either a pod has been deleted or scaled to zero by controller, idler Triggered notification should be sent if err := r.createNotification(ctx, idler, appName, appType); err != nil { logger.Error(err, "failed to create Notification") diff --git a/controllers/idler/idler_controller_test.go b/controllers/idler/idler_controller_test.go index fbd9a1d7..5fb2342e 100644 --- a/controllers/idler/idler_controller_test.go +++ b/controllers/idler/idler_controller_test.go @@ -150,7 +150,7 @@ func TestEnsureIdling(t *testing.T) { podsTooEarlyToKill := preparePayloads(t, reconciler, idler.Name, "", freshStartTimes(idler)) podsCrashLoopingWithinThreshold := preparePayloadCrashloopingPodsWithinThreshold(t, reconciler, idler.Name, "inThreshRestarts-", freshStartTimes(idler)) - podsCrashLooping := preparePayloadCrashloopingPod(t, reconciler, idler.Name, "restartCount-") + podsCrashLooping := preparePayloadCrashloopingAboveThreshold(t, reconciler, idler.Name, "restartCount-") 
podsRunningForTooLong := preparePayloads(t, reconciler, idler.Name, "todelete-", expiredStartTimes(idler)) noise := preparePayloads(t, reconciler, "another-namespace", "", expiredStartTimes(idler)) @@ -1333,11 +1333,11 @@ func preparePayloadsSinglePod(t *testing.T, r *Reconciler, namespace, namePrefix } } -func preparePayloadCrashloopingPod(t *testing.T, r *Reconciler, namespace, namePrefix string) payloads { +func preparePayloadCrashloopingAboveThreshold(t *testing.T, r *Reconciler, namespace, namePrefix string) payloads { standalonePods := make([]*corev1.Pod, 0, 1) startTime := metav1.Now() replicas := int32(3) - // Create a standalone pod with no owner which has restart count sum > 50 + // Create a standalone pod with no owner which has at least one container with restart count > 50 pod := &corev1.Pod{ ObjectMeta: metav1.ObjectMeta{ Name: fmt.Sprintf("%s%s-pod-fail", namePrefix, namespace), From 9164efed1c4569ec9780def2f3ef716fc2da1abe Mon Sep 17 00:00:00 2001 From: Kanika Rana Date: Tue, 25 Feb 2025 16:17:10 -0500 Subject: [PATCH 09/12] matous's suggestions --- controllers/idler/idler_controller.go | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/controllers/idler/idler_controller.go b/controllers/idler/idler_controller.go index 5beba12b..60b6dcf6 100644 --- a/controllers/idler/idler_controller.go +++ b/controllers/idler/idler_controller.go @@ -141,11 +141,12 @@ func (r *Reconciler) ensureIdling(ctx context.Context, idler *toolchainv1alpha1. podLogger := log.FromContext(ctx).WithValues("pod_name", pod.Name, "pod_phase", pod.Status.Phase) if trackedPod := findPodByName(idler, pod.Name); trackedPod != nil { // check the restart count for the trackedPod - crashlooping, restartCount := isRestartingOften(pod.Status) - if crashlooping { + restartCount := getHighestRestartCount(pod.Status) + if restartCount > RestartThreshold { podLogger.Info("Pod is restarting too often. Killing the pod", "restart_count", restartCount) + lctx := log.IntoContext(ctx, podLogger) // Check if it belongs to a controller (Deployment, DeploymentConfig, etc) and scale it down to zero. - err := deletePodsAndCreateNotification(ctx, podLogger, pod, r, idler) + err := deletePodsAndCreateNotification(ctx, lctx, pod, r, idler) if err != nil { return err } @@ -159,8 +160,9 @@ func (r *Reconciler) ensureIdling(ctx context.Context, idler *toolchainv1alpha1. // Already tracking this pod. Check the timeout. if time.Now().After(trackedPod.StartTime.Add(time.Duration(timeoutSeconds) * time.Second)) { podLogger.Info("Pod running for too long. Killing the pod.", "start_time", trackedPod.StartTime.Format("2006-01-02T15:04:05Z"), "timeout_seconds", timeoutSeconds) + lctx := log.IntoContext(ctx, podLogger) // Check if it belongs to a controller (Deployment, DeploymentConfig, etc) and scale it down to zero. - err := deletePodsAndCreateNotification(ctx, podLogger, pod, r, idler) + err := deletePodsAndCreateNotification(ctx, lctx, pod, r, idler) if err != nil { return err } @@ -184,14 +186,13 @@ func (r *Reconciler) ensureIdling(ctx context.Context, idler *toolchainv1alpha1. // Check if the pod belongs to a controller (Deployment, DeploymentConfig, etc) and scale it down to zero. // if it is a standalone pod, delete it. 
// Send notification if the deleted pod was managed by a controller, was a standalone pod that was not completed or was crashlooping -func deletePodsAndCreateNotification(ctx context.Context, podLogger logr.Logger, pod corev1.Pod, r *Reconciler, idler *toolchainv1alpha1.Idler) error { - lctx := log.IntoContext(ctx, podLogger) +func deletePodsAndCreateNotification(ctx context.Context, lctx context.Context, pod corev1.Pod, r *Reconciler, idler *toolchainv1alpha1.Idler) error { logger := log.FromContext(ctx) - var podreason string + var podReason string podCondition := pod.Status.Conditions for _, podCond := range podCondition { if podCond.Type == "Ready" { - podreason = podCond.Reason + podReason = podCond.Reason } } appType, appName, deletedByController, err := r.scaleControllerToZero(lctx, pod.ObjectMeta) @@ -199,11 +200,11 @@ func deletePodsAndCreateNotification(ctx context.Context, podLogger logr.Logger, return err } if !deletedByController { // Pod not managed by a controller. We can just delete the pod. - log.FromContext(ctx).Info("Deleting pod without controller") + logger.Info("Deleting pod without controller") if err := r.AllNamespacesClient.Delete(ctx, &pod); err != nil { return err } - podLogger.Info("Pod deleted") + logger.Info("Pod deleted") } if appName == "" { appName = pod.Name @@ -211,7 +212,7 @@ func deletePodsAndCreateNotification(ctx context.Context, podLogger logr.Logger, } // If a build pod is in "PodCompleted" status then it was not running so there's no reason to send an idler notification - if podreason != "PodCompleted" || deletedByController { + if podReason != "PodCompleted" || deletedByController { // By now either a pod has been deleted or scaled to zero by controller, idler Triggered notification should be sent if err := r.createNotification(ctx, idler, appName, appType); err != nil { logger.Error(err, "failed to create Notification") @@ -223,14 +224,14 @@ func deletePodsAndCreateNotification(ctx context.Context, podLogger logr.Logger, return nil } -func isRestartingOften(podstatus corev1.PodStatus) (bool, int32) { +func getHighestRestartCount(podstatus corev1.PodStatus) int32 { var restartCount int32 for _, status := range podstatus.ContainerStatuses { if restartCount < status.RestartCount { restartCount = status.RestartCount } } - return restartCount > RestartThreshold, restartCount + return restartCount } func (r *Reconciler) createNotification(ctx context.Context, idler *toolchainv1alpha1.Idler, appName string, appType string) error { From a75d1097a8d0725376b672d66aece6c3f0eb7e5c Mon Sep 17 00:00:00 2001 From: Kanika Rana Date: Wed, 26 Feb 2025 00:09:12 -0500 Subject: [PATCH 10/12] change requeuetime to be the shortest, to fix e2e test failure --- controllers/idler/idler_controller.go | 33 ++++++++++++++-------- controllers/idler/idler_controller_test.go | 9 +++--- 2 files changed, 26 insertions(+), 16 deletions(-) diff --git a/controllers/idler/idler_controller.go b/controllers/idler/idler_controller.go index 60b6dcf6..c47255ae 100644 --- a/controllers/idler/idler_controller.go +++ b/controllers/idler/idler_controller.go @@ -112,20 +112,16 @@ func (r *Reconciler) Reconcile(ctx context.Context, request ctrl.Request) (ctrl. return reconcile.Result{}, r.wrapErrorWithStatusUpdate(ctx, idler, r.setStatusFailed, err, "failed to ensure idling '%s'", idler.Name) } - // Find the earlier pod to kill and requeue. Otherwise, use the RequeueTimeThresholdSeconds to requeue. 
- nextTime := nextPodToBeKilledAfter(logger, idler) - after := time.Duration(RequeueTimeThresholdSeconds) * time.Second - if nextTime == nil || *nextTime > after { - logger.Info("requeueing for next pod to check", "after_seconds", after.Seconds()) - return reconcile.Result{ - Requeue: true, - RequeueAfter: after, - }, r.setStatusReady(ctx, idler) - } - logger.Info("requeueing for next pod to kill", "after_seconds", nextTime.Seconds()) + // Requeue in idler.Spec.TimeoutSeconds or RequeueTimeThresholdSeconds, whichever is sooner. + nextPodToKillAfter := nextPodToBeKilledAfter(logger, idler) + maxRequeDuration := time.Duration(RequeueTimeThresholdSeconds) * time.Second + idlerTimeoutDuration := time.Duration(idler.Spec.TimeoutSeconds) * time.Second + after := findShortestDuration(nextPodToKillAfter, &maxRequeDuration, &idlerTimeoutDuration) + + logger.Info("requeueing for next pod to check", "after_seconds", after.Seconds()) return reconcile.Result{ Requeue: true, - RequeueAfter: *nextTime, + RequeueAfter: *after, }, r.setStatusReady(ctx, idler) } @@ -635,6 +631,19 @@ func nextPodToBeKilledAfter(log logr.Logger, idler *toolchainv1alpha1.Idler) *ti return &d } +// findShortestDuration finds the shortest duration the given durations +func findShortestDuration(durations ...*time.Duration) *time.Duration { + var shortest *time.Duration + for _, d := range durations { + if d != nil { + if shortest == nil || *d < *shortest { + shortest = d + } + } + } + return shortest +} + // updateStatusPods updates the status pods to the new ones but only if something changed. Order is ignored. func (r *Reconciler) updateStatusPods(ctx context.Context, idler *toolchainv1alpha1.Idler, newPods []toolchainv1alpha1.Pod) error { nothingChanged := len(idler.Status.Pods) == len(newPods) diff --git a/controllers/idler/idler_controller_test.go b/controllers/idler/idler_controller_test.go index 5fb2342e..92c4142d 100644 --- a/controllers/idler/idler_controller_test.go +++ b/controllers/idler/idler_controller_test.go @@ -45,6 +45,7 @@ const ( RestartCountWithinThresholdContainer1 = 30 RestartCountWithinThresholdContainer2 = 24 RestartCountOverThreshold = 52 + TestIdlerTimeOutSeconds = 540 ) func TestReconcile(t *testing.T) { @@ -112,7 +113,7 @@ func TestEnsureIdling(t *testing.T) { ObjectMeta: metav1.ObjectMeta{ Name: "john-dev", }, - Spec: toolchainv1alpha1.IdlerSpec{TimeoutSeconds: 30}, + Spec: toolchainv1alpha1.IdlerSpec{TimeoutSeconds: TestIdlerTimeOutSeconds}, } reconciler, req, cl, _, _ := prepareReconcile(t, idler.Name, getHostCluster, idler) @@ -140,7 +141,7 @@ func TestEnsureIdling(t *testing.T) { toolchainv1alpha1.SpaceLabelKey: "alex", }, }, - Spec: toolchainv1alpha1.IdlerSpec{TimeoutSeconds: 60}, + Spec: toolchainv1alpha1.IdlerSpec{TimeoutSeconds: TestIdlerTimeOutSeconds}, } namespaces := []string{"dev", "stage"} usernames := []string{"alex"} @@ -317,7 +318,7 @@ func TestEnsureIdling(t *testing.T) { toolchainv1alpha1.SpaceLabelKey: "alex", }, }, - Spec: toolchainv1alpha1.IdlerSpec{TimeoutSeconds: 60}, + Spec: toolchainv1alpha1.IdlerSpec{TimeoutSeconds: TestIdlerTimeOutSeconds}, } namespaces := []string{"dev", "stage"} usernames := []string{"alex"} @@ -442,7 +443,7 @@ func TestEnsureIdlingFailed(t *testing.T) { ObjectMeta: metav1.ObjectMeta{ Name: "alex-stage", }, - Spec: toolchainv1alpha1.IdlerSpec{TimeoutSeconds: 60}, + Spec: toolchainv1alpha1.IdlerSpec{TimeoutSeconds: TestIdlerTimeOutSeconds}, } vm := &unstructured.Unstructured{} From bfc3cdab10e2b06e72dbf287053a7f7188fad488 Mon Sep 17 00:00:00 2001 From: 
Kanika Rana Date: Wed, 26 Feb 2025 16:38:44 -0500 Subject: [PATCH 11/12] matous's suggestions second review --- controllers/idler/idler_controller.go | 41 +++++++++++----------- controllers/idler/idler_controller_test.go | 6 ++-- 2 files changed, 23 insertions(+), 24 deletions(-) diff --git a/controllers/idler/idler_controller.go b/controllers/idler/idler_controller.go index c47255ae..33685b88 100644 --- a/controllers/idler/idler_controller.go +++ b/controllers/idler/idler_controller.go @@ -18,7 +18,6 @@ import ( "sigs.k8s.io/controller-runtime/pkg/log" "sigs.k8s.io/controller-runtime/pkg/predicate" - "github.com/go-logr/logr" openshiftappsv1 "github.com/openshift/api/apps/v1" errs "github.com/pkg/errors" "github.com/redhat-cop/operator-utils/pkg/util" @@ -35,8 +34,8 @@ import ( ) const ( - RestartThreshold = 50 - RequeueTimeThresholdSeconds = 300 + RestartThreshold = 50 + RequeueTimeThreshold = 300 * time.Second ) var SupportedScaleResources = map[schema.GroupVersionKind]schema.GroupVersionResource{ @@ -112,16 +111,12 @@ func (r *Reconciler) Reconcile(ctx context.Context, request ctrl.Request) (ctrl. return reconcile.Result{}, r.wrapErrorWithStatusUpdate(ctx, idler, r.setStatusFailed, err, "failed to ensure idling '%s'", idler.Name) } - // Requeue in idler.Spec.TimeoutSeconds or RequeueTimeThresholdSeconds, whichever is sooner. - nextPodToKillAfter := nextPodToBeKilledAfter(logger, idler) - maxRequeDuration := time.Duration(RequeueTimeThresholdSeconds) * time.Second - idlerTimeoutDuration := time.Duration(idler.Spec.TimeoutSeconds) * time.Second - after := findShortestDuration(nextPodToKillAfter, &maxRequeDuration, &idlerTimeoutDuration) - + // Requeue in shortest of the following values idler.Spec.TimeoutSeconds or RequeueTimeThreshold or nextPodToBeKilledAfter + after := findShortestRequeueDuration(idler) logger.Info("requeueing for next pod to check", "after_seconds", after.Seconds()) return reconcile.Result{ Requeue: true, - RequeueAfter: *after, + RequeueAfter: after, }, r.setStatusReady(ctx, idler) } @@ -135,14 +130,14 @@ func (r *Reconciler) ensureIdling(ctx context.Context, idler *toolchainv1alpha1. for _, pod := range podList.Items { pod := pod // TODO We won't need it after upgrading to go 1.22: https://go.dev/blog/loopvar-preview podLogger := log.FromContext(ctx).WithValues("pod_name", pod.Name, "pod_phase", pod.Status.Phase) + podCtx := log.IntoContext(ctx, podLogger) if trackedPod := findPodByName(idler, pod.Name); trackedPod != nil { // check the restart count for the trackedPod restartCount := getHighestRestartCount(pod.Status) if restartCount > RestartThreshold { podLogger.Info("Pod is restarting too often. Killing the pod", "restart_count", restartCount) - lctx := log.IntoContext(ctx, podLogger) // Check if it belongs to a controller (Deployment, DeploymentConfig, etc) and scale it down to zero. - err := deletePodsAndCreateNotification(ctx, lctx, pod, r, idler) + err := deletePodsAndCreateNotification(ctx, podCtx, pod, r, idler) if err != nil { return err } @@ -156,9 +151,8 @@ func (r *Reconciler) ensureIdling(ctx context.Context, idler *toolchainv1alpha1. // Already tracking this pod. Check the timeout. if time.Now().After(trackedPod.StartTime.Add(time.Duration(timeoutSeconds) * time.Second)) { podLogger.Info("Pod running for too long. 
Killing the pod.", "start_time", trackedPod.StartTime.Format("2006-01-02T15:04:05Z"), "timeout_seconds", timeoutSeconds) - lctx := log.IntoContext(ctx, podLogger) // Check if it belongs to a controller (Deployment, DeploymentConfig, etc) and scale it down to zero. - err := deletePodsAndCreateNotification(ctx, lctx, pod, r, idler) + err := deletePodsAndCreateNotification(ctx, podCtx, pod, r, idler) if err != nil { return err } @@ -182,7 +176,7 @@ func (r *Reconciler) ensureIdling(ctx context.Context, idler *toolchainv1alpha1. // Check if the pod belongs to a controller (Deployment, DeploymentConfig, etc) and scale it down to zero. // if it is a standalone pod, delete it. // Send notification if the deleted pod was managed by a controller, was a standalone pod that was not completed or was crashlooping -func deletePodsAndCreateNotification(ctx context.Context, lctx context.Context, pod corev1.Pod, r *Reconciler, idler *toolchainv1alpha1.Idler) error { +func deletePodsAndCreateNotification(ctx context.Context, podCtx context.Context, pod corev1.Pod, r *Reconciler, idler *toolchainv1alpha1.Idler) error { logger := log.FromContext(ctx) var podReason string podCondition := pod.Status.Conditions @@ -191,7 +185,7 @@ func deletePodsAndCreateNotification(ctx context.Context, lctx context.Context, podReason = podCond.Reason } } - appType, appName, deletedByController, err := r.scaleControllerToZero(lctx, pod.ObjectMeta) + appType, appName, deletedByController, err := r.scaleControllerToZero(podCtx, pod.ObjectMeta) if err != nil { return err } @@ -611,7 +605,7 @@ func findPodByName(idler *toolchainv1alpha1.Idler, name string) *toolchainv1alph // nextPodToBeKilledAfter checks the start times of all the tracked pods in the Idler and the timeout left // for the next pod to be killed. // If there is no pod to kill, the func returns `nil` -func nextPodToBeKilledAfter(log logr.Logger, idler *toolchainv1alpha1.Idler) *time.Duration { +func nextPodToBeKilledAfter(idler *toolchainv1alpha1.Idler) *time.Duration { if len(idler.Status.Pods) == 0 { // no pod tracked, so nothing to kill return nil @@ -627,12 +621,17 @@ func nextPodToBeKilledAfter(log logr.Logger, idler *toolchainv1alpha1.Idler) *ti if d < 0 { d = 0 } - log.Info("next pod to kill", "after", d) return &d } -// findShortestDuration finds the shortest duration the given durations -func findShortestDuration(durations ...*time.Duration) *time.Duration { +// findShortestRequeueDuration finds the shortest duration the given durations +// returns the shortest duration to requeue after for idler +func findShortestRequeueDuration(idler *toolchainv1alpha1.Idler) time.Duration { + durations := make([]*time.Duration, 0, 3) + nextPodToKillAfter := nextPodToBeKilledAfter(idler) + maxRequeueDuration := RequeueTimeThreshold + idlerTimeoutDuration := time.Duration(idler.Spec.TimeoutSeconds) * time.Second + durations = append(durations, nextPodToKillAfter, &maxRequeueDuration, &idlerTimeoutDuration) var shortest *time.Duration for _, d := range durations { if d != nil { @@ -641,7 +640,7 @@ func findShortestDuration(durations ...*time.Duration) *time.Duration { } } } - return shortest + return *shortest } // updateStatusPods updates the status pods to the new ones but only if something changed. Order is ignored. 
diff --git a/controllers/idler/idler_controller_test.go b/controllers/idler/idler_controller_test.go index 92c4142d..1072985d 100644 --- a/controllers/idler/idler_controller_test.go +++ b/controllers/idler/idler_controller_test.go @@ -127,7 +127,7 @@ func TestEnsureIdling(t *testing.T) { // no pods found - the controller will requeue after 5 mins assert.Equal(t, reconcile.Result{ Requeue: true, - RequeueAfter: RequeueTimeThresholdSeconds * time.Second, + RequeueAfter: RequeueTimeThreshold, }, res) memberoperatortest.AssertThatIdler(t, idler.Name, cl).HasConditions(memberoperatortest.Running()) }) @@ -301,7 +301,7 @@ func TestEnsureIdling(t *testing.T) { // requeue after the idler timeout assert.Equal(t, reconcile.Result{ Requeue: true, - RequeueAfter: RequeueTimeThresholdSeconds * time.Second, + RequeueAfter: RequeueTimeThreshold, }, res) }) }) @@ -539,7 +539,7 @@ func TestEnsureIdlingFailed(t *testing.T) { require.NoError(t, err) // 'NotFound' errors are ignored! assert.Equal(t, reconcile.Result{ Requeue: true, - RequeueAfter: RequeueTimeThresholdSeconds * time.Second, + RequeueAfter: RequeueTimeThreshold, }, res) memberoperatortest.AssertThatIdler(t, idler.Name, cl).ContainsCondition(memberoperatortest.Running()) } From 96d7089a2aa3a139721e7bd1c3e4789a559f5b95 Mon Sep 17 00:00:00 2001 From: Kanika Rana Date: Thu, 27 Feb 2025 10:23:02 -0500 Subject: [PATCH 12/12] only use podCtx in deletePodsAndCreateNotification --- controllers/idler/idler_controller.go | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/controllers/idler/idler_controller.go b/controllers/idler/idler_controller.go index 33685b88..963d2778 100644 --- a/controllers/idler/idler_controller.go +++ b/controllers/idler/idler_controller.go @@ -137,7 +137,7 @@ func (r *Reconciler) ensureIdling(ctx context.Context, idler *toolchainv1alpha1. if restartCount > RestartThreshold { podLogger.Info("Pod is restarting too often. Killing the pod", "restart_count", restartCount) // Check if it belongs to a controller (Deployment, DeploymentConfig, etc) and scale it down to zero. - err := deletePodsAndCreateNotification(ctx, podCtx, pod, r, idler) + err := deletePodsAndCreateNotification(podCtx, pod, r, idler) if err != nil { return err } @@ -152,7 +152,7 @@ func (r *Reconciler) ensureIdling(ctx context.Context, idler *toolchainv1alpha1. if time.Now().After(trackedPod.StartTime.Add(time.Duration(timeoutSeconds) * time.Second)) { podLogger.Info("Pod running for too long. Killing the pod.", "start_time", trackedPod.StartTime.Format("2006-01-02T15:04:05Z"), "timeout_seconds", timeoutSeconds) // Check if it belongs to a controller (Deployment, DeploymentConfig, etc) and scale it down to zero. - err := deletePodsAndCreateNotification(ctx, podCtx, pod, r, idler) + err := deletePodsAndCreateNotification(podCtx, pod, r, idler) if err != nil { return err } @@ -176,8 +176,8 @@ func (r *Reconciler) ensureIdling(ctx context.Context, idler *toolchainv1alpha1. // Check if the pod belongs to a controller (Deployment, DeploymentConfig, etc) and scale it down to zero. // if it is a standalone pod, delete it. 
// Send notification if the deleted pod was managed by a controller, was a standalone pod that was not completed or was crashlooping -func deletePodsAndCreateNotification(ctx context.Context, podCtx context.Context, pod corev1.Pod, r *Reconciler, idler *toolchainv1alpha1.Idler) error { - logger := log.FromContext(ctx) +func deletePodsAndCreateNotification(podCtx context.Context, pod corev1.Pod, r *Reconciler, idler *toolchainv1alpha1.Idler) error { + logger := log.FromContext(podCtx) var podReason string podCondition := pod.Status.Conditions for _, podCond := range podCondition { @@ -191,7 +191,7 @@ func deletePodsAndCreateNotification(ctx context.Context, podCtx context.Context } if !deletedByController { // Pod not managed by a controller. We can just delete the pod. logger.Info("Deleting pod without controller") - if err := r.AllNamespacesClient.Delete(ctx, &pod); err != nil { + if err := r.AllNamespacesClient.Delete(podCtx, &pod); err != nil { return err } logger.Info("Pod deleted") @@ -204,9 +204,9 @@ func deletePodsAndCreateNotification(ctx context.Context, podCtx context.Context // If a build pod is in "PodCompleted" status then it was not running so there's no reason to send an idler notification if podReason != "PodCompleted" || deletedByController { // By now either a pod has been deleted or scaled to zero by controller, idler Triggered notification should be sent - if err := r.createNotification(ctx, idler, appName, appType); err != nil { + if err := r.createNotification(podCtx, idler, appName, appType); err != nil { logger.Error(err, "failed to create Notification") - if err = r.setStatusIdlerNotificationCreationFailed(ctx, idler, err.Error()); err != nil { + if err = r.setStatusIdlerNotificationCreationFailed(podCtx, idler, err.Error()); err != nil { logger.Error(err, "failed to set status IdlerNotificationCreationFailed") } // not returning error to continue tracking remaining pods }
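Editor's note: for readers skimming the series, below is a minimal, self-contained sketch of the behaviour these patches converge on. It uses simplified local stand-ins for the Kubernetes and toolchain types (the real code works with corev1.PodStatus and toolchainv1alpha1.Idler inside the Reconciler), so the type names and signatures here are illustrative assumptions, not the operator's API. The two grounded pieces of logic are: a pod counts as crashlooping when any single container's restart count exceeds RestartThreshold (50), and the controller requeues after the shortest of the next pod's kill deadline, the 300-second cap, and the idler's own timeout.

package main

import (
	"fmt"
	"time"
)

const (
	RestartThreshold     = 50
	RequeueTimeThreshold = 300 * time.Second
)

// Simplified stand-ins for corev1.ContainerStatus / corev1.PodStatus and the Idler CR.
type containerStatus struct{ RestartCount int32 }

type podStatus struct{ ContainerStatuses []containerStatus }

type trackedPod struct {
	StartTime      time.Time
	TimeoutSeconds int32
}

type idler struct {
	TimeoutSeconds int32
	Pods           []trackedPod
}

// getHighestRestartCount mirrors the helper from patch 09: the pod is treated as
// crashlooping when any single container has restarted more than RestartThreshold
// times (the highest per-container count, not the sum across containers).
func getHighestRestartCount(status podStatus) int32 {
	var highest int32
	for _, cs := range status.ContainerStatuses {
		if cs.RestartCount > highest {
			highest = cs.RestartCount
		}
	}
	return highest
}

// nextPodToBeKilledAfter returns how long until the earliest tracked pod exceeds
// its timeout, or nil when no pods are tracked.
func nextPodToBeKilledAfter(i idler) *time.Duration {
	if len(i.Pods) == 0 {
		return nil
	}
	var remaining time.Duration
	for k, p := range i.Pods {
		d := time.Until(p.StartTime.Add(time.Duration(p.TimeoutSeconds) * time.Second))
		if k == 0 || d < remaining {
			remaining = d
		}
	}
	if remaining < 0 {
		remaining = 0
	}
	return &remaining
}

// findShortestRequeueDuration mirrors patch 11: requeue after the shortest of the
// next-pod deadline, the 300-second cap, and the idler's own timeout.
func findShortestRequeueDuration(i idler) time.Duration {
	shortest := RequeueTimeThreshold
	if idlerTimeout := time.Duration(i.TimeoutSeconds) * time.Second; idlerTimeout < shortest {
		shortest = idlerTimeout
	}
	if next := nextPodToBeKilledAfter(i); next != nil && *next < shortest {
		shortest = *next
	}
	return shortest
}

func main() {
	// Restart counts taken from the test payloads (52 and 24): 52 > RestartThreshold,
	// so this pod would be idled immediately regardless of its start time.
	crashlooping := podStatus{ContainerStatuses: []containerStatus{{RestartCount: 52}, {RestartCount: 24}}}
	fmt.Println("highest restart count:", getHighestRestartCount(crashlooping))

	// With a 540-second idler timeout (as in the tests) and a freshly started pod,
	// the requeue interval is capped at the 5-minute threshold.
	i := idler{TimeoutSeconds: 540, Pods: []trackedPod{{StartTime: time.Now(), TimeoutSeconds: 540}}}
	fmt.Println("requeue after:", findShortestRequeueDuration(i))
}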