diff --git a/controllers/idler/idler_controller.go b/controllers/idler/idler_controller.go
index eb4bdb75..963d2778 100644
--- a/controllers/idler/idler_controller.go
+++ b/controllers/idler/idler_controller.go
@@ -18,7 +18,6 @@ import (
     "sigs.k8s.io/controller-runtime/pkg/log"
     "sigs.k8s.io/controller-runtime/pkg/predicate"
 
-    "github.com/go-logr/logr"
     openshiftappsv1 "github.com/openshift/api/apps/v1"
     errs "github.com/pkg/errors"
     "github.com/redhat-cop/operator-utils/pkg/util"
@@ -34,6 +33,11 @@ import (
     "sigs.k8s.io/controller-runtime/pkg/reconcile"
 )
 
+const (
+    RestartThreshold     = 50
+    RequeueTimeThreshold = 300 * time.Second
+)
+
 var SupportedScaleResources = map[schema.GroupVersionKind]schema.GroupVersionResource{
     schema.GroupVersion{Group: "camel.apache.org", Version: "v1"}.WithKind("Integration"):          schema.GroupVersion{Group: "camel.apache.org", Version: "v1"}.WithResource("integrations"),
     schema.GroupVersion{Group: "camel.apache.org", Version: "v1alpha1"}.WithKind("KameletBinding"): schema.GroupVersion{Group: "camel.apache.org", Version: "v1alpha1"}.WithResource("kameletbindings"),
@@ -107,20 +111,12 @@ func (r *Reconciler) Reconcile(ctx context.Context, request ctrl.Request) (ctrl.
         return reconcile.Result{}, r.wrapErrorWithStatusUpdate(ctx, idler, r.setStatusFailed, err, "failed to ensure idling '%s'", idler.Name)
     }
 
-    // Find the earlier pod to kill and requeue. Otherwise, use the idler timeoutSeconds to requeue.
-    nextTime := nextPodToBeKilledAfter(logger, idler)
-    if nextTime == nil {
-        after := time.Duration(idler.Spec.TimeoutSeconds) * time.Second
-        logger.Info("requeueing for next pod to check", "after_seconds", after.Seconds())
-        return reconcile.Result{
-            Requeue:      true,
-            RequeueAfter: after,
-        }, r.setStatusReady(ctx, idler)
-    }
-    logger.Info("requeueing for next pod to kill", "after_seconds", nextTime.Seconds())
+    // Requeue after the shortest of idler.Spec.TimeoutSeconds, RequeueTimeThreshold and nextPodToBeKilledAfter
+    after := findShortestRequeueDuration(idler)
+    logger.Info("requeueing for next pod to check", "after_seconds", after.Seconds())
     return reconcile.Result{
         Requeue:      true,
-        RequeueAfter: *nextTime,
+        RequeueAfter: after,
     }, r.setStatusReady(ctx, idler)
 }
 
@@ -133,9 +129,20 @@ func (r *Reconciler) ensureIdling(ctx context.Context, idler *toolchainv1alpha1.
     newStatusPods := make([]toolchainv1alpha1.Pod, 0, 10)
     for _, pod := range podList.Items {
         pod := pod // TODO We won't need it after upgrading to go 1.22: https://go.dev/blog/loopvar-preview
-        logger := log.FromContext(ctx)
-        podLogger := logger.WithValues("pod_name", pod.Name, "pod_phase", pod.Status.Phase)
+        podLogger := log.FromContext(ctx).WithValues("pod_name", pod.Name, "pod_phase", pod.Status.Phase)
+        podCtx := log.IntoContext(ctx, podLogger)
         if trackedPod := findPodByName(idler, pod.Name); trackedPod != nil {
+            // check the restart count for the trackedPod
+            restartCount := getHighestRestartCount(pod.Status)
+            if restartCount > RestartThreshold {
+                podLogger.Info("Pod is restarting too often. Killing the pod", "restart_count", restartCount)
+                // Check if it belongs to a controller (Deployment, DeploymentConfig, etc) and scale it down to zero.
+                err := deletePodsAndCreateNotification(podCtx, pod, r, idler)
+                if err != nil {
+                    return err
+                }
+                continue
+            }
             timeoutSeconds := idler.Spec.TimeoutSeconds
             if isOwnedByVM(pod.ObjectMeta) {
                 // use 1/12th of the timeout for VMs
@@ -144,43 +151,11 @@
             // Already tracking this pod. Check the timeout.
             if time.Now().After(trackedPod.StartTime.Add(time.Duration(timeoutSeconds) * time.Second)) {
                 podLogger.Info("Pod running for too long. Killing the pod.", "start_time", trackedPod.StartTime.Format("2006-01-02T15:04:05Z"), "timeout_seconds", timeoutSeconds)
-                var podreason string
-                podCondition := pod.Status.Conditions
-                for _, podCond := range podCondition {
-                    if podCond.Type == "Ready" {
-                        podreason = podCond.Reason
-                    }
-                }
-                // Check if it belongs to a controller (Deployment, DeploymentConfig, etc) and scale it down to zero.
-                lctx := log.IntoContext(ctx, podLogger)
-                appType, appName, deletedByController, err := r.scaleControllerToZero(lctx, pod.ObjectMeta)
+                err := deletePodsAndCreateNotification(podCtx, pod, r, idler)
                 if err != nil {
                     return err
                 }
-                if !deletedByController { // Pod not managed by a controller. We can just delete the pod.
-                    logger.Info("Deleting pod without controller")
-                    if err := r.AllNamespacesClient.Delete(ctx, &pod); err != nil {
-                        return err
-                    }
-                    podLogger.Info("Pod deleted")
-                }
-
-                if appName == "" {
-                    appName = pod.Name
-                    appType = "Pod"
-                }
-                // Send notification if the deleted pod was managed by a controller or was a standalone pod that was not completed
-                // eg. If a build pod is in "PodCompleted" status then it was not running so there's no reason to send an idler notification
-                if podreason != "PodCompleted" || deletedByController {
-                    // By now either a pod has been deleted or scaled to zero by controller, idler Triggered notification should be sent
-                    if err := r.createNotification(ctx, idler, appName, appType); err != nil {
-                        logger.Error(err, "failed to create Notification")
-                        if err = r.setStatusIdlerNotificationCreationFailed(ctx, idler, err.Error()); err != nil {
-                            logger.Error(err, "failed to set status IdlerNotificationCreationFailed")
-                        } // not returning error to continue tracking remaining pods
-                    }
-                }
             } else {
                 newStatusPods = append(newStatusPods, *trackedPod) // keep tracking
@@ -198,6 +173,57 @@ func (r *Reconciler) ensureIdling(ctx context.Context, idler *toolchainv1alpha1.
     return r.updateStatusPods(ctx, idler, newStatusPods)
 }
 
+// deletePodsAndCreateNotification checks if the pod belongs to a controller (Deployment, DeploymentConfig, etc.) and scales it down to zero.
+// If it is a standalone pod, it deletes the pod directly.
+// A notification is sent if the deleted pod was managed by a controller, was a standalone pod that had not completed, or was crashlooping.
+func deletePodsAndCreateNotification(podCtx context.Context, pod corev1.Pod, r *Reconciler, idler *toolchainv1alpha1.Idler) error {
+    logger := log.FromContext(podCtx)
+    var podReason string
+    podCondition := pod.Status.Conditions
+    for _, podCond := range podCondition {
+        if podCond.Type == "Ready" {
+            podReason = podCond.Reason
+        }
+    }
+    appType, appName, deletedByController, err := r.scaleControllerToZero(podCtx, pod.ObjectMeta)
+    if err != nil {
+        return err
+    }
+    if !deletedByController { // Pod not managed by a controller. We can just delete the pod.
+ logger.Info("Deleting pod without controller") + if err := r.AllNamespacesClient.Delete(podCtx, &pod); err != nil { + return err + } + logger.Info("Pod deleted") + } + if appName == "" { + appName = pod.Name + appType = "Pod" + } + + // If a build pod is in "PodCompleted" status then it was not running so there's no reason to send an idler notification + if podReason != "PodCompleted" || deletedByController { + // By now either a pod has been deleted or scaled to zero by controller, idler Triggered notification should be sent + if err := r.createNotification(podCtx, idler, appName, appType); err != nil { + logger.Error(err, "failed to create Notification") + if err = r.setStatusIdlerNotificationCreationFailed(podCtx, idler, err.Error()); err != nil { + logger.Error(err, "failed to set status IdlerNotificationCreationFailed") + } // not returning error to continue tracking remaining pods + } + } + return nil +} + +func getHighestRestartCount(podstatus corev1.PodStatus) int32 { + var restartCount int32 + for _, status := range podstatus.ContainerStatuses { + if restartCount < status.RestartCount { + restartCount = status.RestartCount + } + } + return restartCount +} + func (r *Reconciler) createNotification(ctx context.Context, idler *toolchainv1alpha1.Idler, appName string, appType string) error { log.FromContext(ctx).Info("Create Notification") //Get the HostClient @@ -579,7 +605,7 @@ func findPodByName(idler *toolchainv1alpha1.Idler, name string) *toolchainv1alph // nextPodToBeKilledAfter checks the start times of all the tracked pods in the Idler and the timeout left // for the next pod to be killed. // If there is no pod to kill, the func returns `nil` -func nextPodToBeKilledAfter(log logr.Logger, idler *toolchainv1alpha1.Idler) *time.Duration { +func nextPodToBeKilledAfter(idler *toolchainv1alpha1.Idler) *time.Duration { if len(idler.Status.Pods) == 0 { // no pod tracked, so nothing to kill return nil @@ -595,10 +621,28 @@ func nextPodToBeKilledAfter(log logr.Logger, idler *toolchainv1alpha1.Idler) *ti if d < 0 { d = 0 } - log.Info("next pod to kill", "after", d) return &d } +// findShortestRequeueDuration finds the shortest duration the given durations +// returns the shortest duration to requeue after for idler +func findShortestRequeueDuration(idler *toolchainv1alpha1.Idler) time.Duration { + durations := make([]*time.Duration, 0, 3) + nextPodToKillAfter := nextPodToBeKilledAfter(idler) + maxRequeueDuration := RequeueTimeThreshold + idlerTimeoutDuration := time.Duration(idler.Spec.TimeoutSeconds) * time.Second + durations = append(durations, nextPodToKillAfter, &maxRequeueDuration, &idlerTimeoutDuration) + var shortest *time.Duration + for _, d := range durations { + if d != nil { + if shortest == nil || *d < *shortest { + shortest = d + } + } + } + return *shortest +} + // updateStatusPods updates the status pods to the new ones but only if something changed. Order is ignored. 
 func (r *Reconciler) updateStatusPods(ctx context.Context, idler *toolchainv1alpha1.Idler, newPods []toolchainv1alpha1.Pod) error {
     nothingChanged := len(idler.Status.Pods) == len(newPods)
diff --git a/controllers/idler/idler_controller_test.go b/controllers/idler/idler_controller_test.go
index e0dc5a19..1072985d 100644
--- a/controllers/idler/idler_controller_test.go
+++ b/controllers/idler/idler_controller_test.go
@@ -41,6 +41,13 @@ import (
     "sigs.k8s.io/controller-runtime/pkg/reconcile"
 )
 
+const (
+    RestartCountWithinThresholdContainer1 = 30
+    RestartCountWithinThresholdContainer2 = 24
+    RestartCountOverThreshold             = 52
+    TestIdlerTimeOutSeconds               = 540
+)
+
 func TestReconcile(t *testing.T) {
 
     t.Run("No Idler resource found", func(t *testing.T) {
@@ -106,7 +113,7 @@ func TestEnsureIdling(t *testing.T) {
         ObjectMeta: metav1.ObjectMeta{
             Name: "john-dev",
         },
-        Spec: toolchainv1alpha1.IdlerSpec{TimeoutSeconds: 30},
+        Spec: toolchainv1alpha1.IdlerSpec{TimeoutSeconds: TestIdlerTimeOutSeconds},
     }
 
     reconciler, req, cl, _, _ := prepareReconcile(t, idler.Name, getHostCluster, idler)
@@ -117,10 +124,10 @@ func TestEnsureIdling(t *testing.T) {
     // then
     require.NoError(t, err)
 
-    // no pods found - the controller will requeue after the idler's timeout
+    // no pods found - the controller will requeue after 5 mins
     assert.Equal(t, reconcile.Result{
         Requeue:      true,
-        RequeueAfter: 30 * time.Second,
+        RequeueAfter: RequeueTimeThreshold,
     }, res)
     memberoperatortest.AssertThatIdler(t, idler.Name, cl).HasConditions(memberoperatortest.Running())
 })
@@ -134,7 +141,7 @@ func TestEnsureIdling(t *testing.T) {
         Labels: map[string]string{
             toolchainv1alpha1.SpaceLabelKey: "alex",
         },
     },
-    Spec: toolchainv1alpha1.IdlerSpec{TimeoutSeconds: 60},
+    Spec: toolchainv1alpha1.IdlerSpec{TimeoutSeconds: TestIdlerTimeOutSeconds},
 }
 namespaces := []string{"dev", "stage"}
 usernames := []string{"alex"}
@@ -143,8 +150,10 @@ func TestEnsureIdling(t *testing.T) {
 reconciler, req, cl, allCl, dynamicClient := prepareReconcile(t, idler.Name, getHostCluster, idler, nsTmplSet, mur)
 
 podsTooEarlyToKill := preparePayloads(t, reconciler, idler.Name, "", freshStartTimes(idler))
-
+podsCrashLoopingWithinThreshold := preparePayloadCrashloopingPodsWithinThreshold(t, reconciler, idler.Name, "inThreshRestarts-", freshStartTimes(idler))
+podsCrashLooping := preparePayloadCrashloopingAboveThreshold(t, reconciler, idler.Name, "restartCount-")
 podsRunningForTooLong := preparePayloads(t, reconciler, idler.Name, "todelete-", expiredStartTimes(idler))
+
 noise := preparePayloads(t, reconciler, "another-namespace", "", expiredStartTimes(idler))
 
 t.Run("First reconcile. Start tracking.", func(t *testing.T) {
@@ -158,6 +167,7 @@ func TestEnsureIdling(t *testing.T) {
         PodsExist(podsRunningForTooLong.standalonePods).
         PodsExist(podsTooEarlyToKill.standalonePods).
         PodsExist(noise.standalonePods).
+        PodsExist(podsCrashLooping.standalonePods).
         DaemonSetExists(podsRunningForTooLong.daemonSet).
         DaemonSetExists(podsTooEarlyToKill.daemonSet).
         DaemonSetExists(noise.daemonSet).
@@ -173,6 +183,7 @@ func TestEnsureIdling(t *testing.T) {
         DeploymentScaledUp(noise.deployment).
         DeploymentScaledUp(noise.integration).
         DeploymentScaledUp(noise.kameletBinding).
+        DeploymentScaledUp(podsCrashLooping.deployment).
         ReplicaSetScaledUp(podsRunningForTooLong.replicaSet).
         ReplicaSetScaledUp(podsTooEarlyToKill.replicaSet).
         ReplicaSetScaledUp(noise.replicaSet).
@@ -185,30 +196,34 @@ func TestEnsureIdling(t *testing.T) {
         StatefulSetScaledUp(podsRunningForTooLong.statefulSet).
         StatefulSetScaledUp(podsTooEarlyToKill.statefulSet).
         StatefulSetScaledUp(noise.statefulSet).
+        StatefulSetScaledUp(podsCrashLoopingWithinThreshold.statefulSet).
         VMRunning(podsRunningForTooLong.virtualmachine).
         VMRunning(podsTooEarlyToKill.virtualmachine).
         VMRunning(noise.virtualmachine)
 
+    // after upgrading to golang 1.22 this can be replaced with slices.Concat(podsTooEarlyToKill.allPods, podsRunningForTooLong.allPods, podsCrashLooping.allPods, podsCrashLoopingWithinThreshold.allPods)
     // Tracked pods
     memberoperatortest.AssertThatIdler(t, idler.Name, cl).
-        TracksPods(append(podsTooEarlyToKill.allPods, podsRunningForTooLong.allPods...)).
+        TracksPods(append(append(append(podsTooEarlyToKill.allPods, podsRunningForTooLong.allPods...), podsCrashLooping.allPods...), podsCrashLoopingWithinThreshold.allPods...)).
         HasConditions(memberoperatortest.Running())
 
     assert.True(t, res.Requeue)
     assert.Equal(t, 0, int(res.RequeueAfter)) // pods running for too long should be killed immediately
 
-    t.Run("Second Reconcile. Delete long running pods.", func(t *testing.T) {
+    t.Run("Second Reconcile. Delete long running and crashlooping pods.", func(t *testing.T) {
         //when
         res, err := reconciler.Reconcile(context.TODO(), req)
 
         // then
         require.NoError(t, err)
 
         // Too long running pods are gone. All long running controllers are scaled down.
+        // Crashlooping pods are gone.
         // The rest of the pods are still there and controllers are scaled up.
         memberoperatortest.AssertThatInIdleableCluster(t, allCl, dynamicClient).
             PodsDoNotExist(podsRunningForTooLong.standalonePods).
             PodsExist(podsTooEarlyToKill.standalonePods).
             PodsExist(noise.standalonePods).
+            PodsDoNotExist(podsCrashLooping.standalonePods).
             DaemonSetDoesNotExist(podsRunningForTooLong.daemonSet).
             DaemonSetExists(podsTooEarlyToKill.daemonSet).
             DaemonSetExists(noise.daemonSet).
@@ -218,6 +233,7 @@ func TestEnsureIdling(t *testing.T) {
             DeploymentScaledDown(podsRunningForTooLong.deployment).
             DeploymentScaledDown(podsRunningForTooLong.integration).
             DeploymentScaledDown(podsRunningForTooLong.kameletBinding).
+            DeploymentScaledDown(podsCrashLooping.deployment).
             DeploymentScaledUp(podsTooEarlyToKill.deployment).
             DeploymentScaledUp(podsTooEarlyToKill.integration).
             DeploymentScaledUp(podsTooEarlyToKill.kameletBinding).
@@ -236,13 +252,14 @@ func TestEnsureIdling(t *testing.T) {
             StatefulSetScaledDown(podsRunningForTooLong.statefulSet).
             StatefulSetScaledUp(podsTooEarlyToKill.statefulSet).
             StatefulSetScaledUp(noise.statefulSet).
+            StatefulSetScaledUp(podsCrashLoopingWithinThreshold.statefulSet).
             VMStopped(podsRunningForTooLong.virtualmachine).
             VMRunning(podsTooEarlyToKill.virtualmachine).
             VMRunning(noise.virtualmachine)
 
-        // Still tracking all pods. Even deleted ones.
+        // Only tracks pods that have not been deleted
         memberoperatortest.AssertThatIdler(t, idler.Name, cl).
-            TracksPods(podsTooEarlyToKill.allPods).
+            TracksPods(append(podsTooEarlyToKill.allPods, podsCrashLoopingWithinThreshold.allPods...)).
             HasConditions(memberoperatortest.Running(), memberoperatortest.IdlerNotificationCreated())
 
         assert.True(t, res.Requeue)
@@ -256,7 +273,7 @@ func TestEnsureIdling(t *testing.T) {
             require.NoError(t, err)
             // Tracking existing pods only.
             memberoperatortest.AssertThatIdler(t, idler.Name, cl).
-                TracksPods(append(podsTooEarlyToKill.allPods, podsRunningForTooLong.controlledPods...)).
+                TracksPods(append(append(append(podsTooEarlyToKill.allPods, podsRunningForTooLong.controlledPods...), podsCrashLoopingWithinThreshold.allPods...), podsCrashLooping.controlledPods...)).
                 // controlledPods are being tracked again because in unit tests scaling down doesn't delete pods
                 HasConditions(memberoperatortest.Running(), memberoperatortest.IdlerNotificationCreated())
 
             assert.True(t, res.Requeue)
@@ -265,7 +282,7 @@ func TestEnsureIdling(t *testing.T) {
         t.Run("No pods. No requeue.", func(t *testing.T) {
             //given
             // cleanup remaining pods
-            pods := append(podsTooEarlyToKill.allPods, podsRunningForTooLong.controlledPods...)
+            pods := append(append(append(podsTooEarlyToKill.allPods, podsRunningForTooLong.controlledPods...), podsCrashLoopingWithinThreshold.allPods...), podsCrashLooping.controlledPods...)
             for _, pod := range pods {
                 err := allCl.Delete(context.TODO(), pod)
                 require.NoError(t, err)
@@ -284,7 +301,7 @@ func TestEnsureIdling(t *testing.T) {
             // requeue after the idler timeout
             assert.Equal(t, reconcile.Result{
                 Requeue:      true,
-                RequeueAfter: 60 * time.Second,
+                RequeueAfter: RequeueTimeThreshold,
             }, res)
         })
     })
@@ -301,7 +318,7 @@ func TestEnsureIdling(t *testing.T) {
             toolchainv1alpha1.SpaceLabelKey: "alex",
         },
     },
-    Spec: toolchainv1alpha1.IdlerSpec{TimeoutSeconds: 60},
+    Spec: toolchainv1alpha1.IdlerSpec{TimeoutSeconds: TestIdlerTimeOutSeconds},
 }
 namespaces := []string{"dev", "stage"}
 usernames := []string{"alex"}
@@ -426,7 +443,7 @@ func TestEnsureIdlingFailed(t *testing.T) {
     ObjectMeta: metav1.ObjectMeta{
         Name: "alex-stage",
     },
-    Spec: toolchainv1alpha1.IdlerSpec{TimeoutSeconds: 60},
+    Spec: toolchainv1alpha1.IdlerSpec{TimeoutSeconds: TestIdlerTimeOutSeconds},
 }
 
 vm := &unstructured.Unstructured{}
@@ -522,7 +539,7 @@ func TestEnsureIdlingFailed(t *testing.T) {
     require.NoError(t, err) // 'NotFound' errors are ignored!
     assert.Equal(t, reconcile.Result{
         Requeue:      true,
-        RequeueAfter: 60 * time.Second,
+        RequeueAfter: RequeueTimeThreshold,
     }, res)
     memberoperatortest.AssertThatIdler(t, idler.Name, cl).ContainsCondition(memberoperatortest.Running())
 }
@@ -1118,7 +1135,7 @@ func preparePayloads(t *testing.T, r *Reconciler, namespace, namePrefix string,
     require.NoError(t, err)
     err = r.AllNamespacesClient.Create(context.TODO(), rs)
     require.NoError(t, err)
-    controlledPods := createPods(t, r, rs, sTime, make([]*corev1.Pod, 0, 3), conditions...)
+    controlledPods := createPods(t, r, rs, sTime, make([]*corev1.Pod, 0, 3), false, conditions...)
 
     // Deployment with Camel K integration as an owner reference and a scale sub resource
     integration := &appsv1.Deployment{
@@ -1145,7 +1162,7 @@ func preparePayloads(t *testing.T, r *Reconciler, namespace, namePrefix string,
     require.NoError(t, err)
     err = r.AllNamespacesClient.Create(context.TODO(), integrationRS)
     require.NoError(t, err)
-    controlledPods = createPods(t, r, integrationRS, sTime, controlledPods)
+    controlledPods = createPods(t, r, integrationRS, sTime, controlledPods, false)
 
     // Deployment with Camel K integration as an owner reference and a scale sub resource
     binding := &appsv1.Deployment{
@@ -1172,7 +1189,7 @@ func preparePayloads(t *testing.T, r *Reconciler, namespace, namePrefix string,
     require.NoError(t, err)
     err = r.AllNamespacesClient.Create(context.TODO(), bindingRS)
     require.NoError(t, err)
-    controlledPods = createPods(t, r, bindingRS, sTime, controlledPods)
+    controlledPods = createPods(t, r, bindingRS, sTime, controlledPods, false)
 
     // Standalone ReplicaSet
     standaloneRs := &appsv1.ReplicaSet{
@@ -1181,7 +1198,7 @@ func preparePayloads(t *testing.T, r *Reconciler, namespace, namePrefix string,
     }
     err = r.AllNamespacesClient.Create(context.TODO(), standaloneRs)
     require.NoError(t, err)
-    controlledPods = createPods(t, r, standaloneRs, sTime, controlledPods)
+    controlledPods = createPods(t, r, standaloneRs, sTime, controlledPods, false)
 
     // DaemonSet
     ds := &appsv1.DaemonSet{
@@ -1189,7 +1206,7 @@ func preparePayloads(t *testing.T, r *Reconciler, namespace, namePrefix string,
     }
     err = r.AllNamespacesClient.Create(context.TODO(), ds)
     require.NoError(t, err)
-    controlledPods = createPods(t, r, ds, sTime, controlledPods)
+    controlledPods = createPods(t, r, ds, sTime, controlledPods, false)
 
     // Job
     job := &batchv1.Job{
@@ -1197,7 +1214,7 @@ func preparePayloads(t *testing.T, r *Reconciler, namespace, namePrefix string,
     }
     err = r.AllNamespacesClient.Create(context.TODO(), job)
     require.NoError(t, err)
-    controlledPods = createPods(t, r, job, sTime, controlledPods)
+    controlledPods = createPods(t, r, job, sTime, controlledPods, false)
 
     // StatefulSet
     sts := &appsv1.StatefulSet{
@@ -1206,7 +1223,7 @@ func preparePayloads(t *testing.T, r *Reconciler, namespace, namePrefix string,
     }
     err = r.AllNamespacesClient.Create(context.TODO(), sts)
     require.NoError(t, err)
-    controlledPods = createPods(t, r, sts, sTime, controlledPods)
+    controlledPods = createPods(t, r, sts, sTime, controlledPods, false)
 
     // DeploymentConfig
     dc := &openshiftappsv1.DeploymentConfig{
@@ -1223,7 +1240,7 @@ func preparePayloads(t *testing.T, r *Reconciler, namespace, namePrefix string,
     require.NoError(t, err)
     err = r.AllNamespacesClient.Create(context.TODO(), rc)
     require.NoError(t, err)
-    controlledPods = createPods(t, r, rc, sTime, controlledPods)
+    controlledPods = createPods(t, r, rc, sTime, controlledPods, false)
 
     // VirtualMachine
     vm := &unstructured.Unstructured{}
@@ -1245,7 +1262,7 @@ func preparePayloads(t *testing.T, r *Reconciler, namespace, namePrefix string,
     require.NoError(t, err)
     _, err = r.DynamicClient.Resource(vmInstanceGVR).Namespace(namespace).Create(context.TODO(), vmi, metav1.CreateOptions{})
     require.NoError(t, err)
-    controlledPods = createPods(t, r, vmi, vmstartTime, controlledPods) // vmi controls pod
+    controlledPods = createPods(t, r, vmi, vmstartTime, controlledPods, false) // vmi controls pod
 
     // Standalone ReplicationController
     standaloneRC := &corev1.ReplicationController{
@@ -1254,7 +1271,7 @@ func preparePayloads(t *testing.T, r *Reconciler, namespace, namePrefix string,
     }
     err = r.AllNamespacesClient.Create(context.TODO(), standaloneRC)
     require.NoError(t, err)
-    controlledPods = createPods(t, r, standaloneRC, sTime, controlledPods)
+    controlledPods = createPods(t, r, standaloneRC, sTime, controlledPods, false)
 
     // Pods with unknown owner. They are subject of direct management by the Idler.
     // It doesn't have to be Idler. We just need any object as the owner of the pods
@@ -1266,7 +1283,7 @@ func preparePayloads(t *testing.T, r *Reconciler, namespace, namePrefix string,
         },
         Spec: toolchainv1alpha1.IdlerSpec{TimeoutSeconds: 30},
     }
-    standalonePods := createPods(t, r, idler, sTime, make([]*corev1.Pod, 0, 3))
+    standalonePods := createPods(t, r, idler, sTime, make([]*corev1.Pod, 0, 3), false)
 
     // Pods with no owner.
     for i := 0; i < 3; i++ {
@@ -1317,12 +1334,94 @@ func preparePayloadsSinglePod(t *testing.T, r *Reconciler, namespace, namePrefix
     }
 }
 
-func createPods(t *testing.T, r *Reconciler, owner metav1.Object, startTime metav1.Time, podsToTrack []*corev1.Pod, conditions ...corev1.PodCondition) []*corev1.Pod {
+func preparePayloadCrashloopingAboveThreshold(t *testing.T, r *Reconciler, namespace, namePrefix string) payloads {
+    standalonePods := make([]*corev1.Pod, 0, 1)
+    startTime := metav1.Now()
+    replicas := int32(3)
+    // Create a standalone pod with no owner which has at least one container with restart count > 50
+    pod := &corev1.Pod{
+        ObjectMeta: metav1.ObjectMeta{
+            Name:      fmt.Sprintf("%s%s-pod-fail", namePrefix, namespace),
+            Namespace: namespace,
+        },
+        Status: corev1.PodStatus{StartTime: &startTime, ContainerStatuses: []corev1.ContainerStatus{
+            {RestartCount: RestartCountOverThreshold},
+            {RestartCount: RestartCountWithinThresholdContainer2},
+        }},
+    }
+    err := r.AllNamespacesClient.Create(context.TODO(), pod)
+    require.NoError(t, err)
+    standalonePods = append(standalonePods, pod)
+    // Deployment
+    d := &appsv1.Deployment{
+        ObjectMeta: metav1.ObjectMeta{Name: fmt.Sprintf("%s%s-deployment", namePrefix, namespace), Namespace: namespace},
+        Spec:       appsv1.DeploymentSpec{Replicas: &replicas},
+    }
+    err = r.AllNamespacesClient.Create(context.TODO(), d)
+    require.NoError(t, err)
+    rs := &appsv1.ReplicaSet{
+        ObjectMeta: metav1.ObjectMeta{Name: fmt.Sprintf("%s-replicaset", d.Name), Namespace: namespace},
+        Spec:       appsv1.ReplicaSetSpec{Replicas: &replicas},
+    }
+    err = controllerutil.SetControllerReference(d, rs, r.Scheme)
+    require.NoError(t, err)
+    err = r.AllNamespacesClient.Create(context.TODO(), rs)
+    require.NoError(t, err)
+    controlledPods := createPods(t, r, rs, startTime, make([]*corev1.Pod, 0, 3), true)
+
+    allPods := append(standalonePods, controlledPods...)
+    return payloads{
+        standalonePods: standalonePods,
+        allPods:        allPods,
+        controlledPods: controlledPods,
+        deployment:     d,
+    }
+}
+
+func preparePayloadCrashloopingPodsWithinThreshold(t *testing.T, r *Reconciler, namespace, namePrefix string, times payloadStartTimes) payloads {
+    startTime := metav1.NewTime(times.defaultStartTime)
+    replicas := int32(3)
+    controlledPods := make([]*corev1.Pod, 0, 3)
+    // Create a StatefulSet with crashlooping pods whose restart counts stay below the threshold
+    sts := &appsv1.StatefulSet{
+        ObjectMeta: metav1.ObjectMeta{Name: fmt.Sprintf("%s%s-statefulset", namePrefix, namespace), Namespace: namespace},
+        Spec:       appsv1.StatefulSetSpec{Replicas: &replicas},
+    }
+    err := r.AllNamespacesClient.Create(context.TODO(), sts)
+    require.NoError(t, err)
+    for i := 0; i < int(replicas); i++ {
+        pod := &corev1.Pod{
+            ObjectMeta: metav1.ObjectMeta{Name: fmt.Sprintf("%s-pod-%d", sts.Name, i), Namespace: sts.Namespace},
+            Status: corev1.PodStatus{StartTime: &startTime, ContainerStatuses: []corev1.ContainerStatus{
+                {RestartCount: RestartCountWithinThresholdContainer1},
+                {RestartCount: RestartCountWithinThresholdContainer2},
+            }},
+        }
+        err := controllerutil.SetControllerReference(sts, pod, r.Scheme)
+        require.NoError(t, err)
+        controlledPods = append(controlledPods, pod)
+        err = r.AllNamespacesClient.Create(context.TODO(), pod)
+        require.NoError(t, err)
+    }
+    return payloads{
+        controlledPods: controlledPods,
+        statefulSet:    sts,
+        allPods:        controlledPods,
+    }
+}
+
+func createPods(t *testing.T, r *Reconciler, owner metav1.Object, startTime metav1.Time, podsToTrack []*corev1.Pod, isCrashlooping bool, conditions ...corev1.PodCondition) []*corev1.Pod {
     for i := 0; i < 3; i++ {
         pod := &corev1.Pod{
             ObjectMeta: metav1.ObjectMeta{Name: fmt.Sprintf("%s-pod-%d", owner.GetName(), i), Namespace: owner.GetNamespace()},
             Status:     corev1.PodStatus{StartTime: &startTime, Conditions: conditions},
         }
+        if isCrashlooping {
+            pod.Status = corev1.PodStatus{StartTime: &startTime, Conditions: conditions, ContainerStatuses: []corev1.ContainerStatus{
+                {RestartCount: 52},
+                {RestartCount: 24},
+            }}
+        }
         err := controllerutil.SetControllerReference(owner, pod, r.Scheme)
         require.NoError(t, err)
         podsToTrack = append(podsToTrack, pod)