SANDBOX-967: check restartCounts and idle crashlooping pods #630

Merged · merged 14 commits · Feb 27, 2025
144 changes: 94 additions & 50 deletions controllers/idler/idler_controller.go
@@ -18,7 +18,6 @@
"sigs.k8s.io/controller-runtime/pkg/log"
"sigs.k8s.io/controller-runtime/pkg/predicate"

"github.com/go-logr/logr"
openshiftappsv1 "github.com/openshift/api/apps/v1"
errs "github.com/pkg/errors"
"github.com/redhat-cop/operator-utils/pkg/util"
@@ -34,6 +33,11 @@
"sigs.k8s.io/controller-runtime/pkg/reconcile"
)

const (
RestartThreshold = 50
RequeueTimeThreshold = 300 * time.Second
)

var SupportedScaleResources = map[schema.GroupVersionKind]schema.GroupVersionResource{
schema.GroupVersion{Group: "camel.apache.org", Version: "v1"}.WithKind("Integration"): schema.GroupVersion{Group: "camel.apache.org", Version: "v1"}.WithResource("integrations"),
schema.GroupVersion{Group: "camel.apache.org", Version: "v1alpha1"}.WithKind("KameletBinding"): schema.GroupVersion{Group: "camel.apache.org", Version: "v1alpha1"}.WithResource("kameletbindings"),
@@ -107,20 +111,12 @@
return reconcile.Result{}, r.wrapErrorWithStatusUpdate(ctx, idler, r.setStatusFailed, err,
"failed to ensure idling '%s'", idler.Name)
}
// Find the earlier pod to kill and requeue. Otherwise, use the idler timeoutSeconds to requeue.
nextTime := nextPodToBeKilledAfter(logger, idler)
if nextTime == nil {
after := time.Duration(idler.Spec.TimeoutSeconds) * time.Second
logger.Info("requeueing for next pod to check", "after_seconds", after.Seconds())
return reconcile.Result{
Requeue: true,
RequeueAfter: after,
}, r.setStatusReady(ctx, idler)
}
logger.Info("requeueing for next pod to kill", "after_seconds", nextTime.Seconds())
// Requeue after the shortest of the following durations: idler.Spec.TimeoutSeconds, RequeueTimeThreshold, or nextPodToBeKilledAfter
after := findShortestRequeueDuration(idler)
logger.Info("requeueing for next pod to check", "after_seconds", after.Seconds())
return reconcile.Result{
Requeue: true,
RequeueAfter: *nextTime,
RequeueAfter: after,
}, r.setStatusReady(ctx, idler)
}

@@ -133,9 +129,20 @@
newStatusPods := make([]toolchainv1alpha1.Pod, 0, 10)
for _, pod := range podList.Items {
pod := pod // TODO We won't need it after upgrading to go 1.22: https://go.dev/blog/loopvar-preview
logger := log.FromContext(ctx)
podLogger := logger.WithValues("pod_name", pod.Name, "pod_phase", pod.Status.Phase)
podLogger := log.FromContext(ctx).WithValues("pod_name", pod.Name, "pod_phase", pod.Status.Phase)
podCtx := log.IntoContext(ctx, podLogger)
if trackedPod := findPodByName(idler, pod.Name); trackedPod != nil {
// check the restart count for the trackedPod
restartCount := getHighestRestartCount(pod.Status)
if restartCount > RestartThreshold {
podLogger.Info("Pod is restarting too often. Killing the pod", "restart_count", restartCount)
// Check if it belongs to a controller (Deployment, DeploymentConfig, etc) and scale it down to zero.
err := deletePodsAndCreateNotification(podCtx, pod, r, idler)
if err != nil {
return err
}

Check warning (Codecov / codecov/patch) on line 143 in controllers/idler/idler_controller.go#L142-L143: Added lines #L142 - L143 were not covered by tests
continue
}
timeoutSeconds := idler.Spec.TimeoutSeconds
if isOwnedByVM(pod.ObjectMeta) {
// use 1/12th of the timeout for VMs
@@ -144,43 +151,11 @@
// Already tracking this pod. Check the timeout.
if time.Now().After(trackedPod.StartTime.Add(time.Duration(timeoutSeconds) * time.Second)) {
podLogger.Info("Pod running for too long. Killing the pod.", "start_time", trackedPod.StartTime.Format("2006-01-02T15:04:05Z"), "timeout_seconds", timeoutSeconds)
var podreason string
podCondition := pod.Status.Conditions
for _, podCond := range podCondition {
if podCond.Type == "Ready" {
podreason = podCond.Reason
}
}

// Check if it belongs to a controller (Deployment, DeploymentConfig, etc) and scale it down to zero.
lctx := log.IntoContext(ctx, podLogger)
appType, appName, deletedByController, err := r.scaleControllerToZero(lctx, pod.ObjectMeta)
err := deletePodsAndCreateNotification(podCtx, pod, r, idler)
if err != nil {
return err
}
if !deletedByController { // Pod not managed by a controller. We can just delete the pod.
logger.Info("Deleting pod without controller")
if err := r.AllNamespacesClient.Delete(ctx, &pod); err != nil {
return err
}
podLogger.Info("Pod deleted")
}

if appName == "" {
appName = pod.Name
appType = "Pod"
}
// Send notification if the deleted pod was managed by a controller or was a standalone pod that was not completed
// eg. If a build pod is in "PodCompleted" status then it was not running so there's no reason to send an idler notification
if podreason != "PodCompleted" || deletedByController {
// By now either a pod has been deleted or scaled to zero by controller, idler Triggered notification should be sent
if err := r.createNotification(ctx, idler, appName, appType); err != nil {
logger.Error(err, "failed to create Notification")
if err = r.setStatusIdlerNotificationCreationFailed(ctx, idler, err.Error()); err != nil {
logger.Error(err, "failed to set status IdlerNotificationCreationFailed")
} // not returning error to continue tracking remaining pods
}
}

} else {
newStatusPods = append(newStatusPods, *trackedPod) // keep tracking
Comment on lines 159 to 161

Contributor: only a small detail, another option (to keep it consistent with the previous section) would be

				continue
			}
			newStatusPods = append(newStatusPods, *trackedPod) // keep tracking

but feel free to keep it as it is

Contributor (author): I've kept it as it is for now - making the change had some test failures in the number of pods being tracked - I didn't debug further for now

@@ -198,6 +173,57 @@
return r.updateStatusPods(ctx, idler, newStatusPods)
}

// deletePodsAndCreateNotification checks if the pod belongs to a controller (Deployment, DeploymentConfig, etc.) and scales it down to zero.
// If it is a standalone pod, it deletes the pod directly.
// A notification is sent if the deleted pod was managed by a controller, was a standalone pod that was not completed, or was crashlooping.
func deletePodsAndCreateNotification(podCtx context.Context, pod corev1.Pod, r *Reconciler, idler *toolchainv1alpha1.Idler) error {
logger := log.FromContext(podCtx)
var podReason string
podCondition := pod.Status.Conditions
for _, podCond := range podCondition {
if podCond.Type == "Ready" {
podReason = podCond.Reason
}
}
appType, appName, deletedByController, err := r.scaleControllerToZero(podCtx, pod.ObjectMeta)
if err != nil {
return err
}
if !deletedByController { // Pod not managed by a controller. We can just delete the pod.
logger.Info("Deleting pod without controller")
if err := r.AllNamespacesClient.Delete(podCtx, &pod); err != nil {
return err
}
logger.Info("Pod deleted")
}
if appName == "" {
appName = pod.Name
appType = "Pod"
}

// If a build pod is in "PodCompleted" status then it was not running so there's no reason to send an idler notification
if podReason != "PodCompleted" || deletedByController {
// By now either a pod has been deleted or scaled to zero by controller, idler Triggered notification should be sent
if err := r.createNotification(podCtx, idler, appName, appType); err != nil {
logger.Error(err, "failed to create Notification")
if err = r.setStatusIdlerNotificationCreationFailed(podCtx, idler, err.Error()); err != nil {
logger.Error(err, "failed to set status IdlerNotificationCreationFailed")
} // not returning error to continue tracking remaining pods
}
}
return nil
}
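
The notification decision above reduces to a single predicate: notify unless the pod was a standalone pod that had already completed. A minimal sketch of that rule, using a hypothetical helper (shouldNotify is not part of this PR; it just restates the inline condition):

// shouldNotify illustrates the rule implemented inline above: a notification is sent
// when the pod was scaled down via its controller, or when it was a standalone pod
// whose Ready condition reason is anything other than "PodCompleted".
func shouldNotify(podReadyReason string, deletedByController bool) bool {
	return podReadyReason != "PodCompleted" || deletedByController
}

For example, a finished build pod (reason "PodCompleted", no owning controller) is deleted without a notification, while a crashlooping standalone pod still triggers one.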

func getHighestRestartCount(podstatus corev1.PodStatus) int32 {
var restartCount int32
for _, status := range podstatus.ContainerStatuses {
if restartCount < status.RestartCount {
restartCount = status.RestartCount
}
}
return restartCount
}
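
A unit-test sketch for the helper above (the test name and fixture values are illustrative and not part of this PR; it assumes the test sits in the same idler package, using the standard testing package and the existing corev1 import):

func TestGetHighestRestartCount(t *testing.T) {
	// three containers with different restart counts; the helper should return the highest
	status := corev1.PodStatus{
		ContainerStatuses: []corev1.ContainerStatus{
			{Name: "main", RestartCount: 3},
			{Name: "sidecar", RestartCount: 52},
			{Name: "helper", RestartCount: 0},
		},
	}
	if got := getHighestRestartCount(status); got != 52 {
		t.Errorf("expected highest restart count 52, got %d", got)
	}
}

A count of 52 is above RestartThreshold (50), so in the reconcile loop above such a pod is deleted immediately rather than waiting for the idler timeout.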

func (r *Reconciler) createNotification(ctx context.Context, idler *toolchainv1alpha1.Idler, appName string, appType string) error {
log.FromContext(ctx).Info("Create Notification")
//Get the HostClient
@@ -579,7 +605,7 @@
// nextPodToBeKilledAfter checks the start times of all the tracked pods in the Idler and the timeout left
// for the next pod to be killed.
// If there is no pod to kill, the func returns `nil`
func nextPodToBeKilledAfter(log logr.Logger, idler *toolchainv1alpha1.Idler) *time.Duration {
func nextPodToBeKilledAfter(idler *toolchainv1alpha1.Idler) *time.Duration {
if len(idler.Status.Pods) == 0 {
// no pod tracked, so nothing to kill
return nil
@@ -595,10 +621,28 @@
if d < 0 {
d = 0
}
log.Info("next pod to kill", "after", d)
return &d
}

// findShortestRequeueDuration finds the shortest of the candidate durations
// and returns the duration after which the idler should be requeued
func findShortestRequeueDuration(idler *toolchainv1alpha1.Idler) time.Duration {
durations := make([]*time.Duration, 0, 3)
nextPodToKillAfter := nextPodToBeKilledAfter(idler)
maxRequeueDuration := RequeueTimeThreshold
idlerTimeoutDuration := time.Duration(idler.Spec.TimeoutSeconds) * time.Second
durations = append(durations, nextPodToKillAfter, &maxRequeueDuration, &idlerTimeoutDuration)
var shortest *time.Duration
for _, d := range durations {
if d != nil {
if shortest == nil || *d < *shortest {
shortest = d
}
}
}
return *shortest
}
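
A possible unit-test sketch of the shortest-duration selection (the test and the IdlerSpec literal are assumptions based on the usual toolchain-api type names, not part of this PR): with no tracked pods and an 8-hour idler timeout, nextPodToBeKilledAfter returns nil and the 300-second RequeueTimeThreshold is the shortest remaining candidate.

func TestFindShortestRequeueDuration(t *testing.T) {
	// no tracked pods and an 8h timeout: the 300s RequeueTimeThreshold wins
	idler := &toolchainv1alpha1.Idler{
		Spec: toolchainv1alpha1.IdlerSpec{TimeoutSeconds: 28800},
	}
	if got := findShortestRequeueDuration(idler); got != RequeueTimeThreshold {
		t.Errorf("expected %s, got %s", RequeueTimeThreshold, got)
	}
}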

// updateStatusPods updates the status pods to the new ones but only if something changed. Order is ignored.
func (r *Reconciler) updateStatusPods(ctx context.Context, idler *toolchainv1alpha1.Idler, newPods []toolchainv1alpha1.Pod) error {
nothingChanged := len(idler.Status.Pods) == len(newPods)