From 28f24791bb803c3bdee0baff8d937a50b201e1f1 Mon Sep 17 00:00:00 2001
From: Philip Schmid
Date: Thu, 27 Feb 2025 21:57:20 +0100
Subject: [PATCH] operator: Added K8s lease configuration options

Signed-off-by: Philip Schmid
---
 docs/content/en/docs/reference/helm-chart.md |  6 +++
 install/kubernetes/tetragon/README.md        |  6 +++
 .../templates/operator_configmap.yaml        |  6 ++-
 install/kubernetes/tetragon/values.yaml      | 12 +++++
 operator/cmd/serve/serve.go                  | 51 +++++++++++--------
 5 files changed, 58 insertions(+), 23 deletions(-)

diff --git a/docs/content/en/docs/reference/helm-chart.md b/docs/content/en/docs/reference/helm-chart.md
index 4230347944e..b0eed8dabfe 100644
--- a/docs/content/en/docs/reference/helm-chart.md
+++ b/docs/content/en/docs/reference/helm-chart.md
@@ -145,6 +145,12 @@ To use [the values available](#values), with `helm install` or `helm upgrade`, u
 | tetragonOperator.extraPodLabels | object | `{}` | Extra labels to be added on the Tetragon Operator Deployment Pods. |
 | tetragonOperator.extraVolumeMounts | list | `[]` | |
 | tetragonOperator.extraVolumes | list | `[]` | Extra volumes for the Tetragon Operator Deployment. |
+| tetragonOperator.failoverLease | object | `{"enabled":false,"leaseDuration":"15s","leaseRenewDeadline":"5s","leaseRetryPeriod":"2s","namespace":""}` | Lease handling for automated failover when running multiple replicas. |
+| tetragonOperator.failoverLease.enabled | bool | `false` | Enable lease-based failover functionality. |
+| tetragonOperator.failoverLease.leaseDuration | string | `"15s"` | If the lease is not renewed for this duration, the current leader is considered dead and a new leader is elected. |
+| tetragonOperator.failoverLease.leaseRenewDeadline | string | `"5s"` | The duration the current leader retries renewing the lease before giving up leadership. |
+| tetragonOperator.failoverLease.leaseRetryPeriod | string | `"2s"` | The duration to wait between retries of leader election actions. |
+| tetragonOperator.failoverLease.namespace | string | `""` | Kubernetes namespace in which the Lease resource is created. If empty, defaults to the namespace where Tetragon is deployed. |
 | tetragonOperator.forceUpdateCRDs | bool | `false` | |
 | tetragonOperator.image | object | `{"override":null,"pullPolicy":"IfNotPresent","repository":"quay.io/cilium/tetragon-operator","tag":"v1.3.0"}` | tetragon-operator image. |
 | tetragonOperator.nodeSelector | object | `{}` | Steer the Tetragon Operator Deployment Pod placement via nodeSelector, tolerations and affinity rules. |
diff --git a/install/kubernetes/tetragon/README.md b/install/kubernetes/tetragon/README.md
index 0674a0b6471..445623525f4 100644
--- a/install/kubernetes/tetragon/README.md
+++ b/install/kubernetes/tetragon/README.md
@@ -127,6 +127,12 @@ Helm chart for Tetragon
 | tetragonOperator.extraPodLabels | object | `{}` | Extra labels to be added on the Tetragon Operator Deployment Pods. |
 | tetragonOperator.extraVolumeMounts | list | `[]` | |
 | tetragonOperator.extraVolumes | list | `[]` | Extra volumes for the Tetragon Operator Deployment. |
+| tetragonOperator.failoverLease | object | `{"enabled":false,"leaseDuration":"15s","leaseRenewDeadline":"5s","leaseRetryPeriod":"2s","namespace":""}` | Lease handling for automated failover when running multiple replicas. |
+| tetragonOperator.failoverLease.enabled | bool | `false` | Enable lease-based failover functionality. |
+| tetragonOperator.failoverLease.leaseDuration | string | `"15s"` | If the lease is not renewed for this duration, the current leader is considered dead and a new leader is elected. |
+| tetragonOperator.failoverLease.leaseRenewDeadline | string | `"5s"` | The duration the current leader retries renewing the lease before giving up leadership. |
+| tetragonOperator.failoverLease.leaseRetryPeriod | string | `"2s"` | The duration to wait between retries of leader election actions. |
+| tetragonOperator.failoverLease.namespace | string | `""` | Kubernetes namespace in which the Lease resource is created. If empty, defaults to the namespace where Tetragon is deployed. |
 | tetragonOperator.forceUpdateCRDs | bool | `false` | |
 | tetragonOperator.image | object | `{"override":null,"pullPolicy":"IfNotPresent","repository":"quay.io/cilium/tetragon-operator","tag":"v1.3.0"}` | tetragon-operator image. |
 | tetragonOperator.nodeSelector | object | `{}` | Steer the Tetragon Operator Deployment Pod placement via nodeSelector, tolerations and affinity rules. |
diff --git a/install/kubernetes/tetragon/templates/operator_configmap.yaml b/install/kubernetes/tetragon/templates/operator_configmap.yaml
index 04a1666d39d..1fa5aa9a96d 100644
--- a/install/kubernetes/tetragon/templates/operator_configmap.yaml
+++ b/install/kubernetes/tetragon/templates/operator_configmap.yaml
@@ -16,6 +16,10 @@ data:
   skip-tracing-policy-crd: {{ not .Values.tetragonOperator.tracingPolicy.enabled | quote }}
   force-update-crds: {{ .Values.tetragonOperator.forceUpdateCRDs | quote }}
   {{- if gt (int .Values.tetragonOperator.replicas) 1 }}
-  leader-elect: "true"
+  leader-elect: {{ .Values.tetragonOperator.failoverLease.enabled | quote }}
+  leader-election-namespace: {{ .Values.tetragonOperator.failoverLease.namespace | quote }}
+  leader-election-lease-duration: {{ .Values.tetragonOperator.failoverLease.leaseDuration | quote }}
+  leader-election-renew-deadline: {{ .Values.tetragonOperator.failoverLease.leaseRenewDeadline | quote }}
+  leader-election-retry-period: {{ .Values.tetragonOperator.failoverLease.leaseRetryPeriod | quote }}
   {{- end }}
 {{- end }}
diff --git a/install/kubernetes/tetragon/values.yaml b/install/kubernetes/tetragon/values.yaml
index 20e5a3f51a6..510eb7e6674 100644
--- a/install/kubernetes/tetragon/values.yaml
+++ b/install/kubernetes/tetragon/values.yaml
@@ -253,6 +253,18 @@ tetragonOperator:
   enabled: true
   # -- Number of replicas to run for the tetragon-operator deployment
   replicas: 1
+  # -- Lease handling for automated failover when running multiple replicas.
+  failoverLease:
+    # -- Enable lease-based failover functionality.
+    enabled: false
+    # -- Kubernetes namespace in which the Lease resource is created. If empty, defaults to the namespace where Tetragon is deployed.
+    namespace: ""
+    # -- If the lease is not renewed for this duration, the current leader is considered dead and a new leader is elected.
+    leaseDuration: 15s
+    # -- The duration the current leader retries renewing the lease before giving up leadership.
+    leaseRenewDeadline: 5s
+    # -- The duration to wait between retries of leader election actions.
+    leaseRetryPeriod: 2s
   # -- Annotations for the Tetragon Operator Deployment.
   annotations: {}
   # -- Annotations for the Tetragon Operator Deployment Pods.
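For illustration, a minimal sketch of a values override exercising the new settings is shown below. The `tetragon-system` namespace is a hypothetical example, not a chart default, and the duration values are simply the defaults from `values.yaml`:

```yaml
# Sketch of a Helm values override enabling lease-based operator failover.
# "tetragon-system" is an example namespace; all other values are chart defaults.
tetragonOperator:
  replicas: 2                    # leader election only engages with more than one replica
  failoverLease:
    enabled: true
    namespace: "tetragon-system" # hypothetical; empty means the release namespace
    leaseDuration: 15s           # chart default
    leaseRenewDeadline: 5s       # chart default
    leaseRetryPeriod: 2s         # chart default
```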
diff --git a/operator/cmd/serve/serve.go b/operator/cmd/serve/serve.go index 939e42d29f1..4afdd461255 100644 --- a/operator/cmd/serve/serve.go +++ b/operator/cmd/serve/serve.go @@ -5,6 +5,7 @@ package serve import ( "fmt" + "time" "github.com/bombsimon/logrusr/v4" "github.com/cilium/cilium/pkg/logging" @@ -25,11 +26,15 @@ import ( ) var ( - metricsAddr string - enableLeaderElection bool - probeAddr string - scheme = runtime.NewScheme() - setupLog = ctrl.Log.WithName("setup") + metricsAddr string + enableLeaderElection bool + leaderElectionNamespace string + leaderElectionLeaseDuration time.Duration + leaderElectionRenewDeadline time.Duration + leaderElectionRetryPeriod time.Duration + probeAddr string + scheme = runtime.NewScheme() + setupLog = ctrl.Log.WithName("setup") ) func init() { @@ -46,23 +51,17 @@ func New() *cobra.Command { ctrl.SetLogger(log) common.Initialize(cmd) mgr, err := ctrl.NewManager(ctrl.GetConfigOrDie(), ctrl.Options{ - Scheme: scheme, - Metrics: metricsserver.Options{BindAddress: metricsAddr}, - WebhookServer: webhook.NewServer(webhook.Options{Port: 9443}), - HealthProbeBindAddress: probeAddr, - LeaderElection: enableLeaderElection, - LeaderElectionID: "f161f714.tetragon.cilium.io", - // LeaderElectionReleaseOnCancel defines if the leader should step down voluntarily - // when the Manager ends. This requires the binary to immediately end when the - // Manager is stopped, otherwise, this setting is unsafe. Setting this significantly - // speeds up voluntary leader transitions as the new leader don't have to wait - // LeaseDuration time first. - // - // In the default scaffold provided, the program ends immediately after - // the manager stops, so would be fine to enable this option. However, - // if you are doing or is intended to do any operation such as perform cleanups - // after the manager stops then its usage might be unsafe. - // LeaderElectionReleaseOnCancel: true, + Scheme: scheme, + Metrics: metricsserver.Options{BindAddress: metricsAddr}, + WebhookServer: webhook.NewServer(webhook.Options{Port: 9443}), + HealthProbeBindAddress: probeAddr, + LeaderElection: enableLeaderElection, + LeaderElectionID: "tetragon-operator-resource-lock", + LeaderElectionNamespace: leaderElectionNamespace, + LeaderElectionReleaseOnCancel: true, + LeaseDuration: &leaderElectionLeaseDuration, + RenewDeadline: &leaderElectionRenewDeadline, + RetryPeriod: &leaderElectionRetryPeriod, }) if err != nil { return fmt.Errorf("unable to start manager: %w", err) @@ -95,6 +94,14 @@ func New() *cobra.Command { cmd.Flags().BoolVar(&enableLeaderElection, "leader-elect", false, "Enable leader election for controller manager. 
"+ "Enabling this will ensure there is only one active controller manager.") + cmd.Flags().StringVar(&leaderElectionNamespace, "leader-election-namespace", "", + "Kubernetes namespace in which the leader election Lease resource should be created.") + cmd.Flags().DurationVar(&leaderElectionLeaseDuration, "leader-election-lease-duration", 15*time.Second, + "Duration that non-leader operator candidates will wait before forcing to acquire leadership") + cmd.Flags().DurationVar(&leaderElectionRenewDeadline, "leader-election-renew-deadline", 5*time.Second, + "Duration that current acting master will retry refreshing leadership in before giving up the lock") + cmd.Flags().DurationVar(&leaderElectionRetryPeriod, "leader-election-retry-period", 2*time.Second, + "Duration that LeaderElector clients should wait between retries of the actions") common.AddCommonFlags(&cmd) viper.BindPFlags(cmd.Flags()) return &cmd