Skip to content

Commit

Permalink
operator: Enhanced K8s Lease resource handling
Browse files Browse the repository at this point in the history
* Refactored K8s Lease resource handling
* Added K8s Lease configuration options

Signed-off-by: Philip Schmid <[email protected]>
  • Loading branch information
PhilipSchmid committed Feb 28, 2025
1 parent d7193ed commit 54e7945
Show file tree
Hide file tree
Showing 6 changed files with 140 additions and 28 deletions.
6 changes: 6 additions & 0 deletions docs/content/en/docs/reference/helm-chart.md

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 6 additions & 0 deletions install/kubernetes/tetragon/README.md

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,10 @@ data:
skip-tracing-policy-crd: {{ not .Values.tetragonOperator.tracingPolicy.enabled | quote }}
force-update-crds: {{ .Values.tetragonOperator.forceUpdateCRDs | quote }}
{{- if gt (int .Values.tetragonOperator.replicas) 1 }}
leader-elect: "true"
leader-elect: {{ .Values.tetragonOperator.failoverLease.enabled | quote }}
leader-election-namespace: {{ .Values.tetragonOperator.failoverLease.namespace | quote }}
leader-election-lease-duration: {{ .Values.tetragonOperator.failoverLease.leaseDuration | quote }}
leader-election-renew-deadline: {{ .Values.tetragonOperator.failoverLease.leaseRenewDeadline | quote }}
leader-election-retry-period: {{ .Values.tetragonOperator.failoverLease.leaseRetryPeriod | quote }}
{{- end }}
{{- end }}
12 changes: 12 additions & 0 deletions install/kubernetes/tetragon/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -253,6 +253,18 @@ tetragonOperator:
enabled: true
# -- Number of replicas to run for the tetragon-operator deployment
replicas: 1
# -- Lease handling for an automated failover when running multiple replicas
failoverLease:
# -- Enable lease failover functionality
enabled: false
# -- Kubernetes namespace in which the Lease resource is created. If empty, defaults to the namespace in which Tetragon is deployed.
namespace: ""
# -- If the lease is not renewed within this duration, the current leader is considered dead and a new leader is picked. Must be greater than leaseRenewDeadline.
leaseDuration: 15s
# -- How long the current leader keeps retrying to renew the lease before giving up leadership. Must be greater than leaseRetryPeriod.
leaseRenewDeadline: 5s
# -- How long leader election clients wait between attempts to acquire or renew the lease
leaseRetryPeriod: 2s
# -- Annotations for the Tetragon Operator Deployment.
annotations: {}
# -- Annotations for the Tetragon Operator Deployment Pods.
Expand Down
87 changes: 60 additions & 27 deletions operator/cmd/serve/serve.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ package serve

import (
"fmt"
"time"

"github.com/bombsimon/logrusr/v4"
"github.com/cilium/cilium/pkg/logging"
Expand All @@ -24,19 +25,31 @@ import (
"sigs.k8s.io/controller-runtime/pkg/webhook"
)

// LeaderElectionID is the name of the Lease resource used for leader
// election. It is passed to the controller manager as its leader election ID
// and included in the log line emitted once this instance is elected.
const LeaderElectionID = "tetragon-operator-resource-lock"

var (
metricsAddr string
enableLeaderElection bool
probeAddr string
scheme = runtime.NewScheme()
setupLog = ctrl.Log.WithName("setup")
scheme = runtime.NewScheme()
setupLog = ctrl.Log.WithName("setup")
)

func init() {
utilruntime.Must(clientgoscheme.AddToScheme(scheme))
utilruntime.Must(ciliumiov1alpha1.AddToScheme(scheme))
}

// validateLeaderElectionParams checks that the configured leader election
// timings are mutually consistent, so that a misconfiguration is reported
// with the operator's flag names before the manager is created.
//
// The checks are applied only when leader election is enabled; the timings
// are not validated otherwise. When enabled, the following must hold:
//   - leader-election-retry-period is greater than zero,
//   - leader-election-lease-duration is greater than
//     leader-election-renew-deadline,
//   - leader-election-renew-deadline is greater than
//     leader-election-retry-period multiplied by the jitter factor.
//
// Together these also guarantee that all three durations are positive.
// It returns nil when leader election is disabled or every check passes, and
// an error naming the offending flags otherwise.
func validateLeaderElectionParams() error {
	// jitterFactor is meant to mirror the factor client-go's leader elector
	// applies to the retry period when validating the renew deadline.
	// NOTE(review): confirm the value against the vendored client-go version.
	const jitterFactor = 1.2

	if !operatorOption.Config.EnableLeaderElection {
		return nil
	}

	leaseDuration := operatorOption.Config.LeaderElectionLeaseDuration
	renewDeadline := operatorOption.Config.LeaderElectionRenewDeadline
	retryPeriod := operatorOption.Config.LeaderElectionRetryPeriod

	if retryPeriod <= 0 {
		return fmt.Errorf("leader-election-retry-period must be greater than zero")
	}
	if leaseDuration <= renewDeadline {
		return fmt.Errorf("leader-election-lease-duration must be greater than leader-election-renew-deadline")
	}
	// The retry period is jittered, so the renew deadline has to exceed the
	// largest possible wait between two attempts, not just the nominal period.
	if renewDeadline <= time.Duration(jitterFactor*float64(retryPeriod)) {
		return fmt.Errorf("leader-election-renew-deadline must be greater than leader-election-retry-period multiplied by the jitter factor %v", jitterFactor)
	}
	return nil
}

func New() *cobra.Command {
cmd := cobra.Command{
Use: "serve",
Expand All @@ -45,29 +58,40 @@ func New() *cobra.Command {
log := logrusr.New(logging.DefaultLogger.WithField(logfields.LogSubsys, "operator"))
ctrl.SetLogger(log)
common.Initialize(cmd)
if err := validateLeaderElectionParams(); err != nil {
return fmt.Errorf("invalid leader election parameters: %w", err)
}
mgr, err := ctrl.NewManager(ctrl.GetConfigOrDie(), ctrl.Options{
Scheme: scheme,
Metrics: metricsserver.Options{BindAddress: metricsAddr},
WebhookServer: webhook.NewServer(webhook.Options{Port: 9443}),
HealthProbeBindAddress: probeAddr,
LeaderElection: enableLeaderElection,
LeaderElectionID: "f161f714.tetragon.cilium.io",
// LeaderElectionReleaseOnCancel defines if the leader should step down voluntarily
// when the Manager ends. This requires the binary to immediately end when the
// Manager is stopped, otherwise, this setting is unsafe. Setting this significantly
// speeds up voluntary leader transitions as the new leader don't have to wait
// LeaseDuration time first.
//
// In the default scaffold provided, the program ends immediately after
// the manager stops, so would be fine to enable this option. However,
// if you are doing or is intended to do any operation such as perform cleanups
// after the manager stops then its usage might be unsafe.
// LeaderElectionReleaseOnCancel: true,
Scheme: scheme,
Metrics: metricsserver.Options{BindAddress: operatorOption.Config.MetricsAddr},
WebhookServer: webhook.NewServer(webhook.Options{Port: 9443}),
HealthProbeBindAddress: operatorOption.Config.ProbeAddr,
LeaderElection: operatorOption.Config.EnableLeaderElection,
LeaderElectionID: LeaderElectionID,
LeaderElectionNamespace: operatorOption.Config.LeaderElectionNamespace,
LeaderElectionReleaseOnCancel: true,
LeaseDuration: &operatorOption.Config.LeaderElectionLeaseDuration,
RenewDeadline: &operatorOption.Config.LeaderElectionRenewDeadline,
RetryPeriod: &operatorOption.Config.LeaderElectionRetryPeriod,
})
if err != nil {
return fmt.Errorf("unable to start manager: %w", err)
}

// Log once this manager has been elected leader. This runs in a separate goroutine and does not block startup.
if operatorOption.Config.EnableLeaderElection {
go func() {
for {
select {
case <-mgr.Elected():
setupLog.Info("elected leader", "lease", LeaderElectionID, "namespace", operatorOption.Config.LeaderElectionNamespace)
return
case <-time.After(operatorOption.Config.LeaderElectionRetryPeriod):
}
}
}()
}

if !operatorOption.Config.SkipPodInfoCRD {
if err = (&podinfo.Reconciler{
Client: mgr.GetClient(),
Expand All @@ -83,19 +107,28 @@ func New() *cobra.Command {
return fmt.Errorf("unable to set up ready check %w", err)
}

setupLog.Info("starting manager")
setupLog.Info("starting manager", "metricsAddr", operatorOption.Config.MetricsAddr, "probeAddr", operatorOption.Config.ProbeAddr, "leaderElection", operatorOption.Config.EnableLeaderElection)
if err := mgr.Start(ctrl.SetupSignalHandler()); err != nil {
return fmt.Errorf("problem running manager %w", err)
}
setupLog.Info("manager stopped gracefully")
return nil
},
}
cmd.Flags().StringVar(&metricsAddr, "metrics-bind-address", "0", "The address the metric endpoint binds to.")
cmd.Flags().StringVar(&probeAddr, "health-probe-bind-address", ":8081", "The address the probe endpoint binds to.")
cmd.Flags().BoolVar(&enableLeaderElection, "leader-elect", false,
common.AddCommonFlags(&cmd)
cmd.Flags().StringVar(&operatorOption.Config.MetricsAddr, "metrics-bind-address", "0", "The address the metric endpoint binds to.")
cmd.Flags().StringVar(&operatorOption.Config.ProbeAddr, "health-probe-bind-address", ":8081", "The address the probe endpoint binds to.")
cmd.Flags().BoolVar(&operatorOption.Config.EnableLeaderElection, "leader-elect", false,
"Enable leader election for controller manager. "+
"Enabling this will ensure there is only one active controller manager.")
common.AddCommonFlags(&cmd)
cmd.Flags().StringVar(&operatorOption.Config.LeaderElectionNamespace, "leader-election-namespace", "",
"Kubernetes namespace in which the leader election Lease resource should be created.")
cmd.Flags().DurationVar(&operatorOption.Config.LeaderElectionLeaseDuration, "leader-election-lease-duration", 15*time.Second,
"Duration that non-leader operator candidates will wait before forcing to acquire leadership")
cmd.Flags().DurationVar(&operatorOption.Config.LeaderElectionRenewDeadline, "leader-election-renew-deadline", 5*time.Second,
"Duration that current acting master will retry refreshing leadership in before giving up the lock")
cmd.Flags().DurationVar(&operatorOption.Config.LeaderElectionRetryPeriod, "leader-election-retry-period", 2*time.Second,
"Duration that LeaderElector clients should wait between retries of the actions")
viper.BindPFlags(cmd.Flags())
return &cmd
}
51 changes: 51 additions & 0 deletions operator/option/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
package option

import (
"time"

"github.com/spf13/viper"
)

Expand Down Expand Up @@ -34,6 +36,27 @@ const (
// ForceUpdateCRDs specifies whether operator should ignore current CRD version
// and forcefully update it.
ForceUpdateCRDs = "force-update-crds"

// MetricsAddr is the address the metric endpoint binds to.
MetricsAddr = "metrics-bind-address"

// ProbeAddr is the address the probe endpoint binds to.
ProbeAddr = "health-probe-bind-address"

// EnableLeaderElection enables leader election for controller manager.
EnableLeaderElection = "leader-elect"

// LeaderElectionNamespace is the Kubernetes namespace in which the leader election Lease resource should be created.
LeaderElectionNamespace = "leader-election-namespace"

// LeaderElectionLeaseDuration is the duration that non-leader operator candidates will wait before forcing to acquire leadership.
LeaderElectionLeaseDuration = "leader-election-lease-duration"

// LeaderElectionRenewDeadline is the duration that current acting master will retry refreshing leadership in before giving up the lock.
LeaderElectionRenewDeadline = "leader-election-renew-deadline"

// LeaderElectionRetryPeriod is the duration that LeaderElector clients should wait between retries of the actions.
LeaderElectionRetryPeriod = "leader-election-retry-period"
)

// OperatorConfig is the configuration used by the operator.
Expand All @@ -58,6 +81,27 @@ type OperatorConfig struct {
// ForceUpdateCRDs forces the CRD to be updated even if it's version
// is lower than the one in the cluster.
ForceUpdateCRDs bool

// MetricsAddr is the address the metric endpoint binds to.
MetricsAddr string

// ProbeAddr is the address the probe endpoint binds to.
ProbeAddr string

// EnableLeaderElection enables leader election for controller manager.
EnableLeaderElection bool

// LeaderElectionNamespace is the Kubernetes namespace in which the leader election Lease resource should be created.
LeaderElectionNamespace string

// LeaderElectionLeaseDuration is the duration that non-leader operator candidates will wait before forcing to acquire leadership.
LeaderElectionLeaseDuration time.Duration

// LeaderElectionRenewDeadline is the duration that current acting master will retry refreshing leadership in before giving up the lock.
LeaderElectionRenewDeadline time.Duration

// LeaderElectionRetryPeriod is the duration that LeaderElector clients should wait between retries of the actions.
LeaderElectionRetryPeriod time.Duration
}

// Config represents the operator configuration.
Expand All @@ -71,4 +115,11 @@ func ConfigPopulate() {
Config.SkipPodInfoCRD = viper.GetBool(SkipPodInfoCRD)
Config.SkipTracingPolicyCRD = viper.GetBool(SkipTracingPolicyCRD)
Config.ForceUpdateCRDs = viper.GetBool(ForceUpdateCRDs)
Config.MetricsAddr = viper.GetString(MetricsAddr)
Config.ProbeAddr = viper.GetString(ProbeAddr)
Config.EnableLeaderElection = viper.GetBool(EnableLeaderElection)
Config.LeaderElectionNamespace = viper.GetString(LeaderElectionNamespace)
Config.LeaderElectionLeaseDuration = viper.GetDuration(LeaderElectionLeaseDuration)
Config.LeaderElectionRenewDeadline = viper.GetDuration(LeaderElectionRenewDeadline)
Config.LeaderElectionRetryPeriod = viper.GetDuration(LeaderElectionRetryPeriod)
}

0 comments on commit 54e7945

Please sign in to comment.