From e52071d7f9e5a05dc8b3f9d2ee48265a55c6fc14 Mon Sep 17 00:00:00 2001 From: Andrey Smirnov Date: Fri, 17 Sep 2021 17:22:28 +0300 Subject: [PATCH] fix: shutdown sidero-controller-manager when any component fails Fixes #560 The way it was implemented before this change, `errgroup` waits for all goroutines to finish before it returns, so if the controller crashes due to election issues, the container still keeps running as the HTTP API is up. After this change, the container crashes on the first error. Also added liveness/readiness checks; they won't help much with this issue, but provide an additional layer of protection/visibility. Signed-off-by: Andrey Smirnov --- .../config/manager/manager.yaml | 12 ++++++- .../internal/healthz/healthz.go | 19 +++++++++++ app/sidero-controller-manager/main.go | 34 ++++++++++++------- 3 files changed, 52 insertions(+), 13 deletions(-) create mode 100644 app/sidero-controller-manager/internal/healthz/healthz.go diff --git a/app/sidero-controller-manager/config/manager/manager.yaml b/app/sidero-controller-manager/config/manager/manager.yaml index 5ffac713f..2e3a3f280 100644 --- a/app/sidero-controller-manager/config/manager/manager.yaml +++ b/app/sidero-controller-manager/config/manager/manager.yaml @@ -65,7 +65,7 @@ spec: containerPort: 69 protocol: UDP - name: http - containerPort: 8081 + containerPort: ${SIDERO_CONTROLLER_MANAGER_API_PORT:=8081} protocol: TCP env: - name: API_ENDPOINT @@ -79,4 +79,14 @@ spec: requests: cpu: 100m memory: 128Mi + readinessProbe: + httpGet: + path: /healthz + port: http + initialDelaySeconds: 15 + livenessProbe: + httpGet: + path: /healthz + port: http + initialDelaySeconds: 15 terminationGracePeriodSeconds: 10 diff --git a/app/sidero-controller-manager/internal/healthz/healthz.go b/app/sidero-controller-manager/internal/healthz/healthz.go new file mode 100644 index 000000000..d924aa72b --- /dev/null +++ b/app/sidero-controller-manager/internal/healthz/healthz.go @@ -0,0 +1,19 @@ +// This Source Code Form is subject to the 
terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at http://mozilla.org/MPL/2.0/. + +package healthz + +import ( + "net/http" +) + +func RegisterServer(mux *http.ServeMux) error { + mux.HandleFunc("/healthz", healthzHandler) + + return nil +} + +func healthzHandler(w http.ResponseWriter, req *http.Request) { + // do nothing, consider to be healthy always +} diff --git a/app/sidero-controller-manager/main.go b/app/sidero-controller-manager/main.go index 4ac474253..8c9b45e65 100644 --- a/app/sidero-controller-manager/main.go +++ b/app/sidero-controller-manager/main.go @@ -16,7 +16,6 @@ import ( debug "github.com/talos-systems/go-debug" "golang.org/x/net/http2" "golang.org/x/net/http2/h2c" - "golang.org/x/sync/errgroup" corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/runtime" "k8s.io/client-go/kubernetes" @@ -32,6 +31,7 @@ import ( infrav1 "github.com/talos-systems/sidero/app/caps-controller-manager/api/v1alpha3" metalv1alpha1 "github.com/talos-systems/sidero/app/sidero-controller-manager/api/v1alpha1" "github.com/talos-systems/sidero/app/sidero-controller-manager/controllers" + "github.com/talos-systems/sidero/app/sidero-controller-manager/internal/healthz" "github.com/talos-systems/sidero/app/sidero-controller-manager/internal/ipxe" "github.com/talos-systems/sidero/app/sidero-controller-manager/internal/metadata" "github.com/talos-systems/sidero/app/sidero-controller-manager/internal/power/api" @@ -200,13 +200,16 @@ func main() { } // +kubebuilder:scaffold:builder + errCh := make(chan error) + setupLog.Info("starting TFTP server") go func() { if err := tftp.ServeTFTP(); err != nil { setupLog.Error(err, "unable to start TFTP server", "controller", "Environment") - os.Exit(1) } + + errCh <- err }() httpMux := http.NewServeMux() @@ -225,6 +228,13 @@ func main() { os.Exit(1) } + setupLog.Info("starting healthz server") + + if err := healthz.RegisterServer(httpMux); err != nil { + 
setupLog.Error(err, "unable to start healthz server", "controller", "Environment") + os.Exit(1) + } + setupLog.Info("starting internal API server") apiRecorder := eventBroadcaster.NewRecorder( @@ -251,18 +261,16 @@ func main() { setupLog.Info("starting manager and HTTP server") - var eg errgroup.Group - - eg.Go(func() error { + go func() { err := mgr.Start(ctrl.SetupSignalHandler()) if err != nil { setupLog.Error(err, "problem running manager") } - return err - }) + errCh <- err + }() - eg.Go(func() error { + go func() { // Go standard library doesn't support running HTTP/2 on non-TLS HTTP connections. // Package h2c provides handling for HTTP/2 over plaintext connection. // gRPC provides its own HTTP/2 server implementation, so that's not an issue for gRPC, @@ -288,10 +296,12 @@ func main() { setupLog.Error(err, "problem running HTTP server") } - return err - }) + errCh <- err + }() - if err := eg.Wait(); err != nil { - os.Exit(1) + for err = range errCh { + if err != nil { + os.Exit(1) + } } }