diff --git a/pkg/controllers/sidecar/checks.go b/pkg/controllers/sidecar/checks.go index 27a3fb5e245..71345016ca8 100644 --- a/pkg/controllers/sidecar/checks.go +++ b/pkg/controllers/sidecar/checks.go @@ -5,6 +5,7 @@ import ( "fmt" "net/http" + "github.com/scylladb/scylla-operator/pkg/controllers/sidecar/identity" "github.com/scylladb/scylla-operator/pkg/util/network" "github.com/pkg/errors" @@ -25,18 +26,41 @@ func (mc *MemberReconciler) setupHTTPChecks(ctx context.Context) { } } +func nodeUnderMaintenance(ctx context.Context, mc *MemberReconciler) (bool, error) { + member, err := identity.Retrieve(ctx, mc.member.Name, mc.member.Namespace, mc.kubeClient) + if err != nil { + return false, errors.Wrap(err, "get member service") + } + + _, ok := member.ServiceLabels[naming.NodeMaintenanceLabel] + return ok, nil +} + func livenessCheck(mc *MemberReconciler) func(http.ResponseWriter, *http.Request) { return func(w http.ResponseWriter, req *http.Request) { + ctx := log.WithTraceID(req.Context()) + + if maintenance, err := nodeUnderMaintenance(ctx, mc); err != nil { + w.WriteHeader(http.StatusServiceUnavailable) + mc.logger.Error(ctx, "Liveness check failed", "error", err) + return + } else if maintenance { + // During maintenance Pod should stay alive. 
+ w.WriteHeader(http.StatusOK) + mc.logger.Info(ctx, "Node under maintenance") + return + } + host, err := network.FindFirstNonLocalIP() if err != nil { w.WriteHeader(http.StatusServiceUnavailable) - mc.logger.Error(log.WithTraceID(req.Context()), "Liveness check failed", "error", err) + mc.logger.Error(ctx, "Liveness check failed", "error", err) return } // Check if JMX is reachable _, err = mc.scyllaClient.Ping(context.Background(), host.String()) if err != nil { - mc.logger.Error(log.WithTraceID(req.Context()), "Liveness check failed", "error", err) + mc.logger.Error(ctx, "Liveness check failed", "error", err) w.WriteHeader(http.StatusServiceUnavailable) return } @@ -48,6 +72,17 @@ func readinessCheck(mc *MemberReconciler) func(http.ResponseWriter, *http.Reques return func(w http.ResponseWriter, req *http.Request) { ctx := log.WithTraceID(req.Context()) + if maintenance, err := nodeUnderMaintenance(ctx, mc); err != nil { + w.WriteHeader(http.StatusServiceUnavailable) + mc.logger.Error(ctx, "Readiness check failed", "error", err) + return + } else if maintenance { + // During maintenance Pod shouldn't be declared ready. + w.WriteHeader(http.StatusServiceUnavailable) + mc.logger.Info(ctx, "Node under maintenance") + return + } + host, err := network.FindFirstNonLocalIP() if err != nil { w.WriteHeader(http.StatusServiceUnavailable) diff --git a/pkg/naming/constants.go b/pkg/naming/constants.go index 666f35df6dc..d3af0771d12 100644 --- a/pkg/naming/constants.go +++ b/pkg/naming/constants.go @@ -22,6 +22,10 @@ const ( // ReplaceLabel express the intent to replace pod under the specific member. ReplaceLabel = "scylla/replace" + // NodeMaintenanceLabel means that node is under maintenance. + // Readiness check will always fail when this label is added to member service. + NodeMaintenanceLabel = "scylla/node-maintenance" + LabelValueTrue = "true" LabelValueFalse = "false" )