Fix Maintenance Creation Check for Control Plane Nodes #110

Merged 7 commits on Jan 25, 2024
Changes from all commits
1 change: 1 addition & 0 deletions Dockerfile
@@ -18,6 +18,7 @@ RUN export GO_VERSION=$(grep -E "go [[:digit:]]\.[[:digit:]][[:digit:]]" go.mod
# Copy the go source
COPY api/ api/
COPY controllers/ controllers/
COPY pkg/ pkg/
COPY hack/ hack/
COPY main.go main.go
COPY vendor/ vendor/
2 changes: 1 addition & 1 deletion Makefile
@@ -188,7 +188,7 @@ test: test-no-verify verify-unchanged ## Generate and format code, run tests, ge

.PHONY: test-no-verify
test-no-verify: manifests generate go-verify test-imports fmt vet envtest ginkgo ## Generate and format code, and run tests
KUBEBUILDER_ASSETS="$(shell $(ENVTEST) use $(ENVTEST_K8S_VERSION) -p path --bin-dir $(LOCALBIN))" $(GINKGO) -r --keep-going --require-suite --vv ./api/... ./controllers/... --coverprofile cover.out
KUBEBUILDER_ASSETS="$(shell $(ENVTEST) use $(ENVTEST_K8S_VERSION) -p path --bin-dir $(LOCALBIN))" $(GINKGO) -r --keep-going --require-suite --vv ./api/... ./pkg/... ./controllers/... --coverprofile cover.out

.PHONY: bundle-run
bundle-run: operator-sdk ## Run bundle image. Default NS is "openshift-workload-availability", redefine OPERATOR_NAMESPACE to override it.
75 changes: 33 additions & 42 deletions api/v1beta1/nodemaintenance_webhook.go
@@ -20,10 +20,10 @@ import (
"context"
"fmt"

"github.com/medik8s/common/pkg/etcd"
"github.com/medik8s/common/pkg/nodes"

v1 "k8s.io/api/core/v1"
policyv1 "k8s.io/api/policy/v1"
corev1 "k8s.io/api/core/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/types"
@@ -37,8 +37,8 @@ import (
const (
ErrorNodeNotExists = "invalid nodeName, no node with name %s found"
ErrorNodeMaintenanceExists = "invalid nodeName, a NodeMaintenance for node %s already exists"
ErrorControlPlaneQuorumViolation = "cannot put master/control-plane node into maintenance at this moment, disrupting node %s will violate etcd quorum"
ErrorNodeNameUpdateForbidden = "updating spec.NodeName isn't allowed"
ErrorControlPlaneQuorumViolation = "cannot put master/control-plane node into maintenance at this moment, it would violate the master/control-plane node quorum"
)

const (
@@ -55,15 +55,17 @@ var nodemaintenancelog = logf.Log.WithName("nodemaintenance-resource")
// NodeMaintenanceValidator validates NodeMaintenance resources. Needed because we need a client for validation
// +k8s:deepcopy-gen=false
type NodeMaintenanceValidator struct {
client client.Client
isOpenShift bool
}

var validator *NodeMaintenanceValidator

func (r *NodeMaintenance) SetupWebhookWithManager(mgr ctrl.Manager) error {
func (r *NodeMaintenance) SetupWebhookWithManager(isOpenShift bool, mgr ctrl.Manager) error {
// init the validator!
validator = &NodeMaintenanceValidator{
client: mgr.GetClient(),
isOpenShift: isOpenShift,
}

return ctrl.NewWebhookManagedBy(mgr).
@@ -108,20 +108,21 @@ func (r *NodeMaintenance) ValidateDelete() (admission.Warnings, error) {

func (v *NodeMaintenanceValidator) ValidateCreate(nm *NodeMaintenance) error {
// Validate that node with given name exists
if err := v.validateNodeExists(nm.Spec.NodeName); err != nil {
nodemaintenancelog.Info("validation failed", "error", err)
nodeName := nm.Spec.NodeName
if err := v.validateNodeExists(nodeName); err != nil {
nodemaintenancelog.Info("validation failed ", "nmName", nm.Name, "nodeName", nodeName, "error", err)
return err
}

// Validate that no NodeMaintenance for given node exists yet
if err := v.validateNoNodeMaintenanceExists(nm.Spec.NodeName); err != nil {
nodemaintenancelog.Info("validation failed", "error", err)
if err := v.validateNoNodeMaintenanceExists(nodeName); err != nil {
nodemaintenancelog.Info("validation failed", "nmName", nm.Name, "nodeName", nodeName, "error", err)
return err
}

// Validate that a NodeMaintenance for a control-plane node doesn't violate quorum
if err := v.validateControlPlaneQuorum(nm.Spec.NodeName); err != nil {
nodemaintenancelog.Info("validation failed", "error", err)
if err := v.validateControlPlaneQuorum(nodeName); err != nil {
nodemaintenancelog.Info("validation failed", "nmName", nm.Name, "nodeName", nodeName, "error", err)
return err
}

Expand Down Expand Up @@ -162,8 +165,14 @@ func (v *NodeMaintenanceValidator) validateNoNodeMaintenanceExists(nodeName stri
}

func (v *NodeMaintenanceValidator) validateControlPlaneQuorum(nodeName string) error {
// check if the node is a control-plane node
if node, err := getNode(nodeName, v.client); err != nil {
if !v.isOpenShift {
// etcd quorum PDB is only installed in OpenShift
nodemaintenancelog.Info("Cluster does not have etcd quorum PDB, thus we can't asses control-plane quorum violation")
return nil
}
// check if the node is a control-plane node on OpenShift
node, err := getNode(nodeName, v.client)
if err != nil {
return fmt.Errorf("could not get node for master/control-plane quorum validation, please try again: %v", err)
} else if node == nil {
// this should have been caught already, but just in case
@@ -172,39 +181,21 @@ func (v *NodeMaintenanceValidator) validateControlPlaneQuorum(nodeName string) e
// not a control-plane node, nothing to do
return nil
}

// check the etcd-quorum-guard PodDisruptionBudget if we can drain a control-plane node
disruptionsAllowed := int32(-1)
for _, pdbName := range []string{EtcdQuorumPDBNewName, EtcdQuorumPDBOldName} {
var pdb policyv1.PodDisruptionBudget
key := types.NamespacedName{
Namespace: EtcdQuorumPDBNamespace,
Name: pdbName,
}
if err := v.client.Get(context.TODO(), key, &pdb); err != nil {
if apierrors.IsNotFound(err) {
// try next one
continue
}
return fmt.Errorf("could not get the etcd quorum guard PDB for master/control-plane quorum validation, please try again: %v", err)
}
disruptionsAllowed = pdb.Status.DisruptionsAllowed
break
}
if disruptionsAllowed == -1 {
// TODO do we need a fallback for k8s clusters?
nodemaintenancelog.Info("etcd quorum guard PDB hasn't been found. Skipping master/control-plane quorum validation.")
return nil
// The node is a control-plane node on OpenShift;
// check whether a NodeMaintenance CR for this node would disrupt control-plane (etcd) quorum
isDisruptionAllowed, err := etcd.IsEtcdDisruptionAllowed(context.Background(), v.client, nodemaintenancelog, node)
if err != nil {
return err
}
if disruptionsAllowed == 0 {
return fmt.Errorf(ErrorControlPlaneQuorumViolation)
if !isDisruptionAllowed {
return fmt.Errorf(ErrorControlPlaneQuorumViolation, nodeName)
}
return nil
}

// if the returned node is nil, it wasn't found
func getNode(nodeName string, client client.Client) (*v1.Node, error) {
var node v1.Node
// getNode returns a node if it exists, otherwise it returns nil
func getNode(nodeName string, client client.Client) (*corev1.Node, error) {
var node corev1.Node
key := types.NamespacedName{
Name: nodeName,
}
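The quorum logic that used to be inlined here now lives in etcd.IsEtcdDisruptionAllowed from medik8s/common v1.13.0. The library's implementation isn't part of this diff; the following is a rough sketch of the behavior implied by the tests below, with the namespace, PDB name, and "app=guard" label all assumed for illustration:

// Illustrative sketch of the check assumed to back etcd.IsEtcdDisruptionAllowed.
// The namespace, PDB name, and guard label are assumptions taken from the tests.
package sketch

import (
	"context"

	corev1 "k8s.io/api/core/v1"
	policyv1 "k8s.io/api/policy/v1"
	apierrors "k8s.io/apimachinery/pkg/api/errors"
	"k8s.io/apimachinery/pkg/types"
	"sigs.k8s.io/controller-runtime/pkg/client"
)

func isEtcdDisruptionAllowed(ctx context.Context, c client.Client, node *corev1.Node) (bool, error) {
	// If the etcd quorum guard PDB still allows a disruption, draining is safe.
	var pdb policyv1.PodDisruptionBudget
	key := types.NamespacedName{Namespace: "openshift-etcd", Name: "etcd-guard-pdb"}
	if err := c.Get(ctx, key, &pdb); err != nil {
		if apierrors.IsNotFound(err) {
			return false, nil // no PDB: quorum can't be assessed, fail closed
		}
		return false, err
	}
	if pdb.Status.DisruptionsAllowed > 0 {
		return true, nil
	}
	// No disruption budget left: allow only if this node's guard pod is
	// already not ready, i.e. its etcd member is disrupted anyway.
	var guards corev1.PodList
	if err := c.List(ctx, &guards, client.InNamespace("openshift-etcd"),
		client.MatchingLabels{"app": "guard"}); err != nil {
		return false, err
	}
	for _, pod := range guards.Items {
		if pod.Spec.NodeName != node.Name {
			continue
		}
		for _, cond := range pod.Status.Conditions {
			if cond.Type == corev1.PodReady {
				// A ready guard pod means disrupting this node would break quorum.
				return cond.Status != corev1.ConditionTrue, nil
			}
		}
	}
	// No guard pod on this node: it doesn't host a voting etcd member.
	return true, nil
}

Note the fail-closed default: with no quorum guard PDB to consult, the sketch reports the disruption as not allowed, matching the "without etcd quorum guard PDB" expectation in the tests below.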
131 changes: 83 additions & 48 deletions api/v1beta1/nodemaintenance_webhook_test.go
@@ -7,25 +7,27 @@ import (
. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"

v1 "k8s.io/api/core/v1"
corev1 "k8s.io/api/core/v1"
policyv1 "k8s.io/api/policy/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"sigs.k8s.io/controller-runtime/pkg/client"
)

var _ = Describe("NodeMaintenance Validation", func() {

const nonExistingNodeName = "node-not-exists"
const existingNodeName = "node-exists"
const (
nonExistingNodeName = "node-not-exists"
existingNodeName = "node-exists"
)

BeforeEach(func() {
// create quorum ns on 1st run
quorumNs := &v1.Namespace{
quorumNs := &corev1.Namespace{
ObjectMeta: metav1.ObjectMeta{
Name: EtcdQuorumPDBNamespace,
},
}
if err := k8sClient.Get(context.Background(), client.ObjectKeyFromObject(quorumNs), &v1.Namespace{}); err != nil {
if err := k8sClient.Get(context.Background(), client.ObjectKeyFromObject(quorumNs), &corev1.Namespace{}); err != nil {
err := k8sClient.Create(context.Background(), quorumNs)
Expect(err).ToNot(HaveOccurred())
}
@@ -46,7 +48,7 @@ var _ = Describe("NodeMaintenance Validation", func() {

Context("for node already in maintenance", func() {

var node *v1.Node
var node *corev1.Node
var nmExisting *NodeMaintenance

BeforeEach(func() {
@@ -83,7 +85,7 @@

Context("for master/control-plane node", func() {

var node *v1.Node
var node *corev1.Node

BeforeEach(func() {
node = getTestNode(existingNodeName, true)
@@ -98,65 +100,69 @@

Context("with potential quorum violation", func() {

var pdb *policyv1.PodDisruptionBudget

BeforeEach(func() {
pdb = getTestPDB()
err := k8sClient.Create(context.Background(), pdb)
Expect(err).ToNot(HaveOccurred())
})
pdb := getTestPDB()
Expect(k8sClient.Create(context.Background(), pdb)).To(Succeed())
DeferCleanup(k8sClient.Delete, context.Background(), pdb)

AfterEach(func() {
err := k8sClient.Delete(context.Background(), pdb)
Expect(err).ToNot(HaveOccurred())
})

It("should be rejected", func() {
nm := getTestNMO(existingNodeName)
_, err := nm.ValidateCreate()
Expect(err).To(HaveOccurred())
Expect(err.Error()).To(ContainSubstring(ErrorControlPlaneQuorumViolation))
When("node has etcd guard pod", func() {
var guardPod *corev1.Pod
BeforeEach(func() {
guardPod = getPodGuard(existingNodeName)
Expect(k8sClient.Create(context.Background(), guardPod)).To(Succeed())
setPodConditionReady(context.Background(), guardPod, corev1.ConditionTrue)
// delete with force as the guard pod deletion takes time and won't happen immediately
var force client.GracePeriodSeconds = 0
DeferCleanup(k8sClient.Delete, context.Background(), guardPod, force)
})
It("should be allowed if the pod is on Fail state", func() {
testGuardPod := &corev1.Pod{}
Expect(k8sClient.Get(context.Background(), client.ObjectKeyFromObject(guardPod), testGuardPod)).To(Succeed())
setPodConditionReady(context.Background(), testGuardPod, corev1.ConditionFalse)

nm := getTestNMO(existingNodeName)
Expect(nm.ValidateCreate()).Error().NotTo(HaveOccurred())
})
It("should be rejected if the pod is on True state", func() {
nm := getTestNMO(existingNodeName)
_, err := nm.ValidateCreate()
Expect(err).To(HaveOccurred())
Expect(err.Error()).To(ContainSubstring(ErrorControlPlaneQuorumViolation, node.Name))
})
})
When("node doesn't have etcd guard pod", func() {
It("should be allowed", func() {
nm := getTestNMO(existingNodeName)
Expect(nm.ValidateCreate()).Error().NotTo(HaveOccurred())
})
})

})

Context("without potential quorum violation", func() {

var pdb *policyv1.PodDisruptionBudget

BeforeEach(func() {
pdb = getTestPDB()
err := k8sClient.Create(context.Background(), pdb)
Expect(err).ToNot(HaveOccurred())
pdb := getTestPDB()
Expect(k8sClient.Create(context.Background(), pdb)).To(Succeed())
DeferCleanup(k8sClient.Delete, context.Background(), pdb)

pdb.Status.DisruptionsAllowed = 1
err = k8sClient.Status().Update(context.Background(), pdb)
Expect(err).ToNot(HaveOccurred())
})

AfterEach(func() {
err := k8sClient.Delete(context.Background(), pdb)
Expect(err).ToNot(HaveOccurred())
Expect(k8sClient.Status().Update(context.Background(), pdb)).To(Succeed())
})

It("should not be rejected", func() {
It("should be allowed", func() {
nm := getTestNMO(existingNodeName)
Eventually(func() error {
_, err := nm.ValidateCreate()
return err
}, time.Second, 200*time.Millisecond).ShouldNot(HaveOccurred())
Expect(nm.ValidateCreate()).Error().NotTo(HaveOccurred())
})

})

Context("without etcd quorum guard PDB", func() {

It("should not be rejected", func() {
It("should be rejected", func() {
nm := getTestNMO(existingNodeName)
Eventually(func() error {
_, err := nm.ValidateCreate()
return err
}, time.Second, 200*time.Millisecond).ShouldNot(HaveOccurred())
_, err := nm.ValidateCreate()
Expect(err).To(HaveOccurred())
Expect(err.Error()).To(ContainSubstring(ErrorControlPlaneQuorumViolation, node.Name))
})

})
@@ -191,8 +197,8 @@ func getTestNMO(nodeName string) *NodeMaintenance {
}
}

func getTestNode(name string, isControlPlane bool) *v1.Node {
node := &v1.Node{
func getTestNode(name string, isControlPlane bool) *corev1.Node {
node := &corev1.Node{
ObjectMeta: metav1.ObjectMeta{
Name: name,
},
@@ -213,3 +219,32 @@ func getTestPDB() *policyv1.PodDisruptionBudget {
},
}
}

// getPodGuard returns a guard pod with the expected guard label for the given nodeName;
// its Ready condition is set separately via setPodConditionReady
func getPodGuard(nodeName string) *corev1.Pod {
dummyContainer := corev1.Container{
Name: "container-name",
Image: "foo",
}
return &corev1.Pod{
TypeMeta: metav1.TypeMeta{Kind: "Pod"},
ObjectMeta: metav1.ObjectMeta{
Name: "guard-" + nodeName,
Namespace: EtcdQuorumPDBNamespace,
Labels: map[string]string{
"app": "guard",
},
},
Spec: corev1.PodSpec{
NodeName: nodeName,
Containers: []corev1.Container{
dummyContainer,
},
},
}
}

func setPodConditionReady(ctx context.Context, pod *corev1.Pod, readyVal corev1.ConditionStatus) {
pod.Status.Conditions = []corev1.PodCondition{{Type: corev1.PodReady, Status: readyVal}}
Expect(k8sClient.Status().Update(ctx, pod)).To(Succeed())
}
14 changes: 8 additions & 6 deletions api/v1beta1/webhook_suite_test.go
@@ -47,11 +47,13 @@ import (
// These tests use Ginkgo (BDD-style Go testing framework). Refer to
// http://onsi.github.io/ginkgo/ to learn more about Ginkgo.

var cfg *rest.Config
var k8sClient client.Client
var testEnv *envtest.Environment
var ctx context.Context
var cancel context.CancelFunc
var (
cfg *rest.Config
k8sClient client.Client
testEnv *envtest.Environment
ctx context.Context
cancel context.CancelFunc
)

func TestAPIs(t *testing.T) {
RegisterFailHandler(Fail)
@@ -110,7 +112,7 @@ var _ = BeforeSuite(func() {
})
Expect(err).NotTo(HaveOccurred())

err = (&NodeMaintenance{}).SetupWebhookWithManager(mgr)
err = (&NodeMaintenance{}).SetupWebhookWithManager(true, mgr)
Expect(err).NotTo(HaveOccurred())

//+kubebuilder:scaffold:webhook
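The suite hard-codes isOpenShift to true. How the operator's main.go derives that flag is not shown in this diff; a plausible sketch (an assumption, not necessarily the operator's actual approach) probes the discovery API for the config.openshift.io group:

// Hedged sketch: detect OpenShift by checking the discovery API for the
// config.openshift.io group. The operator may instead use a helper from
// medik8s/common; this is illustrative only.
package detect

import (
	"k8s.io/client-go/discovery"
	"k8s.io/client-go/rest"
)

func isOpenShift(cfg *rest.Config) (bool, error) {
	dc, err := discovery.NewDiscoveryClientForConfig(cfg)
	if err != nil {
		return false, err
	}
	groups, err := dc.ServerGroups()
	if err != nil {
		return false, err
	}
	for _, g := range groups.Groups {
		if g.Name == "config.openshift.io" {
			return true, nil
		}
	}
	return false, nil
}

The result would then be threaded into (&NodeMaintenance{}).SetupWebhookWithManager(openshift, mgr).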
2 changes: 1 addition & 1 deletion go.mod
@@ -4,7 +4,7 @@ go 1.20

require (
github.com/go-logr/logr v1.4.1
github.com/medik8s/common v1.12.0
github.com/medik8s/common v1.13.0
github.com/onsi/ginkgo/v2 v2.14.0
github.com/onsi/gomega v1.30.0
github.com/sirupsen/logrus v1.9.3