Skip to content

Commit

Permalink
Merge pull request #110 from razo7/better-webhook-cp-message
Browse files Browse the repository at this point in the history
Fix Maintenance Creation Check for Control Plane Nodes
  • Loading branch information
openshift-merge-bot[bot] authored Jan 25, 2024
2 parents 761b790 + 9178a33 commit 5cb0cee
Show file tree
Hide file tree
Showing 14 changed files with 336 additions and 106 deletions.
1 change: 1 addition & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ RUN export GO_VERSION=$(grep -E "go [[:digit:]]\.[[:digit:]][[:digit:]]" go.mod
# Copy the go source
COPY api/ api/
COPY controllers/ controllers/
COPY pkg/ pkg/
COPY hack/ hack/
COPY main.go main.go
COPY vendor/ vendor/
Expand Down
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -188,7 +188,7 @@ test: test-no-verify verify-unchanged ## Generate and format code, run tests, ge

.PHONY: test-no-verify
test-no-verify: manifests generate go-verify test-imports fmt vet envtest ginkgo ## Generate and format code, and run tests
KUBEBUILDER_ASSETS="$(shell $(ENVTEST) use $(ENVTEST_K8S_VERSION) -p path --bin-dir $(LOCALBIN))" $(GINKGO) -r --keep-going --require-suite --vv ./api/... ./controllers/... --coverprofile cover.out
KUBEBUILDER_ASSETS="$(shell $(ENVTEST) use $(ENVTEST_K8S_VERSION) -p path --bin-dir $(LOCALBIN))" $(GINKGO) -r --keep-going --require-suite --vv ./api/... ./pkg/... ./controllers/... --coverprofile cover.out

.PHONY: bundle-run
bundle-run: operator-sdk ## Run bundle image. Default NS is "openshift-workload-availability", redefine OPERATOR_NAMESPACE to override it.
Expand Down
75 changes: 33 additions & 42 deletions api/v1beta1/nodemaintenance_webhook.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,10 @@ import (
"context"
"fmt"

"github.com/medik8s/common/pkg/etcd"
"github.com/medik8s/common/pkg/nodes"

v1 "k8s.io/api/core/v1"
policyv1 "k8s.io/api/policy/v1"
corev1 "k8s.io/api/core/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/types"
Expand All @@ -37,8 +37,8 @@ import (
const (
ErrorNodeNotExists = "invalid nodeName, no node with name %s found"
ErrorNodeMaintenanceExists = "invalid nodeName, a NodeMaintenance for node %s already exists"
ErrorControlPlaneQuorumViolation = "can not put master/control-plane node into maintenance at this moment, disrupting node %s will violate etcd quorum"
ErrorNodeNameUpdateForbidden = "updating spec.NodeName isn't allowed"
ErrorControlPlaneQuorumViolation = "can not put master/control-plane node into maintenance at this moment, disrupting node %s would violate the master/control-plane node quorum"
)

const (
Expand All @@ -55,15 +55,17 @@ var nodemaintenancelog = logf.Log.WithName("nodemaintenance-resource")
// NodeMaintenanceValidator validates NodeMaintenance resources. Needed because we need a client for validation
// +k8s:deepcopy-gen=false
type NodeMaintenanceValidator struct {
client client.Client
client client.Client
isOpenShift bool
}

var validator *NodeMaintenanceValidator

func (r *NodeMaintenance) SetupWebhookWithManager(mgr ctrl.Manager) error {
func (r *NodeMaintenance) SetupWebhookWithManager(isOpenShift bool, mgr ctrl.Manager) error {
// init the validator!
validator = &NodeMaintenanceValidator{
client: mgr.GetClient(),
client: mgr.GetClient(),
isOpenShift: isOpenShift,
}

return ctrl.NewWebhookManagedBy(mgr).
Expand Down Expand Up @@ -108,20 +110,21 @@ func (r *NodeMaintenance) ValidateDelete() (admission.Warnings, error) {

func (v *NodeMaintenanceValidator) ValidateCreate(nm *NodeMaintenance) error {
// Validate that node with given name exists
if err := v.validateNodeExists(nm.Spec.NodeName); err != nil {
nodemaintenancelog.Info("validation failed", "error", err)
nodeName := nm.Spec.NodeName
if err := v.validateNodeExists(nodeName); err != nil {
nodemaintenancelog.Info("validation failed ", "nmName", nm.Name, "nodeName", nodeName, "error", err)
return err
}

// Validate that no NodeMaintenance for given node exists yet
if err := v.validateNoNodeMaintenanceExists(nm.Spec.NodeName); err != nil {
nodemaintenancelog.Info("validation failed", "error", err)
if err := v.validateNoNodeMaintenanceExists(nodeName); err != nil {
nodemaintenancelog.Info("validation failed", "nmName", nm.Name, "nodeName", nodeName, "error", err)
return err
}

// Validate that NodeMaintenance for control-plane nodes don't violate quorum
if err := v.validateControlPlaneQuorum(nm.Spec.NodeName); err != nil {
nodemaintenancelog.Info("validation failed", "error", err)
if err := v.validateControlPlaneQuorum(nodeName); err != nil {
nodemaintenancelog.Info("validation failed", "nmName", nm.Name, "nodeName", nodeName, "error", err)
return err
}

Expand Down Expand Up @@ -162,8 +165,14 @@ func (v *NodeMaintenanceValidator) validateNoNodeMaintenanceExists(nodeName stri
}

func (v *NodeMaintenanceValidator) validateControlPlaneQuorum(nodeName string) error {
// check if the node is a control-plane node
if node, err := getNode(nodeName, v.client); err != nil {
if !v.isOpenShift {
// etcd quorum PDB is only installed in OpenShift
nodemaintenancelog.Info("Cluster does not have etcd quorum PDB, thus we can't assess control-plane quorum violation")
return nil
}
// check if the node is a control-plane node on OpenShift
node, err := getNode(nodeName, v.client)
if err != nil {
return fmt.Errorf("could not get node for master/control-plane quorum validation, please try again: %v", err)
} else if node == nil {
// this should have been caught already, but just in case
Expand All @@ -172,39 +181,21 @@ func (v *NodeMaintenanceValidator) validateControlPlaneQuorum(nodeName string) e
// not a control-plane node, nothing to do
return nil
}

// check the etcd-quorum-guard PodDisruptionBudget if we can drain a control-plane node
disruptionsAllowed := int32(-1)
for _, pdbName := range []string{EtcdQuorumPDBNewName, EtcdQuorumPDBOldName} {
var pdb policyv1.PodDisruptionBudget
key := types.NamespacedName{
Namespace: EtcdQuorumPDBNamespace,
Name: pdbName,
}
if err := v.client.Get(context.TODO(), key, &pdb); err != nil {
if apierrors.IsNotFound(err) {
// try next one
continue
}
return fmt.Errorf("could not get the etcd quorum guard PDB for master/control-plane quorum validation, please try again: %v", err)
}
disruptionsAllowed = pdb.Status.DisruptionsAllowed
break
}
if disruptionsAllowed == -1 {
// TODO do we need a fallback for k8s clusters?
nodemaintenancelog.Info("etcd quorum guard PDB hasn't been found. Skipping master/control-plane quorum validation.")
return nil
// The node is a control-plane node on OpenShift
// now we check if adding nm CR for this node will disrupt control-plane quorum
isDisruptionAllowed, err := etcd.IsEtcdDisruptionAllowed(context.Background(), v.client, nodemaintenancelog, node)
if err != nil {
return err
}
if disruptionsAllowed == 0 {
return fmt.Errorf(ErrorControlPlaneQuorumViolation)
if !isDisruptionAllowed {
return fmt.Errorf(ErrorControlPlaneQuorumViolation, nodeName)
}
return nil
}

// if the returned node is nil, it wasn't found
func getNode(nodeName string, client client.Client) (*v1.Node, error) {
var node v1.Node
// getNode returns a node if it exists, otherwise it returns nil
func getNode(nodeName string, client client.Client) (*corev1.Node, error) {
var node corev1.Node
key := types.NamespacedName{
Name: nodeName,
}
Expand Down
131 changes: 83 additions & 48 deletions api/v1beta1/nodemaintenance_webhook_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,25 +7,27 @@ import (
. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"

v1 "k8s.io/api/core/v1"
corev1 "k8s.io/api/core/v1"
policyv1 "k8s.io/api/policy/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"sigs.k8s.io/controller-runtime/pkg/client"
)

var _ = Describe("NodeMaintenance Validation", func() {

const nonExistingNodeName = "node-not-exists"
const existingNodeName = "node-exists"
const (
nonExistingNodeName = "node-not-exists"
existingNodeName = "node-exists"
)

BeforeEach(func() {
// create quorum ns on 1st run
quorumNs := &v1.Namespace{
quorumNs := &corev1.Namespace{
ObjectMeta: metav1.ObjectMeta{
Name: EtcdQuorumPDBNamespace,
},
}
if err := k8sClient.Get(context.Background(), client.ObjectKeyFromObject(quorumNs), &v1.Namespace{}); err != nil {
if err := k8sClient.Get(context.Background(), client.ObjectKeyFromObject(quorumNs), &corev1.Namespace{}); err != nil {
err := k8sClient.Create(context.Background(), quorumNs)
Expect(err).ToNot(HaveOccurred())
}
Expand All @@ -46,7 +48,7 @@ var _ = Describe("NodeMaintenance Validation", func() {

Context("for node already in maintenance", func() {

var node *v1.Node
var node *corev1.Node
var nmExisting *NodeMaintenance

BeforeEach(func() {
Expand Down Expand Up @@ -83,7 +85,7 @@ var _ = Describe("NodeMaintenance Validation", func() {

Context("for master/control-plane node", func() {

var node *v1.Node
var node *corev1.Node

BeforeEach(func() {
node = getTestNode(existingNodeName, true)
Expand All @@ -98,65 +100,69 @@ var _ = Describe("NodeMaintenance Validation", func() {

Context("with potential quorum violation", func() {

var pdb *policyv1.PodDisruptionBudget

BeforeEach(func() {
pdb = getTestPDB()
err := k8sClient.Create(context.Background(), pdb)
Expect(err).ToNot(HaveOccurred())
})
pdb := getTestPDB()
Expect(k8sClient.Create(context.Background(), pdb)).To(Succeed())
DeferCleanup(k8sClient.Delete, context.Background(), pdb)

AfterEach(func() {
err := k8sClient.Delete(context.Background(), pdb)
Expect(err).ToNot(HaveOccurred())
})

It("should be rejected", func() {
nm := getTestNMO(existingNodeName)
_, err := nm.ValidateCreate()
Expect(err).To(HaveOccurred())
Expect(err.Error()).To(ContainSubstring(ErrorControlPlaneQuorumViolation))
When("node has etcd guard pod", func() {
var guardPod *corev1.Pod
BeforeEach(func() {
guardPod = getPodGuard(existingNodeName)
Expect(k8sClient.Create(context.Background(), guardPod)).To(Succeed())
setPodConditionReady(context.Background(), guardPod, corev1.ConditionTrue)
// delete with force as the guard pod deletion takes time and won't happen immediately
var force client.GracePeriodSeconds = 0
DeferCleanup(k8sClient.Delete, context.Background(), guardPod, force)
})
It("should be allowed if the pod is on Fail state", func() {
testGuardPod := &corev1.Pod{}
Expect(k8sClient.Get(context.Background(), client.ObjectKeyFromObject(guardPod), testGuardPod)).To(Succeed())
setPodConditionReady(context.Background(), testGuardPod, corev1.ConditionFalse)

nm := getTestNMO(existingNodeName)
Expect(nm.ValidateCreate()).Error().NotTo(HaveOccurred())
})
It("should be rejected if the pod is on True state", func() {
nm := getTestNMO(existingNodeName)
_, err := nm.ValidateCreate()
Expect(err).To(HaveOccurred())
Expect(err.Error()).To(ContainSubstring(ErrorControlPlaneQuorumViolation, node.Name))
})
})
When("node doesn't have etcd guard pod", func() {
It("should be allowed", func() {
nm := getTestNMO(existingNodeName)
Expect(nm.ValidateCreate()).Error().NotTo(HaveOccurred())
})
})

})

Context("without potential quorum violation", func() {

var pdb *policyv1.PodDisruptionBudget

BeforeEach(func() {
pdb = getTestPDB()
err := k8sClient.Create(context.Background(), pdb)
Expect(err).ToNot(HaveOccurred())
pdb := getTestPDB()
Expect(k8sClient.Create(context.Background(), pdb)).To(Succeed())
DeferCleanup(k8sClient.Delete, context.Background(), pdb)

pdb.Status.DisruptionsAllowed = 1
err = k8sClient.Status().Update(context.Background(), pdb)
Expect(err).ToNot(HaveOccurred())
})

AfterEach(func() {
err := k8sClient.Delete(context.Background(), pdb)
Expect(err).ToNot(HaveOccurred())
Expect(k8sClient.Status().Update(context.Background(), pdb)).To(Succeed())
})

It("should not be rejected", func() {
It("should be allowed", func() {
nm := getTestNMO(existingNodeName)
Eventually(func() error {
_, err := nm.ValidateCreate()
return err
}, time.Second, 200*time.Millisecond).ShouldNot(HaveOccurred())
Expect(nm.ValidateCreate()).Error().NotTo(HaveOccurred())
})

})

Context("without etcd quorum guard PDB", func() {

It("should not be rejected", func() {
It("should be rejected", func() {
nm := getTestNMO(existingNodeName)
Eventually(func() error {
_, err := nm.ValidateCreate()
return err
}, time.Second, 200*time.Millisecond).ShouldNot(HaveOccurred())
_, err := nm.ValidateCreate()
Expect(err).To(HaveOccurred())
Expect(err.Error()).To(ContainSubstring(ErrorControlPlaneQuorumViolation, node.Name))
})

})
Expand Down Expand Up @@ -191,8 +197,8 @@ func getTestNMO(nodeName string) *NodeMaintenance {
}
}

func getTestNode(name string, isControlPlane bool) *v1.Node {
node := &v1.Node{
func getTestNode(name string, isControlPlane bool) *corev1.Node {
node := &corev1.Node{
ObjectMeta: metav1.ObjectMeta{
Name: name,
},
Expand All @@ -213,3 +219,32 @@ func getTestPDB() *policyv1.PodDisruptionBudget {
},
}
}

// getPodGuard builds a minimal etcd guard pod for the given node: it carries
// the expected "app: guard" label, lives in the etcd quorum PDB namespace,
// and is scheduled on nodeName. The single dummy container only exists to
// make the pod spec valid for the API server.
func getPodGuard(nodeName string) *corev1.Pod {
	return &corev1.Pod{
		TypeMeta: metav1.TypeMeta{Kind: "Pod"},
		ObjectMeta: metav1.ObjectMeta{
			Name:      "guard-" + nodeName,
			Namespace: EtcdQuorumPDBNamespace,
			Labels:    map[string]string{"app": "guard"},
		},
		Spec: corev1.PodSpec{
			NodeName: nodeName,
			Containers: []corev1.Container{
				{Name: "container-name", Image: "foo"},
			},
		},
	}
}

// setPodConditionReady overwrites the pod's status conditions with a single
// PodReady condition set to readyVal, and asserts that the status update
// against the test API server succeeds.
func setPodConditionReady(ctx context.Context, pod *corev1.Pod, readyVal corev1.ConditionStatus) {
	pod.Status.Conditions = []corev1.PodCondition{{Type: corev1.PodReady, Status: readyVal}}
	// Fix: use the caller-supplied ctx; the original ignored it and called
	// context.Background(), making the ctx parameter dead weight.
	Expect(k8sClient.Status().Update(ctx, pod)).To(Succeed())
}
14 changes: 8 additions & 6 deletions api/v1beta1/webhook_suite_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,11 +47,13 @@ import (
// These tests use Ginkgo (BDD-style Go testing framework). Refer to
// http://onsi.github.io/ginkgo/ to learn more about Ginkgo.

var cfg *rest.Config
var k8sClient client.Client
var testEnv *envtest.Environment
var ctx context.Context
var cancel context.CancelFunc
var (
cfg *rest.Config
k8sClient client.Client
testEnv *envtest.Environment
ctx context.Context
cancel context.CancelFunc
)

func TestAPIs(t *testing.T) {
RegisterFailHandler(Fail)
Expand Down Expand Up @@ -110,7 +112,7 @@ var _ = BeforeSuite(func() {
})
Expect(err).NotTo(HaveOccurred())

err = (&NodeMaintenance{}).SetupWebhookWithManager(mgr)
err = (&NodeMaintenance{}).SetupWebhookWithManager(true, mgr)
Expect(err).NotTo(HaveOccurred())

//+kubebuilder:scaffold:webhook
Expand Down
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ go 1.20

require (
github.com/go-logr/logr v1.4.1
github.com/medik8s/common v1.12.0
github.com/medik8s/common v1.13.0
github.com/onsi/ginkgo/v2 v2.14.0
github.com/onsi/gomega v1.30.0
github.com/sirupsen/logrus v1.9.3
Expand Down
Loading

0 comments on commit 5cb0cee

Please sign in to comment.