Merge pull request #250 from slintes/fix-unit-test
Fix watchdog usage in controller unit tests
openshift-merge-bot[bot] authored Jan 27, 2025
2 parents 8a2445e + c674511 commit 16052be
Showing 7 changed files with 189 additions and 58 deletions.
48 changes: 21 additions & 27 deletions controllers/tests/controller/selfnoderemediation_controller_test.go
@@ -25,6 +25,7 @@ import (
"github.com/medik8s/self-node-remediation/controllers"
"github.com/medik8s/self-node-remediation/controllers/tests/shared"
"github.com/medik8s/self-node-remediation/pkg/utils"
"github.com/medik8s/self-node-remediation/pkg/watchdog"
)

const (
@@ -44,6 +45,9 @@ var _ = Describe("SNR Controller", func() {
snr.Namespace = snrNamespace
snrConfig = shared.GenerateTestConfig()
time.Sleep(time.Second * 2)

// reset watchdog for each test!
dummyDog.Reset()
})

JustBeforeEach(func() {
@@ -183,7 +187,7 @@ var _ = Describe("SNR Controller", func() {

verifyTimeHasBeenRebootedExists(snr)

verifyNoWatchdogFood()
verifyWatchdogTriggered()

verifyEvent("Normal", "NodeReboot", "Remediation process - about to attempt fencing the unhealthy node by rebooting it")

@@ -233,7 +237,7 @@ var _ = Describe("SNR Controller", func() {

verifyTimeHasBeenRebootedExists(snr)

verifyNoWatchdogFood()
verifyWatchdogTriggered()

// The kube-api calls for VA fail intentionally. In this case, we expect the snr agent to try
// to delete node resources again. So LastError is set to the error every time Reconcile()
@@ -315,7 +319,7 @@ var _ = Describe("SNR Controller", func() {

verifyTimeHasBeenRebootedExists(snr)

verifyNoWatchdogFood()
verifyWatchdogTriggered()

verifyFinalizerExists(snr)

@@ -436,30 +440,16 @@ var _ = Describe("SNR Controller", func() {
})

Context("Unhealthy node without api-server access", func() {

// this is not a controller test anymore... it's testing peers. But keep it here for now...

BeforeEach(func() {
By("Simulate api-server failure")
k8sClient.ShouldSimulateFailure = true
remediationStrategy = v1alpha1.ResourceDeletionRemediationStrategy
})

It("Verify that watchdog is not receiving food after some time", func() {
lastFoodTime := dummyDog.LastFoodTime()
timeout := dummyDog.GetTimeout()
Eventually(func() bool {
newTime := dummyDog.LastFoodTime()
// ensure the timeout passed
timeoutPassed := time.Now().After(lastFoodTime.Add(3 * timeout))
// ensure wd wasn't feeded
missedFeed := newTime.Before(lastFoodTime.Add(timeout))
if timeoutPassed && missedFeed {
return true
}
lastFoodTime = newTime
return false
}, 10*shared.PeerUpdateInterval, timeout).Should(BeTrue())
Context("no peer found", func() {
It("Verify that watchdog is not triggered", func() {
verifyWatchdogNotTriggered()
})
})
})

@@ -532,12 +522,16 @@ func verifyNodeIsSchedulable() {
}, 95*time.Second, 250*time.Millisecond).Should(BeFalse())
}

func verifyNoWatchdogFood() {
By("Verify that watchdog is not receiving food")
currentLastFoodTime := dummyDog.LastFoodTime()
ConsistentlyWithOffset(1, func() time.Time {
return dummyDog.LastFoodTime()
}, 5*dummyDog.GetTimeout(), 1*time.Second).Should(Equal(currentLastFoodTime), "watchdog should not receive food anymore")
func verifyWatchdogTriggered() {
EventuallyWithOffset(1, func(g Gomega) {
g.Expect(dummyDog.Status()).To(Equal(watchdog.Triggered))
}, 5*dummyDog.GetTimeout(), 1*time.Second).Should(Succeed(), "watchdog should be triggered")
}

func verifyWatchdogNotTriggered() {
ConsistentlyWithOffset(1, func(g Gomega) {
g.Expect(dummyDog.Status()).To(Equal(watchdog.Armed))
}, 5*dummyDog.GetTimeout(), 1*time.Second).Should(Succeed(), "watchdog should not be triggered")
}

func verifyFinalizerExists(snr *v1alpha1.SelfNodeRemediation) {
2 changes: 1 addition & 1 deletion controllers/tests/controller/suite_test.go
@@ -54,7 +54,7 @@ import (

var (
testEnv *envtest.Environment
dummyDog watchdog.Watchdog
dummyDog watchdog.FakeWatchdog
unhealthyNode, peerNode = &v1.Node{}, &v1.Node{}
cancelFunc context.CancelFunc
k8sClient *shared.K8sClientWrapper
29 changes: 17 additions & 12 deletions pkg/peers/peers.go
@@ -27,6 +27,9 @@ type Role int8
const (
Worker Role = iota
ControlPlane

// quickPeerUpdateInterval is used instead of peerUpdateInterval when a peer update fails
quickPeerUpdateInterval = 2 * time.Minute
)

type Peers struct {
@@ -77,22 +80,24 @@ func (p *Peers) Start(ctx context.Context) error {
p.controlPlanePeerSelector = createSelector(hostname, getControlPlaneLabel(myNode))
}

var updatePeersError error
cancellableCtx, cancel := context.WithCancel(ctx)

p.log.Info("peer starting", "name", p.myNodeName)
wait.UntilWithContext(cancellableCtx, func(ctx context.Context) {
updatePeersError = p.updateWorkerPeers(ctx)
if updatePeersError != nil {
cancel()
}
updatePeersError = p.updateControlPlanePeers(ctx)
if updatePeersError != nil {
cancel()
wait.UntilWithContext(ctx, func(ctx context.Context) {
updateWorkerPeersError := p.updateWorkerPeers(ctx)
updateControlPlanePeersError := p.updateControlPlanePeers(ctx)
if updateWorkerPeersError != nil || updateControlPlanePeersError != nil {
// the default update interval is quite long; in case of an error we want to retry more quickly
quickCtx, quickCancel := context.WithCancel(ctx)
wait.UntilWithContext(quickCtx, func(ctx context.Context) {
quickUpdateWorkerPeersError := p.updateWorkerPeers(ctx)
quickUpdateControlPlanePeersError := p.updateControlPlanePeers(ctx)
if quickUpdateWorkerPeersError == nil && quickUpdateControlPlanePeersError == nil {
quickCancel()
}
}, quickPeerUpdateInterval)
}
}, p.peerUpdateInterval)

return updatePeersError
return nil
}

func (p *Peers) updateWorkerPeers(ctx context.Context) error {
23 changes: 21 additions & 2 deletions pkg/watchdog/fake.go
@@ -11,14 +11,22 @@ const (
fakeTimeout = 1 * time.Second
)

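// FakeWatchdog extends the Watchdog interface with test-only helpers, notably Reset for re-arming the watchdog between tests.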
type FakeWatchdog interface {
Watchdog
Reset()
}

// fakeWatchdogImpl provides the fake implementation of the watchdogImpl interface for tests
type fakeWatchdogImpl struct {
IsStartSuccessful bool
*synchronizedWatchdog
}

func NewFake(isStartSuccessful bool) Watchdog {
func NewFake(isStartSuccessful bool) FakeWatchdog {
fakeWDImpl := &fakeWatchdogImpl{IsStartSuccessful: isStartSuccessful}
return newSynced(ctrl.Log.WithName("fake watchdog"), fakeWDImpl)
syncedWD := newSynced(ctrl.Log.WithName("fake watchdog"), fakeWDImpl)
fakeWDImpl.synchronizedWatchdog = syncedWD
return fakeWDImpl
}

func (f *fakeWatchdogImpl) start() (*time.Duration, error) {
@@ -36,3 +44,14 @@ func (f *fakeWatchdogImpl) feed() error {
func (f *fakeWatchdogImpl) disarm() error {
return nil
}

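// Reset re-arms a triggered or stopped watchdog and restarts feeding; it is a no-op when the watchdog is already armed.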
func (f *fakeWatchdogImpl) Reset() {
swd := f.synchronizedWatchdog
swd.mutex.Lock()
defer swd.mutex.Unlock()
if swd.status != Armed {
swd.stop()
swd.status = Armed
swd.startFeeding()
}
}
36 changes: 20 additions & 16 deletions pkg/watchdog/synchronized.go
@@ -65,22 +65,7 @@ func (swd *synchronizedWatchdog) Start(ctx context.Context) error {
swd.log.Info("watchdog started")
swd.mutex.Unlock()

feedCtx, cancel := context.WithCancel(context.Background())
swd.stop = cancel
// feed until stopped
go wait.NonSlidingUntilWithContext(feedCtx, func(feedCtx context.Context) {
swd.mutex.Lock()
defer swd.mutex.Unlock()
// prevent feeding of a disarmed watchdog in case the context isn't cancelled yet
if swd.status != Armed {
return
}
if err := swd.impl.feed(); err != nil {
swd.log.Error(err, "failed to feed watchdog!")
} else {
swd.lastFoodTime = time.Now()
}
}, swd.timeout/3)
swd.startFeeding()

<-ctx.Done()

@@ -100,6 +85,25 @@
return nil
}

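// startFeeding feeds the watchdog every timeout/3 until the feed context is cancelled via swd.stop; a watchdog that is no longer armed is not fed.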
func (swd *synchronizedWatchdog) startFeeding() {
feedCtx, cancel := context.WithCancel(context.Background())
swd.stop = cancel
// feed until stopped
go wait.NonSlidingUntilWithContext(feedCtx, func(feedCtx context.Context) {
swd.mutex.Lock()
defer swd.mutex.Unlock()
// prevent feeding of a disarmed watchdog in case the context isn't cancelled yet
if swd.status != Armed {
return
}
if err := swd.impl.feed(); err != nil {
swd.log.Error(err, "failed to feed watchdog!")
} else {
swd.lastFoodTime = time.Now()
}
}, swd.timeout/3)
}

func (swd *synchronizedWatchdog) Stop() {
swd.mutex.Lock()
defer swd.mutex.Unlock()
13 changes: 13 additions & 0 deletions pkg/watchdog/watchdog_suite_test.go
@@ -0,0 +1,13 @@
package watchdog_test

import (
"testing"

. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
)

func TestWatchdog(t *testing.T) {
RegisterFailHandler(Fail)
RunSpecs(t, "Watchdog Suite")
}
96 changes: 96 additions & 0 deletions pkg/watchdog/watchdog_test.go
@@ -0,0 +1,96 @@
package watchdog_test

import (
"context"
"time"

. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"

"github.com/medik8s/self-node-remediation/pkg/watchdog"
)

var _ = Describe("Watchdog", func() {

var wd watchdog.FakeWatchdog
var cancel context.CancelFunc

BeforeEach(func() {
var ctx context.Context
ctx, cancel = context.WithCancel(context.Background())

wd = watchdog.NewFake(true)
Expect(wd).NotTo(BeNil())
go func() {
err := wd.Start(ctx)
Expect(err).NotTo(HaveOccurred())
}()
Eventually(func(g Gomega) {
g.Expect(wd.Status()).To(Equal(watchdog.Armed))
}, 1*time.Second, 100*time.Millisecond).Should(Succeed(), "watchdog should be armed")
})

AfterEach(func() {
cancel()
})

Context("Watchdog started", func() {
It("should be fed", func() {
verifyWatchdogFood(wd)
})
})

Context("Watchdog triggered", func() {
BeforeEach(func() {
wd.Stop()
})

It("should be triggered and not be fed anymore", func() {
Eventually(func(g Gomega) {
g.Expect(wd.Status()).To(Equal(watchdog.Triggered))
}, 1*time.Second, 100*time.Millisecond).Should(Succeed(), "watchdog should be triggered")
verifyNoWatchdogFood(wd)
})
})

Context("Watchdog cancelled", func() {
BeforeEach(func() {
cancel()
})

It("should be disarmed and and not be fed anymore", func() {
Eventually(func(g Gomega) {
g.Expect(wd.Status()).To(Equal(watchdog.Disarmed))
}, 1*time.Second, 100*time.Millisecond).Should(Succeed(), "watchdog should be disarmed")
verifyNoWatchdogFood(wd)
})
})

Context("Triggered watchdog reset", func() {
BeforeEach(func() {
wd.Stop()
wd.Reset()
})

It("should be armed and fed", func() {
Eventually(func(g Gomega) {
g.Expect(wd.Status()).To(Equal(watchdog.Armed))
}, 1*time.Second, 100*time.Millisecond).Should(Succeed(), "watchdog should be armed")
verifyWatchdogFood(wd)
})
})
})

func verifyWatchdogFood(wd watchdog.Watchdog) {
currentLastFoodTime := wd.LastFoodTime()
EventuallyWithOffset(1, func() time.Time {
return wd.LastFoodTime()
}, wd.GetTimeout(), 100*time.Millisecond).Should(BeTemporally(">", currentLastFoodTime), "watchdog should receive food")
}

func verifyNoWatchdogFood(wd watchdog.Watchdog) {
currentLastFoodTime := wd.LastFoodTime()
ConsistentlyWithOffset(1, func() time.Time {
return wd.LastFoodTime()
}, 5*wd.GetTimeout(), 1*time.Second).Should(Equal(currentLastFoodTime), "watchdog should not receive food anymore")
}
