Skip to content

Commit 36b67b4

Browse files
fix: [WIN-NPM] race during bootup where we may not add one NetPol to a Pod (#2028)
* fix: lock while adding policy in bootup phase
* test: fix UT to model true pod controller behavior
* fix: prevent deadlock
* style: lint
1 parent c48c3e1 commit 36b67b4

File tree

2 files changed

+58
-40
lines changed

2 files changed

+58
-40
lines changed

npm/pkg/dataplane/dataplane-test-cases_windows_test.go

+1 -3
Original file line numberDiff line numberDiff line change
@@ -2224,15 +2224,13 @@ func getAllMultiJobTests() []*MultiJobTestCase {
22242224
{
22252225
Description: "create namespaces, pods, and a policy which applies to a pod",
22262226
Jobs: map[string][]*Action{
2227-
"finish_bootup_phase": {
2228-
FinishBootupPhase(),
2229-
},
22302227
"namespace_controller": {
22312228
CreateNamespace("x", map[string]string{"k1": "v1"}),
22322229
CreateNamespace("y", map[string]string{"k2": "v2"}),
22332230
ApplyDP(),
22342231
},
22352232
"pod_controller": {
2233+
FinishBootupPhase(),
22362234
CreatePod("x", "a", ip1, thisNode, map[string]string{"k1": "v1"}),
22372235
CreatePod("y", "a", ip2, otherNode, map[string]string{"k1": "v1"}),
22382236
ApplyDP(),

npm/pkg/dataplane/dataplane.go

+57 -37
Original file line numberDiff line numberDiff line change
@@ -18,10 +18,11 @@ import (
1818
const (
1919
reconcileDuration = time.Duration(5 * time.Minute)
2020

21-
contextBackground = "BACKGROUND"
22-
contextApplyDP = "APPLY-DP"
23-
contextAddNetPol = "ADD-NETPOL"
24-
contextDelNetPol = "DEL-NETPOL"
21+
contextBackground = "BACKGROUND"
22+
contextApplyDP = "APPLY-DP"
23+
contextAddNetPol = "ADD-NETPOL"
24+
contextAddNetPolBootup = "BOOTUP-ADD-NETPOL"
25+
contextDelNetPol = "DEL-NETPOL"
2526
)
2627

2728
var ErrInvalidApplyConfig = errors.New("invalid apply config")
@@ -276,24 +277,21 @@ func (dp *DataPlane) RemoveFromList(listName *ipsets.IPSetMetadata, setNames []*
276277
// and accordingly makes changes in dataplane. This function helps emulate a single call to
277278
// dataplane instead of multiple ipset operation calls to dataplane
278279
func (dp *DataPlane) ApplyDataPlane() error {
279-
if dp.applyInBackground {
280-
return dp.incrementBatchAndApplyIfNeeded(contextApplyDP)
280+
if !dp.applyInBackground {
281+
return dp.applyDataPlaneNow(contextApplyDP)
281282
}
282283

283-
return dp.applyDataPlaneNow(contextApplyDP)
284-
}
285-
286-
func (dp *DataPlane) incrementBatchAndApplyIfNeeded(context string) error {
284+
// increment batch and apply dataplane if needed
287285
dp.applyInfo.Lock()
288286
dp.applyInfo.numBatches++
289287
newCount := dp.applyInfo.numBatches
290288
dp.applyInfo.Unlock()
291289

292-
klog.Infof("[DataPlane] [%s] new batch count: %d", context, newCount)
290+
klog.Infof("[DataPlane] [%s] new batch count: %d", contextApplyDP, newCount)
293291

294292
if newCount >= dp.ApplyMaxBatches {
295-
klog.Infof("[DataPlane] [%s] applying now since reached maximum batch count: %d", context, newCount)
296-
return dp.applyDataPlaneNow(context)
293+
klog.Infof("[DataPlane] [%s] applying now since reached maximum batch count: %d", contextApplyDP, newCount)
294+
return dp.applyDataPlaneNow(contextApplyDP)
297295
}
298296

299297
return nil
@@ -380,32 +378,65 @@ func (dp *DataPlane) AddPolicy(policy *policies.NPMNetworkPolicy) error {
380378
return fmt.Errorf("[DataPlane] error while adding Rule IPSet references: %w", err)
381379
}
382380

383-
var endpointList map[string]string
384-
if dp.inBootupPhase() {
381+
inBootupPhase := false
382+
if dp.applyInBackground {
383+
dp.applyInfo.Lock()
384+
inBootupPhase = dp.applyInfo.inBootupPhase
385+
if inBootupPhase {
386+
// keep holding the lock to block FinishBootupPhase() and prevent PodController from
387+
// coming back online and causing race issues from updatePod() within applyDataPlaneNow()
388+
defer dp.applyInfo.Unlock()
389+
} else {
390+
dp.applyInfo.Unlock()
391+
}
392+
}
393+
394+
if inBootupPhase {
385395
// During bootup phase, the Pod controller will not be running.
386396
// We don't need to worry about adding Policies to Endpoints, so we don't need IPSets in the kernel yet.
387397
// Ideally, we get all NetworkPolicies in the cache before the Pod controller starts
388-
err = dp.incrementBatchAndApplyIfNeeded(contextAddNetPol)
389-
if err != nil {
390-
return err
391-
}
392-
} else {
393-
err = dp.applyDataPlaneNow(contextAddNetPol)
394-
if err != nil {
395-
return err
398+
399+
// increment batch and apply IPSets if needed
400+
dp.applyInfo.numBatches++
401+
newCount := dp.applyInfo.numBatches
402+
klog.Infof("[DataPlane] [%s] new batch count: %d", contextAddNetPolBootup, newCount)
403+
if newCount >= dp.ApplyMaxBatches {
404+
klog.Infof("[DataPlane] [%s] applying now since reached maximum batch count: %d", contextAddNetPolBootup, newCount)
405+
klog.Infof("[DataPlane] [%s] starting to apply ipsets", contextAddNetPolBootup)
406+
err = dp.ipsetMgr.ApplyIPSets()
407+
if err != nil {
408+
return fmt.Errorf("[DataPlane] [%s] error while applying IPSets: %w", contextAddNetPolBootup, err)
409+
}
410+
klog.Infof("[DataPlane] [%s] finished applying ipsets", contextAddNetPolBootup)
411+
412+
dp.applyInfo.numBatches = 0
396413
}
397414

398-
endpointList, err = dp.getEndpointsToApplyPolicy(policy)
415+
err = dp.policyMgr.AddPolicy(policy, nil)
399416
if err != nil {
400-
return fmt.Errorf("[DataPlane] error while getting endpoints to apply policy after applying dataplane: %w", err)
417+
return fmt.Errorf("[DataPlane] [%s] error while adding policy: %w", contextAddNetPolBootup, err)
401418
}
419+
420+
return nil
421+
}
422+
423+
// standard, non-bootup phase
424+
err = dp.applyDataPlaneNow(contextAddNetPol)
425+
if err != nil {
426+
return err
427+
}
428+
429+
var endpointList map[string]string
430+
endpointList, err = dp.getEndpointsToApplyPolicy(policy)
431+
if err != nil {
432+
return fmt.Errorf("[DataPlane] error while getting endpoints to apply policy after applying dataplane: %w", err)
402433
}
403434

404-
// endpointList will be empty if in bootup phase
405435
err = dp.policyMgr.AddPolicy(policy, endpointList)
406436
if err != nil {
407437
return fmt.Errorf("[DataPlane] error while adding policy: %w", err)
408438
}
439+
409440
return nil
410441
}
411442

@@ -582,14 +613,3 @@ func (dp *DataPlane) deleteIPSetsAndReferences(sets []*ipsets.TranslatedIPSet, n
582613
}
583614
return nil
584615
}
585-
586-
func (dp *DataPlane) inBootupPhase() bool {
587-
if !dp.applyInBackground {
588-
return false
589-
}
590-
591-
dp.applyInfo.Lock()
592-
defer dp.applyInfo.Unlock()
593-
594-
return dp.applyInfo.inBootupPhase
595-
}

0 commit comments

Comments
 (0)