Skip to content

Commit

Permalink
fix: use cached ctrlruntime client in IPAM pool monitor (#2043)
Browse files Browse the repository at this point in the history
Signed-off-by: Evan Baker <[email protected]>
  • Loading branch information
rbtr authored Sep 8, 2023
1 parent e614a3c commit ec97b6d
Show file tree
Hide file tree
Showing 2 changed files with 36 additions and 21 deletions.
4 changes: 4 additions & 0 deletions cns/restserver/ipam.go
Original file line number Diff line number Diff line change
Expand Up @@ -341,6 +341,10 @@ func (service *HTTPRestService) handleDebugPodContext(w http.ResponseWriter, r *
func (service *HTTPRestService) handleDebugRestData(w http.ResponseWriter, r *http.Request) {
service.RLock()
defer service.RUnlock()
if service.IPAMPoolMonitor == nil {
http.Error(w, "not ready", http.StatusServiceUnavailable)
return
}
resp := GetHTTPServiceDataResponse{
HTTPRestServiceData: HTTPRestServiceData{
PodIPIDByPodInterfaceKey: service.PodIPIDByPodInterfaceKey,
Expand Down
53 changes: 32 additions & 21 deletions cns/service/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -1126,20 +1126,12 @@ func InitializeCRDState(ctx context.Context, httpRestService cns.HTTPService, cn
if err != nil {
return errors.Wrap(err, "failed to create ctrl client")
}
nnccli := nodenetworkconfig.NewClient(directcli)
directnnccli := nodenetworkconfig.NewClient(directcli)
if err != nil {
return errors.Wrap(err, "failed to create NNC client")
}
// TODO(rbtr): nodename and namespace should be in the cns config
scopedcli := nncctrl.NewScopedClient(nnccli, types.NamespacedName{Namespace: "kube-system", Name: nodeName})

clusterSubnetStateChan := make(chan v1alpha1.ClusterSubnetState)
// initialize the ipam pool monitor
poolOpts := ipampool.Options{
RefreshDelay: poolIPAMRefreshRateInMilliseconds * time.Millisecond,
}
poolMonitor := ipampool.NewMonitor(httpRestServiceImplementation, scopedcli, clusterSubnetStateChan, &poolOpts)
httpRestServiceImplementation.IPAMPoolMonitor = poolMonitor
directscopedcli := nncctrl.NewScopedClient(directnnccli, types.NamespacedName{Namespace: "kube-system", Name: nodeName})

logger.Printf("Reconciling initial CNS state")
// apiserver nnc might not be registered or api server might be down and crashloop backof puts us outside of 5-10 minutes we have for
Expand All @@ -1149,7 +1141,7 @@ func InitializeCRDState(ctx context.Context, httpRestService cns.HTTPService, cn
err = retry.Do(func() error {
attempt++
logger.Printf("reconciling initial CNS state attempt: %d", attempt)
err = reconcileInitialCNSState(ctx, scopedcli, httpRestServiceImplementation, podInfoByIPProvider)
err = reconcileInitialCNSState(ctx, directscopedcli, httpRestServiceImplementation, podInfoByIPProvider)
if err != nil {
logger.Errorf("failed to reconcile initial CNS state, attempt: %d err: %v", attempt, err)
}
Expand All @@ -1160,16 +1152,6 @@ func InitializeCRDState(ctx context.Context, httpRestService cns.HTTPService, cn
}
logger.Printf("reconciled initial CNS state after %d attempts", attempt)

// start the pool Monitor before the Reconciler, since it needs to be ready to receive an
// NodeNetworkConfig update by the time the Reconciler tries to send it.
go func() {
logger.Printf("Starting IPAM Pool Monitor")
if e := poolMonitor.Start(ctx); e != nil {
logger.Errorf("[Azure CNS] Failed to start pool monitor with err: %v", e)
}
}()
logger.Printf("initialized and started IPAM pool monitor")

// the nodeScopedCache sets Selector options on the Manager cache which are used
// to perform *server-side* filtering of the cached objects. This is very important
// for high node/pod count clusters, as it keeps us from watching objects at the
Expand Down Expand Up @@ -1199,6 +1181,25 @@ func InitializeCRDState(ctx context.Context, httpRestService cns.HTTPService, cn
return errors.Wrap(err, "failed to create manager")
}

// Build the IPAM Pool monitor
clusterSubnetStateChan := make(chan v1alpha1.ClusterSubnetState)

// this cachedscopedclient is built using the Manager's cached client, which is
// NOT SAFE TO USE UNTIL THE MANAGER IS STARTED!
// This is okay because it is only used to build the IPAMPoolMonitor, which does not
// attempt to use the client until it has received a NodeNetworkConfig to update, and
// that can only happen once the Manager has started and the NodeNetworkConfig
// reconciler has pushed the Monitor a NodeNetworkConfig.
cachedscopedcli := nncctrl.NewScopedClient(nodenetworkconfig.NewClient(manager.GetClient()), types.NamespacedName{Namespace: "kube-system", Name: nodeName})

poolOpts := ipampool.Options{
RefreshDelay: poolIPAMRefreshRateInMilliseconds * time.Millisecond,
}
poolMonitor := ipampool.NewMonitor(httpRestServiceImplementation, cachedscopedcli, clusterSubnetStateChan, &poolOpts)
httpRestServiceImplementation.IPAMPoolMonitor = poolMonitor

// Start building the NNC Reconciler

// get our Node so that we can xref it against the NodeNetworkConfig's to make sure that the
// NNC is not stale and represents the Node we're running on.
node, err := clientset.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{})
Expand Down Expand Up @@ -1231,6 +1232,16 @@ func InitializeCRDState(ctx context.Context, httpRestService cns.HTTPService, cn
httpRestServiceImplementation.RegisterPProfEndpoints()
}

// start the pool Monitor before the Reconciler, since it needs to be ready to receive an
// NodeNetworkConfig update by the time the Reconciler tries to send it.
go func() {
logger.Printf("Starting IPAM Pool Monitor")
if e := poolMonitor.Start(ctx); e != nil {
logger.Errorf("[Azure CNS] Failed to start pool monitor with err: %v", e)
}
}()
logger.Printf("initialized and started IPAM pool monitor")

// Start the Manager which starts the reconcile loop.
// The Reconciler will send an initial NodeNetworkConfig update to the PoolMonitor, starting the
// Monitor's internal loop.
Expand Down

0 comments on commit ec97b6d

Please sign in to comment.