From ec97b6db880c5f2b13084ae489971ec7c0060da5 Mon Sep 17 00:00:00 2001
From: Evan Baker
Date: Fri, 21 Jul 2023 11:35:39 -0500
Subject: [PATCH] fix: use cached ctrlruntime client in IPAM pool monitor (#2043)

Signed-off-by: Evan Baker
---
 cns/restserver/ipam.go |  4 ++++
 cns/service/main.go    | 53 +++++++++++++++++++++++++-----------------
 2 files changed, 36 insertions(+), 21 deletions(-)

diff --git a/cns/restserver/ipam.go b/cns/restserver/ipam.go
index 5f777712a6f..fb3eaca631f 100644
--- a/cns/restserver/ipam.go
+++ b/cns/restserver/ipam.go
@@ -341,6 +341,10 @@ func (service *HTTPRestService) handleDebugPodContext(w http.ResponseWriter, r *
 func (service *HTTPRestService) handleDebugRestData(w http.ResponseWriter, r *http.Request) {
     service.RLock()
     defer service.RUnlock()
+    if service.IPAMPoolMonitor == nil {
+        http.Error(w, "not ready", http.StatusServiceUnavailable)
+        return
+    }
     resp := GetHTTPServiceDataResponse{
         HTTPRestServiceData: HTTPRestServiceData{
             PodIPIDByPodInterfaceKey: service.PodIPIDByPodInterfaceKey,
diff --git a/cns/service/main.go b/cns/service/main.go
index 7ba10e4ce27..e37f5f9772a 100644
--- a/cns/service/main.go
+++ b/cns/service/main.go
@@ -1126,20 +1126,12 @@ func InitializeCRDState(ctx context.Context, httpRestService cns.HTTPService, cn
     if err != nil {
         return errors.Wrap(err, "failed to create ctrl client")
     }
-    nnccli := nodenetworkconfig.NewClient(directcli)
+    directnnccli := nodenetworkconfig.NewClient(directcli)
     if err != nil {
         return errors.Wrap(err, "failed to create NNC client")
     }
     // TODO(rbtr): nodename and namespace should be in the cns config
-    scopedcli := nncctrl.NewScopedClient(nnccli, types.NamespacedName{Namespace: "kube-system", Name: nodeName})
-
-    clusterSubnetStateChan := make(chan v1alpha1.ClusterSubnetState)
-    // initialize the ipam pool monitor
-    poolOpts := ipampool.Options{
-        RefreshDelay: poolIPAMRefreshRateInMilliseconds * time.Millisecond,
-    }
-    poolMonitor := ipampool.NewMonitor(httpRestServiceImplementation, scopedcli, clusterSubnetStateChan, &poolOpts)
-    httpRestServiceImplementation.IPAMPoolMonitor = poolMonitor
+    directscopedcli := nncctrl.NewScopedClient(directnnccli, types.NamespacedName{Namespace: "kube-system", Name: nodeName})
 
     logger.Printf("Reconciling initial CNS state")
     // apiserver nnc might not be registered or api server might be down and crashloop backof puts us outside of 5-10 minutes we have for
@@ -1149,7 +1141,7 @@ func InitializeCRDState(ctx context.Context, httpRestService cns.HTTPService, cn
     err = retry.Do(func() error {
         attempt++
         logger.Printf("reconciling initial CNS state attempt: %d", attempt)
-        err = reconcileInitialCNSState(ctx, scopedcli, httpRestServiceImplementation, podInfoByIPProvider)
+        err = reconcileInitialCNSState(ctx, directscopedcli, httpRestServiceImplementation, podInfoByIPProvider)
         if err != nil {
             logger.Errorf("failed to reconcile initial CNS state, attempt: %d err: %v", attempt, err)
         }
@@ -1160,16 +1152,6 @@ func InitializeCRDState(ctx context.Context, httpRestService cns.HTTPService, cn
     }
     logger.Printf("reconciled initial CNS state after %d attempts", attempt)
 
-    // start the pool Monitor before the Reconciler, since it needs to be ready to receive an
-    // NodeNetworkConfig update by the time the Reconciler tries to send it.
-    go func() {
-        logger.Printf("Starting IPAM Pool Monitor")
-        if e := poolMonitor.Start(ctx); e != nil {
-            logger.Errorf("[Azure CNS] Failed to start pool monitor with err: %v", e)
-        }
-    }()
-    logger.Printf("initialized and started IPAM pool monitor")
-
     // the nodeScopedCache sets Selector options on the Manager cache which are used
     // to perform *server-side* filtering of the cached objects. This is very important
     // for high node/pod count clusters, as it keeps us from watching objects at the
@@ -1199,6 +1181,25 @@ func InitializeCRDState(ctx context.Context, httpRestService cns.HTTPService, cn
         return errors.Wrap(err, "failed to create manager")
     }
 
+    // Build the IPAM Pool monitor
+    clusterSubnetStateChan := make(chan v1alpha1.ClusterSubnetState)
+
+    // this cachedscopedclient is built using the Manager's cached client, which is
+    // NOT SAFE TO USE UNTIL THE MANAGER IS STARTED!
+    // This is okay because it is only used to build the IPAMPoolMonitor, which does not
+    // attempt to use the client until it has received a NodeNetworkConfig to update, and
+    // that can only happen once the Manager has started and the NodeNetworkConfig
+    // reconciler has pushed the Monitor a NodeNetworkConfig.
+    cachedscopedcli := nncctrl.NewScopedClient(nodenetworkconfig.NewClient(manager.GetClient()), types.NamespacedName{Namespace: "kube-system", Name: nodeName})
+
+    poolOpts := ipampool.Options{
+        RefreshDelay: poolIPAMRefreshRateInMilliseconds * time.Millisecond,
+    }
+    poolMonitor := ipampool.NewMonitor(httpRestServiceImplementation, cachedscopedcli, clusterSubnetStateChan, &poolOpts)
+    httpRestServiceImplementation.IPAMPoolMonitor = poolMonitor
+
+    // Start building the NNC Reconciler
+
     // get our Node so that we can xref it against the NodeNetworkConfig's to make sure that the
     // NNC is not stale and represents the Node we're running on.
     node, err := clientset.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{})
@@ -1231,6 +1232,16 @@
         httpRestServiceImplementation.RegisterPProfEndpoints()
     }
 
+    // start the pool Monitor before the Reconciler, since it needs to be ready to receive an
+    // NodeNetworkConfig update by the time the Reconciler tries to send it.
+    go func() {
+        logger.Printf("Starting IPAM Pool Monitor")
+        if e := poolMonitor.Start(ctx); e != nil {
+            logger.Errorf("[Azure CNS] Failed to start pool monitor with err: %v", e)
+        }
+    }()
+    logger.Printf("initialized and started IPAM pool monitor")
+
     // Start the Manager which starts the reconcile loop.
     // The Reconciler will send an initial NodeNetworkConfig update to the PoolMonitor, starting the
     // Monitor's internal loop.
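
Note (not part of the patch): the comments in the second hunk rely on controller-runtime behavior that manager.GetClient() returns a cache-backed client which only becomes usable once the Manager has started. The sketch below is a minimal, stand-alone illustration of that ordering, not the CNS code itself; the monitor type, the trigger channel, and the node name "node-0" are hypothetical, while the controller-runtime calls (ctrl.NewManager, mgr.GetClient, mgr.GetCache().WaitForCacheSync, mgr.Start) are real APIs.

package main

import (
    "context"
    "log"

    corev1 "k8s.io/api/core/v1"
    "k8s.io/apimachinery/pkg/types"
    ctrl "sigs.k8s.io/controller-runtime"
    "sigs.k8s.io/controller-runtime/pkg/client"
)

// monitor is a stand-in for the IPAM pool monitor: it holds a cache-backed
// client but does not touch it until something pushes on the trigger channel.
type monitor struct {
    cli client.Client
    key types.NamespacedName
}

func (m *monitor) Start(ctx context.Context, trigger <-chan struct{}) error {
    for {
        select {
        case <-ctx.Done():
            return ctx.Err()
        case <-trigger:
            // Safe to read here: nothing can send on trigger until the Manager
            // (and its informer cache) is running.
            node := &corev1.Node{}
            if err := m.cli.Get(ctx, m.key, node); err != nil {
                log.Printf("get node: %v", err)
            }
        }
    }
}

func main() {
    mgr, err := ctrl.NewManager(ctrl.GetConfigOrDie(), ctrl.Options{})
    if err != nil {
        log.Fatal(err)
    }

    // mgr.GetClient() is cache-backed and NOT safe to use until mgr.Start runs,
    // so the monitor must not issue reads or writes before then.
    m := &monitor{cli: mgr.GetClient(), key: types.NamespacedName{Name: "node-0"}}

    trigger := make(chan struct{})
    ctx := ctrl.SetupSignalHandler()

    // Started before the Manager so it is ready to receive the first update,
    // mirroring the pool monitor / reconciler ordering in the patch.
    go func() {
        if err := m.Start(ctx, trigger); err != nil {
            log.Printf("monitor stopped: %v", err)
        }
    }()

    // Stand-in for the reconciler's first push: wait until the cache has
    // synced (i.e. the Manager has started), then trigger one read.
    go func() {
        if mgr.GetCache().WaitForCacheSync(ctx) {
            trigger <- struct{}{}
        }
    }()

    if err := mgr.Start(ctx); err != nil { // blocks; starts the cache and reconcilers
        log.Fatal(err)
    }
}

Under these assumptions, the sketch captures the design choice the patch makes: the consumer goroutine is launched before mgr.Start so it can receive the first push, but it only uses the cached client after the Manager is running.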