From ddb1d76178267e12a510d62f96d304bbfa0ec0cd Mon Sep 17 00:00:00 2001 From: Yang Guo Date: Mon, 8 Jul 2019 14:50:49 -0700 Subject: [PATCH] Support waiting for kube-apiserver to be ready with timout during NPD startup --- cmd/options/options.go | 9 +++++++ pkg/exporters/k8sexporter/k8s_exporter.go | 24 ++++++++++++++++++- .../problemclient/fake_problem_client.go | 6 ++++- .../problemclient/problem_client.go | 11 +++++++-- 4 files changed, 46 insertions(+), 4 deletions(-) diff --git a/cmd/options/options.go b/cmd/options/options.go index c7786ce28..ab6ccdbcf 100644 --- a/cmd/options/options.go +++ b/cmd/options/options.go @@ -20,6 +20,7 @@ import ( "flag" "fmt" "os" + "time" "net/url" @@ -49,6 +50,12 @@ type NodeProblemDetectorOptions struct { EnableK8sExporter bool // ApiServerOverride is the custom URI used to connect to Kubernetes ApiServer. ApiServerOverride string + // APIServerWaitTimeout is the timeout on waiting for kube-apiserver to be + // ready. + APIServerWaitTimeout time.Duration + // APIServerWaitInterval is the interval between the checks on the + // readiness of kube-apiserver. + APIServerWaitInterval time.Duration // prometheusExporter options // PrometheusServerPort is the port to bind the Prometheus scrape endpoint. Use 0 to disable. @@ -96,6 +103,8 @@ func (npdo *NodeProblemDetectorOptions) AddFlags(fs *pflag.FlagSet) { fs.BoolVar(&npdo.EnableK8sExporter, "enable-k8s-exporter", true, "Enables reporting to Kubernetes API server.") fs.StringVar(&npdo.ApiServerOverride, "apiserver-override", "", "Custom URI used to connect to Kubernetes ApiServer. This is ignored if --enable-k8s-exporter is false.") + fs.DurationVar(&npdo.APIServerWaitTimeout, "apiserver-wait-timeout", time.Duration(5)*time.Minute, "The timeout on waiting for kube-apiserver to be ready. This is ignored if --enable-k8s-exporter is false.") + fs.DurationVar(&npdo.APIServerWaitInterval, "apiserver-wait-interval", time.Duration(5)*time.Second, "The interval between the checks on the readiness of kube-apiserver. This is ignored if --enable-k8s-exporter is false.") fs.BoolVar(&npdo.PrintVersion, "version", false, "Print version information and quit") fs.StringVar(&npdo.HostnameOverride, "hostname-override", "", "Custom node name used to override hostname") diff --git a/pkg/exporters/k8sexporter/k8s_exporter.go b/pkg/exporters/k8sexporter/k8s_exporter.go index 6cdf80e18..47900ddcb 100644 --- a/pkg/exporters/k8sexporter/k8s_exporter.go +++ b/pkg/exporters/k8sexporter/k8s_exporter.go @@ -25,6 +25,7 @@ import ( "github.com/golang/glog" "k8s.io/apimachinery/pkg/util/clock" + "k8s.io/apimachinery/pkg/util/wait" "k8s.io/node-problem-detector/cmd/options" "k8s.io/node-problem-detector/pkg/exporters/k8sexporter/condition" @@ -38,13 +39,23 @@ type k8sExporter struct { conditionManager condition.ConditionManager } -// NewExporterOrDie creates a exporter for Kubernetes apiserver exporting, panics if error occurs. +// NewExporterOrDie creates a exporter for Kubernetes apiserver exporting, +// panics if error occurs. +// +// Note that this function may be blocked (until a timeout occurs) before +// kube-apiserver becomes ready. func NewExporterOrDie(npdo *options.NodeProblemDetectorOptions) types.Exporter { if !npdo.EnableK8sExporter { return nil } c := problemclient.NewClientOrDie(npdo) + + glog.Infof("Waiting for kube-apiserver to be ready (timeout %v)...", npdo.APIServerWaitTimeout) + if err := waitForAPIServerReadyWithTimeout(c, npdo); err != nil { + glog.Warningf("kube-apiserver did not become ready: timed out on waiting for kube-apiserver to return the node object: %v", err) + } + ke := k8sExporter{ client: c, conditionManager: condition.NewConditionManager(c, clock.RealClock{}), @@ -91,3 +102,14 @@ func (ke *k8sExporter) startHTTPReporting(npdo *options.NodeProblemDetectorOptio } }() } + +func waitForAPIServerReadyWithTimeout(c problemclient.Client, npdo *options.NodeProblemDetectorOptions) error { + return wait.PollImmediate(npdo.APIServerWaitInterval, npdo.APIServerWaitTimeout, func() (done bool, err error) { + // If NPD can get the node object from kube-apiserver, the server is + // ready and the RBAC permission is set correctly. + if _, err := c.GetNode(); err == nil { + return true, nil + } + return false, nil + }) +} diff --git a/pkg/exporters/k8sexporter/problemclient/fake_problem_client.go b/pkg/exporters/k8sexporter/problemclient/fake_problem_client.go index 6f878493b..50e47350e 100644 --- a/pkg/exporters/k8sexporter/problemclient/fake_problem_client.go +++ b/pkg/exporters/k8sexporter/problemclient/fake_problem_client.go @@ -21,7 +21,7 @@ import ( "reflect" "sync" - "k8s.io/api/core/v1" + v1 "k8s.io/api/core/v1" ) // FakeProblemClient is a fake problem client for debug. @@ -92,3 +92,7 @@ func (f *FakeProblemClient) GetConditions(types []v1.NodeConditionType) ([]*v1.N // Eventf does nothing now. func (f *FakeProblemClient) Eventf(eventType string, source, reason, messageFmt string, args ...interface{}) { } + +func (f *FakeProblemClient) GetNode() (*v1.Node, error) { + return nil, fmt.Errorf("GetNode() not implemented") +} diff --git a/pkg/exporters/k8sexporter/problemclient/problem_client.go b/pkg/exporters/k8sexporter/problemclient/problem_client.go index c2f841416..70a022ad6 100644 --- a/pkg/exporters/k8sexporter/problemclient/problem_client.go +++ b/pkg/exporters/k8sexporter/problemclient/problem_client.go @@ -26,7 +26,7 @@ import ( typedcorev1 "k8s.io/client-go/kubernetes/typed/core/v1" "k8s.io/kubernetes/pkg/api/legacyscheme" - "k8s.io/api/core/v1" + v1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" "k8s.io/apimachinery/pkg/util/clock" @@ -47,6 +47,9 @@ type Client interface { SetConditions(conditions []v1.NodeCondition) error // Eventf reports the event. Eventf(eventType string, source, reason, messageFmt string, args ...interface{}) + // GetNode returns the Node object of the node on which the + // node-problem-detector runs. + GetNode() (*v1.Node, error) } type nodeProblemClient struct { @@ -79,7 +82,7 @@ func NewClientOrDie(npdo *options.NodeProblemDetectorOptions) Client { } func (c *nodeProblemClient) GetConditions(conditionTypes []v1.NodeConditionType) ([]*v1.NodeCondition, error) { - node, err := c.client.Nodes().Get(c.nodeName, metav1.GetOptions{}) + node, err := c.GetNode() if err != nil { return nil, err } @@ -116,6 +119,10 @@ func (c *nodeProblemClient) Eventf(eventType, source, reason, messageFmt string, recorder.Eventf(c.nodeRef, eventType, reason, messageFmt, args...) } +func (c *nodeProblemClient) GetNode() (*v1.Node, error) { + return c.client.Nodes().Get(c.nodeName, metav1.GetOptions{}) +} + // generatePatch generates condition patch func generatePatch(conditions []v1.NodeCondition) ([]byte, error) { raw, err := json.Marshal(&conditions)