From 992aa5ea1a35b6bf96089f3649f72b89f392525e Mon Sep 17 00:00:00 2001 From: Jim Minter Date: Wed, 25 Nov 2020 18:01:31 -0600 Subject: [PATCH 1/4] enable hack/cluster create to be run back-to-back --- hack/cluster/cluster.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hack/cluster/cluster.go b/hack/cluster/cluster.go index 4f4da176ef1..1c5030d2a7c 100644 --- a/hack/cluster/cluster.go +++ b/hack/cluster/cluster.go @@ -36,7 +36,7 @@ func run(ctx context.Context, log *logrus.Entry) error { return err } - c, err := cluster.New(log, env, false) + c, err := cluster.New(log, env, os.Getenv("CI") != "") if err != nil { return err } From 5bd3dd95f6853ab18d3d1b07c36e4c4d315e1c70 Mon Sep 17 00:00:00 2001 From: Jim Minter Date: Wed, 25 Nov 2020 18:02:25 -0600 Subject: [PATCH 2/4] set DisableKeepAlives in internetchecker.go This is expected to significantly improve our cluster creation success in South Korea. --- .../controllers/checker/internetchecker.go | 20 ++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/pkg/operator/controllers/checker/internetchecker.go b/pkg/operator/controllers/checker/internetchecker.go index 6f486e0c402..a7a5e1e69f0 100644 --- a/pkg/operator/controllers/checker/internetchecker.go +++ b/pkg/operator/controllers/checker/internetchecker.go @@ -63,6 +63,24 @@ func (r *InternetChecker) Name() string { // Reconcile will keep checking that the cluster can connect to essential services. func (r *InternetChecker) Check(ctx context.Context) error { + cli := &http.Client{ + Transport: &http.Transport{ + // We set DisableKeepAlives for two reasons: + // + // 1. If we're talking HTTP/2 and the remote end blackholes traffic, + // Go has a bug whereby it doesn't reset the connection after a + // timeout (https://github.com/golang/go/issues/36026). If this + // happens, we never have a chance to get healthy. We have + // specifically seen this with gcs.prod.monitoring.core.windows.net + // in Korea Central, which currently has a bad server which when we + // hit it causes our cluster creations to fail. + // + // 2. We *want* to evaluate our capability to successfully create + // *new* connections to internet endpoints anyway. + DisableKeepAlives: true, + }, + } + instance, err := r.arocli.Clusters().Get(ctx, arov1alpha1.SingletonClusterName, metav1.GetOptions{}) if err != nil { return err @@ -73,7 +91,7 @@ func (r *InternetChecker) Check(ctx context.Context) error { for _, url := range instance.Spec.InternetChecker.URLs { checkCount++ go func(urlToCheck string) { - ch <- r.checkWithRetry(&http.Client{}, urlToCheck, checkBackoff) + ch <- r.checkWithRetry(cli, urlToCheck, checkBackoff) }(url) } From 436b35306ebc858a70ce156414ab5e94c8453867 Mon Sep 17 00:00:00 2001 From: Jim Minter Date: Wed, 25 Nov 2020 18:03:15 -0600 Subject: [PATCH 3/4] remove clarify internetchecker, ensuring that failed tests can not run too fast --- .../controllers/checker/internetchecker.go | 60 ++++++++++--------- .../checker/internetchecker_test.go | 17 +----- 2 files changed, 36 insertions(+), 41 deletions(-) diff --git a/pkg/operator/controllers/checker/internetchecker.go b/pkg/operator/controllers/checker/internetchecker.go index a7a5e1e69f0..db3eba388e9 100644 --- a/pkg/operator/controllers/checker/internetchecker.go +++ b/pkg/operator/controllers/checker/internetchecker.go @@ -14,8 +14,6 @@ import ( "github.com/sirupsen/logrus" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/util/wait" - "k8s.io/client-go/util/retry" "github.com/Azure/ARO-RP/pkg/operator" arov1alpha1 "github.com/Azure/ARO-RP/pkg/operator/apis/aro.openshift.io/v1alpha1" @@ -23,15 +21,6 @@ import ( "github.com/Azure/ARO-RP/pkg/operator/controllers" ) -// if a check fails it is retried using the following parameters -var checkBackoff = wait.Backoff{ - Steps: 5, - Duration: 5 * time.Second, - Factor: 1.5, - Jitter: 0.5, - Cap: 1 * time.Minute, -} - // InternetChecker reconciles a Cluster object type InternetChecker struct { arocli aroclient.AroV1alpha1Interface @@ -91,7 +80,7 @@ func (r *InternetChecker) Check(ctx context.Context) error { for _, url := range instance.Spec.InternetChecker.URLs { checkCount++ go func(urlToCheck string) { - ch <- r.checkWithRetry(cli, urlToCheck, checkBackoff) + ch <- r.checkWithRetry(cli, urlToCheck, time.Minute) }(url) } @@ -128,24 +117,41 @@ func (r *InternetChecker) Check(ctx context.Context) error { return controllers.SetCondition(ctx, r.arocli, condition, r.role) } -// check the URL, retrying a failed query a few times according to the given backoff -func (r *InternetChecker) checkWithRetry(client simpleHTTPClient, url string, backoff wait.Backoff) error { - return retry.OnError(backoff, func(_ error) bool { return true }, func() error { - ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) - defer cancel() - req, err := http.NewRequestWithContext(ctx, http.MethodHead, url, nil) - if err != nil { - return fmt.Errorf("%s: %s", url, err) - } +// check the URL, retrying a failed query a few times +func (r *InternetChecker) checkWithRetry(client simpleHTTPClient, url string, timeout time.Duration) error { + var err error - resp, err := client.Do(req) - if err != nil { - return fmt.Errorf("%s: %s", url, err) + for i := 0; i < 6; i++ { + err = r.checkOnce(client, url, timeout/6) + if err == nil { + return nil } - defer resp.Body.Close() + } + + return err +} + +// checkOnce checks a given url. The check both times out after a given timeout +// *and* will wait for the timeout if it fails, so that we don't hit endpoints +// too much. +func (r *InternetChecker) checkOnce(client simpleHTTPClient, url string, timeout time.Duration) error { + ctx, cancel := context.WithTimeout(context.Background(), timeout) + defer cancel() + + req, err := http.NewRequestWithContext(ctx, http.MethodHead, url, nil) + if err != nil { + <-ctx.Done() + return err + } + + resp, err := client.Do(req) + if err != nil { + <-ctx.Done() + return fmt.Errorf("%s: %s", url, err) + } - return nil - }) + resp.Body.Close() + return nil } func (r *InternetChecker) conditionType() (ctype status.ConditionType) { diff --git a/pkg/operator/controllers/checker/internetchecker_test.go b/pkg/operator/controllers/checker/internetchecker_test.go index 426b23ce5f7..2ed08a660ca 100644 --- a/pkg/operator/controllers/checker/internetchecker_test.go +++ b/pkg/operator/controllers/checker/internetchecker_test.go @@ -4,7 +4,6 @@ package checker // Licensed under the Apache License 2.0. import ( - "bytes" "context" "io/ioutil" "net" @@ -15,8 +14,6 @@ import ( "testing" "time" - "k8s.io/apimachinery/pkg/util/wait" - utillog "github.com/Azure/ARO-RP/pkg/util/log" ) @@ -48,14 +45,14 @@ var ( okResp = &fakeResponse{ httpResponse: &http.Response{ StatusCode: http.StatusOK, - Body: ioutil.NopCloser(&bytes.Buffer{}), + Body: ioutil.NopCloser(nil), }, } badReq = &fakeResponse{ httpResponse: &http.Response{ StatusCode: http.StatusBadRequest, - Body: ioutil.NopCloser(&bytes.Buffer{}), + Body: ioutil.NopCloser(nil), }, } @@ -71,14 +68,6 @@ var ( timedoutReq = &fakeResponse{err: context.DeadlineExceeded} ) -var testBackoff = wait.Backoff{ - Steps: 5, - Duration: 5 * time.Millisecond, - Factor: 2.0, - Jitter: 0.5, - Cap: 50 * time.Millisecond, -} - var testCases = []testCase{ { name: "200 OK", @@ -108,7 +97,7 @@ func TestInternetCheckerCheck(t *testing.T) { for _, test := range testCases { t.Run(test.name, func(t *testing.T) { client := &testClient{responses: test.responses} - err := r.checkWithRetry(client, urltocheck, testBackoff) + err := r.checkWithRetry(client, urltocheck, 100*time.Millisecond) if (err != nil) != test.wantError { t.Errorf("InternetChecker.check() error = %v, wantErr %v", err, test.wantError) } From 8704af953d8aaae55120f1d57fb228c186becc36 Mon Sep 17 00:00:00 2001 From: Jim Minter Date: Wed, 25 Nov 2020 18:12:33 -0600 Subject: [PATCH 4/4] requeue internet checker immediately if it fails --- pkg/operator/controllers/checker/checker.go | 9 +++++++-- .../controllers/checker/checker_controller.go | 4 +++- pkg/operator/controllers/checker/internetchecker.go | 11 ++++++++++- 3 files changed, 20 insertions(+), 4 deletions(-) diff --git a/pkg/operator/controllers/checker/checker.go b/pkg/operator/controllers/checker/checker.go index 7c45e21c6bf..42337c5fd2e 100644 --- a/pkg/operator/controllers/checker/checker.go +++ b/pkg/operator/controllers/checker/checker.go @@ -1,11 +1,16 @@ package checker -import "context" - // Copyright (c) Microsoft Corporation. // Licensed under the Apache License 2.0. +import ( + "context" + "errors" +) + type Checker interface { Check(context.Context) error Name() string } + +var errRequeue = errors.New("requeue") diff --git a/pkg/operator/controllers/checker/checker_controller.go b/pkg/operator/controllers/checker/checker_controller.go index 1e072093064..3e89ce20e1a 100644 --- a/pkg/operator/controllers/checker/checker_controller.go +++ b/pkg/operator/controllers/checker/checker_controller.go @@ -59,7 +59,9 @@ func (r *CheckerController) Reconcile(request ctrl.Request) (ctrl.Result, error) if thisErr != nil { // do all checks even if there is an error err = thisErr - r.log.Errorf("checker %s failed with %v", c.Name(), err) + if thisErr != errRequeue { + r.log.Errorf("checker %s failed with %v", c.Name(), err) + } } } diff --git a/pkg/operator/controllers/checker/internetchecker.go b/pkg/operator/controllers/checker/internetchecker.go index db3eba388e9..07b9ea3d12f 100644 --- a/pkg/operator/controllers/checker/internetchecker.go +++ b/pkg/operator/controllers/checker/internetchecker.go @@ -114,7 +114,16 @@ func (r *InternetChecker) Check(ctx context.Context) error { } - return controllers.SetCondition(ctx, r.arocli, condition, r.role) + err = controllers.SetCondition(ctx, r.arocli, condition, r.role) + if err != nil { + return err + } + + if checkFailed { + return errRequeue + } + + return nil } // check the URL, retrying a failed query a few times