Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve cluster creation success in Korea #1196

Merged
merged 4 commits into from
Dec 1, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion hack/cluster/cluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ func run(ctx context.Context, log *logrus.Entry) error {
return err
}

c, err := cluster.New(log, env, false)
c, err := cluster.New(log, env, os.Getenv("CI") != "")
if err != nil {
return err
}
Expand Down
9 changes: 7 additions & 2 deletions pkg/operator/controllers/checker/checker.go
Original file line number Diff line number Diff line change
@@ -1,11 +1,16 @@
package checker

import "context"

// Copyright (c) Microsoft Corporation.
// Licensed under the Apache License 2.0.

import (
"context"
"errors"
)

type Checker interface {
Check(context.Context) error
Name() string
}

var errRequeue = errors.New("requeue")
4 changes: 3 additions & 1 deletion pkg/operator/controllers/checker/checker_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,9 @@ func (r *CheckerController) Reconcile(request ctrl.Request) (ctrl.Result, error)
if thisErr != nil {
// do all checks even if there is an error
err = thisErr

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this necessary? Can't we use err instead of thisErr? The only real error-generating statement this this function is c.Check(ctx) so one variable should be enough

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

good questions, why this is not enough?

for _, c := range r.checkers {
		err := c.Check(ctx)
		if err != nil {
			// do all checks even if there is an error
			if err != errRequeue {
				r.log.Errorf("checker %s failed with %v", c.Name(), err)
			}
		}
	}

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's subtle. Consider the following case: checker 0 returns an error, and checker 1 returns nil. In that case by your code we would return foo, nil when we exit the function, but in fact we want to return foo, err.

r.log.Errorf("checker %s failed with %v", c.Name(), err)
if thisErr != errRequeue {
r.log.Errorf("checker %s failed with %v", c.Name(), err)
}
}
}

Expand Down
89 changes: 61 additions & 28 deletions pkg/operator/controllers/checker/internetchecker.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,24 +14,13 @@ import (
"github.com/sirupsen/logrus"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/client-go/util/retry"

"github.com/Azure/ARO-RP/pkg/operator"
arov1alpha1 "github.com/Azure/ARO-RP/pkg/operator/apis/aro.openshift.io/v1alpha1"
aroclient "github.com/Azure/ARO-RP/pkg/operator/clientset/versioned/typed/aro.openshift.io/v1alpha1"
"github.com/Azure/ARO-RP/pkg/operator/controllers"
)

// if a check fails it is retried using the following parameters
var checkBackoff = wait.Backoff{
Steps: 5,
Duration: 5 * time.Second,
Factor: 1.5,
Jitter: 0.5,
Cap: 1 * time.Minute,
}

// InternetChecker reconciles a Cluster object
type InternetChecker struct {
arocli aroclient.AroV1alpha1Interface
Expand Down Expand Up @@ -63,6 +52,24 @@ func (r *InternetChecker) Name() string {

// Reconcile will keep checking that the cluster can connect to essential services.
func (r *InternetChecker) Check(ctx context.Context) error {
cli := &http.Client{
Transport: &http.Transport{
// We set DisableKeepAlives for two reasons:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

good explanation

//
// 1. If we're talking HTTP/2 and the remote end blackholes traffic,
// Go has a bug whereby it doesn't reset the connection after a
// timeout (https://github.com/golang/go/issues/36026). If this
// happens, we never have a chance to get healthy. We have
// specifically seen this with gcs.prod.monitoring.core.windows.net
// in Korea Central, which currently has a bad server which when we
// hit it causes our cluster creations to fail.
//
// 2. We *want* to evaluate our capability to successfully create
// *new* connections to internet endpoints anyway.
DisableKeepAlives: true,
},
}

instance, err := r.arocli.Clusters().Get(ctx, arov1alpha1.SingletonClusterName, metav1.GetOptions{})
if err != nil {
return err
Expand All @@ -73,7 +80,7 @@ func (r *InternetChecker) Check(ctx context.Context) error {
for _, url := range instance.Spec.InternetChecker.URLs {
checkCount++
go func(urlToCheck string) {
ch <- r.checkWithRetry(&http.Client{}, urlToCheck, checkBackoff)
ch <- r.checkWithRetry(cli, urlToCheck, time.Minute)
}(url)
}

Expand Down Expand Up @@ -107,27 +114,53 @@ func (r *InternetChecker) Check(ctx context.Context) error {

}

return controllers.SetCondition(ctx, r.arocli, condition, r.role)
err = controllers.SetCondition(ctx, r.arocli, condition, r.role)
if err != nil {
return err
}

if checkFailed {
return errRequeue
}

return nil
}

// check the URL, retrying a failed query a few times according to the given backoff
func (r *InternetChecker) checkWithRetry(client simpleHTTPClient, url string, backoff wait.Backoff) error {
return retry.OnError(backoff, func(_ error) bool { return true }, func() error {
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
defer cancel()
req, err := http.NewRequestWithContext(ctx, http.MethodHead, url, nil)
if err != nil {
return fmt.Errorf("%s: %s", url, err)
}
// check the URL, retrying a failed query a few times
func (r *InternetChecker) checkWithRetry(client simpleHTTPClient, url string, timeout time.Duration) error {
var err error

resp, err := client.Do(req)
if err != nil {
return fmt.Errorf("%s: %s", url, err)
for i := 0; i < 6; i++ {
err = r.checkOnce(client, url, timeout/6)
if err == nil {
return nil
}
defer resp.Body.Close()
}

return err
}

// checkOnce checks a given url. The check both times out after a given timeout
// *and* will wait for the timeout if it fails, so that we don't hit endpoints
// too much.
func (r *InternetChecker) checkOnce(client simpleHTTPClient, url string, timeout time.Duration) error {
ctx, cancel := context.WithTimeout(context.Background(), timeout)
defer cancel()

req, err := http.NewRequestWithContext(ctx, http.MethodHead, url, nil)
if err != nil {
<-ctx.Done()
return err
}

resp, err := client.Do(req)
if err != nil {
<-ctx.Done()
return fmt.Errorf("%s: %s", url, err)
}

return nil
})
resp.Body.Close()
return nil
}

func (r *InternetChecker) conditionType() (ctype status.ConditionType) {
Expand Down
17 changes: 3 additions & 14 deletions pkg/operator/controllers/checker/internetchecker_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@ package checker
// Licensed under the Apache License 2.0.

import (
"bytes"
"context"
"io/ioutil"
"net"
Expand All @@ -15,8 +14,6 @@ import (
"testing"
"time"

"k8s.io/apimachinery/pkg/util/wait"

utillog "github.com/Azure/ARO-RP/pkg/util/log"
)

Expand Down Expand Up @@ -48,14 +45,14 @@ var (
okResp = &fakeResponse{
httpResponse: &http.Response{
StatusCode: http.StatusOK,
Body: ioutil.NopCloser(&bytes.Buffer{}),
Body: ioutil.NopCloser(nil),
},
}

badReq = &fakeResponse{
httpResponse: &http.Response{
StatusCode: http.StatusBadRequest,
Body: ioutil.NopCloser(&bytes.Buffer{}),
Body: ioutil.NopCloser(nil),
},
}

Expand All @@ -71,14 +68,6 @@ var (
timedoutReq = &fakeResponse{err: context.DeadlineExceeded}
)

var testBackoff = wait.Backoff{
Steps: 5,
Duration: 5 * time.Millisecond,
Factor: 2.0,
Jitter: 0.5,
Cap: 50 * time.Millisecond,
}

var testCases = []testCase{
{
name: "200 OK",
Expand Down Expand Up @@ -108,7 +97,7 @@ func TestInternetCheckerCheck(t *testing.T) {
for _, test := range testCases {
t.Run(test.name, func(t *testing.T) {
client := &testClient{responses: test.responses}
err := r.checkWithRetry(client, urltocheck, testBackoff)
err := r.checkWithRetry(client, urltocheck, 100*time.Millisecond)
if (err != nil) != test.wantError {
t.Errorf("InternetChecker.check() error = %v, wantErr %v", err, test.wantError)
}
Expand Down