From 714c0347153db4f073d83462ab1a265e2d0774f3 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Wed, 16 Dec 2020 10:36:15 -0800 Subject: [PATCH] Harden default creation for parallel invocation If multiple build commands are run in parallel without a builder already existing, there are race conditions where those default creations can get confused and cause builds to fail This adds a new test case to verify parallel builds work, and hardens the bootstrapping code to detect "already exists" errors and retry --- Makefile | 1 - integration/common/kubeclient.go | 4 +- integration/suites/parallel_default_test.go | 92 +++++++++++++++++++++ pkg/driver/driver.go | 15 +++- 4 files changed, 107 insertions(+), 5 deletions(-) create mode 100644 integration/suites/parallel_default_test.go diff --git a/Makefile b/Makefile index 4e4a1c48..597b1dfc 100644 --- a/Makefile +++ b/Makefile @@ -93,4 +93,3 @@ cover.html: cover-int.out cover-unit.out .PHONY: lint lint: golangci-lint run - diff --git a/integration/common/kubeclient.go b/integration/common/kubeclient.go index d8cfc318..78f94cbf 100644 --- a/integration/common/kubeclient.go +++ b/integration/common/kubeclient.go @@ -62,7 +62,9 @@ func RunSimpleBuildImageAsPod(ctx context.Context, name, imageName, namespace st defer func() { err := podClient.Delete(ctx, pod.Name, metav1.DeleteOptions{}) - logrus.Warnf("failed to clean up pod %s: %s", pod.Name, err) + if err != nil { + logrus.Warnf("failed to clean up pod %s: %s", pod.Name, err) + } }() logrus.Infof("waiting for pod to start...") diff --git a/integration/suites/parallel_default_test.go b/integration/suites/parallel_default_test.go new file mode 100644 index 00000000..019e7cf0 --- /dev/null +++ b/integration/suites/parallel_default_test.go @@ -0,0 +1,92 @@ +// Copyright (C) 2020 VMware, Inc. 
+// SPDX-License-Identifier: Apache-2.0 +package suites + +import ( + "context" + "fmt" + "sync" + "testing" + + "github.com/sirupsen/logrus" + "github.com/stretchr/testify/require" + "github.com/stretchr/testify/suite" + "github.com/vmware-tanzu/buildkit-cli-for-kubectl/integration/common" + + "k8s.io/client-go/kubernetes" + v1 "k8s.io/client-go/kubernetes/typed/core/v1" +) + +const ParallelDefaultBuildCount = 3 + +type parallelDefaultSuite struct { + suite.Suite + Name string + CreateFlags []string + + ClientSet *kubernetes.Clientset + Namespace string + + configMapClient v1.ConfigMapInterface +} + +func (s *parallelDefaultSuite) SetupSuite() { + var err error + s.ClientSet, s.Namespace, err = common.GetKubeClientset() + require.NoError(s.T(), err, "%s: kube client failed", s.Name) + s.configMapClient = s.ClientSet.CoreV1().ConfigMaps(s.Namespace) +} + +func (s *parallelDefaultSuite) TestParallelDefaultBuilds() { + logrus.Infof("%s: Parallel %d Build", s.Name, ParallelDefaultBuildCount) + + dirs := make([]string, ParallelDefaultBuildCount) + errors := make([]error, ParallelDefaultBuildCount) + + // Create the contexts before threading + for i := 0; i < ParallelDefaultBuildCount; i++ { + dir, cleanup, err := common.NewSimpleBuildContext() + dirs[i] = dir + require.NoError(s.T(), err, "Failed to set up temporary build context") + defer cleanup() + } + wg := &sync.WaitGroup{} + wg.Add(ParallelDefaultBuildCount) + + for i := 0; i < ParallelDefaultBuildCount; i++ { + go func(i int) { + defer wg.Done() + imageName := fmt.Sprintf("dummy.acme.com/pbuild:%d", i) + args := []string{ + "--progress=plain", + "--tag", imageName, + dirs[i], + } + err := common.RunBuild(args) + if err != nil { + errors[i] = err + return + } + errors[i] = common.RunSimpleBuildImageAsPod( + context.Background(), + fmt.Sprintf("%s-testbuiltimage-%d", s.Name, i), + imageName, + s.Namespace, + s.ClientSet, + ) + + }(i) + } + wg.Wait() + for i := 0; i < ParallelDefaultBuildCount; i++ { + 
require.NoError(s.T(), errors[i], "build/run %d failed", i) } } func TestParallelDefaultBuildSuite(t *testing.T) { common.Skipper(t) // We don't parallelize with other tests, since we use the default builder name suite.Run(t, &parallelDefaultSuite{ Name: "buildkit", }) } diff --git a/pkg/driver/driver.go b/pkg/driver/driver.go index 0d2adefa..c1b338d3 100644 --- a/pkg/driver/driver.go +++ b/pkg/driver/driver.go @@ -6,6 +6,8 @@ import ( "context" "io" "net" + "strings" + "time" "github.com/moby/buildkit/client" "github.com/moby/buildkit/session" @@ -33,6 +35,8 @@ const ( Stopped ) +const maxBootRetries = 3 + func (s Status) String() string { switch s { case Inactive: @@ -98,21 +102,26 @@ func Boot(ctx context.Context, d Driver, pw progress.Writer) (*client.Client, st } try++ if info.Status != Running { - if try > 2 { + if try > maxBootRetries { return nil, "", errors.Errorf("failed to bootstrap %T driver in attempts", d) } if err := d.Bootstrap(ctx, func(s *client.SolveStatus) { if pw != nil { pw.Status() <- s } - }); err != nil { + }); err != nil && (strings.Contains(err.Error(), "already exists") || strings.Contains(err.Error(), "not found")) { + // This most likely means another build is running in parallel + // Give it just enough time to finish creating resources then retry + time.Sleep(250 * time.Millisecond) + continue + } else if err != nil { return nil, "", err } } c, chosenNodeName, err := d.Client(ctx) if err != nil { - if errors.Cause(err) == ErrNotRunning && try <= 2 { + if errors.Cause(err) == ErrNotRunning && try <= maxBootRetries { continue } return nil, "", err