This repository was archived by the owner on Nov 3, 2023. It is now read-only.

Fix CI #101

Merged
merged 3 commits on Sep 9, 2021
10 changes: 7 additions & 3 deletions .github/workflows/pull_request.yaml
@@ -86,18 +86,22 @@ jobs:
set -x
sudo curl -fsSLo /usr/share/keyrings/kubernetes-archive-keyring.gpg https://packages.cloud.google.com/apt/doc/apt-key.gpg
echo "deb [signed-by=/usr/share/keyrings/kubernetes-archive-keyring.gpg] https://apt.kubernetes.io/ kubernetes-xenial main" | sudo tee /etc/apt/sources.list.d/kubernetes.list
sudo swapoff -a
# Note: Default docker setup (cgroupfs) is incompatible with default kubelet (systemd) so one has to be changed
# since k8s recommends against cgroupfs, we'll use systemd
sudo sh -c "echo '{\"exec-opts\": [\"native.cgroupdriver=systemd\"]}' > /etc/docker/daemon.json"
sudo systemctl restart docker
sudo apt-get update
sudo apt-get install -y kubelet kubeadm kubectl
sudo swapoff -a
sudo kubeadm init
docker info
sudo kubeadm init -v 5 || (sudo journalctl -u kubelet; exit 1)
mkdir -p $HOME/.kube/
sudo cp -i /etc/kubernetes/admin.conf $HOME/.kube/config
sudo chown $USER $HOME/.kube/config
kubectl taint nodes --all node-role.kubernetes.io/master-
kubectl apply -f https://docs.projectcalico.org/manifests/calico.yaml
kubectl wait --for=condition=ready --timeout=30s node --all
kubectl get nodes -o wide
docker version

- name: Run integration tests
run: make integration EXTRA_GO_TEST_FLAGS=-v
9 changes: 7 additions & 2 deletions pkg/build/build.go
@@ -127,18 +127,23 @@ func allIndexes(l int) []int {
}

func ensureBooted(ctx context.Context, drivers []DriverInfo, idxs []int, pw progress.Writer) (map[string]map[string]*client.Client, error) {
lock := sync.Mutex{}
clients := map[string]map[string]*client.Client{} // [driverName][chosenNodeName]
eg, ctx := errgroup.WithContext(ctx)

for _, i := range idxs {
lock.Lock()
clients[drivers[i].Name] = map[string]*client.Client{}
lock.Unlock()
func(i int) {
eg.Go(func() error {
c, chosenNodeName, err := driver.Boot(ctx, drivers[i].Driver, pw)
builderClients, err := driver.Boot(ctx, drivers[i].Driver, pw)
if err != nil {
return err
}
clients[drivers[i].Name][chosenNodeName] = c
lock.Lock()
clients[drivers[i].Name][builderClients.ChosenNode.NodeName] = builderClients.ChosenNode.BuildKitClient
Contributor:

I think you could run into a race here unless you use sync.Mutex and lock/unlock it around the clients[] hashmap.

Contributor (author):

Great catch! You're right, this was unsafe usage.

lock.Unlock()
return nil
})
}(i)
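To make the locking discussion above concrete, here is a minimal, self-contained sketch (illustrative names only, not the project's code) of the pattern the fix applies: a sync.Mutex guarding a shared map while errgroup goroutines write into it.

```go
package main

import (
	"context"
	"fmt"
	"sync"

	"golang.org/x/sync/errgroup"
)

func main() {
	// Shared map written from multiple goroutines; every write must hold
	// the mutex, otherwise Go's runtime can panic on concurrent map writes.
	var lock sync.Mutex
	results := map[string]string{}

	eg, _ := errgroup.WithContext(context.Background())
	for _, name := range []string{"driver-a", "driver-b", "driver-c"} {
		name := name // capture the loop variable for the goroutine (pre-Go 1.22 semantics)
		eg.Go(func() error {
			value := "booted:" + name // stand-in for the result of booting a builder
			lock.Lock()
			results[name] = value
			lock.Unlock()
			return nil
		})
	}
	if err := eg.Wait(); err != nil {
		fmt.Println("error:", err)
		return
	}
	fmt.Println(results)
}
```

Reading the map after eg.Wait() returns is safe without the lock, since all writers have finished by then.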
2 changes: 1 addition & 1 deletion pkg/cmd/create.go
@@ -88,7 +88,7 @@ func runCreate(streams genericclioptions.IOStreams, in createOptions, rootOpts *
}

pw := progress.NewPrinter(ctx, os.Stderr, in.progress)
_, _, err = driver.Boot(ctx, d, pw)
_, err = driver.Boot(ctx, d, pw)
if err != nil {
return err
}
76 changes: 49 additions & 27 deletions pkg/driver/driver.go
@@ -4,16 +4,18 @@ package driver

import (
"context"
"crypto/rand"
"fmt"
"io"
"math/rand"
"math/big"
"net"
"strings"
"time"

"github.com/moby/buildkit/client"
"github.com/moby/buildkit/session"
specs "github.com/opencontainers/image-spec/specs-go/v1"
"github.com/pkg/errors"
"github.com/sirupsen/logrus"
"github.com/vmware-tanzu/buildkit-cli-for-kubectl/pkg/imagetools"
"github.com/vmware-tanzu/buildkit-cli-for-kubectl/pkg/progress"
"github.com/vmware-tanzu/buildkit-cli-for-kubectl/pkg/store"
@@ -36,7 +38,16 @@ const (
Stopped
)

const maxBootRetries = 3
// RandSleep sleeps a random amount of time between zero and maxMS milliseconds
func RandSleep(maxMS int64) {
sleepTime, err := rand.Int(rand.Reader, big.NewInt(maxMS))
if err == nil {
time.Sleep(time.Duration(sleepTime.Int64()) * time.Millisecond)
} else {
logrus.Debugf("failed to get random number: %s", err)
time.Sleep(time.Duration(maxMS) * time.Millisecond)
}
}

func (s Status) String() string {
switch s {
@@ -66,7 +77,7 @@ type Driver interface {
Info(context.Context) (*Info, error)
Stop(ctx context.Context, force bool) error
Rm(ctx context.Context, force bool) error
Client(ctx context.Context) (*client.Client, string, error)
Clients(ctx context.Context) (*BuilderClients, error)
Features() map[Feature]bool
List(ctx context.Context) ([]Builder, error)
RuntimeSockProxy(ctx context.Context, name string) (net.Conn, error)
@@ -94,40 +105,51 @@ type Node struct {
Platforms []specs.Platform
}

func Boot(ctx context.Context, d Driver, pw progress.Writer) (*client.Client, string, error) {
try := 0
rand.Seed(time.Now().UnixNano())
for {
info, err := d.Info(ctx)
type BuilderClients struct {
// The builder on the chosen node/pod
ChosenNode NodeClient

// If multiple builders present, clients to the other builders (excluding the chosen pod)
OtherNodes []NodeClient
}
type NodeClient struct {
NodeName string
ClusterAddr string
BuildKitClient *client.Client
}

func Boot(ctx context.Context, d Driver, pw progress.Writer) (*BuilderClients, error) {
err := fmt.Errorf("timeout before starting")
var info *Info
var results *BuilderClients
for err != nil {
select {
case <-ctx.Done():
return nil, errors.Wrap(err, "timed out trying to bootstrap builder")
default:
}

info, err = d.Info(ctx)
if err != nil {
return nil, "", err
return nil, err
}
try++

if info.Status != Running {
if try > maxBootRetries {
return nil, "", errors.Errorf("failed to bootstrap builder in %d attempts (%s)", try, err)
}
if err = d.Bootstrap(ctx, func(s *client.SolveStatus) {
if pw != nil {
pw.Status() <- s
}
}); err != nil && (strings.Contains(err.Error(), "already exists") || strings.Contains(err.Error(), "not found")) {
// This most likely means another build is running in parallel
// Give it just enough time to finish creating resources then retry
time.Sleep(25 * time.Millisecond * time.Duration(1+rand.Int63n(39))) // 25 - 1000 ms
}); err != nil {
Contributor:

This makes me a bit nervous. I get it might take a bit for the builders to simmer down, but just sleeping randomly for 1000 milliseconds feels a bit weird.

Contributor (author):

That's fair. Let me drop this down to a much smaller pause to get the racing CLIs skewed but not as noticeable to a human.

// Possibly another CLI running in parallel - random sleep then retry
RandSleep(100)
continue
} else if err != nil {
return nil, "", err
}
}

c, chosenNodeName, err := d.Client(ctx)
results, err = d.Clients(ctx)
if err != nil {
if errors.Cause(err) == ErrNotRunning && try <= maxBootRetries {
continue
}
return nil, "", err
// TODO - is there a fail-fast scenario we want to catch here?
RandSleep(1000)
}
return c, chosenNodeName, nil
}
return results, nil
}
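As a usage illustration of the retry-with-jitter approach that Boot now takes (names below are assumed for the sketch, not the project's API): keep retrying until the operation succeeds or the context is cancelled, sleeping a random interval between attempts so CLIs running in parallel drift apart instead of repeatedly colliding.

```go
package main

import (
	"context"
	"crypto/rand"
	"fmt"
	"math/big"
	"time"
)

// randSleep mirrors the RandSleep helper in the diff: sleep between 0 and
// maxMS milliseconds, falling back to the full maxMS if crypto/rand fails.
func randSleep(maxMS int64) {
	n, err := rand.Int(rand.Reader, big.NewInt(maxMS))
	if err != nil {
		time.Sleep(time.Duration(maxMS) * time.Millisecond)
		return
	}
	time.Sleep(time.Duration(n.Int64()) * time.Millisecond)
}

// retryUntilReady retries attempt with a small random pause until it
// succeeds or ctx is cancelled, wrapping the last error on timeout.
func retryUntilReady(ctx context.Context, attempt func() error) error {
	err := fmt.Errorf("not yet attempted")
	for err != nil {
		select {
		case <-ctx.Done():
			return fmt.Errorf("timed out waiting for builder: %w", err)
		default:
		}
		if err = attempt(); err != nil {
			randSleep(100) // small jitter before the next try
		}
	}
	return nil
}

func main() {
	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
	defer cancel()

	attempts := 0
	err := retryUntilReady(ctx, func() error {
		attempts++
		if attempts < 3 {
			return fmt.Errorf("builder not running yet")
		}
		return nil
	})
	fmt.Println("attempts:", attempts, "err:", err)
}
```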