This repository was archived by the owner on Nov 3, 2023. It is now read-only.

Commit

Merge pull request #101 from dhiltgen/fix_ci
Fix CI
dhiltgen authored Sep 9, 2021
2 parents 63b1037 + 278c70c commit ac94d82
Showing 9 changed files with 450 additions and 285 deletions.
10 changes: 7 additions & 3 deletions .github/workflows/pull_request.yaml
@@ -86,18 +86,22 @@ jobs:
set -x
sudo curl -fsSLo /usr/share/keyrings/kubernetes-archive-keyring.gpg https://packages.cloud.google.com/apt/doc/apt-key.gpg
echo "deb [signed-by=/usr/share/keyrings/kubernetes-archive-keyring.gpg] https://apt.kubernetes.io/ kubernetes-xenial main" | sudo tee /etc/apt/sources.list.d/kubernetes.list
sudo swapoff -a
# Note: Default docker setup (cgroupfs) is incompatible with default kubelet (systemd) so one has to be changed
# since k8s recommends against cgroupfs, we'll use systemd
sudo sh -c "echo '{\"exec-opts\": [\"native.cgroupdriver=systemd\"]}' > /etc/docker/daemon.json"
sudo systemctl restart docker
sudo apt-get update
sudo apt-get install -y kubelet kubeadm kubectl
sudo swapoff -a
sudo kubeadm init
docker info
sudo kubeadm init -v 5 || (sudo journalctl -u kubelet; exit 1)
mkdir -p $HOME/.kube/
sudo cp -i /etc/kubernetes/admin.conf $HOME/.kube/config
sudo chown $USER $HOME/.kube/config
kubectl taint nodes --all node-role.kubernetes.io/master-
kubectl apply -f https://docs.projectcalico.org/manifests/calico.yaml
kubectl wait --for=condition=ready --timeout=30s node --all
kubectl get nodes -o wide
docker version
- name: Run integration tests
run: make integration EXTRA_GO_TEST_FLAGS=-v
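For context, the `kubectl wait --for=condition=ready --timeout=30s node --all` step added above gates the integration tests on node readiness. The standalone sketch below shows roughly the same check done programmatically with client-go; it is not part of this PR, and the kubeconfig path, file name, and poll interval are assumptions.

// nodewait.go - hypothetical illustration of the "kubectl wait" readiness gate,
// assuming a kubeconfig at $HOME/.kube/config as written by the workflow above.
package main

import (
	"context"
	"fmt"
	"os"
	"path/filepath"
	"time"

	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/tools/clientcmd"
)

func main() {
	home, err := os.UserHomeDir()
	if err != nil {
		panic(err)
	}
	config, err := clientcmd.BuildConfigFromFlags("", filepath.Join(home, ".kube", "config"))
	if err != nil {
		panic(err)
	}
	clientset, err := kubernetes.NewForConfig(config)
	if err != nil {
		panic(err)
	}
	// Mirror the workflow's 30s timeout.
	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
	defer cancel()
	for {
		nodes, err := clientset.CoreV1().Nodes().List(ctx, metav1.ListOptions{})
		if err != nil {
			panic(err)
		}
		ready := len(nodes.Items) > 0
		for _, n := range nodes.Items {
			nodeReady := false
			for _, c := range n.Status.Conditions {
				if c.Type == corev1.NodeReady && c.Status == corev1.ConditionTrue {
					nodeReady = true
				}
			}
			ready = ready && nodeReady
		}
		if ready {
			fmt.Println("all nodes ready")
			return
		}
		select {
		case <-ctx.Done():
			panic("timed out waiting for nodes to become ready")
		case <-time.After(2 * time.Second):
		}
	}
}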
9 changes: 7 additions & 2 deletions pkg/build/build.go
@@ -127,18 +127,23 @@ func allIndexes(l int) []int {
}

func ensureBooted(ctx context.Context, drivers []DriverInfo, idxs []int, pw progress.Writer) (map[string]map[string]*client.Client, error) {
lock := sync.Mutex{}
clients := map[string]map[string]*client.Client{} // [driverName][chosenNodeName]
eg, ctx := errgroup.WithContext(ctx)

for _, i := range idxs {
lock.Lock()
clients[drivers[i].Name] = map[string]*client.Client{}
lock.Unlock()
func(i int) {
eg.Go(func() error {
c, chosenNodeName, err := driver.Boot(ctx, drivers[i].Driver, pw)
builderClients, err := driver.Boot(ctx, drivers[i].Driver, pw)
if err != nil {
return err
}
clients[drivers[i].Name][chosenNodeName] = c
lock.Lock()
clients[drivers[i].Name][builderClients.ChosenNode.NodeName] = builderClients.ChosenNode.BuildKitClient
lock.Unlock()
return nil
})
}(i)
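The change above routes every write to the shared `clients` map through a mutex, since the errgroup goroutines booting each driver run concurrently. A minimal, self-contained sketch of that pattern (names here are illustrative, not taken from the repository):

package main

import (
	"context"
	"fmt"
	"sync"

	"golang.org/x/sync/errgroup"
)

func main() {
	lock := sync.Mutex{}
	results := map[int]string{} // shared map written by several goroutines
	eg, ctx := errgroup.WithContext(context.Background())

	for i := 0; i < 4; i++ {
		i := i // capture the loop variable for the closure
		eg.Go(func() error {
			if err := ctx.Err(); err != nil {
				return err // another goroutine already failed
			}
			// Access to the shared map is serialized through the mutex.
			lock.Lock()
			results[i] = fmt.Sprintf("booted-%d", i)
			lock.Unlock()
			return nil
		})
	}
	if err := eg.Wait(); err != nil {
		panic(err)
	}
	fmt.Println(results)
}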
2 changes: 1 addition & 1 deletion pkg/cmd/create.go
@@ -88,7 +88,7 @@ func runCreate(streams genericclioptions.IOStreams, in createOptions, rootOpts *
}

pw := progress.NewPrinter(ctx, os.Stderr, in.progress)
_, _, err = driver.Boot(ctx, d, pw)
_, err = driver.Boot(ctx, d, pw)
if err != nil {
return err
}
76 changes: 49 additions & 27 deletions pkg/driver/driver.go
@@ -4,16 +4,18 @@ package driver

import (
"context"
"crypto/rand"
"fmt"
"io"
"math/rand"
"math/big"
"net"
"strings"
"time"

"github.com/moby/buildkit/client"
"github.com/moby/buildkit/session"
specs "github.com/opencontainers/image-spec/specs-go/v1"
"github.com/pkg/errors"
"github.com/sirupsen/logrus"
"github.com/vmware-tanzu/buildkit-cli-for-kubectl/pkg/imagetools"
"github.com/vmware-tanzu/buildkit-cli-for-kubectl/pkg/progress"
"github.com/vmware-tanzu/buildkit-cli-for-kubectl/pkg/store"
@@ -36,7 +38,16 @@ const (
Stopped
)

const maxBootRetries = 3
// RandSleep sleeps a random amount of time between zero and maxMS milliseconds
func RandSleep(maxMS int64) {
sleepTime, err := rand.Int(rand.Reader, big.NewInt(maxMS))
if err == nil {
time.Sleep(time.Duration(sleepTime.Int64()) * time.Millisecond)
} else {
logrus.Debugf("failed to get random number: %s", err)
time.Sleep(time.Duration(maxMS) * time.Millisecond)
}
}

func (s Status) String() string {
switch s {
@@ -66,7 +77,7 @@ type Driver interface {
Info(context.Context) (*Info, error)
Stop(ctx context.Context, force bool) error
Rm(ctx context.Context, force bool) error
Client(ctx context.Context) (*client.Client, string, error)
Clients(ctx context.Context) (*BuilderClients, error)
Features() map[Feature]bool
List(ctx context.Context) ([]Builder, error)
RuntimeSockProxy(ctx context.Context, name string) (net.Conn, error)
@@ -94,40 +105,51 @@ type Node struct {
Platforms []specs.Platform
}

func Boot(ctx context.Context, d Driver, pw progress.Writer) (*client.Client, string, error) {
try := 0
rand.Seed(time.Now().UnixNano())
for {
info, err := d.Info(ctx)
type BuilderClients struct {
// The builder on the chosen node/pod
ChosenNode NodeClient

// If multiple builders present, clients to the other builders (excluding the chosen pod)
OtherNodes []NodeClient
}
type NodeClient struct {
NodeName string
ClusterAddr string
BuildKitClient *client.Client
}

func Boot(ctx context.Context, d Driver, pw progress.Writer) (*BuilderClients, error) {
err := fmt.Errorf("timeout before starting")
var info *Info
var results *BuilderClients
for err != nil {
select {
case <-ctx.Done():
return nil, errors.Wrap(err, "timed out trying to bootstrap builder")
default:
}

info, err = d.Info(ctx)
if err != nil {
return nil, "", err
return nil, err
}
try++

if info.Status != Running {
if try > maxBootRetries {
return nil, "", errors.Errorf("failed to bootstrap builder in %d attempts (%s)", try, err)
}
if err = d.Bootstrap(ctx, func(s *client.SolveStatus) {
if pw != nil {
pw.Status() <- s
}
}); err != nil && (strings.Contains(err.Error(), "already exists") || strings.Contains(err.Error(), "not found")) {
// This most likely means another build is running in parallel
// Give it just enough time to finish creating resources then retry
time.Sleep(25 * time.Millisecond * time.Duration(1+rand.Int63n(39))) // 25 - 1000 ms
}); err != nil {
// Possibly another CLI running in parallel - random sleep then retry
RandSleep(100)
continue
} else if err != nil {
return nil, "", err
}
}

c, chosenNodeName, err := d.Client(ctx)
results, err = d.Clients(ctx)
if err != nil {
if errors.Cause(err) == ErrNotRunning && try <= maxBootRetries {
continue
}
return nil, "", err
// TODO - is there a fail-fast scenario we want to catch here?
RandSleep(1000)
}
return c, chosenNodeName, nil
}
return results, nil
}
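With this change, Boot retries until the context is cancelled, sleeping a random interval via RandSleep between attempts, and returns a *BuilderClients instead of a single client plus node name. The sketch below is a hypothetical caller mirroring the updated call sites in pkg/build/build.go and pkg/cmd/create.go above; the package and function names are assumptions, while the driver types and fields come from the diff.

package example

import (
	"context"
	"fmt"

	"github.com/vmware-tanzu/buildkit-cli-for-kubectl/pkg/driver"
	"github.com/vmware-tanzu/buildkit-cli-for-kubectl/pkg/progress"
)

// bootAndReport is a hypothetical helper showing how the new return value is consumed.
func bootAndReport(ctx context.Context, d driver.Driver, pw progress.Writer) error {
	builderClients, err := driver.Boot(ctx, d, pw)
	if err != nil {
		return err
	}
	// The chosen node carries the BuildKit client the build is driven through.
	chosen := builderClients.ChosenNode
	fmt.Printf("booted builder on node %s (%s)\n", chosen.NodeName, chosen.ClusterAddr)
	_ = chosen.BuildKitClient // *client.Client from github.com/moby/buildkit/client
	// Any additional builder pods are reachable through OtherNodes.
	for _, n := range builderClients.OtherNodes {
		fmt.Printf("additional builder on node %s\n", n.NodeName)
	}
	return nil
}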