Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use 'parallel' policy for workspace pod rollouts to avoid stalls #802

Merged
merged 12 commits into from
Feb 10, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .github/workflows/run-acceptance-tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -89,3 +89,8 @@ jobs:
go-version: 1.23.x
- name: Run tests
run: make -C operator test-e2e
- name: 🐛 Debug Build
uses: stateful/vscode-server-action@v1
if: failure()
with:
timeout: '360000' # milliseconds
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,12 @@ CHANGELOG
- Surface Update failures back to the Stack object status [#807](https://github.com/pulumi/pulumi-kubernetes-operator/pull/807)
- Surface update conflict errors when a stack is locked [#807](https://github.com/pulumi/pulumi-kubernetes-operator/pull/807)4 (add changelog entry)
- Do not destroy the workspace pod if an authentication error occurs [#805](https://github.com/pulumi/pulumi-kubernetes-operator/pull/805)
- Use 'parallel' policy for workspace pod rollouts to avoid stalls. [#802](https://github.com/pulumi/pulumi-kubernetes-operator/pull/802)

## 2.0.0-beta.3 (2024-11-27)

- Stack Controller: watch for delete events. [#756](https://github.com/pulumi/pulumi-kubernetes-operator/pull/756)
- Stack Controller: fix an issue where new commits weren't detected when using git sources. https://github.com/pulumi/pulumi-kubernetes-operator/issues/762
- Stack Controller: fix an issue where new commits weren't detected when using git sources. [#762](https://github.com/pulumi/pulumi-kubernetes-operator/issues/762)
- Ensure cleanup of Stack in foreground deletion. [#760](https://github.com/pulumi/pulumi-kubernetes-operator/pull/760)
- Register API resources into the "pulumi" category [#765](https://github.com/pulumi/pulumi-kubernetes-operator/pull/765)
- Use shorter DNS name for pod-to-pod networking. [#764](https://github.com/pulumi/pulumi-kubernetes-operator/pull/764)
Expand Down
2 changes: 1 addition & 1 deletion operator/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,7 @@ test: manifests generate fmt vet envtest ## Run tests.
# Utilize Kind or modify the e2e tests to load the image locally, enabling compatibility with other vendors.
.PHONY: test-e2e # Run the e2e tests against a Kind k8s instance that is spun up.
test-e2e:
go test -count=1 -v --timeout=15m -v ./e2e/...
go test -count=1 -v --timeout=60m -v ./e2e/...

GOLANGCI_LINT = $(shell pwd)/bin/golangci-lint
GOLANGCI_LINT_VERSION ?= v1.54.2
Expand Down
43 changes: 39 additions & 4 deletions operator/e2e/e2e_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,11 @@ import (
"testing"
"time"

autov1alpha1 "github.com/pulumi/pulumi-kubernetes-operator/v2/operator/api/auto/v1alpha1"
pulumiv1 "github.com/pulumi/pulumi-kubernetes-operator/v2/operator/api/pulumi/v1"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/util/yaml"
)

Expand Down Expand Up @@ -75,7 +77,6 @@ func TestE2E(t *testing.T) {
{
name: "random-yaml-nonroot",
f: func(t *testing.T) {
t.Parallel()
// 1. Test `WorkspaceReclaimPolicy` is unset.
cmd := exec.Command("kubectl", "apply", "-f", "e2e/testdata/random-yaml-nonroot")
require.NoError(t, run(cmd))
Expand Down Expand Up @@ -122,7 +123,6 @@ func TestE2E(t *testing.T) {
{
name: "git-auth-nonroot",
f: func(t *testing.T) {
t.Parallel()
if os.Getenv("PULUMI_BOT_TOKEN") == "" {
t.Skip("missing PULUMI_BOT_TOKEN")
}
Expand All @@ -140,8 +140,6 @@ func TestE2E(t *testing.T) {
{
name: "targets",
f: func(t *testing.T) {
t.Parallel()

cmd := exec.Command("kubectl", "apply", "-f", "e2e/testdata/targets")
require.NoError(t, run(cmd))
dumpLogs(t, "targets", "pod/targets-workspace-0")
Expand All @@ -153,6 +151,29 @@ func TestE2E(t *testing.T) {
assert.NotContains(t, stack.Status.Outputs, "notTargeted")
},
},
{
name: "issue-801",
f: func(t *testing.T) {
dumpEvents(t, "issue-801")
dumpLogs(t, "issue-801", "pods/issue-801-workspace-0")

// deploy a workspace with a non-existent container image (pulumi:nosuchimage)
cmd := exec.Command("kubectl", "apply", "-f", "e2e/testdata/issue-801")
require.NoError(t, run(cmd))

// wait for the pod to be created (knowing that it will never become ready)
_, err := waitFor[corev1.Pod]("pods/issue-801-workspace-0", "issue-801", 5*time.Minute, "create")
assert.NoError(t, err)

// update the workspace to a valid image (expecting that a new pod will be rolled out)
cmd = exec.Command("kubectl", "apply", "-f", "e2e/testdata/issue-801/step2")
require.NoError(t, run(cmd))

// wait for the workspace to be fully ready
_, err = waitFor[autov1alpha1.Workspace]("workspaces/issue-801", "issue-801", 5*time.Minute, "condition=Ready")
assert.NoError(t, err)
},
},
{
name: "random-yaml-auth-error",
f: func(t *testing.T) {
Expand Down Expand Up @@ -218,6 +239,20 @@ func dumpLogs(t *testing.T, namespace, name string) {
})
}


func dumpEvents(t *testing.T, namespace string) {
t.Cleanup(func() {
if !t.Failed() {
return
}
t.Logf("=== EVENTS %s", namespace)
cmd := exec.Command("kubectl", "get", "events", "-n", namespace)
out, err := cmd.CombinedOutput()
assert.NoError(t, err)
t.Log(string(out))
})
}

func waitFor[T any](name, namespace string, d time.Duration, conditions ...string) (*T, error) {
if len(conditions) == 0 {
return nil, fmt.Errorf("no conditions provided")
Expand Down
3 changes: 2 additions & 1 deletion operator/e2e/testdata/git-auth-nonroot/manifests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ spec:
refresh: false
continueResyncOnCommitMatch: false
resyncFrequencySeconds: 60
destroyOnFinalize: true
destroyOnFinalize: false

# Enable file state for testing.
envRefs:
Expand All @@ -73,6 +73,7 @@ spec:
value: "test"
workspaceTemplate:
spec:
image: pulumi/pulumi:3.147.0-nonroot
serviceAccountName: git-auth-nonroot
podTemplate:
spec:
Expand Down
55 changes: 55 additions & 0 deletions operator/e2e/testdata/issue-801/manifests.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
# note: image is purposefully set to a non-existent image to simulate a rollout problem
apiVersion: v1
kind: Namespace
metadata:
name: issue-801
---
apiVersion: v1
kind: ServiceAccount
metadata:
name: issue-801
namespace: issue-801
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: issue-801:system:auth-delegator
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: system:auth-delegator
subjects:
- kind: ServiceAccount
name: issue-801
namespace: issue-801
---
apiVersion: auto.pulumi.com/v1alpha1
kind: Workspace
metadata:
name: issue-801
namespace: issue-801
spec:
image: pulumi/nosuchimage:3.147.0-nonroot
securityProfile: restricted
serviceAccountName: issue-801
git:
url: https://github.com/pulumi/examples.git
ref: 1e2fc471709448f3c9f7a250f28f1eafcde7017b
dir: random-yaml
shallow: true
env:
- name: PULUMI_CONFIG_PASSPHRASE
value: test
- name: PULUMI_BACKEND_URL
value: file:///state/

podTemplate:
spec:
containers:
- name: pulumi
volumeMounts:
- name: state
mountPath: /state
volumes:
- name: state
emptyDir: {}
55 changes: 55 additions & 0 deletions operator/e2e/testdata/issue-801/step2/manifests.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
# modified: image tag
apiVersion: v1
kind: Namespace
metadata:
name: issue-801
---
apiVersion: v1
kind: ServiceAccount
metadata:
name: issue-801
namespace: issue-801
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: issue-801:system:auth-delegator
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: system:auth-delegator
subjects:
- kind: ServiceAccount
name: issue-801
namespace: issue-801
---
apiVersion: auto.pulumi.com/v1alpha1
kind: Workspace
metadata:
name: issue-801
namespace: issue-801
spec:
image: pulumi/pulumi:3.147.0-nonroot
securityProfile: restricted
serviceAccountName: issue-801
git:
url: https://github.com/pulumi/examples.git
ref: 1e2fc471709448f3c9f7a250f28f1eafcde7017b
dir: random-yaml
shallow: true
env:
- name: PULUMI_CONFIG_PASSPHRASE
value: test
- name: PULUMI_BACKEND_URL
value: file:///state/

podTemplate:
spec:
containers:
- name: pulumi
volumeMounts:
- name: state
mountPath: /state
volumes:
- name: state
emptyDir: {}
3 changes: 2 additions & 1 deletion operator/e2e/testdata/random-yaml-auth-error/manifests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ spec:
refresh: false
continueResyncOnCommitMatch: false
resyncFrequencySeconds: 60
destroyOnFinalize: true
destroyOnFinalize: false
# Enable file state for testing.
envRefs:
PULUMI_BACKEND_URL:
Expand All @@ -82,6 +82,7 @@ spec:
value: "test"
workspaceTemplate:
spec:
image: pulumi/pulumi:3.147.0-nonroot
serviceAccountName: random-yaml-auth-error
podTemplate:
spec:
Expand Down
3 changes: 2 additions & 1 deletion operator/e2e/testdata/random-yaml-nonroot/manifests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ spec:
refresh: false
continueResyncOnCommitMatch: false
resyncFrequencySeconds: 60
destroyOnFinalize: true
destroyOnFinalize: false

# Enable file state for testing.
envRefs:
Expand All @@ -103,6 +103,7 @@ spec:
value: "test"
workspaceTemplate:
spec:
image: pulumi/pulumi:3.147.0-nonroot
serviceAccountName: random-yaml-nonroot
podTemplate:
spec:
Expand Down
1 change: 1 addition & 0 deletions operator/e2e/testdata/targets/manifests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,7 @@ spec:
value: "test"
workspaceTemplate:
spec:
image: pulumi/pulumi:3.147.0-nonroot
serviceAccountName: targets
podTemplate:
spec:
Expand Down
10 changes: 10 additions & 0 deletions operator/internal/controller/auto/workspace_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ import (
"google.golang.org/grpc/status"
appsv1 "k8s.io/api/apps/v1"
corev1 "k8s.io/api/core/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/api/meta"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
Expand Down Expand Up @@ -188,6 +189,14 @@ func (r *WorkspaceReconciler) Reconcile(ctx context.Context, req ctrl.Request) (
}
err = r.Patch(ctx, ss, client.Apply, client.FieldOwner(FieldManager))
if err != nil {
// issue-801 - migration logic for 2.0.0-beta.3 to 2.0.0
if apierrors.IsInvalid(err) {
l.V(0).Info("replacing the workspace statefulset to update an immutable field")
if err = r.Delete(ctx, ss); err != nil {
return ctrl.Result{}, fmt.Errorf("failed to delete statefulset: %w", err)
}
return ctrl.Result{RequeueAfter: 5 * time.Second}, nil
}
return ctrl.Result{}, fmt.Errorf("failed to apply statefulset: %w", err)
}

Expand Down Expand Up @@ -498,6 +507,7 @@ func newStatefulSet(ctx context.Context, w *autov1alpha1.Workspace, source *sour
Selector: &metav1.LabelSelector{MatchLabels: labels},
ServiceName: nameForService(w),
Replicas: ptr.To[int32](1),
PodManagementPolicy: appsv1.ParallelPodManagement,
Template: corev1.PodTemplateSpec{
ObjectMeta: metav1.ObjectMeta{
Labels: labels,
Expand Down
Loading