🌱 test: e2e: make managed suite more robust to errors with Eventually() #5215

Merged · 1 commit · Jan 31, 2025
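The change replaces one-shot client calls followed by `Expect(err).ToNot(HaveOccurred())` with Gomega's `Eventually()`, so transient API or network errors are retried until the configured timeout expires rather than failing the spec immediately. Below is a minimal sketch of that pattern (an illustrative helper, not the exact suite code), assuming a controller-runtime client and a timeout/poll pair such as the `default/wait-client-request: ["5m", "5s"]` interval added to the e2e config in this PR:

```go
package managed

import (
	"context"

	. "github.com/onsi/gomega"
	corev1 "k8s.io/api/core/v1"
	apimachinerytypes "k8s.io/apimachinery/pkg/types"
	crclient "sigs.k8s.io/controller-runtime/pkg/client"
)

// getSecretWithRetry is an illustrative helper: it re-issues the Get until it
// succeeds or the timeout/poll pair (e.g. the "wait-client-request" values
// ["5m", "5s"]) is exhausted, instead of asserting on a single attempt.
func getSecretWithRetry(ctx context.Context, c crclient.Client, name, namespace string, intervals ...interface{}) *corev1.Secret {
	secret := &corev1.Secret{}
	Eventually(func() error {
		// Any transient error here is retried by Eventually until the timeout.
		return c.Get(ctx, apimachinerytypes.NamespacedName{Name: name, Namespace: namespace}, secret)
	}, intervals...).Should(Succeed(), "eventually failed trying to get Secret %q", name)
	return secret
}
```

A call site would pass the configured intervals, e.g. `getSecretWithRetry(ctx, mgmtClient, "my-secret", namespace, e2eCtx.E2EConfig.GetIntervals("", "wait-client-request")...)`.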
1 change: 1 addition & 0 deletions test/e2e/data/e2e_eks_conf.yaml
@@ -137,6 +137,7 @@ variables:
GC_WORKLOAD: "../../data/gcworkload.yaml"

intervals:
default/wait-client-request: ["5m", "5s"]
default/wait-cluster: ["40m", "10s"]
default/wait-control-plane: ["35m", "10s"]
default/wait-worker-nodes: ["30m", "10s"]
5 changes: 3 additions & 2 deletions test/e2e/suites/managed/addon.go
@@ -62,8 +62,9 @@ func CheckAddonExistsSpec(ctx context.Context, inputGetter func() CheckAddonExis

By(fmt.Sprintf("Getting control plane: %s", controlPlaneName))
controlPlane := &ekscontrolplanev1.AWSManagedControlPlane{}
err := mgmtClient.Get(ctx, crclient.ObjectKey{Namespace: input.Namespace.Name, Name: controlPlaneName}, controlPlane)
Expect(err).ToNot(HaveOccurred())
Eventually(func() error {
return mgmtClient.Get(ctx, crclient.ObjectKey{Namespace: input.Namespace.Name, Name: controlPlaneName}, controlPlane)
}, input.E2EConfig.GetIntervals("", "wait-client-request")...).Should(Succeed(), "eventually failed trying to get the AWSManagedControlPlane")

By(fmt.Sprintf("Checking EKS addon %s is installed on cluster %s and is active", input.AddonName, input.ClusterName))
waitForEKSAddonToHaveStatus(waitForEKSAddonToHaveStatusInput{
32 changes: 20 additions & 12 deletions test/e2e/suites/managed/aws_node_env.go
@@ -21,6 +21,7 @@ package managed

import (
"context"
"errors"
"fmt"

"github.com/aws/aws-sdk-go/aws/client"
@@ -57,27 +58,34 @@ func CheckAwsNodeEnvVarsSet(ctx context.Context, inputGetter func() UpdateAwsNod

By(fmt.Sprintf("Getting control plane: %s", controlPlaneName))
controlPlane := &ekscontrolplanev1.AWSManagedControlPlane{}
err := mgmtClient.Get(ctx, crclient.ObjectKey{Namespace: input.Namespace.Name, Name: controlPlaneName}, controlPlane)
Expect(err).ToNot(HaveOccurred())
Eventually(func() error {
return mgmtClient.Get(ctx, crclient.ObjectKey{Namespace: input.Namespace.Name, Name: controlPlaneName}, controlPlane)
}, input.E2EConfig.GetIntervals("", "wait-client-request")...).Should(Succeed(), "eventually failed trying to get the AWSManagedControlPlane")

By(fmt.Sprintf("Checking environment variables are set on AWSManagedControlPlane: %s", controlPlaneName))
Expect(controlPlane.Spec.VpcCni.Env).NotTo(BeNil())
Expect(len(controlPlane.Spec.VpcCni.Env)).Should(BeNumerically(">", 1))

By("Checking if aws-node has been updated with the defined environment variables on the workload cluster")
daemonSet := &appsv1.DaemonSet{}

clusterClient := input.BootstrapClusterProxy.GetWorkloadCluster(ctx, input.Namespace.Name, input.ClusterName).GetClient()
err = clusterClient.Get(ctx, crclient.ObjectKey{Namespace: "kube-system", Name: "aws-node"}, daemonSet)
Expect(err).ToNot(HaveOccurred())

for _, container := range daemonSet.Spec.Template.Spec.Containers {
if container.Name == "aws-node" {
Expect(matchEnvVar(container.Env, corev1.EnvVar{Name: "FOO", Value: "BAR"})).Should(BeTrue())
Expect(matchEnvVar(container.Env, corev1.EnvVar{Name: "ENABLE_PREFIX_DELEGATION", Value: "true"})).Should(BeTrue())
break

Eventually(func() error {
if err := clusterClient.Get(ctx, crclient.ObjectKey{Namespace: "kube-system", Name: "aws-node"}, daemonSet); err != nil {
return fmt.Errorf("unable to get aws-node: %w", err)
}
}

for _, container := range daemonSet.Spec.Template.Spec.Containers {
if container.Name == "aws-node" {
if matchEnvVar(container.Env, corev1.EnvVar{Name: "FOO", Value: "BAR"}) &&
matchEnvVar(container.Env, corev1.EnvVar{Name: "ENABLE_PREFIX_DELEGATION", Value: "true"}) {
return nil
}
}
}

return errors.New("unable to find the expected environment variables on the aws-node DaemonSet's container")
}, input.E2EConfig.GetIntervals("", "wait-client-request")...).Should(Succeed(), "should have been able to find the expected aws-node DaemonSet")
}

func matchEnvVar(s []corev1.EnvVar, ev corev1.EnvVar) bool {
26 changes: 9 additions & 17 deletions test/e2e/suites/managed/eks_test.go
@@ -22,7 +22,6 @@ package managed
import (
"context"
"fmt"
"time"

"github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
@@ -75,22 +74,15 @@ var _ = ginkgo.Describe("[managed] [general] EKS cluster tests", func() {
})

ginkgo.By("should set environment variables on the aws-node daemonset")
Eventually(func() error {
defer ginkgo.GinkgoRecover()
CheckAwsNodeEnvVarsSet(ctx, func() UpdateAwsNodeVersionSpecInput {
return UpdateAwsNodeVersionSpecInput{
E2EConfig: e2eCtx.E2EConfig,
BootstrapClusterProxy: e2eCtx.Environment.BootstrapClusterProxy,
AWSSession: e2eCtx.BootstrapUserAWSSession,
Namespace: namespace,
ClusterName: clusterName,
}
})
return nil
}).WithTimeout(5*time.Minute).WithPolling(10*time.Second).WithContext(ctx).Should(
Succeed(),
"Failed to verify AWS Node environment variables after 5 minutes of retries",
)
CheckAwsNodeEnvVarsSet(ctx, func() UpdateAwsNodeVersionSpecInput {
return UpdateAwsNodeVersionSpecInput{
E2EConfig: e2eCtx.E2EConfig,
BootstrapClusterProxy: e2eCtx.Environment.BootstrapClusterProxy,
AWSSession: e2eCtx.BootstrapUserAWSSession,
Namespace: namespace,
ClusterName: clusterName,
}
})

ginkgo.By("should have the VPC CNI installed")
CheckAddonExistsSpec(ctx, func() CheckAddonExistsSpecInput {
70 changes: 56 additions & 14 deletions test/e2e/suites/managed/helpers.go
@@ -32,6 +32,7 @@ import (
. "github.com/onsi/gomega"
corev1 "k8s.io/api/core/v1"
apimachinerytypes "k8s.io/apimachinery/pkg/types"
"k8s.io/utils/ptr"
crclient "sigs.k8s.io/controller-runtime/pkg/client"

infrav1 "sigs.k8s.io/cluster-api-provider-aws/v2/api/v1beta2"
@@ -51,6 +52,11 @@ const (
EKSControlPlaneOnlyLegacyFlavor = "eks-control-plane-only-legacy"
)

const (
clientRequestTimeout = 2 * time.Minute
clientRequestCheckInterval = 5 * time.Second
)

type DefaultConfigClusterFn func(clusterName, namespace string) clusterctl.ConfigClusterInput

func getEKSClusterName(namespace, clusterName string) string {
@@ -74,14 +80,19 @@ func getASGName(clusterName string) string {
}

func verifyClusterActiveAndOwned(eksClusterName string, sess client.ConfigProvider) {
cluster, err := getEKSCluster(eksClusterName, sess)
Expect(err).NotTo(HaveOccurred())
var (
cluster *eks.Cluster
err error
)
Eventually(func() error {
cluster, err = getEKSCluster(eksClusterName, sess)
return err
}, clientRequestTimeout, clientRequestCheckInterval).Should(Succeed(), fmt.Sprintf("eventually failed trying to get EKS Cluster %q", eksClusterName))

tagName := infrav1.ClusterTagKey(eksClusterName)
tagValue, ok := cluster.Tags[tagName]
Expect(ok).To(BeTrue(), "expecting the cluster owned tag to exist")
Expect(*tagValue).To(BeEquivalentTo(string(infrav1.ResourceLifecycleOwned)))

Expect(*cluster.Status).To(BeEquivalentTo(eks.ClusterStatusActive))
}

@@ -102,6 +113,7 @@ func getEKSClusterAddon(eksClusterName, addonName string, sess client.ConfigProv
AddonName: &addonName,
ClusterName: &eksClusterName,
}

describeOutput, err := eksClient.DescribeAddon(describeInput)
if err != nil {
return nil, fmt.Errorf("describing eks addon %s: %w", addonName, err)
Expand All @@ -112,16 +124,16 @@ func getEKSClusterAddon(eksClusterName, addonName string, sess client.ConfigProv

func verifySecretExists(ctx context.Context, secretName, namespace string, k8sclient crclient.Client) {
secret := &corev1.Secret{}
err := k8sclient.Get(ctx, apimachinerytypes.NamespacedName{Name: secretName, Namespace: namespace}, secret)

Expect(err).ShouldNot(HaveOccurred())
Eventually(func() error {
return k8sclient.Get(ctx, apimachinerytypes.NamespacedName{Name: secretName, Namespace: namespace}, secret)
}, clientRequestTimeout, clientRequestCheckInterval).Should(Succeed(), fmt.Sprintf("eventually failed trying to verify Secret %q exists", secretName))
}

func verifyConfigMapExists(ctx context.Context, name, namespace string, k8sclient crclient.Client) {
cm := &corev1.ConfigMap{}
Eventually(func() error {
return k8sclient.Get(ctx, apimachinerytypes.NamespacedName{Name: name, Namespace: namespace}, cm)
}, 2*time.Minute, 5*time.Second).Should(Succeed())
}, clientRequestTimeout, clientRequestCheckInterval).Should(Succeed(), fmt.Sprintf("eventually failed trying to verify ConfigMap %q exists", name))
}

func VerifyRoleExistsAndOwned(roleName string, eksClusterName string, checkOwned bool, sess client.ConfigProvider) {
@@ -130,8 +142,15 @@ func VerifyRoleExistsAndOwned(roleName string, eksClusterName string, checkOwned
RoleName: aws.String(roleName),
}

output, err := iamClient.GetRole(input)
Expect(err).ShouldNot(HaveOccurred())
var (
output *iam.GetRoleOutput
err error
)

Eventually(func() error {
output, err = iamClient.GetRole(input)
return err
}, clientRequestTimeout, clientRequestCheckInterval).Should(Succeed(), fmt.Sprintf("eventually failed trying to get IAM Role %q", roleName))

if checkOwned {
found := false
@@ -152,9 +171,24 @@ func verifyManagedNodeGroup(eksClusterName, nodeGroupName string, checkOwned boo
ClusterName: aws.String(eksClusterName),
NodegroupName: aws.String(nodeGroupName),
}
result, err := eksClient.DescribeNodegroup(input)
Expect(err).NotTo(HaveOccurred())
Expect(*result.Nodegroup.Status).To(BeEquivalentTo(eks.NodegroupStatusActive))
var (
result *eks.DescribeNodegroupOutput
err error
)

Eventually(func() error {
result, err = eksClient.DescribeNodegroup(input)
if err != nil {
return fmt.Errorf("error describing nodegroup: %w", err)
}

nodeGroupStatus := ptr.Deref(result.Nodegroup.Status, "")
if nodeGroupStatus != eks.NodegroupStatusActive {
return fmt.Errorf("expected nodegroup.Status to be %q, was %q instead", eks.NodegroupStatusActive, nodeGroupStatus)
}

return nil
}, clientRequestTimeout, clientRequestCheckInterval).Should(Succeed(), "eventually failed trying to describe EKS Node group")

if checkOwned {
tagName := infrav1.ClusterAWSCloudProviderTagKey(eksClusterName)
@@ -172,8 +206,16 @@ func verifyASG(eksClusterName, asgName string, checkOwned bool, sess client.Conf
},
}

result, err := asgClient.DescribeAutoScalingGroups(input)
Expect(err).NotTo(HaveOccurred())
var (
result *autoscaling.DescribeAutoScalingGroupsOutput
err error
)

Eventually(func() error {
result, err = asgClient.DescribeAutoScalingGroups(input)
return err
}, clientRequestTimeout, clientRequestCheckInterval).Should(Succeed())

for _, instance := range result.AutoScalingGroups[0].Instances {
Expect(*instance.LifecycleState).To(Equal("InService"), "expecting the instance in service")
}