From 2d6c82b9ff44a197a6ce06d2001b32570b61f376 Mon Sep 17 00:00:00 2001 From: Jan Fajerski Date: Tue, 12 Mar 2024 20:41:53 +0100 Subject: [PATCH] fix: use thanos default port in service and containerPort (#414) * fix: use thanos default port in service and containerPort Before thanos was only listening on localhost. Relying on the default setting allows user to modify that via server side apply and add a proxy chain. Signed-off-by: Jan Fajerski * fix: various test fixes and refactors Mostly this is replacing deprecated functions and attempting to reduce timing dependent behavior by waiting for expected resources to show up. Signed-off-by: Jan Fajerski * test: add flag to limit test to run to test/run-e2e.sh Also add the -count 1 flag (so results are never cached) and drop the --retain flag (not sure what that ever did). Signed-off-by: Jan Fajerski * test: update e2e action to use kind v0.20.0 Signed-off-by: Jan Fajerski --------- Signed-off-by: Jan Fajerski --- .github/e2e-tests-olm/action.yaml | 2 +- .../monitoring/thanos-querier/components.go | 6 +-- test/e2e/framework/assertions.go | 46 ++++++++++++++++--- test/e2e/framework/framework.go | 13 +++--- test/e2e/monitoring_stack_controller_test.go | 21 ++++++++- test/e2e/thanos_querier_controller_test.go | 13 +++--- test/run-e2e.sh | 13 +++++- 7 files changed, 86 insertions(+), 28 deletions(-) diff --git a/.github/e2e-tests-olm/action.yaml b/.github/e2e-tests-olm/action.yaml index fd125449..73c57541 100644 --- a/.github/e2e-tests-olm/action.yaml +++ b/.github/e2e-tests-olm/action.yaml @@ -4,7 +4,7 @@ inputs: kind-version: description: "kind version" required: false - default: 'v0.14.0' + default: 'v0.20.0' kind-image: description: "kind version" required: false diff --git a/pkg/controllers/monitoring/thanos-querier/components.go b/pkg/controllers/monitoring/thanos-querier/components.go index c28c22bf..67b11fb9 100644 --- a/pkg/controllers/monitoring/thanos-querier/components.go +++ b/pkg/controllers/monitoring/thanos-querier/components.go @@ -26,8 +26,6 @@ func thanosComponentReconcilers(thanos *msoapi.ThanosQuerier, sidecarUrls []stri func newThanosQuerierDeployment(name string, spec *msoapi.ThanosQuerier, sidecarUrls []string, thanosCfg ThanosConfiguration) *appsv1.Deployment { args := []string{ "query", - "--grpc-address=127.0.0.1:10901", - "--http-address=127.0.0.1:9090", "--log.format=logfmt", "--query.replica-label=prometheus_replica", "--query.auto-downsampling", @@ -71,7 +69,7 @@ func newThanosQuerierDeployment(name string, spec *msoapi.ThanosQuerier, sidecar Image: thanosCfg.Image, Ports: []corev1.ContainerPort{ { - ContainerPort: 9090, + ContainerPort: 10902, Name: "metrics", }, }, @@ -116,7 +114,7 @@ func newService(name string, namespace string) *corev1.Service { Spec: corev1.ServiceSpec{ Ports: []corev1.ServicePort{ { - Port: 9090, + Port: 10902, Name: "http", }, }, diff --git a/test/e2e/framework/assertions.go b/test/e2e/framework/assertions.go index 9072696a..e6352ee9 100644 --- a/test/e2e/framework/assertions.go +++ b/test/e2e/framework/assertions.go @@ -48,7 +48,8 @@ func WithPollInterval(d time.Duration) OptionFn { } } -// AssertResourceNeverExists asserts that a statefulset is never created for the duration of DefaultTestTimeout +// AssertResourceNeverExists asserts that a resource is never created for the +// duration of the timeout func (f *Framework) AssertResourceNeverExists(name, namespace string, resource client.Object, fns ...OptionFn) func(t *testing.T) { option := AssertOption{ PollInterval: 5 * time.Second, @@ -60,7 +61,7 @@ func (f *Framework) AssertResourceNeverExists(name, namespace string, resource c return func(t *testing.T) { //nolint - if err := wait.Poll(option.PollInterval, option.WaitTimeout, func() (bool, error) { + if err := wait.PollUntilContextTimeout(context.Background(), option.PollInterval, option.WaitTimeout, true, func(ctx context.Context) (bool, error) { key := types.NamespacedName{ Name: name, Namespace: namespace, @@ -76,6 +77,35 @@ func (f *Framework) AssertResourceNeverExists(name, namespace string, resource c } } +// AssertResourceAbsent asserts that a resource is not present or, if present, is deleted +// within the timeout +func (f *Framework) AssertResourceAbsent(name, namespace string, resource client.Object, fns ...OptionFn) func(t *testing.T) { + option := AssertOption{ + PollInterval: 5 * time.Second, + WaitTimeout: DefaultTestTimeout, + } + for _, fn := range fns { + fn(&option) + } + + return func(t *testing.T) { + //nolint + if err := wait.PollUntilContextTimeout(context.Background(), option.PollInterval, option.WaitTimeout, true, func(ctx context.Context) (bool, error) { + key := types.NamespacedName{ + Name: name, + Namespace: namespace, + } + if err := f.K8sClient.Get(context.Background(), key, resource); errors.IsNotFound(err) { + return true, nil + } + + return false, fmt.Errorf("resource %s/%s should not be present", namespace, name) + }); wait.Interrupted(err) { + t.Fatal(err) + } + } +} + // AssertResourceEventuallyExists asserts that a resource is created duration a time period of customForeverTestTimeout func (f *Framework) AssertResourceEventuallyExists(name, namespace string, resource client.Object, fns ...OptionFn) func(t *testing.T) { option := AssertOption{ @@ -113,8 +143,7 @@ func (f *Framework) AssertStatefulsetReady(name, namespace string, fns ...Option } return func(t *testing.T) { key := types.NamespacedName{Name: name, Namespace: namespace} - //nolint - if err := wait.Poll(5*time.Second, option.WaitTimeout, func() (bool, error) { + if err := wait.PollUntilContextTimeout(context.Background(), option.PollInterval, option.WaitTimeout, true, func(ctx context.Context) (bool, error) { pod := &appsv1.StatefulSet{} err := f.K8sClient.Get(context.Background(), key, pod) return err == nil && pod.Status.ReadyReplicas == *pod.Spec.Replicas, nil @@ -125,8 +154,11 @@ func (f *Framework) AssertStatefulsetReady(name, namespace string, fns ...Option } func (f *Framework) GetResourceWithRetry(t *testing.T, name, namespace string, obj client.Object) { - //nolint - err := wait.Poll(5*time.Second, DefaultTestTimeout, func() (bool, error) { + option := AssertOption{ + PollInterval: 5 * time.Second, + WaitTimeout: DefaultTestTimeout, + } + err := wait.PollUntilContextTimeout(context.Background(), option.PollInterval, option.WaitTimeout, true, func(ctx context.Context) (bool, error) { key := types.NamespacedName{Name: name, Namespace: namespace} if err := f.K8sClient.Get(context.Background(), key, obj); errors.IsNotFound(err) { @@ -238,7 +270,7 @@ func (f *Framework) GetStackWhenAvailable(t *testing.T, name, namespace string) } var lastErr error - err := wait.PollUntilContextTimeout(context.Background(), 5*time.Second, DefaultTestTimeout+10*time.Second, true, func(ctx context.Context) (bool, error) { + err := wait.PollUntilContextTimeout(context.Background(), 5*time.Second, DefaultTestTimeout*2, true, func(ctx context.Context) (bool, error) { lastErr = nil err := f.K8sClient.Get(context.Background(), key, &ms) if err != nil { diff --git a/test/e2e/framework/framework.go b/test/e2e/framework/framework.go index c8a5ec8e..eb02007e 100644 --- a/test/e2e/framework/framework.go +++ b/test/e2e/framework/framework.go @@ -9,7 +9,8 @@ import ( "strings" "testing" - policyv1beta1 "k8s.io/api/policy/v1beta1" + "github.com/pkg/errors" + policyv1 "k8s.io/api/policy/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/client-go/kubernetes" @@ -37,7 +38,7 @@ type Framework struct { func (f *Framework) StartPortForward(podName string, ns string, port string, stopChan chan struct{}) error { roundTripper, upgrader, err := spdy.RoundTripperFor(f.Config) if err != nil { - return err + return errors.Wrap(err, "error creating RoundTripper") } path := fmt.Sprintf("/api/v1/namespaces/%s/pods/%s/portforward", ns, podName) @@ -49,7 +50,7 @@ func (f *Framework) StartPortForward(podName string, ns string, port string, sto out, errOut := new(bytes.Buffer), new(bytes.Buffer) forwarder, err := portforward.New(dialer, []string{port}, stopChan, readyChan, out, errOut) if err != nil { - return err + return errors.Wrap(err, "failed to create portforward") } go func() { @@ -131,9 +132,9 @@ func (f *Framework) Evict(pod *corev1.Pod, gracePeriodSeconds int64) error { GracePeriodSeconds: &gracePeriodSeconds, } - eviction := &policyv1beta1.Eviction{ + eviction := &policyv1.Eviction{ TypeMeta: metav1.TypeMeta{ - APIVersion: policyv1beta1.SchemeGroupVersion.String(), + APIVersion: policyv1.SchemeGroupVersion.String(), Kind: "Eviction", }, ObjectMeta: metav1.ObjectMeta{ @@ -147,7 +148,7 @@ func (f *Framework) Evict(pod *corev1.Pod, gracePeriodSeconds int64) error { if err != nil { return err } - return c.PolicyV1beta1().Evictions(pod.Namespace).Evict(context.Background(), eviction) + return c.PolicyV1().Evictions(pod.Namespace).Evict(context.Background(), eviction) } func (f *Framework) CleanUp(t *testing.T, cleanupFunc func()) { diff --git a/test/e2e/monitoring_stack_controller_test.go b/test/e2e/monitoring_stack_controller_test.go index 0418143a..6857666e 100644 --- a/test/e2e/monitoring_stack_controller_test.go +++ b/test/e2e/monitoring_stack_controller_test.go @@ -98,6 +98,7 @@ func TestMonitoringStackController(t *testing.T) { if err != nil { t.Fatal(err) } + assertPDBExpectedPodsAreHealthy(t, stackName+"-alertmanager", e2eTestNamespace) assertAlertmanagersAreOnDifferentNodes(t, pods) assertAlertmanagersAreResilientToDisruption(t, pods) }, @@ -396,7 +397,7 @@ func singlePrometheusReplicaHasNoPDB(t *testing.T) { assert.NilError(t, err, "failed to update monitoring stack") // ensure there is no pdb - f.AssertResourceNeverExists(pdbName, ms.Namespace, &pdb)(t) + f.AssertResourceAbsent(pdbName, ms.Namespace, &pdb)(t) } func assertPrometheusScrapesItself(t *testing.T) { @@ -508,6 +509,21 @@ func assertAlertmanagersAreResilientToDisruption(t *testing.T, pods []corev1.Pod } } +func assertPDBExpectedPodsAreHealthy(t *testing.T, name, namespace string) { + if err := wait.PollUntilContextTimeout(context.Background(), 5*time.Second, framework.DefaultTestTimeout, true, func(ctx context.Context) (bool, error) { + pdb := &policyv1.PodDisruptionBudget{} + key := types.NamespacedName{Name: name, Namespace: namespace} + err := f.K8sClient.Get(context.Background(), key, pdb) + if err != nil { + return false, nil + } + return pdb.Status.CurrentHealthy == pdb.Status.ExpectedPods, nil + + }); err != nil { + t.Fatal(err) + } +} + func assertAlertmanagerReceivesAlerts(t *testing.T) { ms := newMonitoringStack(t, "alerting") if err := f.K8sClient.Create(context.Background(), ms); err != nil { @@ -838,11 +854,12 @@ func namespaceSelectorTest(t *testing.T) { err := deployDemoApp(t, ns, nsLabels, resourceLabels) assert.NilError(t, err, "%s: deploying demo app failed", ns) } + f.AssertStatefulsetReady("prometheus-"+stackName, e2eTestNamespace, framework.WithTimeout(5*time.Minute))(t) stopChan := make(chan struct{}) defer close(stopChan) //nolint - if pollErr := wait.Poll(15*time.Second, framework.DefaultTestTimeout, func() (bool, error) { + if pollErr := wait.PollUntilContextTimeout(context.Background(), 5*time.Second, framework.DefaultTestTimeout*2, true, func(ctx context.Context) (bool, error) { err := f.StartServicePortForward(ms.Name+"-prometheus", e2eTestNamespace, "9090", stopChan) return err == nil, nil }); pollErr != nil { diff --git a/test/e2e/thanos_querier_controller_test.go b/test/e2e/thanos_querier_controller_test.go index 4b25bce8..569dae09 100644 --- a/test/e2e/thanos_querier_controller_test.go +++ b/test/e2e/thanos_querier_controller_test.go @@ -76,16 +76,17 @@ func singleStackWithSidecar(t *testing.T) { stopChan := make(chan struct{}) defer close(stopChan) if err := wait.PollUntilContextTimeout(context.Background(), 5*time.Second, 2*time.Minute, true, func(ctx context.Context) (bool, error) { - err = f.StartServicePortForward(name, e2eTestNamespace, "9090", stopChan) + err = f.StartServicePortForward(name, e2eTestNamespace, "10902", stopChan) return err == nil, nil }); wait.Interrupted(err) { - t.Fatal(err) + t.Fatal("timeout waiting for port-forward") } - promClient := framework.NewPrometheusClient("http://localhost:9090") + promClient := framework.NewPrometheusClient("http://localhost:10902") expectedResults := map[string]int{ "prometheus_build_info": 2, // must return from both prometheus pods } + var lastErr error if err := wait.PollUntilContextTimeout(context.Background(), 5*time.Second, 5*time.Minute, true, func(ctx context.Context) (bool, error) { correct := 0 for query, value := range expectedResults { @@ -99,8 +100,8 @@ func singleStackWithSidecar(t *testing.T) { } if len(result.Data.Result) > value { - resultErr := fmt.Errorf("invalid result for query %s, got %d, want %d", query, len(result.Data.Result), value) - return true, resultErr + lastErr = fmt.Errorf("invalid result for query %s, got %d, want %d", query, len(result.Data.Result), value) + return true, lastErr } if len(result.Data.Result) != value { @@ -112,7 +113,7 @@ func singleStackWithSidecar(t *testing.T) { return correct == len(expectedResults), nil }); wait.Interrupted(err) { - t.Fatal(err) + t.Fatal(fmt.Errorf("querying thanos did not yield expected results: %w", lastErr)) } } diff --git a/test/run-e2e.sh b/test/run-e2e.sh index dd5ee113..42d8b38d 100755 --- a/test/run-e2e.sh +++ b/test/run-e2e.sh @@ -22,6 +22,7 @@ declare SHOW_USAGE=false declare LOGS_DIR="tmp/e2e" declare OPERATORS_NS="operators" declare TEST_TIMEOUT="15m" +declare RUN_REGEX="" cleanup() { info "Cleaning up ..." @@ -154,7 +155,7 @@ run_e2e() { watch_obo_errors "$obo_error_log" & local ret=0 - go test -v -failfast -timeout $TEST_TIMEOUT ./test/e2e/... --retain=true | tee "$LOGS_DIR/e2e.log" || ret=1 + go test -v -failfast -timeout $TEST_TIMEOUT ./test/e2e/... -run "$RUN_REGEX" -count 1 | tee "$LOGS_DIR/e2e.log" || ret=1 # terminte both log_events { jobs -p | xargs -I {} -- pkill -TERM -P {}; } || true @@ -203,6 +204,11 @@ parse_args() { OPERATORS_NS=$1 shift ;; + --run) + shift + RUN_REGEX=$1 + shift + ;; *) return 1 ;; # show usage on everything else esac done @@ -227,7 +233,9 @@ print_usage() { --no-builds skip building operator images, useful when operator image is already built and pushed --ns NAMESPACE namespace to deploy operators (default: $OPERATORS_NS) - For running against openshift use --ns openshift-operators + for running against openshift use --ns openshift-operators + --run REGEX regex to limit which tests are run. See go help testflag -run entry + for details EOF_HELP @@ -337,6 +345,7 @@ print_config() { Skip Deploy: $NO_DEPLOY Operator namespace: $OPERATORS_NS Logs directory: $LOGS_DIR + Run regex: $RUN_REGEX EOF line 50