diff --git a/.gitignore b/.gitignore index 6ecb9304d9..e55c1f21fd 100644 --- a/.gitignore +++ b/.gitignore @@ -35,3 +35,6 @@ go.work* # scale-test test/scale/generated/* + +# test env file +*.env diff --git a/.pipelines/cni/cilium/nightly-release-test.yml b/.pipelines/cni/cilium/nightly-release-test.yml index ee43ece998..9cb7acc9ae 100644 --- a/.pipelines/cni/cilium/nightly-release-test.yml +++ b/.pipelines/cni/cilium/nightly-release-test.yml @@ -88,6 +88,7 @@ stages: name: "cilium_nightly" testDropgz: "" clusterName: ciliumnightly-$(commitID) + testHubble: true - job: logs displayName: "Failure Logs" dependsOn: diff --git a/.pipelines/singletenancy/cilium-overlay/cilium-overlay-e2e-step-template.yaml b/.pipelines/singletenancy/cilium-overlay/cilium-overlay-e2e-step-template.yaml index 55601e1131..88346909fb 100644 --- a/.pipelines/singletenancy/cilium-overlay/cilium-overlay-e2e-step-template.yaml +++ b/.pipelines/singletenancy/cilium-overlay/cilium-overlay-e2e-step-template.yaml @@ -2,6 +2,7 @@ parameters: name: "" testDropgz: "" clusterName: "" + testHubble: false steps: - bash: | @@ -153,6 +154,21 @@ steps: name: "ciliumConnectivityTests" displayName: "Run Cilium Connectivity Tests" + - ${{ if eq( parameters['testHubble'], true) }}: + - script: | + echo "enable Hubble metrics server" + kubectl apply -f test/integration/manifests/cilium/hubble/hubble-peer-svc.yaml + kubectl apply -f test/integration/manifests/cilium/cilium-config-hubble.yaml + kubectl rollout restart ds cilium -n kube-system + echo "wait <3 minutes for pods to be ready after restart" + kubectl rollout status ds cilium -n kube-system --timeout=3m + kubectl get pods -Aowide + echo "verify Hubble metrics endpoint is usable" + go test ./test/integration/networkobservability -count=1 -v -tags=networkobservability + retryCountOnTaskFailure: 3 + name: "HubbleConnectivityTests" + displayName: "Run Hubble Connectivity Tests" + - script: | echo "validate pod IP assignment and check systemd-networkd 
restart" kubectl get pod -owide -A diff --git a/hack/toolbox/server/Dockerfile.heavy b/hack/toolbox/server/Dockerfile.heavy index ee9aea25d2..6839f45794 100644 --- a/hack/toolbox/server/Dockerfile.heavy +++ b/hack/toolbox/server/Dockerfile.heavy @@ -3,7 +3,7 @@ ADD ./ / WORKDIR / RUN CGO_ENABLED=0 GOOS=linux go build -o server . -FROM mcr.microsoft.com/oss/mirror/docker.io/library/ubuntu:20.04 +FROM mcr.microsoft.com/mirror/docker/library/ubuntu:22.04 RUN apt-get update RUN apt-get install -y \ axel \ @@ -21,12 +21,12 @@ RUN apt-get install -y \ net-tools \ netcat \ nmap \ - python \ python3 \ ssh \ sudo \ tcpdump \ traceroute \ + unzip \ vim \ wget diff --git a/test/integration/manifests/cilium/cilium-config-hubble.yaml b/test/integration/manifests/cilium/cilium-config-hubble.yaml new file mode 100644 index 0000000000..c137aa23b4 --- /dev/null +++ b/test/integration/manifests/cilium/cilium-config-hubble.yaml @@ -0,0 +1,98 @@ +apiVersion: v1 +data: + agent-not-ready-taint-key: node.cilium.io/agent-not-ready + arping-refresh-period: 30s + auto-direct-node-routes: "false" + bpf-lb-external-clusterip: "false" + bpf-lb-map-max: "65536" + bpf-lb-mode: snat + bpf-map-dynamic-size-ratio: "0.0025" + bpf-policy-map-max: "16384" + bpf-root: /sys/fs/bpf + cgroup-root: /run/cilium/cgroupv2 + cilium-endpoint-gc-interval: 5m0s + cluster-id: "0" + cluster-name: default + debug: "false" + disable-cnp-status-updates: "true" + disable-endpoint-crd: "false" + enable-auto-protect-node-port-range: "true" + enable-bgp-control-plane: "false" + enable-bpf-clock-probe: "true" + enable-endpoint-health-checking: "false" + enable-endpoint-routes: "true" + enable-health-check-nodeport: "true" + enable-health-checking: "true" + enable-host-legacy-routing: "true" + enable-hubble: "true" + enable-ipv4: "true" + enable-ipv4-masquerade: "false" + enable-ipv6: "false" + enable-ipv6-masquerade: "false" + enable-k8s-terminating-endpoint: "true" + enable-l2-neigh-discovery: "true" + enable-l7-proxy: 
"false" + enable-local-node-route: "false" + enable-local-redirect-policy: "false" + enable-metrics: "true" + enable-policy: default + enable-remote-node-identity: "true" + enable-session-affinity: "true" + enable-svc-source-range-check: "true" + enable-vtep: "false" + enable-well-known-identities: "false" + enable-xt-socket-fallback: "true" + hubble-metrics: flow:sourceContext=workload-name;destinationContext=workload-name + tcp:sourceContext=workload-name;destinationContext=workload-name + dns:flow:sourceContext=workload-name;destinationContext=workload-name + hubble-metrics-server: :9965 + hubble-disable-tls: "false" + hubble-listen-address: "" + hubble-socket-path: /dev/null + hubble-tls-cert-file: /var/lib/cilium/tls/hubble/server.crt + hubble-tls-client-ca-files: /var/lib/cilium/tls/hubble/client-ca.crt + hubble-tls-key-file: /var/lib/cilium/tls/hubble/server.key + identity-allocation-mode: crd + install-iptables-rules: "true" + install-no-conntrack-iptables-rules: "false" + ipam: delegated-plugin + kube-proxy-replacement: strict + kube-proxy-replacement-healthz-bind-address: "0.0.0.0:10256" + local-router-ipv4: 169.254.23.0 + metrics: +cilium_bpf_map_pressure + monitor-aggregation: medium + monitor-aggregation-flags: all + monitor-aggregation-interval: 5s + node-port-bind-protection: "true" + nodes-gc-interval: 5m0s + operator-api-serve-addr: 127.0.0.1:9234 + operator-prometheus-serve-addr: :9963 + preallocate-bpf-maps: "false" + procfs: /host/proc + prometheus-serve-addr: :9962 + remove-cilium-node-taints: "true" + set-cilium-is-up-condition: "true" + sidecar-istio-proxy-image: cilium/istio_proxy + synchronize-k8s-nodes: "true" + tofqdns-dns-reject-response-code: refused + tofqdns-enable-dns-compression: "true" + tofqdns-endpoint-max-ip-per-hostname: "50" + tofqdns-idle-connection-grace-period: 0s + tofqdns-max-deferred-connection-deletes: "10000" + tofqdns-min-ttl: "3600" + tofqdns-proxy-response-max-delay: 100ms + tunnel: disabled + 
unmanaged-pod-watcher-interval: "15"
+  vtep-cidr: ""
+  vtep-endpoint: ""
+  vtep-mac: ""
+  vtep-mask: ""
+kind: ConfigMap
+metadata:
+  annotations:
+    meta.helm.sh/release-name: cilium
+    meta.helm.sh/release-namespace: kube-system
+  labels:
+    app.kubernetes.io/managed-by: Helm
+  name: cilium-config
+  namespace: kube-system
diff --git a/test/integration/manifests/cilium/hubble/hubble-peer-svc.yaml b/test/integration/manifests/cilium/hubble/hubble-peer-svc.yaml
new file mode 100644
index 0000000000..6ba733885c
--- /dev/null
+++ b/test/integration/manifests/cilium/hubble/hubble-peer-svc.yaml
@@ -0,0 +1,18 @@
+apiVersion: v1
+kind: Service
+metadata:
+  labels:
+    k8s-app: cilium
+  name: hubble-peer
+  namespace: kube-system
+spec:
+  internalTrafficPolicy: Cluster
+  ports:
+  - name: peer-service
+    port: 443
+    protocol: TCP
+    targetPort: 4244
+  selector:
+    k8s-app: cilium
+  sessionAffinity: None
+  type: ClusterIP
diff --git a/test/integration/networkobservability/hubble_test.go b/test/integration/networkobservability/hubble_test.go
new file mode 100644
index 0000000000..4cae750e42
--- /dev/null
+++ b/test/integration/networkobservability/hubble_test.go
@@ -0,0 +1,134 @@
+//go:build networkobservability
+
+package networkobservability
+
+import (
+	"context"
+	"fmt"
+	"io"
+	"net/http"
+	"strings"
+	"testing"
+	"time"
+
+	k8s "github.com/Azure/azure-container-networking/test/integration"
+	"github.com/Azure/azure-container-networking/test/internal/kubernetes"
+	"github.com/Azure/azure-container-networking/test/internal/retry"
+)
+
+const (
+	retryAttempts = 10
+	retryDelay    = 5 * time.Second
+	promAddress   = "http://localhost:9965/metrics"
+	labelSelector = "k8s-app=cilium"
+	namespace     = "kube-system"
+)
+
+var (
+	defaultRetrier  = retry.Retrier{Attempts: retryAttempts, Delay: retryDelay}
+	requiredMetrics = []string{
+		"hubble_flows_processed_total",
+		"hubble_tcp_flags_total",
+	}
+)
+
+// TestEndpoints port-forwards to a cilium pod and verifies that the Hubble
+// Prometheus endpoint serves all required metrics, retrying on failure.
+func TestEndpoints(t *testing.T) {
+	config := kubernetes.MustGetRestConfig()
+	ctx := context.Background()
+	clusterCtx, cancel := context.WithTimeout(ctx, 5*time.Minute)
+	defer cancel()
+	pingCheckFn := func() error {
+		var pf *k8s.PortForwarder
+		pf, err := k8s.NewPortForwarder(config, t, k8s.PortForwardingOpts{
+			Namespace:     namespace,
+			LabelSelector: labelSelector,
+			LocalPort:     9965,
+			DestPort:      9965,
+		})
+		if err != nil {
+			// return instead of t.Error: proceeding with a nil pf would panic
+			return fmt.Errorf("could not create port forwarder: %w", err)
+		}
+		pctx := context.Background()
+
+		portForwardCtx, cancel := context.WithTimeout(pctx, (retryAttempts+1)*retryDelay)
+		defer cancel()
+
+		portForwardFn := func() error {
+			t.Logf("attempting port forward to a pod with label %s, in namespace %s...", labelSelector, namespace)
+			if err = pf.Forward(portForwardCtx); err != nil {
+				return fmt.Errorf("could not start port forward: %w", err)
+			}
+			return nil
+		}
+
+		if err = defaultRetrier.Do(portForwardCtx, portForwardFn); err != nil {
+			t.Fatalf("could not start port forward within %d: %v", (retryAttempts+1)*retryDelay, err)
+		}
+		defer pf.Stop()
+
+		// scrape the hubble metrics
+		metrics, err := getPrometheusMetrics(promAddress)
+		if err != nil {
+			return fmt.Errorf("scraping %s, failed with error: %w", promAddress, err)
+		}
+
+		// verify that the response contains the required metrics
+		for _, reqMetric := range requiredMetrics {
+			if _, exists := metrics[reqMetric]; !exists {
+				return fmt.Errorf("scraping %s, did not find metric %s", promAddress, reqMetric) //nolint:goerr113,gocritic
+			}
+		}
+		t.Logf("all metrics validated: %+v", requiredMetrics)
+		return nil
+	}
+
+	if err := defaultRetrier.Do(clusterCtx, pingCheckFn); err != nil {
+		t.Fatalf("metrics check failed with error: %v", err)
+	}
+}
+
+// getPrometheusMetrics fetches url and returns the set of metric names found.
+func getPrometheusMetrics(url string) (map[string]struct{}, error) {
+	client := http.Client{}
+	resp, err := client.Get(url) //nolint
+	if err != nil {
+		return nil, fmt.Errorf("HTTP request failed: %w", err)
+	}
+	defer resp.Body.Close()
+
+	if resp.StatusCode != http.StatusOK {
+		return nil, fmt.Errorf("HTTP request failed with status: %v", resp.Status) //nolint:goerr113,gocritic
+	}
+
+	metricsData, err := io.ReadAll(resp.Body)
+	if err != nil {
+		return nil, fmt.Errorf("reading HTTP response body failed: %w", err)
+	}
+
+	metrics := parseMetrics(string(metricsData))
+	return metrics, nil
+}
+
+func parseMetrics(metricsData string) map[string]struct{} {
+	// Create a map to store the strings before the first '{'.
+	metrics := make(map[string]struct{})
+
+	// sample metrics
+	// hubble_tcp_flags_total{destination="",family="IPv4",flag="RST",source="kube-system/metrics-server"} 980
+	// hubble_tcp_flags_total{destination="",family="IPv4",flag="SYN",source="kube-system/ama-metrics"} 1777
+	// we only want the metric name for the time being
+	// label order/parsing can happen later
+	lines := strings.Split(metricsData, "\n")
+	// Iterate through each line.
+	for _, line := range lines {
+		// Find the index of the first '{' character.
+		index := strings.Index(line, "{")
+		if index >= 0 {
+			// Extract the string before the first '{'.
+			str := strings.TrimSpace(line[:index])
+			// Store the string in the map.
+			metrics[str] = struct{}{}
+		}
+	}
+
+	return metrics
+}