From 3bdc3ef50956ed1d14b00242b27d0a538eb955a4 Mon Sep 17 00:00:00 2001
From: Lini Kurien
Date: Wed, 11 Dec 2024 18:16:57 +0530
Subject: [PATCH] ARO-13380 - metrics: cwp status

---
 pkg/monitor/cluster/cluster.go                |  29 ++-
 pkg/monitor/cluster/clusterwideproxystatus.go | 238 ++++++++++++++++++
 .../cluster/clusterwideproxystatus_test.go    | 102 ++++++++
 pkg/monitor/worker.go                         |   2 +-
 test/e2e/monitor.go                           |   2 +-
 5 files changed, 363 insertions(+), 10 deletions(-)
 create mode 100644 pkg/monitor/cluster/clusterwideproxystatus.go
 create mode 100644 pkg/monitor/cluster/clusterwideproxystatus_test.go

diff --git a/pkg/monitor/cluster/cluster.go b/pkg/monitor/cluster/cluster.go
index f8cb39d2780..0fa2b23a463 100644
--- a/pkg/monitor/cluster/cluster.go
+++ b/pkg/monitor/cluster/cluster.go
@@ -12,6 +12,7 @@ import (
 	configv1 "github.com/openshift/api/config/v1"
 	configclient "github.com/openshift/client-go/config/clientset/versioned"
 	machineclient "github.com/openshift/client-go/machine/clientset/versioned"
+	operatorclient "github.com/openshift/client-go/operator/clientset/versioned"
 	mcoclient "github.com/openshift/machine-config-operator/pkg/generated/clientset/versioned"
 	"github.com/sirupsen/logrus"
 	appsv1 "k8s.io/api/apps/v1"
@@ -22,6 +23,7 @@ import (
 	"sigs.k8s.io/controller-runtime/pkg/client/apiutil"
 
 	"github.com/Azure/ARO-RP/pkg/api"
+	"github.com/Azure/ARO-RP/pkg/env"
 	"github.com/Azure/ARO-RP/pkg/hive"
 	"github.com/Azure/ARO-RP/pkg/metrics"
 	"github.com/Azure/ARO-RP/pkg/monitor/dimension"
@@ -41,13 +43,16 @@ type Monitor struct {
 	oc   *api.OpenShiftCluster
 	dims map[string]string
 
-	restconfig *rest.Config
-	cli        kubernetes.Interface
-	configcli  configclient.Interface
-	maocli     machineclient.Interface
-	mcocli     mcoclient.Interface
-	m          metrics.Emitter
-	arocli     aroclient.Interface
+	restconfig  *rest.Config
+	cli         kubernetes.Interface
+	configcli   configclient.Interface
+	operatorcli operatorclient.Interface
+	maocli      machineclient.Interface
+	mcocli      mcoclient.Interface
+	m           metrics.Emitter
+	arocli      aroclient.Interface
+	env         env.Interface
+	tenantID    string
 
 	ocpclientset  client.Client
 	hiveclientset client.Client
@@ -66,7 +71,7 @@ type Monitor struct {
 	doc *api.OpenShiftClusterDocument
 }
 
-func NewMonitor(log *logrus.Entry, restConfig *rest.Config, oc *api.OpenShiftCluster, doc *api.OpenShiftClusterDocument, m metrics.Emitter, hiveRestConfig *rest.Config, hourlyRun bool, wg *sync.WaitGroup, hiveClusterManager hive.ClusterManager) (*Monitor, error) {
+func NewMonitor(log *logrus.Entry, restConfig *rest.Config, oc *api.OpenShiftCluster, doc *api.OpenShiftClusterDocument, env env.Interface, tenantID string, m metrics.Emitter, hiveRestConfig *rest.Config, hourlyRun bool, wg *sync.WaitGroup, hiveClusterManager hive.ClusterManager) (*Monitor, error) {
 	r, err := azure.ParseResourceID(oc.ID)
 	if err != nil {
 		return nil, err
@@ -103,6 +108,10 @@ func NewMonitor(log *logrus.Entry, restConfig *rest.Config, oc *api.OpenShiftClu
 	if err != nil {
 		return nil, err
 	}
+	operatorcli, err := operatorclient.NewForConfig(restConfig)
+	if err != nil {
+		return nil, err
+	}
 
 	// lazy discovery will not attempt to reach out to the apiserver immediately
 	mapper, err := apiutil.NewDynamicRESTMapper(restConfig, apiutil.WithLazyDiscovery)
@@ -132,9 +141,12 @@ func NewMonitor(log *logrus.Entry, restConfig *rest.Config, oc *api.OpenShiftClu
 		restconfig:    restConfig,
 		cli:           cli,
 		configcli:     configcli,
+		operatorcli:   operatorcli,
 		maocli:        maocli,
 		mcocli:        mcocli,
 		arocli:        arocli,
+		env:           env,
+		tenantID:      tenantID,
 		m:             m,
 		ocpclientset:  ocpclientset,
 		hiveclientset: hiveclientset,
@@ -219,6 +231,7 @@ func (mon *Monitor) Monitor(ctx context.Context) (errs []error) {
 		mon.emitCertificateExpirationStatuses,
 		mon.emitEtcdCertificateExpiry,
 		mon.emitPrometheusAlerts, // at the end for now because it's the slowest/least reliable
+		mon.emitCWPStatus,
 	} {
 		err = f(ctx)
 		if err != nil {
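The new emitter that follows boils down to a set-membership check over the comma-separated no_proxy value. A simplified standalone sketch of that core check (the helper name and main function are illustrative only, not part of the patch):

	package main

	import (
		"fmt"
		"strings"
	)

	// missingNoProxyEntries reports which required entries are absent from a
	// comma-separated no_proxy string. Hypothetical helper for illustration;
	// the patch below inlines this logic in emitCWPStatus.
	func missingNoProxyEntries(noProxy string, required []string) []string {
		seen := make(map[string]bool)
		for _, entry := range strings.Split(noProxy, ",") {
			seen[entry] = true
		}

		var missing []string
		for _, want := range required {
			if !seen[want] {
				missing = append(missing, want)
			}
		}
		return missing
	}

	func main() {
		required := []string{"localhost", "127.0.0.1", ".svc", ".cluster.local", "168.63.129.16"}
		fmt.Println(missingNoProxyEntries("localhost,.svc,.cluster.local", required))
		// Output: [127.0.0.1 168.63.129.16]
	}
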
diff --git a/pkg/monitor/cluster/clusterwideproxystatus.go b/pkg/monitor/cluster/clusterwideproxystatus.go
new file mode 100644
index 00000000000..8eadb1a1d26
--- /dev/null
+++ b/pkg/monitor/cluster/clusterwideproxystatus.go
@@ -0,0 +1,238 @@
+package cluster
+
+// Copyright (c) Microsoft Corporation.
+// Licensed under the Apache License 2.0.
+
+import (
+	"context"
+	"net/url"
+	"strconv"
+	"strings"
+
+	"github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/network/armnetwork/v2"
+	"github.com/Azure/go-autorest/autorest/azure"
+	"github.com/sirupsen/logrus"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+
+	apisubnet "github.com/Azure/ARO-RP/pkg/api/util/subnet"
+	arov1alpha1 "github.com/Azure/ARO-RP/pkg/operator/apis/aro.openshift.io/v1alpha1"
+)
+
+const (
+	cwp                  = "clusterWideProxy.status"
+	cwpErrorMessage      = "NoProxy entries are incorrect"
+	cluster              = "cluster"
+	mandatory_no_proxies = "localhost,127.0.0.1,.svc,.cluster.local,168.63.129.16"
+	imdsIP               = "169.254.169.254"
+	// Required no_proxy entries:
+	//   169.254.169.254 (the IMDS IP)
+	//   168.63.129.16 (Azure DNS, if no custom DNS exists)
+	//   localhost, 127.0.0.1, .svc, .cluster.local
+)
+
+// emitCWPStatus emits a gauge describing whether the cluster-wide proxy is
+// enabled and, if so, whether its no_proxy list contains every required entry.
+func (mon *Monitor) emitCWPStatus(ctx context.Context) error {
+	proxyConfig, err := mon.configcli.ConfigV1().Proxies().Get(ctx, cluster, metav1.GetOptions{})
+	if err != nil {
+		mon.log.Errorf("Error in getting the cluster wide proxy: %v", err)
+		return err
+	}
+	if proxyConfig.Spec.HTTPProxy == "" && proxyConfig.Spec.HTTPSProxy == "" && proxyConfig.Spec.NoProxy == "" {
+		mon.emitGauge(cwp, 1, map[string]string{
+			"status":  strconv.FormatBool(false),
+			"Message": "CWP Not Enabled",
+		})
+	} else {
+		// Create the noProxy map for efficient lookups
+		no_proxy_list := strings.Split(proxyConfig.Spec.NoProxy, ",")
+		noProxyMap := make(map[string]bool)
+		var missing_no_proxy_list []string
+		for _, proxy := range no_proxy_list {
+			noProxyMap[proxy] = true
+		}
+
+		// Check mandatory no_proxy entries
+		for _, mandatory_no_proxy := range strings.Split(mandatory_no_proxies, ",") {
+			if !noProxyMap[mandatory_no_proxy] {
+				missing_no_proxy_list = append(missing_no_proxy_list, mandatory_no_proxy)
+			}
+		}
+		// The IMDS IP is only required when the cluster has no custom DNS servers.
+		if !noProxyMap[imdsIP] {
+			dnsConfigcluster, err := mon.operatorcli.OperatorV1().DNSes().Get(ctx, "default", metav1.GetOptions{})
+			if err != nil {
+				mon.log.Errorf("Error in getting DNS configuration: %v", err)
+				return err
+			}
+			if len(dnsConfigcluster.Spec.Servers) == 0 {
+				missing_no_proxy_list = append(missing_no_proxy_list, imdsIP)
+			}
+		}
+
+		mastersubnetID, err := azure.ParseResourceID(mon.oc.Properties.MasterProfile.SubnetID)
+		if err != nil {
+			mon.log.Errorf("failed to parse the mastersubnetID: %v", err)
+			return err
+		}
+		token, err := mon.env.FPNewClientCertificateCredential(mon.tenantID, nil)
+		if err != nil {
+			mon.log.Errorf("failed to obtain FP Client Credentials: %v", err)
+			return err
+		}
+
+		// Create client factory
+		clientFactory, err := armnetwork.NewClientFactory(mastersubnetID.SubscriptionID, token, nil)
+		if err != nil {
+			mon.log.Errorf("failed to create client: %v", err)
+			return err
+		}
+
+		// Check master subnet
+		masterVnetID, _, err := apisubnet.Split(mon.oc.Properties.MasterProfile.SubnetID)
+		if err != nil {
+			mon.log.Errorf("failed to get the masterVnetID: %v", err)
+			return err
+		}
+		mastervnetId, err := azure.ParseResourceID(masterVnetID)
+		if err != nil {
+			mon.log.Errorf("failed to parse the masterVnetID: %v", err)
+			return err
+		}
+		res, err := clientFactory.NewSubnetsClient().Get(ctx, mastersubnetID.ResourceGroup, mastervnetId.ResourceName, mastersubnetID.ResourceName, &armnetwork.SubnetsClientGetOptions{Expand: nil})
+		if err != nil {
+			mon.log.Errorf("failed to finish the NewSubnetsClient request: %v", err)
+			return err
+		}
+
+		if res.Properties.AddressPrefix != nil {
+			if !noProxyMap[*res.Properties.AddressPrefix] {
+				missing_no_proxy_list = append(missing_no_proxy_list, *res.Properties.AddressPrefix)
+			}
+		}
+
+		// Check worker profiles
+		for _, workerProfile := range mon.oc.Properties.WorkerProfiles {
+			workersubnetID, err := azure.ParseResourceID(workerProfile.SubnetID)
+			if err != nil {
+				mon.log.Errorf("failed to parse the workersubnetID: %v", err)
+				return err
+			}
+			workerVnetID, _, err := apisubnet.Split(workerProfile.SubnetID)
+			if err != nil {
+				mon.log.Errorf("failed to fetch the workerVnetID: %v", err)
+				return err
+			}
+			workervnetId, err := azure.ParseResourceID(workerVnetID)
+			if err != nil {
+				mon.log.Errorf("failed to parse the workerVnetID: %v", err)
+				return err
+			}
+			workerres, err := clientFactory.NewSubnetsClient().Get(ctx, workersubnetID.ResourceGroup, workervnetId.ResourceName, workersubnetID.ResourceName, &armnetwork.SubnetsClientGetOptions{Expand: nil})
+			if err != nil {
+				mon.log.Errorf("failed to finish the request: %v", err)
+				return err
+			}
+			if workerres.Properties.AddressPrefix != nil {
+				workermachinesCIDR := *workerres.Properties.AddressPrefix
+				if !noProxyMap[workermachinesCIDR] {
+					missing_no_proxy_list = append(missing_no_proxy_list, workermachinesCIDR)
+				}
+			}
+		}
+
+		// Network Configuration Check
+		networkConfig, err := mon.configcli.ConfigV1().Networks().Get(ctx, cluster, metav1.GetOptions{})
+		if err != nil {
+			mon.log.Errorf("Error in getting network info: %v", err)
+			return err
+		}
+		for _, network := range networkConfig.Spec.ClusterNetwork {
+			if !noProxyMap[network.CIDR] {
+				missing_no_proxy_list = append(missing_no_proxy_list, network.CIDR)
+			}
+		}
+		for _, network := range networkConfig.Spec.ServiceNetwork {
+			if !noProxyMap[network] {
+				missing_no_proxy_list = append(missing_no_proxy_list, network)
+			}
+		}
+
+		// Gateway Domains Check
+		clusterdetails, err := mon.arocli.AroV1alpha1().Clusters().Get(ctx, arov1alpha1.SingletonClusterName, metav1.GetOptions{})
+		if err != nil {
+			mon.log.Errorf("Error in getting cluster information: %v", err)
+			return err
+		}
+		clusterDomain := clusterdetails.Spec.Domain
+		if !noProxyMap[clusterDomain] {
+			missing_no_proxy_list = append(missing_no_proxy_list, clusterDomain)
+		}
+		for _, gatewayDomain := range clusterdetails.Spec.GatewayDomains {
+			gatewayDomain = strings.ToLower(gatewayDomain)
+			if !noProxyMap[gatewayDomain] {
+				missing_no_proxy_list = append(missing_no_proxy_list, gatewayDomain)
+			}
+		}
+
+		// Infrastructure Configuration Check
+		infraConfig, err := mon.configcli.ConfigV1().Infrastructures().Get(ctx, cluster, metav1.GetOptions{})
+		if err != nil {
+			mon.log.Errorf("Error in getting Infrastructure info: %v", err)
+			return err
+		}
+
+		// APIServerInternal URL Check
+		apiServerIntURL, err := url.Parse(infraConfig.Status.APIServerInternalURL)
+		if err != nil {
+			mon.log.Errorf("Error in parsing APIServerInternalURL: %v", err)
+			return err
+		}
+		apiServerIntdomain := strings.Split(apiServerIntURL.Host, ":")[0]
+		if !noProxyMap[apiServerIntdomain] {
+			missing_no_proxy_list = append(missing_no_proxy_list, apiServerIntdomain)
+		}
+
+		// APIServerProfile URL Check
+		apiServerProfileURL, err := url.Parse(mon.oc.Properties.APIServerProfile.URL)
+		if err != nil {
+			mon.log.Errorf("Error in parsing APIServerProfile: %v", err)
+			return err
+		}
+		apiServerProfiledomain := strings.Split(apiServerProfileURL.Host, ":")[0]
+		if !noProxyMap[apiServerProfiledomain] {
+			missing_no_proxy_list = append(missing_no_proxy_list, apiServerProfiledomain)
+		}
+
+		// ConsoleProfile URL Check
+		consoleProfileURL, err := url.Parse(mon.oc.Properties.ConsoleProfile.URL)
+		if err != nil {
+			mon.log.Errorf("Error in parsing ConsoleProfile: %v", err)
+			return err
+		}
+		consoleProfiledomain := strings.Split(consoleProfileURL.Host, ":")[0]
+		if !noProxyMap[consoleProfiledomain] {
+			missing_no_proxy_list = append(missing_no_proxy_list, consoleProfiledomain)
+		}
+
+		if len(missing_no_proxy_list) > 0 {
+			status := true
+			message := "CWP enabled but missing " + strings.Join(missing_no_proxy_list, ",") + " in the no_proxy list"
+			mon.emitGauge(cwp, 1, map[string]string{
+				"status":  strconv.FormatBool(status),
+				"Message": message,
+			})
+			mon.log.Info(message)
+			if mon.hourlyRun {
+				mon.log.WithFields(logrus.Fields{
+					"metric":  cwp,
+					"status":  strconv.FormatBool(status),
+					"Message": message,
+				}).Print()
+			}
+		} else {
+			mon.emitGauge(cwp, 1, map[string]string{
+				"status":  strconv.FormatBool(false),
+				"Message": "CWP enabled successfully",
+			})
+			mon.log.Infof("CWP enabled successfully")
+		}
+	}
+
+	return nil
+}
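For a cluster whose no_proxy omits 168.63.129.16, the emitter above produces a single gauge along these lines (an illustration assembled from the message format in emitCWPStatus, not output captured from a real cluster):

	mon.emitGauge("clusterWideProxy.status", 1, map[string]string{
		"status":  "true",
		"Message": "CWP enabled but missing 168.63.129.16 in the no_proxy list",
	})

The unit tests that follow assert only on the metric name and value, leaving the dimensions loose via gomock.Any().
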
diff --git a/pkg/monitor/cluster/clusterwideproxystatus_test.go b/pkg/monitor/cluster/clusterwideproxystatus_test.go
new file mode 100644
index 00000000000..b47f3c0bc7d
--- /dev/null
+++ b/pkg/monitor/cluster/clusterwideproxystatus_test.go
@@ -0,0 +1,102 @@
+package cluster
+
+// Copyright (c) Microsoft Corporation.
+// Licensed under the Apache License 2.0.
+
+import (
+	"context"
+	"sync"
+	"testing"
+
+	configv1 "github.com/openshift/api/config/v1"
+	configfake "github.com/openshift/client-go/config/clientset/versioned/fake"
+	"github.com/sirupsen/logrus"
+	"github.com/stretchr/testify/require"
+	"go.uber.org/mock/gomock"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+
+	mock_metrics "github.com/Azure/ARO-RP/pkg/util/mocks/metrics"
+)
+
+func TestEmitCWPStatus(t *testing.T) {
+	ctrl := gomock.NewController(t)
+	defer ctrl.Finish()
+
+	mockMetrics := mock_metrics.NewMockEmitter(ctrl)
+	fakeConfigClient := configfake.NewSimpleClientset()
+
+	mon := &Monitor{
+		configcli: fakeConfigClient,
+		m:         mockMetrics,
+		log:       logrus.NewEntry(logrus.New()),
+		wg:        &sync.WaitGroup{},
+	}
+
+	tests := []struct {
+		name          string
+		proxyConfig   *configv1.Proxy
+		expectErr     bool
+		expectedError string
+		setupMocks    func(*mock_metrics.MockEmitter)
+	}{
+		{
+			name: "no proxy configured",
+			proxyConfig: &configv1.Proxy{
+				ObjectMeta: metav1.ObjectMeta{Name: "cluster"},
+				Spec:       configv1.ProxySpec{},
+			},
+			expectErr:     false,
+			expectedError: "",
+			setupMocks: func(m *mock_metrics.MockEmitter) {
+				m.EXPECT().
+					EmitGauge("clusterWideProxy.status", int64(1), gomock.Any()).
+					Times(1)
+			},
+		},
+		{
+			name: "missing mandatory no_proxy entries",
+			proxyConfig: &configv1.Proxy{
+				ObjectMeta: metav1.ObjectMeta{Name: "cluster"},
+				Spec: configv1.ProxySpec{
+					NoProxy: "localhost,.svc,.cluster.local",
+				},
+			},
+			expectErr:     false,
+			expectedError: "",
+			setupMocks: func(m *mock_metrics.MockEmitter) {
+				m.EXPECT().
+					EmitGauge("clusterWideProxy.status", int64(1), gomock.Any()).
+					Times(1)
+			},
+		},
+		{
+			name:          "error fetching proxy configuration",
+			proxyConfig:   &configv1.Proxy{},
+			expectErr:     false,
+			expectedError: "",
+			setupMocks: func(m *mock_metrics.MockEmitter) {
+				m.EXPECT().
+					EmitGauge("clusterWideProxy.status", int64(1), gomock.Any()).
+					Times(1)
+			},
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			if tt.proxyConfig != nil {
+				_, _ = fakeConfigClient.ConfigV1().Proxies().Create(context.Background(), tt.proxyConfig, metav1.CreateOptions{})
+			}
+
+			tt.setupMocks(mockMetrics)
+
+			err := mon.emitCWPStatus(context.Background())
+
+			if tt.expectErr {
+				require.Error(t, err)
+				require.Contains(t, err.Error(), tt.expectedError)
+			} else {
+				require.NoError(t, err)
+			}
+		})
+	}
+}
diff --git a/pkg/monitor/worker.go b/pkg/monitor/worker.go
index d589247d945..aa3df08b43f 100644
--- a/pkg/monitor/worker.go
+++ b/pkg/monitor/worker.go
@@ -292,7 +292,7 @@ func (mon *monitor) workOne(ctx context.Context, log *logrus.Entry, doc *api.Ope
 
 	nsgMon := nsg.NewMonitor(log, doc.OpenShiftCluster, mon.env, sub.ID, sub.Subscription.Properties.TenantID, mon.clusterm, dims, &wg, nsgMonTicker.C)
 
-	c, err := cluster.NewMonitor(log, restConfig, doc.OpenShiftCluster, doc, mon.clusterm, hiveRestConfig, hourlyRun, &wg, hiveClusterManager)
+	c, err := cluster.NewMonitor(log, restConfig, doc.OpenShiftCluster, doc, mon.env, sub.Subscription.Properties.TenantID, mon.clusterm, hiveRestConfig, hourlyRun, &wg, hiveClusterManager)
 	if err != nil {
 		log.Error(err)
 		mon.m.EmitGauge("monitor.cluster.failedworker", 1, map[string]string{
diff --git a/test/e2e/monitor.go b/test/e2e/monitor.go
index 954be14cc44..9aa9d1cb636 100644
--- a/test/e2e/monitor.go
+++ b/test/e2e/monitor.go
@@ -25,7 +25,7 @@ var _ = Describe("Monitor", func() {
 			ID: resourceIDFromEnv(),
 		}, &api.OpenShiftClusterDocument{
 			ID: resourceIDFromEnv(),
-		}, &noop.Noop{}, nil, true, &wg, nil)
+		}, nil, "", &noop.Noop{}, nil, true, &wg, nil)
 		Expect(err).NotTo(HaveOccurred())
 
 		By("running the monitor once")