From 1f0f075feb7896cb3162be03c114a5e66aee6be6 Mon Sep 17 00:00:00 2001 From: Sahil Badla Date: Wed, 21 Apr 2021 12:09:10 -0700 Subject: [PATCH] #2285: rollup CR statistic metrics in v2 (#218) * #2285: rollup CR statistic metrics in v2 Signed-off-by: sbadla1 * #2285: updated metric flags Signed-off-by: sbadla1 * #2285: updated metric flags Signed-off-by: sbadla1 --- controllers/common/metrics.go | 77 ++++++++++++++++-------- controllers/common/metrics_test.go | 22 ++++++- controllers/rollingupgrade_controller.go | 5 ++ controllers/upgrade.go | 3 + controllers/upgrade_test.go | 47 ++++++++++++++- 5 files changed, 128 insertions(+), 26 deletions(-) diff --git a/controllers/common/metrics.go b/controllers/common/metrics.go index 723457dc..272afe82 100644 --- a/controllers/common/metrics.go +++ b/controllers/common/metrics.go @@ -1,38 +1,55 @@ package common import ( - "github.com/keikoproj/upgrade-manager/controllers/common/log" - "github.com/prometheus/client_golang/prometheus" "reflect" - "sigs.k8s.io/controller-runtime/pkg/metrics" "strings" "time" + + "github.com/keikoproj/upgrade-manager/controllers/common/log" + "github.com/prometheus/client_golang/prometheus" + "sigs.k8s.io/controller-runtime/pkg/metrics" ) -//All cluster level node upgrade statistics +var ( + metricNamespace = "upgrade_manager_v2" -var nodeRotationTotal = prometheus.NewHistogram( - prometheus.HistogramOpts{ - Namespace: "node", - Name: "rotation_total_seconds", - Help: "Node rotation total", - Buckets: []float64{ - 10.0, - 30.0, - 60.0, - 90.0, - 120.0, - 180.0, - 300.0, - 600.0, - 900.0, - }, - }) + //All cluster level node upgrade statistics + nodeRotationTotal = prometheus.NewHistogram( + prometheus.HistogramOpts{ + Namespace: metricNamespace, + Name: "node_rotation_total_seconds", + Help: "Node rotation total", + Buckets: []float64{ + 10.0, + 30.0, + 60.0, + 90.0, + 120.0, + 180.0, + 300.0, + 600.0, + 900.0, + }, + }) + + stepSummaries = make(map[string]map[string]prometheus.Summary) -var stepSummaries = make(map[string]map[string]prometheus.Summary) + CRStatus = prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Namespace: metricNamespace, + Name: "resource_status", + Help: "Rollup CR statistics, partitioned by name.", + }, + []string{ + // name of the CR + "resource_name", + }, + ) +) func InitMetrics() { metrics.Registry.MustRegister(nodeRotationTotal) + metrics.Registry.MustRegister(CRStatus) } // Add rolling update step duration when the step is completed @@ -52,8 +69,8 @@ func AddStepDuration(groupName string, stepName string, duration time.Duration) if s, ok := steps[stepName]; !ok { summary = prometheus.NewSummary( prometheus.SummaryOpts{ - Namespace: "node", - Name: stepName + "_seconds", + Namespace: metricNamespace, + Name: "node_" + stepName + "_seconds", Help: "Summary for node " + stepName, ConstLabels: prometheus.Labels{"group": groupName}, }) @@ -72,3 +89,15 @@ func AddStepDuration(groupName string, stepName string, duration time.Duration) summary.Observe(duration.Seconds()) } } + +func SetRollupInitOrRunningStatus(ruName string) { + CRStatus.WithLabelValues(ruName).Set(0) +} + +func SetRollupCompletedStatus(ruName string) { + CRStatus.WithLabelValues(ruName).Set(1) +} + +func SetRollupFailedStatus(ruName string) { + CRStatus.WithLabelValues(ruName).Set(-1) +} diff --git a/controllers/common/metrics_test.go b/controllers/common/metrics_test.go index 9437beb6..29d14f25 100644 --- a/controllers/common/metrics_test.go +++ b/controllers/common/metrics_test.go @@ -1,8 +1,9 @@ package common import ( - "github.com/onsi/gomega" "testing" + + "github.com/onsi/gomega" ) func TestAddRollingUpgradeStepDuration(t *testing.T) { @@ -27,3 +28,22 @@ func TestAddRollingUpgradeStepDuration(t *testing.T) { AddStepDuration("test-asg", "total", 1) g.Expect(stepSummaries["test-asg"]["kickoff"]).NotTo(gomega.BeNil()) } + +func TestCRStatusCompleted(t *testing.T) { + g := gomega.NewGomegaWithT(t) + + SetRollupInitOrRunningStatus("cr_test_1") + gauage, err := CRStatus.GetMetricWithLabelValues("cr_test_1") + g.Expect(err).To(gomega.BeNil()) + g.Expect(gauage).ToNot(gomega.BeNil()) + + SetRollupCompletedStatus("cr_test_2") + gauage, err = CRStatus.GetMetricWithLabelValues("cr_test_2") + g.Expect(err).To(gomega.BeNil()) + g.Expect(gauage).ToNot(gomega.BeNil()) + + SetRollupFailedStatus("cr_test_3") + gauage, err = CRStatus.GetMetricWithLabelValues("cr_test_3") + g.Expect(err).To(gomega.BeNil()) + g.Expect(gauage).ToNot(gomega.BeNil()) +} diff --git a/controllers/rollingupgrade_controller.go b/controllers/rollingupgrade_controller.go index af18c678..3ee2f841 100644 --- a/controllers/rollingupgrade_controller.go +++ b/controllers/rollingupgrade_controller.go @@ -127,16 +127,21 @@ func (r *RollingUpgradeReconciler) Reconcile(ctx context.Context, req ctrl.Reque r.Info("admitted new rollingupgrade", "name", rollingUpgrade.NamespacedName(), "scalingGroup", scalingGroupName) r.AdmissionMap.Store(rollingUpgrade.NamespacedName(), scalingGroupName) rollingUpgrade.SetCurrentStatus(v1alpha1.StatusInit) + common.SetRollupInitOrRunningStatus(rollingUpgrade.Name) r.Cloud = NewDiscoveredState(r.Auth, r.Logger) if err := r.Cloud.Discover(); err != nil { rollingUpgrade.SetCurrentStatus(v1alpha1.StatusError) + // Set prometheus metric cr_status_failed + common.SetRollupFailedStatus(rollingUpgrade.Name) return ctrl.Result{}, err } // process node rotation if err := r.RotateNodes(rollingUpgrade); err != nil { rollingUpgrade.SetCurrentStatus(v1alpha1.StatusError) + // Set prometheus metric cr_status_failed + common.SetRollupFailedStatus(rollingUpgrade.Name) return ctrl.Result{}, err } diff --git a/controllers/upgrade.go b/controllers/upgrade.go index 7e713784..36783fbc 100644 --- a/controllers/upgrade.go +++ b/controllers/upgrade.go @@ -54,6 +54,7 @@ func (r *RollingUpgradeReconciler) RotateNodes(rollingUpgrade *v1alpha1.RollingU drainInterval = rollingUpgrade.PostDrainDelaySeconds() ) rollingUpgrade.SetCurrentStatus(v1alpha1.StatusRunning) + common.SetRollupInitOrRunningStatus(rollingUpgrade.Name) // set status start time if rollingUpgrade.StartTime() == "" { @@ -84,6 +85,8 @@ func (r *RollingUpgradeReconciler) RotateNodes(rollingUpgrade *v1alpha1.RollingU // check if all instances are rotated. if !r.IsScalingGroupDrifted(rollingUpgrade) { rollingUpgrade.SetCurrentStatus(v1alpha1.StatusComplete) + // Set prometheus metric cr_status_completed + common.SetRollupCompletedStatus(rollingUpgrade.Name) return nil } diff --git a/controllers/upgrade_test.go b/controllers/upgrade_test.go index 5bdc5e53..2572be5a 100644 --- a/controllers/upgrade_test.go +++ b/controllers/upgrade_test.go @@ -261,7 +261,7 @@ func TestIsScalingGroupDrifted(t *testing.T) { false, }, { - "All instances have different launch config as the ASG, expect false from IsScalingGroupDrifted", + "All instances have different launch config as the ASG, expect true from IsScalingGroupDrifted", createRollingUpgradeReconciler(t), createRollingUpgrade(), func() *MockAutoscalingGroup { @@ -284,6 +284,51 @@ func TestIsScalingGroupDrifted(t *testing.T) { } +func TestRotateNodes(t *testing.T) { + var tests = []struct { + TestDescription string + Reconciler *RollingUpgradeReconciler + RollingUpgrade *v1alpha1.RollingUpgrade + AsgClient *MockAutoscalingGroup + ExpectedValue bool + ExpectedStatusValue string + }{ + { + "All instances have different launch config as the ASG, expect true from IsScalingGroupDrifted", + createRollingUpgradeReconciler(t), + createRollingUpgrade(), + func() *MockAutoscalingGroup { + newAsgClient := createASGClient() + newAsgClient.autoScalingGroups[0].LaunchConfigurationName = aws.String("different-launch-config") + return newAsgClient + }(), + true, + v1alpha1.StatusRunning, + }, + { + "All instances have the same launch config as the ASG, expect false from IsScalingGroupDrifted", + createRollingUpgradeReconciler(t), + createRollingUpgrade(), + createASGClient(), + false, + v1alpha1.StatusComplete, + }, + } + for _, test := range tests { + test.Reconciler.Cloud.ScalingGroups = test.AsgClient.autoScalingGroups + test.Reconciler.Auth.AmazonClientSet.AsgClient = test.AsgClient + + err := test.Reconciler.RotateNodes(test.RollingUpgrade) + if err != nil { + t.Errorf("Test Description: \n expected value: nil, actual value: %v", err) + } + if test.RollingUpgrade.CurrentStatus() != test.ExpectedStatusValue { + t.Errorf("Test Description: %s \n expected value: %s, actual value: %s", test.TestDescription, test.ExpectedStatusValue, test.RollingUpgrade.CurrentStatus()) + } + } + +} + func TestDesiredNodesReady(t *testing.T) { var tests = []struct { TestDescription string