Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Metrics features for controller v2 #189

Merged
merged 1 commit into from
Mar 2, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
106 changes: 106 additions & 0 deletions api/v1alpha1/rollingupgrade_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import (
"fmt"
"strconv"
"strings"
"time"

"github.com/keikoproj/upgrade-manager/controllers/common"
corev1 "k8s.io/api/core/v1"
Expand Down Expand Up @@ -52,6 +53,85 @@ type RollingUpgradeStatus struct {
Conditions []RollingUpgradeCondition `json:"conditions,omitempty"`
LastNodeTerminationTime metav1.Time `json:"lastTerminationTime,omitempty"`
LastNodeDrainTime metav1.Time `json:"lastDrainTime,omitempty"`

Statistics []*RollingUpgradeStatistics `json:"statistics,omitempty"`
InProcessingNodes map[string]*NodeInProcessing `json:"inProcessingNodes,omitempty"`
}

// RollingUpgradeStatistics is the aggregated timing for one rotation step:
// the sum of all recorded durations and the number of durations in that sum.
type RollingUpgradeStatistics struct {
// StepName is the step these figures belong to.
StepName RollingUpgradeStep `json:"stepName,omitempty"`
// DurationSum is the running total of every duration recorded for the step.
DurationSum metav1.Duration `json:"durationSum,omitempty"`
// DurationCount is how many durations have been added into DurationSum.
DurationCount int32 `json:"durationCount,omitempty"`
}

// NodeInProcessing tracks a node that is currently being rotated: which step
// it is in and the timestamps used to compute step and total durations.
type NodeInProcessing struct {
// NodeName is the name of the tracked node.
NodeName string `json:"nodeName,omitempty"`
// StepName is the step the node is currently in.
StepName RollingUpgradeStep `json:"stepName,omitempty"`
// UpgradeStartTime is when the node was first seen; the total rotation
// duration is measured from here.
UpgradeStartTime metav1.Time `json:"upgradeStartTime,omitempty"`
// StepStartTime is when the node entered its current step.
StepStartTime metav1.Time `json:"stepStartTime,omitempty"`
// StepEndTime is refreshed on every NodeStep call for an already tracked
// node and is the end point for the step and total durations.
StepEndTime metav1.Time `json:"stepEndTime,omitempty"`
}

// addStepDuration records one completed step of a node rotation.
//
// The duration is folded into the Statistics entry for stepName (sum and
// count), creating that entry on first use, and is then forwarded to the
// cluster-level metrics through common.AddRollingUpgradeStepDuration.
//
// asgName is only used as the label for the cluster-level metrics.
func (s *RollingUpgradeStatus) addStepDuration(asgName string, stepName RollingUpgradeStep, duration time.Duration) {
	// Find the existing entry for this step, if any. Nil entries are skipped
	// so a malformed status cannot cause a nil dereference here.
	var stat *RollingUpgradeStatistics
	for _, candidate := range s.Statistics {
		if candidate != nil && candidate.StepName == stepName {
			stat = candidate
			break
		}
	}
	if stat == nil {
		stat = &RollingUpgradeStatistics{StepName: stepName}
		s.Statistics = append(s.Statistics, stat)
	}

	stat.DurationSum = metav1.Duration{
		Duration: stat.DurationSum.Duration + duration,
	}
	stat.DurationCount++

	// Report every observation to the system-level metrics, not only the one
	// that creates the Statistics entry; otherwise the exported metrics would
	// contain a single sample per step.
	common.AddRollingUpgradeStepDuration(asgName, string(stepName), duration)
}

// NodeStep records that node nodeName has reached step stepName.
//
// The first call for a node starts tracking it in InProcessingNodes with the
// given step. Later calls stamp StepEndTime and then:
//   - for NodeRotationCompleted, record the duration of the current step and
//     the total rotation duration, and stop tracking the node;
//   - for a different step that is ordered after the current one in
//     NodeRotationStepOrders, record the duration of the current step and
//     move the node to the new step;
//   - otherwise (same step again, or a step that is not later) record
//     nothing.
func (s *RollingUpgradeStatus) NodeStep(asgName string, nodeName string, stepName RollingUpgradeStep) {
	if s.InProcessingNodes == nil {
		s.InProcessingNodes = make(map[string]*NodeInProcessing)
	}

	node, tracked := s.InProcessingNodes[nodeName]
	if !tracked {
		// First time this node is seen: start tracking, nothing to measure yet.
		s.InProcessingNodes[nodeName] = &NodeInProcessing{
			NodeName:         nodeName,
			StepName:         stepName,
			UpgradeStartTime: metav1.Now(),
			StepStartTime:    metav1.Now(),
		}
		return
	}

	node.StepEndTime = metav1.Now()
	stepElapsed := node.StepEndTime.Sub(node.StepStartTime.Time)

	switch {
	case stepName == NodeRotationCompleted:
		// Rotation finished: close the current step, add the overall
		// duration, and drop the node from the in-processing map.
		totalElapsed := node.StepEndTime.Sub(node.UpgradeStartTime.Time)
		s.addStepDuration(asgName, node.StepName, stepElapsed)
		s.addStepDuration(asgName, NodeRotationTotal, totalElapsed)
		delete(s.InProcessingNodes, nodeName)
	case node.StepName != stepName:
		// The step changed; only accept it when it moves forward in the
		// configured order.
		previousOrder := NodeRotationStepOrders[node.StepName]
		requestedOrder := NodeRotationStepOrders[stepName]
		if requestedOrder > previousOrder {
			s.addStepDuration(asgName, node.StepName, stepElapsed)
			node.StepStartTime = metav1.Now()
			node.StepName = stepName
		}
	}
}

func (s *RollingUpgradeStatus) SetCondition(cond RollingUpgradeCondition) {
Expand Down Expand Up @@ -115,6 +195,8 @@ type NodeReadinessGate struct {
MatchLabels map[string]string `json:"matchLabels,omitempty" protobuf:"bytes,1,rep,name=matchLabels"`
}

// RollingUpgradeStep names a stage of a node rotation. It is the key type of
// NodeRotationStepOrders and the step identifier stored in the status
// statistics and in-processing node entries.
type RollingUpgradeStep string

const (
// Status
StatusInit = "init"
Expand All @@ -124,8 +206,32 @@ const (

// Conditions
UpgradeComplete UpgradeConditionType = "Complete"

NodeRotationTotal RollingUpgradeStep = "total"

NodeRotationKickoff RollingUpgradeStep = "kickoff"
NodeRotationDesiredNodeReady RollingUpgradeStep = "desired_node_ready"
NodeRotationPredrainScript RollingUpgradeStep = "predrain_script"
NodeRotationDrain RollingUpgradeStep = "drain"
NodeRotationPostdrainScript RollingUpgradeStep = "postdrain_script"
NodeRotationPostWait RollingUpgradeStep = "post_wait"
NodeRotationTerminate RollingUpgradeStep = "terminate"
NodeRotationPostTerminate RollingUpgradeStep = "post_terminate"
NodeRotationCompleted RollingUpgradeStep = "completed"
)

// NodeRotationStepOrders assigns each rotation step an ordinal. NodeStep only
// moves a node to a different step when the new step's ordinal is strictly
// greater than the current one. NodeRotationTotal has no entry here, so a
// lookup for it yields the zero value.
var NodeRotationStepOrders = map[RollingUpgradeStep]int{
NodeRotationKickoff: 10,
NodeRotationDesiredNodeReady: 20,
NodeRotationPredrainScript: 30,
NodeRotationDrain: 40,
NodeRotationPostdrainScript: 50,
NodeRotationPostWait: 60,
NodeRotationTerminate: 70,
NodeRotationPostTerminate: 80,
NodeRotationCompleted: 1000,
}

var (
FiniteStates = []string{StatusComplete, StatusError}
AllowedStrategyType = []string{string(RandomUpdateStrategy), string(UniformAcrossAzUpdateStrategy)}
Expand Down
47 changes: 47 additions & 0 deletions api/v1alpha1/rollingupgrade_types_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
package v1alpha1

import (
"github.com/onsi/gomega"
"testing"
)

// TestNodeTurnsOntoStep walks a node through kickoff, desired_node_ready
// (repeated) and completed, then starts a second node, checking after each
// transition which step statistics have been recorded.
func TestNodeTurnsOntoStep(t *testing.T) {
	g := gomega.NewGomegaWithT(t)

	const asg = "test-asg"
	status := &RollingUpgradeStatus{}
	// advance reports that the given node reached the given step.
	advance := func(node string, step RollingUpgradeStep) {
		status.NodeStep(asg, node, step)
	}

	// The first step only starts tracking the node; nothing is measured yet.
	advance("node-1", NodeRotationKickoff)
	g.Expect(status.InProcessingNodes).NotTo(gomega.BeNil())
	g.Expect(status.Statistics).To(gomega.BeNil())

	// Moving to the next step records the duration of kickoff.
	advance("node-1", NodeRotationDesiredNodeReady)
	g.Expect(status.Statistics).NotTo(gomega.BeNil())
	g.Expect(len(status.Statistics)).To(gomega.Equal(1))
	g.Expect(status.Statistics[0].StepName).To(gomega.Equal(NodeRotationKickoff))

	// Retry desired_node_ready
	advance("node-1", NodeRotationDesiredNodeReady)
	g.Expect(len(status.Statistics)).To(gomega.Equal(1))
	g.Expect(status.Statistics[0].StepName).To(gomega.Equal(NodeRotationKickoff))

	// Retry desired_node_ready again
	advance("node-1", NodeRotationDesiredNodeReady)
	g.Expect(len(status.Statistics)).To(gomega.Equal(1))
	g.Expect(status.Statistics[0].StepName).To(gomega.Equal(NodeRotationKickoff))

	// Completed: the open step and the overall total are both recorded.
	advance("node-1", NodeRotationCompleted)
	g.Expect(len(status.Statistics)).To(gomega.Equal(3))
	g.Expect(status.Statistics[1].StepName).To(gomega.Equal(NodeRotationDesiredNodeReady))
	g.Expect(status.Statistics[2].StepName).To(gomega.Equal(NodeRotationTotal))

	// Second node: existing step entries are reused, so the count stays put.
	advance("node-2", NodeRotationKickoff)
	g.Expect(len(status.Statistics)).To(gomega.Equal(3))

	advance("node-2", NodeRotationDesiredNodeReady)
	g.Expect(len(status.Statistics)).To(gomega.Equal(3))
}
60 changes: 60 additions & 0 deletions api/v1alpha1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

33 changes: 33 additions & 0 deletions config/crd/bases/upgrademgr.keikoproj.io_rollingupgrades.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,25 @@ spec:
type: string
endTime:
type: string
inProcessingNodes:
additionalProperties:
description: Node In-processing
properties:
nodeName:
type: string
stepEndTime:
format: date-time
type: string
stepName:
type: string
stepStartTime:
format: date-time
type: string
upgradeStartTime:
format: date-time
type: string
type: object
type: object
lastDrainTime:
format: date-time
type: string
Expand All @@ -143,6 +162,20 @@ spec:
type: integer
startTime:
type: string
statistics:
items:
description: RollingUpgrade Statistics, includes summary(sum/count)
from each step
properties:
durationCount:
format: int32
type: integer
durationSum:
type: string
stepName:
type: string
type: object
type: array
totalNodes:
type: integer
totalProcessingTime:
Expand Down
74 changes: 74 additions & 0 deletions controllers/common/metrics.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
package common

import (
"github.com/keikoproj/upgrade-manager/controllers/common/log"
"github.com/prometheus/client_golang/prometheus"
"reflect"
"sigs.k8s.io/controller-runtime/pkg/metrics"
"strings"
"time"
)

//All cluster level node upgrade statistics

// nodeRotationTotal is the histogram of total node rotation durations. It is
// fed by AddRollingUpgradeStepDuration with duration.Seconds() for the
// "total" step, so the bucket bounds below are in seconds (10s up to 900s).
var nodeRotationTotal = prometheus.NewHistogram(
prometheus.HistogramOpts{
Namespace: "node",
Name: "rotation_total_seconds",
Help: "Node rotation total",
Buckets: []float64{
10.0,
30.0,
60.0,
90.0,
120.0,
180.0,
300.0,
600.0,
900.0,
},
})

// stepSummaries caches the per-step summaries created by
// AddRollingUpgradeStepDuration, keyed first by ASG name and then by step
// name. It is a plain map; no locking is done around it in this file.
var stepSummaries = make(map[string]map[string]prometheus.Summary)

// InitMetrics registers the nodeRotationTotal histogram with the
// controller-runtime metrics registry. MustRegister panics if registration
// fails.
func InitMetrics() {
metrics.Registry.MustRegister(nodeRotationTotal)
}

// AddRollingUpgradeStepDuration records the duration of a completed rolling
// upgrade step in the cluster-level metrics.
//
// A step named "total" (compared case-insensitively) is observed on the
// nodeRotationTotal histogram. Any other step is observed on a summary named
// "<stepName>_seconds" in the "node" namespace with a constant "asg" label;
// the summary is created, registered and cached in stepSummaries the first
// time the (asgName, stepName) pair is seen. Registration failures are logged
// and the observation is still made on the newly created summary.
func AddRollingUpgradeStepDuration(asgName string, stepName string, duration time.Duration) {
	if strings.EqualFold(stepName, "total") {
		// Overall rotation time goes to the histogram.
		nodeRotationTotal.Observe(duration.Seconds())
		return
	}

	// Every other step goes to a per-ASG, per-step summary.
	perASG, known := stepSummaries[asgName]
	if !known {
		perASG = make(map[string]prometheus.Summary)
		stepSummaries[asgName] = perASG
	}

	stepSummary, cached := perASG[stepName]
	if !cached {
		stepSummary = prometheus.NewSummary(
			prometheus.SummaryOpts{
				Namespace:   "node",
				Name:        stepName + "_seconds",
				Help:        "Summary for node " + stepName,
				ConstLabels: prometheus.Labels{"asg": asgName},
			})
		if err := metrics.Registry.Register(stepSummary); err != nil {
			switch reflect.TypeOf(err).String() {
			case "prometheus.AlreadyRegisteredError":
				log.Warnf("summary was registered again, ASG: %s, step: %s", asgName, stepName)
			default:
				log.Errorf("register summary error, ASG: %s, step: %s, %v", asgName, stepName, err)
			}
		}
		perASG[stepName] = stepSummary
	}

	stepSummary.Observe(duration.Seconds())
}
Loading