From 32850c405c42a73c7402b1e3c5e949a30ffdd39b Mon Sep 17 00:00:00 2001
From: chaosi-zju
Date: Thu, 25 Jul 2024 16:00:20 +0800
Subject: [PATCH] add metrics for recreate/update resource events when syncing work status

Signed-off-by: chaosi-zju
---
 .../status/work_status_controller.go | 10 +++++---
 pkg/metrics/resource.go              | 24 +++++++++++++++++++
 2 files changed, 31 insertions(+), 3 deletions(-)

diff --git a/pkg/controllers/status/work_status_controller.go b/pkg/controllers/status/work_status_controller.go
index 7178b00ab5e6..670cf35aa476 100644
--- a/pkg/controllers/status/work_status_controller.go
+++ b/pkg/controllers/status/work_status_controller.go
@@ -41,6 +41,7 @@ import (
 	workv1alpha1 "github.com/karmada-io/karmada/pkg/apis/work/v1alpha1"
 	workv1alpha2 "github.com/karmada-io/karmada/pkg/apis/work/v1alpha2"
 	"github.com/karmada-io/karmada/pkg/events"
+	"github.com/karmada-io/karmada/pkg/metrics"
 	"github.com/karmada-io/karmada/pkg/resourceinterpreter"
 	"github.com/karmada-io/karmada/pkg/sharedcli/ratelimiterflag"
 	"github.com/karmada-io/karmada/pkg/util"
@@ -239,9 +240,11 @@ func (c *WorkStatusController) syncWorkStatus(key util.QueueKey) error {
 	}
 
 	if needUpdate {
-		if err := c.ObjectWatcher.Update(clusterName, desiredObj, observedObj); err != nil {
-			klog.Errorf("Updating %s failed: %v", fedKey.String(), err)
-			return err
+		updateErr := c.ObjectWatcher.Update(clusterName, desiredObj, observedObj)
+		metrics.CountUpdateResourceToCluster(updateErr, desiredObj.GetAPIVersion(), desiredObj.GetKind(), clusterName)
+		if updateErr != nil {
+			klog.Errorf("Updating %s failed: %v", fedKey.String(), updateErr)
+			return updateErr
 		}
 		// We can't return even after a success updates, because that might lose the chance to collect status.
 		// Not all updates are real, they might be no change, in that case there will be no more event for this update,
@@ -283,6 +286,7 @@ func (c *WorkStatusController) handleDeleteEvent(key keys.FederatedKey) error {
 	}
 
 	reCreateErr := c.recreateResourceIfNeeded(work, key)
+	metrics.CountRecreateResourceToCluster(reCreateErr, key.GroupVersion().String(), key.Kind, key.Cluster)
 	if reCreateErr != nil {
 		c.updateAppliedCondition(work, metav1.ConditionFalse, "ReCreateFailed", reCreateErr.Error())
 		return reCreateErr
diff --git a/pkg/metrics/resource.go b/pkg/metrics/resource.go
index 8e81ca3155a2..a4c0caa99c8a 100644
--- a/pkg/metrics/resource.go
+++ b/pkg/metrics/resource.go
@@ -30,6 +30,8 @@ const (
 	policyApplyAttemptsMetricsName          = "policy_apply_attempts_total"
 	syncWorkDurationMetricsName             = "binding_sync_work_duration_seconds"
 	syncWorkloadDurationMetricsName         = "work_sync_workload_duration_seconds"
+	recreateResourceToCluster               = "recreate_resource_to_cluster"
+	updateResourceToCluster                 = "update_resource_to_cluster"
 	policyPreemptionMetricsName             = "policy_preemption_total"
 	cronFederatedHPADurationMetricsName     = "cronfederatedhpa_process_duration_seconds"
 	cronFederatedHPARuleDurationMetricsName = "cronfederatedhpa_rule_process_duration_seconds"
@@ -67,6 +69,16 @@ var (
 		Buckets: prometheus.ExponentialBuckets(0.001, 2, 12),
 	}, []string{"result"})
+	recreateResourceWhenSyncWorkStatus = prometheus.NewCounterVec(prometheus.CounterOpts{
+		Name: recreateResourceToCluster,
+		Help: "Number of recreate operations of resources to a target member cluster. By the result, 'error' means the resource failed to be recreated. Otherwise 'success'. Cluster means the target member cluster.",
+	}, []string{"result", "apiversion", "kind", "cluster"})
+
+	updateResourceWhenSyncWorkStatus = prometheus.NewCounterVec(prometheus.CounterOpts{
+		Name: updateResourceToCluster,
+		Help: "Number of update operations of resources to a target member cluster. By the result, 'error' means the resource failed to be updated. Otherwise 'success'. Cluster means the target member cluster.",
+	}, []string{"result", "apiversion", "kind", "cluster"})
+
 	policyPreemptionCounter = prometheus.NewCounterVec(prometheus.CounterOpts{
 		Name: policyPreemptionMetricsName,
 		Help: "Number of preemption for the resource template. By the result, 'error' means a resource template failed to be preempted by other propagation policies. Otherwise 'success'.",
 	}, []string{"result"})
@@ -118,6 +130,16 @@ func ObserveSyncWorkloadLatency(err error, start time.Time) {
 	syncWorkloadDurationHistogram.WithLabelValues(utilmetrics.GetResultByError(err)).Observe(utilmetrics.DurationInSeconds(start))
 }
+// CountRecreateResourceToCluster records the number of recreate operations of resources to a target member cluster.
+func CountRecreateResourceToCluster(err error, apiVersion, kind, cluster string) {
+	recreateResourceWhenSyncWorkStatus.WithLabelValues(utilmetrics.GetResultByError(err), apiVersion, kind, cluster).Inc()
+}
+
+// CountUpdateResourceToCluster records the number of update operations of resources to a target member cluster.
+func CountUpdateResourceToCluster(err error, apiVersion, kind, cluster string) {
+	updateResourceWhenSyncWorkStatus.WithLabelValues(utilmetrics.GetResultByError(err), apiVersion, kind, cluster).Inc()
+}
+
 // CountPolicyPreemption records the numbers of policy preemption.
 func CountPolicyPreemption(err error) {
 	policyPreemptionCounter.WithLabelValues(utilmetrics.GetResultByError(err)).Inc()
 }
@@ -151,6 +173,8 @@ func ResourceCollectors() []prometheus.Collector {
 		policyApplyAttempts,
 		syncWorkDurationHistogram,
 		syncWorkloadDurationHistogram,
+		recreateResourceWhenSyncWorkStatus,
+		updateResourceWhenSyncWorkStatus,
 		policyPreemptionCounter,
 		cronFederatedHPADurationHistogram,
 		cronFederatedHPARuleDurationHistogram,
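
Note: the counters above follow the existing result-labeled pattern in pkg/metrics, adding apiversion/kind/cluster labels. The sketch below is an illustrative, self-contained approximation rather than Karmada code: it assumes utilmetrics.GetResultByError maps a nil error to "success" and anything else to "error" (mirrored here by a local resultByError helper), reuses the update_resource_to_cluster metric name from this patch, and uses a hypothetical countUpdate wrapper, cluster name "member1", and "conflict" error purely for demonstration.

package main

import (
	"errors"
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
)

// resultByError mirrors what utilmetrics.GetResultByError is assumed to do:
// a nil error counts as "success", anything else as "error".
func resultByError(err error) string {
	if err != nil {
		return "error"
	}
	return "success"
}

// updateResourceCounter mimics the counter added by this patch: one time
// series per (result, apiversion, kind, cluster) label combination.
var updateResourceCounter = prometheus.NewCounterVec(prometheus.CounterOpts{
	Name: "update_resource_to_cluster",
	Help: "Number of update operations of resources to a target member cluster.",
}, []string{"result", "apiversion", "kind", "cluster"})

// countUpdate is a hypothetical stand-in for metrics.CountUpdateResourceToCluster.
func countUpdate(err error, apiVersion, kind, cluster string) {
	updateResourceCounter.WithLabelValues(resultByError(err), apiVersion, kind, cluster).Inc()
}

func main() {
	reg := prometheus.NewRegistry()
	reg.MustRegister(updateResourceCounter)

	// One successful and one failed update of a Deployment on a cluster named "member1".
	countUpdate(nil, "apps/v1", "Deployment", "member1")
	countUpdate(errors.New("conflict"), "apps/v1", "Deployment", "member1")

	// Gather and print the resulting series: one per label combination.
	families, err := reg.Gather()
	if err != nil {
		panic(err)
	}
	for _, mf := range families {
		for _, m := range mf.GetMetric() {
			fmt.Printf("%s %v = %v\n", mf.GetName(), m.GetLabel(), m.GetCounter().GetValue())
		}
	}
}

With both label sets incremented, Prometheus exposes a separate series per (result, apiversion, kind, cluster) combination, which is what makes the per-cluster success/error breakdown introduced by this patch queryable.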