
add metrics for recreate/update resource event when sync work status #5247

Merged · 1 commit · Jul 26, 2024
10 changes: 7 additions & 3 deletions pkg/controllers/status/work_status_controller.go
@@ -41,6 +41,7 @@ import (
 	workv1alpha1 "github.com/karmada-io/karmada/pkg/apis/work/v1alpha1"
 	workv1alpha2 "github.com/karmada-io/karmada/pkg/apis/work/v1alpha2"
 	"github.com/karmada-io/karmada/pkg/events"
+	"github.com/karmada-io/karmada/pkg/metrics"
 	"github.com/karmada-io/karmada/pkg/resourceinterpreter"
 	"github.com/karmada-io/karmada/pkg/sharedcli/ratelimiterflag"
 	"github.com/karmada-io/karmada/pkg/util"
@@ -239,9 +240,11 @@ func (c *WorkStatusController) syncWorkStatus(key util.QueueKey) error {
 	}
 
 	if needUpdate {
-		if err := c.ObjectWatcher.Update(clusterName, desiredObj, observedObj); err != nil {
-			klog.Errorf("Updating %s failed: %v", fedKey.String(), err)
-			return err
+		updateErr := c.ObjectWatcher.Update(clusterName, desiredObj, observedObj)
+		metrics.CountUpdateResourceToCluster(updateErr, desiredObj.GetAPIVersion(), desiredObj.GetKind(), clusterName)
+		if updateErr != nil {
+			klog.Errorf("Updating %s failed: %v", fedKey.String(), updateErr)
+			return updateErr
 		}
 		// We can't return even after a successful update, because that might lose the chance to collect status.
 		// Not all updates are real; some may be no-ops, in which case there will be no more events for this update,
@@ -283,6 +286,7 @@ func (c *WorkStatusController) handleDeleteEvent(key keys.FederatedKey) error {
 	}
 
 	reCreateErr := c.recreateResourceIfNeeded(work, key)
+	metrics.CountRecreateResourceToCluster(reCreateErr, key.GroupVersion().String(), key.Kind, key.Cluster)
 	if reCreateErr != nil {
 		c.updateAppliedCondition(work, metav1.ConditionFalse, "ReCreateFailed", reCreateErr.Error())
 		return reCreateErr
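The controller changes above follow one pattern: capture the operation's error in a variable, increment the counter with that result, and only then handle the error, so both successful and failed attempts are counted before any early return. Below is a minimal, self-contained sketch of that pattern using prometheus/client_golang; the demoCounter name and the resultOf helper are illustrative stand-ins, and resultOf only approximates what Karmada's utilmetrics.GetResultByError presumably does (its implementation is not part of this diff).

package main

import (
	"errors"
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
)

// demoCounter is an illustrative stand-in for the counters added in pkg/metrics/resource.go.
var demoCounter = prometheus.NewCounterVec(prometheus.CounterOpts{
	Name: "update_resource_to_cluster",
	Help: "Number of update operations of a resource in a target member cluster.",
}, []string{"result", "apiversion", "kind", "cluster"})

// resultOf maps a nil/non-nil error to the "success"/"error" label values,
// mirroring what a helper like utilmetrics.GetResultByError presumably does.
func resultOf(err error) string {
	if err != nil {
		return "error"
	}
	return "success"
}

func main() {
	prometheus.MustRegister(demoCounter)

	// Capture the error first, count it, then handle it -- the same ordering the
	// PR introduces, so failures are still counted before the function returns.
	updateErr := errors.New("simulated update failure")
	demoCounter.WithLabelValues(resultOf(updateErr), "apps/v1", "Deployment", "member1").Inc()
	if updateErr != nil {
		fmt.Println("update failed:", updateErr)
	}
}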
24 changes: 24 additions & 0 deletions pkg/metrics/resource.go
@@ -30,6 +30,8 @@ const (
 	policyApplyAttemptsMetricsName = "policy_apply_attempts_total"
 	syncWorkDurationMetricsName = "binding_sync_work_duration_seconds"
 	syncWorkloadDurationMetricsName = "work_sync_workload_duration_seconds"
+	recreateResourceToCluster = "recreate_resource_to_cluster"
+	updateResourceToCluster = "update_resource_to_cluster"
 	policyPreemptionMetricsName = "policy_preemption_total"
 	cronFederatedHPADurationMetricsName = "cronfederatedhpa_process_duration_seconds"
 	cronFederatedHPARuleDurationMetricsName = "cronfederatedhpa_rule_process_duration_seconds"
@@ -67,6 +69,16 @@ var (
 		Buckets: prometheus.ExponentialBuckets(0.001, 2, 12),
 	}, []string{"result"})
 
+	recreateResourceWhenSyncWorkStatus = prometheus.NewCounterVec(prometheus.CounterOpts{
+		Name: recreateResourceToCluster,
+		Help: "Number of recreate operations of a resource in a target member cluster. For the 'result' label, 'error' means the recreation failed, otherwise 'success'. The 'cluster' label is the target member cluster.",
+	}, []string{"result", "apiversion", "kind", "cluster"})
+
+	updateResourceWhenSyncWorkStatus = prometheus.NewCounterVec(prometheus.CounterOpts{
+		Name: updateResourceToCluster,
+		Help: "Number of update operations of a resource in a target member cluster. For the 'result' label, 'error' means the update failed, otherwise 'success'. The 'cluster' label is the target member cluster.",
+	}, []string{"result", "apiversion", "kind", "cluster"})
+
 	policyPreemptionCounter = prometheus.NewCounterVec(prometheus.CounterOpts{
 		Name: policyPreemptionMetricsName,
 		Help: "Number of preemption for the resource template. By the result, 'error' means a resource template failed to be preempted by other propagation policies. Otherwise 'success'.",
@@ -118,6 +130,16 @@ func ObserveSyncWorkloadLatency(err error, start time.Time) {
 	syncWorkloadDurationHistogram.WithLabelValues(utilmetrics.GetResultByError(err)).Observe(utilmetrics.DurationInSeconds(start))
 }
 
+// CountRecreateResourceToCluster records a recreate operation of a resource in a target member cluster.
+func CountRecreateResourceToCluster(err error, apiVersion, kind, cluster string) {
+	recreateResourceWhenSyncWorkStatus.WithLabelValues(utilmetrics.GetResultByError(err), apiVersion, kind, cluster).Inc()
+}
+
+// CountUpdateResourceToCluster records an update operation of a resource in a target member cluster.
+func CountUpdateResourceToCluster(err error, apiVersion, kind, cluster string) {
+	updateResourceWhenSyncWorkStatus.WithLabelValues(utilmetrics.GetResultByError(err), apiVersion, kind, cluster).Inc()
+}
+
 // CountPolicyPreemption records the numbers of policy preemption.
 func CountPolicyPreemption(err error) {
 	policyPreemptionCounter.WithLabelValues(utilmetrics.GetResultByError(err)).Inc()
@@ -151,6 +173,8 @@ func ResourceCollectors() []prometheus.Collector {
 		policyApplyAttempts,
 		syncWorkDurationHistogram,
 		syncWorkloadDurationHistogram,
+		recreateResourceWhenSyncWorkStatus,
+		updateResourceWhenSyncWorkStatus,
 		policyPreemptionCounter,
 		cronFederatedHPADurationHistogram,
 		cronFederatedHPARuleDurationHistogram,
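The new counters only become visible once the collectors returned by ResourceCollectors() are registered and scraped. How karmada-controller-manager actually wires that up is not shown in this diff; the sketch below is only an illustration, using the standard client_golang promhttp handler and an arbitrary :8080 listen address, of how the collectors could be exposed and what a scraped sample might look like.

package main

import (
	"net/http"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promhttp"

	"github.com/karmada-io/karmada/pkg/metrics"
)

func main() {
	// Register every collector returned by ResourceCollectors, including the two
	// new counters (recreate_resource_to_cluster, update_resource_to_cluster).
	registry := prometheus.NewRegistry()
	registry.MustRegister(metrics.ResourceCollectors()...)

	// After a failed update of a hypothetical Deployment to cluster "member1",
	// a scrape of /metrics would contain a sample like:
	//   update_resource_to_cluster{apiversion="apps/v1",cluster="member1",kind="Deployment",result="error"} 1
	http.Handle("/metrics", promhttp.HandlerFor(registry, promhttp.HandlerOpts{}))
	_ = http.ListenAndServe(":8080", nil)
}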