Skip to content

Commit

Permalink
add metrics for recreate/update resource events when syncing work status
Browse files Browse the repository at this point in the history
Signed-off-by: chaosi-zju <[email protected]>
  • Loading branch information
chaosi-zju committed Jul 26, 2024
1 parent bc1c96e commit 8545a71
Show file tree
Hide file tree
Showing 2 changed files with 31 additions and 3 deletions.
10 changes: 7 additions & 3 deletions pkg/controllers/status/work_status_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ import (
workv1alpha1 "github.com/karmada-io/karmada/pkg/apis/work/v1alpha1"
workv1alpha2 "github.com/karmada-io/karmada/pkg/apis/work/v1alpha2"
"github.com/karmada-io/karmada/pkg/events"
"github.com/karmada-io/karmada/pkg/metrics"
"github.com/karmada-io/karmada/pkg/resourceinterpreter"
"github.com/karmada-io/karmada/pkg/sharedcli/ratelimiterflag"
"github.com/karmada-io/karmada/pkg/util"
Expand Down Expand Up @@ -239,9 +240,11 @@ func (c *WorkStatusController) syncWorkStatus(key util.QueueKey) error {
}

if needUpdate {
if err := c.ObjectWatcher.Update(clusterName, desiredObj, observedObj); err != nil {
klog.Errorf("Updating %s failed: %v", fedKey.String(), err)
return err
updateErr := c.ObjectWatcher.Update(clusterName, desiredObj, observedObj)
metrics.CountUpdateResourceToCluster(updateErr, desiredObj.GroupVersionKind().Group, desiredObj.GetKind(), clusterName)
if updateErr != nil {
klog.Errorf("Updating %s failed: %v", fedKey.String(), updateErr)
return updateErr
}
// We can't return even after a success updates, because that might lose the chance to collect status.
// Not all updates are real, they might be no change, in that case there will be no more event for this update,
Expand Down Expand Up @@ -283,6 +286,7 @@ func (c *WorkStatusController) handleDeleteEvent(key keys.FederatedKey) error {
}

reCreateErr := c.recreateResourceIfNeeded(work, key)
metrics.CountRecreateResourceInCluster(reCreateErr, key.Group, key.Kind, key.Cluster)
if reCreateErr != nil {
c.updateAppliedCondition(work, metav1.ConditionFalse, "ReCreateFailed", reCreateErr.Error())
return reCreateErr
Expand Down
24 changes: 24 additions & 0 deletions pkg/metrics/resource.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@ const (
policyApplyAttemptsMetricsName = "policy_apply_attempts_total"
syncWorkDurationMetricsName = "binding_sync_work_duration_seconds"
syncWorkloadDurationMetricsName = "work_sync_workload_duration_seconds"
recreateResourceToCluster = "recreate_resource_to_cluster"
updateResourceToCluster = "update_resource_to_cluster"
policyPreemptionMetricsName = "policy_preemption_total"
cronFederatedHPADurationMetricsName = "cronfederatedhpa_process_duration_seconds"
cronFederatedHPARuleDurationMetricsName = "cronfederatedhpa_rule_process_duration_seconds"
Expand Down Expand Up @@ -67,6 +69,16 @@ var (
Buckets: prometheus.ExponentialBuckets(0.001, 2, 12),
}, []string{"result"})

// recreateResourceWhenSyncWorkStatus counts recreate attempts of a resource in a
// target member cluster, labeled by result ("error"/"success"), API group, kind
// and cluster.
// NOTE(review): Prometheus naming conventions recommend a `_total` suffix for
// counters; the chosen name lacks it — confirm this is intentional before release.
recreateResourceWhenSyncWorkStatus = prometheus.NewCounterVec(prometheus.CounterOpts{
Name: recreateResourceToCluster,
Help: "Number of recreating operation of the resource to a target member cluster. By the result, 'error' means a resource recreated failed. Otherwise 'success'. Cluster means the target member cluster.",
}, []string{"result", "apigroup", "kind", "cluster"})

// updateResourceWhenSyncWorkStatus counts update attempts of a resource in a
// target member cluster, with the same label set as the recreate counter above.
// NOTE(review): same `_total` suffix consideration applies to this counter name.
updateResourceWhenSyncWorkStatus = prometheus.NewCounterVec(prometheus.CounterOpts{
Name: updateResourceToCluster,
Help: "Number of updating operation of the resource to a target member cluster. By the result, 'error' means a resource updated failed. Otherwise 'success'. Cluster means the target member cluster.",
}, []string{"result", "apigroup", "kind", "cluster"})

policyPreemptionCounter = prometheus.NewCounterVec(prometheus.CounterOpts{
Name: policyPreemptionMetricsName,
Help: "Number of preemption for the resource template. By the result, 'error' means a resource template failed to be preempted by other propagation policies. Otherwise 'success'.",
Expand Down Expand Up @@ -118,6 +130,16 @@ func ObserveSyncWorkloadLatency(err error, start time.Time) {
syncWorkloadDurationHistogram.WithLabelValues(utilmetrics.GetResultByError(err)).Observe(utilmetrics.DurationInSeconds(start))
}

// CountRecreateResourceInCluster records the number of recreating operation of the resource to a target member cluster.
func CountRecreateResourceInCluster(err error, apigroup, kind, cluster string) {
recreateResourceWhenSyncWorkStatus.WithLabelValues(utilmetrics.GetResultByError(err), apigroup, kind, cluster).Inc()
}

// CountUpdateResourceToCluster records the number of updating operation of the resource to a target member cluster.
func CountUpdateResourceToCluster(err error, apigroup, kind, cluster string) {
updateResourceWhenSyncWorkStatus.WithLabelValues(utilmetrics.GetResultByError(err), apigroup, kind, cluster).Inc()
}

// CountPolicyPreemption records the numbers of policy preemption.
func CountPolicyPreemption(err error) {
policyPreemptionCounter.WithLabelValues(utilmetrics.GetResultByError(err)).Inc()
Expand Down Expand Up @@ -151,6 +173,8 @@ func ResourceCollectors() []prometheus.Collector {
policyApplyAttempts,
syncWorkDurationHistogram,
syncWorkloadDurationHistogram,
recreateResourceWhenSyncWorkStatus,
updateResourceWhenSyncWorkStatus,
policyPreemptionCounter,
cronFederatedHPADurationHistogram,
cronFederatedHPARuleDurationHistogram,
Expand Down

0 comments on commit 8545a71

Please sign in to comment.