From 995c4e070bd292d0d69c99296ed0f09bb0ec8306 Mon Sep 17 00:00:00 2001 From: Hongwei Chen Date: Fri, 10 Jan 2025 18:08:00 +0000 Subject: [PATCH 1/3] pap: power advisor emits essential power related metrics --- .../poweraware/advisor/action/action_test.go | 2 +- .../advisor/action/strategy/evict_first.go | 16 +++- .../action/strategy/evict_first_test.go | 3 + .../plugin/poweraware/advisor/advisor.go | 25 +++--- .../poweraware/advisor/advisor_metrics.go | 34 ++++++++ .../plugin/poweraware/advisor/advisor_test.go | 3 + .../plugin/poweraware/advisor/reconciler.go | 16 +--- .../poweraware/advisor/reconciler_metrics.go | 26 ++++++ .../plugin/poweraware/capper/instruction.go | 40 +++++++-- .../poweraware/capper/instruction_test.go | 24 ++++-- .../poweraware/capper/server/service.go | 35 +++----- .../capper/server/service_metrics.go | 34 ++++++++ .../poweraware/capper/server/service_test.go | 13 +-- .../poweraware/evictor/percentage_evictor.go | 10 ++- .../evictor/percentage_evictor_metrics.go | 29 +++++++ .../evictor/percentage_evictor_test.go | 2 + .../plugin/poweraware/metric/emit_metric.go | 85 +++++++++++++++++++ .../plugin/poweraware/metric/error_code.go | 29 +++++++ .../plugin/poweraware/metric/metric_tag.go | 35 ++++++++ .../sysadvisor/plugin/poweraware/spec/spec.go | 10 ++- 20 files changed, 397 insertions(+), 74 deletions(-) create mode 100644 pkg/agent/sysadvisor/plugin/poweraware/advisor/advisor_metrics.go create mode 100644 pkg/agent/sysadvisor/plugin/poweraware/advisor/reconciler_metrics.go create mode 100644 pkg/agent/sysadvisor/plugin/poweraware/capper/server/service_metrics.go create mode 100644 pkg/agent/sysadvisor/plugin/poweraware/evictor/percentage_evictor_metrics.go create mode 100644 pkg/agent/sysadvisor/plugin/poweraware/metric/emit_metric.go create mode 100644 pkg/agent/sysadvisor/plugin/poweraware/metric/error_code.go create mode 100644 pkg/agent/sysadvisor/plugin/poweraware/metric/metric_tag.go diff --git a/pkg/agent/sysadvisor/plugin/poweraware/advisor/action/action_test.go b/pkg/agent/sysadvisor/plugin/poweraware/advisor/action/action_test.go index 170e3d872..27cc2d871 100644 --- a/pkg/agent/sysadvisor/plugin/poweraware/advisor/action/action_test.go +++ b/pkg/agent/sysadvisor/plugin/poweraware/advisor/action/action_test.go @@ -39,7 +39,7 @@ func TestPowerAction_String(t *testing.T) { op: spec.InternalOpFreqCap, arg: 255, }, - want: "op: FreqCap, arg: 255", + want: "op: cap, arg: 255", }, } for _, tt := range tests { diff --git a/pkg/agent/sysadvisor/plugin/poweraware/advisor/action/strategy/evict_first.go b/pkg/agent/sysadvisor/plugin/poweraware/advisor/action/strategy/evict_first.go index de4afc1c4..8c266f625 100644 --- a/pkg/agent/sysadvisor/plugin/poweraware/advisor/action/strategy/evict_first.go +++ b/pkg/agent/sysadvisor/plugin/poweraware/advisor/action/strategy/evict_first.go @@ -25,11 +25,16 @@ import ( "github.com/kubewharf/katalyst-core/pkg/agent/sysadvisor/plugin/poweraware/spec" "github.com/kubewharf/katalyst-core/pkg/consts" metrictypes "github.com/kubewharf/katalyst-core/pkg/metaserver/agent/metric/types" + "github.com/kubewharf/katalyst-core/pkg/metrics" "github.com/kubewharf/katalyst-core/pkg/util/general" ) // threshold of cpu usage that allows voluntary dvfs -const voluntaryDVFSCPUUsageThreshold = 0.45 +const ( + voluntaryDVFSCPUUsageThreshold = 0.45 + + metricPowerAwareDVFSEffect = "node_power_accu_dvfs_effect" +) type EvictableProber interface { HasEvictablePods() bool @@ -42,6 +47,7 @@ type EvictableProber interface { // P0 - evict if applicable; otherwise conduct DVFS once if needed (DVFS is limited to 10%); // S0 - DVFS in urgency (no limit on DVFS) type evictFirstStrategy struct { + emitter metrics.MetricEmitter coefficient exponentialDecay evictableProber EvictableProber dvfsTracker dvfsTracker @@ -135,6 +141,7 @@ func (e *evictFirstStrategy) yieldActionPlan(op, internalOp spec.InternalOp, act func (e *evictFirstStrategy) RecommendAction(actualWatt int, desiredWatt int, alert spec.PowerAlert, internalOp spec.InternalOp, ttl time.Duration) action.PowerAction { e.dvfsTracker.update(actualWatt, desiredWatt) + e.emitDVFSAccumulatedEffect(e.dvfsTracker.dvfsAccumEffect) general.InfofV(6, "pap: dvfs effect: %d", e.dvfsTracker.dvfsAccumEffect) if actualWatt <= desiredWatt { @@ -152,9 +159,14 @@ func (e *evictFirstStrategy) RecommendAction(actualWatt int, desiredWatt int, al return actionPlan } -func NewEvictFirstStrategy(prober EvictableProber, metricsReader metrictypes.MetricsReader) PowerActionStrategy { +func (e *evictFirstStrategy) emitDVFSAccumulatedEffect(percentage int) { + _ = e.emitter.StoreInt64(metricPowerAwareDVFSEffect, int64(percentage), metrics.MetricTypeNameRaw) +} + +func NewEvictFirstStrategy(emitter metrics.MetricEmitter, prober EvictableProber, metricsReader metrictypes.MetricsReader) PowerActionStrategy { general.Infof("pap: using EvictFirst strategy") return &evictFirstStrategy{ + emitter: emitter, coefficient: exponentialDecay{b: defaultDecayB}, evictableProber: prober, dvfsTracker: dvfsTracker{dvfsAccumEffect: 0}, diff --git a/pkg/agent/sysadvisor/plugin/poweraware/advisor/action/strategy/evict_first_test.go b/pkg/agent/sysadvisor/plugin/poweraware/advisor/action/strategy/evict_first_test.go index de98ea8b1..65c2167ec 100644 --- a/pkg/agent/sysadvisor/plugin/poweraware/advisor/action/strategy/evict_first_test.go +++ b/pkg/agent/sysadvisor/plugin/poweraware/advisor/action/strategy/evict_first_test.go @@ -26,6 +26,7 @@ import ( "github.com/kubewharf/katalyst-core/pkg/agent/sysadvisor/plugin/poweraware/advisor/action" "github.com/kubewharf/katalyst-core/pkg/agent/sysadvisor/plugin/poweraware/spec" + "github.com/kubewharf/katalyst-core/pkg/metrics" ) type mockEvicableProber struct { @@ -174,9 +175,11 @@ func Test_evictFirstStrategy_RecommendAction(t *testing.T) { t.Run(tt.name, func(t *testing.T) { t.Parallel() e := &evictFirstStrategy{ + emitter: &metrics.DummyMetrics{}, coefficient: tt.fields.coefficient, evictableProber: tt.fields.evictableProber, dvfsTracker: dvfsTracker{dvfsAccumEffect: tt.fields.dvfsUsed}, + metricsReader: nil, } if got := e.RecommendAction(tt.args.actualWatt, tt.args.desiredWatt, tt.args.alert, tt.args.internalOp, tt.args.ttl); !reflect.DeepEqual(got, tt.want) { t.Errorf("RecommendAction() = %v, want %v", got, tt.want) diff --git a/pkg/agent/sysadvisor/plugin/poweraware/advisor/advisor.go b/pkg/agent/sysadvisor/plugin/poweraware/advisor/advisor.go index 615242c4d..6ef767e29 100644 --- a/pkg/agent/sysadvisor/plugin/poweraware/advisor/advisor.go +++ b/pkg/agent/sysadvisor/plugin/poweraware/advisor/advisor.go @@ -26,6 +26,7 @@ import ( "github.com/kubewharf/katalyst-core/pkg/agent/sysadvisor/plugin/poweraware/capper" "github.com/kubewharf/katalyst-core/pkg/agent/sysadvisor/plugin/poweraware/evictor" + powermetric "github.com/kubewharf/katalyst-core/pkg/agent/sysadvisor/plugin/poweraware/metric" "github.com/kubewharf/katalyst-core/pkg/agent/sysadvisor/plugin/poweraware/reader" "github.com/kubewharf/katalyst-core/pkg/agent/sysadvisor/plugin/poweraware/spec" "github.com/kubewharf/katalyst-core/pkg/config/generic" @@ -40,12 +41,6 @@ const ( // 9 seconds between actions since RAPL/HSMP capping needs 4-6 seconds to stabilize itself // and malachite realtime metric server imposes delay of up to 2 seconds intervalSpecFetch = time.Second * 9 - - metricPowerAwareCurrentPowerInWatt = "power_current_watt" - metricPowerAwareDesiredPowerInWatt = "power_desired_watt" - metricPowerAwareActionPlan = "power_action_plan" - metricTagNameActionPlanOp = "op" - metricTagNameActionPlanMode = "mode" ) // PowerAwareAdvisor is the interface that runs the whole power advisory process @@ -74,20 +69,25 @@ func (p *powerAwareAdvisor) Init() error { return errors.New("no power reader is provided") } if err := p.powerReader.Init(); err != nil { + p.emitErrorCode(powermetric.ErrorCodeInitFailure) return errors.Wrap(err, "failed to initialize power reader") } if p.podEvictor == nil { + p.emitErrorCode(powermetric.ErrorCodeInitFailure) return errors.New("no pod eviction server is provided") } if err := p.podEvictor.Init(); err != nil { + p.emitErrorCode(powermetric.ErrorCodeInitFailure) return errors.Wrap(err, "failed to initialize evict service") } if p.powerCapper == nil { + p.emitErrorCode(powermetric.ErrorCodeInitFailure) return errors.New("no power capping server is provided") } if err := p.powerCapper.Init(); err != nil { + p.emitErrorCode(powermetric.ErrorCodeInitFailure) return errors.Wrap(err, "failed to initialize power capping server") } @@ -97,10 +97,12 @@ func (p *powerAwareAdvisor) Init() error { func (p *powerAwareAdvisor) Run(ctx context.Context) { general.Infof("pap: advisor Run started") if err := p.podEvictor.Start(); err != nil { + p.emitErrorCode(powermetric.ErrorCodeStartFailure) general.Errorf("pap: failed to start pod evict service: %v", err) return } if err := p.powerCapper.Start(); err != nil { + p.emitErrorCode(powermetric.ErrorCodeStartFailure) general.Errorf("pap: failed to start power capping service: %v", err) return } @@ -126,6 +128,7 @@ func (p *powerAwareAdvisor) cleanup() { func (p *powerAwareAdvisor) run(ctx context.Context) { powerSpec, err := p.specFetcher.GetPowerSpec(ctx) if err != nil { + p.emitErrorCode(powermetric.ErrorCodePowerSpecFormat) klog.Errorf("pap: getting power spec failed: %#v", err) return } @@ -150,6 +153,7 @@ func (p *powerAwareAdvisor) run(ctx context.Context) { currentWatts, err := p.powerReader.Get(ctx) if err != nil { + p.emitErrorCode(powermetric.ErrorCodePowerGetCurrentUsage) klog.Errorf("pap: reading power failed: %#v", err) return } @@ -157,13 +161,13 @@ func (p *powerAwareAdvisor) run(ctx context.Context) { klog.V(6).Infof("pap: current power usage: %d watts", currentWatts) // report metrics: current power reading, desired power value - _ = p.emitter.StoreInt64(metricPowerAwareCurrentPowerInWatt, int64(currentWatts), metrics.MetricTypeNameRaw) - _ = p.emitter.StoreInt64(metricPowerAwareDesiredPowerInWatt, int64(powerSpec.Budget), metrics.MetricTypeNameRaw) + p.emitCurrentPowerUSage(currentWatts) + p.emitPowerSpec(powerSpec) freqCapped, err := p.reconciler.Reconcile(ctx, powerSpec, currentWatts) if err != nil { + p.emitErrorCode(powermetric.ErrorCodeRecoverable) general.Errorf("pap: reconcile error: %v", err) - // todo: report to metric dashboard return } @@ -183,13 +187,14 @@ func NewAdvisor(dryRun bool, capper capper.PowerCapper, metricsReader metrictypes.MetricsReader, ) PowerAwareAdvisor { + percentageEvictor := evictor.NewPowerLoadEvict(qosConfig, emitter, podFetcher, podEvictor) return &powerAwareAdvisor{ emitter: emitter, specFetcher: spec.NewFetcher(nodeFetcher, annotationKeyPrefix), powerReader: reader, podEvictor: podEvictor, powerCapper: capper, - reconciler: newReconciler(dryRun, metricsReader, emitter, evictor.NewPowerLoadEvict(qosConfig, podFetcher, podEvictor), capper), + reconciler: newReconciler(dryRun, metricsReader, emitter, percentageEvictor, capper), inFreqCap: false, } } diff --git a/pkg/agent/sysadvisor/plugin/poweraware/advisor/advisor_metrics.go b/pkg/agent/sysadvisor/plugin/poweraware/advisor/advisor_metrics.go new file mode 100644 index 000000000..c0abb1951 --- /dev/null +++ b/pkg/agent/sysadvisor/plugin/poweraware/advisor/advisor_metrics.go @@ -0,0 +1,34 @@ +/* +Copyright 2022 The Katalyst Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package advisor + +import ( + powermetric "github.com/kubewharf/katalyst-core/pkg/agent/sysadvisor/plugin/poweraware/metric" + "github.com/kubewharf/katalyst-core/pkg/agent/sysadvisor/plugin/poweraware/spec" +) + +func (p *powerAwareAdvisor) emitCurrentPowerUSage(currentWatts int) { + powermetric.EmitCurrentPowerUSage(p.emitter, currentWatts) +} + +func (p *powerAwareAdvisor) emitPowerSpec(powerSpec *spec.PowerSpec) { + powermetric.EmitPowerSpec(p.emitter, powerSpec) +} + +func (p *powerAwareAdvisor) emitErrorCode(errorCode int) { + powermetric.EmitErrorCode(p.emitter, errorCode) +} diff --git a/pkg/agent/sysadvisor/plugin/poweraware/advisor/advisor_test.go b/pkg/agent/sysadvisor/plugin/poweraware/advisor/advisor_test.go index f10e583f9..f1353f900 100644 --- a/pkg/agent/sysadvisor/plugin/poweraware/advisor/advisor_test.go +++ b/pkg/agent/sysadvisor/plugin/poweraware/advisor/advisor_test.go @@ -26,6 +26,7 @@ import ( "github.com/kubewharf/katalyst-core/pkg/agent/sysadvisor/plugin/poweraware/evictor" "github.com/kubewharf/katalyst-core/pkg/agent/sysadvisor/plugin/poweraware/reader" "github.com/kubewharf/katalyst-core/pkg/agent/sysadvisor/plugin/poweraware/spec" + "github.com/kubewharf/katalyst-core/pkg/metrics" metricspool "github.com/kubewharf/katalyst-core/pkg/metrics/metrics-pool" ) @@ -168,6 +169,7 @@ func Test_powerAwareAdvisor_run_abort_on_spec_fetcher_error(t *testing.T) { depPowerReader := &dummyPowerReader{} advisor := powerAwareAdvisor{ + emitter: &metrics.DummyMetrics{}, specFetcher: &depSpecFetcher, powerReader: depPowerReader, } @@ -250,6 +252,7 @@ func Test_powerAwareAdvisor_Run_does_Init_Cleanup(t *testing.T) { depPodEvictor := &dummyPodEvictor{} advisor := powerAwareAdvisor{ + emitter: &metrics.DummyMetrics{}, powerReader: depPowerReader, podEvictor: depPodEvictor, } diff --git a/pkg/agent/sysadvisor/plugin/poweraware/advisor/reconciler.go b/pkg/agent/sysadvisor/plugin/poweraware/advisor/reconciler.go index cc7152af8..91f67d9a5 100644 --- a/pkg/agent/sysadvisor/plugin/poweraware/advisor/reconciler.go +++ b/pkg/agent/sysadvisor/plugin/poweraware/advisor/reconciler.go @@ -51,16 +51,6 @@ func (p *powerReconciler) OnDVFSReset() { p.strategy.OnDVFSReset() } -func (p *powerReconciler) emitOpCode(action action.PowerAction, mode string) { - // report metrics of action op code with tag of dryRun - op := action.Op.String() - _ = p.emitter.StoreInt64(metricPowerAwareActionPlan, 1, metrics.MetricTypeNameCount, - metrics.ConvertMapToTags(map[string]string{ - metricTagNameActionPlanOp: op, - metricTagNameActionPlanMode: mode, - })...) -} - func (p *powerReconciler) Reconcile(ctx context.Context, desired *spec.PowerSpec, actual int) (bool, error) { alertTimeLimit, err := spec.GetPowerAlertResponseTimeLimit(desired.Alert) if err != nil { @@ -78,13 +68,13 @@ func (p *powerReconciler) Reconcile(ctx context.Context, desired *spec.PowerSpec return false, nil } general.Infof("pap: dryRun: %s", actionPlan) - p.emitOpCode(actionPlan, "dryRun") + p.emitPowerAdvice(actionPlan, "dryRun") p.priorAction = actionPlan return false, nil } general.InfofV(6, "pap: reconcile action %#v", actionPlan) - p.emitOpCode(actionPlan, "real") + p.emitPowerAdvice(actionPlan, "real") switch actionPlan.Op { case spec.InternalOpFreqCap: @@ -107,7 +97,7 @@ func newReconciler(dryRun bool, metricsReader metrictypes.MetricsReader, emitter priorAction: action.PowerAction{}, evictor: evictor, capper: capper, - strategy: strategy.NewEvictFirstStrategy(evictor, metricsReader), + strategy: strategy.NewEvictFirstStrategy(emitter, evictor, metricsReader), emitter: emitter, } } diff --git a/pkg/agent/sysadvisor/plugin/poweraware/advisor/reconciler_metrics.go b/pkg/agent/sysadvisor/plugin/poweraware/advisor/reconciler_metrics.go new file mode 100644 index 000000000..5e4813f76 --- /dev/null +++ b/pkg/agent/sysadvisor/plugin/poweraware/advisor/reconciler_metrics.go @@ -0,0 +1,26 @@ +/* +Copyright 2022 The Katalyst Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package advisor + +import ( + "github.com/kubewharf/katalyst-core/pkg/agent/sysadvisor/plugin/poweraware/advisor/action" + powermetric "github.com/kubewharf/katalyst-core/pkg/agent/sysadvisor/plugin/poweraware/metric" +) + +func (p *powerReconciler) emitPowerAdvice(action action.PowerAction, mode string) { + powermetric.EmitPowerAdvice(p.emitter, action, mode) +} diff --git a/pkg/agent/sysadvisor/plugin/poweraware/capper/instruction.go b/pkg/agent/sysadvisor/plugin/poweraware/capper/instruction.go index 1de11c7bf..a71a699be 100644 --- a/pkg/agent/sysadvisor/plugin/poweraware/capper/instruction.go +++ b/pkg/agent/sysadvisor/plugin/poweraware/capper/instruction.go @@ -42,9 +42,14 @@ var PowerCapReset = &CapInstruction{ } type CapInstruction struct { - OpCode PowerCapOpCode + OpCode PowerCapOpCode + + // keep string forms to facilitate wire encoding which prefers string(text) format OpCurrentValue string OpTargetValue string + + RawTargetValue int + RawCurrentValue int } func (c CapInstruction) ToListAndWatchResponse() *advisorsvc.ListAndWatchResponse { @@ -104,10 +109,29 @@ func getCappingInstructionFromCalcInfo(info *advisorsvc.CalculationInfo) (*CapIn opCurrValue := values[keyOpCurrentValue] opTargetValue := values[keyOpTargetValue] + var err error + currValue := 0 + if len(opCurrValue) > 0 { + currValue, err = strconv.Atoi(opCurrValue) + if err != nil { + return nil, errors.New("current value format error") + } + } + + targetValue := 0 + if len(opTargetValue) > 0 { + targetValue, err = strconv.Atoi(opTargetValue) + if err != nil { + return nil, errors.New("target value format error") + } + } + return &CapInstruction{ - OpCode: PowerCapOpCode(opCode), - OpCurrentValue: opCurrValue, - OpTargetValue: opTargetValue, + OpCode: PowerCapOpCode(opCode), + OpCurrentValue: opCurrValue, + OpTargetValue: opTargetValue, + RawCurrentValue: currValue, + RawTargetValue: targetValue, }, nil } @@ -136,8 +160,10 @@ func NewCapInstruction(targetWatts, currWatt int) (*CapInstruction, error) { } return &CapInstruction{ - OpCode: OpCap, - OpCurrentValue: fmt.Sprintf("%d", currWatt), - OpTargetValue: fmt.Sprintf("%d", targetWatts), + OpCode: OpCap, + OpCurrentValue: fmt.Sprintf("%d", currWatt), + OpTargetValue: fmt.Sprintf("%d", targetWatts), + RawTargetValue: targetWatts, + RawCurrentValue: currWatt, }, nil } diff --git a/pkg/agent/sysadvisor/plugin/poweraware/capper/instruction_test.go b/pkg/agent/sysadvisor/plugin/poweraware/capper/instruction_test.go index c85fa5b1f..1cdc9f1f5 100644 --- a/pkg/agent/sysadvisor/plugin/poweraware/capper/instruction_test.go +++ b/pkg/agent/sysadvisor/plugin/poweraware/capper/instruction_test.go @@ -102,9 +102,11 @@ func Test_getCappingInstruction(t *testing.T) { }, }, want: &CapInstruction{ - OpCode: "4", - OpCurrentValue: "100", - OpTargetValue: "80", + OpCode: "4", + OpCurrentValue: "100", + OpTargetValue: "80", + RawTargetValue: 80, + RawCurrentValue: 100, }, wantErr: assert.NoError, }, @@ -172,9 +174,11 @@ func TestFromListAndWatchResponse(t *testing.T) { }, want: []*CapInstruction{ { - OpCode: "4", - OpCurrentValue: "555", - OpTargetValue: "500", + OpCode: "4", + OpCurrentValue: "555", + OpTargetValue: "500", + RawCurrentValue: 555, + RawTargetValue: 500, }, { OpCode: "-1", @@ -216,9 +220,11 @@ func Test_capToMessage(t *testing.T) { currWatt: 567, }, want: &CapInstruction{ - OpCode: "4", - OpCurrentValue: "567", - OpTargetValue: "530", + OpCode: "4", + OpCurrentValue: "567", + OpTargetValue: "530", + RawTargetValue: 530, + RawCurrentValue: 567, }, wantErr: assert.NoError, }, diff --git a/pkg/agent/sysadvisor/plugin/poweraware/capper/server/service.go b/pkg/agent/sysadvisor/plugin/poweraware/capper/server/service.go index b60db01f2..2dbf84ad2 100644 --- a/pkg/agent/sysadvisor/plugin/poweraware/capper/server/service.go +++ b/pkg/agent/sysadvisor/plugin/poweraware/capper/server/service.go @@ -30,6 +30,7 @@ import ( "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/advisorsvc" "github.com/kubewharf/katalyst-core/pkg/agent/sysadvisor/plugin/poweraware/capper" + powermetric "github.com/kubewharf/katalyst-core/pkg/agent/sysadvisor/plugin/poweraware/metric" "github.com/kubewharf/katalyst-core/pkg/config" "github.com/kubewharf/katalyst-core/pkg/metrics" "github.com/kubewharf/katalyst-core/pkg/util/general" @@ -38,11 +39,6 @@ import ( const ( // ServiceNamePowerCap also is the unix socket name of the server is listening on ServiceNamePowerCap = "node_power_cap" - - metricPowerCappingTargetName = "power_capping_target" - metricPowerCappingResetName = "power_capping_reset" - metricPowerCappingNoActorName = "power_capping_no_actor" - metricPowerCappingLWSendResponseFailed = "power_capping_lw_send_response_failed" ) type powerCapService struct { @@ -133,7 +129,7 @@ stream: err := server.Send(resp) if err != nil { general.Errorf("pap: [power capping] send response failed: %v", err) - _ = p.emitter.StoreInt64(metricPowerCappingLWSendResponseFailed, 1, metrics.MetricTypeNameCount) + p.emitErrorCode(powermetric.ErrorCodePowerCapLWSendResponseFailed) break stream } } @@ -143,10 +139,9 @@ stream: } func (p *powerCapService) Reset() { - p.emitRawMetric(metricPowerCappingResetName, 1) if p.notify.IsEmpty() { klog.Warningf("pap: no power capping plugin connected; Reset op is lost") - p.emitRawMetric(metricPowerCappingNoActorName, 1) + p.emitErrorCode(powermetric.ErrorCodePowerCapperUnavailable) } p.Lock() @@ -154,9 +149,11 @@ func (p *powerCapService) Reset() { if !p.started { general.Warningf("pap: power capping service is unavailable") + p.emitErrorCode(powermetric.ErrorCodePowerCapperUnavailable) return } + p.emitPowerCapReset() p.requestReset() } @@ -165,25 +162,17 @@ func (p *powerCapService) requestReset() { p.notify.Notify() } -func (p *powerCapService) emitRawMetric(name string, value int) { - if p.emitter == nil { - return - } - - _ = p.emitter.StoreInt64(name, int64(value), metrics.MetricTypeNameRaw) -} - func (p *powerCapService) Cap(ctx context.Context, targetWatts, currWatt int) { capInst, err := capper.NewCapInstruction(targetWatts, currWatt) if err != nil { klog.Warningf("invalid cap request: %v", err) + p.emitErrorCode(powermetric.ErrorCodeOther) return } - p.emitRawMetric(metricPowerCappingTargetName, targetWatts) if p.notify.IsEmpty() { klog.Warningf("pap: no power capping plugin connected; Cap op from %d to %d watt is lost", currWatt, targetWatts) - p.emitRawMetric(metricPowerCappingNoActorName, 1) + p.emitErrorCode(powermetric.ErrorCodePowerCapperUnavailable) } p.Lock() @@ -191,22 +180,24 @@ func (p *powerCapService) Cap(ctx context.Context, targetWatts, currWatt int) { if !p.started { general.Warningf("pap: power capping service is unavailable") + p.emitErrorCode(powermetric.ErrorCodePowerCapperUnavailable) return } + p.emitPowerCapInstruction(capInst) p.capInstruction = capInst p.notify.Notify() } -func newPowerCapService() *powerCapService { +func newPowerCapService(emitter metrics.MetricEmitter) *powerCapService { return &powerCapService{ - notify: newNotifier(), + notify: newNotifier(), + emitter: emitter, } } func newPowerCapServiceSuite(conf *config.Configuration, emitter metrics.MetricEmitter) (*powerCapService, *grpcServer, error) { - powerCapSvc := newPowerCapService() - powerCapSvc.emitter = emitter + powerCapSvc := newPowerCapService(emitter) socketPath := conf.PowerCappingAdvisorSocketAbsPath if err := os.Remove(socketPath); err != nil && !os.IsNotExist(err) { diff --git a/pkg/agent/sysadvisor/plugin/poweraware/capper/server/service_metrics.go b/pkg/agent/sysadvisor/plugin/poweraware/capper/server/service_metrics.go new file mode 100644 index 000000000..141f56e81 --- /dev/null +++ b/pkg/agent/sysadvisor/plugin/poweraware/capper/server/service_metrics.go @@ -0,0 +1,34 @@ +/* +Copyright 2022 The Katalyst Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package server + +import ( + "github.com/kubewharf/katalyst-core/pkg/agent/sysadvisor/plugin/poweraware/capper" + powermetric "github.com/kubewharf/katalyst-core/pkg/agent/sysadvisor/plugin/poweraware/metric" +) + +func (p *powerCapService) emitPowerCapInstruction(instruction *capper.CapInstruction) { + powermetric.EmitPowerCapInstruction(p.emitter, instruction) +} + +func (p *powerCapService) emitPowerCapReset() { + powermetric.EmitPowerCapReset(p.emitter) +} + +func (p *powerCapService) emitErrorCode(errorCode int) { + powermetric.EmitErrorCode(p.emitter, errorCode) +} diff --git a/pkg/agent/sysadvisor/plugin/poweraware/capper/server/service_test.go b/pkg/agent/sysadvisor/plugin/poweraware/capper/server/service_test.go index e6cb256d0..481ff9dfd 100644 --- a/pkg/agent/sysadvisor/plugin/poweraware/capper/server/service_test.go +++ b/pkg/agent/sysadvisor/plugin/poweraware/capper/server/service_test.go @@ -23,12 +23,13 @@ import ( "github.com/stretchr/testify/assert" "github.com/kubewharf/katalyst-core/pkg/agent/sysadvisor/plugin/poweraware/capper" + "github.com/kubewharf/katalyst-core/pkg/metrics" ) func Test_powerCapAdvisorPluginServer_Reset(t *testing.T) { t.Parallel() - pcs := newPowerCapService() + pcs := newPowerCapService(&metrics.DummyMetrics{}) pcs.started = true pcs.Reset() @@ -42,15 +43,17 @@ func Test_powerCapAdvisorPluginServer_Reset(t *testing.T) { func Test_powerCapAdvisorPluginServer_Cap(t *testing.T) { t.Parallel() - pcs := newPowerCapService() + pcs := newPowerCapService(&metrics.DummyMetrics{}) pcs.started = true pcs.Cap(context.TODO(), 111, 123) assert.Equal(t, &capper.CapInstruction{ - OpCode: "4", - OpCurrentValue: "123", - OpTargetValue: "111", + OpCode: "4", + OpCurrentValue: "123", + OpTargetValue: "111", + RawCurrentValue: 123, + RawTargetValue: 111, }, pcs.capInstruction, "the latest power capping instruction should be what was just to Cap", diff --git a/pkg/agent/sysadvisor/plugin/poweraware/evictor/percentage_evictor.go b/pkg/agent/sysadvisor/plugin/poweraware/evictor/percentage_evictor.go index efe988adc..ac58f1836 100644 --- a/pkg/agent/sysadvisor/plugin/poweraware/evictor/percentage_evictor.go +++ b/pkg/agent/sysadvisor/plugin/poweraware/evictor/percentage_evictor.go @@ -18,11 +18,12 @@ package evictor import ( "context" - v1 "k8s.io/api/core/v1" + powermetric "github.com/kubewharf/katalyst-core/pkg/agent/sysadvisor/plugin/poweraware/metric" "github.com/kubewharf/katalyst-core/pkg/config/generic" "github.com/kubewharf/katalyst-core/pkg/metaserver/agent/pod" + "github.com/kubewharf/katalyst-core/pkg/metrics" "github.com/kubewharf/katalyst-core/pkg/util/general" ) @@ -33,6 +34,7 @@ type PercentageEvictor interface { } type loadEvictor struct { + emitter metrics.MetricEmitter qosConfig *generic.QoSConfiguration podFetcher pod.PodFetcher podEvictor PodEvictor @@ -69,6 +71,7 @@ func (l *loadEvictor) Evict(ctx context.Context, targetPercent int) { pods, err := l.podFetcher.GetPodList(ctx, nil) if err != nil { general.Errorf("pap: evict: failed to get pods: %v", err) + l.emitErrorCode(powermetric.ErrorCodePowerEvictFailure) return } countToEvict := getNumToEvict(len(pods), targetPercent) @@ -80,9 +83,11 @@ func (l *loadEvictor) Evict(ctx context.Context, targetPercent int) { evictablePods, err := l.podFetcher.GetPodList(ctx, l.isEvictablePod) if err != nil { general.Errorf("pap: evict: failed to get BE pods: %v", err) + l.emitErrorCode(powermetric.ErrorCodePowerEvictFailure) return } + l.emitEvictReq(countToEvict) general.InfofV(6, "pap: evict: %d pods, %d BE; going to evict BE up to %d%%%% pods = %d", len(pods), len(evictablePods), targetPercent, countToEvict) @@ -91,15 +96,18 @@ func (l *loadEvictor) Evict(ctx context.Context, targetPercent int) { if err := l.podEvictor.Evict(ctx, getN(evictablePods, countToEvict)); err != nil { // power alert eviction is the best effort by design; ok to log the error here general.Warningf("pap: failed to request eviction of pods: %v", err) + l.emitErrorCode(powermetric.ErrorCodePowerEvictFailure) } } func NewPowerLoadEvict(qosConfig *generic.QoSConfiguration, + emitter metrics.MetricEmitter, podFetcher pod.PodFetcher, podEvictor PodEvictor, ) PercentageEvictor { return &loadEvictor{ qosConfig: qosConfig, + emitter: emitter, podFetcher: podFetcher, podEvictor: podEvictor, } diff --git a/pkg/agent/sysadvisor/plugin/poweraware/evictor/percentage_evictor_metrics.go b/pkg/agent/sysadvisor/plugin/poweraware/evictor/percentage_evictor_metrics.go new file mode 100644 index 000000000..c53208f0f --- /dev/null +++ b/pkg/agent/sysadvisor/plugin/poweraware/evictor/percentage_evictor_metrics.go @@ -0,0 +1,29 @@ +/* +Copyright 2022 The Katalyst Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package evictor + +import ( + powermetric "github.com/kubewharf/katalyst-core/pkg/agent/sysadvisor/plugin/poweraware/metric" +) + +func (l *loadEvictor) emitEvictReq(pods int) { + powermetric.EmitEvictReq(l.emitter, pods) +} + +func (l *loadEvictor) emitErrorCode(errorCode int) { + powermetric.EmitErrorCode(l.emitter, errorCode) +} diff --git a/pkg/agent/sysadvisor/plugin/poweraware/evictor/percentage_evictor_test.go b/pkg/agent/sysadvisor/plugin/poweraware/evictor/percentage_evictor_test.go index ca74acece..b734df4dc 100644 --- a/pkg/agent/sysadvisor/plugin/poweraware/evictor/percentage_evictor_test.go +++ b/pkg/agent/sysadvisor/plugin/poweraware/evictor/percentage_evictor_test.go @@ -26,6 +26,7 @@ import ( "github.com/kubewharf/katalyst-api/pkg/consts" "github.com/kubewharf/katalyst-core/pkg/config/generic" "github.com/kubewharf/katalyst-core/pkg/metaserver/agent/pod" + "github.com/kubewharf/katalyst-core/pkg/metrics" ) func Test_loadEvictor_isEvictablePod(t *testing.T) { @@ -146,6 +147,7 @@ func Test_loadEvictor_Evict(t *testing.T) { podEvictor := &noopPodEvictor{} l := loadEvictor{ qosConfig: generic.NewQoSConfiguration(), + emitter: &metrics.DummyMetrics{}, podFetcher: &mockPodFetcher{}, podEvictor: podEvictor, } diff --git a/pkg/agent/sysadvisor/plugin/poweraware/metric/emit_metric.go b/pkg/agent/sysadvisor/plugin/poweraware/metric/emit_metric.go new file mode 100644 index 000000000..af1d5f9b6 --- /dev/null +++ b/pkg/agent/sysadvisor/plugin/poweraware/metric/emit_metric.go @@ -0,0 +1,85 @@ +/* +Copyright 2022 The Katalyst Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package metric + +import ( + "strconv" + + "github.com/kubewharf/katalyst-core/pkg/agent/sysadvisor/plugin/poweraware/advisor/action" + "github.com/kubewharf/katalyst-core/pkg/agent/sysadvisor/plugin/poweraware/capper" + "github.com/kubewharf/katalyst-core/pkg/agent/sysadvisor/plugin/poweraware/spec" + "github.com/kubewharf/katalyst-core/pkg/metrics" +) + +func EmitCurrentPowerUSage(emitter metrics.MetricEmitter, currentWatts int) { + _ = emitter.StoreInt64(metricPowerAwareCurrentPowerInWatt, int64(currentWatts), metrics.MetricTypeNameRaw) +} + +func EmitPowerSpec(emitter metrics.MetricEmitter, powerSpec *spec.PowerSpec) { + _ = emitter.StoreInt64(metricPowerSpecBudget, + int64(powerSpec.Budget), + metrics.MetricTypeNameRaw, + metrics.ConvertMapToTags(map[string]string{ + tagPowerAlert: string(powerSpec.Alert), + tagPowerInternalOp: powerSpec.InternalOp.String(), + })..., + ) +} + +func EmitPowerAdvice(emitter metrics.MetricEmitter, action action.PowerAction, mode string) { + _ = emitter.StoreInt64(metricPowerAwareDesiredPowerInWatt, + int64(action.Arg), + metrics.MetricTypeNameRaw, + metrics.ConvertMapToTags(map[string]string{ + tagPowerActionOp: action.Op.String(), + tagPowerActionMode: mode, + })..., + ) +} + +func EmitEvictReq(emitter metrics.MetricEmitter, pods int) { + _ = emitter.StoreInt64(metricPowerEvictReq, int64(pods), metrics.MetricTypeNameCount) +} + +func EmitPowerCapInstruction(emitter metrics.MetricEmitter, instruction *capper.CapInstruction) { + _ = emitter.StoreInt64(metricPowerCappingTarget, + int64(instruction.RawTargetValue), + metrics.MetricTypeNameRaw, + metrics.ConvertMapToTags(map[string]string{ + tagPowerCappingOpCode: string(instruction.OpCode), + })..., + ) + _ = emitter.StoreInt64(metricPowerCappingCurrent, + int64(instruction.RawCurrentValue), + metrics.MetricTypeNameRaw, + metrics.ConvertMapToTags(map[string]string{ + tagPowerCappingOpCode: string(instruction.OpCode), + })..., + ) +} + +func EmitPowerCapReset(emitter metrics.MetricEmitter) { + _ = emitter.StoreInt64(metricPowerCappingReset, 1, metrics.MetricTypeNameCount) +} + +func EmitErrorCode(emitter metrics.MetricEmitter, errorCode int) { + _ = emitter.StoreInt64(metricPowerError, 1, metrics.MetricTypeNameRaw, + metrics.ConvertMapToTags(map[string]string{ + tagErrorCode: strconv.Itoa(errorCode), + })..., + ) +} diff --git a/pkg/agent/sysadvisor/plugin/poweraware/metric/error_code.go b/pkg/agent/sysadvisor/plugin/poweraware/metric/error_code.go new file mode 100644 index 000000000..a51309c9e --- /dev/null +++ b/pkg/agent/sysadvisor/plugin/poweraware/metric/error_code.go @@ -0,0 +1,29 @@ +/* +Copyright 2022 The Katalyst Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package metric + +const ( + ErrorCodeInitFailure = 9505001 + ErrorCodeStartFailure = 9505002 + ErrorCodePowerSpecFormat = 9505003 + ErrorCodePowerGetCurrentUsage = 9505004 + ErrorCodePowerCapperUnavailable = 9505005 + ErrorCodePowerCapLWSendResponseFailed = 9505006 + ErrorCodePowerEvictFailure = 9505007 + ErrorCodeRecoverable = 9505008 + ErrorCodeOther = 9805009 +) diff --git a/pkg/agent/sysadvisor/plugin/poweraware/metric/metric_tag.go b/pkg/agent/sysadvisor/plugin/poweraware/metric/metric_tag.go new file mode 100644 index 000000000..6c57125e1 --- /dev/null +++ b/pkg/agent/sysadvisor/plugin/poweraware/metric/metric_tag.go @@ -0,0 +1,35 @@ +/* +Copyright 2022 The Katalyst Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package metric + +const ( + metricPowerAwareCurrentPowerInWatt = "node_power_usage" + metricPowerSpecBudget = "node_power_budget" + metricPowerAwareDesiredPowerInWatt = "node_power_advice" + metricPowerEvictReq = "node_power_evict_req" + metricPowerCappingReset = "node_power_cap_reset" + metricPowerCappingTarget = "node_power_cap_target" + metricPowerCappingCurrent = "node_power_cap_current" + metricPowerError = "node_power_error" + + tagPowerAlert = "alert" + tagPowerInternalOp = "internal_op" + tagPowerActionOp = "op" + tagPowerActionMode = "mode" + tagPowerCappingOpCode = "op" + tagErrorCode = "code" +) diff --git a/pkg/agent/sysadvisor/plugin/poweraware/spec/spec.go b/pkg/agent/sysadvisor/plugin/poweraware/spec/spec.go index ec2e08ac0..495085d0d 100644 --- a/pkg/agent/sysadvisor/plugin/poweraware/spec/spec.go +++ b/pkg/agent/sysadvisor/plugin/poweraware/spec/spec.go @@ -75,14 +75,16 @@ func GetPowerAlertResponseTimeLimit(alert PowerAlert) (time.Duration, error) { func (o InternalOp) String() string { switch o { + case InternalOpAuto: + return "" case InternalOpThrottle: - return "Throttle" + return "throttle" case InternalOpEvict: - return "Evict" + return "evict" case InternalOpFreqCap: - return "FreqCap" + return "cap" case InternalOpNoop: - return "Noop" + return "noop" default: return fmt.Sprintf("%d", int(o)) } From 5e67ce35f054625d412e76b9ff85f1233bf1e479 Mon Sep 17 00:00:00 2001 From: Hongwei Chen Date: Fri, 10 Jan 2025 19:59:07 +0000 Subject: [PATCH 2/3] pap: not accumulate dvfs effect while no capper is available --- .../advisor/action/strategy/dvfs_tracker.go | 8 +++++++- .../action/strategy/dvfs_tracker_test.go | 18 ++++++++++++++++++ .../advisor/action/strategy/evict_first.go | 16 +++++++++++++--- .../plugin/poweraware/advisor/reconciler.go | 2 +- .../plugin/poweraware/capper/capper.go | 4 ++++ .../plugin/poweraware/capper/server/service.go | 4 ++++ .../poweraware/evictor/percentage_evictor.go | 1 + 7 files changed, 48 insertions(+), 5 deletions(-) diff --git a/pkg/agent/sysadvisor/plugin/poweraware/advisor/action/strategy/dvfs_tracker.go b/pkg/agent/sysadvisor/plugin/poweraware/advisor/action/strategy/dvfs_tracker.go index 7c14689a3..871092744 100644 --- a/pkg/agent/sysadvisor/plugin/poweraware/advisor/action/strategy/dvfs_tracker.go +++ b/pkg/agent/sysadvisor/plugin/poweraware/advisor/action/strategy/dvfs_tracker.go @@ -25,6 +25,8 @@ type dvfsTracker struct { dvfsAccumEffect int inDVFS bool prevPower int + + capperProber CapperProber } func (d *dvfsTracker) getDVFSAllowPercent() int { @@ -35,9 +37,13 @@ func (d *dvfsTracker) getDVFSAllowPercent() int { return leftPercentage } +func (d *dvfsTracker) isCapperAvailable() bool { + return d.capperProber != nil && d.capperProber.IsCapperReady() +} + func (d *dvfsTracker) update(actualWatt, desiredWatt int) { // only accumulate when dvfs is engaged - if d.prevPower >= 0 && d.inDVFS { + if d.prevPower >= 0 && d.inDVFS && d.isCapperAvailable() { // if actual power is more than previous, likely previous round dvfs took no effect; not to take into account if actualWatt < d.prevPower { dvfsEffect := (d.prevPower - actualWatt) * 100 / d.prevPower diff --git a/pkg/agent/sysadvisor/plugin/poweraware/advisor/action/strategy/dvfs_tracker_test.go b/pkg/agent/sysadvisor/plugin/poweraware/advisor/action/strategy/dvfs_tracker_test.go index 71ff9886f..fdf4abe9f 100644 --- a/pkg/agent/sysadvisor/plugin/poweraware/advisor/action/strategy/dvfs_tracker_test.go +++ b/pkg/agent/sysadvisor/plugin/poweraware/advisor/action/strategy/dvfs_tracker_test.go @@ -20,10 +20,24 @@ import ( "testing" "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/mock" ) +type mockCapperProber struct { + mock.Mock +} + +func (m *mockCapperProber) IsCapperReady() bool { + args := m.Called() + return args.Bool(0) +} + func Test_dvfsTracker_update(t *testing.T) { t.Parallel() + + mockProber := new(mockCapperProber) + mockProber.On("IsCapperReady").Return(true) + type fields struct { dvfsUsed int indvfs bool @@ -51,6 +65,7 @@ func Test_dvfsTracker_update(t *testing.T) { desiredWatt: 85, }, wantDVFSTracker: dvfsTracker{ + capperProber: mockProber, dvfsAccumEffect: 3, inDVFS: false, prevPower: 90, @@ -68,6 +83,7 @@ func Test_dvfsTracker_update(t *testing.T) { desiredWatt: 85, }, wantDVFSTracker: dvfsTracker{ + capperProber: mockProber, dvfsAccumEffect: 13, inDVFS: true, prevPower: 90, @@ -85,6 +101,7 @@ func Test_dvfsTracker_update(t *testing.T) { desiredWatt: 85, }, wantDVFSTracker: dvfsTracker{ + capperProber: mockProber, dvfsAccumEffect: 3, inDVFS: true, prevPower: 101, @@ -96,6 +113,7 @@ func Test_dvfsTracker_update(t *testing.T) { t.Run(tt.name, func(t *testing.T) { t.Parallel() d := &dvfsTracker{ + capperProber: mockProber, dvfsAccumEffect: tt.fields.dvfsUsed, inDVFS: tt.fields.indvfs, prevPower: tt.fields.prevPower, diff --git a/pkg/agent/sysadvisor/plugin/poweraware/advisor/action/strategy/evict_first.go b/pkg/agent/sysadvisor/plugin/poweraware/advisor/action/strategy/evict_first.go index 8c266f625..acb714ce9 100644 --- a/pkg/agent/sysadvisor/plugin/poweraware/advisor/action/strategy/evict_first.go +++ b/pkg/agent/sysadvisor/plugin/poweraware/advisor/action/strategy/evict_first.go @@ -22,6 +22,7 @@ import ( "github.com/pkg/errors" "github.com/kubewharf/katalyst-core/pkg/agent/sysadvisor/plugin/poweraware/advisor/action" + "github.com/kubewharf/katalyst-core/pkg/agent/sysadvisor/plugin/poweraware/capper" "github.com/kubewharf/katalyst-core/pkg/agent/sysadvisor/plugin/poweraware/spec" "github.com/kubewharf/katalyst-core/pkg/consts" metrictypes "github.com/kubewharf/katalyst-core/pkg/metaserver/agent/metric/types" @@ -40,6 +41,11 @@ type EvictableProber interface { HasEvictablePods() bool } +// CapperProber is only applicable to advisor; capper actor(client) won't be required to implement +type CapperProber interface { + IsCapperReady() bool +} + // evictFirstStrategy always attempts to evict low priority pods if any; only after all are exhausted will it resort to DVFS means. // besides, it will continue to try the best to meet the alert spec, regardless of the alert update time. // alert level has the following meanings in this strategy: @@ -163,13 +169,17 @@ func (e *evictFirstStrategy) emitDVFSAccumulatedEffect(percentage int) { _ = e.emitter.StoreInt64(metricPowerAwareDVFSEffect, int64(percentage), metrics.MetricTypeNameRaw) } -func NewEvictFirstStrategy(emitter metrics.MetricEmitter, prober EvictableProber, metricsReader metrictypes.MetricsReader) PowerActionStrategy { +func NewEvictFirstStrategy(emitter metrics.MetricEmitter, prober EvictableProber, metricsReader metrictypes.MetricsReader, capper capper.PowerCapper) PowerActionStrategy { general.Infof("pap: using EvictFirst strategy") + capperProber, _ := capper.(CapperProber) return &evictFirstStrategy{ emitter: emitter, coefficient: exponentialDecay{b: defaultDecayB}, evictableProber: prober, - dvfsTracker: dvfsTracker{dvfsAccumEffect: 0}, - metricsReader: metricsReader, + dvfsTracker: dvfsTracker{ + dvfsAccumEffect: 0, + capperProber: capperProber, + }, + metricsReader: metricsReader, } } diff --git a/pkg/agent/sysadvisor/plugin/poweraware/advisor/reconciler.go b/pkg/agent/sysadvisor/plugin/poweraware/advisor/reconciler.go index 91f67d9a5..065b4ea46 100644 --- a/pkg/agent/sysadvisor/plugin/poweraware/advisor/reconciler.go +++ b/pkg/agent/sysadvisor/plugin/poweraware/advisor/reconciler.go @@ -97,7 +97,7 @@ func newReconciler(dryRun bool, metricsReader metrictypes.MetricsReader, emitter priorAction: action.PowerAction{}, evictor: evictor, capper: capper, - strategy: strategy.NewEvictFirstStrategy(emitter, evictor, metricsReader), + strategy: strategy.NewEvictFirstStrategy(emitter, evictor, metricsReader, capper), emitter: emitter, } } diff --git a/pkg/agent/sysadvisor/plugin/poweraware/capper/capper.go b/pkg/agent/sysadvisor/plugin/poweraware/capper/capper.go index eea22569a..9d21aa28b 100644 --- a/pkg/agent/sysadvisor/plugin/poweraware/capper/capper.go +++ b/pkg/agent/sysadvisor/plugin/poweraware/capper/capper.go @@ -29,6 +29,10 @@ type PowerCapper interface { // noopCapper is placeholder for disabled power capping server type noopCapper struct{} +func (n noopCapper) IsCapperReady() bool { + return false +} + func (n noopCapper) Stop() error { return nil } diff --git a/pkg/agent/sysadvisor/plugin/poweraware/capper/server/service.go b/pkg/agent/sysadvisor/plugin/poweraware/capper/server/service.go index 2dbf84ad2..51c0b5030 100644 --- a/pkg/agent/sysadvisor/plugin/poweraware/capper/server/service.go +++ b/pkg/agent/sysadvisor/plugin/poweraware/capper/server/service.go @@ -50,6 +50,10 @@ type powerCapService struct { grpcServer *grpcServer } +func (p *powerCapService) IsCapperReady() bool { + return !p.notify.IsEmpty() +} + func (p *powerCapService) Stop() error { p.Lock() defer p.Unlock() diff --git a/pkg/agent/sysadvisor/plugin/poweraware/evictor/percentage_evictor.go b/pkg/agent/sysadvisor/plugin/poweraware/evictor/percentage_evictor.go index ac58f1836..c6b2fe73a 100644 --- a/pkg/agent/sysadvisor/plugin/poweraware/evictor/percentage_evictor.go +++ b/pkg/agent/sysadvisor/plugin/poweraware/evictor/percentage_evictor.go @@ -18,6 +18,7 @@ package evictor import ( "context" + v1 "k8s.io/api/core/v1" powermetric "github.com/kubewharf/katalyst-core/pkg/agent/sysadvisor/plugin/poweraware/metric" From c774d9ad2c64e44b2016691ef2972310b266f97a Mon Sep 17 00:00:00 2001 From: Hongwei Chen Date: Mon, 13 Jan 2025 04:30:16 +0000 Subject: [PATCH 3/3] pap: error metric has human readable error cause tag, counter type --- .../poweraware/advisor/advisor_metrics.go | 4 +-- .../poweraware/capper/server/service.go | 2 +- .../capper/server/service_metrics.go | 4 +-- .../evictor/percentage_evictor_metrics.go | 4 +-- .../plugin/poweraware/metric/emit_metric.go | 8 ++--- .../plugin/poweraware/metric/error_cause.go | 31 +++++++++++++++++++ .../plugin/poweraware/metric/error_code.go | 29 ----------------- .../plugin/poweraware/metric/metric_tag.go | 2 +- 8 files changed, 42 insertions(+), 42 deletions(-) create mode 100644 pkg/agent/sysadvisor/plugin/poweraware/metric/error_cause.go delete mode 100644 pkg/agent/sysadvisor/plugin/poweraware/metric/error_code.go diff --git a/pkg/agent/sysadvisor/plugin/poweraware/advisor/advisor_metrics.go b/pkg/agent/sysadvisor/plugin/poweraware/advisor/advisor_metrics.go index c0abb1951..7c44f6932 100644 --- a/pkg/agent/sysadvisor/plugin/poweraware/advisor/advisor_metrics.go +++ b/pkg/agent/sysadvisor/plugin/poweraware/advisor/advisor_metrics.go @@ -29,6 +29,6 @@ func (p *powerAwareAdvisor) emitPowerSpec(powerSpec *spec.PowerSpec) { powermetric.EmitPowerSpec(p.emitter, powerSpec) } -func (p *powerAwareAdvisor) emitErrorCode(errorCode int) { - powermetric.EmitErrorCode(p.emitter, errorCode) +func (p *powerAwareAdvisor) emitErrorCode(errorCause powermetric.ErrorCause) { + powermetric.EmitErrorCode(p.emitter, errorCause) } diff --git a/pkg/agent/sysadvisor/plugin/poweraware/capper/server/service.go b/pkg/agent/sysadvisor/plugin/poweraware/capper/server/service.go index 51c0b5030..728fe3472 100644 --- a/pkg/agent/sysadvisor/plugin/poweraware/capper/server/service.go +++ b/pkg/agent/sysadvisor/plugin/poweraware/capper/server/service.go @@ -133,7 +133,7 @@ stream: err := server.Send(resp) if err != nil { general.Errorf("pap: [power capping] send response failed: %v", err) - p.emitErrorCode(powermetric.ErrorCodePowerCapLWSendResponseFailed) + p.emitErrorCode(powermetric.ErrorCodePowerCapCommunication) break stream } } diff --git a/pkg/agent/sysadvisor/plugin/poweraware/capper/server/service_metrics.go b/pkg/agent/sysadvisor/plugin/poweraware/capper/server/service_metrics.go index 141f56e81..0bfe5b4f9 100644 --- a/pkg/agent/sysadvisor/plugin/poweraware/capper/server/service_metrics.go +++ b/pkg/agent/sysadvisor/plugin/poweraware/capper/server/service_metrics.go @@ -29,6 +29,6 @@ func (p *powerCapService) emitPowerCapReset() { powermetric.EmitPowerCapReset(p.emitter) } -func (p *powerCapService) emitErrorCode(errorCode int) { - powermetric.EmitErrorCode(p.emitter, errorCode) +func (p *powerCapService) emitErrorCode(errorCause powermetric.ErrorCause) { + powermetric.EmitErrorCode(p.emitter, errorCause) } diff --git a/pkg/agent/sysadvisor/plugin/poweraware/evictor/percentage_evictor_metrics.go b/pkg/agent/sysadvisor/plugin/poweraware/evictor/percentage_evictor_metrics.go index c53208f0f..546e37388 100644 --- a/pkg/agent/sysadvisor/plugin/poweraware/evictor/percentage_evictor_metrics.go +++ b/pkg/agent/sysadvisor/plugin/poweraware/evictor/percentage_evictor_metrics.go @@ -24,6 +24,6 @@ func (l *loadEvictor) emitEvictReq(pods int) { powermetric.EmitEvictReq(l.emitter, pods) } -func (l *loadEvictor) emitErrorCode(errorCode int) { - powermetric.EmitErrorCode(l.emitter, errorCode) +func (l *loadEvictor) emitErrorCode(errorCause powermetric.ErrorCause) { + powermetric.EmitErrorCode(l.emitter, errorCause) } diff --git a/pkg/agent/sysadvisor/plugin/poweraware/metric/emit_metric.go b/pkg/agent/sysadvisor/plugin/poweraware/metric/emit_metric.go index af1d5f9b6..2937ae92e 100644 --- a/pkg/agent/sysadvisor/plugin/poweraware/metric/emit_metric.go +++ b/pkg/agent/sysadvisor/plugin/poweraware/metric/emit_metric.go @@ -17,8 +17,6 @@ limitations under the License. package metric import ( - "strconv" - "github.com/kubewharf/katalyst-core/pkg/agent/sysadvisor/plugin/poweraware/advisor/action" "github.com/kubewharf/katalyst-core/pkg/agent/sysadvisor/plugin/poweraware/capper" "github.com/kubewharf/katalyst-core/pkg/agent/sysadvisor/plugin/poweraware/spec" @@ -76,10 +74,10 @@ func EmitPowerCapReset(emitter metrics.MetricEmitter) { _ = emitter.StoreInt64(metricPowerCappingReset, 1, metrics.MetricTypeNameCount) } -func EmitErrorCode(emitter metrics.MetricEmitter, errorCode int) { - _ = emitter.StoreInt64(metricPowerError, 1, metrics.MetricTypeNameRaw, +func EmitErrorCode(emitter metrics.MetricEmitter, cause ErrorCause) { + _ = emitter.StoreInt64(metricPowerError, 1, metrics.MetricTypeNameCount, metrics.ConvertMapToTags(map[string]string{ - tagErrorCode: strconv.Itoa(errorCode), + tagErrorCause: string(cause), })..., ) } diff --git a/pkg/agent/sysadvisor/plugin/poweraware/metric/error_cause.go b/pkg/agent/sysadvisor/plugin/poweraware/metric/error_cause.go new file mode 100644 index 000000000..4deb71c92 --- /dev/null +++ b/pkg/agent/sysadvisor/plugin/poweraware/metric/error_cause.go @@ -0,0 +1,31 @@ +/* +Copyright 2022 The Katalyst Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package metric + +type ErrorCause string + +const ( + ErrorCodeInitFailure = ErrorCause("init_error") + ErrorCodeStartFailure = ErrorCause("start_error") + ErrorCodePowerSpecFormat = ErrorCause("spec_error") + ErrorCodePowerGetCurrentUsage = ErrorCause("read_power_error") + ErrorCodePowerCapperUnavailable = ErrorCause("capper_unavailable") + ErrorCodePowerCapCommunication = ErrorCause("capper_communication_error") + ErrorCodePowerEvictFailure = ErrorCause("evict_error") + ErrorCodeRecoverable = ErrorCause("recoverable_error") + ErrorCodeOther = ErrorCause("other_error") +) diff --git a/pkg/agent/sysadvisor/plugin/poweraware/metric/error_code.go b/pkg/agent/sysadvisor/plugin/poweraware/metric/error_code.go deleted file mode 100644 index a51309c9e..000000000 --- a/pkg/agent/sysadvisor/plugin/poweraware/metric/error_code.go +++ /dev/null @@ -1,29 +0,0 @@ -/* -Copyright 2022 The Katalyst Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package metric - -const ( - ErrorCodeInitFailure = 9505001 - ErrorCodeStartFailure = 9505002 - ErrorCodePowerSpecFormat = 9505003 - ErrorCodePowerGetCurrentUsage = 9505004 - ErrorCodePowerCapperUnavailable = 9505005 - ErrorCodePowerCapLWSendResponseFailed = 9505006 - ErrorCodePowerEvictFailure = 9505007 - ErrorCodeRecoverable = 9505008 - ErrorCodeOther = 9805009 -) diff --git a/pkg/agent/sysadvisor/plugin/poweraware/metric/metric_tag.go b/pkg/agent/sysadvisor/plugin/poweraware/metric/metric_tag.go index 6c57125e1..76e4b8c4d 100644 --- a/pkg/agent/sysadvisor/plugin/poweraware/metric/metric_tag.go +++ b/pkg/agent/sysadvisor/plugin/poweraware/metric/metric_tag.go @@ -31,5 +31,5 @@ const ( tagPowerActionOp = "op" tagPowerActionMode = "mode" tagPowerCappingOpCode = "op" - tagErrorCode = "code" + tagErrorCause = "cause" )