diff --git a/pkg/agent/sysadvisor/plugin/poweraware/advisor/action/strategy/evict_first.go b/pkg/agent/sysadvisor/plugin/poweraware/advisor/action/strategy/evict_first.go index acb714ce9..7ee99e7fe 100644 --- a/pkg/agent/sysadvisor/plugin/poweraware/advisor/action/strategy/evict_first.go +++ b/pkg/agent/sysadvisor/plugin/poweraware/advisor/action/strategy/evict_first.go @@ -49,7 +49,8 @@ type CapperProber interface { // evictFirstStrategy always attempts to evict low priority pods if any; only after all are exhausted will it resort to DVFS means. // besides, it will continue to try the best to meet the alert spec, regardless of the alert update time. // alert level has the following meanings in this strategy: -// P1 - eviction only; +// P2 - noop and expecting scheduler to bias against the node +// P1 - noop and expecting scheduler not to schedule to the node // P0 - evict if applicable; otherwise conduct DVFS once if needed (DVFS is limited to 10%); // S0 - DVFS in urgency (no limit on DVFS) type evictFirstStrategy struct { @@ -95,7 +96,10 @@ func (e *evictFirstStrategy) recommendEvictFirstOp() spec.InternalOp { func (e *evictFirstStrategy) recommendOp(alert spec.PowerAlert, internalOp spec.InternalOp) spec.InternalOp { if internalOp != spec.InternalOpAuto { - return internalOp + // internal op is only applicable to dvfs related levels, i.e. s0 + p0 + if alert == spec.PowerAlertS0 || alert == spec.PowerAlertP0 { + return internalOp + } } switch alert { @@ -103,8 +107,6 @@ func (e *evictFirstStrategy) recommendOp(alert spec.PowerAlert, internalOp spec. return spec.InternalOpFreqCap case spec.PowerAlertP0: return e.recommendEvictFirstOp() - case spec.PowerAlertP1: - return spec.InternalOpEvict default: return spec.InternalOpNoop } @@ -126,8 +128,8 @@ func (e *evictFirstStrategy) adjustTargetForConstraintDVFS(actualWatt, desiredWa func (e *evictFirstStrategy) yieldActionPlan(op, internalOp spec.InternalOp, actualWatt, desiredWatt int, alert spec.PowerAlert, ttl time.Duration) action.PowerAction { switch op { case spec.InternalOpFreqCap: - // try to conduct freq capping within the allowed limit if not set for hard dvfs - if internalOp != spec.InternalOpFreqCap && !(alert == spec.PowerAlertS0 && internalOp == spec.InternalOpAuto) { + // try to conduct freq capping within the allowed limit except for the unconstrained dvfs + if alert != spec.PowerAlertS0 { var err error desiredWatt, err = e.adjustTargetForConstraintDVFS(actualWatt, desiredWatt) if err != nil { diff --git a/pkg/agent/sysadvisor/plugin/poweraware/advisor/action/strategy/evict_first_test.go b/pkg/agent/sysadvisor/plugin/poweraware/advisor/action/strategy/evict_first_test.go index 65c2167ec..9d51a7a5b 100644 --- a/pkg/agent/sysadvisor/plugin/poweraware/advisor/action/strategy/evict_first_test.go +++ b/pkg/agent/sysadvisor/plugin/poweraware/advisor/action/strategy/evict_first_test.go @@ -69,16 +69,16 @@ func Test_evictFirstStrategy_RecommendAction(t *testing.T) { wantInDVFS bool }{ { - name: "internal op is always respected if exists", + name: "plan of s0 always targets full range", fields: fields{ coefficient: exponentialDecay{}, evictableProber: nil, dvfsUsed: 0, }, args: args{ + alert: spec.PowerAlertS0, actualWatt: 100, desiredWatt: 80, - internalOp: spec.InternalOpFreqCap, }, want: action.PowerAction{ Op: spec.InternalOpFreqCap, @@ -86,6 +86,38 @@ func Test_evictFirstStrategy_RecommendAction(t *testing.T) { }, wantInDVFS: true, }, + { + name: "plan of p0 is constraint when allowing dvfs only", + fields: fields{ + coefficient: exponentialDecay{}, + evictableProber: nil, + dvfsUsed: 0, + }, + args: args{ + alert: spec.PowerAlertP0, + actualWatt: 100, + desiredWatt: 80, + internalOp: spec.InternalOpFreqCap, + }, + want: action.PowerAction{ + Op: spec.InternalOpFreqCap, + Arg: 90, + }, + wantInDVFS: true, + }, + { + name: "p1 is noop", + fields: fields{}, + args: args{ + actualWatt: 100, + desiredWatt: 80, + alert: spec.PowerAlertP1, + }, + want: action.PowerAction{ + Op: spec.InternalOpNoop, + Arg: 0, + }, + }, { name: "p2 is noop", fields: fields{}, diff --git a/pkg/agent/sysadvisor/plugin/poweraware/reader/metric_store_power_reader.go b/pkg/agent/sysadvisor/plugin/poweraware/reader/metric_store_power_reader.go index 238bee219..b2faa3676 100644 --- a/pkg/agent/sysadvisor/plugin/poweraware/reader/metric_store_power_reader.go +++ b/pkg/agent/sysadvisor/plugin/poweraware/reader/metric_store_power_reader.go @@ -55,7 +55,12 @@ func (m *metricStorePowerReader) Get(ctx context.Context) (int, error) { func (m *metricStorePowerReader) get(ctx context.Context, now time.Time) (int, error) { data, err := m.GetNodeMetric(consts.MetricTotalPowerUsedWatts) if err != nil { - return 0, errors.Wrap(err, "failed to get metric from metric store") + return 0, errors.Wrap(err, "failed to get power usage from metric store") + } + + // 0 actually is error, typically caused by null response from malachite realtime power service + if data.Value == 0 { + return 0, errors.New("got invalid 0 power usage from metric store") } if !isDataFresh(data, now) { diff --git a/pkg/agent/sysadvisor/plugin/poweraware/reader/metric_store_power_reader_test.go b/pkg/agent/sysadvisor/plugin/poweraware/reader/metric_store_power_reader_test.go index 1f9cc6624..83d9c2f5a 100644 --- a/pkg/agent/sysadvisor/plugin/poweraware/reader/metric_store_power_reader_test.go +++ b/pkg/agent/sysadvisor/plugin/poweraware/reader/metric_store_power_reader_test.go @@ -21,6 +21,8 @@ import ( "testing" "time" + "github.com/stretchr/testify/assert" + utilmetric "github.com/kubewharf/katalyst-core/pkg/util/metric" ) @@ -101,3 +103,19 @@ func Test_metricStorePowerReader_get(t *testing.T) { }) } } + +func Test_metricStorePowerReader_get_0_is_error(t *testing.T) { + t.Parallel() + + setTime := time.Date(2024, 11, 11, 8, 30, 10, 0, time.UTC) + dummyMetricStore := utilmetric.NewMetricStore() + dummyMetricStore.SetNodeMetric("total.power.used.watts", utilmetric.MetricData{ + Value: 0, + Time: &setTime, + }) + + m := NewMetricStorePowerReader(dummyMetricStore).(*metricStorePowerReader) + now := time.Date(2024, 11, 11, 8, 30, 11, 0, time.UTC) + _, err := m.get(context.TODO(), now) + assert.Error(t, err) +}