Skip to content

Commit

Permalink
Merge pull request #774 from h-w-chen/dev/pap-p1-no-evict
Browse files Browse the repository at this point in the history
fix(sysadvisor): pap - level p1 action plan not to evict
  • Loading branch information
luomingmeng authored Feb 11, 2025
2 parents fd243ab + 682ef1a commit 1d22194
Show file tree
Hide file tree
Showing 4 changed files with 66 additions and 9 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,8 @@ type CapperProber interface {
// evictFirstStrategy always attempts to evict low priority pods if any; only after all are exhausted will it resort to DVFS means.
// besides, it will continue to try the best to meet the alert spec, regardless of the alert update time.
// alert level has the following meanings in this strategy:
// P1 - eviction only;
// P2 - noop and expecting scheduler to bias against the node
// P1 - noop and expecting scheduler not to schedule to the node
// P0 - evict if applicable; otherwise conduct DVFS once if needed (DVFS is limited to 10%);
// S0 - DVFS in urgency (no limit on DVFS)
type evictFirstStrategy struct {
Expand Down Expand Up @@ -95,16 +96,17 @@ func (e *evictFirstStrategy) recommendEvictFirstOp() spec.InternalOp {

func (e *evictFirstStrategy) recommendOp(alert spec.PowerAlert, internalOp spec.InternalOp) spec.InternalOp {
if internalOp != spec.InternalOpAuto {
return internalOp
// internal op is only applicable to dvfs related levels, i.e. s0 + p0
if alert == spec.PowerAlertS0 || alert == spec.PowerAlertP0 {
return internalOp
}
}

switch alert {
case spec.PowerAlertS0:
return spec.InternalOpFreqCap
case spec.PowerAlertP0:
return e.recommendEvictFirstOp()
case spec.PowerAlertP1:
return spec.InternalOpEvict
default:
return spec.InternalOpNoop
}
Expand All @@ -126,8 +128,8 @@ func (e *evictFirstStrategy) adjustTargetForConstraintDVFS(actualWatt, desiredWa
func (e *evictFirstStrategy) yieldActionPlan(op, internalOp spec.InternalOp, actualWatt, desiredWatt int, alert spec.PowerAlert, ttl time.Duration) action.PowerAction {
switch op {
case spec.InternalOpFreqCap:
// try to conduct freq capping within the allowed limit if not set for hard dvfs
if internalOp != spec.InternalOpFreqCap && !(alert == spec.PowerAlertS0 && internalOp == spec.InternalOpAuto) {
// try to conduct freq capping within the allowed limit except for the unconstrained dvfs
if alert != spec.PowerAlertS0 {
var err error
desiredWatt, err = e.adjustTargetForConstraintDVFS(actualWatt, desiredWatt)
if err != nil {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -69,23 +69,55 @@ func Test_evictFirstStrategy_RecommendAction(t *testing.T) {
wantInDVFS bool
}{
{
name: "internal op is always respected if exists",
name: "plan of s0 always targets full range",
fields: fields{
coefficient: exponentialDecay{},
evictableProber: nil,
dvfsUsed: 0,
},
args: args{
alert: spec.PowerAlertS0,
actualWatt: 100,
desiredWatt: 80,
internalOp: spec.InternalOpFreqCap,
},
want: action.PowerAction{
Op: spec.InternalOpFreqCap,
Arg: 80,
},
wantInDVFS: true,
},
{
name: "plan of p0 is constraint when allowing dvfs only",
fields: fields{
coefficient: exponentialDecay{},
evictableProber: nil,
dvfsUsed: 0,
},
args: args{
alert: spec.PowerAlertP0,
actualWatt: 100,
desiredWatt: 80,
internalOp: spec.InternalOpFreqCap,
},
want: action.PowerAction{
Op: spec.InternalOpFreqCap,
Arg: 90,
},
wantInDVFS: true,
},
{
name: "p1 is noop",
fields: fields{},
args: args{
actualWatt: 100,
desiredWatt: 80,
alert: spec.PowerAlertP1,
},
want: action.PowerAction{
Op: spec.InternalOpNoop,
Arg: 0,
},
},
{
name: "p2 is noop",
fields: fields{},
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,12 @@ func (m *metricStorePowerReader) Get(ctx context.Context) (int, error) {
func (m *metricStorePowerReader) get(ctx context.Context, now time.Time) (int, error) {
data, err := m.GetNodeMetric(consts.MetricTotalPowerUsedWatts)
if err != nil {
return 0, errors.Wrap(err, "failed to get metric from metric store")
return 0, errors.Wrap(err, "failed to get power usage from metric store")
}

// 0 actually is error, typically caused by null response from malachite realtime power service
if data.Value == 0 {
return 0, errors.New("got invalid 0 power usage from metric store")
}

if !isDataFresh(data, now) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ import (
"testing"
"time"

"github.com/stretchr/testify/assert"

utilmetric "github.com/kubewharf/katalyst-core/pkg/util/metric"
)

Expand Down Expand Up @@ -101,3 +103,19 @@ func Test_metricStorePowerReader_get(t *testing.T) {
})
}
}

func Test_metricStorePowerReader_get_0_is_error(t *testing.T) {
t.Parallel()

setTime := time.Date(2024, 11, 11, 8, 30, 10, 0, time.UTC)
dummyMetricStore := utilmetric.NewMetricStore()
dummyMetricStore.SetNodeMetric("total.power.used.watts", utilmetric.MetricData{
Value: 0,
Time: &setTime,
})

m := NewMetricStorePowerReader(dummyMetricStore).(*metricStorePowerReader)
now := time.Date(2024, 11, 11, 8, 30, 11, 0, time.UTC)
_, err := m.get(context.TODO(), now)
assert.Error(t, err)
}

0 comments on commit 1d22194

Please sign in to comment.