Add "skipped" conditional for near zero metric value fast fails #412

Merged
3 changes: 3 additions & 0 deletions servo/configuration.py
@@ -586,6 +586,9 @@ class FastFailConfiguration(pydantic.BaseSettings):
skip: servo.types.Duration = 0
"""How long to wait before querying SLO metrics for potential violations"""

treat_zero_as_missing: bool = False
"""Whether or not to treat zero values as missing per certain metric systems"""

class Config:
extra = pydantic.Extra.forbid

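As a usage sketch (not part of the diff), the new flag can be set alongside the existing fast-fail settings. Field names follow the diff above; the "1m" duration string is only an assumed example value.

import servo.configuration

# Hypothetical configuration enabling the new behavior: zero readings are
# reported as missing rather than evaluated against the SLO thresholds.
config = servo.configuration.FastFailConfiguration(
    skip="1m",                   # wait one minute before querying SLO metrics
    treat_zero_as_missing=True,  # zeros become missing_metric / missing_threshold outcomes
)
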
46 changes: 43 additions & 3 deletions servo/fast_fail.py
@@ -20,6 +20,8 @@
class SloOutcomeStatus(str, enum.Enum):
passed = "passed"
failed = "failed"
zero_metric = "zero_metric"
zero_threshold = "zero_threshold"
missing_metric = "missing_metric"
missing_threshold = "missing_threshold"

@@ -41,6 +43,10 @@ def to_message(self, condition: servo.types.SloCondition):
message = "SLO passed"
elif self.status == SloOutcomeStatus.failed:
message = f"SLO failed metric value {self.metric_value} was not {condition.keep} threshold value {self.threshold_value}"
elif self.status == SloOutcomeStatus.zero_metric:
message = f"Skipping SLO {condition.metric} due to near-zero metric value of {self.metric_value}"
elif self.status == SloOutcomeStatus.zero_threshold:
message = f"Skipping SLO {condition.metric} due to near-zero threshold value of {self.threshold_value}"
else:
message = f"Uncrecognized outcome status {self.status}"
return f"{self.checked_at} {message}"
@@ -87,10 +93,21 @@ def check_readings(
metric_value=metric_value, metric_readings=metric_readings
)

if self.config.treat_zero_as_missing and float(metric_value) == 0:
self._results[condition].append(
SloOutcome(**result_args, status=SloOutcomeStatus.missing_metric)
)
continue

# Evaluate threshold
threshold_readings = None
if condition.threshold is not None:
threshold_value = condition.threshold * condition.threshold_multiplier

result_args.update(
threshold_value=threshold_value,
threshold_readings=threshold_readings,
)
elif condition.threshold_metric is not None:
threshold_readings = metrics.get(condition.threshold_metric)
if not threshold_readings:
@@ -104,9 +121,32 @@ def check_readings(
threshold_scalar = _get_scalar_from_readings(threshold_readings)
threshold_value = threshold_scalar * condition.threshold_multiplier

result_args.update(
threshold_value=threshold_value, threshold_readings=threshold_readings
)
result_args.update(
threshold_value=threshold_value,
threshold_readings=threshold_readings,
)

if self.config.treat_zero_as_missing and float(threshold_value) == 0:
self._results[condition].append(
SloOutcome(
**result_args, status=SloOutcomeStatus.missing_threshold
)
)
continue

elif 0 <= metric_value <= condition.slo_metric_minimum:
self._results[condition].append(
SloOutcome(**result_args, status=SloOutcomeStatus.zero_metric)
)
continue

elif 0 <= threshold_value <= condition.slo_threshold_minimum:
self._results[condition].append(
SloOutcome(
**result_args, status=SloOutcomeStatus.zero_threshold
)
)
continue

if metric_value.is_nan() or threshold_value.is_nan():
self._results[condition].append(
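To summarize the branch ordering added above, here is a small standalone sketch (illustrative only, not the connector code): exact zeros are treated as missing when treat_zero_as_missing is set, and otherwise values at or below the configured minimums produce the new skipped outcomes instead of an SLO failure.

from decimal import Decimal

def classify_reading(
    metric_value: Decimal,
    threshold_value: Decimal,
    treat_zero_as_missing: bool,
    slo_metric_minimum: float = 0.25,
    slo_threshold_minimum: float = 0.25,
) -> str:
    # Mirrors the precedence of the new branches in check_readings().
    if treat_zero_as_missing and float(metric_value) == 0:
        return "missing_metric"
    if treat_zero_as_missing and float(threshold_value) == 0:
        return "missing_threshold"
    if 0 <= metric_value <= slo_metric_minimum:
        return "zero_metric"       # skipped: the metric is still near zero
    if 0 <= threshold_value <= slo_threshold_minimum:
        return "zero_threshold"    # skipped: the threshold is still near zero
    return "evaluate"              # fall through to the normal keep above/below check
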
19 changes: 19 additions & 0 deletions servo/types/slo.py
@@ -22,12 +22,14 @@ class SloKeep(str, enum.Enum):
class SloCondition(BaseModel):
description: Optional[str] = None
metric: str
slo_metric_minimum: float = 0.25
threshold_multiplier: decimal.Decimal = decimal.Decimal(1)
keep: SloKeep = SloKeep.below
trigger_count: TriggerConstraints = cast(TriggerConstraints, 1)
trigger_window: TriggerConstraints = cast(TriggerConstraints, None)
threshold: Optional[decimal.Decimal]
threshold_metric: Optional[str]
slo_threshold_minimum: float = 0.25

@pydantic.root_validator
@classmethod
@@ -47,6 +49,23 @@ def _check_threshold_values(cls, values):

return values

@pydantic.root_validator(pre=True)
@classmethod
def _check_duplicated_minimum(cls, values):
if (
values.get("threshold") is not None
and values.get("slo_threshold_minimum") is not None
):
# Use run time import to prevent circular imports
import servo.logging

servo.logging.logger.warning(
"SLO Condition should not specify both static threshold and metric based threshold minimum."
" Please double check the Slo Conditions of the User Config"
)

return values

@pydantic.validator("trigger_window", pre=True, always=True)
@classmethod
def _trigger_window_defaults_to_trigger_count(cls, v, *, values, **kwargs):
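For illustration only, conditions might set the new minimums as below (assuming SloCondition and SloKeep import from servo.types.slo, as the file path suggests; the metric names are borrowed from the tests that follow).

from servo.types.slo import SloCondition, SloKeep

# Static threshold: skip the check while the observed metric stays at or below 0.5.
static_condition = SloCondition(
    metric="tuning_p50_latency",
    threshold=6000,
    slo_metric_minimum=0.5,
)

# Metric-based threshold: skip while the threshold metric itself is near zero.
relative_condition = SloCondition(
    metric="tuning_p50_latency",
    threshold_metric="main_p50_latency",
    keep=SloKeep.below,
    slo_threshold_minimum=0.5,
)
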
16 changes: 2 additions & 14 deletions tests/connectors/prometheus_test.py
@@ -143,6 +143,7 @@ def test_generate_default_config(self):
" period: 1m\n"
" span: 1m\n"
" skip: '0'\n"
" treat_zero_as_missing: false\n"
)

def test_generate_override_metrics(self):
@@ -868,20 +869,7 @@ async def send_traffic() -> None:
+ date_matcher
+ "SLO failed metric value "
+ float_matcher
+ re.escape(" was not below threshold value 0.2], ")
+ re.escape("(tuning_p50_latency below main_p50_latency)[")
+ date_matcher
+ " SLO failed metric value "
+ float_matcher
+ " was not below threshold value "
+ float_matcher
+ ", "
+ date_matcher
+ " SLO failed metric value "
+ float_matcher
+ " was not below threshold value "
+ float_matcher
+ re.escape("]")
+ re.escape(" was not below threshold value 0.2]")
)

try:
157 changes: 143 additions & 14 deletions tests/fast_fail_test.py
@@ -21,14 +21,8 @@

def test_non_unique_conditions() -> None:
conditions = [
SloCondition(
metric="same",
threshold=6000,
),
SloCondition(
metric="same",
threshold=6000,
),
SloCondition(metric="same", threshold=6000),
SloCondition(metric="same", threshold=6000),
SloCondition(
metric="same2",
threshold_metric="same3",
@@ -37,10 +31,7 @@ def test_non_unique_conditions() -> None:
metric="same2",
threshold_metric="same3",
),
SloCondition(
metric="not_same",
threshold=6000,
),
SloCondition(metric="not_same", threshold=6000),
SloCondition(
metric="not_same",
keep=SloKeep.above,
@@ -57,7 +48,7 @@ def test_non_unique_conditions() -> None:
)


def test_trigget_count_greter_than_window() -> None:
def test_trigger_count_greater_than_window() -> None:
with pytest.raises(pydantic.ValidationError) as err_info:
SloCondition(
metric="test",
@@ -91,7 +82,11 @@ def config() -> servo.configuration.FastFailConfiguration:
def slo_input(metric: Metric, tuning_metric: Metric) -> SloInput:
return SloInput(
conditions=[
SloCondition(metric=metric.name, threshold=6000, trigger_window=2),
SloCondition(
metric=metric.name,
threshold=6000,
trigger_window=2,
),
SloCondition(
metric=metric.name,
threshold_metric=tuning_metric.name,
@@ -179,6 +174,76 @@ def test_timeseries_slos_pass(
observer.check_readings(slo_check_readings, checked_at)


@pytest.mark.parametrize(
"checked_at, values, tuning_values",
[
(
datetime(2020, 1, 21, 12, 0, 1),
[[0.05, 0.35, 0.01, 0.03]],
[[0.0, 0.0, 0.0, 0.0]],
),
(
datetime(2020, 1, 21, 12, 10, 1),
[[0.05, 0.35, 0.01, 0.03], [0.05, 0.35, 0.01, 0.03]],
[
[0.0, 0.0, 0.0, 0.0],
[0.0, 0.0, 0.0, 0.0],
],
),
],
)
def test_timeseries_slos_skip_zero_metric(
observer: FastFailObserver,
checked_at: datetime,
metric: Metric,
tuning_metric: Metric,
values: List[List[float]],
tuning_values: List[List[float]],
) -> None:
slo_check_readings: Dict[str, List[TimeSeries]] = {
metric.name: _make_time_series_list(metric, values),
tuning_metric.name: _make_time_series_list(tuning_metric, tuning_values),
}

servo.logging.set_level("DEBUG")
observer.check_readings(slo_check_readings, checked_at)


@pytest.mark.parametrize(
"checked_at, values, tuning_values",
[
(
datetime(2020, 1, 21, 12, 0, 1),
[[0.0, 0.0, 0.0, 0.0]],
[[0.05, 0.35, 0.01, 0.03]],
),
(
datetime(2020, 1, 21, 12, 10, 1),
[
[0.0, 0.0, 0.0, 0.0],
[0.0, 0.0, 0.0, 0.0],
],
[[0.05, 0.35, 0.01, 0.03], [0.05, 0.35, 0.01, 0.03]],
),
],
)
def test_timeseries_slos_skip_zero_threshold(
observer: FastFailObserver,
checked_at: datetime,
metric: Metric,
tuning_metric: Metric,
values: List[List[float]],
tuning_values: List[List[float]],
) -> None:
slo_check_readings: Dict[str, List[TimeSeries]] = {
metric.name: _make_time_series_list(metric, values),
tuning_metric.name: _make_time_series_list(tuning_metric, tuning_values),
}

servo.logging.set_level("DEBUG")
observer.check_readings(slo_check_readings, checked_at)


@pytest.mark.parametrize(
"checked_at, values, tuning_values, error_str",
[
@@ -276,6 +341,70 @@ def test_data_point_slos_pass(
observer.check_readings(slo_check_readings, checked_at)


@pytest.mark.parametrize(
"checked_at, values, tuning_values",
[
(
datetime(2020, 1, 21, 12, 0, 1),
[0.05, 0.35, 0.01, 0.03],
[0.0, 0.0, 0.0, 0.0],
),
(
datetime(2020, 1, 21, 12, 10, 1),
[0.24],
[0.0],
),
],
)
def test_data_point_slos_skip_zero_metric(
observer: FastFailObserver,
checked_at: datetime,
metric: Metric,
tuning_metric: Metric,
values: List[float],
tuning_values: List[float],
) -> None:
slo_check_readings: Dict[str, List[DataPoint]] = {
metric.name: _make_data_point_list(metric, values),
tuning_metric.name: _make_data_point_list(tuning_metric, tuning_values),
}

servo.logging.set_level("DEBUG")
observer.check_readings(slo_check_readings, checked_at)


@pytest.mark.parametrize(
"checked_at, values, tuning_values",
[
(
datetime(2020, 1, 21, 12, 0, 1),
[0.0, 0.0, 0.0, 0.0],
[0.05, 0.35, 0.01, 0.03],
),
(
datetime(2020, 1, 21, 12, 10, 1),
[0.0],
[0.24],
),
],
)
def test_data_point_slos_skip_zero_threshold(
observer: FastFailObserver,
checked_at: datetime,
metric: Metric,
tuning_metric: Metric,
values: List[float],
tuning_values: List[float],
) -> None:
slo_check_readings: Dict[str, List[DataPoint]] = {
metric.name: _make_data_point_list(metric, values),
tuning_metric.name: _make_data_point_list(tuning_metric, tuning_values),
}

servo.logging.set_level("DEBUG")
observer.check_readings(slo_check_readings, checked_at)


@pytest.mark.parametrize(
"checked_at, values, tuning_values, error_str",
[