Add "skipped" conditional for near zero metric value fast fails #412

Merged
3 changes: 3 additions & 0 deletions servo/configuration.py
@@ -586,6 +586,9 @@ class FastFailConfiguration(pydantic.BaseSettings):
skip: servo.types.Duration = 0
"""How long to wait before querying SLO metrics for potential violations"""

treat_zero_as_missing: bool = False
"""Whether or not to treat zero values as missing per certain metric systems"""

class Config:
extra = pydantic.Extra.forbid

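As a usage sketch (not part of the diff), the new flag can be set alongside the existing fast-fail settings. Field names follow the diff above; the "1m" duration string is only an assumed example value.

import servo.configuration

# Hypothetical configuration enabling the new behavior: zero readings are
# reported as missing rather than evaluated against the SLO thresholds.
config = servo.configuration.FastFailConfiguration(
    skip="1m",                   # wait one minute before querying SLO metrics
    treat_zero_as_missing=True,  # zeros become missing_metric / missing_threshold outcomes
)
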
46 changes: 43 additions & 3 deletions servo/fast_fail.py
@@ -20,6 +20,8 @@
class SloOutcomeStatus(str, enum.Enum):
passed = "passed"
failed = "failed"
zero_metric = "zero_metric"
zero_threshold = "zero_threshold"
missing_metric = "missing_metric"
missing_threshold = "missing_threshold"

@@ -41,6 +43,10 @@ def to_message(self, condition: servo.types.SloCondition):
message = "SLO passed"
elif self.status == SloOutcomeStatus.failed:
message = f"SLO failed metric value {self.metric_value} was not {condition.keep} threshold value {self.threshold_value}"
elif self.status == SloOutcomeStatus.zero_metric:
message = f"Skipping SLO {condition.metric} due to near-zero metric value of {self.metric_value}"
elif self.status == SloOutcomeStatus.zero_threshold:
message = f"Skipping SLO {condition.metric} due to near-zero threshold value of {self.threshold_value}"
else:
message = f"Uncrecognized outcome status {self.status}"
return f"{self.checked_at} {message}"
@@ -87,10 +93,21 @@ def check_readings(
metric_value=metric_value, metric_readings=metric_readings
)

if self.config.treat_zero_as_missing and float(metric_value) == 0:
self._results[condition].append(
SloOutcome(**result_args, status=SloOutcomeStatus.missing_metric)
)
continue

# Evaluate threshold
threshold_readings = None
if condition.threshold is not None:
threshold_value = condition.threshold * condition.threshold_multiplier

result_args.update(
threshold_value=threshold_value,
threshold_readings=threshold_readings,
)
elif condition.threshold_metric is not None:
threshold_readings = metrics.get(condition.threshold_metric)
if not threshold_readings:
@@ -104,9 +121,32 @@ def check_readings(
threshold_scalar = _get_scalar_from_readings(threshold_readings)
threshold_value = threshold_scalar * condition.threshold_multiplier

result_args.update(
threshold_value=threshold_value, threshold_readings=threshold_readings
)
result_args.update(
threshold_value=threshold_value,
threshold_readings=threshold_readings,
)

if self.config.treat_zero_as_missing and float(threshold_value) == 0:
self._results[condition].append(
SloOutcome(
**result_args, status=SloOutcomeStatus.missing_threshold
)
)
continue

elif 0 <= metric_value <= condition.slo_metric_minimum:
self._results[condition].append(
SloOutcome(**result_args, status=SloOutcomeStatus.zero_metric)
)
continue

elif 0 <= threshold_value <= condition.slo_threshold_minimum:
self._results[condition].append(
SloOutcome(
**result_args, status=SloOutcomeStatus.zero_threshold
)
)
continue

if metric_value.is_nan() or threshold_value.is_nan():
self._results[condition].append(
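To summarize the branch ordering added above, here is a small standalone sketch (illustrative only, not the connector code): exact zeros are treated as missing when treat_zero_as_missing is set, and otherwise values at or below the configured minimums produce the new skipped outcomes instead of an SLO failure.

from decimal import Decimal

def classify_reading(
    metric_value: Decimal,
    threshold_value: Decimal,
    treat_zero_as_missing: bool,
    slo_metric_minimum: float = 0.25,
    slo_threshold_minimum: float = 0.25,
) -> str:
    # Mirrors the precedence of the new branches in check_readings().
    if treat_zero_as_missing and float(metric_value) == 0:
        return "missing_metric"
    if treat_zero_as_missing and float(threshold_value) == 0:
        return "missing_threshold"
    if 0 <= metric_value <= slo_metric_minimum:
        return "zero_metric"       # skipped: the metric is still near zero
    if 0 <= threshold_value <= slo_threshold_minimum:
        return "zero_threshold"    # skipped: the threshold is still near zero
    return "evaluate"              # fall through to the normal keep above/below check
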
19 changes: 19 additions & 0 deletions servo/types/slo.py
@@ -22,12 +22,14 @@ class SloKeep(str, enum.Enum):
class SloCondition(BaseModel):
description: Optional[str] = None
metric: str
slo_metric_minimum: float = 0.25
threshold_multiplier: decimal.Decimal = decimal.Decimal(1)
keep: SloKeep = SloKeep.below
trigger_count: TriggerConstraints = cast(TriggerConstraints, 1)
trigger_window: TriggerConstraints = cast(TriggerConstraints, None)
threshold: Optional[decimal.Decimal]
threshold_metric: Optional[str]
slo_threshold_minimum: float = 0.25

@pydantic.root_validator
@classmethod
@@ -47,6 +49,23 @@ def _check_threshold_values(cls, values):

return values

@pydantic.root_validator(pre=True)
@classmethod
def _check_duplicated_minimum(cls, values):
if (
values.get("threshold") is not None
and values.get("slo_threshold_minimum") is not None
):
# Use run time import to prevent circular imports
import servo.logging

servo.logging.logger.warning(
"SLO Condition should not specify both static threshold and metric based threshold minimum."
" Please double check the Slo Conditions of the User Config"
)

return values

@pydantic.validator("trigger_window", pre=True, always=True)
@classmethod
def _trigger_window_defaults_to_trigger_count(cls, v, *, values, **kwargs):
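For illustration only, conditions might set the new minimums as below (assuming SloCondition and SloKeep import from servo.types.slo, as the file path suggests; the metric names are borrowed from the tests that follow).

from servo.types.slo import SloCondition, SloKeep

# Static threshold: skip the check while the observed metric stays at or below 0.5.
static_condition = SloCondition(
    metric="tuning_p50_latency",
    threshold=6000,
    slo_metric_minimum=0.5,
)

# Metric-based threshold: skip while the threshold metric itself is near zero.
relative_condition = SloCondition(
    metric="tuning_p50_latency",
    threshold_metric="main_p50_latency",
    keep=SloKeep.below,
    slo_threshold_minimum=0.5,
)
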
16 changes: 2 additions & 14 deletions tests/connectors/prometheus_test.py
@@ -143,6 +143,7 @@ def test_generate_default_config(self):
" period: 1m\n"
" span: 1m\n"
" skip: '0'\n"
" treat_zero_as_missing: false\n"
)

def test_generate_override_metrics(self):
@@ -868,20 +869,7 @@ async def send_traffic() -> None:
+ date_matcher
+ "SLO failed metric value "
+ float_matcher
+ re.escape(" was not below threshold value 0.2], ")
+ re.escape("(tuning_p50_latency below main_p50_latency)[")
+ date_matcher
+ " SLO failed metric value "
+ float_matcher
+ " was not below threshold value "
+ float_matcher
+ ", "
+ date_matcher
+ " SLO failed metric value "
+ float_matcher
+ " was not below threshold value "
+ float_matcher
+ re.escape("]")
+ re.escape(" was not below threshold value 0.2]")
)

try:
157 changes: 143 additions & 14 deletions tests/fast_fail_test.py
@@ -21,14 +21,8 @@

def test_non_unique_conditions() -> None:
conditions = [
SloCondition(
metric="same",
threshold=6000,
),
SloCondition(
metric="same",
threshold=6000,
),
SloCondition(metric="same", threshold=6000),
SloCondition(metric="same", threshold=6000),
SloCondition(
metric="same2",
threshold_metric="same3",
@@ -37,10 +31,7 @@ def test_non_unique_conditions() -> None:
metric="same2",
threshold_metric="same3",
),
SloCondition(
metric="not_same",
threshold=6000,
),
SloCondition(metric="not_same", threshold=6000),
SloCondition(
metric="not_same",
keep=SloKeep.above,
@@ -57,7 +48,7 @@ def test_non_unique_conditions() -> None:
)


def test_trigget_count_greter_than_window() -> None:
def test_trigger_count_greater_than_window() -> None:
with pytest.raises(pydantic.ValidationError) as err_info:
SloCondition(
metric="test",
@@ -91,7 +82,11 @@ def config() -> servo.configuration.FastFailConfiguration:
def slo_input(metric: Metric, tuning_metric: Metric) -> SloInput:
return SloInput(
conditions=[
SloCondition(metric=metric.name, threshold=6000, trigger_window=2),
SloCondition(
metric=metric.name,
threshold=6000,
trigger_window=2,
),
SloCondition(
metric=metric.name,
threshold_metric=tuning_metric.name,
@@ -179,6 +174,76 @@ def test_timeseries_slos_pass(
observer.check_readings(slo_check_readings, checked_at)


@pytest.mark.parametrize(
"checked_at, values, tuning_values",
[
(
datetime(2020, 1, 21, 12, 0, 1),
[[0.05, 0.35, 0.01, 0.03]],
[[0.0, 0.0, 0.0, 0.0]],
),
(
datetime(2020, 1, 21, 12, 10, 1),
[[0.05, 0.35, 0.01, 0.03], [0.05, 0.35, 0.01, 0.03]],
[
[0.0, 0.0, 0.0, 0.0],
[0.0, 0.0, 0.0, 0.0],
],
),
],
)
def test_timeseries_slos_skip_zero_metric(
observer: FastFailObserver,
checked_at: datetime,
metric: Metric,
tuning_metric: Metric,
values: List[List[float]],
tuning_values: List[List[float]],
) -> None:
slo_check_readings: Dict[str, List[TimeSeries]] = {
metric.name: _make_time_series_list(metric, values),
tuning_metric.name: _make_time_series_list(tuning_metric, tuning_values),
}

servo.logging.set_level("DEBUG")
observer.check_readings(slo_check_readings, checked_at)


@pytest.mark.parametrize(
"checked_at, values, tuning_values",
[
(
datetime(2020, 1, 21, 12, 0, 1),
[[0.0, 0.0, 0.0, 0.0]],
[[0.05, 0.35, 0.01, 0.03]],
),
(
datetime(2020, 1, 21, 12, 10, 1),
[
[0.0, 0.0, 0.0, 0.0],
[0.0, 0.0, 0.0, 0.0],
],
[[0.05, 0.35, 0.01, 0.03], [0.05, 0.35, 0.01, 0.03]],
),
],
)
def test_timeseries_slos_skip_zero_threshold(
observer: FastFailObserver,
checked_at: datetime,
metric: Metric,
tuning_metric: Metric,
values: List[List[float]],
tuning_values: List[List[float]],
) -> None:
slo_check_readings: Dict[str, List[TimeSeries]] = {
metric.name: _make_time_series_list(metric, values),
tuning_metric.name: _make_time_series_list(tuning_metric, tuning_values),
}

servo.logging.set_level("DEBUG")
observer.check_readings(slo_check_readings, checked_at)


@pytest.mark.parametrize(
"checked_at, values, tuning_values, error_str",
[
@@ -276,6 +341,70 @@ def test_data_point_slos_pass(
observer.check_readings(slo_check_readings, checked_at)


@pytest.mark.parametrize(
"checked_at, values, tuning_values",
[
(
datetime(2020, 1, 21, 12, 0, 1),
[0.05, 0.35, 0.01, 0.03],
[0.0, 0.0, 0.0, 0.0],
),
(
datetime(2020, 1, 21, 12, 10, 1),
[0.24],
[0.0],
),
],
)
def test_data_point_slos_skip_zero_metric(
observer: FastFailObserver,
checked_at: datetime,
metric: Metric,
tuning_metric: Metric,
values: List[float],
tuning_values: List[float],
) -> None:
slo_check_readings: Dict[str, List[DataPoint]] = {
metric.name: _make_data_point_list(metric, values),
tuning_metric.name: _make_data_point_list(tuning_metric, tuning_values),
}

servo.logging.set_level("DEBUG")
observer.check_readings(slo_check_readings, checked_at)


@pytest.mark.parametrize(
"checked_at, values, tuning_values",
[
(
datetime(2020, 1, 21, 12, 0, 1),
[0.0, 0.0, 0.0, 0.0],
[0.05, 0.35, 0.01, 0.03],
),
(
datetime(2020, 1, 21, 12, 10, 1),
[0.0],
[0.24],
),
],
)
def test_data_point_slos_skip_zero_threshold(
observer: FastFailObserver,
checked_at: datetime,
metric: Metric,
tuning_metric: Metric,
values: List[float],
tuning_values: List[float],
) -> None:
slo_check_readings: Dict[str, List[DataPoint]] = {
metric.name: _make_data_point_list(metric, values),
tuning_metric.name: _make_data_point_list(tuning_metric, tuning_values),
}

servo.logging.set_level("DEBUG")
observer.check_readings(slo_check_readings, checked_at)


@pytest.mark.parametrize(
"checked_at, values, tuning_values, error_str",
[