Skip to content

Commit

Permalink
Add TempoUserConfigurableOverridesReloadFailing alert (#2784)
Browse files Browse the repository at this point in the history
* Add TempoUserConfigurableOverridesReloadFailing alert

* Update CHANGELOG.md

* Fail startup on user-configurable overrides error
  • Loading branch information
Koenraad Verheyden authored Aug 15, 2023
1 parent 90cf628 commit 12bdeff
Show file tree
Hide file tree
Showing 5 changed files with 33 additions and 0 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
* [ENHANCEMENT] Assert ingestion rate limits as early as possible [#2640](https://github.com/grafana/tempo/pull/2703) (@mghildiy)
* [ENHANCEMENT] Add several metrics-generator fields to user-configurable overrides [#2711](https://github.com/grafana/tempo/pull/2711) (@kvrhdn)
* [ENHANCEMENT] Update /api/metrics/summary to correctly handle missing attributes and improve performance of TraceQL `select()` queries. [#2765](https://github.com/grafana/tempo/pull/2765) (@mdisibio)
* [ENHANCEMENT] Add `TempoUserConfigurableOverridesReloadFailing` alert [#2784](https://github.com/grafana/tempo/pull/2784) (@kvrhdn)
* [BUGFIX] Fix panic in metrics summary api [#2738](https://github.com/grafana/tempo/pull/2738) (@mdisibio)
* [BUGFIX] Fix node role auth IDMSv1 [#2760](https://github.com/grafana/tempo/pull/2760) (@coufalja)
* [BUGFIX] Only search ingester blocks that fall within the request time range. [#2783](https://github.com/grafana/tempo/pull/2783) (@joe-elliott)
Expand Down
8 changes: 8 additions & 0 deletions modules/overrides/user_configurable_overrides.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ import (
"github.com/opentracing/opentracing-go"
"github.com/pkg/errors"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promauto"
"golang.org/x/exp/slices"
"gopkg.in/yaml.v2"

Expand All @@ -24,6 +25,12 @@ import (
"github.com/grafana/tempo/tempodb/backend"
)

// metricUserConfigurableOverridesReloadFailed counts how many times reloading
// the user-configurable overrides has failed. It is incremented by the periodic
// reload in running() each time reloadAllTenantLimits returns an error, and is
// exposed as tempo_overrides_user_configurable_overrides_reload_failed_total,
// the series the TempoUserConfigurableOverridesReloadFailing alert queries.
var metricUserConfigurableOverridesReloadFailed = promauto.NewCounter(prometheus.CounterOpts{
Namespace: "tempo",
Name: "overrides_user_configurable_overrides_reload_failed_total",
Help: "How often reloading the user-configurable overrides has failed",
})

type UserConfigurableOverridesConfig struct {
Enabled bool `yaml:"enabled"`

Expand Down Expand Up @@ -109,6 +116,7 @@ func (o *userConfigurableOverridesManager) running(ctx context.Context) error {
case <-ticker.C:
err := o.reloadAllTenantLimits(ctx)
if err != nil {
metricUserConfigurableOverridesReloadFailed.Inc()
level.Error(o.logger).Log("msg", "failed to refresh user-configurable config", "err", err)
}
continue
Expand Down
9 changes: 9 additions & 0 deletions operations/tempo-mixin-compiled/alerts.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,15 @@
"for": "15m"
"labels":
"severity": "warning"
- "alert": "TempoUserConfigurableOverridesReloadFailing"
"annotations":
"message": "Greater than 5 user-configurable overrides reloads failed in the past hour."
"runbook_url": "https://github.com/grafana/tempo/tree/main/operations/tempo-mixin/runbook.md#TempoTenantIndexFailures"
"expr": |
sum by (cluster, namespace) (increase(tempo_overrides_user_configurable_overrides_reload_failed_total{}[1h])) > 5 and
sum by (cluster, namespace) (increase(tempo_overrides_user_configurable_overrides_reload_failed_total{}[5m])) > 0
"labels":
"severity": "critical"
- "alert": "TempoProvisioningTooManyWrites"
"annotations":
"message": "Ingesters in {{ $labels.cluster }}/{{ $labels.namespace }} are receiving more data/second than desired, add more ingesters."
Expand Down
14 changes: 14 additions & 0 deletions operations/tempo-mixin/alerts.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,20 @@
runbook_url: 'https://github.com/grafana/tempo/tree/main/operations/tempo-mixin/runbook.md#TempoBadOverrides',
},
},
{
alert: 'TempoUserConfigurableOverridesReloadFailing',
expr: |||
sum by (%s) (increase(tempo_overrides_user_configurable_overrides_reload_failed_total{}[1h])) > %s and
sum by (%s) (increase(tempo_overrides_user_configurable_overrides_reload_failed_total{}[5m])) > 0
||| % [$._config.group_by_cluster, $._config.alerts.user_configurable_overrides_polls_per_hour_failed, $._config.group_by_cluster],
labels: {
severity: 'critical',
},
annotations: {
message: 'Greater than %s user-configurable overrides reloads failed in the past hour.' % $._config.alerts.user_configurable_overrides_polls_per_hour_failed,
runbook_url: 'https://github.com/grafana/tempo/tree/main/operations/tempo-mixin/runbook.md#TempoTenantIndexFailures',
},
},
// ingesters
{
alert: 'TempoProvisioningTooManyWrites',
Expand Down
1 change: 1 addition & 0 deletions operations/tempo-mixin/config.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
compactions_per_hour_failed: 2,
flushes_per_hour_failed: 2,
polls_per_hour_failed: 2,
user_configurable_overrides_polls_per_hour_failed: 5,
max_tenant_index_age_seconds: 600,
p99_request_threshold_seconds: 3,
p99_request_exclude_regex: 'metrics|/frontend.Frontend/Process|debug_pprof',
Expand Down

0 comments on commit 12bdeff

Please sign in to comment.