Skip to content

Commit

Permalink
Alertmanager: Skip starting the Alertmanager for Grafana tenants unless they have a promoted, non-default configuration (#10491)
Browse files Browse the repository at this point in the history

* (PoC) Alertmanager: Don't start Alertmanager if the config is default

* tests

* scope everything to grafana tenants

* make reference-help

* move logic to computeConfig

* simplify computeConfig

* refactor syncConfigs

* make doc

* update arg name

* make reference-help
  • Loading branch information
santihernandezc authored Jan 24, 2025
1 parent 2035601 commit 8f44959
Show file tree
Hide file tree
Showing 6 changed files with 105 additions and 75 deletions.
11 changes: 11 additions & 0 deletions cmd/mimir/config-descriptor.json
Original file line number Diff line number Diff line change
Expand Up @@ -15586,6 +15586,17 @@
"fieldType": "boolean",
"fieldCategory": "experimental"
},
{
"kind": "field",
"name": "grafana_alertmanager_conditionally_skip_tenant_suffix",
"required": false,
"desc": "Skip starting the Alertmanager for tenants matching this suffix unless they have a promoted, non-default Grafana Alertmanager configuration.",
"fieldValue": null,
"fieldDefaultValue": "",
"fieldFlag": "alertmanager.grafana-alertmanager-conditionally-skip-tenant-suffix",
"fieldType": "string",
"fieldCategory": "experimental"
},
{
"kind": "field",
"name": "max_concurrent_get_requests_per_tenant",
Expand Down
2 changes: 2 additions & 0 deletions cmd/mimir/help-all.txt.tmpl
Original file line number Diff line number Diff line change
Expand Up @@ -239,6 +239,8 @@ Usage of ./cmd/mimir/mimir:
Enables periodic cleanup of alertmanager stateful data (notification logs and silences) from object storage. When enabled, data is removed for any tenant that does not have a configuration. (default true)
-alertmanager.grafana-alertmanager-compatibility-enabled
[experimental] Enable routes to support the migration and operation of the Grafana Alertmanager.
-alertmanager.grafana-alertmanager-conditionally-skip-tenant-suffix string
[experimental] Skip starting the Alertmanager for tenants matching this suffix unless they have a promoted, non-default Grafana Alertmanager configuration.
-alertmanager.log-parsing-label-matchers
[experimental] Enable logging when parsing label matchers. This flag is intended to be used with -alertmanager.utf8-strict-mode-enabled to validate UTF-8 strict mode is working as intended.
-alertmanager.max-alerts-count int
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2372,6 +2372,11 @@ sharding_ring:
# CLI flag: -alertmanager.grafana-alertmanager-compatibility-enabled
[grafana_alertmanager_compatibility_enabled: <boolean> | default = false]
# (experimental) Skip starting the Alertmanager for tenants matching this suffix
# unless they have a promoted, non-default Grafana Alertmanager configuration.
# CLI flag: -alertmanager.grafana-alertmanager-conditionally-skip-tenant-suffix
[grafana_alertmanager_conditionally_skip_tenant_suffix: <string> | default = ""]
# (advanced) Maximum number of concurrent GET requests allowed per tenant. The
# zero value (and negative values) result in a limit of GOMAXPROCS or 8,
# whichever is larger. Status code 503 is served for GET requests that would
Expand Down
1 change: 1 addition & 0 deletions pkg/alertmanager/alertmanager.go
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,7 @@ type Config struct {
PersisterConfig PersisterConfig

GrafanaAlertmanagerCompatibility bool
GrafanaAlertmanagerTenantSuffix string
}

// An Alertmanager manages the alerts for one user.
Expand Down
67 changes: 33 additions & 34 deletions pkg/alertmanager/multitenant.go
Original file line number Diff line number Diff line change
Expand Up @@ -81,8 +81,10 @@ type MultitenantAlertmanagerConfig struct {

PeerTimeout time.Duration `yaml:"peer_timeout" category:"advanced"`

EnableAPI bool `yaml:"enable_api" category:"advanced"`
GrafanaAlertmanagerCompatibilityEnabled bool `yaml:"grafana_alertmanager_compatibility_enabled" category:"experimental"`
EnableAPI bool `yaml:"enable_api" category:"advanced"`

GrafanaAlertmanagerCompatibilityEnabled bool `yaml:"grafana_alertmanager_compatibility_enabled" category:"experimental"`
GrafanaAlertmanagerTenantSuffix string `yaml:"grafana_alertmanager_conditionally_skip_tenant_suffix" category:"experimental"`

MaxConcurrentGetRequestsPerTenant int `yaml:"max_concurrent_get_requests_per_tenant" category:"advanced"`

Expand Down Expand Up @@ -126,6 +128,7 @@ func (cfg *MultitenantAlertmanagerConfig) RegisterFlags(f *flag.FlagSet, logger

f.BoolVar(&cfg.EnableAPI, "alertmanager.enable-api", true, "Enable the alertmanager config API.")
f.BoolVar(&cfg.GrafanaAlertmanagerCompatibilityEnabled, "alertmanager.grafana-alertmanager-compatibility-enabled", false, "Enable routes to support the migration and operation of the Grafana Alertmanager.")
f.StringVar(&cfg.GrafanaAlertmanagerTenantSuffix, "alertmanager.grafana-alertmanager-conditionally-skip-tenant-suffix", "", "Skip starting the Alertmanager for tenants matching this suffix unless they have a promoted, non-default Grafana Alertmanager configuration.")
f.IntVar(&cfg.MaxConcurrentGetRequestsPerTenant, "alertmanager.max-concurrent-get-requests-per-tenant", 0, "Maximum number of concurrent GET requests allowed per tenant. The zero value (and negative values) result in a limit of GOMAXPROCS or 8, whichever is larger. Status code 503 is served for GET requests that would exceed the concurrency limit.")

f.BoolVar(&cfg.EnableStateCleanup, "alertmanager.enable-state-cleanup", true, "Enables periodic cleanup of alertmanager stateful data (notification logs and silences) from object storage. When enabled, data is removed for any tenant that does not have a configuration.")
Expand Down Expand Up @@ -664,14 +667,21 @@ func (am *MultitenantAlertmanager) isUserOwned(userID string) bool {

func (am *MultitenantAlertmanager) syncConfigs(ctx context.Context, cfgMap map[string]alertspb.AlertConfigDescs) {
level.Debug(am.logger).Log("msg", "adding configurations", "num_configs", len(cfgMap))
amInitSkipped := map[string]struct{}{}
for user, cfgs := range cfgMap {
cfg, err := am.computeConfig(cfgs)
cfg, startAM, err := am.computeConfig(cfgs)
if err != nil {
am.multitenantMetrics.lastReloadSuccessful.WithLabelValues(user).Set(float64(0))
level.Warn(am.logger).Log("msg", "error computing config", "err", err)
continue
}

if !startAM {
level.Debug(am.logger).Log("msg", "not initializing alertmanager for grafana tenant without a promoted, non-default configuration", "user", user)
amInitSkipped[user] = struct{}{}
continue
}

if err := am.syncStates(ctx, cfg); err != nil {
level.Error(am.logger).Log("msg", "error syncing states", "err", err, "user", user)
}
Expand All @@ -687,10 +697,11 @@ func (am *MultitenantAlertmanager) syncConfigs(ctx context.Context, cfgMap map[s
}

userAlertmanagersToStop := map[string]*Alertmanager{}

am.alertmanagersMtx.Lock()
for userID, userAM := range am.alertmanagers {
if _, exists := cfgMap[userID]; !exists {
_, exists := cfgMap[userID]
_, initSkipped := amInitSkipped[userID]
if !exists || initSkipped {
userAlertmanagersToStop[userID] = userAM
delete(am.alertmanagers, userID)
delete(am.cfgs, userID)
Expand All @@ -710,42 +721,29 @@ func (am *MultitenantAlertmanager) syncConfigs(ctx context.Context, cfgMap map[s
}

// computeConfig takes an AlertConfigDescs struct containing Mimir and Grafana configurations.
// It returns the final configuration and external URL the Alertmanager will use.
func (am *MultitenantAlertmanager) computeConfig(cfgs alertspb.AlertConfigDescs) (amConfig, error) {
// It returns the final configuration and a bool indicating whether the Alertmanager should be started for the tenant.
func (am *MultitenantAlertmanager) computeConfig(cfgs alertspb.AlertConfigDescs) (amConfig, bool, error) {
cfg := amConfig{
AlertConfigDesc: cfgs.Mimir,
tmplExternalURL: am.cfg.ExternalURL.URL,
}

switch {
// Mimir configuration.
case !cfgs.Grafana.Promoted:
level.Debug(am.logger).Log("msg", "grafana configuration not promoted, using mimir config", "user", cfgs.Mimir.User)
return cfg, nil

case cfgs.Grafana.Default:
level.Debug(am.logger).Log("msg", "grafana configuration is default, using mimir config", "user", cfgs.Mimir.User)
return cfg, nil

case cfgs.Grafana.RawConfig == "":
level.Debug(am.logger).Log("msg", "grafana configuration is empty, using mimir config", "user", cfgs.Mimir.User)
return cfg, nil

// Grafana configuration.
case cfgs.Mimir.RawConfig == am.fallbackConfig:
level.Debug(am.logger).Log("msg", "mimir configuration is default, using grafana config with the default globals", "user", cfgs.Mimir.User)
return createUsableGrafanaConfig(cfgs.Grafana, cfgs.Mimir.RawConfig)

case cfgs.Mimir.RawConfig == "":
level.Debug(am.logger).Log("msg", "mimir configuration is empty, using grafana config with the default globals", "user", cfgs.Grafana.User)
return createUsableGrafanaConfig(cfgs.Grafana, am.fallbackConfig)
// If the Grafana configuration is either default, not promoted, or empty, use the Mimir configuration.
if !cfgs.Grafana.Promoted || cfgs.Grafana.Default || cfgs.Grafana.RawConfig == "" {
level.Debug(am.logger).Log("msg", "using mimir config", "user", cfgs.Mimir.User)
isGrafanaTenant := am.cfg.GrafanaAlertmanagerTenantSuffix != "" && strings.HasSuffix(cfgs.Mimir.User, am.cfg.GrafanaAlertmanagerTenantSuffix)
return cfg, !isGrafanaTenant, nil
}

// Both configurations.
// TODO: merge configurations.
default:
level.Warn(am.logger).Log("msg", "merging configurations not implemented, using mimir config", "user", cfgs.Mimir.User)
return cfg, nil
// If the Mimir configuration is either default or empty, use the Grafana configuration.
if cfgs.Mimir.RawConfig == am.fallbackConfig || cfgs.Mimir.RawConfig == "" {
level.Debug(am.logger).Log("msg", "using grafana config with the default globals", "user", cfgs.Mimir.User)
cfg, err := createUsableGrafanaConfig(cfgs.Grafana, am.fallbackConfig)
return cfg, true, err
}

level.Warn(am.logger).Log("msg", "merging configurations not implemented, using mimir config", "user", cfgs.Mimir.User)
return cfg, true, nil
}

// syncStates promotes/unpromotes the Grafana state and updates the 'promoted' flag if needed.
Expand Down Expand Up @@ -935,6 +933,7 @@ func (am *MultitenantAlertmanager) newAlertmanager(userID string, amConfig *defi
Limits: am.limits,
Features: am.features,
GrafanaAlertmanagerCompatibility: am.cfg.GrafanaAlertmanagerCompatibilityEnabled,
GrafanaAlertmanagerTenantSuffix: am.cfg.GrafanaAlertmanagerTenantSuffix,
}, reg)
if err != nil {
return nil, fmt.Errorf("unable to start Alertmanager for user %v: %v", userID, err)
Expand Down
Loading

0 comments on commit 8f44959

Please sign in to comment.