Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Alertmanager: Skip starting the Alertmanager for Grafana tenants unless they have a promoted, non-default configuration #10491

Merged
11 changes: 11 additions & 0 deletions cmd/mimir/config-descriptor.json
Original file line number Diff line number Diff line change
Expand Up @@ -15576,6 +15576,17 @@
"fieldType": "boolean",
"fieldCategory": "experimental"
},
{
"kind": "field",
"name": "grafana_alertmanager_conditionally_skip_tenant_suffix",
"required": false,
"desc": "Skip starting the Alertmanager for tenants matching this suffix unless they have a promoted, non-default Grafana Alertmanager configuration.",
"fieldValue": null,
"fieldDefaultValue": "",
"fieldFlag": "alertmanager.grafana-alertmanager-conditionally-skip-tenant-suffix",
"fieldType": "string",
"fieldCategory": "experimental"
},
{
"kind": "field",
"name": "max_concurrent_get_requests_per_tenant",
Expand Down
2 changes: 2 additions & 0 deletions cmd/mimir/help-all.txt.tmpl
Original file line number Diff line number Diff line change
Expand Up @@ -239,6 +239,8 @@ Usage of ./cmd/mimir/mimir:
Enables periodic cleanup of alertmanager stateful data (notification logs and silences) from object storage. When enabled, data is removed for any tenant that does not have a configuration. (default true)
-alertmanager.grafana-alertmanager-compatibility-enabled
[experimental] Enable routes to support the migration and operation of the Grafana Alertmanager.
-alertmanager.grafana-alertmanager-conditionally-skip-tenant-suffix string
[experimental] Skip starting the Alertmanager for tenants matching this suffix unless they have a promoted, non-default Grafana Alertmanager configuration.
-alertmanager.log-parsing-label-matchers
[experimental] Enable logging when parsing label matchers. This flag is intended to be used with -alertmanager.utf8-strict-mode-enabled to validate UTF-8 strict mode is working as intended.
-alertmanager.max-alerts-count int
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2372,6 +2372,11 @@ sharding_ring:
# CLI flag: -alertmanager.grafana-alertmanager-compatibility-enabled
[grafana_alertmanager_compatibility_enabled: <boolean> | default = false]

# (experimental) Skip starting the Alertmanager for tenants matching this suffix
# unless they have a promoted, non-default Grafana Alertmanager configuration.
# CLI flag: -alertmanager.grafana-alertmanager-conditionally-skip-tenant-suffix
[grafana_alertmanager_conditionally_skip_tenant_suffix: <string> | default = ""]

# (advanced) Maximum number of concurrent GET requests allowed per tenant. The
# zero value (and negative values) result in a limit of GOMAXPROCS or 8,
# whichever is larger. Status code 503 is served for GET requests that would
Expand Down
1 change: 1 addition & 0 deletions pkg/alertmanager/alertmanager.go
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,7 @@ type Config struct {
PersisterConfig PersisterConfig

GrafanaAlertmanagerCompatibility bool
GrafanaAlertmanagerTenantSuffix string
}

// An Alertmanager manages the alerts for one user.
Expand Down
67 changes: 33 additions & 34 deletions pkg/alertmanager/multitenant.go
Original file line number Diff line number Diff line change
Expand Up @@ -81,8 +81,10 @@ type MultitenantAlertmanagerConfig struct {

PeerTimeout time.Duration `yaml:"peer_timeout" category:"advanced"`

EnableAPI bool `yaml:"enable_api" category:"advanced"`
GrafanaAlertmanagerCompatibilityEnabled bool `yaml:"grafana_alertmanager_compatibility_enabled" category:"experimental"`
EnableAPI bool `yaml:"enable_api" category:"advanced"`

GrafanaAlertmanagerCompatibilityEnabled bool `yaml:"grafana_alertmanager_compatibility_enabled" category:"experimental"`
GrafanaAlertmanagerTenantSuffix string `yaml:"grafana_alertmanager_conditionally_skip_tenant_suffix" category:"experimental"`

MaxConcurrentGetRequestsPerTenant int `yaml:"max_concurrent_get_requests_per_tenant" category:"advanced"`

Expand Down Expand Up @@ -126,6 +128,7 @@ func (cfg *MultitenantAlertmanagerConfig) RegisterFlags(f *flag.FlagSet, logger

f.BoolVar(&cfg.EnableAPI, "alertmanager.enable-api", true, "Enable the alertmanager config API.")
f.BoolVar(&cfg.GrafanaAlertmanagerCompatibilityEnabled, "alertmanager.grafana-alertmanager-compatibility-enabled", false, "Enable routes to support the migration and operation of the Grafana Alertmanager.")
f.StringVar(&cfg.GrafanaAlertmanagerTenantSuffix, "alertmanager.grafana-alertmanager-conditionally-skip-tenant-suffix", "", "Skip starting the Alertmanager for tenants matching this suffix unless they have a promoted, non-default Grafana Alertmanager configuration.")
f.IntVar(&cfg.MaxConcurrentGetRequestsPerTenant, "alertmanager.max-concurrent-get-requests-per-tenant", 0, "Maximum number of concurrent GET requests allowed per tenant. The zero value (and negative values) result in a limit of GOMAXPROCS or 8, whichever is larger. Status code 503 is served for GET requests that would exceed the concurrency limit.")

f.BoolVar(&cfg.EnableStateCleanup, "alertmanager.enable-state-cleanup", true, "Enables periodic cleanup of alertmanager stateful data (notification logs and silences) from object storage. When enabled, data is removed for any tenant that does not have a configuration.")
Expand Down Expand Up @@ -664,14 +667,21 @@ func (am *MultitenantAlertmanager) isUserOwned(userID string) bool {

func (am *MultitenantAlertmanager) syncConfigs(ctx context.Context, cfgMap map[string]alertspb.AlertConfigDescs) {
level.Debug(am.logger).Log("msg", "adding configurations", "num_configs", len(cfgMap))
amInitSkipped := map[string]struct{}{}
for user, cfgs := range cfgMap {
cfg, err := am.computeConfig(cfgs)
cfg, startAM, err := am.computeConfig(cfgs)
if err != nil {
am.multitenantMetrics.lastReloadSuccessful.WithLabelValues(user).Set(float64(0))
level.Warn(am.logger).Log("msg", "error computing config", "err", err)
continue
}

if !startAM {
level.Debug(am.logger).Log("msg", "not initializing alertmanager for grafana tenant without a promoted, non-default configuration", "user", user)
amInitSkipped[user] = struct{}{}
continue
}

if err := am.syncStates(ctx, cfg); err != nil {
level.Error(am.logger).Log("msg", "error syncing states", "err", err, "user", user)
}
Expand All @@ -687,10 +697,11 @@ func (am *MultitenantAlertmanager) syncConfigs(ctx context.Context, cfgMap map[s
}

userAlertmanagersToStop := map[string]*Alertmanager{}

am.alertmanagersMtx.Lock()
for userID, userAM := range am.alertmanagers {
if _, exists := cfgMap[userID]; !exists {
_, exists := cfgMap[userID]
_, initSkipped := amInitSkipped[userID]
if !exists || initSkipped {
Comment on lines +703 to +704
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If the skipped Alertmanager was already running, I'm removing its entries from the in-memory `am.alertmanagers` and `am.cfgs` maps and adding it to the map of Alertmanagers to stop.

userAlertmanagersToStop[userID] = userAM
delete(am.alertmanagers, userID)
delete(am.cfgs, userID)
Expand All @@ -710,42 +721,29 @@ func (am *MultitenantAlertmanager) syncConfigs(ctx context.Context, cfgMap map[s
}

// computeConfig takes an AlertConfigDescs struct containing Mimir and Grafana configurations.
// It returns the final configuration and external URL the Alertmanager will use.
func (am *MultitenantAlertmanager) computeConfig(cfgs alertspb.AlertConfigDescs) (amConfig, error) {
// It returns the final configuration and a bool indicating whether the Alertmanager should be started for the tenant.
func (am *MultitenantAlertmanager) computeConfig(cfgs alertspb.AlertConfigDescs) (amConfig, bool, error) {
cfg := amConfig{
AlertConfigDesc: cfgs.Mimir,
tmplExternalURL: am.cfg.ExternalURL.URL,
}

switch {
// Mimir configuration.
case !cfgs.Grafana.Promoted:
level.Debug(am.logger).Log("msg", "grafana configuration not promoted, using mimir config", "user", cfgs.Mimir.User)
return cfg, nil

case cfgs.Grafana.Default:
level.Debug(am.logger).Log("msg", "grafana configuration is default, using mimir config", "user", cfgs.Mimir.User)
return cfg, nil

case cfgs.Grafana.RawConfig == "":
level.Debug(am.logger).Log("msg", "grafana configuration is empty, using mimir config", "user", cfgs.Mimir.User)
return cfg, nil

// Grafana configuration.
case cfgs.Mimir.RawConfig == am.fallbackConfig:
level.Debug(am.logger).Log("msg", "mimir configuration is default, using grafana config with the default globals", "user", cfgs.Mimir.User)
return createUsableGrafanaConfig(cfgs.Grafana, cfgs.Mimir.RawConfig)

case cfgs.Mimir.RawConfig == "":
level.Debug(am.logger).Log("msg", "mimir configuration is empty, using grafana config with the default globals", "user", cfgs.Grafana.User)
return createUsableGrafanaConfig(cfgs.Grafana, am.fallbackConfig)
// If the Grafana configuration is either default, not promoted, or empty, use the Mimir configuration.
if !cfgs.Grafana.Promoted || cfgs.Grafana.Default || cfgs.Grafana.RawConfig == "" {
level.Debug(am.logger).Log("msg", "using mimir config", "user", cfgs.Mimir.User)
isGrafanaTenant := am.cfg.GrafanaAlertmanagerTenantSuffix != "" && strings.HasSuffix(cfgs.Mimir.User, am.cfg.GrafanaAlertmanagerTenantSuffix)
return cfg, !isGrafanaTenant, nil
}
Comment on lines +731 to +736
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I decided to remove the granular debug log lines to shorten the function.


// Both configurations.
// TODO: merge configurations.
default:
level.Warn(am.logger).Log("msg", "merging configurations not implemented, using mimir config", "user", cfgs.Mimir.User)
return cfg, nil
// If the Mimir configuration is either default or empty, use the Grafana configuration.
if cfgs.Mimir.RawConfig == am.fallbackConfig || cfgs.Mimir.RawConfig == "" {
level.Debug(am.logger).Log("msg", "using grafana config with the default globals", "user", cfgs.Mimir.User)
cfg, err := createUsableGrafanaConfig(cfgs.Grafana, am.fallbackConfig)
return cfg, true, err
}

level.Warn(am.logger).Log("msg", "merging configurations not implemented, using mimir config", "user", cfgs.Mimir.User)
return cfg, true, nil
}

// syncStates promotes/unpromotes the Grafana state and updates the 'promoted' flag if needed.
Expand Down Expand Up @@ -935,6 +933,7 @@ func (am *MultitenantAlertmanager) newAlertmanager(userID string, amConfig *defi
Limits: am.limits,
Features: am.features,
GrafanaAlertmanagerCompatibility: am.cfg.GrafanaAlertmanagerCompatibilityEnabled,
GrafanaAlertmanagerTenantSuffix: am.cfg.GrafanaAlertmanagerTenantSuffix,
}, reg)
if err != nil {
return nil, fmt.Errorf("unable to start Alertmanager for user %v: %v", userID, err)
Expand Down
Loading
Loading