From 8f449592347146e3993fd67a17ffd3a062cc2167 Mon Sep 17 00:00:00 2001 From: Santiago Date: Fri, 24 Jan 2025 11:46:51 +0100 Subject: [PATCH] Alertmanager: Skip starting the Alertmanager for Grafana tenants unless they have a promoted, non-default configuration (#10491) * (PoC) Alertmanager: Don't start Alertmanager if the config is default * tests * scope everything to grafana tenants * make reference-help * move logic to computeConfig * simplify computeConfig * refactor syncConfigs * make doc * update arg name * make reference-help --- cmd/mimir/config-descriptor.json | 11 +++ cmd/mimir/help-all.txt.tmpl | 2 + .../configuration-parameters/index.md | 5 + pkg/alertmanager/alertmanager.go | 1 + pkg/alertmanager/multitenant.go | 67 +++++++------ pkg/alertmanager/multitenant_test.go | 94 +++++++++++-------- 6 files changed, 105 insertions(+), 75 deletions(-) diff --git a/cmd/mimir/config-descriptor.json b/cmd/mimir/config-descriptor.json index fbfbe2af753..f0b6c93d304 100644 --- a/cmd/mimir/config-descriptor.json +++ b/cmd/mimir/config-descriptor.json @@ -15586,6 +15586,17 @@ "fieldType": "boolean", "fieldCategory": "experimental" }, + { + "kind": "field", + "name": "grafana_alertmanager_conditionally_skip_tenant_suffix", + "required": false, + "desc": "Skip starting the Alertmanager for tenants matching this suffix unless they have a promoted, non-default Grafana Alertmanager configuration.", + "fieldValue": null, + "fieldDefaultValue": "", + "fieldFlag": "alertmanager.grafana-alertmanager-conditionally-skip-tenant-suffix", + "fieldType": "string", + "fieldCategory": "experimental" + }, { "kind": "field", "name": "max_concurrent_get_requests_per_tenant", diff --git a/cmd/mimir/help-all.txt.tmpl b/cmd/mimir/help-all.txt.tmpl index b5c57b3780a..fdea624ec68 100644 --- a/cmd/mimir/help-all.txt.tmpl +++ b/cmd/mimir/help-all.txt.tmpl @@ -239,6 +239,8 @@ Usage of ./cmd/mimir/mimir: Enables periodic cleanup of alertmanager stateful data (notification logs and silences) from 
object storage. When enabled, data is removed for any tenant that does not have a configuration. (default true) -alertmanager.grafana-alertmanager-compatibility-enabled [experimental] Enable routes to support the migration and operation of the Grafana Alertmanager. + -alertmanager.grafana-alertmanager-conditionally-skip-tenant-suffix string + [experimental] Skip starting the Alertmanager for tenants matching this suffix unless they have a promoted, non-default Grafana Alertmanager configuration. -alertmanager.log-parsing-label-matchers [experimental] Enable logging when parsing label matchers. This flag is intended to be used with -alertmanager.utf8-strict-mode-enabled to validate UTF-8 strict mode is working as intended. -alertmanager.max-alerts-count int diff --git a/docs/sources/mimir/configure/configuration-parameters/index.md b/docs/sources/mimir/configure/configuration-parameters/index.md index fd481fc24a2..34381239ddd 100644 --- a/docs/sources/mimir/configure/configuration-parameters/index.md +++ b/docs/sources/mimir/configure/configuration-parameters/index.md @@ -2372,6 +2372,11 @@ sharding_ring: # CLI flag: -alertmanager.grafana-alertmanager-compatibility-enabled [grafana_alertmanager_compatibility_enabled: <boolean> | default = false] +# (experimental) Skip starting the Alertmanager for tenants matching this suffix +# unless they have a promoted, non-default Grafana Alertmanager configuration. +# CLI flag: -alertmanager.grafana-alertmanager-conditionally-skip-tenant-suffix +[grafana_alertmanager_conditionally_skip_tenant_suffix: <string> | default = ""] + # (advanced) Maximum number of concurrent GET requests allowed per tenant. The # zero value (and negative values) result in a limit of GOMAXPROCS or 8, # whichever is larger. 
Status code 503 is served for GET requests that would diff --git a/pkg/alertmanager/alertmanager.go b/pkg/alertmanager/alertmanager.go index 52bd99bdb30..dc8ebf27b3c 100644 --- a/pkg/alertmanager/alertmanager.go +++ b/pkg/alertmanager/alertmanager.go @@ -104,6 +104,7 @@ type Config struct { PersisterConfig PersisterConfig GrafanaAlertmanagerCompatibility bool + GrafanaAlertmanagerTenantSuffix string } // An Alertmanager manages the alerts for one user. diff --git a/pkg/alertmanager/multitenant.go b/pkg/alertmanager/multitenant.go index 336fb5aa36a..cfbc3572556 100644 --- a/pkg/alertmanager/multitenant.go +++ b/pkg/alertmanager/multitenant.go @@ -81,8 +81,10 @@ type MultitenantAlertmanagerConfig struct { PeerTimeout time.Duration `yaml:"peer_timeout" category:"advanced"` - EnableAPI bool `yaml:"enable_api" category:"advanced"` - GrafanaAlertmanagerCompatibilityEnabled bool `yaml:"grafana_alertmanager_compatibility_enabled" category:"experimental"` + EnableAPI bool `yaml:"enable_api" category:"advanced"` + + GrafanaAlertmanagerCompatibilityEnabled bool `yaml:"grafana_alertmanager_compatibility_enabled" category:"experimental"` + GrafanaAlertmanagerTenantSuffix string `yaml:"grafana_alertmanager_conditionally_skip_tenant_suffix" category:"experimental"` MaxConcurrentGetRequestsPerTenant int `yaml:"max_concurrent_get_requests_per_tenant" category:"advanced"` @@ -126,6 +128,7 @@ func (cfg *MultitenantAlertmanagerConfig) RegisterFlags(f *flag.FlagSet, logger f.BoolVar(&cfg.EnableAPI, "alertmanager.enable-api", true, "Enable the alertmanager config API.") f.BoolVar(&cfg.GrafanaAlertmanagerCompatibilityEnabled, "alertmanager.grafana-alertmanager-compatibility-enabled", false, "Enable routes to support the migration and operation of the Grafana Alertmanager.") + f.StringVar(&cfg.GrafanaAlertmanagerTenantSuffix, "alertmanager.grafana-alertmanager-conditionally-skip-tenant-suffix", "", "Skip starting the Alertmanager for tenants matching this suffix unless they have a 
promoted, non-default Grafana Alertmanager configuration.") f.IntVar(&cfg.MaxConcurrentGetRequestsPerTenant, "alertmanager.max-concurrent-get-requests-per-tenant", 0, "Maximum number of concurrent GET requests allowed per tenant. The zero value (and negative values) result in a limit of GOMAXPROCS or 8, whichever is larger. Status code 503 is served for GET requests that would exceed the concurrency limit.") f.BoolVar(&cfg.EnableStateCleanup, "alertmanager.enable-state-cleanup", true, "Enables periodic cleanup of alertmanager stateful data (notification logs and silences) from object storage. When enabled, data is removed for any tenant that does not have a configuration.") @@ -664,14 +667,21 @@ func (am *MultitenantAlertmanager) isUserOwned(userID string) bool { func (am *MultitenantAlertmanager) syncConfigs(ctx context.Context, cfgMap map[string]alertspb.AlertConfigDescs) { level.Debug(am.logger).Log("msg", "adding configurations", "num_configs", len(cfgMap)) + amInitSkipped := map[string]struct{}{} for user, cfgs := range cfgMap { - cfg, err := am.computeConfig(cfgs) + cfg, startAM, err := am.computeConfig(cfgs) if err != nil { am.multitenantMetrics.lastReloadSuccessful.WithLabelValues(user).Set(float64(0)) level.Warn(am.logger).Log("msg", "error computing config", "err", err) continue } + if !startAM { + level.Debug(am.logger).Log("msg", "not initializing alertmanager for grafana tenant without a promoted, non-default configuration", "user", user) + amInitSkipped[user] = struct{}{} + continue + } + if err := am.syncStates(ctx, cfg); err != nil { level.Error(am.logger).Log("msg", "error syncing states", "err", err, "user", user) } @@ -687,10 +697,11 @@ func (am *MultitenantAlertmanager) syncConfigs(ctx context.Context, cfgMap map[s } userAlertmanagersToStop := map[string]*Alertmanager{} - am.alertmanagersMtx.Lock() for userID, userAM := range am.alertmanagers { - if _, exists := cfgMap[userID]; !exists { + _, exists := cfgMap[userID] + _, initSkipped := 
amInitSkipped[userID] + if !exists || initSkipped { userAlertmanagersToStop[userID] = userAM delete(am.alertmanagers, userID) delete(am.cfgs, userID) @@ -710,42 +721,29 @@ func (am *MultitenantAlertmanager) syncConfigs(ctx context.Context, cfgMap map[s } // computeConfig takes an AlertConfigDescs struct containing Mimir and Grafana configurations. -// It returns the final configuration and external URL the Alertmanager will use. -func (am *MultitenantAlertmanager) computeConfig(cfgs alertspb.AlertConfigDescs) (amConfig, error) { +// It returns the final configuration and a bool indicating whether the Alertmanager should be started for the tenant. +func (am *MultitenantAlertmanager) computeConfig(cfgs alertspb.AlertConfigDescs) (amConfig, bool, error) { cfg := amConfig{ AlertConfigDesc: cfgs.Mimir, tmplExternalURL: am.cfg.ExternalURL.URL, } - switch { - // Mimir configuration. - case !cfgs.Grafana.Promoted: - level.Debug(am.logger).Log("msg", "grafana configuration not promoted, using mimir config", "user", cfgs.Mimir.User) - return cfg, nil - - case cfgs.Grafana.Default: - level.Debug(am.logger).Log("msg", "grafana configuration is default, using mimir config", "user", cfgs.Mimir.User) - return cfg, nil - - case cfgs.Grafana.RawConfig == "": - level.Debug(am.logger).Log("msg", "grafana configuration is empty, using mimir config", "user", cfgs.Mimir.User) - return cfg, nil - - // Grafana configuration. 
- case cfgs.Mimir.RawConfig == am.fallbackConfig: - level.Debug(am.logger).Log("msg", "mimir configuration is default, using grafana config with the default globals", "user", cfgs.Mimir.User) - return createUsableGrafanaConfig(cfgs.Grafana, cfgs.Mimir.RawConfig) - - case cfgs.Mimir.RawConfig == "": - level.Debug(am.logger).Log("msg", "mimir configuration is empty, using grafana config with the default globals", "user", cfgs.Grafana.User) - return createUsableGrafanaConfig(cfgs.Grafana, am.fallbackConfig) + // If the Grafana configuration is either default, not promoted, or empty, use the Mimir configuration. + if !cfgs.Grafana.Promoted || cfgs.Grafana.Default || cfgs.Grafana.RawConfig == "" { + level.Debug(am.logger).Log("msg", "using mimir config", "user", cfgs.Mimir.User) + isGrafanaTenant := am.cfg.GrafanaAlertmanagerTenantSuffix != "" && strings.HasSuffix(cfgs.Mimir.User, am.cfg.GrafanaAlertmanagerTenantSuffix) + return cfg, !isGrafanaTenant, nil + } - // Both configurations. - // TODO: merge configurations. - default: - level.Warn(am.logger).Log("msg", "merging configurations not implemented, using mimir config", "user", cfgs.Mimir.User) - return cfg, nil + // If the Mimir configuration is either default or empty, use the Grafana configuration. + if cfgs.Mimir.RawConfig == am.fallbackConfig || cfgs.Mimir.RawConfig == "" { + level.Debug(am.logger).Log("msg", "using grafana config with the default globals", "user", cfgs.Mimir.User) + cfg, err := createUsableGrafanaConfig(cfgs.Grafana, am.fallbackConfig) + return cfg, true, err } + + level.Warn(am.logger).Log("msg", "merging configurations not implemented, using mimir config", "user", cfgs.Mimir.User) + return cfg, true, nil } // syncStates promotes/unpromotes the Grafana state and updates the 'promoted' flag if needed. 
@@ -935,6 +933,7 @@ func (am *MultitenantAlertmanager) newAlertmanager(userID string, amConfig *defi Limits: am.limits, Features: am.features, GrafanaAlertmanagerCompatibility: am.cfg.GrafanaAlertmanagerCompatibilityEnabled, + GrafanaAlertmanagerTenantSuffix: am.cfg.GrafanaAlertmanagerTenantSuffix, }, reg) if err != nil { return nil, fmt.Errorf("unable to start Alertmanager for user %v: %v", userID, err) diff --git a/pkg/alertmanager/multitenant_test.go b/pkg/alertmanager/multitenant_test.go index 92b83ee3f7e..ff8828f8f48 100644 --- a/pkg/alertmanager/multitenant_test.go +++ b/pkg/alertmanager/multitenant_test.go @@ -2506,6 +2506,11 @@ func TestComputeConfig(t *testing.T) { cfg := mockAlertmanagerConfig(t) am := setupSingleMultitenantAlertmanager(t, cfg, store, nil, featurecontrol.NoopFlags{}, log.NewNopLogger(), reg) + reg2 := prometheus.NewPedanticRegistry() + cfg2 := mockAlertmanagerConfig(t) + cfg2.GrafanaAlertmanagerTenantSuffix = "-grafana" + amWithSuffix := setupSingleMultitenantAlertmanager(t, cfg2, store, nil, featurecontrol.NoopFlags{}, log.NewNopLogger(), reg2) + var grafanaCfg GrafanaAlertmanagerConfig require.NoError(t, json.Unmarshal([]byte(grafanaConfig), &grafanaCfg)) @@ -2523,6 +2528,7 @@ func TestComputeConfig(t *testing.T) { tests := []struct { name string cfg alertspb.AlertConfigDescs + expStartAM bool expErr string expCfg alertspb.AlertConfigDesc expURL string @@ -2532,12 +2538,13 @@ func TestComputeConfig(t *testing.T) { name: "no grafana configuration", cfg: alertspb.AlertConfigDescs{ Mimir: alertspb.AlertConfigDesc{ - User: "user", + User: "user-grafana", RawConfig: simpleConfigOne, }, }, + expStartAM: false, expCfg: alertspb.AlertConfigDesc{ - User: "user", + User: "user-grafana", RawConfig: simpleConfigOne, }, expURL: mimirExternalURL, @@ -2546,11 +2553,11 @@ func TestComputeConfig(t *testing.T) { name: "empty grafana configuration", cfg: alertspb.AlertConfigDescs{ Mimir: alertspb.AlertConfigDesc{ - User: "user", + User: "user-grafana", 
RawConfig: simpleConfigOne, }, Grafana: alertspb.GrafanaAlertConfigDesc{ - User: "user", + User: "user-grafana", RawConfig: "", Default: false, Promoted: true, @@ -2558,8 +2565,9 @@ func TestComputeConfig(t *testing.T) { StaticHeaders: map[string]string{"Test-Header": "test-value"}, }, }, + expStartAM: false, expCfg: alertspb.AlertConfigDesc{ - User: "user", + User: "user-grafana", RawConfig: simpleConfigOne, }, expURL: mimirExternalURL, @@ -2568,19 +2576,20 @@ func TestComputeConfig(t *testing.T) { name: "grafana configuration is not promoted", cfg: alertspb.AlertConfigDescs{ Mimir: alertspb.AlertConfigDesc{ - User: "user", + User: "user-grafana", RawConfig: simpleConfigOne, }, Grafana: alertspb.GrafanaAlertConfigDesc{ - User: "user", + User: "user-grafana", RawConfig: grafanaConfig, Promoted: false, ExternalUrl: grafanaExternalURL, StaticHeaders: map[string]string{"Test-Header": "test-value"}, }, }, + expStartAM: false, expCfg: alertspb.AlertConfigDesc{ - User: "user", + User: "user-grafana", RawConfig: simpleConfigOne, }, expURL: mimirExternalURL, @@ -2589,11 +2598,11 @@ func TestComputeConfig(t *testing.T) { name: "grafana configuration is default", cfg: alertspb.AlertConfigDescs{ Mimir: alertspb.AlertConfigDesc{ - User: "user", + User: "user-grafana", RawConfig: simpleConfigOne, }, Grafana: alertspb.GrafanaAlertConfigDesc{ - User: "user", + User: "user-grafana", RawConfig: grafanaConfig, Default: true, Promoted: true, @@ -2601,41 +2610,22 @@ func TestComputeConfig(t *testing.T) { StaticHeaders: map[string]string{"Test-Header": "test-value"}, }, }, + expStartAM: false, expCfg: alertspb.AlertConfigDesc{ - User: "user", + User: "user-grafana", RawConfig: simpleConfigOne, }, expURL: mimirExternalURL, }, - { - name: "no mimir configuration", - cfg: alertspb.AlertConfigDescs{ - Grafana: alertspb.GrafanaAlertConfigDesc{ - User: "user", - RawConfig: grafanaConfig, - Default: false, - Promoted: true, - ExternalUrl: grafanaExternalURL, - StaticHeaders: 
map[string]string{"Test-Header": "test-value"}, - }, - }, - expCfg: alertspb.AlertConfigDesc{ - User: "user", - RawConfig: string(combinedCfg), - Templates: []*alertspb.TemplateDesc{}, - }, - expURL: grafanaExternalURL, - expHeaders: map[string]string{"Test-Header": "test-value"}, - }, { name: "empty mimir configuration", cfg: alertspb.AlertConfigDescs{ Mimir: alertspb.AlertConfigDesc{ - User: "user", + User: "user-grafana", RawConfig: "", }, Grafana: alertspb.GrafanaAlertConfigDesc{ - User: "user", + User: "user-grafana", RawConfig: grafanaConfig, Default: false, Promoted: true, @@ -2643,8 +2633,9 @@ func TestComputeConfig(t *testing.T) { StaticHeaders: map[string]string{"Test-Header-1": "test-value-1", "Test-Header-2": "test-value-2"}, }, }, + expStartAM: true, expCfg: alertspb.AlertConfigDesc{ - User: "user", + User: "user-grafana", RawConfig: string(combinedCfg), Templates: []*alertspb.TemplateDesc{}, }, @@ -2655,11 +2646,11 @@ func TestComputeConfig(t *testing.T) { name: "default mimir configuration", cfg: alertspb.AlertConfigDescs{ Mimir: alertspb.AlertConfigDesc{ - User: "user", + User: "user-grafana", RawConfig: am.fallbackConfig, }, Grafana: alertspb.GrafanaAlertConfigDesc{ - User: "user", + User: "user-grafana", RawConfig: grafanaConfig, Default: false, Promoted: true, @@ -2667,8 +2658,9 @@ func TestComputeConfig(t *testing.T) { StaticHeaders: map[string]string{"Test-Header-1": "test-value-1", "Test-Header-2": "test-value-2"}, }, }, + expStartAM: true, expCfg: alertspb.AlertConfigDesc{ - User: "user", + User: "user-grafana", RawConfig: string(combinedCfg), Templates: []*alertspb.TemplateDesc{}, }, @@ -2680,11 +2672,11 @@ func TestComputeConfig(t *testing.T) { name: "both mimir and grafana configurations (merging not implemented)", cfg: alertspb.AlertConfigDescs{ Mimir: alertspb.AlertConfigDesc{ - User: "user", + User: "user-grafana", RawConfig: simpleConfigOne, }, Grafana: alertspb.GrafanaAlertConfigDesc{ - User: "user", + User: "user-grafana", RawConfig: 
grafanaConfig, Default: false, Promoted: true, @@ -2692,8 +2684,9 @@ func TestComputeConfig(t *testing.T) { StaticHeaders: map[string]string{"Test-Header-1": "test-value-1", "Test-Header-2": "test-value-2"}, }, }, + expStartAM: true, expCfg: alertspb.AlertConfigDesc{ - User: "user", + User: "user-grafana", RawConfig: simpleConfigOne, }, expURL: am.cfg.ExternalURL.String(), @@ -2702,12 +2695,31 @@ func TestComputeConfig(t *testing.T) { for _, test := range tests { t.Run(test.name, func(t *testing.T) { - cfg, err := am.computeConfig(test.cfg) + cfg, startAM, err := am.computeConfig(test.cfg) if test.expErr != "" { require.EqualError(t, err, test.expErr) return } + require.True(t, startAM) + require.NoError(t, err) + require.Equal(t, test.expCfg, cfg.AlertConfigDesc) + require.Equal(t, test.expURL, cfg.tmplExternalURL.String()) + require.Equal(t, test.expHeaders, cfg.staticHeaders) + }) + + t.Run(fmt.Sprintf("%s with Grafana tenant suffix", test.name), func(t *testing.T) { + cfg, startAM, err := amWithSuffix.computeConfig(test.cfg) + if test.expErr != "" { + require.EqualError(t, err, test.expErr) + return + } + + if !test.expStartAM { + require.False(t, startAM) + return + } + require.True(t, startAM) require.NoError(t, err) require.Equal(t, test.expCfg, cfg.AlertConfigDesc) require.Equal(t, test.expURL, cfg.tmplExternalURL.String())