-
Notifications
You must be signed in to change notification settings - Fork 548
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Alertmanager: Skip starting the Alertmanager for Grafana tenants unless they have a promoted, non-default configuration #10491
Merged
santihernandezc
merged 10 commits into
main
from
santihernandezc/dont_run_alertmanager_default_config
Jan 24, 2025
Merged
Changes from all commits
Commits
Show all changes
10 commits
Select commit
Hold shift + click to select a range
e51606e
(PoC) Alertmanager: Don't start Alertmanager if the config is default
santihernandezc c8a5fd6
tests
santihernandezc c2e7145
scope everything to grafana tenants
santihernandezc 0709703
make reference-help
santihernandezc 21c4fc9
move logic to computeConfig
santihernandezc 1da54f9
simplify computeConfig
santihernandezc 1860a34
refactor syncConfigs
santihernandezc 08f5d48
make doc
santihernandezc 89c0b4e
update arg name
santihernandezc 82e9697
make reference-help
santihernandezc File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -81,8 +81,10 @@ type MultitenantAlertmanagerConfig struct { | |
|
||
PeerTimeout time.Duration `yaml:"peer_timeout" category:"advanced"` | ||
|
||
EnableAPI bool `yaml:"enable_api" category:"advanced"` | ||
GrafanaAlertmanagerCompatibilityEnabled bool `yaml:"grafana_alertmanager_compatibility_enabled" category:"experimental"` | ||
EnableAPI bool `yaml:"enable_api" category:"advanced"` | ||
|
||
GrafanaAlertmanagerCompatibilityEnabled bool `yaml:"grafana_alertmanager_compatibility_enabled" category:"experimental"` | ||
GrafanaAlertmanagerTenantSuffix string `yaml:"grafana_alertmanager_conditionally_skip_tenant_suffix" category:"experimental"` | ||
|
||
MaxConcurrentGetRequestsPerTenant int `yaml:"max_concurrent_get_requests_per_tenant" category:"advanced"` | ||
|
||
|
@@ -126,6 +128,7 @@ func (cfg *MultitenantAlertmanagerConfig) RegisterFlags(f *flag.FlagSet, logger | |
|
||
f.BoolVar(&cfg.EnableAPI, "alertmanager.enable-api", true, "Enable the alertmanager config API.") | ||
f.BoolVar(&cfg.GrafanaAlertmanagerCompatibilityEnabled, "alertmanager.grafana-alertmanager-compatibility-enabled", false, "Enable routes to support the migration and operation of the Grafana Alertmanager.") | ||
f.StringVar(&cfg.GrafanaAlertmanagerTenantSuffix, "alertmanager.grafana-alertmanager-conditionally-skip-tenant-suffix", "", "Skip starting the Alertmanager for tenants matching this suffix unless they have a promoted, non-default Grafana Alertmanager configuration.") | ||
f.IntVar(&cfg.MaxConcurrentGetRequestsPerTenant, "alertmanager.max-concurrent-get-requests-per-tenant", 0, "Maximum number of concurrent GET requests allowed per tenant. The zero value (and negative values) result in a limit of GOMAXPROCS or 8, whichever is larger. Status code 503 is served for GET requests that would exceed the concurrency limit.") | ||
|
||
f.BoolVar(&cfg.EnableStateCleanup, "alertmanager.enable-state-cleanup", true, "Enables periodic cleanup of alertmanager stateful data (notification logs and silences) from object storage. When enabled, data is removed for any tenant that does not have a configuration.") | ||
|
@@ -664,14 +667,21 @@ func (am *MultitenantAlertmanager) isUserOwned(userID string) bool { | |
|
||
func (am *MultitenantAlertmanager) syncConfigs(ctx context.Context, cfgMap map[string]alertspb.AlertConfigDescs) { | ||
level.Debug(am.logger).Log("msg", "adding configurations", "num_configs", len(cfgMap)) | ||
amInitSkipped := map[string]struct{}{} | ||
for user, cfgs := range cfgMap { | ||
cfg, err := am.computeConfig(cfgs) | ||
cfg, startAM, err := am.computeConfig(cfgs) | ||
if err != nil { | ||
am.multitenantMetrics.lastReloadSuccessful.WithLabelValues(user).Set(float64(0)) | ||
level.Warn(am.logger).Log("msg", "error computing config", "err", err) | ||
continue | ||
} | ||
|
||
if !startAM { | ||
level.Debug(am.logger).Log("msg", "not initializing alertmanager for grafana tenant without a promoted, non-default configuration", "user", user) | ||
amInitSkipped[user] = struct{}{} | ||
continue | ||
} | ||
|
||
if err := am.syncStates(ctx, cfg); err != nil { | ||
level.Error(am.logger).Log("msg", "error syncing states", "err", err, "user", user) | ||
} | ||
|
@@ -687,10 +697,11 @@ func (am *MultitenantAlertmanager) syncConfigs(ctx context.Context, cfgMap map[s | |
} | ||
|
||
userAlertmanagersToStop := map[string]*Alertmanager{} | ||
|
||
am.alertmanagersMtx.Lock() | ||
for userID, userAM := range am.alertmanagers { | ||
if _, exists := cfgMap[userID]; !exists { | ||
_, exists := cfgMap[userID] | ||
_, initSkipped := amInitSkipped[userID] | ||
if !exists || initSkipped { | ||
userAlertmanagersToStop[userID] = userAM | ||
delete(am.alertmanagers, userID) | ||
delete(am.cfgs, userID) | ||
|
@@ -710,42 +721,29 @@ func (am *MultitenantAlertmanager) syncConfigs(ctx context.Context, cfgMap map[s | |
} | ||
|
||
// computeConfig takes an AlertConfigDescs struct containing Mimir and Grafana configurations. | ||
// It returns the final configuration and external URL the Alertmanager will use. | ||
func (am *MultitenantAlertmanager) computeConfig(cfgs alertspb.AlertConfigDescs) (amConfig, error) { | ||
// It returns the final configuration and a bool indicating whether the Alertmanager should be started for the tenant. | ||
func (am *MultitenantAlertmanager) computeConfig(cfgs alertspb.AlertConfigDescs) (amConfig, bool, error) { | ||
cfg := amConfig{ | ||
AlertConfigDesc: cfgs.Mimir, | ||
tmplExternalURL: am.cfg.ExternalURL.URL, | ||
} | ||
|
||
switch { | ||
// Mimir configuration. | ||
case !cfgs.Grafana.Promoted: | ||
level.Debug(am.logger).Log("msg", "grafana configuration not promoted, using mimir config", "user", cfgs.Mimir.User) | ||
return cfg, nil | ||
|
||
case cfgs.Grafana.Default: | ||
level.Debug(am.logger).Log("msg", "grafana configuration is default, using mimir config", "user", cfgs.Mimir.User) | ||
return cfg, nil | ||
|
||
case cfgs.Grafana.RawConfig == "": | ||
level.Debug(am.logger).Log("msg", "grafana configuration is empty, using mimir config", "user", cfgs.Mimir.User) | ||
return cfg, nil | ||
|
||
// Grafana configuration. | ||
case cfgs.Mimir.RawConfig == am.fallbackConfig: | ||
level.Debug(am.logger).Log("msg", "mimir configuration is default, using grafana config with the default globals", "user", cfgs.Mimir.User) | ||
return createUsableGrafanaConfig(cfgs.Grafana, cfgs.Mimir.RawConfig) | ||
|
||
case cfgs.Mimir.RawConfig == "": | ||
level.Debug(am.logger).Log("msg", "mimir configuration is empty, using grafana config with the default globals", "user", cfgs.Grafana.User) | ||
return createUsableGrafanaConfig(cfgs.Grafana, am.fallbackConfig) | ||
// If the Grafana configuration is either default, not promoted, or empty, use the Mimir configuration. | ||
if !cfgs.Grafana.Promoted || cfgs.Grafana.Default || cfgs.Grafana.RawConfig == "" { | ||
level.Debug(am.logger).Log("msg", "using mimir config", "user", cfgs.Mimir.User) | ||
isGrafanaTenant := am.cfg.GrafanaAlertmanagerTenantSuffix != "" && strings.HasSuffix(cfgs.Mimir.User, am.cfg.GrafanaAlertmanagerTenantSuffix) | ||
return cfg, !isGrafanaTenant, nil | ||
} | ||
Comment on lines +731 to +736
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I decided to remove the granular debug log lines to shorten the function. |
||
|
||
// Both configurations. | ||
// TODO: merge configurations. | ||
default: | ||
level.Warn(am.logger).Log("msg", "merging configurations not implemented, using mimir config", "user", cfgs.Mimir.User) | ||
return cfg, nil | ||
// If the Mimir configuration is either default or empty, use the Grafana configuration. | ||
if cfgs.Mimir.RawConfig == am.fallbackConfig || cfgs.Mimir.RawConfig == "" { | ||
level.Debug(am.logger).Log("msg", "using grafana config with the default globals", "user", cfgs.Mimir.User) | ||
cfg, err := createUsableGrafanaConfig(cfgs.Grafana, am.fallbackConfig) | ||
return cfg, true, err | ||
} | ||
|
||
level.Warn(am.logger).Log("msg", "merging configurations not implemented, using mimir config", "user", cfgs.Mimir.User) | ||
return cfg, true, nil | ||
} | ||
|
||
// syncStates promotes/unpromotes the Grafana state and updates the 'promoted' flag if needed. | ||
|
@@ -935,6 +933,7 @@ func (am *MultitenantAlertmanager) newAlertmanager(userID string, amConfig *defi | |
Limits: am.limits, | ||
Features: am.features, | ||
GrafanaAlertmanagerCompatibility: am.cfg.GrafanaAlertmanagerCompatibilityEnabled, | ||
GrafanaAlertmanagerTenantSuffix: am.cfg.GrafanaAlertmanagerTenantSuffix, | ||
}, reg) | ||
if err != nil { | ||
return nil, fmt.Errorf("unable to start Alertmanager for user %v: %v", userID, err) | ||
|
Oops, something went wrong.
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If the skipped Alertmanager was already running, I'm deleting its data and adding it to the map of Alertmanagers to stop.