diff --git a/.github/workflows/golangci-lint.yml b/.github/workflows/golangci-lint.yml index 5e75cbeff..47fe3b2fb 100644 --- a/.github/workflows/golangci-lint.yml +++ b/.github/workflows/golangci-lint.yml @@ -29,4 +29,4 @@ jobs: with: args: --verbose # Make sure to sync this with Makefile.common and scripts/golangci-lint.yml. - version: v1.62.0 + version: v1.63.4 diff --git a/.golangci.yml b/.golangci.yml index f5e4078df..331e31002 100644 --- a/.golangci.yml +++ b/.golangci.yml @@ -5,25 +5,28 @@ output: sort-results: true linters: + # Keep this list sorted alphabetically enable: - depguard - errorlint + - exptostd - gocritic - godot - gofumpt - goimports + - loggercheck - misspell + - nilnesserr - nolintlint - perfsprint - predeclared - revive + - sloglint - testifylint - unconvert - unused - usestdlibvars - whitespace - - loggercheck - - sloglint issues: max-issues-per-linter: 0 diff --git a/CHANGELOG.md b/CHANGELOG.md index ca403c4ee..7b2a4da44 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,8 @@ ## unreleased +* [ENHANCEMENT] promtool: Support linting of scrape interval, through lint option `too-long-scrape-interval`. #15719 + ## 3.1.0 / 2025-01-02 * [SECURITY] upgrade golang.org/x/crypto to address reported CVE-2024-45337. #15691 diff --git a/Makefile.common b/Makefile.common index 6e86d7280..e8832656a 100644 --- a/Makefile.common +++ b/Makefile.common @@ -61,7 +61,7 @@ PROMU_URL := https://github.com/prometheus/promu/releases/download/v$(PROMU_ SKIP_GOLANGCI_LINT := GOLANGCI_LINT := GOLANGCI_LINT_OPTS ?= -GOLANGCI_LINT_VERSION ?= v1.62.0 +GOLANGCI_LINT_VERSION ?= v1.63.4 # golangci-lint only supports linux, darwin and windows platforms on i386/amd64/arm64. # windows isn't included here because of the path separator being different. ifeq ($(GOHOSTOS),$(filter $(GOHOSTOS),linux darwin)) diff --git a/cmd/promtool/main.go b/cmd/promtool/main.go index 62a1d4f90..81ba93d2d 100644 --- a/cmd/promtool/main.go +++ b/cmd/promtool/main.go @@ -73,14 +73,19 @@ const ( // Exit code 3 is used for "one or more lint issues detected". lintErrExitCode = 3 - lintOptionAll = "all" - lintOptionDuplicateRules = "duplicate-rules" - lintOptionNone = "none" - checkHealth = "/-/healthy" - checkReadiness = "/-/ready" + lintOptionAll = "all" + lintOptionDuplicateRules = "duplicate-rules" + lintOptionTooLongScrapeInterval = "too-long-scrape-interval" + lintOptionNone = "none" + checkHealth = "/-/healthy" + checkReadiness = "/-/ready" ) -var lintOptions = []string{lintOptionAll, lintOptionDuplicateRules, lintOptionNone} +var ( + lintRulesOptions = []string{lintOptionAll, lintOptionDuplicateRules, lintOptionNone} + // Same as lintRulesOptions, but including scrape config linting options as well. + lintConfigOptions = append(append([]string{}, lintRulesOptions...), lintOptionTooLongScrapeInterval) +) func main() { var ( @@ -97,6 +102,10 @@ func main() { app.HelpFlag.Short('h') checkCmd := app.Command("check", "Check the resources for validity.") + checkLookbackDelta := checkCmd.Flag( + "query.lookback-delta", + "The server's maximum query lookback duration.", + ).Default("5m").Duration() experimental := app.Flag("experimental", "Enable experimental commands.").Bool() @@ -113,11 +122,12 @@ func main() { checkConfigSyntaxOnly := checkConfigCmd.Flag("syntax-only", "Only check the config file syntax, ignoring file and content validation referenced in the config").Bool() checkConfigLint := checkConfigCmd.Flag( "lint", - "Linting checks to apply to the rules specified in the config. 
Available options are: "+strings.Join(lintOptions, ", ")+". Use --lint=none to disable linting", + "Linting checks to apply to the rules/scrape configs specified in the config. Available options are: "+strings.Join(lintConfigOptions, ", ")+". Use --lint=none to disable linting", ).Default(lintOptionDuplicateRules).String() checkConfigLintFatal := checkConfigCmd.Flag( "lint-fatal", "Make lint errors exit with exit code 3.").Default("false").Bool() + checkConfigIgnoreUnknownFields := checkConfigCmd.Flag("ignore-unknown-fields", "Ignore unknown fields in the rule groups read by the config files. This is useful when you want to extend rule files with custom metadata. Ensure that those fields are removed before loading them into the Prometheus server as it performs strict checks by default.").Default("false").Bool() checkWebConfigCmd := checkCmd.Command("web-config", "Check if the web config files are valid or not.") webConfigFiles := checkWebConfigCmd.Arg( @@ -140,11 +150,12 @@ func main() { ).ExistingFiles() checkRulesLint := checkRulesCmd.Flag( "lint", - "Linting checks to apply. Available options are: "+strings.Join(lintOptions, ", ")+". Use --lint=none to disable linting", + "Linting checks to apply. Available options are: "+strings.Join(lintRulesOptions, ", ")+". Use --lint=none to disable linting", ).Default(lintOptionDuplicateRules).String() checkRulesLintFatal := checkRulesCmd.Flag( "lint-fatal", "Make lint errors exit with exit code 3.").Default("false").Bool() + checkRulesIgnoreUnknownFields := checkRulesCmd.Flag("ignore-unknown-fields", "Ignore unknown fields in the rule files. This is useful when you want to extend rule files with custom metadata. Ensure that those fields are removed before loading them into the Prometheus server as it performs strict checks by default.").Default("false").Bool() checkMetricsCmd := checkCmd.Command("metrics", checkMetricsUsage) checkMetricsExtended := checkCmd.Flag("extended", "Print extended information related to the cardinality of the metrics.").Bool() @@ -218,6 +229,7 @@ func main() { ).Required().ExistingFiles() testRulesDebug := testRulesCmd.Flag("debug", "Enable unit test debugging.").Default("false").Bool() testRulesDiff := testRulesCmd.Flag("diff", "[Experimental] Print colored differential output between expected & received output.").Default("false").Bool() + testRulesIgnoreUnknownFields := testRulesCmd.Flag("ignore-unknown-fields", "Ignore unknown fields in the test files. This is useful when you want to extend rule files with custom metadata. 
Ensure that those fields are removed before loading them into the Prometheus server as it performs strict checks by default.").Default("false").Bool() defaultDBPath := "data/" tsdbCmd := app.Command("tsdb", "Run tsdb commands.") @@ -339,7 +351,7 @@ func main() { os.Exit(CheckSD(*sdConfigFile, *sdJobName, *sdTimeout, prometheus.DefaultRegisterer)) case checkConfigCmd.FullCommand(): - os.Exit(CheckConfig(*agentMode, *checkConfigSyntaxOnly, newLintConfig(*checkConfigLint, *checkConfigLintFatal), *configFiles...)) + os.Exit(CheckConfig(*agentMode, *checkConfigSyntaxOnly, newConfigLintConfig(*checkConfigLint, *checkConfigLintFatal, *checkConfigIgnoreUnknownFields, model.Duration(*checkLookbackDelta)), *configFiles...)) case checkServerHealthCmd.FullCommand(): os.Exit(checkErr(CheckServerStatus(serverURL, checkHealth, httpRoundTripper))) @@ -351,7 +363,7 @@ func main() { os.Exit(CheckWebConfig(*webConfigFiles...)) case checkRulesCmd.FullCommand(): - os.Exit(CheckRules(newLintConfig(*checkRulesLint, *checkRulesLintFatal), *ruleFiles...)) + os.Exit(CheckRules(newRulesLintConfig(*checkRulesLint, *checkRulesLintFatal, *checkRulesIgnoreUnknownFields), *ruleFiles...)) case checkMetricsCmd.FullCommand(): os.Exit(CheckMetrics(*checkMetricsExtended)) @@ -393,6 +405,7 @@ func main() { *testRulesRun, *testRulesDiff, *testRulesDebug, + *testRulesIgnoreUnknownFields, *testRulesFiles...), ) @@ -445,16 +458,18 @@ func checkExperimental(f bool) { var errLint = errors.New("lint error") -type lintConfig struct { - all bool - duplicateRules bool - fatal bool +type rulesLintConfig struct { + all bool + duplicateRules bool + fatal bool + ignoreUnknownFields bool } -func newLintConfig(stringVal string, fatal bool) lintConfig { +func newRulesLintConfig(stringVal string, fatal, ignoreUnknownFields bool) rulesLintConfig { items := strings.Split(stringVal, ",") - ls := lintConfig{ - fatal: fatal, + ls := rulesLintConfig{ + fatal: fatal, + ignoreUnknownFields: ignoreUnknownFields, } for _, setting := range items { switch setting { @@ -464,16 +479,57 @@ func newLintConfig(stringVal string, fatal bool) lintConfig { ls.duplicateRules = true case lintOptionNone: default: - fmt.Printf("WARNING: unknown lint option %s\n", setting) + fmt.Printf("WARNING: unknown lint option: %q\n", setting) } } return ls } -func (ls lintConfig) lintDuplicateRules() bool { +func (ls rulesLintConfig) lintDuplicateRules() bool { return ls.all || ls.duplicateRules } +type configLintConfig struct { + rulesLintConfig + + lookbackDelta model.Duration +} + +func newConfigLintConfig(optionsStr string, fatal, ignoreUnknownFields bool, lookbackDelta model.Duration) configLintConfig { + c := configLintConfig{ + rulesLintConfig: rulesLintConfig{ + fatal: fatal, + }, + } + + lintNone := false + var rulesOptions []string + for _, option := range strings.Split(optionsStr, ",") { + switch option { + case lintOptionAll, lintOptionTooLongScrapeInterval: + c.lookbackDelta = lookbackDelta + if option == lintOptionAll { + rulesOptions = append(rulesOptions, lintOptionAll) + } + case lintOptionNone: + lintNone = true + default: + rulesOptions = append(rulesOptions, option) + } + } + + if lintNone { + c.lookbackDelta = 0 + rulesOptions = nil + } + + if len(rulesOptions) > 0 { + c.rulesLintConfig = newRulesLintConfig(strings.Join(rulesOptions, ","), fatal, ignoreUnknownFields) + } + + return c +} + // CheckServerStatus - healthy & ready. 
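// Editorial note, not part of the diff: a minimal sketch of how the option
// parsing in newConfigLintConfig above composes, assuming it sits in the same
// package (cmd/promtool) with fmt, time and common/model already imported;
// the function name is hypothetical. "all" enables both the duplicate-rule
// lint and the scrape-interval check, a bare "too-long-scrape-interval"
// enables only the latter, and "none" wins over everything else.
func exampleLintOptionParsing() {
	delta := model.Duration(5 * time.Minute)

	c := newConfigLintConfig("all", true, false, delta)
	fmt.Println(c.lintDuplicateRules(), c.lookbackDelta) // true 5m

	c = newConfigLintConfig("too-long-scrape-interval", true, false, delta)
	fmt.Println(c.lintDuplicateRules(), c.lookbackDelta) // false 5m

	c = newConfigLintConfig("none,duplicate-rules", true, false, delta)
	fmt.Println(c.lintDuplicateRules(), c.lookbackDelta) // false 0s
}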
func CheckServerStatus(serverURL *url.URL, checkEndpoint string, roundTripper http.RoundTripper) error { if serverURL.Scheme == "" { @@ -512,12 +568,12 @@ func CheckServerStatus(serverURL *url.URL, checkEndpoint string, roundTripper ht } // CheckConfig validates configuration files. -func CheckConfig(agentMode, checkSyntaxOnly bool, lintSettings lintConfig, files ...string) int { +func CheckConfig(agentMode, checkSyntaxOnly bool, lintSettings configLintConfig, files ...string) int { failed := false hasErrors := false for _, f := range files { - ruleFiles, err := checkConfig(agentMode, f, checkSyntaxOnly) + ruleFiles, scrapeConfigs, err := checkConfig(agentMode, f, checkSyntaxOnly) if err != nil { fmt.Fprintln(os.Stderr, " FAILED:", err) hasErrors = true @@ -530,12 +586,12 @@ func CheckConfig(agentMode, checkSyntaxOnly bool, lintSettings lintConfig, files } fmt.Println() - rulesFailed, rulesHasErrors := checkRules(ruleFiles, lintSettings) - if rulesFailed { - failed = rulesFailed - } - if rulesHasErrors { - hasErrors = rulesHasErrors + if !checkSyntaxOnly { + scrapeConfigsFailed := lintScrapeConfigs(scrapeConfigs, lintSettings) + failed = failed || scrapeConfigsFailed + rulesFailed, rulesHaveErrors := checkRules(ruleFiles, lintSettings.rulesLintConfig) + failed = failed || rulesFailed + hasErrors = hasErrors || rulesHaveErrors } } if failed && hasErrors { @@ -574,12 +630,12 @@ func checkFileExists(fn string) error { return err } -func checkConfig(agentMode bool, filename string, checkSyntaxOnly bool) ([]string, error) { +func checkConfig(agentMode bool, filename string, checkSyntaxOnly bool) ([]string, []*config.ScrapeConfig, error) { fmt.Println("Checking", filename) cfg, err := config.LoadFile(filename, agentMode, promslog.NewNopLogger()) if err != nil { - return nil, err + return nil, nil, err } var ruleFiles []string @@ -587,15 +643,15 @@ func checkConfig(agentMode bool, filename string, checkSyntaxOnly bool) ([]strin for _, rf := range cfg.RuleFiles { rfs, err := filepath.Glob(rf) if err != nil { - return nil, err + return nil, nil, err } // If an explicit file was given, error if it is not accessible. if !strings.Contains(rf, "*") { if len(rfs) == 0 { - return nil, fmt.Errorf("%q does not point to an existing file", rf) + return nil, nil, fmt.Errorf("%q does not point to an existing file", rf) } if err := checkFileExists(rfs[0]); err != nil { - return nil, fmt.Errorf("error checking rule file %q: %w", rfs[0], err) + return nil, nil, fmt.Errorf("error checking rule file %q: %w", rfs[0], err) } } ruleFiles = append(ruleFiles, rfs...) 
@@ -609,26 +665,26 @@ func checkConfig(agentMode bool, filename string, checkSyntaxOnly bool) ([]strin var err error scfgs, err = cfg.GetScrapeConfigs() if err != nil { - return nil, fmt.Errorf("error loading scrape configs: %w", err) + return nil, nil, fmt.Errorf("error loading scrape configs: %w", err) } } for _, scfg := range scfgs { if !checkSyntaxOnly && scfg.HTTPClientConfig.Authorization != nil { if err := checkFileExists(scfg.HTTPClientConfig.Authorization.CredentialsFile); err != nil { - return nil, fmt.Errorf("error checking authorization credentials or bearer token file %q: %w", scfg.HTTPClientConfig.Authorization.CredentialsFile, err) + return nil, nil, fmt.Errorf("error checking authorization credentials or bearer token file %q: %w", scfg.HTTPClientConfig.Authorization.CredentialsFile, err) } } if err := checkTLSConfig(scfg.HTTPClientConfig.TLSConfig, checkSyntaxOnly); err != nil { - return nil, err + return nil, nil, err } for _, c := range scfg.ServiceDiscoveryConfigs { switch c := c.(type) { case *kubernetes.SDConfig: if err := checkTLSConfig(c.HTTPClientConfig.TLSConfig, checkSyntaxOnly); err != nil { - return nil, err + return nil, nil, err } case *file.SDConfig: if checkSyntaxOnly { @@ -637,17 +693,17 @@ func checkConfig(agentMode bool, filename string, checkSyntaxOnly bool) ([]strin for _, file := range c.Files { files, err := filepath.Glob(file) if err != nil { - return nil, err + return nil, nil, err } if len(files) != 0 { for _, f := range files { var targetGroups []*targetgroup.Group targetGroups, err = checkSDFile(f) if err != nil { - return nil, fmt.Errorf("checking SD file %q: %w", file, err) + return nil, nil, fmt.Errorf("checking SD file %q: %w", file, err) } if err := checkTargetGroupsForScrapeConfig(targetGroups, scfg); err != nil { - return nil, err + return nil, nil, err } } continue @@ -656,7 +712,7 @@ func checkConfig(agentMode bool, filename string, checkSyntaxOnly bool) ([]strin } case discovery.StaticConfig: if err := checkTargetGroupsForScrapeConfig(c, scfg); err != nil { - return nil, err + return nil, nil, err } } } @@ -673,18 +729,18 @@ func checkConfig(agentMode bool, filename string, checkSyntaxOnly bool) ([]strin for _, file := range c.Files { files, err := filepath.Glob(file) if err != nil { - return nil, err + return nil, nil, err } if len(files) != 0 { for _, f := range files { var targetGroups []*targetgroup.Group targetGroups, err = checkSDFile(f) if err != nil { - return nil, fmt.Errorf("checking SD file %q: %w", file, err) + return nil, nil, fmt.Errorf("checking SD file %q: %w", file, err) } if err := checkTargetGroupsForAlertmanager(targetGroups, amcfg); err != nil { - return nil, err + return nil, nil, err } } continue @@ -693,12 +749,12 @@ func checkConfig(agentMode bool, filename string, checkSyntaxOnly bool) ([]strin } case discovery.StaticConfig: if err := checkTargetGroupsForAlertmanager(c, amcfg); err != nil { - return nil, err + return nil, nil, err } } } } - return ruleFiles, nil + return ruleFiles, scfgs, nil } func checkTLSConfig(tlsConfig promconfig.TLSConfig, checkSyntaxOnly bool) error { @@ -760,7 +816,7 @@ func checkSDFile(filename string) ([]*targetgroup.Group, error) { } // CheckRules validates rule files. -func CheckRules(ls lintConfig, files ...string) int { +func CheckRules(ls rulesLintConfig, files ...string) int { failed := false hasErrors := false if len(files) == 0 { @@ -780,7 +836,7 @@ func CheckRules(ls lintConfig, files ...string) int { } // checkRulesFromStdin validates rule from stdin. 
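// Editorial note, not part of the diff: checkConfig now returns the parsed
// scrape configs alongside the rule files so that CheckConfig can lint both.
// A hypothetical caller in the same package would consume the new return
// value like this ("prometheus.yml" is a placeholder path):
func exampleCheckConfigCaller() error {
	ruleFiles, scrapeConfigs, err := checkConfig(false, "prometheus.yml", false) // placeholder path
	if err != nil {
		return err
	}
	fmt.Println("rule files:", ruleFiles)
	for _, scfg := range scrapeConfigs {
		// ScrapeInterval is what the new lintScrapeConfigs check compares
		// against --query.lookback-delta.
		fmt.Println("job:", scfg.JobName, "interval:", scfg.ScrapeInterval)
	}
	return nil
}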
-func checkRulesFromStdin(ls lintConfig) (bool, bool) { +func checkRulesFromStdin(ls rulesLintConfig) (bool, bool) { failed := false hasErrors := false fmt.Println("Checking standard input") @@ -789,7 +845,7 @@ func checkRulesFromStdin(ls lintConfig) (bool, bool) { fmt.Fprintln(os.Stderr, " FAILED:", err) return true, true } - rgs, errs := rulefmt.Parse(data) + rgs, errs := rulefmt.Parse(data, ls.ignoreUnknownFields) if errs != nil { failed = true fmt.Fprintln(os.Stderr, " FAILED:") @@ -818,12 +874,12 @@ func checkRulesFromStdin(ls lintConfig) (bool, bool) { } // checkRules validates rule files. -func checkRules(files []string, ls lintConfig) (bool, bool) { +func checkRules(files []string, ls rulesLintConfig) (bool, bool) { failed := false hasErrors := false for _, f := range files { fmt.Println("Checking", f) - rgs, errs := rulefmt.ParseFile(f) + rgs, errs := rulefmt.ParseFile(f, ls.ignoreUnknownFields) if errs != nil { failed = true fmt.Fprintln(os.Stderr, " FAILED:") @@ -852,7 +908,7 @@ func checkRules(files []string, ls lintConfig) (bool, bool) { return failed, hasErrors } -func checkRuleGroups(rgs *rulefmt.RuleGroups, lintSettings lintConfig) (int, []error) { +func checkRuleGroups(rgs *rulefmt.RuleGroups, lintSettings rulesLintConfig) (int, []error) { numRules := 0 for _, rg := range rgs.Groups { numRules += len(rg.Rules) @@ -876,6 +932,16 @@ func checkRuleGroups(rgs *rulefmt.RuleGroups, lintSettings lintConfig) (int, []e return numRules, nil } +func lintScrapeConfigs(scrapeConfigs []*config.ScrapeConfig, lintSettings configLintConfig) bool { + for _, scfg := range scrapeConfigs { + if lintSettings.lookbackDelta > 0 && scfg.ScrapeInterval >= lintSettings.lookbackDelta { + fmt.Fprintf(os.Stderr, " FAILED: too long scrape interval found, data point will be marked as stale - job: %s, interval: %s\n", scfg.JobName, scfg.ScrapeInterval) + return true + } + } + return false +} + type compareRuleType struct { metric string label labels.Labels diff --git a/cmd/promtool/main_test.go b/cmd/promtool/main_test.go index 9a0726918..48bed9a2d 100644 --- a/cmd/promtool/main_test.go +++ b/cmd/promtool/main_test.go @@ -185,7 +185,7 @@ func TestCheckDuplicates(t *testing.T) { c := test t.Run(c.name, func(t *testing.T) { t.Parallel() - rgs, err := rulefmt.ParseFile(c.ruleFile) + rgs, err := rulefmt.ParseFile(c.ruleFile, false) require.Empty(t, err) dups := checkDuplicates(rgs.Groups) require.Equal(t, c.expectedDups, dups) @@ -194,7 +194,7 @@ func TestCheckDuplicates(t *testing.T) { } func BenchmarkCheckDuplicates(b *testing.B) { - rgs, err := rulefmt.ParseFile("./testdata/rules_large.yml") + rgs, err := rulefmt.ParseFile("./testdata/rules_large.yml", false) require.Empty(b, err) b.ResetTimer() @@ -234,7 +234,7 @@ func TestCheckTargetConfig(t *testing.T) { for _, test := range cases { t.Run(test.name, func(t *testing.T) { t.Parallel() - _, err := checkConfig(false, "testdata/"+test.file, false) + _, _, err := checkConfig(false, "testdata/"+test.file, false) if test.err != "" { require.EqualErrorf(t, err, test.err, "Expected error %q, got %q", test.err, err.Error()) return @@ -319,7 +319,7 @@ func TestCheckConfigSyntax(t *testing.T) { for _, test := range cases { t.Run(test.name, func(t *testing.T) { t.Parallel() - _, err := checkConfig(false, "testdata/"+test.file, test.syntaxOnly) + _, _, err := checkConfig(false, "testdata/"+test.file, test.syntaxOnly) expectedErrMsg := test.err if strings.Contains(runtime.GOOS, "windows") { expectedErrMsg = test.errWindows @@ -355,7 +355,7 @@ func 
TestAuthorizationConfig(t *testing.T) { for _, test := range cases { t.Run(test.name, func(t *testing.T) { t.Parallel() - _, err := checkConfig(false, "testdata/"+test.file, false) + _, _, err := checkConfig(false, "testdata/"+test.file, false) if test.err != "" { require.ErrorContains(t, err, test.err, "Expected error to contain %q, got %q", test.err, err.Error()) return @@ -508,7 +508,7 @@ func TestCheckRules(t *testing.T) { defer func(v *os.File) { os.Stdin = v }(os.Stdin) os.Stdin = r - exitCode := CheckRules(newLintConfig(lintOptionDuplicateRules, false)) + exitCode := CheckRules(newRulesLintConfig(lintOptionDuplicateRules, false, false)) require.Equal(t, successExitCode, exitCode, "") }) @@ -530,7 +530,7 @@ func TestCheckRules(t *testing.T) { defer func(v *os.File) { os.Stdin = v }(os.Stdin) os.Stdin = r - exitCode := CheckRules(newLintConfig(lintOptionDuplicateRules, false)) + exitCode := CheckRules(newRulesLintConfig(lintOptionDuplicateRules, false, false)) require.Equal(t, failureExitCode, exitCode, "") }) @@ -552,7 +552,7 @@ func TestCheckRules(t *testing.T) { defer func(v *os.File) { os.Stdin = v }(os.Stdin) os.Stdin = r - exitCode := CheckRules(newLintConfig(lintOptionDuplicateRules, true)) + exitCode := CheckRules(newRulesLintConfig(lintOptionDuplicateRules, true, false)) require.Equal(t, lintErrExitCode, exitCode, "") }) } @@ -560,23 +560,66 @@ func TestCheckRules(t *testing.T) { func TestCheckRulesWithRuleFiles(t *testing.T) { t.Run("rules-good", func(t *testing.T) { t.Parallel() - exitCode := CheckRules(newLintConfig(lintOptionDuplicateRules, false), "./testdata/rules.yml") + exitCode := CheckRules(newRulesLintConfig(lintOptionDuplicateRules, false, false), "./testdata/rules.yml") require.Equal(t, successExitCode, exitCode, "") }) t.Run("rules-bad", func(t *testing.T) { t.Parallel() - exitCode := CheckRules(newLintConfig(lintOptionDuplicateRules, false), "./testdata/rules-bad.yml") + exitCode := CheckRules(newRulesLintConfig(lintOptionDuplicateRules, false, false), "./testdata/rules-bad.yml") require.Equal(t, failureExitCode, exitCode, "") }) t.Run("rules-lint-fatal", func(t *testing.T) { t.Parallel() - exitCode := CheckRules(newLintConfig(lintOptionDuplicateRules, true), "./testdata/prometheus-rules.lint.yml") + exitCode := CheckRules(newRulesLintConfig(lintOptionDuplicateRules, true, false), "./testdata/prometheus-rules.lint.yml") require.Equal(t, lintErrExitCode, exitCode, "") }) } +func TestCheckScrapeConfigs(t *testing.T) { + for _, tc := range []struct { + name string + lookbackDelta model.Duration + expectError bool + }{ + { + name: "scrape interval less than lookback delta", + lookbackDelta: model.Duration(11 * time.Minute), + expectError: false, + }, + { + name: "scrape interval greater than lookback delta", + lookbackDelta: model.Duration(5 * time.Minute), + expectError: true, + }, + { + name: "scrape interval same as lookback delta", + lookbackDelta: model.Duration(10 * time.Minute), + expectError: true, + }, + } { + t.Run(tc.name, func(t *testing.T) { + // Non-fatal linting. + code := CheckConfig(false, false, newConfigLintConfig(lintOptionTooLongScrapeInterval, false, false, tc.lookbackDelta), "./testdata/prometheus-config.lint.too_long_scrape_interval.yml") + require.Equal(t, successExitCode, code, "Non-fatal linting should return success") + // Fatal linting. 
+ code = CheckConfig(false, false, newConfigLintConfig(lintOptionTooLongScrapeInterval, true, false, tc.lookbackDelta), "./testdata/prometheus-config.lint.too_long_scrape_interval.yml") + if tc.expectError { + require.Equal(t, lintErrExitCode, code, "Fatal linting should return error") + } else { + require.Equal(t, successExitCode, code, "Fatal linting should return success when there are no problems") + } + // Check syntax only, no linting. + code = CheckConfig(false, true, newConfigLintConfig(lintOptionTooLongScrapeInterval, true, false, tc.lookbackDelta), "./testdata/prometheus-config.lint.too_long_scrape_interval.yml") + require.Equal(t, successExitCode, code, "Fatal linting should return success when checking syntax only") + // Lint option "none" should disable linting. + code = CheckConfig(false, false, newConfigLintConfig(lintOptionNone+","+lintOptionTooLongScrapeInterval, true, false, tc.lookbackDelta), "./testdata/prometheus-config.lint.too_long_scrape_interval.yml") + require.Equal(t, successExitCode, code, `Fatal linting should return success when lint option "none" is specified`) + }) + } +} + func TestTSDBDumpCommand(t *testing.T) { if testing.Short() { t.Skip("skipping test in short mode.") diff --git a/cmd/promtool/rules.go b/cmd/promtool/rules.go index adb214b81..b2eb18ca8 100644 --- a/cmd/promtool/rules.go +++ b/cmd/promtool/rules.go @@ -69,7 +69,7 @@ func newRuleImporter(logger *slog.Logger, config ruleImporterConfig, apiClient q // loadGroups parses groups from a list of recording rule files. func (importer *ruleImporter) loadGroups(_ context.Context, filenames []string) (errs []error) { - groups, errs := importer.ruleManager.LoadGroups(importer.config.evalInterval, labels.Labels{}, "", nil, filenames...) + groups, errs := importer.ruleManager.LoadGroups(importer.config.evalInterval, labels.Labels{}, "", nil, false, filenames...) if errs != nil { return errs } diff --git a/cmd/promtool/testdata/prometheus-config.lint.too_long_scrape_interval.yml b/cmd/promtool/testdata/prometheus-config.lint.too_long_scrape_interval.yml new file mode 100644 index 000000000..0c85d13f3 --- /dev/null +++ b/cmd/promtool/testdata/prometheus-config.lint.too_long_scrape_interval.yml @@ -0,0 +1,3 @@ +scrape_configs: + - job_name: too_long_scrape_interval_test + scrape_interval: 10m diff --git a/cmd/promtool/testdata/rules_extrafields.yml b/cmd/promtool/testdata/rules_extrafields.yml new file mode 100644 index 000000000..85ef079bb --- /dev/null +++ b/cmd/promtool/testdata/rules_extrafields.yml @@ -0,0 +1,33 @@ +# This is the rules file. It has an extra "ownership" +# field in the second group. promtool should ignore this field +# and not return an error with --ignore-unknown-fields. + +groups: + - name: alerts + namespace: "foobar" + rules: + - alert: InstanceDown + expr: up == 0 + for: 5m + labels: + severity: page + annotations: + summary: "Instance {{ $labels.instance }} down" + description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes." + - alert: AlwaysFiring + expr: 1 + + - name: rules + ownership: + service: "test" + rules: + - record: job:test:count_over_time1m + expr: sum without(instance) (count_over_time(test[1m])) + + # A recording rule that doesn't depend on input series. + - record: fixed_data + expr: 1 + + # Subquery with default resolution test. 
+ - record: suquery_interval_test + expr: count_over_time(up[5m:]) diff --git a/cmd/promtool/testdata/rules_run_extrafields.yml b/cmd/promtool/testdata/rules_run_extrafields.yml new file mode 100644 index 000000000..86879fc39 --- /dev/null +++ b/cmd/promtool/testdata/rules_run_extrafields.yml @@ -0,0 +1,21 @@ +# Minimal test case to see that --ignore-unknown-fields +# is working as expected. It should not return an error +# when any extra fields are present in the rules file. +rule_files: + - rules_extrafields.yml + +evaluation_interval: 1m + + +tests: + - name: extra ownership field test + input_series: + - series: test + values: 1 + + promql_expr_test: + - expr: test + eval_time: 0 + exp_samples: + - value: 1 + labels: test diff --git a/cmd/promtool/unittest.go b/cmd/promtool/unittest.go index 78dacdc56..7f494e27a 100644 --- a/cmd/promtool/unittest.go +++ b/cmd/promtool/unittest.go @@ -46,11 +46,11 @@ import ( // RulesUnitTest does unit testing of rules based on the unit testing files provided. // More info about the file format can be found in the docs. -func RulesUnitTest(queryOpts promqltest.LazyLoaderOpts, runStrings []string, diffFlag, debug bool, files ...string) int { - return RulesUnitTestResult(io.Discard, queryOpts, runStrings, diffFlag, debug, files...) +func RulesUnitTest(queryOpts promqltest.LazyLoaderOpts, runStrings []string, diffFlag, debug, ignoreUnknownFields bool, files ...string) int { + return RulesUnitTestResult(io.Discard, queryOpts, runStrings, diffFlag, debug, ignoreUnknownFields, files...) } -func RulesUnitTestResult(results io.Writer, queryOpts promqltest.LazyLoaderOpts, runStrings []string, diffFlag, debug bool, files ...string) int { +func RulesUnitTestResult(results io.Writer, queryOpts promqltest.LazyLoaderOpts, runStrings []string, diffFlag, debug, ignoreUnknownFields bool, files ...string) int { failed := false junit := &junitxml.JUnitXML{} @@ -60,7 +60,7 @@ func RulesUnitTestResult(results io.Writer, queryOpts promqltest.LazyLoaderOpts, } for _, f := range files { - if errs := ruleUnitTest(f, queryOpts, run, diffFlag, debug, junit.Suite(f)); errs != nil { + if errs := ruleUnitTest(f, queryOpts, run, diffFlag, debug, ignoreUnknownFields, junit.Suite(f)); errs != nil { fmt.Fprintln(os.Stderr, " FAILED:") for _, e := range errs { fmt.Fprintln(os.Stderr, e.Error()) @@ -82,7 +82,7 @@ func RulesUnitTestResult(results io.Writer, queryOpts promqltest.LazyLoaderOpts, return successExitCode } -func ruleUnitTest(filename string, queryOpts promqltest.LazyLoaderOpts, run *regexp.Regexp, diffFlag, debug bool, ts *junitxml.TestSuite) []error { +func ruleUnitTest(filename string, queryOpts promqltest.LazyLoaderOpts, run *regexp.Regexp, diffFlag, debug, ignoreUnknownFields bool, ts *junitxml.TestSuite) []error { b, err := os.ReadFile(filename) if err != nil { ts.Abort(err) @@ -131,7 +131,7 @@ func ruleUnitTest(filename string, queryOpts promqltest.LazyLoaderOpts, run *reg if t.Interval == 0 { t.Interval = unitTestInp.EvaluationInterval } - ers := t.test(testname, evalInterval, groupOrderMap, queryOpts, diffFlag, debug, unitTestInp.RuleFiles...) + ers := t.test(testname, evalInterval, groupOrderMap, queryOpts, diffFlag, debug, ignoreUnknownFields, unitTestInp.RuleFiles...) if ers != nil { for _, e := range ers { tc.Fail(e.Error()) @@ -198,7 +198,7 @@ type testGroup struct { } // test performs the unit tests. 
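// Editorial note, not part of the diff: a sketch of driving the new
// ignore-unknown-fields behaviour programmatically, mirroring the unit test
// added further below (same package assumed, function name hypothetical).
// With the flag set to false, the extra "ownership" field in the fixture
// would make rule parsing fail because of the strict YAML decoding.
func exampleIgnoreUnknownFields() {
	code := RulesUnitTest(
		promqltest.LazyLoaderOpts{},
		nil,   // run all test groups
		false, // diff
		false, // debug
		true,  // ignoreUnknownFields
		"./testdata/rules_run_extrafields.yml",
	)
	fmt.Println("exit code:", code) // 0 when the tests pass
}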
-func (tg *testGroup) test(testname string, evalInterval time.Duration, groupOrderMap map[string]int, queryOpts promqltest.LazyLoaderOpts, diffFlag, debug bool, ruleFiles ...string) (outErr []error) { +func (tg *testGroup) test(testname string, evalInterval time.Duration, groupOrderMap map[string]int, queryOpts promqltest.LazyLoaderOpts, diffFlag, debug, ignoreUnknownFields bool, ruleFiles ...string) (outErr []error) { if debug { testStart := time.Now() fmt.Printf("DEBUG: Starting test %s\n", testname) @@ -228,7 +228,7 @@ func (tg *testGroup) test(testname string, evalInterval time.Duration, groupOrde Logger: promslog.NewNopLogger(), } m := rules.NewManager(opts) - groupsMap, ers := m.LoadGroups(time.Duration(tg.Interval), tg.ExternalLabels, tg.ExternalURL, nil, ruleFiles...) + groupsMap, ers := m.LoadGroups(time.Duration(tg.Interval), tg.ExternalLabels, tg.ExternalURL, nil, ignoreUnknownFields, ruleFiles...) if ers != nil { return ers } diff --git a/cmd/promtool/unittest_test.go b/cmd/promtool/unittest_test.go index ec34ad318..7466b222c 100644 --- a/cmd/promtool/unittest_test.go +++ b/cmd/promtool/unittest_test.go @@ -143,7 +143,7 @@ func TestRulesUnitTest(t *testing.T) { } t.Run(tt.name, func(t *testing.T) { t.Parallel() - if got := RulesUnitTest(tt.queryOpts, nil, false, false, tt.args.files...); got != tt.want { + if got := RulesUnitTest(tt.queryOpts, nil, false, false, false, tt.args.files...); got != tt.want { t.Errorf("RulesUnitTest() = %v, want %v", got, tt.want) } }) @@ -151,7 +151,7 @@ func TestRulesUnitTest(t *testing.T) { t.Run("Junit xml output ", func(t *testing.T) { t.Parallel() var buf bytes.Buffer - if got := RulesUnitTestResult(&buf, promqltest.LazyLoaderOpts{}, nil, false, false, reuseFiles...); got != 1 { + if got := RulesUnitTestResult(&buf, promqltest.LazyLoaderOpts{}, nil, false, false, false, reuseFiles...); got != 1 { t.Errorf("RulesUnitTestResults() = %v, want 1", got) } var test junitxml.JUnitXML @@ -194,10 +194,11 @@ func TestRulesUnitTestRun(t *testing.T) { files []string } tests := []struct { - name string - args args - queryOpts promqltest.LazyLoaderOpts - want int + name string + args args + queryOpts promqltest.LazyLoaderOpts + want int + ignoreUnknownFields bool }{ { name: "Test all without run arg", @@ -231,11 +232,19 @@ func TestRulesUnitTestRun(t *testing.T) { }, want: 1, }, + { + name: "Test all with extra fields", + args: args{ + files: []string{"./testdata/rules_run_extrafields.yml"}, + }, + ignoreUnknownFields: true, + want: 0, + }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { t.Parallel() - got := RulesUnitTest(tt.queryOpts, tt.args.run, false, false, tt.args.files...) + got := RulesUnitTest(tt.queryOpts, tt.args.run, false, false, tt.ignoreUnknownFields, tt.args.files...) require.Equal(t, tt.want, got) }) } diff --git a/docs/command-line/promtool.md b/docs/command-line/promtool.md index 5e2a8f6bb..ab675e634 100644 --- a/docs/command-line/promtool.md +++ b/docs/command-line/promtool.md @@ -59,9 +59,10 @@ Check the resources for validity. #### Flags -| Flag | Description | -| --- | --- | -| --extended | Print extended information related to the cardinality of the metrics. | +| Flag | Description | Default | +| --- | --- | --- | +| --query.lookback-delta | The server's maximum query lookback duration. | `5m` | +| --extended | Print extended information related to the cardinality of the metrics. | | @@ -102,8 +103,9 @@ Check if the config files are valid or not. 
| Flag | Description | Default | | --- | --- | --- | | --syntax-only | Only check the config file syntax, ignoring file and content validation referenced in the config | | -| --lint | Linting checks to apply to the rules specified in the config. Available options are: all, duplicate-rules, none. Use --lint=none to disable linting | `duplicate-rules` | +| --lint | Linting checks to apply to the rules/scrape configs specified in the config. Available options are: all, duplicate-rules, none, too-long-scrape-interval. Use --lint=none to disable linting | `duplicate-rules` | | --lint-fatal | Make lint errors exit with exit code 3. | `false` | +| --ignore-unknown-fields | Ignore unknown fields in the rule groups read by the config files. This is useful when you want to extend rule files with custom metadata. Ensure that those fields are removed before loading them into the Prometheus server as it performs strict checks by default. | `false` | | --agent | Check config file for Prometheus in Agent mode. | | @@ -177,6 +179,7 @@ Check if the rule files are valid or not. | --- | --- | --- | | --lint | Linting checks to apply. Available options are: all, duplicate-rules, none. Use --lint=none to disable linting | `duplicate-rules` | | --lint-fatal | Make lint errors exit with exit code 3. | `false` | +| --ignore-unknown-fields | Ignore unknown fields in the rule files. This is useful when you want to extend rule files with custom metadata. Ensure that those fields are removed before loading them into the Prometheus server as it performs strict checks by default. | `false` | @@ -464,6 +467,7 @@ Unit tests for rules. | --run ... | If set, will only run test groups whose names match the regular expression. Can be specified multiple times. | | | --debug | Enable unit test debugging. | `false` | | --diff | [Experimental] Print colored differential output between expected & received output. | `false` | +| --ignore-unknown-fields | Ignore unknown fields in the test files. This is useful when you want to extend rule files with custom metadata. Ensure that those fields are removed before loading them into the Prometheus server as it performs strict checks by default. | `false` | diff --git a/model/rulefmt/rulefmt.go b/model/rulefmt/rulefmt.go index 06818b07c..afe900885 100644 --- a/model/rulefmt/rulefmt.go +++ b/model/rulefmt/rulefmt.go @@ -320,7 +320,7 @@ func testTemplateParsing(rl *RuleNode) (errs []error) { } // Parse parses and validates a set of rules. -func Parse(content []byte) (*RuleGroups, []error) { +func Parse(content []byte, ignoreUnknownFields bool) (*RuleGroups, []error) { var ( groups RuleGroups node ruleGroups @@ -328,7 +328,9 @@ func Parse(content []byte) (*RuleGroups, []error) { ) decoder := yaml.NewDecoder(bytes.NewReader(content)) - decoder.KnownFields(true) + if !ignoreUnknownFields { + decoder.KnownFields(true) + } err := decoder.Decode(&groups) // Ignore io.EOF which happens with empty input. if err != nil && !errors.Is(err, io.EOF) { @@ -347,12 +349,12 @@ func Parse(content []byte) (*RuleGroups, []error) { } // ParseFile reads and parses rules from a file. 
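// Editorial note, not part of the diff: a small caller-side sketch of the new
// Parse parameter, assuming "github.com/prometheus/prometheus/model/rulefmt"
// and fmt are imported and the function name is hypothetical. By default
// unknown YAML keys are rejected via decoder.KnownFields(true); passing true
// skips that strictness.
func exampleParseUnknownFields() {
	doc := []byte(`
groups:
  - name: rules
    ownership:
      service: "test"
    rules:
      - record: job:test:count_over_time1m
        expr: sum without(instance) (count_over_time(test[1m]))
`)
	_, errs := rulefmt.Parse(doc, false)
	fmt.Println(len(errs) > 0) // true: "ownership" is not a known field

	_, errs = rulefmt.Parse(doc, true)
	fmt.Println(len(errs)) // 0: the unknown field is ignored
}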
-func ParseFile(file string) (*RuleGroups, []error) { +func ParseFile(file string, ignoreUnknownFields bool) (*RuleGroups, []error) { b, err := os.ReadFile(file) if err != nil { return nil, []error{fmt.Errorf("%s: %w", file, err)} } - rgs, errs := Parse(b) + rgs, errs := Parse(b, ignoreUnknownFields) for i := range errs { errs[i] = fmt.Errorf("%s: %w", file, errs[i]) } diff --git a/model/rulefmt/rulefmt_test.go b/model/rulefmt/rulefmt_test.go index a12a86e73..dc4c02c69 100644 --- a/model/rulefmt/rulefmt_test.go +++ b/model/rulefmt/rulefmt_test.go @@ -26,7 +26,7 @@ import ( ) func TestParseFileSuccess(t *testing.T) { - _, errs := ParseFile("testdata/test.yaml") + _, errs := ParseFile("testdata/test.yaml", false) require.Empty(t, errs, "unexpected errors parsing file") } @@ -86,7 +86,7 @@ func TestParseFileFailure(t *testing.T) { } for _, c := range table { - _, errs := ParseFile(filepath.Join("testdata", c.filename)) + _, errs := ParseFile(filepath.Join("testdata", c.filename), false) require.NotEmpty(t, errs, "Expected error parsing %s but got none", c.filename) require.ErrorContainsf(t, errs[0], c.errMsg, "Expected error for %s.", c.filename) } @@ -181,7 +181,7 @@ groups: } for _, tst := range tests { - rgs, errs := Parse([]byte(tst.ruleString)) + rgs, errs := Parse([]byte(tst.ruleString), false) require.NotNil(t, rgs, "Rule parsing, rule=\n"+tst.ruleString) passed := (tst.shouldPass && len(errs) == 0) || (!tst.shouldPass && len(errs) > 0) require.True(t, passed, "Rule validation failed, rule=\n"+tst.ruleString) @@ -208,7 +208,7 @@ groups: annotations: summary: "Instance {{ $labels.instance }} up" ` - _, errs := Parse([]byte(group)) + _, errs := Parse([]byte(group), false) require.Len(t, errs, 2, "Expected two errors") var err00 *Error require.ErrorAs(t, errs[0], &err00) @@ -416,7 +416,7 @@ groups: } for _, tt := range tc { t.Run(tt.name, func(t *testing.T) { - rgs, errs := Parse([]byte(tt.ruleString)) + rgs, errs := Parse([]byte(tt.ruleString), false) require.Empty(t, errs) require.Equal(t, tt.output, rgs.Groups[0]) diff --git a/promql/parser/generated_parser.y b/promql/parser/generated_parser.y index 3865dc654..ca710b1ab 100644 --- a/promql/parser/generated_parser.y +++ b/promql/parser/generated_parser.y @@ -363,17 +363,18 @@ grouping_label_list: grouping_label : maybe_label { if !model.LabelName($1.Val).IsValid() { - yylex.(*parser).unexpected("grouping opts", "label") + yylex.(*parser).addParseErrf($1.PositionRange(),"invalid label name for grouping: %q", $1.Val) } $$ = $1 } | STRING { - if !model.LabelName(yylex.(*parser).unquoteString($1.Val)).IsValid() { - yylex.(*parser).unexpected("grouping opts", "label") + unquoted := yylex.(*parser).unquoteString($1.Val) + if !model.LabelName(unquoted).IsValid() { + yylex.(*parser).addParseErrf($1.PositionRange(),"invalid label name for grouping: %q", unquoted) } $$ = $1 $$.Pos++ - $$.Val = yylex.(*parser).unquoteString($$.Val) + $$.Val = unquoted } | error { yylex.(*parser).unexpected("grouping opts", "label"); $$ = Item{} } diff --git a/promql/parser/generated_parser.y.go b/promql/parser/generated_parser.y.go index 7ff859116..04bc081f2 100644 --- a/promql/parser/generated_parser.y.go +++ b/promql/parser/generated_parser.y.go @@ -1259,19 +1259,20 @@ yydefault: yyDollar = yyS[yypt-1 : yypt+1] { if !model.LabelName(yyDollar[1].item.Val).IsValid() { - yylex.(*parser).unexpected("grouping opts", "label") + yylex.(*parser).addParseErrf(yyDollar[1].item.PositionRange(), "invalid label name for grouping: %q", yyDollar[1].item.Val) } yyVAL.item = 
yyDollar[1].item } case 59: yyDollar = yyS[yypt-1 : yypt+1] { - if !model.LabelName(yylex.(*parser).unquoteString(yyDollar[1].item.Val)).IsValid() { - yylex.(*parser).unexpected("grouping opts", "label") + unquoted := yylex.(*parser).unquoteString(yyDollar[1].item.Val) + if !model.LabelName(unquoted).IsValid() { + yylex.(*parser).addParseErrf(yyDollar[1].item.PositionRange(), "invalid label name for grouping: %q", unquoted) } yyVAL.item = yyDollar[1].item yyVAL.item.Pos++ - yyVAL.item.Val = yylex.(*parser).unquoteString(yyVAL.item.Val) + yyVAL.item.Val = unquoted } case 60: yyDollar = yyS[yypt-1 : yypt+1] diff --git a/rules/manager.go b/rules/manager.go index 62cf62581..2cd1ced1f 100644 --- a/rules/manager.go +++ b/rules/manager.go @@ -217,7 +217,7 @@ func (m *Manager) Update(interval time.Duration, files []string, externalLabels default: } - groups, errs := m.LoadGroups(interval, externalLabels, externalURL, groupEvalIterationFunc, files...) + groups, errs := m.LoadGroups(interval, externalLabels, externalURL, groupEvalIterationFunc, false, files...) if errs != nil { for _, e := range errs { @@ -291,7 +291,7 @@ func (m *Manager) Update(interval time.Duration, files []string, externalLabels // GroupLoader is responsible for loading rule groups from arbitrary sources and parsing them. type GroupLoader interface { - Load(identifier string) (*rulefmt.RuleGroups, []error) + Load(identifier string, ignoreUnknownFields bool) (*rulefmt.RuleGroups, []error) Parse(query string) (parser.Expr, error) } @@ -299,22 +299,22 @@ type GroupLoader interface { // and parser.ParseExpr. type FileLoader struct{} -func (FileLoader) Load(identifier string) (*rulefmt.RuleGroups, []error) { - return rulefmt.ParseFile(identifier) +func (FileLoader) Load(identifier string, ignoreUnknownFields bool) (*rulefmt.RuleGroups, []error) { + return rulefmt.ParseFile(identifier, ignoreUnknownFields) } func (FileLoader) Parse(query string) (parser.Expr, error) { return parser.ParseExpr(query) } // LoadGroups reads groups from a list of files. func (m *Manager) LoadGroups( - interval time.Duration, externalLabels labels.Labels, externalURL string, groupEvalIterationFunc GroupEvalIterationFunc, filenames ...string, + interval time.Duration, externalLabels labels.Labels, externalURL string, groupEvalIterationFunc GroupEvalIterationFunc, ignoreUnknownFields bool, filenames ...string, ) (map[string]*Group, []error) { groups := make(map[string]*Group) shouldRestore := !m.restored || m.opts.AlwaysRestoreAlertState for _, fn := range filenames { - rgs, errs := m.opts.GroupLoader.Load(fn) + rgs, errs := m.opts.GroupLoader.Load(fn, ignoreUnknownFields) if errs != nil { return nil, errs } diff --git a/rules/manager_test.go b/rules/manager_test.go index 537cf0f2f..7efea91c7 100644 --- a/rules/manager_test.go +++ b/rules/manager_test.go @@ -808,7 +808,7 @@ func TestUpdate(t *testing.T) { } // Groups will be recreated if updated. 
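// Editorial note, not part of the diff: the GroupLoader interface change above
// means third-party implementations must adopt the extra ignoreUnknownFields
// parameter. A hypothetical pass-through loader (sketch only) would now look
// like this:
type exampleLoader struct{}

func (exampleLoader) Load(identifier string, ignoreUnknownFields bool) (*rulefmt.RuleGroups, []error) {
	// Delegates to rulefmt like FileLoader does; a custom loader could instead
	// fetch rule groups from an object store or a database.
	return rulefmt.ParseFile(identifier, ignoreUnknownFields)
}

func (exampleLoader) Parse(query string) (parser.Expr, error) {
	return parser.ParseExpr(query)
}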
- rgs, errs := rulefmt.ParseFile("fixtures/rules.yaml") + rgs, errs := rulefmt.ParseFile("fixtures/rules.yaml", false) require.Empty(t, errs, "file parsing failures") tmpFile, err := os.CreateTemp("", "rules.test.*.yaml") @@ -938,7 +938,7 @@ func TestUpdateSetsSourceTenants(t *testing.T) { ruleManager.start() defer ruleManager.Stop() - rgs, errs := rulefmt.ParseFile("fixtures/rules_with_source_tenants.yaml") + rgs, errs := rulefmt.ParseFile("fixtures/rules_with_source_tenants.yaml", false) require.Empty(t, errs, "file parsing failures") tmpFile, err := os.CreateTemp("", "rules.test.*.yaml") @@ -980,7 +980,7 @@ func TestAlignEvaluationTimeOnInterval(t *testing.T) { ruleManager.start() defer ruleManager.Stop() - rgs, errs := rulefmt.ParseFile("fixtures/rules_with_alignment.yaml") + rgs, errs := rulefmt.ParseFile("fixtures/rules_with_alignment.yaml", false) require.Empty(t, errs, "file parsing failures") tmpFile, err := os.CreateTemp("", "rules.test.*.yaml") @@ -1051,7 +1051,7 @@ func TestGroupEvaluationContextFuncIsCalledWhenSupplied(t *testing.T) { GroupEvaluationContextFunc: mockContextWrapFunc, }) - rgs, errs := rulefmt.ParseFile("fixtures/rules_with_source_tenants.yaml") + rgs, errs := rulefmt.ParseFile("fixtures/rules_with_source_tenants.yaml", false) require.Empty(t, errs, "file parsing failures") tmpFile, err := os.CreateTemp("", "rules.test.*.yaml") @@ -1769,7 +1769,7 @@ func TestManager_LoadGroups_ShouldCheckWhetherEachRuleHasDependentsAndDependenci }) t.Run("load a mix of dependent and independent rules", func(t *testing.T) { - groups, errs := ruleManager.LoadGroups(time.Second, labels.EmptyLabels(), "", nil, []string{"fixtures/rules_multiple.yaml"}...) + groups, errs := ruleManager.LoadGroups(time.Second, labels.EmptyLabels(), "", nil, false, []string{"fixtures/rules_multiple.yaml"}...) require.Empty(t, errs) require.Len(t, groups, 1) @@ -1804,7 +1804,7 @@ func TestManager_LoadGroups_ShouldCheckWhetherEachRuleHasDependentsAndDependenci }) t.Run("load only independent rules", func(t *testing.T) { - groups, errs := ruleManager.LoadGroups(time.Second, labels.EmptyLabels(), "", nil, []string{"fixtures/rules_multiple_independent.yaml"}...) + groups, errs := ruleManager.LoadGroups(time.Second, labels.EmptyLabels(), "", nil, false, []string{"fixtures/rules_multiple_independent.yaml"}...) require.Empty(t, errs) require.Len(t, groups, 1) @@ -2212,7 +2212,7 @@ func TestAsyncRuleEvaluation(t *testing.T) { t.Cleanup(cancel) ruleManager := NewManager(optsFactory(storage, &maxInflight, &inflightQueries, 0)) - groups, errs := ruleManager.LoadGroups(time.Second, labels.EmptyLabels(), "", nil, []string{"fixtures/rules_multiple.yaml"}...) + groups, errs := ruleManager.LoadGroups(time.Second, labels.EmptyLabels(), "", nil, false, []string{"fixtures/rules_multiple.yaml"}...) require.Empty(t, errs) require.Len(t, groups, 1) @@ -2258,7 +2258,7 @@ func TestAsyncRuleEvaluation(t *testing.T) { opts.RuleConcurrencyController = nil ruleManager := NewManager(opts) - groups, errs := ruleManager.LoadGroups(time.Second, labels.EmptyLabels(), "", nil, []string{"fixtures/rules_multiple.yaml"}...) + groups, errs := ruleManager.LoadGroups(time.Second, labels.EmptyLabels(), "", nil, false, []string{"fixtures/rules_multiple.yaml"}...) 
require.Empty(t, errs) require.Len(t, groups, 1) @@ -2296,7 +2296,7 @@ func TestAsyncRuleEvaluation(t *testing.T) { opts.RuleConcurrencyController = nil ruleManager := NewManager(opts) - groups, errs := ruleManager.LoadGroups(time.Second, labels.EmptyLabels(), "", nil, []string{"fixtures/rules_multiple_independent.yaml"}...) + groups, errs := ruleManager.LoadGroups(time.Second, labels.EmptyLabels(), "", nil, false, []string{"fixtures/rules_multiple_independent.yaml"}...) require.Empty(t, errs) require.Len(t, groups, 1) @@ -2340,7 +2340,7 @@ func TestAsyncRuleEvaluation(t *testing.T) { opts.RuleConcurrencyController = nil ruleManager := NewManager(opts) - groups, errs := ruleManager.LoadGroups(time.Second, labels.EmptyLabels(), "", nil, []string{"fixtures/rules_multiple_independent.yaml"}...) + groups, errs := ruleManager.LoadGroups(time.Second, labels.EmptyLabels(), "", nil, false, []string{"fixtures/rules_multiple_independent.yaml"}...) require.Empty(t, errs) require.Len(t, groups, 1) @@ -2387,7 +2387,7 @@ func TestAsyncRuleEvaluation(t *testing.T) { opts.RuleConcurrencyController = nil ruleManager := NewManager(opts) - groups, errs := ruleManager.LoadGroups(time.Second, labels.EmptyLabels(), "", nil, []string{"fixtures/rules_indeterminates.yaml"}...) + groups, errs := ruleManager.LoadGroups(time.Second, labels.EmptyLabels(), "", nil, false, []string{"fixtures/rules_indeterminates.yaml"}...) require.Empty(t, errs) require.Len(t, groups, 1) @@ -2426,7 +2426,7 @@ func TestAsyncRuleEvaluation(t *testing.T) { opts.RuleConcurrencyController = nil ruleManager := NewManager(opts) - groups, errs := ruleManager.LoadGroups(time.Second, labels.EmptyLabels(), "", nil, []string{"fixtures/rules_multiple_dependents_on_base.yaml"}...) + groups, errs := ruleManager.LoadGroups(time.Second, labels.EmptyLabels(), "", nil, false, []string{"fixtures/rules_multiple_dependents_on_base.yaml"}...) require.Empty(t, errs) require.Len(t, groups, 1) var group *Group @@ -2472,7 +2472,7 @@ func TestAsyncRuleEvaluation(t *testing.T) { opts.RuleConcurrencyController = nil ruleManager := NewManager(opts) - groups, errs := ruleManager.LoadGroups(time.Second, labels.EmptyLabels(), "", nil, []string{"fixtures/rules_chain.yaml"}...) + groups, errs := ruleManager.LoadGroups(time.Second, labels.EmptyLabels(), "", nil, false, []string{"fixtures/rules_chain.yaml"}...) require.Empty(t, errs) require.Len(t, groups, 1) var group *Group @@ -2516,7 +2516,7 @@ func TestBoundedRuleEvalConcurrency(t *testing.T) { ruleManager := NewManager(optsFactory(storage, &maxInflight, &inflightQueries, maxConcurrency)) - groups, errs := ruleManager.LoadGroups(time.Second, labels.EmptyLabels(), "", nil, files...) + groups, errs := ruleManager.LoadGroups(time.Second, labels.EmptyLabels(), "", nil, false, files...) 
require.Empty(t, errs) require.Len(t, groups, groupCount) @@ -2758,7 +2758,7 @@ func TestRuleDependencyController_AnalyseRules(t *testing.T) { QueryFunc: func(ctx context.Context, q string, ts time.Time) (promql.Vector, error) { return nil, nil }, }) - groups, errs := ruleManager.LoadGroups(time.Second, labels.EmptyLabels(), "", nil, tc.ruleFile) + groups, errs := ruleManager.LoadGroups(time.Second, labels.EmptyLabels(), "", nil, false, tc.ruleFile) require.Empty(t, errs) require.Len(t, groups, 1) @@ -2787,7 +2787,7 @@ func BenchmarkRuleDependencyController_AnalyseRules(b *testing.B) { QueryFunc: func(ctx context.Context, q string, ts time.Time) (promql.Vector, error) { return nil, nil }, }) - groups, errs := ruleManager.LoadGroups(time.Second, labels.EmptyLabels(), "", nil, "fixtures/rules_multiple.yaml") + groups, errs := ruleManager.LoadGroups(time.Second, labels.EmptyLabels(), "", nil, false, "fixtures/rules_multiple.yaml") require.Empty(b, errs) require.Len(b, groups, 1) diff --git a/scrape/helpers_test.go b/scrape/helpers_test.go index 7bc9e3f7d..2719a467b 100644 --- a/scrape/helpers_test.go +++ b/scrape/helpers_test.go @@ -90,6 +90,27 @@ type histogramSample struct { fh *histogram.FloatHistogram } +type metadataEntry struct { + m metadata.Metadata + metric labels.Labels +} + +func metadataEntryEqual(a, b metadataEntry) bool { + if !labels.Equal(a.metric, b.metric) { + return false + } + if a.m.Type != b.m.Type { + return false + } + if a.m.Unit != b.m.Unit { + return false + } + if a.m.Help != b.m.Help { + return false + } + return true +} + type collectResultAppendable struct { *collectResultAppender } @@ -112,8 +133,8 @@ type collectResultAppender struct { rolledbackHistograms []histogramSample resultExemplars []exemplar.Exemplar pendingExemplars []exemplar.Exemplar - resultMetadata []metadata.Metadata - pendingMetadata []metadata.Metadata + resultMetadata []metadataEntry + pendingMetadata []metadataEntry } func (a *collectResultAppender) SetOptions(opts *storage.AppendOptions) {} @@ -173,7 +194,7 @@ func (a *collectResultAppender) AppendHistogramCTZeroSample(ref storage.SeriesRe func (a *collectResultAppender) UpdateMetadata(ref storage.SeriesRef, l labels.Labels, m metadata.Metadata) (storage.SeriesRef, error) { a.mtx.Lock() defer a.mtx.Unlock() - a.pendingMetadata = append(a.pendingMetadata, m) + a.pendingMetadata = append(a.pendingMetadata, metadataEntry{metric: l, m: m}) if ref == 0 { ref = storage.SeriesRef(rand.Uint64()) } diff --git a/scrape/scrape.go b/scrape/scrape.go index 2da07d719..85eb07a1c 100644 --- a/scrape/scrape.go +++ b/scrape/scrape.go @@ -29,6 +29,7 @@ import ( "strings" "sync" "time" + "unsafe" "github.com/klauspost/compress/gzip" config_util "github.com/prometheus/common/config" @@ -931,6 +932,7 @@ type scrapeLoop struct { // scrapeCache tracks mappings of exposed metric strings to label sets and // storage references. Additionally, it tracks staleness of series between // scrapes. +// Cache is meant to be used per a single target. type scrapeCache struct { iter uint64 // Current scrape iteration. @@ -951,8 +953,10 @@ type scrapeCache struct { seriesCur map[uint64]labels.Labels seriesPrev map[uint64]labels.Labels - metaMtx sync.Mutex - metadata map[string]*metaEntry + // TODO(bwplotka): Consider moving Metadata API to use WAL instead of scrape loop to + // avoid locking (using metadata API can block scraping). + metaMtx sync.Mutex // Mutex is needed due to api touching it when metadata is queried. 
+ metadata map[string]*metaEntry // metadata by metric family name. metrics *scrapeMetrics } @@ -1078,73 +1082,79 @@ func (c *scrapeCache) forEachStale(f func(labels.Labels) bool) { } } -func (c *scrapeCache) setType(metric []byte, t model.MetricType) { +func yoloString(b []byte) string { + return unsafe.String(unsafe.SliceData(b), len(b)) +} + +func (c *scrapeCache) setType(mfName []byte, t model.MetricType) ([]byte, *metaEntry) { c.metaMtx.Lock() + defer c.metaMtx.Unlock() - e, ok := c.metadata[string(metric)] + e, ok := c.metadata[yoloString(mfName)] if !ok { e = &metaEntry{Metadata: metadata.Metadata{Type: model.MetricTypeUnknown}} - c.metadata[string(metric)] = e + c.metadata[string(mfName)] = e } if e.Type != t { e.Type = t e.lastIterChange = c.iter } e.lastIter = c.iter - - c.metaMtx.Unlock() + return mfName, e } -func (c *scrapeCache) setHelp(metric, help []byte) { +func (c *scrapeCache) setHelp(mfName, help []byte) ([]byte, *metaEntry) { c.metaMtx.Lock() + defer c.metaMtx.Unlock() - e, ok := c.metadata[string(metric)] + e, ok := c.metadata[yoloString(mfName)] if !ok { e = &metaEntry{Metadata: metadata.Metadata{Type: model.MetricTypeUnknown}} - c.metadata[string(metric)] = e + c.metadata[string(mfName)] = e } if e.Help != string(help) { e.Help = string(help) e.lastIterChange = c.iter } e.lastIter = c.iter - - c.metaMtx.Unlock() + return mfName, e } -func (c *scrapeCache) setUnit(metric, unit []byte) { +func (c *scrapeCache) setUnit(mfName, unit []byte) ([]byte, *metaEntry) { c.metaMtx.Lock() + defer c.metaMtx.Unlock() - e, ok := c.metadata[string(metric)] + e, ok := c.metadata[yoloString(mfName)] if !ok { e = &metaEntry{Metadata: metadata.Metadata{Type: model.MetricTypeUnknown}} - c.metadata[string(metric)] = e + c.metadata[string(mfName)] = e } if e.Unit != string(unit) { e.Unit = string(unit) e.lastIterChange = c.iter } e.lastIter = c.iter - - c.metaMtx.Unlock() + return mfName, e } -func (c *scrapeCache) GetMetadata(metric string) (MetricMetadata, bool) { +// GetMetadata returns metadata given the metric family name. +func (c *scrapeCache) GetMetadata(mfName string) (MetricMetadata, bool) { c.metaMtx.Lock() defer c.metaMtx.Unlock() - m, ok := c.metadata[metric] + m, ok := c.metadata[mfName] if !ok { return MetricMetadata{}, false } return MetricMetadata{ - Metric: metric, - Type: m.Type, - Help: m.Help, - Unit: m.Unit, + MetricFamily: mfName, + Type: m.Type, + Help: m.Help, + Unit: m.Unit, }, true } +// ListMetadata lists metadata. func (c *scrapeCache) ListMetadata() []MetricMetadata { c.metaMtx.Lock() defer c.metaMtx.Unlock() @@ -1153,16 +1163,16 @@ func (c *scrapeCache) ListMetadata() []MetricMetadata { for m, e := range c.metadata { res = append(res, MetricMetadata{ - Metric: m, - Type: e.Type, - Help: e.Help, - Unit: e.Unit, + MetricFamily: m, + Type: e.Type, + Help: e.Help, + Unit: e.Unit, }) } return res } -// MetadataSize returns the size of the metadata cache. +// SizeMetadata returns the size of the metadata cache. func (c *scrapeCache) SizeMetadata() (s int) { c.metaMtx.Lock() defer c.metaMtx.Unlock() @@ -1173,7 +1183,7 @@ func (c *scrapeCache) SizeMetadata() (s int) { return s } -// MetadataLen returns the number of metadata entries in the cache. +// LengthMetadata returns the number of metadata entries in the cache. 
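// Editorial note, not part of the diff: the setType/setHelp/setUnit helpers
// above rely on the "yolo string" map pattern: read the map through an
// allocation-free unsafe view of the byte slice, but write it with string(b)
// so the key owns its own copy once an entry is actually inserted. A minimal
// illustration of the pattern (same package assumed, name hypothetical):
func getOrCreateMetaEntry(cache map[string]*metaEntry, mfName []byte) *metaEntry {
	if e, ok := cache[yoloString(mfName)]; ok {
		return e // hot path: no string allocation
	}
	e := &metaEntry{Metadata: metadata.Metadata{Type: model.MetricTypeUnknown}}
	cache[string(mfName)] = e // copy the bytes only on insert
	return e
}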
func (c *scrapeCache) LengthMetadata() int { c.metaMtx.Lock() defer c.metaMtx.Unlock() @@ -1607,39 +1617,17 @@ func (sl *scrapeLoop) append(app storage.Appender, b []byte, contentType string, ) } var ( - appErrs = appendErrors{} - sampleLimitErr error - bucketLimitErr error - lset labels.Labels // escapes to heap so hoisted out of loop - e exemplar.Exemplar // escapes to heap so hoisted out of loop - meta metadata.Metadata - metadataChanged bool + appErrs = appendErrors{} + sampleLimitErr error + bucketLimitErr error + lset labels.Labels // escapes to heap so hoisted out of loop + e exemplar.Exemplar // escapes to heap so hoisted out of loop + lastMeta *metaEntry + lastMFName []byte ) exemplars := make([]exemplar.Exemplar, 0, 1) - // updateMetadata updates the current iteration's metadata object and the - // metadataChanged value if we have metadata in the scrape cache AND the - // labelset is for a new series or the metadata for this series has just - // changed. It returns a boolean based on whether the metadata was updated. - updateMetadata := func(lset labels.Labels, isNewSeries bool) bool { - if !sl.appendMetadataToWAL { - return false - } - - sl.cache.metaMtx.Lock() - defer sl.cache.metaMtx.Unlock() - metaEntry, metaOk := sl.cache.metadata[lset.Get(labels.MetricName)] - if metaOk && (isNewSeries || metaEntry.lastIterChange == sl.cache.iter) { - metadataChanged = true - meta.Type = metaEntry.Type - meta.Unit = metaEntry.Unit - meta.Help = metaEntry.Help - return true - } - return false - } - // Take an appender with limits. app = appender(app, sl.sampleLimit, sl.bucketLimit, sl.maxSchema) @@ -1669,14 +1657,18 @@ loop: break } switch et { + // TODO(bwplotka): Consider changing parser to give metadata at once instead of type, help and unit in separation, ideally on `Series()/Histogram() + // otherwise we can expose metadata without series on metadata API. case textparse.EntryType: - sl.cache.setType(p.Type()) + // TODO(bwplotka): Build meta entry directly instead of locking and updating the map. This will + // allow to properly update metadata when e.g unit was added, then removed; + lastMFName, lastMeta = sl.cache.setType(p.Type()) continue case textparse.EntryHelp: - sl.cache.setHelp(p.Help()) + lastMFName, lastMeta = sl.cache.setHelp(p.Help()) continue case textparse.EntryUnit: - sl.cache.setUnit(p.Unit()) + lastMFName, lastMeta = sl.cache.setUnit(p.Unit()) continue case textparse.EntryComment: continue @@ -1699,26 +1691,19 @@ loop: t = *parsedTimestamp } - // Zero metadata out for current iteration until it's resolved. - meta = metadata.Metadata{} - metadataChanged = false - if sl.cache.getDropped(met) { continue } - ce, ok, seriesAlreadyScraped := sl.cache.get(met) + ce, seriesCached, seriesAlreadyScraped := sl.cache.get(met) var ( ref storage.SeriesRef hash uint64 ) - if ok { + if seriesCached { ref = ce.ref lset = ce.lset hash = ce.hash - - // Update metadata only if it changed in the current iteration. - updateMetadata(lset, false) } else { p.Metric(&lset) hash = lset.Hash() @@ -1747,9 +1732,6 @@ loop: sl.metrics.targetScrapePoolExceededLabelLimits.Inc() break loop } - - // Append metadata for new series if they were present. - updateMetadata(lset, true) } if seriesAlreadyScraped && parsedTimestamp == nil { @@ -1799,7 +1781,7 @@ loop: break loop } - if !ok { + if !seriesCached { if parsedTimestamp == nil || sl.trackTimestampsStaleness { // Bypass staleness logic if there is an explicit timestamp. 
sl.cache.trackStaleness(hash, lset) @@ -1857,10 +1839,18 @@ loop: sl.metrics.targetScrapeExemplarOutOfOrder.Add(float64(outOfOrderExemplars)) } - if sl.appendMetadataToWAL && metadataChanged { - if _, merr := app.UpdateMetadata(ref, lset, meta); merr != nil { - // No need to fail the scrape on errors appending metadata. - sl.l.Debug("Error when appending metadata in scrape loop", "ref", fmt.Sprintf("%d", ref), "metadata", fmt.Sprintf("%+v", meta), "err", merr) + if sl.appendMetadataToWAL && lastMeta != nil { + // Is it a new series OR did metadata change for this family? + if !seriesCached || lastMeta.lastIterChange == sl.cache.iter { + // In the majority of cases we can trust that the current series/histogram matches lastMeta and lastMFName. + // However, optional metadata (TYPE etc.) and broken OM text can break this; detect those cases here. + // TODO(bwplotka): Consider moving this to the parser, as many parser users end up doing this (e.g. CT and NHCB parsing). + if isSeriesPartOfFamily(lset.Get(labels.MetricName), lastMFName, lastMeta.Type) { + if _, merr := app.UpdateMetadata(ref, lset, lastMeta.Metadata); merr != nil { + // No need to fail the scrape on errors appending metadata. + sl.l.Debug("Error when appending metadata in scrape loop", "ref", fmt.Sprintf("%d", ref), "metadata", fmt.Sprintf("%+v", lastMeta.Metadata), "err", merr) + } + } } } } @@ -1896,6 +1886,71 @@ loop: return } +func isSeriesPartOfFamily(mName string, mfName []byte, typ model.MetricType) bool { + mfNameStr := yoloString(mfName) + if !strings.HasPrefix(mName, mfNameStr) { // Fast path. + return false + } + + var ( + gotMFName string + ok bool + ) + switch typ { + case model.MetricTypeCounter: + // Prometheus allows a _total suffix; cut it from the metric family name to support this case. + mfNameStr, _ = strings.CutSuffix(mfNameStr, "_total") + + gotMFName, ok = strings.CutSuffix(mName, "_total") + if !ok { + gotMFName = mName + } + case model.MetricTypeHistogram: + gotMFName, ok = strings.CutSuffix(mName, "_bucket") + if !ok { + gotMFName, ok = strings.CutSuffix(mName, "_sum") + if !ok { + gotMFName, ok = strings.CutSuffix(mName, "_count") + if !ok { + gotMFName = mName + } + } + } + case model.MetricTypeGaugeHistogram: + gotMFName, ok = strings.CutSuffix(mName, "_bucket") + if !ok { + gotMFName, ok = strings.CutSuffix(mName, "_gsum") + if !ok { + gotMFName, ok = strings.CutSuffix(mName, "_gcount") + if !ok { + gotMFName = mName + } + } + } + case model.MetricTypeSummary: + gotMFName, ok = strings.CutSuffix(mName, "_sum") + if !ok { + gotMFName, ok = strings.CutSuffix(mName, "_count") + if !ok { + gotMFName = mName + } + } + case model.MetricTypeInfo: + // Technically the Prometheus text format does not support the info type, but we might + // accidentally allow it in the Prometheus parser, so explicitly support metric family + // names with the _info suffix too. + mfNameStr, _ = strings.CutSuffix(mfNameStr, "_info") + + gotMFName, ok = strings.CutSuffix(mName, "_info") + if !ok { + gotMFName = mName + } + default: + gotMFName = mName + } + return mfNameStr == gotMFName +} + // Adds samples to the appender, checking the error, and then returns the # of samples added, // whether the caller should continue to process more samples, and any sample or bucket limit errors. 
func (sl *scrapeLoop) checkAddError(met []byte, err error, sampleLimitErr, bucketLimitErr *error, appErrs *appendErrors) (bool, error) { @@ -1934,17 +1989,80 @@ func (sl *scrapeLoop) checkAddError(met []byte, err error, sampleLimitErr, bucke } } +// reportSample represents automatically generated timeseries documented in +// https://prometheus.io/docs/concepts/jobs_instances/#automatically-generated-labels-and-time-series +type reportSample struct { + metadata.Metadata + name []byte +} + // The constants are suffixed with the invalid \xff unicode rune to avoid collisions // with scraped metrics in the cache. var ( - scrapeHealthMetricName = []byte("up" + "\xff") - scrapeDurationMetricName = []byte("scrape_duration_seconds" + "\xff") - scrapeSamplesMetricName = []byte("scrape_samples_scraped" + "\xff") - samplesPostRelabelMetricName = []byte("scrape_samples_post_metric_relabeling" + "\xff") - scrapeSeriesAddedMetricName = []byte("scrape_series_added" + "\xff") - scrapeTimeoutMetricName = []byte("scrape_timeout_seconds" + "\xff") - scrapeSampleLimitMetricName = []byte("scrape_sample_limit" + "\xff") - scrapeBodySizeBytesMetricName = []byte("scrape_body_size_bytes" + "\xff") + scrapeHealthMetric = reportSample{ + name: []byte("up" + "\xff"), + Metadata: metadata.Metadata{ + Type: model.MetricTypeGauge, + Help: "Health of the scrape target. 1 means the target is healthy, 0 if the scrape failed.", + Unit: "targets", + }, + } + scrapeDurationMetric = reportSample{ + name: []byte("scrape_duration_seconds" + "\xff"), + Metadata: metadata.Metadata{ + Type: model.MetricTypeGauge, + Help: "Duration of the last scrape in seconds.", + Unit: "seconds", + }, + } + scrapeSamplesMetric = reportSample{ + name: []byte("scrape_samples_scraped" + "\xff"), + Metadata: metadata.Metadata{ + Type: model.MetricTypeGauge, + Help: "Number of samples last scraped.", + Unit: "samples", + }, + } + samplesPostRelabelMetric = reportSample{ + name: []byte("scrape_samples_post_metric_relabeling" + "\xff"), + Metadata: metadata.Metadata{ + Type: model.MetricTypeGauge, + Help: "Number of samples remaining after metric relabeling was applied.", + Unit: "samples", + }, + } + scrapeSeriesAddedMetric = reportSample{ + name: []byte("scrape_series_added" + "\xff"), + Metadata: metadata.Metadata{ + Type: model.MetricTypeGauge, + Help: "Number of series in the last scrape.", + Unit: "series", + }, + } + scrapeTimeoutMetric = reportSample{ + name: []byte("scrape_timeout_seconds" + "\xff"), + Metadata: metadata.Metadata{ + Type: model.MetricTypeGauge, + Help: "The configured scrape timeout for a target.", + Unit: "seconds", + }, + } + scrapeSampleLimitMetric = reportSample{ + name: []byte("scrape_sample_limit" + "\xff"), + Metadata: metadata.Metadata{ + Type: model.MetricTypeGauge, + Help: "The configured sample limit for a target. Returns zero if there is no limit configured.", + Unit: "samples", + }, + } + scrapeBodySizeBytesMetric = reportSample{ + name: []byte("scrape_body_size_bytes" + "\xff"), + Metadata: metadata.Metadata{ + Type: model.MetricTypeGauge, + Help: "The uncompressed size of the last scrape response, if successful. 
Scrapes failing because body_size_limit is exceeded report -1, other scrape failures report 0.", + Unit: "bytes", + }, + } ) func (sl *scrapeLoop) report(app storage.Appender, start time.Time, duration time.Duration, scraped, added, seriesAdded, bytes int, scrapeErr error) (err error) { @@ -1958,29 +2076,29 @@ func (sl *scrapeLoop) report(app storage.Appender, start time.Time, duration tim } b := labels.NewBuilderWithSymbolTable(sl.symbolTable) - if err = sl.addReportSample(app, scrapeHealthMetricName, ts, health, b); err != nil { + if err = sl.addReportSample(app, scrapeHealthMetric, ts, health, b); err != nil { return } - if err = sl.addReportSample(app, scrapeDurationMetricName, ts, duration.Seconds(), b); err != nil { + if err = sl.addReportSample(app, scrapeDurationMetric, ts, duration.Seconds(), b); err != nil { return } - if err = sl.addReportSample(app, scrapeSamplesMetricName, ts, float64(scraped), b); err != nil { + if err = sl.addReportSample(app, scrapeSamplesMetric, ts, float64(scraped), b); err != nil { return } - if err = sl.addReportSample(app, samplesPostRelabelMetricName, ts, float64(added), b); err != nil { + if err = sl.addReportSample(app, samplesPostRelabelMetric, ts, float64(added), b); err != nil { return } - if err = sl.addReportSample(app, scrapeSeriesAddedMetricName, ts, float64(seriesAdded), b); err != nil { + if err = sl.addReportSample(app, scrapeSeriesAddedMetric, ts, float64(seriesAdded), b); err != nil { return } if sl.reportExtraMetrics { - if err = sl.addReportSample(app, scrapeTimeoutMetricName, ts, sl.timeout.Seconds(), b); err != nil { + if err = sl.addReportSample(app, scrapeTimeoutMetric, ts, sl.timeout.Seconds(), b); err != nil { return } - if err = sl.addReportSample(app, scrapeSampleLimitMetricName, ts, float64(sl.sampleLimit), b); err != nil { + if err = sl.addReportSample(app, scrapeSampleLimitMetric, ts, float64(sl.sampleLimit), b); err != nil { return } - if err = sl.addReportSample(app, scrapeBodySizeBytesMetricName, ts, float64(bytes), b); err != nil { + if err = sl.addReportSample(app, scrapeBodySizeBytesMetric, ts, float64(bytes), b); err != nil { return } } @@ -1993,37 +2111,37 @@ func (sl *scrapeLoop) reportStale(app storage.Appender, start time.Time) (err er stale := math.Float64frombits(value.StaleNaN) b := labels.NewBuilder(labels.EmptyLabels()) - if err = sl.addReportSample(app, scrapeHealthMetricName, ts, stale, b); err != nil { + if err = sl.addReportSample(app, scrapeHealthMetric, ts, stale, b); err != nil { return } - if err = sl.addReportSample(app, scrapeDurationMetricName, ts, stale, b); err != nil { + if err = sl.addReportSample(app, scrapeDurationMetric, ts, stale, b); err != nil { return } - if err = sl.addReportSample(app, scrapeSamplesMetricName, ts, stale, b); err != nil { + if err = sl.addReportSample(app, scrapeSamplesMetric, ts, stale, b); err != nil { return } - if err = sl.addReportSample(app, samplesPostRelabelMetricName, ts, stale, b); err != nil { + if err = sl.addReportSample(app, samplesPostRelabelMetric, ts, stale, b); err != nil { return } - if err = sl.addReportSample(app, scrapeSeriesAddedMetricName, ts, stale, b); err != nil { + if err = sl.addReportSample(app, scrapeSeriesAddedMetric, ts, stale, b); err != nil { return } if sl.reportExtraMetrics { - if err = sl.addReportSample(app, scrapeTimeoutMetricName, ts, stale, b); err != nil { + if err = sl.addReportSample(app, scrapeTimeoutMetric, ts, stale, b); err != nil { return } - if err = sl.addReportSample(app, scrapeSampleLimitMetricName, ts, stale, b); 
err != nil { + if err = sl.addReportSample(app, scrapeSampleLimitMetric, ts, stale, b); err != nil { return } - if err = sl.addReportSample(app, scrapeBodySizeBytesMetricName, ts, stale, b); err != nil { + if err = sl.addReportSample(app, scrapeBodySizeBytesMetric, ts, stale, b); err != nil { return } } return } -func (sl *scrapeLoop) addReportSample(app storage.Appender, s []byte, t int64, v float64, b *labels.Builder) error { - ce, ok, _ := sl.cache.get(s) +func (sl *scrapeLoop) addReportSample(app storage.Appender, s reportSample, t int64, v float64, b *labels.Builder) error { + ce, ok, _ := sl.cache.get(s.name) var ref storage.SeriesRef var lset labels.Labels if ok { @@ -2034,7 +2152,7 @@ func (sl *scrapeLoop) addReportSample(app storage.Appender, s []byte, t int64, v // with scraped metrics in the cache. // We have to drop it when building the actual metric. b.Reset(labels.EmptyLabels()) - b.Set(labels.MetricName, string(s[:len(s)-1])) + b.Set(labels.MetricName, string(s.name[:len(s.name)-1])) lset = sl.reportSampleMutator(b.Labels()) } @@ -2042,7 +2160,13 @@ func (sl *scrapeLoop) addReportSample(app storage.Appender, s []byte, t int64, v switch { case err == nil: if !ok { - sl.cache.addRef(s, ref, lset, lset.Hash()) + sl.cache.addRef(s.name, ref, lset, lset.Hash()) + // We only need to add metadata once a scrape target appears. + if sl.appendMetadataToWAL { + if _, merr := app.UpdateMetadata(ref, lset, s.Metadata); merr != nil { + sl.l.Debug("Error when appending metadata in addReportSample", "ref", fmt.Sprintf("%d", ref), "metadata", fmt.Sprintf("%+v", s.Metadata), "err", merr) + } + } } return nil case errors.Is(err, storage.ErrOutOfOrderSample), errors.Is(err, storage.ErrDuplicateSampleForTimestamp): diff --git a/scrape/scrape_test.go b/scrape/scrape_test.go index a67d52e5c..2bb9c7247 100644 --- a/scrape/scrape_test.go +++ b/scrape/scrape_test.go @@ -50,6 +50,7 @@ import ( "github.com/prometheus/prometheus/model/exemplar" "github.com/prometheus/prometheus/model/histogram" "github.com/prometheus/prometheus/model/labels" + "github.com/prometheus/prometheus/model/metadata" "github.com/prometheus/prometheus/model/relabel" "github.com/prometheus/prometheus/model/textparse" "github.com/prometheus/prometheus/model/timestamp" @@ -96,7 +97,9 @@ func TestStorageHandlesOutOfOrderTimestamps(t *testing.T) { // Test with default OutOfOrderTimeWindow (0) t.Run("Out-Of-Order Sample Disabled", func(t *testing.T) { s := teststorage.New(t) - defer s.Close() + t.Cleanup(func() { + _ = s.Close() + }) runScrapeLoopTest(t, s, false) }) @@ -104,7 +107,9 @@ func TestStorageHandlesOutOfOrderTimestamps(t *testing.T) { // Test with specific OutOfOrderTimeWindow (600000) t.Run("Out-Of-Order Sample Enabled", func(t *testing.T) { s := teststorage.New(t, 600000) - defer s.Close() + t.Cleanup(func() { + _ = s.Close() + }) runScrapeLoopTest(t, s, true) }) @@ -126,13 +131,13 @@ func runScrapeLoopTest(t *testing.T, s *teststorage.TestStorage, expectOutOfOrde timestampInorder2 := now.Add(5 * time.Minute) slApp := sl.appender(context.Background()) - _, _, _, err := sl.append(slApp, []byte(`metric_a{a="1",b="1"} 1`), "text/plain", timestampInorder1) + _, _, _, err := sl.append(slApp, []byte(`metric_total{a="1",b="1"} 1`), "text/plain", timestampInorder1) require.NoError(t, err) - _, _, _, err = sl.append(slApp, []byte(`metric_a{a="1",b="1"} 2`), "text/plain", timestampOutOfOrder) + _, _, _, err = sl.append(slApp, []byte(`metric_total{a="1",b="1"} 2`), "text/plain", timestampOutOfOrder) require.NoError(t, err) - _, _, 
_, err = sl.append(slApp, []byte(`metric_a{a="1",b="1"} 3`), "text/plain", timestampInorder2) + _, _, _, err = sl.append(slApp, []byte(`metric_total{a="1",b="1"} 3`), "text/plain", timestampInorder2) require.NoError(t, err) require.NoError(t, slApp.Commit()) @@ -145,7 +150,7 @@ func runScrapeLoopTest(t *testing.T, s *teststorage.TestStorage, expectOutOfOrde defer q.Close() // Use a matcher to filter the metric name. - series := q.Select(ctx, false, nil, labels.MustNewMatcher(labels.MatchRegexp, "__name__", "metric_a")) + series := q.Select(ctx, false, nil, labels.MustNewMatcher(labels.MatchRegexp, "__name__", "metric_total")) var results []floatSample for series.Next() { @@ -165,12 +170,12 @@ func runScrapeLoopTest(t *testing.T, s *teststorage.TestStorage, expectOutOfOrde // Define the expected results want := []floatSample{ { - metric: labels.FromStrings("__name__", "metric_a", "a", "1", "b", "1"), + metric: labels.FromStrings("__name__", "metric_total", "a", "1", "b", "1"), t: timestamp.FromTime(timestampInorder1), f: 1, }, { - metric: labels.FromStrings("__name__", "metric_a", "a", "1", "b", "1"), + metric: labels.FromStrings("__name__", "metric_total", "a", "1", "b", "1"), t: timestamp.FromTime(timestampInorder2), f: 3, }, @@ -183,6 +188,134 @@ func runScrapeLoopTest(t *testing.T, s *teststorage.TestStorage, expectOutOfOrde } } +// Regression test against https://github.com/prometheus/prometheus/issues/15831. +func TestScrapeAppendMetadataUpdate(t *testing.T) { + const ( + scrape1 = `# TYPE test_metric counter +# HELP test_metric some help text +# UNIT test_metric metric +test_metric_total 1 +# TYPE test_metric2 gauge +# HELP test_metric2 other help text +test_metric2{foo="bar"} 2 +# TYPE test_metric3 gauge +# HELP test_metric3 this represents tricky case of "broken" text that is not trivial to detect +test_metric3_metric4{foo="bar"} 2 +# EOF` + scrape2 = `# TYPE test_metric counter +# HELP test_metric different help text +test_metric_total 11 +# TYPE test_metric2 gauge +# HELP test_metric2 other help text +# UNIT test_metric2 metric2 +test_metric2{foo="bar"} 22 +# EOF` + ) + + // Create an appender for adding samples to the storage. + capp := &collectResultAppender{next: nopAppender{}} + sl := newBasicScrapeLoop(t, context.Background(), nil, func(ctx context.Context) storage.Appender { return capp }, 0) + + now := time.Now() + slApp := sl.appender(context.Background()) + _, _, _, err := sl.append(slApp, []byte(scrape1), "application/openmetrics-text", now) + require.NoError(t, err) + require.NoError(t, slApp.Commit()) + testutil.RequireEqualWithOptions(t, []metadataEntry{ + {metric: labels.FromStrings("__name__", "test_metric_total"), m: metadata.Metadata{Type: "counter", Unit: "metric", Help: "some help text"}}, + {metric: labels.FromStrings("__name__", "test_metric2", "foo", "bar"), m: metadata.Metadata{Type: "gauge", Unit: "", Help: "other help text"}}, + }, capp.resultMetadata, []cmp.Option{cmp.Comparer(metadataEntryEqual)}) + capp.resultMetadata = nil + + // Next (the same) scrape should not add new metadata entries. 
+ slApp = sl.appender(context.Background()) + _, _, _, err = sl.append(slApp, []byte(scrape1), "application/openmetrics-text", now.Add(15*time.Second)) + require.NoError(t, err) + require.NoError(t, slApp.Commit()) + testutil.RequireEqualWithOptions(t, []metadataEntry(nil), capp.resultMetadata, []cmp.Option{cmp.Comparer(metadataEntryEqual)}) + + slApp = sl.appender(context.Background()) + _, _, _, err = sl.append(slApp, []byte(scrape2), "application/openmetrics-text", now.Add(15*time.Second)) + require.NoError(t, err) + require.NoError(t, slApp.Commit()) + testutil.RequireEqualWithOptions(t, []metadataEntry{ + {metric: labels.FromStrings("__name__", "test_metric_total"), m: metadata.Metadata{Type: "counter", Unit: "metric", Help: "different help text"}}, // Here, technically we should have no unit, but it's a known limitation of the current implementation. + {metric: labels.FromStrings("__name__", "test_metric2", "foo", "bar"), m: metadata.Metadata{Type: "gauge", Unit: "metric2", Help: "other help text"}}, + }, capp.resultMetadata, []cmp.Option{cmp.Comparer(metadataEntryEqual)}) +} + +type nopScraper struct { + scraper +} + +func (n nopScraper) Report(start time.Time, dur time.Duration, err error) {} + +func TestScrapeReportMetadataUpdate(t *testing.T) { + // Create an appender for adding samples to the storage. + capp := &collectResultAppender{next: nopAppender{}} + sl := newBasicScrapeLoop(t, context.Background(), nopScraper{}, func(ctx context.Context) storage.Appender { return capp }, 0) + now := time.Now() + slApp := sl.appender(context.Background()) + + require.NoError(t, sl.report(slApp, now, 2*time.Second, 1, 1, 1, 512, nil)) + require.NoError(t, slApp.Commit()) + testutil.RequireEqualWithOptions(t, []metadataEntry{ + {metric: labels.FromStrings("__name__", "up"), m: scrapeHealthMetric.Metadata}, + {metric: labels.FromStrings("__name__", "scrape_duration_seconds"), m: scrapeDurationMetric.Metadata}, + {metric: labels.FromStrings("__name__", "scrape_samples_scraped"), m: scrapeSamplesMetric.Metadata}, + {metric: labels.FromStrings("__name__", "scrape_samples_post_metric_relabeling"), m: samplesPostRelabelMetric.Metadata}, + {metric: labels.FromStrings("__name__", "scrape_series_added"), m: scrapeSeriesAddedMetric.Metadata}, + }, capp.resultMetadata, []cmp.Option{cmp.Comparer(metadataEntryEqual)}) +} + +func TestIsSeriesPartOfFamily(t *testing.T) { + t.Run("counter", func(t *testing.T) { + require.True(t, isSeriesPartOfFamily("http_requests_total", []byte("http_requests_total"), model.MetricTypeCounter)) // Prometheus text style. + require.True(t, isSeriesPartOfFamily("http_requests_total", []byte("http_requests"), model.MetricTypeCounter)) // OM text style. + require.True(t, isSeriesPartOfFamily("http_requests_total", []byte("http_requests_total"), model.MetricTypeUnknown)) + + require.False(t, isSeriesPartOfFamily("http_requests_total", []byte("http_requests"), model.MetricTypeUnknown)) // We don't know. 
+ require.False(t, isSeriesPartOfFamily("http_requests2_total", []byte("http_requests_total"), model.MetricTypeCounter)) + require.False(t, isSeriesPartOfFamily("http_requests_requests_total", []byte("http_requests"), model.MetricTypeCounter)) + }) + + t.Run("gauge", func(t *testing.T) { + require.True(t, isSeriesPartOfFamily("http_requests_count", []byte("http_requests_count"), model.MetricTypeGauge)) + require.True(t, isSeriesPartOfFamily("http_requests_count", []byte("http_requests_count"), model.MetricTypeUnknown)) + + require.False(t, isSeriesPartOfFamily("http_requests_count2", []byte("http_requests_count"), model.MetricTypeCounter)) + }) + + t.Run("histogram", func(t *testing.T) { + require.True(t, isSeriesPartOfFamily("http_requests_seconds_sum", []byte("http_requests_seconds"), model.MetricTypeHistogram)) + require.True(t, isSeriesPartOfFamily("http_requests_seconds_count", []byte("http_requests_seconds"), model.MetricTypeHistogram)) + require.True(t, isSeriesPartOfFamily("http_requests_seconds_bucket", []byte("http_requests_seconds"), model.MetricTypeHistogram)) + require.True(t, isSeriesPartOfFamily("http_requests_seconds", []byte("http_requests_seconds"), model.MetricTypeHistogram)) + + require.False(t, isSeriesPartOfFamily("http_requests_seconds_sum", []byte("http_requests_seconds"), model.MetricTypeUnknown)) // We don't know. + require.False(t, isSeriesPartOfFamily("http_requests_seconds2_sum", []byte("http_requests_seconds"), model.MetricTypeHistogram)) + }) + + t.Run("summary", func(t *testing.T) { + require.True(t, isSeriesPartOfFamily("http_requests_seconds_sum", []byte("http_requests_seconds"), model.MetricTypeSummary)) + require.True(t, isSeriesPartOfFamily("http_requests_seconds_count", []byte("http_requests_seconds"), model.MetricTypeSummary)) + require.True(t, isSeriesPartOfFamily("http_requests_seconds", []byte("http_requests_seconds"), model.MetricTypeSummary)) + + require.False(t, isSeriesPartOfFamily("http_requests_seconds_sum", []byte("http_requests_seconds"), model.MetricTypeUnknown)) // We don't know. + require.False(t, isSeriesPartOfFamily("http_requests_seconds2_sum", []byte("http_requests_seconds"), model.MetricTypeSummary)) + }) + + t.Run("info", func(t *testing.T) { + require.True(t, isSeriesPartOfFamily("go_build_info", []byte("go_build_info"), model.MetricTypeInfo)) // Prometheus text style. + require.True(t, isSeriesPartOfFamily("go_build_info", []byte("go_build"), model.MetricTypeInfo)) // OM text style. + require.True(t, isSeriesPartOfFamily("go_build_info", []byte("go_build_info"), model.MetricTypeUnknown)) + + require.False(t, isSeriesPartOfFamily("go_build_info", []byte("go_build"), model.MetricTypeUnknown)) // We don't know. 
+ require.False(t, isSeriesPartOfFamily("go_build2_info", []byte("go_build_info"), model.MetricTypeInfo)) + require.False(t, isSeriesPartOfFamily("go_build_build_info", []byte("go_build_info"), model.MetricTypeInfo)) + }) +} + func TestDroppedTargetsList(t *testing.T) { var ( app = &nopAppendable{} @@ -824,7 +957,7 @@ func newBasicScrapeLoopWithFallback(t testing.TB, ctx context.Context, scraper s false, false, false, - false, + true, nil, false, newTestScrapeMetrics(t), @@ -1131,7 +1264,7 @@ func TestScrapeLoopMetadata(t *testing.T) { total, _, _, err := sl.append(slApp, []byte(`# TYPE test_metric counter # HELP test_metric some help text # UNIT test_metric metric -test_metric 1 +test_metric_total 1 # TYPE test_metric_no_help gauge # HELP test_metric_no_type other help text # EOF`), "application/openmetrics-text", time.Now()) diff --git a/scrape/target.go b/scrape/target.go index 22cde01c0..4f576504f 100644 --- a/scrape/target.go +++ b/scrape/target.go @@ -78,17 +78,17 @@ func (t *Target) String() string { // MetricMetadataStore represents a storage for metadata. type MetricMetadataStore interface { ListMetadata() []MetricMetadata - GetMetadata(metric string) (MetricMetadata, bool) + GetMetadata(mfName string) (MetricMetadata, bool) SizeMetadata() int LengthMetadata() int } -// MetricMetadata is a piece of metadata for a metric. +// MetricMetadata is a piece of metadata for a metric family. type MetricMetadata struct { - Metric string - Type model.MetricType - Help string - Unit string + MetricFamily string + Type model.MetricType + Help string + Unit string } func (t *Target) ListMetadata() []MetricMetadata { @@ -124,14 +124,14 @@ func (t *Target) LengthMetadata() int { } // GetMetadata returns type and help metadata for the given metric. -func (t *Target) GetMetadata(metric string) (MetricMetadata, bool) { +func (t *Target) GetMetadata(mfName string) (MetricMetadata, bool) { t.mtx.RLock() defer t.mtx.RUnlock() if t.metadata == nil { return MetricMetadata{}, false } - return t.metadata.GetMetadata(metric) + return t.metadata.GetMetadata(mfName) } func (t *Target) SetMetadataStore(s MetricMetadataStore) { diff --git a/scripts/golangci-lint.yml b/scripts/golangci-lint.yml index 01b943b9b..0c00c410a 100644 --- a/scripts/golangci-lint.yml +++ b/scripts/golangci-lint.yml @@ -36,4 +36,4 @@ jobs: uses: golangci/golangci-lint-action@971e284b6050e8a5849b72094c50ab08da042db8 # v6.1.1 with: args: --verbose - version: v1.62.0 + version: v1.63.4 diff --git a/storage/remote/client.go b/storage/remote/client.go index ad766be9b..2538ee90a 100644 --- a/storage/remote/client.go +++ b/storage/remote/client.go @@ -81,8 +81,8 @@ var ( remoteReadQueriesTotal = prometheus.NewCounterVec( prometheus.CounterOpts{ Namespace: namespace, - Subsystem: subsystem, - Name: "read_queries_total", + Subsystem: "remote_read_client", + Name: "queries_total", Help: "The total number of remote read queries.", }, []string{remoteName, endpoint, "response_type", "code"}, @@ -90,8 +90,8 @@ var ( remoteReadQueries = prometheus.NewGaugeVec( prometheus.GaugeOpts{ Namespace: namespace, - Subsystem: subsystem, - Name: "remote_read_queries", + Subsystem: "remote_read_client", + Name: "queries", Help: "The number of in-flight remote read queries.", }, []string{remoteName, endpoint}, @@ -99,8 +99,8 @@ var ( remoteReadQueryDuration = prometheus.NewHistogramVec( prometheus.HistogramOpts{ Namespace: namespace, - Subsystem: subsystem, - Name: "read_request_duration_seconds", + Subsystem: "remote_read_client", + Name: 
"request_duration_seconds", Help: "Histogram of the latency for remote read requests. Note that for streamed responses this is only the duration of the initial call and does not include the processing of the stream.", Buckets: append(prometheus.DefBuckets, 25, 60), NativeHistogramBucketFactor: 1.1, diff --git a/storage/remote/metadata_watcher_test.go b/storage/remote/metadata_watcher_test.go index ce9b9d022..d939ef8ef 100644 --- a/storage/remote/metadata_watcher_test.go +++ b/storage/remote/metadata_watcher_test.go @@ -40,9 +40,9 @@ func (s *TestMetaStore) ListMetadata() []scrape.MetricMetadata { return s.Metadata } -func (s *TestMetaStore) GetMetadata(metric string) (scrape.MetricMetadata, bool) { +func (s *TestMetaStore) GetMetadata(mfName string) (scrape.MetricMetadata, bool) { for _, m := range s.Metadata { - if metric == m.Metric { + if mfName == m.MetricFamily { return m, true } } @@ -106,26 +106,26 @@ func TestWatchScrapeManager_ReadyForCollection(t *testing.T) { metadata := &TestMetaStore{ Metadata: []scrape.MetricMetadata{ { - Metric: "prometheus_tsdb_head_chunks_created_total", - Type: model.MetricTypeCounter, - Help: "Total number", - Unit: "", + MetricFamily: "prometheus_tsdb_head_chunks_created", + Type: model.MetricTypeCounter, + Help: "Total number", + Unit: "", }, { - Metric: "prometheus_remote_storage_retried_samples_total", - Type: model.MetricTypeCounter, - Help: "Total number", - Unit: "", + MetricFamily: "prometheus_remote_storage_retried_samples", + Type: model.MetricTypeCounter, + Help: "Total number", + Unit: "", }, }, } metadataDup := &TestMetaStore{ Metadata: []scrape.MetricMetadata{ { - Metric: "prometheus_tsdb_head_chunks_created_total", - Type: model.MetricTypeCounter, - Help: "Total number", - Unit: "", + MetricFamily: "prometheus_tsdb_head_chunks_created", + Type: model.MetricTypeCounter, + Help: "Total number", + Unit: "", }, }, } diff --git a/storage/remote/queue_manager.go b/storage/remote/queue_manager.go index 4b966059f..b274707bf 100644 --- a/storage/remote/queue_manager.go +++ b/storage/remote/queue_manager.go @@ -550,7 +550,7 @@ func (t *QueueManager) AppendWatcherMetadata(ctx context.Context, metadata []scr mm := make([]prompb.MetricMetadata, 0, len(metadata)) for _, entry := range metadata { mm = append(mm, prompb.MetricMetadata{ - MetricFamilyName: entry.Metric, + MetricFamilyName: entry.MetricFamily, Help: entry.Help, Type: prompb.FromMetadataType(entry.Type), Unit: entry.Unit, @@ -1919,12 +1919,17 @@ func populateV2TimeSeries(symbolTable *writev2.SymbolsTable, batch []timeSeries, var nPendingSamples, nPendingExemplars, nPendingHistograms, nPendingMetadata int for nPending, d := range batch { pendingData[nPending].Samples = pendingData[nPending].Samples[:0] - // todo: should we also safeguard against empty metadata here? if d.metadata != nil { pendingData[nPending].Metadata.Type = writev2.FromMetadataType(d.metadata.Type) pendingData[nPending].Metadata.HelpRef = symbolTable.Symbolize(d.metadata.Help) - pendingData[nPending].Metadata.HelpRef = symbolTable.Symbolize(d.metadata.Unit) + pendingData[nPending].Metadata.UnitRef = symbolTable.Symbolize(d.metadata.Unit) nPendingMetadata++ + } else { + // Safeguard against sending garbage in case of not having metadata + // for whatever reason. 
+ pendingData[nPending].Metadata.Type = writev2.Metadata_METRIC_TYPE_UNSPECIFIED + pendingData[nPending].Metadata.HelpRef = 0 + pendingData[nPending].Metadata.UnitRef = 0 } if sendExemplars { diff --git a/storage/remote/queue_manager_test.go b/storage/remote/queue_manager_test.go index 202c71c34..38eda81d9 100644 --- a/storage/remote/queue_manager_test.go +++ b/storage/remote/queue_manager_test.go @@ -342,10 +342,10 @@ func TestMetadataDelivery(t *testing.T) { numMetadata := 1532 for i := 0; i < numMetadata; i++ { metadata = append(metadata, scrape.MetricMetadata{ - Metric: "prometheus_remote_storage_sent_metadata_bytes_total_" + strconv.Itoa(i), - Type: model.MetricTypeCounter, - Help: "a nice help text", - Unit: "", + MetricFamily: "prometheus_remote_storage_sent_metadata_bytes_" + strconv.Itoa(i), + Type: model.MetricTypeCounter, + Help: "a nice help text", + Unit: "", }) } @@ -357,7 +357,7 @@ func TestMetadataDelivery(t *testing.T) { // fit into MaxSamplesPerSend. require.Equal(t, numMetadata/config.DefaultMetadataConfig.MaxSamplesPerSend+1, c.writesReceived) // Make sure the last samples were sent. - require.Equal(t, c.receivedMetadata[metadata[len(metadata)-1].Metric][0].MetricFamilyName, metadata[len(metadata)-1].Metric) + require.Equal(t, c.receivedMetadata[metadata[len(metadata)-1].MetricFamily][0].MetricFamilyName, metadata[len(metadata)-1].MetricFamily) } func TestWALMetadataDelivery(t *testing.T) { diff --git a/storage/remote/read_handler.go b/storage/remote/read_handler.go index 8f2945f97..3e315a615 100644 --- a/storage/remote/read_handler.go +++ b/storage/remote/read_handler.go @@ -56,10 +56,10 @@ func NewReadHandler(logger *slog.Logger, r prometheus.Registerer, queryable stor marshalPool: &sync.Pool{}, queries: prometheus.NewGauge(prometheus.GaugeOpts{ - Namespace: "prometheus", - Subsystem: "api", // TODO: changes to storage in Prometheus 3.0. - Name: "remote_read_queries", - Help: "The current number of remote read queries being executed or waiting.", + Namespace: namespace, + Subsystem: "remote_read_handler", + Name: "queries", + Help: "The current number of remote read queries that are either in execution or queued on the handler.", }), } if r != nil { diff --git a/web/api/v1/api.go b/web/api/v1/api.go index 7a1d9fb62..729c7bffc 100644 --- a/web/api/v1/api.go +++ b/web/api/v1/api.go @@ -68,6 +68,9 @@ const ( // Non-standard status code (originally introduced by nginx) for the case when a client closes // the connection while the server is still processing the request. statusClientClosedConnection = 499 + + // checkContextEveryNIterations is used in some tight loops to check if the context is done. 
+ checkContextEveryNIterations = 128 ) type errorType string @@ -963,10 +966,15 @@ func (api *API) series(r *http.Request) (result apiFuncResult) { warnings := set.Warnings() + i := 1 for set.Next() { - if err := ctx.Err(); err != nil { - return apiFuncResult{nil, returnAPIError(err), warnings, closer} + if i%checkContextEveryNIterations == 0 { + if err := ctx.Err(); err != nil { + return apiFuncResult{nil, returnAPIError(err), warnings, closer} + } } + i++ + metrics = append(metrics, set.At().Labels()) if limit > 0 && len(metrics) > limit { @@ -1229,11 +1237,11 @@ func (api *API) targetMetadata(r *http.Request) apiFuncResult { if metric == "" { for _, md := range t.ListMetadata() { res = append(res, metricMetadata{ - Target: targetLabels, - Metric: md.Metric, - Type: md.Type, - Help: md.Help, - Unit: md.Unit, + Target: targetLabels, + MetricFamily: md.MetricFamily, + Type: md.Type, + Help: md.Help, + Unit: md.Unit, }) } continue @@ -1254,11 +1262,11 @@ func (api *API) targetMetadata(r *http.Request) apiFuncResult { } type metricMetadata struct { - Target labels.Labels `json:"target"` - Metric string `json:"metric,omitempty"` - Type model.MetricType `json:"type"` - Help string `json:"help"` - Unit string `json:"unit"` + Target labels.Labels `json:"target"` + MetricFamily string `json:"metric,omitempty"` + Type model.MetricType `json:"type"` + Help string `json:"help"` + Unit string `json:"unit"` } // AlertmanagerDiscovery has all the active Alertmanagers. @@ -1358,7 +1366,7 @@ func (api *API) metricMetadata(r *http.Request) apiFuncResult { if metric == "" { for _, mm := range t.ListMetadata() { m := metadata.Metadata{Type: mm.Type, Help: mm.Help, Unit: mm.Unit} - ms, ok := metrics[mm.Metric] + ms, ok := metrics[mm.MetricFamily] if limitPerMetric > 0 && len(ms) >= limitPerMetric { continue @@ -1366,7 +1374,7 @@ func (api *API) metricMetadata(r *http.Request) apiFuncResult { if !ok { ms = map[metadata.Metadata]struct{}{} - metrics[mm.Metric] = ms + metrics[mm.MetricFamily] = ms } ms[m] = struct{}{} } @@ -1375,7 +1383,7 @@ func (api *API) metricMetadata(r *http.Request) apiFuncResult { if md, ok := t.GetMetadata(metric); ok { m := metadata.Metadata{Type: md.Type, Help: md.Help, Unit: md.Unit} - ms, ok := metrics[md.Metric] + ms, ok := metrics[md.MetricFamily] if limitPerMetric > 0 && len(ms) >= limitPerMetric { continue @@ -1383,7 +1391,7 @@ func (api *API) metricMetadata(r *http.Request) apiFuncResult { if !ok { ms = map[metadata.Metadata]struct{}{} - metrics[md.Metric] = ms + metrics[md.MetricFamily] = ms } ms[m] = struct{}{} } diff --git a/web/api/v1/api_test.go b/web/api/v1/api_test.go index 37227d849..f9bdbe394 100644 --- a/web/api/v1/api_test.go +++ b/web/api/v1/api_test.go @@ -84,7 +84,7 @@ func (s *testMetaStore) ListMetadata() []scrape.MetricMetadata { func (s *testMetaStore) GetMetadata(metric string) (scrape.MetricMetadata, bool) { for _, m := range s.Metadata { - if metric == m.Metric { + if metric == m.MetricFamily { return m, true } } @@ -1891,10 +1891,10 @@ func testEndpoints(t *testing.T, api *API, tr *testTargetRetriever, es storage.E identifier: "test", metadata: []scrape.MetricMetadata{ { - Metric: "go_threads", - Type: model.MetricTypeGauge, - Help: "Number of OS threads created.", - Unit: "", + MetricFamily: "go_threads", + Type: model.MetricTypeGauge, + Help: "Number of OS threads created.", + Unit: "", }, }, }, @@ -1921,10 +1921,10 @@ func testEndpoints(t *testing.T, api *API, tr *testTargetRetriever, es storage.E identifier: "blackbox", metadata: 
[]scrape.MetricMetadata{ { - Metric: "prometheus_tsdb_storage_blocks_bytes", - Type: model.MetricTypeGauge, - Help: "The number of bytes that are currently used for local storage by all blocks.", - Unit: "", + MetricFamily: "prometheus_tsdb_storage_blocks_bytes", + Type: model.MetricTypeGauge, + Help: "The number of bytes that are currently used for local storage by all blocks.", + Unit: "", }, }, }, @@ -1934,10 +1934,10 @@ func testEndpoints(t *testing.T, api *API, tr *testTargetRetriever, es storage.E Target: labels.FromMap(map[string]string{ "job": "blackbox", }), - Metric: "prometheus_tsdb_storage_blocks_bytes", - Help: "The number of bytes that are currently used for local storage by all blocks.", - Type: model.MetricTypeGauge, - Unit: "", + MetricFamily: "prometheus_tsdb_storage_blocks_bytes", + Help: "The number of bytes that are currently used for local storage by all blocks.", + Type: model.MetricTypeGauge, + Unit: "", }, }, }, @@ -1949,10 +1949,10 @@ func testEndpoints(t *testing.T, api *API, tr *testTargetRetriever, es storage.E identifier: "test", metadata: []scrape.MetricMetadata{ { - Metric: "go_threads", - Type: model.MetricTypeGauge, - Help: "Number of OS threads created.", - Unit: "", + MetricFamily: "go_threads", + Type: model.MetricTypeGauge, + Help: "Number of OS threads created.", + Unit: "", }, }, }, @@ -1960,10 +1960,10 @@ func testEndpoints(t *testing.T, api *API, tr *testTargetRetriever, es storage.E identifier: "blackbox", metadata: []scrape.MetricMetadata{ { - Metric: "prometheus_tsdb_storage_blocks_bytes", - Type: model.MetricTypeGauge, - Help: "The number of bytes that are currently used for local storage by all blocks.", - Unit: "", + MetricFamily: "prometheus_tsdb_storage_blocks_bytes", + Type: model.MetricTypeGauge, + Help: "The number of bytes that are currently used for local storage by all blocks.", + Unit: "", }, }, }, @@ -1973,25 +1973,25 @@ func testEndpoints(t *testing.T, api *API, tr *testTargetRetriever, es storage.E Target: labels.FromMap(map[string]string{ "job": "test", }), - Metric: "go_threads", - Help: "Number of OS threads created.", - Type: model.MetricTypeGauge, - Unit: "", + MetricFamily: "go_threads", + Help: "Number of OS threads created.", + Type: model.MetricTypeGauge, + Unit: "", }, { Target: labels.FromMap(map[string]string{ "job": "blackbox", }), - Metric: "prometheus_tsdb_storage_blocks_bytes", - Help: "The number of bytes that are currently used for local storage by all blocks.", - Type: model.MetricTypeGauge, - Unit: "", + MetricFamily: "prometheus_tsdb_storage_blocks_bytes", + Help: "The number of bytes that are currently used for local storage by all blocks.", + Type: model.MetricTypeGauge, + Unit: "", }, }, sorter: func(m interface{}) { sort.Slice(m.([]metricMetadata), func(i, j int) bool { s := m.([]metricMetadata) - return s[i].Metric < s[j].Metric + return s[i].MetricFamily < s[j].MetricFamily }) }, }, @@ -2026,16 +2026,16 @@ func testEndpoints(t *testing.T, api *API, tr *testTargetRetriever, es storage.E identifier: "test", metadata: []scrape.MetricMetadata{ { - Metric: "prometheus_engine_query_duration_seconds", - Type: model.MetricTypeSummary, - Help: "Query timings", - Unit: "", + MetricFamily: "prometheus_engine_query_duration_seconds", + Type: model.MetricTypeSummary, + Help: "Query timings", + Unit: "", }, { - Metric: "go_info", - Type: model.MetricTypeGauge, - Help: "Information about the Go environment.", - Unit: "", + MetricFamily: "go_info", + Type: model.MetricTypeGauge, + Help: "Information about the Go 
environment.", + Unit: "", }, }, }, @@ -2056,10 +2056,10 @@ func testEndpoints(t *testing.T, api *API, tr *testTargetRetriever, es storage.E identifier: "test", metadata: []scrape.MetricMetadata{ { - Metric: "go_threads", - Type: model.MetricTypeGauge, - Help: "Number of OS threads created", - Unit: "", + MetricFamily: "go_threads", + Type: model.MetricTypeGauge, + Help: "Number of OS threads created", + Unit: "", }, }, }, @@ -2067,10 +2067,10 @@ func testEndpoints(t *testing.T, api *API, tr *testTargetRetriever, es storage.E identifier: "blackbox", metadata: []scrape.MetricMetadata{ { - Metric: "go_threads", - Type: model.MetricTypeGauge, - Help: "Number of OS threads created", - Unit: "", + MetricFamily: "go_threads", + Type: model.MetricTypeGauge, + Help: "Number of OS threads created", + Unit: "", }, }, }, @@ -2089,10 +2089,10 @@ func testEndpoints(t *testing.T, api *API, tr *testTargetRetriever, es storage.E identifier: "test", metadata: []scrape.MetricMetadata{ { - Metric: "go_threads", - Type: model.MetricTypeGauge, - Help: "Number of OS threads created", - Unit: "", + MetricFamily: "go_threads", + Type: model.MetricTypeGauge, + Help: "Number of OS threads created", + Unit: "", }, }, }, @@ -2100,10 +2100,10 @@ func testEndpoints(t *testing.T, api *API, tr *testTargetRetriever, es storage.E identifier: "blackbox", metadata: []scrape.MetricMetadata{ { - Metric: "go_threads", - Type: model.MetricTypeGauge, - Help: "Number of OS threads that were created.", - Unit: "", + MetricFamily: "go_threads", + Type: model.MetricTypeGauge, + Help: "Number of OS threads that were created.", + Unit: "", }, }, }, @@ -2136,16 +2136,16 @@ func testEndpoints(t *testing.T, api *API, tr *testTargetRetriever, es storage.E identifier: "test", metadata: []scrape.MetricMetadata{ { - Metric: "go_threads", - Type: model.MetricTypeGauge, - Help: "Number of OS threads created", - Unit: "", + MetricFamily: "go_threads", + Type: model.MetricTypeGauge, + Help: "Number of OS threads created", + Unit: "", }, { - Metric: "prometheus_engine_query_duration_seconds", - Type: model.MetricTypeSummary, - Help: "Query Timings.", - Unit: "", + MetricFamily: "prometheus_engine_query_duration_seconds", + Type: model.MetricTypeSummary, + Help: "Query Timings.", + Unit: "", }, }, }, @@ -2153,10 +2153,10 @@ func testEndpoints(t *testing.T, api *API, tr *testTargetRetriever, es storage.E identifier: "blackbox", metadata: []scrape.MetricMetadata{ { - Metric: "go_gc_duration_seconds", - Type: model.MetricTypeSummary, - Help: "A summary of the GC invocation durations.", - Unit: "", + MetricFamily: "go_gc_duration_seconds", + Type: model.MetricTypeSummary, + Help: "A summary of the GC invocation durations.", + Unit: "", }, }, }, @@ -2172,22 +2172,22 @@ func testEndpoints(t *testing.T, api *API, tr *testTargetRetriever, es storage.E identifier: "test", metadata: []scrape.MetricMetadata{ { - Metric: "go_threads", - Type: model.MetricTypeGauge, - Help: "Number of OS threads created", - Unit: "", + MetricFamily: "go_threads", + Type: model.MetricTypeGauge, + Help: "Number of OS threads created", + Unit: "", }, { - Metric: "go_threads", - Type: model.MetricTypeGauge, - Help: "Repeated metadata", - Unit: "", + MetricFamily: "go_threads", + Type: model.MetricTypeGauge, + Help: "Repeated metadata", + Unit: "", }, { - Metric: "go_gc_duration_seconds", - Type: model.MetricTypeSummary, - Help: "A summary of the GC invocation durations.", - Unit: "", + MetricFamily: "go_gc_duration_seconds", + Type: model.MetricTypeSummary, + Help: "A summary of 
the GC invocation durations.", + Unit: "", }, }, }, @@ -2211,22 +2211,22 @@ func testEndpoints(t *testing.T, api *API, tr *testTargetRetriever, es storage.E identifier: "test", metadata: []scrape.MetricMetadata{ { - Metric: "go_threads", - Type: model.MetricTypeGauge, - Help: "Number of OS threads created", - Unit: "", + MetricFamily: "go_threads", + Type: model.MetricTypeGauge, + Help: "Number of OS threads created", + Unit: "", }, { - Metric: "go_threads", - Type: model.MetricTypeGauge, - Help: "Repeated metadata", - Unit: "", + MetricFamily: "go_threads", + Type: model.MetricTypeGauge, + Help: "Repeated metadata", + Unit: "", }, { - Metric: "go_gc_duration_seconds", - Type: model.MetricTypeSummary, - Help: "A summary of the GC invocation durations.", - Unit: "", + MetricFamily: "go_gc_duration_seconds", + Type: model.MetricTypeSummary, + Help: "A summary of the GC invocation durations.", + Unit: "", }, }, }, @@ -2244,22 +2244,22 @@ func testEndpoints(t *testing.T, api *API, tr *testTargetRetriever, es storage.E identifier: "test", metadata: []scrape.MetricMetadata{ { - Metric: "go_threads", - Type: model.MetricTypeGauge, - Help: "Number of OS threads created", - Unit: "", + MetricFamily: "go_threads", + Type: model.MetricTypeGauge, + Help: "Number of OS threads created", + Unit: "", }, { - Metric: "go_threads", - Type: model.MetricTypeGauge, - Help: "Repeated metadata", - Unit: "", + MetricFamily: "go_threads", + Type: model.MetricTypeGauge, + Help: "Repeated metadata", + Unit: "", }, { - Metric: "go_gc_duration_seconds", - Type: model.MetricTypeSummary, - Help: "A summary of the GC invocation durations.", - Unit: "", + MetricFamily: "go_gc_duration_seconds", + Type: model.MetricTypeSummary, + Help: "A summary of the GC invocation durations.", + Unit: "", }, }, }, @@ -2267,16 +2267,16 @@ func testEndpoints(t *testing.T, api *API, tr *testTargetRetriever, es storage.E identifier: "secondTarget", metadata: []scrape.MetricMetadata{ { - Metric: "go_threads", - Type: model.MetricTypeGauge, - Help: "Number of OS threads created, but from a different target", - Unit: "", + MetricFamily: "go_threads", + Type: model.MetricTypeGauge, + Help: "Number of OS threads created, but from a different target", + Unit: "", }, { - Metric: "go_gc_duration_seconds", - Type: model.MetricTypeSummary, - Help: "A summary of the GC invocation durations, but from a different target.", - Unit: "", + MetricFamily: "go_gc_duration_seconds", + Type: model.MetricTypeSummary, + Help: "A summary of the GC invocation durations, but from a different target.", + Unit: "", }, }, }, @@ -2293,10 +2293,10 @@ func testEndpoints(t *testing.T, api *API, tr *testTargetRetriever, es storage.E identifier: "test", metadata: []scrape.MetricMetadata{ { - Metric: "go_threads", - Type: model.MetricTypeGauge, - Help: "Number of OS threads created", - Unit: "", + MetricFamily: "go_threads", + Type: model.MetricTypeGauge, + Help: "Number of OS threads created", + Unit: "", }, }, }, @@ -2304,16 +2304,16 @@ func testEndpoints(t *testing.T, api *API, tr *testTargetRetriever, es storage.E identifier: "blackbox", metadata: []scrape.MetricMetadata{ { - Metric: "go_gc_duration_seconds", - Type: model.MetricTypeSummary, - Help: "A summary of the GC invocation durations.", - Unit: "", + MetricFamily: "go_gc_duration_seconds", + Type: model.MetricTypeSummary, + Help: "A summary of the GC invocation durations.", + Unit: "", }, { - Metric: "go_threads", - Type: model.MetricTypeGauge, - Help: "Number of OS threads that were created.", - Unit: "", + 
MetricFamily: "go_threads", + Type: model.MetricTypeGauge, + Help: "Number of OS threads that were created.", + Unit: "", }, }, }, @@ -2342,10 +2342,10 @@ func testEndpoints(t *testing.T, api *API, tr *testTargetRetriever, es storage.E identifier: "test", metadata: []scrape.MetricMetadata{ { - Metric: "go_threads", - Type: model.MetricTypeGauge, - Help: "Number of OS threads created", - Unit: "", + MetricFamily: "go_threads", + Type: model.MetricTypeGauge, + Help: "Number of OS threads created", + Unit: "", }, }, },