From 444fe55181b7686e92173edc52eeb5473bb2920e Mon Sep 17 00:00:00 2001
From: George Krajcsovits <krajorama@users.noreply.github.com>
Date: Wed, 27 Nov 2024 18:31:16 +0100
Subject: [PATCH] validation: allow more labels for _info metrics by default
 (#10028)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* validation: allow more labels for _info metrics by default

Info metrics are generally low cardinality even though they carry many
informational labels. Allowing more labels on them by default makes
conversion from OTLP attributes easier.

Signed-off-by: György Krajcsovits <gyorgy.krajcsovits@grafana.com>
Co-authored-by: Arve Knudsen <arve.knudsen@gmail.com>
Co-authored-by: Taylor C <41653732+tacole02@users.noreply.github.com>
---
 CHANGELOG.md                                  |  1 +
 cmd/mimir/config-descriptor.json              | 10 +++++
 cmd/mimir/help-all.txt.tmpl                   |  2 +
 cmd/mimir/help.txt.tmpl                       |  2 +
 .../configuration-parameters/index.md         |  6 +++
 .../mimir/manage/mimir-runbooks/_index.md     | 10 +++++
 pkg/distributor/validate.go                   | 28 ++++++++++++--
 pkg/distributor/validate_test.go              | 38 +++++++++++++++++--
 pkg/util/globalerror/user.go                  |  1 +
 pkg/util/validation/limits.go                 |  8 ++++
 10 files changed, 99 insertions(+), 7 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 5e6fcdf3fb7..168005ff262 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -75,6 +75,7 @@
 * [ENHANCEMENT] Querier: improve performance and memory consumption of queries that select many series. #9914
 * [ENHANCEMENT] Ruler: Support OAuth2 and proxies in Alertmanager client #9945
 * [ENHANCEMENT] Ingester: Build 24h blocks for older OOO. #9844
+* [ENHANCEMENT] Distributor: allow a different limit for info series (series ending in `_info`) label count, via `-validation.max-label-names-per-info-series`. #10028
 * [BUGFIX] Fix issue where functions such as `rate()` over native histograms could return incorrect values if a float stale marker was present in the selected range. #9508
 * [BUGFIX] Fix issue where negation of native histograms (eg. `-some_native_histogram_series`) did nothing. #9508
 * [BUGFIX] Fix issue where `metric might not be a counter, name does not end in _total/_sum/_count/_bucket` annotation would be emitted even if `rate` or `increase` did not have enough samples to compute a result. #9508
diff --git a/cmd/mimir/config-descriptor.json b/cmd/mimir/config-descriptor.json
index 798dad46ed5..59da3ccead2 100644
--- a/cmd/mimir/config-descriptor.json
+++ b/cmd/mimir/config-descriptor.json
@@ -3795,6 +3795,16 @@
           "fieldFlag": "validation.max-label-names-per-series",
           "fieldType": "int"
         },
+        {
+          "kind": "field",
+          "name": "max_label_names_per_info_series",
+          "required": false,
+          "desc": "Maximum number of label names per info series. Has no effect if less than the value of the maximum number of label names per series option (-validation.max-label-names-per-series)",
+          "fieldValue": null,
+          "fieldDefaultValue": 80,
+          "fieldFlag": "validation.max-label-names-per-info-series",
+          "fieldType": "int"
+        },
         {
           "kind": "field",
           "name": "max_metadata_length",
diff --git a/cmd/mimir/help-all.txt.tmpl b/cmd/mimir/help-all.txt.tmpl
index cb7cc339794..3f59650dc03 100644
--- a/cmd/mimir/help-all.txt.tmpl
+++ b/cmd/mimir/help-all.txt.tmpl
@@ -3317,6 +3317,8 @@ Usage of ./cmd/mimir/mimir:
     	Controls how far into the future incoming samples and exemplars are accepted compared to the wall clock. Any sample or exemplar will be rejected if its timestamp is greater than '(now + creation_grace_period)'. This configuration is enforced in the distributor and ingester. (default 10m)
   -validation.enforce-metadata-metric-name
     	Enforce every metadata has a metric name. (default true)
+  -validation.max-label-names-per-info-series int
+    	Maximum number of label names per info series. Has no effect if less than the value of the maximum number of label names per series option (-validation.max-label-names-per-series) (default 80)
   -validation.max-label-names-per-series int
     	Maximum number of label names per series. (default 30)
   -validation.max-length-label-name int
diff --git a/cmd/mimir/help.txt.tmpl b/cmd/mimir/help.txt.tmpl
index f0f3b3eba43..665632c8491 100644
--- a/cmd/mimir/help.txt.tmpl
+++ b/cmd/mimir/help.txt.tmpl
@@ -861,6 +861,8 @@ Usage of ./cmd/mimir/mimir:
     	Enable anonymous usage reporting. (default true)
   -usage-stats.installation-mode string
     	Installation mode. Supported values: custom, helm, jsonnet. (default "custom")
+  -validation.max-label-names-per-info-series int
+    	Maximum number of label names per info series. Has no effect if less than the value of the maximum number of label names per series option (-validation.max-label-names-per-series) (default 80)
   -validation.max-label-names-per-series int
     	Maximum number of label names per series. (default 30)
   -validation.max-length-label-name int
diff --git a/docs/sources/mimir/configure/configuration-parameters/index.md b/docs/sources/mimir/configure/configuration-parameters/index.md
index 8a06067d199..1b1a7c8cc3a 100644
--- a/docs/sources/mimir/configure/configuration-parameters/index.md
+++ b/docs/sources/mimir/configure/configuration-parameters/index.md
@@ -3232,6 +3232,12 @@ The `limits` block configures default and per-tenant limits imposed by component
 # CLI flag: -validation.max-label-names-per-series
 [max_label_names_per_series: <int> | default = 30]
 
+# Maximum number of label names per info series. Has no effect if less than the
+# value of the maximum number of label names per series option
+# (-validation.max-label-names-per-series)
+# CLI flag: -validation.max-label-names-per-info-series
+[max_label_names_per_info_series: <int> | default = 80]
+
 # Maximum length accepted for metric metadata. Metadata refers to Metric Name,
 # HELP and UNIT. Longer metadata is dropped except for HELP which is truncated.
 # CLI flag: -validation.max-metadata-length
diff --git a/docs/sources/mimir/manage/mimir-runbooks/_index.md b/docs/sources/mimir/manage/mimir-runbooks/_index.md
index 90997f5fdbe..0939c001e9f 100644
--- a/docs/sources/mimir/manage/mimir-runbooks/_index.md
+++ b/docs/sources/mimir/manage/mimir-runbooks/_index.md
@@ -1729,6 +1729,16 @@ The limit protects the system’s stability from potential abuse or mistakes. To
 Invalid series are skipped during the ingestion, and valid series within the same request are ingested.
 {{< /admonition >}}
 
+### err-mimir-max-label-names-per-info-series
+
+This non-critical error occurs when Mimir receives a write request that contains an info series with a number of labels that exceeds the configured limit.
+An info series is a series where the metric name ends in `_info`.
+The limit protects the system’s stability from potential abuse or mistakes. To configure the limit on a per-tenant basis, use the `-validation.max-label-names-per-info-series` option.
+
+{{< admonition type="note" >}}
+Invalid series are skipped during ingestion, and valid series in the same request are ingested.
+{{< /admonition >}}
+
 ### err-mimir-max-native-histogram-buckets
 
 This non-critical error occurs when Mimir receives a write request that contains a sample that is a native histogram that has too many observation buckets.
diff --git a/pkg/distributor/validate.go b/pkg/distributor/validate.go
index c3c9b2d1a6b..fffc943b6c0 100644
--- a/pkg/distributor/validate.go
+++ b/pkg/distributor/validate.go
@@ -33,6 +33,7 @@ var (
 	reasonMissingMetricName            = globalerror.MissingMetricName.LabelValue()
 	reasonInvalidMetricName            = globalerror.InvalidMetricName.LabelValue()
 	reasonMaxLabelNamesPerSeries       = globalerror.MaxLabelNamesPerSeries.LabelValue()
+	reasonMaxLabelNamesPerInfoSeries   = globalerror.MaxLabelNamesPerInfoSeries.LabelValue()
 	reasonInvalidLabel                 = globalerror.SeriesInvalidLabel.LabelValue()
 	reasonInvalidLabelValue            = globalerror.SeriesInvalidLabelValue.LabelValue()
 	reasonLabelNameTooLong             = globalerror.SeriesLabelNameTooLong.LabelValue()
@@ -74,10 +75,16 @@ var (
 	invalidLabelMsgFormat      = globalerror.SeriesInvalidLabel.Message("received a series with an invalid label: '%.200s' series: '%.200s'")
 	invalidLabelValueMsgFormat = globalerror.SeriesInvalidLabelValue.Message("received a series with invalid value in label '%.200s': '%.200s' metric: '%.200s'")
 	duplicateLabelMsgFormat    = globalerror.SeriesWithDuplicateLabelNames.Message("received a series with duplicate label name, label: '%.200s' series: '%.200s'")
-	tooManyLabelsMsgFormat     = globalerror.MaxLabelNamesPerSeries.MessageWithPerTenantLimitConfig(
+
+	tooManyLabelsMsgFormat = globalerror.MaxLabelNamesPerSeries.MessageWithPerTenantLimitConfig(
 		"received a series whose number of labels exceeds the limit (actual: %d, limit: %d) series: '%.200s%s'",
 		validation.MaxLabelNamesPerSeriesFlag,
 	)
+	tooManyInfoLabelsMsgFormat = globalerror.MaxLabelNamesPerInfoSeries.MessageWithPerTenantLimitConfig(
+		"received an info series whose number of labels exceeds the limit (actual: %d, limit: %d) series: '%.200s%s'",
+		validation.MaxLabelNamesPerInfoSeriesFlag,
+	)
+
 	noMetricNameMsgFormat                 = globalerror.MissingMetricName.Message("received series has no metric name")
 	invalidMetricNameMsgFormat            = globalerror.InvalidMetricName.Message("received a series with invalid metric name: '%.200s'")
 	maxNativeHistogramBucketsMsgFormat    = globalerror.MaxNativeHistogramBuckets.Message("received a native histogram sample with too many buckets, timestamp: %d series: %s, buckets: %d, limit: %d")
@@ -126,6 +133,7 @@ type sampleValidationMetrics struct {
 	missingMetricName            *prometheus.CounterVec
 	invalidMetricName            *prometheus.CounterVec
 	maxLabelNamesPerSeries       *prometheus.CounterVec
+	maxLabelNamesPerInfoSeries   *prometheus.CounterVec
 	invalidLabel                 *prometheus.CounterVec
 	invalidLabelValue            *prometheus.CounterVec
 	labelNameTooLong             *prometheus.CounterVec
@@ -142,6 +150,7 @@ func (m *sampleValidationMetrics) deleteUserMetrics(userID string) {
 	m.missingMetricName.DeletePartialMatch(filter)
 	m.invalidMetricName.DeletePartialMatch(filter)
 	m.maxLabelNamesPerSeries.DeletePartialMatch(filter)
+	m.maxLabelNamesPerInfoSeries.DeletePartialMatch(filter)
 	m.invalidLabel.DeletePartialMatch(filter)
 	m.invalidLabelValue.DeletePartialMatch(filter)
 	m.labelNameTooLong.DeletePartialMatch(filter)
@@ -157,6 +166,7 @@ func (m *sampleValidationMetrics) deleteUserMetricsForGroup(userID, group string
 	m.missingMetricName.DeleteLabelValues(userID, group)
 	m.invalidMetricName.DeleteLabelValues(userID, group)
 	m.maxLabelNamesPerSeries.DeleteLabelValues(userID, group)
+	m.maxLabelNamesPerInfoSeries.DeleteLabelValues(userID, group)
 	m.invalidLabel.DeleteLabelValues(userID, group)
 	m.invalidLabelValue.DeleteLabelValues(userID, group)
 	m.labelNameTooLong.DeleteLabelValues(userID, group)
@@ -173,6 +183,7 @@ func newSampleValidationMetrics(r prometheus.Registerer) *sampleValidationMetric
 		missingMetricName:            validation.DiscardedSamplesCounter(r, reasonMissingMetricName),
 		invalidMetricName:            validation.DiscardedSamplesCounter(r, reasonInvalidMetricName),
 		maxLabelNamesPerSeries:       validation.DiscardedSamplesCounter(r, reasonMaxLabelNamesPerSeries),
+		maxLabelNamesPerInfoSeries:   validation.DiscardedSamplesCounter(r, reasonMaxLabelNamesPerInfoSeries),
 		invalidLabel:                 validation.DiscardedSamplesCounter(r, reasonInvalidLabel),
 		invalidLabelValue:            validation.DiscardedSamplesCounter(r, reasonInvalidLabelValue),
 		labelNameTooLong:             validation.DiscardedSamplesCounter(r, reasonLabelNameTooLong),
@@ -349,6 +360,7 @@ func validateExemplarTimestamp(m *exemplarValidationMetrics, userID string, minT
 // labelValidationConfig helps with getting required config to validate labels.
 type labelValidationConfig interface {
 	MaxLabelNamesPerSeries(userID string) int
+	MaxLabelNamesPerInfoSeries(userID string) int
 	MaxLabelNameLength(userID string) int
 	MaxLabelValueLength(userID string) int
 }
@@ -387,9 +399,17 @@ func validateLabels(m *sampleValidationMetrics, cfg labelValidationConfig, userI
 	}
 
 	if !skipLabelCountValidation && len(ls) > cfg.MaxLabelNamesPerSeries(userID) {
-		m.maxLabelNamesPerSeries.WithLabelValues(userID, group).Inc()
-		metric, ellipsis := getMetricAndEllipsis(ls)
-		return fmt.Errorf(tooManyLabelsMsgFormat, len(ls), cfg.MaxLabelNamesPerSeries(userID), metric, ellipsis)
+		if strings.HasSuffix(unsafeMetricName, "_info") {
+			if len(ls) > cfg.MaxLabelNamesPerInfoSeries(userID) {
+				m.maxLabelNamesPerInfoSeries.WithLabelValues(userID, group).Inc()
+				metric, ellipsis := getMetricAndEllipsis(ls)
+				return fmt.Errorf(tooManyInfoLabelsMsgFormat, len(ls), cfg.MaxLabelNamesPerInfoSeries(userID), metric, ellipsis)
+			}
+		} else {
+			m.maxLabelNamesPerSeries.WithLabelValues(userID, group).Inc()
+			metric, ellipsis := getMetricAndEllipsis(ls)
+			return fmt.Errorf(tooManyLabelsMsgFormat, len(ls), cfg.MaxLabelNamesPerSeries(userID), metric, ellipsis)
+		}
 	}
 
 	maxLabelNameLength := cfg.MaxLabelNameLength(userID)
diff --git a/pkg/distributor/validate_test.go b/pkg/distributor/validate_test.go
index 77c674ac9b4..df4de2dd60f 100644
--- a/pkg/distributor/validate_test.go
+++ b/pkg/distributor/validate_test.go
@@ -24,15 +24,20 @@ import (
 )
 
 type validateLabelsCfg struct {
-	maxLabelNamesPerSeries int
-	maxLabelNameLength     int
-	maxLabelValueLength    int
+	maxLabelNamesPerSeries     int
+	maxLabelNamesPerInfoSeries int
+	maxLabelNameLength         int
+	maxLabelValueLength        int
 }
 
 func (v validateLabelsCfg) MaxLabelNamesPerSeries(_ string) int {
 	return v.maxLabelNamesPerSeries
 }
 
+func (v validateLabelsCfg) MaxLabelNamesPerInfoSeries(_ string) int {
+	return v.maxLabelNamesPerInfoSeries
+}
+
 func (v validateLabelsCfg) MaxLabelNameLength(_ string) int {
 	return v.maxLabelNameLength
 }
@@ -64,6 +69,7 @@ func TestValidateLabels(t *testing.T) {
 	cfg.maxLabelValueLength = 25
 	cfg.maxLabelNameLength = 25
 	cfg.maxLabelNamesPerSeries = 2
+	cfg.maxLabelNamesPerInfoSeries = 3
 
 	for _, c := range []struct {
 		metric                   model.Metric
@@ -157,6 +163,31 @@ func TestValidateLabels(t *testing.T) {
 				)...,
 			),
 		},
+		{
+			// *_info metrics have higher label limits.
+			metric:                   map[model.LabelName]model.LabelValue{model.MetricNameLabel: "foo_info", "bar": "baz", "blip": "blop"},
+			skipLabelNameValidation:  false,
+			skipLabelCountValidation: false,
+			err:                      nil,
+		},
+		{
+			// *_info metrics have higher label limits.
+			metric:                   map[model.LabelName]model.LabelValue{model.MetricNameLabel: "foo_info", "bar": "baz", "blip": "blop", "blap": "blup"},
+			skipLabelNameValidation:  false,
+			skipLabelCountValidation: false,
+			err: fmt.Errorf(
+				tooManyInfoLabelsMsgFormat,
+				tooManyLabelsArgs(
+					[]mimirpb.LabelAdapter{
+						{Name: model.MetricNameLabel, Value: "foo_info"},
+						{Name: "bar", Value: "baz"},
+						{Name: "blip", Value: "blop"},
+						{Name: "blap", Value: "blup"},
+					},
+					3,
+				)...,
+			),
+		},
 		{
 			metric:                   map[model.LabelName]model.LabelValue{model.MetricNameLabel: "foo", "bar": "baz", "blip": "blop"},
 			skipLabelNameValidation:  false,
@@ -206,6 +237,7 @@ func TestValidateLabels(t *testing.T) {
 			cortex_discarded_samples_total{group="custom label",reason="label_value_invalid",user="testUser"} 1
 			cortex_discarded_samples_total{group="custom label",reason="label_value_too_long",user="testUser"} 1
 			cortex_discarded_samples_total{group="custom label",reason="max_label_names_per_series",user="testUser"} 1
+			cortex_discarded_samples_total{group="custom label",reason="max_label_names_per_info_series",user="testUser"} 1
 			cortex_discarded_samples_total{group="custom label",reason="metric_name_invalid",user="testUser"} 2
 			cortex_discarded_samples_total{group="custom label",reason="missing_metric_name",user="testUser"} 1
 			cortex_discarded_samples_total{group="custom label",reason="random reason",user="different user"} 1
diff --git a/pkg/util/globalerror/user.go b/pkg/util/globalerror/user.go
index 8a378e5b3b3..68dd3ef1040 100644
--- a/pkg/util/globalerror/user.go
+++ b/pkg/util/globalerror/user.go
@@ -17,6 +17,7 @@ const (
 	MissingMetricName                     ID = "missing-metric-name"
 	InvalidMetricName                     ID = "metric-name-invalid"
 	MaxLabelNamesPerSeries                ID = "max-label-names-per-series"
+	MaxLabelNamesPerInfoSeries            ID = "max-label-names-per-info-series"
 	MaxNativeHistogramBuckets             ID = "max-native-histogram-buckets"
 	NotReducibleNativeHistogram           ID = "not-reducible-native-histogram"
 	InvalidSchemaNativeHistogram          ID = "invalid-native-histogram-schema"
diff --git a/pkg/util/validation/limits.go b/pkg/util/validation/limits.go
index 5751e1b3551..09b212ff2fa 100644
--- a/pkg/util/validation/limits.go
+++ b/pkg/util/validation/limits.go
@@ -39,6 +39,7 @@ const (
 	MaxEstimatedChunksPerQueryMultiplierFlag  = "querier.max-estimated-fetched-chunks-per-query-multiplier"
 	MaxEstimatedMemoryConsumptionPerQueryFlag = "querier.max-estimated-memory-consumption-per-query"
 	MaxLabelNamesPerSeriesFlag                = "validation.max-label-names-per-series"
+	MaxLabelNamesPerInfoSeriesFlag            = "validation.max-label-names-per-info-series"
 	MaxLabelNameLengthFlag                    = "validation.max-length-label-name"
 	MaxLabelValueLengthFlag                   = "validation.max-length-label-value"
 	MaxMetadataLengthFlag                     = "validation.max-metadata-length"
@@ -113,6 +114,7 @@ type Limits struct {
 	MaxLabelNameLength                          int                 `yaml:"max_label_name_length" json:"max_label_name_length"`
 	MaxLabelValueLength                         int                 `yaml:"max_label_value_length" json:"max_label_value_length"`
 	MaxLabelNamesPerSeries                      int                 `yaml:"max_label_names_per_series" json:"max_label_names_per_series"`
+	MaxLabelNamesPerInfoSeries                  int                 `yaml:"max_label_names_per_info_series" json:"max_label_names_per_info_series"`
 	MaxMetadataLength                           int                 `yaml:"max_metadata_length" json:"max_metadata_length"`
 	MaxNativeHistogramBuckets                   int                 `yaml:"max_native_histogram_buckets" json:"max_native_histogram_buckets"`
 	MaxExemplarsPerSeriesPerRequest             int                 `yaml:"max_exemplars_per_series_per_request" json:"max_exemplars_per_series_per_request" category:"experimental"`
@@ -264,6 +266,7 @@ func (l *Limits) RegisterFlags(f *flag.FlagSet) {
 	f.IntVar(&l.MaxLabelNameLength, MaxLabelNameLengthFlag, 1024, "Maximum length accepted for label names")
 	f.IntVar(&l.MaxLabelValueLength, MaxLabelValueLengthFlag, 2048, "Maximum length accepted for label value. This setting also applies to the metric name")
 	f.IntVar(&l.MaxLabelNamesPerSeries, MaxLabelNamesPerSeriesFlag, 30, "Maximum number of label names per series.")
+	f.IntVar(&l.MaxLabelNamesPerInfoSeries, MaxLabelNamesPerInfoSeriesFlag, 80, "Maximum number of label names per info series. Has no effect if less than the value of the maximum number of label names per series option (-"+MaxLabelNamesPerSeriesFlag+")")
 	f.IntVar(&l.MaxMetadataLength, MaxMetadataLengthFlag, 1024, "Maximum length accepted for metric metadata. Metadata refers to Metric Name, HELP and UNIT. Longer metadata is dropped except for HELP which is truncated.")
 	f.IntVar(&l.MaxNativeHistogramBuckets, maxNativeHistogramBucketsFlag, 0, "Maximum number of buckets per native histogram sample. 0 to disable the limit.")
 	f.IntVar(&l.MaxExemplarsPerSeriesPerRequest, "distributor.max-exemplars-per-series-per-request", 0, "Maximum number of exemplars per series per request. 0 to disable limit in request. The exceeding exemplars are dropped.")
@@ -596,6 +599,11 @@ func (o *Overrides) MaxLabelNamesPerSeries(userID string) int {
 	return o.getOverridesForUser(userID).MaxLabelNamesPerSeries
 }
 
+// MaxLabelNamesPerInfoSeries returns the maximum number of label names allowed per info series.
+func (o *Overrides) MaxLabelNamesPerInfoSeries(userID string) int {
+	return o.getOverridesForUser(userID).MaxLabelNamesPerInfoSeries
+}
+
 // MaxMetadataLength returns maximum length metadata can be. Metadata refers
 // to the Metric Name, HELP and UNIT.
 func (o *Overrides) MaxMetadataLength(userID string) int {