diff --git a/cmd/mimir/config-descriptor.json b/cmd/mimir/config-descriptor.json index 4e2d6cb681a..c804628d160 100644 --- a/cmd/mimir/config-descriptor.json +++ b/cmd/mimir/config-descriptor.json @@ -4379,6 +4379,50 @@ "fieldType": "int", "fieldCategory": "experimental" }, + { + "kind": "field", + "name": "cost_attribution_labels", + "required": false, + "desc": "List of labels used to define cost attribution. These labels will be included in the specified distributor and ingester metrics for each write request, allowing them to be distinguished by the label. The label applies to the following metrics: cortex_distributor_attributed_received_samples_total, cortex_ingester_attributed_active_series, and cortex_attributed_discarded_samples_total. Set to an empty string to disable cost attribution.", + "fieldValue": null, + "fieldDefaultValue": "", + "fieldFlag": "validation.cost-attribution-labels", + "fieldType": "string", + "fieldCategory": "experimental" + }, + { + "kind": "field", + "name": "max_cost_attribution_labels_per_user", + "required": false, + "desc": "Maximum number of cost attribution labels allowed per user. Set to 0 to disable.", + "fieldValue": null, + "fieldDefaultValue": 2, + "fieldFlag": "validation.max-cost-attribution-labels-per-user", + "fieldType": "int", + "fieldCategory": "experimental" + }, + { + "kind": "field", + "name": "max_cost_attribution_cardinality_per_user", + "required": false, + "desc": "Maximum cardinality of cost attribution labels allowed per user.", + "fieldValue": null, + "fieldDefaultValue": 10000, + "fieldFlag": "validation.max-cost-attribution-cardinality-per-user", + "fieldType": "int", + "fieldCategory": "experimental" + }, + { + "kind": "field", + "name": "cost_attribution_cooldown", + "required": false, + "desc": "Cooldown period for cost attribution labels. Specifies the duration the cost attribution remains in overflow before attempting a reset. If the cardinality remains above the limit after this period, the system will stay in overflow mode and extend the cooldown. Setting this value to 0 disables the cooldown, causing the system to continuously check whether the cardinality has dropped below the limit. A reset will occur once the cardinality falls below the limit.", + "fieldValue": null, + "fieldDefaultValue": 0, + "fieldFlag": "validation.cost-attribution-cooldown", + "fieldType": "duration", + "fieldCategory": "experimental" + }, { "kind": "field", "name": "ruler_evaluation_delay_duration", @@ -19638,6 +19682,28 @@ "fieldFlag": "timeseries-unmarshal-caching-optimization-enabled", "fieldType": "boolean", "fieldCategory": "experimental" + }, + { + "kind": "field", + "name": "cost_attribution_eviction_interval", + "required": false, + "desc": "Time interval at which inactive cost attributions are evicted from the counter, ensuring they are not included in the cost attribution cardinality per user limit.", + "fieldValue": null, + "fieldDefaultValue": 1200000000000, + "fieldFlag": "cost-attribution.eviction-interval", + "fieldType": "duration", + "fieldCategory": "experimental" + }, + { + "kind": "field", + "name": "cost_attribution_registry_path", + "required": false, + "desc": "Defines a custom path for the registry. 
When specified, Mimir will expose cost attribution metrics through this custom path, if not specified, cost attribution metrics won't be exposed.", + "fieldValue": null, + "fieldDefaultValue": "", + "fieldFlag": "cost-attribution.registry-path", + "fieldType": "string", + "fieldCategory": "experimental" } ], "fieldValue": null, diff --git a/cmd/mimir/help-all.txt.tmpl b/cmd/mimir/help-all.txt.tmpl index 40dd0d5d3aa..5823330a073 100644 --- a/cmd/mimir/help-all.txt.tmpl +++ b/cmd/mimir/help-all.txt.tmpl @@ -1285,6 +1285,10 @@ Usage of ./cmd/mimir/mimir: Expands ${var} or $var in config according to the values of the environment variables. -config.file value Configuration file to load. + -cost-attribution.eviction-interval duration + [experimental] Time interval at which inactive cost attributions are evicted from the counter, ensuring they are not included in the cost attribution cardinality per user limit. (default 20m0s) + -cost-attribution.registry-path string + [experimental] Defines a custom path for the registry. When specified, Mimir will expose cost attribution metrics through this custom path, if not specified, cost attribution metrics won't be exposed. -debug.block-profile-rate int Fraction of goroutine blocking events that are reported in the blocking profile. 1 to include every blocking event in the profile, 0 to disable. -debug.mutex-profile-fraction int @@ -3317,10 +3321,18 @@ Usage of ./cmd/mimir/mimir: Enable anonymous usage reporting. (default true) -usage-stats.installation-mode string Installation mode. Supported values: custom, helm, jsonnet. (default "custom") + -validation.cost-attribution-cooldown duration + [experimental] Cooldown period for cost attribution labels. Specifies the duration the cost attribution remains in overflow before attempting a reset. If the cardinality remains above the limit after this period, the system will stay in overflow mode and extend the cooldown. Setting this value to 0 disables the cooldown, causing the system to continuously check whether the cardinality has dropped below the limit. A reset will occur once the cardinality falls below the limit. + -validation.cost-attribution-labels comma-separated-list-of-strings + [experimental] List of labels used to define cost attribution. These labels will be included in the specified distributor and ingester metrics for each write request, allowing them to be distinguished by the label. The label applies to the following metrics: cortex_distributor_attributed_received_samples_total, cortex_ingester_attributed_active_series, and cortex_attributed_discarded_samples_total. Set to an empty string to disable cost attribution. -validation.create-grace-period duration Controls how far into the future incoming samples and exemplars are accepted compared to the wall clock. Any sample or exemplar will be rejected if its timestamp is greater than '(now + creation_grace_period)'. This configuration is enforced in the distributor and ingester. (default 10m) -validation.enforce-metadata-metric-name Enforce every metadata has a metric name. (default true) + -validation.max-cost-attribution-cardinality-per-user int + [experimental] Maximum cardinality of cost attribution labels allowed per user. (default 10000) + -validation.max-cost-attribution-labels-per-user int + [experimental] Maximum number of cost attribution labels allowed per user. Set to 0 to disable. (default 2) -validation.max-label-names-per-info-series int Maximum number of label names per info series. 
Has no effect if less than the value of the maximum number of label names per series option (-validation.max-label-names-per-series) (default 80) -validation.max-label-names-per-series int diff --git a/development/mimir-microservices-mode/config/mimir.yaml b/development/mimir-microservices-mode/config/mimir.yaml index 5d245999115..31702611891 100644 --- a/development/mimir-microservices-mode/config/mimir.yaml +++ b/development/mimir-microservices-mode/config/mimir.yaml @@ -1,4 +1,6 @@ multitenancy_enabled: false +cost_attribution_registry_path: "/usage-metrics" +cost_attribution_eviction_interval: 10m distributor: ha_tracker: @@ -184,5 +186,10 @@ limits: ha_replica_label: ha_replica ha_max_clusters: 10 + cost_attribution_labels: "container" + max_cost_attribution_labels_per_user: 2 + max_cost_attribution_cardinality_per_user: 100 + cost_attribution_cooldown: 20m + runtime_config: - file: ./config/runtime.yaml + file: ./config/runtime.yaml \ No newline at end of file diff --git a/docs/sources/mimir/configure/configuration-parameters/index.md b/docs/sources/mimir/configure/configuration-parameters/index.md index bfd77794a8a..26463296cfb 100644 --- a/docs/sources/mimir/configure/configuration-parameters/index.md +++ b/docs/sources/mimir/configure/configuration-parameters/index.md @@ -461,6 +461,18 @@ overrides_exporter: # (experimental) Enables optimized marshaling of timeseries. # CLI flag: -timeseries-unmarshal-caching-optimization-enabled [timeseries_unmarshal_caching_optimization_enabled: | default = true] + +# (experimental) Time interval at which inactive cost attributions are evicted +# from the counter, ensuring they are not included in the cost attribution +# cardinality per user limit. +# CLI flag: -cost-attribution.eviction-interval +[cost_attribution_eviction_interval: | default = 20m] + +# (experimental) Defines a custom path for the registry. When specified, Mimir +# will expose cost attribution metrics through this custom path, if not +# specified, cost attribution metrics won't be exposed. +# CLI flag: -cost-attribution.registry-path +[cost_attribution_registry_path: | default = ""] ``` ### common @@ -3572,6 +3584,37 @@ The `limits` block configures default and per-tenant limits imposed by component # CLI flag: -querier.active-series-results-max-size-bytes [active_series_results_max_size_bytes: | default = 419430400] +# (experimental) List of labels used to define cost attribution. These labels +# will be included in the specified distributor and ingester metrics for each +# write request, allowing them to be distinguished by the label. The label +# applies to the following metrics: +# cortex_distributor_attributed_received_samples_total, +# cortex_ingester_attributed_active_series, and +# cortex_attributed_discarded_samples_total. Set to an empty string to disable +# cost attribution. +# CLI flag: -validation.cost-attribution-labels +[cost_attribution_labels: | default = ""] + +# (experimental) Maximum number of cost attribution labels allowed per user. Set +# to 0 to disable. +# CLI flag: -validation.max-cost-attribution-labels-per-user +[max_cost_attribution_labels_per_user: | default = 2] + +# (experimental) Maximum cardinality of cost attribution labels allowed per +# user. +# CLI flag: -validation.max-cost-attribution-cardinality-per-user +[max_cost_attribution_cardinality_per_user: | default = 10000] + +# (experimental) Cooldown period for cost attribution labels. Specifies the +# duration the cost attribution remains in overflow before attempting a reset. 
+# If the cardinality remains above the limit after this period, the system will +# stay in overflow mode and extend the cooldown. Setting this value to 0 +# disables the cooldown, causing the system to continuously check whether the +# cardinality has dropped below the limit. A reset will occur once the +# cardinality falls below the limit. +# CLI flag: -validation.cost-attribution-cooldown +[cost_attribution_cooldown: | default = 0s] + # Duration to delay the evaluation of rules to ensure the underlying metrics # have been pushed. # CLI flag: -ruler.evaluation-delay-duration diff --git a/pkg/api/api.go b/pkg/api/api.go index a0592f74f15..e78b1e535f9 100644 --- a/pkg/api/api.go +++ b/pkg/api/api.go @@ -20,6 +20,7 @@ import ( "github.com/grafana/dskit/middleware" "github.com/grafana/dskit/server" "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/promhttp" "github.com/grafana/mimir/pkg/alertmanager" "github.com/grafana/mimir/pkg/alertmanager/alertmanagerpb" @@ -284,6 +285,11 @@ func (a *API) RegisterDistributor(d *distributor.Distributor, pushConfig distrib a.RegisterRoute("/distributor/ha_tracker", d.HATracker, false, true, "GET") } +// RegisterUsageMetricsRoute registers a Prometheus HTTP handler for the custom registry. +func (a *API) RegisterUsageMetricsRoute(customRegistryPath string, reg *prometheus.Registry) { + a.RegisterRoute(customRegistryPath, promhttp.HandlerFor(reg, promhttp.HandlerOpts{}), false, false, "GET") +} + // Ingester is defined as an interface to allow for alternative implementations // of ingesters to be passed into the API.RegisterIngester() method. type Ingester interface { diff --git a/pkg/blockbuilder/tsdb.go b/pkg/blockbuilder/tsdb.go index d8c1376230d..40087fbbc74 100644 --- a/pkg/blockbuilder/tsdb.go +++ b/pkg/blockbuilder/tsdb.go @@ -50,7 +50,7 @@ type TSDBBuilder struct { var softErrProcessor = mimir_storage.NewSoftAppendErrorProcessor( func() {}, func(int64, []mimirpb.LabelAdapter) {}, func(int64, []mimirpb.LabelAdapter) {}, func(int64, []mimirpb.LabelAdapter) {}, func(int64, []mimirpb.LabelAdapter) {}, func(int64, []mimirpb.LabelAdapter) {}, - func() {}, func([]mimirpb.LabelAdapter) {}, func(error, int64, []mimirpb.LabelAdapter) {}, + func([]mimirpb.LabelAdapter) {}, func([]mimirpb.LabelAdapter) {}, func(error, int64, []mimirpb.LabelAdapter) {}, func(error, int64, []mimirpb.LabelAdapter) {}, func(error, int64, []mimirpb.LabelAdapter) {}, func(error, int64, []mimirpb.LabelAdapter) {}, func(error, int64, []mimirpb.LabelAdapter) {}, func(error, int64, []mimirpb.LabelAdapter) {}, ) diff --git a/pkg/costattribution/manager.go b/pkg/costattribution/manager.go new file mode 100644 index 00000000000..9bb293e20a2 --- /dev/null +++ b/pkg/costattribution/manager.go @@ -0,0 +1,234 @@ +// SPDX-License-Identifier: AGPL-3.0-only + +package costattribution + +import ( + "context" + "sort" + "sync" + "time" + + "github.com/go-kit/log" + "github.com/grafana/dskit/services" + "github.com/prometheus/client_golang/prometheus" + + "github.com/grafana/mimir/pkg/util/validation" +) + +const ( + missingValue = "__missing__" + overflowValue = "__overflow__" +) + +type Manager struct { + services.Service + logger log.Logger + inactiveTimeout time.Duration + limits *validation.Overrides + + // mu protects the trackersByUserID map + mtx sync.RWMutex + trackersByUserID map[string]*Tracker + reg *prometheus.Registry + cleanupInterval time.Duration + metricsExportInterval time.Duration +} + +// NewManager creates a new cost attribution 
manager, which is responsible for managing the cost attribution of series.
+// It periodically evicts inactive attributions and exports the attribution metrics at the configured cleanup and export intervals.
+func NewManager(cleanupInterval, exportInterval, inactiveTimeout time.Duration, logger log.Logger, limits *validation.Overrides, reg *prometheus.Registry) (*Manager, error) {
+	m := &Manager{
+		trackersByUserID:      make(map[string]*Tracker),
+		limits:                limits,
+		mtx:                   sync.RWMutex{},
+		inactiveTimeout:       inactiveTimeout,
+		logger:                logger,
+		reg:                   reg,
+		cleanupInterval:       cleanupInterval,
+		metricsExportInterval: exportInterval,
+	}
+
+	m.Service = services.NewBasicService(nil, m.running, nil).WithName("cost attribution manager")
+	if err := reg.Register(m); err != nil {
+		return nil, err
+	}
+	return m, nil
+}
+
+func (m *Manager) running(ctx context.Context) error {
+	if m == nil {
+		return nil
+	}
+	t := time.NewTicker(m.cleanupInterval)
+	defer t.Stop()
+
+	tMupdate := time.NewTicker(m.metricsExportInterval)
+	defer tMupdate.Stop()
+
+	for {
+		select {
+		case <-t.C:
+			err := m.purgeInactiveAttributionsUntil(time.Now().Add(-m.inactiveTimeout).Unix())
+			if err != nil {
+				return err
+			}
+		case <-tMupdate.C:
+			m.updateMetrics()
+		case <-ctx.Done():
+			return nil
+		}
+	}
+}
+
+// EnabledForUser returns true if cost attribution is enabled for the user.
+func (m *Manager) EnabledForUser(userID string) bool {
+	if m == nil {
+		return false
+	}
+	return len(m.limits.CostAttributionLabels(userID)) > 0
+}
+
+func (m *Manager) TrackerForUser(userID string) *Tracker {
+	// if the manager is not initialized or cost attribution is not enabled, return nil
+	if m == nil || !m.EnabledForUser(userID) {
+		return nil
+	}
+
+	m.mtx.Lock()
+	defer m.mtx.Unlock()
+
+	// if no tracker exists yet for this user, create one
+	if _, exists := m.trackersByUserID[userID]; !exists {
+		m.trackersByUserID[userID], _ = newTracker(userID, m.limits.CostAttributionLabels(userID), m.limits.MaxCostAttributionCardinalityPerUser(userID), m.limits.CostAttributionCooldown(userID), m.logger)
+	}
+	return m.trackersByUserID[userID]
+}
+
+func (m *Manager) Collect(out chan<- prometheus.Metric) {
+	if m == nil {
+		return
+	}
+	m.mtx.RLock()
+	defer m.mtx.RUnlock()
+	for _, tracker := range m.trackersByUserID {
+		tracker.Collect(out)
+	}
+}
+
+// Describe implements prometheus.Collector.
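+// It intentionally sends no descriptions: the Manager acts as an unchecked collector so that per-tenant series can appear and disappear as trackers are created and removed.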
+func (m *Manager) Describe(chan<- *prometheus.Desc) {
+	// this is an unchecked collector
+}
+
+// deleteUserTracker deletes the tracker of a user, for example once cost attribution has been disabled for that user.
+func (m *Manager) deleteUserTracker(userID string) {
+	if m == nil {
+		return
+	}
+	m.mtx.Lock()
+	defer m.mtx.Unlock()
+	if _, exists := m.trackersByUserID[userID]; exists {
+		// clean up tracker metrics and delete the tracker
+		m.trackersByUserID[userID].cleanupTracker()
+		delete(m.trackersByUserID, userID)
+	}
+}
+
+func (m *Manager) purgeInactiveAttributionsUntil(deadline int64) error {
+	if m == nil {
+		return nil
+	}
+	// Get all userIDs from the map
+	m.mtx.RLock()
+	userIDs := make([]string, 0, len(m.trackersByUserID))
+	for userID := range m.trackersByUserID {
+		userIDs = append(userIDs, userID)
+	}
+	m.mtx.RUnlock()
+
+	// Iterate over all userIDs and purge inactive attributions of each user
+	for _, userID := range userIDs {
+		// if cost attribution is not enabled for the user, delete the user tracker and continue
+		if !m.EnabledForUser(userID) {
+			m.deleteUserTracker(userID)
+			continue
+		}
+
+		// get all inactive attributions for the user and clean up the tracker
+		invalidKeys := m.getInactiveObservationsForUser(userID, deadline)
+
+		cat := m.TrackerForUser(userID)
+		for _, key := range invalidKeys {
+			cat.cleanupTrackerAttribution(key)
+		}
+
+		// if the tracker is in overflow state and its cooldown has expired, delete the tracker when the cardinality
+		// has dropped below the limit so that it can be recreated, otherwise extend the cooldown
+		if cat != nil && cat.cooldownUntil != nil && cat.cooldownUntil.Load() < deadline {
+			if len(cat.observed) <= cat.MaxCardinality() {
+				m.deleteUserTracker(userID)
+			} else {
+				cat.cooldownUntil.Store(deadline + cat.cooldownDuration)
+			}
+		}
+	}
+	return nil
+}
+
+// CompareCALabels compares two sorted string slices and returns true if they are equal, otherwise false.
+func CompareCALabels(a, b []string) bool {
+	if len(a) != len(b) {
+		return false
+	}
+	for i, v := range a {
+		if v != b[i] {
+			return false
+		}
+	}
+	return true
+}
+
+func (m *Manager) getInactiveObservationsForUser(userID string, deadline int64) []string {
+	cat := m.TrackerForUser(userID)
+	if cat == nil {
+		return nil
+	}
+
+	newTrackedLabels := m.limits.CostAttributionLabels(userID)
+	sort.Slice(newTrackedLabels, func(i, j int) bool {
+		return newTrackedLabels[i] < newTrackedLabels[j]
+	})
+
+	// if the tracked labels have changed, simply reinitialize the tracker for the user
+	if !CompareCALabels(cat.CALabels(), newTrackedLabels) {
+		m.mtx.Lock()
+		m.trackersByUserID[userID], _ = newTracker(userID, m.limits.CostAttributionLabels(userID), m.limits.MaxCostAttributionCardinalityPerUser(userID), m.limits.CostAttributionCooldown(userID), m.logger)
+		// continue with the newly created tracker
+		cat = m.trackersByUserID[userID]
+		m.mtx.Unlock()
+	} else {
+		maxCardinality := m.limits.MaxCostAttributionCardinalityPerUser(userID)
+		if cat.MaxCardinality() != maxCardinality {
+			cat.UpdateMaxCardinality(maxCardinality)
+		}
+
+		cooldown := int64(m.limits.CostAttributionCooldown(userID).Seconds())
+		if cooldown != cat.CooldownDuration() {
+			cat.UpdateCooldownDuration(cooldown)
+		}
+	}
+
+	return cat.GetInactiveObservations(deadline)
+}
+
+func (m *Manager) updateMetrics() {
+	if m == nil {
+		return
+	}
+
+	m.mtx.RLock()
+	defer m.mtx.RUnlock()
+	for _, tracker := range m.trackersByUserID {
+		tracker.updateMetrics()
+	}
+}
diff --git a/pkg/costattribution/manager_test.go b/pkg/costattribution/manager_test.go
new file mode 100644
index 00000000000..b054680d960
--- /dev/null
+++
b/pkg/costattribution/manager_test.go @@ -0,0 +1,222 @@ +// SPDX-License-Identifier: AGPL-3.0-only + +package costattribution + +import ( + "strings" + "testing" + "time" + + "github.com/go-kit/log" + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/testutil" + "github.com/prometheus/prometheus/model/labels" + "github.com/stretchr/testify/assert" + + "github.com/grafana/mimir/pkg/util/validation" +) + +func getMockLimits(idx int) (*validation.Overrides, error) { + baseLimits := map[string]*validation.Limits{ + "user1": {MaxCostAttributionCardinalityPerUser: 5, CostAttributionLabels: []string{"team"}}, + "user2": {MaxCostAttributionCardinalityPerUser: 2, CostAttributionLabels: []string{}}, + "user3": {MaxCostAttributionCardinalityPerUser: 2, CostAttributionLabels: []string{"department", "service"}}, + "user4": {MaxCostAttributionCardinalityPerUser: 5, CostAttributionLabels: []string{"platform"}}, + } + + switch idx { + case 1: + baseLimits["user1"].CostAttributionLabels = []string{} + case 2: + baseLimits["user3"].CostAttributionLabels = []string{"team", "feature"} + case 3: + baseLimits["user3"].MaxCostAttributionCardinalityPerUser = 3 + case 4: + baseLimits["user1"].MaxCostAttributionCardinalityPerUser = 2 + case 5: + baseLimits["user1"].CostAttributionLabels = []string{"department"} + } + + return validation.NewOverrides(validation.Limits{}, validation.NewMockTenantLimits(baseLimits)) +} + +func newTestManager() *Manager { + logger := log.NewNopLogger() + limits, _ := getMockLimits(0) + reg := prometheus.NewRegistry() + manager, err := NewManager(5*time.Second, time.Second, 10*time.Second, logger, limits, reg) + if err != nil { + panic(err) + } + return manager +} + +func Test_NewManager(t *testing.T) { + manager := newTestManager() + assert.NotNil(t, manager) + assert.NotNil(t, manager.trackersByUserID) + assert.Equal(t, 10*time.Second, manager.inactiveTimeout) +} + +func Test_EnabledForUser(t *testing.T) { + manager := newTestManager() + assert.True(t, manager.EnabledForUser("user1"), "Expected cost attribution to be enabled for user1") + assert.False(t, manager.EnabledForUser("user2"), "Expected cost attribution to be disabled for user2") + assert.False(t, manager.EnabledForUser("user5"), "Expected cost attribution to be disabled for user5") +} + +func Test_CreateDeleteTracker(t *testing.T) { + manager := newTestManager() + + t.Run("Tracker existence and attributes", func(t *testing.T) { + user1Tracker := manager.TrackerForUser("user1") + assert.NotNil(t, user1Tracker) + assert.Equal(t, []string{"team"}, user1Tracker.CALabels()) + assert.Equal(t, 5, user1Tracker.MaxCardinality()) + + assert.Nil(t, manager.TrackerForUser("user2")) + + user3Tracker := manager.TrackerForUser("user3") + assert.NotNil(t, user3Tracker) + assert.Equal(t, []string{"department", "service"}, user3Tracker.CALabels()) + assert.Equal(t, 2, user3Tracker.MaxCardinality()) + }) + + t.Run("Metrics tracking", func(t *testing.T) { + manager.TrackerForUser("user1").IncrementDiscardedSamples(labels.FromStrings("team", "foo"), 1, "invalid-metrics-name", time.Unix(12, 0)) + manager.TrackerForUser("user3").IncrementReceivedSamples(labels.FromStrings("department", "foo", "service", "dodo"), 1, time.Unix(20, 0)) + + expectedMetrics := ` + # HELP cortex_discarded_attributed_samples_total The total number of samples that were discarded per attribution. 
+ # TYPE cortex_discarded_attributed_samples_total counter + cortex_discarded_attributed_samples_total{reason="invalid-metrics-name",team="foo",tenant="user1",tracker="cost-attribution"} 1 + # HELP cortex_received_attributed_samples_total The total number of samples that were received per attribution. + # TYPE cortex_received_attributed_samples_total counter + cortex_received_attributed_samples_total{department="foo",service="dodo",tenant="user3",tracker="cost-attribution"} 1 + ` + + // manually trigger metrics update to ensure they are exported + manager.updateMetrics() + assert.NoError(t, testutil.GatherAndCompare(manager.reg, strings.NewReader(expectedMetrics), "cortex_discarded_attributed_samples_total", "cortex_received_attributed_samples_total")) + }) + + t.Run("Purge inactive attributions", func(t *testing.T) { + manager.purgeInactiveAttributionsUntil(time.Unix(10, 0).Unix()) + expectedMetrics := ` + # HELP cortex_discarded_attributed_samples_total The total number of samples that were discarded per attribution. + # TYPE cortex_discarded_attributed_samples_total counter + cortex_discarded_attributed_samples_total{reason="invalid-metrics-name",team="foo",tenant="user1",tracker="cost-attribution"} 1 + ` + manager.updateMetrics() + assert.NoError(t, testutil.GatherAndCompare(manager.reg, strings.NewReader(expectedMetrics), "cortex_discarded_attributed_samples_total")) + }) + + t.Run("Disabling user cost attribution", func(t *testing.T) { + manager.limits, _ = getMockLimits(1) + manager.purgeInactiveAttributionsUntil(time.Unix(11, 0).Unix()) + assert.Equal(t, 1, len(manager.trackersByUserID)) + + expectedMetrics := ` + # HELP cortex_received_attributed_samples_total The total number of samples that were received per attribution. + # TYPE cortex_received_attributed_samples_total counter + cortex_received_attributed_samples_total{department="foo",service="dodo",tenant="user3",tracker="cost-attribution"} 1 + ` + manager.updateMetrics() + assert.NoError(t, testutil.GatherAndCompare(manager.reg, strings.NewReader(expectedMetrics), "cortex_received_attributed_samples_total")) + }) + + t.Run("Updating user cardinality and labels", func(t *testing.T) { + manager.limits, _ = getMockLimits(2) + manager.purgeInactiveAttributionsUntil(time.Unix(12, 0).Unix()) + // user3 tracker should be recreated with cost attribution labels changed to ["team", "feature"] + assert.Equal(t, 1, len(manager.trackersByUserID)) + assert.Equal(t, []string{"feature", "team"}, manager.TrackerForUser("user3").CALabels()) + + manager.TrackerForUser("user3").IncrementDiscardedSamples(labels.FromStrings("team", "foo"), 1, "invalid-metrics-name", time.Unix(13, 0)) + expectedMetrics := ` + # HELP cortex_discarded_attributed_samples_total The total number of samples that were discarded per attribution. 
+ # TYPE cortex_discarded_attributed_samples_total counter + cortex_discarded_attributed_samples_total{feature="__missing__",reason="invalid-metrics-name",team="foo",tenant="user3",tracker="cost-attribution"} 1 + ` + manager.updateMetrics() + assert.NoError(t, testutil.GatherAndCompare(manager.reg, strings.NewReader(expectedMetrics), "cortex_discarded_attributed_samples_total")) + }) + + t.Run("When cost attribution get overflowed, all metrics are purged except overflow metrics", func(t *testing.T) { + // user3 has maximum cardinality of 2, so adding 3rd attribution should trigger overflow + manager.TrackerForUser("user3").IncrementReceivedSamples(labels.FromStrings("team", "bar", "feature", "bar"), 1, time.Unix(15, 0)) + manager.TrackerForUser("user3").IncrementReceivedSamples(labels.FromStrings("team", "baz", "feature", "baz"), 1, time.Unix(16, 0)) + manager.TrackerForUser("user3").IncrementReceivedSamples(labels.FromStrings("team", "foo", "feature", "foo"), 1, time.Unix(17, 0)) + expectedMetrics := ` + # HELP cortex_received_attributed_samples_total The total number of samples that were received per attribution. + # TYPE cortex_received_attributed_samples_total counter + cortex_received_attributed_samples_total{feature="__overflow__",team="__overflow__",tenant="user3",tracker="cost-attribution"} 1 + ` + manager.updateMetrics() + assert.NoError(t, testutil.GatherAndCompare(manager.reg, strings.NewReader(expectedMetrics), "cortex_received_attributed_samples_total")) + }) +} + +func Test_PurgeInactiveAttributionsUntil(t *testing.T) { + manager := newTestManager() + + // Simulate metrics for multiple users to set up initial state + manager.TrackerForUser("user1").IncrementReceivedSamples(labels.FromStrings("team", "foo"), 1, time.Unix(1, 0)) + manager.TrackerForUser("user1").IncrementDiscardedSamples(labels.FromStrings("team", "foo"), 1, "invalid-metrics-name", time.Unix(1, 0)) + manager.TrackerForUser("user3").IncrementDiscardedSamples(labels.FromStrings("department", "foo", "service", "bar"), 1, "out-of-window", time.Unix(10, 0)) + + manager.updateMetrics() + t.Run("Purge before inactive timeout", func(t *testing.T) { + // Run purge at a timestamp that doesn't exceed inactive timeout + manager.purgeInactiveAttributionsUntil(time.Unix(0, 0).Unix()) + + // No purging should have occurred, track user metrics remain + assert.Equal(t, 2, len(manager.trackersByUserID), "Expected trackers to remain active before timeout") + + expectedMetrics := ` + # HELP cortex_discarded_attributed_samples_total The total number of samples that were discarded per attribution. 
+ # TYPE cortex_discarded_attributed_samples_total counter + cortex_discarded_attributed_samples_total{reason="invalid-metrics-name",team="foo",tenant="user1",tracker="cost-attribution"} 1 + cortex_discarded_attributed_samples_total{department="foo",reason="out-of-window",service="bar",tenant="user3",tracker="cost-attribution"} 1 + ` + metricNames := []string{ + "cortex_discarded_attributed_samples_total", + } + assert.NoError(t, testutil.GatherAndCompare(manager.reg, strings.NewReader(expectedMetrics), metricNames...)) + }) + + t.Run("Purge after inactive timeout", func(t *testing.T) { + // disable cost attribution for user1 to test purging + manager.limits, _ = getMockLimits(1) + manager.purgeInactiveAttributionsUntil(time.Unix(5, 0).Unix()) + + // User3's tracker should remain since it's active, user1's tracker should be removed + assert.Equal(t, 1, len(manager.trackersByUserID), "Expected one active tracker after purging") + assert.Nil(t, manager.TrackerForUser("user1"), "Expected user1 tracker to be purged") + + expectedMetrics := ` + # HELP cortex_discarded_attributed_samples_total The total number of samples that were discarded per attribution. + # TYPE cortex_discarded_attributed_samples_total counter + cortex_discarded_attributed_samples_total{department="foo",reason="out-of-window",service="bar",tenant="user3",tracker="cost-attribution"} 1 + ` + metricNames := []string{ + "cortex_discarded_attributed_samples_total", + } + assert.NoError(t, testutil.GatherAndCompare(manager.reg, strings.NewReader(expectedMetrics), metricNames...)) + }) + + t.Run("Purge all trackers", func(t *testing.T) { + // Trigger a purge that should remove all inactive trackers + manager.purgeInactiveAttributionsUntil(time.Unix(20, 0).Unix()) + + // Tracker would stay at 1 since user1's tracker is disabled + assert.Equal(t, 1, len(manager.trackersByUserID), "Expected one active tracker after full purge") + + // No metrics should remain after all purged + metricNames := []string{ + "cortex_discarded_attributed_samples_total", + "cortex_received_attributed_samples_total", + } + assert.NoError(t, testutil.GatherAndCompare(manager.reg, strings.NewReader(""), metricNames...)) + }) +} diff --git a/pkg/costattribution/tracker.go b/pkg/costattribution/tracker.go new file mode 100644 index 00000000000..290bc0d98d3 --- /dev/null +++ b/pkg/costattribution/tracker.go @@ -0,0 +1,355 @@ +// SPDX-License-Identifier: AGPL-3.0-only + +package costattribution + +import ( + "bytes" + "sort" + "strings" + "sync" + "time" + + "github.com/go-kit/log" + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/prometheus/model/labels" + "go.uber.org/atomic" +) + +type Observation struct { + lastUpdate *atomic.Int64 + activeSerie *atomic.Int64 + receivedSample *atomic.Int64 + discardSamplemu sync.RWMutex + discardedSample map[string]*atomic.Int64 +} + +const ( + TrackerLabel = "tracker" + TenantLabel = "tenant" + defaultTrackerName = "cost-attribution" +) + +type Tracker struct { + userID string + caLabels []string + caLabelMap map[string]int + maxCardinality int + activeSeriesPerUserAttribution *prometheus.GaugeVec + receivedSamplesAttribution *prometheus.CounterVec + discardedSampleAttribution *prometheus.CounterVec + + overflowLabels []string + // obseveredMtx protects the observed map + obseveredMtx sync.RWMutex + observed map[string]*Observation + + hashBuffer []byte + isOverflow bool + cooldownUntil *atomic.Int64 + cooldownDuration int64 + logger log.Logger +} + +func newTracker(userID string, trackedLabels 
[]string, limit int, cooldown time.Duration, logger log.Logger) (*Tracker, error) { + // keep tracked labels sorted for consistent metric labels + sort.Slice(trackedLabels, func(i, j int) bool { + return trackedLabels[i] < trackedLabels[j] + }) + caLabelMap := make(map[string]int, len(trackedLabels)) + for i, label := range trackedLabels { + caLabelMap[label] = i + } + m := &Tracker{ + userID: userID, + caLabels: trackedLabels, + caLabelMap: caLabelMap, + maxCardinality: limit, + obseveredMtx: sync.RWMutex{}, + observed: map[string]*Observation{}, + //lint:ignore faillint the metrics are registered in the mimir package + discardedSampleAttribution: prometheus.NewCounterVec(prometheus.CounterOpts{ + Name: "cortex_discarded_attributed_samples_total", + Help: "The total number of samples that were discarded per attribution.", + ConstLabels: prometheus.Labels{TrackerLabel: defaultTrackerName}, + }, append(trackedLabels, TenantLabel, "reason")), + //lint:ignore faillint the metrics are registered in the mimir package + receivedSamplesAttribution: prometheus.NewCounterVec(prometheus.CounterOpts{ + Name: "cortex_received_attributed_samples_total", + Help: "The total number of samples that were received per attribution.", + ConstLabels: prometheus.Labels{TrackerLabel: defaultTrackerName}, + }, append(trackedLabels, TenantLabel)), + //lint:ignore faillint the metrics are registered in the mimir package + activeSeriesPerUserAttribution: prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Name: "cortex_ingester_attributed_active_series", + Help: "The total number of active series per user and attribution.", + ConstLabels: prometheus.Labels{TrackerLabel: defaultTrackerName}, + }, append(trackedLabels, TenantLabel)), + hashBuffer: make([]byte, 0, 1024), + cooldownDuration: int64(cooldown.Seconds()), + logger: logger, + } + + // set overflow label values to export when the tracker is in overflow state + m.overflowLabels = make([]string, len(trackedLabels)+2) + for i := 0; i < len(trackedLabels); i++ { + m.overflowLabels[i] = overflowValue + } + m.overflowLabels[len(trackedLabels)] = userID + m.overflowLabels[len(trackedLabels)+1] = overflowValue + return m, nil +} + +func (t *Tracker) CALabels() []string { + if t == nil { + return nil + } + return t.caLabels +} + +func (t *Tracker) MaxCardinality() int { + if t == nil { + return 0 + } + return t.maxCardinality +} + +func (t *Tracker) CooldownDuration() int64 { + if t == nil { + return 0 + } + return t.cooldownDuration +} + +var bufferPool = sync.Pool{ + New: func() interface{} { + return new(bytes.Buffer) + }, +} + +// sep is used to separate the labels in the key, it is not a valid label caracter +const sep = rune(0x80) + +func (t *Tracker) cleanupTrackerAttribution(key string) { + if t == nil { + return + } + + t.obseveredMtx.Lock() + delete(t.observed, key) + t.obseveredMtx.Unlock() + + vals := strings.Split(key, string(sep)) + vals = append(vals, t.userID) + t.activeSeriesPerUserAttribution.DeleteLabelValues(vals...) + t.receivedSamplesAttribution.DeleteLabelValues(vals...) 
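+	// the key contains exactly the tracked label values, so series of the active series and received samples vectors can be deleted by their exact label values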
+ + // except for discarded sample metrics, there is reason label that is not part of the key, we need to delete all partial matches + filter := prometheus.Labels{} + for i := 0; i < len(t.caLabels); i++ { + filter[t.caLabels[i]] = vals[i] + } + filter[TenantLabel] = t.userID + t.discardedSampleAttribution.DeletePartialMatch(filter) +} + +func (t *Tracker) cleanupTracker() { + if t == nil { + return + } + filter := prometheus.Labels{TenantLabel: t.userID} + t.activeSeriesPerUserAttribution.DeletePartialMatch(filter) + t.receivedSamplesAttribution.DeletePartialMatch(filter) + t.discardedSampleAttribution.DeletePartialMatch(filter) +} + +func (t *Tracker) IncrementActiveSeries(lbs labels.Labels, now time.Time) { + if t == nil { + return + } + t.updateCounters(lbs, now.Unix(), 1, 0, 0, nil) +} + +func (t *Tracker) DecrementActiveSeries(lbs labels.Labels, now time.Time) { + if t == nil { + return + } + t.updateCounters(lbs, now.Unix(), -1, 0, 0, nil) +} + +func (t *Tracker) IncrementDiscardedSamples(lbs labels.Labels, value float64, reason string, now time.Time) { + if t == nil { + return + } + t.updateCounters(lbs, now.Unix(), 0, 0, int64(value), &reason) +} + +func (t *Tracker) IncrementReceivedSamples(lbs labels.Labels, value float64, now time.Time) { + if t == nil { + return + } + t.updateCounters(lbs, now.Unix(), 0, int64(value), 0, nil) +} + +func (t *Tracker) Collect(out chan<- prometheus.Metric) { + if t == nil { + return + } + + t.activeSeriesPerUserAttribution.Collect(out) + t.receivedSamplesAttribution.Collect(out) + t.discardedSampleAttribution.Collect(out) +} + +// Describe implements prometheus.Collector. +func (t *Tracker) Describe(chan<- *prometheus.Desc) { + // this is an unchecked collector + if t == nil { + return + } +} + +func (t *Tracker) updateCounters(lbls labels.Labels, ts int64, activeSeriesIncrement, receviedSampleIncrement, discardedSampleIncrement int64, reason *string) { + if t == nil { + return + } + + labelValues := make([]string, len(t.caLabels)+1) + lbls.Range(func(l labels.Label) { + if idx, ok := t.caLabelMap[l.Name]; ok { + labelValues[idx] = l.Value + } + }) + labelValues[len(labelValues)-1] = t.userID + for i := 0; i < len(labelValues)-1; i++ { + if labelValues[i] == "" { + labelValues[i] = missingValue + } + } + + buf := bufferPool.Get().(*bytes.Buffer) + buf.Reset() + defer bufferPool.Put(buf) + + for i, value := range labelValues[:len(labelValues)-1] { + if i > 0 { + buf.WriteRune(sep) + } + buf.WriteString(value) + } + + t.obseveredMtx.Lock() + defer t.obseveredMtx.Unlock() + + t.updateOverflow(buf.String(), ts, activeSeriesIncrement, receviedSampleIncrement, discardedSampleIncrement, reason) +} + +func (t *Tracker) updateOverflow(stream string, ts int64, activeSeriesIncrement, receviedSampleIncrement, discardedSampleIncrement int64, reason *string) { + if t == nil { + return + } + + if o, known := t.observed[stream]; known && o.lastUpdate != nil { + if o.lastUpdate.Load() < ts { + o.lastUpdate.Store(ts) + } + if activeSeriesIncrement != 0 { + o.activeSerie.Add(activeSeriesIncrement) + } + if receviedSampleIncrement > 0 { + o.receivedSample.Add(receviedSampleIncrement) + } + if discardedSampleIncrement > 0 && reason != nil { + o.discardSamplemu.Lock() + o.discardedSample[*reason] = atomic.NewInt64(discardedSampleIncrement) + o.discardSamplemu.Unlock() + } + } else if len(t.observed) < t.maxCardinality*2 { + t.observed[stream] = &Observation{ + lastUpdate: atomic.NewInt64(ts), + activeSerie: atomic.NewInt64(activeSeriesIncrement), + receivedSample: 
atomic.NewInt64(receviedSampleIncrement), + discardedSample: map[string]*atomic.Int64{}, + discardSamplemu: sync.RWMutex{}, + } + if discardedSampleIncrement > 0 && reason != nil { + t.observed[stream].discardSamplemu.Lock() + t.observed[stream].discardedSample[*reason] = atomic.NewInt64(discardedSampleIncrement) + t.observed[stream].discardSamplemu.Unlock() + } + } + + // If the maximum cardinality is hit all streams become `__overflow__`, the function would return true. + // the origin labels ovserved time is not updated, but the overflow hash is updated. + if !t.isOverflow && len(t.observed) > t.maxCardinality { + t.isOverflow = true + t.cleanupTracker() + t.cooldownUntil = atomic.NewInt64(ts + t.cooldownDuration) + } +} + +func (t *Tracker) GetInactiveObservations(deadline int64) []string { + if t == nil { + return nil + } + + // otherwise, we need to check all observations and clean up the ones that are inactive + var invalidKeys []string + t.obseveredMtx.RLock() + defer t.obseveredMtx.RUnlock() + for labkey, ob := range t.observed { + if ob != nil && ob.lastUpdate != nil && ob.lastUpdate.Load() <= deadline { + invalidKeys = append(invalidKeys, labkey) + } + } + + return invalidKeys +} + +func (t *Tracker) UpdateMaxCardinality(limit int) { + if t == nil { + return + } + t.maxCardinality = limit +} + +func (t *Tracker) UpdateCooldownDuration(cooldownDuration int64) { + if t == nil { + return + } + t.cooldownDuration = cooldownDuration +} + +func (t *Tracker) updateMetrics() { + if t == nil { + return + } + + if t.isOverflow { + // if we are in overflow state, we only report the overflow metric + t.activeSeriesPerUserAttribution.WithLabelValues(t.overflowLabels[:len(t.overflowLabels)-1]...).Set(float64(1)) + t.receivedSamplesAttribution.WithLabelValues(t.overflowLabels[:len(t.overflowLabels)-1]...).Add(float64(1)) + t.discardedSampleAttribution.WithLabelValues(t.overflowLabels...).Add(float64(1)) + } else { + t.obseveredMtx.Lock() + for key, c := range t.observed { + if c != nil { + keys := strings.Split(key, string(sep)) + keys = append(keys, t.userID) + if c.activeSerie.Load() != 0 { + t.activeSeriesPerUserAttribution.WithLabelValues(keys...).Add(float64(c.activeSerie.Swap(0))) + } + if c.receivedSample.Load() > 0 { + t.receivedSamplesAttribution.WithLabelValues(keys...).Add(float64(c.receivedSample.Swap(0))) + } + c.discardSamplemu.Lock() + for reason, cnt := range c.discardedSample { + if cnt.Load() > 0 { + t.discardedSampleAttribution.WithLabelValues(append(keys, reason)...).Add(float64(cnt.Swap(0))) + } + } + c.discardSamplemu.Unlock() + } + } + t.obseveredMtx.Unlock() + } +} diff --git a/pkg/costattribution/tracker_test.go b/pkg/costattribution/tracker_test.go new file mode 100644 index 00000000000..a53fdf7ec3f --- /dev/null +++ b/pkg/costattribution/tracker_test.go @@ -0,0 +1,144 @@ +// SPDX-License-Identifier: AGPL-3.0-only + +package costattribution + +import ( + "strings" + "testing" + "time" + + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/testutil" + "github.com/prometheus/prometheus/model/labels" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func Test_GetCALabels(t *testing.T) { + cat := newTestManager().TrackerForUser("user1") + assert.Equal(t, []string{"team"}, cat.CALabels(), "Expected cost attribution labels mismatch") +} + +func Test_GetMaxCardinality(t *testing.T) { + cat := newTestManager().TrackerForUser("user1") + assert.Equal(t, 5, cat.MaxCardinality(), "Expected max 
cardinality mismatch") +} + +func Test_CreateCleanupTracker(t *testing.T) { + tManager := newTestManager() + cat := tManager.TrackerForUser("user4") + + reg := prometheus.NewRegistry() + err := reg.Register(cat) + require.NoError(t, err) + + cat.IncrementActiveSeries(labels.FromStrings("platform", "foo", "tenant", "user4", "team", "1"), time.Unix(1, 0)) + cat.IncrementActiveSeries(labels.FromStrings("platform", "foo", "tenant", "user4", "team", "2"), time.Unix(2, 0)) + cat.DecrementActiveSeries(labels.FromStrings("platform", "foo", "tenant", "user4", "team", "3"), time.Unix(3, 0)) + cat.IncrementReceivedSamples(labels.FromStrings("platform", "foo", "tenant", "user4", "team", "1"), 5, time.Unix(4, 0)) + cat.IncrementDiscardedSamples(labels.FromStrings("platform", "foo", "tenant", "user4", "team", "1"), 2, "sample-out-of-order", time.Unix(4, 0)) + + cat.IncrementActiveSeries(labels.FromStrings("platform", "bar", "tenant", "user4", "team", "2"), time.Unix(6, 0)) + + cat.updateMetrics() + + expectedMetrics := ` + # HELP cortex_discarded_attributed_samples_total The total number of samples that were discarded per attribution. + # TYPE cortex_discarded_attributed_samples_total counter + cortex_discarded_attributed_samples_total{platform="foo",reason="sample-out-of-order", tenant="user4",tracker="cost-attribution"} 2 + # HELP cortex_ingester_attributed_active_series The total number of active series per user and attribution. + # TYPE cortex_ingester_attributed_active_series gauge + cortex_ingester_attributed_active_series{platform="bar",tenant="user4",tracker="cost-attribution"} 1 + cortex_ingester_attributed_active_series{platform="foo",tenant="user4",tracker="cost-attribution"} 1 + # HELP cortex_received_attributed_samples_total The total number of samples that were received per attribution. + # TYPE cortex_received_attributed_samples_total counter + cortex_received_attributed_samples_total{platform="foo",tenant="user4",tracker="cost-attribution"} 5 + ` + + metricNames := []string{ + "cortex_discarded_attributed_samples_total", + "cortex_received_attributed_samples_total", + "cortex_ingester_attributed_active_series", + } + assert.NoError(t, testutil.GatherAndCompare(reg, strings.NewReader(expectedMetrics), metricNames...)) + assert.Equal(t, []string{"foo"}, cat.GetInactiveObservations(5)) + tManager.purgeInactiveAttributionsUntil(5) + + expectedMetrics = ` + # HELP cortex_ingester_attributed_active_series The total number of active series per user and attribution. 
+ # TYPE cortex_ingester_attributed_active_series gauge + cortex_ingester_attributed_active_series{platform="bar",tenant="user4",tracker="cost-attribution"} 1 + ` + assert.NoError(t, testutil.GatherAndCompare(reg, strings.NewReader(expectedMetrics), metricNames...)) + cat.cleanupTracker() + assert.NoError(t, testutil.GatherAndCompare(reg, strings.NewReader(""), metricNames...)) +} + +// func Test_GetKeyValues(t *testing.T) { +// cat := newTestManager().TrackerForUser("user3") + +// // Test initial key values and overflow states +// keyVal1 := cat.updateOverflow(labels.FromStrings("department", "foo", "service", "bar"), 1) +// assert.Equal(t, []string{"foo", "bar", "user3"}, keyVal1, "First call, expecting values as-is") + +// keyVal2 := cat.getKeyValues(labels.FromStrings("department", "foo"), 3) +// assert.Equal(t, []string{"foo", "__missing__", "user3"}, keyVal2, "Service missing, should return '__missing__'") + +// keyVal3 := cat.getKeyValues(labels.FromStrings("department", "foo", "service", "baz", "team", "a"), 4) +// assert.Equal(t, []string{"__overflow__", "__overflow__", "user3"}, keyVal3, "Overflow state expected") + +// keyVal4 := cat.getKeyValues(labels.FromStrings("department", "foo", "service", "bar"), 5) +// assert.Equal(t, []string{"__overflow__", "__overflow__", "user3"}, keyVal4, "Overflow state expected") +// } + +func Test_UpdateCounters(t *testing.T) { + cat := newTestManager().TrackerForUser("user3") + lbls1 := labels.FromStrings("department", "foo", "service", "bar") + lbls2 := labels.FromStrings("department", "bar", "service", "baz") + lbls3 := labels.FromStrings("department", "baz", "service", "foo") + + cat.updateCounters(lbls1, 1, 1, 0, 0, nil) + assert.False(t, cat.isOverflow, "First observation, should not overflow") + + cat.updateCounters(lbls2, 2, 1, 0, 0, nil) + assert.False(t, cat.isOverflow, "Second observation, should not overflow") + + cat.updateCounters(lbls3, 3, 1, 0, 0, nil) + assert.True(t, cat.isOverflow, "Third observation, should overflow") + + cat.updateCounters(lbls3, 4, 1, 0, 0, nil) + assert.True(t, cat.isOverflow, "Fourth observation, should stay overflow") + + assert.Equal(t, int64(3+cat.cooldownDuration), cat.cooldownUntil.Load(), "CooldownUntil should be updated correctly") +} + +func Test_GetInactiveObservations(t *testing.T) { + // Setup the test environment: create a tracker for user1 with a "team" label and max cardinality of 5. + cat := newTestManager().TrackerForUser("user1") + + // Create two observations with different last update timestamps. + observations := []labels.Labels{ + labels.FromStrings("team", "foo"), + labels.FromStrings("team", "bar"), + } + // Simulate samples discarded with different timestamps. + cat.IncrementDiscardedSamples(observations[0], 1, "invalid-metrics-name", time.Unix(1, 0)) + cat.IncrementDiscardedSamples(observations[1], 2, "out-of-window-sample", time.Unix(12, 0)) + + // Ensure that two observations were successfully added to the tracker. + require.Len(t, cat.observed, 2) + + // Purge observations that haven't been updated in the last 10 seconds. + purged := cat.GetInactiveObservations(5) + require.Len(t, purged, 1) + + // Check that the purged observation matches the expected details. 
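+	// "foo" was last updated at ts=1, which is at or before the deadline of 5, while "bar" was updated at ts=12, so only "foo" is reported as inactive.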
+ assert.Equal(t, "foo", purged[0]) +} + +func Test_UpdateMaxCardinality(t *testing.T) { + // user1 original max cardinality is 5 + cat := newTestManager().TrackerForUser("user1") + cat.UpdateMaxCardinality(2) + assert.Equal(t, 2, cat.MaxCardinality(), "Expected max cardinality update to 2") +} diff --git a/pkg/distributor/allcase.txt b/pkg/distributor/allcase.txt new file mode 100644 index 00000000000..c2405a2affa --- /dev/null +++ b/pkg/distributor/allcase.txt @@ -0,0 +1,90 @@ +goos: darwin +goarch: amd64 +pkg: github.com/grafana/mimir/pkg/distributor +cpu: Intel(R) Core(TM) i5-1038NG7 CPU @ 2.00GHz +BenchmarkDistributor_Push/cost_attribution=disabled/scenario=all_samples_successfully_pushed-8 637 1853179 ns/op 166418 B/op 79 allocs/op +BenchmarkDistributor_Push/cost_attribution=disabled/scenario=all_samples_successfully_pushed-8 639 1895205 ns/op 165412 B/op 78 allocs/op +BenchmarkDistributor_Push/cost_attribution=disabled/scenario=all_samples_successfully_pushed-8 606 1892974 ns/op 163882 B/op 79 allocs/op +BenchmarkDistributor_Push/cost_attribution=disabled/scenario=all_samples_successfully_pushed-8 639 1907549 ns/op 167614 B/op 82 allocs/op +BenchmarkDistributor_Push/cost_attribution=disabled/scenario=all_samples_successfully_pushed-8 624 1884032 ns/op 164782 B/op 79 allocs/op +BenchmarkDistributor_Push/cost_attribution=disabled/scenario=all_samples_successfully_pushed-8 607 1887582 ns/op 166831 B/op 78 allocs/op +BenchmarkDistributor_Push/cost_attribution=disabled/scenario=ingestion_rate_limit_reached-8 1236 929959 ns/op 2508 B/op 43 allocs/op +BenchmarkDistributor_Push/cost_attribution=disabled/scenario=ingestion_rate_limit_reached-8 1281 982885 ns/op 2470 B/op 43 allocs/op +BenchmarkDistributor_Push/cost_attribution=disabled/scenario=ingestion_rate_limit_reached-8 1267 933902 ns/op 2442 B/op 43 allocs/op +BenchmarkDistributor_Push/cost_attribution=disabled/scenario=ingestion_rate_limit_reached-8 1216 936387 ns/op 2480 B/op 43 allocs/op +BenchmarkDistributor_Push/cost_attribution=disabled/scenario=ingestion_rate_limit_reached-8 1278 933974 ns/op 2499 B/op 43 allocs/op +BenchmarkDistributor_Push/cost_attribution=disabled/scenario=ingestion_rate_limit_reached-8 1214 929894 ns/op 2361 B/op 43 allocs/op +BenchmarkDistributor_Push/cost_attribution=disabled/scenario=too_many_labels_limit_reached-8 196 6103227 ns/op 1201367 B/op 5057 allocs/op +BenchmarkDistributor_Push/cost_attribution=disabled/scenario=too_many_labels_limit_reached-8 196 6092985 ns/op 1202556 B/op 5058 allocs/op +BenchmarkDistributor_Push/cost_attribution=disabled/scenario=too_many_labels_limit_reached-8 199 5986432 ns/op 1201042 B/op 5057 allocs/op +BenchmarkDistributor_Push/cost_attribution=disabled/scenario=too_many_labels_limit_reached-8 198 5930995 ns/op 1200964 B/op 5058 allocs/op +BenchmarkDistributor_Push/cost_attribution=disabled/scenario=too_many_labels_limit_reached-8 196 5947211 ns/op 1202236 B/op 5057 allocs/op +BenchmarkDistributor_Push/cost_attribution=disabled/scenario=too_many_labels_limit_reached-8 198 6130152 ns/op 1201279 B/op 5057 allocs/op +BenchmarkDistributor_Push/cost_attribution=disabled/scenario=max_label_name_length_limit_reached-8 298 3970543 ns/op 1137518 B/op 5057 allocs/op +BenchmarkDistributor_Push/cost_attribution=disabled/scenario=max_label_name_length_limit_reached-8 294 4233587 ns/op 1136289 B/op 5056 allocs/op +BenchmarkDistributor_Push/cost_attribution=disabled/scenario=max_label_name_length_limit_reached-8 270 4462189 ns/op 1136907 B/op 5057 allocs/op 
+BenchmarkDistributor_Push/cost_attribution=disabled/scenario=max_label_name_length_limit_reached-8 212 8484710 ns/op 1138269 B/op 5058 allocs/op +BenchmarkDistributor_Push/cost_attribution=disabled/scenario=max_label_name_length_limit_reached-8 138 8016180 ns/op 1137486 B/op 5057 allocs/op +BenchmarkDistributor_Push/cost_attribution=disabled/scenario=max_label_name_length_limit_reached-8 300 4011353 ns/op 1136971 B/op 5057 allocs/op +BenchmarkDistributor_Push/cost_attribution=disabled/scenario=max_label_value_length_limit_reached-8 187 6466417 ns/op 1218690 B/op 6057 allocs/op +BenchmarkDistributor_Push/cost_attribution=disabled/scenario=max_label_value_length_limit_reached-8 219 5879012 ns/op 1218969 B/op 6058 allocs/op +BenchmarkDistributor_Push/cost_attribution=disabled/scenario=max_label_value_length_limit_reached-8 219 5297183 ns/op 1216897 B/op 6057 allocs/op +BenchmarkDistributor_Push/cost_attribution=disabled/scenario=max_label_value_length_limit_reached-8 225 5271546 ns/op 1218623 B/op 6058 allocs/op +BenchmarkDistributor_Push/cost_attribution=disabled/scenario=max_label_value_length_limit_reached-8 223 5292385 ns/op 1218010 B/op 6058 allocs/op +BenchmarkDistributor_Push/cost_attribution=disabled/scenario=max_label_value_length_limit_reached-8 224 5264725 ns/op 1217656 B/op 6057 allocs/op +BenchmarkDistributor_Push/cost_attribution=disabled/scenario=timestamp_too_new-8 679 1728152 ns/op 324954 B/op 4054 allocs/op +BenchmarkDistributor_Push/cost_attribution=disabled/scenario=timestamp_too_new-8 666 1735820 ns/op 324749 B/op 4054 allocs/op +BenchmarkDistributor_Push/cost_attribution=disabled/scenario=timestamp_too_new-8 642 1762658 ns/op 324898 B/op 4054 allocs/op +BenchmarkDistributor_Push/cost_attribution=disabled/scenario=timestamp_too_new-8 660 1756348 ns/op 325070 B/op 4054 allocs/op +BenchmarkDistributor_Push/cost_attribution=disabled/scenario=timestamp_too_new-8 651 1826077 ns/op 324740 B/op 4054 allocs/op +BenchmarkDistributor_Push/cost_attribution=disabled/scenario=timestamp_too_new-8 678 1748920 ns/op 324890 B/op 4054 allocs/op +BenchmarkDistributor_Push/cost_attribution=disabled/scenario=all_samples_go_to_metric_relabel_configs-8 296 3914359 ns/op 1566607 B/op 7090 allocs/op +BenchmarkDistributor_Push/cost_attribution=disabled/scenario=all_samples_go_to_metric_relabel_configs-8 302 3929262 ns/op 1564399 B/op 7091 allocs/op +BenchmarkDistributor_Push/cost_attribution=disabled/scenario=all_samples_go_to_metric_relabel_configs-8 296 3945855 ns/op 1575166 B/op 7097 allocs/op +BenchmarkDistributor_Push/cost_attribution=disabled/scenario=all_samples_go_to_metric_relabel_configs-8 283 3969611 ns/op 1563748 B/op 7088 allocs/op +BenchmarkDistributor_Push/cost_attribution=disabled/scenario=all_samples_go_to_metric_relabel_configs-8 309 3923366 ns/op 1566517 B/op 7092 allocs/op +BenchmarkDistributor_Push/cost_attribution=disabled/scenario=all_samples_go_to_metric_relabel_configs-8 304 3883719 ns/op 1567272 B/op 7091 allocs/op +BenchmarkDistributor_Push/cost_attribution=enabled/scenario=all_samples_successfully_pushed-8 546 2160785 ns/op 206548 B/op 2081 allocs/op +BenchmarkDistributor_Push/cost_attribution=enabled/scenario=all_samples_successfully_pushed-8 522 2198887 ns/op 206315 B/op 2079 allocs/op +BenchmarkDistributor_Push/cost_attribution=enabled/scenario=all_samples_successfully_pushed-8 553 2203611 ns/op 207243 B/op 2081 allocs/op +BenchmarkDistributor_Push/cost_attribution=enabled/scenario=all_samples_successfully_pushed-8 536 2183919 ns/op 206614 B/op 2079 allocs/op 
+BenchmarkDistributor_Push/cost_attribution=enabled/scenario=all_samples_successfully_pushed-8 529 2197734 ns/op 206679 B/op 2080 allocs/op +BenchmarkDistributor_Push/cost_attribution=enabled/scenario=all_samples_successfully_pushed-8 526 2390356 ns/op 208231 B/op 2083 allocs/op +BenchmarkDistributor_Push/cost_attribution=enabled/scenario=ingestion_rate_limit_reached-8 1134 1021807 ns/op 2511 B/op 46 allocs/op +BenchmarkDistributor_Push/cost_attribution=enabled/scenario=ingestion_rate_limit_reached-8 1042 1100917 ns/op 2390 B/op 46 allocs/op +BenchmarkDistributor_Push/cost_attribution=enabled/scenario=ingestion_rate_limit_reached-8 1124 2202501 ns/op 2586 B/op 46 allocs/op +BenchmarkDistributor_Push/cost_attribution=enabled/scenario=ingestion_rate_limit_reached-8 1197 994139 ns/op 2535 B/op 46 allocs/op +BenchmarkDistributor_Push/cost_attribution=enabled/scenario=ingestion_rate_limit_reached-8 1208 1193533 ns/op 2566 B/op 46 allocs/op +BenchmarkDistributor_Push/cost_attribution=enabled/scenario=ingestion_rate_limit_reached-8 1155 980992 ns/op 2543 B/op 46 allocs/op +BenchmarkDistributor_Push/cost_attribution=enabled/scenario=too_many_labels_limit_reached-8 182 6460385 ns/op 1249816 B/op 8058 allocs/op +BenchmarkDistributor_Push/cost_attribution=enabled/scenario=too_many_labels_limit_reached-8 186 6477713 ns/op 1249483 B/op 8058 allocs/op +BenchmarkDistributor_Push/cost_attribution=enabled/scenario=too_many_labels_limit_reached-8 184 6418529 ns/op 1249053 B/op 8058 allocs/op +BenchmarkDistributor_Push/cost_attribution=enabled/scenario=too_many_labels_limit_reached-8 187 6438986 ns/op 1251058 B/op 8058 allocs/op +BenchmarkDistributor_Push/cost_attribution=enabled/scenario=too_many_labels_limit_reached-8 184 6596688 ns/op 1250124 B/op 8058 allocs/op +BenchmarkDistributor_Push/cost_attribution=enabled/scenario=too_many_labels_limit_reached-8 184 6355415 ns/op 1249171 B/op 8058 allocs/op +BenchmarkDistributor_Push/cost_attribution=enabled/scenario=max_label_name_length_limit_reached-8 270 4314450 ns/op 1185126 B/op 8057 allocs/op +BenchmarkDistributor_Push/cost_attribution=enabled/scenario=max_label_name_length_limit_reached-8 270 4359515 ns/op 1185189 B/op 8057 allocs/op +BenchmarkDistributor_Push/cost_attribution=enabled/scenario=max_label_name_length_limit_reached-8 266 4713813 ns/op 1185453 B/op 8057 allocs/op +BenchmarkDistributor_Push/cost_attribution=enabled/scenario=max_label_name_length_limit_reached-8 192 7228030 ns/op 1184618 B/op 8057 allocs/op +BenchmarkDistributor_Push/cost_attribution=enabled/scenario=max_label_name_length_limit_reached-8 277 4415891 ns/op 1185916 B/op 8057 allocs/op +BenchmarkDistributor_Push/cost_attribution=enabled/scenario=max_label_name_length_limit_reached-8 273 4381573 ns/op 1185519 B/op 8057 allocs/op +BenchmarkDistributor_Push/cost_attribution=enabled/scenario=max_label_value_length_limit_reached-8 205 5639131 ns/op 1266525 B/op 9058 allocs/op +BenchmarkDistributor_Push/cost_attribution=enabled/scenario=max_label_value_length_limit_reached-8 199 5855152 ns/op 1266610 B/op 9058 allocs/op +BenchmarkDistributor_Push/cost_attribution=enabled/scenario=max_label_value_length_limit_reached-8 205 5819444 ns/op 1265659 B/op 9058 allocs/op +BenchmarkDistributor_Push/cost_attribution=enabled/scenario=max_label_value_length_limit_reached-8 208 5725823 ns/op 1266413 B/op 9058 allocs/op +BenchmarkDistributor_Push/cost_attribution=enabled/scenario=max_label_value_length_limit_reached-8 174 5847499 ns/op 1267865 B/op 9059 allocs/op 
+BenchmarkDistributor_Push/cost_attribution=enabled/scenario=max_label_value_length_limit_reached-8 208 6097036 ns/op 1266590 B/op 9058 allocs/op +BenchmarkDistributor_Push/cost_attribution=enabled/scenario=timestamp_too_new-8 541 2116989 ns/op 373316 B/op 7054 allocs/op +BenchmarkDistributor_Push/cost_attribution=enabled/scenario=timestamp_too_new-8 538 2106373 ns/op 373357 B/op 7054 allocs/op +BenchmarkDistributor_Push/cost_attribution=enabled/scenario=timestamp_too_new-8 540 2082513 ns/op 372600 B/op 7054 allocs/op +BenchmarkDistributor_Push/cost_attribution=enabled/scenario=timestamp_too_new-8 538 2081507 ns/op 372469 B/op 7054 allocs/op +BenchmarkDistributor_Push/cost_attribution=enabled/scenario=timestamp_too_new-8 564 2075526 ns/op 372886 B/op 7054 allocs/op +BenchmarkDistributor_Push/cost_attribution=enabled/scenario=timestamp_too_new-8 537 2084192 ns/op 372905 B/op 7054 allocs/op +BenchmarkDistributor_Push/cost_attribution=enabled/scenario=all_samples_go_to_metric_relabel_configs-8 283 4218062 ns/op 1605597 B/op 9089 allocs/op +BenchmarkDistributor_Push/cost_attribution=enabled/scenario=all_samples_go_to_metric_relabel_configs-8 279 4221775 ns/op 1605400 B/op 9090 allocs/op +BenchmarkDistributor_Push/cost_attribution=enabled/scenario=all_samples_go_to_metric_relabel_configs-8 285 4162809 ns/op 1608485 B/op 9089 allocs/op +BenchmarkDistributor_Push/cost_attribution=enabled/scenario=all_samples_go_to_metric_relabel_configs-8 282 4246972 ns/op 1604051 B/op 9092 allocs/op +BenchmarkDistributor_Push/cost_attribution=enabled/scenario=all_samples_go_to_metric_relabel_configs-8 286 4189918 ns/op 1605391 B/op 9088 allocs/op +BenchmarkDistributor_Push/cost_attribution=enabled/scenario=all_samples_go_to_metric_relabel_configs-8 290 4179196 ns/op 1602168 B/op 9087 allocs/op +PASS +ok github.com/grafana/mimir/pkg/distributor 389.802s diff --git a/pkg/distributor/distributor.go b/pkg/distributor/distributor.go index 3723d46b669..cbe961cab37 100644 --- a/pkg/distributor/distributor.go +++ b/pkg/distributor/distributor.go @@ -48,6 +48,7 @@ import ( "golang.org/x/sync/errgroup" "github.com/grafana/mimir/pkg/cardinality" + "github.com/grafana/mimir/pkg/costattribution" ingester_client "github.com/grafana/mimir/pkg/ingester/client" "github.com/grafana/mimir/pkg/mimirpb" "github.com/grafana/mimir/pkg/querier/stats" @@ -111,6 +112,7 @@ type Distributor struct { distributorsRing *ring.Ring healthyInstancesCount *atomic.Uint32 + costAttributionMgr *costattribution.Manager // For handling HA replicas. 
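A minimal sketch of how the new costAttributionMgr field is consumed on the write path, condensed from the hunks below; the helper name recordDiscardedSamples is hypothetical, and the nil handling reflects the assumption (suggested by the tests passing a nil manager) that both the manager and the returned tracker tolerate nil receivers:

    package distributor

    import (
        "time"

        "github.com/grafana/mimir/pkg/costattribution"
        "github.com/grafana/mimir/pkg/mimirpb"
    )

    // recordDiscardedSamples shows the pattern used by the HA-dedupe and rate-limit
    // middlewares: look up the per-tenant tracker once, then attribute the discarded
    // samples to the labels of the first series in the request.
    func recordDiscardedSamples(mgr *costattribution.Manager, userID string, timeseries []mimirpb.PreallocTimeseries, count int, reason string, now time.Time) {
        if len(timeseries) == 0 {
            return // nothing to attribute
        }
        cat := mgr.TrackerForUser(userID) // assumed safe to call even when mgr is nil
        cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(timeseries[0].Labels), float64(count), reason, now)
    }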
HATracker haTracker @@ -320,7 +322,7 @@ func (m *PushMetrics) deleteUserMetrics(user string) { } // New constructs a new Distributor -func New(cfg Config, clientConfig ingester_client.Config, limits *validation.Overrides, activeGroupsCleanupService *util.ActiveGroupsCleanupService, ingestersRing ring.ReadRing, partitionsRing *ring.PartitionInstanceRing, canJoinDistributorsRing bool, reg prometheus.Registerer, log log.Logger) (*Distributor, error) { +func New(cfg Config, clientConfig ingester_client.Config, limits *validation.Overrides, activeGroupsCleanupService *util.ActiveGroupsCleanupService, costAttributionMgr *costattribution.Manager, ingestersRing ring.ReadRing, partitionsRing *ring.PartitionInstanceRing, canJoinDistributorsRing bool, reg prometheus.Registerer, log log.Logger) (*Distributor, error) { clientMetrics := ingester_client.NewMetrics(reg) if cfg.IngesterClientFactory == nil { cfg.IngesterClientFactory = ring_client.PoolInstFunc(func(inst ring.InstanceDesc) (ring_client.PoolClient, error) { @@ -341,6 +343,7 @@ func New(cfg Config, clientConfig ingester_client.Config, limits *validation.Ove ingesterPool: NewPool(cfg.PoolConfig, ingestersRing, cfg.IngesterClientFactory, log), healthyInstancesCount: atomic.NewUint32(0), limits: limits, + costAttributionMgr: costAttributionMgr, ingestionRate: util_math.NewEWMARate(0.2, instanceIngestionRateTickInterval), queryDuration: instrument.NewHistogramCollector(promauto.With(reg).NewHistogramVec(prometheus.HistogramOpts{ @@ -733,21 +736,22 @@ func (d *Distributor) checkSample(ctx context.Context, userID, cluster, replica // The returned error may retain the series labels. // It uses the passed nowt time to observe the delay of sample timestamps. func (d *Distributor) validateSeries(nowt time.Time, ts *mimirpb.PreallocTimeseries, userID, group string, skipLabelValidation, skipLabelCountValidation bool, minExemplarTS, maxExemplarTS int64) error { - if err := validateLabels(d.sampleValidationMetrics, d.limits, userID, group, ts.Labels, skipLabelValidation, skipLabelCountValidation); err != nil { + cat := d.costAttributionMgr.TrackerForUser(userID) + if err := validateLabels(d.sampleValidationMetrics, d.limits, userID, group, ts.Labels, skipLabelValidation, skipLabelCountValidation, cat, nowt); err != nil { return err } now := model.TimeFromUnixNano(nowt.UnixNano()) for _, s := range ts.Samples { - if err := validateSample(d.sampleValidationMetrics, now, d.limits, userID, group, ts.Labels, s); err != nil { + if err := validateSample(d.sampleValidationMetrics, now, d.limits, userID, group, ts.Labels, s, cat); err != nil { return err } } histogramsUpdated := false for i := range ts.Histograms { - updated, err := validateSampleHistogram(d.sampleValidationMetrics, now, d.limits, userID, group, ts.Labels, &ts.Histograms[i]) + updated, err := validateSampleHistogram(d.sampleValidationMetrics, now, d.limits, userID, group, ts.Labels, &ts.Histograms[i], cat) if err != nil { return err } @@ -859,7 +863,8 @@ func (d *Distributor) prePushHaDedupeMiddleware(next PushFunc) PushFunc { } numSamples := 0 - group := d.activeGroups.UpdateActiveGroupTimestamp(userID, validation.GroupLabel(d.limits, userID, req.Timeseries), time.Now()) + now := time.Now() + group := d.activeGroups.UpdateActiveGroupTimestamp(userID, validation.GroupLabel(d.limits, userID, req.Timeseries), now) for _, ts := range req.Timeseries { numSamples += len(ts.Samples) + len(ts.Histograms) } @@ -873,6 +878,7 @@ func (d *Distributor) prePushHaDedupeMiddleware(next PushFunc) PushFunc { if 
errors.As(err, &tooManyClustersError{}) { d.discardedSamplesTooManyHaClusters.WithLabelValues(userID, group).Add(float64(numSamples)) + d.costAttributionMgr.TrackerForUser(userID).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(req.Timeseries[0].Labels), float64(numSamples), reasonTooManyHAClusters, now) } return err @@ -1128,6 +1134,9 @@ func (d *Distributor) prePushValidationMiddleware(next PushFunc) PushFunc { totalN := validatedSamples + validatedExemplars + validatedMetadata if !d.ingestionRateLimiter.AllowN(now, userID, totalN) { + if len(req.Timeseries) > 0 { + d.costAttributionMgr.TrackerForUser(userID).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(req.Timeseries[0].Labels), float64(validatedSamples), reasonRateLimited, now) + } d.discardedSamplesRateLimited.WithLabelValues(userID, group).Add(float64(validatedSamples)) d.discardedExemplarsRateLimited.WithLabelValues(userID).Add(float64(validatedExemplars)) d.discardedMetadataRateLimited.WithLabelValues(userID).Add(float64(validatedMetadata)) @@ -1686,9 +1695,11 @@ func tokenForMetadata(userID string, metricName string) uint32 { func (d *Distributor) updateReceivedMetrics(req *mimirpb.WriteRequest, userID string) { var receivedSamples, receivedExemplars, receivedMetadata int + for _, ts := range req.Timeseries { receivedSamples += len(ts.TimeSeries.Samples) + len(ts.TimeSeries.Histograms) receivedExemplars += len(ts.TimeSeries.Exemplars) + d.costAttributionMgr.TrackerForUser(userID).IncrementReceivedSamples(mimirpb.FromLabelAdaptersToLabels(ts.Labels), float64(receivedSamples), mtime.Now()) } receivedMetadata = len(req.Metadata) diff --git a/pkg/distributor/distributor_test.go b/pkg/distributor/distributor_test.go index d6f17f22a7f..4ed4e0b2fcc 100644 --- a/pkg/distributor/distributor_test.go +++ b/pkg/distributor/distributor_test.go @@ -54,6 +54,7 @@ import ( "google.golang.org/grpc/metadata" "github.com/grafana/mimir/pkg/cardinality" + "github.com/grafana/mimir/pkg/costattribution" "github.com/grafana/mimir/pkg/ingester" "github.com/grafana/mimir/pkg/ingester/client" "github.com/grafana/mimir/pkg/mimirpb" @@ -1828,7 +1829,7 @@ func mkLabels(n int, extra ...string) []mimirpb.LabelAdapter { ret[i+1] = mimirpb.LabelAdapter{Name: fmt.Sprintf("name_%d", i), Value: fmt.Sprintf("value_%d", i)} } for i := 0; i < len(extra); i += 2 { - ret[i+n+1] = mimirpb.LabelAdapter{Name: extra[i], Value: extra[i+1]} + ret[i/2+n+1] = mimirpb.LabelAdapter{Name: extra[i], Value: extra[i+1]} } slices.SortFunc(ret, func(a, b mimirpb.LabelAdapter) int { switch { @@ -1861,7 +1862,7 @@ func BenchmarkDistributor_Push(b *testing.B) { samples := make([]mimirpb.Sample, numSeriesPerRequest) for i := 0; i < numSeriesPerRequest; i++ { - metrics[i] = mkLabels(10) + metrics[i] = mkLabels(10, "team", strconv.Itoa(i%4)) samples[i] = mimirpb.Sample{ Value: float64(i), TimestampMs: time.Now().UnixNano() / int64(time.Millisecond), @@ -1882,7 +1883,7 @@ func BenchmarkDistributor_Push(b *testing.B) { samples := make([]mimirpb.Sample, numSeriesPerRequest) for i := 0; i < numSeriesPerRequest; i++ { - metrics[i] = mkLabels(10) + metrics[i] = mkLabels(10, "team", strconv.Itoa(i%4)) samples[i] = mimirpb.Sample{ Value: float64(i), TimestampMs: time.Now().UnixNano() / int64(time.Millisecond), @@ -1902,7 +1903,7 @@ func BenchmarkDistributor_Push(b *testing.B) { samples := make([]mimirpb.Sample, numSeriesPerRequest) for i := 0; i < numSeriesPerRequest; i++ { - metrics[i] = mkLabels(31) + metrics[i] = mkLabels(30, "team", strconv.Itoa(i%4)) samples[i] = 
mimirpb.Sample{ Value: float64(i), TimestampMs: time.Now().UnixNano() / int64(time.Millisecond), @@ -1923,7 +1924,7 @@ func BenchmarkDistributor_Push(b *testing.B) { for i := 0; i < numSeriesPerRequest; i++ { // Add a label with a very long name. - metrics[i] = mkLabels(10, fmt.Sprintf("xxx_%0.200d", 1), "xxx") + metrics[i] = mkLabels(10, fmt.Sprintf("xxx_%0.200d", 1), "xxx", "team", strconv.Itoa(i%4)) samples[i] = mimirpb.Sample{ Value: float64(i), TimestampMs: time.Now().UnixNano() / int64(time.Millisecond), @@ -1944,7 +1945,7 @@ func BenchmarkDistributor_Push(b *testing.B) { for i := 0; i < numSeriesPerRequest; i++ { // Add a label with a very long value. - metrics[i] = mkLabels(10, "xxx", fmt.Sprintf("xxx_%0.200d", 1)) + metrics[i] = mkLabels(10, "xxx", fmt.Sprintf("xxx_%0.200d", 1), "team", strconv.Itoa(i%4)) samples[i] = mimirpb.Sample{ Value: float64(i), TimestampMs: time.Now().UnixNano() / int64(time.Millisecond), @@ -1964,7 +1965,7 @@ func BenchmarkDistributor_Push(b *testing.B) { samples := make([]mimirpb.Sample, numSeriesPerRequest) for i := 0; i < numSeriesPerRequest; i++ { - metrics[i] = mkLabels(10) + metrics[i] = mkLabels(10, "team", strconv.Itoa(i%4)) samples[i] = mimirpb.Sample{ Value: float64(i), TimestampMs: time.Now().Add(time.Hour).UnixNano() / int64(time.Millisecond), @@ -1975,7 +1976,7 @@ func BenchmarkDistributor_Push(b *testing.B) { }, expectedErr: "received a sample whose timestamp is too far in the future", }, - "all samples go to metric_relabel_configs": { + "all samples go to metric relabel configs": { prepareConfig: func(limits *validation.Limits) { limits.MetricRelabelConfigs = []*relabel.Config{ { @@ -1992,7 +1993,7 @@ func BenchmarkDistributor_Push(b *testing.B) { samples := make([]mimirpb.Sample, numSeriesPerRequest) for i := 0; i < numSeriesPerRequest; i++ { - metrics[i] = mkLabels(10) + metrics[i] = mkLabels(10, "team", strconv.Itoa(i%4)) samples[i] = mimirpb.Sample{ Value: float64(i), TimestampMs: time.Now().UnixNano() / int64(time.Millisecond), @@ -2005,78 +2006,110 @@ func BenchmarkDistributor_Push(b *testing.B) { }, } - for testName, testData := range tests { - b.Run(testName, func(b *testing.B) { - // Create an in-memory KV store for the ring with 1 ingester registered. 
- kvStore, closer := consul.NewInMemoryClient(ring.GetCodec(), log.NewNopLogger(), nil) - b.Cleanup(func() { assert.NoError(b, closer.Close()) }) + costAttributionCases := []struct { + state string + customRegistry *prometheus.Registry + cfg func(limits *validation.Limits) + }{ + { + state: "disabled", + customRegistry: nil, + cfg: func(_ *validation.Limits) {}, + }, + { + state: "enabled", + customRegistry: prometheus.NewRegistry(), + cfg: func(limits *validation.Limits) { + limits.CostAttributionLabels = []string{"team"} + limits.MaxCostAttributionCardinalityPerUser = 100 + }, + }, + } - err := kvStore.CAS(context.Background(), ingester.IngesterRingKey, - func(_ interface{}) (interface{}, bool, error) { - d := &ring.Desc{} - d.AddIngester("ingester-1", "127.0.0.1", "", ring.NewRandomTokenGenerator().GenerateTokens(128, nil), ring.ACTIVE, time.Now(), false, time.Time{}) - return d, true, nil - }, - ) - require.NoError(b, err) - - ingestersRing, err := ring.New(ring.Config{ - KVStore: kv.Config{Mock: kvStore}, - HeartbeatTimeout: 60 * time.Minute, - ReplicationFactor: 1, - }, ingester.IngesterRingKey, ingester.IngesterRingKey, log.NewNopLogger(), nil) - require.NoError(b, err) - require.NoError(b, services.StartAndAwaitRunning(context.Background(), ingestersRing)) - b.Cleanup(func() { - require.NoError(b, services.StopAndAwaitTerminated(context.Background(), ingestersRing)) - }) + for _, caCase := range costAttributionCases { + b.Run(fmt.Sprintf("cost_attribution=%s", caCase.state), func(b *testing.B) { + for testName, testData := range tests { + b.Run(fmt.Sprintf("scenario=%s", testName), func(b *testing.B) { + // Create an in-memory KV store for the ring with 1 ingester registered. + kvStore, closer := consul.NewInMemoryClient(ring.GetCodec(), log.NewNopLogger(), nil) + b.Cleanup(func() { assert.NoError(b, closer.Close()) }) - test.Poll(b, time.Second, 1, func() interface{} { - return ingestersRing.InstancesCount() - }) + err := kvStore.CAS(context.Background(), ingester.IngesterRingKey, + func(_ interface{}) (interface{}, bool, error) { + d := &ring.Desc{} + d.AddIngester("ingester-1", "127.0.0.1", "", ring.NewRandomTokenGenerator().GenerateTokens(128, nil), ring.ACTIVE, time.Now(), false, time.Time{}) + return d, true, nil + }, + ) + require.NoError(b, err) + + ingestersRing, err := ring.New(ring.Config{ + KVStore: kv.Config{Mock: kvStore}, + HeartbeatTimeout: 60 * time.Minute, + ReplicationFactor: 1, + }, ingester.IngesterRingKey, ingester.IngesterRingKey, log.NewNopLogger(), nil) + require.NoError(b, err) + require.NoError(b, services.StartAndAwaitRunning(context.Background(), ingestersRing)) + b.Cleanup(func() { + require.NoError(b, services.StopAndAwaitTerminated(context.Background(), ingestersRing)) + }) - // Prepare the distributor configuration. - var distributorCfg Config - var clientConfig client.Config - limits := validation.Limits{} - flagext.DefaultValues(&distributorCfg, &clientConfig, &limits) - distributorCfg.DistributorRing.Common.KVStore.Store = "inmemory" + test.Poll(b, time.Second, 1, func() interface{} { + return ingestersRing.InstancesCount() + }) - limits.IngestionRate = float64(rate.Inf) // Unlimited. - testData.prepareConfig(&limits) + // Prepare the distributor configuration. 
+ var distributorCfg Config + var clientConfig client.Config + limits := validation.Limits{} + flagext.DefaultValues(&distributorCfg, &clientConfig, &limits) + distributorCfg.DistributorRing.Common.KVStore.Store = "inmemory" - distributorCfg.IngesterClientFactory = ring_client.PoolInstFunc(func(ring.InstanceDesc) (ring_client.PoolClient, error) { - return &noopIngester{}, nil - }) + limits.IngestionRate = float64(rate.Inf) // Unlimited. + testData.prepareConfig(&limits) - overrides, err := validation.NewOverrides(limits, nil) - require.NoError(b, err) + distributorCfg.IngesterClientFactory = ring_client.PoolInstFunc(func(ring.InstanceDesc) (ring_client.PoolClient, error) { + return &noopIngester{}, nil + }) - // Start the distributor. - distributor, err := New(distributorCfg, clientConfig, overrides, nil, ingestersRing, nil, true, nil, log.NewNopLogger()) - require.NoError(b, err) - require.NoError(b, services.StartAndAwaitRunning(context.Background(), distributor)) + caCase.cfg(&limits) + overrides, err := validation.NewOverrides(limits, nil) + require.NoError(b, err) - b.Cleanup(func() { - require.NoError(b, services.StopAndAwaitTerminated(context.Background(), distributor)) - }) + // Initialize the cost attribution manager + var cam *costattribution.Manager + if caCase.customRegistry != nil { + cam, err = costattribution.NewManager(5*time.Second, time.Second, 10*time.Second, nil, overrides, caCase.customRegistry) + require.NoError(b, err) + } - // Prepare the series to remote write before starting the benchmark. - metrics, samples := testData.prepareSeries() + // Start the distributor. + distributor, err := New(distributorCfg, clientConfig, overrides, nil, cam, ingestersRing, nil, true, nil, log.NewNopLogger()) + require.NoError(b, err) + require.NoError(b, services.StartAndAwaitRunning(context.Background(), distributor)) - // Run the benchmark. - b.ReportAllocs() - b.ResetTimer() + b.Cleanup(func() { + require.NoError(b, services.StopAndAwaitTerminated(context.Background(), distributor)) + }) - for n := 0; n < b.N; n++ { - _, err := distributor.Push(ctx, mimirpb.ToWriteRequest(metrics, samples, nil, nil, mimirpb.API)) + // Prepare the series to remote write before starting the benchmark. + metrics, samples := testData.prepareSeries() - if testData.expectedErr == "" && err != nil { - b.Fatalf("no error expected but got %v", err) - } - if testData.expectedErr != "" && (err == nil || !strings.Contains(err.Error(), testData.expectedErr)) { - b.Fatalf("expected %v error but got %v", testData.expectedErr, err) - } + // Run the benchmark. 
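For reference, the wiring that the new cost_attribution benchmark dimension exercises condenses to the sketch below. The helper name is hypothetical, the ring and client plumbing is as in the benchmark above, and the positional arguments to costattribution.NewManager are reproduced as used there; this diff does not spell out their individual meanings.

    // newCostAttributedDistributor mirrors the "enabled" benchmark case: configure the
    // cost attribution limits, build a manager against a dedicated registry, and pass it
    // to the distributor constructor (a nil manager keeps cost attribution disabled).
    func newCostAttributedDistributor(distributorCfg Config, clientConfig client.Config, limits validation.Limits, ingestersRing ring.ReadRing, reg *prometheus.Registry) (*Distributor, error) {
        limits.CostAttributionLabels = []string{"team"}
        limits.MaxCostAttributionCardinalityPerUser = 100

        overrides, err := validation.NewOverrides(limits, nil)
        if err != nil {
            return nil, err
        }

        cam, err := costattribution.NewManager(5*time.Second, time.Second, 10*time.Second, nil, overrides, reg)
        if err != nil {
            return nil, err
        }

        return New(distributorCfg, clientConfig, overrides, nil, cam, ingestersRing, nil, true, nil, log.NewNopLogger())
    }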
+ b.ReportAllocs() + b.ResetTimer() + + for n := 0; n < b.N; n++ { + _, err := distributor.Push(ctx, mimirpb.ToWriteRequest(metrics, samples, nil, nil, mimirpb.API)) + + if testData.expectedErr == "" && err != nil { + b.Fatalf("no error expected but got %v", err) + } + if testData.expectedErr != "" && (err == nil || !strings.Contains(err.Error(), testData.expectedErr)) { + b.Fatalf("expected %v error but got %v", testData.expectedErr, err) + } + } + }) } }) } @@ -5339,7 +5372,7 @@ func prepare(t testing.TB, cfg prepConfig) ([]*Distributor, []*mockIngester, []* require.NoError(t, err) reg := prometheus.NewPedanticRegistry() - d, err := New(distributorCfg, clientConfig, overrides, nil, ingestersRing, partitionsRing, true, reg, log.NewNopLogger()) + d, err := New(distributorCfg, clientConfig, overrides, nil, nil, ingestersRing, partitionsRing, true, reg, log.NewNopLogger()) require.NoError(t, err) require.NoError(t, services.StartAndAwaitRunning(ctx, d)) t.Cleanup(func() { @@ -7973,7 +8006,7 @@ func TestCheckStartedMiddleware(t *testing.T) { overrides, err := validation.NewOverrides(limits, nil) require.NoError(t, err) - distributor, err := New(distributorConfig, clientConfig, overrides, nil, ingestersRing, nil, true, nil, log.NewNopLogger()) + distributor, err := New(distributorConfig, clientConfig, overrides, nil, nil, ingestersRing, nil, true, nil, log.NewNopLogger()) require.NoError(t, err) ctx := user.InjectOrgID(context.Background(), "user") diff --git a/pkg/distributor/validate.go b/pkg/distributor/validate.go index fffc943b6c0..1d398851099 100644 --- a/pkg/distributor/validate.go +++ b/pkg/distributor/validate.go @@ -16,6 +16,7 @@ import ( "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/common/model" + "github.com/grafana/mimir/pkg/costattribution" "github.com/grafana/mimir/pkg/mimirpb" "github.com/grafana/mimir/pkg/util/extract" "github.com/grafana/mimir/pkg/util/globalerror" @@ -232,15 +233,17 @@ func newExemplarValidationMetrics(r prometheus.Registerer) *exemplarValidationMe // validateSample returns an err if the sample is invalid. // The returned error may retain the provided series labels. // It uses the passed 'now' time to measure the relative time of the sample. 
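The validation changes below call tracker methods without nil checks, while the updated tests pass a nil *costattribution.Tracker, so the tracker's methods are presumably nil-receiver safe. The snippet below is a standalone illustration of that Go pattern under this assumption, not the actual costattribution implementation:

    package main

    import "fmt"

    // tracker shows why callers can skip "if cat != nil": methods with a pointer
    // receiver can simply no-op when the receiver is nil.
    type tracker struct{ discarded float64 }

    func (t *tracker) IncrementDiscardedSamples(n float64) {
        if t == nil {
            return // cost attribution disabled: nothing to record
        }
        t.discarded += n
    }

    func main() {
        var disabled *tracker                 // nil, as in the tests that pass nil to validateSample
        disabled.IncrementDiscardedSamples(1) // safe no-op

        enabled := &tracker{}
        enabled.IncrementDiscardedSamples(1)
        fmt.Println(enabled.discarded) // 1
    }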
-func validateSample(m *sampleValidationMetrics, now model.Time, cfg sampleValidationConfig, userID, group string, ls []mimirpb.LabelAdapter, s mimirpb.Sample) error { +func validateSample(m *sampleValidationMetrics, now model.Time, cfg sampleValidationConfig, userID, group string, ls []mimirpb.LabelAdapter, s mimirpb.Sample, cat *costattribution.Tracker) error { if model.Time(s.TimestampMs) > now.Add(cfg.CreationGracePeriod(userID)) { m.tooFarInFuture.WithLabelValues(userID, group).Inc() + cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(ls), 1, reasonTooFarInFuture, now.Time()) unsafeMetricName, _ := extract.UnsafeMetricNameFromLabelAdapters(ls) return fmt.Errorf(sampleTimestampTooNewMsgFormat, s.TimestampMs, unsafeMetricName) } if cfg.PastGracePeriod(userID) > 0 && model.Time(s.TimestampMs) < now.Add(-cfg.PastGracePeriod(userID)).Add(-cfg.OutOfOrderTimeWindow(userID)) { m.tooFarInPast.WithLabelValues(userID, group).Inc() + cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(ls), 1, reasonTooFarInPast, now.Time()) unsafeMetricName, _ := extract.UnsafeMetricNameFromLabelAdapters(ls) return fmt.Errorf(sampleTimestampTooOldMsgFormat, s.TimestampMs, unsafeMetricName) } @@ -251,20 +254,23 @@ func validateSample(m *sampleValidationMetrics, now model.Time, cfg sampleValida // validateSampleHistogram returns an err if the sample is invalid. // The returned error may retain the provided series labels. // It uses the passed 'now' time to measure the relative time of the sample. -func validateSampleHistogram(m *sampleValidationMetrics, now model.Time, cfg sampleValidationConfig, userID, group string, ls []mimirpb.LabelAdapter, s *mimirpb.Histogram) (bool, error) { +func validateSampleHistogram(m *sampleValidationMetrics, now model.Time, cfg sampleValidationConfig, userID, group string, ls []mimirpb.LabelAdapter, s *mimirpb.Histogram, cat *costattribution.Tracker) (bool, error) { if model.Time(s.Timestamp) > now.Add(cfg.CreationGracePeriod(userID)) { + cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(ls), 1, reasonTooFarInFuture, now.Time()) m.tooFarInFuture.WithLabelValues(userID, group).Inc() unsafeMetricName, _ := extract.UnsafeMetricNameFromLabelAdapters(ls) return false, fmt.Errorf(sampleTimestampTooNewMsgFormat, s.Timestamp, unsafeMetricName) } if cfg.PastGracePeriod(userID) > 0 && model.Time(s.Timestamp) < now.Add(-cfg.PastGracePeriod(userID)).Add(-cfg.OutOfOrderTimeWindow(userID)) { + cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(ls), 1, reasonTooFarInPast, now.Time()) m.tooFarInPast.WithLabelValues(userID, group).Inc() unsafeMetricName, _ := extract.UnsafeMetricNameFromLabelAdapters(ls) return false, fmt.Errorf(sampleTimestampTooOldMsgFormat, s.Timestamp, unsafeMetricName) } if s.Schema < mimirpb.MinimumHistogramSchema || s.Schema > mimirpb.MaximumHistogramSchema { + cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(ls), 1, reasonInvalidNativeHistogramSchema, now.Time()) m.invalidNativeHistogramSchema.WithLabelValues(userID, group).Inc() return false, fmt.Errorf(invalidSchemaNativeHistogramMsgFormat, s.Schema) } @@ -278,6 +284,7 @@ func validateSampleHistogram(m *sampleValidationMetrics, now model.Time, cfg sam } if bucketCount > bucketLimit { if !cfg.ReduceNativeHistogramOverMaxBuckets(userID) { + cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(ls), 1, reasonMaxNativeHistogramBuckets, now.Time()) m.maxNativeHistogramBuckets.WithLabelValues(userID, group).Inc() return false, 
fmt.Errorf(maxNativeHistogramBucketsMsgFormat, s.Timestamp, mimirpb.FromLabelAdaptersToString(ls), bucketCount, bucketLimit) } @@ -285,6 +292,7 @@ func validateSampleHistogram(m *sampleValidationMetrics, now model.Time, cfg sam for { bc, err := s.ReduceResolution() if err != nil { + cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(ls), 1, reasonMaxNativeHistogramBuckets, now.Time()) m.maxNativeHistogramBuckets.WithLabelValues(userID, group).Inc() return false, fmt.Errorf(notReducibleNativeHistogramMsgFormat, s.Timestamp, mimirpb.FromLabelAdaptersToString(ls), bucketCount, bucketLimit) } @@ -386,14 +394,16 @@ func removeNonASCIIChars(in string) (out string) { // validateLabels returns an err if the labels are invalid. // The returned error may retain the provided series labels. -func validateLabels(m *sampleValidationMetrics, cfg labelValidationConfig, userID, group string, ls []mimirpb.LabelAdapter, skipLabelValidation, skipLabelCountValidation bool) error { +func validateLabels(m *sampleValidationMetrics, cfg labelValidationConfig, userID, group string, ls []mimirpb.LabelAdapter, skipLabelValidation, skipLabelCountValidation bool, cat *costattribution.Tracker, ts time.Time) error { unsafeMetricName, err := extract.UnsafeMetricNameFromLabelAdapters(ls) if err != nil { + cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(ls), 1, reasonMissingMetricName, ts) m.missingMetricName.WithLabelValues(userID, group).Inc() return errors.New(noMetricNameMsgFormat) } if !model.IsValidMetricName(model.LabelValue(unsafeMetricName)) { + cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(ls), 1, reasonInvalidMetricName, ts) m.invalidMetricName.WithLabelValues(userID, group).Inc() return fmt.Errorf(invalidMetricNameMsgFormat, removeNonASCIIChars(unsafeMetricName)) } @@ -402,11 +412,13 @@ func validateLabels(m *sampleValidationMetrics, cfg labelValidationConfig, userI if strings.HasSuffix(unsafeMetricName, "_info") { if len(ls) > cfg.MaxLabelNamesPerInfoSeries(userID) { m.maxLabelNamesPerInfoSeries.WithLabelValues(userID, group).Inc() + cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(ls), 1, reasonMaxLabelNamesPerInfoSeries, ts) metric, ellipsis := getMetricAndEllipsis(ls) return fmt.Errorf(tooManyInfoLabelsMsgFormat, len(ls), cfg.MaxLabelNamesPerInfoSeries(userID), metric, ellipsis) } } else { m.maxLabelNamesPerSeries.WithLabelValues(userID, group).Inc() + cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(ls), 1, reasonMaxLabelNamesPerSeries, ts) metric, ellipsis := getMetricAndEllipsis(ls) return fmt.Errorf(tooManyLabelsMsgFormat, len(ls), cfg.MaxLabelNamesPerSeries(userID), metric, ellipsis) } @@ -418,17 +430,21 @@ func validateLabels(m *sampleValidationMetrics, cfg labelValidationConfig, userI for _, l := range ls { if !skipLabelValidation && !model.LabelName(l.Name).IsValid() { m.invalidLabel.WithLabelValues(userID, group).Inc() + cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(ls), 1, reasonInvalidLabel, ts) return fmt.Errorf(invalidLabelMsgFormat, l.Name, mimirpb.FromLabelAdaptersToString(ls)) } else if len(l.Name) > maxLabelNameLength { + cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(ls), 1, reasonLabelNameTooLong, ts) m.labelNameTooLong.WithLabelValues(userID, group).Inc() return fmt.Errorf(labelNameTooLongMsgFormat, l.Name, mimirpb.FromLabelAdaptersToString(ls)) } else if !skipLabelValidation && !model.LabelValue(l.Value).IsValid() { m.invalidLabelValue.WithLabelValues(userID, group).Inc() 
return fmt.Errorf(invalidLabelValueMsgFormat, l.Name, strings.ToValidUTF8(l.Value, ""), unsafeMetricName) } else if len(l.Value) > maxLabelValueLength { + cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(ls), 1, reasonLabelValueTooLong, ts) m.labelValueTooLong.WithLabelValues(userID, group).Inc() return fmt.Errorf(labelValueTooLongMsgFormat, l.Name, l.Value, mimirpb.FromLabelAdaptersToString(ls)) } else if lastLabelName == l.Name { + cat.IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(ls), 1, reasonDuplicateLabelNames, ts) m.duplicateLabelNames.WithLabelValues(userID, group).Inc() return fmt.Errorf(duplicateLabelMsgFormat, l.Name, mimirpb.FromLabelAdaptersToString(ls)) } diff --git a/pkg/distributor/validate_test.go b/pkg/distributor/validate_test.go index df4de2dd60f..c84ed0b58a8 100644 --- a/pkg/distributor/validate_test.go +++ b/pkg/distributor/validate_test.go @@ -60,6 +60,7 @@ func (vm validateMetadataCfg) MaxMetadataLength(_ string) int { } func TestValidateLabels(t *testing.T) { + ts := time.Now() reg := prometheus.NewPedanticRegistry() s := newSampleValidationMetrics(reg) @@ -222,7 +223,7 @@ func TestValidateLabels(t *testing.T) { err: nil, }, } { - err := validateLabels(s, cfg, userID, "custom label", mimirpb.FromMetricsToLabelAdapters(c.metric), c.skipLabelNameValidation, c.skipLabelCountValidation) + err := validateLabels(s, cfg, userID, "custom label", mimirpb.FromMetricsToLabelAdapters(c.metric), c.skipLabelNameValidation, c.skipLabelCountValidation, nil, ts) assert.Equal(t, c.err, err, "wrong error") } @@ -416,17 +417,17 @@ func TestValidateMetadata(t *testing.T) { } func TestValidateLabelDuplication(t *testing.T) { + ts := time.Now() var cfg validateLabelsCfg cfg.maxLabelNameLength = 10 cfg.maxLabelNamesPerSeries = 10 cfg.maxLabelValueLength = 10 userID := "testUser" - actual := validateLabels(newSampleValidationMetrics(nil), cfg, userID, "", []mimirpb.LabelAdapter{ {Name: model.MetricNameLabel, Value: "a"}, {Name: model.MetricNameLabel, Value: "b"}, - }, false, false) + }, false, false, nil, ts) expected := fmt.Errorf( duplicateLabelMsgFormat, model.MetricNameLabel, @@ -443,7 +444,7 @@ func TestValidateLabelDuplication(t *testing.T) { {Name: model.MetricNameLabel, Value: "a"}, {Name: "a", Value: "a"}, {Name: "a", Value: "a"}, - }, false, false) + }, false, false, nil, ts) expected = fmt.Errorf( duplicateLabelMsgFormat, "a", @@ -594,7 +595,6 @@ func TestMaxNativeHistorgramBuckets(t *testing.T) { registry := prometheus.NewRegistry() metrics := newSampleValidationMetrics(registry) - for _, limit := range []int{0, 1, 2} { for name, h := range testCases { t.Run(fmt.Sprintf("limit-%d-%s", limit, name), func(t *testing.T) { @@ -602,7 +602,7 @@ func TestMaxNativeHistorgramBuckets(t *testing.T) { cfg.maxNativeHistogramBuckets = limit ls := []mimirpb.LabelAdapter{{Name: model.MetricNameLabel, Value: "a"}, {Name: "a", Value: "a"}} - _, err := validateSampleHistogram(metrics, model.Now(), cfg, "user-1", "group-1", ls, &h) + _, err := validateSampleHistogram(metrics, model.Now(), cfg, "user-1", "group-1", ls, &h, nil) if limit == 1 { require.Error(t, err) @@ -649,7 +649,7 @@ func TestInvalidNativeHistogramSchema(t *testing.T) { for testName, testCase := range testCases { t.Run(testName, func(t *testing.T) { hist.Schema = testCase.schema - _, err := validateSampleHistogram(metrics, model.Now(), cfg, "user-1", "group-1", labels, hist) + _, err := validateSampleHistogram(metrics, model.Now(), cfg, "user-1", "group-1", labels, hist, nil) require.Equal(t, 
testCase.expectedError, err) }) } diff --git a/pkg/ingester/activeseries/active_labels_test.go b/pkg/ingester/activeseries/active_labels_test.go index aa7f928d7dd..6fdf3e00bc4 100644 --- a/pkg/ingester/activeseries/active_labels_test.go +++ b/pkg/ingester/activeseries/active_labels_test.go @@ -41,7 +41,7 @@ func TestIsLabelValueActive(t *testing.T) { labels.FromStrings("a", "5"), } allStorageRefs := []storage.SeriesRef{1, 2, 3, 4, 5} - activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl)) + activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl), nil) memPostings := index.NewMemPostings() for i, l := range series { @@ -51,10 +51,10 @@ func TestIsLabelValueActive(t *testing.T) { // Update each series at a different time according to its index. for i := range allStorageRefs { - activeSeries.UpdateSeries(series[i], allStorageRefs[i], time.Unix(int64(i), 0), -1) + activeSeries.UpdateSeries(series[i], allStorageRefs[i], time.Unix(int64(i), 0), -1, nil) } - valid := activeSeries.Purge(mockedTime) + valid := activeSeries.Purge(mockedTime, nil) require.True(t, valid) result, err := IsLabelValueActive(ctx, reader, activeSeries, "a", "1") diff --git a/pkg/ingester/activeseries/active_native_histogram_postings_test.go b/pkg/ingester/activeseries/active_native_histogram_postings_test.go index 665f5787c61..2b95020c68d 100644 --- a/pkg/ingester/activeseries/active_native_histogram_postings_test.go +++ b/pkg/ingester/activeseries/active_native_histogram_postings_test.go @@ -26,7 +26,7 @@ func TestNativeHistogramPostings_Expand(t *testing.T) { } allStorageRefs := []storage.SeriesRef{1, 2, 3, 4, 5} storagePostings := index.NewListPostings(allStorageRefs) - activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl)) + activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl), nil) // Update each series at a different time according to its index. for i := range allStorageRefs { @@ -34,10 +34,10 @@ func TestNativeHistogramPostings_Expand(t *testing.T) { if i+1 == 3 || i+1 == 4 { buckets = 10 // Native histogram with 10 buckets. } - activeSeries.UpdateSeries(series[i], allStorageRefs[i], time.Unix(int64(i), 0), buckets) + activeSeries.UpdateSeries(series[i], allStorageRefs[i], time.Unix(int64(i), 0), buckets, nil) } - valid := activeSeries.Purge(mockedTime) + valid := activeSeries.Purge(mockedTime, nil) allActive, _, _, _, _, _ := activeSeries.ActiveWithMatchers() require.True(t, valid) require.Equal(t, 2, allActive) @@ -62,7 +62,7 @@ func TestNativeHistogramPostings_ExpandWithBucketCount(t *testing.T) { } allStorageRefs := []storage.SeriesRef{1, 2, 3, 4, 5} storagePostings := index.NewListPostings(allStorageRefs) - activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl)) + activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl), nil) // Update each series at a different time according to its index. for i := range allStorageRefs { @@ -70,10 +70,10 @@ func TestNativeHistogramPostings_ExpandWithBucketCount(t *testing.T) { if i == 2 || i == 3 { buckets = i * 10 // Native histogram with i*10 buckets. 
} - activeSeries.UpdateSeries(series[i], allStorageRefs[i], time.Unix(int64(i), 0), buckets) + activeSeries.UpdateSeries(series[i], allStorageRefs[i], time.Unix(int64(i), 0), buckets, nil) } - valid := activeSeries.Purge(mockedTime) + valid := activeSeries.Purge(mockedTime, nil) allActive, _, _, _, _, _ := activeSeries.ActiveWithMatchers() require.True(t, valid) require.Equal(t, 5, allActive) @@ -106,17 +106,18 @@ func TestNativeHistogramPostings_SeekSkipsNonNative(t *testing.T) { } allStorageRefs := []storage.SeriesRef{1, 2, 3, 4, 5} storagePostings := index.NewListPostings(allStorageRefs) - activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl)) + activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl), nil) + // Update each series at a different time according to its index. for i := range allStorageRefs { buckets := i * 10 if i+1 == 4 { buckets = -1 // Make ref==4 not a native histogram to check that Seek skips it. } - activeSeries.UpdateSeries(series[i], allStorageRefs[i], time.Unix(int64(i), 0), buckets) + activeSeries.UpdateSeries(series[i], allStorageRefs[i], time.Unix(int64(i), 0), buckets, nil) } - valid := activeSeries.Purge(mockedTime) + valid := activeSeries.Purge(mockedTime, nil) allActive, _, _, _, _, _ := activeSeries.ActiveWithMatchers() require.True(t, valid) require.Equal(t, 2, allActive) @@ -145,14 +146,15 @@ func TestNativeHistogramPostings_Seek(t *testing.T) { } allStorageRefs := []storage.SeriesRef{1, 2, 3, 4, 5} storagePostings := index.NewListPostings(allStorageRefs) - activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl)) + activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl), nil) + // Update each series at a different time according to its index. for i := range allStorageRefs { buckets := i * 10 - activeSeries.UpdateSeries(series[i], allStorageRefs[i], time.Unix(int64(i), 0), buckets) + activeSeries.UpdateSeries(series[i], allStorageRefs[i], time.Unix(int64(i), 0), buckets, nil) } - valid := activeSeries.Purge(mockedTime) + valid := activeSeries.Purge(mockedTime, nil) allActive, _, _, _, _, _ := activeSeries.ActiveWithMatchers() require.True(t, valid) require.Equal(t, 2, allActive) @@ -181,14 +183,14 @@ func TestNativeHistogramPostings_SeekToEnd(t *testing.T) { } allStorageRefs := []storage.SeriesRef{1, 2, 3, 4, 5} storagePostings := index.NewListPostings(allStorageRefs) - activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl)) + activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl), nil) // Update each series at a different time according to its index. 
for i := range allStorageRefs { - activeSeries.UpdateSeries(series[i], allStorageRefs[i], time.Unix(int64(i), 0), 10) + activeSeries.UpdateSeries(series[i], allStorageRefs[i], time.Unix(int64(i), 0), 10, nil) } - valid := activeSeries.Purge(mockedTime) + valid := activeSeries.Purge(mockedTime, nil) allActive, _, _, _, _, _ := activeSeries.ActiveWithMatchers() require.True(t, valid) require.Equal(t, 0, allActive) diff --git a/pkg/ingester/activeseries/active_postings_test.go b/pkg/ingester/activeseries/active_postings_test.go index a2345841d11..84c71634e72 100644 --- a/pkg/ingester/activeseries/active_postings_test.go +++ b/pkg/ingester/activeseries/active_postings_test.go @@ -26,13 +26,14 @@ func TestPostings_Expand(t *testing.T) { } allStorageRefs := []storage.SeriesRef{1, 2, 3, 4, 5} storagePostings := index.NewListPostings(allStorageRefs) - activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl)) + activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl), nil) + // Update each series at a different time according to its index. for i := range allStorageRefs { - activeSeries.UpdateSeries(series[i], allStorageRefs[i], time.Unix(int64(i), 0), -1) + activeSeries.UpdateSeries(series[i], allStorageRefs[i], time.Unix(int64(i), 0), -1, nil) } - valid := activeSeries.Purge(mockedTime) + valid := activeSeries.Purge(mockedTime, nil) allActive, _, _, _, _, _ := activeSeries.ActiveWithMatchers() require.True(t, valid) require.Equal(t, 2, allActive) @@ -57,13 +58,14 @@ func TestPostings_Seek(t *testing.T) { } allStorageRefs := []storage.SeriesRef{1, 2, 3, 4, 5} storagePostings := index.NewListPostings(allStorageRefs) - activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl)) + activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl), nil) + // Update each series at a different time according to its index. for i := range allStorageRefs { - activeSeries.UpdateSeries(series[i], allStorageRefs[i], time.Unix(int64(i), 0), -1) + activeSeries.UpdateSeries(series[i], allStorageRefs[i], time.Unix(int64(i), 0), -1, nil) } - valid := activeSeries.Purge(mockedTime) + valid := activeSeries.Purge(mockedTime, nil) allActive, _, _, _, _, _ := activeSeries.ActiveWithMatchers() require.True(t, valid) require.Equal(t, 2, allActive) @@ -88,13 +90,14 @@ func TestPostings_SeekToEnd(t *testing.T) { } allStorageRefs := []storage.SeriesRef{1, 2, 3, 4, 5} storagePostings := index.NewListPostings(allStorageRefs) - activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl)) + activeSeries := NewActiveSeries(&asmodel.Matchers{}, time.Duration(ttl), nil) + // Update each series at a different time according to its index. 
for i := range allStorageRefs { - activeSeries.UpdateSeries(series[i], allStorageRefs[i], time.Unix(int64(i), 0), -1) + activeSeries.UpdateSeries(series[i], allStorageRefs[i], time.Unix(int64(i), 0), -1, nil) } - valid := activeSeries.Purge(mockedTime) + valid := activeSeries.Purge(mockedTime, nil) allActive, _, _, _, _, _ := activeSeries.ActiveWithMatchers() require.True(t, valid) require.Equal(t, 0, allActive) diff --git a/pkg/ingester/activeseries/active_series.go b/pkg/ingester/activeseries/active_series.go index 71044b5e348..e7895404a22 100644 --- a/pkg/ingester/activeseries/active_series.go +++ b/pkg/ingester/activeseries/active_series.go @@ -13,10 +13,12 @@ import ( "github.com/prometheus/prometheus/model/labels" "github.com/prometheus/prometheus/storage" + "github.com/prometheus/prometheus/tsdb" "github.com/prometheus/prometheus/tsdb/chunks" "github.com/prometheus/prometheus/util/zeropool" "go.uber.org/atomic" + "github.com/grafana/mimir/pkg/costattribution" asmodel "github.com/grafana/mimir/pkg/ingester/activeseries/model" ) @@ -44,10 +46,11 @@ type ActiveSeries struct { stripes [numStripes]seriesStripe deleted deletedSeries - // matchersMutex protects matchers and lastMatchersUpdate. - matchersMutex sync.RWMutex - matchers *asmodel.Matchers - lastMatchersUpdate time.Time + // configMutex protects matchers and lastConfigUpdate. It is used by both matchers and cat. + configMutex sync.RWMutex + matchers *asmodel.Matchers + cat *costattribution.Tracker + lastConfigUpdate time.Time // The duration after which series become inactive. // Also used to determine if enough time has passed since configuration reload for valid results. @@ -63,8 +66,8 @@ type seriesStripe struct { // Unix nanoseconds. Only used by purge. Zero = unknown. // Updated in purge and when old timestamp is used when updating series (in this case, oldestEntryTs is updated // without holding the lock -- hence the atomic). - oldestEntryTs atomic.Int64 - + oldestEntryTs atomic.Int64 + cat *costattribution.Tracker mu sync.RWMutex refs map[storage.SeriesRef]seriesEntry active uint32 // Number of active entries in this stripe. Only decreased during purge or clear. @@ -84,50 +87,61 @@ type seriesEntry struct { deleted bool // This series was marked as deleted, so before purging we need to remove the refence to it from the deletedSeries. } -func NewActiveSeries(asm *asmodel.Matchers, timeout time.Duration) *ActiveSeries { - c := &ActiveSeries{matchers: asm, timeout: timeout} +func NewActiveSeries( + asm *asmodel.Matchers, + timeout time.Duration, + cat *costattribution.Tracker, +) *ActiveSeries { + c := &ActiveSeries{ + matchers: asm, timeout: timeout, cat: cat, + } // Stripes are pre-allocated so that we only read on them and no lock is required.
for i := 0; i < numStripes; i++ { - c.stripes[i].reinitialize(asm, &c.deleted) + c.stripes[i].reinitialize(asm, &c.deleted, cat) } return c } func (c *ActiveSeries) CurrentMatcherNames() []string { - c.matchersMutex.RLock() - defer c.matchersMutex.RUnlock() + c.configMutex.RLock() + defer c.configMutex.RUnlock() return c.matchers.MatcherNames() } +func (c *ActiveSeries) ConfigDiffers(ctCfg asmodel.CustomTrackersConfig, caCfg *costattribution.Tracker) bool { + currentCTC, currentCAT := c.CurrentConfig() + // TODO: comparing the tracker pointers should be enough here; the tracker is only recreated when its configuration changes, otherwise nothing has changed. + return ctCfg.String() != currentCTC.String() || caCfg != currentCAT //|| !costattribution.CompareCALabels(caCfg.CALabels(), currentCAT.CALabels()) +} + func (c *ActiveSeries) ReloadMatchers(asm *asmodel.Matchers, now time.Time) { - c.matchersMutex.Lock() - defer c.matchersMutex.Unlock() + c.configMutex.Lock() + defer c.configMutex.Unlock() for i := 0; i < numStripes; i++ { - c.stripes[i].reinitialize(asm, &c.deleted) + c.stripes[i].reinitialize(asm, &c.deleted, c.cat) } c.matchers = asm - c.lastMatchersUpdate = now + c.lastConfigUpdate = now } -func (c *ActiveSeries) CurrentConfig() asmodel.CustomTrackersConfig { - c.matchersMutex.RLock() - defer c.matchersMutex.RUnlock() - return c.matchers.Config() +func (c *ActiveSeries) CurrentConfig() (asmodel.CustomTrackersConfig, *costattribution.Tracker) { + c.configMutex.RLock() + defer c.configMutex.RUnlock() + return c.matchers.Config(), c.cat } // UpdateSeries updates series timestamp to 'now'. Function is called to make a copy of labels if entry doesn't exist yet. // Pass -1 in numNativeHistogramBuckets if the series is not a native histogram series. -func (c *ActiveSeries) UpdateSeries(series labels.Labels, ref storage.SeriesRef, now time.Time, numNativeHistogramBuckets int) { +func (c *ActiveSeries) UpdateSeries(series labels.Labels, ref storage.SeriesRef, now time.Time, numNativeHistogramBuckets int, idx tsdb.IndexReader) { stripeID := ref % numStripes - created := c.stripes[stripeID].updateSeriesTimestamp(now, series, ref, numNativeHistogramBuckets) if created { if deleted, ok := c.deleted.find(series); ok { deletedStripeID := deleted.ref % numStripes - c.stripes[deletedStripeID].remove(deleted.ref) + c.stripes[deletedStripeID].remove(deleted.ref, idx) } } } @@ -149,19 +163,19 @@ func (c *ActiveSeries) PostDeletion(deleted map[chunks.HeadSeriesRef]labels.Labe // Purge purges expired entries and returns true if enough time has passed since // last reload. This should be called periodically to avoid unbounded memory // growth. -func (c *ActiveSeries) Purge(now time.Time) bool { - c.matchersMutex.Lock() - defer c.matchersMutex.Unlock() +func (c *ActiveSeries) Purge(now time.Time, idx tsdb.IndexReader) bool { + c.configMutex.Lock() + defer c.configMutex.Unlock() purgeTime := now.Add(-c.timeout) - c.purge(purgeTime) + c.purge(purgeTime, idx) - return !c.lastMatchersUpdate.After(purgeTime) + return !c.lastConfigUpdate.After(purgeTime) } // purge removes expired entries from the cache. -func (c *ActiveSeries) purge(keepUntil time.Time) { +func (c *ActiveSeries) purge(keepUntil time.Time, idx tsdb.IndexReader) { for s := 0; s < numStripes; s++ { - c.stripes[s].purge(keepUntil) + c.stripes[s].purge(keepUntil, idx) } } @@ -196,8 +210,8 @@ func (c *ActiveSeries) Active() (total, totalNativeHistograms, totalNativeHistog // of buckets in those active native histogram series.
This method does not purge // expired entries, so Purge should be called periodically. func (c *ActiveSeries) ActiveWithMatchers() (total int, totalMatching []int, totalNativeHistograms int, totalMatchingNativeHistograms []int, totalNativeHistogramBuckets int, totalMatchingNativeHistogramBuckets []int) { - c.matchersMutex.RLock() - defer c.matchersMutex.RUnlock() + c.configMutex.RLock() + defer c.configMutex.RUnlock() totalMatching = make([]int, len(c.matchers.MatcherNames())) totalMatchingNativeHistograms = make([]int, len(c.matchers.MatcherNames())) @@ -212,9 +226,9 @@ func (c *ActiveSeries) ActiveWithMatchers() (total int, totalMatching []int, tot return } -func (c *ActiveSeries) Delete(ref chunks.HeadSeriesRef) { +func (c *ActiveSeries) Delete(ref chunks.HeadSeriesRef, idx tsdb.IndexReader) { stripeID := storage.SeriesRef(ref) % numStripes - c.stripes[stripeID].remove(storage.SeriesRef(ref)) + c.stripes[stripeID].remove(storage.SeriesRef(ref), idx) } func (c *ActiveSeries) Clear() { @@ -394,6 +408,9 @@ func (s *seriesStripe) findAndUpdateOrCreateEntryForSeries(ref storage.SeriesRef numNativeHistogramBuckets: numNativeHistogramBuckets, } + // Here, if we have a cost attribution label, we can split the series count based on the value of the label. + // We also set a reference to the label value in the entry, so that on removal we can decrease the counter accordingly. + s.cat.IncrementActiveSeries(series, time.Unix(0, nowNanos)) s.refs[ref] = e return e.nanos, true } @@ -415,10 +432,13 @@ func (s *seriesStripe) clear() { } // Reinitialize assigns new matchers and corresponding size activeMatching slices. -func (s *seriesStripe) reinitialize(asm *asmodel.Matchers, deleted *deletedSeries) { +func (s *seriesStripe) reinitialize( + asm *asmodel.Matchers, + deleted *deletedSeries, + cat *costattribution.Tracker, +) { s.mu.Lock() defer s.mu.Unlock() - s.deleted = deleted s.oldestEntryTs.Store(0) s.refs = map[storage.SeriesRef]seriesEntry{} @@ -429,9 +449,10 @@ func (s *seriesStripe) reinitialize(asm *asmodel.Matchers, deleted *deletedSerie s.activeMatching = resizeAndClear(len(asm.MatcherNames()), s.activeMatching) s.activeMatchingNativeHistograms = resizeAndClear(len(asm.MatcherNames()), s.activeMatchingNativeHistograms) s.activeMatchingNativeHistogramBuckets = resizeAndClear(len(asm.MatcherNames()), s.activeMatchingNativeHistogramBuckets) + s.cat = cat } -func (s *seriesStripe) purge(keepUntil time.Time) { +func (s *seriesStripe) purge(keepUntil time.Time, idx tsdb.IndexReader) { keepUntilNanos := keepUntil.UnixNano() if oldest := s.oldestEntryTs.Load(); oldest > 0 && keepUntilNanos <= oldest { // Nothing to do. @@ -449,12 +470,21 @@ func (s *seriesStripe) purge(keepUntil time.Time) { s.activeMatchingNativeHistogramBuckets = resizeAndClear(len(s.activeMatchingNativeHistogramBuckets), s.activeMatchingNativeHistogramBuckets) oldest := int64(math.MaxInt64) + buf := labels.NewScratchBuilder(128) for ref, entry := range s.refs { ts := entry.nanos.Load() if ts < keepUntilNanos { if entry.deleted { s.deleted.purge(ref) } + + if idx != nil { + if err := idx.Series(ref, &buf, nil); err != nil { + //TODO: think about what to do here + _ = err + } + s.cat.DecrementActiveSeries(buf.Labels(), keepUntil) + } delete(s.refs, ref) continue } @@ -489,7 +519,7 @@ func (s *seriesStripe) purge(keepUntil time.Time) { // This is mostly the same logic from purge() but we decrement counters for a single entry instead of incrementing for each entry.
// Note: we might remove the oldest series here, but the worst thing can happen is that we let run a useless purge() cycle later, // so this method doesn't update the oldestEntryTs. -func (s *seriesStripe) remove(ref storage.SeriesRef) { +func (s *seriesStripe) remove(ref storage.SeriesRef, idx tsdb.IndexReader) { s.mu.Lock() defer s.mu.Unlock() @@ -502,6 +532,14 @@ func (s *seriesStripe) remove(ref storage.SeriesRef) { } s.active-- + if idx != nil { + buf := labels.NewScratchBuilder(10) + if err := idx.Series(ref, &buf, nil); err != nil { + //TODO: think about what to do here + _ = err + } + s.cat.DecrementActiveSeries(buf.Labels(), time.Now()) + } if entry.numNativeHistogramBuckets >= 0 { s.activeNativeHistograms-- s.activeNativeHistogramBuckets -= uint32(entry.numNativeHistogramBuckets) diff --git a/pkg/ingester/activeseries/active_series_test.go b/pkg/ingester/activeseries/active_series_test.go index cf821c5bca5..ca36450f823 100644 --- a/pkg/ingester/activeseries/active_series_test.go +++ b/pkg/ingester/activeseries/active_series_test.go @@ -37,10 +37,8 @@ func TestActiveSeries_UpdateSeries_NoMatchers(t *testing.T) { ref3, ls3 := storage.SeriesRef(3), labels.FromStrings("a", "3") ref4, ls4 := storage.SeriesRef(4), labels.FromStrings("a", "4") ref5 := storage.SeriesRef(5) // will be used for ls1 again. - - c := NewActiveSeries(&asmodel.Matchers{}, DefaultTimeout) - - valid := c.Purge(time.Now()) + c := NewActiveSeries(&asmodel.Matchers{}, DefaultTimeout, nil) + valid := c.Purge(time.Now(), nil) assert.True(t, valid) allActive, activeMatching, allActiveHistograms, activeMatchingHistograms, allActiveBuckets, activeMatchingBuckets := c.ActiveWithMatchers() assert.Equal(t, 0, allActive) @@ -50,8 +48,8 @@ func TestActiveSeries_UpdateSeries_NoMatchers(t *testing.T) { assert.Equal(t, 0, allActiveBuckets) assert.Empty(t, activeMatchingBuckets) - c.UpdateSeries(ls1, ref1, time.Now(), -1) - valid = c.Purge(time.Now()) + c.UpdateSeries(ls1, ref1, time.Now(), -1, nil) + valid = c.Purge(time.Now(), nil) assert.True(t, valid) allActive, _, allActiveHistograms, _, allActiveBuckets, _ = c.ActiveWithMatchers() assert.Equal(t, 1, allActive) @@ -62,8 +60,8 @@ func TestActiveSeries_UpdateSeries_NoMatchers(t *testing.T) { assert.Equal(t, 0, allActiveHistograms) assert.Equal(t, 0, allActiveBuckets) - c.UpdateSeries(ls1, ref1, time.Now(), -1) - valid = c.Purge(time.Now()) + c.UpdateSeries(ls1, ref1, time.Now(), -1, nil) + valid = c.Purge(time.Now(), nil) assert.True(t, valid) allActive, _, allActiveHistograms, _, allActiveBuckets, _ = c.ActiveWithMatchers() assert.Equal(t, 1, allActive) @@ -74,8 +72,8 @@ func TestActiveSeries_UpdateSeries_NoMatchers(t *testing.T) { assert.Equal(t, 0, allActiveHistograms) assert.Equal(t, 0, allActiveBuckets) - c.UpdateSeries(ls2, ref2, time.Now(), -1) - valid = c.Purge(time.Now()) + c.UpdateSeries(ls2, ref2, time.Now(), -1, nil) + valid = c.Purge(time.Now(), nil) assert.True(t, valid) allActive, _, allActiveHistograms, _, allActiveBuckets, _ = c.ActiveWithMatchers() assert.Equal(t, 2, allActive) @@ -86,8 +84,8 @@ func TestActiveSeries_UpdateSeries_NoMatchers(t *testing.T) { assert.Equal(t, 0, allActiveHistograms) assert.Equal(t, 0, allActiveBuckets) - c.UpdateSeries(ls3, ref3, time.Now(), 5) - valid = c.Purge(time.Now()) + c.UpdateSeries(ls3, ref3, time.Now(), 5, nil) + valid = c.Purge(time.Now(), nil) assert.True(t, valid) allActive, _, allActiveHistograms, _, allActiveBuckets, _ = c.ActiveWithMatchers() assert.Equal(t, 3, allActive) @@ -98,8 +96,8 @@ func 
TestActiveSeries_UpdateSeries_NoMatchers(t *testing.T) { assert.Equal(t, 1, allActiveHistograms) assert.Equal(t, 5, allActiveBuckets) - c.UpdateSeries(ls4, ref4, time.Now(), 3) - valid = c.Purge(time.Now()) + c.UpdateSeries(ls4, ref4, time.Now(), 3, nil) + valid = c.Purge(time.Now(), nil) assert.True(t, valid) allActive, _, allActiveHistograms, _, allActiveBuckets, _ = c.ActiveWithMatchers() assert.Equal(t, 4, allActive) @@ -111,8 +109,8 @@ func TestActiveSeries_UpdateSeries_NoMatchers(t *testing.T) { assert.Equal(t, 8, allActiveBuckets) // more buckets for a histogram - c.UpdateSeries(ls3, ref3, time.Now(), 7) - valid = c.Purge(time.Now()) + c.UpdateSeries(ls3, ref3, time.Now(), 7, nil) + valid = c.Purge(time.Now(), nil) assert.True(t, valid) allActive, _, allActiveHistograms, _, allActiveBuckets, _ = c.ActiveWithMatchers() assert.Equal(t, 4, allActive) @@ -124,8 +122,8 @@ func TestActiveSeries_UpdateSeries_NoMatchers(t *testing.T) { assert.Equal(t, 10, allActiveBuckets) // changing a metric from histogram to float - c.UpdateSeries(ls4, ref4, time.Now(), -1) - valid = c.Purge(time.Now()) + c.UpdateSeries(ls4, ref4, time.Now(), -1, nil) + valid = c.Purge(time.Now(), nil) assert.True(t, valid) allActive, _, allActiveHistograms, _, allActiveBuckets, _ = c.ActiveWithMatchers() assert.Equal(t, 4, allActive) @@ -150,7 +148,7 @@ func TestActiveSeries_UpdateSeries_NoMatchers(t *testing.T) { assert.Equal(t, 7, allActiveBuckets) // Doesn't change after purging. - valid = c.Purge(time.Now()) + valid = c.Purge(time.Now(), nil) assert.True(t, valid) allActive, _, allActiveHistograms, _, allActiveBuckets, _ = c.ActiveWithMatchers() assert.Equal(t, 4, allActive) @@ -162,7 +160,7 @@ func TestActiveSeries_UpdateSeries_NoMatchers(t *testing.T) { assert.Equal(t, 7, allActiveBuckets) // ref5 is created with the same labelset as ls1, it shouldn't be accounted as different series. - c.UpdateSeries(ls1, ref5, time.Now(), -1) + c.UpdateSeries(ls1, ref5, time.Now(), -1, nil) allActive, _, allActiveHistograms, _, allActiveBuckets, _ = c.ActiveWithMatchers() assert.Equal(t, 4, allActive) assert.Equal(t, 1, allActiveHistograms) @@ -173,7 +171,7 @@ func TestActiveSeries_UpdateSeries_NoMatchers(t *testing.T) { assert.Equal(t, 7, allActiveBuckets) // Doesn't change after purging. 
- valid = c.Purge(time.Now()) + valid = c.Purge(time.Now(), nil) assert.True(t, valid) allActive, _, allActiveHistograms, _, allActiveBuckets, _ = c.ActiveWithMatchers() assert.Equal(t, 4, allActive) @@ -204,19 +202,19 @@ func TestActiveSeries_ContainsRef(t *testing.T) { for ttl := 1; ttl <= len(series); ttl++ { t.Run(fmt.Sprintf("ttl: %d", ttl), func(t *testing.T) { mockedTime := time.Unix(int64(ttl), 0) - c := NewActiveSeries(&asmodel.Matchers{}, DefaultTimeout) + c := NewActiveSeries(&asmodel.Matchers{}, DefaultTimeout, nil) // Update each series with a different timestamp according to each index for i := 0; i < len(series); i++ { - c.UpdateSeries(series[i], refs[i], time.Unix(int64(i), 0), -1) + c.UpdateSeries(series[i], refs[i], time.Unix(int64(i), 0), -1, nil) } - c.purge(time.Unix(int64(ttl), 0)) + c.purge(time.Unix(int64(ttl), 0), nil) // The expected number of series is the total number of series minus the ttl // because the first ttl series should be purged exp := len(series) - (ttl) - valid := c.Purge(mockedTime) + valid := c.Purge(mockedTime, nil) assert.True(t, valid) allActive, activeMatching, _, _, _, _ := c.ActiveWithMatchers() assert.Equal(t, exp, allActive) @@ -231,7 +229,7 @@ func TestActiveSeries_ContainsRef(t *testing.T) { func TestActiveSeries_UpdateSeries_WithMatchers(t *testing.T) { asm := asmodel.NewMatchers(MustNewCustomTrackersConfigFromMap(t, map[string]string{"foo": `{a=~"2|3|4"}`})) - c := NewActiveSeries(asm, DefaultTimeout) + c := NewActiveSeries(asm, DefaultTimeout, nil) testUpdateSeries(t, c) } @@ -243,7 +241,7 @@ func testUpdateSeries(t *testing.T, c *ActiveSeries) { ref5, ls5 := storage.SeriesRef(5), labels.FromStrings("a", "5") ref6 := storage.SeriesRef(6) // same as ls2 - valid := c.Purge(time.Now()) + valid := c.Purge(time.Now(), nil) assert.True(t, valid) allActive, activeMatching, allActiveHistograms, activeMatchingHistograms, allActiveBuckets, activeMatchingBuckets := c.ActiveWithMatchers() assert.Equal(t, 0, allActive) @@ -257,8 +255,8 @@ func testUpdateSeries(t *testing.T, c *ActiveSeries) { assert.Equal(t, 0, allActiveHistograms) assert.Equal(t, 0, allActiveBuckets) - c.UpdateSeries(ls1, ref1, time.Now(), -1) - valid = c.Purge(time.Now()) + c.UpdateSeries(ls1, ref1, time.Now(), -1, nil) + valid = c.Purge(time.Now(), nil) assert.True(t, valid) allActive, activeMatching, allActiveHistograms, activeMatchingHistograms, allActiveBuckets, activeMatchingBuckets = c.ActiveWithMatchers() assert.Equal(t, 1, allActive) @@ -272,8 +270,8 @@ func testUpdateSeries(t *testing.T, c *ActiveSeries) { assert.Equal(t, 0, allActiveHistograms) assert.Equal(t, 0, allActiveBuckets) - c.UpdateSeries(ls2, ref2, time.Now(), -1) - valid = c.Purge(time.Now()) + c.UpdateSeries(ls2, ref2, time.Now(), -1, nil) + valid = c.Purge(time.Now(), nil) assert.True(t, valid) allActive, activeMatching, allActiveHistograms, activeMatchingHistograms, allActiveBuckets, activeMatchingBuckets = c.ActiveWithMatchers() assert.Equal(t, 2, allActive) @@ -287,8 +285,8 @@ func testUpdateSeries(t *testing.T, c *ActiveSeries) { assert.Equal(t, 0, allActiveHistograms) assert.Equal(t, 0, allActiveBuckets) - c.UpdateSeries(ls3, ref3, time.Now(), -1) - valid = c.Purge(time.Now()) + c.UpdateSeries(ls3, ref3, time.Now(), -1, nil) + valid = c.Purge(time.Now(), nil) assert.True(t, valid) allActive, activeMatching, allActiveHistograms, activeMatchingHistograms, allActiveBuckets, activeMatchingBuckets = c.ActiveWithMatchers() assert.Equal(t, 3, allActive) @@ -302,8 +300,8 @@ func testUpdateSeries(t *testing.T, 
c *ActiveSeries) { assert.Equal(t, 0, allActiveHistograms) assert.Equal(t, 0, allActiveBuckets) - c.UpdateSeries(ls3, ref3, time.Now(), -1) - valid = c.Purge(time.Now()) + c.UpdateSeries(ls3, ref3, time.Now(), -1, nil) + valid = c.Purge(time.Now(), nil) assert.True(t, valid) allActive, activeMatching, allActiveHistograms, activeMatchingHistograms, allActiveBuckets, activeMatchingBuckets = c.ActiveWithMatchers() assert.Equal(t, 3, allActive) @@ -317,8 +315,8 @@ func testUpdateSeries(t *testing.T, c *ActiveSeries) { assert.Equal(t, 0, allActiveHistograms) assert.Equal(t, 0, allActiveBuckets) - c.UpdateSeries(ls4, ref4, time.Now(), 3) - valid = c.Purge(time.Now()) + c.UpdateSeries(ls4, ref4, time.Now(), 3, nil) + valid = c.Purge(time.Now(), nil) assert.True(t, valid) allActive, activeMatching, allActiveHistograms, activeMatchingHistograms, allActiveBuckets, activeMatchingBuckets = c.ActiveWithMatchers() assert.Equal(t, 4, allActive) @@ -332,8 +330,8 @@ func testUpdateSeries(t *testing.T, c *ActiveSeries) { assert.Equal(t, 1, allActiveHistograms) assert.Equal(t, 3, allActiveBuckets) - c.UpdateSeries(ls5, ref5, time.Now(), 5) - valid = c.Purge(time.Now()) + c.UpdateSeries(ls5, ref5, time.Now(), 5, nil) + valid = c.Purge(time.Now(), nil) assert.True(t, valid) allActive, activeMatching, allActiveHistograms, activeMatchingHistograms, allActiveBuckets, activeMatchingBuckets = c.ActiveWithMatchers() assert.Equal(t, 5, allActive) @@ -348,8 +346,8 @@ func testUpdateSeries(t *testing.T, c *ActiveSeries) { assert.Equal(t, 8, allActiveBuckets) // changing a metric from float to histogram - c.UpdateSeries(ls3, ref3, time.Now(), 6) - valid = c.Purge(time.Now()) + c.UpdateSeries(ls3, ref3, time.Now(), 6, nil) + valid = c.Purge(time.Now(), nil) assert.True(t, valid) allActive, activeMatching, allActiveHistograms, activeMatchingHistograms, allActiveBuckets, activeMatchingBuckets = c.ActiveWithMatchers() assert.Equal(t, 5, allActive) @@ -364,8 +362,8 @@ func testUpdateSeries(t *testing.T, c *ActiveSeries) { assert.Equal(t, 14, allActiveBuckets) // fewer (zero) buckets for a histogram - c.UpdateSeries(ls4, ref4, time.Now(), 0) - valid = c.Purge(time.Now()) + c.UpdateSeries(ls4, ref4, time.Now(), 0, nil) + valid = c.Purge(time.Now(), nil) assert.True(t, valid) allActive, activeMatching, allActiveHistograms, activeMatchingHistograms, allActiveBuckets, activeMatchingBuckets = c.ActiveWithMatchers() assert.Equal(t, 5, allActive) @@ -397,7 +395,7 @@ func testUpdateSeries(t *testing.T, c *ActiveSeries) { assert.Equal(t, 11, allActiveBuckets) // Don't change after purging. - valid = c.Purge(time.Now()) + valid = c.Purge(time.Now(), nil) assert.True(t, valid) allActive, activeMatching, allActiveHistograms, activeMatchingHistograms, allActiveBuckets, activeMatchingBuckets = c.ActiveWithMatchers() assert.Equal(t, 5, allActive) @@ -412,7 +410,7 @@ func testUpdateSeries(t *testing.T, c *ActiveSeries) { assert.Equal(t, 11, allActiveBuckets) // ls2 is pushed again, this time with ref6 - c.UpdateSeries(ls2, ref6, time.Now(), -1) + c.UpdateSeries(ls2, ref6, time.Now(), -1, nil) // Numbers don't change. allActive, activeMatching, allActiveHistograms, activeMatchingHistograms, allActiveBuckets, activeMatchingBuckets = c.ActiveWithMatchers() assert.Equal(t, 5, allActive) @@ -427,7 +425,7 @@ func testUpdateSeries(t *testing.T, c *ActiveSeries) { assert.Equal(t, 11, allActiveBuckets) // Don't change after purging. 
- valid = c.Purge(time.Now()) + valid = c.Purge(time.Now(), nil) assert.True(t, valid) allActive, activeMatching, allActiveHistograms, activeMatchingHistograms, allActiveBuckets, activeMatchingBuckets = c.ActiveWithMatchers() assert.Equal(t, 5, allActive) @@ -448,7 +446,7 @@ func testUpdateSeries(t *testing.T, c *ActiveSeries) { func TestActiveSeries_UpdateSeries_Clear(t *testing.T) { asm := asmodel.NewMatchers(MustNewCustomTrackersConfigFromMap(t, map[string]string{"foo": `{a=~"2|3|4"}`})) - c := NewActiveSeries(asm, DefaultTimeout) + c := NewActiveSeries(asm, DefaultTimeout, nil) testUpdateSeries(t, c) c.Clear() @@ -488,12 +486,11 @@ func labelsWithHashCollision() (labels.Labels, labels.Labels) { func TestActiveSeries_ShouldCorrectlyHandleHashCollisions(t *testing.T) { ls1, ls2 := labelsWithHashCollision() ref1, ref2 := storage.SeriesRef(1), storage.SeriesRef(2) + c := NewActiveSeries(&asmodel.Matchers{}, DefaultTimeout, nil) + c.UpdateSeries(ls1, ref1, time.Now(), -1, nil) + c.UpdateSeries(ls2, ref2, time.Now(), -1, nil) - c := NewActiveSeries(&asmodel.Matchers{}, DefaultTimeout) - c.UpdateSeries(ls1, ref1, time.Now(), -1) - c.UpdateSeries(ls2, ref2, time.Now(), -1) - - valid := c.Purge(time.Now()) + valid := c.Purge(time.Now(), nil) assert.True(t, valid) allActive, _, _, _, _, _ := c.ActiveWithMatchers() assert.Equal(t, 2, allActive) @@ -517,22 +514,22 @@ func TestActiveSeries_Purge_NoMatchers(t *testing.T) { for ttl := 1; ttl <= len(series); ttl++ { t.Run(fmt.Sprintf("ttl: %d", ttl), func(t *testing.T) { mockedTime := time.Unix(int64(ttl), 0) - c := NewActiveSeries(&asmodel.Matchers{}, DefaultTimeout) + c := NewActiveSeries(&asmodel.Matchers{}, DefaultTimeout, nil) for i := 0; i < len(series); i++ { - c.UpdateSeries(series[i], refs[i], time.Unix(int64(i), 0), -1) + c.UpdateSeries(series[i], refs[i], time.Unix(int64(i), 0), -1, nil) } c.PostDeletion(map[chunks.HeadSeriesRef]labels.Labels{ deletedRef: deletedLabels, }) - c.purge(time.Unix(int64(ttl), 0)) + c.purge(time.Unix(int64(ttl), 0), nil) // call purge twice, just to hit "quick" path. It doesn't really do anything. - c.purge(time.Unix(int64(ttl), 0)) + c.purge(time.Unix(int64(ttl), 0), nil) exp := len(series) - (ttl) // Purge is not intended to purge - valid := c.Purge(mockedTime) + valid := c.Purge(mockedTime, nil) assert.True(t, valid) allActive, activeMatching, _, _, _, _ := c.ActiveWithMatchers() assert.Equal(t, exp, allActive) @@ -563,13 +560,13 @@ func TestActiveSeries_Purge_WithMatchers(t *testing.T) { t.Run(fmt.Sprintf("ttl=%d", ttl), func(t *testing.T) { mockedTime := time.Unix(int64(ttl), 0) - c := NewActiveSeries(asm, 5*time.Minute) + c := NewActiveSeries(asm, 5*time.Minute, nil) exp := len(series) - ttl expMatchingSeries := 0 for i, s := range series { - c.UpdateSeries(series[i], refs[i], time.Unix(int64(i), 0), -1) + c.UpdateSeries(series[i], refs[i], time.Unix(int64(i), 0), -1, nil) // if this series is matching, and they're within the ttl tmp := asm.Matches(s) @@ -578,11 +575,11 @@ func TestActiveSeries_Purge_WithMatchers(t *testing.T) { } } - c.purge(time.Unix(int64(ttl), 0)) + c.purge(time.Unix(int64(ttl), 0), nil) // call purge twice, just to hit "quick" path. It doesn't really do anything. 
- c.purge(time.Unix(int64(ttl), 0)) + c.purge(time.Unix(int64(ttl), 0), nil) - valid := c.Purge(mockedTime) + valid := c.Purge(mockedTime, nil) assert.True(t, valid) allActive, activeMatching, _, _, _, _ := c.ActiveWithMatchers() assert.Equal(t, exp, allActive) @@ -596,28 +593,28 @@ func TestActiveSeries_PurgeOpt(t *testing.T) { ref1, ref2 := storage.SeriesRef(1), storage.SeriesRef(2) currentTime := time.Now() - c := NewActiveSeries(&asmodel.Matchers{}, 59*time.Second) + c := NewActiveSeries(&asmodel.Matchers{}, 59*time.Second, nil) - c.UpdateSeries(ls1, ref1, currentTime.Add(-2*time.Minute), -1) - c.UpdateSeries(ls2, ref2, currentTime, -1) + c.UpdateSeries(ls1, ref1, currentTime.Add(-2*time.Minute), -1, nil) + c.UpdateSeries(ls2, ref2, currentTime, -1, nil) - valid := c.Purge(currentTime) + valid := c.Purge(currentTime, nil) assert.True(t, valid) allActive, _, _, _, _, _ := c.ActiveWithMatchers() assert.Equal(t, 1, allActive) - c.UpdateSeries(ls1, ref1, currentTime.Add(-1*time.Minute), -1) - c.UpdateSeries(ls2, ref2, currentTime, -1) + c.UpdateSeries(ls1, ref1, currentTime.Add(-1*time.Minute), -1, nil) + c.UpdateSeries(ls2, ref2, currentTime, -1, nil) - valid = c.Purge(currentTime) + valid = c.Purge(currentTime, nil) assert.True(t, valid) allActive, _, _, _, _, _ = c.ActiveWithMatchers() assert.Equal(t, 1, allActive) // This will *not* update the series, since there is already newer timestamp. - c.UpdateSeries(ls2, ref2, currentTime.Add(-1*time.Minute), -1) + c.UpdateSeries(ls2, ref2, currentTime.Add(-1*time.Minute), -1, nil) - valid = c.Purge(currentTime) + valid = c.Purge(currentTime, nil) assert.True(t, valid) allActive, _, _, _, _, _ = c.ActiveWithMatchers() assert.Equal(t, 1, allActive) @@ -632,30 +629,30 @@ func TestActiveSeries_ReloadSeriesMatchers(t *testing.T) { asm := asmodel.NewMatchers(MustNewCustomTrackersConfigFromMap(t, map[string]string{"foo": `{a=~.*}`})) currentTime := time.Now() - c := NewActiveSeries(asm, DefaultTimeout) + c := NewActiveSeries(asm, DefaultTimeout, nil) - valid := c.Purge(currentTime) + valid := c.Purge(currentTime, nil) assert.True(t, valid) allActive, activeMatching, _, _, _, _ := c.ActiveWithMatchers() assert.Equal(t, 0, allActive) assert.Equal(t, []int{0}, activeMatching) - c.UpdateSeries(ls1, ref1, currentTime, -1) - valid = c.Purge(currentTime) + c.UpdateSeries(ls1, ref1, currentTime, -1, nil) + valid = c.Purge(currentTime, nil) assert.True(t, valid) allActive, activeMatching, _, _, _, _ = c.ActiveWithMatchers() assert.Equal(t, 1, allActive) assert.Equal(t, []int{1}, activeMatching) c.ReloadMatchers(asm, currentTime) - valid = c.Purge(currentTime) + valid = c.Purge(currentTime, nil) assert.False(t, valid) // Adding timeout time to make Purge results valid. currentTime = currentTime.Add(DefaultTimeout) - c.UpdateSeries(ls1, ref1, currentTime, -1) - c.UpdateSeries(ls2, ref2, currentTime, -1) - valid = c.Purge(currentTime) + c.UpdateSeries(ls1, ref1, currentTime, -1, nil) + c.UpdateSeries(ls2, ref2, currentTime, -1, nil) + valid = c.Purge(currentTime, nil) assert.True(t, valid) allActive, activeMatching, _, _, _, _ = c.ActiveWithMatchers() assert.Equal(t, 2, allActive) @@ -666,8 +663,8 @@ func TestActiveSeries_ReloadSeriesMatchers(t *testing.T) { // Adding timeout time to make Purge results valid. 
currentTime = currentTime.Add(DefaultTimeout) - c.UpdateSeries(ls3, ref3, currentTime, -1) - valid = c.Purge(currentTime) + c.UpdateSeries(ls3, ref3, currentTime, -1, nil) + valid = c.Purge(currentTime, nil) assert.True(t, valid) allActive, activeMatching, _, _, _, _ = c.ActiveWithMatchers() assert.Equal(t, 1, allActive) @@ -681,8 +678,8 @@ func TestActiveSeries_ReloadSeriesMatchers(t *testing.T) { // Adding timeout time to make Purge results valid. currentTime = currentTime.Add(DefaultTimeout) - c.UpdateSeries(ls4, ref4, currentTime, -1) - valid = c.Purge(currentTime) + c.UpdateSeries(ls4, ref4, currentTime, -1, nil) + valid = c.Purge(currentTime, nil) assert.True(t, valid) allActive, activeMatching, _, _, _, _ = c.ActiveWithMatchers() assert.Equal(t, 1, allActive) @@ -698,15 +695,15 @@ func TestActiveSeries_ReloadSeriesMatchers_LessMatchers(t *testing.T) { })) currentTime := time.Now() - c := NewActiveSeries(asm, DefaultTimeout) - valid := c.Purge(currentTime) + c := NewActiveSeries(asm, DefaultTimeout, nil) + valid := c.Purge(currentTime, nil) assert.True(t, valid) allActive, activeMatching, _, _, _, _ := c.ActiveWithMatchers() assert.Equal(t, 0, allActive) assert.Equal(t, []int{0, 0}, activeMatching) - c.UpdateSeries(ls1, ref1, currentTime, -1) - valid = c.Purge(currentTime) + c.UpdateSeries(ls1, ref1, currentTime, -1, nil) + valid = c.Purge(currentTime, nil) assert.True(t, valid) allActive, activeMatching, _, _, _, _ = c.ActiveWithMatchers() assert.Equal(t, 1, allActive) @@ -717,10 +714,10 @@ func TestActiveSeries_ReloadSeriesMatchers_LessMatchers(t *testing.T) { })) c.ReloadMatchers(asm, currentTime) - c.purge(time.Time{}) + c.purge(time.Time{}, nil) // Adding timeout time to make Purge results valid. currentTime = currentTime.Add(DefaultTimeout) - valid = c.Purge(currentTime) + valid = c.Purge(currentTime, nil) assert.True(t, valid) allActive, activeMatching, _, _, _, _ = c.ActiveWithMatchers() assert.Equal(t, 0, allActive) @@ -736,16 +733,15 @@ func TestActiveSeries_ReloadSeriesMatchers_SameSizeNewLabels(t *testing.T) { })) currentTime := time.Now() - - c := NewActiveSeries(asm, DefaultTimeout) - valid := c.Purge(currentTime) + c := NewActiveSeries(asm, DefaultTimeout, nil) + valid := c.Purge(currentTime, nil) assert.True(t, valid) allActive, activeMatching, _, _, _, _ := c.ActiveWithMatchers() assert.Equal(t, 0, allActive) assert.Equal(t, []int{0, 0}, activeMatching) - c.UpdateSeries(ls1, ref1, currentTime, -1) - valid = c.Purge(currentTime) + c.UpdateSeries(ls1, ref1, currentTime, -1, nil) + valid = c.Purge(currentTime, nil) assert.True(t, valid) allActive, activeMatching, _, _, _, _ = c.ActiveWithMatchers() assert.Equal(t, 1, allActive) @@ -757,11 +753,11 @@ func TestActiveSeries_ReloadSeriesMatchers_SameSizeNewLabels(t *testing.T) { })) c.ReloadMatchers(asm, currentTime) - c.purge(time.Time{}) + c.purge(time.Time{}, nil) // Adding timeout time to make Purge results valid. currentTime = currentTime.Add(DefaultTimeout) - valid = c.Purge(currentTime) + valid = c.Purge(currentTime, nil) assert.True(t, valid) allActive, activeMatching, _, _, _, _ = c.ActiveWithMatchers() assert.Equal(t, 0, allActive) @@ -790,7 +786,7 @@ func benchmarkActiveSeriesUpdateSeriesConcurrency(b *testing.B, numSeries, numGo var ( // Run the active series tracker with an active timeout = 0 so that the Purge() will always // purge the series. 
- c = NewActiveSeries(&asmodel.Matchers{}, 0) + c = NewActiveSeries(&asmodel.Matchers{}, 0, nil) updateGroup = &sync.WaitGroup{} purgeGroup = &sync.WaitGroup{} start = make(chan struct{}) @@ -824,7 +820,7 @@ func benchmarkActiveSeriesUpdateSeriesConcurrency(b *testing.B, numSeries, numGo nextSeriesID = 0 } - c.UpdateSeries(seriesList[nextSeriesID], storage.SeriesRef(nextSeriesID), now(), -1) + c.UpdateSeries(seriesList[nextSeriesID], storage.SeriesRef(nextSeriesID), now(), -1, nil) } }(i) } @@ -841,7 +837,7 @@ func benchmarkActiveSeriesUpdateSeriesConcurrency(b *testing.B, numSeries, numGo case <-stopPurge: return default: - c.Purge(future()) + c.Purge(future(), nil) } // Throttle, but keep high pressure from Purge(). @@ -928,10 +924,10 @@ func BenchmarkActiveSeries_UpdateSeries(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { - c := NewActiveSeries(asm, DefaultTimeout) + c := NewActiveSeries(asm, DefaultTimeout, nil) for round := 0; round <= tt.nRounds; round++ { for ix := 0; ix < tt.nSeries; ix++ { - c.UpdateSeries(series[ix], refs[ix], time.Unix(0, now), -1) + c.UpdateSeries(series[ix], refs[ix], time.Unix(0, now), -1, nil) now++ } } @@ -953,7 +949,7 @@ func benchmarkPurge(b *testing.B, twice bool) { const numExpiresSeries = numSeries / 25 currentTime := time.Now() - c := NewActiveSeries(&asmodel.Matchers{}, DefaultTimeout) + c := NewActiveSeries(&asmodel.Matchers{}, DefaultTimeout, nil) series := [numSeries]labels.Labels{} refs := [numSeries]storage.SeriesRef{} @@ -968,13 +964,13 @@ func benchmarkPurge(b *testing.B, twice bool) { // Prepare series for ix, s := range series { if ix < numExpiresSeries { - c.UpdateSeries(s, refs[ix], currentTime.Add(-DefaultTimeout), -1) + c.UpdateSeries(s, refs[ix], currentTime.Add(-DefaultTimeout), -1, nil) } else { - c.UpdateSeries(s, refs[ix], currentTime, -1) + c.UpdateSeries(s, refs[ix], currentTime, -1, nil) } } - valid := c.Purge(currentTime) + valid := c.Purge(currentTime, nil) assert.True(b, valid) allActive, _, _, _, _, _ := c.ActiveWithMatchers() assert.Equal(b, numSeries, allActive) @@ -982,13 +978,13 @@ func benchmarkPurge(b *testing.B, twice bool) { // Purge is going to purge everything currentTime = currentTime.Add(DefaultTimeout) - valid = c.Purge(currentTime) + valid = c.Purge(currentTime, nil) assert.True(b, valid) allActive, _, _, _, _, _ = c.ActiveWithMatchers() assert.Equal(b, numSeries-numExpiresSeries, allActive) if twice { - valid = c.Purge(currentTime) + valid = c.Purge(currentTime, nil) assert.True(b, valid) allActive, _, _, _, _, _ = c.ActiveWithMatchers() assert.Equal(b, numSeries-numExpiresSeries, allActive) diff --git a/pkg/ingester/ingester.go b/pkg/ingester/ingester.go index 4ae60217a4b..db221d3a66d 100644 --- a/pkg/ingester/ingester.go +++ b/pkg/ingester/ingester.go @@ -52,6 +52,7 @@ import ( "golang.org/x/exp/slices" "golang.org/x/sync/errgroup" + "github.com/grafana/mimir/pkg/costattribution" "github.com/grafana/mimir/pkg/ingester/activeseries" asmodel "github.com/grafana/mimir/pkg/ingester/activeseries/model" "github.com/grafana/mimir/pkg/ingester/client" @@ -314,6 +315,8 @@ type Ingester struct { activeGroups *util.ActiveGroupsCleanupService + costAttributionMgr *costattribution.Manager + tsdbMetrics *tsdbMetrics forceCompactTrigger chan requestWithUsersAndCallback @@ -368,8 +371,9 @@ func newIngester(cfg Config, limits *validation.Overrides, registerer prometheus limits: limits, logger: logger, - tsdbs: make(map[string]*userTSDB), - usersMetadata: make(map[string]*userMetricsMetadata), + tsdbs: 
make(map[string]*userTSDB), + usersMetadata: make(map[string]*userMetricsMetadata), + bucket: bucketClient, tsdbMetrics: newTSDBMetrics(registerer, logger), shipperMetrics: newShipperMetrics(registerer), @@ -382,7 +386,7 @@ func newIngester(cfg Config, limits *validation.Overrides, registerer prometheus } // New returns an Ingester that uses Mimir block storage. -func New(cfg Config, limits *validation.Overrides, ingestersRing ring.ReadRing, partitionRingWatcher *ring.PartitionRingWatcher, activeGroupsCleanupService *util.ActiveGroupsCleanupService, registerer prometheus.Registerer, logger log.Logger) (*Ingester, error) { +func New(cfg Config, limits *validation.Overrides, ingestersRing ring.ReadRing, partitionRingWatcher *ring.PartitionRingWatcher, activeGroupsCleanupService *util.ActiveGroupsCleanupService, costAttributionMgr *costattribution.Manager, registerer prometheus.Registerer, logger log.Logger) (*Ingester, error) { i, err := newIngester(cfg, limits, registerer, logger) if err != nil { return nil, err @@ -391,6 +395,7 @@ func New(cfg Config, limits *validation.Overrides, ingestersRing ring.ReadRing, i.metrics = newIngesterMetrics(registerer, cfg.ActiveSeriesMetrics.Enabled, i.getInstanceLimits, i.ingestionRate, &i.inflightPushRequests, &i.inflightPushRequestsBytes) i.activeGroups = activeGroupsCleanupService + i.costAttributionMgr = costAttributionMgr // We create a circuit breaker, which will be activated on a successful completion of starting. i.circuitBreaker = newIngesterCircuitBreaker(i.cfg.PushCircuitBreaker, i.cfg.ReadCircuitBreaker, logger, registerer) @@ -783,10 +788,13 @@ func (i *Ingester) updateActiveSeries(now time.Time) { } newMatchersConfig := i.limits.ActiveSeriesCustomTrackersConfig(userID) - if newMatchersConfig.String() != userDB.activeSeries.CurrentConfig().String() { + newCostAttributionTracker := i.costAttributionMgr.TrackerForUser(userID) + if userDB.activeSeries.ConfigDiffers(newMatchersConfig, newCostAttributionTracker) { i.replaceMatchers(asmodel.NewMatchers(newMatchersConfig), userDB, now) } - valid := userDB.activeSeries.Purge(now) + + idx, _ := userDB.Head().Index() + valid := userDB.activeSeries.Purge(now, idx) if !valid { // Active series config has been reloaded, exposing loading metric until MetricsIdleTimeout passes. 
i.metrics.activeSeriesLoading.WithLabelValues(userID).Set(1) @@ -1159,7 +1167,6 @@ func (i *Ingester) PushWithCleanup(ctx context.Context, req *mimirpb.WriteReques // Note that we don't .Finish() the span in this method on purpose spanlog := spanlogger.FromContext(ctx, i.logger) spanlog.DebugLog("event", "acquired append lock") - var ( startAppend = time.Now() @@ -1190,48 +1197,56 @@ func (i *Ingester) PushWithCleanup(ctx context.Context, req *mimirpb.WriteReques }, func(timestamp int64, labels []mimirpb.LabelAdapter) { stats.sampleTimestampTooOldCount++ + i.costAttributionMgr.TrackerForUser(userID).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonSampleTimestampTooOld, startAppend) updateFirstPartial(i.errorSamplers.sampleTimestampTooOld, func() softError { return newSampleTimestampTooOldError(model.Time(timestamp), labels) }) }, func(timestamp int64, labels []mimirpb.LabelAdapter) { stats.sampleOutOfOrderCount++ + i.costAttributionMgr.TrackerForUser(userID).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonSampleOutOfOrder, startAppend) updateFirstPartial(i.errorSamplers.sampleOutOfOrder, func() softError { return newSampleOutOfOrderError(model.Time(timestamp), labels) }) }, func(timestamp int64, labels []mimirpb.LabelAdapter) { stats.sampleTooOldCount++ + i.costAttributionMgr.TrackerForUser(userID).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonSampleTooOld, startAppend) updateFirstPartial(i.errorSamplers.sampleTimestampTooOldOOOEnabled, func() softError { return newSampleTimestampTooOldOOOEnabledError(model.Time(timestamp), labels, outOfOrderWindow) }) }, func(timestamp int64, labels []mimirpb.LabelAdapter) { stats.sampleTooFarInFutureCount++ + i.costAttributionMgr.TrackerForUser(userID).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonSampleTooFarInFuture, startAppend) updateFirstPartial(i.errorSamplers.sampleTimestampTooFarInFuture, func() softError { return newSampleTimestampTooFarInFutureError(model.Time(timestamp), labels) }) }, func(timestamp int64, labels []mimirpb.LabelAdapter) { stats.newValueForTimestampCount++ + i.costAttributionMgr.TrackerForUser(userID).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonNewValueForTimestamp, startAppend) updateFirstPartial(i.errorSamplers.sampleDuplicateTimestamp, func() softError { return newSampleDuplicateTimestampError(model.Time(timestamp), labels) }) }, - func() { + func(labels []mimirpb.LabelAdapter) { stats.perUserSeriesLimitCount++ + i.costAttributionMgr.TrackerForUser(userID).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonPerUserSeriesLimit, startAppend) updateFirstPartial(i.errorSamplers.maxSeriesPerUserLimitExceeded, func() softError { return newPerUserSeriesLimitReachedError(i.limiter.limits.MaxGlobalSeriesPerUser(userID)) }) }, func(labels []mimirpb.LabelAdapter) { stats.perMetricSeriesLimitCount++ + i.costAttributionMgr.TrackerForUser(userID).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonPerMetricSeriesLimit, startAppend) updateFirstPartial(i.errorSamplers.maxSeriesPerMetricLimitExceeded, func() softError { return newPerMetricSeriesLimitReachedError(i.limiter.limits.MaxGlobalSeriesPerMetric(userID), labels) }) }, func(err error, timestamp int64, labels []mimirpb.LabelAdapter) { stats.sampleOutOfOrderCount++ + i.costAttributionMgr.TrackerForUser(userID).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, 
reasonSampleOutOfOrder, startAppend) updateFirstPartial(i.errorSamplers.nativeHistogramValidationError, func() softError { e := newNativeHistogramValidationError(globalerror.NativeHistogramOOODisabled, err, model.Time(timestamp), labels) return e @@ -1239,30 +1254,35 @@ func (i *Ingester) PushWithCleanup(ctx context.Context, req *mimirpb.WriteReques }, func(err error, timestamp int64, labels []mimirpb.LabelAdapter) { stats.invalidNativeHistogramCount++ + i.costAttributionMgr.TrackerForUser(userID).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonInvalidNativeHistogram, startAppend) updateFirstPartial(i.errorSamplers.nativeHistogramValidationError, func() softError { return newNativeHistogramValidationError(globalerror.NativeHistogramCountMismatch, err, model.Time(timestamp), labels) }) }, func(err error, timestamp int64, labels []mimirpb.LabelAdapter) { stats.invalidNativeHistogramCount++ + i.costAttributionMgr.TrackerForUser(userID).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonInvalidNativeHistogram, startAppend) updateFirstPartial(i.errorSamplers.nativeHistogramValidationError, func() softError { return newNativeHistogramValidationError(globalerror.NativeHistogramCountNotBigEnough, err, model.Time(timestamp), labels) }) }, func(err error, timestamp int64, labels []mimirpb.LabelAdapter) { stats.invalidNativeHistogramCount++ + i.costAttributionMgr.TrackerForUser(userID).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonInvalidNativeHistogram, startAppend) updateFirstPartial(i.errorSamplers.nativeHistogramValidationError, func() softError { return newNativeHistogramValidationError(globalerror.NativeHistogramNegativeBucketCount, err, model.Time(timestamp), labels) }) }, func(err error, timestamp int64, labels []mimirpb.LabelAdapter) { stats.invalidNativeHistogramCount++ + i.costAttributionMgr.TrackerForUser(userID).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonInvalidNativeHistogram, startAppend) updateFirstPartial(i.errorSamplers.nativeHistogramValidationError, func() softError { return newNativeHistogramValidationError(globalerror.NativeHistogramSpanNegativeOffset, err, model.Time(timestamp), labels) }) }, func(err error, timestamp int64, labels []mimirpb.LabelAdapter) { stats.invalidNativeHistogramCount++ + i.costAttributionMgr.TrackerForUser(userID).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(labels), 1, reasonInvalidNativeHistogram, startAppend) updateFirstPartial(i.errorSamplers.nativeHistogramValidationError, func() softError { return newNativeHistogramValidationError(globalerror.NativeHistogramSpansBucketsMismatch, err, model.Time(timestamp), labels) }) @@ -1377,7 +1397,6 @@ func (i *Ingester) updateMetricsFromPushStats(userID string, group string, stats func (i *Ingester) pushSamplesToAppender(userID string, timeseries []mimirpb.PreallocTimeseries, app extendedAppender, startAppend time.Time, stats *pushStats, errProcessor *mimir_storage.SoftAppendErrorProcessor, updateFirstPartial func(sampler *util_log.Sampler, errFn softErrorFunction), activeSeries *activeseries.ActiveSeries, outOfOrderWindow time.Duration, minAppendTimeAvailable bool, minAppendTime int64) error { - // Fetch limits once per push request both to avoid processing half the request differently. 
var ( nativeHistogramsIngestionEnabled = i.limits.NativeHistogramsIngestionEnabled(userID) @@ -1390,6 +1409,11 @@ func (i *Ingester) pushSamplesToAppender(userID string, timeseries []mimirpb.Pre var builder labels.ScratchBuilder var nonCopiedLabels labels.Labels + + // idx is used to decrease active series count in case of error for cost attribution. + idx, _ := i.getTSDB(userID).Head().Index() + // TODO: deal with the error here + for _, ts := range timeseries { // The labels must be sorted (in our case, it's guaranteed a write request // has sorted labels once hit the ingester). @@ -1405,8 +1429,9 @@ func (i *Ingester) pushSamplesToAppender(userID string, timeseries []mimirpb.Pre allOutOfBoundsHistograms(ts.Histograms, minAppendTime) { stats.failedSamplesCount += len(ts.Samples) + len(ts.Histograms) - stats.sampleTimestampTooOldCount += len(ts.Samples) + len(ts.Histograms) + stats.sampleTimestampTooOldCount += len(ts.Samples) + len(ts.Histograms) + i.costAttributionMgr.TrackerForUser(userID).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(ts.Labels), float64(len(ts.Samples)+len(ts.Histograms)), reasonSampleTimestampTooOld, startAppend) var firstTimestamp int64 if len(ts.Samples) > 0 { firstTimestamp = ts.Samples[0].TimestampMs @@ -1424,10 +1449,9 @@ func (i *Ingester) pushSamplesToAppender(userID string, timeseries []mimirpb.Pre // ignore native histograms in the condition and statitics as well if outOfOrderWindow <= 0 && minAppendTimeAvailable && len(ts.Exemplars) == 0 && len(ts.Samples) > 0 && allOutOfBoundsFloats(ts.Samples, minAppendTime) { - stats.failedSamplesCount += len(ts.Samples) stats.sampleTimestampTooOldCount += len(ts.Samples) - + i.costAttributionMgr.TrackerForUser(userID).IncrementDiscardedSamples(mimirpb.FromLabelAdaptersToLabels(ts.Labels), float64(len(ts.Samples)), reasonSampleTimestampTooOld, startAppend) firstTimestamp := ts.Samples[0].TimestampMs updateFirstPartial(i.errorSamplers.sampleTimestampTooOld, func() softError { @@ -1548,7 +1572,7 @@ func (i *Ingester) pushSamplesToAppender(userID string, timeseries []mimirpb.Pre } if activeSeries != nil && stats.succeededSamplesCount > oldSucceededSamplesCount { - activeSeries.UpdateSeries(nonCopiedLabels, ref, startAppend, numNativeHistogramBuckets) + activeSeries.UpdateSeries(nonCopiedLabels, ref, startAppend, numNativeHistogramBuckets, idx) } if len(ts.Exemplars) > 0 && i.limits.MaxGlobalExemplarsPerUser(userID) > 0 { @@ -2642,8 +2666,12 @@ func (i *Ingester) createTSDB(userID string, walReplayConcurrency int) (*userTSD } userDB := &userTSDB{ - userID: userID, - activeSeries: activeseries.NewActiveSeries(asmodel.NewMatchers(matchersConfig), i.cfg.ActiveSeriesMetrics.IdleTimeout), + userID: userID, + activeSeries: activeseries.NewActiveSeries( + asmodel.NewMatchers(matchersConfig), + i.cfg.ActiveSeriesMetrics.IdleTimeout, + i.costAttributionMgr.TrackerForUser(userID), + ), seriesInMetric: newMetricCounter(i.limiter, i.cfg.getIgnoreSeriesLimitForMetricNamesMap()), ingestedAPISamples: util_math.NewEWMARate(0.2, i.cfg.RateUpdatePeriod), ingestedRuleSamples: util_math.NewEWMARate(0.2, i.cfg.RateUpdatePeriod), @@ -3243,7 +3271,12 @@ func (i *Ingester) compactBlocksToReduceInMemorySeries(ctx context.Context, now } // Purge the active series so that the next call to Active() will return the up-to-date count. 
- db.activeSeries.Purge(now) + idx, err := db.Head().Index() + if err != nil { + level.Warn(i.logger).Log("msg", "failed to get the index of the TSDB head", "user", userID, "err", err) + continue + } + db.activeSeries.Purge(now, idx) // Estimate the number of series that would be dropped from the TSDB Head if we would // compact the head up until "now - active series idle timeout". diff --git a/pkg/ingester/ingester_early_compaction_test.go b/pkg/ingester/ingester_early_compaction_test.go index c12b6bb65dc..fe83b4a9df0 100644 --- a/pkg/ingester/ingester_early_compaction_test.go +++ b/pkg/ingester/ingester_early_compaction_test.go @@ -130,7 +130,7 @@ func TestIngester_compactBlocksToReduceInMemorySeries_ShouldTriggerCompactionOnl require.Len(t, listBlocksInDir(t, userBlocksDir), 0) // Use a trick to track all series we've written so far as "inactive". - ingester.getTSDB(userID).activeSeries.Purge(now.Add(30 * time.Minute)) + ingester.getTSDB(userID).activeSeries.Purge(now.Add(30*time.Minute), nil) // Pre-condition check. require.Equal(t, uint64(10), ingester.getTSDB(userID).Head().NumSeries()) diff --git a/pkg/ingester/ingester_ingest_storage_test.go b/pkg/ingester/ingester_ingest_storage_test.go index 4a529321155..fcf79dd4bc7 100644 --- a/pkg/ingester/ingester_ingest_storage_test.go +++ b/pkg/ingester/ingester_ingest_storage_test.go @@ -650,7 +650,7 @@ func createTestIngesterWithIngestStorage(t testing.TB, ingesterCfg *Config, over require.NoError(t, services.StopAndAwaitTerminated(ctx, prw)) }) - ingester, err := New(*ingesterCfg, overrides, nil, prw, nil, reg, util_test.NewTestingLogger(t)) + ingester, err := New(*ingesterCfg, overrides, nil, prw, nil, nil, reg, util_test.NewTestingLogger(t)) require.NoError(t, err) return ingester, kafkaCluster, prw diff --git a/pkg/ingester/ingester_test.go b/pkg/ingester/ingester_test.go index fea6b13f764..48eff46fc86 100644 --- a/pkg/ingester/ingester_test.go +++ b/pkg/ingester/ingester_test.go @@ -59,6 +59,7 @@ import ( "google.golang.org/grpc" "google.golang.org/grpc/codes" + "github.com/grafana/mimir/pkg/costattribution" asmodel "github.com/grafana/mimir/pkg/ingester/activeseries/model" "github.com/grafana/mimir/pkg/ingester/client" "github.com/grafana/mimir/pkg/mimirpb" @@ -3589,53 +3590,114 @@ func TestIngester_Push_DecreaseInactiveSeries(t *testing.T) { } func BenchmarkIngesterPush(b *testing.B) { - registry := prometheus.NewRegistry() - ctx := user.InjectOrgID(context.Background(), userID) + costAttributionCases := []struct { + state string + limitsCfg func(*validation.Limits) + customRegistry *prometheus.Registry + }{ + { + state: "disabled", + limitsCfg: func(*validation.Limits) {}, + customRegistry: nil, + }, + { + state: "enabled", + limitsCfg: func(limits *validation.Limits) { + if limits == nil { + return + } + limits.CostAttributionLabels = []string{"cpu"} + limits.MaxCostAttributionCardinalityPerUser = 100 + }, + customRegistry: prometheus.NewRegistry(), + }, + } - // Create a mocked ingester - cfg := defaultIngesterTestConfig(b) + tests := []struct { + name string + limitsCfg func() validation.Limits + }{ + { + name: "ingester push succeeded", + limitsCfg: func() validation.Limits { + limitsCfg := defaultLimitsTestConfig() + limitsCfg.NativeHistogramsIngestionEnabled = true + return limitsCfg + }, + }, + } - ingester, err := prepareIngesterWithBlocksStorage(b, cfg, nil, registry) - require.NoError(b, err) - require.NoError(b, services.StartAndAwaitRunning(context.Background(), ingester)) - defer
services.StopAndAwaitTerminated(context.Background(), ingester) //nolint:errcheck + for _, caCase := range costAttributionCases { + b.Run(fmt.Sprintf("cost_attribution=%s", caCase.state), func(b *testing.B) { + for _, t := range tests { + b.Run(fmt.Sprintf("scenario=%s", t.name), func(b *testing.B) { + registry := prometheus.NewRegistry() + ctx := user.InjectOrgID(context.Background(), userID) - // Wait until the ingester is healthy - test.Poll(b, 100*time.Millisecond, 1, func() interface{} { - return ingester.lifecycler.HealthyInstancesCount() - }) + // Create a mocked ingester + cfg := defaultIngesterTestConfig(b) - // Push a single time series to set the TSDB min time. - metricLabelAdapters := [][]mimirpb.LabelAdapter{{{Name: labels.MetricName, Value: "test"}}} - startTime := util.TimeToMillis(time.Now()) + limitCfg := t.limitsCfg() + caCase.limitsCfg(&limitCfg) - currTimeReq := mimirpb.ToWriteRequest( - metricLabelAdapters, - []mimirpb.Sample{{Value: 1, TimestampMs: startTime}}, - nil, - nil, - mimirpb.API, - ) - _, err = ingester.Push(ctx, currTimeReq) - require.NoError(b, err) + overrides, err := validation.NewOverrides(limitCfg, nil) + require.NoError(b, err) - const ( - series = 10 - samples = 1 - ) + var cam *costattribution.Manager + if caCase.customRegistry != nil { + cam, err = costattribution.NewManager(5*time.Second, time.Second, 10*time.Second, nil, overrides, caCase.customRegistry) + require.NoError(b, err) + } + + ingester, err := prepareIngesterWithBlockStorageOverridesAndCostAttribution(b, cfg, overrides, nil, "", "", registry, cam) + require.NoError(b, err) + require.NoError(b, services.StartAndAwaitRunning(context.Background(), ingester)) - allLabels, allSamples := benchmarkData(series) + b.Cleanup(func() { + require.NoError(b, services.StopAndAwaitTerminated(context.Background(), ingester)) + }) - b.ResetTimer() - for iter := 0; iter < b.N; iter++ { - // Bump the timestamp on each of our test samples each time round the loop - for j := 0; j < samples; j++ { - for i := range allSamples { - allSamples[i].TimestampMs = startTime + int64(iter*samples+j+1) + // Wait until the ingester is healthy + test.Poll(b, 100*time.Millisecond, 1, func() interface{} { + return ingester.lifecycler.HealthyInstancesCount() + }) + + // Push a single time series to set the TSDB min time. 
+ metricLabelAdapters := [][]mimirpb.LabelAdapter{{{Name: labels.MetricName, Value: "test"}}} + startTime := util.TimeToMillis(time.Now()) + + currTimeReq := mimirpb.ToWriteRequest( + metricLabelAdapters, + []mimirpb.Sample{{Value: 1, TimestampMs: startTime}}, + nil, + nil, + mimirpb.API, + ) + _, err = ingester.Push(ctx, currTimeReq) + require.NoError(b, err) + + // Benchmark 5000 series with 10 samples each. + const ( + series = 5000 + samples = 10 + ) + + allLabels, allSamples := benchmarkData(series) + + b.ResetTimer() + for iter := 0; iter < b.N; iter++ { + // Bump the timestamp on each of our test samples each time round the loop + for j := 0; j < samples; j++ { + for i := range allSamples { + allSamples[i].TimestampMs = startTime + int64(iter*samples+j+1) + } + _, err := ingester.Push(ctx, mimirpb.ToWriteRequest(allLabels, allSamples, nil, nil, mimirpb.API)) + require.NoError(b, err) + } + } + }) } - _, err := ingester.Push(ctx, mimirpb.ToWriteRequest(allLabels, allSamples, nil, nil, mimirpb.API)) - require.NoError(b, err) - } + }) } } @@ -6056,10 +6118,14 @@ func prepareIngesterWithBlocksStorageAndLimits(t testing.TB, ingesterCfg Config, } func prepareIngesterWithBlockStorageAndOverrides(t testing.TB, ingesterCfg Config, overrides *validation.Overrides, ingestersRing ring.ReadRing, dataDir string, bucketDir string, registerer prometheus.Registerer) (*Ingester, error) { - return prepareIngesterWithBlockStorageAndOverridesAndPartitionRing(t, ingesterCfg, overrides, ingestersRing, nil, dataDir, bucketDir, registerer) + return prepareIngesterWithBlockStorageOverridesAndCostAttribution(t, ingesterCfg, overrides, ingestersRing, dataDir, bucketDir, registerer, nil) +} + +func prepareIngesterWithBlockStorageOverridesAndCostAttribution(t testing.TB, ingesterCfg Config, overrides *validation.Overrides, ingestersRing ring.ReadRing, dataDir string, bucketDir string, registerer prometheus.Registerer, cam *costattribution.Manager) (*Ingester, error) { + return prepareIngesterWithBlockStorageAndOverridesAndPartitionRing(t, ingesterCfg, overrides, ingestersRing, nil, dataDir, bucketDir, registerer, cam) +} -func prepareIngesterWithBlockStorageAndOverridesAndPartitionRing(t testing.TB, ingesterCfg Config, overrides *validation.Overrides, ingestersRing ring.ReadRing, partitionsRing *ring.PartitionRingWatcher, dataDir string, bucketDir string, registerer prometheus.Registerer) (*Ingester, error) { +func prepareIngesterWithBlockStorageAndOverridesAndPartitionRing(t testing.TB, ingesterCfg Config, overrides *validation.Overrides, ingestersRing ring.ReadRing, partitionsRing *ring.PartitionRingWatcher, dataDir string, bucketDir string, registerer prometheus.Registerer, cam *costattribution.Manager) (*Ingester, error) { // Create a data dir if none has been provided.
if dataDir == "" { dataDir = t.TempDir() @@ -6080,7 +6146,7 @@ func prepareIngesterWithBlockStorageAndOverridesAndPartitionRing(t testing.TB, i ingestersRing = createAndStartRing(t, ingesterCfg.IngesterRing.ToRingConfig()) } - ingester, err := New(ingesterCfg, overrides, ingestersRing, partitionsRing, nil, registerer, noDebugNoopLogger{}) // LOGGING: log.NewLogfmtLogger(os.Stderr) + ingester, err := New(ingesterCfg, overrides, ingestersRing, partitionsRing, nil, cam, registerer, noDebugNoopLogger{}) // LOGGING: log.NewLogfmtLogger(os.Stderr) if err != nil { return nil, err } @@ -6286,7 +6352,7 @@ func TestIngester_OpenExistingTSDBOnStartup(t *testing.T) { // setup the tsdbs dir testData.setup(t, tempDir) - ingester, err := New(ingesterCfg, overrides, createAndStartRing(t, ingesterCfg.IngesterRing.ToRingConfig()), nil, nil, nil, log.NewNopLogger()) + ingester, err := New(ingesterCfg, overrides, createAndStartRing(t, ingesterCfg.IngesterRing.ToRingConfig()), nil, nil, nil, nil, log.NewNopLogger()) require.NoError(t, err) startErr := services.StartAndAwaitRunning(context.Background(), ingester) @@ -7446,7 +7512,7 @@ func TestHeadCompactionOnStartup(t *testing.T) { ingesterCfg.BlocksStorageConfig.Bucket.S3.Endpoint = "localhost" ingesterCfg.BlocksStorageConfig.TSDB.Retention = 2 * 24 * time.Hour // Make sure that no newly created blocks are deleted. - ingester, err := New(ingesterCfg, overrides, createAndStartRing(t, ingesterCfg.IngesterRing.ToRingConfig()), nil, nil, nil, log.NewNopLogger()) + ingester, err := New(ingesterCfg, overrides, createAndStartRing(t, ingesterCfg.IngesterRing.ToRingConfig()), nil, nil, nil, nil, log.NewNopLogger()) require.NoError(t, err) require.NoError(t, services.StartAndAwaitRunning(context.Background(), ingester)) diff --git a/pkg/ingester/user_tsdb.go b/pkg/ingester/user_tsdb.go index 95bfe9840e2..b4a5dc74baf 100644 --- a/pkg/ingester/user_tsdb.go +++ b/pkg/ingester/user_tsdb.go @@ -619,12 +619,14 @@ func (u *userTSDB) computeOwnedSeries() int { } count := 0 + idx, _ := u.Head().Index() + // TODO: deal with the err here u.Head().ForEachSecondaryHash(func(refs []chunks.HeadSeriesRef, secondaryHashes []uint32) { for i, sh := range secondaryHashes { if u.ownedTokenRanges.IncludesKey(sh) { count++ } else { - u.activeSeries.Delete(refs[i]) + u.activeSeries.Delete(refs[i], idx) } } }) diff --git a/pkg/mimir/mimir.go b/pkg/mimir/mimir.go index 7bcd3eac250..31baea29e7e 100644 --- a/pkg/mimir/mimir.go +++ b/pkg/mimir/mimir.go @@ -52,6 +52,7 @@ import ( blockbuilderscheduler "github.com/grafana/mimir/pkg/blockbuilder/scheduler" "github.com/grafana/mimir/pkg/compactor" "github.com/grafana/mimir/pkg/continuoustest" + "github.com/grafana/mimir/pkg/costattribution" "github.com/grafana/mimir/pkg/distributor" "github.com/grafana/mimir/pkg/flusher" "github.com/grafana/mimir/pkg/frontend" @@ -148,6 +149,9 @@ type Config struct { Common CommonConfig `yaml:"common"` TimeseriesUnmarshalCachingOptimizationEnabled bool `yaml:"timeseries_unmarshal_caching_optimization_enabled" category:"experimental"` + + CostAttributionEvictionInterval time.Duration `yaml:"cost_attribution_eviction_interval" category:"experimental"` + CostAttributionRegistryPath string `yaml:"cost_attribution_registry_path" category:"experimental"` } // RegisterFlags registers flags. 
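
The two new top-level options above are consumed by the new `initCostAttributionService` module further down in this diff: a dedicated Prometheus registry is created only when `-cost-attribution.registry-path` is set, and the eviction interval is handed to `costattribution.NewManager`. Below is a minimal, self-contained sketch of that gating pattern; it is not Mimir's actual wiring. The `setupCostAttribution` helper, the `/cost-attribution-metrics` path, and the listen address are illustrative, and plain `promhttp` stands in for Mimir's `API.RegisterUsageMetricsRoute`.

```go
package main

import (
	"net/http"
	"time"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promhttp"
)

// setupCostAttribution mirrors the shape of initCostAttributionService: do
// nothing unless a registry path is configured, otherwise create a registry
// separate from the main one and expose it on its own path.
func setupCostAttribution(mux *http.ServeMux, registryPath string, evictionInterval time.Duration) *prometheus.Registry {
	if registryPath == "" {
		return nil // cost attribution metrics won't be exposed
	}
	reg := prometheus.NewRegistry()
	// In Mimir the route is registered via API.RegisterUsageMetricsRoute(path, reg);
	// promhttp is used here only to keep the sketch self-contained.
	mux.Handle(registryPath, promhttp.HandlerFor(reg, promhttp.HandlerOpts{}))
	_ = evictionInterval // in the real code this is passed to costattribution.NewManager
	return reg
}

func main() {
	mux := http.NewServeMux()
	// 20m matches the -cost-attribution.eviction-interval default added in this diff.
	setupCostAttribution(mux, "/cost-attribution-metrics", 20*time.Minute)
	_ = http.ListenAndServe(":8080", mux)
}
```
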
@@ -173,6 +177,8 @@ func (c *Config) RegisterFlags(f *flag.FlagSet, logger log.Logger) { f.IntVar(&c.MaxSeparateMetricsGroupsPerUser, "max-separate-metrics-groups-per-user", 1000, "Maximum number of groups allowed per user by which specified distributor and ingester metrics can be further separated.") f.BoolVar(&c.EnableGoRuntimeMetrics, "enable-go-runtime-metrics", false, "Set to true to enable all Go runtime metrics, such as go_sched_* and go_memstats_*.") f.BoolVar(&c.TimeseriesUnmarshalCachingOptimizationEnabled, "timeseries-unmarshal-caching-optimization-enabled", true, "Enables optimized marshaling of timeseries.") + f.StringVar(&c.CostAttributionRegistryPath, "cost-attribution.registry-path", "", "Defines a custom path for the registry. When specified, Mimir will expose cost attribution metrics through this custom path, if not specified, cost attribution metrics won't be exposed.") + f.DurationVar(&c.CostAttributionEvictionInterval, "cost-attribution.eviction-interval", 20*time.Minute, "Time interval at which inactive cost attributions are evicted from the counter, ensuring they are not included in the cost attribution cardinality per user limit.") c.API.RegisterFlags(f) c.registerServerFlagsWithChangedDefaultValues(f) @@ -739,6 +745,8 @@ type Mimir struct { BlockBuilderScheduler *blockbuilderscheduler.BlockBuilderScheduler ContinuousTestManager *continuoustest.Manager BuildInfoHandler http.Handler + + CostAttributionManager *costattribution.Manager } // New makes a new Mimir. diff --git a/pkg/mimir/modules.go b/pkg/mimir/modules.go index c66d15c6474..feb057b6ca4 100644 --- a/pkg/mimir/modules.go +++ b/pkg/mimir/modules.go @@ -43,6 +43,7 @@ import ( blockbuilderscheduler "github.com/grafana/mimir/pkg/blockbuilder/scheduler" "github.com/grafana/mimir/pkg/compactor" "github.com/grafana/mimir/pkg/continuoustest" + "github.com/grafana/mimir/pkg/costattribution" "github.com/grafana/mimir/pkg/distributor" "github.com/grafana/mimir/pkg/flusher" "github.com/grafana/mimir/pkg/frontend" @@ -80,6 +81,7 @@ const ( OverridesExporter string = "overrides-exporter" Server string = "server" ActiveGroupsCleanupService string = "active-groups-cleanup-service" + CostAttributionService string = "cost-attribution-service" Distributor string = "distributor" DistributorService string = "distributor-service" Ingester string = "ingester" @@ -462,7 +464,9 @@ func (t *Mimir) initDistributorService() (serv services.Service, err error) { t.Cfg.Distributor.PreferAvailabilityZone = t.Cfg.Querier.PreferAvailabilityZone t.Cfg.Distributor.IngestStorageConfig = t.Cfg.IngestStorage - t.Distributor, err = distributor.New(t.Cfg.Distributor, t.Cfg.IngesterClient, t.Overrides, t.ActiveGroupsCleanup, t.IngesterRing, t.IngesterPartitionInstanceRing, canJoinDistributorsRing, t.Registerer, util_log.Logger) + t.Distributor, err = distributor.New(t.Cfg.Distributor, t.Cfg.IngesterClient, t.Overrides, + t.ActiveGroupsCleanup, t.CostAttributionManager, t.IngesterRing, t.IngesterPartitionInstanceRing, + canJoinDistributorsRing, t.Registerer, util_log.Logger) if err != nil { return } @@ -644,6 +648,18 @@ func (t *Mimir) initActiveGroupsCleanupService() (services.Service, error) { return t.ActiveGroupsCleanup, nil } +func (t *Mimir) initCostAttributionService() (services.Service, error) { + // The cost attribution service is only initilized if the custom registry path is provided. 
+ if t.Cfg.CostAttributionRegistryPath != "" { + reg := prometheus.NewRegistry() + var err error + t.CostAttributionManager, err = costattribution.NewManager(3*time.Minute, time.Minute, t.Cfg.CostAttributionEvictionInterval, util_log.Logger, t.Overrides, reg) + t.API.RegisterUsageMetricsRoute(t.Cfg.CostAttributionRegistryPath, reg) + return t.CostAttributionManager, err + } + return nil, nil +} + func (t *Mimir) tsdbIngesterConfig() { t.Cfg.Ingester.BlocksStorageConfig = t.Cfg.BlocksStorage } @@ -655,7 +671,7 @@ func (t *Mimir) initIngesterService() (serv services.Service, err error) { t.Cfg.Ingester.IngestStorageConfig = t.Cfg.IngestStorage t.tsdbIngesterConfig() - t.Ingester, err = ingester.New(t.Cfg.Ingester, t.Overrides, t.IngesterRing, t.IngesterPartitionRingWatcher, t.ActiveGroupsCleanup, t.Registerer, util_log.Logger) + t.Ingester, err = ingester.New(t.Cfg.Ingester, t.Overrides, t.IngesterRing, t.IngesterPartitionRingWatcher, t.ActiveGroupsCleanup, t.CostAttributionManager, t.Registerer, util_log.Logger) if err != nil { return } @@ -1136,6 +1152,7 @@ func (t *Mimir) setupModuleManager() error { mm.RegisterModule(Overrides, t.initOverrides, modules.UserInvisibleModule) mm.RegisterModule(OverridesExporter, t.initOverridesExporter) mm.RegisterModule(ActiveGroupsCleanupService, t.initActiveGroupsCleanupService, modules.UserInvisibleModule) + mm.RegisterModule(CostAttributionService, t.initCostAttributionService, modules.UserInvisibleModule) mm.RegisterModule(Distributor, t.initDistributor) mm.RegisterModule(DistributorService, t.initDistributorService, modules.UserInvisibleModule) mm.RegisterModule(Ingester, t.initIngester) @@ -1176,9 +1193,10 @@ func (t *Mimir) setupModuleManager() error { Overrides: {RuntimeConfig}, OverridesExporter: {Overrides, MemberlistKV, Vault}, Distributor: {DistributorService, API, ActiveGroupsCleanupService, Vault}, - DistributorService: {IngesterRing, IngesterPartitionRing, Overrides, Vault}, + DistributorService: {IngesterRing, IngesterPartitionRing, Overrides, Vault, CostAttributionService}, + CostAttributionService: {API, Overrides}, Ingester: {IngesterService, API, ActiveGroupsCleanupService, Vault}, - IngesterService: {IngesterRing, IngesterPartitionRing, Overrides, RuntimeConfig, MemberlistKV}, + IngesterService: {IngesterRing, IngesterPartitionRing, Overrides, RuntimeConfig, MemberlistKV, CostAttributionService}, Flusher: {Overrides, API}, Queryable: {Overrides, DistributorService, IngesterRing, IngesterPartitionRing, API, StoreQueryable, MemberlistKV}, Querier: {TenantFederation, Vault}, diff --git a/pkg/storage/soft_append_error_processor.go b/pkg/storage/soft_append_error_processor.go index 0f02131537d..6fdda3ae588 100644 --- a/pkg/storage/soft_append_error_processor.go +++ b/pkg/storage/soft_append_error_processor.go @@ -22,7 +22,7 @@ type SoftAppendErrorProcessor struct { errTooOldSample func(int64, []mimirpb.LabelAdapter) sampleTooFarInFuture func(int64, []mimirpb.LabelAdapter) errDuplicateSampleForTimestamp func(int64, []mimirpb.LabelAdapter) - maxSeriesPerUser func() + maxSeriesPerUser func(labels []mimirpb.LabelAdapter) maxSeriesPerMetric func(labels []mimirpb.LabelAdapter) errOOONativeHistogramsDisabled func(error, int64, []mimirpb.LabelAdapter) errHistogramCountMismatch func(error, int64, []mimirpb.LabelAdapter) @@ -39,7 +39,7 @@ func NewSoftAppendErrorProcessor( errTooOldSample func(int64, []mimirpb.LabelAdapter), sampleTooFarInFuture func(int64, []mimirpb.LabelAdapter), errDuplicateSampleForTimestamp func(int64, []mimirpb.LabelAdapter), - 
maxSeriesPerUser func(), + maxSeriesPerUser func([]mimirpb.LabelAdapter), maxSeriesPerMetric func(labels []mimirpb.LabelAdapter), errOOONativeHistogramsDisabled func(error, int64, []mimirpb.LabelAdapter), errHistogramCountMismatch func(error, int64, []mimirpb.LabelAdapter), @@ -89,7 +89,7 @@ func (e *SoftAppendErrorProcessor) ProcessErr(err error, ts int64, labels []mimi e.errDuplicateSampleForTimestamp(ts, labels) return true case errors.Is(err, globalerror.MaxSeriesPerUser): - e.maxSeriesPerUser() + e.maxSeriesPerUser(labels) return true case errors.Is(err, globalerror.MaxSeriesPerMetric): e.maxSeriesPerMetric(labels) diff --git a/pkg/streamingpromql/benchmarks/comparison_test.go b/pkg/streamingpromql/benchmarks/comparison_test.go index 5b26a5d6c45..4b147583d31 100644 --- a/pkg/streamingpromql/benchmarks/comparison_test.go +++ b/pkg/streamingpromql/benchmarks/comparison_test.go @@ -237,7 +237,7 @@ func createIngesterQueryable(t testing.TB, address string) storage.Queryable { overrides, err := validation.NewOverrides(limits, nil) require.NoError(t, err) - d, err := distributor.New(distributorCfg, clientCfg, overrides, nil, ingestersRing, nil, false, nil, logger) + d, err := distributor.New(distributorCfg, clientCfg, overrides, nil, nil, ingestersRing, nil, false, nil, logger) require.NoError(t, err) queryMetrics := stats.NewQueryMetrics(nil) diff --git a/pkg/streamingpromql/benchmarks/ingester.go b/pkg/streamingpromql/benchmarks/ingester.go index 6f3b5f04a9a..9107b66f64f 100644 --- a/pkg/streamingpromql/benchmarks/ingester.go +++ b/pkg/streamingpromql/benchmarks/ingester.go @@ -96,7 +96,7 @@ func startBenchmarkIngester(rootDataDir string) (*ingester.Ingester, string, fun return services.StopAndAwaitTerminated(context.Background(), ingestersRing) }) - ing, err := ingester.New(ingesterCfg, overrides, ingestersRing, nil, nil, nil, log.NewNopLogger()) + ing, err := ingester.New(ingesterCfg, overrides, ingestersRing, nil, nil, nil, nil, log.NewNopLogger()) if err != nil { cleanup() return nil, "", nil, fmt.Errorf("could not create ingester: %w", err) diff --git a/pkg/util/validation/limits.go b/pkg/util/validation/limits.go index 09b212ff2fa..e0f1c0b2a7b 100644 --- a/pkg/util/validation/limits.go +++ b/pkg/util/validation/limits.go @@ -62,6 +62,8 @@ const ( QueryIngestersWithinFlag = "querier.query-ingesters-within" AlertmanagerMaxGrafanaConfigSizeFlag = "alertmanager.max-grafana-config-size-bytes" AlertmanagerMaxGrafanaStateSizeFlag = "alertmanager.max-grafana-state-size-bytes" + costAttributionLabelsFlag = "validation.cost-attribution-labels" + maxCostAttributionLabelsPerUserFlag = "validation.max-cost-attribution-labels-per-user" // MinCompactorPartialBlockDeletionDelay is the minimum partial blocks deletion delay that can be configured in Mimir. MinCompactorPartialBlockDeletionDelay = 4 * time.Hour @@ -70,6 +72,7 @@ const ( var ( errInvalidIngestStorageReadConsistency = fmt.Errorf("invalid ingest storage read consistency (supported values: %s)", strings.Join(api.ReadConsistencies, ", ")) errInvalidMaxEstimatedChunksPerQueryMultiplier = errors.New("invalid value for -" + MaxEstimatedChunksPerQueryMultiplierFlag + ": must be 0 or greater than or equal to 1") + errCostAttributionLabelsLimitExceeded = errors.New("invalid value for -" + costAttributionLabelsFlag + ": exceeds the limit defined by -" + maxCostAttributionLabelsPerUserFlag) ) // LimitError is a marker interface for the errors that do not comply with the specified limits. 
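
The `SoftAppendErrorProcessor` change above is what lets the per-user series-limit rejection participate in cost attribution: the callback now receives the offending series' labels, so the ingester hunk earlier in this diff can call `IncrementDiscardedSamples` with `reasonPerUserSeriesLimit` just like the other soft errors. A toy stand-in for that pattern follows; the `softErrorCallbacks` struct, the map-based tracker, and the `team` label are illustrative only, not Mimir types.

```go
package main

import "fmt"

// Before this diff the per-user series-limit callback took no arguments, so a
// rejected sample could only bump an unlabelled counter. Passing the labels
// through lets the caller attribute the discarded sample to a cost attribution
// label value.
type softErrorCallbacks struct {
	maxSeriesPerUser func(labels map[string]string) // previously func()
}

func main() {
	discardedByTeam := map[string]float64{}

	cb := softErrorCallbacks{
		maxSeriesPerUser: func(labels map[string]string) {
			discardedByTeam[labels["team"]]++ // "team" is a hypothetical cost attribution label
		},
	}

	cb.maxSeriesPerUser(map[string]string{"__name__": "http_requests_total", "team": "checkout"})
	fmt.Println(discardedByTeam) // map[checkout:1]
}
```
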
@@ -186,6 +189,12 @@ type Limits struct { LabelValuesMaxCardinalityLabelNamesPerRequest int `yaml:"label_values_max_cardinality_label_names_per_request" json:"label_values_max_cardinality_label_names_per_request"` ActiveSeriesResultsMaxSizeBytes int `yaml:"active_series_results_max_size_bytes" json:"active_series_results_max_size_bytes" category:"experimental"` + // Cost attribution and limit. + CostAttributionLabels flagext.StringSliceCSV `yaml:"cost_attribution_labels" json:"cost_attribution_labels" category:"experimental"` + MaxCostAttributionLabelsPerUser int `yaml:"max_cost_attribution_labels_per_user" json:"max_cost_attribution_labels_per_user" category:"experimental"` + MaxCostAttributionCardinalityPerUser int `yaml:"max_cost_attribution_cardinality_per_user" json:"max_cost_attribution_cardinality_per_user" category:"experimental"` + CostAttributionCooldown model.Duration `yaml:"cost_attribution_cooldown" json:"cost_attribution_cooldown" category:"experimental"` + // Ruler defaults and limits. RulerEvaluationDelay model.Duration `yaml:"ruler_evaluation_delay_duration" json:"ruler_evaluation_delay_duration"` RulerTenantShardSize int `yaml:"ruler_tenant_shard_size" json:"ruler_tenant_shard_size"` @@ -296,6 +305,10 @@ func (l *Limits) RegisterFlags(f *flag.FlagSet) { f.StringVar(&l.SeparateMetricsGroupLabel, "validation.separate-metrics-group-label", "", "Label used to define the group label for metrics separation. For each write request, the group is obtained from the first non-empty group label from the first timeseries in the incoming list of timeseries. Specific distributor and ingester metrics will be further separated adding a 'group' label with group label's value. Currently applies to the following metrics: cortex_discarded_samples_total") + f.Var(&l.CostAttributionLabels, costAttributionLabelsFlag, "List of labels used to define cost attribution. These labels will be included in the specified distributor and ingester metrics for each write request, allowing them to be distinguished by the label. The label applies to the following metrics: cortex_distributor_attributed_received_samples_total, cortex_ingester_attributed_active_series, and cortex_attributed_discarded_samples_total. Set to an empty string to disable cost attribution.") + f.IntVar(&l.MaxCostAttributionLabelsPerUser, maxCostAttributionLabelsPerUserFlag, 2, "Maximum number of cost attribution labels allowed per user. Set to 0 to disable.") + f.IntVar(&l.MaxCostAttributionCardinalityPerUser, "validation.max-cost-attribution-cardinality-per-user", 10000, "Maximum cardinality of cost attribution labels allowed per user.") + f.Var(&l.CostAttributionCooldown, "validation.cost-attribution-cooldown", "Cooldown period for cost attribution labels. Specifies the duration the cost attribution remains in overflow before attempting a reset. If the cardinality remains above the limit after this period, the system will stay in overflow mode and extend the cooldown. Setting this value to 0 disables the cooldown, causing the system to continuously check whether the cardinality has dropped below the limit. A reset will occur once the cardinality falls below the limit.") f.IntVar(&l.MaxChunksPerQuery, MaxChunksPerQueryFlag, 2e6, "Maximum number of chunks that can be fetched in a single query from ingesters and store-gateways. This limit is enforced in the querier, ruler and store-gateway. 
0 to disable.") f.Float64Var(&l.MaxEstimatedChunksPerQueryMultiplier, MaxEstimatedChunksPerQueryMultiplierFlag, 0, "Maximum number of chunks estimated to be fetched in a single query from ingesters and store-gateways, as a multiple of -"+MaxChunksPerQueryFlag+". This limit is enforced in the querier. Must be greater than or equal to 1, or 0 to disable.") f.IntVar(&l.MaxFetchedSeriesPerQuery, MaxSeriesPerQueryFlag, 0, "The maximum number of unique series for which a query can fetch samples from ingesters and store-gateways. This limit is enforced in the querier, ruler and store-gateway. 0 to disable") @@ -472,6 +485,10 @@ func (l *Limits) validate() error { return errInvalidIngestStorageReadConsistency } + if len(l.CostAttributionLabels) > l.MaxCostAttributionLabelsPerUser { + return errCostAttributionLabelsLimitExceeded + } + return nil } @@ -793,6 +810,22 @@ func (o *Overrides) SeparateMetricsGroupLabel(userID string) string { return o.getOverridesForUser(userID).SeparateMetricsGroupLabel } +func (o *Overrides) CostAttributionLabels(userID string) []string { + return o.getOverridesForUser(userID).CostAttributionLabels +} + +func (o *Overrides) MaxCostAttributionLabelsPerUser(userID string) int { + return o.getOverridesForUser(userID).MaxCostAttributionLabelsPerUser +} + +func (o *Overrides) CostAttributionCooldown(userID string) time.Duration { + return time.Duration(o.getOverridesForUser(userID).CostAttributionCooldown) +} + +func (o *Overrides) MaxCostAttributionCardinalityPerUser(userID string) int { + return o.getOverridesForUser(userID).MaxCostAttributionCardinalityPerUser +} + // IngestionTenantShardSize returns the ingesters shard size for a given user. func (o *Overrides) IngestionTenantShardSize(userID string) int { return o.getOverridesForUser(userID).IngestionTenantShardSize diff --git a/pkg/util/validation/limits_test.go b/pkg/util/validation/limits_test.go index 9dc82df2d05..c56cb1ab026 100644 --- a/pkg/util/validation/limits_test.go +++ b/pkg/util/validation/limits_test.go @@ -1076,6 +1076,12 @@ metric_relabel_configs: cfg: `ingest_storage_read_consistency: xyz`, expectedErr: errInvalidIngestStorageReadConsistency.Error(), }, + "should fail when cost_attribution_labels exceed max_cost_attribution_labels_per_user": { + cfg: ` +cost_attribution_labels: label1, label2, label3, +max_cost_attribution_labels_per_user: 2`, + expectedErr: errCostAttributionLabelsLimitExceeded.Error(), + }, } for testName, testData := range tests {