From e0bd75e049a3d55e694544b0fbedc13d30c78376 Mon Sep 17 00:00:00 2001
From: Steve Simpson
Date: Tue, 6 Jul 2021 16:06:40 +0200
Subject: [PATCH 1/2] Allow setting ring heartbeat timeout to zero to disable
 timeout check.

This change allows the various ring heartbeat timeouts to be configured
with zero, as a means of disabling the timeout. This is expected to be
used with a separate enhancement to allow disabling heartbeats. When the
heartbeat timeout is disabled, instances will always appear as healthy
in the ring.

Signed-off-by: Steve Simpson
---
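Note: for illustration only (git am ignores text between the "---" marker
and the diffstat), disabling the heartbeat timeout for the distributor
ring could look as follows. This is a minimal sketch, assuming the YAML
layout of the `ring:` block documented in
docs/configuration/config-file-reference.md; `0` and `0s` are equivalent
duration spellings:

    distributor:
      ring:
        heartbeat_period: 5s
        # Zero disables the timeout check entirely: ring members are then
        # reported healthy regardless of how old their last heartbeat is.
        heartbeat_timeout: 0s

The equivalent CLI form is -distributor.ring.heartbeat-timeout=0.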
 CHANGELOG.md                                |  7 ++++
 docs/blocks-storage/compactor.md            |  2 +-
 docs/blocks-storage/store-gateway.md        |  4 +--
 docs/configuration/config-file-reference.md | 13 +++---
 pkg/alertmanager/alertmanager_ring.go       |  2 +-
 pkg/compactor/compactor_ring.go             |  2 +-
 pkg/distributor/distributor_ring.go         |  2 +-
 pkg/ring/model.go                           | 13 +++++--
 pkg/ring/model_test.go                      |  8 +++++
 pkg/ring/ring.go                            |  2 +-
 pkg/ring/ring_test.go                       | 38 +++++++++++++++++++--
 pkg/ruler/ruler_ring.go                     |  2 +-
 pkg/storegateway/gateway_ring.go            |  2 +-
 13 files changed, 78 insertions(+), 19 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 5532ab6d3a..e93f52978b 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,6 +5,13 @@
 * [CHANGE] Querier / ruler: Change `-querier.max-fetched-chunks-per-query` configuration to limit to maximum number of chunks that can be fetched in a single query. The number of chunks fetched by ingesters AND long-term storage combined should not exceed the value configured on `-querier.max-fetched-chunks-per-query`. #4260
 * [ENHANCEMENT] Add timeout for waiting on compactor to become ACTIVE in the ring. #4262
 * [ENHANCEMENT] Reduce memory used by streaming queries, particularly in ruler. #4341
+* [ENHANCEMENT] Ring: allow experimentally disabling heartbeat timeouts by setting the relevant configuration value to zero. Applies to the following: #4342
+  * `-distributor.ring.heartbeat-timeout`
+  * `-ring.heartbeat-timeout`
+  * `-ruler.ring.heartbeat-timeout`
+  * `-alertmanager.sharding-ring.heartbeat-timeout`
+  * `-compactor.ring.heartbeat-timeout`
+  * `-store-gateway.sharding-ring.heartbeat-timeout`
 * [BUGFIX] HA Tracker: when cleaning up obsolete elected replicas from KV store, tracker didn't update the number of clusters per user correctly. #4336
 
 ## 1.10.0-rc.0 / 2021-06-28
diff --git a/docs/blocks-storage/compactor.md b/docs/blocks-storage/compactor.md
index 280545afab..5a55cabb7c 100644
--- a/docs/blocks-storage/compactor.md
+++ b/docs/blocks-storage/compactor.md
@@ -214,7 +214,7 @@ compactor:
     [heartbeat_period: <duration> | default = 5s]
 
     # The heartbeat timeout after which compactors are considered unhealthy
-    # within the ring.
+    # within the ring. 0 = never (timeout disabled).
     # CLI flag: -compactor.ring.heartbeat-timeout
     [heartbeat_timeout: <duration> | default = 1m]
 
diff --git a/docs/blocks-storage/store-gateway.md b/docs/blocks-storage/store-gateway.md
index d24813be5f..3cd768fea2 100644
--- a/docs/blocks-storage/store-gateway.md
+++ b/docs/blocks-storage/store-gateway.md
@@ -237,8 +237,8 @@ store_gateway:
     [heartbeat_period: <duration> | default = 15s]
 
     # The heartbeat timeout after which store gateways are considered unhealthy
-    # within the ring. This option needs be set both on the store-gateway and
-    # querier when running in microservices mode.
+    # within the ring. 0 = never (timeout disabled). This option needs to be set
+    # both on the store-gateway and querier when running in microservices mode.
     # CLI flag: -store-gateway.sharding-ring.heartbeat-timeout
     [heartbeat_timeout: <duration> | default = 1m]
diff --git a/docs/configuration/config-file-reference.md b/docs/configuration/config-file-reference.md
index 0267d01e88..34685a3073 100644
--- a/docs/configuration/config-file-reference.md
+++ b/docs/configuration/config-file-reference.md
@@ -568,7 +568,7 @@ ring:
   [heartbeat_period: <duration> | default = 5s]
 
   # The heartbeat timeout after which distributors are considered unhealthy
-  # within the ring.
+  # within the ring. 0 = never (timeout disabled).
   # CLI flag: -distributor.ring.heartbeat-timeout
   [heartbeat_timeout: <duration> | default = 1m]
 
@@ -662,6 +662,7 @@ lifecycler:
     [mirror_timeout: <duration> | default = 2s]
 
   # The heartbeat timeout after which ingesters are skipped for reads/writes.
+  # 0 = never (timeout disabled).
   # CLI flag: -ring.heartbeat-timeout
   [heartbeat_timeout: <duration> | default = 1m]
 
@@ -1585,7 +1586,7 @@ ring:
   [heartbeat_period: <duration> | default = 5s]
 
   # The heartbeat timeout after which rulers are considered unhealthy within the
-  # ring.
+  # ring. 0 = never (timeout disabled).
   # CLI flag: -ruler.ring.heartbeat-timeout
   [heartbeat_timeout: <duration> | default = 1m]
 
@@ -1906,7 +1907,7 @@ sharding_ring:
   [heartbeat_period: <duration> | default = 15s]
 
   # The heartbeat timeout after which alertmanagers are considered unhealthy
-  # within the ring.
+  # within the ring. 0 = never (timeout disabled).
   # CLI flag: -alertmanager.sharding-ring.heartbeat-timeout
   [heartbeat_timeout: <duration> | default = 1m]
 
@@ -5179,7 +5180,7 @@ sharding_ring:
   [heartbeat_period: <duration> | default = 5s]
 
   # The heartbeat timeout after which compactors are considered unhealthy within
-  # the ring.
+  # the ring. 0 = never (timeout disabled).
   # CLI flag: -compactor.ring.heartbeat-timeout
   [heartbeat_timeout: <duration> | default = 1m]
 
@@ -5257,8 +5258,8 @@ sharding_ring:
   [heartbeat_period: <duration> | default = 15s]
 
   # The heartbeat timeout after which store gateways are considered unhealthy
-  # within the ring. This option needs be set both on the store-gateway and
-  # querier when running in microservices mode.
+  # within the ring. 0 = never (timeout disabled). This option needs to be set
+  # both on the store-gateway and querier when running in microservices mode.
   # CLI flag: -store-gateway.sharding-ring.heartbeat-timeout
   [heartbeat_timeout: <duration> | default = 1m]
diff --git a/pkg/alertmanager/alertmanager_ring.go b/pkg/alertmanager/alertmanager_ring.go
index 04d08d2d65..f0b707a090 100644
--- a/pkg/alertmanager/alertmanager_ring.go
+++ b/pkg/alertmanager/alertmanager_ring.go
@@ -77,7 +77,7 @@ func (cfg *RingConfig) RegisterFlags(f *flag.FlagSet) {
 	// Ring flags
 	cfg.KVStore.RegisterFlagsWithPrefix(rfprefix, "alertmanagers/", f)
 	f.DurationVar(&cfg.HeartbeatPeriod, rfprefix+"heartbeat-period", 15*time.Second, "Period at which to heartbeat to the ring.")
-	f.DurationVar(&cfg.HeartbeatTimeout, rfprefix+"heartbeat-timeout", time.Minute, "The heartbeat timeout after which alertmanagers are considered unhealthy within the ring.")
+	f.DurationVar(&cfg.HeartbeatTimeout, rfprefix+"heartbeat-timeout", time.Minute, "The heartbeat timeout after which alertmanagers are considered unhealthy within the ring. 0 = never (timeout disabled).")
 	f.IntVar(&cfg.ReplicationFactor, rfprefix+"replication-factor", 3, "The replication factor to use when sharding the alertmanager.")
 	f.BoolVar(&cfg.ZoneAwarenessEnabled, rfprefix+"zone-awareness-enabled", false, "True to enable zone-awareness and replicate alerts across different availability zones.")
 
diff --git a/pkg/compactor/compactor_ring.go b/pkg/compactor/compactor_ring.go
index 5b55c8a871..d384220a32 100644
--- a/pkg/compactor/compactor_ring.go
+++ b/pkg/compactor/compactor_ring.go
@@ -51,7 +51,7 @@ func (cfg *RingConfig) RegisterFlags(f *flag.FlagSet) {
 	// Ring flags
 	cfg.KVStore.RegisterFlagsWithPrefix("compactor.ring.", "collectors/", f)
 	f.DurationVar(&cfg.HeartbeatPeriod, "compactor.ring.heartbeat-period", 5*time.Second, "Period at which to heartbeat to the ring.")
-	f.DurationVar(&cfg.HeartbeatTimeout, "compactor.ring.heartbeat-timeout", time.Minute, "The heartbeat timeout after which compactors are considered unhealthy within the ring.")
+	f.DurationVar(&cfg.HeartbeatTimeout, "compactor.ring.heartbeat-timeout", time.Minute, "The heartbeat timeout after which compactors are considered unhealthy within the ring. 0 = never (timeout disabled).")
 
 	// Wait stability flags.
 	f.DurationVar(&cfg.WaitStabilityMinDuration, "compactor.ring.wait-stability-min-duration", time.Minute, "Minimum time to wait for ring stability at startup. 0 to disable.")
diff --git a/pkg/distributor/distributor_ring.go b/pkg/distributor/distributor_ring.go
index ee96a7d37c..ce70464e34 100644
--- a/pkg/distributor/distributor_ring.go
+++ b/pkg/distributor/distributor_ring.go
@@ -43,7 +43,7 @@ func (cfg *RingConfig) RegisterFlags(f *flag.FlagSet) {
 	// Ring flags
 	cfg.KVStore.RegisterFlagsWithPrefix("distributor.ring.", "collectors/", f)
 	f.DurationVar(&cfg.HeartbeatPeriod, "distributor.ring.heartbeat-period", 5*time.Second, "Period at which to heartbeat to the ring.")
-	f.DurationVar(&cfg.HeartbeatTimeout, "distributor.ring.heartbeat-timeout", time.Minute, "The heartbeat timeout after which distributors are considered unhealthy within the ring.")
+	f.DurationVar(&cfg.HeartbeatTimeout, "distributor.ring.heartbeat-timeout", time.Minute, "The heartbeat timeout after which distributors are considered unhealthy within the ring. 0 = never (timeout disabled).")
 
 	// Instance flags
 	cfg.InstanceInterfaceNames = []string{"eth0", "en0"}
diff --git a/pkg/ring/model.go b/pkg/ring/model.go
index c10281080e..dac6103b52 100644
--- a/pkg/ring/model.go
+++ b/pkg/ring/model.go
@@ -101,7 +101,7 @@ func (d *Desc) FindIngestersByState(state InstanceState) []InstanceDesc {
 func (d *Desc) Ready(now time.Time, heartbeatTimeout time.Duration) error {
 	numTokens := 0
 	for id, ingester := range d.Ingesters {
-		if now.Sub(time.Unix(ingester.Timestamp, 0)) > heartbeatTimeout {
+		if !ingester.IsHeartbeatHealthy(heartbeatTimeout, now) {
 			return fmt.Errorf("instance %s past heartbeat timeout", id)
 		} else if ingester.State != ACTIVE {
 			return fmt.Errorf("instance %s in state %v", id, ingester.State)
@@ -136,7 +136,16 @@ func (i *InstanceDesc) GetRegisteredAt() time.Time {
 
 func (i *InstanceDesc) IsHealthy(op Operation, heartbeatTimeout time.Duration, now time.Time) bool {
 	healthy := op.IsInstanceInStateHealthy(i.State)
-	return healthy && now.Unix()-i.Timestamp <= heartbeatTimeout.Milliseconds()/1000
+	return healthy && i.IsHeartbeatHealthy(heartbeatTimeout, now)
+}
+
+// IsHeartbeatHealthy returns whether the heartbeat timestamp for the ingester is within the
+// specified timeout period. A timeout of zero disables the check; the heartbeat timestamp is ignored.
+func (i *InstanceDesc) IsHeartbeatHealthy(heartbeatTimeout time.Duration, now time.Time) bool {
+	if heartbeatTimeout == 0 {
+		return true
+	}
+	return now.Sub(time.Unix(i.Timestamp, 0)) <= heartbeatTimeout
 }
 
 // Merge merges other ring into this one. Returns sub-ring that represents the change,
diff --git a/pkg/ring/model_test.go b/pkg/ring/model_test.go
index 9570abf17d..1d73e6f98b 100644
--- a/pkg/ring/model_test.go
+++ b/pkg/ring/model_test.go
@@ -136,10 +136,18 @@ func TestDesc_Ready(t *testing.T) {
 		t.Fatal("expected ready, got", err)
 	}
 
+	if err := r.Ready(now, 0); err != nil {
+		t.Fatal("expected ready, got", err)
+	}
+
 	if err := r.Ready(now.Add(5*time.Minute), 10*time.Second); err == nil {
 		t.Fatal("expected !ready (no heartbeat from active ingester), but got no error")
 	}
 
+	if err := r.Ready(now.Add(5*time.Minute), 0); err != nil {
+		t.Fatal("expected ready (no heartbeat but timeout disabled), got", err)
+	}
+
 	r = &Desc{
 		Ingesters: map[string]InstanceDesc{
 			"ing1": {
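Note: for illustration only, the zero-timeout semantics of the new
IsHeartbeatHealthy helper can be exercised with a small test-style
sketch. This is not part of the patch; it is a hypothetical test that
assumes it sits alongside the existing tests in pkg/ring (same imports
as model_test.go):

    func TestIsHeartbeatHealthy_ZeroTimeoutSketch(t *testing.T) {
    	now := time.Now()
    	// A heartbeat a full day old is past any non-zero timeout used here.
    	stale := InstanceDesc{State: ACTIVE, Timestamp: now.Add(-24 * time.Hour).Unix()}

    	if stale.IsHeartbeatHealthy(time.Minute, now) {
    		t.Fatal("expected unhealthy: heartbeat is older than the 1m timeout")
    	}
    	if !stale.IsHeartbeatHealthy(0, now) {
    		t.Fatal("expected healthy: a zero timeout disables the check")
    	}
    }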
diff --git a/pkg/ring/ring.go b/pkg/ring/ring.go
index 8d5bd0c925..72165ed369 100644
--- a/pkg/ring/ring.go
+++ b/pkg/ring/ring.go
@@ -147,7 +147,7 @@ func (cfg *Config) RegisterFlags(f *flag.FlagSet) {
 
 func (cfg *Config) RegisterFlagsWithPrefix(prefix string, f *flag.FlagSet) {
 	cfg.KVStore.RegisterFlagsWithPrefix(prefix, "collectors/", f)
-	f.DurationVar(&cfg.HeartbeatTimeout, prefix+"ring.heartbeat-timeout", time.Minute, "The heartbeat timeout after which ingesters are skipped for reads/writes.")
+	f.DurationVar(&cfg.HeartbeatTimeout, prefix+"ring.heartbeat-timeout", time.Minute, "The heartbeat timeout after which ingesters are skipped for reads/writes. 0 = never (timeout disabled).")
 	f.IntVar(&cfg.ReplicationFactor, prefix+"distributor.replication-factor", 3, "The number of ingesters to write to and read from.")
 	f.BoolVar(&cfg.ZoneAwarenessEnabled, prefix+"distributor.zone-awareness-enabled", false, "True to enable the zone-awareness and replicate ingested samples across different availability zones.")
 }
diff --git a/pkg/ring/ring_test.go b/pkg/ring/ring_test.go
index f1d038d291..3578f1f550 100644
--- a/pkg/ring/ring_test.go
+++ b/pkg/ring/ring_test.go
@@ -390,11 +390,11 @@ func TestRing_GetAllHealthy(t *testing.T) {
 }
 
 func TestRing_GetReplicationSetForOperation(t *testing.T) {
-	const heartbeatTimeout = time.Minute
 	now := time.Now()
 
 	tests := map[string]struct {
 		ringInstances           map[string]InstanceDesc
+		ringHeartbeatTimeout    time.Duration
 		ringReplicationFactor   int
 		expectedErrForRead      error
 		expectedSetForRead      []string
@@ -405,6 +405,7 @@ func TestRing_GetReplicationSetForOperation(t *testing.T) {
 	}{
 		"should return error on empty ring": {
 			ringInstances:         nil,
+			ringHeartbeatTimeout:  time.Minute,
 			ringReplicationFactor: 1,
 			expectedErrForRead:    ErrEmptyRing,
 			expectedErrForWrite:   ErrEmptyRing,
@@ -418,11 +419,41 @@ func TestRing_GetReplicationSetForOperation(t *testing.T) {
 				"instance-4": {Addr: "127.0.0.4", State: ACTIVE, Timestamp: now.Add(-30 * time.Second).Unix(), Tokens: GenerateTokens(128, nil)},
 				"instance-5": {Addr: "127.0.0.5", State: ACTIVE, Timestamp: now.Add(-40 * time.Second).Unix(), Tokens: GenerateTokens(128, nil)},
 			},
+			ringHeartbeatTimeout:  time.Minute,
 			ringReplicationFactor: 1,
 			expectedSetForRead:      []string{"127.0.0.1", "127.0.0.2", "127.0.0.3", "127.0.0.4", "127.0.0.5"},
 			expectedSetForWrite:     []string{"127.0.0.1", "127.0.0.2", "127.0.0.3", "127.0.0.4", "127.0.0.5"},
 			expectedSetForReporting: []string{"127.0.0.1", "127.0.0.2", "127.0.0.3", "127.0.0.4", "127.0.0.5"},
 		},
+		"should succeed on instances with old timestamps but heartbeat timeout disabled": {
+			ringInstances: map[string]InstanceDesc{
+				"instance-1": {Addr: "127.0.0.1", State: ACTIVE, Timestamp: now.Add(-2 * time.Minute).Unix(), Tokens: GenerateTokens(128, nil)},
+				"instance-2": {Addr: "127.0.0.2", State: ACTIVE, Timestamp: now.Add(-2 * time.Minute).Unix(), Tokens: GenerateTokens(128, nil)},
+				"instance-3": {Addr: "127.0.0.3", State: ACTIVE, Timestamp: now.Add(-2 * time.Minute).Unix(), Tokens: GenerateTokens(128, nil)},
+				"instance-4": {Addr: "127.0.0.4", State: ACTIVE, Timestamp: now.Add(-2 * time.Minute).Unix(), Tokens: GenerateTokens(128, nil)},
+				"instance-5": {Addr: "127.0.0.5", State: ACTIVE, Timestamp: now.Add(-2 * time.Minute).Unix(), Tokens: GenerateTokens(128, nil)},
+			},
+			ringHeartbeatTimeout:  0,
+			ringReplicationFactor: 1,
+			expectedSetForRead:      []string{"127.0.0.1", "127.0.0.2", "127.0.0.3", "127.0.0.4", "127.0.0.5"},
+			expectedSetForWrite:     []string{"127.0.0.1", "127.0.0.2", "127.0.0.3", "127.0.0.4", "127.0.0.5"},
+			expectedSetForReporting: []string{"127.0.0.1", "127.0.0.2", "127.0.0.3", "127.0.0.4", "127.0.0.5"},
+		},
+		"should succeed on instances with zero timestamp but heartbeat timeout disabled": {
+			ringInstances: map[string]InstanceDesc{
+				"instance-1": {Addr: "127.0.0.1", State: ACTIVE, Timestamp: 0, Tokens: GenerateTokens(128, nil)},
+				"instance-2": {Addr: "127.0.0.2", State: ACTIVE, Timestamp: 0, Tokens: GenerateTokens(128, nil)},
+				"instance-3": {Addr: "127.0.0.3", State: ACTIVE, Timestamp: 0, Tokens: GenerateTokens(128, nil)},
+				"instance-4": {Addr: "127.0.0.4", State: ACTIVE, Timestamp: 0, Tokens: GenerateTokens(128, nil)},
+				"instance-5": {Addr: "127.0.0.5", State: ACTIVE, Timestamp: 0, Tokens: GenerateTokens(128, nil)},
+			},
+			ringHeartbeatTimeout:  0,
+			ringReplicationFactor: 1,
+			expectedSetForRead:      []string{"127.0.0.1", "127.0.0.2", "127.0.0.3", "127.0.0.4", "127.0.0.5"},
+			expectedSetForWrite:     []string{"127.0.0.1", "127.0.0.2", "127.0.0.3", "127.0.0.4", "127.0.0.5"},
+			expectedSetForReporting: []string{"127.0.0.1", "127.0.0.2", "127.0.0.3", "127.0.0.4", "127.0.0.5"},
+		},
+
 		"should fail on 1 unhealthy instance and RF=1": {
 			ringInstances: map[string]InstanceDesc{
 				"instance-1": {Addr: "127.0.0.1", State: ACTIVE, Timestamp: now.Unix(), Tokens: GenerateTokens(128, nil)},
@@ -431,6 +462,7 @@ func TestRing_GetReplicationSetForOperation(t *testing.T) {
 				"instance-4": {Addr: "127.0.0.4", State: ACTIVE, Timestamp: now.Add(-30 * time.Second).Unix(), Tokens: GenerateTokens(128, nil)},
 				"instance-5": {Addr: "127.0.0.5", State: ACTIVE, Timestamp: now.Add(-2 * time.Minute).Unix(), Tokens: GenerateTokens(128, nil)},
 			},
+			ringHeartbeatTimeout:  time.Minute,
 			ringReplicationFactor: 1,
 			expectedErrForRead:    ErrTooManyUnhealthyInstances,
 			expectedErrForWrite:   ErrTooManyUnhealthyInstances,
@@ -444,6 +476,7 @@ func TestRing_GetReplicationSetForOperation(t *testing.T) {
 				"instance-4": {Addr: "127.0.0.4", State: ACTIVE, Timestamp: now.Add(-30 * time.Second).Unix(), Tokens: GenerateTokens(128, nil)},
 				"instance-5": {Addr: "127.0.0.5", State: ACTIVE, Timestamp: now.Add(-2 * time.Minute).Unix(), Tokens: GenerateTokens(128, nil)},
 			},
+			ringHeartbeatTimeout:  time.Minute,
 			ringReplicationFactor: 3,
 			expectedSetForRead:      []string{"127.0.0.1", "127.0.0.2", "127.0.0.3", "127.0.0.4"},
 			expectedSetForWrite:     []string{"127.0.0.1", "127.0.0.2", "127.0.0.3", "127.0.0.4"},
@@ -457,6 +490,7 @@ func TestRing_GetReplicationSetForOperation(t *testing.T) {
 				"instance-4": {Addr: "127.0.0.4", State: ACTIVE, Timestamp: now.Add(-2 * time.Minute).Unix(), Tokens: GenerateTokens(128, nil)},
 				"instance-5": {Addr: "127.0.0.5", State: ACTIVE, Timestamp: now.Add(-2 * time.Minute).Unix(), Tokens: GenerateTokens(128, nil)},
 			},
+			ringHeartbeatTimeout:  time.Minute,
 			ringReplicationFactor: 3,
 			expectedErrForRead:    ErrTooManyUnhealthyInstances,
 			expectedErrForWrite:   ErrTooManyUnhealthyInstances,
@@ -474,7 +508,7 @@ func TestRing_GetReplicationSetForOperation(t *testing.T) {
 
 			ring := Ring{
 				cfg: Config{
-					HeartbeatTimeout:  heartbeatTimeout,
+					HeartbeatTimeout:  testData.ringHeartbeatTimeout,
 					ReplicationFactor: testData.ringReplicationFactor,
 				},
 				ringDesc: ringDesc,
diff --git a/pkg/ruler/ruler_ring.go b/pkg/ruler/ruler_ring.go
index 20d2e900ab..a3902ed1c4 100644
--- a/pkg/ruler/ruler_ring.go
+++ b/pkg/ruler/ruler_ring.go
@@ -57,7 +57,7 @@ func (cfg *RingConfig) RegisterFlags(f *flag.FlagSet) {
 	// Ring flags
 	cfg.KVStore.RegisterFlagsWithPrefix("ruler.ring.", "rulers/", f)
 	f.DurationVar(&cfg.HeartbeatPeriod, "ruler.ring.heartbeat-period", 5*time.Second, "Period at which to heartbeat to the ring.")
-	f.DurationVar(&cfg.HeartbeatTimeout, "ruler.ring.heartbeat-timeout", time.Minute, "The heartbeat timeout after which rulers are considered unhealthy within the ring.")
+	f.DurationVar(&cfg.HeartbeatTimeout, "ruler.ring.heartbeat-timeout", time.Minute, "The heartbeat timeout after which rulers are considered unhealthy within the ring. 0 = never (timeout disabled).")
 
 	// Instance flags
 	cfg.InstanceInterfaceNames = []string{"eth0", "en0"}
diff --git a/pkg/storegateway/gateway_ring.go b/pkg/storegateway/gateway_ring.go
index 4ca5ed08a7..f32bf37ccd 100644
--- a/pkg/storegateway/gateway_ring.go
+++ b/pkg/storegateway/gateway_ring.go
@@ -95,7 +95,7 @@ func (cfg *RingConfig) RegisterFlags(f *flag.FlagSet) {
 	// Ring flags
 	cfg.KVStore.RegisterFlagsWithPrefix(ringFlagsPrefix, "collectors/", f)
 	f.DurationVar(&cfg.HeartbeatPeriod, ringFlagsPrefix+"heartbeat-period", 15*time.Second, "Period at which to heartbeat to the ring.")
-	f.DurationVar(&cfg.HeartbeatTimeout, ringFlagsPrefix+"heartbeat-timeout", time.Minute, "The heartbeat timeout after which store gateways are considered unhealthy within the ring."+sharedOptionWithQuerier)
+	f.DurationVar(&cfg.HeartbeatTimeout, ringFlagsPrefix+"heartbeat-timeout", time.Minute, "The heartbeat timeout after which store gateways are considered unhealthy within the ring. 0 = never (timeout disabled)."+sharedOptionWithQuerier)
 	f.IntVar(&cfg.ReplicationFactor, ringFlagsPrefix+"replication-factor", 3, "The replication factor to use when sharding blocks."+sharedOptionWithQuerier)
 	f.StringVar(&cfg.TokensFilePath, ringFlagsPrefix+"tokens-file-path", "", "File path where tokens are stored. If empty, tokens are not stored at shutdown and restored at startup.")
 	f.BoolVar(&cfg.ZoneAwarenessEnabled, ringFlagsPrefix+"zone-awareness-enabled", false, "True to enable zone-awareness and replicate blocks across different availability zones.")

From 98f97e9b174d98622868757c27fbb8f2eb1d2e2d Mon Sep 17 00:00:00 2001
From: Steve Simpson
Date: Wed, 7 Jul 2021 11:51:09 +0200
Subject: [PATCH 2/2] Review comments.

Signed-off-by: Steve Simpson
---
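Note: for illustration only (again in the zone git am ignores), the flags
listed under v1-guarantees below are opted into per component; a
hypothetical single-binary invocation for the distributor alone could be:

    cortex -config.file=cortex.yaml -target=distributor -distributor.ring.heartbeat-timeout=0

The other components follow the same pattern under their own prefixes,
e.g. -store-gateway.sharding-ring.heartbeat-timeout=0.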
 docs/configuration/v1-guarantees.md |  7 +++++++
 pkg/ring/ring_test.go               | 15 ---------------
 2 files changed, 7 insertions(+), 15 deletions(-)

diff --git a/docs/configuration/v1-guarantees.md b/docs/configuration/v1-guarantees.md
index 8c87104a22..36e39221ff 100644
--- a/docs/configuration/v1-guarantees.md
+++ b/docs/configuration/v1-guarantees.md
@@ -81,3 +81,10 @@ Currently experimental features are:
   - user config size (`-alertmanager.max-config-size-bytes`)
   - templates count in user config (`-alertmanager.max-templates-count`)
   - max template size (`-alertmanager.max-template-size-bytes`)
+- Disabling ring heartbeat timeouts
+  - `-distributor.ring.heartbeat-timeout=0`
+  - `-ring.heartbeat-timeout=0`
+  - `-ruler.ring.heartbeat-timeout=0`
+  - `-alertmanager.sharding-ring.heartbeat-timeout=0`
+  - `-compactor.ring.heartbeat-timeout=0`
+  - `-store-gateway.sharding-ring.heartbeat-timeout=0`
\ No newline at end of file
diff --git a/pkg/ring/ring_test.go b/pkg/ring/ring_test.go
index 3578f1f550..c4898e939c 100644
--- a/pkg/ring/ring_test.go
+++ b/pkg/ring/ring_test.go
@@ -439,21 +439,6 @@ func TestRing_GetReplicationSetForOperation(t *testing.T) {
 			expectedSetForWrite:     []string{"127.0.0.1", "127.0.0.2", "127.0.0.3", "127.0.0.4", "127.0.0.5"},
 			expectedSetForReporting: []string{"127.0.0.1", "127.0.0.2", "127.0.0.3", "127.0.0.4", "127.0.0.5"},
 		},
-		"should succeed on instances with zero timestamp but heartbeat timeout disabled": {
-			ringInstances: map[string]InstanceDesc{
-				"instance-1": {Addr: "127.0.0.1", State: ACTIVE, Timestamp: 0, Tokens: GenerateTokens(128, nil)},
-				"instance-2": {Addr: "127.0.0.2", State: ACTIVE, Timestamp: 0, Tokens: GenerateTokens(128, nil)},
-				"instance-3": {Addr: "127.0.0.3", State: ACTIVE, Timestamp: 0, Tokens: GenerateTokens(128, nil)},
-				"instance-4": {Addr: "127.0.0.4", State: ACTIVE, Timestamp: 0, Tokens: GenerateTokens(128, nil)},
-				"instance-5": {Addr: "127.0.0.5", State: ACTIVE, Timestamp: 0, Tokens: GenerateTokens(128, nil)},
-			},
-			ringHeartbeatTimeout:  0,
-			ringReplicationFactor: 1,
-			expectedSetForRead:      []string{"127.0.0.1", "127.0.0.2", "127.0.0.3", "127.0.0.4", "127.0.0.5"},
-			expectedSetForWrite:     []string{"127.0.0.1", "127.0.0.2", "127.0.0.3", "127.0.0.4", "127.0.0.5"},
-			expectedSetForReporting: []string{"127.0.0.1", "127.0.0.2", "127.0.0.3", "127.0.0.4", "127.0.0.5"},
-		},
-
 		"should fail on 1 unhealthy instance and RF=1": {
 			ringInstances: map[string]InstanceDesc{
 				"instance-1": {Addr: "127.0.0.1", State: ACTIVE, Timestamp: now.Unix(), Tokens: GenerateTokens(128, nil)},