
Commit 48bbf5c

Add backoff to flush op
This commit adds a configurable backoff to flush ops in the ingester. This prevents situations where the store put operation fails fast (e.g. 401 Unauthorized), which can cause ingesters to be rate limited.
1 parent 21dd4af commit 48bbf5c
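
For context, the retry loop added in pkg/ingester/flush.go uses dskit's standard backoff helper. Below is a minimal, standalone sketch of that pattern, not code from this commit: doFlush is a hypothetical stand-in for the real flushUserSeries call, and the values mirror the new flag defaults registered in pkg/ingester/ingester.go.

package main

import (
	"context"
	"errors"
	"fmt"
	"time"

	"github.com/grafana/dskit/backoff"
)

// doFlush stands in for the real store write; it always fails here so the
// retry path is exercised.
func doFlush(ctx context.Context) error {
	return errors.New("store put failed: 401 Unauthorized")
}

func main() {
	// One timeout bounds the whole retry loop, as in the new flushOp.
	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Minute)
	defer cancel()

	b := backoff.New(ctx, backoff.Config{
		MinBackoff: 10 * time.Second, // flag default: ingester.flush-op-backoff-min-period
		MaxBackoff: time.Minute,      // flag default: ingester.flush-op-backoff-max-period
		MaxRetries: 10,               // flag default: ingester.flush-op-backoff-retries
	})
	for b.Ongoing() {
		if err := doFlush(ctx); err == nil {
			break
		}
		b.Wait() // sleeps with exponential backoff, returning early if ctx is done
	}

	if err := b.Err(); err != nil {
		// Either MaxRetries was exhausted or the deadline expired.
		if errors.Is(err, context.DeadlineExceeded) {
			fmt.Println("flush timed out:", err)
			return
		}
		fmt.Println("flush failed:", err)
	}
}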

3 files changed: +149 −6 lines changed

pkg/ingester/flush.go (+36 −5)
@@ -2,12 +2,14 @@ package ingester
 
 import (
 	"bytes"
+	"errors"
 	"fmt"
 	"net/http"
 	"sync"
 	"time"
 
 	"github.com/go-kit/log/level"
+	"github.com/grafana/dskit/backoff"
 	"github.com/grafana/dskit/ring"
 	"github.com/grafana/dskit/user"
 	"github.com/prometheus/client_golang/prometheus"
@@ -147,7 +149,7 @@ func (i *Ingester) flushLoop(j int) {
 		}
 		op := o.(*flushOp)
 
-		err := i.flushUserSeries(op.userID, op.fp, op.immediate)
+		err := i.flushOp(op)
 		if err != nil {
 			level.Error(util_log.WithUserID(op.userID, i.logger)).Log("msg", "failed to flush", "err", err)
 		}
@@ -161,7 +163,39 @@ func (i *Ingester) flushLoop(j int) {
 	}
 }
 
-func (i *Ingester) flushUserSeries(userID string, fp model.Fingerprint, immediate bool) error {
+func (i *Ingester) flushOp(op *flushOp) error {
+	// A flush is retried until either it is successful, the maximum number
+	// of retries is exceeded, or the timeout has expired. The context is
+	// used to cancel the backoff should the latter happen.
+	ctx, cancelFunc := context.WithTimeout(context.Background(), i.cfg.FlushOpTimeout)
+	defer cancelFunc()
+
+	b := backoff.New(ctx, i.cfg.FlushOpBackoff)
+	for b.Ongoing() {
+		err := i.flushUserSeries(ctx, op.userID, op.fp, op.immediate)
+		if err == nil {
+			break
+		}
+		level.Error(util_log.WithUserID(op.userID, i.logger)).Log("msg", "failed to flush", "retries", b.NumRetries(), "err", err)
+		b.Wait()
+	}
+
+	if err := b.Err(); err != nil {
+		// If we got here then either the maximum number of retries have
+		// been exceeded or the timeout expired. We do not need to check
+		// ctx.Err() as it is checked in b.Err().
+		if errors.Is(err, context.DeadlineExceeded) {
+			return fmt.Errorf("timed out after %s: %w", i.cfg.FlushOpTimeout, err)
+		}
+		return err
+	}
+
+	return nil
+}
+
+func (i *Ingester) flushUserSeries(ctx context.Context, userID string, fp model.Fingerprint, immediate bool) error {
+	ctx = user.InjectOrgID(context.Background(), userID)
+
 	instance, ok := i.getInstanceByID(userID)
 	if !ok {
 		return nil
@@ -175,9 +209,6 @@ func (i *Ingester) flushUserSeries(userID string, fp model.Fingerprint, immediat
 	lbs := labels.String()
 	level.Info(i.logger).Log("msg", "flushing stream", "user", userID, "fp", fp, "immediate", immediate, "num_chunks", len(chunks), "labels", lbs)
 
-	ctx := user.InjectOrgID(context.Background(), userID)
-	ctx, cancel := context.WithTimeout(ctx, i.cfg.FlushOpTimeout)
-	defer cancel()
 	err := i.flushChunks(ctx, fp, labels, chunks, chunkMtx)
 	if err != nil {
 		return fmt.Errorf("failed to flush chunks: %w, num_chunks: %d, labels: %s", err, len(chunks), lbs)
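
Note the change in timeout semantics above: previously flushUserSeries created its own timeout context around a single store write, so FlushOpTimeout bounded one attempt. Now the timeout context is created once in flushOp and bounds the entire retry loop; when it expires, b.Err() returns context.DeadlineExceeded and the error is wrapped as "timed out after <FlushOpTimeout>", which keeps the timeout case distinguishable from plain retry exhaustion ("terminated after N retries").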

pkg/ingester/flush_test.go (+98)
@@ -1,6 +1,7 @@
 package ingester
 
 import (
+	"errors"
 	"fmt"
 	"os"
 	"sort"
@@ -102,6 +103,99 @@ func Benchmark_FlushLoop(b *testing.B) {
 	}
 }
 
+func Test_FlushOp(t *testing.T) {
+	t.Run("no error", func(t *testing.T) {
+		cfg := defaultIngesterTestConfig(t)
+		cfg.FlushOpBackoff.MinBackoff = time.Second
+		cfg.FlushOpBackoff.MaxBackoff = 10 * time.Second
+		cfg.FlushOpBackoff.MaxRetries = 1
+		cfg.FlushCheckPeriod = 100 * time.Millisecond
+
+		_, ing := newTestStore(t, cfg, nil)
+
+		ctx := user.InjectOrgID(context.Background(), "foo")
+		ins, err := ing.GetOrCreateInstance("foo")
+		require.NoError(t, err)
+
+		lbs := makeRandomLabels()
+		req := &logproto.PushRequest{Streams: []logproto.Stream{{
+			Labels:  lbs.String(),
+			Entries: entries(5, time.Now()),
+		}}}
+		require.NoError(t, ins.Push(ctx, req))
+
+		time.Sleep(cfg.FlushCheckPeriod)
+		require.NoError(t, ing.flushOp(&flushOp{
+			immediate: true,
+			userID:    "foo",
+			fp:        ins.getHashForLabels(lbs),
+		}))
+	})
+
+	t.Run("max retries exceeded", func(t *testing.T) {
+		cfg := defaultIngesterTestConfig(t)
+		cfg.FlushOpBackoff.MinBackoff = time.Second
+		cfg.FlushOpBackoff.MaxBackoff = 10 * time.Second
+		cfg.FlushOpBackoff.MaxRetries = 1
+		cfg.FlushCheckPeriod = 100 * time.Millisecond
+
+		store, ing := newTestStore(t, cfg, nil)
+		store.onPut = func(_ context.Context, _ []chunk.Chunk) error {
+			return errors.New("failed to write chunks")
+		}
+
+		ctx := user.InjectOrgID(context.Background(), "foo")
+		ins, err := ing.GetOrCreateInstance("foo")
+		require.NoError(t, err)
+
+		lbs := makeRandomLabels()
+		req := &logproto.PushRequest{Streams: []logproto.Stream{{
+			Labels:  lbs.String(),
+			Entries: entries(5, time.Now()),
+		}}}
+		require.NoError(t, ins.Push(ctx, req))
+
+		time.Sleep(cfg.FlushCheckPeriod)
+		require.EqualError(t, ing.flushOp(&flushOp{
+			immediate: true,
+			userID:    "foo",
+			fp:        ins.getHashForLabels(lbs),
+		}), "terminated after 1 retries")
+	})
+
+	t.Run("timeout expired", func(t *testing.T) {
+		cfg := defaultIngesterTestConfig(t)
+		cfg.FlushOpBackoff.MinBackoff = time.Second
+		cfg.FlushOpBackoff.MaxBackoff = 10 * time.Second
+		cfg.FlushOpBackoff.MaxRetries = 1
+		cfg.FlushOpTimeout = time.Second
+		cfg.FlushCheckPeriod = 100 * time.Millisecond
+
+		store, ing := newTestStore(t, cfg, nil)
+		store.onPut = func(_ context.Context, _ []chunk.Chunk) error {
+			return errors.New("store is unavailable")
+		}
+
+		ctx := user.InjectOrgID(context.Background(), "foo")
+		ins, err := ing.GetOrCreateInstance("foo")
+		require.NoError(t, err)
+
+		lbs := makeRandomLabels()
+		req := &logproto.PushRequest{Streams: []logproto.Stream{{
+			Labels:  lbs.String(),
+			Entries: entries(5, time.Now()),
+		}}}
+		require.NoError(t, ins.Push(ctx, req))
+
+		time.Sleep(cfg.FlushCheckPeriod)
+		require.EqualError(t, ing.flushOp(&flushOp{
+			immediate: true,
+			userID:    "foo",
+			fp:        ins.getHashForLabels(lbs),
+		}), "timed out after 1s: context deadline exceeded")
+	})
+}
+
 func Test_Flush(t *testing.T) {
 	var (
 		store, ing = newTestStore(t, defaultIngesterTestConfig(t), nil)
@@ -297,6 +391,10 @@ func defaultIngesterTestConfig(t testing.TB) Config {
 
 	cfg := Config{}
 	flagext.DefaultValues(&cfg)
+	cfg.FlushOpBackoff.MinBackoff = 100 * time.Millisecond
+	cfg.FlushOpBackoff.MaxBackoff = 10 * time.Second
+	cfg.FlushOpBackoff.MaxRetries = 1
+	cfg.FlushOpTimeout = 15 * time.Second
 	cfg.FlushCheckPeriod = 99999 * time.Hour
 	cfg.MaxChunkIdle = 99999 * time.Hour
 	cfg.ConcurrentFlushes = 1
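
The new Test_FlushOp cases drive flushOp directly: each pushes a stream into a test instance, the failure cases override the test store's onPut hook so every write fails, and the assertions pin the exact error strings ("terminated after 1 retries" for retry exhaustion, "timed out after 1s: context deadline exceeded" for the deadline path). With the repository checked out, they can be run in isolation with the usual Go tooling, e.g. go test ./pkg/ingester -run Test_FlushOp.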

pkg/ingester/ingester.go (+15 −1)
@@ -20,6 +20,7 @@ import (
 
 	"github.com/go-kit/log"
 	"github.com/go-kit/log/level"
+	"github.com/grafana/dskit/backoff"
 	"github.com/grafana/dskit/concurrency"
 	"github.com/grafana/dskit/modules"
 	"github.com/grafana/dskit/multierror"
@@ -81,6 +82,7 @@ type Config struct {
 
 	ConcurrentFlushes int            `yaml:"concurrent_flushes"`
 	FlushCheckPeriod  time.Duration  `yaml:"flush_check_period"`
+	FlushOpBackoff    backoff.Config `yaml:"flush_op_backoff"`
 	FlushOpTimeout    time.Duration  `yaml:"flush_op_timeout"`
 	RetainPeriod      time.Duration  `yaml:"chunk_retain_period"`
 	MaxChunkIdle      time.Duration  `yaml:"chunk_idle_period"`
@@ -126,7 +128,10 @@ func (cfg *Config) RegisterFlags(f *flag.FlagSet) {
 
 	f.IntVar(&cfg.ConcurrentFlushes, "ingester.concurrent-flushes", 32, "How many flushes can happen concurrently from each stream.")
 	f.DurationVar(&cfg.FlushCheckPeriod, "ingester.flush-check-period", 30*time.Second, "How often should the ingester see if there are any blocks to flush. The first flush check is delayed by a random time up to 0.8x the flush check period. Additionally, there is +/- 1% jitter added to the interval.")
-	f.DurationVar(&cfg.FlushOpTimeout, "ingester.flush-op-timeout", 10*time.Minute, "The timeout before a flush is cancelled.")
+	f.DurationVar(&cfg.FlushOpBackoff.MinBackoff, "ingester.flush-op-backoff-min-period", 10*time.Second, "Minimum backoff period when a flush fails. Each concurrent flush has its own backoff, see `ingester.concurrent-flushes`.")
+	f.DurationVar(&cfg.FlushOpBackoff.MaxBackoff, "ingester.flush-op-backoff-max-period", time.Minute, "Maximum backoff period when a flush fails. Each concurrent flush has its own backoff, see `ingester.concurrent-flushes`.")
+	f.IntVar(&cfg.FlushOpBackoff.MaxRetries, "ingester.flush-op-backoff-retries", 10, "Maximum retries for failed flushes. Is canceled when `ingester.flush-op-timeout` is exceeded.")
+	f.DurationVar(&cfg.FlushOpTimeout, "ingester.flush-op-timeout", 10*time.Minute, "The timeout before a flush is canceled.")
 	f.DurationVar(&cfg.RetainPeriod, "ingester.chunks-retain-period", 0, "How long chunks should be retained in-memory after they've been flushed.")
 	f.DurationVar(&cfg.MaxChunkIdle, "ingester.chunks-idle-period", 30*time.Minute, "How long chunks should sit in-memory with no updates before being flushed if they don't hit the max block size. This means that half-empty chunks will still be flushed after a certain period as long as they receive no further activity.")
 	f.IntVar(&cfg.BlockSize, "ingester.chunks-block-size", 256*1024, "The targeted _uncompressed_ size in bytes of a chunk block When this threshold is exceeded the head block will be cut and compressed inside the chunk.")
@@ -154,6 +159,15 @@ func (cfg *Config) Validate() error {
 		return err
 	}
 
+	if cfg.FlushOpBackoff.MinBackoff > cfg.FlushOpBackoff.MaxBackoff {
+		return errors.New("invalid flush op min backoff: cannot be larger than max backoff")
+	}
+	if cfg.FlushOpBackoff.MaxRetries <= 0 {
+		return fmt.Errorf("invalid flush op max retries: %s", cfg.FlushOpBackoff.MaxRetries)
+	}
+	if cfg.FlushOpTimeout <= 0 {
+		return fmt.Errorf("invalid flush op timeout: %s", cfg.FlushOpTimeout)
+	}
 	if cfg.IndexShards <= 0 {
 		return fmt.Errorf("invalid ingester index shard factor: %d", cfg.IndexShards)
 	}
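
For completeness, here is a minimal sketch of how the new settings fit together programmatically. It is a hypothetical helper, not part of the commit; it assumes dskit's flagext package (already used by defaultIngesterTestConfig in the test diff above) and would live inside the ingester package. The values are the flag defaults registered in RegisterFlags.

package ingester

import (
	"time"

	"github.com/grafana/dskit/flagext"
)

// exampleFlushBackoffConfig is a hypothetical helper, not part of this commit.
// It sets the new flush backoff options to the flag defaults and validates them.
func exampleFlushBackoffConfig() (Config, error) {
	var cfg Config
	flagext.DefaultValues(&cfg)                      // applies flag defaults, including the new flush-op backoff flags
	cfg.FlushOpBackoff.MinBackoff = 10 * time.Second // -ingester.flush-op-backoff-min-period
	cfg.FlushOpBackoff.MaxBackoff = time.Minute      // -ingester.flush-op-backoff-max-period
	cfg.FlushOpBackoff.MaxRetries = 10               // -ingester.flush-op-backoff-retries
	cfg.FlushOpTimeout = 10 * time.Minute            // -ingester.flush-op-timeout
	// Validate rejects MinBackoff > MaxBackoff, MaxRetries <= 0, and FlushOpTimeout <= 0.
	return cfg, cfg.Validate()
}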
