
Commit 3136880

feat: add backoff mechanism to the retention process (#14182)
Signed-off-by: Vladyslav Diachenko <[email protected]>
Parent: 9637790

File tree: 4 files changed, +120 / -27 lines

- docs/sources/shared/configuration.md (+13)
- pkg/compactor/compactor.go (+8 / -4)
- pkg/compactor/retention/retention.go (+36 / -18)
- pkg/compactor/retention/retention_test.go (+63 / -5)

docs/sources/shared/configuration.md (+13)

@@ -1900,6 +1900,19 @@ The `compactor` block configures the compactor component, which compacts index s
 # CLI flag: -compactor.retention-table-timeout
 [retention_table_timeout: <duration> | default = 0s]
 
+retention_backoff_config:
+  # Minimum delay when backing off.
+  # CLI flag: -compactor.retention-backoff-config.backoff-min-period
+  [min_period: <duration> | default = 100ms]
+
+  # Maximum delay when backing off.
+  # CLI flag: -compactor.retention-backoff-config.backoff-max-period
+  [max_period: <duration> | default = 10s]
+
+  # Number of times to backoff and retry before failing.
+  # CLI flag: -compactor.retention-backoff-config.backoff-retries
+  [max_retries: <int> | default = 10]
+
 # Store used for managing delete requests.
 # CLI flag: -compactor.delete-request-store
 [delete_request_store: <string> | default = ""]
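
The new `retention_backoff_config` block decodes into dskit's `backoff.Config` (see the `RetentionBackoffConfig` field added in pkg/compactor/compactor.go below). As an orientation sketch only, the Go field names and YAML tags here are assumed from the upstream `github.com/grafana/dskit/backoff` package rather than shown in this diff:

```go
// Orientation sketch: a local mirror of dskit's backoff.Config showing how the
// documented YAML keys and defaults line up with Go fields. The real struct
// lives in github.com/grafana/dskit/backoff; the names here are assumptions.
package main

import (
	"fmt"
	"time"
)

type retentionBackoffConfig struct {
	MinBackoff time.Duration `yaml:"min_period"`  // min_period,  default 100ms
	MaxBackoff time.Duration `yaml:"max_period"`  // max_period,  default 10s
	MaxRetries int           `yaml:"max_retries"` // max_retries, default 10
}

func main() {
	// The documented defaults, i.e. what you get without setting any of the
	// new YAML keys or CLI flags.
	defaults := retentionBackoffConfig{
		MinBackoff: 100 * time.Millisecond,
		MaxBackoff: 10 * time.Second,
		MaxRetries: 10,
	}
	fmt.Printf("%+v\n", defaults)
}
```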

pkg/compactor/compactor.go (+8 / -4)

@@ -12,13 +12,15 @@ import (
 	"time"
 
 	"github.com/go-kit/log/level"
-	"github.com/grafana/dskit/kv"
-	"github.com/grafana/dskit/ring"
-	"github.com/grafana/dskit/services"
 	"github.com/pkg/errors"
 	"github.com/prometheus/client_golang/prometheus"
 	"github.com/prometheus/common/model"
 
+	"github.com/grafana/dskit/backoff"
+	"github.com/grafana/dskit/kv"
+	"github.com/grafana/dskit/ring"
+	"github.com/grafana/dskit/services"
+
 	"github.com/grafana/loki/v3/pkg/analytics"
 	"github.com/grafana/loki/v3/pkg/compactor/deletion"
 	"github.com/grafana/loki/v3/pkg/compactor/retention"
@@ -77,6 +79,7 @@ type Config struct {
 	RetentionDeleteDelay        time.Duration  `yaml:"retention_delete_delay"`
 	RetentionDeleteWorkCount    int            `yaml:"retention_delete_worker_count"`
 	RetentionTableTimeout       time.Duration  `yaml:"retention_table_timeout"`
+	RetentionBackoffConfig      backoff.Config `yaml:"retention_backoff_config"`
 	DeleteRequestStore          string         `yaml:"delete_request_store"`
 	DeleteRequestStoreKeyPrefix string         `yaml:"delete_request_store_key_prefix"`
 	DeleteBatchSize             int            `yaml:"delete_batch_size"`
@@ -110,6 +113,7 @@ func (cfg *Config) RegisterFlags(f *flag.FlagSet) {
 	f.IntVar(&cfg.TablesToCompact, "compactor.tables-to-compact", 0, "Number of tables that compactor will try to compact. Newer tables are chosen when this is less than the number of tables available.")
 	f.IntVar(&cfg.SkipLatestNTables, "compactor.skip-latest-n-tables", 0, "Do not compact N latest tables. Together with -compactor.run-once and -compactor.tables-to-compact, this is useful when clearing compactor backlogs.")
 
+	cfg.RetentionBackoffConfig.RegisterFlagsWithPrefix("compactor.retention-backoff-config", f)
 	// Ring
 	skipFlags := []string{
 		"compactor.ring.num-tokens",
@@ -323,7 +327,7 @@ func (c *Compactor) init(objectStoreClients map[config.DayTime]client.ObjectClie
 	}
 	chunkClient := client.NewClient(objectClient, encoder, schemaConfig)
 
-	sc.sweeper, err = retention.NewSweeper(retentionWorkDir, chunkClient, c.cfg.RetentionDeleteWorkCount, c.cfg.RetentionDeleteDelay, r)
+	sc.sweeper, err = retention.NewSweeper(retentionWorkDir, chunkClient, c.cfg.RetentionDeleteWorkCount, c.cfg.RetentionDeleteDelay, c.cfg.RetentionBackoffConfig, r)
 	if err != nil {
 		return fmt.Errorf("failed to init sweeper: %w", err)
 	}
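
The single `RegisterFlagsWithPrefix` call above is what produces the three CLI flags listed in configuration.md. A minimal sketch of that expansion, with the flag names taken from the docs diff and the dskit call used exactly as in the hunk above; this is an illustration, not code from the commit:

```go
// Minimal sketch: register backoff.Config under the compactor's prefix and list
// the flags dskit creates. The printed names should match the documented
// -compactor.retention-backoff-config.backoff-{min-period,max-period,retries}.
package main

import (
	"flag"
	"fmt"

	"github.com/grafana/dskit/backoff"
)

func main() {
	var cfg backoff.Config
	fs := flag.NewFlagSet("compactor", flag.ContinueOnError)
	cfg.RegisterFlagsWithPrefix("compactor.retention-backoff-config", fs)

	fs.VisitAll(func(f *flag.Flag) {
		fmt.Printf("-%s (default %s): %s\n", f.Name, f.DefValue, f.Usage)
	})
}
```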

pkg/compactor/retention/retention.go (+36 / -18)

@@ -11,6 +11,7 @@ import (
 
 	"github.com/go-kit/log"
 	"github.com/go-kit/log/level"
+	"github.com/grafana/dskit/backoff"
 	"github.com/prometheus/client_golang/prometheus"
 	"github.com/prometheus/common/model"
 	"github.com/prometheus/prometheus/model/labels"
@@ -272,9 +273,17 @@ type Sweeper struct {
 	markerProcessor MarkerProcessor
 	chunkClient     ChunkClient
 	sweeperMetrics  *sweeperMetrics
+	backoffConfig   backoff.Config
 }
 
-func NewSweeper(workingDir string, deleteClient ChunkClient, deleteWorkerCount int, minAgeDelete time.Duration, r prometheus.Registerer) (*Sweeper, error) {
+func NewSweeper(
+	workingDir string,
+	deleteClient ChunkClient,
+	deleteWorkerCount int,
+	minAgeDelete time.Duration,
+	backoffConfig backoff.Config,
+	r prometheus.Registerer,
+) (*Sweeper, error) {
 	m := newSweeperMetrics(r)
 
 	p, err := newMarkerStorageReader(workingDir, deleteWorkerCount, minAgeDelete, m)
@@ -285,34 +294,43 @@ func NewSweeper(workingDir string, deleteClient ChunkClient, deleteWorkerCount i
 		markerProcessor: p,
 		chunkClient:     deleteClient,
 		sweeperMetrics:  m,
+		backoffConfig:   backoffConfig,
 	}, nil
 }
 
 func (s *Sweeper) Start() {
-	s.markerProcessor.Start(func(ctx context.Context, chunkId []byte) error {
-		status := statusSuccess
-		start := time.Now()
-		defer func() {
-			s.sweeperMetrics.deleteChunkDurationSeconds.WithLabelValues(status).Observe(time.Since(start).Seconds())
-		}()
-		chunkIDString := unsafeGetString(chunkId)
-		userID, err := getUserIDFromChunkID(chunkId)
-		if err != nil {
-			return err
-		}
+	s.markerProcessor.Start(s.deleteChunk)
+}
 
+func (s *Sweeper) deleteChunk(ctx context.Context, chunkID []byte) error {
+	status := statusSuccess
+	start := time.Now()
+	defer func() {
+		s.sweeperMetrics.deleteChunkDurationSeconds.WithLabelValues(status).Observe(time.Since(start).Seconds())
+	}()
+	chunkIDString := unsafeGetString(chunkID)
+	userID, err := getUserIDFromChunkID(chunkID)
+	if err != nil {
+		return err
+	}
+
+	retry := backoff.New(ctx, s.backoffConfig)
+	for retry.Ongoing() {
 		err = s.chunkClient.DeleteChunk(ctx, unsafeGetString(userID), chunkIDString)
+		if err == nil {
+			return nil
+		}
 		if s.chunkClient.IsChunkNotFoundErr(err) {
 			status = statusNotFound
 			level.Debug(util_log.Logger).Log("msg", "delete on not found chunk", "chunkID", chunkIDString)
 			return nil
 		}
-		if err != nil {
-			level.Error(util_log.Logger).Log("msg", "error deleting chunk", "chunkID", chunkIDString, "err", err)
-			status = statusFailure
-		}
-		return err
-	})
+		retry.Wait()
+	}
+
+	level.Error(util_log.Logger).Log("msg", "error deleting chunk", "chunkID", chunkIDString, "err", err)
+	status = statusFailure
+	return err
 }
 
 func getUserIDFromChunkID(chunkID []byte) ([]byte, error) {
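
In short, `deleteChunk` now wraps `ChunkClient.DeleteChunk` in a dskit backoff loop: success and "chunk not found" return immediately, any other error is retried, and the last error is logged and returned once retries are exhausted. A self-contained sketch of that pattern with a stand-in `deleteOnce` function instead of the real chunk client; the dskit backoff API (`New`/`Ongoing`/`Wait`/`Err`) is assumed from its use in the diff above:

```go
// Minimal sketch of the retry pattern used by Sweeper.deleteChunk.
// deleteOnce stands in for ChunkClient.DeleteChunk.
package main

import (
	"context"
	"errors"
	"fmt"
	"time"

	"github.com/grafana/dskit/backoff"
)

var errTransient = errors.New("transient storage error")

func deleteWithRetries(ctx context.Context, cfg backoff.Config, deleteOnce func() error) error {
	retry := backoff.New(ctx, cfg)
	var err error
	for retry.Ongoing() {
		if err = deleteOnce(); err == nil {
			return nil // deleted successfully
		}
		retry.Wait() // sleep between MinBackoff and MaxBackoff before the next attempt
	}
	// Retries exhausted or context cancelled: surface the last error,
	// mirroring how deleteChunk logs and returns it.
	if err == nil {
		err = retry.Err()
	}
	return err
}

func main() {
	calls := 0
	// Fails the first call and succeeds on the second, like the test's mock client.
	flaky := func() error {
		calls++
		if calls == 1 {
			return errTransient
		}
		return nil
	}
	cfg := backoff.Config{MinBackoff: 10 * time.Millisecond, MaxBackoff: 100 * time.Millisecond, MaxRetries: 3}
	fmt.Println(deleteWithRetries(context.Background(), cfg, flaky)) // <nil>
}
```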

pkg/compactor/retention/retention_test.go (+63 / -5)

@@ -4,6 +4,7 @@ import (
 	"context"
 	"crypto/sha256"
 	"encoding/base64"
+	"fmt"
 	"os"
 	"path"
 	"path/filepath"
@@ -14,6 +15,7 @@ import (
 	"testing"
 	"time"
 
+	"github.com/grafana/dskit/backoff"
 	"github.com/prometheus/client_golang/prometheus"
 	"github.com/prometheus/common/model"
 	"github.com/prometheus/prometheus/model/labels"
@@ -32,14 +34,37 @@ import (
 )
 
 type mockChunkClient struct {
-	mtx           sync.Mutex
-	deletedChunks map[string]struct{}
+	mtx              sync.Mutex
+	deletedChunks    map[string]struct{}
+	unstableDeletion bool
+	perObjectCounter map[string]uint32
+}
+
+// newMockChunkClient creates a client that fails every first call to DeleteChunk if `unstableDeletion` is true.
+func newMockChunkClient(unstableDeletion bool) *mockChunkClient {
+	return &mockChunkClient{
+		deletedChunks:    map[string]struct{}{},
+		unstableDeletion: unstableDeletion,
+		perObjectCounter: map[string]uint32{},
+	}
+}
+
+// shouldFail returns true for every first call
+func (m *mockChunkClient) shouldFail(objectKey string) bool {
+	if !m.unstableDeletion {
+		return false
+	}
+	shouldFail := m.perObjectCounter[objectKey]%2 == 0
+	m.perObjectCounter[objectKey]++
+	return shouldFail
 }
 
 func (m *mockChunkClient) DeleteChunk(_ context.Context, _, chunkID string) error {
 	m.mtx.Lock()
 	defer m.mtx.Unlock()
-
+	if m.shouldFail(chunkID) {
+		return fmt.Errorf("chunk deletion for chunkID:%s is failed by mockChunkClient", chunkID)
+	}
 	m.deletedChunks[string([]byte(chunkID))] = struct{}{} // forces a copy, because this string is only valid within the delete fn.
 	return nil
 }
@@ -144,8 +169,9 @@ func Test_Retention(t *testing.T) {
 			// marks and sweep
 			expiration := NewExpirationChecker(tt.limits)
 			workDir := filepath.Join(t.TempDir(), "retention")
-			chunkClient := &mockChunkClient{deletedChunks: map[string]struct{}{}}
-			sweep, err := NewSweeper(workDir, chunkClient, 10, 0, nil)
+			// must not fail the process because deletion must be retried
+			chunkClient := newMockChunkClient(true)
+			sweep, err := NewSweeper(workDir, chunkClient, 10, 0, backoff.Config{MaxRetries: 2}, nil)
 			require.NoError(t, err)
 			sweep.Start()
 			defer sweep.Stop()
@@ -176,6 +202,38 @@ func Test_Retention(t *testing.T) {
 		}
 	}
 }
 
+func Test_Sweeper_deleteChunk(t *testing.T) {
+	chunkID := "1/3fff2c2d7595e046:1916fa8c4bd:1916fdfb33d:bd55fc5"
+	tests := map[string]struct {
+		maxRetries    int
+		expectedError error
+	}{
+		"expected error if chunk is not deleted and retry is disabled": {
+			maxRetries:    1,
+			expectedError: fmt.Errorf("chunk deletion for chunkID:%s is failed by mockChunkClient", chunkID),
+		},
+		"expected no error if chunk is not deleted at the first attempt but retried": {
+			maxRetries: 2,
+		},
+	}
+	for name, data := range tests {
+		t.Run(name, func(t *testing.T) {
+			workDir := filepath.Join(t.TempDir(), "retention")
+			chunkClient := newMockChunkClient(true)
+			sweep, err := NewSweeper(workDir, chunkClient, 10, 0, backoff.Config{MaxRetries: data.maxRetries}, nil)
+			require.NoError(t, err)
+
+			err = sweep.deleteChunk(context.Background(), []byte(chunkID))
+			if data.expectedError != nil {
+				require.Equal(t, data.expectedError, err)
+			} else {
+				require.NoError(t, err)
+			}
+		})
+	}
+
+}
+
 type noopWriter struct {
 	count int64
 }
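
One detail behind the test table: with dskit's backoff, `MaxRetries: 1` allows exactly one attempt, so retrying is effectively disabled and the mock's first-call failure is returned, while `MaxRetries: 2` permits a second attempt that the alternating mock then lets succeed. A small sketch of that semantics, assuming `Ongoing`/`Wait` behave as used in retention.go above:

```go
// Sketch: how MaxRetries bounds the number of attempts made by a
// backoff-driven loop like deleteChunk.
package main

import (
	"context"
	"fmt"

	"github.com/grafana/dskit/backoff"
)

func attemptsAllowed(maxRetries int) int {
	attempts := 0
	retry := backoff.New(context.Background(), backoff.Config{MaxRetries: maxRetries})
	for retry.Ongoing() {
		attempts++
		retry.Wait()
	}
	return attempts
}

func main() {
	fmt.Println(attemptsAllowed(1)) // 1: single attempt, the mock failure propagates
	fmt.Println(attemptsAllowed(2)) // 2: the second attempt succeeds in the test
}
```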
