Skip to content

Commit 11e1976

Browse files
authored
fix(blooms): Delete outdated metas during planning (#13363)
1 parent d8cc1ce commit 11e1976

File tree

5 files changed

+187
-54
lines changed

5 files changed

+187
-54
lines changed

pkg/bloombuild/planner/metrics.go

+9-6
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,9 @@ const (
1515

1616
statusSuccess = "success"
1717
statusFailure = "failure"
18+
19+
phasePlanning = "planning"
20+
phaseBuilding = "building"
1821
)
1922

2023
type Metrics struct {
@@ -33,8 +36,8 @@ type Metrics struct {
3336
buildTime *prometheus.HistogramVec
3437
buildLastSuccess prometheus.Gauge
3538

36-
blocksDeleted prometheus.Counter
37-
metasDeleted prometheus.Counter
39+
blocksDeleted *prometheus.CounterVec
40+
metasDeleted *prometheus.CounterVec
3841

3942
tenantsDiscovered prometheus.Counter
4043
tenantTasksPlanned *prometheus.GaugeVec
@@ -127,18 +130,18 @@ func NewMetrics(
127130
Help: "Unix timestamp of the last successful build cycle.",
128131
}),
129132

130-
blocksDeleted: promauto.With(r).NewCounter(prometheus.CounterOpts{
133+
blocksDeleted: promauto.With(r).NewCounterVec(prometheus.CounterOpts{
131134
Namespace: metricsNamespace,
132135
Subsystem: metricsSubsystem,
133136
Name: "blocks_deleted_total",
134137
Help: "Number of blocks deleted",
135-
}),
136-
metasDeleted: promauto.With(r).NewCounter(prometheus.CounterOpts{
138+
}, []string{"phase"}),
139+
metasDeleted: promauto.With(r).NewCounterVec(prometheus.CounterOpts{
137140
Namespace: metricsNamespace,
138141
Subsystem: metricsSubsystem,
139142
Name: "metas_deleted_total",
140143
Help: "Number of metas deleted",
141-
}),
144+
}, []string{"phase"}),
142145

143146
tenantsDiscovered: promauto.With(r).NewCounter(prometheus.CounterOpts{
144147
Namespace: metricsNamespace,

pkg/bloombuild/planner/planner.go

+35-20
Original file line numberDiff line numberDiff line change
@@ -339,6 +339,13 @@ func (p *Planner) computeTasks(
339339
return nil, nil, fmt.Errorf("failed to get metas: %w", err)
340340
}
341341

342+
// In case the planner restarted before deleting outdated metas in the previous iteration,
343+
// we delete them during the planning phase to avoid reprocessing them.
344+
metas, err = p.deleteOutdatedMetasAndBlocks(ctx, table, tenant, metas, phasePlanning)
345+
if err != nil {
346+
return nil, nil, fmt.Errorf("failed to delete outdated metas during planning: %w", err)
347+
}
348+
342349
for _, ownershipRange := range ownershipRanges {
343350
logger := log.With(logger, "ownership", ownershipRange.String())
344351

@@ -351,9 +358,6 @@ func (p *Planner) computeTasks(
351358
level.Error(logger).Log("msg", "failed to find outdated gaps", "err", err)
352359
continue
353360
}
354-
if len(gaps) == 0 {
355-
continue
356-
}
357361

358362
for _, gap := range gaps {
359363
tasks = append(tasks, protos.NewTask(table, tenant, ownershipRange, gap.tsdb, gap.gaps))
@@ -424,51 +428,62 @@ func (p *Planner) processTenantTaskResults(
424428
}
425429

426430
combined := append(originalMetas, newMetas...)
427-
outdated := outdatedMetas(combined)
428-
if len(outdated) == 0 {
429-
level.Debug(logger).Log("msg", "no outdated metas found")
430-
return tasksSucceed, nil
431-
}
432-
433-
level.Debug(logger).Log("msg", "found outdated metas", "outdated", len(outdated))
434-
if err := p.deleteOutdatedMetasAndBlocks(ctx, table, tenant, outdated); err != nil {
431+
if _, err := p.deleteOutdatedMetasAndBlocks(ctx, table, tenant, combined, phaseBuilding); err != nil {
435432
return 0, fmt.Errorf("failed to delete outdated metas: %w", err)
436433
}
437434

438435
return tasksSucceed, nil
439436
}
440437

438+
// deleteOutdatedMetasAndBlocks filters out the outdated metas from the `metas` argument and deletes them from the store.
439+
// It returns the up-to-date metas from the `metas` argument.
441440
func (p *Planner) deleteOutdatedMetasAndBlocks(
442441
ctx context.Context,
443442
table config.DayTable,
444443
tenant string,
445444
metas []bloomshipper.Meta,
446-
) error {
447-
logger := log.With(p.logger, "table", table.Addr(), "tenant", tenant)
445+
phase string,
446+
) ([]bloomshipper.Meta, error) {
447+
logger := log.With(p.logger, "table", table.Addr(), "tenant", tenant, "phase", phase)
448+
449+
upToDate, outdated := outdatedMetas(metas)
450+
if len(outdated) == 0 {
451+
level.Debug(logger).Log(
452+
"msg", "no outdated metas found",
453+
"upToDate", len(upToDate),
454+
)
455+
return upToDate, nil
456+
}
457+
458+
level.Debug(logger).Log(
459+
"msg", "found outdated metas",
460+
"outdated", len(outdated),
461+
"upToDate", len(upToDate),
462+
)
448463

449464
client, err := p.bloomStore.Client(table.ModelTime())
450465
if err != nil {
451466
level.Error(logger).Log("msg", "failed to get client", "err", err)
452-
return errors.Wrap(err, "failed to get client")
467+
return nil, errors.Wrap(err, "failed to get client")
453468
}
454469

455470
var (
456471
deletedMetas int
457472
deletedBlocks int
458473
)
459474
defer func() {
460-
p.metrics.metasDeleted.Add(float64(deletedMetas))
461-
p.metrics.blocksDeleted.Add(float64(deletedBlocks))
475+
p.metrics.metasDeleted.WithLabelValues(phase).Add(float64(deletedMetas))
476+
p.metrics.blocksDeleted.WithLabelValues(phase).Add(float64(deletedBlocks))
462477
}()
463478

464-
for _, meta := range metas {
479+
for _, meta := range outdated {
465480
for _, block := range meta.Blocks {
466481
if err := client.DeleteBlocks(ctx, []bloomshipper.BlockRef{block}); err != nil {
467482
if client.IsObjectNotFoundErr(err) {
468483
level.Debug(logger).Log("msg", "block not found while attempting delete, continuing", "block", block.String())
469484
} else {
470485
level.Error(logger).Log("msg", "failed to delete block", "err", err, "block", block.String())
471-
return errors.Wrap(err, "failed to delete block")
486+
return nil, errors.Wrap(err, "failed to delete block")
472487
}
473488
}
474489

@@ -482,7 +497,7 @@ func (p *Planner) deleteOutdatedMetasAndBlocks(
482497
level.Debug(logger).Log("msg", "meta not found while attempting delete, continuing", "meta", meta.MetaRef.String())
483498
} else {
484499
level.Error(logger).Log("msg", "failed to delete meta", "err", err, "meta", meta.MetaRef.String())
485-
return errors.Wrap(err, "failed to delete meta")
500+
return nil, errors.Wrap(err, "failed to delete meta")
486501
}
487502
}
488503
deletedMetas++
@@ -495,7 +510,7 @@ func (p *Planner) deleteOutdatedMetasAndBlocks(
495510
"blocks", deletedBlocks,
496511
)
497512

498-
return nil
513+
return upToDate, nil
499514
}
500515

501516
func (p *Planner) tables(ts time.Time) *dayRangeIterator {

pkg/bloombuild/planner/planner_test.go

+100-13
Original file line numberDiff line numberDiff line change
@@ -403,6 +403,7 @@ func createPlanner(
403403
HardLimit: flagext.Bytes(20 << 20),
404404
TTL: time.Hour,
405405
},
406+
CacheListOps: false,
406407
},
407408
FSConfig: local.FSConfig{
408409
Directory: t.TempDir(),
@@ -796,19 +797,7 @@ func Test_processTenantTaskResults(t *testing.T) {
796797
},
797798
)
798799
require.NoError(t, err)
799-
800-
// TODO(salvacorts): Fix this
801-
// For some reason, when the tests are run in the CI, we do not encode the `loc` of model.Time for each TSDB.
802-
// As a result, when we fetch them, the loc is empty whereas in the original metas, it is not. Therefore the
803-
// comparison fails. As a workaround to fix the issue, we will manually reset the TS of the sources to the
804-
// fetched metas
805-
for i := range metas {
806-
for j := range metas[i].Sources {
807-
sec := metas[i].Sources[j].TS.Unix()
808-
nsec := metas[i].Sources[j].TS.Nanosecond()
809-
metas[i].Sources[j].TS = time.Unix(sec, int64(nsec))
810-
}
811-
}
800+
removeLocFromMetasSources(metas)
812801

813802
// Compare metas
814803
require.Equal(t, len(tc.expectedMetas), len(metas))
@@ -817,6 +806,104 @@ func Test_processTenantTaskResults(t *testing.T) {
817806
}
818807
}
819808

809+
// For some reason, when the tests are run in the CI, we do not encode the `loc` of model.Time for each TSDB.
810+
// As a result, when we fetch them, the loc is empty whereas in the original metas, it is not. Therefore the
811+
// comparison fails. As a workaround to fix the issue, we will manually reset the TS of the sources to the
812+
// fetched metas
813+
func removeLocFromMetasSources(metas []bloomshipper.Meta) []bloomshipper.Meta {
814+
for i := range metas {
815+
for j := range metas[i].Sources {
816+
sec := metas[i].Sources[j].TS.Unix()
817+
nsec := metas[i].Sources[j].TS.Nanosecond()
818+
metas[i].Sources[j].TS = time.Unix(sec, int64(nsec))
819+
}
820+
}
821+
822+
return metas
823+
}
824+
825+
func Test_deleteOutdatedMetas(t *testing.T) {
826+
for _, tc := range []struct {
827+
name string
828+
originalMetas []bloomshipper.Meta
829+
expectedUpToDateMetas []bloomshipper.Meta
830+
}{
831+
{
832+
name: "no metas",
833+
},
834+
{
835+
name: "only up to date metas",
836+
originalMetas: []bloomshipper.Meta{
837+
genMeta(0, 10, []int{0}, []bloomshipper.BlockRef{genBlockRef(0, 10)}),
838+
genMeta(10, 20, []int{0}, []bloomshipper.BlockRef{genBlockRef(10, 20)}),
839+
},
840+
expectedUpToDateMetas: []bloomshipper.Meta{
841+
genMeta(0, 10, []int{0}, []bloomshipper.BlockRef{genBlockRef(0, 10)}),
842+
genMeta(10, 20, []int{0}, []bloomshipper.BlockRef{genBlockRef(10, 20)}),
843+
},
844+
},
845+
{
846+
name: "outdated metas",
847+
originalMetas: []bloomshipper.Meta{
848+
genMeta(0, 5, []int{0}, []bloomshipper.BlockRef{genBlockRef(0, 5)}),
849+
genMeta(6, 10, []int{0}, []bloomshipper.BlockRef{genBlockRef(6, 10)}),
850+
genMeta(0, 10, []int{1}, []bloomshipper.BlockRef{genBlockRef(0, 10)}),
851+
},
852+
expectedUpToDateMetas: []bloomshipper.Meta{
853+
genMeta(0, 10, []int{1}, []bloomshipper.BlockRef{genBlockRef(0, 10)}),
854+
},
855+
},
856+
} {
857+
t.Run(tc.name, func(t *testing.T) {
858+
logger := log.NewNopLogger()
859+
//logger := log.NewLogfmtLogger(os.Stdout)
860+
861+
cfg := Config{
862+
PlanningInterval: 1 * time.Hour,
863+
MaxQueuedTasksPerTenant: 10000,
864+
}
865+
planner := createPlanner(t, cfg, &fakeLimits{}, logger)
866+
867+
bloomClient, err := planner.bloomStore.Client(testDay.ModelTime())
868+
require.NoError(t, err)
869+
870+
// Create original metas and blocks
871+
err = putMetas(bloomClient, tc.originalMetas)
872+
require.NoError(t, err)
873+
874+
// Get all metas
875+
metas, err := planner.bloomStore.FetchMetas(
876+
context.Background(),
877+
bloomshipper.MetaSearchParams{
878+
TenantID: "fakeTenant",
879+
Interval: bloomshipper.NewInterval(testTable.Bounds()),
880+
Keyspace: v1.NewBounds(0, math.MaxUint64),
881+
},
882+
)
883+
require.NoError(t, err)
884+
removeLocFromMetasSources(metas)
885+
require.ElementsMatch(t, tc.originalMetas, metas)
886+
887+
upToDate, err := planner.deleteOutdatedMetasAndBlocks(context.Background(), testTable, "fakeTenant", tc.originalMetas, phasePlanning)
888+
require.NoError(t, err)
889+
require.ElementsMatch(t, tc.expectedUpToDateMetas, upToDate)
890+
891+
// Get all metas
892+
metas, err = planner.bloomStore.FetchMetas(
893+
context.Background(),
894+
bloomshipper.MetaSearchParams{
895+
TenantID: "fakeTenant",
896+
Interval: bloomshipper.NewInterval(testTable.Bounds()),
897+
Keyspace: v1.NewBounds(0, math.MaxUint64),
898+
},
899+
)
900+
require.NoError(t, err)
901+
removeLocFromMetasSources(metas)
902+
require.ElementsMatch(t, tc.expectedUpToDateMetas, metas)
903+
})
904+
}
905+
}
906+
820907
type fakeBuilder struct {
821908
mx sync.Mutex // Protects tasks and currTaskIdx.
822909
id string

pkg/bloombuild/planner/versioned_range.go

+12-2
Original file line numberDiff line numberDiff line change
@@ -210,8 +210,9 @@ func (t tsdbTokenRange) reassemble(from int) tsdbTokenRange {
210210
return t[:len(t)-(reassembleTo-from)]
211211
}
212212

213-
func outdatedMetas(metas []bloomshipper.Meta) []bloomshipper.Meta {
213+
func outdatedMetas(metas []bloomshipper.Meta) ([]bloomshipper.Meta, []bloomshipper.Meta) {
214214
var outdated []bloomshipper.Meta
215+
var upToDate []bloomshipper.Meta
215216

216217
// Sort metas descending by most recent source when checking
217218
// for outdated metas (older metas are discarded if they don't change the range).
@@ -254,8 +255,17 @@ func outdatedMetas(metas []bloomshipper.Meta) []bloomshipper.Meta {
254255
tokenRange, added = tokenRange.Add(version, meta.Bounds)
255256
if !added {
256257
outdated = append(outdated, meta)
258+
continue
257259
}
260+
261+
upToDate = append(upToDate, meta)
258262
}
259263

260-
return outdated
264+
// We previously sorted the input metas by their TSDB source TS, therefore, they may not be sorted by FP anymore.
265+
// We need to re-sort them by their FP to match the original order.
266+
sort.Slice(upToDate, func(i, j int) bool {
267+
return upToDate[i].Bounds.Less(upToDate[j].Bounds)
268+
})
269+
270+
return upToDate, outdated
261271
}

0 commit comments

Comments
 (0)