-
Notifications
You must be signed in to change notification settings - Fork 3.8k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
sql/backfill: Implement retry mechanism during index backfill
During an index backfill, if the bulkio.index_backfill.batch_size setting is not appropriately configured for a table's size or definition, it may consume all available memory before writing the new index entries. This change introduces a retry mechanism to handle out-of-memory scenarios. Upon encountering memory issues, the batch size is halved on each retry, coupled with an exponential backoff. This backoff period allows the consumer of the index entries to free up memory. The retry mechanism reuses the same bound memory account, making it critical to accurately track memory usage, even during failed attempts. This ensures proper accounting and frees memory consumed during the failed operation. Epic: CRDB-37796. Closes #130939, #132048. Release note (bug fix): The schema changer's backfill process now includes a retry mechanism that reduces the batch size when memory issues occur. This improves the likelihood of operation success without requiring manual adjustment of the bulkio.index_backfill.batch_size parameter.
- Loading branch information
Showing
6 changed files
with
185 additions
and
27 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,81 @@ | ||
// Copyright 2024 The Cockroach Authors. | ||
// | ||
// Use of this software is governed by the CockroachDB Software License | ||
// included in the /LICENSE file. | ||
|
||
package rowexec | ||
|
||
import ( | ||
"context" | ||
"testing" | ||
"time" | ||
|
||
"github.com/cockroachdb/cockroach/pkg/base" | ||
"github.com/cockroachdb/cockroach/pkg/security/username" | ||
"github.com/cockroachdb/cockroach/pkg/sql/isql" | ||
"github.com/cockroachdb/cockroach/pkg/sql/sqlerrors" | ||
"github.com/cockroachdb/cockroach/pkg/testutils/serverutils" | ||
"github.com/cockroachdb/cockroach/pkg/util/leaktest" | ||
"github.com/cockroachdb/cockroach/pkg/util/log" | ||
"github.com/cockroachdb/cockroach/pkg/util/mon" | ||
"github.com/cockroachdb/cockroach/pkg/util/retry" | ||
"github.com/stretchr/testify/require" | ||
) | ||
|
||
func TestRetryOfIndexEntryBatch(t *testing.T) { | ||
defer leaktest.AfterTest(t)() | ||
defer log.Scope(t).Close(t) | ||
|
||
ctx := context.Background() | ||
srv := serverutils.StartServerOnly(t, base.TestServerArgs{}) | ||
defer srv.Stopper().Stop(ctx) | ||
db := srv.SystemLayer().InternalDB().(isql.DB) | ||
|
||
const initialChunkSize int64 = 50000 | ||
oomErr := mon.NewMemoryBudgetExceededError(1, 1, 1) | ||
nonOomErr := sqlerrors.NewUndefinedUserError(username.NodeUserName()) | ||
|
||
for _, tc := range []struct { | ||
desc string | ||
errs []error | ||
retryErr error | ||
expectedErr error | ||
expectedChunkSize int64 | ||
}{ | ||
{"happy-path", nil, nil, nil, initialChunkSize}, | ||
{"retry-once", []error{oomErr}, nil, nil, initialChunkSize >> 1}, | ||
{"retry-then-fail", []error{oomErr, oomErr, nonOomErr}, nil, nonOomErr, initialChunkSize >> 2}, | ||
{"retry-exhaustive", []error{oomErr, oomErr, oomErr, oomErr}, nil, oomErr, initialChunkSize >> 3}, | ||
{"retry-error", []error{oomErr}, nonOomErr, oomErr, initialChunkSize}, | ||
} { | ||
t.Run(tc.desc, func(t *testing.T) { | ||
i := 0 | ||
br := indexBatchRetry{ | ||
nextChunkSize: initialChunkSize, | ||
retryOpts: retry.Options{ | ||
InitialBackoff: 2 * time.Millisecond, | ||
Multiplier: 2, | ||
MaxRetries: 2, | ||
MaxBackoff: 10 * time.Millisecond, | ||
}, | ||
buildIndexChunk: func(ctx context.Context, txn isql.Txn) error { | ||
if i < len(tc.errs) { | ||
return tc.errs[i] | ||
} | ||
return nil | ||
}, | ||
resetForNextAttempt: func(ctx context.Context) error { | ||
i++ | ||
return tc.retryErr | ||
}, | ||
} | ||
err := br.buildBatchWithRetry(ctx, db) | ||
if tc.expectedErr == nil { | ||
require.NoError(t, err) | ||
} else { | ||
require.ErrorIs(t, err, tc.expectedErr) | ||
} | ||
require.Equal(t, tc.expectedChunkSize, br.nextChunkSize) | ||
}) | ||
} | ||
} |