Skip to content

Commit

Permalink
add retry limit for excluded backoff type to avoid infinite retry (#1015
Browse files Browse the repository at this point in the history
)

* add retry limit for excluded backoff type to avoid infinite retry

Signed-off-by: cfzjywxk <[email protected]>

* lower the log level

Signed-off-by: cfzjywxk <[email protected]>

---------

Signed-off-by: cfzjywxk <[email protected]>
  • Loading branch information
cfzjywxk authored Oct 12, 2023
1 parent 32c4ef5 commit 35e4902
Show file tree
Hide file tree
Showing 4 changed files with 32 additions and 6 deletions.
4 changes: 2 additions & 2 deletions internal/locate/region_request.go
Original file line number Diff line number Diff line change
Expand Up @@ -1791,7 +1791,7 @@ func (s *RegionRequestSender) onRegionError(bo *retry.Backoffer, ctx *RPCContext
return true, nil
}
}
logutil.Logger(bo.GetCtx()).Warn(
logutil.Logger(bo.GetCtx()).Debug(
"tikv reports `ServerIsBusy` retry later",
zap.String("reason", regionErr.GetServerIsBusy().GetReason()),
zap.Stringer("ctx", ctx))
Expand Down Expand Up @@ -1895,7 +1895,7 @@ func (s *RegionRequestSender) onRegionError(bo *retry.Backoffer, ctx *RPCContext
// This error is specific to stale read and the target replica is randomly selected. If the request is sent
// to the leader, the data must be ready, so we don't backoff here.
if regionErr.GetDataIsNotReady() != nil {
logutil.BgLogger().Warn("tikv reports `DataIsNotReady` retry later",
logutil.BgLogger().Debug("tikv reports `DataIsNotReady` retry later",
zap.Uint64("store-id", ctx.Store.storeID),
zap.Uint64("peer-id", regionErr.GetDataIsNotReady().GetPeerId()),
zap.Uint64("region-id", regionErr.GetDataIsNotReady().GetRegionId()),
Expand Down
11 changes: 9 additions & 2 deletions internal/retry/backoff.go
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,13 @@ func (b *Backoffer) BackoffWithCfgAndMaxSleep(cfg *Config, maxSleepMs int, err e
if b.noop {
return err
}
if b.maxSleep > 0 && (b.totalSleep-b.excludedSleep) >= b.maxSleep {
maxBackoffTimeExceeded := (b.totalSleep - b.excludedSleep) >= b.maxSleep
maxExcludedTimeExceeded := false
if maxLimit, ok := isSleepExcluded[cfg.name]; ok {
maxExcludedTimeExceeded = b.excludedSleep >= maxLimit && b.excludedSleep >= b.maxSleep
}
maxTimeExceeded := maxBackoffTimeExceeded || maxExcludedTimeExceeded
if b.maxSleep > 0 && maxTimeExceeded {
longestSleepCfg, longestSleepTime := b.longestSleepCfg()
errMsg := fmt.Sprintf("%s backoffer.maxSleep %dms is exceeded, errors:", cfg.String(), b.maxSleep)
for i, err := range b.errors {
Expand All @@ -163,7 +169,8 @@ func (b *Backoffer) BackoffWithCfgAndMaxSleep(cfg *Config, maxSleepMs int, err e
backoffDetail.WriteString(":")
backoffDetail.WriteString(strconv.Itoa(times))
}
errMsg += fmt.Sprintf("\ntotal-backoff-times: %v, backoff-detail: %v", totalTimes, backoffDetail.String())
errMsg += fmt.Sprintf("\ntotal-backoff-times: %v, backoff-detail: %v, maxBackoffTimeExceeded: %v, maxExcludedTimeExceeded: %v",
totalTimes, backoffDetail.String(), maxBackoffTimeExceeded, maxExcludedTimeExceeded)
returnedErr := err
if longestSleepCfg != nil {
errMsg += fmt.Sprintf("\nlongest sleep type: %s, time: %dms", longestSleepCfg.String(), longestSleepTime)
Expand Down
12 changes: 12 additions & 0 deletions internal/retry/backoff_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -95,3 +95,15 @@ func TestBackoffDeepCopy(t *testing.T) {
assert.ErrorIs(t, err, BoMaxDataNotReady.err)
}
}

func TestBackoffWithMaxExcludedExceed(t *testing.T) {
setBackoffExcluded(BoTiKVServerBusy.name, 1)
b := NewBackofferWithVars(context.TODO(), 1, nil)
err := b.Backoff(BoTiKVServerBusy, errors.New("server is busy"))
assert.Nil(t, err)

// As the total excluded sleep is greater than the max limited value, error should be returned.
err = b.Backoff(BoTiKVServerBusy, errors.New("server is busy"))
assert.NotNil(t, err)
assert.Greater(t, b.excludedSleep, b.maxSleep)
}
11 changes: 9 additions & 2 deletions internal/retry/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -129,11 +129,18 @@ var (
BoTxnLockFast = NewConfig(txnLockFastName, &metrics.BackoffHistogramLockFast, NewBackoffFnCfg(2, 3000, EqualJitter), tikverr.ErrResolveLockTimeout)
)

var isSleepExcluded = map[string]struct{}{
BoTiKVServerBusy.name: {},
var isSleepExcluded = map[string]int{
BoTiKVServerBusy.name: 600000, // The max excluded limit is 10min.
// add BoTiFlashServerBusy if appropriate
}

// setBackoffExcluded is used for test only.
func setBackoffExcluded(name string, maxVal int) {
if _, ok := isSleepExcluded[name]; ok {
isSleepExcluded[name] = maxVal
}
}

const (
// NoJitter makes the backoff sequence strict exponential.
NoJitter = 1 + iota
Expand Down

0 comments on commit 35e4902

Please sign in to comment.