Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

release-22.2: kvserver: log if lease applies with a delay #96804

Merged
merged 1 commit into from
Feb 14, 2023
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 51 additions & 1 deletion pkg/kv/kvserver/replica_proposal.go
Original file line number Diff line number Diff line change
Expand Up @@ -347,10 +347,17 @@ func (r *Replica) leasePostApplyLocked(
requiresExpirationBasedLease := r.requiresExpiringLeaseRLocked()
hasExpirationBasedLease := newLease.Type() == roachpb.LeaseExpiration

now := r.store.Clock().NowAsClockTimestamp()

// NB: ProposedTS is non-nil in practice, but we never fully migrated it
// in so we need to assume that it can be nil.
if iAmTheLeaseHolder && leaseChangingHands && newLease.ProposedTS != nil {
maybeLogSlowLeaseApplyWarning(ctx, time.Duration(now.WallTime-newLease.ProposedTS.WallTime), prevLease, newLease)
}

// Gossip the first range whenever its lease is acquired. We check to make
// sure the lease is active so that a trailing replica won't process an old
// lease request and attempt to gossip the first range.
now := r.store.Clock().NowAsClockTimestamp()
if leaseChangingHands && iAmTheLeaseHolder && r.IsFirstRange() && r.ownsValidLeaseRLocked(ctx, now) {
r.gossipFirstRangeLocked(ctx)
}
Expand Down Expand Up @@ -444,6 +451,49 @@ func (r *Replica) leasePostApplyLocked(
}
}

// maybeLogSlowLeaseApplyWarning is called when the lease changes hands on the
// new leaseholder. It logs if either the new lease was proposed well before it
// became visible on the leaseholder (indicating replication lag) or if the
// previous lease looks like we transferred a lease to a behind/offline replica.
func maybeLogSlowLeaseApplyWarning(
ctx context.Context, newLeaseAppDelay time.Duration, prevLease, newLease *roachpb.Lease,
) {
const slowLeaseApplyWarnThreshold = 500 * time.Millisecond
if newLeaseAppDelay > slowLeaseApplyWarnThreshold {
// If we hold the lease now and the lease was proposed "earlier", there
// must have been replication lag, and possibly reads and/or writes were
// delayed.
//
// We see this most commonly with lease transfers targeting a behind replica,
// or, in the worst case, a snapshot. We are constantly improving our
// heuristics for avoiding that[^1] but if it does happen it's good to know
// from the logs.
//
// In the case of a lease transfer, the two timestamps compared below are from
// different clocks, so there could be skew. We just pretend this is not the
// case, which is good enough here.
//
// [^1]: https://github.com/cockroachdb/cockroach/pull/82758
log.Warningf(ctx,
"lease %v active after replication lag of ~%.2fs; foreground traffic may have been impacted [prev=%v]",
newLease, newLeaseAppDelay.Seconds(), prevLease,
)
} else if prevLease.Type() == roachpb.LeaseExpiration &&
newLease.Type() == roachpb.LeaseEpoch &&
newLease.AcquisitionType == roachpb.LeaseAcquisitionType_Request {
// If the previous lease is expiration-based, but the new lease is not and
// the acquisition was non-cooperative, it is likely that a lease transfer
// (which is expiration-based) went to a follower that then couldn't hold
// the lease alive (for example, didn't apply it in time for it to
// actually serve any traffic). The result was likely an outage which
// resolves right now, so log to point this out.
log.Warningf(ctx,
"lease %v expired before being followed by lease %s; foreground traffic may have been impacted",
prevLease, newLease,
)
}
}

var addSSTPreApplyWarn = struct {
threshold time.Duration
log.EveryN
Expand Down