From cc1e012bc033ebf387f3de41ab5b3fbcc9bdb1a7 Mon Sep 17 00:00:00 2001
From: Tobias Grieger
Date: Tue, 31 Jan 2023 08:28:05 +0000
Subject: [PATCH] kvserver: log if lease applies with a delay
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When we transfer a lease to a lagging follower, there's often a latency
blip that we get asked to investigate. This is time-consuming; it's
often very subtle to even figure out that it happened. We try to be
better about not doing it, but at least on 22.1 we know it's possible,
and we can't backport the rather involved fixes.

This warning makes it fairly obvious when it happens:

> W230131 [...] [T1,n2,s2,r23/3:‹/Table/2{1-2}›,raft] 165  lease repl=(n2,s2):3 seq=5 start=1675153630.108829000,0 epo=3 pro=1675153630.108829000,0 active after replication lag of ~0.58s; foreground traffic may have been impacted [prev=repl=(n3,s3):2 seq=4 start=1675153407.528408000,0 epo=2 pro=1675153419.837642000,0]
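
For intuition, the delay in the warning is just the wall-clock gap
between the lease's ProposedTS and the local clock at the time the
lease applies, compared against the 500ms threshold introduced below.
A minimal standalone sketch (not part of this patch; plain Go, using
the pro= timestamp and ~0.58s lag from the example log line above):

    package main

    import (
        "fmt"
        "time"
    )

    func main() {
        // Stand-ins for newLease.ProposedTS.WallTime and the local
        // clock's wall time at apply time (both in nanoseconds).
        proposedWallNanos := int64(1675153630108829000)
        nowWallNanos := proposedWallNanos + (580 * time.Millisecond).Nanoseconds()

        delay := time.Duration(nowWallNanos - proposedWallNanos)
        const slowLeaseApplyWarnThreshold = 500 * time.Millisecond
        if delay > slowLeaseApplyWarnThreshold {
            fmt.Printf("active after replication lag of ~%.2fs\n", delay.Seconds())
        }
    }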

Also log if we're acquiring an epoch-based lease following a
non-cooperatively expired expiration-based lease, which would suggest
that a lease transfer went to a node that couldn't service the lease.
This would likely have caused an outage, and the log message will
provide a way to pinpoint its end timestamp.

Addresses #95991.

Epic: none

Release note: None
---
 pkg/kv/kvserver/replica_proposal.go | 52 ++++++++++++++++++++++++++++-
 1 file changed, 51 insertions(+), 1 deletion(-)

diff --git a/pkg/kv/kvserver/replica_proposal.go b/pkg/kv/kvserver/replica_proposal.go
index f3e6ae4152e4..74ba60e0603f 100644
--- a/pkg/kv/kvserver/replica_proposal.go
+++ b/pkg/kv/kvserver/replica_proposal.go
@@ -347,10 +347,17 @@ func (r *Replica) leasePostApplyLocked(
 	requiresExpirationBasedLease := r.requiresExpiringLeaseRLocked()
 	hasExpirationBasedLease := newLease.Type() == roachpb.LeaseExpiration
 
+	now := r.store.Clock().NowAsClockTimestamp()
+
+	// NB: ProposedTS is non-nil in practice, but we never fully migrated it
+	// in, so we need to assume that it can be nil.
+	if iAmTheLeaseHolder && leaseChangingHands && newLease.ProposedTS != nil {
+		maybeLogSlowLeaseApplyWarning(ctx, time.Duration(now.WallTime-newLease.ProposedTS.WallTime), prevLease, newLease)
+	}
+
 	// Gossip the first range whenever its lease is acquired. We check to make
 	// sure the lease is active so that a trailing replica won't process an old
 	// lease request and attempt to gossip the first range.
-	now := r.store.Clock().NowAsClockTimestamp()
 	if leaseChangingHands && iAmTheLeaseHolder && r.IsFirstRange() && r.ownsValidLeaseRLocked(ctx, now) {
 		r.gossipFirstRangeLocked(ctx)
 	}
@@ -444,6 +451,49 @@ func (r *Replica) leasePostApplyLocked(
 	}
 }
 
+// maybeLogSlowLeaseApplyWarning is called when the lease changes hands on the
+// new leaseholder. It logs if either the new lease was proposed well before it
+// became visible on the leaseholder (indicating replication lag) or if the
+// previous lease looks like we transferred a lease to a behind/offline replica.
+func maybeLogSlowLeaseApplyWarning(
+	ctx context.Context, newLeaseAppDelay time.Duration, prevLease, newLease *roachpb.Lease,
+) {
+	const slowLeaseApplyWarnThreshold = 500 * time.Millisecond
+	if newLeaseAppDelay > slowLeaseApplyWarnThreshold {
+		// If we hold the lease now and the lease was proposed "earlier", there
+		// must have been replication lag, and possibly reads and/or writes were
+		// delayed.
+		//
+		// We see this most commonly with lease transfers targeting a behind
+		// replica, or, in the worst case, a snapshot. We are constantly
+		// improving our heuristics for avoiding that[^1], but if it does happen
+		// it's good to know from the logs.
+		//
+		// In the case of a lease transfer, the two timestamps compared below
+		// are from different clocks, so there could be skew. We just pretend
+		// this is not the case, which is good enough here.
+		//
+		// [^1]: https://github.com/cockroachdb/cockroach/pull/82758
+		log.Warningf(ctx,
+			"lease %v active after replication lag of ~%.2fs; foreground traffic may have been impacted [prev=%v]",
+			newLease, newLeaseAppDelay.Seconds(), prevLease,
+		)
+	} else if prevLease.Type() == roachpb.LeaseExpiration &&
+		newLease.Type() == roachpb.LeaseEpoch &&
+		newLease.AcquisitionType == roachpb.LeaseAcquisitionType_Request {
+		// If the previous lease is expiration-based, but the new lease is not
+		// and the acquisition was non-cooperative, it is likely that a lease
+		// transfer (which is expiration-based) went to a follower that then
+		// couldn't keep the lease alive (for example, didn't apply it in time
+		// for it to actually serve any traffic). The result was likely an
+		// outage which is resolving right now, so log to point this out.
+		log.Warningf(ctx,
+			"lease %v expired before being followed by lease %s; foreground traffic may have been impacted",
+			prevLease, newLease,
+		)
+	}
+}
+
 var addSSTPreApplyWarn = struct {
 	threshold time.Duration
 	log.EveryN