From d2e0bec9909f9ffd5e8c7d4389d712a7757e12a9 Mon Sep 17 00:00:00 2001 From: Ben Darnell Date: Wed, 13 Dec 2017 17:24:39 -0500 Subject: [PATCH] storage: Write HardState atomically with committing splits Prior to this change, an ill-timed crash (between applying the raft command and calling splitPostApply) would leave the replica in a persistently broken state (no HardState). Found via jepsen. Fixes #20629 Fixes #20494 Release note (bugfix): Fixed a replica corruption that could occur if a process crashed in the middle of a range split. --- pkg/storage/replica.go | 22 +++++++++++++++++++++- pkg/storage/store.go | 28 ++++++++++++++++------------ 2 files changed, 37 insertions(+), 13 deletions(-) diff --git a/pkg/storage/replica.go b/pkg/storage/replica.go index 053d01889b84..76bb527c5f15 100644 --- a/pkg/storage/replica.go +++ b/pkg/storage/replica.go @@ -30,7 +30,7 @@ import ( "github.com/coreos/etcd/raft/raftpb" "github.com/google/btree" "github.com/kr/pretty" - "github.com/opentracing/opentracing-go" + opentracing "github.com/opentracing/opentracing-go" "github.com/pkg/errors" "golang.org/x/net/context" @@ -4414,6 +4414,26 @@ func (r *Replica) processRaftCommand( raftCmd.ReplicatedEvalResult.AddSSTable = nil } + if raftCmd.ReplicatedEvalResult.Split != nil { + // Splits require a new HardState to be written to the new RHS + // range (and this needs to be atomic with the main batch). This + // cannot be constructed at evaluation time because it differs + // on each replica (votes may have already been cast on the + // uninitialized replica). Transform the write batch to add the + // updated HardState. + // See https://github.com/cockroachdb/cockroach/issues/20629 + // + // This is not the most efficient, but it only happens on splits, + // which are relatively infrequent and don't write much data. + tmpBatch := r.store.engine.NewBatch() + if err := tmpBatch.ApplyBatchRepr(writeBatch.Data, false); err != nil { + log.Fatal(ctx, err) + } + splitPreApply(ctx, r.store.cfg.Settings, tmpBatch, raftCmd.ReplicatedEvalResult.Split.SplitTrigger) + writeBatch.Data = tmpBatch.Repr() + tmpBatch.Close() + } + { var err error raftCmd.ReplicatedEvalResult.Delta, err = r.applyRaftCommand( diff --git a/pkg/storage/store.go b/pkg/storage/store.go index a63d2e2978c1..65da602c13a2 100644 --- a/pkg/storage/store.go +++ b/pkg/storage/store.go @@ -28,7 +28,7 @@ import ( "github.com/coreos/etcd/raft" "github.com/coreos/etcd/raft/raftpb" "github.com/google/btree" - "github.com/opentracing/opentracing-go" + opentracing "github.com/opentracing/opentracing-go" "github.com/pkg/errors" "golang.org/x/net/context" "golang.org/x/time/rate" @@ -1797,6 +1797,21 @@ func (s *Store) NewRangeDescriptor( return desc, nil } +// splitPreApply is called when the raft command is applied. Any +// changes to the given ReadWriter will be written atomically with the +// split commit. +func splitPreApply( + ctx context.Context, st *cluster.Settings, eng engine.ReadWriter, split roachpb.SplitTrigger, +) { + // Update the raft HardState with the new Commit value now that the + // replica is initialized (combining it with existing or default + // Term and Vote). + rsl := makeReplicaStateLoader(split.RightDesc.RangeID) + if err := rsl.synthesizeRaftState(ctx, eng); err != nil { + log.Fatal(ctx, err) + } +} + // splitPostApply is the part of the split trigger which coordinates the actual // split with the Store. Requires that Replica.raftMu is held. // @@ -1821,18 +1836,7 @@ func splitPostApply( } } - // Finish up the initialization of the RHS' RaftState now that we have - // committed the split Batch (which included the initialization of the - // ReplicaState). This will synthesize and persist the correct lastIndex and - // HardState. - if err := makeReplicaStateLoader(split.RightDesc.RangeID).synthesizeRaftState( - ctx, r.store.Engine(), - ); err != nil { - log.Fatal(ctx, err) - } - // Finish initialization of the RHS. - r.mu.Lock() rightRng.mu.Lock() // Copy the minLeaseProposedTS from the LHS.