Skip to content

Commit

Permalink
storage: Write HardState atomically with committing splits
Browse files Browse the repository at this point in the history
Prior to this change, an ill-timed crash (between applying the raft
command and calling splitPostApply) would leave the replica in a
persistently broken state (no HardState).

Found via jepsen.

Fixes cockroachdb#20629
Fixes cockroachdb#20494

Release note (bugfix): Fixed a replica corruption that could occur if
a process crashed in the middle of a range split.
  • Loading branch information
bdarnell committed Dec 13, 2017
1 parent 9c752ab commit 8496576
Show file tree
Hide file tree
Showing 2 changed files with 33 additions and 14 deletions.
18 changes: 17 additions & 1 deletion pkg/storage/replica.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ import (
"github.com/coreos/etcd/raft/raftpb"
"github.com/google/btree"
"github.com/kr/pretty"
"github.com/opentracing/opentracing-go"
opentracing "github.com/opentracing/opentracing-go"
"github.com/pkg/errors"
"golang.org/x/net/context"

Expand Down Expand Up @@ -4606,6 +4606,22 @@ func (r *Replica) processRaftCommand(
raftCmd.ReplicatedEvalResult.AddSSTable = nil
}

if raftCmd.ReplicatedEvalResult.Split != nil {
// Splits require a new HardState to be written to the new RHS
// range (and this needs to be atomic with the main batch). This
// cannot be constructed at evaluation time because it differs
// on each replica (votes may have already been cast on the
// uninitialized replica). Transform the write batch to add the
// updated HardState.
tmpBatch := r.store.engine.NewBatch()
if err := tmpBatch.ApplyBatchRepr(writeBatch.Data, false); err != nil {
log.Fatal(ctx, err)
}
splitPreApply(ctx, r.store.cfg.Settings, tmpBatch, raftCmd.ReplicatedEvalResult.Split.SplitTrigger)
writeBatch.Data = tmpBatch.Repr()
tmpBatch.Close()
}

var delta enginepb.MVCCStats
{
var err error
Expand Down
29 changes: 16 additions & 13 deletions pkg/storage/store.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ import (
"github.com/coreos/etcd/raft"
"github.com/coreos/etcd/raft/raftpb"
"github.com/google/btree"
"github.com/opentracing/opentracing-go"
opentracing "github.com/opentracing/opentracing-go"
"github.com/pkg/errors"
"golang.org/x/net/context"
"golang.org/x/time/rate"
Expand Down Expand Up @@ -1798,6 +1798,21 @@ func (s *Store) NewRangeDescriptor(
return desc, nil
}

// splitPreApply is called when the raft command is applied. Any
// changes to the given ReadWriter will be written atomically with the
// split commit.
func splitPreApply(
ctx context.Context, st *cluster.Settings, eng engine.ReadWriter, split roachpb.SplitTrigger,
) {
// Update the raft HardState with the new Commit value now that the
// replica is initialized (combining it with existing or default
// Term and Vote).
rsl := stateloader.Make(st, split.RightDesc.RangeID)
if err := rsl.SynthesizeRaftState(ctx, eng); err != nil {
log.Fatal(ctx, err)
}
}

// splitPostApply is the part of the split trigger which coordinates the actual
// split with the Store. Requires that Replica.raftMu is held.
//
Expand All @@ -1822,19 +1837,7 @@ func splitPostApply(
}
}

{
// Finish up the initialization of the RHS' RaftState now that we have
// committed the split Batch (which included the initialization of the
// ReplicaState). This will synthesize and persist the correct lastIndex and
// HardState.
rsl := stateloader.Make(r.store.cfg.Settings, split.RightDesc.RangeID)
if err := rsl.SynthesizeRaftState(ctx, r.store.Engine()); err != nil {
log.Fatal(ctx, err)
}
}

// Finish initialization of the RHS.

r.mu.Lock()
rightRng.mu.Lock()
// Copy the minLeaseProposedTS from the LHS.
Expand Down

0 comments on commit 8496576

Please sign in to comment.