Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

core/state/snapshot: fix journal recovery #21775

Merged
merged 3 commits into from
Nov 4, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
77 changes: 71 additions & 6 deletions core/blockchain_snapshot_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,10 +43,11 @@ import (
// (v) Geth restarts normally, but it's requested to be rewound to a lower point via SetHead
// (vi) Geth restarts normally with a stale snapshot
type snapshotTest struct {
legacy bool // Flag whether the loaded snapshot is in legacy format
crash bool // Flag whether the Geth restarts from the previous crash
gapped int // Number of blocks to insert without enabling snapshot
setHead uint64 // Block number to set head back to
legacy bool // Flag whether the loaded snapshot is in legacy format
crash bool // Flag whether the Geth restarts from the previous crash
restartCrash int // Number of blocks to insert after the normal stop, then the crash happens
gapped int // Number of blocks to insert without enabling snapshot
setHead uint64 // Block number to set head back to

chainBlocks int // Number of blocks to generate for the canonical chain
snapshotBlock uint64 // Block number of the relevant snapshot disk layer
Expand Down Expand Up @@ -565,10 +566,50 @@ func TestSetHeadWithLegacySnapshot(t *testing.T) {
})
}

// Tests the Geth was running with snapshot(legacy-format) enabled and upgrades
// the disk layer journal(journal generator) to latest format. After that the Geth
// is restarted from a crash. In this case Geth will find the new-format disk layer
// journal but with legacy-format diff journal(the new-format is never committed),
// and the invalid diff journal is expected to be dropped.
func TestRecoverSnapshotFromCrashWithLegacyDiffJournal(t *testing.T) {
// Chain:
// G->C1->C2->C3->C4->C5->C6->C7->C8 (HEAD)
//
// Commit: G
// Snapshot: G
//
// SetHead(0)
//
// ------------------------------
//
// Expected in leveldb:
// G->C1->C2->C3->C4->C5->C6->C7->C8->C9->C10
//
// Expected head header : C10
// Expected head fast block: C10
// Expected head block : C8
// Expected snapshot disk : C10
testSnapshot(t, &snapshotTest{
legacy: true,
crash: false,
restartCrash: 2,
gapped: 0,
setHead: 0,
chainBlocks: 8,
snapshotBlock: 0,
commitBlock: 0,
expCanonicalBlocks: 10,
expHeadHeader: 10,
expHeadFastBlock: 10,
expHeadBlock: 8, // The persisted state in the first running
expSnapshotBottom: 10, // The persisted disk layer in the second running
})
}

func testSnapshot(t *testing.T, tt *snapshotTest) {
// It's hard to follow the test case, visualize the input
//log.Root().SetHandler(log.LvlFilterHandler(log.LvlTrace, log.StreamHandler(os.Stderr, log.TerminalFormat(true))))
//fmt.Println(tt.dump())
// log.Root().SetHandler(log.LvlFilterHandler(log.LvlTrace, log.StreamHandler(os.Stderr, log.TerminalFormat(true))))
// fmt.Println(tt.dump())

// Create a temporary persistent database
datadir, err := ioutil.TempDir("", "")
Expand Down Expand Up @@ -694,6 +735,30 @@ func testSnapshot(t *testing.T, tt *snapshotTest) {
chain.SetHead(tt.setHead)
chain.Stop()

chain, err = NewBlockChain(db, nil, params.AllEthashProtocolChanges, engine, vm.Config{}, nil, nil)
if err != nil {
t.Fatalf("Failed to recreate chain: %v", err)
}
defer chain.Stop()
} else if tt.restartCrash != 0 {
// Firstly, stop the chain properly, with all snapshot journal
// and state committed.
chain.Stop()

// Restart chain, forcibly flush the disk layer journal with new format
newBlocks, _ := GenerateChain(params.TestChainConfig, blocks[len(blocks)-1], engine, gendb, tt.restartCrash, func(i int, b *BlockGen) {})
chain, err = NewBlockChain(db, cacheConfig, params.AllEthashProtocolChanges, engine, vm.Config{}, nil, nil)
if err != nil {
t.Fatalf("Failed to recreate chain: %v", err)
}
chain.InsertChain(newBlocks)
chain.Snapshot().Cap(newBlocks[len(newBlocks)-1].Root(), 0)

// Simulate the blockchain crash
// Don't call chain.Stop here, so that no snapshot
// journal and latest state will be committed

// Restart the chain after the crash
chain, err = NewBlockChain(db, nil, params.AllEthashProtocolChanges, engine, vm.Config{}, nil, nil)
if err != nil {
t.Fatalf("Failed to recreate chain: %v", err)
Expand Down
16 changes: 10 additions & 6 deletions core/state/snapshot/journal.go
Original file line number Diff line number Diff line change
Expand Up @@ -103,8 +103,9 @@ func loadAndParseJournal(db ethdb.KeyValueStore, base *diskLayer) (snapshot, jou
// Retrieve the diff layer journal. It's possible that the journal is
// not existent, e.g. the disk layer is generating while that the Geth
// crashes without persisting the diff journal.
// So if there is no journal, or the journal is not matched with disk
// layer, we just discard all diffs and try to recover them later.
// So if there is no journal, or the journal is invalid(e.g. the journal
// is not matched with disk layer; or the it's the legacy-format journal,
// etc.), we just discard all diffs and try to recover them later.
journal := rawdb.ReadSnapshotJournal(db)
if len(journal) == 0 {
log.Warn("Loaded snapshot journal", "diskroot", base.root, "diffs", "missing")
Expand All @@ -115,13 +116,16 @@ func loadAndParseJournal(db ethdb.KeyValueStore, base *diskLayer) (snapshot, jou
// Firstly, resolve the first element as the journal version
version, err := r.Uint()
if err != nil {
return nil, journalGenerator{}, err
log.Warn("Failed to resolve the journal version", "error", err)
return base, generator, nil
}
if version != journalVersion {
return nil, journalGenerator{}, fmt.Errorf("journal version mismatch, want %d got %v", journalVersion, version)
log.Warn("Discarded the snapshot journal with wrong version", "required", journalVersion, "got", version)
return base, generator, nil
}
// Secondly, resolve the disk layer root, ensure it's continuous
// with disk layer.
// with disk layer. Note now we can ensure it's the snapshot journal
// correct version, so we expect everything can be resolved properly.
var root common.Hash
if err := r.Decode(&root); err != nil {
return nil, journalGenerator{}, errors.New("missing disk layer root")
Expand Down Expand Up @@ -159,7 +163,7 @@ func loadSnapshot(diskdb ethdb.KeyValueStore, triedb *trie.Database, cache int,
var legacy bool
snapshot, generator, err := loadAndParseJournal(diskdb, base)
if err != nil {
log.Debug("Failed to load new-format journal", "error", err)
log.Warn("Failed to load new-format journal", "error", err)
snapshot, generator, err = loadAndParseLegacyJournal(diskdb, base)
legacy = true
}
Expand Down