diff --git a/CHANGELOG_PENDING.md b/CHANGELOG_PENDING.md
index a8af8a7e8..c2585b197 100644
--- a/CHANGELOG_PENDING.md
+++ b/CHANGELOG_PENDING.md
@@ -58,6 +58,8 @@ Friendly reminder, we have a [bug bounty program](https://hackerone.com/tendermi
 - [evidence] [\#4839](https://github.com/tendermint/tendermint/pull/4839) Reject duplicate evidence from being proposed (@cmwaters)
 - [evidence] [\#4892](https://github.com/tendermint/tendermint/pull/4892) Remove redundant header from phantom validator evidence (@cmwaters)
 - [types] [\#4905](https://github.com/tendermint/tendermint/pull/4905) Add ValidateBasic to validator and validator set (@cmwaters)
+- [consensus] [\#4578](https://github.com/tendermint/tendermint/issues/4578) Attempt to repair the consensus WAL file (`data/cs.wal/wal`) automatically in case of corruption (@alessio)
+  The original WAL file will be backed up to `data/cs.wal/wal.CORRUPTED`.
 - [lite2] [\#4935](https://github.com/tendermint/tendermint/pull/4935) Fetch and compare a new header with witnesses in parallel (@melekes)
 - [lite2] [\#4929](https://github.com/tendermint/tendermint/pull/4929) compare header w/ witnesses only when doing bisection (@melekes)
 
diff --git a/consensus/state.go b/consensus/state.go
index fea1eb54e..cae6094d5 100644
--- a/consensus/state.go
+++ b/consensus/state.go
@@ -4,6 +4,7 @@ import (
 	"bytes"
 	"errors"
 	"fmt"
+	"os"
 	"reflect"
 	"runtime/debug"
 	"sync"
@@ -276,23 +277,68 @@ func (cs *State) LoadCommit(height int64) *types.Commit {
 	return cs.blockStore.LoadBlockCommit(height)
 }
 
-// OnStart implements service.Service.
-// It loads the latest state via the WAL, and starts the timeout and receive routines.
+// OnStart loads the latest state via the WAL, and starts the timeout and
+// receive routines. If replaying the WAL reports data corruption, a single
+// automatic repair is attempted; the original file is kept next to it with
+// a ".CORRUPTED" suffix.
 func (cs *State) OnStart() error {
-	if err := cs.evsw.Start(); err != nil {
-		return err
-	}
-
-	// we may set the WAL in testing before calling Start,
-	// so only OpenWAL if its still the nilWAL
+	// We may set the WAL in testing before calling Start, so only OpenWAL if
+	// it's still the nilWAL.
 	if _, ok := cs.wal.(nilWAL); ok {
-		walFile := cs.config.WalFile()
-		wal, err := cs.OpenWAL(walFile)
-		if err != nil {
-			cs.Logger.Error("Error loading State wal", "err", err.Error())
+		if err := cs.loadWalFile(); err != nil {
 			return err
 		}
-		cs.wal = wal
+	}
+
+	// We may have lost some votes if the process crashed, so replay the
+	// consensus log (WAL) to catch up.
+	if cs.doWALCatchup {
+		repairAttempted := false
+	LOOP:
+		for {
+			err := cs.catchupReplay(cs.Height)
+			switch {
+			case err == nil:
+				break LOOP
+			case !IsDataCorruptionError(err):
+				cs.Logger.Error("Error on catchup replay. Proceeding to start State anyway", "err", err)
+				break LOOP
+			case repairAttempted:
+				// the repaired file is still reported as corrupted: give up
+				return err
+			}
+
+			cs.Logger.Info("WAL file is corrupted. Attempting repair", "err", err)
+
+			// 1) prep work: stop the WAL before its file is copied and rewritten
+			if err := cs.wal.Stop(); err != nil {
+				return fmt.Errorf("failed to stop WAL before repair: %w", err)
+			}
+			repairAttempted = true
+
+			// 2) backup original WAL file
+			corruptedFile := fmt.Sprintf("%s.CORRUPTED", cs.config.WalFile())
+			if err := tmos.CopyFile(cs.config.WalFile(), corruptedFile); err != nil {
+				return err
+			}
+			cs.Logger.Info("Backed up WAL file", "src", cs.config.WalFile(), "dst", corruptedFile)
+
+			// 3) try to repair (WAL file will be overwritten!)
+			if err := repairWalFile(corruptedFile, cs.config.WalFile()); err != nil {
+				cs.Logger.Error("Repair failed", "err", err)
+				return err
+			}
+			cs.Logger.Info("Successful repair")
+
+			// reload WAL file, then loop to replay from the repaired copy
+			if err := cs.loadWalFile(); err != nil {
+				return err
+			}
+		}
+	}
+
+	if err := cs.evsw.Start(); err != nil {
+		return err
 	}
 
 	// we need the timeoutRoutine for replay so
@@ -304,33 +350,6 @@ func (cs *State) OnStart() error {
 		return err
 	}
 
-	// we may have lost some votes if the process crashed
-	// reload from consensus log to catchup
-	if cs.doWALCatchup {
-		if err := cs.catchupReplay(cs.Height); err != nil {
-			// don't try to recover from data corruption error
-			if IsDataCorruptionError(err) {
-				cs.Logger.Error("Encountered corrupt WAL file", "err", err.Error())
-				cs.Logger.Error("Please repair the WAL file before restarting")
-				fmt.Println(`You can attempt to repair the WAL as follows:
-
-----
-WALFILE=~/.tendermint/data/cs.wal/wal
-cp $WALFILE ${WALFILE}.bak # backup the file
-go run scripts/wal2json/main.go $WALFILE > wal.json # this will panic, but can be ignored
-rm $WALFILE # remove the corrupt file
-go run scripts/json2wal/main.go wal.json $WALFILE # rebuild the file without corruption
-----`)
-
-				return err
-			}
-
-			cs.Logger.Error("Error on catchup replay. Proceeding to start State anyway", "err", err.Error())
-			// NOTE: if we ever do return an error here,
-			// make sure to stop the timeoutTicker
-		}
-	}
-
 	// now start the receiveRoutine
 	go cs.receiveRoutine(0)
 
@@ -352,6 +371,18 @@ func (cs *State) startRoutines(maxSteps int) {
 	go cs.receiveRoutine(maxSteps)
 }
 
+// loadWalFile opens and starts the WAL stored at cs.config.WalFile() and
+// assigns it to cs.wal, replacing whatever WAL was set before.
+func (cs *State) loadWalFile() error {
+	wal, err := cs.OpenWAL(cs.config.WalFile())
+	if err != nil {
+		cs.Logger.Error("Error loading State wal", "err", err)
+		return err
+	}
+	cs.wal = wal
+	return nil
+}
+
 // OnStop implements service.Service.
 func (cs *State) OnStop() {
 	cs.evsw.Stop()
@@ -366,15 +397,17 @@ func (cs *State) Wait() {
 	<-cs.done
 }
 
-// OpenWAL opens a file to log all consensus messages and timeouts for deterministic accountability
+// OpenWAL opens a file to log all consensus messages and timeouts for
+// deterministic accountability.
 func (cs *State) OpenWAL(walFile string) (WAL, error) {
 	wal, err := NewWAL(walFile)
 	if err != nil {
-		cs.Logger.Error("Failed to open WAL for consensus state", "wal", walFile, "err", err)
+		cs.Logger.Error("Failed to open WAL", "file", walFile, "err", err)
 		return nil, err
 	}
 	wal.SetLogger(cs.Logger.With("wal", walFile))
 	if err := wal.Start(); err != nil {
+		cs.Logger.Error("Failed to start WAL", "err", err)
 		return nil, err
 	}
 	return wal, nil
@@ -2034,3 +2067,46 @@ func CompareHRS(h1 int64, r1 int, s1 cstypes.RoundStepType, h2 int64, r2 int, s2
 	}
 	return 0
 }
+
+// repairWalFile decodes messages from src (until the decoder errors) and
+// writes them to dst. The first decoding error ends the repair and is not
+// reported: everything decoded before it is kept and nil is returned. Errors
+// opening either file or encoding a message are returned to the caller.
+//
+// dst is created if missing and truncated otherwise, so src and dst must not
+// be the same path.
+func repairWalFile(src, dst string) error {
+	in, err := os.Open(src)
+	if err != nil {
+		return err
+	}
+	defer in.Close()
+
+	// os.Create (not os.Open, which is read-only) so that the encoder can
+	// write to dst and the stale bytes of the corrupted file are dropped.
+	out, err := os.Create(dst)
+	if err != nil {
+		return err
+	}
+	defer out.Close()
+
+	var (
+		dec = NewWALDecoder(in)
+		enc = NewWALEncoder(out)
+	)
+
+	// best-case repair (until first error is encountered)
+	for {
+		msg, err := dec.Decode()
+		if err != nil {
+			break
+		}
+
+		err = enc.Encode(msg)
+		if err != nil {
+			return fmt.Errorf("failed to encode msg: %w", err)
+		}
+	}
+
+	return nil
+}
diff --git a/libs/os/os.go b/libs/os/os.go
index 4773feef0..ea24a42f6 100644
--- a/libs/os/os.go
+++ b/libs/os/os.go
@@ -2,6 +2,7 @@ package os
 
 import (
 	"fmt"
+	"io"
 	"io/ioutil"
 	"os"
 	"os/signal"
@@ -80,3 +81,27 @@ func MustWriteFile(filePath string, contents []byte, mode os.FileMode) {
 		Exit(fmt.Sprintf("MustWriteFile failed: %v", err))
 	}
 }
+
+// CopyFile copies a file. It truncates the destination file if it exists.
+func CopyFile(src, dst string) error {
+	info, err := os.Stat(src)
+	if err != nil {
+		return err
+	}
+	srcfile, err := os.Open(src)
+	if err != nil {
+		return err
+	}
+	defer srcfile.Close()
+
+	// Create dst (truncating it if it exists) with src's permission bits.
+	dstfile, err := os.OpenFile(dst, os.O_RDWR|os.O_CREATE|os.O_TRUNC, info.Mode().Perm())
+	if err != nil {
+		return err
+	}
+	if _, err := io.Copy(dstfile, srcfile); err != nil {
+		dstfile.Close() // best effort; the copy error is the one to report
+		return err
+	}
+	return dstfile.Close() // a failed Close can mean dst is incomplete
+}
diff --git a/libs/os/os_test.go b/libs/os/os_test.go
new file mode 100644
index 000000000..9c80f1f5a
--- /dev/null
+++ b/libs/os/os_test.go
@@ -0,0 +1,44 @@
+package os
+
+import (
+	"bytes"
+	"fmt"
+	"io/ioutil"
+	"os"
+	"testing"
+)
+
+// TestCopyFile checks that CopyFile creates the destination file and that it
+// holds exactly the bytes of the source.
+func TestCopyFile(t *testing.T) {
+	tmpfile, err := ioutil.TempFile("", "example")
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer os.Remove(tmpfile.Name())
+	content := []byte("hello world")
+	if _, err := tmpfile.Write(content); err != nil {
+		t.Fatal(err)
+	}
+	// Close the source before copying so its descriptor is not leaked.
+	if err := tmpfile.Close(); err != nil {
+		t.Fatal(err)
+	}
+
+	copyfile := fmt.Sprintf("%s.copy", tmpfile.Name())
+	// Deferred so the copy is removed even when a check below fails.
+	defer os.Remove(copyfile)
+	if err := CopyFile(tmpfile.Name(), copyfile); err != nil {
+		t.Fatal(err)
+	}
+	if _, err := os.Stat(copyfile); err != nil {
+		t.Fatalf("copy should exist: %v", err)
+	}
+	data, err := ioutil.ReadFile(copyfile)
+	if err != nil {
+		t.Fatal(err)
+	}
+	if !bytes.Equal(data, content) {
+		t.Fatalf("copy file content differs: expected %v, got %v", content, data)
+	}
+}