Browse Source

WAL: better errors and new fail point (#3246)

* privval: more info in errors

* wal: change Debug logs to Info

* wal: log and return error on corrupted wal instead of panicing

* fail: Exit right away instead of sending interupt

* consensus: FAIL before handling our own vote

allows to replicate #3089:
- run using `FAIL_TEST_INDEX=0`
- delete some bytes from the end of the WAL
- start normally

Results in logs like:

```
I[2019-02-03|18:12:58.225] Searching for height module=consensus wal=/Users/ethanbuchman/.tendermint/data/cs.wal/wal height=1 min=0 max=0
E[2019-02-03|18:12:58.225] Error on catchup replay. Proceeding to start ConsensusState anyway module=consensus err="failed to read data: EOF"
I[2019-02-03|18:12:58.225] Started node module=main nodeInfo="{ProtocolVersion:{P2P:6 Block:9 App:1} ID_:35e87e93f2e31f305b65a5517fd2102331b56002 ListenAddr:tcp://0.0.0.0:26656 Network:test-chain-J8JvJH Version:0.29.1 Channels:4020212223303800 Moniker:Ethans-MacBook-Pro.local Other:{TxIndex:on RPCAddress:tcp://0.0.0.0:26657}}"
E[2019-02-03|18:12:58.226] Couldn't connect to any seeds module=p2p
I[2019-02-03|18:12:59.229] Timed out module=consensus dur=998.568ms height=1 round=0 step=RoundStepNewHeight
I[2019-02-03|18:12:59.230] enterNewRound(1/0). Current: 1/0/RoundStepNewHeight module=consensus height=1 round=0
I[2019-02-03|18:12:59.230] enterPropose(1/0). Current: 1/0/RoundStepNewRound module=consensus height=1 round=0
I[2019-02-03|18:12:59.230] enterPropose: Our turn to propose module=consensus height=1 round=0 proposer=AD278B7767B05D7FBEB76207024C650988FA77D5 privValidator="PrivValidator{AD278B7767B05D7FBEB76207024C650988FA77D5 LH:1, LR:0, LS:2}"
E[2019-02-03|18:12:59.230] enterPropose: Error signing proposal module=consensus height=1 round=0 err="Error signing proposal: Step regression at height 1 round 0. Got 1, last step 2"
I[2019-02-03|18:13:02.233] Timed out module=consensus dur=3s height=1 round=0 step=RoundStepPropose
I[2019-02-03|18:13:02.233] enterPrevote(1/0). Current: 1/0/RoundStepPropose module=consensus
I[2019-02-03|18:13:02.233] enterPrevote: ProposalBlock is nil module=consensus height=1 round=0
E[2019-02-03|18:13:02.234] Error signing vote module=consensus height=1 round=0 vote="Vote{0:AD278B7767B0 1/00/1(Prevote) 000000000000 000000000000 @ 2019-02-04T02:13:02.233897Z}" err="Error signing vote: Conflicting data"
```

Notice the EOF, the step regression, and the conflicting data.

* wal: change errors to be DataCorruptionError

* exit on corrupt WAL

* fix log

* fix new line
pull/3252/head
Ethan Buchman 5 years ago
committed by GitHub
parent
commit
39eba4e154
No known key found for this signature in database GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 41 additions and 14 deletions
  1. +2
    -2
      consensus/replay.go
  2. +26
    -0
      consensus/state.go
  3. +7
    -7
      consensus/wal.go
  4. +3
    -2
      libs/fail/fail.go
  5. +3
    -3
      privval/file.go

+ 2
- 2
consensus/replay.go View File

@ -144,8 +144,8 @@ func (cs *ConsensusState) catchupReplay(csHeight int64) error {
if err == io.EOF {
break
} else if IsDataCorruptionError(err) {
cs.Logger.Debug("data has been corrupted in last height of consensus WAL", "err", err, "height", csHeight)
panic(fmt.Sprintf("data has been corrupted (%v) in last height %d of consensus WAL", err, csHeight))
cs.Logger.Error("data has been corrupted in last height of consensus WAL", "err", err, "height", csHeight)
return err
} else if err != nil {
return err
}


+ 26
- 0
consensus/state.go View File

@ -307,6 +307,23 @@ func (cs *ConsensusState) OnStart() error {
// reload from consensus log to catchup
if cs.doWALCatchup {
if err := cs.catchupReplay(cs.Height); err != nil {
// don't try to recover from data corruption error
if IsDataCorruptionError(err) {
cs.Logger.Error("Encountered corrupt WAL file", "err", err.Error())
cs.Logger.Error("Please repair the WAL file before restarting")
fmt.Println(`You can attempt to repair the WAL as follows:
----
WALFILE=~/.tendermint/data/cs.wal/wal
cp $WALFILE ${WALFILE}.bak # backup the file
go run scripts/wal2json/main.go $WALFILE > wal.json # this will panic, but can be ignored
rm $WALFILE # remove the corrupt file
go run scripts/json2wal/main.go wal.json $WALFILE # rebuild the file without corruption
----`)
return err
}
cs.Logger.Error("Error on catchup replay. Proceeding to start ConsensusState anyway", "err", err.Error())
// NOTE: if we ever do return an error here,
// make sure to stop the timeoutTicker
@ -624,6 +641,15 @@ func (cs *ConsensusState) receiveRoutine(maxSteps int) {
cs.handleMsg(mi)
case mi = <-cs.internalMsgQueue:
cs.wal.WriteSync(mi) // NOTE: fsync
if _, ok := mi.Msg.(*VoteMessage); ok {
// we actually want to simulate failing during
// the previous WriteSync, but this isn't easy to do.
// Equivalent would be to fail here and manually remove
// some bytes from the end of the wal.
fail.Fail() // XXX
}
// handles proposals, block parts, votes
cs.handleMsg(mi)
case ti := <-cs.timeoutTicker.Chan(): // tockChan:


+ 7
- 7
consensus/wal.go View File

@ -163,7 +163,7 @@ func (wal *baseWAL) SearchForEndHeight(height int64, options *WALSearchOptions)
// NOTE: starting from the last file in the group because we're usually
// searching for the last height. See replay.go
min, max := wal.group.MinIndex(), wal.group.MaxIndex()
wal.Logger.Debug("Searching for height", "height", height, "min", min, "max", max)
wal.Logger.Info("Searching for height", "height", height, "min", min, "max", max)
for index := max; index >= min; index-- {
gr, err = wal.group.NewReader(index)
if err != nil {
@ -183,7 +183,7 @@ func (wal *baseWAL) SearchForEndHeight(height int64, options *WALSearchOptions)
break
}
if options.IgnoreDataCorruptionErrors && IsDataCorruptionError(err) {
wal.Logger.Debug("Corrupted entry. Skipping...", "err", err)
wal.Logger.Error("Corrupted entry. Skipping...", "err", err)
// do nothing
continue
} else if err != nil {
@ -194,7 +194,7 @@ func (wal *baseWAL) SearchForEndHeight(height int64, options *WALSearchOptions)
if m, ok := msg.Msg.(EndHeightMessage); ok {
lastHeightFound = m.Height
if m.Height == height { // found
wal.Logger.Debug("Found", "height", height, "index", index)
wal.Logger.Info("Found", "height", height, "index", index)
return gr, true, nil
}
}
@ -281,25 +281,25 @@ func (dec *WALDecoder) Decode() (*TimedWALMessage, error) {
return nil, err
}
if err != nil {
return nil, fmt.Errorf("failed to read checksum: %v", err)
return nil, DataCorruptionError{fmt.Errorf("failed to read checksum: %v", err)}
}
crc := binary.BigEndian.Uint32(b)
b = make([]byte, 4)
_, err = dec.rd.Read(b)
if err != nil {
return nil, fmt.Errorf("failed to read length: %v", err)
return nil, DataCorruptionError{fmt.Errorf("failed to read length: %v", err)}
}
length := binary.BigEndian.Uint32(b)
if length > maxMsgSizeBytes {
return nil, fmt.Errorf("length %d exceeded maximum possible value of %d bytes", length, maxMsgSizeBytes)
return nil, DataCorruptionError{fmt.Errorf("length %d exceeded maximum possible value of %d bytes", length, maxMsgSizeBytes)}
}
data := make([]byte, length)
_, err = dec.rd.Read(data)
if err != nil {
return nil, fmt.Errorf("failed to read data: %v", err)
return nil, DataCorruptionError{fmt.Errorf("failed to read data: %v", err)}
}
// check checksum before decoding data


+ 3
- 2
libs/fail/fail.go View File

@ -72,7 +72,8 @@ func FailRand(n int) {
func Exit() {
fmt.Printf("*** fail-test %d ***\n", callIndex)
proc, _ := os.FindProcess(os.Getpid())
proc.Signal(os.Interrupt)
os.Exit(1)
// proc, _ := os.FindProcess(os.Getpid())
// proc.Signal(os.Interrupt)
// panic(fmt.Sprintf("*** fail-test %d ***", callIndex))
}

+ 3
- 3
privval/file.go View File

@ -87,17 +87,17 @@ type FilePVLastSignState struct {
func (lss *FilePVLastSignState) CheckHRS(height int64, round int, step int8) (bool, error) {
if lss.Height > height {
return false, errors.New("Height regression")
return false, fmt.Errorf("Height regression. Got %v, last height %v", height, lss.Height)
}
if lss.Height == height {
if lss.Round > round {
return false, errors.New("Round regression")
return false, fmt.Errorf("Round regression at height %v. Got %v, last round %v", height, round, lss.Round)
}
if lss.Round == round {
if lss.Step > step {
return false, errors.New("Step regression")
return false, fmt.Errorf("Step regression at height %v round %v. Got %v, last step %v", height, round, step, lss.Step)
} else if lss.Step == step {
if lss.SignBytes != nil {
if lss.Signature == nil {


Loading…
Cancel
Save