From d09a3a6d3a3bcc43f1007c323e57375d2a10d783 Mon Sep 17 00:00:00 2001
From: Anton Kaliaev
Date: Fri, 3 Aug 2018 11:24:55 +0400
Subject: [PATCH] stop gracefully instead of trying to resume ops

Refs #2072

We most probably shouldn't be running any further when there is some
unexpected panic. Some unknown error happened, and so we don't know if
that will result in the validator signing an invalid thing. It might be
worthwhile to explore a mechanism for manual resuming via some console
or secure RPC system, but for now, halting the chain upon unexpected
consensus bugs sounds like the better option.
---
 consensus/state.go | 32 ++++++++++++++++++++++----------
 1 file changed, 22 insertions(+), 10 deletions(-)

diff --git a/consensus/state.go b/consensus/state.go
index 435427a91..6ffe6ef64 100644
--- a/consensus/state.go
+++ b/consensus/state.go
@@ -553,10 +553,30 @@ func (cs *ConsensusState) newStep() {
 // Updates (state transitions) happen on timeouts, complete proposals, and 2/3 majorities.
 // ConsensusState must be locked before any internal state is updated.
 func (cs *ConsensusState) receiveRoutine(maxSteps int) {
+	onExit := func(cs *ConsensusState) {
+		// NOTE: the internalMsgQueue may have signed messages from our
+		// priv_val that haven't hit the WAL, but its ok because
+		// priv_val tracks LastSig
+
+		// close wal now that we're done writing to it
+		cs.wal.Stop()
+		cs.wal.Wait()
+
+		close(cs.done)
+	}
+
 	defer func() {
 		if r := recover(); r != nil {
 			cs.Logger.Error("CONSENSUS FAILURE!!!", "err", r, "stack", string(debug.Stack()))
-			go cs.receiveRoutine(0)
+			// stop gracefully
+			//
+			// NOTE: We most probably shouldn't be running any further when there is
+			// some unexpected panic. Some unknown error happened, and so we don't
+			// know if that will result in the validator signing an invalid thing. It
+			// might be worthwhile to explore a mechanism for manual resuming via
+			// some console or secure RPC system, but for now, halting the chain upon
+			// unexpected consensus bugs sounds like the better option.
+			onExit(cs)
 		}
 	}()
 
@@ -589,15 +609,7 @@ func (cs *ConsensusState) receiveRoutine(maxSteps int) {
 			// go to the next step
 			cs.handleTimeout(ti, rs)
 		case <-cs.Quit():
-			// NOTE: the internalMsgQueue may have signed messages from our
-			// priv_val that haven't hit the WAL, but its ok because
-			// priv_val tracks LastSig
-
-			// close wal now that we're done writing to it
-			cs.wal.Stop()
-			cs.wal.Wait()
-
-			close(cs.done)
+			onExit(cs)
 			return
 		}
 	}