From 605be12e2a1654a5a6c07c6cd386a26a9a87ed5d Mon Sep 17 00:00:00 2001 From: "M. J. Fromberger" Date: Fri, 11 Mar 2022 08:16:36 -0800 Subject: [PATCH] consensus: ensure the node terminates on consensus failure Updates #8077. The panic handler for consensus currently attempts to effect a clean shutdown, but this can leave a failed node running in an unknown state for an arbitrary amount of time after the failure. Since a panic at this point means consensus is already irrecoverably broken, we should not allow the node to continue executing. After making a best effort to shut down the writeahead log, re-panic to ensure the node will terminate before any further state transitions are processed. Even with this change, it is possible some transitions may occur while the cleanup is happening. It might be preferable to abort unconditionally without any attempt at cleanup. --- internal/consensus/state.go | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/internal/consensus/state.go b/internal/consensus/state.go index 413c4ba56..1e16f84c2 100644 --- a/internal/consensus/state.go +++ b/internal/consensus/state.go @@ -869,15 +869,15 @@ func (cs *State) receiveRoutine(ctx context.Context, maxSteps int) { defer func() { if r := recover(); r != nil { cs.logger.Error("CONSENSUS FAILURE!!!", "err", r, "stack", string(debug.Stack())) - // stop gracefully - // - // NOTE: We most probably shouldn't be running any further when there is - // some unexpected panic. Some unknown error happened, and so we don't - // know if that will result in the validator signing an invalid thing. It - // might be worthwhile to explore a mechanism for manual resuming via - // some console or secure RPC system, but for now, halting the chain upon - // unexpected consensus bugs sounds like the better option. + + // Make a best-effort attempt to close the WAL, but otherwise do not + // attempt to gracefully terminate. Once consensus has irrecoverably + // failed, any additional progress we permit the node to make may + // complicate diagnosing and recovering from the failure. onExit(cs) + + // Re-panic to ensure the node terminates. + panic(r) } }()