@ -20,6 +20,7 @@ import (
cstypes "github.com/tendermint/tendermint/internal/consensus/types"
cstypes "github.com/tendermint/tendermint/internal/consensus/types"
"github.com/tendermint/tendermint/internal/eventbus"
"github.com/tendermint/tendermint/internal/eventbus"
"github.com/tendermint/tendermint/internal/jsontypes"
"github.com/tendermint/tendermint/internal/jsontypes"
"github.com/tendermint/tendermint/internal/libs/autofile"
sm "github.com/tendermint/tendermint/internal/state"
sm "github.com/tendermint/tendermint/internal/state"
tmevents "github.com/tendermint/tendermint/libs/events"
tmevents "github.com/tendermint/tendermint/libs/events"
"github.com/tendermint/tendermint/libs/log"
"github.com/tendermint/tendermint/libs/log"
@ -869,15 +870,27 @@ func (cs *State) receiveRoutine(ctx context.Context, maxSteps int) {
defer func ( ) {
defer func ( ) {
if r := recover ( ) ; r != nil {
if r := recover ( ) ; r != nil {
cs . logger . Error ( "CONSENSUS FAILURE!!!" , "err" , r , "stack" , string ( debug . Stack ( ) ) )
cs . logger . Error ( "CONSENSUS FAILURE!!!" , "err" , r , "stack" , string ( debug . Stack ( ) ) )
// stop gracefully
//
// NOTE: We most probably shouldn't be running any further when there is
// some unexpected panic. Some unknown error happened, and so we don't
// know if that will result in the validator signing an invalid thing. It
// might be worthwhile to explore a mechanism for manual resuming via
// some console or secure RPC system, but for now, halting the chain upon
// unexpected consensus bugs sounds like the better option.
// Make a best-effort attempt to close the WAL, but otherwise do not
// attempt to gracefully terminate. Once consensus has irrecoverably
// failed, any additional progress we permit the node to make may
// complicate diagnosing and recovering from the failure.
onExit ( cs )
onExit ( cs )
// Re-panic to ensure the node terminates.
//
// TODO(creachadair): In ordinary operation, the WAL autofile should
// never be closed. This only happens during shutdown and production
// nodes usually halt by panicking. Many existing tests, however,
// assume a clean shutdown is possible. Prior to #8111, we were
// swallowing the panic in receiveRoutine, making that appear to
// work. Filtering this specific error is slightly risky, but should
// affect only unit tests. In any case, not re-panicking here only
// preserves the pre-existing behavior for this one error type.
if err , ok := r . ( error ) ; ok && errors . Is ( err , autofile . ErrAutoFileClosed ) {
return
}
panic ( r )
}
}
} ( )
} ( )
@ -906,8 +919,8 @@ func (cs *State) receiveRoutine(ctx context.Context, maxSteps int) {
case mi := <- cs . internalMsgQueue :
case mi := <- cs . internalMsgQueue :
err := cs . wal . WriteSync ( mi ) // NOTE: fsync
err := cs . wal . WriteSync ( mi ) // NOTE: fsync
if err != nil {
if err != nil {
panic ( fmt . Sprint f(
"failed to write %v msg to consensus WAL due to %v ; check your file system and restart the node" ,
panic ( fmt . Error f(
"failed to write %v msg to consensus WAL due to %w ; check your file system and restart the node" ,
mi , err ,
mi , err ,
) )
) )
}
}
@ -1900,8 +1913,8 @@ func (cs *State) finalizeCommit(ctx context.Context, height int64) {
// restart).
// restart).
endMsg := EndHeightMessage { height }
endMsg := EndHeightMessage { height }
if err := cs . wal . WriteSync ( endMsg ) ; err != nil { // NOTE: fsync
if err := cs . wal . WriteSync ( endMsg ) ; err != nil { // NOTE: fsync
panic ( fmt . Sprint f(
"failed to write %v msg to consensus WAL due to %v ; check your file system and restart the node" ,
panic ( fmt . Error f(
"failed to write %v msg to consensus WAL due to %w ; check your file system and restart the node" ,
endMsg , err ,
endMsg , err ,
) )
) )
}
}