From e9bc33d80729e452a3975b2cd1dcfa5f43e03592 Mon Sep 17 00:00:00 2001 From: "M. J. Fromberger" Date: Fri, 11 Mar 2022 13:30:15 -0800 Subject: [PATCH] consensus: ensure the node terminates on consensus failure (#8111) Updates #8077. The panic handler for consensus currently attempts to effect a clean shutdown, but this can leave a failed node running in an unknown state for an arbitrary amount of time after the failure. Since a panic at this point means consensus is already irrecoverably broken, we should not allow the node to continue executing. After making a best effort to shut down the writeahead log, re-panic to ensure the node will terminate before any further state transitions are processed. Even with this change, it is possible some transitions may occur while the cleanup is happening. It might be preferable to abort unconditionally without any attempt at cleanup. Related changes: - Clean up the creation of WAL directories. - Filter WAL close errors at rethrow. --- internal/consensus/common_test.go | 9 ++++++-- internal/consensus/reactor_test.go | 3 --- internal/consensus/state.go | 37 ++++++++++++++++++++---------- internal/libs/autofile/autofile.go | 10 ++++---- 4 files changed, 37 insertions(+), 22 deletions(-) diff --git a/internal/consensus/common_test.go b/internal/consensus/common_test.go index 053e464a9..c9c207b55 100644 --- a/internal/consensus/common_test.go +++ b/internal/consensus/common_test.go @@ -69,6 +69,9 @@ func configSetup(t *testing.T) *config.Config { require.NoError(t, err) t.Cleanup(func() { os.RemoveAll(configByzantineTest.RootDir) }) + walDir := filepath.Dir(cfg.Consensus.WalFile()) + ensureDir(t, walDir, 0700) + return cfg } @@ -785,6 +788,7 @@ func makeConsensusState( configOpts ...func(*config.Config), ) ([]*State, cleanupFunc) { t.Helper() + tempDir := t.TempDir() valSet, privVals := factory.ValidatorSet(ctx, t, nValidators, 30) genDoc := factory.GenesisDoc(cfg, time.Now(), valSet.Validators, nil) @@ -799,7 +803,7 @@ func makeConsensusState( blockStore := store.NewBlockStore(dbm.NewMemDB()) // each state needs its own db state, err := sm.MakeGenesisState(genDoc) require.NoError(t, err) - thisConfig, err := ResetConfig(t.TempDir(), fmt.Sprintf("%s_%d", testName, i)) + thisConfig, err := ResetConfig(tempDir, fmt.Sprintf("%s_%d", testName, i)) require.NoError(t, err) configRootDirs = append(configRootDirs, thisConfig.RootDir) @@ -808,7 +812,8 @@ func makeConsensusState( opt(thisConfig) } - ensureDir(t, filepath.Dir(thisConfig.Consensus.WalFile()), 0700) // dir for wal + walDir := filepath.Dir(thisConfig.Consensus.WalFile()) + ensureDir(t, walDir, 0700) app := kvstore.NewApplication() closeFuncs = append(closeFuncs, app.Close) diff --git a/internal/consensus/reactor_test.go b/internal/consensus/reactor_test.go index 1fe395d69..ef816d85f 100644 --- a/internal/consensus/reactor_test.go +++ b/internal/consensus/reactor_test.go @@ -5,7 +5,6 @@ import ( "errors" "fmt" "os" - "path" "sync" "testing" "time" @@ -466,7 +465,6 @@ func TestReactorWithEvidence(t *testing.T) { defer os.RemoveAll(thisConfig.RootDir) - ensureDir(t, path.Dir(thisConfig.Consensus.WalFile()), 0700) // dir for wal app := kvstore.NewApplication() vals := types.TM2PB.ValidatorUpdates(state.Validators) app.InitChain(abci.RequestInitChain{Validators: vals}) @@ -564,7 +562,6 @@ func TestReactorCreatesBlockWhenEmptyBlocksFalse(t *testing.T) { c.Consensus.CreateEmptyBlocks = false }, ) - t.Cleanup(cleanup) rts := setup(ctx, t, n, states, 100) // buffer must be large enough to not deadlock diff --git a/internal/consensus/state.go b/internal/consensus/state.go index 413c4ba56..c6a54dd7c 100644 --- a/internal/consensus/state.go +++ b/internal/consensus/state.go @@ -20,6 +20,7 @@ import ( cstypes "github.com/tendermint/tendermint/internal/consensus/types" "github.com/tendermint/tendermint/internal/eventbus" "github.com/tendermint/tendermint/internal/jsontypes" + "github.com/tendermint/tendermint/internal/libs/autofile" sm "github.com/tendermint/tendermint/internal/state" tmevents "github.com/tendermint/tendermint/libs/events" "github.com/tendermint/tendermint/libs/log" @@ -869,15 +870,27 @@ func (cs *State) receiveRoutine(ctx context.Context, maxSteps int) { defer func() { if r := recover(); r != nil { cs.logger.Error("CONSENSUS FAILURE!!!", "err", r, "stack", string(debug.Stack())) - // stop gracefully - // - // NOTE: We most probably shouldn't be running any further when there is - // some unexpected panic. Some unknown error happened, and so we don't - // know if that will result in the validator signing an invalid thing. It - // might be worthwhile to explore a mechanism for manual resuming via - // some console or secure RPC system, but for now, halting the chain upon - // unexpected consensus bugs sounds like the better option. + + // Make a best-effort attempt to close the WAL, but otherwise do not + // attempt to gracefully terminate. Once consensus has irrecoverably + // failed, any additional progress we permit the node to make may + // complicate diagnosing and recovering from the failure. onExit(cs) + + // Re-panic to ensure the node terminates. + // + // TODO(creachadair): In ordinary operation, the WAL autofile should + // never be closed. This only happens during shutdown and production + // nodes usually halt by panicking. Many existing tests, however, + // assume a clean shutdown is possible. Prior to #8111, we were + // swallowing the panic in receiveRoutine, making that appear to + // work. Filtering this specific error is slightly risky, but should + // affect only unit tests. In any case, not re-panicking here only + // preserves the pre-existing behavior for this one error type. + if err, ok := r.(error); ok && errors.Is(err, autofile.ErrAutoFileClosed) { + return + } + panic(r) } }() @@ -906,8 +919,8 @@ func (cs *State) receiveRoutine(ctx context.Context, maxSteps int) { case mi := <-cs.internalMsgQueue: err := cs.wal.WriteSync(mi) // NOTE: fsync if err != nil { - panic(fmt.Sprintf( - "failed to write %v msg to consensus WAL due to %v; check your file system and restart the node", + panic(fmt.Errorf( + "failed to write %v msg to consensus WAL due to %w; check your file system and restart the node", mi, err, )) } @@ -1900,8 +1913,8 @@ func (cs *State) finalizeCommit(ctx context.Context, height int64) { // restart). endMsg := EndHeightMessage{height} if err := cs.wal.WriteSync(endMsg); err != nil { // NOTE: fsync - panic(fmt.Sprintf( - "failed to write %v msg to consensus WAL due to %v; check your file system and restart the node", + panic(fmt.Errorf( + "failed to write %v msg to consensus WAL due to %w; check your file system and restart the node", endMsg, err, )) } diff --git a/internal/libs/autofile/autofile.go b/internal/libs/autofile/autofile.go index 6f38fc43b..2cacf6a47 100644 --- a/internal/libs/autofile/autofile.go +++ b/internal/libs/autofile/autofile.go @@ -41,9 +41,9 @@ const ( autoFilePerms = os.FileMode(0600) ) -// errAutoFileClosed is reported when operations attempt to use an autofile +// ErrAutoFileClosed is reported when operations attempt to use an autofile // after it has been closed. -var errAutoFileClosed = errors.New("autofile is closed") +var ErrAutoFileClosed = errors.New("autofile is closed") // AutoFile automatically closes and re-opens file for writing. The file is // automatically setup to close itself every 1s and upon receiving SIGHUP. @@ -155,7 +155,7 @@ func (af *AutoFile) Write(b []byte) (n int, err error) { af.mtx.Lock() defer af.mtx.Unlock() if af.closed { - return 0, fmt.Errorf("write: %w", errAutoFileClosed) + return 0, fmt.Errorf("write: %w", ErrAutoFileClosed) } if af.file == nil { @@ -174,7 +174,7 @@ func (af *AutoFile) Write(b []byte) (n int, err error) { func (af *AutoFile) Sync() error { return af.withLock(func() error { if af.closed { - return fmt.Errorf("sync: %w", errAutoFileClosed) + return fmt.Errorf("sync: %w", ErrAutoFileClosed) } else if af.file == nil { return nil // nothing to sync } @@ -207,7 +207,7 @@ func (af *AutoFile) Size() (int64, error) { af.mtx.Lock() defer af.mtx.Unlock() if af.closed { - return 0, fmt.Errorf("size: %w", errAutoFileClosed) + return 0, fmt.Errorf("size: %w", ErrAutoFileClosed) } if af.file == nil {