diff --git a/internal/consensus/common_test.go b/internal/consensus/common_test.go index 053e464a9..c9c207b55 100644 --- a/internal/consensus/common_test.go +++ b/internal/consensus/common_test.go @@ -69,6 +69,9 @@ func configSetup(t *testing.T) *config.Config { require.NoError(t, err) t.Cleanup(func() { os.RemoveAll(configByzantineTest.RootDir) }) + walDir := filepath.Dir(cfg.Consensus.WalFile()) + ensureDir(t, walDir, 0700) + return cfg } @@ -785,6 +788,7 @@ func makeConsensusState( configOpts ...func(*config.Config), ) ([]*State, cleanupFunc) { t.Helper() + tempDir := t.TempDir() valSet, privVals := factory.ValidatorSet(ctx, t, nValidators, 30) genDoc := factory.GenesisDoc(cfg, time.Now(), valSet.Validators, nil) @@ -799,7 +803,7 @@ func makeConsensusState( blockStore := store.NewBlockStore(dbm.NewMemDB()) // each state needs its own db state, err := sm.MakeGenesisState(genDoc) require.NoError(t, err) - thisConfig, err := ResetConfig(t.TempDir(), fmt.Sprintf("%s_%d", testName, i)) + thisConfig, err := ResetConfig(tempDir, fmt.Sprintf("%s_%d", testName, i)) require.NoError(t, err) configRootDirs = append(configRootDirs, thisConfig.RootDir) @@ -808,7 +812,8 @@ func makeConsensusState( opt(thisConfig) } - ensureDir(t, filepath.Dir(thisConfig.Consensus.WalFile()), 0700) // dir for wal + walDir := filepath.Dir(thisConfig.Consensus.WalFile()) + ensureDir(t, walDir, 0700) app := kvstore.NewApplication() closeFuncs = append(closeFuncs, app.Close) diff --git a/internal/consensus/reactor_test.go b/internal/consensus/reactor_test.go index 1fe395d69..ef816d85f 100644 --- a/internal/consensus/reactor_test.go +++ b/internal/consensus/reactor_test.go @@ -5,7 +5,6 @@ import ( "errors" "fmt" "os" - "path" "sync" "testing" "time" @@ -466,7 +465,6 @@ func TestReactorWithEvidence(t *testing.T) { defer os.RemoveAll(thisConfig.RootDir) - ensureDir(t, path.Dir(thisConfig.Consensus.WalFile()), 0700) // dir for wal app := kvstore.NewApplication() vals := types.TM2PB.ValidatorUpdates(state.Validators) app.InitChain(abci.RequestInitChain{Validators: vals}) @@ -564,7 +562,6 @@ func TestReactorCreatesBlockWhenEmptyBlocksFalse(t *testing.T) { c.Consensus.CreateEmptyBlocks = false }, ) - t.Cleanup(cleanup) rts := setup(ctx, t, n, states, 100) // buffer must be large enough to not deadlock diff --git a/internal/consensus/state.go b/internal/consensus/state.go index 413c4ba56..c6a54dd7c 100644 --- a/internal/consensus/state.go +++ b/internal/consensus/state.go @@ -20,6 +20,7 @@ import ( cstypes "github.com/tendermint/tendermint/internal/consensus/types" "github.com/tendermint/tendermint/internal/eventbus" "github.com/tendermint/tendermint/internal/jsontypes" + "github.com/tendermint/tendermint/internal/libs/autofile" sm "github.com/tendermint/tendermint/internal/state" tmevents "github.com/tendermint/tendermint/libs/events" "github.com/tendermint/tendermint/libs/log" @@ -869,15 +870,27 @@ func (cs *State) receiveRoutine(ctx context.Context, maxSteps int) { defer func() { if r := recover(); r != nil { cs.logger.Error("CONSENSUS FAILURE!!!", "err", r, "stack", string(debug.Stack())) - // stop gracefully - // - // NOTE: We most probably shouldn't be running any further when there is - // some unexpected panic. Some unknown error happened, and so we don't - // know if that will result in the validator signing an invalid thing. It - // might be worthwhile to explore a mechanism for manual resuming via - // some console or secure RPC system, but for now, halting the chain upon - // unexpected consensus bugs sounds like the better option. + + // Make a best-effort attempt to close the WAL, but otherwise do not + // attempt to gracefully terminate. Once consensus has irrecoverably + // failed, any additional progress we permit the node to make may + // complicate diagnosing and recovering from the failure. onExit(cs) + + // Re-panic to ensure the node terminates. + // + // TODO(creachadair): In ordinary operation, the WAL autofile should + // never be closed. This only happens during shutdown and production + // nodes usually halt by panicking. Many existing tests, however, + // assume a clean shutdown is possible. Prior to #8111, we were + // swallowing the panic in receiveRoutine, making that appear to + // work. Filtering this specific error is slightly risky, but should + // affect only unit tests. In any case, not re-panicking here only + // preserves the pre-existing behavior for this one error type. + if err, ok := r.(error); ok && errors.Is(err, autofile.ErrAutoFileClosed) { + return + } + panic(r) } }() @@ -906,8 +919,8 @@ func (cs *State) receiveRoutine(ctx context.Context, maxSteps int) { case mi := <-cs.internalMsgQueue: err := cs.wal.WriteSync(mi) // NOTE: fsync if err != nil { - panic(fmt.Sprintf( - "failed to write %v msg to consensus WAL due to %v; check your file system and restart the node", + panic(fmt.Errorf( + "failed to write %v msg to consensus WAL due to %w; check your file system and restart the node", mi, err, )) } @@ -1900,8 +1913,8 @@ func (cs *State) finalizeCommit(ctx context.Context, height int64) { // restart). endMsg := EndHeightMessage{height} if err := cs.wal.WriteSync(endMsg); err != nil { // NOTE: fsync - panic(fmt.Sprintf( - "failed to write %v msg to consensus WAL due to %v; check your file system and restart the node", + panic(fmt.Errorf( + "failed to write %v msg to consensus WAL due to %w; check your file system and restart the node", endMsg, err, )) } diff --git a/internal/libs/autofile/autofile.go b/internal/libs/autofile/autofile.go index 6f38fc43b..2cacf6a47 100644 --- a/internal/libs/autofile/autofile.go +++ b/internal/libs/autofile/autofile.go @@ -41,9 +41,9 @@ const ( autoFilePerms = os.FileMode(0600) ) -// errAutoFileClosed is reported when operations attempt to use an autofile +// ErrAutoFileClosed is reported when operations attempt to use an autofile // after it has been closed. -var errAutoFileClosed = errors.New("autofile is closed") +var ErrAutoFileClosed = errors.New("autofile is closed") // AutoFile automatically closes and re-opens file for writing. The file is // automatically setup to close itself every 1s and upon receiving SIGHUP. @@ -155,7 +155,7 @@ func (af *AutoFile) Write(b []byte) (n int, err error) { af.mtx.Lock() defer af.mtx.Unlock() if af.closed { - return 0, fmt.Errorf("write: %w", errAutoFileClosed) + return 0, fmt.Errorf("write: %w", ErrAutoFileClosed) } if af.file == nil { @@ -174,7 +174,7 @@ func (af *AutoFile) Write(b []byte) (n int, err error) { func (af *AutoFile) Sync() error { return af.withLock(func() error { if af.closed { - return fmt.Errorf("sync: %w", errAutoFileClosed) + return fmt.Errorf("sync: %w", ErrAutoFileClosed) } else if af.file == nil { return nil // nothing to sync } @@ -207,7 +207,7 @@ func (af *AutoFile) Size() (int64, error) { af.mtx.Lock() defer af.mtx.Unlock() if af.closed { - return 0, fmt.Errorf("size: %w", errAutoFileClosed) + return 0, fmt.Errorf("size: %w", ErrAutoFileClosed) } if af.file == nil {