
consensus: ensure the node terminates on consensus failure (#8111)

Updates #8077. The panic handler for consensus currently attempts to effect a
clean shutdown, but this can leave a failed node running in an unknown state
for an arbitrary amount of time after the failure.

Since a panic at this point means consensus is already irrecoverably broken, we
should not allow the node to continue executing. After making a best effort to
shut down the write-ahead log, re-panic to ensure the node will terminate before
any further state transitions are processed.

Even with this change, some transitions may still occur while cleanup is in
progress. It might be preferable to abort unconditionally, without any attempt
at cleanup.
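
In Go terms, the pattern is a recover handler that performs only best-effort
cleanup and then re-raises the original panic value so the process still dies.
A minimal, self-contained sketch of that pattern (an illustration only, not the
actual receiveRoutine code; closeWALBestEffort is a hypothetical stand-in for
the real WAL shutdown):

    package main

    import "log"

    // closeWALBestEffort stands in for closing the write-ahead log; any error
    // here would be logged rather than treated as fatal.
    func closeWALBestEffort() {
        log.Println("closing WAL (best effort)")
    }

    func receiveLoop() {
        defer func() {
            if r := recover(); r != nil {
                log.Printf("CONSENSUS FAILURE: %v", r)
                closeWALBestEffort() // best-effort cleanup only
                panic(r)             // re-panic so the node terminates
            }
        }()
        panic("simulated consensus failure")
    }

    func main() {
        receiveLoop() // the re-panic propagates and crashes the process
    }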

Related changes:

- Clean up the creation of WAL directories.
- Filter WAL close errors at rethrow.
M. J. Fromberger committed 2 years ago (via GitHub)
commit e9bc33d807
4 changed files with 37 additions and 22 deletions

  1. internal/consensus/common_test.go    (+7, -2)
  2. internal/consensus/reactor_test.go   (+0, -3)
  3. internal/consensus/state.go          (+25, -12)
  4. internal/libs/autofile/autofile.go   (+5, -5)

internal/consensus/common_test.go (+7, -2)

@@ -69,6 +69,9 @@ func configSetup(t *testing.T) *config.Config {
     require.NoError(t, err)
     t.Cleanup(func() { os.RemoveAll(configByzantineTest.RootDir) })
+    walDir := filepath.Dir(cfg.Consensus.WalFile())
+    ensureDir(t, walDir, 0700)
     return cfg
 }
@@ -785,6 +788,7 @@ func makeConsensusState(
     configOpts ...func(*config.Config),
 ) ([]*State, cleanupFunc) {
     t.Helper()
+    tempDir := t.TempDir()
     valSet, privVals := factory.ValidatorSet(ctx, t, nValidators, 30)
     genDoc := factory.GenesisDoc(cfg, time.Now(), valSet.Validators, nil)
@@ -799,7 +803,7 @@
     blockStore := store.NewBlockStore(dbm.NewMemDB()) // each state needs its own db
     state, err := sm.MakeGenesisState(genDoc)
     require.NoError(t, err)
-    thisConfig, err := ResetConfig(t.TempDir(), fmt.Sprintf("%s_%d", testName, i))
+    thisConfig, err := ResetConfig(tempDir, fmt.Sprintf("%s_%d", testName, i))
     require.NoError(t, err)
     configRootDirs = append(configRootDirs, thisConfig.RootDir)
@@ -808,7 +812,8 @@
         opt(thisConfig)
     }
-    ensureDir(t, filepath.Dir(thisConfig.Consensus.WalFile()), 0700) // dir for wal
+    walDir := filepath.Dir(thisConfig.Consensus.WalFile())
+    ensureDir(t, walDir, 0700)
     app := kvstore.NewApplication()
     closeFuncs = append(closeFuncs, app.Close)
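
The ensureDir helper used throughout these tests is not part of this diff. A
hypothetical sketch of such a helper, assuming it simply wraps os.MkdirAll and
fails the test on error (signature and placement are assumptions):

    import (
        "os"
        "testing"

        "github.com/stretchr/testify/require"
    )

    // ensureDir is a hypothetical sketch of the test helper; the real
    // implementation is defined elsewhere in the consensus test files.
    func ensureDir(t *testing.T, dir string, mode os.FileMode) {
        t.Helper()
        require.NoError(t, os.MkdirAll(dir, mode))
    }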


internal/consensus/reactor_test.go (+0, -3)

@@ -5,7 +5,6 @@ import (
     "errors"
     "fmt"
     "os"
-    "path"
     "sync"
     "testing"
     "time"
@@ -466,7 +465,6 @@ func TestReactorWithEvidence(t *testing.T) {
     defer os.RemoveAll(thisConfig.RootDir)
-    ensureDir(t, path.Dir(thisConfig.Consensus.WalFile()), 0700) // dir for wal
     app := kvstore.NewApplication()
     vals := types.TM2PB.ValidatorUpdates(state.Validators)
     app.InitChain(abci.RequestInitChain{Validators: vals})
@@ -564,7 +562,6 @@ func TestReactorCreatesBlockWhenEmptyBlocksFalse(t *testing.T) {
             c.Consensus.CreateEmptyBlocks = false
         },
     )
     t.Cleanup(cleanup)
     rts := setup(ctx, t, n, states, 100) // buffer must be large enough to not deadlock


internal/consensus/state.go (+25, -12)

@@ -20,6 +20,7 @@ import (
     cstypes "github.com/tendermint/tendermint/internal/consensus/types"
     "github.com/tendermint/tendermint/internal/eventbus"
     "github.com/tendermint/tendermint/internal/jsontypes"
+    "github.com/tendermint/tendermint/internal/libs/autofile"
     sm "github.com/tendermint/tendermint/internal/state"
     tmevents "github.com/tendermint/tendermint/libs/events"
     "github.com/tendermint/tendermint/libs/log"
@@ -869,15 +870,27 @@ func (cs *State) receiveRoutine(ctx context.Context, maxSteps int) {
     defer func() {
         if r := recover(); r != nil {
             cs.logger.Error("CONSENSUS FAILURE!!!", "err", r, "stack", string(debug.Stack()))
-            // stop gracefully
-            //
-            // NOTE: We most probably shouldn't be running any further when there is
-            // some unexpected panic. Some unknown error happened, and so we don't
-            // know if that will result in the validator signing an invalid thing. It
-            // might be worthwhile to explore a mechanism for manual resuming via
-            // some console or secure RPC system, but for now, halting the chain upon
-            // unexpected consensus bugs sounds like the better option.
+            // Make a best-effort attempt to close the WAL, but otherwise do not
+            // attempt to gracefully terminate. Once consensus has irrecoverably
+            // failed, any additional progress we permit the node to make may
+            // complicate diagnosing and recovering from the failure.
             onExit(cs)
+            // Re-panic to ensure the node terminates.
+            //
+            // TODO(creachadair): In ordinary operation, the WAL autofile should
+            // never be closed. This only happens during shutdown and production
+            // nodes usually halt by panicking. Many existing tests, however,
+            // assume a clean shutdown is possible. Prior to #8111, we were
+            // swallowing the panic in receiveRoutine, making that appear to
+            // work. Filtering this specific error is slightly risky, but should
+            // affect only unit tests. In any case, not re-panicking here only
+            // preserves the pre-existing behavior for this one error type.
+            if err, ok := r.(error); ok && errors.Is(err, autofile.ErrAutoFileClosed) {
+                return
+            }
+            panic(r)
         }
     }()
@@ -906,8 +919,8 @@ func (cs *State) receiveRoutine(ctx context.Context, maxSteps int) {
         case mi := <-cs.internalMsgQueue:
             err := cs.wal.WriteSync(mi) // NOTE: fsync
             if err != nil {
-                panic(fmt.Sprintf(
-                    "failed to write %v msg to consensus WAL due to %v; check your file system and restart the node",
+                panic(fmt.Errorf(
+                    "failed to write %v msg to consensus WAL due to %w; check your file system and restart the node",
                     mi, err,
                 ))
             }
@@ -1900,8 +1913,8 @@ func (cs *State) finalizeCommit(ctx context.Context, height int64) {
     // restart).
     endMsg := EndHeightMessage{height}
     if err := cs.wal.WriteSync(endMsg); err != nil { // NOTE: fsync
-        panic(fmt.Sprintf(
-            "failed to write %v msg to consensus WAL due to %v; check your file system and restart the node",
+        panic(fmt.Errorf(
+            "failed to write %v msg to consensus WAL due to %w; check your file system and restart the node",
             endMsg, err,
         ))
     }
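
The switch from fmt.Sprintf/%v to fmt.Errorf/%w in these panics is what makes
the errors.Is check in the recover handler above possible: the panic value is
now an error that wraps the underlying sentinel rather than a flat string. A
minimal, runnable sketch of that interaction, using a placeholder errClosed
sentinel instead of the real autofile.ErrAutoFileClosed:

    package main

    import (
        "errors"
        "fmt"
    )

    // errClosed stands in for a sentinel error such as autofile.ErrAutoFileClosed.
    var errClosed = errors.New("file is closed")

    func writeSync() error { return errClosed }

    func run() {
        defer func() {
            if r := recover(); r != nil {
                // The panic value wraps errClosed via %w, so errors.Is still
                // recognizes it here; a %v-formatted string would not match.
                if err, ok := r.(error); ok && errors.Is(err, errClosed) {
                    fmt.Println("known shutdown error, not re-panicking")
                    return
                }
                panic(r)
            }
        }()
        if err := writeSync(); err != nil {
            panic(fmt.Errorf("failed to write msg to WAL: %w", err))
        }
    }

    func main() {
        run()
    }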


internal/libs/autofile/autofile.go (+5, -5)

@@ -41,9 +41,9 @@
     autoFilePerms = os.FileMode(0600)
 )
-// errAutoFileClosed is reported when operations attempt to use an autofile
+// ErrAutoFileClosed is reported when operations attempt to use an autofile
 // after it has been closed.
-var errAutoFileClosed = errors.New("autofile is closed")
+var ErrAutoFileClosed = errors.New("autofile is closed")
 // AutoFile automatically closes and re-opens file for writing. The file is
 // automatically setup to close itself every 1s and upon receiving SIGHUP.
@@ -155,7 +155,7 @@ func (af *AutoFile) Write(b []byte) (n int, err error) {
     af.mtx.Lock()
     defer af.mtx.Unlock()
     if af.closed {
-        return 0, fmt.Errorf("write: %w", errAutoFileClosed)
+        return 0, fmt.Errorf("write: %w", ErrAutoFileClosed)
     }
     if af.file == nil {
@@ -174,7 +174,7 @@ func (af *AutoFile) Write(b []byte) (n int, err error) {
 func (af *AutoFile) Sync() error {
     return af.withLock(func() error {
         if af.closed {
-            return fmt.Errorf("sync: %w", errAutoFileClosed)
+            return fmt.Errorf("sync: %w", ErrAutoFileClosed)
         } else if af.file == nil {
             return nil // nothing to sync
         }
@@ -207,7 +207,7 @@ func (af *AutoFile) Size() (int64, error) {
     af.mtx.Lock()
     defer af.mtx.Unlock()
     if af.closed {
-        return 0, fmt.Errorf("size: %w", errAutoFileClosed)
+        return 0, fmt.Errorf("size: %w", ErrAutoFileClosed)
     }
     if af.file == nil {

