Merge branch 'master' into node-initalization-avoid-panic

3 years ago · bfd08660ef
--- a/internal/consensus/common_test.go
+++ b/internal/consensus/common_test.go
@ -69,6 +69,9 @@ func configSetup(t *testing.T) *config.Config {
 	require.NoError(t, err)
 	t.Cleanup(func() { os.RemoveAll(configByzantineTest.RootDir) })

 	walDir := filepath.Dir(cfg.Consensus.WalFile())
 	ensureDir(t, walDir, 0700)

 	return cfg
 }

@ -787,6 +790,7 @@ func makeConsensusState(
 	configOpts ...func(*config.Config),
 ) ([]*State, cleanupFunc) {
 	t.Helper()
 	tempDir := t.TempDir()

 	valSet, privVals := factory.ValidatorSet(ctx, t, nValidators, 30)
 	genDoc := factory.GenesisDoc(cfg, time.Now(), valSet.Validators, nil)
@ -801,7 +805,7 @@ func makeConsensusState(
 		blockStore := store.NewBlockStore(dbm.NewMemDB()) // each state needs its own db
 		state, err := sm.MakeGenesisState(genDoc)
 		require.NoError(t, err)
 		thisConfig, err := ResetConfig(t.TempDir(), fmt.Sprintf("%s_%d", testName, i))
 		thisConfig, err := ResetConfig(tempDir, fmt.Sprintf("%s_%d", testName, i))
 		require.NoError(t, err)

 		configRootDirs = append(configRootDirs, thisConfig.RootDir)
@ -810,7 +814,8 @@ func makeConsensusState(
 			opt(thisConfig)
 		}

 		ensureDir(t, filepath.Dir(thisConfig.Consensus.WalFile()), 0700) // dir for wal
 		walDir := filepath.Dir(thisConfig.Consensus.WalFile())
 		ensureDir(t, walDir, 0700)

 		app := kvstore.NewApplication()
 		closeFuncs = append(closeFuncs, app.Close)
--- a/internal/consensus/reactor_test.go
+++ b/internal/consensus/reactor_test.go
@ -5,7 +5,6 @@ import (
 	"errors"
 	"fmt"
 	"os"
 	"path"
 	"sync"
 	"testing"
 	"time"
@ -486,7 +485,6 @@ func TestReactorWithEvidence(t *testing.T) {

 		defer os.RemoveAll(thisConfig.RootDir)

 		ensureDir(t, path.Dir(thisConfig.Consensus.WalFile()), 0700) // dir for wal
 		app := kvstore.NewApplication()
 		vals := types.TM2PB.ValidatorUpdates(state.Validators)
 		app.InitChain(abci.RequestInitChain{Validators: vals})
@ -598,7 +596,6 @@ func TestReactorCreatesBlockWhenEmptyBlocksFalse(t *testing.T) {
 			c.Consensus.CreateEmptyBlocks = false
 		},
 	)

 	t.Cleanup(cleanup)

 	rts := setup(ctx, t, n, states, 100) // buffer must be large enough to not deadlock
--- a/internal/consensus/state.go
+++ b/internal/consensus/state.go
@ -20,6 +20,7 @@ import (
 	cstypes "github.com/tendermint/tendermint/internal/consensus/types"
 	"github.com/tendermint/tendermint/internal/eventbus"
 	"github.com/tendermint/tendermint/internal/jsontypes"
 	"github.com/tendermint/tendermint/internal/libs/autofile"
 	sm "github.com/tendermint/tendermint/internal/state"
 	tmevents "github.com/tendermint/tendermint/libs/events"
 	"github.com/tendermint/tendermint/libs/log"
@ -905,15 +906,27 @@ func (cs *State) receiveRoutine(ctx context.Context, maxSteps int) {
 	defer func() {
 		if r := recover(); r != nil {
 			cs.logger.Error("CONSENSUS FAILURE!!!", "err", r, "stack", string(debug.Stack()))
 			// stop gracefully
 			//
 			// NOTE: We most probably shouldn't be running any further when there is
 			// some unexpected panic. Some unknown error happened, and so we don't
 			// know if that will result in the validator signing an invalid thing. It
 			// might be worthwhile to explore a mechanism for manual resuming via
 			// some console or secure RPC system, but for now, halting the chain upon
 			// unexpected consensus bugs sounds like the better option.

 			// Make a best-effort attempt to close the WAL, but otherwise do not
 			// attempt to gracefully terminate. Once consensus has irrecoverably
 			// failed, any additional progress we permit the node to make may
 			// complicate diagnosing and recovering from the failure.
 			onExit(cs)

 			// Re-panic to ensure the node terminates.
 			//
 			// TODO(creachadair): In ordinary operation, the WAL autofile should
 			// never be closed. This only happens during shutdown and production
 			// nodes usually halt by panicking. Many existing tests, however,
 			// assume a clean shutdown is possible. Prior to #8111, we were
 			// swallowing the panic in receiveRoutine, making that appear to
 			// work. Filtering this specific error is slightly risky, but should
 			// affect only unit tests. In any case, not re-panicking here only
 			// preserves the pre-existing behavior for this one error type.
 			if err, ok := r.(error); ok && errors.Is(err, autofile.ErrAutoFileClosed) {
 				return
 			}
 			panic(r)
 		}
 	}()

@ -942,8 +955,8 @@ func (cs *State) receiveRoutine(ctx context.Context, maxSteps int) {
 		case mi := <-cs.internalMsgQueue:
 			err := cs.wal.WriteSync(mi) // NOTE: fsync
 			if err != nil {
 				panic(fmt.Sprintf(
 					"failed to write %v msg to consensus WAL due to %v; check your file system and restart the node",
 				panic(fmt.Errorf(
 					"failed to write %v msg to consensus WAL due to %w; check your file system and restart the node",
 					mi, err,
 				))
 			}
@ -1984,8 +1997,8 @@ func (cs *State) finalizeCommit(ctx context.Context, height int64) {
 	// restart).
 	endMsg := EndHeightMessage{height}
 	if err := cs.wal.WriteSync(endMsg); err != nil { // NOTE: fsync
 		panic(fmt.Sprintf(
 			"failed to write %v msg to consensus WAL due to %v; check your file system and restart the node",
 		panic(fmt.Errorf(
 			"failed to write %v msg to consensus WAL due to %w; check your file system and restart the node",
 			endMsg, err,
 		))
 	}
--- a/internal/libs/autofile/autofile.go
+++ b/internal/libs/autofile/autofile.go
@ -41,9 +41,9 @@ const (
 	autoFilePerms       = os.FileMode(0600)
 )

 // errAutoFileClosed is reported when operations attempt to use an autofile
 // ErrAutoFileClosed is reported when operations attempt to use an autofile
 // after it has been closed.
 var errAutoFileClosed = errors.New("autofile is closed")
 var ErrAutoFileClosed = errors.New("autofile is closed")

 // AutoFile automatically closes and re-opens file for writing. The file is
 // automatically setup to close itself every 1s and upon receiving SIGHUP.
@ -155,7 +155,7 @@ func (af *AutoFile) Write(b []byte) (n int, err error) {
 	af.mtx.Lock()
 	defer af.mtx.Unlock()
 	if af.closed {
 		return 0, fmt.Errorf("write: %w", errAutoFileClosed)
 		return 0, fmt.Errorf("write: %w", ErrAutoFileClosed)
 	}

 	if af.file == nil {
@ -174,7 +174,7 @@ func (af *AutoFile) Write(b []byte) (n int, err error) {
 func (af *AutoFile) Sync() error {
 	return af.withLock(func() error {
 		if af.closed {
 			return fmt.Errorf("sync: %w", errAutoFileClosed)
 			return fmt.Errorf("sync: %w", ErrAutoFileClosed)
 		} else if af.file == nil {
 			return nil // nothing to sync
 		}
@ -207,7 +207,7 @@ func (af *AutoFile) Size() (int64, error) {
 	af.mtx.Lock()
 	defer af.mtx.Unlock()
 	if af.closed {
 		return 0, fmt.Errorf("size: %w", errAutoFileClosed)
 		return 0, fmt.Errorf("size: %w", ErrAutoFileClosed)
 	}

 	if af.file == nil {
--- a/spec/abci++/abci++_basic_concepts_002_draft.md
+++ b/spec/abci++/abci++_basic_concepts_002_draft.md
@ -214,7 +214,7 @@ transactions and compute the same results.

 Some Applications may choose to execute the blocks that are about to be proposed
 (via `PrepareProposal`), or those that the Application is asked to validate
 (via `Processproposal`). However, the state changes caused by processing those
 (via `ProcessProposal`). However, the state changes caused by processing those
 proposed blocks must never replace the previous state until `FinalizeBlock` confirms
 the block decided.