Browse Source

Merge branch 'master' into node-initalization-avoid-panic

pull/8091/head
Sam Kleinman 3 years ago
committed by GitHub
parent
commit
bfd08660ef
No known key found for this signature in database GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 38 additions and 23 deletions
  1. +7
    -2
      internal/consensus/common_test.go
  2. +0
    -3
      internal/consensus/reactor_test.go
  3. +25
    -12
      internal/consensus/state.go
  4. +5
    -5
      internal/libs/autofile/autofile.go
  5. +1
    -1
      spec/abci++/abci++_basic_concepts_002_draft.md

+ 7
- 2
internal/consensus/common_test.go View File

@ -69,6 +69,9 @@ func configSetup(t *testing.T) *config.Config {
require.NoError(t, err) require.NoError(t, err)
t.Cleanup(func() { os.RemoveAll(configByzantineTest.RootDir) }) t.Cleanup(func() { os.RemoveAll(configByzantineTest.RootDir) })
walDir := filepath.Dir(cfg.Consensus.WalFile())
ensureDir(t, walDir, 0700)
return cfg return cfg
} }
@ -787,6 +790,7 @@ func makeConsensusState(
configOpts ...func(*config.Config), configOpts ...func(*config.Config),
) ([]*State, cleanupFunc) { ) ([]*State, cleanupFunc) {
t.Helper() t.Helper()
tempDir := t.TempDir()
valSet, privVals := factory.ValidatorSet(ctx, t, nValidators, 30) valSet, privVals := factory.ValidatorSet(ctx, t, nValidators, 30)
genDoc := factory.GenesisDoc(cfg, time.Now(), valSet.Validators, nil) genDoc := factory.GenesisDoc(cfg, time.Now(), valSet.Validators, nil)
@ -801,7 +805,7 @@ func makeConsensusState(
blockStore := store.NewBlockStore(dbm.NewMemDB()) // each state needs its own db blockStore := store.NewBlockStore(dbm.NewMemDB()) // each state needs its own db
state, err := sm.MakeGenesisState(genDoc) state, err := sm.MakeGenesisState(genDoc)
require.NoError(t, err) require.NoError(t, err)
thisConfig, err := ResetConfig(t.TempDir(), fmt.Sprintf("%s_%d", testName, i))
thisConfig, err := ResetConfig(tempDir, fmt.Sprintf("%s_%d", testName, i))
require.NoError(t, err) require.NoError(t, err)
configRootDirs = append(configRootDirs, thisConfig.RootDir) configRootDirs = append(configRootDirs, thisConfig.RootDir)
@ -810,7 +814,8 @@ func makeConsensusState(
opt(thisConfig) opt(thisConfig)
} }
ensureDir(t, filepath.Dir(thisConfig.Consensus.WalFile()), 0700) // dir for wal
walDir := filepath.Dir(thisConfig.Consensus.WalFile())
ensureDir(t, walDir, 0700)
app := kvstore.NewApplication() app := kvstore.NewApplication()
closeFuncs = append(closeFuncs, app.Close) closeFuncs = append(closeFuncs, app.Close)


+ 0
- 3
internal/consensus/reactor_test.go View File

@ -5,7 +5,6 @@ import (
"errors" "errors"
"fmt" "fmt"
"os" "os"
"path"
"sync" "sync"
"testing" "testing"
"time" "time"
@ -486,7 +485,6 @@ func TestReactorWithEvidence(t *testing.T) {
defer os.RemoveAll(thisConfig.RootDir) defer os.RemoveAll(thisConfig.RootDir)
ensureDir(t, path.Dir(thisConfig.Consensus.WalFile()), 0700) // dir for wal
app := kvstore.NewApplication() app := kvstore.NewApplication()
vals := types.TM2PB.ValidatorUpdates(state.Validators) vals := types.TM2PB.ValidatorUpdates(state.Validators)
app.InitChain(abci.RequestInitChain{Validators: vals}) app.InitChain(abci.RequestInitChain{Validators: vals})
@ -598,7 +596,6 @@ func TestReactorCreatesBlockWhenEmptyBlocksFalse(t *testing.T) {
c.Consensus.CreateEmptyBlocks = false c.Consensus.CreateEmptyBlocks = false
}, },
) )
t.Cleanup(cleanup) t.Cleanup(cleanup)
rts := setup(ctx, t, n, states, 100) // buffer must be large enough to not deadlock rts := setup(ctx, t, n, states, 100) // buffer must be large enough to not deadlock


+ 25
- 12
internal/consensus/state.go View File

@ -20,6 +20,7 @@ import (
cstypes "github.com/tendermint/tendermint/internal/consensus/types" cstypes "github.com/tendermint/tendermint/internal/consensus/types"
"github.com/tendermint/tendermint/internal/eventbus" "github.com/tendermint/tendermint/internal/eventbus"
"github.com/tendermint/tendermint/internal/jsontypes" "github.com/tendermint/tendermint/internal/jsontypes"
"github.com/tendermint/tendermint/internal/libs/autofile"
sm "github.com/tendermint/tendermint/internal/state" sm "github.com/tendermint/tendermint/internal/state"
tmevents "github.com/tendermint/tendermint/libs/events" tmevents "github.com/tendermint/tendermint/libs/events"
"github.com/tendermint/tendermint/libs/log" "github.com/tendermint/tendermint/libs/log"
@ -905,15 +906,27 @@ func (cs *State) receiveRoutine(ctx context.Context, maxSteps int) {
defer func() { defer func() {
if r := recover(); r != nil { if r := recover(); r != nil {
cs.logger.Error("CONSENSUS FAILURE!!!", "err", r, "stack", string(debug.Stack())) cs.logger.Error("CONSENSUS FAILURE!!!", "err", r, "stack", string(debug.Stack()))
// stop gracefully
//
// NOTE: We most probably shouldn't be running any further when there is
// some unexpected panic. Some unknown error happened, and so we don't
// know if that will result in the validator signing an invalid thing. It
// might be worthwhile to explore a mechanism for manual resuming via
// some console or secure RPC system, but for now, halting the chain upon
// unexpected consensus bugs sounds like the better option.
// Make a best-effort attempt to close the WAL, but otherwise do not
// attempt to gracefully terminate. Once consensus has irrecoverably
// failed, any additional progress we permit the node to make may
// complicate diagnosing and recovering from the failure.
onExit(cs) onExit(cs)
// Re-panic to ensure the node terminates.
//
// TODO(creachadair): In ordinary operation, the WAL autofile should
// never be closed. This only happens during shutdown and production
// nodes usually halt by panicking. Many existing tests, however,
// assume a clean shutdown is possible. Prior to #8111, we were
// swallowing the panic in receiveRoutine, making that appear to
// work. Filtering this specific error is slightly risky, but should
// affect only unit tests. In any case, not re-panicking here only
// preserves the pre-existing behavior for this one error type.
if err, ok := r.(error); ok && errors.Is(err, autofile.ErrAutoFileClosed) {
return
}
panic(r)
} }
}() }()
@ -942,8 +955,8 @@ func (cs *State) receiveRoutine(ctx context.Context, maxSteps int) {
case mi := <-cs.internalMsgQueue: case mi := <-cs.internalMsgQueue:
err := cs.wal.WriteSync(mi) // NOTE: fsync err := cs.wal.WriteSync(mi) // NOTE: fsync
if err != nil { if err != nil {
panic(fmt.Sprintf(
"failed to write %v msg to consensus WAL due to %v; check your file system and restart the node",
panic(fmt.Errorf(
"failed to write %v msg to consensus WAL due to %w; check your file system and restart the node",
mi, err, mi, err,
)) ))
} }
@ -1984,8 +1997,8 @@ func (cs *State) finalizeCommit(ctx context.Context, height int64) {
// restart). // restart).
endMsg := EndHeightMessage{height} endMsg := EndHeightMessage{height}
if err := cs.wal.WriteSync(endMsg); err != nil { // NOTE: fsync if err := cs.wal.WriteSync(endMsg); err != nil { // NOTE: fsync
panic(fmt.Sprintf(
"failed to write %v msg to consensus WAL due to %v; check your file system and restart the node",
panic(fmt.Errorf(
"failed to write %v msg to consensus WAL due to %w; check your file system and restart the node",
endMsg, err, endMsg, err,
)) ))
} }


+ 5
- 5
internal/libs/autofile/autofile.go View File

@ -41,9 +41,9 @@ const (
autoFilePerms = os.FileMode(0600) autoFilePerms = os.FileMode(0600)
) )
// errAutoFileClosed is reported when operations attempt to use an autofile
// ErrAutoFileClosed is reported when operations attempt to use an autofile
// after it has been closed. // after it has been closed.
var errAutoFileClosed = errors.New("autofile is closed")
var ErrAutoFileClosed = errors.New("autofile is closed")
// AutoFile automatically closes and re-opens file for writing. The file is // AutoFile automatically closes and re-opens file for writing. The file is
// automatically setup to close itself every 1s and upon receiving SIGHUP. // automatically setup to close itself every 1s and upon receiving SIGHUP.
@ -155,7 +155,7 @@ func (af *AutoFile) Write(b []byte) (n int, err error) {
af.mtx.Lock() af.mtx.Lock()
defer af.mtx.Unlock() defer af.mtx.Unlock()
if af.closed { if af.closed {
return 0, fmt.Errorf("write: %w", errAutoFileClosed)
return 0, fmt.Errorf("write: %w", ErrAutoFileClosed)
} }
if af.file == nil { if af.file == nil {
@ -174,7 +174,7 @@ func (af *AutoFile) Write(b []byte) (n int, err error) {
func (af *AutoFile) Sync() error { func (af *AutoFile) Sync() error {
return af.withLock(func() error { return af.withLock(func() error {
if af.closed { if af.closed {
return fmt.Errorf("sync: %w", errAutoFileClosed)
return fmt.Errorf("sync: %w", ErrAutoFileClosed)
} else if af.file == nil { } else if af.file == nil {
return nil // nothing to sync return nil // nothing to sync
} }
@ -207,7 +207,7 @@ func (af *AutoFile) Size() (int64, error) {
af.mtx.Lock() af.mtx.Lock()
defer af.mtx.Unlock() defer af.mtx.Unlock()
if af.closed { if af.closed {
return 0, fmt.Errorf("size: %w", errAutoFileClosed)
return 0, fmt.Errorf("size: %w", ErrAutoFileClosed)
} }
if af.file == nil { if af.file == nil {


+ 1
- 1
spec/abci++/abci++_basic_concepts_002_draft.md View File

@ -214,7 +214,7 @@ transactions and compute the same results.
Some Applications may choose to execute the blocks that are about to be proposed Some Applications may choose to execute the blocks that are about to be proposed
(via `PrepareProposal`), or those that the Application is asked to validate (via `PrepareProposal`), or those that the Application is asked to validate
(via `Processproposal`). However, the state changes caused by processing those
(via `ProcessProposal`). However, the state changes caused by processing those
proposed blocks must never replace the previous state until `FinalizeBlock` confirms proposed blocks must never replace the previous state until `FinalizeBlock` confirms
the block decided. the block decided.


Loading…
Cancel
Save