Browse Source

Merge branch 'master' into node-initalization-avoid-panic

pull/8091/head
Sam Kleinman 3 years ago
committed by GitHub
parent
commit
bfd08660ef
No known key found for this signature in database GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 38 additions and 23 deletions
  1. +7
    -2
      internal/consensus/common_test.go
  2. +0
    -3
      internal/consensus/reactor_test.go
  3. +25
    -12
      internal/consensus/state.go
  4. +5
    -5
      internal/libs/autofile/autofile.go
  5. +1
    -1
      spec/abci++/abci++_basic_concepts_002_draft.md

+ 7
- 2
internal/consensus/common_test.go View File

@ -69,6 +69,9 @@ func configSetup(t *testing.T) *config.Config {
require.NoError(t, err)
t.Cleanup(func() { os.RemoveAll(configByzantineTest.RootDir) })
walDir := filepath.Dir(cfg.Consensus.WalFile())
ensureDir(t, walDir, 0700)
return cfg
}
@ -787,6 +790,7 @@ func makeConsensusState(
configOpts ...func(*config.Config),
) ([]*State, cleanupFunc) {
t.Helper()
tempDir := t.TempDir()
valSet, privVals := factory.ValidatorSet(ctx, t, nValidators, 30)
genDoc := factory.GenesisDoc(cfg, time.Now(), valSet.Validators, nil)
@ -801,7 +805,7 @@ func makeConsensusState(
blockStore := store.NewBlockStore(dbm.NewMemDB()) // each state needs its own db
state, err := sm.MakeGenesisState(genDoc)
require.NoError(t, err)
thisConfig, err := ResetConfig(t.TempDir(), fmt.Sprintf("%s_%d", testName, i))
thisConfig, err := ResetConfig(tempDir, fmt.Sprintf("%s_%d", testName, i))
require.NoError(t, err)
configRootDirs = append(configRootDirs, thisConfig.RootDir)
@ -810,7 +814,8 @@ func makeConsensusState(
opt(thisConfig)
}
ensureDir(t, filepath.Dir(thisConfig.Consensus.WalFile()), 0700) // dir for wal
walDir := filepath.Dir(thisConfig.Consensus.WalFile())
ensureDir(t, walDir, 0700)
app := kvstore.NewApplication()
closeFuncs = append(closeFuncs, app.Close)


+ 0
- 3
internal/consensus/reactor_test.go View File

@ -5,7 +5,6 @@ import (
"errors"
"fmt"
"os"
"path"
"sync"
"testing"
"time"
@ -486,7 +485,6 @@ func TestReactorWithEvidence(t *testing.T) {
defer os.RemoveAll(thisConfig.RootDir)
ensureDir(t, path.Dir(thisConfig.Consensus.WalFile()), 0700) // dir for wal
app := kvstore.NewApplication()
vals := types.TM2PB.ValidatorUpdates(state.Validators)
app.InitChain(abci.RequestInitChain{Validators: vals})
@ -598,7 +596,6 @@ func TestReactorCreatesBlockWhenEmptyBlocksFalse(t *testing.T) {
c.Consensus.CreateEmptyBlocks = false
},
)
t.Cleanup(cleanup)
rts := setup(ctx, t, n, states, 100) // buffer must be large enough to not deadlock


+ 25
- 12
internal/consensus/state.go View File

@ -20,6 +20,7 @@ import (
cstypes "github.com/tendermint/tendermint/internal/consensus/types"
"github.com/tendermint/tendermint/internal/eventbus"
"github.com/tendermint/tendermint/internal/jsontypes"
"github.com/tendermint/tendermint/internal/libs/autofile"
sm "github.com/tendermint/tendermint/internal/state"
tmevents "github.com/tendermint/tendermint/libs/events"
"github.com/tendermint/tendermint/libs/log"
@ -905,15 +906,27 @@ func (cs *State) receiveRoutine(ctx context.Context, maxSteps int) {
defer func() {
if r := recover(); r != nil {
cs.logger.Error("CONSENSUS FAILURE!!!", "err", r, "stack", string(debug.Stack()))
// stop gracefully
//
// NOTE: We most probably shouldn't be running any further when there is
// some unexpected panic. Some unknown error happened, and so we don't
// know if that will result in the validator signing an invalid thing. It
// might be worthwhile to explore a mechanism for manual resuming via
// some console or secure RPC system, but for now, halting the chain upon
// unexpected consensus bugs sounds like the better option.
// Make a best-effort attempt to close the WAL, but otherwise do not
// attempt to gracefully terminate. Once consensus has irrecoverably
// failed, any additional progress we permit the node to make may
// complicate diagnosing and recovering from the failure.
onExit(cs)
// Re-panic to ensure the node terminates.
//
// TODO(creachadair): In ordinary operation, the WAL autofile should
// never be closed. This only happens during shutdown and production
// nodes usually halt by panicking. Many existing tests, however,
// assume a clean shutdown is possible. Prior to #8111, we were
// swallowing the panic in receiveRoutine, making that appear to
// work. Filtering this specific error is slightly risky, but should
// affect only unit tests. In any case, not re-panicking here only
// preserves the pre-existing behavior for this one error type.
if err, ok := r.(error); ok && errors.Is(err, autofile.ErrAutoFileClosed) {
return
}
panic(r)
}
}()
@ -942,8 +955,8 @@ func (cs *State) receiveRoutine(ctx context.Context, maxSteps int) {
case mi := <-cs.internalMsgQueue:
err := cs.wal.WriteSync(mi) // NOTE: fsync
if err != nil {
panic(fmt.Sprintf(
"failed to write %v msg to consensus WAL due to %v; check your file system and restart the node",
panic(fmt.Errorf(
"failed to write %v msg to consensus WAL due to %w; check your file system and restart the node",
mi, err,
))
}
@ -1984,8 +1997,8 @@ func (cs *State) finalizeCommit(ctx context.Context, height int64) {
// restart).
endMsg := EndHeightMessage{height}
if err := cs.wal.WriteSync(endMsg); err != nil { // NOTE: fsync
panic(fmt.Sprintf(
"failed to write %v msg to consensus WAL due to %v; check your file system and restart the node",
panic(fmt.Errorf(
"failed to write %v msg to consensus WAL due to %w; check your file system and restart the node",
endMsg, err,
))
}


+ 5
- 5
internal/libs/autofile/autofile.go View File

@ -41,9 +41,9 @@ const (
autoFilePerms = os.FileMode(0600)
)
// errAutoFileClosed is reported when operations attempt to use an autofile
// ErrAutoFileClosed is reported when operations attempt to use an autofile
// after it has been closed.
var errAutoFileClosed = errors.New("autofile is closed")
var ErrAutoFileClosed = errors.New("autofile is closed")
// AutoFile automatically closes and re-opens file for writing. The file is
// automatically setup to close itself every 1s and upon receiving SIGHUP.
@ -155,7 +155,7 @@ func (af *AutoFile) Write(b []byte) (n int, err error) {
af.mtx.Lock()
defer af.mtx.Unlock()
if af.closed {
return 0, fmt.Errorf("write: %w", errAutoFileClosed)
return 0, fmt.Errorf("write: %w", ErrAutoFileClosed)
}
if af.file == nil {
@ -174,7 +174,7 @@ func (af *AutoFile) Write(b []byte) (n int, err error) {
func (af *AutoFile) Sync() error {
return af.withLock(func() error {
if af.closed {
return fmt.Errorf("sync: %w", errAutoFileClosed)
return fmt.Errorf("sync: %w", ErrAutoFileClosed)
} else if af.file == nil {
return nil // nothing to sync
}
@ -207,7 +207,7 @@ func (af *AutoFile) Size() (int64, error) {
af.mtx.Lock()
defer af.mtx.Unlock()
if af.closed {
return 0, fmt.Errorf("size: %w", errAutoFileClosed)
return 0, fmt.Errorf("size: %w", ErrAutoFileClosed)
}
if af.file == nil {


+ 1
- 1
spec/abci++/abci++_basic_concepts_002_draft.md View File

@ -214,7 +214,7 @@ transactions and compute the same results.
Some Applications may choose to execute the blocks that are about to be proposed
(via `PrepareProposal`), or those that the Application is asked to validate
(via `Processproposal`). However, the state changes caused by processing those
(via `ProcessProposal`). However, the state changes caused by processing those
proposed blocks must never replace the previous state until `FinalizeBlock` confirms
the block decided.


Loading…
Cancel
Save