@ -2,19 +2,24 @@ package consensus
import (
"bytes"
"context"
"errors"
"fmt"
"io"
"io/ioutil"
"os"
"path"
"runtime"
"testing"
"time"
"github.com/stretchr/testify/require"
"github.com/tendermint/abci/example/dummy"
abci "github.com/tendermint/abci/types"
crypto "github.com/tendermint/go-crypto"
wire "github.com/tendermint/go-wire"
auto "github.com/tendermint/tmlibs/autofile"
cmn "github.com/tendermint/tmlibs/common"
dbm "github.com/tendermint/tmlibs/db"
@ -25,8 +30,10 @@ import (
"github.com/tendermint/tmlibs/log"
)
var consensusReplayConfig * cfg . Config
func init ( ) {
config = ResetConfig ( "consensus_replay_test" )
consensusReplayCon fig = ResetConfig ( "consensus_replay_test" )
}
// These tests ensure we can always recover from failure at any part of the consensus process.
@ -39,8 +46,7 @@ func init() {
// NOTE: Files in this dir are generated by running the `build.sh` therein.
// It's a simple way to generate wals for a single block, or multiple blocks, with random transactions,
// and different part sizes. The output is not deterministic, and the stepChanges may need to be adjusted
// after running it (e.g. sometimes small_block2 will have 5 block parts, sometimes 6).
// It should only have to be re-run if there is some breaking change to the consensus data structures (eg. blocks, votes)
// or to the behaviour of the app (eg. computes app hash differently)
var data_dir = path . Join ( cmn . GoPath ( ) , "src/github.com/tendermint/tendermint/consensus" , "test_data" )
@ -52,230 +58,209 @@ var data_dir = path.Join(cmn.GoPath(), "src/github.com/tendermint/tendermint/con
// and which ones we need the wal for - then we'd also be able to only flush the
// wal writer when we need to, instead of with every message.
// baseStepChanges lists the WAL lines at which the priv validator changes
// step for a block with 1 validator and 1 block part.
var baseStepChanges = [ ] int { 3 , 6 , 8 }
// testCases enumerates the WAL fixtures; recovery is tested from each line
// of each testCase. Every entry pairs a fixture name with its three step-change lines.
var testCases = [ ] * testCase {
newTestCase ( "empty_block" , baseStepChanges ) , // empty block (has 1 block part)
newTestCase ( "small_block1" , baseStepChanges ) , // small block with txs in 1 block part
newTestCase ( "small_block2" , [ ] int { 3 , 12 , 14 } ) , // small block with txs across 6 smaller block parts
}
// testCase describes one WAL fixture together with the lines at which the
// priv validator moves from one step to the next.
type testCase struct {
	// name identifies the fixture (it is also the base name of the WAL file).
	name string

	// log holds the full consensus-state WAL.
	log []byte

	// stepMap maps lines of log to the privval step.
	stepMap map[int]int8

	// Step-change lines, in the order they were supplied to newTestCase.
	proposeLine, prevoteLine, precommitLine int
}
// newTestCase builds the testCase for the fixture called name. The WAL is
// read from "<name>.cswal" under data_dir, and the three entries of
// stepChanges are stored as proposeLine, prevoteLine and precommitLine.
// It panics unless exactly three step changes are given.
func newTestCase(name string, stepChanges []int) *testCase {
	if len(stepChanges) != 3 {
		panic(cmn.Fmt("a full wal has 3 step changes! Got array %v", stepChanges))
	}

	walPath := path.Join(data_dir, name+".cswal")

	tc := new(testCase)
	tc.name = name
	tc.log = readWAL(walPath)
	tc.stepMap = newMapFromChanges(stepChanges)
	tc.proposeLine, tc.prevoteLine, tc.precommitLine = stepChanges[0], stepChanges[1], stepChanges[2]
	return tc
}
// newMapFromChanges expands a list of step-change lines into a per-line lookup
// table. Every line below changes[0] maps to step 0, every line from
// changes[0] up to (not including) changes[1] maps to step 1, and so on; the
// last change line itself maps to step len(changes).
//
// changes is expected to be in ascending order. An empty slice yields an empty
// map. The caller's slice is never modified.
func newMapFromChanges(changes []int) map[int]int8 {
	m := make(map[int]int8)
	if len(changes) == 0 {
		return m
	}

	// Work on a private copy: appending directly to changes could overwrite
	// the caller's backing array when it has spare capacity.
	bounds := make([]int, 0, len(changes)+1)
	bounds = append(bounds, changes...)
	// One extra bound past the final change so that the last step change
	// itself is added to the map.
	bounds = append(bounds, changes[len(changes)-1]+1)

	line := 0
	for step, next := range bounds {
		for ; line < next; line++ {
			m[line] = int8(step)
		}
	}
	return m
}
// readWAL returns the full contents of the file at p and panics with the
// underlying error if the file cannot be read.
func readWAL(p string) []byte {
	contents, readErr := ioutil.ReadFile(p)
	if readErr == nil {
		return contents
	}
	panic(readErr)
}
// writeWAL stores walMsgs in a freshly created temporary file (prefix "wal")
// and returns that file's name. Any failure to create, write or close the
// file results in a panic.
func writeWAL(walMsgs []byte) string {
	tmp, createErr := ioutil.TempFile("", "wal")
	if createErr != nil {
		panic(fmt.Errorf("failed to create temp WAL file: %v", createErr))
	}

	if _, writeErr := tmp.Write(walMsgs); writeErr != nil {
		panic(fmt.Errorf("failed to write to temp WAL file: %v", writeErr))
	}

	closeErr := tmp.Close()
	if closeErr != nil {
		panic(fmt.Errorf("failed to close temp WAL file: %v", closeErr))
	}

	return tmp.Name()
}
func waitForBlock ( newBlockCh chan interface { } , thisCase * testCase , i int ) {
after := time . After ( time . Second * 10 )
func startNewConsensusStateAndWaitForBlock ( t * testing . T , lastBlockHeight int , blockDB dbm . DB , stateDB dbm . DB ) {
logger := log . TestingLogger ( )
state , _ := sm . GetState ( stateDB , consensusReplayConfig . GenesisFile ( ) )
state . SetLogger ( logger . With ( "module" , "state" ) )
privValidator := loadPrivValidator ( consensusReplayConfig )
cs := newConsensusStateWithConfigAndBlockStore ( consensusReplayConfig , state , privValidator , dummy . NewDummyApplication ( ) , blockDB )
cs . SetLogger ( logger )
bytes , _ := ioutil . ReadFile ( cs . config . WalFile ( ) )
// fmt.Printf("====== WAL: \n\r%s\n", bytes)
t . Logf ( "====== WAL: \n\r%s\n" , bytes )
_ , err := cs . Start ( )
require . NoError ( t , err )
defer func ( ) {
cs . Stop ( )
} ( )
// This is just a signal that we haven't halted; it's not something contained
// in the WAL itself. Assuming the consensus state is running, replay of any
// WAL, including the empty one, should eventually be followed by a new
// block, or else something is wrong.
newBlockCh := make ( chan interface { } , 1 )
err = cs . eventBus . Subscribe ( context . Background ( ) , testSubscriber , types . EventQueryNewBlock , newBlockCh )
require . NoError ( t , err )
select {
case <- newBlockCh :
case <- a fter:
panic ( cmn . Fmt ( "Timed out waiting for new block for case '%s' line %d" , thisCase . name , i ) )
case <- time . After ( 10 * time . Second ) :
t . Fatalf ( "Timed out waiting for new block (see trace above)" )
}
}
func runReplayTest ( t * testing . T , cs * ConsensusState , walFile string , newBlockCh chan interface { } ,
thisCase * testCase , i int ) {
cs . config . SetWalFile ( walFile )
started , err := cs . Start ( )
if err != nil {
t . Fatalf ( "Cannot start consensus: %v" , err )
}
if ! started {
t . Error ( "Consensus did not start" )
}
// Wait to make a new block.
// This is just a signal that we haven't halted; its not something contained in the WAL itself.
// Assuming the consensus state is running, replay of any WAL, including the empty one,
// should eventually be followed by a new block, or else something is wrong
waitForBlock ( newBlockCh , thisCase , i )
cs . evsw . Stop ( )
cs . Stop ( )
LOOP :
func sendTxs ( cs * ConsensusState , ctx context . Context ) {
i := 0
for {
select {
case <- newBlockCh :
case <- ctx . Done ( ) :
return
default :
break LOOP
cs . mempool . CheckTx ( [ ] byte { byte ( i ) } , nil )
i ++
}
}
cs . Wait ( )
}
func toPV ( pv types . PrivValidator ) * types . PrivValidatorFS {
return pv . ( * types . PrivValidatorFS )
// TestWALCrash uses a crashing WAL to test that we can recover from any WAL
// failure. Each case supplies an init function, run alongside consensus, and
// the height at which the run should stop.
func TestWALCrash(t *testing.T) {
	type crashCase struct {
		name         string
		initFn       func(*ConsensusState, context.Context)
		heightToStop uint64
	}

	// noTxs leaves the consensus state untouched, so only empty blocks are made.
	noTxs := func(cs *ConsensusState, ctx context.Context) {}

	// smallParts lowers the block part size before feeding transactions.
	smallParts := func(cs *ConsensusState, ctx context.Context) {
		// XXX: is there a better way to change BlockPartSizeBytes?
		params := cs.state.Params
		params.BlockPartSizeBytes = 512
		cs.state.Params = params
		sendTxs(cs, ctx)
	}

	cases := []crashCase{
		{name: "empty block", initFn: noTxs, heightToStop: 1},
		{name: "block with a smaller part size", initFn: smallParts, heightToStop: 1},
		{name: "many non-empty blocks", initFn: sendTxs, heightToStop: 3},
	}

	for i := range cases {
		c := cases[i]
		t.Run(c.name, func(t *testing.T) {
			crashWALandCheckLiveness(t, c.initFn, c.heightToStop)
		})
	}
}
func setupReplayTest ( t * testing . T , thisCase * testCase , nLines int , crashAfter bool ) ( * ConsensusState , chan interface { } , [ ] byte , string ) {
t . Log ( "-------------------------------------" )
t . Logf ( "Starting replay test %v (of %d lines of WAL). Crash after = %v" , thisCase . name , nLines , crashAfter )
func crashWALandCheckLiveness ( t * testing . T , initFn func ( * ConsensusState , context . Context ) , heightToStop uint64 ) {
walPaniced := make ( chan error )
crashingWal := & crashingWAL { panicCh : walPaniced , heightToStop : heightToStop }
lineStep := nLines
if crashAfter {
lineStep -= 1
}
i := 1
LOOP :
for {
// fmt.Printf("====== LOOP %d\n", i)
t . Logf ( "====== LOOP %d\n" , i )
// create consensus state from a clean slate
logger := log . NewNopLogger ( )
stateDB := dbm . NewMemDB ( )
state , _ := sm . MakeGenesisStateFromFile ( stateDB , consensusReplayConfig . GenesisFile ( ) )
state . SetLogger ( logger . With ( "module" , "state" ) )
privValidator := loadPrivValidator ( consensusReplayConfig )
blockDB := dbm . NewMemDB ( )
cs := newConsensusStateWithConfigAndBlockStore ( consensusReplayConfig , state , privValidator , dummy . NewDummyApplication ( ) , blockDB )
cs . SetLogger ( logger )
// start sending transactions
ctx , cancel := context . WithCancel ( context . Background ( ) )
go initFn ( cs , ctx )
// clean up WAL file from the previous iteration
walFile := cs . config . WalFile ( )
os . Remove ( walFile )
// set crashing WAL
csWal , err := cs . OpenWAL ( walFile )
require . NoError ( t , err )
crashingWal . next = csWal
// reset the message counter
crashingWal . msgIndex = 1
cs . wal = crashingWal
// start consensus state
_ , err = cs . Start ( )
require . NoError ( t , err )
i ++
split := bytes . Split ( thisCase . log , walSeparator )
lastMsg := split [ nLines ]
select {
case err := <- walPaniced :
t . Logf ( "WAL paniced: %v" , err )
// we write those lines up to (not including) one with the signature
b := bytes . Join ( split [ : nLines ] , walSeparator )
b = append ( b , walSeparator ... )
walFile := writeWAL ( b )
// make sure we can make blocks after a crash
startNewConsensusStateAndWaitForBlock ( t , cs . Height , blockDB , stateDB )
cs := fixedConsensusStateDummy ( )
// stop consensus state and transactions sender (initFn)
cs . Stop ( )
cancel ( )
// set the last step according to when we crashed vs the wal
toPV ( cs . privValidator ) . LastHeight = 1 // first block
toPV ( cs . privValidator ) . LastStep = thisCase . stepMap [ lineStep ]
// if we reached the required height, exit
if _ , ok := err . ( ReachedHeightToStopError ) ; ok {
break LOOP
}
case <- time . After ( 10 * time . Second ) :
t . Fatal ( "WAL did not panic for 10 seconds (check the log)" )
}
}
}
t . Logf ( "[WARN] setupReplayTest LastStep=%v" , toPV ( cs . privValidator ) . LastStep )
// crashingWAL is a WAL which crashes or rather simulates a crash during Save
// (before and after). It remembers a message for which we last panicked
// (lastPanicedForMsgIndex), so we don't panic for it in subsequent iterations.
type crashingWAL struct {
next WAL
panicCh chan error
heightToStop uint64
newBlockCh := subscribeToEvent ( cs . evsw , "tester" , types . EventStringNewBlock ( ) , 1 )
msgIndex int // current message index
lastPanicedForMsgIndex int // last message for which we panicked
}
return cs , newBlockCh , lastMsg , walFile
// WALWriteError indicates a WAL crash. It is the error value that
// crashingWAL.Save sends on its panic channel when it simulates a failed write.
type WALWriteError struct {
// msg is the description returned by Error.
msg string
}
func readTimedWALMessage ( t * testing . T , rawMsg [ ] byte ) TimedWALMessage {
b := bytes . NewBuffer ( rawMsg )
// because rawMsg does not contain a separator and WALDecoder#Decode expects it
_ , err := b . Write ( walSeparator )
if err != nil {
t . Fatal ( err )
}
dec := NewWALDecoder ( b )
msg , err := dec . Decode ( )
if err != nil {
t . Fatalf ( "Error reading json data: %v" , err )
}
return * msg
// Error implements the error interface by returning the stored message.
func ( e WALWriteError ) Error ( ) string {
return e . msg
}
//-----------------------------------------------
// Test the log at every iteration, and set the privVal last step
// as if the log was written after signing, before the crash
func TestWALCrashAfterWrite ( t * testing . T ) {
for _ , thisCase := range testCases {
splitSize := bytes . Count ( thisCase . log , walSeparator )
for i := 0 ; i < splitSize - 1 ; i ++ {
t . Run ( fmt . Sprintf ( "%s:%d" , thisCase . name , i ) , func ( t * testing . T ) {
cs , newBlockCh , _ , walFile := setupReplayTest ( t , thisCase , i + 1 , true )
cs . config . TimeoutPropose = 100
runReplayTest ( t , cs , walFile , newBlockCh , thisCase , i + 1 )
// cleanup
os . Remove ( walFile )
} )
}
}
// ReachedHeightToStopError indicates we've reached the required consensus
// height and may exit. It is a stop signal rather than a failure: the test
// loop ends when it receives this error.
type ReachedHeightToStopError struct {
// height is the consensus height that was reached.
height uint64
}
//-----------------------------------------------
// Test the log as if we crashed after signing but before writing.
// This relies on privValidator.LastSignature being set
func TestWALCrashBeforeWritePropose ( t * testing . T ) {
for _ , thisCase := range testCases {
lineNum := thisCase . proposeLine
t . Run ( fmt . Sprintf ( "%s:%d" , thisCase . name , lineNum ) , func ( t * testing . T ) {
// setup replay test where last message is a proposal
cs , newBlockCh , proposalMsg , walFile := setupReplayTest ( t , thisCase , lineNum , false )
cs . config . TimeoutPropose = 100
msg := readTimedWALMessage ( t , proposalMsg )
proposal := msg . Msg . ( msgInfo ) . Msg . ( * ProposalMessage )
// Set LastSig
toPV ( cs . privValidator ) . LastSignBytes = types . SignBytes ( cs . state . ChainID , proposal . Proposal )
toPV ( cs . privValidator ) . LastSignature = proposal . Proposal . Signature
runReplayTest ( t , cs , walFile , newBlockCh , thisCase , lineNum )
// cleanup
os . Remove ( walFile )
} )
}
// Error implements the error interface; the message includes the height
// stored in the error.
func ( e ReachedHeightToStopError ) Error ( ) string {
return fmt . Sprintf ( "reached height to stop %d" , e . height )
}
func TestWALCrashBeforeWritePrevote ( t * testing . T ) {
for _ , thisCase := range testCases {
testReplayCrashBeforeWriteVote ( t , thisCase , thisCase . prevoteLine , types . EventStringCompleteProposal ( ) )
// Save simulates a WAL crash by sending an error to the panicCh and then
// exiting the cs.receiveRoutine.
func ( w * crashingWAL ) Save ( m WALMessage ) {
if endMsg , ok := m . ( EndHeightMessage ) ; ok {
if endMsg . Height == w . heightToStop {
w . panicCh <- ReachedHeightToStopError { endMsg . Height }
runtime . Goexit ( )
} else {
w . next . Save ( m )
}
return
}
}
func TestWALCrashBeforeWritePrecommit ( t * testing . T ) {
for _ , thisCase := range testCases {
testReplayCrashBeforeWriteVote ( t , thisCase , thisCase . precommitLine , types . EventStringPolka ( ) )
if w . msgIndex > w . lastPanicedForMsgIndex {
w . lastPanicedForMsgIndex = w . msgIndex
_ , file , line , _ := runtime . Caller ( 1 )
w . panicCh <- WALWriteError { fmt . Sprintf ( "failed to write %T to WAL (fileline: %s:%d)" , m , file , line ) }
runtime . Goexit ( )
} else {
w . msgIndex ++
w . next . Save ( m )
}
}
func testReplayCrashBeforeWriteVote ( t * testing . T , thisCase * testCase , lineNum int , eventString string ) {
// setup replay test where last message is a vote
cs , newBlockCh , voteMsg , walFile := setupReplayTest ( t , thisCase , lineNum , false )
types . AddListenerForEvent ( cs . evsw , "tester" , eventString , func ( data types . TMEventData ) {
msg := readTimedWALMessage ( t , voteMsg )
vote := msg . Msg . ( msgInfo ) . Msg . ( * VoteMessage )
// Set LastSig
toPV ( cs . privValidator ) . LastSignBytes = types . SignBytes ( cs . state . ChainID , vote . Vote )
toPV ( cs . privValidator ) . LastSignature = vote . Vote . Signature
} )
runReplayTest ( t , cs , walFile , newBlockCh , thisCase , lineNum )
// Group forwards the call to the wrapped WAL (w.next) and returns its group.
func (w *crashingWAL) Group() *auto.Group {
	inner := w.next
	return inner.Group()
}
// SearchForEndHeight forwards the search for height to the wrapped WAL
// (w.next) and returns its results unchanged.
func (w *crashingWAL) SearchForEndHeight(height uint64) (gr *auto.GroupReader, found bool, err error) {
	gr, found, err = w.next.SearchForEndHeight(height)
	return gr, found, err
}
// Start forwards the call to the wrapped WAL (w.next) and returns its
// results unchanged.
func (w *crashingWAL) Start() (bool, error) {
	started, err := w.next.Start()
	return started, err
}
// Stop forwards the call to the wrapped WAL (w.next) and returns its result
// unchanged.
func (w *crashingWAL) Stop() bool {
	stopped := w.next.Stop()
	return stopped
}
// Wait forwards the call to the wrapped WAL (w.next).
func (w *crashingWAL) Wait() {
	inner := w.next
	inner.Wait()
}
//------------------------------------------------------------------------------------------
// Handshake Tests
@ -320,6 +305,21 @@ func TestHandshakeReplayNone(t *testing.T) {
}
}
// writeWAL dumps walMsgs into a new temporary file whose name starts with
// "wal" and returns the file's name. It panics if the file cannot be created,
// written or closed.
func writeWAL(walMsgs []byte) string {
	out, err := ioutil.TempFile("", "wal")
	if err != nil {
		panic(fmt.Errorf("failed to create temp WAL file: %v", err))
	}

	if _, err = out.Write(walMsgs); err != nil {
		panic(fmt.Errorf("failed to write to temp WAL file: %v", err))
	}

	if err = out.Close(); err != nil {
		panic(fmt.Errorf("failed to close temp WAL file: %v", err))
	}

	name := out.Name()
	return name
}
// Make some blocks. Start a fresh app and apply nBlocks blocks. Then restart the app and sync it up with the remaining blocks
func testHandshakeReplay ( t * testing . T , nBlocks int , mode uint ) {
config := ResetConfig ( "proxy_test_" )
@ -397,7 +397,7 @@ func testHandshakeReplay(t *testing.T, nBlocks int, mode uint) {
func applyBlock ( st * sm . State , blk * types . Block , proxyApp proxy . AppConns ) {
testPartSize := st . Params . BlockPartSizeBytes
err := st . ApplyBlock ( nil , proxyApp . Consensus ( ) , blk , blk . MakePartSet ( testPartSize ) . Header ( ) , mempool )
err := st . ApplyBlock ( types . NopEventBus { } , proxyApp . Consensus ( ) , blk , blk . MakePartSet ( testPartSize ) . Header ( ) , mempool )
if err != nil {
panic ( err )
}
@ -477,7 +477,7 @@ func buildTMStateFromChain(config *cfg.Config, state *sm.State, chain []*types.B
//--------------------------
// utils for making blocks
func makeBlockchainFromWAL ( wal * WAL ) ( [ ] * types . Block , [ ] * types . Commit , error ) {
func makeBlockchainFromWAL ( wal WAL ) ( [ ] * types . Block , [ ] * types . Commit , error ) {
// Search for height marker
gr , found , err := wal . SearchForEndHeight ( 0 )
if err != nil {