@ -98,6 +98,10 @@ const (
// maxLightBlockRequestRetries is the amount of retries acceptable before
// maxLightBlockRequestRetries is the amount of retries acceptable before
// the backfill process aborts
// the backfill process aborts
maxLightBlockRequestRetries = 20
maxLightBlockRequestRetries = 20
// the amount of processes fetching light blocks - this should be roughly calculated
// as the time to fetch a block / time to verify a block
lightBlockFetchers = 4
)
)
// Reactor handles state sync, both restoring snapshots for the local node and
// Reactor handles state sync, both restoring snapshots for the local node and
@ -206,11 +210,11 @@ func (r *Reactor) OnStop() {
// application. It also saves tendermint state and runs a backfill process to
// application. It also saves tendermint state and runs a backfill process to
// retrieve the necessary amount of headers, commits and validators sets to be
// retrieve the necessary amount of headers, commits and validators sets to be
// able to process evidence and participate in consensus.
// able to process evidence and participate in consensus.
func ( r * Reactor ) Sync ( stateProvider StateProvider , discoveryTime time . Duration ) error {
func ( r * Reactor ) Sync ( stateProvider StateProvider , discoveryTime time . Duration ) ( sm . State , error ) {
r . mtx . Lock ( )
r . mtx . Lock ( )
if r . syncer != nil {
if r . syncer != nil {
r . mtx . Unlock ( )
r . mtx . Unlock ( )
return errors . New ( "a state sync is already in progress" )
return sm . State { } , errors . New ( "a state sync is already in progress" )
}
}
r . syncer = newSyncer ( r . Logger , r . conn , r . connQuery , stateProvider , r . snapshotCh . Out , r . chunkCh . Out , r . tempDir )
r . syncer = newSyncer ( r . Logger , r . conn , r . connQuery , stateProvider , r . snapshotCh . Out , r . chunkCh . Out , r . tempDir )
@ -229,7 +233,7 @@ func (r *Reactor) Sync(stateProvider StateProvider, discoveryTime time.Duration)
state , commit , err := r . syncer . SyncAny ( discoveryTime , hook )
state , commit , err := r . syncer . SyncAny ( discoveryTime , hook )
if err != nil {
if err != nil {
return err
return sm . State { } , err
}
}
r . mtx . Lock ( )
r . mtx . Lock ( )
@ -238,24 +242,41 @@ func (r *Reactor) Sync(stateProvider StateProvider, discoveryTime time.Duration)
err = r . stateStore . Bootstrap ( state )
err = r . stateStore . Bootstrap ( state )
if err != nil {
if err != nil {
return fmt . Errorf ( "failed to bootstrap node with new state: %w" , err )
return sm . State { } , fmt . Errorf ( "failed to bootstrap node with new state: %w" , err )
}
}
err = r . blockStore . SaveSeenCommit ( state . LastBlockHeight , commit )
err = r . blockStore . SaveSeenCommit ( state . LastBlockHeight , commit )
if err != nil {
if err != nil {
return fmt . Errorf ( "failed to store last seen commit: %w" , err )
return sm . State { } , fmt . Errorf ( "failed to store last seen commit: %w" , err )
}
}
// start backfill process to retrieve the necessary headers, commits and
// validator sets
return r . backfill ( state )
return state , nil
}
}
// Backfill sequentially fetches, verifies and stores light blocks in reverse
// Backfill sequentially fetches, verifies and stores light blocks in reverse
// order. It does not stop verifying blocks until reaching a block with a height
// order. It does not stop verifying blocks until reaching a block with a height
// and time that is less or equal to the stopHeight and stopTime. The
// and time that is less or equal to the stopHeight and stopTime. The
// trustedBlockID should be of the header at startHeight.
// trustedBlockID should be of the header at startHeight.
func ( r * Reactor ) Backfill (
func ( r * Reactor ) Backfill ( state sm . State ) error {
params := state . ConsensusParams . Evidence
stopHeight := state . LastBlockHeight - params . MaxAgeNumBlocks
stopTime := state . LastBlockTime . Add ( - params . MaxAgeDuration )
// ensure that stop height doesn't go below the initial height
if stopHeight < state . InitialHeight {
stopHeight = state . InitialHeight
// this essentially makes stop time a void criteria for termination
stopTime = state . LastBlockTime
}
return r . backfill (
context . Background ( ) ,
state . ChainID ,
state . LastBlockHeight , stopHeight ,
state . LastBlockID ,
stopTime ,
)
}
func ( r * Reactor ) backfill (
ctx context . Context ,
ctx context . Context ,
chainID string ,
chainID string ,
startHeight , stopHeight int64 ,
startHeight , stopHeight int64 ,
@ -265,6 +286,7 @@ func (r *Reactor) Backfill(
r . Logger . Info ( "starting backfill process..." , "startHeight" , startHeight ,
r . Logger . Info ( "starting backfill process..." , "startHeight" , startHeight ,
"stopHeight" , stopHeight , "trustedBlockID" , trustedBlockID )
"stopHeight" , stopHeight , "trustedBlockID" , trustedBlockID )
const sleepTime = 1 * time . Second
var (
var (
lastValidatorSet * types . ValidatorSet
lastValidatorSet * types . ValidatorSet
lastChangeHeight int64 = startHeight
lastChangeHeight int64 = startHeight
@ -277,7 +299,7 @@ func (r *Reactor) Backfill(
// time. Ideally we want the verification process to never have to be
// time. Ideally we want the verification process to never have to be
// waiting on blocks. If it takes 4s to retrieve a block and 1s to verify
// waiting on blocks. If it takes 4s to retrieve a block and 1s to verify
// it, then steady state involves four workers.
// it, then steady state involves four workers.
for i := 0 ; i < 4 ; i ++ {
for i := 0 ; i < lightBlockFetchers ; i ++ {
go func ( ) {
go func ( ) {
for {
for {
select {
select {
@ -285,30 +307,33 @@ func (r *Reactor) Backfill(
r . Logger . Debug ( "fetching next block" , "height" , height )
r . Logger . Debug ( "fetching next block" , "height" , height )
lb , peer , err := r . dispatcher . LightBlock ( ctx , height )
lb , peer , err := r . dispatcher . LightBlock ( ctx , height )
if err != nil {
if err != nil {
// we don't punish the peer as it might just not have the block
// at that height
r . Logger . Info ( "error with fetching light block" ,
"height" , height , "err" , err )
queue . retry ( height )
queue . retry ( height )
if errors . Is ( err , errNoConnectedPeers ) {
r . Logger . Info ( "backfill: no connected peers to fetch light blocks from; sleeping..." ,
"sleepTime" , sleepTime )
time . Sleep ( sleepTime )
} else {
// we don't punish the peer as it might just have not responded in time
r . Logger . Info ( "backfill: error with fetching light block" ,
"height" , height , "err" , err )
}
continue
continue
}
}
if lb == nil {
if lb == nil {
r . Logger . Info ( "peer didn't have block, fetching from another peer" , "height" , height )
queue . retry ( height )
continue
}
if lb . Height != height {
r . Logger . Info ( "peer provided wrong height, retrying..." , "height" , height )
r . Logger . Info ( "backfill: peer didn't have block, fetching from another peer" , "height" , height )
queue . retry ( height )
queue . retry ( height )
// as we are fetching blocks backwards, if this node doesn't have the block it likely doesn't
// have any prior ones, thus we remove it from the peer list
r . dispatcher . removePeer ( peer )
continue
continue
}
}
// run a validate basic. This checks the validator set and commit
// run a validate basic. This checks the validator set and commit
// hashes line up
// hashes line up
err = lb . ValidateBasic ( chainID )
err = lb . ValidateBasic ( chainID )
if err != nil {
r . Logger . Info ( "fetched light block failed validate basic, removing peer..." , "err" , err )
if err != nil || lb . Height != height {
r . Logger . Info ( "backfill: fetched light block failed validate basic, removing peer..." ,
"err" , err , "height" , height )
queue . retry ( height )
queue . retry ( height )
r . blockCh . Error <- p2p . PeerError {
r . blockCh . Error <- p2p . PeerError {
NodeID : peer ,
NodeID : peer ,
@ -322,7 +347,7 @@ func (r *Reactor) Backfill(
block : lb ,
block : lb ,
peer : peer ,
peer : peer ,
} )
} )
r . Logger . Debug ( "added light block to processing queue" , "height" , height )
r . Logger . Debug ( "backfill: added light block to processing queue" , "height" , height )
case <- queue . done ( ) :
case <- queue . done ( ) :
return
return
@ -376,7 +401,7 @@ func (r *Reactor) Backfill(
trustedBlockID = resp . block . LastBlockID
trustedBlockID = resp . block . LastBlockID
queue . success ( resp . block . Height )
queue . success ( resp . block . Height )
r . Logger . Info ( "verified and stored light block" , "height" , resp . block . Height )
r . Logger . Info ( "backfill: verified and stored light block" , "height" , resp . block . Height )
lastValidatorSet = resp . block . ValidatorSet
lastValidatorSet = resp . block . ValidatorSet
@ -386,7 +411,12 @@ func (r *Reactor) Backfill(
}
}
// save the final batch of validators
// save the final batch of validators
return r . stateStore . SaveValidatorSets ( queue . terminal . Height , lastChangeHeight , lastValidatorSet )
if err := r . stateStore . SaveValidatorSets ( queue . terminal . Height , lastChangeHeight , lastValidatorSet ) ; err != nil {
return err
}
r . Logger . Info ( "successfully completed backfill process" , "endHeight" , queue . terminal . Height )
return nil
}
}
}
}
}
}
@ -777,24 +807,3 @@ func (r *Reactor) fetchLightBlock(height uint64) (*types.LightBlock, error) {
} , nil
} , nil
}
}
// backfill is a convenience wrapper around the backfill function. It takes
// state to work out how many prior blocks need to be verified
func ( r * Reactor ) backfill ( state sm . State ) error {
params := state . ConsensusParams . Evidence
stopHeight := state . LastBlockHeight - params . MaxAgeNumBlocks
stopTime := state . LastBlockTime . Add ( - params . MaxAgeDuration )
// ensure that stop height doesn't go below the initial height
if stopHeight < state . InitialHeight {
stopHeight = state . InitialHeight
// this essentially makes stop time a void criteria for termination
stopTime = state . LastBlockTime
}
return r . Backfill (
context . Background ( ) ,
state . ChainID ,
state . LastBlockHeight , stopHeight ,
state . LastBlockID ,
stopTime ,
)
}