You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

452 lines
13 KiB

blockchain: Reorg reactor (#3561) * go routines in blockchain reactor * Added reference to the go routine diagram * Initial commit * cleanup * Undo testing_logger change, committed by mistake * Fix the test loggers * pulled some fsm code into pool.go * added pool tests * changes to the design added block requests under peer moved the request trigger in the reactor poolRoutine, triggered now by a ticker in general moved everything required for making block requests smarter in the poolRoutine added a simple map of heights to keep track of what will need to be requested next added a few more tests * send errors to FSM in a different channel than blocks send errors (RemovePeer) from switch on a different channel than the one receiving blocks renamed channels added more pool tests * more pool tests * lint errors * more tests * more tests * switch fast sync to new implementation * fixed data race in tests * cleanup * finished fsm tests * address golangci comments :) * address golangci comments :) * Added timeout on next block needed to advance * updating docs and cleanup * fix issue in test from previous cleanup * cleanup * Added termination scenarios, tests and more cleanup * small fixes to adr, comments and cleanup * Fix bug in sendRequest() If we tried to send a request to a peer not present in the switch, a missing continue statement caused the request to be blackholed in a peer that was removed and never retried. While this bug was manifesting, the reactor kept asking for other blocks that would be stored and never consumed. Added the number of unconsumed blocks in the math for requesting blocks ahead of current processing height so eventually there will be no more blocks requested until the already received ones are consumed. 
* remove bpPeer's didTimeout field * Use distinct err codes for peer timeout and FSM timeouts * Don't allow peers to update with lower height * review comments from Ethan and Zarko * some cleanup, renaming, comments * Move block execution in separate goroutine * Remove pool's numPending * review comments * fix lint, remove old blockchain reactor and duplicates in fsm tests * small reorg around peer after review comments * add the reactor spec * verify block only once * review comments * change to int for max number of pending requests * cleanup and godoc * Add configuration flag fast sync version * golangci fixes * fix config template * move both reactor versions under blockchain * cleanup, golint, renaming stuff * updated documentation, fixed more golint warnings * integrate with behavior package * sync with master * gofmt * add changelog_pending entry * move to improvements * suggestion to changelog entry
5 years ago
blockchain: Reorg reactor (#3561) * go routines in blockchain reactor * Added reference to the go routine diagram * Initial commit * cleanup * Undo testing_logger change, committed by mistake * Fix the test loggers * pulled some fsm code into pool.go * added pool tests * changes to the design added block requests under peer moved the request trigger in the reactor poolRoutine, triggered now by a ticker in general moved everything required for making block requests smarter in the poolRoutine added a simple map of heights to keep track of what will need to be requested next added a few more tests * send errors to FSM in a different channel than blocks send errors (RemovePeer) from switch on a different channel than the one receiving blocks renamed channels added more pool tests * more pool tests * lint errors * more tests * more tests * switch fast sync to new implementation * fixed data race in tests * cleanup * finished fsm tests * address golangci comments :) * address golangci comments :) * Added timeout on next block needed to advance * updating docs and cleanup * fix issue in test from previous cleanup * cleanup * Added termination scenarios, tests and more cleanup * small fixes to adr, comments and cleanup * Fix bug in sendRequest() If we tried to send a request to a peer not present in the switch, a missing continue statement caused the request to be blackholed in a peer that was removed and never retried. While this bug was manifesting, the reactor kept asking for other blocks that would be stored and never consumed. Added the number of unconsumed blocks in the math for requesting blocks ahead of current processing height so eventually there will be no more blocks requested until the already received ones are consumed. 
* remove bpPeer's didTimeout field * Use distinct err codes for peer timeout and FSM timeouts * Don't allow peers to update with lower height * review comments from Ethan and Zarko * some cleanup, renaming, comments * Move block execution in separate goroutine * Remove pool's numPending * review comments * fix lint, remove old blockchain reactor and duplicates in fsm tests * small reorg around peer after review comments * add the reactor spec * verify block only once * review comments * change to int for max number of pending requests * cleanup and godoc * Add configuration flag fast sync version * golangci fixes * fix config template * move both reactor versions under blockchain * cleanup, golint, renaming stuff * updated documentation, fixed more golint warnings * integrate with behavior package * sync with master * gofmt * add changelog_pending entry * move to improvements * suggestion to changelog entry
5 years ago
  1. package v1
  2. import (
  3. "errors"
  4. "fmt"
  5. "sync"
  6. "time"
  7. "github.com/tendermint/tendermint/libs/log"
  8. "github.com/tendermint/tendermint/p2p"
  9. "github.com/tendermint/tendermint/types"
  10. )
  11. // Blockchain Reactor State
  12. type bcReactorFSMState struct {
  13. name string
  14. // called when transitioning out of current state
  15. handle func(*BcReactorFSM, bReactorEvent, bReactorEventData) (next *bcReactorFSMState, err error)
  16. // called when entering the state
  17. enter func(fsm *BcReactorFSM)
  18. // timeout to ensure FSM is not stuck in a state forever
  19. // the timer is owned and run by the fsm instance
  20. timeout time.Duration
  21. }
  22. func (s *bcReactorFSMState) String() string {
  23. return s.name
  24. }
  25. // BcReactorFSM is the datastructure for the Blockchain Reactor State Machine
  26. type BcReactorFSM struct {
  27. logger log.Logger
  28. mtx sync.Mutex
  29. startTime time.Time
  30. state *bcReactorFSMState
  31. stateTimer *time.Timer
  32. pool *BlockPool
  33. // interface used to call the Blockchain reactor to send StatusRequest, BlockRequest, reporting errors, etc.
  34. toBcR bcReactor
  35. }
  36. // NewFSM creates a new reactor FSM.
  37. func NewFSM(height int64, toBcR bcReactor) *BcReactorFSM {
  38. return &BcReactorFSM{
  39. state: unknown,
  40. startTime: time.Now(),
  41. pool: NewBlockPool(height, toBcR),
  42. toBcR: toBcR,
  43. }
  44. }
  45. // bReactorEventData is part of the message sent by the reactor to the FSM and used by the state handlers.
  46. type bReactorEventData struct {
  47. peerID p2p.ID
  48. err error // for peer error: timeout, slow; for processed block event if error occurred
  49. height int64 // for status response; for processed block event
  50. block *types.Block // for block response
  51. stateName string // for state timeout events
  52. length int // for block response event, length of received block, used to detect slow peers
  53. maxNumRequests int // for request needed event, maximum number of pending requests
  54. }
  55. // Blockchain Reactor Events (the input to the state machine)
  56. type bReactorEvent uint
  57. const (
  58. // message type events
  59. startFSMEv = iota + 1
  60. statusResponseEv
  61. blockResponseEv
  62. processedBlockEv
  63. makeRequestsEv
  64. stopFSMEv
  65. // other events
  66. peerRemoveEv = iota + 256
  67. stateTimeoutEv
  68. )
  69. func (msg *bcReactorMessage) String() string {
  70. var dataStr string
  71. switch msg.event {
  72. case startFSMEv:
  73. dataStr = ""
  74. case statusResponseEv:
  75. dataStr = fmt.Sprintf("peer=%v height=%v", msg.data.peerID, msg.data.height)
  76. case blockResponseEv:
  77. dataStr = fmt.Sprintf("peer=%v block.height=%v length=%v",
  78. msg.data.peerID, msg.data.block.Height, msg.data.length)
  79. case processedBlockEv:
  80. dataStr = fmt.Sprintf("error=%v", msg.data.err)
  81. case makeRequestsEv:
  82. dataStr = ""
  83. case stopFSMEv:
  84. dataStr = ""
  85. case peerRemoveEv:
  86. dataStr = fmt.Sprintf("peer: %v is being removed by the switch", msg.data.peerID)
  87. case stateTimeoutEv:
  88. dataStr = fmt.Sprintf("state=%v", msg.data.stateName)
  89. default:
  90. dataStr = fmt.Sprintf("cannot interpret message data")
  91. }
  92. return fmt.Sprintf("%v: %v", msg.event, dataStr)
  93. }
  94. func (ev bReactorEvent) String() string {
  95. switch ev {
  96. case startFSMEv:
  97. return "startFSMEv"
  98. case statusResponseEv:
  99. return "statusResponseEv"
  100. case blockResponseEv:
  101. return "blockResponseEv"
  102. case processedBlockEv:
  103. return "processedBlockEv"
  104. case makeRequestsEv:
  105. return "makeRequestsEv"
  106. case stopFSMEv:
  107. return "stopFSMEv"
  108. case peerRemoveEv:
  109. return "peerRemoveEv"
  110. case stateTimeoutEv:
  111. return "stateTimeoutEv"
  112. default:
  113. return "event unknown"
  114. }
  115. }
  116. // states
  117. var (
  118. unknown *bcReactorFSMState
  119. waitForPeer *bcReactorFSMState
  120. waitForBlock *bcReactorFSMState
  121. finished *bcReactorFSMState
  122. )
  123. // timeouts for state timers
  124. const (
  125. waitForPeerTimeout = 3 * time.Second
  126. waitForBlockAtCurrentHeightTimeout = 10 * time.Second
  127. )
  128. // errors
  129. var (
  130. // internal to the package
  131. errNoErrorFinished = errors.New("fast sync is finished")
  132. errInvalidEvent = errors.New("invalid event in current state")
  133. errMissingBlock = errors.New("missing blocks")
  134. errNilPeerForBlockRequest = errors.New("peer for block request does not exist in the switch")
  135. errSendQueueFull = errors.New("block request not made, send-queue is full")
  136. errPeerTooShort = errors.New("peer height too low, old peer removed/ new peer not added")
  137. errSwitchRemovesPeer = errors.New("switch is removing peer")
  138. errTimeoutEventWrongState = errors.New("timeout event for a state different than the current one")
  139. errNoTallerPeer = errors.New("fast sync timed out on waiting for a peer taller than this node")
  140. // reported eventually to the switch
  141. // handle return
  142. errPeerLowersItsHeight = errors.New("fast sync peer reports a height lower than previous")
  143. // handle return
  144. errNoPeerResponseForCurrentHeights = errors.New("fast sync timed out on peer block response for current heights")
  145. errNoPeerResponse = errors.New("fast sync timed out on peer block response") // xx
  146. errBadDataFromPeer = errors.New("fast sync received block from wrong peer or block is bad") // xx
  147. errDuplicateBlock = errors.New("fast sync received duplicate block from peer")
  148. errBlockVerificationFailure = errors.New("fast sync block verification failure") // xx
  149. errSlowPeer = errors.New("fast sync peer is not sending us data fast enough") // xx
  150. )
  151. func init() {
  152. unknown = &bcReactorFSMState{
  153. name: "unknown",
  154. handle: func(fsm *BcReactorFSM, ev bReactorEvent, data bReactorEventData) (*bcReactorFSMState, error) {
  155. switch ev {
  156. case startFSMEv:
  157. // Broadcast Status message. Currently doesn't return non-nil error.
  158. fsm.toBcR.sendStatusRequest()
  159. return waitForPeer, nil
  160. case stopFSMEv:
  161. return finished, errNoErrorFinished
  162. default:
  163. return unknown, errInvalidEvent
  164. }
  165. },
  166. }
  167. waitForPeer = &bcReactorFSMState{
  168. name: "waitForPeer",
  169. timeout: waitForPeerTimeout,
  170. enter: func(fsm *BcReactorFSM) {
  171. // Stop when leaving the state.
  172. fsm.resetStateTimer()
  173. },
  174. handle: func(fsm *BcReactorFSM, ev bReactorEvent, data bReactorEventData) (*bcReactorFSMState, error) {
  175. switch ev {
  176. case stateTimeoutEv:
  177. if data.stateName != "waitForPeer" {
  178. fsm.logger.Error("received a state timeout event for different state",
  179. "state", data.stateName)
  180. return waitForPeer, errTimeoutEventWrongState
  181. }
  182. // There was no statusResponse received from any peer.
  183. // Should we send status request again?
  184. return finished, errNoTallerPeer
  185. case statusResponseEv:
  186. if err := fsm.pool.UpdatePeer(data.peerID, data.height); err != nil {
  187. if fsm.pool.NumPeers() == 0 {
  188. return waitForPeer, err
  189. }
  190. }
  191. if fsm.stateTimer != nil {
  192. fsm.stateTimer.Stop()
  193. }
  194. return waitForBlock, nil
  195. case stopFSMEv:
  196. if fsm.stateTimer != nil {
  197. fsm.stateTimer.Stop()
  198. }
  199. return finished, errNoErrorFinished
  200. default:
  201. return waitForPeer, errInvalidEvent
  202. }
  203. },
  204. }
  205. waitForBlock = &bcReactorFSMState{
  206. name: "waitForBlock",
  207. timeout: waitForBlockAtCurrentHeightTimeout,
  208. enter: func(fsm *BcReactorFSM) {
  209. // Stop when leaving the state.
  210. fsm.resetStateTimer()
  211. },
  212. handle: func(fsm *BcReactorFSM, ev bReactorEvent, data bReactorEventData) (*bcReactorFSMState, error) {
  213. switch ev {
  214. case statusResponseEv:
  215. err := fsm.pool.UpdatePeer(data.peerID, data.height)
  216. if fsm.pool.NumPeers() == 0 {
  217. return waitForPeer, err
  218. }
  219. if fsm.pool.ReachedMaxHeight() {
  220. return finished, err
  221. }
  222. return waitForBlock, err
  223. case blockResponseEv:
  224. fsm.logger.Debug("blockResponseEv", "H", data.block.Height)
  225. err := fsm.pool.AddBlock(data.peerID, data.block, data.length)
  226. if err != nil {
  227. // A block was received that was unsolicited, from unexpected peer, or that we already have it.
  228. // Ignore block, remove peer and send error to switch.
  229. fsm.pool.RemovePeer(data.peerID, err)
  230. fsm.toBcR.sendPeerError(err, data.peerID)
  231. }
  232. if fsm.pool.NumPeers() == 0 {
  233. return waitForPeer, err
  234. }
  235. return waitForBlock, err
  236. case processedBlockEv:
  237. if data.err != nil {
  238. first, second, _ := fsm.pool.FirstTwoBlocksAndPeers()
  239. fsm.logger.Error("error processing block", "err", data.err,
  240. "first", first.block.Height, "second", second.block.Height)
  241. fsm.logger.Error("send peer error for", "peer", first.peer.ID)
  242. fsm.toBcR.sendPeerError(data.err, first.peer.ID)
  243. fsm.logger.Error("send peer error for", "peer", second.peer.ID)
  244. fsm.toBcR.sendPeerError(data.err, second.peer.ID)
  245. // Remove the first two blocks. This will also remove the peers
  246. fsm.pool.InvalidateFirstTwoBlocks(data.err)
  247. } else {
  248. fsm.pool.ProcessedCurrentHeightBlock()
  249. // Since we advanced one block reset the state timer
  250. fsm.resetStateTimer()
  251. }
  252. // Both cases above may result in achieving maximum height.
  253. if fsm.pool.ReachedMaxHeight() {
  254. return finished, nil
  255. }
  256. return waitForBlock, data.err
  257. case peerRemoveEv:
  258. // This event is sent by the switch to remove disconnected and errored peers.
  259. fsm.pool.RemovePeer(data.peerID, data.err)
  260. if fsm.pool.NumPeers() == 0 {
  261. return waitForPeer, nil
  262. }
  263. if fsm.pool.ReachedMaxHeight() {
  264. return finished, nil
  265. }
  266. return waitForBlock, nil
  267. case makeRequestsEv:
  268. fsm.makeNextRequests(data.maxNumRequests)
  269. return waitForBlock, nil
  270. case stateTimeoutEv:
  271. if data.stateName != "waitForBlock" {
  272. fsm.logger.Error("received a state timeout event for different state",
  273. "state", data.stateName)
  274. return waitForBlock, errTimeoutEventWrongState
  275. }
  276. // We haven't received the block at current height or height+1. Remove peer.
  277. fsm.pool.RemovePeerAtCurrentHeights(errNoPeerResponseForCurrentHeights)
  278. fsm.resetStateTimer()
  279. if fsm.pool.NumPeers() == 0 {
  280. return waitForPeer, errNoPeerResponseForCurrentHeights
  281. }
  282. if fsm.pool.ReachedMaxHeight() {
  283. return finished, nil
  284. }
  285. return waitForBlock, errNoPeerResponseForCurrentHeights
  286. case stopFSMEv:
  287. if fsm.stateTimer != nil {
  288. fsm.stateTimer.Stop()
  289. }
  290. return finished, errNoErrorFinished
  291. default:
  292. return waitForBlock, errInvalidEvent
  293. }
  294. },
  295. }
  296. finished = &bcReactorFSMState{
  297. name: "finished",
  298. enter: func(fsm *BcReactorFSM) {
  299. fsm.logger.Info("Time to switch to consensus reactor!", "height", fsm.pool.Height)
  300. fsm.toBcR.switchToConsensus()
  301. fsm.cleanup()
  302. },
  303. handle: func(fsm *BcReactorFSM, ev bReactorEvent, data bReactorEventData) (*bcReactorFSMState, error) {
  304. return finished, nil
  305. },
  306. }
  307. }
  308. // Interface used by FSM for sending Block and Status requests,
  309. // informing of peer errors and state timeouts
  310. // Implemented by BlockchainReactor and tests
  311. type bcReactor interface {
  312. sendStatusRequest()
  313. sendBlockRequest(peerID p2p.ID, height int64) error
  314. sendPeerError(err error, peerID p2p.ID)
  315. resetStateTimer(name string, timer **time.Timer, timeout time.Duration)
  316. switchToConsensus()
  317. }
  318. // SetLogger sets the FSM logger.
  319. func (fsm *BcReactorFSM) SetLogger(l log.Logger) {
  320. fsm.logger = l
  321. fsm.pool.SetLogger(l)
  322. }
  323. // Start starts the FSM.
  324. func (fsm *BcReactorFSM) Start() {
  325. _ = fsm.Handle(&bcReactorMessage{event: startFSMEv})
  326. }
  327. // Handle processes messages and events sent to the FSM.
  328. func (fsm *BcReactorFSM) Handle(msg *bcReactorMessage) error {
  329. fsm.mtx.Lock()
  330. defer fsm.mtx.Unlock()
  331. fsm.logger.Debug("FSM received", "event", msg, "state", fsm.state)
  332. if fsm.state == nil {
  333. fsm.state = unknown
  334. }
  335. next, err := fsm.state.handle(fsm, msg.event, msg.data)
  336. if err != nil {
  337. fsm.logger.Error("FSM event handler returned", "err", err,
  338. "state", fsm.state, "event", msg.event)
  339. }
  340. oldState := fsm.state.name
  341. fsm.transition(next)
  342. if oldState != fsm.state.name {
  343. fsm.logger.Info("FSM changed state", "new_state", fsm.state)
  344. }
  345. return err
  346. }
  347. func (fsm *BcReactorFSM) transition(next *bcReactorFSMState) {
  348. if next == nil {
  349. return
  350. }
  351. if fsm.state != next {
  352. fsm.state = next
  353. if next.enter != nil {
  354. next.enter(fsm)
  355. }
  356. }
  357. }
  358. // Called when entering an FSM state in order to detect lack of progress in the state machine.
  359. // Note the use of the 'bcr' interface to facilitate testing without timer expiring.
  360. func (fsm *BcReactorFSM) resetStateTimer() {
  361. fsm.toBcR.resetStateTimer(fsm.state.name, &fsm.stateTimer, fsm.state.timeout)
  362. }
  363. func (fsm *BcReactorFSM) isCaughtUp() bool {
  364. return fsm.state == finished
  365. }
  366. func (fsm *BcReactorFSM) makeNextRequests(maxNumRequests int) {
  367. fsm.pool.MakeNextRequests(maxNumRequests)
  368. }
  369. func (fsm *BcReactorFSM) cleanup() {
  370. fsm.pool.Cleanup()
  371. }
  372. // NeedsBlocks checks if more block requests are required.
  373. func (fsm *BcReactorFSM) NeedsBlocks() bool {
  374. fsm.mtx.Lock()
  375. defer fsm.mtx.Unlock()
  376. return fsm.state.name == "waitForBlock" && fsm.pool.NeedsBlocks()
  377. }
  378. // FirstTwoBlocks returns the two blocks at pool height and height+1
  379. func (fsm *BcReactorFSM) FirstTwoBlocks() (first, second *types.Block, err error) {
  380. fsm.mtx.Lock()
  381. defer fsm.mtx.Unlock()
  382. firstBP, secondBP, err := fsm.pool.FirstTwoBlocksAndPeers()
  383. if err == nil {
  384. first = firstBP.block
  385. second = secondBP.block
  386. }
  387. return
  388. }
  389. // Status returns the pool's height and the maximum peer height.
  390. func (fsm *BcReactorFSM) Status() (height, maxPeerHeight int64) {
  391. fsm.mtx.Lock()
  392. defer fsm.mtx.Unlock()
  393. return fsm.pool.Height, fsm.pool.MaxPeerHeight
  394. }