You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

594 lines
16 KiB

blockchain: Reorg reactor (#3561) * go routines in blockchain reactor * Added reference to the go routine diagram * Initial commit * cleanup * Undo testing_logger change, committed by mistake * Fix the test loggers * pulled some fsm code into pool.go * added pool tests * changes to the design added block requests under peer moved the request trigger in the reactor poolRoutine, triggered now by a ticker in general moved everything required for making block requests smarter in the poolRoutine added a simple map of heights to keep track of what will need to be requested next added a few more tests * send errors to FSM in a different channel than blocks send errors (RemovePeer) from switch on a different channel than the one receiving blocks renamed channels added more pool tests * more pool tests * lint errors * more tests * more tests * switch fast sync to new implementation * fixed data race in tests * cleanup * finished fsm tests * address golangci comments :) * address golangci comments :) * Added timeout on next block needed to advance * updating docs and cleanup * fix issue in test from previous cleanup * cleanup * Added termination scenarios, tests and more cleanup * small fixes to adr, comments and cleanup * Fix bug in sendRequest() If we tried to send a request to a peer not present in the switch, a missing continue statement caused the request to be blackholed in a peer that was removed and never retried. While this bug was manifesting, the reactor kept asking for other blocks that would be stored and never consumed. Added the number of unconsumed blocks in the math for requesting blocks ahead of current processing height so eventually there will be no more blocks requested until the already received ones are consumed. 
* remove bpPeer's didTimeout field * Use distinct err codes for peer timeout and FSM timeouts * Don't allow peers to update with lower height * review comments from Ethan and Zarko * some cleanup, renaming, comments * Move block execution in separate goroutine * Remove pool's numPending * review comments * fix lint, remove old blockchain reactor and duplicates in fsm tests * small reorg around peer after review comments * add the reactor spec * verify block only once * review comments * change to int for max number of pending requests * cleanup and godoc * Add configuration flag fast sync version * golangci fixes * fix config template * move both reactor versions under blockchain * cleanup, golint, renaming stuff * updated documentation, fixed more golint warnings * integrate with behavior package * sync with master * gofmt * add changelog_pending entry * move to improvments * suggestion to changelog entry
5 years ago
blockchain: Reorg reactor (#3561) * go routines in blockchain reactor * Added reference to the go routine diagram * Initial commit * cleanup * Undo testing_logger change, committed by mistake * Fix the test loggers * pulled some fsm code into pool.go * added pool tests * changes to the design added block requests under peer moved the request trigger in the reactor poolRoutine, triggered now by a ticker in general moved everything required for making block requests smarter in the poolRoutine added a simple map of heights to keep track of what will need to be requested next added a few more tests * send errors to FSM in a different channel than blocks send errors (RemovePeer) from switch on a different channel than the one receiving blocks renamed channels added more pool tests * more pool tests * lint errors * more tests * more tests * switch fast sync to new implementation * fixed data race in tests * cleanup * finished fsm tests * address golangci comments :) * address golangci comments :) * Added timeout on next block needed to advance * updating docs and cleanup * fix issue in test from previous cleanup * cleanup * Added termination scenarios, tests and more cleanup * small fixes to adr, comments and cleanup * Fix bug in sendRequest() If we tried to send a request to a peer not present in the switch, a missing continue statement caused the request to be blackholed in a peer that was removed and never retried. While this bug was manifesting, the reactor kept asking for other blocks that would be stored and never consumed. Added the number of unconsumed blocks in the math for requesting blocks ahead of current processing height so eventually there will be no more blocks requested until the already received ones are consumed. 
* remove bpPeer's didTimeout field * Use distinct err codes for peer timeout and FSM timeouts * Don't allow peers to update with lower height * review comments from Ethan and Zarko * some cleanup, renaming, comments * Move block execution in separate goroutine * Remove pool's numPending * review comments * fix lint, remove old blockchain reactor and duplicates in fsm tests * small reorg around peer after review comments * add the reactor spec * verify block only once * review comments * change to int for max number of pending requests * cleanup and godoc * Add configuration flag fast sync version * golangci fixes * fix config template * move both reactor versions under blockchain * cleanup, golint, renaming stuff * updated documentation, fixed more golint warnings * integrate with behavior package * sync with master * gofmt * add changelog_pending entry * move to improvments * suggestion to changelog entry
5 years ago
  1. package v0
  2. import (
  3. "fmt"
  4. "sync"
  5. "time"
  6. bc "github.com/tendermint/tendermint/internal/blockchain"
  7. cons "github.com/tendermint/tendermint/internal/consensus"
  8. "github.com/tendermint/tendermint/internal/p2p"
  9. "github.com/tendermint/tendermint/libs/log"
  10. "github.com/tendermint/tendermint/libs/service"
  11. bcproto "github.com/tendermint/tendermint/proto/tendermint/blockchain"
  12. sm "github.com/tendermint/tendermint/state"
  13. "github.com/tendermint/tendermint/store"
  14. "github.com/tendermint/tendermint/types"
  15. )
var (
	// Compile-time check that *Reactor satisfies service.Service.
	_ service.Service = (*Reactor)(nil)

	// ChannelShims contains a map of ChannelDescriptorShim objects, where each
	// object wraps a reference to a legacy p2p ChannelDescriptor and the corresponding
	// p2p proto.Message the new p2p Channel is responsible for handling.
	//
	// TODO: Remove once p2p refactor is complete.
	// ref: https://github.com/tendermint/tendermint/issues/5670
	ChannelShims = map[p2p.ChannelID]*p2p.ChannelDescriptorShim{
		BlockchainChannel: {
			MsgType: new(bcproto.Message),
			Descriptor: &p2p.ChannelDescriptor{
				ID:                  byte(BlockchainChannel),
				Priority:            5,
				SendQueueCapacity:   1000,
				RecvBufferCapacity:  1024,
				RecvMessageCapacity: bc.MaxMsgSize,
				MaxSendBytes:        100,
			},
		},
	}
)
const (
	// BlockchainChannel is a channel for blocks and status updates
	BlockchainChannel = p2p.ChannelID(0x40)

	// period, in milliseconds, of the ticker that triggers an attempt to
	// process the next synced block
	trySyncIntervalMS = 10

	// ask for best height every 10s
	statusUpdateIntervalSeconds = 10

	// check if we should switch to consensus reactor
	switchToConsensusIntervalSeconds = 1

	// switch to consensus after this duration of inactivity
	syncTimeout = 60 * time.Second
)
// consensusReactor is the subset of the consensus reactor that the blockchain
// reactor needs in order to hand over control once fast sync is finished.
type consensusReactor interface {
	// For when we switch from blockchain reactor and fast sync to the consensus
	// machine.
	SwitchToConsensus(state sm.State, skipWAL bool)
}
// peerError pairs an error with the ID of the peer it is attributed to.
type peerError struct {
	err    error        // underlying error
	peerID types.NodeID // peer the error is attributed to
}
  59. func (e peerError) Error() string {
  60. return fmt.Sprintf("error with peer %v: %s", e.peerID, e.err.Error())
  61. }
// Reactor handles long-term catchup syncing.
type Reactor struct {
	service.BaseService

	// Set once in NewReactor.
	// NOTE(review): originally labelled "immutable", but initialState and
	// fastSync are reassigned by SwitchToFastSync.
	initialState sm.State
	blockExec    *sm.BlockExecutor
	store        *store.BlockStore
	pool         *BlockPool
	consReactor  consensusReactor
	fastSync     bool

	blockchainCh *p2p.Channel
	peerUpdates  *p2p.PeerUpdates
	// peerUpdatesCh carries envelopes produced while handling peer updates to
	// processBlockchainCh, which forwards them on blockchainCh.Out.
	peerUpdatesCh chan p2p.Envelope
	// closeCh is closed by OnStop to tell the spawned goroutines to exit.
	closeCh chan struct{}

	// requestsCh and errorsCh are the receive ends of the channels handed to
	// the BlockPool in NewReactor; requestRoutine drains both.
	requestsCh <-chan BlockRequest
	errorsCh   <-chan peerError

	// poolWG is used to synchronize the graceful shutdown of the poolRoutine and
	// requestRoutine spawned goroutines when stopping the reactor and before
	// stopping the p2p Channel(s).
	poolWG sync.WaitGroup

	metrics *cons.Metrics
}
  84. // NewReactor returns new reactor instance.
  85. func NewReactor(
  86. logger log.Logger,
  87. state sm.State,
  88. blockExec *sm.BlockExecutor,
  89. store *store.BlockStore,
  90. consReactor consensusReactor,
  91. blockchainCh *p2p.Channel,
  92. peerUpdates *p2p.PeerUpdates,
  93. fastSync bool,
  94. metrics *cons.Metrics,
  95. ) (*Reactor, error) {
  96. if state.LastBlockHeight != store.Height() {
  97. return nil, fmt.Errorf("state (%v) and store (%v) height mismatch", state.LastBlockHeight, store.Height())
  98. }
  99. startHeight := store.Height() + 1
  100. if startHeight == 1 {
  101. startHeight = state.InitialHeight
  102. }
  103. requestsCh := make(chan BlockRequest, maxTotalRequesters)
  104. errorsCh := make(chan peerError, maxPeerErrBuffer) // NOTE: The capacity should be larger than the peer count.
  105. r := &Reactor{
  106. initialState: state,
  107. blockExec: blockExec,
  108. store: store,
  109. pool: NewBlockPool(startHeight, requestsCh, errorsCh),
  110. consReactor: consReactor,
  111. fastSync: fastSync,
  112. requestsCh: requestsCh,
  113. errorsCh: errorsCh,
  114. blockchainCh: blockchainCh,
  115. peerUpdates: peerUpdates,
  116. peerUpdatesCh: make(chan p2p.Envelope),
  117. closeCh: make(chan struct{}),
  118. metrics: metrics,
  119. }
  120. r.BaseService = *service.NewBaseService(logger, "Blockchain", r)
  121. return r, nil
  122. }
  123. // OnStart starts separate go routines for each p2p Channel and listens for
  124. // envelopes on each. In addition, it also listens for peer updates and handles
  125. // messages on that p2p channel accordingly. The caller must be sure to execute
  126. // OnStop to ensure the outbound p2p Channels are closed.
  127. //
  128. // If fastSync is enabled, we also start the pool and the pool processing
  129. // goroutine. If the pool fails to start, an error is returned.
  130. func (r *Reactor) OnStart() error {
  131. if r.fastSync {
  132. if err := r.pool.Start(); err != nil {
  133. return err
  134. }
  135. r.poolWG.Add(1)
  136. go r.poolRoutine(false)
  137. }
  138. go r.processBlockchainCh()
  139. go r.processPeerUpdates()
  140. return nil
  141. }
  142. // OnStop stops the reactor by signaling to all spawned goroutines to exit and
  143. // blocking until they all exit.
  144. func (r *Reactor) OnStop() {
  145. if r.fastSync {
  146. if err := r.pool.Stop(); err != nil {
  147. r.Logger.Error("failed to stop pool", "err", err)
  148. }
  149. }
  150. // wait for the poolRoutine and requestRoutine goroutines to gracefully exit
  151. r.poolWG.Wait()
  152. // Close closeCh to signal to all spawned goroutines to gracefully exit. All
  153. // p2p Channels should execute Close().
  154. close(r.closeCh)
  155. // Wait for all p2p Channels to be closed before returning. This ensures we
  156. // can easily reason about synchronization of all p2p Channels and ensure no
  157. // panics will occur.
  158. <-r.blockchainCh.Done()
  159. <-r.peerUpdates.Done()
  160. }
  161. // respondToPeer loads a block and sends it to the requesting peer, if we have it.
  162. // Otherwise, we'll respond saying we do not have it.
  163. func (r *Reactor) respondToPeer(msg *bcproto.BlockRequest, peerID types.NodeID) {
  164. block := r.store.LoadBlock(msg.Height)
  165. if block != nil {
  166. blockProto, err := block.ToProto()
  167. if err != nil {
  168. r.Logger.Error("failed to convert msg to protobuf", "err", err)
  169. return
  170. }
  171. r.blockchainCh.Out <- p2p.Envelope{
  172. To: peerID,
  173. Message: &bcproto.BlockResponse{Block: blockProto},
  174. }
  175. return
  176. }
  177. r.Logger.Info("peer requesting a block we do not have", "peer", peerID, "height", msg.Height)
  178. r.blockchainCh.Out <- p2p.Envelope{
  179. To: peerID,
  180. Message: &bcproto.NoBlockResponse{Height: msg.Height},
  181. }
  182. }
  183. // handleBlockchainMessage handles envelopes sent from peers on the
  184. // BlockchainChannel. It returns an error only if the Envelope.Message is unknown
  185. // for this channel. This should never be called outside of handleMessage.
  186. func (r *Reactor) handleBlockchainMessage(envelope p2p.Envelope) error {
  187. logger := r.Logger.With("peer", envelope.From)
  188. switch msg := envelope.Message.(type) {
  189. case *bcproto.BlockRequest:
  190. r.respondToPeer(msg, envelope.From)
  191. case *bcproto.BlockResponse:
  192. block, err := types.BlockFromProto(msg.Block)
  193. if err != nil {
  194. logger.Error("failed to convert block from proto", "err", err)
  195. return err
  196. }
  197. r.pool.AddBlock(envelope.From, block, block.Size())
  198. case *bcproto.StatusRequest:
  199. r.blockchainCh.Out <- p2p.Envelope{
  200. To: envelope.From,
  201. Message: &bcproto.StatusResponse{
  202. Height: r.store.Height(),
  203. Base: r.store.Base(),
  204. },
  205. }
  206. case *bcproto.StatusResponse:
  207. r.pool.SetPeerRange(envelope.From, msg.Base, msg.Height)
  208. case *bcproto.NoBlockResponse:
  209. logger.Debug("peer does not have the requested block", "height", msg.Height)
  210. default:
  211. return fmt.Errorf("received unknown message: %T", msg)
  212. }
  213. return nil
  214. }
  215. // handleMessage handles an Envelope sent from a peer on a specific p2p Channel.
  216. // It will handle errors and any possible panics gracefully. A caller can handle
  217. // any error returned by sending a PeerError on the respective channel.
  218. func (r *Reactor) handleMessage(chID p2p.ChannelID, envelope p2p.Envelope) (err error) {
  219. defer func() {
  220. if e := recover(); e != nil {
  221. err = fmt.Errorf("panic in processing message: %v", e)
  222. r.Logger.Error("recovering from processing message panic", "err", err)
  223. }
  224. }()
  225. r.Logger.Debug("received message", "message", envelope.Message, "peer", envelope.From)
  226. switch chID {
  227. case BlockchainChannel:
  228. err = r.handleBlockchainMessage(envelope)
  229. default:
  230. err = fmt.Errorf("unknown channel ID (%d) for envelope (%v)", chID, envelope)
  231. }
  232. return err
  233. }
  234. // processBlockchainCh initiates a blocking process where we listen for and handle
  235. // envelopes on the BlockchainChannel and peerUpdatesCh. Any error encountered during
  236. // message execution will result in a PeerError being sent on the BlockchainChannel.
  237. // When the reactor is stopped, we will catch the signal and close the p2p Channel
  238. // gracefully.
  239. func (r *Reactor) processBlockchainCh() {
  240. defer r.blockchainCh.Close()
  241. for {
  242. select {
  243. case envelope := <-r.blockchainCh.In:
  244. if err := r.handleMessage(r.blockchainCh.ID, envelope); err != nil {
  245. r.Logger.Error("failed to process message", "ch_id", r.blockchainCh.ID, "envelope", envelope, "err", err)
  246. r.blockchainCh.Error <- p2p.PeerError{
  247. NodeID: envelope.From,
  248. Err: err,
  249. }
  250. }
  251. case envelop := <-r.peerUpdatesCh:
  252. r.blockchainCh.Out <- envelop
  253. case <-r.closeCh:
  254. r.Logger.Debug("stopped listening on blockchain channel; closing...")
  255. return
  256. }
  257. }
  258. }
  259. // processPeerUpdate processes a PeerUpdate.
  260. func (r *Reactor) processPeerUpdate(peerUpdate p2p.PeerUpdate) {
  261. r.Logger.Debug("received peer update", "peer", peerUpdate.NodeID, "status", peerUpdate.Status)
  262. // XXX: Pool#RedoRequest can sometimes give us an empty peer.
  263. if len(peerUpdate.NodeID) == 0 {
  264. return
  265. }
  266. switch peerUpdate.Status {
  267. case p2p.PeerStatusUp:
  268. // send a status update the newly added peer
  269. r.peerUpdatesCh <- p2p.Envelope{
  270. To: peerUpdate.NodeID,
  271. Message: &bcproto.StatusResponse{
  272. Base: r.store.Base(),
  273. Height: r.store.Height(),
  274. },
  275. }
  276. case p2p.PeerStatusDown:
  277. r.pool.RemovePeer(peerUpdate.NodeID)
  278. }
  279. }
  280. // processPeerUpdates initiates a blocking process where we listen for and handle
  281. // PeerUpdate messages. When the reactor is stopped, we will catch the signal and
  282. // close the p2p PeerUpdatesCh gracefully.
  283. func (r *Reactor) processPeerUpdates() {
  284. defer r.peerUpdates.Close()
  285. for {
  286. select {
  287. case peerUpdate := <-r.peerUpdates.Updates():
  288. r.processPeerUpdate(peerUpdate)
  289. case <-r.closeCh:
  290. r.Logger.Debug("stopped listening on peer updates channel; closing...")
  291. return
  292. }
  293. }
  294. }
  295. // SwitchToFastSync is called by the state sync reactor when switching to fast
  296. // sync.
  297. func (r *Reactor) SwitchToFastSync(state sm.State) error {
  298. r.fastSync = true
  299. r.initialState = state
  300. r.pool.height = state.LastBlockHeight + 1
  301. if err := r.pool.Start(); err != nil {
  302. return err
  303. }
  304. r.poolWG.Add(1)
  305. go r.poolRoutine(true)
  306. return nil
  307. }
  308. func (r *Reactor) requestRoutine() {
  309. statusUpdateTicker := time.NewTicker(statusUpdateIntervalSeconds * time.Second)
  310. defer statusUpdateTicker.Stop()
  311. r.poolWG.Add(1)
  312. defer r.poolWG.Done()
  313. for {
  314. select {
  315. case <-r.closeCh:
  316. return
  317. case <-r.pool.Quit():
  318. return
  319. case request := <-r.requestsCh:
  320. r.blockchainCh.Out <- p2p.Envelope{
  321. To: request.PeerID,
  322. Message: &bcproto.BlockRequest{Height: request.Height},
  323. }
  324. case pErr := <-r.errorsCh:
  325. r.blockchainCh.Error <- p2p.PeerError{
  326. NodeID: pErr.peerID,
  327. Err: pErr.err,
  328. }
  329. case <-statusUpdateTicker.C:
  330. r.poolWG.Add(1)
  331. go func() {
  332. defer r.poolWG.Done()
  333. r.blockchainCh.Out <- p2p.Envelope{
  334. Broadcast: true,
  335. Message: &bcproto.StatusRequest{},
  336. }
  337. }()
  338. }
  339. }
  340. }
  341. // poolRoutine handles messages from the poolReactor telling the reactor what to
  342. // do.
  343. //
  344. // NOTE: Don't sleep in the FOR_LOOP or otherwise slow it down!
  345. func (r *Reactor) poolRoutine(stateSynced bool) {
  346. var (
  347. trySyncTicker = time.NewTicker(trySyncIntervalMS * time.Millisecond)
  348. switchToConsensusTicker = time.NewTicker(switchToConsensusIntervalSeconds * time.Second)
  349. blocksSynced = uint64(0)
  350. chainID = r.initialState.ChainID
  351. state = r.initialState
  352. lastHundred = time.Now()
  353. lastRate = 0.0
  354. didProcessCh = make(chan struct{}, 1)
  355. )
  356. defer trySyncTicker.Stop()
  357. defer switchToConsensusTicker.Stop()
  358. go r.requestRoutine()
  359. defer r.poolWG.Done()
  360. FOR_LOOP:
  361. for {
  362. select {
  363. case <-switchToConsensusTicker.C:
  364. var (
  365. height, numPending, lenRequesters = r.pool.GetStatus()
  366. lastAdvance = r.pool.LastAdvance()
  367. )
  368. r.Logger.Debug(
  369. "consensus ticker",
  370. "num_pending", numPending,
  371. "total", lenRequesters,
  372. "height", height,
  373. )
  374. switch {
  375. case r.pool.IsCaughtUp():
  376. r.Logger.Info("switching to consensus reactor", "height", height)
  377. case time.Since(lastAdvance) > syncTimeout:
  378. r.Logger.Error("no progress since last advance", "last_advance", lastAdvance)
  379. default:
  380. r.Logger.Info(
  381. "not caught up yet",
  382. "height", height,
  383. "max_peer_height", r.pool.MaxPeerHeight(),
  384. "timeout_in", syncTimeout-time.Since(lastAdvance),
  385. )
  386. continue
  387. }
  388. if err := r.pool.Stop(); err != nil {
  389. r.Logger.Error("failed to stop pool", "err", err)
  390. }
  391. if r.consReactor != nil {
  392. r.consReactor.SwitchToConsensus(state, blocksSynced > 0 || stateSynced)
  393. }
  394. break FOR_LOOP
  395. case <-trySyncTicker.C:
  396. select {
  397. case didProcessCh <- struct{}{}:
  398. default:
  399. }
  400. case <-didProcessCh:
  401. // NOTE: It is a subtle mistake to process more than a single block at a
  402. // time (e.g. 10) here, because we only send one BlockRequest per loop
  403. // iteration. The ratio mismatch can result in starving of blocks, i.e. a
  404. // sudden burst of requests and responses, and repeat. Consequently, it is
  405. // better to split these routines rather than coupling them as it is
  406. // written here.
  407. //
  408. // TODO: Uncouple from request routine.
  409. // see if there are any blocks to sync
  410. first, second := r.pool.PeekTwoBlocks()
  411. if first == nil || second == nil {
  412. // we need both to sync the first block
  413. continue FOR_LOOP
  414. } else {
  415. // try again quickly next loop
  416. didProcessCh <- struct{}{}
  417. }
  418. var (
  419. firstParts = first.MakePartSet(types.BlockPartSizeBytes)
  420. firstPartSetHeader = firstParts.Header()
  421. firstID = types.BlockID{Hash: first.Hash(), PartSetHeader: firstPartSetHeader}
  422. )
  423. // Finally, verify the first block using the second's commit.
  424. //
  425. // NOTE: We can probably make this more efficient, but note that calling
  426. // first.Hash() doesn't verify the tx contents, so MakePartSet() is
  427. // currently necessary.
  428. err := state.Validators.VerifyCommitLight(chainID, firstID, first.Height, second.LastCommit)
  429. if err != nil {
  430. err = fmt.Errorf("invalid last commit: %w", err)
  431. r.Logger.Error(
  432. err.Error(),
  433. "last_commit", second.LastCommit,
  434. "block_id", firstID,
  435. "height", first.Height,
  436. )
  437. // NOTE: We've already removed the peer's request, but we still need
  438. // to clean up the rest.
  439. peerID := r.pool.RedoRequest(first.Height)
  440. r.blockchainCh.Error <- p2p.PeerError{
  441. NodeID: peerID,
  442. Err: err,
  443. }
  444. peerID2 := r.pool.RedoRequest(second.Height)
  445. if peerID2 != peerID {
  446. r.blockchainCh.Error <- p2p.PeerError{
  447. NodeID: peerID2,
  448. Err: err,
  449. }
  450. }
  451. continue FOR_LOOP
  452. } else {
  453. r.pool.PopRequest()
  454. // TODO: batch saves so we do not persist to disk every block
  455. r.store.SaveBlock(first, firstParts, second.LastCommit)
  456. var err error
  457. // TODO: Same thing for app - but we would need a way to get the hash
  458. // without persisting the state.
  459. state, err = r.blockExec.ApplyBlock(state, firstID, first)
  460. if err != nil {
  461. // TODO: This is bad, are we zombie?
  462. panic(fmt.Sprintf("failed to process committed block (%d:%X): %v", first.Height, first.Hash(), err))
  463. }
  464. r.metrics.RecordConsMetrics(first)
  465. blocksSynced++
  466. if blocksSynced%100 == 0 {
  467. lastRate = 0.9*lastRate + 0.1*(100/time.Since(lastHundred).Seconds())
  468. r.Logger.Info(
  469. "fast sync rate",
  470. "height", r.pool.height,
  471. "max_peer_height", r.pool.MaxPeerHeight(),
  472. "blocks/s", lastRate,
  473. )
  474. lastHundred = time.Now()
  475. }
  476. }
  477. continue FOR_LOOP
  478. case <-r.closeCh:
  479. break FOR_LOOP
  480. }
  481. }
  482. }
  483. func (r *Reactor) GetMaxPeerBlockHeight() int64 {
  484. return r.pool.MaxPeerHeight()
  485. }