You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

568 lines
17 KiB

  1. package p2p
  2. import (
  3. "context"
  4. "errors"
  5. "fmt"
  6. "io"
  7. "sync"
  8. "time"
  9. "github.com/gogo/protobuf/proto"
  10. "github.com/tendermint/tendermint/libs/log"
  11. "github.com/tendermint/tendermint/libs/service"
  12. )
  13. // Router manages peer connections and routes messages between peers and reactor
  14. // channels. This is an early prototype.
  15. //
  16. // Channels are registered via OpenChannel(). When called, we register an input
  17. // message queue for the channel in channelQueues and spawn off a goroutine for
  18. // Router.routeChannel(). This goroutine reads off outbound messages and puts
  19. // them in the appropriate peer message queue, and processes peer errors which
  20. // will close (and thus disconnect) the appropriate peer queue. It runs until
  21. // either the channel is closed by the caller or the router is stopped, at which
  22. // point the input message queue is closed and removed.
  23. //
  24. // On startup, the router spawns off two primary goroutines that maintain
  25. // connections to peers and run for the lifetime of the router:
  26. //
  27. // Router.dialPeers(): in a loop, asks the peerStore to dispense an
  28. // eligible peer to connect to, and attempts to resolve and dial each
  29. // address until successful.
  30. //
  31. // Router.acceptPeers(): in a loop, waits for the next inbound connection
  32. // from a peer, and attempts to claim it in the peerStore.
  33. //
  34. // Once either an inbound or outbound connection has been made, an outbound
  35. // message queue is registered in Router.peerQueues and a goroutine is spawned
  36. // off for Router.routePeer() which will spawn off additional goroutines for
  37. // Router.sendPeer() that sends outbound messages from the peer queue over the
  38. // connection and for Router.receivePeer() that reads inbound messages from
  39. // the connection and places them in the appropriate channel queue. When either
  40. // goroutine exits, the connection and peer queue are closed, which will cause
  41. // the other goroutines to close as well.
  42. //
  43. // The peerStore is used to coordinate peer connections, by only allowing a peer
  44. // to be claimed (owned) by a single caller at a time (both for outbound and
  45. // inbound connections). This is done either via peerStore.Dispense() which
  46. // dispenses and claims an eligible peer to dial, or via peerStore.Claim() which
  47. // attempts to claim a given peer for an inbound connection. Peers must be
  48. // returned to the peerStore with peerStore.Return() to release the claim. Over
  49. // time, the peerStore will also do peer scheduling and prioritization, e.g.
  50. // ensuring we do exponential backoff on dial failures and connecting to
  51. // more important peers first (such as persistent peers and validators).
  52. //
  53. // An additional goroutine Router.broadcastPeerUpdates() is also spawned off
  54. // on startup, which consumes peer updates from Router.peerUpdatesCh (currently
  55. // only connections and disconnections), and broadcasts them to all peer update
  56. // subscriptions registered via SubscribePeerUpdates().
  57. //
  58. // On router shutdown, we close Router.stopCh which will signal to all
  59. // goroutines to terminate. This in turn will cause all pending channel/peer
  60. // queues to close, and we wait for this as a signal that goroutines have ended.
  61. //
  62. // All message scheduling should be limited to the queue implementations used
  63. // for channel queues and peer queues. All message sending throughout the router
  64. // is blocking, and if any messages should be dropped or buffered this is the
  65. // sole responsibility of the queue, such that we can limit this logic to a
  66. // single place. There is currently only a FIFO queue implementation that always
  67. // blocks and never drops messages, but this must be improved with other
  68. // implementations. The only exception is that all message sending must also
  69. // select on appropriate channel/queue/router closure signals, to avoid blocking
  70. // forever on a channel that has no consumer.
  71. type Router struct {
  72. *service.BaseService
  73. logger log.Logger
  74. transports map[Protocol]Transport
  75. store *peerStore
  76. // FIXME: Consider using sync.Map.
  77. peerMtx sync.RWMutex
  78. peerQueues map[NodeID]queue
  79. // FIXME: We don't strictly need to use a mutex for this if we seal the
  80. // channels on router start. This depends on whether we want to allow
  81. // dynamic channels in the future.
  82. channelMtx sync.RWMutex
  83. channelQueues map[ChannelID]queue
  84. channelMessages map[ChannelID]proto.Message
  85. peerUpdatesCh chan PeerUpdate
  86. peerUpdatesMtx sync.RWMutex
  87. peerUpdatesSubs map[*PeerUpdatesCh]*PeerUpdatesCh // keyed by struct identity (address)
  88. // stopCh is used to signal router shutdown, by closing the channel.
  89. stopCh chan struct{}
  90. }
  91. // NewRouter creates a new Router, dialing the given peers.
  92. //
  93. // FIXME: providing protocol/transport maps is cumbersome in tests, we should
  94. // consider adding Protocols() to the Transport interface instead and register
  95. // protocol/transport mappings automatically on a first-come basis.
  96. func NewRouter(logger log.Logger, transports map[Protocol]Transport, peers []PeerAddress) *Router {
  97. router := &Router{
  98. logger: logger,
  99. transports: transports,
  100. store: newPeerStore(),
  101. stopCh: make(chan struct{}),
  102. channelQueues: map[ChannelID]queue{},
  103. channelMessages: map[ChannelID]proto.Message{},
  104. peerQueues: map[NodeID]queue{},
  105. peerUpdatesCh: make(chan PeerUpdate),
  106. peerUpdatesSubs: map[*PeerUpdatesCh]*PeerUpdatesCh{},
  107. }
  108. router.BaseService = service.NewBaseService(logger, "router", router)
  109. for _, address := range peers {
  110. if err := router.store.Add(address); err != nil {
  111. logger.Error("failed to add peer", "address", address, "err", err)
  112. }
  113. }
  114. return router
  115. }
  116. // OpenChannel opens a new channel for the given message type. The caller must
  117. // close the channel when done, and this must happen before the router stops.
  118. func (r *Router) OpenChannel(id ChannelID, messageType proto.Message) (*Channel, error) {
  119. // FIXME: NewChannel should take directional channels so we can pass
  120. // queue.dequeue() instead of reaching inside for queue.queueCh.
  121. queue := newFIFOQueue()
  122. channel := NewChannel(id, messageType, queue.queueCh, make(chan Envelope), make(chan PeerError))
  123. r.channelMtx.Lock()
  124. defer r.channelMtx.Unlock()
  125. if _, ok := r.channelQueues[id]; ok {
  126. return nil, fmt.Errorf("channel %v already exists", id)
  127. }
  128. r.channelQueues[id] = queue
  129. r.channelMessages[id] = messageType
  130. go func() {
  131. defer func() {
  132. r.channelMtx.Lock()
  133. delete(r.channelQueues, id)
  134. delete(r.channelMessages, id)
  135. r.channelMtx.Unlock()
  136. queue.close()
  137. }()
  138. r.routeChannel(channel)
  139. }()
  140. return channel, nil
  141. }
  142. // routeChannel receives outbound messages and errors from a channel and routes
  143. // them to the appropriate peer. It returns when either the channel is closed or
  144. // the router is shutting down.
  145. func (r *Router) routeChannel(channel *Channel) {
  146. for {
  147. select {
  148. case envelope, ok := <-channel.outCh:
  149. if !ok {
  150. return
  151. }
  152. // FIXME: This is a bit unergonomic, maybe it'd be better for Wrap()
  153. // to return a wrapped copy.
  154. if _, ok := channel.messageType.(Wrapper); ok {
  155. wrapper := proto.Clone(channel.messageType)
  156. if err := wrapper.(Wrapper).Wrap(envelope.Message); err != nil {
  157. r.Logger.Error("failed to wrap message", "err", err)
  158. continue
  159. }
  160. envelope.Message = wrapper
  161. }
  162. envelope.channelID = channel.id
  163. if envelope.Broadcast {
  164. r.peerMtx.RLock()
  165. peerQueues := make(map[NodeID]queue, len(r.peerQueues))
  166. for peerID, peerQueue := range r.peerQueues {
  167. peerQueues[peerID] = peerQueue
  168. }
  169. r.peerMtx.RUnlock()
  170. for peerID, peerQueue := range peerQueues {
  171. e := envelope
  172. e.Broadcast = false
  173. e.To = peerID
  174. select {
  175. case peerQueue.enqueue() <- e:
  176. case <-peerQueue.closed():
  177. case <-r.stopCh:
  178. return
  179. }
  180. }
  181. } else {
  182. r.peerMtx.RLock()
  183. peerQueue, ok := r.peerQueues[envelope.To]
  184. r.peerMtx.RUnlock()
  185. if !ok {
  186. r.logger.Error("dropping message for non-connected peer",
  187. "peer", envelope.To, "channel", channel.id)
  188. continue
  189. }
  190. select {
  191. case peerQueue.enqueue() <- envelope:
  192. case <-peerQueue.closed():
  193. r.logger.Error("dropping message for non-connected peer",
  194. "peer", envelope.To, "channel", channel.id)
  195. case <-r.stopCh:
  196. return
  197. }
  198. }
  199. case peerError, ok := <-channel.errCh:
  200. if !ok {
  201. return
  202. }
  203. // FIXME: We just disconnect the peer for now
  204. r.logger.Error("peer error, disconnecting", "peer", peerError.PeerID, "err", peerError.Err)
  205. r.peerMtx.RLock()
  206. peerQueue, ok := r.peerQueues[peerError.PeerID]
  207. r.peerMtx.RUnlock()
  208. if ok {
  209. peerQueue.close()
  210. }
  211. case <-channel.Done():
  212. return
  213. case <-r.stopCh:
  214. return
  215. }
  216. }
  217. }
  218. // acceptPeers accepts inbound connections from peers on the given transport.
  219. func (r *Router) acceptPeers(transport Transport) {
  220. for {
  221. select {
  222. case <-r.stopCh:
  223. return
  224. default:
  225. }
  226. conn, err := transport.Accept(context.Background())
  227. switch err {
  228. case nil:
  229. case ErrTransportClosed{}, io.EOF:
  230. r.logger.Info("transport closed; stopping accept routine", "transport", transport)
  231. return
  232. default:
  233. r.logger.Error("failed to accept connection", "transport", transport, "err", err)
  234. continue
  235. }
  236. peerID := conn.NodeInfo().NodeID
  237. if r.store.Claim(peerID) == nil {
  238. r.logger.Error("already connected to peer, rejecting connection", "peer", peerID)
  239. _ = conn.Close()
  240. continue
  241. }
  242. queue := newFIFOQueue()
  243. r.peerMtx.Lock()
  244. r.peerQueues[peerID] = queue
  245. r.peerMtx.Unlock()
  246. go func() {
  247. defer func() {
  248. r.peerMtx.Lock()
  249. delete(r.peerQueues, peerID)
  250. r.peerMtx.Unlock()
  251. queue.close()
  252. _ = conn.Close()
  253. r.store.Return(peerID)
  254. }()
  255. r.routePeer(peerID, conn, queue)
  256. }()
  257. }
  258. }
  259. // dialPeers maintains outbound connections to peers.
  260. func (r *Router) dialPeers() {
  261. for {
  262. select {
  263. case <-r.stopCh:
  264. return
  265. default:
  266. }
  267. peer := r.store.Dispense()
  268. if peer == nil {
  269. r.logger.Debug("no eligible peers, sleeping")
  270. select {
  271. case <-time.After(time.Second):
  272. continue
  273. case <-r.stopCh:
  274. return
  275. }
  276. }
  277. go func() {
  278. defer r.store.Return(peer.ID)
  279. conn, err := r.dialPeer(peer)
  280. if err != nil {
  281. r.logger.Error("failed to dial peer, will retry", "peer", peer.ID)
  282. return
  283. }
  284. defer conn.Close()
  285. queue := newFIFOQueue()
  286. defer queue.close()
  287. r.peerMtx.Lock()
  288. r.peerQueues[peer.ID] = queue
  289. r.peerMtx.Unlock()
  290. defer func() {
  291. r.peerMtx.Lock()
  292. delete(r.peerQueues, peer.ID)
  293. r.peerMtx.Unlock()
  294. }()
  295. r.routePeer(peer.ID, conn, queue)
  296. }()
  297. }
  298. }
  299. // dialPeer attempts to connect to a peer.
  300. func (r *Router) dialPeer(peer *peerInfo) (Connection, error) {
  301. ctx := context.Background()
  302. for _, address := range peer.Addresses {
  303. resolveCtx, cancel := context.WithTimeout(ctx, 5*time.Second)
  304. defer cancel()
  305. r.logger.Info("resolving peer address", "peer", peer.ID, "address", address)
  306. endpoints, err := address.Resolve(resolveCtx)
  307. if err != nil {
  308. r.logger.Error("failed to resolve address", "address", address, "err", err)
  309. continue
  310. }
  311. for _, endpoint := range endpoints {
  312. t, ok := r.transports[endpoint.Protocol]
  313. if !ok {
  314. r.logger.Error("no transport found for protocol", "protocol", endpoint.Protocol)
  315. continue
  316. }
  317. dialCtx, cancel := context.WithTimeout(ctx, 5*time.Second)
  318. defer cancel()
  319. conn, err := t.Dial(dialCtx, endpoint)
  320. if err != nil {
  321. r.logger.Error("failed to dial endpoint", "endpoint", endpoint)
  322. } else {
  323. r.logger.Info("connected to peer", "peer", peer.ID, "endpoint", endpoint)
  324. return conn, nil
  325. }
  326. }
  327. }
  328. return nil, errors.New("failed to connect to peer")
  329. }
  330. // routePeer routes inbound messages from a peer to channels, and also sends
  331. // outbound queued messages to the peer. It will close the connection and send
  332. // queue, using this as a signal to coordinate the internal receivePeer() and
  333. // sendPeer() goroutines. It blocks until the peer is done, e.g. when the
  334. // connection or queue is closed.
  335. func (r *Router) routePeer(peerID NodeID, conn Connection, sendQueue queue) {
  336. // FIXME: Peer updates should probably be handled by the peer store.
  337. r.peerUpdatesCh <- PeerUpdate{
  338. PeerID: peerID,
  339. Status: PeerStatusUp,
  340. }
  341. defer func() {
  342. r.peerUpdatesCh <- PeerUpdate{
  343. PeerID: peerID,
  344. Status: PeerStatusDown,
  345. }
  346. }()
  347. resultsCh := make(chan error, 2)
  348. go func() {
  349. resultsCh <- r.receivePeer(peerID, conn)
  350. }()
  351. go func() {
  352. resultsCh <- r.sendPeer(peerID, conn, sendQueue)
  353. }()
  354. err := <-resultsCh
  355. _ = conn.Close()
  356. sendQueue.close()
  357. if e := <-resultsCh; err == nil {
  358. // The first err was nil, so we update it with the second result,
  359. // which may or may not be nil.
  360. err = e
  361. }
  362. switch err {
  363. case nil, io.EOF, ErrTransportClosed{}:
  364. r.logger.Info("peer disconnected", "peer", peerID)
  365. default:
  366. r.logger.Error("peer failure", "peer", peerID, "err", err)
  367. }
  368. }
  369. // receivePeer receives inbound messages from a peer, deserializes them and
  370. // passes them on to the appropriate channel.
  371. func (r *Router) receivePeer(peerID NodeID, conn Connection) error {
  372. for {
  373. chID, bz, err := conn.ReceiveMessage()
  374. if err != nil {
  375. return err
  376. }
  377. r.channelMtx.RLock()
  378. queue, ok := r.channelQueues[ChannelID(chID)]
  379. messageType := r.channelMessages[ChannelID(chID)]
  380. r.channelMtx.RUnlock()
  381. if !ok {
  382. r.logger.Error("dropping message for unknown channel", "peer", peerID, "channel", chID)
  383. continue
  384. }
  385. msg := proto.Clone(messageType)
  386. if err := proto.Unmarshal(bz, msg); err != nil {
  387. r.logger.Error("message decoding failed, dropping message", "peer", peerID, "err", err)
  388. continue
  389. }
  390. if wrapper, ok := msg.(Wrapper); ok {
  391. msg, err = wrapper.Unwrap()
  392. if err != nil {
  393. r.logger.Error("failed to unwrap message", "err", err)
  394. continue
  395. }
  396. }
  397. select {
  398. // FIXME: ReceiveMessage() should return ChannelID.
  399. case queue.enqueue() <- Envelope{channelID: ChannelID(chID), From: peerID, Message: msg}:
  400. r.logger.Debug("received message", "peer", peerID, "message", msg)
  401. case <-queue.closed():
  402. r.logger.Error("channel closed, dropping message", "peer", peerID, "channel", chID)
  403. case <-r.stopCh:
  404. return nil
  405. }
  406. }
  407. }
  408. // sendPeer sends queued messages to a peer.
  409. func (r *Router) sendPeer(peerID NodeID, conn Connection, queue queue) error {
  410. for {
  411. select {
  412. case envelope := <-queue.dequeue():
  413. bz, err := proto.Marshal(envelope.Message)
  414. if err != nil {
  415. r.logger.Error("failed to marshal message", "peer", peerID, "err", err)
  416. continue
  417. }
  418. // FIXME: SendMessage() should take ChannelID.
  419. _, err = conn.SendMessage(byte(envelope.channelID), bz)
  420. if err != nil {
  421. return err
  422. }
  423. r.logger.Debug("sent message", "peer", envelope.To, "message", envelope.Message)
  424. case <-queue.closed():
  425. return nil
  426. case <-r.stopCh:
  427. return nil
  428. }
  429. }
  430. }
  431. // SubscribePeerUpdates creates a new peer updates subscription. The caller must
  432. // consume the peer updates in a timely fashion, since delivery is guaranteed and
  433. // will block peer connection/disconnection otherwise.
  434. func (r *Router) SubscribePeerUpdates() (*PeerUpdatesCh, error) {
  435. // FIXME: We may want to use a size 1 buffer here. When the router
  436. // broadcasts a peer update it has to loop over all of the
  437. // subscriptions, and we want to avoid blocking and waiting for a
  438. // context switch before continuing to the next subscription. This also
  439. // prevents tail latencies from compounding across updates. We also want
  440. // to make sure the subscribers are reasonably in sync, so it should be
  441. // kept at 1. However, this should be benchmarked first.
  442. peerUpdates := NewPeerUpdates(make(chan PeerUpdate))
  443. r.peerUpdatesMtx.Lock()
  444. r.peerUpdatesSubs[peerUpdates] = peerUpdates
  445. r.peerUpdatesMtx.Unlock()
  446. go func() {
  447. select {
  448. case <-peerUpdates.Done():
  449. r.peerUpdatesMtx.Lock()
  450. delete(r.peerUpdatesSubs, peerUpdates)
  451. r.peerUpdatesMtx.Unlock()
  452. case <-r.stopCh:
  453. }
  454. }()
  455. return peerUpdates, nil
  456. }
  457. // broadcastPeerUpdates broadcasts peer updates received from the router
  458. // to all subscriptions.
  459. func (r *Router) broadcastPeerUpdates() {
  460. for {
  461. select {
  462. case peerUpdate := <-r.peerUpdatesCh:
  463. subs := []*PeerUpdatesCh{}
  464. r.peerUpdatesMtx.RLock()
  465. for _, sub := range r.peerUpdatesSubs {
  466. subs = append(subs, sub)
  467. }
  468. r.peerUpdatesMtx.RUnlock()
  469. for _, sub := range subs {
  470. select {
  471. case sub.updatesCh <- peerUpdate:
  472. case <-sub.doneCh:
  473. case <-r.stopCh:
  474. return
  475. }
  476. }
  477. case <-r.stopCh:
  478. return
  479. }
  480. }
  481. }
  482. // OnStart implements service.Service.
  483. func (r *Router) OnStart() error {
  484. go r.broadcastPeerUpdates()
  485. go r.dialPeers()
  486. for _, transport := range r.transports {
  487. go r.acceptPeers(transport)
  488. }
  489. return nil
  490. }
  491. // OnStop implements service.Service.
  492. func (r *Router) OnStop() {
  493. // Collect all active queues, so we can wait for them to close.
  494. queues := []queue{}
  495. r.channelMtx.RLock()
  496. for _, q := range r.channelQueues {
  497. queues = append(queues, q)
  498. }
  499. r.channelMtx.RUnlock()
  500. r.peerMtx.RLock()
  501. for _, q := range r.peerQueues {
  502. queues = append(queues, q)
  503. }
  504. r.peerMtx.RUnlock()
  505. // Signal router shutdown, and wait for queues (and thus goroutines)
  506. // to complete.
  507. close(r.stopCh)
  508. for _, q := range queues {
  509. <-q.closed()
  510. }
  511. }