You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

567 lines
17 KiB

9 years ago
9 years ago
9 years ago
9 years ago
7 years ago
9 years ago
9 years ago
9 years ago
9 years ago
9 years ago
9 years ago
9 years ago
9 years ago
9 years ago
8 years ago
9 years ago
7 years ago
8 years ago
9 years ago
9 years ago
8 years ago
9 years ago
8 years ago
9 years ago
8 years ago
9 years ago
8 years ago
9 years ago
8 years ago
9 years ago
8 years ago
9 years ago
8 years ago
9 years ago
8 years ago
9 years ago
7 years ago
8 years ago
9 years ago
9 years ago
9 years ago
8 years ago
9 years ago
9 years ago
9 years ago
9 years ago
7 years ago
7 years ago
7 years ago
9 years ago
7 years ago
7 years ago
9 years ago
7 years ago
9 years ago
7 years ago
7 years ago
7 years ago
9 years ago
7 years ago
7 years ago
7 years ago
7 years ago
7 years ago
7 years ago
7 years ago
7 years ago
9 years ago
8 years ago
7 years ago
9 years ago
7 years ago
9 years ago
7 years ago
7 years ago
7 years ago
7 years ago
7 years ago
9 years ago
7 years ago
9 years ago
7 years ago
9 years ago
7 years ago
9 years ago
8 years ago
9 years ago
7 years ago
9 years ago
9 years ago
7 years ago
7 years ago
7 years ago
7 years ago
7 years ago
7 years ago
7 years ago
7 years ago
7 years ago
7 years ago
8 years ago
7 years ago
8 years ago
7 years ago
8 years ago
7 years ago
8 years ago
  1. package p2p
  2. import (
  3. "fmt"
  4. "math"
  5. "math/rand"
  6. "net"
  7. "time"
  8. "github.com/pkg/errors"
  9. crypto "github.com/tendermint/go-crypto"
  10. cfg "github.com/tendermint/tendermint/config"
  11. "github.com/tendermint/tendermint/p2p/conn"
  12. cmn "github.com/tendermint/tmlibs/common"
  13. )
  // dialRandomizerIntervalMilliseconds bounds the random delay (in
  // milliseconds) added before dialing or reconnecting to a peer, to help
  // prevent DoS.
  const dialRandomizerIntervalMilliseconds = 3000

  // Reconnect phase one: retry at a fixed interval for a few minutes,
  // i.e. 20 attempts * 5s = 100s (plus the random delay per attempt).
  const (
  	reconnectAttempts = 20
  	reconnectInterval = 5 * time.Second
  )

  // Reconnect phase two: exponential backoff. Attempt i (counting from 0)
  // first sleeps reconnectBackOffBaseSeconds**i seconds, so the longest
  // single wait is 3**9s (~5.5hrs) and all ten waits add up to ~8hrs.
  const (
  	reconnectBackOffAttempts    = 10
  	reconnectBackOffBaseSeconds = 3
  )
  27. //-----------------------------------------------------------------------------
  28. type AddrBook interface {
  29. AddAddress(addr *NetAddress, src *NetAddress) error
  30. Save()
  31. }
  32. //-----------------------------------------------------------------------------
  33. // `Switch` handles peer connections and exposes an API to receive incoming messages
  34. // on `Reactors`. Each `Reactor` is responsible for handling incoming messages of one
  35. // or more `Channels`. So while sending outgoing messages is typically performed on the peer,
  36. // incoming messages are received on the reactor.
  37. type Switch struct {
  38. cmn.BaseService
  39. config *cfg.P2PConfig
  40. peerConfig *PeerConfig
  41. listeners []Listener
  42. reactors map[string]Reactor
  43. chDescs []*conn.ChannelDescriptor
  44. reactorsByCh map[byte]Reactor
  45. peers *PeerSet
  46. dialing *cmn.CMap
  47. nodeInfo NodeInfo // our node info
  48. nodeKey *NodeKey // our node privkey
  49. filterConnByAddr func(net.Addr) error
  50. filterConnByPubKey func(crypto.PubKey) error
  51. rng *rand.Rand // seed for randomizing dial times and orders
  52. }
  53. func NewSwitch(config *cfg.P2PConfig) *Switch {
  54. sw := &Switch{
  55. config: config,
  56. peerConfig: DefaultPeerConfig(),
  57. reactors: make(map[string]Reactor),
  58. chDescs: make([]*conn.ChannelDescriptor, 0),
  59. reactorsByCh: make(map[byte]Reactor),
  60. peers: NewPeerSet(),
  61. dialing: cmn.NewCMap(),
  62. }
  63. // Ensure we have a completely undeterministic PRNG. cmd.RandInt64() draws
  64. // from a seed that's initialized with OS entropy on process start.
  65. sw.rng = rand.New(rand.NewSource(cmn.RandInt64()))
  66. // TODO: collapse the peerConfig into the config ?
  67. sw.peerConfig.MConfig.FlushThrottle = time.Duration(config.FlushThrottleTimeout) * time.Millisecond
  68. sw.peerConfig.MConfig.SendRate = config.SendRate
  69. sw.peerConfig.MConfig.RecvRate = config.RecvRate
  70. sw.peerConfig.MConfig.MaxMsgPacketPayloadSize = config.MaxMsgPacketPayloadSize
  71. sw.BaseService = *cmn.NewBaseService(nil, "P2P Switch", sw)
  72. return sw
  73. }
  74. //---------------------------------------------------------------------
  75. // Switch setup
  76. // AddReactor adds the given reactor to the switch.
  77. // NOTE: Not goroutine safe.
  78. func (sw *Switch) AddReactor(name string, reactor Reactor) Reactor {
  79. // Validate the reactor.
  80. // No two reactors can share the same channel.
  81. reactorChannels := reactor.GetChannels()
  82. for _, chDesc := range reactorChannels {
  83. chID := chDesc.ID
  84. if sw.reactorsByCh[chID] != nil {
  85. cmn.PanicSanity(fmt.Sprintf("Channel %X has multiple reactors %v & %v", chID, sw.reactorsByCh[chID], reactor))
  86. }
  87. sw.chDescs = append(sw.chDescs, chDesc)
  88. sw.reactorsByCh[chID] = reactor
  89. }
  90. sw.reactors[name] = reactor
  91. reactor.SetSwitch(sw)
  92. return reactor
  93. }
  94. // Reactors returns a map of reactors registered on the switch.
  95. // NOTE: Not goroutine safe.
  96. func (sw *Switch) Reactors() map[string]Reactor {
  97. return sw.reactors
  98. }
  99. // Reactor returns the reactor with the given name.
  100. // NOTE: Not goroutine safe.
  101. func (sw *Switch) Reactor(name string) Reactor {
  102. return sw.reactors[name]
  103. }
  104. // AddListener adds the given listener to the switch for listening to incoming peer connections.
  105. // NOTE: Not goroutine safe.
  106. func (sw *Switch) AddListener(l Listener) {
  107. sw.listeners = append(sw.listeners, l)
  108. }
  109. // Listeners returns the list of listeners the switch listens on.
  110. // NOTE: Not goroutine safe.
  111. func (sw *Switch) Listeners() []Listener {
  112. return sw.listeners
  113. }
  114. // IsListening returns true if the switch has at least one listener.
  115. // NOTE: Not goroutine safe.
  116. func (sw *Switch) IsListening() bool {
  117. return len(sw.listeners) > 0
  118. }
  119. // SetNodeInfo sets the switch's NodeInfo for checking compatibility and handshaking with other nodes.
  120. // NOTE: Not goroutine safe.
  121. func (sw *Switch) SetNodeInfo(nodeInfo NodeInfo) {
  122. sw.nodeInfo = nodeInfo
  123. }
  124. // NodeInfo returns the switch's NodeInfo.
  125. // NOTE: Not goroutine safe.
  126. func (sw *Switch) NodeInfo() NodeInfo {
  127. return sw.nodeInfo
  128. }
  129. // SetNodeKey sets the switch's private key for authenticated encryption.
  130. // NOTE: Not goroutine safe.
  131. func (sw *Switch) SetNodeKey(nodeKey *NodeKey) {
  132. sw.nodeKey = nodeKey
  133. }
  134. //---------------------------------------------------------------------
  135. // Service start/stop
  136. // OnStart implements BaseService. It starts all the reactors, peers, and listeners.
  137. func (sw *Switch) OnStart() error {
  138. // Start reactors
  139. for _, reactor := range sw.reactors {
  140. err := reactor.Start()
  141. if err != nil {
  142. return errors.Wrapf(err, "failed to start %v", reactor)
  143. }
  144. }
  145. // Start listeners
  146. for _, listener := range sw.listeners {
  147. go sw.listenerRoutine(listener)
  148. }
  149. return nil
  150. }
  151. // OnStop implements BaseService. It stops all listeners, peers, and reactors.
  152. func (sw *Switch) OnStop() {
  153. // Stop listeners
  154. for _, listener := range sw.listeners {
  155. listener.Stop()
  156. }
  157. sw.listeners = nil
  158. // Stop peers
  159. for _, peer := range sw.peers.List() {
  160. peer.Stop()
  161. sw.peers.Remove(peer)
  162. }
  163. // Stop reactors
  164. sw.Logger.Debug("Switch: Stopping reactors")
  165. for _, reactor := range sw.reactors {
  166. reactor.Stop()
  167. }
  168. }
  169. //---------------------------------------------------------------------
  170. // Peers
  171. // Broadcast runs a go routine for each attempted send, which will block
  172. // trying to send for defaultSendTimeoutSeconds. Returns a channel
  173. // which receives broadcast result for each attempted send (success=false if times out).
  174. // NOTE: Broadcast uses goroutines, so order of broadcast may not be preserved.
  175. // TODO: Something more intelligent.
  176. type BroadcastResult struct {
  177. PeerKey string
  178. Success bool
  179. }
  180. func (sw *Switch) Broadcast(chID byte, msg interface{}) chan BroadcastResult {
  181. successChan := make(chan BroadcastResult, len(sw.peers.List()))
  182. sw.Logger.Debug("Broadcast", "channel", chID, "msg", msg)
  183. for _, peer := range sw.peers.List() {
  184. go func(peer Peer) {
  185. success := peer.Send(chID, msg)
  186. successChan <- BroadcastResult{peer.Key(), success}
  187. }(peer)
  188. }
  189. return successChan
  190. }
  191. func (sw *Switch) TryBroadcast(chID byte, msg interface{}) chan BroadcastResult {
  192. successChan := make(chan BroadcastResult, len(sw.peers.List()))
  193. sw.Logger.Debug("TryBroadcast", "channel", chID, "msg", msg)
  194. for _, peer := range sw.peers.List() {
  195. success := peer.TrySend(chID, msg)
  196. if success {
  197. successChan <- BroadcastResult{peer.Key(), success}
  198. } else {
  199. go func(peer Peer) {
  200. success := peer.Send(chID, msg)
  201. successChan <- BroadcastResult{peer.Key(), success}
  202. }(peer)
  203. }
  204. }
  205. return successChan
  206. }
  207. // NumPeers returns the count of outbound/inbound and outbound-dialing peers.
  208. func (sw *Switch) NumPeers() (outbound, inbound, dialing int) {
  209. peers := sw.peers.List()
  210. for _, peer := range peers {
  211. if peer.IsOutbound() {
  212. outbound++
  213. } else {
  214. inbound++
  215. }
  216. }
  217. dialing = sw.dialing.Size()
  218. return
  219. }
  220. // Peers returns the set of peers that are connected to the switch.
  221. func (sw *Switch) Peers() IPeerSet {
  222. return sw.peers
  223. }
  224. // StopPeerForError disconnects from a peer due to external error.
  225. // If the peer is persistent, it will attempt to reconnect.
  226. // TODO: make record depending on reason.
  227. func (sw *Switch) StopPeerForError(peer Peer, reason interface{}) {
  228. sw.Logger.Error("Stopping peer for error", "peer", peer, "err", reason)
  229. sw.stopAndRemovePeer(peer, reason)
  230. if peer.IsPersistent() {
  231. go sw.reconnectToPeer(peer)
  232. }
  233. }
  234. // StopPeerGracefully disconnects from a peer gracefully.
  235. // TODO: handle graceful disconnects.
  236. func (sw *Switch) StopPeerGracefully(peer Peer) {
  237. sw.Logger.Info("Stopping peer gracefully")
  238. sw.stopAndRemovePeer(peer, nil)
  239. }
  240. func (sw *Switch) stopAndRemovePeer(peer Peer, reason interface{}) {
  241. sw.peers.Remove(peer)
  242. peer.Stop()
  243. for _, reactor := range sw.reactors {
  244. reactor.RemovePeer(peer, reason)
  245. }
  246. }
  247. // reconnectToPeer tries to reconnect to the peer, first repeatedly
  248. // with a fixed interval, then with exponential backoff.
  249. // If no success after all that, it stops trying, and leaves it
  250. // to the PEX/Addrbook to find the peer again
  251. func (sw *Switch) reconnectToPeer(peer Peer) {
  252. // NOTE this will connect to the self reported address,
  253. // not necessarily the original we dialed
  254. netAddr := peer.NodeInfo().NetAddress()
  255. start := time.Now()
  256. sw.Logger.Info("Reconnecting to peer", "peer", peer)
  257. for i := 0; i < reconnectAttempts; i++ {
  258. if !sw.IsRunning() {
  259. return
  260. }
  261. peer, err := sw.DialPeerWithAddress(netAddr, true)
  262. if err != nil {
  263. sw.Logger.Info("Error reconnecting to peer. Trying again", "tries", i, "err", err, "peer", peer)
  264. // sleep a set amount
  265. sw.randomSleep(reconnectInterval)
  266. continue
  267. } else {
  268. sw.Logger.Info("Reconnected to peer", "peer", peer)
  269. return
  270. }
  271. }
  272. sw.Logger.Error("Failed to reconnect to peer. Beginning exponential backoff",
  273. "peer", peer, "elapsed", time.Since(start))
  274. for i := 0; i < reconnectBackOffAttempts; i++ {
  275. if !sw.IsRunning() {
  276. return
  277. }
  278. // sleep an exponentially increasing amount
  279. sleepIntervalSeconds := math.Pow(reconnectBackOffBaseSeconds, float64(i))
  280. sw.randomSleep(time.Duration(sleepIntervalSeconds) * time.Second)
  281. peer, err := sw.DialPeerWithAddress(netAddr, true)
  282. if err != nil {
  283. sw.Logger.Info("Error reconnecting to peer. Trying again", "tries", i, "err", err, "peer", peer)
  284. continue
  285. } else {
  286. sw.Logger.Info("Reconnected to peer", "peer", peer)
  287. return
  288. }
  289. }
  290. sw.Logger.Error("Failed to reconnect to peer. Giving up", "peer", peer, "elapsed", time.Since(start))
  291. }
  292. //---------------------------------------------------------------------
  293. // Dialing
  294. // IsDialing returns true if the switch is currently dialing the given ID.
  295. func (sw *Switch) IsDialing(id ID) bool {
  296. return sw.dialing.Has(string(id))
  297. }
  298. // DialPeersAsync dials a list of peers asynchronously in random order (optionally, making them persistent).
  299. func (sw *Switch) DialPeersAsync(addrBook AddrBook, peers []string, persistent bool) error {
  300. netAddrs, errs := NewNetAddressStrings(peers)
  301. for _, err := range errs {
  302. sw.Logger.Error("Error in peer's address", "err", err)
  303. }
  304. if addrBook != nil {
  305. // add peers to `addrBook`
  306. ourAddr := sw.nodeInfo.NetAddress()
  307. for _, netAddr := range netAddrs {
  308. // do not add our address or ID
  309. if netAddr.Same(ourAddr) {
  310. continue
  311. }
  312. // TODO: move this out of here ?
  313. addrBook.AddAddress(netAddr, ourAddr)
  314. }
  315. // Persist some peers to disk right away.
  316. // NOTE: integration tests depend on this
  317. addrBook.Save()
  318. }
  319. // permute the list, dial them in random order.
  320. perm := sw.rng.Perm(len(netAddrs))
  321. for i := 0; i < len(perm); i++ {
  322. go func(i int) {
  323. sw.randomSleep(0)
  324. j := perm[i]
  325. peer, err := sw.DialPeerWithAddress(netAddrs[j], persistent)
  326. if err != nil {
  327. sw.Logger.Error("Error dialing peer", "err", err)
  328. } else {
  329. sw.Logger.Info("Connected to peer", "peer", peer)
  330. }
  331. }(i)
  332. }
  333. return nil
  334. }
  335. // DialPeerWithAddress dials the given peer and runs sw.addPeer if it connects and authenticates successfully.
  336. // If `persistent == true`, the switch will always try to reconnect to this peer if the connection ever fails.
  337. func (sw *Switch) DialPeerWithAddress(addr *NetAddress, persistent bool) (Peer, error) {
  338. sw.dialing.Set(string(addr.ID), addr)
  339. defer sw.dialing.Delete(string(addr.ID))
  340. return sw.addOutboundPeerWithConfig(addr, sw.peerConfig, persistent)
  341. }
  342. // sleep for interval plus some random amount of ms on [0, dialRandomizerIntervalMilliseconds]
  343. func (sw *Switch) randomSleep(interval time.Duration) {
  344. r := time.Duration(sw.rng.Int63n(dialRandomizerIntervalMilliseconds)) * time.Millisecond
  345. time.Sleep(r + interval)
  346. }
  347. //------------------------------------------------------------------------------------
  348. // Connection filtering
  349. // FilterConnByAddr returns an error if connecting to the given address is forbidden.
  350. func (sw *Switch) FilterConnByAddr(addr net.Addr) error {
  351. if sw.filterConnByAddr != nil {
  352. return sw.filterConnByAddr(addr)
  353. }
  354. return nil
  355. }
  356. // FilterConnByPubKey returns an error if connecting to the given public key is forbidden.
  357. func (sw *Switch) FilterConnByPubKey(pubkey crypto.PubKey) error {
  358. if sw.filterConnByPubKey != nil {
  359. return sw.filterConnByPubKey(pubkey)
  360. }
  361. return nil
  362. }
  363. // SetAddrFilter sets the function for filtering connections by address.
  364. func (sw *Switch) SetAddrFilter(f func(net.Addr) error) {
  365. sw.filterConnByAddr = f
  366. }
  367. // SetPubKeyFilter sets the function for filtering connections by public key.
  368. func (sw *Switch) SetPubKeyFilter(f func(crypto.PubKey) error) {
  369. sw.filterConnByPubKey = f
  370. }
  371. //------------------------------------------------------------------------------------
  372. func (sw *Switch) listenerRoutine(l Listener) {
  373. for {
  374. inConn, ok := <-l.Connections()
  375. if !ok {
  376. break
  377. }
  378. // ignore connection if we already have enough
  379. maxPeers := sw.config.MaxNumPeers
  380. if maxPeers <= sw.peers.Size() {
  381. sw.Logger.Info("Ignoring inbound connection: already have enough peers", "address", inConn.RemoteAddr().String(), "numPeers", sw.peers.Size(), "max", maxPeers)
  382. continue
  383. }
  384. // New inbound connection!
  385. err := sw.addInboundPeerWithConfig(inConn, sw.peerConfig)
  386. if err != nil {
  387. sw.Logger.Info("Ignoring inbound connection: error while adding peer", "address", inConn.RemoteAddr().String(), "err", err)
  388. continue
  389. }
  390. }
  391. // cleanup
  392. }
  393. func (sw *Switch) addInboundPeerWithConfig(conn net.Conn, config *PeerConfig) error {
  394. peer, err := newInboundPeer(conn, sw.reactorsByCh, sw.chDescs, sw.StopPeerForError, sw.nodeKey.PrivKey, config)
  395. if err != nil {
  396. conn.Close() // peer is nil
  397. return err
  398. }
  399. peer.SetLogger(sw.Logger.With("peer", conn.RemoteAddr()))
  400. if err = sw.addPeer(peer); err != nil {
  401. peer.CloseConn()
  402. return err
  403. }
  404. return nil
  405. }
  406. // dial the peer; make secret connection; authenticate against the dialed ID;
  407. // add the peer.
  408. func (sw *Switch) addOutboundPeerWithConfig(addr *NetAddress, config *PeerConfig, persistent bool) (Peer, error) {
  409. sw.Logger.Info("Dialing peer", "address", addr)
  410. peer, err := newOutboundPeer(addr, sw.reactorsByCh, sw.chDescs, sw.StopPeerForError, sw.nodeKey.PrivKey, config, persistent)
  411. if err != nil {
  412. sw.Logger.Error("Failed to dial peer", "address", addr, "err", err)
  413. return nil, err
  414. }
  415. peer.SetLogger(sw.Logger.With("peer", addr))
  416. // authenticate peer
  417. if addr.ID == "" {
  418. peer.Logger.Info("Dialed peer with unknown ID - unable to authenticate", "addr", addr)
  419. } else if addr.ID != peer.ID() {
  420. peer.CloseConn()
  421. return nil, ErrSwitchAuthenticationFailure{addr, peer.ID()}
  422. }
  423. err = sw.addPeer(peer)
  424. if err != nil {
  425. sw.Logger.Error("Failed to add peer", "address", addr, "err", err)
  426. peer.CloseConn()
  427. return nil, err
  428. }
  429. sw.Logger.Info("Dialed and added peer", "address", addr, "peer", peer)
  430. return peer, nil
  431. }
  432. // addPeer performs the Tendermint P2P handshake with a peer
  433. // that already has a SecretConnection. If all goes well,
  434. // it starts the peer and adds it to the switch.
  435. // NOTE: This performs a blocking handshake before the peer is added.
  436. // NOTE: If error is returned, caller is responsible for calling peer.CloseConn()
  437. func (sw *Switch) addPeer(peer *peer) error {
  438. // Avoid self
  439. if sw.nodeKey.ID() == peer.ID() {
  440. return ErrSwitchConnectToSelf
  441. }
  442. // Avoid duplicate
  443. if sw.peers.Has(peer.ID()) {
  444. return ErrSwitchDuplicatePeer
  445. }
  446. // Filter peer against white list
  447. if err := sw.FilterConnByAddr(peer.Addr()); err != nil {
  448. return err
  449. }
  450. if err := sw.FilterConnByPubKey(peer.PubKey()); err != nil {
  451. return err
  452. }
  453. // Exchange NodeInfo with the peer
  454. if err := peer.HandshakeTimeout(sw.nodeInfo, time.Duration(sw.peerConfig.HandshakeTimeout*time.Second)); err != nil {
  455. return err
  456. }
  457. // Validate the peers nodeInfo against the pubkey
  458. if err := peer.NodeInfo().Validate(peer.PubKey()); err != nil {
  459. return err
  460. }
  461. // Check version, chain id
  462. if err := sw.nodeInfo.CompatibleWith(peer.NodeInfo()); err != nil {
  463. return err
  464. }
  465. // All good. Start peer
  466. if sw.IsRunning() {
  467. sw.startInitPeer(peer)
  468. }
  469. // Add the peer to .peers.
  470. // We start it first so that a peer in the list is safe to Stop.
  471. // It should not err since we already checked peers.Has().
  472. if err := sw.peers.Add(peer); err != nil {
  473. return err
  474. }
  475. sw.Logger.Info("Added peer", "peer", peer)
  476. return nil
  477. }
  478. func (sw *Switch) startInitPeer(peer *peer) {
  479. err := peer.Start() // spawn send/recv routines
  480. if err != nil {
  481. // Should never happen
  482. sw.Logger.Error("Error starting peer", "peer", peer, "err", err)
  483. }
  484. for _, reactor := range sw.reactors {
  485. reactor.AddPeer(peer)
  486. }
  487. }