You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

692 lines
19 KiB

p2p: file descriptor leaks (#3150) * close peer's connection to avoid fd leak Fixes #2967 * rename peer#Addr to RemoteAddr * fix test * fixes after Ethan's review * bring back the check * changelog entry * write a test for switch#acceptRoutine * increase timeouts? :( * remove extra assertNPeersWithTimeout * simplify test * assert number of peers (just to be safe) * Cleanup in OnStop * run tests with verbose flag on CircleCI * spawn a reading routine to prevent connection from closing * get port from the listener random port is faster, but often results in ``` panic: listen tcp 127.0.0.1:44068: bind: address already in use [recovered] panic: listen tcp 127.0.0.1:44068: bind: address already in use goroutine 79 [running]: testing.tRunner.func1(0xc0001bd600) /usr/local/go/src/testing/testing.go:792 +0x387 panic(0x974d20, 0xc0001b0500) /usr/local/go/src/runtime/panic.go:513 +0x1b9 github.com/tendermint/tendermint/p2p.MakeSwitch(0xc0000f42a0, 0x0, 0x9fb9cc, 0x9, 0x9fc346, 0xb, 0xb42128, 0x0, 0x0, 0x0, ...) /home/vagrant/go/src/github.com/tendermint/tendermint/p2p/test_util.go:182 +0xa28 github.com/tendermint/tendermint/p2p.MakeConnectedSwitches(0xc0000f42a0, 0x2, 0xb42128, 0xb41eb8, 0x4f1205, 0xc0001bed80, 0x4f16ed) /home/vagrant/go/src/github.com/tendermint/tendermint/p2p/test_util.go:75 +0xf9 github.com/tendermint/tendermint/p2p.MakeSwitchPair(0xbb8d20, 0xc0001bd600, 0xb42128, 0x2f7, 0x4f16c0) /home/vagrant/go/src/github.com/tendermint/tendermint/p2p/switch_test.go:94 +0x4c github.com/tendermint/tendermint/p2p.TestSwitches(0xc0001bd600) /home/vagrant/go/src/github.com/tendermint/tendermint/p2p/switch_test.go:117 +0x58 testing.tRunner(0xc0001bd600, 0xb42038) /usr/local/go/src/testing/testing.go:827 +0xbf created by testing.(*T).Run /usr/local/go/src/testing/testing.go:878 +0x353 exit status 2 FAIL github.com/tendermint/tendermint/p2p 0.350s ```
6 years ago
p2p: file descriptor leaks (#3150) * close peer's connection to avoid fd leak Fixes #2967 * rename peer#Addr to RemoteAddr * fix test * fixes after Ethan's review * bring back the check * changelog entry * write a test for switch#acceptRoutine * increase timeouts? :( * remove extra assertNPeersWithTimeout * simplify test * assert number of peers (just to be safe) * Cleanup in OnStop * run tests with verbose flag on CircleCI * spawn a reading routine to prevent connection from closing * get port from the listener random port is faster, but often results in ``` panic: listen tcp 127.0.0.1:44068: bind: address already in use [recovered] panic: listen tcp 127.0.0.1:44068: bind: address already in use goroutine 79 [running]: testing.tRunner.func1(0xc0001bd600) /usr/local/go/src/testing/testing.go:792 +0x387 panic(0x974d20, 0xc0001b0500) /usr/local/go/src/runtime/panic.go:513 +0x1b9 github.com/tendermint/tendermint/p2p.MakeSwitch(0xc0000f42a0, 0x0, 0x9fb9cc, 0x9, 0x9fc346, 0xb, 0xb42128, 0x0, 0x0, 0x0, ...) /home/vagrant/go/src/github.com/tendermint/tendermint/p2p/test_util.go:182 +0xa28 github.com/tendermint/tendermint/p2p.MakeConnectedSwitches(0xc0000f42a0, 0x2, 0xb42128, 0xb41eb8, 0x4f1205, 0xc0001bed80, 0x4f16ed) /home/vagrant/go/src/github.com/tendermint/tendermint/p2p/test_util.go:75 +0xf9 github.com/tendermint/tendermint/p2p.MakeSwitchPair(0xbb8d20, 0xc0001bd600, 0xb42128, 0x2f7, 0x4f16c0) /home/vagrant/go/src/github.com/tendermint/tendermint/p2p/switch_test.go:94 +0x4c github.com/tendermint/tendermint/p2p.TestSwitches(0xc0001bd600) /home/vagrant/go/src/github.com/tendermint/tendermint/p2p/switch_test.go:117 +0x58 testing.tRunner(0xc0001bd600, 0xb42038) /usr/local/go/src/testing/testing.go:827 +0xbf created by testing.(*T).Run /usr/local/go/src/testing/testing.go:878 +0x353 exit status 2 FAIL github.com/tendermint/tendermint/p2p 0.350s ```
6 years ago
p2p: file descriptor leaks (#3150) * close peer's connection to avoid fd leak Fixes #2967 * rename peer#Addr to RemoteAddr * fix test * fixes after Ethan's review * bring back the check * changelog entry * write a test for switch#acceptRoutine * increase timeouts? :( * remove extra assertNPeersWithTimeout * simplify test * assert number of peers (just to be safe) * Cleanup in OnStop * run tests with verbose flag on CircleCI * spawn a reading routine to prevent connection from closing * get port from the listener random port is faster, but often results in ``` panic: listen tcp 127.0.0.1:44068: bind: address already in use [recovered] panic: listen tcp 127.0.0.1:44068: bind: address already in use goroutine 79 [running]: testing.tRunner.func1(0xc0001bd600) /usr/local/go/src/testing/testing.go:792 +0x387 panic(0x974d20, 0xc0001b0500) /usr/local/go/src/runtime/panic.go:513 +0x1b9 github.com/tendermint/tendermint/p2p.MakeSwitch(0xc0000f42a0, 0x0, 0x9fb9cc, 0x9, 0x9fc346, 0xb, 0xb42128, 0x0, 0x0, 0x0, ...) /home/vagrant/go/src/github.com/tendermint/tendermint/p2p/test_util.go:182 +0xa28 github.com/tendermint/tendermint/p2p.MakeConnectedSwitches(0xc0000f42a0, 0x2, 0xb42128, 0xb41eb8, 0x4f1205, 0xc0001bed80, 0x4f16ed) /home/vagrant/go/src/github.com/tendermint/tendermint/p2p/test_util.go:75 +0xf9 github.com/tendermint/tendermint/p2p.MakeSwitchPair(0xbb8d20, 0xc0001bd600, 0xb42128, 0x2f7, 0x4f16c0) /home/vagrant/go/src/github.com/tendermint/tendermint/p2p/switch_test.go:94 +0x4c github.com/tendermint/tendermint/p2p.TestSwitches(0xc0001bd600) /home/vagrant/go/src/github.com/tendermint/tendermint/p2p/switch_test.go:117 +0x58 testing.tRunner(0xc0001bd600, 0xb42038) /usr/local/go/src/testing/testing.go:827 +0xbf created by testing.(*T).Run /usr/local/go/src/testing/testing.go:878 +0x353 exit status 2 FAIL github.com/tendermint/tendermint/p2p 0.350s ```
6 years ago
p2p: file descriptor leaks (#3150) * close peer's connection to avoid fd leak Fixes #2967 * rename peer#Addr to RemoteAddr * fix test * fixes after Ethan's review * bring back the check * changelog entry * write a test for switch#acceptRoutine * increase timeouts? :( * remove extra assertNPeersWithTimeout * simplify test * assert number of peers (just to be safe) * Cleanup in OnStop * run tests with verbose flag on CircleCI * spawn a reading routine to prevent connection from closing * get port from the listener random port is faster, but often results in ``` panic: listen tcp 127.0.0.1:44068: bind: address already in use [recovered] panic: listen tcp 127.0.0.1:44068: bind: address already in use goroutine 79 [running]: testing.tRunner.func1(0xc0001bd600) /usr/local/go/src/testing/testing.go:792 +0x387 panic(0x974d20, 0xc0001b0500) /usr/local/go/src/runtime/panic.go:513 +0x1b9 github.com/tendermint/tendermint/p2p.MakeSwitch(0xc0000f42a0, 0x0, 0x9fb9cc, 0x9, 0x9fc346, 0xb, 0xb42128, 0x0, 0x0, 0x0, ...) /home/vagrant/go/src/github.com/tendermint/tendermint/p2p/test_util.go:182 +0xa28 github.com/tendermint/tendermint/p2p.MakeConnectedSwitches(0xc0000f42a0, 0x2, 0xb42128, 0xb41eb8, 0x4f1205, 0xc0001bed80, 0x4f16ed) /home/vagrant/go/src/github.com/tendermint/tendermint/p2p/test_util.go:75 +0xf9 github.com/tendermint/tendermint/p2p.MakeSwitchPair(0xbb8d20, 0xc0001bd600, 0xb42128, 0x2f7, 0x4f16c0) /home/vagrant/go/src/github.com/tendermint/tendermint/p2p/switch_test.go:94 +0x4c github.com/tendermint/tendermint/p2p.TestSwitches(0xc0001bd600) /home/vagrant/go/src/github.com/tendermint/tendermint/p2p/switch_test.go:117 +0x58 testing.tRunner(0xc0001bd600, 0xb42038) /usr/local/go/src/testing/testing.go:827 +0xbf created by testing.(*T).Run /usr/local/go/src/testing/testing.go:878 +0x353 exit status 2 FAIL github.com/tendermint/tendermint/p2p 0.350s ```
6 years ago
p2p: file descriptor leaks (#3150) * close peer's connection to avoid fd leak Fixes #2967 * rename peer#Addr to RemoteAddr * fix test * fixes after Ethan's review * bring back the check * changelog entry * write a test for switch#acceptRoutine * increase timeouts? :( * remove extra assertNPeersWithTimeout * simplify test * assert number of peers (just to be safe) * Cleanup in OnStop * run tests with verbose flag on CircleCI * spawn a reading routine to prevent connection from closing * get port from the listener random port is faster, but often results in ``` panic: listen tcp 127.0.0.1:44068: bind: address already in use [recovered] panic: listen tcp 127.0.0.1:44068: bind: address already in use goroutine 79 [running]: testing.tRunner.func1(0xc0001bd600) /usr/local/go/src/testing/testing.go:792 +0x387 panic(0x974d20, 0xc0001b0500) /usr/local/go/src/runtime/panic.go:513 +0x1b9 github.com/tendermint/tendermint/p2p.MakeSwitch(0xc0000f42a0, 0x0, 0x9fb9cc, 0x9, 0x9fc346, 0xb, 0xb42128, 0x0, 0x0, 0x0, ...) /home/vagrant/go/src/github.com/tendermint/tendermint/p2p/test_util.go:182 +0xa28 github.com/tendermint/tendermint/p2p.MakeConnectedSwitches(0xc0000f42a0, 0x2, 0xb42128, 0xb41eb8, 0x4f1205, 0xc0001bed80, 0x4f16ed) /home/vagrant/go/src/github.com/tendermint/tendermint/p2p/test_util.go:75 +0xf9 github.com/tendermint/tendermint/p2p.MakeSwitchPair(0xbb8d20, 0xc0001bd600, 0xb42128, 0x2f7, 0x4f16c0) /home/vagrant/go/src/github.com/tendermint/tendermint/p2p/switch_test.go:94 +0x4c github.com/tendermint/tendermint/p2p.TestSwitches(0xc0001bd600) /home/vagrant/go/src/github.com/tendermint/tendermint/p2p/switch_test.go:117 +0x58 testing.tRunner(0xc0001bd600, 0xb42038) /usr/local/go/src/testing/testing.go:827 +0xbf created by testing.(*T).Run /usr/local/go/src/testing/testing.go:878 +0x353 exit status 2 FAIL github.com/tendermint/tendermint/p2p 0.350s ```
6 years ago
p2p: file descriptor leaks (#3150) * close peer's connection to avoid fd leak Fixes #2967 * rename peer#Addr to RemoteAddr * fix test * fixes after Ethan's review * bring back the check * changelog entry * write a test for switch#acceptRoutine * increase timeouts? :( * remove extra assertNPeersWithTimeout * simplify test * assert number of peers (just to be safe) * Cleanup in OnStop * run tests with verbose flag on CircleCI * spawn a reading routine to prevent connection from closing * get port from the listener random port is faster, but often results in ``` panic: listen tcp 127.0.0.1:44068: bind: address already in use [recovered] panic: listen tcp 127.0.0.1:44068: bind: address already in use goroutine 79 [running]: testing.tRunner.func1(0xc0001bd600) /usr/local/go/src/testing/testing.go:792 +0x387 panic(0x974d20, 0xc0001b0500) /usr/local/go/src/runtime/panic.go:513 +0x1b9 github.com/tendermint/tendermint/p2p.MakeSwitch(0xc0000f42a0, 0x0, 0x9fb9cc, 0x9, 0x9fc346, 0xb, 0xb42128, 0x0, 0x0, 0x0, ...) /home/vagrant/go/src/github.com/tendermint/tendermint/p2p/test_util.go:182 +0xa28 github.com/tendermint/tendermint/p2p.MakeConnectedSwitches(0xc0000f42a0, 0x2, 0xb42128, 0xb41eb8, 0x4f1205, 0xc0001bed80, 0x4f16ed) /home/vagrant/go/src/github.com/tendermint/tendermint/p2p/test_util.go:75 +0xf9 github.com/tendermint/tendermint/p2p.MakeSwitchPair(0xbb8d20, 0xc0001bd600, 0xb42128, 0x2f7, 0x4f16c0) /home/vagrant/go/src/github.com/tendermint/tendermint/p2p/switch_test.go:94 +0x4c github.com/tendermint/tendermint/p2p.TestSwitches(0xc0001bd600) /home/vagrant/go/src/github.com/tendermint/tendermint/p2p/switch_test.go:117 +0x58 testing.tRunner(0xc0001bd600, 0xb42038) /usr/local/go/src/testing/testing.go:827 +0xbf created by testing.(*T).Run /usr/local/go/src/testing/testing.go:878 +0x353 exit status 2 FAIL github.com/tendermint/tendermint/p2p 0.350s ```
6 years ago
p2p: file descriptor leaks (#3150) * close peer's connection to avoid fd leak Fixes #2967 * rename peer#Addr to RemoteAddr * fix test * fixes after Ethan's review * bring back the check * changelog entry * write a test for switch#acceptRoutine * increase timeouts? :( * remove extra assertNPeersWithTimeout * simplify test * assert number of peers (just to be safe) * Cleanup in OnStop * run tests with verbose flag on CircleCI * spawn a reading routine to prevent connection from closing * get port from the listener random port is faster, but often results in ``` panic: listen tcp 127.0.0.1:44068: bind: address already in use [recovered] panic: listen tcp 127.0.0.1:44068: bind: address already in use goroutine 79 [running]: testing.tRunner.func1(0xc0001bd600) /usr/local/go/src/testing/testing.go:792 +0x387 panic(0x974d20, 0xc0001b0500) /usr/local/go/src/runtime/panic.go:513 +0x1b9 github.com/tendermint/tendermint/p2p.MakeSwitch(0xc0000f42a0, 0x0, 0x9fb9cc, 0x9, 0x9fc346, 0xb, 0xb42128, 0x0, 0x0, 0x0, ...) /home/vagrant/go/src/github.com/tendermint/tendermint/p2p/test_util.go:182 +0xa28 github.com/tendermint/tendermint/p2p.MakeConnectedSwitches(0xc0000f42a0, 0x2, 0xb42128, 0xb41eb8, 0x4f1205, 0xc0001bed80, 0x4f16ed) /home/vagrant/go/src/github.com/tendermint/tendermint/p2p/test_util.go:75 +0xf9 github.com/tendermint/tendermint/p2p.MakeSwitchPair(0xbb8d20, 0xc0001bd600, 0xb42128, 0x2f7, 0x4f16c0) /home/vagrant/go/src/github.com/tendermint/tendermint/p2p/switch_test.go:94 +0x4c github.com/tendermint/tendermint/p2p.TestSwitches(0xc0001bd600) /home/vagrant/go/src/github.com/tendermint/tendermint/p2p/switch_test.go:117 +0x58 testing.tRunner(0xc0001bd600, 0xb42038) /usr/local/go/src/testing/testing.go:827 +0xbf created by testing.(*T).Run /usr/local/go/src/testing/testing.go:878 +0x353 exit status 2 FAIL github.com/tendermint/tendermint/p2p 0.350s ```
6 years ago
p2p: file descriptor leaks (#3150) * close peer's connection to avoid fd leak Fixes #2967 * rename peer#Addr to RemoteAddr * fix test * fixes after Ethan's review * bring back the check * changelog entry * write a test for switch#acceptRoutine * increase timeouts? :( * remove extra assertNPeersWithTimeout * simplify test * assert number of peers (just to be safe) * Cleanup in OnStop * run tests with verbose flag on CircleCI * spawn a reading routine to prevent connection from closing * get port from the listener random port is faster, but often results in ``` panic: listen tcp 127.0.0.1:44068: bind: address already in use [recovered] panic: listen tcp 127.0.0.1:44068: bind: address already in use goroutine 79 [running]: testing.tRunner.func1(0xc0001bd600) /usr/local/go/src/testing/testing.go:792 +0x387 panic(0x974d20, 0xc0001b0500) /usr/local/go/src/runtime/panic.go:513 +0x1b9 github.com/tendermint/tendermint/p2p.MakeSwitch(0xc0000f42a0, 0x0, 0x9fb9cc, 0x9, 0x9fc346, 0xb, 0xb42128, 0x0, 0x0, 0x0, ...) /home/vagrant/go/src/github.com/tendermint/tendermint/p2p/test_util.go:182 +0xa28 github.com/tendermint/tendermint/p2p.MakeConnectedSwitches(0xc0000f42a0, 0x2, 0xb42128, 0xb41eb8, 0x4f1205, 0xc0001bed80, 0x4f16ed) /home/vagrant/go/src/github.com/tendermint/tendermint/p2p/test_util.go:75 +0xf9 github.com/tendermint/tendermint/p2p.MakeSwitchPair(0xbb8d20, 0xc0001bd600, 0xb42128, 0x2f7, 0x4f16c0) /home/vagrant/go/src/github.com/tendermint/tendermint/p2p/switch_test.go:94 +0x4c github.com/tendermint/tendermint/p2p.TestSwitches(0xc0001bd600) /home/vagrant/go/src/github.com/tendermint/tendermint/p2p/switch_test.go:117 +0x58 testing.tRunner(0xc0001bd600, 0xb42038) /usr/local/go/src/testing/testing.go:827 +0xbf created by testing.(*T).Run /usr/local/go/src/testing/testing.go:878 +0x353 exit status 2 FAIL github.com/tendermint/tendermint/p2p 0.350s ```
6 years ago
  1. package p2p
  2. import (
  3. "fmt"
  4. "math"
  5. "sync"
  6. "time"
  7. "github.com/tendermint/tendermint/config"
  8. cmn "github.com/tendermint/tendermint/libs/common"
  9. "github.com/tendermint/tendermint/p2p/conn"
  10. )
  11. const (
  12. // wait a random amount of time from this interval
  13. // before dialing peers or reconnecting to help prevent DoS
  14. dialRandomizerIntervalMilliseconds = 3000
  15. // repeatedly try to reconnect for a few minutes
  16. // ie. 5 * 20 = 100s
  17. reconnectAttempts = 20
  18. reconnectInterval = 5 * time.Second
  19. // then move into exponential backoff mode for ~1day
  20. // ie. 3**10 = 16hrs
  21. reconnectBackOffAttempts = 10
  22. reconnectBackOffBaseSeconds = 3
  23. )
  24. // MConnConfig returns an MConnConfig with fields updated
  25. // from the P2PConfig.
  26. func MConnConfig(cfg *config.P2PConfig) conn.MConnConfig {
  27. mConfig := conn.DefaultMConnConfig()
  28. mConfig.FlushThrottle = cfg.FlushThrottleTimeout
  29. mConfig.SendRate = cfg.SendRate
  30. mConfig.RecvRate = cfg.RecvRate
  31. mConfig.MaxPacketMsgPayloadSize = cfg.MaxPacketMsgPayloadSize
  32. return mConfig
  33. }
  34. //-----------------------------------------------------------------------------
  35. // An AddrBook represents an address book from the pex package, which is used
  36. // to store peer addresses.
  37. type AddrBook interface {
  38. AddAddress(addr *NetAddress, src *NetAddress) error
  39. AddOurAddress(*NetAddress)
  40. OurAddress(*NetAddress) bool
  41. MarkGood(*NetAddress)
  42. RemoveAddress(*NetAddress)
  43. HasAddress(*NetAddress) bool
  44. Save()
  45. }
  46. // PeerFilterFunc to be implemented by filter hooks after a new Peer has been
  47. // fully setup.
  48. type PeerFilterFunc func(IPeerSet, Peer) error
  49. //-----------------------------------------------------------------------------
  50. // Switch handles peer connections and exposes an API to receive incoming messages
  51. // on `Reactors`. Each `Reactor` is responsible for handling incoming messages of one
  52. // or more `Channels`. So while sending outgoing messages is typically performed on the peer,
  53. // incoming messages are received on the reactor.
  54. type Switch struct {
  55. cmn.BaseService
  56. config *config.P2PConfig
  57. reactors map[string]Reactor
  58. chDescs []*conn.ChannelDescriptor
  59. reactorsByCh map[byte]Reactor
  60. peers *PeerSet
  61. dialing *cmn.CMap
  62. reconnecting *cmn.CMap
  63. nodeInfo NodeInfo // our node info
  64. nodeKey *NodeKey // our node privkey
  65. addrBook AddrBook
  66. transport Transport
  67. filterTimeout time.Duration
  68. peerFilters []PeerFilterFunc
  69. rng *cmn.Rand // seed for randomizing dial times and orders
  70. metrics *Metrics
  71. }
  72. // SwitchOption sets an optional parameter on the Switch.
  73. type SwitchOption func(*Switch)
  74. // NewSwitch creates a new Switch with the given config.
  75. func NewSwitch(
  76. cfg *config.P2PConfig,
  77. transport Transport,
  78. options ...SwitchOption,
  79. ) *Switch {
  80. sw := &Switch{
  81. config: cfg,
  82. reactors: make(map[string]Reactor),
  83. chDescs: make([]*conn.ChannelDescriptor, 0),
  84. reactorsByCh: make(map[byte]Reactor),
  85. peers: NewPeerSet(),
  86. dialing: cmn.NewCMap(),
  87. reconnecting: cmn.NewCMap(),
  88. metrics: NopMetrics(),
  89. transport: transport,
  90. filterTimeout: defaultFilterTimeout,
  91. }
  92. // Ensure we have a completely undeterministic PRNG.
  93. sw.rng = cmn.NewRand()
  94. sw.BaseService = *cmn.NewBaseService(nil, "P2P Switch", sw)
  95. for _, option := range options {
  96. option(sw)
  97. }
  98. return sw
  99. }
  100. // SwitchFilterTimeout sets the timeout used for peer filters.
  101. func SwitchFilterTimeout(timeout time.Duration) SwitchOption {
  102. return func(sw *Switch) { sw.filterTimeout = timeout }
  103. }
  104. // SwitchPeerFilters sets the filters for rejection of new peers.
  105. func SwitchPeerFilters(filters ...PeerFilterFunc) SwitchOption {
  106. return func(sw *Switch) { sw.peerFilters = filters }
  107. }
  108. // WithMetrics sets the metrics.
  109. func WithMetrics(metrics *Metrics) SwitchOption {
  110. return func(sw *Switch) { sw.metrics = metrics }
  111. }
  112. //---------------------------------------------------------------------
  113. // Switch setup
  114. // AddReactor adds the given reactor to the switch.
  115. // NOTE: Not goroutine safe.
  116. func (sw *Switch) AddReactor(name string, reactor Reactor) Reactor {
  117. // Validate the reactor.
  118. // No two reactors can share the same channel.
  119. reactorChannels := reactor.GetChannels()
  120. for _, chDesc := range reactorChannels {
  121. chID := chDesc.ID
  122. if sw.reactorsByCh[chID] != nil {
  123. cmn.PanicSanity(fmt.Sprintf("Channel %X has multiple reactors %v & %v", chID, sw.reactorsByCh[chID], reactor))
  124. }
  125. sw.chDescs = append(sw.chDescs, chDesc)
  126. sw.reactorsByCh[chID] = reactor
  127. }
  128. sw.reactors[name] = reactor
  129. reactor.SetSwitch(sw)
  130. return reactor
  131. }
  132. // Reactors returns a map of reactors registered on the switch.
  133. // NOTE: Not goroutine safe.
  134. func (sw *Switch) Reactors() map[string]Reactor {
  135. return sw.reactors
  136. }
  137. // Reactor returns the reactor with the given name.
  138. // NOTE: Not goroutine safe.
  139. func (sw *Switch) Reactor(name string) Reactor {
  140. return sw.reactors[name]
  141. }
  142. // SetNodeInfo sets the switch's NodeInfo for checking compatibility and handshaking with other nodes.
  143. // NOTE: Not goroutine safe.
  144. func (sw *Switch) SetNodeInfo(nodeInfo NodeInfo) {
  145. sw.nodeInfo = nodeInfo
  146. }
  147. // NodeInfo returns the switch's NodeInfo.
  148. // NOTE: Not goroutine safe.
  149. func (sw *Switch) NodeInfo() NodeInfo {
  150. return sw.nodeInfo
  151. }
  152. // SetNodeKey sets the switch's private key for authenticated encryption.
  153. // NOTE: Not goroutine safe.
  154. func (sw *Switch) SetNodeKey(nodeKey *NodeKey) {
  155. sw.nodeKey = nodeKey
  156. }
  157. //---------------------------------------------------------------------
  158. // Service start/stop
  159. // OnStart implements BaseService. It starts all the reactors and peers.
  160. func (sw *Switch) OnStart() error {
  161. // Start reactors
  162. for _, reactor := range sw.reactors {
  163. err := reactor.Start()
  164. if err != nil {
  165. return cmn.ErrorWrap(err, "failed to start %v", reactor)
  166. }
  167. }
  168. // Start accepting Peers.
  169. go sw.acceptRoutine()
  170. return nil
  171. }
  172. // OnStop implements BaseService. It stops all peers and reactors.
  173. func (sw *Switch) OnStop() {
  174. // Stop peers
  175. for _, p := range sw.peers.List() {
  176. sw.transport.Cleanup(p)
  177. p.Stop()
  178. if sw.peers.Remove(p) {
  179. sw.metrics.Peers.Add(float64(-1))
  180. }
  181. }
  182. // Stop reactors
  183. sw.Logger.Debug("Switch: Stopping reactors")
  184. for _, reactor := range sw.reactors {
  185. reactor.Stop()
  186. }
  187. }
  188. //---------------------------------------------------------------------
  189. // Peers
  190. // Broadcast runs a go routine for each attempted send, which will block trying
  191. // to send for defaultSendTimeoutSeconds. Returns a channel which receives
  192. // success values for each attempted send (false if times out). Channel will be
  193. // closed once msg bytes are sent to all peers (or time out).
  194. //
  195. // NOTE: Broadcast uses goroutines, so order of broadcast may not be preserved.
  196. func (sw *Switch) Broadcast(chID byte, msgBytes []byte) chan bool {
  197. sw.Logger.Debug("Broadcast", "channel", chID, "msgBytes", fmt.Sprintf("%X", msgBytes))
  198. peers := sw.peers.List()
  199. var wg sync.WaitGroup
  200. wg.Add(len(peers))
  201. successChan := make(chan bool, len(peers))
  202. for _, peer := range peers {
  203. go func(p Peer) {
  204. defer wg.Done()
  205. success := p.Send(chID, msgBytes)
  206. successChan <- success
  207. }(peer)
  208. }
  209. go func() {
  210. wg.Wait()
  211. close(successChan)
  212. }()
  213. return successChan
  214. }
  215. // NumPeers returns the count of outbound/inbound and outbound-dialing peers.
  216. func (sw *Switch) NumPeers() (outbound, inbound, dialing int) {
  217. peers := sw.peers.List()
  218. for _, peer := range peers {
  219. if peer.IsOutbound() {
  220. outbound++
  221. } else {
  222. inbound++
  223. }
  224. }
  225. dialing = sw.dialing.Size()
  226. return
  227. }
  228. // MaxNumOutboundPeers returns a maximum number of outbound peers.
  229. func (sw *Switch) MaxNumOutboundPeers() int {
  230. return sw.config.MaxNumOutboundPeers
  231. }
  232. // Peers returns the set of peers that are connected to the switch.
  233. func (sw *Switch) Peers() IPeerSet {
  234. return sw.peers
  235. }
  236. // StopPeerForError disconnects from a peer due to external error.
  237. // If the peer is persistent, it will attempt to reconnect.
  238. // TODO: make record depending on reason.
  239. func (sw *Switch) StopPeerForError(peer Peer, reason interface{}) {
  240. sw.Logger.Error("Stopping peer for error", "peer", peer, "err", reason)
  241. sw.stopAndRemovePeer(peer, reason)
  242. if peer.IsPersistent() {
  243. addr := peer.OriginalAddr()
  244. if addr == nil {
  245. // FIXME: persistent peers can't be inbound right now.
  246. // self-reported address for inbound persistent peers
  247. addr = peer.NodeInfo().NetAddress()
  248. }
  249. go sw.reconnectToPeer(addr)
  250. }
  251. }
  252. // StopPeerGracefully disconnects from a peer gracefully.
  253. // TODO: handle graceful disconnects.
  254. func (sw *Switch) StopPeerGracefully(peer Peer) {
  255. sw.Logger.Info("Stopping peer gracefully")
  256. sw.stopAndRemovePeer(peer, nil)
  257. }
  258. func (sw *Switch) stopAndRemovePeer(peer Peer, reason interface{}) {
  259. if sw.peers.Remove(peer) {
  260. sw.metrics.Peers.Add(float64(-1))
  261. }
  262. sw.transport.Cleanup(peer)
  263. peer.Stop()
  264. for _, reactor := range sw.reactors {
  265. reactor.RemovePeer(peer, reason)
  266. }
  267. }
  268. // reconnectToPeer tries to reconnect to the addr, first repeatedly
  269. // with a fixed interval, then with exponential backoff.
  270. // If no success after all that, it stops trying, and leaves it
  271. // to the PEX/Addrbook to find the peer with the addr again
  272. // NOTE: this will keep trying even if the handshake or auth fails.
  273. // TODO: be more explicit with error types so we only retry on certain failures
  274. // - ie. if we're getting ErrDuplicatePeer we can stop
  275. // because the addrbook got us the peer back already
  276. func (sw *Switch) reconnectToPeer(addr *NetAddress) {
  277. if sw.reconnecting.Has(string(addr.ID)) {
  278. return
  279. }
  280. sw.reconnecting.Set(string(addr.ID), addr)
  281. defer sw.reconnecting.Delete(string(addr.ID))
  282. start := time.Now()
  283. sw.Logger.Info("Reconnecting to peer", "addr", addr)
  284. for i := 0; i < reconnectAttempts; i++ {
  285. if !sw.IsRunning() {
  286. return
  287. }
  288. if sw.IsDialingOrExistingAddress(addr) {
  289. sw.Logger.Debug("Peer connection has been established or dialed while we waiting next try", "addr", addr)
  290. return
  291. }
  292. err := sw.DialPeerWithAddress(addr, true)
  293. if err == nil {
  294. return // success
  295. }
  296. sw.Logger.Info("Error reconnecting to peer. Trying again", "tries", i, "err", err, "addr", addr)
  297. // sleep a set amount
  298. sw.randomSleep(reconnectInterval)
  299. continue
  300. }
  301. sw.Logger.Error("Failed to reconnect to peer. Beginning exponential backoff",
  302. "addr", addr, "elapsed", time.Since(start))
  303. for i := 0; i < reconnectBackOffAttempts; i++ {
  304. if !sw.IsRunning() {
  305. return
  306. }
  307. // sleep an exponentially increasing amount
  308. sleepIntervalSeconds := math.Pow(reconnectBackOffBaseSeconds, float64(i))
  309. sw.randomSleep(time.Duration(sleepIntervalSeconds) * time.Second)
  310. err := sw.DialPeerWithAddress(addr, true)
  311. if err == nil {
  312. return // success
  313. }
  314. sw.Logger.Info("Error reconnecting to peer. Trying again", "tries", i, "err", err, "addr", addr)
  315. }
  316. sw.Logger.Error("Failed to reconnect to peer. Giving up", "addr", addr, "elapsed", time.Since(start))
  317. }
  318. // SetAddrBook allows to set address book on Switch.
  319. func (sw *Switch) SetAddrBook(addrBook AddrBook) {
  320. sw.addrBook = addrBook
  321. }
  322. // MarkPeerAsGood marks the given peer as good when it did something useful
  323. // like contributed to consensus.
  324. func (sw *Switch) MarkPeerAsGood(peer Peer) {
  325. if sw.addrBook != nil {
  326. sw.addrBook.MarkGood(peer.NodeInfo().NetAddress())
  327. }
  328. }
  329. //---------------------------------------------------------------------
  330. // Dialing
  331. // DialPeersAsync dials a list of peers asynchronously in random order (optionally, making them persistent).
  332. // Used to dial peers from config on startup or from unsafe-RPC (trusted sources).
  333. // TODO: remove addrBook arg since it's now set on the switch
  334. func (sw *Switch) DialPeersAsync(addrBook AddrBook, peers []string, persistent bool) error {
  335. netAddrs, errs := NewNetAddressStrings(peers)
  336. // only log errors, dial correct addresses
  337. for _, err := range errs {
  338. sw.Logger.Error("Error in peer's address", "err", err)
  339. }
  340. ourAddr := sw.nodeInfo.NetAddress()
  341. // TODO: this code feels like it's in the wrong place.
  342. // The integration tests depend on the addrBook being saved
  343. // right away but maybe we can change that. Recall that
  344. // the addrBook is only written to disk every 2min
  345. if addrBook != nil {
  346. // add peers to `addrBook`
  347. for _, netAddr := range netAddrs {
  348. // do not add our address or ID
  349. if !netAddr.Same(ourAddr) {
  350. if err := addrBook.AddAddress(netAddr, ourAddr); err != nil {
  351. sw.Logger.Error("Can't add peer's address to addrbook", "err", err)
  352. }
  353. }
  354. }
  355. // Persist some peers to disk right away.
  356. // NOTE: integration tests depend on this
  357. addrBook.Save()
  358. }
  359. // permute the list, dial them in random order.
  360. perm := sw.rng.Perm(len(netAddrs))
  361. for i := 0; i < len(perm); i++ {
  362. go func(i int) {
  363. j := perm[i]
  364. addr := netAddrs[j]
  365. if addr.Same(ourAddr) {
  366. sw.Logger.Debug("Ignore attempt to connect to ourselves", "addr", addr, "ourAddr", ourAddr)
  367. return
  368. }
  369. sw.randomSleep(0)
  370. if sw.IsDialingOrExistingAddress(addr) {
  371. sw.Logger.Debug("Ignore attempt to connect to an existing peer", "addr", addr)
  372. return
  373. }
  374. err := sw.DialPeerWithAddress(addr, persistent)
  375. if err != nil {
  376. switch err.(type) {
  377. case ErrSwitchConnectToSelf, ErrSwitchDuplicatePeerID:
  378. sw.Logger.Debug("Error dialing peer", "err", err)
  379. default:
  380. sw.Logger.Error("Error dialing peer", "err", err)
  381. }
  382. }
  383. }(i)
  384. }
  385. return nil
  386. }
  387. // DialPeerWithAddress dials the given peer and runs sw.addPeer if it connects and authenticates successfully.
  388. // If `persistent == true`, the switch will always try to reconnect to this peer if the connection ever fails.
  389. func (sw *Switch) DialPeerWithAddress(addr *NetAddress, persistent bool) error {
  390. sw.dialing.Set(string(addr.ID), addr)
  391. defer sw.dialing.Delete(string(addr.ID))
  392. return sw.addOutboundPeerWithConfig(addr, sw.config, persistent)
  393. }
  394. // sleep for interval plus some random amount of ms on [0, dialRandomizerIntervalMilliseconds]
  395. func (sw *Switch) randomSleep(interval time.Duration) {
  396. r := time.Duration(sw.rng.Int63n(dialRandomizerIntervalMilliseconds)) * time.Millisecond
  397. time.Sleep(r + interval)
  398. }
  399. // IsDialingOrExistingAddress returns true if switch has a peer with the given
  400. // address or dialing it at the moment.
  401. func (sw *Switch) IsDialingOrExistingAddress(addr *NetAddress) bool {
  402. return sw.dialing.Has(string(addr.ID)) ||
  403. sw.peers.Has(addr.ID) ||
  404. (!sw.config.AllowDuplicateIP && sw.peers.HasIP(addr.IP))
  405. }
  406. func (sw *Switch) acceptRoutine() {
  407. for {
  408. p, err := sw.transport.Accept(peerConfig{
  409. chDescs: sw.chDescs,
  410. onPeerError: sw.StopPeerForError,
  411. reactorsByCh: sw.reactorsByCh,
  412. metrics: sw.metrics,
  413. })
  414. if err != nil {
  415. switch err := err.(type) {
  416. case ErrRejected:
  417. if err.IsSelf() {
  418. // Remove the given address from the address book and add to our addresses
  419. // to avoid dialing in the future.
  420. addr := err.Addr()
  421. sw.addrBook.RemoveAddress(&addr)
  422. sw.addrBook.AddOurAddress(&addr)
  423. }
  424. sw.Logger.Info(
  425. "Inbound Peer rejected",
  426. "err", err,
  427. "numPeers", sw.peers.Size(),
  428. )
  429. continue
  430. case ErrFilterTimeout:
  431. sw.Logger.Error(
  432. "Peer filter timed out",
  433. "err", err,
  434. )
  435. continue
  436. case ErrTransportClosed:
  437. sw.Logger.Error(
  438. "Stopped accept routine, as transport is closed",
  439. "numPeers", sw.peers.Size(),
  440. )
  441. default:
  442. sw.Logger.Error(
  443. "Accept on transport errored",
  444. "err", err,
  445. "numPeers", sw.peers.Size(),
  446. )
  447. // We could instead have a retry loop around the acceptRoutine,
  448. // but that would need to stop and let the node shutdown eventually.
  449. // So might as well panic and let process managers restart the node.
  450. // There's no point in letting the node run without the acceptRoutine,
  451. // since it won't be able to accept new connections.
  452. panic(fmt.Errorf("accept routine exited: %v", err))
  453. }
  454. break
  455. }
  456. // Ignore connection if we already have enough peers.
  457. _, in, _ := sw.NumPeers()
  458. if in >= sw.config.MaxNumInboundPeers {
  459. sw.Logger.Info(
  460. "Ignoring inbound connection: already have enough inbound peers",
  461. "address", p.NodeInfo().NetAddress().String(),
  462. "have", in,
  463. "max", sw.config.MaxNumInboundPeers,
  464. )
  465. sw.transport.Cleanup(p)
  466. continue
  467. }
  468. if err := sw.addPeer(p); err != nil {
  469. sw.transport.Cleanup(p)
  470. if p.IsRunning() {
  471. _ = p.Stop()
  472. }
  473. sw.Logger.Info(
  474. "Ignoring inbound connection: error while adding peer",
  475. "err", err,
  476. "id", p.ID(),
  477. )
  478. }
  479. }
  480. }
  481. // dial the peer; make secret connection; authenticate against the dialed ID;
  482. // add the peer.
  483. // if dialing fails, start the reconnect loop. If handhsake fails, its over.
  484. // If peer is started succesffuly, reconnectLoop will start when
  485. // StopPeerForError is called
  486. func (sw *Switch) addOutboundPeerWithConfig(
  487. addr *NetAddress,
  488. cfg *config.P2PConfig,
  489. persistent bool,
  490. ) error {
  491. sw.Logger.Info("Dialing peer", "address", addr)
  492. // XXX(xla): Remove the leakage of test concerns in implementation.
  493. if cfg.TestDialFail {
  494. go sw.reconnectToPeer(addr)
  495. return fmt.Errorf("dial err (peerConfig.DialFail == true)")
  496. }
  497. p, err := sw.transport.Dial(*addr, peerConfig{
  498. chDescs: sw.chDescs,
  499. onPeerError: sw.StopPeerForError,
  500. persistent: persistent,
  501. reactorsByCh: sw.reactorsByCh,
  502. metrics: sw.metrics,
  503. })
  504. if err != nil {
  505. switch e := err.(type) {
  506. case ErrRejected:
  507. if e.IsSelf() {
  508. // Remove the given address from the address book and add to our addresses
  509. // to avoid dialing in the future.
  510. sw.addrBook.RemoveAddress(addr)
  511. sw.addrBook.AddOurAddress(addr)
  512. return err
  513. }
  514. }
  515. // retry persistent peers after
  516. // any dial error besides IsSelf()
  517. if persistent {
  518. go sw.reconnectToPeer(addr)
  519. }
  520. return err
  521. }
  522. if err := sw.addPeer(p); err != nil {
  523. sw.transport.Cleanup(p)
  524. if p.IsRunning() {
  525. _ = p.Stop()
  526. }
  527. return err
  528. }
  529. return nil
  530. }
  531. func (sw *Switch) filterPeer(p Peer) error {
  532. // Avoid duplicate
  533. if sw.peers.Has(p.ID()) {
  534. return ErrRejected{id: p.ID(), isDuplicate: true}
  535. }
  536. errc := make(chan error, len(sw.peerFilters))
  537. for _, f := range sw.peerFilters {
  538. go func(f PeerFilterFunc, p Peer, errc chan<- error) {
  539. errc <- f(sw.peers, p)
  540. }(f, p, errc)
  541. }
  542. for i := 0; i < cap(errc); i++ {
  543. select {
  544. case err := <-errc:
  545. if err != nil {
  546. return ErrRejected{id: p.ID(), err: err, isFiltered: true}
  547. }
  548. case <-time.After(sw.filterTimeout):
  549. return ErrFilterTimeout{}
  550. }
  551. }
  552. return nil
  553. }
  554. // addPeer starts up the Peer and adds it to the Switch. Error is returned if
  555. // the peer is filtered out or failed to start or can't be added.
  556. func (sw *Switch) addPeer(p Peer) error {
  557. if err := sw.filterPeer(p); err != nil {
  558. return err
  559. }
  560. p.SetLogger(sw.Logger.With("peer", p.NodeInfo().NetAddress()))
  561. // Handle the shut down case where the switch has stopped but we're
  562. // concurrently trying to add a peer.
  563. if !sw.IsRunning() {
  564. // XXX should this return an error or just log and terminate?
  565. sw.Logger.Error("Won't start a peer - switch is not running", "peer", p)
  566. return nil
  567. }
  568. // Start the peer's send/recv routines.
  569. // Must start it before adding it to the peer set
  570. // to prevent Start and Stop from being called concurrently.
  571. err := p.Start()
  572. if err != nil {
  573. // Should never happen
  574. sw.Logger.Error("Error starting peer", "err", err, "peer", p)
  575. return err
  576. }
  577. // Add the peer to PeerSet. Do this before starting the reactors
  578. // so that if Receive errors, we will find the peer and remove it.
  579. // Add should not err since we already checked peers.Has().
  580. if err := sw.peers.Add(p); err != nil {
  581. return err
  582. }
  583. sw.metrics.Peers.Add(float64(1))
  584. // Start all the reactor protocols on the peer.
  585. for _, reactor := range sw.reactors {
  586. reactor.AddPeer(p)
  587. }
  588. sw.Logger.Info("Added peer", "peer", p)
  589. return nil
  590. }