You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

256 lines
6.0 KiB

  1. package monitor
  2. import (
  3. "fmt"
  4. "math/rand"
  5. "sync"
  6. "time"
  7. "github.com/pkg/errors"
  8. "github.com/tendermint/tendermint/libs/log"
  9. tmtypes "github.com/tendermint/tendermint/types"
  10. )
  11. // waiting more than this many seconds for a block means we're unhealthy
  12. const nodeLivenessTimeout = 5 * time.Second
  13. // Monitor keeps track of the nodes and updates common statistics upon
  14. // receiving new events from nodes.
  15. //
  16. // Common statistics is stored in Network struct.
  17. type Monitor struct {
  18. mtx sync.Mutex
  19. Nodes []*Node
  20. Network *Network
  21. monitorQuit chan struct{} // monitor exitting
  22. nodeQuit map[string]chan struct{} // node is being stopped and removed from under the monitor
  23. recalculateNetworkUptimeEvery time.Duration
  24. numValidatorsUpdateInterval time.Duration
  25. logger log.Logger
  26. }
  27. // NewMonitor creates new instance of a Monitor. You can provide options to
  28. // change some default values.
  29. //
  30. // Example:
  31. // NewMonitor(monitor.SetNumValidatorsUpdateInterval(1 * time.Second))
  32. func NewMonitor(options ...func(*Monitor)) *Monitor {
  33. m := &Monitor{
  34. Nodes: make([]*Node, 0),
  35. Network: NewNetwork(),
  36. monitorQuit: make(chan struct{}),
  37. nodeQuit: make(map[string]chan struct{}),
  38. recalculateNetworkUptimeEvery: 10 * time.Second,
  39. numValidatorsUpdateInterval: 5 * time.Second,
  40. logger: log.NewNopLogger(),
  41. }
  42. for _, option := range options {
  43. option(m)
  44. }
  45. return m
  46. }
  47. // RecalculateNetworkUptimeEvery lets you change network uptime update interval.
  48. func RecalculateNetworkUptimeEvery(d time.Duration) func(m *Monitor) {
  49. return func(m *Monitor) {
  50. m.recalculateNetworkUptimeEvery = d
  51. }
  52. }
  53. // SetNumValidatorsUpdateInterval lets you change num validators update interval.
  54. func SetNumValidatorsUpdateInterval(d time.Duration) func(m *Monitor) {
  55. return func(m *Monitor) {
  56. m.numValidatorsUpdateInterval = d
  57. }
  58. }
  59. // SetLogger lets you set your own logger
  60. func (m *Monitor) SetLogger(l log.Logger) {
  61. m.logger = l
  62. }
  63. // Monitor begins to monitor the node `n`. The node will be started and added
  64. // to the monitor.
  65. func (m *Monitor) Monitor(n *Node) error {
  66. m.mtx.Lock()
  67. m.Nodes = append(m.Nodes, n)
  68. m.mtx.Unlock()
  69. blockCh := make(chan *tmtypes.Block, 10)
  70. n.SendBlocksTo(blockCh)
  71. blockLatencyCh := make(chan float64, 10)
  72. n.SendBlockLatenciesTo(blockLatencyCh)
  73. disconnectCh := make(chan bool, 10)
  74. n.NotifyAboutDisconnects(disconnectCh)
  75. if err := n.Start(); err != nil {
  76. return err
  77. }
  78. m.Network.NewNode(n.Name)
  79. m.nodeQuit[n.Name] = make(chan struct{})
  80. go m.listen(n.Name, blockCh, blockLatencyCh, disconnectCh, m.nodeQuit[n.Name])
  81. return nil
  82. }
  83. // Unmonitor stops monitoring node `n`. The node will be stopped and removed
  84. // from the monitor.
  85. func (m *Monitor) Unmonitor(n *Node) {
  86. m.Network.NodeDeleted(n.Name)
  87. n.Stop()
  88. close(m.nodeQuit[n.Name])
  89. delete(m.nodeQuit, n.Name)
  90. i, _ := m.NodeByName(n.Name)
  91. m.mtx.Lock()
  92. m.Nodes[i] = m.Nodes[len(m.Nodes)-1]
  93. m.Nodes = m.Nodes[:len(m.Nodes)-1]
  94. m.mtx.Unlock()
  95. }
  96. // NodeByName returns the node and its index if such node exists within the
  97. // monitor. Otherwise, -1 and nil are returned.
  98. func (m *Monitor) NodeByName(name string) (index int, node *Node) {
  99. m.mtx.Lock()
  100. defer m.mtx.Unlock()
  101. for i, n := range m.Nodes {
  102. if name == n.Name {
  103. return i, n
  104. }
  105. }
  106. return -1, nil
  107. }
  108. // NodeIsOnline is called when connection to the node is restored.
  109. // Must be safe to call multiple times.
  110. func (m *Monitor) NodeIsOnline(name string) {
  111. _, node := m.NodeByName(name)
  112. if nil != node {
  113. if online, ok := m.Network.nodeStatusMap[name]; ok && online {
  114. m.mtx.Lock()
  115. node.Online = online
  116. m.mtx.Unlock()
  117. }
  118. }
  119. }
  120. // Start starts the monitor's routines: recalculating network uptime and
  121. // updating number of validators.
  122. func (m *Monitor) Start() error {
  123. go m.recalculateNetworkUptimeLoop()
  124. go m.updateNumValidatorLoop()
  125. return nil
  126. }
  127. // Stop stops the monitor's routines.
  128. func (m *Monitor) Stop() {
  129. close(m.monitorQuit)
  130. for _, n := range m.Nodes {
  131. m.Unmonitor(n)
  132. }
  133. }
  134. // main loop where we listen for events from the node
  135. func (m *Monitor) listen(
  136. nodeName string,
  137. blockCh <-chan *tmtypes.Block,
  138. blockLatencyCh <-chan float64,
  139. disconnectCh <-chan bool,
  140. quit <-chan struct{}) {
  141. logger := m.logger.With("node", nodeName)
  142. for {
  143. select {
  144. case <-quit:
  145. return
  146. case b := <-blockCh:
  147. m.Network.NewBlock(b)
  148. m.Network.NodeIsOnline(nodeName)
  149. m.NodeIsOnline(nodeName)
  150. case l := <-blockLatencyCh:
  151. m.Network.NewBlockLatency(l)
  152. m.Network.NodeIsOnline(nodeName)
  153. m.NodeIsOnline(nodeName)
  154. case disconnected := <-disconnectCh:
  155. if disconnected {
  156. m.Network.NodeIsDown(nodeName)
  157. } else {
  158. m.Network.NodeIsOnline(nodeName)
  159. m.NodeIsOnline(nodeName)
  160. }
  161. case <-time.After(nodeLivenessTimeout):
  162. logger.Info("event", fmt.Sprintf("node was not responding for %v", nodeLivenessTimeout))
  163. m.Network.NodeIsDown(nodeName)
  164. }
  165. }
  166. }
  167. // recalculateNetworkUptimeLoop every N seconds.
  168. func (m *Monitor) recalculateNetworkUptimeLoop() {
  169. for {
  170. select {
  171. case <-m.monitorQuit:
  172. return
  173. case <-time.After(m.recalculateNetworkUptimeEvery):
  174. m.Network.RecalculateUptime()
  175. }
  176. }
  177. }
  178. // updateNumValidatorLoop sends a request to a random node once every N seconds,
  179. // which in turn makes an RPC call to get the latest validators.
  180. func (m *Monitor) updateNumValidatorLoop() {
  181. rand.Seed(time.Now().Unix())
  182. var height int64
  183. var num int
  184. var err error
  185. for {
  186. m.mtx.Lock()
  187. nodesCount := len(m.Nodes)
  188. m.mtx.Unlock()
  189. if 0 == nodesCount {
  190. time.Sleep(m.numValidatorsUpdateInterval)
  191. continue
  192. }
  193. randomNodeIndex := rand.Intn(nodesCount)
  194. select {
  195. case <-m.monitorQuit:
  196. return
  197. case <-time.After(m.numValidatorsUpdateInterval):
  198. i := 0
  199. m.mtx.Lock()
  200. for _, n := range m.Nodes {
  201. if i == randomNodeIndex {
  202. height, num, err = n.NumValidators()
  203. if err != nil {
  204. m.logger.Info("err", errors.Wrap(err, "update num validators failed"))
  205. }
  206. break
  207. }
  208. i++
  209. }
  210. m.mtx.Unlock()
  211. m.Network.UpdateNumValidatorsForHeight(num, height)
  212. }
  213. }
  214. }