You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

233 lines
5.6 KiB

  1. package monitor
  2. import (
  3. "fmt"
  4. "math/rand"
  5. "sync"
  6. "time"
  7. "github.com/pkg/errors"
  8. tmtypes "github.com/tendermint/tendermint/types"
  9. "github.com/tendermint/tmlibs/log"
  10. )
  // nodeLivenessTimeout is how long the listen loop waits for any event from a
  // node before it reports that node as down.
  const nodeLivenessTimeout time.Duration = 5 * time.Second
  13. // Monitor keeps track of the nodes and updates common statistics upon
  14. // receiving new events from nodes.
  15. //
  16. // Common statistics is stored in Network struct.
  17. type Monitor struct {
  18. mtx sync.Mutex
  19. Nodes []*Node
  20. Network *Network
  21. monitorQuit chan struct{} // monitor exitting
  22. nodeQuit map[string]chan struct{} // node is being stopped and removed from under the monitor
  23. recalculateNetworkUptimeEvery time.Duration
  24. numValidatorsUpdateInterval time.Duration
  25. logger log.Logger
  26. }
  27. // NewMonitor creates new instance of a Monitor. You can provide options to
  28. // change some default values.
  29. //
  30. // Example:
  31. // NewMonitor(monitor.SetNumValidatorsUpdateInterval(1 * time.Second))
  32. func NewMonitor(options ...func(*Monitor)) *Monitor {
  33. m := &Monitor{
  34. Nodes: make([]*Node, 0),
  35. Network: NewNetwork(),
  36. monitorQuit: make(chan struct{}),
  37. nodeQuit: make(map[string]chan struct{}),
  38. recalculateNetworkUptimeEvery: 10 * time.Second,
  39. numValidatorsUpdateInterval: 5 * time.Second,
  40. logger: log.NewNopLogger(),
  41. }
  42. for _, option := range options {
  43. option(m)
  44. }
  45. return m
  46. }
  47. // RecalculateNetworkUptimeEvery lets you change network uptime update interval.
  48. func RecalculateNetworkUptimeEvery(d time.Duration) func(m *Monitor) {
  49. return func(m *Monitor) {
  50. m.recalculateNetworkUptimeEvery = d
  51. }
  52. }
  53. // SetNumValidatorsUpdateInterval lets you change num validators update interval.
  54. func SetNumValidatorsUpdateInterval(d time.Duration) func(m *Monitor) {
  55. return func(m *Monitor) {
  56. m.numValidatorsUpdateInterval = d
  57. }
  58. }
  59. // SetLogger lets you set your own logger
  60. func (m *Monitor) SetLogger(l log.Logger) {
  61. m.logger = l
  62. }
  63. // Monitor begins to monitor the node `n`. The node will be started and added
  64. // to the monitor.
  65. func (m *Monitor) Monitor(n *Node) error {
  66. m.mtx.Lock()
  67. m.Nodes = append(m.Nodes, n)
  68. m.mtx.Unlock()
  69. blockCh := make(chan tmtypes.Header, 10)
  70. n.SendBlocksTo(blockCh)
  71. blockLatencyCh := make(chan float64, 10)
  72. n.SendBlockLatenciesTo(blockLatencyCh)
  73. disconnectCh := make(chan bool, 10)
  74. n.NotifyAboutDisconnects(disconnectCh)
  75. if err := n.Start(); err != nil {
  76. return err
  77. }
  78. m.Network.NewNode(n.Name)
  79. m.nodeQuit[n.Name] = make(chan struct{})
  80. go m.listen(n.Name, blockCh, blockLatencyCh, disconnectCh, m.nodeQuit[n.Name])
  81. return nil
  82. }
  83. // Unmonitor stops monitoring node `n`. The node will be stopped and removed
  84. // from the monitor.
  85. func (m *Monitor) Unmonitor(n *Node) {
  86. m.Network.NodeDeleted(n.Name)
  87. n.Stop()
  88. close(m.nodeQuit[n.Name])
  89. delete(m.nodeQuit, n.Name)
  90. i, _ := m.NodeByName(n.Name)
  91. m.mtx.Lock()
  92. m.Nodes[i] = m.Nodes[len(m.Nodes)-1]
  93. m.Nodes = m.Nodes[:len(m.Nodes)-1]
  94. m.mtx.Unlock()
  95. }
  96. // NodeByName returns the node and its index if such node exists within the
  97. // monitor. Otherwise, -1 and nil are returned.
  98. func (m *Monitor) NodeByName(name string) (index int, node *Node) {
  99. m.mtx.Lock()
  100. defer m.mtx.Unlock()
  101. for i, n := range m.Nodes {
  102. if name == n.Name {
  103. return i, n
  104. }
  105. }
  106. return -1, nil
  107. }
  108. // Start starts the monitor's routines: recalculating network uptime and
  109. // updating number of validators.
  110. func (m *Monitor) Start() error {
  111. go m.recalculateNetworkUptimeLoop()
  112. go m.updateNumValidatorLoop()
  113. return nil
  114. }
  115. // Stop stops the monitor's routines.
  116. func (m *Monitor) Stop() {
  117. close(m.monitorQuit)
  118. for _, n := range m.Nodes {
  119. m.Unmonitor(n)
  120. }
  121. }
  122. // main loop where we listen for events from the node
  123. func (m *Monitor) listen(nodeName string, blockCh <-chan tmtypes.Header, blockLatencyCh <-chan float64, disconnectCh <-chan bool, quit <-chan struct{}) {
  124. logger := m.logger.With("node", nodeName)
  125. for {
  126. select {
  127. case <-quit:
  128. return
  129. case b := <-blockCh:
  130. m.Network.NewBlock(b)
  131. m.Network.NodeIsOnline(nodeName)
  132. case l := <-blockLatencyCh:
  133. m.Network.NewBlockLatency(l)
  134. m.Network.NodeIsOnline(nodeName)
  135. case disconnected := <-disconnectCh:
  136. if disconnected {
  137. m.Network.NodeIsDown(nodeName)
  138. } else {
  139. m.Network.NodeIsOnline(nodeName)
  140. }
  141. case <-time.After(nodeLivenessTimeout):
  142. logger.Info("event", fmt.Sprintf("node was not responding for %v", nodeLivenessTimeout))
  143. m.Network.NodeIsDown(nodeName)
  144. }
  145. }
  146. }
  147. // recalculateNetworkUptimeLoop every N seconds.
  148. func (m *Monitor) recalculateNetworkUptimeLoop() {
  149. for {
  150. select {
  151. case <-m.monitorQuit:
  152. return
  153. case <-time.After(m.recalculateNetworkUptimeEvery):
  154. m.Network.RecalculateUptime()
  155. }
  156. }
  157. }
  158. // updateNumValidatorLoop sends a request to a random node once every N seconds,
  159. // which in turn makes an RPC call to get the latest validators.
  160. func (m *Monitor) updateNumValidatorLoop() {
  161. rand.Seed(time.Now().Unix())
  162. var height int64
  163. var num int
  164. var err error
  165. for {
  166. m.mtx.Lock()
  167. nodesCount := len(m.Nodes)
  168. m.mtx.Unlock()
  169. if 0 == nodesCount {
  170. time.Sleep(m.numValidatorsUpdateInterval)
  171. continue
  172. }
  173. randomNodeIndex := rand.Intn(nodesCount)
  174. select {
  175. case <-m.monitorQuit:
  176. return
  177. case <-time.After(m.numValidatorsUpdateInterval):
  178. i := 0
  179. m.mtx.Lock()
  180. for _, n := range m.Nodes {
  181. if i == randomNodeIndex {
  182. height, num, err = n.NumValidators()
  183. if err != nil {
  184. m.logger.Info("err", errors.Wrap(err, "update num validators failed"))
  185. }
  186. break
  187. }
  188. i++
  189. }
  190. m.mtx.Unlock()
  191. m.Network.UpdateNumValidatorsForHeight(num, height)
  192. }
  193. }
  194. }