You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

251 lines
6.0 KiB

  1. package monitor
  2. import (
  3. "fmt"
  4. "math/rand"
  5. "sync"
  6. "time"
  7. "github.com/pkg/errors"
  8. "github.com/tendermint/tendermint/libs/log"
  9. tmtypes "github.com/tendermint/tendermint/types"
  10. )
  // nodeLivenessTimeout is how long the per-node listen loop waits for any
  // event (block, latency sample or disconnect notification) before it reports
  // the node as down to the network.
  const nodeLivenessTimeout time.Duration = 5 * time.Second
  13. // Monitor keeps track of the nodes and updates common statistics upon
  14. // receiving new events from nodes.
  15. //
  16. // Common statistics is stored in Network struct.
  17. type Monitor struct {
  18. mtx sync.Mutex
  19. Nodes []*Node
  20. Network *Network
  21. monitorQuit chan struct{} // monitor exitting
  22. nodeQuit map[string]chan struct{} // node is being stopped and removed from under the monitor
  23. recalculateNetworkUptimeEvery time.Duration
  24. numValidatorsUpdateInterval time.Duration
  25. logger log.Logger
  26. }
  27. // NewMonitor creates new instance of a Monitor. You can provide options to
  28. // change some default values.
  29. //
  30. // Example:
  31. // NewMonitor(monitor.SetNumValidatorsUpdateInterval(1 * time.Second))
  32. func NewMonitor(options ...func(*Monitor)) *Monitor {
  33. m := &Monitor{
  34. Nodes: make([]*Node, 0),
  35. Network: NewNetwork(),
  36. monitorQuit: make(chan struct{}),
  37. nodeQuit: make(map[string]chan struct{}),
  38. recalculateNetworkUptimeEvery: 10 * time.Second,
  39. numValidatorsUpdateInterval: 5 * time.Second,
  40. logger: log.NewNopLogger(),
  41. }
  42. for _, option := range options {
  43. option(m)
  44. }
  45. return m
  46. }
  47. // RecalculateNetworkUptimeEvery lets you change network uptime update interval.
  48. func RecalculateNetworkUptimeEvery(d time.Duration) func(m *Monitor) {
  49. return func(m *Monitor) {
  50. m.recalculateNetworkUptimeEvery = d
  51. }
  52. }
  53. // SetNumValidatorsUpdateInterval lets you change num validators update interval.
  54. func SetNumValidatorsUpdateInterval(d time.Duration) func(m *Monitor) {
  55. return func(m *Monitor) {
  56. m.numValidatorsUpdateInterval = d
  57. }
  58. }
  59. // SetLogger lets you set your own logger
  60. func (m *Monitor) SetLogger(l log.Logger) {
  61. m.logger = l
  62. }
  63. // Monitor begins to monitor the node `n`. The node will be started and added
  64. // to the monitor.
  65. func (m *Monitor) Monitor(n *Node) error {
  66. m.mtx.Lock()
  67. m.Nodes = append(m.Nodes, n)
  68. m.mtx.Unlock()
  69. blockCh := make(chan tmtypes.Header, 10)
  70. n.SendBlocksTo(blockCh)
  71. blockLatencyCh := make(chan float64, 10)
  72. n.SendBlockLatenciesTo(blockLatencyCh)
  73. disconnectCh := make(chan bool, 10)
  74. n.NotifyAboutDisconnects(disconnectCh)
  75. if err := n.Start(); err != nil {
  76. return err
  77. }
  78. m.Network.NewNode(n.Name)
  79. m.nodeQuit[n.Name] = make(chan struct{})
  80. go m.listen(n.Name, blockCh, blockLatencyCh, disconnectCh, m.nodeQuit[n.Name])
  81. return nil
  82. }
  83. // Unmonitor stops monitoring node `n`. The node will be stopped and removed
  84. // from the monitor.
  85. func (m *Monitor) Unmonitor(n *Node) {
  86. m.Network.NodeDeleted(n.Name)
  87. n.Stop()
  88. close(m.nodeQuit[n.Name])
  89. delete(m.nodeQuit, n.Name)
  90. i, _ := m.NodeByName(n.Name)
  91. m.mtx.Lock()
  92. m.Nodes[i] = m.Nodes[len(m.Nodes)-1]
  93. m.Nodes = m.Nodes[:len(m.Nodes)-1]
  94. m.mtx.Unlock()
  95. }
  96. // NodeByName returns the node and its index if such node exists within the
  97. // monitor. Otherwise, -1 and nil are returned.
  98. func (m *Monitor) NodeByName(name string) (index int, node *Node) {
  99. m.mtx.Lock()
  100. defer m.mtx.Unlock()
  101. for i, n := range m.Nodes {
  102. if name == n.Name {
  103. return i, n
  104. }
  105. }
  106. return -1, nil
  107. }
  108. // NodeIsOnline is called when connection to the node is restored.
  109. // Must be safe to call multiple times.
  110. func (m *Monitor) NodeIsOnline(name string) {
  111. _, node := m.NodeByName(name)
  112. if nil != node {
  113. if online, ok := m.Network.nodeStatusMap[name]; ok && online {
  114. m.mtx.Lock()
  115. node.Online = online
  116. m.mtx.Unlock()
  117. }
  118. }
  119. }
  120. // Start starts the monitor's routines: recalculating network uptime and
  121. // updating number of validators.
  122. func (m *Monitor) Start() error {
  123. go m.recalculateNetworkUptimeLoop()
  124. go m.updateNumValidatorLoop()
  125. return nil
  126. }
  127. // Stop stops the monitor's routines.
  128. func (m *Monitor) Stop() {
  129. close(m.monitorQuit)
  130. for _, n := range m.Nodes {
  131. m.Unmonitor(n)
  132. }
  133. }
  134. // main loop where we listen for events from the node
  135. func (m *Monitor) listen(nodeName string, blockCh <-chan tmtypes.Header, blockLatencyCh <-chan float64, disconnectCh <-chan bool, quit <-chan struct{}) {
  136. logger := m.logger.With("node", nodeName)
  137. for {
  138. select {
  139. case <-quit:
  140. return
  141. case b := <-blockCh:
  142. m.Network.NewBlock(b)
  143. m.Network.NodeIsOnline(nodeName)
  144. m.NodeIsOnline(nodeName)
  145. case l := <-blockLatencyCh:
  146. m.Network.NewBlockLatency(l)
  147. m.Network.NodeIsOnline(nodeName)
  148. m.NodeIsOnline(nodeName)
  149. case disconnected := <-disconnectCh:
  150. if disconnected {
  151. m.Network.NodeIsDown(nodeName)
  152. } else {
  153. m.Network.NodeIsOnline(nodeName)
  154. m.NodeIsOnline(nodeName)
  155. }
  156. case <-time.After(nodeLivenessTimeout):
  157. logger.Info("event", fmt.Sprintf("node was not responding for %v", nodeLivenessTimeout))
  158. m.Network.NodeIsDown(nodeName)
  159. }
  160. }
  161. }
  162. // recalculateNetworkUptimeLoop every N seconds.
  163. func (m *Monitor) recalculateNetworkUptimeLoop() {
  164. for {
  165. select {
  166. case <-m.monitorQuit:
  167. return
  168. case <-time.After(m.recalculateNetworkUptimeEvery):
  169. m.Network.RecalculateUptime()
  170. }
  171. }
  172. }
  173. // updateNumValidatorLoop sends a request to a random node once every N seconds,
  174. // which in turn makes an RPC call to get the latest validators.
  175. func (m *Monitor) updateNumValidatorLoop() {
  176. rand.Seed(time.Now().Unix())
  177. var height int64
  178. var num int
  179. var err error
  180. for {
  181. m.mtx.Lock()
  182. nodesCount := len(m.Nodes)
  183. m.mtx.Unlock()
  184. if 0 == nodesCount {
  185. time.Sleep(m.numValidatorsUpdateInterval)
  186. continue
  187. }
  188. randomNodeIndex := rand.Intn(nodesCount)
  189. select {
  190. case <-m.monitorQuit:
  191. return
  192. case <-time.After(m.numValidatorsUpdateInterval):
  193. i := 0
  194. m.mtx.Lock()
  195. for _, n := range m.Nodes {
  196. if i == randomNodeIndex {
  197. height, num, err = n.NumValidators()
  198. if err != nil {
  199. m.logger.Info("err", errors.Wrap(err, "update num validators failed"))
  200. }
  201. break
  202. }
  203. i++
  204. }
  205. m.mtx.Unlock()
  206. m.Network.UpdateNumValidatorsForHeight(num, height)
  207. }
  208. }
  209. }