You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

199 lines
5.3 KiB

  1. package monitor
  2. import (
  3. "sync"
  4. "time"
  5. metrics "github.com/rcrowley/go-metrics"
  6. tmtypes "github.com/tendermint/tendermint/types"
  7. )
  // UptimeData records when monitoring began and how much of the time since
  // then the network has been healthy.
  type UptimeData struct {
  	// StartTime is the moment monitoring started.
  	StartTime time.Time `json:"start_time"`
  	// Uptime is the percentage of time we have been healthy, ever.
  	Uptime float64 `json:"uptime" amino:"unsafe"`

  	// totalDownTime is the accumulated downtime; it is only extended when we
  	// come back online.
  	totalDownTime time.Duration
  	// wentDown is the time at which the latest outage was recorded.
  	wentDown time.Time
  }
  // Health is a coarse rating of the network's condition. It covers only the
  // nodes being observed, not the entire cluster (which may consist of
  // thousands of machines), and its meaning may change in the future.
  type Health int

  // Possible Health values.
  const (
  	// FullHealth means all nodes are online and synced, and validators are
  	// making blocks.
  	FullHealth Health = iota
  	// ModerateHealth means we are making blocks.
  	ModerateHealth
  	// Dead means we are not making blocks because all validators froze or
  	// crashed.
  	Dead
  )
  27. // Common statistics for network of nodes
  28. type Network struct {
  29. Height int64 `json:"height"`
  30. AvgBlockTime float64 `json:"avg_block_time" amino:"unsafe"` // ms (avg over last minute)
  31. blockTimeMeter metrics.Meter
  32. AvgTxThroughput float64 `json:"avg_tx_throughput" amino:"unsafe"` // tx/s (avg over last minute)
  33. txThroughputMeter metrics.Meter
  34. AvgBlockLatency float64 `json:"avg_block_latency" amino:"unsafe"` // ms (avg over last minute)
  35. blockLatencyMeter metrics.Meter
  36. NumValidators int `json:"num_validators"`
  37. NumNodesMonitored int `json:"num_nodes_monitored"`
  38. NumNodesMonitoredOnline int `json:"num_nodes_monitored_online"`
  39. Health Health `json:"health"`
  40. UptimeData *UptimeData `json:"uptime_data"`
  41. nodeStatusMap map[string]bool
  42. mu sync.Mutex
  43. }
  44. func NewNetwork() *Network {
  45. return &Network{
  46. blockTimeMeter: metrics.NewMeter(),
  47. txThroughputMeter: metrics.NewMeter(),
  48. blockLatencyMeter: metrics.NewMeter(),
  49. Health: FullHealth,
  50. UptimeData: &UptimeData{
  51. StartTime: time.Now(),
  52. Uptime: 100.0,
  53. },
  54. nodeStatusMap: make(map[string]bool),
  55. }
  56. }
  57. func (n *Network) NewBlock(b tmtypes.Header) {
  58. n.mu.Lock()
  59. defer n.mu.Unlock()
  60. if n.Height >= b.Height {
  61. return
  62. }
  63. n.Height = b.Height
  64. n.blockTimeMeter.Mark(1)
  65. if n.blockTimeMeter.Rate1() > 0.0 {
  66. n.AvgBlockTime = (1.0 / n.blockTimeMeter.Rate1()) * 1000 // 1/s to ms
  67. } else {
  68. n.AvgBlockTime = 0.0
  69. }
  70. n.txThroughputMeter.Mark(int64(b.NumTxs))
  71. n.AvgTxThroughput = n.txThroughputMeter.Rate1()
  72. }
  73. func (n *Network) NewBlockLatency(l float64) {
  74. n.mu.Lock()
  75. defer n.mu.Unlock()
  76. n.blockLatencyMeter.Mark(int64(l))
  77. n.AvgBlockLatency = n.blockLatencyMeter.Rate1() / 1000000.0 // ns to ms
  78. }
  79. // RecalculateUptime calculates uptime on demand.
  80. func (n *Network) RecalculateUptime() {
  81. n.mu.Lock()
  82. defer n.mu.Unlock()
  83. since := time.Since(n.UptimeData.StartTime)
  84. uptime := since - n.UptimeData.totalDownTime
  85. if n.Health != FullHealth {
  86. uptime -= time.Since(n.UptimeData.wentDown)
  87. }
  88. n.UptimeData.Uptime = (float64(uptime) / float64(since)) * 100.0
  89. }
  90. // NodeIsDown is called when the node disconnects for whatever reason.
  91. // Must be safe to call multiple times.
  92. func (n *Network) NodeIsDown(name string) {
  93. n.mu.Lock()
  94. defer n.mu.Unlock()
  95. if online, ok := n.nodeStatusMap[name]; !ok || online {
  96. n.nodeStatusMap[name] = false
  97. n.NumNodesMonitoredOnline--
  98. n.UptimeData.wentDown = time.Now()
  99. n.updateHealth()
  100. }
  101. }
  102. // NodeIsOnline is called when connection to the node is restored.
  103. // Must be safe to call multiple times.
  104. func (n *Network) NodeIsOnline(name string) {
  105. n.mu.Lock()
  106. defer n.mu.Unlock()
  107. if online, ok := n.nodeStatusMap[name]; ok && !online {
  108. n.nodeStatusMap[name] = true
  109. n.NumNodesMonitoredOnline++
  110. n.UptimeData.totalDownTime += time.Since(n.UptimeData.wentDown)
  111. n.updateHealth()
  112. }
  113. }
  114. // NewNode is called when the new node is added to the monitor.
  115. func (n *Network) NewNode(name string) {
  116. n.NumNodesMonitored++
  117. n.NumNodesMonitoredOnline++
  118. }
  119. // NodeDeleted is called when the node is deleted from under the monitor.
  120. func (n *Network) NodeDeleted(name string) {
  121. n.NumNodesMonitored--
  122. n.NumNodesMonitoredOnline--
  123. }
  124. func (n *Network) updateHealth() {
  125. // if we are connected to all validators, we're at full health
  126. // TODO: make sure they're all at the same height (within a block)
  127. // and all proposing (and possibly validating ) Alternatively, just
  128. // check there hasn't been a new round in numValidators rounds
  129. if n.NumValidators != 0 && n.NumNodesMonitoredOnline == n.NumValidators {
  130. n.Health = FullHealth
  131. } else if n.NumNodesMonitoredOnline > 0 && n.NumNodesMonitoredOnline <= n.NumNodesMonitored {
  132. n.Health = ModerateHealth
  133. } else {
  134. n.Health = Dead
  135. }
  136. }
  137. func (n *Network) UpdateNumValidatorsForHeight(num int, height int64) {
  138. n.mu.Lock()
  139. defer n.mu.Unlock()
  140. if n.Height <= height {
  141. n.NumValidators = num
  142. }
  143. }
  144. func (n *Network) GetHealthString() string {
  145. switch n.Health {
  146. case FullHealth:
  147. return "full"
  148. case ModerateHealth:
  149. return "moderate"
  150. case Dead:
  151. return "dead"
  152. default:
  153. return "undefined"
  154. }
  155. }
  156. // Uptime returns network's uptime in percentages.
  157. func (n *Network) Uptime() float64 {
  158. n.mu.Lock()
  159. defer n.mu.Unlock()
  160. return n.UptimeData.Uptime
  161. }
  162. // StartTime returns time we started monitoring.
  163. func (n *Network) StartTime() time.Time {
  164. return n.UptimeData.StartTime
  165. }