You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

209 lines
5.4 KiB

Improved tm-monitor formatting (#4023)

* tm-monitor: tweaked formatting of start time and avg tx throughput.
* tm-monitor: update health when validator number is updated.
* Updated CHANGELOG_PENDING
* Added PR number to CHANGELOG_PENDING.

Improves `tm-monitor` formatting of start time (RFC1123 without unnecessary precision) and avg tx throughput (three decimal places). The old tx throughput display was confusing during local testing where the tx rate is low and displayed as 0.

Also updates the monitor health whenever the validator number changes. It otherwise starts with moderate health and fails to update this once it discovers the validators, leading to incorrect health reporting and invalid uptime statistics. Let me know if you would like me to submit this as a separate PR.

### Before:

```
2019-09-29 20:40:00.992834 +0200 CEST m=+0.024057059 up -92030989600.42%
Height: 2518
Avg block time: 1275.496 ms
Avg tx throughput: 0 per sec
Avg block latency: 2.464 ms
Active nodes: 4/4 (health: moderate) Validators: 4
NAME HEIGHT BLOCK LATENCY ONLINE VALIDATOR
localhost:26657 2518 0.935 ms true true
localhost:26660 2518 0.710 ms true true
localhost:26662 2518 0.708 ms true true
localhost:26664 2518 0.717 ms true true
```

### After:

```
Sun, 29 Sep 2019 20:21:59 +0200 up 100.00%
Height: 2480
Avg block time: 1361.445 ms
Avg tx throughput: 0.735 per sec
Avg block latency: 4.232 ms
Active nodes: 4/4 (health: full) Validators: 4
NAME HEIGHT BLOCK LATENCY ONLINE VALIDATOR
localhost:26657 2480 1.174 ms true true
localhost:26660 2480 1.037 ms true true
localhost:26662 2480 0.981 ms true true
localhost:26664 2480 0.995 ms true true
```
5 years ago
  1. package monitor
  2. import (
  3. "sync"
  4. "time"
  5. metrics "github.com/rcrowley/go-metrics"
  6. tmtypes "github.com/tendermint/tendermint/types"
  7. )
  // UptimeData records when monitoring began and how much of the time since
  // then the network has been healthy.
  type UptimeData struct {
  	// StartTime is the moment monitoring started.
  	StartTime time.Time `json:"start_time"`
  	// Uptime is the percentage of time we've been healthy, ever.
  	Uptime float64 `json:"uptime" amino:"unsafe"`

  	// totalDownTime is the accumulated downtime; it is only updated when we
  	// come back online.
  	totalDownTime time.Duration
  	// wentDown is the time at which a node was most recently reported down.
  	wentDown time.Time
  }
  // Health describes the health of the network. Note that this applies only to
  // the observed nodes, and not to the entire cluster, which may consist of
  // thousands of machines. It may change in the future.
  type Health int

  // Health levels, numbered from zero in declaration order.
  const (
  	// FullHealth means all nodes online, synced, validators making blocks.
  	FullHealth Health = iota
  	// ModerateHealth means we're making blocks.
  	ModerateHealth
  	// Dead means we're not making blocks due to all validators freezing or
  	// crashing.
  	Dead
  )
  27. // Common statistics for network of nodes
  28. type Network struct {
  29. Height int64 `json:"height"`
  30. AvgBlockTime float64 `json:"avg_block_time" amino:"unsafe"` // ms (avg over last minute)
  31. blockTimeMeter metrics.Meter
  32. AvgTxThroughput float64 `json:"avg_tx_throughput" amino:"unsafe"` // tx/s (avg over last minute)
  33. txThroughputMeter metrics.Meter
  34. AvgBlockLatency float64 `json:"avg_block_latency" amino:"unsafe"` // ms (avg over last minute)
  35. blockLatencyMeter metrics.Meter
  36. NumValidators int `json:"num_validators"`
  37. NumNodesMonitored int `json:"num_nodes_monitored"`
  38. NumNodesMonitoredOnline int `json:"num_nodes_monitored_online"`
  39. Health Health `json:"health"`
  40. UptimeData *UptimeData `json:"uptime_data"`
  41. nodeStatusMap map[string]bool
  42. mu sync.Mutex
  43. }
  44. func NewNetwork() *Network {
  45. return &Network{
  46. blockTimeMeter: metrics.NewMeter(),
  47. txThroughputMeter: metrics.NewMeter(),
  48. blockLatencyMeter: metrics.NewMeter(),
  49. Health: FullHealth,
  50. UptimeData: &UptimeData{
  51. StartTime: time.Now(),
  52. Uptime: 100.0,
  53. },
  54. nodeStatusMap: make(map[string]bool),
  55. }
  56. }
  57. func (n *Network) NewBlock(b *tmtypes.Block) {
  58. n.mu.Lock()
  59. defer n.mu.Unlock()
  60. if n.Height >= b.Height {
  61. return
  62. }
  63. n.Height = b.Height
  64. n.blockTimeMeter.Mark(1)
  65. if n.blockTimeMeter.Rate1() > 0.0 {
  66. n.AvgBlockTime = (1.0 / n.blockTimeMeter.Rate1()) * 1000 // 1/s to ms
  67. } else {
  68. n.AvgBlockTime = 0.0
  69. }
  70. n.txThroughputMeter.Mark(int64(len(b.Data.Txs)))
  71. n.AvgTxThroughput = n.txThroughputMeter.Rate1()
  72. }
  73. func (n *Network) NewBlockLatency(l float64) {
  74. n.mu.Lock()
  75. defer n.mu.Unlock()
  76. n.blockLatencyMeter.Mark(int64(l))
  77. n.AvgBlockLatency = n.blockLatencyMeter.Rate1() / 1000000.0 // ns to ms
  78. }
  79. // RecalculateUptime calculates uptime on demand.
  80. func (n *Network) RecalculateUptime() {
  81. n.mu.Lock()
  82. defer n.mu.Unlock()
  83. since := time.Since(n.UptimeData.StartTime)
  84. uptime := since - n.UptimeData.totalDownTime
  85. if n.Health != FullHealth {
  86. uptime -= time.Since(n.UptimeData.wentDown)
  87. }
  88. n.UptimeData.Uptime = (float64(uptime) / float64(since)) * 100.0
  89. }
  90. // NodeIsDown is called when the node disconnects for whatever reason.
  91. // Must be safe to call multiple times.
  92. func (n *Network) NodeIsDown(name string) {
  93. n.mu.Lock()
  94. defer n.mu.Unlock()
  95. if online, ok := n.nodeStatusMap[name]; !ok || online {
  96. n.nodeStatusMap[name] = false
  97. n.NumNodesMonitoredOnline--
  98. n.UptimeData.wentDown = time.Now()
  99. n.updateHealth()
  100. }
  101. }
  102. // NodeIsOnline is called when connection to the node is restored.
  103. // Must be safe to call multiple times.
  104. func (n *Network) NodeIsOnline(name string) {
  105. n.mu.Lock()
  106. defer n.mu.Unlock()
  107. if online, ok := n.nodeStatusMap[name]; ok && !online {
  108. n.nodeStatusMap[name] = true
  109. n.NumNodesMonitoredOnline++
  110. n.UptimeData.totalDownTime += time.Since(n.UptimeData.wentDown)
  111. n.updateHealth()
  112. }
  113. }
  114. // NewNode is called when the new node is added to the monitor.
  115. func (n *Network) NewNode(name string) {
  116. n.mu.Lock()
  117. defer n.mu.Unlock()
  118. n.NumNodesMonitored++
  119. n.NumNodesMonitoredOnline++
  120. n.updateHealth()
  121. }
  122. // NodeDeleted is called when the node is deleted from under the monitor.
  123. func (n *Network) NodeDeleted(name string) {
  124. n.mu.Lock()
  125. defer n.mu.Unlock()
  126. n.NumNodesMonitored--
  127. n.NumNodesMonitoredOnline--
  128. n.updateHealth()
  129. }
  130. func (n *Network) updateHealth() {
  131. // if we are connected to all validators, we're at full health
  132. // TODO: make sure they're all at the same height (within a block)
  133. // and all proposing (and possibly validating ) Alternatively, just
  134. // check there hasn't been a new round in numValidators rounds
  135. switch {
  136. case n.NumValidators != 0 && n.NumNodesMonitoredOnline == n.NumValidators:
  137. n.Health = FullHealth
  138. case n.NumNodesMonitoredOnline > 0 && n.NumNodesMonitoredOnline <= n.NumNodesMonitored:
  139. n.Health = ModerateHealth
  140. default:
  141. n.Health = Dead
  142. }
  143. }
  144. func (n *Network) UpdateNumValidatorsForHeight(num int, height int64) {
  145. n.mu.Lock()
  146. defer n.mu.Unlock()
  147. if n.Height <= height {
  148. n.NumValidators = num
  149. }
  150. n.updateHealth()
  151. }
  152. func (n *Network) GetHealthString() string {
  153. switch n.Health {
  154. case FullHealth:
  155. return "full"
  156. case ModerateHealth:
  157. return "moderate"
  158. case Dead:
  159. return "dead"
  160. default:
  161. return "undefined"
  162. }
  163. }
  164. // Uptime returns network's uptime in percentages.
  165. func (n *Network) Uptime() float64 {
  166. n.mu.Lock()
  167. defer n.mu.Unlock()
  168. return n.UptimeData.Uptime
  169. }
  170. // StartTime returns time we started monitoring.
  171. func (n *Network) StartTime() time.Time {
  172. return n.UptimeData.StartTime
  173. }