Browse Source

differentiate between monitored nodes and nodes in a cluster

pull/1943/head
Anton Kaliaev 8 years ago
parent
commit
069c870614
No known key found for this signature in database GPG Key ID: 7B6881D965918214
3 changed files with 31 additions and 15 deletions
  1. +2
    -2
      tm-monitor/monitor.go
  2. +28
    -11
      tm-monitor/network.go
  3. +1
    -2
      tm-monitor/ton.go

+ 2
- 2
tm-monitor/monitor.go View File

@ -41,7 +41,7 @@ func (m *Monitor) Monitor(n *Node) error {
return err
}
m.Network.NumValidatorsOnline++
m.Network.NewNode(n.Name)
m.nodeQuit[n.Name] = make(chan struct{})
go m.listen(n.Name, blockCh, blockLatencyCh, disconnectCh, m.nodeQuit[n.Name])
@ -50,7 +50,7 @@ func (m *Monitor) Monitor(n *Node) error {
}
func (m *Monitor) Unmonitor(n *Node) {
m.Network.NumValidatorsOnline--
m.Network.NodeDeleted(n.Name)
n.Stop()
close(m.nodeQuit[n.Name])


+ 28
- 11
tm-monitor/network.go View File

@ -8,19 +8,22 @@ import (
tmtypes "github.com/tendermint/tendermint/types"
)
// UptimeData stores data for how long network has been running
// UptimeData stores data for how long network has been running.
type UptimeData struct {
StartTime time.Time `json:"start_time"`
Uptime float64 `json:"uptime" wire:"unsafe"` // percentage of time we've been `ModerateHealth`y, ever
Uptime float64 `json:"uptime" wire:"unsafe"` // percentage of time we've been healthy, ever
totalDownTime time.Duration // total downtime (only updated when we come back online)
wentDown time.Time
}
// Health describes the health of the network. Note that this applies only to
// the observed nodes, and not to the entire cluster, which may consist of
// thousands of machines. It may change in the future.
type Health int
const (
// FullHealth means all validators online, synced, making blocks
// FullHealth means all nodes online, synced, validators making blocks
FullHealth = iota
// ModerateHealth means we're making blocks
ModerateHealth
@ -39,9 +42,9 @@ type Network struct {
AvgBlockLatency float64 `json:"avg_block_latency" wire:"unsafe"` // ms (avg over last minute)
blockLatencyMeter metrics.Meter
// Network Info
NumValidators int `json:"num_validators"`
NumValidatorsOnline int `json:"num_validators_online"`
NumValidators int `json:"num_validators"`
NumNodesMonitored int `json:"num_nodes_monitored"`
NumNodesMonitoredOnline int `json:"num_nodes_monitored_online"`
Health Health `json:"health"`
@ -93,7 +96,7 @@ func (n *Network) NewBlock(b tmtypes.Header) {
// TODO: make sure they're all at the same height (within a block)
// and all proposing (and possibly validating). Alternatively, just
// check there hasn't been a new round in numValidators rounds.
if n.NumValidatorsOnline == n.NumValidators {
if n.NumNodesMonitored == n.NumValidators {
n.Health = FullHealth
}
}
@ -119,36 +122,50 @@ func (n *Network) RecalculateUptime() {
n.UptimeData.Uptime = (float64(uptime) / float64(since)) * 100.0
}
// NodeIsDown is called when the node disconnects for whatever reason.
//
// It holds n.mu for the whole call. Only when nodeStatusMap currently records
// the node as online does it mark the node offline, decrement the online
// counter, record the time the outage started and re-evaluate the health.
func (n *Network) NodeIsDown(name string) {
n.mu.Lock()
defer n.mu.Unlock()
// A name with no entry in nodeStatusMap reads as false here, so a node that
// was never recorded as online is ignored.
if online := n.nodeStatusMap[name]; online {
n.nodeStatusMap[name] = false
// NOTE(review): this listing is a rendered diff; the next two lines look like
// the removed and added sides of a single change (NumValidatorsOnline
// replaced by NumNodesMonitoredOnline) - confirm that only the second one
// exists in the resulting file.
n.NumValidatorsOnline--
n.NumNodesMonitoredOnline--
// Remember when the outage began; NodeIsOnline adds the time elapsed since
// this instant to the total downtime.
n.UptimeData.wentDown = time.Now()
n.updateHealth()
}
}
// NodeIsOnline is called when connection to the node is restored.
//
// It holds n.mu for the whole call. When the node has no entry in
// nodeStatusMap, or is recorded as offline, it marks the node online,
// increments the online counter, adds the time elapsed since wentDown to the
// total downtime and re-evaluates the health. A node already recorded as
// online is left untouched.
func (n *Network) NodeIsOnline(name string) {
n.mu.Lock()
defer n.mu.Unlock()
// !ok means the name has never been stored in nodeStatusMap.
if online, ok := n.nodeStatusMap[name]; !ok || !online {
n.nodeStatusMap[name] = true
// NOTE(review): this listing is a rendered diff; the next two lines look like
// the removed and added sides of a single change (NumValidatorsOnline
// replaced by NumNodesMonitoredOnline) - confirm that only the second one
// exists in the resulting file.
n.NumValidatorsOnline++
n.NumNodesMonitoredOnline++
// NOTE(review): wentDown is a single value shared by all nodes, and in the
// visible code it is only assigned in NodeIsDown. If this branch runs for a
// node without a preceding NodeIsDown, wentDown may still be the zero
// time.Time and the added duration would be very large - confirm against
// the callers.
n.UptimeData.totalDownTime += time.Since(n.UptimeData.wentDown)
n.updateHealth()
}
}
// NewNode is called when a new node is added to the monitor.
//
// The node is counted as both monitored and online, and is recorded as online
// in nodeStatusMap. Recording it matters for the later state transitions:
// NodeIsDown ignores names that are not marked online, and NodeIsOnline counts
// any name it has never seen, so without the entry the first disconnect would
// be lost and the following reconnect would count the node twice.
//
// n.mu is held for the update because NodeIsDown and NodeIsOnline modify the
// same counters and map under that lock.
func (n *Network) NewNode(name string) {
	n.mu.Lock()
	defer n.mu.Unlock()

	// Writing to a nil map panics; tolerate a Network whose map was never
	// initialised.
	if n.nodeStatusMap == nil {
		n.nodeStatusMap = make(map[string]bool)
	}

	n.NumNodesMonitored++
	n.NumNodesMonitoredOnline++
	n.nodeStatusMap[name] = true
}
// NodeDeleted is called when a node is removed from the monitor.
//
// The monitored counter always drops by one. The online counter drops only if
// the node is still counted as online: a node that nodeStatusMap records as
// offline was already subtracted by NodeIsDown, and subtracting it again would
// push the online counter below the real number of online nodes.
//
// The node's nodeStatusMap entry is removed so that a disconnect reported
// while the node is being stopped is not counted a second time, and so that a
// node re-added under the same name starts from a clean state.
//
// n.mu is held for the update because NodeIsDown and NodeIsOnline modify the
// same counters and map under that lock.
func (n *Network) NodeDeleted(name string) {
	n.mu.Lock()
	defer n.mu.Unlock()

	n.NumNodesMonitored--

	// A name with no entry was counted as online when it was added and has not
	// been marked offline since, so it is treated the same as an online node.
	if online, known := n.nodeStatusMap[name]; !known || online {
		n.NumNodesMonitoredOnline--
	}

	// delete is a no-op for a missing key or a nil map.
	delete(n.nodeStatusMap, name)
}
func (n *Network) updateHealth() {
if n.NumValidatorsOnline < n.NumValidators {
if n.NumNodesMonitoredOnline < n.NumNodesMonitored {
n.Health = ModerateHealth
}
if n.NumValidatorsOnline == 0 {
if n.NumNodesMonitoredOnline == 0 {
n.Health = Dead
}
}


+ 1
- 2
tm-monitor/ton.go View File

@ -65,8 +65,7 @@ func (o *Ton) printHeader() {
fmt.Fprintf(o.Output, "Avg block time: %.3f ms\n", n.AvgBlockTime)
fmt.Fprintf(o.Output, "Avg Tx throughput: %.0f per sec\n", n.AvgTxThroughput)
fmt.Fprintf(o.Output, "Avg block latency: %.3f ms\n", n.AvgBlockLatency)
fmt.Fprintf(o.Output, "Validators: %d online / %d total ", n.NumValidatorsOnline, n.NumValidators)
fmt.Fprintf(o.Output, "Health: %s\n", n.GetHealthString())
fmt.Fprintf(o.Output, "Nodes: %d from %d online (health: %s) Validators: %d\n", n.NumNodesMonitoredOnline, n.NumNodesMonitored, n.GetHealthString(), n.NumValidators)
}
func (o *Ton) printTable() {


Loading…
Cancel
Save