
fix Health calculation logic

pull/1943/head
Anton Kaliaev 8 years ago
commit 975807c744
3 changed files with 17 additions and 23 deletions
  1. tm-monitor/monitor.go (+2, -0)
  2. tm-monitor/network.go (+12, -20)
  3. tm-monitor/network_test.go (+3, -3)

tm-monitor/monitor.go (+2, -0)

@@ -122,8 +122,10 @@ func (m *Monitor) listen(nodeName string, blockCh <-chan tmtypes.Header, blockLa
 			return
 		case b := <-blockCh:
 			m.Network.NewBlock(b)
+			m.Network.NodeIsOnline(nodeName)
 		case l := <-blockLatencyCh:
 			m.Network.NewBlockLatency(l)
+			m.Network.NodeIsOnline(nodeName)
 		case disconnected := <-disconnectCh:
 			if disconnected {
 				m.Network.NodeIsDown(nodeName)


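The two added calls mean NodeIsOnline now fires on every block and every latency sample a node reports, not just on reconnect. Below is a minimal, self-contained sketch of that loop shape; the simplified types and channel names are assumptions for illustration (the real loop uses tmtypes.Header and the monitor's quit channel):

package main

import "fmt"

// Simplified stand-ins for the real types; the method names follow the diff,
// everything else here is assumed for the sketch.
type header struct{ Height int64 }

type network struct{ online map[string]bool }

func (n *network) NewBlock(b header)         { fmt.Println("new block at height", b.Height) }
func (n *network) NewBlockLatency(l float64) { fmt.Println("block latency", l) }
func (n *network) NodeIsDown(name string)    { n.online[name] = false }

// NodeIsOnline is invoked once per received event, so it has to be safe to
// call repeatedly (mirrors the "Must be safe to call multiple times" comment
// added in network.go).
func (n *network) NodeIsOnline(name string) { n.online[name] = true }

// listen mirrors the shape of Monitor.listen after this commit: every block or
// latency event also marks the reporting node as online.
func listen(name string, net *network, blockCh <-chan header, latencyCh <-chan float64, disconnectCh <-chan bool, quit <-chan struct{}) {
	for {
		select {
		case <-quit:
			return
		case b := <-blockCh:
			net.NewBlock(b)
			net.NodeIsOnline(name)
		case l := <-latencyCh:
			net.NewBlockLatency(l)
			net.NodeIsOnline(name)
		case disconnected := <-disconnectCh:
			if disconnected {
				net.NodeIsDown(name)
			}
		}
	}
}

func main() {
	net := &network{online: map[string]bool{}}
	blockCh := make(chan header)
	quit := make(chan struct{})

	// Feed one block, then stop the loop.
	go func() {
		blockCh <- header{Height: 1}
		close(quit)
	}()

	listen("node0", net, blockCh, make(chan float64), make(chan bool), quit)
	fmt.Println("node0 online:", net.online["node0"]) // true
}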
tm-monitor/network.go (+12, -20)

@@ -74,7 +74,7 @@ func (n *Network) NewBlock(b tmtypes.Header) {
 	defer n.mu.Unlock()
 
 	if n.Height >= uint64(b.Height) {
-		log.Debug("Received new block with height %v less or equal to recorded %v", b.Height, n.Height)
+		log.Debug("Received new block with height <= current", "received", b.Height, "current", n.Height)
 		return
 	}
@@ -89,20 +89,6 @@ func (n *Network) NewBlock(b tmtypes.Header) {
 	}
 	n.txThroughputMeter.Mark(int64(b.NumTxs))
 	n.AvgTxThroughput = n.txThroughputMeter.Rate1()
-
-	// if we're making blocks, we're healthy
-	if n.Health == Dead {
-		n.Health = ModerateHealth
-		n.UptimeData.totalDownTime += time.Since(n.UptimeData.wentDown)
-	}
-
-	// if we are connected to all validators, we're at full health
-	// TODO: make sure they're all at the same height (within a block)
-	// and all proposing (and possibly validating ) Alternatively, just
-	// check there hasn't been a new round in numValidators rounds
-	if n.NumNodesMonitored == n.NumValidators {
-		n.Health = FullHealth
-	}
 }
 
 func (n *Network) NewBlockLatency(l float64) {
@@ -127,6 +113,7 @@ func (n *Network) RecalculateUptime() {
 }
 
 // NodeIsDown is called when the node disconnects for whatever reason.
+// Must be safe to call multiple times.
 func (n *Network) NodeIsDown(name string) {
 	n.mu.Lock()
 	defer n.mu.Unlock()
@@ -140,11 +127,12 @@ func (n *Network) NodeIsDown(name string) {
 }
 
 // NodeIsOnline is called when connection to the node is restored.
+// Must be safe to call multiple times.
 func (n *Network) NodeIsOnline(name string) {
 	n.mu.Lock()
 	defer n.mu.Unlock()
 
-	if online, ok := n.nodeStatusMap[name]; !ok || !online {
+	if online, ok := n.nodeStatusMap[name]; ok && !online {
 		n.nodeStatusMap[name] = true
 		n.NumNodesMonitoredOnline++
 		n.UptimeData.totalDownTime += time.Since(n.UptimeData.wentDown)
@@ -165,11 +153,15 @@ func (n *Network) NodeDeleted(name string) {
 }
 
 func (n *Network) updateHealth() {
-	if n.NumNodesMonitoredOnline < n.NumNodesMonitored {
+	// if we are connected to all validators, we're at full health
+	// TODO: make sure they're all at the same height (within a block)
+	// and all proposing (and possibly validating ) Alternatively, just
+	// check there hasn't been a new round in numValidators rounds
+	if n.NumValidators != 0 && n.NumNodesMonitoredOnline == n.NumValidators {
+		n.Health = FullHealth
+	} else if n.NumNodesMonitoredOnline > 0 && n.NumNodesMonitoredOnline <= n.NumNodesMonitored {
 		n.Health = ModerateHealth
-	}
-
-	if n.NumNodesMonitoredOnline == 0 {
+	} else {
 		n.Health = Dead
 	}
 }

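The net effect of the updateHealth rewrite is a three-way classification: FullHealth only when every validator is monitored and online, ModerateHealth while at least one monitored node is up, Dead otherwise. Previously, FullHealth was granted whenever the number of monitored nodes equalled the number of validators, even if those nodes were offline. A small standalone sketch of just that decision, with the constants' concrete values assumed for illustration (the real method operates on the Network struct's fields):

package main

import "fmt"

// Health levels; the names match tm-monitor, the concrete values here are assumed.
const (
	FullHealth     = iota // all validators are online
	ModerateHealth        // at least one monitored node is online
	Dead                  // no monitored nodes are online
)

// updateHealth mirrors the logic introduced in this commit, extracted as a pure
// function over the three counters the method reads.
func updateHealth(numValidators, numNodesMonitored, numNodesMonitoredOnline int) int {
	if numValidators != 0 && numNodesMonitoredOnline == numValidators {
		return FullHealth
	} else if numNodesMonitoredOnline > 0 && numNodesMonitoredOnline <= numNodesMonitored {
		return ModerateHealth
	}
	return Dead
}

func main() {
	fmt.Println(updateHealth(2, 2, 2)) // 0: all validators online -> FullHealth
	fmt.Println(updateHealth(2, 2, 1)) // 1: one of two online -> ModerateHealth
	fmt.Println(updateHealth(2, 2, 0)) // 2: nothing online -> Dead
}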

tm-monitor/network_test.go (+3, -3)

@@ -43,9 +43,9 @@ func TestNetworkNodeIsDownThenOnline(t *testing.T) {
 
 	n.NodeIsOnline("test")
 	assert.Equal(1, n.NumNodesMonitoredOnline)
-	// assert.Equal(monitor.FullHealth, n.Health)
-	// n.NodeIsOnline("test")
-	// assert.Equal(1, n.NumNodesMonitoredOnline)
+	assert.Equal(monitor.ModerateHealth, n.Health)
+	n.NodeIsOnline("test")
+	assert.Equal(1, n.NumNodesMonitoredOnline)
 }
 
 func TestNetworkNewNode(t *testing.T) {


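The second NodeIsOnline("test") call that the test un-comments exercises the new `ok && !online` guard in network.go: only a node that is already tracked and currently marked offline flips the counter. A toy illustration of that guard (the map and counter names follow the diff; the surrounding struct is assumed):

package main

import "fmt"

// Toy subset of Network: just the status map and the online counter the guard touches.
type network struct {
	nodeStatusMap           map[string]bool
	numNodesMonitoredOnline int
}

// nodeIsOnline applies the guard from this commit: increment only for a node we
// already track (ok) that is currently marked offline (!online).
func (n *network) nodeIsOnline(name string) {
	if online, ok := n.nodeStatusMap[name]; ok && !online {
		n.nodeStatusMap[name] = true
		n.numNodesMonitoredOnline++
	}
}

func main() {
	n := &network{nodeStatusMap: map[string]bool{"test": false}}

	n.nodeIsOnline("test")
	fmt.Println(n.numNodesMonitoredOnline) // 1

	// Calling it again is a no-op, matching the test's second assertion.
	n.nodeIsOnline("test")
	fmt.Println(n.numNodesMonitoredOnline) // 1

	// An unknown node is ignored entirely (with the old `!ok || !online` guard
	// it would have been counted as online without ever being registered).
	n.nodeIsOnline("unknown")
	fmt.Println(n.numNodesMonitoredOnline) // 1
}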