package main
|
|
|
|
import (
|
|
"math/rand"
|
|
"time"
|
|
|
|
tmtypes "github.com/tendermint/tendermint/types"
|
|
)
|
|
|
|
// nodeLivenessTimeout is how long a node's listen loop may go without
// receiving any event (block, block latency or disconnect notification)
// before the node is reported to the network as down.
|
|
const nodeLivenessTimeout = 5 * time.Second
|
|
|
|
type Monitor struct {
|
|
Nodes map[string]*Node
|
|
Network *Network
|
|
|
|
monitorQuit chan struct{} // monitor exitting
|
|
nodeQuit map[string]chan struct{} // node is being stopped and removed from under the monitor
|
|
}
|
|
|
|
func NewMonitor() *Monitor {
|
|
return &Monitor{
|
|
Nodes: make(map[string]*Node),
|
|
Network: NewNetwork(),
|
|
monitorQuit: make(chan struct{}),
|
|
nodeQuit: make(map[string]chan struct{}),
|
|
}
|
|
}
|
|
|
|
func (m *Monitor) Monitor(n *Node) error {
|
|
m.Nodes[n.Name] = n
|
|
|
|
blockCh := make(chan tmtypes.Header, 10)
|
|
n.SendBlocksTo(blockCh)
|
|
blockLatencyCh := make(chan float64, 10)
|
|
n.SendBlockLatenciesTo(blockLatencyCh)
|
|
disconnectCh := make(chan bool, 10)
|
|
n.NotifyAboutDisconnects(disconnectCh)
|
|
|
|
if err := n.Start(); err != nil {
|
|
return err
|
|
}
|
|
|
|
m.Network.NumValidatorsOnline++
|
|
|
|
m.nodeQuit[n.Name] = make(chan struct{})
|
|
go m.listen(n.Name, blockCh, blockLatencyCh, disconnectCh, m.nodeQuit[n.Name])
|
|
|
|
return nil
|
|
}
|
|
|
|
func (m *Monitor) Unmonitor(n *Node) {
|
|
m.Network.NumValidatorsOnline--
|
|
|
|
n.Stop()
|
|
close(m.nodeQuit[n.Name])
|
|
delete(m.nodeQuit, n.Name)
|
|
delete(m.Nodes, n.Name)
|
|
}
|
|
|
|
func (m *Monitor) Start() error {
|
|
go m.recalculateNetworkUptime()
|
|
go m.updateNumValidators()
|
|
|
|
return nil
|
|
}
|
|
|
|
func (m *Monitor) Stop() {
|
|
close(m.monitorQuit)
|
|
|
|
for _, n := range m.Nodes {
|
|
m.Unmonitor(n)
|
|
}
|
|
}
|
|
|
|
// main loop where we listen for events from the node
|
|
func (m *Monitor) listen(nodeName string, blockCh <-chan tmtypes.Header, blockLatencyCh <-chan float64, disconnectCh <-chan bool, quit <-chan struct{}) {
|
|
for {
|
|
select {
|
|
case <-quit:
|
|
return
|
|
case b := <-blockCh:
|
|
m.Network.NewBlock(b)
|
|
case l := <-blockLatencyCh:
|
|
m.Network.NewBlockLatency(l)
|
|
case disconnected := <-disconnectCh:
|
|
if disconnected {
|
|
m.Network.NodeIsDown(nodeName)
|
|
} else {
|
|
m.Network.NodeIsOnline(nodeName)
|
|
}
|
|
case <-time.After(nodeLivenessTimeout):
|
|
m.Network.NodeIsDown(nodeName)
|
|
}
|
|
}
|
|
}
|
|
|
|
// recalculateNetworkUptime every N seconds.
|
|
func (m *Monitor) recalculateNetworkUptime() {
|
|
for {
|
|
select {
|
|
case <-m.monitorQuit:
|
|
return
|
|
case <-time.After(10 * time.Second):
|
|
m.Network.RecalculateUptime()
|
|
}
|
|
}
|
|
}
|
|
|
|
// updateNumValidators sends a request to a random node once every N seconds,
|
|
// which in turn makes an RPC call to get the latest validators.
|
|
func (m *Monitor) updateNumValidators() {
|
|
rand.Seed(time.Now().Unix())
|
|
|
|
var height uint64
|
|
var num int
|
|
var err error
|
|
|
|
for {
|
|
if 0 == len(m.Nodes) {
|
|
m.Network.NumValidators = 0
|
|
time.Sleep(5 * time.Second)
|
|
continue
|
|
}
|
|
|
|
randomNodeIndex := rand.Intn(len(m.Nodes))
|
|
|
|
select {
|
|
case <-m.monitorQuit:
|
|
return
|
|
case <-time.After(5 * time.Second):
|
|
i := 0
|
|
for _, n := range m.Nodes {
|
|
if i == randomNodeIndex {
|
|
height, num, err = n.NumValidators()
|
|
if err != nil {
|
|
log.Debug(err.Error())
|
|
}
|
|
break
|
|
}
|
|
i++
|
|
}
|
|
|
|
if m.Network.Height <= height {
|
|
m.Network.NumValidators = num
|
|
}
|
|
}
|
|
}
|
|
}
|