package monitor

import (
	"sync"
	"time"

	metrics "github.com/rcrowley/go-metrics"
	tmtypes "github.com/tendermint/tendermint/types"
)

// UptimeData stores data for how long the network has been running.
type UptimeData struct {
	StartTime time.Time `json:"start_time"`
	Uptime    float64   `json:"uptime" wire:"unsafe"` // percentage of time we've been healthy, ever

	totalDownTime time.Duration // total downtime (only updated when we come back online)
	wentDown      time.Time
}

// Health describes the health of the network. Note that this applies only to
// the observed nodes, and not to the entire cluster, which may consist of
// thousands of machines. It may change in the future.
type Health int

const (
	// FullHealth means all nodes online, synced, validators making blocks
	FullHealth = Health(0)
	// ModerateHealth means we're making blocks
	ModerateHealth = Health(1)
	// Dead means we're not making blocks due to all validators freezing or crashing
	Dead = Health(2)
)

// Network holds common statistics for a network of nodes.
type Network struct {
	Height int64 `json:"height"`

	AvgBlockTime      float64 `json:"avg_block_time" wire:"unsafe"` // ms (avg over last minute)
	blockTimeMeter    metrics.Meter
	AvgTxThroughput   float64 `json:"avg_tx_throughput" wire:"unsafe"` // tx/s (avg over last minute)
	txThroughputMeter metrics.Meter
	AvgBlockLatency   float64 `json:"avg_block_latency" wire:"unsafe"` // ms (avg over last minute)
	blockLatencyMeter metrics.Meter

	NumValidators           int `json:"num_validators"`
	NumNodesMonitored       int `json:"num_nodes_monitored"`
	NumNodesMonitoredOnline int `json:"num_nodes_monitored_online"`

	Health Health `json:"health"`

	UptimeData *UptimeData `json:"uptime_data"`

	nodeStatusMap map[string]bool

	mu sync.Mutex
}
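
// NewNetwork returns a Network with fresh meters, full health, and the start
// time set to now.
//
// A minimal usage sketch (the node name "node0" and the header variable are
// placeholders, not part of this package):
//
//	n := NewNetwork()
//	n.NewNode("node0")
//	n.NewBlock(header)      // call on every new block header
//	n.NodeIsDown("node0")   // call when the connection to the node drops
//	n.NodeIsOnline("node0") // call when the connection is restored
//	n.RecalculateUptime()
//	fmt.Println(n.GetHealthString(), n.Uptime())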
func NewNetwork() *Network {
	return &Network{
		blockTimeMeter:    metrics.NewMeter(),
		txThroughputMeter: metrics.NewMeter(),
		blockLatencyMeter: metrics.NewMeter(),
		Health:            FullHealth,
		UptimeData: &UptimeData{
			StartTime: time.Now(),
			Uptime:    100.0,
		},
		nodeStatusMap: make(map[string]bool),
	}
}
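
// NewBlock records a new block header. It advances the height and updates the
// average block time and average transaction throughput. Headers at or below
// the current height are ignored.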
func (n *Network) NewBlock(b tmtypes.Header) {
	n.mu.Lock()
	defer n.mu.Unlock()

	if n.Height >= b.Height {
		return
	}

	n.Height = b.Height

	n.blockTimeMeter.Mark(1)
	if n.blockTimeMeter.Rate1() > 0.0 {
		n.AvgBlockTime = (1.0 / n.blockTimeMeter.Rate1()) * 1000 // 1/s to ms
	} else {
		n.AvgBlockTime = 0.0
	}
	n.txThroughputMeter.Mark(int64(b.NumTxs))
	n.AvgTxThroughput = n.txThroughputMeter.Rate1()
}
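
// NewBlockLatency records a new block latency, given in nanoseconds, and
// updates the average block latency in milliseconds.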
func (n *Network) NewBlockLatency(l float64) {
	n.mu.Lock()
	defer n.mu.Unlock()

	n.blockLatencyMeter.Mark(int64(l))
	n.AvgBlockLatency = n.blockLatencyMeter.Rate1() / 1000000.0 // ns to ms
}

// RecalculateUptime calculates uptime on demand.
func (n *Network) RecalculateUptime() {
	n.mu.Lock()
	defer n.mu.Unlock()

	since := time.Since(n.UptimeData.StartTime)
	uptime := since - n.UptimeData.totalDownTime
	if n.Health != FullHealth {
		uptime -= time.Since(n.UptimeData.wentDown)
	}
	n.UptimeData.Uptime = (float64(uptime) / float64(since)) * 100.0
}

// NodeIsDown is called when the node disconnects for whatever reason.
// Must be safe to call multiple times.
func (n *Network) NodeIsDown(name string) {
	n.mu.Lock()
	defer n.mu.Unlock()

	if online, ok := n.nodeStatusMap[name]; !ok || online {
		n.nodeStatusMap[name] = false
		n.NumNodesMonitoredOnline--
		n.UptimeData.wentDown = time.Now()
		n.updateHealth()
	}
}

// NodeIsOnline is called when the connection to the node is restored.
// Must be safe to call multiple times.
func (n *Network) NodeIsOnline(name string) {
	n.mu.Lock()
	defer n.mu.Unlock()

	if online, ok := n.nodeStatusMap[name]; ok && !online {
		n.nodeStatusMap[name] = true
		n.NumNodesMonitoredOnline++
		n.UptimeData.totalDownTime += time.Since(n.UptimeData.wentDown)
		n.updateHealth()
	}
}

// NewNode is called when a new node is added to the monitor.
func (n *Network) NewNode(name string) {
	n.NumNodesMonitored++
	n.NumNodesMonitoredOnline++
}

// NodeDeleted is called when the node is deleted from under the monitor.
func (n *Network) NodeDeleted(name string) {
	n.NumNodesMonitored--
	n.NumNodesMonitoredOnline--
}
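
// updateHealth recalculates the network health from the number of monitored
// nodes currently online and the number of validators. Callers are expected
// to hold n.mu.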
func (n *Network) updateHealth() {
	// If we are connected to all validators, we're at full health.
	// TODO: make sure they're all at the same height (within a block)
	// and all proposing (and possibly validating). Alternatively, just
	// check there hasn't been a new round in numValidators rounds.
	if n.NumValidators != 0 && n.NumNodesMonitoredOnline == n.NumValidators {
		n.Health = FullHealth
	} else if n.NumNodesMonitoredOnline > 0 && n.NumNodesMonitoredOnline <= n.NumNodesMonitored {
		n.Health = ModerateHealth
	} else {
		n.Health = Dead
	}
}
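
// UpdateNumValidatorsForHeight sets the validator count, but only if the given
// height is at or above the network's current height.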
func (n *Network) UpdateNumValidatorsForHeight(num int, height int64) {
	n.mu.Lock()
	defer n.mu.Unlock()

	if n.Height <= height {
		n.NumValidators = num
	}
}
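
// GetHealthString returns a human-readable representation of the current
// health: "full", "moderate", "dead" or "undefined".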
func (n *Network) GetHealthString() string {
	switch n.Health {
	case FullHealth:
		return "full"
	case ModerateHealth:
		return "moderate"
	case Dead:
		return "dead"
	default:
		return "undefined"
	}
}

// Uptime returns the network's uptime as a percentage.
func (n *Network) Uptime() float64 {
	n.mu.Lock()
	defer n.mu.Unlock()
	return n.UptimeData.Uptime
}

// StartTime returns the time we started monitoring.
func (n *Network) StartTime() time.Time {
	return n.UptimeData.StartTime
}