diff --git a/tm-monitor/monitor.go b/tm-monitor/monitor.go index 69858888..4a2ff112 100644 --- a/tm-monitor/monitor.go +++ b/tm-monitor/monitor.go @@ -41,7 +41,7 @@ func (m *Monitor) Monitor(n *Node) error { return err } - m.Network.NumValidatorsOnline++ + m.Network.NewNode(n.Name) m.nodeQuit[n.Name] = make(chan struct{}) go m.listen(n.Name, blockCh, blockLatencyCh, disconnectCh, m.nodeQuit[n.Name]) @@ -50,7 +50,7 @@ func (m *Monitor) Monitor(n *Node) error { } func (m *Monitor) Unmonitor(n *Node) { - m.Network.NumValidatorsOnline-- + m.Network.NodeDeleted(n.Name) n.Stop() close(m.nodeQuit[n.Name]) diff --git a/tm-monitor/network.go b/tm-monitor/network.go index cfbb2f03..1929bc08 100644 --- a/tm-monitor/network.go +++ b/tm-monitor/network.go @@ -8,19 +8,22 @@ import ( tmtypes "github.com/tendermint/tendermint/types" ) -// UptimeData stores data for how long network has been running +// UptimeData stores data for how long network has been running. type UptimeData struct { StartTime time.Time `json:"start_time"` - Uptime float64 `json:"uptime" wire:"unsafe"` // percentage of time we've been `ModerateHealth`y, ever + Uptime float64 `json:"uptime" wire:"unsafe"` // percentage of time we've been healthy, ever totalDownTime time.Duration // total downtime (only updated when we come back online) wentDown time.Time } +// Health describes the health of the network. Note that this applies only to +// the observed nodes, and not to the entire cluster, which may consist of +// thousands of machines. It may change in the future. type Health int const ( - // FullHealth means all validators online, synced, making blocks + // FullHealth means all nodes online, synced, validators making blocks FullHealth = iota // ModerateHealth means we're making blocks ModerateHealth @@ -39,9 +42,9 @@ type Network struct { AvgBlockLatency float64 `json:"avg_block_latency" wire:"unsafe"` // ms (avg over last minute) blockLatencyMeter metrics.Meter - // Network Info - NumValidators int `json:"num_validators"` - NumValidatorsOnline int `json:"num_validators_online"` + NumValidators int `json:"num_validators"` + NumNodesMonitored int `json:"num_nodes_monitored"` + NumNodesMonitoredOnline int `json:"num_nodes_monitored_online"` Health Health `json:"health"` @@ -93,7 +96,7 @@ func (n *Network) NewBlock(b tmtypes.Header) { // TODO: make sure they're all at the same height (within a block) // and all proposing (and possibly validating ) Alternatively, just // check there hasn't been a new round in numValidators rounds - if n.NumValidatorsOnline == n.NumValidators { + if n.NumNodesMonitored == n.NumValidators { n.Health = FullHealth } } @@ -119,36 +122,50 @@ func (n *Network) RecalculateUptime() { n.UptimeData.Uptime = (float64(uptime) / float64(since)) * 100.0 } +// NodeIsDown is called when the node disconnects for whatever reason. func (n *Network) NodeIsDown(name string) { n.mu.Lock() defer n.mu.Unlock() if online := n.nodeStatusMap[name]; online { n.nodeStatusMap[name] = false - n.NumValidatorsOnline-- + n.NumNodesMonitoredOnline-- n.UptimeData.wentDown = time.Now() n.updateHealth() } } +// NodeIsOnline is called when connection to the node is restored. func (n *Network) NodeIsOnline(name string) { n.mu.Lock() defer n.mu.Unlock() if online, ok := n.nodeStatusMap[name]; !ok || !online { n.nodeStatusMap[name] = true - n.NumValidatorsOnline++ + n.NumNodesMonitoredOnline++ n.UptimeData.totalDownTime += time.Since(n.UptimeData.wentDown) n.updateHealth() } } +// NewNode is called when the new node is added to the monitor. +func (n *Network) NewNode(name string) { + n.NumNodesMonitored++ + n.NumNodesMonitoredOnline++ +} + +// NodeDeleted is called when the node is deleted from under the monitor. +func (n *Network) NodeDeleted(name string) { + n.NumNodesMonitored-- + n.NumNodesMonitoredOnline-- +} + func (n *Network) updateHealth() { - if n.NumValidatorsOnline < n.NumValidators { + if n.NumNodesMonitoredOnline < n.NumNodesMonitored { n.Health = ModerateHealth } - if n.NumValidatorsOnline == 0 { + if n.NumNodesMonitoredOnline == 0 { n.Health = Dead } } diff --git a/tm-monitor/ton.go b/tm-monitor/ton.go index a9189e64..37837792 100644 --- a/tm-monitor/ton.go +++ b/tm-monitor/ton.go @@ -65,8 +65,7 @@ func (o *Ton) printHeader() { fmt.Fprintf(o.Output, "Avg block time: %.3f ms\n", n.AvgBlockTime) fmt.Fprintf(o.Output, "Avg Tx throughput: %.0f per sec\n", n.AvgTxThroughput) fmt.Fprintf(o.Output, "Avg block latency: %.3f ms\n", n.AvgBlockLatency) - fmt.Fprintf(o.Output, "Validators: %d online / %d total ", n.NumValidatorsOnline, n.NumValidators) - fmt.Fprintf(o.Output, "Health: %s\n", n.GetHealthString()) + fmt.Fprintf(o.Output, "Nodes: %d from %d online (health: %s) Validators: %d\n", n.NumNodesMonitoredOnline, n.NumNodesMonitored, n.GetHealthString(), n.NumValidators) } func (o *Ton) printTable() {