Erik Grinaker ac099aa272 Improved tm-monitor formatting (#4023)
* tm-monitor: tweaked formatting of start time and avg tx throughput.

* tm-monitor: update health when validator number is updated.

* Updated CHANGELOG_PENDING

* Added PR number to CHANGELOG_PENDING.

Improves `tm-monitor` formatting of the start time (RFC1123, without unnecessary precision) and of the average tx throughput (three decimal places). The old tx throughput display was confusing during local testing, where low tx rates were rounded down and displayed as 0.
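
For illustration, the new formatting can be approximated with the standard library alone (a sketch with illustrative names, not the actual tm-monitor rendering code):

```go
package main

import (
	"fmt"
	"time"
)

func main() {
	startTime := time.Now()
	uptime := 100.0
	avgTxThroughput := 0.735

	// RFC1123 drops the sub-second precision and the monotonic clock
	// reading of time.Time's default formatting; the numeric-zone
	// variant (time.RFC1123Z) matches the "+0200" offset in the sample
	// output below.
	fmt.Printf("%s up %.2f%%\n", startTime.Format(time.RFC1123Z), uptime)

	// Three decimal places keep low local-testing rates visible instead
	// of rounding them down to 0.
	fmt.Printf("Avg tx throughput: %.3f per sec\n", avgTxThroughput)
}
```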

Also updates the monitor health whenever the validator number changes. Otherwise the monitor starts at moderate health (the validator count is still zero when the first node is registered) and never updates it once the validators are discovered, leading to incorrect health reporting and invalid uptime statistics, such as the wildly negative uptime in the Before output below. Let me know if you would like me to submit this as a separate PR.

### Before:

```
2019-09-29 20:40:00.992834 +0200 CEST m=+0.024057059 up -92030989600.42%

Height: 2518
Avg block time: 1275.496 ms
Avg tx throughput: 0 per sec
Avg block latency: 2.464 ms
Active nodes: 4/4 (health: moderate) Validators: 4

NAME                HEIGHT     BLOCK LATENCY     ONLINE     VALIDATOR     
localhost:26657     2518       0.935 ms          true       true          
localhost:26660     2518       0.710 ms          true       true          
localhost:26662     2518       0.708 ms          true       true          
localhost:26664     2518       0.717 ms          true       true          
```

### After:

```
Sun, 29 Sep 2019 20:21:59 +0200 up 100.00%

Height: 2480
Avg block time: 1361.445 ms
Avg tx throughput: 0.735 per sec
Avg block latency: 4.232 ms
Active nodes: 4/4 (health: full) Validators: 4

NAME                HEIGHT     BLOCK LATENCY     ONLINE     VALIDATOR     
localhost:26657     2480       1.174 ms          true       true          
localhost:26660     2480       1.037 ms          true       true          
localhost:26662     2480       0.981 ms          true       true          
localhost:26664     2480       0.995 ms          true       true          
```
The updated network-monitoring code from the `monitor` package (Go, 210 lines):

```go
package monitor

import (
	"sync"
	"time"

	metrics "github.com/rcrowley/go-metrics"
	tmtypes "github.com/tendermint/tendermint/types"
)

// UptimeData stores data for how long the network has been running.
type UptimeData struct {
	StartTime time.Time `json:"start_time"`
	Uptime    float64   `json:"uptime" amino:"unsafe"` // percentage of time we've been healthy, ever

	totalDownTime time.Duration // total downtime (only updated when we come back online)
	wentDown      time.Time
}

// Health describes the health of the network. Note that this applies only to
// the observed nodes, and not to the entire cluster, which may consist of
// thousands of machines. It may change in the future.
type Health int

const (
	// FullHealth means all nodes online, synced, validators making blocks
	FullHealth = Health(0)
	// ModerateHealth means we're making blocks
	ModerateHealth = Health(1)
	// Dead means we're not making blocks due to all validators freezing or crashing
	Dead = Health(2)
)

// Network holds common statistics for a network of nodes.
type Network struct {
	Height int64 `json:"height"`

	AvgBlockTime      float64 `json:"avg_block_time" amino:"unsafe"` // ms (avg over last minute)
	blockTimeMeter    metrics.Meter
	AvgTxThroughput   float64 `json:"avg_tx_throughput" amino:"unsafe"` // tx/s (avg over last minute)
	txThroughputMeter metrics.Meter
	AvgBlockLatency   float64 `json:"avg_block_latency" amino:"unsafe"` // ms (avg over last minute)
	blockLatencyMeter metrics.Meter

	NumValidators           int `json:"num_validators"`
	NumNodesMonitored       int `json:"num_nodes_monitored"`
	NumNodesMonitoredOnline int `json:"num_nodes_monitored_online"`

	Health Health `json:"health"`

	UptimeData *UptimeData `json:"uptime_data"`

	nodeStatusMap map[string]bool

	mu sync.Mutex
}

// NewNetwork creates a new monitored network, starting at full health and
// 100% uptime.
func NewNetwork() *Network {
	return &Network{
		blockTimeMeter:    metrics.NewMeter(),
		txThroughputMeter: metrics.NewMeter(),
		blockLatencyMeter: metrics.NewMeter(),
		Health:            FullHealth,
		UptimeData: &UptimeData{
			StartTime: time.Now(),
			Uptime:    100.0,
		},
		nodeStatusMap: make(map[string]bool),
	}
}

// NewBlock is called when a new block header is observed. It updates the
// height and the rolling block time and tx throughput averages.
func (n *Network) NewBlock(b tmtypes.Header) {
	n.mu.Lock()
	defer n.mu.Unlock()

	if n.Height >= b.Height {
		return
	}

	n.Height = b.Height

	n.blockTimeMeter.Mark(1)
	if n.blockTimeMeter.Rate1() > 0.0 {
		n.AvgBlockTime = (1.0 / n.blockTimeMeter.Rate1()) * 1000 // 1/s to ms
	} else {
		n.AvgBlockTime = 0.0
	}
	n.txThroughputMeter.Mark(b.NumTxs)
	n.AvgTxThroughput = n.txThroughputMeter.Rate1()
}

// NewBlockLatency is called with a new block latency measurement in
// nanoseconds.
func (n *Network) NewBlockLatency(l float64) {
	n.mu.Lock()
	defer n.mu.Unlock()

	n.blockLatencyMeter.Mark(int64(l))
	n.AvgBlockLatency = n.blockLatencyMeter.Rate1() / 1000000.0 // ns to ms
}

// RecalculateUptime calculates uptime on demand: the share of time since
// StartTime that the network has spent at full health. For example, 95s of
// full health over a 100s monitoring window yields an uptime of 95%.
func (n *Network) RecalculateUptime() {
	n.mu.Lock()
	defer n.mu.Unlock()

	since := time.Since(n.UptimeData.StartTime)
	uptime := since - n.UptimeData.totalDownTime
	if n.Health != FullHealth {
		uptime -= time.Since(n.UptimeData.wentDown)
	}
	n.UptimeData.Uptime = (float64(uptime) / float64(since)) * 100.0
}

// NodeIsDown is called when the node disconnects for whatever reason.
// Must be safe to call multiple times.
func (n *Network) NodeIsDown(name string) {
	n.mu.Lock()
	defer n.mu.Unlock()

	if online, ok := n.nodeStatusMap[name]; !ok || online {
		n.nodeStatusMap[name] = false
		n.NumNodesMonitoredOnline--
		n.UptimeData.wentDown = time.Now()
		n.updateHealth()
	}
}

// NodeIsOnline is called when connection to the node is restored.
// Must be safe to call multiple times.
func (n *Network) NodeIsOnline(name string) {
	n.mu.Lock()
	defer n.mu.Unlock()

	if online, ok := n.nodeStatusMap[name]; ok && !online {
		n.nodeStatusMap[name] = true
		n.NumNodesMonitoredOnline++
		n.UptimeData.totalDownTime += time.Since(n.UptimeData.wentDown)
		n.updateHealth()
	}
}

// NewNode is called when a new node is added to the monitor.
func (n *Network) NewNode(name string) {
	n.mu.Lock()
	defer n.mu.Unlock()

	n.NumNodesMonitored++
	n.NumNodesMonitoredOnline++
	n.updateHealth()
}

// NodeDeleted is called when a node is deleted from under the monitor.
func (n *Network) NodeDeleted(name string) {
	n.mu.Lock()
	defer n.mu.Unlock()

	n.NumNodesMonitored--
	n.NumNodesMonitoredOnline--
	n.updateHealth()
}

// updateHealth must be called with the mutex held.
func (n *Network) updateHealth() {
	// If we are connected to all validators, we're at full health.
	// TODO: make sure they're all at the same height (within a block)
	// and all proposing (and possibly validating). Alternatively, just
	// check there hasn't been a new round in numValidators rounds.
	switch {
	case n.NumValidators != 0 && n.NumNodesMonitoredOnline == n.NumValidators:
		n.Health = FullHealth
	case n.NumNodesMonitoredOnline > 0 && n.NumNodesMonitoredOnline <= n.NumNodesMonitored:
		n.Health = ModerateHealth
	default:
		n.Health = Dead
	}
}

// UpdateNumValidatorsForHeight is called when the size of the validator set
// for a given height becomes known. It re-evaluates health unconditionally,
// so that the network no longer stays at moderate health once the validators
// are discovered.
func (n *Network) UpdateNumValidatorsForHeight(num int, height int64) {
	n.mu.Lock()
	defer n.mu.Unlock()

	if n.Height <= height {
		n.NumValidators = num
	}
	n.updateHealth()
}

// GetHealthString returns the health as a human-readable string.
func (n *Network) GetHealthString() string {
	switch n.Health {
	case FullHealth:
		return "full"
	case ModerateHealth:
		return "moderate"
	case Dead:
		return "dead"
	default:
		return "undefined"
	}
}

// Uptime returns the network's uptime as a percentage.
func (n *Network) Uptime() float64 {
	n.mu.Lock()
	defer n.mu.Unlock()

	return n.UptimeData.Uptime
}

// StartTime returns the time we started monitoring.
func (n *Network) StartTime() time.Time {
	return n.UptimeData.StartTime
}
```
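
For context, here is a hypothetical driver exercising this API end to end (illustrative only and not part of this change; the import path is assumed from the repository layout of the time):

```go
package main

import (
	"fmt"

	"github.com/tendermint/tendermint/tools/tm-monitor/monitor"
	tmtypes "github.com/tendermint/tendermint/types"
)

func main() {
	n := monitor.NewNetwork()

	// Register two monitored nodes; each registration re-evaluates health.
	n.NewNode("localhost:26657")
	n.NewNode("localhost:26660")

	// Before this PR, health stayed moderate here because the validator
	// count was zero when the nodes were registered. With the fix, learning
	// that both online nodes are validators upgrades health to full.
	n.UpdateNumValidatorsForHeight(2, 1)

	// Feed a block header; the meters update the rolling averages.
	n.NewBlock(tmtypes.Header{Height: 1, NumTxs: 5})

	n.RecalculateUptime()
	fmt.Printf("health=%s uptime=%.2f%%\n", n.GetHealthString(), n.Uptime())
}
```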