differentiate between monitored nodes and nodes in a cluster

This commit is contained in:
Anton Kaliaev
2017-03-02 18:59:12 +04:00
parent 8c6ae55bd0
commit 069c870614
3 changed files with 31 additions and 15 deletions

View File

@ -41,7 +41,7 @@ func (m *Monitor) Monitor(n *Node) error {
return err return err
} }
m.Network.NumValidatorsOnline++ m.Network.NewNode(n.Name)
m.nodeQuit[n.Name] = make(chan struct{}) m.nodeQuit[n.Name] = make(chan struct{})
go m.listen(n.Name, blockCh, blockLatencyCh, disconnectCh, m.nodeQuit[n.Name]) go m.listen(n.Name, blockCh, blockLatencyCh, disconnectCh, m.nodeQuit[n.Name])
@ -50,7 +50,7 @@ func (m *Monitor) Monitor(n *Node) error {
} }
func (m *Monitor) Unmonitor(n *Node) { func (m *Monitor) Unmonitor(n *Node) {
m.Network.NumValidatorsOnline-- m.Network.NodeDeleted(n.Name)
n.Stop() n.Stop()
close(m.nodeQuit[n.Name]) close(m.nodeQuit[n.Name])

View File

@ -8,19 +8,22 @@ import (
tmtypes "github.com/tendermint/tendermint/types" tmtypes "github.com/tendermint/tendermint/types"
) )
// UptimeData stores data for how long network has been running // UptimeData stores data for how long network has been running.
type UptimeData struct { type UptimeData struct {
StartTime time.Time `json:"start_time"` StartTime time.Time `json:"start_time"`
Uptime float64 `json:"uptime" wire:"unsafe"` // percentage of time we've been `ModerateHealth`y, ever Uptime float64 `json:"uptime" wire:"unsafe"` // percentage of time we've been healthy, ever
totalDownTime time.Duration // total downtime (only updated when we come back online) totalDownTime time.Duration // total downtime (only updated when we come back online)
wentDown time.Time wentDown time.Time
} }
// Health describes the health of the network. Note that this applies only to
// the observed nodes, and not to the entire cluster, which may consist of
// thousands of machines. It may change in the future.
type Health int type Health int
const ( const (
// FullHealth means all validators online, synced, making blocks // FullHealth means all nodes online, synced, validators making blocks
FullHealth = iota FullHealth = iota
// ModerateHealth means we're making blocks // ModerateHealth means we're making blocks
ModerateHealth ModerateHealth
@ -39,9 +42,9 @@ type Network struct {
AvgBlockLatency float64 `json:"avg_block_latency" wire:"unsafe"` // ms (avg over last minute) AvgBlockLatency float64 `json:"avg_block_latency" wire:"unsafe"` // ms (avg over last minute)
blockLatencyMeter metrics.Meter blockLatencyMeter metrics.Meter
// Network Info
NumValidators int `json:"num_validators"` NumValidators int `json:"num_validators"`
NumValidatorsOnline int `json:"num_validators_online"` NumNodesMonitored int `json:"num_nodes_monitored"`
NumNodesMonitoredOnline int `json:"num_nodes_monitored_online"`
Health Health `json:"health"` Health Health `json:"health"`
@ -93,7 +96,7 @@ func (n *Network) NewBlock(b tmtypes.Header) {
// TODO: make sure they're all at the same height (within a block) // TODO: make sure they're all at the same height (within a block)
// and all proposing (and possibly validating ) Alternatively, just // and all proposing (and possibly validating ) Alternatively, just
// check there hasn't been a new round in numValidators rounds // check there hasn't been a new round in numValidators rounds
if n.NumValidatorsOnline == n.NumValidators { if n.NumNodesMonitored == n.NumValidators {
n.Health = FullHealth n.Health = FullHealth
} }
} }
@ -119,36 +122,50 @@ func (n *Network) RecalculateUptime() {
n.UptimeData.Uptime = (float64(uptime) / float64(since)) * 100.0 n.UptimeData.Uptime = (float64(uptime) / float64(since)) * 100.0
} }
// NodeIsDown is called when the node disconnects for whatever reason.
func (n *Network) NodeIsDown(name string) { func (n *Network) NodeIsDown(name string) {
n.mu.Lock() n.mu.Lock()
defer n.mu.Unlock() defer n.mu.Unlock()
if online := n.nodeStatusMap[name]; online { if online := n.nodeStatusMap[name]; online {
n.nodeStatusMap[name] = false n.nodeStatusMap[name] = false
n.NumValidatorsOnline-- n.NumNodesMonitoredOnline--
n.UptimeData.wentDown = time.Now() n.UptimeData.wentDown = time.Now()
n.updateHealth() n.updateHealth()
} }
} }
// NodeIsOnline is called when connection to the node is restored.
func (n *Network) NodeIsOnline(name string) { func (n *Network) NodeIsOnline(name string) {
n.mu.Lock() n.mu.Lock()
defer n.mu.Unlock() defer n.mu.Unlock()
if online, ok := n.nodeStatusMap[name]; !ok || !online { if online, ok := n.nodeStatusMap[name]; !ok || !online {
n.nodeStatusMap[name] = true n.nodeStatusMap[name] = true
n.NumValidatorsOnline++ n.NumNodesMonitoredOnline++
n.UptimeData.totalDownTime += time.Since(n.UptimeData.wentDown) n.UptimeData.totalDownTime += time.Since(n.UptimeData.wentDown)
n.updateHealth() n.updateHealth()
} }
} }
// NewNode is called when a new node is added to the monitor. The node is
// counted as both monitored and online, and it is recorded as online in
// nodeStatusMap so that a later NodeIsDown for the same name takes effect and
// a later NodeIsOnline does not count the node a second time.
//
// name is the node's name, the same key NodeIsDown and NodeIsOnline use.
func (n *Network) NewNode(name string) {
	// NodeIsDown and NodeIsOnline mutate these counters and nodeStatusMap
	// while holding n.mu; take the same lock here.
	n.mu.Lock()
	defer n.mu.Unlock()

	n.nodeStatusMap[name] = true
	n.NumNodesMonitored++
	n.NumNodesMonitoredOnline++
	n.updateHealth()
}
// NodeDeleted is called when a node is removed from the monitor. The
// monitored count always drops by one; the online count drops only if the
// node was still counted as online, so a node that had already been reported
// down via NodeIsDown is not subtracted twice. The node's entry in
// nodeStatusMap is removed.
//
// name is the node's name, the same key NodeIsDown and NodeIsOnline use.
func (n *Network) NodeDeleted(name string) {
	// NodeIsDown and NodeIsOnline mutate these counters and nodeStatusMap
	// while holding n.mu; take the same lock here.
	n.mu.Lock()
	defer n.mu.Unlock()

	// A node with no recorded status has never been reported down, so it is
	// still included in the online count.
	online, known := n.nodeStatusMap[name]
	delete(n.nodeStatusMap, name)

	n.NumNodesMonitored--
	if !known || online {
		n.NumNodesMonitoredOnline--
	}
	n.updateHealth()
}
func (n *Network) updateHealth() { func (n *Network) updateHealth() {
if n.NumValidatorsOnline < n.NumValidators { if n.NumNodesMonitoredOnline < n.NumNodesMonitored {
n.Health = ModerateHealth n.Health = ModerateHealth
} }
if n.NumValidatorsOnline == 0 { if n.NumNodesMonitoredOnline == 0 {
n.Health = Dead n.Health = Dead
} }
} }

View File

@ -65,8 +65,7 @@ func (o *Ton) printHeader() {
fmt.Fprintf(o.Output, "Avg block time: %.3f ms\n", n.AvgBlockTime) fmt.Fprintf(o.Output, "Avg block time: %.3f ms\n", n.AvgBlockTime)
fmt.Fprintf(o.Output, "Avg Tx throughput: %.0f per sec\n", n.AvgTxThroughput) fmt.Fprintf(o.Output, "Avg Tx throughput: %.0f per sec\n", n.AvgTxThroughput)
fmt.Fprintf(o.Output, "Avg block latency: %.3f ms\n", n.AvgBlockLatency) fmt.Fprintf(o.Output, "Avg block latency: %.3f ms\n", n.AvgBlockLatency)
fmt.Fprintf(o.Output, "Validators: %d online / %d total ", n.NumValidatorsOnline, n.NumValidators) fmt.Fprintf(o.Output, "Nodes: %d from %d online (health: %s) Validators: %d\n", n.NumNodesMonitoredOnline, n.NumNodesMonitored, n.GetHealthString(), n.NumValidators)
fmt.Fprintf(o.Output, "Health: %s\n", n.GetHealthString())
} }
func (o *Ton) printTable() { func (o *Ton) printTable() {