mirror of
https://github.com/fluencelabs/tendermint
synced 2025-06-10 20:01:20 +00:00
differentiate between monitored nodes and nodes in a cluster
This commit is contained in:
@ -41,7 +41,7 @@ func (m *Monitor) Monitor(n *Node) error {
|
||||
return err
|
||||
}
|
||||
|
||||
m.Network.NumValidatorsOnline++
|
||||
m.Network.NewNode(n.Name)
|
||||
|
||||
m.nodeQuit[n.Name] = make(chan struct{})
|
||||
go m.listen(n.Name, blockCh, blockLatencyCh, disconnectCh, m.nodeQuit[n.Name])
|
||||
@ -50,7 +50,7 @@ func (m *Monitor) Monitor(n *Node) error {
|
||||
}
|
||||
|
||||
func (m *Monitor) Unmonitor(n *Node) {
|
||||
m.Network.NumValidatorsOnline--
|
||||
m.Network.NodeDeleted(n.Name)
|
||||
|
||||
n.Stop()
|
||||
close(m.nodeQuit[n.Name])
|
||||
|
@ -8,19 +8,22 @@ import (
|
||||
tmtypes "github.com/tendermint/tendermint/types"
|
||||
)
|
||||
|
||||
// UptimeData stores data for how long network has been running
|
||||
// UptimeData stores data for how long network has been running.
|
||||
type UptimeData struct {
|
||||
StartTime time.Time `json:"start_time"`
|
||||
Uptime float64 `json:"uptime" wire:"unsafe"` // percentage of time we've been `ModerateHealth`y, ever
|
||||
Uptime float64 `json:"uptime" wire:"unsafe"` // percentage of time we've been healthy, ever
|
||||
|
||||
totalDownTime time.Duration // total downtime (only updated when we come back online)
|
||||
wentDown time.Time
|
||||
}
|
||||
|
||||
// Health describes the health of the network. Note that this applies only to
|
||||
// the observed nodes, and not to the entire cluster, which may consist of
|
||||
// thousands of machines. It may change in the future.
|
||||
type Health int
|
||||
|
||||
const (
|
||||
// FullHealth means all validators online, synced, making blocks
|
||||
// FullHealth means all nodes online, synced, validators making blocks
|
||||
FullHealth = iota
|
||||
// ModerateHealth means we're making blocks
|
||||
ModerateHealth
|
||||
@ -39,9 +42,9 @@ type Network struct {
|
||||
AvgBlockLatency float64 `json:"avg_block_latency" wire:"unsafe"` // ms (avg over last minute)
|
||||
blockLatencyMeter metrics.Meter
|
||||
|
||||
// Network Info
|
||||
NumValidators int `json:"num_validators"`
|
||||
NumValidatorsOnline int `json:"num_validators_online"`
|
||||
NumValidators int `json:"num_validators"`
|
||||
NumNodesMonitored int `json:"num_nodes_monitored"`
|
||||
NumNodesMonitoredOnline int `json:"num_nodes_monitored_online"`
|
||||
|
||||
Health Health `json:"health"`
|
||||
|
||||
@ -93,7 +96,7 @@ func (n *Network) NewBlock(b tmtypes.Header) {
|
||||
// TODO: make sure they're all at the same height (within a block)
|
||||
// and all proposing (and possibly validating ) Alternatively, just
|
||||
// check there hasn't been a new round in numValidators rounds
|
||||
if n.NumValidatorsOnline == n.NumValidators {
|
||||
if n.NumNodesMonitored == n.NumValidators {
|
||||
n.Health = FullHealth
|
||||
}
|
||||
}
|
||||
@ -119,36 +122,50 @@ func (n *Network) RecalculateUptime() {
|
||||
n.UptimeData.Uptime = (float64(uptime) / float64(since)) * 100.0
|
||||
}
|
||||
|
||||
// NodeIsDown is called when the node disconnects for whatever reason.
|
||||
func (n *Network) NodeIsDown(name string) {
|
||||
n.mu.Lock()
|
||||
defer n.mu.Unlock()
|
||||
|
||||
if online := n.nodeStatusMap[name]; online {
|
||||
n.nodeStatusMap[name] = false
|
||||
n.NumValidatorsOnline--
|
||||
n.NumNodesMonitoredOnline--
|
||||
n.UptimeData.wentDown = time.Now()
|
||||
n.updateHealth()
|
||||
}
|
||||
}
|
||||
|
||||
// NodeIsOnline is called when connection to the node is restored.
|
||||
func (n *Network) NodeIsOnline(name string) {
|
||||
n.mu.Lock()
|
||||
defer n.mu.Unlock()
|
||||
|
||||
if online, ok := n.nodeStatusMap[name]; !ok || !online {
|
||||
n.nodeStatusMap[name] = true
|
||||
n.NumValidatorsOnline++
|
||||
n.NumNodesMonitoredOnline++
|
||||
n.UptimeData.totalDownTime += time.Since(n.UptimeData.wentDown)
|
||||
n.updateHealth()
|
||||
}
|
||||
}
|
||||
|
||||
// NewNode is called when the new node is added to the monitor.
|
||||
func (n *Network) NewNode(name string) {
|
||||
n.NumNodesMonitored++
|
||||
n.NumNodesMonitoredOnline++
|
||||
}
|
||||
|
||||
// NodeDeleted is called when the node is deleted from under the monitor.
|
||||
func (n *Network) NodeDeleted(name string) {
|
||||
n.NumNodesMonitored--
|
||||
n.NumNodesMonitoredOnline--
|
||||
}
|
||||
|
||||
func (n *Network) updateHealth() {
|
||||
if n.NumValidatorsOnline < n.NumValidators {
|
||||
if n.NumNodesMonitoredOnline < n.NumNodesMonitored {
|
||||
n.Health = ModerateHealth
|
||||
}
|
||||
|
||||
if n.NumValidatorsOnline == 0 {
|
||||
if n.NumNodesMonitoredOnline == 0 {
|
||||
n.Health = Dead
|
||||
}
|
||||
}
|
||||
|
@ -65,8 +65,7 @@ func (o *Ton) printHeader() {
|
||||
fmt.Fprintf(o.Output, "Avg block time: %.3f ms\n", n.AvgBlockTime)
|
||||
fmt.Fprintf(o.Output, "Avg Tx throughput: %.0f per sec\n", n.AvgTxThroughput)
|
||||
fmt.Fprintf(o.Output, "Avg block latency: %.3f ms\n", n.AvgBlockLatency)
|
||||
fmt.Fprintf(o.Output, "Validators: %d online / %d total ", n.NumValidatorsOnline, n.NumValidators)
|
||||
fmt.Fprintf(o.Output, "Health: %s\n", n.GetHealthString())
|
||||
fmt.Fprintf(o.Output, "Nodes: %d from %d online (health: %s) Validators: %d\n", n.NumNodesMonitoredOnline, n.NumNodesMonitored, n.GetHealthString(), n.NumValidators)
|
||||
}
|
||||
|
||||
func (o *Ton) printTable() {
|
||||
|
Reference in New Issue
Block a user