mirror of
https://github.com/fluencelabs/tendermint
synced 2025-06-13 13:21:20 +00:00
differentiate between monitored nodes and nodes in a cluster
This commit is contained in:
@ -41,7 +41,7 @@ func (m *Monitor) Monitor(n *Node) error {
|
|||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
m.Network.NumValidatorsOnline++
|
m.Network.NewNode(n.Name)
|
||||||
|
|
||||||
m.nodeQuit[n.Name] = make(chan struct{})
|
m.nodeQuit[n.Name] = make(chan struct{})
|
||||||
go m.listen(n.Name, blockCh, blockLatencyCh, disconnectCh, m.nodeQuit[n.Name])
|
go m.listen(n.Name, blockCh, blockLatencyCh, disconnectCh, m.nodeQuit[n.Name])
|
||||||
@ -50,7 +50,7 @@ func (m *Monitor) Monitor(n *Node) error {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (m *Monitor) Unmonitor(n *Node) {
|
func (m *Monitor) Unmonitor(n *Node) {
|
||||||
m.Network.NumValidatorsOnline--
|
m.Network.NodeDeleted(n.Name)
|
||||||
|
|
||||||
n.Stop()
|
n.Stop()
|
||||||
close(m.nodeQuit[n.Name])
|
close(m.nodeQuit[n.Name])
|
||||||
|
@ -8,19 +8,22 @@ import (
|
|||||||
tmtypes "github.com/tendermint/tendermint/types"
|
tmtypes "github.com/tendermint/tendermint/types"
|
||||||
)
|
)
|
||||||
|
|
||||||
// UptimeData stores data for how long network has been running
|
// UptimeData stores data for how long network has been running.
|
||||||
type UptimeData struct {
|
type UptimeData struct {
|
||||||
StartTime time.Time `json:"start_time"`
|
StartTime time.Time `json:"start_time"`
|
||||||
Uptime float64 `json:"uptime" wire:"unsafe"` // percentage of time we've been `ModerateHealth`y, ever
|
Uptime float64 `json:"uptime" wire:"unsafe"` // percentage of time we've been healthy, ever
|
||||||
|
|
||||||
totalDownTime time.Duration // total downtime (only updated when we come back online)
|
totalDownTime time.Duration // total downtime (only updated when we come back online)
|
||||||
wentDown time.Time
|
wentDown time.Time
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Health describes the health of the network. Note that this applies only to
|
||||||
|
// the observed nodes, and not to the entire cluster, which may consist of
|
||||||
|
// thousands of machines. This scope may change in the future.
|
||||||
type Health int
|
type Health int
|
||||||
|
|
||||||
const (
|
const (
|
||||||
// FullHealth means all validators online, synced, making blocks
|
// FullHealth means all nodes online, synced, validators making blocks
|
||||||
FullHealth = iota
|
FullHealth = iota
|
||||||
// ModerateHealth means we're making blocks
|
// ModerateHealth means we're making blocks
|
||||||
ModerateHealth
|
ModerateHealth
|
||||||
@ -39,9 +42,9 @@ type Network struct {
|
|||||||
AvgBlockLatency float64 `json:"avg_block_latency" wire:"unsafe"` // ms (avg over last minute)
|
AvgBlockLatency float64 `json:"avg_block_latency" wire:"unsafe"` // ms (avg over last minute)
|
||||||
blockLatencyMeter metrics.Meter
|
blockLatencyMeter metrics.Meter
|
||||||
|
|
||||||
// Network Info
|
|
||||||
NumValidators int `json:"num_validators"`
|
NumValidators int `json:"num_validators"`
|
||||||
NumValidatorsOnline int `json:"num_validators_online"`
|
NumNodesMonitored int `json:"num_nodes_monitored"`
|
||||||
|
NumNodesMonitoredOnline int `json:"num_nodes_monitored_online"`
|
||||||
|
|
||||||
Health Health `json:"health"`
|
Health Health `json:"health"`
|
||||||
|
|
||||||
@ -93,7 +96,7 @@ func (n *Network) NewBlock(b tmtypes.Header) {
|
|||||||
// TODO: make sure they're all at the same height (within a block)
|
// TODO: make sure they're all at the same height (within a block)
|
||||||
// and all proposing (and possibly validating ) Alternatively, just
|
// and all proposing (and possibly validating ) Alternatively, just
|
||||||
// check there hasn't been a new round in numValidators rounds
|
// check there hasn't been a new round in numValidators rounds
|
||||||
if n.NumValidatorsOnline == n.NumValidators {
|
if n.NumNodesMonitored == n.NumValidators {
|
||||||
n.Health = FullHealth
|
n.Health = FullHealth
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -119,36 +122,50 @@ func (n *Network) RecalculateUptime() {
|
|||||||
n.UptimeData.Uptime = (float64(uptime) / float64(since)) * 100.0
|
n.UptimeData.Uptime = (float64(uptime) / float64(since)) * 100.0
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// NodeIsDown is called when the node disconnects for whatever reason.
|
||||||
func (n *Network) NodeIsDown(name string) {
|
func (n *Network) NodeIsDown(name string) {
|
||||||
n.mu.Lock()
|
n.mu.Lock()
|
||||||
defer n.mu.Unlock()
|
defer n.mu.Unlock()
|
||||||
|
|
||||||
if online := n.nodeStatusMap[name]; online {
|
if online := n.nodeStatusMap[name]; online {
|
||||||
n.nodeStatusMap[name] = false
|
n.nodeStatusMap[name] = false
|
||||||
n.NumValidatorsOnline--
|
n.NumNodesMonitoredOnline--
|
||||||
n.UptimeData.wentDown = time.Now()
|
n.UptimeData.wentDown = time.Now()
|
||||||
n.updateHealth()
|
n.updateHealth()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// NodeIsOnline is called when connection to the node is restored.
|
||||||
func (n *Network) NodeIsOnline(name string) {
|
func (n *Network) NodeIsOnline(name string) {
|
||||||
n.mu.Lock()
|
n.mu.Lock()
|
||||||
defer n.mu.Unlock()
|
defer n.mu.Unlock()
|
||||||
|
|
||||||
if online, ok := n.nodeStatusMap[name]; !ok || !online {
|
if online, ok := n.nodeStatusMap[name]; !ok || !online {
|
||||||
n.nodeStatusMap[name] = true
|
n.nodeStatusMap[name] = true
|
||||||
n.NumValidatorsOnline++
|
n.NumNodesMonitoredOnline++
|
||||||
n.UptimeData.totalDownTime += time.Since(n.UptimeData.wentDown)
|
n.UptimeData.totalDownTime += time.Since(n.UptimeData.wentDown)
|
||||||
n.updateHealth()
|
n.updateHealth()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// NewNode is called when the new node is added to the monitor.
|
||||||
|
func (n *Network) NewNode(name string) {
|
||||||
|
n.NumNodesMonitored++
|
||||||
|
n.NumNodesMonitoredOnline++
|
||||||
|
}
|
||||||
|
|
||||||
|
// NodeDeleted is called when the node is deleted from under the monitor.
|
||||||
|
func (n *Network) NodeDeleted(name string) {
|
||||||
|
n.NumNodesMonitored--
|
||||||
|
n.NumNodesMonitoredOnline--
|
||||||
|
}
|
||||||
|
|
||||||
func (n *Network) updateHealth() {
|
func (n *Network) updateHealth() {
|
||||||
if n.NumValidatorsOnline < n.NumValidators {
|
if n.NumNodesMonitoredOnline < n.NumNodesMonitored {
|
||||||
n.Health = ModerateHealth
|
n.Health = ModerateHealth
|
||||||
}
|
}
|
||||||
|
|
||||||
if n.NumValidatorsOnline == 0 {
|
if n.NumNodesMonitoredOnline == 0 {
|
||||||
n.Health = Dead
|
n.Health = Dead
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -65,8 +65,7 @@ func (o *Ton) printHeader() {
|
|||||||
fmt.Fprintf(o.Output, "Avg block time: %.3f ms\n", n.AvgBlockTime)
|
fmt.Fprintf(o.Output, "Avg block time: %.3f ms\n", n.AvgBlockTime)
|
||||||
fmt.Fprintf(o.Output, "Avg Tx throughput: %.0f per sec\n", n.AvgTxThroughput)
|
fmt.Fprintf(o.Output, "Avg Tx throughput: %.0f per sec\n", n.AvgTxThroughput)
|
||||||
fmt.Fprintf(o.Output, "Avg block latency: %.3f ms\n", n.AvgBlockLatency)
|
fmt.Fprintf(o.Output, "Avg block latency: %.3f ms\n", n.AvgBlockLatency)
|
||||||
fmt.Fprintf(o.Output, "Validators: %d online / %d total ", n.NumValidatorsOnline, n.NumValidators)
|
fmt.Fprintf(o.Output, "Nodes: %d from %d online (health: %s) Validators: %d\n", n.NumNodesMonitoredOnline, n.NumNodesMonitored, n.GetHealthString(), n.NumValidators)
|
||||||
fmt.Fprintf(o.Output, "Health: %s\n", n.GetHealthString())
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func (o *Ton) printTable() {
|
func (o *Ton) printTable() {
|
||||||
|
Reference in New Issue
Block a user