mirror of https://github.com/fluencelabs/tendermint, synced 2025-06-12 12:51:22 +00:00
mv tools files to tools repo
251  tools/tm-monitor/monitor/monitor.go  Normal file
@@ -0,0 +1,251 @@
package monitor

import (
	"fmt"
	"math/rand"
	"sync"
	"time"

	"github.com/pkg/errors"
	tmtypes "github.com/tendermint/tendermint/types"
	"github.com/tendermint/tmlibs/log"
)

// waiting more than this many seconds for a block means we're unhealthy
const nodeLivenessTimeout = 5 * time.Second

// Monitor keeps track of the nodes and updates common statistics upon
// receiving new events from nodes.
//
// Common statistics are stored in the Network struct.
type Monitor struct {
	mtx   sync.Mutex
	Nodes []*Node

	Network *Network

	monitorQuit chan struct{}            // monitor exiting
	nodeQuit    map[string]chan struct{} // node is being stopped and removed from under the monitor

	recalculateNetworkUptimeEvery time.Duration
	numValidatorsUpdateInterval   time.Duration

	logger log.Logger
}

// NewMonitor creates a new instance of a Monitor. You can provide options to
// change some default values.
//
// Example:
//	NewMonitor(monitor.SetNumValidatorsUpdateInterval(1 * time.Second))
func NewMonitor(options ...func(*Monitor)) *Monitor {
	m := &Monitor{
		Nodes:                         make([]*Node, 0),
		Network:                       NewNetwork(),
		monitorQuit:                   make(chan struct{}),
		nodeQuit:                      make(map[string]chan struct{}),
		recalculateNetworkUptimeEvery: 10 * time.Second,
		numValidatorsUpdateInterval:   5 * time.Second,
		logger:                        log.NewNopLogger(),
	}

	for _, option := range options {
		option(m)
	}

	return m
}

// RecalculateNetworkUptimeEvery lets you change the network uptime update interval.
func RecalculateNetworkUptimeEvery(d time.Duration) func(m *Monitor) {
	return func(m *Monitor) {
		m.recalculateNetworkUptimeEvery = d
	}
}

// SetNumValidatorsUpdateInterval lets you change the num validators update interval.
func SetNumValidatorsUpdateInterval(d time.Duration) func(m *Monitor) {
	return func(m *Monitor) {
		m.numValidatorsUpdateInterval = d
	}
}

// SetLogger lets you set your own logger.
func (m *Monitor) SetLogger(l log.Logger) {
	m.logger = l
}

// Monitor begins to monitor the node `n`. The node will be started and added
// to the monitor.
func (m *Monitor) Monitor(n *Node) error {
	m.mtx.Lock()
	m.Nodes = append(m.Nodes, n)
	m.mtx.Unlock()

	blockCh := make(chan tmtypes.Header, 10)
	n.SendBlocksTo(blockCh)
	blockLatencyCh := make(chan float64, 10)
	n.SendBlockLatenciesTo(blockLatencyCh)
	disconnectCh := make(chan bool, 10)
	n.NotifyAboutDisconnects(disconnectCh)

	if err := n.Start(); err != nil {
		return err
	}

	m.Network.NewNode(n.Name)

	m.nodeQuit[n.Name] = make(chan struct{})
	go m.listen(n.Name, blockCh, blockLatencyCh, disconnectCh, m.nodeQuit[n.Name])

	return nil
}

// Unmonitor stops monitoring node `n`. The node will be stopped and removed
// from the monitor.
func (m *Monitor) Unmonitor(n *Node) {
	m.Network.NodeDeleted(n.Name)

	n.Stop()
	close(m.nodeQuit[n.Name])
	delete(m.nodeQuit, n.Name)
	i, _ := m.NodeByName(n.Name)

	m.mtx.Lock()
	m.Nodes[i] = m.Nodes[len(m.Nodes)-1]
	m.Nodes = m.Nodes[:len(m.Nodes)-1]
	m.mtx.Unlock()
}

// NodeByName returns the node and its index if such a node exists within the
// monitor. Otherwise, -1 and nil are returned.
func (m *Monitor) NodeByName(name string) (index int, node *Node) {
	m.mtx.Lock()
	defer m.mtx.Unlock()

	for i, n := range m.Nodes {
		if name == n.Name {
			return i, n
		}
	}
	return -1, nil
}

// NodeIsOnline is called when connection to the node is restored.
// Must be safe to call multiple times.
func (m *Monitor) NodeIsOnline(name string) {
	_, node := m.NodeByName(name)
	if nil != node {
		if online, ok := m.Network.nodeStatusMap[name]; ok && online {
			m.mtx.Lock()
			node.Online = online
			m.mtx.Unlock()
		}
	}
}

// Start starts the monitor's routines: recalculating network uptime and
// updating the number of validators.
func (m *Monitor) Start() error {
	go m.recalculateNetworkUptimeLoop()
	go m.updateNumValidatorLoop()

	return nil
}

// Stop stops the monitor's routines.
func (m *Monitor) Stop() {
	close(m.monitorQuit)

	for _, n := range m.Nodes {
		m.Unmonitor(n)
	}
}

// main loop where we listen for events from the node
func (m *Monitor) listen(nodeName string, blockCh <-chan tmtypes.Header, blockLatencyCh <-chan float64, disconnectCh <-chan bool, quit <-chan struct{}) {
	logger := m.logger.With("node", nodeName)

	for {
		select {
		case <-quit:
			return
		case b := <-blockCh:
			m.Network.NewBlock(b)
			m.Network.NodeIsOnline(nodeName)
			m.NodeIsOnline(nodeName)
		case l := <-blockLatencyCh:
			m.Network.NewBlockLatency(l)
			m.Network.NodeIsOnline(nodeName)
			m.NodeIsOnline(nodeName)
		case disconnected := <-disconnectCh:
			if disconnected {
				m.Network.NodeIsDown(nodeName)
			} else {
				m.Network.NodeIsOnline(nodeName)
				m.NodeIsOnline(nodeName)
			}
		case <-time.After(nodeLivenessTimeout):
			logger.Info("event", fmt.Sprintf("node was not responding for %v", nodeLivenessTimeout))
			m.Network.NodeIsDown(nodeName)
		}
	}
}

// recalculateNetworkUptimeLoop recalculates network uptime every N seconds.
func (m *Monitor) recalculateNetworkUptimeLoop() {
	for {
		select {
		case <-m.monitorQuit:
			return
		case <-time.After(m.recalculateNetworkUptimeEvery):
			m.Network.RecalculateUptime()
		}
	}
}

// updateNumValidatorLoop sends a request to a random node once every N seconds,
// which in turn makes an RPC call to get the latest validators.
func (m *Monitor) updateNumValidatorLoop() {
	rand.Seed(time.Now().Unix())

	var height int64
	var num int
	var err error

	for {
		m.mtx.Lock()
		nodesCount := len(m.Nodes)
		m.mtx.Unlock()
		if 0 == nodesCount {
			time.Sleep(m.numValidatorsUpdateInterval)
			continue
		}

		randomNodeIndex := rand.Intn(nodesCount)

		select {
		case <-m.monitorQuit:
			return
		case <-time.After(m.numValidatorsUpdateInterval):
			i := 0

			m.mtx.Lock()
			for _, n := range m.Nodes {
				if i == randomNodeIndex {
					height, num, err = n.NumValidators()
					if err != nil {
						m.logger.Info("err", errors.Wrap(err, "update num validators failed"))
					}
					break
				}
				i++
			}
			m.mtx.Unlock()

			m.Network.UpdateNumValidatorsForHeight(num, height)
		}
	}
}
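NewMonitor above uses the functional options pattern: each option is a func(*Monitor) applied after defaults are set. A hypothetical wiring sketch based only on the API in this file (the main package and the 30-second wait are illustrative, and it assumes a Tendermint node is actually listening on the given RPC address):

	package main

	import (
		"time"

		monitor "github.com/tendermint/tools/tm-monitor/monitor"
	)

	func main() {
		// Construct a Monitor, overriding the defaults via functional options.
		m := monitor.NewMonitor(
			monitor.RecalculateNetworkUptimeEvery(5*time.Second),
			monitor.SetNumValidatorsUpdateInterval(10*time.Second),
		)
		if err := m.Start(); err != nil { // starts the uptime and validator loops
			panic(err)
		}
		defer m.Stop()

		// Monitor starts the node and begins listening for its events.
		n := monitor.NewNode("tcp://127.0.0.1:26657")
		if err := m.Monitor(n); err != nil {
			panic(err)
		}

		time.Sleep(30 * time.Second) // a real program would wait for a shutdown signal
	}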
72  tools/tm-monitor/monitor/monitor_test.go  Normal file
@@ -0,0 +1,72 @@
package monitor_test

import (
	"testing"
	"time"

	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"

	crypto "github.com/tendermint/go-crypto"
	ctypes "github.com/tendermint/tendermint/rpc/core/types"
	tmtypes "github.com/tendermint/tendermint/types"
	mock "github.com/tendermint/tools/tm-monitor/mock"
	monitor "github.com/tendermint/tools/tm-monitor/monitor"
	"github.com/tendermint/go-amino"
)

func TestMonitorUpdatesNumberOfValidators(t *testing.T) {
	m := startMonitor(t)
	defer m.Stop()

	n, _ := createValidatorNode(t)
	m.Monitor(n)
	assert.Equal(t, 1, m.Network.NumNodesMonitored)
	assert.Equal(t, 1, m.Network.NumNodesMonitoredOnline)

	time.Sleep(1 * time.Second)

	// DATA RACE
	// assert.Equal(t, 1, m.Network.NumValidators())
}

func TestMonitorRecalculatesNetworkUptime(t *testing.T) {
	m := startMonitor(t)
	defer m.Stop()
	assert.Equal(t, 100.0, m.Network.Uptime())

	n, _ := createValidatorNode(t)
	m.Monitor(n)

	m.Network.NodeIsDown(n.Name) // simulate node failure
	time.Sleep(200 * time.Millisecond)
	m.Network.NodeIsOnline(n.Name)
	time.Sleep(1 * time.Second)

	assert.True(t, m.Network.Uptime() < 100.0, "Uptime should be less than 100%")
}

func startMonitor(t *testing.T) *monitor.Monitor {
	m := monitor.NewMonitor(
		monitor.SetNumValidatorsUpdateInterval(200*time.Millisecond),
		monitor.RecalculateNetworkUptimeEvery(200*time.Millisecond),
	)
	err := m.Start()
	require.Nil(t, err)
	return m
}

func createValidatorNode(t *testing.T) (n *monitor.Node, emMock *mock.EventMeter) {
	emMock = &mock.EventMeter{}

	stubs := make(map[string]interface{})
	pubKey := crypto.GenPrivKeyEd25519().PubKey()
	stubs["validators"] = ctypes.ResultValidators{BlockHeight: blockHeight, Validators: []*tmtypes.Validator{tmtypes.NewValidator(pubKey, 0)}}
	stubs["status"] = ctypes.ResultStatus{ValidatorInfo: ctypes.ValidatorInfo{PubKey: pubKey}}
	cdc := amino.NewCodec()
	rpcClientMock := &mock.RpcClient{Stubs: stubs}
	rpcClientMock.SetCodec(cdc)

	n = monitor.NewNodeWithEventMeterAndRpcClient("tcp://127.0.0.1:26657", emMock, rpcClientMock)
	return
}
199  tools/tm-monitor/monitor/network.go  Normal file
@@ -0,0 +1,199 @@
package monitor

import (
	"sync"
	"time"

	metrics "github.com/rcrowley/go-metrics"
	tmtypes "github.com/tendermint/tendermint/types"
)

// UptimeData stores data for how long the network has been running.
type UptimeData struct {
	StartTime time.Time `json:"start_time"`
	Uptime    float64   `json:"uptime" amino:"unsafe"` // percentage of time we've been healthy, ever

	totalDownTime time.Duration // total downtime (only updated when we come back online)
	wentDown      time.Time
}

// Health describes the health of the network. Note that this applies only to
// the observed nodes, and not to the entire cluster, which may consist of
// thousands of machines. It may change in the future.
type Health int

const (
	// FullHealth means all nodes online, synced, validators making blocks
	FullHealth = Health(0)
	// ModerateHealth means we're making blocks
	ModerateHealth = Health(1)
	// Dead means we're not making blocks due to all validators freezing or crashing
	Dead = Health(2)
)

// Network holds common statistics for a network of nodes.
type Network struct {
	Height int64 `json:"height"`

	AvgBlockTime      float64 `json:"avg_block_time" amino:"unsafe"` // ms (avg over last minute)
	blockTimeMeter    metrics.Meter
	AvgTxThroughput   float64 `json:"avg_tx_throughput" amino:"unsafe"` // tx/s (avg over last minute)
	txThroughputMeter metrics.Meter
	AvgBlockLatency   float64 `json:"avg_block_latency" amino:"unsafe"` // ms (avg over last minute)
	blockLatencyMeter metrics.Meter

	NumValidators           int `json:"num_validators"`
	NumNodesMonitored       int `json:"num_nodes_monitored"`
	NumNodesMonitoredOnline int `json:"num_nodes_monitored_online"`

	Health Health `json:"health"`

	UptimeData *UptimeData `json:"uptime_data"`

	nodeStatusMap map[string]bool

	mu sync.Mutex
}

func NewNetwork() *Network {
	return &Network{
		blockTimeMeter:    metrics.NewMeter(),
		txThroughputMeter: metrics.NewMeter(),
		blockLatencyMeter: metrics.NewMeter(),
		Health:            FullHealth,
		UptimeData: &UptimeData{
			StartTime: time.Now(),
			Uptime:    100.0,
		},
		nodeStatusMap: make(map[string]bool),
	}
}

func (n *Network) NewBlock(b tmtypes.Header) {
	n.mu.Lock()
	defer n.mu.Unlock()

	if n.Height >= b.Height {
		return
	}

	n.Height = b.Height

	n.blockTimeMeter.Mark(1)
	if n.blockTimeMeter.Rate1() > 0.0 {
		n.AvgBlockTime = (1.0 / n.blockTimeMeter.Rate1()) * 1000 // 1/s to ms
	} else {
		n.AvgBlockTime = 0.0
	}
	n.txThroughputMeter.Mark(int64(b.NumTxs))
	n.AvgTxThroughput = n.txThroughputMeter.Rate1()
}

func (n *Network) NewBlockLatency(l float64) {
	n.mu.Lock()
	defer n.mu.Unlock()

	n.blockLatencyMeter.Mark(int64(l))
	n.AvgBlockLatency = n.blockLatencyMeter.Rate1() / 1000000.0 // ns to ms
}

// RecalculateUptime calculates uptime on demand.
func (n *Network) RecalculateUptime() {
	n.mu.Lock()
	defer n.mu.Unlock()

	since := time.Since(n.UptimeData.StartTime)
	uptime := since - n.UptimeData.totalDownTime
	if n.Health != FullHealth {
		uptime -= time.Since(n.UptimeData.wentDown)
	}
	n.UptimeData.Uptime = (float64(uptime) / float64(since)) * 100.0
}

// NodeIsDown is called when the node disconnects for whatever reason.
// Must be safe to call multiple times.
func (n *Network) NodeIsDown(name string) {
	n.mu.Lock()
	defer n.mu.Unlock()

	if online, ok := n.nodeStatusMap[name]; !ok || online {
		n.nodeStatusMap[name] = false
		n.NumNodesMonitoredOnline--
		n.UptimeData.wentDown = time.Now()
		n.updateHealth()
	}
}

// NodeIsOnline is called when connection to the node is restored.
// Must be safe to call multiple times.
func (n *Network) NodeIsOnline(name string) {
	n.mu.Lock()
	defer n.mu.Unlock()

	if online, ok := n.nodeStatusMap[name]; ok && !online {
		n.nodeStatusMap[name] = true
		n.NumNodesMonitoredOnline++
		n.UptimeData.totalDownTime += time.Since(n.UptimeData.wentDown)
		n.updateHealth()
	}
}

// NewNode is called when a new node is added to the monitor.
func (n *Network) NewNode(name string) {
	n.NumNodesMonitored++
	n.NumNodesMonitoredOnline++
}

// NodeDeleted is called when the node is deleted from under the monitor.
func (n *Network) NodeDeleted(name string) {
	n.NumNodesMonitored--
	n.NumNodesMonitoredOnline--
}

func (n *Network) updateHealth() {
	// If we are connected to all validators, we're at full health.
	// TODO: make sure they're all at the same height (within a block)
	// and all proposing (and possibly validating). Alternatively, just
	// check there hasn't been a new round in numValidators rounds.
	if n.NumValidators != 0 && n.NumNodesMonitoredOnline == n.NumValidators {
		n.Health = FullHealth
	} else if n.NumNodesMonitoredOnline > 0 && n.NumNodesMonitoredOnline <= n.NumNodesMonitored {
		n.Health = ModerateHealth
	} else {
		n.Health = Dead
	}
}

func (n *Network) UpdateNumValidatorsForHeight(num int, height int64) {
	n.mu.Lock()
	defer n.mu.Unlock()

	if n.Height <= height {
		n.NumValidators = num
	}
}

func (n *Network) GetHealthString() string {
	switch n.Health {
	case FullHealth:
		return "full"
	case ModerateHealth:
		return "moderate"
	case Dead:
		return "dead"
	default:
		return "undefined"
	}
}

// Uptime returns the network's uptime as a percentage.
func (n *Network) Uptime() float64 {
	n.mu.Lock()
	defer n.mu.Unlock()
	return n.UptimeData.Uptime
}

// StartTime returns the time we started monitoring.
func (n *Network) StartTime() time.Time {
	return n.UptimeData.StartTime
}
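RecalculateUptime derives uptime as (elapsed − accumulated downtime − the current outage, if one is in progress) / elapsed × 100. A standalone sketch of the same arithmetic with made-up numbers, covering the no-outage-in-progress case:

	package main

	import (
		"fmt"
		"time"
	)

	func main() {
		// Hypothetical inputs: monitoring for 10 minutes, 30s of it spent down.
		since := 10 * time.Minute
		totalDownTime := 30 * time.Second

		// Same arithmetic as RecalculateUptime when Health == FullHealth.
		uptime := since - totalDownTime
		pct := (float64(uptime) / float64(since)) * 100.0
		fmt.Printf("uptime: %.2f%%\n", pct) // prints "uptime: 95.00%"
	}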
79  tools/tm-monitor/monitor/network_test.go  Normal file
@@ -0,0 +1,79 @@
package monitor_test

import (
	"testing"
	"time"

	"github.com/stretchr/testify/assert"
	tmtypes "github.com/tendermint/tendermint/types"
	monitor "github.com/tendermint/tools/tm-monitor/monitor"
)

func TestNetworkNewBlock(t *testing.T) {
	n := monitor.NewNetwork()

	n.NewBlock(tmtypes.Header{Height: 5, NumTxs: 100})
	assert.Equal(t, int64(5), n.Height)
	assert.Equal(t, 0.0, n.AvgBlockTime)
	assert.Equal(t, 0.0, n.AvgTxThroughput)
}

func TestNetworkNewBlockLatency(t *testing.T) {
	n := monitor.NewNetwork()

	n.NewBlockLatency(9000000.0) // nanoseconds
	assert.Equal(t, 0.0, n.AvgBlockLatency)
}

func TestNetworkNodeIsDownThenOnline(t *testing.T) {
	n := monitor.NewNetwork()
	n.NewNode("test")

	n.NodeIsDown("test")
	assert.Equal(t, 0, n.NumNodesMonitoredOnline)
	assert.Equal(t, monitor.Dead, n.Health)
	n.NodeIsDown("test")
	assert.Equal(t, 0, n.NumNodesMonitoredOnline)

	n.NodeIsOnline("test")
	assert.Equal(t, 1, n.NumNodesMonitoredOnline)
	assert.Equal(t, monitor.ModerateHealth, n.Health)
	n.NodeIsOnline("test")
	assert.Equal(t, 1, n.NumNodesMonitoredOnline)
}

func TestNetworkNewNode(t *testing.T) {
	n := monitor.NewNetwork()
	assert.Equal(t, 0, n.NumNodesMonitored)
	assert.Equal(t, 0, n.NumNodesMonitoredOnline)
	n.NewNode("test")
	assert.Equal(t, 1, n.NumNodesMonitored)
	assert.Equal(t, 1, n.NumNodesMonitoredOnline)
}

func TestNetworkNodeDeleted(t *testing.T) {
	n := monitor.NewNetwork()
	n.NewNode("test")
	n.NodeDeleted("test")
	assert.Equal(t, 0, n.NumNodesMonitored)
	assert.Equal(t, 0, n.NumNodesMonitoredOnline)
}

func TestNetworkGetHealthString(t *testing.T) {
	n := monitor.NewNetwork()
	assert.Equal(t, "full", n.GetHealthString())
	n.Health = monitor.ModerateHealth
	assert.Equal(t, "moderate", n.GetHealthString())
	n.Health = monitor.Dead
	assert.Equal(t, "dead", n.GetHealthString())
}

func TestNetworkUptime(t *testing.T) {
	n := monitor.NewNetwork()
	assert.Equal(t, 100.0, n.Uptime())
}

func TestNetworkStartTime(t *testing.T) {
	n := monitor.NewNetwork()
	assert.True(t, n.StartTime().Before(time.Now()))
}
260  tools/tm-monitor/monitor/node.go  Normal file
@@ -0,0 +1,260 @@
package monitor

import (
	"encoding/json"
	"math"
	"time"

	"github.com/pkg/errors"
	crypto "github.com/tendermint/go-crypto"
	ctypes "github.com/tendermint/tendermint/rpc/core/types"
	rpc_client "github.com/tendermint/tendermint/rpc/lib/client"
	tmtypes "github.com/tendermint/tendermint/types"
	"github.com/tendermint/tmlibs/events"
	"github.com/tendermint/tmlibs/log"
	em "github.com/tendermint/tools/tm-monitor/eventmeter"
)

const maxRestarts = 25

type Node struct {
	rpcAddr string

	IsValidator bool          `json:"is_validator"` // validator or non-validator?
	pubKey      crypto.PubKey `json:"pub_key"`

	Name         string  `json:"name"`
	Online       bool    `json:"online"`
	Height       int64   `json:"height"`
	BlockLatency float64 `json:"block_latency" amino:"unsafe"` // ms, interval between block commits

	// em holds the ws connection. Each eventMeter callback is called in a separate goroutine.
	em eventMeter

	// rpcClient is a client for making RPC calls to TM
	rpcClient rpc_client.HTTPClient

	blockCh        chan<- tmtypes.Header
	blockLatencyCh chan<- float64
	disconnectCh   chan<- bool

	checkIsValidatorInterval time.Duration

	quit chan struct{}

	logger log.Logger
}

func NewNode(rpcAddr string, options ...func(*Node)) *Node {
	em := em.NewEventMeter(rpcAddr, UnmarshalEvent)
	rpcClient := rpc_client.NewURIClient(rpcAddr) // HTTP client by default
	rpcClient.SetCodec(cdc)
	return NewNodeWithEventMeterAndRpcClient(rpcAddr, em, rpcClient, options...)
}

func NewNodeWithEventMeterAndRpcClient(rpcAddr string, em eventMeter, rpcClient rpc_client.HTTPClient, options ...func(*Node)) *Node {
	n := &Node{
		rpcAddr:                  rpcAddr,
		em:                       em,
		rpcClient:                rpcClient,
		Name:                     rpcAddr,
		quit:                     make(chan struct{}),
		checkIsValidatorInterval: 5 * time.Second,
		logger:                   log.NewNopLogger(),
	}

	for _, option := range options {
		option(n)
	}

	return n
}

// SetCheckIsValidatorInterval lets you change the interval for checking whether
// the node is still a validator or not.
func SetCheckIsValidatorInterval(d time.Duration) func(n *Node) {
	return func(n *Node) {
		n.checkIsValidatorInterval = d
	}
}

func (n *Node) SendBlocksTo(ch chan<- tmtypes.Header) {
	n.blockCh = ch
}

func (n *Node) SendBlockLatenciesTo(ch chan<- float64) {
	n.blockLatencyCh = ch
}

func (n *Node) NotifyAboutDisconnects(ch chan<- bool) {
	n.disconnectCh = ch
}

// SetLogger lets you set your own logger.
func (n *Node) SetLogger(l log.Logger) {
	n.logger = l
	n.em.SetLogger(l)
}

func (n *Node) Start() error {
	if err := n.em.Start(); err != nil {
		return err
	}

	n.em.RegisterLatencyCallback(latencyCallback(n))
	err := n.em.Subscribe(tmtypes.EventQueryNewBlockHeader.String(), newBlockCallback(n))
	if err != nil {
		return err
	}
	n.em.RegisterDisconnectCallback(disconnectCallback(n))

	n.Online = true

	n.checkIsValidator()
	go n.checkIsValidatorLoop()

	return nil
}

func (n *Node) Stop() {
	n.Online = false

	n.em.Stop()

	close(n.quit)
}

// implements eventmeter.EventCallbackFunc
func newBlockCallback(n *Node) em.EventCallbackFunc {
	return func(metric *em.EventMetric, data interface{}) {
		block := data.(tmtypes.TMEventData).(tmtypes.EventDataNewBlockHeader).Header

		n.Height = block.Height
		n.logger.Info("new block", "height", block.Height, "numTxs", block.NumTxs)

		if n.blockCh != nil {
			n.blockCh <- *block
		}
	}
}

// implements eventmeter.EventLatencyFunc
func latencyCallback(n *Node) em.LatencyCallbackFunc {
	return func(latency float64) {
		n.BlockLatency = latency / 1000000.0 // ns to ms
		n.logger.Info("new block latency", "latency", n.BlockLatency)

		if n.blockLatencyCh != nil {
			n.blockLatencyCh <- latency
		}
	}
}

// implements eventmeter.DisconnectCallbackFunc
func disconnectCallback(n *Node) em.DisconnectCallbackFunc {
	return func() {
		n.Online = false
		n.logger.Info("status", "down")

		if n.disconnectCh != nil {
			n.disconnectCh <- true
		}
	}
}

func (n *Node) RestartEventMeterBackoff() error {
	attempt := 0

	for {
		d := time.Duration(math.Exp2(float64(attempt)))
		time.Sleep(d * time.Second)

		if err := n.em.Start(); err != nil {
			n.logger.Info("restart failed", "err", err)
		} else {
			// TODO: authenticate pubkey
			return nil
		}

		attempt++

		if attempt > maxRestarts {
			return errors.New("Reached max restarts")
		}
	}
}

func (n *Node) NumValidators() (height int64, num int, err error) {
	height, vals, err := n.validators()
	if err != nil {
		return 0, 0, err
	}
	return height, len(vals), nil
}

func (n *Node) validators() (height int64, validators []*tmtypes.Validator, err error) {
	vals := new(ctypes.ResultValidators)
	if _, err = n.rpcClient.Call("validators", nil, vals); err != nil {
		return 0, make([]*tmtypes.Validator, 0), err
	}
	return vals.BlockHeight, vals.Validators, nil
}

func (n *Node) checkIsValidatorLoop() {
	for {
		select {
		case <-n.quit:
			return
		case <-time.After(n.checkIsValidatorInterval):
			n.checkIsValidator()
		}
	}
}

func (n *Node) checkIsValidator() {
	_, validators, err := n.validators()
	if err == nil {
		for _, v := range validators {
			key, err1 := n.getPubKey()
			// TODO: use bytes.Equal
			if err1 == nil && v.PubKey == key {
				n.IsValidator = true
			}
		}
	} else {
		n.logger.Info("check is validator failed", "err", err)
	}
}

func (n *Node) getPubKey() (crypto.PubKey, error) {
	if n.pubKey != nil {
		return n.pubKey, nil
	}

	status := new(ctypes.ResultStatus)
	_, err := n.rpcClient.Call("status", nil, status)
	if err != nil {
		return nil, err
	}
	n.pubKey = status.ValidatorInfo.PubKey
	return n.pubKey, nil
}

type eventMeter interface {
	Start() error
	Stop()
	RegisterLatencyCallback(em.LatencyCallbackFunc)
	RegisterDisconnectCallback(em.DisconnectCallbackFunc)
	Subscribe(string, em.EventCallbackFunc) error
	Unsubscribe(string) error
	SetLogger(l log.Logger)
}

// UnmarshalEvent unmarshals a JSON event.
func UnmarshalEvent(b json.RawMessage) (string, events.EventData, error) {
	event := new(ctypes.ResultEvent)
	if err := cdc.UnmarshalJSON(b, event); err != nil {
		return "", nil, err
	}
	return event.Query, event.Data, nil
}
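RestartEventMeterBackoff sleeps 2^attempt seconds between reconnect attempts, up to maxRestarts (25); note the tail of that schedule grows very long (2^25 s is roughly 388 days). A standalone sketch of the first few delays in the schedule:

	package main

	import (
		"fmt"
		"math"
		"time"
	)

	func main() {
		// First few delays of the 2^attempt-second schedule used by
		// RestartEventMeterBackoff: 1s, 2s, 4s, 8s, 16s, 32s, ...
		for attempt := 0; attempt <= 5; attempt++ {
			d := time.Duration(math.Exp2(float64(attempt))) * time.Second
			fmt.Printf("attempt %d: sleep %v\n", attempt, d)
		}
	}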
93  tools/tm-monitor/monitor/node_test.go  Normal file
@@ -0,0 +1,93 @@
package monitor_test

import (
	"testing"

	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"

	crypto "github.com/tendermint/go-crypto"
	ctypes "github.com/tendermint/tendermint/rpc/core/types"
	tmtypes "github.com/tendermint/tendermint/types"
	em "github.com/tendermint/tools/tm-monitor/eventmeter"
	mock "github.com/tendermint/tools/tm-monitor/mock"
	monitor "github.com/tendermint/tools/tm-monitor/monitor"
	"github.com/tendermint/go-amino"
)

const (
	blockHeight = int64(1)
)

func TestNodeStartStop(t *testing.T) {
	n, _ := startValidatorNode(t)
	defer n.Stop()

	assert.Equal(t, true, n.Online)
	assert.Equal(t, true, n.IsValidator)
}

func TestNodeNewBlockReceived(t *testing.T) {
	blockCh := make(chan tmtypes.Header, 100)
	n, emMock := startValidatorNode(t)
	defer n.Stop()
	n.SendBlocksTo(blockCh)

	blockHeader := &tmtypes.Header{Height: 5}
	emMock.Call("eventCallback", &em.EventMetric{}, tmtypes.EventDataNewBlockHeader{blockHeader})

	assert.Equal(t, int64(5), n.Height)
	assert.Equal(t, *blockHeader, <-blockCh)
}

func TestNodeNewBlockLatencyReceived(t *testing.T) {
	blockLatencyCh := make(chan float64, 100)
	n, emMock := startValidatorNode(t)
	defer n.Stop()
	n.SendBlockLatenciesTo(blockLatencyCh)

	emMock.Call("latencyCallback", 1000000.0)

	assert.Equal(t, 1.0, n.BlockLatency)
	assert.Equal(t, 1000000.0, <-blockLatencyCh)
}

func TestNodeConnectionLost(t *testing.T) {
	disconnectCh := make(chan bool, 100)
	n, emMock := startValidatorNode(t)
	defer n.Stop()
	n.NotifyAboutDisconnects(disconnectCh)

	emMock.Call("disconnectCallback")

	assert.Equal(t, true, <-disconnectCh)
	assert.Equal(t, false, n.Online)
}

func TestNumValidators(t *testing.T) {
	n, _ := startValidatorNode(t)
	defer n.Stop()

	height, num, err := n.NumValidators()
	assert.Nil(t, err)
	assert.Equal(t, blockHeight, height)
	assert.Equal(t, 1, num)
}

func startValidatorNode(t *testing.T) (n *monitor.Node, emMock *mock.EventMeter) {
	emMock = &mock.EventMeter{}

	stubs := make(map[string]interface{})
	pubKey := crypto.GenPrivKeyEd25519().PubKey()
	stubs["validators"] = ctypes.ResultValidators{BlockHeight: blockHeight, Validators: []*tmtypes.Validator{tmtypes.NewValidator(pubKey, 0)}}
	stubs["status"] = ctypes.ResultStatus{ValidatorInfo: ctypes.ValidatorInfo{PubKey: pubKey}}
	cdc := amino.NewCodec()
	rpcClientMock := &mock.RpcClient{Stubs: stubs}
	rpcClientMock.SetCodec(cdc)

	n = monitor.NewNodeWithEventMeterAndRpcClient("tcp://127.0.0.1:26657", emMock, rpcClientMock)

	err := n.Start()
	require.Nil(t, err)
	return
}
12  tools/tm-monitor/monitor/wire.go  Normal file
@@ -0,0 +1,12 @@
package monitor

import (
	amino "github.com/tendermint/go-amino"
	ctypes "github.com/tendermint/tendermint/rpc/core/types"
)

var cdc = amino.NewCodec()

func init() {
	ctypes.RegisterAmino(cdc)
}
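wire.go gives the package a single amino codec with the RPC result types registered; UnmarshalEvent in node.go decodes incoming events through it. A minimal sketch of round-tripping JSON through an amino codec, assuming go-amino's MarshalJSON/UnmarshalJSON; the Event type here is made up for illustration:

	package main

	import (
		"fmt"

		amino "github.com/tendermint/go-amino"
	)

	// Event is a hypothetical type for illustration only.
	type Event struct {
		Query string `json:"query"`
	}

	func main() {
		cdc := amino.NewCodec()

		bz, err := cdc.MarshalJSON(Event{Query: "tm.event = 'NewBlockHeader'"})
		if err != nil {
			panic(err)
		}

		var e Event
		if err := cdc.UnmarshalJSON(bz, &e); err != nil {
			panic(err)
		}
		fmt.Println(e.Query) // prints the query we encoded above
	}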