mirror of https://github.com/fluencelabs/tendermint, synced 2025-06-12 12:51:22 +00:00
mv tools files to tools repo
251  tools/tm-monitor/monitor/monitor.go  Normal file
@@ -0,0 +1,251 @@
package monitor

import (
	"fmt"
	"math/rand"
	"sync"
	"time"

	"github.com/pkg/errors"
	tmtypes "github.com/tendermint/tendermint/types"
	"github.com/tendermint/tmlibs/log"
)

// waiting more than this many seconds for a block means we're unhealthy
const nodeLivenessTimeout = 5 * time.Second

// Monitor keeps track of the nodes and updates common statistics upon
// receiving new events from nodes.
//
// Common statistics are stored in the Network struct.
type Monitor struct {
	mtx   sync.Mutex
	Nodes []*Node

	Network *Network

	monitorQuit chan struct{}            // monitor exiting
	nodeQuit    map[string]chan struct{} // node is being stopped and removed from under the monitor

	recalculateNetworkUptimeEvery time.Duration
	numValidatorsUpdateInterval   time.Duration

	logger log.Logger
}

// NewMonitor creates a new instance of a Monitor. You can provide options to
// change some default values.
//
// Example:
//	NewMonitor(monitor.SetNumValidatorsUpdateInterval(1 * time.Second))
func NewMonitor(options ...func(*Monitor)) *Monitor {
	m := &Monitor{
		Nodes:                         make([]*Node, 0),
		Network:                       NewNetwork(),
		monitorQuit:                   make(chan struct{}),
		nodeQuit:                      make(map[string]chan struct{}),
		recalculateNetworkUptimeEvery: 10 * time.Second,
		numValidatorsUpdateInterval:   5 * time.Second,
		logger:                        log.NewNopLogger(),
	}

	for _, option := range options {
		option(m)
	}

	return m
}

// RecalculateNetworkUptimeEvery lets you change the network uptime update interval.
func RecalculateNetworkUptimeEvery(d time.Duration) func(m *Monitor) {
	return func(m *Monitor) {
		m.recalculateNetworkUptimeEvery = d
	}
}

// SetNumValidatorsUpdateInterval lets you change the num validators update interval.
func SetNumValidatorsUpdateInterval(d time.Duration) func(m *Monitor) {
	return func(m *Monitor) {
		m.numValidatorsUpdateInterval = d
	}
}

// SetLogger lets you set your own logger.
func (m *Monitor) SetLogger(l log.Logger) {
	m.logger = l
}

// Monitor begins to monitor the node `n`. The node will be started and added
// to the monitor.
func (m *Monitor) Monitor(n *Node) error {
	m.mtx.Lock()
	m.Nodes = append(m.Nodes, n)
	m.mtx.Unlock()

	blockCh := make(chan tmtypes.Header, 10)
	n.SendBlocksTo(blockCh)
	blockLatencyCh := make(chan float64, 10)
	n.SendBlockLatenciesTo(blockLatencyCh)
	disconnectCh := make(chan bool, 10)
	n.NotifyAboutDisconnects(disconnectCh)

	if err := n.Start(); err != nil {
		return err
	}

	m.Network.NewNode(n.Name)

	m.nodeQuit[n.Name] = make(chan struct{})
	go m.listen(n.Name, blockCh, blockLatencyCh, disconnectCh, m.nodeQuit[n.Name])

	return nil
}

// Unmonitor stops monitoring node `n`. The node will be stopped and removed
// from the monitor.
func (m *Monitor) Unmonitor(n *Node) {
	m.Network.NodeDeleted(n.Name)

	n.Stop()
	close(m.nodeQuit[n.Name])
	delete(m.nodeQuit, n.Name)
	i, _ := m.NodeByName(n.Name)

	m.mtx.Lock()
	m.Nodes[i] = m.Nodes[len(m.Nodes)-1]
	m.Nodes = m.Nodes[:len(m.Nodes)-1]
	m.mtx.Unlock()
}

// NodeByName returns the node and its index if such a node exists within the
// monitor. Otherwise, -1 and nil are returned.
func (m *Monitor) NodeByName(name string) (index int, node *Node) {
	m.mtx.Lock()
	defer m.mtx.Unlock()

	for i, n := range m.Nodes {
		if name == n.Name {
			return i, n
		}
	}
	return -1, nil
}

// NodeIsOnline is called when connection to the node is restored.
// Must be safe to call multiple times.
func (m *Monitor) NodeIsOnline(name string) {
	_, node := m.NodeByName(name)
	if nil != node {
		if online, ok := m.Network.nodeStatusMap[name]; ok && online {
			m.mtx.Lock()
			node.Online = online
			m.mtx.Unlock()
		}
	}
}

// Start starts the monitor's routines: recalculating network uptime and
// updating the number of validators.
func (m *Monitor) Start() error {
	go m.recalculateNetworkUptimeLoop()
	go m.updateNumValidatorLoop()

	return nil
}

// Stop stops the monitor's routines.
func (m *Monitor) Stop() {
	close(m.monitorQuit)

	for _, n := range m.Nodes {
		m.Unmonitor(n)
	}
}

// main loop where we listen for events from the node
func (m *Monitor) listen(nodeName string, blockCh <-chan tmtypes.Header, blockLatencyCh <-chan float64, disconnectCh <-chan bool, quit <-chan struct{}) {
	logger := m.logger.With("node", nodeName)

	for {
		select {
		case <-quit:
			return
		case b := <-blockCh:
			m.Network.NewBlock(b)
			m.Network.NodeIsOnline(nodeName)
			m.NodeIsOnline(nodeName)
		case l := <-blockLatencyCh:
			m.Network.NewBlockLatency(l)
			m.Network.NodeIsOnline(nodeName)
			m.NodeIsOnline(nodeName)
		case disconnected := <-disconnectCh:
			if disconnected {
				m.Network.NodeIsDown(nodeName)
			} else {
				m.Network.NodeIsOnline(nodeName)
				m.NodeIsOnline(nodeName)
			}
		case <-time.After(nodeLivenessTimeout):
			logger.Info("event", fmt.Sprintf("node was not responding for %v", nodeLivenessTimeout))
			m.Network.NodeIsDown(nodeName)
		}
	}
}

// recalculateNetworkUptimeLoop recalculates network uptime every N seconds.
func (m *Monitor) recalculateNetworkUptimeLoop() {
	for {
		select {
		case <-m.monitorQuit:
			return
		case <-time.After(m.recalculateNetworkUptimeEvery):
			m.Network.RecalculateUptime()
		}
	}
}

// updateNumValidatorLoop sends a request to a random node once every N seconds,
// which in turn makes an RPC call to get the latest validators.
func (m *Monitor) updateNumValidatorLoop() {
	rand.Seed(time.Now().Unix())

	var height int64
	var num int
	var err error

	for {
		m.mtx.Lock()
		nodesCount := len(m.Nodes)
		m.mtx.Unlock()
		if 0 == nodesCount {
			time.Sleep(m.numValidatorsUpdateInterval)
			continue
		}

		randomNodeIndex := rand.Intn(nodesCount)

		select {
		case <-m.monitorQuit:
			return
		case <-time.After(m.numValidatorsUpdateInterval):
			i := 0

			m.mtx.Lock()
			for _, n := range m.Nodes {
				if i == randomNodeIndex {
					height, num, err = n.NumValidators()
					if err != nil {
						m.logger.Info("err", errors.Wrap(err, "update num validators failed"))
					}
					break
				}
				i++
			}
			m.mtx.Unlock()

			m.Network.UpdateNumValidatorsForHeight(num, height)
		}
	}
}
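NewMonitor above uses the functional options pattern: each option is a func(*Monitor) applied after defaults are set. A hypothetical wiring sketch based only on the API in this file (the main package and the 30-second wait are illustrative, and it assumes a Tendermint node is actually listening on the given RPC address):

	package main

	import (
		"time"

		monitor "github.com/tendermint/tools/tm-monitor/monitor"
	)

	func main() {
		// Construct a Monitor, overriding the defaults via functional options.
		m := monitor.NewMonitor(
			monitor.RecalculateNetworkUptimeEvery(5*time.Second),
			monitor.SetNumValidatorsUpdateInterval(10*time.Second),
		)
		if err := m.Start(); err != nil { // starts the uptime and validator loops
			panic(err)
		}
		defer m.Stop()

		// Monitor starts the node and begins listening for its events.
		n := monitor.NewNode("tcp://127.0.0.1:26657")
		if err := m.Monitor(n); err != nil {
			panic(err)
		}

		time.Sleep(30 * time.Second) // a real program would wait for a shutdown signal
	}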
72  tools/tm-monitor/monitor/monitor_test.go  Normal file
@@ -0,0 +1,72 @@
package monitor_test

import (
	"testing"
	"time"

	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"

	crypto "github.com/tendermint/go-crypto"
	ctypes "github.com/tendermint/tendermint/rpc/core/types"
	tmtypes "github.com/tendermint/tendermint/types"
	mock "github.com/tendermint/tools/tm-monitor/mock"
	monitor "github.com/tendermint/tools/tm-monitor/monitor"
	"github.com/tendermint/go-amino"
)

func TestMonitorUpdatesNumberOfValidators(t *testing.T) {
	m := startMonitor(t)
	defer m.Stop()

	n, _ := createValidatorNode(t)
	m.Monitor(n)
	assert.Equal(t, 1, m.Network.NumNodesMonitored)
	assert.Equal(t, 1, m.Network.NumNodesMonitoredOnline)

	time.Sleep(1 * time.Second)

	// DATA RACE
	// assert.Equal(t, 1, m.Network.NumValidators())
}

func TestMonitorRecalculatesNetworkUptime(t *testing.T) {
	m := startMonitor(t)
	defer m.Stop()
	assert.Equal(t, 100.0, m.Network.Uptime())

	n, _ := createValidatorNode(t)
	m.Monitor(n)

	m.Network.NodeIsDown(n.Name) // simulate node failure
	time.Sleep(200 * time.Millisecond)
	m.Network.NodeIsOnline(n.Name)
	time.Sleep(1 * time.Second)

	assert.True(t, m.Network.Uptime() < 100.0, "Uptime should be less than 100%")
}

func startMonitor(t *testing.T) *monitor.Monitor {
	m := monitor.NewMonitor(
		monitor.SetNumValidatorsUpdateInterval(200*time.Millisecond),
		monitor.RecalculateNetworkUptimeEvery(200*time.Millisecond),
	)
	err := m.Start()
	require.Nil(t, err)
	return m
}

func createValidatorNode(t *testing.T) (n *monitor.Node, emMock *mock.EventMeter) {
	emMock = &mock.EventMeter{}

	stubs := make(map[string]interface{})
	pubKey := crypto.GenPrivKeyEd25519().PubKey()
	stubs["validators"] = ctypes.ResultValidators{BlockHeight: blockHeight, Validators: []*tmtypes.Validator{tmtypes.NewValidator(pubKey, 0)}}
	stubs["status"] = ctypes.ResultStatus{ValidatorInfo: ctypes.ValidatorInfo{PubKey: pubKey}}
	cdc := amino.NewCodec()
	rpcClientMock := &mock.RpcClient{Stubs: stubs}
	rpcClientMock.SetCodec(cdc)

	n = monitor.NewNodeWithEventMeterAndRpcClient("tcp://127.0.0.1:26657", emMock, rpcClientMock)
	return
}
199  tools/tm-monitor/monitor/network.go  Normal file
@@ -0,0 +1,199 @@
package monitor

import (
	"sync"
	"time"

	metrics "github.com/rcrowley/go-metrics"
	tmtypes "github.com/tendermint/tendermint/types"
)

// UptimeData stores data for how long the network has been running.
type UptimeData struct {
	StartTime time.Time `json:"start_time"`
	Uptime    float64   `json:"uptime" amino:"unsafe"` // percentage of time we've been healthy, ever

	totalDownTime time.Duration // total downtime (only updated when we come back online)
	wentDown      time.Time
}

// Health describes the health of the network. Note that this applies only to
// the observed nodes, and not to the entire cluster, which may consist of
// thousands of machines. It may change in the future.
type Health int

const (
	// FullHealth means all nodes online, synced, validators making blocks
	FullHealth = Health(0)
	// ModerateHealth means we're making blocks
	ModerateHealth = Health(1)
	// Dead means we're not making blocks due to all validators freezing or crashing
	Dead = Health(2)
)

// Network holds common statistics for a network of nodes.
type Network struct {
	Height int64 `json:"height"`

	AvgBlockTime      float64 `json:"avg_block_time" amino:"unsafe"` // ms (avg over last minute)
	blockTimeMeter    metrics.Meter
	AvgTxThroughput   float64 `json:"avg_tx_throughput" amino:"unsafe"` // tx/s (avg over last minute)
	txThroughputMeter metrics.Meter
	AvgBlockLatency   float64 `json:"avg_block_latency" amino:"unsafe"` // ms (avg over last minute)
	blockLatencyMeter metrics.Meter

	NumValidators           int `json:"num_validators"`
	NumNodesMonitored       int `json:"num_nodes_monitored"`
	NumNodesMonitoredOnline int `json:"num_nodes_monitored_online"`

	Health Health `json:"health"`

	UptimeData *UptimeData `json:"uptime_data"`

	nodeStatusMap map[string]bool

	mu sync.Mutex
}

func NewNetwork() *Network {
	return &Network{
		blockTimeMeter:    metrics.NewMeter(),
		txThroughputMeter: metrics.NewMeter(),
		blockLatencyMeter: metrics.NewMeter(),
		Health:            FullHealth,
		UptimeData: &UptimeData{
			StartTime: time.Now(),
			Uptime:    100.0,
		},
		nodeStatusMap: make(map[string]bool),
	}
}

func (n *Network) NewBlock(b tmtypes.Header) {
	n.mu.Lock()
	defer n.mu.Unlock()

	if n.Height >= b.Height {
		return
	}

	n.Height = b.Height

	n.blockTimeMeter.Mark(1)
	if n.blockTimeMeter.Rate1() > 0.0 {
		n.AvgBlockTime = (1.0 / n.blockTimeMeter.Rate1()) * 1000 // 1/s to ms
	} else {
		n.AvgBlockTime = 0.0
	}
	n.txThroughputMeter.Mark(int64(b.NumTxs))
	n.AvgTxThroughput = n.txThroughputMeter.Rate1()
}

func (n *Network) NewBlockLatency(l float64) {
	n.mu.Lock()
	defer n.mu.Unlock()

	n.blockLatencyMeter.Mark(int64(l))
	n.AvgBlockLatency = n.blockLatencyMeter.Rate1() / 1000000.0 // ns to ms
}

// RecalculateUptime calculates uptime on demand.
func (n *Network) RecalculateUptime() {
	n.mu.Lock()
	defer n.mu.Unlock()

	since := time.Since(n.UptimeData.StartTime)
	uptime := since - n.UptimeData.totalDownTime
	if n.Health != FullHealth {
		uptime -= time.Since(n.UptimeData.wentDown)
	}
	n.UptimeData.Uptime = (float64(uptime) / float64(since)) * 100.0
}

// NodeIsDown is called when the node disconnects for whatever reason.
// Must be safe to call multiple times.
func (n *Network) NodeIsDown(name string) {
	n.mu.Lock()
	defer n.mu.Unlock()

	if online, ok := n.nodeStatusMap[name]; !ok || online {
		n.nodeStatusMap[name] = false
		n.NumNodesMonitoredOnline--
		n.UptimeData.wentDown = time.Now()
		n.updateHealth()
	}
}

// NodeIsOnline is called when connection to the node is restored.
// Must be safe to call multiple times.
func (n *Network) NodeIsOnline(name string) {
	n.mu.Lock()
	defer n.mu.Unlock()

	if online, ok := n.nodeStatusMap[name]; ok && !online {
		n.nodeStatusMap[name] = true
		n.NumNodesMonitoredOnline++
		n.UptimeData.totalDownTime += time.Since(n.UptimeData.wentDown)
		n.updateHealth()
	}
}

// NewNode is called when a new node is added to the monitor.
func (n *Network) NewNode(name string) {
	n.NumNodesMonitored++
	n.NumNodesMonitoredOnline++
}

// NodeDeleted is called when the node is deleted from under the monitor.
func (n *Network) NodeDeleted(name string) {
	n.NumNodesMonitored--
	n.NumNodesMonitoredOnline--
}

func (n *Network) updateHealth() {
	// If we are connected to all validators, we're at full health.
	// TODO: make sure they're all at the same height (within a block)
	// and all proposing (and possibly validating). Alternatively, just
	// check there hasn't been a new round in numValidators rounds.
	if n.NumValidators != 0 && n.NumNodesMonitoredOnline == n.NumValidators {
		n.Health = FullHealth
	} else if n.NumNodesMonitoredOnline > 0 && n.NumNodesMonitoredOnline <= n.NumNodesMonitored {
		n.Health = ModerateHealth
	} else {
		n.Health = Dead
	}
}

func (n *Network) UpdateNumValidatorsForHeight(num int, height int64) {
	n.mu.Lock()
	defer n.mu.Unlock()

	if n.Height <= height {
		n.NumValidators = num
	}
}

func (n *Network) GetHealthString() string {
	switch n.Health {
	case FullHealth:
		return "full"
	case ModerateHealth:
		return "moderate"
	case Dead:
		return "dead"
	default:
		return "undefined"
	}
}

// Uptime returns the network's uptime as a percentage.
func (n *Network) Uptime() float64 {
	n.mu.Lock()
	defer n.mu.Unlock()
	return n.UptimeData.Uptime
}

// StartTime returns the time we started monitoring.
func (n *Network) StartTime() time.Time {
	return n.UptimeData.StartTime
}
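RecalculateUptime derives uptime as (elapsed − accumulated downtime − the current outage, if one is in progress) / elapsed × 100. A standalone sketch of the same arithmetic with made-up numbers, covering the no-outage-in-progress case:

	package main

	import (
		"fmt"
		"time"
	)

	func main() {
		// Hypothetical inputs: monitoring for 10 minutes, 30s of it spent down.
		since := 10 * time.Minute
		totalDownTime := 30 * time.Second

		// Same arithmetic as RecalculateUptime when Health == FullHealth.
		uptime := since - totalDownTime
		pct := (float64(uptime) / float64(since)) * 100.0
		fmt.Printf("uptime: %.2f%%\n", pct) // prints "uptime: 95.00%"
	}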
79  tools/tm-monitor/monitor/network_test.go  Normal file
@@ -0,0 +1,79 @@
package monitor_test

import (
	"testing"
	"time"

	"github.com/stretchr/testify/assert"
	tmtypes "github.com/tendermint/tendermint/types"
	monitor "github.com/tendermint/tools/tm-monitor/monitor"
)

func TestNetworkNewBlock(t *testing.T) {
	n := monitor.NewNetwork()

	n.NewBlock(tmtypes.Header{Height: 5, NumTxs: 100})
	assert.Equal(t, int64(5), n.Height)
	assert.Equal(t, 0.0, n.AvgBlockTime)
	assert.Equal(t, 0.0, n.AvgTxThroughput)
}

func TestNetworkNewBlockLatency(t *testing.T) {
	n := monitor.NewNetwork()

	n.NewBlockLatency(9000000.0) // nanoseconds
	assert.Equal(t, 0.0, n.AvgBlockLatency)
}

func TestNetworkNodeIsDownThenOnline(t *testing.T) {
	n := monitor.NewNetwork()
	n.NewNode("test")

	n.NodeIsDown("test")
	assert.Equal(t, 0, n.NumNodesMonitoredOnline)
	assert.Equal(t, monitor.Dead, n.Health)
	n.NodeIsDown("test")
	assert.Equal(t, 0, n.NumNodesMonitoredOnline)

	n.NodeIsOnline("test")
	assert.Equal(t, 1, n.NumNodesMonitoredOnline)
	assert.Equal(t, monitor.ModerateHealth, n.Health)
	n.NodeIsOnline("test")
	assert.Equal(t, 1, n.NumNodesMonitoredOnline)
}

func TestNetworkNewNode(t *testing.T) {
	n := monitor.NewNetwork()
	assert.Equal(t, 0, n.NumNodesMonitored)
	assert.Equal(t, 0, n.NumNodesMonitoredOnline)
	n.NewNode("test")
	assert.Equal(t, 1, n.NumNodesMonitored)
	assert.Equal(t, 1, n.NumNodesMonitoredOnline)
}

func TestNetworkNodeDeleted(t *testing.T) {
	n := monitor.NewNetwork()
	n.NewNode("test")
	n.NodeDeleted("test")
	assert.Equal(t, 0, n.NumNodesMonitored)
	assert.Equal(t, 0, n.NumNodesMonitoredOnline)
}

func TestNetworkGetHealthString(t *testing.T) {
	n := monitor.NewNetwork()
	assert.Equal(t, "full", n.GetHealthString())
	n.Health = monitor.ModerateHealth
	assert.Equal(t, "moderate", n.GetHealthString())
	n.Health = monitor.Dead
	assert.Equal(t, "dead", n.GetHealthString())
}

func TestNetworkUptime(t *testing.T) {
	n := monitor.NewNetwork()
	assert.Equal(t, 100.0, n.Uptime())
}

func TestNetworkStartTime(t *testing.T) {
	n := monitor.NewNetwork()
	assert.True(t, n.StartTime().Before(time.Now()))
}
260  tools/tm-monitor/monitor/node.go  Normal file
@@ -0,0 +1,260 @@
package monitor

import (
	"encoding/json"
	"math"
	"time"

	"github.com/pkg/errors"
	crypto "github.com/tendermint/go-crypto"
	ctypes "github.com/tendermint/tendermint/rpc/core/types"
	rpc_client "github.com/tendermint/tendermint/rpc/lib/client"
	tmtypes "github.com/tendermint/tendermint/types"
	"github.com/tendermint/tmlibs/events"
	"github.com/tendermint/tmlibs/log"
	em "github.com/tendermint/tools/tm-monitor/eventmeter"
)

const maxRestarts = 25

type Node struct {
	rpcAddr string

	IsValidator bool          `json:"is_validator"` // validator or non-validator?
	pubKey      crypto.PubKey `json:"pub_key"`

	Name         string  `json:"name"`
	Online       bool    `json:"online"`
	Height       int64   `json:"height"`
	BlockLatency float64 `json:"block_latency" amino:"unsafe"` // ms, interval between block commits

	// em holds the ws connection. Each eventMeter callback is called in a separate goroutine.
	em eventMeter

	// rpcClient is a client for making RPC calls to TM
	rpcClient rpc_client.HTTPClient

	blockCh        chan<- tmtypes.Header
	blockLatencyCh chan<- float64
	disconnectCh   chan<- bool

	checkIsValidatorInterval time.Duration

	quit chan struct{}

	logger log.Logger
}

func NewNode(rpcAddr string, options ...func(*Node)) *Node {
	em := em.NewEventMeter(rpcAddr, UnmarshalEvent)
	rpcClient := rpc_client.NewURIClient(rpcAddr) // HTTP client by default
	rpcClient.SetCodec(cdc)
	return NewNodeWithEventMeterAndRpcClient(rpcAddr, em, rpcClient, options...)
}

func NewNodeWithEventMeterAndRpcClient(rpcAddr string, em eventMeter, rpcClient rpc_client.HTTPClient, options ...func(*Node)) *Node {
	n := &Node{
		rpcAddr:                  rpcAddr,
		em:                       em,
		rpcClient:                rpcClient,
		Name:                     rpcAddr,
		quit:                     make(chan struct{}),
		checkIsValidatorInterval: 5 * time.Second,
		logger:                   log.NewNopLogger(),
	}

	for _, option := range options {
		option(n)
	}

	return n
}

// SetCheckIsValidatorInterval lets you change the interval for checking whether
// the node is still a validator or not.
func SetCheckIsValidatorInterval(d time.Duration) func(n *Node) {
	return func(n *Node) {
		n.checkIsValidatorInterval = d
	}
}

func (n *Node) SendBlocksTo(ch chan<- tmtypes.Header) {
	n.blockCh = ch
}

func (n *Node) SendBlockLatenciesTo(ch chan<- float64) {
	n.blockLatencyCh = ch
}

func (n *Node) NotifyAboutDisconnects(ch chan<- bool) {
	n.disconnectCh = ch
}

// SetLogger lets you set your own logger.
func (n *Node) SetLogger(l log.Logger) {
	n.logger = l
	n.em.SetLogger(l)
}

func (n *Node) Start() error {
	if err := n.em.Start(); err != nil {
		return err
	}

	n.em.RegisterLatencyCallback(latencyCallback(n))
	err := n.em.Subscribe(tmtypes.EventQueryNewBlockHeader.String(), newBlockCallback(n))
	if err != nil {
		return err
	}
	n.em.RegisterDisconnectCallback(disconnectCallback(n))

	n.Online = true

	n.checkIsValidator()
	go n.checkIsValidatorLoop()

	return nil
}

func (n *Node) Stop() {
	n.Online = false

	n.em.Stop()

	close(n.quit)
}

// implements eventmeter.EventCallbackFunc
func newBlockCallback(n *Node) em.EventCallbackFunc {
	return func(metric *em.EventMetric, data interface{}) {
		block := data.(tmtypes.TMEventData).(tmtypes.EventDataNewBlockHeader).Header

		n.Height = block.Height
		n.logger.Info("new block", "height", block.Height, "numTxs", block.NumTxs)

		if n.blockCh != nil {
			n.blockCh <- *block
		}
	}
}

// implements eventmeter.EventLatencyFunc
func latencyCallback(n *Node) em.LatencyCallbackFunc {
	return func(latency float64) {
		n.BlockLatency = latency / 1000000.0 // ns to ms
		n.logger.Info("new block latency", "latency", n.BlockLatency)

		if n.blockLatencyCh != nil {
			n.blockLatencyCh <- latency
		}
	}
}

// implements eventmeter.DisconnectCallbackFunc
func disconnectCallback(n *Node) em.DisconnectCallbackFunc {
	return func() {
		n.Online = false
		n.logger.Info("status", "down")

		if n.disconnectCh != nil {
			n.disconnectCh <- true
		}
	}
}

func (n *Node) RestartEventMeterBackoff() error {
	attempt := 0

	for {
		d := time.Duration(math.Exp2(float64(attempt)))
		time.Sleep(d * time.Second)

		if err := n.em.Start(); err != nil {
			n.logger.Info("restart failed", "err", err)
		} else {
			// TODO: authenticate pubkey
			return nil
		}

		attempt++

		if attempt > maxRestarts {
			return errors.New("Reached max restarts")
		}
	}
}

func (n *Node) NumValidators() (height int64, num int, err error) {
	height, vals, err := n.validators()
	if err != nil {
		return 0, 0, err
	}
	return height, len(vals), nil
}

func (n *Node) validators() (height int64, validators []*tmtypes.Validator, err error) {
	vals := new(ctypes.ResultValidators)
	if _, err = n.rpcClient.Call("validators", nil, vals); err != nil {
		return 0, make([]*tmtypes.Validator, 0), err
	}
	return vals.BlockHeight, vals.Validators, nil
}

func (n *Node) checkIsValidatorLoop() {
	for {
		select {
		case <-n.quit:
			return
		case <-time.After(n.checkIsValidatorInterval):
			n.checkIsValidator()
		}
	}
}

func (n *Node) checkIsValidator() {
	_, validators, err := n.validators()
	if err == nil {
		for _, v := range validators {
			key, err1 := n.getPubKey()
			// TODO: use bytes.Equal
			if err1 == nil && v.PubKey == key {
				n.IsValidator = true
			}
		}
	} else {
		n.logger.Info("check is validator failed", "err", err)
	}
}

func (n *Node) getPubKey() (crypto.PubKey, error) {
	if n.pubKey != nil {
		return n.pubKey, nil
	}

	status := new(ctypes.ResultStatus)
	_, err := n.rpcClient.Call("status", nil, status)
	if err != nil {
		return nil, err
	}
	n.pubKey = status.ValidatorInfo.PubKey
	return n.pubKey, nil
}

type eventMeter interface {
	Start() error
	Stop()
	RegisterLatencyCallback(em.LatencyCallbackFunc)
	RegisterDisconnectCallback(em.DisconnectCallbackFunc)
	Subscribe(string, em.EventCallbackFunc) error
	Unsubscribe(string) error
	SetLogger(l log.Logger)
}

// UnmarshalEvent unmarshals a JSON event.
func UnmarshalEvent(b json.RawMessage) (string, events.EventData, error) {
	event := new(ctypes.ResultEvent)
	if err := cdc.UnmarshalJSON(b, event); err != nil {
		return "", nil, err
	}
	return event.Query, event.Data, nil
}
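RestartEventMeterBackoff sleeps 2^attempt seconds between reconnect attempts, up to maxRestarts (25); note the tail of that schedule grows very long (2^25 s is roughly 388 days). A standalone sketch of the first few delays in the schedule:

	package main

	import (
		"fmt"
		"math"
		"time"
	)

	func main() {
		// First few delays of the 2^attempt-second schedule used by
		// RestartEventMeterBackoff: 1s, 2s, 4s, 8s, 16s, 32s, ...
		for attempt := 0; attempt <= 5; attempt++ {
			d := time.Duration(math.Exp2(float64(attempt))) * time.Second
			fmt.Printf("attempt %d: sleep %v\n", attempt, d)
		}
	}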
93  tools/tm-monitor/monitor/node_test.go  Normal file
@@ -0,0 +1,93 @@
package monitor_test

import (
	"testing"

	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"

	crypto "github.com/tendermint/go-crypto"
	ctypes "github.com/tendermint/tendermint/rpc/core/types"
	tmtypes "github.com/tendermint/tendermint/types"
	em "github.com/tendermint/tools/tm-monitor/eventmeter"
	mock "github.com/tendermint/tools/tm-monitor/mock"
	monitor "github.com/tendermint/tools/tm-monitor/monitor"
	"github.com/tendermint/go-amino"
)

const (
	blockHeight = int64(1)
)

func TestNodeStartStop(t *testing.T) {
	n, _ := startValidatorNode(t)
	defer n.Stop()

	assert.Equal(t, true, n.Online)
	assert.Equal(t, true, n.IsValidator)
}

func TestNodeNewBlockReceived(t *testing.T) {
	blockCh := make(chan tmtypes.Header, 100)
	n, emMock := startValidatorNode(t)
	defer n.Stop()
	n.SendBlocksTo(blockCh)

	blockHeader := &tmtypes.Header{Height: 5}
	emMock.Call("eventCallback", &em.EventMetric{}, tmtypes.EventDataNewBlockHeader{blockHeader})

	assert.Equal(t, int64(5), n.Height)
	assert.Equal(t, *blockHeader, <-blockCh)
}

func TestNodeNewBlockLatencyReceived(t *testing.T) {
	blockLatencyCh := make(chan float64, 100)
	n, emMock := startValidatorNode(t)
	defer n.Stop()
	n.SendBlockLatenciesTo(blockLatencyCh)

	emMock.Call("latencyCallback", 1000000.0)

	assert.Equal(t, 1.0, n.BlockLatency)
	assert.Equal(t, 1000000.0, <-blockLatencyCh)
}

func TestNodeConnectionLost(t *testing.T) {
	disconnectCh := make(chan bool, 100)
	n, emMock := startValidatorNode(t)
	defer n.Stop()
	n.NotifyAboutDisconnects(disconnectCh)

	emMock.Call("disconnectCallback")

	assert.Equal(t, true, <-disconnectCh)
	assert.Equal(t, false, n.Online)
}

func TestNumValidators(t *testing.T) {
	n, _ := startValidatorNode(t)
	defer n.Stop()

	height, num, err := n.NumValidators()
	assert.Nil(t, err)
	assert.Equal(t, blockHeight, height)
	assert.Equal(t, 1, num)
}

func startValidatorNode(t *testing.T) (n *monitor.Node, emMock *mock.EventMeter) {
	emMock = &mock.EventMeter{}

	stubs := make(map[string]interface{})
	pubKey := crypto.GenPrivKeyEd25519().PubKey()
	stubs["validators"] = ctypes.ResultValidators{BlockHeight: blockHeight, Validators: []*tmtypes.Validator{tmtypes.NewValidator(pubKey, 0)}}
	stubs["status"] = ctypes.ResultStatus{ValidatorInfo: ctypes.ValidatorInfo{PubKey: pubKey}}
	cdc := amino.NewCodec()
	rpcClientMock := &mock.RpcClient{Stubs: stubs}
	rpcClientMock.SetCodec(cdc)

	n = monitor.NewNodeWithEventMeterAndRpcClient("tcp://127.0.0.1:26657", emMock, rpcClientMock)

	err := n.Start()
	require.Nil(t, err)
	return
}
12  tools/tm-monitor/monitor/wire.go  Normal file
@@ -0,0 +1,12 @@
package monitor

import (
	amino "github.com/tendermint/go-amino"
	ctypes "github.com/tendermint/tendermint/rpc/core/types"
)

var cdc = amino.NewCodec()

func init() {
	ctypes.RegisterAmino(cdc)
}
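wire.go gives the package a single amino codec with the RPC result types registered; UnmarshalEvent in node.go decodes incoming events through it. A minimal sketch of round-tripping JSON through an amino codec, assuming go-amino's MarshalJSON/UnmarshalJSON; the Event type here is made up for illustration:

	package main

	import (
		"fmt"

		amino "github.com/tendermint/go-amino"
	)

	// Event is a hypothetical type for illustration only.
	type Event struct {
		Query string `json:"query"`
	}

	func main() {
		cdc := amino.NewCodec()

		bz, err := cdc.MarshalJSON(Event{Query: "tm.event = 'NewBlockHeader'"})
		if err != nil {
			panic(err)
		}

		var e Event
		if err := cdc.UnmarshalJSON(bz, &e); err != nil {
			panic(err)
		}
		fmt.Println(e.Query) // prints the query we encoded above
	}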