diff --git a/CHANGELOG.md b/CHANGELOG.md index 5afc0221..f69127d5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,10 @@ # Changelog +## TBD + +FEATURES: +- [node] added metrics (served under /metrics using a Prometheus client; disabled by default) + ## 0.20.1 BUG FIXES: diff --git a/Gopkg.lock b/Gopkg.lock index 8019acb5..e45b84d1 100644 --- a/Gopkg.lock +++ b/Gopkg.lock @@ -1,6 +1,12 @@ # This file is autogenerated, do not edit; changes may be undone by the next 'dep ensure'. +[[projects]] + branch = "master" + name = "github.com/beorn7/perks" + packages = ["quantile"] + revision = "3a771d992973f24aa725d07868b467d1ddfceafb" + [[projects]] branch = "master" name = "github.com/btcsuite/btcd" @@ -36,7 +42,11 @@ packages = [ "log", "log/level", - "log/term" + "log/term", + "metrics", + "metrics/discard", + "metrics/internal/lv", + "metrics/prometheus" ] revision = "4dc7be5d2d12881735283bcab7352178e190fc71" version = "v0.6.0" @@ -131,6 +141,12 @@ revision = "c2353362d570a7bfa228149c62842019201cfb71" version = "v1.8.0" +[[projects]] + name = "github.com/matttproud/golang_protobuf_extensions" + packages = ["pbutil"] + revision = "c12348ce28de40eed0136aa2b644d0ee0650e56c" + version = "v1.0.1" + [[projects]] branch = "master" name = "github.com/mitchellh/mapstructure" @@ -155,6 +171,42 @@ revision = "792786c7400a136282c1664665ae0a8db921c6c2" version = "v1.0.0" +[[projects]] + name = "github.com/prometheus/client_golang" + packages = [ + "prometheus", + "prometheus/promhttp" + ] + revision = "c5b7fccd204277076155f10851dad72b76a49317" + version = "v0.8.0" + +[[projects]] + branch = "master" + name = "github.com/prometheus/client_model" + packages = ["go"] + revision = "99fa1f4be8e564e8a6b613da7fa6f46c9edafc6c" + +[[projects]] + branch = "master" + name = "github.com/prometheus/common" + packages = [ + "expfmt", + "internal/bitbucket.org/ww/goautoneg", + "model" + ] + revision = "7600349dcfe1abd18d72d3a1770870d9800a7801" + +[[projects]] + branch = "master" + name = "github.com/prometheus/procfs" + packages = [ + ".", + "internal/util", + "nfs", + "xfs" + ] + revision = "94663424ae5ae9856b40a9f170762b4197024661" + [[projects]] branch = "master" name = "github.com/rcrowley/go-metrics" @@ -374,6 +426,6 @@ [solve-meta] analyzer-name = "dep" analyzer-version = 1 - inputs-digest = "62d0626fa963b25411234da915fb3b8bfc7aefffd76824d27ac1647ca2b5d489" + inputs-digest = "3bd388e520a08cd0aa14df2d6f5ecb46449d7c36fd80cf52eb775798e6accbaa" solver-name = "gps-cdcl" solver-version = 1 diff --git a/Gopkg.toml b/Gopkg.toml index b7393a47..13339e55 100644 --- a/Gopkg.toml +++ b/Gopkg.toml @@ -97,3 +97,7 @@ [prune] go-tests = true unused-packages = true + +[[constraint]] + name = "github.com/prometheus/client_golang" + version = "0.8.0" diff --git a/config/config.go b/config/config.go index 5ba568f2..6a283a82 100644 --- a/config/config.go +++ b/config/config.go @@ -45,34 +45,37 @@ type Config struct { BaseConfig `mapstructure:",squash"` // Options for services - RPC *RPCConfig `mapstructure:"rpc"` - P2P *P2PConfig `mapstructure:"p2p"` - Mempool *MempoolConfig `mapstructure:"mempool"` - Consensus *ConsensusConfig `mapstructure:"consensus"` - TxIndex *TxIndexConfig `mapstructure:"tx_index"` + RPC *RPCConfig `mapstructure:"rpc"` + P2P *P2PConfig `mapstructure:"p2p"` + Mempool *MempoolConfig `mapstructure:"mempool"` + Consensus *ConsensusConfig `mapstructure:"consensus"` + TxIndex *TxIndexConfig `mapstructure:"tx_index"` + Instrumentation *InstrumentationConfig `mapstructure:"instrumentation"` } // DefaultConfig returns a default configuration for a Tendermint node func DefaultConfig() *Config { return &Config{ - BaseConfig: DefaultBaseConfig(), - RPC: DefaultRPCConfig(), - P2P: DefaultP2PConfig(), - Mempool: DefaultMempoolConfig(), - Consensus: DefaultConsensusConfig(), - TxIndex: DefaultTxIndexConfig(), + BaseConfig: DefaultBaseConfig(), + RPC: DefaultRPCConfig(), + P2P: DefaultP2PConfig(), + Mempool: DefaultMempoolConfig(), + Consensus: DefaultConsensusConfig(), + TxIndex: DefaultTxIndexConfig(), + Instrumentation: DefaultInstrumentationConfig(), } } // TestConfig returns a configuration that can be used for testing func TestConfig() *Config { return &Config{ - BaseConfig: TestBaseConfig(), - RPC: TestRPCConfig(), - P2P: TestP2PConfig(), - Mempool: TestMempoolConfig(), - Consensus: TestConsensusConfig(), - TxIndex: TestTxIndexConfig(), + BaseConfig: TestBaseConfig(), + RPC: TestRPCConfig(), + P2P: TestP2PConfig(), + Mempool: TestMempoolConfig(), + Consensus: TestConsensusConfig(), + TxIndex: TestTxIndexConfig(), + Instrumentation: TestInstrumentationConfig(), } } @@ -411,7 +414,7 @@ func (cfg *MempoolConfig) WalDir() string { //----------------------------------------------------------------------------- // ConsensusConfig -// ConsensusConfig defines the confuguration for the Tendermint consensus service, +// ConsensusConfig defines the configuration for the Tendermint consensus service, // including timeouts and details about the WAL and the block structure. type ConsensusConfig struct { RootDir string `mapstructure:"home"` @@ -536,14 +539,14 @@ func (cfg *ConsensusConfig) SetWalFile(walFile string) { //----------------------------------------------------------------------------- // TxIndexConfig -// TxIndexConfig defines the confuguration for the transaction +// TxIndexConfig defines the configuration for the transaction // indexer, including tags to index. type TxIndexConfig struct { // What indexer to use for transactions // // Options: - // 1) "null" (default) - // 2) "kv" - the simplest possible indexer, backed by key-value storage (defaults to levelDB; see DBBackend). + // 1) "null" + // 2) "kv" (default) - the simplest possible indexer, backed by key-value storage (defaults to levelDB; see DBBackend). Indexer string `mapstructure:"indexer"` // Comma-separated list of tags to index (by default the only tag is tx hash) @@ -573,6 +576,35 @@ func TestTxIndexConfig() *TxIndexConfig { return DefaultTxIndexConfig() } +//----------------------------------------------------------------------------- +// InstrumentationConfig + +// InstrumentationConfig defines the configuration for metrics reporting. +type InstrumentationConfig struct { + // When true, Prometheus metrics are served under /metrics on + // PrometheusListenAddr. + // Check out the documentation for the list of available metrics. + Prometheus bool `mapstructure:"prometheus"` + + // Address to listen for Prometheus collector(s) connections. + PrometheusListenAddr string `mapstructure:"prometheus_listen_addr"` +} + +// DefaultInstrumentationConfig returns a default configuration for metrics +// reporting. +func DefaultInstrumentationConfig() *InstrumentationConfig { + return &InstrumentationConfig{ + Prometheus: false, + PrometheusListenAddr: ":26660", + } +} + +// TestInstrumentationConfig returns a default configuration for metrics +// reporting. +func TestInstrumentationConfig() *InstrumentationConfig { + return DefaultInstrumentationConfig() +} + //----------------------------------------------------------------------------- // Utils diff --git a/config/toml.go b/config/toml.go index 7ed3e971..c3d41a9b 100644 --- a/config/toml.go +++ b/config/toml.go @@ -232,6 +232,17 @@ index_tags = "{{ .TxIndex.IndexTags }}" # desirable (see the comment above). IndexTags has a precedence over # IndexAllTags (i.e. when given both, IndexTags will be indexed). index_all_tags = {{ .TxIndex.IndexAllTags }} + +##### instrumentation configuration options ##### +[instrumentation] + +# When true, Prometheus metrics are served under /metrics on +# PrometheusListenAddr. +# Check out the documentation for the list of available metrics. +prometheus = {{ .Instrumentation.Prometheus }} + +# Address to listen for Prometheus collector(s) connections +prometheus_listen_addr = "{{ .Instrumentation.PrometheusListenAddr }}" ` /****** these are for test settings ***********/ diff --git a/consensus/metrics.go b/consensus/metrics.go new file mode 100644 index 00000000..253880e8 --- /dev/null +++ b/consensus/metrics.go @@ -0,0 +1,133 @@ +package consensus + +import ( + "github.com/go-kit/kit/metrics" + "github.com/go-kit/kit/metrics/discard" + + prometheus "github.com/go-kit/kit/metrics/prometheus" + stdprometheus "github.com/prometheus/client_golang/prometheus" +) + +// Metrics contains metrics exposed by this package. +type Metrics struct { + // Height of the chain. + Height metrics.Gauge + + // Number of rounds. + Rounds metrics.Gauge + + // Number of validators. + Validators metrics.Gauge + // Total power of all validators. + ValidatorsPower metrics.Gauge + // Number of validators who did not sign. + MissingValidators metrics.Gauge + // Total power of the missing validators. + MissingValidatorsPower metrics.Gauge + // Number of validators who tried to double sign. + ByzantineValidators metrics.Gauge + // Total power of the byzantine validators. + ByzantineValidatorsPower metrics.Gauge + + // Time between this and the last block. + BlockIntervalSeconds metrics.Histogram + + // Number of transactions. + NumTxs metrics.Gauge + // Size of the block. + BlockSizeBytes metrics.Gauge + // Total number of transactions. + TotalTxs metrics.Gauge +} + +// PrometheusMetrics returns Metrics build using Prometheus client library. +func PrometheusMetrics() *Metrics { + return &Metrics{ + Height: prometheus.NewGaugeFrom(stdprometheus.GaugeOpts{ + Subsystem: "consensus", + Name: "height", + Help: "Height of the chain.", + }, []string{}), + Rounds: prometheus.NewGaugeFrom(stdprometheus.GaugeOpts{ + Subsystem: "consensus", + Name: "rounds", + Help: "Number of rounds.", + }, []string{}), + + Validators: prometheus.NewGaugeFrom(stdprometheus.GaugeOpts{ + Subsystem: "consensus", + Name: "validators", + Help: "Number of validators.", + }, []string{}), + ValidatorsPower: prometheus.NewGaugeFrom(stdprometheus.GaugeOpts{ + Subsystem: "consensus", + Name: "validators_power", + Help: "Total power of all validators.", + }, []string{}), + MissingValidators: prometheus.NewGaugeFrom(stdprometheus.GaugeOpts{ + Subsystem: "consensus", + Name: "missing_validators", + Help: "Number of validators who did not sign.", + }, []string{}), + MissingValidatorsPower: prometheus.NewGaugeFrom(stdprometheus.GaugeOpts{ + Subsystem: "consensus", + Name: "missing_validators_power", + Help: "Total power of the missing validators.", + }, []string{}), + ByzantineValidators: prometheus.NewGaugeFrom(stdprometheus.GaugeOpts{ + Subsystem: "consensus", + Name: "byzantine_validators", + Help: "Number of validators who tried to double sign.", + }, []string{}), + ByzantineValidatorsPower: prometheus.NewGaugeFrom(stdprometheus.GaugeOpts{ + Subsystem: "consensus", + Name: "byzantine_validators_power", + Help: "Total power of the byzantine validators.", + }, []string{}), + + BlockIntervalSeconds: prometheus.NewHistogramFrom(stdprometheus.HistogramOpts{ + Subsystem: "consensus", + Name: "block_interval_seconds", + Help: "Time between this and the last block.", + Buckets: []float64{1, 2.5, 5, 10, 60}, + }, []string{}), + + NumTxs: prometheus.NewGaugeFrom(stdprometheus.GaugeOpts{ + Subsystem: "consensus", + Name: "num_txs", + Help: "Number of transactions.", + }, []string{}), + BlockSizeBytes: prometheus.NewGaugeFrom(stdprometheus.GaugeOpts{ + Subsystem: "consensus", + Name: "block_size_bytes", + Help: "Size of the block.", + }, []string{}), + TotalTxs: prometheus.NewGaugeFrom(stdprometheus.GaugeOpts{ + Subsystem: "consensus", + Name: "total_txs", + Help: "Total number of transactions.", + }, []string{}), + } +} + +// NopMetrics returns no-op Metrics. +func NopMetrics() *Metrics { + return &Metrics{ + Height: discard.NewGauge(), + + Rounds: discard.NewGauge(), + + Validators: discard.NewGauge(), + ValidatorsPower: discard.NewGauge(), + MissingValidators: discard.NewGauge(), + MissingValidatorsPower: discard.NewGauge(), + ByzantineValidators: discard.NewGauge(), + ByzantineValidatorsPower: discard.NewGauge(), + + BlockIntervalSeconds: discard.NewHistogram(), + + NumTxs: discard.NewGauge(), + BlockSizeBytes: discard.NewGauge(), + TotalTxs: discard.NewGauge(), + } +} diff --git a/consensus/state.go b/consensus/state.go index d46ec583..aab82296 100644 --- a/consensus/state.go +++ b/consensus/state.go @@ -115,10 +115,24 @@ type ConsensusState struct { // synchronous pubsub between consensus state and reactor. // state only emits EventNewRoundStep, EventVote and EventProposalHeartbeat evsw tmevents.EventSwitch + + // for reporting metrics + metrics *Metrics } +// CSOption sets an optional parameter on the ConsensusState. +type CSOption func(*ConsensusState) + // NewConsensusState returns a new ConsensusState. -func NewConsensusState(config *cfg.ConsensusConfig, state sm.State, blockExec *sm.BlockExecutor, blockStore sm.BlockStore, mempool sm.Mempool, evpool sm.EvidencePool) *ConsensusState { +func NewConsensusState( + config *cfg.ConsensusConfig, + state sm.State, + blockExec *sm.BlockExecutor, + blockStore sm.BlockStore, + mempool sm.Mempool, + evpool sm.EvidencePool, + options ...CSOption, +) *ConsensusState { cs := &ConsensusState{ config: config, blockExec: blockExec, @@ -132,6 +146,7 @@ func NewConsensusState(config *cfg.ConsensusConfig, state sm.State, blockExec *s wal: nilWAL{}, evpool: evpool, evsw: tmevents.NewEventSwitch(), + metrics: NopMetrics(), } // set function defaults (may be overwritten before calling Start) cs.decideProposal = cs.defaultDecideProposal @@ -143,6 +158,9 @@ func NewConsensusState(config *cfg.ConsensusConfig, state sm.State, blockExec *s // We do that upon Start(). cs.reconstructLastCommit(state) cs.BaseService = *cmn.NewBaseService(nil, "ConsensusState", cs) + for _, option := range options { + option(cs) + } return cs } @@ -161,6 +179,11 @@ func (cs *ConsensusState) SetEventBus(b *types.EventBus) { cs.blockExec.SetEventBus(b) } +// WithMetrics sets the metrics. +func WithMetrics(metrics *Metrics) CSOption { + return func(cs *ConsensusState) { cs.metrics = metrics } +} + // String returns a string. func (cs *ConsensusState) String() string { // better not to access shared variables @@ -387,6 +410,7 @@ func (cs *ConsensusState) SetProposalAndBlock(proposal *types.Proposal, block *t // internal functions for managing the state func (cs *ConsensusState) updateHeight(height int64) { + cs.metrics.Height.Set(float64(height)) cs.Height = height } @@ -718,6 +742,7 @@ func (cs *ConsensusState) enterNewRound(height int64, round int) { cs.Votes.SetRound(round + 1) // also track next round (round+1) to allow round-skipping cs.eventBus.PublishEventNewRound(cs.RoundStateEvent()) + cs.metrics.Rounds.Set(float64(round)) // Wait for txs to be available in the mempool // before we enterPropose in round 0. If the last block changed the app hash, @@ -1276,6 +1301,9 @@ func (cs *ConsensusState) finalizeCommit(height int64) { fail.Fail() // XXX + // must be called before we update state + cs.recordMetrics(height, block) + // NewHeightStep! cs.updateToState(stateCopy) @@ -1291,6 +1319,44 @@ func (cs *ConsensusState) finalizeCommit(height int64) { // * cs.StartTime is set to when we will start round0. } +func (cs *ConsensusState) recordMetrics(height int64, block *types.Block) { + cs.metrics.Validators.Set(float64(cs.Validators.Size())) + cs.metrics.ValidatorsPower.Set(float64(cs.Validators.TotalVotingPower())) + missingValidators := 0 + missingValidatorsPower := int64(0) + for i, val := range cs.Validators.Validators { + var vote *types.Vote + if i < len(block.LastCommit.Precommits) { + vote = block.LastCommit.Precommits[i] + } + if vote == nil { + missingValidators++ + missingValidatorsPower += val.VotingPower + } + } + cs.metrics.MissingValidators.Set(float64(missingValidators)) + cs.metrics.MissingValidatorsPower.Set(float64(missingValidatorsPower)) + cs.metrics.ByzantineValidators.Set(float64(len(block.Evidence.Evidence))) + byzantineValidatorsPower := int64(0) + for _, ev := range block.Evidence.Evidence { + if _, val := cs.Validators.GetByAddress(ev.Address()); val != nil { + byzantineValidatorsPower += val.VotingPower + } + } + cs.metrics.ByzantineValidatorsPower.Set(float64(byzantineValidatorsPower)) + + if height > 1 { + lastBlockMeta := cs.blockStore.LoadBlockMeta(height - 1) + cs.metrics.BlockIntervalSeconds.Observe( + block.Time.Sub(lastBlockMeta.Header.Time).Seconds(), + ) + } + + cs.metrics.NumTxs.Set(float64(block.NumTxs)) + cs.metrics.BlockSizeBytes.Set(float64(block.Size())) + cs.metrics.TotalTxs.Set(float64(block.TotalTxs)) +} + //----------------------------------------------------------------------------- func (cs *ConsensusState) defaultSetProposal(proposal *types.Proposal) error { diff --git a/docs/architecture/adr-011-monitoring.md b/docs/architecture/adr-011-monitoring.md new file mode 100644 index 00000000..ca16a9a1 --- /dev/null +++ b/docs/architecture/adr-011-monitoring.md @@ -0,0 +1,116 @@ +# ADR 011: Monitoring + +## Changelog + +08-06-2018: Initial draft +11-06-2018: Reorg after @xla comments +13-06-2018: Clarification about usage of labels + +## Context + +In order to bring more visibility into Tendermint, we would like it to report +metrics and, maybe later, traces of transactions and RPC queries. See +https://github.com/tendermint/tendermint/issues/986. + +A few solutions were considered: + +1. [Prometheus](https://prometheus.io) + a) Prometheus API + b) [go-kit metrics package](https://github.com/go-kit/kit/tree/master/metrics) as an interface plus Prometheus + c) [telegraf](https://github.com/influxdata/telegraf) + d) new service, which will listen to events emitted by pubsub and report metrics +5. [OpenCensus](https://opencensus.io/go/index.html) + +### 1. Prometheus + +Prometheus seems to be the most popular product out there for monitoring. It has +a Go client library, powerful queries, alerts. + +**a) Prometheus API** + +We can commit to using Prometheus in Tendermint, but I think Tendermint users +should be free to choose whatever monitoring tool they feel will better suit +their needs (if they don't have existing one already). So we should try to +abstract interface enough so people can switch between Prometheus and other +similar tools. + +**b) go-kit metrics package as an interface** + +metrics package provides a set of uniform interfaces for service +instrumentation and offers adapters to popular metrics packages: + +https://godoc.org/github.com/go-kit/kit/metrics#pkg-subdirectories + +Comparing to Prometheus API, we're losing customisability and control, but gaining +freedom in choosing any instrument from the above list given we will extract +metrics creation into a separate function (see "providers" in node/node.go). + +**c) telegraf** + +Unlike already discussed options, telegraf does not require modifying Tendermint +source code. You create something called an input plugin, which polls +Tendermint RPC every second and calculates the metrics itself. + +While it may sound good, but some metrics we want to report are not exposed via +RPC or pubsub, therefore can't be accessed externally. + +**d) service, listening to pubsub** + +Same issue as the above. + +### 2. opencensus + +opencensus provides both metrics and tracing, which may be important in the +future. It's API looks different from go-kit and Prometheus, but looks like it +covers everything we need. + +Unfortunately, OpenCensus go client does not define any +interfaces, so if we want to abstract away metrics we +will need to write interfaces ourselves. + +### List of metrics + +| | Name | Type | Description | +| - | --------------------------------------- | ------- | ----------------------------------------------------------------------------- | +| A | consensus_height | Gauge | | +| A | consensus_validators | Gauge | Number of validators who signed | +| A | consensus_validators_power | Gauge | Total voting power of all validators | +| A | consensus_missing_validators | Gauge | Number of validators who did not sign | +| A | consensus_missing_validators_power | Gauge | Total voting power of the missing validators | +| A | consensus_byzantine_validators | Gauge | Number of validators who tried to double sign | +| A | consensus_byzantine_validators_power | Gauge | Total voting power of the byzantine validators | +| A | consensus_block_interval | Timing | Time between this and last block (Block.Header.Time) | +| | consensus_block_time | Timing | Time to create a block (from creating a proposal to commit) | +| | consensus_time_between_blocks | Timing | Time between committing last block and (receiving proposal creating proposal) | +| A | consensus_rounds | Gauge | Number of rounds | +| | consensus_prevotes | Gauge | | +| | consensus_precommits | Gauge | | +| | consensus_prevotes_total_power | Gauge | | +| | consensus_precommits_total_power | Gauge | | +| A | consensus_num_txs | Gauge | | +| A | mempool_size | Gauge | | +| A | consensus_total_txs | Gauge | | +| A | consensus_block_size | Gauge | In bytes | +| A | p2p_peers | Gauge | Number of peers node's connected to | + +`A` - will be implemented in the fist place. + +**Proposed solution** + +## Status + +Proposed. + +## Consequences + +### Positive + +Better visibility, support of variety of monitoring backends + +### Negative + +One more library to audit, messing metrics reporting code with business domain. + +### Neutral + +- diff --git a/docs/index.rst b/docs/index.rst index b40db3bf..e7d86bc2 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -52,6 +52,7 @@ ABCI, Apps, Logging, Etc indexing-transactions.md how-to-read-logs.md running-in-production.md + metrics.md Research & Specification ------------------------ diff --git a/docs/metrics.md b/docs/metrics.md new file mode 100644 index 00000000..1cd758b9 --- /dev/null +++ b/docs/metrics.md @@ -0,0 +1,40 @@ +# Metrics + +Tendermint can report and serve the Prometheus metrics, which in their turn can +be consumed by Prometheus collector(s). + +This functionality is disabled by default. + +To enable the Prometheus metrics, set `instrumentation.prometheus=true` if your +config file. Metrics will be served under `/metrics` on 26660 port by default. +Listen address can be changed in the config file (see +`prometheus_listen_addr`). + +## List of available metrics + +The following metrics are available: + +| Name | Type | Since | Description | +| --------------------------------------- | ------- | --------- | ----------------------------------------------------------------------------- | +| consensus_height | Gauge | 0.20.1 | Height of the chain | +| consensus_validators | Gauge | 0.20.1 | Number of validators | +| consensus_validators_power | Gauge | 0.20.1 | Total voting power of all validators | +| consensus_missing_validators | Gauge | 0.20.1 | Number of validators who did not sign | +| consensus_missing_validators_power | Gauge | 0.20.1 | Total voting power of the missing validators | +| consensus_byzantine_validators | Gauge | 0.20.1 | Number of validators who tried to double sign | +| consensus_byzantine_validators_power | Gauge | 0.20.1 | Total voting power of the byzantine validators | +| consensus_block_interval_seconds | Histogram | 0.20.1 | Time between this and last block (Block.Header.Time) in seconds | +| consensus_rounds | Gauge | 0.20.1 | Number of rounds | +| consensus_num_txs | Gauge | 0.20.1 | Number of transactions | +| mempool_size | Gauge | 0.20.1 | Number of uncommitted transactions | +| consensus_total_txs | Gauge | 0.20.1 | Total number of transactions committed | +| consensus_block_size_bytes | Gauge | 0.20.1 | Block size in bytes | +| p2p_peers | Gauge | 0.20.1 | Number of peers node's connected to | + +## Useful queries + +Percentage of missing + byzantine validators: + +``` +((consensus_byzantine_validators_power + consensus_missing_validators_power) / consensus_validators_power) * 100 +``` diff --git a/docs/specification/configuration.md b/docs/specification/configuration.md index d39aa3cb..08981c06 100644 --- a/docs/specification/configuration.md +++ b/docs/specification/configuration.md @@ -171,19 +171,30 @@ peer_query_maj23_sleep_duration = 2000 # What indexer to use for transactions # # Options: -# 1) "null" (default) -# 2) "kv" - the simplest possible indexer, backed by key-value storage (defaults to levelDB; see DBBackend). -indexer = "{{ .TxIndex.Indexer }}" +# 1) "null" +# 2) "kv" (default) - the simplest possible indexer, backed by key-value storage (defaults to levelDB; see DBBackend). +indexer = "kv" # Comma-separated list of tags to index (by default the only tag is tx hash) # # It's recommended to index only a subset of tags due to possible memory # bloat. This is, of course, depends on the indexer's DB and the volume of # transactions. -index_tags = "{{ .TxIndex.IndexTags }}" +index_tags = "" # When set to true, tells indexer to index all tags. Note this may be not # desirable (see the comment above). IndexTags has a precedence over # IndexAllTags (i.e. when given both, IndexTags will be indexed). -index_all_tags = {{ .TxIndex.IndexAllTags }} +index_all_tags = false + +##### instrumentation configuration options ##### +[instrumentation] + +# When true, Prometheus metrics are served under /metrics on +# PrometheusListenAddr. +# Check out the documentation for the list of available metrics. +prometheus = false + +# Address to listen for Prometheus collector(s) connections +prometheus_listen_addr = ":26660" ``` diff --git a/mempool/mempool.go b/mempool/mempool.go index 5af16b3c..418470a7 100644 --- a/mempool/mempool.go +++ b/mempool/mempool.go @@ -83,10 +83,20 @@ type Mempool struct { wal *auto.AutoFile logger log.Logger + + metrics *Metrics } +// MempoolOption sets an optional parameter on the Mempool. +type MempoolOption func(*Mempool) + // NewMempool returns a new Mempool with the given configuration and connection to an application. -func NewMempool(config *cfg.MempoolConfig, proxyAppConn proxy.AppConnMempool, height int64) *Mempool { +func NewMempool( + config *cfg.MempoolConfig, + proxyAppConn proxy.AppConnMempool, + height int64, + options ...MempoolOption, +) *Mempool { mempool := &Mempool{ config: config, proxyAppConn: proxyAppConn, @@ -98,8 +108,12 @@ func NewMempool(config *cfg.MempoolConfig, proxyAppConn proxy.AppConnMempool, he recheckEnd: nil, logger: log.NewNopLogger(), cache: newTxCache(config.CacheSize), + metrics: NopMetrics(), } proxyAppConn.SetResponseCallback(mempool.resCb) + for _, option := range options { + option(mempool) + } return mempool } @@ -115,6 +129,11 @@ func (mem *Mempool) SetLogger(l log.Logger) { mem.logger = l } +// WithMetrics sets the metrics. +func WithMetrics(metrics *Metrics) MempoolOption { + return func(mem *Mempool) { mem.metrics = metrics } +} + // CloseWAL closes and discards the underlying WAL file. // Any further writes will not be relayed to disk. func (mem *Mempool) CloseWAL() bool { @@ -250,6 +269,7 @@ func (mem *Mempool) resCb(req *abci.Request, res *abci.Response) { } else { mem.resCbRecheck(req, res) } + mem.metrics.Size.Set(float64(mem.Size())) } func (mem *Mempool) resCbNormal(req *abci.Request, res *abci.Response) { @@ -393,6 +413,7 @@ func (mem *Mempool) Update(height int64, txs types.Txs) error { // mem.recheckCursor re-scans mem.txs and possibly removes some txs. // Before mem.Reap(), we should wait for mem.recheckCursor to be nil. } + mem.metrics.Size.Set(float64(mem.Size())) return nil } diff --git a/mempool/metrics.go b/mempool/metrics.go new file mode 100644 index 00000000..f381678c --- /dev/null +++ b/mempool/metrics.go @@ -0,0 +1,34 @@ +package mempool + +import ( + "github.com/go-kit/kit/metrics" + "github.com/go-kit/kit/metrics/discard" + + prometheus "github.com/go-kit/kit/metrics/prometheus" + stdprometheus "github.com/prometheus/client_golang/prometheus" +) + +// Metrics contains metrics exposed by this package. +// see MetricsProvider for descriptions. +type Metrics struct { + // Size of the mempool. + Size metrics.Gauge +} + +// PrometheusMetrics returns Metrics build using Prometheus client library. +func PrometheusMetrics() *Metrics { + return &Metrics{ + Size: prometheus.NewGaugeFrom(stdprometheus.GaugeOpts{ + Subsystem: "mempool", + Name: "size", + Help: "Size of the mempool (number of uncommitted transactions).", + }, []string{}), + } +} + +// NopMetrics returns no-op Metrics. +func NopMetrics() *Metrics { + return &Metrics{ + Size: discard.NewGauge(), + } +} diff --git a/node/node.go b/node/node.go index efeb17ee..dc79cff7 100644 --- a/node/node.go +++ b/node/node.go @@ -2,11 +2,14 @@ package node import ( "bytes" + "context" "errors" "fmt" "net" "net/http" + "github.com/prometheus/client_golang/prometheus/promhttp" + abci "github.com/tendermint/abci/types" amino "github.com/tendermint/go-amino" crypto "github.com/tendermint/go-crypto" @@ -81,10 +84,25 @@ func DefaultNewNode(config *cfg.Config, logger log.Logger) (*Node, error) { proxy.DefaultClientCreator(config.ProxyApp, config.ABCI, config.DBDir()), DefaultGenesisDocProviderFunc(config), DefaultDBProvider, + DefaultMetricsProvider, logger, ) } +// MetricsProvider returns a consensus, p2p and mempool Metrics. +type MetricsProvider func() (*cs.Metrics, *p2p.Metrics, *mempl.Metrics) + +// DefaultMetricsProvider returns consensus, p2p and mempool Metrics build +// using Prometheus client library. +func DefaultMetricsProvider() (*cs.Metrics, *p2p.Metrics, *mempl.Metrics) { + return cs.PrometheusMetrics(), p2p.PrometheusMetrics(), mempl.PrometheusMetrics() +} + +// NopMetricsProvider returns consensus, p2p and mempool Metrics as no-op. +func NopMetricsProvider() (*cs.Metrics, *p2p.Metrics, *mempl.Metrics) { + return cs.NopMetrics(), p2p.NopMetrics(), mempl.NopMetrics() +} + //------------------------------------------------------------------------------ // Node is the highest level interface to a full Tendermint node. @@ -114,6 +132,7 @@ type Node struct { rpcListeners []net.Listener // rpc servers txIndexer txindex.TxIndexer indexerService *txindex.IndexerService + prometheusSrv *http.Server } // NewNode returns a new, ready to go, Tendermint Node. @@ -122,6 +141,7 @@ func NewNode(config *cfg.Config, clientCreator proxy.ClientCreator, genesisDocProvider GenesisDocProvider, dbProvider DBProvider, + metricsProvider MetricsProvider, logger log.Logger) (*Node, error) { // Get BlockStore @@ -208,11 +228,28 @@ func NewNode(config *cfg.Config, consensusLogger.Info("This node is not a validator", "addr", privValidator.GetAddress(), "pubKey", privValidator.GetPubKey()) } + // metrics + var ( + csMetrics *cs.Metrics + p2pMetrics *p2p.Metrics + memplMetrics *mempl.Metrics + ) + if config.Instrumentation.Prometheus { + csMetrics, p2pMetrics, memplMetrics = metricsProvider() + } else { + csMetrics, p2pMetrics, memplMetrics = NopMetricsProvider() + } + // Make MempoolReactor mempoolLogger := logger.With("module", "mempool") - mempool := mempl.NewMempool(config.Mempool, proxyApp.Mempool(), state.LastBlockHeight) - mempool.InitWAL() // no need to have the mempool wal during tests + mempool := mempl.NewMempool( + config.Mempool, + proxyApp.Mempool(), + state.LastBlockHeight, + mempl.WithMetrics(memplMetrics), + ) mempool.SetLogger(mempoolLogger) + mempool.InitWAL() // no need to have the mempool wal during tests mempoolReactor := mempl.NewMempoolReactor(config.Mempool, mempool) mempoolReactor.SetLogger(mempoolLogger) @@ -241,8 +278,15 @@ func NewNode(config *cfg.Config, bcReactor.SetLogger(logger.With("module", "blockchain")) // Make ConsensusReactor - consensusState := cs.NewConsensusState(config.Consensus, state.Copy(), - blockExec, blockStore, mempool, evidencePool) + consensusState := cs.NewConsensusState( + config.Consensus, + state.Copy(), + blockExec, + blockStore, + mempool, + evidencePool, + cs.WithMetrics(csMetrics), + ) consensusState.SetLogger(consensusLogger) if privValidator != nil { consensusState.SetPrivValidator(privValidator) @@ -252,7 +296,7 @@ func NewNode(config *cfg.Config, p2pLogger := logger.With("module", "p2p") - sw := p2p.NewSwitch(config.P2P) + sw := p2p.NewSwitch(config.P2P, p2p.WithMetrics(p2pMetrics)) sw.SetLogger(p2pLogger) sw.AddReactor("MEMPOOL", mempoolReactor) sw.AddReactor("BLOCKCHAIN", bcReactor) @@ -411,6 +455,10 @@ func (n *Node) OnStart() error { n.rpcListeners = listeners } + if n.config.Instrumentation.Prometheus { + n.prometheusSrv = n.startPrometheusServer(n.config.Instrumentation.PrometheusListenAddr) + } + // Start the switch (the P2P server). err = n.sw.Start() if err != nil { @@ -452,6 +500,13 @@ func (n *Node) OnStop() { n.Logger.Error("Error stopping priv validator socket client", "err", err) } } + + if n.prometheusSrv != nil { + if err := n.prometheusSrv.Shutdown(context.Background()); err != nil { + // Error from closing listeners, or context timeout: + n.Logger.Error("Prometheus HTTP server Shutdown", "err", err) + } + } } // RunForever waits for an interrupt signal and stops the node. @@ -527,6 +582,22 @@ func (n *Node) startRPC() ([]net.Listener, error) { return listeners, nil } +// startPrometheusServer starts a Prometheus HTTP server, listening for metrics +// collectors on addr. +func (n *Node) startPrometheusServer(addr string) *http.Server { + srv := &http.Server{ + Addr: addr, + Handler: promhttp.Handler(), + } + go func() { + if err := srv.ListenAndServe(); err != http.ErrServerClosed { + // Error starting or closing listener: + n.Logger.Error("Prometheus HTTP server ListenAndServe", "err", err) + } + }() + return srv +} + // Switch returns the Node's Switch. func (n *Node) Switch() *p2p.Switch { return n.sw diff --git a/p2p/metrics.go b/p2p/metrics.go new file mode 100644 index 00000000..ab876ee7 --- /dev/null +++ b/p2p/metrics.go @@ -0,0 +1,33 @@ +package p2p + +import ( + "github.com/go-kit/kit/metrics" + "github.com/go-kit/kit/metrics/discard" + + prometheus "github.com/go-kit/kit/metrics/prometheus" + stdprometheus "github.com/prometheus/client_golang/prometheus" +) + +// Metrics contains metrics exposed by this package. +type Metrics struct { + // Number of peers. + Peers metrics.Gauge +} + +// PrometheusMetrics returns Metrics build using Prometheus client library. +func PrometheusMetrics() *Metrics { + return &Metrics{ + Peers: prometheus.NewGaugeFrom(stdprometheus.GaugeOpts{ + Subsystem: "p2p", + Name: "peers", + Help: "Number of peers.", + }, []string{}), + } +} + +// NopMetrics returns no-op Metrics. +func NopMetrics() *Metrics { + return &Metrics{ + Peers: discard.NewGauge(), + } +} diff --git a/p2p/switch.go b/p2p/switch.go index f1ceee5c..bf5f9747 100644 --- a/p2p/switch.go +++ b/p2p/switch.go @@ -73,10 +73,15 @@ type Switch struct { mConfig conn.MConnConfig rng *cmn.Rand // seed for randomizing dial times and orders + + metrics *Metrics } +// SwitchOption sets an optional parameter on the Switch. +type SwitchOption func(*Switch) + // NewSwitch creates a new Switch with the given config. -func NewSwitch(cfg *config.P2PConfig) *Switch { +func NewSwitch(cfg *config.P2PConfig, options ...SwitchOption) *Switch { sw := &Switch{ config: cfg, reactors: make(map[string]Reactor), @@ -85,6 +90,7 @@ func NewSwitch(cfg *config.P2PConfig) *Switch { peers: NewPeerSet(), dialing: cmn.NewCMap(), reconnecting: cmn.NewCMap(), + metrics: NopMetrics(), } // Ensure we have a completely undeterministic PRNG. @@ -99,9 +105,19 @@ func NewSwitch(cfg *config.P2PConfig) *Switch { sw.mConfig = mConfig sw.BaseService = *cmn.NewBaseService(nil, "P2P Switch", sw) + + for _, option := range options { + option(sw) + } + return sw } +// WithMetrics sets the metrics. +func WithMetrics(metrics *Metrics) SwitchOption { + return func(sw *Switch) { sw.metrics = metrics } +} + //--------------------------------------------------------------------- // Switch setup @@ -279,6 +295,7 @@ func (sw *Switch) StopPeerGracefully(peer Peer) { func (sw *Switch) stopAndRemovePeer(peer Peer, reason interface{}) { sw.peers.Remove(peer) + sw.metrics.Peers.Add(float64(-1)) peer.Stop() for _, reactor := range sw.reactors { reactor.RemovePeer(peer, reason) @@ -623,6 +640,7 @@ func (sw *Switch) addPeer(pc peerConn) error { if err := sw.peers.Add(peer); err != nil { return err } + sw.metrics.Peers.Add(float64(1)) sw.Logger.Info("Added peer", "peer", peer) return nil diff --git a/rpc/test/helpers.go b/rpc/test/helpers.go index 5d18299b..b434c7d9 100644 --- a/rpc/test/helpers.go +++ b/rpc/test/helpers.go @@ -122,7 +122,9 @@ func NewTendermint(app abci.Application) *nm.Node { papp := proxy.NewLocalClientCreator(app) node, err := nm.NewNode(config, pv, papp, nm.DefaultGenesisDocProviderFunc(config), - nm.DefaultDBProvider, logger) + nm.DefaultDBProvider, + nm.DefaultMetricsProvider, + logger) if err != nil { panic(err) } diff --git a/types/block.go b/types/block.go index 3004672c..6adc0c4c 100644 --- a/types/block.go +++ b/types/block.go @@ -135,6 +135,15 @@ func (b *Block) HashesTo(hash []byte) bool { return bytes.Equal(b.Hash(), hash) } +// Size returns size of the block in bytes. +func (b *Block) Size() int { + bz, err := cdc.MarshalBinaryBare(b) + if err != nil { + return 0 + } + return len(bz) +} + // String returns a string representation of the block func (b *Block) String() string { return b.StringIndented("")