From 92343ef484374b6010a87e8c3c8de9fb77e0ef7a Mon Sep 17 00:00:00 2001 From: Matthew Slipper Date: Wed, 10 Oct 2018 09:27:43 -0700 Subject: [PATCH] Add additional metrics (#2500) * Add additional metrics Continues addressing https://github.com/cosmos/cosmos-sdk/issues/2169. * Add nop metrics to fix NPE * Tweak buckets, code review * Update buckets * Update docs with new metrics * Code review updates --- consensus/metrics.go | 10 +++++++ consensus/reactor.go | 4 +-- docs/tendermint-core/metrics.md | 50 ++++++++++++++++++------------- mempool/mempool.go | 4 ++- mempool/metrics.go | 36 ++++++++++++++++++++-- node/node.go | 26 ++++++++++------ p2p/metrics.go | 12 +++++++- state/execution.go | 53 +++++++++++++++++++++++++++------ state/metrics.go | 33 ++++++++++++++++++++ 9 files changed, 183 insertions(+), 45 deletions(-) create mode 100644 state/metrics.go diff --git a/consensus/metrics.go b/consensus/metrics.go index 39bfd24bd..7b4a3fbc9 100644 --- a/consensus/metrics.go +++ b/consensus/metrics.go @@ -44,6 +44,9 @@ type Metrics struct { CommittedHeight metrics.Gauge // Whether or not a node is fast syncing. 1 if yes, 0 if no. FastSyncing metrics.Gauge + + // Number of blockparts transmitted by peer. + BlockParts metrics.Counter } // PrometheusMetrics returns Metrics build using Prometheus client library. @@ -136,6 +139,12 @@ func PrometheusMetrics(namespace string) *Metrics { Name: "fast_syncing", Help: "Whether or not a node is fast syncing. 1 if yes, 0 if no.", }, []string{}), + BlockParts: prometheus.NewCounterFrom(stdprometheus.CounterOpts{ + Namespace: namespace, + Subsystem: MetricsSubsystem, + Name: "block_parts", + Help: "Number of blockparts transmitted by peer.", + }, []string{"peer_id"}), } } @@ -160,5 +169,6 @@ func NopMetrics() *Metrics { TotalTxs: discard.NewGauge(), CommittedHeight: discard.NewGauge(), FastSyncing: discard.NewGauge(), + BlockParts: discard.NewCounter(), } } diff --git a/consensus/reactor.go b/consensus/reactor.go index 376b8eda9..ca63e8992 100644 --- a/consensus/reactor.go +++ b/consensus/reactor.go @@ -8,7 +8,7 @@ import ( "github.com/pkg/errors" - amino "github.com/tendermint/go-amino" + "github.com/tendermint/go-amino" cstypes "github.com/tendermint/tendermint/consensus/types" cmn "github.com/tendermint/tendermint/libs/common" @@ -274,7 +274,7 @@ func (conR *ConsensusReactor) Receive(chID byte, src p2p.Peer, msgBytes []byte) ps.ApplyProposalPOLMessage(msg) case *BlockPartMessage: ps.SetHasProposalBlockPart(msg.Height, msg.Round, msg.Part.Index) - + conR.metrics.BlockParts.With("peer_id", string(src.ID())).Add(1) conR.conS.peerMsgQueue <- msgInfo{msg, src.ID()} default: conR.Logger.Error(fmt.Sprintf("Unknown message type %v", reflect.TypeOf(msg))) diff --git a/docs/tendermint-core/metrics.md b/docs/tendermint-core/metrics.md index b469c6890..ad6d4c765 100644 --- a/docs/tendermint-core/metrics.md +++ b/docs/tendermint-core/metrics.md @@ -8,35 +8,45 @@ This functionality is disabled by default. To enable the Prometheus metrics, set `instrumentation.prometheus=true` if your config file. Metrics will be served under `/metrics` on 26660 port by default. Listen address can be changed in the config file (see -`instrumentation.prometheus_listen_addr`). +`instrumentation.prometheus\_listen\_addr`). ## List of available metrics The following metrics are available: -``` -| Name | Type | Since | Description | -| --------------------------------------- | ------- | --------- | ----------------------------------------------------------------------------- | -| consensus_height | Gauge | 0.21.0 | Height of the chain | -| consensus_validators | Gauge | 0.21.0 | Number of validators | -| consensus_validators_power | Gauge | 0.21.0 | Total voting power of all validators | -| consensus_missing_validators | Gauge | 0.21.0 | Number of validators who did not sign | -| consensus_missing_validators_power | Gauge | 0.21.0 | Total voting power of the missing validators | -| consensus_byzantine_validators | Gauge | 0.21.0 | Number of validators who tried to double sign | -| consensus_byzantine_validators_power | Gauge | 0.21.0 | Total voting power of the byzantine validators | -| consensus_block_interval_seconds | Histogram | 0.21.0 | Time between this and last block (Block.Header.Time) in seconds | -| consensus_rounds | Gauge | 0.21.0 | Number of rounds | -| consensus_num_txs | Gauge | 0.21.0 | Number of transactions | -| mempool_size | Gauge | 0.21.0 | Number of uncommitted transactions | -| consensus_total_txs | Gauge | 0.21.0 | Total number of transactions committed | -| consensus_block_size_bytes | Gauge | 0.21.0 | Block size in bytes | -| p2p_peers | Gauge | 0.21.0 | Number of peers node's connected to | -``` +| **Name** | **Type** | **Since** | **Tags** | **Description** | +|-----------------------------------------|-----------|-----------|----------|-----------------------------------------------------------------| +| consensus\_height | Gauge | 0.21.0 | | Height of the chain | +| consensus\_validators | Gauge | 0.21.0 | | Number of validators | +| consensus\_validators\_power | Gauge | 0.21.0 | | Total voting power of all validators | +| consensus\_missing\_validators | Gauge | 0.21.0 | | Number of validators who did not sign | +| consensus\_missing\_validators\_power | Gauge | 0.21.0 | | Total voting power of the missing validators | +| consensus\_byzantine\_validators | Gauge | 0.21.0 | | Number of validators who tried to double sign | +| consensus\_byzantine\_validators\_power | Gauge | 0.21.0 | | Total voting power of the byzantine validators | +| consensus\_block\_interval\_seconds | Histogram | 0.21.0 | | Time between this and last block (Block.Header.Time) in seconds | +| consensus\_rounds | Gauge | 0.21.0 | | Number of rounds | +| consensus\_num\_txs | Gauge | 0.21.0 | | Number of transactions | +| consensus\_block\_parts | counter | on dev | peer\_id | number of blockparts transmitted by peer | +| consensus\_latest\_block\_height | gauge | on dev | | /status sync\_info number | +| consensus\_fast\_syncing | gauge | on dev | | either 0 (not fast syncing) or 1 (syncing) | +| consensus\_total\_txs | Gauge | 0.21.0 | | Total number of transactions committed | +| consensus\_block\_size\_bytes | Gauge | 0.21.0 | | Block size in bytes | +| p2p\_peers | Gauge | 0.21.0 | | Number of peers node's connected to | +| p2p\_peer\_receive\_bytes\_total | counter | on dev | peer\_id | number of bytes received from a given peer | +| p2p\_peer\_send\_bytes\_total | counter | on dev | peer\_id | number of bytes sent to a given peer | +| p2p\_peer\_pending\_send\_bytes | gauge | on dev | peer\_id | number of pending bytes to be sent to a given peer | +| p2p\_num\_txs | gauge | on dev | peer\_id | number of transactions submitted by each peer\_id | +| p2p\_pending\_send\_bytes | gauge | on dev | peer\_id | amount of data pending to be sent to peer | +| mempool\_size | Gauge | 0.21.0 | | Number of uncommitted transactions | +| mempool\_tx\_size\_bytes | histogram | on dev | | transaction sizes in bytes | +| mempool\_failed\_txs | counter | on dev | | number of failed transactions | +| mempool\_recheck\_times | counter | on dev | | number of transactions rechecked in the mempool | +| state\_block\_processing\_time | histogram | on dev | | time between BeginBlock and EndBlock in ms | ## Useful queries Percentage of missing + byzantine validators: ``` -((consensus_byzantine_validators_power + consensus_missing_validators_power) / consensus_validators_power) * 100 +((consensus\_byzantine\_validators\_power + consensus\_missing\_validators\_power) / consensus\_validators\_power) * 100 ``` diff --git a/mempool/mempool.go b/mempool/mempool.go index db5f6160c..65cd55354 100644 --- a/mempool/mempool.go +++ b/mempool/mempool.go @@ -326,6 +326,7 @@ func (mem *Mempool) resCb(req *abci.Request, res *abci.Response) { if mem.recheckCursor == nil { mem.resCbNormal(req, res) } else { + mem.metrics.RecheckTimes.Add(1) mem.resCbRecheck(req, res) } mem.metrics.Size.Set(float64(mem.Size())) @@ -346,11 +347,12 @@ func (mem *Mempool) resCbNormal(req *abci.Request, res *abci.Response) { } mem.txs.PushBack(memTx) mem.logger.Info("Added good transaction", "tx", TxID(tx), "res", r, "total", mem.Size()) + mem.metrics.TxSizeBytes.Observe(float64(len(tx))) mem.notifyTxsAvailable() } else { // ignore bad transaction mem.logger.Info("Rejected bad transaction", "tx", TxID(tx), "res", r) - + mem.metrics.FailedTxs.Add(1) // remove from cache (it might be good later) mem.cache.Remove(tx) } diff --git a/mempool/metrics.go b/mempool/metrics.go index fc4bb4fbe..3418f1efe 100644 --- a/mempool/metrics.go +++ b/mempool/metrics.go @@ -3,15 +3,23 @@ package mempool import ( "github.com/go-kit/kit/metrics" "github.com/go-kit/kit/metrics/discard" - prometheus "github.com/go-kit/kit/metrics/prometheus" + "github.com/go-kit/kit/metrics/prometheus" stdprometheus "github.com/prometheus/client_golang/prometheus" ) +const MetricsSubsytem = "mempool" + // Metrics contains metrics exposed by this package. // see MetricsProvider for descriptions. type Metrics struct { // Size of the mempool. Size metrics.Gauge + // Histogram of transaction sizes, in bytes. + TxSizeBytes metrics.Histogram + // Number of failed transactions. + FailedTxs metrics.Counter + // Number of times transactions are rechecked in the mempool. + RecheckTimes metrics.Counter } // PrometheusMetrics returns Metrics build using Prometheus client library. @@ -19,16 +27,38 @@ func PrometheusMetrics(namespace string) *Metrics { return &Metrics{ Size: prometheus.NewGaugeFrom(stdprometheus.GaugeOpts{ Namespace: namespace, - Subsystem: "mempool", + Subsystem: MetricsSubsytem, Name: "size", Help: "Size of the mempool (number of uncommitted transactions).", }, []string{}), + TxSizeBytes: prometheus.NewHistogramFrom(stdprometheus.HistogramOpts{ + Namespace: namespace, + Subsystem: MetricsSubsytem, + Name: "tx_size_bytes", + Help: "Transaction sizes in bytes.", + Buckets: stdprometheus.ExponentialBuckets(1, 3, 17), + }, []string{}), + FailedTxs: prometheus.NewCounterFrom(stdprometheus.CounterOpts{ + Namespace: namespace, + Subsystem: MetricsSubsytem, + Name: "failed_txs", + Help: "Number of failed transactions.", + }, []string{}), + RecheckTimes: prometheus.NewCounterFrom(stdprometheus.CounterOpts{ + Namespace: namespace, + Subsystem: MetricsSubsytem, + Name: "recheck_times", + Help: "Number of times transactions are rechecked in the mempool.", + }, []string{}), } } // NopMetrics returns no-op Metrics. func NopMetrics() *Metrics { return &Metrics{ - Size: discard.NewGauge(), + Size: discard.NewGauge(), + TxSizeBytes: discard.NewHistogram(), + FailedTxs: discard.NewCounter(), + RecheckTimes: discard.NewCounter(), } } diff --git a/node/node.go b/node/node.go index 9f9e3636f..d1ab0f86a 100644 --- a/node/node.go +++ b/node/node.go @@ -13,7 +13,7 @@ import ( "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/promhttp" - amino "github.com/tendermint/go-amino" + "github.com/tendermint/go-amino" abci "github.com/tendermint/tendermint/abci/types" bc "github.com/tendermint/tendermint/blockchain" @@ -32,8 +32,8 @@ import ( rpccore "github.com/tendermint/tendermint/rpc/core" ctypes "github.com/tendermint/tendermint/rpc/core/types" grpccore "github.com/tendermint/tendermint/rpc/grpc" - rpc "github.com/tendermint/tendermint/rpc/lib" - rpcserver "github.com/tendermint/tendermint/rpc/lib/server" + "github.com/tendermint/tendermint/rpc/lib" + "github.com/tendermint/tendermint/rpc/lib/server" sm "github.com/tendermint/tendermint/state" "github.com/tendermint/tendermint/state/txindex" "github.com/tendermint/tendermint/state/txindex/kv" @@ -98,16 +98,17 @@ func DefaultNewNode(config *cfg.Config, logger log.Logger) (*Node, error) { } // MetricsProvider returns a consensus, p2p and mempool Metrics. -type MetricsProvider func() (*cs.Metrics, *p2p.Metrics, *mempl.Metrics) +type MetricsProvider func() (*cs.Metrics, *p2p.Metrics, *mempl.Metrics, *sm.Metrics) // DefaultMetricsProvider returns Metrics build using Prometheus client library // if Prometheus is enabled. Otherwise, it returns no-op Metrics. func DefaultMetricsProvider(config *cfg.InstrumentationConfig) MetricsProvider { - return func() (*cs.Metrics, *p2p.Metrics, *mempl.Metrics) { + return func() (*cs.Metrics, *p2p.Metrics, *mempl.Metrics, *sm.Metrics) { if config.Prometheus { - return cs.PrometheusMetrics(config.Namespace), p2p.PrometheusMetrics(config.Namespace), mempl.PrometheusMetrics(config.Namespace) + return cs.PrometheusMetrics(config.Namespace), p2p.PrometheusMetrics(config.Namespace), + mempl.PrometheusMetrics(config.Namespace), sm.PrometheusMetrics(config.Namespace) } - return cs.NopMetrics(), p2p.NopMetrics(), mempl.NopMetrics() + return cs.NopMetrics(), p2p.NopMetrics(), mempl.NopMetrics(), sm.NopMetrics() } } @@ -245,7 +246,7 @@ func NewNode(config *cfg.Config, consensusLogger.Info("This node is not a validator", "addr", privValidator.GetAddress(), "pubKey", privValidator.GetPubKey()) } - csMetrics, p2pMetrics, memplMetrics := metricsProvider() + csMetrics, p2pMetrics, memplMetrics, smMetrics := metricsProvider() // Make MempoolReactor mempool := mempl.NewMempool( @@ -289,7 +290,14 @@ func NewNode(config *cfg.Config, blockExecLogger := logger.With("module", "state") // make block executor for consensus and blockchain reactors to execute blocks - blockExec := sm.NewBlockExecutor(stateDB, blockExecLogger, proxyApp.Consensus(), mempool, evidencePool) + blockExec := sm.NewBlockExecutor( + stateDB, + blockExecLogger, + proxyApp.Consensus(), + mempool, + evidencePool, + sm.BlockExecutorWithMetrics(smMetrics), + ) // Make BlockchainReactor bcReactor := bc.NewBlockchainReactor(state.Copy(), blockExec, blockStore, fastSync) diff --git a/p2p/metrics.go b/p2p/metrics.go index 94794dfd9..86a205056 100644 --- a/p2p/metrics.go +++ b/p2p/metrics.go @@ -3,7 +3,7 @@ package p2p import ( "github.com/go-kit/kit/metrics" "github.com/go-kit/kit/metrics/discard" - prometheus "github.com/go-kit/kit/metrics/prometheus" + "github.com/go-kit/kit/metrics/prometheus" stdprometheus "github.com/prometheus/client_golang/prometheus" ) @@ -19,6 +19,8 @@ type Metrics struct { PeerSendBytesTotal metrics.Counter // Pending bytes to be sent to a given peer. PeerPendingSendBytes metrics.Gauge + // Number of transactions submitted by each peer. + NumTxs metrics.Gauge } // PrometheusMetrics returns Metrics build using Prometheus client library. @@ -48,6 +50,13 @@ func PrometheusMetrics(namespace string) *Metrics { Name: "peer_pending_send_bytes", Help: "Number of pending bytes to be sent to a given peer.", }, []string{"peer_id"}), + NumTxs: prometheus.NewGaugeFrom(stdprometheus.GaugeOpts{ + Namespace: namespace, + Subsystem: MetricsSubsystem, + Name: "num_txs", + Help: "Number of transactions submitted by each peer.", + }, []string{"peer_id"}), + } } @@ -58,5 +67,6 @@ func NopMetrics() *Metrics { PeerReceiveBytesTotal: discard.NewCounter(), PeerSendBytesTotal: discard.NewCounter(), PeerPendingSendBytes: discard.NewGauge(), + NumTxs: discard.NewGauge(), } } diff --git a/state/execution.go b/state/execution.go index c6d5ce0a1..d5a1a1617 100644 --- a/state/execution.go +++ b/state/execution.go @@ -2,8 +2,9 @@ package state import ( "fmt" + "time" - fail "github.com/ebuchman/fail-test" + "github.com/ebuchman/fail-test" abci "github.com/tendermint/tendermint/abci/types" dbm "github.com/tendermint/tendermint/libs/db" "github.com/tendermint/tendermint/libs/log" @@ -33,20 +34,37 @@ type BlockExecutor struct { evpool EvidencePool logger log.Logger + + metrics *Metrics +} + +type BlockExecutorOption func(executor *BlockExecutor) + +func BlockExecutorWithMetrics(metrics *Metrics) BlockExecutorOption { + return func(blockExec *BlockExecutor) { + blockExec.metrics = metrics + } } // NewBlockExecutor returns a new BlockExecutor with a NopEventBus. // Call SetEventBus to provide one. func NewBlockExecutor(db dbm.DB, logger log.Logger, proxyApp proxy.AppConnConsensus, - mempool Mempool, evpool EvidencePool) *BlockExecutor { - return &BlockExecutor{ + mempool Mempool, evpool EvidencePool, options ...BlockExecutorOption) *BlockExecutor { + res := &BlockExecutor{ db: db, proxyApp: proxyApp, eventBus: types.NopEventBus{}, mempool: mempool, evpool: evpool, logger: logger, + metrics: NopMetrics(), } + + for _, option := range options { + option(res) + } + + return res } // SetEventBus - sets the event bus for publishing block related events. @@ -74,7 +92,10 @@ func (blockExec *BlockExecutor) ApplyBlock(state State, blockID types.BlockID, b return state, ErrInvalidBlock(err) } + startTime := time.Now().UnixNano() abciResponses, err := execBlockOnProxyApp(blockExec.logger, blockExec.proxyApp, block, state.LastValidators, blockExec.db) + endTime := time.Now().UnixNano() + blockExec.metrics.BlockProcessingTime.Observe(float64(endTime - startTime) / 1000000) if err != nil { return state, ErrProxyAppConn(err) } @@ -176,8 +197,13 @@ func (blockExec *BlockExecutor) Commit( // Executes block's transactions on proxyAppConn. // Returns a list of transaction results and updates to the validator set -func execBlockOnProxyApp(logger log.Logger, proxyAppConn proxy.AppConnConsensus, - block *types.Block, lastValSet *types.ValidatorSet, stateDB dbm.DB) (*ABCIResponses, error) { +func execBlockOnProxyApp( + logger log.Logger, + proxyAppConn proxy.AppConnConsensus, + block *types.Block, + lastValSet *types.ValidatorSet, + stateDB dbm.DB, +) (*ABCIResponses, error) { var validTxs, invalidTxs = 0, 0 txIndex := 0 @@ -333,8 +359,12 @@ func updateValidators(currentSet *types.ValidatorSet, abciUpdates []abci.Validat } // updateState returns a new State updated according to the header and responses. -func updateState(state State, blockID types.BlockID, header *types.Header, - abciResponses *ABCIResponses) (State, error) { +func updateState( + state State, + blockID types.BlockID, + header *types.Header, + abciResponses *ABCIResponses, +) (State, error) { // Copy the valset so we can apply changes from EndBlock // and update s.LastValidators and s.Validators. @@ -417,8 +447,13 @@ func fireEvents(logger log.Logger, eventBus types.BlockEventPublisher, block *ty // ExecCommitBlock executes and commits a block on the proxyApp without validating or mutating the state. // It returns the application root hash (result of abci.Commit). -func ExecCommitBlock(appConnConsensus proxy.AppConnConsensus, block *types.Block, - logger log.Logger, lastValSet *types.ValidatorSet, stateDB dbm.DB) ([]byte, error) { +func ExecCommitBlock( + appConnConsensus proxy.AppConnConsensus, + block *types.Block, + logger log.Logger, + lastValSet *types.ValidatorSet, + stateDB dbm.DB, +) ([]byte, error) { _, err := execBlockOnProxyApp(logger, appConnConsensus, block, lastValSet, stateDB) if err != nil { logger.Error("Error executing block on proxy app", "height", block.Height, "err", err) diff --git a/state/metrics.go b/state/metrics.go new file mode 100644 index 000000000..7acbafa30 --- /dev/null +++ b/state/metrics.go @@ -0,0 +1,33 @@ +package state + +import ( + "github.com/go-kit/kit/metrics" + "github.com/go-kit/kit/metrics/prometheus" + stdprometheus "github.com/prometheus/client_golang/prometheus" + "github.com/go-kit/kit/metrics/discard" +) + +const MetricsSubsystem = "state" + +type Metrics struct { + // Time between BeginBlock and EndBlock. + BlockProcessingTime metrics.Histogram +} + +func PrometheusMetrics(namespace string) *Metrics { + return &Metrics{ + BlockProcessingTime: prometheus.NewHistogramFrom(stdprometheus.HistogramOpts{ + Namespace: namespace, + Subsystem: MetricsSubsystem, + Name: "block_processing_time", + Help: "Time between BeginBlock and EndBlock in ms.", + Buckets: stdprometheus.LinearBuckets(1, 10, 10), + }, []string{}), + } +} + +func NopMetrics() *Metrics { + return &Metrics{ + BlockProcessingTime: discard.NewHistogram(), + } +}