diff --git a/CHANGELOG_PENDING.md b/CHANGELOG_PENDING.md index f37904d1b..6c3ff0f95 100644 --- a/CHANGELOG_PENDING.md +++ b/CHANGELOG_PENDING.md @@ -22,5 +22,6 @@ ### FEATURES: ### IMPROVEMENTS: +- [p2p] \#3666 Add per channel telemetry to improve reactor observability ### BUG FIXES: diff --git a/docs/tendermint-core/metrics.md b/docs/tendermint-core/metrics.md index ad6d4c765..94313ddbb 100644 --- a/docs/tendermint-core/metrics.md +++ b/docs/tendermint-core/metrics.md @@ -14,34 +14,34 @@ Listen address can be changed in the config file (see The following metrics are available: -| **Name** | **Type** | **Since** | **Tags** | **Description** | -|-----------------------------------------|-----------|-----------|----------|-----------------------------------------------------------------| -| consensus\_height | Gauge | 0.21.0 | | Height of the chain | -| consensus\_validators | Gauge | 0.21.0 | | Number of validators | -| consensus\_validators\_power | Gauge | 0.21.0 | | Total voting power of all validators | -| consensus\_missing\_validators | Gauge | 0.21.0 | | Number of validators who did not sign | -| consensus\_missing\_validators\_power | Gauge | 0.21.0 | | Total voting power of the missing validators | -| consensus\_byzantine\_validators | Gauge | 0.21.0 | | Number of validators who tried to double sign | -| consensus\_byzantine\_validators\_power | Gauge | 0.21.0 | | Total voting power of the byzantine validators | -| consensus\_block\_interval\_seconds | Histogram | 0.21.0 | | Time between this and last block (Block.Header.Time) in seconds | -| consensus\_rounds | Gauge | 0.21.0 | | Number of rounds | -| consensus\_num\_txs | Gauge | 0.21.0 | | Number of transactions | -| consensus\_block\_parts | counter | on dev | peer\_id | number of blockparts transmitted by peer | -| consensus\_latest\_block\_height | gauge | on dev | | /status sync\_info number | -| consensus\_fast\_syncing | gauge | on dev | | either 0 (not fast syncing) or 1 (syncing) | -| 
consensus\_total\_txs | Gauge | 0.21.0 | | Total number of transactions committed | -| consensus\_block\_size\_bytes | Gauge | 0.21.0 | | Block size in bytes | -| p2p\_peers | Gauge | 0.21.0 | | Number of peers node's connected to | -| p2p\_peer\_receive\_bytes\_total | counter | on dev | peer\_id | number of bytes received from a given peer | -| p2p\_peer\_send\_bytes\_total | counter | on dev | peer\_id | number of bytes sent to a given peer | -| p2p\_peer\_pending\_send\_bytes | gauge | on dev | peer\_id | number of pending bytes to be sent to a given peer | -| p2p\_num\_txs | gauge | on dev | peer\_id | number of transactions submitted by each peer\_id | -| p2p\_pending\_send\_bytes | gauge | on dev | peer\_id | amount of data pending to be sent to peer | -| mempool\_size | Gauge | 0.21.0 | | Number of uncommitted transactions | -| mempool\_tx\_size\_bytes | histogram | on dev | | transaction sizes in bytes | -| mempool\_failed\_txs | counter | on dev | | number of failed transactions | -| mempool\_recheck\_times | counter | on dev | | number of transactions rechecked in the mempool | -| state\_block\_processing\_time | histogram | on dev | | time between BeginBlock and EndBlock in ms | +| **Name** | **Type** | **Since** | **Tags** | **Description** | +|-----------------------------------------|-----------|-----------|----------------|-----------------------------------------------------------------| +| consensus\_height | Gauge | 0.21.0 | | Height of the chain | +| consensus\_validators | Gauge | 0.21.0 | | Number of validators | +| consensus\_validators\_power | Gauge | 0.21.0 | | Total voting power of all validators | +| consensus\_missing\_validators | Gauge | 0.21.0 | | Number of validators who did not sign | +| consensus\_missing\_validators\_power | Gauge | 0.21.0 | | Total voting power of the missing validators | +| consensus\_byzantine\_validators | Gauge | 0.21.0 | | Number of validators who tried to double sign | +| 
consensus\_byzantine\_validators\_power | Gauge | 0.21.0 | | Total voting power of the byzantine validators | +| consensus\_block\_interval\_seconds | Histogram | 0.21.0 | | Time between this and last block (Block.Header.Time) in seconds | +| consensus\_rounds | Gauge | 0.21.0 | | Number of rounds | +| consensus\_num\_txs | Gauge | 0.21.0 | | Number of transactions | +| consensus\_block\_parts | counter | on dev | peer\_id | number of blockparts transmitted by peer | +| consensus\_latest\_block\_height | gauge | on dev | | /status sync\_info number | +| consensus\_fast\_syncing | gauge | on dev | | either 0 (not fast syncing) or 1 (syncing) | +| consensus\_total\_txs | Gauge | 0.21.0 | | Total number of transactions committed | +| consensus\_block\_size\_bytes | Gauge | 0.21.0 | | Block size in bytes | +| p2p\_peers | Gauge | 0.21.0 | | Number of peers node's connected to | +| p2p\_peer\_receive\_bytes\_total | counter | on dev | peer\_id, chID | number of bytes per channel received from a given peer | +| p2p\_peer\_send\_bytes\_total | counter | on dev | peer\_id, chID | number of bytes per channel sent to a given peer | +| p2p\_peer\_pending\_send\_bytes | gauge | on dev | peer\_id | number of pending bytes to be sent to a given peer | +| p2p\_num\_txs | gauge | on dev | peer\_id | number of transactions submitted by each peer\_id | +| p2p\_pending\_send\_bytes | gauge | on dev | peer\_id | amount of data pending to be sent to peer | +| mempool\_size | Gauge | 0.21.0 | | Number of uncommitted transactions | +| mempool\_tx\_size\_bytes | histogram | on dev | | transaction sizes in bytes | +| mempool\_failed\_txs | counter | on dev | | number of failed transactions | +| mempool\_recheck\_times | counter | on dev | | number of transactions rechecked in the mempool | +| state\_block\_processing\_time | histogram | on dev | | time between BeginBlock and EndBlock in ms | ## Useful queries diff --git a/p2p/metrics.go b/p2p/metrics.go index 3a6b9568a..675dd9c7c 100644 
--- a/p2p/metrics.go +++ b/p2p/metrics.go @@ -47,13 +47,13 @@ func PrometheusMetrics(namespace string, labelsAndValues ...string) *Metrics { Subsystem: MetricsSubsystem, Name: "peer_receive_bytes_total", Help: "Number of bytes received from a given peer.", - }, append(labels, "peer_id")).With(labelsAndValues...), + }, append(labels, "peer_id", "chID")).With(labelsAndValues...), PeerSendBytesTotal: prometheus.NewCounterFrom(stdprometheus.CounterOpts{ Namespace: namespace, Subsystem: MetricsSubsystem, Name: "peer_send_bytes_total", Help: "Number of bytes sent to a given peer.", - }, append(labels, "peer_id")).With(labelsAndValues...), + }, append(labels, "peer_id", "chID")).With(labelsAndValues...), PeerPendingSendBytes: prometheus.NewGaugeFrom(stdprometheus.GaugeOpts{ Namespace: namespace, Subsystem: MetricsSubsystem, diff --git a/p2p/peer.go b/p2p/peer.go index fab3b42d4..80be0db53 100644 --- a/p2p/peer.go +++ b/p2p/peer.go @@ -248,7 +248,11 @@ func (p *peer) Send(chID byte, msgBytes []byte) bool { } res := p.mconn.Send(chID, msgBytes) if res { - p.metrics.PeerSendBytesTotal.With("peer_id", string(p.ID())).Add(float64(len(msgBytes))) + labels := []string{ + "peer_id", string(p.ID()), + "chID", fmt.Sprintf("%#x", chID), + } + p.metrics.PeerSendBytesTotal.With(labels...).Add(float64(len(msgBytes))) } return res } @@ -263,7 +267,11 @@ func (p *peer) TrySend(chID byte, msgBytes []byte) bool { } res := p.mconn.TrySend(chID, msgBytes) if res { - p.metrics.PeerSendBytesTotal.With("peer_id", string(p.ID())).Add(float64(len(msgBytes))) + labels := []string{ + "peer_id", string(p.ID()), + "chID", fmt.Sprintf("%#x", chID), + } + p.metrics.PeerSendBytesTotal.With(labels...).Add(float64(len(msgBytes))) } return res } @@ -369,7 +377,11 @@ func createMConnection( // which does onPeerError. 
panic(fmt.Sprintf("Unknown channel %X", chID)) } - p.metrics.PeerReceiveBytesTotal.With("peer_id", string(p.ID())).Add(float64(len(msgBytes))) + labels := []string{ + "peer_id", string(p.ID()), + "chID", fmt.Sprintf("%#x", chID), + } + p.metrics.PeerReceiveBytesTotal.With(labels...).Add(float64(len(msgBytes))) reactor.Receive(chID, p, msgBytes) }