You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

270 lines
10 KiB

6 years ago
6 years ago
internal/proxy: add initial set of abci metrics (#7115) This PR adds an initial set of metrics for use ABCI. The initial metrics enable the calculation of timing histograms and call counts for each of the ABCI methods. The metrics are also labeled as either 'sync' or 'async' to determine if the method call was performed using ABCI's `*Async` methods. An example of these metrics is included here for reference: ``` tendermint_abci_connection_method_timing_bucket{chain_id="ci",method="commit",type="sync",le="0.0001"} 0 tendermint_abci_connection_method_timing_bucket{chain_id="ci",method="commit",type="sync",le="0.0004"} 5 tendermint_abci_connection_method_timing_bucket{chain_id="ci",method="commit",type="sync",le="0.002"} 12 tendermint_abci_connection_method_timing_bucket{chain_id="ci",method="commit",type="sync",le="0.009"} 13 tendermint_abci_connection_method_timing_bucket{chain_id="ci",method="commit",type="sync",le="0.02"} 13 tendermint_abci_connection_method_timing_bucket{chain_id="ci",method="commit",type="sync",le="0.1"} 13 tendermint_abci_connection_method_timing_bucket{chain_id="ci",method="commit",type="sync",le="0.65"} 13 tendermint_abci_connection_method_timing_bucket{chain_id="ci",method="commit",type="sync",le="2"} 13 tendermint_abci_connection_method_timing_bucket{chain_id="ci",method="commit",type="sync",le="6"} 13 tendermint_abci_connection_method_timing_bucket{chain_id="ci",method="commit",type="sync",le="25"} 13 tendermint_abci_connection_method_timing_bucket{chain_id="ci",method="commit",type="sync",le="+Inf"} 13 tendermint_abci_connection_method_timing_sum{chain_id="ci",method="commit",type="sync"} 0.007802058000000001 tendermint_abci_connection_method_timing_count{chain_id="ci",method="commit",type="sync"} 13 ``` These metrics can easily be graphed using prometheus's `histogram_quantile(...)` method to pick out a particular quantile to graph or examine. 
I chose buckets that roughly estimate the expected range of times for ABCI operations. They start at 0.0001 seconds and go up to 25 seconds. The hope is that this range captures enough of the possible timings to be useful for us and for operators.
3 years ago
3 years ago
internal/proxy: add initial set of abci metrics (#7115) This PR adds an initial set of metrics for use ABCI. The initial metrics enable the calculation of timing histograms and call counts for each of the ABCI methods. The metrics are also labeled as either 'sync' or 'async' to determine if the method call was performed using ABCI's `*Async` methods. An example of these metrics is included here for reference: ``` tendermint_abci_connection_method_timing_bucket{chain_id="ci",method="commit",type="sync",le="0.0001"} 0 tendermint_abci_connection_method_timing_bucket{chain_id="ci",method="commit",type="sync",le="0.0004"} 5 tendermint_abci_connection_method_timing_bucket{chain_id="ci",method="commit",type="sync",le="0.002"} 12 tendermint_abci_connection_method_timing_bucket{chain_id="ci",method="commit",type="sync",le="0.009"} 13 tendermint_abci_connection_method_timing_bucket{chain_id="ci",method="commit",type="sync",le="0.02"} 13 tendermint_abci_connection_method_timing_bucket{chain_id="ci",method="commit",type="sync",le="0.1"} 13 tendermint_abci_connection_method_timing_bucket{chain_id="ci",method="commit",type="sync",le="0.65"} 13 tendermint_abci_connection_method_timing_bucket{chain_id="ci",method="commit",type="sync",le="2"} 13 tendermint_abci_connection_method_timing_bucket{chain_id="ci",method="commit",type="sync",le="6"} 13 tendermint_abci_connection_method_timing_bucket{chain_id="ci",method="commit",type="sync",le="25"} 13 tendermint_abci_connection_method_timing_bucket{chain_id="ci",method="commit",type="sync",le="+Inf"} 13 tendermint_abci_connection_method_timing_sum{chain_id="ci",method="commit",type="sync"} 0.007802058000000001 tendermint_abci_connection_method_timing_count{chain_id="ci",method="commit",type="sync"} 13 ``` These metrics can easily be graphed using prometheus's `histogram_quantile(...)` method to pick out a particular quantile to graph or examine. 
I chose buckets that roughly estimate the expected range of times for ABCI operations. They start at 0.0001 seconds and go up to 25 seconds. The hope is that this range captures enough of the possible timings to be useful for us and for operators.
3 years ago
3 years ago
3 years ago
  1. package consensus
  2. import (
  3. "github.com/go-kit/kit/metrics"
  4. "github.com/go-kit/kit/metrics/discard"
  5. "github.com/tendermint/tendermint/types"
  6. prometheus "github.com/go-kit/kit/metrics/prometheus"
  7. stdprometheus "github.com/prometheus/client_golang/prometheus"
  8. )
  9. const (
  10. // MetricsSubsystem is a subsystem shared by all metrics exposed by this
  11. // package.
  12. MetricsSubsystem = "consensus"
  13. )
  14. // Metrics contains metrics exposed by this package.
  15. type Metrics struct {
  16. // Height of the chain.
  17. Height metrics.Gauge
  18. // ValidatorLastSignedHeight of a validator.
  19. ValidatorLastSignedHeight metrics.Gauge
  20. // Number of rounds.
  21. Rounds metrics.Gauge
  22. // Number of validators.
  23. Validators metrics.Gauge
  24. // Total power of all validators.
  25. ValidatorsPower metrics.Gauge
  26. // Power of a validator.
  27. ValidatorPower metrics.Gauge
  28. // Amount of blocks missed by a validator.
  29. ValidatorMissedBlocks metrics.Gauge
  30. // Number of validators who did not sign.
  31. MissingValidators metrics.Gauge
  32. // Total power of the missing validators.
  33. MissingValidatorsPower metrics.Gauge
  34. // Number of validators who tried to double sign.
  35. ByzantineValidators metrics.Gauge
  36. // Total power of the byzantine validators.
  37. ByzantineValidatorsPower metrics.Gauge
  38. // Time between this and the last block.
  39. BlockIntervalSeconds metrics.Histogram
  40. // Number of transactions.
  41. NumTxs metrics.Gauge
  42. // Size of the block.
  43. BlockSizeBytes metrics.Histogram
  44. // Total number of transactions.
  45. TotalTxs metrics.Gauge
  46. // The latest block height.
  47. CommittedHeight metrics.Gauge
  48. // Whether or not a node is block syncing. 1 if yes, 0 if no.
  49. BlockSyncing metrics.Gauge
  50. // Whether or not a node is state syncing. 1 if yes, 0 if no.
  51. StateSyncing metrics.Gauge
  52. // Number of blockparts transmitted by peer.
  53. BlockParts metrics.Counter
  54. // Histogram of time taken per step annotated with reason that the step proceeded.
  55. StepTime metrics.Histogram
  56. // QuroumPrevoteMessageDelay is the interval in seconds between the proposal
  57. // timestamp and the timestamp of the earliest prevote that achieved a quorum
  58. // during the prevote step.
  59. //
  60. // To compute it, sum the voting power over each prevote received, in increasing
  61. // order of timestamp. The timestamp of the first prevote to increase the sum to
  62. // be above 2/3 of the total voting power of the network defines the endpoint
  63. // the endpoint of the interval. Subtract the proposal timestamp from this endpoint
  64. // to obtain the quorum delay.
  65. QuorumPrevoteMessageDelay metrics.Gauge
  66. // FullPrevoteMessageDelay is the interval in seconds between the proposal
  67. // timestamp and the timestamp of the latest prevote in a round where 100%
  68. // of the voting power on the network issued prevotes.
  69. FullPrevoteMessageDelay metrics.Gauge
  70. }
  71. // PrometheusMetrics returns Metrics build using Prometheus client library.
  72. // Optionally, labels can be provided along with their values ("foo",
  73. // "fooValue").
  74. func PrometheusMetrics(namespace string, labelsAndValues ...string) *Metrics {
  75. labels := []string{}
  76. for i := 0; i < len(labelsAndValues); i += 2 {
  77. labels = append(labels, labelsAndValues[i])
  78. }
  79. return &Metrics{
  80. Height: prometheus.NewGaugeFrom(stdprometheus.GaugeOpts{
  81. Namespace: namespace,
  82. Subsystem: MetricsSubsystem,
  83. Name: "height",
  84. Help: "Height of the chain.",
  85. }, labels).With(labelsAndValues...),
  86. Rounds: prometheus.NewGaugeFrom(stdprometheus.GaugeOpts{
  87. Namespace: namespace,
  88. Subsystem: MetricsSubsystem,
  89. Name: "rounds",
  90. Help: "Number of rounds.",
  91. }, labels).With(labelsAndValues...),
  92. Validators: prometheus.NewGaugeFrom(stdprometheus.GaugeOpts{
  93. Namespace: namespace,
  94. Subsystem: MetricsSubsystem,
  95. Name: "validators",
  96. Help: "Number of validators.",
  97. }, labels).With(labelsAndValues...),
  98. ValidatorLastSignedHeight: prometheus.NewGaugeFrom(stdprometheus.GaugeOpts{
  99. Namespace: namespace,
  100. Subsystem: MetricsSubsystem,
  101. Name: "validator_last_signed_height",
  102. Help: "Last signed height for a validator",
  103. }, append(labels, "validator_address")).With(labelsAndValues...),
  104. ValidatorMissedBlocks: prometheus.NewGaugeFrom(stdprometheus.GaugeOpts{
  105. Namespace: namespace,
  106. Subsystem: MetricsSubsystem,
  107. Name: "validator_missed_blocks",
  108. Help: "Total missed blocks for a validator",
  109. }, append(labels, "validator_address")).With(labelsAndValues...),
  110. ValidatorsPower: prometheus.NewGaugeFrom(stdprometheus.GaugeOpts{
  111. Namespace: namespace,
  112. Subsystem: MetricsSubsystem,
  113. Name: "validators_power",
  114. Help: "Total power of all validators.",
  115. }, labels).With(labelsAndValues...),
  116. ValidatorPower: prometheus.NewGaugeFrom(stdprometheus.GaugeOpts{
  117. Namespace: namespace,
  118. Subsystem: MetricsSubsystem,
  119. Name: "validator_power",
  120. Help: "Power of a validator",
  121. }, append(labels, "validator_address")).With(labelsAndValues...),
  122. MissingValidators: prometheus.NewGaugeFrom(stdprometheus.GaugeOpts{
  123. Namespace: namespace,
  124. Subsystem: MetricsSubsystem,
  125. Name: "missing_validators",
  126. Help: "Number of validators who did not sign.",
  127. }, labels).With(labelsAndValues...),
  128. MissingValidatorsPower: prometheus.NewGaugeFrom(stdprometheus.GaugeOpts{
  129. Namespace: namespace,
  130. Subsystem: MetricsSubsystem,
  131. Name: "missing_validators_power",
  132. Help: "Total power of the missing validators.",
  133. }, labels).With(labelsAndValues...),
  134. ByzantineValidators: prometheus.NewGaugeFrom(stdprometheus.GaugeOpts{
  135. Namespace: namespace,
  136. Subsystem: MetricsSubsystem,
  137. Name: "byzantine_validators",
  138. Help: "Number of validators who tried to double sign.",
  139. }, labels).With(labelsAndValues...),
  140. ByzantineValidatorsPower: prometheus.NewGaugeFrom(stdprometheus.GaugeOpts{
  141. Namespace: namespace,
  142. Subsystem: MetricsSubsystem,
  143. Name: "byzantine_validators_power",
  144. Help: "Total power of the byzantine validators.",
  145. }, labels).With(labelsAndValues...),
  146. BlockIntervalSeconds: prometheus.NewHistogramFrom(stdprometheus.HistogramOpts{
  147. Namespace: namespace,
  148. Subsystem: MetricsSubsystem,
  149. Name: "block_interval_seconds",
  150. Help: "Time between this and the last block.",
  151. }, labels).With(labelsAndValues...),
  152. NumTxs: prometheus.NewGaugeFrom(stdprometheus.GaugeOpts{
  153. Namespace: namespace,
  154. Subsystem: MetricsSubsystem,
  155. Name: "num_txs",
  156. Help: "Number of transactions.",
  157. }, labels).With(labelsAndValues...),
  158. BlockSizeBytes: prometheus.NewHistogramFrom(stdprometheus.HistogramOpts{
  159. Namespace: namespace,
  160. Subsystem: MetricsSubsystem,
  161. Name: "block_size_bytes",
  162. Help: "Size of the block.",
  163. }, labels).With(labelsAndValues...),
  164. TotalTxs: prometheus.NewGaugeFrom(stdprometheus.GaugeOpts{
  165. Namespace: namespace,
  166. Subsystem: MetricsSubsystem,
  167. Name: "total_txs",
  168. Help: "Total number of transactions.",
  169. }, labels).With(labelsAndValues...),
  170. CommittedHeight: prometheus.NewGaugeFrom(stdprometheus.GaugeOpts{
  171. Namespace: namespace,
  172. Subsystem: MetricsSubsystem,
  173. Name: "latest_block_height",
  174. Help: "The latest block height.",
  175. }, labels).With(labelsAndValues...),
  176. BlockSyncing: prometheus.NewGaugeFrom(stdprometheus.GaugeOpts{
  177. Namespace: namespace,
  178. Subsystem: MetricsSubsystem,
  179. Name: "block_syncing",
  180. Help: "Whether or not a node is block syncing. 1 if yes, 0 if no.",
  181. }, labels).With(labelsAndValues...),
  182. StateSyncing: prometheus.NewGaugeFrom(stdprometheus.GaugeOpts{
  183. Namespace: namespace,
  184. Subsystem: MetricsSubsystem,
  185. Name: "state_syncing",
  186. Help: "Whether or not a node is state syncing. 1 if yes, 0 if no.",
  187. }, labels).With(labelsAndValues...),
  188. BlockParts: prometheus.NewCounterFrom(stdprometheus.CounterOpts{
  189. Namespace: namespace,
  190. Subsystem: MetricsSubsystem,
  191. Name: "block_parts",
  192. Help: "Number of blockparts transmitted by peer.",
  193. }, append(labels, "peer_id")).With(labelsAndValues...),
  194. StepTime: prometheus.NewHistogramFrom(stdprometheus.HistogramOpts{
  195. Namespace: namespace,
  196. Subsystem: MetricsSubsystem,
  197. Name: "step_time",
  198. Help: "Time spent per step.",
  199. }, append(labels, "step", "reason")).With(labelsAndValues...),
  200. QuorumPrevoteMessageDelay: prometheus.NewGaugeFrom(stdprometheus.GaugeOpts{
  201. Namespace: namespace,
  202. Subsystem: MetricsSubsystem,
  203. Name: "quorum_prevote_message_delay",
  204. Help: "Difference in seconds between the proposal timestamp and the timestamp " +
  205. "of the latest prevote that achieved a quorum in the prevote step.",
  206. }, labels).With(labelsAndValues...),
  207. FullPrevoteMessageDelay: prometheus.NewGaugeFrom(stdprometheus.GaugeOpts{
  208. Namespace: namespace,
  209. Subsystem: MetricsSubsystem,
  210. Name: "full_prevote_message_delay",
  211. Help: "Difference in seconds between the proposal timestamp and the timestamp " +
  212. "of the latest prevote that achieved 100% of the voting power in the prevote step.",
  213. }, labels).With(labelsAndValues...),
  214. }
  215. }
  216. // NopMetrics returns no-op Metrics.
  217. func NopMetrics() *Metrics {
  218. return &Metrics{
  219. Height: discard.NewGauge(),
  220. ValidatorLastSignedHeight: discard.NewGauge(),
  221. Rounds: discard.NewGauge(),
  222. Validators: discard.NewGauge(),
  223. ValidatorsPower: discard.NewGauge(),
  224. ValidatorPower: discard.NewGauge(),
  225. ValidatorMissedBlocks: discard.NewGauge(),
  226. MissingValidators: discard.NewGauge(),
  227. MissingValidatorsPower: discard.NewGauge(),
  228. ByzantineValidators: discard.NewGauge(),
  229. ByzantineValidatorsPower: discard.NewGauge(),
  230. BlockIntervalSeconds: discard.NewHistogram(),
  231. NumTxs: discard.NewGauge(),
  232. BlockSizeBytes: discard.NewHistogram(),
  233. TotalTxs: discard.NewGauge(),
  234. CommittedHeight: discard.NewGauge(),
  235. BlockSyncing: discard.NewGauge(),
  236. StateSyncing: discard.NewGauge(),
  237. BlockParts: discard.NewCounter(),
  238. QuorumPrevoteMessageDelay: discard.NewGauge(),
  239. FullPrevoteMessageDelay: discard.NewGauge(),
  240. }
  241. }
  242. // RecordConsMetrics uses for recording the block related metrics during fast-sync.
  243. func (m *Metrics) RecordConsMetrics(block *types.Block) {
  244. m.NumTxs.Set(float64(len(block.Data.Txs)))
  245. m.TotalTxs.Add(float64(len(block.Data.Txs)))
  246. m.BlockSizeBytes.Observe(float64(block.Size()))
  247. m.CommittedHeight.Set(float64(block.Height))
  248. }