From 165cc29474a83b480f1209694e5398dac35c81e5 Mon Sep 17 00:00:00 2001 From: William Banfield <4561443+williambanfield@users.noreply.github.com> Date: Fri, 18 Feb 2022 13:24:25 -0500 Subject: [PATCH] consensus: additional timing metrics (#7849) This change introduces an additional set of metrics aimed at helping operators understand the timing for consensus. This change adds the following metrics: ``` # HELP tendermint_consensus_round_duration Time spent in a round # TYPE tendermint_consensus_round_duration histogram tendermint_consensus_round_duration_bucket{chain_id="test-chain-IrF74Y",le="0.1"} 29 tendermint_consensus_round_duration_bucket{chain_id="test-chain-IrF74Y",le="0.2682695795279726"} 29 tendermint_consensus_round_duration_bucket{chain_id="test-chain-IrF74Y",le="0.7196856730011522"} 29 tendermint_consensus_round_duration_bucket{chain_id="test-chain-IrF74Y",le="1.9306977288832508"} 29 tendermint_consensus_round_duration_bucket{chain_id="test-chain-IrF74Y",le="5.1794746792312125"} 29 tendermint_consensus_round_duration_bucket{chain_id="test-chain-IrF74Y",le="13.894954943731381"} 29 tendermint_consensus_round_duration_bucket{chain_id="test-chain-IrF74Y",le="37.27593720314942"} 29 tendermint_consensus_round_duration_bucket{chain_id="test-chain-IrF74Y",le="100.00000000000006"} 29 tendermint_consensus_round_duration_bucket{chain_id="test-chain-IrF74Y",le="+Inf"} 29 tendermint_consensus_round_duration_sum{chain_id="test-chain-IrF74Y"} 0.028651869999999996 tendermint_consensus_round_duration_count{chain_id="test-chain-IrF74Y"} 29 ``` ``` # HELP tendermint_consensus_step_duration Time spent per step. # TYPE tendermint_consensus_step_duration histogram tendermint_consensus_step_duration_bucket{chain_id="test-chain-IrF74Y",step="Commit",le="0.1"} 29 tendermint_consensus_step_duration_bucket{chain_id="test-chain-IrF74Y",step="Commit",le="0.2682695795279726"} 29 tendermint_consensus_step_duration_bucket{chain_id="test-chain-IrF74Y",step="Commit",le="0.7196856730011522"} 29 tendermint_consensus_step_duration_bucket{chain_id="test-chain-IrF74Y",step="Commit",le="1.9306977288832508"} 29 tendermint_consensus_step_duration_bucket{chain_id="test-chain-IrF74Y",step="Commit",le="5.1794746792312125"} 29 tendermint_consensus_step_duration_bucket{chain_id="test-chain-IrF74Y",step="Commit",le="13.894954943731381"} 29 tendermint_consensus_step_duration_bucket{chain_id="test-chain-IrF74Y",step="Commit",le="37.27593720314942"} 29 tendermint_consensus_step_duration_bucket{chain_id="test-chain-IrF74Y",step="Commit",le="100.00000000000006"} 29 tendermint_consensus_step_duration_bucket{chain_id="test-chain-IrF74Y",step="Commit",le="+Inf"} 29 tendermint_consensus_step_duration_sum{chain_id="test-chain-IrF74Y",step="Commit"} 0.26650875 tendermint_consensus_step_duration_count{chain_id="test-chain-IrF74Y",step="Commit"} 29 tendermint_consensus_step_duration_bucket{chain_id="test-chain-IrF74Y",step="NewHeight",le="0.1"} 0 tendermint_consensus_step_duration_bucket{chain_id="test-chain-IrF74Y",step="NewHeight",le="0.2682695795279726"} 0 tendermint_consensus_step_duration_bucket{chain_id="test-chain-IrF74Y",step="NewHeight",le="0.7196856730011522"} 0 tendermint_consensus_step_duration_bucket{chain_id="test-chain-IrF74Y",step="NewHeight",le="1.9306977288832508"} 28 tendermint_consensus_step_duration_bucket{chain_id="test-chain-IrF74Y",step="NewHeight",le="5.1794746792312125"} 28 tendermint_consensus_step_duration_bucket{chain_id="test-chain-IrF74Y",step="NewHeight",le="13.894954943731381"} 28 tendermint_consensus_step_duration_bucket{chain_id="test-chain-IrF74Y",step="NewHeight",le="37.27593720314942"} 28 tendermint_consensus_step_duration_bucket{chain_id="test-chain-IrF74Y",step="NewHeight",le="100.00000000000006"} 28 tendermint_consensus_step_duration_bucket{chain_id="test-chain-IrF74Y",step="NewHeight",le="+Inf"} 28 tendermint_consensus_step_duration_sum{chain_id="test-chain-IrF74Y",step="NewHeight"} 27.773921702 tendermint_consensus_step_duration_count{chain_id="test-chain-IrF74Y",step="NewHeight"} 28 tendermint_consensus_step_duration_bucket{chain_id="test-chain-IrF74Y",step="NewRound",le="0.1"} 29 tendermint_consensus_step_duration_bucket{chain_id="test-chain-IrF74Y",step="NewRound",le="0.2682695795279726"} 29 tendermint_consensus_step_duration_bucket{chain_id="test-chain-IrF74Y",step="NewRound",le="0.7196856730011522"} 29 tendermint_consensus_step_duration_bucket{chain_id="test-chain-IrF74Y",step="NewRound",le="1.9306977288832508"} 29 tendermint_consensus_step_duration_bucket{chain_id="test-chain-IrF74Y",step="NewRound",le="5.1794746792312125"} 29 tendermint_consensus_step_duration_bucket{chain_id="test-chain-IrF74Y",step="NewRound",le="13.894954943731381"} 29 tendermint_consensus_step_duration_bucket{chain_id="test-chain-IrF74Y",step="NewRound",le="37.27593720314942"} 29 tendermint_consensus_step_duration_bucket{chain_id="test-chain-IrF74Y",step="NewRound",le="100.00000000000006"} 29 tendermint_consensus_step_duration_bucket{chain_id="test-chain-IrF74Y",step="NewRound",le="+Inf"} 29 tendermint_consensus_step_duration_sum{chain_id="test-chain-IrF74Y",step="NewRound"} 0.168961052 tendermint_consensus_step_duration_count{chain_id="test-chain-IrF74Y",step="NewRound"} 29 tendermint_consensus_step_duration_bucket{chain_id="test-chain-IrF74Y",step="Precommit",le="0.1"} 29 tendermint_consensus_step_duration_bucket{chain_id="test-chain-IrF74Y",step="Precommit",le="0.2682695795279726"} 29 tendermint_consensus_step_duration_bucket{chain_id="test-chain-IrF74Y",step="Precommit",le="0.7196856730011522"} 29 tendermint_consensus_step_duration_bucket{chain_id="test-chain-IrF74Y",step="Precommit",le="1.9306977288832508"} 29 tendermint_consensus_step_duration_bucket{chain_id="test-chain-IrF74Y",step="Precommit",le="5.1794746792312125"} 29 tendermint_consensus_step_duration_bucket{chain_id="test-chain-IrF74Y",step="Precommit",le="13.894954943731381"} 29 tendermint_consensus_step_duration_bucket{chain_id="test-chain-IrF74Y",step="Precommit",le="37.27593720314942"} 29 tendermint_consensus_step_duration_bucket{chain_id="test-chain-IrF74Y",step="Precommit",le="100.00000000000006"} 29 tendermint_consensus_step_duration_bucket{chain_id="test-chain-IrF74Y",step="Precommit",le="+Inf"} 29 tendermint_consensus_step_duration_sum{chain_id="test-chain-IrF74Y",step="Precommit"} 0.06414115999999999 tendermint_consensus_step_duration_count{chain_id="test-chain-IrF74Y",step="Precommit"} 29 tendermint_consensus_step_duration_bucket{chain_id="test-chain-IrF74Y",step="Prevote",le="0.1"} 29 tendermint_consensus_step_duration_bucket{chain_id="test-chain-IrF74Y",step="Prevote",le="0.2682695795279726"} 29 tendermint_consensus_step_duration_bucket{chain_id="test-chain-IrF74Y",step="Prevote",le="0.7196856730011522"} 29 tendermint_consensus_step_duration_bucket{chain_id="test-chain-IrF74Y",step="Prevote",le="1.9306977288832508"} 29 tendermint_consensus_step_duration_bucket{chain_id="test-chain-IrF74Y",step="Prevote",le="5.1794746792312125"} 29 tendermint_consensus_step_duration_bucket{chain_id="test-chain-IrF74Y",step="Prevote",le="13.894954943731381"} 29 tendermint_consensus_step_duration_bucket{chain_id="test-chain-IrF74Y",step="Prevote",le="37.27593720314942"} 29 tendermint_consensus_step_duration_bucket{chain_id="test-chain-IrF74Y",step="Prevote",le="100.00000000000006"} 29 tendermint_consensus_step_duration_bucket{chain_id="test-chain-IrF74Y",step="Prevote",le="+Inf"} 29 tendermint_consensus_step_duration_sum{chain_id="test-chain-IrF74Y",step="Prevote"} 0.177714525 tendermint_consensus_step_duration_count{chain_id="test-chain-IrF74Y",step="Prevote"} 29 tendermint_consensus_step_duration_bucket{chain_id="test-chain-IrF74Y",step="Propose",le="0.1"} 29 tendermint_consensus_step_duration_bucket{chain_id="test-chain-IrF74Y",step="Propose",le="0.2682695795279726"} 29 tendermint_consensus_step_duration_bucket{chain_id="test-chain-IrF74Y",step="Propose",le="0.7196856730011522"} 29 tendermint_consensus_step_duration_bucket{chain_id="test-chain-IrF74Y",step="Propose",le="1.9306977288832508"} 29 tendermint_consensus_step_duration_bucket{chain_id="test-chain-IrF74Y",step="Propose",le="5.1794746792312125"} 29 tendermint_consensus_step_duration_bucket{chain_id="test-chain-IrF74Y",step="Propose",le="13.894954943731381"} 29 tendermint_consensus_step_duration_bucket{chain_id="test-chain-IrF74Y",step="Propose",le="37.27593720314942"} 29 tendermint_consensus_step_duration_bucket{chain_id="test-chain-IrF74Y",step="Propose",le="100.00000000000006"} 29 tendermint_consensus_step_duration_bucket{chain_id="test-chain-IrF74Y",step="Propose",le="+Inf"} 29 tendermint_consensus_step_duration_sum{chain_id="test-chain-IrF74Y",step="Propose"} 0.221851927 tendermint_consensus_step_duration_count{chain_id="test-chain-IrF74Y",step="Propose"} 29 ``` ``` # HELP tendermint_consensus_block_gossip_receive_time Difference in seconds between when the validator learns of a new blockand when the validator receives the last piece of the block. # TYPE tendermint_consensus_block_gossip_receive_time histogram tendermint_consensus_block_gossip_receive_time_bucket{chain_id="test-chain-IrF74Y",le="0.1"} 29 tendermint_consensus_block_gossip_receive_time_bucket{chain_id="test-chain-IrF74Y",le="0.2682695795279726"} 29 tendermint_consensus_block_gossip_receive_time_bucket{chain_id="test-chain-IrF74Y",le="0.7196856730011522"} 29 tendermint_consensus_block_gossip_receive_time_bucket{chain_id="test-chain-IrF74Y",le="1.9306977288832508"} 29 tendermint_consensus_block_gossip_receive_time_bucket{chain_id="test-chain-IrF74Y",le="5.1794746792312125"} 29 tendermint_consensus_block_gossip_receive_time_bucket{chain_id="test-chain-IrF74Y",le="13.894954943731381"} 29 tendermint_consensus_block_gossip_receive_time_bucket{chain_id="test-chain-IrF74Y",le="37.27593720314942"} 29 tendermint_consensus_block_gossip_receive_time_bucket{chain_id="test-chain-IrF74Y",le="100.00000000000006"} 29 tendermint_consensus_block_gossip_receive_time_bucket{chain_id="test-chain-IrF74Y",le="+Inf"} 29 tendermint_consensus_block_gossip_receive_time_sum{chain_id="test-chain-IrF74Y"} 0.06380 ``` ``` # HELP tendermint_consensus_block_gossip_parts_received Number of block parts received by the node, labeled by whether thepart was relevant to the block the node was currently gathering or not # TYPE tendermint_consensus_block_gossip_parts_received counter tendermint_consensus_block_gossip_parts_received{chain_id="test-chain-IrF74Y",matches_current="true"} 29 ``` --- internal/consensus/metrics.go | 80 ++++++++++++++++++++++++++++++++--- internal/consensus/state.go | 24 +++++++++-- 2 files changed, 94 insertions(+), 10 deletions(-) diff --git a/internal/consensus/metrics.go b/internal/consensus/metrics.go index 7526518dc..5934be420 100644 --- a/internal/consensus/metrics.go +++ b/internal/consensus/metrics.go @@ -1,9 +1,13 @@ package consensus import ( + "strings" + "time" + "github.com/go-kit/kit/metrics" "github.com/go-kit/kit/metrics/discard" + cstypes "github.com/tendermint/tendermint/internal/consensus/types" "github.com/tendermint/tendermint/types" prometheus "github.com/go-kit/kit/metrics/prometheus" @@ -27,6 +31,9 @@ type Metrics struct { // Number of rounds. Rounds metrics.Gauge + // Histogram of round duration. + RoundDuration metrics.Histogram + // Number of validators. Validators metrics.Gauge // Total power of all validators. @@ -63,8 +70,18 @@ type Metrics struct { // Number of blockparts transmitted by peer. BlockParts metrics.Counter - // Histogram of time taken per step annotated with reason that the step proceeded. - StepTime metrics.Histogram + // Histogram of step duration. + StepDuration metrics.Histogram + stepStart time.Time + + // Histogram of time taken to receive a block in seconds, measured between when a new block is first + // discovered to when the block is completed. + BlockGossipReceiveLatency metrics.Histogram + blockGossipStart time.Time + + // Number of block parts received by the node, separated by whether the part + // was relevant to the block the node is trying to gather or not. + BlockGossipPartsReceived metrics.Counter // QuroumPrevoteMessageDelay is the interval in seconds between the proposal // timestamp and the timestamp of the earliest prevote that achieved a quorum @@ -109,7 +126,13 @@ func PrometheusMetrics(namespace string, labelsAndValues ...string) *Metrics { Name: "rounds", Help: "Number of rounds.", }, labels).With(labelsAndValues...), - + RoundDuration: prometheus.NewHistogramFrom(stdprometheus.HistogramOpts{ + Namespace: namespace, + Subsystem: MetricsSubsystem, + Name: "round_duration", + Help: "Time spent in a round.", + Buckets: stdprometheus.ExponentialBucketsRange(0.1, 100, 8), + }, labels).With(labelsAndValues...), Validators: prometheus.NewGaugeFrom(stdprometheus.GaugeOpts{ Namespace: namespace, Subsystem: MetricsSubsystem, @@ -212,12 +235,28 @@ func PrometheusMetrics(namespace string, labelsAndValues ...string) *Metrics { Name: "block_parts", Help: "Number of blockparts transmitted by peer.", }, append(labels, "peer_id")).With(labelsAndValues...), - StepTime: prometheus.NewHistogramFrom(stdprometheus.HistogramOpts{ + BlockGossipReceiveLatency: prometheus.NewHistogramFrom(stdprometheus.HistogramOpts{ + Namespace: namespace, + Subsystem: MetricsSubsystem, + Name: "block_gossip_receive_latency", + Help: "Difference in seconds between when the validator learns of a new block" + + "and when the validator receives the last piece of the block.", + Buckets: stdprometheus.ExponentialBucketsRange(0.1, 100, 8), + }, labels).With(labelsAndValues...), + BlockGossipPartsReceived: prometheus.NewCounterFrom(stdprometheus.CounterOpts{ Namespace: namespace, Subsystem: MetricsSubsystem, - Name: "step_time", + Name: "block_gossip_parts_received", + Help: "Number of block parts received by the node, labeled by whether the " + + "part was relevant to the block the node was currently gathering or not.", + }, append(labels, "matches_current")).With(labelsAndValues...), + StepDuration: prometheus.NewHistogramFrom(stdprometheus.HistogramOpts{ + Namespace: namespace, + Subsystem: MetricsSubsystem, + Name: "step_duration", Help: "Time spent per step.", - }, append(labels, "step", "reason")).With(labelsAndValues...), + Buckets: stdprometheus.ExponentialBucketsRange(0.1, 100, 8), + }, append(labels, "step")).With(labelsAndValues...), QuorumPrevoteMessageDelay: prometheus.NewGaugeFrom(stdprometheus.GaugeOpts{ Namespace: namespace, Subsystem: MetricsSubsystem, @@ -251,7 +290,9 @@ func NopMetrics() *Metrics { ValidatorLastSignedHeight: discard.NewGauge(), - Rounds: discard.NewGauge(), + Rounds: discard.NewGauge(), + RoundDuration: discard.NewHistogram(), + StepDuration: discard.NewHistogram(), Validators: discard.NewGauge(), ValidatorsPower: discard.NewGauge(), @@ -271,6 +312,8 @@ func NopMetrics() *Metrics { BlockSyncing: discard.NewGauge(), StateSyncing: discard.NewGauge(), BlockParts: discard.NewCounter(), + BlockGossipReceiveLatency: discard.NewHistogram(), + BlockGossipPartsReceived: discard.NewCounter(), QuorumPrevoteMessageDelay: discard.NewGauge(), FullPrevoteMessageDelay: discard.NewGauge(), ProposalTimestampDifference: discard.NewHistogram(), @@ -284,3 +327,26 @@ func (m *Metrics) RecordConsMetrics(block *types.Block) { m.BlockSizeBytes.Observe(float64(block.Size())) m.CommittedHeight.Set(float64(block.Height)) } + +func (m *Metrics) MarkBlockGossipStarted() { + m.blockGossipStart = time.Now() +} + +func (m *Metrics) MarkBlockGossipComplete() { + m.BlockGossipReceiveLatency.Observe(time.Since(m.blockGossipStart).Seconds()) +} + +func (m *Metrics) MarkRound(r int32, st time.Time) { + m.Rounds.Set(float64(r)) + roundTime := time.Since(st).Seconds() + m.RoundDuration.Observe(roundTime) +} + +func (m *Metrics) MarkStep(s cstypes.RoundStepType) { + if !m.stepStart.IsZero() { + stepTime := time.Since(m.stepStart).Seconds() + stepName := strings.TrimPrefix(s.String(), "RoundStep") + m.StepDuration.With("step", stepName).Observe(stepTime) + } + m.stepStart = time.Now() +} diff --git a/internal/consensus/state.go b/internal/consensus/state.go index f8b5dfb4a..bfd588a44 100644 --- a/internal/consensus/state.go +++ b/internal/consensus/state.go @@ -628,6 +628,14 @@ func (cs *State) updateHeight(height int64) { } func (cs *State) updateRoundStep(round int32, step cstypes.RoundStepType) { + if !cs.replayMode { + if round != cs.Round || round == 0 && step == cstypes.RoundStepNewRound { + cs.metrics.MarkRound(cs.Round, cs.StartTime) + } + if cs.Step != step { + cs.metrics.MarkStep(cs.Step) + } + } cs.Round = round cs.Step = step } @@ -1124,9 +1132,6 @@ func (cs *State) enterNewRound(ctx context.Context, height int64, round int32) { if err := cs.eventBus.PublishEventNewRound(ctx, cs.NewRoundEvent()); err != nil { cs.logger.Error("failed publishing new round", "err", err) } - - cs.metrics.Rounds.Set(float64(round)) - // Wait for txs to be available in the mempool // before we enterPropose in round 0. If the last block changed the app hash, // we may need an empty "proof" block, and enterPropose immediately. @@ -1668,6 +1673,7 @@ func (cs *State) enterPrecommit(ctx context.Context, height int64, round int32) if !cs.ProposalBlockParts.HasHeader(blockID.PartSetHeader) { cs.ProposalBlock = nil + cs.metrics.MarkBlockGossipStarted() cs.ProposalBlockParts = types.NewPartSetFromHeader(blockID.PartSetHeader) } @@ -1758,6 +1764,7 @@ func (cs *State) enterCommit(ctx context.Context, height int64, commitRound int3 // We're getting the wrong block. // Set up ProposalBlockParts and keep waiting. cs.ProposalBlock = nil + cs.metrics.MarkBlockGossipStarted() cs.ProposalBlockParts = types.NewPartSetFromHeader(blockID.PartSetHeader) if err := cs.eventBus.PublishEventValidBlock(ctx, cs.RoundStateEvent()); err != nil { @@ -2034,6 +2041,7 @@ func (cs *State) defaultSetProposal(proposal *types.Proposal, recvTime time.Time // This happens if we're already in cstypes.RoundStepCommit or if there is a valid block in the current round. // TODO: We can check if Proposal is for a different block as this is a sign of misbehavior! if cs.ProposalBlockParts == nil { + cs.metrics.MarkBlockGossipStarted() cs.ProposalBlockParts = types.NewPartSetFromHeader(proposal.BlockID.PartSetHeader) } @@ -2054,11 +2062,13 @@ func (cs *State) addProposalBlockPart( // Blocks might be reused, so round mismatch is OK if cs.Height != height { cs.logger.Debug("received block part from wrong height", "height", height, "round", round) + cs.metrics.BlockGossipPartsReceived.With("matches_current", "false").Add(1) return false, nil } // We're not expecting a block part. if cs.ProposalBlockParts == nil { + cs.metrics.BlockGossipPartsReceived.With("matches_current", "false").Add(1) // NOTE: this can happen when we've gone to a higher round and // then receive parts from the previous round - not necessarily a bad peer. cs.logger.Debug( @@ -2073,14 +2083,21 @@ func (cs *State) addProposalBlockPart( added, err = cs.ProposalBlockParts.AddPart(part) if err != nil { + if errors.Is(err, types.ErrPartSetInvalidProof) || errors.Is(err, types.ErrPartSetUnexpectedIndex) { + cs.metrics.BlockGossipPartsReceived.With("matches_current", "false").Add(1) + } return added, err } + + cs.metrics.BlockGossipPartsReceived.With("matches_current", "true").Add(1) + if cs.ProposalBlockParts.ByteSize() > cs.state.ConsensusParams.Block.MaxBytes { return added, fmt.Errorf("total size of proposal block parts exceeds maximum block bytes (%d > %d)", cs.ProposalBlockParts.ByteSize(), cs.state.ConsensusParams.Block.MaxBytes, ) } if added && cs.ProposalBlockParts.IsComplete() { + cs.metrics.MarkBlockGossipComplete() bz, err := io.ReadAll(cs.ProposalBlockParts.GetReader()) if err != nil { return added, err @@ -2293,6 +2310,7 @@ func (cs *State) addVote( } if !cs.ProposalBlockParts.HasHeader(blockID.PartSetHeader) { + cs.metrics.MarkBlockGossipStarted() cs.ProposalBlockParts = types.NewPartSetFromHeader(blockID.PartSetHeader) }