package consensus

import (
	"context"
	"errors"
	"fmt"
	"runtime/debug"
	"sync"
	"time"

	cstypes "github.com/tendermint/tendermint/internal/consensus/types"
	"github.com/tendermint/tendermint/internal/eventbus"
	"github.com/tendermint/tendermint/internal/p2p"
	sm "github.com/tendermint/tendermint/internal/state"
	"github.com/tendermint/tendermint/libs/bits"
	tmevents "github.com/tendermint/tendermint/libs/events"
	"github.com/tendermint/tendermint/libs/log"
	"github.com/tendermint/tendermint/libs/service"
	tmtime "github.com/tendermint/tendermint/libs/time"
	tmcons "github.com/tendermint/tendermint/proto/tendermint/consensus"
	tmproto "github.com/tendermint/tendermint/proto/tendermint/types"
	"github.com/tendermint/tendermint/types"
)

var (
	_ service.Service = (*Reactor)(nil)
	_ p2p.Wrapper     = (*tmcons.Message)(nil)
)

// getChannelDescriptors produces the set of channel descriptors required by
// this package.
func getChannelDescriptors() map[p2p.ChannelID]*p2p.ChannelDescriptor {
	return map[p2p.ChannelID]*p2p.ChannelDescriptor{
		StateChannel: {
			ID:                  StateChannel,
			MessageType:         new(tmcons.Message),
			Priority:            8,
			SendQueueCapacity:   64,
			RecvMessageCapacity: maxMsgSize,
			RecvBufferCapacity:  128,
		},
		DataChannel: {
			// TODO: Consider a split between gossiping current block and catchup
			// stuff. Once we gossip the whole block there is nothing left to send
			// until next height or round.
			ID:                  DataChannel,
			MessageType:         new(tmcons.Message),
			Priority:            12,
			SendQueueCapacity:   64,
			RecvBufferCapacity:  512,
			RecvMessageCapacity: maxMsgSize,
		},
		VoteChannel: {
			ID:                  VoteChannel,
			MessageType:         new(tmcons.Message),
			Priority:            10,
			SendQueueCapacity:   64,
			RecvBufferCapacity:  128,
			RecvMessageCapacity: maxMsgSize,
		},
		VoteSetBitsChannel: {
			ID:                  VoteSetBitsChannel,
			MessageType:         new(tmcons.Message),
			Priority:            5,
			SendQueueCapacity:   8,
			RecvBufferCapacity:  128,
			RecvMessageCapacity: maxMsgSize,
		},
	}
}

const (
	StateChannel       = p2p.ChannelID(0x20)
	DataChannel        = p2p.ChannelID(0x21)
	VoteChannel        = p2p.ChannelID(0x22)
	VoteSetBitsChannel = p2p.ChannelID(0x23)

	maxMsgSize = 1048576 // 1MB; NOTE: keep in sync with types.PartSet sizes.

	blocksToContributeToBecomeGoodPeer = 10000
	votesToContributeToBecomeGoodPeer  = 10000

	listenerIDConsensus = "consensus-reactor"
)

// BlockSyncReactor is a temporary interface for switching to block sync; we
// should get rid of v0.
// See: https://github.com/tendermint/tendermint/issues/4595
type BlockSyncReactor interface {
	SwitchToBlockSync(context.Context, sm.State) error

	GetMaxPeerBlockHeight() int64

	// GetTotalSyncedTime returns the duration since block sync started.
	GetTotalSyncedTime() time.Duration

	// GetRemainingSyncTime returns the estimated time until the node is fully
	// synced. It returns 0 if block sync is not running or if fewer than 100
	// blocks have been synced.
	GetRemainingSyncTime() time.Duration
}

//go:generate ../../scripts/mockery_generate.sh ConsSyncReactor

// ConsSyncReactor defines an interface used for testing the abilities of
// node.startStateSync.
type ConsSyncReactor interface {
	SwitchToConsensus(sm.State, bool)
	SetStateSyncingMetrics(float64)
	SetBlockSyncingMetrics(float64)
}

// Reactor defines a reactor for the consensus service.
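//
// A rough wiring sketch (illustrative, not compiled): `router`, `peerUpdates`,
// `eventBus`, `cs`, and `metrics` stand in for values the node owns, and
// `router.OpenChannel` is assumed to satisfy p2p.ChannelCreator.
//
//	reactor, err := NewReactor(ctx, logger, cs, router.OpenChannel, peerUpdates, waitSync, metrics)
//	if err != nil {
//		return err
//	}
//	reactor.SetEventBus(eventBus)
//	if err := reactor.Start(ctx); err != nil { // OnStart spawns the processing goroutines
//		return err
//	}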
type Reactor struct { service.BaseService logger log.Logger state *State eventBus *eventbus.EventBus Metrics *Metrics mtx sync.RWMutex peers map[types.NodeID]*PeerState waitSync bool stateCh *p2p.Channel dataCh *p2p.Channel voteCh *p2p.Channel voteSetBitsCh *p2p.Channel peerUpdates *p2p.PeerUpdates } // NewReactor returns a reference to a new consensus reactor, which implements // the service.Service interface. It accepts a logger, consensus state, references // to relevant p2p Channels and a channel to listen for peer updates on. The // reactor will close all p2p Channels when stopping. func NewReactor( ctx context.Context, logger log.Logger, cs *State, channelCreator p2p.ChannelCreator, peerUpdates *p2p.PeerUpdates, waitSync bool, metrics *Metrics, ) (*Reactor, error) { chans := getChannelDescriptors() stateCh, err := channelCreator(ctx, chans[StateChannel]) if err != nil { return nil, err } dataCh, err := channelCreator(ctx, chans[DataChannel]) if err != nil { return nil, err } voteCh, err := channelCreator(ctx, chans[VoteChannel]) if err != nil { return nil, err } voteSetBitsCh, err := channelCreator(ctx, chans[VoteSetBitsChannel]) if err != nil { return nil, err } r := &Reactor{ logger: logger, state: cs, waitSync: waitSync, peers: make(map[types.NodeID]*PeerState), Metrics: metrics, stateCh: stateCh, dataCh: dataCh, voteCh: voteCh, voteSetBitsCh: voteSetBitsCh, peerUpdates: peerUpdates, } r.BaseService = *service.NewBaseService(logger, "Consensus", r) return r, nil } // OnStart starts separate go routines for each p2p Channel and listens for // envelopes on each. In addition, it also listens for peer updates and handles // messages on that p2p channel accordingly. The caller must be sure to execute // OnStop to ensure the outbound p2p Channels are closed. func (r *Reactor) OnStart(ctx context.Context) error { r.logger.Debug("consensus wait sync", "wait_sync", r.WaitSync()) // start routine that computes peer statistics for evaluating peer quality // // TODO: Evaluate if we need this to be synchronized via WaitGroup as to not // leak the goroutine when stopping the reactor. go r.peerStatsRoutine(ctx) r.subscribeToBroadcastEvents() if !r.WaitSync() { if err := r.state.Start(ctx); err != nil { return err } } go r.processStateCh(ctx) go r.processDataCh(ctx) go r.processVoteCh(ctx) go r.processVoteSetBitsCh(ctx) go r.processPeerUpdates(ctx) return nil } // OnStop stops the reactor by signaling to all spawned goroutines to exit and // blocking until they all exit, as well as unsubscribing from events and stopping // state. func (r *Reactor) OnStop() { r.unsubscribeFromBroadcastEvents() if err := r.state.Stop(); err != nil { if !errors.Is(err, service.ErrAlreadyStopped) { r.logger.Error("failed to stop consensus state", "err", err) } } if !r.WaitSync() { r.state.Wait() } r.mtx.Lock() // Close and wait for each of the peers to shutdown. // This is safe to perform with the lock since none of the peers require the // lock to complete any of the methods that the waitgroup is waiting on. for _, state := range r.peers { state.closer.Close() state.broadcastWG.Wait() } r.mtx.Unlock() } // SetEventBus sets the reactor's event bus. func (r *Reactor) SetEventBus(b *eventbus.EventBus) { r.eventBus = b r.state.SetEventBus(b) } // WaitSync returns whether the consensus reactor is waiting for state/block sync. func (r *Reactor) WaitSync() bool { r.mtx.RLock() defer r.mtx.RUnlock() return r.waitSync } // SwitchToConsensus switches from block-sync mode to consensus mode. 
It resets // the state, turns off block-sync, and starts the consensus state-machine. func (r *Reactor) SwitchToConsensus(ctx context.Context, state sm.State, skipWAL bool) { r.logger.Info("switching to consensus") // we have no votes, so reconstruct LastCommit from SeenCommit if state.LastBlockHeight > 0 { r.state.reconstructLastCommit(state) } // NOTE: The line below causes broadcastNewRoundStepRoutine() to broadcast a // NewRoundStepMessage. r.state.updateToState(ctx, state) r.mtx.Lock() r.waitSync = false r.mtx.Unlock() r.Metrics.BlockSyncing.Set(0) r.Metrics.StateSyncing.Set(0) if skipWAL { r.state.doWALCatchup = false } if err := r.state.Start(ctx); err != nil { panic(fmt.Sprintf(`failed to start consensus state: %v conS: %+v conR: %+v`, err, r.state, r)) } d := types.EventDataBlockSyncStatus{Complete: true, Height: state.LastBlockHeight} if err := r.eventBus.PublishEventBlockSyncStatus(ctx, d); err != nil { r.logger.Error("failed to emit the blocksync complete event", "err", err) } } // String returns a string representation of the Reactor. // // NOTE: For now, it is just a hard-coded string to avoid accessing unprotected // shared variables. // // TODO: improve! func (r *Reactor) String() string { return "ConsensusReactor" } // StringIndented returns an indented string representation of the Reactor. func (r *Reactor) StringIndented(indent string) string { r.mtx.RLock() defer r.mtx.RUnlock() s := "ConsensusReactor{\n" s += indent + " " + r.state.StringIndented(indent+" ") + "\n" for _, ps := range r.peers { s += indent + " " + ps.StringIndented(indent+" ") + "\n" } s += indent + "}" return s } // GetPeerState returns PeerState for a given NodeID. func (r *Reactor) GetPeerState(peerID types.NodeID) (*PeerState, bool) { r.mtx.RLock() defer r.mtx.RUnlock() ps, ok := r.peers[peerID] return ps, ok } func (r *Reactor) broadcastNewRoundStepMessage(ctx context.Context, rs *cstypes.RoundState) error { return r.stateCh.Send(ctx, p2p.Envelope{ Broadcast: true, Message: makeRoundStepMessage(rs), }) } func (r *Reactor) broadcastNewValidBlockMessage(ctx context.Context, rs *cstypes.RoundState) error { psHeader := rs.ProposalBlockParts.Header() return r.stateCh.Send(ctx, p2p.Envelope{ Broadcast: true, Message: &tmcons.NewValidBlock{ Height: rs.Height, Round: rs.Round, BlockPartSetHeader: psHeader.ToProto(), BlockParts: rs.ProposalBlockParts.BitArray().ToProto(), IsCommit: rs.Step == cstypes.RoundStepCommit, }, }) } func (r *Reactor) broadcastHasVoteMessage(ctx context.Context, vote *types.Vote) error { return r.stateCh.Send(ctx, p2p.Envelope{ Broadcast: true, Message: &tmcons.HasVote{ Height: vote.Height, Round: vote.Round, Type: vote.Type, Index: vote.ValidatorIndex, }, }) } // subscribeToBroadcastEvents subscribes for new round steps and votes using the // internal pubsub defined in the consensus state to broadcast them to peers // upon receiving. 
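//
// Concretely, three internal events are mapped onto outbound state-channel
// messages (see the listener registrations below):
//
//	EventNewRoundStepValue -> NewRoundStep (broadcast)
//	EventValidBlockValue   -> NewValidBlock (broadcast)
//	EventVoteValue         -> HasVote (broadcast)
//
// A failure to register a listener is logged but not treated as fatal.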
func (r *Reactor) subscribeToBroadcastEvents() { err := r.state.evsw.AddListenerForEvent( listenerIDConsensus, types.EventNewRoundStepValue, func(ctx context.Context, data tmevents.EventData) error { if err := r.broadcastNewRoundStepMessage(ctx, data.(*cstypes.RoundState)); err != nil { return err } select { case r.state.onStopCh <- data.(*cstypes.RoundState): return nil case <-ctx.Done(): return ctx.Err() default: return nil } }, ) if err != nil { r.logger.Error("failed to add listener for events", "err", err) } err = r.state.evsw.AddListenerForEvent( listenerIDConsensus, types.EventValidBlockValue, func(ctx context.Context, data tmevents.EventData) error { return r.broadcastNewValidBlockMessage(ctx, data.(*cstypes.RoundState)) }, ) if err != nil { r.logger.Error("failed to add listener for events", "err", err) } err = r.state.evsw.AddListenerForEvent( listenerIDConsensus, types.EventVoteValue, func(ctx context.Context, data tmevents.EventData) error { return r.broadcastHasVoteMessage(ctx, data.(*types.Vote)) }, ) if err != nil { r.logger.Error("failed to add listener for events", "err", err) } } func (r *Reactor) unsubscribeFromBroadcastEvents() { r.state.evsw.RemoveListener(listenerIDConsensus) } func makeRoundStepMessage(rs *cstypes.RoundState) *tmcons.NewRoundStep { return &tmcons.NewRoundStep{ Height: rs.Height, Round: rs.Round, Step: uint32(rs.Step), SecondsSinceStartTime: int64(time.Since(rs.StartTime).Seconds()), LastCommitRound: rs.LastCommit.GetRound(), } } func (r *Reactor) sendNewRoundStepMessage(ctx context.Context, peerID types.NodeID) error { return r.stateCh.Send(ctx, p2p.Envelope{ To: peerID, Message: makeRoundStepMessage(r.state.GetRoundState()), }) } func (r *Reactor) gossipDataForCatchup(ctx context.Context, rs *cstypes.RoundState, prs *cstypes.PeerRoundState, ps *PeerState) { logger := r.logger.With("height", prs.Height).With("peer", ps.peerID) if index, ok := prs.ProposalBlockParts.Not().PickRandom(); ok { // ensure that the peer's PartSetHeader is correct blockMeta := r.state.blockStore.LoadBlockMeta(prs.Height) if blockMeta == nil { logger.Error( "failed to load block meta", "our_height", rs.Height, "blockstore_base", r.state.blockStore.Base(), "blockstore_height", r.state.blockStore.Height(), ) time.Sleep(r.state.config.PeerGossipSleepDuration) return } else if !blockMeta.BlockID.PartSetHeader.Equals(prs.ProposalBlockPartSetHeader) { logger.Info( "peer ProposalBlockPartSetHeader mismatch; sleeping", "block_part_set_header", blockMeta.BlockID.PartSetHeader, "peer_block_part_set_header", prs.ProposalBlockPartSetHeader, ) time.Sleep(r.state.config.PeerGossipSleepDuration) return } part := r.state.blockStore.LoadBlockPart(prs.Height, index) if part == nil { logger.Error( "failed to load block part", "index", index, "block_part_set_header", blockMeta.BlockID.PartSetHeader, "peer_block_part_set_header", prs.ProposalBlockPartSetHeader, ) time.Sleep(r.state.config.PeerGossipSleepDuration) return } partProto, err := part.ToProto() if err != nil { logger.Error("failed to convert block part to proto", "err", err) time.Sleep(r.state.config.PeerGossipSleepDuration) return } logger.Debug("sending block part for catchup", "round", prs.Round, "index", index) _ = r.dataCh.Send(ctx, p2p.Envelope{ To: ps.peerID, Message: &tmcons.BlockPart{ Height: prs.Height, // not our height, so it does not matter. 
Round: prs.Round, // not our height, so it does not matter Part: *partProto, }, }) return } time.Sleep(r.state.config.PeerGossipSleepDuration) } func (r *Reactor) gossipDataRoutine(ctx context.Context, ps *PeerState) { logger := r.logger.With("peer", ps.peerID) defer ps.broadcastWG.Done() timer := time.NewTimer(0) defer timer.Stop() OUTER_LOOP: for { if !r.IsRunning() { return } select { case <-ctx.Done(): return case <-ps.closer.Done(): // The peer is marked for removal via a PeerUpdate as the doneCh was // explicitly closed to signal we should exit. return default: } rs := r.state.GetRoundState() prs := ps.GetRoundState() // Send proposal Block parts? if rs.ProposalBlockParts.HasHeader(prs.ProposalBlockPartSetHeader) { if index, ok := rs.ProposalBlockParts.BitArray().Sub(prs.ProposalBlockParts.Copy()).PickRandom(); ok { part := rs.ProposalBlockParts.GetPart(index) partProto, err := part.ToProto() if err != nil { logger.Error("failed to convert block part to proto", "err", err) return } logger.Debug("sending block part", "height", prs.Height, "round", prs.Round) if err := r.dataCh.Send(ctx, p2p.Envelope{ To: ps.peerID, Message: &tmcons.BlockPart{ Height: rs.Height, // this tells peer that this part applies to us Round: rs.Round, // this tells peer that this part applies to us Part: *partProto, }, }); err != nil { return } ps.SetHasProposalBlockPart(prs.Height, prs.Round, index) continue OUTER_LOOP } } // if the peer is on a previous height that we have, help catch up blockStoreBase := r.state.blockStore.Base() if blockStoreBase > 0 && 0 < prs.Height && prs.Height < rs.Height && prs.Height >= blockStoreBase { heightLogger := logger.With("height", prs.Height) // If we never received the commit message from the peer, the block parts // will not be initialized. if prs.ProposalBlockParts == nil { blockMeta := r.state.blockStore.LoadBlockMeta(prs.Height) if blockMeta == nil { heightLogger.Error( "failed to load block meta", "blockstoreBase", blockStoreBase, "blockstoreHeight", r.state.blockStore.Height(), ) timer.Reset(r.state.config.PeerGossipSleepDuration) select { case <-timer.C: case <-ctx.Done(): return } } else { ps.InitProposalBlockParts(blockMeta.BlockID.PartSetHeader) } // Continue the loop since prs is a copy and not effected by this // initialization. continue OUTER_LOOP } r.gossipDataForCatchup(ctx, rs, prs, ps) continue OUTER_LOOP } // if height and round don't match, sleep if (rs.Height != prs.Height) || (rs.Round != prs.Round) { timer.Reset(r.state.config.PeerGossipSleepDuration) select { case <-timer.C: case <-ctx.Done(): return } continue OUTER_LOOP } // By here, height and round match. // Proposal block parts were already matched and sent if any were wanted. // (These can match on hash so the round doesn't matter) // Now consider sending other things, like the Proposal itself. // Send Proposal && ProposalPOL BitArray? if rs.Proposal != nil && !prs.Proposal { // Proposal: share the proposal metadata with peer. { propProto := rs.Proposal.ToProto() logger.Debug("sending proposal", "height", prs.Height, "round", prs.Round) if err := r.dataCh.Send(ctx, p2p.Envelope{ To: ps.peerID, Message: &tmcons.Proposal{ Proposal: *propProto, }, }); err != nil { return } // NOTE: A peer might have received a different proposal message, so // this Proposal msg will be rejected! ps.SetHasProposal(rs.Proposal) } // ProposalPOL: lets peer know which POL votes we have so far. The peer // must receive ProposalMessage first. 
Note, rs.Proposal was validated, // so rs.Proposal.POLRound <= rs.Round, so we definitely have // rs.Votes.Prevotes(rs.Proposal.POLRound). if 0 <= rs.Proposal.POLRound { pPol := rs.Votes.Prevotes(rs.Proposal.POLRound).BitArray() pPolProto := pPol.ToProto() logger.Debug("sending POL", "height", prs.Height, "round", prs.Round) if err := r.dataCh.Send(ctx, p2p.Envelope{ To: ps.peerID, Message: &tmcons.ProposalPOL{ Height: rs.Height, ProposalPolRound: rs.Proposal.POLRound, ProposalPol: *pPolProto, }, }); err != nil { return } } continue OUTER_LOOP } // nothing to do -- sleep timer.Reset(r.state.config.PeerGossipSleepDuration) select { case <-timer.C: case <-ctx.Done(): return } continue OUTER_LOOP } } // pickSendVote picks a vote and sends it to the peer. It will return true if // there is a vote to send and false otherwise. func (r *Reactor) pickSendVote(ctx context.Context, ps *PeerState, votes types.VoteSetReader) (bool, error) { vote, ok := ps.PickVoteToSend(votes) if !ok { return false, nil } r.logger.Debug("sending vote message", "ps", ps, "vote", vote) if err := r.voteCh.Send(ctx, p2p.Envelope{ To: ps.peerID, Message: &tmcons.Vote{ Vote: vote.ToProto(), }, }); err != nil { return false, err } ps.SetHasVote(vote) return true, nil } func (r *Reactor) gossipVotesForHeight( ctx context.Context, rs *cstypes.RoundState, prs *cstypes.PeerRoundState, ps *PeerState, ) (bool, error) { logger := r.logger.With("height", prs.Height).With("peer", ps.peerID) // if there are lastCommits to send... if prs.Step == cstypes.RoundStepNewHeight { if ok, err := r.pickSendVote(ctx, ps, rs.LastCommit); err != nil { return false, err } else if ok { logger.Debug("picked rs.LastCommit to send") return true, nil } } // if there are POL prevotes to send... if prs.Step <= cstypes.RoundStepPropose && prs.Round != -1 && prs.Round <= rs.Round && prs.ProposalPOLRound != -1 { if polPrevotes := rs.Votes.Prevotes(prs.ProposalPOLRound); polPrevotes != nil { if ok, err := r.pickSendVote(ctx, ps, polPrevotes); err != nil { return false, err } else if ok { logger.Debug("picked rs.Prevotes(prs.ProposalPOLRound) to send", "round", prs.ProposalPOLRound) return true, nil } } } // if there are prevotes to send... if prs.Step <= cstypes.RoundStepPrevoteWait && prs.Round != -1 && prs.Round <= rs.Round { if ok, err := r.pickSendVote(ctx, ps, rs.Votes.Prevotes(prs.Round)); err != nil { return false, err } else if ok { logger.Debug("picked rs.Prevotes(prs.Round) to send", "round", prs.Round) return true, nil } } // if there are precommits to send... if prs.Step <= cstypes.RoundStepPrecommitWait && prs.Round != -1 && prs.Round <= rs.Round { if ok, err := r.pickSendVote(ctx, ps, rs.Votes.Precommits(prs.Round)); err != nil { return false, err } else if ok { logger.Debug("picked rs.Precommits(prs.Round) to send", "round", prs.Round) return true, nil } } // if there are prevotes to send...(which are needed because of validBlock mechanism) if prs.Round != -1 && prs.Round <= rs.Round { if ok, err := r.pickSendVote(ctx, ps, rs.Votes.Prevotes(prs.Round)); err != nil { return false, err } else if ok { logger.Debug("picked rs.Prevotes(prs.Round) to send", "round", prs.Round) return true, nil } } // if there are POLPrevotes to send... 
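	// This last fallback ignores the peer's step: if the peer has advertised any
	// ProposalPOLRound, send whatever prevotes we hold for that round (the POL
	// branch above additionally required a matching step and round).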
if prs.ProposalPOLRound != -1 { if polPrevotes := rs.Votes.Prevotes(prs.ProposalPOLRound); polPrevotes != nil { if ok, err := r.pickSendVote(ctx, ps, polPrevotes); err != nil { return false, err } else if ok { logger.Debug("picked rs.Prevotes(prs.ProposalPOLRound) to send", "round", prs.ProposalPOLRound) return true, nil } } } return false, nil } func (r *Reactor) gossipVotesRoutine(ctx context.Context, ps *PeerState) { logger := r.logger.With("peer", ps.peerID) defer ps.broadcastWG.Done() // XXX: simple hack to throttle logs upon sleep logThrottle := 0 timer := time.NewTimer(0) defer timer.Stop() OUTER_LOOP: for { if !r.IsRunning() { return } select { case <-ctx.Done(): return case <-ps.closer.Done(): // The peer is marked for removal via a PeerUpdate as the doneCh was // explicitly closed to signal we should exit. return default: } rs := r.state.GetRoundState() prs := ps.GetRoundState() switch logThrottle { case 1: // first sleep logThrottle = 2 case 2: // no more sleep logThrottle = 0 } // if height matches, then send LastCommit, Prevotes, and Precommits if rs.Height == prs.Height { if ok, err := r.gossipVotesForHeight(ctx, rs, prs, ps); err != nil { return } else if ok { continue OUTER_LOOP } } // special catchup logic -- if peer is lagging by height 1, send LastCommit if prs.Height != 0 && rs.Height == prs.Height+1 { if ok, err := r.pickSendVote(ctx, ps, rs.LastCommit); err != nil { return } else if ok { logger.Debug("picked rs.LastCommit to send", "height", prs.Height) continue OUTER_LOOP } } // catchup logic -- if peer is lagging by more than 1, send Commit blockStoreBase := r.state.blockStore.Base() if blockStoreBase > 0 && prs.Height != 0 && rs.Height >= prs.Height+2 && prs.Height >= blockStoreBase { // Load the block commit for prs.Height, which contains precommit // signatures for prs.Height. if commit := r.state.blockStore.LoadBlockCommit(prs.Height); commit != nil { if ok, err := r.pickSendVote(ctx, ps, commit); err != nil { return } else if ok { logger.Debug("picked Catchup commit to send", "height", prs.Height) continue OUTER_LOOP } } } if logThrottle == 0 { // we sent nothing -- sleep logThrottle = 1 logger.Debug( "no votes to send; sleeping", "rs.Height", rs.Height, "prs.Height", prs.Height, "localPV", rs.Votes.Prevotes(rs.Round).BitArray(), "peerPV", prs.Prevotes, "localPC", rs.Votes.Precommits(rs.Round).BitArray(), "peerPC", prs.Precommits, ) } else if logThrottle == 2 { logThrottle = 1 } timer.Reset(r.state.config.PeerGossipSleepDuration) select { case <-ctx.Done(): return case <-timer.C: } continue OUTER_LOOP } } // NOTE: `queryMaj23Routine` has a simple crude design since it only comes // into play for liveness when there's a signature DDoS attack happening. func (r *Reactor) queryMaj23Routine(ctx context.Context, ps *PeerState) { defer ps.broadcastWG.Done() timer := time.NewTimer(0) defer timer.Stop() OUTER_LOOP: for { if !r.IsRunning() { return } select { case <-ctx.Done(): return case <-ps.closer.Done(): // The peer is marked for removal via a PeerUpdate as the doneCh was // explicitly closed to signal we should exit. 
return default: } // maybe send Height/Round/Prevotes { rs := r.state.GetRoundState() prs := ps.GetRoundState() if rs.Height == prs.Height { if maj23, ok := rs.Votes.Prevotes(prs.Round).TwoThirdsMajority(); ok { if err := r.stateCh.Send(ctx, p2p.Envelope{ To: ps.peerID, Message: &tmcons.VoteSetMaj23{ Height: prs.Height, Round: prs.Round, Type: tmproto.PrevoteType, BlockID: maj23.ToProto(), }, }); err != nil { return } timer.Reset(r.state.config.PeerQueryMaj23SleepDuration) select { case <-timer.C: case <-ctx.Done(): return } } } } // maybe send Height/Round/Precommits { rs := r.state.GetRoundState() prs := ps.GetRoundState() if rs.Height == prs.Height { if maj23, ok := rs.Votes.Precommits(prs.Round).TwoThirdsMajority(); ok { if err := r.stateCh.Send(ctx, p2p.Envelope{ To: ps.peerID, Message: &tmcons.VoteSetMaj23{ Height: prs.Height, Round: prs.Round, Type: tmproto.PrecommitType, BlockID: maj23.ToProto(), }, }); err != nil { return } select { case <-timer.C: timer.Reset(r.state.config.PeerQueryMaj23SleepDuration) case <-ctx.Done(): return } } } } // maybe send Height/Round/ProposalPOL { rs := r.state.GetRoundState() prs := ps.GetRoundState() if rs.Height == prs.Height && prs.ProposalPOLRound >= 0 { if maj23, ok := rs.Votes.Prevotes(prs.ProposalPOLRound).TwoThirdsMajority(); ok { if err := r.stateCh.Send(ctx, p2p.Envelope{ To: ps.peerID, Message: &tmcons.VoteSetMaj23{ Height: prs.Height, Round: prs.ProposalPOLRound, Type: tmproto.PrevoteType, BlockID: maj23.ToProto(), }, }); err != nil { return } timer.Reset(r.state.config.PeerQueryMaj23SleepDuration) select { case <-timer.C: case <-ctx.Done(): return } } } } // Little point sending LastCommitRound/LastCommit, these are fleeting and // non-blocking. // maybe send Height/CatchupCommitRound/CatchupCommit { prs := ps.GetRoundState() if prs.CatchupCommitRound != -1 && prs.Height > 0 && prs.Height <= r.state.blockStore.Height() && prs.Height >= r.state.blockStore.Base() { if commit := r.state.LoadCommit(prs.Height); commit != nil { if err := r.stateCh.Send(ctx, p2p.Envelope{ To: ps.peerID, Message: &tmcons.VoteSetMaj23{ Height: prs.Height, Round: commit.Round, Type: tmproto.PrecommitType, BlockID: commit.BlockID.ToProto(), }, }); err != nil { return } timer.Reset(r.state.config.PeerQueryMaj23SleepDuration) select { case <-timer.C: case <-ctx.Done(): return } } } } timer.Reset(r.state.config.PeerQueryMaj23SleepDuration) select { case <-timer.C: case <-ctx.Done(): return } continue OUTER_LOOP } } // processPeerUpdate process a peer update message. For new or reconnected peers, // we create a peer state if one does not exist for the peer, which should always // be the case, and we spawn all the relevant goroutine to broadcast messages to // the peer. During peer removal, we remove the peer for our set of peers and // signal to all spawned goroutines to gracefully exit in a non-blocking manner. func (r *Reactor) processPeerUpdate(ctx context.Context, peerUpdate p2p.PeerUpdate) { r.logger.Debug("received peer update", "peer", peerUpdate.NodeID, "status", peerUpdate.Status) r.mtx.Lock() defer r.mtx.Unlock() switch peerUpdate.Status { case p2p.PeerStatusUp: // Do not allow starting new broadcasting goroutines after reactor shutdown // has been initiated. This can happen after we've manually closed all // peer goroutines, but the router still sends in-flight peer updates. 
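		// For a new (or reconnected) peer the steps below are: create or reuse its
		// PeerState, mark it running, and spawn the three per-peer broadcast
		// goroutines (gossipDataRoutine, gossipVotesRoutine, queryMaj23Routine),
		// all tracked by broadcastWG so OnStop and PeerStatusDown can wait on them.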
if !r.IsRunning() { return } var ( ps *PeerState ok bool ) ps, ok = r.peers[peerUpdate.NodeID] if !ok { ps = NewPeerState(r.logger, peerUpdate.NodeID) r.peers[peerUpdate.NodeID] = ps } if !ps.IsRunning() { // Set the peer state's closer to signal to all spawned goroutines to exit // when the peer is removed. We also set the running state to ensure we // do not spawn multiple instances of the same goroutines and finally we // set the waitgroup counter so we know when all goroutines have exited. ps.broadcastWG.Add(3) ps.SetRunning(true) // start goroutines for this peer go r.gossipDataRoutine(ctx, ps) go r.gossipVotesRoutine(ctx, ps) go r.queryMaj23Routine(ctx, ps) // Send our state to the peer. If we're block-syncing, broadcast a // RoundStepMessage later upon SwitchToConsensus(). if !r.waitSync { go func() { _ = r.sendNewRoundStepMessage(ctx, ps.peerID) }() } } case p2p.PeerStatusDown: ps, ok := r.peers[peerUpdate.NodeID] if ok && ps.IsRunning() { // signal to all spawned goroutines for the peer to gracefully exit ps.closer.Close() go func() { // Wait for all spawned broadcast goroutines to exit before marking the // peer state as no longer running and removal from the peers map. ps.broadcastWG.Wait() r.mtx.Lock() delete(r.peers, peerUpdate.NodeID) r.mtx.Unlock() ps.SetRunning(false) }() } } } // handleStateMessage handles envelopes sent from peers on the StateChannel. // An error is returned if the message is unrecognized or if validation fails. // If we fail to find the peer state for the envelope sender, we perform a no-op // and return. This can happen when we process the envelope after the peer is // removed. func (r *Reactor) handleStateMessage(ctx context.Context, envelope *p2p.Envelope, msgI Message) error { ps, ok := r.GetPeerState(envelope.From) if !ok || ps == nil { r.logger.Debug("failed to find peer state", "peer", envelope.From, "ch_id", "StateChannel") return nil } switch msg := envelope.Message.(type) { case *tmcons.NewRoundStep: r.state.mtx.RLock() initialHeight := r.state.state.InitialHeight r.state.mtx.RUnlock() if err := msgI.(*NewRoundStepMessage).ValidateHeight(initialHeight); err != nil { r.logger.Error("peer sent us an invalid msg", "msg", msg, "err", err) return err } ps.ApplyNewRoundStepMessage(msgI.(*NewRoundStepMessage)) case *tmcons.NewValidBlock: ps.ApplyNewValidBlockMessage(msgI.(*NewValidBlockMessage)) case *tmcons.HasVote: ps.ApplyHasVoteMessage(msgI.(*HasVoteMessage)) case *tmcons.VoteSetMaj23: r.state.mtx.RLock() height, votes := r.state.Height, r.state.Votes r.state.mtx.RUnlock() if height != msg.Height { return nil } vsmMsg := msgI.(*VoteSetMaj23Message) // peer claims to have a maj23 for some BlockID at err := votes.SetPeerMaj23(msg.Round, msg.Type, ps.peerID, vsmMsg.BlockID) if err != nil { return err } // Respond with a VoteSetBitsMessage showing which votes we have and // consequently shows which we don't have. 
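			// For example, if the peer claims +2/3 prevotes for some BlockID at round
			// R, we answer with a bit array, indexed by validator index, of the
			// prevotes we have for exactly that BlockID at round R.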
var ourVotes *bits.BitArray switch vsmMsg.Type { case tmproto.PrevoteType: ourVotes = votes.Prevotes(msg.Round).BitArrayByBlockID(vsmMsg.BlockID) case tmproto.PrecommitType: ourVotes = votes.Precommits(msg.Round).BitArrayByBlockID(vsmMsg.BlockID) default: panic("bad VoteSetBitsMessage field type; forgot to add a check in ValidateBasic?") } eMsg := &tmcons.VoteSetBits{ Height: msg.Height, Round: msg.Round, Type: msg.Type, BlockID: msg.BlockID, } if votesProto := ourVotes.ToProto(); votesProto != nil { eMsg.Votes = *votesProto } if err := r.voteSetBitsCh.Send(ctx, p2p.Envelope{ To: envelope.From, Message: eMsg, }); err != nil { return err } default: return fmt.Errorf("received unknown message on StateChannel: %T", msg) } return nil } // handleDataMessage handles envelopes sent from peers on the DataChannel. If we // fail to find the peer state for the envelope sender, we perform a no-op and // return. This can happen when we process the envelope after the peer is // removed. func (r *Reactor) handleDataMessage(ctx context.Context, envelope *p2p.Envelope, msgI Message) error { logger := r.logger.With("peer", envelope.From, "ch_id", "DataChannel") ps, ok := r.GetPeerState(envelope.From) if !ok || ps == nil { r.logger.Debug("failed to find peer state") return nil } if r.WaitSync() { logger.Info("ignoring message received during sync", "msg", fmt.Sprintf("%T", msgI)) return nil } switch msg := envelope.Message.(type) { case *tmcons.Proposal: pMsg := msgI.(*ProposalMessage) ps.SetHasProposal(pMsg.Proposal) select { case <-ctx.Done(): return ctx.Err() case r.state.peerMsgQueue <- msgInfo{pMsg, envelope.From, tmtime.Now()}: } case *tmcons.ProposalPOL: ps.ApplyProposalPOLMessage(msgI.(*ProposalPOLMessage)) case *tmcons.BlockPart: bpMsg := msgI.(*BlockPartMessage) ps.SetHasProposalBlockPart(bpMsg.Height, bpMsg.Round, int(bpMsg.Part.Index)) r.Metrics.BlockParts.With("peer_id", string(envelope.From)).Add(1) select { case r.state.peerMsgQueue <- msgInfo{bpMsg, envelope.From, tmtime.Now()}: return nil case <-ctx.Done(): return ctx.Err() } default: return fmt.Errorf("received unknown message on DataChannel: %T", msg) } return nil } // handleVoteMessage handles envelopes sent from peers on the VoteChannel. If we // fail to find the peer state for the envelope sender, we perform a no-op and // return. This can happen when we process the envelope after the peer is // removed. func (r *Reactor) handleVoteMessage(ctx context.Context, envelope *p2p.Envelope, msgI Message) error { logger := r.logger.With("peer", envelope.From, "ch_id", "VoteChannel") ps, ok := r.GetPeerState(envelope.From) if !ok || ps == nil { r.logger.Debug("failed to find peer state") return nil } if r.WaitSync() { logger.Info("ignoring message received during sync", "msg", msgI) return nil } switch msg := envelope.Message.(type) { case *tmcons.Vote: r.state.mtx.RLock() height, valSize, lastCommitSize := r.state.Height, r.state.Validators.Size(), r.state.LastCommit.Size() r.state.mtx.RUnlock() vMsg := msgI.(*VoteMessage) ps.EnsureVoteBitArrays(height, valSize) ps.EnsureVoteBitArrays(height-1, lastCommitSize) ps.SetHasVote(vMsg.Vote) select { case r.state.peerMsgQueue <- msgInfo{vMsg, envelope.From, tmtime.Now()}: return nil case <-ctx.Done(): return ctx.Err() } default: return fmt.Errorf("received unknown message on VoteChannel: %T", msg) } } // handleVoteSetBitsMessage handles envelopes sent from peers on the // VoteSetBitsChannel. If we fail to find the peer state for the envelope sender, // we perform a no-op and return. 
This can happen when we process the envelope // after the peer is removed. func (r *Reactor) handleVoteSetBitsMessage(ctx context.Context, envelope *p2p.Envelope, msgI Message) error { logger := r.logger.With("peer", envelope.From, "ch_id", "VoteSetBitsChannel") ps, ok := r.GetPeerState(envelope.From) if !ok || ps == nil { r.logger.Debug("failed to find peer state") return nil } if r.WaitSync() { logger.Info("ignoring message received during sync", "msg", msgI) return nil } switch msg := envelope.Message.(type) { case *tmcons.VoteSetBits: r.state.mtx.RLock() height, votes := r.state.Height, r.state.Votes r.state.mtx.RUnlock() vsbMsg := msgI.(*VoteSetBitsMessage) if height == msg.Height { var ourVotes *bits.BitArray switch msg.Type { case tmproto.PrevoteType: ourVotes = votes.Prevotes(msg.Round).BitArrayByBlockID(vsbMsg.BlockID) case tmproto.PrecommitType: ourVotes = votes.Precommits(msg.Round).BitArrayByBlockID(vsbMsg.BlockID) default: panic("bad VoteSetBitsMessage field type; forgot to add a check in ValidateBasic?") } ps.ApplyVoteSetBitsMessage(vsbMsg, ourVotes) } else { ps.ApplyVoteSetBitsMessage(vsbMsg, nil) } default: return fmt.Errorf("received unknown message on VoteSetBitsChannel: %T", msg) } return nil } // handleMessage handles an Envelope sent from a peer on a specific p2p Channel. // It will handle errors and any possible panics gracefully. A caller can handle // any error returned by sending a PeerError on the respective channel. // // NOTE: We process these messages even when we're block syncing. Messages affect // either a peer state or the consensus state. Peer state updates can happen in // parallel, but processing of proposals, block parts, and votes are ordered by // the p2p channel. // // NOTE: We block on consensus state for proposals, block parts, and votes. func (r *Reactor) handleMessage(ctx context.Context, chID p2p.ChannelID, envelope *p2p.Envelope) (err error) { defer func() { if e := recover(); e != nil { err = fmt.Errorf("panic in processing message: %v", e) r.logger.Error( "recovering from processing message panic", "err", err, "stack", string(debug.Stack()), ) } }() // We wrap the envelope's message in a Proto wire type so we can convert back // the domain type that individual channel message handlers can work with. We // do this here once to avoid having to do it for each individual message type. // and because a large part of the core business logic depends on these // domain types opposed to simply working with the Proto types. protoMsg := new(tmcons.Message) if err := protoMsg.Wrap(envelope.Message); err != nil { return err } msgI, err := MsgFromProto(protoMsg) if err != nil { return err } r.logger.Debug("received message", "ch_id", chID, "message", msgI, "peer", envelope.From) switch chID { case StateChannel: err = r.handleStateMessage(ctx, envelope, msgI) case DataChannel: err = r.handleDataMessage(ctx, envelope, msgI) case VoteChannel: err = r.handleVoteMessage(ctx, envelope, msgI) case VoteSetBitsChannel: err = r.handleVoteSetBitsMessage(ctx, envelope, msgI) default: err = fmt.Errorf("unknown channel ID (%d) for envelope (%v)", chID, envelope) } return err } // processStateCh initiates a blocking process where we listen for and handle // envelopes on the StateChannel. Any error encountered during message // execution will result in a PeerError being sent on the StateChannel. When // the reactor is stopped, we will catch the signal and close the p2p Channel // gracefully. 
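//
// All four process*Ch routines share the same receive-loop shape (sketch):
//
//	iter := ch.Receive(ctx)
//	for iter.Next(ctx) {
//		envelope := iter.Envelope()
//		if err := r.handleMessage(ctx, ch.ID, envelope); err != nil {
//			_ = ch.SendError(ctx, p2p.PeerError{NodeID: envelope.From, Err: err})
//		}
//	}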
func (r *Reactor) processStateCh(ctx context.Context) {
	iter := r.stateCh.Receive(ctx)
	for iter.Next(ctx) {
		envelope := iter.Envelope()
		if err := r.handleMessage(ctx, r.stateCh.ID, envelope); err != nil {
			r.logger.Error("failed to process message", "ch_id", r.stateCh.ID, "envelope", envelope, "err", err)
			if serr := r.stateCh.SendError(ctx, p2p.PeerError{
				NodeID: envelope.From,
				Err:    err,
			}); serr != nil {
				return
			}
		}
	}
}

// processDataCh initiates a blocking process where we listen for and handle
// envelopes on the DataChannel. Any error encountered during message
// execution will result in a PeerError being sent on the DataChannel. When
// the reactor is stopped, we will catch the signal and close the p2p Channel
// gracefully.
func (r *Reactor) processDataCh(ctx context.Context) {
	iter := r.dataCh.Receive(ctx)
	for iter.Next(ctx) {
		envelope := iter.Envelope()
		if err := r.handleMessage(ctx, r.dataCh.ID, envelope); err != nil {
			r.logger.Error("failed to process message", "ch_id", r.dataCh.ID, "envelope", envelope, "err", err)
			if serr := r.dataCh.SendError(ctx, p2p.PeerError{
				NodeID: envelope.From,
				Err:    err,
			}); serr != nil {
				return
			}
		}
	}
}

// processVoteCh initiates a blocking process where we listen for and handle
// envelopes on the VoteChannel. Any error encountered during message
// execution will result in a PeerError being sent on the VoteChannel. When
// the reactor is stopped, we will catch the signal and close the p2p Channel
// gracefully.
func (r *Reactor) processVoteCh(ctx context.Context) {
	iter := r.voteCh.Receive(ctx)
	for iter.Next(ctx) {
		envelope := iter.Envelope()
		if err := r.handleMessage(ctx, r.voteCh.ID, envelope); err != nil {
			r.logger.Error("failed to process message", "ch_id", r.voteCh.ID, "envelope", envelope, "err", err)
			if serr := r.voteCh.SendError(ctx, p2p.PeerError{
				NodeID: envelope.From,
				Err:    err,
			}); serr != nil {
				return
			}
		}
	}
}

// processVoteSetBitsCh initiates a blocking process where we listen for and
// handle envelopes on the VoteSetBitsChannel. Any error encountered during
// message execution will result in a PeerError being sent on the
// VoteSetBitsChannel. When the reactor is stopped, we will catch the signal
// and close the p2p Channel gracefully.
func (r *Reactor) processVoteSetBitsCh(ctx context.Context) {
	iter := r.voteSetBitsCh.Receive(ctx)
	for iter.Next(ctx) {
		envelope := iter.Envelope()
		if err := r.handleMessage(ctx, r.voteSetBitsCh.ID, envelope); err != nil {
			if errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) {
				return
			}
			r.logger.Error("failed to process message", "ch_id", r.voteSetBitsCh.ID, "envelope", envelope, "err", err)
			if serr := r.voteSetBitsCh.SendError(ctx, p2p.PeerError{
				NodeID: envelope.From,
				Err:    err,
			}); serr != nil {
				return
			}
		}
	}
}

// processPeerUpdates initiates a blocking process where we listen for and handle
// PeerUpdate messages. When the reactor is stopped, we will catch the signal and
// close the p2p PeerUpdatesCh gracefully.
func (r *Reactor) processPeerUpdates(ctx context.Context) { for { select { case <-ctx.Done(): return case peerUpdate := <-r.peerUpdates.Updates(): r.processPeerUpdate(ctx, peerUpdate) } } } func (r *Reactor) peerStatsRoutine(ctx context.Context) { for { if !r.IsRunning() { r.logger.Info("stopping peerStatsRoutine") return } select { case msg := <-r.state.statsMsgQueue: ps, ok := r.GetPeerState(msg.PeerID) if !ok || ps == nil { r.logger.Debug("attempt to update stats for non-existent peer", "peer", msg.PeerID) continue } switch msg.Msg.(type) { case *VoteMessage: if numVotes := ps.RecordVote(); numVotes%votesToContributeToBecomeGoodPeer == 0 { r.peerUpdates.SendUpdate(ctx, p2p.PeerUpdate{ NodeID: msg.PeerID, Status: p2p.PeerStatusGood, }) } case *BlockPartMessage: if numParts := ps.RecordBlockPart(); numParts%blocksToContributeToBecomeGoodPeer == 0 { r.peerUpdates.SendUpdate(ctx, p2p.PeerUpdate{ NodeID: msg.PeerID, Status: p2p.PeerStatusGood, }) } } case <-ctx.Done(): return } } } func (r *Reactor) GetConsensusState() *State { return r.state } func (r *Reactor) SetStateSyncingMetrics(v float64) { r.Metrics.StateSyncing.Set(v) } func (r *Reactor) SetBlockSyncingMetrics(v float64) { r.Metrics.BlockSyncing.Set(v) }
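
// Usage note (illustrative): callers outside this package are expected to read
// consensus information through the getters above rather than touching the
// reactor's fields directly, e.g.
//
//	if ps, ok := reactor.GetPeerState(nodeID); ok {
//		fmt.Println(ps.StringIndented("  "))
//	}
//	rs := reactor.GetConsensusState().GetRoundState()
//
// where `reactor` and `nodeID` stand in for values owned by the node.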