package p2p

import (
	"fmt"
	"sort"
	"strconv"

	"github.com/gogo/protobuf/proto"

	tmsync "github.com/tendermint/tendermint/internal/libs/sync"
	"github.com/tendermint/tendermint/libs/log"
)

// wrappedEnvelope wraps a p2p Envelope with its precomputed size.
type wrappedEnvelope struct {
	envelope Envelope
	size     uint
}

// assert the WDRR scheduler implements the queue interface at compile-time
var _ queue = (*wdrrScheduler)(nil)

// wdrrScheduler implements a Weighted Deficit Round Robin (WDRR) scheduling
// algorithm via the queue interface. A WDRR queue is created per peer, where
// the queue will have N flows. Each flow corresponds to a p2p Channel, so
// there are N input flows and a single output source, the peer's connection.
//
// The WDRR scheduler contains a shared buffer with a fixed capacity.
//
// Each flow has the following:
// - quantum: The number of bytes that is added to the deficit counter of the
//   flow in each round. The flow can send at most quantum bytes at a time. Each
//   flow has its own unique quantum, which gives the queue its weighted nature.
//   A higher quantum corresponds to a higher weight/priority. The quantum is
//   computed as MaxSendBytes * Priority.
// - deficit counter: The number of bytes that the flow is allowed to transmit
//   when it is its turn.
//
// See: https://en.wikipedia.org/wiki/Deficit_round_robin
type wdrrScheduler struct {
	logger       log.Logger
	metrics      *Metrics
	chDescs      []ChannelDescriptor
	capacity     uint
	size         uint
	chPriorities map[ChannelID]uint
	buffer       map[ChannelID][]wrappedEnvelope
	quanta       map[ChannelID]uint
	deficits     map[ChannelID]uint

	closer *tmsync.Closer
	doneCh *tmsync.Closer

	enqueueCh chan Envelope
	dequeueCh chan Envelope
}

func newWDRRScheduler(
	logger log.Logger,
	m *Metrics,
	chDescs []ChannelDescriptor,
	enqueueBuf, dequeueBuf, capacity uint,
) *wdrrScheduler {
	// copy each ChannelDescriptor and sort them by channel priority
	chDescsCopy := make([]ChannelDescriptor, len(chDescs))
	copy(chDescsCopy, chDescs)
	sort.Slice(chDescsCopy, func(i, j int) bool { return chDescsCopy[i].Priority > chDescsCopy[j].Priority })

	var (
		buffer       = make(map[ChannelID][]wrappedEnvelope)
		chPriorities = make(map[ChannelID]uint)
		quanta       = make(map[ChannelID]uint)
		deficits     = make(map[ChannelID]uint)
	)

	for _, chDesc := range chDescsCopy {
		chID := ChannelID(chDesc.ID)
		chPriorities[chID] = uint(chDesc.Priority)
		buffer[chID] = make([]wrappedEnvelope, 0)
		quanta[chID] = chDesc.MaxSendBytes * uint(chDesc.Priority)
	}

	return &wdrrScheduler{
		logger:       logger.With("queue", "wdrr"),
		metrics:      m,
		capacity:     capacity,
		chPriorities: chPriorities,
		chDescs:      chDescsCopy,
		buffer:       buffer,
		quanta:       quanta,
		deficits:     deficits,
		closer:       tmsync.NewCloser(),
		doneCh:       tmsync.NewCloser(),
		enqueueCh:    make(chan Envelope, enqueueBuf),
		dequeueCh:    make(chan Envelope, dequeueBuf),
	}
}

// enqueue returns a write-only channel which a producer can send on.
func (s *wdrrScheduler) enqueue() chan<- Envelope {
	return s.enqueueCh
}

// dequeue returns a read-only channel which a consumer can read from.
func (s *wdrrScheduler) dequeue() <-chan Envelope {
	return s.dequeueCh
}

func (s *wdrrScheduler) closed() <-chan struct{} {
	return s.closer.Done()
}
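// Example usage (an illustrative sketch only, not part of this file's API
// contract): a producer should select on closed() while sending on enqueue()
// so it cannot block forever once the queue is closed. The channel
// descriptors, envelope, and the NopMetrics helper below are placeholders /
// assumed to be available elsewhere in this package.
//
//	q := newWDRRScheduler(log.NewNopLogger(), NopMetrics(), chDescs, 32, 32, 16384)
//	q.start()
//	defer q.close()
//
//	select {
//	case q.enqueue() <- envelope:
//	case <-q.closed():
//		// the queue was closed; the envelope is not sent
//	}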
// close closes the WDRR queue. After this call enqueue() will block, so the
// caller must select on closed() as well to avoid blocking forever. The
// channels returned by enqueue() and dequeue(), along with the internal
// channels, will NOT be closed. Note, close() will block until all externally
// spawned goroutines have exited.
func (s *wdrrScheduler) close() {
	s.closer.Close()
	<-s.doneCh.Done()
}

// start spawns the blocking WDRR queue process in a goroutine. This must be
// called before the queue can process and accept Envelopes.
func (s *wdrrScheduler) start() {
	go s.process()
}

// process starts a blocking WDRR scheduler process, where we continuously
// evaluate if we need to attempt to enqueue an Envelope or schedule Envelopes
// to be dequeued and subsequently read and sent on the source connection.
// Internally, each p2p Channel maps to a flow, where each flow has a deficit
// and a quantum.
//
// For each Envelope requested to be enqueued, we evaluate if there is sufficient
// capacity in the shared buffer to add the Envelope. If so, it is added.
// Otherwise, we evaluate all flows of lower priority, where we attempt to find
// an existing Envelope in the shared buffer of sufficient size that can be
// dropped in place of the incoming Envelope. If there is no such Envelope that
// can be dropped, then the incoming Envelope is dropped.
//
// When there is nothing to be enqueued, we perform the WDRR algorithm and
// determine which Envelopes can be dequeued. For each Envelope that can be
// dequeued, it is sent on the dequeueCh. Specifically, for each flow, if it is
// non-empty, its deficit counter is incremented by its quantum value. Then, the
// value of the deficit counter is the maximum number of bytes that can be sent
// in this round. If the deficit counter is greater than or equal to the size of
// the Envelope's message at the head of the queue (HoQ), this Envelope can be
// sent and the value of the counter is decremented by the message's size. Then,
// the size of the next Envelope's message is compared to the counter value, and
// so on. Once the flow is empty or the value of the counter is insufficient,
// the scheduler will skip to the next flow. If the flow is empty, the value of
// the deficit counter is reset to 0.
//
// XXX/TODO: Evaluate the single goroutine scheduler mechanism. In other words,
// evaluate the effectiveness and performance of having a single goroutine
// handle both the enqueueing and dequeueing logic. Specifically, there is
// potentially contention between reading off of enqueueCh and trying to
// enqueue while also attempting to perform the WDRR algorithm and find the next
// set of Envelope(s) to send on the dequeueCh. Alternatively, we could consider
// separate scheduling goroutines, but then that requires the use of mutexes and
// possibly degraded performance.
func (s *wdrrScheduler) process() {
	defer s.doneCh.Close()

	for {
		select {
		case <-s.closer.Done():
			return

		case e := <-s.enqueueCh:
			// attempt to enqueue the incoming Envelope
			chIDStr := strconv.Itoa(int(e.channelID))
			wEnv := wrappedEnvelope{envelope: e, size: uint(proto.Size(e.Message))}
			msgSize := wEnv.size

			s.metrics.PeerPendingSendBytes.With("peer_id", string(e.To)).Add(float64(msgSize))

			// If we're at capacity, we need to either drop the incoming Envelope or
			// an Envelope from a lower priority flow. Otherwise, we add the (wrapped)
			// envelope to the flow's queue.
			if s.size+wEnv.size > s.capacity {
				chPriority := s.chPriorities[e.channelID]

				var (
					canDrop  bool
					dropIdx  int
					dropChID ChannelID
				)

				// Evaluate all lower priority flows and determine if there exists an
				// Envelope that is of equal or greater size that we can drop in favor
				// of the incoming Envelope.
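				//
				// Illustrative example (hypothetical numbers, not defaults from this
				// package): with capacity=1024, size=1000 and an incoming 100-byte
				// Envelope on a priority-5 channel, the loop below scans flows with
				// priority < 5, starting from the lowest priority, for a buffered
				// Envelope of at least 100 bytes to evict; if none exists, the
				// incoming Envelope is dropped instead.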
				for i := len(s.chDescs) - 1; i >= 0 && uint(s.chDescs[i].Priority) < chPriority && !canDrop; i-- {
					currChID := ChannelID(s.chDescs[i].ID)
					flow := s.buffer[currChID]

					for j := 0; j < len(flow) && !canDrop; j++ {
						if flow[j].size >= wEnv.size {
							canDrop = true
							dropIdx = j
							dropChID = currChID
							break
						}
					}
				}

				// If we can drop an existing Envelope, drop it and enqueue the incoming
				// Envelope.
				if canDrop {
					chIDStr = strconv.Itoa(int(dropChID))
					chPriority = s.chPriorities[dropChID]
					msgSize = s.buffer[dropChID][dropIdx].size

					// Drop the Envelope from the lower priority flow and update the
					// queue's buffer size.
					s.size -= msgSize
					s.buffer[dropChID] = append(s.buffer[dropChID][:dropIdx], s.buffer[dropChID][dropIdx+1:]...)

					// add the incoming Envelope and update the queue's buffer size
					s.size += wEnv.size
					s.buffer[e.channelID] = append(s.buffer[e.channelID], wEnv)
					s.metrics.PeerQueueMsgSize.With("ch_id", chIDStr).Set(float64(wEnv.size))
				}

				// We either dropped the incoming Envelope or one from an existing
				// lower priority flow.
				s.metrics.PeerQueueDroppedMsgs.With("ch_id", chIDStr).Add(1)
				s.logger.Debug(
					"dropped envelope",
					"ch_id", chIDStr,
					"priority", chPriority,
					"capacity", s.capacity,
					"msg_size", msgSize,
				)
			} else {
				// we have sufficient capacity to enqueue the incoming Envelope
				s.metrics.PeerQueueMsgSize.With("ch_id", chIDStr).Set(float64(wEnv.size))
				s.buffer[e.channelID] = append(s.buffer[e.channelID], wEnv)
				s.size += wEnv.size
			}

		default:
			// perform the WDRR algorithm
			for _, chDesc := range s.chDescs {
				chID := ChannelID(chDesc.ID)

				// only consider non-empty flows
				if len(s.buffer[chID]) > 0 {
					// bump the flow's deficit counter by its quantum
					s.deficits[chID] += s.quanta[chID]

					// grab the flow's current deficit counter and HoQ (wrapped) Envelope
					d := s.deficits[chID]
					we := s.buffer[chID][0]

					// While the flow is non-empty and we can send the current Envelope
					// on the dequeueCh:
					//
					// 1. send the Envelope
					// 2. update the scheduler's shared buffer's size
					// 3. update the flow's deficit
					// 4. remove from the flow's queue
					// 5. grab the next HoQ Envelope and the flow's deficit
					for len(s.buffer[chID]) > 0 && d >= we.size {
						s.metrics.PeerSendBytesTotal.With(
							"chID", fmt.Sprint(chID),
							"peer_id", string(we.envelope.To)).Add(float64(we.size))
						s.dequeueCh <- we.envelope
						s.size -= we.size
						s.deficits[chID] -= we.size
						s.buffer[chID] = s.buffer[chID][1:]

						if len(s.buffer[chID]) > 0 {
							d = s.deficits[chID]
							we = s.buffer[chID][0]
						}
					}
				}

				// reset the flow's deficit to zero if it is empty
				if len(s.buffer[chID]) == 0 {
					s.deficits[chID] = 0
				}
			}
		}
	}
}
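// Worked example of a single WDRR round (hypothetical numbers, not defaults
// from this package): a flow whose ChannelDescriptor has Priority=5 and
// MaxSendBytes=100 gets a quantum of 500 bytes. If the flow's deficit starts
// at 0 and its queue holds messages of 300, 300, and 100 bytes, the round
// above bumps the deficit to 500, dequeues the first 300-byte message
// (deficit 200), and stops because 200 < 300. The flow is still non-empty, so
// it keeps its remaining deficit of 200 for the next round; had it been
// drained, the deficit would have been reset to 0.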