You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 
 

287 lines
9.9 KiB

package p2p
import (
"fmt"
"sort"
"strconv"
"github.com/gogo/protobuf/proto"
tmsync "github.com/tendermint/tendermint/internal/libs/sync"
"github.com/tendermint/tendermint/libs/log"
)
// wrappedEnvelope wraps a p2p Envelope with its precomputed size.
type wrappedEnvelope struct {
envelope Envelope
size uint
}
// assert the WDRR scheduler implements the queue interface at compile-time
var _ queue = (*wdrrScheduler)(nil)
// wdrrQueue implements a Weighted Deficit Round Robin (WDRR) scheduling
// algorithm via the queue interface. A WDRR queue is created per peer, where
// the queue will have N number of flows. Each flow corresponds to a p2p Channel,
// so there are n input flows and a single output source, the peer's connection.
//
// The WDRR scheduler contains a shared buffer with a fixed capacity.
//
// Each flow has the following:
// - quantum: The number of bytes that is added to the deficit counter of the
// flow in each round. The flow can send at most quantum bytes at a time. Each
// flow has its own unique quantum, which gives the queue its weighted nature.
// A higher quantum corresponds to a higher weight/priority. The quantum is
// computed as MaxSendBytes * Priority.
// - deficit counter: The number of bytes that the flow is allowed to transmit
// when it is its turn.
//
// See: https://en.wikipedia.org/wiki/Deficit_round_robin
type wdrrScheduler struct {
logger log.Logger
metrics *Metrics
chDescs []ChannelDescriptor
capacity uint
size uint
chPriorities map[ChannelID]uint
buffer map[ChannelID][]wrappedEnvelope
quanta map[ChannelID]uint
deficits map[ChannelID]uint
closer *tmsync.Closer
doneCh *tmsync.Closer
enqueueCh chan Envelope
dequeueCh chan Envelope
}
func newWDRRScheduler(
logger log.Logger,
m *Metrics,
chDescs []ChannelDescriptor,
enqueueBuf, dequeueBuf, capacity uint,
) *wdrrScheduler {
// copy each ChannelDescriptor and sort them by channel priority
chDescsCopy := make([]ChannelDescriptor, len(chDescs))
copy(chDescsCopy, chDescs)
sort.Slice(chDescsCopy, func(i, j int) bool { return chDescsCopy[i].Priority > chDescsCopy[j].Priority })
var (
buffer = make(map[ChannelID][]wrappedEnvelope)
chPriorities = make(map[ChannelID]uint)
quanta = make(map[ChannelID]uint)
deficits = make(map[ChannelID]uint)
)
for _, chDesc := range chDescsCopy {
chID := ChannelID(chDesc.ID)
chPriorities[chID] = uint(chDesc.Priority)
buffer[chID] = make([]wrappedEnvelope, 0)
quanta[chID] = chDesc.MaxSendBytes * uint(chDesc.Priority)
}
return &wdrrScheduler{
logger: logger.With("queue", "wdrr"),
metrics: m,
capacity: capacity,
chPriorities: chPriorities,
chDescs: chDescsCopy,
buffer: buffer,
quanta: quanta,
deficits: deficits,
closer: tmsync.NewCloser(),
doneCh: tmsync.NewCloser(),
enqueueCh: make(chan Envelope, enqueueBuf),
dequeueCh: make(chan Envelope, dequeueBuf),
}
}
// enqueue returns an unbuffered write-only channel which a producer can send on.
func (s *wdrrScheduler) enqueue() chan<- Envelope {
return s.enqueueCh
}
// dequeue returns an unbuffered read-only channel which a consumer can read from.
func (s *wdrrScheduler) dequeue() <-chan Envelope {
return s.dequeueCh
}
func (s *wdrrScheduler) closed() <-chan struct{} {
return s.closer.Done()
}
// close closes the WDRR queue. After this call enqueue() will block, so the
// caller must select on closed() as well to avoid blocking forever. The
// enqueue() and dequeue() along with the internal channels will NOT be closed.
// Note, close() will block until all externally spawned goroutines have exited.
func (s *wdrrScheduler) close() {
s.closer.Close()
<-s.doneCh.Done()
}
// start starts the WDRR queue process in a blocking goroutine. This must be
// called before the queue can start to process and accept Envelopes.
func (s *wdrrScheduler) start() {
go s.process()
}
// process starts a blocking WDRR scheduler process, where we continuously
// evaluate if we need to attempt to enqueue an Envelope or schedule Envelopes
// to be dequeued and subsequently read and sent on the source connection.
// Internally, each p2p Channel maps to a flow, where each flow has a deficit
// and a quantum.
//
// For each Envelope requested to be enqueued, we evaluate if there is sufficient
// capacity in the shared buffer to add the Envelope. If so, it is added.
// Otherwise, we evaluate all flows of lower priority where we attempt find an
// existing Envelope in the shared buffer of sufficient size that can be dropped
// in place of the incoming Envelope. If there is no such Envelope that can be
// dropped, then the incoming Envelope is dropped.
//
// When there is nothing to be enqueued, we perform the WDRR algorithm and
// determine which Envelopes can be dequeued. For each Envelope that can be
// dequeued, it is sent on the dequeueCh. Specifically, for each flow, if it is
// non-empty, its deficit counter is incremented by its quantum value. Then, the
// value of the deficit counter is a maximal amount of bytes that can be sent at
// this round. If the deficit counter is greater than the Envelopes's message
// size at the head of the queue (HoQ), this envelope can be sent and the value
// of the counter is decremented by the message's size. Then, the size of the
// next Envelopes's message is compared to the counter value, etc. Once the flow
// is empty or the value of the counter is insufficient, the scheduler will skip
// to the next flow. If the flow is empty, the value of the deficit counter is
// reset to 0.
//
// XXX/TODO: Evaluate the single goroutine scheduler mechanism. In other words,
// evaluate the effectiveness and performance of having a single goroutine
// perform handling both enqueueing and dequeueing logic. Specifically, there
// is potentially contention between reading off of enqueueCh and trying to
// enqueue while also attempting to perform the WDRR algorithm and find the next
// set of Envelope(s) to send on the dequeueCh. Alternatively, we could consider
// separate scheduling goroutines, but then that requires the use of mutexes and
// possibly a degrading performance.
func (s *wdrrScheduler) process() {
defer s.doneCh.Close()
for {
select {
case <-s.closer.Done():
return
case e := <-s.enqueueCh:
// attempt to enqueue the incoming Envelope
chIDStr := strconv.Itoa(int(e.channelID))
wEnv := wrappedEnvelope{envelope: e, size: uint(proto.Size(e.Message))}
msgSize := wEnv.size
s.metrics.PeerPendingSendBytes.With("peer_id", string(e.To)).Add(float64(msgSize))
// If we're at capacity, we need to either drop the incoming Envelope or
// an Envelope from a lower priority flow. Otherwise, we add the (wrapped)
// envelope to the flow's queue.
if s.size+wEnv.size > s.capacity {
chPriority := s.chPriorities[e.channelID]
var (
canDrop bool
dropIdx int
dropChID ChannelID
)
// Evaluate all lower priority flows and determine if there exists an
// Envelope that is of equal or greater size that we can drop in favor
// of the incoming Envelope.
for i := len(s.chDescs) - 1; i >= 0 && uint(s.chDescs[i].Priority) < chPriority && !canDrop; i-- {
currChID := ChannelID(s.chDescs[i].ID)
flow := s.buffer[currChID]
for j := 0; j < len(flow) && !canDrop; j++ {
if flow[j].size >= wEnv.size {
canDrop = true
dropIdx = j
dropChID = currChID
break
}
}
}
// If we can drop an existing Envelope, drop it and enqueue the incoming
// Envelope.
if canDrop {
chIDStr = strconv.Itoa(int(dropChID))
chPriority = s.chPriorities[dropChID]
msgSize = s.buffer[dropChID][dropIdx].size
// Drop Envelope for the lower priority flow and update the queue's
// buffer size
s.size -= msgSize
s.buffer[dropChID] = append(s.buffer[dropChID][:dropIdx], s.buffer[dropChID][dropIdx+1:]...)
// add the incoming Envelope and update queue's buffer size
s.size += wEnv.size
s.buffer[e.channelID] = append(s.buffer[e.channelID], wEnv)
s.metrics.PeerQueueMsgSize.With("ch_id", chIDStr).Set(float64(wEnv.size))
}
// We either dropped the incoming Enevelope or one from an existing
// lower priority flow.
s.metrics.PeerQueueDroppedMsgs.With("ch_id", chIDStr).Add(1)
s.logger.Debug(
"dropped envelope",
"ch_id", chIDStr,
"priority", chPriority,
"capacity", s.capacity,
"msg_size", msgSize,
)
} else {
// we have sufficient capacity to enqueue the incoming Envelope
s.metrics.PeerQueueMsgSize.With("ch_id", chIDStr).Set(float64(wEnv.size))
s.buffer[e.channelID] = append(s.buffer[e.channelID], wEnv)
s.size += wEnv.size
}
default:
// perform the WDRR algorithm
for _, chDesc := range s.chDescs {
chID := ChannelID(chDesc.ID)
// only consider non-empty flows
if len(s.buffer[chID]) > 0 {
// bump flow's quantum
s.deficits[chID] += s.quanta[chID]
// grab the flow's current deficit counter and HoQ (wrapped) Envelope
d := s.deficits[chID]
we := s.buffer[chID][0]
// While the flow is non-empty and we can send the current Envelope
// on the dequeueCh:
//
// 1. send the Envelope
// 2. update the scheduler's shared buffer's size
// 3. update the flow's deficit
// 4. remove from the flow's queue
// 5. grab the next HoQ Envelope and flow's deficit
for len(s.buffer[chID]) > 0 && d >= we.size {
s.metrics.PeerSendBytesTotal.With(
"chID", fmt.Sprint(chID),
"peer_id", string(we.envelope.To)).Add(float64(we.size))
s.dequeueCh <- we.envelope
s.size -= we.size
s.deficits[chID] -= we.size
s.buffer[chID] = s.buffer[chID][1:]
if len(s.buffer[chID]) > 0 {
d = s.deficits[chID]
we = s.buffer[chID][0]
}
}
}
// reset the flow's deficit to zero if it is empty
if len(s.buffer[chID]) == 0 {
s.deficits[chID] = 0
}
}
}
}
}