From a6fd04f1be92d5b6f33bff831b0c8415c0852bf7 Mon Sep 17 00:00:00 2001 From: "M. J. Fromberger" Date: Mon, 14 Mar 2022 13:01:00 -0700 Subject: [PATCH] p2p: update polling interval calculation for PEX requests (backport #8106) (#8118) A manual cherry-pick of 89b4321af2fe80b754590e1847921204cae80a82. --- internal/p2p/pex/reactor.go | 109 ++++++++++++++----------------- internal/p2p/pex/reactor_test.go | 2 +- 2 files changed, 49 insertions(+), 62 deletions(-) diff --git a/internal/p2p/pex/reactor.go b/internal/p2p/pex/reactor.go index 8cff2f95b..570832384 100644 --- a/internal/p2p/pex/reactor.go +++ b/internal/p2p/pex/reactor.go @@ -10,7 +10,6 @@ import ( "github.com/tendermint/tendermint/internal/p2p" "github.com/tendermint/tendermint/internal/p2p/conn" "github.com/tendermint/tendermint/libs/log" - tmmath "github.com/tendermint/tendermint/libs/math" "github.com/tendermint/tendermint/libs/service" protop2p "github.com/tendermint/tendermint/proto/tendermint/p2p" "github.com/tendermint/tendermint/types" @@ -95,17 +94,10 @@ type ReactorV2 struct { lastReceivedRequests map[types.NodeID]time.Time // the time when another request will be sent - nextRequestTime time.Time + nextRequestInterval time.Duration - // keep track of how many new peers to existing peers we have received to - // extrapolate the size of the network - newPeers uint32 - totalPeers uint32 - - // discoveryRatio is the inverse ratio of new peers to old peers squared. - // This is multiplied by the minimum duration to calculate how long to wait - // between each request. - discoveryRatio float32 + // the total number of unique peers added + totalPeers int } // NewReactor returns a reference to a new reactor. @@ -159,6 +151,7 @@ func (r *ReactorV2) OnStop() { func (r *ReactorV2) processPexCh() { defer r.pexCh.Close() + r.nextRequestInterval = minReceiveRequestInterval for { select { case <-r.closeCh: @@ -235,6 +228,7 @@ func (r *ReactorV2) handlePexMessage(envelope p2p.Envelope) error { ) } + var numAdded int for _, pexAddress := range msg.Addresses { // no protocol is prefixed so we assume the default (mconn) peerAddress, err := p2p.ParseNodeAddress( @@ -247,11 +241,11 @@ func (r *ReactorV2) handlePexMessage(envelope p2p.Envelope) error { logger.Error("failed to add PEX address", "address", peerAddress, "err", err) } if added { - r.newPeers++ + numAdded++ logger.Debug("added PEX address", "address", peerAddress) } - r.totalPeers++ } + r.calculateNextRequestTime(numAdded) // V2 PEX MESSAGES case *protop2p.PexRequestV2: @@ -289,6 +283,7 @@ func (r *ReactorV2) handlePexMessage(envelope p2p.Envelope) error { ) } + var numAdded int for _, pexAddress := range msg.Addresses { peerAddress, err := p2p.ParseNodeAddress(pexAddress.URL) if err != nil { @@ -299,11 +294,11 @@ func (r *ReactorV2) handlePexMessage(envelope p2p.Envelope) error { logger.Error("failed to add V2 PEX address", "address", peerAddress, "err", err) } if added { - r.newPeers++ + numAdded++ logger.Debug("added V2 PEX address", "address", peerAddress) } - r.totalPeers++ } + r.calculateNextRequestTime(numAdded) default: return fmt.Errorf("received unknown message: %T", msg) @@ -409,7 +404,7 @@ func (r *ReactorV2) processPeerUpdate(peerUpdate p2p.PeerUpdate) { } func (r *ReactorV2) waitUntilNextRequest() <-chan time.Time { - return time.After(time.Until(r.nextRequestTime)) + return time.After(r.nextRequestInterval) } // sendRequestForPeers pops the first peerID off the list and sends the @@ -421,14 +416,12 @@ func (r *ReactorV2) sendRequestForPeers() { defer r.mtx.Unlock() if 
len(r.availablePeers) == 0 { // no peers are available - r.Logger.Debug("no available peers to send request to, waiting...") - r.nextRequestTime = time.Now().Add(noAvailablePeersWaitPeriod) - + r.Logger.Debug("no available peers to send a PEX request to (retrying)") return } - var peerID types.NodeID - // use range to get a random peer. + // Select an arbitrary peer from the available set. + var peerID types.NodeID for peerID = range r.availablePeers { break } @@ -449,55 +442,49 @@ func (r *ReactorV2) sendRequestForPeers() { // remove the peer from the abvailable peers list and mark it in the requestsSent map delete(r.availablePeers, peerID) r.requestsSent[peerID] = struct{}{} - - r.calculateNextRequestTime() - r.Logger.Debug("peer request sent", "next_request_time", r.nextRequestTime) } -// calculateNextRequestTime implements something of a proportional controller -// to estimate how often the reactor should be requesting new peer addresses. -// The dependent variable in this calculation is the ratio of new peers to -// all peers that the reactor receives. The interval is thus calculated as the -// inverse squared. In the beginning, all peers should be new peers. -// We expect this ratio to be near 1 and thus the interval to be as short -// as possible. As the node becomes more familiar with the network the ratio of -// new nodes will plummet to a very small number, meaning the interval expands -// to its upper bound. -// CONTRACT: Must use a write lock as nextRequestTime is updated -func (r *ReactorV2) calculateNextRequestTime() { - // check if the peer store is full. If so then there is no need - // to send peer requests too often +// calculateNextRequestTime selects how long we should wait before attempting +// to send out another request for peer addresses. +// +// This implements a simplified proportional control mechanism to poll more +// often when our knowledge of the network is incomplete, and less often as our +// knowledge grows. To estimate our knowledge of the network, we use the +// fraction of "new" peers (addresses we have not previously seen) to the total +// so far observed. When we first join the network, this fraction will be close +// to 1, meaning most new peers are "new" to us, and as we discover more peers, +// the fraction will go toward zero. +// +// The minimum interval will be minReceiveRequestInterval to ensure we will not +// request from any peer more often than we would allow them to do from us. +func (r *ReactorV2) calculateNextRequestTime(added int) { + r.mtx.Lock() + defer r.mtx.Unlock() + + r.totalPeers += added + + // If the peer store is nearly full, wait the maximum interval. if ratio := r.peerManager.PeerRatio(); ratio >= 0.95 { - r.Logger.Debug("peer manager near full ratio, sleeping...", + r.Logger.Debug("Peer manager is nearly full", "sleep_period", fullCapacityInterval, "ratio", ratio) - r.nextRequestTime = time.Now().Add(fullCapacityInterval) + r.nextRequestInterval = fullCapacityInterval return } - // baseTime represents the shortest interval that we can send peer requests - // in. For example if we have 10 peers and we can't send a message to the - // same peer every 500ms, then we can send a request every 50ms. In practice - // we use a safety margin of 2, ergo 100ms - peers := tmmath.MinInt(len(r.availablePeers), 50) - baseTime := minReceiveRequestInterval - if peers > 0 { - baseTime = minReceiveRequestInterval * 2 / time.Duration(peers) + // If there are no available peers to query, poll less aggressively. 
+ if len(r.availablePeers) == 0 { + r.Logger.Debug("No available peers to send a PEX request", + "sleep_period", noAvailablePeersWaitPeriod) + r.nextRequestInterval = noAvailablePeersWaitPeriod + return } - if r.totalPeers > 0 || r.discoveryRatio == 0 { - // find the ratio of new peers. NOTE: We add 1 to both sides to avoid - // divide by zero problems - ratio := float32(r.totalPeers+1) / float32(r.newPeers+1) - // square the ratio in order to get non linear time intervals - // NOTE: The longest possible interval for a network with 100 or more peers - // where a node is connected to 50 of them is 2 minutes. - r.discoveryRatio = ratio * ratio - r.newPeers = 0 - r.totalPeers = 0 - } - // NOTE: As ratio is always >= 1, discovery ratio is >= 1. Therefore we don't need to worry - // about the next request time being less than the minimum time - r.nextRequestTime = time.Now().Add(baseTime * time.Duration(r.discoveryRatio)) + // Reaching here, there are available peers to query and the peer store + // still has space. Estimate our knowledge of the network from the latest + // update and choose a new interval. + base := float64(minReceiveRequestInterval) / float64(len(r.availablePeers)) + multiplier := float64(r.totalPeers+1) / float64(added+1) // +1 to avert zero division + r.nextRequestInterval = time.Duration(base*multiplier*multiplier) + minReceiveRequestInterval } func (r *ReactorV2) markPeerRequest(peer types.NodeID) error { diff --git a/internal/p2p/pex/reactor_test.go b/internal/p2p/pex/reactor_test.go index cb1cf117d..d00260587 100644 --- a/internal/p2p/pex/reactor_test.go +++ b/internal/p2p/pex/reactor_test.go @@ -91,7 +91,7 @@ func TestReactorSendsRequestsTooOften(t *testing.T) { peerErr := <-r.pexErrCh require.Error(t, peerErr.Err) require.Empty(t, r.pexOutCh) - require.Contains(t, peerErr.Err.Error(), "peer sent a request too close after a prior one") + require.Contains(t, peerErr.Err.Error(), "sent PEX request too soon") require.Equal(t, badNode, peerErr.NodeID) }
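
For a feel of how the interval formula introduced in calculateNextRequestTime behaves, below is a minimal standalone Go sketch. It is not part of the patch: the nextInterval helper is a hypothetical mirror of the patched calculation, the 100ms value for minReceiveRequestInterval is assumed for illustration only (the real constant is defined elsewhere in the pex package), and it assumes at least one available peer, since the reactor handles the empty case separately with noAvailablePeersWaitPeriod.

    // intervalsketch.go: standalone illustration of the new PEX polling-interval
    // formula. Constant value below is illustrative, not the package's actual one.
    package main

    import (
    	"fmt"
    	"time"
    )

    const minReceiveRequestInterval = 100 * time.Millisecond // assumed value

    // nextInterval mirrors the patched calculation: the wait grows with the
    // square of (totalPeers+1)/(added+1), so polling slows down as the share of
    // newly discovered addresses per response shrinks, and it shrinks as more
    // peers become available to query. The trailing addition keeps the result
    // at or above minReceiveRequestInterval. Assumes availablePeers > 0.
    func nextInterval(availablePeers, totalPeers, added int) time.Duration {
    	base := float64(minReceiveRequestInterval) / float64(availablePeers)
    	multiplier := float64(totalPeers+1) / float64(added+1) // +1 avoids division by zero
    	return time.Duration(base*multiplier*multiplier) + minReceiveRequestInterval
    }

    func main() {
    	// Early on: one peer to query, every address in the response is new,
    	// so the interval stays near the minimum.
    	fmt.Println(nextInterval(1, 10, 10))
    	// Later: many known peers, only a couple of new addresses per response,
    	// so the interval grows to minutes.
    	fmt.Println(nextInterval(20, 500, 2))
    }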