From 77deb710fbf2ab602db19bd55522a79116c50999 Mon Sep 17 00:00:00 2001
From: Anton Kaliaev
Date: Mon, 21 Dec 2020 19:17:45 +0400
Subject: [PATCH] mempool: disable MaxBatchBytes (#5800)

@p4u from vocdoni.io reported that the mempool might behave incorrectly under a
high load. The consequences can range from pauses between blocks to the peers
disconnecting from this node.

My current theory is that the flowrate lib we're using for flow control (we
multiplex over a single TCP connection) was not designed w/ large blobs
(1MB batch of txs) in mind.

I've tried decreasing the Mempool reactor priority, but that did not have any
visible effect. What actually worked is adding a time.Sleep into
mempool.Reactor#broadcastTxRoutine after each successful send == manual flow
control of sorts.

As a temporary remedy (until the mempool package is refactored), the
max-batch-bytes option was disabled. Transactions will be sent one by one
without batching.

Closes #5796
---
 CHANGELOG_PENDING.md        |  7 ++++++
 config/config.go            | 16 ++++--------
 config/toml.go              |  1 +
 docs/nodes/configuration.md |  5 ++--
 mempool/reactor.go          | 50 +++++++++----------------------
 mempool/reactor_test.go     | 15 ++++-------
 6 files changed, 33 insertions(+), 61 deletions(-)

diff --git a/CHANGELOG_PENDING.md b/CHANGELOG_PENDING.md
index 28c74e4ed..d0980f1d9 100644
--- a/CHANGELOG_PENDING.md
+++ b/CHANGELOG_PENDING.md
@@ -4,6 +4,12 @@
 
 Special thanks to external contributors on this release:
 
+@p4u from vocdoni.io reported that the mempool might behave incorrectly under a
+high load. The consequences can range from pauses between blocks to the peers
+disconnecting from this node. As a temporary remedy (until the mempool package
+is refactored), the `max-batch-bytes` was disabled. Transactions will be sent
+one by one without batching.
+
 Friendly reminder, we have a [bug bounty program](https://hackerone.com/tendermint).
 
 ### BREAKING CHANGES
@@ -53,3 +59,4 @@ Friendly reminder, we have a [bug bounty program](https://hackerone.com/tendermi
 - [blockchain/v1] [\#5701](https://github.com/tendermint/tendermint/pull/5701) Handle peers without blocks (@melekes)
 - [crypto] \#5707 Fix infinite recursion in string formatting of Secp256k1 keys (@erikgrinaker)
 - [blockchain/v1] \#5711 Fix deadlock (@melekes)
+- [mempool] \#5800 Disable `max-batch-bytes` (@melekes)
diff --git a/config/config.go b/config/config.go
index b9b338933..fd8b6669d 100644
--- a/config/config.go
+++ b/config/config.go
@@ -645,6 +645,7 @@ type MempoolConfig struct {
 	MaxTxBytes int `mapstructure:"max-tx-bytes"`
 	// Maximum size of a batch of transactions to send to a peer
 	// Including space needed by encoding (one varint per transaction).
+	// XXX: Unused due to https://github.com/tendermint/tendermint/issues/5796
 	MaxBatchBytes int `mapstructure:"max-batch-bytes"`
 }
 
@@ -656,11 +657,10 @@ func DefaultMempoolConfig() *MempoolConfig {
 		WalPath:   "",
 		// Each signature verification takes .5ms, Size reduced until we implement
 		// ABCI Recheck
-		Size:          5000,
-		MaxTxsBytes:   1024 * 1024 * 1024, // 1GB
-		CacheSize:     10000,
-		MaxTxBytes:    1024 * 1024,      // 1MB
-		MaxBatchBytes: 10 * 1024 * 1024, // 10MB
+		Size:        5000,
+		MaxTxsBytes: 1024 * 1024 * 1024, // 1GB
+		CacheSize:   10000,
+		MaxTxBytes:  1024 * 1024, // 1MB
 	}
 }
 
@@ -696,12 +696,6 @@ func (cfg *MempoolConfig) ValidateBasic() error {
 	if cfg.MaxTxBytes < 0 {
 		return errors.New("max-tx-bytes can't be negative")
 	}
-	if cfg.MaxBatchBytes < 0 {
-		return errors.New("max-batch-bytes can't be negative")
-	}
-	if cfg.MaxBatchBytes <= cfg.MaxTxBytes {
-		return errors.New("max-batch-bytes can't be less or equal to max-tx-bytes")
-	}
 	return nil
 }
 
diff --git a/config/toml.go b/config/toml.go
index beaa7283e..ed29ab46c 100644
--- a/config/toml.go
+++ b/config/toml.go
@@ -341,6 +341,7 @@ max-tx-bytes = {{ .Mempool.MaxTxBytes }}
 
 # Maximum size of a batch of transactions to send to a peer
 # Including space needed by encoding (one varint per transaction).
+# XXX: Unused due to https://github.com/tendermint/tendermint/issues/5796
 max-batch-bytes = {{ .Mempool.MaxBatchBytes }}
 
 #######################################################
diff --git a/docs/nodes/configuration.md b/docs/nodes/configuration.md
index b3de15c03..7119c866e 100644
--- a/docs/nodes/configuration.md
+++ b/docs/nodes/configuration.md
@@ -289,7 +289,8 @@ max-tx-bytes = 1048576
 
 # Maximum size of a batch of transactions to send to a peer
 # Including space needed by encoding (one varint per transaction).
-max-batch-bytes = 10485760
+# XXX: Unused due to https://github.com/tendermint/tendermint/issues/5796
+max-batch-bytes = 0
 
 #######################################################
 ### State Sync Configuration Options ###
@@ -493,4 +494,4 @@ This section will cover settings within the p2p section of the `config.toml`.
 - `unconditional-peer-ids` = is similar to `persistent-peers` except that these peers will be connected to even if you are already connected to the maximum number of peers. This can be a validator node ID on your sentry node.
 - `pex` = turns the peer exchange reactor on or off. Validator node will want the `pex` turned off so it would not begin gossiping to unknown peers on the network. PeX can also be turned off for statically configured networks with fixed network connectivity. For full nodes on open, dynamic networks, it should be turned on.
 - `seed-mode` = is used for when node operators want to run their node as a seed node. Seed node's run a variation of the PeX protocol that disconnects from peers after sending them a list of peers to connect to. To minimize the servers usage, it is recommended to set the mempool's size to 0.
-- `private-peer-ids` = is a comma separated list of node ids that you would not like exposed to other peers (ie. you will not tell other peers about the private-peer-ids). This can be filled with a validators node id.
+- `private-peer-ids` = is a comma separated list of node ids that you would not like exposed to other peers (ie. you will not tell other peers about the private-peer-ids). This can be filled with a validators node id.
\ No newline at end of file
diff --git a/mempool/reactor.go b/mempool/reactor.go
index 6b2b56f83..da5d330f0 100644
--- a/mempool/reactor.go
+++ b/mempool/reactor.go
@@ -134,12 +134,18 @@ func (memR *Reactor) OnStart() error {
 // GetChannels implements Reactor by returning the list of channels for this
 // reactor.
 func (memR *Reactor) GetChannels() []*p2p.ChannelDescriptor {
-	maxMsgSize := memR.config.MaxBatchBytes
+	largestTx := make([]byte, memR.config.MaxTxBytes)
+	batchMsg := protomem.Message{
+		Sum: &protomem.Message_Txs{
+			Txs: &protomem.Txs{Txs: [][]byte{largestTx}},
+		},
+	}
+
 	return []*p2p.ChannelDescriptor{
 		{
 			ID:                  MempoolChannel,
 			Priority:            5,
-			RecvMessageCapacity: maxMsgSize,
+			RecvMessageCapacity: batchMsg.Size(),
 		},
 	}
 }
@@ -234,20 +240,19 @@ func (memR *Reactor) broadcastTxRoutine(peer p2p.Peer) {
 			continue
 		}
 
-		txs := memR.txs(next, peerID, peerState.GetHeight()) // WARNING: mutates next!
+		// NOTE: Transaction batching was disabled due to
+		// https://github.com/tendermint/tendermint/issues/5796
 
-		// send txs
-		if len(txs) > 0 {
+		if _, ok := memTx.senders.Load(peerID); !ok {
 			msg := protomem.Message{
 				Sum: &protomem.Message_Txs{
-					Txs: &protomem.Txs{Txs: txs},
+					Txs: &protomem.Txs{Txs: [][]byte{memTx.tx}},
 				},
 			}
 			bz, err := msg.Marshal()
 			if err != nil {
 				panic(err)
 			}
-			memR.Logger.Debug("Sending N txs to peer", "N", len(txs), "peer", peer)
 			success := peer.Send(MempoolChannel, bz)
 			if !success {
 				time.Sleep(peerCatchupSleepIntervalMS * time.Millisecond)
@@ -267,37 +272,6 @@ func (memR *Reactor) broadcastTxRoutine(peer p2p.Peer) {
 	}
 }
 
-// txs iterates over the transaction list and builds a batch of txs. next is
-// included.
-// WARNING: mutates next!
-func (memR *Reactor) txs(next *clist.CElement, peerID uint16, peerHeight int64) [][]byte {
-	batch := make([][]byte, 0)
-
-	for {
-		memTx := next.Value.(*mempoolTx)
-
-		if _, ok := memTx.senders.Load(peerID); !ok {
-			// If current batch + this tx size is greater than max => return.
-			batchMsg := protomem.Message{
-				Sum: &protomem.Message_Txs{
-					Txs: &protomem.Txs{Txs: append(batch, memTx.tx)},
-				},
-			}
-			if batchMsg.Size() > memR.config.MaxBatchBytes {
-				return batch
-			}
-
-			batch = append(batch, memTx.tx)
-		}
-
-		n := next.Next()
-		if n == nil {
-			return batch
-		}
-		next = n
-	}
-}
-
 //-----------------------------------------------------------------------------
 // Messages
 
diff --git a/mempool/reactor_test.go b/mempool/reactor_test.go
index d9e67d166..bc51bfd9b 100644
--- a/mempool/reactor_test.go
+++ b/mempool/reactor_test.go
@@ -149,9 +149,8 @@ func TestReactorNoBroadcastToSender(t *testing.T) {
 	ensureNoTxs(t, reactors[peerID], 100*time.Millisecond)
 }
 
-func TestReactor_MaxBatchBytes(t *testing.T) {
+func TestReactor_MaxTxBytes(t *testing.T) {
 	config := cfg.TestConfig()
-	config.Mempool.MaxBatchBytes = 1024
 
 	const N = 2
 	reactors := makeAndConnectReactors(config, N)
@@ -168,9 +167,9 @@ func TestReactor_MaxBatchBytes(t *testing.T) {
 		}
 	}
 
-	// Broadcast a tx, which has the max size (minus proto overhead)
+	// Broadcast a tx, which has the max size
 	// => ensure it's received by the second reactor.
-	tx1 := tmrand.Bytes(1018)
+	tx1 := tmrand.Bytes(config.Mempool.MaxTxBytes)
 	err := reactors[0].mempool.CheckTx(tx1, nil, TxInfo{SenderID: UnknownPeerID})
 	require.NoError(t, err)
 	waitForTxsOnReactors(t, []types.Tx{tx1}, reactors)
@@ -180,13 +179,9 @@ func TestReactor_MaxBatchBytes(t *testing.T) {
 
 	// Broadcast a tx, which is beyond the max size
 	// => ensure it's not sent
-	tx2 := tmrand.Bytes(1020)
+	tx2 := tmrand.Bytes(config.Mempool.MaxTxBytes + 1)
 	err = reactors[0].mempool.CheckTx(tx2, nil, TxInfo{SenderID: UnknownPeerID})
-	require.NoError(t, err)
-	ensureNoTxs(t, reactors[1], 100*time.Millisecond)
-	// => ensure the second reactor did not disconnect from us
-	out, in, _ := reactors[1].Switch.NumPeers()
-	assert.Equal(t, 1, out+in)
+	require.Error(t, err)
 }
 
 func TestBroadcastTxForPeerStopsWhenPeerStops(t *testing.T) {
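
A minimal, self-contained Go sketch of the behaviour the commit message describes: transactions go to a peer one at a time, and the sender backs off with a short sleep whenever the peer cannot accept more data, instead of shipping a large batch. This is not the reactor code from the patch; fakePeer, broadcastTxs and peerCatchupSleep are illustrative stand-ins for p2p.Peer, Reactor#broadcastTxRoutine and peerCatchupSleepIntervalMS.

package main

import (
	"fmt"
	"time"
)

// Illustrative stand-in for peerCatchupSleepIntervalMS in the real reactor.
const peerCatchupSleep = 100 * time.Millisecond

// fakePeer models only the part of a peer this loop needs: a send that fails
// when the peer's outbound queue is saturated.
type fakePeer struct {
	sendQueue chan []byte
}

func (p *fakePeer) Send(msg []byte) bool {
	select {
	case p.sendQueue <- msg:
		return true
	default:
		return false // queue full; the caller backs off and retries
	}
}

// broadcastTxs sends each tx individually, sleeping between attempts whenever
// the peer cannot accept more data -- manual flow control instead of batching.
func broadcastTxs(peer *fakePeer, txs [][]byte) {
	for _, tx := range txs {
		for !peer.Send(tx) {
			time.Sleep(peerCatchupSleep)
		}
	}
}

func main() {
	peer := &fakePeer{sendQueue: make(chan []byte, 2)}
	done := make(chan struct{})

	// Drain the queue slowly so the broadcast loop has to back off sometimes.
	go func() {
		defer close(done)
		for i := 0; i < 5; i++ {
			time.Sleep(50 * time.Millisecond)
			msg := <-peer.sendQueue
			fmt.Printf("delivered tx of %d bytes\n", len(msg))
		}
	}()

	txs := [][]byte{
		make([]byte, 100),
		make([]byte, 200),
		make([]byte, 300),
		make([]byte, 400),
		make([]byte, 500),
	}
	broadcastTxs(peer, txs)
	<-done
}

The intent, per the commit message above, is that with one tx per message a full send queue only ever delays a single small message, rather than a 1MB blob hitting the multiplexed connection at once.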
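
A rough back-of-the-envelope companion to the GetChannels change, where RecvMessageCapacity is now derived from a Message wrapping a single MaxTxBytes-sized transaction (via batchMsg.Size()) rather than from max-batch-bytes. The helper below is not from the repo; it only mirrors the idea that protobuf adds a field tag plus a length varint at each level of nesting, assuming field numbers small enough for one-byte tags.

package main

import "fmt"

// varintSize returns how many bytes protobuf needs to encode v as a varint.
func varintSize(v uint64) int {
	n := 1
	for v >= 0x80 {
		v >>= 7
		n++
	}
	return n
}

// framedSize is the wire size of a length-delimited proto field holding
// payload bytes: a 1-byte tag, a length varint, then the payload itself.
func framedSize(payload int) int {
	return 1 + varintSize(uint64(payload)) + payload
}

func main() {
	const maxTxBytes = 1024 * 1024 // mempool default max-tx-bytes (1MB)

	// One tx inside a Txs message, which itself sits inside the Message oneof:
	// two levels of tag + length framing around the raw tx bytes. The patch
	// lets batchMsg.Size() compute the exact figure; this only mirrors it.
	txsMsg := framedSize(maxTxBytes)
	wrapped := framedSize(txsMsg)

	fmt.Printf("tx payload: %d bytes\n", maxTxBytes)
	fmt.Printf("with proto framing: %d bytes (overhead %d)\n", wrapped, wrapped-maxTxBytes)
}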