statesync: increase chunk priority and robustness (#6582)

3 years ago · d515bbcf1d
--- a/CHANGELOG_PENDING.md
+++ b/CHANGELOG_PENDING.md
@ -25,8 +25,9 @@ Friendly reminder, we have a [bug bounty program](https://hackerone.com/tendermi
 ### IMPROVEMENTS

 - [statesync] \#6566 Allow state sync fetchers and request timeout to be configurable. (@alexanderbez)
 - [statesync] \#6378 Retry requests for snapshots and add a minimum discovery time (5s) for new snapshots.
 - [statesync] \#6378 Retry requests for snapshots and add a minimum discovery time (5s) for new snapshots. (@tychoish)
 - [statesync] \#6582 Increase chunk priority and add multiple retry chunk requests (@cmwaters)

 ### BUG FIXES

 - [evidence] \#6375 Fix bug with inconsistent LightClientAttackEvidence hashing (cmwaters)
 - [evidence] \#6375 Fix bug with inconsistent LightClientAttackEvidence hashing (@cmwaters)
--- a/config/config.go
+++ b/config/config.go
@ -789,8 +789,8 @@ func (cfg *StateSyncConfig) ValidateBasic() error {
 			return fmt.Errorf("invalid trusted_hash: %w", err)
 		}

 		if cfg.ChunkRequestTimeout < time.Second {
 			return errors.New("chunk_request_timeout must be least a one second")
 		if cfg.ChunkRequestTimeout < 5*time.Second {
 			return errors.New("chunk_request_timeout must be at least 5 seconds")
 		}

 		if cfg.ChunkFetchers <= 0 {
--- a/statesync/reactor.go
+++ b/statesync/reactor.go
@ -69,8 +69,8 @@ func (r *Reactor) GetChannels() []*p2p.ChannelDescriptor {
 		},
 		{
 			ID:                  ChunkChannel,
 			Priority:            1,
 			SendQueueCapacity:   4,
 			Priority:            3,
 			SendQueueCapacity:   10,
 			RecvMessageCapacity: chunkMsgSize,
 		},
 	}
--- a/statesync/syncer.go
+++ b/statesync/syncer.go
@ -50,13 +50,14 @@ var (
 // sync all snapshots in the pool (pausing to discover new ones), or Sync() to sync a specific
 // snapshot. Snapshots and chunks are fed via AddSnapshot() and AddChunk() as appropriate.
 type syncer struct {
 	cfg           config.StateSyncConfig
 	logger        log.Logger
 	stateProvider StateProvider
 	conn          proxy.AppConnSnapshot
 	connQuery     proxy.AppConnQuery
 	snapshots     *snapshotPool
 	tempDir       string
 	chunkFetchers int32
 	retryTimeout  time.Duration

 	mtx    tmsync.RWMutex
 	chunks *chunkQueue
@ -73,13 +74,14 @@ func newSyncer(
 ) *syncer {

 	return &syncer{
 		cfg:           cfg,
 		logger:        logger,
 		stateProvider: stateProvider,
 		conn:          conn,
 		connQuery:     connQuery,
 		snapshots:     newSnapshotPool(stateProvider),
 		tempDir:       tempDir,
 		chunkFetchers: cfg.ChunkFetchers,
 		retryTimeout:  cfg.ChunkRequestTimeout,
 	}
 }

@ -250,7 +252,7 @@ func (s *syncer) Sync(snapshot *snapshot, chunks *chunkQueue) (sm.State, *types.
 	// Spawn chunk fetchers. They will terminate when the chunk queue is closed or context cancelled.
 	ctx, cancel := context.WithCancel(context.Background())
 	defer cancel()
 	for i := int32(0); i < s.cfg.ChunkFetchers; i++ {
 	for i := int32(0); i < s.chunkFetchers; i++ {
 		go s.fetchChunks(ctx, snapshot, chunks)
 	}

@ -383,36 +385,45 @@ func (s *syncer) applyChunks(chunks *chunkQueue) error {
 // fetchChunks requests chunks from peers, receiving allocations from the chunk queue. Chunks
 // will be received from the reactor via syncer.AddChunks() to chunkQueue.Add().
 func (s *syncer) fetchChunks(ctx context.Context, snapshot *snapshot, chunks *chunkQueue) {
 	var (
 		next  = true
 		index uint32
 		err   error
 	)

 	for {
 		index, err := chunks.Allocate()
 		if err == errDone {
 			// Keep checking until the context is cancelled (restore is done), in case any
 			// chunks need to be refetched.
 			select {
 			case <-ctx.Done():
 		if next {
 			index, err = chunks.Allocate()
 			if errors.Is(err, errDone) {
 				// Keep checking until the context is canceled (restore is done), in case any
 				// chunks need to be refetched.
 				select {
 				case <-ctx.Done():
 					return
 				default:
 				}
 				time.Sleep(2 * time.Second)
 				continue
 			}
 			if err != nil {
 				s.logger.Error("Failed to allocate chunk from queue", "err", err)
 				return
 			default:
 			}
 			time.Sleep(2 * time.Second)
 			continue
 		}
 		if err != nil {
 			s.logger.Error("Failed to allocate chunk from queue", "err", err)
 			return
 		}
 		s.logger.Info("Fetching snapshot chunk", "height", snapshot.Height,
 			"format", snapshot.Format, "chunk", index, "total", chunks.Size())

 		ticker := time.NewTicker(s.cfg.ChunkRequestTimeout)
 		ticker := time.NewTicker(s.retryTimeout)
 		defer ticker.Stop()

 		s.requestChunk(snapshot, index)

 		select {
 		case <-chunks.WaitFor(index):
 			next = true

 		case <-ticker.C:
 			s.requestChunk(snapshot, index)
 			next = false

 		case <-ctx.Done():
 			return
--- a/test/e2e/generator/generate.go
+++ b/test/e2e/generator/generate.go
@ -82,10 +82,10 @@ func generateTestnet(r *rand.Rand, opt map[string]interface{}) (e2e.Manifest, er
 		numValidators = 4
 	case "large":
 		// FIXME Networks are kept small since large ones use too much CPU.
 		numSeeds = r.Intn(3)
 		numSeeds = r.Intn(2)
 		numLightClients = r.Intn(3)
 		numValidators = 4 + r.Intn(7)
 		numFulls = r.Intn(5)
 		numValidators = 4 + r.Intn(4)
 		numFulls = r.Intn(4)
 	default:
 		return manifest, fmt.Errorf("unknown topology %q", opt["topology"])
 	}
--- a/test/e2e/runner/load.go
+++ b/test/e2e/runner/load.go
@ -81,7 +81,7 @@ func loadGenerate(ctx context.Context, chTx chan<- types.Tx, multiplier int) {

 		select {
 		case chTx <- tx:
 			time.Sleep(time.Duration(100/multiplier) * time.Millisecond)
 			time.Sleep(time.Second / time.Duration(multiplier))

 		case <-ctx.Done():
 			close(chTx)
--- a/test/e2e/runner/rpc.go
+++ b/test/e2e/runner/rpc.go
@ -84,7 +84,7 @@ func waitForNode(node *e2e.Node, height int64, timeout time.Duration) (*rpctypes
 			return status, nil
 		}

 		time.Sleep(200 * time.Millisecond)
 		time.Sleep(300 * time.Millisecond)
 	}
 }