Browse Source

statesync: increase chunk priority and robustness (#6582)

pull/6597/head
Callum Waters 3 years ago
committed by GitHub
parent
commit
d515bbcf1d
No known key found for this signature in database GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 41 additions and 29 deletions
  1. +3
    -2
      CHANGELOG_PENDING.md
  2. +2
    -2
      config/config.go
  3. +2
    -2
      statesync/reactor.go
  4. +29
    -18
      statesync/syncer.go
  5. +3
    -3
      test/e2e/generator/generate.go
  6. +1
    -1
      test/e2e/runner/load.go
  7. +1
    -1
      test/e2e/runner/rpc.go

+ 3
- 2
CHANGELOG_PENDING.md View File

@ -25,8 +25,9 @@ Friendly reminder, we have a [bug bounty program](https://hackerone.com/tendermi
### IMPROVEMENTS
- [statesync] \#6566 Allow state sync fetchers and request timeout to be configurable. (@alexanderbez)
- [statesync] \#6378 Retry requests for snapshots and add a minimum discovery time (5s) for new snapshots.
- [statesync] \#6378 Retry requests for snapshots and add a minimum discovery time (5s) for new snapshots. (@tychoish)
- [statesync] \#6582 Increase chunk priority and retry chunk requests multiple times (@cmwaters)
### BUG FIXES
- [evidence] \#6375 Fix bug with inconsistent LightClientAttackEvidence hashing (cmwaters)
- [evidence] \#6375 Fix bug with inconsistent LightClientAttackEvidence hashing (@cmwaters)

+ 2
- 2
config/config.go View File

@ -789,8 +789,8 @@ func (cfg *StateSyncConfig) ValidateBasic() error {
return fmt.Errorf("invalid trusted_hash: %w", err)
}
if cfg.ChunkRequestTimeout < time.Second {
return errors.New("chunk_request_timeout must be least a one second")
if cfg.ChunkRequestTimeout < 5*time.Second {
return errors.New("chunk_request_timeout must be at least 5 seconds")
}
if cfg.ChunkFetchers <= 0 {


+ 2
- 2
statesync/reactor.go View File

@ -69,8 +69,8 @@ func (r *Reactor) GetChannels() []*p2p.ChannelDescriptor {
},
{
ID: ChunkChannel,
Priority: 1,
SendQueueCapacity: 4,
Priority: 3,
SendQueueCapacity: 10,
RecvMessageCapacity: chunkMsgSize,
},
}


+ 29
- 18
statesync/syncer.go View File

@ -50,13 +50,14 @@ var (
// sync all snapshots in the pool (pausing to discover new ones), or Sync() to sync a specific
// snapshot. Snapshots and chunks are fed via AddSnapshot() and AddChunk() as appropriate.
type syncer struct {
cfg config.StateSyncConfig
logger log.Logger
stateProvider StateProvider
conn proxy.AppConnSnapshot
connQuery proxy.AppConnQuery
snapshots *snapshotPool
tempDir string
chunkFetchers int32
retryTimeout time.Duration
mtx tmsync.RWMutex
chunks *chunkQueue
@ -73,13 +74,14 @@ func newSyncer(
) *syncer {
return &syncer{
cfg: cfg,
logger: logger,
stateProvider: stateProvider,
conn: conn,
connQuery: connQuery,
snapshots: newSnapshotPool(stateProvider),
tempDir: tempDir,
chunkFetchers: cfg.ChunkFetchers,
retryTimeout: cfg.ChunkRequestTimeout,
}
}
@ -250,7 +252,7 @@ func (s *syncer) Sync(snapshot *snapshot, chunks *chunkQueue) (sm.State, *types.
// Spawn chunk fetchers. They will terminate when the chunk queue is closed or context cancelled.
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
for i := int32(0); i < s.cfg.ChunkFetchers; i++ {
for i := int32(0); i < s.chunkFetchers; i++ {
go s.fetchChunks(ctx, snapshot, chunks)
}
@ -383,36 +385,45 @@ func (s *syncer) applyChunks(chunks *chunkQueue) error {
// fetchChunks requests chunks from peers, receiving allocations from the chunk queue. Chunks
// will be received from the reactor via syncer.AddChunk() and passed to chunkQueue.Add().
func (s *syncer) fetchChunks(ctx context.Context, snapshot *snapshot, chunks *chunkQueue) {
var (
next = true
index uint32
err error
)
for {
index, err := chunks.Allocate()
if err == errDone {
// Keep checking until the context is cancelled (restore is done), in case any
// chunks need to be refetched.
select {
case <-ctx.Done():
if next {
index, err = chunks.Allocate()
if errors.Is(err, errDone) {
// Keep checking until the context is canceled (restore is done), in case any
// chunks need to be refetched.
select {
case <-ctx.Done():
return
default:
}
time.Sleep(2 * time.Second)
continue
}
if err != nil {
s.logger.Error("Failed to allocate chunk from queue", "err", err)
return
default:
}
time.Sleep(2 * time.Second)
continue
}
if err != nil {
s.logger.Error("Failed to allocate chunk from queue", "err", err)
return
}
s.logger.Info("Fetching snapshot chunk", "height", snapshot.Height,
"format", snapshot.Format, "chunk", index, "total", chunks.Size())
ticker := time.NewTicker(s.cfg.ChunkRequestTimeout)
ticker := time.NewTicker(s.retryTimeout)
defer ticker.Stop()
s.requestChunk(snapshot, index)
select {
case <-chunks.WaitFor(index):
next = true
case <-ticker.C:
s.requestChunk(snapshot, index)
next = false
case <-ctx.Done():
return


+ 3
- 3
test/e2e/generator/generate.go View File

@ -82,10 +82,10 @@ func generateTestnet(r *rand.Rand, opt map[string]interface{}) (e2e.Manifest, er
numValidators = 4
case "large":
// FIXME Networks are kept small since large ones use too much CPU.
numSeeds = r.Intn(3)
numSeeds = r.Intn(2)
numLightClients = r.Intn(3)
numValidators = 4 + r.Intn(7)
numFulls = r.Intn(5)
numValidators = 4 + r.Intn(4)
numFulls = r.Intn(4)
default:
return manifest, fmt.Errorf("unknown topology %q", opt["topology"])
}


+ 1
- 1
test/e2e/runner/load.go View File

@ -81,7 +81,7 @@ func loadGenerate(ctx context.Context, chTx chan<- types.Tx, multiplier int) {
select {
case chTx <- tx:
time.Sleep(time.Duration(100/multiplier) * time.Millisecond)
time.Sleep(time.Second / time.Duration(multiplier))
case <-ctx.Done():
close(chTx)


+ 1
- 1
test/e2e/runner/rpc.go View File

@ -84,7 +84,7 @@ func waitForNode(node *e2e.Node, height int64, timeout time.Duration) (*rpctypes
return status, nil
}
time.Sleep(200 * time.Millisecond)
time.Sleep(300 * time.Millisecond)
}
}


Loading…
Cancel
Save