You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

478 lines
15 KiB

  1. package statesync
  2. import (
  3. "bytes"
  4. "context"
  5. "errors"
  6. "fmt"
  7. "time"
  8. abci "github.com/tendermint/tendermint/abci/types"
  9. "github.com/tendermint/tendermint/config"
  10. "github.com/tendermint/tendermint/libs/log"
  11. tmsync "github.com/tendermint/tendermint/libs/sync"
  12. "github.com/tendermint/tendermint/p2p"
  13. ssproto "github.com/tendermint/tendermint/proto/tendermint/statesync"
  14. "github.com/tendermint/tendermint/proxy"
  15. sm "github.com/tendermint/tendermint/state"
  16. "github.com/tendermint/tendermint/types"
  17. )
const (
	// chunkTimeout is the timeout while waiting for the next chunk from the chunk queue.
	chunkTimeout = 2 * time.Minute
	// minimumDiscoveryTime is the lowest allowable time for a
	// SyncAny discovery time. SyncAny replaces any non-zero discovery time
	// shorter than this with 5*minimumDiscoveryTime.
	minimumDiscoveryTime = 5 * time.Second
)
var (
	// errAbort is returned by Sync() when snapshot restoration is aborted, i.e. the app answered
	// ABORT to a snapshot offer or to an applied chunk. SyncAny() passes it on to its caller.
	errAbort = errors.New("state sync aborted")
	// errRetrySnapshot is returned by Sync() when the snapshot should be retried, i.e. the app
	// answered RETRY_SNAPSHOT to an applied chunk. SyncAny() then restores the same snapshot again.
	errRetrySnapshot = errors.New("retry snapshot")
	// errRejectSnapshot is returned by Sync() when the snapshot is rejected by the app, either
	// when offered or while applying a chunk. SyncAny() rejects the snapshot in the pool.
	errRejectSnapshot = errors.New("snapshot was rejected")
	// errRejectFormat is returned by Sync() when the snapshot format is rejected by the app.
	// SyncAny() rejects the whole format in the pool.
	errRejectFormat = errors.New("snapshot format was rejected")
	// errRejectSender is returned by Sync() when the snapshot sender is rejected by the app.
	// SyncAny() rejects every peer the pool lists for the snapshot.
	errRejectSender = errors.New("snapshot sender was rejected")
	// errVerifyFailed is returned by Sync() when app hash or last height verification fails.
	errVerifyFailed = errors.New("verification failed")
	// errTimeout is returned by Sync() when we've waited too long to receive a chunk.
	// SyncAny() rejects the snapshot in the pool when it sees this error.
	errTimeout = errors.New("timed out waiting for chunk")
	// errNoSnapshots is returned by SyncAny() if no snapshots are found and discovery is disabled
	// (discoveryTime == 0).
	errNoSnapshots = errors.New("no suitable snapshots found")
)
// syncer runs a state sync against an ABCI app. Use either SyncAny() to automatically attempt to
// sync all snapshots in the pool (pausing to discover new ones), or Sync() to sync a specific
// snapshot. Snapshots and chunks are fed via AddSnapshot() and AddChunk() as appropriate.
type syncer struct {
	// logger receives all progress and error messages emitted by the syncer.
	logger log.Logger
	// stateProvider supplies the state and commit for the snapshot height in Sync(), and is
	// handed to the snapshot pool on construction.
	stateProvider StateProvider
	// conn is the ABCI connection used to offer snapshots and apply chunks.
	conn proxy.AppConnSnapshot
	// connQuery is the ABCI connection used to query app info when verifying a restore.
	connQuery proxy.AppConnQuery
	// snapshots is the pool of discovered snapshots and the peers offering them.
	snapshots *snapshotPool
	// tempDir is passed to newChunkQueue when SyncAny() creates a chunk queue.
	tempDir string
	// chunkFetchers is the number of fetchChunks goroutines started per Sync() call.
	chunkFetchers int32
	// retryTimeout is how long a fetcher waits for a requested chunk before requesting it again.
	retryTimeout time.Duration

	// mtx guards chunks.
	mtx tmsync.RWMutex
	// chunks is the queue of the sync currently in progress; nil when no sync is running.
	chunks *chunkQueue
}
  58. // newSyncer creates a new syncer.
  59. func newSyncer(
  60. cfg config.StateSyncConfig,
  61. logger log.Logger,
  62. conn proxy.AppConnSnapshot,
  63. connQuery proxy.AppConnQuery,
  64. stateProvider StateProvider,
  65. tempDir string,
  66. ) *syncer {
  67. return &syncer{
  68. logger: logger,
  69. stateProvider: stateProvider,
  70. conn: conn,
  71. connQuery: connQuery,
  72. snapshots: newSnapshotPool(stateProvider),
  73. tempDir: tempDir,
  74. chunkFetchers: cfg.ChunkFetchers,
  75. retryTimeout: cfg.ChunkRequestTimeout,
  76. }
  77. }
  78. // AddChunk adds a chunk to the chunk queue, if any. It returns false if the chunk has already
  79. // been added to the queue, or an error if there's no sync in progress.
  80. func (s *syncer) AddChunk(chunk *chunk) (bool, error) {
  81. s.mtx.RLock()
  82. defer s.mtx.RUnlock()
  83. if s.chunks == nil {
  84. return false, errors.New("no state sync in progress")
  85. }
  86. added, err := s.chunks.Add(chunk)
  87. if err != nil {
  88. return false, err
  89. }
  90. if added {
  91. s.logger.Debug("Added chunk to queue", "height", chunk.Height, "format", chunk.Format,
  92. "chunk", chunk.Index)
  93. } else {
  94. s.logger.Debug("Ignoring duplicate chunk in queue", "height", chunk.Height, "format", chunk.Format,
  95. "chunk", chunk.Index)
  96. }
  97. return added, nil
  98. }
  99. // AddSnapshot adds a snapshot to the snapshot pool. It returns true if a new, previously unseen
  100. // snapshot was accepted and added.
  101. func (s *syncer) AddSnapshot(peer p2p.Peer, snapshot *snapshot) (bool, error) {
  102. added, err := s.snapshots.Add(peer, snapshot)
  103. if err != nil {
  104. return false, err
  105. }
  106. if added {
  107. s.logger.Info("Discovered new snapshot", "height", snapshot.Height, "format", snapshot.Format,
  108. "hash", snapshot.Hash)
  109. }
  110. return added, nil
  111. }
  112. // AddPeer adds a peer to the pool. For now we just keep it simple and send a single request
  113. // to discover snapshots, later we may want to do retries and stuff.
  114. func (s *syncer) AddPeer(peer p2p.Peer) {
  115. s.logger.Debug("Requesting snapshots from peer", "peer", peer.ID())
  116. peer.Send(SnapshotChannel, mustEncodeMsg(&ssproto.SnapshotsRequest{}))
  117. }
  118. // RemovePeer removes a peer from the pool.
  119. func (s *syncer) RemovePeer(peer p2p.Peer) {
  120. s.logger.Debug("Removing peer from sync", "peer", peer.ID())
  121. s.snapshots.RemovePeer(peer.ID())
  122. }
  123. // SyncAny tries to sync any of the snapshots in the snapshot pool, waiting to discover further
  124. // snapshots if none were found and discoveryTime > 0. It returns the latest state and block commit
  125. // which the caller must use to bootstrap the node.
  126. func (s *syncer) SyncAny(discoveryTime time.Duration, retryHook func()) (sm.State, *types.Commit, error) {
  127. if discoveryTime != 0 && discoveryTime < minimumDiscoveryTime {
  128. discoveryTime = 5 * minimumDiscoveryTime
  129. }
  130. if discoveryTime > 0 {
  131. s.logger.Info(fmt.Sprintf("Discovering snapshots for %v", discoveryTime))
  132. time.Sleep(discoveryTime)
  133. }
  134. // The app may ask us to retry a snapshot restoration, in which case we need to reuse
  135. // the snapshot and chunk queue from the previous loop iteration.
  136. var (
  137. snapshot *snapshot
  138. chunks *chunkQueue
  139. err error
  140. )
  141. for {
  142. // If not nil, we're going to retry restoration of the same snapshot.
  143. if snapshot == nil {
  144. snapshot = s.snapshots.Best()
  145. chunks = nil
  146. }
  147. if snapshot == nil {
  148. if discoveryTime == 0 {
  149. return sm.State{}, nil, errNoSnapshots
  150. }
  151. retryHook()
  152. s.logger.Info(fmt.Sprintf("Discovering snapshots for %v", discoveryTime))
  153. time.Sleep(discoveryTime)
  154. continue
  155. }
  156. if chunks == nil {
  157. chunks, err = newChunkQueue(snapshot, s.tempDir)
  158. if err != nil {
  159. return sm.State{}, nil, fmt.Errorf("failed to create chunk queue: %w", err)
  160. }
  161. defer chunks.Close() // in case we forget to close it elsewhere
  162. }
  163. newState, commit, err := s.Sync(snapshot, chunks)
  164. switch {
  165. case err == nil:
  166. return newState, commit, nil
  167. case errors.Is(err, errAbort):
  168. return sm.State{}, nil, err
  169. case errors.Is(err, errRetrySnapshot):
  170. chunks.RetryAll()
  171. s.logger.Info("Retrying snapshot", "height", snapshot.Height, "format", snapshot.Format,
  172. "hash", snapshot.Hash)
  173. continue
  174. case errors.Is(err, errTimeout):
  175. s.snapshots.Reject(snapshot)
  176. s.logger.Error("Timed out waiting for snapshot chunks, rejected snapshot",
  177. "height", snapshot.Height, "format", snapshot.Format, "hash", snapshot.Hash)
  178. case errors.Is(err, errRejectSnapshot):
  179. s.snapshots.Reject(snapshot)
  180. s.logger.Info("Snapshot rejected", "height", snapshot.Height, "format", snapshot.Format,
  181. "hash", snapshot.Hash)
  182. case errors.Is(err, errRejectFormat):
  183. s.snapshots.RejectFormat(snapshot.Format)
  184. s.logger.Info("Snapshot format rejected", "format", snapshot.Format)
  185. case errors.Is(err, errRejectSender):
  186. s.logger.Info("Snapshot senders rejected", "height", snapshot.Height, "format", snapshot.Format,
  187. "hash", snapshot.Hash)
  188. for _, peer := range s.snapshots.GetPeers(snapshot) {
  189. s.snapshots.RejectPeer(peer.ID())
  190. s.logger.Info("Snapshot sender rejected", "peer", peer.ID())
  191. }
  192. case errors.Is(err, context.DeadlineExceeded):
  193. s.logger.Info("Timed out validating snapshot, rejecting", "height", snapshot.Height, "err", err)
  194. s.snapshots.Reject(snapshot)
  195. default:
  196. return sm.State{}, nil, fmt.Errorf("snapshot restoration failed: %w", err)
  197. }
  198. // Discard snapshot and chunks for next iteration
  199. err = chunks.Close()
  200. if err != nil {
  201. s.logger.Error("Failed to clean up chunk queue", "err", err)
  202. }
  203. snapshot = nil
  204. chunks = nil
  205. }
  206. }
  207. // Sync executes a sync for a specific snapshot, returning the latest state and block commit which
  208. // the caller must use to bootstrap the node.
  209. func (s *syncer) Sync(snapshot *snapshot, chunks *chunkQueue) (sm.State, *types.Commit, error) {
  210. s.mtx.Lock()
  211. if s.chunks != nil {
  212. s.mtx.Unlock()
  213. return sm.State{}, nil, errors.New("a state sync is already in progress")
  214. }
  215. s.chunks = chunks
  216. s.mtx.Unlock()
  217. defer func() {
  218. s.mtx.Lock()
  219. s.chunks = nil
  220. s.mtx.Unlock()
  221. }()
  222. // Offer snapshot to ABCI app.
  223. err := s.offerSnapshot(snapshot)
  224. if err != nil {
  225. return sm.State{}, nil, err
  226. }
  227. // Spawn chunk fetchers. They will terminate when the chunk queue is closed or context cancelled.
  228. ctx, cancel := context.WithCancel(context.Background())
  229. defer cancel()
  230. for i := int32(0); i < s.chunkFetchers; i++ {
  231. go s.fetchChunks(ctx, snapshot, chunks)
  232. }
  233. pctx, pcancel := context.WithTimeout(context.Background(), 15*time.Second)
  234. defer pcancel()
  235. // Optimistically build new state, so we don't discover any light client failures at the end.
  236. state, err := s.stateProvider.State(pctx, snapshot.Height)
  237. if err != nil {
  238. return sm.State{}, nil, fmt.Errorf("failed to build new state: %w", err)
  239. }
  240. commit, err := s.stateProvider.Commit(pctx, snapshot.Height)
  241. if err != nil {
  242. return sm.State{}, nil, fmt.Errorf("failed to fetch commit: %w", err)
  243. }
  244. // Restore snapshot
  245. err = s.applyChunks(chunks)
  246. if err != nil {
  247. return sm.State{}, nil, err
  248. }
  249. // Verify app and update app version
  250. appVersion, err := s.verifyApp(snapshot)
  251. if err != nil {
  252. return sm.State{}, nil, err
  253. }
  254. state.Version.Consensus.App = appVersion
  255. // Done! 🎉
  256. s.logger.Info("Snapshot restored", "height", snapshot.Height, "format", snapshot.Format,
  257. "hash", snapshot.Hash)
  258. return state, commit, nil
  259. }
  260. // offerSnapshot offers a snapshot to the app. It returns various errors depending on the app's
  261. // response, or nil if the snapshot was accepted.
  262. func (s *syncer) offerSnapshot(snapshot *snapshot) error {
  263. s.logger.Info("Offering snapshot to ABCI app", "height", snapshot.Height,
  264. "format", snapshot.Format, "hash", snapshot.Hash)
  265. resp, err := s.conn.OfferSnapshotSync(abci.RequestOfferSnapshot{
  266. Snapshot: &abci.Snapshot{
  267. Height: snapshot.Height,
  268. Format: snapshot.Format,
  269. Chunks: snapshot.Chunks,
  270. Hash: snapshot.Hash,
  271. Metadata: snapshot.Metadata,
  272. },
  273. AppHash: snapshot.trustedAppHash,
  274. })
  275. if err != nil {
  276. return fmt.Errorf("failed to offer snapshot: %w", err)
  277. }
  278. switch resp.Result {
  279. case abci.ResponseOfferSnapshot_ACCEPT:
  280. s.logger.Info("Snapshot accepted, restoring", "height", snapshot.Height,
  281. "format", snapshot.Format, "hash", snapshot.Hash)
  282. return nil
  283. case abci.ResponseOfferSnapshot_ABORT:
  284. return errAbort
  285. case abci.ResponseOfferSnapshot_REJECT:
  286. return errRejectSnapshot
  287. case abci.ResponseOfferSnapshot_REJECT_FORMAT:
  288. return errRejectFormat
  289. case abci.ResponseOfferSnapshot_REJECT_SENDER:
  290. return errRejectSender
  291. default:
  292. return fmt.Errorf("unknown ResponseOfferSnapshot result %v", resp.Result)
  293. }
  294. }
  295. // applyChunks applies chunks to the app. It returns various errors depending on the app's
  296. // response, or nil once the snapshot is fully restored.
  297. func (s *syncer) applyChunks(chunks *chunkQueue) error {
  298. for {
  299. chunk, err := chunks.Next()
  300. if err == errDone {
  301. return nil
  302. } else if err != nil {
  303. return fmt.Errorf("failed to fetch chunk: %w", err)
  304. }
  305. resp, err := s.conn.ApplySnapshotChunkSync(abci.RequestApplySnapshotChunk{
  306. Index: chunk.Index,
  307. Chunk: chunk.Chunk,
  308. Sender: string(chunk.Sender),
  309. })
  310. if err != nil {
  311. return fmt.Errorf("failed to apply chunk %v: %w", chunk.Index, err)
  312. }
  313. s.logger.Info("Applied snapshot chunk to ABCI app", "height", chunk.Height,
  314. "format", chunk.Format, "chunk", chunk.Index, "total", chunks.Size())
  315. // Discard and refetch any chunks as requested by the app
  316. for _, index := range resp.RefetchChunks {
  317. err := chunks.Discard(index)
  318. if err != nil {
  319. return fmt.Errorf("failed to discard chunk %v: %w", index, err)
  320. }
  321. }
  322. // Reject any senders as requested by the app
  323. for _, sender := range resp.RejectSenders {
  324. if sender != "" {
  325. s.snapshots.RejectPeer(p2p.ID(sender))
  326. err := chunks.DiscardSender(p2p.ID(sender))
  327. if err != nil {
  328. return fmt.Errorf("failed to reject sender: %w", err)
  329. }
  330. }
  331. }
  332. switch resp.Result {
  333. case abci.ResponseApplySnapshotChunk_ACCEPT:
  334. case abci.ResponseApplySnapshotChunk_ABORT:
  335. return errAbort
  336. case abci.ResponseApplySnapshotChunk_RETRY:
  337. chunks.Retry(chunk.Index)
  338. case abci.ResponseApplySnapshotChunk_RETRY_SNAPSHOT:
  339. return errRetrySnapshot
  340. case abci.ResponseApplySnapshotChunk_REJECT_SNAPSHOT:
  341. return errRejectSnapshot
  342. default:
  343. return fmt.Errorf("unknown ResponseApplySnapshotChunk result %v", resp.Result)
  344. }
  345. }
  346. }
  347. // fetchChunks requests chunks from peers, receiving allocations from the chunk queue. Chunks
  348. // will be received from the reactor via syncer.AddChunks() to chunkQueue.Add().
  349. func (s *syncer) fetchChunks(ctx context.Context, snapshot *snapshot, chunks *chunkQueue) {
  350. var (
  351. next = true
  352. index uint32
  353. err error
  354. )
  355. for {
  356. if next {
  357. index, err = chunks.Allocate()
  358. if errors.Is(err, errDone) {
  359. // Keep checking until the context is canceled (restore is done), in case any
  360. // chunks need to be refetched.
  361. select {
  362. case <-ctx.Done():
  363. return
  364. default:
  365. }
  366. time.Sleep(2 * time.Second)
  367. continue
  368. }
  369. if err != nil {
  370. s.logger.Error("Failed to allocate chunk from queue", "err", err)
  371. return
  372. }
  373. }
  374. s.logger.Info("Fetching snapshot chunk", "height", snapshot.Height,
  375. "format", snapshot.Format, "chunk", index, "total", chunks.Size())
  376. ticker := time.NewTicker(s.retryTimeout)
  377. defer ticker.Stop()
  378. s.requestChunk(snapshot, index)
  379. select {
  380. case <-chunks.WaitFor(index):
  381. next = true
  382. case <-ticker.C:
  383. next = false
  384. case <-ctx.Done():
  385. return
  386. }
  387. ticker.Stop()
  388. }
  389. }
  390. // requestChunk requests a chunk from a peer.
  391. func (s *syncer) requestChunk(snapshot *snapshot, chunk uint32) {
  392. peer := s.snapshots.GetPeer(snapshot)
  393. if peer == nil {
  394. s.logger.Error("No valid peers found for snapshot", "height", snapshot.Height,
  395. "format", snapshot.Format, "hash", snapshot.Hash)
  396. return
  397. }
  398. s.logger.Debug("Requesting snapshot chunk", "height", snapshot.Height,
  399. "format", snapshot.Format, "chunk", chunk, "peer", peer.ID())
  400. peer.Send(ChunkChannel, mustEncodeMsg(&ssproto.ChunkRequest{
  401. Height: snapshot.Height,
  402. Format: snapshot.Format,
  403. Index: chunk,
  404. }))
  405. }
  406. // verifyApp verifies the sync, checking the app hash and last block height. It returns the
  407. // app version, which should be returned as part of the initial state.
  408. func (s *syncer) verifyApp(snapshot *snapshot) (uint64, error) {
  409. resp, err := s.connQuery.InfoSync(proxy.RequestInfo)
  410. if err != nil {
  411. return 0, fmt.Errorf("failed to query ABCI app for appHash: %w", err)
  412. }
  413. if !bytes.Equal(snapshot.trustedAppHash, resp.LastBlockAppHash) {
  414. s.logger.Error("appHash verification failed",
  415. "expected", snapshot.trustedAppHash,
  416. "actual", resp.LastBlockAppHash)
  417. return 0, errVerifyFailed
  418. }
  419. if uint64(resp.LastBlockHeight) != snapshot.Height {
  420. s.logger.Error("ABCI app reported unexpected last block height",
  421. "expected", snapshot.Height, "actual", resp.LastBlockHeight)
  422. return 0, errVerifyFailed
  423. }
  424. s.logger.Info("Verified ABCI app", "height", snapshot.Height, "appHash", snapshot.trustedAppHash)
  425. return resp.AppVersion, nil
  426. }