You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

443 lines
15 KiB

  1. package statesync
  2. import (
  3. "bytes"
  4. "context"
  5. "errors"
  6. "fmt"
  7. "time"
  8. abci "github.com/tendermint/tendermint/abci/types"
  9. "github.com/tendermint/tendermint/libs/log"
  10. tmsync "github.com/tendermint/tendermint/libs/sync"
  11. "github.com/tendermint/tendermint/p2p"
  12. ssproto "github.com/tendermint/tendermint/proto/tendermint/statesync"
  13. "github.com/tendermint/tendermint/proxy"
  14. sm "github.com/tendermint/tendermint/state"
  15. "github.com/tendermint/tendermint/types"
  16. )
const (
	// chunkFetchers is the number of concurrent chunk fetchers to run.
	chunkFetchers = 4
	// chunkTimeout is the timeout while waiting for the next chunk from the chunk queue.
	chunkTimeout = 2 * time.Minute
	// chunkRequestTimeout is the timeout before rerequesting a chunk, possibly from a different peer.
	chunkRequestTimeout = 10 * time.Second
)
// Sentinel errors used to classify the outcome of a sync attempt. SyncAny() matches on them
// with errors.Is() to decide whether to abort, retry the same snapshot, or try another one.
var (
	// errAbort is returned by Sync() when snapshot restoration is aborted.
	errAbort = errors.New("state sync aborted")
	// errRetrySnapshot is returned by Sync() when the snapshot should be retried.
	errRetrySnapshot = errors.New("retry snapshot")
	// errRejectSnapshot is returned by Sync() when the snapshot is rejected.
	errRejectSnapshot = errors.New("snapshot was rejected")
	// errRejectFormat is returned by Sync() when the snapshot format is rejected.
	errRejectFormat = errors.New("snapshot format was rejected")
	// errRejectSender is returned by Sync() when the snapshot sender is rejected.
	errRejectSender = errors.New("snapshot sender was rejected")
	// errVerifyFailed is returned by Sync() when app hash or last height verification fails.
	errVerifyFailed = errors.New("verification failed")
	// errTimeout is returned by Sync() when we've waited too long to receive a chunk.
	errTimeout = errors.New("timed out waiting for chunk")
	// errNoSnapshots is returned by SyncAny() if no snapshots are found and discovery is disabled.
	errNoSnapshots = errors.New("no suitable snapshots found")
)
// syncer runs a state sync against an ABCI app. Use either SyncAny() to automatically attempt to
// sync all snapshots in the pool (pausing to discover new ones), or Sync() to sync a specific
// snapshot. Snapshots and chunks are fed via AddSnapshot() and AddChunk() as appropriate.
type syncer struct {
	// logger receives progress and error messages for the sync.
	logger log.Logger
	// stateProvider supplies the state and commit for the snapshot height being restored.
	stateProvider StateProvider
	// conn is the ABCI snapshot connection used to offer snapshots and apply chunks.
	conn proxy.AppConnSnapshot
	// connQuery is the ABCI query connection used to verify the app after restoration.
	connQuery proxy.AppConnQuery
	// snapshots is the pool of discovered snapshots and the peers that advertised them.
	snapshots *snapshotPool
	// tempDir is handed to newChunkQueue() when a chunk queue is created.
	tempDir string

	// mtx guards chunks.
	mtx tmsync.RWMutex
	// chunks is the queue of the sync currently in progress; it is nil when no sync is running.
	chunks *chunkQueue
}
  56. // newSyncer creates a new syncer.
  57. func newSyncer(logger log.Logger, conn proxy.AppConnSnapshot, connQuery proxy.AppConnQuery,
  58. stateProvider StateProvider, tempDir string) *syncer {
  59. return &syncer{
  60. logger: logger,
  61. stateProvider: stateProvider,
  62. conn: conn,
  63. connQuery: connQuery,
  64. snapshots: newSnapshotPool(stateProvider),
  65. tempDir: tempDir,
  66. }
  67. }
  68. // AddChunk adds a chunk to the chunk queue, if any. It returns false if the chunk has already
  69. // been added to the queue, or an error if there's no sync in progress.
  70. func (s *syncer) AddChunk(chunk *chunk) (bool, error) {
  71. s.mtx.RLock()
  72. defer s.mtx.RUnlock()
  73. if s.chunks == nil {
  74. return false, errors.New("no state sync in progress")
  75. }
  76. added, err := s.chunks.Add(chunk)
  77. if err != nil {
  78. return false, err
  79. }
  80. if added {
  81. s.logger.Debug("Added chunk to queue", "height", chunk.Height, "format", chunk.Format,
  82. "chunk", chunk.Index)
  83. } else {
  84. s.logger.Debug("Ignoring duplicate chunk in queue", "height", chunk.Height, "format", chunk.Format,
  85. "chunk", chunk.Index)
  86. }
  87. return added, nil
  88. }
  89. // AddSnapshot adds a snapshot to the snapshot pool. It returns true if a new, previously unseen
  90. // snapshot was accepted and added.
  91. func (s *syncer) AddSnapshot(peer p2p.Peer, snapshot *snapshot) (bool, error) {
  92. added, err := s.snapshots.Add(peer, snapshot)
  93. if err != nil {
  94. return false, err
  95. }
  96. if added {
  97. s.logger.Info("Discovered new snapshot", "height", snapshot.Height, "format", snapshot.Format,
  98. "hash", fmt.Sprintf("%X", snapshot.Hash))
  99. }
  100. return added, nil
  101. }
  102. // AddPeer adds a peer to the pool. For now we just keep it simple and send a single request
  103. // to discover snapshots, later we may want to do retries and stuff.
  104. func (s *syncer) AddPeer(peer p2p.Peer) {
  105. s.logger.Debug("Requesting snapshots from peer", "peer", peer.ID())
  106. peer.Send(SnapshotChannel, mustEncodeMsg(&ssproto.SnapshotsRequest{}))
  107. }
  108. // RemovePeer removes a peer from the pool.
  109. func (s *syncer) RemovePeer(peer p2p.Peer) {
  110. s.logger.Debug("Removing peer from sync", "peer", peer.ID())
  111. s.snapshots.RemovePeer(peer.ID())
  112. }
  113. // SyncAny tries to sync any of the snapshots in the snapshot pool, waiting to discover further
  114. // snapshots if none were found and discoveryTime > 0. It returns the latest state and block commit
  115. // which the caller must use to bootstrap the node.
  116. func (s *syncer) SyncAny(discoveryTime time.Duration) (sm.State, *types.Commit, error) {
  117. if discoveryTime > 0 {
  118. s.logger.Info(fmt.Sprintf("Discovering snapshots for %v", discoveryTime))
  119. time.Sleep(discoveryTime)
  120. }
  121. // The app may ask us to retry a snapshot restoration, in which case we need to reuse
  122. // the snapshot and chunk queue from the previous loop iteration.
  123. var (
  124. snapshot *snapshot
  125. chunks *chunkQueue
  126. err error
  127. )
  128. for {
  129. // If not nil, we're going to retry restoration of the same snapshot.
  130. if snapshot == nil {
  131. snapshot = s.snapshots.Best()
  132. chunks = nil
  133. }
  134. if snapshot == nil {
  135. if discoveryTime == 0 {
  136. return sm.State{}, nil, errNoSnapshots
  137. }
  138. s.logger.Info(fmt.Sprintf("Discovering snapshots for %v", discoveryTime))
  139. time.Sleep(discoveryTime)
  140. continue
  141. }
  142. if chunks == nil {
  143. chunks, err = newChunkQueue(snapshot, s.tempDir)
  144. if err != nil {
  145. return sm.State{}, nil, fmt.Errorf("failed to create chunk queue: %w", err)
  146. }
  147. defer chunks.Close() // in case we forget to close it elsewhere
  148. }
  149. newState, commit, err := s.Sync(snapshot, chunks)
  150. switch {
  151. case err == nil:
  152. return newState, commit, nil
  153. case errors.Is(err, errAbort):
  154. return sm.State{}, nil, err
  155. case errors.Is(err, errRetrySnapshot):
  156. chunks.RetryAll()
  157. s.logger.Info("Retrying snapshot", "height", snapshot.Height, "format", snapshot.Format,
  158. "hash", fmt.Sprintf("%X", snapshot.Hash))
  159. continue
  160. case errors.Is(err, errTimeout):
  161. s.snapshots.Reject(snapshot)
  162. s.logger.Error("Timed out waiting for snapshot chunks, rejected snapshot",
  163. "height", snapshot.Height, "format", snapshot.Format, "hash", fmt.Sprintf("%X", snapshot.Hash))
  164. case errors.Is(err, errRejectSnapshot):
  165. s.snapshots.Reject(snapshot)
  166. s.logger.Info("Snapshot rejected", "height", snapshot.Height, "format", snapshot.Format,
  167. "hash", fmt.Sprintf("%X", snapshot.Hash))
  168. case errors.Is(err, errRejectFormat):
  169. s.snapshots.RejectFormat(snapshot.Format)
  170. s.logger.Info("Snapshot format rejected", "format", snapshot.Format)
  171. case errors.Is(err, errRejectSender):
  172. s.logger.Info("Snapshot senders rejected", "height", snapshot.Height, "format", snapshot.Format,
  173. "hash", fmt.Sprintf("%X", snapshot.Hash))
  174. for _, peer := range s.snapshots.GetPeers(snapshot) {
  175. s.snapshots.RejectPeer(peer.ID())
  176. s.logger.Info("Snapshot sender rejected", "peer", peer.ID())
  177. }
  178. default:
  179. return sm.State{}, nil, fmt.Errorf("snapshot restoration failed: %w", err)
  180. }
  181. // Discard snapshot and chunks for next iteration
  182. err = chunks.Close()
  183. if err != nil {
  184. s.logger.Error("Failed to clean up chunk queue", "err", err)
  185. }
  186. snapshot = nil
  187. chunks = nil
  188. }
  189. }
  190. // Sync executes a sync for a specific snapshot, returning the latest state and block commit which
  191. // the caller must use to bootstrap the node.
  192. func (s *syncer) Sync(snapshot *snapshot, chunks *chunkQueue) (sm.State, *types.Commit, error) {
  193. s.mtx.Lock()
  194. if s.chunks != nil {
  195. s.mtx.Unlock()
  196. return sm.State{}, nil, errors.New("a state sync is already in progress")
  197. }
  198. s.chunks = chunks
  199. s.mtx.Unlock()
  200. defer func() {
  201. s.mtx.Lock()
  202. s.chunks = nil
  203. s.mtx.Unlock()
  204. }()
  205. // Offer snapshot to ABCI app.
  206. err := s.offerSnapshot(snapshot)
  207. if err != nil {
  208. return sm.State{}, nil, err
  209. }
  210. // Spawn chunk fetchers. They will terminate when the chunk queue is closed or context cancelled.
  211. ctx, cancel := context.WithCancel(context.Background())
  212. defer cancel()
  213. for i := int32(0); i < chunkFetchers; i++ {
  214. go s.fetchChunks(ctx, snapshot, chunks)
  215. }
  216. pctx, pcancel := context.WithTimeout(context.Background(), 10*time.Second)
  217. defer pcancel()
  218. // Optimistically build new state, so we don't discover any light client failures at the end.
  219. state, err := s.stateProvider.State(pctx, snapshot.Height)
  220. if err != nil {
  221. return sm.State{}, nil, fmt.Errorf("failed to build new state: %w", err)
  222. }
  223. commit, err := s.stateProvider.Commit(pctx, snapshot.Height)
  224. if err != nil {
  225. return sm.State{}, nil, fmt.Errorf("failed to fetch commit: %w", err)
  226. }
  227. // Restore snapshot
  228. err = s.applyChunks(chunks)
  229. if err != nil {
  230. return sm.State{}, nil, err
  231. }
  232. // Verify app and update app version
  233. appVersion, err := s.verifyApp(snapshot)
  234. if err != nil {
  235. return sm.State{}, nil, err
  236. }
  237. state.Version.Consensus.App = appVersion
  238. // Done! 🎉
  239. s.logger.Info("Snapshot restored", "height", snapshot.Height, "format", snapshot.Format,
  240. "hash", fmt.Sprintf("%X", snapshot.Hash))
  241. return state, commit, nil
  242. }
  243. // offerSnapshot offers a snapshot to the app. It returns various errors depending on the app's
  244. // response, or nil if the snapshot was accepted.
  245. func (s *syncer) offerSnapshot(snapshot *snapshot) error {
  246. s.logger.Info("Offering snapshot to ABCI app", "height", snapshot.Height,
  247. "format", snapshot.Format, "hash", fmt.Sprintf("%X", snapshot.Hash))
  248. resp, err := s.conn.OfferSnapshotSync(abci.RequestOfferSnapshot{
  249. Snapshot: &abci.Snapshot{
  250. Height: snapshot.Height,
  251. Format: snapshot.Format,
  252. Chunks: snapshot.Chunks,
  253. Hash: snapshot.Hash,
  254. Metadata: snapshot.Metadata,
  255. },
  256. AppHash: snapshot.trustedAppHash,
  257. })
  258. if err != nil {
  259. return fmt.Errorf("failed to offer snapshot: %w", err)
  260. }
  261. switch resp.Result {
  262. case abci.ResponseOfferSnapshot_ACCEPT:
  263. s.logger.Info("Snapshot accepted, restoring", "height", snapshot.Height,
  264. "format", snapshot.Format, "hash", fmt.Sprintf("%X", snapshot.Hash))
  265. return nil
  266. case abci.ResponseOfferSnapshot_ABORT:
  267. return errAbort
  268. case abci.ResponseOfferSnapshot_REJECT:
  269. return errRejectSnapshot
  270. case abci.ResponseOfferSnapshot_REJECT_FORMAT:
  271. return errRejectFormat
  272. case abci.ResponseOfferSnapshot_REJECT_SENDER:
  273. return errRejectSender
  274. default:
  275. return fmt.Errorf("unknown ResponseOfferSnapshot result %v", resp.Result)
  276. }
  277. }
  278. // applyChunks applies chunks to the app. It returns various errors depending on the app's
  279. // response, or nil once the snapshot is fully restored.
  280. func (s *syncer) applyChunks(chunks *chunkQueue) error {
  281. for {
  282. chunk, err := chunks.Next()
  283. if err == errDone {
  284. return nil
  285. } else if err != nil {
  286. return fmt.Errorf("failed to fetch chunk: %w", err)
  287. }
  288. resp, err := s.conn.ApplySnapshotChunkSync(abci.RequestApplySnapshotChunk{
  289. Index: chunk.Index,
  290. Chunk: chunk.Chunk,
  291. Sender: string(chunk.Sender),
  292. })
  293. if err != nil {
  294. return fmt.Errorf("failed to apply chunk %v: %w", chunk.Index, err)
  295. }
  296. s.logger.Info("Applied snapshot chunk to ABCI app", "height", chunk.Height,
  297. "format", chunk.Format, "chunk", chunk.Index, "total", chunks.Size())
  298. // Discard and refetch any chunks as requested by the app
  299. for _, index := range resp.RefetchChunks {
  300. err := chunks.Discard(index)
  301. if err != nil {
  302. return fmt.Errorf("failed to discard chunk %v: %w", index, err)
  303. }
  304. }
  305. // Reject any senders as requested by the app
  306. for _, sender := range resp.RejectSenders {
  307. if sender != "" {
  308. s.snapshots.RejectPeer(p2p.ID(sender))
  309. err := chunks.DiscardSender(p2p.ID(sender))
  310. if err != nil {
  311. return fmt.Errorf("failed to reject sender: %w", err)
  312. }
  313. }
  314. }
  315. switch resp.Result {
  316. case abci.ResponseApplySnapshotChunk_ACCEPT:
  317. case abci.ResponseApplySnapshotChunk_ABORT:
  318. return errAbort
  319. case abci.ResponseApplySnapshotChunk_RETRY:
  320. chunks.Retry(chunk.Index)
  321. case abci.ResponseApplySnapshotChunk_RETRY_SNAPSHOT:
  322. return errRetrySnapshot
  323. case abci.ResponseApplySnapshotChunk_REJECT_SNAPSHOT:
  324. return errRejectSnapshot
  325. default:
  326. return fmt.Errorf("unknown ResponseApplySnapshotChunk result %v", resp.Result)
  327. }
  328. }
  329. }
  330. // fetchChunks requests chunks from peers, receiving allocations from the chunk queue. Chunks
  331. // will be received from the reactor via syncer.AddChunks() to chunkQueue.Add().
  332. func (s *syncer) fetchChunks(ctx context.Context, snapshot *snapshot, chunks *chunkQueue) {
  333. for {
  334. index, err := chunks.Allocate()
  335. if err == errDone {
  336. // Keep checking until the context is cancelled (restore is done), in case any
  337. // chunks need to be refetched.
  338. select {
  339. case <-ctx.Done():
  340. return
  341. default:
  342. }
  343. time.Sleep(2 * time.Second)
  344. continue
  345. }
  346. if err != nil {
  347. s.logger.Error("Failed to allocate chunk from queue", "err", err)
  348. return
  349. }
  350. s.logger.Info("Fetching snapshot chunk", "height", snapshot.Height,
  351. "format", snapshot.Format, "chunk", index, "total", chunks.Size())
  352. ticker := time.NewTicker(chunkRequestTimeout)
  353. defer ticker.Stop()
  354. s.requestChunk(snapshot, index)
  355. select {
  356. case <-chunks.WaitFor(index):
  357. case <-ticker.C:
  358. s.requestChunk(snapshot, index)
  359. case <-ctx.Done():
  360. return
  361. }
  362. ticker.Stop()
  363. }
  364. }
  365. // requestChunk requests a chunk from a peer.
  366. func (s *syncer) requestChunk(snapshot *snapshot, chunk uint32) {
  367. peer := s.snapshots.GetPeer(snapshot)
  368. if peer == nil {
  369. s.logger.Error("No valid peers found for snapshot", "height", snapshot.Height,
  370. "format", snapshot.Format, "hash", snapshot.Hash)
  371. return
  372. }
  373. s.logger.Debug("Requesting snapshot chunk", "height", snapshot.Height,
  374. "format", snapshot.Format, "chunk", chunk, "peer", peer.ID())
  375. peer.Send(ChunkChannel, mustEncodeMsg(&ssproto.ChunkRequest{
  376. Height: snapshot.Height,
  377. Format: snapshot.Format,
  378. Index: chunk,
  379. }))
  380. }
  381. // verifyApp verifies the sync, checking the app hash and last block height. It returns the
  382. // app version, which should be returned as part of the initial state.
  383. func (s *syncer) verifyApp(snapshot *snapshot) (uint64, error) {
  384. resp, err := s.connQuery.InfoSync(proxy.RequestInfo)
  385. if err != nil {
  386. return 0, fmt.Errorf("failed to query ABCI app for appHash: %w", err)
  387. }
  388. if !bytes.Equal(snapshot.trustedAppHash, resp.LastBlockAppHash) {
  389. s.logger.Error("appHash verification failed",
  390. "expected", fmt.Sprintf("%X", snapshot.trustedAppHash),
  391. "actual", fmt.Sprintf("%X", resp.LastBlockAppHash))
  392. return 0, errVerifyFailed
  393. }
  394. if uint64(resp.LastBlockHeight) != snapshot.Height {
  395. s.logger.Error("ABCI app reported unexpected last block height",
  396. "expected", snapshot.Height, "actual", resp.LastBlockHeight)
  397. return 0, errVerifyFailed
  398. }
  399. s.logger.Info("Verified ABCI app", "height", snapshot.Height,
  400. "appHash", fmt.Sprintf("%X", snapshot.trustedAppHash))
  401. return resp.AppVersion, nil
  402. }