You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

785 lines
27 KiB

statesync: remove deadlock on init fail (#7029) When statesync is stopped during shutdown, it has the possibility of deadlocking. A dump of goroutines reveals that this is related to the peerUpdates channel not returning anything on its `Done()` channel when `OnStop` is called. As this is occurring, `processPeerUpdate` is attempting to acquire the reactor lock. It appears that this lock can never be acquired. I looked for the places where the lock may remain locked accidentally and cleaned them up in hopes to eradicate the issue. Dumps of the relevant goroutines may be found below. Note that the line numbers below are relative to the code in the `v0.35.0-rc1` tag. ``` goroutine 36 [chan receive]: github.com/tendermint/tendermint/internal/statesync.(*Reactor).OnStop(0xc00058f200) github.com/tendermint/tendermint/internal/statesync/reactor.go:243 +0x117 github.com/tendermint/tendermint/libs/service.(*BaseService).Stop(0xc00058f200, 0x0, 0x0) github.com/tendermint/tendermint/libs/service/service.go:171 +0x323 github.com/tendermint/tendermint/node.(*nodeImpl).OnStop(0xc0001ea240) github.com/tendermint/tendermint/node/node.go:769 +0x132 github.com/tendermint/tendermint/libs/service.(*BaseService).Stop(0xc0001ea240, 0x0, 0x0) github.com/tendermint/tendermint/libs/service/service.go:171 +0x323 github.com/tendermint/tendermint/cmd/tendermint/commands.NewRunNodeCmd.func1.1() github.com/tendermint/tendermint/cmd/tendermint/commands/run_node.go:143 +0x62 github.com/tendermint/tendermint/libs/os.TrapSignal.func1(0xc000629500, 0x7fdb52f96358, 0xc0002b5030, 0xc00000daa0) github.com/tendermint/tendermint/libs/os/os.go:26 +0x102 created by github.com/tendermint/tendermint/libs/os.TrapSignal github.com/tendermint/tendermint/libs/os/os.go:22 +0xe6 goroutine 188 [semacquire]: sync.runtime_SemacquireMutex(0xc00026b1cc, 0x0, 0x1) runtime/sema.go:71 +0x47 sync.(*Mutex).lockSlow(0xc00026b1c8) sync/mutex.go:138 +0x105 sync.(*Mutex).Lock(...)
sync/mutex.go:81 sync.(*RWMutex).Lock(0xc00026b1c8) sync/rwmutex.go:111 +0x90 github.com/tendermint/tendermint/internal/statesync.(*Reactor).processPeerUpdate(0xc00026b080, 0xc000650008, 0x28, 0x124de90, 0x4) github.com/tendermint/tendermint/internal/statesync/reactor.go:849 +0x1a5 github.com/tendermint/tendermint/internal/statesync.(*Reactor).processPeerUpdates(0xc00026b080) github.com/tendermint/tendermint/internal/statesync/reactor.go:883 +0xab created by github.com/tendermint/tendermint/internal/statesync.(*Reactor).OnStart github.com/tendermint/tendermint/internal/statesync/reactor.go:219 +0xcd ```
3 years ago
statesync: remove deadlock on init fail (#7029) When statesync is stopped during shutdown, it has the possibility of deadlocking. A dump of goroutines reveals that this is related to the peerUpdates channel not returning anything on its `Done()` channel when `OnStop` is called. As this is occurring, `processPeerUpdate` is attempting to acquire the reactor lock. It appears that this lock can never be acquired. I looked for the places where the lock may remain locked accidentally and cleaned them up in hopes to eradicate the issue. Dumps of the relevant goroutines may be found below. Note that the line numbers below are relative to the code in the `v0.35.0-rc1` tag. ``` goroutine 36 [chan receive]: github.com/tendermint/tendermint/internal/statesync.(*Reactor).OnStop(0xc00058f200) github.com/tendermint/tendermint/internal/statesync/reactor.go:243 +0x117 github.com/tendermint/tendermint/libs/service.(*BaseService).Stop(0xc00058f200, 0x0, 0x0) github.com/tendermint/tendermint/libs/service/service.go:171 +0x323 github.com/tendermint/tendermint/node.(*nodeImpl).OnStop(0xc0001ea240) github.com/tendermint/tendermint/node/node.go:769 +0x132 github.com/tendermint/tendermint/libs/service.(*BaseService).Stop(0xc0001ea240, 0x0, 0x0) github.com/tendermint/tendermint/libs/service/service.go:171 +0x323 github.com/tendermint/tendermint/cmd/tendermint/commands.NewRunNodeCmd.func1.1() github.com/tendermint/tendermint/cmd/tendermint/commands/run_node.go:143 +0x62 github.com/tendermint/tendermint/libs/os.TrapSignal.func1(0xc000629500, 0x7fdb52f96358, 0xc0002b5030, 0xc00000daa0) github.com/tendermint/tendermint/libs/os/os.go:26 +0x102 created by github.com/tendermint/tendermint/libs/os.TrapSignal github.com/tendermint/tendermint/libs/os/os.go:22 +0xe6 goroutine 188 [semacquire]: sync.runtime_SemacquireMutex(0xc00026b1cc, 0x0, 0x1) runtime/sema.go:71 +0x47 sync.(*Mutex).lockSlow(0xc00026b1c8) sync/mutex.go:138 +0x105 sync.(*Mutex).Lock(...)
sync/mutex.go:81 sync.(*RWMutex).Lock(0xc00026b1c8) sync/rwmutex.go:111 +0x90 github.com/tendermint/tendermint/internal/statesync.(*Reactor).processPeerUpdate(0xc00026b080, 0xc000650008, 0x28, 0x124de90, 0x4) github.com/tendermint/tendermint/internal/statesync/reactor.go:849 +0x1a5 github.com/tendermint/tendermint/internal/statesync.(*Reactor).processPeerUpdates(0xc00026b080) github.com/tendermint/tendermint/internal/statesync/reactor.go:883 +0xab created by github.com/tendermint/tendermint/internal/statesync.(*Reactor).OnStart github.com/tendermint/tendermint/internal/statesync/reactor.go:219 +0xcd ```
3 years ago
  1. package statesync
  2. import (
  3. "context"
  4. "errors"
  5. "sync"
  6. "testing"
  7. "time"
  8. "github.com/stretchr/testify/assert"
  9. "github.com/stretchr/testify/mock"
  10. "github.com/stretchr/testify/require"
  11. abci "github.com/tendermint/tendermint/abci/types"
  12. "github.com/tendermint/tendermint/internal/proxy"
  13. proxymocks "github.com/tendermint/tendermint/internal/proxy/mocks"
  14. sm "github.com/tendermint/tendermint/internal/state"
  15. "github.com/tendermint/tendermint/internal/statesync/mocks"
  16. ssproto "github.com/tendermint/tendermint/proto/tendermint/statesync"
  17. "github.com/tendermint/tendermint/types"
  18. "github.com/tendermint/tendermint/version"
  19. )
  20. func TestSyncer_SyncAny(t *testing.T) {
  21. ctx, cancel := context.WithCancel(context.Background())
  22. defer cancel()
  23. state := sm.State{
  24. ChainID: "chain",
  25. Version: sm.Version{
  26. Consensus: version.Consensus{
  27. Block: version.BlockProtocol,
  28. App: testAppVersion,
  29. },
  30. Software: version.TMVersion,
  31. },
  32. LastBlockHeight: 1,
  33. LastBlockID: types.BlockID{Hash: []byte("blockhash")},
  34. LastBlockTime: time.Now(),
  35. LastResultsHash: []byte("last_results_hash"),
  36. AppHash: []byte("app_hash"),
  37. LastValidators: &types.ValidatorSet{Proposer: &types.Validator{Address: []byte("val1")}},
  38. Validators: &types.ValidatorSet{Proposer: &types.Validator{Address: []byte("val2")}},
  39. NextValidators: &types.ValidatorSet{Proposer: &types.Validator{Address: []byte("val3")}},
  40. ConsensusParams: *types.DefaultConsensusParams(),
  41. LastHeightConsensusParamsChanged: 1,
  42. }
  43. commit := &types.Commit{BlockID: types.BlockID{Hash: []byte("blockhash")}}
  44. chunks := []*chunk{
  45. {Height: 1, Format: 1, Index: 0, Chunk: []byte{1, 1, 0}},
  46. {Height: 1, Format: 1, Index: 1, Chunk: []byte{1, 1, 1}},
  47. {Height: 1, Format: 1, Index: 2, Chunk: []byte{1, 1, 2}},
  48. }
  49. s := &snapshot{Height: 1, Format: 1, Chunks: 3, Hash: []byte{1, 2, 3}}
  50. stateProvider := &mocks.StateProvider{}
  51. stateProvider.On("AppHash", mock.Anything, uint64(1)).Return(state.AppHash, nil)
  52. stateProvider.On("AppHash", mock.Anything, uint64(2)).Return([]byte("app_hash_2"), nil)
  53. stateProvider.On("Commit", mock.Anything, uint64(1)).Return(commit, nil)
  54. stateProvider.On("State", mock.Anything, uint64(1)).Return(state, nil)
  55. connSnapshot := &proxymocks.AppConnSnapshot{}
  56. connQuery := &proxymocks.AppConnQuery{}
  57. peerAID := types.NodeID("aa")
  58. peerBID := types.NodeID("bb")
  59. peerCID := types.NodeID("cc")
  60. rts := setup(ctx, t, connSnapshot, connQuery, stateProvider, 4)
  61. rts.reactor.syncer = rts.syncer
  62. // Adding a chunk should error when no sync is in progress
  63. _, err := rts.syncer.AddChunk(&chunk{Height: 1, Format: 1, Index: 0, Chunk: []byte{1}})
  64. require.Error(t, err)
  65. // Adding a couple of peers should trigger snapshot discovery messages
  66. err = rts.syncer.AddPeer(ctx, peerAID)
  67. require.NoError(t, err)
  68. e := <-rts.snapshotOutCh
  69. require.Equal(t, &ssproto.SnapshotsRequest{}, e.Message)
  70. require.Equal(t, peerAID, e.To)
  71. err = rts.syncer.AddPeer(ctx, peerBID)
  72. require.NoError(t, err)
  73. e = <-rts.snapshotOutCh
  74. require.Equal(t, &ssproto.SnapshotsRequest{}, e.Message)
  75. require.Equal(t, peerBID, e.To)
  76. // Both peers report back with snapshots. One of them also returns a snapshot we don't want, in
  77. // format 2, which will be rejected by the ABCI application.
  78. new, err := rts.syncer.AddSnapshot(peerAID, s)
  79. require.NoError(t, err)
  80. require.True(t, new)
  81. new, err = rts.syncer.AddSnapshot(peerBID, s)
  82. require.NoError(t, err)
  83. require.False(t, new)
  84. s2 := &snapshot{Height: 2, Format: 2, Chunks: 3, Hash: []byte{1}}
  85. new, err = rts.syncer.AddSnapshot(peerBID, s2)
  86. require.NoError(t, err)
  87. require.True(t, new)
  88. new, err = rts.syncer.AddSnapshot(peerCID, s2)
  89. require.NoError(t, err)
  90. require.False(t, new)
  91. // We start a sync, with peers sending back chunks when requested. We first reject the snapshot
  92. // with height 2 format 2, and accept the snapshot at height 1.
  93. connSnapshot.On("OfferSnapshot", mock.Anything, abci.RequestOfferSnapshot{
  94. Snapshot: &abci.Snapshot{
  95. Height: 2,
  96. Format: 2,
  97. Chunks: 3,
  98. Hash: []byte{1},
  99. },
  100. AppHash: []byte("app_hash_2"),
  101. }).Return(&abci.ResponseOfferSnapshot{Result: abci.ResponseOfferSnapshot_REJECT_FORMAT}, nil)
  102. connSnapshot.On("OfferSnapshot", mock.Anything, abci.RequestOfferSnapshot{
  103. Snapshot: &abci.Snapshot{
  104. Height: s.Height,
  105. Format: s.Format,
  106. Chunks: s.Chunks,
  107. Hash: s.Hash,
  108. Metadata: s.Metadata,
  109. },
  110. AppHash: []byte("app_hash"),
  111. }).Times(2).Return(&abci.ResponseOfferSnapshot{Result: abci.ResponseOfferSnapshot_ACCEPT}, nil)
  112. chunkRequests := make(map[uint32]int)
  113. chunkRequestsMtx := sync.Mutex{}
  114. chunkProcessDone := make(chan struct{})
  115. go func() {
  116. defer close(chunkProcessDone)
  117. var seen int
  118. for {
  119. if seen >= 4 {
  120. return
  121. }
  122. select {
  123. case <-ctx.Done():
  124. t.Logf("sent %d chunks", seen)
  125. return
  126. case e := <-rts.chunkOutCh:
  127. msg, ok := e.Message.(*ssproto.ChunkRequest)
  128. assert.True(t, ok)
  129. assert.EqualValues(t, 1, msg.Height)
  130. assert.EqualValues(t, 1, msg.Format)
  131. assert.LessOrEqual(t, msg.Index, uint32(len(chunks)))
  132. added, err := rts.syncer.AddChunk(chunks[msg.Index])
  133. assert.NoError(t, err)
  134. assert.True(t, added)
  135. chunkRequestsMtx.Lock()
  136. chunkRequests[msg.Index]++
  137. chunkRequestsMtx.Unlock()
  138. seen++
  139. t.Logf("added chunk (%d of 4): %d", seen, msg.Index)
  140. }
  141. }
  142. }()
  143. // The first time we're applying chunk 2 we tell it to retry the snapshot and discard chunk 1,
  144. // which should cause it to keep the existing chunk 0 and 2, and restart restoration from
  145. // beginning. We also wait for a little while, to exercise the retry logic in fetchChunks().
  146. connSnapshot.On("ApplySnapshotChunk", mock.Anything, abci.RequestApplySnapshotChunk{
  147. Index: 2, Chunk: []byte{1, 1, 2},
  148. }).Once().Run(func(args mock.Arguments) { time.Sleep(1 * time.Second) }).Return(
  149. &abci.ResponseApplySnapshotChunk{
  150. Result: abci.ResponseApplySnapshotChunk_RETRY_SNAPSHOT,
  151. RefetchChunks: []uint32{1},
  152. }, nil)
  153. connSnapshot.On("ApplySnapshotChunk", mock.Anything, abci.RequestApplySnapshotChunk{
  154. Index: 0, Chunk: []byte{1, 1, 0},
  155. }).Times(2).Return(&abci.ResponseApplySnapshotChunk{Result: abci.ResponseApplySnapshotChunk_ACCEPT}, nil)
  156. connSnapshot.On("ApplySnapshotChunk", mock.Anything, abci.RequestApplySnapshotChunk{
  157. Index: 1, Chunk: []byte{1, 1, 1},
  158. }).Times(2).Return(&abci.ResponseApplySnapshotChunk{Result: abci.ResponseApplySnapshotChunk_ACCEPT}, nil)
  159. connSnapshot.On("ApplySnapshotChunk", mock.Anything, abci.RequestApplySnapshotChunk{
  160. Index: 2, Chunk: []byte{1, 1, 2},
  161. }).Once().Return(&abci.ResponseApplySnapshotChunk{Result: abci.ResponseApplySnapshotChunk_ACCEPT}, nil)
  162. connQuery.On("Info", mock.Anything, proxy.RequestInfo).Return(&abci.ResponseInfo{
  163. AppVersion: testAppVersion,
  164. LastBlockHeight: 1,
  165. LastBlockAppHash: []byte("app_hash"),
  166. }, nil)
  167. newState, lastCommit, err := rts.syncer.SyncAny(ctx, 0, func() error { return nil })
  168. require.NoError(t, err)
  169. <-chunkProcessDone
  170. chunkRequestsMtx.Lock()
  171. require.Equal(t, map[uint32]int{0: 1, 1: 2, 2: 1}, chunkRequests)
  172. chunkRequestsMtx.Unlock()
  173. expectState := state
  174. require.Equal(t, expectState, newState)
  175. require.Equal(t, commit, lastCommit)
  176. require.Equal(t, len(chunks), int(rts.syncer.processingSnapshot.Chunks))
  177. require.Equal(t, expectState.LastBlockHeight, rts.syncer.lastSyncedSnapshotHeight)
  178. require.True(t, rts.syncer.avgChunkTime > 0)
  179. require.Equal(t, int64(rts.syncer.processingSnapshot.Chunks), rts.reactor.SnapshotChunksTotal())
  180. require.Equal(t, rts.syncer.lastSyncedSnapshotHeight, rts.reactor.SnapshotHeight())
  181. require.Equal(t, time.Duration(rts.syncer.avgChunkTime), rts.reactor.ChunkProcessAvgTime())
  182. require.Equal(t, int64(len(rts.syncer.snapshots.snapshots)), rts.reactor.TotalSnapshots())
  183. require.Equal(t, int64(0), rts.reactor.SnapshotChunksCount())
  184. connSnapshot.AssertExpectations(t)
  185. connQuery.AssertExpectations(t)
  186. }
  187. func TestSyncer_SyncAny_noSnapshots(t *testing.T) {
  188. stateProvider := &mocks.StateProvider{}
  189. stateProvider.On("AppHash", mock.Anything, mock.Anything).Return([]byte("app_hash"), nil)
  190. ctx, cancel := context.WithCancel(context.Background())
  191. defer cancel()
  192. rts := setup(ctx, t, nil, nil, stateProvider, 2)
  193. _, _, err := rts.syncer.SyncAny(ctx, 0, func() error { return nil })
  194. require.Equal(t, errNoSnapshots, err)
  195. }
  196. func TestSyncer_SyncAny_abort(t *testing.T) {
  197. stateProvider := &mocks.StateProvider{}
  198. stateProvider.On("AppHash", mock.Anything, mock.Anything).Return([]byte("app_hash"), nil)
  199. ctx, cancel := context.WithCancel(context.Background())
  200. defer cancel()
  201. rts := setup(ctx, t, nil, nil, stateProvider, 2)
  202. s := &snapshot{Height: 1, Format: 1, Chunks: 3, Hash: []byte{1, 2, 3}}
  203. peerID := types.NodeID("aa")
  204. _, err := rts.syncer.AddSnapshot(peerID, s)
  205. require.NoError(t, err)
  206. rts.conn.On("OfferSnapshot", mock.Anything, abci.RequestOfferSnapshot{
  207. Snapshot: toABCI(s), AppHash: []byte("app_hash"),
  208. }).Once().Return(&abci.ResponseOfferSnapshot{Result: abci.ResponseOfferSnapshot_ABORT}, nil)
  209. _, _, err = rts.syncer.SyncAny(ctx, 0, func() error { return nil })
  210. require.Equal(t, errAbort, err)
  211. rts.conn.AssertExpectations(t)
  212. }
  213. func TestSyncer_SyncAny_reject(t *testing.T) {
  214. stateProvider := &mocks.StateProvider{}
  215. stateProvider.On("AppHash", mock.Anything, mock.Anything).Return([]byte("app_hash"), nil)
  216. ctx, cancel := context.WithCancel(context.Background())
  217. defer cancel()
  218. rts := setup(ctx, t, nil, nil, stateProvider, 2)
  219. // s22 is tried first, then s12, then s11, then errNoSnapshots
  220. s22 := &snapshot{Height: 2, Format: 2, Chunks: 3, Hash: []byte{1, 2, 3}}
  221. s12 := &snapshot{Height: 1, Format: 2, Chunks: 3, Hash: []byte{1, 2, 3}}
  222. s11 := &snapshot{Height: 1, Format: 1, Chunks: 3, Hash: []byte{1, 2, 3}}
  223. peerID := types.NodeID("aa")
  224. _, err := rts.syncer.AddSnapshot(peerID, s22)
  225. require.NoError(t, err)
  226. _, err = rts.syncer.AddSnapshot(peerID, s12)
  227. require.NoError(t, err)
  228. _, err = rts.syncer.AddSnapshot(peerID, s11)
  229. require.NoError(t, err)
  230. rts.conn.On("OfferSnapshot", mock.Anything, abci.RequestOfferSnapshot{
  231. Snapshot: toABCI(s22), AppHash: []byte("app_hash"),
  232. }).Once().Return(&abci.ResponseOfferSnapshot{Result: abci.ResponseOfferSnapshot_REJECT}, nil)
  233. rts.conn.On("OfferSnapshot", mock.Anything, abci.RequestOfferSnapshot{
  234. Snapshot: toABCI(s12), AppHash: []byte("app_hash"),
  235. }).Once().Return(&abci.ResponseOfferSnapshot{Result: abci.ResponseOfferSnapshot_REJECT}, nil)
  236. rts.conn.On("OfferSnapshot", mock.Anything, abci.RequestOfferSnapshot{
  237. Snapshot: toABCI(s11), AppHash: []byte("app_hash"),
  238. }).Once().Return(&abci.ResponseOfferSnapshot{Result: abci.ResponseOfferSnapshot_REJECT}, nil)
  239. _, _, err = rts.syncer.SyncAny(ctx, 0, func() error { return nil })
  240. require.Equal(t, errNoSnapshots, err)
  241. rts.conn.AssertExpectations(t)
  242. }
  243. func TestSyncer_SyncAny_reject_format(t *testing.T) {
  244. stateProvider := &mocks.StateProvider{}
  245. stateProvider.On("AppHash", mock.Anything, mock.Anything).Return([]byte("app_hash"), nil)
  246. ctx, cancel := context.WithCancel(context.Background())
  247. defer cancel()
  248. rts := setup(ctx, t, nil, nil, stateProvider, 2)
  249. // s22 is tried first, which reject s22 and s12, then s11 will abort.
  250. s22 := &snapshot{Height: 2, Format: 2, Chunks: 3, Hash: []byte{1, 2, 3}}
  251. s12 := &snapshot{Height: 1, Format: 2, Chunks: 3, Hash: []byte{1, 2, 3}}
  252. s11 := &snapshot{Height: 1, Format: 1, Chunks: 3, Hash: []byte{1, 2, 3}}
  253. peerID := types.NodeID("aa")
  254. _, err := rts.syncer.AddSnapshot(peerID, s22)
  255. require.NoError(t, err)
  256. _, err = rts.syncer.AddSnapshot(peerID, s12)
  257. require.NoError(t, err)
  258. _, err = rts.syncer.AddSnapshot(peerID, s11)
  259. require.NoError(t, err)
  260. rts.conn.On("OfferSnapshot", mock.Anything, abci.RequestOfferSnapshot{
  261. Snapshot: toABCI(s22), AppHash: []byte("app_hash"),
  262. }).Once().Return(&abci.ResponseOfferSnapshot{Result: abci.ResponseOfferSnapshot_REJECT_FORMAT}, nil)
  263. rts.conn.On("OfferSnapshot", mock.Anything, abci.RequestOfferSnapshot{
  264. Snapshot: toABCI(s11), AppHash: []byte("app_hash"),
  265. }).Once().Return(&abci.ResponseOfferSnapshot{Result: abci.ResponseOfferSnapshot_ABORT}, nil)
  266. _, _, err = rts.syncer.SyncAny(ctx, 0, func() error { return nil })
  267. require.Equal(t, errAbort, err)
  268. rts.conn.AssertExpectations(t)
  269. }
  270. func TestSyncer_SyncAny_reject_sender(t *testing.T) {
  271. stateProvider := &mocks.StateProvider{}
  272. stateProvider.On("AppHash", mock.Anything, mock.Anything).Return([]byte("app_hash"), nil)
  273. ctx, cancel := context.WithCancel(context.Background())
  274. defer cancel()
  275. rts := setup(ctx, t, nil, nil, stateProvider, 2)
  276. peerAID := types.NodeID("aa")
  277. peerBID := types.NodeID("bb")
  278. peerCID := types.NodeID("cc")
  279. // sbc will be offered first, which will be rejected with reject_sender, causing all snapshots
  280. // submitted by both b and c (i.e. sb, sc, sbc) to be rejected. Finally, sa will reject and
  281. // errNoSnapshots is returned.
  282. sa := &snapshot{Height: 1, Format: 1, Chunks: 3, Hash: []byte{1, 2, 3}}
  283. sb := &snapshot{Height: 2, Format: 1, Chunks: 3, Hash: []byte{1, 2, 3}}
  284. sc := &snapshot{Height: 3, Format: 1, Chunks: 3, Hash: []byte{1, 2, 3}}
  285. sbc := &snapshot{Height: 4, Format: 1, Chunks: 3, Hash: []byte{1, 2, 3}}
  286. _, err := rts.syncer.AddSnapshot(peerAID, sa)
  287. require.NoError(t, err)
  288. _, err = rts.syncer.AddSnapshot(peerBID, sb)
  289. require.NoError(t, err)
  290. _, err = rts.syncer.AddSnapshot(peerCID, sc)
  291. require.NoError(t, err)
  292. _, err = rts.syncer.AddSnapshot(peerBID, sbc)
  293. require.NoError(t, err)
  294. _, err = rts.syncer.AddSnapshot(peerCID, sbc)
  295. require.NoError(t, err)
  296. rts.conn.On("OfferSnapshot", mock.Anything, abci.RequestOfferSnapshot{
  297. Snapshot: toABCI(sbc), AppHash: []byte("app_hash"),
  298. }).Once().Return(&abci.ResponseOfferSnapshot{Result: abci.ResponseOfferSnapshot_REJECT_SENDER}, nil)
  299. rts.conn.On("OfferSnapshot", mock.Anything, abci.RequestOfferSnapshot{
  300. Snapshot: toABCI(sa), AppHash: []byte("app_hash"),
  301. }).Once().Return(&abci.ResponseOfferSnapshot{Result: abci.ResponseOfferSnapshot_REJECT}, nil)
  302. _, _, err = rts.syncer.SyncAny(ctx, 0, func() error { return nil })
  303. require.Equal(t, errNoSnapshots, err)
  304. rts.conn.AssertExpectations(t)
  305. }
  306. func TestSyncer_SyncAny_abciError(t *testing.T) {
  307. stateProvider := &mocks.StateProvider{}
  308. stateProvider.On("AppHash", mock.Anything, mock.Anything).Return([]byte("app_hash"), nil)
  309. ctx, cancel := context.WithCancel(context.Background())
  310. defer cancel()
  311. rts := setup(ctx, t, nil, nil, stateProvider, 2)
  312. errBoom := errors.New("boom")
  313. s := &snapshot{Height: 1, Format: 1, Chunks: 3, Hash: []byte{1, 2, 3}}
  314. peerID := types.NodeID("aa")
  315. _, err := rts.syncer.AddSnapshot(peerID, s)
  316. require.NoError(t, err)
  317. rts.conn.On("OfferSnapshot", mock.Anything, abci.RequestOfferSnapshot{
  318. Snapshot: toABCI(s), AppHash: []byte("app_hash"),
  319. }).Once().Return(nil, errBoom)
  320. _, _, err = rts.syncer.SyncAny(ctx, 0, func() error { return nil })
  321. require.True(t, errors.Is(err, errBoom))
  322. rts.conn.AssertExpectations(t)
  323. }
  324. func TestSyncer_offerSnapshot(t *testing.T) {
  325. unknownErr := errors.New("unknown error")
  326. boom := errors.New("boom")
  327. testcases := map[string]struct {
  328. result abci.ResponseOfferSnapshot_Result
  329. err error
  330. expectErr error
  331. }{
  332. "accept": {abci.ResponseOfferSnapshot_ACCEPT, nil, nil},
  333. "abort": {abci.ResponseOfferSnapshot_ABORT, nil, errAbort},
  334. "reject": {abci.ResponseOfferSnapshot_REJECT, nil, errRejectSnapshot},
  335. "reject_format": {abci.ResponseOfferSnapshot_REJECT_FORMAT, nil, errRejectFormat},
  336. "reject_sender": {abci.ResponseOfferSnapshot_REJECT_SENDER, nil, errRejectSender},
  337. "unknown": {abci.ResponseOfferSnapshot_UNKNOWN, nil, unknownErr},
  338. "error": {0, boom, boom},
  339. "unknown non-zero": {9, nil, unknownErr},
  340. }
  341. ctx, cancel := context.WithCancel(context.Background())
  342. defer cancel()
  343. for name, tc := range testcases {
  344. tc := tc
  345. t.Run(name, func(t *testing.T) {
  346. ctx, cancel := context.WithCancel(ctx)
  347. defer cancel()
  348. stateProvider := &mocks.StateProvider{}
  349. stateProvider.On("AppHash", mock.Anything, mock.Anything).Return([]byte("app_hash"), nil)
  350. rts := setup(ctx, t, nil, nil, stateProvider, 2)
  351. s := &snapshot{Height: 1, Format: 1, Chunks: 3, Hash: []byte{1, 2, 3}, trustedAppHash: []byte("app_hash")}
  352. rts.conn.On("OfferSnapshot", mock.Anything, abci.RequestOfferSnapshot{
  353. Snapshot: toABCI(s),
  354. AppHash: []byte("app_hash"),
  355. }).Return(&abci.ResponseOfferSnapshot{Result: tc.result}, tc.err)
  356. err := rts.syncer.offerSnapshot(ctx, s)
  357. if tc.expectErr == unknownErr {
  358. require.Error(t, err)
  359. } else {
  360. unwrapped := errors.Unwrap(err)
  361. if unwrapped != nil {
  362. err = unwrapped
  363. }
  364. require.Equal(t, tc.expectErr, err)
  365. }
  366. })
  367. }
  368. }
  369. func TestSyncer_applyChunks_Results(t *testing.T) {
  370. unknownErr := errors.New("unknown error")
  371. boom := errors.New("boom")
  372. testcases := map[string]struct {
  373. result abci.ResponseApplySnapshotChunk_Result
  374. err error
  375. expectErr error
  376. }{
  377. "accept": {abci.ResponseApplySnapshotChunk_ACCEPT, nil, nil},
  378. "abort": {abci.ResponseApplySnapshotChunk_ABORT, nil, errAbort},
  379. "retry": {abci.ResponseApplySnapshotChunk_RETRY, nil, nil},
  380. "retry_snapshot": {abci.ResponseApplySnapshotChunk_RETRY_SNAPSHOT, nil, errRetrySnapshot},
  381. "reject_snapshot": {abci.ResponseApplySnapshotChunk_REJECT_SNAPSHOT, nil, errRejectSnapshot},
  382. "unknown": {abci.ResponseApplySnapshotChunk_UNKNOWN, nil, unknownErr},
  383. "error": {0, boom, boom},
  384. "unknown non-zero": {9, nil, unknownErr},
  385. }
  386. ctx, cancel := context.WithCancel(context.Background())
  387. defer cancel()
  388. for name, tc := range testcases {
  389. tc := tc
  390. t.Run(name, func(t *testing.T) {
  391. ctx, cancel := context.WithCancel(ctx)
  392. defer cancel()
  393. stateProvider := &mocks.StateProvider{}
  394. stateProvider.On("AppHash", mock.Anything, mock.Anything).Return([]byte("app_hash"), nil)
  395. rts := setup(ctx, t, nil, nil, stateProvider, 2)
  396. body := []byte{1, 2, 3}
  397. chunks, err := newChunkQueue(&snapshot{Height: 1, Format: 1, Chunks: 1}, t.TempDir())
  398. require.NoError(t, err)
  399. fetchStartTime := time.Now()
  400. _, err = chunks.Add(&chunk{Height: 1, Format: 1, Index: 0, Chunk: body})
  401. require.NoError(t, err)
  402. rts.conn.On("ApplySnapshotChunk", mock.Anything, abci.RequestApplySnapshotChunk{
  403. Index: 0, Chunk: body,
  404. }).Once().Return(&abci.ResponseApplySnapshotChunk{Result: tc.result}, tc.err)
  405. if tc.result == abci.ResponseApplySnapshotChunk_RETRY {
  406. rts.conn.On("ApplySnapshotChunk", mock.Anything, abci.RequestApplySnapshotChunk{
  407. Index: 0, Chunk: body,
  408. }).Once().Return(&abci.ResponseApplySnapshotChunk{
  409. Result: abci.ResponseApplySnapshotChunk_ACCEPT}, nil)
  410. }
  411. err = rts.syncer.applyChunks(ctx, chunks, fetchStartTime)
  412. if tc.expectErr == unknownErr {
  413. require.Error(t, err)
  414. } else {
  415. unwrapped := errors.Unwrap(err)
  416. if unwrapped != nil {
  417. err = unwrapped
  418. }
  419. require.Equal(t, tc.expectErr, err)
  420. }
  421. rts.conn.AssertExpectations(t)
  422. })
  423. }
  424. }
  425. func TestSyncer_applyChunks_RefetchChunks(t *testing.T) {
  426. // Discarding chunks via refetch_chunks should work the same for all results
  427. testcases := map[string]struct {
  428. result abci.ResponseApplySnapshotChunk_Result
  429. }{
  430. "accept": {abci.ResponseApplySnapshotChunk_ACCEPT},
  431. "abort": {abci.ResponseApplySnapshotChunk_ABORT},
  432. "retry": {abci.ResponseApplySnapshotChunk_RETRY},
  433. "retry_snapshot": {abci.ResponseApplySnapshotChunk_RETRY_SNAPSHOT},
  434. "reject_snapshot": {abci.ResponseApplySnapshotChunk_REJECT_SNAPSHOT},
  435. }
  436. ctx, cancel := context.WithCancel(context.Background())
  437. defer cancel()
  438. for name, tc := range testcases {
  439. tc := tc
  440. t.Run(name, func(t *testing.T) {
  441. ctx, cancel := context.WithCancel(ctx)
  442. defer cancel()
  443. stateProvider := &mocks.StateProvider{}
  444. stateProvider.On("AppHash", mock.Anything, mock.Anything).Return([]byte("app_hash"), nil)
  445. rts := setup(ctx, t, nil, nil, stateProvider, 2)
  446. chunks, err := newChunkQueue(&snapshot{Height: 1, Format: 1, Chunks: 3}, t.TempDir())
  447. require.NoError(t, err)
  448. fetchStartTime := time.Now()
  449. added, err := chunks.Add(&chunk{Height: 1, Format: 1, Index: 0, Chunk: []byte{0}})
  450. require.True(t, added)
  451. require.NoError(t, err)
  452. added, err = chunks.Add(&chunk{Height: 1, Format: 1, Index: 1, Chunk: []byte{1}})
  453. require.True(t, added)
  454. require.NoError(t, err)
  455. added, err = chunks.Add(&chunk{Height: 1, Format: 1, Index: 2, Chunk: []byte{2}})
  456. require.True(t, added)
  457. require.NoError(t, err)
  458. // The first two chunks are accepted, before the last one asks for 1 to be refetched
  459. rts.conn.On("ApplySnapshotChunk", mock.Anything, abci.RequestApplySnapshotChunk{
  460. Index: 0, Chunk: []byte{0},
  461. }).Once().Return(&abci.ResponseApplySnapshotChunk{Result: abci.ResponseApplySnapshotChunk_ACCEPT}, nil)
  462. rts.conn.On("ApplySnapshotChunk", mock.Anything, abci.RequestApplySnapshotChunk{
  463. Index: 1, Chunk: []byte{1},
  464. }).Once().Return(&abci.ResponseApplySnapshotChunk{Result: abci.ResponseApplySnapshotChunk_ACCEPT}, nil)
  465. rts.conn.On("ApplySnapshotChunk", mock.Anything, abci.RequestApplySnapshotChunk{
  466. Index: 2, Chunk: []byte{2},
  467. }).Once().Return(&abci.ResponseApplySnapshotChunk{
  468. Result: tc.result,
  469. RefetchChunks: []uint32{1},
  470. }, nil)
  471. // Since removing the chunk will cause Next() to block, we spawn a goroutine, then
  472. // check the queue contents, and finally close the queue to end the goroutine.
  473. // We don't really care about the result of applyChunks, since it has separate test.
  474. go func() {
  475. rts.syncer.applyChunks(ctx, chunks, fetchStartTime) //nolint:errcheck // purposefully ignore error
  476. }()
  477. time.Sleep(50 * time.Millisecond)
  478. require.True(t, chunks.Has(0))
  479. require.False(t, chunks.Has(1))
  480. require.True(t, chunks.Has(2))
  481. require.NoError(t, chunks.Close())
  482. })
  483. }
  484. }
  485. func TestSyncer_applyChunks_RejectSenders(t *testing.T) {
  486. // Banning chunks senders via ban_chunk_senders should work the same for all results
  487. testcases := map[string]struct {
  488. result abci.ResponseApplySnapshotChunk_Result
  489. }{
  490. "accept": {abci.ResponseApplySnapshotChunk_ACCEPT},
  491. "abort": {abci.ResponseApplySnapshotChunk_ABORT},
  492. "retry": {abci.ResponseApplySnapshotChunk_RETRY},
  493. "retry_snapshot": {abci.ResponseApplySnapshotChunk_RETRY_SNAPSHOT},
  494. "reject_snapshot": {abci.ResponseApplySnapshotChunk_REJECT_SNAPSHOT},
  495. }
  496. ctx, cancel := context.WithCancel(context.Background())
  497. defer cancel()
  498. for name, tc := range testcases {
  499. tc := tc
  500. t.Run(name, func(t *testing.T) {
  501. ctx, cancel := context.WithCancel(ctx)
  502. defer cancel()
  503. stateProvider := &mocks.StateProvider{}
  504. stateProvider.On("AppHash", mock.Anything, mock.Anything).Return([]byte("app_hash"), nil)
  505. rts := setup(ctx, t, nil, nil, stateProvider, 2)
  506. // Set up three peers across two snapshots, and ask for one of them to be banned.
  507. // It should be banned from all snapshots.
  508. peerAID := types.NodeID("aa")
  509. peerBID := types.NodeID("bb")
  510. peerCID := types.NodeID("cc")
  511. s1 := &snapshot{Height: 1, Format: 1, Chunks: 3}
  512. s2 := &snapshot{Height: 2, Format: 1, Chunks: 3}
  513. _, err := rts.syncer.AddSnapshot(peerAID, s1)
  514. require.NoError(t, err)
  515. _, err = rts.syncer.AddSnapshot(peerAID, s2)
  516. require.NoError(t, err)
  517. _, err = rts.syncer.AddSnapshot(peerBID, s1)
  518. require.NoError(t, err)
  519. _, err = rts.syncer.AddSnapshot(peerBID, s2)
  520. require.NoError(t, err)
  521. _, err = rts.syncer.AddSnapshot(peerCID, s1)
  522. require.NoError(t, err)
  523. _, err = rts.syncer.AddSnapshot(peerCID, s2)
  524. require.NoError(t, err)
  525. chunks, err := newChunkQueue(s1, t.TempDir())
  526. require.NoError(t, err)
  527. fetchStartTime := time.Now()
  528. added, err := chunks.Add(&chunk{Height: 1, Format: 1, Index: 0, Chunk: []byte{0}, Sender: peerAID})
  529. require.True(t, added)
  530. require.NoError(t, err)
  531. added, err = chunks.Add(&chunk{Height: 1, Format: 1, Index: 1, Chunk: []byte{1}, Sender: peerBID})
  532. require.True(t, added)
  533. require.NoError(t, err)
  534. added, err = chunks.Add(&chunk{Height: 1, Format: 1, Index: 2, Chunk: []byte{2}, Sender: peerCID})
  535. require.True(t, added)
  536. require.NoError(t, err)
  537. // The first two chunks are accepted, before the last one asks for b sender to be rejected
  538. rts.conn.On("ApplySnapshotChunk", mock.Anything, abci.RequestApplySnapshotChunk{
  539. Index: 0, Chunk: []byte{0}, Sender: "aa",
  540. }).Once().Return(&abci.ResponseApplySnapshotChunk{Result: abci.ResponseApplySnapshotChunk_ACCEPT}, nil)
  541. rts.conn.On("ApplySnapshotChunk", mock.Anything, abci.RequestApplySnapshotChunk{
  542. Index: 1, Chunk: []byte{1}, Sender: "bb",
  543. }).Once().Return(&abci.ResponseApplySnapshotChunk{Result: abci.ResponseApplySnapshotChunk_ACCEPT}, nil)
  544. rts.conn.On("ApplySnapshotChunk", mock.Anything, abci.RequestApplySnapshotChunk{
  545. Index: 2, Chunk: []byte{2}, Sender: "cc",
  546. }).Once().Return(&abci.ResponseApplySnapshotChunk{
  547. Result: tc.result,
  548. RejectSenders: []string{string(peerBID)},
  549. }, nil)
  550. // On retry, the last chunk will be tried again, so we just accept it then.
  551. if tc.result == abci.ResponseApplySnapshotChunk_RETRY {
  552. rts.conn.On("ApplySnapshotChunk", mock.Anything, abci.RequestApplySnapshotChunk{
  553. Index: 2, Chunk: []byte{2}, Sender: "cc",
  554. }).Once().Return(&abci.ResponseApplySnapshotChunk{Result: abci.ResponseApplySnapshotChunk_ACCEPT}, nil)
  555. }
  556. // We don't really care about the result of applyChunks, since it has separate test.
  557. // However, it will block on e.g. retry result, so we spawn a goroutine that will
  558. // be shut down when the chunk queue closes.
  559. go func() {
  560. rts.syncer.applyChunks(ctx, chunks, fetchStartTime) //nolint:errcheck // purposefully ignore error
  561. }()
  562. time.Sleep(50 * time.Millisecond)
  563. s1peers := rts.syncer.snapshots.GetPeers(s1)
  564. require.Len(t, s1peers, 2)
  565. require.EqualValues(t, "aa", s1peers[0])
  566. require.EqualValues(t, "cc", s1peers[1])
  567. rts.syncer.snapshots.GetPeers(s1)
  568. require.Len(t, s1peers, 2)
  569. require.EqualValues(t, "aa", s1peers[0])
  570. require.EqualValues(t, "cc", s1peers[1])
  571. require.NoError(t, chunks.Close())
  572. })
  573. }
  574. }
  575. func TestSyncer_verifyApp(t *testing.T) {
  576. boom := errors.New("boom")
  577. const appVersion = 9
  578. appVersionMismatchErr := errors.New("app version mismatch. Expected: 9, got: 2")
  579. s := &snapshot{Height: 3, Format: 1, Chunks: 5, Hash: []byte{1, 2, 3}, trustedAppHash: []byte("app_hash")}
  580. testcases := map[string]struct {
  581. response *abci.ResponseInfo
  582. err error
  583. expectErr error
  584. }{
  585. "verified": {&abci.ResponseInfo{
  586. LastBlockHeight: 3,
  587. LastBlockAppHash: []byte("app_hash"),
  588. AppVersion: appVersion,
  589. }, nil, nil},
  590. "invalid app version": {&abci.ResponseInfo{
  591. LastBlockHeight: 3,
  592. LastBlockAppHash: []byte("app_hash"),
  593. AppVersion: 2,
  594. }, nil, appVersionMismatchErr},
  595. "invalid height": {&abci.ResponseInfo{
  596. LastBlockHeight: 5,
  597. LastBlockAppHash: []byte("app_hash"),
  598. AppVersion: appVersion,
  599. }, nil, errVerifyFailed},
  600. "invalid hash": {&abci.ResponseInfo{
  601. LastBlockHeight: 3,
  602. LastBlockAppHash: []byte("xxx"),
  603. AppVersion: appVersion,
  604. }, nil, errVerifyFailed},
  605. "error": {nil, boom, boom},
  606. }
  607. ctx, cancel := context.WithCancel(context.Background())
  608. defer cancel()
  609. for name, tc := range testcases {
  610. tc := tc
  611. t.Run(name, func(t *testing.T) {
  612. ctx, cancel := context.WithCancel(ctx)
  613. defer cancel()
  614. rts := setup(ctx, t, nil, nil, nil, 2)
  615. rts.connQuery.On("Info", mock.Anything, proxy.RequestInfo).Return(tc.response, tc.err)
  616. err := rts.syncer.verifyApp(ctx, s, appVersion)
  617. unwrapped := errors.Unwrap(err)
  618. if unwrapped != nil {
  619. err = unwrapped
  620. }
  621. require.Equal(t, tc.expectErr, err)
  622. })
  623. }
  624. }
  625. func toABCI(s *snapshot) *abci.Snapshot {
  626. return &abci.Snapshot{
  627. Height: s.Height,
  628. Format: s.Format,
  629. Chunks: s.Chunks,
  630. Hash: s.Hash,
  631. Metadata: s.Metadata,
  632. }
  633. }