
statesync: remove deadlock on init fail (#7029)

When statesync is stopped during shutdown, it can deadlock. A dump of goroutines reveals that this is related to the peerUpdates channel not returning anything on its `Done()` channel when `OnStop` is called. While this is occurring, `processPeerUpdate` is attempting to acquire the reactor lock, which it can never acquire. I looked for the places where the lock may accidentally remain held and cleaned them up in hopes of eradicating the issue. Dumps of the relevant goroutines may be found below. Note that the line numbers below are relative to the code in the `v0.35.0-rc1` tag.

```
goroutine 36 [chan receive]:
github.com/tendermint/tendermint/internal/statesync.(*Reactor).OnStop(0xc00058f200)
        github.com/tendermint/tendermint/internal/statesync/reactor.go:243 +0x117
github.com/tendermint/tendermint/libs/service.(*BaseService).Stop(0xc00058f200, 0x0, 0x0)
        github.com/tendermint/tendermint/libs/service/service.go:171 +0x323
github.com/tendermint/tendermint/node.(*nodeImpl).OnStop(0xc0001ea240)
        github.com/tendermint/tendermint/node/node.go:769 +0x132
github.com/tendermint/tendermint/libs/service.(*BaseService).Stop(0xc0001ea240, 0x0, 0x0)
        github.com/tendermint/tendermint/libs/service/service.go:171 +0x323
github.com/tendermint/tendermint/cmd/tendermint/commands.NewRunNodeCmd.func1.1()
        github.com/tendermint/tendermint/cmd/tendermint/commands/run_node.go:143 +0x62
github.com/tendermint/tendermint/libs/os.TrapSignal.func1(0xc000629500, 0x7fdb52f96358, 0xc0002b5030, 0xc00000daa0)
        github.com/tendermint/tendermint/libs/os/os.go:26 +0x102
created by github.com/tendermint/tendermint/libs/os.TrapSignal
        github.com/tendermint/tendermint/libs/os/os.go:22 +0xe6

goroutine 188 [semacquire]:
sync.runtime_SemacquireMutex(0xc00026b1cc, 0x0, 0x1)
        runtime/sema.go:71 +0x47
sync.(*Mutex).lockSlow(0xc00026b1c8)
        sync/mutex.go:138 +0x105
sync.(*Mutex).Lock(...)
        sync/mutex.go:81
sync.(*RWMutex).Lock(0xc00026b1c8)
        sync/rwmutex.go:111 +0x90
github.com/tendermint/tendermint/internal/statesync.(*Reactor).processPeerUpdate(0xc00026b080, 0xc000650008, 0x28, 0x124de90, 0x4)
        github.com/tendermint/tendermint/internal/statesync/reactor.go:849 +0x1a5
github.com/tendermint/tendermint/internal/statesync.(*Reactor).processPeerUpdates(0xc00026b080)
        github.com/tendermint/tendermint/internal/statesync/reactor.go:883 +0xab
created by github.com/tendermint/tendermint/internal/statesync.(*Reactor).OnStart
        github.com/tendermint/tendermint/internal/statesync/reactor.go:219 +0xcd
```
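
The failure mode is a classic leaked mutex: an error path returns while the reactor mutex is still held, so every later `Lock()` blocks forever. A minimal, self-contained sketch of the pattern (illustrative only, not the actual reactor code; the type and method names are placeholders):

```go
package main

import (
	"errors"
	"fmt"
	"sync"
	"time"
)

type reactor struct {
	mtx sync.Mutex
}

// sync mimics Reactor.Sync: it takes the mutex, then hits an error path.
// Before this fix, that path returned WITHOUT unlocking, leaking the lock.
func (r *reactor) sync(fail bool) error {
	r.mtx.Lock()
	if fail {
		// BUG: the missing r.mtx.Unlock() here is exactly what the commit adds.
		return errors.New("init state provider failed")
	}
	r.mtx.Unlock()
	return nil
}

// processPeerUpdate mimics the goroutine stuck in "semacquire" above.
func (r *reactor) processPeerUpdate() {
	r.mtx.Lock() // blocks forever once sync() has leaked the lock
	defer r.mtx.Unlock()
}

func main() {
	r := &reactor{}
	_ = r.sync(true) // error path leaves the mutex held

	done := make(chan struct{})
	go func() { r.processPeerUpdate(); close(done) }()

	select {
	case <-done:
		fmt.Println("peer update processed")
	case <-time.After(500 * time.Millisecond):
		fmt.Println("deadlocked: processPeerUpdate never acquires the lock")
	}
}
```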
3 changed files with 23 additions and 6 deletions:

1. internal/statesync/reactor.go (+7, -3)
2. internal/statesync/syncer.go (+12, -1)
3. internal/statesync/syncer_test.go (+4, -2)

internal/statesync/reactor.go (+7, -3)

```diff
@@ -280,6 +280,7 @@ func (r *Reactor) Sync(ctx context.Context) (sm.State, error) {
 	}
 	if err := r.initStateProvider(ctx, r.chainID, r.initialHeight); err != nil {
+		r.mtx.Unlock()
 		return sm.State{}, err
 	}
@@ -889,17 +890,20 @@ func (r *Reactor) processPeerUpdate(peerUpdate p2p.PeerUpdate) {
 	}
 	r.mtx.Lock()
-	defer r.mtx.Unlock()
 	if r.syncer == nil {
+		r.mtx.Unlock()
 		return
 	}
+	defer r.mtx.Unlock()
 	switch peerUpdate.Status {
 	case p2p.PeerStatusUp:
 		newProvider := NewBlockProvider(peerUpdate.NodeID, r.chainID, r.dispatcher)
 		r.providers[peerUpdate.NodeID] = newProvider
-		r.syncer.AddPeer(peerUpdate.NodeID)
+		err := r.syncer.AddPeer(peerUpdate.NodeID)
+		if err != nil {
+			r.Logger.Error("error adding peer to syncer", "error", err)
+			return
+		}
 		if sp, ok := r.stateProvider.(*stateProviderP2P); ok {
 			// we do this in a separate routine to not block whilst waiting for the light client to finish
 			// whatever call it's currently executing
```
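
Note the ordering in `processPeerUpdate`: the `r.syncer == nil` guard now unlocks explicitly before returning, and `defer r.mtx.Unlock()` is only registered once the guard has passed, so no return path can leave the mutex held.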


internal/statesync/syncer.go (+12, -1)

```diff
@@ -141,7 +141,17 @@ func (s *syncer) AddSnapshot(peerID types.NodeID, snapshot *snapshot) (bool, err
 // AddPeer adds a peer to the pool. For now we just keep it simple and send a
 // single request to discover snapshots, later we may want to do retries and stuff.
-func (s *syncer) AddPeer(peerID types.NodeID) {
+func (s *syncer) AddPeer(peerID types.NodeID) (err error) {
+	defer func() {
+		// TODO: remove panic recover once AddPeer can no longer accidentally send on
+		// a closed channel.
+		// This recover was added to protect against the p2p message being sent
+		// to the snapshot channel after the snapshot channel was closed.
+		if r := recover(); r != nil {
+			err = fmt.Errorf("panic sending peer snapshot request: %v", r)
+		}
+	}()
 	s.logger.Debug("Requesting snapshots from peer", "peer", peerID)
 	msg := p2p.Envelope{
@@ -153,6 +163,7 @@ func (s *syncer) AddPeer(peerID types.NodeID) {
 	case <-s.closeCh:
 	case s.snapshotCh <- msg:
 	}
+	return err
 }

 // RemovePeer removes a peer from the pool.
```
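
The named `err` return plus a deferred `recover()` converts a panic from sending on the closed snapshot channel into an ordinary error the caller can log. A minimal sketch of the same pattern in isolation (placeholder channel and message types, not the tendermint API):

```go
package main

import "fmt"

// trySend turns a send-on-closed-channel panic into an error via a
// named return value and a deferred recover, mirroring AddPeer above.
func trySend(ch chan<- string, msg string) (err error) {
	defer func() {
		if r := recover(); r != nil {
			err = fmt.Errorf("panic sending message: %v", r)
		}
	}()
	ch <- msg
	return nil
}

func main() {
	ch := make(chan string, 1)
	fmt.Println(trySend(ch, "hello")) // <nil>

	close(ch)
	fmt.Println(trySend(ch, "world")) // panic sending message: send on closed channel
}
```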


internal/statesync/syncer_test.go (+4, -2)

```diff
@@ -77,12 +77,14 @@ func TestSyncer_SyncAny(t *testing.T) {
 	require.Error(t, err)
 	// Adding a couple of peers should trigger snapshot discovery messages
-	rts.syncer.AddPeer(peerAID)
+	err = rts.syncer.AddPeer(peerAID)
+	require.NoError(t, err)
 	e := <-rts.snapshotOutCh
 	require.Equal(t, &ssproto.SnapshotsRequest{}, e.Message)
 	require.Equal(t, peerAID, e.To)
-	rts.syncer.AddPeer(peerBID)
+	err = rts.syncer.AddPeer(peerBID)
+	require.NoError(t, err)
 	e = <-rts.snapshotOutCh
 	require.Equal(t, &ssproto.SnapshotsRequest{}, e.Message)
 	require.Equal(t, peerBID, e.To)
```

