From 177850a2c95408d967c9f7c71ef463cf4ed61df9 Mon Sep 17 00:00:00 2001
From: William Banfield <4561443+williambanfield@users.noreply.github.com>
Date: Thu, 30 Sep 2021 15:19:10 -0400
Subject: [PATCH] statesync: remove deadlock on init fail (#7029)

When statesync is stopped during shutdown, it can deadlock. A dump of
goroutines reveals that this is related to the peerUpdates channel not
returning anything on its `Done()` channel when `OnStop` is called. As
this is occurring, `processPeerUpdate` is attempting to acquire the
reactor lock. It appears that this lock can never be acquired. I looked
for the places where the lock may accidentally remain held and cleaned
them up in hopes of eradicating the issue.

Dumps of the relevant goroutines may be found below. Note that the line
numbers below are relative to the code in the `v0.35.0-rc1` tag.

```
goroutine 36 [chan receive]:
github.com/tendermint/tendermint/internal/statesync.(*Reactor).OnStop(0xc00058f200)
        github.com/tendermint/tendermint/internal/statesync/reactor.go:243 +0x117
github.com/tendermint/tendermint/libs/service.(*BaseService).Stop(0xc00058f200, 0x0, 0x0)
        github.com/tendermint/tendermint/libs/service/service.go:171 +0x323
github.com/tendermint/tendermint/node.(*nodeImpl).OnStop(0xc0001ea240)
        github.com/tendermint/tendermint/node/node.go:769 +0x132
github.com/tendermint/tendermint/libs/service.(*BaseService).Stop(0xc0001ea240, 0x0, 0x0)
        github.com/tendermint/tendermint/libs/service/service.go:171 +0x323
github.com/tendermint/tendermint/cmd/tendermint/commands.NewRunNodeCmd.func1.1()
        github.com/tendermint/tendermint/cmd/tendermint/commands/run_node.go:143 +0x62
github.com/tendermint/tendermint/libs/os.TrapSignal.func1(0xc000629500, 0x7fdb52f96358, 0xc0002b5030, 0xc00000daa0)
        github.com/tendermint/tendermint/libs/os/os.go:26 +0x102
created by github.com/tendermint/tendermint/libs/os.TrapSignal
        github.com/tendermint/tendermint/libs/os/os.go:22 +0xe6

goroutine 188 [semacquire]:
sync.runtime_SemacquireMutex(0xc00026b1cc, 0x0, 0x1)
        runtime/sema.go:71 +0x47
sync.(*Mutex).lockSlow(0xc00026b1c8)
        sync/mutex.go:138 +0x105
sync.(*Mutex).Lock(...)
        sync/mutex.go:81
sync.(*RWMutex).Lock(0xc00026b1c8)
        sync/rwmutex.go:111 +0x90
github.com/tendermint/tendermint/internal/statesync.(*Reactor).processPeerUpdate(0xc00026b080, 0xc000650008, 0x28, 0x124de90, 0x4)
        github.com/tendermint/tendermint/internal/statesync/reactor.go:849 +0x1a5
github.com/tendermint/tendermint/internal/statesync.(*Reactor).processPeerUpdates(0xc00026b080)
        github.com/tendermint/tendermint/internal/statesync/reactor.go:883 +0xab
created by github.com/tendermint/tendermint/internal/statesync.(*Reactor).OnStart
        github.com/tendermint/tendermint/internal/statesync/reactor.go:219 +0xcd
```
---
 internal/statesync/reactor.go     | 10 +++++++---
 internal/statesync/syncer.go      | 13 ++++++++++++-
 internal/statesync/syncer_test.go |  6 ++++--
 3 files changed, 23 insertions(+), 6 deletions(-)

diff --git a/internal/statesync/reactor.go b/internal/statesync/reactor.go
index a20c8ca41..abfa973b5 100644
--- a/internal/statesync/reactor.go
+++ b/internal/statesync/reactor.go
@@ -280,6 +280,7 @@ func (r *Reactor) Sync(ctx context.Context) (sm.State, error) {
 	}
 
 	if err := r.initStateProvider(ctx, r.chainID, r.initialHeight); err != nil {
+		r.mtx.Unlock()
 		return sm.State{}, err
 	}
 
@@ -889,17 +890,20 @@ func (r *Reactor) processPeerUpdate(peerUpdate p2p.PeerUpdate) {
 	}
 
 	r.mtx.Lock()
+	defer r.mtx.Unlock()
 	if r.syncer == nil {
-		r.mtx.Unlock()
 		return
 	}
-	defer r.mtx.Unlock()
 
 	switch peerUpdate.Status {
 	case p2p.PeerStatusUp:
 		newProvider := NewBlockProvider(peerUpdate.NodeID, r.chainID, r.dispatcher)
 		r.providers[peerUpdate.NodeID] = newProvider
-		r.syncer.AddPeer(peerUpdate.NodeID)
+		err := r.syncer.AddPeer(peerUpdate.NodeID)
+		if err != nil {
+			r.Logger.Error("error adding peer to syncer", "error", err)
+			return
+		}
 		if sp, ok := r.stateProvider.(*stateProviderP2P); ok {
 			// we do this in a separate routine to not block whilst waiting for the light client to finish
 			// whatever call it's currently executing
diff --git a/internal/statesync/syncer.go b/internal/statesync/syncer.go
index 68bec6880..b4212961a 100644
--- a/internal/statesync/syncer.go
+++ b/internal/statesync/syncer.go
@@ -141,7 +141,17 @@ func (s *syncer) AddSnapshot(peerID types.NodeID, snapshot *snapshot) (bool, err
 
 // AddPeer adds a peer to the pool. For now we just keep it simple and send a
 // single request to discover snapshots, later we may want to do retries and stuff.
-func (s *syncer) AddPeer(peerID types.NodeID) {
+func (s *syncer) AddPeer(peerID types.NodeID) (err error) {
+	defer func() {
+		// TODO: remove panic recover once AddPeer can no longer accidentally send on
+		// closed channel.
+		// This recover was added to protect against the p2p message being sent
+		// to the snapshot channel after the snapshot channel was closed.
+		if r := recover(); r != nil {
+			err = fmt.Errorf("panic sending peer snapshot request: %v", r)
+		}
+	}()
+
 	s.logger.Debug("Requesting snapshots from peer", "peer", peerID)
 
 	msg := p2p.Envelope{
@@ -153,6 +163,7 @@ func (s *syncer) AddPeer(peerID types.NodeID) {
 	case <-s.closeCh:
 	case s.snapshotCh <- msg:
 	}
+	return err
 }
 
 // RemovePeer removes a peer from the pool.
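
For readers reviewing the hunks above, here is a minimal, self-contained Go
sketch of the lock-leak pattern the reactor.go changes eliminate. It is not
Tendermint code: the `reactor` type and the `buggySync`, `fixedSync`, and
`processPeerUpdate` functions are hypothetical stand-ins. An error return
that skips the unlock leaves the mutex held forever, while deferring the
unlock immediately after acquiring it covers every exit path.

```
package main

import (
	"errors"
	"fmt"
	"sync"
	"time"
)

type reactor struct {
	mtx    sync.RWMutex
	syncer *struct{} // nil until a sync is started
}

// buggySync mirrors the pre-patch shape of Sync: an error path returns
// while the mutex is still held.
func (r *reactor) buggySync(initErr error) error {
	r.mtx.Lock()
	if initErr != nil {
		return initErr // BUG: returns with r.mtx still locked
	}
	r.mtx.Unlock()
	return nil
}

// fixedSync unlocks on the error path, as the reactor.go patch does.
func (r *reactor) fixedSync(initErr error) error {
	r.mtx.Lock()
	if initErr != nil {
		r.mtx.Unlock()
		return initErr
	}
	r.mtx.Unlock()
	return nil
}

// processPeerUpdate mirrors the post-patch shape: deferring the unlock
// right after locking releases the mutex on every return path.
func (r *reactor) processPeerUpdate() {
	r.mtx.Lock()
	defer r.mtx.Unlock()
	if r.syncer == nil {
		return // safe: the deferred unlock still runs
	}
	// ... handle the update ...
}

func main() {
	r := &reactor{}
	_ = r.buggySync(errors.New("state provider init failed"))

	done := make(chan struct{})
	go func() {
		r.processPeerUpdate() // blocks forever: buggySync leaked the lock
		close(done)
	}()

	select {
	case <-done:
		fmt.Println("peer update processed")
	case <-time.After(500 * time.Millisecond):
		fmt.Println("deadlock: processPeerUpdate never acquired the lock")
	}
}
```

Running the sketch prints the deadlock message, because `buggySync` exits
with the mutex held; replacing the `buggySync` call with `fixedSync` lets
the peer update complete, which is the same effect the added
`r.mtx.Unlock()` and relocated `defer` have in the real reactor.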
diff --git a/internal/statesync/syncer_test.go b/internal/statesync/syncer_test.go
index 14297c049..ad902a54c 100644
--- a/internal/statesync/syncer_test.go
+++ b/internal/statesync/syncer_test.go
@@ -77,12 +77,14 @@ func TestSyncer_SyncAny(t *testing.T) {
 	require.Error(t, err)
 
 	// Adding a couple of peers should trigger snapshot discovery messages
-	rts.syncer.AddPeer(peerAID)
+	err = rts.syncer.AddPeer(peerAID)
+	require.NoError(t, err)
 	e := <-rts.snapshotOutCh
 	require.Equal(t, &ssproto.SnapshotsRequest{}, e.Message)
 	require.Equal(t, peerAID, e.To)
 
-	rts.syncer.AddPeer(peerBID)
+	err = rts.syncer.AddPeer(peerBID)
+	require.NoError(t, err)
 	e = <-rts.snapshotOutCh
 	require.Equal(t, &ssproto.SnapshotsRequest{}, e.Message)
 	require.Equal(t, peerBID, e.To)
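
The test changes above assert that `AddPeer` now reports failures instead of
panicking. Below is a minimal sketch, not Tendermint code, of the
recover-into-named-error pattern the syncer.go hunk relies on; `sendRequest`
is a hypothetical stand-in. A send on a channel closed during shutdown
panics, and a deferred `recover` converts that panic into an ordinary
returned error through the named return value.

```
package main

import "fmt"

func sendRequest(ch chan<- string, req string) (err error) {
	defer func() {
		// Sending on a closed channel panics; convert that panic into
		// an error rather than crashing the caller, mirroring the
		// patch's TODO note about the snapshot channel.
		if r := recover(); r != nil {
			err = fmt.Errorf("panic sending peer snapshot request: %v", r)
		}
	}()
	ch <- req
	return nil
}

func main() {
	ch := make(chan string, 1)
	fmt.Println(sendRequest(ch, "snapshots?")) // <nil>

	close(ch) // simulate the channel being closed during shutdown
	fmt.Println(sendRequest(ch, "snapshots?")) // panic converted to error
}
```

Assigning to the named return `err` inside the deferred function is what
lets the recovery change the function's result after the panic unwinds; a
plain unnamed return value could not be updated at that point.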