diff --git a/CHANGELOG_PENDING.md b/CHANGELOG_PENDING.md index b66cd4e8e..434232e4f 100644 --- a/CHANGELOG_PENDING.md +++ b/CHANGELOG_PENDING.md @@ -25,3 +25,5 @@ Special thanks to external contributors on this release: ### BUG FIXES: - [node] \#3186 EventBus and indexerService should be started before first block (for replay last block on handshake) execution +- [p2p] \#3247 Fix panic in SeedMode when calling FlushStop and OnStop + concurrently diff --git a/p2p/conn/connection.go b/p2p/conn/connection.go index fb20c4775..920734914 100644 --- a/p2p/conn/connection.go +++ b/p2p/conn/connection.go @@ -8,6 +8,7 @@ import ( "math" "net" "reflect" + "sync" "sync/atomic" "time" @@ -89,6 +90,10 @@ type MConnection struct { quitSendRoutine chan struct{} doneSendRoutine chan struct{} + // used to ensure FlushStop and OnStop + // are safe to call concurrently. + stopMtx sync.Mutex + flushTimer *cmn.ThrottleTimer // flush writes as necessary but throttled. pingTimer *cmn.RepeatTimer // send pings periodically @@ -210,8 +215,17 @@ func (c *MConnection) OnStart() error { // It additionally ensures that all successful // .Send() calls will get flushed before closing // the connection. -// NOTE: it is not safe to call this method more than once. func (c *MConnection) FlushStop() { + c.stopMtx.Lock() + defer c.stopMtx.Unlock() + + select { + case <-c.quitSendRoutine: + // already quit via OnStop + return + default: + } + c.BaseService.OnStop() c.flushTimer.Stop() c.pingTimer.Stop() @@ -247,6 +261,9 @@ func (c *MConnection) FlushStop() { // OnStop implements BaseService func (c *MConnection) OnStop() { + c.stopMtx.Lock() + defer c.stopMtx.Unlock() + select { case <-c.quitSendRoutine: // already quit via FlushStop diff --git a/p2p/pex/pex_reactor_test.go b/p2p/pex/pex_reactor_test.go index f5125c603..4f4ccb039 100644 --- a/p2p/pex/pex_reactor_test.go +++ b/p2p/pex/pex_reactor_test.go @@ -316,6 +316,81 @@ func TestPEXReactorCrawlStatus(t *testing.T) { // TODO: test } +// connect a peer to a seed, wait a bit, then stop it. +// this should give it time to request addrs and for the seed +// to call FlushStop, and allows us to test calling Stop concurrently +// with FlushStop. Before a fix, this non-deterministically reproduced +// https://github.com/tendermint/tendermint/issues/3231. +func TestPEXReactorSeedModeFlushStop(t *testing.T) { + N := 2 + switches := make([]*p2p.Switch, N) + + // directory to store address books + dir, err := ioutil.TempDir("", "pex_reactor") + require.Nil(t, err) + defer os.RemoveAll(dir) // nolint: errcheck + + books := make([]*addrBook, N) + logger := log.TestingLogger() + + // create switches + for i := 0; i < N; i++ { + switches[i] = p2p.MakeSwitch(cfg, i, "testing", "123.123.123", func(i int, sw *p2p.Switch) *p2p.Switch { + books[i] = NewAddrBook(filepath.Join(dir, fmt.Sprintf("addrbook%d.json", i)), false) + books[i].SetLogger(logger.With("pex", i)) + sw.SetAddrBook(books[i]) + + sw.SetLogger(logger.With("pex", i)) + + config := &PEXReactorConfig{} + if i == 0 { + // first one is a seed node + config = &PEXReactorConfig{SeedMode: true} + } + r := NewPEXReactor(books[i], config) + r.SetLogger(logger.With("pex", i)) + r.SetEnsurePeersPeriod(250 * time.Millisecond) + sw.AddReactor("pex", r) + + return sw + }) + } + + for _, sw := range switches { + err := sw.Start() // start switch and reactors + require.Nil(t, err) + } + + reactor := switches[0].Reactors()["pex"].(*PEXReactor) + peerID := switches[1].NodeInfo().ID() + + err = switches[1].DialPeerWithAddress(switches[0].NodeInfo().NetAddress(), false) + assert.NoError(t, err) + + // sleep up to a second while waiting for the peer to send us a message. + // this isn't perfect since it's possible the peer sends us a msg and we FlushStop + // before this loop catches it. but non-deterministically it works pretty well. + for i := 0; i < 1000; i++ { + v := reactor.lastReceivedRequests.Get(string(peerID)) + if v != nil { + break + } + time.Sleep(time.Millisecond) + } + + // by now the FlushStop should have happened. Try stopping the peer. + // it should be safe to do this. + peers := switches[0].Peers().List() + for _, peer := range peers { + peer.Stop() + } + + // stop the switches + for _, s := range switches { + s.Stop() + } +} + func TestPEXReactorDoesNotAddPrivatePeersToAddrBook(t *testing.T) { peer := p2p.CreateRandomPeer(false)