diff --git a/CHANGELOG.md b/CHANGELOG.md index 52a926aed..057b2e7be 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,38 @@ # Changelog +## v0.31.4 + +*April 12th, 2019* + +This release fixes a regression from v0.31.3 which used the peer's `SocketAddr` to add the peer to +the address book. This swallowed the peer's self-reported port which is important in case of reconnect. +It brings back `NetAddress()` to `NodeInfo` and uses it instead of `SocketAddr` for adding peers. +Additionally, it improves response time on the `/validators` or `/status` RPC endpoints. +As a side-effect it makes these RPC endpoint more difficult to DoS and fixes a performance degradation in `ExecCommitBlock`. +Also, it contains an [ADR](https://github.com/tendermint/tendermint/pull/3539) that proposes decoupling the +responsibility for peer behaviour from the `p2p.Switch` (by @brapse). + +Special thanks to external contributors on this release: +@brapse, @guagualvcha, @mydring + +### IMPROVEMENTS: +- [p2p] [\#3463](https://github.com/tendermint/tendermint/pull/3463) Do not log "Can't add peer's address to addrbook" error for a private peer +- [p2p] [\#3547](https://github.com/tendermint/tendermint/pull/3547) Fix a couple of annoying typos (@mdyring) + +### BUG FIXES: + +- [docs] [\#3514](https://github.com/tendermint/tendermint/issues/3514) Fix block.Header.Time description (@melekes) +- [p2p] [\#2716](https://github.com/tendermint/tendermint/issues/2716) Check if we're already connected to peer right before dialing it (@melekes) +- [p2p] [\#3545](https://github.com/tendermint/tendermint/issues/3545) Add back `NetAddress()` to `NodeInfo` and use it instead of peer's `SocketAddr()` when adding a peer to the `PEXReactor` (potential fix for [\#3532](https://github.com/tendermint/tendermint/issues/3532)) +- [state] [\#3438](https://github.com/tendermint/tendermint/pull/3438) + Persist validators every 100000 blocks even if no changes to the set + occurred (@guagualvcha). This + 1) Prevents possible DoS attack using `/validators` or `/status` RPC + endpoints. Before response time was growing linearly with height if no + changes were made to the validator set. + 2) Fixes performance degradation in `ExecCommitBlock` where we call + `LoadValidators` for each `Evidence` in the block. + ## v0.31.3 *April 1st, 2019* @@ -8,6 +41,12 @@ This release includes two security sensitive fixes: it ensures generated private keys are valid, and it prevents certain DNS lookups that would cause the node to panic if the lookup failed. +### BREAKING CHANGES: +* Go API + - [crypto/secp256k1] [\#3439](https://github.com/tendermint/tendermint/issues/3439) + The `secp256k1.GenPrivKeySecp256k1` function has changed to guarantee that it returns a valid key, which means it + will return a different private key than in previous versions for the same secret. + ### BUG FIXES: - [crypto/secp256k1] [\#3439](https://github.com/tendermint/tendermint/issues/3439) @@ -35,7 +74,7 @@ Special thanks to external contributors on this release: * Apps * Go API -- [libs/autofile] [\#3504](https://github.com/tendermint/tendermint/issues/3504) Remove unused code in autofile package. Deleted functions: `Group.Search`, `Group.FindLast`, `GroupReader.ReadLine`, `GroupReader.PushLine`, `MakeSimpleSearchFunc` (@guagualvcha) + - [libs/autofile] [\#3504](https://github.com/tendermint/tendermint/issues/3504) Remove unused code in autofile package. Deleted functions: `Group.Search`, `Group.FindLast`, `GroupReader.ReadLine`, `GroupReader.PushLine`, `MakeSimpleSearchFunc` (@guagualvcha) * Blockchain Protocol diff --git a/CHANGELOG_PENDING.md b/CHANGELOG_PENDING.md index 045247937..bcb2af539 100644 --- a/CHANGELOG_PENDING.md +++ b/CHANGELOG_PENDING.md @@ -1,4 +1,4 @@ -## v0.31.2 +## v0.31.5 ** @@ -19,4 +19,3 @@ ### IMPROVEMENTS: ### BUG FIXES: - diff --git a/docs/architecture/adr-037-peer-behaviour.md b/docs/architecture/adr-037-peer-behaviour.md new file mode 100644 index 000000000..36b024482 --- /dev/null +++ b/docs/architecture/adr-037-peer-behaviour.md @@ -0,0 +1,145 @@ +# ADR 037: Peer Behaviour Interface + +## Changelog +* 07-03-2019: Initial draft + +## Context + +The responsibility for signaling and acting upon peer behaviour lacks a single +owning component and is heavily coupled with the network stack[1](#references). Reactors +maintain a reference to the `p2p.Switch` which they use to call +`switch.StopPeerForError(...)` when a peer misbehaves and +`switch.MarkAsGood(...)` when a peer contributes in some meaningful way. +While the switch handles `StopPeerForError` internally, the `MarkAsGood` +method delegates to another component, `p2p.AddrBook`. This scheme of delegation +across Switch obscures the responsibility for handling peer behaviour +and ties up the reactors in a larger dependency graph when testing. + +## Decision + +Introduce a `PeerBehaviour` interface and concrete implementations which +provide methods for reactors to signal peer behaviour without direct +coupling `p2p.Switch`. Introduce a ErrPeer to provide +concrete reasons for stopping peers. + +### Implementation Changes + +PeerBehaviour then becomes an interface for signaling peer errors as well +as for marking peers as `good`. + +XXX: It might be better to pass p2p.ID instead of the whole peer but as +a first draft maintain the underlying implementation as much as +possible. + +```go +type PeerBehaviour interface { + Errored(peer Peer, reason ErrPeer) + MarkPeerAsGood(peer Peer) +} +``` + +Instead of signaling peers to stop with arbitrary reasons: +`reason interface{}` + +We introduce a concrete error type ErrPeer: +```go +type ErrPeer int + +const ( + ErrPeerUnknown = iota + ErrPeerBadMessage + ErrPeerMessageOutofOrder + ... +) +``` + +As a first iteration we provide a concrete implementation which wraps +the switch: +```go +type SwitchedPeerBehaviour struct { + sw *Switch +} + +func (spb *SwitchedPeerBehaviour) Errored(peer Peer, reason ErrPeer) { + spb.sw.StopPeerForError(peer, reason) +} + +func (spb *SwitchedPeerBehaviour) MarkPeerAsGood(peer Peer) { + spb.sw.MarkPeerAsGood(peer) +} + +func NewSwitchedPeerBehaviour(sw *Switch) *SwitchedPeerBehaviour { + return &SwitchedPeerBehaviour{ + sw: sw, + } +} +``` + +Reactors, which are often difficult to unit test[2](#references). could use an implementation which exposes the signals produced by the reactor in +manufactured scenarios: + +```go +type PeerErrors map[Peer][]ErrPeer +type GoodPeers map[Peer]bool + +type StorePeerBehaviour struct { + pe PeerErrors + gp GoodPeers +} + +func NewStorePeerBehaviour() *StorePeerBehaviour{ + return &StorePeerBehaviour{ + pe: make(PeerErrors), + gp: GoodPeers{}, + } +} + +func (spb StorePeerBehaviour) Errored(peer Peer, reason ErrPeer) { + if _, ok := spb.pe[peer]; !ok { + spb.pe[peer] = []ErrPeer{reason} + } else { + spb.pe[peer] = append(spb.pe[peer], reason) + } +} + +func (mpb *StorePeerBehaviour) GetPeerErrors() PeerErrors { + return mpb.pe +} + +func (spb *StorePeerBehaviour) MarkPeerAsGood(peer Peer) { + if _, ok := spb.gp[peer]; !ok { + spb.gp[peer] = true + } +} + +func (spb *StorePeerBehaviour) GetGoodPeers() GoodPeers { + return spb.gp +} +``` + +## Status + +Proposed + +## Consequences + +### Positive + + * De-couple signaling from acting upon peer behaviour. + * Reduce the coupling of reactors and the Switch and the network + stack + * The responsibility of managing peer behaviour can be migrated to + a single component instead of split between the switch and the + address book. + +### Negative + + * The first iteration will simply wrap the Switch and introduce a + level of indirection. + +### Neutral + +## References + +1. Issue [#2067](https://github.com/tendermint/tendermint/issues/2067): P2P Refactor +2. PR: [#3506](https://github.com/tendermint/tendermint/pull/3506): ADR 036: Blockchain Reactor Refactor diff --git a/docs/spec/abci/abci.md b/docs/spec/abci/abci.md index c696c9384..c65d96ec1 100644 --- a/docs/spec/abci/abci.md +++ b/docs/spec/abci/abci.md @@ -347,8 +347,10 @@ Commit are included in the header of the next block. - `Version (Version)`: Version of the blockchain and the application - `ChainID (string)`: ID of the blockchain - `Height (int64)`: Height of the block in the chain - - `Time (google.protobuf.Timestamp)`: Time of the block. It is the proposer's - local time when block was created. + - `Time (google.protobuf.Timestamp)`: Time of the previous block. + For heights > 1, it's the weighted median of the timestamps of the valid + votes in the block.LastCommit. + For height == 1, it's genesis time. - `NumTxs (int32)`: Number of transactions in the block - `TotalTxs (int64)`: Total number of transactions in the blockchain until now diff --git a/docs/spec/abci/apps.md b/docs/spec/abci/apps.md index ca6abe7f8..47e62eed9 100644 --- a/docs/spec/abci/apps.md +++ b/docs/spec/abci/apps.md @@ -31,8 +31,13 @@ states to the latest committed state at once. When `Commit` completes, it unlocks the mempool. -Note that it is not possible to send transactions to Tendermint during `Commit` - if your app -tries to send a `/broadcast_tx` to Tendermint during Commit, it will deadlock. +WARNING: if the ABCI app logic processing the `Commit` message sends a +`/broadcast_tx_sync` or `/broadcast_tx_commit` and waits for the response +before proceeding, it will deadlock. Executing those `broadcast_tx` calls +involves acquiring a lock that is held during the `Commit` call, so it's not +possible. If you make the call to the `broadcast_tx` endpoints concurrently, +that's no problem, it just can't be part of the sequential logic of the +`Commit` function. ### Consensus Connection diff --git a/docs/spec/blockchain/blockchain.md b/docs/spec/blockchain/blockchain.md index 9f88d6417..cd31c5dc1 100644 --- a/docs/spec/blockchain/blockchain.md +++ b/docs/spec/blockchain/blockchain.md @@ -244,7 +244,7 @@ The height is an incrementing integer. The first block has `block.Header.Height ### Time ``` -block.Header.Timestamp >= prevBlock.Header.Timestamp + 1 ms +block.Header.Timestamp >= prevBlock.Header.Timestamp + state.consensusParams.Block.TimeIotaMs block.Header.Timestamp == MedianTime(block.LastCommit, state.LastValidators) ``` diff --git a/node/node.go b/node/node.go index e91d36357..c0a4d736e 100644 --- a/node/node.go +++ b/node/node.go @@ -498,6 +498,12 @@ func NewNode(config *cfg.Config, &pex.PEXReactorConfig{ Seeds: splitAndTrimEmpty(config.P2P.Seeds, ",", " "), SeedMode: config.P2P.SeedMode, + // See consensus/reactor.go: blocksToContributeToBecomeGoodPeer 10000 + // blocks assuming 10s blocks ~ 28 hours. + // TODO (melekes): make it dynamic based on the actual block latencies + // from the live network. + // https://github.com/tendermint/tendermint/issues/3523 + SeedDisconnectWaitPeriod: 28 * time.Hour, }) pexReactor.SetLogger(logger.With("module", "pex")) sw.AddReactor("PEX", pexReactor) diff --git a/p2p/errors.go b/p2p/errors.go index 706150945..3650a7a0a 100644 --- a/p2p/errors.go +++ b/p2p/errors.go @@ -103,7 +103,7 @@ type ErrSwitchDuplicatePeerID struct { } func (e ErrSwitchDuplicatePeerID) Error() string { - return fmt.Sprintf("Duplicate peer ID %v", e.ID) + return fmt.Sprintf("duplicate peer ID %v", e.ID) } // ErrSwitchDuplicatePeerIP to be raised whena a peer is connecting with a known @@ -113,7 +113,7 @@ type ErrSwitchDuplicatePeerIP struct { } func (e ErrSwitchDuplicatePeerIP) Error() string { - return fmt.Sprintf("Duplicate peer IP %v", e.IP.String()) + return fmt.Sprintf("duplicate peer IP %v", e.IP.String()) } // ErrSwitchConnectToSelf to be raised when trying to connect to itself. @@ -122,7 +122,7 @@ type ErrSwitchConnectToSelf struct { } func (e ErrSwitchConnectToSelf) Error() string { - return fmt.Sprintf("Connect to self: %v", e.Addr) + return fmt.Sprintf("connect to self: %v", e.Addr) } type ErrSwitchAuthenticationFailure struct { @@ -132,7 +132,7 @@ type ErrSwitchAuthenticationFailure struct { func (e ErrSwitchAuthenticationFailure) Error() string { return fmt.Sprintf( - "Failed to authenticate peer. Dialed %v, but got peer with ID %s", + "failed to authenticate peer. Dialed %v, but got peer with ID %s", e.Dialed, e.Got, ) @@ -152,7 +152,7 @@ type ErrNetAddressNoID struct { } func (e ErrNetAddressNoID) Error() string { - return fmt.Sprintf("Address (%s) does not contain ID", e.Addr) + return fmt.Sprintf("address (%s) does not contain ID", e.Addr) } type ErrNetAddressInvalid struct { @@ -161,7 +161,7 @@ type ErrNetAddressInvalid struct { } func (e ErrNetAddressInvalid) Error() string { - return fmt.Sprintf("Invalid address (%s): %v", e.Addr, e.Err) + return fmt.Sprintf("invalid address (%s): %v", e.Addr, e.Err) } type ErrNetAddressLookup struct { @@ -170,5 +170,15 @@ type ErrNetAddressLookup struct { } func (e ErrNetAddressLookup) Error() string { - return fmt.Sprintf("Error looking up host (%s): %v", e.Addr, e.Err) + return fmt.Sprintf("error looking up host (%s): %v", e.Addr, e.Err) +} + +// ErrCurrentlyDialingOrExistingAddress indicates that we're currently +// dialing this address or it belongs to an existing peer. +type ErrCurrentlyDialingOrExistingAddress struct { + Addr string +} + +func (e ErrCurrentlyDialingOrExistingAddress) Error() string { + return fmt.Sprintf("connection with %s has been established or dialed", e.Addr) } diff --git a/p2p/node_info.go b/p2p/node_info.go index e80f1e1b7..8195471ed 100644 --- a/p2p/node_info.go +++ b/p2p/node_info.go @@ -24,9 +24,14 @@ func MaxNodeInfoSize() int { // and determines if we're compatible. type NodeInfo interface { ID() ID + nodeInfoAddress nodeInfoTransport } +type nodeInfoAddress interface { + NetAddress() (*NetAddress, error) +} + // nodeInfoTransport validates a nodeInfo and checks // our compatibility with it. It's for use in the handshake. type nodeInfoTransport interface { @@ -209,20 +214,9 @@ OUTER_LOOP: // it includes the authenticated peer ID and the self-reported // ListenAddr. Note that the ListenAddr is not authenticated and // may not match that address actually dialed if its an outbound peer. -func (info DefaultNodeInfo) NetAddress() *NetAddress { +func (info DefaultNodeInfo) NetAddress() (*NetAddress, error) { idAddr := IDAddressString(info.ID(), info.ListenAddr) - netAddr, err := NewNetAddressString(idAddr) - if err != nil { - switch err.(type) { - case ErrNetAddressLookup: - // XXX If the peer provided a host name and the lookup fails here - // we're out of luck. - // TODO: use a NetAddress in DefaultNodeInfo - default: - panic(err) // everything should be well formed by now - } - } - return netAddr + return NewNetAddressString(idAddr) } //----------------------------------------------------------- diff --git a/p2p/pex/addrbook.go b/p2p/pex/addrbook.go index 3cb91c380..85dd05248 100644 --- a/p2p/pex/addrbook.go +++ b/p2p/pex/addrbook.go @@ -55,7 +55,7 @@ type AddrBook interface { PickAddress(biasTowardsNewAddrs int) *p2p.NetAddress // Mark address - MarkGood(*p2p.NetAddress) + MarkGood(p2p.ID) MarkAttempt(*p2p.NetAddress) MarkBad(*p2p.NetAddress) @@ -66,8 +66,7 @@ type AddrBook interface { // Send a selection of addresses with bias GetSelectionWithBias(biasTowardsNewAddrs int) []*p2p.NetAddress - // TODO: remove - ListOfKnownAddresses() []*knownAddress + Size() int // Persist to disk Save() @@ -254,7 +253,7 @@ func (a *addrBook) PickAddress(biasTowardsNewAddrs int) *p2p.NetAddress { bookSize := a.size() if bookSize <= 0 { if bookSize < 0 { - a.Logger.Error("Addrbook size less than 0", "nNew", a.nNew, "nOld", a.nOld) + panic(fmt.Sprintf("Addrbook size %d (new: %d + old: %d) is less than 0", a.nNew+a.nOld, a.nNew, a.nOld)) } return nil } @@ -297,11 +296,11 @@ func (a *addrBook) PickAddress(biasTowardsNewAddrs int) *p2p.NetAddress { // MarkGood implements AddrBook - it marks the peer as good and // moves it into an "old" bucket. -func (a *addrBook) MarkGood(addr *p2p.NetAddress) { +func (a *addrBook) MarkGood(id p2p.ID) { a.mtx.Lock() defer a.mtx.Unlock() - ka := a.addrLookup[addr.ID] + ka := a.addrLookup[id] if ka == nil { return } @@ -339,7 +338,7 @@ func (a *addrBook) GetSelection() []*p2p.NetAddress { bookSize := a.size() if bookSize <= 0 { if bookSize < 0 { - a.Logger.Error("Addrbook size less than 0", "nNew", a.nNew, "nOld", a.nOld) + panic(fmt.Sprintf("Addrbook size %d (new: %d + old: %d) is less than 0", a.nNew+a.nOld, a.nNew, a.nOld)) } return nil } @@ -389,7 +388,7 @@ func (a *addrBook) GetSelectionWithBias(biasTowardsNewAddrs int) []*p2p.NetAddre bookSize := a.size() if bookSize <= 0 { if bookSize < 0 { - a.Logger.Error("Addrbook size less than 0", "nNew", a.nNew, "nOld", a.nOld) + panic(fmt.Sprintf("Addrbook size %d (new: %d + old: %d) is less than 0", a.nNew+a.nOld, a.nNew, a.nOld)) } return nil } @@ -414,18 +413,6 @@ func (a *addrBook) GetSelectionWithBias(biasTowardsNewAddrs int) []*p2p.NetAddre return selection } -// ListOfKnownAddresses returns the new and old addresses. -func (a *addrBook) ListOfKnownAddresses() []*knownAddress { - a.mtx.Lock() - defer a.mtx.Unlock() - - addrs := []*knownAddress{} - for _, addr := range a.addrLookup { - addrs = append(addrs, addr.copy()) - } - return addrs -} - //------------------------------------------------ // Size returns the number of addresses in the book. @@ -473,8 +460,7 @@ func (a *addrBook) getBucket(bucketType byte, bucketIdx int) map[string]*knownAd case bucketTypeOld: return a.bucketsOld[bucketIdx] default: - cmn.PanicSanity("Should not happen") - return nil + panic("Invalid bucket type") } } @@ -600,16 +586,16 @@ func (a *addrBook) addAddress(addr, src *p2p.NetAddress) error { return ErrAddrBookNilAddr{addr, src} } - if a.routabilityStrict && !addr.Routable() { - return ErrAddrBookNonRoutable{addr} + if !addr.HasID() { + return ErrAddrBookInvalidAddrNoID{addr} } - if !addr.Valid() { - return ErrAddrBookInvalidAddr{addr} + if _, ok := a.privateIDs[addr.ID]; ok { + return ErrAddrBookPrivate{addr} } - if !addr.HasID() { - return ErrAddrBookInvalidAddrNoID{addr} + if _, ok := a.privateIDs[src.ID]; ok { + return ErrAddrBookPrivateSrc{src} } // TODO: we should track ourAddrs by ID and by IP:PORT and refuse both. @@ -617,12 +603,12 @@ func (a *addrBook) addAddress(addr, src *p2p.NetAddress) error { return ErrAddrBookSelf{addr} } - if _, ok := a.privateIDs[addr.ID]; ok { - return ErrAddrBookPrivate{addr} + if a.routabilityStrict && !addr.Routable() { + return ErrAddrBookNonRoutable{addr} } - if _, ok := a.privateIDs[src.ID]; ok { - return ErrAddrBookPrivateSrc{src} + if !addr.Valid() { + return ErrAddrBookInvalidAddr{addr} } ka := a.addrLookup[addr.ID] diff --git a/p2p/pex/addrbook_test.go b/p2p/pex/addrbook_test.go index fdcb0c8ad..13bac28c8 100644 --- a/p2p/pex/addrbook_test.go +++ b/p2p/pex/addrbook_test.go @@ -41,7 +41,7 @@ func TestAddrBookPickAddress(t *testing.T) { assert.NotNil(t, addr, "expected an address") // pick an address when we only have old address - book.MarkGood(addrSrc.addr) + book.MarkGood(addrSrc.addr.ID) addr = book.PickAddress(0) assert.NotNil(t, addr, "expected an address") addr = book.PickAddress(50) @@ -126,7 +126,7 @@ func TestAddrBookPromoteToOld(t *testing.T) { // Promote half of them for i, addrSrc := range randAddrs { if i%2 == 0 { - book.MarkGood(addrSrc.addr) + book.MarkGood(addrSrc.addr.ID) } } @@ -330,7 +330,7 @@ func TestAddrBookGetSelectionWithBias(t *testing.T) { randAddrsLen := len(randAddrs) for i, addrSrc := range randAddrs { if int((float64(i)/float64(randAddrsLen))*100) >= 20 { - book.MarkGood(addrSrc.addr) + book.MarkGood(addrSrc.addr.ID) } } @@ -569,7 +569,7 @@ func createAddrBookWithMOldAndNNewAddrs(t *testing.T, nOld, nNew int) (book *add randAddrs := randNetAddressPairs(t, nOld) for _, addr := range randAddrs { book.AddAddress(addr.addr, addr.src) - book.MarkGood(addr.addr) + book.MarkGood(addr.addr.ID) } randAddrs = randNetAddressPairs(t, nNew) diff --git a/p2p/pex/errors.go b/p2p/pex/errors.go index 1f44ceee7..543056af5 100644 --- a/p2p/pex/errors.go +++ b/p2p/pex/errors.go @@ -30,6 +30,10 @@ func (err ErrAddrBookPrivate) Error() string { return fmt.Sprintf("Cannot add private peer with address %v", err.Addr) } +func (err ErrAddrBookPrivate) PrivateAddr() bool { + return true +} + type ErrAddrBookPrivateSrc struct { Src *p2p.NetAddress } @@ -38,6 +42,10 @@ func (err ErrAddrBookPrivateSrc) Error() string { return fmt.Sprintf("Cannot add peer coming from private peer with address %v", err.Src) } +func (err ErrAddrBookPrivateSrc) PrivateAddr() bool { + return true +} + type ErrAddrBookNilAddr struct { Addr *p2p.NetAddress Src *p2p.NetAddress diff --git a/p2p/pex/file.go b/p2p/pex/file.go index 33fec0336..d4a516850 100644 --- a/p2p/pex/file.go +++ b/p2p/pex/file.go @@ -16,16 +16,15 @@ type addrBookJSON struct { } func (a *addrBook) saveToFile(filePath string) { - a.Logger.Info("Saving AddrBook to file", "size", a.Size()) - a.mtx.Lock() defer a.mtx.Unlock() - // Compile Addrs - addrs := []*knownAddress{} + + a.Logger.Info("Saving AddrBook to file", "size", a.size()) + + addrs := make([]*knownAddress, 0, len(a.addrLookup)) for _, ka := range a.addrLookup { addrs = append(addrs, ka) } - aJSON := &addrBookJSON{ Key: a.key, Addrs: addrs, diff --git a/p2p/pex/known_address.go b/p2p/pex/known_address.go index 5673dec11..acde385bc 100644 --- a/p2p/pex/known_address.go +++ b/p2p/pex/known_address.go @@ -33,18 +33,6 @@ func (ka *knownAddress) ID() p2p.ID { return ka.Addr.ID } -func (ka *knownAddress) copy() *knownAddress { - return &knownAddress{ - Addr: ka.Addr, - Src: ka.Src, - Attempts: ka.Attempts, - LastAttempt: ka.LastAttempt, - LastSuccess: ka.LastSuccess, - BucketType: ka.BucketType, - Buckets: ka.Buckets, - } -} - func (ka *knownAddress) isOld() bool { return ka.BucketType == bucketTypeOld } diff --git a/p2p/pex/pex_reactor.go b/p2p/pex/pex_reactor.go index 0ce116326..c24ee983c 100644 --- a/p2p/pex/pex_reactor.go +++ b/p2p/pex/pex_reactor.go @@ -3,7 +3,6 @@ package pex import ( "fmt" "reflect" - "sort" "sync" "time" @@ -35,16 +34,11 @@ const ( // Seed/Crawler constants - // We want seeds to only advertise good peers. Therefore they should wait at - // least as long as we expect it to take for a peer to become good before - // disconnecting. - // see consensus/reactor.go: blocksToContributeToBecomeGoodPeer - // 10000 blocks assuming 1s blocks ~ 2.7 hours. - defaultSeedDisconnectWaitPeriod = 3 * time.Hour + // minTimeBetweenCrawls is a minimum time between attempts to crawl a peer. + minTimeBetweenCrawls = 2 * time.Minute - defaultCrawlPeerInterval = 2 * time.Minute // don't redial for this. TODO: back-off. what for? - - defaultCrawlPeersPeriod = 30 * time.Second // check some peers every this + // check some peers every this + crawlPeerPeriod = 30 * time.Second maxAttemptsToDial = 16 // ~ 35h in total (last attempt - 18h) @@ -77,6 +71,9 @@ type PEXReactor struct { seedAddrs []*p2p.NetAddress attemptsToDial sync.Map // address (string) -> {number of attempts (int), last time dialed (time.Time)} + + // seed/crawled mode fields + crawlPeerInfos map[p2p.ID]crawlPeerInfo } func (r *PEXReactor) minReceiveRequestInterval() time.Duration { @@ -90,6 +87,11 @@ type PEXReactorConfig struct { // Seed/Crawler mode SeedMode bool + // We want seeds to only advertise good peers. Therefore they should wait at + // least as long as we expect it to take for a peer to become good before + // disconnecting. + SeedDisconnectWaitPeriod time.Duration + // Seeds is a list of addresses reactor may use // if it can't connect to peers in the addrbook. Seeds []string @@ -108,6 +110,7 @@ func NewPEXReactor(b AddrBook, config *PEXReactorConfig) *PEXReactor { ensurePeersPeriod: defaultEnsurePeersPeriod, requestsSent: cmn.NewCMap(), lastReceivedRequests: cmn.NewCMap(), + crawlPeerInfos: make(map[p2p.ID]crawlPeerInfo), } r.BaseReactor = *p2p.NewBaseReactor("PEXReactor", r) return r @@ -167,12 +170,18 @@ func (r *PEXReactor) AddPeer(p Peer) { } } else { // inbound peer is its own source - addr := p.SocketAddr() + addr, err := p.NodeInfo().NetAddress() + if err != nil { + r.Logger.Error("Failed to get peer NetAddress", "err", err, "peer", p) + return + } + + // Make it explicit that addr and src are the same for an inbound peer. src := addr // add to book. dont RequestAddrs right away because // we don't trust inbound as much - let ensurePeersRoutine handle it. - err := r.book.AddAddress(addr, src) + err = r.book.AddAddress(addr, src) r.logErrAddrBook(err) } } @@ -309,7 +318,10 @@ func (r *PEXReactor) ReceiveAddrs(addrs []*p2p.NetAddress, src Peer) error { } r.requestsSent.Delete(id) - srcAddr := src.SocketAddr() + srcAddr, err := src.NodeInfo().NetAddress() + if err != nil { + return err + } for _, netAddr := range addrs { // Validate netAddr. Disconnect from a peer if it sends us invalid data. if netAddr == nil { @@ -363,9 +375,9 @@ func (r *PEXReactor) ensurePeersRoutine() { ) // Randomize first round of communication to avoid thundering herd. - // If no potential peers are present directly start connecting so we guarantee - // swift setup with the help of configured seeds. - if r.hasPotentialPeers() { + // If no peers are present directly start connecting so we guarantee swift + // setup with the help of configured seeds. + if r.nodeHasSomePeersOrDialingAny() { time.Sleep(time.Duration(jitter)) } @@ -493,23 +505,26 @@ func (r *PEXReactor) dialPeer(addr *p2p.NetAddress) { err := r.Switch.DialPeerWithAddress(addr, false) if err != nil { + if _, ok := err.(p2p.ErrCurrentlyDialingOrExistingAddress); ok { + return + } + r.Logger.Error("Dialing failed", "addr", addr, "err", err, "attempts", attempts) - // TODO: detect more "bad peer" scenarios + markAddrInBookBasedOnErr(addr, r.book, err) if _, ok := err.(p2p.ErrSwitchAuthenticationFailure); ok { - r.book.MarkBad(addr) r.attemptsToDial.Delete(addr.DialString()) } else { - r.book.MarkAttempt(addr) // FIXME: if the addr is going to be removed from the addrbook (hard to // tell at this point), we need to Delete it from attemptsToDial, not // record another attempt. // record attempt r.attemptsToDial.Store(addr.DialString(), _attemptsToDial{attempts + 1, time.Now()}) } - } else { - // cleanup any history - r.attemptsToDial.Delete(addr.DialString()) + return } + + // cleanup any history + r.attemptsToDial.Delete(addr.DialString()) } // checkSeeds checks that addresses are well formed. @@ -568,101 +583,92 @@ func (r *PEXReactor) AttemptsToDial(addr *p2p.NetAddress) int { // from peers, except other seed nodes. func (r *PEXReactor) crawlPeersRoutine() { // Do an initial crawl - r.crawlPeers() + r.crawlPeers(r.book.GetSelection()) // Fire periodically - ticker := time.NewTicker(defaultCrawlPeersPeriod) + ticker := time.NewTicker(crawlPeerPeriod) for { select { case <-ticker.C: r.attemptDisconnects() - r.crawlPeers() + r.crawlPeers(r.book.GetSelection()) + r.cleanupCrawlPeerInfos() case <-r.Quit(): return } } } -// hasPotentialPeers indicates if there is a potential peer to connect to, by -// consulting the Switch as well as the AddrBook. -func (r *PEXReactor) hasPotentialPeers() bool { +// nodeHasSomePeersOrDialingAny returns true if the node is connected to some +// peers or dialing them currently. +func (r *PEXReactor) nodeHasSomePeersOrDialingAny() bool { out, in, dial := r.Switch.NumPeers() - - return out+in+dial > 0 && len(r.book.ListOfKnownAddresses()) > 0 + return out+in+dial > 0 } -// crawlPeerInfo handles temporary data needed for the -// network crawling performed during seed/crawler mode. +// crawlPeerInfo handles temporary data needed for the network crawling +// performed during seed/crawler mode. type crawlPeerInfo struct { - // The listening address of a potential peer we learned about - Addr *p2p.NetAddress - - // The last time we attempt to reach this address - LastAttempt time.Time - - // The last time we successfully reached this address - LastSuccess time.Time + Addr *p2p.NetAddress `json:"addr"` + // The last time we crawled the peer or attempted to do so. + LastCrawled time.Time `json:"last_crawled"` } -// oldestFirst implements sort.Interface for []crawlPeerInfo -// based on the LastAttempt field. -type oldestFirst []crawlPeerInfo - -func (of oldestFirst) Len() int { return len(of) } -func (of oldestFirst) Swap(i, j int) { of[i], of[j] = of[j], of[i] } -func (of oldestFirst) Less(i, j int) bool { return of[i].LastAttempt.Before(of[j].LastAttempt) } +// crawlPeers will crawl the network looking for new peer addresses. +func (r *PEXReactor) crawlPeers(addrs []*p2p.NetAddress) { + now := time.Now() -// getPeersToCrawl returns addresses of potential peers that we wish to validate. -// NOTE: The status information is ordered as described above. -func (r *PEXReactor) getPeersToCrawl() []crawlPeerInfo { - // TODO: be more selective - addrs := r.book.ListOfKnownAddresses() - of := make(oldestFirst, 0, len(addrs)) for _, addr := range addrs { - if len(addr.ID()) == 0 { - continue // dont use peers without id - } - - of = append(of, crawlPeerInfo{ - Addr: addr.Addr, - LastAttempt: addr.LastAttempt, - LastSuccess: addr.LastSuccess, - }) - } - sort.Sort(of) - return of -} - -// crawlPeers will crawl the network looking for new peer addresses. (once) -func (r *PEXReactor) crawlPeers() { - peerInfos := r.getPeersToCrawl() + peerInfo, ok := r.crawlPeerInfos[addr.ID] - now := time.Now() - // Use addresses we know of to reach additional peers - for _, pi := range peerInfos { - // Do not attempt to connect with peers we recently dialed - if now.Sub(pi.LastAttempt) < defaultCrawlPeerInterval { + // Do not attempt to connect with peers we recently crawled. + if ok && now.Sub(peerInfo.LastCrawled) < minTimeBetweenCrawls { continue } - // Otherwise, attempt to connect with the known address - err := r.Switch.DialPeerWithAddress(pi.Addr, false) + + // Record crawling attempt. + r.crawlPeerInfos[addr.ID] = crawlPeerInfo{ + Addr: addr, + LastCrawled: now, + } + + err := r.Switch.DialPeerWithAddress(addr, false) if err != nil { - r.book.MarkAttempt(pi.Addr) + if _, ok := err.(p2p.ErrCurrentlyDialingOrExistingAddress); ok { + continue + } + + r.Logger.Error("Dialing failed", "addr", addr, "err", err) + markAddrInBookBasedOnErr(addr, r.book, err) continue } - // Ask for more addresses - peer := r.Switch.Peers().Get(pi.Addr.ID) + + peer := r.Switch.Peers().Get(addr.ID) if peer != nil { r.RequestAddrs(peer) } } } +func (r *PEXReactor) cleanupCrawlPeerInfos() { + for id, info := range r.crawlPeerInfos { + // If we did not crawl a peer for 24 hours, it means the peer was removed + // from the addrbook => remove + // + // 10000 addresses / maxGetSelection = 40 cycles to get all addresses in + // the ideal case, + // 40 * crawlPeerPeriod ~ 20 minutes + if time.Since(info.LastCrawled) > 24*time.Hour { + delete(r.crawlPeerInfos, id) + } + } +} + // attemptDisconnects checks if we've been with each peer long enough to disconnect func (r *PEXReactor) attemptDisconnects() { for _, peer := range r.Switch.Peers().List() { - if peer.Status().Duration < defaultSeedDisconnectWaitPeriod { + if peer.Status().Duration < r.config.SeedDisconnectWaitPeriod { continue } if peer.IsPersistent() { @@ -672,6 +678,16 @@ func (r *PEXReactor) attemptDisconnects() { } } +func markAddrInBookBasedOnErr(addr *p2p.NetAddress, book AddrBook, err error) { + // TODO: detect more "bad peer" scenarios + switch err.(type) { + case p2p.ErrSwitchAuthenticationFailure: + book.MarkBad(addr) + default: + book.MarkAttempt(addr) + } +} + //----------------------------------------------------------------------------- // Messages diff --git a/p2p/pex/pex_reactor_test.go b/p2p/pex/pex_reactor_test.go index 4a6118c63..077f07a60 100644 --- a/p2p/pex/pex_reactor_test.go +++ b/p2p/pex/pex_reactor_test.go @@ -204,26 +204,26 @@ func TestCheckSeeds(t *testing.T) { defer os.RemoveAll(dir) // nolint: errcheck // 1. test creating peer with no seeds works - peer := testCreateDefaultPeer(dir, 0) - require.Nil(t, peer.Start()) - peer.Stop() + peerSwitch := testCreateDefaultPeer(dir, 0) + require.Nil(t, peerSwitch.Start()) + peerSwitch.Stop() // 2. create seed seed := testCreateSeed(dir, 1, []*p2p.NetAddress{}, []*p2p.NetAddress{}) // 3. test create peer with online seed works - peer = testCreatePeerWithSeed(dir, 2, seed) - require.Nil(t, peer.Start()) - peer.Stop() + peerSwitch = testCreatePeerWithSeed(dir, 2, seed) + require.Nil(t, peerSwitch.Start()) + peerSwitch.Stop() // 4. test create peer with all seeds having unresolvable DNS fails badPeerConfig := &PEXReactorConfig{ Seeds: []string{"ed3dfd27bfc4af18f67a49862f04cc100696e84d@bad.network.addr:26657", "d824b13cb5d40fa1d8a614e089357c7eff31b670@anotherbad.network.addr:26657"}, } - peer = testCreatePeerWithConfig(dir, 2, badPeerConfig) - require.Error(t, peer.Start()) - peer.Stop() + peerSwitch = testCreatePeerWithConfig(dir, 2, badPeerConfig) + require.Error(t, peerSwitch.Start()) + peerSwitch.Stop() // 5. test create peer with one good seed address succeeds badPeerConfig = &PEXReactorConfig{ @@ -231,9 +231,9 @@ func TestCheckSeeds(t *testing.T) { "d824b13cb5d40fa1d8a614e089357c7eff31b670@anotherbad.network.addr:26657", seed.NetAddress().String()}, } - peer = testCreatePeerWithConfig(dir, 2, badPeerConfig) - require.Nil(t, peer.Start()) - peer.Stop() + peerSwitch = testCreatePeerWithConfig(dir, 2, badPeerConfig) + require.Nil(t, peerSwitch.Start()) + peerSwitch.Stop() } func TestPEXReactorUsesSeedsIfNeeded(t *testing.T) { @@ -285,31 +285,41 @@ func TestConnectionSpeedForPeerReceivedFromSeed(t *testing.T) { assertPeersWithTimeout(t, []*p2p.Switch{secondPeer}, 10*time.Millisecond, 1*time.Second, 2) } -func TestPEXReactorCrawlStatus(t *testing.T) { - pexR, book := createReactor(&PEXReactorConfig{SeedMode: true}) +func TestPEXReactorSeedMode(t *testing.T) { + // directory to store address books + dir, err := ioutil.TempDir("", "pex_reactor") + require.Nil(t, err) + defer os.RemoveAll(dir) // nolint: errcheck + + pexR, book := createReactor(&PEXReactorConfig{SeedMode: true, SeedDisconnectWaitPeriod: 10 * time.Millisecond}) defer teardownReactor(book) - // Seed/Crawler mode uses data from the Switch sw := createSwitchAndAddReactors(pexR) sw.SetAddrBook(book) + err = sw.Start() + require.NoError(t, err) + defer sw.Stop() - // Create a peer, add it to the peer set and the addrbook. - peer := p2p.CreateRandomPeer(false) - p2p.AddPeerToSwitch(pexR.Switch, peer) - addr1 := peer.SocketAddr() - pexR.book.AddAddress(addr1, addr1) + assert.Zero(t, sw.Peers().Size()) + + peerSwitch := testCreateDefaultPeer(dir, 1) + require.NoError(t, peerSwitch.Start()) + defer peerSwitch.Stop() - // Add a non-connected address to the book. - _, addr2 := p2p.CreateRoutableAddr() - pexR.book.AddAddress(addr2, addr1) + // 1. Test crawlPeers dials the peer + pexR.crawlPeers([]*p2p.NetAddress{peerSwitch.NetAddress()}) + assert.Equal(t, 1, sw.Peers().Size()) + assert.True(t, sw.Peers().Has(peerSwitch.NodeInfo().ID())) - // Get some peerInfos to crawl - peerInfos := pexR.getPeersToCrawl() + // 2. attemptDisconnects should not disconnect because of wait period + pexR.attemptDisconnects() + assert.Equal(t, 1, sw.Peers().Size()) - // Make sure it has the proper number of elements - assert.Equal(t, 2, len(peerInfos)) + time.Sleep(100 * time.Millisecond) - // TODO: test + // 3. attemptDisconnects should disconnect after wait period + pexR.attemptDisconnects() + assert.Equal(t, 0, sw.Peers().Size()) } // connect a peer to a seed, wait a bit, then stop it. @@ -529,7 +539,7 @@ func testCreateSeed(dir string, id int, knownAddrs, srcAddrs []*p2p.NetAddress) book.SetLogger(log.TestingLogger()) for j := 0; j < len(knownAddrs); j++ { book.AddAddress(knownAddrs[j], srcAddrs[j]) - book.MarkGood(knownAddrs[j]) + book.MarkGood(knownAddrs[j].ID) } sw.SetAddrBook(book) diff --git a/p2p/switch.go b/p2p/switch.go index 76da9ad0c..afd7d9656 100644 --- a/p2p/switch.go +++ b/p2p/switch.go @@ -6,6 +6,8 @@ import ( "sync" "time" + "github.com/pkg/errors" + "github.com/tendermint/tendermint/config" cmn "github.com/tendermint/tendermint/libs/common" "github.com/tendermint/tendermint/p2p/conn" @@ -46,7 +48,7 @@ type AddrBook interface { AddAddress(addr *NetAddress, src *NetAddress) error AddOurAddress(*NetAddress) OurAddress(*NetAddress) bool - MarkGood(*NetAddress) + MarkGood(ID) RemoveAddress(*NetAddress) HasAddress(*NetAddress) bool Save() @@ -339,14 +341,11 @@ func (sw *Switch) reconnectToPeer(addr *NetAddress) { return } - if sw.IsDialingOrExistingAddress(addr) { - sw.Logger.Debug("Peer connection has been established or dialed while we waiting next try", "addr", addr) - return - } - err := sw.DialPeerWithAddress(addr, true) if err == nil { return // success + } else if _, ok := err.(ErrCurrentlyDialingOrExistingAddress); ok { + return } sw.Logger.Info("Error reconnecting to peer. Trying again", "tries", i, "err", err, "addr", addr) @@ -365,9 +364,12 @@ func (sw *Switch) reconnectToPeer(addr *NetAddress) { // sleep an exponentially increasing amount sleepIntervalSeconds := math.Pow(reconnectBackOffBaseSeconds, float64(i)) sw.randomSleep(time.Duration(sleepIntervalSeconds) * time.Second) + err := sw.DialPeerWithAddress(addr, true) if err == nil { return // success + } else if _, ok := err.(ErrCurrentlyDialingOrExistingAddress); ok { + return } sw.Logger.Info("Error reconnecting to peer. Trying again", "tries", i, "err", err, "addr", addr) } @@ -383,13 +385,22 @@ func (sw *Switch) SetAddrBook(addrBook AddrBook) { // like contributed to consensus. func (sw *Switch) MarkPeerAsGood(peer Peer) { if sw.addrBook != nil { - sw.addrBook.MarkGood(peer.SocketAddr()) + sw.addrBook.MarkGood(peer.ID()) } } //--------------------------------------------------------------------- // Dialing +type privateAddr interface { + PrivateAddr() bool +} + +func isPrivateAddr(err error) bool { + te, ok := errors.Cause(err).(privateAddr) + return ok && te.PrivateAddr() +} + // DialPeersAsync dials a list of peers asynchronously in random order (optionally, making them persistent). // Used to dial peers from config on startup or from unsafe-RPC (trusted sources). // TODO: remove addrBook arg since it's now set on the switch @@ -412,7 +423,11 @@ func (sw *Switch) DialPeersAsync(addrBook AddrBook, peers []string, persistent b // do not add our address or ID if !netAddr.Same(ourAddr) { if err := addrBook.AddAddress(netAddr, ourAddr); err != nil { - sw.Logger.Error("Can't add peer's address to addrbook", "err", err) + if isPrivateAddr(err) { + sw.Logger.Debug("Won't add peer's address to addrbook", "err", err) + } else { + sw.Logger.Error("Can't add peer's address to addrbook", "err", err) + } } } } @@ -435,15 +450,10 @@ func (sw *Switch) DialPeersAsync(addrBook AddrBook, peers []string, persistent b sw.randomSleep(0) - if sw.IsDialingOrExistingAddress(addr) { - sw.Logger.Debug("Ignore attempt to connect to an existing peer", "addr", addr) - return - } - err := sw.DialPeerWithAddress(addr, persistent) if err != nil { switch err.(type) { - case ErrSwitchConnectToSelf, ErrSwitchDuplicatePeerID: + case ErrSwitchConnectToSelf, ErrSwitchDuplicatePeerID, ErrCurrentlyDialingOrExistingAddress: sw.Logger.Debug("Error dialing peer", "err", err) default: sw.Logger.Error("Error dialing peer", "err", err) @@ -454,11 +464,20 @@ func (sw *Switch) DialPeersAsync(addrBook AddrBook, peers []string, persistent b return nil } -// DialPeerWithAddress dials the given peer and runs sw.addPeer if it connects and authenticates successfully. -// If `persistent == true`, the switch will always try to reconnect to this peer if the connection ever fails. +// DialPeerWithAddress dials the given peer and runs sw.addPeer if it connects +// and authenticates successfully. +// If `persistent == true`, the switch will always try to reconnect to this +// peer if the connection ever fails. +// If we're currently dialing this address or it belongs to an existing peer, +// ErrCurrentlyDialingOrExistingAddress is returned. func (sw *Switch) DialPeerWithAddress(addr *NetAddress, persistent bool) error { + if sw.IsDialingOrExistingAddress(addr) { + return ErrCurrentlyDialingOrExistingAddress{addr.String()} + } + sw.dialing.Set(string(addr.ID), addr) defer sw.dialing.Delete(string(addr.ID)) + return sw.addOutboundPeerWithConfig(addr, sw.config, persistent) } diff --git a/p2p/switch_test.go b/p2p/switch_test.go index ab8ae9e9f..bf105e0fa 100644 --- a/p2p/switch_test.go +++ b/p2p/switch_test.go @@ -626,7 +626,7 @@ func (book *addrBookMock) OurAddress(addr *NetAddress) bool { _, ok := book.ourAddrs[addr.String()] return ok } -func (book *addrBookMock) MarkGood(*NetAddress) {} +func (book *addrBookMock) MarkGood(ID) {} func (book *addrBookMock) HasAddress(addr *NetAddress) bool { _, ok := book.addrs[addr.String()] return ok diff --git a/p2p/test_util.go b/p2p/test_util.go index df60539ba..f8020924c 100644 --- a/p2p/test_util.go +++ b/p2p/test_util.go @@ -23,7 +23,7 @@ type mockNodeInfo struct { } func (ni mockNodeInfo) ID() ID { return ni.addr.ID } -func (ni mockNodeInfo) NetAddress() *NetAddress { return ni.addr } +func (ni mockNodeInfo) NetAddress() (*NetAddress, error) { return ni.addr, nil } func (ni mockNodeInfo) Validate() error { return nil } func (ni mockNodeInfo) CompatibleWith(other NodeInfo) error { return nil } diff --git a/p2p/transport.go b/p2p/transport.go index 6717db483..ebf77c9f4 100644 --- a/p2p/transport.go +++ b/p2p/transport.go @@ -364,7 +364,7 @@ func (mt *MultiplexTransport) upgrade( if err != nil { return nil, nil, ErrRejected{ conn: c, - err: fmt.Errorf("secrect conn failed: %v", err), + err: fmt.Errorf("secret conn failed: %v", err), isAuthFailure: true, } } @@ -377,7 +377,7 @@ func (mt *MultiplexTransport) upgrade( conn: c, id: connID, err: fmt.Errorf( - "conn.ID (%v) dialed ID (%v) missmatch", + "conn.ID (%v) dialed ID (%v) mismatch", connID, dialedID, ), @@ -409,7 +409,7 @@ func (mt *MultiplexTransport) upgrade( conn: c, id: connID, err: fmt.Errorf( - "conn.ID (%v) NodeInfo.ID (%v) missmatch", + "conn.ID (%v) NodeInfo.ID (%v) mismatch", connID, nodeInfo.ID(), ), diff --git a/scripts/release_management/github-draft.py b/scripts/release_management/github-draft.py index 1fccd38b9..8a189d53e 100755 --- a/scripts/release_management/github-draft.py +++ b/scripts/release_management/github-draft.py @@ -34,7 +34,7 @@ def create_draft(org,repo,branch,version): 'tag_name': version, 'target_commitish': '{0}'.format(branch), 'name': '{0} (WARNING: ALPHA SOFTWARE)'.format(version), - 'body': 'https://github.com/{0}/{1}/blob/master/CHANGELOG.md#{2}'.format(org,repo,version.replace('v','').replace('.','')), + 'body': 'https://github.com/{0}/{1}/blob/{2}/CHANGELOG.md#{3}'.format(org,repo,branch,version.replace('.','')), 'draft': True, 'prerelease': False } diff --git a/state/store.go b/state/store.go index 6b01a8295..73116b43f 100644 --- a/state/store.go +++ b/state/store.go @@ -9,6 +9,14 @@ import ( "github.com/tendermint/tendermint/types" ) +const ( + // persist validators every valSetCheckpointInterval blocks to avoid + // LoadValidators taking too much time. + // https://github.com/tendermint/tendermint/pull/3438 + // 100000 results in ~ 100ms to get 100 validators (see BenchmarkLoadValidators) + valSetCheckpointInterval = 100000 +) + //------------------------------------------------------------------------ func calcValidatorsKey(height int64) []byte { @@ -182,25 +190,38 @@ func LoadValidators(db dbm.DB, height int64) (*types.ValidatorSet, error) { if valInfo == nil { return nil, ErrNoValSetForHeight{height} } - if valInfo.ValidatorSet == nil { - valInfo2 := loadValidatorsInfo(db, valInfo.LastHeightChanged) + lastStoredHeight := lastStoredHeightFor(height, valInfo.LastHeightChanged) + valInfo2 := loadValidatorsInfo(db, lastStoredHeight) if valInfo2 == nil { - panic( - fmt.Sprintf( - "Couldn't find validators at height %d as last changed from height %d", - valInfo.LastHeightChanged, - height, - ), - ) + // TODO (melekes): remove the below if condition in the 0.33 major + // release and just panic. Old chains might panic otherwise if they + // haven't saved validators at intermediate (%valSetCheckpointInterval) + // height yet. + // https://github.com/tendermint/tendermint/issues/3543 + valInfo2 = loadValidatorsInfo(db, valInfo.LastHeightChanged) + lastStoredHeight = valInfo.LastHeightChanged + if valInfo2 == nil { + panic( + fmt.Sprintf("Couldn't find validators at height %d (height %d was originally requested)", + lastStoredHeight, + height, + ), + ) + } } - valInfo2.ValidatorSet.IncrementProposerPriority(int(height - valInfo.LastHeightChanged)) // mutate + valInfo2.ValidatorSet.IncrementProposerPriority(int(height - lastStoredHeight)) // mutate valInfo = valInfo2 } return valInfo.ValidatorSet, nil } +func lastStoredHeightFor(height, lastHeightChanged int64) int64 { + checkpointHeight := height - height%valSetCheckpointInterval + return cmn.MaxInt64(checkpointHeight, lastHeightChanged) +} + // CONTRACT: Returned ValidatorsInfo can be mutated. func loadValidatorsInfo(db dbm.DB, height int64) *ValidatorsInfo { buf := db.Get(calcValidatorsKey(height)) @@ -221,10 +242,10 @@ func loadValidatorsInfo(db dbm.DB, height int64) *ValidatorsInfo { } // saveValidatorsInfo persists the validator set. -// `height` is the effective height for which the validator is responsible for signing. -// It should be called from s.Save(), right before the state itself is persisted. -// If the validator set did not change after processing the latest block, -// only the last height for which the validators changed is persisted. +// +// `height` is the effective height for which the validator is responsible for +// signing. It should be called from s.Save(), right before the state itself is +// persisted. func saveValidatorsInfo(db dbm.DB, height, lastHeightChanged int64, valSet *types.ValidatorSet) { if lastHeightChanged > height { panic("LastHeightChanged cannot be greater than ValidatorsInfo height") @@ -232,7 +253,9 @@ func saveValidatorsInfo(db dbm.DB, height, lastHeightChanged int64, valSet *type valInfo := &ValidatorsInfo{ LastHeightChanged: lastHeightChanged, } - if lastHeightChanged == height { + // Only persist validator set if it was updated or checkpoint height (see + // valSetCheckpointInterval) is reached. + if height == lastHeightChanged || height%valSetCheckpointInterval == 0 { valInfo.ValidatorSet = valSet } db.Set(calcValidatorsKey(height), valInfo.Bytes()) diff --git a/state/store_test.go b/state/store_test.go new file mode 100644 index 000000000..dd48cae71 --- /dev/null +++ b/state/store_test.go @@ -0,0 +1,67 @@ +package state + +import ( + "fmt" + "os" + "testing" + + "github.com/stretchr/testify/assert" + + cfg "github.com/tendermint/tendermint/config" + dbm "github.com/tendermint/tendermint/libs/db" + "github.com/tendermint/tendermint/types" +) + +func TestSaveValidatorsInfo(t *testing.T) { + // test we persist validators every valSetCheckpointInterval blocks + stateDB := dbm.NewMemDB() + val, _ := types.RandValidator(true, 10) + vals := types.NewValidatorSet([]*types.Validator{val}) + + // TODO(melekes): remove in 0.33 release + // https://github.com/tendermint/tendermint/issues/3543 + saveValidatorsInfo(stateDB, 1, 1, vals) + saveValidatorsInfo(stateDB, 2, 1, vals) + assert.NotPanics(t, func() { + _, err := LoadValidators(stateDB, 2) + if err != nil { + panic(err) + } + }) + //ENDREMOVE + + saveValidatorsInfo(stateDB, valSetCheckpointInterval, 1, vals) + + loadedVals, err := LoadValidators(stateDB, valSetCheckpointInterval) + assert.NoError(t, err) + assert.NotZero(t, loadedVals.Size()) +} + +func BenchmarkLoadValidators(b *testing.B) { + const valSetSize = 100 + + config := cfg.ResetTestRoot("state_") + defer os.RemoveAll(config.RootDir) + dbType := dbm.DBBackendType(config.DBBackend) + stateDB := dbm.NewDB("state", dbType, config.DBDir()) + state, err := LoadStateFromDBOrGenesisFile(stateDB, config.GenesisFile()) + if err != nil { + b.Fatal(err) + } + state.Validators = genValSet(valSetSize) + state.NextValidators = state.Validators.CopyIncrementProposerPriority(1) + SaveState(stateDB, state) + + for i := 10; i < 10000000000; i *= 10 { // 10, 100, 1000, ... + saveValidatorsInfo(stateDB, int64(i), state.LastHeightValidatorsChanged, state.NextValidators) + + b.Run(fmt.Sprintf("height=%d", i), func(b *testing.B) { + for n := 0; n < b.N; n++ { + _, err := LoadValidators(stateDB, int64(i)) + if err != nil { + b.Fatal(err) + } + } + }) + } +} diff --git a/version/version.go b/version/version.go index a42a8f005..9ba38de6c 100644 --- a/version/version.go +++ b/version/version.go @@ -20,7 +20,7 @@ const ( // Must be a string because scripts like dist.sh read this file. // XXX: Don't change the name of this variable or you will break // automation :) - TMCoreSemVer = "0.31.3" + TMCoreSemVer = "0.31.4" // ABCISemVer is the semantic version of the ABCI library ABCISemVer = "0.16.0"