You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

347 lines
9.3 KiB

7 years ago
7 years ago
pubsub 2.0 (#3227) * green pubsub tests :OK: * get rid of clientToQueryMap * Subscribe and SubscribeUnbuffered * start adapting other pkgs to new pubsub * nope * rename MsgAndTags to Message * remove TagMap it does not bring any additional benefits * bring back EventSubscriber * fix test * fix data race in TestStartNextHeightCorrectly ``` Write at 0x00c0001c7418 by goroutine 796: github.com/tendermint/tendermint/consensus.TestStartNextHeightCorrectly() /go/src/github.com/tendermint/tendermint/consensus/state_test.go:1296 +0xad testing.tRunner() /usr/local/go/src/testing/testing.go:827 +0x162 Previous read at 0x00c0001c7418 by goroutine 858: github.com/tendermint/tendermint/consensus.(*ConsensusState).addVote() /go/src/github.com/tendermint/tendermint/consensus/state.go:1631 +0x1366 github.com/tendermint/tendermint/consensus.(*ConsensusState).tryAddVote() /go/src/github.com/tendermint/tendermint/consensus/state.go:1476 +0x8f github.com/tendermint/tendermint/consensus.(*ConsensusState).handleMsg() /go/src/github.com/tendermint/tendermint/consensus/state.go:667 +0xa1e github.com/tendermint/tendermint/consensus.(*ConsensusState).receiveRoutine() /go/src/github.com/tendermint/tendermint/consensus/state.go:628 +0x794 Goroutine 796 (running) created at: testing.(*T).Run() /usr/local/go/src/testing/testing.go:878 +0x659 testing.runTests.func1() /usr/local/go/src/testing/testing.go:1119 +0xa8 testing.tRunner() /usr/local/go/src/testing/testing.go:827 +0x162 testing.runTests() /usr/local/go/src/testing/testing.go:1117 +0x4ee testing.(*M).Run() /usr/local/go/src/testing/testing.go:1034 +0x2ee main.main() _testmain.go:214 +0x332 Goroutine 858 (running) created at: github.com/tendermint/tendermint/consensus.(*ConsensusState).startRoutines() /go/src/github.com/tendermint/tendermint/consensus/state.go:334 +0x221 github.com/tendermint/tendermint/consensus.startTestRound() /go/src/github.com/tendermint/tendermint/consensus/common_test.go:122 +0x63 github.com/tendermint/tendermint/consensus.TestStateFullRound1() /go/src/github.com/tendermint/tendermint/consensus/state_test.go:255 +0x397 testing.tRunner() /usr/local/go/src/testing/testing.go:827 +0x162 ``` * fixes after my own review * fix formatting * wait 100ms before kicking a subscriber out + a test for indexer_service * fixes after my second review * no timeout * add changelog entries * fix merge conflicts * fix typos after Thane's review Co-Authored-By: melekes <anton.kalyaev@gmail.com> * reformat code * rewrite indexer service in the attempt to fix failing test https://github.com/tendermint/tendermint/pull/3227/#issuecomment-462316527 * Revert "rewrite indexer service in the attempt to fix failing test" This reverts commit 0d9107a098230de7138abb1c201877c246e89ed1. * another attempt to fix indexer * fixes after Ethan's review * use unbuffered channel when indexing transactions Refs https://github.com/tendermint/tendermint/pull/3227#discussion_r258786716 * add a comment for EventBus#SubscribeUnbuffered * format code
5 years ago
pubsub 2.0 (#3227) * green pubsub tests :OK: * get rid of clientToQueryMap * Subscribe and SubscribeUnbuffered * start adapting other pkgs to new pubsub * nope * rename MsgAndTags to Message * remove TagMap it does not bring any additional benefits * bring back EventSubscriber * fix test * fix data race in TestStartNextHeightCorrectly ``` Write at 0x00c0001c7418 by goroutine 796: github.com/tendermint/tendermint/consensus.TestStartNextHeightCorrectly() /go/src/github.com/tendermint/tendermint/consensus/state_test.go:1296 +0xad testing.tRunner() /usr/local/go/src/testing/testing.go:827 +0x162 Previous read at 0x00c0001c7418 by goroutine 858: github.com/tendermint/tendermint/consensus.(*ConsensusState).addVote() /go/src/github.com/tendermint/tendermint/consensus/state.go:1631 +0x1366 github.com/tendermint/tendermint/consensus.(*ConsensusState).tryAddVote() /go/src/github.com/tendermint/tendermint/consensus/state.go:1476 +0x8f github.com/tendermint/tendermint/consensus.(*ConsensusState).handleMsg() /go/src/github.com/tendermint/tendermint/consensus/state.go:667 +0xa1e github.com/tendermint/tendermint/consensus.(*ConsensusState).receiveRoutine() /go/src/github.com/tendermint/tendermint/consensus/state.go:628 +0x794 Goroutine 796 (running) created at: testing.(*T).Run() /usr/local/go/src/testing/testing.go:878 +0x659 testing.runTests.func1() /usr/local/go/src/testing/testing.go:1119 +0xa8 testing.tRunner() /usr/local/go/src/testing/testing.go:827 +0x162 testing.runTests() /usr/local/go/src/testing/testing.go:1117 +0x4ee testing.(*M).Run() /usr/local/go/src/testing/testing.go:1034 +0x2ee main.main() _testmain.go:214 +0x332 Goroutine 858 (running) created at: github.com/tendermint/tendermint/consensus.(*ConsensusState).startRoutines() /go/src/github.com/tendermint/tendermint/consensus/state.go:334 +0x221 github.com/tendermint/tendermint/consensus.startTestRound() /go/src/github.com/tendermint/tendermint/consensus/common_test.go:122 +0x63 github.com/tendermint/tendermint/consensus.TestStateFullRound1() /go/src/github.com/tendermint/tendermint/consensus/state_test.go:255 +0x397 testing.tRunner() /usr/local/go/src/testing/testing.go:827 +0x162 ``` * fixes after my own review * fix formatting * wait 100ms before kicking a subscriber out + a test for indexer_service * fixes after my second review * no timeout * add changelog entries * fix merge conflicts * fix typos after Thane's review Co-Authored-By: melekes <anton.kalyaev@gmail.com> * reformat code * rewrite indexer service in the attempt to fix failing test https://github.com/tendermint/tendermint/pull/3227/#issuecomment-462316527 * Revert "rewrite indexer service in the attempt to fix failing test" This reverts commit 0d9107a098230de7138abb1c201877c246e89ed1. * another attempt to fix indexer * fixes after Ethan's review * use unbuffered channel when indexing transactions Refs https://github.com/tendermint/tendermint/pull/3227#discussion_r258786716 * add a comment for EventBus#SubscribeUnbuffered * format code
5 years ago
6 years ago
fix non deterministic test failures and race in privval socket (#3258) * node: decrease retry conn timeout in test Should fix #3256 The retry timeout was set to the default, which is the same as the accept timeout, so it's no wonder this would fail. Here we decrease the retry timeout so we can try many times before the accept timeout. * p2p: increase handshake timeout in test This fails sometimes, presumably because the handshake timeout is so low (only 50ms). So increase it to 1s. Should fix #3187 * privval: fix race with ping. closes #3237 Pings happen in a go-routine and can happen concurrently with other messages. Since we use a request/response protocol, we expect to send a request and get back the corresponding response. But with pings happening concurrently, this assumption could be violated. We were using a mutex, but only a RWMutex, where the RLock was being held for sending messages - this was to allow the underlying connection to be replaced if it fails. Turns out we actually need to use a full lock (not just a read lock) to prevent multiple requests from happening concurrently. * node: fix test name. DelayedStop -> DelayedStart * autofile: Wait() method In the TestWALTruncate in consensus/wal_test.go we remove the WAL directory at the end of the test. However the wal.Stop() does not properly wait for the autofile group to finish shutting down. Hence it was possible that the group's go-routine is still running when the cleanup happens, which causes a panic since the directory disappeared. Here we add a Wait() method to properly wait until the go-routine exits so we can safely clean up. This fixes #2852.
5 years ago
fix non deterministic test failures and race in privval socket (#3258) * node: decrease retry conn timeout in test Should fix #3256 The retry timeout was set to the default, which is the same as the accept timeout, so it's no wonder this would fail. Here we decrease the retry timeout so we can try many times before the accept timeout. * p2p: increase handshake timeout in test This fails sometimes, presumably because the handshake timeout is so low (only 50ms). So increase it to 1s. Should fix #3187 * privval: fix race with ping. closes #3237 Pings happen in a go-routine and can happen concurrently with other messages. Since we use a request/response protocol, we expect to send a request and get back the corresponding response. But with pings happening concurrently, this assumption could be violated. We were using a mutex, but only a RWMutex, where the RLock was being held for sending messages - this was to allow the underlying connection to be replaced if it fails. Turns out we actually need to use a full lock (not just a read lock) to prevent multiple requests from happening concurrently. * node: fix test name. DelayedStop -> DelayedStart * autofile: Wait() method In the TestWALTruncate in consensus/wal_test.go we remove the WAL directory at the end of the test. However the wal.Stop() does not properly wait for the autofile group to finish shutting down. Hence it was possible that the group's go-routine is still running when the cleanup happens, which causes a panic since the directory disappeared. Here we add a Wait() method to properly wait until the go-routine exits so we can safely clean up. This fixes #2852.
5 years ago
Close and retry a RemoteSigner on err (#2923) * Close and recreate a RemoteSigner on err * Update changelog * Address Anton's comments / suggestions: - update changelog - restart TCPVal - shut down on `ErrUnexpectedResponse` * re-init remote signer client with fresh connection if Ping fails - add/update TODOs in secret connection - rename tcp.go -> tcp_client.go, same with ipc to clarify their purpose * account for `conn returned by waitConnection can be `nil` - also add TODO about RemoteSigner conn field * Tests for retrying: IPC / TCP - shorter info log on success - set conn and use it in tests to close conn * Tests for retrying: IPC / TCP - shorter info log on success - set conn and use it in tests to close conn - add rwmutex for conn field in IPC * comments and doc.go * fix ipc tests. fixes #2677 * use constants for tests * cleanup some error statements * fixes #2784, race in tests * remove print statement * minor fixes from review * update comment on sts spec * cosmetics * p2p/conn: add failing tests * p2p/conn: make SecretConnection thread safe * changelog * IPCVal signer refactor - use a .reset() method - don't use embedded RemoteSignerClient - guard RemoteSignerClient with mutex - drop the .conn - expose Close() on RemoteSignerClient * apply IPCVal refactor to TCPVal * remove mtx from RemoteSignerClient * consolidate IPCVal and TCPVal, fixes #3104 - done in tcp_client.go - now called SocketVal - takes a listener in the constructor - make tcpListener and unixListener contain all the differences * delete ipc files * introduce unix and tcp dialer for RemoteSigner * rename files - drop tcp_ prefix - rename priv_validator.go to file.go * bring back listener options * fix node * fix priv_val_server * fix node test * minor cleanup and comments
6 years ago
privval: improve Remote Signer implementation (#3351) This issue is related to #3107 This is a first renaming/refactoring step before reworking and removing heartbeats. As discussed with @Liamsi , we preferred to go for a couple of independent and separate PRs to simplify review work. The changes: Help to clarify the relation between the validator and remote signer endpoints Differentiate between timeouts and deadlines Prepare to encapsulate networking related code behind RemoteSigner in the next PR My intention is to separate and encapsulate the "network related" code from the actual signer. SignerRemote ---(uses/contains)--> SignerValidatorEndpoint <--(connects to)--> SignerServiceEndpoint ---> SignerService (future.. not here yet but would like to decouple too) All reconnection/heartbeat/whatever code goes in the endpoints. Signer[Remote/Service] do not need to know about that. I agree Endpoint may not be the perfect name. I tried to find something "Go-ish" enough. It is a common name in go-kit, kubernetes, etc. Right now: SignerValidatorEndpoint: handles the listener contains SignerRemote Implements the PrivValidator interface connects and sets a connection object in a contained SignerRemote delegates PrivValidator some calls to SignerRemote which in turn uses the conn object that was set externally SignerRemote: Implements the PrivValidator interface read/writes from a connection object directly handles heartbeats SignerServiceEndpoint: Does most things in a single place delegates to a PrivValidator IIRC. * cleanup * Refactoring step 1 * Refactoring step 2 * move messages to another file * mark for future work / next steps * mark deprecated classes in docs * Fix linter problems * additional linter fixes
5 years ago
Close and retry a RemoteSigner on err (#2923) * Close and recreate a RemoteSigner on err * Update changelog * Address Anton's comments / suggestions: - update changelog - restart TCPVal - shut down on `ErrUnexpectedResponse` * re-init remote signer client with fresh connection if Ping fails - add/update TODOs in secret connection - rename tcp.go -> tcp_client.go, same with ipc to clarify their purpose * account for `conn returned by waitConnection can be `nil` - also add TODO about RemoteSigner conn field * Tests for retrying: IPC / TCP - shorter info log on success - set conn and use it in tests to close conn * Tests for retrying: IPC / TCP - shorter info log on success - set conn and use it in tests to close conn - add rwmutex for conn field in IPC * comments and doc.go * fix ipc tests. fixes #2677 * use constants for tests * cleanup some error statements * fixes #2784, race in tests * remove print statement * minor fixes from review * update comment on sts spec * cosmetics * p2p/conn: add failing tests * p2p/conn: make SecretConnection thread safe * changelog * IPCVal signer refactor - use a .reset() method - don't use embedded RemoteSignerClient - guard RemoteSignerClient with mutex - drop the .conn - expose Close() on RemoteSignerClient * apply IPCVal refactor to TCPVal * remove mtx from RemoteSignerClient * consolidate IPCVal and TCPVal, fixes #3104 - done in tcp_client.go - now called SocketVal - takes a listener in the constructor - make tcpListener and unixListener contain all the differences * delete ipc files * introduce unix and tcp dialer for RemoteSigner * rename files - drop tcp_ prefix - rename priv_validator.go to file.go * bring back listener options * fix node * fix priv_val_server * fix node test * minor cleanup and comments
6 years ago
privval: improve Remote Signer implementation (#3351) This issue is related to #3107 This is a first renaming/refactoring step before reworking and removing heartbeats. As discussed with @Liamsi , we preferred to go for a couple of independent and separate PRs to simplify review work. The changes: Help to clarify the relation between the validator and remote signer endpoints Differentiate between timeouts and deadlines Prepare to encapsulate networking related code behind RemoteSigner in the next PR My intention is to separate and encapsulate the "network related" code from the actual signer. SignerRemote ---(uses/contains)--> SignerValidatorEndpoint <--(connects to)--> SignerServiceEndpoint ---> SignerService (future.. not here yet but would like to decouple too) All reconnection/heartbeat/whatever code goes in the endpoints. Signer[Remote/Service] do not need to know about that. I agree Endpoint may not be the perfect name. I tried to find something "Go-ish" enough. It is a common name in go-kit, kubernetes, etc. Right now: SignerValidatorEndpoint: handles the listener contains SignerRemote Implements the PrivValidator interface connects and sets a connection object in a contained SignerRemote delegates PrivValidator some calls to SignerRemote which in turn uses the conn object that was set externally SignerRemote: Implements the PrivValidator interface read/writes from a connection object directly handles heartbeats SignerServiceEndpoint: Does most things in a single place delegates to a PrivValidator IIRC. * cleanup * Refactoring step 1 * Refactoring step 2 * move messages to another file * mark for future work / next steps * mark deprecated classes in docs * Fix linter problems * additional linter fixes
5 years ago
Close and retry a RemoteSigner on err (#2923) * Close and recreate a RemoteSigner on err * Update changelog * Address Anton's comments / suggestions: - update changelog - restart TCPVal - shut down on `ErrUnexpectedResponse` * re-init remote signer client with fresh connection if Ping fails - add/update TODOs in secret connection - rename tcp.go -> tcp_client.go, same with ipc to clarify their purpose * account for `conn returned by waitConnection can be `nil` - also add TODO about RemoteSigner conn field * Tests for retrying: IPC / TCP - shorter info log on success - set conn and use it in tests to close conn * Tests for retrying: IPC / TCP - shorter info log on success - set conn and use it in tests to close conn - add rwmutex for conn field in IPC * comments and doc.go * fix ipc tests. fixes #2677 * use constants for tests * cleanup some error statements * fixes #2784, race in tests * remove print statement * minor fixes from review * update comment on sts spec * cosmetics * p2p/conn: add failing tests * p2p/conn: make SecretConnection thread safe * changelog * IPCVal signer refactor - use a .reset() method - don't use embedded RemoteSignerClient - guard RemoteSignerClient with mutex - drop the .conn - expose Close() on RemoteSignerClient * apply IPCVal refactor to TCPVal * remove mtx from RemoteSignerClient * consolidate IPCVal and TCPVal, fixes #3104 - done in tcp_client.go - now called SocketVal - takes a listener in the constructor - make tcpListener and unixListener contain all the differences * delete ipc files * introduce unix and tcp dialer for RemoteSigner * rename files - drop tcp_ prefix - rename priv_validator.go to file.go * bring back listener options * fix node * fix priv_val_server * fix node test * minor cleanup and comments
6 years ago
Close and retry a RemoteSigner on err (#2923) * Close and recreate a RemoteSigner on err * Update changelog * Address Anton's comments / suggestions: - update changelog - restart TCPVal - shut down on `ErrUnexpectedResponse` * re-init remote signer client with fresh connection if Ping fails - add/update TODOs in secret connection - rename tcp.go -> tcp_client.go, same with ipc to clarify their purpose * account for `conn returned by waitConnection can be `nil` - also add TODO about RemoteSigner conn field * Tests for retrying: IPC / TCP - shorter info log on success - set conn and use it in tests to close conn * Tests for retrying: IPC / TCP - shorter info log on success - set conn and use it in tests to close conn - add rwmutex for conn field in IPC * comments and doc.go * fix ipc tests. fixes #2677 * use constants for tests * cleanup some error statements * fixes #2784, race in tests * remove print statement * minor fixes from review * update comment on sts spec * cosmetics * p2p/conn: add failing tests * p2p/conn: make SecretConnection thread safe * changelog * IPCVal signer refactor - use a .reset() method - don't use embedded RemoteSignerClient - guard RemoteSignerClient with mutex - drop the .conn - expose Close() on RemoteSignerClient * apply IPCVal refactor to TCPVal * remove mtx from RemoteSignerClient * consolidate IPCVal and TCPVal, fixes #3104 - done in tcp_client.go - now called SocketVal - takes a listener in the constructor - make tcpListener and unixListener contain all the differences * delete ipc files * introduce unix and tcp dialer for RemoteSigner * rename files - drop tcp_ prefix - rename priv_validator.go to file.go * bring back listener options * fix node * fix priv_val_server * fix node test * minor cleanup and comments
6 years ago
Close and retry a RemoteSigner on err (#2923) * Close and recreate a RemoteSigner on err * Update changelog * Address Anton's comments / suggestions: - update changelog - restart TCPVal - shut down on `ErrUnexpectedResponse` * re-init remote signer client with fresh connection if Ping fails - add/update TODOs in secret connection - rename tcp.go -> tcp_client.go, same with ipc to clarify their purpose * account for `conn returned by waitConnection can be `nil` - also add TODO about RemoteSigner conn field * Tests for retrying: IPC / TCP - shorter info log on success - set conn and use it in tests to close conn * Tests for retrying: IPC / TCP - shorter info log on success - set conn and use it in tests to close conn - add rwmutex for conn field in IPC * comments and doc.go * fix ipc tests. fixes #2677 * use constants for tests * cleanup some error statements * fixes #2784, race in tests * remove print statement * minor fixes from review * update comment on sts spec * cosmetics * p2p/conn: add failing tests * p2p/conn: make SecretConnection thread safe * changelog * IPCVal signer refactor - use a .reset() method - don't use embedded RemoteSignerClient - guard RemoteSignerClient with mutex - drop the .conn - expose Close() on RemoteSignerClient * apply IPCVal refactor to TCPVal * remove mtx from RemoteSignerClient * consolidate IPCVal and TCPVal, fixes #3104 - done in tcp_client.go - now called SocketVal - takes a listener in the constructor - make tcpListener and unixListener contain all the differences * delete ipc files * introduce unix and tcp dialer for RemoteSigner * rename files - drop tcp_ prefix - rename priv_validator.go to file.go * bring back listener options * fix node * fix priv_val_server * fix node test * minor cleanup and comments
6 years ago
privval: improve Remote Signer implementation (#3351) This issue is related to #3107 This is a first renaming/refactoring step before reworking and removing heartbeats. As discussed with @Liamsi , we preferred to go for a couple of independent and separate PRs to simplify review work. The changes: Help to clarify the relation between the validator and remote signer endpoints Differentiate between timeouts and deadlines Prepare to encapsulate networking related code behind RemoteSigner in the next PR My intention is to separate and encapsulate the "network related" code from the actual signer. SignerRemote ---(uses/contains)--> SignerValidatorEndpoint <--(connects to)--> SignerServiceEndpoint ---> SignerService (future.. not here yet but would like to decouple too) All reconnection/heartbeat/whatever code goes in the endpoints. Signer[Remote/Service] do not need to know about that. I agree Endpoint may not be the perfect name. I tried to find something "Go-ish" enough. It is a common name in go-kit, kubernetes, etc. Right now: SignerValidatorEndpoint: handles the listener contains SignerRemote Implements the PrivValidator interface connects and sets a connection object in a contained SignerRemote delegates PrivValidator some calls to SignerRemote which in turn uses the conn object that was set externally SignerRemote: Implements the PrivValidator interface read/writes from a connection object directly handles heartbeats SignerServiceEndpoint: Does most things in a single place delegates to a PrivValidator IIRC. * cleanup * Refactoring step 1 * Refactoring step 2 * move messages to another file * mark for future work / next steps * mark deprecated classes in docs * Fix linter problems * additional linter fixes
5 years ago
Close and retry a RemoteSigner on err (#2923) * Close and recreate a RemoteSigner on err * Update changelog * Address Anton's comments / suggestions: - update changelog - restart TCPVal - shut down on `ErrUnexpectedResponse` * re-init remote signer client with fresh connection if Ping fails - add/update TODOs in secret connection - rename tcp.go -> tcp_client.go, same with ipc to clarify their purpose * account for `conn returned by waitConnection can be `nil` - also add TODO about RemoteSigner conn field * Tests for retrying: IPC / TCP - shorter info log on success - set conn and use it in tests to close conn * Tests for retrying: IPC / TCP - shorter info log on success - set conn and use it in tests to close conn - add rwmutex for conn field in IPC * comments and doc.go * fix ipc tests. fixes #2677 * use constants for tests * cleanup some error statements * fixes #2784, race in tests * remove print statement * minor fixes from review * update comment on sts spec * cosmetics * p2p/conn: add failing tests * p2p/conn: make SecretConnection thread safe * changelog * IPCVal signer refactor - use a .reset() method - don't use embedded RemoteSignerClient - guard RemoteSignerClient with mutex - drop the .conn - expose Close() on RemoteSignerClient * apply IPCVal refactor to TCPVal * remove mtx from RemoteSignerClient * consolidate IPCVal and TCPVal, fixes #3104 - done in tcp_client.go - now called SocketVal - takes a listener in the constructor - make tcpListener and unixListener contain all the differences * delete ipc files * introduce unix and tcp dialer for RemoteSigner * rename files - drop tcp_ prefix - rename priv_validator.go to file.go * bring back listener options * fix node * fix priv_val_server * fix node test * minor cleanup and comments
6 years ago
privval: improve Remote Signer implementation (#3351) This issue is related to #3107 This is a first renaming/refactoring step before reworking and removing heartbeats. As discussed with @Liamsi , we preferred to go for a couple of independent and separate PRs to simplify review work. The changes: Help to clarify the relation between the validator and remote signer endpoints Differentiate between timeouts and deadlines Prepare to encapsulate networking related code behind RemoteSigner in the next PR My intention is to separate and encapsulate the "network related" code from the actual signer. SignerRemote ---(uses/contains)--> SignerValidatorEndpoint <--(connects to)--> SignerServiceEndpoint ---> SignerService (future.. not here yet but would like to decouple too) All reconnection/heartbeat/whatever code goes in the endpoints. Signer[Remote/Service] do not need to know about that. I agree Endpoint may not be the perfect name. I tried to find something "Go-ish" enough. It is a common name in go-kit, kubernetes, etc. Right now: SignerValidatorEndpoint: handles the listener contains SignerRemote Implements the PrivValidator interface connects and sets a connection object in a contained SignerRemote delegates PrivValidator some calls to SignerRemote which in turn uses the conn object that was set externally SignerRemote: Implements the PrivValidator interface read/writes from a connection object directly handles heartbeats SignerServiceEndpoint: Does most things in a single place delegates to a PrivValidator IIRC. * cleanup * Refactoring step 1 * Refactoring step 2 * move messages to another file * mark for future work / next steps * mark deprecated classes in docs * Fix linter problems * additional linter fixes
5 years ago
Close and retry a RemoteSigner on err (#2923) * Close and recreate a RemoteSigner on err * Update changelog * Address Anton's comments / suggestions: - update changelog - restart TCPVal - shut down on `ErrUnexpectedResponse` * re-init remote signer client with fresh connection if Ping fails - add/update TODOs in secret connection - rename tcp.go -> tcp_client.go, same with ipc to clarify their purpose * account for `conn returned by waitConnection can be `nil` - also add TODO about RemoteSigner conn field * Tests for retrying: IPC / TCP - shorter info log on success - set conn and use it in tests to close conn * Tests for retrying: IPC / TCP - shorter info log on success - set conn and use it in tests to close conn - add rwmutex for conn field in IPC * comments and doc.go * fix ipc tests. fixes #2677 * use constants for tests * cleanup some error statements * fixes #2784, race in tests * remove print statement * minor fixes from review * update comment on sts spec * cosmetics * p2p/conn: add failing tests * p2p/conn: make SecretConnection thread safe * changelog * IPCVal signer refactor - use a .reset() method - don't use embedded RemoteSignerClient - guard RemoteSignerClient with mutex - drop the .conn - expose Close() on RemoteSignerClient * apply IPCVal refactor to TCPVal * remove mtx from RemoteSignerClient * consolidate IPCVal and TCPVal, fixes #3104 - done in tcp_client.go - now called SocketVal - takes a listener in the constructor - make tcpListener and unixListener contain all the differences * delete ipc files * introduce unix and tcp dialer for RemoteSigner * rename files - drop tcp_ prefix - rename priv_validator.go to file.go * bring back listener options * fix node * fix priv_val_server * fix node test * minor cleanup and comments
6 years ago
privval: improve Remote Signer implementation (#3351) This issue is related to #3107 This is a first renaming/refactoring step before reworking and removing heartbeats. As discussed with @Liamsi , we preferred to go for a couple of independent and separate PRs to simplify review work. The changes: Help to clarify the relation between the validator and remote signer endpoints Differentiate between timeouts and deadlines Prepare to encapsulate networking related code behind RemoteSigner in the next PR My intention is to separate and encapsulate the "network related" code from the actual signer. SignerRemote ---(uses/contains)--> SignerValidatorEndpoint <--(connects to)--> SignerServiceEndpoint ---> SignerService (future.. not here yet but would like to decouple too) All reconnection/heartbeat/whatever code goes in the endpoints. Signer[Remote/Service] do not need to know about that. I agree Endpoint may not be the perfect name. I tried to find something "Go-ish" enough. It is a common name in go-kit, kubernetes, etc. Right now: SignerValidatorEndpoint: handles the listener contains SignerRemote Implements the PrivValidator interface connects and sets a connection object in a contained SignerRemote delegates PrivValidator some calls to SignerRemote which in turn uses the conn object that was set externally SignerRemote: Implements the PrivValidator interface read/writes from a connection object directly handles heartbeats SignerServiceEndpoint: Does most things in a single place delegates to a PrivValidator IIRC. * cleanup * Refactoring step 1 * Refactoring step 2 * move messages to another file * mark for future work / next steps * mark deprecated classes in docs * Fix linter problems * additional linter fixes
5 years ago
fix non deterministic test failures and race in privval socket (#3258) * node: decrease retry conn timeout in test Should fix #3256 The retry timeout was set to the default, which is the same as the accept timeout, so it's no wonder this would fail. Here we decrease the retry timeout so we can try many times before the accept timeout. * p2p: increase handshake timeout in test This fails sometimes, presumably because the handshake timeout is so low (only 50ms). So increase it to 1s. Should fix #3187 * privval: fix race with ping. closes #3237 Pings happen in a go-routine and can happen concurrently with other messages. Since we use a request/response protocol, we expect to send a request and get back the corresponding response. But with pings happening concurrently, this assumption could be violated. We were using a mutex, but only a RWMutex, where the RLock was being held for sending messages - this was to allow the underlying connection to be replaced if it fails. Turns out we actually need to use a full lock (not just a read lock) to prevent multiple requests from happening concurrently. * node: fix test name. DelayedStop -> DelayedStart * autofile: Wait() method In the TestWALTruncate in consensus/wal_test.go we remove the WAL directory at the end of the test. However the wal.Stop() does not properly wait for the autofile group to finish shutting down. Hence it was possible that the group's go-routine is still running when the cleanup happens, which causes a panic since the directory disappeared. Here we add a Wait() method to properly wait until the go-routine exits so we can safely clean up. This fixes #2852.
5 years ago
fix non deterministic test failures and race in privval socket (#3258) * node: decrease retry conn timeout in test Should fix #3256 The retry timeout was set to the default, which is the same as the accept timeout, so it's no wonder this would fail. Here we decrease the retry timeout so we can try many times before the accept timeout. * p2p: increase handshake timeout in test This fails sometimes, presumably because the handshake timeout is so low (only 50ms). So increase it to 1s. Should fix #3187 * privval: fix race with ping. closes #3237 Pings happen in a go-routine and can happen concurrently with other messages. Since we use a request/response protocol, we expect to send a request and get back the corresponding response. But with pings happening concurrently, this assumption could be violated. We were using a mutex, but only a RWMutex, where the RLock was being held for sending messages - this was to allow the underlying connection to be replaced if it fails. Turns out we actually need to use a full lock (not just a read lock) to prevent multiple requests from happening concurrently. * node: fix test name. DelayedStop -> DelayedStart * autofile: Wait() method In the TestWALTruncate in consensus/wal_test.go we remove the WAL directory at the end of the test. However the wal.Stop() does not properly wait for the autofile group to finish shutting down. Hence it was possible that the group's go-routine is still running when the cleanup happens, which causes a panic since the directory disappeared. Here we add a Wait() method to properly wait until the go-routine exits so we can safely clean up. This fixes #2852.
5 years ago
fix non deterministic test failures and race in privval socket (#3258) * node: decrease retry conn timeout in test Should fix #3256 The retry timeout was set to the default, which is the same as the accept timeout, so it's no wonder this would fail. Here we decrease the retry timeout so we can try many times before the accept timeout. * p2p: increase handshake timeout in test This fails sometimes, presumably because the handshake timeout is so low (only 50ms). So increase it to 1s. Should fix #3187 * privval: fix race with ping. closes #3237 Pings happen in a go-routine and can happen concurrently with other messages. Since we use a request/response protocol, we expect to send a request and get back the corresponding response. But with pings happening concurrently, this assumption could be violated. We were using a mutex, but only a RWMutex, where the RLock was being held for sending messages - this was to allow the underlying connection to be replaced if it fails. Turns out we actually need to use a full lock (not just a read lock) to prevent multiple requests from happening concurrently. * node: fix test name. DelayedStop -> DelayedStart * autofile: Wait() method In the TestWALTruncate in consensus/wal_test.go we remove the WAL directory at the end of the test. However the wal.Stop() does not properly wait for the autofile group to finish shutting down. Hence it was possible that the group's go-routine is still running when the cleanup happens, which causes a panic since the directory disappeared. Here we add a Wait() method to properly wait until the go-routine exits so we can safely clean up. This fixes #2852.
5 years ago
privval: improve Remote Signer implementation (#3351) This issue is related to #3107 This is a first renaming/refactoring step before reworking and removing heartbeats. As discussed with @Liamsi , we preferred to go for a couple of independent and separate PRs to simplify review work. The changes: Help to clarify the relation between the validator and remote signer endpoints Differentiate between timeouts and deadlines Prepare to encapsulate networking related code behind RemoteSigner in the next PR My intention is to separate and encapsulate the "network related" code from the actual signer. SignerRemote ---(uses/contains)--> SignerValidatorEndpoint <--(connects to)--> SignerServiceEndpoint ---> SignerService (future.. not here yet but would like to decouple too) All reconnection/heartbeat/whatever code goes in the endpoints. Signer[Remote/Service] do not need to know about that. I agree Endpoint may not be the perfect name. I tried to find something "Go-ish" enough. It is a common name in go-kit, kubernetes, etc. Right now: SignerValidatorEndpoint: handles the listener contains SignerRemote Implements the PrivValidator interface connects and sets a connection object in a contained SignerRemote delegates PrivValidator some calls to SignerRemote which in turn uses the conn object that was set externally SignerRemote: Implements the PrivValidator interface read/writes from a connection object directly handles heartbeats SignerServiceEndpoint: Does most things in a single place delegates to a PrivValidator IIRC. * cleanup * Refactoring step 1 * Refactoring step 2 * move messages to another file * mark for future work / next steps * mark deprecated classes in docs * Fix linter problems * additional linter fixes
5 years ago
mempool: move interface into mempool package (#3524) ## Description Refs #2659 Breaking changes in the mempool package: [mempool] #2659 Mempool now an interface old Mempool renamed to CListMempool NewMempool renamed to NewCListMempool Option renamed to CListOption MempoolReactor renamed to Reactor NewMempoolReactor renamed to NewReactor unexpose TxID method TxInfo.PeerID renamed to SenderID unexpose MempoolReactor.Mempool Breaking changes in the state package: [state] #2659 Mempool interface moved to mempool package MockMempool moved to top-level mock package and renamed to Mempool Non Breaking changes in the node package: [node] #2659 Add Mempool method, which allows you to access mempool ## Commits * move Mempool interface into mempool package Refs #2659 Breaking changes in the mempool package: - Mempool now an interface - old Mempool renamed to CListMempool Breaking changes to state package: - MockMempool moved to mempool/mock package and renamed to Mempool - Mempool interface moved to mempool package * assert CListMempool impl Mempool * gofmt code * rename MempoolReactor to Reactor - combine everything into one interface - rename TxInfo.PeerID to TxInfo.SenderID - unexpose MempoolReactor.Mempool * move mempool mock into top-level mock package * add a fixme TxsFront should not be a part of the Mempool interface because it leaks implementation details. Instead, we need to come up with general interface for querying the mempool so the MempoolReactor can fetch and broadcast txs to peers. * change node#Mempool to return interface * save commit = new reactor arch * Revert "save commit = new reactor arch" This reverts commit 1bfceacd9d65a720574683a7f22771e69af9af4d. * require CListMempool in mempool.Reactor * add two changelog entries * fixes after my own review * quote interfaces, structs and functions * fixes after Ismail's review * make node's mempool an interface * make InitWAL/CloseWAL methods a part of Mempool interface * fix merge conflicts * make node's mempool an interface
5 years ago
  1. package node
  2. import (
  3. "context"
  4. "fmt"
  5. "net"
  6. "os"
  7. "syscall"
  8. "testing"
  9. "time"
  10. "github.com/stretchr/testify/assert"
  11. "github.com/stretchr/testify/require"
  12. "github.com/tendermint/tendermint/abci/example/kvstore"
  13. cfg "github.com/tendermint/tendermint/config"
  14. "github.com/tendermint/tendermint/crypto/ed25519"
  15. "github.com/tendermint/tendermint/evidence"
  16. cmn "github.com/tendermint/tendermint/libs/common"
  17. "github.com/tendermint/tendermint/libs/log"
  18. mempl "github.com/tendermint/tendermint/mempool"
  19. "github.com/tendermint/tendermint/p2p"
  20. p2pmock "github.com/tendermint/tendermint/p2p/mock"
  21. "github.com/tendermint/tendermint/privval"
  22. "github.com/tendermint/tendermint/proxy"
  23. sm "github.com/tendermint/tendermint/state"
  24. "github.com/tendermint/tendermint/types"
  25. tmtime "github.com/tendermint/tendermint/types/time"
  26. "github.com/tendermint/tendermint/version"
  27. dbm "github.com/tendermint/tm-db"
  28. )
  29. func TestNodeStartStop(t *testing.T) {
  30. config := cfg.ResetTestRoot("node_node_test")
  31. defer os.RemoveAll(config.RootDir)
  32. // create & start node
  33. n, err := DefaultNewNode(config, log.TestingLogger())
  34. require.NoError(t, err)
  35. err = n.Start()
  36. require.NoError(t, err)
  37. t.Logf("Started node %v", n.sw.NodeInfo())
  38. // wait for the node to produce a block
  39. blocksSub, err := n.EventBus().Subscribe(context.Background(), "node_test", types.EventQueryNewBlock)
  40. require.NoError(t, err)
  41. select {
  42. case <-blocksSub.Out():
  43. case <-blocksSub.Cancelled():
  44. t.Fatal("blocksSub was cancelled")
  45. case <-time.After(10 * time.Second):
  46. t.Fatal("timed out waiting for the node to produce a block")
  47. }
  48. // stop the node
  49. go func() {
  50. n.Stop()
  51. }()
  52. select {
  53. case <-n.Quit():
  54. case <-time.After(5 * time.Second):
  55. pid := os.Getpid()
  56. p, err := os.FindProcess(pid)
  57. if err != nil {
  58. panic(err)
  59. }
  60. err = p.Signal(syscall.SIGABRT)
  61. fmt.Println(err)
  62. t.Fatal("timed out waiting for shutdown")
  63. }
  64. }
  65. func TestSplitAndTrimEmpty(t *testing.T) {
  66. testCases := []struct {
  67. s string
  68. sep string
  69. cutset string
  70. expected []string
  71. }{
  72. {"a,b,c", ",", " ", []string{"a", "b", "c"}},
  73. {" a , b , c ", ",", " ", []string{"a", "b", "c"}},
  74. {" a, b, c ", ",", " ", []string{"a", "b", "c"}},
  75. {" a, ", ",", " ", []string{"a"}},
  76. {" ", ",", " ", []string{}},
  77. }
  78. for _, tc := range testCases {
  79. assert.Equal(t, tc.expected, splitAndTrimEmpty(tc.s, tc.sep, tc.cutset), "%s", tc.s)
  80. }
  81. }
  82. func TestNodeDelayedStart(t *testing.T) {
  83. config := cfg.ResetTestRoot("node_delayed_start_test")
  84. defer os.RemoveAll(config.RootDir)
  85. now := tmtime.Now()
  86. // create & start node
  87. n, err := DefaultNewNode(config, log.TestingLogger())
  88. n.GenesisDoc().GenesisTime = now.Add(2 * time.Second)
  89. require.NoError(t, err)
  90. err = n.Start()
  91. require.NoError(t, err)
  92. defer n.Stop()
  93. startTime := tmtime.Now()
  94. assert.Equal(t, true, startTime.After(n.GenesisDoc().GenesisTime))
  95. }
  96. func TestNodeSetAppVersion(t *testing.T) {
  97. config := cfg.ResetTestRoot("node_app_version_test")
  98. defer os.RemoveAll(config.RootDir)
  99. // create & start node
  100. n, err := DefaultNewNode(config, log.TestingLogger())
  101. require.NoError(t, err)
  102. // default config uses the kvstore app
  103. var appVersion version.Protocol = kvstore.ProtocolVersion
  104. // check version is set in state
  105. state := sm.LoadState(n.stateDB)
  106. assert.Equal(t, state.Version.Consensus.App, appVersion)
  107. // check version is set in node info
  108. assert.Equal(t, n.nodeInfo.(p2p.DefaultNodeInfo).ProtocolVersion.App, appVersion)
  109. }
  110. func TestNodeSetPrivValTCP(t *testing.T) {
  111. addr := "tcp://" + testFreeAddr(t)
  112. config := cfg.ResetTestRoot("node_priv_val_tcp_test")
  113. defer os.RemoveAll(config.RootDir)
  114. config.BaseConfig.PrivValidatorListenAddr = addr
  115. dialer := privval.DialTCPFn(addr, 100*time.Millisecond, ed25519.GenPrivKey())
  116. pvsc := privval.NewSignerServiceEndpoint(
  117. log.TestingLogger(),
  118. config.ChainID(),
  119. types.NewMockPV(),
  120. dialer,
  121. )
  122. privval.SignerServiceEndpointTimeoutReadWrite(100 * time.Millisecond)(pvsc)
  123. go func() {
  124. err := pvsc.Start()
  125. if err != nil {
  126. panic(err)
  127. }
  128. }()
  129. defer pvsc.Stop()
  130. n, err := DefaultNewNode(config, log.TestingLogger())
  131. require.NoError(t, err)
  132. assert.IsType(t, &privval.SignerValidatorEndpoint{}, n.PrivValidator())
  133. }
  134. // address without a protocol must result in error
  135. func TestPrivValidatorListenAddrNoProtocol(t *testing.T) {
  136. addrNoPrefix := testFreeAddr(t)
  137. config := cfg.ResetTestRoot("node_priv_val_tcp_test")
  138. defer os.RemoveAll(config.RootDir)
  139. config.BaseConfig.PrivValidatorListenAddr = addrNoPrefix
  140. _, err := DefaultNewNode(config, log.TestingLogger())
  141. assert.Error(t, err)
  142. }
  143. func TestNodeSetPrivValIPC(t *testing.T) {
  144. tmpfile := "/tmp/kms." + cmn.RandStr(6) + ".sock"
  145. defer os.Remove(tmpfile) // clean up
  146. config := cfg.ResetTestRoot("node_priv_val_tcp_test")
  147. defer os.RemoveAll(config.RootDir)
  148. config.BaseConfig.PrivValidatorListenAddr = "unix://" + tmpfile
  149. dialer := privval.DialUnixFn(tmpfile)
  150. pvsc := privval.NewSignerServiceEndpoint(
  151. log.TestingLogger(),
  152. config.ChainID(),
  153. types.NewMockPV(),
  154. dialer,
  155. )
  156. privval.SignerServiceEndpointTimeoutReadWrite(100 * time.Millisecond)(pvsc)
  157. go func() {
  158. err := pvsc.Start()
  159. require.NoError(t, err)
  160. }()
  161. defer pvsc.Stop()
  162. n, err := DefaultNewNode(config, log.TestingLogger())
  163. require.NoError(t, err)
  164. assert.IsType(t, &privval.SignerValidatorEndpoint{}, n.PrivValidator())
  165. }
  166. // testFreeAddr claims a free port so we don't block on listener being ready.
  167. func testFreeAddr(t *testing.T) string {
  168. ln, err := net.Listen("tcp", "127.0.0.1:0")
  169. require.NoError(t, err)
  170. defer ln.Close()
  171. return fmt.Sprintf("127.0.0.1:%d", ln.Addr().(*net.TCPAddr).Port)
  172. }
  173. // create a proposal block using real and full
  174. // mempool and evidence pool and validate it.
  175. func TestCreateProposalBlock(t *testing.T) {
  176. config := cfg.ResetTestRoot("node_create_proposal")
  177. defer os.RemoveAll(config.RootDir)
  178. cc := proxy.NewLocalClientCreator(kvstore.NewKVStoreApplication())
  179. proxyApp := proxy.NewAppConns(cc)
  180. err := proxyApp.Start()
  181. require.Nil(t, err)
  182. defer proxyApp.Stop()
  183. logger := log.TestingLogger()
  184. var height int64 = 1
  185. state, stateDB := state(1, height)
  186. maxBytes := 16384
  187. state.ConsensusParams.Block.MaxBytes = int64(maxBytes)
  188. proposerAddr, _ := state.Validators.GetByIndex(0)
  189. // Make Mempool
  190. memplMetrics := mempl.PrometheusMetrics("node_test")
  191. mempool := mempl.NewCListMempool(
  192. config.Mempool,
  193. proxyApp.Mempool(),
  194. state.LastBlockHeight,
  195. mempl.WithMetrics(memplMetrics),
  196. mempl.WithPreCheck(sm.TxPreCheck(state)),
  197. mempl.WithPostCheck(sm.TxPostCheck(state)),
  198. )
  199. mempool.SetLogger(logger)
  200. // Make EvidencePool
  201. types.RegisterMockEvidencesGlobal() // XXX!
  202. evidence.RegisterMockEvidences()
  203. evidenceDB := dbm.NewMemDB()
  204. evidencePool := evidence.NewEvidencePool(stateDB, evidenceDB)
  205. evidencePool.SetLogger(logger)
  206. // fill the evidence pool with more evidence
  207. // than can fit in a block
  208. minEvSize := 12
  209. numEv := (maxBytes / types.MaxEvidenceBytesDenominator) / minEvSize
  210. for i := 0; i < numEv; i++ {
  211. ev := types.NewMockRandomGoodEvidence(1, proposerAddr, cmn.RandBytes(minEvSize))
  212. err := evidencePool.AddEvidence(ev)
  213. assert.NoError(t, err)
  214. }
  215. // fill the mempool with more txs
  216. // than can fit in a block
  217. txLength := 1000
  218. for i := 0; i < maxBytes/txLength; i++ {
  219. tx := cmn.RandBytes(txLength)
  220. err := mempool.CheckTx(tx, nil)
  221. assert.NoError(t, err)
  222. }
  223. blockExec := sm.NewBlockExecutor(
  224. stateDB,
  225. logger,
  226. proxyApp.Consensus(),
  227. mempool,
  228. evidencePool,
  229. )
  230. commit := types.NewCommit(types.BlockID{}, nil)
  231. block, _ := blockExec.CreateProposalBlock(
  232. height,
  233. state, commit,
  234. proposerAddr,
  235. )
  236. err = blockExec.ValidateBlock(state, block)
  237. assert.NoError(t, err)
  238. }
  239. func TestNodeNewNodeCustomReactors(t *testing.T) {
  240. config := cfg.ResetTestRoot("node_new_node_custom_reactors_test")
  241. defer os.RemoveAll(config.RootDir)
  242. cr := p2pmock.NewReactor()
  243. customBlockchainReactor := p2pmock.NewReactor()
  244. nodeKey, err := p2p.LoadOrGenNodeKey(config.NodeKeyFile())
  245. require.NoError(t, err)
  246. n, err := NewNode(config,
  247. privval.LoadOrGenFilePV(config.PrivValidatorKeyFile(), config.PrivValidatorStateFile()),
  248. nodeKey,
  249. proxy.DefaultClientCreator(config.ProxyApp, config.ABCI, config.DBDir()),
  250. DefaultGenesisDocProviderFunc(config),
  251. DefaultDBProvider,
  252. DefaultMetricsProvider(config.Instrumentation),
  253. log.TestingLogger(),
  254. CustomReactors(map[string]p2p.Reactor{"FOO": cr, "BLOCKCHAIN": customBlockchainReactor}),
  255. )
  256. require.NoError(t, err)
  257. err = n.Start()
  258. require.NoError(t, err)
  259. defer n.Stop()
  260. assert.True(t, cr.IsRunning())
  261. assert.Equal(t, cr, n.Switch().Reactor("FOO"))
  262. assert.True(t, customBlockchainReactor.IsRunning())
  263. assert.Equal(t, customBlockchainReactor, n.Switch().Reactor("BLOCKCHAIN"))
  264. }
  265. func state(nVals int, height int64) (sm.State, dbm.DB) {
  266. vals := make([]types.GenesisValidator, nVals)
  267. for i := 0; i < nVals; i++ {
  268. secret := []byte(fmt.Sprintf("test%d", i))
  269. pk := ed25519.GenPrivKeyFromSecret(secret)
  270. vals[i] = types.GenesisValidator{
  271. Address: pk.PubKey().Address(),
  272. PubKey: pk.PubKey(),
  273. Power: 1000,
  274. Name: fmt.Sprintf("test%d", i),
  275. }
  276. }
  277. s, _ := sm.MakeGenesisState(&types.GenesisDoc{
  278. ChainID: "test-chain",
  279. Validators: vals,
  280. AppHash: nil,
  281. })
  282. // save validators to db for 2 heights
  283. stateDB := dbm.NewMemDB()
  284. sm.SaveState(stateDB, s)
  285. for i := 1; i < int(height); i++ {
  286. s.LastBlockHeight++
  287. s.LastValidators = s.Validators.Copy()
  288. sm.SaveState(stateDB, s)
  289. }
  290. return s, stateDB
  291. }