From 2afae13a4859f643e0642c6eb249f05e8e7467d5 Mon Sep 17 00:00:00 2001 From: Anton Kaliaev Date: Mon, 11 May 2020 17:42:19 +0400 Subject: [PATCH] privval: retry GetPubKey/SignVote/SignProposal N times before returning an error Closes #4707 --- CHANGELOG.md | 9 +++- CHANGELOG_PENDING.md | 1 + node/node.go | 18 ++++++-- node/node_test.go | 4 +- privval/doc.go | 6 +++ privval/retry_signer_client.go | 83 ++++++++++++++++++++++++++++++++++ 6 files changed, 114 insertions(+), 7 deletions(-) create mode 100644 privval/retry_signer_client.go diff --git a/CHANGELOG.md b/CHANGELOG.md index e3f7df149..085ae1108 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -42,7 +42,6 @@ Friendly reminder, we have a [bug bounty program](https://hackerone.com/tendermi ### BUG FIXES: - [rpc] [\#4568](https://github.com/tendermint/tendermint/issues/4568) Fix panic when `Subscribe` is called, but HTTP client is not running. `Subscribe`, `Unsubscribe(All)` methods return an error now (@melekes). - ## v0.33.3 @@ -339,6 +338,12 @@ subjectivity interface. Refer to the [spec](https://github.com/tendermint/spec/b - [consensus/types] [\#4243](https://github.com/tendermint/tendermint/issues/4243) fix BenchmarkRoundStateDeepCopy panics (@cuonglm) - [rpc] [\#4256](https://github.com/tendermint/tendermint/issues/4256) Pass `outCapacity` to `eventBus#Subscribe` when subscribing using a local client +## v0.32.11 + +### BUG FIXES: + +- [privval] [\#4275](https://github.com/tendermint/tendermint/issues/4275) Fix consensus failure when remote signer drops (@melekes) + ## v0.32.10 *April 6, 2020* @@ -421,7 +426,7 @@ program](https://hackerone.com/tendermint). ### BUG FIXES: -- [rpc/lib] [\#4051](https://github.com/tendermint/tendermint/pull/4131) Fix RPC client, which was previously resolving https protocol to http (@yenkhoon) +- [rpc/lib] [\#4131](https://github.com/tendermint/tendermint/pull/4131) Fix RPC client, which was previously resolving https protocol to http (@yenkhoon) - [cs] [\#4069](https://github.com/tendermint/tendermint/issues/4069) Don't panic when block meta is not found in store (@gregzaitsev) ## v0.32.8 diff --git a/CHANGELOG_PENDING.md b/CHANGELOG_PENDING.md index 3ccf61fb1..cc149f3c8 100644 --- a/CHANGELOG_PENDING.md +++ b/CHANGELOG_PENDING.md @@ -60,3 +60,4 @@ Friendly reminder, we have a [bug bounty program](https://hackerone.com/tendermi - [light] [\#4741](https://github.com/tendermint/tendermint/pull/4741) Correctly return `ErrSignedHeaderNotFound` and `ErrValidatorSetNotFound` on corresponding RPC errors (@erikgrinaker) - [rpc] \#4805 Attempt to handle panics during panic recovery (@erikgrinaker) - [types] [\#4764](https://github.com/tendermint/tendermint/pull/4764) Return an error if voting power overflows in `VerifyCommitTrusting` (@melekes) +- [privval] [\#4812](https://github.com/tendermint/tendermint/pull/4812) Retry `GetPubKey/SignVote/SignProposal` a few times before returning an error (@melekes) diff --git a/node/node.go b/node/node.go index 97eef3d6b..686515f24 100644 --- a/node/node.go +++ b/node/node.go @@ -1310,15 +1310,27 @@ func createAndStartPrivValidatorSocketClient( ) (types.PrivValidator, error) { pve, err := privval.NewSignerListener(listenAddr, logger) if err != nil { - return nil, errors.Wrap(err, "failed to start private validator") + return nil, fmt.Errorf("failed to start private validator: %w", err) } pvsc, err := privval.NewSignerClient(pve) if err != nil { - return nil, errors.Wrap(err, "failed to start private validator") + return nil, fmt.Errorf("failed to start private validator: %w", err) } - return pvsc, nil + // try to get a pubkey from private validate first time + _, err = pvsc.GetPubKey() + if err != nil { + return nil, fmt.Errorf("can't get pubkey: %w", err) + } + + const ( + retries = 50 // 50 * 100ms = 5s total + timeout = 100 * time.Millisecond + ) + pvscWithRetries := privval.NewRetrySignerClient(pvsc, retries, timeout) + + return pvscWithRetries, nil } // splitAndTrimEmpty slices s into all subslices separated by sep and returns a diff --git a/node/node_test.go b/node/node_test.go index cb8f62752..d8f20fe9c 100644 --- a/node/node_test.go +++ b/node/node_test.go @@ -160,7 +160,7 @@ func TestNodeSetPrivValTCP(t *testing.T) { n, err := DefaultNewNode(config, log.TestingLogger()) require.NoError(t, err) - assert.IsType(t, &privval.SignerClient{}, n.PrivValidator()) + assert.IsType(t, &privval.RetrySignerClient{}, n.PrivValidator()) } // address without a protocol must result in error @@ -204,7 +204,7 @@ func TestNodeSetPrivValIPC(t *testing.T) { n, err := DefaultNewNode(config, log.TestingLogger()) require.NoError(t, err) - assert.IsType(t, &privval.SignerClient{}, n.PrivValidator()) + assert.IsType(t, &privval.RetrySignerClient{}, n.PrivValidator()) } // testFreeAddr claims a free port so we don't block on listener being ready. diff --git a/privval/doc.go b/privval/doc.go index 668e5ebc4..7695ffe9d 100644 --- a/privval/doc.go +++ b/privval/doc.go @@ -19,5 +19,11 @@ SignerDialerEndpoint SignerDialerEndpoint is a simple wrapper around a net.Conn. It's used by both IPCVal and TCPVal. +SignerClient + +SignerClient handles remote validator connections that provide signing services. +In production, it's recommended to wrap it with RetrySignerClient to avoid +termination in case of temporary errors. + */ package privval diff --git a/privval/retry_signer_client.go b/privval/retry_signer_client.go new file mode 100644 index 000000000..d2cde8741 --- /dev/null +++ b/privval/retry_signer_client.go @@ -0,0 +1,83 @@ +package privval + +import ( + "fmt" + "time" + + "github.com/tendermint/tendermint/crypto" + "github.com/tendermint/tendermint/types" +) + +// RetrySignerClient wraps SignerClient adding retry for each operation (except +// Ping) w/ a timeout. +type RetrySignerClient struct { + next *SignerClient + retries int + timeout time.Duration +} + +// NewRetrySignerClient returns RetrySignerClient. If +retries+ is 0, the +// client will be retrying each operation indefinitely. +func NewRetrySignerClient(sc *SignerClient, retries int, timeout time.Duration) *RetrySignerClient { + return &RetrySignerClient{sc, retries, timeout} +} + +var _ types.PrivValidator = (*RetrySignerClient)(nil) + +func (sc *RetrySignerClient) Close() error { + return sc.next.Close() +} + +func (sc *RetrySignerClient) IsConnected() bool { + return sc.next.IsConnected() +} + +func (sc *RetrySignerClient) WaitForConnection(maxWait time.Duration) error { + return sc.next.WaitForConnection(maxWait) +} + +//-------------------------------------------------------- +// Implement PrivValidator + +func (sc *RetrySignerClient) Ping() error { + return sc.next.Ping() +} + +func (sc *RetrySignerClient) GetPubKey() (crypto.PubKey, error) { + var ( + pk crypto.PubKey + err error + ) + for i := 0; i < sc.retries || sc.retries == 0; i++ { + pk, err = sc.next.GetPubKey() + if err == nil { + return pk, nil + } + time.Sleep(sc.timeout) + } + return nil, fmt.Errorf("exhausted all attempts to get pubkey: %w", err) +} + +func (sc *RetrySignerClient) SignVote(chainID string, vote *types.Vote) error { + var err error + for i := 0; i < sc.retries || sc.retries == 0; i++ { + err = sc.next.SignVote(chainID, vote) + if err == nil { + return nil + } + time.Sleep(sc.timeout) + } + return fmt.Errorf("exhausted all attempts to sign vote: %w", err) +} + +func (sc *RetrySignerClient) SignProposal(chainID string, proposal *types.Proposal) error { + var err error + for i := 0; i < sc.retries || sc.retries == 0; i++ { + err = sc.next.SignProposal(chainID, proposal) + if err == nil { + return nil + } + time.Sleep(sc.timeout) + } + return fmt.Errorf("exhausted all attempts to sign proposal: %w", err) +}