diff --git a/CHANGELOG_PENDING.md b/CHANGELOG_PENDING.md index 125d6f12a..f273e57a7 100644 --- a/CHANGELOG_PENDING.md +++ b/CHANGELOG_PENDING.md @@ -24,3 +24,6 @@ Friendly reminder, we have a [bug bounty program](https://hackerone.com/tendermi ### BUG FIXES +- [light] [\#6685](https://github.com/tendermint/tendermint/pull/6685) fix bug + with incorrecly handling contexts which occasionally froze state sync. + diff --git a/light/client.go b/light/client.go index ef7fc1ba3..92b6edcf8 100644 --- a/light/client.go +++ b/light/client.go @@ -996,6 +996,9 @@ func (c *Client) lightBlockFromPrimary(ctx context.Context, height int64) (*type // Everything went smoothly. We reset the lightBlockRequests and return the light block return l, nil + case context.Canceled, context.DeadlineExceeded: + return l, err + case provider.ErrNoResponse, provider.ErrLightBlockNotFound, provider.ErrHeightTooHigh: // we find a new witness to replace the primary c.logger.Debug("error from light block request from primary, replacing...", diff --git a/light/provider/http/http.go b/light/provider/http/http.go index e0abbb41f..665fcbe70 100644 --- a/light/provider/http/http.go +++ b/light/provider/http/http.go @@ -18,8 +18,9 @@ var ( // This is very brittle, see: https://github.com/tendermint/tendermint/issues/4740 regexpMissingHeight = regexp.MustCompile(`height \d+ is not available`) regexpTooHigh = regexp.MustCompile(`height \d+ must be less than or equal to`) + regexpTimedOut = regexp.MustCompile(`Timeout exceeded`) - maxRetryAttempts = 10 + maxRetryAttempts = 5 timeout uint = 5 // sec. ) @@ -119,45 +120,51 @@ func (p *http) validatorSet(ctx context.Context, height *int64) (*types.Validato total = -1 ) +OUTER_LOOP: for len(vals) != total && page <= maxPages { for attempt := 1; attempt <= maxRetryAttempts; attempt++ { res, err := p.client.Validators(ctx, height, &page, &perPage) - if err != nil { - // TODO: standardize errors on the RPC side - if regexpTooHigh.MatchString(err.Error()) { - return nil, provider.ErrHeightTooHigh + switch { + case err == nil: + // Validate response. + if len(res.Validators) == 0 { + return nil, provider.ErrBadLightBlock{ + Reason: fmt.Errorf("validator set is empty (height: %d, page: %d, per_page: %d)", + height, page, perPage), + } } - - if regexpMissingHeight.MatchString(err.Error()) { - return nil, provider.ErrLightBlockNotFound - } - // if we have exceeded retry attempts then return no response error - if attempt == maxRetryAttempts { - return nil, provider.ErrNoResponse + if res.Total <= 0 { + return nil, provider.ErrBadLightBlock{ + Reason: fmt.Errorf("total number of vals is <= 0: %d (height: %d, page: %d, per_page: %d)", + res.Total, height, page, perPage), + } } - // else we wait and try again with exponential backoff + + total = res.Total + vals = append(vals, res.Validators...) + page++ + continue OUTER_LOOP + + case regexpTooHigh.MatchString(err.Error()): + return nil, provider.ErrHeightTooHigh + + case regexpMissingHeight.MatchString(err.Error()): + return nil, provider.ErrLightBlockNotFound + + // if we have exceeded retry attempts then return no response error + case attempt == maxRetryAttempts: + return nil, provider.ErrNoResponse + + case regexpTimedOut.MatchString(err.Error()): + // we wait and try again with exponential backoff time.Sleep(backoffTimeout(uint16(attempt))) continue - } - // Validate response. - if len(res.Validators) == 0 { - return nil, provider.ErrBadLightBlock{ - Reason: fmt.Errorf("validator set is empty (height: %d, page: %d, per_page: %d)", - height, page, perPage), - } - } - if res.Total <= 0 { - return nil, provider.ErrBadLightBlock{ - Reason: fmt.Errorf("total number of vals is <= 0: %d (height: %d, page: %d, per_page: %d)", - res.Total, height, page, perPage), - } + // context canceled or connection refused we return the error + default: + return nil, err } - total = res.Total - vals = append(vals, res.Validators...) - page++ - break } } @@ -171,20 +178,25 @@ func (p *http) validatorSet(ctx context.Context, height *int64) (*types.Validato func (p *http) signedHeader(ctx context.Context, height *int64) (*types.SignedHeader, error) { for attempt := 1; attempt <= maxRetryAttempts; attempt++ { commit, err := p.client.Commit(ctx, height) - if err != nil { - // TODO: standardize errors on the RPC side - if regexpTooHigh.MatchString(err.Error()) { - return nil, provider.ErrHeightTooHigh - } + switch { + case err == nil: + return &commit.SignedHeader, nil - if regexpMissingHeight.MatchString(err.Error()) { - return nil, provider.ErrLightBlockNotFound - } + case regexpTooHigh.MatchString(err.Error()): + return nil, provider.ErrHeightTooHigh + + case regexpMissingHeight.MatchString(err.Error()): + return nil, provider.ErrLightBlockNotFound + + case regexpTimedOut.MatchString(err.Error()): // we wait and try again with exponential backoff time.Sleep(backoffTimeout(uint16(attempt))) continue + + // either context was cancelled or connection refused. + default: + return nil, err } - return &commit.SignedHeader, nil } return nil, provider.ErrNoResponse } diff --git a/light/provider/http/http_test.go b/light/provider/http/http_test.go index 1a0beb339..36be4d281 100644 --- a/light/provider/http/http_test.go +++ b/light/provider/http/http_test.go @@ -5,6 +5,7 @@ import ( "fmt" "os" "testing" + "time" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" @@ -32,27 +33,17 @@ func TestNewProvider(t *testing.T) { require.Equal(t, fmt.Sprintf("%s", c), "http{http://153.200.0.1}") } -func TestMain(m *testing.M) { +func TestProvider(t *testing.T) { app := kvstore.NewApplication() - app.RetainBlocks = 9 + app.RetainBlocks = 10 node := rpctest.StartTendermint(app) - code := m.Run() - - rpctest.StopTendermint(node) - os.Exit(code) -} - -func TestProvider(t *testing.T) { cfg := rpctest.GetConfig() defer os.RemoveAll(cfg.RootDir) rpcAddr := cfg.RPC.ListenAddress genDoc, err := types.GenesisDocFromFile(cfg.GenesisFile()) - if err != nil { - panic(err) - } + require.NoError(t, err) chainID := genDoc.ChainID - t.Log("chainID:", chainID) c, err := rpchttp.New(rpcAddr, "/websocket") require.Nil(t, err) @@ -66,26 +57,37 @@ func TestProvider(t *testing.T) { require.NoError(t, err) // let's get the highest block - sh, err := p.LightBlock(context.Background(), 0) + lb, err := p.LightBlock(context.Background(), 0) require.NoError(t, err) - assert.True(t, sh.Height < 1000) + require.NotNil(t, lb) + assert.True(t, lb.Height < 1000) // let's check this is valid somehow - assert.Nil(t, sh.ValidateBasic(chainID)) + assert.Nil(t, lb.ValidateBasic(chainID)) // historical queries now work :) - lower := sh.Height - 3 - sh, err = p.LightBlock(context.Background(), lower) + lower := lb.Height - 3 + lb, err = p.LightBlock(context.Background(), lower) require.NoError(t, err) - assert.Equal(t, lower, sh.Height) + assert.Equal(t, lower, lb.Height) // fetching missing heights (both future and pruned) should return appropriate errors - lb, err := p.LightBlock(context.Background(), 1000) + lb, err = p.LightBlock(context.Background(), 1000) require.Error(t, err) require.Nil(t, lb) assert.Equal(t, provider.ErrHeightTooHigh, err) _, err = p.LightBlock(context.Background(), 1) require.Error(t, err) + require.Nil(t, lb) assert.Equal(t, provider.ErrLightBlockNotFound, err) + + // stop the full node and check that a no response error is returned + rpctest.StopTendermint(node) + time.Sleep(10 * time.Second) + lb, err = p.LightBlock(context.Background(), lower+2) + // we should see a connection refused + require.Error(t, err) + require.Contains(t, err.Error(), "connection refused") + require.Nil(t, lb) } diff --git a/rpc/test/helpers.go b/rpc/test/helpers.go index 19099d5e5..ebf2e14d5 100644 --- a/rpc/test/helpers.go +++ b/rpc/test/helpers.go @@ -142,7 +142,7 @@ func StartTendermint(app abci.Application, opts ...func(*Options)) *nm.Node { // cleans up test/config files. func StopTendermint(node *nm.Node) { if err := node.Stop(); err != nil { - node.Logger.Error("Error when tryint to stop node", "err", err) + node.Logger.Error("Error when trying to stop node", "err", err) } node.Wait() os.RemoveAll(node.Config().RootDir) diff --git a/statesync/reactor.go b/statesync/reactor.go index 427fe6653..8434b6adf 100644 --- a/statesync/reactor.go +++ b/statesync/reactor.go @@ -154,6 +154,7 @@ func (r *Reactor) Receive(chID byte, src p2p.Peer, msgBytes []byte) { Hash: msg.Hash, Metadata: msg.Metadata, }) + // TODO: We may want to consider punishing the peer for certain errors if err != nil { r.Logger.Error("Failed to add snapshot", "height", msg.Height, "format", msg.Format, "peer", src.ID(), "err", err) diff --git a/statesync/syncer.go b/statesync/syncer.go index 9cf7f99ec..3a3050ebf 100644 --- a/statesync/syncer.go +++ b/statesync/syncer.go @@ -213,6 +213,10 @@ func (s *syncer) SyncAny(discoveryTime time.Duration, retryHook func()) (sm.Stat s.logger.Info("Snapshot sender rejected", "peer", peer.ID()) } + case errors.Is(err, context.DeadlineExceeded): + s.logger.Info("Timed out validating snapshot, rejecting", "height", snapshot.Height, "err", err) + s.snapshots.Reject(snapshot) + default: return sm.State{}, nil, fmt.Errorf("snapshot restoration failed: %w", err) } @@ -256,7 +260,7 @@ func (s *syncer) Sync(snapshot *snapshot, chunks *chunkQueue) (sm.State, *types. go s.fetchChunks(ctx, snapshot, chunks) } - pctx, pcancel := context.WithTimeout(context.Background(), 10*time.Second) + pctx, pcancel := context.WithTimeout(context.Background(), 15*time.Second) defer pcancel() // Optimistically build new state, so we don't discover any light client failures at the end.