Browse Source

light: correctly handle contexts (backport -> v0.34.x) (#6685)

pull/6717/head
Callum Waters 3 years ago
committed by GitHub
parent
commit
2c2f511f24
No known key found for this signature in database GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 86 additions and 61 deletions
  1. +3
    -0
      CHANGELOG_PENDING.md
  2. +3
    -0
      light/client.go
  3. +51
    -39
      light/provider/http/http.go
  4. +22
    -20
      light/provider/http/http_test.go
  5. +1
    -1
      rpc/test/helpers.go
  6. +1
    -0
      statesync/reactor.go
  7. +5
    -1
      statesync/syncer.go

+ 3
- 0
CHANGELOG_PENDING.md View File

@ -24,3 +24,6 @@ Friendly reminder, we have a [bug bounty program](https://hackerone.com/tendermi
### BUG FIXES
- [light] [\#6685](https://github.com/tendermint/tendermint/pull/6685) fix bug
with incorrecly handling contexts which occasionally froze state sync.

+ 3
- 0
light/client.go View File

@ -996,6 +996,9 @@ func (c *Client) lightBlockFromPrimary(ctx context.Context, height int64) (*type
// Everything went smoothly. We reset the lightBlockRequests and return the light block
return l, nil
case context.Canceled, context.DeadlineExceeded:
return l, err
case provider.ErrNoResponse, provider.ErrLightBlockNotFound, provider.ErrHeightTooHigh:
// we find a new witness to replace the primary
c.logger.Debug("error from light block request from primary, replacing...",


+ 51
- 39
light/provider/http/http.go View File

@ -18,8 +18,9 @@ var (
// This is very brittle, see: https://github.com/tendermint/tendermint/issues/4740
regexpMissingHeight = regexp.MustCompile(`height \d+ is not available`)
regexpTooHigh = regexp.MustCompile(`height \d+ must be less than or equal to`)
regexpTimedOut = regexp.MustCompile(`Timeout exceeded`)
maxRetryAttempts = 10
maxRetryAttempts = 5
timeout uint = 5 // sec.
)
@ -119,45 +120,51 @@ func (p *http) validatorSet(ctx context.Context, height *int64) (*types.Validato
total = -1
)
OUTER_LOOP:
for len(vals) != total && page <= maxPages {
for attempt := 1; attempt <= maxRetryAttempts; attempt++ {
res, err := p.client.Validators(ctx, height, &page, &perPage)
if err != nil {
// TODO: standardize errors on the RPC side
if regexpTooHigh.MatchString(err.Error()) {
return nil, provider.ErrHeightTooHigh
switch {
case err == nil:
// Validate response.
if len(res.Validators) == 0 {
return nil, provider.ErrBadLightBlock{
Reason: fmt.Errorf("validator set is empty (height: %d, page: %d, per_page: %d)",
height, page, perPage),
}
}
if regexpMissingHeight.MatchString(err.Error()) {
return nil, provider.ErrLightBlockNotFound
}
// if we have exceeded retry attempts then return no response error
if attempt == maxRetryAttempts {
return nil, provider.ErrNoResponse
if res.Total <= 0 {
return nil, provider.ErrBadLightBlock{
Reason: fmt.Errorf("total number of vals is <= 0: %d (height: %d, page: %d, per_page: %d)",
res.Total, height, page, perPage),
}
}
// else we wait and try again with exponential backoff
total = res.Total
vals = append(vals, res.Validators...)
page++
continue OUTER_LOOP
case regexpTooHigh.MatchString(err.Error()):
return nil, provider.ErrHeightTooHigh
case regexpMissingHeight.MatchString(err.Error()):
return nil, provider.ErrLightBlockNotFound
// if we have exceeded retry attempts then return no response error
case attempt == maxRetryAttempts:
return nil, provider.ErrNoResponse
case regexpTimedOut.MatchString(err.Error()):
// we wait and try again with exponential backoff
time.Sleep(backoffTimeout(uint16(attempt)))
continue
}
// Validate response.
if len(res.Validators) == 0 {
return nil, provider.ErrBadLightBlock{
Reason: fmt.Errorf("validator set is empty (height: %d, page: %d, per_page: %d)",
height, page, perPage),
}
}
if res.Total <= 0 {
return nil, provider.ErrBadLightBlock{
Reason: fmt.Errorf("total number of vals is <= 0: %d (height: %d, page: %d, per_page: %d)",
res.Total, height, page, perPage),
}
// context canceled or connection refused we return the error
default:
return nil, err
}
total = res.Total
vals = append(vals, res.Validators...)
page++
break
}
}
@ -171,20 +178,25 @@ func (p *http) validatorSet(ctx context.Context, height *int64) (*types.Validato
func (p *http) signedHeader(ctx context.Context, height *int64) (*types.SignedHeader, error) {
for attempt := 1; attempt <= maxRetryAttempts; attempt++ {
commit, err := p.client.Commit(ctx, height)
if err != nil {
// TODO: standardize errors on the RPC side
if regexpTooHigh.MatchString(err.Error()) {
return nil, provider.ErrHeightTooHigh
}
switch {
case err == nil:
return &commit.SignedHeader, nil
if regexpMissingHeight.MatchString(err.Error()) {
return nil, provider.ErrLightBlockNotFound
}
case regexpTooHigh.MatchString(err.Error()):
return nil, provider.ErrHeightTooHigh
case regexpMissingHeight.MatchString(err.Error()):
return nil, provider.ErrLightBlockNotFound
case regexpTimedOut.MatchString(err.Error()):
// we wait and try again with exponential backoff
time.Sleep(backoffTimeout(uint16(attempt)))
continue
// either context was cancelled or connection refused.
default:
return nil, err
}
return &commit.SignedHeader, nil
}
return nil, provider.ErrNoResponse
}


+ 22
- 20
light/provider/http/http_test.go View File

@ -5,6 +5,7 @@ import (
"fmt"
"os"
"testing"
"time"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
@ -32,27 +33,17 @@ func TestNewProvider(t *testing.T) {
require.Equal(t, fmt.Sprintf("%s", c), "http{http://153.200.0.1}")
}
func TestMain(m *testing.M) {
func TestProvider(t *testing.T) {
app := kvstore.NewApplication()
app.RetainBlocks = 9
app.RetainBlocks = 10
node := rpctest.StartTendermint(app)
code := m.Run()
rpctest.StopTendermint(node)
os.Exit(code)
}
func TestProvider(t *testing.T) {
cfg := rpctest.GetConfig()
defer os.RemoveAll(cfg.RootDir)
rpcAddr := cfg.RPC.ListenAddress
genDoc, err := types.GenesisDocFromFile(cfg.GenesisFile())
if err != nil {
panic(err)
}
require.NoError(t, err)
chainID := genDoc.ChainID
t.Log("chainID:", chainID)
c, err := rpchttp.New(rpcAddr, "/websocket")
require.Nil(t, err)
@ -66,26 +57,37 @@ func TestProvider(t *testing.T) {
require.NoError(t, err)
// let's get the highest block
sh, err := p.LightBlock(context.Background(), 0)
lb, err := p.LightBlock(context.Background(), 0)
require.NoError(t, err)
assert.True(t, sh.Height < 1000)
require.NotNil(t, lb)
assert.True(t, lb.Height < 1000)
// let's check this is valid somehow
assert.Nil(t, sh.ValidateBasic(chainID))
assert.Nil(t, lb.ValidateBasic(chainID))
// historical queries now work :)
lower := sh.Height - 3
sh, err = p.LightBlock(context.Background(), lower)
lower := lb.Height - 3
lb, err = p.LightBlock(context.Background(), lower)
require.NoError(t, err)
assert.Equal(t, lower, sh.Height)
assert.Equal(t, lower, lb.Height)
// fetching missing heights (both future and pruned) should return appropriate errors
lb, err := p.LightBlock(context.Background(), 1000)
lb, err = p.LightBlock(context.Background(), 1000)
require.Error(t, err)
require.Nil(t, lb)
assert.Equal(t, provider.ErrHeightTooHigh, err)
_, err = p.LightBlock(context.Background(), 1)
require.Error(t, err)
require.Nil(t, lb)
assert.Equal(t, provider.ErrLightBlockNotFound, err)
// stop the full node and check that a no response error is returned
rpctest.StopTendermint(node)
time.Sleep(10 * time.Second)
lb, err = p.LightBlock(context.Background(), lower+2)
// we should see a connection refused
require.Error(t, err)
require.Contains(t, err.Error(), "connection refused")
require.Nil(t, lb)
}

+ 1
- 1
rpc/test/helpers.go View File

@ -142,7 +142,7 @@ func StartTendermint(app abci.Application, opts ...func(*Options)) *nm.Node {
// cleans up test/config files.
func StopTendermint(node *nm.Node) {
if err := node.Stop(); err != nil {
node.Logger.Error("Error when tryint to stop node", "err", err)
node.Logger.Error("Error when trying to stop node", "err", err)
}
node.Wait()
os.RemoveAll(node.Config().RootDir)


+ 1
- 0
statesync/reactor.go View File

@ -154,6 +154,7 @@ func (r *Reactor) Receive(chID byte, src p2p.Peer, msgBytes []byte) {
Hash: msg.Hash,
Metadata: msg.Metadata,
})
// TODO: We may want to consider punishing the peer for certain errors
if err != nil {
r.Logger.Error("Failed to add snapshot", "height", msg.Height, "format", msg.Format,
"peer", src.ID(), "err", err)


+ 5
- 1
statesync/syncer.go View File

@ -213,6 +213,10 @@ func (s *syncer) SyncAny(discoveryTime time.Duration, retryHook func()) (sm.Stat
s.logger.Info("Snapshot sender rejected", "peer", peer.ID())
}
case errors.Is(err, context.DeadlineExceeded):
s.logger.Info("Timed out validating snapshot, rejecting", "height", snapshot.Height, "err", err)
s.snapshots.Reject(snapshot)
default:
return sm.State{}, nil, fmt.Errorf("snapshot restoration failed: %w", err)
}
@ -256,7 +260,7 @@ func (s *syncer) Sync(snapshot *snapshot, chunks *chunkQueue) (sm.State, *types.
go s.fetchChunks(ctx, snapshot, chunks)
}
pctx, pcancel := context.WithTimeout(context.Background(), 10*time.Second)
pctx, pcancel := context.WithTimeout(context.Background(), 15*time.Second)
defer pcancel()
// Optimistically build new state, so we don't discover any light client failures at the end.


Loading…
Cancel
Save