From 37d36cd5bcda2b4fe0902ed644472c71718b4243 Mon Sep 17 00:00:00 2001 From: Callum Waters Date: Mon, 1 Mar 2021 12:04:02 +0100 Subject: [PATCH] light: improve provider handling (#6053) Introduces heuristics that track the amount of no responses or unavailable blocks a provider has for more robust provider handling by the light client. Use concurrent calls to all witnesses when a new primary is needed. --- light/client.go | 299 ++++++++++++++++++++----------- light/client_test.go | 8 +- light/detector.go | 44 ++--- light/detector_test.go | 2 +- light/provider/errors.go | 17 +- light/provider/http/http.go | 95 +++++++--- light/provider/http/http_test.go | 22 ++- light/provider/mock/deadmock.go | 7 +- light/verifier.go | 2 +- 9 files changed, 326 insertions(+), 170 deletions(-) diff --git a/light/client.go b/light/client.go index 13501e9fb..08dae678d 100644 --- a/light/client.go +++ b/light/client.go @@ -6,6 +6,7 @@ import ( "errors" "fmt" "sort" + "sync" "time" "github.com/tendermint/tendermint/libs/log" @@ -126,8 +127,10 @@ type Client struct { pruningSize uint16 // See ConfirmationFunction option confirmationFn func(action string) bool - - quit chan struct{} + // The light client keeps track of how many times it has requested a light + // block from it's providers. When this exceeds the amount of witnesses the + // light client will just return the last error sent by the providers + // repeatRequests uint16 logger log.Logger } @@ -161,14 +164,14 @@ func NewClient( } if c.latestTrustedBlock != nil { - c.logger.Info("Checking trusted light block using options") + c.logger.Info("checking trusted light block using options") if err := c.checkTrustedHeaderUsingOptions(ctx, trustOptions); err != nil { return nil, err } } if c.latestTrustedBlock == nil || c.latestTrustedBlock.Height < trustOptions.Height { - c.logger.Info("Downloading trusted light block using options") + c.logger.Info("downloading trusted light block using options") if err := c.initializeWithTrustOptions(ctx, trustOptions); err != nil { return nil, err } @@ -199,7 +202,6 @@ func NewClientFromTrustedStore( trustedStore: trustedStore, pruningSize: defaultPruningSize, confirmationFn: func(action string) bool { return true }, - quit: make(chan struct{}), logger: log.NewNopLogger(), } @@ -237,7 +239,7 @@ func (c *Client) restoreTrustedLightBlock() error { return fmt.Errorf("can't get last trusted light block: %w", err) } c.latestTrustedBlock = trustedBlock - c.logger.Info("Restored trusted light block", "height", lastHeight) + c.logger.Info("restored trusted light block", "height", lastHeight) } return nil @@ -273,7 +275,7 @@ func (c *Client) checkTrustedHeaderUsingOptions(ctx context.Context, options Tru case options.Height == c.latestTrustedBlock.Height: primaryHash = options.Hash case options.Height < c.latestTrustedBlock.Height: - c.logger.Info("Client initialized with old header (trusted is more recent)", + c.logger.Info("client initialized with old header (trusted is more recent)", "old", options.Height, "trustedHeight", c.latestTrustedBlock.Height, "trustedHash", c.latestTrustedBlock.Hash()) @@ -299,11 +301,11 @@ func (c *Client) checkTrustedHeaderUsingOptions(ctx context.Context, options Tru } if !bytes.Equal(primaryHash, c.latestTrustedBlock.Hash()) { - c.logger.Info("Prev. trusted header's hash (h1) doesn't match hash from primary provider (h2)", + c.logger.Info("previous trusted header's hash (h1) doesn't match hash from primary provider (h2)", "h1", c.latestTrustedBlock.Hash(), "h2", primaryHash) action := fmt.Sprintf( - "Prev. trusted header's hash %X doesn't match hash %X from primary provider. Remove all the stored light blocks?", + "Previous trusted header's hash %X doesn't match hash %X from primary provider. Remove all the stored light blocks?", c.latestTrustedBlock.Hash(), primaryHash) if c.confirmationFn(action) { err := c.Cleanup() @@ -415,7 +417,7 @@ func (c *Client) Update(ctx context.Context, now time.Time) (*types.LightBlock, if err != nil { return nil, err } - c.logger.Info("Advanced to new state", "height", latestBlock.Height, "hash", latestBlock.Hash()) + c.logger.Info("advanced to new state", "height", latestBlock.Height, "hash", latestBlock.Hash()) return latestBlock, nil } @@ -440,7 +442,7 @@ func (c *Client) VerifyLightBlockAtHeight(ctx context.Context, height int64, now // Check if the light block is already verified. h, err := c.TrustedLightBlock(height) if err == nil { - c.logger.Info("Header has already been verified", "height", height, "hash", h.Hash()) + c.logger.Debug("header has already been verified", "height", height, "hash", h.Hash()) // Return already trusted light block return h, nil } @@ -497,7 +499,7 @@ func (c *Client) VerifyHeader(ctx context.Context, newHeader *types.Header, now if !bytes.Equal(l.Hash(), newHeader.Hash()) { return fmt.Errorf("existing trusted header %X does not match newHeader %X", l.Hash(), newHeader.Hash()) } - c.logger.Info("Header has already been verified", + c.logger.Debug("header has already been verified", "height", newHeader.Height, "hash", newHeader.Hash()) return nil } @@ -516,7 +518,7 @@ func (c *Client) VerifyHeader(ctx context.Context, newHeader *types.Header, now } func (c *Client) verifyLightBlock(ctx context.Context, newLightBlock *types.LightBlock, now time.Time) error { - c.logger.Info("VerifyHeader", "height", newLightBlock.Height, "hash", newLightBlock.Hash()) + c.logger.Info("verify light block", "height", newLightBlock.Height, "hash", newLightBlock.Hash()) var ( verifyFunc func(ctx context.Context, trusted *types.LightBlock, new *types.LightBlock, now time.Time) error @@ -561,7 +563,7 @@ func (c *Client) verifyLightBlock(ctx context.Context, newLightBlock *types.Ligh err = verifyFunc(ctx, closestBlock, newLightBlock, now) } if err != nil { - c.logger.Error("Can't verify", "err", err) + c.logger.Error("failed to verify", "err", err) return err } @@ -595,7 +597,7 @@ func (c *Client) verifySequential( } // 2) Verify them - c.logger.Debug("Verify adjacent newLightBlock against verifiedBlock", + c.logger.Debug("verify adjacent newLightBlock against verifiedBlock", "trustedHeight", verifiedBlock.Height, "trustedHash", verifiedBlock.Hash(), "newHeight", interimBlock.Height, @@ -610,32 +612,21 @@ func (c *Client) verifySequential( case ErrInvalidHeader: // If the target header is invalid, return immediately. if err.To == newLightBlock.Height { - c.logger.Debug("Target header is invalid", "err", err) + c.logger.Debug("target header is invalid", "err", err) return err } - // If some intermediate header is invalid, replace the primary and try - // again. - c.logger.Error("primary sent invalid header -> replacing", "err", err, "primary", c.primary) - replaceErr := c.replacePrimaryProvider() - if replaceErr != nil { - c.logger.Error("Can't replace primary", "err", replaceErr) - // return original error - return err - } + // If some intermediate header is invalid, remove the primary and try again. + c.logger.Error("primary sent invalid header -> removing", "err", err, "primary", c.primary) - replacementBlock, fErr := c.lightBlockFromPrimary(ctx, newLightBlock.Height) - if fErr != nil { - c.logger.Error("Can't fetch light block from primary", "err", fErr) - // return original error + replacementBlock, removeErr := c.findNewPrimary(ctx, newLightBlock.Height, true) + if removeErr != nil { + c.logger.Debug("failed to replace primary. Returning original error", "err", removeErr) return err } if !bytes.Equal(replacementBlock.Hash(), newLightBlock.Hash()) { - c.logger.Error("Replacement provider has a different light block", - "newHash", newLightBlock.Hash(), - "replHash", replacementBlock.Hash()) - // return original error + c.logger.Debug("replaced primary but new primary has a different block to the initial one") return err } @@ -686,7 +677,7 @@ func (c *Client) verifySkipping( ) for { - c.logger.Debug("Verify non-adjacent newHeader against verifiedBlock", + c.logger.Debug("verify non-adjacent newHeader against verifiedBlock", "trustedHeight", verifiedBlock.Height, "trustedHash", verifiedBlock.Hash(), "newHeight", blockCache[depth].Height, @@ -716,10 +707,20 @@ func (c *Client) verifySkipping( pivotHeight := verifiedBlock.Height + (blockCache[depth].Height-verifiedBlock. Height)*verifySkippingNumerator/verifySkippingDenominator interimBlock, providerErr := source.LightBlock(ctx, pivotHeight) - if providerErr != nil { + switch providerErr { + case nil: + blockCache = append(blockCache, interimBlock) + + // if the error is benign, the client does not need to replace the primary + case provider.ErrLightBlockNotFound, provider.ErrNoResponse: + return nil, err + + // all other errors such as ErrBadLightBlock or ErrUnreliableProvider are seen as malevolent and the + // provider is removed + default: return nil, ErrVerificationFailed{From: verifiedBlock.Height, To: pivotHeight, Reason: providerErr} } - blockCache = append(blockCache, interimBlock) + } depth++ @@ -744,32 +745,20 @@ func (c *Client) verifySkippingAgainstPrimary( // If the target header is invalid, return immediately. invalidHeaderHeight := err.(ErrVerificationFailed).To if invalidHeaderHeight == newLightBlock.Height { - c.logger.Debug("Target header is invalid", "err", err) + c.logger.Debug("target header is invalid", "err", err) return err } - // If some intermediate header is invalid, replace the primary and try - // again. + // If some intermediate header is invalid, remove the primary and try again. c.logger.Error("primary sent invalid header -> replacing", "err", err, "primary", c.primary) - replaceErr := c.replacePrimaryProvider() - if replaceErr != nil { - c.logger.Error("Can't replace primary", "err", replaceErr) - // return original error - return err - } - - replacementBlock, fErr := c.lightBlockFromPrimary(ctx, newLightBlock.Height) - if fErr != nil { - c.logger.Error("Can't fetch light block from primary", "err", fErr) - // return original error + replacementBlock, removeErr := c.findNewPrimary(ctx, newLightBlock.Height, true) + if removeErr != nil { + c.logger.Error("failed to replace primary. Returning original error", "err", removeErr) return err } if !bytes.Equal(replacementBlock.Hash(), newLightBlock.Hash()) { - c.logger.Error("Replacement provider has a different light block", - "newHash", newLightBlock.Hash(), - "replHash", replacementBlock.Hash()) - // return original error + c.logger.Debug("replaced primary but new primary has a different block to the initial one. Returning original error") return err } @@ -835,7 +824,7 @@ func (c *Client) Witnesses() []provider.Provider { // Cleanup removes all the data (headers and validator sets) stored. Note: the // client must be stopped at this point. func (c *Client) Cleanup() error { - c.logger.Info("Removing all light blocks") + c.logger.Info("removing all light blocks") c.latestTrustedBlock = nil return c.trustedStore.Prune(0) } @@ -908,21 +897,31 @@ func (c *Client) backwards( return fmt.Errorf("failed to obtain the header at height #%d: %w", verifiedHeader.Height-1, err) } interimHeader = interimBlock.Header - c.logger.Debug("Verify newHeader against verifiedHeader", + c.logger.Debug("verify newHeader against verifiedHeader", "trustedHeight", verifiedHeader.Height, "trustedHash", verifiedHeader.Hash(), "newHeight", interimHeader.Height, "newHash", interimHeader.Hash()) if err := VerifyBackwards(interimHeader, verifiedHeader); err != nil { - c.logger.Error("primary sent invalid header -> replacing", "err", err, "primary", c.primary) - if replaceErr := c.replacePrimaryProvider(); replaceErr != nil { - c.logger.Error("Can't replace primary", "err", replaceErr) - // return original error - return fmt.Errorf("verify backwards from %d to %d failed: %w", - verifiedHeader.Height, interimHeader.Height, err) + // verification has failed + c.logger.Error("backwards verification failed, replacing primary...", "err", err, "primary", c.primary) + + // the client tries to see if it can get a witness to continue with the request + newPrimarysBlock, replaceErr := c.findNewPrimary(ctx, newHeader.Height, true) + if replaceErr != nil { + c.logger.Debug("failed to replace primary. Returning original error", "err", replaceErr) + return err } - // we need to verify the header at the same height again - continue + + // before continuing we must check that they have the same target header to validate + if !bytes.Equal(newPrimarysBlock.Hash(), newHeader.Hash()) { + c.logger.Debug("replaced primary but new primary has a different block to the initial one") + // return the original error + return err + } + + // try again with the new primary + return c.backwards(ctx, verifiedHeader, newPrimarysBlock.Header) } verifiedHeader = interimHeader } @@ -930,55 +929,145 @@ func (c *Client) backwards( return nil } -// NOTE: requires a providerMutex locked. -func (c *Client) removeWitness(idx int) { - switch len(c.witnesses) { - case 0: - panic(fmt.Sprintf("wanted to remove %d element from empty witnesses slice", idx)) - case 1: - c.witnesses = make([]provider.Provider, 0) +// lightBlockFromPrimary retrieves the lightBlock from the primary provider +// at the specified height. This method also handles provider behavior as follows: +// +// 1. If the provider does not respond or does not have the block, it tries again +// with a different provider +// 2. If all providers return the same error, the light client forwards the error to +// where the initial request came from +// 3. If the provider provides an invalid light block, is deemed unreliable or returns +// any other error, the primary is permanently dropped and is replaced by a witness. +func (c *Client) lightBlockFromPrimary(ctx context.Context, height int64) (*types.LightBlock, error) { + c.providerMutex.Lock() + l, err := c.primary.LightBlock(ctx, height) + c.providerMutex.Unlock() + + switch err { + case nil: + // Everything went smoothly. We reset the lightBlockRequests and return the light block + return l, nil + + case provider.ErrNoResponse, provider.ErrLightBlockNotFound: + // we find a new witness to replace the primary + c.logger.Debug("error from light block request from primary, replacing...", "error", err, "primary", c.primary) + return c.findNewPrimary(ctx, height, false) + default: - c.witnesses[idx] = c.witnesses[len(c.witnesses)-1] + // The light client has most likely received either provider.ErrUnreliableProvider or provider.ErrBadLightBlock + // These errors mean that the light client should drop the primary and try with another provider instead + c.logger.Error("error from light block request from primary, removing...", "error", err, "primary", c.primary) + return c.findNewPrimary(ctx, height, true) + } +} + +// NOTE: requires a providerMutex lock +func (c *Client) removeWitnesses(indexes []int) error { + // check that we will still have witnesses remaining + if len(c.witnesses) <= len(indexes) { + return ErrNoWitnesses + } + + // we need to make sure that we remove witnesses by index in the reverse + // order so as to not affect the indexes themselves + sort.Ints(indexes) + for i := len(indexes) - 1; i >= 0; i-- { + c.witnesses[indexes[i]] = c.witnesses[len(c.witnesses)-1] c.witnesses = c.witnesses[:len(c.witnesses)-1] } + + return nil } -// replaceProvider takes the first alternative provider and promotes it as the -// primary provider. -func (c *Client) replacePrimaryProvider() error { +type witnessResponse struct { + lb *types.LightBlock + witnessIndex int + err error +} + +// findNewPrimary concurrently sends a light block request, promoting the first witness to return +// a valid light block as the new primary. The remove option indicates whether the primary should be +// entire removed or just appended to the back of the witnesses list. This method also handles witness +// errors. If no witness is available, it returns the last error of the witness. +func (c *Client) findNewPrimary(ctx context.Context, height int64, remove bool) (*types.LightBlock, error) { c.providerMutex.Lock() defer c.providerMutex.Unlock() if len(c.witnesses) <= 1 { - return ErrNoWitnesses + return nil, ErrNoWitnesses } - c.primary = c.witnesses[0] - c.witnesses = c.witnesses[1:] - c.logger.Info("Replacing primary with the first witness", "new_primary", c.primary) - return nil -} + var ( + witnessResponsesC = make(chan witnessResponse, len(c.witnesses)) + witnessesToRemove []int + lastError error + wg sync.WaitGroup + ) -// lightBlockFromPrimary retrieves the lightBlock from the primary provider -// at the specified height. Handles dropout by the primary provider by swapping -// with an alternative provider. -func (c *Client) lightBlockFromPrimary(ctx context.Context, height int64) (*types.LightBlock, error) { - c.providerMutex.Lock() - l, err := c.primary.LightBlock(ctx, height) - c.providerMutex.Unlock() - if err != nil { - c.logger.Debug("Error on light block request from primary", "error", err, "primary", c.primary) - replaceErr := c.replacePrimaryProvider() - if replaceErr != nil { - return nil, fmt.Errorf("%v. Tried to replace primary but: %w", err.Error(), replaceErr) + // send out a light block request to all witnesses + subctx, cancel := context.WithCancel(ctx) + defer cancel() + for index := range c.witnesses { + wg.Add(1) + go func(witnessIndex int, witnessResponsesC chan witnessResponse) { + defer wg.Done() + + lb, err := c.witnesses[witnessIndex].LightBlock(subctx, height) + witnessResponsesC <- witnessResponse{lb, witnessIndex, err} + }(index, witnessResponsesC) + } + + // process all the responses as they come in + for i := 0; i < cap(witnessResponsesC); i++ { + response := <-witnessResponsesC + switch response.err { + // success! We have found a new primary + case nil: + cancel() // cancel all remaining requests to other witnesses + + wg.Wait() // wait for all goroutines to finish + + // if we are not intending on removing the primary then append the old primary to the end of the witness slice + if !remove { + c.witnesses = append(c.witnesses, c.primary) + } + + // promote respondent as the new primary + c.logger.Debug("found new primary", "primary", c.witnesses[response.witnessIndex]) + c.primary = c.witnesses[response.witnessIndex] + + // add promoted witness to the list of witnesses to be removed + witnessesToRemove = append(witnessesToRemove, response.witnessIndex) + + // remove witnesses marked as bad (the client must do this before we alter the witness slice and change the indexes + // of witnesses). Removal is done in descending order + if err := c.removeWitnesses(witnessesToRemove); err != nil { + return nil, err + } + + // return the light block that new primary responded with + return response.lb, nil + + // process benign errors by logging them only + case provider.ErrNoResponse, provider.ErrLightBlockNotFound: + lastError = response.err + c.logger.Debug("error on light block request from witness", + "error", response.err, "primary", c.witnesses[response.witnessIndex]) + continue + + // process malevolent errors like ErrUnreliableProvider and ErrBadLightBlock by removing the witness + default: + lastError = response.err + c.logger.Error("error on light block request from witness, removing...", + "error", response.err, "primary", c.witnesses[response.witnessIndex]) + witnessesToRemove = append(witnessesToRemove, response.witnessIndex) } - // replace primary and request a light block again - return c.lightBlockFromPrimary(ctx, height) } - return l, err + + return nil, lastError } -// compareFirstHeaderWithWitnesses compares h with all witnesses. If any +// compareFirstHeaderWithWitnesses concurrently compares h with all witnesses. If any // witness reports a different header than h, the function returns an error. func (c *Client) compareFirstHeaderWithWitnesses(ctx context.Context, h *types.SignedHeader) error { compareCtx, cancel := context.WithCancel(ctx) @@ -1006,26 +1095,20 @@ func (c *Client) compareFirstHeaderWithWitnesses(ctx context.Context, h *types.S case nil: continue case errConflictingHeaders: - c.logger.Error(fmt.Sprintf(`Witness #%d has a different header. Please check primary is correct -and remove witness. Otherwise, use the different primary`, e.WitnessIndex), "witness", c.witnesses[e.WitnessIndex]) + c.logger.Error(fmt.Sprintf(`witness #%d has a different header. Please check primary is correct +and remove witness. Otherwise, use a different primary`, e.WitnessIndex), "witness", c.witnesses[e.WitnessIndex]) return err case errBadWitness: // If witness sent us an invalid header, then remove it. If it didn't // respond or couldn't find the block, then we ignore it and move on to // the next witness. if _, ok := e.Reason.(provider.ErrBadLightBlock); ok { - c.logger.Info("Witness sent us invalid header / vals -> removing it", "witness", c.witnesses[e.WitnessIndex]) + c.logger.Info("Witness sent an invalid light block, removing...", "witness", c.witnesses[e.WitnessIndex]) witnessesToRemove = append(witnessesToRemove, e.WitnessIndex) } } } - // we need to make sure that we remove witnesses by index in the reverse - // order so as to not affect the indexes themselves - sort.Ints(witnessesToRemove) - for i := len(witnessesToRemove) - 1; i >= 0; i-- { - c.removeWitness(witnessesToRemove[i]) - } - - return nil + // remove all witnesses that misbehaved + return c.removeWitnesses(witnessesToRemove) } diff --git a/light/client_test.go b/light/client_test.go index 2489b23c8..d9bc9410f 100644 --- a/light/client_test.go +++ b/light/client_test.go @@ -740,7 +740,7 @@ func TestClient_Concurrency(t *testing.T) { defer wg.Done() // NOTE: Cleanup, Stop, VerifyLightBlockAtHeight and Verify are not supposed - // to be concurrenly safe. + // to be concurrently safe. assert.Equal(t, chainID, c.ChainID()) @@ -774,8 +774,12 @@ func TestClientReplacesPrimaryWithWitnessIfPrimaryIsUnavailable(t *testing.T) { _, err = c.Update(ctx, bTime.Add(2*time.Hour)) require.NoError(t, err) + // the primary should no longer be the deadNode assert.NotEqual(t, c.Primary(), deadNode) - assert.Equal(t, 1, len(c.Witnesses())) + + // we should still have the dead node as a witness because it + // hasn't repeatedly been unresponsive yet + assert.Equal(t, 2, len(c.Witnesses())) } func TestClient_BackwardsVerification(t *testing.T) { diff --git a/light/detector.go b/light/detector.go index a9510b9e3..50b9f671b 100644 --- a/light/detector.go +++ b/light/detector.go @@ -5,7 +5,6 @@ import ( "context" "errors" "fmt" - "sort" "time" "github.com/tendermint/tendermint/light/provider" @@ -35,7 +34,7 @@ func (c *Client) detectDivergence(ctx context.Context, primaryTrace []*types.Lig lastVerifiedHeader = primaryTrace[len(primaryTrace)-1].SignedHeader witnessesToRemove = make([]int, 0) ) - c.logger.Debug("Running detector against trace", "endBlockHeight", lastVerifiedHeader.Height, + c.logger.Debug("running detector against trace", "endBlockHeight", lastVerifiedHeader.Height, "endBlockHash", lastVerifiedHeader.Hash, "length", len(primaryTrace)) c.providerMutex.Lock() @@ -75,7 +74,7 @@ func (c *Client) detectDivergence(ctx context.Context, primaryTrace []*types.Lig now, ) if err != nil { - c.logger.Info("Error validating witness's divergent header", "witness", supportingWitness, "err", err) + c.logger.Info("error validating witness's divergent header", "witness", supportingWitness, "err", err) witnessesToRemove = append(witnessesToRemove, e.WitnessIndex) continue } @@ -83,7 +82,7 @@ func (c *Client) detectDivergence(ctx context.Context, primaryTrace []*types.Lig // We are suspecting that the primary is faulty, hence we hold the witness as the source of truth // and generate evidence against the primary that we can send to the witness primaryEv := newLightClientAttackEvidence(primaryBlock, witnessTrace[len(witnessTrace)-1], witnessTrace[0]) - c.logger.Error("Attempted attack detected. Sending evidence againt primary by witness", "ev", primaryEv, + c.logger.Error("ATTEMPTED ATTACK DETECTED. Sending evidence againt primary by witness", "ev", primaryEv, "primary", c.primary, "witness", supportingWitness) c.sendEvidence(ctx, primaryEv, supportingWitness) @@ -117,22 +116,17 @@ func (c *Client) detectDivergence(ctx context.Context, primaryTrace []*types.Lig return ErrLightClientAttack case errBadWitness: - c.logger.Info("Witness returned an error during header comparison", "witness", c.witnesses[e.WitnessIndex], - "err", err) - // if witness sent us an invalid header, then remove it. If it didn't respond or couldn't find the block, then we - // ignore it and move on to the next witness - if _, ok := e.Reason.(provider.ErrBadLightBlock); ok { - c.logger.Info("Witness sent us invalid header / vals -> removing it", "witness", c.witnesses[e.WitnessIndex]) - witnessesToRemove = append(witnessesToRemove, e.WitnessIndex) - } + c.logger.Info("witness returned an error during header comparison, removing...", + "witness", c.witnesses[e.WitnessIndex], "err", err) + witnessesToRemove = append(witnessesToRemove, e.WitnessIndex) + default: + c.logger.Debug("error in light block request to witness", "err", err) } } - // we need to make sure that we remove witnesses by index in the reverse - // order so as to not affect the indexes themselves - sort.Ints(witnessesToRemove) - for i := len(witnessesToRemove) - 1; i >= 0; i-- { - c.removeWitness(witnessesToRemove[i]) + // remove witnesses that have misbehaved + if err := c.removeWitnesses(witnessesToRemove); err != nil { + return err } // 1. If we had at least one witness that returned the same header then we @@ -156,7 +150,17 @@ func (c *Client) compareNewHeaderWithWitness(ctx context.Context, errc chan erro witness provider.Provider, witnessIndex int) { lightBlock, err := witness.LightBlock(ctx, h.Height) - if err != nil { + switch err { + case nil: + break + + case provider.ErrNoResponse, provider.ErrLightBlockNotFound: + errc <- err + return + + default: + // all other errors (i.e. invalid block or unreliable provider) we mark the witness as bad + // and remove it errc <- errBadWitness{Reason: err, WitnessIndex: witnessIndex} return } @@ -165,7 +169,7 @@ func (c *Client) compareNewHeaderWithWitness(ctx context.Context, errc chan erro errc <- errConflictingHeaders{Block: lightBlock, WitnessIndex: witnessIndex} } - c.logger.Debug("Matching header received by witness", "height", h.Height, "witness", witnessIndex) + c.logger.Debug("matching header received by witness", "height", h.Height, "witness", witnessIndex) errc <- nil } @@ -173,7 +177,7 @@ func (c *Client) compareNewHeaderWithWitness(ctx context.Context, errc chan erro func (c *Client) sendEvidence(ctx context.Context, ev *types.LightClientAttackEvidence, receiver provider.Provider) { err := receiver.ReportEvidence(ctx, ev) if err != nil { - c.logger.Error("Failed to report evidence to provider", "ev", ev, "provider", receiver) + c.logger.Error("failed to report evidence to provider", "ev", ev, "provider", receiver) } } diff --git a/light/detector_test.go b/light/detector_test.go index d81467f9b..02e7bb249 100644 --- a/light/detector_test.go +++ b/light/detector_test.go @@ -253,5 +253,5 @@ func TestClientDivergentTraces3(t *testing.T) { _, err = c.VerifyLightBlockAtHeight(ctx, 10, bTime.Add(1*time.Hour)) assert.Error(t, err) - assert.Equal(t, 0, len(c.Witnesses())) + assert.Equal(t, 1, len(c.Witnesses())) } diff --git a/light/provider/errors.go b/light/provider/errors.go index 5d24efd73..40e0c6fc8 100644 --- a/light/provider/errors.go +++ b/light/provider/errors.go @@ -7,15 +7,15 @@ import ( var ( // ErrLightBlockNotFound is returned when a provider can't find the - // requested header. + // requested header. The light client will not remove the provider ErrLightBlockNotFound = errors.New("light block not found") // ErrNoResponse is returned if the provider doesn't respond to the - // request in a gieven time + // request in a given time. The light client will not remove the provider ErrNoResponse = errors.New("client failed to respond") ) // ErrBadLightBlock is returned when a provider returns an invalid -// light block. +// light block. The light client will remove the provider. type ErrBadLightBlock struct { Reason error } @@ -23,3 +23,14 @@ type ErrBadLightBlock struct { func (e ErrBadLightBlock) Error() string { return fmt.Sprintf("client provided bad signed header: %s", e.Reason.Error()) } + +// ErrUnreliableProvider is a generic error that indicates that the provider isn't +// behaving in a reliable manner to the light client. The light client will +// remove the provider +type ErrUnreliableProvider struct { + Reason string +} + +func (e ErrUnreliableProvider) Error() string { + return fmt.Sprintf("client deemed unreliable: %s", e.Reason) +} diff --git a/light/provider/http/http.go b/light/provider/http/http.go index f96a0edb5..125589b6b 100644 --- a/light/provider/http/http.go +++ b/light/provider/http/http.go @@ -18,8 +18,10 @@ import ( ) var defaultOptions = Options{ - MaxRetryAttempts: 5, - Timeout: 3 * time.Second, + MaxRetryAttempts: 5, + Timeout: 3 * time.Second, + NoBlockThreshold: 5, + NoResponseThreshold: 5, } // http provider uses an RPC client to obtain the necessary information. @@ -27,14 +29,37 @@ type http struct { chainID string client rpcclient.RemoteClient - maxRetryAttempts int + // httt provider heuristics + + // The provider tracks the amount of times that the + // client doesn't respond. If this exceeds the threshold + // then the provider will return an unreliable provider error + noResponseThreshold uint16 + noResponseCount uint16 + + // The provider tracks the amount of time the client + // doesn't have a block. If this exceeds the threshold + // then the provider will return an unreliable provider error + noBlockThreshold uint16 + noBlockCount uint16 + + // In a single request, the provider attempts multiple times + // with exponential backoff to reach the client. If this + // exceeds the maxRetry attempts, this result in a ErrNoResponse + maxRetryAttempts uint16 } type Options struct { - // -1 means no limit - MaxRetryAttempts int + // 0 means no retries + MaxRetryAttempts uint16 // 0 means no timeout. Timeout time.Duration + // The amount of requests that a client doesn't have the block + // for before the provider deems the client unreliable + NoBlockThreshold uint16 + // The amount of requests that a client doesn't respond to + // before the provider deems the client unreliable + NoResponseThreshold uint16 } // New creates a HTTP provider, which is using the rpchttp.HTTP client under @@ -67,9 +92,11 @@ func NewWithClient(chainID string, client rpcclient.RemoteClient) provider.Provi // NewWithClient allows you to provide a custom client. func NewWithClientAndOptions(chainID string, client rpcclient.RemoteClient, options Options) provider.Provider { return &http{ - client: client, - chainID: chainID, - maxRetryAttempts: options.MaxRetryAttempts, + client: client, + chainID: chainID, + maxRetryAttempts: options.MaxRetryAttempts, + noResponseThreshold: options.NoResponseThreshold, + noBlockThreshold: options.NoBlockThreshold, } } @@ -130,7 +157,8 @@ func (p *http) validatorSet(ctx context.Context, height *int64) (*types.Validato for len(vals) != total && page <= maxPages { // create another for loop to control retries. If p.maxRetryAttempts // is negative we will keep repeating. - for attempt := 0; attempt != p.maxRetryAttempts+1; attempt++ { + attempt := uint16(0) + for { res, err := p.client.Validators(ctx, height, &page, &perPage) switch e := err.(type) { case nil: // success!! Now we validate the response @@ -149,8 +177,13 @@ func (p *http) validatorSet(ctx context.Context, height *int64) (*types.Validato case *url.Error: if e.Timeout() { + // if we have exceeded retry attempts then return a no response error + if attempt == p.maxRetryAttempts { + return nil, p.noResponse() + } + attempt++ // request timed out: we wait and try again with exponential backoff - time.Sleep(backoffTimeout(uint16(attempt))) + time.Sleep(backoffTimeout(attempt)) continue } return nil, provider.ErrBadLightBlock{Reason: e} @@ -188,7 +221,7 @@ func (p *http) validatorSet(ctx context.Context, height *int64) (*types.Validato func (p *http) signedHeader(ctx context.Context, height *int64) (*types.SignedHeader, error) { // create a for loop to control retries. If p.maxRetryAttempts // is negative we will keep repeating. - for attempt := 0; attempt != p.maxRetryAttempts+1; attempt++ { + for attempt := uint16(0); attempt != p.maxRetryAttempts+1; attempt++ { commit, err := p.client.Commit(ctx, height) switch e := err.(type) { case nil: // success!! @@ -197,32 +230,48 @@ func (p *http) signedHeader(ctx context.Context, height *int64) (*types.SignedHe case *url.Error: if e.Timeout() { // we wait and try again with exponential backoff - time.Sleep(backoffTimeout(uint16(attempt))) + time.Sleep(backoffTimeout(attempt)) continue } return nil, provider.ErrBadLightBlock{Reason: e} case *rpctypes.RPCError: - // Check if we got something other than internal error. This shouldn't happen unless the RPC module - // or light client has been tampered with. If we do get this error, stop the connection with the - // peer and return an error - if e.Code != -32603 { - return nil, provider.ErrBadLightBlock{Reason: errors.New(e.Data)} - } - // check if the error indicates that the peer doesn't have the block - if strings.Contains(err.Error(), ctypes.ErrHeightNotAvailable.Error()) || - strings.Contains(err.Error(), ctypes.ErrHeightExceedsChainHead.Error()) { - return nil, provider.ErrLightBlockNotFound + if strings.Contains(e.Data, ctypes.ErrHeightNotAvailable.Error()) || + strings.Contains(e.Data, ctypes.ErrHeightExceedsChainHead.Error()) { + return nil, p.noBlock() } + // for every other error, the provider returns a bad block + return nil, provider.ErrBadLightBlock{Reason: errors.New(e.Data)} + default: // If we don't know the error then by default we return a bad light block error and // terminate the connection with the peer. return nil, provider.ErrBadLightBlock{Reason: e} } } - return nil, provider.ErrNoResponse + return nil, p.noResponse() +} + +func (p *http) noResponse() error { + p.noResponseCount++ + if p.noResponseCount > p.noResponseThreshold { + return provider.ErrUnreliableProvider{ + Reason: fmt.Sprintf("failed to respond after %d attempts", p.noResponseCount), + } + } + return provider.ErrNoResponse +} + +func (p *http) noBlock() error { + p.noBlockCount++ + if p.noBlockCount > p.noBlockThreshold { + return provider.ErrUnreliableProvider{ + Reason: fmt.Sprintf("failed to provide a block after %d attempts", p.noBlockCount), + } + } + return provider.ErrLightBlockNotFound } func validateHeight(height int64) (*int64, error) { diff --git a/light/provider/http/http_test.go b/light/provider/http/http_test.go index 8716db327..240c97973 100644 --- a/light/provider/http/http_test.go +++ b/light/provider/http/http_test.go @@ -66,25 +66,33 @@ func TestProvider(t *testing.T) { require.NoError(t, err) // let's get the highest block - sh, err := p.LightBlock(context.Background(), 0) + lb, err := p.LightBlock(context.Background(), 0) require.NoError(t, err) - assert.True(t, sh.Height < 1000) + assert.True(t, lb.Height < 1000) // let's check this is valid somehow - assert.Nil(t, sh.ValidateBasic(chainID)) + assert.Nil(t, lb.ValidateBasic(chainID)) // historical queries now work :) - lower := sh.Height - 3 - sh, err = p.LightBlock(context.Background(), lower) + lower := lb.Height - 3 + lb, err = p.LightBlock(context.Background(), lower) require.NoError(t, err) - assert.Equal(t, lower, sh.Height) + assert.Equal(t, lower, lb.Height) // // fetching missing heights (both future and pruned) should return appropriate errors - _, err = p.LightBlock(context.Background(), 1000) + lb, err = p.LightBlock(context.Background(), 1000) require.Error(t, err) + require.Nil(t, lb) assert.Equal(t, provider.ErrLightBlockNotFound, err) _, err = p.LightBlock(context.Background(), 1) require.Error(t, err) assert.Equal(t, provider.ErrLightBlockNotFound, err) + + // if the provider is unable to provide four more blocks then we should return + // an unreliable peer error + for i := 0; i < 4; i++ { + _, err = p.LightBlock(context.Background(), 1) + } + assert.IsType(t, provider.ErrUnreliableProvider{}, err) } diff --git a/light/provider/mock/deadmock.go b/light/provider/mock/deadmock.go index 76ea9e9d0..6045e45f6 100644 --- a/light/provider/mock/deadmock.go +++ b/light/provider/mock/deadmock.go @@ -2,15 +2,12 @@ package mock import ( "context" - "errors" "fmt" "github.com/tendermint/tendermint/light/provider" "github.com/tendermint/tendermint/types" ) -var errNoResp = errors.New("no response from provider") - type deadMock struct { id string } @@ -25,9 +22,9 @@ func (p *deadMock) String() string { } func (p *deadMock) LightBlock(_ context.Context, height int64) (*types.LightBlock, error) { - return nil, errNoResp + return nil, provider.ErrNoResponse } func (p *deadMock) ReportEvidence(_ context.Context, ev types.Evidence) error { - return errNoResp + return provider.ErrNoResponse } diff --git a/light/verifier.go b/light/verifier.go index a8a7ee56c..165a9a641 100644 --- a/light/verifier.go +++ b/light/verifier.go @@ -130,7 +130,7 @@ func VerifyAdjacent( // Check the validator hashes are the same if !bytes.Equal(untrustedHeader.ValidatorsHash, trustedHeader.NextValidatorsHash) { - err := fmt.Errorf("expected old header next validators (%X) to match those from new header (%X)", + err := fmt.Errorf("expected old header's next validators (%X) to match those from new header (%X)", trustedHeader.NextValidatorsHash, untrustedHeader.ValidatorsHash, )