test: fix various E2E test issues (#5576)

* Don't use state sync for nodes starting at initial height. * Also remove stopped containers when cleaning up. * Start nodes in order of startAt, mode, name to avoid full nodes starting before their seeds. * Tweak network waiting to avoid halts caused by validator changes and perturbations. * Disable most tests for seed nodes, which aren't always able to join consensus. * Disable `blockchain/v2` due to known bugs.
4 years ago · 53022220f6
--- a/test/e2e/generator/generate.go
+++ b/test/e2e/generator/generate.go
@ -29,7 +29,10 @@ var (
 	nodeABCIProtocols    = uniformChoice{"unix", "tcp", "grpc", "builtin"}
 	nodePrivvalProtocols = uniformChoice{"file", "unix", "tcp"}
 	// FIXME v1 disabled due to https://github.com/tendermint/tendermint/issues/5444
 	nodeFastSyncs         = uniformChoice{"", "v0", "v2"} // "v1",
 	// FIXME v2 disabled due to:
 	// https://github.com/tendermint/tendermint/issues/5513
 	// https://github.com/tendermint/tendermint/issues/5541
 	nodeFastSyncs         = uniformChoice{"", "v0"} // "v1", "v2"
 	nodeStateSyncs        = uniformChoice{false, true}
 	nodePersistIntervals  = uniformChoice{0, 1, 5}
 	nodeSnapshotIntervals = uniformChoice{0, 3}
@ -90,7 +93,8 @@ func generateTestnet(r *rand.Rand, opt map[string]interface{}) (e2e.Manifest, er

 	// First we generate seed nodes, starting at the initial height.
 	for i := 1; i <= numSeeds; i++ {
 		manifest.Nodes[fmt.Sprintf("seed%02d", i)] = generateNode(r, e2e.ModeSeed, 0, false)
 		manifest.Nodes[fmt.Sprintf("seed%02d", i)] = generateNode(
 			r, e2e.ModeSeed, 0, manifest.InitialHeight, false)
 	}

 	// Next, we generate validators. We make sure a BFT quorum of validators start
@ -99,15 +103,16 @@ func generateTestnet(r *rand.Rand, opt map[string]interface{}) (e2e.Manifest, er
 	nextStartAt := manifest.InitialHeight + 5
 	quorum := numValidators*2/3 + 1
 	for i := 1; i <= numValidators; i++ {
 		startAt := manifest.InitialHeight
 		startAt := int64(0)
 		if i > quorum {
 			startAt = nextStartAt
 			nextStartAt += 5
 		}
 		name := fmt.Sprintf("validator%02d", i)
 		manifest.Nodes[name] = generateNode(r, e2e.ModeValidator, startAt, i <= 2)
 		manifest.Nodes[name] = generateNode(
 			r, e2e.ModeValidator, startAt, manifest.InitialHeight, i <= 2)

 		if startAt == manifest.InitialHeight {
 		if startAt == 0 {
 			(*manifest.Validators)[name] = int64(30 + r.Intn(71))
 		} else {
 			manifest.ValidatorUpdates[fmt.Sprint(startAt+5)] = map[string]int64{
@ -133,7 +138,8 @@ func generateTestnet(r *rand.Rand, opt map[string]interface{}) (e2e.Manifest, er
 			startAt = nextStartAt
 			nextStartAt += 5
 		}
 		manifest.Nodes[fmt.Sprintf("full%02d", i)] = generateNode(r, e2e.ModeFull, startAt, false)
 		manifest.Nodes[fmt.Sprintf("full%02d", i)] = generateNode(
 			r, e2e.ModeFull, startAt, manifest.InitialHeight, false)
 	}

 	// We now set up peer discovery for nodes. Seed nodes are fully meshed with
@ -183,7 +189,8 @@ func generateTestnet(r *rand.Rand, opt map[string]interface{}) (e2e.Manifest, er
 // here, since we need to know the overall network topology and startup
 // sequencing.
 func generateNode(
 	r *rand.Rand, mode e2e.Mode, startAt int64, forceArchive bool) *e2e.ManifestNode {
 	r *rand.Rand, mode e2e.Mode, startAt int64, initialHeight int64, forceArchive bool,
 ) *e2e.ManifestNode {
 	node := e2e.ManifestNode{
 		Mode:             string(mode),
 		StartAt:          startAt,
@ -206,8 +213,11 @@ func generateNode(
 	}

 	if node.Mode == "validator" {
 		node.Misbehaviors = nodeMisbehaviors.Choose(r).(misbehaviorOption).
 			atHeight(startAt + 5 + int64(r.Intn(10)))
 		misbehaveAt := startAt + 5 + int64(r.Intn(10))
 		if startAt == 0 {
 			misbehaveAt += initialHeight - 1
 		}
 		node.Misbehaviors = nodeMisbehaviors.Choose(r).(misbehaviorOption).atHeight(misbehaveAt)
 		if len(node.Misbehaviors) != 0 {
 			node.PrivvalProtocol = "file"
 		}
--- a/test/e2e/networks/ci.toml
+++ b/test/e2e/networks/ci.toml
@ -1,4 +1,4 @@
 # This testnet is (will be) run by CI, and attempts to cover a broad range of
 # This testnet is run by CI, and attempts to cover a broad range of
 # functionality with a single network.

 initial_height = 1000
@ -79,14 +79,17 @@ start_at = 1010
 mode = "full"
 # FIXME Should use v1, but it won't catch up since some nodes don't have all blocks
 # https://github.com/tendermint/tendermint/issues/5444
 fast_sync = "v2"
 fast_sync = "v0"
 persistent_peers = ["validator01", "validator02", "validator03", "validator04", "validator05"]
 perturb = ["restart"]

 [node.full02]
 start_at = 1015
 mode = "full"
 fast_sync = "v2"
 # FIXME Should use v2, but it has concurrency bugs causing panics or halts
 # https://github.com/tendermint/tendermint/issues/5513
 # https://github.com/tendermint/tendermint/issues/5541
 fast_sync = "v0"
 state_sync = true
 seeds = ["seed01"]
 perturb = ["restart"]
--- a/test/e2e/pkg/testnet.go
+++ b/test/e2e/pkg/testnet.go
@ -403,6 +403,16 @@ func (t Testnet) IPv6() bool {
 	return t.IP.IP.To4() == nil
 }

 // HasPerturbations returns whether the network has any perturbations.
 func (t Testnet) HasPerturbations() bool {
 	for _, node := range t.Nodes {
 		if len(node.Perturbations) > 0 {
 			return true
 		}
 	}
 	return false
 }

 // LastMisbehaviorHeight returns the height of the last misbehavior.
 func (t Testnet) LastMisbehaviorHeight() int64 {
 	lastHeight := int64(0)
--- a/test/e2e/runner/cleanup.go
+++ b/test/e2e/runner/cleanup.go
@ -32,7 +32,7 @@ func cleanupDocker() error {
 	xargsR := `$(if [[ $OSTYPE == "linux-gnu"* ]]; then echo -n "-r"; fi)`

 	err := exec("bash", "-c", fmt.Sprintf(
 		"docker container ls -q --filter label=e2e | xargs %v docker container rm -f", xargsR))
 		"docker container ls -qa --filter label=e2e | xargs %v docker container rm -f", xargsR))
 	if err != nil {
 		return err
 	}
--- a/test/e2e/runner/main.go
+++ b/test/e2e/runner/main.go
@ -69,25 +69,33 @@ func NewCLI() *CLI {
 			if err := Start(cli.testnet); err != nil {
 				return err
 			}

 			if lastMisbehavior := cli.testnet.LastMisbehaviorHeight(); lastMisbehavior > 0 {
 				// wait for misbehaviors before starting perturbations
 				if err := WaitUntil(cli.testnet, lastMisbehavior+5); err != nil {
 				// wait for misbehaviors before starting perturbations. We do a separate
 				// wait for another 5 blocks, since the last misbehavior height may be
 				// in the past depending on network startup ordering.
 				if err := WaitUntil(cli.testnet, lastMisbehavior); err != nil {
 					return err
 				}
 			}
 			if err := Perturb(cli.testnet); err != nil {
 				return err
 			}
 			if err := Wait(cli.testnet, 5); err != nil { // allow some txs to go through
 				return err
 			}

 			if cli.testnet.HasPerturbations() {
 				if err := Perturb(cli.testnet); err != nil {
 					return err
 				}
 				if err := Wait(cli.testnet, 5); err != nil { // allow some txs to go through
 					return err
 				}
 			}

 			loadCancel()
 			if err := <-chLoadResult; err != nil {
 				return err
 			}
 			// wait for network to settle before tests
 			if err := Wait(cli.testnet, 5); err != nil {
 			if err := Wait(cli.testnet, 5); err != nil { // wait for network to settle before tests
 				return err
 			}
 			if err := Test(cli.testnet); err != nil {
--- a/test/e2e/runner/start.go
+++ b/test/e2e/runner/start.go
@ -10,8 +10,21 @@ import (

 func Start(testnet *e2e.Testnet) error {

 	// Sort nodes by starting order
 	// Nodes are already sorted by name. Stable-sort them by mode (seeds, then
 	// validators, then full nodes) and then by startAt, which gives the overall
 	// order startAt, mode, name.
 	nodeQueue := testnet.Nodes
 	sort.SliceStable(nodeQueue, func(i, j int) bool {
 		a, b := nodeQueue[i], nodeQueue[j]
 		switch {
 		case a.Mode == b.Mode:
 			return false
 		case a.Mode == e2e.ModeSeed:
 			return true
 		case a.Mode == e2e.ModeValidator && b.Mode == e2e.ModeFull:
 			return true
 		}
 		return false
 	})
 	sort.SliceStable(nodeQueue, func(i, j int) bool {
 		return nodeQueue[i].StartAt < nodeQueue[j].StartAt
 	})
--- a/test/e2e/tests/app_test.go
+++ b/test/e2e/tests/app_test.go
@ -16,6 +16,9 @@ import (
 // Tests that any initial state given in genesis has made it into the app.
 func TestApp_InitialState(t *testing.T) {
 	testNode(t, func(t *testing.T, node e2e.Node) {
 		if node.Mode == e2e.ModeSeed {
 			return
 		}
 		if len(node.Testnet.InitialState) == 0 {
 			return
 		}
@ -35,6 +38,10 @@ func TestApp_InitialState(t *testing.T) {
 // block and the node sync status.
 func TestApp_Hash(t *testing.T) {
 	testNode(t, func(t *testing.T, node e2e.Node) {
 		if node.Mode == e2e.ModeSeed {
 			return
 		}

 		client, err := node.Client()
 		require.NoError(t, err)
 		info, err := client.ABCIInfo(ctx)
@ -56,6 +63,10 @@ func TestApp_Hash(t *testing.T) {
 // Tests that we can set a value and retrieve it.
 func TestApp_Tx(t *testing.T) {
 	testNode(t, func(t *testing.T, node e2e.Node) {
 		if node.Mode == e2e.ModeSeed {
 			return
 		}

 		client, err := node.Client()
 		require.NoError(t, err)

--- a/test/e2e/tests/block_test.go
+++ b/test/e2e/tests/block_test.go
@ -13,6 +13,10 @@ import (
 func TestBlock_Header(t *testing.T) {
 	blocks := fetchBlockChain(t)
 	testNode(t, func(t *testing.T, node e2e.Node) {
 		if node.Mode == e2e.ModeSeed {
 			return
 		}

 		client, err := node.Client()
 		require.NoError(t, err)
 		status, err := client.Status(ctx)
@ -42,6 +46,10 @@ func TestBlock_Header(t *testing.T) {
 // Tests that the node contains the expected block range.
 func TestBlock_Range(t *testing.T) {
 	testNode(t, func(t *testing.T, node e2e.Node) {
 		if node.Mode == e2e.ModeSeed {
 			return
 		}

 		client, err := node.Client()
 		require.NoError(t, err)
 		status, err := client.Status(ctx)
--- a/test/e2e/tests/validator_test.go
+++ b/test/e2e/tests/validator_test.go
@ -14,6 +14,10 @@ import (
 // scheduled validator updates.
 func TestValidator_Sets(t *testing.T) {
 	testNode(t, func(t *testing.T, node e2e.Node) {
 		if node.Mode == e2e.ModeSeed {
 			return
 		}

 		client, err := node.Client()
 		require.NoError(t, err)
 		status, err := client.Status(ctx)