diff --git a/.gitignore b/.gitignore index db0e1f77a..3aa5112d8 100644 --- a/.gitignore +++ b/.gitignore @@ -12,6 +12,7 @@ remote_dump vendor .vagrant test/e2e/build +test/maverick/maverick test/e2e/networks/*/ test/p2p/data/ test/logs diff --git a/test/e2e/Makefile b/test/e2e/Makefile index 602de7547..c9eb8bc19 100644 --- a/test/e2e/Makefile +++ b/test/e2e/Makefile @@ -8,6 +8,11 @@ docker: # ABCI testing). app: go build -o build/app -tags badgerdb,boltdb,cleveldb,rocksdb ./app + +# To be used primarily by the e2e docker instance. If you want to produce this binary +# elsewhere, then run go build in the maverick directory. +maverick: + go build -o build/maverick -tags badgerdb,boltdb,cleveldb,rocksdb ../maverick generator: go build -o build/generator ./generator @@ -15,4 +20,4 @@ generator: runner: go build -o build/runner ./runner -.PHONY: all app docker generator runner +.PHONY: all app docker generator maverick runner diff --git a/test/e2e/app/config.go b/test/e2e/app/config.go index 20df6ce90..281419160 100644 --- a/test/e2e/app/config.go +++ b/test/e2e/app/config.go @@ -21,6 +21,7 @@ type Config struct { PrivValServer string `toml:"privval_server"` PrivValKey string `toml:"privval_key"` PrivValState string `toml:"privval_state"` + Misbehaviors map[string]string `toml:"misbehaviors"` } // LoadConfig loads the configuration from disk. diff --git a/test/e2e/app/main.go b/test/e2e/app/main.go index 8a5ed95a3..d37b09be4 100644 --- a/test/e2e/app/main.go +++ b/test/e2e/app/main.go @@ -5,9 +5,11 @@ import ( "fmt" "os" "path/filepath" + "strconv" "time" "github.com/spf13/viper" + "github.com/tendermint/tendermint/abci/server" "github.com/tendermint/tendermint/config" tmflags "github.com/tendermint/tendermint/libs/cli/flags" @@ -17,6 +19,8 @@ import ( "github.com/tendermint/tendermint/p2p" "github.com/tendermint/tendermint/privval" "github.com/tendermint/tendermint/proxy" + mcs "github.com/tendermint/tendermint/test/maverick/consensus" + maverick "github.com/tendermint/tendermint/test/maverick/node" ) var logger = log.NewTMLogger(log.NewSyncWriter(os.Stdout)) @@ -60,7 +64,11 @@ func run(configFile string) error { case "socket", "grpc": err = startApp(cfg) case "builtin": - err = startNode(cfg) + if len(cfg.Misbehaviors) == 0 { + err = startNode(cfg) + } else { + err = startMaverick(cfg) + } default: err = fmt.Errorf("invalid protocol %q", cfg.Protocol) } @@ -102,51 +110,59 @@ func startNode(cfg *Config) error { return err } - home := os.Getenv("TMHOME") - if home == "" { - return errors.New("TMHOME not set") - } - viper.AddConfigPath(filepath.Join(home, "config")) - viper.SetConfigName("config") - err = viper.ReadInConfig() + tmcfg, nodeLogger, nodeKey, err := setupNode() if err != nil { - return err + return fmt.Errorf("failed to setup config: %w", err) } - tmcfg := config.DefaultConfig() - err = viper.Unmarshal(tmcfg) + + n, err := node.NewNode(tmcfg, + privval.LoadOrGenFilePV(tmcfg.PrivValidatorKeyFile(), tmcfg.PrivValidatorStateFile()), + nodeKey, + proxy.NewLocalClientCreator(app), + node.DefaultGenesisDocProviderFunc(tmcfg), + node.DefaultDBProvider, + node.DefaultMetricsProvider(tmcfg.Instrumentation), + nodeLogger, + ) if err != nil { return err } - tmcfg.SetRoot(home) - if err = tmcfg.ValidateBasic(); err != nil { - return fmt.Errorf("error in config file: %v", err) - } - if tmcfg.LogFormat == config.LogFormatJSON { - logger = log.NewTMJSONLogger(log.NewSyncWriter(os.Stdout)) - } - logger, err = tmflags.ParseLogLevel(tmcfg.LogLevel, logger, config.DefaultLogLevel()) + return n.Start() 
+} + +// startMaverick starts a Maverick node that runs the application directly. It assumes the Tendermint +// configuration is in $TMHOME/config/tendermint.toml. +func startMaverick(cfg *Config) error { + app, err := NewApplication(cfg) if err != nil { return err } - logger = logger.With("module", "main") - nodeKey, err := p2p.LoadOrGenNodeKey(tmcfg.NodeKeyFile()) + tmcfg, logger, nodeKey, err := setupNode() if err != nil { - return fmt.Errorf("failed to load or gen node key %s: %w", tmcfg.NodeKeyFile(), err) + return fmt.Errorf("failed to setup config: %w", err) } - n, err := node.NewNode(tmcfg, - privval.LoadOrGenFilePV(tmcfg.PrivValidatorKeyFile(), tmcfg.PrivValidatorStateFile()), + misbehaviors := make(map[int64]mcs.Misbehavior, len(cfg.Misbehaviors)) + for heightString, misbehaviorString := range cfg.Misbehaviors { + height, _ := strconv.ParseInt(heightString, 10, 64) + misbehaviors[height] = mcs.MisbehaviorList[misbehaviorString] + } + + n, err := maverick.NewNode(tmcfg, + maverick.LoadOrGenFilePV(tmcfg.PrivValidatorKeyFile(), tmcfg.PrivValidatorStateFile()), nodeKey, proxy.NewLocalClientCreator(app), - node.DefaultGenesisDocProviderFunc(tmcfg), - node.DefaultDBProvider, - node.DefaultMetricsProvider(tmcfg.Instrumentation), + maverick.DefaultGenesisDocProviderFunc(tmcfg), + maverick.DefaultDBProvider, + maverick.DefaultMetricsProvider(tmcfg.Instrumentation), logger, + misbehaviors, ) if err != nil { return err } + return n.Start() } @@ -175,3 +191,42 @@ func startSigner(cfg *Config) error { logger.Info(fmt.Sprintf("Remote signer connecting to %v", cfg.PrivValServer)) return nil } + +func setupNode() (*config.Config, log.Logger, *p2p.NodeKey, error) { + var tmcfg *config.Config + + home := os.Getenv("TMHOME") + if home == "" { + return nil, nil, nil, errors.New("TMHOME not set") + } + viper.AddConfigPath(filepath.Join(home, "config")) + viper.SetConfigName("config") + err := viper.ReadInConfig() + if err != nil { + return nil, nil, nil, err + } + tmcfg = config.DefaultConfig() + err = viper.Unmarshal(tmcfg) + if err != nil { + return nil, nil, nil, err + } + tmcfg.SetRoot(home) + if err = tmcfg.ValidateBasic(); err != nil { + return nil, nil, nil, fmt.Errorf("error in config file: %w", err) + } + if tmcfg.LogFormat == config.LogFormatJSON { + logger = log.NewTMJSONLogger(log.NewSyncWriter(os.Stdout)) + } + nodeLogger, err := tmflags.ParseLogLevel(tmcfg.LogLevel, logger, config.DefaultLogLevel()) + if err != nil { + return nil, nil, nil, err + } + nodeLogger = nodeLogger.With("module", "main") + + nodeKey, err := p2p.LoadOrGenNodeKey(tmcfg.NodeKeyFile()) + if err != nil { + return nil, nil, nil, fmt.Errorf("failed to load or gen node key %s: %w", tmcfg.NodeKeyFile(), err) + } + + return tmcfg, nodeLogger, nodeKey, nil +} diff --git a/test/e2e/docker/Dockerfile b/test/e2e/docker/Dockerfile index 273bd07c6..825aa7f0d 100644 --- a/test/e2e/docker/Dockerfile +++ b/test/e2e/docker/Dockerfile @@ -18,6 +18,7 @@ RUN go mod download COPY . . RUN make build && cp build/tendermint /usr/bin/tendermint COPY test/e2e/docker/entrypoint* /usr/bin/ +RUN cd test/e2e && make maverick && cp build/maverick /usr/bin/maverick RUN cd test/e2e && make app && cp build/app /usr/bin/app # Set up runtime directory. 
We don't use a separate runtime image since we need diff --git a/test/e2e/docker/entrypoint-maverick b/test/e2e/docker/entrypoint-maverick new file mode 100755 index 000000000..9469e2447 --- /dev/null +++ b/test/e2e/docker/entrypoint-maverick @@ -0,0 +1,10 @@ +#!/usr/bin/env bash + +# Forcibly remove any stray UNIX sockets left behind from previous runs +rm -rf /var/run/privval.sock /var/run/app.sock + +/usr/bin/app /tendermint/config/app.toml & + +sleep 1 + +/usr/bin/maverick "$@" diff --git a/test/e2e/generator/generate.go b/test/e2e/generator/generate.go index 851dfeb60..2ccbd3a6b 100644 --- a/test/e2e/generator/generate.go +++ b/test/e2e/generator/generate.go @@ -4,6 +4,7 @@ import ( "fmt" "math/rand" "sort" + "strconv" "strings" e2e "github.com/tendermint/tendermint/test/e2e/pkg" @@ -39,6 +40,10 @@ var ( "kill": 0.1, "restart": 0.1, } + nodeMisbehaviors = weightedChoice{ + misbehaviorOption{"double-prevote"}: 1, + misbehaviorOption{}: 9, + } ) // Generate generates random testnets using the given RNG. @@ -91,7 +96,7 @@ func generateTestnet(r *rand.Rand, opt map[string]interface{}) (e2e.Manifest, er nextStartAt := manifest.InitialHeight + 5 quorum := numValidators*2/3 + 1 for i := 1; i <= numValidators; i++ { - startAt := int64(0) + startAt := manifest.InitialHeight if i > quorum { startAt = nextStartAt nextStartAt += 5 @@ -174,7 +179,8 @@ func generateTestnet(r *rand.Rand, opt map[string]interface{}) (e2e.Manifest, er // generating invalid configurations. We do not set Seeds or PersistentPeers // here, since we need to know the overall network topology and startup // sequencing. -func generateNode(r *rand.Rand, mode e2e.Mode, startAt int64, forceArchive bool) *e2e.ManifestNode { +func generateNode( + r *rand.Rand, mode e2e.Mode, startAt int64, forceArchive bool) *e2e.ManifestNode { node := e2e.ManifestNode{ Mode: string(mode), StartAt: startAt, @@ -196,6 +202,14 @@ func generateNode(r *rand.Rand, mode e2e.Mode, startAt int64, forceArchive bool) node.SnapshotInterval = 3 } + if node.Mode == "validator" { + node.Misbehaviors = nodeMisbehaviors.Choose(r).(misbehaviorOption). + atHeight(startAt + 5 + int64(r.Intn(10))) + if len(node.Misbehaviors) != 0 { + node.PrivvalProtocol = "file" + } + } + // If a node which does not persist state also does not retain blocks, randomly // choose to either persist state or retain all blocks. if node.PersistInterval != nil && *node.PersistInterval == 0 && node.RetainBlocks > 0 { @@ -223,3 +237,16 @@ func generateNode(r *rand.Rand, mode e2e.Mode, startAt int64, forceArchive bool) func ptrUint64(i uint64) *uint64 { return &i } + +type misbehaviorOption struct { + misbehavior string +} + +func (m misbehaviorOption) atHeight(height int64) map[string]string { + misbehaviorMap := make(map[string]string) + if m.misbehavior == "" { + return misbehaviorMap + } + misbehaviorMap[strconv.Itoa(int(height))] = m.misbehavior + return misbehaviorMap +} diff --git a/test/e2e/generator/main.go b/test/e2e/generator/main.go index ce73ccd97..f17b4f3f4 100644 --- a/test/e2e/generator/main.go +++ b/test/e2e/generator/main.go @@ -9,6 +9,7 @@ import ( "path/filepath" "github.com/spf13/cobra" + "github.com/tendermint/tendermint/libs/log" ) diff --git a/test/e2e/generator/random.go b/test/e2e/generator/random.go index 04d1ac70d..ec59a01b2 100644 --- a/test/e2e/generator/random.go +++ b/test/e2e/generator/random.go @@ -57,7 +57,7 @@ func (uc uniformChoice) Choose(r *rand.Rand) interface{} { } // weightedChoice chooses a single random key from a map of keys and weights. 
-type weightedChoice map[interface{}]uint // nolint:unused
+type weightedChoice map[interface{}]uint
 
 func (wc weightedChoice) Choose(r *rand.Rand) interface{} {
 	total := 0
diff --git a/test/e2e/networks/ci.toml b/test/e2e/networks/ci.toml
index 9db2ca969..9085736a4 100644
--- a/test/e2e/networks/ci.toml
+++ b/test/e2e/networks/ci.toml
@@ -36,6 +36,7 @@ seeds = ["seed01"]
 seeds = ["seed01"]
 snapshot_interval = 5
 perturb = ["disconnect"]
+misbehaviors = { 1012 = "double-prevote", 1018 = "double-prevote" }
 
 [node.validator02]
 seeds = ["seed02"]
diff --git a/test/e2e/networks/simple.toml b/test/e2e/networks/simple.toml
index 96b81f79f..37f711a91 100644
--- a/test/e2e/networks/simple.toml
+++ b/test/e2e/networks/simple.toml
@@ -2,3 +2,4 @@
 [node.validator02]
 [node.validator03]
 [node.validator04]
+
diff --git a/test/e2e/pkg/manifest.go b/test/e2e/pkg/manifest.go
index c951d9409..8316e57e6 100644
--- a/test/e2e/pkg/manifest.go
+++ b/test/e2e/pkg/manifest.go
@@ -115,6 +115,16 @@ type ManifestNode struct {
 	// pause: temporarily pauses (freezes) the node
 	// restart: restarts the node, shutting it down with SIGTERM
 	Perturb []string `toml:"perturb"`
+
+	// Misbehaviors sets how a validator behaves during consensus at a
+	// certain height. Multiple misbehaviors at different heights can be used.
+	//
+	// An example of misbehaviors:
+	// { 10 = "double-prevote", 20 = "double-prevote" }
+	//
+	// For more information, look at the readme in the maverick folder.
+	// A list of all misbehaviors can be found in ../maverick/consensus/misbehavior.go
+	Misbehaviors map[string]string `toml:"misbehaviors"`
 }
 
 // Save saves the testnet manifest to a file.
diff --git a/test/e2e/pkg/testnet.go b/test/e2e/pkg/testnet.go
index 351f83378..c2f55bc3d 100644
--- a/test/e2e/pkg/testnet.go
+++ b/test/e2e/pkg/testnet.go
@@ -15,6 +15,7 @@ import (
 	"github.com/tendermint/tendermint/crypto"
 	"github.com/tendermint/tendermint/crypto/ed25519"
 	rpchttp "github.com/tendermint/tendermint/rpc/client/http"
+	mcs "github.com/tendermint/tendermint/test/maverick/consensus"
 )
 
 const (
@@ -78,6 +79,7 @@ type Node struct {
 	Seeds            []*Node
 	PersistentPeers  []*Node
 	Perturbations    []Perturbation
+	Misbehaviors     map[int64]string
 }
 
 // LoadTestnet loads a testnet from a manifest file, using the filename to
@@ -147,6 +149,7 @@ func LoadTestnet(file string) (*Testnet, error) {
 			SnapshotInterval: nodeManifest.SnapshotInterval,
 			RetainBlocks:     nodeManifest.RetainBlocks,
 			Perturbations:    []Perturbation{},
+			Misbehaviors:     make(map[int64]string),
 		}
 		if nodeManifest.Mode != "" {
 			node.Mode = Mode(nodeManifest.Mode)
@@ -166,6 +169,13 @@ func LoadTestnet(file string) (*Testnet, error) {
 		for _, p := range nodeManifest.Perturb {
 			node.Perturbations = append(node.Perturbations, Perturbation(p))
 		}
+		for heightString, misbehavior := range nodeManifest.Misbehaviors {
+			height, err := strconv.ParseInt(heightString, 10, 64)
+			if err != nil {
+				return nil, fmt.Errorf("unable to parse height %s to int64: %w", heightString, err)
+			}
+			node.Misbehaviors[height] = misbehavior
+		}
 		testnet.Nodes = append(testnet.Nodes, node)
 	}
 
@@ -324,6 +334,26 @@ func (n Node) Validate(testnet Testnet) error {
 			return fmt.Errorf("invalid perturbation %q", perturbation)
 		}
 	}
+
+	if (n.PrivvalProtocol != "file" || n.Mode != "validator") && len(n.Misbehaviors) != 0 {
+		return errors.New("must be using \"file\" privval protocol to implement misbehaviors")
+	}
+
+	for height, misbehavior := range n.Misbehaviors {
+		if height < n.StartAt {
+			return fmt.Errorf("misbehavior height %d is before start height %d", height,
n.StartAt) + } + exists := false + for possibleBehaviors := range mcs.MisbehaviorList { + if possibleBehaviors == misbehavior { + exists = true + } + } + if !exists { + return fmt.Errorf("misbehavior %s does not exist", misbehavior) + } + } + return nil } diff --git a/test/e2e/runner/main.go b/test/e2e/runner/main.go index 733a57f3e..bcca5b899 100644 --- a/test/e2e/runner/main.go +++ b/test/e2e/runner/main.go @@ -6,11 +6,14 @@ import ( "os" "github.com/spf13/cobra" + "github.com/tendermint/tendermint/libs/log" e2e "github.com/tendermint/tendermint/test/e2e/pkg" ) -var logger = log.NewTMLogger(log.NewSyncWriter(os.Stdout)) +var ( + logger = log.NewTMLogger(log.NewSyncWriter(os.Stdout)) +) func main() { NewCLI().Run() @@ -18,8 +21,9 @@ func main() { // CLI is the Cobra-based command-line interface. type CLI struct { - root *cobra.Command - testnet *e2e.Testnet + root *cobra.Command + testnet *e2e.Testnet + preserve bool } // NewCLI sets up the CLI. @@ -65,10 +69,13 @@ func NewCLI() *CLI { if err := Start(cli.testnet); err != nil { return err } + if err := waitForAllMisbehaviors(cli.testnet); err != nil { + return err + } if err := Perturb(cli.testnet); err != nil { return err } - if err := Wait(cli.testnet, 5); err != nil { // allow some txs to go through + if err := Wait(cli.testnet, interphaseWaitPeriod); err != nil { // allow some txs to go through return err } @@ -76,14 +83,17 @@ func NewCLI() *CLI { if err := <-chLoadResult; err != nil { return err } - if err := Wait(cli.testnet, 5); err != nil { // wait for network to settle before tests + // wait for network to settle before tests + if err := Wait(cli.testnet, interphaseWaitPeriod); err != nil { return err } if err := Test(cli.testnet); err != nil { return err } - if err := Cleanup(cli.testnet); err != nil { - return err + if !cli.preserve { + if err := Cleanup(cli.testnet); err != nil { + return err + } } return nil }, @@ -92,6 +102,9 @@ func NewCLI() *CLI { cli.root.PersistentFlags().StringP("file", "f", "", "Testnet TOML manifest") _ = cli.root.MarkPersistentFlagRequired("file") + cli.root.Flags().BoolVarP(&cli.preserve, "preserve", "p", false, + "Preserves the running of the test net after tests are completed") + cli.root.AddCommand(&cobra.Command{ Use: "setup", Short: "Generates the testnet directory and configuration", diff --git a/test/e2e/runner/setup.go b/test/e2e/runner/setup.go index c22eee725..8c641d9f6 100644 --- a/test/e2e/runner/setup.go +++ b/test/e2e/runner/setup.go @@ -12,11 +12,13 @@ import ( "path/filepath" "regexp" "sort" + "strconv" "strings" "text/template" "time" "github.com/BurntSushi/toml" + "github.com/tendermint/tendermint/config" "github.com/tendermint/tendermint/crypto/ed25519" "github.com/tendermint/tendermint/p2p" @@ -118,7 +120,20 @@ func Setup(testnet *e2e.Testnet) error { // MakeDockerCompose generates a Docker Compose config for a testnet. func MakeDockerCompose(testnet *e2e.Testnet) ([]byte, error) { // Must use version 2 Docker Compose format, to support IPv6. 
- tmpl, err := template.New("docker-compose").Parse(`version: '2.4' + tmpl, err := template.New("docker-compose").Funcs(template.FuncMap{ + "misbehaviorsToString": func(misbehaviors map[int64]string) string { + str := "" + for height, misbehavior := range misbehaviors { + // after the first behavior set, a comma must be prepended + if str != "" { + str += "," + } + heightString := strconv.Itoa(int(height)) + str += misbehavior + "," + heightString + } + return str + }, + }).Parse(`version: '2.4' networks: {{ .Name }}: @@ -142,6 +157,9 @@ services: image: tendermint/e2e-node {{- if eq .ABCIProtocol "builtin" }} entrypoint: /usr/bin/entrypoint-builtin +{{- else if .Misbehaviors }} + entrypoint: /usr/bin/entrypoint-maverick + command: ["node", "--misbehaviors", "{{ misbehaviorsToString .Misbehaviors }}"] {{- end }} init: true ports: @@ -330,6 +348,12 @@ func MakeAppConfig(node *e2e.Node) ([]byte, error) { } } + misbehaviors := make(map[string]string) + for height, misbehavior := range node.Misbehaviors { + misbehaviors[strconv.Itoa(int(height))] = misbehavior + } + cfg["misbehaviors"] = misbehaviors + if len(node.Testnet.ValidatorUpdates) > 0 { validatorUpdates := map[string]map[string]int64{} for height, validators := range node.Testnet.ValidatorUpdates { diff --git a/test/e2e/runner/wait.go b/test/e2e/runner/wait.go index fd3474c5c..c53032c30 100644 --- a/test/e2e/runner/wait.go +++ b/test/e2e/runner/wait.go @@ -7,6 +7,8 @@ import ( e2e "github.com/tendermint/tendermint/test/e2e/pkg" ) +const interphaseWaitPeriod = 5 + // Wait waits for a number of blocks to be produced, and for all nodes to catch // up with it. func Wait(testnet *e2e.Testnet, blocks int64) error { @@ -22,3 +24,22 @@ func Wait(testnet *e2e.Testnet, blocks int64) error { } return nil } + +// WaitForAllMisbehaviors calculates the height of the last misbehavior and ensures the entire +// testnet has surpassed this height before moving on to the next phase +func waitForAllMisbehaviors(testnet *e2e.Testnet) error { + _, _, err := waitForHeight(testnet, lastMisbehaviorHeight(testnet)) + return err +} + +func lastMisbehaviorHeight(testnet *e2e.Testnet) int64 { + lastHeight := testnet.InitialHeight + for _, n := range testnet.Nodes { + for height := range n.Misbehaviors { + if height > lastHeight { + lastHeight = height + } + } + } + return lastHeight + interphaseWaitPeriod +} diff --git a/test/e2e/tests/app_test.go b/test/e2e/tests/app_test.go index 60018cace..33eac1b40 100644 --- a/test/e2e/tests/app_test.go +++ b/test/e2e/tests/app_test.go @@ -8,6 +8,7 @@ import ( "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + e2e "github.com/tendermint/tendermint/test/e2e/pkg" "github.com/tendermint/tendermint/types" ) diff --git a/test/e2e/tests/block_test.go b/test/e2e/tests/block_test.go index 688d7bd6c..23653d1e4 100644 --- a/test/e2e/tests/block_test.go +++ b/test/e2e/tests/block_test.go @@ -5,6 +5,7 @@ import ( "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + e2e "github.com/tendermint/tendermint/test/e2e/pkg" ) diff --git a/test/e2e/tests/e2e_test.go b/test/e2e/tests/e2e_test.go index 80b43229c..15c747b5b 100644 --- a/test/e2e/tests/e2e_test.go +++ b/test/e2e/tests/e2e_test.go @@ -8,6 +8,7 @@ import ( "testing" "github.com/stretchr/testify/require" + rpchttp "github.com/tendermint/tendermint/rpc/client/http" rpctypes "github.com/tendermint/tendermint/rpc/core/types" e2e "github.com/tendermint/tendermint/test/e2e/pkg" diff --git a/test/e2e/tests/evidence_test.go 
b/test/e2e/tests/evidence_test.go
new file mode 100644
index 000000000..b98224940
--- /dev/null
+++ b/test/e2e/tests/evidence_test.go
@@ -0,0 +1,50 @@
+package e2e_test
+
+import (
+	"bytes"
+	"testing"
+
+	"github.com/stretchr/testify/require"
+
+	e2e "github.com/tendermint/tendermint/test/e2e/pkg"
+	"github.com/tendermint/tendermint/types"
+)
+
+// assert that all nodes that have blocks during the height (or height + 1) of a misbehavior have evidence
+// for that misbehavior
+func TestEvidence_Misbehavior(t *testing.T) {
+	blocks := fetchBlockChain(t)
+	testNode(t, func(t *testing.T, node e2e.Node) {
+		for _, block := range blocks {
+			// Find any evidence blaming this node in this block
+			var nodeEvidence types.Evidence
+			for _, evidence := range block.Evidence.Evidence {
+				switch evidence := evidence.(type) {
+				case *types.DuplicateVoteEvidence:
+					if bytes.Equal(evidence.VoteA.ValidatorAddress, node.Key.PubKey().Address()) {
+						nodeEvidence = evidence
+					}
+				default:
+					t.Fatalf("unexpected evidence type %T", evidence)
+				}
+			}
+
+			// Check that evidence was as expected (evidence is submitted in following height)
+			misbehavior, ok := node.Misbehaviors[block.Height-1]
+			if !ok {
+				require.Nil(t, nodeEvidence, "found unexpected evidence %v in height %v",
+					nodeEvidence, block.Height)
+				continue
+			}
+			require.NotNil(t, nodeEvidence, "no evidence found for misbehavior %v in height %v",
+				misbehavior, block.Height)
+
+			switch misbehavior {
+			case "double-prevote":
+				require.IsType(t, &types.DuplicateVoteEvidence{}, nodeEvidence, "unexpected evidence type")
+			default:
+				t.Fatalf("unknown misbehavior %v", misbehavior)
+			}
+		}
+	})
+}
diff --git a/test/e2e/tests/net_test.go b/test/e2e/tests/net_test.go
index e0a84aeeb..1ca43fa05 100644
--- a/test/e2e/tests/net_test.go
+++ b/test/e2e/tests/net_test.go
@@ -4,6 +4,7 @@ import (
 	"testing"
 
 	"github.com/stretchr/testify/require"
+
 	e2e "github.com/tendermint/tendermint/test/e2e/pkg"
 )
 
diff --git a/test/e2e/tests/validator_test.go b/test/e2e/tests/validator_test.go
index 2398d0e62..47eb1555a 100644
--- a/test/e2e/tests/validator_test.go
+++ b/test/e2e/tests/validator_test.go
@@ -5,6 +5,7 @@ import (
 	"testing"
 
 	"github.com/stretchr/testify/require"
+
 	e2e "github.com/tendermint/tendermint/test/e2e/pkg"
 	"github.com/tendermint/tendermint/types"
 )
 
diff --git a/test/maverick/README.md b/test/maverick/README.md
new file mode 100644
index 000000000..308275536
--- /dev/null
+++ b/test/maverick/README.md
@@ -0,0 +1,51 @@
+# Maverick
+
+![](https://assets.rollingstone.com/assets/2015/article/tom-cruise-to-fight-drones-in-top-gun-sequel-20150629/201166/large_rect/1435581755/1401x788-Top-Gun-3.jpg)
+
+A byzantine node used to test Tendermint consensus against a variety of misbehaviors. It is designed to make it easy to create new misbehaviors and to examine how a Tendermint network reacts to them. It can also be used for fuzz testing with different network arrangements.
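+
+In the e2e test harness touched by this change, a validator becomes a maverick node when its entry in the testnet manifest carries a `misbehaviors` map of heights to misbehavior names. For example, `test/e2e/networks/ci.toml` now schedules two equivocations for one validator:
+
+```toml
+misbehaviors = { 1012 = "double-prevote", 1018 = "double-prevote" }
+```
+
+The e2e runner turns this map into a `--misbehaviors <name>,<height>[,...]` flag (see `misbehaviorsToString` in `test/e2e/runner/setup.go`) and starts the node through the `entrypoint-maverick` script instead of the regular entrypoint.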
+
+## Misbehaviors
+
+A misbehavior allows control over the following stages of consensus, as highlighted by the struct below:
+
+```go
+type Misbehavior struct {
+	Name string
+
+	EnterPropose func(cs *State, height int64, round int32)
+
+	EnterPrevote func(cs *State, height int64, round int32)
+
+	EnterPrecommit func(cs *State, height int64, round int32)
+
+	ReceivePrevote func(cs *State, prevote *types.Vote)
+
+	ReceivePrecommit func(cs *State, precommit *types.Vote)
+
+	ReceiveProposal func(cs *State, proposal *types.Proposal) error
+}
+```
+
+At each of these events, the node can exhibit a different misbehavior. To create a new misbehavior, define a function that builds off the existing default misbehavior and overrides one or more of these functions. Then append it to `MisbehaviorList` so the node recognizes it, like so:
+
+```go
+var MisbehaviorList = map[string]Misbehavior{
+	"double-prevote": DoublePrevoteMisbehavior(),
+}
+```
+
+## Setup
+
+The maverick node takes most of its functionality from the existing Tendermint CLI. To build the binary, run the following in the directory of this README:
+
+```bash
+go build
+```
+
+Use `maverick init` to initialize a single node and `maverick node` to run it. The node runs normally unless you use the misbehaviors flag, as follows:
+
+```bash
+maverick node --proxy_app persistent_kvstore --misbehaviors double-prevote,10
+```
+
+This causes the node to send two conflicting prevotes (one for the proposed block and one for nil) in every round at height 10. To add more misbehaviors at different heights, append the next misbehavior and height after the first (comma separated).
diff --git a/test/maverick/consensus/metrics.go b/test/maverick/consensus/metrics.go
new file mode 100644
index 000000000..bbd823a3f
--- /dev/null
+++ b/test/maverick/consensus/metrics.go
@@ -0,0 +1,220 @@
+package consensus
+
+import (
+	"github.com/go-kit/kit/metrics"
+	"github.com/go-kit/kit/metrics/discard"
+
+	prometheus "github.com/go-kit/kit/metrics/prometheus"
+	stdprometheus "github.com/prometheus/client_golang/prometheus"
+)
+
+const (
+	// MetricsSubsystem is a subsystem shared by all metrics exposed by this
+	// package.
+	MetricsSubsystem = "consensus"
+)
+
+// Metrics contains metrics exposed by this package.
+type Metrics struct {
+	// Height of the chain.
+	Height metrics.Gauge
+
+	// ValidatorLastSignedHeight of a validator.
+	ValidatorLastSignedHeight metrics.Gauge
+
+	// Number of rounds.
+	Rounds metrics.Gauge
+
+	// Number of validators.
+	Validators metrics.Gauge
+	// Total power of all validators.
+	ValidatorsPower metrics.Gauge
+	// Power of a validator.
+	ValidatorPower metrics.Gauge
+	// Amount of blocks missed by a validator.
+	ValidatorMissedBlocks metrics.Gauge
+	// Number of validators who did not sign.
+	MissingValidators metrics.Gauge
+	// Total power of the missing validators.
+	MissingValidatorsPower metrics.Gauge
+	// Number of validators who tried to double sign.
+	ByzantineValidators metrics.Gauge
+	// Total power of the byzantine validators.
+	ByzantineValidatorsPower metrics.Gauge
+
+	// Time between this and the last block.
+	BlockIntervalSeconds metrics.Histogram
+
+	// Number of transactions.
+	NumTxs metrics.Gauge
+	// Size of the block.
+	BlockSizeBytes metrics.Gauge
+	// Total number of transactions.
+	TotalTxs metrics.Gauge
+	// The latest block height.
+	CommittedHeight metrics.Gauge
+	// Whether or not a node is fast syncing. 1 if yes, 0 if no.
+	FastSyncing metrics.Gauge
+	// Whether or not a node is state syncing. 1 if yes, 0 if no.
+ StateSyncing metrics.Gauge + + // Number of blockparts transmitted by peer. + BlockParts metrics.Counter +} + +// PrometheusMetrics returns Metrics build using Prometheus client library. +// Optionally, labels can be provided along with their values ("foo", +// "fooValue"). +func PrometheusMetrics(namespace string, labelsAndValues ...string) *Metrics { + labels := []string{} + for i := 0; i < len(labelsAndValues); i += 2 { + labels = append(labels, labelsAndValues[i]) + } + return &Metrics{ + Height: prometheus.NewGaugeFrom(stdprometheus.GaugeOpts{ + Namespace: namespace, + Subsystem: MetricsSubsystem, + Name: "height", + Help: "Height of the chain.", + }, labels).With(labelsAndValues...), + Rounds: prometheus.NewGaugeFrom(stdprometheus.GaugeOpts{ + Namespace: namespace, + Subsystem: MetricsSubsystem, + Name: "rounds", + Help: "Number of rounds.", + }, labels).With(labelsAndValues...), + + Validators: prometheus.NewGaugeFrom(stdprometheus.GaugeOpts{ + Namespace: namespace, + Subsystem: MetricsSubsystem, + Name: "validators", + Help: "Number of validators.", + }, labels).With(labelsAndValues...), + ValidatorLastSignedHeight: prometheus.NewGaugeFrom(stdprometheus.GaugeOpts{ + Namespace: namespace, + Subsystem: MetricsSubsystem, + Name: "validator_last_signed_height", + Help: "Last signed height for a validator", + }, append(labels, "validator_address")).With(labelsAndValues...), + ValidatorMissedBlocks: prometheus.NewGaugeFrom(stdprometheus.GaugeOpts{ + Namespace: namespace, + Subsystem: MetricsSubsystem, + Name: "validator_missed_blocks", + Help: "Total missed blocks for a validator", + }, append(labels, "validator_address")).With(labelsAndValues...), + ValidatorsPower: prometheus.NewGaugeFrom(stdprometheus.GaugeOpts{ + Namespace: namespace, + Subsystem: MetricsSubsystem, + Name: "validators_power", + Help: "Total power of all validators.", + }, labels).With(labelsAndValues...), + ValidatorPower: prometheus.NewGaugeFrom(stdprometheus.GaugeOpts{ + Namespace: namespace, + Subsystem: MetricsSubsystem, + Name: "validator_power", + Help: "Power of a validator", + }, append(labels, "validator_address")).With(labelsAndValues...), + MissingValidators: prometheus.NewGaugeFrom(stdprometheus.GaugeOpts{ + Namespace: namespace, + Subsystem: MetricsSubsystem, + Name: "missing_validators", + Help: "Number of validators who did not sign.", + }, labels).With(labelsAndValues...), + MissingValidatorsPower: prometheus.NewGaugeFrom(stdprometheus.GaugeOpts{ + Namespace: namespace, + Subsystem: MetricsSubsystem, + Name: "missing_validators_power", + Help: "Total power of the missing validators.", + }, labels).With(labelsAndValues...), + ByzantineValidators: prometheus.NewGaugeFrom(stdprometheus.GaugeOpts{ + Namespace: namespace, + Subsystem: MetricsSubsystem, + Name: "byzantine_validators", + Help: "Number of validators who tried to double sign.", + }, labels).With(labelsAndValues...), + ByzantineValidatorsPower: prometheus.NewGaugeFrom(stdprometheus.GaugeOpts{ + Namespace: namespace, + Subsystem: MetricsSubsystem, + Name: "byzantine_validators_power", + Help: "Total power of the byzantine validators.", + }, labels).With(labelsAndValues...), + BlockIntervalSeconds: prometheus.NewHistogramFrom(stdprometheus.HistogramOpts{ + Namespace: namespace, + Subsystem: MetricsSubsystem, + Name: "block_interval_seconds", + Help: "Time between this and the last block.", + }, labels).With(labelsAndValues...), + NumTxs: prometheus.NewGaugeFrom(stdprometheus.GaugeOpts{ + Namespace: namespace, + Subsystem: MetricsSubsystem, + 
Name: "num_txs", + Help: "Number of transactions.", + }, labels).With(labelsAndValues...), + BlockSizeBytes: prometheus.NewGaugeFrom(stdprometheus.GaugeOpts{ + Namespace: namespace, + Subsystem: MetricsSubsystem, + Name: "block_size_bytes", + Help: "Size of the block.", + }, labels).With(labelsAndValues...), + TotalTxs: prometheus.NewGaugeFrom(stdprometheus.GaugeOpts{ + Namespace: namespace, + Subsystem: MetricsSubsystem, + Name: "total_txs", + Help: "Total number of transactions.", + }, labels).With(labelsAndValues...), + CommittedHeight: prometheus.NewGaugeFrom(stdprometheus.GaugeOpts{ + Namespace: namespace, + Subsystem: MetricsSubsystem, + Name: "latest_block_height", + Help: "The latest block height.", + }, labels).With(labelsAndValues...), + FastSyncing: prometheus.NewGaugeFrom(stdprometheus.GaugeOpts{ + Namespace: namespace, + Subsystem: MetricsSubsystem, + Name: "fast_syncing", + Help: "Whether or not a node is fast syncing. 1 if yes, 0 if no.", + }, labels).With(labelsAndValues...), + StateSyncing: prometheus.NewGaugeFrom(stdprometheus.GaugeOpts{ + Namespace: namespace, + Subsystem: MetricsSubsystem, + Name: "state_syncing", + Help: "Whether or not a node is state syncing. 1 if yes, 0 if no.", + }, labels).With(labelsAndValues...), + BlockParts: prometheus.NewCounterFrom(stdprometheus.CounterOpts{ + Namespace: namespace, + Subsystem: MetricsSubsystem, + Name: "block_parts", + Help: "Number of blockparts transmitted by peer.", + }, append(labels, "peer_id")).With(labelsAndValues...), + } +} + +// NopMetrics returns no-op Metrics. +func NopMetrics() *Metrics { + return &Metrics{ + Height: discard.NewGauge(), + + ValidatorLastSignedHeight: discard.NewGauge(), + + Rounds: discard.NewGauge(), + + Validators: discard.NewGauge(), + ValidatorsPower: discard.NewGauge(), + ValidatorPower: discard.NewGauge(), + ValidatorMissedBlocks: discard.NewGauge(), + MissingValidators: discard.NewGauge(), + MissingValidatorsPower: discard.NewGauge(), + ByzantineValidators: discard.NewGauge(), + ByzantineValidatorsPower: discard.NewGauge(), + + BlockIntervalSeconds: discard.NewHistogram(), + + NumTxs: discard.NewGauge(), + BlockSizeBytes: discard.NewGauge(), + TotalTxs: discard.NewGauge(), + CommittedHeight: discard.NewGauge(), + FastSyncing: discard.NewGauge(), + StateSyncing: discard.NewGauge(), + BlockParts: discard.NewCounter(), + } +} diff --git a/test/maverick/consensus/misbehavior.go b/test/maverick/consensus/misbehavior.go new file mode 100644 index 000000000..75d2bd278 --- /dev/null +++ b/test/maverick/consensus/misbehavior.go @@ -0,0 +1,398 @@ +package consensus + +import ( + "fmt" + + cstypes "github.com/tendermint/tendermint/consensus/types" + tmproto "github.com/tendermint/tendermint/proto/tendermint/types" + "github.com/tendermint/tendermint/types" +) + +// MisbehaviorList encompasses a list of all possible behaviors +var MisbehaviorList = map[string]Misbehavior{ + "double-prevote": DoublePrevoteMisbehavior(), +} + +type Misbehavior struct { + Name string + + EnterPropose func(cs *State, height int64, round int32) + + EnterPrevote func(cs *State, height int64, round int32) + + EnterPrecommit func(cs *State, height int64, round int32) + + ReceivePrevote func(cs *State, prevote *types.Vote) + + ReceivePrecommit func(cs *State, precommit *types.Vote) + + ReceiveProposal func(cs *State, proposal *types.Proposal) error +} + +// BEHAVIORS + +func DefaultMisbehavior() Misbehavior { + return Misbehavior{ + Name: "default", + EnterPropose: defaultEnterPropose, + EnterPrevote: defaultEnterPrevote, + 
EnterPrecommit: defaultEnterPrecommit, + ReceivePrevote: defaultReceivePrevote, + ReceivePrecommit: defaultReceivePrecommit, + ReceiveProposal: defaultReceiveProposal, + } +} + +// DoublePrevoteMisbehavior will make a node prevote both nil and a block in the same +// height and round. +func DoublePrevoteMisbehavior() Misbehavior { + b := DefaultMisbehavior() + b.Name = "double-prevote" + b.EnterPrevote = func(cs *State, height int64, round int32) { + + // If a block is locked, prevote that. + if cs.LockedBlock != nil { + cs.Logger.Info("enterPrevote: Already locked on a block, prevoting locked block") + cs.signAddVote(tmproto.PrevoteType, cs.LockedBlock.Hash(), cs.LockedBlockParts.Header()) + return + } + + // If ProposalBlock is nil, prevote nil. + if cs.ProposalBlock == nil { + cs.Logger.Info("enterPrevote: ProposalBlock is nil") + cs.signAddVote(tmproto.PrevoteType, nil, types.PartSetHeader{}) + return + } + + // Validate proposal block + err := cs.blockExec.ValidateBlock(cs.state, cs.ProposalBlock) + if err != nil { + // ProposalBlock is invalid, prevote nil. + cs.Logger.Error("enterPrevote: ProposalBlock is invalid", "err", err) + cs.signAddVote(tmproto.PrevoteType, nil, types.PartSetHeader{}) + return + } + + if cs.sw == nil { + cs.Logger.Error("nil switch") + return + } + + prevote, err := cs.signVote(tmproto.PrevoteType, cs.ProposalBlock.Hash(), cs.ProposalBlockParts.Header()) + if err != nil { + cs.Logger.Error("enterPrevote: Unable to sign block", "err", err) + } + + nilPrevote, err := cs.signVote(tmproto.PrevoteType, nil, types.PartSetHeader{}) + if err != nil { + cs.Logger.Error("enterPrevote: Unable to sign block", "err", err) + } + + // add our own vote + cs.sendInternalMessage(msgInfo{&VoteMessage{prevote}, ""}) + + cs.Logger.Info("Sending conflicting votes") + peers := cs.sw.Peers().List() + // there has to be at least two other peers connected else this behavior works normally + for idx, peer := range peers { + if idx%2 == 0 { // sign the proposal block + peer.Send(VoteChannel, MustEncode(&VoteMessage{prevote})) + } else { // sign a nil block + peer.Send(VoteChannel, MustEncode(&VoteMessage{nilPrevote})) + } + } + } + return b +} + +// DEFAULTS + +func defaultEnterPropose(cs *State, height int64, round int32) { + logger := cs.Logger.With("height", height, "round", round) + // If we don't get the proposal and all block parts quick enough, enterPrevote + cs.scheduleTimeout(cs.config.Propose(round), height, round, cstypes.RoundStepPropose) + + // Nothing more to do if we're not a validator + if cs.privValidator == nil { + logger.Debug("This node is not a validator") + return + } + logger.Debug("This node is a validator") + + pubKey, err := cs.privValidator.GetPubKey() + if err != nil { + // If this node is a validator & proposer in the currentx round, it will + // miss the opportunity to create a block. 
+ logger.Error("Error on retrival of pubkey", "err", err) + return + } + address := pubKey.Address() + + // if not a validator, we're done + if !cs.Validators.HasAddress(address) { + logger.Debug("This node is not a validator", "addr", address, "vals", cs.Validators) + return + } + + if cs.isProposer(address) { + logger.Info("enterPropose: Our turn to propose", + "proposer", + address, + "privValidator", + cs.privValidator) + cs.decideProposal(height, round) + } else { + logger.Info("enterPropose: Not our turn to propose", + "proposer", + cs.Validators.GetProposer().Address, + "privValidator", + cs.privValidator) + } +} + +func defaultEnterPrevote(cs *State, height int64, round int32) { + logger := cs.Logger.With("height", height, "round", round) + + // If a block is locked, prevote that. + if cs.LockedBlock != nil { + logger.Info("enterPrevote: Already locked on a block, prevoting locked block") + cs.signAddVote(tmproto.PrevoteType, cs.LockedBlock.Hash(), cs.LockedBlockParts.Header()) + return + } + + // If ProposalBlock is nil, prevote nil. + if cs.ProposalBlock == nil { + logger.Info("enterPrevote: ProposalBlock is nil") + cs.signAddVote(tmproto.PrevoteType, nil, types.PartSetHeader{}) + return + } + + // Validate proposal block + err := cs.blockExec.ValidateBlock(cs.state, cs.ProposalBlock) + if err != nil { + // ProposalBlock is invalid, prevote nil. + logger.Error("enterPrevote: ProposalBlock is invalid", "err", err) + cs.signAddVote(tmproto.PrevoteType, nil, types.PartSetHeader{}) + return + } + + // Prevote cs.ProposalBlock + // NOTE: the proposal signature is validated when it is received, + // and the proposal block parts are validated as they are received (against the merkle hash in the proposal) + logger.Info("enterPrevote: ProposalBlock is valid") + cs.signAddVote(tmproto.PrevoteType, cs.ProposalBlock.Hash(), cs.ProposalBlockParts.Header()) +} + +func defaultEnterPrecommit(cs *State, height int64, round int32) { + logger := cs.Logger.With("height", height, "round", round) + + // check for a polka + blockID, ok := cs.Votes.Prevotes(round).TwoThirdsMajority() + + // If we don't have a polka, we must precommit nil. + if !ok { + if cs.LockedBlock != nil { + logger.Info("enterPrecommit: No +2/3 prevotes during enterPrecommit while we're locked. Precommitting nil") + } else { + logger.Info("enterPrecommit: No +2/3 prevotes during enterPrecommit. Precommitting nil.") + } + cs.signAddVote(tmproto.PrecommitType, nil, types.PartSetHeader{}) + return + } + + // At this point +2/3 prevoted for a particular block or nil. + _ = cs.eventBus.PublishEventPolka(cs.RoundStateEvent()) + + // the latest POLRound should be this round. + polRound, _ := cs.Votes.POLInfo() + if polRound < round { + panic(fmt.Sprintf("This POLRound should be %v but got %v", round, polRound)) + } + + // +2/3 prevoted nil. Unlock and precommit nil. + if len(blockID.Hash) == 0 { + if cs.LockedBlock == nil { + logger.Info("enterPrecommit: +2/3 prevoted for nil.") + } else { + logger.Info("enterPrecommit: +2/3 prevoted for nil. Unlocking") + cs.LockedRound = -1 + cs.LockedBlock = nil + cs.LockedBlockParts = nil + _ = cs.eventBus.PublishEventUnlock(cs.RoundStateEvent()) + } + cs.signAddVote(tmproto.PrecommitType, nil, types.PartSetHeader{}) + return + } + + // At this point, +2/3 prevoted for a particular block. + + // If we're already locked on that block, precommit it, and update the LockedRound + if cs.LockedBlock.HashesTo(blockID.Hash) { + logger.Info("enterPrecommit: +2/3 prevoted locked block. 
Relocking") + cs.LockedRound = round + _ = cs.eventBus.PublishEventRelock(cs.RoundStateEvent()) + cs.signAddVote(tmproto.PrecommitType, blockID.Hash, blockID.PartSetHeader) + return + } + + // If +2/3 prevoted for proposal block, stage and precommit it + if cs.ProposalBlock.HashesTo(blockID.Hash) { + logger.Info("enterPrecommit: +2/3 prevoted proposal block. Locking", "hash", blockID.Hash) + // Validate the block. + if err := cs.blockExec.ValidateBlock(cs.state, cs.ProposalBlock); err != nil { + panic(fmt.Sprintf("enterPrecommit: +2/3 prevoted for an invalid block: %v", err)) + } + cs.LockedRound = round + cs.LockedBlock = cs.ProposalBlock + cs.LockedBlockParts = cs.ProposalBlockParts + _ = cs.eventBus.PublishEventLock(cs.RoundStateEvent()) + cs.signAddVote(tmproto.PrecommitType, blockID.Hash, blockID.PartSetHeader) + return + } + + // There was a polka in this round for a block we don't have. + // Fetch that block, unlock, and precommit nil. + // The +2/3 prevotes for this round is the POL for our unlock. + logger.Info("enterPrecommit: +2/3 prevotes for a block we don't have. Voting nil", "blockID", blockID) + cs.LockedRound = -1 + cs.LockedBlock = nil + cs.LockedBlockParts = nil + if !cs.ProposalBlockParts.HasHeader(blockID.PartSetHeader) { + cs.ProposalBlock = nil + cs.ProposalBlockParts = types.NewPartSetFromHeader(blockID.PartSetHeader) + } + _ = cs.eventBus.PublishEventUnlock(cs.RoundStateEvent()) + cs.signAddVote(tmproto.PrecommitType, nil, types.PartSetHeader{}) +} + +func defaultReceivePrevote(cs *State, vote *types.Vote) { + height := cs.Height + prevotes := cs.Votes.Prevotes(vote.Round) + + // If +2/3 prevotes for a block or nil for *any* round: + if blockID, ok := prevotes.TwoThirdsMajority(); ok { + + // There was a polka! + // If we're locked but this is a recent polka, unlock. + // If it matches our ProposalBlock, update the ValidBlock + + // Unlock if `cs.LockedRound < vote.Round <= cs.Round` + // NOTE: If vote.Round > cs.Round, we'll deal with it when we get to vote.Round + if (cs.LockedBlock != nil) && + (cs.LockedRound < vote.Round) && + (vote.Round <= cs.Round) && + !cs.LockedBlock.HashesTo(blockID.Hash) { + + cs.Logger.Info("Unlocking because of POL.", "lockedRound", cs.LockedRound, "POLRound", vote.Round) + cs.LockedRound = -1 + cs.LockedBlock = nil + cs.LockedBlockParts = nil + _ = cs.eventBus.PublishEventUnlock(cs.RoundStateEvent()) + } + + // Update Valid* if we can. + // NOTE: our proposal block may be nil or not what received a polka.. + if len(blockID.Hash) != 0 && (cs.ValidRound < vote.Round) && (vote.Round == cs.Round) { + + if cs.ProposalBlock.HashesTo(blockID.Hash) { + cs.Logger.Info( + "Updating ValidBlock because of POL.", "validRound", cs.ValidRound, "POLRound", vote.Round) + cs.ValidRound = vote.Round + cs.ValidBlock = cs.ProposalBlock + cs.ValidBlockParts = cs.ProposalBlockParts + } else { + cs.Logger.Info( + "Valid block we don't know about. Set ProposalBlock=nil", + "proposal", cs.ProposalBlock.Hash(), "blockID", blockID.Hash) + // We're getting the wrong block. 
+ cs.ProposalBlock = nil + } + if !cs.ProposalBlockParts.HasHeader(blockID.PartSetHeader) { + cs.ProposalBlockParts = types.NewPartSetFromHeader(blockID.PartSetHeader) + } + cs.evsw.FireEvent(types.EventValidBlock, &cs.RoundState) + _ = cs.eventBus.PublishEventValidBlock(cs.RoundStateEvent()) + } + } + + // If +2/3 prevotes for *anything* for future round: + switch { + case cs.Round < vote.Round && prevotes.HasTwoThirdsAny(): + // Round-skip if there is any 2/3+ of votes ahead of us + cs.enterNewRound(height, vote.Round) + case cs.Round == vote.Round && cstypes.RoundStepPrevote <= cs.Step: // current round + blockID, ok := prevotes.TwoThirdsMajority() + if ok && (cs.isProposalComplete() || len(blockID.Hash) == 0) { + cs.enterPrecommit(height, vote.Round) + } else if prevotes.HasTwoThirdsAny() { + cs.enterPrevoteWait(height, vote.Round) + } + case cs.Proposal != nil && 0 <= cs.Proposal.POLRound && cs.Proposal.POLRound == vote.Round: + // If the proposal is now complete, enter prevote of cs.Round. + if cs.isProposalComplete() { + cs.enterPrevote(height, cs.Round) + } + } + +} + +func defaultReceivePrecommit(cs *State, vote *types.Vote) { + height := cs.Height + precommits := cs.Votes.Precommits(vote.Round) + cs.Logger.Info("Added to precommit", "vote", vote, "precommits", precommits.StringShort()) + + blockID, ok := precommits.TwoThirdsMajority() + if ok { + // Executed as TwoThirdsMajority could be from a higher round + cs.enterNewRound(height, vote.Round) + cs.enterPrecommit(height, vote.Round) + if len(blockID.Hash) != 0 { + cs.enterCommit(height, vote.Round) + if cs.config.SkipTimeoutCommit && precommits.HasAll() { + cs.enterNewRound(cs.Height, 0) + } + } else { + cs.enterPrecommitWait(height, vote.Round) + } + } else if cs.Round <= vote.Round && precommits.HasTwoThirdsAny() { + cs.enterNewRound(height, vote.Round) + cs.enterPrecommitWait(height, vote.Round) + } +} + +func defaultReceiveProposal(cs *State, proposal *types.Proposal) error { + // Already have one + // TODO: possibly catch double proposals + if cs.Proposal != nil { + return nil + } + + // Does not apply + if proposal.Height != cs.Height || proposal.Round != cs.Round { + return nil + } + + // Verify POLRound, which must be -1 or in range [0, proposal.Round). + if proposal.POLRound < -1 || + (proposal.POLRound >= 0 && proposal.POLRound >= proposal.Round) { + return ErrInvalidProposalPOLRound + } + + p := proposal.ToProto() + // Verify signature + if !cs.Validators.GetProposer().PubKey.VerifySignature( + types.ProposalSignBytes(cs.state.ChainID, p), proposal.Signature) { + return ErrInvalidProposalSignature + } + + proposal.Signature = p.Signature + cs.Proposal = proposal + // We don't update cs.ProposalBlockParts if it is already set. + // This happens if we're already in cstypes.RoundStepCommit or if there is a valid block in the current round. + // TODO: We can check if Proposal is for a different block as this is a sign of misbehavior! 
+ if cs.ProposalBlockParts == nil { + cs.ProposalBlockParts = types.NewPartSetFromHeader(proposal.BlockID.PartSetHeader) + } + cs.Logger.Info("Received proposal", "proposal", proposal) + return nil +} diff --git a/test/maverick/consensus/msgs.go b/test/maverick/consensus/msgs.go new file mode 100644 index 000000000..4de96b5f4 --- /dev/null +++ b/test/maverick/consensus/msgs.go @@ -0,0 +1,377 @@ +package consensus + +import ( + "errors" + "fmt" + + "github.com/gogo/protobuf/proto" + + cstypes "github.com/tendermint/tendermint/consensus/types" + "github.com/tendermint/tendermint/libs/bits" + tmmath "github.com/tendermint/tendermint/libs/math" + "github.com/tendermint/tendermint/p2p" + tmcons "github.com/tendermint/tendermint/proto/tendermint/consensus" + tmproto "github.com/tendermint/tendermint/proto/tendermint/types" + "github.com/tendermint/tendermint/types" +) + +// MsgToProto takes a consensus message type and returns the proto defined consensus message +func MsgToProto(msg Message) (*tmcons.Message, error) { + if msg == nil { + return nil, errors.New("consensus: message is nil") + } + var pb tmcons.Message + + switch msg := msg.(type) { + case *NewRoundStepMessage: + pb = tmcons.Message{ + Sum: &tmcons.Message_NewRoundStep{ + NewRoundStep: &tmcons.NewRoundStep{ + Height: msg.Height, + Round: msg.Round, + Step: uint32(msg.Step), + SecondsSinceStartTime: msg.SecondsSinceStartTime, + LastCommitRound: msg.LastCommitRound, + }, + }, + } + case *NewValidBlockMessage: + pbPartSetHeader := msg.BlockPartSetHeader.ToProto() + pbBits := msg.BlockParts.ToProto() + pb = tmcons.Message{ + Sum: &tmcons.Message_NewValidBlock{ + NewValidBlock: &tmcons.NewValidBlock{ + Height: msg.Height, + Round: msg.Round, + BlockPartSetHeader: pbPartSetHeader, + BlockParts: pbBits, + IsCommit: msg.IsCommit, + }, + }, + } + case *ProposalMessage: + pbP := msg.Proposal.ToProto() + pb = tmcons.Message{ + Sum: &tmcons.Message_Proposal{ + Proposal: &tmcons.Proposal{ + Proposal: *pbP, + }, + }, + } + case *ProposalPOLMessage: + pbBits := msg.ProposalPOL.ToProto() + pb = tmcons.Message{ + Sum: &tmcons.Message_ProposalPol{ + ProposalPol: &tmcons.ProposalPOL{ + Height: msg.Height, + ProposalPolRound: msg.ProposalPOLRound, + ProposalPol: *pbBits, + }, + }, + } + case *BlockPartMessage: + parts, err := msg.Part.ToProto() + if err != nil { + return nil, fmt.Errorf("msg to proto error: %w", err) + } + pb = tmcons.Message{ + Sum: &tmcons.Message_BlockPart{ + BlockPart: &tmcons.BlockPart{ + Height: msg.Height, + Round: msg.Round, + Part: *parts, + }, + }, + } + case *VoteMessage: + vote := msg.Vote.ToProto() + pb = tmcons.Message{ + Sum: &tmcons.Message_Vote{ + Vote: &tmcons.Vote{ + Vote: vote, + }, + }, + } + case *HasVoteMessage: + pb = tmcons.Message{ + Sum: &tmcons.Message_HasVote{ + HasVote: &tmcons.HasVote{ + Height: msg.Height, + Round: msg.Round, + Type: msg.Type, + Index: msg.Index, + }, + }, + } + case *VoteSetMaj23Message: + bi := msg.BlockID.ToProto() + pb = tmcons.Message{ + Sum: &tmcons.Message_VoteSetMaj23{ + VoteSetMaj23: &tmcons.VoteSetMaj23{ + Height: msg.Height, + Round: msg.Round, + Type: msg.Type, + BlockID: bi, + }, + }, + } + case *VoteSetBitsMessage: + bi := msg.BlockID.ToProto() + bits := msg.Votes.ToProto() + + vsb := &tmcons.Message_VoteSetBits{ + VoteSetBits: &tmcons.VoteSetBits{ + Height: msg.Height, + Round: msg.Round, + Type: msg.Type, + BlockID: bi, + }, + } + + if bits != nil { + vsb.VoteSetBits.Votes = *bits + } + + pb = tmcons.Message{ + Sum: vsb, + } + + default: + return nil, 
fmt.Errorf("consensus: message not recognized: %T", msg) + } + + return &pb, nil +} + +// MsgFromProto takes a consensus proto message and returns the native go type +func MsgFromProto(msg *tmcons.Message) (Message, error) { + if msg == nil { + return nil, errors.New("consensus: nil message") + } + var pb Message + + switch msg := msg.Sum.(type) { + case *tmcons.Message_NewRoundStep: + rs, err := tmmath.SafeConvertUint8(int64(msg.NewRoundStep.Step)) + // deny message based on possible overflow + if err != nil { + return nil, fmt.Errorf("denying message due to possible overflow: %w", err) + } + pb = &NewRoundStepMessage{ + Height: msg.NewRoundStep.Height, + Round: msg.NewRoundStep.Round, + Step: cstypes.RoundStepType(rs), + SecondsSinceStartTime: msg.NewRoundStep.SecondsSinceStartTime, + LastCommitRound: msg.NewRoundStep.LastCommitRound, + } + case *tmcons.Message_NewValidBlock: + pbPartSetHeader, err := types.PartSetHeaderFromProto(&msg.NewValidBlock.BlockPartSetHeader) + if err != nil { + return nil, fmt.Errorf("parts to proto error: %w", err) + } + + pbBits := new(bits.BitArray) + pbBits.FromProto(msg.NewValidBlock.BlockParts) + + pb = &NewValidBlockMessage{ + Height: msg.NewValidBlock.Height, + Round: msg.NewValidBlock.Round, + BlockPartSetHeader: *pbPartSetHeader, + BlockParts: pbBits, + IsCommit: msg.NewValidBlock.IsCommit, + } + case *tmcons.Message_Proposal: + pbP, err := types.ProposalFromProto(&msg.Proposal.Proposal) + if err != nil { + return nil, fmt.Errorf("proposal msg to proto error: %w", err) + } + + pb = &ProposalMessage{ + Proposal: pbP, + } + case *tmcons.Message_ProposalPol: + pbBits := new(bits.BitArray) + pbBits.FromProto(&msg.ProposalPol.ProposalPol) + pb = &ProposalPOLMessage{ + Height: msg.ProposalPol.Height, + ProposalPOLRound: msg.ProposalPol.ProposalPolRound, + ProposalPOL: pbBits, + } + case *tmcons.Message_BlockPart: + parts, err := types.PartFromProto(&msg.BlockPart.Part) + if err != nil { + return nil, fmt.Errorf("blockpart msg to proto error: %w", err) + } + pb = &BlockPartMessage{ + Height: msg.BlockPart.Height, + Round: msg.BlockPart.Round, + Part: parts, + } + case *tmcons.Message_Vote: + vote, err := types.VoteFromProto(msg.Vote.Vote) + if err != nil { + return nil, fmt.Errorf("vote msg to proto error: %w", err) + } + + pb = &VoteMessage{ + Vote: vote, + } + case *tmcons.Message_HasVote: + pb = &HasVoteMessage{ + Height: msg.HasVote.Height, + Round: msg.HasVote.Round, + Type: msg.HasVote.Type, + Index: msg.HasVote.Index, + } + case *tmcons.Message_VoteSetMaj23: + bi, err := types.BlockIDFromProto(&msg.VoteSetMaj23.BlockID) + if err != nil { + return nil, fmt.Errorf("voteSetMaj23 msg to proto error: %w", err) + } + pb = &VoteSetMaj23Message{ + Height: msg.VoteSetMaj23.Height, + Round: msg.VoteSetMaj23.Round, + Type: msg.VoteSetMaj23.Type, + BlockID: *bi, + } + case *tmcons.Message_VoteSetBits: + bi, err := types.BlockIDFromProto(&msg.VoteSetBits.BlockID) + if err != nil { + return nil, fmt.Errorf("voteSetBits msg to proto error: %w", err) + } + bits := new(bits.BitArray) + bits.FromProto(&msg.VoteSetBits.Votes) + + pb = &VoteSetBitsMessage{ + Height: msg.VoteSetBits.Height, + Round: msg.VoteSetBits.Round, + Type: msg.VoteSetBits.Type, + BlockID: *bi, + Votes: bits, + } + default: + return nil, fmt.Errorf("consensus: message not recognized: %T", msg) + } + + if err := pb.ValidateBasic(); err != nil { + return nil, err + } + + return pb, nil +} + +// MustEncode takes the reactors msg, makes it proto and marshals it +// this mimics `MustMarshalBinaryBare` in 
that is panics on error +func MustEncode(msg Message) []byte { + pb, err := MsgToProto(msg) + if err != nil { + panic(err) + } + enc, err := proto.Marshal(pb) + if err != nil { + panic(err) + } + return enc +} + +// WALToProto takes a WAL message and return a proto walMessage and error +func WALToProto(msg WALMessage) (*tmcons.WALMessage, error) { + var pb tmcons.WALMessage + + switch msg := msg.(type) { + case types.EventDataRoundState: + pb = tmcons.WALMessage{ + Sum: &tmcons.WALMessage_EventDataRoundState{ + EventDataRoundState: &tmproto.EventDataRoundState{ + Height: msg.Height, + Round: msg.Round, + Step: msg.Step, + }, + }, + } + case msgInfo: + consMsg, err := MsgToProto(msg.Msg) + if err != nil { + return nil, err + } + pb = tmcons.WALMessage{ + Sum: &tmcons.WALMessage_MsgInfo{ + MsgInfo: &tmcons.MsgInfo{ + Msg: *consMsg, + PeerID: string(msg.PeerID), + }, + }, + } + case timeoutInfo: + pb = tmcons.WALMessage{ + Sum: &tmcons.WALMessage_TimeoutInfo{ + TimeoutInfo: &tmcons.TimeoutInfo{ + Duration: msg.Duration, + Height: msg.Height, + Round: msg.Round, + Step: uint32(msg.Step), + }, + }, + } + case EndHeightMessage: + pb = tmcons.WALMessage{ + Sum: &tmcons.WALMessage_EndHeight{ + EndHeight: &tmcons.EndHeight{ + Height: msg.Height, + }, + }, + } + default: + return nil, fmt.Errorf("to proto: wal message not recognized: %T", msg) + } + + return &pb, nil +} + +// WALFromProto takes a proto wal message and return a consensus walMessage and error +func WALFromProto(msg *tmcons.WALMessage) (WALMessage, error) { + if msg == nil { + return nil, errors.New("nil WAL message") + } + var pb WALMessage + + switch msg := msg.Sum.(type) { + case *tmcons.WALMessage_EventDataRoundState: + pb = types.EventDataRoundState{ + Height: msg.EventDataRoundState.Height, + Round: msg.EventDataRoundState.Round, + Step: msg.EventDataRoundState.Step, + } + case *tmcons.WALMessage_MsgInfo: + walMsg, err := MsgFromProto(&msg.MsgInfo.Msg) + if err != nil { + return nil, fmt.Errorf("msgInfo from proto error: %w", err) + } + pb = msgInfo{ + Msg: walMsg, + PeerID: p2p.ID(msg.MsgInfo.PeerID), + } + + case *tmcons.WALMessage_TimeoutInfo: + tis, err := tmmath.SafeConvertUint8(int64(msg.TimeoutInfo.Step)) + // deny message based on possible overflow + if err != nil { + return nil, fmt.Errorf("denying message due to possible overflow: %w", err) + } + pb = timeoutInfo{ + Duration: msg.TimeoutInfo.Duration, + Height: msg.TimeoutInfo.Height, + Round: msg.TimeoutInfo.Round, + Step: cstypes.RoundStepType(tis), + } + return pb, nil + case *tmcons.WALMessage_EndHeight: + pb := EndHeightMessage{ + Height: msg.EndHeight.Height, + } + return pb, nil + default: + return nil, fmt.Errorf("from proto: wal message not recognized: %T", msg) + } + return pb, nil +} diff --git a/test/maverick/consensus/reactor.go b/test/maverick/consensus/reactor.go new file mode 100644 index 000000000..c82656115 --- /dev/null +++ b/test/maverick/consensus/reactor.go @@ -0,0 +1,1720 @@ +package consensus + +import ( + "errors" + "fmt" + "reflect" + "sync" + "time" + + "github.com/gogo/protobuf/proto" + + cstypes "github.com/tendermint/tendermint/consensus/types" + "github.com/tendermint/tendermint/libs/bits" + tmevents "github.com/tendermint/tendermint/libs/events" + tmjson "github.com/tendermint/tendermint/libs/json" + "github.com/tendermint/tendermint/libs/log" + tmsync "github.com/tendermint/tendermint/libs/sync" + "github.com/tendermint/tendermint/p2p" + tmcons "github.com/tendermint/tendermint/proto/tendermint/consensus" + tmproto 
"github.com/tendermint/tendermint/proto/tendermint/types" + sm "github.com/tendermint/tendermint/state" + "github.com/tendermint/tendermint/types" + tmtime "github.com/tendermint/tendermint/types/time" +) + +const ( + StateChannel = byte(0x20) + DataChannel = byte(0x21) + VoteChannel = byte(0x22) + VoteSetBitsChannel = byte(0x23) + + maxMsgSize = 1048576 // 1MB; NOTE/TODO: keep in sync with types.PartSet sizes. + + blocksToContributeToBecomeGoodPeer = 10000 + votesToContributeToBecomeGoodPeer = 10000 +) + +//----------------------------------------------------------------------------- + +// Reactor defines a reactor for the consensus service. +type Reactor struct { + p2p.BaseReactor // BaseService + p2p.Switch + + conS *State + + mtx tmsync.RWMutex + waitSync bool + eventBus *types.EventBus + + Metrics *Metrics +} + +type ReactorOption func(*Reactor) + +// NewReactor returns a new Reactor with the given +// consensusState. +func NewReactor(consensusState *State, waitSync bool, options ...ReactorOption) *Reactor { + conR := &Reactor{ + conS: consensusState, + waitSync: waitSync, + Metrics: NopMetrics(), + } + conR.BaseReactor = *p2p.NewBaseReactor("Consensus", conR) + + for _, option := range options { + option(conR) + } + + return conR +} + +// OnStart implements BaseService by subscribing to events, which later will be +// broadcasted to other peers and starting state if we're not in fast sync. +func (conR *Reactor) OnStart() error { + conR.Logger.Info("Reactor ", "waitSync", conR.WaitSync()) + + // start routine that computes peer statistics for evaluating peer quality + go conR.peerStatsRoutine() + + conR.subscribeToBroadcastEvents() + + if !conR.WaitSync() { + conR.conS.SetSwitch(conR.Switch) + err := conR.conS.Start() + if err != nil { + return err + } + } + + return nil +} + +// OnStop implements BaseService by unsubscribing from events and stopping +// state. +func (conR *Reactor) OnStop() { + conR.unsubscribeFromBroadcastEvents() + if err := conR.conS.Stop(); err != nil { + conR.Logger.Error("Error stopping consensus state", "err", err) + } + if !conR.WaitSync() { + conR.conS.Wait() + } +} + +// SwitchToConsensus switches from fast_sync mode to consensus mode. +// It resets the state, turns off fast_sync, and starts the consensus state-machine +func (conR *Reactor) SwitchToConsensus(state sm.State, skipWAL bool) { + conR.Logger.Info("SwitchToConsensus") + + // We have no votes, so reconstruct LastCommit from SeenCommit. + if state.LastBlockHeight > 0 { + conR.conS.reconstructLastCommit(state) + } + + // NOTE: The line below causes broadcastNewRoundStepRoutine() to broadcast a + // NewRoundStepMessage. 
+ conR.conS.updateToState(state) + + conR.mtx.Lock() + conR.waitSync = false + conR.mtx.Unlock() + conR.Metrics.FastSyncing.Set(0) + conR.Metrics.StateSyncing.Set(0) + + if skipWAL { + conR.conS.doWALCatchup = false + } + conR.conS.SetSwitch(conR.Switch) + err := conR.conS.Start() + if err != nil { + panic(fmt.Sprintf(`Failed to start consensus state: %v + +conS: +%+v + +conR: +%+v`, err, conR.conS, conR)) + } +} + +// GetChannels implements Reactor +func (conR *Reactor) GetChannels() []*p2p.ChannelDescriptor { + // TODO optimize + return []*p2p.ChannelDescriptor{ + { + ID: StateChannel, + Priority: 5, + SendQueueCapacity: 100, + RecvMessageCapacity: maxMsgSize, + }, + { + ID: DataChannel, // maybe split between gossiping current block and catchup stuff + // once we gossip the whole block there's nothing left to send until next height or round + Priority: 10, + SendQueueCapacity: 100, + RecvBufferCapacity: 50 * 4096, + RecvMessageCapacity: maxMsgSize, + }, + { + ID: VoteChannel, + Priority: 5, + SendQueueCapacity: 100, + RecvBufferCapacity: 100 * 100, + RecvMessageCapacity: maxMsgSize, + }, + { + ID: VoteSetBitsChannel, + Priority: 1, + SendQueueCapacity: 2, + RecvBufferCapacity: 1024, + RecvMessageCapacity: maxMsgSize, + }, + } +} + +// InitPeer implements Reactor by creating a state for the peer. +func (conR *Reactor) InitPeer(peer p2p.Peer) p2p.Peer { + peerState := NewPeerState(peer).SetLogger(conR.Logger) + peer.Set(types.PeerStateKey, peerState) + return peer +} + +// AddPeer implements Reactor by spawning multiple gossiping goroutines for the +// peer. +func (conR *Reactor) AddPeer(peer p2p.Peer) { + if !conR.IsRunning() { + return + } + + peerState, ok := peer.Get(types.PeerStateKey).(*PeerState) + if !ok { + panic(fmt.Sprintf("peer %v has no state", peer)) + } + // Begin routines for this peer. + go conR.gossipDataRoutine(peer, peerState) + go conR.gossipVotesRoutine(peer, peerState) + go conR.queryMaj23Routine(peer, peerState) + + // Send our state to peer. + // If we're fast_syncing, broadcast a RoundStepMessage later upon SwitchToConsensus(). + if !conR.WaitSync() { + conR.sendNewRoundStepMessage(peer) + } +} + +// RemovePeer is a noop. +func (conR *Reactor) RemovePeer(peer p2p.Peer, reason interface{}) { + if !conR.IsRunning() { + return + } + // TODO + // ps, ok := peer.Get(PeerStateKey).(*PeerState) + // if !ok { + // panic(fmt.Sprintf("Peer %v has no state", peer)) + // } + // ps.Disconnect() +} + +// Receive implements Reactor +// NOTE: We process these messages even when we're fast_syncing. +// Messages affect either a peer state or the consensus state. 
+// Peer state updates can happen in parallel, but processing of +// proposals, block parts, and votes are ordered by the receiveRoutine +// NOTE: blocks on consensus state for proposals, block parts, and votes +func (conR *Reactor) Receive(chID byte, src p2p.Peer, msgBytes []byte) { + if !conR.IsRunning() { + conR.Logger.Debug("Receive", "src", src, "chId", chID, "bytes", msgBytes) + return + } + + msg, err := decodeMsg(msgBytes) + if err != nil { + conR.Logger.Error("Error decoding message", "src", src, "chId", chID, "msg", msg, "err", err, "bytes", msgBytes) + conR.Switch.StopPeerForError(src, err) + return + } + + if err = msg.ValidateBasic(); err != nil { + conR.Logger.Error("Peer sent us invalid msg", "peer", src, "msg", msg, "err", err) + conR.Switch.StopPeerForError(src, err) + return + } + + conR.Logger.Debug("Receive", "src", src, "chId", chID, "msg", msg) + + // Get peer states + ps, ok := src.Get(types.PeerStateKey).(*PeerState) + if !ok { + panic(fmt.Sprintf("Peer %v has no state", src)) + } + + switch chID { + case StateChannel: + switch msg := msg.(type) { + case *NewRoundStepMessage: + conR.conS.mtx.Lock() + initialHeight := conR.conS.state.InitialHeight + conR.conS.mtx.Unlock() + if err = msg.ValidateHeight(initialHeight); err != nil { + conR.Logger.Error("Peer sent us invalid msg", "peer", src, "msg", msg, "err", err) + conR.Switch.StopPeerForError(src, err) + return + } + ps.ApplyNewRoundStepMessage(msg) + case *NewValidBlockMessage: + ps.ApplyNewValidBlockMessage(msg) + case *HasVoteMessage: + ps.ApplyHasVoteMessage(msg) + case *VoteSetMaj23Message: + cs := conR.conS + cs.mtx.Lock() + height, votes := cs.Height, cs.Votes + cs.mtx.Unlock() + if height != msg.Height { + return + } + // Peer claims to have a maj23 for some BlockID at H,R,S, + err := votes.SetPeerMaj23(msg.Round, msg.Type, ps.peer.ID(), msg.BlockID) + if err != nil { + conR.Switch.StopPeerForError(src, err) + return + } + // Respond with a VoteSetBitsMessage showing which votes we have. + // (and consequently shows which we don't have) + var ourVotes *bits.BitArray + switch msg.Type { + case tmproto.PrevoteType: + ourVotes = votes.Prevotes(msg.Round).BitArrayByBlockID(msg.BlockID) + case tmproto.PrecommitType: + ourVotes = votes.Precommits(msg.Round).BitArrayByBlockID(msg.BlockID) + default: + panic("Bad VoteSetBitsMessage field Type. 
Forgot to add a check in ValidateBasic?") + } + src.TrySend(VoteSetBitsChannel, MustEncode(&VoteSetBitsMessage{ + Height: msg.Height, + Round: msg.Round, + Type: msg.Type, + BlockID: msg.BlockID, + Votes: ourVotes, + })) + default: + conR.Logger.Error(fmt.Sprintf("Unknown message type %v", reflect.TypeOf(msg))) + } + + case DataChannel: + if conR.WaitSync() { + conR.Logger.Info("Ignoring message received during sync", "msg", msg) + return + } + switch msg := msg.(type) { + case *ProposalMessage: + ps.SetHasProposal(msg.Proposal) + conR.conS.peerMsgQueue <- msgInfo{msg, src.ID()} + case *ProposalPOLMessage: + ps.ApplyProposalPOLMessage(msg) + case *BlockPartMessage: + ps.SetHasProposalBlockPart(msg.Height, msg.Round, int(msg.Part.Index)) + conR.Metrics.BlockParts.With("peer_id", string(src.ID())).Add(1) + conR.conS.peerMsgQueue <- msgInfo{msg, src.ID()} + default: + conR.Logger.Error(fmt.Sprintf("Unknown message type %v", reflect.TypeOf(msg))) + } + + case VoteChannel: + if conR.WaitSync() { + conR.Logger.Info("Ignoring message received during sync", "msg", msg) + return + } + switch msg := msg.(type) { + case *VoteMessage: + cs := conR.conS + cs.mtx.RLock() + height, valSize, lastCommitSize := cs.Height, cs.Validators.Size(), cs.LastCommit.Size() + cs.mtx.RUnlock() + ps.EnsureVoteBitArrays(height, valSize) + ps.EnsureVoteBitArrays(height-1, lastCommitSize) + ps.SetHasVote(msg.Vote) + + cs.peerMsgQueue <- msgInfo{msg, src.ID()} + + default: + // don't punish (leave room for soft upgrades) + conR.Logger.Error(fmt.Sprintf("Unknown message type %v", reflect.TypeOf(msg))) + } + + case VoteSetBitsChannel: + if conR.WaitSync() { + conR.Logger.Info("Ignoring message received during sync", "msg", msg) + return + } + switch msg := msg.(type) { + case *VoteSetBitsMessage: + cs := conR.conS + cs.mtx.Lock() + height, votes := cs.Height, cs.Votes + cs.mtx.Unlock() + + if height == msg.Height { + var ourVotes *bits.BitArray + switch msg.Type { + case tmproto.PrevoteType: + ourVotes = votes.Prevotes(msg.Round).BitArrayByBlockID(msg.BlockID) + case tmproto.PrecommitType: + ourVotes = votes.Precommits(msg.Round).BitArrayByBlockID(msg.BlockID) + default: + panic("Bad VoteSetBitsMessage field Type. Forgot to add a check in ValidateBasic?") + } + ps.ApplyVoteSetBitsMessage(msg, ourVotes) + } else { + ps.ApplyVoteSetBitsMessage(msg, nil) + } + default: + // don't punish (leave room for soft upgrades) + conR.Logger.Error(fmt.Sprintf("Unknown message type %v", reflect.TypeOf(msg))) + } + + default: + conR.Logger.Error(fmt.Sprintf("Unknown chId %X", chID)) + } +} + +// SetEventBus sets event bus. +func (conR *Reactor) SetEventBus(b *types.EventBus) { + conR.eventBus = b + conR.conS.SetEventBus(b) +} + +// WaitSync returns whether the consensus reactor is waiting for state/fast sync. +func (conR *Reactor) WaitSync() bool { + conR.mtx.RLock() + defer conR.mtx.RUnlock() + return conR.waitSync +} + +//-------------------------------------- + +// subscribeToBroadcastEvents subscribes for new round steps and votes +// using internal pubsub defined on state to broadcast +// them to peers upon receiving. 
+func (conR *Reactor) subscribeToBroadcastEvents() { + const subscriber = "consensus-reactor" + if err := conR.conS.evsw.AddListenerForEvent(subscriber, types.EventNewRoundStep, + func(data tmevents.EventData) { + conR.broadcastNewRoundStepMessage(data.(*cstypes.RoundState)) + }); err != nil { + conR.Logger.Error("Error adding listener for events", "err", err) + } + + if err := conR.conS.evsw.AddListenerForEvent(subscriber, types.EventValidBlock, + func(data tmevents.EventData) { + conR.broadcastNewValidBlockMessage(data.(*cstypes.RoundState)) + }); err != nil { + conR.Logger.Error("Error adding listener for events", "err", err) + } + + if err := conR.conS.evsw.AddListenerForEvent(subscriber, types.EventVote, + func(data tmevents.EventData) { + conR.broadcastHasVoteMessage(data.(*types.Vote)) + }); err != nil { + conR.Logger.Error("Error adding listener for events", "err", err) + } + +} + +func (conR *Reactor) unsubscribeFromBroadcastEvents() { + const subscriber = "consensus-reactor" + conR.conS.evsw.RemoveListener(subscriber) +} + +func (conR *Reactor) broadcastNewRoundStepMessage(rs *cstypes.RoundState) { + nrsMsg := makeRoundStepMessage(rs) + conR.Switch.Broadcast(StateChannel, MustEncode(nrsMsg)) +} + +func (conR *Reactor) broadcastNewValidBlockMessage(rs *cstypes.RoundState) { + csMsg := &NewValidBlockMessage{ + Height: rs.Height, + Round: rs.Round, + BlockPartSetHeader: rs.ProposalBlockParts.Header(), + BlockParts: rs.ProposalBlockParts.BitArray(), + IsCommit: rs.Step == cstypes.RoundStepCommit, + } + conR.Switch.Broadcast(StateChannel, MustEncode(csMsg)) +} + +// Broadcasts HasVoteMessage to peers that care. +func (conR *Reactor) broadcastHasVoteMessage(vote *types.Vote) { + msg := &HasVoteMessage{ + Height: vote.Height, + Round: vote.Round, + Type: vote.Type, + Index: vote.ValidatorIndex, + } + conR.Switch.Broadcast(StateChannel, MustEncode(msg)) + /* + // TODO: Make this broadcast more selective. + for _, peer := range conR.Switch.Peers().List() { + ps, ok := peer.Get(PeerStateKey).(*PeerState) + if !ok { + panic(fmt.Sprintf("Peer %v has no state", peer)) + } + prs := ps.GetRoundState() + if prs.Height == vote.Height { + // TODO: Also filter on round? + peer.TrySend(StateChannel, struct{ ConsensusMessage }{msg}) + } else { + // Height doesn't match + // TODO: check a field, maybe CatchupCommitRound? + // TODO: But that requires changing the struct field comment. + } + } + */ +} + +func makeRoundStepMessage(rs *cstypes.RoundState) (nrsMsg *NewRoundStepMessage) { + nrsMsg = &NewRoundStepMessage{ + Height: rs.Height, + Round: rs.Round, + Step: rs.Step, + SecondsSinceStartTime: int64(time.Since(rs.StartTime).Seconds()), + LastCommitRound: rs.LastCommit.GetRound(), + } + return +} + +func (conR *Reactor) sendNewRoundStepMessage(peer p2p.Peer) { + rs := conR.conS.GetRoundState() + nrsMsg := makeRoundStepMessage(rs) + peer.Send(StateChannel, MustEncode(nrsMsg)) +} + +func (conR *Reactor) gossipDataRoutine(peer p2p.Peer, ps *PeerState) { + logger := conR.Logger.With("peer", peer) + +OUTER_LOOP: + for { + // Manage disconnects from self or peer. + if !peer.IsRunning() || !conR.IsRunning() { + logger.Info("Stopping gossipDataRoutine for peer") + return + } + rs := conR.conS.GetRoundState() + prs := ps.GetRoundState() + + // Send proposal Block parts? 
+ if rs.ProposalBlockParts.HasHeader(prs.ProposalBlockPartSetHeader) { + if index, ok := rs.ProposalBlockParts.BitArray().Sub(prs.ProposalBlockParts.Copy()).PickRandom(); ok { + part := rs.ProposalBlockParts.GetPart(index) + msg := &BlockPartMessage{ + Height: rs.Height, // This tells peer that this part applies to us. + Round: rs.Round, // This tells peer that this part applies to us. + Part: part, + } + logger.Debug("Sending block part", "height", prs.Height, "round", prs.Round) + if peer.Send(DataChannel, MustEncode(msg)) { + ps.SetHasProposalBlockPart(prs.Height, prs.Round, index) + } + continue OUTER_LOOP + } + } + + // If the peer is on a previous height that we have, help catch up. + if (0 < prs.Height) && (prs.Height < rs.Height) && (prs.Height >= conR.conS.blockStore.Base()) { + heightLogger := logger.With("height", prs.Height) + + // if we never received the commit message from the peer, the block parts won't be initialized + if prs.ProposalBlockParts == nil { + blockMeta := conR.conS.blockStore.LoadBlockMeta(prs.Height) + if blockMeta == nil { + heightLogger.Error("Failed to load block meta", + "blockstoreBase", conR.conS.blockStore.Base(), "blockstoreHeight", conR.conS.blockStore.Height()) + time.Sleep(conR.conS.config.PeerGossipSleepDuration) + } else { + ps.InitProposalBlockParts(blockMeta.BlockID.PartSetHeader) + } + // continue the loop since prs is a copy and not affected by this initialization + continue OUTER_LOOP + } + conR.gossipDataForCatchup(heightLogger, rs, prs, ps, peer) + continue OUTER_LOOP + } + + // If height and round don't match, sleep. + if (rs.Height != prs.Height) || (rs.Round != prs.Round) { + time.Sleep(conR.conS.config.PeerGossipSleepDuration) + continue OUTER_LOOP + } + + // By here, height and round match. + // Proposal block parts were already matched and sent if any were wanted. + // (These can match on hash so the round doesn't matter) + // Now consider sending other things, like the Proposal itself. + + // Send Proposal && ProposalPOL BitArray? + if rs.Proposal != nil && !prs.Proposal { + // Proposal: share the proposal metadata with peer. + { + msg := &ProposalMessage{Proposal: rs.Proposal} + logger.Debug("Sending proposal", "height", prs.Height, "round", prs.Round) + if peer.Send(DataChannel, MustEncode(msg)) { + // NOTE[ZM]: A peer might have received different proposal msg so this Proposal msg will be rejected! + ps.SetHasProposal(rs.Proposal) + } + } + // ProposalPOL: lets peer know which POL votes we have so far. + // Peer must receive ProposalMessage first. + // rs.Proposal was validated, so rs.Proposal.POLRound <= rs.Round, + // so we definitely have rs.Votes.Prevotes(rs.Proposal.POLRound). + if 0 <= rs.Proposal.POLRound { + msg := &ProposalPOLMessage{ + Height: rs.Height, + ProposalPOLRound: rs.Proposal.POLRound, + ProposalPOL: rs.Votes.Prevotes(rs.Proposal.POLRound).BitArray(), + } + logger.Debug("Sending POL", "height", prs.Height, "round", prs.Round) + peer.Send(DataChannel, MustEncode(msg)) + } + continue OUTER_LOOP + } + + // Nothing to do. Sleep. 
+ time.Sleep(conR.conS.config.PeerGossipSleepDuration) + continue OUTER_LOOP + } +} + +func (conR *Reactor) gossipDataForCatchup(logger log.Logger, rs *cstypes.RoundState, + prs *cstypes.PeerRoundState, ps *PeerState, peer p2p.Peer) { + + if index, ok := prs.ProposalBlockParts.Not().PickRandom(); ok { + // Ensure that the peer's PartSetHeader is correct + blockMeta := conR.conS.blockStore.LoadBlockMeta(prs.Height) + if blockMeta == nil { + logger.Error("Failed to load block meta", "ourHeight", rs.Height, + "blockstoreBase", conR.conS.blockStore.Base(), "blockstoreHeight", conR.conS.blockStore.Height()) + time.Sleep(conR.conS.config.PeerGossipSleepDuration) + return + } else if !blockMeta.BlockID.PartSetHeader.Equals(prs.ProposalBlockPartSetHeader) { + logger.Info("Peer ProposalBlockPartSetHeader mismatch, sleeping", + "blockPartSetHeader", blockMeta.BlockID.PartSetHeader, "peerBlockPartSetHeader", prs.ProposalBlockPartSetHeader) + time.Sleep(conR.conS.config.PeerGossipSleepDuration) + return + } + // Load the part + part := conR.conS.blockStore.LoadBlockPart(prs.Height, index) + if part == nil { + logger.Error("Could not load part", "index", index, + "blockPartSetHeader", blockMeta.BlockID.PartSetHeader, "peerBlockPartSetHeader", prs.ProposalBlockPartSetHeader) + time.Sleep(conR.conS.config.PeerGossipSleepDuration) + return + } + // Send the part + msg := &BlockPartMessage{ + Height: prs.Height, // Not our height, so it doesn't matter. + Round: prs.Round, // Not our height, so it doesn't matter. + Part: part, + } + logger.Debug("Sending block part for catchup", "round", prs.Round, "index", index) + if peer.Send(DataChannel, MustEncode(msg)) { + ps.SetHasProposalBlockPart(prs.Height, prs.Round, index) + } else { + logger.Debug("Sending block part for catchup failed") + } + return + } + time.Sleep(conR.conS.config.PeerGossipSleepDuration) +} + +func (conR *Reactor) gossipVotesRoutine(peer p2p.Peer, ps *PeerState) { + logger := conR.Logger.With("peer", peer) + + // Simple hack to throttle logs upon sleep. + var sleeping = 0 + +OUTER_LOOP: + for { + // Manage disconnects from self or peer. + if !peer.IsRunning() || !conR.IsRunning() { + logger.Info("Stopping gossipVotesRoutine for peer") + return + } + rs := conR.conS.GetRoundState() + prs := ps.GetRoundState() + + switch sleeping { + case 1: // First sleep + sleeping = 2 + case 2: // No more sleep + sleeping = 0 + } + + // If height matches, then send LastCommit, Prevotes, Precommits. + if rs.Height == prs.Height { + heightLogger := logger.With("height", prs.Height) + if conR.gossipVotesForHeight(heightLogger, rs, prs, ps) { + continue OUTER_LOOP + } + } + + // Special catchup logic. + // If peer is lagging by height 1, send LastCommit. + if prs.Height != 0 && rs.Height == prs.Height+1 { + if ps.PickSendVote(rs.LastCommit) { + logger.Debug("Picked rs.LastCommit to send", "height", prs.Height) + continue OUTER_LOOP + } + } + + // Catchup logic + // If peer is lagging by more than 1, send Commit. + if prs.Height != 0 && rs.Height >= prs.Height+2 && prs.Height >= conR.conS.blockStore.Base() { + // Load the block commit for prs.Height, + // which contains precommit signatures for prs.Height. + if commit := conR.conS.blockStore.LoadBlockCommit(prs.Height); commit != nil { + if ps.PickSendVote(commit) { + logger.Debug("Picked Catchup commit to send", "height", prs.Height) + continue OUTER_LOOP + } + } + } + + if sleeping == 0 { + // We sent nothing. Sleep... 
+ sleeping = 1 + logger.Debug("No votes to send, sleeping", "rs.Height", rs.Height, "prs.Height", prs.Height, + "localPV", rs.Votes.Prevotes(rs.Round).BitArray(), "peerPV", prs.Prevotes, + "localPC", rs.Votes.Precommits(rs.Round).BitArray(), "peerPC", prs.Precommits) + } else if sleeping == 2 { + // Continued sleep... + sleeping = 1 + } + + time.Sleep(conR.conS.config.PeerGossipSleepDuration) + continue OUTER_LOOP + } +} + +func (conR *Reactor) gossipVotesForHeight( + logger log.Logger, + rs *cstypes.RoundState, + prs *cstypes.PeerRoundState, + ps *PeerState, +) bool { + + // If there are lastCommits to send... + if prs.Step == cstypes.RoundStepNewHeight { + if ps.PickSendVote(rs.LastCommit) { + logger.Debug("Picked rs.LastCommit to send") + return true + } + } + // If there are POL prevotes to send... + if prs.Step <= cstypes.RoundStepPropose && prs.Round != -1 && prs.Round <= rs.Round && prs.ProposalPOLRound != -1 { + if polPrevotes := rs.Votes.Prevotes(prs.ProposalPOLRound); polPrevotes != nil { + if ps.PickSendVote(polPrevotes) { + logger.Debug("Picked rs.Prevotes(prs.ProposalPOLRound) to send", + "round", prs.ProposalPOLRound) + return true + } + } + } + // If there are prevotes to send... + if prs.Step <= cstypes.RoundStepPrevoteWait && prs.Round != -1 && prs.Round <= rs.Round { + if ps.PickSendVote(rs.Votes.Prevotes(prs.Round)) { + logger.Debug("Picked rs.Prevotes(prs.Round) to send", "round", prs.Round) + return true + } + } + // If there are precommits to send... + if prs.Step <= cstypes.RoundStepPrecommitWait && prs.Round != -1 && prs.Round <= rs.Round { + if ps.PickSendVote(rs.Votes.Precommits(prs.Round)) { + logger.Debug("Picked rs.Precommits(prs.Round) to send", "round", prs.Round) + return true + } + } + // If there are prevotes to send...Needed because of validBlock mechanism + if prs.Round != -1 && prs.Round <= rs.Round { + if ps.PickSendVote(rs.Votes.Prevotes(prs.Round)) { + logger.Debug("Picked rs.Prevotes(prs.Round) to send", "round", prs.Round) + return true + } + } + // If there are POLPrevotes to send... + if prs.ProposalPOLRound != -1 { + if polPrevotes := rs.Votes.Prevotes(prs.ProposalPOLRound); polPrevotes != nil { + if ps.PickSendVote(polPrevotes) { + logger.Debug("Picked rs.Prevotes(prs.ProposalPOLRound) to send", + "round", prs.ProposalPOLRound) + return true + } + } + } + + return false +} + +// NOTE: `queryMaj23Routine` has a simple crude design since it only comes +// into play for liveness when there's a signature DDoS attack happening. +func (conR *Reactor) queryMaj23Routine(peer p2p.Peer, ps *PeerState) { + logger := conR.Logger.With("peer", peer) + +OUTER_LOOP: + for { + // Manage disconnects from self or peer. 
+ if !peer.IsRunning() || !conR.IsRunning() { + logger.Info("Stopping queryMaj23Routine for peer") + return + } + + // Maybe send Height/Round/Prevotes + { + rs := conR.conS.GetRoundState() + prs := ps.GetRoundState() + if rs.Height == prs.Height { + if maj23, ok := rs.Votes.Prevotes(prs.Round).TwoThirdsMajority(); ok { + peer.TrySend(StateChannel, MustEncode(&VoteSetMaj23Message{ + Height: prs.Height, + Round: prs.Round, + Type: tmproto.PrevoteType, + BlockID: maj23, + })) + time.Sleep(conR.conS.config.PeerQueryMaj23SleepDuration) + } + } + } + + // Maybe send Height/Round/Precommits + { + rs := conR.conS.GetRoundState() + prs := ps.GetRoundState() + if rs.Height == prs.Height { + if maj23, ok := rs.Votes.Precommits(prs.Round).TwoThirdsMajority(); ok { + peer.TrySend(StateChannel, MustEncode(&VoteSetMaj23Message{ + Height: prs.Height, + Round: prs.Round, + Type: tmproto.PrecommitType, + BlockID: maj23, + })) + time.Sleep(conR.conS.config.PeerQueryMaj23SleepDuration) + } + } + } + + // Maybe send Height/Round/ProposalPOL + { + rs := conR.conS.GetRoundState() + prs := ps.GetRoundState() + if rs.Height == prs.Height && prs.ProposalPOLRound >= 0 { + if maj23, ok := rs.Votes.Prevotes(prs.ProposalPOLRound).TwoThirdsMajority(); ok { + peer.TrySend(StateChannel, MustEncode(&VoteSetMaj23Message{ + Height: prs.Height, + Round: prs.ProposalPOLRound, + Type: tmproto.PrevoteType, + BlockID: maj23, + })) + time.Sleep(conR.conS.config.PeerQueryMaj23SleepDuration) + } + } + } + + // Little point sending LastCommitRound/LastCommit, + // These are fleeting and non-blocking. + + // Maybe send Height/CatchupCommitRound/CatchupCommit. + { + prs := ps.GetRoundState() + if prs.CatchupCommitRound != -1 && prs.Height > 0 && prs.Height <= conR.conS.blockStore.Height() && + prs.Height >= conR.conS.blockStore.Base() { + if commit := conR.conS.LoadCommit(prs.Height); commit != nil { + peer.TrySend(StateChannel, MustEncode(&VoteSetMaj23Message{ + Height: prs.Height, + Round: commit.Round, + Type: tmproto.PrecommitType, + BlockID: commit.BlockID, + })) + time.Sleep(conR.conS.config.PeerQueryMaj23SleepDuration) + } + } + } + + time.Sleep(conR.conS.config.PeerQueryMaj23SleepDuration) + + continue OUTER_LOOP + } +} + +func (conR *Reactor) peerStatsRoutine() { + for { + if !conR.IsRunning() { + conR.Logger.Info("Stopping peerStatsRoutine") + return + } + + select { + case msg := <-conR.conS.statsMsgQueue: + // Get peer + peer := conR.Switch.Peers().Get(msg.PeerID) + if peer == nil { + conR.Logger.Debug("Attempt to update stats for non-existent peer", + "peer", msg.PeerID) + continue + } + // Get peer state + ps, ok := peer.Get(types.PeerStateKey).(*PeerState) + if !ok { + panic(fmt.Sprintf("Peer %v has no state", peer)) + } + switch msg.Msg.(type) { + case *VoteMessage: + if numVotes := ps.RecordVote(); numVotes%votesToContributeToBecomeGoodPeer == 0 { + conR.Switch.MarkPeerAsGood(peer) + } + case *BlockPartMessage: + if numParts := ps.RecordBlockPart(); numParts%blocksToContributeToBecomeGoodPeer == 0 { + conR.Switch.MarkPeerAsGood(peer) + } + } + case <-conR.conS.Quit(): + return + + case <-conR.Quit(): + return + } + } +} + +// String returns a string representation of the Reactor. +// NOTE: For now, it is just a hard-coded string to avoid accessing unprotected shared variables. +// TODO: improve! 
+func (conR *Reactor) String() string { + // better not to access shared variables + return "ConsensusReactor" // conR.StringIndented("") +} + +// StringIndented returns an indented string representation of the Reactor +func (conR *Reactor) StringIndented(indent string) string { + s := "ConsensusReactor{\n" + s += indent + " " + conR.conS.StringIndented(indent+" ") + "\n" + for _, peer := range conR.Switch.Peers().List() { + ps, ok := peer.Get(types.PeerStateKey).(*PeerState) + if !ok { + panic(fmt.Sprintf("Peer %v has no state", peer)) + } + s += indent + " " + ps.StringIndented(indent+" ") + "\n" + } + s += indent + "}" + return s +} + +// ReactorMetrics sets the metrics +func ReactorMetrics(metrics *Metrics) ReactorOption { + return func(conR *Reactor) { conR.Metrics = metrics } +} + +//----------------------------------------------------------------------------- + +var ( + ErrPeerStateHeightRegression = errors.New("error peer state height regression") + ErrPeerStateInvalidStartTime = errors.New("error peer state invalid startTime") +) + +// PeerState contains the known state of a peer, including its connection and +// threadsafe access to its PeerRoundState. +// NOTE: THIS GETS DUMPED WITH rpc/core/consensus.go. +// Be mindful of what you Expose. +type PeerState struct { + peer p2p.Peer + logger log.Logger + + mtx sync.Mutex // NOTE: Modify below using setters, never directly. + PRS cstypes.PeerRoundState `json:"round_state"` // Exposed. + Stats *peerStateStats `json:"stats"` // Exposed. +} + +// peerStateStats holds internal statistics for a peer. +type peerStateStats struct { + Votes int `json:"votes"` + BlockParts int `json:"block_parts"` +} + +func (pss peerStateStats) String() string { + return fmt.Sprintf("peerStateStats{votes: %d, blockParts: %d}", + pss.Votes, pss.BlockParts) +} + +// NewPeerState returns a new PeerState for the given Peer +func NewPeerState(peer p2p.Peer) *PeerState { + return &PeerState{ + peer: peer, + logger: log.NewNopLogger(), + PRS: cstypes.PeerRoundState{ + Round: -1, + ProposalPOLRound: -1, + LastCommitRound: -1, + CatchupCommitRound: -1, + }, + Stats: &peerStateStats{}, + } +} + +// SetLogger allows to set a logger on the peer state. Returns the peer state +// itself. +func (ps *PeerState) SetLogger(logger log.Logger) *PeerState { + ps.logger = logger + return ps +} + +// GetRoundState returns an shallow copy of the PeerRoundState. +// There's no point in mutating it since it won't change PeerState. +func (ps *PeerState) GetRoundState() *cstypes.PeerRoundState { + ps.mtx.Lock() + defer ps.mtx.Unlock() + + prs := ps.PRS // copy + return &prs +} + +// ToJSON returns a json of PeerState. +func (ps *PeerState) ToJSON() ([]byte, error) { + ps.mtx.Lock() + defer ps.mtx.Unlock() + + return tmjson.Marshal(ps) +} + +// GetHeight returns an atomic snapshot of the PeerRoundState's height +// used by the mempool to ensure peers are caught up before broadcasting new txs +func (ps *PeerState) GetHeight() int64 { + ps.mtx.Lock() + defer ps.mtx.Unlock() + return ps.PRS.Height +} + +// SetHasProposal sets the given proposal as known for the peer. 
+func (ps *PeerState) SetHasProposal(proposal *types.Proposal) { + ps.mtx.Lock() + defer ps.mtx.Unlock() + + if ps.PRS.Height != proposal.Height || ps.PRS.Round != proposal.Round { + return + } + + if ps.PRS.Proposal { + return + } + + ps.PRS.Proposal = true + + // ps.PRS.ProposalBlockParts is set due to NewValidBlockMessage + if ps.PRS.ProposalBlockParts != nil { + return + } + + ps.PRS.ProposalBlockPartSetHeader = proposal.BlockID.PartSetHeader + ps.PRS.ProposalBlockParts = bits.NewBitArray(int(proposal.BlockID.PartSetHeader.Total)) + ps.PRS.ProposalPOLRound = proposal.POLRound + ps.PRS.ProposalPOL = nil // Nil until ProposalPOLMessage received. +} + +// InitProposalBlockParts initializes the peer's proposal block parts header and bit array. +func (ps *PeerState) InitProposalBlockParts(partSetHeader types.PartSetHeader) { + ps.mtx.Lock() + defer ps.mtx.Unlock() + + if ps.PRS.ProposalBlockParts != nil { + return + } + + ps.PRS.ProposalBlockPartSetHeader = partSetHeader + ps.PRS.ProposalBlockParts = bits.NewBitArray(int(partSetHeader.Total)) +} + +// SetHasProposalBlockPart sets the given block part index as known for the peer. +func (ps *PeerState) SetHasProposalBlockPart(height int64, round int32, index int) { + ps.mtx.Lock() + defer ps.mtx.Unlock() + + if ps.PRS.Height != height || ps.PRS.Round != round { + return + } + + ps.PRS.ProposalBlockParts.SetIndex(index, true) +} + +// PickSendVote picks a vote and sends it to the peer. +// Returns true if vote was sent. +func (ps *PeerState) PickSendVote(votes types.VoteSetReader) bool { + if vote, ok := ps.PickVoteToSend(votes); ok { + msg := &VoteMessage{vote} + ps.logger.Debug("Sending vote message", "ps", ps, "vote", vote) + if ps.peer.Send(VoteChannel, MustEncode(msg)) { + ps.SetHasVote(vote) + return true + } + return false + } + return false +} + +// PickVoteToSend picks a vote to send to the peer. +// Returns true if a vote was picked. +// NOTE: `votes` must be the correct Size() for the Height(). +func (ps *PeerState) PickVoteToSend(votes types.VoteSetReader) (vote *types.Vote, ok bool) { + ps.mtx.Lock() + defer ps.mtx.Unlock() + + if votes.Size() == 0 { + return nil, false + } + + height, round, votesType, size := + votes.GetHeight(), votes.GetRound(), tmproto.SignedMsgType(votes.Type()), votes.Size() + + // Lazily set data using 'votes'. 
+ if votes.IsCommit() { + ps.ensureCatchupCommitRound(height, round, size) + } + ps.ensureVoteBitArrays(height, size) + + psVotes := ps.getVoteBitArray(height, round, votesType) + if psVotes == nil { + return nil, false // Not something worth sending + } + if index, ok := votes.BitArray().Sub(psVotes).PickRandom(); ok { + return votes.GetByIndex(int32(index)), true + } + return nil, false +} + +func (ps *PeerState) getVoteBitArray(height int64, round int32, votesType tmproto.SignedMsgType) *bits.BitArray { + if !types.IsVoteTypeValid(votesType) { + return nil + } + + if ps.PRS.Height == height { + if ps.PRS.Round == round { + switch votesType { + case tmproto.PrevoteType: + return ps.PRS.Prevotes + case tmproto.PrecommitType: + return ps.PRS.Precommits + } + } + if ps.PRS.CatchupCommitRound == round { + switch votesType { + case tmproto.PrevoteType: + return nil + case tmproto.PrecommitType: + return ps.PRS.CatchupCommit + } + } + if ps.PRS.ProposalPOLRound == round { + switch votesType { + case tmproto.PrevoteType: + return ps.PRS.ProposalPOL + case tmproto.PrecommitType: + return nil + } + } + return nil + } + if ps.PRS.Height == height+1 { + if ps.PRS.LastCommitRound == round { + switch votesType { + case tmproto.PrevoteType: + return nil + case tmproto.PrecommitType: + return ps.PRS.LastCommit + } + } + return nil + } + return nil +} + +// 'round': A round for which we have a +2/3 commit. +func (ps *PeerState) ensureCatchupCommitRound(height int64, round int32, numValidators int) { + if ps.PRS.Height != height { + return + } + /* + NOTE: This is wrong, 'round' could change. + e.g. if orig round is not the same as block LastCommit round. + if ps.CatchupCommitRound != -1 && ps.CatchupCommitRound != round { + panic(fmt.Sprintf( + "Conflicting CatchupCommitRound. Height: %v, + Orig: %v, + New: %v", + height, + ps.CatchupCommitRound, + round)) + } + */ + if ps.PRS.CatchupCommitRound == round { + return // Nothing to do! + } + ps.PRS.CatchupCommitRound = round + if round == ps.PRS.Round { + ps.PRS.CatchupCommit = ps.PRS.Precommits + } else { + ps.PRS.CatchupCommit = bits.NewBitArray(numValidators) + } +} + +// EnsureVoteBitArrays ensures the bit-arrays have been allocated for tracking +// what votes this peer has received. +// NOTE: It's important to make sure that numValidators actually matches +// what the node sees as the number of validators for height. +func (ps *PeerState) EnsureVoteBitArrays(height int64, numValidators int) { + ps.mtx.Lock() + defer ps.mtx.Unlock() + ps.ensureVoteBitArrays(height, numValidators) +} + +func (ps *PeerState) ensureVoteBitArrays(height int64, numValidators int) { + if ps.PRS.Height == height { + if ps.PRS.Prevotes == nil { + ps.PRS.Prevotes = bits.NewBitArray(numValidators) + } + if ps.PRS.Precommits == nil { + ps.PRS.Precommits = bits.NewBitArray(numValidators) + } + if ps.PRS.CatchupCommit == nil { + ps.PRS.CatchupCommit = bits.NewBitArray(numValidators) + } + if ps.PRS.ProposalPOL == nil { + ps.PRS.ProposalPOL = bits.NewBitArray(numValidators) + } + } else if ps.PRS.Height == height+1 { + if ps.PRS.LastCommit == nil { + ps.PRS.LastCommit = bits.NewBitArray(numValidators) + } + } +} + +// RecordVote increments internal votes related statistics for this peer. +// It returns the total number of added votes. +func (ps *PeerState) RecordVote() int { + ps.mtx.Lock() + defer ps.mtx.Unlock() + + ps.Stats.Votes++ + + return ps.Stats.Votes +} + +// VotesSent returns the number of blocks for which peer has been sending us +// votes. 
+func (ps *PeerState) VotesSent() int { + ps.mtx.Lock() + defer ps.mtx.Unlock() + + return ps.Stats.Votes +} + +// RecordBlockPart increments internal block part related statistics for this peer. +// It returns the total number of added block parts. +func (ps *PeerState) RecordBlockPart() int { + ps.mtx.Lock() + defer ps.mtx.Unlock() + + ps.Stats.BlockParts++ + return ps.Stats.BlockParts +} + +// BlockPartsSent returns the number of useful block parts the peer has sent us. +func (ps *PeerState) BlockPartsSent() int { + ps.mtx.Lock() + defer ps.mtx.Unlock() + + return ps.Stats.BlockParts +} + +// SetHasVote sets the given vote as known by the peer +func (ps *PeerState) SetHasVote(vote *types.Vote) { + ps.mtx.Lock() + defer ps.mtx.Unlock() + + ps.setHasVote(vote.Height, vote.Round, vote.Type, vote.ValidatorIndex) +} + +func (ps *PeerState) setHasVote(height int64, round int32, voteType tmproto.SignedMsgType, index int32) { + logger := ps.logger.With( + "peerH/R", + fmt.Sprintf("%d/%d", ps.PRS.Height, ps.PRS.Round), + "H/R", + fmt.Sprintf("%d/%d", height, round)) + logger.Debug("setHasVote", "type", voteType, "index", index) + + // NOTE: some may be nil BitArrays -> no side effects. + psVotes := ps.getVoteBitArray(height, round, voteType) + if psVotes != nil { + psVotes.SetIndex(int(index), true) + } +} + +// ApplyNewRoundStepMessage updates the peer state for the new round. +func (ps *PeerState) ApplyNewRoundStepMessage(msg *NewRoundStepMessage) { + ps.mtx.Lock() + defer ps.mtx.Unlock() + + // Ignore duplicates or decreases + if CompareHRS(msg.Height, msg.Round, msg.Step, ps.PRS.Height, ps.PRS.Round, ps.PRS.Step) <= 0 { + return + } + + // Just remember these values. + psHeight := ps.PRS.Height + psRound := ps.PRS.Round + psCatchupCommitRound := ps.PRS.CatchupCommitRound + psCatchupCommit := ps.PRS.CatchupCommit + + startTime := tmtime.Now().Add(-1 * time.Duration(msg.SecondsSinceStartTime) * time.Second) + ps.PRS.Height = msg.Height + ps.PRS.Round = msg.Round + ps.PRS.Step = msg.Step + ps.PRS.StartTime = startTime + if psHeight != msg.Height || psRound != msg.Round { + ps.PRS.Proposal = false + ps.PRS.ProposalBlockPartSetHeader = types.PartSetHeader{} + ps.PRS.ProposalBlockParts = nil + ps.PRS.ProposalPOLRound = -1 + ps.PRS.ProposalPOL = nil + // We'll update the BitArray capacity later. + ps.PRS.Prevotes = nil + ps.PRS.Precommits = nil + } + if psHeight == msg.Height && psRound != msg.Round && msg.Round == psCatchupCommitRound { + // Peer caught up to CatchupCommitRound. + // Preserve psCatchupCommit! + // NOTE: We prefer to use prs.Precommits if + // pr.Round matches pr.CatchupCommitRound. + ps.PRS.Precommits = psCatchupCommit + } + if psHeight != msg.Height { + // Shift Precommits to LastCommit. + if psHeight+1 == msg.Height && psRound == msg.LastCommitRound { + ps.PRS.LastCommitRound = msg.LastCommitRound + ps.PRS.LastCommit = ps.PRS.Precommits + } else { + ps.PRS.LastCommitRound = msg.LastCommitRound + ps.PRS.LastCommit = nil + } + // We'll update the BitArray capacity later. + ps.PRS.CatchupCommitRound = -1 + ps.PRS.CatchupCommit = nil + } +} + +// ApplyNewValidBlockMessage updates the peer state for the new valid block. 
+func (ps *PeerState) ApplyNewValidBlockMessage(msg *NewValidBlockMessage) { + ps.mtx.Lock() + defer ps.mtx.Unlock() + + if ps.PRS.Height != msg.Height { + return + } + + if ps.PRS.Round != msg.Round && !msg.IsCommit { + return + } + + ps.PRS.ProposalBlockPartSetHeader = msg.BlockPartSetHeader + ps.PRS.ProposalBlockParts = msg.BlockParts +} + +// ApplyProposalPOLMessage updates the peer state for the new proposal POL. +func (ps *PeerState) ApplyProposalPOLMessage(msg *ProposalPOLMessage) { + ps.mtx.Lock() + defer ps.mtx.Unlock() + + if ps.PRS.Height != msg.Height { + return + } + if ps.PRS.ProposalPOLRound != msg.ProposalPOLRound { + return + } + + // TODO: Merge onto existing ps.PRS.ProposalPOL? + // We might have sent some prevotes in the meantime. + ps.PRS.ProposalPOL = msg.ProposalPOL +} + +// ApplyHasVoteMessage updates the peer state for the new vote. +func (ps *PeerState) ApplyHasVoteMessage(msg *HasVoteMessage) { + ps.mtx.Lock() + defer ps.mtx.Unlock() + + if ps.PRS.Height != msg.Height { + return + } + + ps.setHasVote(msg.Height, msg.Round, msg.Type, msg.Index) +} + +// ApplyVoteSetBitsMessage updates the peer state for the bit-array of votes +// it claims to have for the corresponding BlockID. +// `ourVotes` is a BitArray of votes we have for msg.BlockID +// NOTE: if ourVotes is nil (e.g. msg.Height < rs.Height), +// we conservatively overwrite ps's votes w/ msg.Votes. +func (ps *PeerState) ApplyVoteSetBitsMessage(msg *VoteSetBitsMessage, ourVotes *bits.BitArray) { + ps.mtx.Lock() + defer ps.mtx.Unlock() + + votes := ps.getVoteBitArray(msg.Height, msg.Round, msg.Type) + if votes != nil { + if ourVotes == nil { + votes.Update(msg.Votes) + } else { + otherVotes := votes.Sub(ourVotes) + hasVotes := otherVotes.Or(msg.Votes) + votes.Update(hasVotes) + } + } +} + +// String returns a string representation of the PeerState +func (ps *PeerState) String() string { + return ps.StringIndented("") +} + +// StringIndented returns a string representation of the PeerState +func (ps *PeerState) StringIndented(indent string) string { + ps.mtx.Lock() + defer ps.mtx.Unlock() + return fmt.Sprintf(`PeerState{ +%s Key %v +%s RoundState %v +%s Stats %v +%s}`, + indent, ps.peer.ID(), + indent, ps.PRS.StringIndented(indent+" "), + indent, ps.Stats, + indent) +} + +//----------------------------------------------------------------------------- +// Messages + +// Message is a message that can be sent and received on the Reactor +type Message interface { + ValidateBasic() error +} + +// func init() { +// tmjson.RegisterType(&NewRoundStepMessage{}, "tendermint/NewRoundStepMessage") +// tmjson.RegisterType(&NewValidBlockMessage{}, "tendermint/NewValidBlockMessage") +// tmjson.RegisterType(&ProposalMessage{}, "tendermint/Proposal") +// tmjson.RegisterType(&ProposalPOLMessage{}, "tendermint/ProposalPOL") +// tmjson.RegisterType(&BlockPartMessage{}, "tendermint/BlockPart") +// tmjson.RegisterType(&VoteMessage{}, "tendermint/Vote") +// tmjson.RegisterType(&HasVoteMessage{}, "tendermint/HasVote") +// tmjson.RegisterType(&VoteSetMaj23Message{}, "tendermint/VoteSetMaj23") +// tmjson.RegisterType(&VoteSetBitsMessage{}, "tendermint/VoteSetBits") +// } + +func decodeMsg(bz []byte) (msg Message, err error) { + pb := &tmcons.Message{} + if err = proto.Unmarshal(bz, pb); err != nil { + return msg, err + } + + return MsgFromProto(pb) +} + +//------------------------------------- + +// NewRoundStepMessage is sent for every step taken in the ConsensusState. 
+// For every height/round/step transition +type NewRoundStepMessage struct { + Height int64 + Round int32 + Step cstypes.RoundStepType + SecondsSinceStartTime int64 + LastCommitRound int32 +} + +// ValidateBasic performs basic validation. +func (m *NewRoundStepMessage) ValidateBasic() error { + if m.Height < 0 { + return errors.New("negative Height") + } + if m.Round < 0 { + return errors.New("negative Round") + } + if !m.Step.IsValid() { + return errors.New("invalid Step") + } + + // NOTE: SecondsSinceStartTime may be negative + + // LastCommitRound will be -1 for the initial height, but we don't know what height this is + // since it can be specified in genesis. The reactor will have to validate this via + // ValidateHeight(). + if m.LastCommitRound < -1 { + return errors.New("invalid LastCommitRound (cannot be < -1)") + } + + return nil +} + +// ValidateHeight validates the height given the chain's initial height. +func (m *NewRoundStepMessage) ValidateHeight(initialHeight int64) error { + if m.Height < initialHeight { + return fmt.Errorf("invalid Height %v (lower than initial height %v)", + m.Height, initialHeight) + } + if m.Height == initialHeight && m.LastCommitRound != -1 { + return fmt.Errorf("invalid LastCommitRound %v (must be -1 for initial height %v)", + m.LastCommitRound, initialHeight) + } + if m.Height > initialHeight && m.LastCommitRound < 0 { + return fmt.Errorf("LastCommitRound can only be negative for initial height %v", // nolint + initialHeight) + } + return nil +} + +// String returns a string representation. +func (m *NewRoundStepMessage) String() string { + return fmt.Sprintf("[NewRoundStep H:%v R:%v S:%v LCR:%v]", + m.Height, m.Round, m.Step, m.LastCommitRound) +} + +//------------------------------------- + +// NewValidBlockMessage is sent when a validator observes a valid block B in some round r, +// i.e., there is a Proposal for block B and 2/3+ prevotes for the block B in the round r. +// In case the block is also committed, then IsCommit flag is set to true. +type NewValidBlockMessage struct { + Height int64 + Round int32 + BlockPartSetHeader types.PartSetHeader + BlockParts *bits.BitArray + IsCommit bool +} + +// ValidateBasic performs basic validation. +func (m *NewValidBlockMessage) ValidateBasic() error { + if m.Height < 0 { + return errors.New("negative Height") + } + if m.Round < 0 { + return errors.New("negative Round") + } + if err := m.BlockPartSetHeader.ValidateBasic(); err != nil { + return fmt.Errorf("wrong BlockPartSetHeader: %v", err) + } + if m.BlockParts.Size() == 0 { + return errors.New("empty blockParts") + } + if m.BlockParts.Size() != int(m.BlockPartSetHeader.Total) { + return fmt.Errorf("blockParts bit array size %d not equal to BlockPartSetHeader.Total %d", + m.BlockParts.Size(), + m.BlockPartSetHeader.Total) + } + if m.BlockParts.Size() > int(types.MaxBlockPartsCount) { + return fmt.Errorf("blockParts bit array is too big: %d, max: %d", m.BlockParts.Size(), types.MaxBlockPartsCount) + } + return nil +} + +// String returns a string representation. +func (m *NewValidBlockMessage) String() string { + return fmt.Sprintf("[ValidBlockMessage H:%v R:%v BP:%v BA:%v IsCommit:%v]", + m.Height, m.Round, m.BlockPartSetHeader, m.BlockParts, m.IsCommit) +} + +//------------------------------------- + +// ProposalMessage is sent when a new block is proposed. +type ProposalMessage struct { + Proposal *types.Proposal +} + +// ValidateBasic performs basic validation. 
+func (m *ProposalMessage) ValidateBasic() error { + return m.Proposal.ValidateBasic() +} + +// String returns a string representation. +func (m *ProposalMessage) String() string { + return fmt.Sprintf("[Proposal %v]", m.Proposal) +} + +//------------------------------------- + +// ProposalPOLMessage is sent when a previous proposal is re-proposed. +type ProposalPOLMessage struct { + Height int64 + ProposalPOLRound int32 + ProposalPOL *bits.BitArray +} + +// ValidateBasic performs basic validation. +func (m *ProposalPOLMessage) ValidateBasic() error { + if m.Height < 0 { + return errors.New("negative Height") + } + if m.ProposalPOLRound < 0 { + return errors.New("negative ProposalPOLRound") + } + if m.ProposalPOL.Size() == 0 { + return errors.New("empty ProposalPOL bit array") + } + if m.ProposalPOL.Size() > types.MaxVotesCount { + return fmt.Errorf("proposalPOL bit array is too big: %d, max: %d", m.ProposalPOL.Size(), types.MaxVotesCount) + } + return nil +} + +// String returns a string representation. +func (m *ProposalPOLMessage) String() string { + return fmt.Sprintf("[ProposalPOL H:%v POLR:%v POL:%v]", m.Height, m.ProposalPOLRound, m.ProposalPOL) +} + +//------------------------------------- + +// BlockPartMessage is sent when gossipping a piece of the proposed block. +type BlockPartMessage struct { + Height int64 + Round int32 + Part *types.Part +} + +// ValidateBasic performs basic validation. +func (m *BlockPartMessage) ValidateBasic() error { + if m.Height < 0 { + return errors.New("negative Height") + } + if m.Round < 0 { + return errors.New("negative Round") + } + if err := m.Part.ValidateBasic(); err != nil { + return fmt.Errorf("wrong Part: %v", err) + } + return nil +} + +// String returns a string representation. +func (m *BlockPartMessage) String() string { + return fmt.Sprintf("[BlockPart H:%v R:%v P:%v]", m.Height, m.Round, m.Part) +} + +//------------------------------------- + +// VoteMessage is sent when voting for a proposal (or lack thereof). +type VoteMessage struct { + Vote *types.Vote +} + +// ValidateBasic performs basic validation. +func (m *VoteMessage) ValidateBasic() error { + return m.Vote.ValidateBasic() +} + +// String returns a string representation. +func (m *VoteMessage) String() string { + return fmt.Sprintf("[Vote %v]", m.Vote) +} + +//------------------------------------- + +// HasVoteMessage is sent to indicate that a particular vote has been received. +type HasVoteMessage struct { + Height int64 + Round int32 + Type tmproto.SignedMsgType + Index int32 +} + +// ValidateBasic performs basic validation. +func (m *HasVoteMessage) ValidateBasic() error { + if m.Height < 0 { + return errors.New("negative Height") + } + if m.Round < 0 { + return errors.New("negative Round") + } + if !types.IsVoteTypeValid(m.Type) { + return errors.New("invalid Type") + } + if m.Index < 0 { + return errors.New("negative Index") + } + return nil +} + +// String returns a string representation. +func (m *HasVoteMessage) String() string { + return fmt.Sprintf("[HasVote VI:%v V:{%v/%02d/%v}]", m.Index, m.Height, m.Round, m.Type) +} + +//------------------------------------- + +// VoteSetMaj23Message is sent to indicate that a given BlockID has seen +2/3 votes. +type VoteSetMaj23Message struct { + Height int64 + Round int32 + Type tmproto.SignedMsgType + BlockID types.BlockID +} + +// ValidateBasic performs basic validation. 
+func (m *VoteSetMaj23Message) ValidateBasic() error { + if m.Height < 0 { + return errors.New("negative Height") + } + if m.Round < 0 { + return errors.New("negative Round") + } + if !types.IsVoteTypeValid(m.Type) { + return errors.New("invalid Type") + } + if err := m.BlockID.ValidateBasic(); err != nil { + return fmt.Errorf("wrong BlockID: %v", err) + } + return nil +} + +// String returns a string representation. +func (m *VoteSetMaj23Message) String() string { + return fmt.Sprintf("[VSM23 %v/%02d/%v %v]", m.Height, m.Round, m.Type, m.BlockID) +} + +//------------------------------------- + +// VoteSetBitsMessage is sent to communicate the bit-array of votes seen for the BlockID. +type VoteSetBitsMessage struct { + Height int64 + Round int32 + Type tmproto.SignedMsgType + BlockID types.BlockID + Votes *bits.BitArray +} + +// ValidateBasic performs basic validation. +func (m *VoteSetBitsMessage) ValidateBasic() error { + if m.Height < 0 { + return errors.New("negative Height") + } + if !types.IsVoteTypeValid(m.Type) { + return errors.New("invalid Type") + } + if err := m.BlockID.ValidateBasic(); err != nil { + return fmt.Errorf("wrong BlockID: %v", err) + } + // NOTE: Votes.Size() can be zero if the node does not have any + if m.Votes.Size() > types.MaxVotesCount { + return fmt.Errorf("votes bit array is too big: %d, max: %d", m.Votes.Size(), types.MaxVotesCount) + } + return nil +} + +// String returns a string representation. +func (m *VoteSetBitsMessage) String() string { + return fmt.Sprintf("[VSB %v/%02d/%v %v %v]", m.Height, m.Round, m.Type, m.BlockID, m.Votes) +} + +//------------------------------------- diff --git a/test/maverick/consensus/replay.go b/test/maverick/consensus/replay.go new file mode 100644 index 000000000..bfec9e96d --- /dev/null +++ b/test/maverick/consensus/replay.go @@ -0,0 +1,533 @@ +package consensus + +import ( + "bytes" + "fmt" + "hash/crc32" + "io" + "reflect" + "time" + + abci "github.com/tendermint/tendermint/abci/types" + "github.com/tendermint/tendermint/crypto/merkle" + "github.com/tendermint/tendermint/libs/log" + "github.com/tendermint/tendermint/proxy" + sm "github.com/tendermint/tendermint/state" + "github.com/tendermint/tendermint/types" +) + +var crc32c = crc32.MakeTable(crc32.Castagnoli) + +// Functionality to replay blocks and messages on recovery from a crash. +// There are two general failure scenarios: +// +// 1. failure during consensus +// 2. failure while applying the block +// +// The former is handled by the WAL, the latter by the proxyApp Handshake on +// restart, which ultimately hands off the work to the WAL. + +//----------------------------------------- +// 1. Recover from failure during consensus +// (by replaying messages from the WAL) +//----------------------------------------- + +// Unmarshal and apply a single message to the consensus state as if it were +// received in receiveRoutine. Lines that start with "#" are ignored. +// NOTE: receiveRoutine should not be running. +func (cs *State) readReplayMessage(msg *TimedWALMessage, newStepSub types.Subscription) error { + // Skip meta messages which exist for demarcating boundaries. 
+ if _, ok := msg.Msg.(EndHeightMessage); ok { + return nil + } + + // for logging + switch m := msg.Msg.(type) { + case types.EventDataRoundState: + cs.Logger.Info("Replay: New Step", "height", m.Height, "round", m.Round, "step", m.Step) + // these are playback checks + ticker := time.After(time.Second * 2) + if newStepSub != nil { + select { + case stepMsg := <-newStepSub.Out(): + m2 := stepMsg.Data().(types.EventDataRoundState) + if m.Height != m2.Height || m.Round != m2.Round || m.Step != m2.Step { + return fmt.Errorf("roundState mismatch. Got %v; Expected %v", m2, m) + } + case <-newStepSub.Cancelled(): + return fmt.Errorf("failed to read off newStepSub.Out(). newStepSub was cancelled") + case <-ticker: + return fmt.Errorf("failed to read off newStepSub.Out()") + } + } + case msgInfo: + peerID := m.PeerID + if peerID == "" { + peerID = "local" + } + switch msg := m.Msg.(type) { + case *ProposalMessage: + p := msg.Proposal + cs.Logger.Info("Replay: Proposal", "height", p.Height, "round", p.Round, "header", + p.BlockID.PartSetHeader, "pol", p.POLRound, "peer", peerID) + case *BlockPartMessage: + cs.Logger.Info("Replay: BlockPart", "height", msg.Height, "round", msg.Round, "peer", peerID) + case *VoteMessage: + v := msg.Vote + cs.Logger.Info("Replay: Vote", "height", v.Height, "round", v.Round, "type", v.Type, + "blockID", v.BlockID, "peer", peerID) + } + + cs.handleMsg(m) + case timeoutInfo: + cs.Logger.Info("Replay: Timeout", "height", m.Height, "round", m.Round, "step", m.Step, "dur", m.Duration) + cs.handleTimeout(m, cs.RoundState) + default: + return fmt.Errorf("replay: Unknown TimedWALMessage type: %v", reflect.TypeOf(msg.Msg)) + } + return nil +} + +// Replay only those messages since the last block. `timeoutRoutine` should +// run concurrently to read off tickChan. +func (cs *State) catchupReplay(csHeight int64) error { + + // Set replayMode to true so we don't log signing errors. + cs.replayMode = true + defer func() { cs.replayMode = false }() + + // Ensure that #ENDHEIGHT for this height doesn't exist. + // NOTE: This is just a sanity check. As far as we know things work fine + // without it, and Handshake could reuse State if it weren't for + // this check (since we can crash after writing #ENDHEIGHT). + // + // Ignore data corruption errors since this is a sanity check. + gr, found, err := cs.wal.SearchForEndHeight(csHeight, &WALSearchOptions{IgnoreDataCorruptionErrors: true}) + if err != nil { + return err + } + if gr != nil { + if err := gr.Close(); err != nil { + return err + } + } + if found { + return fmt.Errorf("wal should not contain #ENDHEIGHT %d", csHeight) + } + + // Search for last height marker. + // + // Ignore data corruption errors in previous heights because we only care about last height + if csHeight < cs.state.InitialHeight { + return fmt.Errorf("cannot replay height %v, below initial height %v", csHeight, cs.state.InitialHeight) + } + endHeight := csHeight - 1 + if csHeight == cs.state.InitialHeight { + endHeight = 0 + } + gr, found, err = cs.wal.SearchForEndHeight(endHeight, &WALSearchOptions{IgnoreDataCorruptionErrors: true}) + if err == io.EOF { + cs.Logger.Error("Replay: wal.group.Search returned EOF", "#ENDHEIGHT", endHeight) + } else if err != nil { + return err + } + if !found { + return fmt.Errorf("cannot replay height %d. 
WAL does not contain #ENDHEIGHT for %d", csHeight, endHeight) + } + defer gr.Close() + + cs.Logger.Info("Catchup by replaying consensus messages", "height", csHeight) + + var msg *TimedWALMessage + dec := WALDecoder{gr} + +LOOP: + for { + msg, err = dec.Decode() + switch { + case err == io.EOF: + break LOOP + case IsDataCorruptionError(err): + cs.Logger.Error("data has been corrupted in last height of consensus WAL", "err", err, "height", csHeight) + return err + case err != nil: + return err + } + + // NOTE: since the priv key is set when the msgs are received + // it will attempt to eg double sign but we can just ignore it + // since the votes will be replayed and we'll get to the next step + if err := cs.readReplayMessage(msg, nil); err != nil { + return err + } + } + cs.Logger.Info("Replay: Done") + return nil +} + +//-------------------------------------------------------------------------------- + +// Parses marker lines of the form: +// #ENDHEIGHT: 12345 +/* +func makeHeightSearchFunc(height int64) auto.SearchFunc { + return func(line string) (int, error) { + line = strings.TrimRight(line, "\n") + parts := strings.Split(line, " ") + if len(parts) != 2 { + return -1, errors.New("line did not have 2 parts") + } + i, err := strconv.Atoi(parts[1]) + if err != nil { + return -1, errors.New("failed to parse INFO: " + err.Error()) + } + if height < i { + return 1, nil + } else if height == i { + return 0, nil + } else { + return -1, nil + } + } +}*/ + +//--------------------------------------------------- +// 2. Recover from failure while applying the block. +// (by handshaking with the app to figure out where +// we were last, and using the WAL to recover there.) +//--------------------------------------------------- + +type Handshaker struct { + stateStore sm.Store + initialState sm.State + store sm.BlockStore + eventBus types.BlockEventPublisher + genDoc *types.GenesisDoc + logger log.Logger + + nBlocks int // number of blocks applied to the state +} + +func NewHandshaker(stateStore sm.Store, state sm.State, + store sm.BlockStore, genDoc *types.GenesisDoc) *Handshaker { + + return &Handshaker{ + stateStore: stateStore, + initialState: state, + store: store, + eventBus: types.NopEventBus{}, + genDoc: genDoc, + logger: log.NewNopLogger(), + nBlocks: 0, + } +} + +func (h *Handshaker) SetLogger(l log.Logger) { + h.logger = l +} + +// SetEventBus - sets the event bus for publishing block related events. +// If not called, it defaults to types.NopEventBus. +func (h *Handshaker) SetEventBus(eventBus types.BlockEventPublisher) { + h.eventBus = eventBus +} + +// NBlocks returns the number of blocks applied to the state. +func (h *Handshaker) NBlocks() int { + return h.nBlocks +} + +// TODO: retry the handshake/replay if it fails ? +func (h *Handshaker) Handshake(proxyApp proxy.AppConns) error { + + // Handshake is done via ABCI Info on the query conn. + res, err := proxyApp.Query().InfoSync(proxy.RequestInfo) + if err != nil { + return fmt.Errorf("error calling Info: %v", err) + } + + blockHeight := res.LastBlockHeight + if blockHeight < 0 { + return fmt.Errorf("got a negative last block height (%d) from the app", blockHeight) + } + appHash := res.LastBlockAppHash + + h.logger.Info("ABCI Handshake App Info", + "height", blockHeight, + "hash", fmt.Sprintf("%X", appHash), + "software-version", res.Version, + "protocol-version", res.AppVersion, + ) + + // Only set the version if there is no existing state. 
+ if h.initialState.LastBlockHeight == 0 { + h.initialState.Version.Consensus.App = res.AppVersion + } + + // Replay blocks up to the latest in the blockstore. + _, err = h.ReplayBlocks(h.initialState, appHash, blockHeight, proxyApp) + if err != nil { + return fmt.Errorf("error on replay: %v", err) + } + + h.logger.Info("Completed ABCI Handshake - Tendermint and App are synced", + "appHeight", blockHeight, "appHash", fmt.Sprintf("%X", appHash)) + + // TODO: (on restart) replay mempool + + return nil +} + +// ReplayBlocks replays all blocks since appBlockHeight and ensures the result +// matches the current state. +// Returns the final AppHash or an error. +func (h *Handshaker) ReplayBlocks( + state sm.State, + appHash []byte, + appBlockHeight int64, + proxyApp proxy.AppConns, +) ([]byte, error) { + storeBlockBase := h.store.Base() + storeBlockHeight := h.store.Height() + stateBlockHeight := state.LastBlockHeight + h.logger.Info( + "ABCI Replay Blocks", + "appHeight", + appBlockHeight, + "storeHeight", + storeBlockHeight, + "stateHeight", + stateBlockHeight) + + // If appBlockHeight == 0 it means that we are at genesis and hence should send InitChain. + if appBlockHeight == 0 { + validators := make([]*types.Validator, len(h.genDoc.Validators)) + for i, val := range h.genDoc.Validators { + validators[i] = types.NewValidator(val.PubKey, val.Power) + } + validatorSet := types.NewValidatorSet(validators) + nextVals := types.TM2PB.ValidatorUpdates(validatorSet) + csParams := types.TM2PB.ConsensusParams(h.genDoc.ConsensusParams) + req := abci.RequestInitChain{ + Time: h.genDoc.GenesisTime, + ChainId: h.genDoc.ChainID, + InitialHeight: h.genDoc.InitialHeight, + ConsensusParams: csParams, + Validators: nextVals, + AppStateBytes: h.genDoc.AppState, + } + res, err := proxyApp.Consensus().InitChainSync(req) + if err != nil { + return nil, err + } + + appHash = res.AppHash + + if stateBlockHeight == 0 { // we only update state when we are in initial state + // If the app did not return an app hash, we keep the one set from the genesis doc in + // the state. We don't set appHash since we don't want the genesis doc app hash + // recorded in the genesis block. We should probably just remove GenesisDoc.AppHash. + if len(res.AppHash) > 0 { + state.AppHash = res.AppHash + } + // If the app returned validators or consensus params, update the state. + if len(res.Validators) > 0 { + vals, err := types.PB2TM.ValidatorUpdates(res.Validators) + if err != nil { + return nil, err + } + state.Validators = types.NewValidatorSet(vals) + state.NextValidators = types.NewValidatorSet(vals).CopyIncrementProposerPriority(1) + } else if len(h.genDoc.Validators) == 0 { + // If validator set is not set in genesis and still empty after InitChain, exit. + return nil, fmt.Errorf("validator set is nil in genesis and still empty after InitChain") + } + + if res.ConsensusParams != nil { + state.ConsensusParams = types.UpdateConsensusParams(state.ConsensusParams, res.ConsensusParams) + state.Version.Consensus.App = state.ConsensusParams.Version.AppVersion + } + // We update the last results hash with the empty hash, to conform with RFC-6962. + state.LastResultsHash = merkle.HashFromByteSlices(nil) + if err := h.stateStore.Save(state); err != nil { + return nil, err + } + } + } + + // First handle edge cases and constraints on the storeBlockHeight and storeBlockBase. 
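+	// The cases below cover: an empty block store, an app that is too far behind a
+	// truncated (pruned) store, an app that is ahead of the store, and the two
+	// invariants Tendermint maintains itself: state <= store <= state+1.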
+ switch { + case storeBlockHeight == 0: + assertAppHashEqualsOneFromState(appHash, state) + return appHash, nil + + case appBlockHeight == 0 && state.InitialHeight < storeBlockBase: + // the app has no state, and the block store is truncated above the initial height + return appHash, sm.ErrAppBlockHeightTooLow{AppHeight: appBlockHeight, StoreBase: storeBlockBase} + + case appBlockHeight > 0 && appBlockHeight < storeBlockBase-1: + // the app is too far behind truncated store (can be 1 behind since we replay the next) + return appHash, sm.ErrAppBlockHeightTooLow{AppHeight: appBlockHeight, StoreBase: storeBlockBase} + + case storeBlockHeight < appBlockHeight: + // the app should never be ahead of the store (but this is under app's control) + return appHash, sm.ErrAppBlockHeightTooHigh{CoreHeight: storeBlockHeight, AppHeight: appBlockHeight} + + case storeBlockHeight < stateBlockHeight: + // the state should never be ahead of the store (this is under tendermint's control) + panic(fmt.Sprintf("StateBlockHeight (%d) > StoreBlockHeight (%d)", stateBlockHeight, storeBlockHeight)) + + case storeBlockHeight > stateBlockHeight+1: + // store should be at most one ahead of the state (this is under tendermint's control) + panic(fmt.Sprintf("StoreBlockHeight (%d) > StateBlockHeight + 1 (%d)", storeBlockHeight, stateBlockHeight+1)) + } + + var err error + // Now either store is equal to state, or one ahead. + // For each, consider all cases of where the app could be, given app <= store + if storeBlockHeight == stateBlockHeight { + // Tendermint ran Commit and saved the state. + // Either the app is asking for replay, or we're all synced up. + if appBlockHeight < storeBlockHeight { + // the app is behind, so replay blocks, but no need to go through WAL (state is already synced to store) + return h.replayBlocks(state, proxyApp, appBlockHeight, storeBlockHeight, false) + + } else if appBlockHeight == storeBlockHeight { + // We're good! + assertAppHashEqualsOneFromState(appHash, state) + return appHash, nil + } + + } else if storeBlockHeight == stateBlockHeight+1 { + // We saved the block in the store but haven't updated the state, + // so we'll need to replay a block using the WAL. + switch { + case appBlockHeight < stateBlockHeight: + // the app is further behind than it should be, so replay blocks + // but leave the last block to go through the WAL + return h.replayBlocks(state, proxyApp, appBlockHeight, storeBlockHeight, true) + + case appBlockHeight == stateBlockHeight: + // We haven't run Commit (both the state and app are one block behind), + // so replayBlock with the real app. + // NOTE: We could instead use the cs.WAL on cs.Start, + // but we'd have to allow the WAL to replay a block that wrote it's #ENDHEIGHT + h.logger.Info("Replay last block using real app") + state, err = h.replayBlock(state, storeBlockHeight, proxyApp.Consensus()) + return state.AppHash, err + + case appBlockHeight == storeBlockHeight: + // We ran Commit, but didn't save the state, so replayBlock with mock app. + abciResponses, err := h.stateStore.LoadABCIResponses(storeBlockHeight) + if err != nil { + return nil, err + } + mockApp := newMockProxyApp(appHash, abciResponses) + h.logger.Info("Replay last block using mock app") + state, err = h.replayBlock(state, storeBlockHeight, mockApp) + return state.AppHash, err + } + + } + + panic(fmt.Sprintf("uncovered case! 
appHeight: %d, storeHeight: %d, stateHeight: %d", + appBlockHeight, storeBlockHeight, stateBlockHeight)) +} + +func (h *Handshaker) replayBlocks( + state sm.State, + proxyApp proxy.AppConns, + appBlockHeight, + storeBlockHeight int64, + mutateState bool) ([]byte, error) { + // App is further behind than it should be, so we need to replay blocks. + // We replay all blocks from appBlockHeight+1. + // + // Note that we don't have an old version of the state, + // so we by-pass state validation/mutation using sm.ExecCommitBlock. + // This also means we won't be saving validator sets if they change during this period. + // TODO: Load the historical information to fix this and just use state.ApplyBlock + // + // If mutateState == true, the final block is replayed with h.replayBlock() + + var appHash []byte + var err error + finalBlock := storeBlockHeight + if mutateState { + finalBlock-- + } + firstBlock := appBlockHeight + 1 + if firstBlock == 1 { + firstBlock = state.InitialHeight + } + for i := firstBlock; i <= finalBlock; i++ { + h.logger.Info("Applying block", "height", i) + block := h.store.LoadBlock(i) + // Extra check to ensure the app was not changed in a way it shouldn't have. + if len(appHash) > 0 { + assertAppHashEqualsOneFromBlock(appHash, block) + } + + appHash, err = sm.ExecCommitBlock(proxyApp.Consensus(), block, h.logger, h.stateStore, h.genDoc.InitialHeight) + if err != nil { + return nil, err + } + + h.nBlocks++ + } + + if mutateState { + // sync the final block + state, err = h.replayBlock(state, storeBlockHeight, proxyApp.Consensus()) + if err != nil { + return nil, err + } + appHash = state.AppHash + } + + assertAppHashEqualsOneFromState(appHash, state) + return appHash, nil +} + +// ApplyBlock on the proxyApp with the last block. +func (h *Handshaker) replayBlock(state sm.State, height int64, proxyApp proxy.AppConnConsensus) (sm.State, error) { + block := h.store.LoadBlock(height) + meta := h.store.LoadBlockMeta(height) + + // Use stubs for both mempool and evidence pool since no transactions nor + // evidence are needed here - block already exists. + blockExec := sm.NewBlockExecutor(h.stateStore, h.logger, proxyApp, emptyMempool{}, sm.EmptyEvidencePool{}) + blockExec.SetEventBus(h.eventBus) + + var err error + state, _, err = blockExec.ApplyBlock(state, meta.BlockID, block) + if err != nil { + return sm.State{}, err + } + + h.nBlocks++ + + return state, nil +} + +func assertAppHashEqualsOneFromBlock(appHash []byte, block *types.Block) { + if !bytes.Equal(appHash, block.AppHash) { + panic(fmt.Sprintf(`block.AppHash does not match AppHash after replay. Got %X, expected %X. + +Block: %v +`, + appHash, block.AppHash, block)) + } +} + +func assertAppHashEqualsOneFromState(appHash []byte, state sm.State) { + if !bytes.Equal(appHash, state.AppHash) { + panic(fmt.Sprintf(`state.AppHash does not match AppHash after replay. Got +%X, expected %X. 
+ +State: %v + +Did you reset Tendermint without resetting your application's data?`, + appHash, state.AppHash, state)) + } +} diff --git a/test/maverick/consensus/replay_file.go b/test/maverick/consensus/replay_file.go new file mode 100644 index 000000000..0a02031f8 --- /dev/null +++ b/test/maverick/consensus/replay_file.go @@ -0,0 +1,338 @@ +package consensus + +import ( + "bufio" + "context" + "errors" + "fmt" + "io" + "os" + "strconv" + "strings" + + dbm "github.com/tendermint/tm-db" + + cfg "github.com/tendermint/tendermint/config" + "github.com/tendermint/tendermint/libs/log" + tmos "github.com/tendermint/tendermint/libs/os" + "github.com/tendermint/tendermint/proxy" + sm "github.com/tendermint/tendermint/state" + "github.com/tendermint/tendermint/store" + "github.com/tendermint/tendermint/types" +) + +const ( + // event bus subscriber + subscriber = "replay-file" +) + +//-------------------------------------------------------- +// replay messages interactively or all at once + +// replay the wal file +func RunReplayFile(config cfg.BaseConfig, csConfig *cfg.ConsensusConfig, console bool) { + consensusState := newConsensusStateForReplay(config, csConfig) + + if err := consensusState.ReplayFile(csConfig.WalFile(), console); err != nil { + tmos.Exit(fmt.Sprintf("Error during consensus replay: %v", err)) + } +} + +// Replay msgs in file or start the console +func (cs *State) ReplayFile(file string, console bool) error { + + if cs.IsRunning() { + return errors.New("cs is already running, cannot replay") + } + if cs.wal != nil { + return errors.New("cs wal is open, cannot replay") + } + + cs.startForReplay() + + // ensure all new step events are regenerated as expected + + ctx := context.Background() + newStepSub, err := cs.eventBus.Subscribe(ctx, subscriber, types.EventQueryNewRoundStep) + if err != nil { + return fmt.Errorf("failed to subscribe %s to %v", subscriber, types.EventQueryNewRoundStep) + } + defer func() { + if err := cs.eventBus.Unsubscribe(ctx, subscriber, types.EventQueryNewRoundStep); err != nil { + cs.Logger.Error("Error unsubscribing to event bus", "err", err) + } + }() + + // just open the file for reading, no need to use wal + fp, err := os.OpenFile(file, os.O_RDONLY, 0600) + if err != nil { + return err + } + + pb := newPlayback(file, fp, cs, cs.state.Copy()) + defer pb.fp.Close() + + var nextN int // apply N msgs in a row + var msg *TimedWALMessage + for { + if nextN == 0 && console { + nextN = pb.replayConsoleLoop() + } + + msg, err = pb.dec.Decode() + if err == io.EOF { + return nil + } else if err != nil { + return err + } + + if err := pb.cs.readReplayMessage(msg, newStepSub); err != nil { + return err + } + + if nextN > 0 { + nextN-- + } + pb.count++ + } +} + +//------------------------------------------------ +// playback manager + +type playback struct { + cs *State + + fp *os.File + dec *WALDecoder + count int // how many lines/msgs into the file are we + + // replays can be reset to beginning + fileName string // so we can close/reopen the file + genesisState sm.State // so the replay session knows where to restart from +} + +func newPlayback(fileName string, fp *os.File, cs *State, genState sm.State) *playback { + return &playback{ + cs: cs, + fp: fp, + fileName: fileName, + genesisState: genState, + dec: NewWALDecoder(fp), + } +} + +// go back count steps by resetting the state and running (pb.count - count) steps +func (pb *playback) replayReset(count int, newStepSub types.Subscription) error { + if err := pb.cs.Stop(); err != nil { + return err + } + 
pb.cs.Wait()
+
+	newCS := NewState(pb.cs.config, pb.genesisState.Copy(), pb.cs.blockExec,
+		pb.cs.blockStore, pb.cs.txNotifier, pb.cs.evpool, map[int64]Misbehavior{})
+	newCS.SetEventBus(pb.cs.eventBus)
+	newCS.startForReplay()
+
+	if err := pb.fp.Close(); err != nil {
+		return err
+	}
+	fp, err := os.OpenFile(pb.fileName, os.O_RDONLY, 0600)
+	if err != nil {
+		return err
+	}
+	pb.fp = fp
+	pb.dec = NewWALDecoder(fp)
+	count = pb.count - count
+	fmt.Printf("Resetting from %d to %d\n", pb.count, count)
+	pb.count = 0
+	pb.cs = newCS
+	var msg *TimedWALMessage
+	for i := 0; i < count; i++ {
+		msg, err = pb.dec.Decode()
+		if err == io.EOF {
+			return nil
+		} else if err != nil {
+			return err
+		}
+		if err := pb.cs.readReplayMessage(msg, newStepSub); err != nil {
+			return err
+		}
+		pb.count++
+	}
+	return nil
+}
+
+func (cs *State) startForReplay() {
+	cs.Logger.Error("Replay commands are disabled until someone updates them and writes tests")
+	/* TODO:!
+	// since we replay tocks we just ignore ticks
+		go func() {
+			for {
+				select {
+				case <-cs.tickChan:
+				case <-cs.Quit:
+					return
+				}
+			}
+		}()*/
+}
+
+// console function for parsing input and running commands
+func (pb *playback) replayConsoleLoop() int {
+	for {
+		fmt.Printf("> ")
+		bufReader := bufio.NewReader(os.Stdin)
+		line, more, err := bufReader.ReadLine()
+		if more {
+			tmos.Exit("input is too long")
+		} else if err != nil {
+			tmos.Exit(err.Error())
+		}
+
+		tokens := strings.Split(string(line), " ")
+		if len(tokens) == 0 {
+			continue
+		}
+
+		switch tokens[0] {
+		case "next":
+			// "next" -> replay next message
+			// "next N" -> replay next N messages
+
+			if len(tokens) == 1 {
+				return 0
+			}
+			i, err := strconv.Atoi(tokens[1])
+			if err != nil {
+				fmt.Println("next takes an integer argument")
+			} else {
+				return i
+			}
+
+		case "back":
+			// "back" -> go back one message
+			// "back N" -> go back N messages
+
+			// NOTE: "back" is not supported in the state machine design,
+			// so we restart and replay up to
+
+			ctx := context.Background()
+			// ensure all new step events are regenerated as expected
+
+			newStepSub, err := pb.cs.eventBus.Subscribe(ctx, subscriber, types.EventQueryNewRoundStep)
+			if err != nil {
+				tmos.Exit(fmt.Sprintf("failed to subscribe %s to %v", subscriber, types.EventQueryNewRoundStep))
+			}
+			defer func() {
+				if err := pb.cs.eventBus.Unsubscribe(ctx, subscriber, types.EventQueryNewRoundStep); err != nil {
+					pb.cs.Logger.Error("Error unsubscribing from eventBus", "err", err)
+				}
+			}()
+
+			if len(tokens) == 1 {
+				if err := pb.replayReset(1, newStepSub); err != nil {
+					pb.cs.Logger.Error("Replay reset error", "err", err)
+				}
+			} else {
+				i, err := strconv.Atoi(tokens[1])
+				if err != nil {
+					fmt.Println("back takes an integer argument")
+				} else if i > pb.count {
+					fmt.Printf("argument to back must not be larger than the current count (%d)\n", pb.count)
+				} else if err := pb.replayReset(i, newStepSub); err != nil {
+					pb.cs.Logger.Error("Replay reset error", "err", err)
+				}
+			}
+
+		case "rs":
+			// "rs" -> print entire round state
+			// "rs short" -> print height/round/step
+			// "rs <field>" -> print another field of the round state
+
+			rs := pb.cs.RoundState
+			if len(tokens) == 1 {
+				fmt.Println(rs)
+			} else {
+				switch tokens[1] {
+				case "short":
+					fmt.Printf("%v/%v/%v\n", rs.Height, rs.Round, rs.Step)
+				case "validators":
+					fmt.Println(rs.Validators)
+				case "proposal":
+					fmt.Println(rs.Proposal)
+				case "proposal_block":
+					fmt.Printf("%v %v\n", rs.ProposalBlockParts.StringShort(), rs.ProposalBlock.StringShort())
+				case "locked_round":
fmt.Println(rs.LockedRound) + case "locked_block": + fmt.Printf("%v %v\n", rs.LockedBlockParts.StringShort(), rs.LockedBlock.StringShort()) + case "votes": + fmt.Println(rs.Votes.StringIndented(" ")) + + default: + fmt.Println("Unknown option", tokens[1]) + } + } + case "n": + fmt.Println(pb.count) + } + } +} + +//-------------------------------------------------------------------------------- + +// convenience for replay mode +func newConsensusStateForReplay(config cfg.BaseConfig, csConfig *cfg.ConsensusConfig) *State { + dbType := dbm.BackendType(config.DBBackend) + // Get BlockStore + blockStoreDB, err := dbm.NewDB("blockstore", dbType, config.DBDir()) + if err != nil { + tmos.Exit(err.Error()) + } + blockStore := store.NewBlockStore(blockStoreDB) + + // Get State + stateDB, err := dbm.NewDB("state", dbType, config.DBDir()) + if err != nil { + tmos.Exit(err.Error()) + } + stateStore := sm.NewStore(stateDB) + gdoc, err := sm.MakeGenesisDocFromFile(config.GenesisFile()) + if err != nil { + tmos.Exit(err.Error()) + } + state, err := sm.MakeGenesisState(gdoc) + if err != nil { + tmos.Exit(err.Error()) + } + + // Create proxyAppConn connection (consensus, mempool, query) + clientCreator := proxy.DefaultClientCreator(config.ProxyApp, config.ABCI, config.DBDir()) + proxyApp := proxy.NewAppConns(clientCreator) + err = proxyApp.Start() + if err != nil { + tmos.Exit(fmt.Sprintf("Error starting proxy app conns: %v", err)) + } + + eventBus := types.NewEventBus() + if err := eventBus.Start(); err != nil { + tmos.Exit(fmt.Sprintf("Failed to start event bus: %v", err)) + } + + handshaker := NewHandshaker(stateStore, state, blockStore, gdoc) + handshaker.SetEventBus(eventBus) + err = handshaker.Handshake(proxyApp) + if err != nil { + tmos.Exit(fmt.Sprintf("Error on handshake: %v", err)) + } + + mempool, evpool := emptyMempool{}, sm.EmptyEvidencePool{} + blockExec := sm.NewBlockExecutor(stateStore, log.TestingLogger(), proxyApp.Consensus(), mempool, evpool) + + consensusState := NewState(csConfig, state.Copy(), blockExec, + blockStore, mempool, evpool, map[int64]Misbehavior{}) + + consensusState.SetEventBus(eventBus) + return consensusState +} diff --git a/test/maverick/consensus/replay_stubs.go b/test/maverick/consensus/replay_stubs.go new file mode 100644 index 000000000..08974a67e --- /dev/null +++ b/test/maverick/consensus/replay_stubs.go @@ -0,0 +1,90 @@ +package consensus + +import ( + abci "github.com/tendermint/tendermint/abci/types" + "github.com/tendermint/tendermint/libs/clist" + mempl "github.com/tendermint/tendermint/mempool" + tmstate "github.com/tendermint/tendermint/proto/tendermint/state" + "github.com/tendermint/tendermint/proxy" + "github.com/tendermint/tendermint/types" +) + +//----------------------------------------------------------------------------- + +type emptyMempool struct{} + +var _ mempl.Mempool = emptyMempool{} + +func (emptyMempool) Lock() {} +func (emptyMempool) Unlock() {} +func (emptyMempool) Size() int { return 0 } +func (emptyMempool) CheckTx(_ types.Tx, _ func(*abci.Response), _ mempl.TxInfo) error { + return nil +} +func (emptyMempool) ReapMaxBytesMaxGas(_, _ int64) types.Txs { return types.Txs{} } +func (emptyMempool) ReapMaxTxs(n int) types.Txs { return types.Txs{} } +func (emptyMempool) Update( + _ int64, + _ types.Txs, + _ []*abci.ResponseDeliverTx, + _ mempl.PreCheckFunc, + _ mempl.PostCheckFunc, +) error { + return nil +} +func (emptyMempool) Flush() {} +func (emptyMempool) FlushAppConn() error { return nil } +func (emptyMempool) TxsAvailable() <-chan 
struct{} { return make(chan struct{}) } +func (emptyMempool) EnableTxsAvailable() {} +func (emptyMempool) TxsBytes() int64 { return 0 } + +func (emptyMempool) TxsFront() *clist.CElement { return nil } +func (emptyMempool) TxsWaitChan() <-chan struct{} { return nil } + +func (emptyMempool) InitWAL() error { return nil } +func (emptyMempool) CloseWAL() {} + +//----------------------------------------------------------------------------- +// mockProxyApp uses ABCIResponses to give the right results. +// +// Useful because we don't want to call Commit() twice for the same block on +// the real app. + +func newMockProxyApp(appHash []byte, abciResponses *tmstate.ABCIResponses) proxy.AppConnConsensus { + clientCreator := proxy.NewLocalClientCreator(&mockProxyApp{ + appHash: appHash, + abciResponses: abciResponses, + }) + cli, _ := clientCreator.NewABCIClient() + err := cli.Start() + if err != nil { + panic(err) + } + return proxy.NewAppConnConsensus(cli) +} + +type mockProxyApp struct { + abci.BaseApplication + + appHash []byte + txCount int + abciResponses *tmstate.ABCIResponses +} + +func (mock *mockProxyApp) DeliverTx(req abci.RequestDeliverTx) abci.ResponseDeliverTx { + r := mock.abciResponses.DeliverTxs[mock.txCount] + mock.txCount++ + if r == nil { + return abci.ResponseDeliverTx{} + } + return *r +} + +func (mock *mockProxyApp) EndBlock(req abci.RequestEndBlock) abci.ResponseEndBlock { + mock.txCount = 0 + return *mock.abciResponses.EndBlock +} + +func (mock *mockProxyApp) Commit() abci.ResponseCommit { + return abci.ResponseCommit{Data: mock.appHash} +} diff --git a/test/maverick/consensus/state.go b/test/maverick/consensus/state.go new file mode 100644 index 000000000..b12d21edf --- /dev/null +++ b/test/maverick/consensus/state.go @@ -0,0 +1,1976 @@ +package consensus + +import ( + "bytes" + "errors" + "fmt" + "io/ioutil" + "os" + "reflect" + "runtime/debug" + "sync" + "time" + + "github.com/gogo/protobuf/proto" + + cfg "github.com/tendermint/tendermint/config" + cstypes "github.com/tendermint/tendermint/consensus/types" + "github.com/tendermint/tendermint/crypto" + tmevents "github.com/tendermint/tendermint/libs/events" + "github.com/tendermint/tendermint/libs/fail" + tmjson "github.com/tendermint/tendermint/libs/json" + "github.com/tendermint/tendermint/libs/log" + tmmath "github.com/tendermint/tendermint/libs/math" + tmos "github.com/tendermint/tendermint/libs/os" + "github.com/tendermint/tendermint/libs/service" + "github.com/tendermint/tendermint/p2p" + tmproto "github.com/tendermint/tendermint/proto/tendermint/types" + sm "github.com/tendermint/tendermint/state" + "github.com/tendermint/tendermint/types" + tmtime "github.com/tendermint/tendermint/types/time" +) + +// State handles execution of the consensus algorithm. +// It processes votes and proposals, and upon reaching agreement, +// commits blocks to the chain and executes them against the application. +// The internal state machine receives input from peers, the internal validator, and from a timer. +type State struct { + service.BaseService + + // config details + config *cfg.ConsensusConfig + privValidator types.PrivValidator // for signing votes + + // store blocks and commits + blockStore sm.BlockStore + + // create and execute blocks + blockExec *sm.BlockExecutor + + // notify us if txs are available + txNotifier txNotifier + + // add evidence to the pool + // when it's detected + evpool evidencePool + + // internal state + mtx sync.RWMutex + cstypes.RoundState + state sm.State // State until height-1. 
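+	// i.e. the state resulting from the last committed block; RoundState above tracks the height currently being decided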
+
+	// state changes may be triggered by: msgs from peers,
+	// msgs from ourself, or by timeouts
+	peerMsgQueue     chan msgInfo
+	internalMsgQueue chan msgInfo
+	timeoutTicker    TimeoutTicker
+	// privValidator pubkey, memoized for the duration of one block
+	// to avoid extra requests to HSM
+	privValidatorPubKey crypto.PubKey
+
+	// information about added votes and block parts is written on this channel
+	// so statistics can be computed by the reactor
+	statsMsgQueue chan msgInfo
+
+	// we use eventBus to trigger msg broadcasts in the reactor,
+	// and to notify external subscribers, eg. through a websocket
+	eventBus *types.EventBus
+
+	// a Write-Ahead Log ensures we can recover from any kind of crash
+	// and helps us avoid signing conflicting votes
+	wal          WAL
+	replayMode   bool // so we don't log signing errors during replay
+	doWALCatchup bool // determines if we even try to do the catchup
+
+	// for tests where we want to limit the number of transitions the state makes
+	nSteps int
+
+	// some functions can be overwritten for testing
+	decideProposal func(height int64, round int32)
+
+	// closed when we finish shutting down
+	done chan struct{}
+
+	// synchronous pubsub between consensus state and reactor.
+	// state only emits EventNewRoundStep and EventVote
+	evsw tmevents.EventSwitch
+
+	// for reporting metrics
+	metrics *Metrics
+
+	// misbehaviors mapped for each height (can't have more than one misbehavior per height)
+	misbehaviors map[int64]Misbehavior
+
+	// the switch is passed to the state so that maverick misbehaviors can directly control which
+	// information they send to which nodes
+	sw *p2p.Switch
+}
+
+// StateOption sets an optional parameter on the State.
+type StateOption func(*State)
+
+// NewState returns a new State.
+func NewState(
+	config *cfg.ConsensusConfig,
+	state sm.State,
+	blockExec *sm.BlockExecutor,
+	blockStore sm.BlockStore,
+	txNotifier txNotifier,
+	evpool evidencePool,
+	misbehaviors map[int64]Misbehavior,
+	options ...StateOption,
+) *State {
+	cs := &State{
+		config:           config,
+		blockExec:        blockExec,
+		blockStore:       blockStore,
+		txNotifier:       txNotifier,
+		peerMsgQueue:     make(chan msgInfo, msgQueueSize),
+		internalMsgQueue: make(chan msgInfo, msgQueueSize),
+		timeoutTicker:    NewTimeoutTicker(),
+		statsMsgQueue:    make(chan msgInfo, msgQueueSize),
+		done:             make(chan struct{}),
+		doWALCatchup:     true,
+		wal:              nilWAL{},
+		evpool:           evpool,
+		evsw:             tmevents.NewEventSwitch(),
+		metrics:          NopMetrics(),
+		misbehaviors:     misbehaviors,
+	}
+	// set function defaults (may be overwritten before calling Start)
+	cs.decideProposal = cs.defaultDecideProposal
+
+	// We have no votes, so reconstruct LastCommit from SeenCommit.
+	if state.LastBlockHeight > 0 {
+		cs.reconstructLastCommit(state)
+	}
+
+	cs.updateToState(state)
+
+	// Don't call scheduleRound0 yet.
+	// We do that upon Start().
+
+	cs.BaseService = *service.NewBaseService(nil, "State", cs)
+	for _, option := range options {
+		option(cs)
+	}
+	return cs
+}
+
+// I know this is not great but the maverick consensus state needs access to the peers
+func (cs *State) SetSwitch(sw *p2p.Switch) {
+	cs.sw = sw
+}
+
+// state transitions on complete-proposal, 2/3-any, 2/3-one
+func (cs *State) handleMsg(mi msgInfo) {
+	cs.mtx.Lock()
+	defer cs.mtx.Unlock()
+
+	var (
+		added bool
+		err   error
+	)
+	msg, peerID := mi.Msg, mi.PeerID
+	switch msg := msg.(type) {
+	case *ProposalMessage:
+		// will not cause transition.
+ // once proposal is set, we can receive block parts + // err = cs.setProposal(msg.Proposal) + if b, ok := cs.misbehaviors[cs.Height]; ok { + err = b.ReceiveProposal(cs, msg.Proposal) + } else { + err = defaultReceiveProposal(cs, msg.Proposal) + } + case *BlockPartMessage: + // if the proposal is complete, we'll enterPrevote or tryFinalizeCommit + added, err = cs.addProposalBlockPart(msg, peerID) + if added { + cs.statsMsgQueue <- mi + } + + if err != nil && msg.Round != cs.Round { + cs.Logger.Debug( + "Received block part from wrong round", + "height", + cs.Height, + "csRound", + cs.Round, + "blockRound", + msg.Round) + err = nil + } + case *VoteMessage: + // attempt to add the vote and dupeout the validator if its a duplicate signature + // if the vote gives us a 2/3-any or 2/3-one, we transition + added, err = cs.tryAddVote(msg.Vote, peerID) + if added { + cs.statsMsgQueue <- mi + } + + // if err == ErrAddingVote { + // TODO: punish peer + // We probably don't want to stop the peer here. The vote does not + // necessarily comes from a malicious peer but can be just broadcasted by + // a typical peer. + // https://github.com/tendermint/tendermint/issues/1281 + // } + + // NOTE: the vote is broadcast to peers by the reactor listening + // for vote events + + // TODO: If rs.Height == vote.Height && rs.Round < vote.Round, + // the peer is sending us CatchupCommit precommits. + // We could make note of this and help filter in broadcastHasVoteMessage(). + default: + cs.Logger.Error("Unknown msg type", "type", reflect.TypeOf(msg)) + return + } + + if err != nil { + cs.Logger.Error("Error with msg", "height", cs.Height, "round", cs.Round, + "peer", peerID, "err", err, "msg", msg) + } +} + +// Enter (CreateEmptyBlocks): from enterNewRound(height,round) +// Enter (CreateEmptyBlocks, CreateEmptyBlocksInterval > 0 ): +// after enterNewRound(height,round), after timeout of CreateEmptyBlocksInterval +// Enter (!CreateEmptyBlocks) : after enterNewRound(height,round), once txs are in the mempool +func (cs *State) enterPropose(height int64, round int32) { + logger := cs.Logger.With("height", height, "round", round) + + if cs.Height != height || round < cs.Round || (cs.Round == round && cstypes.RoundStepPropose <= cs.Step) { + logger.Debug(fmt.Sprintf( + "enterPropose(%v/%v): Invalid args. Current step: %v/%v/%v", + height, + round, + cs.Height, + cs.Round, + cs.Step)) + return + } + logger.Info(fmt.Sprintf("enterPropose(%v/%v). Current: %v/%v/%v", height, round, cs.Height, cs.Round, cs.Step)) + + defer func() { + // Done enterPropose: + cs.updateRoundStep(round, cstypes.RoundStepPropose) + cs.newStep() + + // If we have the whole proposal + POL, then goto Prevote now. + // else, we'll enterPrevote when the rest of the proposal is received (in AddProposalBlockPart), + // or else after timeoutPropose + if cs.isProposalComplete() { + cs.enterPrevote(height, cs.Round) + } + }() + + if b, ok := cs.misbehaviors[cs.Height]; ok { + b.EnterPropose(cs, height, round) + } else { + defaultEnterPropose(cs, height, round) + } +} + +// Enter: `timeoutPropose` after entering Propose. +// Enter: proposal block and POL is ready. +// Prevote for LockedBlock if we're locked, or ProposalBlock if valid. +// Otherwise vote nil. +func (cs *State) enterPrevote(height int64, round int32) { + if cs.Height != height || round < cs.Round || (cs.Round == round && cstypes.RoundStepPrevote <= cs.Step) { + cs.Logger.Debug(fmt.Sprintf( + "enterPrevote(%v/%v): Invalid args. 
Current step: %v/%v/%v", + height, + round, + cs.Height, + cs.Round, + cs.Step)) + return + } + + defer func() { + // Done enterPrevote: + cs.updateRoundStep(round, cstypes.RoundStepPrevote) + cs.newStep() + }() + + cs.Logger.Info(fmt.Sprintf("enterPrevote(%v/%v). Current: %v/%v/%v", height, round, cs.Height, cs.Round, cs.Step)) + + // Sign and broadcast vote as necessary + if b, ok := cs.misbehaviors[cs.Height]; ok { + b.EnterPrevote(cs, height, round) + } else { + defaultEnterPrevote(cs, height, round) + } + + // Once `addVote` hits any +2/3 prevotes, we will go to PrevoteWait + // (so we have more time to try and collect +2/3 prevotes for a single block) +} + +// Enter: `timeoutPrevote` after any +2/3 prevotes. +// Enter: `timeoutPrecommit` after any +2/3 precommits. +// Enter: +2/3 precomits for block or nil. +// Lock & precommit the ProposalBlock if we have enough prevotes for it (a POL in this round) +// else, unlock an existing lock and precommit nil if +2/3 of prevotes were nil, +// else, precommit nil otherwise. +func (cs *State) enterPrecommit(height int64, round int32) { + logger := cs.Logger.With("height", height, "round", round) + + if cs.Height != height || round < cs.Round || (cs.Round == round && cstypes.RoundStepPrecommit <= cs.Step) { + logger.Debug(fmt.Sprintf( + "enterPrecommit(%v/%v): Invalid args. Current step: %v/%v/%v", + height, + round, + cs.Height, + cs.Round, + cs.Step)) + return + } + + logger.Info(fmt.Sprintf("enterPrecommit(%v/%v). Current: %v/%v/%v", height, round, cs.Height, cs.Round, cs.Step)) + + defer func() { + // Done enterPrecommit: + cs.updateRoundStep(round, cstypes.RoundStepPrecommit) + cs.newStep() + }() + + if b, ok := cs.misbehaviors[cs.Height]; ok { + b.EnterPrecommit(cs, height, round) + } else { + defaultEnterPrecommit(cs, height, round) + } + +} + +func (cs *State) addVote( + vote *types.Vote, + peerID p2p.ID) (added bool, err error) { + cs.Logger.Debug( + "addVote", + "voteHeight", + vote.Height, + "voteType", + vote.Type, + "valIndex", + vote.ValidatorIndex, + "csHeight", + cs.Height, + ) + + // A precommit for the previous height? + // These come in while we wait timeoutCommit + if vote.Height+1 == cs.Height && vote.Type == tmproto.PrecommitType { + if cs.Step != cstypes.RoundStepNewHeight { + // Late precommit at prior height is ignored + cs.Logger.Debug("Precommit vote came in after commit timeout and has been ignored", "vote", vote) + return + } + added, err = cs.LastCommit.AddVote(vote) + if !added { + return + } + + cs.Logger.Info(fmt.Sprintf("Added to lastPrecommits: %v", cs.LastCommit.StringShort())) + _ = cs.eventBus.PublishEventVote(types.EventDataVote{Vote: vote}) + cs.evsw.FireEvent(types.EventVote, vote) + + // if we can skip timeoutCommit and have all the votes now, + if cs.config.SkipTimeoutCommit && cs.LastCommit.HasAll() { + // go straight to new round (skip timeout commit) + // cs.scheduleTimeout(time.Duration(0), cs.Height, 0, cstypes.RoundStepNewHeight) + cs.enterNewRound(cs.Height, 0) + } + + return + } + + // Height mismatch is ignored. + // Not necessarily a bad peer, but not favourable behaviour. 
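+	// Such votes are simply dropped here; they are not buffered for a future height.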
+ if vote.Height != cs.Height { + cs.Logger.Info("Vote ignored and not added", "voteHeight", vote.Height, "csHeight", cs.Height, "peerID", peerID) + return + } + + added, err = cs.Votes.AddVote(vote, peerID) + if !added { + // Either duplicate, or error upon cs.Votes.AddByIndex() + return + } + + _ = cs.eventBus.PublishEventVote(types.EventDataVote{Vote: vote}) + cs.evsw.FireEvent(types.EventVote, vote) + + switch vote.Type { + case tmproto.PrevoteType: + if b, ok := cs.misbehaviors[cs.Height]; ok { + b.ReceivePrevote(cs, vote) + } else { + defaultReceivePrevote(cs, vote) + } + + case tmproto.PrecommitType: + if b, ok := cs.misbehaviors[cs.Height]; ok { + b.ReceivePrecommit(cs, vote) + } + defaultReceivePrecommit(cs, vote) + + default: + panic(fmt.Sprintf("Unexpected vote type %v", vote.Type)) + } + + return added, err +} + +//----------------------------------------------------------------------------- +// Errors + +var ( + ErrInvalidProposalSignature = errors.New("error invalid proposal signature") + ErrInvalidProposalPOLRound = errors.New("error invalid proposal POL round") + ErrAddingVote = errors.New("error adding vote") + ErrSignatureFoundInPastBlocks = errors.New("found signature from the same key") + + errPubKeyIsNotSet = errors.New("pubkey is not set. Look for \"Can't get private validator pubkey\" errors") +) + +//----------------------------------------------------------------------------- + +var ( + msgQueueSize = 1000 +) + +// msgs from the reactor which may update the state +type msgInfo struct { + Msg Message `json:"msg"` + PeerID p2p.ID `json:"peer_key"` +} + +// internally generated messages which may update the state +type timeoutInfo struct { + Duration time.Duration `json:"duration"` + Height int64 `json:"height"` + Round int32 `json:"round"` + Step cstypes.RoundStepType `json:"step"` +} + +func (ti *timeoutInfo) String() string { + return fmt.Sprintf("%v ; %d/%d %v", ti.Duration, ti.Height, ti.Round, ti.Step) +} + +// interface to the mempool +type txNotifier interface { + TxsAvailable() <-chan struct{} +} + +// interface to the evidence pool +type evidencePool interface { + // Adds consensus based evidence to the evidence pool where time is the time + // of the block where the offense occurred and the validator set is the current one. + AddEvidenceFromConsensus(types.Evidence, time.Time, *types.ValidatorSet) error +} + +//---------------------------------------- +// Public interface + +// SetLogger implements Service. +func (cs *State) SetLogger(l log.Logger) { + cs.BaseService.Logger = l + cs.timeoutTicker.SetLogger(l) +} + +// SetEventBus sets event bus. +func (cs *State) SetEventBus(b *types.EventBus) { + cs.eventBus = b + cs.blockExec.SetEventBus(b) +} + +// StateMetrics sets the metrics. +func StateMetrics(metrics *Metrics) StateOption { + return func(cs *State) { cs.metrics = metrics } +} + +// String returns a string. +func (cs *State) String() string { + // better not to access shared variables + return "ConsensusState" +} + +// GetState returns a copy of the chain state. +func (cs *State) GetState() sm.State { + cs.mtx.RLock() + defer cs.mtx.RUnlock() + return cs.state.Copy() +} + +// GetLastHeight returns the last height committed. +// If there were no blocks, returns 0. +func (cs *State) GetLastHeight() int64 { + cs.mtx.RLock() + defer cs.mtx.RUnlock() + return cs.RoundState.Height - 1 +} + +// GetRoundState returns a shallow copy of the internal consensus state. 
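+// The copy is made while holding the read lock, so callers see a consistent snapshot.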
+func (cs *State) GetRoundState() *cstypes.RoundState { + cs.mtx.RLock() + rs := cs.RoundState // copy + cs.mtx.RUnlock() + return &rs +} + +// GetRoundStateJSON returns a json of RoundState. +func (cs *State) GetRoundStateJSON() ([]byte, error) { + cs.mtx.RLock() + defer cs.mtx.RUnlock() + return tmjson.Marshal(cs.RoundState) +} + +// GetRoundStateSimpleJSON returns a json of RoundStateSimple +func (cs *State) GetRoundStateSimpleJSON() ([]byte, error) { + cs.mtx.RLock() + defer cs.mtx.RUnlock() + return tmjson.Marshal(cs.RoundState.RoundStateSimple()) +} + +// GetValidators returns a copy of the current validators. +func (cs *State) GetValidators() (int64, []*types.Validator) { + cs.mtx.RLock() + defer cs.mtx.RUnlock() + return cs.state.LastBlockHeight, cs.state.Validators.Copy().Validators +} + +// SetPrivValidator sets the private validator account for signing votes. It +// immediately requests pubkey and caches it. +func (cs *State) SetPrivValidator(priv types.PrivValidator) { + cs.mtx.Lock() + defer cs.mtx.Unlock() + + cs.privValidator = priv + + if err := cs.updatePrivValidatorPubKey(); err != nil { + cs.Logger.Error("Can't get private validator pubkey", "err", err) + } +} + +// SetTimeoutTicker sets the local timer. It may be useful to overwrite for testing. +func (cs *State) SetTimeoutTicker(timeoutTicker TimeoutTicker) { + cs.mtx.Lock() + cs.timeoutTicker = timeoutTicker + cs.mtx.Unlock() +} + +// LoadCommit loads the commit for a given height. +func (cs *State) LoadCommit(height int64) *types.Commit { + cs.mtx.RLock() + defer cs.mtx.RUnlock() + if height == cs.blockStore.Height() { + return cs.blockStore.LoadSeenCommit(height) + } + return cs.blockStore.LoadBlockCommit(height) +} + +// OnStart loads the latest state via the WAL, and starts the timeout and +// receive routines. +func (cs *State) OnStart() error { + // We may set the WAL in testing before calling Start, so only OpenWAL if its + // still the nilWAL. + if _, ok := cs.wal.(nilWAL); ok { + if err := cs.loadWalFile(); err != nil { + return err + } + } + + // We may have lost some votes if the process crashed reload from consensus + // log to catchup. + if cs.doWALCatchup { + repairAttempted := false + LOOP: + for { + err := cs.catchupReplay(cs.Height) + switch { + case err == nil: + break LOOP + case !IsDataCorruptionError(err): + cs.Logger.Error("Error on catchup replay. Proceeding to start State anyway", "err", err) + break LOOP + case repairAttempted: + return err + } + + cs.Logger.Info("WAL file is corrupted. Attempting repair", "err", err) + + // 1) prep work + if err := cs.wal.Stop(); err != nil { + return err + } + repairAttempted = true + + // 2) backup original WAL file + corruptedFile := fmt.Sprintf("%s.CORRUPTED", cs.config.WalFile()) + if err := tmos.CopyFile(cs.config.WalFile(), corruptedFile); err != nil { + return err + } + cs.Logger.Info("Backed up WAL file", "src", cs.config.WalFile(), "dst", corruptedFile) + + // 3) try to repair (WAL file will be overwritten!) + if err := repairWalFile(corruptedFile, cs.config.WalFile()); err != nil { + cs.Logger.Error("Repair failed", "err", err) + return err + } + cs.Logger.Info("Successful repair") + + // reload WAL file + if err := cs.loadWalFile(); err != nil { + return err + } + } + } + + if err := cs.evsw.Start(); err != nil { + return err + } + + // we need the timeoutRoutine for replay so + // we don't block on the tick chan. 
+	// NOTE: we will get a build up of garbage go routines
+	// firing on the tockChan until the receiveRoutine is started
+	// to deal with them (by that point, at most one will be valid)
+	if err := cs.timeoutTicker.Start(); err != nil {
+		return err
+	}
+
+	// Double Signing Risk Reduction
+	if err := cs.checkDoubleSigningRisk(cs.Height); err != nil {
+		return err
+	}
+
+	// now start the receiveRoutine
+	go cs.receiveRoutine(0)
+
+	// schedule the first round!
+	// use GetRoundState so we don't race the receiveRoutine for access
+	cs.scheduleRound0(cs.GetRoundState())
+
+	return nil
+}
+
+// loadWalFile loads WAL data from file. It overwrites cs.wal.
+func (cs *State) loadWalFile() error {
+	wal, err := cs.OpenWAL(cs.config.WalFile())
+	if err != nil {
+		cs.Logger.Error("Error loading State wal", "err", err)
+		return err
+	}
+	cs.wal = wal
+	return nil
+}
+
+// OnStop implements service.Service.
+func (cs *State) OnStop() {
+	if err := cs.evsw.Stop(); err != nil {
+		cs.Logger.Error("error trying to stop eventSwitch", "error", err)
+	}
+	if err := cs.timeoutTicker.Stop(); err != nil {
+		cs.Logger.Error("error trying to stop timeoutTicker", "error", err)
+	}
+	// WAL is stopped in receiveRoutine.
+}
+
+// Wait waits for the main routine to return.
+// NOTE: be sure to Stop() the event switch and drain
+// any event channels or this may deadlock
+func (cs *State) Wait() {
+	<-cs.done
+}
+
+// OpenWAL opens a file to log all consensus messages and timeouts for
+// deterministic accountability.
+func (cs *State) OpenWAL(walFile string) (WAL, error) {
+	wal, err := NewWAL(walFile)
+	if err != nil {
+		cs.Logger.Error("Failed to open WAL", "file", walFile, "err", err)
+		return nil, err
+	}
+	wal.SetLogger(cs.Logger.With("wal", walFile))
+	if err := wal.Start(); err != nil {
+		cs.Logger.Error("Failed to start WAL", "err", err)
+		return nil, err
+	}
+	return wal, nil
+}
+
+//------------------------------------------------------------
+// Public interface for passing messages into the consensus state, possibly causing a state transition.
+// If peerID == "", the msg is considered internal.
+// Messages are added to the appropriate queue (peer or internal).
+// If the queue is full, the function may block.
+// TODO: should these return anything or let callers just use events?
+
+// AddVote inputs a vote.
+func (cs *State) AddVote(vote *types.Vote, peerID p2p.ID) (added bool, err error) {
+	if peerID == "" {
+		cs.internalMsgQueue <- msgInfo{&VoteMessage{vote}, ""}
+	} else {
+		cs.peerMsgQueue <- msgInfo{&VoteMessage{vote}, peerID}
+	}
+
+	// TODO: wait for event?!
+	return false, nil
+}
+
+// SetProposal inputs a proposal.
+func (cs *State) SetProposal(proposal *types.Proposal, peerID p2p.ID) error {
+
+	if peerID == "" {
+		cs.internalMsgQueue <- msgInfo{&ProposalMessage{proposal}, ""}
+	} else {
+		cs.peerMsgQueue <- msgInfo{&ProposalMessage{proposal}, peerID}
+	}
+
+	// TODO: wait for event?!
+	return nil
+}
+
+// AddProposalBlockPart inputs a part of the proposal block.
+func (cs *State) AddProposalBlockPart(height int64, round int32, part *types.Part, peerID p2p.ID) error {
+
+	if peerID == "" {
+		cs.internalMsgQueue <- msgInfo{&BlockPartMessage{height, round, part}, ""}
+	} else {
+		cs.peerMsgQueue <- msgInfo{&BlockPartMessage{height, round, part}, peerID}
+	}
+
+	// TODO: wait for event?!
+	return nil
+}
+
+// SetProposalAndBlock inputs the proposal and all block parts.
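+// Each part is routed through the same message queues used for parts received from peers.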
+func (cs *State) SetProposalAndBlock( + proposal *types.Proposal, + block *types.Block, + parts *types.PartSet, + peerID p2p.ID, +) error { + if err := cs.SetProposal(proposal, peerID); err != nil { + return err + } + for i := 0; i < int(parts.Total()); i++ { + part := parts.GetPart(i) + if err := cs.AddProposalBlockPart(proposal.Height, proposal.Round, part, peerID); err != nil { + return err + } + } + return nil +} + +//------------------------------------------------------------ +// internal functions for managing the state + +func (cs *State) updateHeight(height int64) { + cs.metrics.Height.Set(float64(height)) + cs.Height = height +} + +func (cs *State) updateRoundStep(round int32, step cstypes.RoundStepType) { + cs.Round = round + cs.Step = step +} + +// enterNewRound(height, 0) at cs.StartTime. +func (cs *State) scheduleRound0(rs *cstypes.RoundState) { + // cs.Logger.Info("scheduleRound0", "now", tmtime.Now(), "startTime", cs.StartTime) + sleepDuration := rs.StartTime.Sub(tmtime.Now()) + cs.scheduleTimeout(sleepDuration, rs.Height, 0, cstypes.RoundStepNewHeight) +} + +// Attempt to schedule a timeout (by sending timeoutInfo on the tickChan) +func (cs *State) scheduleTimeout(duration time.Duration, height int64, round int32, step cstypes.RoundStepType) { + cs.timeoutTicker.ScheduleTimeout(timeoutInfo{duration, height, round, step}) +} + +// send a msg into the receiveRoutine regarding our own proposal, block part, or vote +func (cs *State) sendInternalMessage(mi msgInfo) { + select { + case cs.internalMsgQueue <- mi: + default: + // NOTE: using the go-routine means our votes can + // be processed out of order. + // TODO: use CList here for strict determinism and + // attempt push to internalMsgQueue in receiveRoutine + cs.Logger.Info("Internal msg queue is full. Using a go-routine") + go func() { cs.internalMsgQueue <- mi }() + } +} + +// Reconstruct LastCommit from SeenCommit, which we saved along with the block, +// (which happens even before saving the state) +func (cs *State) reconstructLastCommit(state sm.State) { + seenCommit := cs.blockStore.LoadSeenCommit(state.LastBlockHeight) + if seenCommit == nil { + panic(fmt.Sprintf("Failed to reconstruct LastCommit: seen commit for height %v not found", + state.LastBlockHeight)) + } + + lastPrecommits := types.CommitToVoteSet(state.ChainID, seenCommit, state.LastValidators) + if !lastPrecommits.HasTwoThirdsMajority() { + panic("Failed to reconstruct LastCommit: Does not have +2/3 maj") + } + + cs.LastCommit = lastPrecommits +} + +// Updates State and increments height to match that of state. +// The round becomes 0 and cs.Step becomes cstypes.RoundStepNewHeight. +func (cs *State) updateToState(state sm.State) { + if cs.CommitRound > -1 && 0 < cs.Height && cs.Height != state.LastBlockHeight { + panic(fmt.Sprintf("updateToState() expected state height of %v but found %v", + cs.Height, state.LastBlockHeight)) + } + if !cs.state.IsEmpty() { + if cs.state.LastBlockHeight > 0 && cs.state.LastBlockHeight+1 != cs.Height { + // This might happen when someone else is mutating cs.state. + // Someone forgot to pass in state.Copy() somewhere?! 
+ panic(fmt.Sprintf("Inconsistent cs.state.LastBlockHeight+1 %v vs cs.Height %v", + cs.state.LastBlockHeight+1, cs.Height)) + } + if cs.state.LastBlockHeight > 0 && cs.Height == cs.state.InitialHeight { + panic(fmt.Sprintf("Inconsistent cs.state.LastBlockHeight %v, expected 0 for initial height %v", + cs.state.LastBlockHeight, cs.state.InitialHeight)) + } + + // If state isn't further out than cs.state, just ignore. + // This happens when SwitchToConsensus() is called in the reactor. + // We don't want to reset e.g. the Votes, but we still want to + // signal the new round step, because other services (eg. txNotifier) + // depend on having an up-to-date peer state! + if state.LastBlockHeight <= cs.state.LastBlockHeight { + cs.Logger.Info( + "Ignoring updateToState()", + "newHeight", + state.LastBlockHeight+1, + "oldHeight", + cs.state.LastBlockHeight+1) + cs.newStep() + return + } + } + + // Reset fields based on state. + validators := state.Validators + + switch { + case state.LastBlockHeight == 0: // Very first commit should be empty. + cs.LastCommit = (*types.VoteSet)(nil) + case cs.CommitRound > -1 && cs.Votes != nil: // Otherwise, use cs.Votes + if !cs.Votes.Precommits(cs.CommitRound).HasTwoThirdsMajority() { + panic(fmt.Sprintf("Wanted to form a Commit, but Precommits (H/R: %d/%d) didn't have 2/3+: %v", + state.LastBlockHeight, + cs.CommitRound, + cs.Votes.Precommits(cs.CommitRound))) + } + cs.LastCommit = cs.Votes.Precommits(cs.CommitRound) + case cs.LastCommit == nil: + // NOTE: when Tendermint starts, it has no votes. reconstructLastCommit + // must be called to reconstruct LastCommit from SeenCommit. + panic(fmt.Sprintf("LastCommit cannot be empty after initial block (H:%d)", + state.LastBlockHeight+1, + )) + } + + // Next desired block height + height := state.LastBlockHeight + 1 + if height == 1 { + height = state.InitialHeight + } + + // RoundState fields + cs.updateHeight(height) + cs.updateRoundStep(0, cstypes.RoundStepNewHeight) + if cs.CommitTime.IsZero() { + // "Now" makes it easier to sync up dev nodes. + // We add timeoutCommit to allow transactions + // to be gathered for the first block. + // And alternative solution that relies on clocks: + // cs.StartTime = state.LastBlockTime.Add(timeoutCommit) + cs.StartTime = cs.config.Commit(tmtime.Now()) + } else { + cs.StartTime = cs.config.Commit(cs.CommitTime) + } + + cs.Validators = validators + cs.Proposal = nil + cs.ProposalBlock = nil + cs.ProposalBlockParts = nil + cs.LockedRound = -1 + cs.LockedBlock = nil + cs.LockedBlockParts = nil + cs.ValidRound = -1 + cs.ValidBlock = nil + cs.ValidBlockParts = nil + cs.Votes = cstypes.NewHeightVoteSet(state.ChainID, height, validators) + cs.CommitRound = -1 + cs.LastValidators = state.LastValidators + cs.TriggeredTimeoutPrecommit = false + + cs.state = state + + // Finally, broadcast RoundState + cs.newStep() +} + +func (cs *State) newStep() { + rs := cs.RoundStateEvent() + if err := cs.wal.Write(rs); err != nil { + cs.Logger.Error("Error writing to wal", "err", err) + } + cs.nSteps++ + // newStep is called by updateToState in NewState before the eventBus is set! + if cs.eventBus != nil { + if err := cs.eventBus.PublishEventNewRoundStep(rs); err != nil { + cs.Logger.Error("Error publishing new round step", "err", err) + } + cs.evsw.FireEvent(types.EventNewRoundStep, &cs.RoundState) + } +} + +//----------------------------------------- +// the main go routines + +// receiveRoutine handles messages which may cause state transitions. 
+// it's argument (n) is the number of messages to process before exiting - use 0 to run forever +// It keeps the RoundState and is the only thing that updates it. +// Updates (state transitions) happen on timeouts, complete proposals, and 2/3 majorities. +// State must be locked before any internal state is updated. +func (cs *State) receiveRoutine(maxSteps int) { + onExit := func(cs *State) { + // NOTE: the internalMsgQueue may have signed messages from our + // priv_val that haven't hit the WAL, but its ok because + // priv_val tracks LastSig + + // close wal now that we're done writing to it + if err := cs.wal.Stop(); err != nil { + cs.Logger.Error("error trying to stop wal", "error", err) + } + cs.wal.Wait() + + close(cs.done) + } + + defer func() { + if r := recover(); r != nil { + cs.Logger.Error("CONSENSUS FAILURE!!!", "err", r, "stack", string(debug.Stack())) + // stop gracefully + // + // NOTE: We most probably shouldn't be running any further when there is + // some unexpected panic. Some unknown error happened, and so we don't + // know if that will result in the validator signing an invalid thing. It + // might be worthwhile to explore a mechanism for manual resuming via + // some console or secure RPC system, but for now, halting the chain upon + // unexpected consensus bugs sounds like the better option. + onExit(cs) + } + }() + + for { + if maxSteps > 0 { + if cs.nSteps >= maxSteps { + cs.Logger.Info("reached max steps. exiting receive routine") + cs.nSteps = 0 + return + } + } + rs := cs.RoundState + var mi msgInfo + + select { + case <-cs.txNotifier.TxsAvailable(): + cs.handleTxsAvailable() + case mi = <-cs.peerMsgQueue: + if err := cs.wal.Write(mi); err != nil { + cs.Logger.Error("Error writing to wal", "err", err) + } + // handles proposals, block parts, votes + // may generate internal events (votes, complete proposals, 2/3 majorities) + cs.handleMsg(mi) + case mi = <-cs.internalMsgQueue: + err := cs.wal.WriteSync(mi) // NOTE: fsync + if err != nil { + panic(fmt.Sprintf("Failed to write %v msg to consensus wal due to %v. Check your FS and restart the node", mi, err)) + } + + if _, ok := mi.Msg.(*VoteMessage); ok { + // we actually want to simulate failing during + // the previous WriteSync, but this isn't easy to do. + // Equivalent would be to fail here and manually remove + // some bytes from the end of the wal. + fail.Fail() // XXX + } + + // handles proposals, block parts, votes + cs.handleMsg(mi) + case ti := <-cs.timeoutTicker.Chan(): // tockChan: + if err := cs.wal.Write(ti); err != nil { + cs.Logger.Error("Error writing to wal", "err", err) + } + // if the timeout is relevant to the rs + // go to the next step + cs.handleTimeout(ti, rs) + case <-cs.Quit(): + onExit(cs) + return + } + } +} + +func (cs *State) handleTimeout(ti timeoutInfo, rs cstypes.RoundState) { + cs.Logger.Debug("Received tock", "timeout", ti.Duration, "height", ti.Height, "round", ti.Round, "step", ti.Step) + + // timeouts must be for current height, round, step + if ti.Height != rs.Height || ti.Round < rs.Round || (ti.Round == rs.Round && ti.Step < rs.Step) { + cs.Logger.Debug("Ignoring tock because we're ahead", "height", rs.Height, "round", rs.Round, "step", rs.Step) + return + } + + // the timeout will now cause a state transition + cs.mtx.Lock() + defer cs.mtx.Unlock() + + switch ti.Step { + case cstypes.RoundStepNewHeight: + // NewRound event fired from enterNewRound. + // XXX: should we fire timeout here (for timeout commit)? 
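+		// timeoutCommit for the previous height has elapsed, so kick off round 0 of the new height.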
+ cs.enterNewRound(ti.Height, 0) + case cstypes.RoundStepNewRound: + cs.enterPropose(ti.Height, 0) + case cstypes.RoundStepPropose: + if err := cs.eventBus.PublishEventTimeoutPropose(cs.RoundStateEvent()); err != nil { + cs.Logger.Error("Error publishing timeout propose", "err", err) + } + cs.enterPrevote(ti.Height, ti.Round) + case cstypes.RoundStepPrevoteWait: + if err := cs.eventBus.PublishEventTimeoutWait(cs.RoundStateEvent()); err != nil { + cs.Logger.Error("Error publishing timeout wait", "err", err) + } + cs.enterPrecommit(ti.Height, ti.Round) + case cstypes.RoundStepPrecommitWait: + if err := cs.eventBus.PublishEventTimeoutWait(cs.RoundStateEvent()); err != nil { + cs.Logger.Error("Error publishing timeout wait", "err", err) + } + cs.enterPrecommit(ti.Height, ti.Round) + cs.enterNewRound(ti.Height, ti.Round+1) + default: + panic(fmt.Sprintf("Invalid timeout step: %v", ti.Step)) + } + +} + +func (cs *State) handleTxsAvailable() { + cs.mtx.Lock() + defer cs.mtx.Unlock() + + // We only need to do this for round 0. + if cs.Round != 0 { + return + } + + switch cs.Step { + case cstypes.RoundStepNewHeight: // timeoutCommit phase + if cs.needProofBlock(cs.Height) { + // enterPropose will be called by enterNewRound + return + } + + // +1ms to ensure RoundStepNewRound timeout always happens after RoundStepNewHeight + timeoutCommit := cs.StartTime.Sub(tmtime.Now()) + 1*time.Millisecond + cs.scheduleTimeout(timeoutCommit, cs.Height, 0, cstypes.RoundStepNewRound) + case cstypes.RoundStepNewRound: // after timeoutCommit + cs.enterPropose(cs.Height, 0) + } +} + +//----------------------------------------------------------------------------- +// State functions +// Used internally by handleTimeout and handleMsg to make state transitions + +// Enter: `timeoutNewHeight` by startTime (commitTime+timeoutCommit), +// or, if SkipTimeoutCommit==true, after receiving all precommits from (height,round-1) +// Enter: `timeoutPrecommits` after any +2/3 precommits from (height,round-1) +// Enter: +2/3 precommits for nil at (height,round-1) +// Enter: +2/3 prevotes any or +2/3 precommits for block or any from (height, round) +// NOTE: cs.StartTime was already set for height. +func (cs *State) enterNewRound(height int64, round int32) { + logger := cs.Logger.With("height", height, "round", round) + + if cs.Height != height || round < cs.Round || (cs.Round == round && cs.Step != cstypes.RoundStepNewHeight) { + logger.Debug(fmt.Sprintf( + "enterNewRound(%v/%v): Invalid args. Current step: %v/%v/%v", + height, + round, + cs.Height, + cs.Round, + cs.Step)) + return + } + + if now := tmtime.Now(); cs.StartTime.After(now) { + logger.Info("Need to set a buffer and log message here for sanity.", "startTime", cs.StartTime, "now", now) + } + + logger.Info(fmt.Sprintf("enterNewRound(%v/%v). Current: %v/%v/%v", height, round, cs.Height, cs.Round, cs.Step)) + + // Increment validators if necessary + validators := cs.Validators + if cs.Round < round { + validators = validators.Copy() + validators.IncrementProposerPriority(tmmath.SafeSubInt32(round, cs.Round)) + } + + // Setup new round + // we don't fire newStep for this step, + // but we fire an event, so update the round step first + cs.updateRoundStep(round, cstypes.RoundStepNewRound) + cs.Validators = validators + if round == 0 { + // We've already reset these upon new height, + // and meanwhile we might have received a proposal + // for round 0. 
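+		// Keeping them lets such an early round-0 proposal carry over instead of being wiped.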
+ } else { + logger.Info("Resetting Proposal info") + cs.Proposal = nil + cs.ProposalBlock = nil + cs.ProposalBlockParts = nil + } + cs.Votes.SetRound(tmmath.SafeAddInt32(round, 1)) // also track next round (round+1) to allow round-skipping + cs.TriggeredTimeoutPrecommit = false + + if err := cs.eventBus.PublishEventNewRound(cs.NewRoundEvent()); err != nil { + cs.Logger.Error("Error publishing new round", "err", err) + } + cs.metrics.Rounds.Set(float64(round)) + + // Wait for txs to be available in the mempool + // before we enterPropose in round 0. If the last block changed the app hash, + // we may need an empty "proof" block, and enterPropose immediately. + waitForTxs := cs.config.WaitForTxs() && round == 0 && !cs.needProofBlock(height) + if waitForTxs { + if cs.config.CreateEmptyBlocksInterval > 0 { + cs.scheduleTimeout(cs.config.CreateEmptyBlocksInterval, height, round, + cstypes.RoundStepNewRound) + } + } else { + cs.enterPropose(height, round) + } +} + +// needProofBlock returns true on the first height (so the genesis app hash is signed right away) +// and where the last block (height-1) caused the app hash to change +func (cs *State) needProofBlock(height int64) bool { + if height == cs.state.InitialHeight { + return true + } + + lastBlockMeta := cs.blockStore.LoadBlockMeta(height - 1) + if lastBlockMeta == nil { + panic(fmt.Sprintf("needProofBlock: last block meta for height %d not found", height-1)) + } + return !bytes.Equal(cs.state.AppHash, lastBlockMeta.Header.AppHash) +} + +func (cs *State) isProposer(address []byte) bool { + return bytes.Equal(cs.Validators.GetProposer().Address, address) +} + +func (cs *State) defaultDecideProposal(height int64, round int32) { + var block *types.Block + var blockParts *types.PartSet + + // Decide on block + if cs.ValidBlock != nil { + // If there is valid block, choose that. + block, blockParts = cs.ValidBlock, cs.ValidBlockParts + } else { + // Create a new proposal block from state/txs from the mempool. + block, blockParts = cs.createProposalBlock() + if block == nil { + return + } + } + + // Flush the WAL. Otherwise, we may not recompute the same proposal to sign, + // and the privValidator will refuse to sign anything. + if err := cs.wal.FlushAndSync(); err != nil { + cs.Logger.Error("Error flushing to disk") + } + + // Make proposal + propBlockID := types.BlockID{Hash: block.Hash(), PartSetHeader: blockParts.Header()} + proposal := types.NewProposal(height, round, cs.ValidRound, propBlockID) + p := proposal.ToProto() + if err := cs.privValidator.SignProposal(cs.state.ChainID, p); err == nil { + proposal.Signature = p.Signature + + // send proposal and block parts on internal msg queue + cs.sendInternalMessage(msgInfo{&ProposalMessage{proposal}, ""}) + for i := 0; i < int(blockParts.Total()); i++ { + part := blockParts.GetPart(i) + cs.sendInternalMessage(msgInfo{&BlockPartMessage{cs.Height, cs.Round, part}, ""}) + } + cs.Logger.Info("Signed proposal", "height", height, "round", round, "proposal", proposal) + cs.Logger.Debug(fmt.Sprintf("Signed proposal block: %v", block)) + } else if !cs.replayMode { + cs.Logger.Error("enterPropose: Error signing proposal", "height", height, "round", round, "err", err) + } +} + +// Returns true if the proposal block is complete && +// (if POLRound was proposed, we have +2/3 prevotes from there). +func (cs *State) isProposalComplete() bool { + if cs.Proposal == nil || cs.ProposalBlock == nil { + return false + } + // we have the proposal. 
if there's a POLRound, + // make sure we have the prevotes from it too + if cs.Proposal.POLRound < 0 { + return true + } + // if this is false the proposer is lying or we haven't received the POL yet + return cs.Votes.Prevotes(cs.Proposal.POLRound).HasTwoThirdsMajority() + +} + +// Create the next block to propose and return it. Returns nil block upon error. +// +// We really only need to return the parts, but the block is returned for +// convenience so we can log the proposal block. +// +// NOTE: keep it side-effect free for clarity. +// CONTRACT: cs.privValidator is not nil. +func (cs *State) createProposalBlock() (block *types.Block, blockParts *types.PartSet) { + if cs.privValidator == nil { + panic("entered createProposalBlock with privValidator being nil") + } + + var commit *types.Commit + switch { + case cs.Height == cs.state.InitialHeight: + // We're creating a proposal for the first block. + // The commit is empty, but not nil. + commit = types.NewCommit(0, 0, types.BlockID{}, nil) + case cs.LastCommit.HasTwoThirdsMajority(): + // Make the commit from LastCommit + commit = cs.LastCommit.MakeCommit() + default: // This shouldn't happen. + cs.Logger.Error("enterPropose: Cannot propose anything: No commit for the previous block") + return + } + + if cs.privValidatorPubKey == nil { + // If this node is a validator & proposer in the current round, it will + // miss the opportunity to create a block. + cs.Logger.Error(fmt.Sprintf("enterPropose: %v", errPubKeyIsNotSet)) + return + } + proposerAddr := cs.privValidatorPubKey.Address() + + return cs.blockExec.CreateProposalBlock(cs.Height, cs.state, commit, proposerAddr) +} + +// Enter: any +2/3 prevotes at next round. +func (cs *State) enterPrevoteWait(height int64, round int32) { + logger := cs.Logger.With("height", height, "round", round) + + if cs.Height != height || round < cs.Round || (cs.Round == round && cstypes.RoundStepPrevoteWait <= cs.Step) { + logger.Debug(fmt.Sprintf( + "enterPrevoteWait(%v/%v): Invalid args. Current step: %v/%v/%v", + height, + round, + cs.Height, + cs.Round, + cs.Step)) + return + } + if !cs.Votes.Prevotes(round).HasTwoThirdsAny() { + panic(fmt.Sprintf("enterPrevoteWait(%v/%v), but Prevotes does not have any +2/3 votes", height, round)) + } + logger.Info(fmt.Sprintf("enterPrevoteWait(%v/%v). Current: %v/%v/%v", height, round, cs.Height, cs.Round, cs.Step)) + + defer func() { + // Done enterPrevoteWait: + cs.updateRoundStep(round, cstypes.RoundStepPrevoteWait) + cs.newStep() + }() + + // Wait for some more prevotes; enterPrecommit + cs.scheduleTimeout(cs.config.Prevote(round), height, round, cstypes.RoundStepPrevoteWait) +} + +// Enter: any +2/3 precommits for next round. +func (cs *State) enterPrecommitWait(height int64, round int32) { + logger := cs.Logger.With("height", height, "round", round) + + if cs.Height != height || round < cs.Round || (cs.Round == round && cs.TriggeredTimeoutPrecommit) { + logger.Debug( + fmt.Sprintf( + "enterPrecommitWait(%v/%v): Invalid args. "+ + "Current state is Height/Round: %v/%v/, TriggeredTimeoutPrecommit:%v", + height, round, cs.Height, cs.Round, cs.TriggeredTimeoutPrecommit)) + return + } + if !cs.Votes.Precommits(round).HasTwoThirdsAny() { + panic(fmt.Sprintf("enterPrecommitWait(%v/%v), but Precommits does not have any +2/3 votes", height, round)) + } + logger.Info(fmt.Sprintf("enterPrecommitWait(%v/%v). 
Current: %v/%v/%v", height, round, cs.Height, cs.Round, cs.Step)) + + defer func() { + // Done enterPrecommitWait: + cs.TriggeredTimeoutPrecommit = true + cs.newStep() + }() + + // Wait for some more precommits; enterNewRound + cs.scheduleTimeout(cs.config.Precommit(round), height, round, cstypes.RoundStepPrecommitWait) +} + +// Enter: +2/3 precommits for block +func (cs *State) enterCommit(height int64, commitRound int32) { + logger := cs.Logger.With("height", height, "commitRound", commitRound) + + if cs.Height != height || cstypes.RoundStepCommit <= cs.Step { + logger.Debug(fmt.Sprintf( + "enterCommit(%v/%v): Invalid args. Current step: %v/%v/%v", + height, + commitRound, + cs.Height, + cs.Round, + cs.Step)) + return + } + logger.Info(fmt.Sprintf("enterCommit(%v/%v). Current: %v/%v/%v", height, commitRound, cs.Height, cs.Round, cs.Step)) + + defer func() { + // Done enterCommit: + // keep cs.Round the same, commitRound points to the right Precommits set. + cs.updateRoundStep(cs.Round, cstypes.RoundStepCommit) + cs.CommitRound = commitRound + cs.CommitTime = tmtime.Now() + cs.newStep() + + // Maybe finalize immediately. + cs.tryFinalizeCommit(height) + }() + + blockID, ok := cs.Votes.Precommits(commitRound).TwoThirdsMajority() + if !ok { + panic("RunActionCommit() expects +2/3 precommits") + } + + // The Locked* fields no longer matter. + // Move them over to ProposalBlock if they match the commit hash, + // otherwise they'll be cleared in updateToState. + if cs.LockedBlock.HashesTo(blockID.Hash) { + logger.Info("Commit is for locked block. Set ProposalBlock=LockedBlock", "blockHash", blockID.Hash) + cs.ProposalBlock = cs.LockedBlock + cs.ProposalBlockParts = cs.LockedBlockParts + } + + // If we don't have the block being committed, set up to get it. + if !cs.ProposalBlock.HashesTo(blockID.Hash) { + if !cs.ProposalBlockParts.HasHeader(blockID.PartSetHeader) { + logger.Info( + "Commit is for a block we don't know about. Set ProposalBlock=nil", + "proposal", + cs.ProposalBlock.Hash(), + "commit", + blockID.Hash) + // We're getting the wrong block. + // Set up ProposalBlockParts and keep waiting. + cs.ProposalBlock = nil + cs.ProposalBlockParts = types.NewPartSetFromHeader(blockID.PartSetHeader) + if err := cs.eventBus.PublishEventValidBlock(cs.RoundStateEvent()); err != nil { + cs.Logger.Error("Error publishing valid block", "err", err) + } + cs.evsw.FireEvent(types.EventValidBlock, &cs.RoundState) + } + // else { + // We just need to keep waiting. + // } + } +} + +// If we have the block AND +2/3 commits for it, finalize. +func (cs *State) tryFinalizeCommit(height int64) { + logger := cs.Logger.With("height", height) + + if cs.Height != height { + panic(fmt.Sprintf("tryFinalizeCommit() cs.Height: %v vs height: %v", cs.Height, height)) + } + + blockID, ok := cs.Votes.Precommits(cs.CommitRound).TwoThirdsMajority() + if !ok || len(blockID.Hash) == 0 { + logger.Error("Attempt to finalize failed. There was no +2/3 majority, or +2/3 was for .") + return + } + if !cs.ProposalBlock.HashesTo(blockID.Hash) { + // TODO: this happens every time if we're not a validator (ugly logs) + // TODO: ^^ wait, why does it matter that we're a validator? + logger.Info( + "Attempt to finalize failed. 
We don't have the commit block.", + "proposal-block", + cs.ProposalBlock.Hash(), + "commit-block", + blockID.Hash) + return + } + + // go + cs.finalizeCommit(height) +} + +// Increment height and goto cstypes.RoundStepNewHeight +func (cs *State) finalizeCommit(height int64) { + if cs.Height != height || cs.Step != cstypes.RoundStepCommit { + cs.Logger.Debug(fmt.Sprintf( + "finalizeCommit(%v): Invalid args. Current step: %v/%v/%v", + height, + cs.Height, + cs.Round, + cs.Step)) + return + } + + blockID, ok := cs.Votes.Precommits(cs.CommitRound).TwoThirdsMajority() + block, blockParts := cs.ProposalBlock, cs.ProposalBlockParts + + if !ok { + panic("Cannot finalizeCommit, commit does not have two thirds majority") + } + if !blockParts.HasHeader(blockID.PartSetHeader) { + panic("Expected ProposalBlockParts header to be commit header") + } + if !block.HashesTo(blockID.Hash) { + panic("Cannot finalizeCommit, ProposalBlock does not hash to commit hash") + } + if err := cs.blockExec.ValidateBlock(cs.state, block); err != nil { + panic(fmt.Errorf("+2/3 committed an invalid block: %w", err)) + } + + cs.Logger.Info("Finalizing commit of block with N txs", + "height", block.Height, + "hash", block.Hash(), + "root", block.AppHash, + "N", len(block.Txs)) + cs.Logger.Info(fmt.Sprintf("%v", block)) + + fail.Fail() // XXX + + // Save to blockStore. + if cs.blockStore.Height() < block.Height { + // NOTE: the seenCommit is local justification to commit this block, + // but may differ from the LastCommit included in the next block + precommits := cs.Votes.Precommits(cs.CommitRound) + seenCommit := precommits.MakeCommit() + cs.blockStore.SaveBlock(block, blockParts, seenCommit) + } else { + // Happens during replay if we already saved the block but didn't commit + cs.Logger.Info("Calling finalizeCommit on already stored block", "height", block.Height) + } + + fail.Fail() // XXX + + // Write EndHeightMessage{} for this height, implying that the blockstore + // has saved the block. + // + // If we crash before writing this EndHeightMessage{}, we will recover by + // running ApplyBlock during the ABCI handshake when we restart. If we + // didn't save the block to the blockstore before writing + // EndHeightMessage{}, we'd have to change WAL replay -- currently it + // complains about replaying for heights where an #ENDHEIGHT entry already + // exists. + // + // Either way, the State should not be resumed until we + // successfully call ApplyBlock (ie. later here, or in Handshake after + // restart). + endMsg := EndHeightMessage{height} + if err := cs.wal.WriteSync(endMsg); err != nil { // NOTE: fsync + panic(fmt.Sprintf("Failed to write %v msg to consensus wal due to %v. Check your FS and restart the node", + endMsg, err)) + } + + fail.Fail() // XXX + + // Create a copy of the state for staging and an event cache for txs. + stateCopy := cs.state.Copy() + + // Execute and commit the block, update and save the state, and update the mempool. + // NOTE The block.AppHash wont reflect these txs until the next block. + var err error + var retainHeight int64 + stateCopy, retainHeight, err = cs.blockExec.ApplyBlock( + stateCopy, + types.BlockID{Hash: block.Hash(), PartSetHeader: blockParts.Header()}, + block) + if err != nil { + cs.Logger.Error("Error on ApplyBlock", "err", err) + return + } + + fail.Fail() // XXX + + // Prune old heights, if requested by ABCI app. 
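+ // retainHeight comes back from ApplyBlock above; a zero value means the
+ // application has not asked for any blocks to be pruned.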
+ if retainHeight > 0 { + pruned, err := cs.pruneBlocks(retainHeight) + if err != nil { + cs.Logger.Error("Failed to prune blocks", "retainHeight", retainHeight, "err", err) + } else { + cs.Logger.Info("Pruned blocks", "pruned", pruned, "retainHeight", retainHeight) + } + } + + // must be called before we update state + cs.recordMetrics(height, block) + + // NewHeightStep! + cs.updateToState(stateCopy) + + fail.Fail() // XXX + + // Private validator might have changed it's key pair => refetch pubkey. + if err := cs.updatePrivValidatorPubKey(); err != nil { + cs.Logger.Error("Can't get private validator pubkey", "err", err) + } + + // cs.StartTime is already set. + // Schedule Round0 to start soon. + cs.scheduleRound0(&cs.RoundState) + + // By here, + // * cs.Height has been increment to height+1 + // * cs.Step is now cstypes.RoundStepNewHeight + // * cs.StartTime is set to when we will start round0. +} + +func (cs *State) pruneBlocks(retainHeight int64) (uint64, error) { + base := cs.blockStore.Base() + if retainHeight <= base { + return 0, nil + } + pruned, err := cs.blockStore.PruneBlocks(retainHeight) + if err != nil { + return 0, fmt.Errorf("failed to prune block store: %w", err) + } + err = cs.blockExec.Store().PruneStates(base, retainHeight) + if err != nil { + return 0, fmt.Errorf("failed to prune state database: %w", err) + } + return pruned, nil +} + +func (cs *State) recordMetrics(height int64, block *types.Block) { + cs.metrics.Validators.Set(float64(cs.Validators.Size())) + cs.metrics.ValidatorsPower.Set(float64(cs.Validators.TotalVotingPower())) + + var ( + missingValidators int + missingValidatorsPower int64 + ) + // height=0 -> MissingValidators and MissingValidatorsPower are both 0. + // Remember that the first LastCommit is intentionally empty, so it's not + // fair to increment missing validators number. + if height > cs.state.InitialHeight { + // Sanity check that commit size matches validator set size - only applies + // after first block. + var ( + commitSize = block.LastCommit.Size() + valSetLen = len(cs.LastValidators.Validators) + address types.Address + ) + if commitSize != valSetLen { + panic(fmt.Sprintf("commit size (%d) doesn't match valset length (%d) at height %d\n\n%v\n\n%v", + commitSize, valSetLen, block.Height, block.LastCommit.Signatures, cs.LastValidators.Validators)) + } + + if cs.privValidator != nil { + if cs.privValidatorPubKey == nil { + // Metrics won't be updated, but it's not critical. + cs.Logger.Error(fmt.Sprintf("recordMetrics: %v", errPubKeyIsNotSet)) + } else { + address = cs.privValidatorPubKey.Address() + } + } + + for i, val := range cs.LastValidators.Validators { + commitSig := block.LastCommit.Signatures[i] + if commitSig.Absent() { + missingValidators++ + missingValidatorsPower += val.VotingPower + } + + if bytes.Equal(val.Address, address) { + label := []string{ + "validator_address", val.Address.String(), + } + cs.metrics.ValidatorPower.With(label...).Set(float64(val.VotingPower)) + if commitSig.ForBlock() { + cs.metrics.ValidatorLastSignedHeight.With(label...).Set(float64(height)) + } else { + cs.metrics.ValidatorMissedBlocks.With(label...).Add(float64(1)) + } + } + + } + } + cs.metrics.MissingValidators.Set(float64(missingValidators)) + cs.metrics.MissingValidatorsPower.Set(float64(missingValidatorsPower)) + + // NOTE: byzantine validators power and count is only for consensus evidence i.e. 
duplicate vote + var ( + byzantineValidatorsPower = int64(0) + byzantineValidatorsCount = int64(0) + ) + for _, ev := range block.Evidence.Evidence { + if dve, ok := ev.(*types.DuplicateVoteEvidence); ok { + if _, val := cs.Validators.GetByAddress(dve.VoteA.ValidatorAddress); val != nil { + byzantineValidatorsCount++ + byzantineValidatorsPower += val.VotingPower + } + } + } + cs.metrics.ByzantineValidators.Set(float64(byzantineValidatorsCount)) + cs.metrics.ByzantineValidatorsPower.Set(float64(byzantineValidatorsPower)) + + if height > 1 { + lastBlockMeta := cs.blockStore.LoadBlockMeta(height - 1) + if lastBlockMeta != nil { + cs.metrics.BlockIntervalSeconds.Observe( + block.Time.Sub(lastBlockMeta.Header.Time).Seconds(), + ) + } + } + + cs.metrics.NumTxs.Set(float64(len(block.Data.Txs))) + cs.metrics.TotalTxs.Add(float64(len(block.Data.Txs))) + cs.metrics.BlockSizeBytes.Set(float64(block.Size())) + cs.metrics.CommittedHeight.Set(float64(block.Height)) +} + +//----------------------------------------------------------------------------- + +// NOTE: block is not necessarily valid. +// Asynchronously triggers either enterPrevote (before we timeout of propose) or tryFinalizeCommit, +// once we have the full block. +func (cs *State) addProposalBlockPart(msg *BlockPartMessage, peerID p2p.ID) (added bool, err error) { + height, round, part := msg.Height, msg.Round, msg.Part + + // Blocks might be reused, so round mismatch is OK + if cs.Height != height { + cs.Logger.Debug("Received block part from wrong height", "height", height, "round", round) + return false, nil + } + + // We're not expecting a block part. + if cs.ProposalBlockParts == nil { + // NOTE: this can happen when we've gone to a higher round and + // then receive parts from the previous round - not necessarily a bad peer. + cs.Logger.Info("Received a block part when we're not expecting any", + "height", height, "round", round, "index", part.Index, "peer", peerID) + return false, nil + } + + added, err = cs.ProposalBlockParts.AddPart(part) + if err != nil { + return added, err + } + if cs.ProposalBlockParts.ByteSize() > cs.state.ConsensusParams.Block.MaxBytes { + return added, fmt.Errorf("total size of proposal block parts exceeds maximum block bytes (%d > %d)", + cs.ProposalBlockParts.ByteSize(), cs.state.ConsensusParams.Block.MaxBytes, + ) + } + if added && cs.ProposalBlockParts.IsComplete() { + bz, err := ioutil.ReadAll(cs.ProposalBlockParts.GetReader()) + if err != nil { + return added, err + } + + var pbb = new(tmproto.Block) + err = proto.Unmarshal(bz, pbb) + if err != nil { + return added, err + } + + block, err := types.BlockFromProto(pbb) + if err != nil { + return added, err + } + + cs.ProposalBlock = block + // NOTE: it's possible to receive complete proposal blocks for future rounds without having the proposal + cs.Logger.Info("Received complete proposal block", "height", cs.ProposalBlock.Height, "hash", cs.ProposalBlock.Hash()) + if err := cs.eventBus.PublishEventCompleteProposal(cs.CompleteProposalEvent()); err != nil { + cs.Logger.Error("Error publishing event complete proposal", "err", err) + } + + // Update Valid* if we can. 
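+ // ValidBlock/ValidRound record the most recent block to gather +2/3 prevotes;
+ // if this node later becomes proposer, defaultDecideProposal re-proposes
+ // cs.ValidBlock rather than building a fresh block.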
+ prevotes := cs.Votes.Prevotes(cs.Round) + blockID, hasTwoThirds := prevotes.TwoThirdsMajority() + if hasTwoThirds && !blockID.IsZero() && (cs.ValidRound < cs.Round) { + if cs.ProposalBlock.HashesTo(blockID.Hash) { + cs.Logger.Info("Updating valid block to new proposal block", + "valid-round", cs.Round, "valid-block-hash", cs.ProposalBlock.Hash()) + cs.ValidRound = cs.Round + cs.ValidBlock = cs.ProposalBlock + cs.ValidBlockParts = cs.ProposalBlockParts + } + // TODO: In case there is +2/3 majority in Prevotes set for some + // block and cs.ProposalBlock contains different block, either + // proposer is faulty or voting power of faulty processes is more + // than 1/3. We should trigger in the future accountability + // procedure at this point. + } + + if cs.Step <= cstypes.RoundStepPropose && cs.isProposalComplete() { + // Move onto the next step + cs.enterPrevote(height, cs.Round) + if hasTwoThirds { // this is optimisation as this will be triggered when prevote is added + cs.enterPrecommit(height, cs.Round) + } + } else if cs.Step == cstypes.RoundStepCommit { + // If we're waiting on the proposal block... + cs.tryFinalizeCommit(height) + } + return added, nil + } + return added, nil +} + +// Attempt to add the vote. if its a duplicate signature, dupeout the validator +func (cs *State) tryAddVote(vote *types.Vote, peerID p2p.ID) (bool, error) { + added, err := cs.addVote(vote, peerID) + if err != nil { + // If the vote height is off, we'll just ignore it, + // But if it's a conflicting sig, add it to the cs.evpool. + // If it's otherwise invalid, punish peer. + // nolint: gocritic + if voteErr, ok := err.(*types.ErrVoteConflictingVotes); ok { + if cs.privValidatorPubKey == nil { + return false, errPubKeyIsNotSet + } + + if bytes.Equal(vote.ValidatorAddress, cs.privValidatorPubKey.Address()) { + cs.Logger.Error( + "Found conflicting vote from ourselves. Did you unsafe_reset a validator?", + "height", + vote.Height, + "round", + vote.Round, + "type", + vote.Type) + return added, err + } + var timestamp time.Time + if voteErr.VoteA.Height == cs.state.InitialHeight { + timestamp = cs.state.LastBlockTime // genesis time + } else { + timestamp = sm.MedianTime(cs.LastCommit.MakeCommit(), cs.LastValidators) + } + evidenceErr := cs.evpool.AddEvidenceFromConsensus( + types.NewDuplicateVoteEvidence(voteErr.VoteA, voteErr.VoteB), timestamp, cs.Validators) + if evidenceErr != nil { + cs.Logger.Error("Failed to add evidence to the evidence pool", "err", evidenceErr) + } + return added, err + } else if err == types.ErrVoteNonDeterministicSignature { + cs.Logger.Debug("Vote has non-deterministic signature", "err", err) + } else { + // Either + // 1) bad peer OR + // 2) not a bad peer? this can also err sometimes with "Unexpected step" OR + // 3) tmkms use with multiple validators connecting to a single tmkms instance + // (https://github.com/tendermint/tendermint/issues/3839). + cs.Logger.Info("Error attempting to add vote", "err", err) + return added, ErrAddingVote + } + } + return added, nil +} + +//----------------------------------------------------------------------------- + +// CONTRACT: cs.privValidator is not nil. +func (cs *State) signVote( + msgType tmproto.SignedMsgType, + hash []byte, + header types.PartSetHeader, +) (*types.Vote, error) { + // Flush the WAL. Otherwise, we may not recompute the same vote to sign, + // and the privValidator will refuse to sign anything. 
+ if err := cs.wal.FlushAndSync(); err != nil { + return nil, err + } + + if cs.privValidatorPubKey == nil { + return nil, errPubKeyIsNotSet + } + addr := cs.privValidatorPubKey.Address() + valIdx, _ := cs.Validators.GetByAddress(addr) + + vote := &types.Vote{ + ValidatorAddress: addr, + ValidatorIndex: valIdx, + Height: cs.Height, + Round: cs.Round, + Timestamp: cs.voteTime(), + Type: msgType, + BlockID: types.BlockID{Hash: hash, PartSetHeader: header}, + } + v := vote.ToProto() + err := cs.privValidator.SignVote(cs.state.ChainID, v) + vote.Signature = v.Signature + + return vote, err +} + +func (cs *State) voteTime() time.Time { + now := tmtime.Now() + minVoteTime := now + // TODO: We should remove next line in case we don't vote for v in case cs.ProposalBlock == nil, + // even if cs.LockedBlock != nil. See https://docs.tendermint.com/master/spec/. + timeIota := time.Duration(cs.state.ConsensusParams.Block.TimeIotaMs) * time.Millisecond + if cs.LockedBlock != nil { + // See the BFT time spec https://docs.tendermint.com/master/spec/consensus/bft-time.html + minVoteTime = cs.LockedBlock.Time.Add(timeIota) + } else if cs.ProposalBlock != nil { + minVoteTime = cs.ProposalBlock.Time.Add(timeIota) + } + + if now.After(minVoteTime) { + return now + } + return minVoteTime +} + +// sign the vote and publish on internalMsgQueue +func (cs *State) signAddVote(msgType tmproto.SignedMsgType, hash []byte, header types.PartSetHeader) *types.Vote { + if cs.privValidator == nil { // the node does not have a key + return nil + } + + if cs.privValidatorPubKey == nil { + // Vote won't be signed, but it's not critical. + cs.Logger.Error(fmt.Sprintf("signAddVote: %v", errPubKeyIsNotSet)) + return nil + } + + // If the node not in the validator set, do nothing. + if !cs.Validators.HasAddress(cs.privValidatorPubKey.Address()) { + return nil + } + + // TODO: pass pubKey to signVote + vote, err := cs.signVote(msgType, hash, header) + if err == nil { + cs.sendInternalMessage(msgInfo{&VoteMessage{vote}, ""}) + cs.Logger.Info("Signed and pushed vote", "height", cs.Height, "round", cs.Round, "vote", vote) + return vote + } + // if !cs.replayMode { + cs.Logger.Error("Error signing vote", "height", cs.Height, "round", cs.Round, "vote", vote, "err", err) + //} + return nil +} + +// updatePrivValidatorPubKey get's the private validator public key and +// memoizes it. This func returns an error if the private validator is not +// responding or responds with an error. 
+func (cs *State) updatePrivValidatorPubKey() error { + if cs.privValidator == nil { + return nil + } + + pubKey, err := cs.privValidator.GetPubKey() + if err != nil { + return err + } + cs.privValidatorPubKey = pubKey + return nil +} + +// look back to check existence of the node's consensus votes before joining consensus +func (cs *State) checkDoubleSigningRisk(height int64) error { + if cs.privValidator != nil && cs.privValidatorPubKey != nil && cs.config.DoubleSignCheckHeight > 0 && height > 0 { + valAddr := cs.privValidatorPubKey.Address() + doubleSignCheckHeight := cs.config.DoubleSignCheckHeight + if doubleSignCheckHeight > height { + doubleSignCheckHeight = height + } + for i := int64(1); i < doubleSignCheckHeight; i++ { + lastCommit := cs.blockStore.LoadSeenCommit(height - i) + if lastCommit != nil { + for sigIdx, s := range lastCommit.Signatures { + if s.BlockIDFlag == types.BlockIDFlagCommit && bytes.Equal(s.ValidatorAddress, valAddr) { + cs.Logger.Info("Found signature from the same key", "sig", s, "idx", sigIdx, "height", height-i) + return ErrSignatureFoundInPastBlocks + } + } + } + } + } + return nil +} + +//--------------------------------------------------------- + +func CompareHRS(h1 int64, r1 int32, s1 cstypes.RoundStepType, h2 int64, r2 int32, s2 cstypes.RoundStepType) int { + if h1 < h2 { + return -1 + } else if h1 > h2 { + return 1 + } + if r1 < r2 { + return -1 + } else if r1 > r2 { + return 1 + } + if s1 < s2 { + return -1 + } else if s1 > s2 { + return 1 + } + return 0 +} + +// repairWalFile decodes messages from src (until the decoder errors) and +// writes them to dst. +func repairWalFile(src, dst string) error { + in, err := os.Open(src) + if err != nil { + return err + } + defer in.Close() + + out, err := os.Open(dst) + if err != nil { + return err + } + defer out.Close() + + var ( + dec = NewWALDecoder(in) + enc = NewWALEncoder(out) + ) + + // best-case repair (until first error is encountered) + for { + msg, err := dec.Decode() + if err != nil { + break + } + + err = enc.Encode(msg) + if err != nil { + return fmt.Errorf("failed to encode msg: %w", err) + } + } + + return nil +} diff --git a/test/maverick/consensus/ticker.go b/test/maverick/consensus/ticker.go new file mode 100644 index 000000000..fb3571ac8 --- /dev/null +++ b/test/maverick/consensus/ticker.go @@ -0,0 +1,134 @@ +package consensus + +import ( + "time" + + "github.com/tendermint/tendermint/libs/log" + "github.com/tendermint/tendermint/libs/service" +) + +var ( + tickTockBufferSize = 10 +) + +// TimeoutTicker is a timer that schedules timeouts +// conditional on the height/round/step in the timeoutInfo. +// The timeoutInfo.Duration may be non-positive. +type TimeoutTicker interface { + Start() error + Stop() error + Chan() <-chan timeoutInfo // on which to receive a timeout + ScheduleTimeout(ti timeoutInfo) // reset the timer + + SetLogger(log.Logger) +} + +// timeoutTicker wraps time.Timer, +// scheduling timeouts only for greater height/round/step +// than what it's already seen. +// Timeouts are scheduled along the tickChan, +// and fired on the tockChan. +type timeoutTicker struct { + service.BaseService + + timer *time.Timer + tickChan chan timeoutInfo // for scheduling timeouts + tockChan chan timeoutInfo // for notifying about them +} + +// NewTimeoutTicker returns a new TimeoutTicker. 
+func NewTimeoutTicker() TimeoutTicker { + tt := &timeoutTicker{ + timer: time.NewTimer(0), + tickChan: make(chan timeoutInfo, tickTockBufferSize), + tockChan: make(chan timeoutInfo, tickTockBufferSize), + } + tt.BaseService = *service.NewBaseService(nil, "TimeoutTicker", tt) + tt.stopTimer() // don't want to fire until the first scheduled timeout + return tt +} + +// OnStart implements service.Service. It starts the timeout routine. +func (t *timeoutTicker) OnStart() error { + + go t.timeoutRoutine() + + return nil +} + +// OnStop implements service.Service. It stops the timeout routine. +func (t *timeoutTicker) OnStop() { + t.BaseService.OnStop() + t.stopTimer() +} + +// Chan returns a channel on which timeouts are sent. +func (t *timeoutTicker) Chan() <-chan timeoutInfo { + return t.tockChan +} + +// ScheduleTimeout schedules a new timeout by sending on the internal tickChan. +// The timeoutRoutine is always available to read from tickChan, so this won't block. +// The scheduling may fail if the timeoutRoutine has already scheduled a timeout for a later height/round/step. +func (t *timeoutTicker) ScheduleTimeout(ti timeoutInfo) { + t.tickChan <- ti +} + +//------------------------------------------------------------- + +// stop the timer and drain if necessary +func (t *timeoutTicker) stopTimer() { + // Stop() returns false if it was already fired or was stopped + if !t.timer.Stop() { + select { + case <-t.timer.C: + default: + t.Logger.Debug("Timer already stopped") + } + } +} + +// send on tickChan to start a new timer. +// timers are interupted and replaced by new ticks from later steps +// timeouts of 0 on the tickChan will be immediately relayed to the tockChan +func (t *timeoutTicker) timeoutRoutine() { + t.Logger.Debug("Starting timeout routine") + var ti timeoutInfo + for { + select { + case newti := <-t.tickChan: + t.Logger.Debug("Received tick", "old_ti", ti, "new_ti", newti) + + // ignore tickers for old height/round/step + if newti.Height < ti.Height { + continue + } else if newti.Height == ti.Height { + if newti.Round < ti.Round { + continue + } else if newti.Round == ti.Round { + if ti.Step > 0 && newti.Step <= ti.Step { + continue + } + } + } + + // stop the last timer + t.stopTimer() + + // update timeoutInfo and reset timer + // NOTE time.Timer allows duration to be non-positive + ti = newti + t.timer.Reset(ti.Duration) + t.Logger.Debug("Scheduled timeout", "dur", ti.Duration, "height", ti.Height, "round", ti.Round, "step", ti.Step) + case <-t.timer.C: + t.Logger.Info("Timed out", "dur", ti.Duration, "height", ti.Height, "round", ti.Round, "step", ti.Step) + // go routine here guarantees timeoutRoutine doesn't block. + // Determinism comes from playback in the receiveRoutine. 
+ // We can eliminate it by merging the timeoutRoutine into receiveRoutine + // and managing the timeouts ourselves with a millisecond ticker + go func(toi timeoutInfo) { t.tockChan <- toi }(ti) + case <-t.Quit(): + return + } + } +} diff --git a/test/maverick/consensus/wal.go b/test/maverick/consensus/wal.go new file mode 100644 index 000000000..7d698713f --- /dev/null +++ b/test/maverick/consensus/wal.go @@ -0,0 +1,437 @@ +package consensus + +import ( + "encoding/binary" + "errors" + "fmt" + "hash/crc32" + "io" + "path/filepath" + "time" + + "github.com/gogo/protobuf/proto" + + auto "github.com/tendermint/tendermint/libs/autofile" + // tmjson "github.com/tendermint/tendermint/libs/json" + "github.com/tendermint/tendermint/libs/log" + tmos "github.com/tendermint/tendermint/libs/os" + "github.com/tendermint/tendermint/libs/service" + tmcons "github.com/tendermint/tendermint/proto/tendermint/consensus" + tmtime "github.com/tendermint/tendermint/types/time" +) + +const ( + // time.Time + max consensus msg size + maxMsgSizeBytes = maxMsgSize + 24 + + // how often the WAL should be sync'd during period sync'ing + walDefaultFlushInterval = 2 * time.Second +) + +//-------------------------------------------------------- +// types and functions for savings consensus messages + +// TimedWALMessage wraps WALMessage and adds Time for debugging purposes. +type TimedWALMessage struct { + Time time.Time `json:"time"` + Msg WALMessage `json:"msg"` +} + +// EndHeightMessage marks the end of the given height inside WAL. +// @internal used by scripts/wal2json util. +type EndHeightMessage struct { + Height int64 `json:"height"` +} + +type WALMessage interface{} + +// func init() { +// tmjson.RegisterType(msgInfo{}, "tendermint/wal/MsgInfo") +// tmjson.RegisterType(timeoutInfo{}, "tendermint/wal/TimeoutInfo") +// tmjson.RegisterType(EndHeightMessage{}, "tendermint/wal/EndHeightMessage") +// } + +//-------------------------------------------------------- +// Simple write-ahead logger + +// WAL is an interface for any write-ahead logger. +type WAL interface { + Write(WALMessage) error + WriteSync(WALMessage) error + FlushAndSync() error + + SearchForEndHeight(height int64, options *WALSearchOptions) (rd io.ReadCloser, found bool, err error) + + // service methods + Start() error + Stop() error + Wait() +} + +// Write ahead logger writes msgs to disk before they are processed. +// Can be used for crash-recovery and deterministic replay. +// TODO: currently the wal is overwritten during replay catchup, give it a mode +// so it's either reading or appending - must read to end to start appending +// again. +type BaseWAL struct { + service.BaseService + + group *auto.Group + + enc *WALEncoder + + flushTicker *time.Ticker + flushInterval time.Duration +} + +var _ WAL = &BaseWAL{} + +// NewWAL returns a new write-ahead logger based on `baseWAL`, which implements +// WAL. It's flushed and synced to disk every 2s and once when stopped. +func NewWAL(walFile string, groupOptions ...func(*auto.Group)) (*BaseWAL, error) { + err := tmos.EnsureDir(filepath.Dir(walFile), 0700) + if err != nil { + return nil, fmt.Errorf("failed to ensure WAL directory is in place: %w", err) + } + + group, err := auto.OpenGroup(walFile, groupOptions...) 
+ if err != nil { + return nil, err + } + wal := &BaseWAL{ + group: group, + enc: NewWALEncoder(group), + flushInterval: walDefaultFlushInterval, + } + wal.BaseService = *service.NewBaseService(nil, "baseWAL", wal) + return wal, nil +} + +// SetFlushInterval allows us to override the periodic flush interval for the WAL. +func (wal *BaseWAL) SetFlushInterval(i time.Duration) { + wal.flushInterval = i +} + +func (wal *BaseWAL) Group() *auto.Group { + return wal.group +} + +func (wal *BaseWAL) SetLogger(l log.Logger) { + wal.BaseService.Logger = l + wal.group.SetLogger(l) +} + +func (wal *BaseWAL) OnStart() error { + size, err := wal.group.Head.Size() + if err != nil { + return err + } else if size == 0 { + if err := wal.WriteSync(EndHeightMessage{0}); err != nil { + return err + } + } + err = wal.group.Start() + if err != nil { + return err + } + wal.flushTicker = time.NewTicker(wal.flushInterval) + go wal.processFlushTicks() + return nil +} + +func (wal *BaseWAL) processFlushTicks() { + for { + select { + case <-wal.flushTicker.C: + if err := wal.FlushAndSync(); err != nil { + wal.Logger.Error("Periodic WAL flush failed", "err", err) + } + case <-wal.Quit(): + return + } + } +} + +// FlushAndSync flushes and fsync's the underlying group's data to disk. +// See auto#FlushAndSync +func (wal *BaseWAL) FlushAndSync() error { + return wal.group.FlushAndSync() +} + +// Stop the underlying autofile group. +// Use Wait() to ensure it's finished shutting down +// before cleaning up files. +func (wal *BaseWAL) OnStop() { + wal.flushTicker.Stop() + if err := wal.FlushAndSync(); err != nil { + wal.Logger.Error("error on flush data to disk", "error", err) + } + if err := wal.group.Stop(); err != nil { + wal.Logger.Error("error trying to stop wal", "error", err) + } + wal.group.Close() +} + +// Wait for the underlying autofile group to finish shutting down +// so it's safe to cleanup files. +func (wal *BaseWAL) Wait() { + wal.group.Wait() +} + +// Write is called in newStep and for each receive on the +// peerMsgQueue and the timeoutTicker. +// NOTE: does not call fsync() +func (wal *BaseWAL) Write(msg WALMessage) error { + if wal == nil { + return nil + } + + if err := wal.enc.Encode(&TimedWALMessage{tmtime.Now(), msg}); err != nil { + wal.Logger.Error("Error writing msg to consensus wal. WARNING: recover may not be possible for the current height", + "err", err, "msg", msg) + return err + } + + return nil +} + +// WriteSync is called when we receive a msg from ourselves +// so that we write to disk before sending signed messages. +// NOTE: calls fsync() +func (wal *BaseWAL) WriteSync(msg WALMessage) error { + if wal == nil { + return nil + } + + if err := wal.Write(msg); err != nil { + return err + } + + if err := wal.FlushAndSync(); err != nil { + wal.Logger.Error(`WriteSync failed to flush consensus wal. + WARNING: may result in creating alternative proposals / votes for the current height iff the node restarted`, + "err", err) + return err + } + + return nil +} + +// WALSearchOptions are optional arguments to SearchForEndHeight. +type WALSearchOptions struct { + // IgnoreDataCorruptionErrors set to true will result in skipping data corruption errors. + IgnoreDataCorruptionErrors bool +} + +// SearchForEndHeight searches for the EndHeightMessage with the given height +// and returns an auto.GroupReader, whenever it was found or not and an error. +// Group reader will be nil if found equals false. +// +// CONTRACT: caller must close group reader. 
+func (wal *BaseWAL) SearchForEndHeight( + height int64, + options *WALSearchOptions) (rd io.ReadCloser, found bool, err error) { + var ( + msg *TimedWALMessage + gr *auto.GroupReader + ) + lastHeightFound := int64(-1) + + // NOTE: starting from the last file in the group because we're usually + // searching for the last height. See replay.go + min, max := wal.group.MinIndex(), wal.group.MaxIndex() + wal.Logger.Info("Searching for height", "height", height, "min", min, "max", max) + for index := max; index >= min; index-- { + gr, err = wal.group.NewReader(index) + if err != nil { + return nil, false, err + } + + dec := NewWALDecoder(gr) + for { + msg, err = dec.Decode() + if err == io.EOF { + // OPTIMISATION: no need to look for height in older files if we've seen h < height + if lastHeightFound > 0 && lastHeightFound < height { + gr.Close() + return nil, false, nil + } + // check next file + break + } + if options.IgnoreDataCorruptionErrors && IsDataCorruptionError(err) { + wal.Logger.Error("Corrupted entry. Skipping...", "err", err) + // do nothing + continue + } else if err != nil { + gr.Close() + return nil, false, err + } + + if m, ok := msg.Msg.(EndHeightMessage); ok { + lastHeightFound = m.Height + if m.Height == height { // found + wal.Logger.Info("Found", "height", height, "index", index) + return gr, true, nil + } + } + } + gr.Close() + } + + return nil, false, nil +} + +// ///////////////////////////////////////////////////////////////////////////// + +// A WALEncoder writes custom-encoded WAL messages to an output stream. +// +// Format: 4 bytes CRC sum + 4 bytes length + arbitrary-length value +type WALEncoder struct { + wr io.Writer +} + +// NewWALEncoder returns a new encoder that writes to wr. +func NewWALEncoder(wr io.Writer) *WALEncoder { + return &WALEncoder{wr} +} + +// Encode writes the custom encoding of v to the stream. It returns an error if +// the encoded size of v is greater than 1MB. Any error encountered +// during the write is also returned. +func (enc *WALEncoder) Encode(v *TimedWALMessage) error { + pbMsg, err := WALToProto(v.Msg) + if err != nil { + return err + } + pv := tmcons.TimedWALMessage{ + Time: v.Time, + Msg: pbMsg, + } + + data, err := proto.Marshal(&pv) + if err != nil { + panic(fmt.Errorf("encode timed wall message failure: %w", err)) + } + + crc := crc32.Checksum(data, crc32c) + length := uint32(len(data)) + if length > maxMsgSizeBytes { + return fmt.Errorf("msg is too big: %d bytes, max: %d bytes", length, maxMsgSizeBytes) + } + totalLength := 8 + int(length) + + msg := make([]byte, totalLength) + binary.BigEndian.PutUint32(msg[0:4], crc) + binary.BigEndian.PutUint32(msg[4:8], length) + copy(msg[8:], data) + + _, err = enc.wr.Write(msg) + return err +} + +// ///////////////////////////////////////////////////////////////////////////// + +// IsDataCorruptionError returns true if data has been corrupted inside WAL. +func IsDataCorruptionError(err error) bool { + _, ok := err.(DataCorruptionError) + return ok +} + +// DataCorruptionError is an error that occures if data on disk was corrupted. +type DataCorruptionError struct { + cause error +} + +func (e DataCorruptionError) Error() string { + return fmt.Sprintf("DataCorruptionError[%v]", e.cause) +} + +func (e DataCorruptionError) Cause() error { + return e.cause +} + +// A WALDecoder reads and decodes custom-encoded WAL messages from an input +// stream. See WALEncoder for the format used. 
+// +// It will also compare the checksums and make sure data size is equal to the +// length from the header. If that is not the case, error will be returned. +type WALDecoder struct { + rd io.Reader +} + +// NewWALDecoder returns a new decoder that reads from rd. +func NewWALDecoder(rd io.Reader) *WALDecoder { + return &WALDecoder{rd} +} + +// Decode reads the next custom-encoded value from its reader and returns it. +func (dec *WALDecoder) Decode() (*TimedWALMessage, error) { + b := make([]byte, 4) + + _, err := dec.rd.Read(b) + if errors.Is(err, io.EOF) { + return nil, err + } + if err != nil { + return nil, DataCorruptionError{fmt.Errorf("failed to read checksum: %v", err)} + } + crc := binary.BigEndian.Uint32(b) + + b = make([]byte, 4) + _, err = dec.rd.Read(b) + if err != nil { + return nil, DataCorruptionError{fmt.Errorf("failed to read length: %v", err)} + } + length := binary.BigEndian.Uint32(b) + + if length > maxMsgSizeBytes { + return nil, DataCorruptionError{fmt.Errorf( + "length %d exceeded maximum possible value of %d bytes", + length, + maxMsgSizeBytes)} + } + + data := make([]byte, length) + n, err := dec.rd.Read(data) + if err != nil { + return nil, DataCorruptionError{fmt.Errorf("failed to read data: %v (read: %d, wanted: %d)", err, n, length)} + } + + // check checksum before decoding data + actualCRC := crc32.Checksum(data, crc32c) + if actualCRC != crc { + return nil, DataCorruptionError{fmt.Errorf("checksums do not match: read: %v, actual: %v", crc, actualCRC)} + } + + var res = new(tmcons.TimedWALMessage) + err = proto.Unmarshal(data, res) + if err != nil { + return nil, DataCorruptionError{fmt.Errorf("failed to decode data: %v", err)} + } + + walMsg, err := WALFromProto(res.Msg) + if err != nil { + return nil, DataCorruptionError{fmt.Errorf("failed to convert from proto: %w", err)} + } + tMsgWal := &TimedWALMessage{ + Time: res.Time, + Msg: walMsg, + } + + return tMsgWal, err +} + +type nilWAL struct{} + +var _ WAL = nilWAL{} + +func (nilWAL) Write(m WALMessage) error { return nil } +func (nilWAL) WriteSync(m WALMessage) error { return nil } +func (nilWAL) FlushAndSync() error { return nil } +func (nilWAL) SearchForEndHeight(height int64, options *WALSearchOptions) (rd io.ReadCloser, found bool, err error) { + return nil, false, nil +} +func (nilWAL) Start() error { return nil } +func (nilWAL) Stop() error { return nil } +func (nilWAL) Wait() {} diff --git a/test/maverick/consensus/wal_fuzz.go b/test/maverick/consensus/wal_fuzz.go new file mode 100644 index 000000000..e15097c30 --- /dev/null +++ b/test/maverick/consensus/wal_fuzz.go @@ -0,0 +1,31 @@ +// +build gofuzz + +package consensus + +import ( + "bytes" + "io" +) + +func Fuzz(data []byte) int { + dec := NewWALDecoder(bytes.NewReader(data)) + for { + msg, err := dec.Decode() + if err == io.EOF { + break + } + if err != nil { + if msg != nil { + panic("msg != nil on error") + } + return 0 + } + var w bytes.Buffer + enc := NewWALEncoder(&w) + err = enc.Encode(msg) + if err != nil { + panic(err) + } + } + return 1 +} diff --git a/test/maverick/consensus/wal_generator.go b/test/maverick/consensus/wal_generator.go new file mode 100644 index 000000000..fde9064b8 --- /dev/null +++ b/test/maverick/consensus/wal_generator.go @@ -0,0 +1,229 @@ +package consensus + +import ( + "bufio" + "bytes" + "fmt" + "io" + "path/filepath" + "testing" + "time" + + db "github.com/tendermint/tm-db" + + "github.com/tendermint/tendermint/abci/example/kvstore" + cfg "github.com/tendermint/tendermint/config" + 
"github.com/tendermint/tendermint/libs/log" + tmrand "github.com/tendermint/tendermint/libs/rand" + "github.com/tendermint/tendermint/privval" + "github.com/tendermint/tendermint/proxy" + sm "github.com/tendermint/tendermint/state" + "github.com/tendermint/tendermint/store" + "github.com/tendermint/tendermint/types" +) + +// WALGenerateNBlocks generates a consensus WAL. It does this by spinning up a +// stripped down version of node (proxy app, event bus, consensus state) with a +// persistent kvstore application and special consensus wal instance +// (byteBufferWAL) and waits until numBlocks are created. +// If the node fails to produce given numBlocks, it returns an error. +func WALGenerateNBlocks(t *testing.T, wr io.Writer, numBlocks int) (err error) { + config := getConfig(t) + + app := kvstore.NewPersistentKVStoreApplication(filepath.Join(config.DBDir(), "wal_generator")) + + logger := log.TestingLogger().With("wal_generator", "wal_generator") + logger.Info("generating WAL (last height msg excluded)", "numBlocks", numBlocks) + + // /////////////////////////////////////////////////////////////////////////// + // COPY PASTE FROM node.go WITH A FEW MODIFICATIONS + // NOTE: we can't import node package because of circular dependency. + // NOTE: we don't do handshake so need to set state.Version.Consensus.App directly. + privValidatorKeyFile := config.PrivValidatorKeyFile() + privValidatorStateFile := config.PrivValidatorStateFile() + privValidator := privval.LoadOrGenFilePV(privValidatorKeyFile, privValidatorStateFile) + genDoc, err := types.GenesisDocFromFile(config.GenesisFile()) + if err != nil { + return fmt.Errorf("failed to read genesis file: %w", err) + } + blockStoreDB := db.NewMemDB() + stateDB := blockStoreDB + stateStore := sm.NewStore(stateDB) + state, err := sm.MakeGenesisState(genDoc) + if err != nil { + return fmt.Errorf("failed to make genesis state: %w", err) + } + state.Version.Consensus.App = kvstore.ProtocolVersion + if err = stateStore.Save(state); err != nil { + t.Error(err) + } + + blockStore := store.NewBlockStore(blockStoreDB) + + proxyApp := proxy.NewAppConns(proxy.NewLocalClientCreator(app)) + proxyApp.SetLogger(logger.With("module", "proxy")) + if err := proxyApp.Start(); err != nil { + return fmt.Errorf("failed to start proxy app connections: %w", err) + } + t.Cleanup(func() { + if err := proxyApp.Stop(); err != nil { + t.Error(err) + } + }) + + eventBus := types.NewEventBus() + eventBus.SetLogger(logger.With("module", "events")) + if err := eventBus.Start(); err != nil { + return fmt.Errorf("failed to start event bus: %w", err) + } + t.Cleanup(func() { + if err := eventBus.Stop(); err != nil { + t.Error(err) + } + }) + mempool := emptyMempool{} + evpool := sm.EmptyEvidencePool{} + blockExec := sm.NewBlockExecutor(stateStore, log.TestingLogger(), proxyApp.Consensus(), mempool, evpool) + consensusState := NewState(config.Consensus, state.Copy(), + blockExec, blockStore, mempool, evpool, map[int64]Misbehavior{}) + consensusState.SetLogger(logger) + consensusState.SetEventBus(eventBus) + if privValidator != nil { + consensusState.SetPrivValidator(privValidator) + } + // END OF COPY PASTE + // /////////////////////////////////////////////////////////////////////////// + + // set consensus wal to buffered WAL, which will write all incoming msgs to buffer + numBlocksWritten := make(chan struct{}) + wal := newByteBufferWAL(logger, NewWALEncoder(wr), int64(numBlocks), numBlocksWritten) + // see wal.go#103 + if err := wal.Write(EndHeightMessage{0}); err != nil { + 
t.Error(err) + } + + consensusState.wal = wal + + if err := consensusState.Start(); err != nil { + return fmt.Errorf("failed to start consensus state: %w", err) + } + + select { + case <-numBlocksWritten: + if err := consensusState.Stop(); err != nil { + t.Error(err) + } + return nil + case <-time.After(1 * time.Minute): + if err := consensusState.Stop(); err != nil { + t.Error(err) + } + return fmt.Errorf("waited too long for tendermint to produce %d blocks (grep logs for `wal_generator`)", numBlocks) + } +} + +// WALWithNBlocks returns a WAL content with numBlocks. +func WALWithNBlocks(t *testing.T, numBlocks int) (data []byte, err error) { + var b bytes.Buffer + wr := bufio.NewWriter(&b) + + if err := WALGenerateNBlocks(t, wr, numBlocks); err != nil { + return []byte{}, err + } + + wr.Flush() + return b.Bytes(), nil +} + +func randPort() int { + // returns between base and base + spread + base, spread := 20000, 20000 + return base + tmrand.Intn(spread) +} + +func makeAddrs() (string, string, string) { + start := randPort() + return fmt.Sprintf("tcp://127.0.0.1:%d", start), + fmt.Sprintf("tcp://127.0.0.1:%d", start+1), + fmt.Sprintf("tcp://127.0.0.1:%d", start+2) +} + +// getConfig returns a config for test cases +func getConfig(t *testing.T) *cfg.Config { + c := cfg.ResetTestRoot(t.Name()) + + // and we use random ports to run in parallel + tm, rpc, grpc := makeAddrs() + c.P2P.ListenAddress = tm + c.RPC.ListenAddress = rpc + c.RPC.GRPCListenAddress = grpc + return c +} + +// byteBufferWAL is a WAL which writes all msgs to a byte buffer. Writing stops +// when the heightToStop is reached. Client will be notified via +// signalWhenStopsTo channel. +type byteBufferWAL struct { + enc *WALEncoder + stopped bool + heightToStop int64 + signalWhenStopsTo chan<- struct{} + + logger log.Logger +} + +// needed for determinism +var fixedTime, _ = time.Parse(time.RFC3339, "2017-01-02T15:04:05Z") + +func newByteBufferWAL(logger log.Logger, enc *WALEncoder, nBlocks int64, signalStop chan<- struct{}) *byteBufferWAL { + return &byteBufferWAL{ + enc: enc, + heightToStop: nBlocks, + signalWhenStopsTo: signalStop, + logger: logger, + } +} + +// Save writes message to the internal buffer except when heightToStop is +// reached, in which case it will signal the caller via signalWhenStopsTo and +// skip writing. +func (w *byteBufferWAL) Write(m WALMessage) error { + if w.stopped { + w.logger.Debug("WAL already stopped. 
Not writing message", "msg", m) + return nil + } + + if endMsg, ok := m.(EndHeightMessage); ok { + w.logger.Debug("WAL write end height message", "height", endMsg.Height, "stopHeight", w.heightToStop) + if endMsg.Height == w.heightToStop { + w.logger.Debug("Stopping WAL at height", "height", endMsg.Height) + w.signalWhenStopsTo <- struct{}{} + w.stopped = true + return nil + } + } + + w.logger.Debug("WAL Write Message", "msg", m) + err := w.enc.Encode(&TimedWALMessage{fixedTime, m}) + if err != nil { + panic(fmt.Sprintf("failed to encode the msg %v", m)) + } + + return nil +} + +func (w *byteBufferWAL) WriteSync(m WALMessage) error { + return w.Write(m) +} + +func (w *byteBufferWAL) FlushAndSync() error { return nil } + +func (w *byteBufferWAL) SearchForEndHeight( + height int64, + options *WALSearchOptions) (rd io.ReadCloser, found bool, err error) { + return nil, false, nil +} + +func (w *byteBufferWAL) Start() error { return nil } +func (w *byteBufferWAL) Stop() error { return nil } +func (w *byteBufferWAL) Wait() {} diff --git a/test/maverick/main.go b/test/maverick/main.go new file mode 100644 index 000000000..6a337b3fd --- /dev/null +++ b/test/maverick/main.go @@ -0,0 +1,237 @@ +package main + +import ( + "fmt" + "os" + "path/filepath" + + "github.com/spf13/cobra" + "github.com/spf13/viper" + + cmd "github.com/tendermint/tendermint/cmd/tendermint/commands" + "github.com/tendermint/tendermint/cmd/tendermint/commands/debug" + cfg "github.com/tendermint/tendermint/config" + "github.com/tendermint/tendermint/libs/cli" + tmflags "github.com/tendermint/tendermint/libs/cli/flags" + "github.com/tendermint/tendermint/libs/log" + tmos "github.com/tendermint/tendermint/libs/os" + tmrand "github.com/tendermint/tendermint/libs/rand" + "github.com/tendermint/tendermint/p2p" + cs "github.com/tendermint/tendermint/test/maverick/consensus" + nd "github.com/tendermint/tendermint/test/maverick/node" + "github.com/tendermint/tendermint/types" + tmtime "github.com/tendermint/tendermint/types/time" +) + +var ( + config = cfg.DefaultConfig() + logger = log.NewTMLogger(log.NewSyncWriter(os.Stdout)) + misbehaviorFlag = "" +) + +func init() { + registerFlagsRootCmd(RootCmd) +} + +func registerFlagsRootCmd(command *cobra.Command) { + command.PersistentFlags().String("log_level", config.LogLevel, "Log level") +} + +func ParseConfig() (*cfg.Config, error) { + conf := cfg.DefaultConfig() + err := viper.Unmarshal(conf) + if err != nil { + return nil, err + } + conf.SetRoot(conf.RootDir) + cfg.EnsureRoot(conf.RootDir) + if err = conf.ValidateBasic(); err != nil { + return nil, fmt.Errorf("error in config file: %v", err) + } + return conf, err +} + +// RootCmd is the root command for Tendermint core. +var RootCmd = &cobra.Command{ + Use: "maverick", + Short: "Tendermint Maverick Node", + Long: "Tendermint Maverick Node for testing with faulty consensus misbehaviors in a testnet. Contains " + + "all the functionality of a normal node but custom misbehaviors can be injected when running the node " + + "through a flag. 
See maverick node --help for how the misbehavior flag is constructured", + PersistentPreRunE: func(cmd *cobra.Command, args []string) (err error) { + fmt.Printf("use: %v, args: %v", cmd.Use, cmd.Args) + config, err = ParseConfig() + if err != nil { + return err + } + if config.LogFormat == cfg.LogFormatJSON { + logger = log.NewTMJSONLogger(log.NewSyncWriter(os.Stdout)) + } + logger, err = tmflags.ParseLogLevel(config.LogLevel, logger, cfg.DefaultLogLevel()) + if err != nil { + return err + } + if viper.GetBool(cli.TraceFlag) { + logger = log.NewTracingLogger(logger) + } + logger = logger.With("module", "main") + return nil + }, +} + +func main() { + rootCmd := RootCmd + rootCmd.AddCommand( + ListMisbehaviorCmd, + cmd.GenValidatorCmd, + InitFilesCmd, + cmd.ProbeUpnpCmd, + cmd.ReplayCmd, + cmd.ReplayConsoleCmd, + cmd.ResetAllCmd, + cmd.ResetPrivValidatorCmd, + cmd.ShowValidatorCmd, + cmd.ShowNodeIDCmd, + cmd.GenNodeKeyCmd, + cmd.VersionCmd, + debug.DebugCmd, + cli.NewCompletionCmd(rootCmd, true), + ) + + nodeCmd := &cobra.Command{ + Use: "node", + Short: "Run the maverick node", + RunE: func(command *cobra.Command, args []string) error { + return startNode(config, logger, misbehaviorFlag) + }, + } + + cmd.AddNodeFlags(nodeCmd) + + // Create & start node + rootCmd.AddCommand(nodeCmd) + + // add special flag for misbehaviors + nodeCmd.Flags().StringVar( + &misbehaviorFlag, + "misbehaviors", + "", + "Select the misbehaviors of the node (comma-separated, no spaces in between): \n"+ + "e.g. --misbehaviors double-prevote,3\n"+ + "You can also have multiple misbehaviors: e.g. double-prevote,3,no-vote,5") + + cmd := cli.PrepareBaseCmd(rootCmd, "TM", os.ExpandEnv(filepath.Join("$HOME", cfg.DefaultTendermintDir))) + if err := cmd.Execute(); err != nil { + panic(err) + } +} + +func startNode(config *cfg.Config, logger log.Logger, misbehaviorFlag string) error { + misbehaviors, err := nd.ParseMisbehaviors(misbehaviorFlag) + if err != nil { + return err + } + + node, err := nd.DefaultNewNode(config, logger, misbehaviors) + if err != nil { + return fmt.Errorf("failed to create node: %w", err) + } + + if err := node.Start(); err != nil { + return fmt.Errorf("failed to start node: %w", err) + } + + logger.Info("Started node", "nodeInfo", node.Switch().NodeInfo()) + + // Stop upon receiving SIGTERM or CTRL-C. + tmos.TrapSignal(logger, func() { + if node.IsRunning() { + if err := node.Stop(); err != nil { + logger.Error("unable to stop the node", "error", err) + } + } + }) + + // Run forever. 
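+ // (tmos.TrapSignal above terminates the process after the stop callback runs,
+ // so this empty select only needs to block the main goroutine.)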
+ select {} +} + +var InitFilesCmd = &cobra.Command{ + Use: "init", + Short: "Initialize Tendermint", + RunE: initFiles, +} + +func initFiles(cmd *cobra.Command, args []string) error { + return initFilesWithConfig(config) +} + +func initFilesWithConfig(config *cfg.Config) error { + // private validator + privValKeyFile := config.PrivValidatorKeyFile() + privValStateFile := config.PrivValidatorStateFile() + var pv *nd.FilePV + if tmos.FileExists(privValKeyFile) { + pv = nd.LoadFilePV(privValKeyFile, privValStateFile) + logger.Info("Found private validator", "keyFile", privValKeyFile, + "stateFile", privValStateFile) + } else { + pv = nd.GenFilePV(privValKeyFile, privValStateFile) + pv.Save() + logger.Info("Generated private validator", "keyFile", privValKeyFile, + "stateFile", privValStateFile) + } + + nodeKeyFile := config.NodeKeyFile() + if tmos.FileExists(nodeKeyFile) { + logger.Info("Found node key", "path", nodeKeyFile) + } else { + if _, err := p2p.LoadOrGenNodeKey(nodeKeyFile); err != nil { + return err + } + logger.Info("Generated node key", "path", nodeKeyFile) + } + + // genesis file + genFile := config.GenesisFile() + if tmos.FileExists(genFile) { + logger.Info("Found genesis file", "path", genFile) + } else { + genDoc := types.GenesisDoc{ + ChainID: fmt.Sprintf("test-chain-%v", tmrand.Str(6)), + GenesisTime: tmtime.Now(), + ConsensusParams: types.DefaultConsensusParams(), + } + pubKey, err := pv.GetPubKey() + if err != nil { + return fmt.Errorf("can't get pubkey: %w", err) + } + genDoc.Validators = []types.GenesisValidator{{ + Address: pubKey.Address(), + PubKey: pubKey, + Power: 10, + }} + + if err := genDoc.SaveAs(genFile); err != nil { + return err + } + logger.Info("Generated genesis file", "path", genFile) + } + + return nil +} + +var ListMisbehaviorCmd = &cobra.Command{ + Use: "misbehaviors", + Short: "Lists possible misbehaviors", + RunE: listMisbehaviors, +} + +func listMisbehaviors(cmd *cobra.Command, args []string) error { + str := "Currently registered misbehaviors: \n" + for key := range cs.MisbehaviorList { + str += fmt.Sprintf("- %s\n", key) + } + fmt.Println(str) + return nil +} diff --git a/test/maverick/node/node.go b/test/maverick/node/node.go new file mode 100644 index 000000000..e1f41b6fb --- /dev/null +++ b/test/maverick/node/node.go @@ -0,0 +1,1440 @@ +package node + +import ( + "bytes" + "context" + "errors" + "fmt" + "net" + "net/http" + _ "net/http/pprof" // nolint: gosec // securely exposed on separate, optional port + "strconv" + "strings" + "time" + + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/promhttp" + "github.com/rs/cors" + + dbm "github.com/tendermint/tm-db" + + abci "github.com/tendermint/tendermint/abci/types" + bcv0 "github.com/tendermint/tendermint/blockchain/v0" + bcv1 "github.com/tendermint/tendermint/blockchain/v1" + bcv2 "github.com/tendermint/tendermint/blockchain/v2" + cfg "github.com/tendermint/tendermint/config" + "github.com/tendermint/tendermint/consensus" + "github.com/tendermint/tendermint/crypto" + "github.com/tendermint/tendermint/evidence" + tmjson "github.com/tendermint/tendermint/libs/json" + "github.com/tendermint/tendermint/libs/log" + tmpubsub "github.com/tendermint/tendermint/libs/pubsub" + "github.com/tendermint/tendermint/libs/service" + "github.com/tendermint/tendermint/light" + mempl "github.com/tendermint/tendermint/mempool" + "github.com/tendermint/tendermint/p2p" + "github.com/tendermint/tendermint/p2p/pex" + "github.com/tendermint/tendermint/privval" + 
"github.com/tendermint/tendermint/proxy" + rpccore "github.com/tendermint/tendermint/rpc/core" + grpccore "github.com/tendermint/tendermint/rpc/grpc" + rpcserver "github.com/tendermint/tendermint/rpc/jsonrpc/server" + sm "github.com/tendermint/tendermint/state" + "github.com/tendermint/tendermint/state/txindex" + "github.com/tendermint/tendermint/state/txindex/kv" + "github.com/tendermint/tendermint/state/txindex/null" + "github.com/tendermint/tendermint/statesync" + "github.com/tendermint/tendermint/store" + cs "github.com/tendermint/tendermint/test/maverick/consensus" + "github.com/tendermint/tendermint/types" + tmtime "github.com/tendermint/tendermint/types/time" + "github.com/tendermint/tendermint/version" +) + +//------------------------------------------------------------------------------ + +// ParseMisbehaviors is a util function that converts a comma separated string into +// a map of misbehaviors to be executed by the maverick node +func ParseMisbehaviors(str string) (map[int64]cs.Misbehavior, error) { + // check if string is empty in which case we run a normal node + var misbehaviors = make(map[int64]cs.Misbehavior) + if str == "" { + return misbehaviors, nil + } + strs := strings.Split(str, ",") + if len(strs)%2 != 0 { + return misbehaviors, errors.New("missing either height or misbehavior name in the misbehavior flag") + } +OUTER_LOOP: + for i := 0; i < len(strs); i += 2 { + height, err := strconv.ParseInt(strs[i+1], 10, 64) + if err != nil { + return misbehaviors, fmt.Errorf("failed to parse misbehavior height: %w", err) + } + for key, misbehavior := range cs.MisbehaviorList { + if key == strs[i] { + misbehaviors[height] = misbehavior + continue OUTER_LOOP + } + } + return misbehaviors, fmt.Errorf("received unknown misbehavior: %s. Did you forget to add it?", strs[i]) + } + + return misbehaviors, nil +} + +// DBContext specifies config information for loading a new DB. +type DBContext struct { + ID string + Config *cfg.Config +} + +// DBProvider takes a DBContext and returns an instantiated DB. +type DBProvider func(*DBContext) (dbm.DB, error) + +// DefaultDBProvider returns a database using the DBBackend and DBDir +// specified in the ctx.Config. +func DefaultDBProvider(ctx *DBContext) (dbm.DB, error) { + dbType := dbm.BackendType(ctx.Config.DBBackend) + return dbm.NewDB(ctx.ID, dbType, ctx.Config.DBDir()) +} + +// GenesisDocProvider returns a GenesisDoc. +// It allows the GenesisDoc to be pulled from sources other than the +// filesystem, for instance from a distributed key-value store cluster. +type GenesisDocProvider func() (*types.GenesisDoc, error) + +// DefaultGenesisDocProviderFunc returns a GenesisDocProvider that loads +// the GenesisDoc from the config.GenesisFile() on the filesystem. +func DefaultGenesisDocProviderFunc(config *cfg.Config) GenesisDocProvider { + return func() (*types.GenesisDoc, error) { + return types.GenesisDocFromFile(config.GenesisFile()) + } +} + +// Provider takes a config and a logger and returns a ready to go Node. +type Provider func(*cfg.Config, log.Logger) (*Node, error) + +// DefaultNewNode returns a Tendermint node with default settings for the +// PrivValidator, ClientCreator, GenesisDoc, and DBProvider. +// It implements NodeProvider. 
+func DefaultNewNode(config *cfg.Config, logger log.Logger, misbehaviors map[int64]cs.Misbehavior) (*Node, error) { + nodeKey, err := p2p.LoadOrGenNodeKey(config.NodeKeyFile()) + if err != nil { + return nil, fmt.Errorf("failed to load or gen node key %s, err: %w", config.NodeKeyFile(), err) + } + + return NewNode(config, + LoadOrGenFilePV(config.PrivValidatorKeyFile(), config.PrivValidatorStateFile()), + nodeKey, + proxy.DefaultClientCreator(config.ProxyApp, config.ABCI, config.DBDir()), + DefaultGenesisDocProviderFunc(config), + DefaultDBProvider, + DefaultMetricsProvider(config.Instrumentation), + logger, + misbehaviors, + ) + +} + +// MetricsProvider returns a consensus, p2p and mempool Metrics. +type MetricsProvider func(chainID string) (*cs.Metrics, *p2p.Metrics, *mempl.Metrics, *sm.Metrics) + +// DefaultMetricsProvider returns Metrics build using Prometheus client library +// if Prometheus is enabled. Otherwise, it returns no-op Metrics. +func DefaultMetricsProvider(config *cfg.InstrumentationConfig) MetricsProvider { + return func(chainID string) (*cs.Metrics, *p2p.Metrics, *mempl.Metrics, *sm.Metrics) { + if config.Prometheus { + return cs.PrometheusMetrics(config.Namespace, "chain_id", chainID), + p2p.PrometheusMetrics(config.Namespace, "chain_id", chainID), + mempl.PrometheusMetrics(config.Namespace, "chain_id", chainID), + sm.PrometheusMetrics(config.Namespace, "chain_id", chainID) + } + return cs.NopMetrics(), p2p.NopMetrics(), mempl.NopMetrics(), sm.NopMetrics() + } +} + +// Option sets a parameter for the node. +type Option func(*Node) + +// Temporary interface for switching to fast sync, we should get rid of v0 and v1 reactors. +// See: https://github.com/tendermint/tendermint/issues/4595 +type fastSyncReactor interface { + SwitchToFastSync(sm.State) error +} + +// CustomReactors allows you to add custom reactors (name -> p2p.Reactor) to +// the node's Switch. +// +// WARNING: using any name from the below list of the existing reactors will +// result in replacing it with the custom one. +// +// - MEMPOOL +// - BLOCKCHAIN +// - CONSENSUS +// - EVIDENCE +// - PEX +// - STATESYNC +func CustomReactors(reactors map[string]p2p.Reactor) Option { + return func(n *Node) { + for name, reactor := range reactors { + if existingReactor := n.sw.Reactor(name); existingReactor != nil { + n.sw.Logger.Info("Replacing existing reactor with a custom one", + "name", name, "existing", existingReactor, "custom", reactor) + n.sw.RemoveReactor(name, existingReactor) + } + n.sw.AddReactor(name, reactor) + } + } +} + +func CustomReactorsAsConstructors(reactors map[string]func(n *Node) p2p.Reactor) Option { + return func(n *Node) { + for name, customReactor := range reactors { + if existingReactor := n.sw.Reactor(name); existingReactor != nil { + n.sw.Logger.Info("Replacing existing reactor with a custom one", + "name", name) + n.sw.RemoveReactor(name, existingReactor) + } + n.sw.AddReactor(name, customReactor(n)) + } + } +} + +// StateProvider overrides the state provider used by state sync to retrieve trusted app hashes and +// build a State object for bootstrapping the node. +// WARNING: this interface is considered unstable and subject to change. +func StateProvider(stateProvider statesync.StateProvider) Option { + return func(n *Node) { + n.stateSyncProvider = stateProvider + } +} + +//------------------------------------------------------------------------------ + +// Node is the highest level interface to a full Tendermint node. 
+// It includes all configuration information and running services. +type Node struct { + service.BaseService + + // config + config *cfg.Config + genesisDoc *types.GenesisDoc // initial validator set + privValidator types.PrivValidator // local node's validator key + + // network + transport *p2p.MultiplexTransport + sw *p2p.Switch // p2p connections + addrBook pex.AddrBook // known peers + nodeInfo p2p.NodeInfo + nodeKey *p2p.NodeKey // our node privkey + isListening bool + + // services + eventBus *types.EventBus // pub/sub for services + stateStore sm.Store + blockStore *store.BlockStore // store the blockchain to disk + bcReactor p2p.Reactor // for fast-syncing + mempoolReactor *mempl.Reactor // for gossipping transactions + mempool mempl.Mempool + stateSync bool // whether the node should state sync on startup + stateSyncReactor *statesync.Reactor // for hosting and restoring state sync snapshots + stateSyncProvider statesync.StateProvider // provides state data for bootstrapping a node + stateSyncGenesis sm.State // provides the genesis state for state sync + consensusState *cs.State // latest consensus state + consensusReactor *cs.Reactor // for participating in the consensus + pexReactor *pex.Reactor // for exchanging peer addresses + evidencePool *evidence.Pool // tracking evidence + proxyApp proxy.AppConns // connection to the application + rpcListeners []net.Listener // rpc servers + txIndexer txindex.TxIndexer + indexerService *txindex.IndexerService + prometheusSrv *http.Server +} + +func initDBs(config *cfg.Config, dbProvider DBProvider) (blockStore *store.BlockStore, stateDB dbm.DB, err error) { + var blockStoreDB dbm.DB + blockStoreDB, err = dbProvider(&DBContext{"blockstore", config}) + if err != nil { + return + } + blockStore = store.NewBlockStore(blockStoreDB) + + stateDB, err = dbProvider(&DBContext{"state", config}) + if err != nil { + return + } + + return +} + +func createAndStartProxyAppConns(clientCreator proxy.ClientCreator, logger log.Logger) (proxy.AppConns, error) { + proxyApp := proxy.NewAppConns(clientCreator) + proxyApp.SetLogger(logger.With("module", "proxy")) + if err := proxyApp.Start(); err != nil { + return nil, fmt.Errorf("error starting proxy app connections: %v", err) + } + return proxyApp, nil +} + +func createAndStartEventBus(logger log.Logger) (*types.EventBus, error) { + eventBus := types.NewEventBus() + eventBus.SetLogger(logger.With("module", "events")) + if err := eventBus.Start(); err != nil { + return nil, err + } + return eventBus, nil +} + +func createAndStartIndexerService(config *cfg.Config, dbProvider DBProvider, + eventBus *types.EventBus, logger log.Logger) (*txindex.IndexerService, txindex.TxIndexer, error) { + + var txIndexer txindex.TxIndexer + switch config.TxIndex.Indexer { + case "kv": + store, err := dbProvider(&DBContext{"tx_index", config}) + if err != nil { + return nil, nil, err + } + txIndexer = kv.NewTxIndex(store) + default: + txIndexer = &null.TxIndex{} + } + + indexerService := txindex.NewIndexerService(txIndexer, eventBus) + indexerService.SetLogger(logger.With("module", "txindex")) + if err := indexerService.Start(); err != nil { + return nil, nil, err + } + return indexerService, txIndexer, nil +} + +func doHandshake( + stateStore sm.Store, + state sm.State, + blockStore sm.BlockStore, + genDoc *types.GenesisDoc, + eventBus types.BlockEventPublisher, + proxyApp proxy.AppConns, + consensusLogger log.Logger) error { + + handshaker := cs.NewHandshaker(stateStore, state, blockStore, genDoc) + 
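+	// The maverick consensus package ships its own Handshaker; like the stock one, it
+	// queries the app with Info and replays any missing blocks so the ABCI app catches
+	// up with the block store before consensus starts.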
handshaker.SetLogger(consensusLogger) + handshaker.SetEventBus(eventBus) + if err := handshaker.Handshake(proxyApp); err != nil { + return fmt.Errorf("error during handshake: %v", err) + } + return nil +} + +func logNodeStartupInfo(state sm.State, pubKey crypto.PubKey, logger, consensusLogger log.Logger) { + // Log the version info. + logger.Info("Version info", + "software", version.TMCoreSemVer, + "block", version.BlockProtocol, + "p2p", version.P2PProtocol, + ) + + // If the state and software differ in block version, at least log it. + if state.Version.Consensus.Block != version.BlockProtocol { + logger.Info("Software and state have different block protocols", + "software", version.BlockProtocol, + "state", state.Version.Consensus.Block, + ) + } + + addr := pubKey.Address() + // Log whether this node is a validator or an observer + if state.Validators.HasAddress(addr) { + consensusLogger.Info("This node is a validator", "addr", addr, "pubKey", pubKey) + } else { + consensusLogger.Info("This node is not a validator", "addr", addr, "pubKey", pubKey) + } +} + +func onlyValidatorIsUs(state sm.State, pubKey crypto.PubKey) bool { + if state.Validators.Size() > 1 { + return false + } + addr, _ := state.Validators.GetByIndex(0) + return bytes.Equal(pubKey.Address(), addr) +} + +func createMempoolAndMempoolReactor(config *cfg.Config, proxyApp proxy.AppConns, + state sm.State, memplMetrics *mempl.Metrics, logger log.Logger) (*mempl.Reactor, *mempl.CListMempool) { + + mempool := mempl.NewCListMempool( + config.Mempool, + proxyApp.Mempool(), + state.LastBlockHeight, + mempl.WithMetrics(memplMetrics), + mempl.WithPreCheck(sm.TxPreCheck(state)), + mempl.WithPostCheck(sm.TxPostCheck(state)), + ) + mempoolLogger := logger.With("module", "mempool") + mempoolReactor := mempl.NewReactor(config.Mempool, mempool) + mempoolReactor.SetLogger(mempoolLogger) + + if config.Consensus.WaitForTxs() { + mempool.EnableTxsAvailable() + } + return mempoolReactor, mempool +} + +func createEvidenceReactor(config *cfg.Config, dbProvider DBProvider, + stateDB dbm.DB, blockStore *store.BlockStore, logger log.Logger) (*evidence.Reactor, *evidence.Pool, error) { + + evidenceDB, err := dbProvider(&DBContext{"evidence", config}) + if err != nil { + return nil, nil, err + } + evidenceLogger := logger.With("module", "evidence") + evidencePool, err := evidence.NewPool(evidenceDB, sm.NewStore(stateDB), blockStore) + if err != nil { + return nil, nil, err + } + evidenceReactor := evidence.NewReactor(evidencePool) + evidenceReactor.SetLogger(evidenceLogger) + return evidenceReactor, evidencePool, nil +} + +func createBlockchainReactor(config *cfg.Config, + state sm.State, + blockExec *sm.BlockExecutor, + blockStore *store.BlockStore, + fastSync bool, + logger log.Logger) (bcReactor p2p.Reactor, err error) { + + switch config.FastSync.Version { + case "v0": + bcReactor = bcv0.NewBlockchainReactor(state.Copy(), blockExec, blockStore, fastSync) + case "v1": + bcReactor = bcv1.NewBlockchainReactor(state.Copy(), blockExec, blockStore, fastSync) + case "v2": + bcReactor = bcv2.NewBlockchainReactor(state.Copy(), blockExec, blockStore, fastSync) + default: + return nil, fmt.Errorf("unknown fastsync version %s", config.FastSync.Version) + } + + bcReactor.SetLogger(logger.With("module", "blockchain")) + return bcReactor, nil +} + +func createConsensusReactor(config *cfg.Config, + state sm.State, + blockExec *sm.BlockExecutor, + blockStore sm.BlockStore, + mempool *mempl.CListMempool, + evidencePool *evidence.Pool, + privValidator 
types.PrivValidator, + csMetrics *cs.Metrics, + waitSync bool, + eventBus *types.EventBus, + consensusLogger log.Logger, + misbehaviors map[int64]cs.Misbehavior) (*cs.Reactor, *cs.State) { + + consensusState := cs.NewState( + config.Consensus, + state.Copy(), + blockExec, + blockStore, + mempool, + evidencePool, + misbehaviors, + cs.StateMetrics(csMetrics), + ) + consensusState.SetLogger(consensusLogger) + if privValidator != nil { + consensusState.SetPrivValidator(privValidator) + } + consensusReactor := cs.NewReactor(consensusState, waitSync, cs.ReactorMetrics(csMetrics)) + consensusReactor.SetLogger(consensusLogger) + // services which will be publishing and/or subscribing for messages (events) + // consensusReactor will set it on consensusState and blockExecutor + consensusReactor.SetEventBus(eventBus) + return consensusReactor, consensusState +} + +func createTransport( + config *cfg.Config, + nodeInfo p2p.NodeInfo, + nodeKey *p2p.NodeKey, + proxyApp proxy.AppConns, +) ( + *p2p.MultiplexTransport, + []p2p.PeerFilterFunc, +) { + var ( + mConnConfig = p2p.MConnConfig(config.P2P) + transport = p2p.NewMultiplexTransport(nodeInfo, *nodeKey, mConnConfig) + connFilters = []p2p.ConnFilterFunc{} + peerFilters = []p2p.PeerFilterFunc{} + ) + + if !config.P2P.AllowDuplicateIP { + connFilters = append(connFilters, p2p.ConnDuplicateIPFilter()) + } + + // Filter peers by addr or pubkey with an ABCI query. + // If the query return code is OK, add peer. + if config.FilterPeers { + connFilters = append( + connFilters, + // ABCI query for address filtering. + func(_ p2p.ConnSet, c net.Conn, _ []net.IP) error { + res, err := proxyApp.Query().QuerySync(abci.RequestQuery{ + Path: fmt.Sprintf("/p2p/filter/addr/%s", c.RemoteAddr().String()), + }) + if err != nil { + return err + } + if res.IsErr() { + return fmt.Errorf("error querying abci app: %v", res) + } + + return nil + }, + ) + + peerFilters = append( + peerFilters, + // ABCI query for ID filtering. + func(_ p2p.IPeerSet, p p2p.Peer) error { + res, err := proxyApp.Query().QuerySync(abci.RequestQuery{ + Path: fmt.Sprintf("/p2p/filter/id/%s", p.ID()), + }) + if err != nil { + return err + } + if res.IsErr() { + return fmt.Errorf("error querying abci app: %v", res) + } + + return nil + }, + ) + } + + p2p.MultiplexTransportConnFilters(connFilters...)(transport) + + // Limit the number of incoming connections. 
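+	// Unconditional peer IDs are exempt from the inbound limit, so the cap below is the
+	// configured maximum plus the number of unconditional peers.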
+ max := config.P2P.MaxNumInboundPeers + len(splitAndTrimEmpty(config.P2P.UnconditionalPeerIDs, ",", " ")) + p2p.MultiplexTransportMaxIncomingConnections(max)(transport) + + return transport, peerFilters +} + +func createSwitch(config *cfg.Config, + transport p2p.Transport, + p2pMetrics *p2p.Metrics, + peerFilters []p2p.PeerFilterFunc, + mempoolReactor *mempl.Reactor, + bcReactor p2p.Reactor, + stateSyncReactor *statesync.Reactor, + consensusReactor *cs.Reactor, + evidenceReactor *evidence.Reactor, + nodeInfo p2p.NodeInfo, + nodeKey *p2p.NodeKey, + p2pLogger log.Logger) *p2p.Switch { + + sw := p2p.NewSwitch( + config.P2P, + transport, + p2p.WithMetrics(p2pMetrics), + p2p.SwitchPeerFilters(peerFilters...), + ) + sw.SetLogger(p2pLogger) + sw.AddReactor("MEMPOOL", mempoolReactor) + sw.AddReactor("BLOCKCHAIN", bcReactor) + sw.AddReactor("CONSENSUS", consensusReactor) + sw.AddReactor("EVIDENCE", evidenceReactor) + sw.AddReactor("STATESYNC", stateSyncReactor) + + sw.SetNodeInfo(nodeInfo) + sw.SetNodeKey(nodeKey) + + p2pLogger.Info("P2P Node ID", "ID", nodeKey.ID(), "file", config.NodeKeyFile()) + return sw +} + +func createAddrBookAndSetOnSwitch(config *cfg.Config, sw *p2p.Switch, + p2pLogger log.Logger, nodeKey *p2p.NodeKey) (pex.AddrBook, error) { + + addrBook := pex.NewAddrBook(config.P2P.AddrBookFile(), config.P2P.AddrBookStrict) + addrBook.SetLogger(p2pLogger.With("book", config.P2P.AddrBookFile())) + + // Add ourselves to addrbook to prevent dialing ourselves + if config.P2P.ExternalAddress != "" { + addr, err := p2p.NewNetAddressString(p2p.IDAddressString(nodeKey.ID(), config.P2P.ExternalAddress)) + if err != nil { + return nil, fmt.Errorf("p2p.external_address is incorrect: %w", err) + } + addrBook.AddOurAddress(addr) + } + if config.P2P.ListenAddress != "" { + addr, err := p2p.NewNetAddressString(p2p.IDAddressString(nodeKey.ID(), config.P2P.ListenAddress)) + if err != nil { + return nil, fmt.Errorf("p2p.laddr is incorrect: %w", err) + } + addrBook.AddOurAddress(addr) + } + + sw.SetAddrBook(addrBook) + + return addrBook, nil +} + +func createPEXReactorAndAddToSwitch(addrBook pex.AddrBook, config *cfg.Config, + sw *p2p.Switch, logger log.Logger) *pex.Reactor { + + // TODO persistent peers ? so we can have their DNS addrs saved + pexReactor := pex.NewReactor(addrBook, + &pex.ReactorConfig{ + Seeds: splitAndTrimEmpty(config.P2P.Seeds, ",", " "), + SeedMode: config.P2P.SeedMode, + // See consensus/reactor.go: blocksToContributeToBecomeGoodPeer 10000 + // blocks assuming 10s blocks ~ 28 hours. + // TODO (melekes): make it dynamic based on the actual block latencies + // from the live network. + // https://github.com/tendermint/tendermint/issues/3523 + SeedDisconnectWaitPeriod: 28 * time.Hour, + PersistentPeersMaxDialPeriod: config.P2P.PersistentPeersMaxDialPeriod, + }) + pexReactor.SetLogger(logger.With("module", "pex")) + sw.AddReactor("PEX", pexReactor) + return pexReactor +} + +// startStateSync starts an asynchronous state sync process, then switches to fast sync mode. 
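+// On success it bootstraps the state store with the synced state, saves the matching
+// seen commit to the block store, and then either switches to fast sync or hands
+// control straight to the consensus reactor.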
+func startStateSync(ssR *statesync.Reactor, bcR fastSyncReactor, conR *cs.Reactor, + stateProvider statesync.StateProvider, config *cfg.StateSyncConfig, fastSync bool, + stateStore sm.Store, blockStore *store.BlockStore, state sm.State) error { + ssR.Logger.Info("Starting state sync") + + if stateProvider == nil { + var err error + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + stateProvider, err = statesync.NewLightClientStateProvider( + ctx, + state.ChainID, state.Version, state.InitialHeight, + config.RPCServers, light.TrustOptions{ + Period: config.TrustPeriod, + Height: config.TrustHeight, + Hash: config.TrustHashBytes(), + }, ssR.Logger.With("module", "light")) + if err != nil { + return fmt.Errorf("failed to set up light client state provider: %w", err) + } + } + + go func() { + state, commit, err := ssR.Sync(stateProvider, config.DiscoveryTime) + if err != nil { + ssR.Logger.Error("State sync failed", "err", err) + return + } + err = stateStore.Bootstrap(state) + if err != nil { + ssR.Logger.Error("Failed to bootstrap node with new state", "err", err) + return + } + err = blockStore.SaveSeenCommit(state.LastBlockHeight, commit) + if err != nil { + ssR.Logger.Error("Failed to store last seen commit", "err", err) + return + } + + if fastSync { + // FIXME Very ugly to have these metrics bleed through here. + conR.Metrics.StateSyncing.Set(0) + conR.Metrics.FastSyncing.Set(1) + err = bcR.SwitchToFastSync(state) + if err != nil { + ssR.Logger.Error("Failed to switch to fast sync", "err", err) + return + } + } else { + conR.SwitchToConsensus(state, true) + } + }() + return nil +} + +// NewNode returns a new, ready to go, Tendermint Node. +func NewNode(config *cfg.Config, + privValidator types.PrivValidator, + nodeKey *p2p.NodeKey, + clientCreator proxy.ClientCreator, + genesisDocProvider GenesisDocProvider, + dbProvider DBProvider, + metricsProvider MetricsProvider, + logger log.Logger, + misbehaviors map[int64]cs.Misbehavior, + options ...Option) (*Node, error) { + + blockStore, stateDB, err := initDBs(config, dbProvider) + if err != nil { + return nil, err + } + + stateStore := sm.NewStore(stateDB) + + state, genDoc, err := LoadStateFromDBOrGenesisDocProvider(stateDB, genesisDocProvider) + if err != nil { + return nil, err + } + + // Create the proxyApp and establish connections to the ABCI app (consensus, mempool, query). + proxyApp, err := createAndStartProxyAppConns(clientCreator, logger) + if err != nil { + return nil, err + } + + // EventBus and IndexerService must be started before the handshake because + // we might need to index the txs of the replayed block as this might not have happened + // when the node stopped last time (i.e. the node stopped after it saved the block + // but before it indexed the txs, or, endblocker panicked) + eventBus, err := createAndStartEventBus(logger) + if err != nil { + return nil, err + } + + // Transaction indexing + indexerService, txIndexer, err := createAndStartIndexerService(config, dbProvider, eventBus, logger) + if err != nil { + return nil, err + } + + // If an address is provided, listen on the socket for a connection from an + // external signing process. 
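+	// (The address comes from priv_validator_laddr in config.toml, e.g. "tcp://0.0.0.0:26659";
+	// the value shown here is only an illustration.)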
+ if config.PrivValidatorListenAddr != "" { + // FIXME: we should start services inside OnStart + privValidator, err = createAndStartPrivValidatorSocketClient(config.PrivValidatorListenAddr, genDoc.ChainID, logger) + if err != nil { + return nil, fmt.Errorf("error with private validator socket client: %w", err) + } + } + + pubKey, err := privValidator.GetPubKey() + if err != nil { + return nil, fmt.Errorf("can't get pubkey: %w", err) + } + + // Determine whether we should do state and/or fast sync. + // We don't fast-sync when the only validator is us. + fastSync := config.FastSyncMode && !onlyValidatorIsUs(state, pubKey) + stateSync := config.StateSync.Enable && !onlyValidatorIsUs(state, pubKey) + if stateSync && state.LastBlockHeight > 0 { + logger.Info("Found local state with non-zero height, skipping state sync") + stateSync = false + } + + // Create the handshaker, which calls RequestInfo, sets the AppVersion on the state, + // and replays any blocks as necessary to sync tendermint with the app. + consensusLogger := logger.With("module", "consensus") + if !stateSync { + if err := doHandshake(stateStore, state, blockStore, genDoc, eventBus, proxyApp, consensusLogger); err != nil { + return nil, err + } + + // Reload the state. It will have the Version.Consensus.App set by the + // Handshake, and may have other modifications as well (ie. depending on + // what happened during block replay). + state, err = stateStore.Load() + if err != nil { + return nil, fmt.Errorf("cannot load state: %w", err) + } + } + + logNodeStartupInfo(state, pubKey, logger, consensusLogger) + + csMetrics, p2pMetrics, memplMetrics, smMetrics := metricsProvider(genDoc.ChainID) + + // Make MempoolReactor + mempoolReactor, mempool := createMempoolAndMempoolReactor(config, proxyApp, state, memplMetrics, logger) + + // Make Evidence Reactor + evidenceReactor, evidencePool, err := createEvidenceReactor(config, dbProvider, stateDB, blockStore, logger) + if err != nil { + return nil, err + } + + // make block executor for consensus and blockchain reactors to execute blocks + blockExec := sm.NewBlockExecutor( + stateStore, + logger.With("module", "state"), + proxyApp.Consensus(), + mempool, + evidencePool, + sm.BlockExecutorWithMetrics(smMetrics), + ) + + // Make BlockchainReactor. Don't start fast sync if we're doing a state sync first. + bcReactor, err := createBlockchainReactor(config, state, blockExec, blockStore, fastSync && !stateSync, logger) + if err != nil { + return nil, fmt.Errorf("could not create blockchain reactor: %w", err) + } + + // Make ConsensusReactor. Don't enable fully if doing a state sync and/or fast sync first. + // FIXME We need to update metrics here, since other reactors don't have access to them. + if stateSync { + csMetrics.StateSyncing.Set(1) + } else if fastSync { + csMetrics.FastSyncing.Set(1) + } + + logger.Info("Setting up maverick consensus reactor", "Misbehaviors", misbehaviors) + consensusReactor, consensusState := createConsensusReactor( + config, state, blockExec, blockStore, mempool, evidencePool, + privValidator, csMetrics, stateSync || fastSync, eventBus, consensusLogger, misbehaviors) + + // Set up state sync reactor, and schedule a sync if requested. + // FIXME The way we do phased startups (e.g. replay -> fast sync -> consensus) is very messy, + // we should clean this whole thing up. 
See: + // https://github.com/tendermint/tendermint/issues/4644 + stateSyncReactor := statesync.NewReactor(proxyApp.Snapshot(), proxyApp.Query(), + config.StateSync.TempDir) + stateSyncReactor.SetLogger(logger.With("module", "statesync")) + + nodeInfo, err := makeNodeInfo(config, nodeKey, txIndexer, genDoc, state) + if err != nil { + return nil, err + } + + // Setup Transport. + transport, peerFilters := createTransport(config, nodeInfo, nodeKey, proxyApp) + + // Setup Switch. + p2pLogger := logger.With("module", "p2p") + sw := createSwitch( + config, transport, p2pMetrics, peerFilters, mempoolReactor, bcReactor, + stateSyncReactor, consensusReactor, evidenceReactor, nodeInfo, nodeKey, p2pLogger, + ) + + err = sw.AddPersistentPeers(splitAndTrimEmpty(config.P2P.PersistentPeers, ",", " ")) + if err != nil { + return nil, fmt.Errorf("could not add peers from persistent_peers field: %w", err) + } + + err = sw.AddUnconditionalPeerIDs(splitAndTrimEmpty(config.P2P.UnconditionalPeerIDs, ",", " ")) + if err != nil { + return nil, fmt.Errorf("could not add peer ids from unconditional_peer_ids field: %w", err) + } + + addrBook, err := createAddrBookAndSetOnSwitch(config, sw, p2pLogger, nodeKey) + if err != nil { + return nil, fmt.Errorf("could not create addrbook: %w", err) + } + + // Optionally, start the pex reactor + // + // TODO: + // + // We need to set Seeds and PersistentPeers on the switch, + // since it needs to be able to use these (and their DNS names) + // even if the PEX is off. We can include the DNS name in the NetAddress, + // but it would still be nice to have a clear list of the current "PersistentPeers" + // somewhere that we can return with net_info. + // + // If PEX is on, it should handle dialing the seeds. Otherwise the switch does it. + // Note we currently use the addrBook regardless at least for AddOurAddress + var pexReactor *pex.Reactor + if config.P2P.PexReactor { + pexReactor = createPEXReactorAndAddToSwitch(addrBook, config, sw, logger) + } + + if config.RPC.PprofListenAddress != "" { + go func() { + logger.Info("Starting pprof server", "laddr", config.RPC.PprofListenAddress) + logger.Error("pprof server error", "err", http.ListenAndServe(config.RPC.PprofListenAddress, nil)) + }() + } + + node := &Node{ + config: config, + genesisDoc: genDoc, + privValidator: privValidator, + + transport: transport, + sw: sw, + addrBook: addrBook, + nodeInfo: nodeInfo, + nodeKey: nodeKey, + + stateStore: stateStore, + blockStore: blockStore, + bcReactor: bcReactor, + mempoolReactor: mempoolReactor, + mempool: mempool, + consensusState: consensusState, + consensusReactor: consensusReactor, + stateSyncReactor: stateSyncReactor, + stateSync: stateSync, + stateSyncGenesis: state, // Shouldn't be necessary, but need a way to pass the genesis state + pexReactor: pexReactor, + evidencePool: evidencePool, + proxyApp: proxyApp, + txIndexer: txIndexer, + indexerService: indexerService, + eventBus: eventBus, + } + node.BaseService = *service.NewBaseService(logger, "Node", node) + + for _, option := range options { + option(node) + } + + return node, nil +} + +// OnStart starts the Node. It implements service.Service. +func (n *Node) OnStart() error { + now := tmtime.Now() + genTime := n.genesisDoc.GenesisTime + if genTime.After(now) { + n.Logger.Info("Genesis time is in the future. 
Sleeping until then...", "genTime", genTime) + time.Sleep(genTime.Sub(now)) + } + + // Add private IDs to addrbook to block those peers being added + n.addrBook.AddPrivateIDs(splitAndTrimEmpty(n.config.P2P.PrivatePeerIDs, ",", " ")) + + // Start the RPC server before the P2P server + // so we can eg. receive txs for the first block + if n.config.RPC.ListenAddress != "" { + listeners, err := n.startRPC() + if err != nil { + return err + } + n.rpcListeners = listeners + } + + if n.config.Instrumentation.Prometheus && + n.config.Instrumentation.PrometheusListenAddr != "" { + n.prometheusSrv = n.startPrometheusServer(n.config.Instrumentation.PrometheusListenAddr) + } + + // Start the transport. + addr, err := p2p.NewNetAddressString(p2p.IDAddressString(n.nodeKey.ID(), n.config.P2P.ListenAddress)) + if err != nil { + return err + } + if err := n.transport.Listen(*addr); err != nil { + return err + } + + n.isListening = true + + if n.config.Mempool.WalEnabled() { + err = n.mempool.InitWAL() + if err != nil { + return fmt.Errorf("init mempool WAL: %w", err) + } + } + + // Start the switch (the P2P server). + err = n.sw.Start() + if err != nil { + return err + } + + // Always connect to persistent peers + err = n.sw.DialPeersAsync(splitAndTrimEmpty(n.config.P2P.PersistentPeers, ",", " ")) + if err != nil { + return fmt.Errorf("could not dial peers from persistent_peers field: %w", err) + } + + // Run state sync + if n.stateSync { + bcR, ok := n.bcReactor.(fastSyncReactor) + if !ok { + return fmt.Errorf("this blockchain reactor does not support switching from state sync") + } + err := startStateSync(n.stateSyncReactor, bcR, n.consensusReactor, n.stateSyncProvider, + n.config.StateSync, n.config.FastSyncMode, n.stateStore, n.blockStore, n.stateSyncGenesis) + if err != nil { + return fmt.Errorf("failed to start state sync: %w", err) + } + } + + return nil +} + +// OnStop stops the Node. It implements service.Service. +func (n *Node) OnStop() { + n.BaseService.OnStop() + + n.Logger.Info("Stopping Node") + + // first stop the non-reactor services + if err := n.eventBus.Stop(); err != nil { + n.Logger.Error("Error closing eventBus", "err", err) + } + if err := n.indexerService.Stop(); err != nil { + n.Logger.Error("Error closing indexerService", "err", err) + } + + // now stop the reactors + if err := n.sw.Stop(); err != nil { + n.Logger.Error("Error closing switch", "err", err) + } + + // stop mempool WAL + if n.config.Mempool.WalEnabled() { + n.mempool.CloseWAL() + } + + if err := n.transport.Close(); err != nil { + n.Logger.Error("Error closing transport", "err", err) + } + + n.isListening = false + + // finally stop the listeners / external services + for _, l := range n.rpcListeners { + n.Logger.Info("Closing rpc listener", "listener", l) + if err := l.Close(); err != nil { + n.Logger.Error("Error closing listener", "listener", l, "err", err) + } + } + + if pvsc, ok := n.privValidator.(service.Service); ok { + if err := pvsc.Stop(); err != nil { + n.Logger.Error("Error closing private validator", "err", err) + } + } + + if n.prometheusSrv != nil { + if err := n.prometheusSrv.Shutdown(context.Background()); err != nil { + // Error from closing listeners, or context timeout: + n.Logger.Error("Prometheus HTTP server Shutdown", "err", err) + } + } +} + +// ConfigureRPC makes sure RPC has all the objects it needs to operate. 
+func (n *Node) ConfigureRPC() error { + pubKey, err := n.privValidator.GetPubKey() + if err != nil { + return fmt.Errorf("can't get pubkey: %w", err) + } + rpccore.SetEnvironment(&rpccore.Environment{ + ProxyAppQuery: n.proxyApp.Query(), + ProxyAppMempool: n.proxyApp.Mempool(), + + StateStore: n.stateStore, + BlockStore: n.blockStore, + EvidencePool: n.evidencePool, + ConsensusState: n.consensusState, + P2PPeers: n.sw, + P2PTransport: n, + + PubKey: pubKey, + GenDoc: n.genesisDoc, + TxIndexer: n.txIndexer, + ConsensusReactor: &consensus.Reactor{}, + EventBus: n.eventBus, + Mempool: n.mempool, + + Logger: n.Logger.With("module", "rpc"), + + Config: *n.config.RPC, + }) + return nil +} + +func (n *Node) startRPC() ([]net.Listener, error) { + err := n.ConfigureRPC() + if err != nil { + return nil, err + } + + listenAddrs := splitAndTrimEmpty(n.config.RPC.ListenAddress, ",", " ") + + if n.config.RPC.Unsafe { + rpccore.AddUnsafeRoutes() + } + + config := rpcserver.DefaultConfig() + config.MaxBodyBytes = n.config.RPC.MaxBodyBytes + config.MaxHeaderBytes = n.config.RPC.MaxHeaderBytes + config.MaxOpenConnections = n.config.RPC.MaxOpenConnections + // If necessary adjust global WriteTimeout to ensure it's greater than + // TimeoutBroadcastTxCommit. + // See https://github.com/tendermint/tendermint/issues/3435 + if config.WriteTimeout <= n.config.RPC.TimeoutBroadcastTxCommit { + config.WriteTimeout = n.config.RPC.TimeoutBroadcastTxCommit + 1*time.Second + } + + // we may expose the rpc over both a unix and tcp socket + listeners := make([]net.Listener, len(listenAddrs)) + for i, listenAddr := range listenAddrs { + mux := http.NewServeMux() + rpcLogger := n.Logger.With("module", "rpc-server") + wmLogger := rpcLogger.With("protocol", "websocket") + wm := rpcserver.NewWebsocketManager(rpccore.Routes, + rpcserver.OnDisconnect(func(remoteAddr string) { + err := n.eventBus.UnsubscribeAll(context.Background(), remoteAddr) + if err != nil && err != tmpubsub.ErrSubscriptionNotFound { + wmLogger.Error("Failed to unsubscribe addr from events", "addr", remoteAddr, "err", err) + } + }), + rpcserver.ReadLimit(config.MaxBodyBytes), + ) + wm.SetLogger(wmLogger) + mux.HandleFunc("/websocket", wm.WebsocketHandler) + rpcserver.RegisterRPCFuncs(mux, rpccore.Routes, rpcLogger) + listener, err := rpcserver.Listen( + listenAddr, + config, + ) + if err != nil { + return nil, err + } + + var rootHandler http.Handler = mux + if n.config.RPC.IsCorsEnabled() { + corsMiddleware := cors.New(cors.Options{ + AllowedOrigins: n.config.RPC.CORSAllowedOrigins, + AllowedMethods: n.config.RPC.CORSAllowedMethods, + AllowedHeaders: n.config.RPC.CORSAllowedHeaders, + }) + rootHandler = corsMiddleware.Handler(mux) + } + if n.config.RPC.IsTLSEnabled() { + go func() { + if err := rpcserver.ServeTLS( + listener, + rootHandler, + n.config.RPC.CertFile(), + n.config.RPC.KeyFile(), + rpcLogger, + config, + ); err != nil { + n.Logger.Error("Error serving server with TLS", "err", err) + } + }() + } else { + go func() { + if err := rpcserver.Serve( + listener, + rootHandler, + rpcLogger, + config, + ); err != nil { + n.Logger.Error("Error serving server", "err", err) + } + }() + } + + listeners[i] = listener + } + + // we expose a simplified api over grpc for convenience to app devs + grpcListenAddr := n.config.RPC.GRPCListenAddress + if grpcListenAddr != "" { + config := rpcserver.DefaultConfig() + config.MaxBodyBytes = n.config.RPC.MaxBodyBytes + config.MaxHeaderBytes = n.config.RPC.MaxHeaderBytes + // NOTE: GRPCMaxOpenConnections is used, not 
MaxOpenConnections + config.MaxOpenConnections = n.config.RPC.GRPCMaxOpenConnections + // If necessary adjust global WriteTimeout to ensure it's greater than + // TimeoutBroadcastTxCommit. + // See https://github.com/tendermint/tendermint/issues/3435 + if config.WriteTimeout <= n.config.RPC.TimeoutBroadcastTxCommit { + config.WriteTimeout = n.config.RPC.TimeoutBroadcastTxCommit + 1*time.Second + } + listener, err := rpcserver.Listen(grpcListenAddr, config) + if err != nil { + return nil, err + } + go func() { + if err := grpccore.StartGRPCServer(listener); err != nil { + n.Logger.Error("Error starting gRPC server", "err", err) + } + }() + listeners = append(listeners, listener) + } + + return listeners, nil +} + +// startPrometheusServer starts a Prometheus HTTP server, listening for metrics +// collectors on addr. +func (n *Node) startPrometheusServer(addr string) *http.Server { + srv := &http.Server{ + Addr: addr, + Handler: promhttp.InstrumentMetricHandler( + prometheus.DefaultRegisterer, promhttp.HandlerFor( + prometheus.DefaultGatherer, + promhttp.HandlerOpts{MaxRequestsInFlight: n.config.Instrumentation.MaxOpenConnections}, + ), + ), + } + go func() { + if err := srv.ListenAndServe(); err != http.ErrServerClosed { + // Error starting or closing listener: + n.Logger.Error("Prometheus HTTP server ListenAndServe", "err", err) + } + }() + return srv +} + +// Switch returns the Node's Switch. +func (n *Node) Switch() *p2p.Switch { + return n.sw +} + +// BlockStore returns the Node's BlockStore. +func (n *Node) BlockStore() *store.BlockStore { + return n.blockStore +} + +// ConsensusState returns the Node's ConsensusState. +func (n *Node) ConsensusState() *cs.State { + return n.consensusState +} + +// ConsensusReactor returns the Node's ConsensusReactor. +func (n *Node) ConsensusReactor() *cs.Reactor { + return n.consensusReactor +} + +// MempoolReactor returns the Node's mempool reactor. +func (n *Node) MempoolReactor() *mempl.Reactor { + return n.mempoolReactor +} + +// Mempool returns the Node's mempool. +func (n *Node) Mempool() mempl.Mempool { + return n.mempool +} + +// PEXReactor returns the Node's PEXReactor. It returns nil if PEX is disabled. +func (n *Node) PEXReactor() *pex.Reactor { + return n.pexReactor +} + +// EvidencePool returns the Node's EvidencePool. +func (n *Node) EvidencePool() *evidence.Pool { + return n.evidencePool +} + +// EventBus returns the Node's EventBus. +func (n *Node) EventBus() *types.EventBus { + return n.eventBus +} + +// PrivValidator returns the Node's PrivValidator. +// XXX: for convenience only! +func (n *Node) PrivValidator() types.PrivValidator { + return n.privValidator +} + +// GenesisDoc returns the Node's GenesisDoc. +func (n *Node) GenesisDoc() *types.GenesisDoc { + return n.genesisDoc +} + +// ProxyApp returns the Node's AppConns, representing its connections to the ABCI application. +func (n *Node) ProxyApp() proxy.AppConns { + return n.proxyApp +} + +// Config returns the Node's config. +func (n *Node) Config() *cfg.Config { + return n.config +} + +//------------------------------------------------------------------------------ + +func (n *Node) Listeners() []string { + return []string{ + fmt.Sprintf("Listener(@%v)", n.config.P2P.ExternalAddress), + } +} + +func (n *Node) IsListening() bool { + return n.isListening +} + +// NodeInfo returns the Node's Info from the Switch. 
+func (n *Node) NodeInfo() p2p.NodeInfo { + return n.nodeInfo +} + +func makeNodeInfo( + config *cfg.Config, + nodeKey *p2p.NodeKey, + txIndexer txindex.TxIndexer, + genDoc *types.GenesisDoc, + state sm.State, +) (p2p.NodeInfo, error) { + txIndexerStatus := "on" + if _, ok := txIndexer.(*null.TxIndex); ok { + txIndexerStatus = "off" + } + + var bcChannel byte + switch config.FastSync.Version { + case "v0": + bcChannel = bcv0.BlockchainChannel + case "v1": + bcChannel = bcv1.BlockchainChannel + case "v2": + bcChannel = bcv2.BlockchainChannel + default: + return nil, fmt.Errorf("unknown fastsync version %s", config.FastSync.Version) + } + + nodeInfo := p2p.DefaultNodeInfo{ + ProtocolVersion: p2p.NewProtocolVersion( + version.P2PProtocol, // global + state.Version.Consensus.Block, + state.Version.Consensus.App, + ), + DefaultNodeID: nodeKey.ID(), + Network: genDoc.ChainID, + Version: version.TMCoreSemVer, + Channels: []byte{ + bcChannel, + cs.StateChannel, cs.DataChannel, cs.VoteChannel, cs.VoteSetBitsChannel, + mempl.MempoolChannel, + evidence.EvidenceChannel, + statesync.SnapshotChannel, statesync.ChunkChannel, + }, + Moniker: config.Moniker, + Other: p2p.DefaultNodeInfoOther{ + TxIndex: txIndexerStatus, + RPCAddress: config.RPC.ListenAddress, + }, + } + + if config.P2P.PexReactor { + nodeInfo.Channels = append(nodeInfo.Channels, pex.PexChannel) + } + + lAddr := config.P2P.ExternalAddress + + if lAddr == "" { + lAddr = config.P2P.ListenAddress + } + + nodeInfo.ListenAddr = lAddr + + err := nodeInfo.Validate() + return nodeInfo, err +} + +//------------------------------------------------------------------------------ + +var ( + genesisDocKey = []byte("genesisDoc") +) + +// LoadStateFromDBOrGenesisDocProvider attempts to load the state from the +// database, or creates one using the given genesisDocProvider and persists the +// result to the database. On success this also returns the genesis doc loaded +// through the given provider. +func LoadStateFromDBOrGenesisDocProvider( + stateDB dbm.DB, + genesisDocProvider GenesisDocProvider, +) (sm.State, *types.GenesisDoc, error) { + // Get genesis doc + genDoc, err := loadGenesisDoc(stateDB) + if err != nil { + genDoc, err = genesisDocProvider() + if err != nil { + return sm.State{}, nil, err + } + // save genesis doc to prevent a certain class of user errors (e.g. when it + // was changed, accidentally or not). Also good for audit trail. 
+ saveGenesisDoc(stateDB, genDoc) + } + stateStore := sm.NewStore(stateDB) + state, err := stateStore.LoadFromDBOrGenesisDoc(genDoc) + if err != nil { + return sm.State{}, nil, err + } + return state, genDoc, nil +} + +// panics if failed to unmarshal bytes +func loadGenesisDoc(db dbm.DB) (*types.GenesisDoc, error) { + b, err := db.Get(genesisDocKey) + if err != nil { + panic(err) + } + if len(b) == 0 { + return nil, errors.New("genesis doc not found") + } + var genDoc *types.GenesisDoc + err = tmjson.Unmarshal(b, &genDoc) + if err != nil { + panic(fmt.Sprintf("Failed to load genesis doc due to unmarshaling error: %v (bytes: %X)", err, b)) + } + return genDoc, nil +} + +// panics if failed to marshal the given genesis document +func saveGenesisDoc(db dbm.DB, genDoc *types.GenesisDoc) { + b, err := tmjson.Marshal(genDoc) + if err != nil { + panic(fmt.Sprintf("Failed to save genesis doc due to marshaling error: %v", err)) + } + if err := db.SetSync(genesisDocKey, b); err != nil { + panic(fmt.Sprintf("Failed to save genesis doc: %v", err)) + } +} + +func createAndStartPrivValidatorSocketClient( + listenAddr, + chainID string, + logger log.Logger, +) (types.PrivValidator, error) { + pve, err := privval.NewSignerListener(listenAddr, logger) + if err != nil { + return nil, fmt.Errorf("failed to start private validator: %w", err) + } + + pvsc, err := privval.NewSignerClient(pve, chainID) + if err != nil { + return nil, fmt.Errorf("failed to start private validator: %w", err) + } + + // try to get a pubkey from private validate first time + _, err = pvsc.GetPubKey() + if err != nil { + return nil, fmt.Errorf("can't get pubkey: %w", err) + } + + const ( + retries = 50 // 50 * 100ms = 5s total + timeout = 100 * time.Millisecond + ) + pvscWithRetries := privval.NewRetrySignerClient(pvsc, retries, timeout) + + return pvscWithRetries, nil +} + +// splitAndTrimEmpty slices s into all subslices separated by sep and returns a +// slice of the string s with all leading and trailing Unicode code points +// contained in cutset removed. If sep is empty, SplitAndTrim splits after each +// UTF-8 sequence. First part is equivalent to strings.SplitN with a count of +// -1. also filter out empty strings, only return non-empty strings. +func splitAndTrimEmpty(s, sep, cutset string) []string { + if s == "" { + return []string{} + } + + spl := strings.Split(s, sep) + nonEmptyStrings := make([]string, 0, len(spl)) + for i := 0; i < len(spl); i++ { + element := strings.Trim(spl[i], cutset) + if element != "" { + nonEmptyStrings = append(nonEmptyStrings, element) + } + } + return nonEmptyStrings +} diff --git a/test/maverick/node/privval.go b/test/maverick/node/privval.go new file mode 100644 index 000000000..441b6ca9d --- /dev/null +++ b/test/maverick/node/privval.go @@ -0,0 +1,358 @@ +package node + +import ( + "errors" + "fmt" + "io/ioutil" + + "github.com/tendermint/tendermint/crypto" + "github.com/tendermint/tendermint/crypto/ed25519" + tmbytes "github.com/tendermint/tendermint/libs/bytes" + tmjson "github.com/tendermint/tendermint/libs/json" + tmos "github.com/tendermint/tendermint/libs/os" + "github.com/tendermint/tendermint/libs/tempfile" + tmproto "github.com/tendermint/tendermint/proto/tendermint/types" + "github.com/tendermint/tendermint/types" +) + +// ******************************************************************************************************************* +// +// WARNING: FOR TESTING ONLY. 
DO NOT USE THIS FILE OUTSIDE MAVERICK +// +// ******************************************************************************************************************* + +const ( + stepNone int8 = 0 // Used to distinguish the initial state + stepPropose int8 = 1 + stepPrevote int8 = 2 + stepPrecommit int8 = 3 +) + +// A vote is either stepPrevote or stepPrecommit. +func voteToStep(vote *tmproto.Vote) int8 { + switch vote.Type { + case tmproto.PrevoteType: + return stepPrevote + case tmproto.PrecommitType: + return stepPrecommit + default: + panic(fmt.Sprintf("Unknown vote type: %v", vote.Type)) + } +} + +//------------------------------------------------------------------------------- + +// FilePVKey stores the immutable part of PrivValidator. +type FilePVKey struct { + Address types.Address `json:"address"` + PubKey crypto.PubKey `json:"pub_key"` + PrivKey crypto.PrivKey `json:"priv_key"` + + filePath string +} + +// Save persists the FilePVKey to its filePath. +func (pvKey FilePVKey) Save() { + outFile := pvKey.filePath + if outFile == "" { + panic("cannot save PrivValidator key: filePath not set") + } + + jsonBytes, err := tmjson.MarshalIndent(pvKey, "", " ") + if err != nil { + panic(err) + } + err = tempfile.WriteFileAtomic(outFile, jsonBytes, 0600) + if err != nil { + panic(err) + } + +} + +//------------------------------------------------------------------------------- + +// FilePVLastSignState stores the mutable part of PrivValidator. +type FilePVLastSignState struct { + Height int64 `json:"height"` + Round int32 `json:"round"` + Step int8 `json:"step"` + Signature []byte `json:"signature,omitempty"` + SignBytes tmbytes.HexBytes `json:"signbytes,omitempty"` + + filePath string +} + +// CheckHRS checks the given height, round, step (HRS) against that of the +// FilePVLastSignState. It returns an error if the arguments constitute a regression, +// or if they match but the SignBytes are empty. +// The returned boolean indicates whether the last Signature should be reused - +// it returns true if the HRS matches the arguments and the SignBytes are not empty (indicating +// we have already signed for this HRS, and can reuse the existing signature). +// It panics if the HRS matches the arguments, there's a SignBytes, but no Signature. +func (lss *FilePVLastSignState) CheckHRS(height int64, round int32, step int8) (bool, error) { + + if lss.Height > height { + return false, fmt.Errorf("height regression. Got %v, last height %v", height, lss.Height) + } + + if lss.Height == height { + if lss.Round > round { + return false, fmt.Errorf("round regression at height %v. Got %v, last round %v", height, round, lss.Round) + } + + if lss.Round == round { + if lss.Step > step { + return false, fmt.Errorf( + "step regression at height %v round %v. Got %v, last step %v", + height, + round, + step, + lss.Step, + ) + } else if lss.Step == step { + if lss.SignBytes != nil { + if lss.Signature == nil { + panic("pv: Signature is nil but SignBytes is not!") + } + return true, nil + } + return false, errors.New("no SignBytes found") + } + } + } + return false, nil +} + +// Save persists the FilePvLastSignState to its filePath. 
+func (lss *FilePVLastSignState) Save() { + outFile := lss.filePath + if outFile == "" { + panic("cannot save FilePVLastSignState: filePath not set") + } + jsonBytes, err := tmjson.MarshalIndent(lss, "", " ") + if err != nil { + panic(err) + } + err = tempfile.WriteFileAtomic(outFile, jsonBytes, 0600) + if err != nil { + panic(err) + } +} + +//------------------------------------------------------------------------------- + +// FilePV implements PrivValidator using data persisted to disk +// to prevent double signing. +// NOTE: the directories containing pv.Key.filePath and pv.LastSignState.filePath must already exist. +// It includes the LastSignature and LastSignBytes so we don't lose the signature +// if the process crashes after signing but before the resulting consensus message is processed. +type FilePV struct { + Key FilePVKey + LastSignState FilePVLastSignState +} + +// GenFilePV generates a new validator with randomly generated private key +// and sets the filePaths, but does not call Save(). +func GenFilePV(keyFilePath, stateFilePath string) *FilePV { + privKey := ed25519.GenPrivKey() + + return &FilePV{ + Key: FilePVKey{ + Address: privKey.PubKey().Address(), + PubKey: privKey.PubKey(), + PrivKey: privKey, + filePath: keyFilePath, + }, + LastSignState: FilePVLastSignState{ + Step: stepNone, + filePath: stateFilePath, + }, + } +} + +// LoadFilePV loads a FilePV from the filePaths. The FilePV handles double +// signing prevention by persisting data to the stateFilePath. If either file path +// does not exist, the program will exit. +func LoadFilePV(keyFilePath, stateFilePath string) *FilePV { + return loadFilePV(keyFilePath, stateFilePath, true) +} + +// LoadFilePVEmptyState loads a FilePV from the given keyFilePath, with an empty LastSignState. +// If the keyFilePath does not exist, the program will exit. +func LoadFilePVEmptyState(keyFilePath, stateFilePath string) *FilePV { + return loadFilePV(keyFilePath, stateFilePath, false) +} + +// If loadState is true, we load from the stateFilePath. Otherwise, we use an empty LastSignState. +func loadFilePV(keyFilePath, stateFilePath string, loadState bool) *FilePV { + keyJSONBytes, err := ioutil.ReadFile(keyFilePath) + if err != nil { + tmos.Exit(err.Error()) + } + pvKey := FilePVKey{} + err = tmjson.Unmarshal(keyJSONBytes, &pvKey) + if err != nil { + tmos.Exit(fmt.Sprintf("Error reading PrivValidator key from %v: %v\n", keyFilePath, err)) + } + + // overwrite pubkey and address for convenience + pvKey.PubKey = pvKey.PrivKey.PubKey() + pvKey.Address = pvKey.PubKey.Address() + pvKey.filePath = keyFilePath + + pvState := FilePVLastSignState{} + + if loadState { + stateJSONBytes, err := ioutil.ReadFile(stateFilePath) + if err != nil { + tmos.Exit(err.Error()) + } + err = tmjson.Unmarshal(stateJSONBytes, &pvState) + if err != nil { + tmos.Exit(fmt.Sprintf("Error reading PrivValidator state from %v: %v\n", stateFilePath, err)) + } + } + + pvState.filePath = stateFilePath + + return &FilePV{ + Key: pvKey, + LastSignState: pvState, + } +} + +// LoadOrGenFilePV loads a FilePV from the given filePaths +// or else generates a new one and saves it to the filePaths. +func LoadOrGenFilePV(keyFilePath, stateFilePath string) *FilePV { + var pv *FilePV + if tmos.FileExists(keyFilePath) { + pv = LoadFilePV(keyFilePath, stateFilePath) + } else { + pv = GenFilePV(keyFilePath, stateFilePath) + pv.Save() + } + return pv +} + +// GetAddress returns the address of the validator. +// Implements PrivValidator. 
+func (pv *FilePV) GetAddress() types.Address { + return pv.Key.Address +} + +// GetPubKey returns the public key of the validator. +// Implements PrivValidator. +func (pv *FilePV) GetPubKey() (crypto.PubKey, error) { + return pv.Key.PubKey, nil +} + +// SignVote signs a canonical representation of the vote, along with the +// chainID. Implements PrivValidator. +func (pv *FilePV) SignVote(chainID string, vote *tmproto.Vote) error { + if err := pv.signVote(chainID, vote); err != nil { + return fmt.Errorf("error signing vote: %v", err) + } + return nil +} + +// SignProposal signs a canonical representation of the proposal, along with +// the chainID. Implements PrivValidator. +func (pv *FilePV) SignProposal(chainID string, proposal *tmproto.Proposal) error { + if err := pv.signProposal(chainID, proposal); err != nil { + return fmt.Errorf("error signing proposal: %v", err) + } + return nil +} + +// Save persists the FilePV to disk. +func (pv *FilePV) Save() { + pv.Key.Save() + pv.LastSignState.Save() +} + +// Reset resets all fields in the FilePV. +// NOTE: Unsafe! +func (pv *FilePV) Reset() { + var sig []byte + pv.LastSignState.Height = 0 + pv.LastSignState.Round = 0 + pv.LastSignState.Step = 0 + pv.LastSignState.Signature = sig + pv.LastSignState.SignBytes = nil + pv.Save() +} + +// String returns a string representation of the FilePV. +func (pv *FilePV) String() string { + return fmt.Sprintf( + "PrivValidator{%v LH:%v, LR:%v, LS:%v}", + pv.GetAddress(), + pv.LastSignState.Height, + pv.LastSignState.Round, + pv.LastSignState.Step, + ) +} + +//------------------------------------------------------------------------------------ + +// signVote checks if the vote is good to sign and sets the vote signature. +// It may need to set the timestamp as well if the vote is otherwise the same as +// a previously signed vote (ie. we crashed after signing but before the vote hit the WAL). +func (pv *FilePV) signVote(chainID string, vote *tmproto.Vote) error { + height, round, step := vote.Height, vote.Round, voteToStep(vote) + + lss := pv.LastSignState + + _, err := lss.CheckHRS(height, round, step) + if err != nil { + return err + } + + signBytes := types.VoteSignBytes(chainID, vote) + + // It passed the checks. Sign the vote + sig, err := pv.Key.PrivKey.Sign(signBytes) + if err != nil { + return err + } + pv.saveSigned(height, round, step, signBytes, sig) + vote.Signature = sig + return nil +} + +// signProposal checks if the proposal is good to sign and sets the proposal signature. +// It may need to set the timestamp as well if the proposal is otherwise the same as +// a previously signed proposal ie. we crashed after signing but before the proposal hit the WAL). +func (pv *FilePV) signProposal(chainID string, proposal *tmproto.Proposal) error { + height, round, step := proposal.Height, proposal.Round, stepPropose + + lss := pv.LastSignState + + _, err := lss.CheckHRS(height, round, step) + if err != nil { + return err + } + + signBytes := types.ProposalSignBytes(chainID, proposal) + + // It passed the checks. 
Sign the proposal + sig, err := pv.Key.PrivKey.Sign(signBytes) + if err != nil { + return err + } + pv.saveSigned(height, round, step, signBytes, sig) + proposal.Signature = sig + return nil +} + +// Persist height/round/step and signature +func (pv *FilePV) saveSigned(height int64, round int32, step int8, + signBytes []byte, sig []byte) { + + pv.LastSignState.Height = height + pv.LastSignState.Round = round + pv.LastSignState.Step = step + pv.LastSignState.Signature = sig + pv.LastSignState.SignBytes = signBytes + pv.LastSignState.Save() +}
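+
+// Example (illustrative only, not part of this patch's call flow): a maverick test
+// harness might exercise the FilePV roughly as follows:
+//
+//	pv := LoadOrGenFilePV("config/priv_validator_key.json", "data/priv_validator_state.json")
+//	vote := &tmproto.Vote{Height: 1, Round: 0, Type: tmproto.PrevoteType}
+//	if err := pv.SignVote("test-chain", vote); err != nil {
+//		// CheckHRS rejects height/round/step regressions, preventing double signing
+//	}
+//
+// saveSigned persists the last signed height/round/step and sign bytes before
+// SignVote or SignProposal return, so a crash immediately after signing cannot
+// produce a conflicting signature on restart.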