From 4bfec797e8f9c8be5d3666554bb9d6c1ff47f475 Mon Sep 17 00:00:00 2001 From: Alexander Bezobchuk Date: Fri, 13 Dec 2019 03:46:23 -0500 Subject: [PATCH] cli: debug sub-command (#4227) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Issue Implement a new subcommand: tendermint debug. This subcommand itself has two subcommands: $ tendermint debug kill --home= Writes debug info into a compressed archive. The archive will contain the following: ├── config.toml ├── consensus_state.json ├── net_info.json ├── stacktrace.out ├── status.json └── wal The Tendermint process will be killed. $ tendermint debug dump --home= This command will perform similar to kill except it only polls the node and dumps debugging data every frequency seconds to a compressed archive under a given destination directory. Each archive will contain: ├── consensus_state.json ├── goroutine.out ├── heap.out ├── net_info.json ├── status.json └── wal Note: goroutine.out and heap.out will only be written if a profile address is provided and is operational. This command is blocking and will log any error. replaces: #3327 closes: #3249 ## Commits: * Implement debug tool command stubs * Implement net getters and zip logic * Update zip dir API and add home flag * Add simple godocs for kill aux functions * Move IO util to new file and implement copy WAL func * Implement copy config function * Implement killProc * Remove debug fmt * Validate output file input * Direct STDERR to file * Godoc updates * Sleep prior to killing tail proc * Minor cleanup of godocs * Move debug command and add make target * Rename command handler function * Add example to command long descr * Move current implementation to cmd/tendermint/commands/debug * Update kill cmd long description * Implement dump command * Add pending log entry * Add gosec nolint * Add error check for Mkdir * Add os.IsNotExist(err) * Add to debugging section in running-in-prod doc --- CHANGELOG_PENDING.md | 3 + cmd/tendermint/commands/debug/debug.go | 41 +++++ cmd/tendermint/commands/debug/dump.go | 130 ++++++++++++++++ cmd/tendermint/commands/debug/io.go | 117 ++++++++++++++ cmd/tendermint/commands/debug/kill.go | 147 ++++++++++++++++++ cmd/tendermint/commands/debug/util.go | 82 ++++++++++ cmd/tendermint/main.go | 5 +- docs/tendermint-core/running-in-production.md | 13 ++ 8 files changed, 537 insertions(+), 1 deletion(-) create mode 100644 cmd/tendermint/commands/debug/debug.go create mode 100644 cmd/tendermint/commands/debug/dump.go create mode 100644 cmd/tendermint/commands/debug/io.go create mode 100644 cmd/tendermint/commands/debug/kill.go create mode 100644 cmd/tendermint/commands/debug/util.go diff --git a/CHANGELOG_PENDING.md b/CHANGELOG_PENDING.md index 2e5dfc2cc..2e58ecb8c 100644 --- a/CHANGELOG_PENDING.md +++ b/CHANGELOG_PENDING.md @@ -106,6 +106,9 @@ program](https://hackerone.com/tendermint). ### FEATURES: - [p2p] \#4053 Add `unconditional_peer_ids` and `persistent_peers_max_dial_period` config variables (see ADR-050) (@dongsam) +- [tools] [\#4227](https://github.com/tendermint/tendermint/pull/4227) Implement `tendermint debug kill` and + `tendermint debug dump` commands for Tendermint node debugging functionality. See `--help` in both + commands for further documentation and usage. - [cli] \#4234 Add `--db_backend and --db_dir` flags (@princesinha19) - [cli] \#4113 Add optional `--genesis_hash` flag to check genesis hash upon startup - [config] \#3831 Add support for [RocksDB](https://rocksdb.org/) (@Stumble) diff --git a/cmd/tendermint/commands/debug/debug.go b/cmd/tendermint/commands/debug/debug.go new file mode 100644 index 000000000..b5fc3c9e6 --- /dev/null +++ b/cmd/tendermint/commands/debug/debug.go @@ -0,0 +1,41 @@ +package debug + +import ( + "os" + + "github.com/spf13/cobra" + + "github.com/tendermint/tendermint/libs/log" +) + +var ( + nodeRPCAddr string + profAddr string + frequency uint + + flagNodeRPCAddr = "rpc-laddr" + flagProfAddr = "pprof-laddr" + flagFrequency = "frequency" + + logger = log.NewTMLogger(log.NewSyncWriter(os.Stdout)) +) + +// DebugCmd defines the root command containing subcommands that assist in +// debugging running Tendermint processes. +var DebugCmd = &cobra.Command{ + Use: "debug", + Short: "A utility to kill or watch a Tendermint process while aggregating debugging data", +} + +func init() { + DebugCmd.PersistentFlags().SortFlags = true + DebugCmd.PersistentFlags().StringVar( + &nodeRPCAddr, + flagNodeRPCAddr, + "tcp://localhost:26657", + "The Tendermint node's RPC address (:)", + ) + + DebugCmd.AddCommand(killCmd) + DebugCmd.AddCommand(dumpCmd) +} diff --git a/cmd/tendermint/commands/debug/dump.go b/cmd/tendermint/commands/debug/dump.go new file mode 100644 index 000000000..f742b5bff --- /dev/null +++ b/cmd/tendermint/commands/debug/dump.go @@ -0,0 +1,130 @@ +package debug + +import ( + "fmt" + "io/ioutil" + "os" + "path/filepath" + "time" + + "github.com/pkg/errors" + "github.com/spf13/cobra" + "github.com/spf13/viper" + cfg "github.com/tendermint/tendermint/config" + "github.com/tendermint/tendermint/libs/cli" + rpcclient "github.com/tendermint/tendermint/rpc/client" +) + +var dumpCmd = &cobra.Command{ + Use: "dump [output-directory]", + Short: "Continuously poll a Tendermint process and dump debugging data into a single location", + Long: `Continuously poll a Tendermint process and dump debugging data into a single +location at a specified frequency. At each frequency interval, an archived and compressed +file will contain node debugging information including the goroutine and heap profiles +if enabled.`, + Args: cobra.ExactArgs(1), + RunE: dumpCmdHandler, +} + +func init() { + dumpCmd.Flags().UintVar( + &frequency, + flagFrequency, + 30, + "The frequency (seconds) in which to poll, aggregate and dump Tendermint debug data", + ) + + dumpCmd.Flags().StringVar( + &profAddr, + flagProfAddr, + "", + "The profiling server address (:)", + ) +} + +func dumpCmdHandler(_ *cobra.Command, args []string) error { + outDir := args[0] + if outDir == "" { + return errors.New("invalid output directory") + } + + if frequency == 0 { + return errors.New("frequency must be positive") + } + + if _, err := os.Stat(outDir); os.IsNotExist(err) { + if err := os.Mkdir(outDir, os.ModePerm); err != nil { + return errors.Wrap(err, "failed to create output directory") + } + } + + rpc := rpcclient.NewHTTP(nodeRPCAddr, "/websocket") + + home := viper.GetString(cli.HomeFlag) + conf := cfg.DefaultConfig() + conf = conf.SetRoot(home) + cfg.EnsureRoot(conf.RootDir) + + dumpDebugData(outDir, conf, rpc) + + ticker := time.NewTicker(time.Duration(frequency) * time.Second) + for range ticker.C { + dumpDebugData(outDir, conf, rpc) + } + + return nil +} + +func dumpDebugData(outDir string, conf *cfg.Config, rpc *rpcclient.HTTP) { + start := time.Now().UTC() + + tmpDir, err := ioutil.TempDir(outDir, "tendermint_debug_tmp") + if err != nil { + logger.Error("failed to create temporary directory", "dir", tmpDir, "error", err) + return + } + defer os.RemoveAll(tmpDir) + + logger.Info("getting node status...") + if err := dumpStatus(rpc, tmpDir, "status.json"); err != nil { + logger.Error("failed to dump node status", "error", err) + return + } + + logger.Info("getting node network info...") + if err := dumpNetInfo(rpc, tmpDir, "net_info.json"); err != nil { + logger.Error("failed to dump node network info", "error", err) + return + } + + logger.Info("getting node consensus state...") + if err := dumpConsensusState(rpc, tmpDir, "consensus_state.json"); err != nil { + logger.Error("failed to dump node consensus state", "error", err) + return + } + + logger.Info("copying node WAL...") + if err := copyWAL(conf, tmpDir); err != nil { + logger.Error("failed to copy node WAL", "error", err) + return + } + + if profAddr != "" { + logger.Info("getting node goroutine profile...") + if err := dumpProfile(tmpDir, profAddr, "goroutine", 2); err != nil { + logger.Error("failed to dump goroutine profile", "error", err) + return + } + + logger.Info("getting node heap profile...") + if err := dumpProfile(tmpDir, profAddr, "heap", 2); err != nil { + logger.Error("failed to dump heap profile", "error", err) + return + } + } + + outFile := filepath.Join(outDir, fmt.Sprintf("%s.zip", start.Format(time.Stamp))) + if err := zipDir(tmpDir, outFile); err != nil { + logger.Error("failed to create and compress archive", "file", outFile, "error", err) + } +} diff --git a/cmd/tendermint/commands/debug/io.go b/cmd/tendermint/commands/debug/io.go new file mode 100644 index 000000000..240b2b23c --- /dev/null +++ b/cmd/tendermint/commands/debug/io.go @@ -0,0 +1,117 @@ +package debug + +import ( + "archive/zip" + "encoding/json" + "io" + "io/ioutil" + "os" + "path" + "path/filepath" + "strings" + + "github.com/pkg/errors" +) + +// zipDir zips all the contents found in src, including both files and +// directories, into a destination file dest. It returns an error upon failure. +// It assumes src is a directory. +func zipDir(src, dest string) error { + zipFile, err := os.Create(dest) + if err != nil { + return err + } + defer zipFile.Close() + + zipWriter := zip.NewWriter(zipFile) + defer zipWriter.Close() + + dirName := filepath.Base(dest) + baseDir := strings.TrimSuffix(dirName, filepath.Ext(dirName)) + + filepath.Walk(src, func(path string, info os.FileInfo, err error) error { + if err != nil { + return err + } + + header, err := zip.FileInfoHeader(info) + if err != nil { + return err + } + + // Each execution of this utility on a Tendermint process will result in a + // unique file. + header.Name = filepath.Join(baseDir, strings.TrimPrefix(path, src)) + + // Handle cases where the content to be zipped is a file or a directory, + // where a directory must have a '/' suffix. + if info.IsDir() { + header.Name += "/" + } else { + header.Method = zip.Deflate + } + + headerWriter, err := zipWriter.CreateHeader(header) + if err != nil { + return err + } + + if info.IsDir() { + return nil + } + + file, err := os.Open(path) + if err != nil { + return err + } + defer file.Close() + + _, err = io.Copy(headerWriter, file) + return err + }) + + return nil +} + +// copyFile copies a file from src to dest and returns an error upon failure. The +// copied file retains the source file's permissions. +func copyFile(src, dest string) error { + if _, err := os.Stat(src); os.IsNotExist(err) { + return err + } + + srcFile, err := os.Open(src) + if err != nil { + return err + } + defer srcFile.Close() + + destFile, err := os.Create(dest) + if err != nil { + return err + } + defer destFile.Close() + + if _, err = io.Copy(destFile, srcFile); err != nil { + return err + } + + srcInfo, err := os.Stat(src) + if err != nil { + return err + } + + return os.Chmod(dest, srcInfo.Mode()) +} + +// writeStateToFile pretty JSON encodes an object and writes it to file composed +// of dir and filename. It returns an error upon failure to encode or write to +// file. +func writeStateJSONToFile(state interface{}, dir, filename string) error { + stateJSON, err := json.MarshalIndent(state, "", " ") + if err != nil { + return errors.Wrap(err, "failed to encode state dump") + } + + return ioutil.WriteFile(path.Join(dir, filename), stateJSON, os.ModePerm) +} diff --git a/cmd/tendermint/commands/debug/kill.go b/cmd/tendermint/commands/debug/kill.go new file mode 100644 index 000000000..6ce67fbe7 --- /dev/null +++ b/cmd/tendermint/commands/debug/kill.go @@ -0,0 +1,147 @@ +package debug + +import ( + "fmt" + "io/ioutil" + "os" + "os/exec" + "path/filepath" + "strconv" + "syscall" + "time" + + "github.com/pkg/errors" + "github.com/spf13/cobra" + "github.com/spf13/viper" + + cfg "github.com/tendermint/tendermint/config" + "github.com/tendermint/tendermint/libs/cli" + rpcclient "github.com/tendermint/tendermint/rpc/client" +) + +var killCmd = &cobra.Command{ + Use: "kill [pid] [compressed-output-file]", + Short: "Kill a Tendermint process while aggregating and packaging debugging data", + Long: `Kill a Tendermint process while also aggregating Tendermint process data +such as the latest node state, including consensus and networking state, +go-routine state, and the node's WAL and config information. This aggregated data +is packaged into a compressed archive. + +Example: +$ tendermint debug 34255 /path/to/tm-debug.zip`, + Args: cobra.ExactArgs(2), + RunE: killCmdHandler, +} + +func killCmdHandler(cmd *cobra.Command, args []string) error { + pid, err := strconv.ParseUint(args[0], 10, 64) + if err != nil { + return err + } + + outFile := args[1] + if outFile == "" { + return errors.New("invalid output file") + } + + rpc := rpcclient.NewHTTP(nodeRPCAddr, "/websocket") + + home := viper.GetString(cli.HomeFlag) + conf := cfg.DefaultConfig() + conf = conf.SetRoot(home) + cfg.EnsureRoot(conf.RootDir) + + // Create a temporary directory which will contain all the state dumps and + // relevant files and directories that will be compressed into a file. + tmpDir, err := ioutil.TempDir(os.TempDir(), "tendermint_debug_tmp") + if err != nil { + return errors.Wrap(err, "failed to create temporary directory") + } + defer os.RemoveAll(tmpDir) + + logger.Info("getting node status...") + if err := dumpStatus(rpc, tmpDir, "status.json"); err != nil { + return err + } + + logger.Info("getting node network info...") + if err := dumpNetInfo(rpc, tmpDir, "net_info.json"); err != nil { + return err + } + + logger.Info("getting node consensus state...") + if err := dumpConsensusState(rpc, tmpDir, "consensus_state.json"); err != nil { + return err + } + + logger.Info("copying node WAL...") + if err := copyWAL(conf, tmpDir); err != nil { + return err + } + + logger.Info("copying node configuration...") + if err := copyConfig(home, tmpDir); err != nil { + return err + } + + logger.Info("killing Tendermint process") + if err := killProc(pid, tmpDir); err != nil { + return err + } + + logger.Info("archiving and compressing debug directory...") + return zipDir(tmpDir, outFile) +} + +// killProc attempts to kill the Tendermint process with a given PID with an +// ABORT signal which should result in a goroutine stacktrace. The PID's STDERR +// is tailed and piped to a file under the directory dir. An error is returned +// if the output file cannot be created or the tail command cannot be started. +// An error is not returned if any subsequent syscall fails. +func killProc(pid uint64, dir string) error { + // pipe STDERR output from tailing the Tendermint process to a file + // + // NOTE: This will only work on UNIX systems. + cmd := exec.Command("tail", "-f", fmt.Sprintf("/proc/%d/fd/2", pid)) // nolint: gosec + + outFile, err := os.Create(filepath.Join(dir, "stacktrace.out")) + if err != nil { + return err + } + defer outFile.Close() + + cmd.Stdout = outFile + cmd.Stderr = outFile + + if err := cmd.Start(); err != nil { + return err + } + + // kill the underlying Tendermint process and subsequent tailing process + go func() { + // Killing the Tendermint process with the '-ABRT|-6' signal will result in + // a goroutine stacktrace. + if err := syscall.Kill(int(pid), syscall.SIGABRT); err != nil { + fmt.Fprintf(os.Stderr, "failed to kill Tendermint process: %s", err) + } + + // allow some time to allow the Tendermint process to be killed + // + // TODO: We should 'wait' for a kill to succeed (e.g. poll for PID until it + // cannot be found). Regardless, this should be ample time. + time.Sleep(5 * time.Second) + + if err := cmd.Process.Kill(); err != nil { + fmt.Fprintf(os.Stderr, "failed to kill Tendermint process output redirection: %s", err) + } + }() + + if err := cmd.Wait(); err != nil { + // only return an error not invoked by a manual kill + if _, ok := err.(*exec.ExitError); !ok { + return err + } + } + + return nil +} diff --git a/cmd/tendermint/commands/debug/util.go b/cmd/tendermint/commands/debug/util.go new file mode 100644 index 000000000..b392d23d7 --- /dev/null +++ b/cmd/tendermint/commands/debug/util.go @@ -0,0 +1,82 @@ +package debug + +import ( + "fmt" + "io/ioutil" + "net/http" + "os" + "path" + "path/filepath" + + "github.com/pkg/errors" + cfg "github.com/tendermint/tendermint/config" + rpcclient "github.com/tendermint/tendermint/rpc/client" +) + +// dumpStatus gets node status state dump from the Tendermint RPC and writes it +// to file. It returns an error upon failure. +func dumpStatus(rpc *rpcclient.HTTP, dir, filename string) error { + status, err := rpc.Status() + if err != nil { + return errors.Wrap(err, "failed to get node status") + } + + return writeStateJSONToFile(status, dir, filename) +} + +// dumpNetInfo gets network information state dump from the Tendermint RPC and +// writes it to file. It returns an error upon failure. +func dumpNetInfo(rpc *rpcclient.HTTP, dir, filename string) error { + netInfo, err := rpc.NetInfo() + if err != nil { + return errors.Wrap(err, "failed to get node network information") + } + + return writeStateJSONToFile(netInfo, dir, filename) +} + +// dumpConsensusState gets consensus state dump from the Tendermint RPC and +// writes it to file. It returns an error upon failure. +func dumpConsensusState(rpc *rpcclient.HTTP, dir, filename string) error { + consDump, err := rpc.DumpConsensusState() + if err != nil { + return errors.Wrap(err, "failed to get node consensus dump") + } + + return writeStateJSONToFile(consDump, dir, filename) +} + +// copyWAL copies the Tendermint node's WAL file. It returns an error if the +// WAL file cannot be read or copied. +func copyWAL(conf *cfg.Config, dir string) error { + walPath := conf.Consensus.WalFile() + walFile := filepath.Base(walPath) + + return copyFile(walPath, filepath.Join(dir, walFile)) +} + +// copyConfig copies the Tendermint node's config file. It returns an error if +// the config file cannot be read or copied. +func copyConfig(home, dir string) error { + configFile := "config.toml" + configPath := filepath.Join(home, "config", configFile) + + return copyFile(configPath, filepath.Join(dir, configFile)) +} + +func dumpProfile(dir, addr, profile string, debug int) error { + endpoint := fmt.Sprintf("%s/debug/pprof/%s?debug=%d", addr, profile, debug) + + resp, err := http.Get(endpoint) // nolint: gosec + if err != nil { + return errors.Wrapf(err, "failed to query for %s profile", profile) + } + defer resp.Body.Close() + + body, err := ioutil.ReadAll(resp.Body) + if err != nil { + return errors.Wrapf(err, "failed to read %s profile response body", profile) + } + + return ioutil.WriteFile(path.Join(dir, fmt.Sprintf("%s.out", profile)), body, os.ModePerm) +} diff --git a/cmd/tendermint/main.go b/cmd/tendermint/main.go index a5a8d2d80..0cd4b7b70 100644 --- a/cmd/tendermint/main.go +++ b/cmd/tendermint/main.go @@ -7,6 +7,7 @@ import ( "github.com/tendermint/tendermint/libs/cli" cmd "github.com/tendermint/tendermint/cmd/tendermint/commands" + "github.com/tendermint/tendermint/cmd/tendermint/commands/debug" cfg "github.com/tendermint/tendermint/config" nm "github.com/tendermint/tendermint/node" ) @@ -26,7 +27,9 @@ func main() { cmd.TestnetFilesCmd, cmd.ShowNodeIDCmd, cmd.GenNodeKeyCmd, - cmd.VersionCmd) + cmd.VersionCmd, + debug.DebugCmd, + ) // NOTE: // Users wishing to: diff --git a/docs/tendermint-core/running-in-production.md b/docs/tendermint-core/running-in-production.md index f1954cbea..6daa1f16e 100644 --- a/docs/tendermint-core/running-in-production.md +++ b/docs/tendermint-core/running-in-production.md @@ -141,6 +141,19 @@ returns just the votes seen at the current height. - [StackOverflow questions](https://stackoverflow.com/questions/tagged/tendermint) +### Debug Utility + +Tendermint also ships with a `debug` sub-command that allows you to kill a live +Tendermint process while collecting useful information in a compressed archive +such as the configuration used, consensus state, network state, the node' status, +the WAL, and even the stacktrace of the process before exit. These files can be +useful to examine when debugging a faulty Tendermint process. + +In addition, the `debug` sub-command also allows you to dump debugging data into +compressed archives at a regular interval. These archives contain the goroutine +and heap profiles in addition to the consensus state, network info, node status, +and even the WAL. + ## Monitoring Tendermint Each Tendermint instance has a standard `/health` RPC endpoint, which