Browse Source

cli: debug sub-command (#4227)

## Issue

Implement a new subcommand: tendermint debug. This subcommand itself has two subcommands:

    $ tendermint debug kill <pid> </path/to/out.zip> --home=</path/to/app.d>

Writes debug info into a compressed archive. The archive will contain the following:

├── config.toml
├── consensus_state.json
├── net_info.json
├── stacktrace.out
├── status.json
└── wal

The Tendermint process will be killed.

    $ tendermint debug dump </path/to/out> --home=</path/to/app.d>

This command will perform similar to kill except it only polls the node and dumps debugging data every frequency seconds to a compressed archive under a given destination directory. Each archive will contain:

├── consensus_state.json
├── goroutine.out
├── heap.out
├── net_info.json
├── status.json
└── wal

Note:

    goroutine.out and heap.out will only be written if a profile address is provided and is operational.
    This command is blocking and will log any error.

replaces: #3327
closes: #3249

## Commits:

* Implement debug tool command stubs

* Implement net getters and zip logic

* Update zip dir API and add home flag

* Add simple godocs for kill aux functions

* Move IO util to new file and implement copy WAL func

* Implement copy config function

* Implement killProc

* Remove debug fmt

* Validate output file input

* Direct STDERR to file

* Godoc updates

* Sleep prior to killing tail proc

* Minor cleanup of godocs

* Move debug command and add make target

* Rename command handler function

* Add example to command long descr

* Move current implementation to cmd/tendermint/commands/debug

* Update kill cmd long description

* Implement dump command

* Add pending log entry

* Add gosec nolint

* Add error check for Mkdir

* Add os.IsNotExist(err)

* Add to debugging section in running-in-prod doc
pull/4251/head
Alexander Bezobchuk 5 years ago
committed by Anton Kaliaev
parent
commit
4bfec797e8
8 changed files with 537 additions and 1 deletions
  1. +3
    -0
      CHANGELOG_PENDING.md
  2. +41
    -0
      cmd/tendermint/commands/debug/debug.go
  3. +130
    -0
      cmd/tendermint/commands/debug/dump.go
  4. +117
    -0
      cmd/tendermint/commands/debug/io.go
  5. +147
    -0
      cmd/tendermint/commands/debug/kill.go
  6. +82
    -0
      cmd/tendermint/commands/debug/util.go
  7. +4
    -1
      cmd/tendermint/main.go
  8. +13
    -0
      docs/tendermint-core/running-in-production.md

+ 3
- 0
CHANGELOG_PENDING.md View File

@ -106,6 +106,9 @@ program](https://hackerone.com/tendermint).
### FEATURES:
- [p2p] \#4053 Add `unconditional_peer_ids` and `persistent_peers_max_dial_period` config variables (see ADR-050) (@dongsam)
- [tools] [\#4227](https://github.com/tendermint/tendermint/pull/4227) Implement `tendermint debug kill` and
`tendermint debug dump` commands for Tendermint node debugging functionality. See `--help` in both
commands for further documentation and usage.
- [cli] \#4234 Add `--db_backend and --db_dir` flags (@princesinha19)
- [cli] \#4113 Add optional `--genesis_hash` flag to check genesis hash upon startup
- [config] \#3831 Add support for [RocksDB](https://rocksdb.org/) (@Stumble)


+ 41
- 0
cmd/tendermint/commands/debug/debug.go View File

@ -0,0 +1,41 @@
package debug
import (
"os"
"github.com/spf13/cobra"
"github.com/tendermint/tendermint/libs/log"
)
var (
nodeRPCAddr string
profAddr string
frequency uint
flagNodeRPCAddr = "rpc-laddr"
flagProfAddr = "pprof-laddr"
flagFrequency = "frequency"
logger = log.NewTMLogger(log.NewSyncWriter(os.Stdout))
)
// DebugCmd defines the root command containing subcommands that assist in
// debugging running Tendermint processes.
var DebugCmd = &cobra.Command{
Use: "debug",
Short: "A utility to kill or watch a Tendermint process while aggregating debugging data",
}
func init() {
DebugCmd.PersistentFlags().SortFlags = true
DebugCmd.PersistentFlags().StringVar(
&nodeRPCAddr,
flagNodeRPCAddr,
"tcp://localhost:26657",
"The Tendermint node's RPC address (<host>:<port>)",
)
DebugCmd.AddCommand(killCmd)
DebugCmd.AddCommand(dumpCmd)
}

+ 130
- 0
cmd/tendermint/commands/debug/dump.go View File

@ -0,0 +1,130 @@
package debug
import (
"fmt"
"io/ioutil"
"os"
"path/filepath"
"time"
"github.com/pkg/errors"
"github.com/spf13/cobra"
"github.com/spf13/viper"
cfg "github.com/tendermint/tendermint/config"
"github.com/tendermint/tendermint/libs/cli"
rpcclient "github.com/tendermint/tendermint/rpc/client"
)
var dumpCmd = &cobra.Command{
Use: "dump [output-directory]",
Short: "Continuously poll a Tendermint process and dump debugging data into a single location",
Long: `Continuously poll a Tendermint process and dump debugging data into a single
location at a specified frequency. At each frequency interval, an archived and compressed
file will contain node debugging information including the goroutine and heap profiles
if enabled.`,
Args: cobra.ExactArgs(1),
RunE: dumpCmdHandler,
}
func init() {
dumpCmd.Flags().UintVar(
&frequency,
flagFrequency,
30,
"The frequency (seconds) in which to poll, aggregate and dump Tendermint debug data",
)
dumpCmd.Flags().StringVar(
&profAddr,
flagProfAddr,
"",
"The profiling server address (<host>:<port>)",
)
}
func dumpCmdHandler(_ *cobra.Command, args []string) error {
outDir := args[0]
if outDir == "" {
return errors.New("invalid output directory")
}
if frequency == 0 {
return errors.New("frequency must be positive")
}
if _, err := os.Stat(outDir); os.IsNotExist(err) {
if err := os.Mkdir(outDir, os.ModePerm); err != nil {
return errors.Wrap(err, "failed to create output directory")
}
}
rpc := rpcclient.NewHTTP(nodeRPCAddr, "/websocket")
home := viper.GetString(cli.HomeFlag)
conf := cfg.DefaultConfig()
conf = conf.SetRoot(home)
cfg.EnsureRoot(conf.RootDir)
dumpDebugData(outDir, conf, rpc)
ticker := time.NewTicker(time.Duration(frequency) * time.Second)
for range ticker.C {
dumpDebugData(outDir, conf, rpc)
}
return nil
}
func dumpDebugData(outDir string, conf *cfg.Config, rpc *rpcclient.HTTP) {
start := time.Now().UTC()
tmpDir, err := ioutil.TempDir(outDir, "tendermint_debug_tmp")
if err != nil {
logger.Error("failed to create temporary directory", "dir", tmpDir, "error", err)
return
}
defer os.RemoveAll(tmpDir)
logger.Info("getting node status...")
if err := dumpStatus(rpc, tmpDir, "status.json"); err != nil {
logger.Error("failed to dump node status", "error", err)
return
}
logger.Info("getting node network info...")
if err := dumpNetInfo(rpc, tmpDir, "net_info.json"); err != nil {
logger.Error("failed to dump node network info", "error", err)
return
}
logger.Info("getting node consensus state...")
if err := dumpConsensusState(rpc, tmpDir, "consensus_state.json"); err != nil {
logger.Error("failed to dump node consensus state", "error", err)
return
}
logger.Info("copying node WAL...")
if err := copyWAL(conf, tmpDir); err != nil {
logger.Error("failed to copy node WAL", "error", err)
return
}
if profAddr != "" {
logger.Info("getting node goroutine profile...")
if err := dumpProfile(tmpDir, profAddr, "goroutine", 2); err != nil {
logger.Error("failed to dump goroutine profile", "error", err)
return
}
logger.Info("getting node heap profile...")
if err := dumpProfile(tmpDir, profAddr, "heap", 2); err != nil {
logger.Error("failed to dump heap profile", "error", err)
return
}
}
outFile := filepath.Join(outDir, fmt.Sprintf("%s.zip", start.Format(time.Stamp)))
if err := zipDir(tmpDir, outFile); err != nil {
logger.Error("failed to create and compress archive", "file", outFile, "error", err)
}
}

+ 117
- 0
cmd/tendermint/commands/debug/io.go View File

@ -0,0 +1,117 @@
package debug
import (
"archive/zip"
"encoding/json"
"io"
"io/ioutil"
"os"
"path"
"path/filepath"
"strings"
"github.com/pkg/errors"
)
// zipDir zips all the contents found in src, including both files and
// directories, into a destination file dest. It returns an error upon failure.
// It assumes src is a directory.
func zipDir(src, dest string) error {
zipFile, err := os.Create(dest)
if err != nil {
return err
}
defer zipFile.Close()
zipWriter := zip.NewWriter(zipFile)
defer zipWriter.Close()
dirName := filepath.Base(dest)
baseDir := strings.TrimSuffix(dirName, filepath.Ext(dirName))
filepath.Walk(src, func(path string, info os.FileInfo, err error) error {
if err != nil {
return err
}
header, err := zip.FileInfoHeader(info)
if err != nil {
return err
}
// Each execution of this utility on a Tendermint process will result in a
// unique file.
header.Name = filepath.Join(baseDir, strings.TrimPrefix(path, src))
// Handle cases where the content to be zipped is a file or a directory,
// where a directory must have a '/' suffix.
if info.IsDir() {
header.Name += "/"
} else {
header.Method = zip.Deflate
}
headerWriter, err := zipWriter.CreateHeader(header)
if err != nil {
return err
}
if info.IsDir() {
return nil
}
file, err := os.Open(path)
if err != nil {
return err
}
defer file.Close()
_, err = io.Copy(headerWriter, file)
return err
})
return nil
}
// copyFile copies a file from src to dest and returns an error upon failure. The
// copied file retains the source file's permissions.
func copyFile(src, dest string) error {
if _, err := os.Stat(src); os.IsNotExist(err) {
return err
}
srcFile, err := os.Open(src)
if err != nil {
return err
}
defer srcFile.Close()
destFile, err := os.Create(dest)
if err != nil {
return err
}
defer destFile.Close()
if _, err = io.Copy(destFile, srcFile); err != nil {
return err
}
srcInfo, err := os.Stat(src)
if err != nil {
return err
}
return os.Chmod(dest, srcInfo.Mode())
}
// writeStateToFile pretty JSON encodes an object and writes it to file composed
// of dir and filename. It returns an error upon failure to encode or write to
// file.
func writeStateJSONToFile(state interface{}, dir, filename string) error {
stateJSON, err := json.MarshalIndent(state, "", " ")
if err != nil {
return errors.Wrap(err, "failed to encode state dump")
}
return ioutil.WriteFile(path.Join(dir, filename), stateJSON, os.ModePerm)
}

+ 147
- 0
cmd/tendermint/commands/debug/kill.go View File

@ -0,0 +1,147 @@
package debug
import (
"fmt"
"io/ioutil"
"os"
"os/exec"
"path/filepath"
"strconv"
"syscall"
"time"
"github.com/pkg/errors"
"github.com/spf13/cobra"
"github.com/spf13/viper"
cfg "github.com/tendermint/tendermint/config"
"github.com/tendermint/tendermint/libs/cli"
rpcclient "github.com/tendermint/tendermint/rpc/client"
)
var killCmd = &cobra.Command{
Use: "kill [pid] [compressed-output-file]",
Short: "Kill a Tendermint process while aggregating and packaging debugging data",
Long: `Kill a Tendermint process while also aggregating Tendermint process data
such as the latest node state, including consensus and networking state,
go-routine state, and the node's WAL and config information. This aggregated data
is packaged into a compressed archive.
Example:
$ tendermint debug 34255 /path/to/tm-debug.zip`,
Args: cobra.ExactArgs(2),
RunE: killCmdHandler,
}
func killCmdHandler(cmd *cobra.Command, args []string) error {
pid, err := strconv.ParseUint(args[0], 10, 64)
if err != nil {
return err
}
outFile := args[1]
if outFile == "" {
return errors.New("invalid output file")
}
rpc := rpcclient.NewHTTP(nodeRPCAddr, "/websocket")
home := viper.GetString(cli.HomeFlag)
conf := cfg.DefaultConfig()
conf = conf.SetRoot(home)
cfg.EnsureRoot(conf.RootDir)
// Create a temporary directory which will contain all the state dumps and
// relevant files and directories that will be compressed into a file.
tmpDir, err := ioutil.TempDir(os.TempDir(), "tendermint_debug_tmp")
if err != nil {
return errors.Wrap(err, "failed to create temporary directory")
}
defer os.RemoveAll(tmpDir)
logger.Info("getting node status...")
if err := dumpStatus(rpc, tmpDir, "status.json"); err != nil {
return err
}
logger.Info("getting node network info...")
if err := dumpNetInfo(rpc, tmpDir, "net_info.json"); err != nil {
return err
}
logger.Info("getting node consensus state...")
if err := dumpConsensusState(rpc, tmpDir, "consensus_state.json"); err != nil {
return err
}
logger.Info("copying node WAL...")
if err := copyWAL(conf, tmpDir); err != nil {
return err
}
logger.Info("copying node configuration...")
if err := copyConfig(home, tmpDir); err != nil {
return err
}
logger.Info("killing Tendermint process")
if err := killProc(pid, tmpDir); err != nil {
return err
}
logger.Info("archiving and compressing debug directory...")
return zipDir(tmpDir, outFile)
}
// killProc attempts to kill the Tendermint process with a given PID with an
// ABORT signal which should result in a goroutine stacktrace. The PID's STDERR
// is tailed and piped to a file under the directory dir. An error is returned
// if the output file cannot be created or the tail command cannot be started.
// An error is not returned if any subsequent syscall fails.
func killProc(pid uint64, dir string) error {
// pipe STDERR output from tailing the Tendermint process to a file
//
// NOTE: This will only work on UNIX systems.
cmd := exec.Command("tail", "-f", fmt.Sprintf("/proc/%d/fd/2", pid)) // nolint: gosec
outFile, err := os.Create(filepath.Join(dir, "stacktrace.out"))
if err != nil {
return err
}
defer outFile.Close()
cmd.Stdout = outFile
cmd.Stderr = outFile
if err := cmd.Start(); err != nil {
return err
}
// kill the underlying Tendermint process and subsequent tailing process
go func() {
// Killing the Tendermint process with the '-ABRT|-6' signal will result in
// a goroutine stacktrace.
if err := syscall.Kill(int(pid), syscall.SIGABRT); err != nil {
fmt.Fprintf(os.Stderr, "failed to kill Tendermint process: %s", err)
}
// allow some time to allow the Tendermint process to be killed
//
// TODO: We should 'wait' for a kill to succeed (e.g. poll for PID until it
// cannot be found). Regardless, this should be ample time.
time.Sleep(5 * time.Second)
if err := cmd.Process.Kill(); err != nil {
fmt.Fprintf(os.Stderr, "failed to kill Tendermint process output redirection: %s", err)
}
}()
if err := cmd.Wait(); err != nil {
// only return an error not invoked by a manual kill
if _, ok := err.(*exec.ExitError); !ok {
return err
}
}
return nil
}

+ 82
- 0
cmd/tendermint/commands/debug/util.go View File

@ -0,0 +1,82 @@
package debug
import (
"fmt"
"io/ioutil"
"net/http"
"os"
"path"
"path/filepath"
"github.com/pkg/errors"
cfg "github.com/tendermint/tendermint/config"
rpcclient "github.com/tendermint/tendermint/rpc/client"
)
// dumpStatus gets node status state dump from the Tendermint RPC and writes it
// to file. It returns an error upon failure.
func dumpStatus(rpc *rpcclient.HTTP, dir, filename string) error {
status, err := rpc.Status()
if err != nil {
return errors.Wrap(err, "failed to get node status")
}
return writeStateJSONToFile(status, dir, filename)
}
// dumpNetInfo gets network information state dump from the Tendermint RPC and
// writes it to file. It returns an error upon failure.
func dumpNetInfo(rpc *rpcclient.HTTP, dir, filename string) error {
netInfo, err := rpc.NetInfo()
if err != nil {
return errors.Wrap(err, "failed to get node network information")
}
return writeStateJSONToFile(netInfo, dir, filename)
}
// dumpConsensusState gets consensus state dump from the Tendermint RPC and
// writes it to file. It returns an error upon failure.
func dumpConsensusState(rpc *rpcclient.HTTP, dir, filename string) error {
consDump, err := rpc.DumpConsensusState()
if err != nil {
return errors.Wrap(err, "failed to get node consensus dump")
}
return writeStateJSONToFile(consDump, dir, filename)
}
// copyWAL copies the Tendermint node's WAL file. It returns an error if the
// WAL file cannot be read or copied.
func copyWAL(conf *cfg.Config, dir string) error {
walPath := conf.Consensus.WalFile()
walFile := filepath.Base(walPath)
return copyFile(walPath, filepath.Join(dir, walFile))
}
// copyConfig copies the Tendermint node's config file. It returns an error if
// the config file cannot be read or copied.
func copyConfig(home, dir string) error {
configFile := "config.toml"
configPath := filepath.Join(home, "config", configFile)
return copyFile(configPath, filepath.Join(dir, configFile))
}
func dumpProfile(dir, addr, profile string, debug int) error {
endpoint := fmt.Sprintf("%s/debug/pprof/%s?debug=%d", addr, profile, debug)
resp, err := http.Get(endpoint) // nolint: gosec
if err != nil {
return errors.Wrapf(err, "failed to query for %s profile", profile)
}
defer resp.Body.Close()
body, err := ioutil.ReadAll(resp.Body)
if err != nil {
return errors.Wrapf(err, "failed to read %s profile response body", profile)
}
return ioutil.WriteFile(path.Join(dir, fmt.Sprintf("%s.out", profile)), body, os.ModePerm)
}

+ 4
- 1
cmd/tendermint/main.go View File

@ -7,6 +7,7 @@ import (
"github.com/tendermint/tendermint/libs/cli"
cmd "github.com/tendermint/tendermint/cmd/tendermint/commands"
"github.com/tendermint/tendermint/cmd/tendermint/commands/debug"
cfg "github.com/tendermint/tendermint/config"
nm "github.com/tendermint/tendermint/node"
)
@ -26,7 +27,9 @@ func main() {
cmd.TestnetFilesCmd,
cmd.ShowNodeIDCmd,
cmd.GenNodeKeyCmd,
cmd.VersionCmd)
cmd.VersionCmd,
debug.DebugCmd,
)
// NOTE:
// Users wishing to:


+ 13
- 0
docs/tendermint-core/running-in-production.md View File

@ -141,6 +141,19 @@ returns just the votes seen at the current height.
- [StackOverflow
questions](https://stackoverflow.com/questions/tagged/tendermint)
### Debug Utility
Tendermint also ships with a `debug` sub-command that allows you to kill a live
Tendermint process while collecting useful information in a compressed archive
such as the configuration used, consensus state, network state, the node' status,
the WAL, and even the stacktrace of the process before exit. These files can be
useful to examine when debugging a faulty Tendermint process.
In addition, the `debug` sub-command also allows you to dump debugging data into
compressed archives at a regular interval. These archives contain the goroutine
and heap profiles in addition to the consensus state, network info, node status,
and even the WAL.
## Monitoring Tendermint
Each Tendermint instance has a standard `/health` RPC endpoint, which


Loading…
Cancel
Save