## Issue Implement a new subcommand: tendermint debug. This subcommand itself has two subcommands: $ tendermint debug kill <pid> </path/to/out.zip> --home=</path/to/app.d> Writes debug info into a compressed archive. The archive will contain the following: ├── config.toml ├── consensus_state.json ├── net_info.json ├── stacktrace.out ├── status.json └── wal The Tendermint process will be killed. $ tendermint debug dump </path/to/out> --home=</path/to/app.d> This command will perform similar to kill except it only polls the node and dumps debugging data every frequency seconds to a compressed archive under a given destination directory. Each archive will contain: ├── consensus_state.json ├── goroutine.out ├── heap.out ├── net_info.json ├── status.json └── wal Note: goroutine.out and heap.out will only be written if a profile address is provided and is operational. This command is blocking and will log any error. replaces: #3327 closes: #3249 ## Commits: * Implement debug tool command stubs * Implement net getters and zip logic * Update zip dir API and add home flag * Add simple godocs for kill aux functions * Move IO util to new file and implement copy WAL func * Implement copy config function * Implement killProc * Remove debug fmt * Validate output file input * Direct STDERR to file * Godoc updates * Sleep prior to killing tail proc * Minor cleanup of godocs * Move debug command and add make target * Rename command handler function * Add example to command long descr * Move current implementation to cmd/tendermint/commands/debug * Update kill cmd long description * Implement dump command * Add pending log entry * Add gosec nolint * Add error check for Mkdir * Add os.IsNotExist(err) * Add to debugging section in running-in-prod docpull/4251/head
@ -0,0 +1,41 @@ | |||
package debug | |||
import ( | |||
"os" | |||
"github.com/spf13/cobra" | |||
"github.com/tendermint/tendermint/libs/log" | |||
) | |||
var ( | |||
nodeRPCAddr string | |||
profAddr string | |||
frequency uint | |||
flagNodeRPCAddr = "rpc-laddr" | |||
flagProfAddr = "pprof-laddr" | |||
flagFrequency = "frequency" | |||
logger = log.NewTMLogger(log.NewSyncWriter(os.Stdout)) | |||
) | |||
// DebugCmd defines the root command containing subcommands that assist in | |||
// debugging running Tendermint processes. | |||
var DebugCmd = &cobra.Command{ | |||
Use: "debug", | |||
Short: "A utility to kill or watch a Tendermint process while aggregating debugging data", | |||
} | |||
func init() { | |||
DebugCmd.PersistentFlags().SortFlags = true | |||
DebugCmd.PersistentFlags().StringVar( | |||
&nodeRPCAddr, | |||
flagNodeRPCAddr, | |||
"tcp://localhost:26657", | |||
"The Tendermint node's RPC address (<host>:<port>)", | |||
) | |||
DebugCmd.AddCommand(killCmd) | |||
DebugCmd.AddCommand(dumpCmd) | |||
} |
@ -0,0 +1,130 @@ | |||
package debug | |||
import ( | |||
"fmt" | |||
"io/ioutil" | |||
"os" | |||
"path/filepath" | |||
"time" | |||
"github.com/pkg/errors" | |||
"github.com/spf13/cobra" | |||
"github.com/spf13/viper" | |||
cfg "github.com/tendermint/tendermint/config" | |||
"github.com/tendermint/tendermint/libs/cli" | |||
rpcclient "github.com/tendermint/tendermint/rpc/client" | |||
) | |||
var dumpCmd = &cobra.Command{ | |||
Use: "dump [output-directory]", | |||
Short: "Continuously poll a Tendermint process and dump debugging data into a single location", | |||
Long: `Continuously poll a Tendermint process and dump debugging data into a single | |||
location at a specified frequency. At each frequency interval, an archived and compressed | |||
file will contain node debugging information including the goroutine and heap profiles | |||
if enabled.`, | |||
Args: cobra.ExactArgs(1), | |||
RunE: dumpCmdHandler, | |||
} | |||
func init() { | |||
dumpCmd.Flags().UintVar( | |||
&frequency, | |||
flagFrequency, | |||
30, | |||
"The frequency (seconds) in which to poll, aggregate and dump Tendermint debug data", | |||
) | |||
dumpCmd.Flags().StringVar( | |||
&profAddr, | |||
flagProfAddr, | |||
"", | |||
"The profiling server address (<host>:<port>)", | |||
) | |||
} | |||
func dumpCmdHandler(_ *cobra.Command, args []string) error { | |||
outDir := args[0] | |||
if outDir == "" { | |||
return errors.New("invalid output directory") | |||
} | |||
if frequency == 0 { | |||
return errors.New("frequency must be positive") | |||
} | |||
if _, err := os.Stat(outDir); os.IsNotExist(err) { | |||
if err := os.Mkdir(outDir, os.ModePerm); err != nil { | |||
return errors.Wrap(err, "failed to create output directory") | |||
} | |||
} | |||
rpc := rpcclient.NewHTTP(nodeRPCAddr, "/websocket") | |||
home := viper.GetString(cli.HomeFlag) | |||
conf := cfg.DefaultConfig() | |||
conf = conf.SetRoot(home) | |||
cfg.EnsureRoot(conf.RootDir) | |||
dumpDebugData(outDir, conf, rpc) | |||
ticker := time.NewTicker(time.Duration(frequency) * time.Second) | |||
for range ticker.C { | |||
dumpDebugData(outDir, conf, rpc) | |||
} | |||
return nil | |||
} | |||
func dumpDebugData(outDir string, conf *cfg.Config, rpc *rpcclient.HTTP) { | |||
start := time.Now().UTC() | |||
tmpDir, err := ioutil.TempDir(outDir, "tendermint_debug_tmp") | |||
if err != nil { | |||
logger.Error("failed to create temporary directory", "dir", tmpDir, "error", err) | |||
return | |||
} | |||
defer os.RemoveAll(tmpDir) | |||
logger.Info("getting node status...") | |||
if err := dumpStatus(rpc, tmpDir, "status.json"); err != nil { | |||
logger.Error("failed to dump node status", "error", err) | |||
return | |||
} | |||
logger.Info("getting node network info...") | |||
if err := dumpNetInfo(rpc, tmpDir, "net_info.json"); err != nil { | |||
logger.Error("failed to dump node network info", "error", err) | |||
return | |||
} | |||
logger.Info("getting node consensus state...") | |||
if err := dumpConsensusState(rpc, tmpDir, "consensus_state.json"); err != nil { | |||
logger.Error("failed to dump node consensus state", "error", err) | |||
return | |||
} | |||
logger.Info("copying node WAL...") | |||
if err := copyWAL(conf, tmpDir); err != nil { | |||
logger.Error("failed to copy node WAL", "error", err) | |||
return | |||
} | |||
if profAddr != "" { | |||
logger.Info("getting node goroutine profile...") | |||
if err := dumpProfile(tmpDir, profAddr, "goroutine", 2); err != nil { | |||
logger.Error("failed to dump goroutine profile", "error", err) | |||
return | |||
} | |||
logger.Info("getting node heap profile...") | |||
if err := dumpProfile(tmpDir, profAddr, "heap", 2); err != nil { | |||
logger.Error("failed to dump heap profile", "error", err) | |||
return | |||
} | |||
} | |||
outFile := filepath.Join(outDir, fmt.Sprintf("%s.zip", start.Format(time.Stamp))) | |||
if err := zipDir(tmpDir, outFile); err != nil { | |||
logger.Error("failed to create and compress archive", "file", outFile, "error", err) | |||
} | |||
} |
@ -0,0 +1,117 @@ | |||
package debug | |||
import ( | |||
"archive/zip" | |||
"encoding/json" | |||
"io" | |||
"io/ioutil" | |||
"os" | |||
"path" | |||
"path/filepath" | |||
"strings" | |||
"github.com/pkg/errors" | |||
) | |||
// zipDir zips all the contents found in src, including both files and | |||
// directories, into a destination file dest. It returns an error upon failure. | |||
// It assumes src is a directory. | |||
func zipDir(src, dest string) error { | |||
zipFile, err := os.Create(dest) | |||
if err != nil { | |||
return err | |||
} | |||
defer zipFile.Close() | |||
zipWriter := zip.NewWriter(zipFile) | |||
defer zipWriter.Close() | |||
dirName := filepath.Base(dest) | |||
baseDir := strings.TrimSuffix(dirName, filepath.Ext(dirName)) | |||
filepath.Walk(src, func(path string, info os.FileInfo, err error) error { | |||
if err != nil { | |||
return err | |||
} | |||
header, err := zip.FileInfoHeader(info) | |||
if err != nil { | |||
return err | |||
} | |||
// Each execution of this utility on a Tendermint process will result in a | |||
// unique file. | |||
header.Name = filepath.Join(baseDir, strings.TrimPrefix(path, src)) | |||
// Handle cases where the content to be zipped is a file or a directory, | |||
// where a directory must have a '/' suffix. | |||
if info.IsDir() { | |||
header.Name += "/" | |||
} else { | |||
header.Method = zip.Deflate | |||
} | |||
headerWriter, err := zipWriter.CreateHeader(header) | |||
if err != nil { | |||
return err | |||
} | |||
if info.IsDir() { | |||
return nil | |||
} | |||
file, err := os.Open(path) | |||
if err != nil { | |||
return err | |||
} | |||
defer file.Close() | |||
_, err = io.Copy(headerWriter, file) | |||
return err | |||
}) | |||
return nil | |||
} | |||
// copyFile copies a file from src to dest and returns an error upon failure. The | |||
// copied file retains the source file's permissions. | |||
func copyFile(src, dest string) error { | |||
if _, err := os.Stat(src); os.IsNotExist(err) { | |||
return err | |||
} | |||
srcFile, err := os.Open(src) | |||
if err != nil { | |||
return err | |||
} | |||
defer srcFile.Close() | |||
destFile, err := os.Create(dest) | |||
if err != nil { | |||
return err | |||
} | |||
defer destFile.Close() | |||
if _, err = io.Copy(destFile, srcFile); err != nil { | |||
return err | |||
} | |||
srcInfo, err := os.Stat(src) | |||
if err != nil { | |||
return err | |||
} | |||
return os.Chmod(dest, srcInfo.Mode()) | |||
} | |||
// writeStateToFile pretty JSON encodes an object and writes it to file composed | |||
// of dir and filename. It returns an error upon failure to encode or write to | |||
// file. | |||
func writeStateJSONToFile(state interface{}, dir, filename string) error { | |||
stateJSON, err := json.MarshalIndent(state, "", " ") | |||
if err != nil { | |||
return errors.Wrap(err, "failed to encode state dump") | |||
} | |||
return ioutil.WriteFile(path.Join(dir, filename), stateJSON, os.ModePerm) | |||
} |
@ -0,0 +1,147 @@ | |||
package debug | |||
import ( | |||
"fmt" | |||
"io/ioutil" | |||
"os" | |||
"os/exec" | |||
"path/filepath" | |||
"strconv" | |||
"syscall" | |||
"time" | |||
"github.com/pkg/errors" | |||
"github.com/spf13/cobra" | |||
"github.com/spf13/viper" | |||
cfg "github.com/tendermint/tendermint/config" | |||
"github.com/tendermint/tendermint/libs/cli" | |||
rpcclient "github.com/tendermint/tendermint/rpc/client" | |||
) | |||
var killCmd = &cobra.Command{ | |||
Use: "kill [pid] [compressed-output-file]", | |||
Short: "Kill a Tendermint process while aggregating and packaging debugging data", | |||
Long: `Kill a Tendermint process while also aggregating Tendermint process data | |||
such as the latest node state, including consensus and networking state, | |||
go-routine state, and the node's WAL and config information. This aggregated data | |||
is packaged into a compressed archive. | |||
Example: | |||
$ tendermint debug 34255 /path/to/tm-debug.zip`, | |||
Args: cobra.ExactArgs(2), | |||
RunE: killCmdHandler, | |||
} | |||
func killCmdHandler(cmd *cobra.Command, args []string) error { | |||
pid, err := strconv.ParseUint(args[0], 10, 64) | |||
if err != nil { | |||
return err | |||
} | |||
outFile := args[1] | |||
if outFile == "" { | |||
return errors.New("invalid output file") | |||
} | |||
rpc := rpcclient.NewHTTP(nodeRPCAddr, "/websocket") | |||
home := viper.GetString(cli.HomeFlag) | |||
conf := cfg.DefaultConfig() | |||
conf = conf.SetRoot(home) | |||
cfg.EnsureRoot(conf.RootDir) | |||
// Create a temporary directory which will contain all the state dumps and | |||
// relevant files and directories that will be compressed into a file. | |||
tmpDir, err := ioutil.TempDir(os.TempDir(), "tendermint_debug_tmp") | |||
if err != nil { | |||
return errors.Wrap(err, "failed to create temporary directory") | |||
} | |||
defer os.RemoveAll(tmpDir) | |||
logger.Info("getting node status...") | |||
if err := dumpStatus(rpc, tmpDir, "status.json"); err != nil { | |||
return err | |||
} | |||
logger.Info("getting node network info...") | |||
if err := dumpNetInfo(rpc, tmpDir, "net_info.json"); err != nil { | |||
return err | |||
} | |||
logger.Info("getting node consensus state...") | |||
if err := dumpConsensusState(rpc, tmpDir, "consensus_state.json"); err != nil { | |||
return err | |||
} | |||
logger.Info("copying node WAL...") | |||
if err := copyWAL(conf, tmpDir); err != nil { | |||
return err | |||
} | |||
logger.Info("copying node configuration...") | |||
if err := copyConfig(home, tmpDir); err != nil { | |||
return err | |||
} | |||
logger.Info("killing Tendermint process") | |||
if err := killProc(pid, tmpDir); err != nil { | |||
return err | |||
} | |||
logger.Info("archiving and compressing debug directory...") | |||
return zipDir(tmpDir, outFile) | |||
} | |||
// killProc attempts to kill the Tendermint process with a given PID with an | |||
// ABORT signal which should result in a goroutine stacktrace. The PID's STDERR | |||
// is tailed and piped to a file under the directory dir. An error is returned | |||
// if the output file cannot be created or the tail command cannot be started. | |||
// An error is not returned if any subsequent syscall fails. | |||
func killProc(pid uint64, dir string) error { | |||
// pipe STDERR output from tailing the Tendermint process to a file | |||
// | |||
// NOTE: This will only work on UNIX systems. | |||
cmd := exec.Command("tail", "-f", fmt.Sprintf("/proc/%d/fd/2", pid)) // nolint: gosec | |||
outFile, err := os.Create(filepath.Join(dir, "stacktrace.out")) | |||
if err != nil { | |||
return err | |||
} | |||
defer outFile.Close() | |||
cmd.Stdout = outFile | |||
cmd.Stderr = outFile | |||
if err := cmd.Start(); err != nil { | |||
return err | |||
} | |||
// kill the underlying Tendermint process and subsequent tailing process | |||
go func() { | |||
// Killing the Tendermint process with the '-ABRT|-6' signal will result in | |||
// a goroutine stacktrace. | |||
if err := syscall.Kill(int(pid), syscall.SIGABRT); err != nil { | |||
fmt.Fprintf(os.Stderr, "failed to kill Tendermint process: %s", err) | |||
} | |||
// allow some time to allow the Tendermint process to be killed | |||
// | |||
// TODO: We should 'wait' for a kill to succeed (e.g. poll for PID until it | |||
// cannot be found). Regardless, this should be ample time. | |||
time.Sleep(5 * time.Second) | |||
if err := cmd.Process.Kill(); err != nil { | |||
fmt.Fprintf(os.Stderr, "failed to kill Tendermint process output redirection: %s", err) | |||
} | |||
}() | |||
if err := cmd.Wait(); err != nil { | |||
// only return an error not invoked by a manual kill | |||
if _, ok := err.(*exec.ExitError); !ok { | |||
return err | |||
} | |||
} | |||
return nil | |||
} |
@ -0,0 +1,82 @@ | |||
package debug | |||
import ( | |||
"fmt" | |||
"io/ioutil" | |||
"net/http" | |||
"os" | |||
"path" | |||
"path/filepath" | |||
"github.com/pkg/errors" | |||
cfg "github.com/tendermint/tendermint/config" | |||
rpcclient "github.com/tendermint/tendermint/rpc/client" | |||
) | |||
// dumpStatus gets node status state dump from the Tendermint RPC and writes it | |||
// to file. It returns an error upon failure. | |||
func dumpStatus(rpc *rpcclient.HTTP, dir, filename string) error { | |||
status, err := rpc.Status() | |||
if err != nil { | |||
return errors.Wrap(err, "failed to get node status") | |||
} | |||
return writeStateJSONToFile(status, dir, filename) | |||
} | |||
// dumpNetInfo gets network information state dump from the Tendermint RPC and | |||
// writes it to file. It returns an error upon failure. | |||
func dumpNetInfo(rpc *rpcclient.HTTP, dir, filename string) error { | |||
netInfo, err := rpc.NetInfo() | |||
if err != nil { | |||
return errors.Wrap(err, "failed to get node network information") | |||
} | |||
return writeStateJSONToFile(netInfo, dir, filename) | |||
} | |||
// dumpConsensusState gets consensus state dump from the Tendermint RPC and | |||
// writes it to file. It returns an error upon failure. | |||
func dumpConsensusState(rpc *rpcclient.HTTP, dir, filename string) error { | |||
consDump, err := rpc.DumpConsensusState() | |||
if err != nil { | |||
return errors.Wrap(err, "failed to get node consensus dump") | |||
} | |||
return writeStateJSONToFile(consDump, dir, filename) | |||
} | |||
// copyWAL copies the Tendermint node's WAL file. It returns an error if the | |||
// WAL file cannot be read or copied. | |||
func copyWAL(conf *cfg.Config, dir string) error { | |||
walPath := conf.Consensus.WalFile() | |||
walFile := filepath.Base(walPath) | |||
return copyFile(walPath, filepath.Join(dir, walFile)) | |||
} | |||
// copyConfig copies the Tendermint node's config file. It returns an error if | |||
// the config file cannot be read or copied. | |||
func copyConfig(home, dir string) error { | |||
configFile := "config.toml" | |||
configPath := filepath.Join(home, "config", configFile) | |||
return copyFile(configPath, filepath.Join(dir, configFile)) | |||
} | |||
func dumpProfile(dir, addr, profile string, debug int) error { | |||
endpoint := fmt.Sprintf("%s/debug/pprof/%s?debug=%d", addr, profile, debug) | |||
resp, err := http.Get(endpoint) // nolint: gosec | |||
if err != nil { | |||
return errors.Wrapf(err, "failed to query for %s profile", profile) | |||
} | |||
defer resp.Body.Close() | |||
body, err := ioutil.ReadAll(resp.Body) | |||
if err != nil { | |||
return errors.Wrapf(err, "failed to read %s profile response body", profile) | |||
} | |||
return ioutil.WriteFile(path.Join(dir, fmt.Sprintf("%s.out", profile)), body, os.ModePerm) | |||
} |