abci: localClient improvements & bugfixes & pubsub Unsubscribe issues (#2748)
* use READ lock/unlock in ConsensusState#GetLastHeight
Refs #2721
* do not use defers when there's no need
* fix peer formatting (output its address instead of the pointer)
```
[54310]: E[11-02|11:59:39.851] Connection failed @ sendRoutine module=p2p peer=0xb78f00 conn=MConn{74.207.236.148:26656} err="pong timeout"
```
https://github.com/tendermint/tendermint/issues/2721#issuecomment-435326581
* panic if peer has no state
https://github.com/tendermint/tendermint/issues/2721#issuecomment-435347165
It's confusing that sometimes we check if peer has a state, but most of
the times we expect it to be there
1. https://github.com/tendermint/tendermint/blob/add79700b5fe84417538202b6c927c8cc5383672/mempool/reactor.go#L138
2. https://github.com/tendermint/tendermint/blob/add79700b5fe84417538202b6c927c8cc5383672/rpc/core/consensus.go#L196 (edited)
I will change everything to always assume peer has a state and panic
otherwise
that should help identify issues earlier
* abci/localclient: extend lock on app callback
App callback should be protected by lock as well (note this was already
done for InitChainAsync, why not for others???). Otherwise, when we
execute the block, tx might come in and call the callback in the same
time we're updating it in execBlockOnProxyApp => DATA RACE
Fixes #2721
Consensus state is locked
```
goroutine 113333 [semacquire, 309 minutes]:
sync.runtime_SemacquireMutex(0xc00180009c, 0xc0000c7e00)
/usr/local/go/src/runtime/sema.go:71 +0x3d
sync.(*RWMutex).RLock(0xc001800090)
/usr/local/go/src/sync/rwmutex.go:50 +0x4e
github.com/MinterTeam/minter-go-node/vendor/github.com/tendermint/tendermint/consensus.(*ConsensusState).GetRoundState(0xc001800000, 0x0)
/root/go/src/github.com/MinterTeam/minter-go-node/vendor/github.com/tendermint/tendermint/consensus/state.go:218 +0x46
github.com/MinterTeam/minter-go-node/vendor/github.com/tendermint/tendermint/consensus.(*ConsensusReactor).queryMaj23Routine(0xc0017def80, 0x11104a0, 0xc0072488f0, 0xc007248
9c0)
/root/go/src/github.com/MinterTeam/minter-go-node/vendor/github.com/tendermint/tendermint/consensus/reactor.go:735 +0x16d
created by github.com/MinterTeam/minter-go-node/vendor/github.com/tendermint/tendermint/consensus.(*ConsensusReactor).AddPeer
/root/go/src/github.com/MinterTeam/minter-go-node/vendor/github.com/tendermint/tendermint/consensus/reactor.go:172 +0x236
```
because localClient is locked
```
goroutine 1899 [semacquire, 309 minutes]:
sync.runtime_SemacquireMutex(0xc00003363c, 0xc0000cb500)
/usr/local/go/src/runtime/sema.go:71 +0x3d
sync.(*Mutex).Lock(0xc000033638)
/usr/local/go/src/sync/mutex.go:134 +0xff
github.com/MinterTeam/minter-go-node/vendor/github.com/tendermint/tendermint/abci/client.(*localClient).SetResponseCallback(0xc0001fb560, 0xc007868540)
/root/go/src/github.com/MinterTeam/minter-go-node/vendor/github.com/tendermint/tendermint/abci/client/local_client.go:32 +0x33
github.com/MinterTeam/minter-go-node/vendor/github.com/tendermint/tendermint/proxy.(*appConnConsensus).SetResponseCallback(0xc00002f750, 0xc007868540)
/root/go/src/github.com/MinterTeam/minter-go-node/vendor/github.com/tendermint/tendermint/proxy/app_conn.go:57 +0x40
github.com/MinterTeam/minter-go-node/vendor/github.com/tendermint/tendermint/state.execBlockOnProxyApp(0x1104e20, 0xc002ca0ba0, 0x11092a0, 0xc00002f750, 0xc0001fe960, 0xc000bfc660, 0x110cfe0, 0xc000090330, 0xc9d12, 0xc000d9d5a0, ...)
/root/go/src/github.com/MinterTeam/minter-go-node/vendor/github.com/tendermint/tendermint/state/execution.go:230 +0x1fd
github.com/MinterTeam/minter-go-node/vendor/github.com/tendermint/tendermint/state.(*BlockExecutor).ApplyBlock(0xc002c2a230, 0x7, 0x0, 0xc000eae880, 0x6, 0xc002e52c60, 0x16, 0x1f927, 0xc9d12, 0xc000d9d5a0, ...)
/root/go/src/github.com/MinterTeam/minter-go-node/vendor/github.com/tendermint/tendermint/state/execution.go:96 +0x142
github.com/MinterTeam/minter-go-node/vendor/github.com/tendermint/tendermint/consensus.(*ConsensusState).finalizeCommit(0xc001800000, 0x1f928)
/root/go/src/github.com/MinterTeam/minter-go-node/vendor/github.com/tendermint/tendermint/consensus/state.go:1339 +0xa3e
github.com/MinterTeam/minter-go-node/vendor/github.com/tendermint/tendermint/consensus.(*ConsensusState).tryFinalizeCommit(0xc001800000, 0x1f928)
/root/go/src/github.com/MinterTeam/minter-go-node/vendor/github.com/tendermint/tendermint/consensus/state.go:1270 +0x451
github.com/MinterTeam/minter-go-node/vendor/github.com/tendermint/tendermint/consensus.(*ConsensusState).enterCommit.func1(0xc001800000, 0x0, 0x1f928)
/root/go/src/github.com/MinterTeam/minter-go-node/vendor/github.com/tendermint/tendermint/consensus/state.go:1218 +0x90
github.com/MinterTeam/minter-go-node/vendor/github.com/tendermint/tendermint/consensus.(*ConsensusState).enterCommit(0xc001800000, 0x1f928, 0x0)
/root/go/src/github.com/MinterTeam/minter-go-node/vendor/github.com/tendermint/tendermint/consensus/state.go:1247 +0x6b8
github.com/MinterTeam/minter-go-node/vendor/github.com/tendermint/tendermint/consensus.(*ConsensusState).addVote(0xc001800000, 0xc003d8dea0, 0xc000cf4cc0, 0x28, 0xf1, 0xc003bc7ad0, 0xc003bc7b10)
/root/go/src/github.com/MinterTeam/minter-go-node/vendor/github.com/tendermint/tendermint/consensus/state.go:1659 +0xbad
github.com/MinterTeam/minter-go-node/vendor/github.com/tendermint/tendermint/consensus.(*ConsensusState).tryAddVote(0xc001800000, 0xc003d8dea0, 0xc000cf4cc0, 0x28, 0xf1, 0xf1, 0xf1)
/root/go/src/github.com/MinterTeam/minter-go-node/vendor/github.com/tendermint/tendermint/consensus/state.go:1517 +0x59
github.com/MinterTeam/minter-go-node/vendor/github.com/tendermint/tendermint/consensus.(*ConsensusState).handleMsg(0xc001800000, 0xd98200, 0xc0070dbed0, 0xc000cf4cc0, 0x28)
/root/go/src/github.com/MinterTeam/minter-go-node/vendor/github.com/tendermint/tendermint/consensus/state.go:660 +0x64b
github.com/MinterTeam/minter-go-node/vendor/github.com/tendermint/tendermint/consensus.(*ConsensusState).receiveRoutine(0xc001800000, 0x0)
/root/go/src/github.com/MinterTeam/minter-go-node/vendor/github.com/tendermint/tendermint/consensus/state.go:617 +0x670
created by github.com/MinterTeam/minter-go-node/vendor/github.com/tendermint/tendermint/consensus.(*ConsensusState).OnStart
/root/go/src/github.com/MinterTeam/minter-go-node/vendor/github.com/tendermint/tendermint/consensus/state.go:311 +0x132
```
tx comes in and CheckTx is executed right when we execute the block
```
goroutine 111044 [semacquire, 309 minutes]:
sync.runtime_SemacquireMutex(0xc00003363c, 0x0)
/usr/local/go/src/runtime/sema.go:71 +0x3d
sync.(*Mutex).Lock(0xc000033638)
/usr/local/go/src/sync/mutex.go:134 +0xff
github.com/MinterTeam/minter-go-node/vendor/github.com/tendermint/tendermint/abci/client.(*localClient).CheckTxAsync(0xc0001fb0e0, 0xc002d94500, 0x13f, 0x280, 0x0)
/root/go/src/github.com/MinterTeam/minter-go-node/vendor/github.com/tendermint/tendermint/abci/client/local_client.go:85 +0x47
github.com/MinterTeam/minter-go-node/vendor/github.com/tendermint/tendermint/proxy.(*appConnMempool).CheckTxAsync(0xc00002f720, 0xc002d94500, 0x13f, 0x280, 0x1)
/root/go/src/github.com/MinterTeam/minter-go-node/vendor/github.com/tendermint/tendermint/proxy/app_conn.go:114 +0x51
github.com/MinterTeam/minter-go-node/vendor/github.com/tendermint/tendermint/mempool.(*Mempool).CheckTx(0xc002d3a320, 0xc002d94500, 0x13f, 0x280, 0xc0072355f0, 0x0, 0x0)
/root/go/src/github.com/MinterTeam/minter-go-node/vendor/github.com/tendermint/tendermint/mempool/mempool.go:316 +0x17b
github.com/MinterTeam/minter-go-node/vendor/github.com/tendermint/tendermint/rpc/core.BroadcastTxSync(0xc002d94500, 0x13f, 0x280, 0x0, 0x0, 0x0)
/root/go/src/github.com/MinterTeam/minter-go-node/vendor/github.com/tendermint/tendermint/rpc/core/mempool.go:93 +0xb8
reflect.Value.call(0xd85560, 0x10326c0, 0x13, 0xec7b8b, 0x4, 0xc00663f180, 0x1, 0x1, 0xc00663f180, 0xc00663f188, ...)
/usr/local/go/src/reflect/value.go:447 +0x449
reflect.Value.Call(0xd85560, 0x10326c0, 0x13, 0xc00663f180, 0x1, 0x1, 0x0, 0x0, 0xc005cc9344)
/usr/local/go/src/reflect/value.go:308 +0xa4
github.com/MinterTeam/minter-go-node/vendor/github.com/tendermint/tendermint/rpc/lib/server.makeHTTPHandler.func2(0x1102060, 0xc00663f100, 0xc0082d7900)
/root/go/src/github.com/MinterTeam/minter-go-node/vendor/github.com/tendermint/tendermint/rpc/lib/server/handlers.go:269 +0x188
net/http.HandlerFunc.ServeHTTP(0xc002c81f20, 0x1102060, 0xc00663f100, 0xc0082d7900)
/usr/local/go/src/net/http/server.go:1964 +0x44
net/http.(*ServeMux).ServeHTTP(0xc002c81b60, 0x1102060, 0xc00663f100, 0xc0082d7900)
/usr/local/go/src/net/http/server.go:2361 +0x127
github.com/MinterTeam/minter-go-node/vendor/github.com/tendermint/tendermint/rpc/lib/server.maxBytesHandler.ServeHTTP(0x10f8a40, 0xc002c81b60, 0xf4240, 0x1102060, 0xc00663f100, 0xc0082d7900)
/root/go/src/github.com/MinterTeam/minter-go-node/vendor/github.com/tendermint/tendermint/rpc/lib/server/http_server.go:219 +0xcf
github.com/MinterTeam/minter-go-node/vendor/github.com/tendermint/tendermint/rpc/lib/server.RecoverAndLogHandler.func1(0x1103220, 0xc00121e620, 0xc0082d7900)
/root/go/src/github.com/MinterTeam/minter-go-node/vendor/github.com/tendermint/tendermint/rpc/lib/server/http_server.go:192 +0x394
net/http.HandlerFunc.ServeHTTP(0xc002c06ea0, 0x1103220, 0xc00121e620, 0xc0082d7900)
/usr/local/go/src/net/http/server.go:1964 +0x44
net/http.serverHandler.ServeHTTP(0xc001a1aa90, 0x1103220, 0xc00121e620, 0xc0082d7900)
/usr/local/go/src/net/http/server.go:2741 +0xab
net/http.(*conn).serve(0xc00785a3c0, 0x11041a0, 0xc000f844c0)
/usr/local/go/src/net/http/server.go:1847 +0x646
created by net/http.(*Server).Serve
/usr/local/go/src/net/http/server.go:2851 +0x2f5
```
* consensus: use read lock in Receive#VoteMessage
* use defer to unlock mutex because application might panic
* use defer in every method of the localClient
* add a changelog entry
* drain channels before Unsubscribe(All)
Read https://github.com/tendermint/tendermint/blob/55362ed76630f3e1ebec159a598f6a9fb5892cb1/libs/pubsub/pubsub.go#L13
for the detailed explanation of the issue.
We'll need to fix it someday. Make sure to keep an eye on
https://github.com/tendermint/tendermint/blob/master/docs/architecture/adr-033-pubsub.md
* retry instead of panic when peer has no state in reactors other than consensus
in /dump_consensus_state RPC endpoint, skip a peer with no state
* rpc/core/mempool: simplify error messages
* rpc/core/mempool: use time.After instead of timer
also, do not log DeliverTx result (to be consistent with other memthods)
* unlock before calling the callback in reqRes#SetCallback
6 years ago |
|
- package abcicli
-
- import (
- "fmt"
- "net"
- "sync"
- "time"
-
- context "golang.org/x/net/context"
- grpc "google.golang.org/grpc"
-
- "github.com/tendermint/tendermint/abci/types"
- cmn "github.com/tendermint/tendermint/libs/common"
- )
-
- var _ Client = (*grpcClient)(nil)
-
- // A stripped copy of the remoteClient that makes
- // synchronous calls using grpc
- type grpcClient struct {
- cmn.BaseService
- mustConnect bool
-
- client types.ABCIApplicationClient
- conn *grpc.ClientConn
-
- mtx sync.Mutex
- addr string
- err error
- resCb func(*types.Request, *types.Response) // listens to all callbacks
- }
-
- func NewGRPCClient(addr string, mustConnect bool) *grpcClient {
- cli := &grpcClient{
- addr: addr,
- mustConnect: mustConnect,
- }
- cli.BaseService = *cmn.NewBaseService(nil, "grpcClient", cli)
- return cli
- }
-
- func dialerFunc(addr string, timeout time.Duration) (net.Conn, error) {
- return cmn.Connect(addr)
- }
-
- func (cli *grpcClient) OnStart() error {
- if err := cli.BaseService.OnStart(); err != nil {
- return err
- }
- RETRY_LOOP:
- for {
- conn, err := grpc.Dial(cli.addr, grpc.WithInsecure(), grpc.WithDialer(dialerFunc))
- if err != nil {
- if cli.mustConnect {
- return err
- }
- cli.Logger.Error(fmt.Sprintf("abci.grpcClient failed to connect to %v. Retrying...\n", cli.addr), "err", err)
- time.Sleep(time.Second * dialRetryIntervalSeconds)
- continue RETRY_LOOP
- }
-
- cli.Logger.Info("Dialed server. Waiting for echo.", "addr", cli.addr)
- client := types.NewABCIApplicationClient(conn)
- cli.conn = conn
-
- ENSURE_CONNECTED:
- for {
- _, err := client.Echo(context.Background(), &types.RequestEcho{Message: "hello"}, grpc.FailFast(true))
- if err == nil {
- break ENSURE_CONNECTED
- }
- cli.Logger.Error("Echo failed", "err", err)
- time.Sleep(time.Second * echoRetryIntervalSeconds)
- }
-
- cli.client = client
- return nil
- }
- }
-
- func (cli *grpcClient) OnStop() {
- cli.BaseService.OnStop()
-
- if cli.conn != nil {
- cli.conn.Close()
- }
- }
-
- func (cli *grpcClient) StopForError(err error) {
- cli.mtx.Lock()
- if !cli.IsRunning() {
- return
- }
-
- if cli.err == nil {
- cli.err = err
- }
- cli.mtx.Unlock()
-
- cli.Logger.Error(fmt.Sprintf("Stopping abci.grpcClient for error: %v", err.Error()))
- cli.Stop()
- }
-
- func (cli *grpcClient) Error() error {
- cli.mtx.Lock()
- defer cli.mtx.Unlock()
- return cli.err
- }
-
- // Set listener for all responses
- // NOTE: callback may get internally generated flush responses.
- func (cli *grpcClient) SetResponseCallback(resCb Callback) {
- cli.mtx.Lock()
- cli.resCb = resCb
- cli.mtx.Unlock()
- }
-
- //----------------------------------------
- // GRPC calls are synchronous, but some callbacks expect to be called asynchronously
- // (eg. the mempool expects to be able to lock to remove bad txs from cache).
- // To accommodate, we finish each call in its own go-routine,
- // which is expensive, but easy - if you want something better, use the socket protocol!
- // maybe one day, if people really want it, we use grpc streams,
- // but hopefully not :D
-
- func (cli *grpcClient) EchoAsync(msg string) *ReqRes {
- req := types.ToRequestEcho(msg)
- res, err := cli.client.Echo(context.Background(), req.GetEcho(), grpc.FailFast(true))
- if err != nil {
- cli.StopForError(err)
- }
- return cli.finishAsyncCall(req, &types.Response{Value: &types.Response_Echo{Echo: res}})
- }
-
- func (cli *grpcClient) FlushAsync() *ReqRes {
- req := types.ToRequestFlush()
- res, err := cli.client.Flush(context.Background(), req.GetFlush(), grpc.FailFast(true))
- if err != nil {
- cli.StopForError(err)
- }
- return cli.finishAsyncCall(req, &types.Response{Value: &types.Response_Flush{Flush: res}})
- }
-
- func (cli *grpcClient) InfoAsync(params types.RequestInfo) *ReqRes {
- req := types.ToRequestInfo(params)
- res, err := cli.client.Info(context.Background(), req.GetInfo(), grpc.FailFast(true))
- if err != nil {
- cli.StopForError(err)
- }
- return cli.finishAsyncCall(req, &types.Response{Value: &types.Response_Info{Info: res}})
- }
-
- func (cli *grpcClient) SetOptionAsync(params types.RequestSetOption) *ReqRes {
- req := types.ToRequestSetOption(params)
- res, err := cli.client.SetOption(context.Background(), req.GetSetOption(), grpc.FailFast(true))
- if err != nil {
- cli.StopForError(err)
- }
- return cli.finishAsyncCall(req, &types.Response{Value: &types.Response_SetOption{SetOption: res}})
- }
-
- func (cli *grpcClient) DeliverTxAsync(tx []byte) *ReqRes {
- req := types.ToRequestDeliverTx(tx)
- res, err := cli.client.DeliverTx(context.Background(), req.GetDeliverTx(), grpc.FailFast(true))
- if err != nil {
- cli.StopForError(err)
- }
- return cli.finishAsyncCall(req, &types.Response{Value: &types.Response_DeliverTx{DeliverTx: res}})
- }
-
- func (cli *grpcClient) CheckTxAsync(tx []byte) *ReqRes {
- req := types.ToRequestCheckTx(tx)
- res, err := cli.client.CheckTx(context.Background(), req.GetCheckTx(), grpc.FailFast(true))
- if err != nil {
- cli.StopForError(err)
- }
- return cli.finishAsyncCall(req, &types.Response{Value: &types.Response_CheckTx{CheckTx: res}})
- }
-
- func (cli *grpcClient) QueryAsync(params types.RequestQuery) *ReqRes {
- req := types.ToRequestQuery(params)
- res, err := cli.client.Query(context.Background(), req.GetQuery(), grpc.FailFast(true))
- if err != nil {
- cli.StopForError(err)
- }
- return cli.finishAsyncCall(req, &types.Response{Value: &types.Response_Query{Query: res}})
- }
-
- func (cli *grpcClient) CommitAsync() *ReqRes {
- req := types.ToRequestCommit()
- res, err := cli.client.Commit(context.Background(), req.GetCommit(), grpc.FailFast(true))
- if err != nil {
- cli.StopForError(err)
- }
- return cli.finishAsyncCall(req, &types.Response{Value: &types.Response_Commit{Commit: res}})
- }
-
- func (cli *grpcClient) InitChainAsync(params types.RequestInitChain) *ReqRes {
- req := types.ToRequestInitChain(params)
- res, err := cli.client.InitChain(context.Background(), req.GetInitChain(), grpc.FailFast(true))
- if err != nil {
- cli.StopForError(err)
- }
- return cli.finishAsyncCall(req, &types.Response{Value: &types.Response_InitChain{InitChain: res}})
- }
-
- func (cli *grpcClient) BeginBlockAsync(params types.RequestBeginBlock) *ReqRes {
- req := types.ToRequestBeginBlock(params)
- res, err := cli.client.BeginBlock(context.Background(), req.GetBeginBlock(), grpc.FailFast(true))
- if err != nil {
- cli.StopForError(err)
- }
- return cli.finishAsyncCall(req, &types.Response{Value: &types.Response_BeginBlock{BeginBlock: res}})
- }
-
- func (cli *grpcClient) EndBlockAsync(params types.RequestEndBlock) *ReqRes {
- req := types.ToRequestEndBlock(params)
- res, err := cli.client.EndBlock(context.Background(), req.GetEndBlock(), grpc.FailFast(true))
- if err != nil {
- cli.StopForError(err)
- }
- return cli.finishAsyncCall(req, &types.Response{Value: &types.Response_EndBlock{EndBlock: res}})
- }
-
- func (cli *grpcClient) finishAsyncCall(req *types.Request, res *types.Response) *ReqRes {
- reqres := NewReqRes(req)
- reqres.Response = res // Set response
- reqres.Done() // Release waiters
- reqres.SetDone() // so reqRes.SetCallback will run the callback
-
- // go routine for callbacks
- go func() {
- // Notify reqRes listener if set
- if cb := reqres.GetCallback(); cb != nil {
- cb(res)
- }
-
- // Notify client listener if set
- if cli.resCb != nil {
- cli.resCb(reqres.Request, res)
- }
- }()
- return reqres
- }
-
- //----------------------------------------
-
- func (cli *grpcClient) FlushSync() error {
- return nil
- }
-
- func (cli *grpcClient) EchoSync(msg string) (*types.ResponseEcho, error) {
- reqres := cli.EchoAsync(msg)
- // StopForError should already have been called if error is set
- return reqres.Response.GetEcho(), cli.Error()
- }
-
- func (cli *grpcClient) InfoSync(req types.RequestInfo) (*types.ResponseInfo, error) {
- reqres := cli.InfoAsync(req)
- return reqres.Response.GetInfo(), cli.Error()
- }
-
- func (cli *grpcClient) SetOptionSync(req types.RequestSetOption) (*types.ResponseSetOption, error) {
- reqres := cli.SetOptionAsync(req)
- return reqres.Response.GetSetOption(), cli.Error()
- }
-
- func (cli *grpcClient) DeliverTxSync(tx []byte) (*types.ResponseDeliverTx, error) {
- reqres := cli.DeliverTxAsync(tx)
- return reqres.Response.GetDeliverTx(), cli.Error()
- }
-
- func (cli *grpcClient) CheckTxSync(tx []byte) (*types.ResponseCheckTx, error) {
- reqres := cli.CheckTxAsync(tx)
- return reqres.Response.GetCheckTx(), cli.Error()
- }
-
- func (cli *grpcClient) QuerySync(req types.RequestQuery) (*types.ResponseQuery, error) {
- reqres := cli.QueryAsync(req)
- return reqres.Response.GetQuery(), cli.Error()
- }
-
- func (cli *grpcClient) CommitSync() (*types.ResponseCommit, error) {
- reqres := cli.CommitAsync()
- return reqres.Response.GetCommit(), cli.Error()
- }
-
- func (cli *grpcClient) InitChainSync(params types.RequestInitChain) (*types.ResponseInitChain, error) {
- reqres := cli.InitChainAsync(params)
- return reqres.Response.GetInitChain(), cli.Error()
- }
-
- func (cli *grpcClient) BeginBlockSync(params types.RequestBeginBlock) (*types.ResponseBeginBlock, error) {
- reqres := cli.BeginBlockAsync(params)
- return reqres.Response.GetBeginBlock(), cli.Error()
- }
-
- func (cli *grpcClient) EndBlockSync(params types.RequestEndBlock) (*types.ResponseEndBlock, error) {
- reqres := cli.EndBlockAsync(params)
- return reqres.Response.GetEndBlock(), cli.Error()
- }
|