You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

263 lines
5.8 KiB

Close and retry a RemoteSigner on err (#2923) * Close and recreate a RemoteSigner on err * Update changelog * Address Anton's comments / suggestions: - update changelog - restart TCPVal - shut down on `ErrUnexpectedResponse` * re-init remote signer client with fresh connection if Ping fails - add/update TODOs in secret connection - rename tcp.go -> tcp_client.go, same with ipc to clarify their purpose * account for the fact that the `conn` returned by waitConnection can be `nil` - also add TODO about RemoteSigner conn field * Tests for retrying: IPC / TCP - shorter info log on success - set conn and use it in tests to close conn * Tests for retrying: IPC / TCP - shorter info log on success - set conn and use it in tests to close conn - add rwmutex for conn field in IPC * comments and doc.go * fix ipc tests. fixes #2677 * use constants for tests * cleanup some error statements * fixes #2784, race in tests * remove print statement * minor fixes from review * update comment on sts spec * cosmetics * p2p/conn: add failing tests * p2p/conn: make SecretConnection thread safe * changelog * IPCVal signer refactor - use a .reset() method - don't use embedded RemoteSignerClient - guard RemoteSignerClient with mutex - drop the .conn - expose Close() on RemoteSignerClient * apply IPCVal refactor to TCPVal * remove mtx from RemoteSignerClient * consolidate IPCVal and TCPVal, fixes #3104 - done in tcp_client.go - now called SocketVal - takes a listener in the constructor - make tcpListener and unixListener contain all the differences * delete ipc files * introduce unix and tcp dialer for RemoteSigner * rename files - drop tcp_ prefix - rename priv_validator.go to file.go * bring back listener options * fix node * fix priv_val_server * fix node test * minor cleanup and comments
6 years ago
  1. package privval
  2. import (
  3. "errors"
  4. "fmt"
  5. "net"
  6. "sync"
  7. "time"
  8. "github.com/tendermint/tendermint/crypto"
  9. cmn "github.com/tendermint/tendermint/libs/common"
  10. "github.com/tendermint/tendermint/libs/log"
  11. "github.com/tendermint/tendermint/types"
  12. )
  // Package-level defaults.
  const (
  	// defaultConnHeartBeatSeconds is the default ping period, in seconds.
  	defaultConnHeartBeatSeconds = 2
  	// defaultDialRetries is not referenced in this file.
  	// NOTE(review): presumably consumed by the dialer side of the package.
  	defaultDialRetries = 10
  )

  var (
  	// ErrUnexpectedResponse is a socket error. When Ping returns it, the
  	// ping routine started in OnStart stops without trying to reconnect.
  	ErrUnexpectedResponse = errors.New("received unexpected response")

  	// connHeartbeat is the heartbeat period a new SocketVal starts with.
  	connHeartbeat = defaultConnHeartBeatSeconds * time.Second
  )
  24. // SocketValOption sets an optional parameter on the SocketVal.
  25. type SocketValOption func(*SocketVal)
  26. // SocketValHeartbeat sets the period on which to check the liveness of the
  27. // connected Signer connections.
  28. func SocketValHeartbeat(period time.Duration) SocketValOption {
  29. return func(sc *SocketVal) { sc.connHeartbeat = period }
  30. }
  31. // SocketVal implements PrivValidator.
  32. // It listens for an external process to dial in and uses
  33. // the socket to request signatures.
  34. type SocketVal struct {
  35. cmn.BaseService
  36. listener net.Listener
  37. // ping
  38. cancelPing chan struct{}
  39. pingTicker *time.Ticker
  40. connHeartbeat time.Duration
  41. // signer is mutable since it can be
  42. // reset if the connection fails.
  43. // failures are detected by a background
  44. // ping routine.
  45. // Methods on the underlying net.Conn itself
  46. // are already gorountine safe.
  47. mtx sync.RWMutex
  48. signer *RemoteSignerClient
  49. }
  50. // Check that SocketVal implements PrivValidator.
  51. var _ types.PrivValidator = (*SocketVal)(nil)
  52. // NewSocketVal returns an instance of SocketVal.
  53. func NewSocketVal(
  54. logger log.Logger,
  55. listener net.Listener,
  56. ) *SocketVal {
  57. sc := &SocketVal{
  58. listener: listener,
  59. connHeartbeat: connHeartbeat,
  60. }
  61. sc.BaseService = *cmn.NewBaseService(logger, "SocketVal", sc)
  62. return sc
  63. }
  64. //--------------------------------------------------------
  65. // Implement PrivValidator
  66. // GetPubKey implements PrivValidator.
  67. func (sc *SocketVal) GetPubKey() crypto.PubKey {
  68. sc.mtx.RLock()
  69. defer sc.mtx.RUnlock()
  70. return sc.signer.GetPubKey()
  71. }
  72. // SignVote implements PrivValidator.
  73. func (sc *SocketVal) SignVote(chainID string, vote *types.Vote) error {
  74. sc.mtx.RLock()
  75. defer sc.mtx.RUnlock()
  76. return sc.signer.SignVote(chainID, vote)
  77. }
  78. // SignProposal implements PrivValidator.
  79. func (sc *SocketVal) SignProposal(chainID string, proposal *types.Proposal) error {
  80. sc.mtx.RLock()
  81. defer sc.mtx.RUnlock()
  82. return sc.signer.SignProposal(chainID, proposal)
  83. }
  84. //--------------------------------------------------------
  85. // More thread safe methods proxied to the signer
  86. // Ping is used to check connection health.
  87. func (sc *SocketVal) Ping() error {
  88. sc.mtx.RLock()
  89. defer sc.mtx.RUnlock()
  90. return sc.signer.Ping()
  91. }
  92. // Close closes the underlying net.Conn.
  93. func (sc *SocketVal) Close() {
  94. sc.mtx.RLock()
  95. defer sc.mtx.RUnlock()
  96. if sc.signer != nil {
  97. if err := sc.signer.Close(); err != nil {
  98. sc.Logger.Error("OnStop", "err", err)
  99. }
  100. }
  101. if sc.listener != nil {
  102. if err := sc.listener.Close(); err != nil {
  103. sc.Logger.Error("OnStop", "err", err)
  104. }
  105. }
  106. }
  107. //--------------------------------------------------------
  108. // Service start and stop
  109. // OnStart implements cmn.Service.
  110. func (sc *SocketVal) OnStart() error {
  111. if closed, err := sc.reset(); err != nil {
  112. sc.Logger.Error("OnStart", "err", err)
  113. return err
  114. } else if closed {
  115. return fmt.Errorf("listener is closed")
  116. }
  117. // Start a routine to keep the connection alive
  118. sc.cancelPing = make(chan struct{}, 1)
  119. sc.pingTicker = time.NewTicker(sc.connHeartbeat)
  120. go func() {
  121. for {
  122. select {
  123. case <-sc.pingTicker.C:
  124. err := sc.Ping()
  125. if err != nil {
  126. sc.Logger.Error("Ping", "err", err)
  127. if err == ErrUnexpectedResponse {
  128. return
  129. }
  130. closed, err := sc.reset()
  131. if err != nil {
  132. sc.Logger.Error("Reconnecting to remote signer failed", "err", err)
  133. continue
  134. }
  135. if closed {
  136. sc.Logger.Info("listener is closing")
  137. return
  138. }
  139. sc.Logger.Info("Re-created connection to remote signer", "impl", sc)
  140. }
  141. case <-sc.cancelPing:
  142. sc.pingTicker.Stop()
  143. return
  144. }
  145. }
  146. }()
  147. return nil
  148. }
  149. // OnStop implements cmn.Service.
  150. func (sc *SocketVal) OnStop() {
  151. if sc.cancelPing != nil {
  152. close(sc.cancelPing)
  153. }
  154. sc.Close()
  155. }
  156. //--------------------------------------------------------
  157. // Connection and signer management
  158. // waits to accept and sets a new connection.
  159. // connection is closed in OnStop.
  160. // returns true if the listener is closed
  161. // (ie. it returns a nil conn).
  162. func (sc *SocketVal) reset() (bool, error) {
  163. sc.mtx.Lock()
  164. defer sc.mtx.Unlock()
  165. // first check if the conn already exists and close it.
  166. if sc.signer != nil {
  167. if err := sc.signer.Close(); err != nil {
  168. sc.Logger.Error("error closing connection", "err", err)
  169. }
  170. }
  171. // wait for a new conn
  172. conn, err := sc.waitConnection()
  173. if err != nil {
  174. return false, err
  175. }
  176. // listener is closed
  177. if conn == nil {
  178. return true, nil
  179. }
  180. sc.signer, err = NewRemoteSignerClient(conn)
  181. if err != nil {
  182. // failed to fetch the pubkey. close out the connection.
  183. if err := conn.Close(); err != nil {
  184. sc.Logger.Error("error closing connection", "err", err)
  185. }
  186. return false, err
  187. }
  188. return false, nil
  189. }
  190. func (sc *SocketVal) acceptConnection() (net.Conn, error) {
  191. conn, err := sc.listener.Accept()
  192. if err != nil {
  193. if !sc.IsRunning() {
  194. return nil, nil // Ignore error from listener closing.
  195. }
  196. return nil, err
  197. }
  198. return conn, nil
  199. }
  200. // waitConnection uses the configured wait timeout to error if no external
  201. // process connects in the time period.
  202. func (sc *SocketVal) waitConnection() (net.Conn, error) {
  203. var (
  204. connc = make(chan net.Conn, 1)
  205. errc = make(chan error, 1)
  206. )
  207. go func(connc chan<- net.Conn, errc chan<- error) {
  208. conn, err := sc.acceptConnection()
  209. if err != nil {
  210. errc <- err
  211. return
  212. }
  213. connc <- conn
  214. }(connc, errc)
  215. select {
  216. case conn := <-connc:
  217. return conn, nil
  218. case err := <-errc:
  219. return nil, err
  220. }
  221. }