You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

339 lines
9.6 KiB

9 years ago
9 years ago
9 years ago
9 years ago
9 years ago
  1. package types
  2. import (
  3. "fmt"
  4. "os"
  5. "os/exec"
  6. "sync"
  7. "time"
  8. "github.com/rcrowley/go-metrics"
  9. . "github.com/tendermint/go-common"
  10. tmtypes "github.com/tendermint/tendermint/types"
  11. )
  12. // waitign more than this many seconds for a block means we're unhealthy
  13. const newBlockTimeoutSeconds = 5
  14. //------------------------------------------------
  15. // blockchain types
  16. // NOTE: mintnet duplicates some types from here and val.go
  17. //------------------------------------------------
  18. // Known chain and validator set IDs (from which anything else can be found)
  19. // Returned by the Status RPC
  20. type ChainAndValidatorSetIDs struct {
  21. ChainIDs []string `json:"chain_ids"`
  22. ValidatorSetIDs []string `json:"validator_set_ids"`
  23. }
  24. //------------------------------------------------
  25. // chain state
  26. // Main chain state
  27. // Returned over RPC; also used to manage state
  28. type ChainState struct {
  29. Config *BlockchainConfig `json:"config"`
  30. Status *BlockchainStatus `json:"status"`
  31. }
  32. func (cs *ChainState) NewBlock(block *tmtypes.Header) {
  33. cs.Status.NewBlock(block)
  34. }
  35. func (cs *ChainState) UpdateLatency(oldLatency, newLatency float64) {
  36. cs.Status.UpdateLatency(oldLatency, newLatency)
  37. }
  38. func (cs *ChainState) SetOnline(val *ValidatorState, isOnline bool) {
  39. cs.Status.SetOnline(val, isOnline)
  40. }
  41. //------------------------------------------------
  42. // Blockchain Config: id, validator config
  43. // Chain Config
  44. type BlockchainConfig struct {
  45. // should be fixed for life of chain
  46. ID string `json:"id"`
  47. ValSetID string `json:"val_set_id"` // NOTE: do we really commit to one val set per chain?
  48. // handles live validator states (latency, last block, etc)
  49. // and validator set changes
  50. mtx sync.Mutex
  51. Validators []*ValidatorState `json:"validators"` // TODO: this should be ValidatorConfig and the state in BlockchainStatus
  52. valIDMap map[string]int // map IDs to indices
  53. }
  54. // So we can fetch validator by id rather than index
  55. func (bc *BlockchainConfig) PopulateValIDMap() {
  56. bc.mtx.Lock()
  57. defer bc.mtx.Unlock()
  58. bc.valIDMap = make(map[string]int)
  59. for i, v := range bc.Validators {
  60. bc.valIDMap[v.Config.Validator.ID] = i
  61. }
  62. }
  63. func (bc *BlockchainConfig) GetValidatorByID(valID string) (*ValidatorState, error) {
  64. bc.mtx.Lock()
  65. defer bc.mtx.Unlock()
  66. valIndex, ok := bc.valIDMap[valID]
  67. if !ok {
  68. return nil, fmt.Errorf("Unknown validator %s", valID)
  69. }
  70. return bc.Validators[valIndex], nil
  71. }
  72. //------------------------------------------------
  73. // BlockchainStatus
  74. // Basic blockchain metrics
  75. type BlockchainStatus struct {
  76. mtx sync.Mutex
  77. // Blockchain Info
  78. Height int `json:"height"` // latest height we've got
  79. BlockchainSize int64 `json:"blockchain_size"`
  80. MeanBlockTime float64 `json:"mean_block_time" wire:"unsafe"` // ms (avg over last minute)
  81. TxThroughput float64 `json:"tx_throughput" wire:"unsafe"` // tx/s (avg over last minute)
  82. blockTimeMeter metrics.Meter
  83. txThroughputMeter metrics.Meter
  84. // Network Info
  85. NumValidators int `json:"num_validators"`
  86. ActiveValidators int `json:"active_validators"`
  87. //ActiveNodes int `json:"active_nodes"`
  88. MeanLatency float64 `json:"mean_latency" wire:"unsafe"` // ms
  89. // Health
  90. FullHealth bool `json:"full_health"` // all validators online, synced, making blocks
  91. Healthy bool `json:"healthy"` // we're making blocks
  92. // Uptime
  93. UptimeData *UptimeData `json:"uptime_data"`
  94. // What else can we get / do we want?
  95. // TODO: charts for block time, latency (websockets/event-meter ?)
  96. // for benchmark runs
  97. benchResults *BenchmarkResults
  98. }
  99. func (bc *BlockchainStatus) BenchmarkTxs(results chan *BenchmarkResults, nTxs int, args []string) {
  100. log.Notice("Running benchmark", "ntxs", nTxs)
  101. bc.benchResults = &BenchmarkResults{
  102. StartTime: time.Now(),
  103. nTxs: nTxs,
  104. results: results,
  105. }
  106. if len(args) > 0 {
  107. // TODO: capture output to file
  108. cmd := exec.Command(args[0], args[1:]...)
  109. cmd.Stdout = os.Stdout
  110. cmd.Stderr = os.Stderr
  111. go cmd.Run()
  112. }
  113. }
  114. func (bc *BlockchainStatus) BenchmarkBlocks(results chan *BenchmarkResults, nBlocks int, args []string) {
  115. log.Notice("Running benchmark", "nblocks", nBlocks)
  116. bc.benchResults = &BenchmarkResults{
  117. StartTime: time.Now(),
  118. nBlocks: nBlocks,
  119. results: results,
  120. }
  121. if len(args) > 0 {
  122. // TODO: capture output to file
  123. cmd := exec.Command(args[0], args[1:]...)
  124. cmd.Stdout = os.Stdout
  125. cmd.Stderr = os.Stderr
  126. go cmd.Run()
  127. }
  128. }
  129. type Block struct {
  130. Time time.Time `json:time"`
  131. Height int `json:"height"`
  132. NumTxs int `json:"num_txs"`
  133. }
  134. type BenchmarkResults struct {
  135. StartTime time.Time `json:"start_time"`
  136. StartBlock int `json:"start_block"`
  137. TotalTime float64 `json:"total_time"` // seconds
  138. Blocks []*Block `json:"blocks"`
  139. NumBlocks int `json:"num_blocks"`
  140. NumTxs int `json:"num_txs`
  141. MeanLatency float64 `json:"latency"` // seconds per block
  142. MeanThroughput float64 `json:"throughput"` // txs per second
  143. // either we wait for n blocks or n txs
  144. nBlocks int
  145. nTxs int
  146. done bool
  147. results chan *BenchmarkResults
  148. }
  149. // Return the total time to commit all txs, in seconds
  150. func (br *BenchmarkResults) ElapsedTime() float64 {
  151. return float64(br.Blocks[br.NumBlocks-1].Time.Sub(br.StartTime)) / float64(1000000000)
  152. }
  153. // Return the avg seconds/block
  154. func (br *BenchmarkResults) Latency() float64 {
  155. return br.ElapsedTime() / float64(br.NumBlocks)
  156. }
  157. // Return the avg txs/second
  158. func (br *BenchmarkResults) Throughput() float64 {
  159. return float64(br.NumTxs) / br.ElapsedTime()
  160. }
  161. func (br *BenchmarkResults) Done() {
  162. log.Info("Done benchmark", "num blocks", br.NumBlocks, "block len", len(br.Blocks))
  163. br.done = true
  164. br.TotalTime = br.ElapsedTime()
  165. br.MeanThroughput = br.Throughput()
  166. br.MeanLatency = br.Latency()
  167. br.results <- br
  168. }
  169. type UptimeData struct {
  170. StartTime time.Time `json:"start_time"`
  171. Uptime float64 `json:"uptime" wire:"unsafe"` // Percentage of time we've been Healthy, ever
  172. totalDownTime time.Duration // total downtime (only updated when we come back online)
  173. wentDown time.Time
  174. // TODO: uptime over last day, month, year
  175. }
  176. func NewBlockchainStatus() *BlockchainStatus {
  177. return &BlockchainStatus{
  178. blockTimeMeter: metrics.NewMeter(),
  179. txThroughputMeter: metrics.NewMeter(),
  180. Healthy: true,
  181. UptimeData: &UptimeData{
  182. StartTime: time.Now(),
  183. Uptime: 100.0,
  184. },
  185. }
  186. }
  187. func (s *BlockchainStatus) NewBlock(block *tmtypes.Header) {
  188. s.mtx.Lock()
  189. defer s.mtx.Unlock()
  190. if block.Height > s.Height {
  191. numTxs := block.NumTxs
  192. s.Height = block.Height
  193. s.blockTimeMeter.Mark(1)
  194. s.txThroughputMeter.Mark(int64(numTxs))
  195. s.MeanBlockTime = (1.0 / s.blockTimeMeter.Rate1()) * 1000 // 1/s to ms
  196. s.TxThroughput = s.txThroughputMeter.Rate1()
  197. log.Debug("New Block", "height", s.Height, "ntxs", numTxs)
  198. if s.benchResults != nil && !s.benchResults.done {
  199. if s.benchResults.StartBlock == 0 && numTxs > 0 {
  200. s.benchResults.StartBlock = s.Height
  201. }
  202. s.benchResults.Blocks = append(s.benchResults.Blocks, &Block{
  203. Time: time.Now(),
  204. Height: s.Height,
  205. NumTxs: numTxs,
  206. })
  207. s.benchResults.NumTxs += numTxs
  208. s.benchResults.NumBlocks += 1
  209. if s.benchResults.nTxs > 0 && s.benchResults.NumTxs >= s.benchResults.nTxs {
  210. s.benchResults.Done()
  211. } else if s.benchResults.nBlocks > 0 && s.benchResults.NumBlocks >= s.benchResults.nBlocks {
  212. s.benchResults.Done()
  213. }
  214. }
  215. // if we're making blocks, we're healthy
  216. if !s.Healthy {
  217. s.Healthy = true
  218. s.UptimeData.totalDownTime += time.Since(s.UptimeData.wentDown)
  219. }
  220. // if we are connected to all validators, we're at full health
  221. // TODO: make sure they're all at the same height (within a block) and all proposing (and possibly validating )
  222. // Alternatively, just check there hasn't been a new round in numValidators rounds
  223. if s.ActiveValidators == s.NumValidators {
  224. s.FullHealth = true
  225. }
  226. // TODO: should we refactor so there's a central loop and ticker?
  227. go s.newBlockTimeout(s.Height)
  228. }
  229. }
  230. // we have newBlockTimeoutSeconds to make a new block, else we're unhealthy
  231. func (s *BlockchainStatus) newBlockTimeout(height int) {
  232. time.Sleep(time.Second * newBlockTimeoutSeconds)
  233. s.mtx.Lock()
  234. defer s.mtx.Unlock()
  235. if !(s.Height > height) {
  236. s.Healthy = false
  237. s.UptimeData.wentDown = time.Now()
  238. }
  239. }
  240. // Used to calculate uptime on demand. TODO: refactor this into the central loop ...
  241. func (s *BlockchainStatus) RealTimeUpdates() {
  242. s.mtx.Lock()
  243. defer s.mtx.Unlock()
  244. since := time.Since(s.UptimeData.StartTime)
  245. uptime := since - s.UptimeData.totalDownTime
  246. if !s.Healthy {
  247. uptime -= time.Since(s.UptimeData.wentDown)
  248. }
  249. s.UptimeData.Uptime = float64(uptime) / float64(since)
  250. }
  251. func (s *BlockchainStatus) UpdateLatency(oldLatency, newLatency float64) {
  252. s.mtx.Lock()
  253. defer s.mtx.Unlock()
  254. // update avg validator rpc latency
  255. mean := s.MeanLatency * float64(s.NumValidators)
  256. mean = (mean - oldLatency + newLatency) / float64(s.NumValidators)
  257. s.MeanLatency = mean
  258. }
  259. // Toggle validators online/offline (updates ActiveValidators and FullHealth)
  260. func (s *BlockchainStatus) SetOnline(val *ValidatorState, isOnline bool) {
  261. val.SetOnline(isOnline)
  262. var change int
  263. if isOnline {
  264. change = 1
  265. } else {
  266. change = -1
  267. }
  268. s.mtx.Lock()
  269. defer s.mtx.Unlock()
  270. s.ActiveValidators += change
  271. if s.ActiveValidators > s.NumValidators {
  272. panic(Fmt("got %d validators. max %ds", s.ActiveValidators, s.NumValidators))
  273. }
  274. // if we lost a connection we're no longer at full health, even if it's still online.
  275. // so long as we receive blocks, we'll know we're still healthy
  276. if s.ActiveValidators != s.NumValidators {
  277. s.FullHealth = false
  278. }
  279. }
  280. func TwoThirdsMaj(count, total int) bool {
  281. return float64(count) > (2.0/3.0)*float64(total)
  282. }