From 07b46d5a054365b576b8dbab9744c944a4d6e4f4 Mon Sep 17 00:00:00 2001 From: Sam Kleinman Date: Thu, 17 Mar 2022 13:30:13 -0400 Subject: [PATCH 1/4] blocksync: drop redundant shutdown mechanisms (#8136) --- internal/blocksync/pool.go | 16 ---------------- internal/blocksync/reactor.go | 2 -- 2 files changed, 18 deletions(-) diff --git a/internal/blocksync/pool.go b/internal/blocksync/pool.go index 4c905c660..7f133a7a1 100644 --- a/internal/blocksync/pool.go +++ b/internal/blocksync/pool.go @@ -86,7 +86,6 @@ type BlockPool struct { requestsCh chan<- BlockRequest errorsCh chan<- peerError - exitedCh chan struct{} startHeight int64 lastHundredBlockTimeStamp time.Time @@ -109,7 +108,6 @@ func NewBlockPool( height: start, startHeight: start, numPending: 0, - exitedCh: make(chan struct{}), requestsCh: requestsCh, errorsCh: errorsCh, lastSyncRate: 0, @@ -125,11 +123,6 @@ func (pool *BlockPool) OnStart(ctx context.Context) error { pool.lastHundredBlockTimeStamp = pool.lastAdvance go pool.makeRequestersRoutine(ctx) - go func() { - defer close(pool.exitedCh) - pool.Wait() - }() - return nil } @@ -637,12 +630,6 @@ func (bpr *bpRequester) redo(peerID types.NodeID) { // Responsible for making more requests as necessary // Returns only when a block is found (e.g. AddBlock() is called) func (bpr *bpRequester) requestRoutine(ctx context.Context) { - bprPoolDone := make(chan struct{}) - go func() { - defer close(bprPoolDone) - bpr.pool.Wait() - }() - OUTER_LOOP: for { // Pick a peer to send request to. @@ -670,9 +657,6 @@ OUTER_LOOP: select { case <-ctx.Done(): return - case <-bpr.pool.exitedCh: - bpr.Stop() - return case peerID := <-bpr.redoCh: if peerID == bpr.peerID { bpr.reset() diff --git a/internal/blocksync/reactor.go b/internal/blocksync/reactor.go index cf1a10623..89c8c642b 100644 --- a/internal/blocksync/reactor.go +++ b/internal/blocksync/reactor.go @@ -445,8 +445,6 @@ func (r *Reactor) poolRoutine(ctx context.Context, stateSynced bool) { select { case <-ctx.Done(): return - case <-r.pool.exitedCh: - return case <-switchToConsensusTicker.C: var ( height, numPending, lenRequesters = r.pool.GetStatus() From 1dd8807cc32e2ca5d59ec032fb708809119a6129 Mon Sep 17 00:00:00 2001 From: Sam Kleinman Date: Thu, 17 Mar 2022 13:45:27 -0400 Subject: [PATCH 2/4] mempool: test harness should expose application (#8143) This is minor, but I was trying to write a test and realized that the application reference in the harness isn't actually used, which is quite confusing. --- internal/mempool/mempool_bench_test.go | 10 ++- internal/mempool/mempool_test.go | 105 +++++++++++++++++++------ internal/mempool/reactor_test.go | 34 +++++--- 3 files changed, 113 insertions(+), 36 deletions(-) diff --git a/internal/mempool/mempool_bench_test.go b/internal/mempool/mempool_bench_test.go index 088af174a..843e42e87 100644 --- a/internal/mempool/mempool_bench_test.go +++ b/internal/mempool/mempool_bench_test.go @@ -8,15 +8,23 @@ import ( "time" "github.com/stretchr/testify/require" + abciclient "github.com/tendermint/tendermint/abci/client" + "github.com/tendermint/tendermint/abci/example/kvstore" + "github.com/tendermint/tendermint/libs/log" ) func BenchmarkTxMempool_CheckTx(b *testing.B) { ctx, cancel := context.WithCancel(context.Background()) defer cancel() + client := abciclient.NewLocalClient(log.NewNopLogger(), kvstore.NewApplication()) + if err := client.Start(ctx); err != nil { + b.Fatal(err) + } + // setup the cache and the mempool number for hitting GetEvictableTxs during the // benchmark. 
5000 is the current default mempool size in the TM config. - txmp := setup(ctx, b, 10000) + txmp := setup(ctx, b, client, 10000) txmp.config.Size = 5000 rng := rand.New(rand.NewSource(time.Now().UnixNano())) diff --git a/internal/mempool/mempool_test.go b/internal/mempool/mempool_test.go index e4f604cb1..165e4bd20 100644 --- a/internal/mempool/mempool_test.go +++ b/internal/mempool/mempool_test.go @@ -72,30 +72,18 @@ func (app *application) CheckTx(req abci.RequestCheckTx) abci.ResponseCheckTx { } } -func setup(ctx context.Context, t testing.TB, cacheSize int, options ...TxMempoolOption) *TxMempool { +func setup(ctx context.Context, t testing.TB, app abciclient.Client, cacheSize int, options ...TxMempoolOption) *TxMempool { t.Helper() - var cancel context.CancelFunc - ctx, cancel = context.WithCancel(ctx) - logger := log.TestingLogger() - conn := abciclient.NewLocalClient(logger, &application{ - kvstore.NewApplication(), - }) - cfg, err := config.ResetTestRoot(t.TempDir(), strings.ReplaceAll(t.Name(), "/", "|")) require.NoError(t, err) cfg.Mempool.CacheSize = cacheSize - require.NoError(t, conn.Start(ctx)) - t.Cleanup(func() { - os.RemoveAll(cfg.RootDir) - cancel() - conn.Wait() - }) + t.Cleanup(func() { os.RemoveAll(cfg.RootDir) }) - return NewTxMempool(logger.With("test", t.Name()), cfg.Mempool, conn, options...) + return NewTxMempool(logger.With("test", t.Name()), cfg.Mempool, app, options...) } func checkTxs(ctx context.Context, t *testing.T, txmp *TxMempool, numTxs int, peerID uint16) []testTx { @@ -137,7 +125,13 @@ func TestTxMempool_TxsAvailable(t *testing.T) { ctx, cancel := context.WithCancel(context.Background()) defer cancel() - txmp := setup(ctx, t, 0) + client := abciclient.NewLocalClient(log.NewNopLogger(), &application{Application: kvstore.NewApplication()}) + if err := client.Start(ctx); err != nil { + t.Fatal(err) + } + t.Cleanup(client.Wait) + + txmp := setup(ctx, t, client, 0) txmp.EnableTxsAvailable() ensureNoTxFire := func() { @@ -194,7 +188,13 @@ func TestTxMempool_Size(t *testing.T) { ctx, cancel := context.WithCancel(context.Background()) defer cancel() - txmp := setup(ctx, t, 0) + client := abciclient.NewLocalClient(log.NewNopLogger(), &application{Application: kvstore.NewApplication()}) + if err := client.Start(ctx); err != nil { + t.Fatal(err) + } + t.Cleanup(client.Wait) + + txmp := setup(ctx, t, client, 0) txs := checkTxs(ctx, t, txmp, 100, 0) require.Equal(t, len(txs), txmp.Size()) require.Equal(t, int64(5690), txmp.SizeBytes()) @@ -221,7 +221,13 @@ func TestTxMempool_Flush(t *testing.T) { ctx, cancel := context.WithCancel(context.Background()) defer cancel() - txmp := setup(ctx, t, 0) + client := abciclient.NewLocalClient(log.NewNopLogger(), &application{Application: kvstore.NewApplication()}) + if err := client.Start(ctx); err != nil { + t.Fatal(err) + } + t.Cleanup(client.Wait) + + txmp := setup(ctx, t, client, 0) txs := checkTxs(ctx, t, txmp, 100, 0) require.Equal(t, len(txs), txmp.Size()) require.Equal(t, int64(5690), txmp.SizeBytes()) @@ -249,7 +255,13 @@ func TestTxMempool_ReapMaxBytesMaxGas(t *testing.T) { ctx, cancel := context.WithCancel(context.Background()) defer cancel() - txmp := setup(ctx, t, 0) + client := abciclient.NewLocalClient(log.NewNopLogger(), &application{Application: kvstore.NewApplication()}) + if err := client.Start(ctx); err != nil { + t.Fatal(err) + } + t.Cleanup(client.Wait) + + txmp := setup(ctx, t, client, 0) tTxs := checkTxs(ctx, t, txmp, 100, 0) // all txs request 1 gas unit require.Equal(t, len(tTxs), txmp.Size()) 
require.Equal(t, int64(5690), txmp.SizeBytes()) @@ -302,7 +314,13 @@ func TestTxMempool_ReapMaxTxs(t *testing.T) { ctx, cancel := context.WithCancel(context.Background()) defer cancel() - txmp := setup(ctx, t, 0) + client := abciclient.NewLocalClient(log.NewNopLogger(), &application{Application: kvstore.NewApplication()}) + if err := client.Start(ctx); err != nil { + t.Fatal(err) + } + t.Cleanup(client.Wait) + + txmp := setup(ctx, t, client, 0) tTxs := checkTxs(ctx, t, txmp, 100, 0) require.Equal(t, len(tTxs), txmp.Size()) require.Equal(t, int64(5690), txmp.SizeBytes()) @@ -354,7 +372,12 @@ func TestTxMempool_CheckTxExceedsMaxSize(t *testing.T) { ctx, cancel := context.WithCancel(context.Background()) defer cancel() - txmp := setup(ctx, t, 0) + client := abciclient.NewLocalClient(log.NewNopLogger(), &application{Application: kvstore.NewApplication()}) + if err := client.Start(ctx); err != nil { + t.Fatal(err) + } + t.Cleanup(client.Wait) + txmp := setup(ctx, t, client, 0) rng := rand.New(rand.NewSource(time.Now().UnixNano())) tx := make([]byte, txmp.config.MaxTxBytes+1) @@ -374,7 +397,13 @@ func TestTxMempool_CheckTxSamePeer(t *testing.T) { ctx, cancel := context.WithCancel(context.Background()) defer cancel() - txmp := setup(ctx, t, 100) + client := abciclient.NewLocalClient(log.NewNopLogger(), &application{Application: kvstore.NewApplication()}) + if err := client.Start(ctx); err != nil { + t.Fatal(err) + } + t.Cleanup(client.Wait) + + txmp := setup(ctx, t, client, 100) peerID := uint16(1) rng := rand.New(rand.NewSource(time.Now().UnixNano())) @@ -392,7 +421,13 @@ func TestTxMempool_CheckTxSameSender(t *testing.T) { ctx, cancel := context.WithCancel(context.Background()) defer cancel() - txmp := setup(ctx, t, 100) + client := abciclient.NewLocalClient(log.NewNopLogger(), &application{Application: kvstore.NewApplication()}) + if err := client.Start(ctx); err != nil { + t.Fatal(err) + } + t.Cleanup(client.Wait) + + txmp := setup(ctx, t, client, 100) peerID := uint16(1) rng := rand.New(rand.NewSource(time.Now().UnixNano())) @@ -417,7 +452,13 @@ func TestTxMempool_ConcurrentTxs(t *testing.T) { ctx, cancel := context.WithCancel(context.Background()) defer cancel() - txmp := setup(ctx, t, 100) + client := abciclient.NewLocalClient(log.NewNopLogger(), &application{Application: kvstore.NewApplication()}) + if err := client.Start(ctx); err != nil { + t.Fatal(err) + } + t.Cleanup(client.Wait) + + txmp := setup(ctx, t, client, 100) rng := rand.New(rand.NewSource(time.Now().UnixNano())) checkTxDone := make(chan struct{}) @@ -484,7 +525,13 @@ func TestTxMempool_ExpiredTxs_NumBlocks(t *testing.T) { ctx, cancel := context.WithCancel(context.Background()) defer cancel() - txmp := setup(ctx, t, 500) + client := abciclient.NewLocalClient(log.NewNopLogger(), &application{Application: kvstore.NewApplication()}) + if err := client.Start(ctx); err != nil { + t.Fatal(err) + } + t.Cleanup(client.Wait) + + txmp := setup(ctx, t, client, 500) txmp.height = 100 txmp.config.TTLNumBlocks = 10 @@ -556,10 +603,16 @@ func TestTxMempool_CheckTxPostCheckError(t *testing.T) { ctx, cancel := context.WithCancel(ctx) defer cancel() + client := abciclient.NewLocalClient(log.NewNopLogger(), &application{Application: kvstore.NewApplication()}) + if err := client.Start(ctx); err != nil { + t.Fatal(err) + } + t.Cleanup(client.Wait) + postCheckFn := func(_ types.Tx, _ *abci.ResponseCheckTx) error { return testCase.err } - txmp := setup(ctx, t, 0, WithPostCheck(postCheckFn)) + txmp := setup(ctx, t, client, 0, 
WithPostCheck(postCheckFn)) rng := rand.New(rand.NewSource(time.Now().UnixNano())) tx := make([]byte, txmp.config.MaxTxBytes-1) _, err := rng.Read(tx) diff --git a/internal/mempool/reactor_test.go b/internal/mempool/reactor_test.go index 04e51ca8d..dab63af73 100644 --- a/internal/mempool/reactor_test.go +++ b/internal/mempool/reactor_test.go @@ -13,6 +13,7 @@ import ( "github.com/fortytw2/leaktest" "github.com/stretchr/testify/require" + abciclient "github.com/tendermint/tendermint/abci/client" "github.com/tendermint/tendermint/abci/example/kvstore" abci "github.com/tendermint/tendermint/abci/types" "github.com/tendermint/tendermint/config" @@ -39,7 +40,7 @@ type reactorTestSuite struct { nodes []types.NodeID } -func setupReactors(ctx context.Context, t *testing.T, numNodes int, chBuf uint) *reactorTestSuite { +func setupReactors(ctx context.Context, t *testing.T, logger log.Logger, numNodes int, chBuf uint) *reactorTestSuite { t.Helper() cfg, err := config.ResetTestRoot(t.TempDir(), strings.ReplaceAll(t.Name(), "/", "|")) @@ -63,7 +64,11 @@ func setupReactors(ctx context.Context, t *testing.T, numNodes int, chBuf uint) for nodeID := range rts.network.Nodes { rts.kvstores[nodeID] = kvstore.NewApplication() - mempool := setup(ctx, t, 0) + client := abciclient.NewLocalClient(logger, rts.kvstores[nodeID]) + require.NoError(t, client.Start(ctx)) + t.Cleanup(client.Wait) + + mempool := setup(ctx, t, client, 0) rts.mempools[nodeID] = mempool rts.peerChans[nodeID] = make(chan p2p.PeerUpdate, chBuf) @@ -151,7 +156,9 @@ func TestReactorBroadcastDoesNotPanic(t *testing.T) { defer cancel() const numNodes = 2 - rts := setupReactors(ctx, t, numNodes, 0) + + logger := log.NewNopLogger() + rts := setupReactors(ctx, t, logger, numNodes, 0) observePanic := func(r interface{}) { t.Fatal("panic detected in reactor") @@ -194,7 +201,9 @@ func TestReactorBroadcastTxs(t *testing.T) { ctx, cancel := context.WithCancel(context.Background()) defer cancel() - rts := setupReactors(ctx, t, numNodes, uint(numTxs)) + logger := log.NewNopLogger() + + rts := setupReactors(ctx, t, logger, numNodes, uint(numTxs)) primary := rts.nodes[0] secondaries := rts.nodes[1:] @@ -218,7 +227,8 @@ func TestReactorConcurrency(t *testing.T) { ctx, cancel := context.WithCancel(context.Background()) defer cancel() - rts := setupReactors(ctx, t, numNodes, 0) + logger := log.NewNopLogger() + rts := setupReactors(ctx, t, logger, numNodes, 0) primary := rts.nodes[0] secondary := rts.nodes[1] @@ -276,7 +286,8 @@ func TestReactorNoBroadcastToSender(t *testing.T) { ctx, cancel := context.WithCancel(context.Background()) defer cancel() - rts := setupReactors(ctx, t, numNodes, uint(numTxs)) + logger := log.NewNopLogger() + rts := setupReactors(ctx, t, logger, numNodes, uint(numTxs)) primary := rts.nodes[0] secondary := rts.nodes[1] @@ -300,7 +311,9 @@ func TestReactor_MaxTxBytes(t *testing.T) { ctx, cancel := context.WithCancel(context.Background()) defer cancel() - rts := setupReactors(ctx, t, numNodes, 0) + logger := log.NewNopLogger() + + rts := setupReactors(ctx, t, logger, numNodes, 0) primary := rts.nodes[0] secondary := rts.nodes[1] @@ -336,7 +349,8 @@ func TestDontExhaustMaxActiveIDs(t *testing.T) { ctx, cancel := context.WithCancel(context.Background()) defer cancel() - rts := setupReactors(ctx, t, 1, MaxActiveIDs+1) + logger := log.NewNopLogger() + rts := setupReactors(ctx, t, logger, 1, MaxActiveIDs+1) nodeID := rts.nodes[0] @@ -388,7 +402,9 @@ func TestBroadcastTxForPeerStopsWhenPeerStops(t *testing.T) { ctx, cancel := 
context.WithCancel(context.Background())
 	defer cancel()
 
-	rts := setupReactors(ctx, t, 2, 2)
+	logger := log.NewNopLogger()
+
+	rts := setupReactors(ctx, t, logger, 2, 2)
 
 	primary := rts.nodes[0]
 	secondary := rts.nodes[1]

From 02c7199eece1dc9295a44e80c229f660c61df13b Mon Sep 17 00:00:00 2001
From: William Banfield <4561443+williambanfield@users.noreply.github.com>
Date: Thu, 17 Mar 2022 14:49:37 -0400
Subject: [PATCH 3/4] types: update synchrony params to match checked in proto
 (#8142)

The `.proto` files do not have the `nullable = false` annotation present on
the `SynchronyParams` durations. This pull request updates the
`SynchronyParams` to match the checked-in proto files. Note, this does not
make the code buildable against the latest protos.

This pull request was produced by checking out all files _not relevant_ to
the `SynchronyParams` and removing the new `TimeoutParams` from the
`params.proto` file. Future updates will add these back.

This pull request also adds a `nil` check to the `pbParams.Synchrony` field
in `ConsensusParamsFromProto`. Old versions of Tendermint will not have the
`Synchrony` parameters filled in, so this code would panic on startup. We
will fill in the empty fields with defaults, but per
https://github.com/tendermint/tendermint/blob/master/docs/rfc/rfc-009-consensus-parameter-upgrades.md#only-update-hashedparams-on-hash-breaking-releases
we will keep it out of the hash during this release.
---
 proto/tendermint/statesync/message_test.go |   9 +-
 proto/tendermint/types/params.pb.go        | 164 +++++++++++++--------
 types/params.go                            |  27 ++--
 types/params_test.go                       |   8 +-
 4 files changed, 130 insertions(+), 78 deletions(-)

diff --git a/proto/tendermint/statesync/message_test.go b/proto/tendermint/statesync/message_test.go
index a0b241615..1db421aca 100644
--- a/proto/tendermint/statesync/message_test.go
+++ b/proto/tendermint/statesync/message_test.go
@@ -3,6 +3,7 @@ package statesync_test
 import (
 	"encoding/hex"
 	"testing"
+	"time"
 
 	"github.com/gogo/protobuf/proto"
 	"github.com/stretchr/testify/require"
@@ -204,8 +205,8 @@ func TestStateSyncVectors(t *testing.T) {
 					AppVersion: 11,
 				},
 				Synchrony: &tmproto.SynchronyParams{
-					MessageDelay: 550,
-					Precision:    90,
+					MessageDelay: durationPtr(550),
+					Precision:    durationPtr(90),
 				},
 			},
 		},
@@ -224,3 +225,7 @@ func TestStateSyncVectors(t *testing.T) {
 		require.Equal(t, tc.expBytes, hex.EncodeToString(bz), tc.testName)
 	}
 }
+
+func durationPtr(t time.Duration) *time.Duration {
+	return &t
+}
diff --git a/proto/tendermint/types/params.pb.go b/proto/tendermint/types/params.pb.go
index 32689932c..ff55379df 100644
--- a/proto/tendermint/types/params.pb.go
+++ b/proto/tendermint/types/params.pb.go
@@ -381,9 +381,17 @@ func (m *HashedParams) GetBlockMaxGas() int64 {
 	return 0
 }
 
+// SynchronyParams configure the bounds under which a proposed block's timestamp is considered valid.
+// These parameters are part of the proposer-based timestamps algorithm. For more information,
+// see the specification of proposer-based timestamps:
+// https://github.com/tendermint/tendermint/tree/master/spec/consensus/proposer-based-timestamp
 type SynchronyParams struct {
-	MessageDelay time.Duration `protobuf:"bytes,1,opt,name=message_delay,json=messageDelay,proto3,stdduration" json:"message_delay"`
-	Precision    time.Duration `protobuf:"bytes,2,opt,name=precision,proto3,stdduration" json:"precision"`
+	// message_delay bounds how long a proposal message may take to reach all validators on a network
+	// and still be considered valid. 
+ MessageDelay *time.Duration `protobuf:"bytes,1,opt,name=message_delay,json=messageDelay,proto3,stdduration" json:"message_delay,omitempty"` + // precision bounds how skewed a proposer's clock may be from any validator + // on the network while still producing valid proposals. + Precision *time.Duration `protobuf:"bytes,2,opt,name=precision,proto3,stdduration" json:"precision,omitempty"` } func (m *SynchronyParams) Reset() { *m = SynchronyParams{} } @@ -419,18 +427,18 @@ func (m *SynchronyParams) XXX_DiscardUnknown() { var xxx_messageInfo_SynchronyParams proto.InternalMessageInfo -func (m *SynchronyParams) GetMessageDelay() time.Duration { +func (m *SynchronyParams) GetMessageDelay() *time.Duration { if m != nil { return m.MessageDelay } - return 0 + return nil } -func (m *SynchronyParams) GetPrecision() time.Duration { +func (m *SynchronyParams) GetPrecision() *time.Duration { if m != nil { return m.Precision } - return 0 + return nil } func init() { @@ -446,43 +454,43 @@ func init() { func init() { proto.RegisterFile("tendermint/types/params.proto", fileDescriptor_e12598271a686f57) } var fileDescriptor_e12598271a686f57 = []byte{ - // 561 bytes of a gzipped FileDescriptorProto - 0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0xff, 0x94, 0x94, 0x4d, 0x6b, 0xd4, 0x40, - 0x18, 0xc7, 0x37, 0xdd, 0xbe, 0xec, 0x3e, 0xdb, 0xed, 0x96, 0x41, 0x30, 0x56, 0x9a, 0x5d, 0x73, - 0x90, 0x82, 0x90, 0x88, 0x45, 0x44, 0x10, 0xa4, 0xdb, 0x8a, 0x05, 0xa9, 0x48, 0x7c, 0x39, 0xf4, - 0x12, 0x26, 0xbb, 0x63, 0x36, 0x74, 0x93, 0x19, 0x32, 0xc9, 0xb2, 0xf9, 0x16, 0x1e, 0x3d, 0x79, - 0xd6, 0x8f, 0xe1, 0xad, 0xc7, 0x1e, 0x3d, 0xa9, 0xec, 0x7e, 0x11, 0x99, 0xc9, 0x4c, 0xd3, 0xdd, - 0x2a, 0xd8, 0x5b, 0x32, 0xcf, 0xef, 0x97, 0x87, 0xf9, 0x3f, 0x93, 0x81, 0xdd, 0x8c, 0x24, 0x43, - 0x92, 0xc6, 0x51, 0x92, 0xb9, 0x59, 0xc1, 0x08, 0x77, 0x19, 0x4e, 0x71, 0xcc, 0x1d, 0x96, 0xd2, - 0x8c, 0xa2, 0xed, 0xaa, 0xec, 0xc8, 0xf2, 0xce, 0xad, 0x90, 0x86, 0x54, 0x16, 0x5d, 0xf1, 0x54, - 0x72, 0x3b, 0x56, 0x48, 0x69, 0x38, 0x26, 0xae, 0x7c, 0x0b, 0xf2, 0x8f, 0xee, 0x30, 0x4f, 0x71, - 0x16, 0xd1, 0xa4, 0xac, 0xdb, 0xdf, 0x57, 0xa0, 0x73, 0x48, 0x13, 0x4e, 0x12, 0x9e, 0xf3, 0x37, - 0xb2, 0x03, 0xda, 0x87, 0xb5, 0x60, 0x4c, 0x07, 0x67, 0xa6, 0xd1, 0x33, 0xf6, 0x5a, 0x8f, 0x76, - 0x9d, 0xe5, 0x5e, 0x4e, 0x5f, 0x94, 0x4b, 0xda, 0x2b, 0x59, 0xf4, 0x0c, 0x1a, 0x64, 0x12, 0x0d, - 0x49, 0x32, 0x20, 0xe6, 0x8a, 0xf4, 0x7a, 0xd7, 0xbd, 0x17, 0x8a, 0x50, 0xea, 0xa5, 0x81, 0x9e, - 0x43, 0x73, 0x82, 0xc7, 0xd1, 0x10, 0x67, 0x34, 0x35, 0xeb, 0x52, 0xbf, 0x77, 0x5d, 0xff, 0xa0, - 0x11, 0xe5, 0x57, 0x0e, 0x7a, 0x0a, 0x1b, 0x13, 0x92, 0xf2, 0x88, 0x26, 0xe6, 0xaa, 0xd4, 0xbb, - 0x7f, 0xd1, 0x4b, 0x40, 0xc9, 0x9a, 0x17, 0xbd, 0x79, 0x91, 0x0c, 0x46, 0x29, 0x4d, 0x0a, 0x73, - 0xed, 0x5f, 0xbd, 0xdf, 0x6a, 0x44, 0xf7, 0xbe, 0x74, 0xec, 0x43, 0x68, 0x5d, 0x09, 0x04, 0xdd, - 0x85, 0x66, 0x8c, 0xa7, 0x7e, 0x50, 0x64, 0x84, 0xcb, 0x08, 0xeb, 0x5e, 0x23, 0xc6, 0xd3, 0xbe, - 0x78, 0x47, 0xb7, 0x61, 0x43, 0x14, 0x43, 0xcc, 0x65, 0x4a, 0x75, 0x6f, 0x3d, 0xc6, 0xd3, 0x97, - 0x98, 0xdb, 0xdf, 0x0c, 0xd8, 0x5a, 0x8c, 0x07, 0x3d, 0x00, 0x24, 0x58, 0x1c, 0x12, 0x3f, 0xc9, - 0x63, 0x5f, 0xe6, 0xac, 0xbf, 0xd8, 0x89, 0xf1, 0xf4, 0x20, 0x24, 0xaf, 0xf3, 0x58, 0xb6, 0xe6, - 0xe8, 0x04, 0xb6, 0x35, 0xac, 0x47, 0xac, 0xe6, 0x70, 0xc7, 0x29, 0xcf, 0x80, 0xa3, 0xcf, 0x80, - 0x73, 0xa4, 0x80, 0x7e, 0xe3, 0xfc, 0x67, 0xb7, 0xf6, 0xf9, 0x57, 0xd7, 0xf0, 0xb6, 0xca, 0xef, - 0xe9, 0xca, 0xe2, 0x26, 0xea, 0x8b, 0x9b, 0xb0, 0x1f, 0x43, 0x67, 0x69, 0x14, 0xc8, 0x86, 0x36, - 
0xcb, 0x03, 0xff, 0x8c, 0x14, 0xbe, 0xcc, 0xcb, 0x34, 0x7a, 0xf5, 0xbd, 0xa6, 0xd7, 0x62, 0x79, - 0xf0, 0x8a, 0x14, 0xef, 0xc4, 0x92, 0xfd, 0x10, 0xda, 0x0b, 0x23, 0x40, 0x5d, 0x68, 0x61, 0xc6, - 0x7c, 0x3d, 0x38, 0xb1, 0xb3, 0x55, 0x0f, 0x30, 0x63, 0x0a, 0xb3, 0x4f, 0x61, 0xf3, 0x18, 0xf3, - 0x11, 0x19, 0x2a, 0xe1, 0x3e, 0x74, 0x64, 0x0a, 0xfe, 0x72, 0xc0, 0x6d, 0xb9, 0x7c, 0xa2, 0x53, - 0xb6, 0xa1, 0x5d, 0x71, 0x55, 0xd6, 0x2d, 0x4d, 0x89, 0xc0, 0xbf, 0x18, 0xd0, 0x59, 0x1a, 0x2a, - 0x3a, 0x86, 0x76, 0x4c, 0x38, 0x97, 0x21, 0x92, 0x31, 0x2e, 0xd4, 0x1f, 0xf0, 0x5f, 0x09, 0x6e, - 0x2a, 0xf3, 0x48, 0x88, 0xe8, 0x00, 0x9a, 0x2c, 0x25, 0x83, 0x88, 0xdf, 0x70, 0x0e, 0x95, 0xd5, - 0x7f, 0xff, 0x75, 0x66, 0x19, 0xe7, 0x33, 0xcb, 0xb8, 0x98, 0x59, 0xc6, 0xef, 0x99, 0x65, 0x7c, - 0x9a, 0x5b, 0xb5, 0x8b, 0xb9, 0x55, 0xfb, 0x31, 0xb7, 0x6a, 0xa7, 0x4f, 0xc2, 0x28, 0x1b, 0xe5, - 0x81, 0x33, 0xa0, 0xb1, 0x7b, 0xf5, 0xaa, 0xa8, 0x1e, 0xcb, 0xbb, 0x60, 0xf9, 0x1a, 0x09, 0xd6, - 0xe5, 0xfa, 0xfe, 0x9f, 0x00, 0x00, 0x00, 0xff, 0xff, 0xcb, 0x26, 0x8a, 0x0b, 0x61, 0x04, 0x00, - 0x00, + // 565 bytes of a gzipped FileDescriptorProto + 0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0xff, 0x8c, 0x94, 0x4d, 0x8b, 0xd3, 0x40, + 0x18, 0xc7, 0x9b, 0xed, 0xbe, 0xb4, 0x4f, 0xb7, 0xdb, 0x65, 0x10, 0x8c, 0x2b, 0x9b, 0xd6, 0x1c, + 0x64, 0x41, 0x48, 0xc4, 0x45, 0x44, 0x50, 0xc4, 0x6e, 0x45, 0x41, 0x56, 0x24, 0xbe, 0x1c, 0xf6, + 0x12, 0x26, 0xed, 0x98, 0x86, 0x6d, 0x32, 0x43, 0x26, 0x29, 0xcd, 0xb7, 0xf0, 0x24, 0x7e, 0x04, + 0xfd, 0x18, 0xde, 0xf6, 0xb8, 0x47, 0x4f, 0x2a, 0xed, 0x17, 0x91, 0x99, 0xcc, 0x6c, 0xb6, 0x5d, + 0x15, 0x6f, 0xc9, 0x3c, 0xff, 0xdf, 0x3c, 0xcc, 0xef, 0x49, 0x06, 0xf6, 0x33, 0x92, 0x8c, 0x48, + 0x1a, 0x47, 0x49, 0xe6, 0x66, 0x05, 0x23, 0xdc, 0x65, 0x38, 0xc5, 0x31, 0x77, 0x58, 0x4a, 0x33, + 0x8a, 0x76, 0xab, 0xb2, 0x23, 0xcb, 0x7b, 0xd7, 0x42, 0x1a, 0x52, 0x59, 0x74, 0xc5, 0x53, 0x99, + 0xdb, 0xb3, 0x42, 0x4a, 0xc3, 0x09, 0x71, 0xe5, 0x5b, 0x90, 0x7f, 0x70, 0x47, 0x79, 0x8a, 0xb3, + 0x88, 0x26, 0x65, 0xdd, 0xfe, 0xb6, 0x06, 0x9d, 0x23, 0x9a, 0x70, 0x92, 0xf0, 0x9c, 0xbf, 0x96, + 0x1d, 0xd0, 0x21, 0x6c, 0x04, 0x13, 0x3a, 0x3c, 0x35, 0x8d, 0x9e, 0x71, 0xd0, 0xba, 0xb7, 0xef, + 0xac, 0xf6, 0x72, 0xfa, 0xa2, 0x5c, 0xa6, 0xbd, 0x32, 0x8b, 0x1e, 0x41, 0x83, 0x4c, 0xa3, 0x11, + 0x49, 0x86, 0xc4, 0x5c, 0x93, 0x5c, 0xef, 0x2a, 0xf7, 0x4c, 0x25, 0x14, 0x7a, 0x41, 0xa0, 0x27, + 0xd0, 0x9c, 0xe2, 0x49, 0x34, 0xc2, 0x19, 0x4d, 0xcd, 0xba, 0xc4, 0x6f, 0x5d, 0xc5, 0xdf, 0xeb, + 0x88, 0xe2, 0x2b, 0x06, 0x3d, 0x84, 0xad, 0x29, 0x49, 0x79, 0x44, 0x13, 0x73, 0x5d, 0xe2, 0xdd, + 0x3f, 0xe0, 0x65, 0x40, 0xc1, 0x3a, 0x2f, 0x7a, 0xf3, 0x22, 0x19, 0x8e, 0x53, 0x9a, 0x14, 0xe6, + 0xc6, 0xdf, 0x7a, 0xbf, 0xd1, 0x11, 0xdd, 0xfb, 0x82, 0xb1, 0x8f, 0xa0, 0x75, 0x49, 0x08, 0xba, + 0x09, 0xcd, 0x18, 0xcf, 0xfc, 0xa0, 0xc8, 0x08, 0x97, 0x0a, 0xeb, 0x5e, 0x23, 0xc6, 0xb3, 0xbe, + 0x78, 0x47, 0xd7, 0x61, 0x4b, 0x14, 0x43, 0xcc, 0xa5, 0xa5, 0xba, 0xb7, 0x19, 0xe3, 0xd9, 0x73, + 0xcc, 0xed, 0xaf, 0x06, 0xec, 0x2c, 0xeb, 0x41, 0x77, 0x00, 0x89, 0x2c, 0x0e, 0x89, 0x9f, 0xe4, + 0xb1, 0x2f, 0x3d, 0xeb, 0x1d, 0x3b, 0x31, 0x9e, 0x3d, 0x0d, 0xc9, 0xab, 0x3c, 0x96, 0xad, 0x39, + 0x3a, 0x86, 0x5d, 0x1d, 0xd6, 0x23, 0x56, 0x73, 0xb8, 0xe1, 0x94, 0xdf, 0x80, 0xa3, 0xbf, 0x01, + 0x67, 0xa0, 0x02, 0xfd, 0xc6, 0xd9, 0x8f, 0x6e, 0xed, 0xf3, 0xcf, 0xae, 0xe1, 0xed, 0x94, 0xfb, + 0xe9, 0xca, 0xf2, 0x21, 0xea, 0xcb, 0x87, 0xb0, 0xef, 0x43, 0x67, 0x65, 0x14, 0xc8, 0x86, 0x36, + 0xcb, 0x03, 0xff, 0x94, 0x14, 0xbe, 0xf4, 0x65, 0x1a, 0xbd, 0xfa, 
0x41, 0xd3, 0x6b, 0xb1, 0x3c, + 0x78, 0x49, 0x8a, 0xb7, 0x62, 0xc9, 0xbe, 0x0b, 0xed, 0xa5, 0x11, 0xa0, 0x2e, 0xb4, 0x30, 0x63, + 0xbe, 0x1e, 0x9c, 0x38, 0xd9, 0xba, 0x07, 0x98, 0x31, 0x15, 0xb3, 0x4f, 0x60, 0xfb, 0x05, 0xe6, + 0x63, 0x32, 0x52, 0xc0, 0x6d, 0xe8, 0x48, 0x0b, 0xfe, 0xaa, 0xe0, 0xb6, 0x5c, 0x3e, 0xd6, 0x96, + 0x6d, 0x68, 0x57, 0xb9, 0xca, 0x75, 0x4b, 0xa7, 0x84, 0xf0, 0x4f, 0x06, 0x74, 0x56, 0x86, 0x8a, + 0x06, 0xd0, 0x8e, 0x09, 0xe7, 0x52, 0x22, 0x99, 0xe0, 0x42, 0xfd, 0x01, 0xff, 0x30, 0xb8, 0x2e, + 0xed, 0x6d, 0x2b, 0x6a, 0x20, 0x20, 0xf4, 0x18, 0x9a, 0x2c, 0x25, 0xc3, 0x88, 0xff, 0xd7, 0x0c, + 0xca, 0x1d, 0x2a, 0xa2, 0xff, 0xee, 0xcb, 0xdc, 0x32, 0xce, 0xe6, 0x96, 0x71, 0x3e, 0xb7, 0x8c, + 0x5f, 0x73, 0xcb, 0xf8, 0xb8, 0xb0, 0x6a, 0xe7, 0x0b, 0xab, 0xf6, 0x7d, 0x61, 0xd5, 0x4e, 0x1e, + 0x84, 0x51, 0x36, 0xce, 0x03, 0x67, 0x48, 0x63, 0xf7, 0xf2, 0x15, 0x51, 0x3d, 0x96, 0x77, 0xc0, + 0xea, 0xf5, 0x11, 0x6c, 0xca, 0xf5, 0xc3, 0xdf, 0x01, 0x00, 0x00, 0xff, 0xff, 0x57, 0x89, 0x7c, + 0xd9, 0x59, 0x04, 0x00, 0x00, } func (this *ConsensusParams) Equal(that interface{}) bool { @@ -677,10 +685,22 @@ func (this *SynchronyParams) Equal(that interface{}) bool { } else if this == nil { return false } - if this.MessageDelay != that1.MessageDelay { + if this.MessageDelay != nil && that1.MessageDelay != nil { + if *this.MessageDelay != *that1.MessageDelay { + return false + } + } else if this.MessageDelay != nil { + return false + } else if that1.MessageDelay != nil { return false } - if this.Precision != that1.Precision { + if this.Precision != nil && that1.Precision != nil { + if *this.Precision != *that1.Precision { + return false + } + } else if this.Precision != nil { + return false + } else if that1.Precision != nil { return false } return true @@ -955,22 +975,26 @@ func (m *SynchronyParams) MarshalToSizedBuffer(dAtA []byte) (int, error) { _ = i var l int _ = l - n7, err7 := github_com_gogo_protobuf_types.StdDurationMarshalTo(m.Precision, dAtA[i-github_com_gogo_protobuf_types.SizeOfStdDuration(m.Precision):]) - if err7 != nil { - return 0, err7 + if m.Precision != nil { + n7, err7 := github_com_gogo_protobuf_types.StdDurationMarshalTo(*m.Precision, dAtA[i-github_com_gogo_protobuf_types.SizeOfStdDuration(*m.Precision):]) + if err7 != nil { + return 0, err7 + } + i -= n7 + i = encodeVarintParams(dAtA, i, uint64(n7)) + i-- + dAtA[i] = 0x12 } - i -= n7 - i = encodeVarintParams(dAtA, i, uint64(n7)) - i-- - dAtA[i] = 0x12 - n8, err8 := github_com_gogo_protobuf_types.StdDurationMarshalTo(m.MessageDelay, dAtA[i-github_com_gogo_protobuf_types.SizeOfStdDuration(m.MessageDelay):]) - if err8 != nil { - return 0, err8 + if m.MessageDelay != nil { + n8, err8 := github_com_gogo_protobuf_types.StdDurationMarshalTo(*m.MessageDelay, dAtA[i-github_com_gogo_protobuf_types.SizeOfStdDuration(*m.MessageDelay):]) + if err8 != nil { + return 0, err8 + } + i -= n8 + i = encodeVarintParams(dAtA, i, uint64(n8)) + i-- + dAtA[i] = 0xa } - i -= n8 - i = encodeVarintParams(dAtA, i, uint64(n8)) - i-- - dAtA[i] = 0xa return len(dAtA) - i, nil } @@ -1094,10 +1118,14 @@ func (m *SynchronyParams) Size() (n int) { } var l int _ = l - l = github_com_gogo_protobuf_types.SizeOfStdDuration(m.MessageDelay) - n += 1 + l + sovParams(uint64(l)) - l = github_com_gogo_protobuf_types.SizeOfStdDuration(m.Precision) - n += 1 + l + sovParams(uint64(l)) + if m.MessageDelay != nil { + l = github_com_gogo_protobuf_types.SizeOfStdDuration(*m.MessageDelay) + n += 1 + l + sovParams(uint64(l)) + } + if m.Precision != nil { + l = 
github_com_gogo_protobuf_types.SizeOfStdDuration(*m.Precision) + n += 1 + l + sovParams(uint64(l)) + } return n } @@ -1843,7 +1871,10 @@ func (m *SynchronyParams) Unmarshal(dAtA []byte) error { if postIndex > l { return io.ErrUnexpectedEOF } - if err := github_com_gogo_protobuf_types.StdDurationUnmarshal(&m.MessageDelay, dAtA[iNdEx:postIndex]); err != nil { + if m.MessageDelay == nil { + m.MessageDelay = new(time.Duration) + } + if err := github_com_gogo_protobuf_types.StdDurationUnmarshal(m.MessageDelay, dAtA[iNdEx:postIndex]); err != nil { return err } iNdEx = postIndex @@ -1876,7 +1907,10 @@ func (m *SynchronyParams) Unmarshal(dAtA []byte) error { if postIndex > l { return io.ErrUnexpectedEOF } - if err := github_com_gogo_protobuf_types.StdDurationUnmarshal(&m.Precision, dAtA[iNdEx:postIndex]); err != nil { + if m.Precision == nil { + m.Precision = new(time.Duration) + } + if err := github_com_gogo_protobuf_types.StdDurationUnmarshal(m.Precision, dAtA[iNdEx:postIndex]); err != nil { return err } iNdEx = postIndex diff --git a/types/params.go b/types/params.go index fc9c4aaad..a74ae3762 100644 --- a/types/params.go +++ b/types/params.go @@ -275,8 +275,12 @@ func (params ConsensusParams) UpdateConsensusParams(params2 *tmproto.ConsensusPa res.Version.AppVersion = params2.Version.AppVersion } if params2.Synchrony != nil { - res.Synchrony.Precision = params2.Synchrony.Precision - res.Synchrony.MessageDelay = params2.Synchrony.MessageDelay + if params2.Synchrony.MessageDelay != nil { + res.Synchrony.MessageDelay = *params2.Synchrony.GetMessageDelay() + } + if params2.Synchrony.Precision != nil { + res.Synchrony.Precision = *params2.Synchrony.GetPrecision() + } } return res } @@ -299,14 +303,14 @@ func (params *ConsensusParams) ToProto() tmproto.ConsensusParams { AppVersion: params.Version.AppVersion, }, Synchrony: &tmproto.SynchronyParams{ - MessageDelay: params.Synchrony.MessageDelay, - Precision: params.Synchrony.Precision, + MessageDelay: ¶ms.Synchrony.MessageDelay, + Precision: ¶ms.Synchrony.Precision, }, } } func ConsensusParamsFromProto(pbParams tmproto.ConsensusParams) ConsensusParams { - return ConsensusParams{ + c := ConsensusParams{ Block: BlockParams{ MaxBytes: pbParams.Block.MaxBytes, MaxGas: pbParams.Block.MaxGas, @@ -322,9 +326,14 @@ func ConsensusParamsFromProto(pbParams tmproto.ConsensusParams) ConsensusParams Version: VersionParams{ AppVersion: pbParams.Version.AppVersion, }, - Synchrony: SynchronyParams{ - MessageDelay: pbParams.Synchrony.MessageDelay, - Precision: pbParams.Synchrony.Precision, - }, } + if pbParams.Synchrony != nil { + if pbParams.Synchrony.MessageDelay != nil { + c.Synchrony.MessageDelay = *pbParams.Synchrony.GetMessageDelay() + } + if pbParams.Synchrony.Precision != nil { + c.Synchrony.Precision = *pbParams.Synchrony.GetPrecision() + } + } + return c } diff --git a/types/params_test.go b/types/params_test.go index 0aaf1e2b9..41b5afc04 100644 --- a/types/params_test.go +++ b/types/params_test.go @@ -246,8 +246,8 @@ func TestConsensusParamsUpdate(t *testing.T) { intialParams: makeParams(makeParamsArgs{evidenceAge: 3, precision: time.Second, messageDelay: 3 * time.Second}), updates: &tmproto.ConsensusParams{ Synchrony: &tmproto.SynchronyParams{ - Precision: time.Second * 2, - MessageDelay: time.Second * 4, + Precision: durationPtr(time.Second * 2), + MessageDelay: durationPtr(time.Second * 4), }, }, updatedParams: makeParams(makeParamsArgs{evidenceAge: 3, precision: 2 * time.Second, messageDelay: 4 * time.Second}), @@ -339,3 +339,7 @@ func TestProto(t 
*testing.T) {
 	}
 }
+
+func durationPtr(t time.Duration) *time.Duration {
+	return &t
+}

From 7c91b53999fa202aedc87eaec96113214ce4da4b Mon Sep 17 00:00:00 2001
From: William Banfield <4561443+williambanfield@users.noreply.github.com>
Date: Thu, 17 Mar 2022 15:20:46 -0400
Subject: [PATCH 4/4] docs: PBTS synchrony issues runbook (#8129)

closes: #7756

# What does this pull request change?

This pull request adds a new runbook for operators encountering errors
related to the new Proposer-Based Timestamps algorithm. The goal of this
runbook is to give operators a set of clear steps that they can follow if
they are having issues producing blocks because of clock synchronization
problems.

This pull request also renames the `*PrevoteMessageDelay` metrics to
`*PrevoteDelay`. These metrics provide a combined view of `message_delay` +
`synchrony`, so the old names were potentially confusing.

# Questions to reviewers

* Are there ways to make the set of steps clearer or are there any pieces that seem confusing?
---
 .../proposer-based-timestamps-runbook.md      | 216 ++++++++++++++++++
 internal/consensus/metrics.go                 |  18 +-
 internal/consensus/state.go                   |   4 +-
 3 files changed, 227 insertions(+), 11 deletions(-)
 create mode 100644 docs/tools/debugging/proposer-based-timestamps-runbook.md

diff --git a/docs/tools/debugging/proposer-based-timestamps-runbook.md b/docs/tools/debugging/proposer-based-timestamps-runbook.md
new file mode 100644
index 000000000..a817bd29e
--- /dev/null
+++ b/docs/tools/debugging/proposer-based-timestamps-runbook.md
@@ -0,0 +1,216 @@
+---
+order: 3
+---
+
+# Proposer-Based Timestamps Runbook
+
+Version v0.36 of Tendermint added new constraints for the timestamps included in
+each block created by Tendermint. The new constraints mean that validators may
+fail to produce valid blocks or may issue `nil` `prevotes` for proposed blocks
+depending on the configuration of the validator's local clock.
+
+## What is this document for?
+
+This document provides a set of actionable steps for application developers and
+node operators to diagnose and fix issues related to clock synchronization and
+configuration of the Proposer-Based Timestamps [SynchronyParams](https://github.com/tendermint/tendermint/blob/master/spec/core/data_structures.md#synchronyparams).
+
+Use this runbook if you observe that validators are frequently voting `nil` for a block that the rest
+of the network votes for or if validators are frequently producing block proposals
+that are not voted for by the rest of the network.
+
+## Requirements
+
+To use this runbook, you must be running a node that has the [Prometheus metrics endpoint enabled](https://github.com/tendermint/tendermint/blob/master/docs/nodes/metrics.md)
+and the Tendermint RPC endpoint enabled and accessible.
+
+It is strongly recommended to also run a Prometheus metrics collector to gather and
+analyze metrics from the Tendermint node.
+
+## Debugging a Single Node
+
+If you observe that a single validator is frequently failing to produce blocks or
+voting nil for proposals that other validators vote for and suspect it may be
+related to clock synchronization, use the following steps to debug and correct the issue.
+
+### Check Timely Metric
+
+Tendermint exposes a histogram metric for the difference between the timestamp in the proposal
+and the time read from the node's local clock when the proposal is received.
+
+The histogram exposes multiple metrics on the Prometheus `/metrics` endpoint, called:
+* `tendermint_consensus_proposal_timestamp_difference_bucket`
+* `tendermint_consensus_proposal_timestamp_difference_sum`
+* `tendermint_consensus_proposal_timestamp_difference_count`
+
+Each metric is also labeled with the key `is_timely`, which can have a value of
+`true` or `false`.
+
+#### From the Prometheus Collector UI
+
+If you are running a Prometheus collector, navigate to the query web interface and select the 'Graph' tab.
+
+Issue a query for the following:
+
+```
+tendermint_consensus_proposal_timestamp_difference_count{is_timely="false"} /
+tendermint_consensus_proposal_timestamp_difference_count{is_timely="true"}
+```
+
+This query will graph the ratio of proposals the node considered untimely to those it
+considered timely. If the ratio is increasing, it means that your node is consistently
+seeing more proposals that are far from its local clock. If this is the case, you should
+check to make sure your local clock is properly synchronized to NTP.
+
+#### From the `/metrics` URL
+
+If you are not running a Prometheus collector, navigate to the `/metrics` endpoint
+exposed on the Prometheus metrics port with `curl` or a browser.
+
+Search for the `tendermint_consensus_proposal_timestamp_difference_count` metrics.
+This metric is labeled with `is_timely`. Investigate the value of
+`tendermint_consensus_proposal_timestamp_difference_count` where `is_timely="false"`
+and where `is_timely="true"`. Refresh the endpoint and observe if the value of `is_timely="false"`
+is growing.
+
+If you observe that `is_timely="false"` is growing, it means that your node is consistently
+seeing proposals that are far from its local clock. If this is the case, you should check
+to make sure your local clock is properly synchronized to NTP.
+
+### Checking Clock Sync
+
+NTP configuration and tooling are very specific to the operating system and distribution
+that your validator node is running. This guide assumes you have `timedatectl` installed with
+[chrony](https://chrony.tuxfamily.org/), a popular tool for interacting with time
+synchronization on Linux distributions. If you are using an operating system or
+distribution with a different time synchronization mechanism, please consult the
+documentation for your operating system to check the status and re-synchronize the daemon.
+
+#### Check if NTP is Enabled
+
+```shell
+$ timedatectl
+```
+
+From the output, ensure that `NTP service` is `active`. If `NTP service` is `inactive`, run:
+
+```shell
+$ timedatectl set-ntp true
+```
+
+Re-run the `timedatectl` command and verify that the change has taken effect.
+
+#### Check if Your NTP Daemon is Synchronized
+
+Check the status of your local `chrony` NTP daemon by running the following:
+
+```shell
+$ chronyc tracking
+```
+
+If the `chrony` daemon is running, you will see output that indicates its current status.
+If the `chrony` daemon is not running, restart it and re-run `chronyc tracking`.
+
+The `System time` field of the response should show a value that is much smaller than 100
+milliseconds.
+
+If the value is very large, restart the `chronyd` daemon.
+
+## Debugging a Network
+
+If you observe that a network is frequently failing to produce blocks and suspect
+it may be related to clock synchronization, use the following steps to debug and correct the issue.
+
+### Check Prevote Message Delay
+
+Tendermint exposes metrics that help determine how synchronized the clocks on a network are.
+
+These metrics are visible on the Prometheus `/metrics` endpoint and are called:
+* `tendermint_consensus_quorum_prevote_delay`
+* `tendermint_consensus_full_prevote_delay`
+
+These metrics calculate the difference between the timestamp in the proposal message and
+the timestamp of a prevote that was issued during consensus.
+
+The `tendermint_consensus_quorum_prevote_delay` metric is the interval in seconds
+between the proposal timestamp and the timestamp of the earliest prevote that
+achieved a quorum during the prevote step.
+
+The `tendermint_consensus_full_prevote_delay` metric is the interval in seconds
+between the proposal timestamp and the timestamp of the latest prevote in a round
+where 100% of the validators voted.
+
+#### From the Prometheus Collector UI
+
+If you are running a Prometheus collector, navigate to the query web interface and select the 'Graph' tab.
+
+Issue a query for the following:
+
+```
+sum(tendermint_consensus_quorum_prevote_delay) by (proposer_address)
+```
+
+This query will graph the difference in seconds for each proposer on the network.
+
+If the value is much larger for some proposers, then the issue is likely related to the clock
+synchronization of their nodes. Contact those proposers and ensure that their nodes
+are properly connected to NTP using the steps for [Debugging a Single Node](#debugging-a-single-node).
+
+If the value is relatively similar for all proposers, you should next compare this
+value to the `SynchronyParams` values for the network. Continue to the [Checking
+Synchrony](#checking-synchrony) steps.
+
+#### From the `/metrics` URL
+
+If you are not running a Prometheus collector, navigate to the `/metrics` endpoint
+exposed on the Prometheus metrics port.
+
+Search for the `tendermint_consensus_quorum_prevote_delay` metric. There will be one
+entry of this metric for each `proposer_address`. If the value of this metric is
+much larger for some proposers, then the issue is likely related to synchronization of their
+nodes with NTP. Contact those proposers and ensure that their nodes are properly connected
+to NTP using the steps for [Debugging a Single Node](#debugging-a-single-node).
+
+If the values are relatively similar for all proposers, you'll need to compare this
+value to the `SynchronyParams` for the network. Continue to the [Checking
+Synchrony](#checking-synchrony) steps.
+
+### Checking Synchrony
+
+To determine the currently configured `SynchronyParams` for your network, issue a
+request to your node's RPC endpoint. For a node running locally with the RPC server
+exposed on port `26657`, run the following command:
+
+```shell
+$ curl localhost:26657/consensus_params
+```
+
+The JSON output will contain a field named `synchrony`, with the following structure:
+
+```json
+{
+  "precision": "500000000",
+  "message_delay": "3000000000"
+}
+```
+
+The `precision` and `message_delay` values returned are listed in nanoseconds:
+in the example above, the precision is 500ms and the message delay is 3s.
+Remember, `tendermint_consensus_quorum_prevote_delay` is listed in seconds.
+If the `tendermint_consensus_quorum_prevote_delay` value approaches the sum of `precision` and `message_delay`,
+then the values selected for these parameters are too small. Your application will
+need to be modified to update the `SynchronyParams` to have larger values; a sketch
+of such an update follows.
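+
+The sketch below is a rough, hypothetical illustration only: the ABCI++ request and
+response type names and the `ConsensusParamUpdates` field are assumptions based on
+the draft `FinalizeBlock` specification linked in the next section, and `MyApp` and
+the duration values are placeholders. With those caveats, an application might
+return larger parameters like this:
+
+```go
+package main
+
+import (
+	"time"
+
+	abci "github.com/tendermint/tendermint/abci/types"
+	tmproto "github.com/tendermint/tendermint/proto/tendermint/types"
+)
+
+// MyApp stands in for your ABCI application type.
+type MyApp struct{}
+
+// FinalizeBlock returns updated SynchronyParams alongside the usual block results.
+// Note that the updated protos carry the durations as *time.Duration pointers.
+func (app *MyApp) FinalizeBlock(req abci.RequestFinalizeBlock) abci.ResponseFinalizeBlock {
+	messageDelay := 6 * time.Second     // placeholder value only
+	precision := 750 * time.Millisecond // placeholder value only
+	return abci.ResponseFinalizeBlock{
+		ConsensusParamUpdates: &tmproto.ConsensusParams{
+			Synchrony: &tmproto.SynchronyParams{
+				MessageDelay: &messageDelay,
+				Precision:    &precision,
+			},
+		},
+	}
+}
+```
+
+Per the `UpdateConsensusParams` logic earlier in this series, a `nil` duration
+leaves the corresponding parameter unchanged.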
+
+### Updating SynchronyParams
+
+The `SynchronyParams` are `ConsensusParams`, which means they are set and updated
+by the application running alongside Tendermint. Updates to these parameters are
+returned by the application in its response to the `FinalizeBlock` ABCI method call.
+
+If the application was built using the CosmosSDK, then these parameters can be updated
+programmatically using a governance proposal. For more information, see the [CosmosSDK
+documentation](https://hub.cosmos.network/main/governance/submitting.html#sending-the-transaction-that-submits-your-governance-proposal).
+
+If the application does not implement a way to update the consensus parameters
+programmatically, then the application itself must be updated to do so. More information on updating
+the consensus parameters via ABCI can be found in the [FinalizeBlock documentation](https://github.com/tendermint/tendermint/blob/master/spec/abci++/abci++_methods_002_draft.md#finalizeblock).
diff --git a/internal/consensus/metrics.go b/internal/consensus/metrics.go
index 5934be420..ed31ec636 100644
--- a/internal/consensus/metrics.go
+++ b/internal/consensus/metrics.go
@@ -92,12 +92,12 @@ type Metrics struct {
 	// be above 2/3 of the total voting power of the network defines the endpoint
 	// of the interval. Subtract the proposal timestamp from this endpoint
 	// to obtain the quorum delay.
-	QuorumPrevoteMessageDelay metrics.Gauge
+	QuorumPrevoteDelay metrics.Gauge
 
-	// FullPrevoteMessageDelay is the interval in seconds between the proposal
+	// FullPrevoteDelay is the interval in seconds between the proposal
 	// timestamp and the timestamp of the latest prevote in a round where 100%
 	// of the voting power on the network issued prevotes.
-	FullPrevoteMessageDelay metrics.Gauge
+	FullPrevoteDelay metrics.Gauge
 
 	// ProposalTimestampDifference is the difference between the timestamp in
 	// the proposal message and the local time of the validator at the time
@@ -257,17 +257,17 @@ func PrometheusMetrics(namespace string, labelsAndValues ...string) *Metrics {
 			Help:      "Time spent per step.",
 			Buckets:   stdprometheus.ExponentialBucketsRange(0.1, 100, 8),
 		}, append(labels, "step")).With(labelsAndValues...),
-		QuorumPrevoteMessageDelay: prometheus.NewGaugeFrom(stdprometheus.GaugeOpts{
+		QuorumPrevoteDelay: prometheus.NewGaugeFrom(stdprometheus.GaugeOpts{
 			Namespace: namespace,
 			Subsystem: MetricsSubsystem,
-			Name:      "quorum_prevote_message_delay",
+			Name:      "quorum_prevote_delay",
 			Help: "Difference in seconds between the proposal timestamp and the timestamp " +
 				"of the latest prevote that achieved a quorum in the prevote step.",
 		}, append(labels, "proposer_address")).With(labelsAndValues...),
-		FullPrevoteMessageDelay: prometheus.NewGaugeFrom(stdprometheus.GaugeOpts{
+		FullPrevoteDelay: prometheus.NewGaugeFrom(stdprometheus.GaugeOpts{
 			Namespace: namespace,
 			Subsystem: MetricsSubsystem,
-			Name:      "full_prevote_message_delay",
+			Name:      "full_prevote_delay",
 			Help: "Difference in seconds between the proposal timestamp and the timestamp " +
 				"of the latest prevote that achieved 100% of the voting power in the prevote step.",
 		}, append(labels, "proposer_address")).With(labelsAndValues...),
@@ -314,8 +314,8 @@ func NopMetrics() *Metrics {
 		BlockParts:                  discard.NewCounter(),
 		BlockGossipReceiveLatency:   discard.NewHistogram(),
 		BlockGossipPartsReceived:    discard.NewCounter(),
-		QuorumPrevoteMessageDelay:   discard.NewGauge(),
-		FullPrevoteMessageDelay:     discard.NewGauge(),
+		QuorumPrevoteDelay:          discard.NewGauge(),
+		FullPrevoteDelay:            discard.NewGauge(),
 		
ProposalTimestampDifference: discard.NewHistogram(), } } diff --git a/internal/consensus/state.go b/internal/consensus/state.go index 92b5b84d6..4f0f2c680 100644 --- a/internal/consensus/state.go +++ b/internal/consensus/state.go @@ -2592,12 +2592,12 @@ func (cs *State) calculatePrevoteMessageDelayMetrics() { _, val := cs.Validators.GetByAddress(v.ValidatorAddress) votingPowerSeen += val.VotingPower if votingPowerSeen >= cs.Validators.TotalVotingPower()*2/3+1 { - cs.metrics.QuorumPrevoteMessageDelay.With("proposer_address", cs.Validators.GetProposer().Address.String()).Set(v.Timestamp.Sub(cs.Proposal.Timestamp).Seconds()) + cs.metrics.QuorumPrevoteDelay.With("proposer_address", cs.Validators.GetProposer().Address.String()).Set(v.Timestamp.Sub(cs.Proposal.Timestamp).Seconds()) break } } if ps.HasAll() { - cs.metrics.FullPrevoteMessageDelay.With("proposer_address", cs.Validators.GetProposer().Address.String()).Set(pl[len(pl)-1].Timestamp.Sub(cs.Proposal.Timestamp).Seconds()) + cs.metrics.FullPrevoteDelay.With("proposer_address", cs.Validators.GetProposer().Address.String()).Set(pl[len(pl)-1].Timestamp.Sub(cs.Proposal.Timestamp).Seconds()) } }