- package consensus
-
- import (
- "context"
- "encoding/binary"
- "errors"
- "fmt"
- "hash/crc32"
- "io"
- "path/filepath"
- "time"
-
- "github.com/gogo/protobuf/proto"
-
- auto "github.com/tendermint/tendermint/internal/libs/autofile"
- tmjson "github.com/tendermint/tendermint/libs/json"
- "github.com/tendermint/tendermint/libs/log"
- tmos "github.com/tendermint/tendermint/libs/os"
- "github.com/tendermint/tendermint/libs/service"
- tmtime "github.com/tendermint/tendermint/libs/time"
- tmcons "github.com/tendermint/tendermint/proto/tendermint/consensus"
- )
-
- const (
- // time.Time + max consensus msg size
- maxMsgSizeBytes = maxMsgSize + 24
-
- // how often the WAL should be sync'd during period sync'ing
- walDefaultFlushInterval = 2 * time.Second
- )
-
- //--------------------------------------------------------
// types and functions for saving consensus messages
-
// WALMessage is any value that can be recorded in the WAL.
type WALMessage interface{}

// TimedWALMessage pairs a WALMessage with the time it was recorded; the
// timestamp exists for debugging purposes.
type TimedWALMessage struct {
	Time time.Time  `json:"time"`
	Msg  WALMessage `json:"msg"`
}

// EndHeightMessage is the marker written to the WAL when a height ends.
// @internal used by scripts/wal2json util.
type EndHeightMessage struct {
	Height int64 `json:"height"`
}
-
- func init() {
- tmjson.RegisterType(msgInfo{}, "tendermint/wal/MsgInfo")
- tmjson.RegisterType(timeoutInfo{}, "tendermint/wal/TimeoutInfo")
- tmjson.RegisterType(EndHeightMessage{}, "tendermint/wal/EndHeightMessage")
- }
-
- //--------------------------------------------------------
- // Simple write-ahead logger
-
- // WAL is an interface for any write-ahead logger.
- type WAL interface {
- Write(WALMessage) error
- WriteSync(WALMessage) error
- FlushAndSync() error
-
- SearchForEndHeight(height int64, options *WALSearchOptions) (rd io.ReadCloser, found bool, err error)
-
- // service methods
- Start(context.Context) error
- Stop() error
- Wait()
- }
-
- // Write ahead logger writes msgs to disk before they are processed.
- // Can be used for crash-recovery and deterministic replay.
- // TODO: currently the wal is overwritten during replay catchup, give it a mode
- // so it's either reading or appending - must read to end to start appending
- // again.
- type BaseWAL struct {
- service.BaseService
-
- group *auto.Group
-
- enc *WALEncoder
-
- flushTicker *time.Ticker
- flushInterval time.Duration
- }
-
- var _ WAL = &BaseWAL{}
-
- // NewWAL returns a new write-ahead logger based on `baseWAL`, which implements
- // WAL. It's flushed and synced to disk every 2s and once when stopped.
- func NewWAL(logger log.Logger, walFile string, groupOptions ...func(*auto.Group)) (*BaseWAL, error) {
- err := tmos.EnsureDir(filepath.Dir(walFile), 0700)
- if err != nil {
- return nil, fmt.Errorf("failed to ensure WAL directory is in place: %w", err)
- }
-
- group, err := auto.OpenGroup(walFile, groupOptions...)
- if err != nil {
- return nil, err
- }
- wal := &BaseWAL{
- group: group,
- enc: NewWALEncoder(group),
- flushInterval: walDefaultFlushInterval,
- }
- wal.BaseService = *service.NewBaseService(logger, "baseWAL", wal)
- return wal, nil
- }
-
- // SetFlushInterval allows us to override the periodic flush interval for the WAL.
- func (wal *BaseWAL) SetFlushInterval(i time.Duration) {
- wal.flushInterval = i
- }
-
- func (wal *BaseWAL) Group() *auto.Group {
- return wal.group
- }
-
- func (wal *BaseWAL) OnStart(ctx context.Context) error {
- size, err := wal.group.Head.Size()
- if err != nil {
- return err
- } else if size == 0 {
- if err := wal.WriteSync(EndHeightMessage{0}); err != nil {
- return err
- }
- }
- err = wal.group.Start(ctx)
- if err != nil {
- return err
- }
- wal.flushTicker = time.NewTicker(wal.flushInterval)
- go wal.processFlushTicks()
- return nil
- }
-
- func (wal *BaseWAL) processFlushTicks() {
- for {
- select {
- case <-wal.flushTicker.C:
- if err := wal.FlushAndSync(); err != nil {
- wal.Logger.Error("Periodic WAL flush failed", "err", err)
- }
- case <-wal.Quit():
- return
- }
- }
- }
-
- // FlushAndSync flushes and fsync's the underlying group's data to disk.
- // See auto#FlushAndSync
- func (wal *BaseWAL) FlushAndSync() error {
- return wal.group.FlushAndSync()
- }
-
- // Stop the underlying autofile group.
- // Use Wait() to ensure it's finished shutting down
- // before cleaning up files.
- func (wal *BaseWAL) OnStop() {
- wal.flushTicker.Stop()
- if err := wal.FlushAndSync(); err != nil {
- if !errors.Is(err, service.ErrAlreadyStopped) {
- wal.Logger.Error("error on flush data to disk", "error", err)
- }
- }
- if err := wal.group.Stop(); err != nil {
- if !errors.Is(err, service.ErrAlreadyStopped) {
- wal.Logger.Error("error trying to stop wal", "error", err)
- }
- }
- wal.group.Close()
- }
-
- // Wait for the underlying autofile group to finish shutting down
- // so it's safe to cleanup files.
- func (wal *BaseWAL) Wait() {
- wal.group.Wait()
- }
-
- // Write is called in newStep and for each receive on the
- // peerMsgQueue and the timeoutTicker.
- // NOTE: does not call fsync()
- func (wal *BaseWAL) Write(msg WALMessage) error {
- if wal == nil {
- return nil
- }
-
- if err := wal.enc.Encode(&TimedWALMessage{tmtime.Now(), msg}); err != nil {
- wal.Logger.Error("Error writing msg to consensus wal. WARNING: recover may not be possible for the current height",
- "err", err, "msg", msg)
- return err
- }
-
- return nil
- }
-
- // WriteSync is called when we receive a msg from ourselves
- // so that we write to disk before sending signed messages.
- // NOTE: calls fsync()
- func (wal *BaseWAL) WriteSync(msg WALMessage) error {
- if wal == nil {
- return nil
- }
-
- if err := wal.Write(msg); err != nil {
- return err
- }
-
- if err := wal.FlushAndSync(); err != nil {
- wal.Logger.Error(`WriteSync failed to flush consensus wal.
- WARNING: may result in creating alternative proposals / votes for the current height iff the node restarted`,
- "err", err)
- return err
- }
-
- return nil
- }
-
// WALSearchOptions carries the optional settings accepted by
// SearchForEndHeight.
type WALSearchOptions struct {
	// IgnoreDataCorruptionErrors, when true, makes the search skip over
	// entries that fail with a data corruption error instead of aborting.
	IgnoreDataCorruptionErrors bool
}
-
- // SearchForEndHeight searches for the EndHeightMessage with the given height
- // and returns an auto.GroupReader, whenever it was found or not and an error.
- // Group reader will be nil if found equals false.
- //
- // CONTRACT: caller must close group reader.
- func (wal *BaseWAL) SearchForEndHeight(
- height int64,
- options *WALSearchOptions) (rd io.ReadCloser, found bool, err error) {
- var (
- msg *TimedWALMessage
- gr *auto.GroupReader
- )
- lastHeightFound := int64(-1)
-
- // NOTE: starting from the last file in the group because we're usually
- // searching for the last height. See replay.go
- min, max := wal.group.MinIndex(), wal.group.MaxIndex()
- wal.Logger.Info("Searching for height", "height", height, "min", min, "max", max)
- for index := max; index >= min; index-- {
- gr, err = wal.group.NewReader(index)
- if err != nil {
- return nil, false, err
- }
-
- dec := NewWALDecoder(gr)
- for {
- msg, err = dec.Decode()
- if err == io.EOF {
- // OPTIMISATION: no need to look for height in older files if we've seen h < height
- if lastHeightFound > 0 && lastHeightFound < height {
- gr.Close()
- return nil, false, nil
- }
- // check next file
- break
- }
- if options.IgnoreDataCorruptionErrors && IsDataCorruptionError(err) {
- wal.Logger.Error("Corrupted entry. Skipping...", "err", err)
- // do nothing
- continue
- } else if err != nil {
- gr.Close()
- return nil, false, err
- }
-
- if m, ok := msg.Msg.(EndHeightMessage); ok {
- lastHeightFound = m.Height
- if m.Height == height { // found
- wal.Logger.Info("Found", "height", height, "index", index)
- return gr, true, nil
- }
- }
- }
- gr.Close()
- }
-
- return nil, false, nil
- }
-
// WALEncoder serialises WAL messages onto an output stream using a custom
// framing.
//
// Format: 4 bytes CRC sum + 4 bytes length + arbitrary-length value
type WALEncoder struct {
	wr io.Writer
}

// NewWALEncoder creates an encoder whose output goes to wr.
func NewWALEncoder(wr io.Writer) *WALEncoder {
	enc := WALEncoder{wr: wr}
	return &enc
}
-
- // Encode writes the custom encoding of v to the stream. It returns an error if
- // the encoded size of v is greater than 1MB. Any error encountered
- // during the write is also returned.
- func (enc *WALEncoder) Encode(v *TimedWALMessage) error {
- pbMsg, err := WALToProto(v.Msg)
- if err != nil {
- return err
- }
- pv := tmcons.TimedWALMessage{
- Time: v.Time,
- Msg: pbMsg,
- }
-
- data, err := proto.Marshal(&pv)
- if err != nil {
- panic(fmt.Errorf("encode timed wall message failure: %w", err))
- }
-
- crc := crc32.Checksum(data, crc32c)
- length := uint32(len(data))
- if length > maxMsgSizeBytes {
- return fmt.Errorf("msg is too big: %d bytes, max: %d bytes", length, maxMsgSizeBytes)
- }
- totalLength := 8 + int(length)
-
- msg := make([]byte, totalLength)
- binary.BigEndian.PutUint32(msg[0:4], crc)
- binary.BigEndian.PutUint32(msg[4:8], length)
- copy(msg[8:], data)
-
- _, err = enc.wr.Write(msg)
- return err
- }
-
// IsDataCorruptionError reports whether err is, or wraps, a
// DataCorruptionError, i.e. whether data has been corrupted inside the WAL.
// A nil err yields false.
func IsDataCorruptionError(err error) bool {
	// errors.As also matches a DataCorruptionError that a caller wrapped with
	// additional context (e.g. via fmt.Errorf and %w).
	var corruption DataCorruptionError
	return errors.As(err, &corruption)
}

// DataCorruptionError is an error that occurs if data on disk was corrupted.
type DataCorruptionError struct {
	cause error
}

// Error renders the error as "DataCorruptionError[<cause>]".
func (e DataCorruptionError) Error() string {
	return fmt.Sprintf("DataCorruptionError[%v]", e.cause)
}

// Cause returns the underlying error that was classified as corruption.
func (e DataCorruptionError) Cause() error {
	return e.cause
}
-
// WALDecoder reads custom-encoded WAL messages from an input stream and
// decodes them. See WALEncoder for the format used.
//
// While decoding it verifies the checksum and that the amount of data matches
// the length stored in the header; a mismatch results in an error.
type WALDecoder struct {
	rd io.Reader
}

// NewWALDecoder creates a decoder that consumes input from rd.
func NewWALDecoder(rd io.Reader) *WALDecoder {
	dec := WALDecoder{rd: rd}
	return &dec
}
-
- // Decode reads the next custom-encoded value from its reader and returns it.
- func (dec *WALDecoder) Decode() (*TimedWALMessage, error) {
- b := make([]byte, 4)
-
- _, err := dec.rd.Read(b)
- if errors.Is(err, io.EOF) {
- return nil, err
- }
- if err != nil {
- return nil, DataCorruptionError{fmt.Errorf("failed to read checksum: %v", err)}
- }
- crc := binary.BigEndian.Uint32(b)
-
- b = make([]byte, 4)
- _, err = dec.rd.Read(b)
- if err != nil {
- return nil, DataCorruptionError{fmt.Errorf("failed to read length: %v", err)}
- }
- length := binary.BigEndian.Uint32(b)
-
- if length > maxMsgSizeBytes {
- return nil, DataCorruptionError{fmt.Errorf(
- "length %d exceeded maximum possible value of %d bytes",
- length,
- maxMsgSizeBytes)}
- }
-
- data := make([]byte, length)
- n, err := dec.rd.Read(data)
- if err != nil {
- return nil, DataCorruptionError{fmt.Errorf("failed to read data: %v (read: %d, wanted: %d)", err, n, length)}
- }
-
- // check checksum before decoding data
- actualCRC := crc32.Checksum(data, crc32c)
- if actualCRC != crc {
- return nil, DataCorruptionError{fmt.Errorf("checksums do not match: read: %v, actual: %v", crc, actualCRC)}
- }
-
- var res = new(tmcons.TimedWALMessage)
- err = proto.Unmarshal(data, res)
- if err != nil {
- return nil, DataCorruptionError{fmt.Errorf("failed to decode data: %v", err)}
- }
-
- walMsg, err := WALFromProto(res.Msg)
- if err != nil {
- return nil, DataCorruptionError{fmt.Errorf("failed to convert from proto: %w", err)}
- }
- tMsgWal := &TimedWALMessage{
- Time: res.Time,
- Msg: walMsg,
- }
-
- return tMsgWal, err
- }
-
- type nilWAL struct{}
-
- var _ WAL = nilWAL{}
-
- func (nilWAL) Write(m WALMessage) error { return nil }
- func (nilWAL) WriteSync(m WALMessage) error { return nil }
- func (nilWAL) FlushAndSync() error { return nil }
- func (nilWAL) SearchForEndHeight(height int64, options *WALSearchOptions) (rd io.ReadCloser, found bool, err error) {
- return nil, false, nil
- }
- func (nilWAL) Start(context.Context) error { return nil }
- func (nilWAL) Stop() error { return nil }
- func (nilWAL) Wait() {}
|