mirror of
https://github.com/fluencelabs/tendermint
synced 2025-07-31 20:21:56 +00:00
WAL: better errors and new fail point (#3246)
* privval: more info in errors * wal: change Debug logs to Info * wal: log and return error on corrupted wal instead of panicking * fail: Exit right away instead of sending interrupt * consensus: FAIL before handling our own vote allows to replicate #3089: - run using `FAIL_TEST_INDEX=0` - delete some bytes from the end of the WAL - start normally Results in logs like: ``` I[2019-02-03|18:12:58.225] Searching for height module=consensus wal=/Users/ethanbuchman/.tendermint/data/cs.wal/wal height=1 min=0 max=0 E[2019-02-03|18:12:58.225] Error on catchup replay. Proceeding to start ConsensusState anyway module=consensus err="failed to read data: EOF" I[2019-02-03|18:12:58.225] Started node module=main nodeInfo="{ProtocolVersion:{P2P:6 Block:9 App:1} ID_:35e87e93f2e31f305b65a5517fd2102331b56002 ListenAddr:tcp://0.0.0.0:26656 Network:test-chain-J8JvJH Version:0.29.1 Channels:4020212223303800 Moniker:Ethans-MacBook-Pro.local Other:{TxIndex:on RPCAddress:tcp://0.0.0.0:26657}}" E[2019-02-03|18:12:58.226] Couldn't connect to any seeds module=p2p I[2019-02-03|18:12:59.229] Timed out module=consensus dur=998.568ms height=1 round=0 step=RoundStepNewHeight I[2019-02-03|18:12:59.230] enterNewRound(1/0). Current: 1/0/RoundStepNewHeight module=consensus height=1 round=0 I[2019-02-03|18:12:59.230] enterPropose(1/0). Current: 1/0/RoundStepNewRound module=consensus height=1 round=0 I[2019-02-03|18:12:59.230] enterPropose: Our turn to propose module=consensus height=1 round=0 proposer=AD278B7767B05D7FBEB76207024C650988FA77D5 privValidator="PrivValidator{AD278B7767B05D7FBEB76207024C650988FA77D5 LH:1, LR:0, LS:2}" E[2019-02-03|18:12:59.230] enterPropose: Error signing proposal module=consensus height=1 round=0 err="Error signing proposal: Step regression at height 1 round 0. Got 1, last step 2" I[2019-02-03|18:13:02.233] Timed out module=consensus dur=3s height=1 round=0 step=RoundStepPropose I[2019-02-03|18:13:02.233] enterPrevote(1/0). 
Current: 1/0/RoundStepPropose module=consensus I[2019-02-03|18:13:02.233] enterPrevote: ProposalBlock is nil module=consensus height=1 round=0 E[2019-02-03|18:13:02.234] Error signing vote module=consensus height=1 round=0 vote="Vote{0:AD278B7767B0 1/00/1(Prevote) 000000000000 000000000000 @ 2019-02-04T02:13:02.233897Z}" err="Error signing vote: Conflicting data" ``` Notice the EOF, the step regression, and the conflicting data. * wal: change errors to be DataCorruptionError * exit on corrupt WAL * fix log * fix new line
This commit is contained in:
@@ -163,7 +163,7 @@ func (wal *baseWAL) SearchForEndHeight(height int64, options *WALSearchOptions)
|
||||
// NOTE: starting from the last file in the group because we're usually
|
||||
// searching for the last height. See replay.go
|
||||
min, max := wal.group.MinIndex(), wal.group.MaxIndex()
|
||||
wal.Logger.Debug("Searching for height", "height", height, "min", min, "max", max)
|
||||
wal.Logger.Info("Searching for height", "height", height, "min", min, "max", max)
|
||||
for index := max; index >= min; index-- {
|
||||
gr, err = wal.group.NewReader(index)
|
||||
if err != nil {
|
||||
@@ -183,7 +183,7 @@ func (wal *baseWAL) SearchForEndHeight(height int64, options *WALSearchOptions)
|
||||
break
|
||||
}
|
||||
if options.IgnoreDataCorruptionErrors && IsDataCorruptionError(err) {
|
||||
wal.Logger.Debug("Corrupted entry. Skipping...", "err", err)
|
||||
wal.Logger.Error("Corrupted entry. Skipping...", "err", err)
|
||||
// do nothing
|
||||
continue
|
||||
} else if err != nil {
|
||||
@@ -194,7 +194,7 @@ func (wal *baseWAL) SearchForEndHeight(height int64, options *WALSearchOptions)
|
||||
if m, ok := msg.Msg.(EndHeightMessage); ok {
|
||||
lastHeightFound = m.Height
|
||||
if m.Height == height { // found
|
||||
wal.Logger.Debug("Found", "height", height, "index", index)
|
||||
wal.Logger.Info("Found", "height", height, "index", index)
|
||||
return gr, true, nil
|
||||
}
|
||||
}
|
||||
@@ -281,25 +281,25 @@ func (dec *WALDecoder) Decode() (*TimedWALMessage, error) {
|
||||
return nil, err
|
||||
}
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to read checksum: %v", err)
|
||||
return nil, DataCorruptionError{fmt.Errorf("failed to read checksum: %v", err)}
|
||||
}
|
||||
crc := binary.BigEndian.Uint32(b)
|
||||
|
||||
b = make([]byte, 4)
|
||||
_, err = dec.rd.Read(b)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to read length: %v", err)
|
||||
return nil, DataCorruptionError{fmt.Errorf("failed to read length: %v", err)}
|
||||
}
|
||||
length := binary.BigEndian.Uint32(b)
|
||||
|
||||
if length > maxMsgSizeBytes {
|
||||
return nil, fmt.Errorf("length %d exceeded maximum possible value of %d bytes", length, maxMsgSizeBytes)
|
||||
return nil, DataCorruptionError{fmt.Errorf("length %d exceeded maximum possible value of %d bytes", length, maxMsgSizeBytes)}
|
||||
}
|
||||
|
||||
data := make([]byte, length)
|
||||
_, err = dec.rd.Read(data)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to read data: %v", err)
|
||||
return nil, DataCorruptionError{fmt.Errorf("failed to read data: %v", err)}
|
||||
}
|
||||
|
||||
// check checksum before decoding data
|
||||
|
Reference in New Issue
Block a user