tendermint/consensus/wal_test.go

208 lines
5.3 KiB
Go
Raw Normal View History

2017-10-09 23:10:58 +04:00
package consensus
import (
"bytes"
"crypto/rand"
"fmt"
"io/ioutil"
"os"
"path/filepath"
2018-04-05 07:05:45 -07:00
// "sync"
2017-10-09 23:10:58 +04:00
"testing"
"time"
"github.com/tendermint/tendermint/consensus/types"
"github.com/tendermint/tendermint/libs/autofile"
"github.com/tendermint/tendermint/libs/log"
2017-10-25 21:54:56 -04:00
tmtypes "github.com/tendermint/tendermint/types"
tmtime "github.com/tendermint/tendermint/types/time"
2017-10-09 23:10:58 +04:00
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
func TestWALTruncate(t *testing.T) {
walDir, err := ioutil.TempDir("", "wal")
require.NoError(t, err)
defer os.RemoveAll(walDir)
walFile := filepath.Join(walDir, "wal")
//this magic number 4K can truncate the content when RotateFile. defaultHeadSizeLimit(10M) is hard to simulate.
//this magic number 1 * time.Millisecond make RotateFile check frequently. defaultGroupCheckDuration(5s) is hard to simulate.
wal, err := NewWAL(walFile,
autofile.GroupHeadSizeLimit(4096),
autofile.GroupCheckDuration(1*time.Millisecond),
)
require.NoError(t, err)
wal.SetLogger(log.TestingLogger())
err = wal.Start()
require.NoError(t, err)
fix non deterministic test failures and race in privval socket (#3258) * node: decrease retry conn timeout in test Should fix #3256 The retry timeout was set to the default, which is the same as the accept timeout, so it's no wonder this would fail. Here we decrease the retry timeout so we can try many times before the accept timeout. * p2p: increase handshake timeout in test This fails sometimes, presumably because the handshake timeout is so low (only 50ms). So increase it to 1s. Should fix #3187 * privval: fix race with ping. closes #3237 Pings happen in a go-routine and can happen concurrently with other messages. Since we use a request/response protocol, we expect to send a request and get back the corresponding response. But with pings happening concurrently, this assumption could be violated. We were using a mutex, but only a RWMutex, where the RLock was being held for sending messages - this was to allow the underlying connection to be replaced if it fails. Turns out we actually need to use a full lock (not just a read lock) to prevent multiple requests from happening concurrently. * node: fix test name. DelayedStop -> DelayedStart * autofile: Wait() method In the TestWALTruncate in consensus/wal_test.go we remove the WAL directory at the end of the test. However the wal.Stop() does not properly wait for the autofile group to finish shutting down. Hence it was possible that the group's go-routine is still running when the cleanup happens, which causes a panic since the directory disappeared. Here we add a Wait() method to properly wait until the go-routine exits so we can safely clean up. This fixes #2852.
2019-02-06 10:24:43 -05:00
defer func() {
wal.Stop()
// wait for the wal to finish shutting down so we
// can safely remove the directory
wal.Wait()
}()
//60 block's size nearly 70K, greater than group's headBuf size(4096 * 10), when headBuf is full, truncate content will Flush to the file.
//at this time, RotateFile is called, truncate content exist in each file.
err = WALGenerateNBlocks(t, wal.Group(), 60)
require.NoError(t, err)
time.Sleep(1 * time.Millisecond) //wait groupCheckDuration, make sure RotateFile run
wal.Group().Flush()
h := int64(50)
gr, found, err := wal.SearchForEndHeight(h, &WALSearchOptions{})
assert.NoError(t, err, fmt.Sprintf("expected not to err on height %d", h))
assert.True(t, found, fmt.Sprintf("expected to find end height for %d", h))
assert.NotNil(t, gr, "expected group not to be nil")
defer gr.Close()
dec := NewWALDecoder(gr)
msg, err := dec.Decode()
assert.NoError(t, err, "expected to decode a message")
rs, ok := msg.Msg.(tmtypes.EventDataRoundState)
assert.True(t, ok, "expected message of type EventDataRoundState")
assert.Equal(t, rs.Height, h+1, fmt.Sprintf("wrong height"))
}
2017-10-09 23:10:58 +04:00
func TestWALEncoderDecoder(t *testing.T) {
now := tmtime.Now()
2017-10-09 23:10:58 +04:00
msgs := []TimedWALMessage{
{Time: now, Msg: EndHeightMessage{0}},
{Time: now, Msg: timeoutInfo{Duration: time.Second, Height: 1, Round: 1, Step: types.RoundStepPropose}},
2017-10-09 23:10:58 +04:00
}
b := new(bytes.Buffer)
for _, msg := range msgs {
b.Reset()
enc := NewWALEncoder(b)
err := enc.Encode(&msg)
require.NoError(t, err)
dec := NewWALDecoder(b)
decoded, err := dec.Decode()
require.NoError(t, err)
2018-04-05 08:17:10 -07:00
assert.Equal(t, msg.Time.UTC(), decoded.Time)
2017-10-09 23:10:58 +04:00
assert.Equal(t, msg.Msg, decoded.Msg)
}
}
2017-10-25 21:54:56 -04:00
cs/wal: refuse to encode msg that is bigger than maxMsgSizeBytes (#3303) Earlier this week somebody posted this in GoS Riot chat: ``` E[2019-02-12|10:38:37.596] Corrupted entry. Skipping... module=consensus wal=/home/gaia/.gaiad/data/cs.wal/wal err="DataCorruptionError[length 878916964 exceeded maximum possible value of 1048576 bytes]" E[2019-02-12|10:38:37.596] Corrupted entry. Skipping... module=consensus wal=/home/gaia/.gaiad/data/cs.wal/wal err="DataCorruptionError[length 825701731 exceeded maximum possible value of 1048576 bytes]" E[2019-02-12|10:38:37.596] Corrupted entry. Skipping... module=consensus wal=/home/gaia/.gaiad/data/cs.wal/wal err="DataCorruptionError[length 1631073634 exceeded maximum possible value of 1048576 bytes]" E[2019-02-12|10:38:37.596] Corrupted entry. Skipping... module=consensus wal=/home/gaia/.gaiad/data/cs.wal/wal err="DataCorruptionError[length 912418148 exceeded maximum possible value of 1048576 bytes]" E[2019-02-12|10:38:37.600] Corrupted entry. Skipping... module=consensus wal=/home/gaia/.gaiad/data/cs.wal/wal err="DataCorruptionError[failed to read data: EOF]" E[2019-02-12|10:38:37.600] Error on catchup replay. Proceeding to start ConsensusState anyway module=consensus err="Cannot replay height 7242. WAL does not contain #ENDHEIGHT for 7241" E[2019-02-12|10:38:37.861] Error dialing peer module=p2p err="dial tcp 35.183.126.181:26656: i/o timeout ``` Note the length error messages. What has happened is the length field got corrupted probably. I've looked at the code and noticed that we don't check the msg size during encoding. This PR fixes that. It also improves a few error messages in WALDecoder.
2019-02-18 11:23:06 +04:00
func TestWALWritePanicsIfMsgIsTooBig(t *testing.T) {
walDir, err := ioutil.TempDir("", "wal")
require.NoError(t, err)
defer os.RemoveAll(walDir)
walFile := filepath.Join(walDir, "wal")
wal, err := NewWAL(walFile)
require.NoError(t, err)
err = wal.Start()
require.NoError(t, err)
defer func() {
wal.Stop()
// wait for the wal to finish shutting down so we
// can safely remove the directory
wal.Wait()
}()
assert.Panics(t, func() { wal.Write(make([]byte, maxMsgSizeBytes+1)) })
}
2018-01-18 20:38:19 -05:00
func TestWALSearchForEndHeight(t *testing.T) {
walBody, err := WALWithNBlocks(t, 6)
2017-12-06 15:57:00 -06:00
if err != nil {
t.Fatal(err)
}
2017-12-06 18:28:14 -06:00
walFile := tempWALWithData(walBody)
2017-12-06 15:57:00 -06:00
2018-04-09 16:32:43 +02:00
wal, err := NewWAL(walFile)
require.NoError(t, err)
wal.SetLogger(log.TestingLogger())
2017-10-25 21:54:56 -04:00
h := int64(3)
gr, found, err := wal.SearchForEndHeight(h, &WALSearchOptions{})
assert.NoError(t, err, fmt.Sprintf("expected not to err on height %d", h))
assert.True(t, found, fmt.Sprintf("expected to find end height for %d", h))
2017-10-25 21:54:56 -04:00
assert.NotNil(t, gr, "expected group not to be nil")
defer gr.Close()
dec := NewWALDecoder(gr)
msg, err := dec.Decode()
assert.NoError(t, err, "expected to decode a message")
rs, ok := msg.Msg.(tmtypes.EventDataRoundState)
assert.True(t, ok, "expected message of type EventDataRoundState")
assert.Equal(t, rs.Height, h+1, fmt.Sprintf("wrong height"))
2017-10-25 21:54:56 -04:00
}
2018-04-05 07:05:45 -07:00
/*
var initOnce sync.Once
func registerInterfacesOnce() {
initOnce.Do(func() {
var _ = wire.RegisterInterface(
struct{ WALMessage }{},
wire.ConcreteType{[]byte{}, 0x10},
)
})
}
2018-04-05 07:05:45 -07:00
*/
func nBytes(n int) []byte {
buf := make([]byte, n)
n, _ = rand.Read(buf)
return buf[:n]
}
func benchmarkWalDecode(b *testing.B, n int) {
2018-04-05 07:05:45 -07:00
// registerInterfacesOnce()
buf := new(bytes.Buffer)
enc := NewWALEncoder(buf)
data := nBytes(n)
enc.Encode(&TimedWALMessage{Msg: data, Time: time.Now().Round(time.Second).UTC()})
encoded := buf.Bytes()
b.ResetTimer()
for i := 0; i < b.N; i++ {
buf.Reset()
buf.Write(encoded)
dec := NewWALDecoder(buf)
if _, err := dec.Decode(); err != nil {
b.Fatal(err)
}
}
b.ReportAllocs()
}
func BenchmarkWalDecode512B(b *testing.B) {
benchmarkWalDecode(b, 512)
}
func BenchmarkWalDecode10KB(b *testing.B) {
benchmarkWalDecode(b, 10*1024)
}
func BenchmarkWalDecode100KB(b *testing.B) {
benchmarkWalDecode(b, 100*1024)
}
func BenchmarkWalDecode1MB(b *testing.B) {
benchmarkWalDecode(b, 1024*1024)
}
func BenchmarkWalDecode10MB(b *testing.B) {
benchmarkWalDecode(b, 10*1024*1024)
}
func BenchmarkWalDecode100MB(b *testing.B) {
benchmarkWalDecode(b, 100*1024*1024)
}
func BenchmarkWalDecode1GB(b *testing.B) {
benchmarkWalDecode(b, 1024*1024*1024)
}