2016-01-18 14:10:05 -05:00
|
|
|
package consensus
|
|
|
|
|
|
|
|
import (
|
2017-10-09 23:10:58 +04:00
|
|
|
"encoding/binary"
|
|
|
|
"fmt"
|
2017-09-20 15:33:05 -07:00
|
|
|
"hash/crc32"
|
2017-10-09 23:10:58 +04:00
|
|
|
"io"
|
2017-10-30 11:12:01 -05:00
|
|
|
"path/filepath"
|
2016-01-18 14:10:05 -05:00
|
|
|
"time"
|
|
|
|
|
2017-10-30 11:12:01 -05:00
|
|
|
"github.com/pkg/errors"
|
|
|
|
|
2018-04-09 16:32:43 +02:00
|
|
|
amino "github.com/tendermint/go-amino"
|
2018-07-01 22:36:49 -04:00
|
|
|
auto "github.com/tendermint/tendermint/libs/autofile"
|
|
|
|
cmn "github.com/tendermint/tendermint/libs/common"
|
2018-11-06 13:12:12 +01:00
|
|
|
"github.com/tendermint/tendermint/libs/log"
|
2018-08-10 00:25:57 -05:00
|
|
|
"github.com/tendermint/tendermint/types"
|
2018-09-01 01:33:51 +02:00
|
|
|
tmtime "github.com/tendermint/tendermint/types/time"
|
2016-01-18 14:10:05 -05:00
|
|
|
)
|
|
|
|
|
2017-12-11 19:48:57 -06:00
|
|
|
const (
|
2018-09-12 23:44:43 +04:00
|
|
|
// must be greater than types.BlockPartSizeBytes + a few bytes
|
2017-12-15 11:42:53 -06:00
|
|
|
maxMsgSizeBytes = 1024 * 1024 // 1MB
|
2019-02-20 07:45:18 +02:00
|
|
|
|
|
|
|
// how often the WAL should be sync'd during periodic sync'ing
|
|
|
|
walDefaultFlushInterval = 2 * time.Second
|
2017-12-11 19:48:57 -06:00
|
|
|
)
|
|
|
|
|
2016-01-18 14:10:05 -05:00
|
|
|
//--------------------------------------------------------
|
|
|
|
// types and functions for saving consensus messages
|
|
|
|
|
2016-10-28 15:01:14 -07:00
|
|
|
// TimedWALMessage pairs a WALMessage with a timestamp.
type TimedWALMessage struct {
|
2017-10-09 23:10:58 +04:00
|
|
|
Time time.Time `json:"time"` // for debugging purposes
|
|
|
|
// Msg is the wrapped WAL message.
Msg WALMessage `json:"msg"`
|
|
|
|
}
|
|
|
|
|
|
|
|
// EndHeightMessage marks the end of the given height inside WAL.
|
2017-12-06 15:57:00 -06:00
|
|
|
// @internal used by scripts/wal2json util.
|
2017-10-09 23:10:58 +04:00
|
|
|
type EndHeightMessage struct {
|
2017-12-01 19:04:53 -06:00
|
|
|
// Height is the height whose end this message marks.
Height int64 `json:"height"`
|
2016-01-18 14:10:05 -05:00
|
|
|
}
|
|
|
|
|
2016-10-28 15:01:14 -07:00
|
|
|
// WALMessage is the (empty) interface type of the messages accepted by
// WAL.Write and WAL.WriteSync. The concrete types registered for encoding are
// listed in RegisterWALMessages.
type WALMessage interface{}
|
2016-01-18 14:10:05 -05:00
|
|
|
|
2018-04-05 07:05:45 -07:00
|
|
|
// RegisterWALMessages registers the WALMessage interface and its concrete
// message types on the given amino codec, each under a "tendermint/wal/..." name.
func RegisterWALMessages(cdc *amino.Codec) {
|
|
|
|
cdc.RegisterInterface((*WALMessage)(nil), nil)
|
|
|
|
cdc.RegisterConcrete(types.EventDataRoundState{}, "tendermint/wal/EventDataRoundState", nil)
|
|
|
|
cdc.RegisterConcrete(msgInfo{}, "tendermint/wal/MsgInfo", nil)
|
|
|
|
cdc.RegisterConcrete(timeoutInfo{}, "tendermint/wal/TimeoutInfo", nil)
|
2018-04-06 13:46:40 -07:00
|
|
|
cdc.RegisterConcrete(EndHeightMessage{}, "tendermint/wal/EndHeightMessage", nil)
|
2018-04-05 07:05:45 -07:00
|
|
|
}
|
2016-01-18 14:10:05 -05:00
|
|
|
|
|
|
|
//--------------------------------------------------------
|
|
|
|
// Simple write-ahead logger
|
|
|
|
|
new pubsub package
comment out failing consensus tests for now
rewrite rpc httpclient to use new pubsub package
import pubsub as tmpubsub, query as tmquery
make event IDs constants
EventKey -> EventTypeKey
rename EventsPubsub to PubSub
mempool does not use pubsub
rename eventsSub to pubsub
new subscribe API
fix channel size issues and consensus tests bugs
refactor rpc client
add missing discardFromChan method
add mutex
rename pubsub to eventBus
remove IsRunning from WSRPCConnection interface (not needed)
add a comment in broadcastNewRoundStepsAndVotes
rename registerEventCallbacks to broadcastNewRoundStepsAndVotes
See https://dave.cheney.net/2014/03/19/channel-axioms
stop eventBuses after reactor tests
remove unnecessary Unsubscribe
return subscribe helper function
move discardFromChan to where it is used
subscribe now returns an err
this gives us ability to refuse to subscribe if pubsub is at its max
capacity.
use context for control overflow
cache queries
handle err when subscribing in replay_test
rename testClientID to testSubscriber
extract var
set channel buffer capacity to 1 in replay_file
fix byzantine_test
unsubscribe from single event, not all events
refactor httpclient to return events to appropriate channels
return failing testReplayCrashBeforeWriteVote test
fix TestValidatorSetChanges
refactor code a bit
fix testReplayCrashBeforeWriteVote
add comment
fix TestValidatorSetChanges
fixes from Bucky's review
update comment [ci skip]
test TxEventBuffer
update changelog
fix TestValidatorSetChanges (2nd attempt)
only do wg.Done when no errors
benchmark event bus
create pubsub server inside NewEventBus
only expose config params (later if needed)
set buffer capacity to 0 so we are not testing cache
new tx event format: key = "Tx" plus a tag {"tx.hash": XYZ}
This should allow to subscribe to all transactions! or a specific one
using a query: "tm.events.type = Tx and tx.hash = '013ABF99434...'"
use TimeoutCommit instead of afterPublishEventNewBlockTimeout
TimeoutCommit is the time a node waits after committing a block, before
it goes into the next height. So it will finish everything from the last
block, but then wait a bit. The idea is this gives it time to hear more
votes from other validators, to strengthen the commit it includes in the
next block. But it also gives it time to hear about new transactions.
waitForBlockWithUpdatedVals
rewrite WAL crash tests
Task:
test that we can recover from any WAL crash.
Solution:
the old tests were relying on event hub being run in the same thread (we
were injecting the private validator's last signature).
when considering a rewrite, we considered two possible solutions: write
a "fuzzy" testing system where WAL is crashing upon receiving a new
message, or inject failures and trigger them in tests using something
like https://github.com/coreos/gofail.
remove sleep
no cs.Lock around wal.Save
test different cases (empty block, non-empty block, ...)
comments
add comments
test 4 cases: empty block, non-empty block, non-empty block with smaller part size, many blocks
fixes as per Bucky's last review
reset subscriptions on UnsubscribeAll
use a simple counter to track message for which we panicked
also, set a smaller part size for all test cases
2017-06-26 19:00:30 +04:00
|
|
|
// WAL is an interface for any write-ahead logger.
|
|
|
|
type WAL interface {
|
2018-05-20 14:40:01 -04:00
|
|
|
Write(WALMessage)
|
|
|
|
WriteSync(WALMessage)
|
new pubsub package
comment out failing consensus tests for now
rewrite rpc httpclient to use new pubsub package
import pubsub as tmpubsub, query as tmquery
make event IDs constants
EventKey -> EventTypeKey
rename EventsPubsub to PubSub
mempool does not use pubsub
rename eventsSub to pubsub
new subscribe API
fix channel size issues and consensus tests bugs
refactor rpc client
add missing discardFromChan method
add mutex
rename pubsub to eventBus
remove IsRunning from WSRPCConnection interface (not needed)
add a comment in broadcastNewRoundStepsAndVotes
rename registerEventCallbacks to broadcastNewRoundStepsAndVotes
See https://dave.cheney.net/2014/03/19/channel-axioms
stop eventBuses after reactor tests
remove unnecessary Unsubscribe
return subscribe helper function
move discardFromChan to where it is used
subscribe now returns an err
this gives us ability to refuse to subscribe if pubsub is at its max
capacity.
use context for control overflow
cache queries
handle err when subscribing in replay_test
rename testClientID to testSubscriber
extract var
set channel buffer capacity to 1 in replay_file
fix byzantine_test
unsubscribe from single event, not all events
refactor httpclient to return events to appropriate channels
return failing testReplayCrashBeforeWriteVote test
fix TestValidatorSetChanges
refactor code a bit
fix testReplayCrashBeforeWriteVote
add comment
fix TestValidatorSetChanges
fixes from Bucky's review
update comment [ci skip]
test TxEventBuffer
update changelog
fix TestValidatorSetChanges (2nd attempt)
only do wg.Done when no errors
benchmark event bus
create pubsub server inside NewEventBus
only expose config params (later if needed)
set buffer capacity to 0 so we are not testing cache
new tx event format: key = "Tx" plus a tag {"tx.hash": XYZ}
This should allow to subscribe to all transactions! or a specific one
using a query: "tm.events.type = Tx and tx.hash = '013ABF99434...'"
use TimeoutCommit instead of afterPublishEventNewBlockTimeout
TimeoutCommit is the time a node waits after committing a block, before
it goes into the next height. So it will finish everything from the last
block, but then wait a bit. The idea is this gives it time to hear more
votes from other validators, to strengthen the commit it includes in the
next block. But it also gives it time to hear about new transactions.
waitForBlockWithUpdatedVals
rewrite WAL crash tests
Task:
test that we can recover from any WAL crash.
Solution:
the old tests were relying on event hub being run in the same thread (we
were injecting the private validator's last signature).
when considering a rewrite, we considered two possible solutions: write
a "fuzzy" testing system where WAL is crashing upon receiving a new
message, or inject failures and trigger them in tests using something
like https://github.com/coreos/gofail.
remove sleep
no cs.Lock around wal.Save
test different cases (empty block, non-empty block, ...)
comments
add comments
test 4 cases: empty block, non-empty block, non-empty block with smaller part size, many blocks
fixes as per Bucky's last review
reset subscriptions on UnsubscribeAll
use a simple counter to track message for which we panicked
also, set a smaller part size for all test cases
2017-06-26 19:00:30 +04:00
|
|
|
Group() *auto.Group
|
2017-12-11 19:48:20 -06:00
|
|
|
SearchForEndHeight(height int64, options *WALSearchOptions) (gr *auto.GroupReader, found bool, err error)
|
2019-02-20 07:45:18 +02:00
|
|
|
Flush() error
|
new pubsub package
comment out failing consensus tests for now
rewrite rpc httpclient to use new pubsub package
import pubsub as tmpubsub, query as tmquery
make event IDs constants
EventKey -> EventTypeKey
rename EventsPubsub to PubSub
mempool does not use pubsub
rename eventsSub to pubsub
new subscribe API
fix channel size issues and consensus tests bugs
refactor rpc client
add missing discardFromChan method
add mutex
rename pubsub to eventBus
remove IsRunning from WSRPCConnection interface (not needed)
add a comment in broadcastNewRoundStepsAndVotes
rename registerEventCallbacks to broadcastNewRoundStepsAndVotes
See https://dave.cheney.net/2014/03/19/channel-axioms
stop eventBuses after reactor tests
remove unnecessary Unsubscribe
return subscribe helper function
move discardFromChan to where it is used
subscribe now returns an err
this gives us ability to refuse to subscribe if pubsub is at its max
capacity.
use context for control overflow
cache queries
handle err when subscribing in replay_test
rename testClientID to testSubscriber
extract var
set channel buffer capacity to 1 in replay_file
fix byzantine_test
unsubscribe from single event, not all events
refactor httpclient to return events to appropriate channels
return failing testReplayCrashBeforeWriteVote test
fix TestValidatorSetChanges
refactor code a bit
fix testReplayCrashBeforeWriteVote
add comment
fix TestValidatorSetChanges
fixes from Bucky's review
update comment [ci skip]
test TxEventBuffer
update changelog
fix TestValidatorSetChanges (2nd attempt)
only do wg.Done when no errors
benchmark event bus
create pubsub server inside NewEventBus
only expose config params (later if needed)
set buffer capacity to 0 so we are not testing cache
new tx event format: key = "Tx" plus a tag {"tx.hash": XYZ}
This should allow to subscribe to all transactions! or a specific one
using a query: "tm.events.type = Tx and tx.hash = '013ABF99434...'"
use TimeoutCommit instead of afterPublishEventNewBlockTimeout
TimeoutCommit is the time a node waits after committing a block, before
it goes into the next height. So it will finish everything from the last
block, but then wait a bit. The idea is this gives it time to hear more
votes from other validators, to strengthen the commit it includes in the
next block. But it also gives it time to hear about new transactions.
waitForBlockWithUpdatedVals
rewrite WAL crash tests
Task:
test that we can recover from any WAL crash.
Solution:
the old tests were relying on event hub being run in the same thread (we
were injecting the private validator's last signature).
when considering a rewrite, we considered two possible solutions: write
a "fuzzy" testing system where WAL is crashing upon receiving a new
message, or inject failures and trigger them in tests using something
like https://github.com/coreos/gofail.
remove sleep
no cs.Lock around wal.Save
test different cases (empty block, non-empty block, ...)
comments
add comments
test 4 cases: empty block, non-empty block, non-empty block with smaller part size, many blocks
fixes as per Bucky's last review
reset subscriptions on UnsubscribeAll
use a simple counter to track message for which we panicked
also, set a smaller part size for all test cases
2017-06-26 19:00:30 +04:00
|
|
|
|
2017-11-06 13:20:39 -05:00
|
|
|
Start() error
|
|
|
|
Stop() error
|
new pubsub package
comment out failing consensus tests for now
rewrite rpc httpclient to use new pubsub package
import pubsub as tmpubsub, query as tmquery
make event IDs constants
EventKey -> EventTypeKey
rename EventsPubsub to PubSub
mempool does not use pubsub
rename eventsSub to pubsub
new subscribe API
fix channel size issues and consensus tests bugs
refactor rpc client
add missing discardFromChan method
add mutex
rename pubsub to eventBus
remove IsRunning from WSRPCConnection interface (not needed)
add a comment in broadcastNewRoundStepsAndVotes
rename registerEventCallbacks to broadcastNewRoundStepsAndVotes
See https://dave.cheney.net/2014/03/19/channel-axioms
stop eventBuses after reactor tests
remove unnecessary Unsubscribe
return subscribe helper function
move discardFromChan to where it is used
subscribe now returns an err
this gives us ability to refuse to subscribe if pubsub is at its max
capacity.
use context for control overflow
cache queries
handle err when subscribing in replay_test
rename testClientID to testSubscriber
extract var
set channel buffer capacity to 1 in replay_file
fix byzantine_test
unsubscribe from single event, not all events
refactor httpclient to return events to appropriate channels
return failing testReplayCrashBeforeWriteVote test
fix TestValidatorSetChanges
refactor code a bit
fix testReplayCrashBeforeWriteVote
add comment
fix TestValidatorSetChanges
fixes from Bucky's review
update comment [ci skip]
test TxEventBuffer
update changelog
fix TestValidatorSetChanges (2nd attempt)
only do wg.Done when no errors
benchmark event bus
create pubsub server inside NewEventBus
only expose config params (later if needed)
set buffer capacity to 0 so we are not testing cache
new tx event format: key = "Tx" plus a tag {"tx.hash": XYZ}
This should allow to subscribe to all transactions! or a specific one
using a query: "tm.events.type = Tx and tx.hash = '013ABF99434...'"
use TimeoutCommit instead of afterPublishEventNewBlockTimeout
TimeoutCommit is the time a node waits after committing a block, before
it goes into the next height. So it will finish everything from the last
block, but then wait a bit. The idea is this gives it time to hear more
votes from other validators, to strengthen the commit it includes in the
next block. But it also gives it time to hear about new transactions.
waitForBlockWithUpdatedVals
rewrite WAL crash tests
Task:
test that we can recover from any WAL crash.
Solution:
the old tests were relying on event hub being run in the same thread (we
were injecting the private validator's last signature).
when considering a rewrite, we considered two possible solutions: write
a "fuzzy" testing system where WAL is crashing upon receiving a new
message, or inject failures and trigger them in tests using something
like https://github.com/coreos/gofail.
remove sleep
no cs.Lock around wal.Save
test different cases (empty block, non-empty block, ...)
comments
add comments
test 4 cases: empty block, non-empty block, non-empty block with smaller part size, many blocks
fixes as per Bucky's last review
reset subscriptions on UnsubscribeAll
use a simple counter to track message for which we panicked
also, set a smaller part size for all test cases
2017-06-26 19:00:30 +04:00
|
|
|
Wait()
|
|
|
|
}
|
|
|
|
|
2016-01-18 14:10:05 -05:00
|
|
|
// Write ahead logger writes msgs to disk before they are processed.
|
2019-02-21 18:28:02 +04:00
|
|
|
// Can be used for crash-recovery and deterministic replay.
|
|
|
|
// TODO: currently the wal is overwritten during replay catchup, give it a mode
|
|
|
|
// so it's either reading or appending - must read to end to start appending
|
|
|
|
// again.
|
new pubsub package
comment out failing consensus tests for now
rewrite rpc httpclient to use new pubsub package
import pubsub as tmpubsub, query as tmquery
make event IDs constants
EventKey -> EventTypeKey
rename EventsPubsub to PubSub
mempool does not use pubsub
rename eventsSub to pubsub
new subscribe API
fix channel size issues and consensus tests bugs
refactor rpc client
add missing discardFromChan method
add mutex
rename pubsub to eventBus
remove IsRunning from WSRPCConnection interface (not needed)
add a comment in broadcastNewRoundStepsAndVotes
rename registerEventCallbacks to broadcastNewRoundStepsAndVotes
See https://dave.cheney.net/2014/03/19/channel-axioms
stop eventBuses after reactor tests
remove unnecessary Unsubscribe
return subscribe helper function
move discardFromChan to where it is used
subscribe now returns an err
this gives us ability to refuse to subscribe if pubsub is at its max
capacity.
use context for control overflow
cache queries
handle err when subscribing in replay_test
rename testClientID to testSubscriber
extract var
set channel buffer capacity to 1 in replay_file
fix byzantine_test
unsubscribe from single event, not all events
refactor httpclient to return events to appropriate channels
return failing testReplayCrashBeforeWriteVote test
fix TestValidatorSetChanges
refactor code a bit
fix testReplayCrashBeforeWriteVote
add comment
fix TestValidatorSetChanges
fixes from Bucky's review
update comment [ci skip]
test TxEventBuffer
update changelog
fix TestValidatorSetChanges (2nd attempt)
only do wg.Done when no errors
benchmark event bus
create pubsub server inside NewEventBus
only expose config params (later if needed)
set buffer capacity to 0 so we are not testing cache
new tx event format: key = "Tx" plus a tag {"tx.hash": XYZ}
This should allow to subscribe to all transactions! or a specific one
using a query: "tm.events.type = Tx and tx.hash = '013ABF99434...'"
use TimeoutCommit instead of afterPublishEventNewBlockTimeout
TimeoutCommit is the time a node waits after committing a block, before
it goes into the next height. So it will finish everything from the last
block, but then wait a bit. The idea is this gives it time to hear more
votes from other validators, to strengthen the commit it includes in the
next block. But it also gives it time to hear about new transactions.
waitForBlockWithUpdatedVals
rewrite WAL crash tests
Task:
test that we can recover from any WAL crash.
Solution:
the old tests were relying on event hub being run in the same thread (we
were injecting the private validator's last signature).
when considering a rewrite, we considered two possible solutions: write
a "fuzzy" testing system where WAL is crashing upon receiving a new
message, or inject failures and trigger them in tests using something
like https://github.com/coreos/gofail.
remove sleep
no cs.Lock around wal.Save
test different cases (empty block, non-empty block, ...)
comments
add comments
test 4 cases: empty block, non-empty block, non-empty block with smaller part size, many blocks
fixes as per Bucky's last review
reset subscriptions on UnsubscribeAll
use a simple counter to track message for which we panicked
also, set a smaller part size for all test cases
2017-06-26 19:00:30 +04:00
|
|
|
type baseWAL struct {
|
2017-10-04 16:40:45 -04:00
|
|
|
// Embedded service; NewWAL initializes it with the name "baseWAL".
cmn.BaseService
|
2016-03-01 16:04:19 -05:00
|
|
|
|
|
2016-10-28 15:01:14 -07:00
|
|
|
// group is the autofile group opened from the WAL file path in NewWAL.
group *auto.Group
|
2017-10-09 23:10:58 +04:00
|
|
|
|
|
|
|
// enc is the encoder NewWAL constructs over group.
enc *WALEncoder
|
2019-02-20 07:45:18 +02:00
|
|
|
|
|
|
|
// NOTE(review): flushTicker is not set by NewWAL; presumably it is created
// when the service starts, using flushInterval — confirm.
flushTicker *time.Ticker
|
|
|
|
// flushInterval defaults to walDefaultFlushInterval and can be overridden
// with SetFlushInterval.
flushInterval time.Duration
|
2016-01-18 14:10:05 -05:00
|
|
|
}
|
|
|
|
|
2019-02-21 18:28:02 +04:00
|
|
|
// NewWAL returns a new write-ahead logger based on `baseWAL`, which implements
|
|
|
|
// WAL. It's flushed and synced to disk every 2s and once when stopped.
|
2018-09-25 19:22:45 +08:00
|
|
|
func NewWAL(walFile string, groupOptions ...func(*auto.Group)) (*baseWAL, error) {
|
2017-10-30 11:12:01 -05:00
|
|
|
err := cmn.EnsureDir(filepath.Dir(walFile), 0700)
|
|
|
|
if err != nil {
|
|
|
|
return nil, errors.Wrap(err, "failed to ensure WAL directory is in place")
|
|
|
|
}
|
|
|
|
|
2018-09-25 19:22:45 +08:00
|
|
|
group, err := auto.OpenGroup(walFile, groupOptions...)
|
2016-01-18 14:10:05 -05:00
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
new pubsub package
comment out failing consensus tests for now
rewrite rpc httpclient to use new pubsub package
import pubsub as tmpubsub, query as tmquery
make event IDs constants
EventKey -> EventTypeKey
rename EventsPubsub to PubSub
mempool does not use pubsub
rename eventsSub to pubsub
new subscribe API
fix channel size issues and consensus tests bugs
refactor rpc client
add missing discardFromChan method
add mutex
rename pubsub to eventBus
remove IsRunning from WSRPCConnection interface (not needed)
add a comment in broadcastNewRoundStepsAndVotes
rename registerEventCallbacks to broadcastNewRoundStepsAndVotes
See https://dave.cheney.net/2014/03/19/channel-axioms
stop eventBuses after reactor tests
remove unnecessary Unsubscribe
return subscribe helper function
move discardFromChan to where it is used
subscribe now returns an err
this gives us ability to refuse to subscribe if pubsub is at its max
capacity.
use context for control overflow
cache queries
handle err when subscribing in replay_test
rename testClientID to testSubscriber
extract var
set channel buffer capacity to 1 in replay_file
fix byzantine_test
unsubscribe from single event, not all events
refactor httpclient to return events to appropriate channels
return failing testReplayCrashBeforeWriteVote test
fix TestValidatorSetChanges
refactor code a bit
fix testReplayCrashBeforeWriteVote
add comment
fix TestValidatorSetChanges
fixes from Bucky's review
update comment [ci skip]
test TxEventBuffer
update changelog
fix TestValidatorSetChanges (2nd attempt)
only do wg.Done when no errors
benchmark event bus
create pubsub server inside NewEventBus
only expose config params (later if needed)
set buffer capacity to 0 so we are not testing cache
new tx event format: key = "Tx" plus a tag {"tx.hash": XYZ}
This should allow to subscribe to all transactions! or a specific one
using a query: "tm.events.type = Tx and tx.hash = '013ABF99434...'"
use TimeoutCommit instead of afterPublishEventNewBlockTimeout
TimeoutCommit is the time a node waits after committing a block, before
it goes into the next height. So it will finish everything from the last
block, but then wait a bit. The idea is this gives it time to hear more
votes from other validators, to strengthen the commit it includes in the
next block. But it also gives it time to hear about new transactions.
waitForBlockWithUpdatedVals
rewrite WAL crash tests
Task:
test that we can recover from any WAL crash.
Solution:
the old tests were relying on event hub being run in the same thread (we
were injecting the private validator's last signature).
when considering a rewrite, we considered two possible solutions: write
a "fuzzy" testing system where WAL is crashing upon receiving a new
message, or inject failures and trigger them in tests using something
like https://github.com/coreos/gofail.
remove sleep
no cs.Lock around wal.Save
test different cases (empty block, non-empty block, ...)
comments
add comments
test 4 cases: empty block, non-empty block, non-empty block with smaller part size, many blocks
fixes as per Bucky's last review
reset subscriptions on UnsubscribeAll
use a simple counter to track message for which we panicked
also, set a smaller part size for all test cases
2017-06-26 19:00:30 +04:00
|
|
|
wal := &baseWAL{
|
2019-02-20 07:45:18 +02:00
|
|
|
group: group,
|
|
|
|
enc: NewWALEncoder(group),
|
|
|
|
flushInterval: walDefaultFlushInterval,
|
2016-10-28 15:01:14 -07:00
|
|
|
}
|
new pubsub package
comment out failing consensus tests for now
rewrite rpc httpclient to use new pubsub package
import pubsub as tmpubsub, query as tmquery
make event IDs constants
EventKey -> EventTypeKey
rename EventsPubsub to PubSub
mempool does not use pubsub
rename eventsSub to pubsub
new subscribe API
fix channel size issues and consensus tests bugs
refactor rpc client
add missing discardFromChan method
add mutex
rename pubsub to eventBus
remove IsRunning from WSRPCConnection interface (not needed)
add a comment in broadcastNewRoundStepsAndVotes
rename registerEventCallbacks to broadcastNewRoundStepsAndVotes
See https://dave.cheney.net/2014/03/19/channel-axioms
stop eventBuses after reactor tests
remove unnecessary Unsubscribe
return subscribe helper function
move discardFromChan to where it is used
subscribe now returns an err
this gives us ability to refuse to subscribe if pubsub is at its max
capacity.
use context for control overflow
cache queries
handle err when subscribing in replay_test
rename testClientID to testSubscriber
extract var
set channel buffer capacity to 1 in replay_file
fix byzantine_test
unsubscribe from single event, not all events
refactor httpclient to return events to appropriate channels
return failing testReplayCrashBeforeWriteVote test
fix TestValidatorSetChanges
refactor code a bit
fix testReplayCrashBeforeWriteVote
add comment
fix TestValidatorSetChanges
fixes from Bucky's review
update comment [ci skip]
test TxEventBuffer
update changelog
fix TestValidatorSetChanges (2nd attempt)
only do wg.Done when no errors
benchmark event bus
create pubsub server inside NewEventBus
only expose config params (later if needed)
set buffer capacity to 0 so we are not testing cache
new tx event format: key = "Tx" plus a tag {"tx.hash": XYZ}
This should allow to subscribe to all transactions! or a specific one
using a query: "tm.events.type = Tx and tx.hash = '013ABF99434...'"
use TimeoutCommit instead of afterPublishEventNewBlockTimeout
TimeoutCommit is the time a node waits after committing a block, before
it goes into the next height. So it will finish everything from the last
block, but then wait a bit. The idea is this gives it time to hear more
votes from other validators, to strengthen the commit it includes in the
next block. But it also gives it time to hear about new transactions.
waitForBlockWithUpdatedVals
rewrite WAL crash tests
Task:
test that we can recover from any WAL crash.
Solution:
the old tests were relying on event hub being run in the same thread (we
were injecting the private validator's last signature).
when considering a rewrite, we considered two possible solutions: write
a "fuzzy" testing system where WAL is crashing upon receiving a new
message, or inject failures and trigger them in tests using something
like https://github.com/coreos/gofail.
remove sleep
no cs.Lock around wal.Save
test different cases (empty block, non-empty block, ...)
comments
add comments
test 4 cases: empty block, non-empty block, non-empty block with smaller part size, many blocks
fixes as per Bucky's last review
reset subscriptions on UnsubscribeAll
use a simple counter to track message for which we panicked
also, set a smaller part size for all test cases
2017-06-26 19:00:30 +04:00
|
|
|
wal.BaseService = *cmn.NewBaseService(nil, "baseWAL", wal)
|
2017-05-12 23:07:53 +02:00
|
|
|
return wal, nil
|
2016-11-16 01:15:39 -05:00
|
|
|
}
|
|
|
|
|
2019-02-20 07:45:18 +02:00
|
|
|
// SetFlushInterval allows us to override the periodic flush interval for the WAL.
|
|
|
|
func (wal *baseWAL) SetFlushInterval(i time.Duration) {
|
|
|
|
// Only the stored interval is updated; this method does not touch flushTicker.
wal.flushInterval = i
|
|
|
|
}
|
|
|
|
|
new pubsub package
comment out failing consensus tests for now
rewrite rpc httpclient to use new pubsub package
import pubsub as tmpubsub, query as tmquery
make event IDs constants
EventKey -> EventTypeKey
rename EventsPubsub to PubSub
mempool does not use pubsub
rename eventsSub to pubsub
new subscribe API
fix channel size issues and consensus tests bugs
refactor rpc client
add missing discardFromChan method
add mutex
rename pubsub to eventBus
remove IsRunning from WSRPCConnection interface (not needed)
add a comment in broadcastNewRoundStepsAndVotes
rename registerEventCallbacks to broadcastNewRoundStepsAndVotes
See https://dave.cheney.net/2014/03/19/channel-axioms
stop eventBuses after reactor tests
remove unnecessary Unsubscribe
return subscribe helper function
move discardFromChan to where it is used
subscribe now returns an err
this gives us ability to refuse to subscribe if pubsub is at its max
capacity.
use context for control overflow
cache queries
handle err when subscribing in replay_test
rename testClientID to testSubscriber
extract var
set channel buffer capacity to 1 in replay_file
fix byzantine_test
unsubscribe from single event, not all events
refactor httpclient to return events to appropriate channels
return failing testReplayCrashBeforeWriteVote test
fix TestValidatorSetChanges
refactor code a bit
fix testReplayCrashBeforeWriteVote
add comment
fix TestValidatorSetChanges
fixes from Bucky's review
update comment [ci skip]
test TxEventBuffer
update changelog
fix TestValidatorSetChanges (2nd attempt)
only do wg.Done when no errors
benchmark event bus
create pubsub server inside NewEventBus
only expose config params (later if needed)
set buffer capacity to 0 so we are not testing cache
new tx event format: key = "Tx" plus a tag {"tx.hash": XYZ}
This should allow to subscribe to all transactions! or a specific one
using a query: "tm.events.type = Tx and tx.hash = '013ABF99434...'"
use TimeoutCommit instead of afterPublishEventNewBlockTimeout
TimeoutCommit is the time a node waits after committing a block, before
it goes into the next height. So it will finish everything from the last
block, but then wait a bit. The idea is this gives it time to hear more
votes from other validators, to strengthen the commit it includes in the
next block. But it also gives it time to hear about new transactions.
waitForBlockWithUpdatedVals
rewrite WAL crash tests
Task:
test that we can recover from any WAL crash.
Solution:
the old tests were relying on event hub being run in the same thread (we
were injecting the private validator's last signature).
when considering a rewrite, we considered two possible solutions: write
a "fuzzy" testing system where WAL is crashing upon receiving a new
message, or inject failures and trigger them in tests using something
like https://github.com/coreos/gofail.
remove sleep
no cs.Lock around wal.Save
test different cases (empty block, non-empty block, ...)
comments
add comments
test 4 cases: empty block, non-empty block, non-empty block with smaller part size, many blocks
fixes as per Bucky's last review
reset subscriptions on UnsubscribeAll
use a simple counter to track message for which we panicked
also, set a smaller part size for all test cases
2017-06-26 19:00:30 +04:00
|
|
|
// Group returns the autofile group backing this WAL.
func (wal *baseWAL) Group() *auto.Group {
|
|
|
|
return wal.group
|
|
|
|
}
|
|
|
|
|
2018-11-06 13:12:12 +01:00
|
|
|
// SetLogger sets the logger on the embedded BaseService and passes the same
// logger on to the underlying autofile group.
func (wal *baseWAL) SetLogger(l log.Logger) {
|
|
|
|
wal.BaseService.Logger = l
|
|
|
|
wal.group.SetLogger(l)
|
|
|
|
}
|
|
|
|
|
new pubsub package
comment out failing consensus tests for now
rewrite rpc httpclient to use new pubsub package
import pubsub as tmpubsub, query as tmquery
make event IDs constants
EventKey -> EventTypeKey
rename EventsPubsub to PubSub
mempool does not use pubsub
rename eventsSub to pubsub
new subscribe API
fix channel size issues and consensus tests bugs
refactor rpc client
add missing discardFromChan method
add mutex
rename pubsub to eventBus
remove IsRunning from WSRPCConnection interface (not needed)
add a comment in broadcastNewRoundStepsAndVotes
rename registerEventCallbacks to broadcastNewRoundStepsAndVotes
See https://dave.cheney.net/2014/03/19/channel-axioms
stop eventBuses after reactor tests
remove unnecessary Unsubscribe
return subscribe helper function
move discardFromChan to where it is used
subscribe now returns an err
this gives us ability to refuse to subscribe if pubsub is at its max
capacity.
use context for control overflow
cache queries
handle err when subscribing in replay_test
rename testClientID to testSubscriber
extract var
set channel buffer capacity to 1 in replay_file
fix byzantine_test
unsubscribe from single event, not all events
refactor httpclient to return events to appropriate channels
return failing testReplayCrashBeforeWriteVote test
fix TestValidatorSetChanges
refactor code a bit
fix testReplayCrashBeforeWriteVote
add comment
fix TestValidatorSetChanges
fixes from Bucky's review
update comment [ci skip]
test TxEventBuffer
update changelog
fix TestValidatorSetChanges (2nd attempt)
only do wg.Done when no errors
benchmark event bus
create pubsub server inside NewEventBus
only expose config params (later if needed)
set buffer capacity to 0 so we are not testing cache
new tx event format: key = "Tx" plus a tag {"tx.hash": XYZ}
This should allow to subscribe to all transactions! or a specific one
using a query: "tm.events.type = Tx and tx.hash = '013ABF99434...'"
use TimeoutCommit instead of afterPublishEventNewBlockTimeout
TimeoutCommit is the time a node waits after committing a block, before
it goes into the next height. So it will finish everything from the last
block, but then wait a bit. The idea is this gives it time to hear more
votes from other validators, to strengthen the commit it includes in the
next block. But it also gives it time to hear about new transactions.
waitForBlockWithUpdatedVals
rewrite WAL crash tests
Task:
test that we can recover from any WAL crash.
Solution:
the old tests were relying on event hub being run in the same thread (we
were injecting the private validator's last signature).
when considering a rewrite, we considered two possible solutions: write
a "fuzzy" testing system where WAL is crashing upon receiving a new
message, or inject failures and trigger them in tests using something
like https://github.com/coreos/gofail.
remove sleep
no cs.Lock around wal.Save
test different cases (empty block, non-empty block, ...)
comments
add comments
test 4 cases: empty block, non-empty block, non-empty block with smaller part size, many blocks
fixes as per Bucky's last review
reset subscriptions on UnsubscribeAll
use a simple counter to track message for which we panicked
also, set a smaller part size for all test cases
2017-06-26 19:00:30 +04:00
|
|
|
func (wal *baseWAL) OnStart() error {
|
2016-11-16 01:15:39 -05:00
|
|
|
size, err := wal.group.Head.Size()
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
} else if size == 0 {
|
2018-05-20 14:40:01 -04:00
|
|
|
wal.WriteSync(EndHeightMessage{0})
|
2016-11-16 01:15:39 -05:00
|
|
|
}
|
2017-11-06 13:20:39 -05:00
|
|
|
err = wal.group.Start()
|
2019-02-20 07:45:18 +02:00
|
|
|
wal.flushTicker = time.NewTicker(wal.flushInterval)
|
|
|
|
go wal.processFlushTicks()
|
2016-11-21 19:16:19 -08:00
|
|
|
return err
|
2016-01-18 14:10:05 -05:00
|
|
|
}
|
|
|
|
|
2019-02-20 07:45:18 +02:00
|
|
|
func (wal *baseWAL) processFlushTicks() {
|
|
|
|
for {
|
|
|
|
select {
|
|
|
|
case <-wal.flushTicker.C:
|
2019-02-21 18:28:02 +04:00
|
|
|
if err := wal.Flush(); err != nil {
|
2019-02-20 07:45:18 +02:00
|
|
|
wal.Logger.Error("Periodic WAL flush failed", "err", err)
|
|
|
|
}
|
|
|
|
case <-wal.Quit():
|
|
|
|
return
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2019-02-21 18:28:02 +04:00
|
|
|
// Flush will attempt to flush and fsync the underlying group's data to disk.
|
2019-02-20 07:45:18 +02:00
|
|
|
func (wal *baseWAL) Flush() error {
|
|
|
|
return wal.group.Flush()
|
|
|
|
}
|
|
|
|
|
2019-02-06 10:24:43 -05:00
|
|
|
// Stop the underlying autofile group.
|
|
|
|
// Use Wait() to ensure it's finished shutting down
|
|
|
|
// before cleaning up files.
|
new pubsub package
comment out failing consensus tests for now
rewrite rpc httpclient to use new pubsub package
import pubsub as tmpubsub, query as tmquery
make event IDs constants
EventKey -> EventTypeKey
rename EventsPubsub to PubSub
mempool does not use pubsub
rename eventsSub to pubsub
new subscribe API
fix channel size issues and consensus tests bugs
refactor rpc client
add missing discardFromChan method
add mutex
rename pubsub to eventBus
remove IsRunning from WSRPCConnection interface (not needed)
add a comment in broadcastNewRoundStepsAndVotes
rename registerEventCallbacks to broadcastNewRoundStepsAndVotes
See https://dave.cheney.net/2014/03/19/channel-axioms
stop eventBuses after reactor tests
remove unnecessary Unsubscribe
return subscribe helper function
move discardFromChan to where it is used
subscribe now returns an err
this gives us ability to refuse to subscribe if pubsub is at its max
capacity.
use context for control overflow
cache queries
handle err when subscribing in replay_test
rename testClientID to testSubscriber
extract var
set channel buffer capacity to 1 in replay_file
fix byzantine_test
unsubscribe from single event, not all events
refactor httpclient to return events to appropriate channels
return failing testReplayCrashBeforeWriteVote test
fix TestValidatorSetChanges
refactor code a bit
fix testReplayCrashBeforeWriteVote
add comment
fix TestValidatorSetChanges
fixes from Bucky's review
update comment [ci skip]
test TxEventBuffer
update changelog
fix TestValidatorSetChanges (2nd attempt)
only do wg.Done when no errors
benchmark event bus
create pubsub server inside NewEventBus
only expose config params (later if needed)
set buffer capacity to 0 so we are not testing cache
new tx event format: key = "Tx" plus a tag {"tx.hash": XYZ}
This should allow to subscribe to all transactions! or a specific one
using a query: "tm.events.type = Tx and tx.hash = '013ABF99434...'"
use TimeoutCommit instead of afterPublishEventNewBlockTimeout
TimeoutCommit is the time a node waits after committing a block, before
it goes into the next height. So it will finish everything from the last
block, but then wait a bit. The idea is this gives it time to hear more
votes from other validators, to strengthen the commit it includes in the
next block. But it also gives it time to hear about new transactions.
waitForBlockWithUpdatedVals
rewrite WAL crash tests
Task:
test that we can recover from any WAL crash.
Solution:
the old tests were relying on event hub being run in the same thread (we
were injecting the private validator's last signature).
when considering a rewrite, we considered two possible solutions: write
a "fuzzy" testing system where WAL is crashing upon receiving a new
message, or inject failures and trigger them in tests using something
like https://github.com/coreos/gofail.
remove sleep
no cs.Lock around wal.Save
test different cases (empty block, non-empty block, ...)
comments
add comments
test 4 cases: empty block, non-empty block, non-empty block with smaller part size, many blocks
fixes as per Bucky's last review
reset subscriptions on UnsubscribeAll
use a simple counter to track message for which we panicked
also, set a smaller part size for all test cases
2017-06-26 19:00:30 +04:00
|
|
|
func (wal *baseWAL) OnStop() {
|
2019-02-20 07:45:18 +02:00
|
|
|
wal.flushTicker.Stop()
|
|
|
|
wal.Flush()
|
2016-11-21 19:16:19 -08:00
|
|
|
wal.group.Stop()
|
2018-06-04 16:34:16 +04:00
|
|
|
wal.group.Close()
|
2016-08-17 23:08:43 -04:00
|
|
|
}
|
|
|
|
|
2019-02-06 10:24:43 -05:00
|
|
|
// Wait for the underlying autofile group to finish shutting down
|
|
|
|
// so it's safe to cleanup files.
|
|
|
|
func (wal *baseWAL) Wait() {
|
|
|
|
wal.group.Wait()
|
|
|
|
}
|
|
|
|
|
2018-05-20 16:44:08 -04:00
|
|
|
// Write is called in newStep and for each receive on the
|
2018-05-22 15:39:27 +04:00
|
|
|
// peerMsgQueue and the timeoutTicker.
|
2018-05-20 16:44:08 -04:00
|
|
|
// NOTE: does not call fsync()
|
2018-05-20 14:40:01 -04:00
|
|
|
func (wal *baseWAL) Write(msg WALMessage) {
|
|
|
|
if wal == nil {
|
|
|
|
return
|
|
|
|
}
|
|
|
|
|
|
|
|
// Write the wal message
|
2018-09-01 01:33:51 +02:00
|
|
|
if err := wal.enc.Encode(&TimedWALMessage{tmtime.Now(), msg}); err != nil {
|
2018-08-10 00:25:57 -05:00
|
|
|
panic(fmt.Sprintf("Error writing msg to consensus wal: %v \n\nMessage: %v", err, msg))
|
2018-05-20 14:40:01 -04:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2018-05-20 16:44:08 -04:00
|
|
|
// WriteSync is called when we receive a msg from ourselves
|
|
|
|
// so that we write to disk before sending signed messages.
|
|
|
|
// NOTE: calls fsync()
|
2018-05-20 14:40:01 -04:00
|
|
|
func (wal *baseWAL) WriteSync(msg WALMessage) {
|
2016-10-28 11:58:09 -07:00
|
|
|
if wal == nil {
|
|
|
|
return
|
|
|
|
}
|
2017-10-09 23:10:58 +04:00
|
|
|
|
2018-05-20 16:44:08 -04:00
|
|
|
wal.Write(msg)
|
2019-02-20 07:45:18 +02:00
|
|
|
if err := wal.Flush(); err != nil {
|
2018-08-10 00:25:57 -05:00
|
|
|
panic(fmt.Sprintf("Error flushing consensus wal buf to file. Error: %v \n", err))
|
2016-12-17 23:43:17 -05:00
|
|
|
}
|
2016-01-18 14:10:05 -05:00
|
|
|
}
|
2016-11-16 01:15:39 -05:00
|
|
|
|
2017-12-11 19:48:20 -06:00
|
|
|
// WALSearchOptions carries the optional settings accepted by
// SearchForEndHeight.
type WALSearchOptions struct {
	// IgnoreDataCorruptionErrors, when true, makes the search skip entries
	// that fail with a data corruption error instead of aborting on them.
	IgnoreDataCorruptionErrors bool
}
|
|
|
|
|
2018-05-22 15:42:37 +04:00
|
|
|
// SearchForEndHeight searches for the EndHeightMessage with the given height
|
|
|
|
// and returns an auto.GroupReader, whenever it was found or not and an error.
|
2017-10-09 23:10:58 +04:00
|
|
|
// Group reader will be nil if found equals false.
|
|
|
|
//
|
|
|
|
// CONTRACT: caller must close group reader.
|
2017-12-11 19:48:20 -06:00
|
|
|
func (wal *baseWAL) SearchForEndHeight(height int64, options *WALSearchOptions) (gr *auto.GroupReader, found bool, err error) {
|
2017-10-09 23:10:58 +04:00
|
|
|
var msg *TimedWALMessage
|
2018-05-22 17:05:54 +04:00
|
|
|
lastHeightFound := int64(-1)
|
2016-12-22 15:01:02 -05:00
|
|
|
|
2017-10-09 23:10:58 +04:00
|
|
|
// NOTE: starting from the last file in the group because we're usually
|
|
|
|
// searching for the last height. See replay.go
|
|
|
|
min, max := wal.group.MinIndex(), wal.group.MaxIndex()
|
2019-02-04 13:00:06 -05:00
|
|
|
wal.Logger.Info("Searching for height", "height", height, "min", min, "max", max)
|
2017-10-09 23:10:58 +04:00
|
|
|
for index := max; index >= min; index-- {
|
|
|
|
gr, err = wal.group.NewReader(index)
|
|
|
|
if err != nil {
|
|
|
|
return nil, false, err
|
|
|
|
}
|
|
|
|
|
|
|
|
dec := NewWALDecoder(gr)
|
|
|
|
for {
|
|
|
|
msg, err = dec.Decode()
|
|
|
|
if err == io.EOF {
|
2018-05-22 17:05:54 +04:00
|
|
|
// OPTIMISATION: no need to look for height in older files if we've seen h < height
|
|
|
|
if lastHeightFound > 0 && lastHeightFound < height {
|
|
|
|
gr.Close()
|
|
|
|
return nil, false, nil
|
|
|
|
}
|
2017-10-25 21:54:56 -04:00
|
|
|
// check next file
|
2017-10-09 23:10:58 +04:00
|
|
|
break
|
|
|
|
}
|
2017-12-11 19:48:20 -06:00
|
|
|
if options.IgnoreDataCorruptionErrors && IsDataCorruptionError(err) {
|
2019-02-04 13:00:06 -05:00
|
|
|
wal.Logger.Error("Corrupted entry. Skipping...", "err", err)
|
2017-12-11 19:48:20 -06:00
|
|
|
// do nothing
|
2018-05-22 15:42:37 +04:00
|
|
|
continue
|
2017-12-11 19:48:20 -06:00
|
|
|
} else if err != nil {
|
2017-10-09 23:10:58 +04:00
|
|
|
gr.Close()
|
|
|
|
return nil, false, err
|
|
|
|
}
|
|
|
|
|
|
|
|
if m, ok := msg.Msg.(EndHeightMessage); ok {
|
2018-05-22 17:05:54 +04:00
|
|
|
lastHeightFound = m.Height
|
2017-10-09 23:10:58 +04:00
|
|
|
if m.Height == height { // found
|
2019-02-04 13:00:06 -05:00
|
|
|
wal.Logger.Info("Found", "height", height, "index", index)
|
2017-10-09 23:10:58 +04:00
|
|
|
return gr, true, nil
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
gr.Close()
|
|
|
|
}
|
|
|
|
|
|
|
|
return nil, false, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
///////////////////////////////////////////////////////////////////////////////
|
|
|
|
|
|
|
|
// WALEncoder serialises WAL messages onto an output stream using a custom
// framing.
//
// Format: 4 bytes CRC sum + 4 bytes length + arbitrary-length value (go-amino encoded)
type WALEncoder struct {
	wr io.Writer // destination of the encoded records
}
|
|
|
|
|
|
|
|
// NewWALEncoder returns a new encoder that writes to wr.
|
|
|
|
func NewWALEncoder(wr io.Writer) *WALEncoder {
|
|
|
|
return &WALEncoder{wr}
|
|
|
|
}
|
|
|
|
|
2019-02-18 11:23:06 +04:00
|
|
|
// Encode writes the custom encoding of v to the stream. It returns an error if
|
|
|
|
// the amino-encoded size of v is greater than 1MB. Any error encountered
|
|
|
|
// during the write is also returned.
|
2017-12-07 11:45:50 -06:00
|
|
|
func (enc *WALEncoder) Encode(v *TimedWALMessage) error {
|
2018-04-05 07:05:45 -07:00
|
|
|
data := cdc.MustMarshalBinaryBare(v)
|
2017-10-09 23:10:58 +04:00
|
|
|
|
|
|
|
crc := crc32.Checksum(data, crc32c)
|
|
|
|
length := uint32(len(data))
|
2019-02-18 11:23:06 +04:00
|
|
|
if length > maxMsgSizeBytes {
|
|
|
|
return fmt.Errorf("Msg is too big: %d bytes, max: %d bytes", length, maxMsgSizeBytes)
|
|
|
|
}
|
2017-10-09 23:10:58 +04:00
|
|
|
totalLength := 8 + int(length)
|
|
|
|
|
|
|
|
msg := make([]byte, totalLength)
|
|
|
|
binary.BigEndian.PutUint32(msg[0:4], crc)
|
|
|
|
binary.BigEndian.PutUint32(msg[4:8], length)
|
|
|
|
copy(msg[8:], data)
|
|
|
|
|
|
|
|
_, err := enc.wr.Write(msg)
|
|
|
|
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
|
|
|
///////////////////////////////////////////////////////////////////////////////
|
|
|
|
|
2017-12-11 19:48:20 -06:00
|
|
|
// IsDataCorruptionError returns true if data has been corrupted inside WAL.
|
|
|
|
func IsDataCorruptionError(err error) bool {
|
|
|
|
_, ok := err.(DataCorruptionError)
|
|
|
|
return ok
|
|
|
|
}
|
|
|
|
|
2017-12-12 13:02:40 -06:00
|
|
|
// DataCorruptionError is an error that occurs if data on disk was corrupted.
type DataCorruptionError struct {
	cause error // the underlying problem that was detected
}

// Error renders the error as "DataCorruptionError[<cause>]".
func (e DataCorruptionError) Error() string {
	return "DataCorruptionError[" + fmt.Sprint(e.cause) + "]"
}

// Cause returns the underlying error wrapped by e.
func (e DataCorruptionError) Cause() error {
	cause := e.cause
	return cause
}
|
|
|
|
|
2017-10-09 23:10:58 +04:00
|
|
|
// WALDecoder reads custom-encoded WAL messages back from an input stream and
// decodes them. The record format is the one described on WALEncoder.
//
// While decoding it verifies the checksum and that the data size matches the
// length stored in the header; an error is returned when they do not.
type WALDecoder struct {
	rd io.Reader // source of the encoded records
}
|
|
|
|
|
|
|
|
// NewWALDecoder returns a new decoder that reads from rd.
|
|
|
|
func NewWALDecoder(rd io.Reader) *WALDecoder {
|
|
|
|
return &WALDecoder{rd}
|
|
|
|
}
|
|
|
|
|
2017-10-25 21:54:56 -04:00
|
|
|
// Decode reads the next custom-encoded value from its reader and returns it.
|
2017-10-09 23:10:58 +04:00
|
|
|
func (dec *WALDecoder) Decode() (*TimedWALMessage, error) {
|
|
|
|
b := make([]byte, 4)
|
|
|
|
|
2017-12-11 19:50:05 -06:00
|
|
|
_, err := dec.rd.Read(b)
|
2017-10-09 23:10:58 +04:00
|
|
|
if err == io.EOF {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
if err != nil {
|
2019-02-04 13:00:06 -05:00
|
|
|
return nil, DataCorruptionError{fmt.Errorf("failed to read checksum: %v", err)}
|
2017-10-09 23:10:58 +04:00
|
|
|
}
|
|
|
|
crc := binary.BigEndian.Uint32(b)
|
|
|
|
|
|
|
|
b = make([]byte, 4)
|
2017-12-11 19:50:05 -06:00
|
|
|
_, err = dec.rd.Read(b)
|
2017-10-09 23:10:58 +04:00
|
|
|
if err != nil {
|
2019-02-04 13:00:06 -05:00
|
|
|
return nil, DataCorruptionError{fmt.Errorf("failed to read length: %v", err)}
|
2017-10-09 23:10:58 +04:00
|
|
|
}
|
|
|
|
length := binary.BigEndian.Uint32(b)
|
|
|
|
|
2017-12-11 19:48:57 -06:00
|
|
|
if length > maxMsgSizeBytes {
|
2019-02-04 13:00:06 -05:00
|
|
|
return nil, DataCorruptionError{fmt.Errorf("length %d exceeded maximum possible value of %d bytes", length, maxMsgSizeBytes)}
|
2017-12-11 19:48:57 -06:00
|
|
|
}
|
|
|
|
|
2017-10-09 23:10:58 +04:00
|
|
|
data := make([]byte, length)
|
2019-02-18 11:23:06 +04:00
|
|
|
n, err := dec.rd.Read(data)
|
2017-10-09 23:10:58 +04:00
|
|
|
if err != nil {
|
2019-02-18 11:23:06 +04:00
|
|
|
return nil, DataCorruptionError{fmt.Errorf("failed to read data: %v (read: %d, wanted: %d)", err, n, length)}
|
2017-10-09 23:10:58 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
// check checksum before decoding data
|
|
|
|
actualCRC := crc32.Checksum(data, crc32c)
|
|
|
|
if actualCRC != crc {
|
2019-02-18 11:23:06 +04:00
|
|
|
return nil, DataCorruptionError{fmt.Errorf("checksums do not match: read: %v, actual: %v", crc, actualCRC)}
|
2017-10-09 23:10:58 +04:00
|
|
|
}
|
|
|
|
|
2018-04-05 07:05:45 -07:00
|
|
|
var res = new(TimedWALMessage) // nolint: gosimple
|
|
|
|
err = cdc.UnmarshalBinaryBare(data, res)
|
2017-10-09 23:10:58 +04:00
|
|
|
if err != nil {
|
2017-12-11 19:48:20 -06:00
|
|
|
return nil, DataCorruptionError{fmt.Errorf("failed to decode data: %v", err)}
|
2017-10-09 23:10:58 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
return res, err
|
|
|
|
}
|
|
|
|
|
new pubsub package
comment out failing consensus tests for now
rewrite rpc httpclient to use new pubsub package
import pubsub as tmpubsub, query as tmquery
make event IDs constants
EventKey -> EventTypeKey
rename EventsPubsub to PubSub
mempool does not use pubsub
rename eventsSub to pubsub
new subscribe API
fix channel size issues and consensus tests bugs
refactor rpc client
add missing discardFromChan method
add mutex
rename pubsub to eventBus
remove IsRunning from WSRPCConnection interface (not needed)
add a comment in broadcastNewRoundStepsAndVotes
rename registerEventCallbacks to broadcastNewRoundStepsAndVotes
See https://dave.cheney.net/2014/03/19/channel-axioms
stop eventBuses after reactor tests
remove unnecessary Unsubscribe
return subscribe helper function
move discardFromChan to where it is used
subscribe now returns an err
this gives us ability to refuse to subscribe if pubsub is at its max
capacity.
use context for control overflow
cache queries
handle err when subscribing in replay_test
rename testClientID to testSubscriber
extract var
set channel buffer capacity to 1 in replay_file
fix byzantine_test
unsubscribe from single event, not all events
refactor httpclient to return events to appropriate channels
return failing testReplayCrashBeforeWriteVote test
fix TestValidatorSetChanges
refactor code a bit
fix testReplayCrashBeforeWriteVote
add comment
fix TestValidatorSetChanges
fixes from Bucky's review
update comment [ci skip]
test TxEventBuffer
update changelog
fix TestValidatorSetChanges (2nd attempt)
only do wg.Done when no errors
benchmark event bus
create pubsub server inside NewEventBus
only expose config params (later if needed)
set buffer capacity to 0 so we are not testing cache
new tx event format: key = "Tx" plus a tag {"tx.hash": XYZ}
This should allow to subscribe to all transactions! or a specific one
using a query: "tm.events.type = Tx and tx.hash = '013ABF99434...'"
use TimeoutCommit instead of afterPublishEventNewBlockTimeout
TimeoutCommit is the time a node waits after committing a block, before
it goes into the next height. So it will finish everything from the last
block, but then wait a bit. The idea is this gives it time to hear more
votes from other validators, to strengthen the commit it includes in the
next block. But it also gives it time to hear about new transactions.
waitForBlockWithUpdatedVals
rewrite WAL crash tests
Task:
test that we can recover from any WAL crash.
Solution:
the old tests were relying on event hub being run in the same thread (we
were injecting the private validator's last signature).
when considering a rewrite, we considered two possible solutions: write
a "fuzzy" testing system where WAL is crashing upon receiving a new
message, or inject failures and trigger them in tests using something
like https://github.com/coreos/gofail.
remove sleep
no cs.Lock around wal.Save
test different cases (empty block, non-empty block, ...)
comments
add comments
test 4 cases: empty block, non-empty block, non-empty block with smaller part size, many blocks
fixes as per Bucky's last review
reset subscriptions on UnsubscribeAll
use a simple counter to track message for which we panicked
also, set a smaller part size for all test cases
2017-06-26 19:00:30 +04:00
|
|
|
type nilWAL struct{}
|
|
|
|
|
2018-05-20 14:40:01 -04:00
|
|
|
func (nilWAL) Write(m WALMessage) {}
|
|
|
|
func (nilWAL) WriteSync(m WALMessage) {}
|
|
|
|
func (nilWAL) Group() *auto.Group { return nil }
|
2017-12-11 19:48:20 -06:00
|
|
|
func (nilWAL) SearchForEndHeight(height int64, options *WALSearchOptions) (gr *auto.GroupReader, found bool, err error) {
|
new pubsub package
comment out failing consensus tests for now
rewrite rpc httpclient to use new pubsub package
import pubsub as tmpubsub, query as tmquery
make event IDs constants
EventKey -> EventTypeKey
rename EventsPubsub to PubSub
mempool does not use pubsub
rename eventsSub to pubsub
new subscribe API
fix channel size issues and consensus tests bugs
refactor rpc client
add missing discardFromChan method
add mutex
rename pubsub to eventBus
remove IsRunning from WSRPCConnection interface (not needed)
add a comment in broadcastNewRoundStepsAndVotes
rename registerEventCallbacks to broadcastNewRoundStepsAndVotes
See https://dave.cheney.net/2014/03/19/channel-axioms
stop eventBuses after reactor tests
remove unnecessary Unsubscribe
return subscribe helper function
move discardFromChan to where it is used
subscribe now returns an err
this gives us ability to refuse to subscribe if pubsub is at its max
capacity.
use context for control overflow
cache queries
handle err when subscribing in replay_test
rename testClientID to testSubscriber
extract var
set channel buffer capacity to 1 in replay_file
fix byzantine_test
unsubscribe from single event, not all events
refactor httpclient to return events to appropriate channels
return failing testReplayCrashBeforeWriteVote test
fix TestValidatorSetChanges
refactor code a bit
fix testReplayCrashBeforeWriteVote
add comment
fix TestValidatorSetChanges
fixes from Bucky's review
update comment [ci skip]
test TxEventBuffer
update changelog
fix TestValidatorSetChanges (2nd attempt)
only do wg.Done when no errors
benchmark event bus
create pubsub server inside NewEventBus
only expose config params (later if needed)
set buffer capacity to 0 so we are not testing cache
new tx event format: key = "Tx" plus a tag {"tx.hash": XYZ}
This should allow to subscribe to all transactions! or a specific one
using a query: "tm.events.type = Tx and tx.hash = '013ABF99434...'"
use TimeoutCommit instead of afterPublishEventNewBlockTimeout
TimeoutCommit is the time a node waits after committing a block, before
it goes into the next height. So it will finish everything from the last
block, but then wait a bit. The idea is this gives it time to hear more
votes from other validators, to strengthen the commit it includes in the
next block. But it also gives it time to hear about new transactions.
waitForBlockWithUpdatedVals
rewrite WAL crash tests
Task:
test that we can recover from any WAL crash.
Solution:
the old tests were relying on event hub being run in the same thread (we
were injecting the private validator's last signature).
when considering a rewrite, we considered two possible solutions: write
a "fuzzy" testing system where WAL is crashing upon receiving a new
message, or inject failures and trigger them in tests using something
like https://github.com/coreos/gofail.
remove sleep
no cs.Lock around wal.Save
test different cases (empty block, non-empty block, ...)
comments
add comments
test 4 cases: empty block, non-empty block, non-empty block with smaller part size, many blocks
fixes as per Bucky's last review
reset subscriptions on UnsubscribeAll
use a simple counter to track message for which we panicked
also, set a smaller part size for all test cases
2017-06-26 19:00:30 +04:00
|
|
|
return nil, false, nil
|
|
|
|
}
|
2017-11-06 13:20:39 -05:00
|
|
|
func (nilWAL) Start() error { return nil }
|
|
|
|
func (nilWAL) Stop() error { return nil }
|
|
|
|
func (nilWAL) Wait() {}
|
2019-02-20 07:45:18 +02:00
|
|
|
func (nilWAL) Flush() error { return nil }
|