2016-01-18 14:10:05 -05:00
|
|
|
package consensus
|
|
|
|
|
|
|
|
import (
|
2017-10-09 23:10:58 +04:00
|
|
|
"bytes"
|
|
|
|
"encoding/binary"
|
|
|
|
"fmt"
|
2017-09-20 15:33:05 -07:00
|
|
|
"hash/crc32"
|
2017-10-09 23:10:58 +04:00
|
|
|
"io"
|
2017-10-30 11:12:01 -05:00
|
|
|
"path/filepath"
|
2016-01-18 14:10:05 -05:00
|
|
|
"time"
|
|
|
|
|
2017-10-30 11:12:01 -05:00
|
|
|
"github.com/pkg/errors"
|
|
|
|
|
2017-05-02 11:53:32 +04:00
|
|
|
wire "github.com/tendermint/go-wire"
|
|
|
|
"github.com/tendermint/tendermint/types"
|
2017-04-21 18:12:54 -04:00
|
|
|
auto "github.com/tendermint/tmlibs/autofile"
|
2017-10-04 16:40:45 -04:00
|
|
|
cmn "github.com/tendermint/tmlibs/common"
|
2016-01-18 14:10:05 -05:00
|
|
|
)
|
|
|
|
|
|
|
|
//--------------------------------------------------------
|
|
|
|
// types and functions for saving consensus messages
|
|
|
|
|
2017-10-09 23:10:58 +04:00
|
|
|
var (
	// walSeparator is the 4-byte magic number 0x377f0682.
	// NOTE(review): presumably emitted between WAL records by the encoder;
	// its use is not visible in this part of the file - confirm.
	walSeparator = []byte{0x37, 0x7f, 0x06, 0x82} // 0x377f0682 - magic number
)
|
|
|
|
|
2016-10-28 15:01:14 -07:00
|
|
|
type TimedWALMessage struct {
|
2017-10-09 23:10:58 +04:00
|
|
|
Time time.Time `json:"time"` // for debugging purposes
|
|
|
|
Msg WALMessage `json:"msg"`
|
|
|
|
}
|
|
|
|
|
|
|
|
// EndHeightMessage marks the end of the given height inside WAL.
// @internal used by scripts/cutWALUntil util.
type EndHeightMessage struct {
	Height uint64 `json:"height"`
}
|
|
|
|
|
2016-10-28 15:01:14 -07:00
|
|
|
// WALMessage is the type of any message that can be written to the WAL.
// The concrete types registered for it with go-wire in this file are
// types.EventDataRoundState, msgInfo, timeoutInfo and EndHeightMessage.
type WALMessage interface{}
|
2016-01-18 14:10:05 -05:00
|
|
|
|
|
|
|
var _ = wire.RegisterInterface(
|
2016-10-28 15:01:14 -07:00
|
|
|
struct{ WALMessage }{},
|
2016-01-28 19:53:22 -08:00
|
|
|
wire.ConcreteType{types.EventDataRoundState{}, 0x01},
|
2016-01-18 14:10:05 -05:00
|
|
|
wire.ConcreteType{msgInfo{}, 0x02},
|
|
|
|
wire.ConcreteType{timeoutInfo{}, 0x03},
|
2017-10-09 23:10:58 +04:00
|
|
|
wire.ConcreteType{EndHeightMessage{}, 0x04},
|
2016-01-18 14:10:05 -05:00
|
|
|
)
|
|
|
|
|
|
|
|
//--------------------------------------------------------
|
|
|
|
// Simple write-ahead logger
|
|
|
|
|
new pubsub package
comment out failing consensus tests for now
rewrite rpc httpclient to use new pubsub package
import pubsub as tmpubsub, query as tmquery
make event IDs constants
EventKey -> EventTypeKey
rename EventsPubsub to PubSub
mempool does not use pubsub
rename eventsSub to pubsub
new subscribe API
fix channel size issues and consensus tests bugs
refactor rpc client
add missing discardFromChan method
add mutex
rename pubsub to eventBus
remove IsRunning from WSRPCConnection interface (not needed)
add a comment in broadcastNewRoundStepsAndVotes
rename registerEventCallbacks to broadcastNewRoundStepsAndVotes
See https://dave.cheney.net/2014/03/19/channel-axioms
stop eventBuses after reactor tests
remove unnecessary Unsubscribe
return subscribe helper function
move discardFromChan to where it is used
subscribe now returns an err
this gives us ability to refuse to subscribe if pubsub is at its max
capacity.
use context for control overflow
cache queries
handle err when subscribing in replay_test
rename testClientID to testSubscriber
extract var
set channel buffer capacity to 1 in replay_file
fix byzantine_test
unsubscribe from single event, not all events
refactor httpclient to return events to appropriate channels
return failing testReplayCrashBeforeWriteVote test
fix TestValidatorSetChanges
refactor code a bit
fix testReplayCrashBeforeWriteVote
add comment
fix TestValidatorSetChanges
fixes from Bucky's review
update comment [ci skip]
test TxEventBuffer
update changelog
fix TestValidatorSetChanges (2nd attempt)
only do wg.Done when no errors
benchmark event bus
create pubsub server inside NewEventBus
only expose config params (later if needed)
set buffer capacity to 0 so we are not testing cache
new tx event format: key = "Tx" plus a tag {"tx.hash": XYZ}
This should allow to subscribe to all transactions! or a specific one
using a query: "tm.events.type = Tx and tx.hash = '013ABF99434...'"
use TimeoutCommit instead of afterPublishEventNewBlockTimeout
TimeoutCommit is the time a node waits after committing a block, before
it goes into the next height. So it will finish everything from the last
block, but then wait a bit. The idea is this gives it time to hear more
votes from other validators, to strengthen the commit it includes in the
next block. But it also gives it time to hear about new transactions.
waitForBlockWithUpdatedVals
rewrite WAL crash tests
Task:
test that we can recover from any WAL crash.
Solution:
the old tests were relying on event hub being run in the same thread (we
were injecting the private validator's last signature).
when considering a rewrite, we considered two possible solutions: write
a "fuzzy" testing system where WAL is crashing upon receiving a new
message, or inject failures and trigger them in tests using something
like https://github.com/coreos/gofail.
remove sleep
no cs.Lock around wal.Save
test different cases (empty block, non-empty block, ...)
comments
add comments
test 4 cases: empty block, non-empty block, non-empty block with smaller part size, many blocks
fixes as per Bucky's last review
reset subscriptions on UnsubscribeAll
use a simple counter to track message for which we panicked
also, set a smaller part size for all test cases
2017-06-26 19:00:30 +04:00
|
|
|
// WAL is an interface for any write-ahead logger.
|
|
|
|
type WAL interface {
|
|
|
|
Save(WALMessage)
|
|
|
|
Group() *auto.Group
|
|
|
|
SearchForEndHeight(height uint64) (gr *auto.GroupReader, found bool, err error)
|
|
|
|
|
|
|
|
Start() (bool, error)
|
|
|
|
Stop() bool
|
|
|
|
Wait()
|
|
|
|
}
|
|
|
|
|
2016-01-18 14:10:05 -05:00
|
|
|
// Write ahead logger writes msgs to disk before they are processed.
|
|
|
|
// Can be used for crash-recovery and deterministic replay
|
2016-01-18 15:57:57 -05:00
|
|
|
// TODO: currently the wal is overwritten during replay catchup
|
|
|
|
// give it a mode so it's either reading or appending - must read to end to start appending again
|
new pubsub package
comment out failing consensus tests for now
rewrite rpc httpclient to use new pubsub package
import pubsub as tmpubsub, query as tmquery
make event IDs constants
EventKey -> EventTypeKey
rename EventsPubsub to PubSub
mempool does not use pubsub
rename eventsSub to pubsub
new subscribe API
fix channel size issues and consensus tests bugs
refactor rpc client
add missing discardFromChan method
add mutex
rename pubsub to eventBus
remove IsRunning from WSRPCConnection interface (not needed)
add a comment in broadcastNewRoundStepsAndVotes
rename registerEventCallbacks to broadcastNewRoundStepsAndVotes
See https://dave.cheney.net/2014/03/19/channel-axioms
stop eventBuses after reactor tests
remove unnecessary Unsubscribe
return subscribe helper function
move discardFromChan to where it is used
subscribe now returns an err
this gives us ability to refuse to subscribe if pubsub is at its max
capacity.
use context for control overflow
cache queries
handle err when subscribing in replay_test
rename testClientID to testSubscriber
extract var
set channel buffer capacity to 1 in replay_file
fix byzantine_test
unsubscribe from single event, not all events
refactor httpclient to return events to appropriate channels
return failing testReplayCrashBeforeWriteVote test
fix TestValidatorSetChanges
refactor code a bit
fix testReplayCrashBeforeWriteVote
add comment
fix TestValidatorSetChanges
fixes from Bucky's review
update comment [ci skip]
test TxEventBuffer
update changelog
fix TestValidatorSetChanges (2nd attempt)
only do wg.Done when no errors
benchmark event bus
create pubsub server inside NewEventBus
only expose config params (later if needed)
set buffer capacity to 0 so we are not testing cache
new tx event format: key = "Tx" plus a tag {"tx.hash": XYZ}
This should allow to subscribe to all transactions! or a specific one
using a query: "tm.events.type = Tx and tx.hash = '013ABF99434...'"
use TimeoutCommit instead of afterPublishEventNewBlockTimeout
TimeoutCommit is the time a node waits after committing a block, before
it goes into the next height. So it will finish everything from the last
block, but then wait a bit. The idea is this gives it time to hear more
votes from other validators, to strengthen the commit it includes in the
next block. But it also gives it time to hear about new transactions.
waitForBlockWithUpdatedVals
rewrite WAL crash tests
Task:
test that we can recover from any WAL crash.
Solution:
the old tests were relying on event hub being run in the same thread (we
were injecting the private validator's last signature).
when considering a rewrite, we considered two possible solutions: write
a "fuzzy" testing system where WAL is crashing upon receiving a new
message, or inject failures and trigger them in tests using something
like https://github.com/coreos/gofail.
remove sleep
no cs.Lock around wal.Save
test different cases (empty block, non-empty block, ...)
comments
add comments
test 4 cases: empty block, non-empty block, non-empty block with smaller part size, many blocks
fixes as per Bucky's last review
reset subscriptions on UnsubscribeAll
use a simple counter to track message for which we panicked
also, set a smaller part size for all test cases
2017-06-26 19:00:30 +04:00
|
|
|
type baseWAL struct {
|
2017-10-04 16:40:45 -04:00
|
|
|
cmn.BaseService
|
2016-03-01 16:04:19 -05:00
|
|
|
|
2016-10-28 15:01:14 -07:00
|
|
|
group *auto.Group
|
2016-03-01 16:04:19 -05:00
|
|
|
light bool // ignore block parts
|
2017-10-09 23:10:58 +04:00
|
|
|
|
|
|
|
enc *WALEncoder
|
2016-01-18 14:10:05 -05:00
|
|
|
}
|
|
|
|
|
new pubsub package
comment out failing consensus tests for now
rewrite rpc httpclient to use new pubsub package
import pubsub as tmpubsub, query as tmquery
make event IDs constants
EventKey -> EventTypeKey
rename EventsPubsub to PubSub
mempool does not use pubsub
rename eventsSub to pubsub
new subscribe API
fix channel size issues and consensus tests bugs
refactor rpc client
add missing discardFromChan method
add mutex
rename pubsub to eventBus
remove IsRunning from WSRPCConnection interface (not needed)
add a comment in broadcastNewRoundStepsAndVotes
rename registerEventCallbacks to broadcastNewRoundStepsAndVotes
See https://dave.cheney.net/2014/03/19/channel-axioms
stop eventBuses after reactor tests
remove unnecessary Unsubscribe
return subscribe helper function
move discardFromChan to where it is used
subscribe now returns an err
this gives us ability to refuse to subscribe if pubsub is at its max
capacity.
use context for control overflow
cache queries
handle err when subscribing in replay_test
rename testClientID to testSubscriber
extract var
set channel buffer capacity to 1 in replay_file
fix byzantine_test
unsubscribe from single event, not all events
refactor httpclient to return events to appropriate channels
return failing testReplayCrashBeforeWriteVote test
fix TestValidatorSetChanges
refactor code a bit
fix testReplayCrashBeforeWriteVote
add comment
fix TestValidatorSetChanges
fixes from Bucky's review
update comment [ci skip]
test TxEventBuffer
update changelog
fix TestValidatorSetChanges (2nd attempt)
only do wg.Done when no errors
benchmark event bus
create pubsub server inside NewEventBus
only expose config params (later if needed)
set buffer capacity to 0 so we are not testing cache
new tx event format: key = "Tx" plus a tag {"tx.hash": XYZ}
This should allow to subscribe to all transactions! or a specific one
using a query: "tm.events.type = Tx and tx.hash = '013ABF99434...'"
use TimeoutCommit instead of afterPublishEventNewBlockTimeout
TimeoutCommit is the time a node waits after committing a block, before
it goes into the next height. So it will finish everything from the last
block, but then wait a bit. The idea is this gives it time to hear more
votes from other validators, to strengthen the commit it includes in the
next block. But it also gives it time to hear about new transactions.
waitForBlockWithUpdatedVals
rewrite WAL crash tests
Task:
test that we can recover from any WAL crash.
Solution:
the old tests were relying on event hub being run in the same thread (we
were injecting the private validator's last signature).
when considering a rewrite, we considered two possible solutions: write
a "fuzzy" testing system where WAL is crashing upon receiving a new
message, or inject failures and trigger them in tests using something
like https://github.com/coreos/gofail.
remove sleep
no cs.Lock around wal.Save
test different cases (empty block, non-empty block, ...)
comments
add comments
test 4 cases: empty block, non-empty block, non-empty block with smaller part size, many blocks
fixes as per Bucky's last review
reset subscriptions on UnsubscribeAll
use a simple counter to track message for which we panicked
also, set a smaller part size for all test cases
2017-06-26 19:00:30 +04:00
|
|
|
func NewWAL(walFile string, light bool) (*baseWAL, error) {
|
2017-10-30 11:12:01 -05:00
|
|
|
err := cmn.EnsureDir(filepath.Dir(walFile), 0700)
|
|
|
|
if err != nil {
|
|
|
|
return nil, errors.Wrap(err, "failed to ensure WAL directory is in place")
|
|
|
|
}
|
|
|
|
|
2017-02-17 19:12:05 -05:00
|
|
|
group, err := auto.OpenGroup(walFile)
|
2016-01-18 14:10:05 -05:00
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
new pubsub package
comment out failing consensus tests for now
rewrite rpc httpclient to use new pubsub package
import pubsub as tmpubsub, query as tmquery
make event IDs constants
EventKey -> EventTypeKey
rename EventsPubsub to PubSub
mempool does not use pubsub
rename eventsSub to pubsub
new subscribe API
fix channel size issues and consensus tests bugs
refactor rpc client
add missing discardFromChan method
add mutex
rename pubsub to eventBus
remove IsRunning from WSRPCConnection interface (not needed)
add a comment in broadcastNewRoundStepsAndVotes
rename registerEventCallbacks to broadcastNewRoundStepsAndVotes
See https://dave.cheney.net/2014/03/19/channel-axioms
stop eventBuses after reactor tests
remove unnecessary Unsubscribe
return subscribe helper function
move discardFromChan to where it is used
subscribe now returns an err
this gives us ability to refuse to subscribe if pubsub is at its max
capacity.
use context for control overflow
cache queries
handle err when subscribing in replay_test
rename testClientID to testSubscriber
extract var
set channel buffer capacity to 1 in replay_file
fix byzantine_test
unsubscribe from single event, not all events
refactor httpclient to return events to appropriate channels
return failing testReplayCrashBeforeWriteVote test
fix TestValidatorSetChanges
refactor code a bit
fix testReplayCrashBeforeWriteVote
add comment
fix TestValidatorSetChanges
fixes from Bucky's review
update comment [ci skip]
test TxEventBuffer
update changelog
fix TestValidatorSetChanges (2nd attempt)
only do wg.Done when no errors
benchmark event bus
create pubsub server inside NewEventBus
only expose config params (later if needed)
set buffer capacity to 0 so we are not testing cache
new tx event format: key = "Tx" plus a tag {"tx.hash": XYZ}
This should allow to subscribe to all transactions! or a specific one
using a query: "tm.events.type = Tx and tx.hash = '013ABF99434...'"
use TimeoutCommit instead of afterPublishEventNewBlockTimeout
TimeoutCommit is the time a node waits after committing a block, before
it goes into the next height. So it will finish everything from the last
block, but then wait a bit. The idea is this gives it time to hear more
votes from other validators, to strengthen the commit it includes in the
next block. But it also gives it time to hear about new transactions.
waitForBlockWithUpdatedVals
rewrite WAL crash tests
Task:
test that we can recover from any WAL crash.
Solution:
the old tests were relying on event hub being run in the same thread (we
were injecting the private validator's last signature).
when considering a rewrite, we considered two possible solutions: write
a "fuzzy" testing system where WAL is crashing upon receiving a new
message, or inject failures and trigger them in tests using something
like https://github.com/coreos/gofail.
remove sleep
no cs.Lock around wal.Save
test different cases (empty block, non-empty block, ...)
comments
add comments
test 4 cases: empty block, non-empty block, non-empty block with smaller part size, many blocks
fixes as per Bucky's last review
reset subscriptions on UnsubscribeAll
use a simple counter to track message for which we panicked
also, set a smaller part size for all test cases
2017-06-26 19:00:30 +04:00
|
|
|
wal := &baseWAL{
|
2016-10-28 15:01:14 -07:00
|
|
|
group: group,
|
|
|
|
light: light,
|
2017-10-09 23:10:58 +04:00
|
|
|
enc: NewWALEncoder(group),
|
2016-10-28 15:01:14 -07:00
|
|
|
}
|
new pubsub package
comment out failing consensus tests for now
rewrite rpc httpclient to use new pubsub package
import pubsub as tmpubsub, query as tmquery
make event IDs constants
EventKey -> EventTypeKey
rename EventsPubsub to PubSub
mempool does not use pubsub
rename eventsSub to pubsub
new subscribe API
fix channel size issues and consensus tests bugs
refactor rpc client
add missing discardFromChan method
add mutex
rename pubsub to eventBus
remove IsRunning from WSRPCConnection interface (not needed)
add a comment in broadcastNewRoundStepsAndVotes
rename registerEventCallbacks to broadcastNewRoundStepsAndVotes
See https://dave.cheney.net/2014/03/19/channel-axioms
stop eventBuses after reactor tests
remove unnecessary Unsubscribe
return subscribe helper function
move discardFromChan to where it is used
subscribe now returns an err
this gives us ability to refuse to subscribe if pubsub is at its max
capacity.
use context for control overflow
cache queries
handle err when subscribing in replay_test
rename testClientID to testSubscriber
extract var
set channel buffer capacity to 1 in replay_file
fix byzantine_test
unsubscribe from single event, not all events
refactor httpclient to return events to appropriate channels
return failing testReplayCrashBeforeWriteVote test
fix TestValidatorSetChanges
refactor code a bit
fix testReplayCrashBeforeWriteVote
add comment
fix TestValidatorSetChanges
fixes from Bucky's review
update comment [ci skip]
test TxEventBuffer
update changelog
fix TestValidatorSetChanges (2nd attempt)
only do wg.Done when no errors
benchmark event bus
create pubsub server inside NewEventBus
only expose config params (later if needed)
set buffer capacity to 0 so we are not testing cache
new tx event format: key = "Tx" plus a tag {"tx.hash": XYZ}
This should allow to subscribe to all transactions! or a specific one
using a query: "tm.events.type = Tx and tx.hash = '013ABF99434...'"
use TimeoutCommit instead of afterPublishEventNewBlockTimeout
TimeoutCommit is the time a node waits after committing a block, before
it goes into the next height. So it will finish everything from the last
block, but then wait a bit. The idea is this gives it time to hear more
votes from other validators, to strengthen the commit it includes in the
next block. But it also gives it time to hear about new transactions.
waitForBlockWithUpdatedVals
rewrite WAL crash tests
Task:
test that we can recover from any WAL crash.
Solution:
the old tests were relying on event hub being run in the same thread (we
were injecting the private validator's last signature).
when considering a rewrite, we considered two possible solutions: write
a "fuzzy" testing system where WAL is crashing upon receiving a new
message, or inject failures and trigger them in tests using something
like https://github.com/coreos/gofail.
remove sleep
no cs.Lock around wal.Save
test different cases (empty block, non-empty block, ...)
comments
add comments
test 4 cases: empty block, non-empty block, non-empty block with smaller part size, many blocks
fixes as per Bucky's last review
reset subscriptions on UnsubscribeAll
use a simple counter to track message for which we panicked
also, set a smaller part size for all test cases
2017-06-26 19:00:30 +04:00
|
|
|
wal.BaseService = *cmn.NewBaseService(nil, "baseWAL", wal)
|
2017-05-12 23:07:53 +02:00
|
|
|
return wal, nil
|
2016-11-16 01:15:39 -05:00
|
|
|
}
|
|
|
|
|
new pubsub package
comment out failing consensus tests for now
rewrite rpc httpclient to use new pubsub package
import pubsub as tmpubsub, query as tmquery
make event IDs constants
EventKey -> EventTypeKey
rename EventsPubsub to PubSub
mempool does not use pubsub
rename eventsSub to pubsub
new subscribe API
fix channel size issues and consensus tests bugs
refactor rpc client
add missing discardFromChan method
add mutex
rename pubsub to eventBus
remove IsRunning from WSRPCConnection interface (not needed)
add a comment in broadcastNewRoundStepsAndVotes
rename registerEventCallbacks to broadcastNewRoundStepsAndVotes
See https://dave.cheney.net/2014/03/19/channel-axioms
stop eventBuses after reactor tests
remove unnecessary Unsubscribe
return subscribe helper function
move discardFromChan to where it is used
subscribe now returns an err
this gives us ability to refuse to subscribe if pubsub is at its max
capacity.
use context for control overflow
cache queries
handle err when subscribing in replay_test
rename testClientID to testSubscriber
extract var
set channel buffer capacity to 1 in replay_file
fix byzantine_test
unsubscribe from single event, not all events
refactor httpclient to return events to appropriate channels
return failing testReplayCrashBeforeWriteVote test
fix TestValidatorSetChanges
refactor code a bit
fix testReplayCrashBeforeWriteVote
add comment
fix TestValidatorSetChanges
fixes from Bucky's review
update comment [ci skip]
test TxEventBuffer
update changelog
fix TestValidatorSetChanges (2nd attempt)
only do wg.Done when no errors
benchmark event bus
create pubsub server inside NewEventBus
only expose config params (later if needed)
set buffer capacity to 0 so we are not testing cache
new tx event format: key = "Tx" plus a tag {"tx.hash": XYZ}
This should allow to subscribe to all transactions! or a specific one
using a query: "tm.events.type = Tx and tx.hash = '013ABF99434...'"
use TimeoutCommit instead of afterPublishEventNewBlockTimeout
TimeoutCommit is the time a node waits after committing a block, before
it goes into the next height. So it will finish everything from the last
block, but then wait a bit. The idea is this gives it time to hear more
votes from other validators, to strengthen the commit it includes in the
next block. But it also gives it time to hear about new transactions.
waitForBlockWithUpdatedVals
rewrite WAL crash tests
Task:
test that we can recover from any WAL crash.
Solution:
the old tests were relying on event hub being run in the same thread (we
were injecting the private validator's last signature).
when considering a rewrite, we considered two possible solutions: write
a "fuzzy" testing system where WAL is crashing upon receiving a new
message, or inject failures and trigger them in tests using something
like https://github.com/coreos/gofail.
remove sleep
no cs.Lock around wal.Save
test different cases (empty block, non-empty block, ...)
comments
add comments
test 4 cases: empty block, non-empty block, non-empty block with smaller part size, many blocks
fixes as per Bucky's last review
reset subscriptions on UnsubscribeAll
use a simple counter to track message for which we panicked
also, set a smaller part size for all test cases
2017-06-26 19:00:30 +04:00
|
|
|
func (wal *baseWAL) Group() *auto.Group {
|
|
|
|
return wal.group
|
|
|
|
}
|
|
|
|
|
|
|
|
func (wal *baseWAL) OnStart() error {
|
2016-11-16 01:15:39 -05:00
|
|
|
size, err := wal.group.Head.Size()
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
} else if size == 0 {
|
2017-10-09 23:10:58 +04:00
|
|
|
wal.Save(EndHeightMessage{0})
|
2016-11-16 01:15:39 -05:00
|
|
|
}
|
2016-11-21 19:16:19 -08:00
|
|
|
_, err = wal.group.Start()
|
|
|
|
return err
|
2016-01-18 14:10:05 -05:00
|
|
|
}
|
|
|
|
|
new pubsub package
comment out failing consensus tests for now
rewrite rpc httpclient to use new pubsub package
import pubsub as tmpubsub, query as tmquery
make event IDs constants
EventKey -> EventTypeKey
rename EventsPubsub to PubSub
mempool does not use pubsub
rename eventsSub to pubsub
new subscribe API
fix channel size issues and consensus tests bugs
refactor rpc client
add missing discardFromChan method
add mutex
rename pubsub to eventBus
remove IsRunning from WSRPCConnection interface (not needed)
add a comment in broadcastNewRoundStepsAndVotes
rename registerEventCallbacks to broadcastNewRoundStepsAndVotes
See https://dave.cheney.net/2014/03/19/channel-axioms
stop eventBuses after reactor tests
remove unnecessary Unsubscribe
return subscribe helper function
move discardFromChan to where it is used
subscribe now returns an err
this gives us ability to refuse to subscribe if pubsub is at its max
capacity.
use context for control overflow
cache queries
handle err when subscribing in replay_test
rename testClientID to testSubscriber
extract var
set channel buffer capacity to 1 in replay_file
fix byzantine_test
unsubscribe from single event, not all events
refactor httpclient to return events to appropriate channels
return failing testReplayCrashBeforeWriteVote test
fix TestValidatorSetChanges
refactor code a bit
fix testReplayCrashBeforeWriteVote
add comment
fix TestValidatorSetChanges
fixes from Bucky's review
update comment [ci skip]
test TxEventBuffer
update changelog
fix TestValidatorSetChanges (2nd attempt)
only do wg.Done when no errors
benchmark event bus
create pubsub server inside NewEventBus
only expose config params (later if needed)
set buffer capacity to 0 so we are not testing cache
new tx event format: key = "Tx" plus a tag {"tx.hash": XYZ}
This should allow to subscribe to all transactions! or a specific one
using a query: "tm.events.type = Tx and tx.hash = '013ABF99434...'"
use TimeoutCommit instead of afterPublishEventNewBlockTimeout
TimeoutCommit is the time a node waits after committing a block, before
it goes into the next height. So it will finish everything from the last
block, but then wait a bit. The idea is this gives it time to hear more
votes from other validators, to strengthen the commit it includes in the
next block. But it also gives it time to hear about new transactions.
waitForBlockWithUpdatedVals
rewrite WAL crash tests
Task:
test that we can recover from any WAL crash.
Solution:
the old tests were relying on event hub being run in the same thread (we
were injecting the private validator's last signature).
when considering a rewrite, we considered two possible solutions: write
a "fuzzy" testing system where WAL is crashing upon receiving a new
message, or inject failures and trigger them in tests using something
like https://github.com/coreos/gofail.
remove sleep
no cs.Lock around wal.Save
test different cases (empty block, non-empty block, ...)
comments
add comments
test 4 cases: empty block, non-empty block, non-empty block with smaller part size, many blocks
fixes as per Bucky's last review
reset subscriptions on UnsubscribeAll
use a simple counter to track message for which we panicked
also, set a smaller part size for all test cases
2017-06-26 19:00:30 +04:00
|
|
|
func (wal *baseWAL) OnStop() {
|
2016-10-28 15:01:14 -07:00
|
|
|
wal.BaseService.OnStop()
|
2016-11-21 19:16:19 -08:00
|
|
|
wal.group.Stop()
|
2016-08-17 23:08:43 -04:00
|
|
|
}
|
|
|
|
|
2016-01-18 14:10:05 -05:00
|
|
|
// called in newStep and for each pass in receiveRoutine
|
new pubsub package
comment out failing consensus tests for now
rewrite rpc httpclient to use new pubsub package
import pubsub as tmpubsub, query as tmquery
make event IDs constants
EventKey -> EventTypeKey
rename EventsPubsub to PubSub
mempool does not use pubsub
rename eventsSub to pubsub
new subscribe API
fix channel size issues and consensus tests bugs
refactor rpc client
add missing discardFromChan method
add mutex
rename pubsub to eventBus
remove IsRunning from WSRPCConnection interface (not needed)
add a comment in broadcastNewRoundStepsAndVotes
rename registerEventCallbacks to broadcastNewRoundStepsAndVotes
See https://dave.cheney.net/2014/03/19/channel-axioms
stop eventBuses after reactor tests
remove unnecessary Unsubscribe
return subscribe helper function
move discardFromChan to where it is used
subscribe now returns an err
this gives us ability to refuse to subscribe if pubsub is at its max
capacity.
use context for control overflow
cache queries
handle err when subscribing in replay_test
rename testClientID to testSubscriber
extract var
set channel buffer capacity to 1 in replay_file
fix byzantine_test
unsubscribe from single event, not all events
refactor httpclient to return events to appropriate channels
return failing testReplayCrashBeforeWriteVote test
fix TestValidatorSetChanges
refactor code a bit
fix testReplayCrashBeforeWriteVote
add comment
fix TestValidatorSetChanges
fixes from Bucky's review
update comment [ci skip]
test TxEventBuffer
update changelog
fix TestValidatorSetChanges (2nd attempt)
only do wg.Done when no errors
benchmark event bus
create pubsub server inside NewEventBus
only expose config params (later if needed)
set buffer capacity to 0 so we are not testing cache
new tx event format: key = "Tx" plus a tag {"tx.hash": XYZ}
This should allow to subscribe to all transactions! or a specific one
using a query: "tm.events.type = Tx and tx.hash = '013ABF99434...'"
use TimeoutCommit instead of afterPublishEventNewBlockTimeout
TimeoutCommit is the time a node waits after committing a block, before
it goes into the next height. So it will finish everything from the last
block, but then wait a bit. The idea is this gives it time to hear more
votes from other validators, to strengthen the commit it includes in the
next block. But it also gives it time to hear about new transactions.
waitForBlockWithUpdatedVals
rewrite WAL crash tests
Task:
test that we can recover from any WAL crash.
Solution:
the old tests were relying on event hub being run in the same thread (we
were injecting the private validator's last signature).
when considering a rewrite, we considered two possible solutions: write
a "fuzzy" testing system where WAL is crashing upon receiving a new
message, or inject failures and trigger them in tests using something
like https://github.com/coreos/gofail.
remove sleep
no cs.Lock around wal.Save
test different cases (empty block, non-empty block, ...)
comments
add comments
test 4 cases: empty block, non-empty block, non-empty block with smaller part size, many blocks
fixes as per Bucky's last review
reset subscriptions on UnsubscribeAll
use a simple counter to track message for which we panicked
also, set a smaller part size for all test cases
2017-06-26 19:00:30 +04:00
|
|
|
func (wal *baseWAL) Save(msg WALMessage) {
|
2016-10-28 11:58:09 -07:00
|
|
|
if wal == nil {
|
|
|
|
return
|
|
|
|
}
|
2017-10-09 23:10:58 +04:00
|
|
|
|
2016-10-28 11:58:09 -07:00
|
|
|
if wal.light {
|
|
|
|
// in light mode we only write new steps, timeouts, and our own votes (no proposals, block parts)
|
2017-10-09 23:10:58 +04:00
|
|
|
if mi, ok := msg.(msgInfo); ok {
|
2016-10-28 11:58:09 -07:00
|
|
|
if mi.PeerKey != "" {
|
|
|
|
return
|
2016-03-01 16:04:19 -05:00
|
|
|
}
|
|
|
|
}
|
2016-10-28 11:58:09 -07:00
|
|
|
}
|
2017-10-09 23:10:58 +04:00
|
|
|
|
2016-10-28 15:01:14 -07:00
|
|
|
// Write the wal message
|
2017-10-09 23:10:58 +04:00
|
|
|
if err := wal.enc.Encode(&TimedWALMessage{time.Now(), msg}); err != nil {
|
|
|
|
cmn.PanicQ(cmn.Fmt("Error writing msg to consensus wal: %v \n\nMessage: %v", err, msg))
|
2016-01-18 14:10:05 -05:00
|
|
|
}
|
2017-10-09 23:10:58 +04:00
|
|
|
|
2016-12-17 23:43:17 -05:00
|
|
|
// TODO: only flush when necessary
|
|
|
|
if err := wal.group.Flush(); err != nil {
|
2017-10-04 16:40:45 -04:00
|
|
|
cmn.PanicQ(cmn.Fmt("Error flushing consensus wal buf to file. Error: %v \n", err))
|
2016-12-17 23:43:17 -05:00
|
|
|
}
|
2016-01-18 14:10:05 -05:00
|
|
|
}
|
2016-11-16 01:15:39 -05:00
|
|
|
|
2017-10-09 23:10:58 +04:00
|
|
|
// SearchForEndHeight searches for the EndHeightMessage with the height and
|
|
|
|
// returns an auto.GroupReader, whether or not it was found, and an error.
|
|
|
|
// Group reader will be nil if found equals false.
|
|
|
|
//
|
|
|
|
// CONTRACT: caller must close group reader.
|
new pubsub package
comment out failing consensus tests for now
rewrite rpc httpclient to use new pubsub package
import pubsub as tmpubsub, query as tmquery
make event IDs constants
EventKey -> EventTypeKey
rename EventsPubsub to PubSub
mempool does not use pubsub
rename eventsSub to pubsub
new subscribe API
fix channel size issues and consensus tests bugs
refactor rpc client
add missing discardFromChan method
add mutex
rename pubsub to eventBus
remove IsRunning from WSRPCConnection interface (not needed)
add a comment in broadcastNewRoundStepsAndVotes
rename registerEventCallbacks to broadcastNewRoundStepsAndVotes
See https://dave.cheney.net/2014/03/19/channel-axioms
stop eventBuses after reactor tests
remove unnecessary Unsubscribe
return subscribe helper function
move discardFromChan to where it is used
subscribe now returns an err
this gives us ability to refuse to subscribe if pubsub is at its max
capacity.
use context for control overflow
cache queries
handle err when subscribing in replay_test
rename testClientID to testSubscriber
extract var
set channel buffer capacity to 1 in replay_file
fix byzantine_test
unsubscribe from single event, not all events
refactor httpclient to return events to appropriate channels
return failing testReplayCrashBeforeWriteVote test
fix TestValidatorSetChanges
refactor code a bit
fix testReplayCrashBeforeWriteVote
add comment
fix TestValidatorSetChanges
fixes from Bucky's review
update comment [ci skip]
test TxEventBuffer
update changelog
fix TestValidatorSetChanges (2nd attempt)
only do wg.Done when no errors
benchmark event bus
create pubsub server inside NewEventBus
only expose config params (later if needed)
set buffer capacity to 0 so we are not testing cache
new tx event format: key = "Tx" plus a tag {"tx.hash": XYZ}
This should allow to subscribe to all transactions! or a specific one
using a query: "tm.events.type = Tx and tx.hash = '013ABF99434...'"
use TimeoutCommit instead of afterPublishEventNewBlockTimeout
TimeoutCommit is the time a node waits after committing a block, before
it goes into the next height. So it will finish everything from the last
block, but then wait a bit. The idea is this gives it time to hear more
votes from other validators, to strengthen the commit it includes in the
next block. But it also gives it time to hear about new transactions.
waitForBlockWithUpdatedVals
rewrite WAL crash tests
Task:
test that we can recover from any WAL crash.
Solution:
the old tests were relying on event hub being run in the same thread (we
were injecting the private validator's last signature).
when considering a rewrite, we considered two possible solutions: write
a "fuzzy" testing system where WAL is crashing upon receiving a new
message, or inject failures and trigger them in tests using something
like https://github.com/coreos/gofail.
remove sleep
no cs.Lock around wal.Save
test different cases (empty block, non-empty block, ...)
comments
add comments
test 4 cases: empty block, non-empty block, non-empty block with smaller part size, many blocks
fixes as per Bucky's last review
reset subscriptions on UnsubscribeAll
use a simple counter to track message for which we panicked
also, set a smaller part size for all test cases
2017-06-26 19:00:30 +04:00
|
|
|
func (wal *baseWAL) SearchForEndHeight(height uint64) (gr *auto.GroupReader, found bool, err error) {
|
2017-10-09 23:10:58 +04:00
|
|
|
var msg *TimedWALMessage
|
2016-12-22 15:01:02 -05:00
|
|
|
|
2017-10-09 23:10:58 +04:00
|
|
|
// NOTE: starting from the last file in the group because we're usually
|
|
|
|
// searching for the last height. See replay.go
|
|
|
|
min, max := wal.group.MinIndex(), wal.group.MaxIndex()
|
|
|
|
wal.Logger.Debug("Searching for height", "height", height, "min", min, "max", max)
|
|
|
|
for index := max; index >= min; index-- {
|
|
|
|
gr, err = wal.group.NewReader(index)
|
|
|
|
if err != nil {
|
|
|
|
return nil, false, err
|
|
|
|
}
|
|
|
|
|
|
|
|
dec := NewWALDecoder(gr)
|
|
|
|
for {
|
|
|
|
msg, err = dec.Decode()
|
|
|
|
if err == io.EOF {
|
2017-10-25 21:54:56 -04:00
|
|
|
// check next file
|
2017-10-09 23:10:58 +04:00
|
|
|
break
|
|
|
|
}
|
|
|
|
if err != nil {
|
|
|
|
gr.Close()
|
|
|
|
return nil, false, err
|
|
|
|
}
|
|
|
|
|
|
|
|
if m, ok := msg.Msg.(EndHeightMessage); ok {
|
|
|
|
if m.Height == height { // found
|
|
|
|
wal.Logger.Debug("Found", "height", height, "index", index)
|
|
|
|
return gr, true, nil
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
gr.Close()
|
|
|
|
}
|
|
|
|
|
|
|
|
return nil, false, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
///////////////////////////////////////////////////////////////////////////////
|
|
|
|
|
|
|
|
// A WALEncoder writes custom-encoded WAL messages to an output stream.
//
// Format: 4 bytes CRC sum + 4 bytes length + arbitrary-length value (go-wire
// encoded). Encode additionally writes walSeparator after every record.
type WALEncoder struct {
	wr io.Writer // destination of the encoded records
}

// NewWALEncoder returns a new encoder that writes to wr.
func NewWALEncoder(wr io.Writer) *WALEncoder {
	enc := WALEncoder{wr: wr}
	return &enc
}
|
|
|
|
|
|
|
|
// Encode writes the custom encoding of v to the stream.
|
|
|
|
func (enc *WALEncoder) Encode(v interface{}) error {
|
|
|
|
data := wire.BinaryBytes(v)
|
|
|
|
|
|
|
|
crc := crc32.Checksum(data, crc32c)
|
|
|
|
length := uint32(len(data))
|
|
|
|
totalLength := 8 + int(length)
|
|
|
|
|
|
|
|
msg := make([]byte, totalLength)
|
|
|
|
binary.BigEndian.PutUint32(msg[0:4], crc)
|
|
|
|
binary.BigEndian.PutUint32(msg[4:8], length)
|
|
|
|
copy(msg[8:], data)
|
|
|
|
|
|
|
|
_, err := enc.wr.Write(msg)
|
|
|
|
|
|
|
|
if err == nil {
|
|
|
|
// TODO [Anton Kaliaev 23 Oct 2017]: remove separator
|
|
|
|
_, err = enc.wr.Write(walSeparator)
|
|
|
|
}
|
|
|
|
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
|
|
|
///////////////////////////////////////////////////////////////////////////////
|
|
|
|
|
|
|
|
// A WALDecoder reads and decodes custom-encoded WAL messages from an input
// stream. See WALEncoder for the format used.
//
// While decoding it verifies the checksum and makes sure the data size is
// equal to the length from the header; if either check fails, an error is
// returned.
type WALDecoder struct {
	rd io.Reader // source of the encoded records
}

// NewWALDecoder returns a new decoder that reads from rd.
func NewWALDecoder(rd io.Reader) *WALDecoder {
	dec := WALDecoder{rd: rd}
	return &dec
}
|
|
|
|
|
2017-10-25 21:54:56 -04:00
|
|
|
// Decode reads the next custom-encoded value from its reader and returns it.
|
2017-10-09 23:10:58 +04:00
|
|
|
func (dec *WALDecoder) Decode() (*TimedWALMessage, error) {
|
|
|
|
b := make([]byte, 4)
|
|
|
|
|
|
|
|
n, err := dec.rd.Read(b)
|
|
|
|
if err == io.EOF {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
if err != nil {
|
|
|
|
return nil, fmt.Errorf("failed to read checksum: %v", err)
|
|
|
|
}
|
|
|
|
crc := binary.BigEndian.Uint32(b)
|
|
|
|
|
|
|
|
b = make([]byte, 4)
|
|
|
|
n, err = dec.rd.Read(b)
|
|
|
|
if err == io.EOF {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
if err != nil {
|
|
|
|
return nil, fmt.Errorf("failed to read length: %v", err)
|
|
|
|
}
|
|
|
|
length := binary.BigEndian.Uint32(b)
|
|
|
|
|
|
|
|
data := make([]byte, length)
|
|
|
|
n, err = dec.rd.Read(data)
|
|
|
|
if err == io.EOF {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
if err != nil {
|
|
|
|
return nil, fmt.Errorf("not enough bytes for data: %v (want: %d, read: %v)", err, length, n)
|
|
|
|
}
|
|
|
|
|
|
|
|
// check checksum before decoding data
|
|
|
|
actualCRC := crc32.Checksum(data, crc32c)
|
|
|
|
if actualCRC != crc {
|
|
|
|
return nil, fmt.Errorf("checksums do not match: (read: %v, actual: %v)", crc, actualCRC)
|
|
|
|
}
|
|
|
|
|
|
|
|
var nn int
|
|
|
|
var res *TimedWALMessage
|
|
|
|
res = wire.ReadBinary(&TimedWALMessage{}, bytes.NewBuffer(data), int(length), &nn, &err).(*TimedWALMessage)
|
|
|
|
if err != nil {
|
|
|
|
return nil, fmt.Errorf("failed to decode data: %v", err)
|
|
|
|
}
|
|
|
|
|
|
|
|
// TODO [Anton Kaliaev 23 Oct 2017]: remove separator
|
|
|
|
if err = readSeparator(dec.rd); err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
|
|
|
|
return res, err
|
|
|
|
}
|
|
|
|
|
|
|
|
// readSeparator reads a separator from r. It returns any error from underlying
|
|
|
|
// reader or if it's not a separator.
|
|
|
|
func readSeparator(r io.Reader) error {
|
|
|
|
b := make([]byte, len(walSeparator))
|
|
|
|
_, err := r.Read(b)
|
|
|
|
if err != nil {
|
|
|
|
return fmt.Errorf("failed to read separator: %v", err)
|
|
|
|
}
|
|
|
|
if !bytes.Equal(b, walSeparator) {
|
|
|
|
return fmt.Errorf("not a separator: %v", b)
|
2016-12-22 15:01:02 -05:00
|
|
|
}
|
2017-10-09 23:10:58 +04:00
|
|
|
return nil
|
2016-11-16 01:15:39 -05:00
|
|
|
}
|
new pubsub package
comment out failing consensus tests for now
rewrite rpc httpclient to use new pubsub package
import pubsub as tmpubsub, query as tmquery
make event IDs constants
EventKey -> EventTypeKey
rename EventsPubsub to PubSub
mempool does not use pubsub
rename eventsSub to pubsub
new subscribe API
fix channel size issues and consensus tests bugs
refactor rpc client
add missing discardFromChan method
add mutex
rename pubsub to eventBus
remove IsRunning from WSRPCConnection interface (not needed)
add a comment in broadcastNewRoundStepsAndVotes
rename registerEventCallbacks to broadcastNewRoundStepsAndVotes
See https://dave.cheney.net/2014/03/19/channel-axioms
stop eventBuses after reactor tests
remove unnecessary Unsubscribe
return subscribe helper function
move discardFromChan to where it is used
subscribe now returns an err
this gives us ability to refuse to subscribe if pubsub is at its max
capacity.
use context for control overflow
cache queries
handle err when subscribing in replay_test
rename testClientID to testSubscriber
extract var
set channel buffer capacity to 1 in replay_file
fix byzantine_test
unsubscribe from single event, not all events
refactor httpclient to return events to appropriate channels
return failing testReplayCrashBeforeWriteVote test
fix TestValidatorSetChanges
refactor code a bit
fix testReplayCrashBeforeWriteVote
add comment
fix TestValidatorSetChanges
fixes from Bucky's review
update comment [ci skip]
test TxEventBuffer
update changelog
fix TestValidatorSetChanges (2nd attempt)
only do wg.Done when no errors
benchmark event bus
create pubsub server inside NewEventBus
only expose config params (later if needed)
set buffer capacity to 0 so we are not testing cache
new tx event format: key = "Tx" plus a tag {"tx.hash": XYZ}
This should allow to subscribe to all transactions! or a specific one
using a query: "tm.events.type = Tx and tx.hash = '013ABF99434...'"
use TimeoutCommit instead of afterPublishEventNewBlockTimeout
TimeoutCommit is the time a node waits after committing a block, before
it goes into the next height. So it will finish everything from the last
block, but then wait a bit. The idea is this gives it time to hear more
votes from other validators, to strengthen the commit it includes in the
next block. But it also gives it time to hear about new transactions.
waitForBlockWithUpdatedVals
rewrite WAL crash tests
Task:
test that we can recover from any WAL crash.
Solution:
the old tests were relying on event hub being run in the same thread (we
were injecting the private validator's last signature).
when considering a rewrite, we considered two possible solutions: write
a "fuzzy" testing system where WAL is crashing upon receiving a new
message, or inject failures and trigger them in tests using something
like https://github.com/coreos/gofail.
remove sleep
no cs.Lock around wal.Save
test different cases (empty block, non-empty block, ...)
comments
add comments
test 4 cases: empty block, non-empty block, non-empty block with smaller part size, many blocks
fixes as per Bucky's last review
reset subscriptions on UnsubscribeAll
use a simple counter to track message for which we panicked
also, set a smaller part size for all test cases
2017-06-26 19:00:30 +04:00
|
|
|
|
|
|
|
type nilWAL struct{}
|
|
|
|
|
|
|
|
func (nilWAL) Save(m WALMessage) {}
|
|
|
|
func (nilWAL) Group() *auto.Group { return nil }
|
|
|
|
func (nilWAL) SearchForEndHeight(height uint64) (gr *auto.GroupReader, found bool, err error) {
|
|
|
|
return nil, false, nil
|
|
|
|
}
|
|
|
|
func (nilWAL) Start() (bool, error) { return true, nil }
|
|
|
|
func (nilWAL) Stop() bool { return true }
|
|
|
|
func (nilWAL) Wait() {}
|