consensus: timeout on replayLastBlock

This commit is contained in:
Ethan Buchman 2017-03-27 15:41:45 -04:00
parent d4f6254551
commit 077cf13a1f
4 changed files with 30 additions and 11 deletions

View File

@ -79,6 +79,7 @@ func GetConfig(rootDir string) cfg.Config {
mapConfig.SetDefault("block_size", 10000) // max number of txs
mapConfig.SetDefault("block_part_size", 65536) // part size 64K
mapConfig.SetDefault("disable_data_hash", false)
mapConfig.SetDefault("timeout_handshake", 10000)
mapConfig.SetDefault("timeout_propose", 3000)
mapConfig.SetDefault("timeout_propose_delta", 500)
mapConfig.SetDefault("timeout_prevote", 1000)

View File

@ -93,6 +93,7 @@ func ResetConfig(localPath string) cfg.Config {
mapConfig.SetDefault("block_size", 10000)
mapConfig.SetDefault("block_part_size", 65536) // part size 64K
mapConfig.SetDefault("disable_data_hash", false)
mapConfig.SetDefault("timeout_handshake", 10000)
mapConfig.SetDefault("timeout_propose", 2000)
mapConfig.SetDefault("timeout_propose_delta", 1)
mapConfig.SetDefault("timeout_prevote", 10)

View File

@ -190,6 +190,8 @@ func (h *Handshaker) NBlocks() int {
return h.nBlocks
}
var ErrReplayLastBlockTimeout = errors.New("Timed out waiting for last block to be replayed")
// TODO: retry the handshake/replay if it fails ?
func (h *Handshaker) Handshake(proxyApp proxy.AppConns) error {
// handshake is done via info request on the query conn
@ -207,7 +209,11 @@ func (h *Handshaker) Handshake(proxyApp proxy.AppConns) error {
// replay blocks up to the latest in the blockstore
_, err = h.ReplayBlocks(appHash, blockHeight, proxyApp)
if err != nil {
if err == ErrReplayLastBlockTimeout {
log.Warn("Failed to sync via handshake. Trying other means. If they fail, please increase the timeout_handshake parameter")
return nil
} else if err != nil {
return errors.New(Fmt("Error on replay: %v", err))
}
@ -320,6 +326,7 @@ func (h *Handshaker) replayBlocks(proxyApp proxy.AppConns, appBlockHeight, store
func (h *Handshaker) replayLastBlock(proxyApp proxy.AppConnConsensus) ([]byte, error) {
mempool := types.MockMempool{}
cs := NewConsensusState(h.config, h.state, proxyApp, h.store, mempool)
defer cs.Stop()
evsw := types.NewEventSwitch()
evsw.Start()
@ -328,9 +335,19 @@ func (h *Handshaker) replayLastBlock(proxyApp proxy.AppConnConsensus) ([]byte, e
newBlockCh := subscribeToEvent(evsw, "consensus-replay", types.EventStringNewBlock(), 1)
// run through the WAL, commit new block, stop
cs.Start()
<-newBlockCh // TODO: use a timeout and return err?
cs.Stop()
if _, err := cs.Start(); err != nil {
return nil, err
}
timeout := h.config.GetInt("timeout_handshake")
timer := time.NewTimer(time.Duration(timeout) * time.Millisecond)
log.Notice("Attempting to replay last block", "height", h.store.Height(), "timeout", timeout)
select {
case <-newBlockCh:
case <-timer.C:
return nil, ErrReplayLastBlockTimeout
}
h.nBlocks += 1

View File

@ -343,13 +343,7 @@ func (cs *ConsensusState) OnStart() error {
cs.BaseService.OnStart()
walFile := cs.config.GetString("cs_wal_file")
err := EnsureDir(path.Dir(walFile), 0700)
if err != nil {
log.Error("Error ensuring ConsensusState wal dir", "error", err.Error())
return err
}
err = cs.OpenWAL(walFile)
if err != nil {
if err := cs.OpenWAL(walFile); err != nil {
log.Error("Error loading ConsensusState wal", "error", err.Error())
return err
}
@ -404,6 +398,12 @@ func (cs *ConsensusState) Wait() {
// Open file to log all consensus messages and timeouts for deterministic accountability
func (cs *ConsensusState) OpenWAL(walFile string) (err error) {
err = EnsureDir(path.Dir(walFile), 0700)
if err != nil {
log.Error("Error ensuring ConsensusState wal dir", "error", err.Error())
return err
}
cs.mtx.Lock()
defer cs.mtx.Unlock()
wal, err := NewWAL(walFile, cs.config.GetBool("cs_wal_light"))