diff --git a/config/tendermint/config.go b/config/tendermint/config.go
index c210d6e0..ea2f1d43 100644
--- a/config/tendermint/config.go
+++ b/config/tendermint/config.go
@@ -79,6 +79,7 @@ func GetConfig(rootDir string) cfg.Config {
 	mapConfig.SetDefault("block_size", 10000) // max number of txs
 	mapConfig.SetDefault("block_part_size", 65536) // part size 64K
 	mapConfig.SetDefault("disable_data_hash", false)
+	mapConfig.SetDefault("timeout_handshake", 10000)
 	mapConfig.SetDefault("timeout_propose", 3000)
 	mapConfig.SetDefault("timeout_propose_delta", 500)
 	mapConfig.SetDefault("timeout_prevote", 1000)
diff --git a/config/tendermint_test/config.go b/config/tendermint_test/config.go
index 55e3adb4..26a48335 100644
--- a/config/tendermint_test/config.go
+++ b/config/tendermint_test/config.go
@@ -93,6 +93,7 @@ func ResetConfig(localPath string) cfg.Config {
 	mapConfig.SetDefault("block_size", 10000)
 	mapConfig.SetDefault("block_part_size", 65536) // part size 64K
 	mapConfig.SetDefault("disable_data_hash", false)
+	mapConfig.SetDefault("timeout_handshake", 10000)
 	mapConfig.SetDefault("timeout_propose", 2000)
 	mapConfig.SetDefault("timeout_propose_delta", 1)
 	mapConfig.SetDefault("timeout_prevote", 10)
diff --git a/consensus/replay.go b/consensus/replay.go
index 6c4e65a0..4bdc2e87 100644
--- a/consensus/replay.go
+++ b/consensus/replay.go
@@ -190,6 +190,10 @@ func (h *Handshaker) NBlocks() int {
 	return h.nBlocks
 }
 
+// ErrReplayLastBlockTimeout is returned when the last block is not replayed
+// within the timeout_handshake interval (milliseconds).
+var ErrReplayLastBlockTimeout = errors.New("Timed out waiting for last block to be replayed")
+
 // TODO: retry the handshake/replay if it fails ?
 func (h *Handshaker) Handshake(proxyApp proxy.AppConns) error {
 	// handshake is done via info request on the query conn
@@ -207,7 +211,12 @@ func (h *Handshaker) Handshake(proxyApp proxy.AppConns) error {
 
 	// replay blocks up to the latest in the blockstore
 	_, err = h.ReplayBlocks(appHash, blockHeight, proxyApp)
+	// A last-block replay timeout is logged and not treated as a handshake failure.
+	if err == ErrReplayLastBlockTimeout {
+		log.Warn("Failed to sync via handshake. Trying other means. If they fail, please increase the timeout_handshake parameter")
+		return nil
+	}
 	if err != nil {
 		return errors.New(Fmt("Error on replay: %v", err))
 	}
 
@@ -320,6 +329,7 @@ func (h *Handshaker) replayBlocks(proxyApp proxy.AppConns, appBlockHeight, store
 func (h *Handshaker) replayLastBlock(proxyApp proxy.AppConnConsensus) ([]byte, error) {
 	mempool := types.MockMempool{}
 	cs := NewConsensusState(h.config, h.state, proxyApp, h.store, mempool)
+	defer cs.Stop()
 
 	evsw := types.NewEventSwitch()
 	evsw.Start()
@@ -328,9 +338,21 @@ func (h *Handshaker) replayLastBlock(proxyApp proxy.AppConnConsensus) ([]byte, e
 	newBlockCh := subscribeToEvent(evsw, "consensus-replay", types.EventStringNewBlock(), 1)
 
 	// run through the WAL, commit new block, stop
-	cs.Start()
-	<-newBlockCh // TODO: use a timeout and return err?
-	cs.Stop()
+	if _, err := cs.Start(); err != nil {
+		return nil, err
+	}
+
+	// Bound the wait for the replayed block; timeout_handshake is in milliseconds.
+	timeout := h.config.GetInt("timeout_handshake")
+	timer := time.NewTimer(time.Duration(timeout) * time.Millisecond)
+	defer timer.Stop() // release the timer promptly when the block arrives first
+	log.Notice("Attempting to replay last block", "height", h.store.Height(), "timeout", timeout)
+
+	select {
+	case <-newBlockCh:
+	case <-timer.C:
+		return nil, ErrReplayLastBlockTimeout
+	}
 
 	h.nBlocks += 1
 
diff --git a/consensus/state.go b/consensus/state.go
index 23eaff74..cca9d2ed 100644
--- a/consensus/state.go
+++ b/consensus/state.go
@@ -343,13 +343,7 @@ func (cs *ConsensusState) OnStart() error {
 	cs.BaseService.OnStart()
 
 	walFile := cs.config.GetString("cs_wal_file")
-	err := EnsureDir(path.Dir(walFile), 0700)
-	if err != nil {
-		log.Error("Error ensuring ConsensusState wal dir", "error", err.Error())
-		return err
-	}
-	err = cs.OpenWAL(walFile)
-	if err != nil {
+	if err := cs.OpenWAL(walFile); err != nil {
 		log.Error("Error loading ConsensusState wal", "error", err.Error())
 		return err
 	}
@@ -404,6 +398,12 @@ func (cs *ConsensusState) Wait() {
 
 // Open file to log all consensus messages and timeouts for deterministic accountability
 func (cs *ConsensusState) OpenWAL(walFile string) (err error) {
+	err = EnsureDir(path.Dir(walFile), 0700)
+	if err != nil {
+		log.Error("Error ensuring ConsensusState wal dir", "error", err.Error())
+		return err
+	}
+
 	cs.mtx.Lock()
 	defer cs.mtx.Unlock()
 	wal, err := NewWAL(walFile, cs.config.GetBool("cs_wal_light"))