diff --git a/consensus/state.go b/consensus/state.go
index 4e06dc0d..f0fa3054 100644
--- a/consensus/state.go
+++ b/consensus/state.go
@@ -8,6 +8,8 @@ import (
 	"sync"
 	"time"
 
+	"github.com/ebuchman/fail-test"
+
 	. "github.com/tendermint/go-common"
 	cfg "github.com/tendermint/go-config"
 	"github.com/tendermint/go-wire"
@@ -1255,16 +1257,19 @@ func (cs *ConsensusState) finalizeCommit(height int) {
 		"height", block.Height, "hash", block.Hash(), "root", block.AppHash)
 	log.Info(Fmt("%v", block))
 
+	fail.Fail() // XXX
+
 	// Save to blockStore.
 	if cs.blockStore.Height() < block.Height {
 		precommits := cs.Votes.Precommits(cs.CommitRound)
 		seenCommit := precommits.MakeCommit()
-		log.Notice("save block", "height", block.Height)
 		cs.blockStore.SaveBlock(block, blockParts, seenCommit)
 	} else {
 		log.Warn("Why are we finalizeCommitting a block height we already have?", "height", block.Height)
 	}
 
+	fail.Fail() // XXX
+
 	// Create a copy of the state for staging
 	// and an event cache for txs
 	stateCopy := cs.state.Copy()
@@ -1277,6 +1282,8 @@ func (cs *ConsensusState) finalizeCommit(height int) {
 	// NOTE: the block.AppHash wont reflect these txs until the next block
 	stateCopy.ApplyBlock(eventCache, cs.proxyAppConn, block, blockParts.Header(), cs.mempool)
 
+	fail.Fail() // XXX
+
 	// Fire off event for new block.
 	// TODO: Handle app failure. See #177
 	types.FireEventNewBlock(cs.evsw, types.EventDataNewBlock{block})
@@ -1284,9 +1291,10 @@ func (cs *ConsensusState) finalizeCommit(height int) {
 	eventCache.Flush()
 
 	// Save the state.
-	log.Notice("save state", "height", stateCopy.LastBlockHeight, "hash", stateCopy.AppHash)
 	stateCopy.Save()
 
+	fail.Fail() // XXX
+
 	// NewHeightStep!
 	cs.updateToState(stateCopy)
 
diff --git a/state/execution.go b/state/execution.go
index 4d570f0b..3208e067 100644
--- a/state/execution.go
+++ b/state/execution.go
@@ -1,8 +1,11 @@
 package state
 
 import (
+	"bytes"
 	"errors"
 
+	"github.com/ebuchman/fail-test"
+
 	. "github.com/tendermint/go-common"
 	"github.com/tendermint/tendermint/proxy"
 	"github.com/tendermint/tendermint/types"
@@ -98,20 +101,28 @@ func (s *State) execBlockOnProxyApp(eventCache types.Fireable, proxyAppConn prox
 		return err
 	}
 
+	fail.Fail() // XXX
+
 	// Run txs of block
 	for _, tx := range block.Txs {
+		fail.FailRand(len(block.Txs)) // XXX
 		proxyAppConn.AppendTxAsync(tx)
 		if err := proxyAppConn.Error(); err != nil {
 			return err
 		}
 	}
 
+	fail.Fail() // XXX
+
 	// End block
 	changedValidators, err := proxyAppConn.EndBlockSync(uint64(block.Height))
 	if err != nil {
 		log.Warn("Error in proxyAppConn.EndBlock", "error", err)
 		return err
 	}
+
+	fail.Fail() // XXX
+
 	// TODO: Do something with changedValidators
 	log.Debug("TODO: Do something with changedValidators", "changedValidators", changedValidators)
 
@@ -248,6 +259,8 @@ func (m mockMempool) Update(height int, txs []types.Tx) {}
 //----------------------------------------------------------------
 // Replay blocks to sync app to latest state of core
 
+type ErrReplay error
+
 type ErrAppBlockHeightTooHigh struct {
 	coreHeight int
 	appHeight  int
@@ -257,6 +270,18 @@ func (e ErrAppBlockHeightTooHigh) Error() string {
 	return Fmt("App block height (%d) is higher than core (%d)", e.appHeight, e.coreHeight)
 }
 
+// ErrLastStateMismatch is returned by ReplayBlocks when the AppHash in the
+// latest stored block's header does not equal the appHash it was given.
+type ErrLastStateMismatch struct {
+	height int
+	core   []byte
+	app    []byte
+}
+
+func (e ErrLastStateMismatch) Error() string {
+	return Fmt("Latest tendermint block (%d) LastAppHash (%X) does not match app's AppHash (%X)", e.height, e.core, e.app)
+}
+
 type ErrStateMismatch struct {
 	got      *State
 	expected *State
@@ -289,29 +314,51 @@ func (s *State) ReplayBlocks(appHash []byte, header *types.Header, partsHeader t
 	appBlockHeight := stateCopy.LastBlockHeight
 	coreBlockHeight := blockStore.Height()
 	if coreBlockHeight < appBlockHeight {
+		// if the app is ahead, there's nothing we can do
 		return ErrAppBlockHeightTooHigh{coreBlockHeight, appBlockHeight}
 
 	} else if coreBlockHeight == appBlockHeight {
 		// if we crashed between Commit and SaveState,
-		// the state's app hash is stale
+		// the state's app hash is stale.
+		// otherwise we're synced
 		if s.Stale {
 			s.Stale = false
 			s.AppHash = appHash
 		}
+		return checkState(s, stateCopy)
+
+	} else if s.LastBlockHeight == appBlockHeight {
+		// core is ahead of app but core's state height is at apps height
+		// this happens if we crashed after saving the block,
+		// but before committing it. We should be 1 ahead
+		if coreBlockHeight != appBlockHeight+1 {
+			PanicSanity(Fmt("core.state.height == app.height but core.height (%d) > app.height+1 (%d)", coreBlockHeight, appBlockHeight+1))
+		}
+
+		// check that the blocks last apphash is the states apphash
+		blockMeta := blockStore.LoadBlockMeta(coreBlockHeight)
+		if !bytes.Equal(blockMeta.Header.AppHash, appHash) {
+			return ErrLastStateMismatch{coreBlockHeight, blockMeta.Header.AppHash, appHash}
+		}
+
+		// replay the block against the actual tendermint state (not the copy)
+		return loadApplyBlock(coreBlockHeight, s, blockStore, appConnConsensus)
 
 	} else {
-		// the app is behind.
+		// either we're caught up or there's blocks to replay
 		// replay all blocks starting with appBlockHeight+1
 		for i := appBlockHeight + 1; i <= coreBlockHeight; i++ {
-			blockMeta := blockStore.LoadBlockMeta(i)
-			block := blockStore.LoadBlock(i)
-			panicOnNilBlock(i, coreBlockHeight, block, blockMeta) // XXX
-
-			var eventCache events.Fireable // nil
-			stateCopy.ApplyBlock(eventCache, appConnConsensus, block, blockMeta.PartsHeader, mockMempool{})
+			if err := loadApplyBlock(i, stateCopy, blockStore, appConnConsensus); err != nil {
+				return err
+			}
 		}
+		return checkState(s, stateCopy)
 	}
+}
 
+// checkState returns ErrStateMismatch unless the replayed stateCopy equals
+// the previously saved state s.
+func checkState(s, stateCopy *State) error {
 	// The computed state and the previously set state should be identical
 	if !s.Equals(stateCopy) {
 		return ErrStateMismatch{stateCopy, s}
@@ -319,6 +366,18 @@ func (s *State) ReplayBlocks(appHash []byte, header *types.Header, partsHeader t
 	return nil
 }
 
+// loadApplyBlock loads the block and its meta at height blockIndex from
+// blockStore, sanity-checks them with panicOnNilBlock, and applies the block
+// to s via appConnConsensus with a nil event cache and a mock mempool.
+func loadApplyBlock(blockIndex int, s *State, blockStore proxy.BlockStore, appConnConsensus proxy.AppConnConsensus) error {
+	blockMeta := blockStore.LoadBlockMeta(blockIndex)
+	block := blockStore.LoadBlock(blockIndex)
+	panicOnNilBlock(blockIndex, blockStore.Height(), block, blockMeta) // XXX
+
+	var eventCache events.Fireable // nil
+	return s.ApplyBlock(eventCache, appConnConsensus, block, blockMeta.PartsHeader, mockMempool{})
+}
+
 func panicOnNilBlock(height, bsHeight int, block *types.Block, blockMeta *types.BlockMeta) {
 	if block == nil || blockMeta == nil {
 		// Sanity?
diff --git a/test/persist/test.sh b/test/persist/test.sh
index c51fa7d0..5c1e1241 100644
--- a/test/persist/test.sh
+++ b/test/persist/test.sh
@@ -24,13 +24,13 @@ function kill_procs(){
 function send_txs(){
 	# send a bunch of txs over a few blocks
 	echo "Sending txs"
-#	for i in `seq 1 5`; do
-#		for j in `seq 1 100`; do
+	for i in `seq 1 5`; do
+		for j in `seq 1 100`; do
 			tx=`head -c 8 /dev/urandom | hexdump -ve '1/1 "%.2X"'`
 			curl -s 127.0.0.1:46657/broadcast_tx_async?tx=\"$tx\" &> /dev/null
-#		done
+		done
 		sleep 1
-#	done
+	done
 }
 
 
diff --git a/test/persist/test2.sh b/test/persist/test2.sh
new file mode 100644
index 00000000..509deee7
--- /dev/null
+++ b/test/persist/test2.sh
@@ -0,0 +1,119 @@
+#! /bin/bash
+
+
+export TMROOT=$HOME/.tendermint_persist
+
+rm -rf $TMROOT
+tendermint init
+
+# start_procs NAME [INDEX_TO_FAIL]
+# Starts the dummy app in the background. Without INDEX_TO_FAIL tendermint is
+# also started in the background; with it, tendermint runs in the foreground
+# with FAIL_TEST_INDEX set and this function returns once it has exited.
+function start_procs(){
+	name=$1
+	indexToFail=$2
+	echo "Starting persistent dummy and tendermint"
+	dummy --persist $TMROOT/dummy &> "dummy_${name}.log" &
+	PID_DUMMY=$!
+	if [[ "$indexToFail" == "" ]]; then
+		# run in background, dont fail
+		tendermint node &> tendermint_${name}.log &
+		PID_TENDERMINT=$!
+	else
+		# run in foreground, fail
+		FAIL_TEST_INDEX=$indexToFail tendermint node &> tendermint_${name}.log
+		# tendermint has already exited here and $! still holds the dummy's pid,
+		# so there is no tendermint pid to record
+		PID_TENDERMINT=""
+	fi
+}
+
+# kill both processes and reap them
+function kill_procs(){
+	kill -9 $PID_DUMMY $PID_TENDERMINT
+	wait $PID_DUMMY
+	wait $PID_TENDERMINT
+}
+
+
+# wait till node is up, send txs
+function send_txs(){
+	addr="127.0.0.1:46657"
+	curl -s $addr/status > /dev/null
+	ERR=$?
+	while [ "$ERR" != 0 ]; do
+		sleep 1
+		curl -s $addr/status > /dev/null
+		ERR=$?
+	done
+
+	# send a bunch of txs over a few blocks
+	echo "Node is up, sending txs"
+	for i in `seq 1 5`; do
+		for j in `seq 1 100`; do
+			tx=`head -c 8 /dev/urandom | hexdump -ve '1/1 "%.2X"'`
+			curl -s $addr/broadcast_tx_async?tx=\"$tx\" &> /dev/null
+		done
+		sleep 1
+	done
+}
+
+
+failsStart=0
+fails=`grep -r "fail.Fail" --include \*.go . | wc -l`
+failsEnd=$(($fails-1))
+
+for failIndex in `seq $failsStart $failsEnd`; do
+	echo ""
+	echo "* Test FailIndex $failIndex"
+	# test failure at failIndex
+
+	send_txs &
+	start_procs 1 $failIndex
+
+	# tendermint should fail when it hits the fail index
+	kill -9 $PID_DUMMY
+	wait $PID_DUMMY
+
+	start_procs 2
+
+	# wait for node to handshake and make a new block
+	addr="localhost:46657"
+	curl -s $addr/status > /dev/null
+	ERR=$?
+	i=0
+	while [ "$ERR" != 0 ]; do
+		sleep 1
+		curl -s $addr/status > /dev/null
+		ERR=$?
+		i=$(($i + 1))
+		if [[ $i == 10 ]]; then
+			echo "Timed out waiting for tendermint to start"
+			exit 1
+		fi
+	done
+
+	# wait for a new block; an empty height means the status query failed
+	# (e.g. the node died again), which must not count as progress
+	h1=`curl -s $addr/status | jq .result[1].latest_block_height`
+	h2=$h1
+	i=0
+	while [ -z "$h2" ] || [ "$h2" == "$h1" ]; do
+		sleep 1
+		h2=`curl -s $addr/status | jq .result[1].latest_block_height`
+		i=$(($i + 1))
+		if [[ $i == 30 ]]; then
+			echo "Timed out waiting for a new block"
+			kill_procs
+			exit 1
+		fi
+	done
+
+	kill_procs
+
+	echo "* Passed Test for FailIndex $failIndex"
+	echo ""
+done
+
+echo "Passed Test: Persistence"