From 2553f6c9e53b900d23a19a46f0cd9b0303673297 Mon Sep 17 00:00:00 2001 From: antirez Date: Sat, 24 Jan 2015 07:52:24 +0100 Subject: [PATCH 1/5] Cluster: initialized not used fields in gossip section. Otherwise we risk sending not initialized data to other nodes, that may contain anything. This was actually not possible only because the initialization of the buffer where the cluster packets header is created was larger than the 3 gossip sections we use, so the memory was already all filled with zeroes by the memset(). --- src/cluster.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/cluster.c b/src/cluster.c index 995f1663..66fc301c 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -2195,6 +2195,8 @@ void clusterSendPing(clusterLink *link, int type) { memcpy(gossip->ip,this->ip,sizeof(this->ip)); gossip->port = htons(this->port); gossip->flags = htons(this->flags); + gossip->notused1 = 0; + gossip->notused2 = 0; gossipcount++; } From 96368c2d0b8d39ddd8b3e036aac7a91cb08e2c94 Mon Sep 17 00:00:00 2001 From: antirez Date: Fri, 30 Jan 2015 10:41:45 +0100 Subject: [PATCH 2/5] Cluster: create-cluster script improved. --- utils/create-cluster/README | 2 +- utils/create-cluster/create-cluster | 62 +++++++++++++++++++++-------- 2 files changed, 46 insertions(+), 18 deletions(-) diff --git a/utils/create-cluster/README b/utils/create-cluster/README index f3a3f088..1f43748e 100644 --- a/utils/create-cluster/README +++ b/utils/create-cluster/README @@ -24,4 +24,4 @@ In order to stop a cluster: 1. Use "./craete-cluster stop" to stop all the instances. After you stopped the instances you can use "./create-cluster start" to restart them if you change ideas. 2. Use "./create-cluster clean" to remove all the AOF / log files to restat with a clean environment. -It is currently hardcoded that you start a cluster where each master has one slave, since the script is pretty basic. +Use the command "./create-cluster help" to get the full list of features. 
diff --git a/utils/create-cluster/create-cluster b/utils/create-cluster/create-cluster index 76f61091..efb3135d 100755 --- a/utils/create-cluster/create-cluster +++ b/utils/create-cluster/create-cluster @@ -1,8 +1,21 @@ #!/bin/bash +# Settings PORT=30000 -ENDPORT=30006 -TIMEOUT=15000 +TIMEOUT=2000 +NODES=6 +REPLICAS=1 + +# You may want to put the above config parameters into config.sh in order to +# override the defaults without modifying this script. + +if [ -a config.sh ] +then + source "config.sh" +fi + +# Computed vars +ENDPORT=$((PORT+NODES)) if [ "$1" == "start" ] then @@ -21,7 +34,7 @@ then PORT=$((PORT+1)) HOSTS="$HOSTS 127.0.0.1:$PORT" done - ../../src/redis-trib.rb create --replicas 1 $HOSTS + ../../src/redis-trib.rb create --replicas $REPLICAS $HOSTS exit 0 fi @@ -35,22 +48,31 @@ then exit 0 fi -if [ "$1" == "join" ] +if [ "$1" == "watch" ] +then + PORT=$((PORT+1)) + while [ 1 ]; do + clear + date + redis-cli -p $PORT cluster nodes | head -30 + sleep 1 + done + exit 0 +fi + +if [ "$1" == "tail" ] +then + INSTANCE=$2 + PORT=$((PORT+INSTANCE)) + tail -f ${PORT}.log + exit 0 +fi + +if [ "$1" == "call" ] then while [ $((PORT < ENDPORT)) != "0" ]; do PORT=$((PORT+1)) - echo "Joining $PORT" - redis-cli -p $PORT CLUSTER MEET 127.0.0.1 10002 - done - - echo "Waiting 5 seconds" - sleep 5 - - PORT=30000 - while [ $((PORT < ENDPORT)) != "0" ]; do - PORT=$((PORT+1)) - echo "Replicate $PORT" - redis-cli -p $PORT CLUSTER REPLICATE $2 + ../../src/redis-cli -p $PORT $2 $3 $4 $5 $6 $7 $8 $9 done exit 0 fi @@ -64,4 +86,10 @@ then exit 0 fi -echo "Usage: $0 [start|create|stop|join|clean]" +echo "Usage: $0 [start|create|stop|watch|tail|clean]" +echo "start -- Launch Redis Cluster instances." +echo "create -- Create a cluster using redis-trib create." +echo "stop -- Stop Redis Cluster instances." +echo "watch -- Show CLUSTER NODES output (first 30 lines) of first node." +echo "tail -- Run tail -f of instance at base port + ID." 
+echo "clean -- Remove all instances data, logs, configs." From 0f1b9c3db16ccf7b0a8cdd24f2bf10098fbc44f9 Mon Sep 17 00:00:00 2001 From: antirez Date: Fri, 30 Jan 2015 11:23:27 +0100 Subject: [PATCH 3/5] More correct wanted / maxiterations values in clusterSendPing(). --- src/cluster.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/cluster.c b/src/cluster.c index 66fc301c..3712cc7a 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -40,6 +40,7 @@ #include #include #include +#include /* A global reference to myself is handy to make code more clear. * Myself always points to server.cluster->myself, that is, the clusterNode @@ -2136,8 +2137,9 @@ void clusterSendPing(clusterLink *link, int type) { * Since we have non-voting slaves that lower the probability of an entry * to feature our node, we set the number of entires per packet as * 10% of the total nodes we have. */ - wanted = freshnodes/10; + wanted = floor(dictSize(server.cluster->nodes)/10); if (wanted < 3) wanted = 3; + if (wanted > freshnodes) wanted = freshnodes; /* Compute the maxium totlen to allocate our buffer. We'll fix the totlen * later according to the number of gossip sections we really were able @@ -2156,7 +2158,7 @@ void clusterSendPing(clusterLink *link, int type) { clusterBuildMessageHdr(hdr,type); /* Populate the gossip fields */ - int maxiterations = wanted+10; + int maxiterations = wanted*2; while(freshnodes > 0 && gossipcount < wanted && maxiterations--) { dictEntry *de = dictGetRandomKey(server.cluster->nodes); clusterNode *this = dictGetVal(de); @@ -2199,6 +2201,8 @@ void clusterSendPing(clusterLink *link, int type) { gossip->notused2 = 0; gossipcount++; } + redisLog(REDIS_VERBOSE,"WANTED: %d, USED_ITER: %d, GOSSIPCOUNT: %d", + wanted, wanted*2-maxiterations, gossipcount); /* Ready to send... fix the totlen fiend and queue the message in the * output buffer. 
*/ From 55f2bc646af24528ea12e36268c978d496056633 Mon Sep 17 00:00:00 2001 From: antirez Date: Fri, 30 Jan 2015 11:54:18 +0100 Subject: [PATCH 4/5] Cluster: some bias towards FAIL/PFAIL nodes in gossip sections. This improves PFAIL -> FAIL switch. Too late at this point in the RC releases to add proper PFAIL/FAIL separate dictionary to do this in a less randomized way. Tested in practice with experiments that this helps. PFAIL -> FAIL average with 20 nodes and node-timeout set to 5 seconds takes 2.5 seconds without this commit, 1 second with this commit. --- src/cluster.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/cluster.c b/src/cluster.c index 3712cc7a..bbad47bd 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -2158,7 +2158,7 @@ void clusterSendPing(clusterLink *link, int type) { clusterBuildMessageHdr(hdr,type); /* Populate the gossip fields */ - int maxiterations = wanted*2; + int maxiterations = wanted*3; while(freshnodes > 0 && gossipcount < wanted && maxiterations--) { dictEntry *de = dictGetRandomKey(server.cluster->nodes); clusterNode *this = dictGetVal(de); @@ -2169,6 +2169,11 @@ void clusterSendPing(clusterLink *link, int type) { * already, so we just gossip about other nodes. */ if (this == myself) continue; + /* Give a bias to FAIL/PFAIL nodes. */ + if (maxiterations > wanted*2 && + !(this->flags & (REDIS_NODE_PFAIL|REDIS_NODE_FAIL))) + continue; + /* In the gossip section don't include: * 1) Nodes in HANDSHAKE state. * 3) Nodes with the NOADDR flag set. @@ -2201,8 +2206,6 @@ void clusterSendPing(clusterLink *link, int type) { gossip->notused2 = 0; gossipcount++; } - redisLog(REDIS_VERBOSE,"WANTED: %d, USED_ITER: %d, GOSSIPCOUNT: %d", - wanted, wanted*2-maxiterations, gossipcount); /* Ready to send... fix the totlen fiend and queue the message in the * output buffer. 
*/ From 19029a4ebc539de5c9a79e053aa7271c37be0ac1 Mon Sep 17 00:00:00 2001 From: antirez Date: Fri, 30 Jan 2015 12:03:17 +0100 Subject: [PATCH 5/5] Cluster: Tcl script to check avg pfail->fail time. --- utils/cluster_fail_time.tcl | 50 +++++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) create mode 100644 utils/cluster_fail_time.tcl diff --git a/utils/cluster_fail_time.tcl b/utils/cluster_fail_time.tcl new file mode 100644 index 00000000..87399495 --- /dev/null +++ b/utils/cluster_fail_time.tcl @@ -0,0 +1,50 @@ +# This simple script is used in order to estimate the average PFAIL->FAIL +# state switch after a failure. + +set ::sleep_time 10 ; # How much to sleep to trigger PFAIL. +set ::fail_port 30016 ; # Node to put in sleep. +set ::other_port 30001 ; # Node to use to monitor the flag switch. + +proc avg vector { + set sum 0.0 + foreach x $vector { + set sum [expr {$sum+$x}] + } + expr {$sum/[llength $vector]} +} + +set samples {} +while 1 { + exec redis-cli -p $::fail_port debug sleep $::sleep_time > /dev/null & + + # Wait for fail? to appear. + while 1 { + set output [exec redis-cli -p $::other_port cluster nodes] + if {[string match {*fail\?*} $output]} break + after 100 + } + + puts "FAIL?" + set start [clock milliseconds] + + # Wait for fail? to disappear. + while 1 { + set output [exec redis-cli -p $::other_port cluster nodes] + if {![string match {*fail\?*} $output]} break + after 100 + } + + puts "FAIL" + set now [clock milliseconds] + set elapsed [expr {$now-$start}] + puts $elapsed + lappend samples $elapsed + + puts "AVG([llength $samples]): [avg $samples]" + + # Wait for the instance to be available again. + exec redis-cli -p $::fail_port ping + + # Wait for the fail flag to be cleared. + after 2000 +}