From 0595420b1e42c31b10ff28df889a07e57d802510 Mon Sep 17 00:00:00 2001 From: antirez Date: Fri, 20 Mar 2015 16:42:49 +0100 Subject: [PATCH] Cluster: refactoring around configEpoch handling. This commit moves the process of generating a new config epoch without consensus out of the clusterCommand() implementation, in order to make it reusable for other reasons (current target is to have a CLUSTER FAILOVER option forcing the failover when no master majority is reachable). Moreover the commit moves other functions which are similarly related to config epochs in a new logical section of the cluster.c file, just for clarity. --- src/cluster.c | 224 ++++++++++++++++++++++++++++++-------------------- 1 file changed, 135 insertions(+), 89 deletions(-) diff --git a/src/cluster.c b/src/cluster.c index 178ce9b7..4ff8fb4d 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -74,27 +74,13 @@ void clusterCloseAllSlots(void); void clusterSetNodeAsMaster(clusterNode *n); void clusterDelNode(clusterNode *delnode); sds representRedisNodeFlags(sds ci, uint16_t flags); +uint64_t clusterGetMaxEpoch(void); +int clusterBumpConfigEpochWithoutConsensus(void); /* ----------------------------------------------------------------------------- * Initialization * -------------------------------------------------------------------------- */ -/* Return the greatest configEpoch found in the cluster. */ -uint64_t clusterGetMaxEpoch(void) { - uint64_t max = 0; - dictIterator *di; - dictEntry *de; - - di = dictGetSafeIterator(server.cluster->nodes); - while((de = dictNext(di)) != NULL) { - clusterNode *node = dictGetVal(de); - if (node->configEpoch > max) max = node->configEpoch; - } - dictReleaseIterator(di); - if (max < server.cluster->currentEpoch) max = server.cluster->currentEpoch; - return max; -} - /* Load the cluster config from 'filename'. 
* * If the file does not exist or is zero-length (this may happen because @@ -927,6 +913,137 @@ void clusterRenameNode(clusterNode *node, char *newname) { clusterAddNode(node); } +/* ----------------------------------------------------------------------------- + * CLUSTER config epoch handling + * -------------------------------------------------------------------------- */ + +/* Return the greatest configEpoch found in the cluster. */ +uint64_t clusterGetMaxEpoch(void) { + uint64_t max = 0; + dictIterator *di; + dictEntry *de; + + di = dictGetSafeIterator(server.cluster->nodes); + while((de = dictNext(di)) != NULL) { + clusterNode *node = dictGetVal(de); + if (node->configEpoch > max) max = node->configEpoch; + } + dictReleaseIterator(di); + if (max < server.cluster->currentEpoch) max = server.cluster->currentEpoch; + return max; +} + +/* If this node epoch is zero or is not already the greatest across the + * cluster (from the POV of the local configuration), this function will: + * + * 1) Generate a new config epoch, incrementing the current epoch. + * 2) Assign the new epoch to this node, WITHOUT any consensus. + * 3) Persist the configuration on disk before sending packets with the + * new configuration. + * + * If the new config epoch is generated and assigned, REDIS_OK is returned, + * otherwise REDIS_ERR is returned (since the node has already the greatest + * configuration around) and no operation is performed. + * + * Important note: this function violates the principle that config epochs + * should be generated with consensus and should be unique across the cluster. + * However Redis Cluster uses this auto-generated new config epochs in two + * cases: + * + * 1) When slots are closed after importing. Otherwise resharding would be + * too expensive. + * 2) When CLUSTER FAILOVER is called with options that force a slave to + * failover its master even if there is not master majority able to + * create a new configuration epoch. 
+ * + * Redis Cluster does not explode using this function, even in the case of + * a collision between this node and another node, generating the same + * configuration epoch unilaterally, because the config epoch conflict + * resolution algorithm will eventually move colliding nodes to different + * config epochs. However using this function may violate the "last failover + * wins" rule, so should only be used with care. */ +int clusterBumpConfigEpochWithoutConsensus(void) { + uint64_t maxEpoch = clusterGetMaxEpoch(); + + if (myself->configEpoch == 0 || + myself->configEpoch != maxEpoch) + { + server.cluster->currentEpoch++; + myself->configEpoch = server.cluster->currentEpoch; + clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG| + CLUSTER_TODO_FSYNC_CONFIG); + redisLog(REDIS_WARNING, + "New configEpoch set to %llu", + (unsigned long long) myself->configEpoch); + return REDIS_OK; + } else { + return REDIS_ERR; + } +} + +/* This function is called when this node is a master, and we receive from + * another master a configuration epoch that is equal to our configuration + * epoch. + * + * BACKGROUND + * + * It is not possible that different slaves get the same config + * epoch during a failover election, because the slaves need to get voted + * by a majority. However when we perform a manual resharding of the cluster + * the node will assign a configuration epoch to itself without to ask + * for agreement. Usually resharding happens when the cluster is working well + * and is supervised by the sysadmin, however it is possible for a failover + * to happen exactly while the node we are resharding a slot to assigns itself + * a new configuration epoch, but before it is able to propagate it. + * + * So technically it is possible in this condition that two nodes end with + * the same configuration epoch. + * + * Another possibility is that there are bugs in the implementation causing + * this to happen. 
+ * + * Moreover when a new cluster is created, all the nodes start with the same + * configEpoch. This collision resolution code allows nodes to automatically + * end with a different configEpoch at startup automatically. + * + * In all the cases, we want a mechanism that resolves this issue automatically + * as a safeguard. The same configuration epoch for masters serving different + * set of slots is not harmful, but it is if the nodes end serving the same + * slots for some reason (manual errors or software bugs) without a proper + * failover procedure. + * + * In general we want a system that eventually always ends with different + * masters having different configuration epochs whatever happened, since + * nothing is worse than a split-brain condition in a distributed system. + * + * BEHAVIOR + * + * When this function gets called, what happens is that if this node + * has the lexicographically smaller Node ID compared to the other node + * with the conflicting epoch (the 'sender' node), it will assign itself + * the greatest configuration epoch currently detected among nodes plus 1. + * + * This means that even if there are multiple nodes colliding, the node + * with the greatest Node ID never moves forward, so eventually all the nodes + * end with a different configuration epoch. + */ +void clusterHandleConfigEpochCollision(clusterNode *sender) { + /* Prerequisites: nodes have the same configEpoch and are both masters. */ + if (sender->configEpoch != myself->configEpoch || + !nodeIsMaster(sender) || !nodeIsMaster(myself)) return; + /* Don't act if the colliding node has a smaller Node ID. */ + if (memcmp(sender->name,myself->name,REDIS_CLUSTER_NAMELEN) <= 0) return; + /* Get the next ID available at the best of this node knowledge. */ + server.cluster->currentEpoch++; + myself->configEpoch = server.cluster->currentEpoch; + clusterSaveConfigOrDie(1); + redisLog(REDIS_VERBOSE, + "WARNING: configEpoch collision with node %.40s." 
+ " configEpoch set to %llu", + sender->name, + (unsigned long long) myself->configEpoch); +} + /* ----------------------------------------------------------------------------- * CLUSTER nodes blacklist * @@ -1399,69 +1516,6 @@ void clusterUpdateSlotsConfigWith(clusterNode *sender, uint64_t senderConfigEpoc } } -/* This function is called when this node is a master, and we receive from - * another master a configuration epoch that is equal to our configuration - * epoch. - * - * BACKGROUND - * - * It is not possible that different slaves get the same config - * epoch during a failover election, because the slaves need to get voted - * by a majority. However when we perform a manual resharding of the cluster - * the node will assign a configuration epoch to itself without to ask - * for agreement. Usually resharding happens when the cluster is working well - * and is supervised by the sysadmin, however it is possible for a failover - * to happen exactly while the node we are resharding a slot to assigns itself - * a new configuration epoch, but before it is able to propagate it. - * - * So technically it is possible in this condition that two nodes end with - * the same configuration epoch. - * - * Another possibility is that there are bugs in the implementation causing - * this to happen. - * - * Moreover when a new cluster is created, all the nodes start with the same - * configEpoch. This collision resolution code allows nodes to automatically - * end with a different configEpoch at startup automatically. - * - * In all the cases, we want a mechanism that resolves this issue automatically - * as a safeguard. The same configuration epoch for masters serving different - * set of slots is not harmful, but it is if the nodes end serving the same - * slots for some reason (manual errors or software bugs) without a proper - * failover procedure. 
- * - * In general we want a system that eventually always ends with different - * masters having different configuration epochs whatever happened, since - * nothign is worse than a split-brain condition in a distributed system. - * - * BEHAVIOR - * - * When this function gets called, what happens is that if this node - * has the lexicographically smaller Node ID compared to the other node - * with the conflicting epoch (the 'sender' node), it will assign itself - * the greatest configuration epoch currently detected among nodes plus 1. - * - * This means that even if there are multiple nodes colliding, the node - * with the greatest Node ID never moves forward, so eventually all the nodes - * end with a different configuration epoch. - */ -void clusterHandleConfigEpochCollision(clusterNode *sender) { - /* Prerequisites: nodes have the same configEpoch and are both masters. */ - if (sender->configEpoch != myself->configEpoch || - !nodeIsMaster(sender) || !nodeIsMaster(myself)) return; - /* Don't act if the colliding node has a smaller Node ID. */ - if (memcmp(sender->name,myself->name,REDIS_CLUSTER_NAMELEN) <= 0) return; - /* Get the next ID available at the best of this node knowledge. */ - server.cluster->currentEpoch++; - myself->configEpoch = server.cluster->currentEpoch; - clusterSaveConfigOrDie(1); - redisLog(REDIS_VERBOSE, - "WARNING: configEpoch collision with node %.40s." - " configEpoch set to %llu", - sender->name, - (unsigned long long) myself->configEpoch); -} - /* When this function is called, there is a packet to process starting * at node->rcvbuf. Releasing the buffer is up to the caller, so this * function should just handle the higher level stuff of processing the @@ -3902,17 +3956,9 @@ void clusterCommand(redisClient *c) { * failover happens at the same time we close the slot, the * configEpoch collision resolution will fix it assigning * a different epoch to each node. 
*/ - uint64_t maxEpoch = clusterGetMaxEpoch(); - - if (myself->configEpoch == 0 || - myself->configEpoch != maxEpoch) - { - server.cluster->currentEpoch++; - myself->configEpoch = server.cluster->currentEpoch; - clusterDoBeforeSleep(CLUSTER_TODO_FSYNC_CONFIG); + if (clusterBumpConfigEpochWithoutConsensus() == REDIS_OK) { redisLog(REDIS_WARNING, - "configEpoch set to %llu after importing slot %d", - (unsigned long long) myself->configEpoch, slot); + "configEpoch updated after importing slot %d", slot); } server.cluster->importing_slots_from[slot] = NULL; }