diff --git a/src/replication.c b/src/replication.c index 49c38f73..e62afb2f 100644 --- a/src/replication.c +++ b/src/replication.c @@ -162,6 +162,7 @@ void feedReplicationBacklog(void *ptr, size_t len) { unsigned char *p = ptr; server.master_repl_offset += len; + server.master_repl_meaningful_offset = server.master_repl_offset; /* This is a circular buffer, so write as much data we can at every * iteration and rewind the "idx" index if we reach the limit. */ @@ -1768,6 +1769,7 @@ void readSyncBulkPayload(connection *conn) { * we are starting a new history. */ memcpy(server.replid,server.master->replid,sizeof(server.replid)); server.master_repl_offset = server.master->reploff; + server.master_repl_meaningful_offset = server.master->reploff; clearReplicationId2(); /* Let's create the replication backlog if needed. Slaves need to @@ -2725,12 +2727,37 @@ void replicationCacheMaster(client *c) { * current offset if no data was lost during the failover. So we use our * current replication ID and offset in order to synthesize a cached master. */ void replicationCacheMasterUsingMyself(void) { + serverLog(LL_NOTICE, + "Before turning into a replica, using my own master parameters " + "to synthesize a cached master: I may be able to synchronize with " + "the new master with just a partial transfer."); + /* This will be used to populate the field server.master->reploff * by replicationCreateMasterClient(). We'll later set the created * master as server.cached_master, so the replica will use such * offset for PSYNC. */ server.master_initial_offset = server.master_repl_offset; + /* However if the "meaningful" offset, that is the offset without + * the final PINGs in the stream, is different, use this instead: + * often when the master is no longer reachable, replicas will never + * receive the PINGs, however the master will end with an incremented + * offset because of the PINGs and will not be able to incrementally + * PSYNC with the new master. */ + if (server.master_repl_offset > server.master_repl_meaningful_offset) { + long long delta = server.master_repl_offset - + server.master_repl_meaningful_offset; + serverLog(LL_NOTICE, + "Using the meaningful offset %lld instead of %lld to exclude " + "the final PINGs (%lld bytes difference)", + server.master_repl_meaningful_offset, + server.master_repl_offset, + delta); + server.master_initial_offset = server.master_repl_meaningful_offset; + server.repl_backlog_histlen -= delta; + if (server.repl_backlog_histlen < 0) server.repl_backlog_histlen = 0; + } + /* The master client we create can be set to any DBID, because * the new master will start its replication stream with SELECT. */ replicationCreateMasterClient(NULL,-1); @@ -2742,7 +2769,6 @@ void replicationCacheMasterUsingMyself(void) { unlinkClient(server.master); server.cached_master = server.master; server.master = NULL; - serverLog(LL_NOTICE,"Before turning into a replica, using my master parameters to synthesize a cached master: I may be able to synchronize with the new master with just a partial transfer."); } /* Free a cached master, called when there are no longer the conditions for @@ -3122,10 +3148,18 @@ void replicationCron(void) { clientsArePaused(); if (!manual_failover_in_progress) { + long long before_ping = server.master_repl_meaningful_offset; ping_argv[0] = createStringObject("PING",4); replicationFeedSlaves(server.slaves, server.slaveseldb, ping_argv, 1); decrRefCount(ping_argv[0]); + /* The server.master_repl_meaningful_offset variable represents + * the offset of the replication stream without the pending PINGs. + * This is useful to set the right replication offset for PSYNC + * when the master is turned into a replica. Otherwise pending + * PINGs may not allow it to perform an incremental sync with the + * new master. */ + server.master_repl_meaningful_offset = before_ping; } } diff --git a/src/server.c b/src/server.c index 65a01db5..84439461 100644 --- a/src/server.c +++ b/src/server.c @@ -2355,6 +2355,7 @@ void initServerConfig(void) { server.repl_syncio_timeout = CONFIG_REPL_SYNCIO_TIMEOUT; server.repl_down_since = 0; /* Never connected, repl is down since EVER. */ server.master_repl_offset = 0; + server.master_repl_meaningful_offset = 0; /* Replication partial resync backlog */ server.repl_backlog = NULL; @@ -4398,6 +4399,7 @@ sds genRedisInfoString(const char *section) { "master_replid:%s\r\n" "master_replid2:%s\r\n" "master_repl_offset:%lld\r\n" + "master_repl_meaningful_offset:%lld\r\n" "second_repl_offset:%lld\r\n" "repl_backlog_active:%d\r\n" "repl_backlog_size:%lld\r\n" @@ -4406,6 +4408,7 @@ sds genRedisInfoString(const char *section) { server.replid, server.replid2, server.master_repl_offset, + server.master_repl_meaningful_offset, server.second_replid_offset, server.repl_backlog != NULL, server.repl_backlog_size, @@ -4783,6 +4786,7 @@ void loadDataFromDisk(void) { { memcpy(server.replid,rsi.repl_id,sizeof(server.replid)); server.master_repl_offset = rsi.repl_offset; + server.master_repl_meaningful_offset = rsi.repl_offset; /* If we are a slave, create a cached master from this * information, in order to allow partial resynchronizations * with masters. */ diff --git a/src/server.h b/src/server.h index ed4707d6..818fbc3b 100644 --- a/src/server.h +++ b/src/server.h @@ -1241,6 +1241,7 @@ struct redisServer { char replid[CONFIG_RUN_ID_SIZE+1]; /* My current replication ID. */ char replid2[CONFIG_RUN_ID_SIZE+1]; /* replid inherited from master*/ long long master_repl_offset; /* My current replication offset */ + long long master_repl_meaningful_offset; /* Offset minus latest PINGs. */ long long second_replid_offset; /* Accept offsets up to this for replid2. */ int slaveseldb; /* Last SELECTed DB in replication output */ int repl_ping_slave_period; /* Master pings the slave every N seconds */