mirror of
https://github.com/fluencelabs/redis
synced 2025-05-01 13:32:13 +00:00
This commit attempts to fix a bug involving PSYNC and diskless replication (currently experimental) found by Yuval Inbar from Redis Labs and that was later found to have even more far reaching effects (the bug also exists when diskstore is off). The gist of the bug is that a Redis master replies with +FULLRESYNC to a PSYNC attempt that fails and requires a full resynchronization. However, the baseline offset sent along with FULLRESYNC was always the current master replication offset. This is not ok, because there are many reasons that may delay the RDB file creation. And... guess what, the master offset we communicate must be the one of the time the RDB was created. So for example: 1) When the BGSAVE for replication is delayed since there is one already but is not good for replication. 2) When the BGSAVE is not needed as we attach one currently ongoing. 3) When because of diskless replication the BGSAVE is delayed. In all the above cases the PSYNC reply is wrong and the slave may reconnect later claiming to need a wrong offset: this may cause data corruption later.
151 lines
5.8 KiB
C
151 lines
5.8 KiB
C
/* Synchronous socket and file I/O operations useful across the core.
|
|
*
|
|
* Copyright (c) 2009-2010, Salvatore Sanfilippo <antirez at gmail dot com>
|
|
* All rights reserved.
|
|
*
|
|
* Redistribution and use in source and binary forms, with or without
|
|
* modification, are permitted provided that the following conditions are met:
|
|
*
|
|
* * Redistributions of source code must retain the above copyright notice,
|
|
* this list of conditions and the following disclaimer.
|
|
* * Redistributions in binary form must reproduce the above copyright
|
|
* notice, this list of conditions and the following disclaimer in the
|
|
* documentation and/or other materials provided with the distribution.
|
|
* * Neither the name of Redis nor the names of its contributors may be used
|
|
* to endorse or promote products derived from this software without
|
|
* specific prior written permission.
|
|
*
|
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
|
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
|
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
|
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
|
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
|
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
|
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
|
* POSSIBILITY OF SUCH DAMAGE.
|
|
*/
|
|
|
|
#include "redis.h"
|
|
|
|
/* ----------------- Blocking sockets I/O with timeouts --------------------- */
|
|
|
|
/* Redis performs most of the I/O in a nonblocking way, with the exception
|
|
* of the SYNC command where the slave does it in a blocking way, and
|
|
* the MIGRATE command that must be blocking in order to be atomic from the
|
|
* point of view of the two instances (one migrating the key and one receiving
|
|
* the key). This is why we need the following blocking I/O functions.
|
|
*
|
|
* All the functions take the timeout in milliseconds. */
|
|
|
|
/* Lower bound, in milliseconds, for the wait handed to aeWait() by the
 * blocking read/write loops in this file: when less than this amount of the
 * caller's timeout remains, the loops still wait for this long. */
#define REDIS_SYNCIO_RESOLUTION 10 /* Resolution in milliseconds */
|
|
|
|
/* Write the specified payload to 'fd'. If writing the whole payload will be
|
|
* done within 'timeout' milliseconds the operation succeeds and 'size' is
|
|
* returned. Otherwise the operation fails, -1 is returned, and an unspecified
|
|
* partial write could be performed against the file descriptor. */
|
|
ssize_t syncWrite(int fd, char *ptr, ssize_t size, long long timeout) {
|
|
ssize_t nwritten, ret = size;
|
|
long long start = mstime();
|
|
long long remaining = timeout;
|
|
|
|
while(1) {
|
|
long long wait = (remaining > REDIS_SYNCIO_RESOLUTION) ?
|
|
remaining : REDIS_SYNCIO_RESOLUTION;
|
|
long long elapsed;
|
|
|
|
/* Optimistically try to write before checking if the file descriptor
|
|
* is actually writable. At worst we get EAGAIN. */
|
|
nwritten = write(fd,ptr,size);
|
|
if (nwritten == -1) {
|
|
if (errno != EAGAIN) return -1;
|
|
} else {
|
|
ptr += nwritten;
|
|
size -= nwritten;
|
|
}
|
|
if (size == 0) return ret;
|
|
|
|
/* Wait */
|
|
aeWait(fd,AE_WRITABLE,wait);
|
|
elapsed = mstime() - start;
|
|
if (elapsed >= timeout) {
|
|
errno = ETIMEDOUT;
|
|
return -1;
|
|
}
|
|
remaining = timeout - elapsed;
|
|
}
|
|
}
|
|
|
|
/* Read the specified amount of bytes from 'fd'. If all the bytes are read
|
|
* within 'timeout' milliseconds the operation succeed and 'size' is returned.
|
|
* Otherwise the operation fails, -1 is returned, and an unspecified amount of
|
|
* data could be read from the file descriptor. */
|
|
ssize_t syncRead(int fd, char *ptr, ssize_t size, long long timeout) {
|
|
ssize_t nread, totread = 0;
|
|
long long start = mstime();
|
|
long long remaining = timeout;
|
|
|
|
if (size == 0) return 0;
|
|
while(1) {
|
|
long long wait = (remaining > REDIS_SYNCIO_RESOLUTION) ?
|
|
remaining : REDIS_SYNCIO_RESOLUTION;
|
|
long long elapsed;
|
|
|
|
/* Optimistically try to read before checking if the file descriptor
|
|
* is actually readable. At worst we get EAGAIN. */
|
|
nread = read(fd,ptr,size);
|
|
if (nread == 0) return -1; /* short read. */
|
|
if (nread == -1) {
|
|
if (errno != EAGAIN) return -1;
|
|
} else {
|
|
ptr += nread;
|
|
size -= nread;
|
|
totread += nread;
|
|
}
|
|
if (size == 0) return totread;
|
|
|
|
/* Wait */
|
|
aeWait(fd,AE_READABLE,wait);
|
|
elapsed = mstime() - start;
|
|
if (elapsed >= timeout) {
|
|
errno = ETIMEDOUT;
|
|
return -1;
|
|
}
|
|
remaining = timeout - elapsed;
|
|
}
|
|
}
|
|
|
|
/* Read a line from 'fd' into the 'size' bytes buffer at 'ptr', one byte at
 * a time, giving every single byte up to 'timeout' milliseconds to arrive.
 *
 * Empty newlines received before the first non-empty line are ignored: this
 * is useful because Redis sometimes sends empty newlines just to keep the
 * connection alive. Note that every ignored newline still consumes one unit
 * of the 'size' budget.
 *
 * The terminating '\n' is not stored, and a '\r' immediately before it is
 * replaced by the string terminator. If the buffer fills up before a newline
 * is seen, the bytes read so far are returned as a truncated line.
 *
 * Return value: the number of bytes stored before the terminator was
 * applied (so a stripped '\r' is still counted), or -1 on error. Errors are
 * a failure of syncRead() or a 'size' that is not positive (errno is set to
 * EINVAL in the latter case). Whenever a value >= 0 is returned the buffer
 * holds a properly null terminated string. */
ssize_t syncReadLine(int fd, char *ptr, ssize_t size, long long timeout) {
    ssize_t nread = 0;

    /* There must be room for at least the terminator. Without this check a
     * 'size' of zero or less would make the counter below negative, so the
     * loop would keep storing bytes beyond the end of the buffer. */
    if (size <= 0) {
        errno = EINVAL;
        return -1;
    }

    /* Terminate up front, so that the buffer is a valid empty string even
     * when we return 0 without storing anything: one byte buffer, or budget
     * exhausted by ignored empty newlines. */
    *ptr = '\0';

    size--; /* Reserve one byte for the terminator. */
    while(size) {
        char c;

        if (syncRead(fd,&c,1,timeout) == -1) return -1;
        if (c == '\n') {
            /* Ignore empty lines, otherwise return to the caller. */
            if (nread != 0) {
                *ptr = '\0';
                /* nread != 0 guarantees ptr-1 is inside the buffer. */
                if (*(ptr-1) == '\r') *(ptr-1) = '\0';
                return nread;
            }
        } else {
            /* Store the byte and keep the buffer terminated at every step. */
            *ptr++ = c;
            *ptr = '\0';
            nread++;
        }
        size--;
    }
    return nread;
}
|