From b7d20a2000e452ebe3be9eb5561cf62e60fe5ddc Mon Sep 17 00:00:00 2001 From: antirez Date: Wed, 4 Feb 2015 22:12:46 +0100 Subject: [PATCH 01/13] Less blocking dictGetRandomKeys(). Related to issue #2306. --- src/dict.c | 54 +++++++++++++++++++++++++++++++----------------------- 1 file changed, 31 insertions(+), 23 deletions(-) diff --git a/src/dict.c b/src/dict.c index 29d40009..097401d9 100644 --- a/src/dict.c +++ b/src/dict.c @@ -663,34 +663,42 @@ dictEntry *dictGetRandomKey(dict *d) * at producing N elements, and the elements are guaranteed to be non * repeating. */ unsigned int dictGetRandomKeys(dict *d, dictEntry **des, unsigned int count) { - int j; /* internal hash table id, 0 or 1. */ - unsigned int stored = 0; + unsigned int j; /* internal hash table id, 0 or 1. */ + unsigned int tables; /* 1 or 2 tables? */ + unsigned int stored = 0, maxsizemask; if (dictSize(d) < count) count = dictSize(d); - while(stored < count) { - for (j = 0; j < 2; j++) { - /* Pick a random point inside the hash table 0 or 1. */ - unsigned int i = random() & d->ht[j].sizemask; - int size = d->ht[j].size; - /* Make sure to visit every bucket by iterating 'size' times. */ - while(size--) { - dictEntry *he = d->ht[j].table[i]; - while (he) { - /* Collect all the elements of the buckets found non - * empty while iterating. */ - *des = he; - des++; - he = he->next; - stored++; - if (stored == count) return stored; - } - i = (i+1) & d->ht[j].sizemask; + /* Try to do a rehashing work proportional to 'count'. */ + for (j = 0; j < count; j++) { + if (dictIsRehashing(d)) + _dictRehashStep(d); + else + break; + } + + tables = dictIsRehashing(d) ? 2 : 1; + maxsizemask = d->ht[0].sizemask; + if (tables > 1 && maxsizemask < d->ht[1].sizemask) + maxsizemask = d->ht[1].sizemask; + + /* Pick a random point inside the larger table. */ + unsigned int i = random() & maxsizemask; + while(stored < count) { + for (j = 0; j < tables; j++) { + if (i >= d->ht[j].size) continue; /* Out of range for this table. */ + dictEntry *he = d->ht[j].table[i]; + while (he) { + /* Collect all the elements of the buckets found non + * empty while iterating. */ + *des = he; + des++; + he = he->next; + stored++; + if (stored == count) return stored; } - /* If there is only one table and we iterated it all, we should - * already have 'count' elements. Assert this condition. */ - assert(dictIsRehashing(d) != 0); } + i = (i+1) & maxsizemask; } return stored; /* Never reached. */ } From 40f64bc68a5684bf7a6def51dabef1bbc1c37076 Mon Sep 17 00:00:00 2001 From: antirez Date: Thu, 5 Feb 2015 10:42:09 +0100 Subject: [PATCH 02/13] dict.c: prevent useless resize to same size. Related to issue #2306. --- src/dict.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/dict.c b/src/dict.c index 097401d9..8f237a8a 100644 --- a/src/dict.c +++ b/src/dict.c @@ -211,6 +211,9 @@ int dictExpand(dict *d, unsigned long size) if (dictIsRehashing(d) || d->ht[0].used > size) return DICT_ERR; + /* Rehashing to the same table size is not useful. */ + if (realsize == d->ht[0].size) return DICT_ERR; + /* Allocate the new hash table and initialize all pointers to NULL */ n.size = realsize; n.sizemask = realsize-1; From e2ab7be33c2cbc84b5bb15f8519b319038abdbe6 Mon Sep 17 00:00:00 2001 From: antirez Date: Thu, 5 Feb 2015 10:51:05 +0100 Subject: [PATCH 03/13] dict.c: put a bound to max work dictRehash() call can do. Related to issue #2306. 
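Before patch 03's diff, it is worth making concrete the sampling strategy that patch 01 establishes: instead of restarting a full bucket walk inside each table, the function draws a single random index masked with the *larger* table's sizemask and walks forward linearly, skipping indexes that fall outside the smaller table. The following is a minimal standalone sketch of that visit pattern, not the Redis code itself; the toyentry/toytable types and the toy_get_keys helper are invented here for illustration.

/* Sketch (invented toy types, not Redis's dict.c) of the visit pattern
 * patch 01 introduces: one random start point masked with the larger
 * table's sizemask, then a linear walk over bucket indexes. */
#include <stdio.h>
#include <stdlib.h>

struct toyentry { int key; struct toyentry *next; };
struct toytable { struct toyentry **table; unsigned long size, sizemask; };

unsigned int toy_get_keys(struct toytable *ht, int tables,
                          struct toyentry **des, unsigned int count)
{
    unsigned long maxsizemask = ht[0].sizemask;
    if (tables > 1 && ht[1].sizemask > maxsizemask)
        maxsizemask = ht[1].sizemask;

    unsigned long i = random() & maxsizemask;   /* single random start */
    unsigned int stored = 0;
    while (stored < count) {
        for (int j = 0; j < tables; j++) {
            if (i >= ht[j].size) continue;      /* out of range for table j */
            for (struct toyentry *he = ht[j].table[i]; he; he = he->next) {
                des[stored++] = he;
                if (stored == count) return stored;
            }
        }
        i = (i+1) & maxsizemask;                /* next bucket index */
    }
    return stored;
}

int main(void)
{
    /* One 4-bucket table, one key per bucket; no rehashing in progress. */
    struct toyentry e[4];
    struct toyentry *slots[4] = { NULL, NULL, NULL, NULL };
    struct toytable ht[2] = { { slots, 4, 3 }, { NULL, 0, 0 } };
    for (int k = 0; k < 4; k++) { e[k].key = k; e[k].next = NULL; slots[k] = &e[k]; }

    struct toyentry *out[2];
    unsigned int n = toy_get_keys(ht, 1, out, 2);  /* sample 2 of 4 keys */
    for (unsigned int k = 0; k < n; k++) printf("key %d\n", out[k]->key);
    return 0;
}

As in the real function, the caller must clamp count to the number of stored entries first or the scan never terminates; patch 01 does this with "if (dictSize(d) < count) count = dictSize(d);".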
--- src/dict.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/src/dict.c b/src/dict.c index 8f237a8a..6ad424e0 100644 --- a/src/dict.c +++ b/src/dict.c @@ -235,9 +235,15 @@ int dictExpand(dict *d, unsigned long size) /* Performs N steps of incremental rehashing. Returns 1 if there are still * keys to move from the old to the new hash table, otherwise 0 is returned. + * * Note that a rehashing step consists in moving a bucket (that may have more - * than one key as we use chaining) from the old to the new hash table. */ + * than one key as we use chaining) from the old to the new hash table, however + * since part of the hash table may be composed of empty spaces, it is not + * guaranteed that this function will rehash even a single bucket, since it + * will visit at max N*10 empty buckets in total, otherwise the amount of + * work it does would be unbound and the function may block for a long time. */ int dictRehash(dict *d, int n) { + int empty_visits = n*10; /* Max number of empty buckets to visit. */ if (!dictIsRehashing(d)) return 0; while(n--) { @@ -255,7 +261,10 @@ int dictRehash(dict *d, int n) { /* Note that rehashidx can't overflow as we are sure there are more * elements because ht[0].used != 0 */ assert(d->ht[0].size > (unsigned long)d->rehashidx); - while(d->ht[0].table[d->rehashidx] == NULL) d->rehashidx++; + while(d->ht[0].table[d->rehashidx] == NULL) { + d->rehashidx++; + if (--empty_visits == 0) return 1; + } de = d->ht[0].table[d->rehashidx]; /* Move all the keys in this bucket from the old to the new hash HT */ while(de) { From eb4859db05ec5db6cfc7a2da19ddb447b5b6488c Mon Sep 17 00:00:00 2001 From: antirez Date: Thu, 5 Feb 2015 10:58:28 +0100 Subject: [PATCH 04/13] dict.c: dictGetRandomKeys() visit pattern optimization. We use the invariant that the original table ht[0] is never populated up to the index before the current rehashing index. Related to issue #2306. --- src/dict.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/dict.c b/src/dict.c index 6ad424e0..6afe15e5 100644 --- a/src/dict.c +++ b/src/dict.c @@ -698,6 +698,10 @@ unsigned int dictGetRandomKeys(dict *d, dictEntry **des, unsigned int count) { unsigned int i = random() & maxsizemask; while(stored < count) { for (j = 0; j < tables; j++) { + /* Invariant of the dict.c rehashing: up to the indexes already + * visited in ht[0] during the rehashing, there are no populated + * buckets, so we can skip ht[0] for indexes between 0 and idx-1. */ + if (j == 0 && i < d->rehashidx) continue; if (i >= d->ht[j].size) continue; /* Out of range for this table. */ dictEntry *he = d->ht[j].table[i]; while (he) { From a355568c1a9297e3126df2574805f29e4501c7ac Mon Sep 17 00:00:00 2001 From: antirez Date: Thu, 5 Feb 2015 11:28:45 +0100 Subject: [PATCH 05/13] dict.c: dictGetRandomKeys() optimization for big->small table case. Related to issue #2306. --- src/dict.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/dict.c b/src/dict.c index 6afe15e5..af3623d0 100644 --- a/src/dict.c +++ b/src/dict.c @@ -701,7 +701,14 @@ unsigned int dictGetRandomKeys(dict *d, dictEntry **des, unsigned int count) { /* Invariant of the dict.c rehashing: up to the indexes already * visited in ht[0] during the rehashing, there are no populated * buckets, so we can skip ht[0] for indexes between 0 and idx-1. 
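Before patch 05's diff, the empty_visits bound from patch 03 deserves a closer look, since it changes the function's contract: dictRehash(d, n) may now return 1 after moving no buckets at all, once it has crossed n*10 empty source slots. Below is a standalone sketch of the bounded loop, with invented toy types and with the end-of-rehash table swap that patches 07/08 settle on; free() stands in for zfree(), and the tables are assumed heap-allocated.

/* Sketch (toy types, not Redis's dict.c) of patch 03's bounded
 * incremental rehash. Returns 1 if more work remains, 0 when done. */
#include <stdlib.h>

struct toyentry { unsigned long hash; struct toyentry *next; };
struct toytable { struct toyentry **table; unsigned long size, sizemask, used; };
struct toydict  { struct toytable ht[2]; long rehashidx; /* -1 = idle */ };

int toy_rehash(struct toydict *d, int n)
{
    int empty_visits = n*10;            /* max empty buckets to cross */
    if (d->rehashidx == -1) return 0;

    while (n-- && d->ht[0].used != 0) {
        /* Skip empty source buckets, but only up to the budget: this
         * keeps one call from blocking on a huge, sparse table. Since
         * used != 0, a non-NULL bucket exists at some index >= rehashidx,
         * so this inner loop cannot run past the table. */
        while (d->ht[0].table[d->rehashidx] == NULL) {
            d->rehashidx++;
            if (--empty_visits == 0) return 1;
        }
        /* Move the whole chain found at rehashidx into ht[1]. */
        struct toyentry *de = d->ht[0].table[d->rehashidx], *nextde;
        while (de) {
            nextde = de->next;
            unsigned long h = de->hash & d->ht[1].sizemask;
            de->next = d->ht[1].table[h];
            d->ht[1].table[h] = de;
            d->ht[0].used--;
            d->ht[1].used++;
            de = nextde;
        }
        d->ht[0].table[d->rehashidx] = NULL;
        d->rehashidx++;
    }

    /* Done? Swap tables, as the real code does after patches 07/08. */
    if (d->ht[0].used == 0) {
        free(d->ht[0].table);           /* zfree() in Redis */
        d->ht[0] = d->ht[1];
        d->ht[1] = (struct toytable){0};
        d->rehashidx = -1;
        return 0;
    }
    return 1;
}

The price of the bound shows up in the return value: 1 no longer means a bucket was moved, only that rehashing is still in progress, which is exactly what patch 03's revised header comment warns about.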
*/ - if (j == 0 && i < d->rehashidx) continue; + if (tables == 2 && j == 0 && i < d->rehashidx) { + /* Moreover, if we are currently out of range in the second + * table, there will be no elements in both tables up to + * the current rehashing index, so we jump if possible. + * (this happens when going from big to small table). */ + if (i >= d->ht[1].size) i = d->rehashidx; + continue; + } if (i >= d->ht[j].size) continue; /* Out of range for this table. */ dictEntry *he = d->ht[j].table[i]; while (he) { From 252ebcd973614c14e81d7744826cd1ab0248546c Mon Sep 17 00:00:00 2001 From: antirez Date: Thu, 5 Feb 2015 12:15:58 +0100 Subject: [PATCH 06/13] dict.c: don't try buckets that are empty for sure in dictGetRandomKey(). This is very similar to the optimization applied to dictGetRandomKeys, but applied to the single key variant. Related to issue #2306. --- src/dict.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/dict.c b/src/dict.c index af3623d0..e0adfbc9 100644 --- a/src/dict.c +++ b/src/dict.c @@ -628,7 +628,11 @@ dictEntry *dictGetRandomKey(dict *d) if (dictIsRehashing(d)) _dictRehashStep(d); if (dictIsRehashing(d)) { do { - h = random() % (d->ht[0].size+d->ht[1].size); + /* We are sure there are no elements in indexes from 0 + * to rehashidx-1 */ + h = d->rehashidx + (random() % (d->ht[0].size + + d->ht[1].size - + d->rehashidx)); he = (h >= d->ht[0].size) ? d->ht[1].table[h - d->ht[0].size] : d->ht[0].table[h]; } while(he == NULL); From aa3de042324f4fd71dece81d7776e18d98ec376d Mon Sep 17 00:00:00 2001 From: Sun He Date: Fri, 6 Feb 2015 11:18:58 +0800 Subject: [PATCH 07/13] dict.c/dictRehash: check again to update --- src/dict.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/dict.c b/src/dict.c index e0adfbc9..9b2be81f 100644 --- a/src/dict.c +++ b/src/dict.c @@ -282,6 +282,13 @@ int dictRehash(dict *d, int n) { d->ht[0].table[d->rehashidx] = NULL; d->rehashidx++; } + /* Check again if we already rehashed the whole table... */ + if (d->ht[0].used == 0) { + zfree(d->ht[0].table); + d->ht[0] = d->ht[1]; + _dictReset(&d->ht[1]); + d->rehashidx = -1; + } return 1; } From 7014e1afc21a157ce293ff0462ab84a5ae4559ff Mon Sep 17 00:00:00 2001 From: antirez Date: Fri, 6 Feb 2015 10:48:13 +0100 Subject: [PATCH 08/13] dict.c: avoid code repetition in dictRehash(). Avoid code repetition introduced with PR #2367, also fixes the return value to always return 0 if there is nothing more to rehash. --- src/dict.c | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/src/dict.c b/src/dict.c index 9b2be81f..da6266dd 100644 --- a/src/dict.c +++ b/src/dict.c @@ -246,18 +246,9 @@ int dictRehash(dict *d, int n) { int empty_visits = n*10; /* Max number of empty buckets to visit. */ if (!dictIsRehashing(d)) return 0; - while(n--) { + while(n-- && d->ht[0].used != 0) { dictEntry *de, *nextde; - /* Check if we already rehashed the whole table... */ - if (d->ht[0].used == 0) { - zfree(d->ht[0].table); - d->ht[0] = d->ht[1]; - _dictReset(&d->ht[1]); - d->rehashidx = -1; - return 0; - } - /* Note that rehashidx can't overflow as we are sure there are more * elements because ht[0].used != 0 */ assert(d->ht[0].size > (unsigned long)d->rehashidx); @@ -282,13 +273,17 @@ int dictRehash(dict *d, int n) { d->ht[0].table[d->rehashidx] = NULL; d->rehashidx++; } - /* Check again if we already rehashed the whole table... */ + + /* Check if we already rehashed the whole table... 
*/ if (d->ht[0].used == 0) { zfree(d->ht[0].table); d->ht[0] = d->ht[1]; _dictReset(&d->ht[1]); d->rehashidx = -1; + return 0; } + + /* More to rehash... */ return 1; } From 05a398389fd8ac6e822bc1dd3bbd1b363ab270bd Mon Sep 17 00:00:00 2001 From: antirez Date: Fri, 6 Feb 2015 15:48:42 +0100 Subject: [PATCH 09/13] dict.c: add dictGetSomeKeys(), specialized for eviction. --- src/dict.c | 94 ++++++++++++++++++++++++++++++++++++++++++++++++++++- src/dict.h | 1 + src/redis.c | 2 +- 3 files changed, 95 insertions(+), 2 deletions(-) diff --git a/src/dict.c b/src/dict.c index da6266dd..69dfb757 100644 --- a/src/dict.c +++ b/src/dict.c @@ -661,7 +661,10 @@ dictEntry *dictGetRandomKey(dict *d) return he; } -/* This is a version of dictGetRandomKey() that is modified in order to +/* XXX: This is going to be removed soon and SPOP internals + * reimplemented. + * + * This is a version of dictGetRandomKey() that is modified in order to * return multiple entries by jumping at a random place of the hash table * and scanning linearly for entries. * @@ -732,6 +735,95 @@ unsigned int dictGetRandomKeys(dict *d, dictEntry **des, unsigned int count) { return stored; /* Never reached. */ } + +/* This function samples the dictionary to return a few keys from random + * locations. + * + * It does not guarantee to return all the keys specified in 'count', nor + * it does guarantee to return non-duplicated elements, however it will make + * some effort to do both things. + * + * Returned pointers to hash table entries are stored into 'des' that + * points to an array of dictEntry pointers. The array must have room for + * at least 'count' elements, that is the argument we pass to the function + * to tell how many random elements we need. + * + * The function returns the number of items stored into 'des', that may + * be less than 'count' if the hash table has less than 'count' elements + * inside, or if not enough elements were found in a reasonable amount of + * steps. + * + * Note that this function is not suitable when you need a good distribution + * of the returned items, but only when you need to "sample" a given number + * of continuous elements to run some kind of algorithm or to produce + * statistics. However the function is much faster than dictGetRandomKey() + * at producing N elements. */ +unsigned int dictGetSomeKeys(dict *d, dictEntry **des, unsigned int count) { + unsigned int j; /* internal hash table id, 0 or 1. */ + unsigned int tables; /* 1 or 2 tables? */ + unsigned int stored = 0, maxsizemask; + unsigned int maxsteps; + + if (dictSize(d) < count) count = dictSize(d); + maxsteps = count*10; + + /* Try to do a rehashing work proportional to 'count'. */ + for (j = 0; j < count; j++) { + if (dictIsRehashing(d)) + _dictRehashStep(d); + else + break; + } + + tables = dictIsRehashing(d) ? 2 : 1; + maxsizemask = d->ht[0].sizemask; + if (tables > 1 && maxsizemask < d->ht[1].sizemask) + maxsizemask = d->ht[1].sizemask; + + /* Pick a random point inside the larger table. */ + unsigned int i = random() & maxsizemask; + unsigned int emptylen = 0; /* Continuous empty entries so far. */ + while(stored < count && maxsteps--) { + for (j = 0; j < tables; j++) { + /* Invariant of the dict.c rehashing: up to the indexes already + * visited in ht[0] during the rehashing, there are no populated + * buckets, so we can skip ht[0] for indexes between 0 and idx-1. 
*/ + if (tables == 2 && j == 0 && i < d->rehashidx) { + /* Moreover, if we are currently out of range in the second + * table, there will be no elements in both tables up to + * the current rehashing index, so we jump if possible. + * (this happens when going from big to small table). */ + if (i >= d->ht[1].size) i = d->rehashidx; + continue; + } + if (i >= d->ht[j].size) continue; /* Out of range for this table. */ + dictEntry *he = d->ht[j].table[i]; + + /* Count contiguous empty buckets, and jump to other + * locations if they reach 'count' (with a minimum of 5). */ + if (he == NULL) { + emptylen++; + if (emptylen >= 5 && emptylen > count) { + i = random() & maxsizemask; + emptylen = 0; + } + } else { + while (he) { + /* Collect all the elements of the buckets found non + * empty while iterating. */ + *des = he; + des++; + he = he->next; + stored++; + if (stored == count) return stored; + } + } + } + i = (i+1) & maxsizemask; + } + return stored; +} + /* Function to reverse bits. Algorithm from: * http://graphics.stanford.edu/~seander/bithacks.html#ReverseParallel */ static unsigned long rev(unsigned long v) { diff --git a/src/dict.h b/src/dict.h index 7421078f..5959a57b 100644 --- a/src/dict.h +++ b/src/dict.h @@ -164,6 +164,7 @@ dictIterator *dictGetSafeIterator(dict *d); dictEntry *dictNext(dictIterator *iter); void dictReleaseIterator(dictIterator *iter); dictEntry *dictGetRandomKey(dict *d); +unsigned int dictGetSomeKeys(dict *d, dictEntry **des, unsigned int count); unsigned int dictGetRandomKeys(dict *d, dictEntry **des, unsigned int count); void dictPrintStats(dict *d); unsigned int dictGenHashFunction(const void *key, int len); diff --git a/src/redis.c b/src/redis.c index 07458328..964f7857 100644 --- a/src/redis.c +++ b/src/redis.c @@ -3137,7 +3137,7 @@ void evictionPoolPopulate(dict *sampledict, dict *keydict, struct evictionPoolEn } #if 1 /* Use bulk get by default. */ - count = dictGetRandomKeys(sampledict,samples,server.maxmemory_samples); + count = dictGetSomeKeys(sampledict,samples,server.maxmemory_samples); #else count = server.maxmemory_samples; for (j = 0; j < count; j++) samples[j] = dictGetRandomKey(sampledict); From eaae581b55822dabf07cb89d2a42796376db2609 Mon Sep 17 00:00:00 2001 From: antirez Date: Fri, 6 Feb 2015 16:17:11 +0100 Subject: [PATCH 10/13] Remove optional single-key path from evictionPoolPopulate(). --- src/redis.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/src/redis.c b/src/redis.c index 964f7857..a6470487 100644 --- a/src/redis.c +++ b/src/redis.c @@ -3136,13 +3136,7 @@ void evictionPoolPopulate(dict *sampledict, dict *keydict, struct evictionPoolEn samples = zmalloc(sizeof(samples[0])*server.maxmemory_samples); } -#if 1 /* Use bulk get by default. */ count = dictGetSomeKeys(sampledict,samples,server.maxmemory_samples); -#else - count = server.maxmemory_samples; - for (j = 0; j < count; j++) samples[j] = dictGetRandomKey(sampledict); -#endif - for (j = 0; j < count; j++) { unsigned long long idle; sds key; From 08ad51762bdeabbc8dc416052a44bc59da7f10e2 Mon Sep 17 00:00:00 2001 From: antirez Date: Sat, 7 Feb 2015 09:54:07 +0100 Subject: [PATCH 11/13] redis-cli --stat: show LOAD when loading. 
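Before the redis-cli change below, two details of the dict sampling patches above are worth making concrete. First, the arithmetic in patch 06: with ht[0].size == 8, ht[1].size == 16 and rehashidx == 3, h is drawn uniformly from [3, 24); h < 8 picks bucket h of ht[0], anything else picks bucket h-8 of ht[1], and buckets 0..2 of ht[0], which the rehashing invariant guarantees are empty, are never sampled. Second, the emptylen heuristic of dictGetSomeKeys() in patch 09, sketched below with invented toy types over a single table (the real function layers this on top of the two-table walk and the rehashidx skip shown earlier); the reset to zero on a non-empty bucket anticipates the fix patch 12 applies further down.

/* Sketch (toy types) of the "give up on empty runs" heuristic in
 * dictGetSomeKeys(): after a long run of contiguous empty buckets,
 * jump to a fresh random point instead of crawling a sparse region. */
#include <stdlib.h>

struct toyentry { int key; struct toyentry *next; };

unsigned int toy_sample(struct toyentry **table, unsigned long sizemask,
                        struct toyentry **des, unsigned int count)
{
    unsigned long i = random() & sizemask;
    unsigned int stored = 0, emptylen = 0;
    unsigned int maxsteps = count * 10;     /* hard bound on total work */

    while (stored < count && maxsteps--) {
        struct toyentry *he = table[i];
        if (he == NULL) {
            /* Re-jump after >= 5 contiguous empty buckets (and more
             * than 'count' of them), as in patch 09. */
            if (++emptylen >= 5 && emptylen > count) {
                i = random() & sizemask;
                emptylen = 0;
            }
        } else {
            emptylen = 0;                   /* the patch 12 fix */
            for (; he; he = he->next) {
                des[stored++] = he;
                if (stored == count) return stored;
            }
        }
        i = (i+1) & sizemask;
    }
    return stored;  /* possibly fewer than count: sampling, not selection */
}

This is why the new function is named dictGetSomeKeys rather than dictGetRandomKeys: with the maxsteps budget it may legitimately return fewer than count elements, which is acceptable for eviction sampling but not for the old SPOP-style callers.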
--- src/redis-cli.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/redis-cli.c b/src/redis-cli.c index 8773d8e7..2473b4f3 100644 --- a/src/redis-cli.c +++ b/src/redis-cli.c @@ -1943,6 +1943,7 @@ static void statMode(void) { /* Children */ aux = getLongInfoField(reply->str,"bgsave_in_progress"); aux |= getLongInfoField(reply->str,"aof_rewrite_in_progress") << 1; + aux |= getLongInfoField(reply->str,"loading") << 2; switch(aux) { case 0: break; case 1: @@ -1954,6 +1955,9 @@ static void statMode(void) { case 3: printf("SAVE+AOF"); break; + case 4: + printf("LOAD"); + break; } printf("\n"); From 541eb23895ab630cd6e208f2348cdd43ab8fddd8 Mon Sep 17 00:00:00 2001 From: antirez Date: Sun, 8 Feb 2015 11:19:47 +0100 Subject: [PATCH 12/13] dict.c: reset emptylen when bucket is not empty. Fixed by @oranagra, thank you. --- src/dict.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/dict.c b/src/dict.c index 69dfb757..43add192 100644 --- a/src/dict.c +++ b/src/dict.c @@ -808,6 +808,7 @@ unsigned int dictGetSomeKeys(dict *d, dictEntry **des, unsigned int count) { emptylen = 0; } } else { + emptylen = 0; while (he) { /* Collect all the elements of the buckets found non * empty while iterating. */ From 967590de6efcbced49825d99fba984255993d403 Mon Sep 17 00:00:00 2001 From: antirez Date: Mon, 9 Feb 2015 15:17:20 +0100 Subject: [PATCH 13/13] Separate latency monitoring of eviction loop and eviction DELs. --- src/latency.c | 12 +++++++++++- src/latency.h | 4 ++++ src/redis.c | 6 +++++- 3 files changed, 20 insertions(+), 2 deletions(-) diff --git a/src/latency.c b/src/latency.c index cb116fb9..fd76b321 100644 --- a/src/latency.c +++ b/src/latency.c @@ -228,6 +228,7 @@ sds createLatencyReport(void) { int advise_write_load_info = 0; /* Print info about AOF and write load. */ int advise_hz = 0; /* Use higher HZ. */ int advise_large_objects = 0; /* Deletion of large objects. */ + int advise_mass_eviction = 0; /* Avoid mass eviction of keys. */ int advise_relax_fsync_policy = 0; /* appendfsync always is slow. */ int advise_disable_thp = 0; /* AnonHugePages detected. */ int advices = 0; @@ -364,11 +365,16 @@ sds createLatencyReport(void) { } /* Eviction cycle. */ - if (!strcasecmp(event,"eviction-cycle")) { + if (!strcasecmp(event,"eviction-del")) { advise_large_objects = 1; advices++; } + if (!strcasecmp(event,"eviction-cycle")) { + advise_mass_eviction = 1; + advices++; + } + report = sdscatlen(report,"\n",1); } dictReleaseIterator(di); @@ -452,6 +458,10 @@ sds createLatencyReport(void) { report = sdscat(report,"- Deleting, expiring or evicting (because of maxmemory policy) large objects is a blocking operation. If you have very large objects that are often deleted, expired, or evicted, try to fragment those objects into multiple smaller objects.\n"); } + if (advise_mass_eviction) { + report = sdscat(report,"- Sudden changes to the 'maxmemory' setting via 'CONFIG SET', or allocation of large objects via sets or sorted sets intersections, STORE option of SORT, Redis Cluster large keys migrations (RESTORE command), may create sudden memory pressure forcing the server to block trying to evict keys. \n"); + } + if (advise_disable_thp) { report = sdscat(report,"- I detected a non zero amount of anonymous huge pages used by your process. This creates very serious latency events in different conditions, especially when Redis is persisting on disk. 
To disable THP support use the command 'echo never > /sys/kernel/mm/transparent_hugepage/enabled', make sure to also add it into /etc/rc.local so that the command will be executed again after a reboot. Note that even if you have already disabled THP, you still need to restart the Redis process to get rid of the huge pages already created.\n"); } diff --git a/src/latency.h b/src/latency.h index 240f54b4..0fe26e0e 100644 --- a/src/latency.h +++ b/src/latency.h @@ -86,4 +86,8 @@ int THPIsEnabled(void); (var) >= server.latency_monitor_threshold) \ latencyAddSample((event),(var)); +/* Remove time from a nested event. */ +#define latencyRemoveNestedEvent(event_var,nested_var) \ + event_var += nested_var; + #endif /* __LATENCY_H */ diff --git a/src/redis.c b/src/redis.c index a6470487..04d54ade 100644 --- a/src/redis.c +++ b/src/redis.c @@ -3191,7 +3191,7 @@ void evictionPoolPopulate(dict *sampledict, dict *keydict, struct evictionPoolEn int freeMemoryIfNeeded(void) { size_t mem_used, mem_tofree, mem_freed; int slaves = listLength(server.slaves); - mstime_t latency; + mstime_t latency, eviction_latency; /* Remove the size of slaves output buffers and AOF buffer from the * count of used memory. */ @@ -3322,7 +3322,11 @@ int freeMemoryIfNeeded(void) { * AOF and Output buffer memory will be freed eventually so * we only care about memory used by the key space. */ delta = (long long) zmalloc_used_memory(); + latencyStartMonitor(eviction_latency); dbDelete(db,keyobj); + latencyEndMonitor(eviction_latency); + latencyAddSampleIfNeeded("eviction-del",eviction_latency); + latencyRemoveNestedEvent(latency,eviction_latency); delta -= (long long) zmalloc_used_memory(); mem_freed += delta; server.stat_evictedkeys++;
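A closing note on patch 13: the += in latencyRemoveNestedEvent looks inverted for a macro whose comment says it removes time, but it is correct given the evaluation order in freeMemoryIfNeeded(). When the macro runs, latency still holds the start timestamp stored by latencyStartMonitor(), not an elapsed time; pushing that timestamp forward by the nested DEL duration makes the later latencyEndMonitor(latency) subtraction yield the cycle time minus the DELs. The sketch below demonstrates the arithmetic; the mstime() stub and the Start/End macro bodies are assumptions modeled on their usual Redis definitions, which this diff does not show (the real macros also check server.latency_monitor_threshold).

/* Demonstrates why latencyRemoveNestedEvent() adds: the outer variable
 * is still a start timestamp when the macro runs. The macro bodies are
 * assumed simplified equivalents of the Redis ones. */
#include <stdio.h>
#include <sys/time.h>

typedef long long mstime_t;

static mstime_t mstime(void) {
    struct timeval tv;
    gettimeofday(&tv, NULL);
    return ((mstime_t)tv.tv_sec)*1000 + tv.tv_usec/1000;
}

#define latencyStartMonitor(var) ((var) = mstime())
#define latencyEndMonitor(var)   ((var) = mstime() - (var))
#define latencyRemoveNestedEvent(event_var,nested_var) ((event_var) += (nested_var))

int main(void)
{
    mstime_t latency, eviction_latency;

    latencyStartMonitor(latency);           /* latency = T0 */

    /* ... eviction loop work ... */

    latencyStartMonitor(eviction_latency);  /* nested DEL measurement */
    /* ... dbDelete() stand-in ... */
    latencyEndMonitor(eviction_latency);    /* eviction_latency = DEL time */

    /* latency becomes T0 + DEL time, so the final subtraction below
     * yields (now - T0) - DEL time: cycle time excluding nested DELs. */
    latencyRemoveNestedEvent(latency, eviction_latency);

    latencyEndMonitor(latency);
    printf("eviction-cycle (excluding DELs): %lld ms\n", latency);
    return 0;
}

The effect is the split the commit title promises: "eviction-del" samples capture the cost of deleting individual, possibly huge, objects and feed the advise_large_objects suggestion, while "eviction-cycle" now measures only the eviction loop's own overhead and feeds the new advise_mass_eviction text.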