Hi Dormando, Thank you for your insightful answer. I implemented the autoshrink mechanism in 1.4, because I could not find slab mover in 1.6. The patch is attached, and I will be happy to hear your opinion. It contains the following content: " Added slab autoshrink: dynamic memory support. mem_limit can now be shrunk or expanded in mid-run using the maxmegabytes command. It uses the infrastructure of slab automove, prefering slab shink to slab automove if both mechanisms are active. It is tested in the new test t/slabs_shrink.t.
Bugfix: when USE_SYSTEM_MALLOC, asserting clsid is zero accessed uninitialized memory. Fixed t/binary.t for number of tests. Improved error message in devtools/clean-whitespace.pl " Next, I intend to enable the user to query the mem_limit and mem_allocated current sizes, and to improve the user control over the speed in which slabs are freed. Currently it is always one slab at a time, at most 1 per second, which gives a rate of 1MB/sec. This is not enough to changing 200MB within seconds. Thanks Orna On Thursday, June 14, 2012 10:41:21 AM UTC+3, Dormando wrote: > > > > > Hello Yiftach, Dormando and everyone, > > > > I work with Eyal exactly on that: OSes that get and lose physical memory > at runtime. > > We are interested in memcached because it is an important cloud > benchmark which stresses the memory. > > > > I think the way memcached deals with changes in the value size > distribution has to do with dynamic memory. > > If memchaced caches many small objects, many slabs for small-size items > are allocated. If then the distribution changes, and suddenly all objects > are large-sized, > > then at some point small-size slabs need to be freed, or at least > cleared and replaced by large-size slabs. If this is indeed what happens, > we could take advantage > > of the point in time when a slab is freed or cleared, and reclaim the > slab (assuming the memory was not preallocated). > > I found a comment saying /* so slab size changer can tell later if item > is already free or not */", but I could not find the implementation of such > a mechanism. > > > > Do you find this a reasonable approach? > > You're making some assumptions between the LRU and slab allocation. A slab > will be full of completely random allocations, ejecting memory to free it > up will lose a bunch of items. It also doesn't do what you suggest, not > natively. Before 1.4.11 slab memory allocations were static. If you loaded > small items, then large items, your large items would have no memory. > > http://code.google.com/p/memcached/wiki/ReleaseNotes1411 > > If you look at how the slab mover is implemented, you could inject > yourself into the slab mover, and instead of moving a slab between one > class and another, free it once the items are cleared. > > It is, as I said, not something you want to do all willy-nilly. Preserving > the items in the slab page you intend to move requires a lot more careful > work than I was willing to do at the time. It might be too slow as well.
diff --git a/devtools/clean-whitespace.pl b/devtools/clean-whitespace.pl
index 95481ef..18a7cfc 100755
--- a/devtools/clean-whitespace.pl
+++ b/devtools/clean-whitespace.pl
@@ -26,7 +26,7 @@ foreach my $f (@files) {
$after =~ s/ +$//mg;
$after .= "\n" unless $after =~ /\n$/;
next if $after eq $before;
- open(my $fh, ">$f") or die;
+ open(my $fh, ">$f") or die "Could not open for write file $f\n";
print $fh $after;
close($fh);
}
diff --git a/items.c b/items.c
index ab588db..1d75387 100644
--- a/items.c
+++ b/items.c
@@ -175,7 +175,11 @@ item *do_item_alloc(char *key, const size_t nkey, const int flags, const rel_tim
return NULL;
}
+#ifdef USE_SYSTEM_MALLOC
+ it->slabs_clsid=0;
+#else
assert(it->slabs_clsid == 0);
+#endif
assert(it != heads[id]);
/* Item initialization can happen outside of the lock; the item's already
diff --git a/memcached.c b/memcached.c
index b121e16..7f3a8a5 100644
--- a/memcached.c
+++ b/memcached.c
@@ -174,6 +174,7 @@ static void stats_init(void) {
stats.hash_power_level = stats.hash_bytes = stats.hash_is_expanding = 0;
stats.expired_unfetched = stats.evicted_unfetched = 0;
stats.slabs_moved = 0;
+ stats.slabs_shrunk = 0;
stats.accepting_conns = true; /* assuming we start in this state. */
stats.slab_reassign_running = false;
@@ -225,6 +226,7 @@ static void settings_init(void) {
settings.hashpower_init = 0;
settings.slab_reassign = false;
settings.slab_automove = false;
+ settings.slab_autoshrink = false;
}
/*
@@ -2590,6 +2592,10 @@ static void server_stats(ADD_STAT add_stats, conn *c) {
if (settings.slab_reassign) {
APPEND_STAT("slab_reassign_running", "%u", stats.slab_reassign_running);
APPEND_STAT("slabs_moved", "%llu", stats.slabs_moved);
+
+ if (settings.slab_autoshrink) {
+ APPEND_STAT("slabs_shrunk", "%llu", stats.slabs_shrunk);
+ }
}
STATS_UNLOCK();
}
@@ -2625,6 +2631,7 @@ static void process_stat_settings(ADD_STAT add_stats, void *c) {
APPEND_STAT("hashpower_init", "%d", settings.hashpower_init);
APPEND_STAT("slab_reassign", "%s", settings.slab_reassign ? "yes" : "no");
APPEND_STAT("slab_automove", "%s", settings.slab_automove ? "yes" : "no");
+ APPEND_STAT("slab_autoshrink", "%s", settings.slab_autoshrink ? "yes" : "no");
}
static void process_stat(conn *c, token_t *tokens, const size_t ntokens) {
@@ -3184,6 +3191,24 @@ static void process_delete_command(conn *c, token_t *tokens, const size_t ntoken
}
}
+
+static void process_maxmegabytes_command(conn *c, token_t *tokens, const size_t ntokens) {
+ unsigned long int newmaxbytes;
+
+ assert(c != NULL);
+
+ set_noreply_maybe(c, tokens, ntokens);
+
+ newmaxbytes=strtol(tokens[COMMAND_TOKEN+1].value, NULL, 10) *1024 * 1024;
+ if (memory_shrink_expand(newmaxbytes))
+ out_string(c, "ERROR: Could not change memory to new value");
+ else
+ settings.maxbytes = newmaxbytes;
+ out_string(c, "OK");
+ return;
+}
+
+
static void process_verbosity_command(conn *c, token_t *tokens, const size_t ntokens) {
unsigned int level;
@@ -3197,7 +3222,9 @@ static void process_verbosity_command(conn *c, token_t *tokens, const size_t nto
return;
}
-static void process_slabs_automove_command(conn *c, token_t *tokens, const size_t ntokens) {
+/**
+ Handle both automove and autoshrink*/
+static void process_slabs_auto_command(conn *c, token_t *tokens, const size_t ntokens, bool * const autovar) {
unsigned int level;
assert(c != NULL);
@@ -3206,17 +3233,18 @@ static void process_slabs_automove_command(conn *c, token_t *tokens, const size_
level = strtoul(tokens[2].value, NULL, 10);
if (level == 0) {
- settings.slab_automove = false;
+ *autovar = false;
} else if (level == 1) {
- settings.slab_automove = true;
+ *autovar = true;
} else {
- out_string(c, "ERROR");
+ out_string(c, "ERROR: auto var level must be 0 or 1");
return;
}
out_string(c, "OK");
return;
}
+
static void process_command(conn *c, char *command) {
token_t tokens[MAX_TOKENS];
@@ -3373,12 +3401,23 @@ static void process_command(conn *c, char *command) {
break;
}
return;
- } else if (ntokens == 4 &&
- (strcmp(tokens[COMMAND_TOKEN + 1].value, "automove") == 0)) {
- process_slabs_automove_command(c, tokens, ntokens);
} else {
- out_string(c, "ERROR");
+ if (ntokens == 4 &&
+ (strcmp(tokens[COMMAND_TOKEN + 1].value, "automove") == 0)) {
+ process_slabs_auto_command(c, tokens, ntokens,&(settings.slab_automove));
+ } else {
+ if (ntokens == 4 &&
+ (strcmp(tokens[COMMAND_TOKEN + 1].value, "autoshrink") == 0)) {
+ process_slabs_auto_command(c, tokens, ntokens,&(settings.slab_autoshrink));
+ }else {
+ out_string(c, "ERROR: bad slabs command");
+ }
+ }
}
+
+
+ } else if (ntokens == 3 && (strcmp(tokens[COMMAND_TOKEN].value, "maxmegabytes") == 0)) {
+ process_maxmegabytes_command(c, tokens, ntokens);
} else if ((ntokens == 3 || ntokens == 4) && (strcmp(tokens[COMMAND_TOKEN].value, "verbosity") == 0)) {
process_verbosity_command(c, tokens, ntokens);
} else {
@@ -4730,13 +4769,15 @@ int main (int argc, char **argv) {
MAXCONNS_FAST = 0,
HASHPOWER_INIT,
SLAB_REASSIGN,
- SLAB_AUTOMOVE
+ SLAB_AUTOMOVE,
+ SLAB_AUTOSHRINK
};
char *const subopts_tokens[] = {
[MAXCONNS_FAST] = "maxconns_fast",
[HASHPOWER_INIT] = "hashpower",
[SLAB_REASSIGN] = "slab_reassign",
[SLAB_AUTOMOVE] = "slab_automove",
+ [SLAB_AUTOSHRINK] = "slab_autoshrink",
NULL
};
@@ -4988,6 +5029,9 @@ int main (int argc, char **argv) {
case SLAB_AUTOMOVE:
settings.slab_automove = true;
break;
+ case SLAB_AUTOSHRINK:
+ settings.slab_autoshrink = true;
+ break;
default:
printf("Illegal suboption \"%s\"\n", subopts_value);
return 1;
diff --git a/memcached.h b/memcached.h
index 97a79d0..3ddd0b9 100644
--- a/memcached.h
+++ b/memcached.h
@@ -23,6 +23,7 @@
#include "sasl_defs.h"
+
/** Maximum length of a key. */
#define KEY_MAX_LENGTH 250
@@ -267,6 +268,7 @@ struct stats {
uint64_t evicted_unfetched; /* items evicted but never touched */
bool slab_reassign_running; /* slab reassign in progress */
uint64_t slabs_moved; /* times slabs were moved around */
+ uint64_t slabs_shrunk; /* times slabs were shrunk */
};
#define MAX_VERBOSITY_LEVEL 2
@@ -302,6 +304,7 @@ struct settings {
bool maxconns_fast; /* Whether or not to early close connections */
bool slab_reassign; /* Whether or not slab reassignment is allowed */
bool slab_automove; /* Whether or not to automatically move slabs */
+ bool slab_autoshrink; /* Whether or not to automatically shrink memory use*/
int hashpower_init; /* Starting hash power level */
};
diff --git a/slabs.c b/slabs.c
index 2483fbf..4f637bd 100644
--- a/slabs.c
+++ b/slabs.c
@@ -8,6 +8,7 @@
* memcached protocol.
*/
#include "memcached.h"
+#include <limits.h>
#include <sys/stat.h>
#include <sys/socket.h>
#include <sys/signal.h>
@@ -424,6 +425,23 @@ static void *memory_allocate(size_t size) {
return ret;
}
+
+int memory_shrink_expand(const size_t size) {
+
+ if (mem_base == NULL) {
+ /* We are not using a preallocated large memory chunk */
+ if (size<settings.item_size_max)
+ return 2;
+ pthread_mutex_lock(&slabs_lock);
+ mem_limit=size;
+ pthread_mutex_unlock(&slabs_lock);
+ return 0;
+ } else {
+ return 1;
+ }
+}
+
+
void *slabs_alloc(size_t size, unsigned int id) {
void *ret;
@@ -462,6 +480,9 @@ void slabs_adjust_mem_requested(unsigned int id, size_t old, size_t ntotal)
static pthread_cond_t maintenance_cond = PTHREAD_COND_INITIALIZER;
static volatile int do_run_slab_thread = 1;
+#define AUTOMOVE_DECISION_SECONDS 10
+#define AUTOSHRINK_DECISION_SECONDS 1
+
#define DEFAULT_SLAB_BULK_CHECK 1
int slab_bulk_check = DEFAULT_SLAB_BULK_CHECK;
@@ -469,25 +490,33 @@ static int slab_rebalance_start(void) {
slabclass_t *s_cls;
slabclass_t *d_cls;
int no_go = 0;
+ bool shrink=(slab_rebal.d_clsid==0);
pthread_mutex_lock(&cache_lock);
pthread_mutex_lock(&slabs_lock);
if (slab_rebal.s_clsid < POWER_SMALLEST ||
slab_rebal.s_clsid > power_largest ||
- slab_rebal.d_clsid < POWER_SMALLEST ||
- slab_rebal.d_clsid > power_largest ||
+ (!shrink && (
+ slab_rebal.d_clsid < POWER_SMALLEST ||
+ slab_rebal.d_clsid > power_largest ) )||
slab_rebal.s_clsid == slab_rebal.d_clsid)
no_go = -2;
s_cls = &slabclass[slab_rebal.s_clsid];
- d_cls = &slabclass[slab_rebal.d_clsid];
- if (d_cls->end_page_ptr || s_cls->end_page_ptr ||
- !grow_slab_list(slab_rebal.d_clsid)) {
+ if (s_cls->end_page_ptr){
no_go = -1;
}
+ if (!shrink){
+ d_cls = &slabclass[slab_rebal.d_clsid];
+ if (d_cls->end_page_ptr ||
+ !grow_slab_list(slab_rebal.d_clsid)) {
+ no_go = -1;
+ }
+ }
+
if (s_cls->slabs < 2)
no_go = -3;
@@ -509,7 +538,7 @@ static int slab_rebalance_start(void) {
slab_rebalance_signal = 2;
if (settings.verbose > 1) {
- fprintf(stderr, "Started a slab rebalance\n");
+ fprintf(stderr, "Started a slab %s\n",shrink?"shrink":"rebalance");
}
pthread_mutex_unlock(&slabs_lock);
@@ -619,24 +648,33 @@ static int slab_rebalance_move(void) {
static void slab_rebalance_finish(void) {
slabclass_t *s_cls;
slabclass_t *d_cls;
+ bool shrink=(slab_rebal.d_clsid==0);
pthread_mutex_lock(&cache_lock);
pthread_mutex_lock(&slabs_lock);
s_cls = &slabclass[slab_rebal.s_clsid];
- d_cls = &slabclass[slab_rebal.d_clsid];
/* At this point the stolen slab is completely clear */
s_cls->slab_list[s_cls->killing - 1] =
s_cls->slab_list[s_cls->slabs - 1];
- s_cls->slabs--;
+ s_cls->slabs-=s_cls->killing;/*Planning for a number of slabs freed at the same operation*/
s_cls->killing = 0;
- memset(slab_rebal.slab_start, 0, (size_t)settings.item_size_max);
+ if (shrink){
+ ((item *)(slab_rebal.slab_start))->slabs_clsid = 0;
+ if (mem_base==NULL){
+ free(slab_rebal.slab_start);
+ mem_malloced -= settings.item_size_max;
+ }
+ }else{
+ memset(slab_rebal.slab_start, 0, (size_t)settings.item_size_max);
- d_cls->slab_list[d_cls->slabs++] = slab_rebal.slab_start;
- d_cls->end_page_ptr = slab_rebal.slab_start;
- d_cls->end_page_free = d_cls->perslab;
+ d_cls = &slabclass[slab_rebal.d_clsid];
+ d_cls->slab_list[d_cls->slabs++] = slab_rebal.slab_start;
+ d_cls->end_page_ptr = slab_rebal.slab_start;
+ d_cls->end_page_free = d_cls->perslab;
+ }
slab_rebal.done = 0;
slab_rebal.s_clsid = 0;
@@ -652,36 +690,47 @@ static void slab_rebalance_finish(void) {
STATS_LOCK();
stats.slab_reassign_running = false;
- stats.slabs_moved++;
+ if (shrink)
+ stats.slabs_shrunk++;
+ else
+ stats.slabs_moved++;
+
STATS_UNLOCK();
if (settings.verbose > 1) {
- fprintf(stderr, "finished a slab move\n");
+ fprintf(stderr, "Finished a slab %s\n",shrink?"shrink":"move");
}
}
-/* Return 1 means a decision was reached.
+/* Return 1 means a decision was reached for the source.
+ Return 2 menas a decision was reached for a destination as well.
+ Return 0 if no decision was made.
* Move to its own thread (created/destroyed as needed) once automover is more
* complex.
*/
-static int slab_automove_decision(int *src, int *dst) {
+static int slab_automove_decision(int *src, int *dst, const bool shrink_now) {
static uint64_t evicted_old[POWER_LARGEST];
+/*Record the number of consecutive times
+ in which a slab had zero evictions*/
static unsigned int slab_zeroes[POWER_LARGEST];
static unsigned int slab_winner = 0;
static unsigned int slab_wins = 0;
uint64_t evicted_new[POWER_LARGEST];
- uint64_t evicted_diff = 0;
+ uint64_t evicted_diff[POWER_LARGEST];
uint64_t evicted_max = 0;
+ uint64_t evicted_min = ULONG_MAX;
unsigned int highest_slab = 0;
unsigned int total_pages[POWER_LARGEST];
int i;
int source = 0;
+ int emergency_source = 0;
int dest = 0;
static rel_time_t next_run;
/* Run less frequently than the slabmove tester. */
if (current_time >= next_run) {
- next_run = current_time + 10;
+ int decision_seconds=shrink_now?AUTOSHRINK_DECISION_SECONDS:AUTOMOVE_DECISION_SECONDS;
+ next_run = current_time + decision_seconds;
} else {
return 0;
}
@@ -693,24 +742,43 @@ static int slab_automove_decision(int *src, int *dst) {
}
pthread_mutex_unlock(&cache_lock);
- /* Find a candidate source; something with zero evicts 3+ times */
+ /* Find a candidate source; something with zero evicts 3+ times.
+ This algorithm prefers larger powers as a source. */
for (i = POWER_SMALLEST; i < power_largest; i++) {
- evicted_diff = evicted_new[i] - evicted_old[i];
- if (evicted_diff == 0 && total_pages[i] > 2) {
+ evicted_diff[i] = evicted_new[i] - evicted_old[i];
+
+ if (evicted_diff[i] == 0 && total_pages[i] > 2) {
slab_zeroes[i]++;
if (source == 0 && slab_zeroes[i] >= 3)
source = i;
} else {
+ /*Search for the best destination according
+ to the current statistics*/
slab_zeroes[i] = 0;
- if (evicted_diff > evicted_max) {
- evicted_max = evicted_diff;
+ if (evicted_diff[i] > evicted_max) {
+ evicted_max = evicted_diff[i];
highest_slab = i;
}
}
+
+ if (settings.verbose > 2 && total_pages[i]) {
+ fprintf(stderr,
+ "total pages: slab class %d diff %ld slabs %d\n",
+ i,evicted_diff[i],total_pages[i]);
+ }
+
+ if (shrink_now && evicted_diff[i] < evicted_min && (total_pages[i] >= 2)){
+ /*we verify that there are enough slabs in the emergency source,
+ otherwise we don't have anything to take from.
+ If we wait to slab_reassign with this check we might hit a neverending loop.*/
+ evicted_min=evicted_diff[i];
+ emergency_source=i;
+ }
+
evicted_old[i] = evicted_new[i];
}
- /* Pick a valid destination */
+ /* Pick a valid destination: a destination which won 3 times in a row*/
if (slab_winner != 0 && slab_winner == highest_slab) {
slab_wins++;
if (slab_wins >= 3)
@@ -720,9 +788,18 @@ static int slab_automove_decision(int *src, int *dst) {
slab_winner = highest_slab;
}
- if (source && dest) {
+ if (shrink_now && !source){
+ source=emergency_source;
+ dest=0;/*just in case*/
+ }
+
+
+ if (source) {
*src = source;
*dst = dest;
+ if (dest) {
+ return 2;
+ }
return 1;
}
return 0;
@@ -746,9 +823,22 @@ static void *slab_maintenance_thread(void *arg) {
} else if (slab_rebalance_signal && slab_rebal.slab_start != NULL) {
/* If we have a decision to continue, continue it */
was_busy = slab_rebalance_move();
- } else if (settings.slab_automove && slab_automove_decision(&src, &dest) == 1) {
- /* Blind to the return codes. It will retry on its own */
- slabs_reassign(src, dest);
+ } else {
+ bool shrink_now= settings.slab_autoshrink && mem_limit &&
+ (mem_malloced > mem_limit);
+ if (shrink_now || settings.slab_automove) {
+ int decision=slab_automove_decision(&src, &dest, shrink_now);
+ /* Blind to the return codes. It will retry on its own */
+ if (shrink_now && decision > 0) {
+ /*We do not pass dest here, but rather 0,
+ so that even if a destination was found,
+ shrinkage will take precedence over reassigning*/
+ slabs_reassign(src, 0);
+ }else if (settings.slab_automove && decision == 2) {
+ /*Only automove memory when no shrinkage is required*/
+ slabs_reassign(src, dest);
+ }
+ }
}
if (slab_rebal.done) {
@@ -770,18 +860,17 @@ static enum reassign_result_type do_slabs_reassign(int src, int dst) {
return REASSIGN_SRC_DST_SAME;
if (src < POWER_SMALLEST || src > power_largest ||
- dst < POWER_SMALLEST || dst > power_largest)
+ (dst!=0 && (dst < POWER_SMALLEST || dst > power_largest)))
return REASSIGN_BADCLASS;
if (slabclass[src].slabs < 2)
return REASSIGN_NOSPARE;
- if (slabclass[dst].end_page_ptr)
+ if ((dst!=0) && slabclass[dst].end_page_ptr)
return REASSIGN_DEST_NOT_FULL;
if (slabclass[src].end_page_ptr)
return REASSIGN_SRC_NOT_SAFE;
-
slab_rebal.s_clsid = src;
slab_rebal.d_clsid = dst;
@@ -790,6 +879,7 @@ static enum reassign_result_type do_slabs_reassign(int src, int dst) {
return REASSIGN_OK;
}
+
enum reassign_result_type slabs_reassign(int src, int dst) {
enum reassign_result_type ret;
mutex_lock(&slabs_lock);
@@ -811,6 +901,9 @@ int start_slab_maintenance_thread(void) {
slab_bulk_check = DEFAULT_SLAB_BULK_CHECK;
}
}
+ if (settings.verbose >1)
+ fprintf(stderr,"starting slab mainainance\n");
+
if ((ret = pthread_create(&maintenance_tid, NULL,
slab_maintenance_thread, NULL)) != 0) {
fprintf(stderr, "Can't create thread: %s\n", strerror(ret));
diff --git a/slabs.h b/slabs.h
index 730474b..1db4669 100644
--- a/slabs.h
+++ b/slabs.h
@@ -27,6 +27,9 @@ void slabs_free(void *ptr, size_t size, unsigned int id);
/** Adjust the stats for memory requested */
void slabs_adjust_mem_requested(unsigned int id, size_t old, size_t ntotal);
+/** Adjust the limit on memory in run time*/
+int memory_shrink_expand(const size_t size) ;
+
/** Return a datum for stats in binary protocol */
bool get_stats(const char *stat_type, int nkey, ADD_STAT add_stats, void *c);
diff --git a/t/binary.t b/t/binary.t
index 504ddef..fb049c0 100755
--- a/t/binary.t
+++ b/t/binary.t
@@ -2,7 +2,7 @@
use strict;
use warnings;
-use Test::More tests => 3539;
+use Test::More tests => 3542;
use FindBin qw($Bin);
use lib "$Bin/lib";
use MemcachedTest;
slabs_shrink.t
Description: Troff document
