Blame slabs.c

Packit 4e8bc4
/* -*- Mode: C; tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*- */
Packit 4e8bc4
/*
Packit 4e8bc4
 * Slabs memory allocation, based on powers-of-N. Slabs are up to 1MB in size
Packit 4e8bc4
 * and are divided into chunks. The chunk sizes start off at the size of the
Packit 4e8bc4
 * "item" structure plus space for a small key and value. They increase by
Packit 4e8bc4
 * a multiplier factor from there, up to half the maximum slab size. The last
Packit 4e8bc4
 * slab size is always 1MB, since that's the maximum item size allowed by the
Packit 4e8bc4
 * memcached protocol.
Packit 4e8bc4
 */
Packit 4e8bc4
#include "memcached.h"
Packit 4e8bc4
#include <sys/mman.h>
Packit 4e8bc4
#include <sys/stat.h>
Packit 4e8bc4
#include <sys/socket.h>
Packit 4e8bc4
#include <sys/resource.h>
Packit 4e8bc4
#include <fcntl.h>
Packit 4e8bc4
#include <netinet/in.h>
Packit 4e8bc4
#include <errno.h>
Packit 4e8bc4
#include <stdlib.h>
Packit 4e8bc4
#include <stdio.h>
Packit 4e8bc4
#include <string.h>
Packit 4e8bc4
#include <signal.h>
Packit 4e8bc4
#include <assert.h>
Packit 4e8bc4
#include <pthread.h>
Packit 4e8bc4
Packit 4e8bc4
//#define DEBUG_SLAB_MOVER
Packit 4e8bc4
/* powers-of-N allocation structures */
Packit 4e8bc4
Packit 4e8bc4
/* Per-size-class bookkeeping for the slab allocator.  One entry exists for
 * each chunk size; classes are addressed by their "clsid" index. */
typedef struct {
    unsigned int size;      /* sizes of items */
    unsigned int perslab;   /* how many items per slab */

    void *slots;           /* list of item ptrs */
    unsigned int sl_curr;   /* total free items in list */

    unsigned int slabs;     /* how many slabs were allocated for this class */

    void **slab_list;       /* array of slab pointers */
    unsigned int list_size; /* size of prev array */
} slabclass_t;
Packit 4e8bc4
Packit 4e8bc4
/* One descriptor per chunk-size class; index 0 doubles as the global
 * page pool (see SLAB_GLOBAL_PAGE_POOL usage below). */
static slabclass_t slabclass[MAX_NUMBER_OF_SLAB_CLASSES];
static size_t mem_limit = 0;    /* total memory budget in bytes (0 = unlimited) */
static size_t mem_malloced = 0; /* bytes handed out so far */
/* If the memory limit has been hit once. Used as a hint to decide when to
 * early-wake the LRU maintenance thread */
static bool mem_limit_reached = false;
static int power_largest;       /* highest valid slab class id */

/* Optional single preallocated arena: when mem_base is non-NULL,
 * memory_allocate() bump-allocates from it instead of calling malloc. */
static void *mem_base = NULL;
static void *mem_current = NULL; /* next free byte within the arena */
static size_t mem_avail = 0;     /* bytes remaining in the arena */
#ifdef EXTSTORE
static void *storage  = NULL;    /* handle set via slabs_set_storage() */
#endif
/**
 * Access to the slab allocator is protected by this lock
 */
static pthread_mutex_t slabs_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t slabs_rebalance_lock = PTHREAD_MUTEX_INITIALIZER;
Packit 4e8bc4
Packit 4e8bc4
/*
Packit 4e8bc4
 * Forward Declarations
Packit 4e8bc4
 */
Packit 4e8bc4
static int grow_slab_list (const unsigned int id);
Packit 4e8bc4
static int do_slabs_newslab(const unsigned int id);
Packit 4e8bc4
static void *memory_allocate(size_t size);
Packit 4e8bc4
static void do_slabs_free(void *ptr, const size_t size, unsigned int id);
Packit 4e8bc4
Packit 4e8bc4
/* Preallocate as many slab pages as possible (called from slabs_init)
Packit 4e8bc4
   on start-up, so users don't get confused out-of-memory errors when
Packit 4e8bc4
   they do have free (in-slab) space, but no space to make new slabs.
Packit 4e8bc4
   if maxslabs is 18 (POWER_LARGEST - POWER_SMALLEST + 1), then all
Packit 4e8bc4
   slab types can be made.  if max memory is less than 18 MB, only the
Packit 4e8bc4
   smaller ones will be made.  */
Packit 4e8bc4
static void slabs_preallocate (const unsigned int maxslabs);
Packit 4e8bc4
#ifdef EXTSTORE
Packit 4e8bc4
/* Stash the external-storage handle for later use by the slab code.
 * NOTE(review): assumed to be called once during startup before slab
 * operations begin — confirm against callers. */
void slabs_set_storage(void *arg) {
    storage = arg;
}
Packit 4e8bc4
#endif
Packit 4e8bc4
/*
Packit 4e8bc4
 * Figures out which slab class (chunk size) is required to store an item of
Packit 4e8bc4
 * a given size.
Packit 4e8bc4
 *
Packit 4e8bc4
 * Given object size, return id to use when allocating/freeing memory for object
Packit 4e8bc4
 * 0 means error: can't store such a large object
Packit 4e8bc4
 */
Packit 4e8bc4
Packit 4e8bc4
unsigned int slabs_clsid(const size_t size) {
Packit 4e8bc4
    int res = POWER_SMALLEST;
Packit 4e8bc4
Packit 4e8bc4
    if (size == 0 || size > settings.item_size_max)
Packit 4e8bc4
        return 0;
Packit 4e8bc4
    while (size > slabclass[res].size)
Packit 4e8bc4
        if (res++ == power_largest)     /* won't fit in the biggest slab */
Packit 4e8bc4
            return power_largest;
Packit 4e8bc4
    return res;
Packit 4e8bc4
}
Packit 4e8bc4
Packit 4e8bc4
/* Chunk size of slab class `clsid`.  No bounds checking: callers must pass
 * a valid class id. */
unsigned int slabs_size(const int clsid) {
    return slabclass[clsid].size;
}
Packit 4e8bc4
Packit 4e8bc4
// TODO: could this work with the restartable memory?
Packit 4e8bc4
// Docs say hugepages only work with private shm allocs.
Packit 4e8bc4
/* Function split out for better error path handling */
Packit 4e8bc4
/*
 * Allocate `limit` bytes in one contiguous chunk, preferring huge/super
 * pages where the platform supports them.  Returns NULL on failure and
 * the caller falls back to allocating in smaller pieces.
 *
 * Fixes vs. previous version: the sscanf() result was assigned to `ret`
 * but never checked, and the meminfo scan kept reading past the matched
 * "Hugepagesize:" line.  Now the parse is validated and the loop stops
 * at the first match.
 */
static void * alloc_large_chunk(const size_t limit)
{
    void *ptr = NULL;
#if defined(__linux__) && defined(MADV_HUGEPAGE)
    size_t pagesize = 0;
    FILE *fp;
    int ret;

    /* Get the size of huge pages */
    fp = fopen("/proc/meminfo", "r");
    if (fp != NULL) {
        char buf[64];

        while ((fgets(buf, sizeof(buf), fp))) {
            if (!strncmp(buf, "Hugepagesize:", 13)) {
                /* validate the parse so a malformed line leaves pagesize 0
                 * and we fail loudly below instead of using a junk value. */
                if (sscanf(buf + 13, "%zu\n", &pagesize) == 1) {
                    /* meminfo huge page size is in KiBs */
                    pagesize <<= 10;
                }
                break;
            }
        }
        fclose(fp);
    }

    if (!pagesize) {
        fprintf(stderr, "Failed to get supported huge page size\n");
        return NULL;
    }

    if (settings.verbose > 1)
        fprintf(stderr, "huge page size: %zu\n", pagesize);

    /* This works because glibc simply uses mmap when the alignment is
     * above a certain limit. */
    ret = posix_memalign(&ptr, pagesize, limit);
    if (ret != 0) {
        fprintf(stderr, "Failed to get aligned memory chunk: %d\n", ret);
        return NULL;
    }

    ret = madvise(ptr, limit, MADV_HUGEPAGE);
    if (ret < 0) {
        fprintf(stderr, "Failed to set transparent hugepage hint: %d\n", ret);
        free(ptr);
        ptr = NULL;
    }
#elif defined(__FreeBSD__)
    size_t align = (sizeof(size_t) * 8 - (__builtin_clzl(4095)));
    ptr = mmap(NULL, limit, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANON | MAP_ALIGNED(align) | MAP_ALIGNED_SUPER, -1, 0);
    if (ptr == MAP_FAILED) {
        fprintf(stderr, "Failed to set super pages\n");
        ptr = NULL;
    }
#else
    ptr = malloc(limit);
#endif
    return ptr;
}
Packit 4e8bc4
Packit 4e8bc4
/*
 * Re-adopt one chunk of pre-existing memory (presumably during restart
 * recovery — confirm against callers).  `chunk` points at a chunk-sized
 * piece of a slab page; `border` is 0 when the chunk starts a page.
 * Pages whose first item carries clsid 0 are unused and are shunted to
 * the global pool.  Returns the owning class's chunk size, or -1 (which
 * wraps to UINT_MAX in the unsigned return type) for pool pages.
 */
unsigned int slabs_fixup(char *chunk, const int border) {
    slabclass_t *p;
    item *it = (item *)chunk;
    int id = ITEM_clsid(it);

    // memory isn't used yet. shunt to global pool.
    // (which must be 0)
    if (id == 0) {
        //assert(border == 0);
        p = &slabclass[0];
        /* NOTE(review): grow_slab_list() result unchecked here and below;
         * an OOM would overflow slab_list on the next store. */
        grow_slab_list(0);
        p->slab_list[p->slabs++] = (char*)chunk;
        return -1;
    }
    p = &slabclass[id];

    // if we're on a page border, add the slab to slab class
    if (border == 0) {
        grow_slab_list(id);
        p->slab_list[p->slabs++] = chunk;
    }

    // increase free count if ITEM_SLABBED
    if (it->it_flags == ITEM_SLABBED) {
        // if ITEM_SLABBED re-stack on freelist.
        // don't have to run pointer fixups.
        it->prev = 0;
        it->next = p->slots;
        if (it->next) it->next->prev = it;
        p->slots = it;

        p->sl_curr++;
        //fprintf(stderr, "replacing into freelist\n");
    }

    return p->size;
}
Packit 4e8bc4
Packit 4e8bc4
/**
 * Determines the chunk sizes and initializes the slab class descriptors
 * accordingly.
 *
 * limit             - total memory budget for slab pages, in bytes.
 * factor            - growth multiplier between successive chunk sizes;
 *                     ignored when slab_sizes is supplied.
 * prealloc          - when true, try to grab all memory up front.
 * slab_sizes        - optional explicit, zero-terminated chunk-size table
 *                     overriding factor-based sizing.
 * mem_base_external - optional externally supplied memory base; used
 *                     instead of allocating our own large chunk.
 * reuse_mem         - external memory already contains live data: mark the
 *                     arena fully consumed and skip preallocation.
 */
void slabs_init(const size_t limit, const double factor, const bool prealloc, const uint32_t *slab_sizes, void *mem_base_external, bool reuse_mem) {
    int i = POWER_SMALLEST - 1;
    /* smallest chunk must hold the item header plus the minimum payload. */
    unsigned int size = sizeof(item) + settings.chunk_size;

    /* Some platforms use runtime transparent hugepages. If for any reason
     * the initial allocation fails, the required settings do not persist
     * for remaining allocations. As such it makes little sense to do slab
     * preallocation. */
    bool __attribute__ ((unused)) do_slab_prealloc = false;

    mem_limit = limit;

    if (prealloc && mem_base_external == NULL) {
        mem_base = alloc_large_chunk(mem_limit);
        if (mem_base) {
            do_slab_prealloc = true;
            mem_current = mem_base;
            mem_avail = mem_limit;
        } else {
            /* fall back to per-page allocations in memory_allocate(). */
            fprintf(stderr, "Warning: Failed to allocate requested memory in"
                    " one large chunk.\nWill allocate in smaller chunks\n");
        }
    } else if (prealloc && mem_base_external != NULL) {
        // Can't (yet) mix hugepages with mmap allocations, so separate the
        // logic from above. Reusable memory also force-preallocates memory
        // pages into the global pool, which requires turning mem_* variables.
        do_slab_prealloc = true;
        mem_base = mem_base_external;
        // _current shouldn't be used in this case, but we set it to where it
        // should be anyway.
        if (reuse_mem) {
            mem_current = ((char*)mem_base) + mem_limit;
            mem_avail = 0;
        } else {
            mem_current = mem_base;
            mem_avail = mem_limit;
        }
    }

    memset(slabclass, 0, sizeof(slabclass));

    /* build the chunk-size ladder; class ids start at POWER_SMALLEST. */
    while (++i < MAX_NUMBER_OF_SLAB_CLASSES-1) {
        if (slab_sizes != NULL) {
            if (slab_sizes[i-1] == 0)
                break;
            size = slab_sizes[i-1];
        } else if (size >= settings.slab_chunk_size_max / factor) {
            break;
        }
        /* Make sure items are always n-byte aligned */
        if (size % CHUNK_ALIGN_BYTES)
            size += CHUNK_ALIGN_BYTES - (size % CHUNK_ALIGN_BYTES);

        slabclass[i].size = size;
        slabclass[i].perslab = settings.slab_page_size / slabclass[i].size;
        if (slab_sizes == NULL)
            size *= factor;
        if (settings.verbose > 1) {
            fprintf(stderr, "slab class %3d: chunk size %9u perslab %7u\n",
                    i, slabclass[i].size, slabclass[i].perslab);
        }
    }

    /* the final class is always pinned at the maximum chunk size. */
    power_largest = i;
    slabclass[power_largest].size = settings.slab_chunk_size_max;
    slabclass[power_largest].perslab = settings.slab_page_size / settings.slab_chunk_size_max;
    if (settings.verbose > 1) {
        fprintf(stderr, "slab class %3d: chunk size %9u perslab %7u\n",
                i, slabclass[i].size, slabclass[i].perslab);
    }

    /* for the test suite:  faking of how much we've already malloc'd */
    {
        char *t_initial_malloc = getenv("T_MEMD_INITIAL_MALLOC");
        if (t_initial_malloc) {
            mem_malloced = (size_t)atol(t_initial_malloc);
        }

    }

    if (do_slab_prealloc) {
        if (!reuse_mem) {
            slabs_preallocate(power_largest);
        }
    }
}
Packit 4e8bc4
Packit 4e8bc4
void slabs_prefill_global(void) {
Packit 4e8bc4
    void *ptr;
Packit 4e8bc4
    slabclass_t *p = &slabclass[0];
Packit 4e8bc4
    int len = settings.slab_page_size;
Packit 4e8bc4
Packit 4e8bc4
    while (mem_malloced < mem_limit
Packit 4e8bc4
            && (ptr = memory_allocate(len)) != NULL) {
Packit 4e8bc4
        grow_slab_list(0);
Packit Service 3ed69b
        // Ensure the front header is zero'd to avoid confusing restart code.
Packit Service 3ed69b
        // It's probably good enough to cast it and just zero slabs_clsid, but
Packit Service 3ed69b
        // this is extra paranoid.
Packit Service 3ed69b
        memset(ptr, 0, sizeof(item));
Packit 4e8bc4
        p->slab_list[p->slabs++] = ptr;
Packit 4e8bc4
    }
Packit 4e8bc4
    mem_limit_reached = true;
Packit 4e8bc4
}
Packit 4e8bc4
Packit 4e8bc4
static void slabs_preallocate (const unsigned int maxslabs) {
Packit 4e8bc4
    int i;
Packit 4e8bc4
    unsigned int prealloc = 0;
Packit 4e8bc4
Packit 4e8bc4
    /* pre-allocate a 1MB slab in every size class so people don't get
Packit 4e8bc4
       confused by non-intuitive "SERVER_ERROR out of memory"
Packit 4e8bc4
       messages.  this is the most common question on the mailing
Packit 4e8bc4
       list.  if you really don't want this, you can rebuild without
Packit 4e8bc4
       these three lines.  */
Packit 4e8bc4
Packit 4e8bc4
    for (i = POWER_SMALLEST; i < MAX_NUMBER_OF_SLAB_CLASSES; i++) {
Packit 4e8bc4
        if (++prealloc > maxslabs)
Packit 4e8bc4
            break;
Packit 4e8bc4
        if (do_slabs_newslab(i) == 0) {
Packit 4e8bc4
            fprintf(stderr, "Error while preallocating slab memory!\n"
Packit 4e8bc4
                "If using -L or other prealloc options, max memory must be "
Packit 4e8bc4
                "at least %d megabytes.\n", power_largest);
Packit 4e8bc4
            exit(1);
Packit 4e8bc4
        }
Packit 4e8bc4
    }
Packit 4e8bc4
}
Packit 4e8bc4
Packit 4e8bc4
static int grow_slab_list (const unsigned int id) {
Packit 4e8bc4
    slabclass_t *p = &slabclass[id];
Packit 4e8bc4
    if (p->slabs == p->list_size) {
Packit 4e8bc4
        size_t new_size =  (p->list_size != 0) ? p->list_size * 2 : 16;
Packit 4e8bc4
        void *new_list = realloc(p->slab_list, new_size * sizeof(void *));
Packit 4e8bc4
        if (new_list == 0) return 0;
Packit 4e8bc4
        p->list_size = new_size;
Packit 4e8bc4
        p->slab_list = new_list;
Packit 4e8bc4
    }
Packit 4e8bc4
    return 1;
Packit 4e8bc4
}
Packit 4e8bc4
Packit 4e8bc4
static void split_slab_page_into_freelist(char *ptr, const unsigned int id) {
Packit 4e8bc4
    slabclass_t *p = &slabclass[id];
Packit 4e8bc4
    int x;
Packit 4e8bc4
    for (x = 0; x < p->perslab; x++) {
Packit 4e8bc4
        do_slabs_free(ptr, 0, id);
Packit 4e8bc4
        ptr += p->size;
Packit 4e8bc4
    }
Packit 4e8bc4
}
Packit 4e8bc4
Packit 4e8bc4
/* Fast FIFO queue */
Packit 4e8bc4
static void *get_page_from_global_pool(void) {
Packit 4e8bc4
    slabclass_t *p = &slabclass[SLAB_GLOBAL_PAGE_POOL];
Packit 4e8bc4
    if (p->slabs < 1) {
Packit 4e8bc4
        return NULL;
Packit 4e8bc4
    }
Packit 4e8bc4
    char *ret = p->slab_list[p->slabs - 1];
Packit 4e8bc4
    p->slabs--;
Packit 4e8bc4
    return ret;
Packit 4e8bc4
}
Packit 4e8bc4
Packit 4e8bc4
/*
 * Attach a new page to slab class `id`.  A page is taken from the global
 * pool when one is available, otherwise freshly allocated.
 * Returns 1 on success, 0 on failure (limit reached or allocation failed).
 */
static int do_slabs_newslab(const unsigned int id) {
    slabclass_t *p = &slabclass[id];
    slabclass_t *g = &slabclass[SLAB_GLOBAL_PAGE_POOL];
    /* use a full page when reassignment is enabled or chunked items are
     * possible; otherwise trim to exactly size * perslab. */
    int len = (settings.slab_reassign || settings.slab_chunk_size_max != settings.slab_page_size)
        ? settings.slab_page_size
        : p->size * p->perslab;
    char *ptr;

    /* refuse only when over the limit AND this class already has at least
     * one page AND the global pool can't supply one. */
    if ((mem_limit && mem_malloced + len > mem_limit && p->slabs > 0
         && g->slabs == 0)) {
        mem_limit_reached = true;
        MEMCACHED_SLABS_SLABCLASS_ALLOCATE_FAILED(id);
        return 0;
    }

    /* grow the tracking list first so the store below can't fail. */
    if ((grow_slab_list(id) == 0) ||
        (((ptr = get_page_from_global_pool()) == NULL) &&
        ((ptr = memory_allocate((size_t)len)) == 0))) {

        MEMCACHED_SLABS_SLABCLASS_ALLOCATE_FAILED(id);
        return 0;
    }

#if !defined(__FreeBSD__)
    memset(ptr, 0, (size_t)len);
#endif
    split_slab_page_into_freelist(ptr, id);

    p->slab_list[p->slabs++] = ptr;
    MEMCACHED_SLABS_SLABCLASS_ALLOCATE(id);

    return 1;
}
Packit 4e8bc4
Packit 4e8bc4
/*@null@*/
/*
 * Pop one chunk from class `id`'s freelist, first growing the class with
 * a new page when the freelist is empty (unless flags is
 * SLABS_ALLOC_NO_NEWPAGE).  Returns NULL when nothing could be allocated.
 * Caller must hold slabs_lock (see slabs_alloc()).
 */
static void *do_slabs_alloc(const size_t size, unsigned int id,
        unsigned int flags) {
    slabclass_t *p;
    void *ret = NULL;
    item *it = NULL;

    if (id < POWER_SMALLEST || id > power_largest) {
        MEMCACHED_SLABS_ALLOCATE_FAILED(size, 0);
        return NULL;
    }
    p = &slabclass[id];
    assert(p->sl_curr == 0 || (((item *)p->slots)->it_flags & ITEM_SLABBED));

    assert(size <= p->size);
    /* fail unless we have space at the end of a recently allocated page,
       we have something on our freelist, or we could allocate a new page */
    if (p->sl_curr == 0 && flags != SLABS_ALLOC_NO_NEWPAGE) {
        do_slabs_newslab(id);
    }

    if (p->sl_curr != 0) {
        /* return off our freelist */
        it = (item *)p->slots;
        p->slots = it->next;
        if (it->next) it->next->prev = 0;
        /* Kill flag and initialize refcount here for lock safety in slab
         * mover's freeness detection. */
        it->it_flags &= ~ITEM_SLABBED;
        it->refcount = 1;
        p->sl_curr--;
        ret = (void *)it;
    } else {
        ret = NULL;
    }

    if (ret) {
        MEMCACHED_SLABS_ALLOCATE(size, id, p->size, ret);
    } else {
        MEMCACHED_SLABS_ALLOCATE_FAILED(size, id);
    }

    return ret;
}
Packit 4e8bc4
Packit 4e8bc4
/*
 * Free a chunked (multi-part) item: the header is returned to its original
 * class (recorded as orig_clsid in the first chunk), then every attached
 * chunk is pushed onto its own class's freelist.
 */
static void do_slabs_free_chunked(item *it, const size_t size) {
    item_chunk *chunk = (item_chunk *) ITEM_schunk(it);
    slabclass_t *p;

    it->it_flags = ITEM_SLABBED;
    // FIXME: refresh on how this works?
    //it->slabs_clsid = 0;
    it->prev = 0;
    // header object's original classid is stored in chunk.
    p = &slabclass[chunk->orig_clsid];
    // original class id needs to be set on free memory.
    it->slabs_clsid = chunk->orig_clsid;
    if (chunk->next) {
        chunk = chunk->next;
        chunk->prev = 0;
    } else {
        // header with no attached chunk
        chunk = NULL;
    }

    // return the header object.
    // TODO: This is in three places, here and in do_slabs_free().
    it->prev = 0;
    it->next = p->slots;
    if (it->next) it->next->prev = it;
    p->slots = it;
    p->sl_curr++;

    /* walk the remaining chain, returning each chunk to its class. */
    item_chunk *next_chunk;
    while (chunk) {
        assert(chunk->it_flags == ITEM_CHUNK);
        chunk->it_flags = ITEM_SLABBED;
        p = &slabclass[chunk->slabs_clsid];
        next_chunk = chunk->next;

        chunk->prev = 0;
        chunk->next = p->slots;
        if (chunk->next) chunk->next->prev = chunk;
        p->slots = chunk;
        p->sl_curr++;

        chunk = next_chunk;
    }

    return;
}
Packit 4e8bc4
Packit 4e8bc4
Packit 4e8bc4
static void do_slabs_free(void *ptr, const size_t size, unsigned int id) {
Packit 4e8bc4
    slabclass_t *p;
Packit 4e8bc4
    item *it;
Packit 4e8bc4
Packit 4e8bc4
    assert(id >= POWER_SMALLEST && id <= power_largest);
Packit 4e8bc4
    if (id < POWER_SMALLEST || id > power_largest)
Packit 4e8bc4
        return;
Packit 4e8bc4
Packit 4e8bc4
    MEMCACHED_SLABS_FREE(size, id, ptr);
Packit 4e8bc4
    p = &slabclass[id];
Packit 4e8bc4
Packit 4e8bc4
    it = (item *)ptr;
Packit 4e8bc4
    if ((it->it_flags & ITEM_CHUNKED) == 0) {
Packit 4e8bc4
        it->it_flags = ITEM_SLABBED;
Packit 4e8bc4
        it->slabs_clsid = id;
Packit 4e8bc4
        it->prev = 0;
Packit 4e8bc4
        it->next = p->slots;
Packit 4e8bc4
        if (it->next) it->next->prev = it;
Packit 4e8bc4
        p->slots = it;
Packit 4e8bc4
Packit 4e8bc4
        p->sl_curr++;
Packit 4e8bc4
    } else {
Packit 4e8bc4
        do_slabs_free_chunked(it, size);
Packit 4e8bc4
    }
Packit 4e8bc4
    return;
Packit 4e8bc4
}
Packit 4e8bc4
Packit 4e8bc4
/* With refactoring of the various stats code the automover won't need a
Packit 4e8bc4
 * custom function here.
Packit 4e8bc4
 */
Packit 4e8bc4
/* Snapshot per-class counters into `am` for the automover, taken under
 * slabs_lock so the values are mutually consistent. */
void fill_slab_stats_automove(slab_stats_automove *am) {
    pthread_mutex_lock(&slabs_lock);
    for (int n = 0; n < MAX_NUMBER_OF_SLAB_CLASSES; n++) {
        slabclass_t *src = &slabclass[n];
        slab_stats_automove *dst = &am[n];
        dst->chunks_per_page = src->perslab;
        dst->free_chunks = src->sl_curr;
        dst->total_pages = src->slabs;
        dst->chunk_size = src->size;
    }
    pthread_mutex_unlock(&slabs_lock);
}
Packit 4e8bc4
Packit 4e8bc4
/* TODO: slabs_available_chunks should grow up to encompass this.
Packit 4e8bc4
 * mem_flag is redundant with the other function.
Packit 4e8bc4
 */
Packit 4e8bc4
unsigned int global_page_pool_size(bool *mem_flag) {
Packit 4e8bc4
    unsigned int ret = 0;
Packit 4e8bc4
    pthread_mutex_lock(&slabs_lock);
Packit 4e8bc4
    if (mem_flag != NULL)
Packit 4e8bc4
        *mem_flag = mem_malloced >= mem_limit ? true : false;
Packit 4e8bc4
    ret = slabclass[SLAB_GLOBAL_PAGE_POOL].slabs;
Packit 4e8bc4
    pthread_mutex_unlock(&slabs_lock);
Packit 4e8bc4
    return ret;
Packit 4e8bc4
}
Packit 4e8bc4
Packit 4e8bc4
/*@null@*/
/* Emit per-class and aggregate slab statistics through add_stats.
 * Caller must hold slabs_lock (see slabs_stats()). */
static void do_slabs_stats(ADD_STAT add_stats, void *c) {
    int i, total;
    /* Get the per-thread stats which contain some interesting aggregates */
    struct thread_stats thread_stats;
    threadlocal_stats_aggregate(&thread_stats);

    total = 0;
    for(i = POWER_SMALLEST; i <= power_largest; i++) {
        slabclass_t *p = &slabclass[i];
        if (p->slabs != 0) {
            uint32_t perslab, slabs;
            slabs = p->slabs;
            perslab = p->perslab;

            /* scratch buffers used by the APPEND_NUM_STAT macro below. */
            char key_str[STAT_KEY_LEN];
            char val_str[STAT_VAL_LEN];
            int klen = 0, vlen = 0;

            APPEND_NUM_STAT(i, "chunk_size", "%u", p->size);
            APPEND_NUM_STAT(i, "chunks_per_page", "%u", perslab);
            APPEND_NUM_STAT(i, "total_pages", "%u", slabs);
            APPEND_NUM_STAT(i, "total_chunks", "%u", slabs * perslab);
            APPEND_NUM_STAT(i, "used_chunks", "%u",
                            slabs*perslab - p->sl_curr);
            APPEND_NUM_STAT(i, "free_chunks", "%u", p->sl_curr);
            /* Stat is dead, but displaying zero instead of removing it. */
            APPEND_NUM_STAT(i, "free_chunks_end", "%u", 0);
            APPEND_NUM_STAT(i, "get_hits", "%llu",
                    (unsigned long long)thread_stats.slab_stats[i].get_hits);
            APPEND_NUM_STAT(i, "cmd_set", "%llu",
                    (unsigned long long)thread_stats.slab_stats[i].set_cmds);
            APPEND_NUM_STAT(i, "delete_hits", "%llu",
                    (unsigned long long)thread_stats.slab_stats[i].delete_hits);
            APPEND_NUM_STAT(i, "incr_hits", "%llu",
                    (unsigned long long)thread_stats.slab_stats[i].incr_hits);
            APPEND_NUM_STAT(i, "decr_hits", "%llu",
                    (unsigned long long)thread_stats.slab_stats[i].decr_hits);
            APPEND_NUM_STAT(i, "cas_hits", "%llu",
                    (unsigned long long)thread_stats.slab_stats[i].cas_hits);
            APPEND_NUM_STAT(i, "cas_badval", "%llu",
                    (unsigned long long)thread_stats.slab_stats[i].cas_badval);
            APPEND_NUM_STAT(i, "touch_hits", "%llu",
                    (unsigned long long)thread_stats.slab_stats[i].touch_hits);
            total++;
        }
    }

    /* add overall slab stats and append terminator */

    APPEND_STAT("active_slabs", "%d", total);
    APPEND_STAT("total_malloced", "%llu", (unsigned long long)mem_malloced);
    add_stats(NULL, 0, NULL, 0, c);
}
Packit 4e8bc4
Packit 4e8bc4
/*
 * Grab `size` bytes either from the system malloc (no preallocated base)
 * or by bumping a pointer into the preallocated arena.  Returns NULL when
 * the arena can't satisfy the request; malloc failures also return NULL.
 */
static void *memory_allocate(size_t size) {
    void *ret;

    if (mem_base == NULL) {
        /* We are not using a preallocated large memory chunk */
        ret = malloc(size);
    } else {
        ret = mem_current;

        if (size > mem_avail) {
            return NULL;
        }

        /* mem_current pointer _must_ be aligned!!! */
        if (size % CHUNK_ALIGN_BYTES) {
            size += CHUNK_ALIGN_BYTES - (size % CHUNK_ALIGN_BYTES);
        }

        mem_current = ((char*)mem_current) + size;
        /* NOTE: alignment is applied after the avail check, so the rounded
         * size can exceed mem_avail; it is clamped to zero in that case. */
        if (size < mem_avail) {
            mem_avail -= size;
        } else {
            mem_avail = 0;
        }
    }
    /* accounting uses the (possibly rounded-up) size. */
    mem_malloced += size;

    return ret;
}
Packit 4e8bc4
Packit 4e8bc4
/* Must only be used if all pages are item_size_max */
Packit 4e8bc4
static void memory_release() {
Packit 4e8bc4
    void *p = NULL;
Packit 4e8bc4
    if (mem_base != NULL)
Packit 4e8bc4
        return;
Packit 4e8bc4
Packit 4e8bc4
    if (!settings.slab_reassign)
Packit 4e8bc4
        return;
Packit 4e8bc4
Packit 4e8bc4
    while (mem_malloced > mem_limit &&
Packit 4e8bc4
            (p = get_page_from_global_pool()) != NULL) {
Packit 4e8bc4
        free(p);
Packit 4e8bc4
        mem_malloced -= settings.slab_page_size;
Packit 4e8bc4
    }
Packit 4e8bc4
}
Packit 4e8bc4
Packit 4e8bc4
/* Thread-safe entry point: allocate one chunk from class `id` under
 * slabs_lock.  See do_slabs_alloc() for semantics. */
void *slabs_alloc(size_t size, unsigned int id,
        unsigned int flags) {
    void *ret;

    pthread_mutex_lock(&slabs_lock);
    ret = do_slabs_alloc(size, id, flags);
    pthread_mutex_unlock(&slabs_lock);
    return ret;
}
Packit 4e8bc4
Packit 4e8bc4
/* Thread-safe entry point: return a chunk to class `id` under slabs_lock.
 * See do_slabs_free() for semantics. */
void slabs_free(void *ptr, size_t size, unsigned int id) {
    pthread_mutex_lock(&slabs_lock);
    do_slabs_free(ptr, size, id);
    pthread_mutex_unlock(&slabs_lock);
}
Packit 4e8bc4
Packit 4e8bc4
/* Thread-safe entry point: emit slab statistics under slabs_lock.
 * See do_slabs_stats() for the exact stats produced. */
void slabs_stats(ADD_STAT add_stats, void *c) {
    pthread_mutex_lock(&slabs_lock);
    do_slabs_stats(add_stats, c);
    pthread_mutex_unlock(&slabs_lock);
}
Packit 4e8bc4
Packit 4e8bc4
static bool do_slabs_adjust_mem_limit(size_t new_mem_limit) {
Packit 4e8bc4
    /* Cannot adjust memory limit at runtime if prealloc'ed */
Packit 4e8bc4
    if (mem_base != NULL)
Packit 4e8bc4
        return false;
Packit 4e8bc4
    settings.maxbytes = new_mem_limit;
Packit 4e8bc4
    mem_limit = new_mem_limit;
Packit 4e8bc4
    mem_limit_reached = false; /* Will reset on next alloc */
Packit 4e8bc4
    memory_release(); /* free what might already be in the global pool */
Packit 4e8bc4
    return true;
Packit 4e8bc4
}
Packit 4e8bc4
Packit 4e8bc4
/* Thread-safe entry point: adjust the memory limit under slabs_lock.
 * Returns false when the limit cannot be changed (prealloc'ed base). */
bool slabs_adjust_mem_limit(size_t new_mem_limit) {
    bool ret;
    pthread_mutex_lock(&slabs_lock);
    ret = do_slabs_adjust_mem_limit(new_mem_limit);
    pthread_mutex_unlock(&slabs_lock);
    return ret;
}
Packit 4e8bc4
Packit 4e8bc4
unsigned int slabs_available_chunks(const unsigned int id, bool *mem_flag,
Packit 4e8bc4
        unsigned int *chunks_perslab) {
Packit 4e8bc4
    unsigned int ret;
Packit 4e8bc4
    slabclass_t *p;
Packit 4e8bc4
Packit 4e8bc4
    pthread_mutex_lock(&slabs_lock);
Packit 4e8bc4
    p = &slabclass[id];
Packit 4e8bc4
    ret = p->sl_curr;
Packit 4e8bc4
    if (mem_flag != NULL)
Packit 4e8bc4
        *mem_flag = mem_malloced >= mem_limit ? true : false;
Packit 4e8bc4
    if (chunks_perslab != NULL)
Packit 4e8bc4
        *chunks_perslab = p->perslab;
Packit 4e8bc4
    pthread_mutex_unlock(&slabs_lock);
Packit 4e8bc4
    return ret;
Packit 4e8bc4
}
Packit 4e8bc4
Packit 4e8bc4
/* The slabber system could avoid needing to understand much, if anything,
Packit 4e8bc4
 * about items if callbacks were strategically used. Due to how the slab mover
Packit 4e8bc4
 * works, certain flag bits can only be adjusted while holding the slabs lock.
Packit 4e8bc4
 * Using these functions, isolate sections of code needing this and turn them
Packit 4e8bc4
 * into callbacks when an interface becomes more obvious.
Packit 4e8bc4
 */
Packit 4e8bc4
/* Acquire the slabs lock on behalf of external code (see comment above:
 * used to isolate sections that must hold slabs_lock, e.g. the item mover).
 * Must be paired with slabs_munlock(). */
void slabs_mlock(void) {
    pthread_mutex_lock(&slabs_lock);
}
Packit 4e8bc4
Packit 4e8bc4
/* Release the slabs lock previously taken via slabs_mlock(). */
void slabs_munlock(void) {
    pthread_mutex_unlock(&slabs_lock);
}
Packit 4e8bc4
Packit 4e8bc4
/* Signalled to wake the rebalance thread (new reassignment requested or
 * shutdown). The thread waits on this with slabs_rebalance_lock held. */
static pthread_cond_t slab_rebalance_cond = PTHREAD_COND_INITIALIZER;
/* Cleared to ask the rebalance thread to exit its loop.
 * NOTE(review): volatile is not a synchronization primitive; this appears to
 * rely on writes happening under slabs_rebalance_lock — confirm before
 * changing. */
static volatile int do_run_slab_rebalance_thread = 1;
Packit 4e8bc4
static int slab_rebalance_start(void) {
Packit 4e8bc4
    slabclass_t *s_cls;
Packit 4e8bc4
    int no_go = 0;
Packit 4e8bc4
Packit 4e8bc4
    pthread_mutex_lock(&slabs_lock);
Packit 4e8bc4
Packit 4e8bc4
    if (slab_rebal.s_clsid < SLAB_GLOBAL_PAGE_POOL ||
Packit 4e8bc4
        slab_rebal.s_clsid > power_largest  ||
Packit 4e8bc4
        slab_rebal.d_clsid < SLAB_GLOBAL_PAGE_POOL ||
Packit 4e8bc4
        slab_rebal.d_clsid > power_largest  ||
Packit 4e8bc4
        slab_rebal.s_clsid == slab_rebal.d_clsid)
Packit 4e8bc4
        no_go = -2;
Packit 4e8bc4
Packit 4e8bc4
    s_cls = &slabclass[slab_rebal.s_clsid];
Packit 4e8bc4
Packit 4e8bc4
    if (!grow_slab_list(slab_rebal.d_clsid)) {
Packit 4e8bc4
        no_go = -1;
Packit 4e8bc4
    }
Packit 4e8bc4
Packit 4e8bc4
    if (s_cls->slabs < 2)
Packit 4e8bc4
        no_go = -3;
Packit 4e8bc4
Packit 4e8bc4
    if (no_go != 0) {
Packit 4e8bc4
        pthread_mutex_unlock(&slabs_lock);
Packit 4e8bc4
        return no_go; /* Should use a wrapper function... */
Packit 4e8bc4
    }
Packit 4e8bc4
Packit 4e8bc4
    /* Always kill the first available slab page as it is most likely to
Packit 4e8bc4
     * contain the oldest items
Packit 4e8bc4
     */
Packit 4e8bc4
    slab_rebal.slab_start = s_cls->slab_list[0];
Packit 4e8bc4
    slab_rebal.slab_end   = (char *)slab_rebal.slab_start +
Packit 4e8bc4
        (s_cls->size * s_cls->perslab);
Packit 4e8bc4
    slab_rebal.slab_pos   = slab_rebal.slab_start;
Packit 4e8bc4
    slab_rebal.done       = 0;
Packit 4e8bc4
    // Don't need to do chunk move work if page is in global pool.
Packit 4e8bc4
    if (slab_rebal.s_clsid == SLAB_GLOBAL_PAGE_POOL) {
Packit 4e8bc4
        slab_rebal.done = 1;
Packit 4e8bc4
    }
Packit 4e8bc4
Packit 4e8bc4
    // Bit-vector to keep track of completed chunks
Packit 4e8bc4
    slab_rebal.completed = (uint8_t*)calloc(s_cls->perslab,sizeof(uint8_t));
Packit 4e8bc4
Packit 4e8bc4
    slab_rebalance_signal = 2;
Packit 4e8bc4
Packit 4e8bc4
    if (settings.verbose > 1) {
Packit 4e8bc4
        fprintf(stderr, "Started a slab rebalance\n");
Packit 4e8bc4
    }
Packit 4e8bc4
Packit 4e8bc4
    pthread_mutex_unlock(&slabs_lock);
Packit 4e8bc4
Packit 4e8bc4
    STATS_LOCK();
Packit 4e8bc4
    stats_state.slab_reassign_running = true;
Packit 4e8bc4
    STATS_UNLOCK();
Packit 4e8bc4
Packit 4e8bc4
    return 0;
Packit 4e8bc4
}
Packit 4e8bc4
Packit 4e8bc4
/* CALLED WITH slabs_lock HELD */
Packit 4e8bc4
static void *slab_rebalance_alloc(const size_t size, unsigned int id) {
Packit 4e8bc4
    slabclass_t *s_cls;
Packit 4e8bc4
    s_cls = &slabclass[slab_rebal.s_clsid];
Packit 4e8bc4
    int x;
Packit 4e8bc4
    item *new_it = NULL;
Packit 4e8bc4
Packit 4e8bc4
    for (x = 0; x < s_cls->perslab; x++) {
Packit 4e8bc4
        new_it = do_slabs_alloc(size, id, SLABS_ALLOC_NO_NEWPAGE);
Packit 4e8bc4
        /* check that memory isn't within the range to clear */
Packit 4e8bc4
        if (new_it == NULL) {
Packit 4e8bc4
            break;
Packit 4e8bc4
        }
Packit 4e8bc4
        if ((void *)new_it >= slab_rebal.slab_start
Packit 4e8bc4
            && (void *)new_it < slab_rebal.slab_end) {
Packit 4e8bc4
            /* Pulled something we intend to free. Mark it as freed since
Packit 4e8bc4
             * we've already done the work of unlinking it from the freelist.
Packit 4e8bc4
             */
Packit 4e8bc4
            new_it->refcount = 0;
Packit 4e8bc4
            new_it->it_flags = ITEM_SLABBED|ITEM_FETCHED;
Packit 4e8bc4
#ifdef DEBUG_SLAB_MOVER
Packit 4e8bc4
            memcpy(ITEM_key(new_it), "deadbeef", 8);
Packit 4e8bc4
#endif
Packit 4e8bc4
            new_it = NULL;
Packit 4e8bc4
            slab_rebal.inline_reclaim++;
Packit 4e8bc4
        } else {
Packit 4e8bc4
            break;
Packit 4e8bc4
        }
Packit 4e8bc4
    }
Packit 4e8bc4
    return new_it;
Packit 4e8bc4
}
Packit 4e8bc4
Packit 4e8bc4
/* CALLED WITH slabs_lock HELD */
Packit 4e8bc4
/* detaches item/chunk from freelist. */
Packit 4e8bc4
static void slab_rebalance_cut_free(slabclass_t *s_cls, item *it) {
Packit 4e8bc4
    /* Ensure this was on the freelist and nothing else. */
Packit 4e8bc4
    assert(it->it_flags == ITEM_SLABBED);
Packit 4e8bc4
    if (s_cls->slots == it) {
Packit 4e8bc4
        s_cls->slots = it->next;
Packit 4e8bc4
    }
Packit 4e8bc4
    if (it->next) it->next->prev = it->prev;
Packit 4e8bc4
    if (it->prev) it->prev->next = it->next;
Packit 4e8bc4
    s_cls->sl_curr--;
Packit 4e8bc4
}
Packit 4e8bc4
Packit 4e8bc4
/* Per-chunk disposition decided by slab_rebalance_move():
 *  MOVE_PASS      - nothing to do for this chunk on this pass
 *  MOVE_FROM_SLAB - chunk was on the freelist; cut loose directly
 *  MOVE_FROM_LRU  - chunk holds a live linked item; rescue or evict it
 *  MOVE_BUSY      - chunk in a transient state; retry on a later pass
 *  MOVE_LOCKED    - item's hash bucket couldn't be trylocked; retry later */
enum move_status {
    MOVE_PASS=0, MOVE_FROM_SLAB, MOVE_FROM_LRU, MOVE_BUSY, MOVE_LOCKED
};

/* After this many full passes over a page, persistently busy items are
 * force-unlinked (see the refcount > 2 branch in slab_rebalance_move). */
#define SLAB_MOVE_MAX_LOOPS 1000
Packit 4e8bc4
/* refcount == 0 is safe since nobody can incr while item_lock is held.
Packit 4e8bc4
 * refcount != 0 is impossible since flags/etc can be modified in other
Packit 4e8bc4
 * threads. instead, note we found a busy one and bail. logic in do_item_get
Packit 4e8bc4
 * will prevent busy items from continuing to be busy
Packit 4e8bc4
 * NOTE: This is checking it_flags outside of an item lock. I believe this
Packit 4e8bc4
 * works since it_flags is 8 bits, and we're only ever comparing a single bit
Packit 4e8bc4
 * regardless. ITEM_SLABBED bit will always be correct since we're holding the
Packit 4e8bc4
 * lock which modifies that bit. ITEM_LINKED won't exist if we're between an
Packit 4e8bc4
 * item having ITEM_SLABBED removed, and the key hasn't been added to the item
Packit 4e8bc4
 * yet. The memory barrier from the slabs lock should order the key write and the
Packit 4e8bc4
 * flags to the item?
Packit 4e8bc4
 * If ITEM_LINKED did exist and was just removed, but we still see it, that's
Packit 4e8bc4
 * still safe since it will have a valid key, which we then lock, and then
Packit 4e8bc4
 * recheck everything.
Packit 4e8bc4
 * This may not be safe on all platforms; If not, slabs_alloc() will need to
Packit 4e8bc4
 * seed the item key while holding slabs_lock.
Packit 4e8bc4
 */
Packit 4e8bc4
/* Process the single chunk at slab_rebal.slab_pos: free it, rescue its item
 * into freshly allocated memory, evict it, or flag it busy for a later pass,
 * then advance slab_pos. Returns nonzero when the chunk was busy so the
 * caller can back off. Takes/drops slabs_lock internally; for rescues it
 * drops slabs_lock while holding an item lock (lock order is item/LRU locks
 * -> slabs_lock, see the comment block above). */
static int slab_rebalance_move(void) {
    slabclass_t *s_cls;
    int was_busy = 0;       /* count of busy encounters on this call */
    int refcount = 0;
    uint32_t hv;            /* hash of the item's key, for item locking */
    void *hold_lock;        /* non-NULL while we hold the item trylock */
    enum move_status status = MOVE_PASS;

    s_cls = &slabclass[slab_rebal.s_clsid];
    // the offset to check if completed or not
    int offset = ((char*)slab_rebal.slab_pos-(char*)slab_rebal.slab_start)/(s_cls->size);

    // skip acquiring the slabs lock for items we've already fully processed.
    if (slab_rebal.completed[offset] == 0) {
        pthread_mutex_lock(&slabs_lock);
        hv = 0;
        hold_lock = NULL;
        item *it = slab_rebal.slab_pos;

        item_chunk *ch = NULL;
        status = MOVE_PASS;

        if (it->it_flags & ITEM_CHUNK) {
            /* This chunk is a chained part of a larger item. */
            ch = (item_chunk *) it;
            /* Instead, we use the head chunk to find the item and effectively
             * lock the entire structure. If a chunk has ITEM_CHUNK flag, its
             * head cannot be slabbed, so the normal routine is safe. */
            it = ch->head;
            assert(it->it_flags & ITEM_CHUNKED);
        }

        /* ITEM_FETCHED when ITEM_SLABBED is overloaded to mean we've cleared
         * the chunk for move. Only these two flags should exist.
         */
        if (it->it_flags != (ITEM_SLABBED|ITEM_FETCHED)) {
            /* ITEM_SLABBED can only be added/removed under the slabs_lock */
            if (it->it_flags & ITEM_SLABBED) {
                assert(ch == NULL);
                slab_rebalance_cut_free(s_cls, it);
                status = MOVE_FROM_SLAB;
            } else if ((it->it_flags & ITEM_LINKED) != 0) {
                /* If it doesn't have ITEM_SLABBED, the item could be in any
                 * state on its way to being freed or written to. If no
                 * ITEM_SLABBED, but it's had ITEM_LINKED, it must be active
                 * and have the key written to it already.
                 */
                hv = hash(ITEM_key(it), it->nkey);
                if ((hold_lock = item_trylock(hv)) == NULL) {
                    status = MOVE_LOCKED;
                } else {
                    bool is_linked = (it->it_flags & ITEM_LINKED);
                    refcount = refcount_incr(it);
                    if (refcount == 2) { /* item is linked but not busy */
                        /* Double check ITEM_LINKED flag here, since we're
                         * past a memory barrier from the mutex. */
                        if (is_linked) {
                            status = MOVE_FROM_LRU;
                        } else {
                            /* refcount == 1 + !ITEM_LINKED means the item is being
                             * uploaded to, or was just unlinked but hasn't been freed
                             * yet. Let it bleed off on its own and try again later */
                            status = MOVE_BUSY;
                        }
                    } else if (refcount > 2 && is_linked) {
                        // TODO: Mark items for delete/rescue and process
                        // outside of the main loop.
                        if (slab_rebal.busy_loops > SLAB_MOVE_MAX_LOOPS) {
                            /* Persistently busy: force-unlink so the page can
                             * eventually drain. */
                            slab_rebal.busy_deletes++;
                            // Only safe to hold slabs lock because refcount
                            // can't drop to 0 until we release item lock.
                            STORAGE_delete(storage, it);
                            /* do_item_unlink takes LRU locks; must drop
                             * slabs_lock first to respect lock order. */
                            pthread_mutex_unlock(&slabs_lock);
                            do_item_unlink(it, hv);
                            pthread_mutex_lock(&slabs_lock);
                        }
                        status = MOVE_BUSY;
                    } else {
                        if (settings.verbose > 2) {
                            fprintf(stderr, "Slab reassign hit a busy item: refcount: %d (%d -> %d)\n",
                                it->refcount, slab_rebal.s_clsid, slab_rebal.d_clsid);
                        }
                        status = MOVE_BUSY;
                    }
                    /* Item lock must be held while modifying refcount */
                    if (status == MOVE_BUSY) {
                        refcount_decr(it);
                        item_trylock_unlock(hold_lock);
                    }
                }
            } else {
                /* See above comment. No ITEM_SLABBED or ITEM_LINKED. Mark
                 * busy and wait for item to complete its upload. */
                status = MOVE_BUSY;
            }
        }

        int save_item = 0;      /* 1 = rescue the item, 0 = evict/drop it */
        item *new_it = NULL;    /* replacement memory outside the doomed page */
        size_t ntotal = 0;
        switch (status) {
            case MOVE_FROM_LRU:
                /* Lock order is LRU locks -> slabs_lock. unlink uses LRU lock.
                 * We only need to hold the slabs_lock while initially looking
                 * at an item, and at this point we have an exclusive refcount
                 * (2) + the item is locked. Drop slabs lock, drop item to
                 * refcount 1 (just our own, then fall through and wipe it
                 */
                /* Check if expired or flushed */
                ntotal = ITEM_ntotal(it);
#ifdef EXTSTORE
                if (it->it_flags & ITEM_HDR) {
                    /* Header-only item: the data lives in extstore, so only
                     * the header needs new memory. */
                    ntotal = (ntotal - it->nbytes) + sizeof(item_hdr);
                }
#endif
                /* REQUIRES slabs_lock: CHECK FOR cls->sl_curr > 0 */
                if (ch == NULL && (it->it_flags & ITEM_CHUNKED)) {
                    /* Chunked should be identical to non-chunked, except we need
                     * to swap out ntotal for the head-chunk-total. */
                    ntotal = s_cls->size;
                }
                if ((it->exptime != 0 && it->exptime < current_time)
                    || item_is_flushed(it)) {
                    /* Expired, don't save. */
                    save_item = 0;
                } else if (ch == NULL &&
                        (new_it = slab_rebalance_alloc(ntotal, slab_rebal.s_clsid)) == NULL) {
                    /* Not a chunk of an item, and nomem. */
                    save_item = 0;
                    slab_rebal.evictions_nomem++;
                } else if (ch != NULL &&
                        (new_it = slab_rebalance_alloc(s_cls->size, slab_rebal.s_clsid)) == NULL) {
                    /* Is a chunk of an item, and nomem. */
                    save_item = 0;
                    slab_rebal.evictions_nomem++;
                } else {
                    /* Was whatever it was, and we have memory for it. */
                    save_item = 1;
                }
                pthread_mutex_unlock(&slabs_lock);
                if (save_item) {
                    if (ch == NULL) {
                        assert((new_it->it_flags & ITEM_CHUNKED) == 0);
                        /* if free memory, memcpy. clear prev/next/h_bucket */
                        memcpy(new_it, it, ntotal);
                        new_it->prev = 0;
                        new_it->next = 0;
                        new_it->h_next = 0;
                        /* These are definitely required. else fails assert */
                        new_it->it_flags &= ~ITEM_LINKED;
                        new_it->refcount = 0;
                        do_item_replace(it, new_it, hv);
                        /* Need to walk the chunks and repoint head  */
                        if (new_it->it_flags & ITEM_CHUNKED) {
                            item_chunk *fch = (item_chunk *) ITEM_schunk(new_it);
                            fch->next->prev = fch;
                            while (fch) {
                                fch->head = new_it;
                                fch = fch->next;
                            }
                        }
                        /* Old copy is now a cleared free chunk. */
                        it->refcount = 0;
                        it->it_flags = ITEM_SLABBED|ITEM_FETCHED;
#ifdef DEBUG_SLAB_MOVER
                        memcpy(ITEM_key(it), "deadbeef", 8);
#endif
                        slab_rebal.rescues++;
                    } else {
                        item_chunk *nch = (item_chunk *) new_it;
                        /* Chunks always have head chunk (the main it) */
                        ch->prev->next = nch;
                        if (ch->next)
                            ch->next->prev = nch;
                        memcpy(nch, ch, ch->used + sizeof(item_chunk));
                        ch->refcount = 0;
                        ch->it_flags = ITEM_SLABBED|ITEM_FETCHED;
                        slab_rebal.chunk_rescues++;
#ifdef DEBUG_SLAB_MOVER
                        memcpy(ITEM_key((item *)ch), "deadbeef", 8);
#endif
                        refcount_decr(it);
                    }
                    slab_rebal.completed[offset] = 1;
                } else {
                    /* unlink and mark as done if it's not
                     * a chunked item as they require more book-keeping) */
                    STORAGE_delete(storage, it);
                    if (!ch && (it->it_flags & ITEM_CHUNKED) == 0) {
                        do_item_unlink(it, hv);
                        it->it_flags = ITEM_SLABBED|ITEM_FETCHED;
                        it->refcount = 0;
#ifdef DEBUG_SLAB_MOVER
                        memcpy(ITEM_key(it), "deadbeef", 8);
#endif
                        slab_rebal.completed[offset] = 1;
                    } else {
                        ntotal = ITEM_ntotal(it);
                        do_item_unlink(it, hv);
                        slabs_free(it, ntotal, slab_rebal.s_clsid);
                        /* Swing around again later to remove it from the freelist. */
                        slab_rebal.busy_items++;
                        was_busy++;
                    }

                }
                item_trylock_unlock(hold_lock);
                pthread_mutex_lock(&slabs_lock);
                /* Always remove the ntotal, as we added it in during
                 * do_slabs_alloc() when copying the item.
                 */
                break;
            case MOVE_FROM_SLAB:
                /* Was a plain free chunk: already cut from the freelist. */
                slab_rebal.completed[offset] = 1;
                it->refcount = 0;
                it->it_flags = ITEM_SLABBED|ITEM_FETCHED;
#ifdef DEBUG_SLAB_MOVER
                memcpy(ITEM_key(it), "deadbeef", 8);
#endif
                break;
            case MOVE_BUSY:
            case MOVE_LOCKED:
                /* Not resolvable now; picked up again on the next pass. */
                slab_rebal.busy_items++;
                was_busy++;
                break;
            case MOVE_PASS:
                break;
        }

        pthread_mutex_unlock(&slabs_lock);
    }

    // Note: slab_rebal.* is occasionally protected under slabs_lock, but
    // the mover thread is the only user while active: so it's only necessary
    // for start/stop synchronization.
    slab_rebal.slab_pos = (char *)slab_rebal.slab_pos + s_cls->size;

    if (slab_rebal.slab_pos >= slab_rebal.slab_end) {
        /* Some items were busy, start again from the top */
        if (slab_rebal.busy_items) {
            slab_rebal.slab_pos = slab_rebal.slab_start;
            STATS_LOCK();
            stats.slab_reassign_busy_items += slab_rebal.busy_items;
            STATS_UNLOCK();
            slab_rebal.busy_items = 0;
            slab_rebal.busy_loops++;
        } else {
            /* Entire page cleared; finish on the next thread iteration. */
            slab_rebal.done++;
        }
    }

    return was_busy;
}
Packit 4e8bc4
Packit 4e8bc4
static void slab_rebalance_finish(void) {
Packit 4e8bc4
    slabclass_t *s_cls;
Packit 4e8bc4
    slabclass_t *d_cls;
Packit 4e8bc4
    int x;
Packit 4e8bc4
    uint32_t rescues;
Packit 4e8bc4
    uint32_t evictions_nomem;
Packit 4e8bc4
    uint32_t inline_reclaim;
Packit 4e8bc4
    uint32_t chunk_rescues;
Packit 4e8bc4
    uint32_t busy_deletes;
Packit 4e8bc4
Packit 4e8bc4
    pthread_mutex_lock(&slabs_lock);
Packit 4e8bc4
Packit 4e8bc4
    s_cls = &slabclass[slab_rebal.s_clsid];
Packit 4e8bc4
    d_cls = &slabclass[slab_rebal.d_clsid];
Packit 4e8bc4
Packit 4e8bc4
#ifdef DEBUG_SLAB_MOVER
Packit 4e8bc4
    /* If the algorithm is broken, live items can sneak in. */
Packit 4e8bc4
    slab_rebal.slab_pos = slab_rebal.slab_start;
Packit 4e8bc4
    while (1) {
Packit 4e8bc4
        item *it = slab_rebal.slab_pos;
Packit 4e8bc4
        assert(it->it_flags == (ITEM_SLABBED|ITEM_FETCHED));
Packit 4e8bc4
        assert(memcmp(ITEM_key(it), "deadbeef", 8) == 0);
Packit 4e8bc4
        it->it_flags = ITEM_SLABBED|ITEM_FETCHED;
Packit 4e8bc4
        slab_rebal.slab_pos = (char *)slab_rebal.slab_pos + s_cls->size;
Packit 4e8bc4
        if (slab_rebal.slab_pos >= slab_rebal.slab_end)
Packit 4e8bc4
            break;
Packit 4e8bc4
    }
Packit 4e8bc4
#endif
Packit 4e8bc4
Packit 4e8bc4
    /* At this point the stolen slab is completely clear.
Packit 4e8bc4
     * We always kill the "first"/"oldest" slab page in the slab_list, so
Packit 4e8bc4
     * shuffle the page list backwards and decrement.
Packit 4e8bc4
     */
Packit 4e8bc4
    s_cls->slabs--;
Packit 4e8bc4
    for (x = 0; x < s_cls->slabs; x++) {
Packit 4e8bc4
        s_cls->slab_list[x] = s_cls->slab_list[x+1];
Packit 4e8bc4
    }
Packit 4e8bc4
Packit 4e8bc4
    d_cls->slab_list[d_cls->slabs++] = slab_rebal.slab_start;
Packit 4e8bc4
    /* Don't need to split the page into chunks if we're just storing it */
Packit 4e8bc4
    if (slab_rebal.d_clsid > SLAB_GLOBAL_PAGE_POOL) {
Packit 4e8bc4
        memset(slab_rebal.slab_start, 0, (size_t)settings.slab_page_size);
Packit 4e8bc4
        split_slab_page_into_freelist(slab_rebal.slab_start,
Packit 4e8bc4
            slab_rebal.d_clsid);
Packit 4e8bc4
    } else if (slab_rebal.d_clsid == SLAB_GLOBAL_PAGE_POOL) {
Packit 4e8bc4
        /* memset just enough to signal restart handler to skip */
Packit 4e8bc4
        memset(slab_rebal.slab_start, 0, sizeof(item));
Packit 4e8bc4
        /* mem_malloc'ed might be higher than mem_limit. */
Packit 4e8bc4
        mem_limit_reached = false;
Packit 4e8bc4
        memory_release();
Packit 4e8bc4
    }
Packit 4e8bc4
Packit 4e8bc4
    slab_rebal.busy_loops = 0;
Packit 4e8bc4
    slab_rebal.done       = 0;
Packit 4e8bc4
    slab_rebal.s_clsid    = 0;
Packit 4e8bc4
    slab_rebal.d_clsid    = 0;
Packit 4e8bc4
    slab_rebal.slab_start = NULL;
Packit 4e8bc4
    slab_rebal.slab_end   = NULL;
Packit 4e8bc4
    slab_rebal.slab_pos   = NULL;
Packit 4e8bc4
    evictions_nomem    = slab_rebal.evictions_nomem;
Packit 4e8bc4
    inline_reclaim = slab_rebal.inline_reclaim;
Packit 4e8bc4
    rescues   = slab_rebal.rescues;
Packit 4e8bc4
    chunk_rescues = slab_rebal.chunk_rescues;
Packit 4e8bc4
    busy_deletes = slab_rebal.busy_deletes;
Packit 4e8bc4
    slab_rebal.evictions_nomem    = 0;
Packit 4e8bc4
    slab_rebal.inline_reclaim = 0;
Packit 4e8bc4
    slab_rebal.rescues  = 0;
Packit 4e8bc4
    slab_rebal.chunk_rescues = 0;
Packit 4e8bc4
    slab_rebal.busy_deletes = 0;
Packit 4e8bc4
Packit 4e8bc4
    slab_rebalance_signal = 0;
Packit 4e8bc4
Packit 4e8bc4
    free(slab_rebal.completed);
Packit 4e8bc4
    pthread_mutex_unlock(&slabs_lock);
Packit 4e8bc4
Packit 4e8bc4
    STATS_LOCK();
Packit 4e8bc4
    stats.slabs_moved++;
Packit 4e8bc4
    stats.slab_reassign_rescues += rescues;
Packit 4e8bc4
    stats.slab_reassign_evictions_nomem += evictions_nomem;
Packit 4e8bc4
    stats.slab_reassign_inline_reclaim += inline_reclaim;
Packit 4e8bc4
    stats.slab_reassign_chunk_rescues += chunk_rescues;
Packit 4e8bc4
    stats.slab_reassign_busy_deletes += busy_deletes;
Packit 4e8bc4
    stats_state.slab_reassign_running = false;
Packit 4e8bc4
    STATS_UNLOCK();
Packit 4e8bc4
Packit 4e8bc4
    if (settings.verbose > 1) {
Packit 4e8bc4
        fprintf(stderr, "finished a slab move\n");
Packit 4e8bc4
    }
Packit 4e8bc4
}
Packit 4e8bc4
Packit 4e8bc4
/* Slab mover thread.
Packit 4e8bc4
 * Sits waiting for a condition to jump off and shovel some memory about
Packit 4e8bc4
 */
Packit 4e8bc4
/* Main loop of the slab mover thread: waits on slab_rebalance_cond, then
 * drives a page move via slab_rebalance_start() / slab_rebalance_move() /
 * slab_rebalance_finish(), backing off exponentially when chunks are busy.
 * Holds slabs_rebalance_lock whenever it is running (released only inside
 * cond_wait). Exits when do_run_slab_rebalance_thread is cleared and no move
 * is in flight. */
static void *slab_rebalance_thread(void *arg) {
    int was_busy = 0;
    int backoff_timer = 1;      /* usleep duration, doubled while busy */
    int backoff_max = 1000;     /* cap on the backoff, in microseconds */
    /* So we first pass into cond_wait with the mutex held */
    mutex_lock(&slabs_rebalance_lock);

    /* Must finish moving page before stopping */
    while (slab_rebalance_signal || do_run_slab_rebalance_thread) {
        if (slab_rebalance_signal == 1) {
            /* 1 == a new reassignment was requested; set it up
             * (start() flips the signal to 2 on success). */
            if (slab_rebalance_start() < 0) {
                /* Handle errors with more specificity as required. */
                slab_rebalance_signal = 0;
            }

            was_busy = 0;
        } else if (slab_rebalance_signal && slab_rebal.slab_start != NULL) {
            /* Move one chunk per iteration. */
            was_busy = slab_rebalance_move();
        }

        if (slab_rebal.done) {
            slab_rebalance_finish();
        } else if (was_busy) {
            /* Stuck waiting for some items to unlock, so slow down a bit
             * to give them a chance to free up */
            usleep(backoff_timer);
            backoff_timer = backoff_timer * 2;
            if (backoff_timer > backoff_max)
                backoff_timer = backoff_max;
        }

        if (slab_rebalance_signal == 0) {
            /* always hold this lock while we're running */
            pthread_cond_wait(&slab_rebalance_cond, &slabs_rebalance_lock);
        }
    }

    // TODO: cancel in-flight slab page move
    mutex_unlock(&slabs_rebalance_lock);
    return NULL;
}
Packit 4e8bc4
Packit 4e8bc4
/* Iterate at most once through the slab classes and pick a "random" source.
Packit 4e8bc4
 * I like this better than calling rand() since rand() is slow enough that we
Packit 4e8bc4
 * can just check all of the classes once instead.
Packit 4e8bc4
 */
Packit 4e8bc4
static int slabs_reassign_pick_any(int dst) {
Packit 4e8bc4
    static int cur = POWER_SMALLEST - 1;
Packit 4e8bc4
    int tries = power_largest - POWER_SMALLEST + 1;
Packit 4e8bc4
    for (; tries > 0; tries--) {
Packit 4e8bc4
        cur++;
Packit 4e8bc4
        if (cur > power_largest)
Packit 4e8bc4
            cur = POWER_SMALLEST;
Packit 4e8bc4
        if (cur == dst)
Packit 4e8bc4
            continue;
Packit 4e8bc4
        if (slabclass[cur].slabs > 1) {
Packit 4e8bc4
            return cur;
Packit 4e8bc4
        }
Packit 4e8bc4
    }
Packit 4e8bc4
    return -1;
Packit 4e8bc4
}
Packit 4e8bc4
Packit 4e8bc4
static enum reassign_result_type do_slabs_reassign(int src, int dst) {
Packit 4e8bc4
    bool nospare = false;
Packit 4e8bc4
    if (slab_rebalance_signal != 0)
Packit 4e8bc4
        return REASSIGN_RUNNING;
Packit 4e8bc4
Packit 4e8bc4
    if (src == dst)
Packit 4e8bc4
        return REASSIGN_SRC_DST_SAME;
Packit 4e8bc4
Packit 4e8bc4
    /* Special indicator to choose ourselves. */
Packit 4e8bc4
    if (src == -1) {
Packit 4e8bc4
        src = slabs_reassign_pick_any(dst);
Packit 4e8bc4
        /* TODO: If we end up back at -1, return a new error type */
Packit 4e8bc4
    }
Packit 4e8bc4
Packit 4e8bc4
    if (src < SLAB_GLOBAL_PAGE_POOL || src > power_largest ||
Packit 4e8bc4
        dst < SLAB_GLOBAL_PAGE_POOL || dst > power_largest)
Packit 4e8bc4
        return REASSIGN_BADCLASS;
Packit 4e8bc4
Packit 4e8bc4
    pthread_mutex_lock(&slabs_lock);
Packit 4e8bc4
    if (slabclass[src].slabs < 2)
Packit 4e8bc4
        nospare = true;
Packit 4e8bc4
    pthread_mutex_unlock(&slabs_lock);
Packit 4e8bc4
    if (nospare)
Packit 4e8bc4
        return REASSIGN_NOSPARE;
Packit 4e8bc4
Packit 4e8bc4
    slab_rebal.s_clsid = src;
Packit 4e8bc4
    slab_rebal.d_clsid = dst;
Packit 4e8bc4
Packit 4e8bc4
    slab_rebalance_signal = 1;
Packit 4e8bc4
    pthread_cond_signal(&slab_rebalance_cond);
Packit 4e8bc4
Packit 4e8bc4
    return REASSIGN_OK;
Packit 4e8bc4
}
Packit 4e8bc4
Packit 4e8bc4
enum reassign_result_type slabs_reassign(int src, int dst) {
Packit 4e8bc4
    enum reassign_result_type ret;
Packit 4e8bc4
    if (pthread_mutex_trylock(&slabs_rebalance_lock) != 0) {
Packit 4e8bc4
        return REASSIGN_RUNNING;
Packit 4e8bc4
    }
Packit 4e8bc4
    ret = do_slabs_reassign(src, dst);
Packit 4e8bc4
    pthread_mutex_unlock(&slabs_rebalance_lock);
Packit 4e8bc4
    return ret;
Packit 4e8bc4
}
Packit 4e8bc4
Packit 4e8bc4
/* If we hold this lock, rebalancer can't wake up or move */
/* Block the rebalancer: it holds slabs_rebalance_lock whenever it runs, so
 * taking the lock here parks it. Pair with slabs_rebalancer_resume(). */
void slabs_rebalancer_pause(void) {
    pthread_mutex_lock(&slabs_rebalance_lock);
}
Packit 4e8bc4
Packit 4e8bc4
/* Release the lock taken by slabs_rebalancer_pause(), letting the
 * rebalance thread run again. */
void slabs_rebalancer_resume(void) {
    pthread_mutex_unlock(&slabs_rebalance_lock);
}
Packit 4e8bc4
Packit 4e8bc4
/* Handle of the slab rebalance thread; joined on shutdown. */
static pthread_t rebalance_tid;
Packit 4e8bc4
Packit 4e8bc4
int start_slab_maintenance_thread(void) {
Packit 4e8bc4
    int ret;
Packit 4e8bc4
    slab_rebalance_signal = 0;
Packit 4e8bc4
    slab_rebal.slab_start = NULL;
Packit 4e8bc4
Packit 4e8bc4
    if ((ret = pthread_create(&rebalance_tid, NULL,
Packit 4e8bc4
                              slab_rebalance_thread, NULL)) != 0) {
Packit 4e8bc4
        fprintf(stderr, "Can't create rebal thread: %s\n", strerror(ret));
Packit 4e8bc4
        return -1;
Packit 4e8bc4
    }
Packit 4e8bc4
    return 0;
Packit 4e8bc4
}
Packit 4e8bc4
Packit 4e8bc4
/* The maintenance thread is on a sleep/loop cycle, so it should join after a
Packit 4e8bc4
 * short wait */
Packit 4e8bc4
void stop_slab_maintenance_thread(void) {
Packit 4e8bc4
    mutex_lock(&slabs_rebalance_lock);
Packit 4e8bc4
    do_run_slab_rebalance_thread = 0;
Packit 4e8bc4
    pthread_cond_signal(&slab_rebalance_cond);
Packit 4e8bc4
    pthread_mutex_unlock(&slabs_rebalance_lock);
Packit 4e8bc4
Packit 4e8bc4
    /* Wait for the maintenance thread to stop */
Packit 4e8bc4
    pthread_join(rebalance_tid, NULL);
Packit 4e8bc4
}