/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
/*
 * See COPYRIGHT in top-level directory.
 */

/*
 * The original version of this code was contributed by Milind Chabbi
 * based on the work when he was at Rice University. It relies on the
 * HMCS lock description in [1] and the fast path described in [2].
 *
 * [1] Chabbi, Milind, Michael Fagan, and John Mellor-Crummey. "High
 * performance locks for multi-level NUMA systems." In Proceedings of
 * the ACM SIGPLAN Symposium on Principles and Practice of Parallel
 * Programming (PPoPP'15), ACM, 2015.
 *
 * [2] Chabbi, Milind, and John Mellor-Crummey. "Contention-conscious,
 * locality-preserving locks." In Proceedings of the 21st ACM SIGPLAN
 * Symposium on Principles and Practice of Parallel Programming (PPoPP'16),
 * p. 22. ACM, 2016.
 */

#include "lock/zm_lock_types.h"
#include <hwloc.h>

#ifndef DEFAULT_THRESHOLD
#define DEFAULT_THRESHOLD 256
#endif

#ifndef HMCS_DEFAULT_MAX_LEVELS
#define HMCS_DEFAULT_MAX_LEVELS 3
#endif
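
/* Both defaults can be overridden at run time: ZM_HMCS_THRESHOLD and
 * ZM_HMCS_MAX_LEVELS are read from the environment in new_lock() and
 * set_hierarchy() below. */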

#define WAIT (0xffffffff)
#define COHORT_START (0x1)
#define ACQUIRE_PARENT (0xcffffffc)
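/* These values encode the qnode status protocol (see acquire_helper() and
 * release_helper() below): a thread spins while its status is WAIT; a
 * predecessor passes the lock within a cohort by storing the incremented
 * passing count (starting at COHORT_START), and stores ACQUIRE_PARENT once
 * the threshold is reached, telling the successor to acquire the
 * parent-level lock itself. */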

#ifndef TRUE
#define TRUE 1
#else
#error "TRUE already defined"
#endif

#ifndef FALSE
#define FALSE 0
#else
#error "FALSE already defined"
#endif

/* Atomic operation shorthands. The memory ordering defaults to:
 * 1- Acquire ordering for loads
 * 2- Release ordering for stores
 * 3- Acquire+Release ordering for read-modify-write operations
 * */

#define LOAD(addr)                  zm_atomic_load(addr, zm_memord_acquire)
#define STORE(addr, val)            zm_atomic_store(addr, val, zm_memord_release)
#define SWAP(addr, desire)          zm_atomic_exchange_ptr(addr, desire, zm_memord_acq_rel)
#define CAS(addr, expect, desire)   zm_atomic_compare_exchange_strong(addr,\
                                                                      expect,\
                                                                      desire,\
                                                                      zm_memord_acq_rel,\
                                                                      zm_memord_acquire)

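/* One lock node per hierarchy level: the cohort-passing threshold, a pointer
 * to the parent level, the MCS tail pointer for this level, and the qnode
 * used to enqueue at the parent level. */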
struct hnode{
    unsigned threshold __attribute__((aligned(ZM_CACHELINE_SIZE)));
    struct hnode * parent __attribute__((aligned(ZM_CACHELINE_SIZE)));
    zm_atomic_ptr_t lock __attribute__((aligned(ZM_CACHELINE_SIZE)));
    zm_mcs_qnode_t node __attribute__((aligned(ZM_CACHELINE_SIZE)));

}__attribute__((aligned(ZM_CACHELINE_SIZE)));

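/* Per-thread handle: the deepest hnode this thread enqueues at, a cached
 * pointer to the root hnode (for the uncontended fast path), the thread's
 * own qnode, its depth, and whether the last acquire took the fast path. */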
struct leaf{
    struct hnode * cur_node;
    struct hnode * root_node;
    zm_mcs_qnode_t I;
    int curDepth;
    int took_fast_path;
};

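/* The full lock: one leaf per hardware thread (indexed by PU logical index),
 * the hwloc topology used to build the hierarchy, and the number of levels. */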
struct lock{
    // Assumes tids range from [0.. max_threads)
    // Assumes that tid 0 is close to tid 1, and so on.
    struct leaf ** leaf_nodes __attribute__((aligned(ZM_CACHELINE_SIZE)));
    hwloc_topology_t topo;
    int levels;
};

static zm_thread_local int tid = -1;

/* TODO: automate hardware topology detection
 * instead of the below hard-coded method */

/* Check the actual affinity mask assigned to the thread */
static void check_affinity(hwloc_topology_t topo) {
    hwloc_cpuset_t cpuset = hwloc_bitmap_alloc();
    int set_length;
    hwloc_get_cpubind(topo, cpuset, HWLOC_CPUBIND_THREAD);
    set_length = hwloc_get_nbobjs_inside_cpuset_by_type(topo, cpuset, HWLOC_OBJ_PU);
    hwloc_bitmap_free(cpuset);

    if(set_length != 1) {
        printf("IZEM:HMCS:ERROR: thread not bound to exactly one HW thread!\n");
        exit(EXIT_FAILURE);
    }
}

static inline void reuse_qnode(zm_mcs_qnode_t *I){
    STORE(&I->status, WAIT);
    STORE(&I->next, ZM_NULL);
}

static void* new_hnode() {
    int err;
    void *storage;
    err = posix_memalign(&storage, ZM_CACHELINE_SIZE, sizeof(struct hnode));
    if (err != 0) {
        printf("posix_memalign failed in HMCS : new_hnode \n");
        exit(EXIT_FAILURE);
    }
    return storage;
}

/* TODO: Macro or template this for fast comparison */
static inline unsigned get_threshold(struct hnode *L) {
    return L->threshold;
}

static inline void normal_mcs_release_with_value(struct hnode * L, zm_mcs_qnode_t *I, unsigned val){

    zm_mcs_qnode_t *succ = (zm_mcs_qnode_t *)LOAD(&I->next);
    if(succ) {
        STORE(&succ->status, val);
        return;
    }
    zm_mcs_qnode_t *tmp = I;
    if (CAS(&(L->lock), (zm_ptr_t*)&tmp,ZM_NULL))
        return;
    while(succ == NULL)
        succ = (zm_mcs_qnode_t *)LOAD(&I->next); /* SPIN */
    STORE(&succ->status, val);
    return;
}

static inline void acquire_root(struct hnode * L, zm_mcs_qnode_t *I) {
    // Prepare the node for use.
    reuse_qnode(I);
    zm_mcs_qnode_t *pred = (zm_mcs_qnode_t*) SWAP(&(L->lock), (zm_ptr_t)I);

    if(!pred) {
        // I am the first one at this level
        return;
    }

    STORE(&pred->next, I);
    while(LOAD(&I->status) == WAIT)
        ; /* SPIN */
    return;
}

static inline void tryacq_root(struct hnode * L, zm_mcs_qnode_t *I, int *success) {
    zm_ptr_t expected = ZM_NULL;
    // Prepare the node for use.
    reuse_qnode(I);
    *success = CAS(&(L->lock), &expected, (zm_ptr_t)I);

    return;
}

static inline void release_root(struct hnode * L, zm_mcs_qnode_t *I) {
    // Top-level release is the usual MCS release.
    // At the top level we always write COHORT_START since:
    // 1. it releases the lock,
    // 2. the passing count will never grow large, and
    // 3. it avoids a read from I->status.
    normal_mcs_release_with_value(L, I, COHORT_START);
}

static inline int nowaiters_root(struct hnode * L, zm_mcs_qnode_t *I) {
    return (LOAD(&I->next) == ZM_NULL);
}

static inline void acquire_helper(int level, struct hnode * L, zm_mcs_qnode_t *I) {
    // Trivial case = root level
    if (level == 1)
        acquire_root(L, I);
    else {
        // Prepare the node for use.
        reuse_qnode(I);
        zm_mcs_qnode_t* pred = (zm_mcs_qnode_t*)SWAP(&(L->lock), (zm_ptr_t)I);
        if(!pred) {
            // I am the first one at this level
            // beginning of cohort
            STORE(&I->status, COHORT_START);
            // acquire at next level if not at the top level
            acquire_helper(level - 1, L->parent, &(L->node));
            return;
        } else {
            STORE(&pred->next, I);
            for(;;){
                unsigned myStatus = LOAD(&I->status);
                if(myStatus < ACQUIRE_PARENT) {
                    return;
                }
                if(myStatus == ACQUIRE_PARENT) {
                    // beginning of cohort
                    STORE(&I->status, COHORT_START);
                    // This means this level is acquired and we can start the next level
                    acquire_helper(level - 1, L->parent, &(L->node));
                    return;
                }
                // spin back; (I->status == WAIT)
            }
        }
    }
}

static inline void release_helper(int level, struct hnode * L, zm_mcs_qnode_t *I) {
    // Trivial case = root level
    if (level == 1) {
        release_root(L, I);
    } else {
        unsigned cur_count = LOAD(&(I->status)) ;
        zm_mcs_qnode_t * succ;

        // Lower level releases
        if(cur_count == get_threshold(L)) {
            // Reached the threshold at this level:
            // release to the next (parent) level
            release_helper(level - 1, L->parent, &(L->node));
            //COMMIT_ALL_WRITES();
            // Tap successor at this level and ask to spin acquire next level lock
            normal_mcs_release_with_value(L, I, ACQUIRE_PARENT);
            return;
        }

        succ = (zm_mcs_qnode_t*)LOAD(&(I->next));
        // Threshold not reached; pass the lock within the cohort
        if(succ) {
            STORE(&succ->status, cur_count + 1);
            return; // released
        }
        // No known successor, so release
        release_helper(level - 1, L->parent, &(L->node));
        // Tap successor at this level and ask to spin acquire next level lock
        normal_mcs_release_with_value(L, I, ACQUIRE_PARENT);
    }
}

static inline int nowaiters_helper(int level, struct hnode * L, zm_mcs_qnode_t *I) {
    if (level == 1 ) {
        return nowaiters_root(L,I);
    } else {
        if(LOAD(&I->next) != ZM_NULL)
            return FALSE;
        else
            return nowaiters_helper(level - 1, L->parent, &(L->node));
    }
}

static void* new_leaf(struct hnode *h, int depth) {
    int err;
    struct leaf *leaf;
    err = posix_memalign((void **) &leaf, ZM_CACHELINE_SIZE, sizeof(struct leaf));
    if (err != 0) {
        printf("posix_memalign failed in HMCS : new_leaf \n");
        exit(EXIT_FAILURE);
    }
    leaf->cur_node = h;
    leaf->curDepth = depth;
    leaf->took_fast_path = FALSE;
    struct hnode *tmp, *root_node;
    for(tmp = leaf->cur_node; tmp->parent != NULL; tmp = tmp->parent);
    root_node = tmp;
    leaf->root_node = root_node;
    return leaf;
}

static inline void acquire_from_leaf(int level, struct leaf *L){
    if((zm_ptr_t)L->cur_node->lock == ZM_NULL
    && (zm_ptr_t)L->root_node->lock == ZM_NULL) {
        // go FP
        L->took_fast_path = TRUE;
        acquire_root(L->root_node, &L->I);
        return;
    }
    acquire_helper(level, L->cur_node, &L->I);
    return;
}

static inline void tryacq_from_leaf(int level, struct leaf *L, int *success){
    *success = 0;
    if((zm_ptr_t)L->cur_node->lock == ZM_NULL
    && (zm_ptr_t)L->root_node->lock == ZM_NULL) {
        tryacq_root(L->root_node, &L->I, success);
        if (*success)
            L->took_fast_path = TRUE;
    }
    return;
}

static inline void release_from_leaf(int level, struct leaf *L){
    //myrelease(cur_node, I);
    if(L->took_fast_path) {
        release_root(L->root_node, &L->I);
        L->took_fast_path = FALSE;
        return;
    }
    release_helper(level, L->cur_node, &L->I);
    return;
}

static inline int nowaiters_from_leaf(int level, struct leaf *L){
    // Shouldn't this be nowaiters(root_node, I)?
    if(L->took_fast_path) {
        return nowaiters_root(L->cur_node, &L->I);
    }

    return nowaiters_helper(level, L->cur_node, &L->I);
}

static int get_hwthread_id(hwloc_topology_t topo){
    hwloc_cpuset_t cpuset = hwloc_bitmap_alloc();
    hwloc_obj_t obj;
    hwloc_get_cpubind(topo, cpuset, HWLOC_CPUBIND_THREAD);
    obj = hwloc_get_obj_inside_cpuset_by_type(topo, cpuset, HWLOC_OBJ_PU, 0);
    hwloc_bitmap_free(cpuset);
    return obj->logical_index;
}

static void set_hierarchy(struct lock *L, int *max_threads, int** particip_per_level) {
    int max_depth, levels = 0, max_levels = HMCS_DEFAULT_MAX_LEVELS, explicit_levels = 0;
    char tmp[20];
    char *s = getenv("ZM_HMCS_MAX_LEVELS");
    if (s != NULL)
        max_levels = atoi(s);
    int depths[max_levels];
    int idx = 0;
    /* advice to users: run `hwloc-ls -s --no-io --no-icaches` and choose
     * depths of interest in ascending order. The first depth must be `0' */
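    /* For example, on a hypothetical machine where `hwloc-ls -s` reports
     * Machine at depth 0, Package at depth 1, and L3 at depth 3, one might
     * set ZM_HMCS_MAX_LEVELS=3 and ZM_HMCS_EXPLICIT_LEVELS=0,1,3 (hypothetical
     * depth values; check the output on the actual machine). */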

    s = getenv("ZM_HMCS_EXPLICIT_LEVELS");
    if (s != NULL) {
        strcpy(tmp, s);
        explicit_levels = 1;
        char* token;
        token = strtok(tmp,",");
        while(token != NULL) {
            depths[idx] = atoi(token);
            if (idx == 0)
                assert(depths[idx] == 0 && "the first depth must be machine level (i.e., depth 0), run `hwloc-ls -s --no-io --no-icaches` and choose appropriate depth values");
            idx++;
            token = strtok(NULL,",");
        }
        assert(idx == max_levels);
    }

    hwloc_topology_init(&L->topo);
    hwloc_topology_load(L->topo);

    *max_threads = hwloc_get_nbobjs_by_type(L->topo, HWLOC_OBJ_PU);

    max_depth = hwloc_topology_get_depth(L->topo);
    assert(max_depth >= 2); /* At least Machine and Core levels exist */

    *particip_per_level = (int*) malloc(max_levels * sizeof(int));
    int prev_nobjs = -1;
    if(!explicit_levels) {
        for (int d = max_depth - 2; d > 1; d--) {
            int cur_nobjs = hwloc_get_nbobjs_by_depth(L->topo, d);
            /* Check if this level has a hierarchical impact */
            if(cur_nobjs != prev_nobjs) {
                prev_nobjs = cur_nobjs;
                (*particip_per_level)[levels] = (*max_threads)/cur_nobjs;
                levels++;
                if(levels >= max_levels - 1)
                    break;
            }
        }
        (*particip_per_level)[levels] = *max_threads;
        levels++;
    } else {
        for(int i = max_levels - 1; i >= 0; i--){
            int d = depths[i];
            int cur_nobjs = hwloc_get_nbobjs_by_depth(L->topo, d);
            /* Check if this level has a hierarchical impact */
            if(cur_nobjs != prev_nobjs) {
                prev_nobjs = cur_nobjs;
                (*particip_per_level)[levels] = (*max_threads)/cur_nobjs;
                levels++;
            } else {
                assert(0 && "please choose levels that have a hierarchical impact");
            }
        }
    }

    L->levels = levels;
}

static void free_hierarchy(int* particip_per_level){
    free(particip_per_level);
}

static void* new_lock(){

    struct lock *L;
    posix_memalign((void **) &L, ZM_CACHELINE_SIZE, sizeof(struct lock));

    int max_threads;
    int *participants_at_level;
    set_hierarchy(L, &max_threads, &participants_at_level);

    // Total hnodes needed = max_threads/participants_at_level[0] + ... +
    // max_threads/participants_at_level[levels-1] (the last term is 1, the root).
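    // For example, on a hypothetical 16-PU machine with two NUMA nodes of
    // 8 PUs each (participants_at_level = {8, 16}), this yields
    // 16/8 + 16/16 = 2 + 1 = 3 hnodes.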
    // Save affinity
    hwloc_cpuset_t cpuset = hwloc_bitmap_alloc();
    hwloc_get_cpubind(L->topo, cpuset, HWLOC_CPUBIND_THREAD);

    int total_locks_needed = 0;
    int levels = L->levels;

    for (int i=0; i < levels; i++) {
        total_locks_needed += max_threads / participants_at_level[i] ;
    }
    struct hnode ** lock_locations;
    posix_memalign((void **) &lock_locations, ZM_CACHELINE_SIZE, sizeof(struct hnode*) * total_locks_needed);
    struct leaf ** leaf_nodes;
    posix_memalign((void **) &leaf_nodes, ZM_CACHELINE_SIZE, sizeof(struct leaf*) * max_threads);

    int threshold = DEFAULT_THRESHOLD;
    char *s = getenv("ZM_HMCS_THRESHOLD");
    if (s != NULL)
        threshold = atoi(s);

    hwloc_obj_t obj;
    for(int tid = 0 ; tid < max_threads; tid ++){
        // Pin this thread to hw-thread-id = tid
        obj = hwloc_get_obj_by_type (L->topo, HWLOC_OBJ_PU, tid);
        hwloc_set_cpubind(L->topo, obj->cpuset, HWLOC_CPUBIND_THREAD);
        int last_lock_location_end = 0;
        for(int cur_level = 0 ; cur_level < levels; cur_level++){
            if (tid%participants_at_level[cur_level] == 0) {
                // master, initialize the lock
                int lock_location = last_lock_location_end + tid/participants_at_level[cur_level];
                last_lock_location_end += max_threads/participants_at_level[cur_level];
                struct hnode * cur_hnode = new_hnode();
                cur_hnode->threshold = threshold;
                cur_hnode->parent = NULL;
                cur_hnode->lock = ZM_NULL;
                lock_locations[lock_location] = cur_hnode;
            }
        }
    }

    // setup parents
    for(int tid = 0 ; tid < max_threads; tid ++){
        obj = hwloc_get_obj_by_type (L->topo, HWLOC_OBJ_PU, tid);
        hwloc_set_cpubind(L->topo, obj->cpuset, HWLOC_CPUBIND_THREAD);
        int last_lock_location_end = 0;
        for(int cur_level = 0 ; cur_level < levels - 1; cur_level++){
            if (tid%participants_at_level[cur_level] == 0) {
                int lock_location = last_lock_location_end + tid/participants_at_level[cur_level];
                last_lock_location_end += max_threads/participants_at_level[cur_level];
                int parentLockLocation = last_lock_location_end + tid/participants_at_level[cur_level+1];
                lock_locations[lock_location]->parent = lock_locations[parentLockLocation];
            }
        }
        leaf_nodes[tid] = (struct leaf*)new_leaf(lock_locations[tid/participants_at_level[0]], levels);
    }
    free(lock_locations);
    free_hierarchy(participants_at_level);
    // Restore affinity
    hwloc_set_cpubind(L->topo, cpuset, HWLOC_CPUBIND_THREAD);
    L->leaf_nodes = leaf_nodes;

    hwloc_bitmap_free(cpuset);

    return L;
}

static void search_nodes_rec(struct hnode *node, struct hnode **nodes_to_free, int *num_ptrs, int max_threads) {
    int i;
    if(node != NULL) {
        for(i = 0; i < *num_ptrs; i++) {
            if(node == nodes_to_free[i])
                break; /* already marked to be free'd */
        }
        if(i == *num_ptrs) { /* newly encountered pointer */
            nodes_to_free[*num_ptrs] = node;
            (*num_ptrs)++;
            assert(*num_ptrs < 2*max_threads);
        }
        search_nodes_rec(node->parent, nodes_to_free, num_ptrs, max_threads);
    }
}

static void free_lock(struct lock* L) {
    int max_threads = hwloc_get_nbobjs_by_type(L->topo, HWLOC_OBJ_PU);
    int num_ptrs = 0;
    struct hnode **nodes_to_free = (struct hnode**) malloc(2*max_threads*sizeof(struct hnode*));
    for (int tid = 0; tid < max_threads; tid++) {
        search_nodes_rec(L->leaf_nodes[tid]->cur_node, nodes_to_free, &num_ptrs, max_threads);
        free(L->leaf_nodes[tid]);
    }
    free(L->leaf_nodes);
    for(int i = 0; i < num_ptrs; i++)
        free(nodes_to_free[i]);
    free(nodes_to_free);
    hwloc_topology_destroy(L->topo);
    free(L);
}

static inline void hmcs_acquire(struct lock *L){
    if (zm_unlikely(tid == -1)) {
        check_affinity(L->topo);
        tid = get_hwthread_id(L->topo);
    }
    acquire_from_leaf(L->levels, L->leaf_nodes[tid]);
}

static inline void hmcs_tryacq(struct lock *L, int *success){
    if (zm_unlikely(tid == -1)) {
        check_affinity(L->topo);
        tid = get_hwthread_id(L->topo);
    }
    tryacq_from_leaf(L->levels, L->leaf_nodes[tid], success);
}

static inline void hmcs_release(struct lock *L){
    release_from_leaf(L->levels, L->leaf_nodes[tid]);
}

static inline int hmcs_nowaiters(struct lock *L){
    return nowaiters_from_leaf(L->levels, L->leaf_nodes[tid]);
}

int zm_hmcs_init(zm_hmcs_t * handle) {
    void *p = new_lock();
    *handle  = (zm_hmcs_t) p;
    return 0;
}

int zm_hmcs_destroy(zm_hmcs_t *L) {
    free_lock((struct lock*)(*L));
    return 0;
}

int zm_hmcs_acquire(zm_hmcs_t L){
    hmcs_acquire((struct lock*)L);
    return 0;
}

int zm_hmcs_tryacq(zm_hmcs_t L, int *success){
    hmcs_tryacq((struct lock*)L, success);
    return 0;
}

int zm_hmcs_release(zm_hmcs_t L){
    hmcs_release((struct lock*)L);
    return 0;
}
int zm_hmcs_nowaiters(zm_hmcs_t L){
    return hmcs_nowaiters((struct lock*)L);
}
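
/* Example usage (a minimal sketch assuming only the zm_hmcs_t API above;
 * error checking omitted):
 *
 *     zm_hmcs_t lock;
 *     zm_hmcs_init(&lock);      // builds the hierarchy from the hwloc topology
 *
 *     // each participating thread must be bound to a single hardware thread
 *     zm_hmcs_acquire(lock);
 *     // ... critical section ...
 *     zm_hmcs_release(lock);
 *
 *     zm_hmcs_destroy(&lock);
 */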