/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
/*
 * See COPYRIGHT in top-level directory.
 */

/*
 * The original version of this code was contributed by Milind Chabbi
 * based on the work when he was at Rice University. It relies on the
 * HMCS lock description in [1] and the fast path described in [2].
 *
 * [1] Chabbi, Milind, Michael Fagan, and John Mellor-Crummey. "High
 * performance locks for multi-level NUMA systems." In Proceedings of
 * the ACM SIGPLAN Symposium on Principles and Practice of Parallel
 * Programming (PPoPP'15), ACM, 2015.
 *
 * [2] Chabbi, Milind, and John Mellor-Crummey. "Contention-conscious,
 * locality-preserving locks." In Proceedings of the 21st ACM SIGPLAN
 * Symposium on Principles and Practice of Parallel Programming
 * (PPoPP'16), p. 22, ACM, 2016.
 */

#include "lock/zm_lock_types.h"
#include <hwloc.h>
/* standard headers needed by this file */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>

#ifndef DEFAULT_THRESHOLD
#define DEFAULT_THRESHOLD 256
#endif

#ifndef HMCS_DEFAULT_MAX_LEVELS
#define HMCS_DEFAULT_MAX_LEVELS 3
#endif

#define WAIT (0xffffffff)
#define COHORT_START (0x1)
#define ACQUIRE_PARENT (0xcffffffc)

#ifndef TRUE
#define TRUE 1
#else
#error "TRUE already defined"
#endif

#ifndef FALSE
#define FALSE 0
#else
#error "FALSE already defined"
#endif

/* Atomic operation shorthands. The memory ordering defaults to:
 * 1- Acquire ordering for loads
 * 2- Release ordering for stores
 * 3- Acquire+Release ordering for read-modify-write operations
 */
#define LOAD(addr)                zm_atomic_load(addr, zm_memord_acquire)
#define STORE(addr, val)          zm_atomic_store(addr, val, zm_memord_release)
#define SWAP(addr, desire)        zm_atomic_exchange_ptr(addr, desire, zm_memord_acq_rel)
#define CAS(addr, expect, desire) zm_atomic_compare_exchange_strong(addr,\
                                                                    expect,\
                                                                    desire,\
                                                                    zm_memord_acq_rel,\
                                                                    zm_memord_acquire)
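/* Status-word protocol (derived from the code below): a waiting qnode spins
 * on I->status, which encodes both "wait/go" and the local pass count:
 *   WAIT             -- still queued; keep spinning.
 *   COHORT_START (1) -- lock granted; this thread begins a new local cohort.
 *   k < ACQUIRE_PARENT -- lock granted; k passes have already happened at
 *                       this hnode. Once k reaches the hnode's threshold,
 *                       the holder releases upward instead of passing on.
 *   ACQUIRE_PARENT   -- the local lock is granted, but the previous cohort
 *                       ended, so this thread must also acquire the parent
 *                       level before entering the critical section.
 */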
/* One node in the lock hierarchy (e.g., a socket or the whole machine).
 * Each field lives on its own cache line to avoid false sharing. */
struct hnode {
    unsigned threshold __attribute__((aligned(ZM_CACHELINE_SIZE)));
    struct hnode *parent __attribute__((aligned(ZM_CACHELINE_SIZE)));
    zm_atomic_ptr_t lock __attribute__((aligned(ZM_CACHELINE_SIZE)));
    zm_mcs_qnode_t node __attribute__((aligned(ZM_CACHELINE_SIZE)));
} __attribute__((aligned(ZM_CACHELINE_SIZE)));

/* Per-thread entry point into the hierarchy. */
struct leaf {
    struct hnode *cur_node;
    struct hnode *root_node;
    zm_mcs_qnode_t I;
    int curDepth;
    int took_fast_path;
};

/* Top-level lock object.
 * Assumes TIDs range over [0..max_threads) and that TID 0 is close to
 * TID 1, and so on (i.e., TIDs follow the hardware topology order). */
struct lock {
    struct leaf **leaf_nodes __attribute__((aligned(ZM_CACHELINE_SIZE)));
    hwloc_topology_t topo;
    int levels;
};

static zm_thread_local int tid = -1;

/* TODO: automate hardware topology detection
 * instead of the below hard-coded method */

/* Check the actual affinity mask assigned to the thread */
static void check_affinity(hwloc_topology_t topo) {
    hwloc_cpuset_t cpuset = hwloc_bitmap_alloc();
    int set_length;
    hwloc_get_cpubind(topo, cpuset, HWLOC_CPUBIND_THREAD);
    set_length = hwloc_get_nbobjs_inside_cpuset_by_type(topo, cpuset, HWLOC_OBJ_PU);
    hwloc_bitmap_free(cpuset);

    if (set_length != 1) {
        printf("IZEM:HMCS:ERROR: thread bound to more than one HW thread!\n");
        exit(EXIT_FAILURE);
    }
}

static inline void reuse_qnode(zm_mcs_qnode_t *I) {
    STORE(&I->status, WAIT);
    STORE(&I->next, ZM_NULL);
}

static void* new_hnode() {
    int err;
    void *storage;
    err = posix_memalign(&storage, ZM_CACHELINE_SIZE, sizeof(struct hnode));
    if (err != 0) {
        printf("posix_memalign failed in HMCS : new_hnode\n");
        exit(EXIT_FAILURE);
    }
    return storage;
}

/* TODO: macro or template this for fast comparison */
static inline unsigned get_threshold(struct hnode *L) {
    return L->threshold;
}

static inline void normal_mcs_release_with_value(struct hnode *L, zm_mcs_qnode_t *I, unsigned val) {
    zm_mcs_qnode_t *succ = (zm_mcs_qnode_t *)LOAD(&I->next);
    if (succ) {
        STORE(&succ->status, val);
        return;
    }
    zm_mcs_qnode_t *tmp = I;
    if (CAS(&(L->lock), (zm_ptr_t *)&tmp, ZM_NULL))
        return;
    while (succ == NULL)
        succ = (zm_mcs_qnode_t *)LOAD(&I->next); /* SPIN */
    STORE(&succ->status, val);
    return;
}

static inline void acquire_root(struct hnode *L, zm_mcs_qnode_t *I) {
    // Prepare the node for use.
    reuse_qnode(I);
    zm_mcs_qnode_t *pred = (zm_mcs_qnode_t *)SWAP(&(L->lock), (zm_ptr_t)I);

    if (!pred) {
        // I am the first one at this level
        return;
    }

    STORE(&pred->next, I);
    while (LOAD(&I->status) == WAIT)
        ; /* SPIN */
    return;
}

static inline void tryacq_root(struct hnode *L, zm_mcs_qnode_t *I, int *success) {
    zm_ptr_t expected = ZM_NULL;
    // Prepare the node for use.
    reuse_qnode(I);
    *success = CAS(&(L->lock), &expected, (zm_ptr_t)I);
    return;
}

static inline void release_root(struct hnode *L, zm_mcs_qnode_t *I) {
    // Top-level release is the usual MCS release.
    // At the top-level MCS we always write COHORT_START since
    // 1. It will release the lock
    // 2. It will never grow large
    // 3. It avoids a read from I->status
    normal_mcs_release_with_value(L, I, COHORT_START);
}

static inline int nowaiters_root(struct hnode *L, zm_mcs_qnode_t *I) {
    return (LOAD(&I->next) == ZM_NULL);
}
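/* Acquire the lock at hnode L, which sits `level` levels above the root
 * (level 1 is the root itself). A thread that finds the local queue empty,
 * or that is handed ACQUIRE_PARENT by the previous cohort, starts a new
 * cohort (status = COHORT_START) and recurses one level up using the
 * hnode's embedded qnode; otherwise it spins until its predecessor passes
 * it the lock together with the current pass count. */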
static inline void acquire_helper(int level, struct hnode *L, zm_mcs_qnode_t *I) {
    // Trivial case = root level
    if (level == 1)
        acquire_root(L, I);
    else {
        // Prepare the node for use.
        reuse_qnode(I);
        zm_mcs_qnode_t *pred = (zm_mcs_qnode_t *)SWAP(&(L->lock), (zm_ptr_t)I);
        if (!pred) {
            // I am the first one at this level
            // beginning of cohort
            STORE(&I->status, COHORT_START);
            // acquire at next level if not at the top level
            acquire_helper(level - 1, L->parent, &(L->node));
            return;
        } else {
            STORE(&pred->next, I);
            for (;;) {
                unsigned myStatus = LOAD(&I->status);
                if (myStatus < ACQUIRE_PARENT) {
                    return;
                }
                if (myStatus == ACQUIRE_PARENT) {
                    // beginning of cohort
                    STORE(&I->status, COHORT_START);
                    // This level is acquired; we can start on the next level
                    acquire_helper(level - 1, L->parent, &(L->node));
                    return;
                }
                // spin back; (I->status == WAIT)
            }
        }
    }
}

static inline void release_helper(int level, struct hnode *L, zm_mcs_qnode_t *I) {
    // Trivial case = root level
    if (level == 1) {
        release_root(L, I);
    } else {
        unsigned cur_count = LOAD(&(I->status));
        zm_mcs_qnode_t *succ;

        // Lower-level releases
        if (cur_count == get_threshold(L)) {
            // NO KNOWN SUCCESSORS / DESCENDANTS
            // reached threshold; release to the next level
            release_helper(level - 1, L->parent, &(L->node));
            //COMMIT_ALL_WRITES();
            // Tap successor at this level and ask it to spin-acquire the next-level lock
            normal_mcs_release_with_value(L, I, ACQUIRE_PARENT);
            return;
        }

        succ = (zm_mcs_qnode_t *)LOAD(&(I->next));
        // Not reached threshold
        if (succ) {
            STORE(&succ->status, cur_count + 1);
            return; // released
        }
        // No known successor, so release to the next level
        release_helper(level - 1, L->parent, &(L->node));
        // Tap successor at this level and ask it to spin-acquire the next-level lock
        normal_mcs_release_with_value(L, I, ACQUIRE_PARENT);
    }
}

static inline int nowaiters_helper(int level, struct hnode *L, zm_mcs_qnode_t *I) {
    if (level == 1) {
        return nowaiters_root(L, I);
    } else {
        if (LOAD(&I->next) != ZM_NULL)
            return FALSE;
        else
            return nowaiters_helper(level - 1, L->parent, &(L->node));
    }
}

static void* new_leaf(struct hnode *h, int depth) {
    int err;
    struct leaf *leaf;
    err = posix_memalign((void **)&leaf, ZM_CACHELINE_SIZE, sizeof(struct leaf));
    if (err != 0) {
        printf("posix_memalign failed in HMCS : new_leaf\n");
        exit(EXIT_FAILURE);
    }

    leaf->cur_node = h;
    leaf->curDepth = depth;
    leaf->took_fast_path = FALSE;

    // Walk up to find and cache the root hnode.
    struct hnode *tmp, *root_node;
    for (tmp = leaf->cur_node; tmp->parent != NULL; tmp = tmp->parent)
        ;
    root_node = tmp;

    leaf->root_node = root_node;
    return leaf;
}
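/* Fast path (from [2]): when both this thread's local lock and the root
 * lock appear free, skip the intermediate levels and compete directly for
 * the root. The two pointer reads below are only a heuristic; mutual
 * exclusion still rests on the atomic SWAP/CAS performed at the root. */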
static inline void acquire_from_leaf(int level, struct leaf *L) {
    if ((zm_ptr_t)L->cur_node->lock == ZM_NULL
        && (zm_ptr_t)L->root_node->lock == ZM_NULL) {
        // go FP
        L->took_fast_path = TRUE;
        acquire_root(L->root_node, &L->I);
        return;
    }
    acquire_helper(level, L->cur_node, &L->I);
    return;
}

static inline void tryacq_from_leaf(int level, struct leaf *L, int *success) {
    *success = 0;
    if ((zm_ptr_t)L->cur_node->lock == ZM_NULL
        && (zm_ptr_t)L->root_node->lock == ZM_NULL) {
        tryacq_root(L->root_node, &L->I, success);
        if (*success)
            L->took_fast_path = TRUE;
    }
    return;
}

static inline void release_from_leaf(int level, struct leaf *L) {
    //myrelease(cur_node, I);
    if (L->took_fast_path) {
        release_root(L->root_node, &L->I);
        L->took_fast_path = FALSE;
        return;
    }
    release_helper(level, L->cur_node, &L->I);
    return;
}

static inline int nowaiters_from_leaf(int level, struct leaf *L) {
    if (L->took_fast_path) {
        /* The fast path enqueued I at the root, so query the root node.
         * (nowaiters_root only inspects I->next, so this is equivalent to
         * the original cur_node call, but it matches the actual queue.) */
        return nowaiters_root(L->root_node, &L->I);
    }
    return nowaiters_helper(level, L->cur_node, &L->I);
}

static int get_hwthread_id(hwloc_topology_t topo) {
    hwloc_cpuset_t cpuset = hwloc_bitmap_alloc();
    hwloc_obj_t obj;
    hwloc_get_cpubind(topo, cpuset, HWLOC_CPUBIND_THREAD);
    obj = hwloc_get_obj_inside_cpuset_by_type(topo, cpuset, HWLOC_OBJ_PU, 0);
    hwloc_bitmap_free(cpuset);
    return obj->logical_index;
}

static void set_hierarchy(struct lock *L, int *max_threads, int **particip_per_level) {
    int max_depth, levels = 0, max_levels = HMCS_DEFAULT_MAX_LEVELS, explicit_levels = 0;
    char tmp[128];
    char *s = getenv("ZM_HMCS_MAX_LEVELS");
    if (s != NULL)
        max_levels = atoi(s);

    int depths[max_levels];
    int idx = 0;

    /* advice to users: run `hwloc-ls -s --no-io --no-icaches` and choose
     * depths of interest in ascending order. The first depth must be `0'. */
    s = getenv("ZM_HMCS_EXPLICIT_LEVELS");
    if (s != NULL) {
        /* copy defensively; a plain strcpy into a small buffer could overflow */
        strncpy(tmp, s, sizeof(tmp) - 1);
        tmp[sizeof(tmp) - 1] = '\0';
        explicit_levels = 1;
        char *token = strtok(tmp, ",");
        while (token != NULL) {
            depths[idx] = atoi(token);
            if (idx == 0)
                assert(depths[idx] == 0 && "the first depth must be machine level (i.e., depth 0); run `hwloc-ls -s --no-io --no-icaches` and choose appropriate depth values");
            idx++;
            token = strtok(NULL, ",");
        }
        assert(idx == max_levels);
    }

    hwloc_topology_init(&L->topo);
    hwloc_topology_load(L->topo);

    *max_threads = hwloc_get_nbobjs_by_type(L->topo, HWLOC_OBJ_PU);

    max_depth = hwloc_topology_get_depth(L->topo);
    assert(max_depth >= 2); /* At least Machine and Core levels exist */

    *particip_per_level = (int *)malloc(max_levels * sizeof(int));

    int prev_nobjs = -1;
    if (!explicit_levels) {
        for (int d = max_depth - 2; d > 1; d--) {
            int cur_nobjs = hwloc_get_nbobjs_by_depth(L->topo, d);
            /* Check if this level has a hierarchical impact */
            if (cur_nobjs != prev_nobjs) {
                prev_nobjs = cur_nobjs;
                (*particip_per_level)[levels] = (*max_threads) / cur_nobjs;
                levels++;
                if (levels >= max_levels - 1)
                    break;
            }
        }
        (*particip_per_level)[levels] = *max_threads;
        levels++;
    } else {
        for (int i = max_levels - 1; i >= 0; i--) {
            int d = depths[i];
            int cur_nobjs = hwloc_get_nbobjs_by_depth(L->topo, d);
            /* Check if this level has a hierarchical impact */
            if (cur_nobjs != prev_nobjs) {
                prev_nobjs = cur_nobjs;
                (*particip_per_level)[levels] = (*max_threads) / cur_nobjs;
                levels++;
            } else {
                assert(0 && "please choose levels that have a hierarchical impact");
            }
        }
    }
    L->levels = levels;
}

static void free_hierarchy(int *particip_per_level) {
    free(particip_per_level);
}
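/* Lock construction layout (illustrative example on a hypothetical
 * machine): with 16 PUs in 2 sockets, participants_at_level = {8, 16},
 * so the flat lock_locations array below holds 16/8 + 16/16 = 3 hnodes:
 * two socket-level locks followed by one root. Thread tid enters through
 * lock_locations[tid / participants_at_level[0]], i.e., its socket lock,
 * whose parent pointer leads to the root. */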
static void* new_lock() {
    struct lock *L;
    posix_memalign((void **)&L, ZM_CACHELINE_SIZE, sizeof(struct lock));

    int max_threads;
    int *participants_at_level;
    set_hierarchy(L, &max_threads, &participants_at_level);

    // Total locks needed = sum over all levels of
    // (max_threads / participants_at_level[i]); the root contributes one.

    // Save affinity
    hwloc_cpuset_t cpuset = hwloc_bitmap_alloc();
    hwloc_get_cpubind(L->topo, cpuset, HWLOC_CPUBIND_THREAD);

    int total_locks_needed = 0;
    int levels = L->levels;

    for (int i = 0; i < levels; i++) {
        total_locks_needed += max_threads / participants_at_level[i];
    }

    struct hnode **lock_locations;
    posix_memalign((void **)&lock_locations, ZM_CACHELINE_SIZE, sizeof(struct hnode *) * total_locks_needed);
    struct leaf **leaf_nodes;
    posix_memalign((void **)&leaf_nodes, ZM_CACHELINE_SIZE, sizeof(struct leaf *) * max_threads);

    int threshold = DEFAULT_THRESHOLD;
    char *s = getenv("ZM_HMCS_THRESHOLD");
    if (s != NULL)
        threshold = atoi(s);

    hwloc_obj_t obj;

    for (int tid = 0; tid < max_threads; tid++) {
        obj = hwloc_get_obj_by_type(L->topo, HWLOC_OBJ_PU, tid);
        // Pin me to hw-thread-id = tid
        hwloc_set_cpubind(L->topo, obj->cpuset, HWLOC_CPUBIND_THREAD);
        int last_lock_location_end = 0;
        for (int cur_level = 0; cur_level < levels; cur_level++) {
            if (tid % participants_at_level[cur_level] == 0) {
                // master, initialize the lock
                int lock_location = last_lock_location_end + tid / participants_at_level[cur_level];
                last_lock_location_end += max_threads / participants_at_level[cur_level];
                struct hnode *cur_hnode = new_hnode();
                cur_hnode->threshold = threshold;
                cur_hnode->parent = NULL;
                cur_hnode->lock = ZM_NULL;
                lock_locations[lock_location] = cur_hnode;
            }
        }
    }

    // setup parents
    for (int tid = 0; tid < max_threads; tid++) {
        obj = hwloc_get_obj_by_type(L->topo, HWLOC_OBJ_PU, tid);
        hwloc_set_cpubind(L->topo, obj->cpuset, HWLOC_CPUBIND_THREAD);
        int last_lock_location_end = 0;
        for (int cur_level = 0; cur_level < levels - 1; cur_level++) {
            if (tid % participants_at_level[cur_level] == 0) {
                int lock_location = last_lock_location_end + tid / participants_at_level[cur_level];
                last_lock_location_end += max_threads / participants_at_level[cur_level];
                int parentLockLocation = last_lock_location_end + tid / participants_at_level[cur_level + 1];
                lock_locations[lock_location]->parent = lock_locations[parentLockLocation];
            }
        }
        leaf_nodes[tid] = (struct leaf *)new_leaf(lock_locations[tid / participants_at_level[0]], levels);
    }
    free(lock_locations);
    free_hierarchy(participants_at_level);
    // Restore affinity
    hwloc_set_cpubind(L->topo, cpuset, HWLOC_CPUBIND_THREAD);
    L->leaf_nodes = leaf_nodes;

    hwloc_bitmap_free(cpuset);
    return L;
}

static void search_nodes_rec(struct hnode *node, struct hnode **nodes_to_free, int *num_ptrs, int max_threads) {
    int i;
    if (node != NULL) {
        for (i = 0; i < *num_ptrs; i++) {
            if (node == nodes_to_free[i])
                break; /* already marked to be freed */
        }
        if (i == *num_ptrs) { /* newly encountered pointer */
            nodes_to_free[*num_ptrs] = node;
            (*num_ptrs)++;
            assert(*num_ptrs < 2 * max_threads);
        }
        search_nodes_rec(node->parent, nodes_to_free, num_ptrs, max_threads);
    }
}

static void free_lock(struct lock *L) {
    int max_threads = hwloc_get_nbobjs_by_type(L->topo, HWLOC_OBJ_PU);

    int num_ptrs = 0;
    struct hnode **nodes_to_free = (struct hnode **)malloc(2 * max_threads * sizeof(struct hnode *));

    for (int tid = 0; tid < max_threads; tid++) {
        search_nodes_rec(L->leaf_nodes[tid]->cur_node, nodes_to_free, &num_ptrs, max_threads);
        free(L->leaf_nodes[tid]);
    }
    free(L->leaf_nodes);

    for (int i = 0; i < num_ptrs; i++)
        free(nodes_to_free[i]);
    free(nodes_to_free);

    hwloc_topology_destroy(L->topo);
    free(L);
}

static inline void hmcs_acquire(struct lock *L) {
    if (zm_unlikely(tid == -1)) {
        check_affinity(L->topo);
        tid = get_hwthread_id(L->topo);
    }
    acquire_from_leaf(L->levels, L->leaf_nodes[tid]);
}
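/* Note on try-acquire semantics: tryacq_from_leaf only attempts the
 * uncontended fast path (a single CAS on the root) when every observed
 * level looks free. If that fails it returns failure immediately rather
 * than enqueueing, since an MCS qnode cannot cheaply withdraw from a
 * queue once it has been linked in. */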
static inline void hmcs_tryacq(struct lock *L, int *success) {
    if (zm_unlikely(tid == -1)) {
        check_affinity(L->topo);
        tid = get_hwthread_id(L->topo);
    }
    tryacq_from_leaf(L->levels, L->leaf_nodes[tid], success);
}

static inline void hmcs_release(struct lock *L) {
    release_from_leaf(L->levels, L->leaf_nodes[tid]);
}

static inline int hmcs_nowaiters(struct lock *L) {
    return nowaiters_from_leaf(L->levels, L->leaf_nodes[tid]);
}

int zm_hmcs_init(zm_hmcs_t *handle) {
    void *p = new_lock();
    *handle = (zm_hmcs_t)p;
    return 0;
}

int zm_hmcs_destroy(zm_hmcs_t *L) {
    free_lock((struct lock *)(*L));
    return 0;
}

int zm_hmcs_acquire(zm_hmcs_t L) {
    hmcs_acquire((struct lock *)L);
    return 0;
}

int zm_hmcs_tryacq(zm_hmcs_t L, int *success) {
    hmcs_tryacq((struct lock *)L, success);
    return 0;
}

int zm_hmcs_release(zm_hmcs_t L) {
    hmcs_release((struct lock *)L);
    return 0;
}

int zm_hmcs_nowaiters(zm_hmcs_t L) {
    return hmcs_nowaiters((struct lock *)L);
}
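/* Minimal usage sketch of the zm_hmcs_* API above. This block is not part
 * of the library build: the ZM_HMCS_EXAMPLE_MAIN guard and all example_*
 * names are hypothetical, for illustration only. Each thread binds itself
 * to one PU first, because check_affinity() aborts otherwise. */
#ifdef ZM_HMCS_EXAMPLE_MAIN
#include <pthread.h>
#include <stdint.h>

static zm_hmcs_t example_lock;
static hwloc_topology_t example_topo;
static int example_counter = 0;

static void* example_worker(void *arg) {
    int idx = (int)(intptr_t)arg;
    /* HMCS requires each thread to be bound to exactly one HW thread. */
    hwloc_obj_t pu = hwloc_get_obj_by_type(example_topo, HWLOC_OBJ_PU, idx);
    hwloc_set_cpubind(example_topo, pu->cpuset, HWLOC_CPUBIND_THREAD);

    for (int i = 0; i < 100000; i++) {
        zm_hmcs_acquire(example_lock);
        example_counter++; /* protected by the HMCS lock */
        zm_hmcs_release(example_lock);
    }
    return NULL;
}

int main(void) {
    hwloc_topology_init(&example_topo);
    hwloc_topology_load(example_topo);
    int nthreads = hwloc_get_nbobjs_by_type(example_topo, HWLOC_OBJ_PU);
    if (nthreads > 4)
        nthreads = 4; /* cap the demo at 4 pinned threads */

    zm_hmcs_init(&example_lock);
    pthread_t threads[4];
    for (int i = 0; i < nthreads; i++)
        pthread_create(&threads[i], NULL, example_worker, (void *)(intptr_t)i);
    for (int i = 0; i < nthreads; i++)
        pthread_join(threads[i], NULL);
    printf("counter = %d (expected %d)\n", example_counter, nthreads * 100000);

    zm_hmcs_destroy(&example_lock);
    hwloc_topology_destroy(example_topo);
    return 0;
}
#endif /* ZM_HMCS_EXAMPLE_MAIN */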