/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */ /* * (C) 2013 by Argonne National Laboratory. * See COPYRIGHT in top-level directory. */ #include #include #include #include #include #include "mcs-mutex.h" /* TODO: Make these mutex operations no-ops for sequential runs */ /** Create an MCS mutex. Collective on comm. * * @param[out] comm communicator containing all processes that will use the * mutex * @param[out] tail_rank rank of the process in comm that holds the tail * pointer * @param[out] hdl handle to the mutex * @return MPI status */ int MCS_Mutex_create(int tail_rank, MPI_Comm comm, MCS_Mutex * hdl_out) { int rank, nproc; MCS_Mutex hdl; hdl = malloc(sizeof(struct mcs_mutex_s)); assert(hdl != NULL); MPI_Comm_dup(comm, &hdl->comm); MPI_Comm_rank(hdl->comm, &rank); MPI_Comm_size(hdl->comm, &nproc); hdl->tail_rank = tail_rank; #ifdef USE_WIN_SHARED MPI_Win_allocate_shared(2 * sizeof(int), sizeof(int), MPI_INFO_NULL, hdl->comm, &hdl->base, &hdl->window); #else #ifdef USE_WIN_ALLOC_SHM MPI_Info_create(&hdl->win_info); MPI_Info_set(hdl->win_info, "alloc_shm", "true"); #else MPI_Info_create(&hdl->win_info); MPI_Info_set(hdl->win_info, "alloc_shm", "false"); #endif MPI_Win_allocate(2 * sizeof(int), sizeof(int), hdl->win_info, hdl->comm, &hdl->base, &hdl->window); #endif MPI_Win_lock_all(0, hdl->window); hdl->base[0] = MPI_PROC_NULL; hdl->base[1] = MPI_PROC_NULL; MPI_Win_sync(hdl->window); MPI_Barrier(hdl->comm); *hdl_out = hdl; return MPI_SUCCESS; } /** Free an MCS mutex. Collective on ranks in the communicator used at the * time of creation. * * @param[in] hdl handle to the group that will be freed * @return MPI status */ int MCS_Mutex_free(MCS_Mutex * hdl_ptr) { MCS_Mutex hdl = *hdl_ptr; MPI_Win_unlock_all(hdl->window); MPI_Win_free(&hdl->window); MPI_Comm_free(&hdl->comm); #ifndef USE_WIN_SHARED MPI_Info_free(&hdl->win_info); #endif free(hdl); hdl_ptr = NULL; return MPI_SUCCESS; } /** Lock a mutex. * * @param[in] hdl Handle to the mutex * @return MPI status */ int MCS_Mutex_lock(MCS_Mutex hdl) { int rank, nproc; int prev; MPI_Comm_rank(hdl->comm, &rank); MPI_Comm_size(hdl->comm, &nproc); /* This store is safe, since it cannot happen concurrently with a remote * write */ hdl->base[MCS_MTX_ELEM_DISP] = MPI_PROC_NULL; MPI_Win_sync(hdl->window); MPI_Fetch_and_op(&rank, &prev, MPI_INT, hdl->tail_rank, MCS_MTX_TAIL_DISP, MPI_REPLACE, hdl->window); MPI_Win_flush(hdl->tail_rank, hdl->window); /* If there was a previous tail, update their next pointer and wait for * notification. Otherwise, the mutex was successfully acquired. */ if (prev != MPI_PROC_NULL) { /* Wait for notification */ MPI_Status status; MPI_Accumulate(&rank, 1, MPI_INT, prev, MCS_MTX_ELEM_DISP, 1, MPI_INT, MPI_REPLACE, hdl->window); MPI_Win_flush(prev, hdl->window); debug_print("%2d: LOCK - waiting for notification from %d\n", rank, prev); MPI_Recv(NULL, 0, MPI_BYTE, prev, MCS_MUTEX_TAG, hdl->comm, &status); } debug_print("%2d: LOCK - lock acquired\n", rank); return MPI_SUCCESS; } /** Attempt to acquire a mutex. * * @param[in] hdl Handle to the mutex * @param[out] success Indicates whether the mutex was acquired * @return MPI status */ int MCS_Mutex_trylock(MCS_Mutex hdl, int *success) { int rank, nproc; int tail, nil = MPI_PROC_NULL; MPI_Comm_rank(hdl->comm, &rank); MPI_Comm_size(hdl->comm, &nproc); /* This store is safe, since it cannot happen concurrently with a remote * write */ hdl->base[MCS_MTX_ELEM_DISP] = MPI_PROC_NULL; MPI_Win_sync(hdl->window); /* Check if the lock is available and claim it if it is. */ MPI_Compare_and_swap(&rank, &nil, &tail, MPI_INT, hdl->tail_rank, MCS_MTX_TAIL_DISP, hdl->window); MPI_Win_flush(hdl->tail_rank, hdl->window); /* If the old tail was MPI_PROC_NULL, we have claimed the mutex */ *success = (tail == nil); debug_print("%2d: TRYLOCK - %s\n", rank, (*success) ? "Success" : "Non-success"); return MPI_SUCCESS; } /** Unlock a mutex. * * @param[in] hdl Handle to the mutex * @return MPI status */ int MCS_Mutex_unlock(MCS_Mutex hdl) { int rank, nproc, next; MPI_Comm_rank(hdl->comm, &rank); MPI_Comm_size(hdl->comm, &nproc); MPI_Win_sync(hdl->window); /* Read my next pointer. FOP is used since another process may write to * this location concurrent with this read. */ MPI_Fetch_and_op(NULL, &next, MPI_INT, rank, MCS_MTX_ELEM_DISP, MPI_NO_OP, hdl->window); MPI_Win_flush(rank, hdl->window); if (next == MPI_PROC_NULL) { int tail; int nil = MPI_PROC_NULL; /* Check if we are the at the tail of the lock queue. If so, we're * done. If not, we need to send notification. */ MPI_Compare_and_swap(&nil, &rank, &tail, MPI_INT, hdl->tail_rank, MCS_MTX_TAIL_DISP, hdl->window); MPI_Win_flush(hdl->tail_rank, hdl->window); if (tail != rank) { debug_print("%2d: UNLOCK - waiting for next pointer (tail = %d)\n", rank, tail); assert(tail >= 0 && tail < nproc); for (;;) { int flag; MPI_Fetch_and_op(NULL, &next, MPI_INT, rank, MCS_MTX_ELEM_DISP, MPI_NO_OP, hdl->window); MPI_Win_flush(rank, hdl->window); if (next != MPI_PROC_NULL) break; MPI_Iprobe(MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, &flag, MPI_STATUS_IGNORE); } } } /* Notify the next waiting process */ if (next != MPI_PROC_NULL) { debug_print("%2d: UNLOCK - notifying %d\n", rank, next); MPI_Send(NULL, 0, MPI_BYTE, next, MCS_MUTEX_TAG, hdl->comm); } debug_print("%2d: UNLOCK - lock released\n", rank); return MPI_SUCCESS; }