/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
/*
* (C) 2001 by Argonne National Laboratory.
* See COPYRIGHT in top-level directory.
*/
#if !defined(MPIU_THREAD_POBJ_H_INCLUDED)
#define MPIU_THREAD_POBJ_H_INCLUDED
/* There are multiple locks, one for each (major) object */
/* MT FIXME the following description is almost right, but it needs minor
* updates and revision to account for the COMPLETION CS and other issues in the
* request */
/* The fine-grained locking discipline for requests is unfortunately complicated:
*
* (1) Raw allocation and deallocation of requests are protected internally by
* the HANDLEALLOC critical section. This is currently the same mutex as the
* HANDLE CS; it is not clear why we have both.
*
* (2) Once allocated, a directly allocated request is initially held exclusively
* by a single thread. Direct allocation is common for send requests, but recv
* requests are usually created differently.
*
* (3) Most receive requests are created as the result of a call to FDP_or_AEU
* or FDU_or_AEP. Calls to these functions (along with the other receive queue
* functions) must be inside a MSGQUEUE CS. This CS protects the queue data
* structures as well as any fields inside the requests while they are in the
* queue. For example, assume a call to FDU_or_AEP, as in MPID_Recv. If the
* FDU case hits, the MSGQUEUE CS may be released immediately after the call.
* If the AEP case hits, however, the MSGQUEUE CS must remain held until any
* request field manipulation (such as dev.recv_pending_count) is complete.
*
* (4) In both the send and receive request cases, there is usually a particular
* thread in some upper-level code (e.g. MPI_Send) with interest in the
* completion of the request. This may or may not be a thread that is also
* making progress on this request (often not). The upper level code must not
* attempt to access any request fields (such as the status) until completion is
* signalled by the lower layer.
*
* (5) Once removed from the receive queue, the request is once again
* exclusively owned by the dequeuing thread. From here, the dequeuing thread
* may do whatever it wants with the request without holding any CS, until it
* signals the request's completion. Signalling completion indicates that the
* thread in the upper layer polling on it may access the rest of the fields in
* the request. This completion signalling is lock-free and must be implemented
* carefully to work correctly in the face of optimizing compilers and CPUs
* (see the sketch following this comment block).
* The upper-level thread now wholly owns the request until it is deallocated.
*
* (6) In ch3:nemesis at least, multithreaded access to send requests is managed
* by the MPIDCOMM (progress engine) CS. The completion signalling pattern
* applies here (think MPI_Isend/MPI_Wait).
*
* (7) Request cancellation is tricky-ish. For send cancellation, it is
* possible that the completion counter is actually *incremented* because a
* pkt is sent to the recipient asking for remote cancellation. By asking for
* cancellation (of any kind of req), the upper layer gives up its exclusive
* access to the request and must wait for the completion counter to drop to 0
* before exclusively accessing the request fields.
*
* The completion counter is a reference count, much like the object liveness
* reference count. However, it differs from a normal refcount because of
* guarantees in the MPI Standard. Applications must not attempt to complete
* (wait/test/free) a given request concurrently in two separate threads. So
* checking for cc==0 is safe because only one thread is ever allowed to make
* that check.
*
* A non-zero completion count must always be accompanied by a normal reference
* that is logically held by the progress engine. Similarly, once the
* completion counter drops to zero, the progress engine is expected to release
* its reference.
*/
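/* Sketch (illustrative, not code from this header): the completion signalling
 * pattern from points (4)-(5), written with the MPIU_cc_* helpers defined
 * below.  "req", "req->cc", the status field, and the progress-wait call are
 * stand-ins, not the exact ch3 definitions:
 *
 *     // lower layer (often the progress engine): finish, then signal
 *     int incomplete;
 *     req->status.MPI_ERROR = mpi_errno;    // write all fields first
 *     MPIU_cc_decr(&req->cc, &incomplete);  // wmb inside orders those writes
 *     if (!incomplete) {
 *         // cc reached 0; see the refcount note after MPIU_cc_decr below
 *     }
 *
 *     // upper layer (e.g. MPI_Wait): poll, then read fields
 *     while (!MPIU_cc_is_complete(&req->cc))
 *         MPID_Progress_wait(&progress_state);
 *     // rmb inside is_complete makes req->status safe to read here
 */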
/* lock ordering: if MPIDCOMM+MSGQUEUE must be acquired at the same time, then
* the order should be to acquire MPIDCOMM first, then MSGQUEUE. Release in
* reverse order. */
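/* Sketch of that ordering (the two mutex objects here are placeholders for
 * the real per-class mutexes, which live elsewhere):
 *
 *     MPIUI_THREAD_CS_ENTER_POBJ(&mpidcomm_mutex);   // MPIDCOMM first
 *     MPIUI_THREAD_CS_ENTER_POBJ(&msgqueue_mutex);   // then MSGQUEUE
 *     ... manipulate the receive queues ...
 *     MPIUI_THREAD_CS_EXIT_POBJ(&msgqueue_mutex);    // release in reverse
 *     MPIUI_THREAD_CS_EXIT_POBJ(&mpidcomm_mutex);
 */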
/* POBJ locks are all real nonrecursive ops */
#define MPIUI_THREAD_CS_ENTER_POBJ(mutex) MPIUI_THREAD_CS_ENTER_NONRECURSIVE("POBJ", mutex)
#define MPIUI_THREAD_CS_EXIT_POBJ(mutex) MPIUI_THREAD_CS_EXIT_NONRECURSIVE("POBJ", mutex)
#define MPIUI_THREAD_CS_YIELD_POBJ(mutex) MPIUI_THREAD_CS_YIELD_NONRECURSIVE("POBJ", mutex)
/* ALLGRAN locks are all real nonrecursive ops */
#define MPIUI_THREAD_CS_ENTER_ALLGRAN(mutex) MPIUI_THREAD_CS_ENTER_NONRECURSIVE("ALLGRAN", mutex)
#define MPIUI_THREAD_CS_EXIT_ALLGRAN(mutex) MPIUI_THREAD_CS_EXIT_NONRECURSIVE("ALLGRAN", mutex)
#define MPIUI_THREAD_CS_YIELD_ALLGRAN(mutex) MPIUI_THREAD_CS_YIELD_NONRECURSIVE("ALLGRAN", mutex)
/* GLOBAL locks are all NO-OPs */
#define MPIUI_THREAD_CS_ENTER_GLOBAL(mutex) do {} while (0)
#define MPIUI_THREAD_CS_EXIT_GLOBAL(mutex) do {} while (0)
#define MPIUI_THREAD_CS_YIELD_GLOBAL(mutex) do {} while (0)
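/* Sketch: this header implements the per-object granularity, so POBJ and
 * ALLGRAN critical sections take real locks while GLOBAL critical sections
 * compile away (the global-granularity counterpart of this header does the
 * reverse).  "req->pobj_mutex" is a hypothetical field name:
 *
 *     MPIUI_THREAD_CS_ENTER_POBJ(&req->pobj_mutex);  // real lock here
 *     ... touch fields shared with other threads ...
 *     MPIUI_THREAD_CS_EXIT_POBJ(&req->pobj_mutex);
 */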
/* define a type for the completion counter */
#include "opa_primitives.h"
typedef OPA_int_t MPIU_cc_t;
/* usually implies no barrier, since this routine should only be used for
 * request initialization; setting the value to 0 is special-cased below
 * because it signals completion */
static inline void MPIU_cc_set(MPIU_cc_t * cc_ptr, int val)
{
    if (val == 0) {
        /* values other than 0 do not enforce any ordering, and therefore do
         * not start a HB arc */
        /* MT FIXME using cc_set in this way is sloppy.  Sometimes the caller
         * really does know that the cc value may be cleared, but more likely
         * this is just a hack to avoid the work of figuring out what the cc
         * value currently is and decrementing it instead. */
        /* barrier ensures that any state written before indicating completion
         * is seen by the thread polling on the cc.  If OPA adds store-release
         * semantics, we can convert to that instead. */
        OPA_write_barrier();
        MPL_VG_ANNOTATE_HAPPENS_BEFORE(cc_ptr);
    }
#if defined(MPL_VG_AVAILABLE)
    /* MT subtle: store_int is actually safe to use, but Helgrind/DRD/TSan all
     * view the store/load pair as a race.  Using an atomic operation for the
     * store side makes all three happy.  DRD & TSan also support
     * ANNOTATE_BENIGN_RACE, but Helgrind does not. */
    OPA_swap_int(cc_ptr, val);
#else
    OPA_store_int(cc_ptr, val);
#endif
}
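/* Typical use (sketch): a newly allocated request usually starts with cc=1,
 * representing the completion reference logically held by the progress
 * engine; "req" is illustrative:
 *
 *     MPIU_cc_set(&req->cc, 1);  // init-time set, no barrier taken
 */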
ATTRIBUTE((unused))
static MPL_DBG_INLINE_KEYWORD int MPIU_cc_is_complete(MPIU_cc_t * cc_ptr)
{
    int complete;
    complete = (0 == OPA_load_int(cc_ptr));
    if (complete) {
        MPL_VG_ANNOTATE_HAPPENS_AFTER(cc_ptr);
        OPA_read_barrier();
    }
    return complete;
}
/* incomplete_==TRUE iff the cc > 0 after the decr */
#define MPIU_cc_decr(cc_ptr_, incomplete_)                  \
    do {                                                    \
        OPA_write_barrier();                                \
        MPL_VG_ANNOTATE_HAPPENS_BEFORE(cc_ptr_);            \
        *(incomplete_) = !OPA_decr_and_test_int(cc_ptr_);   \
        /* TODO check if this HA is actually necessary */   \
        if (!*(incomplete_)) {                              \
            MPL_VG_ANNOTATE_HAPPENS_AFTER(cc_ptr_);         \
        }                                                   \
    } while (0)
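/* Sketch of the refcount pairing described in the big comment above: when the
 * decrement drops the cc to 0, the progress engine must also release the
 * normal object reference it held on behalf of the nonzero cc.  The release
 * call shown is ch3's; the surrounding code is illustrative:
 *
 *     int incomplete;
 *     MPIU_cc_decr(&req->cc, &incomplete);
 *     if (!incomplete)
 *         MPID_Request_release(req);  // drop the progress engine's reference
 */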
/* MT FIXME does this need a HB/HA annotation? This macro is only used for
* cancel_send right now. */
/* was_incomplete_ receives the value of the cc *before* the incr, so it is
 * TRUE (nonzero) iff the request was still incomplete (cc > 0) beforehand */
#define MPIU_cc_incr(cc_ptr_, was_incomplete_)                \
    do {                                                      \
        *(was_incomplete_) = OPA_fetch_and_incr_int(cc_ptr_); \
    } while (0)
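/* Sketch of the cancel_send-style use mentioned above: the upper layer bumps
 * the cc before the remote-cancel pkt goes out, so completion now also waits
 * for the cancel response; "sreq" is illustrative:
 *
 *     int was_incomplete;
 *     MPIU_cc_incr(&sreq->cc, &was_incomplete);
 *     // was_incomplete != 0: the send had not yet completed when we asked
 *     // for cancellation
 */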
#define MPIU_cc_get(cc_) OPA_load_int(&(cc_))
/* "publishes" the obj with handle value (handle_) via the handle pointer
* (hnd_lval_). That is, it is a version of the following statement that fixes
* memory consistency issues:
* (hnd_lval_) = (handle_);
*
* assumes that the following is always true: typeof(*hnd_lval_ptr_)==int
*/
/* This could potentially be generalized beyond MPI-handle objects, but we
* should only take that step after seeing good evidence of its use. A general
* macro (that is portable to non-gcc compilers) will need type information to
* make the appropriate volatile cast. */
/* Ideally _GLOBAL would use this too, but we don't want to count on OPA
* availability in _GLOBAL mode. Instead the GLOBAL critical section should be
* used. */
#define MPIU_OBJ_PUBLISH_HANDLE(hnd_lval_, handle_)                            \
    do {                                                                       \
        if (MPIR_ThreadInfo.isThreaded) {                                      \
            /* wmb ensures all read-only object field values are seen before */\
            /* the handle value is seen at the application level */            \
            OPA_write_barrier();                                               \
            /* volatile ensures lval is not speculatively read or written */   \
            *(volatile int *)&(hnd_lval_) = (handle_);                         \
        }                                                                      \
        else {                                                                 \
            (hnd_lval_) = (handle_);                                           \
        }                                                                      \
    } while (0)
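/* Typical use (sketch): fully initialize the object, then publish the handle
 * last so a concurrent reader that sees the handle also sees the initialized
 * fields.  The object and field names here are illustrative:
 *
 *     req->kind = MPID_REQUEST_RECV;     // ...initialize everything...
 *     MPIU_cc_set(&req->cc, 1);
 *     MPIU_OBJ_PUBLISH_HANDLE(req->handle, handle_val);   // publish last
 */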
#endif /* !defined(MPIU_THREAD_POBJ_H_INCLUDED) */