Blob Blame History Raw
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
/*
 *  (C) 2001 by Argonne National Laboratory.
 *      See COPYRIGHT in top-level directory.
 */

#ifndef MPID_RMA_SHM_H_INCLUDED
#define MPID_RMA_SHM_H_INCLUDED

#include "utlist.h"
#include "mpid_rma_types.h"

static inline int do_accumulate_op(void *source_buf, int source_count, MPI_Datatype source_dtp,
                                   void *target_buf, int target_count, MPI_Datatype target_dtp,
                                   MPI_Aint stream_offset, MPI_Op acc_op,
                                   MPIDI_RMA_Acc_srcbuf_kind_t srckind);

#define ASSIGN_COPY(src, dest, count, type)     \
    {                                           \
        type *src_ = (type *) src;              \
        type *dest_ = (type *) dest;            \
        int i;                                  \
        for (i = 0; i < count; i++)             \
            dest_[i] = src_[i];                 \
        goto fn_exit;                           \
    }

static inline int shm_copy(const void *src, int scount, MPI_Datatype stype,
                           void *dest, int dcount, MPI_Datatype dtype)
{
    int mpi_errno = MPI_SUCCESS;

    /* We use a threshold of operations under which a for loop of assignments is
     * used.  Even though this happens at smaller block lengths, making it
     * potentially inefficient, it can take advantage of some vectorization
     * available on most modern processors. */
#define SHM_OPS_THRESHOLD  (16)

    if (MPIR_DATATYPE_IS_PREDEFINED(stype) && MPIR_DATATYPE_IS_PREDEFINED(dtype) &&
        scount <= SHM_OPS_THRESHOLD) {

        /* FIXME: We currently only optimize a few predefined datatypes, which
         * have a direct C datatype mapping. */

        /* The below list of datatypes is based on those specified in the MPI-3
         * standard on page 665. */
        switch (stype) {
        case MPI_CHAR:
            ASSIGN_COPY(src, dest, scount, char);

        case MPI_SHORT:
            ASSIGN_COPY(src, dest, scount, signed short int);

        case MPI_INT:
            ASSIGN_COPY(src, dest, scount, signed int);

        case MPI_LONG:
            ASSIGN_COPY(src, dest, scount, signed long int);

        case MPI_LONG_LONG_INT:        /* covers MPI_LONG_LONG too */
            ASSIGN_COPY(src, dest, scount, signed long long int);

        case MPI_SIGNED_CHAR:
            ASSIGN_COPY(src, dest, scount, signed char);

        case MPI_UNSIGNED_CHAR:
            ASSIGN_COPY(src, dest, scount, unsigned char);

        case MPI_UNSIGNED_SHORT:
            ASSIGN_COPY(src, dest, scount, unsigned short int);

        case MPI_UNSIGNED:
            ASSIGN_COPY(src, dest, scount, unsigned int);

        case MPI_UNSIGNED_LONG:
            ASSIGN_COPY(src, dest, scount, unsigned long int);

        case MPI_UNSIGNED_LONG_LONG:
            ASSIGN_COPY(src, dest, scount, unsigned long long int);

        case MPI_FLOAT:
            ASSIGN_COPY(src, dest, scount, float);

        case MPI_DOUBLE:
            ASSIGN_COPY(src, dest, scount, double);

        case MPI_LONG_DOUBLE:
            ASSIGN_COPY(src, dest, scount, long double);

#if 0
            /* FIXME: we need a configure check to define HAVE_WCHAR_T before
             * this can be enabled */
        case MPI_WCHAR:
            ASSIGN_COPY(src, dest, scount, wchar_t);
#endif

#if 0
            /* FIXME: we need a configure check to define HAVE_C_BOOL before
             * this can be enabled */
        case MPI_C_BOOL:
            ASSIGN_COPY(src, dest, scount, _Bool);
#endif

#if HAVE_INT8_T
        case MPI_INT8_T:
            ASSIGN_COPY(src, dest, scount, int8_t);
#endif /* HAVE_INT8_T */

#if HAVE_INT16_T
        case MPI_INT16_T:
            ASSIGN_COPY(src, dest, scount, int16_t);
#endif /* HAVE_INT16_T */

#if HAVE_INT32_T
        case MPI_INT32_T:
            ASSIGN_COPY(src, dest, scount, int32_t);
#endif /* HAVE_INT32_T */

#if HAVE_INT64_T
        case MPI_INT64_T:
            ASSIGN_COPY(src, dest, scount, int64_t);
#endif /* HAVE_INT64_T */

#if HAVE_UINT8_T
        case MPI_UINT8_T:
            ASSIGN_COPY(src, dest, scount, uint8_t);
#endif /* HAVE_UINT8_T */

#if HAVE_UINT16_T
        case MPI_UINT16_T:
            ASSIGN_COPY(src, dest, scount, uint16_t);
#endif /* HAVE_UINT16_T */

#if HAVE_UINT32_T
        case MPI_UINT32_T:
            ASSIGN_COPY(src, dest, scount, uint32_t);
#endif /* HAVE_UINT32_T */

#if HAVE_UINT64_T
        case MPI_UINT64_T:
            ASSIGN_COPY(src, dest, scount, uint64_t);
#endif /* HAVE_UINT64_T */

        case MPI_AINT:
            ASSIGN_COPY(src, dest, scount, MPI_Aint);

        case MPI_COUNT:
            ASSIGN_COPY(src, dest, scount, MPI_Count);

        case MPI_OFFSET:
            ASSIGN_COPY(src, dest, scount, MPI_Offset);

#if 0
            /* FIXME: we need a configure check to define HAVE_C_COMPLEX before
             * this can be enabled */
        case MPI_C_COMPLEX:    /* covers MPI_C_FLOAT_COMPLEX as well */
            ASSIGN_COPY(src, dest, scount, float _Complex);
#endif

#if 0
            /* FIXME: we need a configure check to define HAVE_C_DOUPLE_COMPLEX
             * before this can be enabled */
        case MPI_C_DOUBLE_COMPLEX:
            ASSIGN_COPY(src, dest, scount, double _Complex);
#endif

#if 0
            /* FIXME: we need a configure check to define
             * HAVE_C_LONG_DOUPLE_COMPLEX before this can be enabled */
        case MPI_C_LONG_DOUBLE_COMPLEX:
            ASSIGN_COPY(src, dest, scount, long double _Complex);
#endif

#if 0
            /* Types that don't have a direct equivalent */
        case MPI_BYTE:
        case MPI_PACKED:
#endif

#if 0   /* Fortran types */
        case MPI_INTEGER:
        case MPI_REAL:
        case MPI_DOUBLE_PRECISION:
        case MPI_COMPLEX:
        case MPI_LOGICAL:
        case MPI_CHARACTER:
#endif

#if 0   /* C++ types */
        case MPI_CXX_BOOL:
        case MPI_CXX_FLOAT_COMPLEX:
        case MPI_CXX_DOUBLE_COMPLEX:
        case MPI_CXX_LONG_DOUBLE_COMPLEX:
#endif

#if 0   /* Optional Fortran types */
        case MPI_DOUBLE_COMPLEX:
        case MPI_INTEGER1:
        case MPI_INTEGER2:
        case MPI_INTEGER4:
        case MPI_INTEGER8:
        case MPI_INTEGER16:
        case MPI_REAL2:
        case MPI_REAL4:
        case MPI_REAL8:
        case MPI_REAL16:
        case MPI_COMPLEX4:
        case MPI_COMPLEX8:
        case MPI_COMPLEX16:
        case MPI_COMPLEX32:
#endif

#if 0   /* C datatypes for reduction functions */
        case MPI_FLOAT_INT:
        case MPI_DOUBLE_INT:
        case MPI_LONG_INT:
        case MPI_2INT:
        case MPI_LONG_DOUBLE_INT:
#endif

#if 0   /* Fortran datatypes for reduction functions */
        case MPI_2REAL:
        case MPI_2DOUBLE_PRECISION:
        case MPI_2INTEGER:
#endif

#if 0   /* Random types not present in the standard */
        case MPI_2COMPLEX:
        case MPI_2DOUBLE_COMPLEX:
#endif

        default:
            /* Just to make sure the switch statement is not empty */
            ;
        }
    }

    mpi_errno = MPIR_Localcopy(src, scount, stype, dest, dcount, dtype);
    if (mpi_errno) {
        MPIR_ERR_POP(mpi_errno);
    }

  fn_exit:
    return mpi_errno;
    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}

#undef FUNCNAME
#define FUNCNAME MPIDI_CH3I_Shm_put_op
#undef FCNAME
#define FCNAME MPL_QUOTE(FUNCNAME)
static inline int MPIDI_CH3I_Shm_put_op(const void *origin_addr, int origin_count, MPI_Datatype
                                        origin_datatype, int target_rank, MPI_Aint target_disp,
                                        int target_count, MPI_Datatype target_datatype,
                                        MPIR_Win * win_ptr)
{
    int mpi_errno = MPI_SUCCESS;
    void *base = NULL;
    int disp_unit;
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_CH3I_SHM_PUT_OP);

    MPIR_FUNC_VERBOSE_RMA_ENTER(MPID_STATE_MPIDI_CH3I_SHM_PUT_OP);

    if (win_ptr->shm_allocated == TRUE) {
        int local_target_rank = win_ptr->comm_ptr->intranode_table[target_rank];
        MPIR_Assert(local_target_rank >= 0);
        base = win_ptr->shm_base_addrs[local_target_rank];
        disp_unit = win_ptr->basic_info_table[target_rank].disp_unit;
    }
    else {
        base = win_ptr->base;
        disp_unit = win_ptr->disp_unit;
    }

    mpi_errno = shm_copy(origin_addr, origin_count, origin_datatype,
                         (char *) base + disp_unit * target_disp, target_count, target_datatype);
    if (mpi_errno) {
        MPIR_ERR_POP(mpi_errno);
    }

  fn_exit:
    MPIR_FUNC_VERBOSE_RMA_EXIT(MPID_STATE_MPIDI_CH3I_SHM_PUT_OP);
    return mpi_errno;
    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}


#undef FUNCNAME
#define FUNCNAME MPIDI_CH3I_Shm_acc_op
#undef FCNAME
#define FCNAME MPL_QUOTE(FUNCNAME)
static inline int MPIDI_CH3I_Shm_acc_op(const void *origin_addr, int origin_count, MPI_Datatype
                                        origin_datatype, int target_rank, MPI_Aint target_disp,
                                        int target_count, MPI_Datatype target_datatype, MPI_Op op,
                                        MPIR_Win * win_ptr)
{
    void *base = NULL;
    int disp_unit, shm_op = 0;
    int mpi_errno = MPI_SUCCESS;
    int i;
    MPI_Datatype basic_type;
    MPI_Aint stream_elem_count, stream_unit_count;
    MPI_Aint predefined_dtp_size, predefined_dtp_extent, predefined_dtp_count;
    MPI_Aint total_len, rest_len;
    MPI_Aint origin_dtp_size;
    MPIR_Datatype*origin_dtp_ptr = NULL;
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_CH3I_SHM_ACC_OP);

    MPIR_FUNC_VERBOSE_RMA_ENTER(MPID_STATE_MPIDI_CH3I_SHM_ACC_OP);

    if (win_ptr->shm_allocated == TRUE) {
        int local_target_rank = win_ptr->comm_ptr->intranode_table[target_rank];
        MPIR_Assert(local_target_rank >= 0);
        shm_op = 1;
        base = win_ptr->shm_base_addrs[local_target_rank];
        disp_unit = win_ptr->basic_info_table[target_rank].disp_unit;
    }
    else {
        base = win_ptr->base;
        disp_unit = win_ptr->disp_unit;
    }

    if (MPIR_DATATYPE_IS_PREDEFINED(origin_datatype)) {
        if (shm_op) {
            MPIDI_CH3I_SHM_MUTEX_LOCK(win_ptr);
        }
        mpi_errno = do_accumulate_op((void *) origin_addr, origin_count, origin_datatype,
                                     (void *) ((char *) base + disp_unit * target_disp),
                                     target_count, target_datatype, 0, op,
                                     MPIDI_RMA_ACC_SRCBUF_DEFAULT);
        if (shm_op) {
            MPIDI_CH3I_SHM_MUTEX_UNLOCK(win_ptr);
        }

        if (mpi_errno != MPI_SUCCESS)
            MPIR_ERR_POP(mpi_errno);

        goto fn_exit;
    }

    /* Get total length of origin data */
    MPIR_Datatype_get_size_macro(origin_datatype, origin_dtp_size);
    total_len = origin_dtp_size * origin_count;

    MPIR_Datatype_get_ptr(origin_datatype, origin_dtp_ptr);
    MPIR_Assert(origin_dtp_ptr != NULL && origin_dtp_ptr->basic_type != MPI_DATATYPE_NULL);
    basic_type = origin_dtp_ptr->basic_type;
    MPIR_Datatype_get_size_macro(basic_type, predefined_dtp_size);
    predefined_dtp_count = total_len / predefined_dtp_size;
    MPIR_Datatype_get_extent_macro(basic_type, predefined_dtp_extent);
    MPIR_Assert(predefined_dtp_count > 0 && predefined_dtp_size > 0 && predefined_dtp_extent > 0);

    stream_elem_count = MPIDI_CH3U_Acc_stream_size / predefined_dtp_extent;
    stream_unit_count = (predefined_dtp_count - 1) / stream_elem_count + 1;
    MPIR_Assert(stream_elem_count > 0 && stream_unit_count > 0);

    rest_len = total_len;
    for (i = 0; i < stream_unit_count; i++) {
        MPIR_Segment *seg = NULL;
        void *packed_buf = NULL;
        MPI_Aint first, last;
        MPI_Aint stream_offset, stream_size, stream_count;

        stream_offset = i * stream_elem_count * predefined_dtp_size;
        stream_size = MPL_MIN(stream_elem_count * predefined_dtp_size, rest_len);
        stream_count = stream_size / predefined_dtp_size;
        rest_len -= stream_size;

        first = stream_offset;
        last = stream_offset + stream_size;

        packed_buf = MPL_malloc(stream_size, MPL_MEM_BUFFER);

        seg = MPIR_Segment_alloc();
        MPIR_ERR_CHKANDJUMP1(seg == NULL, mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %s",
                             "MPIR_Segment");
        MPIR_Segment_init(origin_addr, origin_count, origin_datatype, seg);
        MPIR_Segment_pack(seg, first, &last, packed_buf);
        MPIR_Segment_free(seg);

        if (shm_op) {
            MPIDI_CH3I_SHM_MUTEX_LOCK(win_ptr);
        }

        MPIR_Assert(stream_count == (int) stream_count);
        mpi_errno = do_accumulate_op((void *) packed_buf, (int) stream_count, basic_type,
                                     (void *) ((char *) base + disp_unit * target_disp),
                                     target_count, target_datatype, stream_offset, op,
                                     MPIDI_RMA_ACC_SRCBUF_PACKED);

        if (shm_op) {
            MPIDI_CH3I_SHM_MUTEX_UNLOCK(win_ptr);
        }

        if (mpi_errno != MPI_SUCCESS)
            MPIR_ERR_POP(mpi_errno);

        MPL_free(packed_buf);
    }

  fn_exit:
    MPIR_FUNC_VERBOSE_RMA_EXIT(MPID_STATE_MPIDI_CH3I_SHM_ACC_OP);
    return mpi_errno;
    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}


#undef FUNCNAME
#define FUNCNAME MPIDI_CH3I_Shm_get_acc_op
#undef FCNAME
#define FCNAME MPL_QUOTE(FUNCNAME)
static inline int MPIDI_CH3I_Shm_get_acc_op(const void *origin_addr, int origin_count, MPI_Datatype
                                            origin_datatype, void *result_addr, int result_count,
                                            MPI_Datatype result_datatype, int target_rank, MPI_Aint
                                            target_disp, int target_count,
                                            MPI_Datatype target_datatype, MPI_Op op,
                                            MPIR_Win * win_ptr)
{
    int disp_unit, shm_locked = 0;
    void *base = NULL;
    int i;
    MPI_Datatype basic_type;
    MPI_Aint stream_elem_count, stream_unit_count;
    MPI_Aint predefined_dtp_size, predefined_dtp_extent, predefined_dtp_count;
    MPI_Aint total_len, rest_len;
    MPI_Aint origin_dtp_size;
    MPIR_Datatype*origin_dtp_ptr = NULL;
    int is_empty_origin = FALSE;
    int mpi_errno = MPI_SUCCESS;
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_CH3I_SHM_GET_ACC_OP);

    MPIR_FUNC_VERBOSE_RMA_ENTER(MPID_STATE_MPIDI_CH3I_SHM_GET_ACC_OP);

    /* Judge if origin buffer is empty */
    if (op == MPI_NO_OP)
        is_empty_origin = TRUE;

    if (win_ptr->shm_allocated == TRUE) {
        int local_target_rank = win_ptr->comm_ptr->intranode_table[target_rank];
        MPIR_Assert(local_target_rank >= 0);
        base = win_ptr->shm_base_addrs[local_target_rank];
        disp_unit = win_ptr->basic_info_table[target_rank].disp_unit;
        MPIDI_CH3I_SHM_MUTEX_LOCK(win_ptr);
        shm_locked = 1;
    }
    else {
        base = win_ptr->base;
        disp_unit = win_ptr->disp_unit;
    }

    /* Perform the local get first, then the accumulate */
    mpi_errno = shm_copy((char *) base + disp_unit * target_disp, target_count, target_datatype,
                         result_addr, result_count, result_datatype);
    if (mpi_errno) {
        MPIR_ERR_POP(mpi_errno);
    }

    if (is_empty_origin == TRUE || MPIR_DATATYPE_IS_PREDEFINED(origin_datatype)) {

        mpi_errno = do_accumulate_op((void *) origin_addr, origin_count, origin_datatype,
                                     (void *) ((char *) base + disp_unit * target_disp),
                                     target_count, target_datatype, 0, op,
                                     MPIDI_RMA_ACC_SRCBUF_DEFAULT);
        if (shm_locked) {
            MPIDI_CH3I_SHM_MUTEX_UNLOCK(win_ptr);
        }

        if (mpi_errno != MPI_SUCCESS)
            MPIR_ERR_POP(mpi_errno);

        goto fn_exit;
    }

    /* Get total length of origin data */
    MPIR_Datatype_get_size_macro(origin_datatype, origin_dtp_size);
    total_len = origin_dtp_size * origin_count;

    MPIR_Datatype_get_ptr(origin_datatype, origin_dtp_ptr);
    MPIR_Assert(origin_dtp_ptr != NULL && origin_dtp_ptr->basic_type != MPI_DATATYPE_NULL);
    basic_type = origin_dtp_ptr->basic_type;
    MPIR_Datatype_get_size_macro(basic_type, predefined_dtp_size);
    predefined_dtp_count = total_len / predefined_dtp_size;
    MPIR_Datatype_get_extent_macro(basic_type, predefined_dtp_extent);
    MPIR_Assert(predefined_dtp_count > 0 && predefined_dtp_size > 0 && predefined_dtp_extent > 0);

    stream_elem_count = MPIDI_CH3U_Acc_stream_size / predefined_dtp_extent;
    stream_unit_count = (predefined_dtp_count - 1) / stream_elem_count + 1;
    MPIR_Assert(stream_elem_count > 0 && stream_unit_count > 0);

    rest_len = total_len;
    for (i = 0; i < stream_unit_count; i++) {
        MPIR_Segment *seg = NULL;
        void *packed_buf = NULL;
        MPI_Aint first, last;
        MPI_Aint stream_offset, stream_size, stream_count;

        stream_offset = i * stream_elem_count * predefined_dtp_size;
        stream_size = MPL_MIN(stream_elem_count * predefined_dtp_size, rest_len);
        stream_count = stream_size / predefined_dtp_size;
        rest_len -= stream_size;

        first = stream_offset;
        last = stream_offset + stream_size;

        packed_buf = MPL_malloc(stream_size, MPL_MEM_BUFFER);

        seg = MPIR_Segment_alloc();
        MPIR_ERR_CHKANDJUMP1(seg == NULL, mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %s",
                             "MPIR_Segment");
        MPIR_Segment_init(origin_addr, origin_count, origin_datatype, seg);
        MPIR_Segment_pack(seg, first, &last, packed_buf);
        MPIR_Segment_free(seg);

        MPIR_Assert(stream_count == (int) stream_count);
        mpi_errno = do_accumulate_op((void *) packed_buf, (int) stream_count, basic_type,
                                     (void *) ((char *) base + disp_unit * target_disp),
                                     target_count, target_datatype, stream_offset, op,
                                     MPIDI_RMA_ACC_SRCBUF_PACKED);

        if (mpi_errno != MPI_SUCCESS)
            MPIR_ERR_POP(mpi_errno);

        MPL_free(packed_buf);
    }

    if (shm_locked) {
        MPIDI_CH3I_SHM_MUTEX_UNLOCK(win_ptr);
        shm_locked = 0;
    }

  fn_exit:
    MPIR_FUNC_VERBOSE_RMA_EXIT(MPID_STATE_MPIDI_CH3I_SHM_GET_ACC_OP);
    return mpi_errno;
    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    if (shm_locked) {
        MPIDI_CH3I_SHM_MUTEX_UNLOCK(win_ptr);
    }
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}


#undef FUNCNAME
#define FUNCNAME MPIDI_CH3I_Shm_get_op
#undef FCNAME
#define FCNAME MPL_QUOTE(FUNCNAME)
static inline int MPIDI_CH3I_Shm_get_op(void *origin_addr, int origin_count,
                                        MPI_Datatype origin_datatype, int target_rank,
                                        MPI_Aint target_disp, int target_count,
                                        MPI_Datatype target_datatype, MPIR_Win * win_ptr)
{
    void *base = NULL;
    int disp_unit;
    int mpi_errno = MPI_SUCCESS;
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_CH3I_SHM_GET_OP);

    MPIR_FUNC_VERBOSE_RMA_ENTER(MPID_STATE_MPIDI_CH3I_SHM_GET_OP);

    if (win_ptr->shm_allocated == TRUE) {
        int local_target_rank = win_ptr->comm_ptr->intranode_table[target_rank];
        MPIR_Assert(local_target_rank >= 0);
        base = win_ptr->shm_base_addrs[local_target_rank];
        disp_unit = win_ptr->basic_info_table[target_rank].disp_unit;
    }
    else {
        base = win_ptr->base;
        disp_unit = win_ptr->disp_unit;
    }

    mpi_errno = shm_copy((char *) base + disp_unit * target_disp, target_count, target_datatype,
                         origin_addr, origin_count, origin_datatype);
    if (mpi_errno) {
        MPIR_ERR_POP(mpi_errno);
    }

  fn_exit:
    MPIR_FUNC_VERBOSE_RMA_EXIT(MPID_STATE_MPIDI_CH3I_SHM_GET_OP);
    return mpi_errno;
    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}


#undef FUNCNAME
#define FUNCNAME MPIDI_CH3I_Shm_cas_op
#undef FCNAME
#define FCNAME MPL_QUOTE(FUNCNAME)
static inline int MPIDI_CH3I_Shm_cas_op(const void *origin_addr, const void *compare_addr,
                                        void *result_addr, MPI_Datatype datatype, int target_rank,
                                        MPI_Aint target_disp, MPIR_Win * win_ptr)
{
    void *base = NULL, *dest_addr = NULL;
    int disp_unit;
    MPI_Aint len;
    int shm_locked = 0;
    int mpi_errno = MPI_SUCCESS;
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_CH3I_SHM_CAS_OP);

    MPIR_FUNC_VERBOSE_RMA_ENTER(MPID_STATE_MPIDI_CH3I_SHM_CAS_OP);

    if (win_ptr->shm_allocated == TRUE) {
        int local_target_rank = win_ptr->comm_ptr->intranode_table[target_rank];
        MPIR_Assert(local_target_rank >= 0);
        base = win_ptr->shm_base_addrs[local_target_rank];
        disp_unit = win_ptr->basic_info_table[target_rank].disp_unit;

        MPIDI_CH3I_SHM_MUTEX_LOCK(win_ptr);
        shm_locked = 1;
    }
    else {
        base = win_ptr->base;
        disp_unit = win_ptr->disp_unit;
    }

    dest_addr = (char *) base + disp_unit * target_disp;

    MPIR_Datatype_get_size_macro(datatype, len);
    MPIR_Memcpy(result_addr, dest_addr, len);

    if (MPIR_Compare_equal(compare_addr, dest_addr, datatype)) {
        MPIR_Memcpy(dest_addr, origin_addr, len);
    }

    if (shm_locked) {
        MPIDI_CH3I_SHM_MUTEX_UNLOCK(win_ptr);
        shm_locked = 0;
    }

  fn_exit:
    MPIR_FUNC_VERBOSE_RMA_EXIT(MPID_STATE_MPIDI_CH3I_SHM_CAS_OP);
    return mpi_errno;
    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    if (shm_locked) {
        MPIDI_CH3I_SHM_MUTEX_UNLOCK(win_ptr);
    }
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}


#undef FUNCNAME
#define FUNCNAME MPIDI_CH3I_Shm_fop_op
#undef FCNAME
#define FCNAME MPL_QUOTE(FUNCNAME)
static inline int MPIDI_CH3I_Shm_fop_op(const void *origin_addr, void *result_addr,
                                        MPI_Datatype datatype, int target_rank,
                                        MPI_Aint target_disp, MPI_Op op, MPIR_Win * win_ptr)
{
    void *base = NULL, *dest_addr = NULL;
    MPI_User_function *uop = NULL;
    int disp_unit;
    MPI_Aint len;
    int one, shm_locked = 0;
    int mpi_errno = MPI_SUCCESS;
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_CH3I_SHM_FOP_OP);

    MPIR_FUNC_VERBOSE_RMA_ENTER(MPID_STATE_MPIDI_CH3I_SHM_FOP_OP);

    if (win_ptr->shm_allocated == TRUE) {
        int local_target_rank = win_ptr->comm_ptr->intranode_table[target_rank];
        MPIR_Assert(local_target_rank >= 0);
        base = win_ptr->shm_base_addrs[local_target_rank];
        disp_unit = win_ptr->basic_info_table[target_rank].disp_unit;

        MPIDI_CH3I_SHM_MUTEX_LOCK(win_ptr);
        shm_locked = 1;
    }
    else {
        base = win_ptr->base;
        disp_unit = win_ptr->disp_unit;
    }

    dest_addr = (char *) base + disp_unit * target_disp;

    MPIR_Datatype_get_size_macro(datatype, len);
    MPIR_Memcpy(result_addr, dest_addr, len);

    uop = MPIR_OP_HDL_TO_FN(op);
    one = 1;

    (*uop) ((void *) origin_addr, dest_addr, &one, &datatype);

    if (shm_locked) {
        MPIDI_CH3I_SHM_MUTEX_UNLOCK(win_ptr);
        shm_locked = 0;
    }

  fn_exit:
    MPIR_FUNC_VERBOSE_RMA_EXIT(MPID_STATE_MPIDI_CH3I_SHM_FOP_OP);
    return mpi_errno;
    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    if (shm_locked) {
        MPIDI_CH3I_SHM_MUTEX_UNLOCK(win_ptr);
    }
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}


#endif /* MPID_RMA_SHM_H_INCLUDED */