Blob Blame History Raw
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
/*
 *  (C) 2006 by Argonne National Laboratory.
 *      See COPYRIGHT in top-level directory.
 *
 *  Portions of this code were written by Intel Corporation.
 *  Copyright (C) 2011-2016 Intel Corporation.  Intel provides this material
 *  to Argonne National Laboratory subject to Software Grant and Corporate
 *  Contributor License Agreement dated February 8, 2012.
 */
#ifndef CH4R_WIN_H_INCLUDED
#define CH4R_WIN_H_INCLUDED

#include "ch4_impl.h"
#include "ch4i_util.h"
#include <opa_primitives.h>
#include "mpir_info.h"
#include "ch4r_symheap.h"
#include "uthash.h"
#ifdef HAVE_SYS_MMAN_H
#include <sys/mman.h>
#endif /* HAVE_SYS_MMAN_H */

/*
=== BEGIN_MPI_T_CVAR_INFO_BLOCK ===

cvars:
    - name        : MPIR_CVAR_CH4_RMA_MEM_EFFICIENT
      category    : CH4
      type        : boolean
      default     : false
      class       : none
      verbosity   : MPI_T_VERBOSITY_USER_BASIC
      scope       : MPI_T_SCOPE_GROUP_EQ
      description : >-
        If true, memory-saving mode is on, per-target object is released
        at the epoch end call.
        If false, performance-efficient mode is on, all allocated target
        objects are cached and freed at win_finalize.

=== END_MPI_T_CVAR_INFO_BLOCK ===
*/

extern MPIR_T_pvar_timer_t PVAR_TIMER_rma_winlock_getlocallock ATTRIBUTE((unused));
extern MPIR_T_pvar_timer_t PVAR_TIMER_rma_wincreate_allgather ATTRIBUTE((unused));
extern MPIR_T_pvar_timer_t PVAR_TIMER_rma_amhdr_set ATTRIBUTE((unused));

/*
 * Parse the value of an accumulate-ops info hint into a bitmask.
 *
 * str     - either one of the special values "none" / "any_op", or a
 *           comma-separated list of op names (e.g. "sum,maxloc,cswap").
 *           Must not be NULL. The string is only read, never modified.
 * ops_ptr - receives the bitmask; bit (1 << MPIDI_CH4I_ACCU_<OP>_SHIFT) is set
 *           for every recognized op. "none" stores 0, "any_op" stores every bit
 *           below MPIDI_CH4I_ACCU_OP_SHIFT_LAST. If the list contains no
 *           recognized op, *ops_ptr is left unchanged.
 *
 * Both the special values and the op names are matched as prefixes, i.e.
 * trailing characters in a token are tolerated and unknown tokens are skipped.
 */
MPL_STATIC_INLINE_PREFIX void MPIDI_CH4I_parse_info_accu_ops_str(const char *str,
                                                                 uint32_t * ops_ptr)
{
    /* Because names are matched as prefixes, a name that is itself a prefix of
     * another one ("max"/"maxloc", "min"/"minloc") must come AFTER the longer
     * name; otherwise "maxloc" would be taken for "max". */
    static const struct {
        const char *name;
        int shift;
    } op_names[] = {
        {"maxloc", MPIDI_CH4I_ACCU_MAXLOC_SHIFT},
        {"minloc", MPIDI_CH4I_ACCU_MINLOC_SHIFT},
        {"max", MPIDI_CH4I_ACCU_MAX_SHIFT},
        {"min", MPIDI_CH4I_ACCU_MIN_SHIFT},
        {"sum", MPIDI_CH4I_ACCU_SUM_SHIFT},
        {"prod", MPIDI_CH4I_ACCU_PROD_SHIFT},
        {"band", MPIDI_CH4I_ACCU_BAND_SHIFT},
        {"bor", MPIDI_CH4I_ACCU_BOR_SHIFT},
        {"bxor", MPIDI_CH4I_ACCU_BXOR_SHIFT},
        {"land", MPIDI_CH4I_ACCU_LAND_SHIFT},
        {"lor", MPIDI_CH4I_ACCU_LOR_SHIFT},
        {"lxor", MPIDI_CH4I_ACCU_LXOR_SHIFT},
        {"replace", MPIDI_CH4I_ACCU_REPLACE_SHIFT},
        {"no_op", MPIDI_CH4I_ACCU_NO_OP_SHIFT},
        {"cswap", MPIDI_CH4I_ACCU_CSWAP_SHIFT},
        {"compare_and_swap", MPIDI_CH4I_ACCU_CSWAP_SHIFT}
    };
    const size_t num_op_names = sizeof(op_names) / sizeof(op_names[0]);
    uint32_t ops = 0;
    const char *token;
    size_t i;

    /* str can never be NULL. */
    MPIR_Assert(str);

    /* handle special value */
    if (!strncmp(str, "none", strlen("none"))) {
        *ops_ptr = 0;
        return;
    } else if (!strncmp(str, "any_op", strlen("any_op"))) {
        MPIDI_CH4U_win_info_accu_op_shift_t op_shift;
        /* add all ops */
        for (op_shift = 0; op_shift < MPIDI_CH4I_ACCU_OP_SHIFT_LAST; op_shift++)
            ops |= (1 << op_shift);
        *ops_ptr = ops;
        return;
    }

    /* Walk the comma-separated tokens in place. No op name contains ',' or
     * '\0', so a successful prefix comparison can never run past the end of
     * the current token; the input therefore does not need to be split (and
     * thereby modified) the way strtok_r would do it. */
    token = str + strspn(str, ",");
    while (*token != '\0') {
        for (i = 0; i < num_op_names; i++) {
            if (!strncmp(token, op_names[i].name, strlen(op_names[i].name))) {
                ops |= ((uint32_t) 1 << op_names[i].shift);
                break;
            }
        }

        /* advance to the next non-empty token */
        token += strcspn(token, ",");
        token += strspn(token, ",");
    }

    /* update info only when any valid value is set */
    if (ops)
        *ops_ptr = ops;
}

/*
 * Render an accumulate-ops bitmask as a comma-separated list of op names.
 *
 * val    - bitmask; bit (1 << MPIDI_CH4I_ACCU_<OP>_SHIFT) selects an op
 * buf    - output buffer, receives a NUL-terminated string
 * maxlen - size of buf; asserted to be large enough for the full op list
 *
 * Names are emitted in the fixed order max, min, sum, prod, maxloc, minloc,
 * band, bor, bxor, land, lor, lxor, replace, no_op, cswap. If none of these
 * bits is set, "none" is written instead.
 */
MPL_STATIC_INLINE_PREFIX void MPIDI_CH4I_get_info_accu_ops_str(uint32_t val, char *buf,
                                                               size_t maxlen)
{
    /* Output order of the op names. */
    static const struct {
        int shift;
        const char *name;
    } op_table[] = {
        {MPIDI_CH4I_ACCU_MAX_SHIFT, "max"},
        {MPIDI_CH4I_ACCU_MIN_SHIFT, "min"},
        {MPIDI_CH4I_ACCU_SUM_SHIFT, "sum"},
        {MPIDI_CH4I_ACCU_PROD_SHIFT, "prod"},
        {MPIDI_CH4I_ACCU_MAXLOC_SHIFT, "maxloc"},
        {MPIDI_CH4I_ACCU_MINLOC_SHIFT, "minloc"},
        {MPIDI_CH4I_ACCU_BAND_SHIFT, "band"},
        {MPIDI_CH4I_ACCU_BOR_SHIFT, "bor"},
        {MPIDI_CH4I_ACCU_BXOR_SHIFT, "bxor"},
        {MPIDI_CH4I_ACCU_LAND_SHIFT, "land"},
        {MPIDI_CH4I_ACCU_LOR_SHIFT, "lor"},
        {MPIDI_CH4I_ACCU_LXOR_SHIFT, "lxor"},
        {MPIDI_CH4I_ACCU_REPLACE_SHIFT, "replace"},
        {MPIDI_CH4I_ACCU_NO_OP_SHIFT, "no_op"},
        {MPIDI_CH4I_ACCU_CSWAP_SHIFT, "cswap"}
    };
    const size_t num_ops = sizeof(op_table) / sizeof(op_table[0]);
    int written = 0;
    size_t idx;

    MPIR_Assert(maxlen >= strlen("max,min,sum,prod,maxloc,minloc,band,bor,"
                                 "bxor,land,lor,lxor,replace,no_op,cswap") + 1);

    for (idx = 0; idx < num_ops; idx++) {
        if (!(val & (1 << op_table[idx].shift)))
            continue;

        /* a separator is needed for every name except the first one written */
        written += snprintf(buf + written, maxlen - written, "%s%s",
                            (written > 0) ? "," : "", op_table[idx].name);
    }

    if (written == 0)
        strncpy(buf, "none", maxlen);
}

#undef FUNCNAME
#define FUNCNAME MPIDI_CH4R_RMA_Init_sync_pvars
#undef FCNAME
#define FCNAME MPL_QUOTE(FUNCNAME)
/*
 * Register the three RMA timer performance variables used by this file:
 * rma_winlock_getlocallock, rma_wincreate_allgather and rma_amhdr_set.
 * Each one is registered as an MPI_DOUBLE timer in the "RMA" category with
 * MPI_T_VERBOSITY_MPIDEV_DETAIL, MPI_T_BIND_NO_OBJECT and the read-only flag.
 *
 * Returns mpi_errno, which is initialized to MPI_SUCCESS.
 */
static inline int MPIDI_CH4R_RMA_Init_sync_pvars(void)
{
    int mpi_errno = MPI_SUCCESS;
    /* rma_winlock_getlocallock */
    MPIR_T_PVAR_TIMER_REGISTER_STATIC(RMA,
                                      MPI_DOUBLE,
                                      rma_winlock_getlocallock,
                                      MPI_T_VERBOSITY_MPIDEV_DETAIL,
                                      MPI_T_BIND_NO_OBJECT,
                                      MPIR_T_PVAR_FLAG_READONLY,
                                      "RMA", "WIN_LOCK:Get local lock (in seconds)");

    /* rma_wincreate_allgather */
    MPIR_T_PVAR_TIMER_REGISTER_STATIC(RMA,
                                      MPI_DOUBLE,
                                      rma_wincreate_allgather,
                                      MPI_T_VERBOSITY_MPIDEV_DETAIL,
                                      MPI_T_BIND_NO_OBJECT,
                                      MPIR_T_PVAR_FLAG_READONLY,
                                      "RMA", "WIN_CREATE:Allgather (in seconds)");

    /* rma_amhdr_set */
    MPIR_T_PVAR_TIMER_REGISTER_STATIC(RMA,
                                      MPI_DOUBLE,
                                      rma_amhdr_set,
                                      MPI_T_VERBOSITY_MPIDEV_DETAIL,
                                      MPI_T_BIND_NO_OBJECT,
                                      MPIR_T_PVAR_FLAG_READONLY,
                                      "RMA", "Set fields in AM Handler (in seconds)");

    return mpi_errno;
}

#undef FUNCNAME
#define FUNCNAME MPIDI_CH4I_win_set_info
#undef FCNAME
#define FCNAME MPL_QUOTE(FUNCNAME)
/*
 * Apply the hints stored in an info object to the info_args of a window.
 *
 * win     - window whose info_args are updated
 * info    - info object; its entries are walked starting at info->next
 * is_init - true when called while the window is being created. The
 *           atomics-related hints (which_accumulate_ops,
 *           accumulate_noncontig_dtype, accumulate_max_bytes,
 *           disable_shm_accumulate) are honored only in that case.
 *
 * Unknown keys are ignored, and for most keys an unrecognized value leaves
 * the current setting unchanged. An unrecognized token in
 * "accumulate_ordering" fails with MPI_ERR_ARG; in that case the window's
 * current ordering is not modified. The info values are only read; they are
 * never modified by this function.
 *
 * Returns MPI_SUCCESS or an MPI error code.
 */
MPL_STATIC_INLINE_PREFIX int MPIDI_CH4I_win_set_info(MPIR_Win * win, MPIR_Info * info, bool is_init)
{
    int mpi_errno = MPI_SUCCESS;
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_CH4I_WIN_SET_INFO);
    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_CH4I_WIN_SET_INFO);

    MPIR_Info *curr_ptr;

    curr_ptr = info->next;

    while (curr_ptr) {
        if (!strcmp(curr_ptr->key, "no_locks")) {
            if (!strcmp(curr_ptr->value, "true"))
                MPIDI_CH4U_WIN(win, info_args).no_locks = 1;
            else if (!strcmp(curr_ptr->value, "false"))
                MPIDI_CH4U_WIN(win, info_args).no_locks = 0;
        } else if (!strcmp(curr_ptr->key, "accumulate_ordering")) {
            /* value can never be NULL; check before the first use. */
            MPIR_Assert(curr_ptr->value);

            if (!strcmp(curr_ptr->value, "none")) {
                /* For MPI-3, "none" means no ordering and is not default. */
                MPIDI_CH4U_WIN(win, info_args).accumulate_ordering = 0;
            } else {
                /* Collect the requested ordering in a local and commit it only
                 * after the whole list was accepted, so that a bad token does
                 * not leave the window with a half-updated ordering. */
                int ordering = 0;
                const char *token = curr_ptr->value;

                /* Tokenize in place without writing to the info value
                 * (strtok_r would overwrite the commas with '\0'). strncmp
                 * stops at the terminating NUL, so short tokens are never
                 * read past their end. */
                token += strspn(token, ",");
                while (*token != '\0') {
                    if (!strncmp(token, "rar", 3))
                        ordering |= MPIDI_CH4I_ACCU_ORDER_RAR;
                    else if (!strncmp(token, "raw", 3))
                        ordering |= MPIDI_CH4I_ACCU_ORDER_RAW;
                    else if (!strncmp(token, "war", 3))
                        ordering |= MPIDI_CH4I_ACCU_ORDER_WAR;
                    else if (!strncmp(token, "waw", 3))
                        ordering |= MPIDI_CH4I_ACCU_ORDER_WAW;
                    else
                        MPIR_ERR_SETANDSTMT(mpi_errno, MPI_ERR_ARG, goto fn_fail, "**info");

                    token += strcspn(token, ",");
                    token += strspn(token, ",");
                }

                /* An empty list keeps the previous ordering. */
                if (ordering != 0)
                    MPIDI_CH4U_WIN(win, info_args).accumulate_ordering = ordering;
            }
        } else if (!strcmp(curr_ptr->key, "accumulate_ops")) {
            if (!strcmp(curr_ptr->value, "same_op"))
                MPIDI_CH4U_WIN(win, info_args).accumulate_ops = MPIDI_CH4I_ACCU_SAME_OP;
            else if (!strcmp(curr_ptr->value, "same_op_no_op"))
                MPIDI_CH4U_WIN(win, info_args).accumulate_ops = MPIDI_CH4I_ACCU_SAME_OP_NO_OP;
        } else if (!strcmp(curr_ptr->key, "same_disp_unit")) {
            if (!strcmp(curr_ptr->value, "true"))
                MPIDI_CH4U_WIN(win, info_args).same_disp_unit = 1;
            else if (!strcmp(curr_ptr->value, "false"))
                MPIDI_CH4U_WIN(win, info_args).same_disp_unit = 0;
        } else if (!strcmp(curr_ptr->key, "same_size")) {
            if (!strcmp(curr_ptr->value, "true"))
                MPIDI_CH4U_WIN(win, info_args).same_size = 1;
            else if (!strcmp(curr_ptr->value, "false"))
                MPIDI_CH4U_WIN(win, info_args).same_size = 0;
        } else if (!strcmp(curr_ptr->key, "alloc_shared_noncontig")) {
            if (!strcmp(curr_ptr->value, "true"))
                MPIDI_CH4U_WIN(win, info_args).alloc_shared_noncontig = 1;
            else if (!strcmp(curr_ptr->value, "false"))
                MPIDI_CH4U_WIN(win, info_args).alloc_shared_noncontig = 0;
        } else if (!strcmp(curr_ptr->key, "alloc_shm")) {
            if (!strcmp(curr_ptr->value, "true"))
                MPIDI_CH4U_WIN(win, info_args).alloc_shm = 1;
            else if (!strcmp(curr_ptr->value, "false"))
                MPIDI_CH4U_WIN(win, info_args).alloc_shm = 0;
        }
        /* We allow the user to set the following atomics hint only at window init time,
         * all future updates by win_set_info are ignored. This is because we do not
         * have a good way to ensure all outstanding atomic ops have been completed
         * on all processes especially in passive-target epochs. */
        else if (is_init && !strcmp(curr_ptr->key, "which_accumulate_ops")) {
            MPIDI_CH4I_parse_info_accu_ops_str(curr_ptr->value,
                                               &MPIDI_CH4U_WIN(win,
                                                               info_args).which_accumulate_ops);
        } else if (is_init && !strcmp(curr_ptr->key, "accumulate_noncontig_dtype")) {
            if (!strcmp(curr_ptr->value, "true"))
                MPIDI_CH4U_WIN(win, info_args).accumulate_noncontig_dtype = true;
            else if (!strcmp(curr_ptr->value, "false"))
                MPIDI_CH4U_WIN(win, info_args).accumulate_noncontig_dtype = false;
        } else if (is_init && !strcmp(curr_ptr->key, "accumulate_max_bytes")) {
            if (!strcmp(curr_ptr->value, "unlimited") || !strcmp(curr_ptr->value, "-1"))
                MPIDI_CH4U_WIN(win, info_args).accumulate_max_bytes = -1;
            else {
                char *end = NULL;
                long max_bytes = strtol(curr_ptr->value, &end, 10);
                /* Ignore values that do not start with a number (atol would
                 * silently turn them into a limit of 0) and negative values. */
                if (end != curr_ptr->value && max_bytes >= 0)
                    MPIDI_CH4U_WIN(win, info_args).accumulate_max_bytes = max_bytes;
            }
        } else if (is_init && !strcmp(curr_ptr->key, "disable_shm_accumulate")) {
            /* any value other than "true" turns the hint off */
            if (!strcmp(curr_ptr->value, "true"))
                MPIDI_CH4U_WIN(win, info_args).disable_shm_accumulate = true;
            else
                MPIDI_CH4U_WIN(win, info_args).disable_shm_accumulate = false;
        }

        curr_ptr = curr_ptr->next;
    }

  fn_exit:
    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDI_CH4I_WIN_SET_INFO);
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}

#undef FUNCNAME
#define FUNCNAME MPIDI_CH4R_mpi_win_set_info
#undef FCNAME
#define FCNAME MPL_QUOTE(FUNCNAME)
/*
 * Update the hints of an existing window from an info object.
 *
 * The hints are applied through MPIDI_CH4I_win_set_info with is_init == FALSE;
 * on success the call finishes with a barrier on the window communicator.
 *
 * Returns the error of the hint update if it failed, otherwise the result of
 * the barrier.
 */
static inline int MPIDI_CH4R_mpi_win_set_info(MPIR_Win * win, MPIR_Info * info)
{
    int mpi_errno = MPI_SUCCESS;
    MPIR_Errflag_t barrier_errflag = MPIR_ERR_NONE;
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_CH4R_MPI_WIN_SET_INFO);
    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_CH4R_MPI_WIN_SET_INFO);

    /* Not window creation: pass is_init == FALSE. */
    mpi_errno = MPIDI_CH4I_win_set_info(win, info, FALSE /* is_init */);
    if (MPI_SUCCESS != mpi_errno) {
        MPIR_ERR_POP(mpi_errno);
    }

    /* The barrier result is handed back to the caller unmodified. */
    mpi_errno = MPIR_Barrier(win->comm_ptr, &barrier_errflag);

  fn_exit:
    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDI_CH4R_MPI_WIN_SET_INFO);
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}

#undef FUNCNAME
#define FUNCNAME MPIDI_CH4R_win_init
#undef FCNAME
#define FCNAME MPL_QUOTE(FUNCNAME)
/*
 * Allocate a window object and initialize its generic fields.
 *
 * length        - stored as win->size
 * disp_unit     - stored as win->disp_unit
 * win_ptr       - receives the newly allocated window
 * info          - optional hints (may be NULL or MPI_INFO_NULL); applied with
 *                 is_init == TRUE after the defaults have been set
 * comm_ptr      - communicator the window is created over; a duplicate of it
 *                 becomes win->comm_ptr, the original is used to generate the
 *                 window id
 * create_flavor - stored as win->create_flavor
 * model         - stored as win->model
 *
 * The new window is also entered into MPIDI_CH4_Global.win_map under its id.
 *
 * Returns MPI_SUCCESS or an MPI error code.
 */
static inline int MPIDI_CH4R_win_init(MPI_Aint length,
                                      int disp_unit,
                                      MPIR_Win ** win_ptr,
                                      MPIR_Info * info,
                                      MPIR_Comm * comm_ptr, int create_flavor, int model)
{
    int mpi_errno = MPI_SUCCESS;
    MPIR_Win *win = (MPIR_Win *) MPIR_Handle_obj_alloc(&MPIR_Win_mem);
    MPIR_Comm *dup_comm;
    MPIDI_CH4U_win_info_accu_op_shift_t shift;
    uint32_t all_ops = 0;

    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_CH4R_WIN_INIT);
    MPIR_FUNC_VERBOSE_RMA_ENTER(MPID_STATE_MPIDI_CH4R_WIN_INIT);

    MPIR_ERR_CHKANDSTMT(win == NULL, mpi_errno, MPI_ERR_NO_MEM, goto fn_fail, "**nomem");
    *win_ptr = win;

    memset(&win->dev.ch4u, 0, sizeof(MPIDI_CH4U_win_t));

    /* The window gets its own communicator so that internal collectives
     * cannot collide with traffic on the user's communicator. */
    mpi_errno = MPIR_Comm_dup_impl(comm_ptr, &dup_comm);
    if (mpi_errno != MPI_SUCCESS) {
        MPIR_ERR_POP(mpi_errno);
    }

    /* No per-target objects yet. */
    MPIDI_CH4U_WIN(win, targets) = NULL;

    /* Generic window fields. */
    win->errhandler = NULL;
    win->base = NULL;
    win->size = length;
    win->disp_unit = disp_unit;
    win->create_flavor = (MPIR_Win_flavor_t) create_flavor;
    win->model = (MPIR_Win_model_t) model;
    win->copyCreateFlavor = (MPIR_Win_flavor_t) 0;
    win->copyModel = (MPIR_Win_model_t) 0;
    win->attributes = NULL;
    win->comm_ptr = dup_comm;
    win->copyDispUnit = 0;
    win->copySize = 0;

    /* CH4U state. */
    MPIDI_CH4U_WIN(win, shared_table) = NULL;
    MPIDI_CH4U_WIN(win, sync).assert_mode = 0;
    MPIDI_CH4U_WIN(win, shm_allocated) = 0;

    /* Default values of the per-window info hints. */
    MPIDI_CH4U_WIN(win, info_args).no_locks = 0;
    MPIDI_CH4U_WIN(win, info_args).accumulate_ordering = (MPIDI_CH4I_ACCU_ORDER_RAR |
                                                          MPIDI_CH4I_ACCU_ORDER_RAW |
                                                          MPIDI_CH4I_ACCU_ORDER_WAR |
                                                          MPIDI_CH4I_ACCU_ORDER_WAW);
    MPIDI_CH4U_WIN(win, info_args).accumulate_ops = MPIDI_CH4I_ACCU_SAME_OP_NO_OP;
    MPIDI_CH4U_WIN(win, info_args).same_size = 0;
    MPIDI_CH4U_WIN(win, info_args).same_disp_unit = 0;
    MPIDI_CH4U_WIN(win, info_args).alloc_shared_noncontig = 0;

    /* alloc_shm defaults to on only for the ALLOCATE and SHARED flavors. */
    MPIDI_CH4U_WIN(win, info_args).alloc_shm =
        (win->create_flavor == MPI_WIN_FLAVOR_ALLOCATE ||
         win->create_flavor == MPI_WIN_FLAVOR_SHARED) ? 1 : 0;

    /* By default every accumulate op is allowed. */
    for (shift = 0; shift < MPIDI_CH4I_ACCU_OP_SHIFT_LAST; shift++)
        all_ops |= (1 << shift);
    MPIDI_CH4U_WIN(win, info_args).which_accumulate_ops = all_ops;
    MPIDI_CH4U_WIN(win, info_args).accumulate_noncontig_dtype = true;
    MPIDI_CH4U_WIN(win, info_args).accumulate_max_bytes = -1;
    MPIDI_CH4U_WIN(win, info_args).disable_shm_accumulate = false;

    /* User-provided hints override the defaults. */
    if ((info != NULL) && ((int *) info != (int *) MPI_INFO_NULL)) {
        mpi_errno = MPIDI_CH4I_win_set_info(win, info, TRUE /* is_init */);
        if (mpi_errno != MPI_SUCCESS) {
            MPIR_ERR_POP(mpi_errno);
        }
    }

    MPIDI_CH4U_WIN(win, mmap_sz) = 0;
    MPIDI_CH4U_WIN(win, mmap_addr) = NULL;

    /* Completion counters start at zero. */
    MPIR_cc_set(&MPIDI_CH4U_WIN(win, local_cmpl_cnts), 0);
    MPIR_cc_set(&MPIDI_CH4U_WIN(win, remote_cmpl_cnts), 0);
    MPIR_cc_set(&MPIDI_CH4U_WIN(win, remote_acc_cmpl_cnts), 0);

    /* Make the window findable by id. */
    MPIDI_CH4U_WIN(win, win_id) = MPIDI_CH4U_generate_win_id(comm_ptr);
    MPIDI_CH4U_map_set(MPIDI_CH4_Global.win_map, MPIDI_CH4U_WIN(win, win_id), win, MPL_MEM_RMA);

  fn_exit:
    MPIR_FUNC_VERBOSE_RMA_EXIT(MPID_STATE_MPIDI_CH4R_WIN_INIT);
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}

#undef FUNCNAME
#define FUNCNAME MPIDI_CH4I_fill_ranks_in_win_grp
#undef FCNAME
#define FCNAME MPL_QUOTE(FUNCNAME)
/*
 * Translate every rank of a group into the corresponding rank in the group of
 * the window communicator.
 *
 * win_ptr          - window; the group of win_ptr->comm_ptr is the target
 * group_ptr        - group whose ranks 0 .. size-1 are translated
 * ranks_in_win_grp - caller-provided array with room for group_ptr->size
 *                    entries; receives the translated ranks
 *
 * Returns MPI_SUCCESS or an MPI error code. The temporary rank array and the
 * window group obtained here are released on every path.
 */
static inline int MPIDI_CH4I_fill_ranks_in_win_grp(MPIR_Win * win_ptr, MPIR_Group * group_ptr,
                                                   int *ranks_in_win_grp)
{
    int mpi_errno = MPI_SUCCESS;
    int free_errno = MPI_SUCCESS;
    int i, *ranks_in_grp = NULL;
    MPIR_Group *win_grp_ptr;

    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_CH4I_FILL_RANKS_IN_WIN_GRP);
    MPIR_FUNC_VERBOSE_RMA_ENTER(MPID_STATE_MPIDI_CH4I_FILL_RANKS_IN_WIN_GRP);

    /* identity list 0 .. size-1: the ranks to translate */
    ranks_in_grp = (int *) MPL_malloc(group_ptr->size * sizeof(int), MPL_MEM_RMA);
    MPIR_Assert(ranks_in_grp);
    for (i = 0; i < group_ptr->size; i++)
        ranks_in_grp[i] = i;

    mpi_errno = MPIR_Comm_group_impl(win_ptr->comm_ptr, &win_grp_ptr);
    if (mpi_errno != MPI_SUCCESS)
        MPIR_ERR_POP(mpi_errno);

    mpi_errno = MPIR_Group_translate_ranks_impl(group_ptr, group_ptr->size,
                                                ranks_in_grp, win_grp_ptr, ranks_in_win_grp);

    /* Release the window group before looking at the translation result, so
     * that it is not leaked when the translation failed. */
    free_errno = MPIR_Group_free_impl(win_grp_ptr);

    if (mpi_errno != MPI_SUCCESS)
        MPIR_ERR_POP(mpi_errno);

    if (free_errno != MPI_SUCCESS) {
        mpi_errno = free_errno;
        MPIR_ERR_POP(mpi_errno);
    }

  fn_exit:
    MPL_free(ranks_in_grp);

    MPIR_FUNC_VERBOSE_RMA_EXIT(MPID_STATE_MPIDI_CH4I_FILL_RANKS_IN_WIN_GRP);
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}

#undef FUNCNAME
#define FUNCNAME MPIDI_CH4R_mpi_win_start
#undef FCNAME
#define FCNAME MPL_QUOTE(FUNCNAME)
/*
 * Open a START access epoch on a window for the given target group.
 *
 * group  - target group; a reference is taken and kept in sync.sc.group until
 *          the epoch is closed
 * assert - assertion flags; with MPI_MODE_NOCHECK the wait for the targets'
 *          post notifications is skipped
 * win    - window; must not be in an access epoch
 *
 * Unless NOCHECK is given, the call makes progress until sync.pw.count equals
 * the group size. The counter is reset afterwards.
 *
 * Returns MPI_SUCCESS or an MPI error code. On failure no group reference is
 * held by the window.
 */
static inline int MPIDI_CH4R_mpi_win_start(MPIR_Group * group, int assert, MPIR_Win * win)
{
    int mpi_errno = MPI_SUCCESS;

    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_CH4R_MPI_WIN_START);
    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_CH4R_MPI_WIN_START);

    MPIDI_CH4U_ACCESS_EPOCH_CHECK_NONE(win, mpi_errno, goto fn_fail);

    if (assert & MPI_MODE_NOCHECK) {
        goto no_check;
    }

    /* wait until every process in the group has posted */
    MPIDI_CH4R_PROGRESS_WHILE(group->size != (int) MPIDI_CH4U_WIN(win, sync).pw.count);
  no_check:
    MPIDI_CH4U_WIN(win, sync).pw.count = 0;

    MPIR_ERR_CHKANDJUMP((MPIDI_CH4U_WIN(win, sync).sc.group != NULL),
                        mpi_errno, MPI_ERR_GROUP, "**group");

    /* Take the reference only once nothing can fail anymore; taking it
     * earlier would leak it on the error paths above. */
    MPIR_Group_add_ref(group);
    MPIDI_CH4U_WIN(win, sync).sc.group = group;
    MPIDI_CH4U_WIN(win, sync).access_epoch_type = MPIDI_CH4U_EPOTYPE_START;

  fn_exit:
    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDI_CH4R_MPI_WIN_START);
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}

#undef FUNCNAME
#define FUNCNAME MPIDI_CH4R_mpi_win_complete
#undef FCNAME
#define FCNAME MPL_QUOTE(FUNCNAME)
/*
 * Close the START access epoch of a window.
 *
 * Steps, in order:
 *   1. run the netmod (and, unless built netmod-only, shmmod) completion hooks;
 *   2. make progress until MPIDI_win_check_group_local_completed reports that
 *      all targets of the start group are locally complete;
 *   3. send an MPIDI_CH4U_WIN_COMPLETE message to every process of the group;
 *   4. reset the access epoch and release the group reference held in
 *      sync.sc.group.
 *
 * Returns MPI_SUCCESS or an MPI error code. If sending fails, the epoch is
 * left open.
 */
static inline int MPIDI_CH4R_mpi_win_complete(MPIR_Win * win)
{
    int mpi_errno = MPI_SUCCESS;
    MPIDI_CH4U_win_cntrl_msg_t msg;
    int win_grp_idx, peer;
    MPIR_Group *group;
    int *ranks_in_win_grp = NULL;
    int all_local_completed = 0;

    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_CH4R_MPI_WIN_COMPLETE);
    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_CH4R_MPI_WIN_COMPLETE);

    /* Leave through fn_fail (not a bare return) so that the function-exit
     * trace below is always paired with the enter above. */
    MPIDI_CH4U_ACCESS_EPOCH_CHECK(win, MPIDI_CH4U_EPOTYPE_START, mpi_errno, goto fn_fail);

    group = MPIDI_CH4U_WIN(win, sync).sc.group;
    MPIR_Assert(group != NULL);

    /* Ensure op completion in netmod and shmmod */
    mpi_errno = MPIDI_NM_rma_win_cmpl_hook(win);
    if (mpi_errno != MPI_SUCCESS)
        MPIR_ERR_POP(mpi_errno);

#ifndef MPIDI_CH4_DIRECT_NETMOD
    mpi_errno = MPIDI_SHM_rma_win_cmpl_hook(win);
    if (mpi_errno != MPI_SUCCESS)
        MPIR_ERR_POP(mpi_errno);
#endif

    /* The whole struct goes on the wire (sizeof(msg)); clear it first so that
     * fields not set here (e.g. lock_type) are not sent uninitialized. */
    memset(&msg, 0, sizeof(msg));
    msg.win_id = MPIDI_CH4U_WIN(win, win_id);
    msg.origin_rank = win->comm_ptr->rank;

    /* Ensure completion of AM operations */
    ranks_in_win_grp = (int *) MPL_malloc(sizeof(int) * group->size, MPL_MEM_RMA);
    MPIR_Assert(ranks_in_win_grp);

    mpi_errno = MPIDI_CH4I_fill_ranks_in_win_grp(win, group, ranks_in_win_grp);
    if (mpi_errno != MPI_SUCCESS)
        MPIR_ERR_POP(mpi_errno);

    /* FIXME: now we simply set per-target counters for PSCW, can it be optimized ? */
    do {
        /* progress is made outside of the VNI critical section */
        MPID_THREAD_CS_EXIT(VNI, MPIDI_CH4_Global.vni_lock);
        MPIDI_CH4R_PROGRESS();
        MPID_THREAD_CS_ENTER(VNI, MPIDI_CH4_Global.vni_lock);
        MPIDI_win_check_group_local_completed(win, ranks_in_win_grp, group->size,
                                              &all_local_completed);
    } while (all_local_completed != 1);

    /* notify every target that this origin has completed its epoch */
    for (win_grp_idx = 0; win_grp_idx < group->size; ++win_grp_idx) {
        peer = ranks_in_win_grp[win_grp_idx];

#ifndef MPIDI_CH4_DIRECT_NETMOD
        if (MPIDI_CH4_rank_is_local(peer, win->comm_ptr))
            mpi_errno = MPIDI_SHM_am_send_hdr(peer, win->comm_ptr,
                                              MPIDI_CH4U_WIN_COMPLETE, &msg, sizeof(msg));
        else
#endif
        {
            mpi_errno = MPIDI_NM_am_send_hdr(peer, win->comm_ptr,
                                             MPIDI_CH4U_WIN_COMPLETE, &msg, sizeof(msg));
        }

        if (mpi_errno != MPI_SUCCESS)
            MPIR_ERR_SETANDSTMT(mpi_errno, MPI_ERR_RMA_SYNC, goto fn_fail, "**rmasync");
    }

    /* In performance-efficient mode, all allocated targets are freed at win_finalize. */
    if (MPIR_CVAR_CH4_RMA_MEM_EFFICIENT)
        MPIDI_CH4U_win_target_cleanall(win);
    MPIDI_CH4U_WIN(win, sync).access_epoch_type = MPIDI_CH4U_EPOTYPE_NONE;
    MPIR_Group_release(MPIDI_CH4U_WIN(win, sync).sc.group);
    MPIDI_CH4U_WIN(win, sync).sc.group = NULL;

  fn_exit:
    /* ranks_in_win_grp is still NULL when failing before the allocation */
    MPL_free(ranks_in_win_grp);

    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDI_CH4R_MPI_WIN_COMPLETE);
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}

#undef FUNCNAME
#define FUNCNAME MPIDI_CH4R_mpi_win_post
#undef FCNAME
#define FCNAME MPL_QUOTE(FUNCNAME)
/*
 * Open a POST exposure epoch on a window for the given origin group.
 *
 * group  - origin group (must not be NULL); a reference is taken and kept in
 *          sync.pw.group
 * assert - assertion flags; with MPI_MODE_NOCHECK no post notification is sent
 * win    - window; must not be in an exposure epoch
 *
 * Unless NOCHECK is given, an MPIDI_CH4U_WIN_POST message is sent to every
 * process of the group (through shmmod for node-local peers when available,
 * through the netmod otherwise).
 *
 * Returns MPI_SUCCESS or an MPI error code. When the epoch/group checks fail,
 * no group reference is taken.
 */
static inline int MPIDI_CH4R_mpi_win_post(MPIR_Group * group, int assert, MPIR_Win * win)
{
    int mpi_errno = MPI_SUCCESS;
    MPIDI_CH4U_win_cntrl_msg_t msg;
    int win_grp_idx, peer;
    int *ranks_in_win_grp = NULL;

    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_CH4R_MPI_WIN_POST);
    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_CH4R_MPI_WIN_POST);

    MPIDI_CH4U_EXPOSURE_EPOCH_CHECK_NONE(win, mpi_errno, goto fn_fail);

    /* validate before the group is touched for the first time */
    MPIR_Assert(group != NULL);
    MPIR_ERR_CHKANDJUMP((MPIDI_CH4U_WIN(win, sync).pw.group != NULL),
                        mpi_errno, MPI_ERR_GROUP, "**group");

    /* Take the reference only after the check above passed; otherwise the
     * failing path would leak it. */
    MPIR_Group_add_ref(group);
    MPIDI_CH4U_WIN(win, sync).pw.group = group;
    if (assert & MPI_MODE_NOCHECK) {
        goto no_check;
    }

    /* The whole struct goes on the wire (sizeof(msg)); clear it first so that
     * fields not set here (e.g. lock_type) are not sent uninitialized. */
    memset(&msg, 0, sizeof(msg));
    msg.win_id = MPIDI_CH4U_WIN(win, win_id);
    msg.origin_rank = win->comm_ptr->rank;

    ranks_in_win_grp = (int *) MPL_malloc(sizeof(int) * group->size, MPL_MEM_RMA);
    MPIR_Assert(ranks_in_win_grp);

    mpi_errno = MPIDI_CH4I_fill_ranks_in_win_grp(win, group, ranks_in_win_grp);
    if (mpi_errno != MPI_SUCCESS)
        MPIR_ERR_POP(mpi_errno);

    /* notify every origin that the exposure epoch is open */
    for (win_grp_idx = 0; win_grp_idx < group->size; ++win_grp_idx) {
        peer = ranks_in_win_grp[win_grp_idx];

#ifndef MPIDI_CH4_DIRECT_NETMOD
        if (MPIDI_CH4_rank_is_local(peer, win->comm_ptr))
            mpi_errno = MPIDI_SHM_am_send_hdr(peer, win->comm_ptr,
                                              MPIDI_CH4U_WIN_POST, &msg, sizeof(msg));
        else
#endif
        {
            mpi_errno = MPIDI_NM_am_send_hdr(peer, win->comm_ptr,
                                             MPIDI_CH4U_WIN_POST, &msg, sizeof(msg));
        }

        if (mpi_errno != MPI_SUCCESS)
            MPIR_ERR_SETANDSTMT(mpi_errno, MPI_ERR_RMA_SYNC, goto fn_fail, "**rmasync");
    }

  no_check:
    MPIDI_CH4U_WIN(win, sync).exposure_epoch_type = MPIDI_CH4U_EPOTYPE_POST;
  fn_exit:
    /* ranks_in_win_grp is still NULL on the NOCHECK and early-failure paths */
    MPL_free(ranks_in_win_grp);

    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDI_CH4R_MPI_WIN_POST);
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}

#undef FUNCNAME
#define FUNCNAME MPIDI_CH4R_mpi_win_wait
#undef FCNAME
#define FCNAME MPL_QUOTE(FUNCNAME)
/*
 * Close the POST exposure epoch of a window, blocking until it is finished.
 *
 * Makes progress until sync.sc.count equals the size of the posted group,
 * then resets the counter, drops the group reference held in sync.pw.group
 * and marks the exposure epoch as closed.
 *
 * Returns MPI_SUCCESS or an MPI error code.
 */
static inline int MPIDI_CH4R_mpi_win_wait(MPIR_Win * win)
{
    int mpi_errno = MPI_SUCCESS;
    MPIR_Group *post_group;

    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_CH4R_MPI_WIN_WAIT);
    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_CH4R_MPI_WIN_WAIT);

    MPIDI_CH4U_EXPOSURE_EPOCH_CHECK(win, MPIDI_CH4U_EPOTYPE_POST, mpi_errno, goto fn_fail);

    /* block until sc.count has reached the size of the posted group */
    post_group = MPIDI_CH4U_WIN(win, sync).pw.group;
    MPIDI_CH4R_PROGRESS_WHILE(post_group->size != (int) MPIDI_CH4U_WIN(win, sync).sc.count);

    /* tear down the epoch state */
    MPIDI_CH4U_WIN(win, sync).sc.count = 0;
    MPIDI_CH4U_WIN(win, sync).pw.group = NULL;
    MPIR_Group_release(post_group);
    MPIDI_CH4U_WIN(win, sync).exposure_epoch_type = MPIDI_CH4U_EPOTYPE_NONE;

  fn_exit:
    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDI_CH4R_MPI_WIN_WAIT);
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}

#undef FUNCNAME
#define FUNCNAME MPIDI_CH4R_mpi_win_test
#undef FCNAME
#define FCNAME MPL_QUOTE(FUNCNAME)
/*
 * Non-blocking variant of closing the POST exposure epoch of a window.
 *
 * win  - window; must be in a POST exposure epoch
 * flag - set to 1 if sync.sc.count has reached the size of the posted group,
 *        in which case the epoch is closed exactly as in win_wait; set to 0
 *        otherwise, after one round of progress
 *
 * Returns MPI_SUCCESS or an MPI error code.
 */
static inline int MPIDI_CH4R_mpi_win_test(MPIR_Win * win, int *flag)
{
    int mpi_errno = MPI_SUCCESS;
    MPIR_Group *post_group;

    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_CH4R_MPI_WIN_TEST);
    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_CH4R_MPI_WIN_TEST);

    MPIDI_CH4U_EXPOSURE_EPOCH_CHECK(win, MPIDI_CH4U_EPOTYPE_POST, mpi_errno, goto fn_fail);

    post_group = MPIDI_CH4U_WIN(win, sync).pw.group;

    if (post_group->size != (int) MPIDI_CH4U_WIN(win, sync).sc.count) {
        /* Not finished yet: give the progress engine one turn (outside of the
         * VNI critical section) and report "not done". */
        MPID_THREAD_CS_EXIT(VNI, MPIDI_CH4_Global.vni_lock);
        MPIDI_CH4R_PROGRESS();
        MPID_THREAD_CS_ENTER(VNI, MPIDI_CH4_Global.vni_lock);
        *flag = 0;
    } else {
        /* Finished: close the exposure epoch. */
        MPIDI_CH4U_WIN(win, sync).sc.count = 0;
        MPIDI_CH4U_WIN(win, sync).pw.group = NULL;
        *flag = 1;
        MPIR_Group_release(post_group);
        MPIDI_CH4U_WIN(win, sync).exposure_epoch_type = MPIDI_CH4U_EPOTYPE_NONE;
    }

  fn_exit:
    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDI_CH4R_MPI_WIN_TEST);
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}

#undef FUNCNAME
#define FUNCNAME MPIDI_CH4R_mpi_win_lock
#undef FCNAME
#define FCNAME MPL_QUOTE(FUNCNAME)
/*
 * Open a per-target LOCK access epoch on a window.
 *
 * lock_type - lock type, forwarded to the target in the lock message
 * rank      - target rank in the window communicator, or MPI_PROC_NULL
 * assert    - assertion flags; with MPI_MODE_NOCHECK no lock message is sent
 * win       - window
 *
 * For MPI_PROC_NULL only the window-level epoch type and lock counter are
 * updated. Otherwise an MPIDI_CH4U_WIN_LOCK message is sent to the target and
 * the call makes progress until the target's `locked` counter has been
 * incremented (presumably by the lock-grant handler; not visible here).
 *
 * Returns MPI_SUCCESS or an MPI error code.
 */
static inline int MPIDI_CH4R_mpi_win_lock(int lock_type, int rank, int assert, MPIR_Win * win)
{
    int mpi_errno = MPI_SUCCESS;
    unsigned locked;

    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_CH4R_MPI_WIN_LOCK);
    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_CH4R_MPI_WIN_LOCK);

    /* PROC_NULL has no per-target state; only the window-level bookkeeping
     * at fn_exit0 is performed. */
    if (rank == MPI_PROC_NULL)
        goto fn_exit0;

    MPIDI_CH4U_LOCK_EPOCH_CHECK_NONE(win, rank, mpi_errno, goto fn_fail);

    MPIDI_CH4U_win_target_t *target_ptr = MPIDI_CH4U_win_target_get(win, rank);

    MPIDI_CH4U_win_target_sync_lock_t *slock = &target_ptr->sync.lock;
    MPIR_Assert(slock->locked == 0);
    if (assert & MPI_MODE_NOCHECK) {
        /* NOCHECK: mark the target as locked locally without asking it; the
         * flag is remembered so that unlock can skip the unlock message. */
        target_ptr->sync.assert_mode |= MPI_MODE_NOCHECK;
        slock->locked = 1;
        goto no_check;
    }

    MPIDI_CH4U_win_cntrl_msg_t msg;
    msg.win_id = MPIDI_CH4U_WIN(win, win_id);
    msg.origin_rank = win->comm_ptr->rank;
    msg.lock_type = lock_type;

    /* value slock->locked must reach before the lock counts as granted */
    locked = slock->locked + 1;
#ifndef MPIDI_CH4_DIRECT_NETMOD
    if (MPIDI_CH4_rank_is_local(rank, win->comm_ptr))
        mpi_errno =
            MPIDI_SHM_am_send_hdr(rank, win->comm_ptr, MPIDI_CH4U_WIN_LOCK, &msg, sizeof(msg));
    else
#endif
    {
        mpi_errno =
            MPIDI_NM_am_send_hdr(rank, win->comm_ptr, MPIDI_CH4U_WIN_LOCK, &msg, sizeof(msg));
    }

    if (mpi_errno != MPI_SUCCESS)
        MPIR_ERR_SETANDSTMT(mpi_errno, MPI_ERR_RMA_SYNC, goto fn_fail, "**rmasync");

    /* block until the lock has been granted */
    MPIDI_CH4R_PROGRESS_WHILE(slock->locked != locked);
  no_check:
    target_ptr->sync.access_epoch_type = MPIDI_CH4U_EPOTYPE_LOCK;

  fn_exit0:
    /* window-level bookkeeping: count the open per-target lock epochs */
    MPIDI_CH4U_WIN(win, sync).access_epoch_type = MPIDI_CH4U_EPOTYPE_LOCK;
    MPIDI_CH4U_WIN(win, sync).lock.count++;

  fn_exit:
    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDI_CH4R_MPI_WIN_LOCK);
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}

#undef FUNCNAME
#define FUNCNAME MPIDI_CH4R_mpi_win_unlock
#undef FCNAME
#define FCNAME MPL_QUOTE(FUNCNAME)
/*
 * Close the per-target LOCK access epoch opened by win_lock.
 *
 * rank - target rank in the window communicator, or MPI_PROC_NULL
 * win  - window; must be in a LOCK access epoch
 *
 * For MPI_PROC_NULL only the window-level lock counter is updated. Otherwise
 * the netmod/shmmod target completion hooks are run, progress is made until
 * the target's remote completion counter is zero, and (unless the lock was
 * taken with MPI_MODE_NOCHECK) an MPIDI_CH4U_WIN_UNLOCK message is sent and
 * the call waits until the target's `locked` counter has been decremented.
 * The window-level epoch is reset once the last per-target lock is closed.
 *
 * Returns MPI_SUCCESS or an MPI error code.
 */
static inline int MPIDI_CH4R_mpi_win_unlock(int rank, MPIR_Win * win)
{
    int mpi_errno = MPI_SUCCESS;
    unsigned unlocked;
    MPIDI_CH4U_win_cntrl_msg_t msg;

    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_CH4R_MPI_WIN_UNLOCK);
    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_CH4R_MPI_WIN_UNLOCK);

    /* Check window lock epoch.
     * PROC_NULL does not update per-target epoch.
     * Failures leave through fn_fail (not a bare return) so that the
     * function-exit trace is always paired with the enter above. */
    MPIDI_CH4U_ACCESS_EPOCH_CHECK(win, MPIDI_CH4U_EPOTYPE_LOCK, mpi_errno, goto fn_fail);
    if (rank == MPI_PROC_NULL)
        goto fn_exit0;

    MPIDI_CH4U_win_target_t *target_ptr = MPIDI_CH4U_win_target_find(win, rank);
    MPIR_Assert(target_ptr);

    /* Check per-target lock epoch */
    MPIDI_CH4U_EPOCH_CHECK_TARGET_LOCK(target_ptr, mpi_errno, goto fn_fail);

    MPIDI_CH4U_win_target_sync_lock_t *slock = &target_ptr->sync.lock;
    /* NOTE: lock blocking waits till granted */
    MPIR_Assert(slock->locked == 1);

    /* Ensure op completion in netmod and shmmod */
    mpi_errno = MPIDI_NM_rma_target_cmpl_hook(rank, win);
    if (mpi_errno != MPI_SUCCESS)
        MPIR_ERR_POP(mpi_errno);

#ifndef MPIDI_CH4_DIRECT_NETMOD
    mpi_errno = MPIDI_SHM_rma_target_cmpl_hook(rank, win);
    if (mpi_errno != MPI_SUCCESS)
        MPIR_ERR_POP(mpi_errno);
#endif

    /* Ensure completion of AM operations */
    do {
        /* progress is made outside of the VNI critical section */
        MPID_THREAD_CS_EXIT(VNI, MPIDI_CH4_Global.vni_lock);
        MPIDI_CH4R_PROGRESS();
        MPID_THREAD_CS_ENTER(VNI, MPIDI_CH4_Global.vni_lock);
    } while (MPIR_cc_get(target_ptr->remote_cmpl_cnts) != 0);

    /* NOTE(review): nothing in this function clears MPI_MODE_NOCHECK from
     * sync.assert_mode or resets the per-target access_epoch_type. If target
     * objects are reused across epochs (MEM_EFFICIENT off), confirm that both
     * are reset elsewhere. */
    if (target_ptr->sync.assert_mode & MPI_MODE_NOCHECK) {
        /* lock was never requested from the target; release it locally */
        target_ptr->sync.lock.locked = 0;
        goto no_check;
    }

    /* The whole struct goes on the wire (sizeof(msg)); clear it first so that
     * fields not set here (e.g. lock_type) are not sent uninitialized. */
    memset(&msg, 0, sizeof(msg));
    msg.win_id = MPIDI_CH4U_WIN(win, win_id);
    msg.origin_rank = win->comm_ptr->rank;
    /* value slock->locked must reach before the unlock counts as done */
    unlocked = slock->locked - 1;

#ifndef MPIDI_CH4_DIRECT_NETMOD
    if (MPIDI_CH4_rank_is_local(rank, win->comm_ptr))
        mpi_errno =
            MPIDI_SHM_am_send_hdr(rank, win->comm_ptr, MPIDI_CH4U_WIN_UNLOCK, &msg, sizeof(msg));
    else
#endif
    {
        mpi_errno =
            MPIDI_NM_am_send_hdr(rank, win->comm_ptr, MPIDI_CH4U_WIN_UNLOCK, &msg, sizeof(msg));
    }

    if (mpi_errno != MPI_SUCCESS)
        MPIR_ERR_SETANDSTMT(mpi_errno, MPI_ERR_RMA_SYNC, goto fn_fail, "**rmasync");

    MPIDI_CH4R_PROGRESS_WHILE(slock->locked != unlocked);
  no_check:
    /* In performance-efficient mode, all allocated targets are freed at win_finalize. */
    if (MPIR_CVAR_CH4_RMA_MEM_EFFICIENT)
        MPIDI_CH4U_win_target_delete(win, target_ptr);

  fn_exit0:
    MPIR_Assert(MPIDI_CH4U_WIN(win, sync).lock.count > 0);
    MPIDI_CH4U_WIN(win, sync).lock.count--;

    /* Reset window epoch only when all per-target lock epochs are closed. */
    if (MPIDI_CH4U_WIN(win, sync).lock.count == 0) {
        MPIDI_CH4U_WIN(win, sync).access_epoch_type = MPIDI_CH4U_EPOTYPE_NONE;
    }

  fn_exit:
    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDI_CH4R_MPI_WIN_UNLOCK);
    return mpi_errno;
  fn_fail:
    goto fn_exit;

}

#undef FUNCNAME
#define FUNCNAME MPIDI_CH4R_mpi_win_get_info
#undef FCNAME
#define FCNAME MPL_QUOTE(FUNCNAME)
/*
 * Allocate a new info object and fill it with the hint values currently
 * stored in the window's info_args, returning it through `info_p_p`.
 *
 * win:      window whose hints are reported.
 * info_p_p: receives the newly allocated info object. On any failure the
 *           partially filled object is freed and *info_p_p is set to NULL.
 *
 * Returns MPI_SUCCESS or an MPI error code.
 */
static inline int MPIDI_CH4R_mpi_win_get_info(MPIR_Win * win, MPIR_Info ** info_p_p)
{
    int mpi_errno = MPI_SUCCESS;
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_CH4R_MPI_WIN_GET_INFO);
    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_CH4R_MPI_WIN_GET_INFO);

    mpi_errno = MPIR_Info_alloc(info_p_p);
    if (mpi_errno != MPI_SUCCESS) {
        /* Make sure fn_fail does not try to free a bogus pointer. */
        *info_p_p = NULL;
        MPIR_ERR_POP(mpi_errno);
    }

    mpi_errno = MPIR_Info_set_impl(*info_p_p, "no_locks",
                                   MPIDI_CH4U_WIN(win, info_args).no_locks ? "true" : "false");
    if (mpi_errno != MPI_SUCCESS) {
        MPIR_ERR_POP(mpi_errno);
    }

    {
#define BUFSIZE 32
        /* Ordering flags and their textual names, in output order. */
        const char *const order_names[] = { "rar", "raw", "war", "waw" };
        const int order_bits[] = {
            MPIDI_CH4I_ACCU_ORDER_RAR,
            MPIDI_CH4I_ACCU_ORDER_RAW,
            MPIDI_CH4I_ACCU_ORDER_WAR,
            MPIDI_CH4I_ACCU_ORDER_WAW
        };
        char ordering[BUFSIZE];
        int len = 0;
        int k;

        CH4_COMPILE_TIME_ASSERT(BUFSIZE >= 16); /* maximum: strlen("rar,raw,war,waw") + 1 */

        /* Build a comma-separated list of the enabled orderings. */
        for (k = 0; k < 4; k++) {
            if (MPIDI_CH4U_WIN(win, info_args).accumulate_ordering & order_bits[k]) {
                len += snprintf(ordering + len, BUFSIZE - len, "%s%s",
                                (len > 0) ? "," : "", order_names[k]);
            }
        }

        /* No ordering flag set at all. */
        if (len == 0) {
            strncpy(ordering, "none", BUFSIZE);
        }

        mpi_errno = MPIR_Info_set_impl(*info_p_p, "accumulate_ordering", ordering);
        if (mpi_errno != MPI_SUCCESS) {
            MPIR_ERR_POP(mpi_errno);
        }
#undef BUFSIZE
    }

    mpi_errno = MPIR_Info_set_impl(*info_p_p, "accumulate_ops",
                                   (MPIDI_CH4U_WIN(win, info_args).accumulate_ops ==
                                    MPIDI_CH4I_ACCU_SAME_OP) ? "same_op" : "same_op_no_op");
    if (mpi_errno != MPI_SUCCESS) {
        MPIR_ERR_POP(mpi_errno);
    }

    mpi_errno = MPIR_Info_set_impl(*info_p_p, "alloc_shared_noncontig",
                                   MPIDI_CH4U_WIN(win, info_args).alloc_shared_noncontig ?
                                   "true" : "false");
    if (mpi_errno != MPI_SUCCESS) {
        MPIR_ERR_POP(mpi_errno);
    }

    mpi_errno = MPIR_Info_set_impl(*info_p_p, "same_size",
                                   MPIDI_CH4U_WIN(win, info_args).same_size ? "true" : "false");
    if (mpi_errno != MPI_SUCCESS) {
        MPIR_ERR_POP(mpi_errno);
    }

    mpi_errno = MPIR_Info_set_impl(*info_p_p, "same_disp_unit",
                                   MPIDI_CH4U_WIN(win, info_args).same_disp_unit ?
                                   "true" : "false");
    if (mpi_errno != MPI_SUCCESS) {
        MPIR_ERR_POP(mpi_errno);
    }

    mpi_errno = MPIR_Info_set_impl(*info_p_p, "alloc_shm",
                                   MPIDI_CH4U_WIN(win, info_args).alloc_shm ? "true" : "false");
    if (mpi_errno != MPI_SUCCESS) {
        MPIR_ERR_POP(mpi_errno);
    }

    {
        /* Scratch space for the textual form of which_accumulate_ops. */
        char ops_str[128];

        MPIDI_CH4I_get_info_accu_ops_str(MPIDI_CH4U_WIN(win, info_args).which_accumulate_ops,
                                         ops_str, sizeof(ops_str));
        mpi_errno = MPIR_Info_set_impl(*info_p_p, "which_accumulate_ops", ops_str);
        if (mpi_errno != MPI_SUCCESS) {
            MPIR_ERR_POP(mpi_errno);
        }
    }

    mpi_errno = MPIR_Info_set_impl(*info_p_p, "accumulate_noncontig_dtype",
                                   MPIDI_CH4U_WIN(win, info_args).accumulate_noncontig_dtype ?
                                   "true" : "false");
    if (mpi_errno != MPI_SUCCESS) {
        MPIR_ERR_POP(mpi_errno);
    }

    /* A negative limit is reported as "unlimited". */
    if (MPIDI_CH4U_WIN(win, info_args).accumulate_max_bytes < 0) {
        mpi_errno = MPIR_Info_set_impl(*info_p_p, "accumulate_max_bytes", "unlimited");
    } else {
        char max_bytes_str[32]; /* make sure 64-bit integer can fit */

        snprintf(max_bytes_str, sizeof(max_bytes_str), "%ld",
                 (long) MPIDI_CH4U_WIN(win, info_args).accumulate_max_bytes);
        mpi_errno = MPIR_Info_set_impl(*info_p_p, "accumulate_max_bytes", max_bytes_str);
    }
    if (mpi_errno != MPI_SUCCESS) {
        MPIR_ERR_POP(mpi_errno);
    }

    mpi_errno = MPIR_Info_set_impl(*info_p_p, "disable_shm_accumulate",
                                   MPIDI_CH4U_WIN(win, info_args).disable_shm_accumulate ?
                                   "true" : "false");
    if (mpi_errno != MPI_SUCCESS) {
        MPIR_ERR_POP(mpi_errno);
    }

  fn_exit:
    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDI_CH4R_MPI_WIN_GET_INFO);
    return mpi_errno;
  fn_fail:
    /* Do not hand a half-populated info object back to the caller. */
    if (*info_p_p != NULL) {
        MPIR_Info_free(*info_p_p);
        *info_p_p = NULL;
    }
    goto fn_exit;
}

#undef FUNCNAME
#define FUNCNAME MPIDI_CH4R_win_finalize
#undef FCNAME
#define FCNAME MPL_QUOTE(FUNCNAME)
/*
 * Release all resources held by the window *win_ptr.
 *
 * Visible steps: wait until the window-level and per-target completion
 * counters report no outstanding operations, call the netmod (and, unless
 * built as direct netmod, shmmod) free hooks, drop the per-target objects
 * and the window hash, free window memory for ALLOCATE/SHARED flavors,
 * erase the window id from the global window map, release the
 * communicator and free the window handle object.
 *
 * win_ptr: address of the window pointer. The pointed-to window is freed,
 *          but *win_ptr itself is not reset in this function.
 *
 * Returns MPI_SUCCESS or an MPI error code. If one of the free hooks (or
 * the shm segment teardown) fails, the function jumps to fn_fail and the
 * remaining cleanup steps are skipped.
 */
static inline int MPIDI_CH4R_win_finalize(MPIR_Win ** win_ptr)
{
    int mpi_errno = MPI_SUCCESS;
    int all_completed = 0;
    MPIR_Win *win = *win_ptr;
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_CH4R_WIN_FINALIZE);
    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_CH4R_WIN_FINALIZE);

    /* All local outstanding OPs should have been completed. */
    MPIR_Assert(MPIR_cc_get(MPIDI_CH4U_WIN(win, local_cmpl_cnts)) == 0);
    MPIR_Assert(MPIR_cc_get(MPIDI_CH4U_WIN(win, remote_cmpl_cnts)) == 0);

    /* Make progress till all OPs have been completed */
    do {
        int all_local_completed = 0, all_remote_completed = 0;

        MPIDI_CH4R_PROGRESS();

        MPIDI_win_check_all_targets_local_completed(win, &all_local_completed);
        MPIDI_win_check_all_targets_remote_completed(win, &all_remote_completed);

        /* Local completion counter might be updated later than remote completion
         * (at request completion), so we need to check it before release entire
         * window. */
        all_completed = (MPIR_cc_get(MPIDI_CH4U_WIN(win, local_cmpl_cnts)) == 0) &&
            (MPIR_cc_get(MPIDI_CH4U_WIN(win, remote_cmpl_cnts)) == 0) &&
            all_local_completed && all_remote_completed;
    } while (all_completed != 1);

    mpi_errno = MPIDI_NM_mpi_win_free_hook(win);
    if (mpi_errno != MPI_SUCCESS)
        MPIR_ERR_POP(mpi_errno);

#ifndef MPIDI_CH4_DIRECT_NETMOD
    mpi_errno = MPIDI_SHM_mpi_win_free_hook(win);
    if (mpi_errno != MPI_SUCCESS)
        MPIR_ERR_POP(mpi_errno);
#endif

    MPIDI_CH4U_win_target_cleanall(win);
    MPIDI_CH4U_win_hash_clear(win);

    /* Only ALLOCATE and SHARED flavors have their window memory freed here;
     * three cases follow: shm segment, mmap'ed region, plain heap memory. */
    if (win->create_flavor == MPI_WIN_FLAVOR_ALLOCATE ||
        win->create_flavor == MPI_WIN_FLAVOR_SHARED) {
        /* if more than one process on a node, we always use shared memory */
        if (win->comm_ptr->node_comm != NULL) {
            if (MPIDI_CH4U_WIN(win, mmap_sz) > 0) {
                /* destroy shared window memory */
                mpi_errno = MPIDI_CH4U_destroy_shm_segment(MPIDI_CH4U_WIN(win, mmap_sz),
                                                           &MPIDI_CH4U_WIN(win, shm_segment_handle),
                                                           &MPIDI_CH4U_WIN(win, mmap_addr));
                if (mpi_errno)
                    MPIR_ERR_POP(mpi_errno);
            }

            /* The per-node table is released even when no segment was mapped. */
            MPL_free(MPIDI_CH4U_WIN(win, shared_table));
        } else if (MPIDI_CH4U_WIN(win, mmap_sz) > 0) {
            /* if single process on the node, we use mmap with symm heap */
            MPL_munmap(MPIDI_CH4U_WIN(win, mmap_addr), MPIDI_CH4U_WIN(win, mmap_sz), MPL_MEM_RMA);
        } else
            /* mmap_sz == 0 without a node communicator: the base is freed
             * as ordinary heap memory. */
            MPL_free(win->base);
    }

    MPIDI_CH4U_map_erase(MPIDI_CH4_Global.win_map, MPIDI_CH4U_WIN(win, win_id));

    MPIR_Comm_release(win->comm_ptr);
    MPIR_Handle_obj_free(&MPIR_Win_mem, win);

  fn_exit:
    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDI_CH4R_WIN_FINALIZE);
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}

#undef FUNCNAME
#define FUNCNAME MPIDI_CH4R_mpi_win_free
#undef FCNAME
#define FCNAME MPL_QUOTE(FUNCNAME)
/*
 * Free the window *win_ptr.
 *
 * Verifies that no access or exposure epoch is open, synchronizes all
 * processes of the window's communicator with a barrier, and then tears
 * the window down via MPIDI_CH4R_win_finalize.
 *
 * win_ptr: address of the window pointer to free.
 *
 * Returns MPI_SUCCESS or an MPI error code. A failure reported by
 * MPIDI_CH4R_win_finalize is propagated to the caller instead of being
 * discarded.
 */
static inline int MPIDI_CH4R_mpi_win_free(MPIR_Win ** win_ptr)
{
    int mpi_errno = MPI_SUCCESS;
    MPIR_Errflag_t errflag = MPIR_ERR_NONE;
    MPIR_Win *win = *win_ptr;
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_CH4R_MPI_WIN_FREE);
    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_CH4R_MPI_WIN_FREE);

    /* Leave through fn_fail (rather than a bare return) so that the
     * function-exit trace below is always emitted. */
    MPIDI_CH4U_ACCESS_EPOCH_CHECK_NONE(win, mpi_errno, goto fn_fail);
    MPIDI_CH4U_EXPOSURE_EPOCH_CHECK_NONE(win, mpi_errno, goto fn_fail);

    mpi_errno = MPIR_Barrier(win->comm_ptr, &errflag);
    if (mpi_errno != MPI_SUCCESS)
        goto fn_fail;

    /* Do not swallow teardown errors (hook or shm segment failures). */
    mpi_errno = MPIDI_CH4R_win_finalize(win_ptr);
    if (mpi_errno != MPI_SUCCESS)
        MPIR_ERR_POP(mpi_errno);

  fn_exit:
    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDI_CH4R_MPI_WIN_FREE);
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}

#undef FUNCNAME
#define FUNCNAME MPIDI_CH4R_mpi_win_fence
#undef FCNAME
#define FCNAME MPL_QUOTE(FUNCNAME)
/*
 * Fence synchronization on window `win`.
 *
 * Validates the fence epoch, runs the netmod/shmmod window completion
 * hooks, drives progress until the window's local completion counter is
 * zero, records the fence event with the given assertion flags, and ends
 * with a barrier over the window's communicator.
 *
 * massert: assertion flags passed on to MPIDI_CH4U_EPOCH_FENCE_EVENT.
 * win:     window being fenced.
 *
 * Returns MPI_SUCCESS or an MPI error code (including the barrier's).
 */
static inline int MPIDI_CH4R_mpi_win_fence(int massert, MPIR_Win * win)
{
    MPIR_Errflag_t errflag = MPIR_ERR_NONE;
    int mpi_errno = MPI_SUCCESS;
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_CH4R_MPI_WIN_FENCE);
    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_CH4R_MPI_WIN_FENCE);

    MPIDI_CH4U_FENCE_EPOCH_CHECK(win, mpi_errno, goto fn_fail);

    /* Operation completion: netmod first, then shmmod when it is built in. */
    mpi_errno = MPIDI_NM_rma_win_cmpl_hook(win);
    if (mpi_errno != MPI_SUCCESS) {
        MPIR_ERR_POP(mpi_errno);
    }
#ifndef MPIDI_CH4_DIRECT_NETMOD
    mpi_errno = MPIDI_SHM_rma_win_cmpl_hook(win);
    if (mpi_errno != MPI_SUCCESS) {
        MPIR_ERR_POP(mpi_errno);
    }
#endif

    /* Active-message operations: poke progress at least once, then keep
     * going until nothing is locally outstanding. The VNI lock is released
     * around every progress call. */
    for (;;) {
        MPID_THREAD_CS_EXIT(VNI, MPIDI_CH4_Global.vni_lock);
        MPIDI_CH4R_PROGRESS();
        MPID_THREAD_CS_ENTER(VNI, MPIDI_CH4_Global.vni_lock);

        if (MPIR_cc_get(MPIDI_CH4U_WIN(win, local_cmpl_cnts)) == 0) {
            break;
        }
    }
    MPIDI_CH4U_EPOCH_FENCE_EVENT(win, massert);

    /*
     * The barrier is unconditional, MPI_MODE_NOPRECEDE included. RMA
     * operations are no longer deferred to the synchronization call as
     * they were in CH3, so without it the following pattern would break
     * (cf. f77/rma/wingetf):
     *
     * Rank 0                          Rank 1
     * ----                            ----
     * Store to local mem in window
     * MPI_Win_fence(MODE_NOPRECEDE)   MPI_Win_fence(MODE_NOPRECEDE)
     * MPI_Get(from rank 1)
     */
    /* MPIR_Barrier's own state is guarded by ALLFUNC_MUTEX; under VNI
     * granularity the individual send/recv/wait calls grab the VNI lock
     * themselves, so it must not be held across the barrier. */
    MPID_THREAD_CS_EXIT(VNI, MPIDI_CH4_Global.vni_lock);
    mpi_errno = MPIR_Barrier(win->comm_ptr, &errflag);
    MPID_THREAD_CS_ENTER(VNI, MPIDI_CH4_Global.vni_lock);

  fn_exit:
    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDI_CH4R_MPI_WIN_FENCE);
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}

#undef FUNCNAME
#define FUNCNAME MPIDI_CH4R_mpi_win_create
#undef FCNAME
#define FCNAME MPL_QUOTE(FUNCNAME)
/*
 * Create a window of flavor MPI_WIN_FLAVOR_CREATE over caller-owned memory.
 *
 * base:      start of the caller's buffer; stored as the window base.
 * length:    size passed on to MPIDI_CH4R_win_init.
 * disp_unit: displacement unit passed on to MPIDI_CH4R_win_init.
 * info:      info hints passed on to MPIDI_CH4R_win_init.
 * comm_ptr:  communicator passed on to MPIDI_CH4R_win_init.
 * win_ptr:   receives the new window.
 *
 * After initialization the netmod (and shmmod, unless built as direct
 * netmod) create hooks run, followed by a barrier over the window's
 * communicator. Returns MPI_SUCCESS or an MPI error code.
 */
static inline int MPIDI_CH4R_mpi_win_create(void *base,
                                            MPI_Aint length,
                                            int disp_unit,
                                            MPIR_Info * info, MPIR_Comm * comm_ptr,
                                            MPIR_Win ** win_ptr)
{
    MPIR_Errflag_t errflag = MPIR_ERR_NONE;
    int mpi_errno = MPI_SUCCESS;
    MPIR_Win *new_win;

    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_CH4R_MPI_WIN_CREATE);
    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_CH4R_MPI_WIN_CREATE);

    mpi_errno = MPIDI_CH4R_win_init(length, disp_unit, win_ptr, info, comm_ptr,
                                    MPI_WIN_FLAVOR_CREATE, MPI_WIN_UNIFIED);
    if (mpi_errno != MPI_SUCCESS) {
        goto fn_fail;
    }

    /* The window merely references the user's buffer. */
    new_win = *win_ptr;
    new_win->base = base;

    mpi_errno = MPIDI_NM_mpi_win_create_hook(new_win);
    if (mpi_errno != MPI_SUCCESS) {
        MPIR_ERR_POP(mpi_errno);
    }
#ifndef MPIDI_CH4_DIRECT_NETMOD
    mpi_errno = MPIDI_SHM_mpi_win_create_hook(new_win);
    if (mpi_errno != MPI_SUCCESS) {
        MPIR_ERR_POP(mpi_errno);
    }
#endif

    mpi_errno = MPIR_Barrier(new_win->comm_ptr, &errflag);
    if (mpi_errno != MPI_SUCCESS) {
        goto fn_fail;
    }

  fn_exit:
    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDI_CH4R_MPI_WIN_CREATE);
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}

#undef FUNCNAME
#define FUNCNAME MPIDI_CH4R_mpi_win_attach
#undef FCNAME
#define FCNAME MPL_QUOTE(FUNCNAME)
/*
 * Attach the memory region [base, base + size) to a dynamic window.
 *
 * Fails with MPI_ERR_RMA_FLAVOR when `win` is not of flavor
 * MPI_WIN_FLAVOR_DYNAMIC; otherwise forwards the request to the netmod
 * (and shmmod, unless built as direct netmod) attach hooks.
 *
 * Returns MPI_SUCCESS or an MPI error code.
 */
static inline int MPIDI_CH4R_mpi_win_attach(MPIR_Win * win, void *base, MPI_Aint size)
{
    int mpi_errno = MPI_SUCCESS;
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_CH4R_MPI_WIN_ATTACH);
    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_CH4R_MPI_WIN_ATTACH);

    /* Attaching is only meaningful for dynamic windows. */
    if (win->create_flavor != MPI_WIN_FLAVOR_DYNAMIC) {
        MPIR_ERR_SETANDSTMT(mpi_errno, MPI_ERR_RMA_FLAVOR, goto fn_fail, "**rmaflavor");
    }

    mpi_errno = MPIDI_NM_mpi_win_attach_hook(win, base, size);
    if (mpi_errno != MPI_SUCCESS) {
        MPIR_ERR_POP(mpi_errno);
    }
#ifndef MPIDI_CH4_DIRECT_NETMOD
    mpi_errno = MPIDI_SHM_mpi_win_attach_hook(win, base, size);
    if (mpi_errno != MPI_SUCCESS) {
        MPIR_ERR_POP(mpi_errno);
    }
#endif

  fn_exit:
    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDI_CH4R_MPI_WIN_ATTACH);
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}

/* Allocate the memory of an RMA window from a shared memory region. Used by
 * both win_allocate and win_allocate_shared.
 *
 * This routine allocates the window memory region on each node from shared
 * memory and initializes the shared_table structure, which stores the size,
 * disp_unit, and start address of every process on the node for use by shm
 * RMA operations and by the shared query routine. */
#undef FUNCNAME
#define FUNCNAME MPIDI_CH4I_win_shm_alloc_impl
#undef FCNAME
#define FCNAME MPL_QUOTE(FUNCNAME)
/*
 * Allocate the memory backing the window *win_ptr.
 *
 * size:      number of bytes requested by the calling process.
 * disp_unit: displacement unit recorded in the per-node shared table.
 * comm_ptr:  communicator of the window; comm_ptr->node_comm (may be NULL)
 *            selects the processes sharing a node with the caller.
 * base_ptr:  receives the caller's base address, or NULL when nothing was
 *            allocated for it.
 * win_ptr:   window whose mmap_sz / mmap_addr / shm_segment_handle /
 *            shared_table fields are filled in.
 *
 * Allocation strategy, in order:
 *   1. a global symmetric heap (MPIDI_CH4R_get_shm_symheap) when every
 *      process agrees it is feasible;
 *   2. a node-level shm segment when a node communicator exists and the
 *      mapped size is non-zero;
 *   3. plain heap memory for `size` bytes otherwise.
 *
 * Returns MPI_SUCCESS or an MPI error code. Heap memory obtained in step 3
 * is reaped on failure.
 */
static inline int MPIDI_CH4I_win_shm_alloc_impl(MPI_Aint size,
                                                int disp_unit,
                                                MPIR_Comm * comm_ptr,
                                                void **base_ptr, MPIR_Win ** win_ptr)
{
    int i, mpi_errno = MPI_SUCCESS;
    MPIR_Errflag_t errflag = MPIR_ERR_NONE;
    MPIR_Win *win = NULL;
    size_t total_shm_size = 0;
    MPIDI_CH4U_win_shared_info_t *shared_table = NULL;
    MPI_Aint *shm_offsets = NULL;
    MPIR_Comm *shm_comm_ptr = comm_ptr->node_comm;
    size_t page_sz = 0, mapsize;
    int mapfail_flag = 0;
    unsigned symheap_flag = 1, global_symheap_flag = 0;

    MPIR_CHKPMEM_DECL(1);
    MPIR_CHKLMEM_DECL(1);
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_CH4I_WIN_SHM_ALLOC_IMPL);
    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_CH4I_WIN_SHM_ALLOC_IMPL);

    win = *win_ptr;
    *base_ptr = NULL;

    /* Check whether multiple processes exist on the local node. If so,
     * we need to count the total size on a node for shared memory allocation. */
    if (shm_comm_ptr != NULL) {
        MPIR_T_PVAR_TIMER_START(RMA, rma_wincreate_allgather);
        MPIDI_CH4U_WIN(win, shared_table) =
            (MPIDI_CH4U_win_shared_info_t *) MPL_malloc(sizeof(MPIDI_CH4U_win_shared_info_t) *
                                                        shm_comm_ptr->local_size, MPL_MEM_RMA);
        shared_table = MPIDI_CH4U_WIN(win, shared_table);
        if (shared_table == NULL) {
            /* Out of memory: stop the timer started above and report the
             * failure instead of writing through a NULL table. */
            MPIR_T_PVAR_TIMER_END(RMA, rma_wincreate_allgather);
            MPIR_ERR_SETANDSTMT(mpi_errno, MPI_ERR_OTHER, goto fn_fail, "**nomem");
        }

        /* Publish this process's entry; the in-place allgather below fills
         * in the entries of the other processes on the node. */
        shared_table[shm_comm_ptr->rank].size = size;
        shared_table[shm_comm_ptr->rank].disp_unit = disp_unit;
        shared_table[shm_comm_ptr->rank].shm_base_addr = NULL;

        mpi_errno = MPIR_Allgather(MPI_IN_PLACE,
                                   0,
                                   MPI_DATATYPE_NULL,
                                   shared_table,
                                   sizeof(MPIDI_CH4U_win_shared_info_t), MPI_BYTE, shm_comm_ptr,
                                   &errflag);
        MPIR_T_PVAR_TIMER_END(RMA, rma_wincreate_allgather);
        if (mpi_errno != MPI_SUCCESS)
            goto fn_fail;

        MPIR_CHKLMEM_MALLOC(shm_offsets, MPI_Aint *, shm_comm_ptr->local_size * sizeof(MPI_Aint),
                            mpi_errno, "shm offset", MPL_MEM_RMA);

        /* No allreduce here because this is a shared memory domain
         * and should be a relatively small number of processes
         * and a non performance sensitive API.
         */
        for (i = 0; i < shm_comm_ptr->local_size; i++) {
            shm_offsets[i] = (MPI_Aint) total_shm_size;
            if (MPIDI_CH4U_WIN(win, info_args).alloc_shared_noncontig)
                total_shm_size += MPIDI_CH4R_get_mapsize(shared_table[i].size, &page_sz);
            else
                total_shm_size += shared_table[i].size;
        }

        /* if all processes give zero size on a single node window, simply return. */
        if (total_shm_size == 0 && shm_comm_ptr->local_size == comm_ptr->local_size)
            goto fn_exit;

        /* if my size is not page aligned and noncontig is disabled, skip global symheap. */
        if (size != MPIDI_CH4R_get_mapsize(size, &page_sz) &&
            !MPIDI_CH4U_WIN(win, info_args).alloc_shared_noncontig)
            symheap_flag = 0;
    } else
        total_shm_size = size;

    /* try global symm heap only when multiple processes exist */
    if (comm_ptr->local_size > 1) {
        /* global symm heap can be successful only when any of the following conditions meet.
         * Thus, we can skip unnecessary global symm heap retry based on condition check.
         * - no shared memory node (i.e., single process per node)
         * - size of each process on the shared memory node is page aligned,
         *   thus all process can be assigned to a page aligned start address.
         * - user sets alloc_shared_noncontig=true, thus we can internally make
         *   the size aligned on each process. */
        mpi_errno = MPIR_Allreduce(&symheap_flag, &global_symheap_flag, 1, MPI_UNSIGNED,
                                   MPI_BAND, comm_ptr, &errflag);
        MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");
    } else
        global_symheap_flag = 0;

    /* because MPI_shm follows a create & attach mode, we need to set the
     * size of entire shared memory segment on each node as the size of
     * each process. */
    mapsize = MPIDI_CH4R_get_mapsize(total_shm_size, &page_sz);
    MPIDI_CH4U_WIN(win, mmap_sz) = mapsize;

    /* first try global symmetric heap segment allocation */
    if (global_symheap_flag) {
        mpi_errno = MPIDI_CH4R_get_shm_symheap(mapsize, shm_offsets, comm_ptr, win, &mapfail_flag);
        if (mpi_errno != MPI_SUCCESS)
            goto fn_fail;
    }

    /* if fails, try normal shm segment allocation or malloc */
    if (!global_symheap_flag || mapfail_flag) {
        if (shm_comm_ptr != NULL && mapsize) {
            mpi_errno = MPIDI_CH4U_allocate_shm_segment(shm_comm_ptr, mapsize,
                                                        &MPIDI_CH4U_WIN(win, shm_segment_handle),
                                                        &MPIDI_CH4U_WIN(win, mmap_addr));
            if (mpi_errno != MPI_SUCCESS)
                goto fn_fail;
        } else if (size > 0) {
            MPIR_CHKPMEM_MALLOC(*base_ptr, void *, size, mpi_errno, "(*win_ptr)->base",
                                MPL_MEM_RMA);
            MPL_VG_MEM_INIT(*base_ptr, size);
            MPIDI_CH4U_WIN(win, mmap_sz) = 0;   /* reset mmap_sz if use malloc */
        }
    }

    /* compute the base addresses of each process within the shared memory segment */
    if (shm_comm_ptr != NULL) {
        char *cur_base = (char *) MPIDI_CH4U_WIN(win, mmap_addr);
        for (i = 0; i < shm_comm_ptr->local_size; i++) {
            /* Zero-size contributors get a NULL base address. */
            if (shared_table[i].size)
                shared_table[i].shm_base_addr = cur_base;
            else
                shared_table[i].shm_base_addr = NULL;

            /* Advance by the same stride that was used for shm_offsets. */
            if (MPIDI_CH4U_WIN(win, info_args).alloc_shared_noncontig)
                cur_base += MPIDI_CH4R_get_mapsize(shared_table[i].size, &page_sz);
            else
                cur_base += shared_table[i].size;
        }

        *base_ptr = shared_table[shm_comm_ptr->rank].shm_base_addr;
    } else if (MPIDI_CH4U_WIN(win, mmap_sz) > 0) {
        /* if symm heap is allocated without shared memory, use the mapping address */
        *base_ptr = MPIDI_CH4U_WIN(win, mmap_addr);
    }
    /* otherwise, it has already been assigned a local memory region or NULL (zero size). */

  fn_exit:
    MPIR_CHKLMEM_FREEALL();
    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDI_CH4I_WIN_SHM_ALLOC_IMPL);
    return mpi_errno;
  fn_fail:
    MPIR_CHKPMEM_REAP();
    goto fn_exit;
}

#undef FUNCNAME
#define FUNCNAME MPIDI_CH4R_mpi_win_allocate_shared
#undef FCNAME
#define FCNAME MPL_QUOTE(FUNCNAME)
/*
 * Create a window of flavor MPI_WIN_FLAVOR_SHARED and allocate its memory.
 *
 * size:      number of bytes requested by the calling process.
 * disp_unit: displacement unit of the window.
 * info_ptr:  info hints passed on to MPIDI_CH4R_win_init.
 * comm_ptr:  communicator of the window.
 * base_ptr:  receives the caller's base address.
 * win_ptr:   receives the new window.
 *
 * After initialization and allocation the netmod (and shmmod, unless built
 * as direct netmod) hooks run, followed by a barrier over `comm_ptr`.
 * If a step before the barrier fails, the window is finalized again.
 *
 * Returns MPI_SUCCESS or an MPI error code.
 */
static inline int MPIDI_CH4R_mpi_win_allocate_shared(MPI_Aint size,
                                                     int disp_unit,
                                                     MPIR_Info * info_ptr,
                                                     MPIR_Comm * comm_ptr,
                                                     void **base_ptr, MPIR_Win ** win_ptr)
{
    int mpi_errno = MPI_SUCCESS;
    MPIR_Errflag_t errflag = MPIR_ERR_NONE;
    MPIR_Win *win = NULL;
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_CH4R_MPI_WIN_ALLOCATE_SHARED);
    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_CH4R_MPI_WIN_ALLOCATE_SHARED);

    mpi_errno = MPIDI_CH4R_win_init(size, disp_unit, win_ptr, info_ptr, comm_ptr,
                                    MPI_WIN_FLAVOR_SHARED, MPI_WIN_UNIFIED);
    if (mpi_errno != MPI_SUCCESS)
        MPIR_ERR_POP(mpi_errno);

    mpi_errno = MPIDI_CH4I_win_shm_alloc_impl(size, disp_unit, comm_ptr, base_ptr, win_ptr);
    if (mpi_errno != MPI_SUCCESS)
        MPIR_ERR_POP(mpi_errno);

    win = *win_ptr;
    win->base = *base_ptr;
    win->size = size;

    mpi_errno = MPIDI_NM_mpi_win_allocate_shared_hook(win);
    if (mpi_errno != MPI_SUCCESS)
        MPIR_ERR_POP(mpi_errno);

#ifndef MPIDI_CH4_DIRECT_NETMOD
    mpi_errno = MPIDI_SHM_mpi_win_allocate_shared_hook(win);
    if (mpi_errno != MPI_SUCCESS)
        MPIR_ERR_POP(mpi_errno);
#endif

    /* The barrier's result is returned as is; a barrier failure does not
     * trigger the cleanup below. */
    mpi_errno = MPIR_Barrier(comm_ptr, &errflag);

  fn_exit:
    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDI_CH4R_MPI_WIN_ALLOCATE_SHARED);
    return mpi_errno;
  fn_fail:
    /* MPIDI_CH4R_win_finalize dereferences *win_ptr unconditionally, so only
     * call it when a window object is actually there. If win_init failed
     * before storing one (not visible here), *win_ptr may still be NULL. */
    if (win_ptr && *win_ptr)
        MPIDI_CH4R_win_finalize(win_ptr);
    goto fn_exit;
}

#undef FUNCNAME
#define FUNCNAME MPIDI_CH4R_mpi_win_detach
#undef FCNAME
#define FCNAME MPL_QUOTE(FUNCNAME)
/*
 * Detach the memory region starting at `base` from a dynamic window.
 *
 * Fails with MPI_ERR_RMA_FLAVOR when `win` is not of flavor
 * MPI_WIN_FLAVOR_DYNAMIC; otherwise forwards the request to the netmod
 * (and shmmod, unless built as direct netmod) detach hooks.
 *
 * Returns MPI_SUCCESS or an MPI error code.
 */
static inline int MPIDI_CH4R_mpi_win_detach(MPIR_Win * win, const void *base)
{
    int mpi_errno = MPI_SUCCESS;
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_CH4R_MPI_WIN_DETACH);
    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_CH4R_MPI_WIN_DETACH);

    /* Detaching is only meaningful for dynamic windows. */
    if (win->create_flavor != MPI_WIN_FLAVOR_DYNAMIC) {
        MPIR_ERR_SETANDSTMT(mpi_errno, MPI_ERR_RMA_FLAVOR, goto fn_fail, "**rmaflavor");
    }

    mpi_errno = MPIDI_NM_mpi_win_detach_hook(win, base);
    if (mpi_errno != MPI_SUCCESS) {
        MPIR_ERR_POP(mpi_errno);
    }
#ifndef MPIDI_CH4_DIRECT_NETMOD
    mpi_errno = MPIDI_SHM_mpi_win_detach_hook(win, base);
    if (mpi_errno != MPI_SUCCESS) {
        MPIR_ERR_POP(mpi_errno);
    }
#endif

  fn_exit:
    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDI_CH4R_MPI_WIN_DETACH);
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}

#undef FUNCNAME
#define FUNCNAME MPIDI_CH4R_mpi_win_shared_query
#undef FCNAME
#define FCNAME MPL_QUOTE(FUNCNAME)
/*
 * Report size, displacement unit and base address of the window segment
 * that belongs to `rank`.
 *
 * win:       window to query.
 * rank:      rank whose segment is requested, or MPI_PROC_NULL for the
 *            first table entry with size > 0.
 * size:      receives the segment size.
 * disp_unit: receives the segment's displacement unit.
 * baseptr:   address of a `void *` that receives the segment base.
 *
 * `rank` is used directly as an index into the shared table.
 * NOTE(review): this presumably relies on the window communicator and the
 * node communicator using the same rank numbering -- confirm against the
 * callers.
 *
 * Always returns MPI_SUCCESS.
 */
static inline int MPIDI_CH4R_mpi_win_shared_query(MPIR_Win * win,
                                                  int rank,
                                                  MPI_Aint * size, int *disp_unit, void *baseptr)
{
    int mpi_errno = MPI_SUCCESS;
    int offset = rank;
    MPIDI_CH4U_win_shared_info_t *shared_table = MPIDI_CH4U_WIN(win, shared_table);

    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_CH4R_MPI_WIN_SHARED_QUERY);
    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_CH4R_MPI_WIN_SHARED_QUERY);

    /* When only single process exists on the node, should only query
     * MPI_PROC_NULL or local process. Thus, return local window's info. */
    if (win->comm_ptr->node_comm == NULL) {
        *size = win->size;
        *disp_unit = win->disp_unit;
        *((void **) baseptr) = win->base;
        goto fn_exit;
    }

    /* When rank is MPI_PROC_NULL, return the memory region belonging to the
     * lowest rank that specified size > 0. */
    if (rank == MPI_PROC_NULL) {
        /* The shared table holds one entry per process of the node
         * communicator, so that is the bound to scan against. */
        const int table_len = win->comm_ptr->node_comm->local_size;

        /* Default, if no process has size > 0. */
        *size = 0;
        *disp_unit = 0;
        *((void **) baseptr) = NULL;

        for (offset = 0; offset < table_len; offset++) {
            if (shared_table[offset].size > 0) {
                *size = shared_table[offset].size;
                *disp_unit = shared_table[offset].disp_unit;
                *((void **) baseptr) = shared_table[offset].shm_base_addr;
                break;
            }
        }
    } else {
        *size = shared_table[offset].size;
        *disp_unit = shared_table[offset].disp_unit;
        *(void **) baseptr = shared_table[offset].shm_base_addr;
    }

  fn_exit:
    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDI_CH4R_MPI_WIN_SHARED_QUERY);
    return mpi_errno;
}

#undef FUNCNAME
#define FUNCNAME MPIDI_CH4R_mpi_win_allocate
#undef FCNAME
#define FCNAME MPL_QUOTE(FUNCNAME)
/*
 * Create a window of flavor MPI_WIN_FLAVOR_ALLOCATE and allocate its memory.
 *
 * size:      number of bytes requested by the calling process.
 * disp_unit: displacement unit of the window.
 * info:      info hints passed on to MPIDI_CH4R_win_init.
 * comm:      communicator of the window.
 * baseptr:   address of a `void *` that receives the caller's base address.
 * win_ptr:   receives the new window.
 *
 * After initialization and allocation the netmod (and shmmod, unless built
 * as direct netmod) hooks run, followed by a barrier over `comm`. On any
 * failure, including the barrier, the window is finalized again.
 *
 * Returns MPI_SUCCESS or an MPI error code.
 */
static inline int MPIDI_CH4R_mpi_win_allocate(MPI_Aint size,
                                              int disp_unit,
                                              MPIR_Info * info,
                                              MPIR_Comm * comm, void *baseptr, MPIR_Win ** win_ptr)
{
    int mpi_errno = MPI_SUCCESS;
    MPIR_Errflag_t errflag = MPIR_ERR_NONE;
    MPIR_Win *win;
    void **base_ptr = (void **) baseptr;

    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_CH4R_MPI_WIN_ALLOCATE);
    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_CH4R_MPI_WIN_ALLOCATE);

    mpi_errno = MPIDI_CH4R_win_init(size, disp_unit, win_ptr, info, comm,
                                    MPI_WIN_FLAVOR_ALLOCATE, MPI_WIN_UNIFIED);

    if (mpi_errno != MPI_SUCCESS)
        goto fn_fail;

    mpi_errno = MPIDI_CH4I_win_shm_alloc_impl(size, disp_unit, comm, base_ptr, win_ptr);
    if (mpi_errno != MPI_SUCCESS)
        goto fn_fail;

    win = *win_ptr;
    win->base = *base_ptr;
    win->size = size;

    mpi_errno = MPIDI_NM_mpi_win_allocate_hook(win);
    if (mpi_errno != MPI_SUCCESS)
        MPIR_ERR_POP(mpi_errno);

#ifndef MPIDI_CH4_DIRECT_NETMOD
    mpi_errno = MPIDI_SHM_mpi_win_allocate_hook(win);
    if (mpi_errno != MPI_SUCCESS)
        MPIR_ERR_POP(mpi_errno);
#endif

    mpi_errno = MPIR_Barrier(comm, &errflag);

    if (mpi_errno != MPI_SUCCESS)
        goto fn_fail;

  fn_exit:
    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDI_CH4R_MPI_WIN_ALLOCATE);
    return mpi_errno;
  fn_fail:
    /* MPIDI_CH4R_win_finalize dereferences *win_ptr unconditionally, so only
     * call it when a window object is actually there. If win_init failed
     * before storing one (not visible here), *win_ptr may still be NULL. */
    if (win_ptr && *win_ptr)
        MPIDI_CH4R_win_finalize(win_ptr);
    goto fn_exit;
}

#undef FUNCNAME
#define FUNCNAME MPIDI_CH4R_mpi_win_flush
#undef FCNAME
#define FCNAME MPL_QUOTE(FUNCNAME)
/*
 * Complete all outstanding RMA operations issued to `rank` on window `win`.
 *
 * Requires a passive-target epoch on the window. For MPI_PROC_NULL the
 * call returns right after that check. Otherwise the netmod/shmmod target
 * completion hooks run and progress is driven until the target's remote
 * completion counter is zero; if no target object exists, progress is
 * still poked once.
 *
 * Returns MPI_SUCCESS or an MPI error code.
 */
static inline int MPIDI_CH4R_mpi_win_flush(int rank, MPIR_Win * win)
{
    int mpi_errno = MPI_SUCCESS;
    MPIDI_CH4U_win_target_t *target_ptr = NULL;
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_CH4R_MPI_WIN_FLUSH);
    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_CH4R_MPI_WIN_FLUSH);

    /* Window-level epoch check first; MPI_PROC_NULL never touches any
     * per-target epoch state. */
    MPIDI_CH4U_EPOCH_CHECK_PASSIVE(win, mpi_errno, return mpi_errno);
    if (rank == MPI_PROC_NULL) {
        goto fn_exit;
    }

    /* Operation completion: netmod first, then shmmod when it is built in. */
    mpi_errno = MPIDI_NM_rma_target_cmpl_hook(rank, win);
    if (mpi_errno != MPI_SUCCESS) {
        MPIR_ERR_POP(mpi_errno);
    }
#ifndef MPIDI_CH4_DIRECT_NETMOD
    mpi_errno = MPIDI_SHM_rma_target_cmpl_hook(rank, win);
    if (mpi_errno != MPI_SUCCESS) {
        MPIR_ERR_POP(mpi_errno);
    }
#endif

    /* Active-message operations issued to the target. A target object may
     * be missing (e.g., everything went through shm under lockall); in
     * that case progress is still triggered once to serve remote AMs. */
    target_ptr = MPIDI_CH4U_win_target_find(win, rank);
    if (target_ptr != NULL &&
        MPIDI_CH4U_WIN(win, sync).access_epoch_type == MPIDI_CH4U_EPOTYPE_LOCK) {
        MPIDI_CH4U_EPOCH_CHECK_TARGET_LOCK(target_ptr, mpi_errno, goto fn_fail);
    }

    for (;;) {
        MPID_THREAD_CS_EXIT(VNI, MPIDI_CH4_Global.vni_lock);
        MPIDI_CH4R_PROGRESS();
        MPID_THREAD_CS_ENTER(VNI, MPIDI_CH4_Global.vni_lock);

        if (target_ptr == NULL || MPIR_cc_get(target_ptr->remote_cmpl_cnts) == 0) {
            break;
        }
    }

  fn_exit:
    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDI_CH4R_MPI_WIN_FLUSH);
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}

#undef FUNCNAME
#define FUNCNAME MPIDI_CH4R_mpi_win_flush_local_all
#undef FCNAME
#define FCNAME MPL_QUOTE(FUNCNAME)
/*
 * Locally complete all outstanding RMA operations on window `win`.
 *
 * Requires a passive-target epoch. Runs the netmod/shmmod local completion
 * hooks and then drives progress until
 * MPIDI_win_check_all_targets_local_completed reports completion.
 *
 * Returns MPI_SUCCESS or an MPI error code.
 */
static inline int MPIDI_CH4R_mpi_win_flush_local_all(MPIR_Win * win)
{
    int mpi_errno = MPI_SUCCESS;
    int targets_done = 0;
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_CH4R_MPI_WIN_FLUSH_LOCAL_ALL);
    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_CH4R_MPI_WIN_FLUSH_LOCAL_ALL);

    MPIDI_CH4U_EPOCH_CHECK_PASSIVE(win, mpi_errno, goto fn_fail);

    /* Local completion: netmod first, then shmmod when it is built in. */
    mpi_errno = MPIDI_NM_rma_win_local_cmpl_hook(win);
    if (mpi_errno != MPI_SUCCESS) {
        MPIR_ERR_POP(mpi_errno);
    }
#ifndef MPIDI_CH4_DIRECT_NETMOD
    mpi_errno = MPIDI_SHM_rma_win_local_cmpl_hook(win);
    if (mpi_errno != MPI_SUCCESS) {
        MPIR_ERR_POP(mpi_errno);
    }
#endif

    /* Active-message operations. */

    /* FIXME: per-target counters are simply set for lockall as well, in
     * case the user flushes per target; this should be optimized. */
    /* targets_done starts at 0, so progress is made at least once. */
    while (targets_done != 1) {
        MPID_THREAD_CS_EXIT(VNI, MPIDI_CH4_Global.vni_lock);
        MPIDI_CH4R_PROGRESS();
        MPID_THREAD_CS_ENTER(VNI, MPIDI_CH4_Global.vni_lock);
        MPIDI_win_check_all_targets_local_completed(win, &targets_done);
    }

  fn_exit:
    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDI_CH4R_MPI_WIN_FLUSH_LOCAL_ALL);
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}

#undef FUNCNAME
#define FUNCNAME MPIDI_CH4R_mpi_win_unlock_all
#undef FCNAME
#define FCNAME MPL_QUOTE(FUNCNAME)
/* Close the lock_all access epoch on `win`.
 *
 * Steps:
 *   1. Verify that the current access epoch is MPIDI_CH4U_EPOTYPE_LOCK_ALL.
 *   2. Run the netmod (and, unless netmod-direct, shmmod) completion hooks.
 *   3. Drive progress until
 *      MPIDI_win_check_all_targets_remote_completed() reports 1.
 *   4. If the epoch was opened with MPI_MODE_NOCHECK, just clear the
 *      allLocked counter. Otherwise send an MPIDI_CH4U_WIN_UNLOCKALL header
 *      to every rank of the window communicator and keep making progress
 *      while allLocked is non-zero.
 *   5. Release per-target objects when MPIR_CVAR_CH4_RMA_MEM_EFFICIENT is
 *      set, then reset the epoch type and assert mode.
 *
 * Returns MPI_SUCCESS, or an MPI error code. Every failure, including a
 * failed epoch check, leaves through fn_fail/fn_exit so that the
 * function-exit trace macro is always executed. */
static inline int MPIDI_CH4R_mpi_win_unlock_all(MPIR_Win * win)
{
    int mpi_errno = MPI_SUCCESS;
    int all_remote_completed = 0;
    int i;
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_CH4R_MPI_WIN_UNLOCK_ALL);
    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_CH4R_MPI_WIN_UNLOCK_ALL);

    /* Do not return directly on failure: go through fn_fail so that
     * MPIR_FUNC_VERBOSE_EXIT pairs with the ENTER above. */
    MPIDI_CH4U_ACCESS_EPOCH_CHECK(win, MPIDI_CH4U_EPOTYPE_LOCK_ALL, mpi_errno, goto fn_fail);
    /* NOTE: lockall blocking waits till all locks granted */
    MPIR_Assert(MPIDI_CH4U_WIN(win, sync).lockall.allLocked == win->comm_ptr->local_size);

    /* Ensure op completion in netmod and shmmod */
    mpi_errno = MPIDI_NM_rma_win_cmpl_hook(win);
    if (mpi_errno != MPI_SUCCESS)
        MPIR_ERR_POP(mpi_errno);

#ifndef MPIDI_CH4_DIRECT_NETMOD
    mpi_errno = MPIDI_SHM_rma_win_cmpl_hook(win);
    if (mpi_errno != MPI_SUCCESS)
        MPIR_ERR_POP(mpi_errno);
#endif

    /* Ensure completion of AM operations */

    /* FIXME: now we simply set per-target counters for lockall in case
     * user flushes per target, but this should be optimized. */
    do {
        MPID_THREAD_CS_EXIT(VNI, MPIDI_CH4_Global.vni_lock);
        MPIDI_CH4R_PROGRESS();
        MPID_THREAD_CS_ENTER(VNI, MPIDI_CH4_Global.vni_lock);
        MPIDI_win_check_all_targets_remote_completed(win, &all_remote_completed);
    } while (all_remote_completed != 1);

    /* With MPI_MODE_NOCHECK no unlock messages are exchanged; only the
     * local counter is reset. */
    if (MPIDI_CH4U_WIN(win, sync).assert_mode & MPI_MODE_NOCHECK) {
        MPIDI_CH4U_WIN(win, sync).lockall.allLocked = 0;
        goto no_check;
    }

    /* Send one unlock-all request to every rank of the window communicator. */
    for (i = 0; i < win->comm_ptr->local_size; i++) {
        MPIDI_CH4U_win_cntrl_msg_t msg;
        msg.win_id = MPIDI_CH4U_WIN(win, win_id);
        msg.origin_rank = win->comm_ptr->rank;

#ifndef MPIDI_CH4_DIRECT_NETMOD
        /* Ranks reported as local are reached through the shmmod. */
        if (MPIDI_CH4_rank_is_local(i, win->comm_ptr))
            mpi_errno = MPIDI_SHM_am_send_hdr(i, win->comm_ptr,
                                              MPIDI_CH4U_WIN_UNLOCKALL, &msg, sizeof(msg));
        else
#endif
        {
            mpi_errno = MPIDI_NM_am_send_hdr(i, win->comm_ptr,
                                             MPIDI_CH4U_WIN_UNLOCKALL, &msg, sizeof(msg));
        }

        if (mpi_errno != MPI_SUCCESS)
            MPIR_ERR_SETANDSTMT(mpi_errno, MPI_ERR_RMA_SYNC, goto fn_fail, "**rmasync");
    }

    /* Keep making progress while the allLocked counter is still non-zero. */
    MPIDI_CH4R_PROGRESS_WHILE(MPIDI_CH4U_WIN(win, sync).lockall.allLocked);
  no_check:
    /* In performance-efficient mode, all allocated targets are freed at win_finalize. */
    if (MPIR_CVAR_CH4_RMA_MEM_EFFICIENT)
        MPIDI_CH4U_win_target_cleanall(win);
    MPIDI_CH4U_WIN(win, sync).access_epoch_type = MPIDI_CH4U_EPOTYPE_NONE;
    MPIDI_CH4U_WIN(win, sync).assert_mode = 0;

  fn_exit:
    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDI_CH4R_MPI_WIN_UNLOCK_ALL);
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}

#undef FUNCNAME
#define FUNCNAME MPIDI_CH4R_mpi_win_create_dynamic
#undef FCNAME
#define FCNAME MPL_QUOTE(FUNCNAME)
/* Create a dynamic window (MPI_WIN_FLAVOR_DYNAMIC) over `comm`.
 *
 * info     info object forwarded to MPIDI_CH4R_win_init
 * comm     communicator the window is created on
 * win_ptr  out: receives the new window object
 *
 * The window is initialized with length 0 and displacement unit 1, its base
 * is set to MPI_BOTTOM, the netmod (and, unless netmod-direct, shmmod)
 * creation hooks are invoked, and finally a barrier is executed on `comm`.
 *
 * Returns MPI_SUCCESS, or the error code of the first step that failed.
 * A failure of MPIDI_CH4R_win_init is propagated to the caller; *win_ptr
 * is not touched by this function in that case. */
static inline int MPIDI_CH4R_mpi_win_create_dynamic(MPIR_Info * info,
                                                    MPIR_Comm * comm, MPIR_Win ** win_ptr)
{
    int mpi_errno = MPI_SUCCESS;
    MPIR_Errflag_t errflag = MPIR_ERR_NONE;
    MPIR_Win *win;

    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_CH4R_MPI_WIN_CREATE_DYNAMIC);
    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_CH4R_MPI_WIN_CREATE_DYNAMIC);

    /* The result must land in mpi_errno: it is the value returned at
     * fn_exit, so keeping it in a separate variable would report
     * MPI_SUCCESS to the caller even though initialization failed. */
    mpi_errno = MPIDI_CH4R_win_init(0, 1, win_ptr, info, comm,
                                    MPI_WIN_FLAVOR_DYNAMIC, MPI_WIN_UNIFIED);
    if (mpi_errno != MPI_SUCCESS)
        MPIR_ERR_POP(mpi_errno);

    win = *win_ptr;
    /* A dynamic window has no memory attached at creation time. */
    win->base = MPI_BOTTOM;

    mpi_errno = MPIDI_NM_mpi_win_create_dynamic_hook(win);
    if (mpi_errno != MPI_SUCCESS)
        MPIR_ERR_POP(mpi_errno);

#ifndef MPIDI_CH4_DIRECT_NETMOD
    mpi_errno = MPIDI_SHM_mpi_win_create_dynamic_hook(win);
    if (mpi_errno != MPI_SUCCESS)
        MPIR_ERR_POP(mpi_errno);
#endif

    /* Barrier over comm before returning the window to the caller. */
    mpi_errno = MPIR_Barrier(comm, &errflag);
    if (mpi_errno != MPI_SUCCESS)
        MPIR_ERR_POP(mpi_errno);

  fn_exit:
    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDI_CH4R_MPI_WIN_CREATE_DYNAMIC);
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}

#undef FUNCNAME
#define FUNCNAME MPIDI_CH4R_mpi_win_flush_local
#undef FCNAME
#define FCNAME MPL_QUOTE(FUNCNAME)
/* Wait for local completion of the operations issued to `rank` on `win`.
 *
 * rank  target rank; MPI_PROC_NULL returns right after the epoch check
 * win   window, which must pass MPIDI_CH4U_EPOCH_CHECK_PASSIVE
 *
 * The netmod (and, unless netmod-direct, shmmod) per-target local-completion
 * hooks run first. Progress is then made at least once, and repeatedly while
 * a target object exists for `rank` and its local_cmpl_cnts counter is
 * non-zero.
 *
 * Returns MPI_SUCCESS, or an MPI error code. Every failure, including a
 * failed epoch check, leaves through fn_fail/fn_exit so that the
 * function-exit trace macro is always executed. */
static inline int MPIDI_CH4R_mpi_win_flush_local(int rank, MPIR_Win * win)
{
    int mpi_errno = MPI_SUCCESS;
    /* Declared up front so that no goto jumps across its initialization. */
    MPIDI_CH4U_win_target_t *target_ptr = NULL;
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_CH4R_MPI_WIN_FLUSH_LOCAL);
    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_CH4R_MPI_WIN_FLUSH_LOCAL);

    /* Check window lock epoch.
     * PROC_NULL does not update per-target epoch.
     * Do not return directly on failure: go through fn_fail so that
     * MPIR_FUNC_VERBOSE_EXIT pairs with the ENTER above. */
    MPIDI_CH4U_EPOCH_CHECK_PASSIVE(win, mpi_errno, goto fn_fail);
    if (rank == MPI_PROC_NULL)
        goto fn_exit;

    /* Ensure op local completion in netmod and shmmod */
    mpi_errno = MPIDI_NM_rma_target_local_cmpl_hook(rank, win);
    if (mpi_errno != MPI_SUCCESS)
        MPIR_ERR_POP(mpi_errno);

#ifndef MPIDI_CH4_DIRECT_NETMOD
    mpi_errno = MPIDI_SHM_rma_target_local_cmpl_hook(rank, win);
    if (mpi_errno != MPI_SUCCESS)
        MPIR_ERR_POP(mpi_errno);
#endif

    /* Ensure completion of AM operations issued to the target.
     * If target object is not created (e.g., when all operations issued
     * to the target were via shm and in lockall), we also need trigger
     * progress once to handle remote AM. */
    target_ptr = MPIDI_CH4U_win_target_find(win, rank);
    if (target_ptr) {
        /* The per-target lock check only applies inside a lock epoch. */
        if (MPIDI_CH4U_WIN(win, sync).access_epoch_type == MPIDI_CH4U_EPOTYPE_LOCK)
            MPIDI_CH4U_EPOCH_CHECK_TARGET_LOCK(target_ptr, mpi_errno, goto fn_fail);
    }

    do {
        MPID_THREAD_CS_EXIT(VNI, MPIDI_CH4_Global.vni_lock);
        MPIDI_CH4R_PROGRESS();
        MPID_THREAD_CS_ENTER(VNI, MPIDI_CH4_Global.vni_lock);
    } while (target_ptr && MPIR_cc_get(target_ptr->local_cmpl_cnts) != 0);

  fn_exit:
    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDI_CH4R_MPI_WIN_FLUSH_LOCAL);
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}

#undef FUNCNAME
#define FUNCNAME MPIDI_CH4R_mpi_win_sync
#undef FCNAME
#define FCNAME MPL_QUOTE(FUNCNAME)
/* Window synchronization entry point for `win`.
 *
 * After the passive-target epoch check succeeds, the only work done here is
 * a call to OPA_read_write_barrier(); no progress is made and no netmod or
 * shmmod hook is invoked.
 *
 * Returns MPI_SUCCESS, or the error code set by the epoch check. */
static inline int MPIDI_CH4R_mpi_win_sync(MPIR_Win * win)
{
    int mpi_errno = MPI_SUCCESS;
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_CH4R_MPI_WIN_SYNC);
    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_CH4R_MPI_WIN_SYNC);

    /* On a failed check the macro executes `goto fn_fail`. */
    MPIDI_CH4U_EPOCH_CHECK_PASSIVE(win, mpi_errno, goto fn_fail);
    /* NOTE(review): presumably a full load/store memory fence from OpenPA --
     * confirm against opa_primitives.h. */
    OPA_read_write_barrier();

  fn_exit:
    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDI_CH4R_MPI_WIN_SYNC);
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}

#undef FUNCNAME
#define FUNCNAME MPIDI_CH4R_mpi_win_flush_all
#undef FCNAME
#define FCNAME MPL_QUOTE(FUNCNAME)
/* Wait for completion of the operations issued on `win` to all targets.
 *
 * The window has to pass MPIDI_CH4U_EPOCH_CHECK_PASSIVE. The netmod
 * completion hook runs first, followed by the shmmod one when the build is
 * not netmod-direct. The progress engine is then driven (at least once)
 * until MPIDI_win_check_all_targets_remote_completed() reports 1.
 *
 * Returns MPI_SUCCESS, or the error code of the step that failed. */
static inline int MPIDI_CH4R_mpi_win_flush_all(MPIR_Win * win)
{
    int mpi_errno = MPI_SUCCESS;
    int done = 0;
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_CH4R_MPI_WIN_FLUSH_ALL);
    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_CH4R_MPI_WIN_FLUSH_ALL);

    MPIDI_CH4U_EPOCH_CHECK_PASSIVE(win, mpi_errno, goto fn_fail);

    /* Completion of operations owned by the netmod ... */
    mpi_errno = MPIDI_NM_rma_win_cmpl_hook(win);
    if (mpi_errno != MPI_SUCCESS)
        MPIR_ERR_POP(mpi_errno);

#ifndef MPIDI_CH4_DIRECT_NETMOD
    /* ... and of those owned by the shmmod. */
    mpi_errno = MPIDI_SHM_rma_win_cmpl_hook(win);
    if (mpi_errno != MPI_SUCCESS)
        MPIR_ERR_POP(mpi_errno);
#endif

    /* Completion of AM operations. */
    for (;;) {
        /* The VNI critical section is released around the progress call. */
        MPID_THREAD_CS_EXIT(VNI, MPIDI_CH4_Global.vni_lock);
        MPIDI_CH4R_PROGRESS();
        MPID_THREAD_CS_ENTER(VNI, MPIDI_CH4_Global.vni_lock);

        /* FIXME: now we simply set per-target counters for lockall in case
         * user flushes per target, but this should be optimized. */
        MPIDI_win_check_all_targets_remote_completed(win, &done);
        if (done == 1)
            break;
    }

  fn_exit:
    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDI_CH4R_MPI_WIN_FLUSH_ALL);
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}

#undef FUNCNAME
#define FUNCNAME MPIDI_CH4R_mpi_win_lock_all
#undef FCNAME
#define FCNAME MPL_QUOTE(FUNCNAME)
/* Open a lock_all access epoch on `win`.
 *
 * assert  assertion flags; only MPI_MODE_NOCHECK is examined here
 * win     window, which must currently have no access epoch open
 *
 * With MPI_MODE_NOCHECK the flag is recorded in the window's assert mode and
 * the allLocked counter is set to the communicator size without sending
 * anything. Otherwise an MPIDI_CH4U_WIN_LOCKALL header requesting
 * MPI_LOCK_SHARED is sent to every rank of the window communicator, and
 * progress is made until allLocked equals the communicator size.
 *
 * Returns MPI_SUCCESS, or an MPI error code (MPI_ERR_RMA_SYNC class when a
 * send fails). */
static inline int MPIDI_CH4R_mpi_win_lock_all(int assert, MPIR_Win * win)
{
    int mpi_errno = MPI_SUCCESS;
    int nranks;
    int target;

    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_CH4R_MPI_WIN_LOCK_ALL);
    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_CH4R_MPI_WIN_LOCK_ALL);

    MPIDI_CH4U_ACCESS_EPOCH_CHECK_NONE(win, mpi_errno, goto fn_fail);

    MPIR_Assert(MPIDI_CH4U_WIN(win, sync).lockall.allLocked == 0);

    nranks = win->comm_ptr->local_size;

    if (assert & MPI_MODE_NOCHECK) {
        /* No lock requests are exchanged: mark every rank as locked. */
        MPIDI_CH4U_WIN(win, sync).assert_mode |= MPI_MODE_NOCHECK;
        MPIDI_CH4U_WIN(win, sync).lockall.allLocked = nranks;
    } else {
        /* Request a shared lock from each rank in turn. */
        for (target = 0; target < nranks; target++) {
            MPIDI_CH4U_win_cntrl_msg_t lock_req;
            lock_req.win_id = MPIDI_CH4U_WIN(win, win_id);
            lock_req.origin_rank = win->comm_ptr->rank;
            lock_req.lock_type = MPI_LOCK_SHARED;

#ifndef MPIDI_CH4_DIRECT_NETMOD
            /* Ranks reported as local are reached through the shmmod. */
            if (MPIDI_CH4_rank_is_local(target, win->comm_ptr))
                mpi_errno = MPIDI_SHM_am_send_hdr(target, win->comm_ptr,
                                                  MPIDI_CH4U_WIN_LOCKALL,
                                                  &lock_req, sizeof(lock_req));
            else
#endif
            {
                mpi_errno = MPIDI_NM_am_send_hdr(target, win->comm_ptr,
                                                 MPIDI_CH4U_WIN_LOCKALL,
                                                 &lock_req, sizeof(lock_req));
            }

            if (mpi_errno != MPI_SUCCESS)
                MPIR_ERR_SETANDSTMT(mpi_errno, MPI_ERR_RMA_SYNC, goto fn_fail, "**rmasync");
        }

        /* Keep making progress until allLocked reaches the rank count. */
        MPIDI_CH4R_PROGRESS_WHILE(nranks != (int) MPIDI_CH4U_WIN(win, sync).lockall.allLocked);
    }

    MPIDI_CH4U_WIN(win, sync).access_epoch_type = MPIDI_CH4U_EPOTYPE_LOCK_ALL;

  fn_exit:
    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDI_CH4R_MPI_WIN_LOCK_ALL);
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}

#endif /* CH4R_WIN_H_INCLUDED */