/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
/*
 *  (C) 2001 by Argonne National Laboratory.
 *      See COPYRIGHT in top-level directory.
 */

#ifndef MPID_RMA_OPLIST_H_INCLUDED
#define MPID_RMA_OPLIST_H_INCLUDED

#include "utlist.h"
#include "mpid_rma_types.h"

/* Out-of-line RMA routines used by the inline helpers in this header;
 * they are only declared here. */

/* Invoked by MPIDI_CH3I_Win_get_op() each time no op element could be
 * allocated.  NOTE(review): presumably reclaims op elements so that the
 * next allocation attempt can succeed -- confirm against the definition. */
int MPIDI_CH3I_RMA_Cleanup_ops_aggressive(MPIR_Win * win_ptr);
/* Invoked by MPIDI_CH3I_Win_create_target() when no target element could
 * be allocated; that caller dereferences *target right after a successful
 * return, so it relies on *target being set to a usable element. */
int MPIDI_CH3I_RMA_Cleanup_target_aggressive(MPIR_Win * win_ptr, MPIDI_RMA_Target_t ** target);
/* Not referenced in this header.  NOTE(review): judging by the names,
 * these drive RMA progress for a single target / for a whole window and
 * report through *made_progress -- confirm against the definitions. */
int MPIDI_CH3I_RMA_Make_progress_target(MPIR_Win * win_ptr, int target_rank, int *made_progress);
int MPIDI_CH3I_RMA_Make_progress_win(MPIR_Win * win_ptr, int *made_progress);

/* Global element pools.  The *_head pointers are the lists the alloc
 * routines fall back to when a window's own pool is empty, and the lists
 * that the free routines push onto for elements whose pool_type is not
 * MPIDI_RMA_POOL_WIN.  The *_start pointers are not referenced in this
 * header. */
extern MPIDI_RMA_Op_t *global_rma_op_pool_head, *global_rma_op_pool_start;
extern MPIDI_RMA_Target_t *global_rma_target_pool_head, *global_rma_target_pool_start;

/* NOTE(review): presumably the timer started/stopped around op allocation
 * in MPIDI_CH3I_Win_get_op() via MPIR_T_PVAR_TIMER_START/END(RMA,
 * rma_rmaqueue_alloc) -- confirm against the PVAR macro definitions. */
extern MPIR_T_pvar_timer_t PVAR_TIMER_rma_rmaqueue_alloc ATTRIBUTE((unused));

/* Compute the completion flags of one target.
 *
 * local_completed_ and remote_completed_ are output lvalues; both are
 * first reset to 0.
 *
 * local_completed_ becomes 1 only when the window is not in a FENCE_ISSUED,
 * PSCW_ISSUED or LOCK_ALL_ISSUED state, the target is not in a LOCK_CALLED
 * or LOCK_ISSUED state, both pending op lists of the target are empty, and
 * no packet is still waiting for local completion.
 *
 * remote_completed_ becomes 1 only when local completion holds and, in
 * addition, the target has no sync flag set, no ops with an un-issued
 * flush, and no outstanding acks.
 *
 * win_ and target_ are evaluated several times; do not pass expressions
 * with side effects. */
#define MPIDI_CH3I_RMA_ops_completion(win_, target_, local_completed_, remote_completed_) \
    do {                                                                \
        (local_completed_) = 0;                                         \
        (remote_completed_) = 0;                                        \
                                                                        \
        if ((win_)->states.access_state == MPIDI_RMA_FENCE_ISSUED ||    \
            (win_)->states.access_state == MPIDI_RMA_PSCW_ISSUED ||     \
            (win_)->states.access_state == MPIDI_RMA_LOCK_ALL_ISSUED || \
            (target_)->access_state == MPIDI_RMA_LOCK_CALLED ||         \
            (target_)->access_state == MPIDI_RMA_LOCK_ISSUED ||         \
            (target_)->pending_net_ops_list_head != NULL ||             \
            (target_)->pending_user_ops_list_head != NULL ||            \
            (target_)->num_pkts_wait_for_local_completion != 0)         \
            break;                                                      \
                                                                        \
        (local_completed_) = 1;                                         \
                                                                        \
        if ((target_)->sync.sync_flag != MPIDI_RMA_SYNC_NONE ||         \
            (target_)->num_ops_flush_not_issued != 0 ||                 \
            (target_)->sync.outstanding_acks != 0)                      \
            break;                                                      \
                                                                        \
        (remote_completed_) = 1;                                        \
    } while (0)


/* Window-wide remote completion check.
 *
 * Walks every target linked into every slot of win_ptr_ and evaluates
 * MPIDI_CH3I_RMA_ops_completion() on it.  win_remote_completed_ (an output
 * lvalue) is set to 1 when no target is found that lacks remote
 * completion -- which includes the case of a window with no targets --
 * and to 0 otherwise. */
#define MPIDI_CH3I_RMA_ops_win_remote_completion(win_ptr_, win_remote_completed_) \
    do {                                                                \
        MPIDI_RMA_Target_t *win_target_ = NULL;                         \
        int i_;                                                         \
        int incomplete_targets_ = 0;                                    \
                                                                        \
        (win_remote_completed_) = 0;                                    \
                                                                        \
        for (i_ = 0; i_ < (win_ptr_)->num_slots; i_++) {                \
            for (win_target_ = (win_ptr_)->slots[i_].target_list_head;  \
                 win_target_ != NULL;                                   \
                 win_target_ = win_target_->next) {                     \
                int local_ ATTRIBUTE((unused)) = 0, remote_ = 0;        \
                                                                        \
                MPIDI_CH3I_RMA_ops_completion((win_ptr_), win_target_, local_, remote_); \
                                                                        \
                if (!remote_)                                           \
                    incomplete_targets_++;                              \
            }                                                           \
        }                                                               \
                                                                        \
        if (incomplete_targets_ == 0)                                   \
            (win_remote_completed_) = 1;                                \
                                                                        \
    } while (0)

/* Window-wide local completion check.
 *
 * Walks every target linked into every slot of win_ptr_ and evaluates
 * MPIDI_CH3I_RMA_ops_completion() on it.  win_local_completed_ (an output
 * lvalue) is set to 1 when the number of locally completed targets equals
 * the number of targets visited -- which includes the case of a window
 * with no targets -- and to 0 otherwise.
 *
 * Only the local flag of each target is consulted; the remote flag that
 * MPIDI_CH3I_RMA_ops_completion() also produces is ignored here. */
#define MPIDI_CH3I_RMA_ops_win_local_completion(win_ptr_, win_local_completed_) \
    do {                                                                \
        MPIDI_RMA_Target_t *win_target_ = NULL;                         \
        int i_, total_local_cnt_ = 0;                                   \
        int local_completed_targets_ = 0;                               \
                                                                        \
        (win_local_completed_) = 0;                                     \
                                                                        \
        for (i_ = 0; i_ < (win_ptr_)->num_slots; i_++) {                \
            for (win_target_ = (win_ptr_)->slots[i_].target_list_head; win_target_;) { \
                int local_ = 0, remote_ ATTRIBUTE((unused)) = 0;        \
                                                                        \
                total_local_cnt_++;                                     \
                                                                        \
                MPIDI_CH3I_RMA_ops_completion((win_ptr_), win_target_, local_, remote_); \
                                                                        \
                local_completed_targets_ += local_;                     \
                                                                        \
                win_target_ = win_target_->next;                        \
            }                                                           \
        }                                                               \
                                                                        \
        if (local_completed_targets_ == total_local_cnt_)               \
            (win_local_completed_) = 1;                                 \
                                                                        \
    } while (0)


/* Map a rank to the address of its slot in win_ptr_->slots.  When the
 * window has at least as many slots as the communicator's local_size the
 * rank indexes the slot array directly; otherwise the rank is folded into
 * range with a modulo by num_slots. */
#define MPIDI_CH3I_RMA_RANK_TO_SLOT(win_ptr_, rank_)                    \
    (((win_ptr_)->num_slots >= (win_ptr_)->comm_ptr->local_size) ?      \
     ((win_ptr_)->slots + (rank_)) :                                    \
     ((win_ptr_)->slots + ((rank_) % (win_ptr_)->num_slots)))

/* MPIDI_CH3I_Win_set_active(): mark a window active and move it from the
 * inactive window list to the tail of the active window list.  Does
 * nothing when the window is already active.  Always returns
 * MPI_SUCCESS. */
#undef FUNCNAME
#define FUNCNAME MPIDI_CH3I_Win_set_active
#undef FCNAME
#define FCNAME MPL_QUOTE(FUNCNAME)
static inline int MPIDI_CH3I_Win_set_active(MPIR_Win * win_ptr)
{
    int mpi_errno = MPI_SUCCESS;

    /* Nothing to do for a window that is already on the active list. */
    if (win_ptr->active != FALSE)
        goto fn_exit;

    win_ptr->active = TRUE;

    /* The active list is still empty at this point, so this window is
     * about to become the first active one: turn the RMA progress hook
     * on before linking it in. */
    if (MPIDI_RMA_Win_active_list_head == NULL) {
        MPID_Progress_activate_hook(MPIDI_CH3I_RMA_Progress_hook_id);
    }

    DL_DELETE(MPIDI_RMA_Win_inactive_list_head, win_ptr);
    DL_APPEND(MPIDI_RMA_Win_active_list_head, win_ptr);

  fn_exit:
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}


/* MPIDI_CH3I_Win_set_inactive(): mark a window inactive and move it from
 * the active window list to the tail of the inactive window list.  Does
 * nothing when the window is not currently active.  Always returns
 * MPI_SUCCESS. */
#undef FUNCNAME
#define FUNCNAME MPIDI_CH3I_Win_set_inactive
#undef FCNAME
#define FCNAME MPL_QUOTE(FUNCNAME)
static inline int MPIDI_CH3I_Win_set_inactive(MPIR_Win * win_ptr)
{
    int mpi_errno = MPI_SUCCESS;

    /* Only a window currently flagged active needs to be moved. */
    if (win_ptr->active != TRUE)
        goto fn_exit;

    win_ptr->active = FALSE;

    DL_DELETE(MPIDI_RMA_Win_active_list_head, win_ptr);
    DL_APPEND(MPIDI_RMA_Win_inactive_list_head, win_ptr);

    /* That was the last active window: turn the RMA progress hook off. */
    if (MPIDI_RMA_Win_active_list_head == NULL) {
        MPID_Progress_deactivate_hook(MPIDI_CH3I_RMA_Progress_hook_id);
    }

  fn_exit:
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}


/* MPIDI_CH3I_Win_op_alloc(): take one op element out of a pool and reset
 * it for reuse.  The window's own pool is tried first and the global pool
 * second; NULL is returned when both are empty.
 *
 * Only the request bookkeeping fields, the piggyback/stream counters and
 * the two datatype handles are reset here; every other field of the
 * element is left as it was. */
#undef FUNCNAME
#define FUNCNAME MPIDI_CH3I_Win_op_alloc
#undef FCNAME
#define FCNAME MPL_QUOTE(FUNCNAME)
static inline MPIDI_RMA_Op_t *MPIDI_CH3I_Win_op_alloc(MPIR_Win * win_ptr)
{
    MPIDI_RMA_Op_t *op = NULL;

    if (win_ptr->op_pool_head != NULL) {
        /* Preferred source: the window-local pool. */
        op = win_ptr->op_pool_head;
        DL_DELETE(win_ptr->op_pool_head, op);
    }
    else if (global_rma_op_pool_head != NULL) {
        /* Local pool exhausted: borrow from the global pool. */
        op = global_rma_op_pool_head;
        DL_DELETE(global_rma_op_pool_head, op);
    }
    else {
        /* Nothing available anywhere. */
        return NULL;
    }

    /* Request bookkeeping. */
    op->single_req = NULL;
    op->multi_reqs = NULL;
    op->reqs_size = 0;
    op->ureq = NULL;

    /* Issuing state. */
    op->piggyback_lock_candidate = 0;
    op->issued_stream_count = 0;

    /* Datatype handles. */
    op->origin_datatype = MPI_DATATYPE_NULL;
    op->result_datatype = MPI_DATATYPE_NULL;

    return op;
}

/* MPIDI_CH3I_Win_op_free(): release the multi-request array of an op
 * element (if it has one) and hand the element back to the pool it came
 * from, as recorded in e->pool_type.  Always returns MPI_SUCCESS. */
#undef FUNCNAME
#define FUNCNAME MPIDI_CH3I_Win_op_free
#undef FCNAME
#define FCNAME MPL_QUOTE(FUNCNAME)
static inline int MPIDI_CH3I_Win_op_free(MPIR_Win * win_ptr, MPIDI_RMA_Op_t * e)
{
    if (e->multi_reqs != NULL) {
        MPL_free(e->multi_reqs);
    }

    /* Return the element to its pool of origin so that, when pools are
     * torn down at window free time, it does not end up in the global
     * pool or in another window's pool.  The element goes to the front of
     * the list so the most recently used one is handed out next, which
     * helps cache performance. */
    if (e->pool_type != MPIDI_RMA_POOL_WIN) {
        DL_PREPEND(global_rma_op_pool_head, e);
    }
    else {
        DL_PREPEND(win_ptr->op_pool_head, e);
    }

    return MPI_SUCCESS;
}

/* MPIDI_CH3I_Win_target_alloc(): take one target element out of a pool
 * and reset it.  The window's own pool is tried first and the global pool
 * second; NULL is returned when both are empty.
 *
 * On success the element has empty op lists, target_rank -1, no lock, no
 * sync flag and all counters at zero. */
#undef FUNCNAME
#define FUNCNAME MPIDI_CH3I_Win_target_alloc
#undef FCNAME
#define FCNAME MPL_QUOTE(FUNCNAME)
static inline MPIDI_RMA_Target_t *MPIDI_CH3I_Win_target_alloc(MPIR_Win * win_ptr)
{
    MPIDI_RMA_Target_t *tgt = NULL;

    if (win_ptr->target_pool_head != NULL) {
        /* Preferred source: the window-local pool. */
        tgt = win_ptr->target_pool_head;
        DL_DELETE(win_ptr->target_pool_head, tgt);
    }
    else if (global_rma_target_pool_head != NULL) {
        /* Local pool exhausted: borrow from the global pool. */
        tgt = global_rma_target_pool_head;
        DL_DELETE(global_rma_target_pool_head, tgt);
    }
    else {
        /* Nothing available anywhere. */
        return NULL;
    }

    /* Identity and lock state. */
    tgt->target_rank = -1;
    tgt->access_state = MPIDI_RMA_NONE;
    tgt->lock_type = MPID_LOCK_NONE;
    tgt->lock_mode = 0;
    tgt->win_complete_flag = 0;

    /* Operation lists start out empty. */
    tgt->pending_net_ops_list_head = NULL;
    tgt->pending_user_ops_list_head = NULL;
    tgt->next_op_to_issue = NULL;

    /* Synchronization state and counters. */
    tgt->sync.sync_flag = MPIDI_RMA_SYNC_NONE;
    tgt->sync.outstanding_acks = 0;
    tgt->num_pkts_wait_for_local_completion = 0;
    tgt->num_ops_flush_not_issued = 0;

    return tgt;
}

/* MPIDI_CH3I_Win_target_free(): hand a target element back to the pool it
 * came from, as recorded in e->pool_type.  Both pending op lists of the
 * target must already be empty (asserted).  Always returns MPI_SUCCESS. */
#undef FUNCNAME
#define FUNCNAME MPIDI_CH3I_Win_target_free
#undef FCNAME
#define FCNAME MPL_QUOTE(FUNCNAME)
static inline int MPIDI_CH3I_Win_target_free(MPIR_Win * win_ptr, MPIDI_RMA_Target_t * e)
{
    /* A target that still owns operations must not be recycled. */
    MPIR_Assert(e->pending_net_ops_list_head == NULL);
    MPIR_Assert(e->pending_user_ops_list_head == NULL);

    /* Return the element to its pool of origin so that, when pools are
     * torn down at window free time, it does not end up in the global
     * pool or in another window's pool.  The element goes to the front of
     * the list to improve cache performance. */
    if (e->pool_type != MPIDI_RMA_POOL_WIN) {
        DL_PREPEND(global_rma_target_pool_head, e);
    }
    else {
        DL_PREPEND(win_ptr->target_pool_head, e);
    }

    return MPI_SUCCESS;
}

/* MPIDI_CH3I_Win_create_target(): given a rank, create the corresponding
 * target element and append it to the target list of the rank's slot.
 *
 * The element comes from MPIDI_CH3I_Win_target_alloc(); when that returns
 * NULL, MPIDI_CH3I_RMA_Cleanup_target_aggressive() is asked to supply one.
 *
 * @param IN    win_ptr      Window whose slots receive the new target.
 * @param IN    target_rank  Rank stored in the new target.
 * @param OUT   e            Receives the new target on success; it is not
 *                           written when an error is returned.
 *
 * Returns MPI_SUCCESS, or the error code propagated from the aggressive
 * cleanup. */
#undef FUNCNAME
#define FUNCNAME MPIDI_CH3I_Win_create_target
#undef FCNAME
#define FCNAME MPL_QUOTE(FUNCNAME)
static inline int MPIDI_CH3I_Win_create_target(MPIR_Win * win_ptr, int target_rank,
                                               MPIDI_RMA_Target_t ** e)
{
    int mpi_errno = MPI_SUCCESS;
    MPIDI_RMA_Slot_t *slot = NULL;
    MPIDI_RMA_Target_t *t = NULL;

    slot = MPIDI_CH3I_RMA_RANK_TO_SLOT(win_ptr, target_rank);
    t = MPIDI_CH3I_Win_target_alloc(win_ptr);
    if (t == NULL) {
        mpi_errno = MPIDI_CH3I_RMA_Cleanup_target_aggressive(win_ptr, &t);
        if (mpi_errno != MPI_SUCCESS)
            MPIR_ERR_POP(mpi_errno);
    }

    /* Check the element BEFORE it is dereferenced or linked into the
     * list; asserting afterwards could never catch a NULL. */
    assert(t != NULL);

    t->target_rank = target_rank;

    /* Enqueue target into target list. */
    DL_APPEND(slot->target_list_head, t);

    (*e) = t;

  fn_exit:
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}

/* MPIDI_CH3I_Win_find_target(): look up the target element for a rank.
 * The target list of the rank's slot is scanned front to back; *e
 * receives the first element whose target_rank matches, or NULL when the
 * list holds no such element.  Always returns MPI_SUCCESS. */
#undef FUNCNAME
#define FUNCNAME MPIDI_CH3I_Win_find_target
#undef FCNAME
#define FCNAME MPL_QUOTE(FUNCNAME)
static inline int MPIDI_CH3I_Win_find_target(MPIR_Win * win_ptr, int target_rank,
                                             MPIDI_RMA_Target_t ** e)
{
    int mpi_errno = MPI_SUCCESS;
    MPIDI_RMA_Slot_t *slot = MPIDI_CH3I_RMA_RANK_TO_SLOT(win_ptr, target_rank);
    MPIDI_RMA_Target_t *cur = NULL;

    for (cur = slot->target_list_head; cur != NULL; cur = cur->next) {
        if (cur->target_rank == target_rank)
            break;
    }

    /* cur is NULL here when the scan ran off the end of the list. */
    (*e) = cur;

  fn_exit:
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}

/* MPIDI_CH3I_Win_enqueue_op(): given an operation, enqueue it on one of
 * the pending operation lists of the target element for op->target_rank,
 * creating that target element first when it does not exist yet.  This
 * routine is only called from operation routines.
 *
 * Returns MPI_SUCCESS, or the error code propagated from target lookup or
 * creation. */
#undef FUNCNAME
#define FUNCNAME MPIDI_CH3I_Win_enqueue_op
#undef FCNAME
#define FCNAME MPL_QUOTE(FUNCNAME)
static inline int MPIDI_CH3I_Win_enqueue_op(MPIR_Win * win_ptr, MPIDI_RMA_Op_t * op)
{
    int mpi_errno = MPI_SUCCESS;
    MPIDI_RMA_Target_t *tgt = NULL;

    mpi_errno = MPIDI_CH3I_Win_find_target(win_ptr, op->target_rank, &tgt);
    if (mpi_errno != MPI_SUCCESS) {
        MPIR_ERR_POP(mpi_errno);
    }

    if (tgt == NULL) {
        mpi_errno = MPIDI_CH3I_Win_create_target(win_ptr, op->target_rank, &tgt);
        if (mpi_errno != MPI_SUCCESS) {
            MPIR_ERR_POP(mpi_errno);
        }

        /* A freshly created target inherits its lock state from the
         * window-wide access state. */
        if (win_ptr->states.access_state == MPIDI_RMA_PER_TARGET ||
            win_ptr->states.access_state == MPIDI_RMA_LOCK_ALL_GRANTED) {
            /* MPIDI_RMA_PER_TARGET: this cannot be the first time this
             * target is created (the first time is in Win_lock).  It is
             * being re-created, and the lock must already have been
             * granted before the previous element was freed.
             *
             * MPIDI_RMA_LOCK_ALL_GRANTED: all locks have already been
             * granted, so this target's lock is granted as well. */
            tgt->access_state = MPIDI_RMA_LOCK_GRANTED;
        }
        else if (win_ptr->states.access_state == MPIDI_RMA_LOCK_ALL_CALLED) {
            /* MPIDI_RMA_LOCK_ALL_CALLED: this must be the first time this
             * target is created, so its lock has been called but not yet
             * issued; lock-all takes shared locks. */
            tgt->access_state = MPIDI_RMA_LOCK_CALLED;
            tgt->lock_type = MPI_LOCK_SHARED;
        }
    }

    /* A request-based operation (op->ureq set) must never sit on the
     * pending user list, otherwise a wait call made before unlock would
     * block. */
    if (MPIR_CVAR_CH3_RMA_DELAY_ISSUING_FOR_PIGGYBACKING && op->ureq == NULL) {
        MPIDI_RMA_Op_t *held_op = tgt->pending_user_ops_list_head;

        if (held_op != NULL) {
            /* Promote the head of the user pending list to the net
             * pending list before parking the new operation. */
            if (tgt->pending_net_ops_list_head == NULL)
                win_ptr->num_targets_with_pending_net_ops++;
            DL_DELETE(tgt->pending_user_ops_list_head, held_op);
            DL_APPEND(tgt->pending_net_ops_list_head, held_op);

            if (tgt->next_op_to_issue == NULL)
                tgt->next_op_to_issue = held_op;
        }

        /* Park the new operation on the user pending list. */
        DL_APPEND(tgt->pending_user_ops_list_head, op);
    }
    else {
        /* Queue the new operation directly on the net pending list. */
        if (tgt->pending_net_ops_list_head == NULL)
            win_ptr->num_targets_with_pending_net_ops++;
        DL_APPEND(tgt->pending_net_ops_list_head, op);

        if (tgt->next_op_to_issue == NULL)
            tgt->next_op_to_issue = op;
    }

    /* The window becomes active once this target has net-pending
     * operations and access has been granted, either window-wide or for
     * this target. */
    if (tgt->pending_net_ops_list_head != NULL) {
        int access_granted =
            (win_ptr->states.access_state == MPIDI_RMA_FENCE_GRANTED ||
             win_ptr->states.access_state == MPIDI_RMA_PSCW_GRANTED ||
             win_ptr->states.access_state == MPIDI_RMA_LOCK_ALL_GRANTED ||
             tgt->access_state == MPIDI_RMA_LOCK_GRANTED);

        if (access_granted)
            MPIDI_CH3I_Win_set_active(win_ptr);
    }

  fn_exit:
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}


/* MPIDI_CH3I_Win_target_dequeue_and_free(): unlink a target element from
 * the target list of the slot its rank maps to, then return it to its
 * pool through MPIDI_CH3I_Win_target_free().  Returns whatever error code
 * that call produces. */
#undef FUNCNAME
#define FUNCNAME MPIDI_CH3I_Win_target_dequeue_and_free
#undef FCNAME
#define FCNAME MPL_QUOTE(FUNCNAME)
static inline int MPIDI_CH3I_Win_target_dequeue_and_free(MPIR_Win * win_ptr, MPIDI_RMA_Target_t * e)
{
    int mpi_errno = MPI_SUCCESS;
    MPIDI_RMA_Slot_t *owner_slot = MPIDI_CH3I_RMA_RANK_TO_SLOT(win_ptr, e->target_rank);

    DL_DELETE(owner_slot->target_list_head, e);

    mpi_errno = MPIDI_CH3I_Win_target_free(win_ptr, e);
    if (mpi_errno != MPI_SUCCESS) {
        MPIR_ERR_POP(mpi_errno);
    }

  fn_exit:
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}

/* MPIDI_CH3I_RMA_Cleanup_targets_win(): dequeue and free every target
 * element found in every slot of the window.  Stops at, and returns, the
 * first error reported by MPIDI_CH3I_Win_target_dequeue_and_free();
 * returns MPI_SUCCESS otherwise. */
#undef FUNCNAME
#define FUNCNAME MPIDI_CH3I_RMA_Cleanup_targets_win
#undef FCNAME
#define FCNAME MPL_QUOTE(FUNCNAME)
static inline int MPIDI_CH3I_RMA_Cleanup_targets_win(MPIR_Win * win_ptr)
{
    int mpi_errno = MPI_SUCCESS;
    int slot_idx;

    for (slot_idx = 0; slot_idx < win_ptr->num_slots; slot_idx++) {
        MPIDI_RMA_Target_t *cur = win_ptr->slots[slot_idx].target_list_head;

        while (cur != NULL) {
            /* Remember the successor first: cur is unlinked and recycled
             * by the call below. */
            MPIDI_RMA_Target_t *following = cur->next;

            mpi_errno = MPIDI_CH3I_Win_target_dequeue_and_free(win_ptr, cur);
            if (mpi_errno != MPI_SUCCESS) {
                MPIR_ERR_POP(mpi_errno);
            }

            cur = following;
        }
    }

  fn_exit:
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}

/* MPIDI_CH3I_Win_get_op(): obtain an op element, retrying until one is
 * available.  Each failed allocation attempt is followed by a call to
 * MPIDI_CH3I_RMA_Cleanup_ops_aggressive() before trying again.
 *
 * On success *e receives the element and MPI_SUCCESS is returned.  If the
 * cleanup call fails, its error code is returned and *e is not written.
 * Each allocation attempt is bracketed by the rma_rmaqueue_alloc PVAR
 * timer macros. */
#undef FUNCNAME
#define FUNCNAME MPIDI_CH3I_Win_get_op
#undef FCNAME
#define FCNAME MPL_QUOTE(FUNCNAME)
static inline int MPIDI_CH3I_Win_get_op(MPIR_Win * win_ptr, MPIDI_RMA_Op_t ** e)
{
    int mpi_errno = MPI_SUCCESS;
    MPIDI_RMA_Op_t *fresh_op = NULL;

    do {
        MPIR_T_PVAR_TIMER_START(RMA, rma_rmaqueue_alloc);
        fresh_op = MPIDI_CH3I_Win_op_alloc(win_ptr);
        MPIR_T_PVAR_TIMER_END(RMA, rma_rmaqueue_alloc);

        if (fresh_op == NULL) {
            /* Both pools are empty: run the aggressive cleanup, then
             * retry the allocation. */
            mpi_errno = MPIDI_CH3I_RMA_Cleanup_ops_aggressive(win_ptr);
            if (mpi_errno != MPI_SUCCESS) {
                MPIR_ERR_POP(mpi_errno);
            }
        }
    } while (fresh_op == NULL);

    (*e) = fresh_op;

  fn_exit:
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}


/* Unlink an element from an RMA operations list and return it to its op
 * pool.
 *
 * @param IN    win_ptr   Window passed on to MPIDI_CH3I_Win_op_free()
 * @param IN    list      Pointer to the RMA ops list holding the element
 * @param IN    curr_ptr  Pointer to the element to be freed; must not be
 *                        NULL (asserted)
 */
#undef FUNCNAME
#define FUNCNAME MPIDI_CH3I_RMA_Ops_free_elem
#undef FCNAME
#define FCNAME MPL_QUOTE(FUNCNAME)
static inline void MPIDI_CH3I_RMA_Ops_free_elem(MPIR_Win * win_ptr, MPIDI_RMA_Ops_list_t * list,
                                                MPIDI_RMA_Op_t * curr_ptr)
{
    MPIR_Assert(curr_ptr != NULL);

    /* Take the element off its list before recycling it. */
    DL_DELETE(*list, curr_ptr);

    MPIDI_CH3I_Win_op_free(win_ptr, curr_ptr);
}


#endif /* MPID_RMA_OPLIST_H_INCLUDED */