Blame src/mpi/coll/allgatherv/allgatherv_intra_recursive_doubling.c

Packit Service c5cf8c
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
Packit Service c5cf8c
/*
Packit Service c5cf8c
 *
Packit Service c5cf8c
 *  (C) 2001 by Argonne National Laboratory.
Packit Service c5cf8c
 *      See COPYRIGHT in top-level directory.
Packit Service c5cf8c
 */
Packit Service c5cf8c
Packit Service c5cf8c
#include "mpiimpl.h"
Packit Service c5cf8c
Packit Service c5cf8c
/*
Packit Service c5cf8c
 * Recursive Doubling Algorithm:
Packit Service c5cf8c
 *
Packit Service c5cf8c
 * Restrictions: power-of-two no. of processes
Packit Service c5cf8c
 *
Packit Service c5cf8c
 * Cost = lgp.alpha + n.((p-1)/p).beta
Packit Service c5cf8c
 *
Packit Service c5cf8c
 * TODO: On TCP, we may want to use recursive doubling instead of the
Packit Service c5cf8c
 * Bruck's algorithm in all cases because of the pairwise-exchange
Packit Service c5cf8c
 * property of recursive doubling (see Benson et al paper in Euro
Packit Service c5cf8c
 * PVM/MPI 2003).
Packit Service c5cf8c
 */
Packit Service c5cf8c
Packit Service c5cf8c
#undef FUNCNAME
Packit Service c5cf8c
#define FUNCNAME MPIR_Allgatherv_intra_recursive_doubling
Packit Service c5cf8c
#undef FCNAME
Packit Service c5cf8c
#define FCNAME MPL_QUOTE(FUNCNAME)
Packit Service c5cf8c
int MPIR_Allgatherv_intra_recursive_doubling(const void *sendbuf,
Packit Service c5cf8c
                                             int sendcount,
Packit Service c5cf8c
                                             MPI_Datatype sendtype,
Packit Service c5cf8c
                                             void *recvbuf,
Packit Service c5cf8c
                                             const int *recvcounts,
Packit Service c5cf8c
                                             const int *displs,
Packit Service c5cf8c
                                             MPI_Datatype recvtype,
Packit Service c5cf8c
                                             MPIR_Comm * comm_ptr, MPIR_Errflag_t * errflag)
Packit Service c5cf8c
{
Packit Service c5cf8c
    int comm_size, rank, j, i;
Packit Service c5cf8c
    int mpi_errno = MPI_SUCCESS;
Packit Service c5cf8c
    int mpi_errno_ret = MPI_SUCCESS;
Packit Service c5cf8c
    MPI_Status status;
Packit Service c5cf8c
    MPI_Aint recvtype_extent, recvtype_true_extent, recvtype_true_lb;
Packit Service c5cf8c
    MPI_Aint curr_cnt, last_recv_cnt;
Packit Service c5cf8c
    int dst, total_count;
Packit Service c5cf8c
    void *tmp_buf;
Packit Service c5cf8c
    int mask, dst_tree_root, my_tree_root, position,
Packit Service c5cf8c
        send_offset, recv_offset, nprocs_completed, k, offset, tmp_mask, tree_root;
Packit Service c5cf8c
    MPIR_CHKLMEM_DECL(1);
Packit Service c5cf8c
Packit Service c5cf8c
    comm_size = comm_ptr->local_size;
Packit Service c5cf8c
    rank = comm_ptr->rank;
Packit Service c5cf8c
Packit Service c5cf8c
#ifdef HAVE_ERROR_CHECKING
Packit Service c5cf8c
    /* Currently this algorithm can only handle power-of-2 comm_size.
Packit Service c5cf8c
     * Non power-of-2 comm_size is still experimental */
Packit Service c5cf8c
    MPIR_Assert(!(comm_size & (comm_size - 1)));
Packit Service c5cf8c
#endif /* HAVE_ERROR_CHECKING */
Packit Service c5cf8c
Packit Service c5cf8c
    total_count = 0;
Packit Service c5cf8c
    for (i = 0; i < comm_size; i++)
Packit Service c5cf8c
        total_count += recvcounts[i];
Packit Service c5cf8c
Packit Service c5cf8c
    if (total_count == 0)
Packit Service c5cf8c
        goto fn_exit;
Packit Service c5cf8c
Packit Service c5cf8c
    MPIR_Datatype_get_extent_macro(recvtype, recvtype_extent);
Packit Service c5cf8c
Packit Service c5cf8c
Packit Service c5cf8c
    /* need to receive contiguously into tmp_buf because
Packit Service c5cf8c
     * displs could make the recvbuf noncontiguous */
Packit Service c5cf8c
Packit Service c5cf8c
    MPIR_Type_get_true_extent_impl(recvtype, &recvtype_true_lb, &recvtype_true_extent);
Packit Service c5cf8c
Packit Service c5cf8c
    MPIR_Ensure_Aint_fits_in_pointer(total_count *
Packit Service c5cf8c
                                     (MPL_MAX(recvtype_true_extent, recvtype_extent)));
Packit Service c5cf8c
    MPIR_CHKLMEM_MALLOC(tmp_buf, void *,
Packit Service c5cf8c
                        total_count * (MPL_MAX(recvtype_true_extent, recvtype_extent)),
Packit Service c5cf8c
                        mpi_errno, "tmp_buf", MPL_MEM_BUFFER);
Packit Service c5cf8c
Packit Service c5cf8c
    /* adjust for potential negative lower bound in datatype */
Packit Service c5cf8c
    tmp_buf = (void *) ((char *) tmp_buf - recvtype_true_lb);
Packit Service c5cf8c
Packit Service c5cf8c
    /* copy local data into right location in tmp_buf */
Packit Service c5cf8c
    position = 0;
Packit Service c5cf8c
    for (i = 0; i < rank; i++)
Packit Service c5cf8c
        position += recvcounts[i];
Packit Service c5cf8c
    if (sendbuf != MPI_IN_PLACE) {
Packit Service c5cf8c
        mpi_errno = MPIR_Localcopy(sendbuf, sendcount, sendtype,
Packit Service c5cf8c
                                   ((char *) tmp_buf + position *
Packit Service c5cf8c
                                    recvtype_extent), recvcounts[rank], recvtype);
Packit Service c5cf8c
        if (mpi_errno)
Packit Service c5cf8c
            MPIR_ERR_POP(mpi_errno);
Packit Service c5cf8c
    } else {
Packit Service c5cf8c
        /* if in_place specified, local data is found in recvbuf */
Packit Service c5cf8c
        mpi_errno = MPIR_Localcopy(((char *) recvbuf +
Packit Service c5cf8c
                                    displs[rank] * recvtype_extent),
Packit Service c5cf8c
                                   recvcounts[rank], recvtype,
Packit Service c5cf8c
                                   ((char *) tmp_buf + position *
Packit Service c5cf8c
                                    recvtype_extent), recvcounts[rank], recvtype);
Packit Service c5cf8c
        if (mpi_errno)
Packit Service c5cf8c
            MPIR_ERR_POP(mpi_errno);
Packit Service c5cf8c
    }
Packit Service c5cf8c
Packit Service c5cf8c
    curr_cnt = recvcounts[rank];
Packit Service c5cf8c
Packit Service c5cf8c
    mask = 0x1;
Packit Service c5cf8c
    i = 0;
Packit Service c5cf8c
    while (mask < comm_size) {
Packit Service c5cf8c
        dst = rank ^ mask;
Packit Service c5cf8c
Packit Service c5cf8c
        /* find offset into send and recv buffers. zero out
Packit Service c5cf8c
         * the least significant "i" bits of rank and dst to
Packit Service c5cf8c
         * find root of src and dst subtrees. Use ranks of
Packit Service c5cf8c
         * roots as index to send from and recv into buffer */
Packit Service c5cf8c
Packit Service c5cf8c
        dst_tree_root = dst >> i;
Packit Service c5cf8c
        dst_tree_root <<= i;
Packit Service c5cf8c
Packit Service c5cf8c
        my_tree_root = rank >> i;
Packit Service c5cf8c
        my_tree_root <<= i;
Packit Service c5cf8c
Packit Service c5cf8c
        if (dst < comm_size) {
Packit Service c5cf8c
            send_offset = 0;
Packit Service c5cf8c
            for (j = 0; j < my_tree_root; j++)
Packit Service c5cf8c
                send_offset += recvcounts[j];
Packit Service c5cf8c
Packit Service c5cf8c
            recv_offset = 0;
Packit Service c5cf8c
            for (j = 0; j < dst_tree_root; j++)
Packit Service c5cf8c
                recv_offset += recvcounts[j];
Packit Service c5cf8c
Packit Service c5cf8c
            mpi_errno = MPIC_Sendrecv(((char *) tmp_buf + send_offset * recvtype_extent),
Packit Service c5cf8c
                                      curr_cnt, recvtype, dst,
Packit Service c5cf8c
                                      MPIR_ALLGATHERV_TAG,
Packit Service c5cf8c
                                      ((char *) tmp_buf + recv_offset * recvtype_extent),
Packit Service c5cf8c
                                      total_count - recv_offset, recvtype, dst,
Packit Service c5cf8c
                                      MPIR_ALLGATHERV_TAG, comm_ptr, &status, errflag);
Packit Service c5cf8c
            if (mpi_errno) {
Packit Service c5cf8c
                /* for communication errors, just record the error but continue */
Packit Service c5cf8c
                *errflag =
Packit Service c5cf8c
                    MPIX_ERR_PROC_FAILED ==
Packit Service c5cf8c
                    MPIR_ERR_GET_CLASS(mpi_errno) ? MPIR_ERR_PROC_FAILED : MPIR_ERR_OTHER;
Packit Service c5cf8c
                MPIR_ERR_SET(mpi_errno, *errflag, "**fail");
Packit Service c5cf8c
                MPIR_ERR_ADD(mpi_errno_ret, mpi_errno);
Packit Service c5cf8c
                last_recv_cnt = 0;
Packit Service c5cf8c
            } else
Packit Service c5cf8c
                /* for convenience, recv is posted for a bigger amount
Packit Service c5cf8c
                 * than will be sent */
Packit Service c5cf8c
                MPIR_Get_count_impl(&status, recvtype, &last_recv_cnt);
Packit Service c5cf8c
            curr_cnt += last_recv_cnt;
Packit Service c5cf8c
        }
Packit Service c5cf8c
Packit Service c5cf8c
        /* if some processes in this process's subtree in this step
Packit Service c5cf8c
         * did not have any destination process to communicate with
Packit Service c5cf8c
         * because of non-power-of-two, we need to send them the
Packit Service c5cf8c
         * data that they would normally have received from those
Packit Service c5cf8c
         * processes. That is, the haves in this subtree must send to
Packit Service c5cf8c
         * the havenots. We use a logarithmic
Packit Service c5cf8c
         * recursive-halfing algorithm for this. */
Packit Service c5cf8c
Packit Service c5cf8c
        /* This part of the code will not currently be
Packit Service c5cf8c
         * executed because we are not using recursive
Packit Service c5cf8c
         * doubling for non power of two. Mark it as experimental
Packit Service c5cf8c
         * so that it doesn't show up as red in the coverage
Packit Service c5cf8c
         * tests. */
Packit Service c5cf8c
Packit Service c5cf8c
        /* --BEGIN EXPERIMENTAL-- */
Packit Service c5cf8c
        if (dst_tree_root + mask > comm_size) {
Packit Service c5cf8c
            nprocs_completed = comm_size - my_tree_root - mask;
Packit Service c5cf8c
            /* nprocs_completed is the number of processes in this
Packit Service c5cf8c
             * subtree that have all the data. Send data to others
Packit Service c5cf8c
             * in a tree fashion. First find root of current tree
Packit Service c5cf8c
             * that is being divided into two. k is the number of
Packit Service c5cf8c
             * least-significant bits in this process's rank that
Packit Service c5cf8c
             * must be zeroed out to find the rank of the root */
Packit Service c5cf8c
            j = mask;
Packit Service c5cf8c
            k = 0;
Packit Service c5cf8c
            while (j) {
Packit Service c5cf8c
                j >>= 1;
Packit Service c5cf8c
                k++;
Packit Service c5cf8c
            }
Packit Service c5cf8c
            k--;
Packit Service c5cf8c
Packit Service c5cf8c
            tmp_mask = mask >> 1;
Packit Service c5cf8c
Packit Service c5cf8c
            while (tmp_mask) {
Packit Service c5cf8c
                dst = rank ^ tmp_mask;
Packit Service c5cf8c
Packit Service c5cf8c
                tree_root = rank >> k;
Packit Service c5cf8c
                tree_root <<= k;
Packit Service c5cf8c
Packit Service c5cf8c
                /* send only if this proc has data and destination
Packit Service c5cf8c
                 * doesn't have data. at any step, multiple processes
Packit Service c5cf8c
                 * can send if they have the data */
Packit Service c5cf8c
                if ((dst > rank) && (rank < tree_root + nprocs_completed)
Packit Service c5cf8c
                    && (dst >= tree_root + nprocs_completed)) {
Packit Service c5cf8c
Packit Service c5cf8c
                    offset = 0;
Packit Service c5cf8c
                    for (j = 0; j < (my_tree_root + mask); j++)
Packit Service c5cf8c
                        offset += recvcounts[j];
Packit Service c5cf8c
                    offset *= recvtype_extent;
Packit Service c5cf8c
Packit Service c5cf8c
                    mpi_errno = MPIC_Send(((char *) tmp_buf + offset),
Packit Service c5cf8c
                                          last_recv_cnt,
Packit Service c5cf8c
                                          recvtype, dst, MPIR_ALLGATHERV_TAG, comm_ptr, errflag);
Packit Service c5cf8c
                    if (mpi_errno) {
Packit Service c5cf8c
                        /* for communication errors, just record the error but continue */
Packit Service c5cf8c
                        *errflag =
Packit Service c5cf8c
                            MPIX_ERR_PROC_FAILED ==
Packit Service c5cf8c
                            MPIR_ERR_GET_CLASS(mpi_errno) ? MPIR_ERR_PROC_FAILED : MPIR_ERR_OTHER;
Packit Service c5cf8c
                        MPIR_ERR_SET(mpi_errno, *errflag, "**fail");
Packit Service c5cf8c
                        MPIR_ERR_ADD(mpi_errno_ret, mpi_errno);
Packit Service c5cf8c
                    }
Packit Service c5cf8c
                    /* last_recv_cnt was set in the previous
Packit Service c5cf8c
                     * receive. that's the amount of data to be
Packit Service c5cf8c
                     * sent now. */
Packit Service c5cf8c
                }
Packit Service c5cf8c
                /* recv only if this proc. doesn't have data and sender
Packit Service c5cf8c
                 * has data */
Packit Service c5cf8c
                else if ((dst < rank) &&
Packit Service c5cf8c
                         (dst < tree_root + nprocs_completed) &&
Packit Service c5cf8c
                         (rank >= tree_root + nprocs_completed)) {
Packit Service c5cf8c
Packit Service c5cf8c
                    offset = 0;
Packit Service c5cf8c
                    for (j = 0; j < (my_tree_root + mask); j++)
Packit Service c5cf8c
                        offset += recvcounts[j];
Packit Service c5cf8c
Packit Service c5cf8c
                    mpi_errno = MPIC_Recv(((char *) tmp_buf + offset * recvtype_extent),
Packit Service c5cf8c
                                          total_count - offset, recvtype,
Packit Service c5cf8c
                                          dst, MPIR_ALLGATHERV_TAG, comm_ptr, &status, errflag);
Packit Service c5cf8c
                    if (mpi_errno) {
Packit Service c5cf8c
                        /* for communication errors, just record the error but continue */
Packit Service c5cf8c
                        *errflag =
Packit Service c5cf8c
                            MPIX_ERR_PROC_FAILED ==
Packit Service c5cf8c
                            MPIR_ERR_GET_CLASS(mpi_errno) ? MPIR_ERR_PROC_FAILED : MPIR_ERR_OTHER;
Packit Service c5cf8c
                        MPIR_ERR_SET(mpi_errno, *errflag, "**fail");
Packit Service c5cf8c
                        MPIR_ERR_ADD(mpi_errno_ret, mpi_errno);
Packit Service c5cf8c
                        last_recv_cnt = 0;
Packit Service c5cf8c
                    } else
Packit Service c5cf8c
                        /* for convenience, recv is posted for a
Packit Service c5cf8c
                         * bigger amount than will be sent */
Packit Service c5cf8c
                        MPIR_Get_count_impl(&status, recvtype, &last_recv_cnt);
Packit Service c5cf8c
                    curr_cnt += last_recv_cnt;
Packit Service c5cf8c
                }
Packit Service c5cf8c
                tmp_mask >>= 1;
Packit Service c5cf8c
                k--;
Packit Service c5cf8c
            }
Packit Service c5cf8c
        }
Packit Service c5cf8c
        /* --END EXPERIMENTAL-- */
Packit Service c5cf8c
Packit Service c5cf8c
        mask <<= 1;
Packit Service c5cf8c
        i++;
Packit Service c5cf8c
    }
Packit Service c5cf8c
Packit Service c5cf8c
    /* copy data from tmp_buf to recvbuf */
Packit Service c5cf8c
    position = 0;
Packit Service c5cf8c
    for (j = 0; j < comm_size; j++) {
Packit Service c5cf8c
        if ((sendbuf != MPI_IN_PLACE) || (j != rank)) {
Packit Service c5cf8c
            /* not necessary to copy if in_place and
Packit Service c5cf8c
             * j==rank. otherwise copy. */
Packit Service c5cf8c
            mpi_errno = MPIR_Localcopy(((char *) tmp_buf + position * recvtype_extent),
Packit Service c5cf8c
                                       recvcounts[j], recvtype,
Packit Service c5cf8c
                                       ((char *) recvbuf + displs[j] * recvtype_extent),
Packit Service c5cf8c
                                       recvcounts[j], recvtype);
Packit Service c5cf8c
            if (mpi_errno)
Packit Service c5cf8c
                MPIR_ERR_POP(mpi_errno);
Packit Service c5cf8c
        }
Packit Service c5cf8c
        position += recvcounts[j];
Packit Service c5cf8c
    }
Packit Service c5cf8c
Packit Service c5cf8c
  fn_exit:
Packit Service c5cf8c
    MPIR_CHKLMEM_FREEALL();
Packit Service c5cf8c
    if (mpi_errno_ret)
Packit Service c5cf8c
        mpi_errno = mpi_errno_ret;
Packit Service c5cf8c
    else if (*errflag != MPIR_ERR_NONE)
Packit Service c5cf8c
        MPIR_ERR_SET(mpi_errno, *errflag, "**coll_fail");
Packit Service c5cf8c
Packit Service c5cf8c
    return mpi_errno;
Packit Service c5cf8c
  fn_fail:
Packit Service c5cf8c
    goto fn_exit;
Packit Service c5cf8c
}