/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
/*
* (C) 2010 by Argonne National Laboratory.
* See COPYRIGHT in top-level directory.
*/
#include "mpiimpl.h"
/*
=== BEGIN_MPI_T_CVAR_INFO_BLOCK ===
cvars:
- name : MPIR_CVAR_IALLGATHER_RECEXCH_KVAL
category : COLLECTIVE
type : int
default : 2
class : device
verbosity : MPI_T_VERBOSITY_USER_BASIC
scope : MPI_T_SCOPE_ALL_EQ
description : >-
        k value (radix) for the recursive exchange based iallgather algorithm
- name : MPIR_CVAR_IALLGATHER_BRUCKS_KVAL
category : COLLECTIVE
type : int
default : 2
class : device
verbosity : MPI_T_VERBOSITY_USER_BASIC
scope : MPI_T_SCOPE_ALL_EQ
description : >-
        radix (k value) for the Bruck's algorithm based iallgather
- name : MPIR_CVAR_IALLGATHER_INTRA_ALGORITHM
category : COLLECTIVE
type : string
default : auto
class : device
verbosity : MPI_T_VERBOSITY_USER_BASIC
scope : MPI_T_SCOPE_ALL_EQ
description : |-
Variable to select iallgather algorithm
auto - Internal algorithm selection
brucks - Force brucks algorithm
recursive_doubling - Force recursive doubling algorithm
ring - Force ring algorithm
        recexch_distance_doubling - Force generic transport based recursive exchange algorithm with neighbor distance doubling in each phase
        recexch_distance_halving - Force generic transport based recursive exchange algorithm with neighbor distance halving in each phase
gentran_brucks - Force generic transport based brucks algorithm
- name : MPIR_CVAR_IALLGATHER_INTER_ALGORITHM
category : COLLECTIVE
type : string
default : auto
class : device
verbosity : MPI_T_VERBOSITY_USER_BASIC
scope : MPI_T_SCOPE_ALL_EQ
description : |-
Variable to select iallgather algorithm
auto - Internal algorithm selection
local_gather_remote_bcast - Force local-gather-remote-bcast algorithm
- name : MPIR_CVAR_IALLGATHER_DEVICE_COLLECTIVE
category : COLLECTIVE
type : boolean
default : true
class : device
verbosity : MPI_T_VERBOSITY_USER_BASIC
scope : MPI_T_SCOPE_ALL_EQ
description : >-
If set to true, MPI_Iallgather will allow the device to override the
MPIR-level collective algorithms. The device still has the
option to call the MPIR-level algorithms manually.
If set to false, the device-level iallgather function will not be
called.
=== END_MPI_T_CVAR_INFO_BLOCK ===
*/
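/* The CVARs above are normally set through environment variables of the same
 * name (for example, MPIR_CVAR_IALLGATHER_INTRA_ALGORITHM=ring). They can
 * also be inspected portably through the standard MPI_T tools interface.
 * The sketch below is not compiled into this file; it is a minimal reader
 * that assumes only the MPI_T control-variable routines from the MPI
 * standard, and the function name is ours, purely for illustration. */
#if 0
#include <stdio.h>
#include <string.h>
#include <mpi.h>

static void print_iallgather_kval(void)
{
    int i, ncvar, provided;

    MPI_T_init_thread(MPI_THREAD_SINGLE, &provided);
    MPI_T_cvar_get_num(&ncvar);
    for (i = 0; i < ncvar; i++) {
        char name[256];
        int namelen = sizeof(name);
        int verbosity, bind, scope;
        MPI_Datatype dt;
        MPI_T_enum et;

        /* NULL description arguments are permitted by the MPI standard */
        MPI_T_cvar_get_info(i, name, &namelen, &verbosity, &dt, &et,
                            NULL, NULL, &bind, &scope);
        if (strcmp(name, "MPIR_CVAR_IALLGATHER_RECEXCH_KVAL") == 0) {
            MPI_T_cvar_handle h;
            int val, count;

            MPI_T_cvar_handle_alloc(i, NULL, &h, &count);
            MPI_T_cvar_read(h, &val);
            printf("%s = %d\n", name, val);
            MPI_T_cvar_handle_free(&h);
        }
    }
    MPI_T_finalize();
}
#endif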
/* -- Begin Profiling Symbol Block for routine MPI_Iallgather */
#if defined(HAVE_PRAGMA_WEAK)
#pragma weak MPI_Iallgather = PMPI_Iallgather
#elif defined(HAVE_PRAGMA_HP_SEC_DEF)
#pragma _HP_SECONDARY_DEF PMPI_Iallgather MPI_Iallgather
#elif defined(HAVE_PRAGMA_CRI_DUP)
#pragma _CRI duplicate MPI_Iallgather as PMPI_Iallgather
#elif defined(HAVE_WEAK_ATTRIBUTE)
int MPI_Iallgather(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf,
int recvcount, MPI_Datatype recvtype, MPI_Comm comm, MPI_Request * request)
__attribute__ ((weak, alias("PMPI_Iallgather")));
#endif
/* -- End Profiling Symbol Block */
/* Define MPICH_MPI_FROM_PMPI if weak symbols are not supported to build
the MPI routines */
#ifndef MPICH_MPI_FROM_PMPI
#undef MPI_Iallgather
#define MPI_Iallgather PMPI_Iallgather
/* This is the machine-independent implementation of allgather. The algorithm is:

   Algorithm: MPI_Allgather

   For short messages and a non-power-of-two number of processes, we use
   the algorithm from the Jehoshua Bruck et al. paper in IEEE TPDS,
   Nov. 1997. It is a variant of the dissemination algorithm for
   barrier. It takes ceiling(lg p) steps.

   Cost = lg(p).alpha + n.((p-1)/p).beta

   where n is the total size of the data gathered on each process.

   For short or medium-size messages and a power-of-two number of
   processes, we use the recursive doubling algorithm.

   Cost = lg(p).alpha + n.((p-1)/p).beta

   TODO: On TCP, we may want to use recursive doubling instead of the
   Bruck algorithm in all cases because of the pairwise-exchange property
   of recursive doubling (see the Benson et al. paper in Euro PVM/MPI 2003).

   It is interesting to note that either of the above algorithms for
   MPI_Allgather has the same cost as the tree algorithm for MPI_Gather!

   For long messages or medium-size messages and a non-power-of-two
   number of processes, we use a ring algorithm. In the first step, each
   process i sends its contribution to process i+1 and receives the
   contribution from process i-1 (with wrap-around). From the second
   step onwards, each process i forwards to process i+1 the data it
   received from process i-1 in the previous step. This takes a total of
   p-1 steps.

   Cost = (p-1).alpha + n.((p-1)/p).beta

   We use this algorithm instead of recursive doubling for long messages
   because we find that this communication pattern (nearest neighbor)
   performs twice as fast as recursive doubling for long messages (on
   Myrinet and IBM SP).

   End Algorithm: MPI_Allgather
*/
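/* Illustrative sketch of the ring algorithm described above, written against
 * the public MPI interface instead of the internal nonblocking scheduler.
 * The function name and the blocking MPI_Sendrecv formulation are ours,
 * purely for exposition; the sketch also assumes a contiguous datatype (so
 * the initial memcpy is legal) and ignores the MPI_IN_PLACE case. It is not
 * compiled into this file. */
#if 0
#include <string.h>
#include <mpi.h>

static void ring_allgather_sketch(const void *sendbuf, int count,
                                  MPI_Datatype type, void *recvbuf,
                                  MPI_Comm comm)
{
    int rank, size, step, left, right;
    MPI_Aint lb, extent;

    MPI_Comm_rank(comm, &rank);
    MPI_Comm_size(comm, &size);
    MPI_Type_get_extent(type, &lb, &extent);

    right = (rank + 1) % size;
    left = (rank - 1 + size) % size;

    /* Step 0: our own contribution lands in block 'rank' of recvbuf. */
    memcpy((char *) recvbuf + (MPI_Aint) rank * count * extent,
           sendbuf, (size_t) count * extent);

    /* Steps 1..p-1: forward the block received in the previous step to the
     * right neighbor while receiving the next block from the left. */
    for (step = 1; step < size; step++) {
        int send_block = (rank - step + 1 + size) % size;
        int recv_block = (rank - step + size) % size;
        MPI_Sendrecv((char *) recvbuf + (MPI_Aint) send_block * count * extent,
                     count, type, right, 0,
                     (char *) recvbuf + (MPI_Aint) recv_block * count * extent,
                     count, type, left, 0, comm, MPI_STATUS_IGNORE);
    }
}
#endif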
#undef FUNCNAME
#define FUNCNAME MPIR_Iallgather_sched_intra_auto
#undef FCNAME
#define FCNAME MPL_QUOTE(FUNCNAME)
int MPIR_Iallgather_sched_intra_auto(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
void *recvbuf, int recvcount, MPI_Datatype recvtype,
MPIR_Comm * comm_ptr, MPIR_Sched_t s)
{
int mpi_errno = MPI_SUCCESS;
int comm_size, recvtype_size;
    MPI_Aint tot_bytes;
if (((sendcount == 0) && (sendbuf != MPI_IN_PLACE)) || (recvcount == 0))
return MPI_SUCCESS;
comm_size = comm_ptr->local_size;
MPIR_Datatype_get_size_macro(recvtype, recvtype_size);
    tot_bytes = (MPI_Aint) recvcount * comm_size * recvtype_size;
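    /* comm_size is a power of two exactly when it has a single bit set,
     * i.e. (comm_size & (comm_size - 1)) == 0. */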
if ((tot_bytes < MPIR_CVAR_ALLGATHER_LONG_MSG_SIZE) && !(comm_size & (comm_size - 1))) {
mpi_errno =
MPIR_Iallgather_sched_intra_recursive_doubling(sendbuf, sendcount, sendtype, recvbuf,
recvcount, recvtype, comm_ptr, s);
} else if (tot_bytes < MPIR_CVAR_ALLGATHER_SHORT_MSG_SIZE) {
mpi_errno =
MPIR_Iallgather_sched_intra_brucks(sendbuf, sendcount, sendtype, recvbuf, recvcount,
recvtype, comm_ptr, s);
} else {
mpi_errno =
MPIR_Iallgather_sched_intra_ring(sendbuf, sendcount, sendtype, recvbuf, recvcount,
recvtype, comm_ptr, s);
}
if (mpi_errno)
MPIR_ERR_POP(mpi_errno);
fn_exit:
return mpi_errno;
fn_fail:
goto fn_exit;
}
#undef FUNCNAME
#define FUNCNAME MPIR_Iallgather_sched_inter_auto
#undef FCNAME
#define FCNAME MPL_QUOTE(FUNCNAME)
int MPIR_Iallgather_sched_inter_auto(const void *sendbuf, int sendcount,
MPI_Datatype sendtype, void *recvbuf, int recvcount,
MPI_Datatype recvtype, MPIR_Comm * comm_ptr, MPIR_Sched_t s)
{
int mpi_errno = MPI_SUCCESS;
mpi_errno = MPIR_Iallgather_sched_inter_local_gather_remote_bcast(sendbuf, sendcount,
sendtype, recvbuf, recvcount,
recvtype, comm_ptr, s);
return mpi_errno;
}
#undef FUNCNAME
#define FUNCNAME MPIR_Iallgather_sched_impl
#undef FCNAME
#define FCNAME MPL_QUOTE(FUNCNAME)
int MPIR_Iallgather_sched_impl(const void *sendbuf, int sendcount,
MPI_Datatype sendtype, void *recvbuf,
int recvcount, MPI_Datatype recvtype,
MPIR_Comm * comm_ptr, MPIR_Sched_t s)
{
int mpi_errno = MPI_SUCCESS;
if (comm_ptr->comm_kind == MPIR_COMM_KIND__INTRACOMM) {
/* intracommunicator */
switch (MPIR_Iallgather_intra_algo_choice) {
case MPIR_IALLGATHER_INTRA_ALGO_BRUCKS:
mpi_errno = MPIR_Iallgather_sched_intra_brucks(sendbuf, sendcount, sendtype,
recvbuf, recvcount, recvtype,
comm_ptr, s);
break;
case MPIR_IALLGATHER_INTRA_ALGO_RECURSIVE_DOUBLING:
mpi_errno = MPIR_Iallgather_sched_intra_recursive_doubling(sendbuf, sendcount,
sendtype, recvbuf,
recvcount, recvtype,
comm_ptr, s);
break;
case MPIR_IALLGATHER_INTRA_ALGO_RING:
mpi_errno = MPIR_Iallgather_sched_intra_ring(sendbuf, sendcount, sendtype, recvbuf,
recvcount, recvtype, comm_ptr, s);
break;
case MPIR_IALLGATHER_INTRA_ALGO_AUTO:
MPL_FALLTHROUGH;
default:
mpi_errno = MPIR_Iallgather_sched_intra_auto(sendbuf, sendcount, sendtype, recvbuf,
recvcount, recvtype, comm_ptr, s);
break;
}
} else {
/* intercommunicator */
switch (MPIR_Iallgather_inter_algo_choice) {
case MPIR_IALLGATHER_INTER_ALGO_LOCAL_GATHER_REMOTE_BCAST:
mpi_errno =
MPIR_Iallgather_sched_inter_local_gather_remote_bcast(sendbuf, sendcount,
sendtype, recvbuf,
recvcount, recvtype,
comm_ptr, s);
break;
case MPIR_IALLGATHER_INTER_ALGO_AUTO:
MPL_FALLTHROUGH;
default:
mpi_errno = MPIR_Iallgather_sched_inter_auto(sendbuf, sendcount, sendtype,
recvbuf, recvcount, recvtype, comm_ptr,
s);
break;
}
}
return mpi_errno;
}
#undef FUNCNAME
#define FUNCNAME MPIR_Iallgather_sched
#undef FCNAME
#define FCNAME MPL_QUOTE(FUNCNAME)
int MPIR_Iallgather_sched(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf,
int recvcount, MPI_Datatype recvtype, MPIR_Comm * comm_ptr,
MPIR_Sched_t s)
{
int mpi_errno = MPI_SUCCESS;
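    /* The device-level schedule is used only when both the collective-wide
     * switch (MPIR_CVAR_DEVICE_COLLECTIVES) and the iallgather-specific
     * switch (MPIR_CVAR_IALLGATHER_DEVICE_COLLECTIVE, described in the CVAR
     * block at the top of this file) permit it. */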
if (MPIR_CVAR_IALLGATHER_DEVICE_COLLECTIVE && MPIR_CVAR_DEVICE_COLLECTIVES) {
mpi_errno = MPID_Iallgather_sched(sendbuf, sendcount, sendtype, recvbuf, recvcount,
recvtype, comm_ptr, s);
} else {
mpi_errno = MPIR_Iallgather_sched_impl(sendbuf, sendcount, sendtype, recvbuf, recvcount,
recvtype, comm_ptr, s);
}
return mpi_errno;
}
#undef FUNCNAME
#define FUNCNAME MPIR_Iallgather_impl
#undef FCNAME
#define FCNAME MPL_QUOTE(FUNCNAME)
int MPIR_Iallgather_impl(const void *sendbuf, int sendcount,
MPI_Datatype sendtype, void *recvbuf, int recvcount,
MPI_Datatype recvtype, MPIR_Comm * comm_ptr, MPIR_Request ** request)
{
int mpi_errno = MPI_SUCCESS;
int tag = -1;
MPIR_Sched_t s = MPIR_SCHED_NULL;
*request = NULL;
/* If the user picks one of the transport-enabled algorithms, branch there
* before going down to the MPIR_Sched-based algorithms. */
/* TODO - Eventually the intention is to replace all of the
* MPIR_Sched-based algorithms with transport-enabled algorithms, but that
* will require sufficient performance testing and replacement algorithms. */
if (comm_ptr->comm_kind == MPIR_COMM_KIND__INTRACOMM) {
/* intracommunicator */
switch (MPIR_Iallgather_intra_algo_choice) {
case MPIR_IALLGATHER_INTRA_ALGO_GENTRAN_RECEXCH_DISTANCE_DOUBLING:
mpi_errno =
MPIR_Iallgather_intra_recexch_distance_doubling(sendbuf, sendcount, sendtype,
recvbuf, recvcount, recvtype,
comm_ptr, request);
if (mpi_errno)
MPIR_ERR_POP(mpi_errno);
goto fn_exit;
break;
case MPIR_IALLGATHER_INTRA_ALGO_GENTRAN_RECEXCH_DISTANCE_HALVING:
mpi_errno =
MPIR_Iallgather_intra_recexch_distance_halving(sendbuf, sendcount, sendtype,
recvbuf, recvcount, recvtype,
comm_ptr, request);
if (mpi_errno)
MPIR_ERR_POP(mpi_errno);
goto fn_exit;
break;
case MPIR_IALLGATHER_INTRA_ALGO_GENTRAN_BRUCKS:
mpi_errno =
MPIR_Iallgather_intra_gentran_brucks(sendbuf, sendcount, sendtype, recvbuf,
recvcount, recvtype, comm_ptr, request);
if (mpi_errno)
MPIR_ERR_POP(mpi_errno);
goto fn_exit;
break;
default:
/* go down to the MPIR_Sched-based algorithms */
break;
}
}
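    /* Fall through to the MPIR_Sched-based path: reserve a communicator-scoped
     * tag, create an empty schedule, let the selected algorithm populate it,
     * then start it to produce the request. */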
mpi_errno = MPIR_Sched_next_tag(comm_ptr, &tag);
if (mpi_errno)
MPIR_ERR_POP(mpi_errno);
mpi_errno = MPIR_Sched_create(&s);
if (mpi_errno)
MPIR_ERR_POP(mpi_errno);
mpi_errno =
MPIR_Iallgather_sched(sendbuf, sendcount, sendtype, recvbuf, recvcount, recvtype, comm_ptr,
s);
if (mpi_errno)
MPIR_ERR_POP(mpi_errno);
mpi_errno = MPIR_Sched_start(&s, comm_ptr, tag, request);
if (mpi_errno)
MPIR_ERR_POP(mpi_errno);
fn_exit:
return mpi_errno;
fn_fail:
goto fn_exit;
}
#undef FUNCNAME
#define FUNCNAME MPIR_Iallgather
#undef FCNAME
#define FCNAME MPL_QUOTE(FUNCNAME)
int MPIR_Iallgather(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
void *recvbuf, int recvcount, MPI_Datatype recvtype,
MPIR_Comm * comm_ptr, MPIR_Request ** request)
{
int mpi_errno = MPI_SUCCESS;
if (MPIR_CVAR_IALLGATHER_DEVICE_COLLECTIVE && MPIR_CVAR_DEVICE_COLLECTIVES) {
mpi_errno = MPID_Iallgather(sendbuf, sendcount, sendtype, recvbuf, recvcount, recvtype,
comm_ptr, request);
} else {
mpi_errno = MPIR_Iallgather_impl(sendbuf, sendcount, sendtype, recvbuf, recvcount, recvtype,
comm_ptr, request);
}
return mpi_errno;
}
#endif /* MPICH_MPI_FROM_PMPI */
#undef FUNCNAME
#define FUNCNAME MPI_Iallgather
#undef FCNAME
#define FCNAME MPL_QUOTE(FUNCNAME)
/*@
MPI_Iallgather - Gathers data from all tasks and distributes the combined data
                 to all tasks in a nonblocking way

Input Parameters:
+ sendbuf - starting address of the send buffer (choice)
. sendcount - number of elements in send buffer (non-negative integer)
. sendtype - data type of send buffer elements (handle)
. recvcount - number of elements received from any process (non-negative integer)
. recvtype - data type of receive buffer elements (handle)
- comm - communicator (handle)

Output Parameters:
+ recvbuf - starting address of the receive buffer (choice)
- request - communication request (handle)

.N ThreadSafe

.N Fortran

.N Errors
@*/
int MPI_Iallgather(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
void *recvbuf, int recvcount, MPI_Datatype recvtype,
MPI_Comm comm, MPI_Request * request)
{
int mpi_errno = MPI_SUCCESS;
MPIR_Comm *comm_ptr = NULL;
MPIR_Request *request_ptr = NULL;
MPIR_FUNC_TERSE_STATE_DECL(MPID_STATE_MPI_IALLGATHER);
MPID_THREAD_CS_ENTER(GLOBAL, MPIR_THREAD_GLOBAL_ALLFUNC_MUTEX);
MPIR_FUNC_TERSE_ENTER(MPID_STATE_MPI_IALLGATHER);
/* Validate parameters, especially handles needing to be converted */
#ifdef HAVE_ERROR_CHECKING
{
MPID_BEGIN_ERROR_CHECKS;
{
if (sendbuf != MPI_IN_PLACE) {
MPIR_ERRTEST_DATATYPE(sendtype, "sendtype", mpi_errno);
MPIR_ERRTEST_COUNT(sendcount, mpi_errno);
}
MPIR_ERRTEST_DATATYPE(recvtype, "recvtype", mpi_errno);
MPIR_ERRTEST_COMM(comm, mpi_errno);
/* TODO more checks may be appropriate */
}
MPID_END_ERROR_CHECKS;
}
#endif /* HAVE_ERROR_CHECKING */
/* Convert MPI object handles to object pointers */
MPIR_Comm_get_ptr(comm, comm_ptr);
MPIR_Assert(comm_ptr != NULL);
/* Validate parameters and objects (post conversion) */
#ifdef HAVE_ERROR_CHECKING
{
MPID_BEGIN_ERROR_CHECKS;
{
MPIR_Comm_valid_ptr(comm_ptr, mpi_errno, FALSE);
if (sendbuf != MPI_IN_PLACE && HANDLE_GET_KIND(sendtype) != HANDLE_KIND_BUILTIN) {
MPIR_Datatype *sendtype_ptr = NULL;
MPIR_Datatype_get_ptr(sendtype, sendtype_ptr);
MPIR_Datatype_valid_ptr(sendtype_ptr, mpi_errno);
if (mpi_errno != MPI_SUCCESS)
goto fn_fail;
MPIR_Datatype_committed_ptr(sendtype_ptr, mpi_errno);
if (mpi_errno != MPI_SUCCESS)
goto fn_fail;
}
if (HANDLE_GET_KIND(recvtype) != HANDLE_KIND_BUILTIN) {
MPIR_Datatype *recvtype_ptr = NULL;
MPIR_Datatype_get_ptr(recvtype, recvtype_ptr);
MPIR_Datatype_valid_ptr(recvtype_ptr, mpi_errno);
if (mpi_errno != MPI_SUCCESS)
goto fn_fail;
MPIR_Datatype_committed_ptr(recvtype_ptr, mpi_errno);
if (mpi_errno != MPI_SUCCESS)
goto fn_fail;
}
MPIR_ERRTEST_ARGNULL(request, "request", mpi_errno);
/* catch common aliasing cases */
if (comm_ptr->comm_kind == MPIR_COMM_KIND__INTRACOMM && recvbuf != MPI_IN_PLACE &&
sendtype == recvtype && sendcount == recvcount && sendcount != 0) {
int recvtype_size;
MPIR_Datatype_get_size_macro(recvtype, recvtype_size);
MPIR_ERRTEST_ALIAS_COLL(sendbuf,
(char *) recvbuf +
comm_ptr->rank * recvcount * recvtype_size, mpi_errno);
}
/* TODO more checks may be appropriate (counts, in_place, etc) */
}
MPID_END_ERROR_CHECKS;
}
#endif /* HAVE_ERROR_CHECKING */
/* ... body of routine ... */
mpi_errno = MPIR_Iallgather(sendbuf, sendcount, sendtype, recvbuf, recvcount,
recvtype, comm_ptr, &request_ptr);
if (mpi_errno)
MPIR_ERR_POP(mpi_errno);
/* create a complete request, if needed */
if (!request_ptr)
request_ptr = MPIR_Request_create_complete(MPIR_REQUEST_KIND__COLL);
/* return the handle of the request to the user */
*request = request_ptr->handle;
/* ... end of body of routine ... */
fn_exit:
MPIR_FUNC_TERSE_EXIT(MPID_STATE_MPI_IALLGATHER);
MPID_THREAD_CS_EXIT(GLOBAL, MPIR_THREAD_GLOBAL_ALLFUNC_MUTEX);
return mpi_errno;
fn_fail:
/* --BEGIN ERROR HANDLING-- */
#ifdef HAVE_ERROR_CHECKING
{
mpi_errno =
MPIR_Err_create_code(mpi_errno, MPIR_ERR_RECOVERABLE, FCNAME, __LINE__, MPI_ERR_OTHER,
"**mpi_iallgather", "**mpi_iallgather %p %d %D %p %d %D %C %p",
sendbuf, sendcount, sendtype, recvbuf, recvcount, recvtype, comm,
request);
}
#endif
mpi_errno = MPIR_Err_return_comm(comm_ptr, FCNAME, mpi_errno);
goto fn_exit;
/* --END ERROR HANDLING-- */
}
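/* A minimal usage sketch for the routine above (not compiled into this file;
 * error handling is elided and the buffer sizes are arbitrary). Each rank
 * contributes one int, overlaps independent work with the collective, and
 * then completes the request. */
#if 0
#include <stdlib.h>
#include <mpi.h>

int example_main(int argc, char **argv)
{
    int rank, size, sendval, *recvvals;
    MPI_Request req;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    sendval = rank;
    recvvals = (int *) malloc(size * sizeof(int));

    /* Start the nonblocking allgather, overlap work, then complete it. */
    MPI_Iallgather(&sendval, 1, MPI_INT, recvvals, 1, MPI_INT,
                   MPI_COMM_WORLD, &req);
    /* ... independent computation can proceed here ... */
    MPI_Wait(&req, MPI_STATUS_IGNORE);

    free(recvvals);
    MPI_Finalize();
    return 0;
}
#endif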