/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
/*
 *
 *  (C) 2001 by Argonne National Laboratory.
 *      See COPYRIGHT in top-level directory.
 */

#include "mpiimpl.h"
#include "mpicomm.h"

/*
=== BEGIN_MPI_T_CVAR_INFO_BLOCK ===

categories:
    - name        : COMMUNICATOR
      description : cvars that control communicator construction and operation

cvars:
    - name        : MPIR_CVAR_COMM_SPLIT_USE_QSORT
      category    : COMMUNICATOR
      type        : boolean
      default     : true
      class       : device
      verbosity   : MPI_T_VERBOSITY_USER_BASIC
      scope       : MPI_T_SCOPE_ALL_EQ
      description : >-
        Use qsort(3) in the implementation of MPI_Comm_split instead of bubble sort.

=== END_MPI_T_CVAR_INFO_BLOCK ===
*/

/* -- Begin Profiling Symbol Block for routine MPI_Comm_split */
#if defined(HAVE_PRAGMA_WEAK)
#pragma weak MPI_Comm_split = PMPI_Comm_split
#elif defined(HAVE_PRAGMA_HP_SEC_DEF)
#pragma _HP_SECONDARY_DEF PMPI_Comm_split MPI_Comm_split
#elif defined(HAVE_PRAGMA_CRI_DUP)
#pragma _CRI duplicate MPI_Comm_split as PMPI_Comm_split
#elif defined(HAVE_WEAK_ATTRIBUTE)
int MPI_Comm_split(MPI_Comm comm, int color, int key, MPI_Comm * newcomm)
    __attribute__ ((weak, alias("PMPI_Comm_split")));
#endif
/* -- End Profiling Symbol Block */

/* Define MPICH_MPI_FROM_PMPI if weak symbols are not supported
   to build the MPI routines */
#ifndef MPICH_MPI_FROM_PMPI
#undef MPI_Comm_split
#define MPI_Comm_split PMPI_Comm_split

/* One (color, key) pair as contributed by a single rank; an array of these
 * is filled in by an allgather over the (local) group in
 * MPIR_Comm_split_impl below. */
typedef struct splittype {
    int color, key;
} splittype;

/* Same as splittype but with an additional field to stabilize the qsort.  We
 * could just use one combined type, but using separate types simplifies the
 * allgather step.
*/
typedef struct sorttype {
    int color, key;
    int orig_idx;               /* position before sorting; used as a tie-breaker
                                 * to make the (unstable) qsort behave stably */
} sorttype;

#if defined(HAVE_QSORT)
/* qsort(3) comparator: orders sorttype entries by key ascending; equal keys
 * fall back to comparing orig_idx, so the overall sort is stable. */
static int sorttype_compare(const void *v1, const void *v2)
{
    const sorttype *s1 = v1;
    const sorttype *s2 = v2;

    if (s1->key > s2->key)
        return 1;
    if (s1->key < s2->key)
        return -1;

    /* (s1->key == s2->key), maintain original order */
    if (s1->orig_idx > s2->orig_idx)
        return 1;
    else if (s1->orig_idx < s2->orig_idx)
        return -1;

    /* --BEGIN ERROR HANDLING-- */
    /* Two entries can never share an orig_idx, so this is unreachable. */
    return 0;
    /* --END ERROR HANDLING-- */
}
#endif

/* Sort the entries in keytable into increasing order by key.  A stable
   sort must be used in case the key values are not unique: MPI requires
   that ties in key be broken by rank in the input communicator. */
static void MPIU_Sort_inttable(sorttype * keytable, int size)
{
    sorttype tmp;
    int i, j;

#if defined(HAVE_QSORT)
    /* temporary switch for profiling performance differences */
    if (MPIR_CVAR_COMM_SPLIT_USE_QSORT) {
        /* qsort isn't a stable sort, so we have to enforce stability by
         * keeping track of the original indices */
        for (i = 0; i < size; ++i)
            keytable[i].orig_idx = i;
        qsort(keytable, size, sizeof(sorttype), &sorttype_compare);
    } else
#endif
    {
        /* --BEGIN USEREXTENSION-- */
        /* Fall back to a (stable) insertion sort when qsort is
         * unavailable or disabled via the cvar. */
        for (i = 1; i < size; ++i) {
            tmp = keytable[i];
            j = i - 1;
            while (1) {
                if (keytable[j].key > tmp.key) {
                    keytable[j + 1] = keytable[j];
                    j = j - 1;
                    if (j < 0)
                        break;
                } else {
                    break;
                }
            }
            keytable[j + 1] = tmp;
        }
        /* --END USEREXTENSION-- */
    }
}

#undef FUNCNAME
#define FUNCNAME MPIR_Comm_split_impl
#undef FCNAME
#define FCNAME MPL_QUOTE(FUNCNAME)
/* Implementation of MPI_Comm_split for both intra- and intercommunicators.
 *
 * comm_ptr    - [in] communicator being split (collective over its group)
 * color       - [in] subset selector; MPI_UNDEFINED means "not a member"
 * key         - [in] rank-ordering value within the new communicator
 * newcomm_ptr - [out] new communicator, or NULL when this process gets
 *               MPI_COMM_NULL (color == MPI_UNDEFINED, or an intercomm
 *               whose matching remote group is empty)
 *
 * Returns an MPI error code (MPI_SUCCESS on success).  Temporary tables
 * are managed through the MPIR_CHKLMEM_* macros and released at fn_exit. */
int MPIR_Comm_split_impl(MPIR_Comm * comm_ptr, int color, int key, MPIR_Comm ** newcomm_ptr)
{
    int mpi_errno = MPI_SUCCESS;
    MPIR_Comm *local_comm_ptr;
    splittype *table, *remotetable = 0;
    sorttype *keytable, *remotekeytable = 0;
    int rank, size, remote_size, i, new_size, new_remote_size,
        first_entry = 0, first_remote_entry = 0, *last_ptr;
    int in_newcomm;             /* TRUE iff *newcomm should be populated */
    MPIR_Context_id_t new_context_id, remote_context_id;
    MPIR_Errflag_t errflag = MPIR_ERR_NONE;
    MPIR_Comm_map_t *mapper;
    MPIR_CHKLMEM_DECL(4);

    rank = comm_ptr->rank;
    size = comm_ptr->local_size;
    remote_size = comm_ptr->remote_size;

    /* Step 1: Find out what color and keys all of the processes have */
    MPIR_CHKLMEM_MALLOC(table, splittype *, size * sizeof(splittype), mpi_errno, "table",
                        MPL_MEM_COMM);
    table[rank].color = color;
    table[rank].key = key;

    /* Get the communicator to use in collectives on the local group of
     * processes */
    if (comm_ptr->comm_kind == MPIR_COMM_KIND__INTERCOMM) {
        if (!comm_ptr->local_comm) {
            MPII_Setup_intercomm_localcomm(comm_ptr);
        }
        local_comm_ptr = comm_ptr->local_comm;
    } else {
        local_comm_ptr = comm_ptr;
    }

    /* Gather information on the local group of processes.  Each splittype
     * travels as two MPI_INTs (color, key). */
    mpi_errno = MPIR_Allgather(MPI_IN_PLACE, 2, MPI_INT, table, 2, MPI_INT,
                               local_comm_ptr, &errflag);
    if (mpi_errno)
        MPIR_ERR_POP(mpi_errno);
    MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");

    /* Step 2: How many processes have our same color? */
    new_size = 0;
    if (color != MPI_UNDEFINED) {
        /* Also replace the color value with the index of the *next* value
         * in this set (threading a singly linked list of matching entries
         * through the no-longer-needed color fields).  The integer
         * first_entry is the index of the first element. */
        last_ptr = &first_entry;
        for (i = 0; i < size; i++) {
            /* Replace color with the index in table of the next item
             * of the same color.  We use this to efficiently populate
             * the keyval table */
            if (table[i].color == color) {
                new_size++;
                *last_ptr = i;
                last_ptr = &table[i].color;
            }
        }
    }
    /* We don't need to set the last value to -1 because we loop through
     * the list for only the known size of the group */

    /* If we're an intercomm, we need to do the same thing for the remote
     * table, as we need to know the size of the remote group of the
     * same color before deciding to create the communicator */
    if (comm_ptr->comm_kind == MPIR_COMM_KIND__INTERCOMM) {
        splittype mypair;
        /* For the remote group, the situation is more complicated.
         * We need to find the size of our "partner" group in the
         * remote comm.  The easiest way (in terms of code) is for
         * every process to essentially repeat the operation for the
         * local group - perform an (intercommunicator) all gather
         * of the color and rank information for the remote group. */
        MPIR_CHKLMEM_MALLOC(remotetable, splittype *, remote_size * sizeof(splittype), mpi_errno,
                            "remotetable", MPL_MEM_COMM);
        /* This is an intercommunicator allgather */

        /* We must use a local splittype because we've already modified the
         * entries in table to indicate the location of the next rank of the
         * same color */
        mypair.color = color;
        mypair.key = key;
        mpi_errno = MPIR_Allgather(&mypair, 2, MPI_INT, remotetable, 2, MPI_INT,
                                   comm_ptr, &errflag);
        if (mpi_errno)
            MPIR_ERR_POP(mpi_errno);
        MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");

        /* Each process can now match its color with the entries in the table */
        new_remote_size = 0;
        last_ptr = &first_remote_entry;
        for (i = 0; i < remote_size; i++) {
            /* Replace color with the index in table of the next item
             * of the same color.  We use this to efficiently populate
             * the keyval table */
            if (remotetable[i].color == color) {
                new_remote_size++;
                *last_ptr = i;
                last_ptr = &remotetable[i].color;
            }
        }
        /* Note that it might find that there are no processes in the remote
         * group with the same color.  In that case, COMM_SPLIT will
         * return a null communicator */
    } else {
        /* Set the size of the remote group to the size of our group.
         * This simplifies the test below for intercomms with an empty remote
         * group (must create comm_null) */
        new_remote_size = new_size;
    }
    in_newcomm = (color != MPI_UNDEFINED && new_remote_size > 0);

    /* Step 3: Create the communicator */
    /* Collectively create a new context id.  The same context id will
     * be used by each (disjoint) collections of processes.  The
     * processes whose color is MPI_UNDEFINED will not influence the
     * resulting context id (by passing ignore_id==TRUE). */
    /* In the multi-threaded case, MPIR_Get_contextid_sparse assumes that the
     * calling routine already holds the single critical section */
    mpi_errno = MPIR_Get_contextid_sparse(local_comm_ptr, &new_context_id, !in_newcomm);
    if (mpi_errno)
        MPIR_ERR_POP(mpi_errno);
    MPIR_Assert(new_context_id != 0);

    /* In the intercomm case, we need to exchange the context ids: the local
     * leaders (rank 0 on each side) swap ids over the intercomm, then each
     * broadcasts the remote id within its own local group. */
    if (comm_ptr->comm_kind == MPIR_COMM_KIND__INTERCOMM) {
        if (comm_ptr->rank == 0) {
            mpi_errno = MPIC_Sendrecv(&new_context_id, 1, MPIR_CONTEXT_ID_T_DATATYPE, 0, 0,
                                      &remote_context_id, 1, MPIR_CONTEXT_ID_T_DATATYPE,
                                      0, 0, comm_ptr, MPI_STATUS_IGNORE, &errflag);
            if (mpi_errno) {
                MPIR_ERR_POP(mpi_errno);
            }
            mpi_errno = MPIR_Bcast(&remote_context_id, 1, MPIR_CONTEXT_ID_T_DATATYPE, 0,
                                   local_comm_ptr, &errflag);
            if (mpi_errno)
                MPIR_ERR_POP(mpi_errno);
            MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");
        } else {
            /* Broadcast to the other members of the local group */
            mpi_errno = MPIR_Bcast(&remote_context_id, 1, MPIR_CONTEXT_ID_T_DATATYPE, 0,
                                   local_comm_ptr, &errflag);
            if (mpi_errno)
                MPIR_ERR_POP(mpi_errno);
            MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");
        }
    }

    *newcomm_ptr = NULL;

    /* Now, create the new communicator structure if necessary */
    if (in_newcomm) {

        mpi_errno = MPIR_Comm_create(newcomm_ptr);
        if (mpi_errno)
            goto fn_fail;

        (*newcomm_ptr)->recvcontext_id = new_context_id;
        (*newcomm_ptr)->local_size = new_size;
        (*newcomm_ptr)->pof2 = MPL_pof2(new_size);
        (*newcomm_ptr)->comm_kind = comm_ptr->comm_kind;
        /* Other fields depend on whether this is an intercomm or intracomm */

        /* Step 4: Order the processes by their key values.  Sort the
         * list that is stored in table.  To simplify the sort, we
         * extract the table into a smaller array and sort that.
         * Also, store in the "color" entry the rank in the input communicator
         * of the entry. */
        MPIR_CHKLMEM_MALLOC(keytable, sorttype *, new_size * sizeof(sorttype), mpi_errno,
                            "keytable", MPL_MEM_COMM);
        /* Walk the linked list threaded through table[].color (built in
         * Step 2) to collect only the entries of our color. */
        for (i = 0; i < new_size; i++) {
            keytable[i].key = table[first_entry].key;
            keytable[i].color = first_entry;
            first_entry = table[first_entry].color;
        }

        /* sort key table.  The "color" entry is the rank of the corresponding
         * process in the input communicator */
        MPIU_Sort_inttable(keytable, new_size);

        if (comm_ptr->comm_kind == MPIR_COMM_KIND__INTERCOMM) {
            MPIR_CHKLMEM_MALLOC(remotekeytable, sorttype *, new_remote_size * sizeof(sorttype),
                                mpi_errno, "remote keytable", MPL_MEM_COMM);
            for (i = 0; i < new_remote_size; i++) {
                remotekeytable[i].key = remotetable[first_remote_entry].key;
                remotekeytable[i].color = first_remote_entry;
                first_remote_entry = remotetable[first_remote_entry].color;
            }

            /* sort key table.  The "color" entry is the rank of the
             * corresponding process in the input communicator */
            MPIU_Sort_inttable(remotekeytable, new_remote_size);

            MPIR_Comm_map_irregular(*newcomm_ptr, comm_ptr, NULL,
                                    new_size, MPIR_COMM_MAP_DIR__L2L, &mapper);

            for (i = 0; i < new_size; i++) {
                mapper->src_mapping[i] = keytable[i].color;
                if (keytable[i].color == comm_ptr->rank)
                    (*newcomm_ptr)->rank = i;
            }

            /* For the remote group, the situation is more complicated.
             * We need to find the size of our "partner" group in the
             * remote comm.  The easiest way (in terms of code) is for
             * every process to essentially repeat the operation for the
             * local group - perform an (intercommunicator) all gather
             * of the color and rank information for the remote group. */

            /* We apply the same sorting algorithm to the entries that we've
             * found to get the correct order of the entries.
             *
             * Note that if new_remote_size is 0 (no matching processes with
             * the same color in the remote group), then MPI_COMM_SPLIT
             * is required to return MPI_COMM_NULL instead of an intercomm
             * with an empty remote group. */
            MPIR_Comm_map_irregular(*newcomm_ptr, comm_ptr, NULL,
                                    new_remote_size, MPIR_COMM_MAP_DIR__R2R, &mapper);

            for (i = 0; i < new_remote_size; i++)
                mapper->src_mapping[i] = remotekeytable[i].color;

            (*newcomm_ptr)->context_id = remote_context_id;
            (*newcomm_ptr)->remote_size = new_remote_size;
            (*newcomm_ptr)->local_comm = 0;
            (*newcomm_ptr)->is_low_group = comm_ptr->is_low_group;

        } else {
            /* INTRA Communicator */
            (*newcomm_ptr)->context_id = (*newcomm_ptr)->recvcontext_id;
            (*newcomm_ptr)->remote_size = new_size;

            MPIR_Comm_map_irregular(*newcomm_ptr, comm_ptr, NULL,
                                    new_size, MPIR_COMM_MAP_DIR__L2L, &mapper);

            for (i = 0; i < new_size; i++) {
                mapper->src_mapping[i] = keytable[i].color;
                if (keytable[i].color == comm_ptr->rank)
                    (*newcomm_ptr)->rank = i;
            }
        }

        /* Inherit the error handler (if any) */
        MPID_THREAD_CS_ENTER(POBJ, MPIR_THREAD_POBJ_COMM_MUTEX(comm_ptr));
        (*newcomm_ptr)->errhandler = comm_ptr->errhandler;
        if (comm_ptr->errhandler) {
            MPIR_Errhandler_add_ref(comm_ptr->errhandler);
        }
        MPID_THREAD_CS_EXIT(POBJ, MPIR_THREAD_POBJ_COMM_MUTEX(comm_ptr));

        mpi_errno = MPIR_Comm_commit(*newcomm_ptr);
        if (mpi_errno)
            MPIR_ERR_POP(mpi_errno);
    }

  fn_exit:
    MPIR_CHKLMEM_FREEALL();
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}
#endif

#undef FUNCNAME
#define FUNCNAME MPI_Comm_split
#undef FCNAME
#define FCNAME MPL_QUOTE(FUNCNAME)

/*@

MPI_Comm_split - Creates new communicators based on colors and keys

Input Parameters:
+ comm - communicator (handle)
. color - control of subset assignment (nonnegative integer).  Processes
  with the same color are in the same new communicator
- key - control of rank assignment (integer)

Output Parameters:
. newcomm - new communicator (handle)

Notes:
  The 'color' must be non-negative or 'MPI_UNDEFINED'.

.N ThreadSafe

.N Fortran

Algorithm:
.vb
  1. Use MPI_Allgather to get the color and key from each process
  2. Count the number of processes with the same color; create a
     communicator with that many processes.
If this process has
     'MPI_UNDEFINED' as the color, create a process with a single member.
  3. Use key to order the ranks
.ve

.N Errors
.N MPI_SUCCESS
.N MPI_ERR_COMM
.N MPI_ERR_EXHAUSTED

.seealso: MPI_Comm_free
@*/
int MPI_Comm_split(MPI_Comm comm, int color, int key, MPI_Comm * newcomm)
{
    int mpi_errno = MPI_SUCCESS;
    MPIR_Comm *comm_ptr = NULL, *newcomm_ptr;
    MPIR_FUNC_TERSE_STATE_DECL(MPID_STATE_MPI_COMM_SPLIT);

    MPIR_ERRTEST_INITIALIZED_ORDIE();

    MPID_THREAD_CS_ENTER(GLOBAL, MPIR_THREAD_GLOBAL_ALLFUNC_MUTEX);
    MPIR_FUNC_TERSE_ENTER(MPID_STATE_MPI_COMM_SPLIT);

    /* Validate parameters, especially handles needing to be converted */
#ifdef HAVE_ERROR_CHECKING
    {
        MPID_BEGIN_ERROR_CHECKS;
        {
            MPIR_ERRTEST_COMM(comm, mpi_errno);
        }
        MPID_END_ERROR_CHECKS;
    }
#endif /* HAVE_ERROR_CHECKING */

    /* Get handles to MPI objects. */
    MPIR_Comm_get_ptr(comm, comm_ptr);

    /* Validate parameters and objects (post conversion) */
#ifdef HAVE_ERROR_CHECKING
    {
        MPID_BEGIN_ERROR_CHECKS;
        {
            /* Validate comm_ptr */
            MPIR_Comm_valid_ptr(comm_ptr, mpi_errno, FALSE);
            /* If comm_ptr is not valid, it will be reset to null */
            if (mpi_errno)
                goto fn_fail;
            MPIR_ERRTEST_ARGNULL(newcomm, "newcomm", mpi_errno);
        }
        MPID_END_ERROR_CHECKS;
    }
#endif /* HAVE_ERROR_CHECKING */

    /* ... body of routine ... */

    mpi_errno = MPIR_Comm_split_impl(comm_ptr, color, key, &newcomm_ptr);
    if (mpi_errno)
        MPIR_ERR_POP(mpi_errno);
    /* The impl returns a NULL pointer (not an error) when this process is
     * not a member of any resulting communicator; map that to MPI_COMM_NULL. */
    if (newcomm_ptr)
        MPIR_OBJ_PUBLISH_HANDLE(*newcomm, newcomm_ptr->handle);
    else
        *newcomm = MPI_COMM_NULL;

    /* ... end of body of routine ... */

  fn_exit:
    MPIR_FUNC_TERSE_EXIT(MPID_STATE_MPI_COMM_SPLIT);
    MPID_THREAD_CS_EXIT(GLOBAL, MPIR_THREAD_GLOBAL_ALLFUNC_MUTEX);
    return mpi_errno;

  fn_fail:
    /* --BEGIN ERROR HANDLING-- */
#ifdef HAVE_ERROR_CHECKING
    {
        mpi_errno =
            MPIR_Err_create_code(mpi_errno, MPIR_ERR_RECOVERABLE, FCNAME, __LINE__, MPI_ERR_OTHER,
                                 "**mpi_comm_split", "**mpi_comm_split %C %d %d %p", comm, color,
                                 key, newcomm);
    }
#endif
    mpi_errno = MPIR_Err_return_comm(comm_ptr, FCNAME, mpi_errno);
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}