/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */ /* * * (C) 2001 by Argonne National Laboratory. * See COPYRIGHT in top-level directory. */ #include "mpiimpl.h" #include "mpicomm.h" #ifdef HAVE_HWLOC #include "hwloc.h" #endif #ifdef HAVE_NETLOC #include "netloc_util.h" #endif /* -- Begin Profiling Symbol Block for routine MPI_Comm_split_type */ #if defined(HAVE_PRAGMA_WEAK) #pragma weak MPI_Comm_split_type = PMPI_Comm_split_type #elif defined(HAVE_PRAGMA_HP_SEC_DEF) #pragma _HP_SECONDARY_DEF PMPI_Comm_split_type MPI_Comm_split_type #elif defined(HAVE_PRAGMA_CRI_DUP) #pragma _CRI duplicate MPI_Comm_split_type as PMPI_Comm_split_type #elif defined(HAVE_WEAK_ATTRIBUTE) int MPI_Comm_split_type(MPI_Comm comm, int split_type, int key, MPI_Info info, MPI_Comm * newcomm) __attribute__ ((weak, alias("PMPI_Comm_split_type"))); #endif /* -- End Profiling Symbol Block */ /* Define MPICH_MPI_FROM_PMPI if weak symbols are not supported to build the MPI routines */ #ifndef MPICH_MPI_FROM_PMPI #undef MPI_Comm_split_type #define MPI_Comm_split_type PMPI_Comm_split_type #undef FUNCNAME #define FUNCNAME MPIR_Comm_split_type_self #undef FCNAME #define FCNAME MPL_QUOTE(FUNCNAME) int MPIR_Comm_split_type_self(MPIR_Comm * user_comm_ptr, int split_type, int key, MPIR_Comm ** newcomm_ptr) { MPIR_Comm *comm_ptr = NULL; MPIR_Comm *comm_self_ptr; int mpi_errno = MPI_SUCCESS; /* split out the undefined processes */ mpi_errno = MPIR_Comm_split_impl(user_comm_ptr, split_type == MPI_UNDEFINED ? MPI_UNDEFINED : 0, key, &comm_ptr); if (mpi_errno) MPIR_ERR_POP(mpi_errno); if (split_type == MPI_UNDEFINED) { *newcomm_ptr = NULL; goto fn_exit; } MPIR_Comm_get_ptr(MPI_COMM_SELF, comm_self_ptr); mpi_errno = MPIR_Comm_dup_impl(comm_self_ptr, newcomm_ptr); if (mpi_errno) MPIR_ERR_POP(mpi_errno); fn_exit: if (comm_ptr) MPIR_Comm_free_impl(comm_ptr); return mpi_errno; fn_fail: goto fn_exit; } #undef FUNCNAME #define FUNCNAME MPIR_Comm_split_type_node #undef FCNAME #define FCNAME MPL_QUOTE(FUNCNAME) int MPIR_Comm_split_type_node(MPIR_Comm * user_comm_ptr, int split_type, int key, MPIR_Comm ** newcomm_ptr) { MPIR_Comm *comm_ptr = NULL; int mpi_errno = MPI_SUCCESS; int color; /* split out the undefined processes */ mpi_errno = MPIR_Comm_split_impl(user_comm_ptr, split_type == MPI_UNDEFINED ? MPI_UNDEFINED : 0, key, &comm_ptr); if (mpi_errno) MPIR_ERR_POP(mpi_errno); if (split_type == MPI_UNDEFINED) { *newcomm_ptr = NULL; goto fn_exit; } mpi_errno = MPID_Get_node_id(comm_ptr, comm_ptr->rank, &color); if (mpi_errno) MPIR_ERR_POP(mpi_errno); mpi_errno = MPIR_Comm_split_impl(comm_ptr, color, key, newcomm_ptr); if (mpi_errno) MPIR_ERR_POP(mpi_errno); fn_exit: if (comm_ptr) MPIR_Comm_free_impl(comm_ptr); return mpi_errno; fn_fail: goto fn_exit; } #ifdef HAVE_HWLOC struct shmem_processor_info_table { const char *val; hwloc_obj_type_t obj_type; }; /* hwloc processor object table */ static struct shmem_processor_info_table shmem_processor_info[] = { {"machine", HWLOC_OBJ_MACHINE}, {"socket", HWLOC_OBJ_PACKAGE}, {"package", HWLOC_OBJ_PACKAGE}, {"numa", HWLOC_OBJ_NUMANODE}, {"core", HWLOC_OBJ_CORE}, {"hwthread", HWLOC_OBJ_PU}, {"pu", HWLOC_OBJ_PU}, {"l1dcache", HWLOC_OBJ_L1CACHE}, {"l1ucache", HWLOC_OBJ_L1CACHE}, {"l1icache", HWLOC_OBJ_L1ICACHE}, {"l1cache", HWLOC_OBJ_L1CACHE}, {"l2dcache", HWLOC_OBJ_L2CACHE}, {"l2ucache", HWLOC_OBJ_L2CACHE}, {"l2icache", HWLOC_OBJ_L2ICACHE}, {"l2cache", HWLOC_OBJ_L2CACHE}, {"l3dcache", HWLOC_OBJ_L3CACHE}, {"l3ucache", HWLOC_OBJ_L3CACHE}, {"l3icache", HWLOC_OBJ_L3ICACHE}, {"l3cache", HWLOC_OBJ_L3CACHE}, {"l4dcache", HWLOC_OBJ_L4CACHE}, {"l4ucache", HWLOC_OBJ_L4CACHE}, {"l4cache", HWLOC_OBJ_L4CACHE}, {"l5dcache", HWLOC_OBJ_L5CACHE}, {"l5ucache", HWLOC_OBJ_L5CACHE}, {"l5cache", HWLOC_OBJ_L5CACHE}, {NULL, HWLOC_OBJ_TYPE_MAX} }; static int node_split_processor(MPIR_Comm * comm_ptr, int key, const char *hintval, MPIR_Comm ** newcomm_ptr) { int color; hwloc_obj_t obj_containing_cpuset; hwloc_obj_type_t query_obj_type = HWLOC_OBJ_TYPE_MAX; int i, mpi_errno = MPI_SUCCESS; /* assign the node id as the color, initially */ MPID_Get_node_id(comm_ptr, comm_ptr->rank, &color); /* try to find the info value in the processor object * table */ for (i = 0; shmem_processor_info[i].val; i++) { if (!strcmp(shmem_processor_info[i].val, hintval)) { query_obj_type = shmem_processor_info[i].obj_type; break; } } if (query_obj_type == HWLOC_OBJ_TYPE_MAX) goto split_id; obj_containing_cpuset = hwloc_get_obj_covering_cpuset(MPIR_Process.hwloc_topology, MPIR_Process.bindset); MPIR_Assert(obj_containing_cpuset != NULL); if (obj_containing_cpuset->type == query_obj_type) { color = obj_containing_cpuset->logical_index; } else { hwloc_obj_t hobj = NULL; hwloc_obj_t tmp = NULL; /* hwloc_get_ancestor_of_type call cannot be used here because HWLOC version 2.0 and above do not * treat memory objects (NUMA) as objects in topology tree (Details can be found in * https://www.open-mpi.org/projects/hwloc/doc/v2.0.1/a00327.php#upgrade_to_api_2x_memory_find) */ while ((tmp = hwloc_get_next_obj_by_type(MPIR_Process.hwloc_topology, query_obj_type, tmp)) != NULL) { if (hwloc_bitmap_isincluded(obj_containing_cpuset->cpuset, tmp->cpuset) || hwloc_bitmap_isequal(tmp->cpuset, obj_containing_cpuset->cpuset)) { hobj = tmp; break; } } if (hobj) color = hobj->logical_index; else color = MPI_UNDEFINED; } split_id: mpi_errno = MPIR_Comm_split_impl(comm_ptr, color, key, newcomm_ptr); if (mpi_errno) MPIR_ERR_POP(mpi_errno); fn_exit: return mpi_errno; fn_fail: goto fn_exit; } static int node_split_pci_device(MPIR_Comm * comm_ptr, int key, const char *hintval, MPIR_Comm ** newcomm_ptr) { hwloc_obj_t obj_containing_cpuset, io_device = NULL; int mpi_errno = MPI_SUCCESS; int color; obj_containing_cpuset = hwloc_get_obj_covering_cpuset(MPIR_Process.hwloc_topology, MPIR_Process.bindset); MPIR_Assert(obj_containing_cpuset != NULL); io_device = hwloc_get_pcidev_by_busidstring(MPIR_Process.hwloc_topology, hintval + strlen("pci:")); if (io_device != NULL) { hwloc_obj_t non_io_ancestor = hwloc_get_non_io_ancestor_obj(MPIR_Process.hwloc_topology, io_device); /* An io object will never be the root of the topology and is * hence guaranteed to have a non io ancestor */ MPIR_Assert(non_io_ancestor); if (hwloc_obj_is_in_subtree (MPIR_Process.hwloc_topology, obj_containing_cpuset, non_io_ancestor)) { color = non_io_ancestor->logical_index; } else color = MPI_UNDEFINED; } else color = MPI_UNDEFINED; mpi_errno = MPIR_Comm_split_impl(comm_ptr, color, key, newcomm_ptr); if (mpi_errno) MPIR_ERR_POP(mpi_errno); fn_exit: return mpi_errno; fn_fail: goto fn_exit; } static int io_device_found(const char *resource, const char *devname, hwloc_obj_t io_device, hwloc_obj_osdev_type_t obj_type) { if (!strncmp(resource, devname, strlen(devname))) { /* device type does not match */ if (io_device->attr->osdev.type != obj_type) return 0; /* device prefix does not match */ if (strncmp(io_device->name, devname, strlen(devname))) return 0; /* specific device is supplied, but does not match */ if (strlen(resource) != strlen(devname) && strcmp(io_device->name, resource)) return 0; } return 1; } static int node_split_network_device(MPIR_Comm * comm_ptr, int key, const char *hintval, MPIR_Comm ** newcomm_ptr) { hwloc_obj_t obj_containing_cpuset, io_device = NULL; int mpi_errno = MPI_SUCCESS; int color; /* assign the node id as the color, initially */ MPID_Get_node_id(comm_ptr, comm_ptr->rank, &color); obj_containing_cpuset = hwloc_get_obj_covering_cpuset(MPIR_Process.hwloc_topology, MPIR_Process.bindset); MPIR_Assert(obj_containing_cpuset != NULL); color = MPI_UNDEFINED; while ((io_device = hwloc_get_next_osdev(MPIR_Process.hwloc_topology, io_device))) { hwloc_obj_t non_io_ancestor; uint32_t depth; if (!io_device_found(hintval, "hfi", io_device, HWLOC_OBJ_OSDEV_OPENFABRICS)) continue; if (!io_device_found(hintval, "ib", io_device, HWLOC_OBJ_OSDEV_NETWORK)) continue; if (!io_device_found(hintval, "eth", io_device, HWLOC_OBJ_OSDEV_NETWORK) && !io_device_found(hintval, "en", io_device, HWLOC_OBJ_OSDEV_NETWORK)) continue; non_io_ancestor = hwloc_get_non_io_ancestor_obj(MPIR_Process.hwloc_topology, io_device); while (!hwloc_obj_type_is_normal(non_io_ancestor->type)) non_io_ancestor = non_io_ancestor->parent; MPIR_Assert(non_io_ancestor && non_io_ancestor->depth >= 0); if (!hwloc_obj_is_in_subtree (MPIR_Process.hwloc_topology, obj_containing_cpuset, non_io_ancestor)) continue; /* Get a unique ID for the non-IO object. Use fixed width * unsigned integers, so bit shift operations are well * defined */ depth = (uint32_t) non_io_ancestor->depth; color = (int) ((depth << 16) + non_io_ancestor->logical_index); break; } mpi_errno = MPIR_Comm_split_impl(comm_ptr, color, key, newcomm_ptr); if (mpi_errno) MPIR_ERR_POP(mpi_errno); fn_exit: return mpi_errno; fn_fail: goto fn_exit; } static int node_split_gpu_device(MPIR_Comm * comm_ptr, int key, const char *hintval, MPIR_Comm ** newcomm_ptr) { hwloc_obj_t obj_containing_cpuset, io_device = NULL; int mpi_errno = MPI_SUCCESS; int color; obj_containing_cpuset = hwloc_get_obj_covering_cpuset(MPIR_Process.hwloc_topology, MPIR_Process.bindset); MPIR_Assert(obj_containing_cpuset != NULL); color = MPI_UNDEFINED; while ((io_device = hwloc_get_next_osdev(MPIR_Process.hwloc_topology, io_device)) != NULL) { if (io_device->attr->osdev.type == HWLOC_OBJ_OSDEV_GPU) { if ((*(hintval + strlen("gpu")) != '\0') && atoi(hintval + strlen("gpu")) != io_device->logical_index) continue; hwloc_obj_t non_io_ancestor = hwloc_get_non_io_ancestor_obj(MPIR_Process.hwloc_topology, io_device); MPIR_Assert(non_io_ancestor); if (hwloc_obj_is_in_subtree (MPIR_Process.hwloc_topology, obj_containing_cpuset, non_io_ancestor)) { color = (non_io_ancestor->type << (sizeof(int) * 4)) + non_io_ancestor->logical_index; break; } } } mpi_errno = MPIR_Comm_split_impl(comm_ptr, color, key, newcomm_ptr); if (mpi_errno) MPIR_ERR_POP(mpi_errno); fn_exit: return mpi_errno; fn_fail: goto fn_exit; } #endif /* HAVE_HWLOC */ #ifdef HAVE_NETLOC static int network_split_switch_level(MPIR_Comm * comm_ptr, int key, int switch_level, MPIR_Comm ** newcomm_ptr) { int i, color; int mpi_errno = MPI_SUCCESS; netloc_node_t *network_node; netloc_node_t **traversal_stack; int traversal_begin, traversal_end; if (MPIR_Process.network_attr.type == MPIR_NETLOC_NETWORK_TYPE__FAT_TREE || MPIR_Process.network_attr.type == MPIR_NETLOC_NETWORK_TYPE__CLOS_NETWORK) { netloc_node_t **switches_at_level; int switch_count; traversal_stack = (netloc_node_t **) MPL_malloc(sizeof(netloc_node_t *) * MPIR_Process.netloc_topology->num_nodes, MPL_MEM_OTHER); network_node = MPIR_Process.network_attr.network_endpoint; traversal_begin = 0; traversal_end = 0; MPIR_Netloc_get_switches_at_level(MPIR_Process.netloc_topology, MPIR_Process.network_attr, switch_level, &switches_at_level, &switch_count); /* Find the switch `switch_level` steps away */ MPIR_Assert(traversal_end < MPIR_Process.netloc_topology->num_nodes); traversal_stack[traversal_end++] = network_node; color = 0; while (traversal_end > traversal_begin) { netloc_node_t *current_node = traversal_stack[traversal_begin++]; int num_edges; netloc_edge_t **edges; if (current_node->node_type == NETLOC_NODE_TYPE_SWITCH && MPIR_Process.network_attr.u.tree.node_levels[current_node->__uid__] == switch_level) { for (i = 0; i < switch_count; i++) { if (switches_at_level[i] == current_node) { color = color & (1 < i); break; } } } else { continue; } /*find all nodes not visited with an edge from the current node */ netloc_get_all_edges(MPIR_Process.netloc_topology, network_node, &num_edges, &edges); for (i = 0; i < num_edges; i++) { MPIR_Assert(traversal_end < MPIR_Process.netloc_topology->num_nodes); traversal_stack[traversal_end++] = edges[i]->dest_node; } } if (color == 0) { color = MPI_UNDEFINED; } MPL_free(traversal_stack); MPL_free(switches_at_level); } else { color = MPI_UNDEFINED; } mpi_errno = MPIR_Comm_split_impl(comm_ptr, color, key, newcomm_ptr); if (mpi_errno) MPIR_ERR_POP(mpi_errno); fn_exit: return mpi_errno; fn_fail: goto fn_exit; } static int get_color_from_subset_bitmap(int node_index, int *bitmap, int bitmap_size, int min_size) { int color; int subset_size; int current_comm_color; int prev_comm_color; int i; subset_size = 0; current_comm_color = 0; prev_comm_color = -1; for (i = 0; i < bitmap_size; i++) { if (subset_size >= min_size) { subset_size = 0; prev_comm_color = current_comm_color; current_comm_color = i; } subset_size += bitmap[i]; if (i == node_index) { color = current_comm_color; } } if (subset_size < min_size && i == bitmap_size) color = prev_comm_color; fn_exit: return color; fn_fail: goto fn_exit; } static int network_split_by_minsize(MPIR_Comm * comm_ptr, int key, int subcomm_min_size, MPIR_Comm ** newcomm_ptr) { int mpi_errno = MPI_SUCCESS; int i, color; int comm_size = MPIR_Comm_size(comm_ptr); netloc_node_t *network_node; if (subcomm_min_size == 0 || comm_size < subcomm_min_size || MPIR_Process.network_attr.type == MPIR_NETLOC_NETWORK_TYPE__INVALID) { *newcomm_ptr = NULL; } else { int node_index, num_nodes, i; int *num_processes_at_node = NULL; MPIR_Errflag_t errflag = MPIR_ERR_NONE; int subset_size; int current_comm_color; int prev_comm_color; network_node = MPIR_Process.network_attr.network_endpoint; if (MPIR_Process.network_attr.type == MPIR_NETLOC_NETWORK_TYPE__FAT_TREE || MPIR_Process.network_attr.type == MPIR_NETLOC_NETWORK_TYPE__CLOS_NETWORK) { mpi_errno = MPIR_Netloc_get_hostnode_index_in_tree(MPIR_Process.network_attr, MPIR_Process.netloc_topology, network_node, &node_index, &num_nodes); if (mpi_errno) MPIR_ERR_POP(mpi_errno); num_processes_at_node = (int *) MPL_calloc(1, sizeof(int) * num_nodes, MPL_MEM_OTHER); num_processes_at_node[node_index] = 1; } else if (MPIR_Process.network_attr.type == MPIR_NETLOC_NETWORK_TYPE__TORUS) { num_processes_at_node = (int *) MPL_calloc(1, sizeof(int) * MPIR_Process.netloc_topology->num_nodes, MPL_MEM_OTHER); num_processes_at_node[MPIR_Process.network_attr.u.torus.node_idx] = 1; } MPIR_Assert(num_processes_at_node != NULL); /* Send the count to processes */ mpi_errno = MPID_Allreduce(MPI_IN_PLACE, num_processes_at_node, num_nodes, MPI_INT, MPI_SUM, comm_ptr, &errflag); if (MPIR_Process.network_attr.type == MPIR_NETLOC_NETWORK_TYPE__FAT_TREE || MPIR_Process.network_attr.type == MPIR_NETLOC_NETWORK_TYPE__CLOS_NETWORK) { color = get_color_from_subset_bitmap(node_index, num_processes_at_node, num_nodes, subcomm_min_size); } else { int *offset_along_dimension = (int *) MPL_calloc(MPIR_Process.network_attr.u.torus.dimension, sizeof(int), MPL_MEM_OTHER); int *partition = (int *) MPL_calloc(MPIR_Process.network_attr.u.torus.dimension, sizeof(int), MPL_MEM_OTHER); int start_index = offset_along_dimension[0]; int num_processes = 0, total_num_processes = 0; int j, size; for (i = 0; i < MPIR_Process.network_attr.u.torus.dimension; i++) { partition[i] = 1; } while (1) { int node_covered = 0; color = total_num_processes; for (i = 0; i < MPIR_Process.network_attr.u.torus.dimension; i = (i + 1) % MPIR_Process.network_attr.u.torus.dimension) { int cube_size; if (partition[i] - 1 + offset_along_dimension[i] == MPIR_Process.network_attr.u.torus.geometry[i]) { if (i == MPIR_Process.network_attr.u.torus.dimension - 1) { break; } continue; } partition[i]++; cube_size = 0; for (j = 0; j < MPIR_Process.network_attr.u.torus.dimension; j++) { if (partition[j] != 0) { cube_size = cube_size * partition[j]; } } num_processes = 0; for (j = 0; j < cube_size; j++) { int *coordinate = (int *) MPL_calloc(MPIR_Process.network_attr.u.torus.dimension, sizeof(int), MPL_MEM_OTHER); int index = j; int k; int current_dim = 0; while (current_dim < MPIR_Process.network_attr.u.torus.dimension) { coordinate[current_dim++] = index % partition[j]; index = index / partition[j]; } index = 0; for (k = 0; k < MPIR_Process.network_attr.u.torus.dimension; k++) { index = index * (partition[j] + offset_along_dimension[i]) + coordinate[k]; } if (index == MPIR_Process.network_attr.u.torus.node_idx) { node_covered = 1; break; } num_processes += num_processes_at_node[index]; MPL_free(coordinate); } if (num_processes >= subcomm_min_size) { total_num_processes += num_processes; num_processes = 0; for (j = 0; j < MPIR_Process.network_attr.u.torus.dimension; j++) { offset_along_dimension[i] += partition[j] + 1; } break; } } if (total_num_processes == MPIR_Process.netloc_topology->num_nodes || node_covered) { break; } } MPL_free(offset_along_dimension); MPL_free(partition); } mpi_errno = MPIR_Comm_split_impl(comm_ptr, color, key, newcomm_ptr); if (mpi_errno) MPIR_ERR_POP(mpi_errno); /* There are more processes in the subset than requested within the node. * Split further inside each node */ if (num_processes_at_node[node_index] > subcomm_min_size && node_index == color && ((node_index < (node_index - 1) || num_processes_at_node[node_index] < subcomm_min_size))) { MPIR_Comm *node_comm; int subcomm_rank; int min_tree_depth; hwloc_cpuset_t *node_comm_bindset; int num_procs; num_procs = num_processes_at_node[node_index]; node_comm = *newcomm_ptr; subcomm_rank = MPIR_Comm_rank(node_comm); node_comm_bindset = (hwloc_cpuset_t *) MPL_calloc(num_procs, sizeof(hwloc_cpuset_t), MPL_MEM_OTHER); node_comm_bindset[subcomm_rank] = MPIR_Process.bindset; /* Send the bindset to processes in node communicator */ mpi_errno = MPID_Allreduce(MPI_IN_PLACE, node_comm_bindset, num_procs * sizeof(hwloc_cpuset_t), MPI_BYTE, MPI_NO_OP, node_comm, &errflag); min_tree_depth = -1; for (i = 0; i < num_procs; i++) { hwloc_obj_t obj_containing_cpuset = hwloc_get_obj_covering_cpuset(MPIR_Process.hwloc_topology, node_comm_bindset[i]); if (obj_containing_cpuset->depth < min_tree_depth || min_tree_depth == -1) { min_tree_depth = obj_containing_cpuset->depth; } } if (min_tree_depth) { int num_hwloc_objs_at_depth = hwloc_get_nbobjs_by_depth(MPIR_Process.hwloc_topology, min_tree_depth); int *processes_cpuset = (int *) MPL_calloc(num_hwloc_objs_at_depth, sizeof(int), MPL_MEM_OTHER); hwloc_obj_t parent_obj; int hw_obj_index; int current_proc_index = -1; parent_obj = NULL; hw_obj_index = 0; while ((parent_obj = hwloc_get_next_obj_by_depth(MPIR_Process.hwloc_topology, min_tree_depth, parent_obj)) != NULL) { for (i = 0; i < num_procs; i++) { if (hwloc_bitmap_isincluded(parent_obj->cpuset, node_comm_bindset[i]) || hwloc_bitmap_isequal(parent_obj->cpuset, node_comm_bindset[i])) { processes_cpuset[hw_obj_index] = 1; if (i == subcomm_rank) { current_proc_index = hw_obj_index; } break; } } hw_obj_index++; } color = get_color_from_subset_bitmap(current_proc_index, processes_cpuset, num_hwloc_objs_at_depth, subcomm_min_size); mpi_errno = MPIR_Comm_split_impl(node_comm, color, key, newcomm_ptr); if (mpi_errno) MPIR_ERR_POP(mpi_errno); MPL_free(processes_cpuset); MPIR_Comm_free_impl(node_comm); } MPL_free(node_comm_bindset); } MPL_free(num_processes_at_node); } fn_exit: return mpi_errno; fn_fail: goto fn_exit; } static int network_split_by_min_memsize(MPIR_Comm * comm_ptr, int key, long min_mem_size, MPIR_Comm ** newcomm_ptr) { int mpi_errno = MPI_SUCCESS; int i, color; netloc_node_t *network_node; /* Get available memory in the node */ hwloc_obj_t memory_obj = NULL; long total_memory_size = 0; int memory_per_process; while ((memory_obj = hwloc_get_next_obj_by_type(MPIR_Process.hwloc_topology, HWLOC_OBJ_NUMANODE, memory_obj)) != NULL) { /* Memory size is in bytes here */ total_memory_size += memory_obj->total_memory; } if (min_mem_size == 0 || MPIR_Process.network_attr.type == MPIR_NETLOC_NETWORK_TYPE__INVALID) { *newcomm_ptr = NULL; } else { int num_ranks_node; if (MPIR_Process.comm_world->node_comm != NULL) { num_ranks_node = MPIR_Comm_size(MPIR_Process.comm_world->node_comm); } else { num_ranks_node = 1; } memory_per_process = total_memory_size / num_ranks_node; mpi_errno = network_split_by_minsize(comm_ptr, key, min_mem_size / memory_per_process, newcomm_ptr); } fn_exit: return mpi_errno; fn_fail: goto fn_exit; } static int network_split_by_torus_dimension(MPIR_Comm * comm_ptr, int key, int dimension, MPIR_Comm ** newcomm_ptr) { int mpi_errno = MPI_SUCCESS; int i, color; int comm_size = MPIR_Comm_size(comm_ptr); /* Dimension is assumed to be indexed from 0 */ if (MPIR_Process.network_attr.type != MPIR_NETLOC_NETWORK_TYPE__TORUS || dimension >= MPIR_Process.network_attr.u.torus.dimension) { *newcomm_ptr = NULL; } else { int node_coordinates = MPIR_Process.network_attr.u.torus.node_idx; int *node_dimensions = MPIR_Process.network_attr.u.torus.geometry; color = 0; for (i = 0; i < MPIR_Process.network_attr.u.torus.dimension; i++) { int coordinate_along_dim; if (i == dimension) { coordinate_along_dim = 0; } else { coordinate_along_dim = node_coordinates % node_dimensions[i]; } if (i == 0) { color = coordinate_along_dim; } else { color = color + coordinate_along_dim * node_dimensions[i - 1]; } node_coordinates = node_coordinates / node_dimensions[i]; } mpi_errno = MPIR_Comm_split_impl(comm_ptr, color, key, newcomm_ptr); } fn_exit: return mpi_errno; fn_fail: goto fn_exit; } #endif static const char *SHMEM_INFO_KEY = "shmem_topo"; static const char *NETWORK_INFO_KEY = "network_topo"; static int compare_info_hint(const char *hintval, MPIR_Comm * comm_ptr, int *info_args_are_equal) { int hintval_size = strlen(hintval); int hintval_size_max; int hintval_equal; int hintval_equal_global = 0; char *hintval_global = NULL; int mpi_errno = MPI_SUCCESS; MPIR_Errflag_t errflag = MPIR_ERR_NONE; /* Find the maximum hintval size. Each process locally compares * its hintval size to the global max, and makes sure that this * comparison is successful on all processes. */ mpi_errno = MPID_Allreduce(&hintval_size, &hintval_size_max, 1, MPI_INT, MPI_MAX, comm_ptr, &errflag); if (mpi_errno) MPIR_ERR_POP(mpi_errno); hintval_equal = (hintval_size == hintval_size_max); mpi_errno = MPID_Allreduce(&hintval_equal, &hintval_equal_global, 1, MPI_INT, MPI_LAND, comm_ptr, &errflag); if (mpi_errno) MPIR_ERR_POP(mpi_errno); if (!hintval_equal_global) goto fn_exit; /* Now that the sizes of the hintvals match, check to make sure * the actual hintvals themselves are the equal */ hintval_global = (char *) MPL_malloc(strlen(hintval), MPL_MEM_OTHER); mpi_errno = MPID_Allreduce(hintval, hintval_global, strlen(hintval), MPI_CHAR, MPI_MAX, comm_ptr, &errflag); if (mpi_errno) MPIR_ERR_POP(mpi_errno); hintval_equal = !memcmp(hintval, hintval_global, strlen(hintval)); mpi_errno = MPID_Allreduce(&hintval_equal, &hintval_equal_global, 1, MPI_INT, MPI_LAND, comm_ptr, &errflag); if (mpi_errno) MPIR_ERR_POP(mpi_errno); fn_exit: if (hintval_global != NULL) MPL_free(hintval_global); *info_args_are_equal = hintval_equal_global; return mpi_errno; fn_fail: goto fn_exit; } #undef FUNCNAME #define FUNCNAME MPIR_Comm_split_type_node_topo #undef FCNAME #define FCNAME MPL_QUOTE(FUNCNAME) int MPIR_Comm_split_type_node_topo(MPIR_Comm * user_comm_ptr, int split_type, int key, MPIR_Info * info_ptr, MPIR_Comm ** newcomm_ptr) { MPIR_Comm *comm_ptr; int mpi_errno = MPI_SUCCESS; int flag = 0; char hintval[MPI_MAX_INFO_VAL + 1]; int info_args_are_equal; *newcomm_ptr = NULL; mpi_errno = MPIR_Comm_split_type_node(user_comm_ptr, split_type, key, &comm_ptr); if (mpi_errno) MPIR_ERR_POP(mpi_errno); if (comm_ptr == NULL) { MPIR_Assert(split_type == MPI_UNDEFINED); *newcomm_ptr = NULL; goto fn_exit; } if (info_ptr) { MPIR_Info_get_impl(info_ptr, SHMEM_INFO_KEY, MPI_MAX_INFO_VAL, hintval, &flag); } if (!flag) { hintval[0] = '\0'; } mpi_errno = compare_info_hint(hintval, comm_ptr, &info_args_are_equal); if (mpi_errno) MPIR_ERR_POP(mpi_errno); /* if all processes do not have the same hintval, skip * topology-aware comm split */ if (!info_args_are_equal) goto use_node_comm; /* if no info key is given, skip topology-aware comm split */ if (!info_ptr) goto use_node_comm; #ifdef HAVE_HWLOC /* if our bindset is not valid, skip topology-aware comm split */ if (!MPIR_Process.bindset_is_valid) goto use_node_comm; if (flag) { if (!strncmp(hintval, "pci:", strlen("pci:"))) mpi_errno = node_split_pci_device(comm_ptr, key, hintval, newcomm_ptr); else if (!strncmp(hintval, "ib", strlen("ib")) || !strncmp(hintval, "en", strlen("en")) || !strncmp(hintval, "eth", strlen("eth")) || !strncmp(hintval, "hfi", strlen("hfi"))) mpi_errno = node_split_network_device(comm_ptr, key, hintval, newcomm_ptr); else if (!strncmp(hintval, "gpu", strlen("gpu"))) mpi_errno = node_split_gpu_device(comm_ptr, key, hintval, newcomm_ptr); else mpi_errno = node_split_processor(comm_ptr, key, hintval, newcomm_ptr); if (mpi_errno) MPIR_ERR_POP(mpi_errno); MPIR_Comm_free_impl(comm_ptr); goto fn_exit; } #endif /* HAVE_HWLOC */ use_node_comm: *newcomm_ptr = comm_ptr; fn_exit: return mpi_errno; fn_fail: goto fn_exit; } #undef FUNCNAME #define FUNCNAME MPIR_Comm_split_type_network_topo #undef FCNAME #define FCNAME MPL_QUOTE(FUNCNAME) int MPIR_Comm_split_type_network_topo(MPIR_Comm * comm_ptr, int key, const char *hintval, MPIR_Comm ** newcomm_ptr) { int mpi_errno = MPI_SUCCESS; #ifdef HAVE_NETLOC if (!strncmp(hintval, ("switch_level:"), strlen("switch_level:")) && *(hintval + strlen("switch_level:")) != '\0') { int switch_level = atoi(hintval + strlen("switch_level:")); mpi_errno = network_split_switch_level(comm_ptr, key, switch_level, newcomm_ptr); } else if (!strncmp(hintval, ("subcomm_min_size:"), strlen("subcomm_min_size:")) && *(hintval + strlen("subcomm_min_size:")) != '\0') { int subcomm_min_size = atoi(hintval + strlen("subcomm_min_size:")); mpi_errno = network_split_by_minsize(comm_ptr, key, subcomm_min_size, newcomm_ptr); } else if (!strncmp(hintval, ("min_mem_size:"), strlen("min_mem_size:")) && *(hintval + strlen("min_mem_size:")) != '\0') { long min_mem_size = atol(hintval + strlen("min_mem_size:")); /* Split by minimum memory size per subcommunicator in bytes */ mpi_errno = network_split_by_min_memsize(comm_ptr, key, min_mem_size, newcomm_ptr); } else if (!strncmp(hintval, ("torus_dimension:"), strlen("torus_dimension:")) && *(hintval + strlen("torus_dimension:")) != '\0') { int dimension = atol(hintval + strlen("torus_dimension:")); mpi_errno = network_split_by_torus_dimension(comm_ptr, key, dimension, newcomm_ptr); } #endif fn_exit: return mpi_errno; } #undef FUNCNAME #define FUNCNAME MPIR_Comm_split_type #undef FCNAME #define FCNAME MPL_QUOTE(FUNCNAME) int MPIR_Comm_split_type(MPIR_Comm * user_comm_ptr, int split_type, int key, MPIR_Info * info_ptr, MPIR_Comm ** newcomm_ptr) { MPIR_Comm *comm_ptr = NULL; int mpi_errno = MPI_SUCCESS; /* split out the undefined processes */ mpi_errno = MPIR_Comm_split_impl(user_comm_ptr, split_type == MPI_UNDEFINED ? MPI_UNDEFINED : 0, key, &comm_ptr); if (mpi_errno) MPIR_ERR_POP(mpi_errno); if (split_type == MPI_UNDEFINED) { *newcomm_ptr = NULL; goto fn_exit; } if (split_type == MPI_COMM_TYPE_SHARED) { mpi_errno = MPIR_Comm_split_type_self(comm_ptr, split_type, key, newcomm_ptr); if (mpi_errno) MPIR_ERR_POP(mpi_errno); } else if (split_type == MPIX_COMM_TYPE_NEIGHBORHOOD) { mpi_errno = MPIR_Comm_split_type_neighborhood(comm_ptr, split_type, key, info_ptr, newcomm_ptr); if (mpi_errno) MPIR_ERR_POP(mpi_errno); } else { MPIR_ERR_SETANDJUMP(mpi_errno, MPI_ERR_ARG, "**arg"); } fn_exit: if (comm_ptr) MPIR_Comm_free_impl(comm_ptr); return mpi_errno; fn_fail: goto fn_exit; } #undef FUNCNAME #define FUNCNAME MPIR_Comm_split_type_nbhd_common_dir #undef FCNAME #define FCNAME MPL_QUOTE(FUNCNAME) int MPIR_Comm_split_type_nbhd_common_dir(MPIR_Comm * user_comm_ptr, int key, const char *hintval, MPIR_Comm ** newcomm_ptr) { int mpi_errno = MPI_SUCCESS; #ifdef HAVE_ROMIO MPI_Comm dummycomm; MPIR_Comm *dummycomm_ptr; mpi_errno = MPIR_Comm_split_filesystem(user_comm_ptr->handle, key, hintval, &dummycomm); if (mpi_errno) MPIR_ERR_POP(mpi_errno); MPIR_Comm_get_ptr(dummycomm, dummycomm_ptr); *newcomm_ptr = dummycomm_ptr; #endif fn_exit: return mpi_errno; fn_fail: goto fn_exit; } #undef FUNCNAME #define FUNCNAME MPIR_Comm_split_type_neighborhood #undef FCNAME #define FCNAME MPL_QUOTE(FUNCNAME) int MPIR_Comm_split_type_neighborhood(MPIR_Comm * comm_ptr, int split_type, int key, MPIR_Info * info_ptr, MPIR_Comm ** newcomm_ptr) { int flag = 0; char hintval[MPI_MAX_INFO_VAL + 1]; int mpi_errno = MPI_SUCCESS; int info_args_are_equal; *newcomm_ptr = NULL; if (info_ptr) { MPIR_Info_get_impl(info_ptr, "nbhd_common_dirname", MPI_MAX_INFO_VAL, hintval, &flag); } if (!flag) { hintval[0] = '\0'; } *newcomm_ptr = NULL; mpi_errno = compare_info_hint(hintval, comm_ptr, &info_args_are_equal); if (mpi_errno) MPIR_ERR_POP(mpi_errno); if (info_args_are_equal && flag) { MPIR_Comm_split_type_nbhd_common_dir(comm_ptr, key, hintval, newcomm_ptr); } else { /* Check if the info hint is a network topology hint */ if (info_ptr) { MPIR_Info_get_impl(info_ptr, NETWORK_INFO_KEY, MPI_MAX_INFO_VAL, hintval, &flag); } if (!flag) { hintval[0] = '\0'; } mpi_errno = compare_info_hint(hintval, comm_ptr, &info_args_are_equal); if (mpi_errno) MPIR_ERR_POP(mpi_errno); /* if all processes have the same hintval, perform * topology-aware comm split */ if (info_args_are_equal) { MPIR_Comm_split_type_network_topo(comm_ptr, key, hintval, newcomm_ptr); } } fn_exit: return mpi_errno; fn_fail: goto fn_exit; } #undef FUNCNAME #define FUNCNAME MPIR_Comm_split_type_impl #undef FCNAME #define FCNAME MPL_QUOTE(FUNCNAME) int MPIR_Comm_split_type_impl(MPIR_Comm * comm_ptr, int split_type, int key, MPIR_Info * info_ptr, MPIR_Comm ** newcomm_ptr) { int mpi_errno = MPI_SUCCESS; /* Only MPI_COMM_TYPE_SHARED, MPI_UNDEFINED, and * NEIGHBORHOOD are supported */ MPIR_Assert(split_type == MPI_COMM_TYPE_SHARED || split_type == MPI_UNDEFINED || split_type == MPIX_COMM_TYPE_NEIGHBORHOOD); if (MPIR_Comm_fns != NULL && MPIR_Comm_fns->split_type != NULL) { mpi_errno = MPIR_Comm_fns->split_type(comm_ptr, split_type, key, info_ptr, newcomm_ptr); } else { mpi_errno = MPIR_Comm_split_type(comm_ptr, split_type, key, info_ptr, newcomm_ptr); } if (mpi_errno) MPIR_ERR_POP(mpi_errno); fn_exit: return mpi_errno; fn_fail: goto fn_exit; } #endif /* MPICH_MPI_FROM_PMPI */ #undef FUNCNAME #define FUNCNAME MPI_Comm_split_type #undef FCNAME #define FCNAME MPL_QUOTE(FUNCNAME) /*@ MPI_Comm_split_type - Creates new communicators based on split types and keys Input Parameters: + comm - communicator (handle) . split_type - type of processes to be grouped together (nonnegative integer). . key - control of rank assignment (integer) - info - hints to improve communicator creation (handle) Output Parameters: . newcomm - new communicator (handle) Notes: The 'split_type' must be non-negative or 'MPI_UNDEFINED'. .N ThreadSafe .N Fortran .N Errors .N MPI_SUCCESS .N MPI_ERR_COMM .N MPI_ERR_EXHAUSTED .seealso: MPI_Comm_free @*/ int MPI_Comm_split_type(MPI_Comm comm, int split_type, int key, MPI_Info info, MPI_Comm * newcomm) { int mpi_errno = MPI_SUCCESS; MPIR_Comm *comm_ptr = NULL, *newcomm_ptr; MPIR_Info *info_ptr = NULL; MPIR_FUNC_TERSE_STATE_DECL(MPID_STATE_MPI_COMM_SPLIT_TYPE); MPIR_ERRTEST_INITIALIZED_ORDIE(); MPID_THREAD_CS_ENTER(GLOBAL, MPIR_THREAD_GLOBAL_ALLFUNC_MUTEX); MPIR_FUNC_TERSE_ENTER(MPID_STATE_MPI_COMM_SPLIT_TYPE); /* Validate parameters, especially handles needing to be converted */ #ifdef HAVE_ERROR_CHECKING { MPID_BEGIN_ERROR_CHECKS; { MPIR_ERRTEST_COMM(comm, mpi_errno); } MPID_END_ERROR_CHECKS; } #endif /* HAVE_ERROR_CHECKING */ /* Get handles to MPI objects. */ MPIR_Comm_get_ptr(comm, comm_ptr); MPIR_Info_get_ptr(info, info_ptr); /* Validate parameters and objects (post conversion) */ #ifdef HAVE_ERROR_CHECKING { MPID_BEGIN_ERROR_CHECKS; { /* Validate comm_ptr */ MPIR_Comm_valid_ptr(comm_ptr, mpi_errno, FALSE); /* If comm_ptr is not valid, it will be reset to null */ if (mpi_errno) goto fn_fail; MPIR_ERRTEST_ARGNULL(newcomm, "newcomm", mpi_errno); } MPID_END_ERROR_CHECKS; } #endif /* HAVE_ERROR_CHECKING */ /* ... body of routine ... */ mpi_errno = MPIR_Comm_split_type_impl(comm_ptr, split_type, key, info_ptr, &newcomm_ptr); if (mpi_errno) MPIR_ERR_POP(mpi_errno); if (newcomm_ptr) MPIR_OBJ_PUBLISH_HANDLE(*newcomm, newcomm_ptr->handle); else *newcomm = MPI_COMM_NULL; /* ... end of body of routine ... */ fn_exit: MPIR_FUNC_TERSE_EXIT(MPID_STATE_MPI_COMM_SPLIT_TYPE); MPID_THREAD_CS_EXIT(GLOBAL, MPIR_THREAD_GLOBAL_ALLFUNC_MUTEX); return mpi_errno; fn_fail: /* --BEGIN ERROR HANDLING-- */ #ifdef HAVE_ERROR_CHECKING { /* FIXME this error code is wrong, it's the error code for * regular MPI_Comm_split */ mpi_errno = MPIR_Err_create_code(mpi_errno, MPIR_ERR_RECOVERABLE, FCNAME, __LINE__, MPI_ERR_OTHER, "**mpi_comm_split", "**mpi_comm_split %C %d %d %p", comm, split_type, key, newcomm); } #endif mpi_errno = MPIR_Err_return_comm(comm_ptr, FCNAME, mpi_errno); goto fn_exit; /* --END ERROR HANDLING-- */ }