/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
/*
 * (C) 2016 UChicago/Argonne LLC
 * See COPYRIGHT in top-level directory.
 */

#include "mpioimpl.h"

#ifdef HAVE_STDLIB_H
#include <stdlib.h>
#endif

#ifdef HAVE_DIRENT_H
#include <dirent.h>
#endif

/* Exhaustively determine which processes share access to 'dirname':
 * every process creates a file named after its rank in a common test
 * directory, then lists that directory; the ranks whose files are
 * visible form the group for the new communicator.
 *
 * comm:    communicator to split
 * key:     ordering hint (unused here; the group is ordered by the rank
 *          list built from the directory listing)
 * dirname: directory to probe for shared access
 * newcomm: [out] communicator of processes that share 'dirname'
 *
 * NOTE(review): the fn_fail path leaves mpi_errno at MPI_SUCCESS, so a
 * POSIX failure (mkdir/opendir) returns success with *newcomm unset --
 * confirm whether callers expect a real error code here.
 *
 * If you run this at scale against GPFS, be prepared to spend 30 minutes
 * creating 10,000 files -- and the numbers only get worse from there.
 *
 * - create random directory
 * - create files in that directory
 * - based on the visible files, construct a new group, then a new
 *   communicator
 * - there are no directory operation routines in MPI so we'll do it via
 *   POSIX. */
static int comm_split_filesystem_exhaustive(MPI_Comm comm, int key,
                                            const char *dirname, MPI_Comm * newcomm)
{
    int rank, nprocs, ret;
    int *ranks;
    MPI_Group comm_group, newgroup;
    int j = 0, mpi_errno = MPI_SUCCESS;
    char *filename = NULL, *testdirname = NULL;
    DIR *dir;
    struct dirent *entry;
    int fd;

    MPI_Comm_rank(comm, &rank);
    MPI_Comm_size(comm, &nprocs);

    /* rank zero constructs the candidate directory name (just the
     * name).  Everyone will create the directory though -- this will be
     * a headache for the file system at scale.. don't do this on a
     * large parallel file system! */

    testdirname = MPL_malloc(PATH_MAX, MPL_MEM_IO);
    filename = MPL_malloc(PATH_MAX, MPL_MEM_IO);
    ranks = MPL_malloc(nprocs * sizeof(int), MPL_MEM_IO);

    if (rank == 0)
        MPL_create_pathname(testdirname, dirname, ".commonfstest.0", 1);

    MPI_Bcast(testdirname, PATH_MAX, MPI_BYTE, 0, comm);

    /* ignore EEXIST: quite likely another process will have made this
     * directory, but since the whole point is to figure out who we share this
     * directory with, brute force it is! */
    ret = mkdir(testdirname, S_IRWXU);
    if (ret == -1 && errno != EEXIST)
        goto fn_fail;

    MPL_snprintf(filename, PATH_MAX, "%s/%d", testdirname, rank);
    /* capture and close the descriptor: the original code leaked one fd
     * per call.  Only the file's existence matters, not its contents. */
    fd = open(filename, O_CREAT, S_IRUSR | S_IWUSR);
    if (fd >= 0)
        close(fd);

    MPI_Barrier(comm);

    /* each process has created a file in a M-way shared directory (where M in
     * the range [1-nprocs]).  Let's see who else can see this directory */
    if ((dir = opendir(testdirname)) == NULL)
        goto fn_fail;
    while ((entry = readdir(dir)) != NULL) {
        if (strcmp(entry->d_name, ".") == 0)
            continue;
        if (strcmp(entry->d_name, "..") == 0)
            continue;
        /* file names are the decimal ranks of their creators */
        ranks[j++] = atoi(entry->d_name);
    }
    /* the original code never closed the directory stream; do so to
     * avoid leaking it */
    closedir(dir);

    MPI_Comm_group(comm, &comm_group);
    MPI_Group_incl(comm_group, j, ranks, &newgroup);
    MPI_Comm_create(comm, newgroup, newcomm);
    MPI_Group_free(&newgroup);
    MPI_Group_free(&comm_group);

    unlink(filename);
    /* ok to ignore errors */
    rmdir(testdirname);

  fn_exit:
    MPL_free(ranks);
    MPL_free(filename);
    MPL_free(testdirname);
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}
/* Heuristically split 'comm' by file-system visibility of 'dirname':
 * rank 0 creates one test file and a single "challenge" process from a
 * different node checks whether it can see that file.  If it can, the
 * file system is assumed globally visible and the communicator is simply
 * duplicated; otherwise processes are split by node id.
 *
 * comm:    communicator to split
 * key:     relative ordering for ranks in the split communicator
 *          (passed to MPI_Comm_split)
 * dirname: directory to probe for shared access
 * newcomm: [out] resulting communicator
 *
 * Returns an MPI error code; a failed visibility probe on the challenge
 * rank is deliberately NOT reported as an error (it only means "not
 * globally visible"). */
static int comm_split_filesystem_heuristic(MPI_Comm comm, int key,
                                           const char *dirname, MPI_Comm * newcomm)
{
    int i, mpi_errno = MPI_SUCCESS;
    int rank, nprocs;
    int id;
    int32_t *all_ids;
    char *filename = NULL;
    /* initialized defensively: on non-zero ranks the value is only ever
     * set by the MPI_Bcast below */
    int challenge_rank = -1, globally_visible = 0;
    MPI_Request check_req;

    MPI_Comm_rank(comm, &rank);
    MPI_Comm_size(comm, &nprocs);
    MPIR_Get_node_id(comm, rank, &id);

    /* We could detect the common file systems by parsing 'df'-style
     * output, but that's fidgety, fragile, and error prone.  Instead,
     * determine who shares a file system through testing.
     *
     * As an optimization, we should try to avoid creating a lot of
     * files: we want something that could work at hundreds of thousands
     * of nodes, and creating a hundred thousand files in a directory is
     * a recipe for sadness
     *
     * In CH3 and in wider practice "shared memory" is the same as "on
     * the same node", so let's start there.
     *
     * - Create file on one processor
     * - pick a processor outside the "on this node" group
     * - if that processor can see the file, then assume the file is
     *   visible to all groups.
     *
     * note that this scheme works really well for traditional linux clusters:
     * think nodes with a local scratch drive.  this scheme works less well for
     * a deeper hierarchy.  what if the directory in question was hosted by an
     * i/o forwarding agent?
     */

    /* learn a bit about what groups were created: as a scalable
     * optimization we want to check a file's presence from a group
     * other than which created it */
    all_ids = MPL_malloc(nprocs * sizeof(*all_ids), MPL_MEM_IO);

    /* NOTE(review): 'id' is declared int but transferred as MPI_INT32_T;
     * this assumes int is 32 bits -- confirm against MPIR_Get_node_id's
     * declared type */
    mpi_errno = MPI_Gather(&id, 1, MPI_INT32_T, all_ids, 1, MPI_INT32_T, 0, comm);

    if (rank == 0) {
        /* find the first rank on a different node than rank 0 */
        for (i = 0; i < nprocs; i++) {
            if (all_ids[i] != id)
                break;
        }
        if (i >= nprocs)
            /* everyone is in the same group; pick a process that's not rank 0
             * just in case the file system is really weird */
            challenge_rank = nprocs - 1;
        else
            challenge_rank = i;
    }
    mpi_errno = MPI_Bcast(&challenge_rank, 1, MPI_INT, 0, comm);

    /* now that we've informally lumped everyone into groups based on node
     * (like shared memory does) it's time to poke the file system and see
     * which group can see what files */

    /* here come a bunch of assumptions:
     * - file system layouts are homogeneous: if one system has /scratch,
     *   all have /scratch
     * - a globally visible parallel file system will have the same name
     *   everywhere: e.g /gpfs/users/something
     * - a file created on one node will be deterministically visible on
     *   another.  NFS has problems with this
     * - if a process from one group creates a file, and a process from
     *   another group finds that file, then a process from all groups
     *   can find that file
     */

    /* is the file globally visible to all?  create on rank 0, test on a
     * different off-group rank.
     * Use a single short message to force check after create: ordering
     * is a little odd in case we are creating and checking on the same
     * rank */

    filename = MPL_calloc(PATH_MAX, sizeof(char), MPL_MEM_IO);

    if (rank == 0)
        MPL_create_pathname(filename, dirname, ".commonfstest.0", 0);

    MPI_Bcast(filename, PATH_MAX, MPI_BYTE, 0, comm);

    if (rank == challenge_rank) {
        MPI_Irecv(NULL, 0, MPI_BYTE, 0, 0, comm, &check_req);
    }

    if (rank == 0) {
        MPI_File fh;
        mpi_errno = MPI_File_open(MPI_COMM_SELF, filename,
                                  MPI_MODE_CREATE | MPI_MODE_EXCL | MPI_MODE_WRONLY,
                                  MPI_INFO_NULL, &fh);
        if (mpi_errno != MPI_SUCCESS)
            goto fn_exit;
        MPI_File_close(&fh);
        /* the check for file has to happen after file created.  only need one
         * process, though, not a full barrier */
        MPI_Send(NULL, 0, MPI_BYTE, challenge_rank, 0, comm);
    }

    if (rank == challenge_rank) {
        MPI_File fh;

        MPI_Wait(&check_req, MPI_STATUS_IGNORE);

        /* too bad there's no ADIO equivalent of access: we'll have to
         * open/close the file instead */
        mpi_errno = MPI_File_open(MPI_COMM_SELF, filename, MPI_MODE_RDONLY, MPI_INFO_NULL, &fh);
        if (mpi_errno == MPI_SUCCESS) {
            globally_visible = 1;
            MPI_File_close(&fh);
        } else {
            /* do not report error up to caller.  we are merely testing the
             * presence of the file */
            mpi_errno = MPI_SUCCESS;
            globally_visible = 0;
        }
    }
    MPI_Bcast(&globally_visible, 1, MPI_INT, challenge_rank, comm);

    /* with the above assumptions, we have two cases for a file
     * created on one process:
     * -- either a process not in the group can access it (globally
     *    accessible parallel file system)
     * -- or a process not in the group cannot access it (node-local
     *    storage of some sort) */
    if (globally_visible) {
        MPI_Comm_dup(comm, newcomm);
    } else {
        MPI_Comm_split(comm, id, key, newcomm);
    }
    if (rank == 0)
        MPI_File_delete(filename, MPI_INFO_NULL);

  fn_exit:
    MPL_free(all_ids);
    MPL_free(filename);
    return mpi_errno;
}
/* not to be called directly (note the MPIR_ prefix), but instead from
|
|
Packit Service |
c5cf8c |
* MPI-level MPI_Comm_split_type implementation (e.g.
|
|
Packit Service |
c5cf8c |
* MPIR_Comm_split_type_impl). */
|
|
Packit Service |
c5cf8c |
#undef FUNCNAME
|
|
Packit Service |
c5cf8c |
#define FUNCNAME MPIR_Comm_split_filesystem
|
|
Packit Service |
c5cf8c |
#undef FCNAME
|
|
Packit Service |
c5cf8c |
#define FCNAME MPL_QUOTE(FUNCNAME)
|
|
Packit Service |
c5cf8c |
|
|
Packit Service |
c5cf8c |
/* split communicator based on access to directory 'dirname'. */
|
|
Packit Service |
c5cf8c |
int MPIR_Comm_split_filesystem(MPI_Comm comm, int key, const char *dirname, MPI_Comm * newcomm)
|
|
Packit Service |
c5cf8c |
{
|
|
Packit Service |
c5cf8c |
int mpi_errno = MPI_SUCCESS;
|
|
Packit Service |
c5cf8c |
char *s;
|
|
Packit Service |
c5cf8c |
if ((s = getenv("MPIX_SPLIT_DISABLE_HEURISTIC")) != NULL) {
|
|
Packit Service |
c5cf8c |
mpi_errno = comm_split_filesystem_exhaustive(comm, key, dirname, newcomm);
|
|
Packit Service |
c5cf8c |
} else {
|
|
Packit Service |
c5cf8c |
mpi_errno = comm_split_filesystem_heuristic(comm, key, dirname, newcomm);
|
|
Packit Service |
c5cf8c |
}
|
|
Packit Service |
c5cf8c |
return mpi_errno;
|
|
Packit Service |
c5cf8c |
}
/*
 * vim: ts=8 sts=4 sw=4 noexpandtab
 */