Blame src/mpi/romio/mpi-io/mpir_cst_filesys.c

Packit Service c5cf8c
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
Packit Service c5cf8c
/*
Packit Service c5cf8c
 *  (C) 2016 UChicago/Argonne LLC
Packit Service c5cf8c
 *      See COPYRIGHT in top-level directory.
Packit Service c5cf8c
 */
Packit Service c5cf8c
Packit Service c5cf8c
#include "mpioimpl.h"
Packit Service c5cf8c
Packit Service c5cf8c
#ifdef HAVE_STDLIB_H
Packit Service c5cf8c
#include <stdlib.h>
Packit Service c5cf8c
#endif
Packit Service c5cf8c
Packit Service c5cf8c
#ifdef HAVE_DIRENT_H
Packit Service c5cf8c
#include <dirent.h>
Packit Service c5cf8c
#endif
Packit Service c5cf8c
Packit Service c5cf8c
/* qsort() comparator: orders two int ranks ascending. */
static int cst_compare_ranks(const void *lhs, const void *rhs)
{
    int a = *(const int *) lhs;
    int b = *(const int *) rhs;

    return (a > b) - (a < b);
}

/*
 * Split 'comm' by brute force: every process creates a file named after its
 * rank inside a scratch directory under 'dirname', then builds the new
 * communicator from the ranks whose files it can see in that directory.
 *
 * comm    - communicator to split
 * key     - accepted for signature parity with the heuristic variant; not
 *           consulted here (ranks in 'newcomm' follow their order in 'comm')
 * dirname - directory whose visibility defines the split
 * newcomm - receives the new communicator; set to MPI_COMM_NULL on failure
 *
 * Returns MPI_SUCCESS, MPI_ERR_NO_MEM if a work buffer could not be
 * allocated, or MPI_ERR_IO if the scratch directory could not be created or
 * opened.
 *
 * NOTE(review): the failure returns are local decisions.  A process that
 * bails out early no longer takes part in the collectives below, so the
 * remaining processes may block; callers should treat an error as fatal.
 */
static int comm_split_filesystem_exhaustive(MPI_Comm comm, int key,
                                            const char *dirname, MPI_Comm * newcomm)
{
    /* If you run this at scale against GPFS, be prepared to spend 30 minutes
     * creating 10,000 files -- and the numbers only get worse from there.
     *
     * - create random directory
     * - create files in that directory
     * - based on the visible files, construct a new group, then a new
     *   communicator
     * - there are no directory operation routines in MPI so we'll do it via
     *   POSIX.  */
    int rank, nprocs, ret, fd;
    int *ranks = NULL;
    MPI_Group comm_group, newgroup;
    int j = 0, mpi_errno = MPI_SUCCESS;
    char *filename = NULL, *testdirname = NULL;
    DIR *dir = NULL;
    struct dirent *entry;

    (void) key;

    MPI_Comm_rank(comm, &rank);
    MPI_Comm_size(comm, &nprocs);

    /* rank zero constructs the candidate directory name (just the
     * name).  Everyone will create the directory though -- this will be
     * a headache for the file system at scale..  don't do this on a
     * large parallel file system! */

    testdirname = MPL_malloc(PATH_MAX, MPL_MEM_IO);
    filename = MPL_malloc(PATH_MAX, MPL_MEM_IO);
    ranks = MPL_malloc(nprocs * sizeof(int), MPL_MEM_IO);
    if (testdirname == NULL || filename == NULL || ranks == NULL) {
        mpi_errno = MPI_ERR_NO_MEM;
        goto fn_fail;
    }

    if (rank == 0)
        MPL_create_pathname(testdirname, dirname, ".commonfstest.0", 1);

    MPI_Bcast(testdirname, PATH_MAX, MPI_BYTE, 0, comm);
    /* ignore EEXIST: quite likely another process will have made this
     * directory, but since the whole point is to figure out who we share this
     * directory with, brute force it is! */
    ret = mkdir(testdirname, S_IRWXU);
    if (ret == -1 && errno != EEXIST) {
        mpi_errno = MPI_ERR_IO;
        goto fn_fail;
    }
    MPL_snprintf(filename, PATH_MAX, "%s/%d", testdirname, rank);
    /* only the existence of the file matters, so release the descriptor
     * right away.  A failed create stays non-fatal, as before: this process
     * simply will not show up in anybody's directory listing. */
    fd = open(filename, O_CREAT, S_IRUSR | S_IWUSR);
    if (fd != -1)
        close(fd);

    MPI_Barrier(comm);
    /* each process has created a file in a M-way shared directory (where M in
     * the range [1-nprocs]).  Let's see who else can see this directory */
    if ((dir = opendir(testdirname)) == NULL) {
        mpi_errno = MPI_ERR_IO;
        goto fn_fail;
    }
    while ((entry = readdir(dir)) != NULL) {
        const char *name = entry->d_name;
        char *end;
        long peer;

        if (strcmp(name, ".") == 0)
            continue;
        if (strcmp(name, "..") == 0)
            continue;
        /* accept only names that are entirely a decimal rank of 'comm';
         * anything else (e.g. leftovers from an earlier run) is not ours */
        if (name[0] < '0' || name[0] > '9')
            continue;
        peer = strtol(name, &end, 10);
        if (*end != '\0' || peer < 0 || peer >= nprocs)
            continue;
        /* 'ranks' holds exactly nprocs entries: never write past it */
        if (j >= nprocs)
            break;
        ranks[j++] = (int) peer;
    }
    closedir(dir);

    /* readdir() promises no particular order, yet every member of the new
     * group has to hand MPI_Comm_create the same processes in the same
     * order: sort so that all of them agree. */
    qsort(ranks, (size_t) j, sizeof(*ranks), cst_compare_ranks);

    MPI_Comm_group(comm, &comm_group);
    MPI_Group_incl(comm_group, j, ranks, &newgroup);
    MPI_Comm_create(comm, newgroup, newcomm);
    MPI_Group_free(&newgroup);
    MPI_Group_free(&comm_group);

    unlink(filename);
    /* ok to ignore errors */
    rmdir(testdirname);

  fn_exit:
    /* buffers may still be NULL if an allocation failed */
    if (ranks != NULL)
        MPL_free(ranks);
    if (filename != NULL)
        MPL_free(filename);
    if (testdirname != NULL)
        MPL_free(testdirname);
    return mpi_errno;
  fn_fail:
    *newcomm = MPI_COMM_NULL;
    goto fn_exit;
}
Packit Service c5cf8c
Packit Service c5cf8c
/*
 * Split 'comm' using a cheap heuristic: rank 0 creates one probe file under
 * 'dirname' and a single process on a different node (when there is one)
 * tries to open it.  If that works the directory is assumed to be visible to
 * everybody and 'comm' is duplicated; otherwise 'comm' is split by node id.
 *
 * comm    - communicator to split
 * key     - rank-ordering key handed to MPI_Comm_split (only used when the
 *           directory turns out not to be globally visible)
 * dirname - directory whose visibility defines the split
 * newcomm - receives the new communicator; set to MPI_COMM_NULL when rank 0
 *           could not create the probe file
 *
 * Returns MPI_SUCCESS or an MPI error code.  When rank 0 cannot create the
 * probe file, that outcome is forwarded to every process, so all of them
 * return an error together instead of some of them blocking.
 */
static int comm_split_filesystem_heuristic(MPI_Comm comm, int key,
                                           const char *dirname, MPI_Comm * newcomm)
{
    int i, mpi_errno = MPI_SUCCESS;
    int rank, nprocs;
    int id;
    int *all_ids = NULL;
    char *filename = NULL;
    /* globally_visible: 1 = probe file seen off-node, 0 = not seen,
     * -1 = rank 0 failed to create the probe file */
    int challenge_rank = 0, globally_visible = 0;
    int create_errno = MPI_SUCCESS;     /* rank 0: outcome of creating the file */
    int created = 0;            /* rank 0 -> challenge rank: 1 if file exists */
    int probe_ready = 0;        /* challenge rank: value received from rank 0 */
    MPI_Request check_req;

    MPI_Comm_rank(comm, &rank);
    MPI_Comm_size(comm, &nprocs);
    MPIR_Get_node_id(comm, rank, &id);

    /* We could detect the common file systems by parsing 'df'-style
     * output, but that's fidgety, fragile, and error prone.  Instead,
     * determine who shares a file system through testing.
     *
     * As an optimization, we should try to avoid creating a lot of
     * files: we want something that could work at hundreds of thousands
     * of nodes, and creating a hundred thousand files in a directory is
     * a recipe for sadness
     *
     * In CH3 and in wider practice "shared memory" is the same as "on
     * the same node", so let's start there.
     *
     * - Create file on one processor
     * - pick a processor outside the "on this node" group
     * - if that processor can see the file, then assume the file is
     *   visible to all groups.
     *
     * note that this scheme works really well for traditional linux clusters:
     * think nodes with a local scratch drive.  this scheme works less well for
     * a deeper hierarchy.  what if the directory in question was hosted by an
     * i/o forwarding agent?
     */

    /* grab both work buffers up front so an allocation failure is reported
     * before any communication has started.
     * NOTE(review): this is still a local decision; if only some processes
     * fail here, the others will block in the collectives below. */
    all_ids = MPL_malloc(nprocs * sizeof(*all_ids), MPL_MEM_IO);
    filename = MPL_calloc(PATH_MAX, sizeof(char), MPL_MEM_IO);
    if (all_ids == NULL || filename == NULL) {
        mpi_errno = MPI_ERR_NO_MEM;
        goto fn_exit;
    }

    /* learn a bit about what groups were created: as a scalable
     * optimization we want to check a file's presence from a group
     * other than which created it.  'id' is a plain int, so describe it
     * to MPI as MPI_INT. */
    mpi_errno = MPI_Gather(&id, 1, MPI_INT, all_ids, 1, MPI_INT, 0, comm);
    if (mpi_errno != MPI_SUCCESS)
        goto fn_exit;

    if (rank == 0) {
        for (i = 0; i < nprocs; i++) {
            if (all_ids[i] != id)
                break;
        }
        if (i >= nprocs)
            /* everyone is in the same group; pick a process that's not rank 0
             * just in case the file system is really weird */
            challenge_rank = nprocs - 1;
        else
            challenge_rank = i;
    }
    mpi_errno = MPI_Bcast(&challenge_rank, 1, MPI_INT, 0, comm);
    if (mpi_errno != MPI_SUCCESS)
        goto fn_exit;

    /* now that we've informally lumped everyone into groups based on node
     * (like shared memory does) it's time to poke the file system and see
     * which group can see what files */

    /* here come a bunch of assumptions:
     * - file system layouts are homogeneous: if one system has /scratch,
     *   all have /scratch
     * - a globally visible parallel file system will have the same name
     *   everywhere: e.g /gpfs/users/something
     * - a file created on one node will be deterministically visible on
     *   another.  NFS has problems with this
     * - if a process from one group creates a file, and a process from
     *   another group finds that file, then a process from all groups
     *   can find that file
     */

    /* is the file globally visible to all?  create on rank 0, test on a
     * different off-group rank.
     * Use a single short message to force check after create: ordering
     * is a little odd in case we are creating and checking on the same
     * rank  */

    if (rank == 0)
        MPL_create_pathname(filename, dirname, ".commonfstest.0", 0);

    MPI_Bcast(filename, PATH_MAX, MPI_BYTE, 0, comm);

    if (rank == challenge_rank) {
        MPI_Irecv(&probe_ready, 1, MPI_INT, 0, 0, comm, &check_req);
    }

    if (rank == 0) {
        MPI_File fh;
        create_errno = MPI_File_open(MPI_COMM_SELF, filename,
                                     MPI_MODE_CREATE | MPI_MODE_EXCL | MPI_MODE_WRONLY,
                                     MPI_INFO_NULL, &fh);
        if (create_errno == MPI_SUCCESS) {
            MPI_File_close(&fh);
            created = 1;
        }
        /* the check for file has to happen after file created. only need one
         * process, though, not a full barrier.  The message is sent even when
         * the create failed: the challenge rank is waiting for it, and it
         * carries the outcome so nobody is left hanging. */
        MPI_Send(&created, 1, MPI_INT, challenge_rank, 0, comm);
    }

    if (rank == challenge_rank) {
        MPI_File fh;

        MPI_Wait(&check_req, MPI_STATUS_IGNORE);

        /* too bad there's no ADIO equivalent of access: we'll have to
         * open/close the file instead */

        if (!probe_ready) {
            /* nothing to look for: rank 0 never managed to create the file */
            globally_visible = -1;
        } else if (MPI_File_open(MPI_COMM_SELF, filename, MPI_MODE_RDONLY,
                                 MPI_INFO_NULL, &fh) == MPI_SUCCESS) {
            globally_visible = 1;
            MPI_File_close(&fh);
        } else {
            /* do not report error up to caller.  we are merely testing the
             * presence of the file */
            globally_visible = 0;
        }
    }
    MPI_Bcast(&globally_visible, 1, MPI_INT, challenge_rank, comm);

    if (globally_visible < 0) {
        /* everybody learns about the failed create and gives up together;
         * rank 0 reports the error it actually got */
        mpi_errno = (rank == 0) ? create_errno : MPI_ERR_IO;
        *newcomm = MPI_COMM_NULL;
        goto fn_exit;
    }

    /*   with the above assumptions, we have two cases for a file
     *   created on one process:
     *   -- either a process not in the group can access it (globally
     *      accessible parallel file system)
     *   -- or a process not in the group cannot access it (node-local
     *      storage of some sort) */

    if (globally_visible) {
        mpi_errno = MPI_Comm_dup(comm, newcomm);
    } else {
        mpi_errno = MPI_Comm_split(comm, id, key, newcomm);
    }
    if (rank == 0)
        MPI_File_delete(filename, MPI_INFO_NULL);

  fn_exit:
    /* buffers may still be NULL if an allocation failed */
    if (all_ids != NULL)
        MPL_free(all_ids);
    if (filename != NULL)
        MPL_free(filename);
    return mpi_errno;
}
Packit Service c5cf8c
Packit Service c5cf8c
/* not to be called directly (note the MPIR_ prefix), but instead from
Packit Service c5cf8c
 * MPI-level MPI_Comm_split_type implementation (e.g.
Packit Service c5cf8c
 * MPIR_Comm_split_type_impl). */
Packit Service c5cf8c
#undef FUNCNAME
Packit Service c5cf8c
#define FUNCNAME MPIR_Comm_split_filesystem
Packit Service c5cf8c
#undef FCNAME
Packit Service c5cf8c
#define FCNAME MPL_QUOTE(FUNCNAME)
Packit Service c5cf8c
Packit Service c5cf8c
/* split communicator based on access to directory 'dirname'. */
Packit Service c5cf8c
int MPIR_Comm_split_filesystem(MPI_Comm comm, int key, const char *dirname, MPI_Comm * newcomm)
Packit Service c5cf8c
{
Packit Service c5cf8c
    int mpi_errno = MPI_SUCCESS;
Packit Service c5cf8c
    char *s;
Packit Service c5cf8c
    if ((s = getenv("MPIX_SPLIT_DISABLE_HEURISTIC")) != NULL) {
Packit Service c5cf8c
        mpi_errno = comm_split_filesystem_exhaustive(comm, key, dirname, newcomm);
Packit Service c5cf8c
    } else {
Packit Service c5cf8c
        mpi_errno = comm_split_filesystem_heuristic(comm, key, dirname, newcomm);
Packit Service c5cf8c
    }
Packit Service c5cf8c
    return mpi_errno;
Packit Service c5cf8c
}
Packit Service c5cf8c
Packit Service c5cf8c
/*
Packit Service c5cf8c
 * vim: ts=8 sts=4 sw=4 noexpandtab
Packit Service c5cf8c
 */