/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
/*
 * (C) 2016 UChicago/Argonne LLC
 * See COPYRIGHT in top-level directory.
 */

#include "mpioimpl.h"

#ifdef HAVE_STDLIB_H
#include <stdlib.h>
#endif
#ifdef HAVE_DIRENT_H
#include <dirent.h>
#endif

static int comm_split_filesystem_exhaustive(MPI_Comm comm, int key, const char *dirname,
                                            MPI_Comm * newcomm)
{
    /* If you run this at scale against GPFS, be prepared to spend 30 minutes
     * creating 10,000 files -- and the numbers only get worse from there.
     *
     * - create a random directory
     * - create files in that directory
     * - based on the visible files, construct a new group, then a new
     *   communicator
     * - there are no directory operation routines in MPI, so we do it via
     *   POSIX
     *
     * Note that 'key' is not used here: ranks in the new communicator
     * follow readdir() order. */
    int rank, nprocs, ret, fd;
    int *ranks;
    MPI_Group comm_group, newgroup;
    int j = 0, mpi_errno = MPI_SUCCESS;
    char *filename = NULL, *testdirname = NULL;
    DIR *dir;
    struct dirent *entry;

    MPI_Comm_rank(comm, &rank);
    MPI_Comm_size(comm, &nprocs);

    /* Rank zero constructs the candidate directory name (just the name).
     * Every process creates the directory, though -- a headache for the
     * file system at scale.  Don't do this on a large parallel file
     * system! */
    testdirname = MPL_malloc(PATH_MAX, MPL_MEM_IO);
    filename = MPL_malloc(PATH_MAX, MPL_MEM_IO);
    ranks = MPL_malloc(nprocs * sizeof(int), MPL_MEM_IO);

    if (rank == 0)
        MPL_create_pathname(testdirname, dirname, ".commonfstest.0", 1);
    MPI_Bcast(testdirname, PATH_MAX, MPI_BYTE, 0, comm);

    /* Ignore EEXIST: quite likely another process will have made this
     * directory already, but since the whole point is to figure out who we
     * share this directory with, brute force it is! */
    ret = mkdir(testdirname, S_IRWXU);
    if (ret == -1 && errno != EEXIST)
        goto fn_fail;

    MPL_snprintf(filename, PATH_MAX, "%s/%d", testdirname, rank);
    fd = open(filename, O_CREAT, S_IRUSR | S_IWUSR);
    if (fd == -1)
        goto fn_fail;
    close(fd);

    MPI_Barrier(comm);

    /* Each process has created a file in an M-way shared directory, where M
     * is in the range [1, nprocs].  Let's see who else can see this
     * directory: every visible file is named after the rank that created
     * it. */
    if ((dir = opendir(testdirname)) == NULL)
        goto fn_fail;
    while ((entry = readdir(dir)) != NULL) {
        if (strcmp(entry->d_name, ".") == 0)
            continue;
        if (strcmp(entry->d_name, "..") == 0)
            continue;
        ranks[j++] = atoi(entry->d_name);
    }
    closedir(dir);

    MPI_Comm_group(comm, &comm_group);
    MPI_Group_incl(comm_group, j, ranks, &newgroup);
    MPI_Comm_create(comm, newgroup, newcomm);
    MPI_Group_free(&newgroup);
    MPI_Group_free(&comm_group);

    unlink(filename);           /* ok to ignore errors */
    rmdir(testdirname);         /* fails until the last process cleans up */

  fn_exit:
    MPL_free(ranks);
    MPL_free(filename);
    MPL_free(testdirname);
    return mpi_errno;
  fn_fail:
    mpi_errno = MPI_ERR_OTHER;  /* generic error; consult errno for details */
    goto fn_exit;
}
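/* The probe above, reduced to a stand-alone program.  A minimal sketch for
 * illustration only: plain malloc/snprintf and a hard-coded directory
 * ("/tmp/.fstest" is a made-up name) stand in for the MPL_malloc /
 * MPL_snprintf / MPL_create_pathname helpers, and error handling is
 * skipped.  Run it with a few ranks to see who lands in the same
 * communicator.
 *
 *   #include <stdio.h>
 *   #include <stdlib.h>
 *   #include <string.h>
 *   #include <fcntl.h>
 *   #include <unistd.h>
 *   #include <dirent.h>
 *   #include <sys/stat.h>
 *   #include <mpi.h>
 *
 *   int main(int argc, char **argv)
 *   {
 *       int rank, nprocs, j = 0, fd, *ranks;
 *       char file[256];
 *       const char *dir = "/tmp/.fstest";   // hypothetical test directory
 *       DIR *d;
 *       struct dirent *e;
 *       MPI_Group world, shared;
 *       MPI_Comm newcomm;
 *
 *       MPI_Init(&argc, &argv);
 *       MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 *       MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
 *       ranks = malloc(nprocs * sizeof(int));
 *
 *       mkdir(dir, S_IRWXU);                // EEXIST is expected here
 *       snprintf(file, sizeof(file), "%s/%d", dir, rank);
 *       fd = open(file, O_CREAT, S_IRUSR | S_IWUSR);
 *       if (fd != -1)
 *           close(fd);
 *       MPI_Barrier(MPI_COMM_WORLD);
 *
 *       // every file visible in the directory names a rank we share with
 *       d = opendir(dir);
 *       while (d != NULL && (e = readdir(d)) != NULL) {
 *           if (strcmp(e->d_name, ".") && strcmp(e->d_name, ".."))
 *               ranks[j++] = atoi(e->d_name);
 *       }
 *       if (d != NULL)
 *           closedir(d);
 *
 *       MPI_Comm_group(MPI_COMM_WORLD, &world);
 *       MPI_Group_incl(world, j, ranks, &shared);
 *       MPI_Comm_create(MPI_COMM_WORLD, shared, &newcomm);
 *       printf("rank %d: %d rank(s) share the directory\n", rank, j);
 *
 *       unlink(file);
 *       rmdir(dir);
 *       if (newcomm != MPI_COMM_NULL)
 *           MPI_Comm_free(&newcomm);
 *       MPI_Group_free(&shared);
 *       MPI_Group_free(&world);
 *       free(ranks);
 *       MPI_Finalize();
 *       return 0;
 *   }
 */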
static int comm_split_filesystem_heuristic(MPI_Comm comm, int key, const char *dirname,
                                           MPI_Comm * newcomm)
{
    int i, mpi_errno = MPI_SUCCESS;
    int rank, nprocs;
    int id;
    int32_t id32;
    int32_t *all_ids;
    char *filename = NULL;
    int challenge_rank, globally_visible = 0;
    MPI_Request check_req;

    MPI_Comm_rank(comm, &rank);
    MPI_Comm_size(comm, &nprocs);
    MPIR_Get_node_id(comm, rank, &id);

    /* We could detect the common file systems by parsing 'df'-style output,
     * but that is fidgety, fragile, and error-prone.  Instead, determine who
     * shares a file system through testing.
     *
     * As an optimization, we should try to avoid creating a lot of files: we
     * want something that could work at hundreds of thousands of nodes, and
     * creating a hundred thousand files in a directory is a recipe for
     * sadness.
     *
     * In CH3 and in wider practice, "shared memory" is the same as "on the
     * same node", so let's start there:
     *
     * - create a file on one process
     * - pick a process outside the "on this node" group
     * - if that process can see the file, then assume the file is visible
     *   to all groups
     *
     * Note that this scheme works really well for traditional Linux
     * clusters -- think nodes with a local scratch drive.  It works less
     * well for a deeper hierarchy: what if the directory in question were
     * hosted by an I/O forwarding agent? */

    /* Learn a bit about what groups were created: as a scalable
     * optimization we want to check a file's presence from a group other
     * than the one that created it. */
    all_ids = MPL_malloc(nprocs * sizeof(*all_ids), MPL_MEM_IO);
    id32 = (int32_t) id;        /* gather as a fixed-width type */
    mpi_errno = MPI_Gather(&id32, 1, MPI_INT32_T, all_ids, 1, MPI_INT32_T, 0, comm);

    if (rank == 0) {
        for (i = 0; i < nprocs; i++) {
            if (all_ids[i] != id32)
                break;
        }
        if (i >= nprocs)
            /* everyone is in the same group; pick a process other than
             * rank 0 just in case the file system is really weird */
            challenge_rank = nprocs - 1;
        else
            challenge_rank = i;
    }
    mpi_errno = MPI_Bcast(&challenge_rank, 1, MPI_INT, 0, comm);

    /* Now that we have informally lumped everyone into groups based on node
     * (as shared memory does), it's time to poke the file system and see
     * which group can see which files. */

    /* Here come a bunch of assumptions:
     * - file system layouts are homogeneous: if one system has /scratch,
     *   all have /scratch
     * - a globally visible parallel file system has the same name
     *   everywhere, e.g. /gpfs/users/something
     * - a file created on one node is deterministically visible on
     *   another (NFS has problems with this)
     * - if a process from one group creates a file, and a process from
     *   another group finds that file, then a process from every group
     *   can find that file */

    /* Is the file globally visible to all?  Create on rank 0, test on a
     * different off-group rank.  A single zero-byte message orders the
     * check after the create; the receive is posted up front in case we
     * end up creating and checking on the same rank. */
    filename = MPL_calloc(PATH_MAX, sizeof(char), MPL_MEM_IO);
    if (rank == 0)
        MPL_create_pathname(filename, dirname, ".commonfstest.0", 0);
    MPI_Bcast(filename, PATH_MAX, MPI_BYTE, 0, comm);

    if (rank == challenge_rank) {
        MPI_Irecv(NULL, 0, MPI_BYTE, 0, 0, comm, &check_req);
    }

    if (rank == 0) {
        MPI_File fh;
        /* a leftover test file from an earlier aborted run will make this
         * MPI_MODE_EXCL open fail */
        mpi_errno = MPI_File_open(MPI_COMM_SELF, filename,
                                  MPI_MODE_CREATE | MPI_MODE_EXCL | MPI_MODE_WRONLY,
                                  MPI_INFO_NULL, &fh);
        if (mpi_errno != MPI_SUCCESS)
            goto fn_exit;
        MPI_File_close(&fh);
        /* the check for the file has to happen after the file is created.
         * Only one process needs to know, though -- no full barrier */
        MPI_Send(NULL, 0, MPI_BYTE, challenge_rank, 0, comm);
    }

    if (rank == challenge_rank) {
        MPI_File fh;
        MPI_Wait(&check_req, MPI_STATUS_IGNORE);
        /* too bad there's no ADIO equivalent of access(2): we'll have to
         * open/close the file instead */
        mpi_errno = MPI_File_open(MPI_COMM_SELF, filename, MPI_MODE_RDONLY,
                                  MPI_INFO_NULL, &fh);
        if (mpi_errno == MPI_SUCCESS) {
            globally_visible = 1;
            MPI_File_close(&fh);
        } else {
            /* do not report the error up to the caller: we are merely
             * testing for the presence of the file */
            mpi_errno = MPI_SUCCESS;
            globally_visible = 0;
        }
    }
    MPI_Bcast(&globally_visible, 1, MPI_INT, challenge_rank, comm);

    /* With the above assumptions, there are two cases for a file created
     * on one process:
     * -- either a process not in the group can access it (a globally
     *    accessible parallel file system)
     * -- or a process not in the group cannot access it (node-local
     *    storage of some sort) */
    if (globally_visible) {
        MPI_Comm_dup(comm, newcomm);
    } else {
        MPI_Comm_split(comm, id, key, newcomm);
    }
    if (rank == 0)
        MPI_File_delete(filename, MPI_INFO_NULL);

  fn_exit:
    MPL_free(all_ids);
    MPL_free(filename);
    return mpi_errno;
}
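/* The heuristic's create-then-probe handshake, reduced to its essentials.
 * A minimal sketch for illustration: POSIX access(2) stands in for the
 * MPI_File_open()/MPI_File_close() pair (the real code goes through MPI-IO
 * so that the ADIO layer's view of the path is what gets tested), the
 * challenge rank is simply the last rank, and "/tmp/.fsprobe" is a made-up
 * path.  Run with at least two processes: unlike the code above, this
 * sketch does not pre-post the receive, so it is not safe when sender and
 * receiver are the same rank.
 *
 *   #include <stdio.h>
 *   #include <fcntl.h>
 *   #include <unistd.h>
 *   #include <mpi.h>
 *
 *   int main(int argc, char **argv)
 *   {
 *       int rank, nprocs, challenge, visible = 0;
 *       const char *file = "/tmp/.fsprobe";  // hypothetical test file
 *
 *       MPI_Init(&argc, &argv);
 *       MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 *       MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
 *       challenge = nprocs - 1;
 *
 *       if (rank == 0) {
 *           int fd = open(file, O_CREAT | O_EXCL | O_WRONLY, 0600);
 *           if (fd != -1)
 *               close(fd);
 *           // zero-byte send orders the probe strictly after the create
 *           MPI_Send(NULL, 0, MPI_BYTE, challenge, 0, MPI_COMM_WORLD);
 *       }
 *       if (rank == challenge) {
 *           MPI_Recv(NULL, 0, MPI_BYTE, 0, 0, MPI_COMM_WORLD,
 *                    MPI_STATUS_IGNORE);
 *           visible = (access(file, F_OK) == 0);
 *       }
 *       MPI_Bcast(&visible, 1, MPI_INT, challenge, MPI_COMM_WORLD);
 *       if (rank == 0) {
 *           printf("%s\n", visible ? "globally visible" : "node-local");
 *           unlink(file);
 *       }
 *       MPI_Finalize();
 *       return 0;
 *   }
 */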
/* not to be called directly (note the MPIR_ prefix), but instead from the
 * MPI-level MPI_Comm_split_type implementation (e.g.
 * MPIR_Comm_split_type_impl) */
#undef FUNCNAME
#define FUNCNAME MPIR_Comm_split_filesystem
#undef FCNAME
#define FCNAME MPL_QUOTE(FUNCNAME)
/* split a communicator based on access to the directory 'dirname' */
int MPIR_Comm_split_filesystem(MPI_Comm comm, int key, const char *dirname, MPI_Comm * newcomm)
{
    int mpi_errno = MPI_SUCCESS;

    /* the environment variable is a big hammer for turning off the faster
     * heuristic in favor of the exhaustive check */
    if (getenv("MPIX_SPLIT_DISABLE_HEURISTIC") != NULL) {
        mpi_errno = comm_split_filesystem_exhaustive(comm, key, dirname, newcomm);
    } else {
        mpi_errno = comm_split_filesystem_heuristic(comm, key, dirname, newcomm);
    }
    return mpi_errno;
}

/*
 * vim: ts=8 sts=4 sw=4 noexpandtab
 */
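/* Example of reaching this routine from user code.  A sketch, assuming an
 * MPICH build in which MPI_Comm_split_type() accepts the non-standard
 * MPIX_COMM_TYPE_NEIGHBORHOOD split type and routes a "nbhd_common_dirname"
 * info hint to MPIR_Comm_split_filesystem() -- check your mpi.h and release
 * notes before relying on either name.  "/scratch" is just an example
 * directory.
 *
 *   #include <stdio.h>
 *   #include <mpi.h>
 *
 *   int main(int argc, char **argv)
 *   {
 *       int rank, fsrank;
 *       MPI_Info info;
 *       MPI_Comm fscomm;
 *
 *       MPI_Init(&argc, &argv);
 *       MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 *
 *       MPI_Info_create(&info);
 *       MPI_Info_set(info, "nbhd_common_dirname", "/scratch");
 *       MPI_Comm_split_type(MPI_COMM_WORLD, MPIX_COMM_TYPE_NEIGHBORHOOD,
 *                           rank, info, &fscomm);
 *
 *       MPI_Comm_rank(fscomm, &fsrank);
 *       printf("world rank %d -> file system rank %d\n", rank, fsrank);
 *
 *       MPI_Comm_free(&fscomm);
 *       MPI_Info_free(&info);
 *       MPI_Finalize();
 *       return 0;
 *   }
 *
 * Setting MPIX_SPLIT_DISABLE_HEURISTIC in the environment steers the call
 * into the exhaustive path above instead of the heuristic one.
 */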