/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
/*
 *
 *  (C) 2009 by Argonne National Laboratory.
 *      See COPYRIGHT in top-level directory.
 */
#include "mpi.h"
#include <stdio.h>
#include <stdlib.h>
#include "mpitest.h"

/*
static char MTEST_Descrip[] = "Test error reporting from faults with point to point communication";
*/

int ReportErr(int errcode, const char name[]);

/*
 * Test error reporting from simulated process faults with point-to-point
 * communication.
 *
 * MPI_COMM_WORLD is split into two groups by "color":
 *   color 1 - world ranks 1 .. wsize/2, which "fault" by calling exit(1)
 *             right after the initial barrier;
 *   color 0 - world rank 0 and world ranks wsize/2+1 .. wsize-1, which
 *             keep running.
 *
 * The surviving processes then check that
 *   1. point-to-point traffic within their own communicator still works, and
 *   2. a synchronous send to each faulted process in MPI_COMM_WORLD returns
 *      an error instead of succeeding.
 *
 * World rank 0 prints " No Errors" when the summed error count over the
 * surviving group is zero, or " Found N errors" otherwise.  The process
 * exit status of the survivors is always 0.
 */
int main(int argc, char *argv[])
{
    int wrank, wsize, rank, size, color;
    int j, tmp;
    /* toterrs starts at 0 so that it is never read uninitialized if the
     * final reduction fails */
    int err, toterrs = 0, errs = 0;
    MPI_Comm newcomm;

    MPI_Init(&argc, &argv);

    MPI_Comm_size(MPI_COMM_WORLD, &wsize);
    MPI_Comm_rank(MPI_COMM_WORLD, &wrank);

    /* Color is 0 or 1; 1 will be the processes that "fault" */
    /* process 0 and wsize/2+1...wsize-1 are in non-faulting group */
    color = (wrank > 0) && (wrank <= wsize / 2);
    MPI_Comm_split(MPI_COMM_WORLD, color, wrank, &newcomm);

    MPI_Comm_size(newcomm, &size);
    MPI_Comm_rank(newcomm, &rank);

    /* Set errors return on COMM_WORLD and the new comm.  Note the argument
     * order: the communicator comes first, then the error handler. */
    MPI_Comm_set_errhandler(MPI_COMM_WORLD, MPI_ERRORS_RETURN);
    MPI_Comm_set_errhandler(newcomm, MPI_ERRORS_RETURN);

    err = MPI_Barrier(MPI_COMM_WORLD);
    if (err)
        errs += ReportErr(err, "Barrier");
    if (color) {
        /* Simulate a fault on some processes: leave without MPI_Finalize */
        exit(1);
    }
    else {
        /* To improve the chance that the "faulted" processes will have
         * exited, wait for 1 second */
        MTestSleep(1);
    }

    /* Can we still use newcomm?  Each process first receives one message
     * from every lower rank and then sends one message to every higher
     * rank, so rank 0 starts the chain and no cyclic wait can form. */
    for (j = 0; j < rank; j++) {
        err = MPI_Recv(&tmp, 1, MPI_INT, j, 0, newcomm, MPI_STATUS_IGNORE);
        if (err)
            errs += ReportErr(err, "Recv");
    }
    for (j = rank + 1; j < size; j++) {
        err = MPI_Send(&rank, 1, MPI_INT, j, 0, newcomm);
        if (err)
            errs += ReportErr(err, "Send");
    }

    /* Now, try sending in MPI_COMM_WORLD on dead processes */
    /* There is a race condition here - we don't know for sure that the faulted
     * processes have exited.  However, we can ensure a failure by using
     * synchronous sends - the sender will wait until the receiver matches
     * the message, which will not happen (the process will exit
     * without matching the message, even if it has not yet exited). */
    for (j = 1; j <= wsize / 2; j++) {
        err = MPI_Ssend(&rank, 1, MPI_INT, j, 0, MPI_COMM_WORLD);
        if (!err) {
            errs++;
            fprintf(stderr, "Ssend succeeded to dead process %d\n", j);
        }
    }

    /* Sum the error counts over the surviving processes only */
    err = MPI_Allreduce(&errs, &toterrs, 1, MPI_INT, MPI_SUM, newcomm);
    if (err) {
        errs += ReportErr(err, "Allreduce");
        /* The reduced value cannot be trusted; fall back to the local
         * count so that the failure is still reported below */
        toterrs = errs;
    }
    MPI_Comm_free(&newcomm);

    MPI_Finalize();

    if (wrank == 0) {
        if (toterrs > 0) {
            printf(" Found %d errors\n", toterrs);
        }
        else {
            printf(" No Errors\n");
        }
    }

    return 0;
}

/*
 * Write a one-line description of an MPI error to stderr.
 *
 *   errcode - error code returned by an MPI call
 *   name    - label for the operation that failed; printed verbatim
 *
 * The line contains the label, the raw error code, the error class
 * obtained from MPI_Error_class and the message text obtained from
 * MPI_Error_string.
 *
 * Always returns 1, so callers can add the result to an error counter.
 */
int ReportErr(int errcode, const char name[])
{
    char text[MPI_MAX_ERROR_STRING];
    int textlen;        /* required by MPI_Error_string; not used afterwards */
    int cls;

    MPI_Error_string(errcode, text, &textlen);
    MPI_Error_class(errcode, &cls);

    fprintf(stderr, "In %s, error code %d(class %d) = %s\n", name, errcode, cls, text);

    return 1;
}