Blame test/mpi/perf/manyrma.c

Packit 0848f5
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
Packit 0848f5
/*
Packit 0848f5
 *  (C) 2010 by Argonne National Laboratory.
Packit 0848f5
 *      See COPYRIGHT in top-level directory.
Packit 0848f5
 */
Packit 0848f5
Packit 0848f5
/* This test measures the performance of many rma operations to a single
Packit 0848f5
   target process.
Packit 0848f5
   It uses a number of operations (put or accumulate) to different
Packit 0848f5
   locations in the target window.
Packit 0848f5
   This is one of the ways that RMA may be used, and is used in the
Packit 0848f5
   reference implementation of the graph500 benchmark.
Packit 0848f5
*/
Packit 0848f5
#include "mpi.h"
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
Packit 0848f5
Packit 0848f5
/* Largest number of RMA operations issued in one timed run; also the default
 * for -maxcount.  The expansion is parenthesized so that the macro behaves as
 * a single value inside larger expressions (e.g. x / MAX_COUNT). */
#define MAX_COUNT (65536*4)
/* Default for the largest number of ints moved by a single RMA operation. */
#define MAX_RMA_SIZE 16
/* Number of timed repetitions of every (count, size) combination. */
#define MAX_RUNS 10
Packit 0848f5
Packit 0848f5
/* Synchronization methods that can be timed.  The individual methods are
 * distinct bit flags; SYNC_ALL is -1 so that it tests true against every
 * flag mask. */
typedef enum {
    SYNC_NONE = 0,
    SYNC_FENCE = 1,
    SYNC_LOCK = 2,
    SYNC_PSCW = 4,
    SYNC_ALL = -1
} sync_t;

/* RMA operations that can be timed, encoded the same way as sync_t.
 * Note GET not yet implemented. */
typedef enum {
    RMA_NONE = 0,
    RMA_PUT = 1,
    RMA_ACC = 2,
    RMA_GET = 4,
    RMA_ALL = -1
} rma_t;

/* Current selections; both start out as "everything". */
sync_t syncChoice = SYNC_ALL;
rma_t rmaChoice = RMA_ALL;
Packit 0848f5
Packit 0848f5
/* Timestamps recorded for one timed run. */
typedef struct {
    double startOp;             /* just before the first RMA call is issued */
    double endOp;               /* just after the last RMA call has returned */
    double endSync;             /* after the closing synchronization call */
} timing;

/* Nonzero makes PrintResults emit a result line. */
static int verbose = 1;
/* Set by -barrier: insert an MPI_Barrier before the closing synchronization. */
static int barrierSync = 0;
/* Set in main to ten times the largest MPI_Wtick() over all processes. */
static double tickThreshold = 0.0;
Packit 0848f5
Packit 0848f5
void PrintResults(int cnt, timing t[]);
Packit 0848f5
void RunAccFence(MPI_Win win, int destRank, int cnt, int sz, timing t[]);
Packit 0848f5
void RunAccLock(MPI_Win win, int destRank, int cnt, int sz, timing t[]);
Packit 0848f5
void RunPutFence(MPI_Win win, int destRank, int cnt, int sz, timing t[]);
Packit 0848f5
void RunPutLock(MPI_Win win, int destRank, int cnt, int sz, timing t[]);
Packit 0848f5
void RunAccPSCW(MPI_Win win, int destRank, int cnt, int sz,
Packit 0848f5
                MPI_Group exposureGroup, MPI_Group accessGroup, timing t[]);
Packit 0848f5
void RunPutPSCW(MPI_Win win, int destRank, int cnt, int sz,
Packit 0848f5
                MPI_Group exposureGroup, MPI_Group accessGroup, timing t[]);
Packit 0848f5
Packit 0848f5
int main(int argc, char *argv[])
Packit 0848f5
{
Packit 0848f5
    int arraysize, i, cnt, sz, maxCount = MAX_COUNT, *arraybuffer;
Packit 0848f5
    int wrank, wsize, destRank, srcRank;
Packit 0848f5
    MPI_Win win;
Packit 0848f5
    MPI_Group wgroup, accessGroup, exposureGroup;
Packit 0848f5
    timing t[MAX_RUNS];
Packit 0848f5
    int maxSz = MAX_RMA_SIZE;
Packit 0848f5
Packit 0848f5
    MPI_Init(&argc, &argv);
Packit 0848f5
Packit 0848f5
    /* Determine clock accuracy */
Packit 0848f5
    tickThreshold = 10.0 * MPI_Wtick();
Packit 0848f5
    MPI_Allreduce(MPI_IN_PLACE, &tickThreshold, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
Packit 0848f5
Packit 0848f5
    for (i = 1; i < argc; i++) {
Packit 0848f5
        if (strcmp(argv[i], "-put") == 0) {
Packit 0848f5
            if (rmaChoice == RMA_ALL)
Packit 0848f5
                rmaChoice = RMA_NONE;
Packit 0848f5
            rmaChoice |= RMA_PUT;
Packit 0848f5
        }
Packit 0848f5
        else if (strcmp(argv[i], "-acc") == 0) {
Packit 0848f5
            if (rmaChoice == RMA_ALL)
Packit 0848f5
                rmaChoice = RMA_NONE;
Packit 0848f5
            rmaChoice |= RMA_ACC;
Packit 0848f5
        }
Packit 0848f5
        else if (strcmp(argv[i], "-fence") == 0) {
Packit 0848f5
            if (syncChoice == SYNC_ALL)
Packit 0848f5
                syncChoice = SYNC_NONE;
Packit 0848f5
            syncChoice |= SYNC_FENCE;
Packit 0848f5
        }
Packit 0848f5
        else if (strcmp(argv[i], "-lock") == 0) {
Packit 0848f5
            if (syncChoice == SYNC_ALL)
Packit 0848f5
                syncChoice = SYNC_NONE;
Packit 0848f5
            syncChoice |= SYNC_LOCK;
Packit 0848f5
        }
Packit 0848f5
        else if (strcmp(argv[i], "-pscw") == 0) {
Packit 0848f5
            if (syncChoice == SYNC_ALL)
Packit 0848f5
                syncChoice = SYNC_NONE;
Packit 0848f5
            syncChoice |= SYNC_PSCW;
Packit 0848f5
        }
Packit 0848f5
        else if (strcmp(argv[i], "-maxsz") == 0) {
Packit 0848f5
            i++;
Packit 0848f5
            maxSz = atoi(argv[i]);
Packit 0848f5
        }
Packit 0848f5
        else if (strcmp(argv[i], "-maxcount") == 0) {
Packit 0848f5
            i++;
Packit 0848f5
            maxCount = atoi(argv[i]);
Packit 0848f5
        }
Packit 0848f5
        else if (strcmp(argv[i], "-barrier") == 0) {
Packit 0848f5
            barrierSync = 1;
Packit 0848f5
        }
Packit 0848f5
        else {
Packit 0848f5
            fprintf(stderr, "Unrecognized argument %s\n", argv[i]);
Packit 0848f5
            fprintf(stderr,
Packit 0848f5
                    "%s [ -put ] [ -acc ] [ -lock ] [ -fence ] [ -pscw ] [ -barrier ]  [ -maxsz msgsize ]\n",
Packit 0848f5
                    argv[0]);
Packit 0848f5
            MPI_Abort(MPI_COMM_WORLD, 1);
Packit 0848f5
        }
Packit 0848f5
    }
Packit 0848f5
Packit 0848f5
    MPI_Comm_rank(MPI_COMM_WORLD, &wrank);
Packit 0848f5
    MPI_Comm_size(MPI_COMM_WORLD, &wsize);
Packit 0848f5
    destRank = wrank + 1;
Packit 0848f5
    while (destRank >= wsize)
Packit 0848f5
        destRank = destRank - wsize;
Packit 0848f5
    srcRank = wrank - 1;
Packit 0848f5
    if (srcRank < 0)
Packit 0848f5
        srcRank += wsize;
Packit 0848f5
Packit 0848f5
    /* Create groups for PSCW */
Packit 0848f5
    MPI_Comm_group(MPI_COMM_WORLD, &wgroup);
Packit 0848f5
    MPI_Group_incl(wgroup, 1, &destRank, &accessGroup);
Packit 0848f5
    MPI_Group_incl(wgroup, 1, &srcRank, &exposureGroup);
Packit 0848f5
    MPI_Group_free(&wgroup);
Packit 0848f5
Packit 0848f5
    arraysize = maxSz * MAX_COUNT;
Packit 0848f5
    arraybuffer = (int *) malloc(arraysize * sizeof(int));
Packit 0848f5
    if (!arraybuffer) {
Packit 0848f5
        fprintf(stderr, "Unable to allocate %d words\n", arraysize);
Packit 0848f5
        MPI_Abort(MPI_COMM_WORLD, 1);
Packit 0848f5
    }
Packit 0848f5
Packit 0848f5
    MPI_Win_create(arraybuffer, arraysize * sizeof(int), (int) sizeof(int),
Packit 0848f5
                   MPI_INFO_NULL, MPI_COMM_WORLD, &win);
Packit 0848f5
Packit 0848f5
    /* FIXME: we need a test on performance consistency.
Packit 0848f5
     * The test needs to have both a relative growth limit and
Packit 0848f5
     * an absolute limit.
Packit 0848f5
     */
Packit 0848f5
Packit 0848f5
    if (maxCount > MAX_COUNT) {
Packit 0848f5
        fprintf(stderr, "MaxCount must not exceed %d\n", MAX_COUNT);
Packit 0848f5
        MPI_Abort(MPI_COMM_WORLD, 1);
Packit 0848f5
    }
Packit 0848f5
Packit 0848f5
    if ((syncChoice & SYNC_FENCE) && (rmaChoice & RMA_ACC)) {
Packit 0848f5
        for (sz = 1; sz <= maxSz; sz = sz + sz) {
Packit 0848f5
            if (wrank == 0)
Packit 0848f5
                printf("Accumulate with fence, %d elements\n", sz);
Packit 0848f5
            cnt = 1;
Packit 0848f5
            while (cnt <= maxCount) {
Packit 0848f5
                RunAccFence(win, destRank, cnt, sz, t);
Packit 0848f5
                if (wrank == 0) {
Packit 0848f5
                    PrintResults(cnt, t);
Packit 0848f5
                }
Packit 0848f5
                cnt = 2 * cnt;
Packit 0848f5
            }
Packit 0848f5
        }
Packit 0848f5
    }
Packit 0848f5
Packit 0848f5
    if ((syncChoice & SYNC_LOCK) && (rmaChoice & RMA_ACC)) {
Packit 0848f5
        for (sz = 1; sz <= maxSz; sz = sz + sz) {
Packit 0848f5
            if (wrank == 0)
Packit 0848f5
                printf("Accumulate with lock, %d elements\n", sz);
Packit 0848f5
            cnt = 1;
Packit 0848f5
            while (cnt <= maxCount) {
Packit 0848f5
                RunAccLock(win, destRank, cnt, sz, t);
Packit 0848f5
                if (wrank == 0) {
Packit 0848f5
                    PrintResults(cnt, t);
Packit 0848f5
                }
Packit 0848f5
                cnt = 2 * cnt;
Packit 0848f5
            }
Packit 0848f5
        }
Packit 0848f5
    }
Packit 0848f5
Packit 0848f5
    if ((syncChoice & SYNC_FENCE) && (rmaChoice & RMA_PUT)) {
Packit 0848f5
        for (sz = 1; sz <= maxSz; sz = sz + sz) {
Packit 0848f5
            if (wrank == 0)
Packit 0848f5
                printf("Put with fence, %d elements\n", sz);
Packit 0848f5
            cnt = 1;
Packit 0848f5
            while (cnt <= maxCount) {
Packit 0848f5
                RunPutFence(win, destRank, cnt, sz, t);
Packit 0848f5
                if (wrank == 0) {
Packit 0848f5
                    PrintResults(cnt, t);
Packit 0848f5
                }
Packit 0848f5
                cnt = 2 * cnt;
Packit 0848f5
            }
Packit 0848f5
        }
Packit 0848f5
    }
Packit 0848f5
Packit 0848f5
    if ((syncChoice & SYNC_LOCK) && (rmaChoice & RMA_PUT)) {
Packit 0848f5
        for (sz = 1; sz <= maxSz; sz = sz + sz) {
Packit 0848f5
            if (wrank == 0)
Packit 0848f5
                printf("Put with lock, %d elements\n", sz);
Packit 0848f5
            cnt = 1;
Packit 0848f5
            while (cnt <= maxCount) {
Packit 0848f5
                RunPutLock(win, destRank, cnt, sz, t);
Packit 0848f5
                if (wrank == 0) {
Packit 0848f5
                    PrintResults(cnt, t);
Packit 0848f5
                }
Packit 0848f5
                cnt = 2 * cnt;
Packit 0848f5
            }
Packit 0848f5
        }
Packit 0848f5
    }
Packit 0848f5
Packit 0848f5
    if ((syncChoice & SYNC_PSCW) && (rmaChoice & RMA_PUT)) {
Packit 0848f5
        for (sz = 1; sz <= maxSz; sz = sz + sz) {
Packit 0848f5
            if (wrank == 0)
Packit 0848f5
                printf("Put with pscw, %d elements\n", sz);
Packit 0848f5
            cnt = 1;
Packit 0848f5
            while (cnt <= maxCount) {
Packit 0848f5
                RunPutPSCW(win, destRank, cnt, sz, exposureGroup, accessGroup, t);
Packit 0848f5
                if (wrank == 0) {
Packit 0848f5
                    PrintResults(cnt, t);
Packit 0848f5
                }
Packit 0848f5
                cnt = 2 * cnt;
Packit 0848f5
            }
Packit 0848f5
        }
Packit 0848f5
    }
Packit 0848f5
Packit 0848f5
    if ((syncChoice & SYNC_PSCW) && (rmaChoice & RMA_ACC)) {
Packit 0848f5
        for (sz = 1; sz <= maxSz; sz = sz + sz) {
Packit 0848f5
            if (wrank == 0)
Packit 0848f5
                printf("Accumulate with pscw, %d elements\n", sz);
Packit 0848f5
            cnt = 1;
Packit 0848f5
            while (cnt <= maxCount) {
Packit 0848f5
                RunAccPSCW(win, destRank, cnt, sz, exposureGroup, accessGroup, t);
Packit 0848f5
                if (wrank == 0) {
Packit 0848f5
                    PrintResults(cnt, t);
Packit 0848f5
                }
Packit 0848f5
                cnt = 2 * cnt;
Packit 0848f5
            }
Packit 0848f5
        }
Packit 0848f5
    }
Packit 0848f5
Packit 0848f5
    MPI_Win_free(&win);
Packit 0848f5
Packit 0848f5
    MPI_Group_free(&accessGroup);
Packit 0848f5
    MPI_Group_free(&exposureGroup);
Packit 0848f5
Packit 0848f5
    MPI_Finalize();
Packit 0848f5
    return 0;
Packit 0848f5
}
Packit 0848f5
Packit 0848f5
Packit 0848f5
/* Time MAX_RUNS repetitions of cnt MPI_Accumulate (MPI_SUM) calls, each
 * moving sz ints to consecutive displacements in destRank's window, with
 * MPI_Win_fence opening and closing every repetition.
 *
 *   win      - window the operations are issued on
 *   destRank - target rank
 *   cnt      - number of accumulate calls per repetition
 *   sz       - number of ints per accumulate call
 *   t        - receives the timestamps of repetition k in t[k]; must have
 *              room for MAX_RUNS entries
 */
void RunAccFence(MPI_Win win, int destRank, int cnt, int sz, timing t[])
{
    int k, i, j;
    int nElems = (sz > 0) ? sz : 1;
    int *origin;

    /* Every call reads sz ints from the origin buffer, so the buffer must
     * really hold that many; a lone int would be over-read when sz > 1. */
    origin = malloc((size_t) nElems * sizeof *origin);
    if (!origin) {
        fprintf(stderr, "Unable to allocate %d words\n", nElems);
        MPI_Abort(MPI_COMM_WORLD, 1);
        return;
    }
    for (i = 0; i < nElems; i++)
        origin[i] = 1;

    for (k = 0; k < MAX_RUNS; k++) {
        MPI_Barrier(MPI_COMM_WORLD);
        MPI_Win_fence(0, win);
        j = 0;
        t[k].startOp = MPI_Wtime();
        for (i = 0; i < cnt; i++) {
            MPI_Accumulate(origin, sz, MPI_INT, destRank, j, sz, MPI_INT, MPI_SUM, win);
            j += sz;
        }
        t[k].endOp = MPI_Wtime();
        if (barrierSync)
            MPI_Barrier(MPI_COMM_WORLD);
        MPI_Win_fence(0, win);
        t[k].endSync = MPI_Wtime();
    }

    /* The closing fence has completed every operation that used origin. */
    free(origin);
}
Packit 0848f5
Packit 0848f5
/* Time MAX_RUNS repetitions of cnt MPI_Accumulate (MPI_SUM) calls, each
 * moving sz ints to consecutive displacements in destRank's window, inside a
 * shared MPI_Win_lock / MPI_Win_unlock epoch on destRank.
 *
 *   win      - window the operations are issued on
 *   destRank - target rank
 *   cnt      - number of accumulate calls per repetition
 *   sz       - number of ints per accumulate call
 *   t        - receives the timestamps of repetition k in t[k]; must have
 *              room for MAX_RUNS entries
 */
void RunAccLock(MPI_Win win, int destRank, int cnt, int sz, timing t[])
{
    int k, i, j;
    int nElems = (sz > 0) ? sz : 1;
    int *origin;

    /* Every call reads sz ints from the origin buffer, so the buffer must
     * really hold that many; a lone int would be over-read when sz > 1. */
    origin = malloc((size_t) nElems * sizeof *origin);
    if (!origin) {
        fprintf(stderr, "Unable to allocate %d words\n", nElems);
        MPI_Abort(MPI_COMM_WORLD, 1);
        return;
    }
    for (i = 0; i < nElems; i++)
        origin[i] = 1;

    for (k = 0; k < MAX_RUNS; k++) {
        MPI_Barrier(MPI_COMM_WORLD);
        MPI_Win_lock(MPI_LOCK_SHARED, destRank, 0, win);
        j = 0;
        t[k].startOp = MPI_Wtime();
        for (i = 0; i < cnt; i++) {
            MPI_Accumulate(origin, sz, MPI_INT, destRank, j, sz, MPI_INT, MPI_SUM, win);
            j += sz;
        }
        t[k].endOp = MPI_Wtime();
        if (barrierSync)
            MPI_Barrier(MPI_COMM_WORLD);
        MPI_Win_unlock(destRank, win);
        t[k].endSync = MPI_Wtime();
    }

    /* The unlock has completed every operation that used origin. */
    free(origin);
}
Packit 0848f5
Packit 0848f5
/* Time MAX_RUNS repetitions of cnt MPI_Put calls, each moving sz ints to
 * consecutive displacements in destRank's window, with MPI_Win_fence opening
 * and closing every repetition.
 *
 *   win      - window the operations are issued on
 *   destRank - target rank
 *   cnt      - number of put calls per repetition
 *   sz       - number of ints per put call
 *   t        - receives the timestamps of repetition k in t[k]; must have
 *              room for MAX_RUNS entries
 */
void RunPutFence(MPI_Win win, int destRank, int cnt, int sz, timing t[])
{
    int k, i, j;
    int nElems = (sz > 0) ? sz : 1;
    int *origin;

    /* Every call reads sz ints from the origin buffer, so the buffer must
     * really hold that many; a lone int would be over-read when sz > 1. */
    origin = malloc((size_t) nElems * sizeof *origin);
    if (!origin) {
        fprintf(stderr, "Unable to allocate %d words\n", nElems);
        MPI_Abort(MPI_COMM_WORLD, 1);
        return;
    }
    for (i = 0; i < nElems; i++)
        origin[i] = 1;

    for (k = 0; k < MAX_RUNS; k++) {
        MPI_Barrier(MPI_COMM_WORLD);
        MPI_Win_fence(0, win);
        j = 0;
        t[k].startOp = MPI_Wtime();
        for (i = 0; i < cnt; i++) {
            MPI_Put(origin, sz, MPI_INT, destRank, j, sz, MPI_INT, win);
            j += sz;
        }
        t[k].endOp = MPI_Wtime();
        if (barrierSync)
            MPI_Barrier(MPI_COMM_WORLD);
        MPI_Win_fence(0, win);
        t[k].endSync = MPI_Wtime();
    }

    /* The closing fence has completed every operation that used origin. */
    free(origin);
}
Packit 0848f5
Packit 0848f5
/* Time MAX_RUNS repetitions of cnt MPI_Put calls, each moving sz ints to
 * consecutive displacements in destRank's window, inside a shared
 * MPI_Win_lock / MPI_Win_unlock epoch on destRank.
 *
 *   win      - window the operations are issued on
 *   destRank - target rank
 *   cnt      - number of put calls per repetition
 *   sz       - number of ints per put call
 *   t        - receives the timestamps of repetition k in t[k]; must have
 *              room for MAX_RUNS entries
 */
void RunPutLock(MPI_Win win, int destRank, int cnt, int sz, timing t[])
{
    int k, i, j;
    int nElems = (sz > 0) ? sz : 1;
    int *origin;

    /* Every call reads sz ints from the origin buffer, so the buffer must
     * really hold that many; a lone int would be over-read when sz > 1. */
    origin = malloc((size_t) nElems * sizeof *origin);
    if (!origin) {
        fprintf(stderr, "Unable to allocate %d words\n", nElems);
        MPI_Abort(MPI_COMM_WORLD, 1);
        return;
    }
    for (i = 0; i < nElems; i++)
        origin[i] = 1;

    for (k = 0; k < MAX_RUNS; k++) {
        MPI_Barrier(MPI_COMM_WORLD);
        MPI_Win_lock(MPI_LOCK_SHARED, destRank, 0, win);
        j = 0;
        t[k].startOp = MPI_Wtime();
        for (i = 0; i < cnt; i++) {
            MPI_Put(origin, sz, MPI_INT, destRank, j, sz, MPI_INT, win);
            j += sz;
        }
        t[k].endOp = MPI_Wtime();
        if (barrierSync)
            MPI_Barrier(MPI_COMM_WORLD);
        MPI_Win_unlock(destRank, win);
        t[k].endSync = MPI_Wtime();
    }

    /* The unlock has completed every operation that used origin. */
    free(origin);
}
Packit 0848f5
Packit 0848f5
/* Time MAX_RUNS repetitions of cnt MPI_Put calls, each moving sz ints to
 * consecutive displacements in destRank's window, using
 * MPI_Win_post/MPI_Win_start to open and MPI_Win_complete/MPI_Win_wait to
 * close every repetition.
 *
 *   win           - window the operations are issued on
 *   destRank      - target rank
 *   cnt           - number of put calls per repetition
 *   sz            - number of ints per put call
 *   exposureGroup - group passed to MPI_Win_post
 *   accessGroup   - group passed to MPI_Win_start
 *   t             - receives the timestamps of repetition k in t[k]; must
 *                   have room for MAX_RUNS entries
 */
void RunPutPSCW(MPI_Win win, int destRank, int cnt, int sz,
                MPI_Group exposureGroup, MPI_Group accessGroup, timing t[])
{
    int k, i, j;
    int nElems = (sz > 0) ? sz : 1;
    int *origin;

    /* Every call reads sz ints from the origin buffer, so the buffer must
     * really hold that many; a lone int would be over-read when sz > 1. */
    origin = malloc((size_t) nElems * sizeof *origin);
    if (!origin) {
        fprintf(stderr, "Unable to allocate %d words\n", nElems);
        MPI_Abort(MPI_COMM_WORLD, 1);
        return;
    }
    for (i = 0; i < nElems; i++)
        origin[i] = 1;

    for (k = 0; k < MAX_RUNS; k++) {
        MPI_Barrier(MPI_COMM_WORLD);
        MPI_Win_post(exposureGroup, 0, win);
        MPI_Win_start(accessGroup, 0, win);
        j = 0;
        t[k].startOp = MPI_Wtime();
        for (i = 0; i < cnt; i++) {
            MPI_Put(origin, sz, MPI_INT, destRank, j, sz, MPI_INT, win);
            j += sz;
        }
        t[k].endOp = MPI_Wtime();
        if (barrierSync)
            MPI_Barrier(MPI_COMM_WORLD);
        MPI_Win_complete(win);
        MPI_Win_wait(win);
        t[k].endSync = MPI_Wtime();
    }

    /* MPI_Win_complete has completed every operation that used origin. */
    free(origin);
}
Packit 0848f5
Packit 0848f5
/* Time MAX_RUNS repetitions of cnt MPI_Accumulate (MPI_SUM) calls, each
 * moving sz ints to consecutive displacements in destRank's window, using
 * MPI_Win_post/MPI_Win_start to open and MPI_Win_complete/MPI_Win_wait to
 * close every repetition.
 *
 *   win           - window the operations are issued on
 *   destRank      - target rank
 *   cnt           - number of accumulate calls per repetition
 *   sz            - number of ints per accumulate call
 *   exposureGroup - group passed to MPI_Win_post
 *   accessGroup   - group passed to MPI_Win_start
 *   t             - receives the timestamps of repetition k in t[k]; must
 *                   have room for MAX_RUNS entries
 */
void RunAccPSCW(MPI_Win win, int destRank, int cnt, int sz,
                MPI_Group exposureGroup, MPI_Group accessGroup, timing t[])
{
    int k, i, j;
    int nElems = (sz > 0) ? sz : 1;
    int *origin;

    /* Every call reads sz ints from the origin buffer, so the buffer must
     * really hold that many; a lone int would be over-read when sz > 1. */
    origin = malloc((size_t) nElems * sizeof *origin);
    if (!origin) {
        fprintf(stderr, "Unable to allocate %d words\n", nElems);
        MPI_Abort(MPI_COMM_WORLD, 1);
        return;
    }
    for (i = 0; i < nElems; i++)
        origin[i] = 1;

    for (k = 0; k < MAX_RUNS; k++) {
        MPI_Barrier(MPI_COMM_WORLD);
        MPI_Win_post(exposureGroup, 0, win);
        MPI_Win_start(accessGroup, 0, win);
        j = 0;
        t[k].startOp = MPI_Wtime();
        for (i = 0; i < cnt; i++) {
            MPI_Accumulate(origin, sz, MPI_INT, destRank, j, sz, MPI_INT, MPI_SUM, win);
            j += sz;
        }
        t[k].endOp = MPI_Wtime();
        if (barrierSync)
            MPI_Barrier(MPI_COMM_WORLD);
        MPI_Win_complete(win);
        MPI_Win_wait(win);
        t[k].endSync = MPI_Wtime();
    }

    /* MPI_Win_complete has completed every operation that used origin. */
    free(origin);
}
Packit 0848f5
Packit 0848f5
/* Print one tab-separated result line for a set of MAX_RUNS timed runs:
 * count, op time, sync time, op time per operation, sync time per operation,
 * and rate (operations per unit of sync time, truncated to a long).
 *
 * The reported op and sync times are the minimum over all runs, because the
 * minimum is more stable than the mean when timer accuracy is an issue.
 * Nothing is printed when verbose is zero.
 *
 *   cnt - number of operations issued in each run
 *   t   - timestamps of the MAX_RUNS runs
 */
void PrintResults(int cnt, timing t[])
{
    double bestOp = 1e10, bestSync = 1e10;
    long rate = 0;
    int run;

    for (run = 0; run < MAX_RUNS; run++) {
        const double opTime = t[run].endOp - t[run].startOp;
        const double syncTime = t[run].endSync - t[run].endOp;

        if (opTime < bestOp)
            bestOp = opTime;
        if (syncTime < bestSync)
            bestSync = syncTime;
    }

    if (!verbose)
        return;

    /* Leave the rate at zero when no sync time was measured. */
    if (bestSync > 0)
        rate = (long) ((long) cnt / bestSync);

    /* count, op, sync, op/each, sync/each, rate */
    printf("%d\t%e\t%e\t%e\t%e\t%ld\n", cnt, bestOp, bestSync, bestOp / cnt, bestSync / cnt, rate);
}