// src/components/cuda/tests/simpleMultiGPU.cu

/* PAPI Multiple GPU example.  This example is taken from the NVIDIA
 * documentation (Copyright 1993-2013 NVIDIA Corporation) and has been
 * adapted to show the use of CUPTI and PAPI in collecting event
 * counters for multiple GPU contexts.  PAPI Team (2015)
 */

/*
 * Copyright 1993-2013 NVIDIA Corporation.  All rights reserved.
 *
 * Please refer to the NVIDIA end user license agreement (EULA) associated
 * with this source code for terms and conditions that govern your use of
 * this software. Any use, reproduction, disclosure, or distribution of
 * this software and related documentation outside the terms of the EULA
 * is strictly prohibited.
 *
 */

/*
 * This application demonstrates how to use the CUDA API to use multiple GPUs,
 * with an emphasis on a simple illustration of the techniques (not on performance).
 *
 * Note that in order to detect multiple GPUs in your system you have to disable
 * SLI in the NVIDIA control panel. Otherwise only one GPU is visible to the
 * application. You can, however, still extend your desktop to screens
 * attached to both GPUs.
 */

// System includes
#include <stdio.h>
#include <assert.h>

// CUDA runtime
#include <cuda.h>
#include <cuda_runtime.h>
#include <cuda_runtime_api.h>
#include <cupti.h>
#include <timer.h>

#include "papi.h"
#include "papi_test.h"

#if not defined PAPI
#undef PAPI
#endif

#if not defined CUPTI_ONLY
#undef CUPTI_ONLY
#endif
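
// Note: PAPI and CUPTI_ONLY are expected to come from the build system
// (e.g. -DPAPI or -DCUPTI_ONLY on the compile line); the guards above are
// no-ops when the corresponding macro is undefined, so a build without either
// flag runs the reduction with no counter measurement at all.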

#ifndef MAX
#define MAX(a,b) (a > b ? a : b)
#endif

#include "simpleMultiGPU.h"

// //////////////////////////////////////////////////////////////////////////////
// Data configuration
// //////////////////////////////////////////////////////////////////////////////
const int MAX_GPU_COUNT = 32;
const int DATA_N = 48576 * 32;
#ifdef PAPI
const int MAX_NUM_EVENTS = 32;
#endif

#define CHECK_CU_ERROR(err, cufunc)                                     \
    if (err != CUDA_SUCCESS) { printf ("Error %d for CUDA Driver API function '%s'\n", err, cufunc); return -1; }

#define CHECK_CUDA_ERROR(err)                                           \
    if (err != cudaSuccess) { printf ("Error %d for CUDA \n", err ); return -1; }

#define CHECK_CUPTI_ERROR(err, cuptifunc)                               \
    if (err != CUPTI_SUCCESS) { printf ("Error %d for CUPTI API function '%s'\n", err, cuptifunc); return -1; }
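
// Note: each CHECK_* macro above does 'return -1' on failure, so these macros
// are only meant to be used directly inside main().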

// //////////////////////////////////////////////////////////////////////////////
// Simple reduction kernel.
// Refer to the 'reduction' CUDA SDK sample describing
// reduction optimization strategies
// //////////////////////////////////////////////////////////////////////////////
__global__ static void reduceKernel( float *d_Result, float *d_Input, int N )
{
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
    const int threadN = gridDim.x * blockDim.x;
    float sum = 0;

    for( int pos = tid; pos < N; pos += threadN )
        sum += d_Input[pos];

    d_Result[tid] = sum;
}
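
// Each thread accumulates a grid-strided partial sum over d_Input, so the
// kernel leaves BLOCK_N * THREAD_N (= ACCUM_N) partial sums in d_Result;
// the final reduction over those partial sums is done on the host.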

// //////////////////////////////////////////////////////////////////////////////
// Program main
// //////////////////////////////////////////////////////////////////////////////
int main( int argc, char **argv )
{
    // Solver config
    TGPUplan plan[MAX_GPU_COUNT];
    // GPU reduction results
    float h_SumGPU[MAX_GPU_COUNT];
    float sumGPU;
    double sumCPU, diff;
    int i, j, gpuBase, GPU_N;

    const int BLOCK_N = 32;
    const int THREAD_N = 256;
    const int ACCUM_N = BLOCK_N * THREAD_N;

    CUcontext ctx[MAX_GPU_COUNT];

    printf( "Starting simpleMultiGPU\n" );

    // Report on the available CUDA devices
    int computeCapabilityMajor = 0, computeCapabilityMinor = 0;
    int runtimeVersion = 0, driverVersion = 0;
    char deviceName[64];
    CUdevice device[MAX_GPU_COUNT];
    CHECK_CUDA_ERROR( cudaGetDeviceCount( &GPU_N ) );
    if( GPU_N > MAX_GPU_COUNT ) GPU_N = MAX_GPU_COUNT;
    printf( "CUDA-capable device count: %i\n", GPU_N );
    for ( i=0; i<GPU_N; i++ ) {
        CHECK_CU_ERROR( cuDeviceGet( &device[i], i ), "cuDeviceGet" );
        CHECK_CU_ERROR( cuDeviceGetName( deviceName, 64, device[i] ), "cuDeviceGetName" );
        CHECK_CU_ERROR( cuDeviceComputeCapability( &computeCapabilityMajor, &computeCapabilityMinor,  device[i] ), "cuDeviceComputeCapability" );
        cudaRuntimeGetVersion( &runtimeVersion );
        cudaDriverGetVersion( &driverVersion );
        printf( "CUDA Device %d: %s : computeCapability %d.%d runtimeVersion %d.%d driverVersion %d.%d\n", i, deviceName, computeCapabilityMajor, computeCapabilityMinor, runtimeVersion/1000, (runtimeVersion%100)/10, driverVersion/1000, (driverVersion%100)/10 );
        if ( computeCapabilityMajor < 2 ) {
            printf( "CUDA Device %d compute capability is too low... will not add any more GPUs\n", i );
            GPU_N = i;
            break;
        }
    }
    uint32_t cupti_linked_version;
    cuptiGetVersion( &cupti_linked_version );
    printf("CUPTI version: Compiled against version %d; Linked against version %d\n", CUPTI_API_VERSION, cupti_linked_version );

    // create one context per device
    for (i = 0; i < GPU_N; i++) {
        CHECK_CUDA_ERROR( cudaSetDevice( i ) );
        CHECK_CU_ERROR( cuCtxCreate( &(ctx[i]), 0, device[i] ), "cuCtxCreate" );
        CHECK_CU_ERROR( cuCtxPopCurrent(&(ctx[i])), "cuCtxPopCurrent" );
    }
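    // Note: cuCtxCreate() leaves the new context current on the calling thread,
    // so it is popped immediately here; later sections push/pop ctx[i] around
    // each block of per-device work (allocation, CUPTI setup, launch, readout).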

    printf( "Generating input data...\n" );

    // Subdividing input data across GPUs
    // Get data sizes for each GPU
    for( i = 0; i < GPU_N; i++ )
        plan[i].dataN = DATA_N / GPU_N;
    // Take into account "odd" data sizes
    for( i = 0; i < DATA_N % GPU_N; i++ )
        plan[i].dataN++;

    // Assign data ranges to GPUs
    gpuBase = 0;
    for( i = 0; i < GPU_N; i++ ) {
        plan[i].h_Sum = h_SumGPU + i; // point within h_SumGPU array
        gpuBase += plan[i].dataN;
    }

    // Create streams for issuing GPU commands asynchronously and allocate memory (GPU and System page-locked)
    for( i = 0; i < GPU_N; i++ ) {
        CHECK_CUDA_ERROR( cudaSetDevice( i ) );
        CHECK_CU_ERROR(cuCtxPushCurrent(ctx[i]), "cuCtxPushCurrent");
        CHECK_CUDA_ERROR( cudaStreamCreate( &plan[i].stream ) );
        CHECK_CUDA_ERROR( cudaMalloc( ( void ** ) &plan[i].d_Data, plan[i].dataN * sizeof( float ) ) );
        CHECK_CUDA_ERROR( cudaMalloc( ( void ** ) &plan[i].d_Sum, ACCUM_N * sizeof( float ) ) );
        CHECK_CUDA_ERROR( cudaMallocHost( ( void ** ) &plan[i].h_Sum_from_device, ACCUM_N * sizeof( float ) ) );
        CHECK_CUDA_ERROR( cudaMallocHost( ( void ** ) &plan[i].h_Data, plan[i].dataN * sizeof( float ) ) );
        for( j = 0; j < plan[i].dataN; j++ ) {
            plan[i].h_Data[j] = ( float ) rand() / ( float ) RAND_MAX;
        }
        CHECK_CU_ERROR( cuCtxPopCurrent(&(ctx[i])), "cuCtxPopCurrent" );
    }

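    // Note: cudaMallocHost() returns page-locked (pinned) host memory; pinned
    // buffers are what allow the cudaMemcpyAsync() transfers below to be truly
    // asynchronous with respect to the host and to overlap with kernel work.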

#ifdef CUPTI_ONLY
    char const *cuptiEventName = "inst_executed"; // "elapsed_cycles_sm" "inst_executed"; "inst_issued0";
    printf("Setup CUPTI counters internally for %s event (CUPTI_ONLY)\n", cuptiEventName);
    CUpti_EventGroup eg[MAX_GPU_COUNT];
    CUpti_EventID myevent;
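
    // CUPTI event-collection workflow for each device: make its context
    // current, restrict counting to kernel execution (KERNEL collection mode),
    // create an event group in that context, look up the event ID by name,
    // add it to the group, and enable the group before the kernels are launched.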
    for ( i=0; i<GPU_N; i++ ) {
        CHECK_CUDA_ERROR( cudaSetDevice( i ) );
        CHECK_CU_ERROR(cuCtxPushCurrent(ctx[i]), "cuCtxPushCurrent");
        CHECK_CUPTI_ERROR(cuptiSetEventCollectionMode(ctx[i], CUPTI_EVENT_COLLECTION_MODE_KERNEL), "cuptiSetEventCollectionMode" );
        CHECK_CUPTI_ERROR( cuptiEventGroupCreate( ctx[i], &eg[i], 0 ), "cuptiEventGroupCreate" );
        cuptiEventGetIdFromName ( device[i], cuptiEventName, &myevent );
        CHECK_CUPTI_ERROR( cuptiEventGroupAddEvent( eg[i], myevent ), "cuptiEventGroupAddEvent" );
        CHECK_CUPTI_ERROR( cuptiEventGroupEnable( eg[i] ), "cuptiEventGroupEnable" );
        CHECK_CU_ERROR( cuCtxPopCurrent(&(ctx[i])), "cuCtxPopCurrent" );
    }
#endif

#ifdef PAPI
    printf("Setup PAPI counters internally (PAPI)\n");
    int EventSet = PAPI_NULL;
    int NUM_EVENTS = MAX_GPU_COUNT*MAX_NUM_EVENTS;
    long long values[NUM_EVENTS];
    int eventCount;
    int retval, ee;

    /* PAPI Initialization */
    retval = PAPI_library_init( PAPI_VER_CURRENT );
    if( retval != PAPI_VER_CURRENT ) fprintf( stderr, "PAPI_library_init failed\n" );
    printf( "PAPI version: %d.%d.%d\n", PAPI_VERSION_MAJOR( PAPI_VERSION ), PAPI_VERSION_MINOR( PAPI_VERSION ), PAPI_VERSION_REVISION( PAPI_VERSION ) );

    retval = PAPI_create_eventset( &EventSet );
    if( retval != PAPI_OK ) fprintf( stderr, "PAPI_create_eventset failed\n" );

    // In this example measure events from each GPU
    int numEventEndings = 3;
    char const *EventEndings[] = {
        "cuda:::metric:inst_per_warp",
        "cuda:::event:inst_executed",
        "cuda:::event:elapsed_cycles_sm"
    };
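
    // Each base event/metric name above is tried on every GPU; only the names
    // the PAPI CUDA component can actually add are kept in EventName[], so
    // eventCount may end up smaller than GPU_N * numEventEndings.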

    // Add events at a GPU-specific level, e.g. cuda:::event:elapsed_cycles_sm:device=2
    char *EventName[NUM_EVENTS];
    char tmpEventName[50];
    eventCount = 0;
    for( i = 0; i < GPU_N; i++ ) {
        CHECK_CUDA_ERROR( cudaSetDevice( i ) );         // Set device
        CHECK_CU_ERROR(cuCtxPushCurrent(ctx[i]), "cuCtxPushCurrent");
        CHECK_CUPTI_ERROR(cuptiSetEventCollectionMode(ctx[i], CUPTI_EVENT_COLLECTION_MODE_KERNEL), "cuptiSetEventCollectionMode" );
        for ( ee=0; ee<numEventEndings; ee++ ) {
            snprintf( tmpEventName, 50, "%s:device=%d", EventEndings[ee], i );
            printf( "Trying to add event %s to GPU %d in PAPI...", tmpEventName , i ); fflush(NULL);
            retval = PAPI_add_named_event( EventSet, tmpEventName );
            if (retval==PAPI_OK) {
                printf( "Added event\n" );
                EventName[eventCount] = (char *)calloc( 50, sizeof(char) );
                snprintf( EventName[eventCount], 50, "%s", tmpEventName );
                eventCount++;
            } else {
                printf( "Could not add event\n" );
            }
        }
        CHECK_CU_ERROR( cuCtxPopCurrent(&(ctx[i])), "cuCtxPopCurrent" );
    }

    // Start PAPI event measurement
    retval = PAPI_start( EventSet );
    if( retval != PAPI_OK )  fprintf( stderr, "PAPI_start failed\n" );
#endif
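
    // At this point the enabled CUPTI event groups and/or the started PAPI
    // EventSet are counting; the asynchronous kernel launches below are the
    // work that generates the measured events.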

    // Start timing and compute on GPU(s)
    printf( "Computing with %d GPUs...\n", GPU_N );
    StartTimer();

    // Copy data to GPU, launch the kernel and copy data back. All asynchronously
    for (i = 0; i < GPU_N; i++) {
        // Set device
        CHECK_CUDA_ERROR( cudaSetDevice( i ));
        CHECK_CU_ERROR(cuCtxPushCurrent(ctx[i]), "cuCtxPushCurrent");
        // Copy input data from CPU
        CHECK_CUDA_ERROR( cudaMemcpyAsync( plan[i].d_Data, plan[i].h_Data, plan[i].dataN * sizeof( float ), cudaMemcpyHostToDevice, plan[i].stream ) );
        // Perform GPU computations
        reduceKernel <<< BLOCK_N, THREAD_N, 0, plan[i].stream >>> ( plan[i].d_Sum, plan[i].d_Data, plan[i].dataN );
        if ( cudaGetLastError() != cudaSuccess ) { printf( "reduceKernel() execution failed (GPU %d).\n", i ); exit(EXIT_FAILURE); }
        // Read back GPU results
        CHECK_CUDA_ERROR( cudaMemcpyAsync( plan[i].h_Sum_from_device, plan[i].d_Sum, ACCUM_N * sizeof( float ), cudaMemcpyDeviceToHost, plan[i].stream ) );
        CHECK_CU_ERROR( cuCtxPopCurrent(&(ctx[i])), "cuCtxPopCurrent" );
    }

    // Process GPU results
    printf( "Process GPU results on %d GPUs...\n", GPU_N );
    for( i = 0; i < GPU_N; i++ ) {
        float sum;
        // Set device
        CHECK_CUDA_ERROR( cudaSetDevice( i ) );
        CHECK_CU_ERROR(cuCtxPushCurrent(ctx[i]), "cuCtxPushCurrent");
        // Wait for all operations to finish
        cudaStreamSynchronize( plan[i].stream );
        // Finalize GPU reduction for current subvector
        sum = 0;
        for( j = 0; j < ACCUM_N; j++ ) {
            sum += plan[i].h_Sum_from_device[j];
        }
        *( plan[i].h_Sum ) = ( float ) sum;
        CHECK_CU_ERROR( cuCtxPopCurrent(&(ctx[i])), "cuCtxPopCurrent" );
    }
    double gpuTime = GetTimer();

#ifdef CUPTI_ONLY
    size_t size = 1024;
    size_t sizeBytes = size*sizeof(uint64_t);
    uint64_t buffer[size];
    uint64_t tmp[size];
    for (int jj=0; jj<1024; jj++) tmp[jj]=0;
    for ( i=0; i<GPU_N; i++ ) {
        CHECK_CUDA_ERROR( cudaSetDevice( i ) );
        CHECK_CU_ERROR(cuCtxPushCurrent(ctx[i]), "cuCtxPushCurrent");
        CHECK_CU_ERROR( cuCtxSynchronize( ), "cuCtxSynchronize" );
        CHECK_CUPTI_ERROR( cuptiEventGroupReadEvent ( eg[i], CUPTI_EVENT_READ_FLAG_NONE, myevent, &sizeBytes, &tmp[0] ), "cuptiEventGroupReadEvent" );
        buffer[i] = tmp[0];
        printf( "CUPTI %s device %d counterValue %llu (on one domain, may need to be multiplied by num of domains)\n", cuptiEventName, i, (unsigned long long) buffer[i] );
        CHECK_CU_ERROR( cuCtxPopCurrent(&(ctx[i])), "cuCtxPopCurrent" );
    }
#endif

#ifdef PAPI
    for ( i=0; i<GPU_N; i++ ) {
        CHECK_CUDA_ERROR( cudaSetDevice( i ) );
        CHECK_CU_ERROR(cuCtxPushCurrent(ctx[i]), "cuCtxPushCurrent");
        CHECK_CU_ERROR( cuCtxSynchronize( ), "cuCtxSynchronize" );
        CHECK_CU_ERROR( cuCtxPopCurrent(&(ctx[i])), "cuCtxPopCurrent" );
    }
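
    // Each context was synchronized above so that all kernel work has finished
    // before the CUDA event counts are read out by PAPI_stop() below.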

    // retval = PAPI_read( EventSet, values );
    // if( retval != PAPI_OK )  fprintf( stderr, "PAPI_read failed\n" );
    // for( i = 0; i < eventCount; i++ )
    //     printf( "PAPI counterValue %12lld \t\t --> %s \n", values[i], EventName[i] );

    // retval = PAPI_read( EventSet, values );
    // if( retval != PAPI_OK )  fprintf( stderr, "PAPI_read failed\n" );
    // for( i = 0; i < eventCount; i++ )
    //     printf( "PAPI counterValue %12lld \t\t --> %s \n", values[i], EventName[i] );

    retval = PAPI_stop( EventSet, values );
    if( retval != PAPI_OK )  fprintf( stderr, "PAPI_stop failed\n" );
    for( i = 0; i < eventCount; i++ )
        printf( "PAPI counterValue %12lld \t\t --> %s \n", values[i], EventName[i] );

    retval = PAPI_cleanup_eventset( EventSet );
    if( retval != PAPI_OK )  fprintf( stderr, "PAPI_cleanup_eventset failed\n" );
    retval = PAPI_destroy_eventset( &EventSet );
    if( retval != PAPI_OK ) fprintf( stderr, "PAPI_destroy_eventset failed\n" );
    PAPI_shutdown();
#endif

    for( i = 0; i < GPU_N; i++ ) {
        CHECK_CUDA_ERROR( cudaFreeHost( plan[i].h_Sum_from_device ) );
        CHECK_CUDA_ERROR( cudaFree( plan[i].d_Sum ) );
        CHECK_CUDA_ERROR( cudaFree( plan[i].d_Data ) );
        // Shut down this GPU
        CHECK_CUDA_ERROR( cudaStreamDestroy( plan[i].stream ) );
    }
    sumGPU = 0;
    for( i = 0; i < GPU_N; i++ ) {
        sumGPU += h_SumGPU[i];
    }
    printf( "  GPU Processing time: %f (ms)\n", gpuTime );

    // Compute on Host CPU
    printf( "Computing the same result with Host CPU...\n" );
    StartTimer();
    sumCPU = 0;
    for( i = 0; i < GPU_N; i++ ) {
        for( j = 0; j < plan[i].dataN; j++ ) {
            sumCPU += plan[i].h_Data[j];
        }
    }
    double cpuTime = GetTimer();
    printf( "  CPU Processing time: %f (ms)\n", cpuTime );

    // Compare GPU and CPU results
    printf( "Comparing GPU and Host CPU results...\n" );
    diff = fabs( sumCPU - sumGPU ) / fabs( sumCPU );
    printf( "  GPU sum: %f\n  CPU sum: %f\n", sumGPU, sumCPU );
    printf( "  Relative difference: %E \n", diff );

    // Cleanup and shutdown
    for( i = 0; i < GPU_N; i++ ) {
        CHECK_CUDA_ERROR( cudaSetDevice( i ) );
        CHECK_CUDA_ERROR( cudaFreeHost( plan[i].h_Data ) );
        cudaDeviceReset();
    }

    exit( ( diff < 1e-5 ) ? EXIT_SUCCESS : EXIT_FAILURE );
}