// src/components/cuda/tests/simpleMultiGPU.cu

/* PAPI Multiple GPU example.  This example is taken from the NVIDIA
 * documentation (Copyright 1993-2013 NVIDIA Corporation) and has been
 * adapted to show the use of CUPTI and PAPI in collecting event
 * counters for multiple GPU contexts.  PAPI Team (2015)
 */

/*
 * Copyright 1993-2013 NVIDIA Corporation.  All rights reserved.
 *
 * Please refer to the NVIDIA end user license agreement (EULA) associated
 * with this source code for terms and conditions that govern your use of
 * this software. Any use, reproduction, disclosure, or distribution of
 * this software and related documentation outside the terms of the EULA
 * is strictly prohibited.
 *
 */

/*
 * This application demonstrates how to use the CUDA API to use multiple GPUs,
 * with an emphasis on a simple illustration of the techniques (not on performance).
 *
 * Note that in order to detect multiple GPUs in your system you have to disable
 * SLI in the NVIDIA control panel. Otherwise only one GPU is visible to the
 * application. You can, however, still extend your desktop to screens
 * attached to both GPUs.
 */

// System includes
#include <stdio.h>
#include <assert.h>

// CUDA runtime
#include <cuda.h>
#include <cuda_runtime.h>
#include <cuda_runtime_api.h>
#include <cupti.h>
#include <timer.h>

#include "papi.h"
#include "papi_test.h"

#if not defined PAPI
#undef PAPI
#endif

#if not defined CUPTI_ONLY
#undef CUPTI_ONLY
#endif
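
// Note: PAPI and CUPTI_ONLY are expected to come from the build system
// (e.g. -DPAPI or -DCUPTI_ONLY on the compile line); the guards above are
// no-ops when the corresponding macro is undefined, so a build without either
// flag runs the reduction with no counter measurement at all.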

#ifndef MAX
#define MAX(a,b) (a > b ? a : b)
#endif

#include "simpleMultiGPU.h"

// //////////////////////////////////////////////////////////////////////////////
// Data configuration
// //////////////////////////////////////////////////////////////////////////////
const int MAX_GPU_COUNT = 32;
const int DATA_N = 48576 * 32;
#ifdef PAPI
const int MAX_NUM_EVENTS = 32;
#endif

#define CHECK_CU_ERROR(err, cufunc)                                     \
    if (err != CUDA_SUCCESS) { printf ("Error %d for CUDA Driver API function '%s'\n", err, cufunc); return -1; }

#define CHECK_CUDA_ERROR(err)                                           \
    if (err != cudaSuccess) { printf ("Error %d for CUDA \n", err ); return -1; }

#define CHECK_CUPTI_ERROR(err, cuptifunc)                               \
    if (err != CUPTI_SUCCESS) { printf ("Error %d for CUPTI API function '%s'\n", err, cuptifunc); return -1; }
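
// Note: each CHECK_* macro above does 'return -1' on failure, so these macros
// are only meant to be used directly inside main().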

// //////////////////////////////////////////////////////////////////////////////
// Simple reduction kernel.
// Refer to the 'reduction' CUDA SDK sample describing
// reduction optimization strategies
// //////////////////////////////////////////////////////////////////////////////
__global__ static void reduceKernel( float *d_Result, float *d_Input, int N )
{
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
    const int threadN = gridDim.x * blockDim.x;
    float sum = 0;

    for( int pos = tid; pos < N; pos += threadN )
        sum += d_Input[pos];

    d_Result[tid] = sum;
}
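
// Each thread accumulates a grid-strided partial sum over d_Input, so the
// kernel leaves BLOCK_N * THREAD_N (= ACCUM_N) partial sums in d_Result;
// the final reduction over those partial sums is done on the host.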

// //////////////////////////////////////////////////////////////////////////////
// Program main
// //////////////////////////////////////////////////////////////////////////////
int main( int argc, char **argv )
{
    // Solver config
    TGPUplan plan[MAX_GPU_COUNT];
    // GPU reduction results
    float h_SumGPU[MAX_GPU_COUNT];
    float sumGPU;
    double sumCPU, diff;
    int i, j, gpuBase, GPU_N;

    const int BLOCK_N = 32;
    const int THREAD_N = 256;
    const int ACCUM_N = BLOCK_N * THREAD_N;

    CUcontext ctx[MAX_GPU_COUNT];

    printf( "Starting simpleMultiGPU\n" );

    // Report on the available CUDA devices
    int computeCapabilityMajor = 0, computeCapabilityMinor = 0;
    int runtimeVersion = 0, driverVersion = 0;
    char deviceName[64];
    CUdevice device[MAX_GPU_COUNT];
    CHECK_CUDA_ERROR( cudaGetDeviceCount( &GPU_N ) );
    if( GPU_N > MAX_GPU_COUNT ) GPU_N = MAX_GPU_COUNT;
    printf( "CUDA-capable device count: %i\n", GPU_N );
    for ( i=0; i<GPU_N; i++ ) {
        CHECK_CU_ERROR( cuDeviceGet( &device[i], i ), "cuDeviceGet" );
        CHECK_CU_ERROR( cuDeviceGetName( deviceName, 64, device[i] ), "cuDeviceGetName" );
        CHECK_CU_ERROR( cuDeviceComputeCapability( &computeCapabilityMajor, &computeCapabilityMinor,  device[i] ), "cuDeviceComputeCapability" );
        cudaRuntimeGetVersion( &runtimeVersion );
        cudaDriverGetVersion( &driverVersion );
        printf( "CUDA Device %d: %s : computeCapability %d.%d runtimeVersion %d.%d driverVersion %d.%d\n", i, deviceName, computeCapabilityMajor, computeCapabilityMinor, runtimeVersion/1000, (runtimeVersion%100)/10, driverVersion/1000, (driverVersion%100)/10 );
        if ( computeCapabilityMajor < 2 ) {
            printf( "CUDA Device %d compute capability is too low... will not add any more GPUs\n", i );
            GPU_N = i;
            break;
        }
    }
    uint32_t cupti_linked_version;
    cuptiGetVersion( &cupti_linked_version );
    printf("CUPTI version: Compiled against version %d; Linked against version %d\n", CUPTI_API_VERSION, cupti_linked_version );

    // create one context per device
    for (i = 0; i < GPU_N; i++) {
        CHECK_CUDA_ERROR( cudaSetDevice( i ) );
        CHECK_CU_ERROR( cuCtxCreate( &(ctx[i]), 0, device[i] ), "cuCtxCreate" );
        CHECK_CU_ERROR( cuCtxPopCurrent(&(ctx[i])), "cuCtxPopCurrent" );
    }
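    // Note: cuCtxCreate() leaves the new context current on the calling thread,
    // so it is popped immediately here; later sections push/pop ctx[i] around
    // each block of per-device work (allocation, CUPTI setup, launch, readout).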

    printf( "Generating input data...\n" );

    // Subdividing input data across GPUs
    // Get data sizes for each GPU
    for( i = 0; i < GPU_N; i++ )
        plan[i].dataN = DATA_N / GPU_N;
    // Take into account "odd" data sizes
    for( i = 0; i < DATA_N % GPU_N; i++ )
        plan[i].dataN++;

    // Assign data ranges to GPUs
    gpuBase = 0;
    for( i = 0; i < GPU_N; i++ ) {
        plan[i].h_Sum = h_SumGPU + i; // point within h_SumGPU array
        gpuBase += plan[i].dataN;
    }

    // Create streams for issuing GPU commands asynchronously and allocate memory (GPU and System page-locked)
    for( i = 0; i < GPU_N; i++ ) {
        CHECK_CUDA_ERROR( cudaSetDevice( i ) );
        CHECK_CU_ERROR(cuCtxPushCurrent(ctx[i]), "cuCtxPushCurrent");
        CHECK_CUDA_ERROR( cudaStreamCreate( &plan[i].stream ) );
        CHECK_CUDA_ERROR( cudaMalloc( ( void ** ) &plan[i].d_Data, plan[i].dataN * sizeof( float ) ) );
        CHECK_CUDA_ERROR( cudaMalloc( ( void ** ) &plan[i].d_Sum, ACCUM_N * sizeof( float ) ) );
        CHECK_CUDA_ERROR( cudaMallocHost( ( void ** ) &plan[i].h_Sum_from_device, ACCUM_N * sizeof( float ) ) );
        CHECK_CUDA_ERROR( cudaMallocHost( ( void ** ) &plan[i].h_Data, plan[i].dataN * sizeof( float ) ) );
        for( j = 0; j < plan[i].dataN; j++ ) {
            plan[i].h_Data[j] = ( float ) rand() / ( float ) RAND_MAX;
        }
        CHECK_CU_ERROR( cuCtxPopCurrent(&(ctx[i])), "cuCtxPopCurrent" );
    }

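    // Note: cudaMallocHost() returns page-locked (pinned) host memory; pinned
    // buffers are what allow the cudaMemcpyAsync() transfers below to be truly
    // asynchronous with respect to the host and to overlap with kernel work.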

#ifdef CUPTI_ONLY
    char const *cuptiEventName = "inst_executed"; // "elapsed_cycles_sm" "inst_executed"; "inst_issued0";
    printf("Setup CUPTI counters internally for %s event (CUPTI_ONLY)\n", cuptiEventName);
    CUpti_EventGroup eg[MAX_GPU_COUNT];
    CUpti_EventID myevent;
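
    // CUPTI event-collection workflow for each device: make its context
    // current, restrict counting to kernel execution (KERNEL collection mode),
    // create an event group in that context, look up the event ID by name,
    // add it to the group, and enable the group before the kernels are launched.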
    for ( i=0; i<GPU_N; i++ ) {
        CHECK_CUDA_ERROR( cudaSetDevice( i ) );
        CHECK_CU_ERROR(cuCtxPushCurrent(ctx[i]), "cuCtxPushCurrent");
        CHECK_CUPTI_ERROR(cuptiSetEventCollectionMode(ctx[i], CUPTI_EVENT_COLLECTION_MODE_KERNEL), "cuptiSetEventCollectionMode" );
        CHECK_CUPTI_ERROR( cuptiEventGroupCreate( ctx[i], &eg[i], 0 ), "cuptiEventGroupCreate" );
        cuptiEventGetIdFromName ( device[i], cuptiEventName, &myevent );
        CHECK_CUPTI_ERROR( cuptiEventGroupAddEvent( eg[i], myevent ), "cuptiEventGroupAddEvent" );
        CHECK_CUPTI_ERROR( cuptiEventGroupEnable( eg[i] ), "cuptiEventGroupEnable" );
        CHECK_CU_ERROR( cuCtxPopCurrent(&(ctx[i])), "cuCtxPopCurrent" );
    }
#endif

#ifdef PAPI
    printf("Setup PAPI counters internally (PAPI)\n");
    int EventSet = PAPI_NULL;
    int NUM_EVENTS = MAX_GPU_COUNT*MAX_NUM_EVENTS;
    long long values[NUM_EVENTS];
    int eventCount;
    int retval, ee;

    /* PAPI Initialization */
    retval = PAPI_library_init( PAPI_VER_CURRENT );
    if( retval != PAPI_VER_CURRENT ) fprintf( stderr, "PAPI_library_init failed\n" );
    printf( "PAPI version: %d.%d.%d\n", PAPI_VERSION_MAJOR( PAPI_VERSION ), PAPI_VERSION_MINOR( PAPI_VERSION ), PAPI_VERSION_REVISION( PAPI_VERSION ) );

    retval = PAPI_create_eventset( &EventSet );
    if( retval != PAPI_OK ) fprintf( stderr, "PAPI_create_eventset failed\n" );

    // In this example measure events from each GPU
    int numEventEndings = 3;
    char const *EventEndings[] = {
        "cuda:::metric:inst_per_warp",
        "cuda:::event:inst_executed",
        "cuda:::event:elapsed_cycles_sm"
    };
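
    // Each base event/metric name above is tried on every GPU; only the names
    // the PAPI CUDA component can actually add are kept in EventName[], so
    // eventCount may end up smaller than GPU_N * numEventEndings.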

    // Add events at a GPU-specific level, e.g. cuda:::event:elapsed_cycles_sm:device=2
    char *EventName[NUM_EVENTS];
    char tmpEventName[50];
    eventCount = 0;
    for( i = 0; i < GPU_N; i++ ) {
        CHECK_CUDA_ERROR( cudaSetDevice( i ) );         // Set device
        CHECK_CU_ERROR(cuCtxPushCurrent(ctx[i]), "cuCtxPushCurrent");
        CHECK_CUPTI_ERROR(cuptiSetEventCollectionMode(ctx[i], CUPTI_EVENT_COLLECTION_MODE_KERNEL), "cuptiSetEventCollectionMode" );
        for ( ee=0; ee<numEventEndings; ee++ ) {
            snprintf( tmpEventName, 50, "%s:device=%d", EventEndings[ee], i );
            printf( "Trying to add event %s to GPU %d in PAPI...", tmpEventName , i ); fflush(NULL);
            retval = PAPI_add_named_event( EventSet, tmpEventName );
            if (retval==PAPI_OK) {
                printf( "Added event\n" );
                EventName[eventCount] = (char *)calloc( 50, sizeof(char) );
                snprintf( EventName[eventCount], 50, "%s", tmpEventName );
                eventCount++;
            } else {
                printf( "Could not add event\n" );
            }
        }
        CHECK_CU_ERROR( cuCtxPopCurrent(&(ctx[i])), "cuCtxPopCurrent" );
    }

    // Start PAPI event measurement
    retval = PAPI_start( EventSet );
    if( retval != PAPI_OK )  fprintf( stderr, "PAPI_start failed\n" );
#endif
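
    // At this point the enabled CUPTI event groups and/or the started PAPI
    // EventSet are counting; the asynchronous kernel launches below are the
    // work that generates the measured events.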

    // Start timing and compute on GPU(s)
    printf( "Computing with %d GPUs...\n", GPU_N );
    StartTimer();

    // Copy data to GPU, launch the kernel and copy data back. All asynchronously
    for (i = 0; i < GPU_N; i++) {
        // Set device
        CHECK_CUDA_ERROR( cudaSetDevice( i ));
        CHECK_CU_ERROR(cuCtxPushCurrent(ctx[i]), "cuCtxPushCurrent");
        // Copy input data from CPU
        CHECK_CUDA_ERROR( cudaMemcpyAsync( plan[i].d_Data, plan[i].h_Data, plan[i].dataN * sizeof( float ), cudaMemcpyHostToDevice, plan[i].stream ) );
        // Perform GPU computations
        reduceKernel <<< BLOCK_N, THREAD_N, 0, plan[i].stream >>> ( plan[i].d_Sum, plan[i].d_Data, plan[i].dataN );
        if ( cudaGetLastError() != cudaSuccess ) { printf( "reduceKernel() execution failed (GPU %d).\n", i ); exit(EXIT_FAILURE); }
        // Read back GPU results
        CHECK_CUDA_ERROR( cudaMemcpyAsync( plan[i].h_Sum_from_device, plan[i].d_Sum, ACCUM_N * sizeof( float ), cudaMemcpyDeviceToHost, plan[i].stream ) );
        CHECK_CU_ERROR( cuCtxPopCurrent(&(ctx[i])), "cuCtxPopCurrent" );
    }

    // Process GPU results
    printf( "Process GPU results on %d GPUs...\n", GPU_N );
    for( i = 0; i < GPU_N; i++ ) {
        float sum;
        // Set device
        CHECK_CUDA_ERROR( cudaSetDevice( i ) );
        CHECK_CU_ERROR(cuCtxPushCurrent(ctx[i]), "cuCtxPushCurrent");
        // Wait for all operations to finish
        cudaStreamSynchronize( plan[i].stream );
        // Finalize GPU reduction for current subvector
        sum = 0;
        for( j = 0; j < ACCUM_N; j++ ) {
            sum += plan[i].h_Sum_from_device[j];
        }
        *( plan[i].h_Sum ) = ( float ) sum;
        CHECK_CU_ERROR( cuCtxPopCurrent(&(ctx[i])), "cuCtxPopCurrent" );
    }
    double gpuTime = GetTimer();

#ifdef CUPTI_ONLY
    size_t size = 1024;
    size_t sizeBytes = size*sizeof(uint64_t);
    uint64_t buffer[size];
    uint64_t tmp[size];
    for (int jj=0; jj<1024; jj++) tmp[jj]=0;
    for ( i=0; i<GPU_N; i++ ) {
        CHECK_CUDA_ERROR( cudaSetDevice( i ) );
        CHECK_CU_ERROR(cuCtxPushCurrent(ctx[i]), "cuCtxPushCurrent");
        CHECK_CU_ERROR( cuCtxSynchronize( ), "cuCtxSynchronize" );
        CHECK_CUPTI_ERROR( cuptiEventGroupReadEvent ( eg[i], CUPTI_EVENT_READ_FLAG_NONE, myevent, &sizeBytes, &tmp[0] ), "cuptiEventGroupReadEvent" );
        buffer[i] = tmp[0];
        printf( "CUPTI %s device %d counterValue %llu (on one domain, may need to be multiplied by num of domains)\n", cuptiEventName, i, (unsigned long long) buffer[i] );
        CHECK_CU_ERROR( cuCtxPopCurrent(&(ctx[i])), "cuCtxPopCurrent" );
    }
#endif

#ifdef PAPI
    for ( i=0; i<GPU_N; i++ ) {
        CHECK_CUDA_ERROR( cudaSetDevice( i ) );
        CHECK_CU_ERROR(cuCtxPushCurrent(ctx[i]), "cuCtxPushCurrent");
        CHECK_CU_ERROR( cuCtxSynchronize( ), "cuCtxSynchronize" );
        CHECK_CU_ERROR( cuCtxPopCurrent(&(ctx[i])), "cuCtxPopCurrent" );
    }
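
    // Each context was synchronized above so that all kernel work has finished
    // before the CUDA event counts are read out by PAPI_stop() below.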

    // retval = PAPI_read( EventSet, values );
    // if( retval != PAPI_OK )  fprintf( stderr, "PAPI_read failed\n" );
    // for( i = 0; i < eventCount; i++ )
    //     printf( "PAPI counterValue %12lld \t\t --> %s \n", values[i], EventName[i] );

    // retval = PAPI_read( EventSet, values );
    // if( retval != PAPI_OK )  fprintf( stderr, "PAPI_read failed\n" );
    // for( i = 0; i < eventCount; i++ )
    //     printf( "PAPI counterValue %12lld \t\t --> %s \n", values[i], EventName[i] );

    retval = PAPI_stop( EventSet, values );
    if( retval != PAPI_OK )  fprintf( stderr, "PAPI_stop failed\n" );
    for( i = 0; i < eventCount; i++ )
        printf( "PAPI counterValue %12lld \t\t --> %s \n", values[i], EventName[i] );

    retval = PAPI_cleanup_eventset( EventSet );
    if( retval != PAPI_OK )  fprintf( stderr, "PAPI_cleanup_eventset failed\n" );
    retval = PAPI_destroy_eventset( &EventSet );
    if( retval != PAPI_OK ) fprintf( stderr, "PAPI_destroy_eventset failed\n" );
    PAPI_shutdown();
#endif

    for( i = 0; i < GPU_N; i++ ) {
        CHECK_CUDA_ERROR( cudaFreeHost( plan[i].h_Sum_from_device ) );
        CHECK_CUDA_ERROR( cudaFree( plan[i].d_Sum ) );
        CHECK_CUDA_ERROR( cudaFree( plan[i].d_Data ) );
        // Shut down this GPU
        CHECK_CUDA_ERROR( cudaStreamDestroy( plan[i].stream ) );
    }
    sumGPU = 0;
    for( i = 0; i < GPU_N; i++ ) {
        sumGPU += h_SumGPU[i];
    }
    printf( "  GPU Processing time: %f (ms)\n", gpuTime );

    // Compute on Host CPU
    printf( "Computing the same result with Host CPU...\n" );
    StartTimer();
    sumCPU = 0;
    for( i = 0; i < GPU_N; i++ ) {
        for( j = 0; j < plan[i].dataN; j++ ) {
            sumCPU += plan[i].h_Data[j];
        }
    }
    double cpuTime = GetTimer();
    printf( "  CPU Processing time: %f (ms)\n", cpuTime );

    // Compare GPU and CPU results
    printf( "Comparing GPU and Host CPU results...\n" );
    diff = fabs( sumCPU - sumGPU ) / fabs( sumCPU );
    printf( "  GPU sum: %f\n  CPU sum: %f\n", sumGPU, sumCPU );
    printf( "  Relative difference: %E \n", diff );

    // Cleanup and shutdown
    for( i = 0; i < GPU_N; i++ ) {
        CHECK_CUDA_ERROR( cudaSetDevice( i ) );
        CHECK_CUDA_ERROR( cudaFreeHost( plan[i].h_Data ) );
        cudaDeviceReset();
    }

    exit( ( diff < 1e-5 ) ? EXIT_SUCCESS : EXIT_FAILURE );
}