/* PAPI Multiple GPU example.  This example is taken from the NVIDIA
 * documentation (Copyright 1993-2013 NVIDIA Corporation) and has been
 * adapted to show the use of CUPTI and PAPI in collecting event
 * counters for multiple GPU contexts.  PAPI Team (2015)
 */

/*
 * Copyright 1993-2013 NVIDIA Corporation.  All rights reserved.
 *
 * Please refer to the NVIDIA end user license agreement (EULA) associated
 * with this source code for terms and conditions that govern your use of
 * this software.  Any use, reproduction, disclosure, or distribution of
 * this software and related documentation outside the terms of the EULA
 * is strictly prohibited.
 */

/*
 * This application demonstrates how to use the CUDA API to use multiple GPUs,
 * with an emphasis on simple illustration of the techniques (not on performance).
 *
 * Note that in order to detect multiple GPUs in your system you have to disable
 * SLI in the NVIDIA control panel.  Otherwise only one GPU is visible to the
 * application.  On the other hand, you can still extend your desktop to screens
 * attached to both GPUs.
 */

// System includes
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <stdint.h>

// CUDA runtime, CUPTI, and the CUDA samples' timer helpers
// (timer.h provides StartTimer()/GetTimer())
#include <cuda.h>
#include <cuda_runtime.h>
#include <cupti.h>
#include <timer.h>

#include "papi.h"
#include "papi_test.h"

// PAPI and CUPTI_ONLY select the counter-collection path and are
// expected on the compile line (e.g. -DPAPI or -DCUPTI_ONLY).
#if not defined PAPI
#undef PAPI
#endif

#if not defined CUPTI_ONLY
#undef CUPTI_ONLY
#endif

#ifndef MAX
#define MAX(a,b) ((a) > (b) ? (a) : (b))
#endif

#include "simpleMultiGPU.h"

////////////////////////////////////////////////////////////////////////////////
// Data configuration
////////////////////////////////////////////////////////////////////////////////
const int MAX_GPU_COUNT = 32;
const int DATA_N = 48576 * 32;
#ifdef PAPI
const int MAX_NUM_EVENTS = 32;
#endif

#define CHECK_CU_ERROR(err, cufunc) \
    if( err != CUDA_SUCCESS ) { \
        printf( "Error %d for CUDA Driver API function '%s'\n", err, cufunc ); \
        return -1; \
    }

#define CHECK_CUDA_ERROR(err) \
    if( err != cudaSuccess ) { \
        printf( "Error %d for CUDA\n", err ); \
        return -1; \
    }

#define CHECK_CUPTI_ERROR(err, cuptifunc) \
    if( err != CUPTI_SUCCESS ) { \
        printf( "Error %d for CUPTI API function '%s'\n", err, cuptifunc ); \
        return -1; \
    }
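/*
 * A typical build line (an illustration only; the include/library paths,
 * GPU architecture flags and PAPI installation prefix will differ from
 * system to system):
 *
 *   nvcc -DPAPI simpleMultiGPU.cu -I$PAPI_DIR/include \
 *        -L$PAPI_DIR/lib -lpapi -lcuda -lcupti -o simpleMultiGPU
 *
 * Compile with -DCUPTI_ONLY instead of -DPAPI to read a counter through
 * the raw CUPTI event API instead of through PAPI.
 */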
////////////////////////////////////////////////////////////////////////////////
// Simple reduction kernel.
// Refer to the 'reduction' CUDA SDK sample describing
// reduction optimization strategies
////////////////////////////////////////////////////////////////////////////////
__global__ static void reduceKernel( float *d_Result, float *d_Input, int N )
{
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
    const int threadN = gridDim.x * blockDim.x;
    float sum = 0;

    // Grid-stride loop: each thread sums every threadN-th element,
    // leaving one partial sum per thread for the host to finish.
    for( int pos = tid; pos < N; pos += threadN )
        sum += d_Input[pos];

    d_Result[tid] = sum;
}

////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main( int argc, char **argv )
{
    // Solver config
    TGPUplan plan[MAX_GPU_COUNT];
    // GPU reduction results
    float h_SumGPU[MAX_GPU_COUNT];
    float sumGPU;
    double sumCPU, diff;

    int i, j, gpuBase, GPU_N;

    const int BLOCK_N = 32;
    const int THREAD_N = 256;
    const int ACCUM_N = BLOCK_N * THREAD_N;

    CUcontext ctx[MAX_GPU_COUNT];

    printf( "Starting simpleMultiGPU\n" );

    // Report on the available CUDA devices
    int computeCapabilityMajor = 0, computeCapabilityMinor = 0;
    int runtimeVersion = 0, driverVersion = 0;
    char deviceName[64];
    CUdevice device[MAX_GPU_COUNT];
    CHECK_CUDA_ERROR( cudaGetDeviceCount( &GPU_N ) );
    if( GPU_N > MAX_GPU_COUNT ) GPU_N = MAX_GPU_COUNT;
    printf( "CUDA-capable device count: %i\n", GPU_N );
    for( i = 0; i < GPU_N; i++ ) {
        CHECK_CU_ERROR( cuDeviceGet( &device[i], i ), "cuDeviceGet" );
        CHECK_CU_ERROR( cuDeviceGetName( deviceName, 64, device[i] ), "cuDeviceGetName" );
        CHECK_CU_ERROR( cuDeviceComputeCapability( &computeCapabilityMajor, &computeCapabilityMinor, device[i] ), "cuDeviceComputeCapability" );
        cudaRuntimeGetVersion( &runtimeVersion );
        cudaDriverGetVersion( &driverVersion );
        printf( "CUDA Device %d: %s : computeCapability %d.%d runtimeVersion %d.%d driverVersion %d.%d\n",
                i, deviceName, computeCapabilityMajor, computeCapabilityMinor,
                runtimeVersion / 1000, ( runtimeVersion % 100 ) / 10,
                driverVersion / 1000, ( driverVersion % 100 ) / 10 );
    }

    // Create one context per device and pop it off this thread's
    // context stack, so it can be pushed again wherever it is needed.
    for( i = 0; i < GPU_N; i++ ) {
        CHECK_CUDA_ERROR( cudaSetDevice( i ) );
        CHECK_CU_ERROR( cuCtxCreate( &( ctx[i] ), 0, device[i] ), "cuCtxCreate" );
        CHECK_CU_ERROR( cuCtxPopCurrent( &( ctx[i] ) ), "cuCtxPopCurrent" );
    }

    printf( "Generating input data...\n" );

    // Subdivide input data across GPUs: an even split, then spread the
    // remainder over the first DATA_N % GPU_N devices.
    for( i = 0; i < GPU_N; i++ )
        plan[i].dataN = DATA_N / GPU_N;
    for( i = 0; i < DATA_N % GPU_N; i++ )
        plan[i].dataN++;

    // Assign data ranges to GPUs
    gpuBase = 0;
    for( i = 0; i < GPU_N; i++ ) {
        plan[i].h_Sum = h_SumGPU + i;
        gpuBase += plan[i].dataN;
    }

    // Create streams for issuing GPU commands asynchronously and
    // allocate memory (GPU and system page-locked)
    for( i = 0; i < GPU_N; i++ ) {
        CHECK_CUDA_ERROR( cudaSetDevice( i ) );
        CHECK_CU_ERROR( cuCtxPushCurrent( ctx[i] ), "cuCtxPushCurrent" );
        CHECK_CUDA_ERROR( cudaStreamCreate( &plan[i].stream ) );
        CHECK_CUDA_ERROR( cudaMalloc( ( void ** ) &plan[i].d_Data, plan[i].dataN * sizeof( float ) ) );
        CHECK_CUDA_ERROR( cudaMalloc( ( void ** ) &plan[i].d_Sum, ACCUM_N * sizeof( float ) ) );
        CHECK_CUDA_ERROR( cudaMallocHost( ( void ** ) &plan[i].h_Sum_from_device, ACCUM_N * sizeof( float ) ) );
        CHECK_CUDA_ERROR( cudaMallocHost( ( void ** ) &plan[i].h_Data, plan[i].dataN * sizeof( float ) ) );
        for( j = 0; j < plan[i].dataN; j++ )
            plan[i].h_Data[j] = ( float ) rand() / ( float ) RAND_MAX;
        CHECK_CU_ERROR( cuCtxPopCurrent( &( ctx[i] ) ), "cuCtxPopCurrent" );
    }

#ifdef CUPTI_ONLY
    // Set up one CUPTI event group per device.  "inst_executed" is an
    // example event; valid names depend on the GPU generation.
    CUptiResult cuptiErr;
    CUpti_EventGroup eg[MAX_GPU_COUNT];
    CUpti_EventID myevent;
    for( i = 0; i < GPU_N; i++ ) {
        CHECK_CUDA_ERROR( cudaSetDevice( i ) );
        CHECK_CU_ERROR( cuCtxPushCurrent( ctx[i] ), "cuCtxPushCurrent" );
        cuptiErr = cuptiSetEventCollectionMode( ctx[i], CUPTI_EVENT_COLLECTION_MODE_KERNEL );
        CHECK_CUPTI_ERROR( cuptiErr, "cuptiSetEventCollectionMode" );
        cuptiErr = cuptiEventGroupCreate( ctx[i], &eg[i], 0 );
        CHECK_CUPTI_ERROR( cuptiErr, "cuptiEventGroupCreate" );
        cuptiErr = cuptiEventGetIdFromName( device[i], "inst_executed", &myevent );
        CHECK_CUPTI_ERROR( cuptiErr, "cuptiEventGetIdFromName" );
        cuptiErr = cuptiEventGroupAddEvent( eg[i], myevent );
        CHECK_CUPTI_ERROR( cuptiErr, "cuptiEventGroupAddEvent" );
        cuptiErr = cuptiEventGroupEnable( eg[i] );
        CHECK_CUPTI_ERROR( cuptiErr, "cuptiEventGroupEnable" );
        CHECK_CU_ERROR( cuCtxPopCurrent( &( ctx[i] ) ), "cuCtxPopCurrent" );
    }
#endif

#ifdef PAPI
    printf( "Setting up PAPI counters...\n" );
    int retval, ee;
    int EventSet = PAPI_NULL;
    int eventCount = 0;
    long long values[MAX_NUM_EVENTS];
    char *EventName[MAX_NUM_EVENTS];
    char tmpEventName[64];

    // Example native events; list what your installation actually
    // exposes with the papi_native_avail utility.
    const char *EventEndings[] = { "cuda:::event:elapsed_cycles_sm",
                                   "cuda:::event:inst_executed" };
    const int numEventEndings = 2;

    retval = PAPI_library_init( PAPI_VER_CURRENT );
    if( retval != PAPI_VER_CURRENT ) fprintf( stderr, "PAPI_library_init failed\n" );
    retval = PAPI_create_eventset( &EventSet );
    if( retval != PAPI_OK ) fprintf( stderr, "PAPI_create_eventset failed\n" );

    // Add each event once per device, qualified with ":device=N",
    // e.g. "cuda:::event:inst_executed:device=1" for GPU 1.
    for( i = 0; i < GPU_N; i++ ) {
        for( ee = 0; ee < numEventEndings && eventCount < MAX_NUM_EVENTS; ee++ ) {
            snprintf( tmpEventName, 64, "%s:device=%d", EventEndings[ee], i );
            retval = PAPI_add_named_event( EventSet, tmpEventName );
            if( retval == PAPI_OK ) {
                EventName[eventCount] = ( char * ) calloc( 64, sizeof( char ) );
                snprintf( EventName[eventCount], 64, "%s", tmpEventName );
                eventCount++;
            } else {
                printf( "Could not add event '%s' (%s)\n", tmpEventName, PAPI_strerror( retval ) );
            }
        }
    }

    retval = PAPI_start( EventSet );
    if( retval != PAPI_OK ) fprintf( stderr, "PAPI_start failed\n" );
#endif

    // Start timing and computing on GPU(s)
    printf( "Computing with %d GPUs...\n", GPU_N );
    StartTimer();

    // Copy data to the GPUs, launch the kernel and read the results
    // back, all asynchronously on one stream per GPU
    for( i = 0; i < GPU_N; i++ ) {
        // Set device
        CHECK_CUDA_ERROR( cudaSetDevice( i ) );
        CHECK_CU_ERROR( cuCtxPushCurrent( ctx[i] ), "cuCtxPushCurrent" );

        // Copy input data from CPU
        CHECK_CUDA_ERROR( cudaMemcpyAsync( plan[i].d_Data, plan[i].h_Data, plan[i].dataN * sizeof( float ), cudaMemcpyHostToDevice, plan[i].stream ) );

        // Perform GPU computations
        reduceKernel <<< BLOCK_N, THREAD_N, 0, plan[i].stream >>> ( plan[i].d_Sum, plan[i].d_Data, plan[i].dataN );
        if ( cudaGetLastError() != cudaSuccess ) {
            printf( "reduceKernel() execution failed (GPU %d).\n", i );
            exit( EXIT_FAILURE );
        }

        // Read back GPU results
        CHECK_CUDA_ERROR( cudaMemcpyAsync( plan[i].h_Sum_from_device, plan[i].d_Sum, ACCUM_N * sizeof( float ), cudaMemcpyDeviceToHost, plan[i].stream ) );
        CHECK_CU_ERROR( cuCtxPopCurrent( &( ctx[i] ) ), "cuCtxPopCurrent" );
    }

    // Process GPU results
    printf( "Process GPU results on %d GPUs...\n", GPU_N );
    for( i = 0; i < GPU_N; i++ ) {
        float sum;

        // Set device
        CHECK_CUDA_ERROR( cudaSetDevice( i ) );
        CHECK_CU_ERROR( cuCtxPushCurrent( ctx[i] ), "cuCtxPushCurrent" );

        // Wait for all operations to finish
        cudaStreamSynchronize( plan[i].stream );

        // Finalize GPU reduction for current subvector
        sum = 0;
        for( j = 0; j < ACCUM_N; j++ ) {
            sum += plan[i].h_Sum_from_device[j];
        }
        *( plan[i].h_Sum ) = ( float ) sum;

        CHECK_CU_ERROR( cuCtxPopCurrent( &( ctx[i] ) ), "cuCtxPopCurrent" );
    }
    double gpuTime = GetTimer();

#ifdef CUPTI_ONLY
    // Read the raw counter values collected on each device
    size_t size = 1024;
    size_t sizeBytes = size * sizeof( uint64_t );
    uint64_t buffer[1024];
    uint64_t tmp[1024];
    for( int jj = 0; jj < 1024; jj++ ) tmp[jj] = 0;
    for( i = 0; i < GPU_N; i++ ) {
        CHECK_CUDA_ERROR( cudaSetDevice( i ) );
        CHECK_CU_ERROR( cuCtxPushCurrent( ctx[i] ), "cuCtxPushCurrent" );
        sizeBytes = size * sizeof( uint64_t );
        cuptiErr = cuptiEventGroupReadEvent( eg[i], CUPTI_EVENT_READ_FLAG_NONE, myevent, &sizeBytes, buffer );
        CHECK_CUPTI_ERROR( cuptiErr, "cuptiEventGroupReadEvent" );
        printf( "CUPTI device %d counterValue %llu\n", i, ( unsigned long long ) buffer[0] );
        CHECK_CU_ERROR( cuCtxPopCurrent( &( ctx[i] ) ), "cuCtxPopCurrent" );
    }
#endif

#ifdef PAPI
    retval = PAPI_read( EventSet, values );
    if( retval != PAPI_OK ) fprintf( stderr, "PAPI_read failed\n" );
    for( i = 0; i < eventCount; i++ )
        printf( "PAPI counterValue %12lld \t\t --> %s \n", values[i], EventName[i] );

    // retval = PAPI_read( EventSet, values );
    // if( retval != PAPI_OK ) fprintf( stderr, "PAPI_read failed\n" );
    // for( i = 0; i < eventCount; i++ )
    //     printf( "PAPI counterValue %12lld \t\t --> %s \n", values[i], EventName[i] );

    retval = PAPI_stop( EventSet, values );
    if( retval != PAPI_OK ) fprintf( stderr, "PAPI_stop failed\n" );
    for( i = 0; i < eventCount; i++ )
        printf( "PAPI counterValue %12lld \t\t --> %s \n", values[i], EventName[i] );

    retval = PAPI_cleanup_eventset( EventSet );
    if( retval != PAPI_OK ) fprintf( stderr, "PAPI_cleanup_eventset failed\n" );
    retval = PAPI_destroy_eventset( &EventSet );
    if( retval != PAPI_OK ) fprintf( stderr, "PAPI_destroy_eventset failed\n" );
    PAPI_shutdown();
#endif

    for( i = 0; i < GPU_N; i++ ) {
        CHECK_CUDA_ERROR( cudaFreeHost( plan[i].h_Sum_from_device ) );
        CHECK_CUDA_ERROR( cudaFree( plan[i].d_Sum ) );
        CHECK_CUDA_ERROR( cudaFree( plan[i].d_Data ) );
        // Shut down this GPU
        CHECK_CUDA_ERROR( cudaStreamDestroy( plan[i].stream ) );
    }
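    // At this point every per-GPU partial sum has been folded into
    // h_SumGPU[]; the remaining work (final accumulation, the CPU
    // reference sum, and the comparison) is plain serial host code.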
    sumGPU = 0;
    for( i = 0; i < GPU_N; i++ ) {
        sumGPU += h_SumGPU[i];
    }
    printf( " GPU Processing time: %f (ms)\n", gpuTime );

    // Compute on Host CPU
    printf( "Computing the same result with Host CPU...\n" );
    StartTimer();
    sumCPU = 0;
    for( i = 0; i < GPU_N; i++ ) {
        for( j = 0; j < plan[i].dataN; j++ ) {
            sumCPU += plan[i].h_Data[j];
        }
    }
    double cpuTime = GetTimer();
    printf( " CPU Processing time: %f (ms)\n", cpuTime );

    // Compare GPU and CPU results
    printf( "Comparing GPU and Host CPU results...\n" );
    diff = fabs( sumCPU - sumGPU ) / fabs( sumCPU );
    printf( " GPU sum: %f\n CPU sum: %f\n", sumGPU, sumCPU );
    printf( " Relative difference: %E \n", diff );

    // Cleanup and shutdown
    for( i = 0; i < GPU_N; i++ ) {
        CHECK_CUDA_ERROR( cudaSetDevice( i ) );
        CHECK_CUDA_ERROR( cudaFreeHost( plan[i].h_Data ) );
        cudaDeviceReset();
    }

    exit( ( diff < 1e-5 ) ? EXIT_SUCCESS : EXIT_FAILURE );
}