|
Packit Service |
a1973e |
/* PAPI Multiple GPU example. This example is taken from the NVIDIA
|
|
Packit Service |
a1973e |
* documentation (Copyright 1993-2013 NVIDIA Corporation) and has been
|
|
Packit Service |
a1973e |
* adapted to show the use of CUPTI and PAPI in collecting event
|
|
Packit Service |
a1973e |
* counters for multiple GPU contexts. PAPI Team (2015)
|
|
Packit Service |
a1973e |
*/
|
|
Packit Service |
a1973e |
|
|
Packit Service |
a1973e |
/*
|
|
Packit Service |
a1973e |
* Copyright 1993-2013 NVIDIA Corporation. All rights reserved.
|
|
Packit Service |
a1973e |
*
|
|
Packit Service |
a1973e |
* Please refer to the NVIDIA end user license agreement (EULA) associated
|
|
Packit Service |
a1973e |
* with this source code for terms and conditions that govern your use of
|
|
Packit Service |
a1973e |
* this software. Any use, reproduction, disclosure, or distribution of
|
|
Packit Service |
a1973e |
* this software and related documentation outside the terms of the EULA
|
|
Packit Service |
a1973e |
* is strictly prohibited.
|
|
Packit Service |
a1973e |
*
|
|
Packit Service |
a1973e |
*/
|
|
Packit Service |
a1973e |
|
|
Packit Service |
a1973e |
/*
|
|
Packit Service |
a1973e |
* This application demonstrates how to use the CUDA API to use multiple GPUs,
|
|
Packit Service |
a1973e |
* with an emphasis on simple illustration of the techniques (not on performance).
|
|
Packit Service |
a1973e |
*
|
|
Packit Service |
a1973e |
* Note that in order to detect multiple GPUs in your system you have to disable
|
|
Packit Service |
a1973e |
* SLI in the nvidia control panel. Otherwise only one GPU is visible to the
|
|
Packit Service |
a1973e |
* application. On the other side, you can still extend your desktop to screens
|
|
Packit Service |
a1973e |
* attached to both GPUs.
|
|
Packit Service |
a1973e |
*/
|
|
Packit Service |
a1973e |
|
|
Packit Service |
a1973e |
// System includes
|
|
Packit Service |
a1973e |
#include <stdio.h>
|
|
Packit Service |
a1973e |
#include <assert.h>
|
|
Packit Service |
a1973e |
|
|
Packit Service |
a1973e |
// CUDA runtime
|
|
Packit Service |
a1973e |
#include <cuda.h>
|
|
Packit Service |
a1973e |
#include <cuda_runtime.h>
|
|
Packit Service |
a1973e |
#include <cuda_runtime_api.h>
|
|
Packit Service |
a1973e |
#include <cupti.h>
|
|
Packit Service |
a1973e |
#include <timer.h>
|
|
Packit Service |
a1973e |
|
|
Packit Service |
a1973e |
#include "papi.h"
|
|
Packit Service |
a1973e |
#include "papi_test.h"
|
|
Packit Service |
a1973e |
|
|
Packit Service |
a1973e |
#if not defined PAPI
|
|
Packit Service |
a1973e |
#undef PAPI
|
|
Packit Service |
a1973e |
#endif
|
|
Packit Service |
a1973e |
|
|
Packit Service |
a1973e |
#if not defined CUPTI_ONLY
|
|
Packit Service |
a1973e |
#undef CUPTI_ONLY
|
|
Packit Service |
a1973e |
#endif
|
|
Packit Service |
a1973e |
|
|
Packit Service |
a1973e |
#ifndef MAX
|
|
Packit Service |
a1973e |
#define MAX(a,b) (a > b ? a : b)
|
|
Packit Service |
a1973e |
#endif
|
|
Packit Service |
a1973e |
|
|
Packit Service |
a1973e |
#include "simpleMultiGPU.h"
|
|
Packit Service |
a1973e |
|
|
Packit Service |
a1973e |
// //////////////////////////////////////////////////////////////////////////////
|
|
Packit Service |
a1973e |
// Data configuration
|
|
Packit Service |
a1973e |
// //////////////////////////////////////////////////////////////////////////////
|
|
Packit Service |
a1973e |
const int MAX_GPU_COUNT = 32;
|
|
Packit Service |
a1973e |
const int DATA_N = 48576 * 32;
|
|
Packit Service |
a1973e |
#ifdef PAPI
|
|
Packit Service |
a1973e |
const int MAX_NUM_EVENTS = 32;
|
|
Packit Service |
a1973e |
#endif
|
|
Packit Service |
a1973e |
|
|
Packit Service |
a1973e |
#define CHECK_CU_ERROR(err, cufunc) \
|
|
Packit Service |
a1973e |
if (err != CUDA_SUCCESS) { printf ("Error %d for CUDA Driver API function '%s'\n", err, cufunc); return -1; }
|
|
Packit Service |
a1973e |
|
|
Packit Service |
a1973e |
#define CHECK_CUDA_ERROR(err) \
|
|
Packit Service |
a1973e |
if (err != cudaSuccess) { printf ("Error %d for CUDA \n", err ); return -1; }
|
|
Packit Service |
a1973e |
|
|
Packit Service |
a1973e |
#define CHECK_CUPTI_ERROR(err, cuptifunc) \
|
|
Packit Service |
a1973e |
if (err != CUPTI_SUCCESS) { printf ("Error %d for CUPTI API function '%s'\n", err, cuptifunc); return -1; }
|
|
Packit Service |
a1973e |
|
|
Packit Service |
a1973e |
|
|
Packit Service |
a1973e |
// //////////////////////////////////////////////////////////////////////////////
|
|
Packit Service |
a1973e |
// Simple reduction kernel.
|
|
Packit Service |
a1973e |
// Refer to the 'reduction' CUDA SDK sample describing
|
|
Packit Service |
a1973e |
// reduction optimization strategies
|
|
Packit Service |
a1973e |
// //////////////////////////////////////////////////////////////////////////////
|
|
Packit Service |
a1973e |
__global__ static void reduceKernel( float *d_Result, float *d_Input, int N )
|
|
Packit Service |
a1973e |
{
|
|
Packit Service |
a1973e |
const int tid = blockIdx.x * blockDim.x + threadIdx.x;
|
|
Packit Service |
a1973e |
const int threadN = gridDim.x * blockDim.x;
|
|
Packit Service |
a1973e |
float sum = 0;
|
|
Packit Service |
a1973e |
|
|
Packit Service |
a1973e |
for( int pos = tid; pos < N; pos += threadN )
|
|
Packit Service |
a1973e |
sum += d_Input[pos];
|
|
Packit Service |
a1973e |
|
|
Packit Service |
a1973e |
d_Result[tid] = sum;
|
|
Packit Service |
a1973e |
}
|
|
Packit Service |
a1973e |
|
|
Packit Service |
a1973e |
// //////////////////////////////////////////////////////////////////////////////
|
|
Packit Service |
a1973e |
// Program main
|
|
Packit Service |
a1973e |
// //////////////////////////////////////////////////////////////////////////////
|
|
Packit Service |
a1973e |
int main( int argc, char **argv )
|
|
Packit Service |
a1973e |
{
|
|
Packit Service |
a1973e |
// Solver config
|
|
Packit Service |
a1973e |
TGPUplan plan[MAX_GPU_COUNT];
|
|
Packit Service |
a1973e |
// GPU reduction results
|
|
Packit Service |
a1973e |
float h_SumGPU[MAX_GPU_COUNT];
|
|
Packit Service |
a1973e |
float sumGPU;
|
|
Packit Service |
a1973e |
double sumCPU, diff;
|
|
Packit Service |
a1973e |
int i, j, gpuBase, GPU_N;
|
|
Packit Service |
a1973e |
|
|
Packit Service |
a1973e |
const int BLOCK_N = 32;
|
|
Packit Service |
a1973e |
const int THREAD_N = 256;
|
|
Packit Service |
a1973e |
const int ACCUM_N = BLOCK_N * THREAD_N;
|
|
Packit Service |
a1973e |
|
|
Packit Service |
a1973e |
CUcontext ctx[MAX_GPU_COUNT];
|
|
Packit Service |
a1973e |
|
|
Packit Service |
a1973e |
printf( "Starting simpleMultiGPU\n" );
|
|
Packit Service |
a1973e |
|
|
Packit Service |
a1973e |
// Report on the available CUDA devices
|
|
Packit Service |
a1973e |
int computeCapabilityMajor = 0, computeCapabilityMinor = 0;
|
|
Packit Service |
a1973e |
int runtimeVersion = 0, driverVersion = 0;
|
|
Packit Service |
a1973e |
char deviceName[64];
|
|
Packit Service |
a1973e |
CUdevice device[MAX_GPU_COUNT];
|
|
Packit Service |
a1973e |
CHECK_CUDA_ERROR( cudaGetDeviceCount( &GPU_N ) );
|
|
Packit Service |
a1973e |
if( GPU_N > MAX_GPU_COUNT ) GPU_N = MAX_GPU_COUNT;
|
|
Packit Service |
a1973e |
printf( "CUDA-capable device count: %i\n", GPU_N );
|
|
Packit Service |
a1973e |
for ( i=0; i
|
|
Packit Service |
a1973e |
CHECK_CU_ERROR( cuDeviceGet( &device[i], i ), "cuDeviceGet" );
|
|
Packit Service |
a1973e |
CHECK_CU_ERROR( cuDeviceGetName( deviceName, 64, device[i] ), "cuDeviceGetName" );
|
|
Packit Service |
a1973e |
CHECK_CU_ERROR( cuDeviceComputeCapability( &computeCapabilityMajor, &computeCapabilityMinor, device[i] ), "cuDeviceComputeCapability" );
|
|
Packit Service |
a1973e |
cudaRuntimeGetVersion( &runtimeVersion );
|
|
Packit Service |
a1973e |
cudaDriverGetVersion( &driverVersion );
|
|
Packit Service |
a1973e |
printf( "CUDA Device %d: %s : computeCapability %d.%d runtimeVersion %d.%d driverVersion %d.%d\n", i, deviceName, computeCapabilityMajor, computeCapabilityMinor, runtimeVersion/1000, (runtimeVersion%100)/10, driverVersion/1000, (driverVersion%100)/10 );
|
|
Packit Service |
a1973e |
if ( computeCapabilityMajor < 2 ) {
|
|
Packit Service |
a1973e |
printf( "CUDA Device %d compute capability is too low... will not add any more GPUs\n", i );
|
|
Packit Service |
a1973e |
GPU_N = i;
|
|
Packit Service |
a1973e |
break;
|
|
Packit Service |
a1973e |
}
|
|
Packit Service |
a1973e |
}
|
|
Packit Service |
a1973e |
uint32_t cupti_linked_version;
|
|
Packit Service |
a1973e |
cuptiGetVersion( &cupti_linked_version );
|
|
Packit Service |
a1973e |
printf("CUPTI version: Compiled against version %d; Linked against version %d\n", CUPTI_API_VERSION, cupti_linked_version );
|
|
Packit Service |
a1973e |
|
|
Packit Service |
a1973e |
// create one context per device
|
|
Packit Service |
a1973e |
for (i = 0; i < GPU_N; i++) {
|
|
Packit Service |
a1973e |
CHECK_CUDA_ERROR( cudaSetDevice( i ) );
|
|
Packit Service |
a1973e |
CHECK_CU_ERROR( cuCtxCreate( &(ctx[i]), 0, device[i] ), "cuCtxCreate" );
|
|
Packit Service |
a1973e |
CHECK_CU_ERROR( cuCtxPopCurrent(&(ctx[i])), "cuCtxPopCurrent" );
|
|
Packit Service |
a1973e |
}
|
|
Packit Service |
a1973e |
|
|
Packit Service |
a1973e |
printf( "Generating input data...\n" );
|
|
Packit Service |
a1973e |
|
|
Packit Service |
a1973e |
// Subdividing input data across GPUs
|
|
Packit Service |
a1973e |
// Get data sizes for each GPU
|
|
Packit Service |
a1973e |
for( i = 0; i < GPU_N; i++ )
|
|
Packit Service |
a1973e |
plan[i].dataN = DATA_N / GPU_N;
|
|
Packit Service |
a1973e |
// Take into account "odd" data sizes
|
|
Packit Service |
a1973e |
for( i = 0; i < DATA_N % GPU_N; i++ )
|
|
Packit Service |
a1973e |
plan[i].dataN++;
|
|
Packit Service |
a1973e |
|
|
Packit Service |
a1973e |
// Assign data ranges to GPUs
|
|
Packit Service |
a1973e |
gpuBase = 0;
|
|
Packit Service |
a1973e |
for( i = 0; i < GPU_N; i++ ) {
|
|
Packit Service |
a1973e |
plan[i].h_Sum = h_SumGPU + i; // point within h_SumGPU array
|
|
Packit Service |
a1973e |
gpuBase += plan[i].dataN;
|
|
Packit Service |
a1973e |
}
|
|
Packit Service |
a1973e |
|
|
Packit Service |
a1973e |
|
|
Packit Service |
a1973e |
// Create streams for issuing GPU command asynchronously and allocate memory (GPU and System page-locked)
|
|
Packit Service |
a1973e |
for( i = 0; i < GPU_N; i++ ) {
|
|
Packit Service |
a1973e |
CHECK_CUDA_ERROR( cudaSetDevice( i ) );
|
|
Packit Service |
a1973e |
CHECK_CU_ERROR(cuCtxPushCurrent(ctx[i]), "cuCtxPushCurrent");
|
|
Packit Service |
a1973e |
CHECK_CUDA_ERROR( cudaStreamCreate( &plan[i].stream ) );
|
|
Packit Service |
a1973e |
CHECK_CUDA_ERROR( cudaMalloc( ( void ** ) &plan[i].d_Data, plan[i].dataN * sizeof( float ) ) );
|
|
Packit Service |
a1973e |
CHECK_CUDA_ERROR( cudaMalloc( ( void ** ) &plan[i].d_Sum, ACCUM_N * sizeof( float ) ) );
|
|
Packit Service |
a1973e |
CHECK_CUDA_ERROR( cudaMallocHost( ( void ** ) &plan[i].h_Sum_from_device, ACCUM_N * sizeof( float ) ) );
|
|
Packit Service |
a1973e |
CHECK_CUDA_ERROR( cudaMallocHost( ( void ** ) &plan[i].h_Data, plan[i].dataN * sizeof( float ) ) );
|
|
Packit Service |
a1973e |
for( j = 0; j < plan[i].dataN; j++ ) {
|
|
Packit Service |
a1973e |
plan[i].h_Data[j] = ( float ) rand() / ( float ) RAND_MAX;
|
|
Packit Service |
a1973e |
}
|
|
Packit Service |
a1973e |
CHECK_CU_ERROR( cuCtxPopCurrent(&(ctx[i])), "cuCtxPopCurrent" );
|
|
Packit Service |
a1973e |
}
|
|
Packit Service |
a1973e |
|
|
Packit Service |
a1973e |
|
|
Packit Service |
a1973e |
#ifdef CUPTI_ONLY
|
|
Packit Service |
a1973e |
char const *cuptiEventName = "inst_executed"; // "elapsed_cycles_sm" "inst_executed"; "inst_issued0";
|
|
Packit Service |
a1973e |
printf("Setup CUPTI counters internally for %s event (CUPTI_ONLY)\n", cuptiEventName);
|
|
Packit Service |
a1973e |
CUpti_EventGroup eg[MAX_GPU_COUNT];
|
|
Packit Service |
a1973e |
CUpti_EventID myevent;
|
|
Packit Service |
a1973e |
for ( i=0; i
|
|
Packit Service |
a1973e |
CHECK_CUDA_ERROR( cudaSetDevice( i ) );
|
|
Packit Service |
a1973e |
CHECK_CU_ERROR(cuCtxPushCurrent(ctx[i]), "cuCtxPushCurrent");
|
|
Packit Service |
a1973e |
CHECK_CUPTI_ERROR(cuptiSetEventCollectionMode(ctx[i], CUPTI_EVENT_COLLECTION_MODE_KERNEL), "cuptiSetEventCollectionMode" );
|
|
Packit Service |
a1973e |
CHECK_CUPTI_ERROR( cuptiEventGroupCreate( ctx[i], &eg[i], 0 ), "cuptiEventGroupCreate" );
|
|
Packit Service |
a1973e |
cuptiEventGetIdFromName ( device[i], cuptiEventName, &myevent );
|
|
Packit Service |
a1973e |
CHECK_CUPTI_ERROR( cuptiEventGroupAddEvent( eg[i], myevent ), "cuptiEventGroupAddEvent" );
|
|
Packit Service |
a1973e |
CHECK_CUPTI_ERROR( cuptiEventGroupEnable( eg[i] ), "cuptiEventGroupEnable" );
|
|
Packit Service |
a1973e |
CHECK_CU_ERROR( cuCtxPopCurrent(&(ctx[i])), "cuCtxPopCurrent" );
|
|
Packit Service |
a1973e |
}
|
|
Packit Service |
a1973e |
#endif
|
|
Packit Service |
a1973e |
|
|
Packit Service |
a1973e |
#ifdef PAPI
|
|
Packit Service |
a1973e |
printf("Setup PAPI counters internally (PAPI)\n");
|
|
Packit Service |
a1973e |
int EventSet = PAPI_NULL;
|
|
Packit Service |
a1973e |
int NUM_EVENTS = MAX_GPU_COUNT*MAX_NUM_EVENTS;
|
|
Packit Service |
a1973e |
long long values[NUM_EVENTS];
|
|
Packit Service |
a1973e |
int eventCount;
|
|
Packit Service |
a1973e |
int retval, ee;
|
|
Packit Service |
a1973e |
|
|
Packit Service |
a1973e |
/* PAPI Initialization */
|
|
Packit Service |
a1973e |
retval = PAPI_library_init( PAPI_VER_CURRENT );
|
|
Packit Service |
a1973e |
if( retval != PAPI_VER_CURRENT ) fprintf( stderr, "PAPI_library_init failed\n" );
|
|
Packit Service |
a1973e |
printf( "PAPI version: %d.%d.%d\n", PAPI_VERSION_MAJOR( PAPI_VERSION ), PAPI_VERSION_MINOR( PAPI_VERSION ), PAPI_VERSION_REVISION( PAPI_VERSION ) );
|
|
Packit Service |
a1973e |
|
|
Packit Service |
a1973e |
retval = PAPI_create_eventset( &EventSet );
|
|
Packit Service |
a1973e |
if( retval != PAPI_OK ) fprintf( stderr, "PAPI_create_eventset failed\n" );
|
|
Packit Service |
a1973e |
|
|
Packit Service |
a1973e |
// In this example measure events from each GPU
|
|
Packit Service |
a1973e |
int numEventEndings = 3;
|
|
Packit Service |
a1973e |
char const *EventEndings[] = {
|
|
Packit Service |
a1973e |
"cuda:::metric:inst_per_warp",
|
|
Packit Service |
a1973e |
"cuda:::event:inst_executed",
|
|
Packit Service |
a1973e |
"cuda:::event:elapsed_cycles_sm"
|
|
Packit Service |
a1973e |
};
|
|
Packit Service |
a1973e |
|
|
Packit Service |
a1973e |
// Add events at a GPU specific level ... eg cuda:::device:2:elapsed_cycles_sm
|
|
Packit Service |
a1973e |
char *EventName[NUM_EVENTS];
|
|
Packit Service |
a1973e |
char tmpEventName[50];
|
|
Packit Service |
a1973e |
eventCount = 0;
|
|
Packit Service |
a1973e |
for( i = 0; i < GPU_N; i++ ) {
|
|
Packit Service |
a1973e |
CHECK_CUDA_ERROR( cudaSetDevice( i ) ); // Set device
|
|
Packit Service |
a1973e |
CHECK_CU_ERROR(cuCtxPushCurrent(ctx[i]), "cuCtxPushCurrent");
|
|
Packit Service |
a1973e |
CHECK_CUPTI_ERROR(cuptiSetEventCollectionMode(ctx[i], CUPTI_EVENT_COLLECTION_MODE_KERNEL), "cuptiSetEventCollectionMode" );
|
|
Packit Service |
a1973e |
for ( ee=0; ee
|
|
Packit Service |
a1973e |
snprintf( tmpEventName, 50, "%s:device=%d\0", EventEndings[ee], i );
|
|
Packit Service |
a1973e |
printf( "Trying to add event %s to GPU %d in PAPI...", tmpEventName , i ); fflush(NULL);
|
|
Packit Service |
a1973e |
retval = PAPI_add_named_event( EventSet, tmpEventName );
|
|
Packit Service |
a1973e |
if (retval==PAPI_OK) {
|
|
Packit Service |
a1973e |
printf( "Added event\n" );
|
|
Packit Service |
a1973e |
EventName[eventCount] = (char *)calloc( 50, sizeof(char) );
|
|
Packit Service |
a1973e |
snprintf( EventName[eventCount], 50, "%s", tmpEventName );
|
|
Packit Service |
a1973e |
eventCount++;
|
|
Packit Service |
a1973e |
} else {
|
|
Packit Service |
a1973e |
printf( "Could not add event\n" );
|
|
Packit Service |
a1973e |
}
|
|
Packit Service |
a1973e |
}
|
|
Packit Service |
a1973e |
CHECK_CU_ERROR( cuCtxPopCurrent(&(ctx[i])), "cuCtxPopCurrent" );
|
|
Packit Service |
a1973e |
}
|
|
Packit Service |
a1973e |
|
|
Packit Service |
a1973e |
// Start PAPI event measurement
|
|
Packit Service |
a1973e |
retval = PAPI_start( EventSet );
|
|
Packit Service |
a1973e |
if( retval != PAPI_OK ) fprintf( stderr, "PAPI_start failed\n" );
|
|
Packit Service |
a1973e |
#endif
|
|
Packit Service |
a1973e |
|
|
Packit Service |
a1973e |
// Start timing and compute on GPU(s)
|
|
Packit Service |
a1973e |
printf( "Computing with %d GPUs...\n", GPU_N );
|
|
Packit Service |
a1973e |
StartTimer();
|
|
Packit Service |
a1973e |
|
|
Packit Service |
a1973e |
// Copy data to GPU, launch the kernel and copy data back. All asynchronously
|
|
Packit Service |
a1973e |
for (i = 0; i < GPU_N; i++) {
|
|
Packit Service |
a1973e |
// Set device
|
|
Packit Service |
a1973e |
CHECK_CUDA_ERROR( cudaSetDevice( i ));
|
|
Packit Service |
a1973e |
CHECK_CU_ERROR(cuCtxPushCurrent(ctx[i]), "cuCtxPushCurrent");
|
|
Packit Service |
a1973e |
// Copy input data from CPU
|
|
Packit Service |
a1973e |
CHECK_CUDA_ERROR( cudaMemcpyAsync( plan[i].d_Data, plan[i].h_Data, plan[i].dataN * sizeof( float ), cudaMemcpyHostToDevice, plan[i].stream ) );
|
|
Packit Service |
a1973e |
// Perform GPU computations
|
|
Packit Service |
a1973e |
reduceKernel <<< BLOCK_N, THREAD_N, 0, plan[i].stream >>> ( plan[i].d_Sum, plan[i].d_Data, plan[i].dataN );
|
|
Packit Service |
a1973e |
if ( cudaGetLastError() != cudaSuccess ) { printf( "reduceKernel() execution failed (GPU %d).\n", i ); exit(EXIT_FAILURE); }
|
|
Packit Service |
a1973e |
// Read back GPU results
|
|
Packit Service |
a1973e |
CHECK_CUDA_ERROR( cudaMemcpyAsync( plan[i].h_Sum_from_device, plan[i].d_Sum, ACCUM_N * sizeof( float ), cudaMemcpyDeviceToHost, plan[i].stream ) );
|
|
Packit Service |
a1973e |
CHECK_CU_ERROR( cuCtxPopCurrent(&(ctx[i])), "cuCtxPopCurrent" );
|
|
Packit Service |
a1973e |
}
|
|
Packit Service |
a1973e |
|
|
Packit Service |
a1973e |
// Process GPU results
|
|
Packit Service |
a1973e |
printf( "Process GPU results on %d GPUs...\n", GPU_N );
|
|
Packit Service |
a1973e |
for( i = 0; i < GPU_N; i++ ) {
|
|
Packit Service |
a1973e |
float sum;
|
|
Packit Service |
a1973e |
// Set device
|
|
Packit Service |
a1973e |
CHECK_CUDA_ERROR( cudaSetDevice( i ) );
|
|
Packit Service |
a1973e |
CHECK_CU_ERROR(cuCtxPushCurrent(ctx[i]), "cuCtxPushCurrent");
|
|
Packit Service |
a1973e |
// Wait for all operations to finish
|
|
Packit Service |
a1973e |
cudaStreamSynchronize( plan[i].stream );
|
|
Packit Service |
a1973e |
// Finalize GPU reduction for current subvector
|
|
Packit Service |
a1973e |
sum = 0;
|
|
Packit Service |
a1973e |
for( j = 0; j < ACCUM_N; j++ ) {
|
|
Packit Service |
a1973e |
sum += plan[i].h_Sum_from_device[j];
|
|
Packit Service |
a1973e |
}
|
|
Packit Service |
a1973e |
*( plan[i].h_Sum ) = ( float ) sum;
|
|
Packit Service |
a1973e |
CHECK_CU_ERROR( cuCtxPopCurrent(&(ctx[i])), "cuCtxPopCurrent" );
|
|
Packit Service |
a1973e |
}
|
|
Packit Service |
a1973e |
double gpuTime = GetTimer();
|
|
Packit Service |
a1973e |
|
|
Packit Service |
a1973e |
|
|
Packit Service |
a1973e |
#ifdef CUPTI_ONLY
|
|
Packit Service |
a1973e |
size_t size = 1024;
|
|
Packit Service |
a1973e |
size_t sizeBytes = size*sizeof(uint64_t);
|
|
Packit Service |
a1973e |
uint64_t buffer[size];
|
|
Packit Service |
a1973e |
uint64_t tmp[size]; for (int jj=0; jj<1024; jj++) tmp[jj]=0;
|
|
Packit Service |
a1973e |
for ( i=0; i
|
|
Packit Service |
a1973e |
CHECK_CUDA_ERROR( cudaSetDevice( i ) );
|
|
Packit Service |
a1973e |
CHECK_CU_ERROR(cuCtxPushCurrent(ctx[i]), "cuCtxPushCurrent");
|
|
Packit Service |
a1973e |
CHECK_CU_ERROR( cuCtxSynchronize( ), "cuCtxSynchronize" );
|
|
Packit Service |
a1973e |
CHECK_CUPTI_ERROR( cuptiEventGroupReadEvent ( eg[i], CUPTI_EVENT_READ_FLAG_NONE, myevent, &sizeBytes, &tmp[0] ), "cuptiEventGroupReadEvent" );
|
|
Packit Service |
a1973e |
buffer[i] = tmp[0];
|
|
Packit Service |
a1973e |
printf( "CUPTI %s device %d counterValue %u (on one domain, may need to be multiplied by num of domains)\n", cuptiEventName, i, buffer[i] );
|
|
Packit Service |
a1973e |
CHECK_CU_ERROR( cuCtxPopCurrent(&(ctx[i])), "cuCtxPopCurrent" );
|
|
Packit Service |
a1973e |
}
|
|
Packit Service |
a1973e |
#endif
|
|
Packit Service |
a1973e |
|
|
Packit Service |
a1973e |
#ifdef PAPI
|
|
Packit Service |
a1973e |
for ( i=0; i
|
|
Packit Service |
a1973e |
CHECK_CUDA_ERROR( cudaSetDevice( i ) );
|
|
Packit Service |
a1973e |
CHECK_CU_ERROR(cuCtxPushCurrent(ctx[i]), "cuCtxPushCurrent");
|
|
Packit Service |
a1973e |
CHECK_CU_ERROR( cuCtxSynchronize( ), "cuCtxSynchronize" );
|
|
Packit Service |
a1973e |
CHECK_CU_ERROR( cuCtxPopCurrent(&(ctx[i])), "cuCtxPopCurrent" );
|
|
Packit Service |
a1973e |
}
|
|
Packit Service |
a1973e |
|
|
Packit Service |
a1973e |
// retval = PAPI_read( EventSet, values );
|
|
Packit Service |
a1973e |
// if( retval != PAPI_OK ) fprintf( stderr, "PAPI_read failed\n" );
|
|
Packit Service |
a1973e |
// for( i = 0; i < eventCount; i++ )
|
|
Packit Service |
a1973e |
// printf( "PAPI counterValue %12lld \t\t --> %s \n", values[i], EventName[i] );
|
|
Packit Service |
a1973e |
|
|
Packit Service |
a1973e |
// retval = PAPI_read( EventSet, values );
|
|
Packit Service |
a1973e |
// if( retval != PAPI_OK ) fprintf( stderr, "PAPI_read failed\n" );
|
|
Packit Service |
a1973e |
// for( i = 0; i < eventCount; i++ )
|
|
Packit Service |
a1973e |
// printf( "PAPI counterValue %12lld \t\t --> %s \n", values[i], EventName[i] );
|
|
Packit Service |
a1973e |
|
|
Packit Service |
a1973e |
retval = PAPI_stop( EventSet, values );
|
|
Packit Service |
a1973e |
if( retval != PAPI_OK ) fprintf( stderr, "PAPI_stop failed\n" );
|
|
Packit Service |
a1973e |
for( i = 0; i < eventCount; i++ )
|
|
Packit Service |
a1973e |
printf( "PAPI counterValue %12lld \t\t --> %s \n", values[i], EventName[i] );
|
|
Packit Service |
a1973e |
|
|
Packit Service |
a1973e |
retval = PAPI_cleanup_eventset( EventSet );
|
|
Packit Service |
a1973e |
if( retval != PAPI_OK ) fprintf( stderr, "PAPI_cleanup_eventset failed\n" );
|
|
Packit Service |
a1973e |
retval = PAPI_destroy_eventset( &EventSet );
|
|
Packit Service |
a1973e |
if( retval != PAPI_OK ) fprintf( stderr, "PAPI_destroy_eventset failed\n" );
|
|
Packit Service |
a1973e |
PAPI_shutdown();
|
|
Packit Service |
a1973e |
#endif
|
|
Packit Service |
a1973e |
|
|
Packit Service |
a1973e |
for( i = 0; i < GPU_N; i++ ) {
|
|
Packit Service |
a1973e |
CHECK_CUDA_ERROR( cudaFreeHost( plan[i].h_Sum_from_device ) );
|
|
Packit Service |
a1973e |
CHECK_CUDA_ERROR( cudaFree( plan[i].d_Sum ) );
|
|
Packit Service |
a1973e |
CHECK_CUDA_ERROR( cudaFree( plan[i].d_Data ) );
|
|
Packit Service |
a1973e |
// Shut down this GPU
|
|
Packit Service |
a1973e |
CHECK_CUDA_ERROR( cudaStreamDestroy( plan[i].stream ) );
|
|
Packit Service |
a1973e |
}
|
|
Packit Service |
a1973e |
sumGPU = 0;
|
|
Packit Service |
a1973e |
for( i = 0; i < GPU_N; i++ ) {
|
|
Packit Service |
a1973e |
sumGPU += h_SumGPU[i];
|
|
Packit Service |
a1973e |
}
|
|
Packit Service |
a1973e |
printf( " GPU Processing time: %f (ms)\n", gpuTime );
|
|
Packit Service |
a1973e |
|
|
Packit Service |
a1973e |
// Compute on Host CPU
|
|
Packit Service |
a1973e |
printf( "Computing the same result with Host CPU...\n" );
|
|
Packit Service |
a1973e |
StartTimer();
|
|
Packit Service |
a1973e |
sumCPU = 0;
|
|
Packit Service |
a1973e |
for( i = 0; i < GPU_N; i++ ) {
|
|
Packit Service |
a1973e |
for( j = 0; j < plan[i].dataN; j++ ) {
|
|
Packit Service |
a1973e |
sumCPU += plan[i].h_Data[j];
|
|
Packit Service |
a1973e |
}
|
|
Packit Service |
a1973e |
}
|
|
Packit Service |
a1973e |
double cpuTime = GetTimer();
|
|
Packit Service |
a1973e |
printf( " CPU Processing time: %f (ms)\n", cpuTime );
|
|
Packit Service |
a1973e |
|
|
Packit Service |
a1973e |
// Compare GPU and CPU results
|
|
Packit Service |
a1973e |
printf( "Comparing GPU and Host CPU results...\n" );
|
|
Packit Service |
a1973e |
diff = fabs( sumCPU - sumGPU ) / fabs( sumCPU );
|
|
Packit Service |
a1973e |
printf( " GPU sum: %f\n CPU sum: %f\n", sumGPU, sumCPU );
|
|
Packit Service |
a1973e |
printf( " Relative difference: %E \n", diff );
|
|
Packit Service |
a1973e |
|
|
Packit Service |
a1973e |
// Cleanup and shutdown
|
|
Packit Service |
a1973e |
for( i = 0; i < GPU_N; i++ ) {
|
|
Packit Service |
a1973e |
CHECK_CUDA_ERROR( cudaSetDevice( i ) );
|
|
Packit Service |
a1973e |
CHECK_CUDA_ERROR( cudaFreeHost( plan[i].h_Data ) );
|
|
Packit Service |
a1973e |
cudaDeviceReset();
|
|
Packit Service |
a1973e |
}
|
|
Packit Service |
a1973e |
|
|
Packit Service |
a1973e |
exit( ( diff < 1e-5 ) ? EXIT_SUCCESS : EXIT_FAILURE );
|
|
Packit Service |
a1973e |
}
|
|
Packit Service |
a1973e |
|