Blame src/components/cuda/tests/nvlink_bandwidth_cupti_only.cu

Packit 577717
 /*
Packit 577717
 * Copyright 2015-2016 NVIDIA Corporation. All rights reserved.
Packit 577717
 *
Packit 577717
 * Sample to demonstrate use of NVlink CUPTI APIs
Packit 577717
 */
Packit 577717
Packit 577717
 #include <stdio.h>
Packit 577717
 #include <stdlib.h>
Packit 577717
 #include <string.h>
Packit 577717
 #include <cuda.h>
Packit 577717
 #include <cupti.h>
Packit 577717
Packit 577717
 #define CUPTI_CALL(call)                                                      \
Packit 577717
 do {                                                                          \
Packit 577717
     CUptiResult _status = call;                                               \
Packit 577717
     if (_status != CUPTI_SUCCESS) {                                           \
Packit 577717
         const char *errstr;                                                   \
Packit 577717
         cuptiGetResultString(_status, &errstr);                               \
Packit 577717
         fprintf(stderr, "%s:%d: error: function %s failed with error %s.\n",  \
Packit 577717
                 __FILE__, __LINE__, #call, errstr);                           \
Packit 577717
         exit(-1);                                                             \
Packit 577717
     }                                                                         \
Packit 577717
 } while (0)
Packit 577717
Packit 577717
 #define DRIVER_API_CALL(apiFuncCall)                                           \
Packit 577717
 do {                                                                           \
Packit 577717
     CUresult _status = apiFuncCall;                                            \
Packit 577717
     if (_status != CUDA_SUCCESS) {                                             \
Packit 577717
         fprintf(stderr, "%s:%d: error: function %s failed with error %d.\n",   \
Packit 577717
                 __FILE__, __LINE__, #apiFuncCall, _status);                    \
Packit 577717
         exit(-1);                                                              \
Packit 577717
     }                                                                          \
Packit 577717
 } while (0)
Packit 577717
Packit 577717
 #define RUNTIME_API_CALL(apiFuncCall)                                          \
Packit 577717
 do {                                                                           \
Packit 577717
     cudaError_t _status = apiFuncCall;                                         \
Packit 577717
     if (_status != cudaSuccess) {                                              \
Packit 577717
         fprintf(stderr, "%s:%d: error: function %s failed with error %s.\n",   \
Packit 577717
                 __FILE__, __LINE__, #apiFuncCall, cudaGetErrorString(_status));\
Packit 577717
         exit(-1);                                                              \
Packit 577717
     }                                                                          \
Packit 577717
 } while (0)
Packit 577717
Packit 577717
 #define MEMORY_ALLOCATION_CALL(var)                                            \
Packit 577717
 do {                                                                           \
Packit 577717
     if (var == NULL) {                                                         \
Packit 577717
         fprintf(stderr, "%s:%d: Error: Memory Allocation Failed \n",           \
Packit 577717
                 __FILE__, __LINE__);                                           \
Packit 577717
         exit(-1);                                                              \
Packit 577717
     }                                                                          \
Packit 577717
 } while (0)
Packit 577717
Packit 577717
 #define MAX_DEVICES    (32)
Packit 577717
 #define BLOCK_SIZE     (1024)
Packit 577717
 #define GRID_SIZE      (512)
Packit 577717
 #define BUF_SIZE       (32 * 1024)
Packit 577717
 #define ALIGN_SIZE     (8)
Packit 577717
 #define SUCCESS        (0)
Packit 577717
 #define NUM_METRIC     (4)
Packit 577717
 #define NUM_EVENTS     (2)
Packit 577717
 #define MAX_SIZE       (64*1024*1024)   // 64 MB
Packit 577717
 #define NUM_STREAMS    (6)   // gp100 has 6 physical copy engines
Packit 577717
Packit 577717
 CUpti_ActivityNvLink *nvlinkRec = NULL;
Packit 577717
 int cpuToGpu = 0;
Packit 577717
 int gpuToGpu = 0;
Packit 577717
 int cpuToGpuAccess = 0;
Packit 577717
 int gpuToGpuAccess = 0;
Packit 577717
Packit 577717
 extern "C" __global__ void test_nvlink_bandwidth(float *src, float *dst)
Packit 577717
 {
Packit 577717
     int idx = blockIdx.x * blockDim.x + threadIdx.x;
Packit 577717
     dst[idx] = src[idx] * 2.0f;
Packit 577717
 }
Packit 577717
Packit 577717
 static void printActivity(CUpti_Activity *record)
Packit 577717
 {
Packit 577717
     if (record->kind == CUPTI_ACTIVITY_KIND_NVLINK) {
Packit 577717
         nvlinkRec = (CUpti_ActivityNvLink *)record;
Packit 577717
Packit 577717
         printf("typeDev0 %d, typeDev1 %d, sysmem %d, peer %d, physical links %d, portdev0 %d, %d, %d, %d, portDev1 %d, %d, %d, %d, bandwidth %llu\n",
Packit 577717
           nvlinkRec->typeDev0,
Packit 577717
           nvlinkRec->typeDev1,
Packit 577717
           ((nvlinkRec->flag & CUPTI_LINK_FLAG_SYSMEM_ACCESS) ? 1 : 0),
Packit 577717
           ((nvlinkRec->flag & CUPTI_LINK_FLAG_PEER_ACCESS) ? 1 : 0),
Packit 577717
           nvlinkRec->physicalNvLinkCount,
Packit 577717
           nvlinkRec->portDev0[0], nvlinkRec->portDev0[1], nvlinkRec->portDev0[2], nvlinkRec->portDev0[3],
Packit 577717
           nvlinkRec->portDev1[0], nvlinkRec->portDev1[1], nvlinkRec->portDev1[2], nvlinkRec->portDev1[3],
Packit 577717
           (long long unsigned int)nvlinkRec->bandwidth);
Packit 577717
         cpuToGpuAccess |= (nvlinkRec->flag & CUPTI_LINK_FLAG_SYSMEM_ACCESS);
Packit 577717
         gpuToGpuAccess |= (nvlinkRec->flag & CUPTI_LINK_FLAG_PEER_ACCESS);
Packit 577717
     }
Packit 577717
     else {
Packit 577717
         printf("Error : Unexpected CUPTI activity kind.\nExpected Activity kind : CUPTI_ACTIVITY_KIND_NVLINK\n");
Packit 577717
     }
Packit 577717
 }
Packit 577717
Packit 577717
 static void CUPTIAPI bufferRequested(uint8_t **buffer, size_t *size, size_t *maxNumRecords)
Packit 577717
Packit 577717
 {
Packit 577717
     *size = BUF_SIZE + ALIGN_SIZE;
Packit 577717
     *buffer = (uint8_t*) calloc(1, *size);
Packit 577717
     MEMORY_ALLOCATION_CALL(*buffer);
Packit 577717
     *maxNumRecords = 0;
Packit 577717
 }
Packit 577717
Packit 577717
 static void CUPTIAPI bufferCompleted(CUcontext ctx, uint32_t streamId,
Packit 577717
                                 uint8_t *buffer, size_t size,
Packit 577717
                                 size_t validSize)
Packit 577717
 {
Packit 577717
     CUptiResult status;
Packit 577717
     CUpti_Activity *record = NULL;
Packit 577717
     do {
Packit 577717
         status = cuptiActivityGetNextRecord(buffer, validSize, &record);
Packit 577717
         if(status == CUPTI_SUCCESS) {
Packit 577717
             printActivity(record);
Packit 577717
         }
Packit 577717
         else if (status == CUPTI_ERROR_MAX_LIMIT_REACHED) {
Packit 577717
             break;
Packit 577717
         }
Packit 577717
         else {
Packit 577717
             CUPTI_CALL(status);
Packit 577717
         }
Packit 577717
     } while (1);
Packit 577717
Packit 577717
     size_t dropped;
Packit 577717
     CUPTI_CALL(cuptiActivityGetNumDroppedRecords(ctx, streamId, &dropped));
Packit 577717
     if (dropped != 0) {
Packit 577717
         printf("Dropped %u activity records\n", (unsigned int)dropped);
Packit 577717
     }
Packit 577717
 }
Packit 577717
Packit 577717
 #define DIM(x) (sizeof(x)/sizeof(*(x)))
Packit 577717
Packit 577717
 void calculateSize(char *result, uint64_t size)
Packit 577717
 {
Packit 577717
     int i;
Packit 577717
Packit 577717
     const char *sizes[]   = { "TB", "GB", "MB", "KB", "B" };
Packit 577717
     uint64_t  exbibytes = 1024ULL * 1024ULL * 1024ULL * 1024ULL;
Packit 577717
Packit 577717
     uint64_t  multiplier = exbibytes;
Packit 577717
Packit 577717
     for (i = 0; (unsigned)i < DIM(sizes); i++, multiplier /= (uint64_t)1024)
Packit 577717
     {
Packit 577717
         if (size < multiplier)
Packit 577717
             continue;
Packit 577717
         sprintf(result, "%.1f %s", (float) size / multiplier, sizes[i]);
Packit 577717
         return;
Packit 577717
     }
Packit 577717
     strcpy(result, "0");
Packit 577717
     return;
Packit 577717
 }
Packit 577717
Packit 577717
 void readMetricValue(CUpti_EventGroup eventGroup, uint32_t numEvents,
Packit 577717
                     CUdevice dev, CUpti_MetricID *metricId,
Packit 577717
                     uint64_t timeDuration,
Packit 577717
                     CUpti_MetricValue *metricValue) {
Packit 577717
Packit 577717
     size_t bufferSizeBytes, numCountersRead;
Packit 577717
     uint64_t *eventValueArray = NULL;
Packit 577717
     CUpti_EventID *eventIdArray;
Packit 577717
     size_t arraySizeBytes = 0;
Packit 577717
     size_t numTotalInstancesSize = 0;
Packit 577717
     uint64_t numTotalInstances = 0;
Packit 577717
     uint64_t *aggrEventValueArray = NULL;
Packit 577717
     size_t aggrEventValueArraySize;
Packit 577717
     uint32_t i = 0, j = 0;
Packit 577717
     CUpti_EventDomainID domainId;
Packit 577717
     size_t domainSize;
Packit 577717
Packit 577717
     domainSize = sizeof(CUpti_EventDomainID);
Packit 577717
Packit 577717
     CUPTI_CALL(cuptiEventGroupGetAttribute(eventGroup, 
Packit 577717
                                            CUPTI_EVENT_GROUP_ATTR_EVENT_DOMAIN_ID, 
Packit 577717
                                            &domainSize, 
Packit 577717
                                            (void *)&domainId));
Packit 577717
Packit 577717
     numTotalInstancesSize = sizeof(uint64_t);
Packit 577717
Packit 577717
     CUPTI_CALL(cuptiDeviceGetEventDomainAttribute(dev, 
Packit 577717
                                               domainId, 
Packit 577717
                                               CUPTI_EVENT_DOMAIN_ATTR_TOTAL_INSTANCE_COUNT, 
Packit 577717
                                               &numTotalInstancesSize, 
Packit 577717
                                               (void *)&numTotalInstances));
Packit 577717
Packit 577717
     arraySizeBytes = sizeof(CUpti_EventID) * numEvents;
Packit 577717
     bufferSizeBytes = sizeof(uint64_t) * numEvents * numTotalInstances;
Packit 577717
Packit 577717
     eventValueArray = (uint64_t *) malloc(bufferSizeBytes);
Packit 577717
     MEMORY_ALLOCATION_CALL(eventValueArray);
Packit 577717
Packit 577717
     eventIdArray = (CUpti_EventID *) malloc(arraySizeBytes);
Packit 577717
     MEMORY_ALLOCATION_CALL(eventIdArray);
Packit 577717
Packit 577717
     aggrEventValueArray = (uint64_t *) calloc(numEvents, sizeof(uint64_t));
Packit 577717
     MEMORY_ALLOCATION_CALL(aggrEventValueArray);
Packit 577717
Packit 577717
     aggrEventValueArraySize = sizeof(uint64_t) * numEvents;
Packit 577717
Packit 577717
     CUPTI_CALL(cuptiEventGroupReadAllEvents(eventGroup, 
Packit 577717
                                                 CUPTI_EVENT_READ_FLAG_NONE,
Packit 577717
                                                 &bufferSizeBytes, 
Packit 577717
                                                 eventValueArray, 
Packit 577717
                                                 &arraySizeBytes, 
Packit 577717
                                                 eventIdArray, 
Packit 577717
                                                 &numCountersRead));
Packit 577717
Packit 577717
     for (i = 0; i < numEvents; i++) {
Packit 577717
         for (j = 0; j < numTotalInstances; j++) {
Packit 577717
             aggrEventValueArray[i] += eventValueArray[i + numEvents * j];
Packit 577717
             //printf("For event %d (id %d) instance %d value %ul aggregate %d = %ul\n", i, eventIdArray[i], j,  eventValueArray[i + numEvents * j], i, aggrEventValueArray[i]);
Packit 577717
         }
Packit 577717
     }
Packit 577717
Packit 577717
     for (i = 0; i < NUM_METRIC; i++) {
Packit 577717
         CUPTI_CALL(cuptiMetricGetValue(dev, metricId[i], arraySizeBytes,
Packit 577717
                               eventIdArray, aggrEventValueArraySize,
Packit 577717
                               aggrEventValueArray, timeDuration,
Packit 577717
                               &metricValue[i]));
Packit 577717
     }
Packit 577717
Packit 577717
     free(eventValueArray);
Packit 577717
     free(eventIdArray);
Packit 577717
 }
Packit 577717
Packit 577717
   // Print metric value, we format based on the value kind
Packit 577717
 int printMetricValue(CUpti_MetricID metricId, CUpti_MetricValue metricValue, const char *metricName, uint64_t timeDuration) {
Packit 577717
Packit 577717
     CUpti_MetricValueKind valueKind;
Packit 577717
     char str[64];
Packit 577717
     size_t valueKindSize = sizeof(valueKind);
Packit 577717
Packit 577717
     CUPTI_CALL(cuptiMetricGetAttribute(metricId, CUPTI_METRIC_ATTR_VALUE_KIND,
Packit 577717
                                        &valueKindSize, &valueKind));
Packit 577717
     switch (valueKind) {
Packit 577717
Packit 577717
     case CUPTI_METRIC_VALUE_KIND_DOUBLE:
Packit 577717
         printf("%s = ", metricName);
Packit 577717
         calculateSize(str, (uint64_t)metricValue.metricValueDouble);
Packit 577717
         // printf("%s   (val %lu  %lu nsec)\n", str, metricValue.metricValueUint64, timeDuration);
Packit 577717
         printf("%s\n", str);
Packit 577717
         break;
Packit 577717
Packit 577717
     case CUPTI_METRIC_VALUE_KIND_UINT64:
Packit 577717
         printf("%s = ", metricName);
Packit 577717
         calculateSize(str, (uint64_t)metricValue.metricValueUint64);
Packit 577717
         printf("%s\n", str);
Packit 577717
         break;
Packit 577717
Packit 577717
     case CUPTI_METRIC_VALUE_KIND_INT64:
Packit 577717
         printf("%s = ", metricName);
Packit 577717
         calculateSize(str, (uint64_t)metricValue.metricValueInt64);
Packit 577717
         printf("%s\n", str);
Packit 577717
         break;
Packit 577717
Packit 577717
     case CUPTI_METRIC_VALUE_KIND_THROUGHPUT:
Packit 577717
         printf("%s = ", metricName);
Packit 577717
         calculateSize(str, (uint64_t)metricValue.metricValueThroughput);
Packit 577717
         printf("%s\n", str);
Packit 577717
         break;
Packit 577717
Packit 577717
     default:
Packit 577717
         fprintf(stderr, "error: unknown value kind\n");
Packit 577717
         return -1;
Packit 577717
     }
Packit 577717
     return 0;
Packit 577717
   }
Packit 577717
Packit 577717
 void testCpuToGpu(CUpti_EventGroup *eventGroup, CUdeviceptr *pDevBuffer,
Packit 577717
                     float** pHostBuffer, size_t bufferSize,
Packit 577717
                     cudaStream_t *cudaStreams,
Packit 577717
                     uint64_t *timeDuration, int numEventGroup)
Packit 577717
 {
Packit 577717
     int i;
Packit 577717
     uint32_t value = 1;
Packit 577717
     uint64_t startTimestamp, endTimestamp;
Packit 577717
Packit 577717
     for (i = 0; i < numEventGroup; i++) {
Packit 577717
             CUPTI_CALL(cuptiEventGroupEnable(eventGroup[i]));
Packit 577717
             CUPTI_CALL(cuptiEventGroupSetAttribute(eventGroup[i],
Packit 577717
                                 CUPTI_EVENT_GROUP_ATTR_PROFILE_ALL_DOMAIN_INSTANCES,
Packit 577717
                                 sizeof(uint32_t), (void*)&value));
Packit 577717
     }
Packit 577717
Packit 577717
     CUPTI_CALL(cuptiGetTimestamp(&startTimestamp));
Packit 577717
Packit 577717
     //Unidirectional copy H2D
Packit 577717
     for (i = 0; i < NUM_STREAMS; i++)
Packit 577717
     {
Packit 577717
         RUNTIME_API_CALL(cudaMemcpyAsync((void *)pDevBuffer[i], pHostBuffer[i], bufferSize, cudaMemcpyHostToDevice, cudaStreams[i]));
Packit 577717
     }
Packit 577717
     RUNTIME_API_CALL(cudaDeviceSynchronize());
Packit 577717
Packit 577717
     //Unidirectional copy D2H
Packit 577717
     for (i = 0; i < NUM_STREAMS; i++)
Packit 577717
     {
Packit 577717
         RUNTIME_API_CALL(cudaMemcpyAsync(pHostBuffer[i], (void *)pDevBuffer[i], bufferSize, cudaMemcpyDeviceToHost, cudaStreams[i]));}
Packit 577717
Packit 577717
     RUNTIME_API_CALL(cudaDeviceSynchronize());
Packit 577717
Packit 577717
     //Bidirectional copy
Packit 577717
     for (i = 0; i < NUM_STREAMS; i+=2)
Packit 577717
     {
Packit 577717
         RUNTIME_API_CALL(cudaMemcpyAsync((void *)pDevBuffer[i], pHostBuffer[i], bufferSize, cudaMemcpyHostToDevice, cudaStreams[i]));
Packit 577717
         RUNTIME_API_CALL(cudaMemcpyAsync(pHostBuffer[i+1], (void *)pDevBuffer[i+1], bufferSize, cudaMemcpyDeviceToHost, cudaStreams[i+1]));
Packit 577717
     }
Packit 577717
     RUNTIME_API_CALL(cudaDeviceSynchronize());
Packit 577717
Packit 577717
     CUPTI_CALL(cuptiGetTimestamp(&endTimestamp));
Packit 577717
     *timeDuration = endTimestamp - startTimestamp;
Packit 577717
 }
Packit 577717
Packit 577717
 void testGpuToGpu(CUpti_EventGroup *eventGroup, CUdeviceptr *pDevBuffer0, CUdeviceptr *pDevBuffer1,
Packit 577717
                     float** pHostBuffer, size_t bufferSize,
Packit 577717
                     cudaStream_t *cudaStreams,
Packit 577717
                     uint64_t *timeDuration, int numEventGroup)
Packit 577717
 {
Packit 577717
     int i;
Packit 577717
     uint32_t value = 1;
Packit 577717
     uint64_t startTimestamp, endTimestamp;
Packit 577717
Packit 577717
Packit 577717
     RUNTIME_API_CALL(cudaSetDevice(0));
Packit 577717
     RUNTIME_API_CALL(cudaDeviceEnablePeerAccess(1, 0));
Packit 577717
     RUNTIME_API_CALL(cudaSetDevice(1));
Packit 577717
     RUNTIME_API_CALL(cudaDeviceEnablePeerAccess(0, 0));
Packit 577717
Packit 577717
     //Unidirectional copy H2D
Packit 577717
     for (i = 0; i < NUM_STREAMS; i++) {
Packit 577717
         RUNTIME_API_CALL(cudaMemcpyAsync((void *)pDevBuffer0[i], pHostBuffer[i], bufferSize, cudaMemcpyHostToDevice, cudaStreams[i]));
Packit 577717
     }
Packit 577717
     RUNTIME_API_CALL(cudaDeviceSynchronize());
Packit 577717
Packit 577717
     for (i = 0; i < NUM_STREAMS; i++) {
Packit 577717
         RUNTIME_API_CALL(cudaMemcpyAsync((void *)pDevBuffer1[i], pHostBuffer[i], bufferSize, cudaMemcpyHostToDevice, cudaStreams[i]));
Packit 577717
     }
Packit 577717
     RUNTIME_API_CALL(cudaDeviceSynchronize());
Packit 577717
Packit 577717
Packit 577717
     for (i = 0; i < numEventGroup; i++) {
Packit 577717
         printf("cuptiEventGroupEnable(eventGroup[%d])\n", i);
Packit 577717
         CUPTI_CALL(cuptiEventGroupEnable(eventGroup[i]));
Packit 577717
         CUPTI_CALL(cuptiEventGroupSetAttribute(eventGroup[i],
Packit 577717
                             CUPTI_EVENT_GROUP_ATTR_PROFILE_ALL_DOMAIN_INSTANCES,
Packit 577717
                             sizeof(uint32_t), (void*)&value));
Packit 577717
     }
Packit 577717
     CUPTI_CALL(cuptiGetTimestamp(&startTimestamp));
Packit 577717
Packit 577717
     for (i = 0; i < NUM_STREAMS; i++) {
Packit 577717
         RUNTIME_API_CALL(cudaMemcpyAsync((void *)pDevBuffer0[i], (void *)pDevBuffer1[i], bufferSize, cudaMemcpyDeviceToDevice, cudaStreams[i]));
Packit 577717
     }
Packit 577717
     RUNTIME_API_CALL(cudaDeviceSynchronize());
Packit 577717
Packit 577717
     for (i = 0; i < NUM_STREAMS; i++) {
Packit 577717
         RUNTIME_API_CALL(cudaMemcpyAsync((void *)pDevBuffer1[i], (void *)pDevBuffer0[i], bufferSize, cudaMemcpyDeviceToDevice, cudaStreams[i]));
Packit 577717
     }
Packit 577717
     RUNTIME_API_CALL(cudaDeviceSynchronize());
Packit 577717
Packit 577717
     for (i = 0; i < NUM_STREAMS; i++) {
Packit 577717
         test_nvlink_bandwidth<<<GRID_SIZE, BLOCK_SIZE>>>((float*)pDevBuffer1[i], (float*)pDevBuffer0[i]);
Packit 577717
     }
Packit 577717
Packit 577717
     CUPTI_CALL(cuptiGetTimestamp(&endTimestamp));
Packit 577717
     *timeDuration = endTimestamp - startTimestamp;
Packit 577717
 }
Packit 577717
Packit 577717
 static void printUsage() {
Packit 577717
     printf("usage: Demonstrate use of NVlink CUPTI APIs\n");
Packit 577717
     printf("       -help           : display help message\n");
Packit 577717
     printf("       --cpu-to-gpu    : Show results for data transfer between CPU and GPU \n");
Packit 577717
     printf("       --gpu-to-gpu    : Show results for data transfer between two GPUs \n");
Packit 577717
 }
Packit 577717
Packit 577717
 void parseCommandLineArgs(int argc, char *argv[])
Packit 577717
 {
Packit 577717
     if (argc != 2) {
Packit 577717
         printf("Invalid number of options\n");
Packit 577717
         exit(0);
Packit 577717
     }
Packit 577717
Packit 577717
     if (strcmp(argv[1], "--cpu-to-gpu") == 0) {
Packit 577717
         cpuToGpu = 1;
Packit 577717
     }
Packit 577717
     else if (strcmp(argv[1], "--gpu-to-gpu") == 0) {
Packit 577717
         gpuToGpu = 1;
Packit 577717
     }
Packit 577717
     else if ((strcmp(argv[1], "--help") == 0) ||
Packit 577717
              (strcmp(argv[1], "-help") == 0) ||
Packit 577717
              (strcmp(argv[1], "-h") == 0)) {
Packit 577717
         printUsage();
Packit 577717
         exit(0);
Packit 577717
     }
Packit 577717
     else {
Packit 577717
         cpuToGpu = 1;
Packit 577717
     }
Packit 577717
 }
Packit 577717
Packit 577717
 int main(int argc, char *argv[])
Packit 577717
 {
Packit 577717
     int deviceCount = 0, i = 0, j = 0, numEventGroup = 0;
Packit 577717
     size_t bufferSize = 0, freeMemory = 0, totalMemory = 0;
Packit 577717
     CUpti_EventGroupSets *passes = NULL;
Packit 577717
     CUcontext ctx;
Packit 577717
     char str[64];
Packit 577717
Packit 577717
     CUdeviceptr pDevBuffer0[NUM_STREAMS];
Packit 577717
     CUdeviceptr pDevBuffer1[NUM_STREAMS];
Packit 577717
     float* pHostBuffer[NUM_STREAMS];
Packit 577717
Packit 577717
     cudaStream_t cudaStreams[NUM_STREAMS] = {0};
Packit 577717
Packit 577717
     CUpti_EventGroup eventGroup[32];
Packit 577717
     CUpti_MetricID metricId[NUM_METRIC];
Packit 577717
     uint32_t numEvents[NUM_METRIC];
Packit 577717
     CUpti_MetricValue metricValue[NUM_METRIC];
Packit 577717
     cudaDeviceProp prop[MAX_DEVICES];
Packit 577717
     uint64_t timeDuration;
Packit 577717
Packit 577717
     // Adding nvlink Metrics.
Packit 577717
     const char *metricName[NUM_METRIC] = {"nvlink_total_data_transmitted",
Packit 577717
                                     "nvlink_total_data_received",
Packit 577717
                                     "nvlink_transmit_throughput",
Packit 577717
                                     "nvlink_receive_throughput"};
Packit 577717
Packit 577717
     // Parse command line arguments
Packit 577717
     parseCommandLineArgs(argc, argv);
Packit 577717
Packit 577717
     CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_NVLINK));
Packit 577717
     CUPTI_CALL(cuptiActivityRegisterCallbacks(bufferRequested, bufferCompleted));
Packit 577717
Packit 577717
     DRIVER_API_CALL(cuInit(0));
Packit 577717
Packit 577717
     RUNTIME_API_CALL(cudaGetDeviceCount(&deviceCount));
Packit 577717
Packit 577717
     printf("There are %d devices.\n", deviceCount);
Packit 577717
Packit 577717
     if (deviceCount == 0) {
Packit 577717
         printf("There is no device supporting CUDA.\n");
Packit 577717
         exit(-1);
Packit 577717
     }
Packit 577717
Packit 577717
     for (i = 0; i < deviceCount; i++) {
Packit 577717
        RUNTIME_API_CALL(cudaGetDeviceProperties(&prop[i], i));
Packit 577717
        printf("CUDA Device %d Name: %s\n", i, prop[i].name);
Packit 577717
    }
Packit 577717
Packit 577717
    // Set memcpy size based on available device memory
Packit 577717
    RUNTIME_API_CALL(cudaMemGetInfo(&freeMemory, &totalMemory));
Packit 577717
    bufferSize = MAX_SIZE < (freeMemory/4) ? MAX_SIZE : (freeMemory/4);
Packit 577717
Packit 577717
    printf("Total Device Memory available : ");
Packit 577717
    calculateSize(str, (uint64_t)totalMemory);
Packit 577717
    printf("%s\n", str);
Packit 577717
Packit 577717
    printf("Memcpy size is set to %llu B (%llu MB)\n",
Packit 577717
    (unsigned long long)bufferSize, (unsigned long long)bufferSize/(1024*1024));
Packit 577717
Packit 577717
    for(i = 0; i < NUM_STREAMS; i++) {
Packit 577717
       RUNTIME_API_CALL(cudaStreamCreate(&cudaStreams[i]));
Packit 577717
    }
Packit 577717
    RUNTIME_API_CALL(cudaDeviceSynchronize());
Packit 577717
Packit 577717
    // Nvlink-topology Records are generated even before cudaMemcpy API is called.
Packit 577717
    CUPTI_CALL(cuptiActivityFlushAll(0));
Packit 577717
Packit 577717
    // Transfer Data between Host And Device, if Nvlink is Present
Packit 577717
    // Check condition : nvlinkRec->flag & CUPTI_LINK_FLAG_SYSMEM_ACCESS
Packit 577717
    // True : Nvlink is present between CPU & GPU
Packit 577717
    // False : Nvlink is not present.
Packit 577717
    if ((nvlinkRec) && (((cpuToGpu) && (cpuToGpuAccess)) || ((gpuToGpu) && (gpuToGpuAccess)))) {
Packit 577717
Packit 577717
        for (i = 0; i < NUM_METRIC; i++) {
Packit 577717
            CUPTI_CALL(cuptiMetricGetIdFromName(0, metricName[i], &metricId[i]));
Packit 577717
            CUPTI_CALL(cuptiMetricGetNumEvents(metricId[i], &numEvents[i]));
Packit 577717
        }
Packit 577717
Packit 577717
        DRIVER_API_CALL(cuCtxCreate(&ctx, 0, 0));
Packit 577717
Packit 577717
        CUPTI_CALL(cuptiMetricCreateEventGroupSets(ctx, (sizeof metricId) ,metricId, &passes));
Packit 577717
Packit 577717
        // EventGroups required to profile Nvlink metrics.
Packit 577717
        for (i = 0; i < (signed)passes->numSets; i++) {
Packit 577717
            for (j = 0; j < (signed)passes->sets[i].numEventGroups; j++) {
Packit 577717
                eventGroup[numEventGroup] = passes->sets[i].eventGroups[j];
Packit 577717
Packit 577717
                if (!eventGroup[numEventGroup]) {
Packit 577717
                    printf("\n eventGroup initialization failed \n");
Packit 577717
                    exit(-1);
Packit 577717
                }
Packit 577717
Packit 577717
                numEventGroup++;
Packit 577717
            }
Packit 577717
        }
Packit 577717
Packit 577717
        CUPTI_CALL(cuptiSetEventCollectionMode(ctx, CUPTI_EVENT_COLLECTION_MODE_CONTINUOUS));
Packit 577717
Packit 577717
        // ===== Allocate Memory =====================================
Packit 577717
Packit 577717
        for(i = 0; i < NUM_STREAMS; i++) {
Packit 577717
            RUNTIME_API_CALL(cudaMalloc((void**)&pDevBuffer0[i], bufferSize));
Packit 577717
Packit 577717
            pHostBuffer[i] = (float *)malloc(bufferSize);
Packit 577717
            MEMORY_ALLOCATION_CALL(pHostBuffer[i]);
Packit 577717
        }
Packit 577717
Packit 577717
        if (cpuToGpu) {
Packit 577717
            testCpuToGpu(eventGroup, pDevBuffer0, pHostBuffer, bufferSize, cudaStreams, &timeDuration, numEventGroup);
Packit 577717
            printf("Data tranferred between CPU & Device%d : \n", (int)nvlinkRec->typeDev0);
Packit 577717
        }
Packit 577717
        else if(gpuToGpu) {
Packit 577717
            RUNTIME_API_CALL(cudaSetDevice(1));
Packit 577717
            for(i = 0; i < NUM_STREAMS; i++) {
Packit 577717
                RUNTIME_API_CALL(cudaMalloc((void**)&pDevBuffer1[i], bufferSize));
Packit 577717
            }
Packit 577717
            testGpuToGpu(eventGroup, pDevBuffer0, pDevBuffer1,pHostBuffer, bufferSize, cudaStreams, &timeDuration, numEventGroup);
Packit 577717
            printf("Data tranferred between Device 0 & Device 1 : \n");
Packit 577717
        }
Packit 577717
Packit 577717
        // Collect Nvlink Metric values for the data transfer via Nvlink for all the eventGroups.
Packit 577717
        for (i = 0; i < numEventGroup; i++) {
Packit 577717
            readMetricValue(eventGroup[i], NUM_EVENTS, 0, metricId, timeDuration, metricValue);
Packit 577717
Packit 577717
            CUPTI_CALL(cuptiEventGroupDisable(eventGroup[i]));
Packit 577717
            CUPTI_CALL(cuptiEventGroupDestroy(eventGroup[i]));
Packit 577717
Packit 577717
            for (i = 0; i < NUM_METRIC; i++) {
Packit 577717
                if (printMetricValue(metricId[i], metricValue[i], metricName[i], timeDuration) != 0) {
Packit 577717
                    printf("\n printMetricValue failed \n");
Packit 577717
                    exit(-1);
Packit 577717
                }
Packit 577717
            }
Packit 577717
        }
Packit 577717
    }
Packit 577717
    else {
Packit 577717
        printf("No Nvlink supported device found\n");
Packit 577717
    }
Packit 577717
Packit 577717
    return 0;
Packit 577717
}