|
Packit |
577717 |
/*
|
|
Packit |
577717 |
* Copyright 2015-2016 NVIDIA Corporation. All rights reserved.
|
|
Packit |
577717 |
*
|
|
Packit |
577717 |
* Sample to demonstrate use of NVlink CUPTI APIs
|
|
Packit |
577717 |
*/
|
|
Packit |
577717 |
|
|
Packit |
577717 |
#include <stdio.h>
|
|
Packit |
577717 |
#include <stdlib.h>
|
|
Packit |
577717 |
#include <string.h>
|
|
Packit |
577717 |
#include <cuda.h>
|
|
Packit |
577717 |
#include <cupti.h>
|
|
Packit |
577717 |
|
|
Packit |
577717 |
#define CUPTI_CALL(call) \
|
|
Packit |
577717 |
do { \
|
|
Packit |
577717 |
CUptiResult _status = call; \
|
|
Packit |
577717 |
if (_status != CUPTI_SUCCESS) { \
|
|
Packit |
577717 |
const char *errstr; \
|
|
Packit |
577717 |
cuptiGetResultString(_status, &errstr); \
|
|
Packit |
577717 |
fprintf(stderr, "%s:%d: error: function %s failed with error %s.\n", \
|
|
Packit |
577717 |
__FILE__, __LINE__, #call, errstr); \
|
|
Packit |
577717 |
exit(-1); \
|
|
Packit |
577717 |
} \
|
|
Packit |
577717 |
} while (0)
|
|
Packit |
577717 |
|
|
Packit |
577717 |
#define DRIVER_API_CALL(apiFuncCall) \
|
|
Packit |
577717 |
do { \
|
|
Packit |
577717 |
CUresult _status = apiFuncCall; \
|
|
Packit |
577717 |
if (_status != CUDA_SUCCESS) { \
|
|
Packit |
577717 |
fprintf(stderr, "%s:%d: error: function %s failed with error %d.\n", \
|
|
Packit |
577717 |
__FILE__, __LINE__, #apiFuncCall, _status); \
|
|
Packit |
577717 |
exit(-1); \
|
|
Packit |
577717 |
} \
|
|
Packit |
577717 |
} while (0)
|
|
Packit |
577717 |
|
|
Packit |
577717 |
#define RUNTIME_API_CALL(apiFuncCall) \
|
|
Packit |
577717 |
do { \
|
|
Packit |
577717 |
cudaError_t _status = apiFuncCall; \
|
|
Packit |
577717 |
if (_status != cudaSuccess) { \
|
|
Packit |
577717 |
fprintf(stderr, "%s:%d: error: function %s failed with error %s.\n", \
|
|
Packit |
577717 |
__FILE__, __LINE__, #apiFuncCall, cudaGetErrorString(_status));\
|
|
Packit |
577717 |
exit(-1); \
|
|
Packit |
577717 |
} \
|
|
Packit |
577717 |
} while (0)
|
|
Packit |
577717 |
|
|
Packit |
577717 |
#define MEMORY_ALLOCATION_CALL(var) \
|
|
Packit |
577717 |
do { \
|
|
Packit |
577717 |
if (var == NULL) { \
|
|
Packit |
577717 |
fprintf(stderr, "%s:%d: Error: Memory Allocation Failed \n", \
|
|
Packit |
577717 |
__FILE__, __LINE__); \
|
|
Packit |
577717 |
exit(-1); \
|
|
Packit |
577717 |
} \
|
|
Packit |
577717 |
} while (0)
|
|
Packit |
577717 |
|
|
Packit |
577717 |
#define MAX_DEVICES (32)
|
|
Packit |
577717 |
#define BLOCK_SIZE (1024)
|
|
Packit |
577717 |
#define GRID_SIZE (512)
|
|
Packit |
577717 |
#define BUF_SIZE (32 * 1024)
|
|
Packit |
577717 |
#define ALIGN_SIZE (8)
|
|
Packit |
577717 |
#define SUCCESS (0)
|
|
Packit |
577717 |
#define NUM_METRIC (4)
|
|
Packit |
577717 |
#define NUM_EVENTS (2)
|
|
Packit |
577717 |
#define MAX_SIZE (64*1024*1024) // 64 MB
|
|
Packit |
577717 |
#define NUM_STREAMS (6) // gp100 has 6 physical copy engines
|
|
Packit |
577717 |
|
|
Packit |
577717 |
CUpti_ActivityNvLink *nvlinkRec = NULL;
|
|
Packit |
577717 |
int cpuToGpu = 0;
|
|
Packit |
577717 |
int gpuToGpu = 0;
|
|
Packit |
577717 |
int cpuToGpuAccess = 0;
|
|
Packit |
577717 |
int gpuToGpuAccess = 0;
|
|
Packit |
577717 |
|
|
Packit |
577717 |
extern "C" __global__ void test_nvlink_bandwidth(float *src, float *dst)
|
|
Packit |
577717 |
{
|
|
Packit |
577717 |
int idx = blockIdx.x * blockDim.x + threadIdx.x;
|
|
Packit |
577717 |
dst[idx] = src[idx] * 2.0f;
|
|
Packit |
577717 |
}
|
|
Packit |
577717 |
|
|
Packit |
577717 |
static void printActivity(CUpti_Activity *record)
|
|
Packit |
577717 |
{
|
|
Packit |
577717 |
if (record->kind == CUPTI_ACTIVITY_KIND_NVLINK) {
|
|
Packit |
577717 |
nvlinkRec = (CUpti_ActivityNvLink *)record;
|
|
Packit |
577717 |
|
|
Packit |
577717 |
printf("typeDev0 %d, typeDev1 %d, sysmem %d, peer %d, physical links %d, portdev0 %d, %d, %d, %d, portDev1 %d, %d, %d, %d, bandwidth %llu\n",
|
|
Packit |
577717 |
nvlinkRec->typeDev0,
|
|
Packit |
577717 |
nvlinkRec->typeDev1,
|
|
Packit |
577717 |
((nvlinkRec->flag & CUPTI_LINK_FLAG_SYSMEM_ACCESS) ? 1 : 0),
|
|
Packit |
577717 |
((nvlinkRec->flag & CUPTI_LINK_FLAG_PEER_ACCESS) ? 1 : 0),
|
|
Packit |
577717 |
nvlinkRec->physicalNvLinkCount,
|
|
Packit |
577717 |
nvlinkRec->portDev0[0], nvlinkRec->portDev0[1], nvlinkRec->portDev0[2], nvlinkRec->portDev0[3],
|
|
Packit |
577717 |
nvlinkRec->portDev1[0], nvlinkRec->portDev1[1], nvlinkRec->portDev1[2], nvlinkRec->portDev1[3],
|
|
Packit |
577717 |
(long long unsigned int)nvlinkRec->bandwidth);
|
|
Packit |
577717 |
cpuToGpuAccess |= (nvlinkRec->flag & CUPTI_LINK_FLAG_SYSMEM_ACCESS);
|
|
Packit |
577717 |
gpuToGpuAccess |= (nvlinkRec->flag & CUPTI_LINK_FLAG_PEER_ACCESS);
|
|
Packit |
577717 |
}
|
|
Packit |
577717 |
else {
|
|
Packit |
577717 |
printf("Error : Unexpected CUPTI activity kind.\nExpected Activity kind : CUPTI_ACTIVITY_KIND_NVLINK\n");
|
|
Packit |
577717 |
}
|
|
Packit |
577717 |
}
|
|
Packit |
577717 |
|
|
Packit |
577717 |
static void CUPTIAPI bufferRequested(uint8_t **buffer, size_t *size, size_t *maxNumRecords)
|
|
Packit |
577717 |
|
|
Packit |
577717 |
{
|
|
Packit |
577717 |
*size = BUF_SIZE + ALIGN_SIZE;
|
|
Packit |
577717 |
*buffer = (uint8_t*) calloc(1, *size);
|
|
Packit |
577717 |
MEMORY_ALLOCATION_CALL(*buffer);
|
|
Packit |
577717 |
*maxNumRecords = 0;
|
|
Packit |
577717 |
}
|
|
Packit |
577717 |
|
|
Packit |
577717 |
static void CUPTIAPI bufferCompleted(CUcontext ctx, uint32_t streamId,
|
|
Packit |
577717 |
uint8_t *buffer, size_t size,
|
|
Packit |
577717 |
size_t validSize)
|
|
Packit |
577717 |
{
|
|
Packit |
577717 |
CUptiResult status;
|
|
Packit |
577717 |
CUpti_Activity *record = NULL;
|
|
Packit |
577717 |
do {
|
|
Packit |
577717 |
status = cuptiActivityGetNextRecord(buffer, validSize, &record);
|
|
Packit |
577717 |
if(status == CUPTI_SUCCESS) {
|
|
Packit |
577717 |
printActivity(record);
|
|
Packit |
577717 |
}
|
|
Packit |
577717 |
else if (status == CUPTI_ERROR_MAX_LIMIT_REACHED) {
|
|
Packit |
577717 |
break;
|
|
Packit |
577717 |
}
|
|
Packit |
577717 |
else {
|
|
Packit |
577717 |
CUPTI_CALL(status);
|
|
Packit |
577717 |
}
|
|
Packit |
577717 |
} while (1);
|
|
Packit |
577717 |
|
|
Packit |
577717 |
size_t dropped;
|
|
Packit |
577717 |
CUPTI_CALL(cuptiActivityGetNumDroppedRecords(ctx, streamId, &dropped));
|
|
Packit |
577717 |
if (dropped != 0) {
|
|
Packit |
577717 |
printf("Dropped %u activity records\n", (unsigned int)dropped);
|
|
Packit |
577717 |
}
|
|
Packit |
577717 |
}
|
|
Packit |
577717 |
|
|
Packit |
577717 |
#define DIM(x) (sizeof(x)/sizeof(*(x)))
|
|
Packit |
577717 |
|
|
Packit |
577717 |
void calculateSize(char *result, uint64_t size)
|
|
Packit |
577717 |
{
|
|
Packit |
577717 |
int i;
|
|
Packit |
577717 |
|
|
Packit |
577717 |
const char *sizes[] = { "TB", "GB", "MB", "KB", "B" };
|
|
Packit |
577717 |
uint64_t exbibytes = 1024ULL * 1024ULL * 1024ULL * 1024ULL;
|
|
Packit |
577717 |
|
|
Packit |
577717 |
uint64_t multiplier = exbibytes;
|
|
Packit |
577717 |
|
|
Packit |
577717 |
for (i = 0; (unsigned)i < DIM(sizes); i++, multiplier /= (uint64_t)1024)
|
|
Packit |
577717 |
{
|
|
Packit |
577717 |
if (size < multiplier)
|
|
Packit |
577717 |
continue;
|
|
Packit |
577717 |
sprintf(result, "%.1f %s", (float) size / multiplier, sizes[i]);
|
|
Packit |
577717 |
return;
|
|
Packit |
577717 |
}
|
|
Packit |
577717 |
strcpy(result, "0");
|
|
Packit |
577717 |
return;
|
|
Packit |
577717 |
}
|
|
Packit |
577717 |
|
|
Packit |
577717 |
void readMetricValue(CUpti_EventGroup eventGroup, uint32_t numEvents,
|
|
Packit |
577717 |
CUdevice dev, CUpti_MetricID *metricId,
|
|
Packit |
577717 |
uint64_t timeDuration,
|
|
Packit |
577717 |
CUpti_MetricValue *metricValue) {
|
|
Packit |
577717 |
|
|
Packit |
577717 |
size_t bufferSizeBytes, numCountersRead;
|
|
Packit |
577717 |
uint64_t *eventValueArray = NULL;
|
|
Packit |
577717 |
CUpti_EventID *eventIdArray;
|
|
Packit |
577717 |
size_t arraySizeBytes = 0;
|
|
Packit |
577717 |
size_t numTotalInstancesSize = 0;
|
|
Packit |
577717 |
uint64_t numTotalInstances = 0;
|
|
Packit |
577717 |
uint64_t *aggrEventValueArray = NULL;
|
|
Packit |
577717 |
size_t aggrEventValueArraySize;
|
|
Packit |
577717 |
uint32_t i = 0, j = 0;
|
|
Packit |
577717 |
CUpti_EventDomainID domainId;
|
|
Packit |
577717 |
size_t domainSize;
|
|
Packit |
577717 |
|
|
Packit |
577717 |
domainSize = sizeof(CUpti_EventDomainID);
|
|
Packit |
577717 |
|
|
Packit |
577717 |
CUPTI_CALL(cuptiEventGroupGetAttribute(eventGroup,
|
|
Packit |
577717 |
CUPTI_EVENT_GROUP_ATTR_EVENT_DOMAIN_ID,
|
|
Packit |
577717 |
&domainSize,
|
|
Packit |
577717 |
(void *)&domainId));
|
|
Packit |
577717 |
|
|
Packit |
577717 |
numTotalInstancesSize = sizeof(uint64_t);
|
|
Packit |
577717 |
|
|
Packit |
577717 |
CUPTI_CALL(cuptiDeviceGetEventDomainAttribute(dev,
|
|
Packit |
577717 |
domainId,
|
|
Packit |
577717 |
CUPTI_EVENT_DOMAIN_ATTR_TOTAL_INSTANCE_COUNT,
|
|
Packit |
577717 |
&numTotalInstancesSize,
|
|
Packit |
577717 |
(void *)&numTotalInstances));
|
|
Packit |
577717 |
|
|
Packit |
577717 |
arraySizeBytes = sizeof(CUpti_EventID) * numEvents;
|
|
Packit |
577717 |
bufferSizeBytes = sizeof(uint64_t) * numEvents * numTotalInstances;
|
|
Packit |
577717 |
|
|
Packit |
577717 |
eventValueArray = (uint64_t *) malloc(bufferSizeBytes);
|
|
Packit |
577717 |
MEMORY_ALLOCATION_CALL(eventValueArray);
|
|
Packit |
577717 |
|
|
Packit |
577717 |
eventIdArray = (CUpti_EventID *) malloc(arraySizeBytes);
|
|
Packit |
577717 |
MEMORY_ALLOCATION_CALL(eventIdArray);
|
|
Packit |
577717 |
|
|
Packit |
577717 |
aggrEventValueArray = (uint64_t *) calloc(numEvents, sizeof(uint64_t));
|
|
Packit |
577717 |
MEMORY_ALLOCATION_CALL(aggrEventValueArray);
|
|
Packit |
577717 |
|
|
Packit |
577717 |
aggrEventValueArraySize = sizeof(uint64_t) * numEvents;
|
|
Packit |
577717 |
|
|
Packit |
577717 |
CUPTI_CALL(cuptiEventGroupReadAllEvents(eventGroup,
|
|
Packit |
577717 |
CUPTI_EVENT_READ_FLAG_NONE,
|
|
Packit |
577717 |
&bufferSizeBytes,
|
|
Packit |
577717 |
eventValueArray,
|
|
Packit |
577717 |
&arraySizeBytes,
|
|
Packit |
577717 |
eventIdArray,
|
|
Packit |
577717 |
&numCountersRead));
|
|
Packit |
577717 |
|
|
Packit |
577717 |
for (i = 0; i < numEvents; i++) {
|
|
Packit |
577717 |
for (j = 0; j < numTotalInstances; j++) {
|
|
Packit |
577717 |
aggrEventValueArray[i] += eventValueArray[i + numEvents * j];
|
|
Packit |
577717 |
//printf("For event %d (id %d) instance %d value %ul aggregate %d = %ul\n", i, eventIdArray[i], j, eventValueArray[i + numEvents * j], i, aggrEventValueArray[i]);
|
|
Packit |
577717 |
}
|
|
Packit |
577717 |
}
|
|
Packit |
577717 |
|
|
Packit |
577717 |
for (i = 0; i < NUM_METRIC; i++) {
|
|
Packit |
577717 |
CUPTI_CALL(cuptiMetricGetValue(dev, metricId[i], arraySizeBytes,
|
|
Packit |
577717 |
eventIdArray, aggrEventValueArraySize,
|
|
Packit |
577717 |
aggrEventValueArray, timeDuration,
|
|
Packit |
577717 |
&metricValue[i]));
|
|
Packit |
577717 |
}
|
|
Packit |
577717 |
|
|
Packit |
577717 |
free(eventValueArray);
|
|
Packit |
577717 |
free(eventIdArray);
|
|
Packit |
577717 |
}
|
|
Packit |
577717 |
|
|
Packit |
577717 |
// Print metric value, we format based on the value kind
|
|
Packit |
577717 |
int printMetricValue(CUpti_MetricID metricId, CUpti_MetricValue metricValue, const char *metricName, uint64_t timeDuration) {
|
|
Packit |
577717 |
|
|
Packit |
577717 |
CUpti_MetricValueKind valueKind;
|
|
Packit |
577717 |
char str[64];
|
|
Packit |
577717 |
size_t valueKindSize = sizeof(valueKind);
|
|
Packit |
577717 |
|
|
Packit |
577717 |
CUPTI_CALL(cuptiMetricGetAttribute(metricId, CUPTI_METRIC_ATTR_VALUE_KIND,
|
|
Packit |
577717 |
&valueKindSize, &valueKind));
|
|
Packit |
577717 |
switch (valueKind) {
|
|
Packit |
577717 |
|
|
Packit |
577717 |
case CUPTI_METRIC_VALUE_KIND_DOUBLE:
|
|
Packit |
577717 |
printf("%s = ", metricName);
|
|
Packit |
577717 |
calculateSize(str, (uint64_t)metricValue.metricValueDouble);
|
|
Packit |
577717 |
// printf("%s (val %lu %lu nsec)\n", str, metricValue.metricValueUint64, timeDuration);
|
|
Packit |
577717 |
printf("%s\n", str);
|
|
Packit |
577717 |
break;
|
|
Packit |
577717 |
|
|
Packit |
577717 |
case CUPTI_METRIC_VALUE_KIND_UINT64:
|
|
Packit |
577717 |
printf("%s = ", metricName);
|
|
Packit |
577717 |
calculateSize(str, (uint64_t)metricValue.metricValueUint64);
|
|
Packit |
577717 |
printf("%s\n", str);
|
|
Packit |
577717 |
break;
|
|
Packit |
577717 |
|
|
Packit |
577717 |
case CUPTI_METRIC_VALUE_KIND_INT64:
|
|
Packit |
577717 |
printf("%s = ", metricName);
|
|
Packit |
577717 |
calculateSize(str, (uint64_t)metricValue.metricValueInt64);
|
|
Packit |
577717 |
printf("%s\n", str);
|
|
Packit |
577717 |
break;
|
|
Packit |
577717 |
|
|
Packit |
577717 |
case CUPTI_METRIC_VALUE_KIND_THROUGHPUT:
|
|
Packit |
577717 |
printf("%s = ", metricName);
|
|
Packit |
577717 |
calculateSize(str, (uint64_t)metricValue.metricValueThroughput);
|
|
Packit |
577717 |
printf("%s\n", str);
|
|
Packit |
577717 |
break;
|
|
Packit |
577717 |
|
|
Packit |
577717 |
default:
|
|
Packit |
577717 |
fprintf(stderr, "error: unknown value kind\n");
|
|
Packit |
577717 |
return -1;
|
|
Packit |
577717 |
}
|
|
Packit |
577717 |
return 0;
|
|
Packit |
577717 |
}
|
|
Packit |
577717 |
|
|
Packit |
577717 |
void testCpuToGpu(CUpti_EventGroup *eventGroup, CUdeviceptr *pDevBuffer,
|
|
Packit |
577717 |
float** pHostBuffer, size_t bufferSize,
|
|
Packit |
577717 |
cudaStream_t *cudaStreams,
|
|
Packit |
577717 |
uint64_t *timeDuration, int numEventGroup)
|
|
Packit |
577717 |
{
|
|
Packit |
577717 |
int i;
|
|
Packit |
577717 |
uint32_t value = 1;
|
|
Packit |
577717 |
uint64_t startTimestamp, endTimestamp;
|
|
Packit |
577717 |
|
|
Packit |
577717 |
for (i = 0; i < numEventGroup; i++) {
|
|
Packit |
577717 |
CUPTI_CALL(cuptiEventGroupEnable(eventGroup[i]));
|
|
Packit |
577717 |
CUPTI_CALL(cuptiEventGroupSetAttribute(eventGroup[i],
|
|
Packit |
577717 |
CUPTI_EVENT_GROUP_ATTR_PROFILE_ALL_DOMAIN_INSTANCES,
|
|
Packit |
577717 |
sizeof(uint32_t), (void*)&value));
|
|
Packit |
577717 |
}
|
|
Packit |
577717 |
|
|
Packit |
577717 |
CUPTI_CALL(cuptiGetTimestamp(&startTimestamp));
|
|
Packit |
577717 |
|
|
Packit |
577717 |
//Unidirectional copy H2D
|
|
Packit |
577717 |
for (i = 0; i < NUM_STREAMS; i++)
|
|
Packit |
577717 |
{
|
|
Packit |
577717 |
RUNTIME_API_CALL(cudaMemcpyAsync((void *)pDevBuffer[i], pHostBuffer[i], bufferSize, cudaMemcpyHostToDevice, cudaStreams[i]));
|
|
Packit |
577717 |
}
|
|
Packit |
577717 |
RUNTIME_API_CALL(cudaDeviceSynchronize());
|
|
Packit |
577717 |
|
|
Packit |
577717 |
//Unidirectional copy D2H
|
|
Packit |
577717 |
for (i = 0; i < NUM_STREAMS; i++)
|
|
Packit |
577717 |
{
|
|
Packit |
577717 |
RUNTIME_API_CALL(cudaMemcpyAsync(pHostBuffer[i], (void *)pDevBuffer[i], bufferSize, cudaMemcpyDeviceToHost, cudaStreams[i]));}
|
|
Packit |
577717 |
|
|
Packit |
577717 |
RUNTIME_API_CALL(cudaDeviceSynchronize());
|
|
Packit |
577717 |
|
|
Packit |
577717 |
//Bidirectional copy
|
|
Packit |
577717 |
for (i = 0; i < NUM_STREAMS; i+=2)
|
|
Packit |
577717 |
{
|
|
Packit |
577717 |
RUNTIME_API_CALL(cudaMemcpyAsync((void *)pDevBuffer[i], pHostBuffer[i], bufferSize, cudaMemcpyHostToDevice, cudaStreams[i]));
|
|
Packit |
577717 |
RUNTIME_API_CALL(cudaMemcpyAsync(pHostBuffer[i+1], (void *)pDevBuffer[i+1], bufferSize, cudaMemcpyDeviceToHost, cudaStreams[i+1]));
|
|
Packit |
577717 |
}
|
|
Packit |
577717 |
RUNTIME_API_CALL(cudaDeviceSynchronize());
|
|
Packit |
577717 |
|
|
Packit |
577717 |
CUPTI_CALL(cuptiGetTimestamp(&endTimestamp));
|
|
Packit |
577717 |
*timeDuration = endTimestamp - startTimestamp;
|
|
Packit |
577717 |
}
|
|
Packit |
577717 |
|
|
Packit |
577717 |
void testGpuToGpu(CUpti_EventGroup *eventGroup, CUdeviceptr *pDevBuffer0, CUdeviceptr *pDevBuffer1,
|
|
Packit |
577717 |
float** pHostBuffer, size_t bufferSize,
|
|
Packit |
577717 |
cudaStream_t *cudaStreams,
|
|
Packit |
577717 |
uint64_t *timeDuration, int numEventGroup)
|
|
Packit |
577717 |
{
|
|
Packit |
577717 |
int i;
|
|
Packit |
577717 |
uint32_t value = 1;
|
|
Packit |
577717 |
uint64_t startTimestamp, endTimestamp;
|
|
Packit |
577717 |
|
|
Packit |
577717 |
|
|
Packit |
577717 |
RUNTIME_API_CALL(cudaSetDevice(0));
|
|
Packit |
577717 |
RUNTIME_API_CALL(cudaDeviceEnablePeerAccess(1, 0));
|
|
Packit |
577717 |
RUNTIME_API_CALL(cudaSetDevice(1));
|
|
Packit |
577717 |
RUNTIME_API_CALL(cudaDeviceEnablePeerAccess(0, 0));
|
|
Packit |
577717 |
|
|
Packit |
577717 |
//Unidirectional copy H2D
|
|
Packit |
577717 |
for (i = 0; i < NUM_STREAMS; i++) {
|
|
Packit |
577717 |
RUNTIME_API_CALL(cudaMemcpyAsync((void *)pDevBuffer0[i], pHostBuffer[i], bufferSize, cudaMemcpyHostToDevice, cudaStreams[i]));
|
|
Packit |
577717 |
}
|
|
Packit |
577717 |
RUNTIME_API_CALL(cudaDeviceSynchronize());
|
|
Packit |
577717 |
|
|
Packit |
577717 |
for (i = 0; i < NUM_STREAMS; i++) {
|
|
Packit |
577717 |
RUNTIME_API_CALL(cudaMemcpyAsync((void *)pDevBuffer1[i], pHostBuffer[i], bufferSize, cudaMemcpyHostToDevice, cudaStreams[i]));
|
|
Packit |
577717 |
}
|
|
Packit |
577717 |
RUNTIME_API_CALL(cudaDeviceSynchronize());
|
|
Packit |
577717 |
|
|
Packit |
577717 |
|
|
Packit |
577717 |
for (i = 0; i < numEventGroup; i++) {
|
|
Packit |
577717 |
printf("cuptiEventGroupEnable(eventGroup[%d])\n", i);
|
|
Packit |
577717 |
CUPTI_CALL(cuptiEventGroupEnable(eventGroup[i]));
|
|
Packit |
577717 |
CUPTI_CALL(cuptiEventGroupSetAttribute(eventGroup[i],
|
|
Packit |
577717 |
CUPTI_EVENT_GROUP_ATTR_PROFILE_ALL_DOMAIN_INSTANCES,
|
|
Packit |
577717 |
sizeof(uint32_t), (void*)&value));
|
|
Packit |
577717 |
}
|
|
Packit |
577717 |
CUPTI_CALL(cuptiGetTimestamp(&startTimestamp));
|
|
Packit |
577717 |
|
|
Packit |
577717 |
for (i = 0; i < NUM_STREAMS; i++) {
|
|
Packit |
577717 |
RUNTIME_API_CALL(cudaMemcpyAsync((void *)pDevBuffer0[i], (void *)pDevBuffer1[i], bufferSize, cudaMemcpyDeviceToDevice, cudaStreams[i]));
|
|
Packit |
577717 |
}
|
|
Packit |
577717 |
RUNTIME_API_CALL(cudaDeviceSynchronize());
|
|
Packit |
577717 |
|
|
Packit |
577717 |
for (i = 0; i < NUM_STREAMS; i++) {
|
|
Packit |
577717 |
RUNTIME_API_CALL(cudaMemcpyAsync((void *)pDevBuffer1[i], (void *)pDevBuffer0[i], bufferSize, cudaMemcpyDeviceToDevice, cudaStreams[i]));
|
|
Packit |
577717 |
}
|
|
Packit |
577717 |
RUNTIME_API_CALL(cudaDeviceSynchronize());
|
|
Packit |
577717 |
|
|
Packit |
577717 |
for (i = 0; i < NUM_STREAMS; i++) {
|
|
Packit |
577717 |
test_nvlink_bandwidth<<<GRID_SIZE, BLOCK_SIZE>>>((float*)pDevBuffer1[i], (float*)pDevBuffer0[i]);
|
|
Packit |
577717 |
}
|
|
Packit |
577717 |
|
|
Packit |
577717 |
CUPTI_CALL(cuptiGetTimestamp(&endTimestamp));
|
|
Packit |
577717 |
*timeDuration = endTimestamp - startTimestamp;
|
|
Packit |
577717 |
}
|
|
Packit |
577717 |
|
|
Packit |
577717 |
static void printUsage() {
|
|
Packit |
577717 |
printf("usage: Demonstrate use of NVlink CUPTI APIs\n");
|
|
Packit |
577717 |
printf(" -help : display help message\n");
|
|
Packit |
577717 |
printf(" --cpu-to-gpu : Show results for data transfer between CPU and GPU \n");
|
|
Packit |
577717 |
printf(" --gpu-to-gpu : Show results for data transfer between two GPUs \n");
|
|
Packit |
577717 |
}
|
|
Packit |
577717 |
|
|
Packit |
577717 |
void parseCommandLineArgs(int argc, char *argv[])
|
|
Packit |
577717 |
{
|
|
Packit |
577717 |
if (argc != 2) {
|
|
Packit |
577717 |
printf("Invalid number of options\n");
|
|
Packit |
577717 |
exit(0);
|
|
Packit |
577717 |
}
|
|
Packit |
577717 |
|
|
Packit |
577717 |
if (strcmp(argv[1], "--cpu-to-gpu") == 0) {
|
|
Packit |
577717 |
cpuToGpu = 1;
|
|
Packit |
577717 |
}
|
|
Packit |
577717 |
else if (strcmp(argv[1], "--gpu-to-gpu") == 0) {
|
|
Packit |
577717 |
gpuToGpu = 1;
|
|
Packit |
577717 |
}
|
|
Packit |
577717 |
else if ((strcmp(argv[1], "--help") == 0) ||
|
|
Packit |
577717 |
(strcmp(argv[1], "-help") == 0) ||
|
|
Packit |
577717 |
(strcmp(argv[1], "-h") == 0)) {
|
|
Packit |
577717 |
printUsage();
|
|
Packit |
577717 |
exit(0);
|
|
Packit |
577717 |
}
|
|
Packit |
577717 |
else {
|
|
Packit |
577717 |
cpuToGpu = 1;
|
|
Packit |
577717 |
}
|
|
Packit |
577717 |
}
|
|
Packit |
577717 |
|
|
Packit |
577717 |
int main(int argc, char *argv[])
|
|
Packit |
577717 |
{
|
|
Packit |
577717 |
int deviceCount = 0, i = 0, j = 0, numEventGroup = 0;
|
|
Packit |
577717 |
size_t bufferSize = 0, freeMemory = 0, totalMemory = 0;
|
|
Packit |
577717 |
CUpti_EventGroupSets *passes = NULL;
|
|
Packit |
577717 |
CUcontext ctx;
|
|
Packit |
577717 |
char str[64];
|
|
Packit |
577717 |
|
|
Packit |
577717 |
CUdeviceptr pDevBuffer0[NUM_STREAMS];
|
|
Packit |
577717 |
CUdeviceptr pDevBuffer1[NUM_STREAMS];
|
|
Packit |
577717 |
float* pHostBuffer[NUM_STREAMS];
|
|
Packit |
577717 |
|
|
Packit |
577717 |
cudaStream_t cudaStreams[NUM_STREAMS] = {0};
|
|
Packit |
577717 |
|
|
Packit |
577717 |
CUpti_EventGroup eventGroup[32];
|
|
Packit |
577717 |
CUpti_MetricID metricId[NUM_METRIC];
|
|
Packit |
577717 |
uint32_t numEvents[NUM_METRIC];
|
|
Packit |
577717 |
CUpti_MetricValue metricValue[NUM_METRIC];
|
|
Packit |
577717 |
cudaDeviceProp prop[MAX_DEVICES];
|
|
Packit |
577717 |
uint64_t timeDuration;
|
|
Packit |
577717 |
|
|
Packit |
577717 |
// Adding nvlink Metrics.
|
|
Packit |
577717 |
const char *metricName[NUM_METRIC] = {"nvlink_total_data_transmitted",
|
|
Packit |
577717 |
"nvlink_total_data_received",
|
|
Packit |
577717 |
"nvlink_transmit_throughput",
|
|
Packit |
577717 |
"nvlink_receive_throughput"};
|
|
Packit |
577717 |
|
|
Packit |
577717 |
// Parse command line arguments
|
|
Packit |
577717 |
parseCommandLineArgs(argc, argv);
|
|
Packit |
577717 |
|
|
Packit |
577717 |
CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_NVLINK));
|
|
Packit |
577717 |
CUPTI_CALL(cuptiActivityRegisterCallbacks(bufferRequested, bufferCompleted));
|
|
Packit |
577717 |
|
|
Packit |
577717 |
DRIVER_API_CALL(cuInit(0));
|
|
Packit |
577717 |
|
|
Packit |
577717 |
RUNTIME_API_CALL(cudaGetDeviceCount(&deviceCount));
|
|
Packit |
577717 |
|
|
Packit |
577717 |
printf("There are %d devices.\n", deviceCount);
|
|
Packit |
577717 |
|
|
Packit |
577717 |
if (deviceCount == 0) {
|
|
Packit |
577717 |
printf("There is no device supporting CUDA.\n");
|
|
Packit |
577717 |
exit(-1);
|
|
Packit |
577717 |
}
|
|
Packit |
577717 |
|
|
Packit |
577717 |
for (i = 0; i < deviceCount; i++) {
|
|
Packit |
577717 |
RUNTIME_API_CALL(cudaGetDeviceProperties(&prop[i], i));
|
|
Packit |
577717 |
printf("CUDA Device %d Name: %s\n", i, prop[i].name);
|
|
Packit |
577717 |
}
|
|
Packit |
577717 |
|
|
Packit |
577717 |
// Set memcpy size based on available device memory
|
|
Packit |
577717 |
RUNTIME_API_CALL(cudaMemGetInfo(&freeMemory, &totalMemory));
|
|
Packit |
577717 |
bufferSize = MAX_SIZE < (freeMemory/4) ? MAX_SIZE : (freeMemory/4);
|
|
Packit |
577717 |
|
|
Packit |
577717 |
printf("Total Device Memory available : ");
|
|
Packit |
577717 |
calculateSize(str, (uint64_t)totalMemory);
|
|
Packit |
577717 |
printf("%s\n", str);
|
|
Packit |
577717 |
|
|
Packit |
577717 |
printf("Memcpy size is set to %llu B (%llu MB)\n",
|
|
Packit |
577717 |
(unsigned long long)bufferSize, (unsigned long long)bufferSize/(1024*1024));
|
|
Packit |
577717 |
|
|
Packit |
577717 |
for(i = 0; i < NUM_STREAMS; i++) {
|
|
Packit |
577717 |
RUNTIME_API_CALL(cudaStreamCreate(&cudaStreams[i]));
|
|
Packit |
577717 |
}
|
|
Packit |
577717 |
RUNTIME_API_CALL(cudaDeviceSynchronize());
|
|
Packit |
577717 |
|
|
Packit |
577717 |
// Nvlink-topology Records are generated even before cudaMemcpy API is called.
|
|
Packit |
577717 |
CUPTI_CALL(cuptiActivityFlushAll(0));
|
|
Packit |
577717 |
|
|
Packit |
577717 |
// Transfer Data between Host And Device, if Nvlink is Present
|
|
Packit |
577717 |
// Check condition : nvlinkRec->flag & CUPTI_LINK_FLAG_SYSMEM_ACCESS
|
|
Packit |
577717 |
// True : Nvlink is present between CPU & GPU
|
|
Packit |
577717 |
// False : Nvlink is not present.
|
|
Packit |
577717 |
if ((nvlinkRec) && (((cpuToGpu) && (cpuToGpuAccess)) || ((gpuToGpu) && (gpuToGpuAccess)))) {
|
|
Packit |
577717 |
|
|
Packit |
577717 |
for (i = 0; i < NUM_METRIC; i++) {
|
|
Packit |
577717 |
CUPTI_CALL(cuptiMetricGetIdFromName(0, metricName[i], &metricId[i]));
|
|
Packit |
577717 |
CUPTI_CALL(cuptiMetricGetNumEvents(metricId[i], &numEvents[i]));
|
|
Packit |
577717 |
}
|
|
Packit |
577717 |
|
|
Packit |
577717 |
DRIVER_API_CALL(cuCtxCreate(&ctx, 0, 0));
|
|
Packit |
577717 |
|
|
Packit |
577717 |
CUPTI_CALL(cuptiMetricCreateEventGroupSets(ctx, (sizeof metricId) ,metricId, &passes));
|
|
Packit |
577717 |
|
|
Packit |
577717 |
// EventGroups required to profile Nvlink metrics.
|
|
Packit |
577717 |
for (i = 0; i < (signed)passes->numSets; i++) {
|
|
Packit |
577717 |
for (j = 0; j < (signed)passes->sets[i].numEventGroups; j++) {
|
|
Packit |
577717 |
eventGroup[numEventGroup] = passes->sets[i].eventGroups[j];
|
|
Packit |
577717 |
|
|
Packit |
577717 |
if (!eventGroup[numEventGroup]) {
|
|
Packit |
577717 |
printf("\n eventGroup initialization failed \n");
|
|
Packit |
577717 |
exit(-1);
|
|
Packit |
577717 |
}
|
|
Packit |
577717 |
|
|
Packit |
577717 |
numEventGroup++;
|
|
Packit |
577717 |
}
|
|
Packit |
577717 |
}
|
|
Packit |
577717 |
|
|
Packit |
577717 |
CUPTI_CALL(cuptiSetEventCollectionMode(ctx, CUPTI_EVENT_COLLECTION_MODE_CONTINUOUS));
|
|
Packit |
577717 |
|
|
Packit |
577717 |
// ===== Allocate Memory =====================================
|
|
Packit |
577717 |
|
|
Packit |
577717 |
for(i = 0; i < NUM_STREAMS; i++) {
|
|
Packit |
577717 |
RUNTIME_API_CALL(cudaMalloc((void**)&pDevBuffer0[i], bufferSize));
|
|
Packit |
577717 |
|
|
Packit |
577717 |
pHostBuffer[i] = (float *)malloc(bufferSize);
|
|
Packit |
577717 |
MEMORY_ALLOCATION_CALL(pHostBuffer[i]);
|
|
Packit |
577717 |
}
|
|
Packit |
577717 |
|
|
Packit |
577717 |
if (cpuToGpu) {
|
|
Packit |
577717 |
testCpuToGpu(eventGroup, pDevBuffer0, pHostBuffer, bufferSize, cudaStreams, &timeDuration, numEventGroup);
|
|
Packit |
577717 |
printf("Data tranferred between CPU & Device%d : \n", (int)nvlinkRec->typeDev0);
|
|
Packit |
577717 |
}
|
|
Packit |
577717 |
else if(gpuToGpu) {
|
|
Packit |
577717 |
RUNTIME_API_CALL(cudaSetDevice(1));
|
|
Packit |
577717 |
for(i = 0; i < NUM_STREAMS; i++) {
|
|
Packit |
577717 |
RUNTIME_API_CALL(cudaMalloc((void**)&pDevBuffer1[i], bufferSize));
|
|
Packit |
577717 |
}
|
|
Packit |
577717 |
testGpuToGpu(eventGroup, pDevBuffer0, pDevBuffer1,pHostBuffer, bufferSize, cudaStreams, &timeDuration, numEventGroup);
|
|
Packit |
577717 |
printf("Data tranferred between Device 0 & Device 1 : \n");
|
|
Packit |
577717 |
}
|
|
Packit |
577717 |
|
|
Packit |
577717 |
// Collect Nvlink Metric values for the data transfer via Nvlink for all the eventGroups.
|
|
Packit |
577717 |
for (i = 0; i < numEventGroup; i++) {
|
|
Packit |
577717 |
readMetricValue(eventGroup[i], NUM_EVENTS, 0, metricId, timeDuration, metricValue);
|
|
Packit |
577717 |
|
|
Packit |
577717 |
CUPTI_CALL(cuptiEventGroupDisable(eventGroup[i]));
|
|
Packit |
577717 |
CUPTI_CALL(cuptiEventGroupDestroy(eventGroup[i]));
|
|
Packit |
577717 |
|
|
Packit |
577717 |
for (i = 0; i < NUM_METRIC; i++) {
|
|
Packit |
577717 |
if (printMetricValue(metricId[i], metricValue[i], metricName[i], timeDuration) != 0) {
|
|
Packit |
577717 |
printf("\n printMetricValue failed \n");
|
|
Packit |
577717 |
exit(-1);
|
|
Packit |
577717 |
}
|
|
Packit |
577717 |
}
|
|
Packit |
577717 |
}
|
|
Packit |
577717 |
}
|
|
Packit |
577717 |
else {
|
|
Packit |
577717 |
printf("No Nvlink supported device found\n");
|
|
Packit |
577717 |
}
|
|
Packit |
577717 |
|
|
Packit |
577717 |
return 0;
|
|
Packit |
577717 |
}
|