Blame src/components/cuda/tests/nvlink_bandwidth.cu

Packit Service a1973e
/* 
Packit Service a1973e
 * Copyright 2015-2016 NVIDIA Corporation. All rights reserved.
Packit Service a1973e
 *
Packit Service a1973e
 * Sample to demonstrate use of NVlink CUPTI APIs
Packit Service a1973e
 */
Packit Service a1973e
Packit Service a1973e
#include <stdio.h>
Packit Service a1973e
#include <stdlib.h>
Packit Service a1973e
#include <string.h>
Packit Service a1973e
#include <cuda.h>
Packit Service a1973e
#include <cupti.h>
Packit Service a1973e
Packit Service a1973e
#ifdef PAPI
Packit Service a1973e
#include "papi.h"
Packit Service a1973e
#endif
Packit Service a1973e
Packit Service a1973e
#define CUPTI_CALL(call)                                                \
Packit Service a1973e
    do {                                                                \
Packit Service a1973e
        CUptiResult _status = call;                                     \
Packit Service a1973e
        if (_status != CUPTI_SUCCESS) {                                 \
Packit Service a1973e
            const char *errstr;                                         \
Packit Service a1973e
            cuptiGetResultString(_status, &errstr);                     \
Packit Service a1973e
            fprintf(stderr, "%s:%d: error: function %s failed with error %s.\n", \
Packit Service a1973e
                    __FILE__, __LINE__, #call, errstr);                 \
Packit Service a1973e
            exit(-1);                                                   \
Packit Service a1973e
        }                                                               \
Packit Service a1973e
    } while (0)
Packit Service a1973e
Packit Service a1973e
#define DRIVER_API_CALL(apiFuncCall)                                    \
Packit Service a1973e
    do {                                                                \
Packit Service a1973e
        CUresult _status = apiFuncCall;                                 \
Packit Service a1973e
        if (_status != CUDA_SUCCESS) {                                  \
Packit Service a1973e
            fprintf(stderr, "%s:%d: error: function %s failed with error %d.\n", \
Packit Service a1973e
                    __FILE__, __LINE__, #apiFuncCall, _status);         \
Packit Service a1973e
            exit(-1);                                                   \
Packit Service a1973e
        }                                                               \
Packit Service a1973e
    } while (0)
Packit Service a1973e
Packit Service a1973e
#define RUNTIME_API_CALL(apiFuncCall)                                   \
Packit Service a1973e
    do {                                                                \
Packit Service a1973e
        cudaError_t _status = apiFuncCall;                              \
Packit Service a1973e
        if (_status != cudaSuccess) {                                   \
Packit Service a1973e
            fprintf(stderr, "%s:%d: error: function %s failed with error %s.\n", \
Packit Service a1973e
                    __FILE__, __LINE__, #apiFuncCall, cudaGetErrorString(_status)); \
Packit Service a1973e
            exit(-1);                                                   \
Packit Service a1973e
        }                                                               \
Packit Service a1973e
    } while (0)
Packit Service a1973e
Packit Service a1973e
#define MEMORY_ALLOCATION_CALL(var)                                     \
Packit Service a1973e
    do {                                                                \
Packit Service a1973e
        if (var == NULL) {                                              \
Packit Service a1973e
            fprintf(stderr, "%s:%d: Error: Memory Allocation Failed \n", \
Packit Service a1973e
                    __FILE__, __LINE__);                                \
Packit Service a1973e
            exit(-1);                                                   \
Packit Service a1973e
        }                                                               \
Packit Service a1973e
    } while (0)
Packit Service a1973e
Packit Service a1973e
#define MAX_DEVICES    (32)
Packit Service a1973e
#define BLOCK_SIZE     (1024)
Packit Service a1973e
#define GRID_SIZE      (512)
Packit Service a1973e
#define BUF_SIZE       (32 * 1024)
Packit Service a1973e
#define ALIGN_SIZE     (8)
Packit Service a1973e
#define SUCCESS        (0)
Packit Service a1973e
#define NUM_METRIC     (4)
Packit Service a1973e
#define NUM_EVENTS     (2)
Packit Service a1973e
#define MAX_SIZE       (64*1024*1024)   // 64 MB
Packit Service a1973e
#define NUM_STREAMS    (6)      // gp100 has 6 physical copy engines
Packit Service a1973e
Packit Service a1973e
CUpti_ActivityNvLink *nvlinkRec = NULL;
Packit Service a1973e
int cpuToGpu = 0;
Packit Service a1973e
int gpuToGpu = 0;
Packit Service a1973e
int cpuToGpuAccess = 0;
Packit Service a1973e
int gpuToGpuAccess = 0;
Packit Service a1973e
Packit Service a1973e
extern "C" __global__ void test_nvlink_bandwidth(float *src, float *dst)
Packit Service a1973e
{
Packit Service a1973e
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
Packit Service a1973e
    dst[idx] = src[idx] * 2.0f;
Packit Service a1973e
}
Packit Service a1973e
Packit Service a1973e
static void printActivity(CUpti_Activity * record)
Packit Service a1973e
{
Packit Service a1973e
    if(record->kind == CUPTI_ACTIVITY_KIND_NVLINK) {
Packit Service a1973e
        nvlinkRec = (CUpti_ActivityNvLink *) record;
Packit Service a1973e
        // printf("typeDev0 %d, typeDev1 %d, sysmem %d, peer %d, physical links %d, portdev0 %d, %d, %d, %d, portDev1 %d, %d, %d, %d, bandwidth %llu\n", nvlinkRec->typeDev0, nvlinkRec->typeDev1, ((nvlinkRec->flag & CUPTI_LINK_FLAG_SYSMEM_ACCESS) ? 1 : 0), ((nvlinkRec->flag & CUPTI_LINK_FLAG_PEER_ACCESS) ? 1 : 0), nvlinkRec->physicalNvLinkCount, nvlinkRec->portDev0[0], nvlinkRec->portDev0[1], nvlinkRec->portDev0[2], nvlinkRec->portDev0[3], nvlinkRec->portDev1[0], nvlinkRec->portDev1[1], nvlinkRec->portDev1[2], nvlinkRec->portDev1[3], (long long unsigned int) nvlinkRec->bandwidth);
Packit Service a1973e
        cpuToGpuAccess |= (nvlinkRec->flag & CUPTI_LINK_FLAG_SYSMEM_ACCESS);
Packit Service a1973e
        gpuToGpuAccess |= (nvlinkRec->flag & CUPTI_LINK_FLAG_PEER_ACCESS);
Packit Service a1973e
    } else {
Packit Service a1973e
        printf("Error : Unexpected CUPTI activity kind.\nExpected Activity kind : CUPTI_ACTIVITY_KIND_NVLINK\n");
Packit Service a1973e
    }
Packit Service a1973e
}
Packit Service a1973e
Packit Service a1973e
static void CUPTIAPI bufferRequested(uint8_t ** buffer, size_t * size, size_t * maxNumRecords)
Packit Service a1973e
{
Packit Service a1973e
    *size = BUF_SIZE + ALIGN_SIZE;
Packit Service a1973e
    *buffer = (uint8_t *) calloc(1, *size);
Packit Service a1973e
    MEMORY_ALLOCATION_CALL(*buffer);
Packit Service a1973e
    *maxNumRecords = 0;
Packit Service a1973e
}
Packit Service a1973e
Packit Service a1973e
static void CUPTIAPI bufferCompleted(CUcontext ctx, uint32_t streamId, uint8_t * buffer, size_t size, size_t validSize)
Packit Service a1973e
{
Packit Service a1973e
    CUptiResult status;
Packit Service a1973e
    CUpti_Activity *record = NULL;
Packit Service a1973e
    do {
Packit Service a1973e
        status = cuptiActivityGetNextRecord(buffer, validSize, &record);
Packit Service a1973e
        if(status == CUPTI_SUCCESS) {
Packit Service a1973e
            printActivity(record);
Packit Service a1973e
        } else if(status == CUPTI_ERROR_MAX_LIMIT_REACHED) {
Packit Service a1973e
            break;
Packit Service a1973e
        } else {
Packit Service a1973e
            CUPTI_CALL(status);
Packit Service a1973e
        }
Packit Service a1973e
    } while(1);
Packit Service a1973e
Packit Service a1973e
    size_t dropped;
Packit Service a1973e
    CUPTI_CALL(cuptiActivityGetNumDroppedRecords(ctx, streamId, &dropped));
Packit Service a1973e
    if(dropped != 0) {
Packit Service a1973e
        printf("Dropped %u activity records\n", (unsigned int) dropped);
Packit Service a1973e
    }
Packit Service a1973e
}
Packit Service a1973e
Packit Service a1973e
#define DIM(x) (sizeof(x)/sizeof(*(x)))
Packit Service a1973e
Packit Service a1973e
void calculateSize(char *result, uint64_t size)
Packit Service a1973e
{
Packit Service a1973e
    int i;
Packit Service a1973e
Packit Service a1973e
    const char *sizes[] = { "TB", "GB", "MB", "KB", "B" };
Packit Service a1973e
    uint64_t exbibytes = 1024ULL * 1024ULL * 1024ULL * 1024ULL;
Packit Service a1973e
Packit Service a1973e
    uint64_t multiplier = exbibytes;
Packit Service a1973e
Packit Service a1973e
    for(i = 0; (unsigned) i < DIM(sizes); i++, multiplier /= (uint64_t) 1024) {
Packit Service a1973e
        if(size < multiplier)
Packit Service a1973e
            continue;
Packit Service a1973e
        sprintf(result, "%.1f %s", (float) size / multiplier, sizes[i]);
Packit Service a1973e
        return;
Packit Service a1973e
    }
Packit Service a1973e
    strcpy(result, "0");
Packit Service a1973e
    return;
Packit Service a1973e
}
Packit Service a1973e
Packit Service a1973e
void readMetricValue(CUpti_EventGroup eventGroup, uint32_t numEvents, CUdevice dev, CUpti_MetricID * metricId, uint64_t timeDuration, CUpti_MetricValue * metricValue)
Packit Service a1973e
{
Packit Service a1973e
    size_t bufferSizeBytes, numCountersRead;
Packit Service a1973e
    uint64_t *eventValueArray = NULL;
Packit Service a1973e
    CUpti_EventID *eventIdArray;
Packit Service a1973e
    size_t arraySizeBytes = 0;
Packit Service a1973e
    size_t numTotalInstancesSize = 0;
Packit Service a1973e
    uint64_t numTotalInstances = 0;
Packit Service a1973e
    uint64_t *aggrEventValueArray = NULL;
Packit Service a1973e
    size_t aggrEventValueArraySize;
Packit Service a1973e
    uint32_t i = 0, j = 0;
Packit Service a1973e
    CUpti_EventDomainID domainId;
Packit Service a1973e
    size_t domainSize;
Packit Service a1973e
Packit Service a1973e
    domainSize = sizeof(CUpti_EventDomainID);
Packit Service a1973e
Packit Service a1973e
    CUPTI_CALL(cuptiEventGroupGetAttribute(eventGroup, CUPTI_EVENT_GROUP_ATTR_EVENT_DOMAIN_ID, &domainSize, (void *) &domainId));
Packit Service a1973e
Packit Service a1973e
    numTotalInstancesSize = sizeof(uint64_t);
Packit Service a1973e
Packit Service a1973e
    CUPTI_CALL(cuptiDeviceGetEventDomainAttribute(dev, domainId, CUPTI_EVENT_DOMAIN_ATTR_TOTAL_INSTANCE_COUNT, &numTotalInstancesSize, (void *) &numTotalInstances));
Packit Service a1973e
Packit Service a1973e
    arraySizeBytes = sizeof(CUpti_EventID) * numEvents;
Packit Service a1973e
    bufferSizeBytes = sizeof(uint64_t) * numEvents * numTotalInstances;
Packit Service a1973e
Packit Service a1973e
    eventValueArray = (uint64_t *) malloc(bufferSizeBytes);
Packit Service a1973e
    MEMORY_ALLOCATION_CALL(eventValueArray);
Packit Service a1973e
Packit Service a1973e
    eventIdArray = (CUpti_EventID *) malloc(arraySizeBytes);
Packit Service a1973e
    MEMORY_ALLOCATION_CALL(eventIdArray);
Packit Service a1973e
Packit Service a1973e
    aggrEventValueArray = (uint64_t *) calloc(numEvents, sizeof(uint64_t));
Packit Service a1973e
    MEMORY_ALLOCATION_CALL(aggrEventValueArray);
Packit Service a1973e
Packit Service a1973e
    aggrEventValueArraySize = sizeof(uint64_t) * numEvents;
Packit Service a1973e
Packit Service a1973e
    CUPTI_CALL(cuptiEventGroupReadAllEvents(eventGroup, CUPTI_EVENT_READ_FLAG_NONE, &bufferSizeBytes, eventValueArray, &arraySizeBytes, eventIdArray, &numCountersRead));
Packit Service a1973e
Packit Service a1973e
    for(i = 0; i < numEvents; i++) {
Packit Service a1973e
        for(j = 0; j < numTotalInstances; j++) {
Packit Service a1973e
            aggrEventValueArray[i] += eventValueArray[i + numEvents * j];
Packit Service a1973e
            printf("For event %d instance %d value %ul aggregate %d = %ul\n", i, j,  eventValueArray[i + numEvents * j], i, aggrEventValueArray[i]);
Packit Service a1973e
        }
Packit Service a1973e
    }
Packit Service a1973e
Packit Service a1973e
    for(i = 0; i < NUM_METRIC; i++) {
Packit Service a1973e
        CUPTI_CALL(cuptiMetricGetValue(dev, metricId[i], arraySizeBytes, eventIdArray, aggrEventValueArraySize, aggrEventValueArray, timeDuration, &metricValue[i]));
Packit Service a1973e
    }
Packit Service a1973e
Packit Service a1973e
    free(eventValueArray);
Packit Service a1973e
    free(eventIdArray);
Packit Service a1973e
}
Packit Service a1973e
Packit Service a1973e
// Print metric value, we format based on the value kind
Packit Service a1973e
int printMetricValue(CUpti_MetricID metricId, CUpti_MetricValue metricValue, const char *metricName)
Packit Service a1973e
{
Packit Service a1973e
Packit Service a1973e
    CUpti_MetricValueKind valueKind;
Packit Service a1973e
    char str[64];
Packit Service a1973e
    size_t valueKindSize = sizeof(valueKind);
Packit Service a1973e
Packit Service a1973e
    CUPTI_CALL(cuptiMetricGetAttribute(metricId, CUPTI_METRIC_ATTR_VALUE_KIND, &valueKindSize, &valueKind));
Packit Service a1973e
    switch (valueKind) {
Packit Service a1973e
Packit Service a1973e
    case CUPTI_METRIC_VALUE_KIND_DOUBLE:
Packit Service a1973e
        printf("%s = %f ", metricName, metricValue.metricValueDouble);
Packit Service a1973e
        calculateSize(str, (uint64_t) metricValue.metricValueDouble);
Packit Service a1973e
        printf("%s\n", str);
Packit Service a1973e
        break;
Packit Service a1973e
Packit Service a1973e
    case CUPTI_METRIC_VALUE_KIND_UINT64:
Packit Service a1973e
        printf("%s = %lu ", metricName, metricValue.metricValueUint64);
Packit Service a1973e
        calculateSize(str, (uint64_t) metricValue.metricValueUint64);
Packit Service a1973e
        printf("%s\n", str);
Packit Service a1973e
        break;
Packit Service a1973e
Packit Service a1973e
    case CUPTI_METRIC_VALUE_KIND_INT64:
Packit Service a1973e
        printf("%s = %ld ", metricName, metricValue.metricValueInt64);
Packit Service a1973e
        calculateSize(str, (uint64_t) metricValue.metricValueInt64);
Packit Service a1973e
        printf("%s\n", str);
Packit Service a1973e
        break;
Packit Service a1973e
Packit Service a1973e
    case CUPTI_METRIC_VALUE_KIND_THROUGHPUT:
Packit Service a1973e
        printf("%s = %f ", metricName, metricValue.metricValueThroughput);
Packit Service a1973e
        calculateSize(str, (uint64_t) metricValue.metricValueThroughput);
Packit Service a1973e
        printf("%s/Sec\n", str);
Packit Service a1973e
        break;
Packit Service a1973e
Packit Service a1973e
    default:
Packit Service a1973e
        fprintf(stderr, "error: unknown value kind\n");
Packit Service a1973e
        return -1;
Packit Service a1973e
    }
Packit Service a1973e
    return 0;
Packit Service a1973e
}
Packit Service a1973e
Packit Service a1973e
void testCpuToGpu(CUpti_EventGroup * eventGroup, CUdeviceptr * pDevBuffer, float **pHostBuffer, size_t bufferSize, cudaStream_t * cudaStreams, uint64_t * timeDuration, int numEventGroup)
Packit Service a1973e
{
Packit Service a1973e
    int i;
Packit Service a1973e
Packit Service a1973e
#ifdef CUPTI_ONLY
Packit Service a1973e
    uint32_t value = 1;
Packit Service a1973e
    uint64_t startTimestamp, endTimestamp;
Packit Service a1973e
    for(i = 0; i < numEventGroup; i++) {
Packit Service a1973e
        CUPTI_CALL(cuptiEventGroupEnable(eventGroup[i]));
Packit Service a1973e
        CUPTI_CALL(cuptiEventGroupSetAttribute(eventGroup[i], CUPTI_EVENT_GROUP_ATTR_PROFILE_ALL_DOMAIN_INSTANCES, sizeof(uint32_t), (void *) &value));
Packit Service a1973e
    }
Packit Service a1973e
    CUPTI_CALL(cuptiGetTimestamp(&startTimestamp));
Packit Service a1973e
#endif                          // CUPTI_ONLY
Packit Service a1973e
Packit Service a1973e
    // Unidirectional copy H2D
Packit Service a1973e
    for(i = 0; i < NUM_STREAMS; i++) {
Packit Service a1973e
        RUNTIME_API_CALL(cudaMemcpyAsync((void *) pDevBuffer[i], pHostBuffer[i], bufferSize, cudaMemcpyHostToDevice, cudaStreams[i]));
Packit Service a1973e
    }
Packit Service a1973e
    RUNTIME_API_CALL(cudaDeviceSynchronize());
Packit Service a1973e
Packit Service a1973e
    // Unidirectional copy D2H
Packit Service a1973e
    for(i = 0; i < NUM_STREAMS; i++) {
Packit Service a1973e
        RUNTIME_API_CALL(cudaMemcpyAsync(pHostBuffer[i], (void *) pDevBuffer[i], bufferSize, cudaMemcpyDeviceToHost, cudaStreams[i]));
Packit Service a1973e
    }
Packit Service a1973e
    RUNTIME_API_CALL(cudaDeviceSynchronize());
Packit Service a1973e
Packit Service a1973e
    // Bidirectional copy
Packit Service a1973e
    for(i = 0; i < NUM_STREAMS; i += 2) {
Packit Service a1973e
        RUNTIME_API_CALL(cudaMemcpyAsync((void *) pDevBuffer[i], pHostBuffer[i], bufferSize, cudaMemcpyHostToDevice, cudaStreams[i]));
Packit Service a1973e
        RUNTIME_API_CALL(cudaMemcpyAsync(pHostBuffer[i + 1], (void *) pDevBuffer[i + 1], bufferSize, cudaMemcpyDeviceToHost, cudaStreams[i + 1]));
Packit Service a1973e
    }
Packit Service a1973e
    RUNTIME_API_CALL(cudaDeviceSynchronize());
Packit Service a1973e
Packit Service a1973e
#ifdef CUPTI_ONLY
Packit Service a1973e
    CUPTI_CALL(cuptiGetTimestamp(&endTimestamp));
Packit Service a1973e
    *timeDuration = endTimestamp - startTimestamp;
Packit Service a1973e
#endif                          // CUPTI_ONLY
Packit Service a1973e
}
Packit Service a1973e
Packit Service a1973e
void testGpuToGpu_part1(CUpti_EventGroup * eventGroup, CUdeviceptr * pDevBuffer0, CUdeviceptr * pDevBuffer1, float **pHostBuffer, size_t bufferSize, cudaStream_t * cudaStreams, uint64_t * timeDuration, int numEventGroup)
Packit Service a1973e
{
Packit Service a1973e
    int i;
Packit Service a1973e
Packit Service a1973e
#ifdef CUPTI_ONLY
Packit Service a1973e
    uint32_t value = 1;
Packit Service a1973e
    uint64_t startTimestamp, endTimestamp;
Packit Service a1973e
    for(i = 0; i < numEventGroup; i++) {
Packit Service a1973e
        CUPTI_CALL(cuptiEventGroupEnable(eventGroup[i]));
Packit Service a1973e
        CUPTI_CALL(cuptiEventGroupSetAttribute(eventGroup[i], CUPTI_EVENT_GROUP_ATTR_PROFILE_ALL_DOMAIN_INSTANCES, sizeof(uint32_t), (void *) &value));
Packit Service a1973e
    }
Packit Service a1973e
#endif                          // CUPTI_ONLY
Packit Service a1973e
Packit Service a1973e
    RUNTIME_API_CALL(cudaSetDevice(0));
Packit Service a1973e
    RUNTIME_API_CALL(cudaDeviceEnablePeerAccess(1, 0));
Packit Service a1973e
    RUNTIME_API_CALL(cudaSetDevice(1));
Packit Service a1973e
    RUNTIME_API_CALL(cudaDeviceEnablePeerAccess(0, 0));
Packit Service a1973e
Packit Service a1973e
    // Unidirectional copy H2D
Packit Service a1973e
    for(i = 0; i < NUM_STREAMS; i++) {
Packit Service a1973e
        RUNTIME_API_CALL(cudaMemcpyAsync((void *) pDevBuffer0[i], pHostBuffer[i], bufferSize, cudaMemcpyHostToDevice, cudaStreams[i]));
Packit Service a1973e
    }
Packit Service a1973e
    RUNTIME_API_CALL(cudaDeviceSynchronize());
Packit Service a1973e
Packit Service a1973e
    for(i = 0; i < NUM_STREAMS; i++) {
Packit Service a1973e
        RUNTIME_API_CALL(cudaMemcpyAsync((void *) pDevBuffer1[i], pHostBuffer[i], bufferSize, cudaMemcpyHostToDevice, cudaStreams[i]));
Packit Service a1973e
    }
Packit Service a1973e
    RUNTIME_API_CALL(cudaDeviceSynchronize());
Packit Service a1973e
}
Packit Service a1973e
Packit Service a1973e
void testGpuToGpu_part2(CUpti_EventGroup * eventGroup, CUdeviceptr * pDevBuffer0, CUdeviceptr * pDevBuffer1, float **pHostBuffer, size_t bufferSize, cudaStream_t * cudaStreams, uint64_t * timeDuration, int numEventGroup)
Packit Service a1973e
{
Packit Service a1973e
    int i;
Packit Service a1973e
Packit Service a1973e
#ifdef CUPTI_ONLY
Packit Service a1973e
    uint32_t value = 1;
Packit Service a1973e
    uint64_t startTimestamp, endTimestamp;
Packit Service a1973e
    CUPTI_CALL(cuptiGetTimestamp(&startTimestamp));
Packit Service a1973e
#endif
Packit Service a1973e
Packit Service a1973e
    for(i = 0; i < NUM_STREAMS; i++) {
Packit Service a1973e
        RUNTIME_API_CALL(cudaMemcpyAsync((void *) pDevBuffer0[i], (void *) pDevBuffer1[i], bufferSize, cudaMemcpyDeviceToDevice, cudaStreams[i]));
Packit Service a1973e
        //printf("Copy %zu stream %d to devBuffer0 from devBuffer1 \n", bufferSize, i);
Packit Service a1973e
    }
Packit Service a1973e
    RUNTIME_API_CALL(cudaDeviceSynchronize());
Packit Service a1973e
Packit Service a1973e
    for(i = 0; i < NUM_STREAMS; i++) {
Packit Service a1973e
        RUNTIME_API_CALL(cudaMemcpyAsync((void *) pDevBuffer1[i], (void *) pDevBuffer0[i], bufferSize, cudaMemcpyDeviceToDevice, cudaStreams[i]));
Packit Service a1973e
        // printf("Copy %zu stream %d to devBuffer0 from devBuffer1 \n", bufferSize, i);
Packit Service a1973e
    }
Packit Service a1973e
    RUNTIME_API_CALL(cudaDeviceSynchronize());
Packit Service a1973e
Packit Service a1973e
    for(i = 0; i < NUM_STREAMS; i++) {
Packit Service a1973e
        test_nvlink_bandwidth <<< GRID_SIZE, BLOCK_SIZE >>> ((float *) pDevBuffer1[i], (float *) pDevBuffer0[i]);
Packit Service a1973e
        // printf("test_nvlink_bandwidth stream %d \n", i);
Packit Service a1973e
    }
Packit Service a1973e
    
Packit Service a1973e
#ifdef CUPTI_ONLY
Packit Service a1973e
    CUPTI_CALL(cuptiGetTimestamp(&endTimestamp));
Packit Service a1973e
    *timeDuration = endTimestamp - startTimestamp;
Packit Service a1973e
#endif                          // CUPTI_ONLY
Packit Service a1973e
}
Packit Service a1973e
Packit Service a1973e
Packit Service a1973e
static void printUsage()
Packit Service a1973e
{
Packit Service a1973e
    printf("usage: Demonstrate use of NVlink CUPTI APIs\n");
Packit Service a1973e
    printf("       -help           : display help message\n");
Packit Service a1973e
    printf("       --cpu-to-gpu    : Show results for data transfer between CPU and GPU \n");
Packit Service a1973e
    printf("       --gpu-to-gpu    : Show results for data transfer between two GPUs \n");
Packit Service a1973e
}
Packit Service a1973e
Packit Service a1973e
void parseCommandLineArgs(int argc, char *argv[])
Packit Service a1973e
{
Packit Service a1973e
    if(argc != 2) {
Packit Service a1973e
        printf("Invalid number of options\n");
Packit Service a1973e
        exit(0);
Packit Service a1973e
    }
Packit Service a1973e
Packit Service a1973e
    if(strcmp(argv[1], "--cpu-to-gpu") == 0) {
Packit Service a1973e
        cpuToGpu = 1;
Packit Service a1973e
    } else if(strcmp(argv[1], "--gpu-to-gpu") == 0) {
Packit Service a1973e
        gpuToGpu = 1;
Packit Service a1973e
    } else if((strcmp(argv[1], "--help") == 0) || (strcmp(argv[1], "-help") == 0) || (strcmp(argv[1], "-h") == 0)) {
Packit Service a1973e
        printUsage();
Packit Service a1973e
        exit(0);
Packit Service a1973e
    } else {
Packit Service a1973e
        cpuToGpu = 1;
Packit Service a1973e
    }
Packit Service a1973e
}
Packit Service a1973e
Packit Service a1973e
int main(int argc, char *argv[])
Packit Service a1973e
{
Packit Service a1973e
    int deviceCount = 0, i = 0, numEventGroup = 0;
Packit Service a1973e
    size_t bufferSize = 0, freeMemory = 0, totalMemory = 0;
Packit Service a1973e
    CUcontext ctx;
Packit Service a1973e
    char str[64];
Packit Service a1973e
Packit Service a1973e
    CUdeviceptr pDevBuffer0[NUM_STREAMS];
Packit Service a1973e
    CUdeviceptr pDevBuffer1[NUM_STREAMS];
Packit Service a1973e
    float *pHostBuffer[NUM_STREAMS];
Packit Service a1973e
Packit Service a1973e
    cudaStream_t cudaStreams[NUM_STREAMS] = { 0 };
Packit Service a1973e
    cudaDeviceProp prop[MAX_DEVICES];
Packit Service a1973e
    uint64_t timeDuration;
Packit Service a1973e
    CUpti_EventGroup eventGroup[32];
Packit Service a1973e
Packit Service a1973e
#ifdef CUPTI_ONLY
Packit Service a1973e
    CUpti_MetricID metricId[NUM_METRIC];
Packit Service a1973e
    uint32_t numEvents[NUM_METRIC];
Packit Service a1973e
    CUpti_MetricValue metricValue[NUM_METRIC];
Packit Service a1973e
Packit Service a1973e
    // Adding nvlink Metrics.
Packit Service a1973e
    const char *metricName[NUM_METRIC] = { 
Packit Service a1973e
        "nvlink_total_data_transmitted",
Packit Service a1973e
        "nvlink_total_data_received",
Packit Service a1973e
        "nvlink_transmit_throughput",
Packit Service a1973e
        "nvlink_receive_throughput"
Packit Service a1973e
    };
Packit Service a1973e
#endif                          // CUPTI_ONLY
Packit Service a1973e
Packit Service a1973e
    // Parse command line arguments
Packit Service a1973e
    parseCommandLineArgs(argc, argv);
Packit Service a1973e
Packit Service a1973e
    CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_NVLINK));
Packit Service a1973e
    CUPTI_CALL(cuptiActivityRegisterCallbacks(bufferRequested, bufferCompleted));
Packit Service a1973e
Packit Service a1973e
    DRIVER_API_CALL(cuInit(0));
Packit Service a1973e
    RUNTIME_API_CALL(cudaGetDeviceCount(&deviceCount));
Packit Service a1973e
    printf("There are %d devices.\n", deviceCount);
Packit Service a1973e
Packit Service a1973e
    if(deviceCount == 0) {
Packit Service a1973e
        printf("There is no device supporting CUDA.\n");
Packit Service a1973e
        exit(-1);
Packit Service a1973e
    }
Packit Service a1973e
Packit Service a1973e
    for(i = 0; i < deviceCount; i++) {
Packit Service a1973e
        RUNTIME_API_CALL(cudaGetDeviceProperties(&prop[i], i));
Packit Service a1973e
        printf("CUDA Device %d Name: %s\n", i, prop[i].name);
Packit Service a1973e
    }
Packit Service a1973e
Packit Service a1973e
    // Set memcpy size based on available device memory
Packit Service a1973e
    RUNTIME_API_CALL(cudaMemGetInfo(&freeMemory, &totalMemory));
Packit Service a1973e
    printf("Total Device Memory available : ");
Packit Service a1973e
    calculateSize(str, (uint64_t) totalMemory);
Packit Service a1973e
    printf("%s\n", str);
Packit Service a1973e
Packit Service a1973e
    bufferSize = MAX_SIZE < (freeMemory / 4) ? MAX_SIZE : (freeMemory / 4);
Packit Service a1973e
    bufferSize = bufferSize/2;
Packit Service a1973e
    printf("Memcpy size is set to %llu B (%llu MB)\n", (unsigned long long) bufferSize, (unsigned long long) bufferSize / (1024 * 1024));
Packit Service a1973e
Packit Service a1973e
    for(i = 0; i < NUM_STREAMS; i++) {
Packit Service a1973e
        RUNTIME_API_CALL(cudaStreamCreate(&cudaStreams[i]));
Packit Service a1973e
    }
Packit Service a1973e
    RUNTIME_API_CALL(cudaDeviceSynchronize());
Packit Service a1973e
Packit Service a1973e
    // Nvlink-topology Records are generated even before cudaMemcpy API is called.
Packit Service a1973e
    CUPTI_CALL(cuptiActivityFlushAll(0));
Packit Service a1973e
Packit Service a1973e
    // Transfer Data between Host And Device, if Nvlink is Present
Packit Service a1973e
    // Check condition : nvlinkRec->flag & CUPTI_LINK_FLAG_SYSMEM_ACCESS
Packit Service a1973e
    // True : Nvlink is present between CPU & GPU
Packit Service a1973e
    // False : Nvlink is not present.
Packit Service a1973e
    if((nvlinkRec) && (((cpuToGpu) && (cpuToGpuAccess)) || ((gpuToGpu) && (gpuToGpuAccess)))) {
Packit Service a1973e
Packit Service a1973e
#ifdef CUPTI_ONLY
Packit Service a1973e
        for(i = 0; i < NUM_METRIC; i++) {
Packit Service a1973e
            CUPTI_CALL(cuptiMetricGetIdFromName(0, metricName[i], &metricId[i]));
Packit Service a1973e
            CUPTI_CALL(cuptiMetricGetNumEvents(metricId[i], &numEvents[i]));
Packit Service a1973e
        }
Packit Service a1973e
#endif                          // CUPTI_ONLY
Packit Service a1973e
Packit Service a1973e
        DRIVER_API_CALL(cuCtxCreate(&ctx, 0, 0));
Packit Service a1973e
Packit Service a1973e
#ifdef PAPI
Packit Service a1973e
        printf("Setup PAPI counters internally (PAPI)\n");
Packit Service a1973e
        int EventSet = PAPI_NULL;
Packit Service a1973e
        long long values[MAX_DEVICES * NUM_METRIC];
Packit Service a1973e
        char *EventName[MAX_DEVICES * NUM_METRIC];
Packit Service a1973e
        int eventCount;
Packit Service a1973e
        int retval, ee;
Packit Service a1973e
Packit Service a1973e
        /* PAPI Initialization */
Packit Service a1973e
        retval = PAPI_library_init(PAPI_VER_CURRENT);
Packit Service a1973e
        if(retval != PAPI_VER_CURRENT) fprintf(stderr, "PAPI_library_init failed\n");
Packit Service a1973e
        printf("PAPI version: %d.%d.%d\n", PAPI_VERSION_MAJOR(PAPI_VERSION), PAPI_VERSION_MINOR(PAPI_VERSION), PAPI_VERSION_REVISION(PAPI_VERSION));
Packit Service a1973e
Packit Service a1973e
        retval = PAPI_create_eventset(&EventSet);
Packit Service a1973e
        if(retval != PAPI_OK) fprintf(stderr, "PAPI_create_eventset failed\n");
Packit Service a1973e
Packit Service a1973e
        const char *EventEndings[NUM_METRIC] = {
Packit Service a1973e
            "cuda:::metric:nvlink_total_data_transmitted",
Packit Service a1973e
            "cuda:::metric:nvlink_total_data_received",
Packit Service a1973e
            "cuda:::metric:nvlink_transmit_throughput",
Packit Service a1973e
            "cuda:::metric:nvlink_receive_throughput",
Packit Service a1973e
        };
Packit Service a1973e
        
Packit Service a1973e
        // Add events at a GPU specific level ... eg cuda:::metric:nvlink_total_data_transmitted:device=0
Packit Service a1973e
        // Just profile devices to match the CUPTI example
Packit Service a1973e
        char tmpEventName[1024];
Packit Service a1973e
        eventCount = 0;
Packit Service a1973e
        for(i = 0; i < 1; i++) { // only profile device 0
Packit Service a1973e
            printf("Set device to %d\n", i);
Packit Service a1973e
            for(ee = 0; ee < NUM_METRIC; ee++) {
Packit Service a1973e
                snprintf(tmpEventName, 1024, "%s:device=%d\0", EventEndings[ee], i);
Packit Service a1973e
                printf("Trying to add event %s to GPU %d in PAPI...", tmpEventName, i);
Packit Service a1973e
                retval = PAPI_add_named_event(EventSet, tmpEventName);
Packit Service a1973e
                if(retval == PAPI_OK) {
Packit Service a1973e
                    printf("Added event\n");
Packit Service a1973e
                    EventName[eventCount] = strdup(tmpEventName);
Packit Service a1973e
                    eventCount++;
Packit Service a1973e
                } else {
Packit Service a1973e
                    printf("Could not add event\n");
Packit Service a1973e
                }
Packit Service a1973e
            }
Packit Service a1973e
        }
Packit Service a1973e
        for(i = 0; i < eventCount; i++)
Packit Service a1973e
            values[i] = -1;
Packit Service a1973e
#endif                          // PAPI_ONLY
Packit Service a1973e
Packit Service a1973e
Packit Service a1973e
#ifdef CUPTI_ONLY
Packit Service a1973e
        CUpti_EventGroupSets *passes = NULL;
Packit Service a1973e
        int j = 0;
Packit Service a1973e
        CUPTI_CALL(cuptiMetricCreateEventGroupSets(ctx, (sizeof metricId), metricId, &passes));
Packit Service a1973e
        // EventGroups required to profile Nvlink metrics.
Packit Service a1973e
        for(i = 0; i < (signed) passes->numSets; i++) {
Packit Service a1973e
            for(j = 0; j < (signed) passes->sets[i].numEventGroups; j++) {
Packit Service a1973e
                eventGroup[numEventGroup] = passes->sets[i].eventGroups[j];
Packit Service a1973e
                if(!eventGroup[numEventGroup]) {
Packit Service a1973e
                    printf("\n eventGroup initialization failed \n");
Packit Service a1973e
                    exit(-1);
Packit Service a1973e
                }
Packit Service a1973e
                numEventGroup++;
Packit Service a1973e
            }
Packit Service a1973e
        }
Packit Service a1973e
        CUPTI_CALL(cuptiSetEventCollectionMode(ctx, CUPTI_EVENT_COLLECTION_MODE_CONTINUOUS));
Packit Service a1973e
#endif                          // CUPTI_ONLY
Packit Service a1973e
Packit Service a1973e
        // ===== Allocate Memory =====================================
Packit Service a1973e
Packit Service a1973e
        for(i = 0; i < NUM_STREAMS; i++) {
Packit Service a1973e
            RUNTIME_API_CALL(cudaMalloc((void **) &pDevBuffer0[i], bufferSize));
Packit Service a1973e
Packit Service a1973e
            pHostBuffer[i] = (float *) malloc(bufferSize);
Packit Service a1973e
            MEMORY_ALLOCATION_CALL(pHostBuffer[i]);
Packit Service a1973e
        }
Packit Service a1973e
        
Packit Service a1973e
        if(cpuToGpu) {
Packit Service a1973e
#ifdef PAPI
Packit Service a1973e
            retval = PAPI_start( EventSet );
Packit Service a1973e
            if( retval != PAPI_OK )  fprintf( stderr, "PAPI_start failed\n" );
Packit Service a1973e
#endif// PAPI
Packit Service a1973e
            testCpuToGpu(eventGroup, pDevBuffer0, pHostBuffer, bufferSize, cudaStreams, &timeDuration, numEventGroup);
Packit Service a1973e
#ifdef PAPI
Packit Service a1973e
            retval = PAPI_stop(EventSet, values);
Packit Service a1973e
            if( retval != PAPI_OK )  fprintf( stderr, "PAPI_stop failed\n" );
Packit Service a1973e
#endif 
Packit Service a1973e
            printf("Data tranferred between CPU & Device%d : \n", (int) nvlinkRec->typeDev0);
Packit Service a1973e
Packit Service a1973e
Packit Service a1973e
        } else if(gpuToGpu) {
Packit Service a1973e
            RUNTIME_API_CALL(cudaSetDevice(1));
Packit Service a1973e
            for(i = 0; i < NUM_STREAMS; i++) 
Packit Service a1973e
                RUNTIME_API_CALL(cudaMalloc((void **) &pDevBuffer1[i], bufferSize));
Packit Service a1973e
            testGpuToGpu_part1(eventGroup, pDevBuffer0, pDevBuffer1, pHostBuffer, bufferSize, cudaStreams, &timeDuration, numEventGroup);
Packit Service a1973e
#ifdef PAPI
Packit Service a1973e
            retval = PAPI_start( EventSet );
Packit Service a1973e
            if( retval != PAPI_OK )  fprintf( stderr, "PAPI_start failed\n" );
Packit Service a1973e
#endif
Packit Service a1973e
            testGpuToGpu_part2(eventGroup, pDevBuffer0, pDevBuffer1, pHostBuffer, bufferSize, cudaStreams, &timeDuration, numEventGroup);
Packit Service a1973e
#ifdef PAPI
Packit Service a1973e
            retval = PAPI_stop(EventSet, values);
Packit Service a1973e
            if( retval != PAPI_OK )  fprintf( stderr, "PAPI_stop failed\n" );
Packit Service a1973e
#endif
Packit Service a1973e
            printf("Data tranferred between Device 0 & Device 1 : \n");
Packit Service a1973e
        }
Packit Service a1973e
Packit Service a1973e
#ifdef CUPTI_ONLY
Packit Service a1973e
        // Collect Nvlink Metric values for the data transfer via Nvlink for all the eventGroups.
Packit Service a1973e
        for(i = 0; i < numEventGroup; i++) {
Packit Service a1973e
            readMetricValue(eventGroup[i], NUM_EVENTS, 0, metricId, timeDuration, metricValue);
Packit Service a1973e
Packit Service a1973e
            CUPTI_CALL(cuptiEventGroupDisable(eventGroup[i]));
Packit Service a1973e
            CUPTI_CALL(cuptiEventGroupDestroy(eventGroup[i]));
Packit Service a1973e
Packit Service a1973e
            for(i = 0; i < NUM_METRIC; i++) {
Packit Service a1973e
                if(printMetricValue(metricId[i], metricValue[i], metricName[i]) != 0) {
Packit Service a1973e
                    printf("\n printMetricValue failed \n");
Packit Service a1973e
                    exit(-1);
Packit Service a1973e
                }
Packit Service a1973e
            }
Packit Service a1973e
        }
Packit Service a1973e
#endif                          // CUPTI_ONLY
Packit Service a1973e
Packit Service a1973e
#ifdef PAPI
Packit Service a1973e
        for(i = 0; i < eventCount; i++) {
Packit Service a1973e
            char str[64];                
Packit Service a1973e
            calculateSize(str, (uint64_t) values[i] );
Packit Service a1973e
            printf("PAPI %s %s \n", EventName[i], str);
Packit Service a1973e
        }
Packit Service a1973e
	retval = PAPI_cleanup_eventset(EventSet);
Packit Service a1973e
	if( retval != PAPI_OK )
Packit Service a1973e
            fprintf(stderr, "PAPI_cleanup_eventset failed\n");
Packit Service a1973e
	retval = PAPI_destroy_eventset(&EventSet);
Packit Service a1973e
	if (retval != PAPI_OK)
Packit Service a1973e
            fprintf(stderr, "PAPI_destroy_eventset failed\n");
Packit Service a1973e
	PAPI_shutdown();
Packit Service a1973e
#endif
Packit Service a1973e
        
Packit Service a1973e
    } else {
Packit Service a1973e
        printf("No Nvlink supported device found\n");
Packit Service a1973e
    }
Packit Service a1973e
Packit Service a1973e
    printf("Exit\n");
Packit Service a1973e
    return 0;
Packit Service a1973e
}