|
Packit |
577717 |
//This is a matrix multiplication program in CUDA without any optimizations
|
|
Packit |
577717 |
//like tiling, using shared memory etc
|
|
Packit |
577717 |
|
|
Packit |
577717 |
#include<stdio.h>
|
|
Packit |
577717 |
#include<stdlib.h>
|
|
Packit |
577717 |
#include<cuda_runtime.h>
|
|
Packit |
577717 |
#include<assert.h>
|
|
Packit |
577717 |
|
|
Packit |
577717 |
|
|
Packit |
577717 |
__global__ void MatrixMulKernel(float* Md, float* Nd, float* Pd, int width)
|
|
Packit |
577717 |
{
|
|
Packit |
577717 |
|
|
Packit |
577717 |
//2D thread ID
|
|
Packit |
577717 |
int bx=blockIdx.x;
|
|
Packit |
577717 |
int by=blockIdx.y;
|
|
Packit |
577717 |
int tdx=threadIdx.x;
|
|
Packit |
577717 |
int tdy=threadIdx.y;
|
|
Packit |
577717 |
|
|
Packit |
577717 |
int tx=bx*blockDim.x+tdx;
|
|
Packit |
577717 |
int ty=by*blockDim.y+tdy;
|
|
Packit |
577717 |
|
|
Packit |
577717 |
//Pvalue stores the Pd element that is computed by the thread
|
|
Packit |
577717 |
float Pvalue=0;
|
|
Packit |
577717 |
for(int k=0;k
|
|
Packit |
577717 |
float Mdelement=Md[ty*width+k];
|
|
Packit |
577717 |
float Ndelement=Nd[k*width+tx];
|
|
Packit |
577717 |
Pvalue += Mdelement*Ndelement;
|
|
Packit |
577717 |
}
|
|
Packit |
577717 |
//Write the matrix to device memory each thread writes one element
|
|
Packit |
577717 |
Pd[ty*width+tx]=Pvalue;
|
|
Packit |
577717 |
}
|
|
Packit |
577717 |
|
|
Packit |
577717 |
|
|
Packit |
577717 |
int main(int argc, char** argv){
|
|
Packit |
577717 |
|
|
Packit |
577717 |
int width;
|
|
Packit |
577717 |
int BlockDim;
|
|
Packit |
577717 |
int GridDim;
|
|
Packit |
577717 |
|
|
Packit |
577717 |
if (argc == 3){
|
|
Packit |
577717 |
width=atoi(argv[1]);
|
|
Packit |
577717 |
BlockDim=atoi(argv[2]);
|
|
Packit |
577717 |
GridDim=width/BlockDim;
|
|
Packit |
577717 |
printf("Using matrix dimension %dx%d ,Block Dim %dx%d threads per block, Grid Dim %dx%d blocks per grid\n",width,width,BlockDim,BlockDim,GridDim,GridDim);
|
|
Packit |
577717 |
}else{
|
|
Packit |
577717 |
width=512;
|
|
Packit |
577717 |
BlockDim=16;
|
|
Packit |
577717 |
GridDim=width/BlockDim;
|
|
Packit |
577717 |
printf("Using Default Parameters: matrix dimension %dx%d ,Block Dim %dx%d threads per block, Grid Dim %dx%d blocks per grid\n",width,width,BlockDim,BlockDim,GridDim,GridDim);
|
|
Packit |
577717 |
}
|
|
Packit |
577717 |
dim3 dimBlock(BlockDim,BlockDim);
|
|
Packit |
577717 |
dim3 dimGrid(GridDim,GridDim);
|
|
Packit |
577717 |
cudaError_t error;
|
|
Packit |
577717 |
cudaDeviceProp deviceProp;
|
|
Packit |
577717 |
int devID=0;
|
|
Packit |
577717 |
error=cudaGetDevice(&devID);
|
|
Packit |
577717 |
if (error != cudaSuccess)
|
|
Packit |
577717 |
{
|
|
Packit |
577717 |
printf("cudaGetDevice returned error code %d, line(%d)\n", error, __LINE__);
|
|
Packit |
577717 |
}
|
|
Packit |
577717 |
|
|
Packit |
577717 |
error=cudaGetDeviceProperties(&deviceProp,devID);
|
|
Packit |
577717 |
if (error != cudaSuccess){
|
|
Packit |
577717 |
printf("cudaGetDeviceProperties returned error code %d, line(%d)\n", error, __LINE__);
|
|
Packit |
577717 |
}else{
|
|
Packit |
577717 |
printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", devID, deviceProp.name, deviceProp.major, deviceProp.minor);
|
|
Packit |
577717 |
}
|
|
Packit |
577717 |
|
|
Packit |
577717 |
int size=width*width*sizeof(float);
|
|
Packit |
577717 |
float* M=(float*)malloc(size);
|
|
Packit |
577717 |
float* N=(float*)malloc(size);
|
|
Packit |
577717 |
float* P=(float*)malloc(size);
|
|
Packit |
577717 |
|
|
Packit |
577717 |
float* Md,*Nd,*Pd;
|
|
Packit |
577717 |
|
|
Packit |
577717 |
if(!(M&&N)){
|
|
Packit |
577717 |
printf("Malloc failed\n");
|
|
Packit |
577717 |
exit(-1);
|
|
Packit |
577717 |
}
|
|
Packit |
577717 |
|
|
Packit |
577717 |
// initialization of host data
|
|
Packit |
577717 |
for (int j = 0; j < width; j++) {
|
|
Packit |
577717 |
for (int i = 0; i < width; i++) {
|
|
Packit |
577717 |
M[j*width + i] = (float)(rand()%50);
|
|
Packit |
577717 |
N[j*width + i] = (float)(rand()%50);
|
|
Packit |
577717 |
P[j*width + i] = 0;
|
|
Packit |
577717 |
}
|
|
Packit |
577717 |
}
|
|
Packit |
577717 |
|
|
Packit |
577717 |
error=cudaMalloc((void**)&Md,size);
|
|
Packit |
577717 |
if(error!=cudaSuccess){
|
|
Packit |
577717 |
printf("Device memory allocation for M failed \n");
|
|
Packit |
577717 |
exit(-1);
|
|
Packit |
577717 |
}
|
|
Packit |
577717 |
error=cudaMalloc((void**)&Nd,size);
|
|
Packit |
577717 |
if(error!=cudaSuccess){
|
|
Packit |
577717 |
printf("Device memory allocation for N failed \n");
|
|
Packit |
577717 |
exit(-1);
|
|
Packit |
577717 |
}
|
|
Packit |
577717 |
error=cudaMalloc((void**)&Pd,size);
|
|
Packit |
577717 |
if(error!=cudaSuccess){
|
|
Packit |
577717 |
printf("Device memory allocation for P failed \n");
|
|
Packit |
577717 |
exit(-1);
|
|
Packit |
577717 |
}
|
|
Packit |
577717 |
|
|
Packit |
577717 |
error=cudaMemcpy(Md,M,size,cudaMemcpyHostToDevice);
|
|
Packit |
577717 |
if(error!=cudaSuccess){
|
|
Packit |
577717 |
printf("Device memory copy for M failed \n");
|
|
Packit |
577717 |
exit(-1);
|
|
Packit |
577717 |
}
|
|
Packit |
577717 |
|
|
Packit |
577717 |
error=cudaMemcpy(Nd,N,size,cudaMemcpyHostToDevice);
|
|
Packit |
577717 |
if(error!=cudaSuccess){
|
|
Packit |
577717 |
printf("Device memory copy for N failed \n");
|
|
Packit |
577717 |
exit(-1);
|
|
Packit |
577717 |
}
|
|
Packit |
577717 |
|
|
Packit |
577717 |
|
|
Packit |
577717 |
cudaEvent_t start;
|
|
Packit |
577717 |
error=cudaEventCreate(&start;;
|
|
Packit |
577717 |
if(error!=cudaSuccess){
|
|
Packit |
577717 |
printf("cuda event start failed \n");
|
|
Packit |
577717 |
exit(-1);
|
|
Packit |
577717 |
}
|
|
Packit |
577717 |
|
|
Packit |
577717 |
cudaEvent_t stop;
|
|
Packit |
577717 |
error=cudaEventCreate(&stop);
|
|
Packit |
577717 |
if(error!=cudaSuccess){
|
|
Packit |
577717 |
printf("cuda event stop failed \n");
|
|
Packit |
577717 |
exit(-1);
|
|
Packit |
577717 |
}
|
|
Packit |
577717 |
|
|
Packit |
577717 |
error =cudaEventRecord(start,NULL);
|
|
Packit |
577717 |
if(error!=cudaSuccess){
|
|
Packit |
577717 |
printf("cuda event start record failed \n");
|
|
Packit |
577717 |
exit(-1);
|
|
Packit |
577717 |
}
|
|
Packit |
577717 |
|
|
Packit |
577717 |
|
|
Packit |
577717 |
MatrixMulKernel<<<dimGrid,dimBlock>>>(Md,Nd,Pd,width);
|
|
Packit |
577717 |
|
|
Packit |
577717 |
// error=cudaDeviceSynchronize();
|
|
Packit |
577717 |
error =cudaEventRecord(stop,NULL);
|
|
Packit |
577717 |
if(error!=cudaSuccess){
|
|
Packit |
577717 |
printf("cuda event stop record failed with error=%s\n",cudaGetErrorString(error));
|
|
Packit |
577717 |
exit(-1);
|
|
Packit |
577717 |
}
|
|
Packit |
577717 |
|
|
Packit |
577717 |
error = cudaEventSynchronize(stop);
|
|
Packit |
577717 |
if(error!=cudaSuccess){
|
|
Packit |
577717 |
printf("cuda event sync failed :%s\n",cudaGetErrorString(error));
|
|
Packit |
577717 |
exit(-1);
|
|
Packit |
577717 |
}
|
|
Packit |
577717 |
|
|
Packit |
577717 |
|
|
Packit |
577717 |
|
|
Packit |
577717 |
float msecTotal=0.0f;
|
|
Packit |
577717 |
error = cudaEventElapsedTime(&msecTotal,start,stop);
|
|
Packit |
577717 |
if(error!=cudaSuccess){
|
|
Packit |
577717 |
printf("cuda elapsed time calculation failed \n");
|
|
Packit |
577717 |
exit(-1);
|
|
Packit |
577717 |
}
|
|
Packit |
577717 |
|
|
Packit |
577717 |
float msecPerMatrixMul = msecTotal;
|
|
Packit |
577717 |
double flopsPerMatrixMul = 2*width*width*width;
|
|
Packit |
577717 |
double gigaFlops=(flopsPerMatrixMul*1.0e-9f)/(msecPerMatrixMul/1000.0f);
|
|
Packit |
577717 |
printf("Performance= %.2f GFlop/s, Time= %.3f msec, Size= %.0f Ops, WorkgroupSize= %u threads/block\n",
|
|
Packit |
577717 |
gigaFlops,
|
|
Packit |
577717 |
msecPerMatrixMul,
|
|
Packit |
577717 |
flopsPerMatrixMul,
|
|
Packit |
577717 |
width * width);
|
|
Packit |
577717 |
|
|
Packit |
577717 |
|
|
Packit |
577717 |
|
|
Packit |
577717 |
error=cudaMemcpy(P,Pd,size,cudaMemcpyDeviceToHost);
|
|
Packit |
577717 |
if(error!=cudaSuccess){
|
|
Packit |
577717 |
printf("Device memoory copy back for Pd failed \n");
|
|
Packit |
577717 |
exit(-1);
|
|
Packit |
577717 |
}
|
|
Packit |
577717 |
|
|
Packit |
577717 |
printf("Very slow Host Matrix Mult \n");
|
|
Packit |
577717 |
float temp;
|
|
Packit |
577717 |
// initialization of host data
|
|
Packit |
577717 |
for (int i = 0; i < width; ++i) {
|
|
Packit |
577717 |
for ( int j = 0; j < width; ++j) {
|
|
Packit |
577717 |
temp=0;
|
|
Packit |
577717 |
for(int k=0; k
|
|
Packit |
577717 |
temp+=M[i*width+k]*N[k*width+j];
|
|
Packit |
577717 |
if(temp != P[i*width+j]){
|
|
Packit |
577717 |
printf("Matrix Mult Screwed Up!! differ in values CPU:%f and GPU:%f \n",temp,P[i*width+j]);
|
|
Packit |
577717 |
exit(-1);
|
|
Packit |
577717 |
}
|
|
Packit |
577717 |
}
|
|
Packit |
577717 |
|
|
Packit |
577717 |
}
|
|
Packit |
577717 |
|
|
Packit |
577717 |
|
|
Packit |
577717 |
free(M);
|
|
Packit |
577717 |
free(N);
|
|
Packit |
577717 |
free(P);
|
|
Packit |
577717 |
cudaFree(Md);
|
|
Packit |
577717 |
cudaFree(Nd);
|
|
Packit |
577717 |
cudaFree(Pd);
|
|
Packit |
577717 |
cudaDeviceReset();
|
|
Packit |
577717 |
return 1;
|
|
Packit |
577717 |
|
|
Packit |
577717 |
}
|