Blame src/components/cuda/sampling/test/matmul.cu

Packit 577717
//This is a matrix multiplication program in CUDA without any optimizations
Packit 577717
//like tiling, using shared memory etc
Packit 577717
Packit 577717
#include<stdio.h>
Packit 577717
#include<stdlib.h>
Packit 577717
#include<cuda_runtime.h>
Packit 577717
#include<assert.h>
Packit 577717
Packit 577717
Packit 577717
__global__ void MatrixMulKernel(float* Md, float* Nd, float* Pd, int width)
Packit 577717
{
Packit 577717
Packit 577717
	//2D thread ID 
Packit 577717
	int bx=blockIdx.x;
Packit 577717
	int by=blockIdx.y;
Packit 577717
	int tdx=threadIdx.x;
Packit 577717
	int tdy=threadIdx.y;
Packit 577717
Packit 577717
	int tx=bx*blockDim.x+tdx;
Packit 577717
	int ty=by*blockDim.y+tdy;
Packit 577717
	
Packit 577717
	//Pvalue stores the Pd element that is computed by the thread
Packit 577717
	float Pvalue=0;
Packit 577717
	for(int k=0;k
Packit 577717
		float Mdelement=Md[ty*width+k];
Packit 577717
		float Ndelement=Nd[k*width+tx];
Packit 577717
		Pvalue += Mdelement*Ndelement;
Packit 577717
	}
Packit 577717
	//Write the matrix to device memory each thread writes one element
Packit 577717
	Pd[ty*width+tx]=Pvalue;
Packit 577717
}
Packit 577717
Packit 577717
Packit 577717
int main(int argc, char** argv){
Packit 577717
Packit 577717
	int width;
Packit 577717
	int BlockDim;
Packit 577717
	int GridDim;
Packit 577717
Packit 577717
	if (argc == 3){
Packit 577717
		width=atoi(argv[1]);
Packit 577717
		BlockDim=atoi(argv[2]);
Packit 577717
		GridDim=width/BlockDim;
Packit 577717
		printf("Using matrix dimension %dx%d ,Block Dim %dx%d threads per block, Grid Dim %dx%d blocks per grid\n",width,width,BlockDim,BlockDim,GridDim,GridDim);
Packit 577717
	}else{
Packit 577717
		width=512;
Packit 577717
		BlockDim=16;
Packit 577717
		GridDim=width/BlockDim;
Packit 577717
		printf("Using Default Parameters: matrix dimension %dx%d ,Block Dim %dx%d threads per block, Grid Dim %dx%d blocks per grid\n",width,width,BlockDim,BlockDim,GridDim,GridDim);		
Packit 577717
	}
Packit 577717
	dim3 dimBlock(BlockDim,BlockDim);
Packit 577717
	dim3 dimGrid(GridDim,GridDim);
Packit 577717
	cudaError_t error;
Packit 577717
	cudaDeviceProp deviceProp;
Packit 577717
	int devID=0;
Packit 577717
	error=cudaGetDevice(&devID);
Packit 577717
	if (error != cudaSuccess)
Packit 577717
	{
Packit 577717
		printf("cudaGetDevice returned error code %d, line(%d)\n", error, __LINE__);
Packit 577717
	}
Packit 577717
Packit 577717
	error=cudaGetDeviceProperties(&deviceProp,devID);
Packit 577717
	if (error != cudaSuccess){
Packit 577717
		printf("cudaGetDeviceProperties returned error code %d, line(%d)\n", error, __LINE__);
Packit 577717
	}else{
Packit 577717
		printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", devID, deviceProp.name, deviceProp.major, deviceProp.minor);
Packit 577717
	}
Packit 577717
Packit 577717
	int size=width*width*sizeof(float);
Packit 577717
	float* M=(float*)malloc(size);
Packit 577717
	float* N=(float*)malloc(size);
Packit 577717
	float* P=(float*)malloc(size);
Packit 577717
Packit 577717
	float* Md,*Nd,*Pd;
Packit 577717
Packit 577717
	if(!(M&&N)){
Packit 577717
		printf("Malloc failed\n");
Packit 577717
		exit(-1);
Packit 577717
	}
Packit 577717
Packit 577717
	 // initialization of host data
Packit 577717
	for (int j = 0; j < width; j++) {
Packit 577717
		for (int i = 0; i < width; i++) {
Packit 577717
			M[j*width + i] = (float)(rand()%50);
Packit 577717
			N[j*width + i] = (float)(rand()%50);
Packit 577717
			P[j*width + i] = 0;
Packit 577717
		}
Packit 577717
	}
Packit 577717
Packit 577717
	error=cudaMalloc((void**)&Md,size);
Packit 577717
	if(error!=cudaSuccess){
Packit 577717
		printf("Device memory allocation for M failed \n");
Packit 577717
		exit(-1);
Packit 577717
	}
Packit 577717
	error=cudaMalloc((void**)&Nd,size);
Packit 577717
	if(error!=cudaSuccess){
Packit 577717
		printf("Device memory allocation for N failed \n");
Packit 577717
		exit(-1);
Packit 577717
	}
Packit 577717
	error=cudaMalloc((void**)&Pd,size);
Packit 577717
	if(error!=cudaSuccess){
Packit 577717
		printf("Device memory allocation for P failed \n");
Packit 577717
		exit(-1);
Packit 577717
	}
Packit 577717
Packit 577717
	error=cudaMemcpy(Md,M,size,cudaMemcpyHostToDevice);
Packit 577717
	if(error!=cudaSuccess){
Packit 577717
		printf("Device memory copy for M failed \n");
Packit 577717
		exit(-1);
Packit 577717
	}
Packit 577717
	
Packit 577717
	error=cudaMemcpy(Nd,N,size,cudaMemcpyHostToDevice);
Packit 577717
	if(error!=cudaSuccess){
Packit 577717
		printf("Device memory copy for N failed \n");
Packit 577717
		exit(-1);
Packit 577717
	}
Packit 577717
Packit 577717
	
Packit 577717
	cudaEvent_t start;
Packit 577717
	error=cudaEventCreate(&start;;
Packit 577717
	if(error!=cudaSuccess){
Packit 577717
		printf("cuda event start failed \n");
Packit 577717
		exit(-1);
Packit 577717
	}
Packit 577717
Packit 577717
	cudaEvent_t stop;
Packit 577717
	error=cudaEventCreate(&stop);
Packit 577717
	if(error!=cudaSuccess){
Packit 577717
		printf("cuda event stop failed \n");
Packit 577717
		exit(-1);
Packit 577717
	}
Packit 577717
Packit 577717
	error =cudaEventRecord(start,NULL);
Packit 577717
	if(error!=cudaSuccess){
Packit 577717
		printf("cuda event start record failed \n");
Packit 577717
		exit(-1);
Packit 577717
	}
Packit 577717
	
Packit 577717
	
Packit 577717
	MatrixMulKernel<<<dimGrid,dimBlock>>>(Md,Nd,Pd,width);
Packit 577717
Packit 577717
//	error=cudaDeviceSynchronize();
Packit 577717
	error =cudaEventRecord(stop,NULL);
Packit 577717
	if(error!=cudaSuccess){
Packit 577717
		printf("cuda event stop record failed with error=%s\n",cudaGetErrorString(error));
Packit 577717
		exit(-1);
Packit 577717
	}
Packit 577717
Packit 577717
	error = cudaEventSynchronize(stop);
Packit 577717
	if(error!=cudaSuccess){
Packit 577717
		printf("cuda event sync failed :%s\n",cudaGetErrorString(error));
Packit 577717
		exit(-1);
Packit 577717
	}
Packit 577717
	
Packit 577717
Packit 577717
Packit 577717
	float msecTotal=0.0f;
Packit 577717
	error = cudaEventElapsedTime(&msecTotal,start,stop);
Packit 577717
	if(error!=cudaSuccess){
Packit 577717
		printf("cuda elapsed time calculation failed \n");
Packit 577717
		exit(-1);
Packit 577717
	}
Packit 577717
Packit 577717
	float msecPerMatrixMul = msecTotal;
Packit 577717
	double flopsPerMatrixMul = 2*width*width*width;
Packit 577717
	double gigaFlops=(flopsPerMatrixMul*1.0e-9f)/(msecPerMatrixMul/1000.0f);
Packit 577717
	printf("Performance= %.2f GFlop/s, Time= %.3f msec, Size= %.0f Ops, WorkgroupSize= %u threads/block\n",
Packit 577717
		    gigaFlops,
Packit 577717
			msecPerMatrixMul,
Packit 577717
			flopsPerMatrixMul,
Packit 577717
			width * width);
Packit 577717
Packit 577717
Packit 577717
Packit 577717
	error=cudaMemcpy(P,Pd,size,cudaMemcpyDeviceToHost);
Packit 577717
	if(error!=cudaSuccess){
Packit 577717
		printf("Device memoory copy back for Pd failed \n");
Packit 577717
		exit(-1);
Packit 577717
	}
Packit 577717
Packit 577717
	printf("Very slow Host Matrix Mult \n");
Packit 577717
	float temp;
Packit 577717
	// initialization of host data
Packit 577717
	for (int i = 0; i < width; ++i) {
Packit 577717
		for ( int j = 0; j < width; ++j) {
Packit 577717
			temp=0;
Packit 577717
			for(int k=0; k
Packit 577717
				temp+=M[i*width+k]*N[k*width+j];
Packit 577717
			if(temp != P[i*width+j]){
Packit 577717
				printf("Matrix Mult Screwed Up!! differ in values CPU:%f and GPU:%f \n",temp,P[i*width+j]);
Packit 577717
				exit(-1);
Packit 577717
			}
Packit 577717
		}
Packit 577717
		
Packit 577717
	}
Packit 577717
	
Packit 577717
	
Packit 577717
	free(M);
Packit 577717
	free(N);
Packit 577717
	free(P);
Packit 577717
	cudaFree(Md);	
Packit 577717
	cudaFree(Nd);	
Packit 577717
	cudaFree(Pd);	
Packit 577717
	cudaDeviceReset();
Packit 577717
	return 1;
Packit 577717
Packit 577717
}