// This is a matrix multiplication program in CUDA without any optimizations
// like tiling, using shared memory etc.
//
// NOTE(review): the original file was mangled — every '<' and the text after
// it was stripped (include header names, the kernel loop condition/body, the
// <<<...>>> launch config, the host reference loops). The code below is
// reconstructed from the surviving fragments; runtime strings that survived
// are kept byte-for-byte except where the string itself was the bug.
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cuda_runtime.h>

// Naive matrix-multiply kernel: one thread computes one element of Pd.
// Expects a 2D grid of 2D blocks covering a width x width output. Md, Nd and
// Pd are square (width x width) row-major matrices in global memory.
// No tiling / shared memory — each thread streams a full row of Md and a
// full column of Nd from global memory.
__global__ void MatrixMulKernel(float* Md, float* Nd, float* Pd, int width)
{
    // 2D thread ID
    int bx  = blockIdx.x;
    int by  = blockIdx.y;
    int tdx = threadIdx.x;
    int tdy = threadIdx.y;
    int tx  = bx * blockDim.x + tdx;   // global column index
    int ty  = by * blockDim.y + tdy;   // global row index

    // Bounds guard: the grid is rounded up, so threads past the matrix edge
    // must not write (the original had no guard — out-of-bounds access when
    // width is not a multiple of the block size).
    if (tx >= width || ty >= width)
        return;

    // Pvalue stores the Pd element that is computed by the thread.
    float Pvalue = 0.0f;
    for (int k = 0; k < width; ++k)
        Pvalue += Md[ty * width + k] * Nd[k * width + tx];
    Pd[ty * width + tx] = Pvalue;
}

int main(void)
{
    cudaError_t error;

    // Problem size: width x width matrices.
    // NOTE(review): the original value was lost in the mangled source;
    // 1024 is assumed — confirm against the original build/run scripts.
    const int width = 1024;
    const size_t size = (size_t)width * (size_t)width * sizeof(float);

    // Host allocations.
    float* M = (float*)malloc(size);
    float* N = (float*)malloc(size);
    float* P = (float*)malloc(size);
    if (M == NULL || N == NULL || P == NULL) {
        printf("host memory allocation failed \n");
        exit(-1);
    }

    // Initialization of host data.
    // NOTE(review): the original init values were lost; 1.0f keeps the
    // host/device comparison exact in float.
    for (int i = 0; i < width; ++i) {
        for (int j = 0; j < width; ++j) {
            M[i * width + j] = 1.0f;
            N[i * width + j] = 1.0f;
        }
    }

    // Device allocations.
    float *Md = NULL, *Nd = NULL, *Pd = NULL;
    error = cudaMalloc((void**)&Md, size);
    if (error != cudaSuccess) {
        printf("Device memory allocation for Md failed \n");
        exit(-1);
    }
    error = cudaMalloc((void**)&Nd, size);
    if (error != cudaSuccess) {
        printf("Device memory allocation for Nd failed \n");
        exit(-1);
    }
    error = cudaMalloc((void**)&Pd, size);
    if (error != cudaSuccess) {
        printf("Device memory allocation for Pd failed \n");
        exit(-1);
    }

    // Host-to-device copies of the input matrices.
    error = cudaMemcpy(Md, M, size, cudaMemcpyHostToDevice);
    if (error != cudaSuccess) {
        printf("Device memory copy for Md failed \n");
        exit(-1);
    }
    error = cudaMemcpy(Nd, N, size, cudaMemcpyHostToDevice);
    if (error != cudaSuccess) {
        printf("Device memory copy for Nd failed \n");
        exit(-1);
    }

    // Events for timing the kernel on the device.
    cudaEvent_t start, stop;
    error = cudaEventCreate(&start);
    if (error != cudaSuccess) {
        printf("cuda event create for start failed with error=%s\n",
               cudaGetErrorString(error));
        exit(-1);
    }
    error = cudaEventCreate(&stop);
    if (error != cudaSuccess) {
        printf("cuda event create for stop failed with error=%s\n",
               cudaGetErrorString(error));
        exit(-1);
    }

    // Launch configuration: 16x16 threads per block, grid rounded up so the
    // whole output is covered (the kernel bounds-guards the overhang).
    dim3 block(16, 16);
    dim3 grid((width + block.x - 1) / block.x,
              (width + block.y - 1) / block.y);

    error = cudaEventRecord(start, NULL);
    if (error != cudaSuccess) {
        printf("cuda event start record failed with error=%s\n",
               cudaGetErrorString(error));
        exit(-1);
    }

    MatrixMulKernel<<<grid, block>>>(Md, Nd, Pd, width);
    // Kernel launches return no status directly; check for launch-config
    // errors explicitly (the original never did).
    error = cudaGetLastError();
    if (error != cudaSuccess) {
        printf("kernel launch failed with error=%s\n",
               cudaGetErrorString(error));
        exit(-1);
    }
    // error=cudaDeviceSynchronize();  // not needed: cudaEventSynchronize
    //                                 // below waits for the kernel.

    error = cudaEventRecord(stop, NULL);
    if (error != cudaSuccess) {
        printf("cuda event stop record failed with error=%s\n",
               cudaGetErrorString(error));
        exit(-1);
    }
    error = cudaEventSynchronize(stop);
    if (error != cudaSuccess) {
        printf("cuda event sync failed :%s\n", cudaGetErrorString(error));
        exit(-1);
    }

    float msecTotal = 0.0f;
    error = cudaEventElapsedTime(&msecTotal, start, stop);
    if (error != cudaSuccess) {
        printf("cuda elapsed time calculation failed \n");
        exit(-1);
    }

    float msecPerMatrixMul = msecTotal;
    // 2*width^3 computed in double: the original int expression overflows
    // exactly at width = 1024 (2 * 1024^3 == 2^31).
    double flopsPerMatrixMul = 2.0 * (double)width * (double)width * (double)width;
    double gigaFlops = (flopsPerMatrixMul * 1.0e-9f) / (msecPerMatrixMul / 1000.0f);
    // Original passed width*width as "threads/block" — that is the matrix
    // element count; report the actual block size instead.
    printf("Performance= %.2f GFlop/s, Time= %.3f msec, Size= %.0f Ops, WorkgroupSize= %u threads/block\n",
           gigaFlops, msecPerMatrixMul, flopsPerMatrixMul, block.x * block.y);

    // Device-to-host copy of the result (fixed "memoory" typo in the
    // original message).
    error = cudaMemcpy(P, Pd, size, cudaMemcpyDeviceToHost);
    if (error != cudaSuccess) {
        printf("Device memory copy back for Pd failed \n");
        exit(-1);
    }

    // Host reference multiply, compared element-wise against the GPU result.
    printf("Very slow Host Matrix Mult \n");
    float temp;
    int mismatches = 0;
    for (int i = 0; i < width; ++i) {
        for (int j = 0; j < width; ++j) {
            temp = 0;
            for (int k = 0; k < width; ++k)
                temp += M[i * width + k] * N[k * width + j];
            // Relative tolerance: host and device may round differently.
            if (fabsf(temp - P[i * width + j]) >
                1e-4f * fabsf(temp) + 1e-5f)
                ++mismatches;
        }
    }
    if (mismatches == 0)
        printf("Result verification PASSED\n");
    else
        printf("Result verification FAILED: %d mismatches\n", mismatches);

    // Cleanup.
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    cudaFree(Md);
    cudaFree(Nd);
    cudaFree(Pd);
    free(M);
    free(N);
    free(P);
    return (mismatches == 0) ? 0 : 1;
}