【CUDA并行编程之四】矩阵相乘

小灰灰 2022-08-01 12:17 255阅读 0赞

前面介绍了基本的Cuda编程的相关知识,那么这一篇在此基础之上来看看GPU在处理数据计算上的高效能,我们拿矩阵相乘来作为例子。

1.CPU上执行矩阵相乘以及性能。

在CPU上进行矩阵相乘运算的代码:

mat_mul.cc:

[cpp] view plain copy 在CODE上查看代码片 派生到我的代码片

  1. //a[i]*b[i] + c[i] = d[i]
  2. #include <iostream>
  3. #include <fstream>
  4. #include <stdio.h>
  5. #include <stdlib.h>
  6. #include "wtime.h"
  7. using namespace std;
  8. const int N = 320;
  9. //矩阵有两种表达的方法用二维矩阵或者用一维矩阵表示
  10. int a[N+1][N+1],b[N+1][N+1],c[N+1][N+1],d[N+1][N+1];
  11. int aa[(N+1)*(N+1)],bb[(N+1)*(N+1)],cc[(N+1)*(N+1)],dd[(N+1)*(N+1)];
  12. void init()
  13. {
  14. for(int i=0;i<N;i++)
  15. for(int j=0;j<N;j++)
  16. {
  17. a[i][j] = 1;
  18. b[i][j] = 2;
  19. c[i][j] = 3;
  20. }
  21. }
  22. void init1()
  23. {
  24. for(int i=0;i<N;i++)
  25. for(int j=0;j<N;j++)
  26. {
  27. aa[i*N+j] = 1;
  28. bb[i*N+j] = 2;
  29. cc[i*N+j] = 3;
  30. }
  31. }
  32. void mul()
  33. {
  34. for(int i=0;i<N;i++)
  35. for(int j=0;j<N;j++)
  36. {
  37. for(int k=0;k<N;k++)
  38. {
  39. d[i][j] += a[i][k] * b[k][j];
  40. }
  41. d[i][j] += c[i][j];
  42. }
  43. }
  44. void mul1()
  45. {
  46. for(int i=0;i<N;i++)
  47. for(int j=0;j<N;j++)
  48. {
  49. for(int k=0;k<N;k++)
  50. {
  51. dd[i*N+j] += aa[i*N+k] * bb[k*N+j];
  52. }
  53. dd[N*i+j] += cc[N*i+j];
  54. }
  55. }
  56. void print()
  57. {
  58. ofstream fout;
  59. fout.open("result.txt");
  60. if(!fout)
  61. {
  62. perror("can not open the file");
  63. }
  64. for(int i=0;i<N;i++)
  65. {
  66. for(int j=0;j<N;j++)
  67. {
  68. fout<<d[i][j]<<" ";
  69. }
  70. fout<<endl;
  71. }
  72. fout.close();
  73. }
  74. int main()
  75. {
  76. init1();
  77. double t = wtime();
  78. mul1();
  79. t = wtime()-t;
  80. printf("computation timing = %10.10f sec\n",t);
  81. //print();
  82. return 0;
  83. }

wtime.h:

[cpp] view plain copy 在CODE上查看代码片 派生到我的代码片

  1. #ifndef _WTIME_
  2. #define _WTIME_
  3. double wtime();
  4. #endif

wtime.cc:

[cpp] view plain copy 在CODE上查看代码片 派生到我的代码片

  1. #include <stdio.h>
  2. #include <stdlib.h>
  3. #include <sys/time.h>
  4. #include <time.h>
  5. double wtime(void)
  6. {
  7. double now_time;
  8. struct timeval etstart;
  9. struct timezone tzp;
  10. if(gettimeofday(&etstart,&tzp)==-1)
  11. {
  12. perror("Error:calling gettimeofday() not successfully.\n");
  13. }
  14. now_time = ( (double)etstart.tv_sec ) + ((double)etstart.tv_usec) / 1000000.0;
  15. return now_time;
  16. }
  17. #if 0
  18. int main()
  19. {
  20. double time;
  21. time = wtime();
  22. printf("time of day = %10.4f\n",time);
  23. return 0;
  24. }
  25. #endif

makefile:

[cpp] view plain copy 在CODE上查看代码片 派生到我的代码片

  1. target:
  2. g++ mat_mul.cc wtime.cc
  3. ./a.out

结果:

SouthEast

2.GPU上执行矩阵相乘以及性能。

代码:

cuda_mat_mul_v1.cu:

[cpp] view plain copy 在CODE上查看代码片 派生到我的代码片

  1. //matrix multiplication with global memory
  2. #include <iostream>
  3. #include <fstream>
  4. #include "wtime.h"
  5. using namespace std;
  6. const int BLOCK_SIZE = 16;
  7. const int GRID_SIZE = 20;
  8. //D = A * B + C;
  9. __global__ void mat_mul(int *da,int *db,int *dc,int *dd,int N)
  10. {
  11. int row = blockIdx.y * blockDim.y + threadIdx.y;
  12. int col = blockIdx.x * blockDim.x + threadIdx.x;
  13. int sum = 0;
  14. for(int i=0;i<N;i++)
  15. {
  16. sum += da[row*N + i] * db[i*N + col];
  17. }
  18. dd[row*N + col] = sum + dc[row*N + col];
  19. }
  20. int main()
  21. {
  22. int N = BLOCK_SIZE * GRID_SIZE;
  23. int *ha,*hb,*hc,*hd;
  24. int *da,*db,*dc,*dd;
  25. double time;
  26. ha = new int[N*N];
  27. hb = new int[N*N];
  28. hc = new int[N*N];
  29. hd = new int[N*N];
  30. cudaError_t err;
  31. //initialize
  32. for(int i=0;i<N;i++)
  33. for(int j=0;j<N;j++)
  34. {
  35. ha[i*N+j] = 1;
  36. hb[i*N+j] = 2;
  37. hc[i*N+j] = 3;
  38. }
  39. //malloc
  40. cudaMalloc(&da,N*N*sizeof(int));
  41. cudaMalloc(&db,N*N*sizeof(int));
  42. cudaMalloc(&dc,N*N*sizeof(int));
  43. err = cudaMalloc(&dd,N*N*sizeof(int));
  44. printf("Cuda Malloc C : %s\n",cudaGetErrorString(err));
  45. //host to device
  46. cudaMemcpy(da,ha,N*N*sizeof(int),cudaMemcpyHostToDevice);
  47. cudaMemcpy(db,hb,N*N*sizeof(int),cudaMemcpyHostToDevice);
  48. cudaMemcpy(dc,hc,N*N*sizeof(int),cudaMemcpyHostToDevice);
  49. cudaMemcpy(dd,hd,N*N*sizeof(int),cudaMemcpyHostToDevice);
  50. dim3 threadBlock(BLOCK_SIZE,BLOCK_SIZE);
  51. dim3 grid(GRID_SIZE,GRID_SIZE);
  52. //kernel
  53. time = wtime();
  54. mat_mul<<<grid,threadBlock>>>(da,db,dc,dd,N);
  55. printf("Computation time is %10.10f\n",wtime()-time);
  56. //device to host
  57. cudaMemcpy(hd,dd,N*N*sizeof(int),cudaMemcpyDeviceToHost);
  58. //print result to file
  59. ofstream fout;
  60. fout.open("result_v1.txt");
  61. if(!fout)
  62. {
  63. cerr<<"open the file error"<<endl;
  64. exit(-1);
  65. }
  66. for(int i=0;i<N;i++)
  67. {
  68. for(int j=0;j<N;j++)
  69. {
  70. fout<<hd[i*N+j]<<" ";
  71. }
  72. fout<<endl;
  73. }
  74. delete []ha;delete []hb;delete []hc;delete []hd;
  75. cudaFree(da);cudaFree(db);cudaFree(dc);cudaFree(dd);
  76. return 0;
  77. }

cuda_wtime.cu:

[cpp] view plain copy 在CODE上查看代码片 派生到我的代码片

  1. #include <stdio.h>
  2. #include <stdlib.h>
  3. #include <sys/time.h>
  4. #include <time.h>
  5. double wtime(void)
  6. {
  7. double now_time;
  8. struct timeval etstart;
  9. struct timezone tzp;
  10. if(gettimeofday(&etstart,&tzp)==-1)
  11. {
  12. perror("Error:calling gettimeofday() not successfully.\n");
  13. }
  14. now_time = ( (double)etstart.tv_sec ) + ((double)etstart.tv_usec) / 1000000.0;
  15. return now_time;
  16. }
  17. #if 0
  18. int main()
  19. {
  20. double time;
  21. time = wtime();
  22. printf("time of day = %10.4f\n",time);
  23. return 0;
  24. }
  25. #endif

wtime.h:

[cpp] view plain copy 在CODE上查看代码片 派生到我的代码片

  1. #ifndef _WTIME_
  2. #define _WTIME_
  3. double wtime();
  4. #endif

cuda_wtime.cu:

[cpp] view plain copy 在CODE上查看代码片 派生到我的代码片

  1. #include <stdio.h>
  2. #include <stdlib.h>
  3. #include <sys/time.h>
  4. #include <time.h>
  5. double wtime(void)
  6. {
  7. double now_time;
  8. struct timeval etstart;
  9. struct timezone tzp;
  10. if(gettimeofday(&etstart,&tzp)==-1)
  11. {
  12. perror("Error:calling gettimeofday() not successfully.\n");
  13. }
  14. now_time = ( (double)etstart.tv_sec ) + ((double)etstart.tv_usec) / 1000000.0;
  15. return now_time;
  16. }
  17. #if 0
  18. int main()
  19. {
  20. double time;
  21. time = wtime();
  22. printf("time of day = %10.4f\n",time);
  23. return 0;
  24. }
  25. #endif

makefile:

[cpp] view plain copy 在CODE上查看代码片 派生到我的代码片

  1. cu:
  2. nvcc cuda_mat_mul_v1.cu cuda_wtime.cu
  3. ./a.out

结果:

SouthEast 1

3.计算性能对比:









































矩阵大小
1600×1600

1200×1200


800×800


320×320


串行时间/s


30.9


11.49865


2.597987


0.162311


并行时间
grid=100/block=16

grid=75/block=16


grid=50/block=16
grid=20/block=16


kernel执行时间/s


0.0000319


0.0000309944


0.0000309944


0.0000231266


并行计算总时间(分配内存加+数据拷贝+计算)/s


0.70796


0.439213


0.310214


0.237676

可见,在矩阵规模大的时候非常明显的体现出了GPU强大的计算能力。

注明出处:http://blog.csdn.net/lavorange/article/details/41896591

发表评论

表情:
评论列表 (有 0 条评论,255人围观)

还没有评论,来说两句吧...

相关阅读