CUDA并行计算框架编程+矩阵相乘并行计算

清疚 2022-05-25 21:06 342阅读 0赞

当下的GPGPU(General-Purpose computing on GPU,通用计算图形处理器)通常通过 CUDA(Compute Unified Device Architecture,统一计算设备架构)进行编程。

安装过程可参考我的另一篇blog:https://blog.csdn.net/pnan222/article/details/79449923

CUDA软件架构:(1)开发库(CUDA Library)(2)运行时环境(CUDA Runtime)(3)驱动(CUDA Driver)

CUDA的线程层次结构:Kernel—>Grid—>Block—>Thread

GPU硬件的一个核心组件是SM(Streaming Multiprocessor)流式多处理器,SM可以并发地执行数百个线程,且一个block对应一个SM,而一个SM则可以对应多个block

grid只是逻辑层;而SM才是真正的物理层;block的大小一般要设置成32的倍数

在VS上的配置过程可参考blog:https://blog.csdn.net/kyocen/article/details/51424161

Code:

  1. #include <iostream>
  2. #include <assert.h>
  3. #include <time.h>
  4. #include <Windows.h>
  5. //#include <mmsystem.h>
  6. #include "cuda_runtime.h"
  7. #include "device_launch_parameters.h"
  8. #define DWORD unsigned long
  9. #pragma comment(lib,"winmm.lib")
  10. using namespace std;
  11. __global__ void add(float* a, float* b, float* c, int N)
  12. {
  13. for (int i = 0; i < N; i++)
  14. {
  15. c[i] = a[i] + b[i];
  16. }
  17. }
  18. int main() {
  19. int N = 1 << 15;
  20. int nBytes = N * sizeof(float);
  21. //申请host内存
  22. float *x, *y, *z;
  23. x = (float*)malloc(nBytes);
  24. y = (float*)malloc(nBytes);
  25. z = (float*)malloc(nBytes);
  26. //初始化数据
  27. for (int i = 0; i < N; i++)
  28. {
  29. x[i] = 10.0;
  30. y[i] = 20.0;
  31. }
  32. //申请device内存
  33. float *d_x, *d_y, *d_z;
  34. cudaMalloc((void**)&d_x, nBytes);
  35. cudaMalloc((void**)&d_y, nBytes);
  36. cudaMalloc((void**)&d_z, nBytes);
  37. //将host数据拷贝到device
  38. cudaMemcpy((void*)d_x, (void*)x, nBytes, cudaMemcpyHostToDevice);
  39. cudaMemcpy((void*)d_y, (void*)y, nBytes, cudaMemcpyHostToDevice);
  40. //定义kernel的执行配置
  41. dim3 blockSize(256);
  42. dim3 gridSize((int)((N + blockSize.x - 1) / blockSize.x));
  43. //编写计时函数
  44. DWORD t1, t2;
  45. t1 = timeGetTime();
  46. //执行kernel
  47. add << < gridSize, blockSize >> >(d_x, d_y, d_z, N);
  48. t2 = timeGetTime();
  49. printf("Use Time:%f (s)\n", (t2 - t1)*1.0 / 1000);
  50. //将device得到的结果拷贝到host
  51. cudaMemcpy((void*)z, (void*)d_z, nBytes, cudaMemcpyDeviceToHost);
  52. //检查执行结果
  53. float maxError = 0.0;
  54. for (int i = 0; i < N; i++)
  55. {
  56. maxError = fmax(maxError, fabs(z[i] - 30.0));
  57. }
  58. cout << "最大误差:" << maxError << endl;
  59. //释放device内存
  60. cudaFree(d_x);
  61. cudaFree(d_y);
  62. cudaFree(d_z);
  63. //释放host内存
  64. free(x);
  65. free(y);
  66. free(z);
  67. system("pause");
  68. return 0;
  69. }

实现矩阵相乘并行计算的代码:

  1. #include <iostream>
  2. #include <assert.h>
  3. #include <time.h>
  4. #include <Windows.h>
  5. //#include <mmsystem.h>
  6. #include "cuda_runtime.h"
  7. #include "device_launch_parameters.h"
  8. #define DWORD unsigned long
  9. #pragma comment(lib,"winmm.lib")
  10. using namespace std;
  11. __global__ void add(float* a, float* b, float* c, int N)
  12. {
  13. for (int i = 0; i < N; i++)
  14. {
  15. c[i] = a[i] + b[i];
  16. }
  17. }
  18. __global__ void MatrixMuiOnDevice(int *M, int *N, int *P, int width)
  19. {
  20. int x = threadIdx.x;
  21. int y = threadIdx.y; //获取该线程的位置
  22. float Pervalue = 0;
  23. for (int i = 0; i < width; i++)
  24. {
  25. float Mdlement = M[y * width + i];
  26. float Ndlement = N[width * i + x];
  27. Pervalue += Mdlement * Ndlement;
  28. }
  29. P[y * width + x] = Pervalue;
  30. }
  31. int main() {
  32. int a[30][30], b[30][30], c[30][30];
  33. int *M, *N, *P;
  34. int width = 30;
  35. int NUM = 900;
  36. dim3 dimBlock(30, 30);
  37. cudaEvent_t start, stop;
  38. float elapsedTime;
  39. cudaEventCreate(&start);
  40. cudaEventCreate(&stop);
  41. cudaMalloc((void**)&M, 900 * sizeof(int));
  42. cudaMalloc((void**)&N, 900 * sizeof(int));
  43. cudaMalloc((void**)&P, 900 * sizeof(int));
  44. //初始化
  45. for (int i = 0; i < 30; i++)
  46. {
  47. for (int j = 0; j < 30; j++)
  48. {
  49. a[i][j] = 2;
  50. b[i][j] = 3;
  51. }
  52. }
  53. cudaMemcpy(M, a, NUM * sizeof(int), cudaMemcpyHostToDevice);
  54. cudaMemcpy(N, b, NUM * sizeof(int), cudaMemcpyHostToDevice);
  55. cudaEventRecord(start, 0);
  56. MatrixMuiOnDevice << <1, dimBlock >> >(M, N, P, width);
  57. cudaThreadSynchronize();
  58. cudaEventRecord(stop, 0);
  59. cudaEventSynchronize(stop);
  60. cudaEventElapsedTime(&elapsedTime, start, stop);
  61. printf("%f\n", elapsedTime);
  62. cudaMemcpy(c, P, NUM * sizeof(int), cudaMemcpyDeviceToHost);
  63. for (int i = 0; i < 30; i++)
  64. {
  65. for (int j = 0; j < 30; j++)
  66. {
  67. printf("%d ", c[i][j]);
  68. }
  69. cout << endl;
  70. }
  71. cudaFree(M);
  72. cudaFree(N);
  73. cudaFree(P);
  74. system("pause");
  75. return 0;
  76. }

发表评论

表情:
评论列表 (有 0 条评论,342人围观)

还没有评论,来说两句吧...

相关阅读

    相关 并行计算简介

    1 什么是并行计算? 串行计算: 传统的软件通常被设计成为串行计算模式,具有如下特点: 一个问题被分解成为一系列离散的指令; 这些指令被顺次执行;

    相关 CUDA 并行计算

    CUDA 并行计算 并行计算可以被定义为同时使用许多计算资源 (核心或计算机) 来执行并发计算,一个大的问题可以被分解成多个小问题,然后在不同的计算资源上并行处理这些小