CUDA编程--实现并行矩阵乘法【80行代码】

- 日理万妓 2022-04-03 15:19 280阅读 0赞

简述

这里只写了方阵之间的乘法,但是本质上都是一样的。

  • 我测试过100规模的方阵之间的乘法,没有问题。

代码

  • 读取文件data.txt
  • 数据格式就是一个数值N,然后来连续的两个N*N的矩阵。用空格隔开。

    #include "cuda_runtime.h"

    #include "device_launch_parameters.h"

    #include <iostream>

    #include <fstream>

    #include <stdio.h>

    // Kernel:
    __global__ void MatrixMultiply(int *a, int *b, int *c, int N) {

    1. int tx = threadIdx.x + blockIdx.x * blockDim.x;
    2. int ty = threadIdx.y + blockIdx.y * blockDim.y;
    3. if (tx < N && ty < N) {
    4. int sum = 0;
    5. for (int k = 0; k < N; ++k) {
    6. int adata = a[tx * N + k];
    7. int bdata = b[k * N + ty];
    8. sum += adata * bdata;
    9. }
    10. c[tx * N + ty] = sum;
    11. }

    }

    cudaError_t matrixMultiplyWithCuda(int *a, int *b, int *c, size_t size);

    int main()
    {

    1. std::ifstream in("data.txt");
    2. int N;
    3. in >> N;
    4. if (in.fail()) {
    5. printf("Something wrong\n");
    6. }
    7. else {
    8. printf("Success read\n");
    9. }
    10. // host initial
    11. int *a = new int[N * N];
    12. int *b = new int[N * N];
    13. int *c = new int[N * N];
    14. // read
    15. for (int i = 0; i < N; ++i)
    16. for (int j = 0; j < N; ++j) in >> a[i * N + j];
    17. for (int i = 0; i < N; ++i)
    18. for (int j = 0; j < N; ++j) in >> b[i * N + j];
    19. cudaError_t cudaStatus = matrixMultiplyWithCuda(a, b, c, N);
    20. for (int i = 0; i < N; ++i) {
    21. for (int j = 0; j < N; ++j) std::cout << c[i * N + j]<<" ";
    22. std::cout << std::endl;
    23. }
    24. cudaStatus = cudaThreadExit();
    25. // host free
    26. delete[] a;
    27. delete[] b;
    28. delete[] c;
    29. return 0;

    }
    cudaError_t matrixMultiplyWithCuda(int *a, int *b, int *c, size_t N) {

    1. int *dev_a = 0;
    2. int *dev_b = 0;
    3. int *dev_c = 0;
    4. cudaError_t cudaStatus;
    5. cudaStatus = cudaMalloc((void**)&dev_a, N * N * sizeof(int));
    6. cudaStatus = cudaMalloc((void**)&dev_b, N * N * sizeof(int));
    7. cudaStatus = cudaMalloc((void**)&dev_c, N * N * sizeof(int));
    8. cudaStatus = cudaMemcpy(dev_a, a, N * N * sizeof(int), cudaMemcpyHostToDevice);
    9. cudaStatus = cudaMemcpy(dev_b, b, N * N * sizeof(int), cudaMemcpyHostToDevice);
    10. if (cudaStatus != cudaSuccess) {
    11. printf("Something wrong\n");
    12. goto Error;
    13. }
    14. // kernal invocation
    15. dim3 threadPerBlock(32, 32);
    16. dim3 numBlocks(N / threadPerBlock.x + 1, N / threadPerBlock.y + 1);
    17. MatrixMultiply<<<numBlocks, threadPerBlock>>>(dev_a, dev_b, dev_c, N);
    18. if (cudaStatus != cudaSuccess) {
    19. printf( "Calculate wrong\n");
    20. goto Error;
    21. }
    22. cudaStatus = cudaMemcpy(c, dev_c, N * N * sizeof(int), cudaMemcpyDeviceToHost);

    Error:

    1. cudaFree(dev_a);
    2. cudaFree(dev_b);
    3. cudaFree(dev_c);
    4. return cudaStatus;

    }

写入文件的版本

(也改成了浮点数运算了)

  1. #include "cuda_runtime.h"
  2. #include "device_launch_parameters.h"
  3. #include <iostream>
  4. #include <fstream>
  5. #include <stdio.h>
  6. // Kernal:
  7. __global__ void MatrixMultiply(float *a, float * b, float *c, int N) {
  8. int tx = threadIdx.x + blockIdx.x * blockDim.x;
  9. int ty = threadIdx.y + blockIdx.y * blockDim.y;
  10. if (tx < N && ty < N) {
  11. float sum = 0;
  12. for (int k = 0; k < N; ++k) {
  13. float adata = a[tx * N + k];
  14. float bdata = b[k * N + ty];
  15. sum += adata * bdata;
  16. }
  17. c[tx * N + ty] = sum;
  18. }
  19. }
  20. cudaError_t matrixMultiplyWithCuda(float *a, float *b, float *c, size_t size);
  21. int main()
  22. {
  23. std::ifstream in("data.txt");
  24. int N;
  25. in >> N;
  26. if (in.fail()) {
  27. printf("Something wrong\n");
  28. }
  29. else {
  30. printf("Success read\n");
  31. }
  32. // host initial
  33. float *a = new float[N * N];
  34. float *b = new float[N * N];
  35. float *c = new float[N * N];
  36. // read
  37. for (int i = 0; i < N; ++i)
  38. for (int j = 0; j < N; ++j) in >> a[i * N + j];
  39. for (int i = 0; i < N; ++i)
  40. for (int j = 0; j < N; ++j) in >> b[i * N + j];
  41. cudaError_t cudaStatus = matrixMultiplyWithCuda(a, b, c, N);
  42. std::ofstream out("output.txt");
  43. for (int i = 0; i < N; ++i) {
  44. for (int j = 0; j < N; ++j) out << c[i * N + j]<<" ";
  45. out << std::endl;
  46. }
  47. cudaStatus = cudaThreadExit();
  48. // host free
  49. delete[] a;
  50. delete[] b;
  51. delete[] c;
  52. return 0;
  53. }
  54. cudaError_t matrixMultiplyWithCuda(float *a, float *b, float *c, size_t N) {
  55. float *dev_a = 0;
  56. float *dev_b = 0;
  57. float *dev_c = 0;
  58. cudaError_t cudaStatus;
  59. cudaStatus = cudaMalloc((void**)&dev_a, N * N * sizeof(int));
  60. cudaStatus = cudaMalloc((void**)&dev_b, N * N * sizeof(int));
  61. cudaStatus = cudaMalloc((void**)&dev_c, N * N * sizeof(int));
  62. cudaStatus = cudaMemcpy(dev_a, a, N * N * sizeof(int), cudaMemcpyHostToDevice);
  63. cudaStatus = cudaMemcpy(dev_b, b, N * N * sizeof(int), cudaMemcpyHostToDevice);
  64. if (cudaStatus != cudaSuccess) {
  65. printf("Something wrong\n");
  66. goto Error;
  67. }
  68. // kernal invocation
  69. dim3 threadPerBlock(32, 32);
  70. dim3 numBlocks(N / threadPerBlock.x + 1, N / threadPerBlock.y + 1);
  71. MatrixMultiply<<<numBlocks, threadPerBlock>>>(dev_a, dev_b, dev_c, N);
  72. if (cudaStatus != cudaSuccess) {
  73. printf( "Calculate wrong\n");
  74. goto Error;
  75. }
  76. cudaStatus = cudaMemcpy(c, dev_c, N * N * sizeof(int), cudaMemcpyDeviceToHost);
  77. Error:
  78. cudaFree(dev_a);
  79. cudaFree(dev_b);
  80. cudaFree(dev_c);
  81. return cudaStatus;
  82. }

发表评论

表情:
评论列表 (有 0 条评论,280人围观)

还没有评论,来说两句吧...

相关阅读

    相关 实现矩阵乘法

    完成函数MulMatrices的编写,该函数实现3\3的矩阵乘法,将矩阵a和b相乘,结果存入矩阵c中。输入分别为矩阵a和矩阵b。 示例输入: 1 0 0 0 1 0 0