多GPU多个流上实现复制与计算的重叠

缺乏、安全感 2022-11-17 14:41 199阅读 0赞
  • 获取应用程序可使用的 GPU 的数量
  • 激活任意可用的 GPU
  • 在多个 GPU 上分配显存
  • 在多个 GPU 上传入和转出显存数据
  • 在多个 GPU 上启动核函数

获取多个 GPU 的相关信息

如要以运行程序的方式得出可用 GPU 的数量,请使用 cudaGetDeviceCount

  1. uint64_t num_gpus;
  2. cudaGetDeviceCount(&num_gpus);

如要以运行程序的方式得到当前处于活动状态的 GPU,请使用 cudaGetDevice:

  1. uint64_t device;
  2. cudaGetDevice(&device); // `device` is now a 0-based index of the current GPU.

设置当前的 GPU

对于每个主机线程,每次只有一个 GPU 设备处于活动状态。如要将特定的 GPU 设置为活动状态,请使用 cudaSetDevice 以及所需 GPU 的索引(从 0 开始):

  1. cudaSetDevice(0); // Make GPU 0 the active device for this host thread.

循环使用可用的 GPU

一种常见的模式为,遍历可用的 GPU,并为每个 GPU 执行相应操作:

  1. uint64_t num_gpus;
  2. cudaGetDeviceCount(&num_gpus);
  3. for (uint64_t gpu = 0; gpu < num_gpus; gpu++) {
  4. cudaSetDevice(gpu);
  5. // Perform operations for this GPU.
  6. }

为多个 GPU 执行数据分块

与多个非默认流相同,多个 GPU 中的每个 GPU 都可处理一个数据块。我们将创建和利用数据指针数组,为每个可用的 GPU 分配显存:

  1. uint64_t num_gpus;
  2. cudaGetDeviceCount(&num_gpus);
  3. const uint64_t num_entries = 1UL << 26;
  4. const uint64_t chunk_size = sdiv(num_entries, num_gpus);
  5. uint64_t *data_gpu[num_gpus]; // One pointer for each GPU.
  6. for (uint64_t gpu = 0; gpu < num_gpus; gpu++) {
  7. cudaSetDevice(gpu);
  8. const uint64_t lower = chunk_size*gpu;
  9. const uint64_t upper = min(lower+chunk_size, num_entries);
  10. const uint64_t width = upper-lower;
  11. cudaMalloc(&data_gpu[gpu], sizeof(uint64_t)*width); // Allocate chunk of data for current GPU.
  12. }

为多个 GPU 复制数据
通过使用相同的循环遍历和分块技术,我们可在多个 GPU 上传入和传出数据:

  1. // ...Assume data has been allocated on host and for each GPU
  2. for (uint64_t gpu = 0; gpu < num_gpus; gpu++) {
  3. cudaSetDevice(gpu);
  4. const uint64_t lower = chunk_size*gpu;
  5. const uint64_t upper = min(lower+chunk_size, num_entries);
  6. const uint64_t width = upper-lower;
  7. // Note use of `cudaMemcpy` and not `cudaMemcpyAsync` since we are not
  8. // presently using non-default streams.
  9. cudaMemcpy(data_gpu[gpu], data_cpu+lower,
  10. sizeof(uint64_t)*width, cudaMemcpyHostToDevice); // ...or cudaMemcpyDeviceToHost
  11. }

为多个 GPU 启动核函数

通过使用相同的循环遍历和分块技术,我们可在多个 GPU 上启动核函数并处理数据块:

  1. // ...Assume data has been allocated on host and for each GPU
  2. for (uint64_t gpu = 0; gpu < num_gpus; gpu++) {
  3. cudaSetDevice(gpu);
  4. const uint64_t lower = chunk_size*gpu;
  5. const uint64_t upper = min(lower+chunk_size, num_entries);
  6. const uint64_t width = upper-lower;
  7. kernel<<<grid, block>>>(data_gpu[gpu], width); // Pass chunk of data for current GPU to work on.
  8. }

在上面,我们使用深度优先的方法将一部分工作依次传递给每个 GPU。在某些情况下,尤其是在数据量极大的情况下,使用宽度优先的方法可能更有意义。这种方法上的改变并不需要额外的 CUDA 知识;相关的 Stack Overflow 回答中提供了分别采用深度优先和宽度优先方法的 CUDA 代码示例。

关于在多个 GPU 之间进行对等(peer-to-peer)内存传输,以及在多个节点上使用多个 GPU 的内容,可参考相关的超级计算会议演示文稿。

例子:
多个GPU使用默认流

  1. #include <cstdint>
  2. #include <iostream>
  3. #include "helpers.cuh"
  4. #include "encryption.cuh"
  5. void encrypt_cpu(uint64_t * data, uint64_t num_entries,
  6. uint64_t num_iters, bool parallel=true) {
  7. #pragma omp parallel for if (parallel)
  8. for (uint64_t entry = 0; entry < num_entries; entry++)
  9. data[entry] = permute64(entry, num_iters);
  10. }
  11. __global__
  12. void decrypt_gpu(uint64_t * data, uint64_t num_entries,
  13. uint64_t num_iters) {
  14. const uint64_t thrdID = blockIdx.x*blockDim.x+threadIdx.x;
  15. const uint64_t stride = blockDim.x*gridDim.x;
  16. for (uint64_t entry = thrdID; entry < num_entries; entry += stride)
  17. data[entry] = unpermute64(data[entry], num_iters);
  18. }
  19. bool check_result_cpu(uint64_t * data, uint64_t num_entries,
  20. bool parallel=true) {
  21. uint64_t counter = 0;
  22. #pragma omp parallel for reduction(+: counter) if (parallel)
  23. for (uint64_t entry = 0; entry < num_entries; entry++)
  24. counter += data[entry] == entry;
  25. return counter == num_entries;
  26. }
  27. int main (int argc, char * argv[]) {
  28. Timer timer;
  29. Timer overall;
  30. const uint64_t num_entries = 1UL << 26;
  31. const uint64_t num_iters = 1UL << 10;
  32. const bool openmp = true;
  33. timer.start();
  34. uint64_t * data_cpu;
  35. cudaMallocHost(&data_cpu, sizeof(uint64_t)*num_entries);
  36. // cudaMalloc (&data_gpu, sizeof(uint64_t)*num_entries);
  37. timer.stop("allocate memory");
  38. check_last_error();
  39. timer.start();
  40. encrypt_cpu(data_cpu, num_entries, num_iters, openmp);
  41. timer.stop("encrypt data on CPU");
  42. overall.start();
  43. timer.start();
  44. int num_gpus;
  45. cudaGetDeviceCount(&num_gpus);
  46. const uint64_t chunk_size = sdiv(num_entries, num_gpus);
  47. uint64_t *data_gpu[num_gpus]; // One pointer for each GPU.
  48. for (uint64_t gpu = 0; gpu < num_gpus; gpu++) {
  49. cudaSetDevice(gpu);
  50. const uint64_t lower = chunk_size*gpu;
  51. const uint64_t upper = min(lower+chunk_size, num_entries);
  52. const uint64_t width = upper-lower;
  53. cudaMalloc(&data_gpu[gpu], sizeof(uint64_t)*width); // Allocate chunk of data for current GPU.
  54. cudaMemcpy(data_gpu[gpu], data_cpu+lower,
  55. sizeof(uint64_t)*width, cudaMemcpyHostToDevice); // ...or cudaMemcpyDeviceToHost
  56. }
  57. for (uint64_t gpu = 0; gpu < num_gpus; gpu++) {
  58. cudaSetDevice(gpu);
  59. const uint64_t lower = chunk_size*gpu;
  60. const uint64_t upper = min(lower+chunk_size, num_entries);
  61. const uint64_t width = upper-lower;
  62. decrypt_gpu<<<32*80, 64>>>(data_gpu[gpu], width, num_iters); // Pass chunk of data for current GPU to work on.
  63. }
  64. for (uint64_t gpu = 0; gpu < num_gpus; gpu++) {
  65. cudaSetDevice(gpu);
  66. const uint64_t lower = chunk_size*gpu;
  67. const uint64_t upper = min(lower+chunk_size, num_entries);
  68. const uint64_t width = upper-lower;
  69. cudaMemcpy(data_cpu+lower, data_gpu[gpu],
  70. sizeof(uint64_t)*width, cudaMemcpyDeviceToHost);
  71. }
  72. timer.stop("multi GPU times");
  73. check_last_error();
  74. overall.stop("total time on GPU");
  75. check_last_error();
  76. timer.start();
  77. const bool success = check_result_cpu(data_cpu, num_entries, openmp);
  78. std::cout << "STATUS: test "
  79. << ( success ? "passed" : "failed")
  80. << std::endl;
  81. timer.stop("checking result on CPU");
  82. timer.start();
  83. cudaFreeHost(data_cpu);
  84. // cudaFree (data_gpu);
  85. for (uint64_t gpu = 0; gpu < num_gpus; gpu++) {
  86. cudaFree(data_gpu[gpu]);
  87. }
  88. timer.stop("free memory");
  89. check_last_error();
  90. }

在上面的代码中,可以看到内存传输没有重叠。为什么会这样?
因为代码既没有使用非默认流,也没有使用 cudaMemcpyAsync 进行内存复制,因此这些传输操作都是阻塞的。

在多个 GPU上实现数据复制与计算的重叠

  • 流与每个 GPU 设备是如何关联的
  • 如何为多个 GPU 创建非默认流
  • 如何在多个 GPU 上实现复制与计算的重叠

每个 GPU 都有各自的默认流。我们可以为当前处于活动状态的 GPU 设备创建、使用和销毁非默认流。切记不要在未与当前处于活动状态的 GPU 建立关联的流中启动核函数

为多个 GPU 创建多个非默认流
在多个 GPU 上使用多个非默认流时,与之前不同的是,我们不是简单地将流存储在数组中,而是将其存储于二维数组中,且数组中的每一行皆包含单个 GPU 的流:

  1. cudaStream_t streams[num_gpus][num_streams]; // 2D array containing number of streams for each GPU.
  2. // For each available GPU...
  3. for (uint64_t gpu = 0; gpu < num_gpus; gpu++) {
  4. // ...set as active device...
  5. cudaSetDevice(gpu);
  6. for (uint64_t stream = 0; stream < num_streams; stream++)
  7. // ...create and store its number of streams.
  8. cudaStreamCreate(&streams[gpu][stream]);
  9. }

多个 GPU 上多流的数据块大小
当在多个 GPU 上使用多个非默认流时,全局数据索引尤为棘手。为帮助实现索引,我们可以为单个流和整个 GPU 分别定义数据块大小。

  1. // Each stream needs num_entries/num_gpus/num_streams data. We use round up division for
  2. // reasons previously discussed.
  3. const uint64_t stream_chunk_size = sdiv(sdiv(num_entries, num_gpus), num_streams);
  4. // It will be helpful to also to have handy the chunk size for an entire GPU.
  5. const uint64_t gpu_chunk_size = stream_chunk_size*num_streams;

为多个 GPU 的多个流分配显存
GPU 的显存并未分配给各个流,所以此处的分配操作看起来与之前的多 GPU 任务相似,我们只需注意数据块的大小是分配给整个 GPU 的而非其中一个流的即可:

  1. ```c
  2. // For each GPU...
  3. for (uint64_t gpu = 0; gpu < num_gpus; gpu++) {
  4. // ...set device as active...
  5. cudaSetDevice(gpu);
  6. // ...use a GPU chunk's worth of data to calculate indices and width...
  7. const uint64_t lower = gpu_chunk_size*gpu;
  8. const uint64_t upper = min(lower+gpu_chunk_size, num_entries);
  9. const uint64_t width = upper-lower;
  10. // ...allocate data.
  11. cudaMalloc(&data_gpu[gpu], sizeof(uint64_t)*width);
  12. }

在多个 GPU 的多个流上实现复制与计算的重叠

  1. #include <cstdint>
  2. #include <iostream>
  3. #include "helpers.cuh"
  4. #include "encryption.cuh"
  5. void encrypt_cpu(uint64_t * data, uint64_t num_entries,
  6. uint64_t num_iters, bool parallel=true) {
  7. #pragma omp parallel for if (parallel)
  8. for (uint64_t entry = 0; entry < num_entries; entry++)
  9. data[entry] = permute64(entry, num_iters);
  10. }
  11. __global__
  12. void decrypt_gpu(uint64_t * data, uint64_t num_entries,
  13. uint64_t num_iters) {
  14. const uint64_t thrdID = blockIdx.x*blockDim.x+threadIdx.x;
  15. const uint64_t stride = blockDim.x*gridDim.x;
  16. for (uint64_t entry = thrdID; entry < num_entries; entry += stride)
  17. data[entry] = unpermute64(data[entry], num_iters);
  18. }
  19. bool check_result_cpu(uint64_t * data, uint64_t num_entries,
  20. bool parallel=true) {
  21. uint64_t counter = 0;
  22. #pragma omp parallel for reduction(+: counter) if (parallel)
  23. for (uint64_t entry = 0; entry < num_entries; entry++)
  24. counter += data[entry] == entry;
  25. return counter == num_entries;
  26. }
  27. int main (int argc, char * argv[]) {
  28. Timer timer;
  29. Timer overall;
  30. const uint64_t num_entries = 1UL << 26;
  31. const uint64_t num_iters = 1UL << 10;
  32. const bool openmp = true;
  33. timer.start();
  34. uint64_t * data_cpu;
  35. cudaMallocHost(&data_cpu, sizeof(uint64_t)*num_entries);
  36. timer.stop("allocate memory");
  37. check_last_error();
  38. timer.start();
  39. encrypt_cpu(data_cpu, num_entries, num_iters, openmp);
  40. timer.stop("encrypt data on CPU");
  41. int num_gpus;
  42. cudaGetDeviceCount(&num_gpus);
  43. uint64_t num_streams = 32;
  44. uint64_t *data_gpu[num_gpus];
  45. cudaStream_t streams[num_gpus][num_streams];
  46. uint64_t gpu_chunk_size = sdiv(num_entries, num_gpus);
  47. overall.start();
  48. for (uint64_t gpu = 0; gpu < num_gpus; gpu++) {
  49. cudaSetDevice(gpu);
  50. for (uint64_t s = 0; s < num_streams; s++) {
  51. cudaStreamCreate(&streams[gpu][s]);
  52. }
  53. uint64_t gpu_lower = gpu_chunk_size* gpu;
  54. uint64_t gpu_upper = min(gpu_lower+gpu_chunk_size, num_entries);
  55. uint64_t gpu_width = gpu_upper-gpu_lower;
  56. cudaMalloc(&data_gpu[gpu], sizeof(uint64_t)*gpu_width);
  57. }
  58. for (uint64_t gpu = 0; gpu < num_gpus; gpu++) {
  59. cudaSetDevice(gpu);
  60. uint64_t gpu_lower = gpu_chunk_size * gpu;
  61. uint64_t gpu_upper = min(gpu_lower+gpu_chunk_size, num_entries);
  62. uint64_t gpu_width = gpu_upper - gpu_lower;
  63. uint64_t s_chunk_size = sdiv(gpu_width, num_streams);
  64. for (uint64_t s = 0; s < num_streams; s++) {
  65. uint64_t s_offset = s * s_chunk_size;
  66. uint64_t s_lower = gpu_lower + s_offset;
  67. uint64_t s_upper = min(s_lower + s_chunk_size, gpu_upper);
  68. uint64_t s_width = s_upper - s_lower;
  69. cudaMemcpyAsync(data_gpu[gpu] + s_offset, data_cpu+s_lower,
  70. sizeof(uint64_t)*s_width, cudaMemcpyHostToDevice, streams[gpu][s]);
  71. decrypt_gpu<<<80*32, 64, 0, streams[gpu][s]>>>(data_gpu[gpu]+s_offset, s_width, num_iters);
  72. cudaMemcpyAsync(data_cpu+s_lower, data_gpu[gpu] + s_offset,
  73. sizeof(uint64_t)*s_width, cudaMemcpyDeviceToHost, streams[gpu][s]);
  74. }
  75. }
  76. check_last_error();
  77. for (uint64_t gpu = 0; gpu < num_gpus; gpu++) {
  78. cudaSetDevice(gpu);
  79. for (uint64_t s = 0; s < num_streams; s++) {
  80. cudaStreamSynchronize(streams[gpu][s]);
  81. }
  82. }
  83. for (uint64_t gpu = 0; gpu < num_gpus; gpu++) {
  84. cudaSetDevice(gpu);
  85. for (uint64_t s = 0; s < num_streams; s++) {
  86. cudaStreamDestroy(streams[gpu][s]);
  87. }
  88. }
  89. overall.stop("total time on GPU");
  90. check_last_error();
  91. timer.start();
  92. bool success = true;
  93. success = check_result_cpu(data_cpu, num_entries, openmp);
  94. std::cout << "STATUS: test "
  95. << ( success ? "passed" : "failed")
  96. << std::endl;
  97. timer.stop("checking result on CPU");
  98. timer.start();
  99. cudaFreeHost(data_cpu);
  100. for (uint64_t gpu = 0; gpu < num_gpus; gpu++) {
  101. cudaSetDevice(gpu);
  102. cudaFree(data_gpu[gpu]);
  103. }
  104. timer.stop("free memory");
  105. check_last_error();
  106. }

发表评论

表情:
评论列表 (有 0 条评论,199人围观)

还没有评论,来说两句吧...

相关阅读