cuda数据复制与计算重叠

╰半橙微兮° 2022-11-17 14:37 174阅读 0赞

数据复制与计算的重叠

深入学习:
CUDA流:最佳实践和常见陷阱

在非默认流中执行主机到设备和设备到主机的内存传输,才能使这些传输与计算重叠。
复制与计算重叠的代码示例
第一个示例适用于数据的条目数能被流的数量整除的情况,第二个示例则是不能整除的情况。

N可被流的数量整除

  1. // "Simple" version where number of entries is evenly divisible by number of streams.
  2. // Set to a ridiculously low value to clarify mechanisms of the technique.
  3. const uint64_t num_entries = 10;
  4. const uint64_t num_iters = 1UL << 10;
  5. // Allocate memory for all data entries. Make sure to pin host memory.
  6. cudaMallocHost(&data_cpu, sizeof(uint64_t)*num_entries);
  7. cudaMalloc (&data_gpu, sizeof(uint64_t)*num_entries);
  8. // Set the number of streams.
  9. const uint64_t num_streams = 2;
  10. // Create an array of streams containing number of streams
  11. cudaStream_t streams[num_streams];
  12. for (uint64_t stream = 0; stream < num_streams; stream++)
  13. cudaStreamCreate(&streams[stream]);
  14. // Set number of entries for each "chunk". Assumes `num_entires % num_streams == 0`.
  15. const uint64_t chunk_size = num_entries / num_streams;
  16. // For each stream, calculate indices for its chunk of full dataset and then, HtoD copy, compute, DtoH copy.
  17. for (uint64_t stream = 0; stream < num_streams; stream++) {
  18. // Get start index in full dataset for this stream's work.
  19. const uint64_t lower = chunk_size*stream;
  20. // Stream-indexed (`data+lower`) and chunk-sized HtoD copy in the non-default stream
  21. // `streams[stream]`.
  22. cudaMemcpyAsync(data_gpu+lower, data_cpu+lower,
  23. sizeof(uint64_t)*chunk_size, cudaMemcpyHostToDevice,
  24. streams[stream]);
  25. // Stream-indexed (`data_gpu+lower`) and chunk-sized compute in the non-default stream
  26. // `streams[stream]`.
  27. decrypt_gpu<<<80*32, 64, 0, streams[stream]>>>
  28. (data_gpu+lower, chunk_size, num_iters);
  29. // Stream-indexed (`data+lower`) and chunk-sized DtoH copy in the non-default stream
  30. // `streams[stream]`.
  31. cudaMemcpyAsync(data_cpu+lower, data_gpu+lower,
  32. sizeof(uint64_t)*chunk_size, cudaMemcpyDeviceToHost,
  33. streams[stream]);
  34. }
  35. // Destroy streams.
  36. for (uint64_t stream = 0; stream < num_streams; stream++)
  37. cudaStreamDestroy(streams[stream]);

N不可被流的数量整除

  1. // Able to handle when `num_entries % num_streams != 0`.
  2. const uint64_t num_entries = 10;
  3. const uint64_t num_iters = 1UL << 10;
  4. cudaMallocHost(&data_cpu, sizeof(uint64_t)*num_entries);
  5. cudaMalloc (&data_gpu, sizeof(uint64_t)*num_entries);
  6. // Set the number of streams to not evenly divide num_entries.
  7. const uint64_t num_streams = 3;
  8. cudaStream_t streams[num_streams];
  9. for (uint64_t stream = 0; stream < num_streams; stream++)
  10. cudaStreamCreate(&streams[stream]);
  11. // Use round-up division (`sdiv`, defined in helper.cu) so `num_streams*chunk_size`
  12. // is never less than `num_entries`.
  13. // This can result in `num_streams*chunk_size` being greater than `num_entries`, meaning
  14. // we will need to guard against out-of-range errors in the final "tail" stream (see below).
  15. const uint64_t chunk_size = sdiv(num_entries, num_streams);
  16. for (uint64_t stream = 0; stream < num_streams; stream++) {
  17. const uint64_t lower = chunk_size*stream;
  18. // For tail stream `lower+chunk_size` could be out of range, so here we guard against that.
  19. const uint64_t upper = min(lower+chunk_size, num_entries);
  20. // Since the tail stream width may not be `chunk_size`,
  21. // we need to calculate a separate `width` value.
  22. const uint64_t width = upper-lower;
  23. // Use `width` instead of `chunk_size`.
  24. cudaMemcpyAsync(data_gpu+lower, data_cpu+lower,
  25. sizeof(uint64_t)*width, cudaMemcpyHostToDevice,
  26. streams[stream]);
  27. // Use `width` instead of `chunk_size`.
  28. decrypt_gpu<<<80*32, 64, 0, streams[stream]>>>
  29. (data_gpu+lower, width, num_iters);
  30. // Use `width` instead of `chunk_size`.
  31. cudaMemcpyAsync(data_cpu+lower, data_gpu+lower,
  32. sizeof(uint64_t)*width, cudaMemcpyDeviceToHost,
  33. streams[stream]);
  34. }
  35. // Destroy streams.
  36. for (uint64_t stream = 0; stream < num_streams; stream++)
  37. cudaStreamDestroy(streams[stream]);

例子:

  1. #include <cstdint>
  2. #include <iostream>
  3. #include "helpers.cuh"
  4. #include "encryption.cuh"
  5. void encrypt_cpu(uint64_t * data, uint64_t num_entries,
  6. uint64_t num_iters, bool parallel=true) {
  7. #pragma omp parallel for if (parallel)
  8. for (uint64_t entry = 0; entry < num_entries; entry++)
  9. data[entry] = permute64(entry, num_iters);
  10. }
  11. __global__
  12. void decrypt_gpu(uint64_t * data, uint64_t num_entries,
  13. uint64_t num_iters) {
  14. const uint64_t thrdID = blockIdx.x*blockDim.x+threadIdx.x;
  15. const uint64_t stride = blockDim.x*gridDim.x;
  16. for (uint64_t entry = thrdID; entry < num_entries; entry += stride)
  17. data[entry] = unpermute64(data[entry], num_iters);
  18. }
  19. bool check_result_cpu(uint64_t * data, uint64_t num_entries,
  20. bool parallel=true) {
  21. uint64_t counter = 0;
  22. #pragma omp parallel for reduction(+: counter) if (parallel)
  23. for (uint64_t entry = 0; entry < num_entries; entry++)
  24. counter += data[entry] == entry;
  25. return counter == num_entries;
  26. }
  27. int main (int argc, char * argv[]) {
  28. Timer timer;
  29. Timer overall;
  30. const uint64_t num_entries = 1UL << 26;
  31. const uint64_t num_iters = 1UL << 10;
  32. const bool openmp = true;
  33. const uint64_t num_streams = 32;
  34. timer.start();
  35. uint64_t * data_cpu, * data_gpu;
  36. cudaMallocHost(&data_cpu, sizeof(uint64_t)*num_entries);
  37. cudaMalloc (&data_gpu, sizeof(uint64_t)*num_entries);
  38. timer.stop("allocate memory");
  39. check_last_error();
  40. encrypt_cpu(data_cpu, num_entries, num_iters, openmp);
  41. timer.start();
  42. cudaStream_t streams[num_streams];
  43. for (uint64_t stream = 0; stream < num_streams; stream++)
  44. cudaStreamCreate(&streams[stream]);
  45. const uint64_t chunk_size = sdiv(num_entries, num_streams);
  46. for (uint64_t stream = 0; stream < num_streams; stream++) {
  47. const uint64_t lower = chunk_size*stream;
  48. const uint64_t upper = min(lower+chunk_size, num_entries);
  49. const uint64_t width = upper-lower;
  50. cudaMemcpyAsync(data_gpu+lower, data_cpu+lower,
  51. sizeof(uint64_t)*width, cudaMemcpyHostToDevice,
  52. streams[stream]);
  53. decrypt_gpu<<<80*32, 64, 0, streams[stream]>>>
  54. (data_gpu+lower, width, num_iters);
  55. cudaMemcpyAsync(data_cpu+lower, data_gpu+lower,
  56. sizeof(uint64_t)*width, cudaMemcpyDeviceToHost,
  57. streams[stream]);
  58. }
  59. for (uint64_t stream = 0; stream < num_streams; stream++)
  60. cudaStreamSynchronize(streams[stream]);
  61. for (uint64_t stream = 0; stream < num_streams; stream++)
  62. cudaStreamDestroy(streams[stream]);
  63. timer.stop("encrypt data on CPU");
  64. overall.start();
  65. timer.start();
  66. //cudaMemcpy(data_gpu, data_cpu,
  67. // sizeof(uint64_t)*num_entries, cudaMemcpyHostToDevice);
  68. timer.stop("copy data from CPU to GPU");
  69. check_last_error();
  70. timer.start();
  71. // decrypt_gpu<<<80*32, 64>>>(data_gpu, num_entries, num_iters);
  72. timer.stop("decrypt data on GPU");
  73. check_last_error();
  74. timer.start();
  75. //cudaMemcpy(data_cpu, data_gpu,
  76. // sizeof(uint64_t)*num_entries, cudaMemcpyDeviceToHost);
  77. timer.stop("copy data from GPU to CPU");
  78. overall.stop("total time on GPU");
  79. check_last_error();
  80. timer.start();
  81. const bool success = check_result_cpu(data_cpu, num_entries, openmp);
  82. std::cout << "STATUS: test "
  83. << ( success ? "passed" : "failed")
  84. << std::endl;
  85. timer.stop("checking result on CPU");
  86. timer.start();
  87. cudaFreeHost(data_cpu);
  88. cudaFree (data_gpu);
  89. timer.stop("free memory");
  90. check_last_error();
  91. }

发表评论

表情:
评论列表 (有 0 条评论,174人围观)

还没有评论,来说两句吧...

相关阅读

    相关 CUDA 并行计算

    CUDA 并行计算 并行计算可以被定义为同时使用许多计算资源 (核心或计算机) 来执行并发计算,一个大的问题可以被分解成多个小问题,然后在不同的计算资源上并行处理这些小问题。

    相关 异构计算CUDA

    记录下关于异构计算的链接 [参考链接][Link 1] GPU和CPU执行程序是异步的:核函数调用后,主机线程会立刻继续执行,而不管GPU端核函数是否执行完毕,所以上面