1. 程式人生 > >CUDA實例練習(十):多個cuda流

CUDA實例練習(十):多個cuda流

events get 所有 float speed print main rand free

#include <stdio.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include "book.h"

#define N (1024*1024)
#define FULL_DATA_SIZE (N * 20)
  8 
  9 __global__ void kernel(int *a, int *b, int *c){
 10     int idx = threadIdx.x + blockIdx.x * blockDim.x;
11 if (idx < N){ 12 int idx1 = (idx + 1) % 256; 13 int idx2 = (idx + 2) % 256; 14 float as = (a[idx] + a[idx1] + a[idx2]) / 3.0f; 15 float bs = (b[idx] + b[idx1] + b[idx2]) / 3.0f; 16 c[idx] = (as + bs) / 2; 17 } 18 } 19 20 int main(void){ 21 cudaDeviceProp prop;
22 int whichDevice; 23 HANDLE_ERROR(cudaGetDevice(&whichDevice)); 24 HANDLE_ERROR(cudaGetDeviceProperties(&prop, whichDevice)); 25 if (!prop.deviceOverlap){ 26 printf("Device will not handle overlaps, so no speed up from streams\n"); 27 return 0; 28 }
29 cudaEvent_t start, stop; 30 float elapsedTime; 31 32 //啟動計時器 33 HANDLE_ERROR(cudaEventCreate(&start)); 34 HANDLE_ERROR(cudaEventCreate(&stop)); 35 HANDLE_ERROR(cudaEventRecord(start, 0)); 36 37 //初始化流 38 cudaStream_t stream0, stream1; 39 HANDLE_ERROR(cudaStreamCreate(&stream0)); 40 HANDLE_ERROR(cudaStreamCreate(&stream1)); 41 42 int *host_a, *host_b, *host_c; 43 int *dev_a0, *dev_b0, *dev_c0;//為第0個流分配的GPU內存 44 int *dev_a1, *dev_b1, *dev_c1;//為第1個流分配的GPU內存 45 46 //在GPU上分配內存 47 HANDLE_ERROR(cudaMalloc((void **)&dev_a0, N * sizeof(int))); 48 HANDLE_ERROR(cudaMalloc((void **)&dev_b0, N * sizeof(int))); 49 HANDLE_ERROR(cudaMalloc((void **)&dev_c0, N * sizeof(int))); 50 HANDLE_ERROR(cudaMalloc((void **)&dev_a1, N * sizeof(int))); 51 HANDLE_ERROR(cudaMalloc((void **)&dev_b1, N * sizeof(int))); 52 HANDLE_ERROR(cudaMalloc((void **)&dev_c1, N * sizeof(int))); 53 54 //分配在流中使用的頁鎖定內存 55 HANDLE_ERROR(cudaHostAlloc((void **)&host_a, FULL_DATA_SIZE * sizeof(int), 56 cudaHostAllocDefault)); 57 HANDLE_ERROR(cudaHostAlloc((void **)&host_b, FULL_DATA_SIZE * sizeof(int), 58 cudaHostAllocDefault)); 59 HANDLE_ERROR(cudaHostAlloc((void **)&host_c, FULL_DATA_SIZE * sizeof(int), 60 cudaHostAllocDefault)); 61 62 for (int i = 0; i < FULL_DATA_SIZE; i++){ 63 host_a[i] = rand(); 64 host_b[i] = rand(); 65 } 66 67 //在整體數據上循環,每個數據塊的大小為N 68 for (int i = 0; i < FULL_DATA_SIZE; i += N * 2){ 69 //將鎖定內存以異步方式復制到設備上 70 HANDLE_ERROR(cudaMemcpyAsync(dev_a0, host_a + i, N * sizeof(int), 71 cudaMemcpyHostToDevice, stream0)); 72 HANDLE_ERROR(cudaMemcpyAsync(dev_b0, host_b + i, N * sizeof(int), 73 cudaMemcpyHostToDevice, stream0)); 74 kernel << <N / 256, 256, 0, stream0 >> >(dev_a0, dev_b0, dev_c0); 75 76 //將數據從設備復制回鎖定內存 77 HANDLE_ERROR(cudaMemcpyAsync(host_c + i, dev_c0, N * sizeof(int), 78 cudaMemcpyDeviceToHost, stream0)); 79 80 //將鎖定內存以異步方式復制到設備上 81 HANDLE_ERROR(cudaMemcpyAsync(dev_a1, host_a + i + N, N* sizeof(int), 82 cudaMemcpyHostToDevice, stream1)); 83 HANDLE_ERROR(cudaMemcpyAsync(dev_b1, host_b + 
i + N, N * sizeof(int), 84 cudaMemcpyHostToDevice, stream1)); 85 kernel << <N / 256, 256, 0, stream1 >> >(dev_a1, dev_b1, dev_c1); 86 87 //將數據從設備復制回到鎖定內存 88 HANDLE_ERROR(cudaMemcpyAsync(host_c + i + N, dev_c1, N * sizeof(int), 89 cudaMemcpyDeviceToHost, stream1)); 90 } 91 92 //在停止應用程序的計時器之前,首先將兩個流進行同步 93 HANDLE_ERROR(cudaStreamSynchronize(stream0)); 94 HANDLE_ERROR(cudaStreamSynchronize(stream1)); 95 HANDLE_ERROR(cudaEventRecord(stop, 0)); 96 HANDLE_ERROR(cudaEventSynchronize(stop)); 97 HANDLE_ERROR(cudaEventElapsedTime(&elapsedTime, start, stop)); 98 printf("Time taken: %3.1f ms\n", elapsedTime); 99 100 //釋放流和內存 101 HANDLE_ERROR(cudaFreeHost(host_a)); 102 HANDLE_ERROR(cudaFreeHost(host_b)); 103 HANDLE_ERROR(cudaFreeHost(host_c)); 104 HANDLE_ERROR(cudaFree(dev_a0)); 105 HANDLE_ERROR(cudaFree(dev_b0)); 106 HANDLE_ERROR(cudaFree(dev_c0)); 107 HANDLE_ERROR(cudaFree(dev_a1)); 108 HANDLE_ERROR(cudaFree(dev_b1)); 109 HANDLE_ERROR(cudaFree(dev_c1)); 110 HANDLE_ERROR(cudaStreamDestroy(stream0)); 111 HANDLE_ERROR(cudaStreamDestroy(stream1)); 112 113 return 0; 114 115 116 117 }

技術分享

如果同時調度某個流的所有操作，那麼很容易在無意中阻塞另一個流的複製操作或者核函數執行。要解決這個問題，在將操作放入流的隊列時應採用寬度優先方式，而非深度優先方式。也就是說，先把兩個流的輸入複製操作交錯放入隊列，再依次啟動兩個流的核函數，最後再交錯放入結果的回傳複製操作，如下所示：

#include <stdio.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include "book.h"

#define N (1024*1024)
#define FULL_DATA_SIZE (N * 20)
  8 
  9 __global__ void kernel(int *a, int *b, int *c){
 10     int idx = threadIdx.x + blockIdx.x * blockDim.x;
 11     if (idx < N){
 12         int idx1 = (idx + 1) % 256;
 13         int idx2 = (idx + 2) % 256;
 14         float as = (a[idx] + a[idx1] + a[idx2]) / 3.0f;
 15         float bs = (b[idx] + b[idx1] + b[idx2]) / 3.0f;
 16         c[idx] = (as + bs) / 2;
 17     }
 18 }
 19 
 20 int main(void){
 21     cudaDeviceProp prop;
 22     int whichDevice;
 23     HANDLE_ERROR(cudaGetDevice(&whichDevice));
 24     HANDLE_ERROR(cudaGetDeviceProperties(&prop, whichDevice));
 25     if (!prop.deviceOverlap){
 26         printf("Device will not handle overlaps, so no speed up from streams\n");
 27         return 0;
 28     }
 29     cudaEvent_t start, stop;
 30     float elapsedTime;
 31 
 32     //啟動計時器
 33     HANDLE_ERROR(cudaEventCreate(&start));
 34     HANDLE_ERROR(cudaEventCreate(&stop));
 35     HANDLE_ERROR(cudaEventRecord(start, 0));
 36 
 37     //初始化流
 38     cudaStream_t stream0, stream1;
 39     HANDLE_ERROR(cudaStreamCreate(&stream0));
 40     HANDLE_ERROR(cudaStreamCreate(&stream1));
 41 
 42     int *host_a, *host_b, *host_c;
 43     int *dev_a0, *dev_b0, *dev_c0;//為第0個流分配的GPU內存
 44     int *dev_a1, *dev_b1, *dev_c1;//為第1個流分配的GPU內存
 45 
 46     //在GPU上分配內存
 47     HANDLE_ERROR(cudaMalloc((void **)&dev_a0, N * sizeof(int)));
 48     HANDLE_ERROR(cudaMalloc((void **)&dev_b0, N * sizeof(int)));
 49     HANDLE_ERROR(cudaMalloc((void **)&dev_c0, N * sizeof(int)));
 50     HANDLE_ERROR(cudaMalloc((void **)&dev_a1, N * sizeof(int)));
 51     HANDLE_ERROR(cudaMalloc((void **)&dev_b1, N * sizeof(int)));
 52     HANDLE_ERROR(cudaMalloc((void **)&dev_c1, N * sizeof(int)));
 53 
 54     //分配在流中使用的頁鎖定內存
 55     HANDLE_ERROR(cudaHostAlloc((void **)&host_a, FULL_DATA_SIZE * sizeof(int),
 56         cudaHostAllocDefault));
 57     HANDLE_ERROR(cudaHostAlloc((void **)&host_b, FULL_DATA_SIZE * sizeof(int),
 58         cudaHostAllocDefault));
 59     HANDLE_ERROR(cudaHostAlloc((void **)&host_c, FULL_DATA_SIZE * sizeof(int),
 60         cudaHostAllocDefault));
 61 
 62     for (int i = 0; i < FULL_DATA_SIZE; i++){
 63         host_a[i] = rand();
 64         host_b[i] = rand();
 65     }
 66 
 67     //在整體數據上循環,每個數據塊的大小為N
 68     for (int i = 0; i<FULL_DATA_SIZE; i += N * 2) {
 69         // enqueue copies of a in stream0 and stream1
 70         HANDLE_ERROR(cudaMemcpyAsync(dev_a0, host_a + i,
 71             N * sizeof(int),
 72             cudaMemcpyHostToDevice,
 73             stream0));
 74         HANDLE_ERROR(cudaMemcpyAsync(dev_a1, host_a + i + N,
 75             N * sizeof(int),
 76             cudaMemcpyHostToDevice,
 77             stream1));
 78         // enqueue copies of b in stream0 and stream1
 79         HANDLE_ERROR(cudaMemcpyAsync(dev_b0, host_b + i,
 80             N * sizeof(int),
 81             cudaMemcpyHostToDevice,
 82             stream0));
 83         HANDLE_ERROR(cudaMemcpyAsync(dev_b1, host_b + i + N,
 84             N * sizeof(int),
 85             cudaMemcpyHostToDevice,
 86             stream1));
 87 
 88         // enqueue kernels in stream0 and stream1   
 89         kernel << <N / 256, 256, 0, stream0 >> >(dev_a0, dev_b0, dev_c0);
 90         kernel << <N / 256, 256, 0, stream1 >> >(dev_a1, dev_b1, dev_c1);
 91 
 92         // enqueue copies of c from device to locked memory
 93         HANDLE_ERROR(cudaMemcpyAsync(host_c + i, dev_c0,
 94             N * sizeof(int),
 95             cudaMemcpyDeviceToHost,
 96             stream0));
 97         HANDLE_ERROR(cudaMemcpyAsync(host_c + i + N, dev_c1,
 98             N * sizeof(int),
 99             cudaMemcpyDeviceToHost,
100             stream1));
101     }
102 
103 
104     //在停止應用程序的計時器之前,首先將兩個流進行同步
105     HANDLE_ERROR(cudaStreamSynchronize(stream0));
106     HANDLE_ERROR(cudaStreamSynchronize(stream1));
107     HANDLE_ERROR(cudaEventRecord(stop, 0));
108     HANDLE_ERROR(cudaEventSynchronize(stop));
109     HANDLE_ERROR(cudaEventElapsedTime(&elapsedTime, start, stop));
110     printf("Time taken: %3.1f ms\n", elapsedTime);
111 
112     //釋放流和內存
113     HANDLE_ERROR(cudaFreeHost(host_a));
114     HANDLE_ERROR(cudaFreeHost(host_b));
115     HANDLE_ERROR(cudaFreeHost(host_c));
116     HANDLE_ERROR(cudaFree(dev_a0));
117     HANDLE_ERROR(cudaFree(dev_b0));
118     HANDLE_ERROR(cudaFree(dev_c0));
119     HANDLE_ERROR(cudaFree(dev_a1));
120     HANDLE_ERROR(cudaFree(dev_b1));
121     HANDLE_ERROR(cudaFree(dev_c1));
122     HANDLE_ERROR(cudaStreamDestroy(stream0));
123     HANDLE_ERROR(cudaStreamDestroy(stream1));
124 
125     return 0;
126 
127 
128 
129 }

技術分享

CUDA實例練習(十):多個cuda流