High-Performance Computing: CUDA Single-Stream / Multi-Stream Scheduling (24)
1. Introduction
(1) Use CUDA to compute pow(sin(id), 2) + pow(cos(id), 2) for every element index id (a simple host-side correctness check is sketched below).
(2) Compare the efficiency, including both data transfer and computation, of four variants: a single stream with synchronous transfers, a single stream with asynchronous transfers, multi-stream depth-first scheduling, and multi-stream breadth-first scheduling.
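Because sin(id)^2 + cos(id)^2 = 1 for any argument, every output element should be (approximately) 1.0f, which makes the result easy to validate on the host. A minimal sketch of such a check; `check_result` is a hypothetical helper that is not part of the listing below and could be called on `hostA` after each device-to-host copy:

```cpp
#include <math.h>
#include <stdio.h>

// Hypothetical helper (not part of the original program): returns 1 if every
// element equals 1.0f within a small tolerance, 0 otherwise.
static int check_result(const float* hostA, int n)
{
    for (int i = 0; i < n; i++) {
        if (fabsf(hostA[i] - 1.0f) > 1e-5f) {
            printf("mismatch at index %d: %f\n", i, hostA[i]);
            return 0;
        }
    }
    return 1;
}
```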
2. Core Code
/*
Purpose:
1. Use CUDA to compute pow(sin(id), 2) + pow(cos(id), 2)
2. Compare the efficiency (including data transfer and computation) of a single stream with synchronous transfers, a single stream with asynchronous transfers, multi-stream depth-first scheduling, and multi-stream breadth-first scheduling
3. Use an error-checking macro around CUDA API calls
*/
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#define CUDA_ERROR_CHECK //control macro that enables API error checking
#define BLOCKSIZE 256
int N = 1<<28; //number of elements
int NBytes = N*sizeof(float); //number of bytes
//macro that checks whether a CUDA API call returned an error
#define CudaSafecCall(err) __cudaSafeCall(err,__FILE__,__LINE__)
inline void __cudaSafeCall(cudaError_t err,const char* file,const int line)
{
#ifdef CUDA_ERROR_CHECK
if(err!=cudaSuccess)
{
fprintf(stderr,"cudaSafeCall failed at %s:%d :(%d) %s\n",file,line,err,cudaGetErrorString(err));
exit(-1);
}
#endif
}
//macro that checks for execution errors in the stream, mainly after kernel launches
#define CudaCheckError() _cudaCheckError(__FILE__,__LINE__)
inline void _cudaCheckError(const char * file,const int line)
{
#ifdef CUDA_ERROR_CHECK
cudaError_t err = cudaGetLastError();
if(err != cudaSuccess)
{
fprintf(stderr,"cudaCheckError failed at %s:%d :(%d) %s\n",file,line,err,cudaGetErrorString(err));
exit(-1);
}
#endif
}
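//each thread writes pow(sin(id),2)+pow(cos(id),2) at its global index id; mathematically every element should come out as 1.0f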
__global__ void kernel_func(float * arr,int offset,const int n)
{
int id = offset + threadIdx.x + blockIdx.x * blockDim.x;
if(id<n)
arr[id] = pow(sinf(id),2) + pow(cosf(id),2);
}
//single stream, pageable host memory, synchronous transfer
float gpu_base()
{
//allocate pageable host memory
float* hostA,*deviceA;
hostA = (float*)calloc(N,sizeof(float));
CudaSafecCall(cudaMalloc((void**)&deviceA,NBytes));
float gpuTime = 0.0;
cudaEvent_t start,end;
CudaSafecCall(cudaEventCreate(&start));
CudaSafecCall(cudaEventCreate(&end));
CudaSafecCall(cudaEventRecord(start));
CudaSafecCall(cudaMemcpy(deviceA,hostA,NBytes,cudaMemcpyHostToDevice));
kernel_func<<<(N-1)/BLOCKSIZE + 1,BLOCKSIZE>>>(deviceA,0,N);
CudaCheckError();
CudaSafecCall(cudaEventRecord(end));
CudaSafecCall(cudaEventSynchronize(end));
CudaSafecCall(cudaEventElapsedTime(&gpuTime,start,end));
CudaSafecCall(cudaEventDestroy(start));
CudaSafecCall(cudaEventDestroy(end));
CudaSafecCall(cudaMemcpy(hostA,deviceA,NBytes,cudaMemcpyDeviceToHost));
printf("gpu_base 單流非鎖頁內存,數據傳輸和計算耗時 %f ms\n",gpuTime);
CudaSafecCall(cudaFree(deviceA));
free(hostA);
return gpuTime;
}
//single stream, pinned (page-locked) host memory, asynchronous transfer
float gpu_base_pinMem()
{
//allocate pinned host memory
float* hostA,*deviceA;
CudaSafecCall(cudaMallocHost((void**)&hostA,NBytes));
CudaSafecCall(cudaMalloc((void**)&deviceA,NBytes));
float gpuTime = 0.0;
cudaEvent_t start,end;
CudaSafecCall(cudaEventCreate(&start));
CudaSafecCall(cudaEventCreate(&end));
CudaSafecCall(cudaEventRecord(start));
CudaSafecCall(cudaMemcpyAsync(deviceA,hostA,NBytes,cudaMemcpyHostToDevice));
kernel_func<<<(N-1)/BLOCKSIZE + 1,BLOCKSIZE>>>(deviceA,0,N);
CudaCheckError();
CudaSafecCall(cudaEventRecord(end));
CudaSafecCall(cudaEventSynchronize(end));
CudaSafecCall(cudaEventElapsedTime(&gpuTime,start,end));
CudaSafecCall(cudaEventDestroy(start));
CudaSafecCall(cudaEventDestroy(end));
CudaSafecCall(cudaMemcpyAsync(hostA,deviceA,NBytes,cudaMemcpyDeviceToHost));
printf("gpu_base_pinMem 單流鎖頁內存,數據傳輸和計算耗時 %f ms\n",gpuTime);
CudaSafecCall(cudaFreeHost(hostA));
CudaSafecCall(cudaFree(deviceA));
return gpuTime;
}
//multi-stream depth-first scheduling
float gpu_MStream_deep(int nStreams)
{
//allocate pinned host memory; asynchronous transfers require page-locked host memory
float* hostA,*deviceA;
CudaSafecCall(cudaMallocHost((void**)&hostA,NBytes));
CudaSafecCall(cudaMalloc((void**)&deviceA,NBytes));
float gpuTime = 0.0;
cudaEvent_t start,end;
cudaStream_t* streams = (cudaStream_t*)calloc(nStreams,sizeof(cudaStream_t));
for(int i=0;i<nStreams;i++)
CudaSafecCall(cudaStreamCreate(streams+i));
CudaSafecCall(cudaEventCreate(&start));
CudaSafecCall(cudaEventCreate(&end));
CudaSafecCall(cudaEventRecord(start));
//depth-first: issue each stream's H2D copy, kernel, and D2H copy before moving on to the next stream,
//so across streams at most one transfer and one kernel overlap at any given time
// number of elements handled by each stream (assumes N is divisible by nStreams)
int nByStream = N/nStreams;
for(int i=0;i<nStreams;i++)
{
int offset = i * nByStream;
CudaSafecCall(cudaMemcpyAsync(deviceA+offset,hostA+offset,nByStream*sizeof(float),cudaMemcpyHostToDevice,streams[i]));
kernel_func<<<(nByStream-1)/BLOCKSIZE + 1,BLOCKSIZE,0,streams[i]>>>(deviceA,offset,(i+1)*nByStream);
CudaCheckError();
CudaSafecCall(cudaMemcpyAsync(hostA+offset,deviceA+offset,nByStream*sizeof(float),cudaMemcpyDeviceToHost,streams[i]));
}
for(int i=0;i<nStreams;i++)
CudaSafecCall(cudaStreamSynchronize(streams[i]));
CudaSafecCall(cudaEventRecord(end));
CudaSafecCall(cudaEventSynchronize(end));
CudaSafecCall(cudaEventElapsedTime(&gpuTime,start,end));
CudaSafecCall(cudaEventDestroy(start));
CudaSafecCall(cudaEventDestroy(end));
printf("gpu_MStream_deep %d個流深度優先調度,數據傳輸和計算耗時 %f ms\n",nStreams,gpuTime);
for(int i=0;i<nStreams;i++)
CudaSafecCall(cudaStreamDestroy(streams[i]));
CudaSafecCall(cudaFreeHost(hostA));
CudaSafecCall(cudaFree(deviceA));
free(streams);
return gpuTime;
}
//multi-stream breadth-first scheduling
float gpu_MStream_wide(int nStreams)
{
//allocate pinned host memory; asynchronous transfers require page-locked host memory
float* hostA,*deviceA;
CudaSafecCall(cudaMallocHost((void**)&hostA,NBytes));
CudaSafecCall(cudaMalloc((void**)&deviceA,NBytes));
float gpuTime = 0.0;
cudaEvent_t start,end;
cudaStream_t* streams = (cudaStream_t*)calloc(nStreams,sizeof(cudaStream_t));
for(int i=0;i<nStreams;i++)
CudaSafecCall(cudaStreamCreate(streams+i));
CudaSafecCall(cudaEventCreate(&start));
CudaSafecCall(cudaEventCreate(&end));
CudaSafecCall(cudaEventRecord(start));
//breadth-first: issue each stage across all streams, so transfers and kernels from different streams run in parallel
// number of elements handled by each stream (assumes N is divisible by nStreams)
int nByStream = N/nStreams;
for(int i=0;i<nStreams;i++)
{
int offset = i * nByStream;
CudaSafecCall(cudaMemcpyAsync(deviceA+offset,hostA+offset,nByStream*sizeof(float),cudaMemcpyHostToDevice,streams[i]));
}
for(int i=0;i<nStreams;i++)
{
int offset = i * nByStream;
kernel_func<<<(nByStream-1)/BLOCKSIZE + 1,BLOCKSIZE,0,streams[i]>>>(deviceA,offset,(i+1)*nByStream);
CudaCheckError();
}
for(int i=0;i<nStreams;i++)
{
int offset = i * nByStream;
CudaSafecCall(cudaMemcpyAsync(hostA+offset,deviceA+offset,nByStream*sizeof(float),cudaMemcpyDeviceToHost,streams[i]));
}
for(int i=0;i<nStreams;i++)
CudaSafecCall(cudaStreamSynchronize(streams[i]));
CudaSafecCall(cudaEventRecord(end));
CudaSafecCall(cudaEventSynchronize(end));
CudaSafecCall(cudaEventElapsedTime(&gpuTime,start,end));
CudaSafecCall(cudaEventDestroy(start));
CudaSafecCall(cudaEventDestroy(end));
printf("gpu_MStream_wide %d個流廣度優先調度,數據傳輸和計算耗時 %f ms\n",nStreams,gpuTime);
for(int i=0;i<nStreams;i++)
CudaSafecCall(cudaStreamDestroy(streams[i]));
CudaSafecCall(cudaFreeHost(hostA));
CudaSafecCall(cudaFree(deviceA));
free(streams);
return gpuTime;
}
int main(int argc,char* argv[])
{
int nStreams = argc==2? atoi(argv[1]):4;
//GPU default single stream, pageable host memory, synchronous transfer
float gpuTime1 = gpu_base();
//GPU default single stream, pinned host memory, asynchronous transfer
float gpuTime2 = gpu_base_pinMem();
//GPU multi-stream depth-first scheduling, asynchronous transfer
float gpuTime3 = gpu_MStream_deep(nStreams);
//GPU multi-stream breadth-first scheduling, asynchronous transfer
float gpuTime4 = gpu_MStream_wide(nStreams);
printf("Speedup of single-stream asynchronous transfer and compute over the default single-stream synchronous baseline: %f\n",gpuTime1/gpuTime2);
printf("Speedup of %d-stream depth-first scheduling (asynchronous transfer and compute) over the baseline: %f\n",nStreams,gpuTime1/gpuTime3);
printf("Speedup of %d-stream breadth-first scheduling (asynchronous transfer and compute) over the baseline: %f\n",nStreams,gpuTime1/gpuTime4);
return 0;
}
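To reproduce the measurements, the listing can be compiled with nvcc and run with the number of streams as an optional command-line argument (it defaults to 4), for example `nvcc -O2 -o stream_schedule stream_schedule.cu` followed by `./stream_schedule 8`; the source file name here is only an assumption.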
3. Test Results
Elapsed time of each test and speedup relative to the single-stream synchronous baseline:

| Test \ number of streams | 1 | 4 | 8 | 16 | 32 | 64 |
|---|---|---|---|---|---|---|
| Single stream, synchronous transfer (time ms) | 306.7 | - | - | - | - | - |
| Single stream, asynchronous transfer (ms / speedup) | 199.4/1.53 | - | - | - | - | - |
| Multi-stream depth-first scheduling (ms / speedup) | - | 151.04/2.06 | 129.95/2.29 | 131.49/2.32 | 123.08/2.49 | 126.48/2.45 |
| Multi-stream breadth-first scheduling (ms / speedup) | - | 149.29/2.09 | 129.6/2.3 | 134.55/2.27 | 122.82/2.49 | 126.42/2.45 |
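For this data size, switching to pinned memory and asynchronous transfers alone gives roughly a 1.5x speedup over the synchronous single-stream baseline, and overlapping transfers with computation across multiple streams raises it to about 2.0-2.5x. The gain flattens beyond 8 streams, and depth-first and breadth-first issue orders perform essentially the same in this test.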
