高性能計算-GPU單進程多卡(多流)并行計算編程模型示例(25)
1. 簡介
(1) 使用CPU對向量逐元素相乘(對應元素乘積,結果仍為向量)進行串行計算
(2) 對數據進行分塊,使用單進程多卡(多流)并行計算
(3) 使用不同數據規模,比較加速比的變化
2. 代碼
#include <stdio.h>
#include <sys/time.h>
#include <stdlib.h>
#define CUDA_ERROR_CHECK
int nGpus = 1; //number of GPUs to use (overwritten by cudaGetDeviceCount / argv[1])
int blockSize = 256; //threads per block for the kernel launch
int leftBit = 10; //data-size exponent: vector length = 2^leftBit
unsigned long nSize = 1LL << leftBit; //vector length (element count), NOT a matrix dimension
float *hostA = NULL; //input vector A (pinned host memory, allocated in main)
float *hostB = NULL; //input vector B (pinned host memory, allocated in main)
float *hostResult = NULL; //CPU serial result
float *deviceResult = NULL; //GPU result copied back to (pinned) host memory
//Macro wrapping a CUDA runtime API call: aborts with file:line context on failure.
#define CudaCall(err) __cudaSafeCall(err,__FILE__,__LINE__)
inline void __cudaSafeCall(cudaError_t err,const char* file,const int line)
{
#ifdef CUDA_ERROR_CHECK
    if(err==cudaSuccess)
        return;
    fprintf(stderr,"cudaSafeCall failed at %s:%d :(%d) %s\n",file,line,err,cudaGetErrorString(err));
    exit(-1);
#endif
}
//Macro checking for a pending error in the runtime (mainly after kernel launches,
//which do not return a status themselves).
#define CudaCheck() _cudaCheckError(__FILE__,__LINE__)
inline void _cudaCheckError(const char * file,const int line)
{
#ifdef CUDA_ERROR_CHECK
    cudaError_t status = cudaGetLastError();
    if(status == cudaSuccess)
        return;
    fprintf(stderr,"cudaCheckError failed at %s:%d :(%d) %s\n",file,line,status,cudaGetErrorString(status));
    exit(-1);
#endif
}
//Returns wall-clock time in milliseconds since the Unix epoch.
//FIX: the original expression used double literals (1e3), silently routing
//the long result through floating point; pure integer arithmetic is exact.
long getTime()
{
    struct timeval cur;
    gettimeofday(&cur, NULL);
    return cur.tv_sec * 1000L + cur.tv_usec / 1000;
}
//Fills vectors A and B with pseudo-random floats in [0,1].
//The RNG seed is fixed at 0, so every run produces identical data.
void initData(float *A,float *B,unsigned long len)
{
    srand(0);
    unsigned long idx = 0;
    while(idx < len)
    {
        A[idx] = (float)rand()/RAND_MAX;
        B[idx] = (float)rand()/RAND_MAX;
        ++idx;
    }
}
//CPU baseline: element-wise product of the global vectors hostA and hostB,
//stored into hostResult. Returns the elapsed time in milliseconds.
long serial(unsigned long len)
{
    long begin = getTime();
    unsigned long idx = 0;
    while(idx < len)
    {
        hostResult[idx] = hostA[idx] * hostB[idx];
        ++idx;
    }
    return getTime() - begin;
}
//One thread per element: result[id] = A[id] * B[id]; the guard handles the
//tail when len is not a multiple of the block size.
//FIX: widen the index computation to unsigned long BEFORE the multiply —
//blockIdx.x * blockDim.x is evaluated in 32-bit unsigned int and would wrap
//for grids addressing more than 2^32 elements.
__global__ void kernel(float *A,float *B,float *result,unsigned long len)
{
    unsigned long id = (unsigned long)blockIdx.x * blockDim.x + threadIdx.x;
    if(id<len)
        result[id] = A[id] * B[id];
}
//Multi-GPU parallel compute: splits the vectors evenly across ngpus devices,
//one stream per device; each stream runs H2D copy -> kernel -> D2H copy
//asynchronously (hostA/hostB/result must be pinned — allocated with
//cudaMallocHost in main). Returns the measured GPU time in ms.
//Precondition: len % ngpus == 0 (enforced by the caller).
float gpu_multi(float *result,unsigned long len,int ngpus)
{
    float gpuTime = 0.0f;
    //elements handled by each GPU/stream
    unsigned long nPerGpu = len/ngpus;
    //pointer tables live in HOST memory (calloc); the buffers they point at
    //live on the respective devices. Renamed devResult to avoid shadowing
    //the global float* deviceResult.
    float **devA = (float**)calloc(ngpus,sizeof(float*));
    float **devB = (float**)calloc(ngpus,sizeof(float*));
    float **devResult = (float**)calloc(ngpus,sizeof(float*));
    cudaStream_t *streams = (cudaStream_t*)calloc(ngpus,sizeof(cudaStream_t));
    //allocate device buffers and create one stream per device
    for(int i=0;i<ngpus;i++)
    {
        CudaCall(cudaSetDevice(i));
        CudaCall(cudaMalloc((void**)&devA[i],nPerGpu*sizeof(float)));
        CudaCall(cudaMalloc((void**)&devB[i],nPerGpu*sizeof(float)));
        CudaCall(cudaMalloc((void**)&devResult[i],nPerGpu*sizeof(float)));
        CudaCall(cudaStreamCreate(streams+i));
    }
    //timing events live on device 0
    cudaEvent_t start,end;
    CudaCall(cudaSetDevice(0));
    CudaCall(cudaEventCreate(&start));
    CudaCall(cudaEventCreate(&end));
    CudaCall(cudaEventRecord(start,streams[0]));
    for(int i=0;i<ngpus;i++)
    {
        CudaCall(cudaSetDevice(i));
        //async copies of this device's slice (requires pinned host memory)
        CudaCall(cudaMemcpyAsync(devA[i],hostA+i*nPerGpu,nPerGpu*sizeof(float),cudaMemcpyHostToDevice,streams[i]));
        CudaCall(cudaMemcpyAsync(devB[i],hostB+i*nPerGpu,nPerGpu*sizeof(float),cudaMemcpyHostToDevice,streams[i]));
        //launch: ceil(nPerGpu / blockSize) blocks
        int gridDim = (nPerGpu-1)/blockSize + 1;
        kernel<<<gridDim,blockSize,0,streams[i]>>>(devA[i],devB[i],devResult[i],nPerGpu);
        CudaCheck();
        //copy this slice of the result back
        CudaCall(cudaMemcpyAsync(result+i*nPerGpu,devResult[i],nPerGpu*sizeof(float),cudaMemcpyDeviceToHost,streams[i]));
    }
    //FIX: wait for EVERY device to finish before recording the end event.
    //The original recorded `end` on stream 0 immediately after the launch
    //loop, so the measured interval only covered stream 0's queue, not the
    //work on the other GPUs.
    for(int i=0;i<ngpus;i++)
    {
        CudaCall(cudaSetDevice(i));
        CudaCall(cudaStreamSynchronize(streams[i]));
    }
    CudaCall(cudaSetDevice(0));
    CudaCall(cudaEventRecord(end,streams[0]));
    //guarantee the end event's timestamp is available before querying it
    CudaCall(cudaEventSynchronize(end));
    CudaCall(cudaEventElapsedTime(&gpuTime,start,end));
    //cleanup
    CudaCall(cudaEventDestroy(start));
    CudaCall(cudaEventDestroy(end));
    for(int i=0;i<ngpus;i++)
    {
        CudaCall(cudaSetDevice(i));
        CudaCall(cudaFree(devA[i]));
        CudaCall(cudaFree(devB[i]));
        CudaCall(cudaFree(devResult[i]));
        CudaCall(cudaStreamDestroy(streams[i]));
    }
    //FIX: the pointer tables were calloc'd on the host, so they must be
    //released with free() — the original passed them to cudaFree().
    free(devA);
    free(devB);
    free(devResult);
    free(streams);
    return gpuTime;
}
//Usage: ./pointMul <ngpus> <exponent>  — computes the element-wise product of
//two vectors of length 2^exponent serially on the CPU and in parallel on
//<ngpus> GPUs, verifies the results agree, and prints timings + speedup.
int main(int argc, char* argv[])
{
    cudaDeviceProp prop;
    int globalMemSize = 0;
    int memSize = 0; //device memory needed per card (MB)
    CudaCall(cudaGetDeviceProperties(&prop ,0));
    globalMemSize = (float)prop.totalGlobalMem/1024/1024;
    //number of visible devices
    CudaCall(cudaGetDeviceCount(&nGpus));
    //clamp the requested GPU count to what is available
    if(argc==3)
    {
        leftBit = atoi(argv[2]);
        nSize = 1LL << leftBit;
        int n = atoi(argv[1]);
        //fall back to all GPUs when n is too large or does not divide nSize
        //(gpu_multi requires an even split)
        nGpus = ((n > nGpus || nSize%n !=0)?nGpus:n);
        memSize = nSize*sizeof(float)*3/nGpus/1024/1024;
        //verify per-card device memory is sufficient (three float arrays)
        if(memSize > globalMemSize)
        {
            printf("one gpu memory not enough gater %dMB\n",globalMemSize);
            exit(-1);
        }
    }
    else
    {
        printf("parameter 1:ngpus 2:matrix dim 2^(_)\n");
        exit(-1);
    }
    unsigned long nBytes = nSize * sizeof(float); //bytes per vector
    //pinned host memory — required for cudaMemcpyAsync overlap in gpu_multi
    CudaCall(cudaMallocHost((void**)&hostA,nBytes));
    CudaCall(cudaMallocHost((void**)&hostB,nBytes));
    CudaCall(cudaMallocHost((void**)&hostResult,nBytes));
    CudaCall(cudaMallocHost((void**)&deviceResult,nBytes));
    initData(hostA,hostB,nSize);
    //CPU serial baseline
    long cpuTime = serial(nSize);
    //multi-GPU computation
    float gpuTime = gpu_multi(deviceResult,nSize,nGpus);
    //FIX: verify the GPU result against the CPU baseline — the original
    //computed both but never compared them. A single float multiply is
    //exactly rounded on both CPU and GPU, so exact equality is expected.
    for(unsigned long i=0;i<nSize;i++)
    {
        if(hostResult[i] != deviceResult[i])
        {
            printf("result mismatch at %lu: cpu %f gpu %f\n",i,hostResult[i],deviceResult[i]);
            exit(-1);
        }
    }
    //FIX: leftBit is an int, so its conversion is %d (was %ld — undefined
    //behavior in printf)
    printf("單個向量長度 2^%d,單個顯卡三個數組需要顯存 %dMB,使用 %d個GPU,cpu串行耗時 %ldms,GPU并行數據傳輸和計算耗時 %fms,加速比: %f\n",\
    leftBit,memSize,nGpus,cpuTime,gpuTime,cpuTime/gpuTime);
    cudaFreeHost(hostA);
    cudaFreeHost(hostB);
    cudaFreeHost(hostResult);
    cudaFreeHost(deviceResult);
    return 0;
}
3. 測試腳本
#!/bin/bash
# Build the benchmark binary
nvcc pointMul.cu -o pointMul
dir=out
# Truncate the output FILE (the original comment said "clear folder", but
# $dir is a regular file that tee/echo append to)
> "$dir"
# FIX: use "$dir" consistently instead of hard-coding the literal filename
echo "start $(date)" >> "$dir"
# GPU counts to test
nGpus=(1 2 3 4)
# data sizes 2^(S)
S=(24 28 30 31)
# 3 runs per (gpu count, size) combination
for n in "${nGpus[@]}"; do
    for s in "${S[@]}"; do
        for((i=0;i<3;i++)); do
            yhrun -N1 -n1 -pTH_GPU ./pointMul "$n" "$s" | tee -a "$dir"
        done
    done
done
echo "end $(date)" >> "$dir"
4. 測試數據
由于測試腳本的限制,CPU串行計算在GPU單卡(K80 12G顯存)、雙卡、四卡測試中分別跑了一輪,數據如下:
| 數據長度 | gpu單卡(ms) | gpu2個卡(ms) | gpu4個卡(ms) |
|---|---|---|---|
| 2^24 | 44.3 | 44.7 | 44 |
| 2^28 | 719.7 | 717.3 | 703.3 |
| 2^30 | - | 2988.7 | 2951.7 |
| 2^31 | - | - | 5906.7 |
GPU測試耗時及加速比數據:
| 數據長度 | gpu單卡(ms) | gpu2個卡(ms) | gpu4個卡(ms) |
|---|---|---|---|
| 2^24(耗時/加速比) | 27.9/1.6 | 17.7/2.5 | 17.1/2.6 |
| 2^28(耗時/加速比) | 399.8/1.8 | 273.6/2.6 | 273.9/2.6 |
| 2^30(耗時/加速比) | 顯存不足 | 985/3.0 | 1056.3/2.8 |
| 2^31(耗時/加速比) | 顯存不足 | 顯存不足 | 1942.4/3.0 |
5. 結果分析
(1)GPU比CPU計算有明顯的性能提升,根據數據規模,數據量越大提升越明顯。
(2)GPU數量越多,計算效率提升越高,數據規模越大,提升越明顯。

浙公網安備 33010602011771號