高性能計(jì)算-GPU并行規(guī)約(27)

1. 目標(biāo)：對數(shù)組進(jìn)行求和，并做優(yōu)化對比

2. baseline 代碼

相鄰求和: 根據(jù)blockSize對數(shù)據(jù)分塊,并將數(shù)據(jù)放在共享內(nèi)存,以線程塊為單位,塊內(nèi)線程數(shù)量=數(shù)據(jù)個(gè)數(shù),相鄰配對,用其中第一個(gè)元素索引為ID的線程進(jìn)行計(jì)算,計(jì)算結(jié)果放在第一個(gè)元素位置,循環(huán)進(jìn)行下一輪計(jì)算,最后塊求和計(jì)算結(jié)果賦值到全局內(nèi)存以blockIdx.x為id的數(shù)組中,拷貝到主機(jī)對數(shù)組求和得到所有數(shù)字之和。

#include <stdio.h>
#include "common.h"

#define BLOCKSIZE 512

__global__ void kernel_reduce(float* in,long N,float* out,int gridSize)
{
    //每個(gè)線程塊加載與線程數(shù)量相同的數(shù)據(jù)
    __shared__ float arrShared[BLOCKSIZE];
    int tid = threadIdx.x;
    //共享內(nèi)存數(shù)據(jù)初始化
    arrShared[tid] = in[blockIdx.x*blockDim.x + tid];
    __syncthreads();
    //線程塊內(nèi)計(jì)算
    for(long s=1;s<=BLOCKSIZE/2;s*=2)
    {
        if(0 == tid % (2*s))
            arrShared[tid] += arrShared[tid+s];
        __syncthreads();
    }
    if(tid ==0)
        out[blockIdx.x] = arrShared[0];
}

int main(int argc, char ** argv)
{
    long N =1<<10;
    if(argc > 1)
        N = 1<<atoi(argv[1]);
    N *= 32;
    //GPU計(jì)算參數(shù)
    float gpuTime = 0;
    int gridSize = (N-1)/BLOCKSIZE+1;   //數(shù)組初始化
    printf("N %ld ,GridSize %d\n",N,gridSize);
    //為了保證對比準(zhǔn)確性,最后求和不計(jì)入耗時(shí)對比
    float cpuTime = 0;
    float* arrHost = (float*)calloc(N,sizeof(float));
    float* arrResult = (float*)calloc(gridSize,sizeof(float));
    float resultHost = 0;
    initialData(arrHost,N);
    double start = cpuSecond();
    for(int i=0;i<N/BLOCKSIZE;i++)
    {
        float temp =0;
        for(int j=0;j<BLOCKSIZE;j++)
            temp += arrHost[i*BLOCKSIZE+j];
        arrResult[i] = temp;
    }
    for(int i=0;i<N%BLOCKSIZE;i++)
        arrResult[gridSize-1] += arrHost[N/BLOCKSIZE*BLOCKSIZE + i];
    double end = cpuSecond();
    cpuTime = (end - start)*1000;

    //gpu
    float *arrD = NULL;
    float *resultD = NULL;
    float *resultFromD = NULL;
    float resultGpu = 0;
    CudaSafeCall(cudaMalloc((void**)&arrD,N*sizeof(float)));
    CudaSafeCall(cudaMalloc((void**)&resultD,gridSize*sizeof(float)));
    resultFromD = (float*)calloc(gridSize,sizeof(float));
    cudaEvent_t startD;
    cudaEvent_t endD;
    CudaSafeCall(cudaEventCreate(&startD));
    CudaSafeCall(cudaEventCreate(&endD));
    CudaSafeCall(cudaEventRecord(startD));
    CudaSafeCall(cudaMemcpy(arrD,arrHost,N*sizeof(float),cudaMemcpyHostToDevice));

    kernel_reduce<<<gridSize,BLOCKSIZE,sizeof(float)*BLOCKSIZE>>>(arrD,N,resultD,gridSize);
    CudaCheckError();

    CudaSafeCall(cudaMemcpy(resultFromD,resultD,gridSize*sizeof(float),cudaMemcpyDeviceToHost));
    CudaSafeCall(cudaEventRecord(endD));
    cudaEventSynchronize(endD);
    CudaSafeCall(cudaEventElapsedTime(&gpuTime,startD,endD));
    CudaSafeCall(cudaEventDestroy(startD));
    CudaSafeCall(cudaEventDestroy(endD));

    //匯總求和
    for(int i=0;i<gridSize;i++)
    {
        resultHost += arrResult[i];
        resultGpu += resultFromD[i];
    }
    printf("數(shù)據(jù)量 %ld ;串行結(jié)算結(jié)果為%.3f,耗時(shí) %.3f ms;GPU計(jì)算結(jié)果為%.3f,耗時(shí) %.3f ms;加速比為%.3f\n",N,resultHost,cpuTime,resultGpu,gpuTime,cpuTime/gpuTime);

    CudaSafeCall(cudaFree(arrD));
    CudaSafeCall(cudaFree(resultD));
    free(arrHost);
    free(resultFromD);
    return 0;
}

3. 優(yōu)化代碼及思路

優(yōu)化一:從全局內(nèi)存加載時(shí)計(jì)算,一個(gè)線程塊計(jì)算8個(gè)線程塊的數(shù)據(jù)

優(yōu)化二:從全局內(nèi)存加載數(shù)據(jù)時(shí)使用合并訪問,一個(gè)線程處理的數(shù)據(jù)索引步長大小為 blocksize 大小,

這樣處理的話在一個(gè)線程束中訪存可以對 cacheline 連續(xù)命中

優(yōu)化三:線程束內(nèi)做循環(huán)展開

優(yōu)化四:用 shfl API 線程束內(nèi)訪問其他線程寄存器

/*
數(shù)組求和
優(yōu)化一:從全局內(nèi)存加載時(shí)計(jì)算,一個(gè)線程塊計(jì)算8個(gè)線程塊的數(shù)據(jù)
優(yōu)化二:從全局內(nèi)存加載數(shù)據(jù)時(shí)使用合并訪問,一個(gè)線程處理的數(shù)據(jù)索引步長大小為 blocksize 大小,
這樣處理的話在一個(gè)線程束中訪存可以對 cacheline 連續(xù)命中
優(yōu)化三:線程束內(nèi)做循環(huán)展開
優(yōu)化四:用 shfl API 線程束內(nèi)訪問其他線程寄存器 
注意：如果是 float 每個(gè)block cpu和gpu計(jì)算數(shù)據(jù)順序不同，大數(shù)+小數(shù),小數(shù)可能會(huì)被吃掉,精度不能太高
*/

#include <stdio.h>
#include "common.h"

#define BLOCK_SIZE 256  //最大1024
#define NPerThread 8    //每個(gè)線程初始化數(shù)據(jù)個(gè)數(shù)
#define WARP_SIZE 32    //線程束大小  
// #define warpNum 1024/WARP_SIZE

//優(yōu)化三:線程束內(nèi)的規(guī)約,并做循環(huán)展開
//優(yōu)化四:使用 shfl API 可以在warp內(nèi)跨線程訪問寄存器數(shù)據(jù)
template <int warp_len>     //warp 中有效數(shù)據(jù)個(gè)數(shù)
__device__ float warp_reduce(float sum)
// __device__ float warpReduceSum(float sum)
{
    //防止線程塊大小設(shè)置太小,做判斷
    if(warp_len>=32)
        //向前面的線程傳遞寄存器數(shù)值
        sum += __shfl_down_sync(0xffffffff,sum,16);
    if(warp_len>=16) 
        sum += __shfl_down_sync(0xffffffff,sum,8);
    if(warp_len>=8) 
        sum += __shfl_down_sync(0xffffffff,sum,4);
    if(warp_len>=4) 
        sum += __shfl_down_sync(0xffffffff,sum,2);
    if(warp_len>=2) 
        sum += __shfl_down_sync(0xffffffff,sum,1);
    return sum;
}

//線程塊大小 每個(gè)線程要處理的個(gè)數(shù)
template <int blockSize, int NUM_PER_THREAD>
__global__ void block_reduce(float* g_in,float* g_out)
{
    float sum = 0;      //保存當(dāng)前線程加和的數(shù)值
    int tid = threadIdx.x;
    #if 0
    //優(yōu)化:從全局內(nèi)存取數(shù)據(jù)時(shí),每個(gè)線程取多個(gè)數(shù)據(jù)
    //問題:有bank沖突
    int tempId1 = blockIdx.x*blockDim.x*NUM_PER_THREAD;
    int tempId2 = tid*NUM_PER_THREAD;   
    //共享內(nèi)存數(shù)據(jù)初始化,循環(huán)展開
    #pragma unrool
    for(int i=0;i<NUM_PER_THREAD;i++)    
       sum += g_in[tempId1 + tempId2 + i];
    #else 
    //優(yōu)化二:全局內(nèi)存合并訪問增加線程束一次訪存的緩存命中,
    //在 NUM_PER_THREAD 個(gè)線程塊相同位置分別取數(shù)據(jù)給線程求和
    //而不是對一個(gè)塊中連續(xù)內(nèi)存訪問求和
    int temp1 = blockIdx.x * blockSize * NUM_PER_THREAD + tid;
    #pragma unrool
    for(int i=0;i<NUM_PER_THREAD;i++)
        //優(yōu)化一:加載時(shí)計(jì)算
        sum += g_in[temp1 + i * blockSize];
    #endif

    // 共享內(nèi)存more初始化為0
    // __shared__ float arrWarp[WARP_SIZE];
    __shared__ float arrWarp[blockSize/WARP_SIZE];
    //做warp內(nèi)規(guī)約
    //使用 shfl 接口,傳入要從其他線程復(fù)制的寄存器變量 sum
    sum = warp_reduce<WARP_SIZE>(sum);
    //wrap內(nèi) 0號(hào)線程將結(jié)果保存到共享內(nèi)存
    if(tid % WARP_SIZE == 0)
        arrWarp[tid/WARP_SIZE] = sum;
    __syncthreads();

    //將warp規(guī)約結(jié)果放入第一個(gè)warp內(nèi)
    sum = (tid<blockSize/WARP_SIZE) ? arrWarp[tid] : 0;
    //對第一個(gè)warp規(guī)約
    if((tid / WARP_SIZE) == 0)
        sum = warp_reduce<blockSize/WARP_SIZE>(sum);
    //保存整個(gè)線程塊規(guī)約結(jié)果
    if(tid == 0)
        g_out[blockIdx.x] = sum;
}

int main(int argc, char **argv)
{
    printf("0\n");
    int N = 1<<10;
    if(argc > 1)
        N = 1<<atoi(argv[1]);
    N *= 32;

    //GPU參數(shù)計(jì)算
    float gpuTime = 0;
    printf("1\n");
    int gridSize = (N-1)/(BLOCK_SIZE*NPerThread)+1; 
    // int gridSize = (N-1)/BLOCK_SIZE + 1;
    printf("N %ld ,GridSize %d\n",N,gridSize);

    //cpu
    float cpuTime = 0;
    float* arrHost = (float*)calloc(N,sizeof(float));
    float* arrResult = (float*)calloc(gridSize,sizeof(float));
    float resultHost = 0;
    initialData(arrHost,N);
    double start = cpuSecond();
    int number = BLOCK_SIZE*NPerThread;
    for(int i=0;i < N/number;i++)
    {
        float temp = 0;
        for(int j=0;j<number;j++)
            temp += arrHost[i*number+j];
        arrResult[i] = temp;
    }
    for(int i=0;i < N%(BLOCK_SIZE*NPerThread);i++)
        arrResult[gridSize-1] += arrHost[N/number*number + i];
    double end = cpuSecond();
    cpuTime = (end - start)*1000;

    //gpu
    float *arrD = NULL;
    float *resultD = NULL;
    float *resultFromD = NULL;
    float resultGpu = 0;
    CudaSafeCall(cudaMalloc((void**)&arrD,N*sizeof(float)));
    CudaSafeCall(cudaMalloc((void**)&resultD,gridSize*sizeof(float)));
    resultFromD = (float*)calloc(gridSize,sizeof(float));
    cudaEvent_t startD;
    cudaEvent_t endD;
    CudaSafeCall(cudaEventCreate(&startD));
    CudaSafeCall(cudaEventCreate(&endD));
    
    CudaSafeCall(cudaMemcpy(arrD,arrHost,N*sizeof(float),cudaMemcpyHostToDevice));
    CudaSafeCall(cudaEventRecord(startD,0));
    block_reduce<BLOCK_SIZE,NPerThread><<<gridSize,BLOCK_SIZE,BLOCK_SIZE/WARP_SIZE*sizeof(float)>>>(arrD,resultD);
    CudaCheckError();
    CudaSafeCall(cudaEventRecord(endD,0));

    CudaSafeCall(cudaMemcpy(resultFromD,resultD,gridSize*sizeof(float),cudaMemcpyDeviceToHost));
    cudaEventSynchronize(endD);
    CudaSafeCall(cudaEventElapsedTime(&gpuTime,startD,endD));
    CudaSafeCall(cudaEventDestroy(startD));
    CudaSafeCall(cudaEventDestroy(endD));
    
    //匯總求和
    //如果是 float 每個(gè)block cpu和gpu計(jì)算數(shù)據(jù)順序不同，大數(shù)+小數(shù),小數(shù)可能會(huì)被吃掉,精度不能太高

    for(int i=0;i<gridSize;i++)
    {
        resultHost += arrResult[i];
        resultGpu += resultFromD[i];
    }
    printf("數(shù)據(jù)量 %ld ;串行結(jié)算結(jié)果為%.3f,耗時(shí) %.3f ms;GPU計(jì)算結(jié)果為%.3f,耗時(shí) %.3f ms;加速比為%.3f\n",N,resultHost,cpuTime,resultGpu,gpuTime,cpuTime/gpuTime);

    CudaSafeCall(cudaFree(arrD));
    CudaSafeCall(cudaFree(resultD));
    free(arrHost);
    free(arrResult);
    free(resultFromD);
    return 0;
}

5. 測試結(jié)果

	串行(ms)	gpu(ms)	加速比
baseline	95.2	5.3	17.8
優(yōu)化	89.1	1.029	87
thrust		1.158

6. 結(jié)果分析

（1）baseline版本相鄰求和，時(shí)間復(fù)雜度從 O(n)，降為log(n)。
（2）經(jīng)測試優(yōu)化中的優(yōu)化二合并訪問內(nèi)存能加速3倍，還有加載時(shí)計(jì)算也對性能提升較大。
（3）經(jīng)過優(yōu)化后的代碼達(dá)到了與 thrust reduce差不多的效率。

7. 注意

（1）如果使用float 數(shù)據(jù)類型規(guī)約，CPU和GPU對一個(gè)block處理的數(shù)據(jù)順序不同，大數(shù)加小數(shù)會(huì)發(fā)生大數(shù)吃小數(shù)的情況，計(jì)算精度不一樣，誤差精度不應(yīng)該設(shè)置太高。

posted @ 2025-03-16 23:11 安洛8 閱讀(63) 評論(0) 收藏舉報(bào)

刷新頁面返回頂部

anluo8