c++ - CUDA shuffle instruction reduction slower than shared memory reduction?

Tags: c++ cuda nvidia gpu

A warp reduction based on shuffle instructions is expected to be faster than a reduction that goes through shared or global memory, as described in:

https://devblogs.nvidia.com/parallelforall/faster-parallel-reductions-kepler/

https://devblogs.nvidia.com/parallelforall/cuda-pro-tip-kepler-shuffle/
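
(Aside: the code below uses the legacy __shfl_down intrinsic, which was current on the Kepler-era toolkits those posts target. On CUDA 9 and later the same warp sum is written with the _sync variant; a minimal sketch, not part of the original question:)

__inline__ __device__ float warpReduceSumSync(float val) {
    // 0xffffffff: all 32 lanes of the warp participate in the shuffle
    for (int offset = 16; offset > 0; offset /= 2)
        val += __shfl_down_sync(0xffffffff, val, offset);
    return val;   // after the loop, lane 0 holds the warp total
}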

In the code below I try to verify this:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cuda_profiler_api.h>
#include <stdio.h>
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>


__inline__ __device__
float warpReduceSum(float val) {
    for (int offset = 16; offset > 0; offset /= 2)
        val += __shfl_down(val, offset); // legacy pre-CUDA-9 intrinsic; use __shfl_down_sync(0xffffffff, val, offset) on CUDA 9+
    return val;
}

__inline__ __device__
float blockReduceSum(float val) {
    static __shared__ float shared[32]; // one partial sum per warp; must be float, not int, or the sums are truncated
    int lane = threadIdx.x%32;
    int wid = threadIdx.x / 32;
    val = warpReduceSum(val);

    //write reduced value to shared memory
    if (lane == 0) shared[wid] = val;
    __syncthreads();

    //ensure we only grab a value from shared memory if that warp existed
    val = (threadIdx.x < blockDim.x / 32) ? shared[lane] : 0.0f;
    if (wid == 0) val = warpReduceSum(val);

    return val;
}

__global__ void device_reduce_stable_kernel(float *in, float* out, int N) {
    float sum = 0.0f;
    //printf("value = %d ", blockDim.x*gridDim.x);
    for (int i = blockIdx.x*blockDim.x + threadIdx.x; i<N; i += blockDim.x*gridDim.x) {
        sum += in[i];
    }
    sum = blockReduceSum(sum);
    if (threadIdx.x == 0)
        out[blockIdx.x] = sum;
}

void device_reduce_stable(float *in, float* out, int N) {
    //int threads = 512;
    //int blocks = min((N + threads - 1) / threads, 1024);
    const int maxThreadsPerBlock = 1024;
    int threads = maxThreadsPerBlock;
    int blocks = N / maxThreadsPerBlock; // assumes N is a multiple of maxThreadsPerBlock
    device_reduce_stable_kernel<<<blocks, threads>>>(in, out, N);
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess)
        printf("Error: %s\n", cudaGetErrorString(err));

    device_reduce_stable_kernel<<<1, 1024>>>(out, out, blocks);
    err = cudaGetLastError(); // re-check after the second launch; re-using the stale 'err' would miss errors here
    if (err != cudaSuccess)
        printf("Error: %s\n", cudaGetErrorString(err));
}

__global__ void global_reduce_kernel(float * d_out, float * d_in)
{
    int myId = threadIdx.x + blockDim.x * blockIdx.x;
    int tid = threadIdx.x;

    // do reduction in global mem
    for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1)
    {
        if (tid < s)
        {
            d_in[myId] += d_in[myId + s];
        }
        __syncthreads();        // make sure all adds at one stage are done!
    }

    // only thread 0 writes result for this block back to global mem
    if (tid == 0)
    {
        d_out[blockIdx.x] = d_in[myId];
    }
}

__global__ void shmem_reduce_kernel(float * d_out, const float * d_in)
{
    // sdata is allocated in the kernel call: 3rd arg to <<<b, t, shmem>>>
    extern __shared__ float sdata[];

    int myId = threadIdx.x + blockDim.x * blockIdx.x;
    int tid = threadIdx.x;

    // load shared mem from global mem
    sdata[tid] = d_in[myId];
    __syncthreads();            // make sure entire block is loaded!

    // do reduction in shared mem
    for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1)
    {
        if (tid < s)
        {
            sdata[tid] += sdata[tid + s];
        }
        __syncthreads();        // make sure all adds at one stage are done!
    }

    // only thread 0 writes result for this block back to global mem
    if (tid == 0)
    {
        d_out[blockIdx.x] = sdata[0];
    }
}

void reduce(float * d_out, float * d_intermediate, float * d_in,
    int size, bool usesSharedMemory)
{
    // assumes that size is not greater than maxThreadsPerBlock^2
    // and that size is a multiple of maxThreadsPerBlock
    const int maxThreadsPerBlock = 1024;
    int threads = maxThreadsPerBlock;
    int blocks = size / maxThreadsPerBlock;
    if (usesSharedMemory)
    {
        shmem_reduce_kernel<<<blocks, threads, threads * sizeof(float)>>>
            (d_intermediate, d_in);
        cudaError_t err = cudaGetLastError();
        if (err != cudaSuccess)
            printf("Error: %s\n", cudaGetErrorString(err));
    }
    else
    {
        global_reduce_kernel<<<blocks, threads>>>
            (d_intermediate, d_in);
        cudaError_t err = cudaGetLastError();
        if (err != cudaSuccess)
            printf("Error: %s\n", cudaGetErrorString(err));
    }
    // now we're down to one block left, so reduce it
    threads = blocks; // launch one thread for each block in prev step
    blocks = 1;
    if (usesSharedMemory)
    {
        shmem_reduce_kernel<<<blocks, threads, threads * sizeof(float)>>>
            (d_out, d_intermediate);
        cudaError_t err = cudaGetLastError();
        if (err != cudaSuccess)
            printf("Error: %s\n", cudaGetErrorString(err));
    }
    else
    {
        global_reduce_kernel<<<blocks, threads>>>
            (d_out, d_intermediate);
        cudaError_t err = cudaGetLastError();
        if (err != cudaSuccess)
            printf("Error: %s\n", cudaGetErrorString(err));
    }
}

int main()
{
    /*int deviceCount;
    cudaGetDeviceCount(&deviceCount);
    if (deviceCount == 0) {
        fprintf(stderr, "error: no devices supporting CUDA.\n");
        exit(EXIT_FAILURE);
    }
    int dev = 0;
    cudaSetDevice(dev);

    cudaDeviceProp devProps;
    if (cudaGetDeviceProperties(&devProps, dev) == 0)
    {
        printf("Using device %d:\n", dev);
        printf("%s; global mem: %dB; compute v%d.%d; clock: %d kHz\n",
            devProps.name, (int)devProps.totalGlobalMem,
            (int)devProps.major, (int)devProps.minor,
            (int)devProps.clockRate);
    }
*/
    const int ARRAY_SIZE = 2048;
    const int ARRAY_BYTES = ARRAY_SIZE * sizeof(float);

    // generate the input array on the host
    float h_in[ARRAY_SIZE];
    float sum = 0.0f;
    for (int i = 0; i < ARRAY_SIZE; i++) {
        // fill with the values 0, 1, ..., ARRAY_SIZE-1 and keep a reference sum on the host
        h_in[i] = i;
        sum += h_in[i];
    }

    // declare GPU memory pointers
    float * d_in, *d_intermediate, *d_out;

    // allocate GPU memory
    cudaMalloc((void **)&d_in, ARRAY_BYTES);
    cudaMalloc((void **)&d_intermediate, ARRAY_BYTES); // overallocated
    cudaMalloc((void **)&d_out, (ARRAY_SIZE / 1024) * sizeof(float)); // device_reduce_stable writes one partial sum per block, so a single float is not enough

    // transfer the input array to the GPU
    cudaMemcpy(d_in, h_in, ARRAY_BYTES, cudaMemcpyHostToDevice);

    int whichKernel = 2;

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    // launch the kernel
    cudaProfilerStart();
    switch (whichKernel) {
    case 0:
        printf("Running global reduce\n");
        cudaEventRecord(start, 0);
        //for (int i = 0; i < 100; i++)
        //{
            reduce(d_out, d_intermediate, d_in, ARRAY_SIZE, false);
        //}
        cudaEventRecord(stop, 0);
        break;
    case 1:
        printf("Running reduce with shared mem\n");
        cudaEventRecord(start, 0);
        //for (int i = 0; i < 100; i++)
        //{
            reduce(d_out, d_intermediate, d_in, ARRAY_SIZE, true);
        //}
        cudaEventRecord(stop, 0);
        break;
    case 2:
        printf("Running reduce with shuffle instruction\n");
        cudaEventRecord(start, 0);
        /*for (int i = 0; i < 100; i++)
        {*/
            device_reduce_stable(d_in, d_out, ARRAY_SIZE);
        //}
        cudaEventRecord(stop, 0);
        break;
    default:
        fprintf(stderr, "error: ran no kernel\n");
        exit(EXIT_FAILURE);
    }
    cudaProfilerStop();
    cudaEventSynchronize(stop);
    float elapsedTime;
    cudaEventElapsedTime(&elapsedTime, start, stop);
    elapsedTime /= 100.0f;      // average over 100 trials (only meaningful if the trial loops above are enabled)

    // copy back the sum from GPU
    float h_out;
    cudaMemcpy(&h_out, d_out, sizeof(float), cudaMemcpyDeviceToHost);

    printf("average time elapsed: %f\n", elapsedTime);

    // free GPU memory allocation
    cudaFree(d_in);
    cudaFree(d_intermediate);
    cudaFree(d_out);

    return 0;

}
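
(A sanity check, not in the original post: main already computes the reference sum on the host, so the device result could be compared against it before freeing the buffers; fabsf requires <math.h>:)

    // --- Hypothetical correctness check, using 'h_out' and 'sum' from main() above
    if (fabsf(h_out - sum) > 1e-3f * fabsf(sum))
        fprintf(stderr, "mismatch: GPU %f vs CPU %f\n", h_out, sum);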

The results show that the warp-shuffle-based reduction takes almost twice as long as the shared-memory-based reduction, which contradicts the expected behavior. The experiment was run on a Tesla K40c, which has compute capability above 3.0.
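
(One caveat worth noting, my observation rather than part of the original thread: for N = 2048 each launch finishes in a few microseconds, so timing a single launch mostly measures launch overhead. A minimal sketch of a warmed-up, averaged measurement, reusing the names from main above:)

    device_reduce_stable(d_in, d_out, ARRAY_SIZE);   // warm-up launch, excluded from timing
    cudaDeviceSynchronize();
    cudaEventRecord(start, 0);
    for (int i = 0; i < 100; i++)                    // now the /100 average is meaningful
        device_reduce_stable(d_in, d_out, ARRAY_SIZE);
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    float ms;
    cudaEventElapsedTime(&ms, start, stop);
    printf("average time per reduction: %f ms\n", ms / 100.0f);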

Best answer

I compared the following two reduction kernels: version4, which performs the final warp-reduction stage in shared memory only, without warp shuffling, and version5, which uses warp shuffling for the final warp-reduction stage.

version4

template <class T>
__global__ void version4(T *g_idata, T *g_odata, unsigned int N)
{
    extern __shared__ T sdata[];

    unsigned int tid = threadIdx.x;                                 // --- Local thread index
    unsigned int i = blockIdx.x * (blockDim.x * 2) + threadIdx.x;   // --- Global thread index - Fictitiously double the block dimension

    // --- Performs the first level of reduction in registers when reading from global memory. 
    T mySum = (i < N) ? g_idata[i] : 0;
    if (i + blockDim.x < N) mySum += g_idata[i + blockDim.x];
    sdata[tid] = mySum;

    // --- Before going further, we have to make sure that all the shared memory loads have been completed
    __syncthreads();

    // --- Reduction in shared memory. Only half of the threads contribute to reduction.
    for (unsigned int s = blockDim.x / 2; s > 32; s >>= 1)
    {
        if (tid < s) { sdata[tid] = mySum = mySum + sdata[tid + s]; }
        // --- At the end of each iteration loop, we have to make sure that all memory operations have been completed
        __syncthreads();
    }

    // --- Single warp reduction by loop unrolling. Assuming blockDim.x >64
    if (tid < 32) {
        sdata[tid] = mySum = mySum + sdata[tid + 32]; __syncthreads();
        sdata[tid] = mySum = mySum + sdata[tid + 16]; __syncthreads();
        sdata[tid] = mySum = mySum + sdata[tid + 8]; __syncthreads();
        sdata[tid] = mySum = mySum + sdata[tid + 4]; __syncthreads();
        sdata[tid] = mySum = mySum + sdata[tid + 2]; __syncthreads();
        sdata[tid] = mySum = mySum + sdata[tid + 1]; __syncthreads();
    }

    // --- Write result for this block to global memory. At the end of the kernel, global memory will contain the results for the summations of
    //     individual blocks
    if (tid == 0) g_odata[blockIdx.x] = mySum;
}

version5

template <class T>
__global__ void version5(T *g_idata, T *g_odata, unsigned int N)
{
    extern __shared__ T sdata[];

    unsigned int tid = threadIdx.x;                                 // --- Local thread index
    unsigned int i = blockIdx.x * (blockDim.x * 2) + threadIdx.x;   // --- Global thread index - Fictitiously double the block dimension

    // --- Performs the first level of reduction in registers when reading from global memory. 
    T mySum = (i < N) ? g_idata[i] : 0;
    if (i + blockDim.x < N) mySum += g_idata[i + blockDim.x];
    sdata[tid] = mySum;

    // --- Before going further, we have to make sure that all the shared memory loads have been completed
    __syncthreads();

    // --- Reduction in shared memory. Only half of the threads contribute to reduction.
    for (unsigned int s = blockDim.x / 2; s > 32; s >>= 1)
    {
        if (tid < s) { sdata[tid] = mySum = mySum + sdata[tid + s]; }
        // --- At the end of each iteration loop, we have to make sure that all memory operations have been completed
        __syncthreads();
    }

    // --- Single warp reduction by shuffle operations
    if (tid < 32)
    {
        // --- Last iteration removed from the for loop, but needed for shuffle reduction
        mySum += sdata[tid + 32];
        // --- Reduce final warp using shuffle
        //for (int offset = warpSize / 2; offset > 0; offset /= 2) mySum += __shfl_down_sync(0xffffffff, mySum, offset);
        for (int offset = 1; offset < warpSize; offset *= 2) mySum += __shfl_xor_sync(0xffffffff, mySum, offset); // XOR butterfly reduction; the lane-mask argument is 'offset', not 'i'
    }

    // --- Write result for this block to global memory. At the end of the kernel, global memory will contain the results for the summations of
    //     individual blocks
    if (tid == 0) g_odata[blockIdx.x] = mySum;
}
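
(Neither kernel is shown with its host-side launch code. A minimal sketch of a first pass, my own addition, with the name runVersion5 and the block size as assumptions; N must be a multiple of 2 * threads because each block fictitiously doubles its dimension:)

void runVersion5(float *g_idata, float *g_odata, unsigned int N) {
    const int threads = 256;                 // assumed block size; the kernel needs blockDim.x > 64
    int blocks = N / (2 * threads);          // halved grid: each block reads 2 * blockDim.x elements
    version5<float><<<blocks, threads, threads * sizeof(float)>>>(g_idata, g_odata, N);
    // g_odata now holds one partial sum per block; a second pass (or a host
    // loop over 'blocks' values) produces the final total
}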

I confirmed there is no appreciable difference between the two. On my GTX 920M card, the timings are:

N = 33554432
version4 = 27.5 ms
version5 = 27.095 ms

So, I confirm Robert's comment above.

The original question, "c++ - CUDA shuffle instruction reduction slower than shared memory reduction?", is on Stack Overflow: https://stackoverflow.com/questions/44278317/
