c - cuda 上的 vector 加法未计算某些元素

这是代码:

#include "common/book.h"

#define N 36 

// Kernel from the question: each thread derives an index `tid` and, when
// tid < N, stores a[tid] + b[tid] into c[tid].
//
// NOTE(review): this is the defective version under discussion. blockIdx.y
// never appears in the `tid` expression, so two blocks that differ only in
// blockIdx.y compute the same set of indices. The block offset is
// blockIdx.x * gridDim.y * (threads per block); with the (3,2) grid of (3,2)
// blocks launched by main that is blockIdx.x * 12, and the in-block offset
// ranges over 0..5, so only indices 0-5, 12-17 and 24-29 are ever written.
__global__ void add(int *a, int *b, int *c) {
    // Block offset (uses blockIdx.x only) plus the thread's row-major offset inside its block.
    int tid = blockIdx.x * gridDim.y * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x; 
    // Guard against indices past the end of the N-element arrays.
    if(tid < N) {
        c[tid] = a[tid] + b[tid];
    }
}

// Host driver from the question: fills a[] with -1 and b[] with i*i, copies
// both to the device, launches `add` on a 3x2 grid of 3x2 blocks, copies the
// result back and prints one "a + b = c" line per element.
//
// NOTE(review): none of the CUDA runtime calls below have their return value
// checked, and the launch is not followed by cudaGetLastError().
int main() {
    int a[N], b[N], c[N]; 
    int *dev_a, *dev_b, *dev_c; 
    // One N-int device buffer per host array.
    cudaMalloc( (void**) &dev_a, N * sizeof(int));
    cudaMalloc( (void**) &dev_b, N * sizeof(int));
    // NOTE(review): dev_c is never initialized (no cudaMemset), so any element
    // the kernel does not write is copied back with whatever the allocation held.
    cudaMalloc( (void**) &dev_c, N * sizeof(int));
    // Host-side inputs: a[i] = -1, b[i] = i squared.
    for (int i = 0; i < N; i++) {
        a[i] = -1; 
        b[i] = i * i;
    }

    // Upload both input arrays.
    cudaMemcpy(
                dev_a, 
                a, 
                N * sizeof(int),
                cudaMemcpyHostToDevice
                   );
    cudaMemcpy(
                dev_b, 
                b, 
                N * sizeof(int),
                cudaMemcpyHostToDevice
                   );
    // 3x2 blocks of 3x2 threads: 6 blocks * 6 threads = 36 threads in total.
    dim3 grid_dim(3, 2);
    dim3 block_dim(3, 2);
    add<<<grid_dim, block_dim>>>(dev_a, dev_b, dev_c);
    // Download the result array.
    cudaMemcpy(
                c, 
                dev_c, 
                N * sizeof(int),
                cudaMemcpyDeviceToHost
                   );
    for (int i = 0; i < N; i++) {
        printf("%d + %d = %d\n", a[i], b[i], c[i]);
    }
    // Release the device buffers.
    cudaFree(dev_a);
    cudaFree(dev_b);
    cudaFree(dev_c);
}

基本上，我想在一个 3x2 布局的网格上对两个 vector 逐元素相加，网格中的每个 block 又采用 3x2 的线程布局。

这是我运行编译的二进制文件时的结果:

-1 + 0 = -1
-1 + 1 = 0
-1 + 4 = 3
-1 + 9 = 8
-1 + 16 = 15
-1 + 25 = 24
-1 + 36 = 0
-1 + 49 = 0
-1 + 64 = 0
-1 + 81 = 0
-1 + 100 = 0
-1 + 121 = 0
-1 + 144 = 143
-1 + 169 = 168
-1 + 196 = 195
-1 + 225 = 224
-1 + 256 = 255
-1 + 289 = 288
-1 + 324 = 0
-1 + 361 = 0
-1 + 400 = 0
-1 + 441 = 0
-1 + 484 = 0
-1 + 529 = 0
-1 + 576 = 575
-1 + 625 = 624
-1 + 676 = 675
-1 + 729 = 728
-1 + 784 = 783
-1 + 841 = 840
-1 + 900 = 0
-1 + 961 = 0
-1 + 1024 = 0
-1 + 1089 = 0
-1 + 1156 = 0
-1 + 1225 = 0

显然有些 block 被忽略了。我还尝试过用其他方式在内核函数 add 中计算 tid，但总有一些 block 的结果缺失。

有什么建议吗？

最佳答案

正如您已经猜测的那样，唯一的问题是您的 tid 计算。

完成这种映射、写出相应索引算式的方法有很多种。对于通用的 2D 网格，我觉得一种方便(也就是容易记住)的做法是：先分别在 x 和 y 方向上求出线程的 2D 全局索引，然后用网格在 x 方向上的宽度乘以 y 索引，再加上 x 索引，得到每个线程唯一的一维索引:

int idy = threadIdx.y+blockDim.y*blockIdx.y;  // y-index
int idx = threadIdx.x+blockDim.x*blockIdx.x;  // x-index
int tid = gridDim.x*blockDim.x*idy + idx;     // thread-unique 1D index

gridDim.x*blockDim.x 是网格在 x 方向上的宽度，以线程数为单位。

当我们在代码中使用此通用 2D 索引方案时，它似乎对我来说可以正常工作:

$ cat t10.cu
#include <stdio.h>

#define N 36

// Element-wise addition kernel for a 2D grid of 2D blocks: c[i] = a[i] + b[i].
//
// Each thread takes its global (column, row) position within the whole grid
// and flattens it row-major, using the grid's width in threads
// (gridDim.x * blockDim.x) as the row pitch. Threads whose flattened index
// falls at or beyond N do nothing.
__global__ void add(int *a, int *b, int *c) {
    const int row = blockIdx.y * blockDim.y + threadIdx.y;   // global y position
    const int col = blockIdx.x * blockDim.x + threadIdx.x;   // global x position
    const unsigned int rowPitch = gridDim.x * blockDim.x;    // threads per grid row
    const int flat = rowPitch * row + col;                   // unique 1D index

    if (flat >= N) {
        return;
    }
    c[flat] = a[flat] + b[flat];
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char *what) {
    if (status == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Host driver: fills a[] with -1 and b[] with i*i, copies both to the device,
// launches `add` on a 3x2 grid of 3x2 blocks (36 threads, one per element),
// copies the result back and prints one "a + b = c" line per element.
//
// Every CUDA runtime call is checked; after the first failure the remaining
// device work is skipped, the buffers are still released, and the process
// exits with status 1. Returns 0 on success.
int main() {
    int a[N], b[N], c[N];
    // Null-initialized so that cudaFree is safe even if an allocation failed.
    int *dev_a = NULL, *dev_b = NULL, *dev_c = NULL;
    const size_t bytes = N * sizeof(int);

    for (int i = 0; i < N; i++) {
        a[i] = -1;
        b[i] = i * i;
    }

    bool ok = cudaOk(cudaMalloc((void**) &dev_a, bytes), "cudaMalloc(dev_a)");
    ok = ok && cudaOk(cudaMalloc((void**) &dev_b, bytes), "cudaMalloc(dev_b)");
    ok = ok && cudaOk(cudaMalloc((void**) &dev_c, bytes), "cudaMalloc(dev_c)");

    ok = ok && cudaOk(cudaMemcpy(dev_a, a, bytes, cudaMemcpyHostToDevice),
                      "cudaMemcpy(dev_a)");
    ok = ok && cudaOk(cudaMemcpy(dev_b, b, bytes, cudaMemcpyHostToDevice),
                      "cudaMemcpy(dev_b)");

    if (ok) {
        dim3 grid_dim(3, 2);
        dim3 block_dim(3, 2);
        add<<<grid_dim, block_dim>>>(dev_a, dev_b, dev_c);
        // A kernel launch returns nothing; configuration errors are only
        // visible through cudaGetLastError().
        ok = cudaOk(cudaGetLastError(), "add kernel launch");
    }

    // The blocking copy also surfaces errors raised while the kernel ran.
    ok = ok && cudaOk(cudaMemcpy(c, dev_c, bytes, cudaMemcpyDeviceToHost),
                      "cudaMemcpy(c)");

    if (ok) {
        for (int i = 0; i < N; i++) {
            printf("%d + %d = %d\n", a[i], b[i], c[i]);
        }
    }

    // Always release the buffers; the call is placed first so it runs even
    // when `ok` is already false.
    ok = cudaOk(cudaFree(dev_a), "cudaFree(dev_a)") && ok;
    ok = cudaOk(cudaFree(dev_b), "cudaFree(dev_b)") && ok;
    ok = cudaOk(cudaFree(dev_c), "cudaFree(dev_c)") && ok;

    return ok ? 0 : 1;
}
$ nvcc -arch=sm_35 -o t10 t10.cu
$ cuda-memcheck ./t10
========= CUDA-MEMCHECK
-1 + 0 = -1
-1 + 1 = 0
-1 + 4 = 3
-1 + 9 = 8
-1 + 16 = 15
-1 + 25 = 24
-1 + 36 = 35
-1 + 49 = 48
-1 + 64 = 63
-1 + 81 = 80
-1 + 100 = 99
-1 + 121 = 120
-1 + 144 = 143
-1 + 169 = 168
-1 + 196 = 195
-1 + 225 = 224
-1 + 256 = 255
-1 + 289 = 288
-1 + 324 = 323
-1 + 361 = 360
-1 + 400 = 399
-1 + 441 = 440
-1 + 484 = 483
-1 + 529 = 528
-1 + 576 = 575
-1 + 625 = 624
-1 + 676 = 675
-1 + 729 = 728
-1 + 784 = 783
-1 + 841 = 840
-1 + 900 = 899
-1 + 961 = 960
-1 + 1024 = 1023
-1 + 1089 = 1088
-1 + 1156 = 1155
-1 + 1225 = 1224
========= ERROR SUMMARY: 0 errors
$

以上代码应该能给出正确的结果。就性能而言，对于这个玩具问题，这未必是最高效的映射方式。本例中每个线程 block 的线程数不是 32 的倍数，而高效的 CUDA 编程通常不建议这样做。对于这种情况，我的建议不是去为它设计一个(在性能/效率上)最优的映射，而是重新组织线程 block，使每个 block 的线程数为 32 的整数倍；我还建议至少考虑让 block 在 x 维度上有 16 或 32 个线程，这样索引更容易理解，也能获得接近最优的内存访问性能。

关于c - cuda 上的 vector 加法未计算某些元素，我们在Stack Overflow上找到一个类似的问题： https://stackoverflow.com/questions/47909694/

c - cuda 上的 vector 加法未计算某些元素

上一篇：shell - # 和 ! 之间有空格在 shebang (#!/usr/bin/ksh)

下一篇：coq - 程序修复点: recursive call in `let` and hypothesis of the obligation