c++ - 我的 Cuda 脚本数组输出错误

这个周末我想学习 Cuda。

我想做的是c=a+b。每个变量(a、b 和 c)都是一个包含 5 个元素的数组。

我对结果有疑问。这是我想要的结果:

{a1, a2, a3, a4, a5} = {11.000000, 21.000000, 31.000000, 41.000000, 51.000000}
{b1, b2, b3, b4, b5} = {1.000000, 3.000000, 5.000000, 7.000000, 11.000000}
{c1, c2, c3, c4, c5} = {12.000000, 24.000000, 36.000000, 48.000000, 62.000000}

但这就是我得到的:

PS E:\testing\cuda2\Debug> .\cuda2.exe
{a1, a2, a3, a4, a5} = {11.000000, 21.000000, 31.000000, 41.000000, 51.000000}
{b1, b2, b3, b4, b5} = {1.000000, 3.000000, 5.000000, 7.000000, 11.000000}
{c1, c2, c3, c4, c5} = {12.000000, 24.000000, 0.000000, 0.000000, 0.000000}

如您所见，结果 (c3, c4, c5) 是错误的。
请告诉我如何使下面的代码做正确的事情。

我正在使用 VS2015 和 Cuda 工具包 8。我在项目解决方案中创建了 3 个文件:main.cpp、simple_math.cu、simple_math.cuh；

主要.cpp

#include "simple_math.cuh"
#include <iostream> // fprintf


int main()
{
    const int arraySize = 5;
    float a[arraySize] = { 11, 21, 31, 41, 51 };
    float b[arraySize] = { 1, 3, 5, 7, 11 };
    double c[arraySize] = { 0, 0, 0, 0, 0 };

    cudaError_t cudaStatus = mathWithCuda(c, a, b, arraySize, ADD);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "mathWithCuda failed!");
        return 1;
    }


    fprintf(stdout, "{a1, a2, a3, a4, a5} = {%f, %f, %f, %f, %f} \n{b1, b2, b3, b4, b5} = {%f, %f, %f, %f, %f} \n{c1, c2, c3, c4, c5} = {%f, %f, %f, %f, %f}",
        a[0], a[1], a[2], a[3], a[4], b[0], b[1], b[2], b[3], b[4], c[0], c[1], c[2], c[3], c[4]);


    cudaStatus = cudaDeviceReset();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceReset failed!");
        return 1;
    }

    return 0;
}

simple_math.cuh

#ifndef SIMPLE_MATH_CUH_
#define SIMPLE_MATH_CUH_


#include <cuda_runtime.h> // cudaError_t

#define ADD 0
#define SUB 1
#define MUL 2
#define DIV 3


cudaError_t mathWithCuda(double *c, const float *a, const float *b, unsigned int size, int mode);

__global__ void addKernel(double *c, const float *a, const float *b);
__global__ void subKernel(double *c, const float *a, const float *b);
__global__ void mulKernel(double *c, const float *a, const float *b);
__global__ void divKernel(double *c, const float *a, const float *b);


#endif

simple_math.cu

#include <device_launch_parameters.h> // threadIdx
#include <stdio.h> // fprintf
#include <math.h> // ceil
#include "simple_math.cuh"


cudaError_t mathWithCuda(double *c, const float *a, const float *b, unsigned int arraySize, int mode)
{
    float *dev_a, *dev_b;
    double *dev_c;
    cudaError_t cudaStatus;


    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?");
        goto Error;
    }


    if ((cudaStatus = cudaMalloc((void**)&dev_c, arraySize * sizeof(double))) != cudaSuccess ||
        (cudaStatus = cudaMalloc((void**)&dev_a, arraySize * sizeof(float))) != cudaSuccess ||
        (cudaStatus = cudaMalloc((void**)&dev_b, arraySize * sizeof(float))) != cudaSuccess)
    {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }


    if ((cudaStatus = cudaMemcpy(dev_a, a, arraySize * sizeof(float), cudaMemcpyHostToDevice)) != cudaSuccess ||
        (cudaStatus = cudaMemcpy(dev_b, b, arraySize * sizeof(float), cudaMemcpyHostToDevice)) != cudaSuccess)
    {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }


    int blocksPerGrid, threadsPerBlock;
    if (arraySize < 512) {
        blocksPerGrid = 1;
        threadsPerBlock = arraySize;
    }
    else {
        blocksPerGrid = ceil(double(arraySize) / double(threadsPerBlock));
        threadsPerBlock = 512;
    }


    switch (mode)
    {
    case 0:
        addKernel <<<blocksPerGrid, threadsPerBlock >>>(dev_c, dev_a, dev_b);
        break;
    case 1:
        subKernel <<<blocksPerGrid, threadsPerBlock >>>(dev_c, dev_a, dev_b);
        break;
    case 2:
        mulKernel <<<blocksPerGrid, threadsPerBlock >>>(dev_c, dev_a, dev_b);
        break;
    case 3:
        divKernel <<<blocksPerGrid, threadsPerBlock >>>(dev_c, dev_a, dev_b);
        break;
    default:
        // nothing
        break;
    }


    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "Kernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }


    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching Kernel!\n", cudaStatus);
        goto Error;
    }


    cudaStatus = cudaMemcpy(c, dev_c, arraySize * sizeof(float), cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }


Error:
    cudaFree(dev_c);
    cudaFree(dev_a);
    cudaFree(dev_b);


    return cudaStatus;
}


__global__ void addKernel(double *c, const float *a, const float *b)
{
    int i = threadIdx.x;
    c[i] = __fadd_rn(a[i], b[i]); // a + b
}    

__global__ void subKernel(double *c, const float *a, const float *b)
{
    int i = threadIdx.x;
    c[i] = __fsub_rn(a[i], b[i]); // a - b
}

__global__ void mulKernel(double *c, const float *a, const float *b)
{
    int i = threadIdx.x;
    c[i] = __fmul_rn(a[i], b[i]); // a * b
}

__global__ void divKernel(double *c, const float *a, const float *b)
{
    int i = threadIdx.x;
    c[i] = __fdividef(a[i], b[i]); // a/b
}

最佳答案

问题似乎出在这里:

cudaStatus = cudaMemcpy(c, dev_c, arraySize * sizeof(float), cudaMemcpyDeviceToHost);

我认为你应该复制 arraySize * sizeof(double) 字节。

关于c++ - 我的 Cuda 脚本数组输出错误，我们在Stack Overflow上找到一个类似的问题： https://stackoverflow.com/questions/43718022/

c++ - 我的 Cuda 脚本数组输出错误

上一篇：c++ - 如何访问 PCLPointCloud2 类型的点

下一篇：c++ - 似乎无法通过 ifstream 逐行读取 tmp 文件