c++ - 类问题中的 CUDA 内存管理/指针

我正在努力解决一些内存管理问题。在将结果复制到主机时，我不断收到“未指定的启动失败”。

我的代码非常简单——它在每个线程中生成两个无符号整数(uint)并将它们相乘。我有一个提供随机数的类(class):

// Provides random unsigned integers to device code through cuRAND.
// The constructors are host functions; GetRandomNumber() and IsPrime() are
// device-only member functions.
class CuRandCuRandomNumberProvider
{
public:
    // Sets up the generator states for a launch of numBlocks x threadsPerBlock.
    CuRandCuRandomNumberProvider(dim3 numBlocks, dim3 threadsPerBlock);
    // Overload taking an explicit seed.
    // NOTE(review): its definition is not shown here — presumably it seeds
    // the states with `seed` instead of the current time; confirm.
    CuRandCuRandomNumberProvider(dim3 numBlocks, dim3 threadsPerBlock, unsigned int seed);
    // Returns a random number for the calling device thread.
    __device__ unsigned int GetRandomNumber();
    ~CuRandCuRandomNumberProvider();
protected:
    // cuRAND generator states, indexed by the flat thread index in
    // GetRandomNumber().
    curandState * states;
    __device__ bool IsPrime(unsigned int number);
};

// Allocates one curandState per thread of a (numBlocks x threadsPerBlock)
// launch in device memory and initialises them with setup_kernel, seeded with
// the current time.
//
// On allocation failure the error is reported on std::cerr, `states` is left
// NULL and no kernel is launched.
CuRandCuRandomNumberProvider::CuRandCuRandomNumberProvider(dim3 numBlocks, dim3 threadsPerBlock)
{
    // Give the pointer a defined value before anything can fail.
    this->states = NULL;

    // Only the x and y dimensions are counted, matching the 2D indexing used
    // by setup_kernel and GetRandomNumber().
    int numberOfThreads = threadsPerBlock.x * threadsPerBlock.y * numBlocks.x * numBlocks.y;
    std::cout << numberOfThreads << std::endl;

    cudaError_t err = cudaMalloc ( &this->states, numberOfThreads*sizeof( curandState ) );
    if (err != cudaSuccess)
    {
        std::cerr << "cudaMalloc failed: " << cudaGetErrorString(err) << std::endl;
        this->states = NULL;
        return; // never launch the kernel on an invalid pointer
    }

    setup_kernel <<< numBlocks, threadsPerBlock >>> ( this->states, time(NULL) );

    // A kernel launch returns no status of its own; ask for it explicitly.
    err = cudaGetLastError();
    if (err != cudaSuccess)
    {
        std::cerr << "setup_kernel launch failed: " << cudaGetErrorString(err) << std::endl;
    }
}

// Draws one uniform float from the calling thread's cuRAND state and scales
// it by UINT_MAX to produce an unsigned int.
__device__ unsigned int CuRandCuRandomNumberProvider::GetRandomNumber()
{
    // Flatten the 2D thread coordinates into an index into the state array.
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;
    const unsigned int rowWidth = blockDim.x * gridDim.x;
    const int stateIndex = col + row * rowWidth;

    curandState * myState = this->states + stateIndex;
    const float sample = curand_uniform(myState);

    // Scale in double precision, then convert on return.
    const double scaled = ((double)UINT_MAX) * sample;
    return scaled;
}

setup_kernel 定义在头文件中，如下所示:

// Initialises one cuRAND state per thread of a 2D launch.
// Every thread uses the same seed; the thread's flat index is passed as the
// second argument of curand_init, and the third argument is 0.
// `state` must hold at least one element per launched thread, since the
// kernel performs no bounds check.
__global__ void setup_kernel ( curandState * state, unsigned long seed )
{
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;
    const unsigned int rowWidth = blockDim.x * gridDim.x;
    const int idx = col + row * rowWidth;

    curand_init ( seed, idx, 0, state + idx );
}

我的主内核非常简单，看起来像这样:

// For each thread of a 2D launch, stores two random numbers from `provider`
// in ptr[i].x and ptr[i].y, and their product in ptr[i].z, where i is the
// thread's flat index. `provider` is dereferenced in device code.
__global__  void InitKernel(uint3 * ptr, CuRandCuRandomNumberProvider * provider)
{
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;
    const unsigned int rowWidth = blockDim.x * gridDim.x;
    const int cell = col + row * rowWidth;

    uint3 * out = ptr + cell;
    out->x = provider->GetRandomNumber();
    out->y = provider->GetRandomNumber();
    out->z = out->x * out->y;
}

main 中的执行代码如下，其中最后一个 cudaMemcpy 会导致问题:

// Host-side driver: allocate the result buffers, build the provider, launch
// InitKernel and copy the results back to the host.

// Device-side result buffer (allocated with cudaMalloc below).
uint3 * pqnD;

// Host-side result buffer, zero-filled.
uint3 * pqnH = (uint3*)malloc(sizeof(uint3) * numberOfThreads );
memset(pqnH,0,sizeof(uint3) * numberOfThreads );

HANDLE_ERROR( cudaMalloc( (void**)&pqnD, sizeof(uint3) * numberOfThreads ));

// The provider object itself is created with `new`, i.e. on the host heap.
CuRandCuRandomNumberProvider * provider = new CuRandCuRandomNumberProvider(numBlocks, threadsPerBlock);

// The host pointer `provider` is handed to the kernel unchanged.
InitKernel<<<numBlocks, threadsPerBlock>>>(pqnD, provider);

// The launch above is not checked; a fault raised while the kernel runs is
// reported by the next synchronizing call, which is this cudaMemcpy.
HANDLE_ERROR( cudaMemcpy( pqnH, pqnD, sizeof(uint3) * numberOfThreads, cudaMemcpyDeviceToHost ) ); // this line causes error

HANDLE_ERROR( cudaFree( pqnD ) );

如果我明确地做所有事情，比如:

// Host-side driver, explicit variant: the cuRAND states are allocated and
// initialised here and passed to the kernel as a raw device pointer.

// Device-side result buffer (allocated with cudaMalloc below).
uint3 * pqnD;

// Host-side result buffer, zero-filled.
uint3 * pqnH = (uint3*)malloc(sizeof(uint3) * numberOfThreads );

memset(pqnH,0,sizeof(uint3) * numberOfThreads );

HANDLE_ERROR( cudaMalloc( (void**)&pqnD, sizeof(uint3) * numberOfThreads ));

// One generator state per thread, in device memory.
curandState * states;

// NOTE(review): this allocation and the launch below are not error-checked.
cudaMalloc ( &states, numberOfThreads*sizeof( curandState ) );

setup_kernel <<< numBlocks, threadsPerBlock >>> ( states, time(NULL) );

// NOTE(review): the class as shown declares no constructor taking a
// curandState* as third argument, and `provider` is not passed to the launch
// below — confirm whether this line is needed.
CuRandCuRandomNumberProvider * provider = new CuRandCuRandomNumberProvider(numBlocks, threadsPerBlock, states);


// Only device pointers (pqnD, states) are passed to the kernel here.
InitKernel2<<<numBlocks, threadsPerBlock>>>(pqnD, states);

HANDLE_ERROR( cudaMemcpy( pqnH, pqnD, sizeof(uint3) * numberOfThreads, cudaMemcpyDeviceToHost ) );

HANDLE_ERROR( cudaFree( pqnD ) );

其中 setup_kernel 与前面完全相同，而 InitKernel2 是:

// For each thread of a 2D launch, stores two random numbers drawn from
// `states` in ptr[i].x and ptr[i].y, and their product in ptr[i].z, where i
// is the thread's flat index.
__global__  void InitKernel2(uint3 * ptr, curandState * states)
{
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;
    const unsigned int rowWidth = blockDim.x * gridDim.x;
    const int cell = col + row * rowWidth;

    uint3 * out = ptr + cell;
    out->x = GetRandomNumber(states);
    out->y = GetRandomNumber(states);
    out->z = out->x * out->y;
}

GetRandomNumber 是:

// Draws one uniform float from the calling thread's entry in `states` and
// scales it by UINT_MAX to produce an unsigned int.
__device__ unsigned int GetRandomNumber(curandState * states)
{
    // Flatten the 2D thread coordinates into an index into the state array.
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;
    const unsigned int rowWidth = blockDim.x * gridDim.x;
    const int stateIndex = col + row * rowWidth;

    const float sample = curand_uniform(states + stateIndex);

    // Scale in double precision, then convert on return.
    const double scaled = ((double)UINT_MAX) * sample;
    return scaled;
}

一切都运行得非常顺利。有谁知道我做错了什么？我已经为此苦苦挣扎了几个小时。我想这可能与内存管理或指针传递有关，但我不知道问题出在哪里。

请帮忙:)!

最佳答案

这是非法的:

// `new` allocates the provider object in host memory.
CuRandCuRandomNumberProvider * provider = new CuRandCuRandomNumberProvider(numBlocks, threadsPerBlock);

// The resulting host pointer is passed to device code here.
InitKernel<<<numBlocks, threadsPerBlock>>>(pqnD, provider);

provider 指向您在主机上分配的对象。将该指针传递给设备并在设备代码中解引用它:

// Device code dereferences `provider` to make this call.
ptr[offset].x = provider->GetRandomNumber();

(最终导致:)

// Reading this->states reads a member of the object `this` points to.
register float r =  curand_uniform(&this->states[offset]);

是非法的。

由于您似乎想在主机上设置对象(属于 CuRandCuRandomNumberProvider 类)并将其传递给设备，因此一种可能的解决方法是按值传递对象，而不是指针。这需要进行一些更改，主要是:

// The provider is now a plain object rather than a pointer obtained from `new`.
CuRandCuRandomNumberProvider provider(numBlocks, threadsPerBlock);

在初始化内核中:

// By-value variant of InitKernel: the provider object is a kernel parameter,
// so device code works on its own copy instead of following a host pointer.
// Each thread of a 2D launch writes two random numbers and their product to
// ptr[offset].
__global__  void InitKernel(uint3 * ptr, CuRandCuRandomNumberProvider provider) // change
{
    // Flat index of this thread within the 2D launch.
    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;
    int offset = x + y * blockDim.x * gridDim.x;

    // Member access uses `.` because provider is an object, not a pointer.
    ptr[offset].x = provider.GetRandomNumber();  // change
    ptr[offset].y = provider.GetRandomNumber();  // change
    ptr[offset].z = ptr[offset].x * ptr[offset].y;
}

在 CuRandCuRandomNumberProvider::GetRandomNumber() 中:

// Draws one uniform float from the calling thread's cuRAND state and scales
// it by UINT_MAX to produce an unsigned int.
__device__ unsigned int CuRandCuRandomNumberProvider::GetRandomNumber()
{
    // Flat index of this thread within the 2D launch; selects its state.
    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;
    int offset = x + y * blockDim.x * gridDim.x;
    // `states` is accessed as a plain member, without an explicit this->.
    register float r =  curand_uniform(&(states[offset])); // change
    // Scale in double precision; the result is converted to unsigned int on return.
    return 0 + ((double)UINT_MAX) * r;
}

(我也删除了析构函数原型(prototype)，因为它妨碍了。)

关于c++ - 类问题中的 CUDA 内存管理/指针，我们在Stack Overflow上找到一个类似的问题： https://stackoverflow.com/questions/27348409/

c++ - 类问题中的 CUDA 内存管理/指针

上一篇：c++ - 我似乎无法在 qt creator 中打印几行数据。程序覆盖除最后一行以外的所有输出

下一篇：c++ - CWnd::Create 会产生一个新线程吗？