我试图将所有 CUDA 代码放到单独的 test.cu 文件中,并使用 test.h 文件从我的 main.cpp 文件中调用它。但是当我尝试从设备获取数据时,我总是收到错误“ExampleSeparate.exe 中 0x0F277552 (nvcuda.dll) 处的未处理异常:0xC0000005:访问冲突写入位置 0x04A8D000。”
你能告诉我代码有什么问题吗?将内核代码和代码的主要部分分离到不同的文件中我做错了什么?最好的方法是什么?
我知道如何在 OpenCL 中做到这一点,但在 CUDA 中却没能做到。
main.cpp
// Host-side driver (body of main): loads a 32-bpp bitmap, sizes the output
// buffer to one byte per 4 input bytes, and hands both buffers to my_cuda().
printf("My CUDA example.\n");
int iWidth, iHeight, iBpp, cycles_max = 100;
vector<unsigned char> pDataIn;
vector<unsigned char> pDataOut;
unsigned int SizeIn, SizeOut;
unsigned char *devDatOut, *devDatIn, *PInData, *POutData, *DatIn, *DatOut;
int error1 = LoadBmpFile(L"3840x2160.bmp", iWidth, iHeight, iBpp, pDataIn);
// NOTE(review): a failed load is only reported; execution continues below.
if (error1 != 0 || pDataIn.size() == 0 || iBpp != 32)
{
printf("error load input file!\n");
}
// One output byte for every 4 input bytes.
pDataOut.resize(pDataIn.size()/4);
// For CUDA
// SizeIn is the size of pDataIn in BYTES (pDataIn is a vector<unsigned char>).
// NOTE(review): my_cuda() multiplies SizeIn by sizeof(uchar4), i.e. it treats
// SizeIn as an element count, so a byte count here is 4x too large.
SizeIn = pDataIn.size();
SizeOut = pDataOut.size();
PInData = pDataIn.data();
POutData = pDataOut.data();
// For CPU
DatIn = pDataIn.data();
DatOut = pDataOut.data();
// The byte buffer is reinterpreted as uchar4 elements for the call.
my_cuda((uchar4*)PInData, POutData, SizeIn, SizeOut);
return 0;
test.h
void my_cuda(uchar4* PInData, unsigned char *POutData, unsigned int SizeIn, unsigned int SizeOut);
test.cu
// Wraps a CUDA runtime call so that its return code is checked and, on
// failure, reported together with the file and line of the call site.
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
// Checks a CUDA status code.
//   code  - status returned by a CUDA runtime call
//   file  - source file of the call site (supplied by gpuErrchk)
//   line  - source line of the call site (supplied by gpuErrchk)
//   abort - when true (the default), terminate the process on failure
// Does nothing when code == cudaSuccess.
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
// Print the human-readable error string plus the call-site location.
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
// The CUDA error code is used as the process exit status.
if (abort) exit(code);
}
}
// Copies the input and output buffers to the device, launches addMatrix, and
// copies the output buffer back to the host.
//   PInData  - host input buffer, read as SizeIn uchar4 elements
//   POutData - host output buffer of SizeOut bytes; copied to the device
//              before the launch and overwritten with the device result after
//   SizeIn   - number of uchar4 ELEMENTS in PInData (it is multiplied by
//              sizeof(uchar4) below, so it must not be a byte count)
//   SizeOut  - number of unsigned char elements in POutData
void my_cuda(uchar4* PInData, unsigned char *POutData, unsigned int SizeIn, unsigned int SizeOut){
uchar4 *devDatIn;
unsigned char *devDatOut;
printf("Allocate memory on device\n");
gpuErrchk(cudaMalloc((void**)&devDatIn, SizeIn * sizeof(uchar4)));
gpuErrchk(cudaMalloc((void**)&devDatOut, SizeOut * sizeof(unsigned char)));
printf("Copy data on device\n");
// Reads SizeIn * sizeof(uchar4) bytes from the host buffer PInData.
gpuErrchk(cudaMemcpy(devDatIn, PInData, SizeIn * sizeof(uchar4), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(devDatOut, POutData, SizeOut * sizeof(unsigned char), cudaMemcpyHostToDevice));
// 8100 blocks x 1024 threads = 8,294,400 threads, which equals 3840 * 2160.
// Presumably one thread per pixel of the test image; the launch size is
// hard-coded rather than derived from SizeIn/SizeOut.
dim3 blocks(8100, 1, 1);
dim3 threads(1024, 1, 1);
// NOTE(review): addMatrix is not defined in the code shown, and the launch is
// not followed by a cudaGetLastError() check.
addMatrix<<<blocks, threads>>>(devDatIn, devDatOut);
gpuErrchk(cudaMemcpy(POutData, devDatOut, SizeOut * sizeof(unsigned char), cudaMemcpyDeviceToHost));
// The return codes of cudaFree are not checked.
cudaFree(devDatOut);
cudaFree(devDatIn);
// Presumably waits for a key press before returning (conio.h) - confirm.
_getch();
}
最佳答案
在这行代码中:
SizeIn = pDataIn.size();
你的 pDataIn 是一个 vector<unsigned char>,其大小大概刚好能容纳一幅 3840x2160、每个像素 4 个字节的图像。所以 SizeIn 的值应为 3840x2160x4(单位是字节)。
然后你把该 vector 的数据指针赋给一个 unsigned char 指针:
PInData = pDataIn.data();
然后你将该指针强制转换为 uchar4 指针,同时仍然传入以字节为单位的旧 SizeIn:
my_cuda((uchar4*)PInData, POutData, SizeIn, SizeOut);
在你的 my_cuda 函数中,你为设备存储分配的空间是所需大小的 4 倍:
gpuErrchk(cudaMalloc((void**)&devDatIn, SizeIn * sizeof(uchar4)));
然后你尝试从主机向设备复制 4 倍于实际大小的数据:
gpuErrchk(cudaMemcpy(devDatIn, PInData, SizeIn * sizeof(uchar4), cudaMemcpyHostToDevice));
几乎可以肯定,这一行会在主机上引发段错误,因为复制会读到主机缓冲区 pDataIn 的末尾之外。
解决方案可能很简单:
SizeIn = pDataIn.size()/4;
这是一个基于您显示的代码的完整示例,演示了段错误和修复:
$ cat t1135.cu
#include <stdio.h>
#include <vector>
using namespace std;
// Wraps a CUDA runtime call so that its return code is checked and, on
// failure, reported together with the file and line of the call site.
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
// Checks a CUDA status code.
//   code  - status returned by a CUDA runtime call
//   file  - source file of the call site (supplied by gpuErrchk)
//   line  - source line of the call site (supplied by gpuErrchk)
//   abort - when true (the default), terminate the process on failure
// Does nothing when code == cudaSuccess.
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
// Print the human-readable error string plus the call-site location.
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
// The CUDA error code is used as the process exit status.
if (abort) exit(code);
}
}
// Reduced version of the question's my_cuda(): allocates device buffers,
// copies both host buffers to the device and copies the output buffer back.
// The kernel launch is commented out, so only the memory traffic is exercised.
//   PInData  - host input buffer, read as SizeIn uchar4 elements
//   POutData - host output buffer of SizeOut bytes
//   SizeIn   - number of uchar4 ELEMENTS in PInData (multiplied by
//              sizeof(uchar4) below, so it must not be a byte count)
//   SizeOut  - number of unsigned char elements in POutData
void my_cuda(uchar4* PInData, unsigned char *POutData, unsigned int SizeIn, unsigned int SizeOut){
uchar4 *devDatIn;
unsigned char *devDatOut;
printf("Allocate memory on device\n");
gpuErrchk(cudaMalloc((void**)&devDatIn, SizeIn * sizeof(uchar4)));
gpuErrchk(cudaMalloc((void**)&devDatOut, SizeOut * sizeof(unsigned char)));
printf("Copy data on device\n");
// Reads SizeIn * sizeof(uchar4) bytes from the host buffer; this is the copy
// that runs past the end of the buffer when SizeIn is passed as a byte count.
gpuErrchk(cudaMemcpy(devDatIn, PInData, SizeIn * sizeof(uchar4), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(devDatOut, POutData, SizeOut * sizeof(unsigned char), cudaMemcpyHostToDevice));
// Launch configuration kept from the question; unused here because the
// launch below is commented out.
dim3 blocks(8100, 1, 1);
dim3 threads(1024, 1, 1);
//addMatrix<<<blocks, threads>>>(devDatIn, devDatOut);
gpuErrchk(cudaMemcpy(POutData, devDatOut, SizeOut * sizeof(unsigned char), cudaMemcpyDeviceToHost));
// The return codes of cudaFree are not checked.
cudaFree(devDatOut);
cudaFree(devDatIn);
}
// Reproducer for the question's crash. Built without -DFIX it passes the input
// size in bytes (the original behavior); built with -DFIX it passes the number
// of uchar4 elements instead.
int main(){
printf("My CUDA example.\n");
// Stand-in for the loaded bitmap: 3840 * 2160 * 4 bytes.
vector<unsigned char> pDataIn(3840*2160*4);
vector<unsigned char> pDataOut;
unsigned int SizeIn, SizeOut;
unsigned char *PInData, *POutData;
// One output byte for every 4 input bytes.
pDataOut.resize(pDataIn.size()/4);
//... CUDA
#ifdef FIX
// Element count: number of 4-byte uchar4 values in the buffer.
SizeIn = pDataIn.size()/4;
#else
// Byte count: my_cuda() multiplies this by sizeof(uchar4) again.
SizeIn = pDataIn.size();
#endif
SizeOut = pDataOut.size();
PInData = pDataIn.data();
POutData = pDataOut.data();
// The byte buffer is reinterpreted as uchar4 elements for the call.
my_cuda((uchar4*)PInData, POutData, SizeIn, SizeOut);
return 0;
}
$ nvcc -o t1135 t1135.cu
$ ./t1135
My CUDA example.
Allocate memory on device
Copy data on device
Segmentation fault (core dumped)
$ nvcc -DFIX -o t1135 t1135.cu
$ ./t1135
My CUDA example.
Allocate memory on device
Copy data on device
$
关于c++ - CUDA分离内核文件错误,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/36705341/