假设我有一个结构如下:
// Record of three integer fields; the question stores these in an array on the host and reads them from a kernel.
typedef struct values{
// NOTE(review): repeating `int` after each comma and omitting the terminating `;` is not a valid member declaration; presumably `int one, two, three;` was intended.
int one, int two, int three
} values;
现在,假设我在主机上创建了一个 values 结构体数组,并用随机数据填充它:
// Host-side pointer to the array; main() assigns it the result of malloc.
// NOTE(review): `values vals*;` is not a valid declaration; presumably `values* vals;` was intended.
values vals*;
// Device-global pointer variable that myKernel dereferences. It holds only an address; it provides no storage for the array itself.
__device__ values* d_vals;
// Host-only setup from the question: allocates the array and fills it. Nothing is transferred to the device here.
int main(){
// Room for A_LARGE_NUMBER elements; the result of malloc is not checked.
vals = (values*)malloc(sizeof(values) * A_LARGE_NUMBER);
// NOTE(review): PopulateWithDate is not defined in the visible text; presumably it writes through `vals`.
PopulateWithDate(); //populates vals with random data
}
现在我希望能够将这些值复制到设备,这样我就可以像这样在我的内核中访问它们:
// Reads field `one` of element 0 through the __device__ pointer d_vals.
// The dereference happens in device code, so d_vals must already contain a valid device address when the kernel runs.
__global__ void myKernel(){
printf("%d", d_vals[0].one);//I don't really want to print, but whenever I try to access I get an error
}
无论我尝试什么,我都会遇到非法内存访问错误。
这是我目前的尝试:
// The question's attempted host-to-device transfer, followed by a launch of myKernel.
int main(){
vals = (values*)malloc(sizeof(values) * A_LARGE_NUMBER);
PopulateWithDate(); //populates vals with random data
values* d_ptr;
// Stores the device address of the symbol d_vals (the pointer variable itself) into d_ptr ...
cudaGetSymbolAddress((void**)&d_ptr, d_vals);
// ... and this call immediately overwrites d_ptr with a fresh allocation, so the address obtained above is discarded.
cudaMalloc((void**)&d_ptr, A_LARGE_NUMBER * sizeof(values));
// NOTE(review): three problems on this call:
//   - the destination is the local d_ptr, not the __device__ symbol d_vals;
//   - the source is &vals, the address of the host pointer variable, not the array it points to;
//   - the byte count is the size of the whole array.
// No statement in this function ever stores an address into d_vals.
cudaMemcpyToSymbol(d_ptr, &vals, sizeof(values) * A_LARGE_NUMBER);
cudaDeviceSynchronize();
// Launch configuration: a 2x2 grid of 16x16-thread blocks.
dim3 blocksPerGrid(2, 2);
dim3 threadsPerBlock(16, 16);
// NOTE(review): none of the CUDA return codes above are inspected, and nothing waits for the kernel after this launch.
myKernel<< <blocksPerGrid, threadsPerBlock >> >();
}
最佳答案
对于您目前所展示的内容,使用 __device__
指针变量只会带来不必要的复杂性。只需用 cudaMalloc
进行普通的动态分配来获得设备存储,其余部分遵循与任何 CUDA 示例代码(例如 vectorAdd)类似的方法即可。下面是一个例子:
$ cat t1315.cu
#include <stdio.h>
#define A_LARGE_NUMBER 10
// Plain record of three integers; instances are stored contiguously in an
// array that is filled on the host and copied to the device.
struct values{
  int one;
  int two;
  int three;
};
// Host-side array of A_LARGE_NUMBER elements: allocated with malloc in main() and filled by PopulateWithData().
values *vals;
// Prints field `one` of the first element of the array passed in.
//   d_vals: pointer to the array; it is dereferenced in device code, so it
//           must be a device address (main() passes the cudaMalloc result).
// Every launched thread executes the printf; main() launches a single thread.
__global__ void myKernel(values *d_vals){
  const values &first = *d_vals;
  printf("%d\n", first.one);
}
// Fills all A_LARGE_NUMBER elements of the host array `vals` with the fixed
// values one=1, two=2, three=3. `vals` must already point at the allocation.
void PopulateWithData(){
  int idx = 0;
  while (idx < A_LARGE_NUMBER){
    values &slot = vals[idx];
    slot.one = 1;
    slot.two = 2;
    slot.three = 3;
    ++idx;
  }
}
// Reports a failed CUDA runtime call on stderr.
//   status: return code of the call (or of cudaGetLastError after a launch)
//   what:   short label used in the message
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char *what){
  if (status == cudaSuccess) return true;
  fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
  return false;
}

// Allocates and fills the host array, copies it to a device buffer, and runs
// myKernel on that buffer with a single thread.
// Returns 0 on success and 1 if the host allocation or any CUDA call fails.
// Both buffers are released on every path.
int main(){
  const size_t bytes = A_LARGE_NUMBER * sizeof(values);

  vals = (values*)malloc(bytes);
  if (vals == NULL){
    fprintf(stderr, "host allocation of %lu bytes failed\n", (unsigned long)bytes);
    return 1;
  }
  PopulateWithData(); // fills every element with one=1, two=2, three=3

  values* d_ptr = NULL; // stays NULL if cudaMalloc fails; cudaFree(NULL) is a no-op
  int rc = 1;
  if (cudaOk(cudaMalloc((void**)&d_ptr, bytes), "cudaMalloc") &&
      cudaOk(cudaMemcpy(d_ptr, vals, bytes, cudaMemcpyHostToDevice), "cudaMemcpy")){
    dim3 blocksPerGrid(1,1);
    dim3 threadsPerBlock(1, 1);
    myKernel<<<blocksPerGrid, threadsPerBlock>>>(d_ptr);
    // A launch does not return a status: cudaGetLastError reports bad launch
    // configurations, and the synchronize reports faults raised while the
    // kernel ran (it also waits for the kernel before the buffers are freed).
    if (cudaOk(cudaGetLastError(), "myKernel launch") &&
        cudaOk(cudaDeviceSynchronize(), "cudaDeviceSynchronize")){
      rc = 0;
    }
  }

  cudaFree(d_ptr);
  free(vals);
  vals = NULL;
  return rc;
}
$ nvcc -arch=sm_35 -o t1315 t1315.cu
$ cuda-memcheck ./t1315
========= CUDA-MEMCHECK
1
========= ERROR SUMMARY: 0 errors
$
您在所展示的内容中存在各种其他基本(非 CUDA)编码错误,我不会尝试逐一检查它们。
如果您确实想保留 __device__
指针变量,并用它来指向设备数据(结构体数组),那么仍然需要使用 cudaMalloc
,并且整个过程需要一个额外的步骤:用 cudaMemcpyToSymbol 把 cudaMalloc 得到的设备地址写入该 __device__ 指针变量。您可以按照此处所链接答案中的示例进行操作。
在该示例之后,这里是对上述代码的一组更改,以使其使用 __device__
指针变量而不是作为内核参数传递的指针:
$ cat t1315.cu
#include <stdio.h>
#define A_LARGE_NUMBER 10
// Plain record of three integers; instances are stored contiguously in an
// array that is filled on the host and copied to the device.
struct values{
  int one;
  int two;
  int three;
};
// Host-side array of A_LARGE_NUMBER elements: allocated with malloc in main() and filled by PopulateWithData().
values *vals;
// Device-global pointer variable read by myKernel. It holds only an address:
// main() allocates the array with cudaMalloc and then writes that device
// address into this symbol with cudaMemcpyToSymbol before launching the kernel.
__device__ values *d_vals;
// Prints field `one` of the first element reached through the __device__
// pointer d_vals. d_vals must have been set to a device address before the
// launch (main() does this with cudaMemcpyToSymbol).
// Every launched thread executes the printf; main() launches a single thread.
__global__ void myKernel(){
  const values *base = d_vals;
  printf("%d\n", base->one);
}
// Fills all A_LARGE_NUMBER elements of the host array `vals` with the fixed
// values one=1, two=2, three=3. `vals` must already point at the allocation.
void PopulateWithData(){
  int idx = 0;
  while (idx < A_LARGE_NUMBER){
    values &slot = vals[idx];
    slot.one = 1;
    slot.two = 2;
    slot.three = 3;
    ++idx;
  }
}
// Reports a failed CUDA runtime call on stderr.
//   status: return code of the call (or of cudaGetLastError after a launch)
//   what:   short label used in the message
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char *what){
  if (status == cudaSuccess) return true;
  fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
  return false;
}

// Allocates and fills the host array, copies it to a device buffer, stores
// that buffer's address in the __device__ pointer d_vals, and runs myKernel
// with a single thread.
// Returns 0 on success and 1 if the host allocation or any CUDA call fails.
// Both buffers are released on every path.
int main(){
  const size_t bytes = A_LARGE_NUMBER * sizeof(values);

  vals = (values*)malloc(bytes);
  if (vals == NULL){
    fprintf(stderr, "host allocation of %lu bytes failed\n", (unsigned long)bytes);
    return 1;
  }
  PopulateWithData(); // fills every element with one=1, two=2, three=3

  values* d_ptr = NULL; // stays NULL if cudaMalloc fails; cudaFree(NULL) is a no-op
  int rc = 1;
  if (cudaOk(cudaMalloc((void**)&d_ptr, bytes), "cudaMalloc") &&
      cudaOk(cudaMemcpy(d_ptr, vals, bytes, cudaMemcpyHostToDevice), "cudaMemcpy") &&
      // Copies the pointer VALUE (sizeof(values*) bytes) into the symbol, so
      // that d_vals on the device points at the buffer allocated above.
      cudaOk(cudaMemcpyToSymbol(d_vals, &d_ptr, sizeof(values*)), "cudaMemcpyToSymbol")){
    dim3 blocksPerGrid(1,1);
    dim3 threadsPerBlock(1, 1);
    myKernel<<<blocksPerGrid, threadsPerBlock>>>();
    // A launch does not return a status: cudaGetLastError reports bad launch
    // configurations, and the synchronize reports faults raised while the
    // kernel ran (it also waits for the kernel before the buffers are freed).
    if (cudaOk(cudaGetLastError(), "myKernel launch") &&
        cudaOk(cudaDeviceSynchronize(), "cudaDeviceSynchronize")){
      rc = 0;
    }
  }

  // d_vals still holds the freed address after this point; no kernel is
  // launched afterwards.
  cudaFree(d_ptr);
  free(vals);
  vals = NULL;
  return rc;
}
$ nvcc -arch=sm_35 -o t1315 t1315.cu
$ cuda-memcheck ./t1315
========= CUDA-MEMCHECK
1
========= ERROR SUMMARY: 0 errors
$
关于将结构数组从主机复制到设备cuda,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/43175162/