我声明了一个类 A
和 B
,它们都有一个成员 int* m_pDevicePtr
,其值是使用 cudaMalloc( )
。
我正在检查两个内核的结果:
输入参数为这些设备指针的内核函数:
kernelAdd(int* a, int* b)
。使用这个内核,输出符合预期。
一个签名为
kernelAdd(int* a, B* pB)
的核函数,使用了B
的成员函数,签名为B::GetNumber(int index)
,做同样的工作。...但是这个内核不工作。
代码如下:
#include <iostream>
#include <fstream>
#include <string>
#include <stack>
#include <cstdarg>
#include <limits.h>
#include <windows.h>
#include <tchar.h>
#include <stdio.h>
#include <stdarg.h>
#include <math.h>
#include <malloc.h>
#include <stdlib.h>
#include "cuda_runtime.h"
#include "vector_types.h"
#define checkCudaErrors(val) check((val), #val, __FILE__, __LINE__)
#ifdef __DRIVER_TYPES_H__
#ifndef DEVICE_RESET
#define DEVICE_RESET cudaDeviceReset();
#endif
#else
#ifndef DEVICE_RESET
#define DEVICE_RESET
#endif
#endif
#ifdef __DRIVER_TYPES_H__
static const char *_cudaGetErrorEnum(cudaError_t error) {
return cudaGetErrorName(error);
}
#endif
template <typename T> void check(T result, char const *const func, const char *const file,
int const line) {
if (result) {
fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \"%s\" \n", file, line,
static_cast<unsigned int>(result), _cudaGetErrorEnum(result), func);
DEVICE_RESET
// Make sure we call CUDA Device Reset before exiting
exit(EXIT_FAILURE);
}
}
__global__ void _kInitialArray(int* thearray)
{
int iX = threadIdx.x + blockDim.x * blockIdx.x;
int iY = threadIdx.y + blockDim.y * blockIdx.y;
int iZ = threadIdx.z + blockDim.z * blockIdx.z;
thearray[iX * 16 + iY * 4 + iZ] = iX * 16 + iY * 4 + iZ;
}
extern "C" {
void _cInitialArray(int* thearray)
{
dim3 block(1, 1, 1);
dim3 th(4, 4, 4);
_kInitialArray << <block, th >> > (thearray);
checkCudaErrors(cudaGetLastError());
}
}
class B
{
public:
B()
{
checkCudaErrors(cudaMalloc((void**)&m_pDevicePtr, sizeof(int) * 64));
_cInitialArray(m_pDevicePtr);
}
~B()
{
cudaFree(m_pDevicePtr);
}
__device__ int GetNumber(int index)
{
m_pDevicePtr[index] = m_pDevicePtr[index] + 1;
return m_pDevicePtr[index];
}
int* m_pDevicePtr;
};
__global__ void _kAddArray(int* thearray1, int* thearray2)
{
int iX = threadIdx.x + blockDim.x * blockIdx.x;
int iY = threadIdx.y + blockDim.y * blockIdx.y;
int iZ = threadIdx.z + blockDim.z * blockIdx.z;
thearray2[iX * 16 + iY * 4 + iZ] = thearray2[iX * 16 + iY * 4 + iZ] + 1;
thearray1[iX * 16 + iY * 4 + iZ] = thearray1[iX * 16 + iY * 4 + iZ] + thearray2[iX * 16 + iY * 4 + iZ];
}
__global__ void _kAddArrayB(int* thearray1, B* pB)
{
int iX = threadIdx.x + blockDim.x * blockIdx.x;
int iY = threadIdx.y + blockDim.y * blockIdx.y;
int iZ = threadIdx.z + blockDim.z * blockIdx.z;
thearray1[iX * 16 + iY * 4 + iZ] = thearray1[iX * 16 + iY * 4 + iZ] + pB->GetNumber(iX * 16 + iY * 4 + iZ);
}
extern "C" {
void _cAddArray(int* thearray1, int* thearray2)
{
dim3 block(1, 1, 1);
dim3 th(4, 4, 4);
_kAddArray << <block, th >> > (thearray1, thearray2);
checkCudaErrors(cudaGetLastError());
}
void _cAddArrayB(int* thearray1, B* pB)
{
dim3 block(1, 1, 1);
dim3 th(4, 4, 4);
_kAddArrayB << <block, th >> > (thearray1, pB);
checkCudaErrors(cudaGetLastError());
}
}
class A
{
public:
A()
{
checkCudaErrors(cudaMalloc((void**)&m_pDevicePtr, sizeof(int) * 64));
_cInitialArray(m_pDevicePtr);
}
~A()
{
checkCudaErrors(cudaFree(m_pDevicePtr));
}
void Add(int* toAdd)
{
_cAddArray(m_pDevicePtr, toAdd);
}
void Add(B* toAdd)
{
_cAddArrayB(m_pDevicePtr, toAdd);
}
int* m_pDevicePtr;
};
int main(int argc, char * argv[])
{
B* pB = new B();
A* pA = new A();
pA->Add(pB->m_pDevicePtr);
int* res = (int*)malloc(sizeof(int) * 64);
checkCudaErrors(cudaMemcpy(res, pA->m_pDevicePtr, sizeof(int) * 64, cudaMemcpyDeviceToHost));
printf("----------- A=");
for (int i = 0; i < 8; ++i)
{
printf("\n");
for (int j = 0; j < 8; ++j)
printf("res %d=%d ", i * 8 + j, res[i * 8 + j]);
}
printf("\n");
checkCudaErrors(cudaMemcpy(res, pB->m_pDevicePtr, sizeof(int) * 64, cudaMemcpyDeviceToHost));
printf("----------- B=");
for (int i = 0; i < 8; ++i)
{
printf("\n");
for (int j = 0; j < 8; ++j)
printf("res %d=%d ", i * 8 + j, res[i * 8 + j]);
}
printf("\n");
B* pB2 = new B();
A* pA2 = new A();
pA2->Add(pB2);
checkCudaErrors(cudaMemcpy(res, pA2->m_pDevicePtr, sizeof(int) * 64, cudaMemcpyDeviceToHost));
printf("----------- A2=");
for (int i = 0; i < 8; ++i)
{
printf("\n");
for (int j = 0; j < 8; ++j)
printf("res %d=%d ", i * 8 + j, res[i * 8 + j]);
}
printf("\n");
checkCudaErrors(cudaMemcpy(res, pB2->m_pDevicePtr, sizeof(int) * 64, cudaMemcpyDeviceToHost));
printf("----------- B2=");
for (int i = 0; i < 8; ++i)
{
printf("\n");
for (int j = 0; j < 8; ++j)
printf("res %d=%d ", i * 8 + j, res[i * 8 + j]);
}
printf("\n");
delete pA;
delete pB;
delete pA2;
delete pB2;
return 0;
}
运行这个程序的结果是:
----------- A=
res 0=1 res 1=3 res 2=5 res 3=7 res 4=9 res 5=11 res 6=13 res 7=15
res 8=17 res 9=19 res 10=21 res 11=23 res 12=25 res 13=27 res 14=29 res 15=31
res 16=33 res 17=35 res 18=37 res 19=39 res 20=41 res 21=43 res 22=45 res 23=47
res 24=49 res 25=51 res 26=53 res 27=55 res 28=57 res 29=59 res 30=61 res 31=63
res 32=65 res 33=67 res 34=69 res 35=71 res 36=73 res 37=75 res 38=77 res 39=79
res 40=81 res 41=83 res 42=85 res 43=87 res 44=89 res 45=91 res 46=93 res 47=95
res 48=97 res 49=99 res 50=101 res 51=103 res 52=105 res 53=107 res 54=109 res 55=111
res 56=113 res 57=115 res 58=117 res 59=119 res 60=121 res 61=123 res 62=125 res 63=127
----------- B=
res 0=1 res 1=2 res 2=3 res 3=4 res 4=5 res 5=6 res 6=7 res 7=8
res 8=9 res 9=10 res 10=11 res 11=12 res 12=13 res 13=14 res 14=15 res 15=16
res 16=17 res 17=18 res 18=19 res 19=20 res 20=21 res 21=22 res 22=23 res 23=24
res 24=25 res 25=26 res 26=27 res 27=28 res 28=29 res 29=30 res 30=31 res 31=32
res 32=33 res 33=34 res 34=35 res 35=36 res 36=37 res 37=38 res 38=39 res 39=40
res 40=41 res 41=42 res 42=43 res 43=44 res 44=45 res 45=46 res 46=47 res 47=48
res 48=49 res 49=50 res 50=51 res 51=52 res 52=53 res 53=54 res 54=55 res 55=56
res 56=57 res 57=58 res 58=59 res 59=60 res 60=61 res 61=62 res 62=63 res 63=64
CUDA error at F:/CPPProject/CudaLatticeGauge/CudaLatticeGauge/Code/CudaLibTest/CudaHelper.cu:183 code=77(cudaErrorIllegalAddress) "cudaMemcpy(res, pA2->m_pDevicePtr, sizeof(int) * 64, cudaMemcpyDeviceToHost)"
看起来我无法在成员函数中操作设备内存的内容。是因为我做错了什么吗?还是我就是做不到?
我在 Windows 10 和 Visual Studio 2017 上使用 CUDA 10。
最佳答案
在 CUDA 中,取消引用设备代码中的主机指针是非法的。你在这里这样做:
__global__ void _kAddArrayB(int* thearray1, B* pB)
{
int iX = threadIdx.x + blockDim.x * blockIdx.x;
int iY = threadIdx.y + blockDim.y * blockIdx.y;
int iZ = threadIdx.z + blockDim.z * blockIdx.z;
thearray1[iX * 16 + iY * 4 + iZ] = thearray1[iX * 16 + iY * 4 + iZ] + pB->GetNumber(iX * 16 + iY * 4 + iZ);
// ^^^^^
}
上面代码中的pB
是一个指向B
对象的指针。您之前已经在主机内存中创建了这个 B
对象:
B* pB2 = new B();
pB2
指向的任何内容都不能在设备代码中访问。完全没有。任何在设备代码中取消引用 pB2
指针的尝试都将导致设备代码执行错误。
我确信有很多可能的方法可以解决这个问题。我相信一种不破坏所有其他代码的直接方法是将 pB2
指向的对象复制到设备内存,并在内核中使用指向该对象拷贝的指针。这将只涉及对您的 _cAddArrayB()
例程的更改,如下所示:
void _cAddArrayB(int* thearray1, B* pB)
{
dim3 block(1, 1, 1);
dim3 th(4, 4, 4);
B *dpB;
cudaMalloc(&dpB, sizeof(B));
checkCudaErrors(cudaMemcpy(dpB, pB, sizeof(B), cudaMemcpyHostToDevice));
_kAddArrayB << <block, th >> > (thearray1, dpB);
cudaDeviceSynchronize();
checkCudaErrors(cudaGetLastError());
checkCudaErrors(cudaFree(dpB));
}
这是一个有效的示例,证明此更改导致代码执行时没有错误:
$ cat t361.cu
#include <iostream>
#include <fstream>
#include <string>
#include <stack>
#include <cstdarg>
#include <limits.h>
#include <stdio.h>
#include <stdarg.h>
#include <math.h>
#include <malloc.h>
#include <stdlib.h>
#define checkCudaErrors(val) check((val), #val, __FILE__, __LINE__)
#ifdef __DRIVER_TYPES_H__
#ifndef DEVICE_RESET
#define DEVICE_RESET cudaDeviceReset();
#endif
#else
#ifndef DEVICE_RESET
#define DEVICE_RESET
#endif
#endif
#ifdef __DRIVER_TYPES_H__
static const char *_cudaGetErrorEnum(cudaError_t error) {
return cudaGetErrorName(error);
}
#endif
template <typename T> void check(T result, char const *const func, const char *const file,
int const line) {
if (result) {
fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \"%s\" \n", file, line,
static_cast<unsigned int>(result), _cudaGetErrorEnum(result), func);
DEVICE_RESET
// Make sure we call CUDA Device Reset before exiting
exit(EXIT_FAILURE);
}
}
__global__ void _kInitialArray(int* thearray)
{
int iX = threadIdx.x + blockDim.x * blockIdx.x;
int iY = threadIdx.y + blockDim.y * blockIdx.y;
int iZ = threadIdx.z + blockDim.z * blockIdx.z;
thearray[iX * 16 + iY * 4 + iZ] = iX * 16 + iY * 4 + iZ;
}
extern "C" {
void _cInitialArray(int* thearray)
{
dim3 block(1, 1, 1);
dim3 th(4, 4, 4);
_kInitialArray << <block, th >> > (thearray);
checkCudaErrors(cudaGetLastError());
}
}
class B
{
public:
B()
{
checkCudaErrors(cudaMalloc((void**)&m_pDevicePtr, sizeof(int) * 64));
_cInitialArray(m_pDevicePtr);
}
~B()
{
cudaFree(m_pDevicePtr);
}
__device__ int GetNumber(int index)
{
m_pDevicePtr[index] = m_pDevicePtr[index] + 1;
return m_pDevicePtr[index];
}
int* m_pDevicePtr;
};
__global__ void _kAddArray(int* thearray1, int* thearray2)
{
int iX = threadIdx.x + blockDim.x * blockIdx.x;
int iY = threadIdx.y + blockDim.y * blockIdx.y;
int iZ = threadIdx.z + blockDim.z * blockIdx.z;
thearray2[iX * 16 + iY * 4 + iZ] = thearray2[iX * 16 + iY * 4 + iZ] + 1;
thearray1[iX * 16 + iY * 4 + iZ] = thearray1[iX * 16 + iY * 4 + iZ] + thearray2[iX * 16 + iY * 4 + iZ];
}
__global__ void _kAddArrayB(int* thearray1, B* pB)
{
int iX = threadIdx.x + blockDim.x * blockIdx.x;
int iY = threadIdx.y + blockDim.y * blockIdx.y;
int iZ = threadIdx.z + blockDim.z * blockIdx.z;
thearray1[iX * 16 + iY * 4 + iZ] = thearray1[iX * 16 + iY * 4 + iZ] + pB->GetNumber(iX * 16 + iY * 4 + iZ);
}
extern "C" {
void _cAddArray(int* thearray1, int* thearray2)
{
dim3 block(1, 1, 1);
dim3 th(4, 4, 4);
_kAddArray << <block, th >> > (thearray1, thearray2);
checkCudaErrors(cudaGetLastError());
}
void _cAddArrayB(int* thearray1, B* pB)
{
dim3 block(1, 1, 1);
dim3 th(4, 4, 4);
B *dpB;
cudaMalloc(&dpB, sizeof(B));
checkCudaErrors(cudaMemcpy(dpB, pB, sizeof(B), cudaMemcpyHostToDevice));
_kAddArrayB << <block, th >> > (thearray1, dpB);
cudaDeviceSynchronize();
checkCudaErrors(cudaGetLastError());
checkCudaErrors(cudaFree(dpB));
}
}
class A
{
public:
A()
{
checkCudaErrors(cudaMalloc((void**)&m_pDevicePtr, sizeof(int) * 64));
_cInitialArray(m_pDevicePtr);
}
~A()
{
checkCudaErrors(cudaFree(m_pDevicePtr));
}
void Add(int* toAdd)
{
_cAddArray(m_pDevicePtr, toAdd);
}
void Add(B* toAdd)
{
_cAddArrayB(m_pDevicePtr, toAdd);
}
int* m_pDevicePtr;
};
int main(int argc, char * argv[])
{
B* pB = new B();
A* pA = new A();
pA->Add(pB->m_pDevicePtr);
int* res = (int*)malloc(sizeof(int) * 64);
checkCudaErrors(cudaMemcpy(res, pA->m_pDevicePtr, sizeof(int) * 64, cudaMemcpyDeviceToHost));
printf("----------- A=");
for (int i = 0; i < 8; ++i)
{
printf("\n");
for (int j = 0; j < 8; ++j)
printf("res %d=%d ", i * 8 + j, res[i * 8 + j]);
}
printf("\n");
checkCudaErrors(cudaMemcpy(res, pB->m_pDevicePtr, sizeof(int) * 64, cudaMemcpyDeviceToHost));
printf("----------- B=");
for (int i = 0; i < 8; ++i)
{
printf("\n");
for (int j = 0; j < 8; ++j)
printf("res %d=%d ", i * 8 + j, res[i * 8 + j]);
}
printf("\n");
B* pB2 = new B();
A* pA2 = new A();
pA2->Add(pB2);
checkCudaErrors(cudaMemcpy(res, pA2->m_pDevicePtr, sizeof(int) * 64, cudaMemcpyDeviceToHost));
printf("----------- A2=");
for (int i = 0; i < 8; ++i)
{
printf("\n");
for (int j = 0; j < 8; ++j)
printf("res %d=%d ", i * 8 + j, res[i * 8 + j]);
}
printf("\n");
checkCudaErrors(cudaMemcpy(res, pB2->m_pDevicePtr, sizeof(int) * 64, cudaMemcpyDeviceToHost));
printf("----------- B2=");
for (int i = 0; i < 8; ++i)
{
printf("\n");
for (int j = 0; j < 8; ++j)
printf("res %d=%d ", i * 8 + j, res[i * 8 + j]);
}
printf("\n");
delete pA;
delete pB;
delete pA2;
delete pB2;
return 0;
}
$ nvcc -o t361 t361.cu
$ cuda-memcheck ./t361
========= CUDA-MEMCHECK
----------- A=
res 0=1 res 1=3 res 2=5 res 3=7 res 4=9 res 5=11 res 6=13 res 7=15
res 8=17 res 9=19 res 10=21 res 11=23 res 12=25 res 13=27 res 14=29 res 15=31
res 16=33 res 17=35 res 18=37 res 19=39 res 20=41 res 21=43 res 22=45 res 23=47
res 24=49 res 25=51 res 26=53 res 27=55 res 28=57 res 29=59 res 30=61 res 31=63
res 32=65 res 33=67 res 34=69 res 35=71 res 36=73 res 37=75 res 38=77 res 39=79
res 40=81 res 41=83 res 42=85 res 43=87 res 44=89 res 45=91 res 46=93 res 47=95
res 48=97 res 49=99 res 50=101 res 51=103 res 52=105 res 53=107 res 54=109 res 55=111
res 56=113 res 57=115 res 58=117 res 59=119 res 60=121 res 61=123 res 62=125 res 63=127
----------- B=
res 0=1 res 1=2 res 2=3 res 3=4 res 4=5 res 5=6 res 6=7 res 7=8
res 8=9 res 9=10 res 10=11 res 11=12 res 12=13 res 13=14 res 14=15 res 15=16
res 16=17 res 17=18 res 18=19 res 19=20 res 20=21 res 21=22 res 22=23 res 23=24
res 24=25 res 25=26 res 26=27 res 27=28 res 28=29 res 29=30 res 30=31 res 31=32
res 32=33 res 33=34 res 34=35 res 35=36 res 36=37 res 37=38 res 38=39 res 39=40
res 40=41 res 41=42 res 42=43 res 43=44 res 44=45 res 45=46 res 46=47 res 47=48
res 48=49 res 49=50 res 50=51 res 51=52 res 52=53 res 53=54 res 54=55 res 55=56
res 56=57 res 57=58 res 58=59 res 59=60 res 60=61 res 61=62 res 62=63 res 63=64
----------- A2=
res 0=1 res 1=3 res 2=5 res 3=7 res 4=9 res 5=11 res 6=13 res 7=15
res 8=17 res 9=19 res 10=21 res 11=23 res 12=25 res 13=27 res 14=29 res 15=31
res 16=33 res 17=35 res 18=37 res 19=39 res 20=41 res 21=43 res 22=45 res 23=47
res 24=49 res 25=51 res 26=53 res 27=55 res 28=57 res 29=59 res 30=61 res 31=63
res 32=65 res 33=67 res 34=69 res 35=71 res 36=73 res 37=75 res 38=77 res 39=79
res 40=81 res 41=83 res 42=85 res 43=87 res 44=89 res 45=91 res 46=93 res 47=95
res 48=97 res 49=99 res 50=101 res 51=103 res 52=105 res 53=107 res 54=109 res 55=111
res 56=113 res 57=115 res 58=117 res 59=119 res 60=121 res 61=123 res 62=125 res 63=127
----------- B2=
res 0=1 res 1=2 res 2=3 res 3=4 res 4=5 res 5=6 res 6=7 res 7=8
res 8=9 res 9=10 res 10=11 res 11=12 res 12=13 res 13=14 res 14=15 res 15=16
res 16=17 res 17=18 res 18=19 res 19=20 res 20=21 res 21=22 res 22=23 res 23=24
res 24=25 res 25=26 res 26=27 res 27=28 res 28=29 res 29=30 res 30=31 res 31=32
res 32=33 res 33=34 res 34=35 res 35=36 res 36=37 res 37=38 res 38=39 res 39=40
res 40=41 res 41=42 res 42=43 res 43=44 res 44=45 res 45=46 res 46=47 res 47=48
res 48=49 res 49=50 res 50=51 res 51=52 res 52=53 res 53=54 res 54=55 res 55=56
res 56=57 res 57=58 res 58=59 res 59=60 res 60=61 res 61=62 res 62=63 res 63=64
========= ERROR SUMMARY: 0 errors
$
关于c++ - CUDA,具有设备指针和设备成员函数的成员字段可以访问它,这可能吗?如何?,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/53781421/