I need help making my CUDA program run faster. The NVIDIA Visual Profiler reports poor performance: "Low Compute Utilization 1.4%".

Here is the code. First, the kernel preparation:
void laskeSyvyydet(int& tiilet0, int& tiilet1, int& tiilet2, int& tiilet3) {
    cudaArray *tekstuuriSisaan, *tekstuuriUlos;
    // take care of synchronization
    cudaEvent_t cEvent;
    cudaEventCreate(&cEvent);
    // let's take control of the OpenGL textures
    cudaGraphicsMapResources(1, &cuda.cMaxSyvyys);
    cudaEventRecord(cEvent, 0);
    cudaGraphicsMapResources(1, &cuda.cDepthTex);
    cudaEventRecord(cEvent, 0);
    // need to create CUDA pointers
    cudaGraphicsSubResourceGetMappedArray(&tekstuuriSisaan, cuda.cDepthTex, 0, 0);
    cudaGraphicsSubResourceGetMappedArray(&tekstuuriUlos, cuda.cMaxSyvyys, 0, 0);
    cudaProfilerStart();
    // launch the kernel
    cLaskeSyvyydet(tiilet0, tiilet1, tiilet2, tiilet3, tekstuuriSisaan, tekstuuriUlos);
    cudaEventRecord(cEvent, 0);
    cudaProfilerStop();
    // release the textures back to OpenGL
    cudaGraphicsUnmapResources(1, &cuda.cMaxSyvyys, 0);
    cudaEventRecord(cEvent, 0);
    cudaGraphicsUnmapResources(1, &cuda.cDepthTex, 0);
    cudaEventRecord(cEvent, 0);
    // final synchronization
    cudaEventSynchronize(cEvent);
    cudaEventDestroy(cEvent);
}
The kernel launch:
void cLaskeSyvyydet(int& tiilet0, int& tiilet1, int& tiilet2, int& tiilet3, cudaArray* tekstuuriSisaan, cudaArray* tekstuuriUlos) {
    cudaBindTextureToArray(surfRefSisaan, tekstuuriSisaan);
    cudaBindSurfaceToArray(surfRefUlos, tekstuuriUlos);
    int blocksW = (int)ceilf( tiilet0 / 32.0f );
    int blocksH = (int)ceilf( tiilet1 / 32.0f );
    dim3 gridDim( blocksW, blocksH, 1 );
    dim3 blockDim( 32, 32, 1 );
    kLaskeSyvyydet<<<gridDim, blockDim>>>(tiilet0, tiilet1, tiilet2, tiilet3);
}
And the kernel itself:
__global__ void kLaskeSyvyydet(const int tiilet0, const int tiilet1, const int tiilet2, const int tiilet3) {
    // first define the indices
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int j = blockIdx.y * blockDim.y + threadIdx.y;
    if (i >= tiilet0 || j >= tiilet1) return;
    // if we are inside the boundaries, let's find the greatest depth value
    unsigned int takana = 0;
    unsigned int ddd;
    uchar4 syvyys;
    uchar4 dd;
    // there are up to four different tile sizes to choose between
    if (j != tiilet1-1 && i != tiilet0-1) {
        for (int y = j*BLOCK_SIZE; y < (j+1)*BLOCK_SIZE; y++) {
            for (int x = i*BLOCK_SIZE; x < (i+1)*BLOCK_SIZE; x++) {
                dd = tex2D(surfRefSisaan, x, y);
                ddd = (dd.x << 24) | (dd.y << 16) | (dd.z << 8) | (dd.w);
                takana = max(takana, ddd);
            }
        }
    } else if (j == tiilet1-1 && i != tiilet0-1) {
        for (int y = j*BLOCK_SIZE; y < j*BLOCK_SIZE+tiilet3; y++) {
            for (int x = i*BLOCK_SIZE; x < (i+1)*BLOCK_SIZE; x++) {
                dd = tex2D(surfRefSisaan, x, y);
                ddd = (dd.x << 24) | (dd.y << 16) | (dd.z << 8) | (dd.w);
                takana = max(takana, ddd);
            }
        }
    } else if (j != tiilet1-1 && i == tiilet0-1) {
        for (int y = j*BLOCK_SIZE; y < (j+1)*BLOCK_SIZE; y++) {
            for (int x = i*BLOCK_SIZE; x < i*BLOCK_SIZE+tiilet2; x++) {
                dd = tex2D(surfRefSisaan, x, y);
                ddd = (dd.x << 24) | (dd.y << 16) | (dd.z << 8) | (dd.w);
                takana = max(takana, ddd);
            }
        }
    } else if (j == tiilet1-1 && i == tiilet0-1) {
        for (int y = j*BLOCK_SIZE; y < j*BLOCK_SIZE+tiilet3; y++) {
            for (int x = i*BLOCK_SIZE; x < i*BLOCK_SIZE+tiilet2; x++) {
                dd = tex2D(surfRefSisaan, x, y);
                ddd = (dd.x << 24) | (dd.y << 16) | (dd.z << 8) | (dd.w);
                takana = max(takana, ddd);
            }
        }
    }
    // if the tile is empty, choose the maximum possible value
    if (takana == 0) {
        takana = 1000000000;
    }
    // slice the greatest 32-bit depth value into four 8-bit pieces and write it into the other texture
    syvyys.x = (takana & 0xFF000000) >> 24;
    syvyys.y = (takana & 0x00FF0000) >> 16;
    syvyys.z = (takana & 0x0000FF00) >> 8;
    syvyys.w = (takana & 0x000000FF) >> 0;
    surf2Dwrite(syvyys, surfRefUlos, i*sizeof(syvyys), j, cudaBoundaryModeZero);
}
Please help me make this faster, I am out of ideas...
Best Answer
It looks like you have a 2D int input array of size ((tiilet0-1)*BLOCK_SIZE+tiilet2, (tiilet1-1)*BLOCK_SIZE+tiilet3). Each of your threads sequentially reads every element of one input block of size (BLOCK_SIZE, BLOCK_SIZE) and writes the maximum of that block into a 2D result array of size (tiilet0, tiilet1).
Compared with coalesced memory access, this is just about the worst possible way to access global memory, even through 2D textures. You will want to read about coalesced memory access:
https://devblogs.nvidia.com/parallelforall/how-access-global-memory-efficiently-cuda-c-kernels/
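For illustration only (this sketch is mine, not part of the original answer), the difference between a coalesced and a strided read pattern looks roughly like this; the kernel names and the stride parameter are hypothetical:

__global__ void coalescedRead(const int* in, int* out, int n) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    // adjacent threads in a warp read adjacent ints, so the warp's
    // 32 loads coalesce into a few wide memory transactions
    if (idx < n) out[idx] = in[idx];
}

__global__ void stridedRead(const int* in, int* out, int n, int stride) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    // adjacent threads read ints that are `stride` elements apart, so the
    // warp's loads scatter into many separate transactions; the sequential
    // per-thread loops in kLaskeSyvyydet behave like a very large stride
    if (idx * stride < n) out[idx] = in[idx * stride];
}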
In general, you are putting far too much work into a single thread. Given the way you map CUDA thread blocks onto the input array, I suspect that unless you have a very large input, your gridDim will be too small to fully utilize the GPU (for example, with a hypothetical tiilet0 = tiilet1 = 64, cLaskeSyvyydet launches only a 2 x 2 grid of 4 thread blocks, nowhere near enough to keep all the multiprocessors busy).
For better performance, you may want to change from one CUDA thread per input block to one CUDA thread block per input block (int[BLOCK_SIZE][BLOCK_SIZE]), and use a parallel reduction to find the per-block maximum.
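A minimal sketch of that idea (my own illustration, not code from the answer), assuming BLOCK_SIZE is a power of two no larger than 32 so that one tile's BLOCK_SIZE*BLOCK_SIZE texels fit in a single thread block, and reusing the original surfRefSisaan / surfRefUlos references:

__global__ void kLaskeSyvyydetReduce(const int tiilet0, const int tiilet1, const int tiilet2, const int tiilet3) {
    __shared__ unsigned int smax[BLOCK_SIZE * BLOCK_SIZE];
    // one thread block per input tile, one thread per texel of the tile
    int tileX = blockIdx.x, tileY = blockIdx.y;
    // edge tiles may be smaller than BLOCK_SIZE x BLOCK_SIZE
    int w = (tileX == tiilet0 - 1) ? tiilet2 : BLOCK_SIZE;
    int h = (tileY == tiilet1 - 1) ? tiilet3 : BLOCK_SIZE;
    unsigned int v = 0;
    if (threadIdx.x < w && threadIdx.y < h) {
        uchar4 dd = tex2D(surfRefSisaan, tileX * BLOCK_SIZE + threadIdx.x,
                                         tileY * BLOCK_SIZE + threadIdx.y);
        v = (dd.x << 24) | (dd.y << 16) | (dd.z << 8) | (dd.w);
    }
    int tid = threadIdx.y * BLOCK_SIZE + threadIdx.x;
    smax[tid] = v;
    __syncthreads();
    // standard shared-memory tree reduction for the maximum
    for (int s = (BLOCK_SIZE * BLOCK_SIZE) / 2; s > 0; s >>= 1) {
        if (tid < s) smax[tid] = max(smax[tid], smax[tid + s]);
        __syncthreads();
    }
    if (tid == 0) {
        // keep the original convention: an empty tile yields the maximum depth
        unsigned int takana = (smax[0] == 0) ? 1000000000u : smax[0];
        uchar4 syvyys = make_uchar4((takana >> 24) & 0xFF, (takana >> 16) & 0xFF,
                                    (takana >> 8) & 0xFF, takana & 0xFF);
        surf2Dwrite(syvyys, surfRefUlos, tileX * sizeof(syvyys), tileY, cudaBoundaryModeZero);
    }
}

Launched with one block per tile, e.g. kLaskeSyvyydetReduce<<<dim3(tiilet0, tiilet1), dim3(BLOCK_SIZE, BLOCK_SIZE)>>>(tiilet0, tiilet1, tiilet2, tiilet3), this also raises the grid size from roughly (tiilet0/32) x (tiilet1/32) blocks to tiilet0 x tiilet1 blocks, which directly addresses the utilization problem above.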
Regarding "c++ - Why is this CUDA kernel slow?", we found a similar question on Stack Overflow: https://stackoverflow.com/questions/37775822/