我正在尝试优化我的表面检测内核;给定输入二进制512w x 1024h
图像,我想找到图像中的第一个亮表面。我编写的代码声明了512个线程,并在3x3
附近搜索了第一个亮像素。该代码可以正常工作,但是~9.46 ms
有点慢,我想使其运行得更快。
编辑1:,性能提高不到我的原始内核运行时间的一半。 Robert的内核在我的Quadro K6000上以4.032 ms
运行。
编辑2:设法通过将线程数减少一半来进一步提高性能。现在,我的(Robert修改过的)内核在Quadro K6000上以2.125 ms
运行。
使用以下命令调用内核:firstSurfaceDetection <<< 1, 512 >>> (threshImg, firstSurfaceImg, actualImHeight, actualImWidth);
我想使用共享内存来改善内存获取;关于如何优化此代码补丁的任何想法?
__global__ void firstSurfaceDetection (float *threshImg, float *firstSurfaceImg, int height, int width) {
int col = threadIdx.x + (blockDim.x*blockIdx.x);
int rows2skip = 10;
float thresh = 1.0f;
//thread Index: (0 -> 511)
if (col < width) {
if( col == 0 ) { // first col - 0
for (int row = 0 + rows2skip; row < height - 2; row++) { // skip first 30 rows
int cnt = 0;
float neibs[6]; // not shared mem as it reduces speed
// get six neighbours - three in same col, and three to the right
neibs[0] = threshImg[((row)*width) +(col)]; if(neibs[0] == thresh) { cnt++; } // current position
neibs[1] = threshImg[((row)*width) +(col+1)]; if(neibs[1] == thresh) { cnt++; } // right
neibs[2] = threshImg[((row+1)*width) +(col)]; if(neibs[2] == thresh) { cnt++; } // bottom
neibs[3] = threshImg[((row+1)*width) +(col+1)]; if(neibs[3] == thresh) { cnt++; } // bottom right
neibs[4] = threshImg[((row+2)*width) +(col)]; if(neibs[4] == thresh) { cnt++; } // curr offset by 2 - bottom
neibs[5] = threshImg[((row+2)*width) +(col+1)]; if(neibs[5] == thresh) { cnt++; } // curr offset by 2 - bottom right
if(cnt == 6) { // if all neighbours are bright, we are at the edge boundary
firstSurfaceImg[(row)*width + col] = 1.0f;
row = height;
}
}
}
else if ( col == (width-1) ) { // last col
for (int row = 0 + rows2skip; row < height -2; row++) {
int cnt = 0;
float neibs[6]; // not shared mem as it reduces speed
// get six neighbours - three in same col, and three to the left
neibs[0] = threshImg[((row)*width) +(col)]; if(neibs[0] == thresh) { cnt++; } // current position
neibs[1] = threshImg[((row)*width) +(col-1)]; if(neibs[1] == thresh) { cnt++; } // left
neibs[2] = threshImg[((row+1)*width) +(col)]; if(neibs[2] == thresh) { cnt++; } // bottom
neibs[3] = threshImg[((row+1)*width) +(col-1)]; if(neibs[3] == thresh) { cnt++; } // bottom left
neibs[4] = threshImg[((row+2)*width) +(col)]; if(neibs[4] == thresh) { cnt++; } // curr offset by 2 - bottom
neibs[5] = threshImg[((row+2)*width) +(col-1)]; if(neibs[5] == thresh) { cnt++; } // curr offset by 2 - bottom left
if(cnt == 6) { // if all neighbours are bright, we are at the edge boundary
firstSurfaceImg[(row)*width + col] = 1.0f;
row = height;
}
}
}
// remaining threads are: (1 -> 510)
else { // any col other than first or last column
for (int row = 0 + rows2skip; row < height - 2; row++) {
int cnt = 0;
float neibs[9]; // not shared mem as it reduces speed
// for threads < width/4, get the neighbors
// get nine neighbours - three in curr col, three each to left and right
neibs[0] = threshImg[((row)*width) +(col-1)]; if(neibs[0] == thresh) { cnt++; }
neibs[1] = threshImg[((row)*width) +(col)]; if(neibs[1] == thresh) { cnt++; }
neibs[2] = threshImg[((row)*width) +(col+1)]; if(neibs[2] == thresh) { cnt++; }
neibs[3] = threshImg[((row+1)*width) +(col-1)]; if(neibs[3] == thresh) { cnt++; }
neibs[4] = threshImg[((row+1)*width) +(col)]; if(neibs[4] == thresh) { cnt++; }
neibs[5] = threshImg[((row+1)*width) +(col+1)]; if(neibs[5] == thresh) { cnt++; }
neibs[6] = threshImg[((row+2)*width) +(col-1)]; if(neibs[6] == thresh) { cnt++; }
neibs[7] = threshImg[((row+2)*width) +(col)]; if(neibs[7] == thresh) { cnt++; }
neibs[8] = threshImg[((row+2)*width) +(col+1)]; if(neibs[8] == thresh) { cnt++; }
if(cnt == 9) { // if all neighbours are bright, we are at the edge boundary
firstSurfaceImg[(row)*width + col] = 1.0f;
row = height;
}
}
}
}
__syncthreads();
}
最佳答案
这是一个工作示例,演示了注释中讨论的3个概念中的2个:
atomicMin
。 以下代码演示了上述2种更改(并且不包括共享内存的任何使用),并演示了(对我而言,在Tesla K40上)与PC原始内核相比,加速范围是1x-20x。加速范围是由于算法的工作取决于图像结构而变化的事实。两种算法都有提早退出策略。由于for循环上的早期退出结构,原始内核可以完成或多或少的工作,具体取决于每列中发现“明亮”像素邻域的位置(如果有)。因此,如果所有列在第0行附近都具有明亮的邻域,我会看到大约1倍的“改进”(即,我的内核以与原始内核相同的速度运行)。如果所有列(仅)在图像的另一“端”附近都具有明亮的邻域,则可以看到大约提高了20倍。这可能会因GPU而异,因为开普勒GPU已提高了我正在使用的全局原子吞吐量。 编辑:由于这种可变的工作方式,我添加了一个粗略的“提前退出”策略作为对我的代码的琐碎修改。这使最短的执行时间接近两个内核之间的奇偶校验(即大约1倍)。
其余的优化可能包括:
例:
#include <stdlib.h>
#include <stdio.h>
#define SKIP_ROWS 10
#define THRESH 1.0f
#include <time.h>
#include <sys/time.h>
#define USECPSEC 1000000ULL
unsigned long long dtime_usec(unsigned long long start){
timeval tv;
gettimeofday(&tv, 0);
return ((tv.tv_sec*USECPSEC)+tv.tv_usec)-start;
}
__global__ void firstSurfaceDetection (float *threshImg, float *firstSurfaceImg, int height, int width) {
int col = threadIdx.x + (blockDim.x*blockIdx.x);
int rows2skip = SKIP_ROWS;
float thresh = THRESH;
//thread Index: (0 -> 511)
if (col < width) {
if( col == 0 ) { // first col - 0
for (int row = 0 + rows2skip; row < height; row++) { // skip first 30 rows
int cnt = 0;
float neibs[6]; // not shared mem as it reduces speed
// get six neighbours - three in same col, and three to the right
neibs[0] = threshImg[((row)*width) +(col)]; if(neibs[0] == thresh) { cnt++; } // current position
neibs[1] = threshImg[((row)*width) +(col+1)]; if(neibs[1] == thresh) { cnt++; } // right
neibs[2] = threshImg[((row+1)*width) +(col)]; if(neibs[2] == thresh) { cnt++; } // bottom
neibs[3] = threshImg[((row+1)*width) +(col+1)]; if(neibs[3] == thresh) { cnt++; } // bottom right
neibs[4] = threshImg[((row+2)*width) +(col)]; if(neibs[4] == thresh) { cnt++; } // curr offset by 2 - bottom
neibs[5] = threshImg[((row+2)*width) +(col+1)]; if(neibs[5] == thresh) { cnt++; } // curr offset by 2 - bottom right
if(cnt == 6) { // if all neighbours are bright, we are at the edge boundary
firstSurfaceImg[(row)*width + col] = 1.0f;
row = height;
}
}
}
else if ( col == (width-1) ) { // last col
for (int row = 0 + rows2skip; row < height; row++) {
int cnt = 0;
float neibs[6]; // not shared mem as it reduces speed
// get six neighbours - three in same col, and three to the left
neibs[0] = threshImg[((row)*width) +(col)]; if(neibs[0] == thresh) { cnt++; } // current position
neibs[1] = threshImg[((row)*width) +(col-1)]; if(neibs[1] == thresh) { cnt++; } // left
neibs[2] = threshImg[((row+1)*width) +(col)]; if(neibs[2] == thresh) { cnt++; } // bottom
neibs[3] = threshImg[((row+1)*width) +(col-1)]; if(neibs[3] == thresh) { cnt++; } // bottom left
neibs[4] = threshImg[((row+2)*width) +(col)]; if(neibs[4] == thresh) { cnt++; } // curr offset by 2 - bottom
neibs[5] = threshImg[((row+2)*width) +(col-1)]; if(neibs[5] == thresh) { cnt++; } // curr offset by 2 - bottom left
if(cnt == 6) { // if all neighbours are bright, we are at the edge boundary
firstSurfaceImg[(row)*width + col] = 1.0f;
row = height;
}
}
}
// remaining threads are: (1 -> 510)
else { // any col other than first or last column
for (int row = 0 + rows2skip; row < height; row++) {
int cnt = 0;
float neibs[9]; // not shared mem as it reduces speed
// for threads < width/4, get the neighbors
// get nine neighbours - three in curr col, three each to left and right
neibs[0] = threshImg[((row)*width) +(col-1)]; if(neibs[0] == thresh) { cnt++; }
neibs[1] = threshImg[((row)*width) +(col)]; if(neibs[1] == thresh) { cnt++; }
neibs[2] = threshImg[((row)*width) +(col+1)]; if(neibs[2] == thresh) { cnt++; }
neibs[3] = threshImg[((row+1)*width) +(col-1)]; if(neibs[3] == thresh) { cnt++; }
neibs[4] = threshImg[((row+1)*width) +(col)]; if(neibs[4] == thresh) { cnt++; }
neibs[5] = threshImg[((row+1)*width) +(col+1)]; if(neibs[5] == thresh) { cnt++; }
neibs[6] = threshImg[((row+2)*width) +(col-1)]; if(neibs[6] == thresh) { cnt++; }
neibs[7] = threshImg[((row+2)*width) +(col)]; if(neibs[7] == thresh) { cnt++; }
neibs[8] = threshImg[((row+2)*width) +(col+1)]; if(neibs[8] == thresh) { cnt++; }
if(cnt == 9) { // if all neighbours are bright, we are at the edge boundary
firstSurfaceImg[(row)*width + col] = 1.0f;
row = height;
}
}
}
}
__syncthreads();
}
__global__ void firstSurfaceDetection_opt (const float * __restrict__ threshImg, int *firstSurfaceImgRow, int height, int width) {
int col = threadIdx.x + (blockDim.x*blockIdx.x);
int row = threadIdx.y + (blockDim.y*blockIdx.y);
int rows2skip = SKIP_ROWS;
float thresh = THRESH;
if ((row >= rows2skip) && (row < height-2) && (col < width) && (row < firstSurfaceImgRow[col])) {
int cnt = 0;
int inc = 0;
if (col == 0) inc = +1;
if (col == (width-1)) inc = -1;
if (inc){
cnt = 3;
if (threshImg[((row)*width) +(col)] == thresh) cnt++;
if (threshImg[((row)*width) +(col+inc)] == thresh) cnt++;
if (threshImg[((row+1)*width) +(col)] == thresh) cnt++;
if (threshImg[((row+1)*width) +(col+inc)] == thresh) cnt++;
if (threshImg[((row+2)*width) +(col)] == thresh) cnt++;
if (threshImg[((row+2)*width) +(col+inc)] == thresh) cnt++;
}
else {
// get nine neighbours - three in curr col, three each to left and right
if (threshImg[((row)*width) +(col-1)] == thresh) cnt++;
if (threshImg[((row)*width) +(col)] == thresh) cnt++;
if (threshImg[((row)*width) +(col+1)] == thresh) cnt++;
if (threshImg[((row+1)*width) +(col-1)] == thresh) cnt++;
if (threshImg[((row+1)*width) +(col)] == thresh) cnt++;
if (threshImg[((row+1)*width) +(col+1)] == thresh) cnt++;
if (threshImg[((row+2)*width) +(col-1)] == thresh) cnt++;
if (threshImg[((row+2)*width) +(col)] == thresh) cnt++;
if (threshImg[((row+2)*width) +(col+1)] == thresh) cnt++;
}
if(cnt == 9) { // if all neighbours are bright, we are at the edge boundary
atomicMin(firstSurfaceImgRow + col, row);
}
}
}
int main(int argc, char *argv[]){
float *threshImg, *h_threshImg, *firstSurfaceImg, *h_firstSurfaceImg;
int *firstSurfaceImgRow, *h_firstSurfaceImgRow;
int actualImHeight = 1024;
int actualImWidth = 512;
int row_set = 512;
if (argc > 1){
int my_val = atoi(argv[1]);
if ((my_val > SKIP_ROWS) && (my_val < actualImHeight - 3)) row_set = my_val;
}
h_firstSurfaceImg = (float *)malloc(actualImHeight*actualImWidth*sizeof(float));
h_threshImg = (float *)malloc(actualImHeight*actualImWidth*sizeof(float));
h_firstSurfaceImgRow = (int *)malloc(actualImWidth*sizeof(int));
cudaMalloc(&threshImg, actualImHeight*actualImWidth*sizeof(float));
cudaMalloc(&firstSurfaceImg, actualImHeight*actualImWidth*sizeof(float));
cudaMalloc(&firstSurfaceImgRow, actualImWidth*sizeof(int));
cudaMemset(firstSurfaceImgRow, 1, actualImWidth*sizeof(int));
cudaMemset(firstSurfaceImg, 0, actualImHeight*actualImWidth*sizeof(float));
for (int i = 0; i < actualImHeight*actualImWidth; i++) h_threshImg[i] = 0.0f;
// insert "bright row" here
for (int i = (row_set*actualImWidth); i < ((row_set+3)*actualImWidth); i++) h_threshImg[i] = THRESH;
cudaMemcpy(threshImg, h_threshImg, actualImHeight*actualImWidth*sizeof(float), cudaMemcpyHostToDevice);
dim3 grid(1,1024);
//warm-up run
firstSurfaceDetection_opt <<< grid, 512 >>> (threshImg, firstSurfaceImgRow, actualImHeight, actualImWidth);
cudaDeviceSynchronize();
cudaMemset(firstSurfaceImgRow, 1, actualImWidth*sizeof(int));
cudaDeviceSynchronize();
unsigned long long t2 = dtime_usec(0);
firstSurfaceDetection_opt <<< grid, 512 >>> (threshImg, firstSurfaceImgRow, actualImHeight, actualImWidth);
cudaDeviceSynchronize();
t2 = dtime_usec(t2);
cudaMemcpy(h_firstSurfaceImgRow, firstSurfaceImgRow, actualImWidth*sizeof(float), cudaMemcpyDeviceToHost);
unsigned long long t1 = dtime_usec(0);
firstSurfaceDetection <<< 1, 512 >>> (threshImg, firstSurfaceImg, actualImHeight, actualImWidth);
cudaDeviceSynchronize();
t1 = dtime_usec(t1);
cudaMemcpy(h_firstSurfaceImg, firstSurfaceImg, actualImWidth*actualImHeight*sizeof(float), cudaMemcpyDeviceToHost);
printf("t1 = %fs, t2 = %fs\n", t1/(float)USECPSEC, t2/(float)USECPSEC);
// validate results
for (int i = 0; i < actualImWidth; i++)
if (h_firstSurfaceImgRow[i] < actualImHeight)
if (h_firstSurfaceImg[(h_firstSurfaceImgRow[i]*actualImWidth)+i] != THRESH)
{printf("mismatch at %d, was %f, should be %d\n", i, h_firstSurfaceImg[(h_firstSurfaceImgRow[i]*actualImWidth)+i], THRESH); return 1;}
return 0;
}
$ nvcc -arch=sm_35 -o t667 t667.cu
$ ./t667
t1 = 0.000978s, t2 = 0.000050s
$
笔记:
上面的示例
编辑:我再次修改了代码,以便可以将插入明亮边界的测试图像中的行(行)作为命令行参数传递,并且我已经在3种不同的设备上测试了该代码,对亮行使用3种不同的设置:
execution time on: K40 C2075 Quadro NVS 310
bright row = 15: 31/33 23/45 29/314
bright row = 512: 978/50 929/112 1232/805
bright row = 1000: 2031/71 2033/149 2418/1285
all times are microseconds (original kernel/optimized kernel)
CUDA 6.5, CentOS 6.2
关于c++ - CUDA-优化表面检测内核,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/28509499/