performance - Cuda Bayer/CFA 去马赛克示例

我写了一个 CUDA4 Bayer 去马赛克例程，但它比单线程 CPU 代码慢，在 a16core GTS250 上运行。
Blocksize 是 (16,16) 并且图像变暗是 16 的倍数 - 但改变它并不能改善它。

我在做任何明显愚蠢的事情吗？

--------------- calling routine ------------------
uchar4 *d_output;
size_t num_bytes; 

cudaGraphicsMapResources(1, &cuda_pbo_resource, 0);    
cudaGraphicsResourceGetMappedPointer((void **)&d_output, &num_bytes, cuda_pbo_resource);

// Do the conversion, leave the result in the PBO fordisplay
kernel_wrapper( imageWidth, imageHeight, blockSize, gridSize, d_output );

cudaGraphicsUnmapResources(1, &cuda_pbo_resource, 0);

--------------- cuda -------------------------------
texture<uchar, 2, cudaReadModeElementType> tex;
cudaArray *d_imageArray = 0;

__global__ void convertGRBG(uchar4 *d_output, uint width, uint height)
{
    uint x = __umul24(blockIdx.x, blockDim.x) + threadIdx.x;
    uint y = __umul24(blockIdx.y, blockDim.y) + threadIdx.y;
    uint i = __umul24(y, width) + x;

    // input is GR/BG output is BGRA
    if ((x < width) && (y < height)) {

        if ( y & 0x01 ) {
            if ( x & 0x01 ) {  
                d_output[i].x =  (tex2D(tex,x+1,y)+tex2D(tex,x-1,y))/2;  // B                
                d_output[i].y = (tex2D(tex,x,y));     // G in B
                d_output[i].z = (tex2D(tex,x,y+1)+tex2D(tex,x,y-1))/2;  // R                    
            } else {
                d_output[i].x = (tex2D(tex,x,y));        //B
                d_output[i].y = (tex2D(tex,x+1,y) + tex2D(tex,x-1,y)+tex2D(tex,x,y+1)+tex2D(tex,x,y-1))/4;  // G
                d_output[i].z = (tex2D(tex,x+1,y+1) + tex2D(tex,x+1,y-1)+tex2D(tex,x-1,y+1)+tex2D(tex,x-1,y-1))/4;   // R
            }
        } else {
            if ( x & 0x01 ) {
                 // odd col = R
                d_output[i].y = (tex2D(tex,x+1,y+1) + tex2D(tex,x+1,y-1)+tex2D(tex,x-1,y+1)+tex2D(tex,x-1,y-1))/4;  // B
                d_output[i].z = (tex2D(tex,x,y));        //R
                d_output[i].y = (tex2D(tex,x+1,y) + tex2D(tex,x-1,y)+tex2D(tex,x,y+1)+tex2D(tex,x,y-1))/4;  // G    
            } else {    
                d_output[i].x = (tex2D(tex,x,y+1)+tex2D(tex,x,y-1))/2;  // B
                d_output[i].y = (tex2D(tex,x,y));               // G  in R               
                d_output[i].z = (tex2D(tex,x+1,y)+tex2D(tex,x-1,y))/2;  // R                    
            }
        }                                
    }
}



void initTexture(int imageWidth, int imageHeight, uchar *imagedata)
{

    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(8, 0, 0, 0, cudaChannelFormatKindUnsigned);
    cutilSafeCall( cudaMallocArray(&d_imageArray, &channelDesc, imageWidth, imageHeight) ); 
    uint size = imageWidth * imageHeight * sizeof(uchar);
    cutilSafeCall( cudaMemcpyToArray(d_imageArray, 0, 0, imagedata, size, cudaMemcpyHostToDevice) );
    cutFree(imagedata);

    // bind array to texture reference with point sampling
    tex.addressMode[0] = cudaAddressModeClamp;
    tex.addressMode[1] = cudaAddressModeClamp;
    tex.filterMode = cudaFilterModePoint;
    tex.normalized = false; 

    cutilSafeCall( cudaBindTextureToArray(tex, d_imageArray) );
}

最佳答案

您的代码中没有任何明显的错误，但有几个明显的性能机会:

1) 为了获得最佳性能，您应该使用纹理进入共享内存 - 请参阅“SobelFilter”SDK 示例。

2) 正如所写，代码正在将字节写入全局内存，这保证会导致很大的性能损失。在将结果提交到全局内存之前，您可以使用共享内存来暂存结果。

3) 以匹配硬件的纹理缓存属性的方式调整块大小具有惊人的巨大性能优势。在 Tesla 级硬件上，使用与内核相同的寻址方案的内核的最佳块大小是 16x4。 (每块 64 个线程)

对于这样的工作负载，可能很难与优化的 CPU 代码竞争。 SSE2 可以在一条指令中执行 16 个字节大小的操作，CPU 的时钟速度大约是其 5 倍。

关于performance - Cuda Bayer/CFA 去马赛克示例，我们在Stack Overflow上找到一个类似的问题： https://stackoverflow.com/questions/8072305/

performance - Cuda Bayer/CFA 去马赛克示例

上一篇：mediaelement.js - 什么是模式 :shim actually doing?

下一篇：wolfram-mathematica - 试图让 Mathematica 逼近积分