c++ - 2D CUDA median filter optimization

Tags: c++, performance, optimization, cuda, median

I have implemented a 2D median filter in CUDA; the whole program is shown below.

#include "cuda_runtime.h"
#include "cuda_runtime_api.h"
#include "device_launch_parameters.h"
#include <iostream>  
#include <fstream>   
#include <iomanip>   
#include <windows.h>
#include <io.h>                  
#include <stdio.h>
#include <conio.h>
#include <cstdlib>
#include <cmath>
#include <process.h>
#include <malloc.h>
#include <ctime>
using namespace std;

#define MEDIAN_DIMENSION  3 // Window of 3 x 3. Can also be 5 x 5, 7 x 7, 9 x 9, ...
#define MEDIAN_LENGTH 9   // Should be MEDIAN_DIMENSION x MEDIAN_DIMENSION = 3 x 3

#define BLOCK_WIDTH 16  // Should be 8 if the window is larger than 5 x 5, else the error "uses too much shared data" occurs at surround[BLOCK_WIDTH*BLOCK_HEIGHT][MEDIAN_LENGTH]
#define BLOCK_HEIGHT 16 // Should be 8 if the window is larger than 5 x 5, else the error "uses too much shared data" occurs at surround[BLOCK_WIDTH*BLOCK_HEIGHT][MEDIAN_LENGTH]
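// Shared memory used per block = BLOCK_WIDTH * BLOCK_HEIGHT * MEDIAN_LENGTH * sizeof(unsigned short).
// (Worked numbers, not in the original code:) 16 x 16 blocks need 4,608 bytes for 3 x 3, 12,800 bytes for 5 x 5,
// 25,088 bytes for 7 x 7 and 41,472 bytes for 9 x 9; with 8 x 8 blocks the 7 x 7 and 9 x 9 cases drop to
// 6,272 and 10,368 bytes. If the build targets 16 KB of shared memory per block (the default sm_1x target
// of older CUDA toolkits), this matches the "uses too much shared data" error described above.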

 __global__ void MedianFilter_gpu( unsigned short *Device_ImageData,int Image_Width,int Image_Height){

      __shared__ unsigned short surround[BLOCK_WIDTH*BLOCK_HEIGHT][MEDIAN_LENGTH];
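    // Each thread stores the MEDIAN_LENGTH pixels of its own window in one row of this shared array.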

    int iterator;
    const int Half_Of_MEDIAN_LENGTH =(MEDIAN_LENGTH/2)+1;
    int StartPoint=MEDIAN_DIMENSION/2;
    int EndPoint=StartPoint+1;

    const int x = blockDim.x * blockIdx.x + threadIdx.x;
    const int y = blockDim.y * blockIdx.y + threadIdx.y;

    const int tid=threadIdx.y*blockDim.x+threadIdx.x;   // row-major index of the thread within its block

      if(x>=Image_Width || y>=Image_Height)
        return;

     //Fill surround with the pixel values of the image in a MEDIAN_DIMENSION x MEDIAN_DIMENSION window
            if (x == 0 || x == Image_Width - StartPoint || y == 0
                || y == Image_Height - StartPoint) {
            } else {             
                iterator = 0;
                for (int r = x - StartPoint; r < x + (EndPoint); r++) {
                    for (int c = y - StartPoint; c < y + (EndPoint); c++) {
                        surround[tid][iterator] =*(Device_ImageData+(c*Image_Width)+r);
                        iterator++;
                    }
                }
//Sort the surround array to find the median. Bubble sort is used for the 3 x 3 window;
                    //the insertion sort commented out below can be used for larger windows.

                              ////      bubble sort //

                    for ( int i=0; i<Half_Of_MEDIAN_LENGTH; ++i)
                    {     
                        // Find position of minimum element
                        int min=i;
                        for ( int l=i+1; l<MEDIAN_LENGTH; ++l)
                            if (surround[tid][l] <surround[tid][min] )
                                min=l;
                        // Put found minimum element in its place
                        unsigned short  temp= surround[tid][i];
                        surround[tid][i]=surround[tid][min];
                        surround[tid][min]=temp;
                    }//bubble sort end

                    //////insertion sort start   //

                    /*int t,j,i;
                    for ( i = 1 ; i< MEDIAN_LENGTH ; i++) {
                        j = i;
                        while ( j > 0 && surround[tid][j] < surround[tid][j-1]) {
                            t= surround[tid][j];
                            surround[tid][j]= surround[tid][j-1];
                            surround[tid][j-1] = t;
                            j--;
                        }
                    }*/

                    ////insertion sort end   



                    *(Device_ImageData+(y*Image_Width)+x)= surround[tid][Half_Of_MEDIAN_LENGTH-1];   // gives surround[tid][4] as the median value when a 3 x 3 window is used
                        __syncthreads();
            }  
}

  int main( int argc, const char** argv )
{
    int dataLength;
    int p1;
    unsigned short* Host_ImageData = NULL;
    ifstream is; // Read File 
    is.open ("D:\\Image_To_Be_Filtered.raw", ios::binary );

    // get length of file:
    is.seekg (0, ios::end);
    dataLength = is.tellg();
    is.seekg (0, ios::beg);

    Host_ImageData = new  unsigned short[dataLength * sizeof(char) / sizeof(unsigned short)];
    is.read ((char*)Host_ImageData,dataLength);
    is.close();

    int Image_Width = 1580;
    int Image_Height = 1050;

    unsigned short *Host_ResultData = (unsigned short *)malloc(dataLength);
    unsigned short *Device_ImageData = NULL;

    /////////////////////////////
    // The first cudaMalloc takes extra time because it also initializes the CUDA context, and I do not want to include this in my measurements.
    // So please ignore the code below that only prints the first cudaMalloc time.
    clock_t begin = clock();
    unsigned short *forFirstCudaMalloc = NULL;
    cudaMalloc( (void**)&forFirstCudaMalloc, dataLength * sizeof(unsigned short) );
    clock_t end = clock();
    double elapsed_secs = double(end - begin) / CLOCKS_PER_SEC;
    cout<<"First CudaMelloc time = "<<elapsed_secs<<"  Second\n" ;
    cudaFree( forFirstCudaMalloc );
    ////////////////////////////

    //Actual Process Starts From Here 
    clock_t beginOverAll = clock();   //
    cudaMalloc( (void**)&Device_ImageData, dataLength * sizeof(unsigned short) ); 
    cudaMemcpy(Device_ImageData, Host_ImageData, dataLength, cudaMemcpyHostToDevice);// copying Host Data To Device Memory For Filtering

    int x = static_cast<int>(ceilf(static_cast<float>(Image_Width) /BLOCK_WIDTH));
    int y = static_cast<int>(ceilf(static_cast<float>(Image_Height)/BLOCK_HEIGHT));

    const dim3 grid (x, y, 1);      
    const dim3 block(BLOCK_WIDTH, BLOCK_HEIGHT, 1); 

    begin = clock();

    MedianFilter_gpu<<<grid,block>>>( Device_ImageData, Image_Width, Image_Height);
    cudaDeviceSynchronize();

    end = clock();
    elapsed_secs = double(end - begin) / CLOCKS_PER_SEC;
    cout<<"Process time = "<<elapsed_secs<<"  Second\n" ;

    cudaMemcpy(Host_ResultData, Device_ImageData, dataLength, cudaMemcpyDeviceToHost); // copying Back Device Data To Host Memory To write In file After Filter Done

    clock_t endOverall = clock();
    elapsed_secs = double(endOverall - beginOverAll) / CLOCKS_PER_SEC;
    cout<<"Complete Time  = "<<elapsed_secs<<"  Second\n" ;

    ofstream of2;   //Write Filtered Image Into File
    of2.open("D:\\Filtered_Image.raw",  ios::binary);
    of2.write((char*)Host_ResultData,dataLength);
    of2.close();
    cout<<"\nEnd of Writing File.  Press Any Key To Exit..!!";
    cudaFree(Device_ImageData);
    delete[] Host_ImageData;
    free(Host_ResultData);

    getch();
    return 0;
}

Here is the link to the file I used. I used ImageJ to store the image in "raw" format, and I read the "raw" image back in the same way. My image pixels are 16-bit unsigned short. The image width is 1580 and the height is 1050.
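(A minimal sanity check, not part of my original program: the raw file should be exactly Image_Width * Image_Height * sizeof(unsigned short) = 1580 * 1050 * 2 = 3,318,000 bytes. The sketch below only assumes the file path used in the code above.)

#include <fstream>
#include <iostream>

int main()
{
    const long long Image_Width  = 1580;
    const long long Image_Height = 1050;

    std::ifstream is("D:\\Image_To_Be_Filtered.raw", std::ios::binary | std::ios::ate);
    if (!is) { std::cerr << "Could not open the raw file\n"; return 1; }

    const long long fileBytes     = is.tellg();                                          // size in bytes
    const long long expectedBytes = Image_Width * Image_Height * sizeof(unsigned short); // 3,318,000

    std::cout << "file: " << fileBytes << " bytes, expected: " << expectedBytes << " bytes\n";
    return (fileBytes == expectedBytes) ? 0 : 1;
}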

I strongly believe that the filter can be made much more efficient and faster with proper CUDA optimizations.

Indeed, I am running on a GeForce GT 520M graphics card, and the timings are as follows:

1) MEDIAN_DIMENSION 3 x 3 = 0.027 seconds

2) MEDIAN_DIMENSION 5 x 5 = 0.206 seconds

3) MEDIAN_DIMENSION 7 x 7 = 1.11 seconds

4) MEDIAN_DIMENSION 9 x 9 = 4.931 seconds

As you can see, the time grows a lot as MEDIAN_DIMENSION increases, and my applications typically use the larger values such as 7 x 7 and 9 x 9. I think that, with CUDA, the time should be below 1 second even for 9 x 9.

Since I believe the sorting part takes most of the time here, can we make the sorting part of the algorithm faster?
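(A sketch that is not part of the original question: for the fixed 3 x 3 case, the full sort can be replaced by a compare-exchange network that computes only the median. The routine below is adapted from N. Devillard's well-known opt_med9 and is meant purely as an illustration of the idea.)

// Median of 9 values via a fixed network of 19 compare-exchanges (adapted from opt_med9).
// The input array is reordered in place; the median ends up in p[4].
__host__ __device__ inline unsigned short median9(unsigned short p[9])
{
#define PIX_SORT(a, b) { if ((a) > (b)) { unsigned short t = (a); (a) = (b); (b) = t; } }
    PIX_SORT(p[1], p[2]); PIX_SORT(p[4], p[5]); PIX_SORT(p[7], p[8]);
    PIX_SORT(p[0], p[1]); PIX_SORT(p[3], p[4]); PIX_SORT(p[6], p[7]);
    PIX_SORT(p[1], p[2]); PIX_SORT(p[4], p[5]); PIX_SORT(p[7], p[8]);
    PIX_SORT(p[0], p[3]); PIX_SORT(p[5], p[8]); PIX_SORT(p[4], p[7]);
    PIX_SORT(p[3], p[6]); PIX_SORT(p[1], p[4]); PIX_SORT(p[2], p[5]);
    PIX_SORT(p[4], p[7]); PIX_SORT(p[4], p[2]); PIX_SORT(p[6], p[4]);
    PIX_SORT(p[4], p[2]);
#undef PIX_SORT
    return p[4];
}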

Can we use the grid and block more efficiently? Can I use a larger BLOCK_WIDTH and BLOCK_HEIGHT (say 32 and 32) and still not hit the maximum __shared__ memory limit of my device, which is 4 KB?

Can the __shared__ memory be used more efficiently?

Any help will be greatly appreciated.

Thanks in advance.

Best Answer

I am answering your last question about the use of shared memory.

As Eric has already noted, your use of shared memory does not really lead to any thread cooperation: each thread fills and sorts only its own surround[tid][...] row, so no data is ever exchanged between threads.

I am comparing your solution (for the 3x3 case) with a variant of your kernel that does not use shared memory at all, and with the Accelereyes solution discussed in 2D median filtering in CUDA: how to efficiently copy global memory to shared memory.

Here is the full code:

#include <iostream>  
#include <fstream>   
#include <cstdio>

using namespace std;

#define BLOCK_WIDTH 16 
#define BLOCK_HEIGHT 16

/*******************/
/* iDivUp FUNCTION */
/*******************/
int iDivUp(int a, int b){ return ((a % b) != 0) ? (a / b + 1) : (a / b); }

/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess) 
    {
        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}

/**********************************************/
/* KERNEL WITH OPTIMIZED USE OF SHARED MEMORY */
/**********************************************/
__global__ void Optimized_Kernel_Function_shared(unsigned short *Input_Image, unsigned short *Output_Image, int Image_Width, int Image_Height)
{
    const int tx_l = threadIdx.x;                           // --- Local thread x index
    const int ty_l = threadIdx.y;                           // --- Local thread y index

    const int tx_g = blockIdx.x * blockDim.x + tx_l;        // --- Global thread x index
    const int ty_g = blockIdx.y * blockDim.y + ty_l;        // --- Global thread y index

    __shared__ unsigned short smem[BLOCK_WIDTH+2][BLOCK_HEIGHT+2];
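    // --- The tile is (BLOCK_WIDTH+2) x (BLOCK_HEIGHT+2): the block's own pixels plus a one-pixel halo,
    //     so that every thread can read its complete 3x3 neighbourhood from shared memory.
    //     The pixel handled by thread (tx_l, ty_l) lives at smem[tx_l+1][ty_l+1].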

    // --- Fill the shared memory border with zeros
    if (tx_l == 0)                      smem[tx_l]  [ty_l+1]    = 0;    // --- left border
    else if (tx_l == BLOCK_WIDTH-1)     smem[tx_l+2][ty_l+1]    = 0;    // --- right border
    if (ty_l == 0) {                    smem[tx_l+1][ty_l]      = 0;    // --- upper border
        if (tx_l == 0)                  smem[tx_l]  [ty_l]      = 0;    // --- top-left corner
        else if (tx_l == BLOCK_WIDTH-1) smem[tx_l+2][ty_l]      = 0;    // --- top-right corner
    }   else if (ty_l == BLOCK_HEIGHT-1) { smem[tx_l+1][ty_l+2]  = 0;   // --- bottom border
        if (tx_l == 0)                  smem[tx_l]  [ty_l+2]    = 0;    // --- bottom-left corner
        else if (tx_l == BLOCK_WIDTH-1) smem[tx_l+2][ty_l+2]    = 0;    // --- bottom-right corner
    }

    // --- Fill shared memory
                                                                    smem[tx_l+1][ty_l+1] =                           Input_Image[ty_g*Image_Width + tx_g];      // --- center
    if ((tx_l == 0)&&((tx_g > 0)))                                      smem[tx_l]  [ty_l+1] = Input_Image[ty_g*Image_Width + tx_g-1];      // --- left border
    else if ((tx_l == BLOCK_WIDTH-1)&&(tx_g < Image_Width - 1))         smem[tx_l+2][ty_l+1] = Input_Image[ty_g*Image_Width + tx_g+1];      // --- right border
    if ((ty_l == 0)&&(ty_g > 0)) {                                      smem[tx_l+1][ty_l]   = Input_Image[(ty_g-1)*Image_Width + tx_g];    // --- upper border
            if ((tx_l == 0)&&((tx_g > 0)))                                  smem[tx_l]  [ty_l]   = Input_Image[(ty_g-1)*Image_Width + tx_g-1];  // --- top-left corner
            else if ((tx_l == BLOCK_WIDTH-1)&&(tx_g < Image_Width - 1))     smem[tx_l+2][ty_l]   = Input_Image[(ty_g-1)*Image_Width + tx_g+1];  // --- top-right corner
         } else if ((ty_l == BLOCK_HEIGHT-1)&&(ty_g < Image_Height - 1)) {  smem[tx_l+1][ty_l+2] = Input_Image[(ty_g+1)*Image_Width + tx_g];    // --- bottom border
         if ((tx_l == 0)&&((tx_g > 0)))                                 smem[tx_l]  [ty_l+2] = Input_Image[(ty_g+1)*Image_Width + tx_g-1];  // --- bottom-left corner
        else if ((tx_l == BLOCK_WIDTH-1)&&(tx_g < Image_Width - 1))     smem[tx_l+2][ty_l+2] = Input_Image[(ty_g+1)*Image_Width + tx_g+1];  // --- bottom-right corner
    }
    __syncthreads();

    // --- Pull the 3x3 window in a local array
    unsigned short v[9] = { smem[tx_l][ty_l],   smem[tx_l+1][ty_l],     smem[tx_l+2][ty_l],
                            smem[tx_l][ty_l+1], smem[tx_l+1][ty_l+1],   smem[tx_l+2][ty_l+1],
                            smem[tx_l][ty_l+2], smem[tx_l+1][ty_l+2],   smem[tx_l+2][ty_l+2] };    

    // --- Bubble-sort
    for (int i = 0; i < 5; i++) {
        for (int j = i + 1; j < 9; j++) {
            if (v[i] > v[j]) { // swap?
                unsigned short tmp = v[i];
                v[i] = v[j];
                v[j] = tmp;
            }
         }
    }

    // --- Pick the middle one
    Output_Image[ty_g*Image_Width + tx_g] = v[4];
}

/****************************/
/* ORIGINAL KERNEL FUNCTION */
/****************************/
__global__ void Original_Kernel_Function(unsigned short *Input_Image, unsigned short *Output_Image, int Image_Width, int Image_Height) {

    __shared__ unsigned short surround[BLOCK_WIDTH*BLOCK_HEIGHT][9];

    int iterator;

    const int x     = blockDim.x * blockIdx.x + threadIdx.x;
    const int y     = blockDim.y * blockIdx.y + threadIdx.y;
    const int tid   = threadIdx.y * blockDim.x + threadIdx.x;   

    if( (x >= (Image_Width - 1)) || (y >= Image_Height - 1) || (x == 0) || (y == 0)) return;

    // --- Fill shared memory
    iterator = 0;
    for (int r = x - 1; r <= x + 1; r++) {
        for (int c = y - 1; c <= y + 1; c++) {
            surround[tid][iterator] = Input_Image[c*Image_Width+r];
            iterator++;
        }
    }

    // --- Sort shared memory to find the median using bubble sort
    for (int i=0; i<5; ++i) {

        // --- Find the position of the minimum element
        int minval=i;
        for (int l=i+1; l<9; ++l) if (surround[tid][l] < surround[tid][minval]) minval=l;

        // --- Put found minimum element in its place
        unsigned short temp = surround[tid][i];
        surround[tid][i]=surround[tid][minval];
        surround[tid][minval]=temp;
    }

    // --- Pick the middle one
    Output_Image[(y*Image_Width)+x]=surround[tid][4]; 

    __syncthreads();

}

/***********************************************/
/* ORIGINAL KERNEL FUNCTION - NO SHARED MEMORY */
/***********************************************/
__global__ void Original_Kernel_Function_no_shared(unsigned short *Input_Image, unsigned short *Output_Image, int Image_Width, int Image_Height) {

    unsigned short surround[9];

    int iterator;

    const int x     = blockDim.x * blockIdx.x + threadIdx.x;
    const int y     = blockDim.y * blockIdx.y + threadIdx.y;
    const int tid   = threadIdx.y * blockDim.x + threadIdx.x;   

    if( (x >= (Image_Width - 1)) || (y >= Image_Height - 1) || (x == 0) || (y == 0)) return;

    // --- Fill array private to the threads
    iterator = 0;
    for (int r = x - 1; r <= x + 1; r++) {
        for (int c = y - 1; c <= y + 1; c++) {
            surround[iterator] = Input_Image[c*Image_Width+r];
            iterator++;
        }
    }

    // --- Sort private array to find the median using bubble sort
    for (int i=0; i<5; ++i) {

        // --- Find the position of the minimum element
        int minval=i;
        for (int l=i+1; l<9; ++l) if (surround[l] < surround[minval]) minval=l;

        // --- Put found minimum element in its place
        unsigned short temp = surround[i];
        surround[i]=surround[minval];
        surround[minval]=temp;
    }

    // --- Pick the middle one
    Output_Image[(y*Image_Width)+x]=surround[4]; 

}

/********/
/* MAIN */
/********/
int main()
{
    const int Image_Width = 1580;
    const int Image_Height = 1050;

    // --- Open data file
    ifstream is; is.open("C:\\Users\\user\\Documents\\Project\\Median_Filter\\Release\\Image_To_Be_Filtered.raw", ios::binary );

    // --- Get file length
    is.seekg(0, ios::end);
    int dataLength = is.tellg();
    is.seekg(0, ios::beg);

    // --- Read data from file and close file
    unsigned short* Input_Image_Host = new unsigned short[dataLength * sizeof(char) / sizeof(unsigned short)];
    is.read((char*)Input_Image_Host,dataLength);
    is.close();

    // --- CUDA warm up
    unsigned short *forFirstCudaMalloc; gpuErrchk(cudaMalloc((void**)&forFirstCudaMalloc, dataLength * sizeof(unsigned short)));
    gpuErrchk(cudaFree(forFirstCudaMalloc));

    // --- Allocate host and device memory spaces 
    unsigned short *Output_Image_Host = (unsigned short *)malloc(dataLength);
    unsigned short *Input_Image; gpuErrchk(cudaMalloc( (void**)&Input_Image, dataLength * sizeof(unsigned short))); 
    unsigned short *Output_Image; gpuErrchk(cudaMalloc((void**)&Output_Image, dataLength * sizeof(unsigned short))); 

    // --- Copy data from host to device
    gpuErrchk(cudaMemcpy(Input_Image, Input_Image_Host, dataLength, cudaMemcpyHostToDevice));// copying Host Data To Device Memory For Filtering

    // --- Grid and block sizes
    const dim3 grid (iDivUp(Image_Width, BLOCK_WIDTH), iDivUp(Image_Height, BLOCK_HEIGHT), 1);      
    const dim3 block(BLOCK_WIDTH, BLOCK_HEIGHT, 1); 

    /****************************/
    /* ORIGINAL KERNEL FUNCTION */
    /****************************/
    float time;
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);

    cudaFuncSetCacheConfig(Original_Kernel_Function, cudaFuncCachePreferShared);
    Original_Kernel_Function<<<grid,block>>>(Input_Image, Output_Image, Image_Width, Image_Height);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());

    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    printf("Original kernel function - elapsed time:  %3.3f ms \n", time);

    /***********************************************/
    /* ORIGINAL KERNEL FUNCTION - NO SHARED MEMORY */
    /***********************************************/
    cudaEventRecord(start, 0);

    cudaFuncSetCacheConfig(Original_Kernel_Function_no_shared, cudaFuncCachePreferL1);
    Original_Kernel_Function_no_shared<<<grid,block>>>(Input_Image, Output_Image, Image_Width, Image_Height);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());

    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    printf("Original kernel function - no shared - elapsed time:  %3.3f ms \n", time);

    /**********************************************/
    /* KERNEL WITH OPTIMIZED USE OF SHARED MEMORY */
    /**********************************************/
    cudaEventRecord(start, 0);

    cudaFuncSetCacheConfig(Optimized_Kernel_Function_shared, cudaFuncCachePreferShared);
    Optimized_Kernel_Function_shared<<<grid,block>>>(Input_Image, Output_Image, Image_Width, Image_Height);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());

    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    printf("Optimized kernel function - shared - elapsed time:  %3.3f ms \n", time);

    // --- Copy results back to the host
    gpuErrchk(cudaMemcpy(Output_Image_Host, Output_Image, dataLength, cudaMemcpyDeviceToHost));

    // --- Open results file, write results and close the file
    ofstream of2;     of2.open("C:\\Users\\angelo\\Documents\\Project\\Median_Filter\\Release\\Filtered_Image.raw",  ios::binary);
    of2.write((char*)Output_Image_Host, dataLength);
    of2.close();

    cout << "\n Press Any Key To Exit..!!";
    gpuErrchk(cudaFree(Input_Image));
    gpuErrchk(cudaFree(Output_Image));

    delete[] Input_Image_Host;
    free(Output_Image_Host);

    return 0;
}
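(Not part of the original answer: one possible way to build the code above, assuming it is saved as median_filter.cu; pick the -arch value that matches your GPU, e.g. sm_21 for the GT 520M/540M or sm_35 for the K20c.)

nvcc -O3 -arch=sm_21 median_filter.cu -o median_filter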

Here are the timing results on a Kepler K20c:

1580 x 1050
Original_Kernel_Function             = 1.588ms
Original_Kernel_Function_no_shared   = 1.278ms
Optimized_Kernel_Function_shared     = 1.455ms

2048 x 2048
Original_Kernel_Function             = 3.94ms
Original_Kernel_Function_no_shared   = 3.118ms
Optimized_Kernel_Function_shared     = 3.709ms

4096 x 4096
Original_Kernel_Function             = 16.003ms
Original_Kernel_Function_no_shared   = 13.735ms
Optimized_Kernel_Function_shared     = 14.526ms

8192 x 8192
Original_Kernel_Function             = 62.278ms
Original_Kernel_Function_no_shared   = 47.484ms
Optimized_Kernel_Function_shared     = 57.474ms

Here are the timing results on a GT540M, which is closer to your card:

1580 x 1050
Original_Kernel_Function             = 10.332 ms
Original_Kernel_Function_no_shared   =  9.294 ms
Optimized_Kernel_Function_shared     = 10.301 ms

2048 x 2048
Original_Kernel_Function             = 25.256 ms
Original_Kernel_Function_no_shared   = 23.567 ms
Optimized_Kernel_Function_shared     = 23.876 ms

4096 x 4096
Original_Kernel_Function             = 99.791 ms
Original_Kernel_Function_no_shared   = 93.919 ms
Optimized_Kernel_Function_shared     = 95.464 ms

8192 x 8192
Original_Kernel_Function             = 399.259 ms
Original_Kernel_Function_no_shared   = 375.634 ms
Optimized_Kernel_Function_shared     = 383.121 ms

As can be seen, the version that does not use shared memory appears to be (slightly) faster in all cases, presumably because the shared array is used only as per-thread scratch space and provides no data reuse between threads.

The original question and answer on "c++ - 2D CUDA median filter optimization" can be found on Stack Overflow: https://stackoverflow.com/questions/19634328/
