CUDA 写入常量内存错误值

标签 cuda gpu-constant-memory

我有以下代码从主机变量复制到 CUDA 中的 __constant__ 变量

int main(int argc, char **argv){

    int exit_code;

    if (argc < 4) {
        std::cout << "Usage: \n " << argv[0] << " <input> <output> <nColors>" << std::endl;
        return 1;
    }

    Color *h_input;
    int h_rows, h_cols;

    timer1.Start();
    exit_code = readText2RGB(argv[1], &h_input, &h_rows, &h_cols);
    timer1.Stop();
    std::cout << "Reading: " << timer1.Elapsed() << std::endl;

    if (exit_code != SUCCESS){
        std::cout << "Error trying to read file." << std::endl;
        return FAILURE;
    }

    CpuTimer timer1;
    GpuTimer timer2;
    float timeStep2 = 0, timeStep3 = 0;

    int h_numColors = atoi(argv[3]);

    int h_change = 0;
    int *h_pixelGroup = new int[h_rows*h_cols];
    Color *h_groupRep = new Color[h_numColors];
    Color *h_output = new Color[h_rows*h_cols];

    Color *d_input;
    int *d_pixelGroup;
    Color *d_groupRep;
    Color *d_output;

    dim3 block(B_WIDTH, B_HEIGHT);
    dim3 grid((h_cols+B_WIDTH-1)/B_WIDTH, (h_rows+B_HEIGHT-1)/B_HEIGHT);

    checkCudaError(cudaMalloc((void**)&d_input, sizeof(Color)*h_rows*h_cols));
    checkCudaError(cudaMalloc((void**)&d_pixelGroup, sizeof(int)*h_rows*h_cols));
    checkCudaError(cudaMalloc((void**)&d_groupRep, sizeof(Color)*h_numColors));
    checkCudaError(cudaMalloc((void**)&d_output, sizeof(Color)*h_rows*h_cols));

    //       STEP 1
    //Evenly distribute all pixels of the image onto the color set
    timer2.Start();
    checkCudaError(cudaMemcpyToSymbol(c_rows, &h_rows, sizeof(int)));
    checkCudaError(cudaMemcpyToSymbol(c_cols, &h_cols, sizeof(int)));
    checkCudaError(cudaMemcpyToSymbol(c_numColors, &h_numColors, sizeof(int)));
    checkCudaError(cudaMemcpy(d_input, h_input, sizeof(Color)*h_rows*h_cols, cudaMemcpyHostToDevice));

    clut_distributePixels<<<grid, block>>>(d_pixelGroup);
    checkCudaError(cudaMemcpy(h_pixelGroup, d_pixelGroup, sizeof(int)*h_rows*h_cols, cudaMemcpyDeviceToHost));
    timer2.Stop();
    std::cout << "Phase 1: " << timer2.Elapsed() << std::endl;

    std::cout << h_pixelGroup[0] << ","
                << h_pixelGroup[3] << ","
                << h_pixelGroup[4] << ","
                << h_pixelGroup[7] << ","
                << h_pixelGroup[8] << std::endl;

    //Do the STEP 2 and STEP 3 as long as there is at least one change of representative in a group
    do {
        //      STEP 2
        //Set the representative value to the average colour of all pixels in the same set
        timer1.Start();
        for (int ng = 0; ng < h_numColors; ng++) {
            int r = 0, g = 0, b = 0;
            int elem = 0;
            for (int i = 0; i < h_rows; i++) {
                for (int j = 0; j < h_cols; j++) {
                    if (h_pixelGroup[i*h_cols+j] == ng) {
                        r += h_input[i*h_cols+j].r;
                        g += h_input[i*h_cols+j].g;
                        b += h_input[i*h_cols+j].b;
                        elem++;
                    }
                }
            }
            if (elem == 0) {
                h_groupRep[ng].r = 255;
                h_groupRep[ng].g = 255;
                h_groupRep[ng].b = 255;
            }else{
                h_groupRep[ng].r = r/elem;
                h_groupRep[ng].g = g/elem;
                h_groupRep[ng].b = b/elem;
            }
        }
        timer1.Stop();
        timeStep2 += timer1.Elapsed();

        //      STEP 3
        //For each pixel in the image, compute Euclidean's distance to each representative
        //and assign it to the set which is closest
        h_change = 0;

        timer2.Start();
        checkCudaError(cudaMemcpyToSymbol(d_change, &h_change, sizeof(int)));
        checkCudaError(cudaMemcpy(d_groupRep, h_groupRep, sizeof(Color)*h_numColors, cudaMemcpyHostToDevice));

        clut_checkDistances<<<grid, block>>>(d_input, d_pixelGroup, d_groupRep);
        checkCudaError(cudaMemcpy(h_pixelGroup, d_pixelGroup, sizeof(int)*h_rows*h_cols, cudaMemcpyDeviceToHost));
        checkCudaError(cudaMemcpyFromSymbol(&h_change, d_change, sizeof(int)));
        timer2.Stop();
        timeStep3 += timer2.Elapsed();

        std::cout << "Chunche" << std::endl;

    } while (h_change == 1);

    std::cout << "Phase 2: " << timeStep2 << std::endl;
    std::cout << "Phase 3: " << timeStep3 << std::endl;

    //      STEP 4
    //Create the new image with the resulting color lookup table
    timer2.Start();
    clut_createImage<<<grid, block>>>(d_output, d_pixelGroup, d_groupRep);
    checkCudaError(cudaMemcpy(h_output, d_output, sizeof(Color)*h_rows*h_cols, cudaMemcpyDeviceToHost));
    timer2.Stop();
    std::cout << "Phase 4: " << timer2.Elapsed() << std::endl;

    checkCudaError(cudaFree(d_input));
    checkCudaError(cudaFree(d_pixelGroup));
    checkCudaError(cudaFree(d_groupRep));
    checkCudaError(cudaFree(d_output));

    timer1.Start();
    exit_code = writeRGB2Text(argv[2], h_input, h_rows, h_cols);
    timer1.Stop();
    std::cout << "Writing: " << timer1.Elapsed() << std::endl;

    delete[] h_pixelGroup;
    delete[] h_groupRep;
    delete[] h_output;

    return SUCCESS;
}

当我从内核中打印时,我得到三个值的零

__global__
void clut_distributePixels(int *pixelGroup){
    int i = blockDim.y * blockIdx.y + threadIdx.y;
    int j = blockDim.x * blockIdx.x + threadIdx.x;

    if(i == 0 && j == 0){
        printf("a: %d\n", c_rows);
        printf("b: %d\n", c_cols);
        printf("c: %d\n", c_numColors);
    }

    while (i < c_rows) {
        while (j < c_cols) {
            pixelGroup[i*c_cols+j] = (i*c_cols+j)/c_numColors;
            j += gridDim.x * blockDim.x;
        }
        j = blockDim.x * blockIdx.x + threadIdx.x;
        i += gridDim.y * blockDim.y;
    }

}

要么我没有正确复制到常量内存,要么……我不知道哪里出了问题。任何建议!? 我发布了整个主机代码,可能是其他东西扰乱了常量副本。

更新

主.cu

#include "Imageproc.cuh"
int main(){
  int h_change = 0;
  int h_rows = 512;
  cudaMemcpyToSymbol(c_rows, &h_rows, sizeof(int));
  chunche<<<1,1>>>();
  cudaMemcpyFromSymbol(&h_change, d_change, sizeof(int));

  std::cout << "H = " << h_change << std::endl;
  return 0
}

Imageproc.cuh

#ifndef _IMAGEPROC_CUH_
#define _IMAGEPROC_CUH_

#include "Utilities.cuh"

#define B_WIDTH     16
#define B_HEIGHT    16

__constant__ int c_rows;
__constant__ int c_cols;
__constant__ int c_numColors;

__device__ int d_change;

    #ifdef __cplusplus
        extern "C"
        {
    #endif
        __global__
        void chunche();
        __global__
        void clut_distributePixels(int *pixelGroup);
        __global__
        void clut_checkDistances(Color *input, int *pixelGroup, Color *groupRep);
        __global__
        void clut_createImage(Color *clutImage, int *pixelGroup, Color *groupRep);
    #ifdef __cplusplus
        }
    #endif

#endif

Imageproc.cu

#include "Imageproc.cuh"

__global__
void chunche(){
    d_change = c_rows + 1;
}

__global__
void clut_distributePixels(int *pixelGroup){
    int i = blockDim.y * blockIdx.y + threadIdx.y;
    int j = blockDim.x * blockIdx.x + threadIdx.x;

    while (i < c_rows) {
        while (j < c_cols) {
            pixelGroup[i*c_cols+j] = (i*c_cols+j)/c_numColors;
            j += gridDim.x * blockDim.x;
        }
        j = blockDim.x * blockIdx.x + threadIdx.x;
        i += gridDim.y * blockDim.y;
    }

}

__global__
void clut_checkDistances(Color *input, int *pixelGroup, Color *groupRep){
    int i = blockDim.y * blockIdx.y + threadIdx.y;
    int j = blockDim.x * blockIdx.x + threadIdx.x;
    int newGroup;

    while (i < c_rows) {
        while (j < c_cols) {
            newGroup = 0;
            for (int ng = 1; ng < c_numColors; ng++) {
                if (
                    /*If distance from color to group ng is less than distance from color to group idx
                     then color should belong to ng*/
                    (groupRep[ng].r-input[i*c_cols+j].r)*(groupRep[ng].r-input[i*c_cols+j].r) +
                    (groupRep[ng].g-input[i*c_cols+j].g)*(groupRep[ng].g-input[i*c_cols+j].g) +
                    (groupRep[ng].b-input[i*c_cols+j].b)*(groupRep[ng].b-input[i*c_cols+j].b)
                    <
                    (groupRep[newGroup].r-input[i*c_cols+j].r)*(groupRep[newGroup].r-input[i*c_cols+j].r)+
                    (groupRep[newGroup].g-input[i*c_cols+j].g)*(groupRep[newGroup].g-input[i*c_cols+j].g)+
                    (groupRep[newGroup].b-input[i*c_cols+j].b)*(groupRep[newGroup].b-input[i*c_cols+j].b)
                    )
                {
                    newGroup = ng;
                }
            }

            if (pixelGroup[i*c_cols+j] != newGroup) {
                pixelGroup[i*c_cols+j] = newGroup;
                d_change = 1;
            }

            j += gridDim.x * blockDim.x;
        }
        j = blockDim.x * blockIdx.x + threadIdx.x;
        i += gridDim.y * blockDim.y;
    }

}

__global__
void clut_createImage(Color *clutImage, int *pixelGroup, Color *groupRep){
    int i = blockDim.y * blockIdx.y + threadIdx.y;
    int j = blockDim.x * blockIdx.x + threadIdx.x;

    while (i < c_rows) {
        while (j < c_cols) {
            clutImage[i*c_cols+j].r = groupRep[pixelGroup[i*c_cols+j]].r;
            clutImage[i*c_cols+j].g = groupRep[pixelGroup[i*c_cols+j]].g;
            clutImage[i*c_cols+j].b = groupRep[pixelGroup[i*c_cols+j]].b;
            j += gridDim.x * blockDim.x;
        }
        j = blockDim.x * blockIdx.x + threadIdx.x;
        i += gridDim.y * blockDim.y;
    }
}

Utilities.cuh

#ifndef _UTILITIES_CUH_
#define _UTILITIES_CUH_

#include <iostream>
#include <fstream>
#include <string>

#define SUCCESS     1
#define FAILURE     0

#define checkCudaError(val) check( (val), #val, __FILE__, __LINE__)

typedef struct {
    int r;
    int g;
    int b;
} vec3u;

typedef vec3u Color;
typedef unsigned char uchar;
typedef uchar Grayscale;

struct GpuTimer{
    cudaEvent_t start;
    cudaEvent_t stop;
    GpuTimer(){
        cudaEventCreate(&start);
        cudaEventCreate(&stop);
    }
    ~GpuTimer(){
        cudaEventDestroy(start);
        cudaEventDestroy(stop);
    }
    void Start(){
        cudaEventRecord(start, 0);
    }
    void Stop(){
        cudaEventRecord(stop, 0);
    }
    float Elapsed(){
        float elapsed;
        cudaEventSynchronize(stop);
        cudaEventElapsedTime(&elapsed, start, stop);
        return elapsed;
    }
};

template<typename T>
void check(T err, const char* const func, const char* const file, const int line) {
    if (err != cudaSuccess) {
        std::cerr << "CUDA error at: " << file << ":" << line << std::endl;
        std::cerr << cudaGetErrorString(err) << " " << func << std::endl;
        exit(1);
    }
}

int writeGrayscale2Text(const std::string filename, const Grayscale *image, const int rows, const int cols);
int readText2Grayscale(const std::string filename, Grayscale **image, int *rows, int *cols);

int writeRGB2Text(const std::string filename, const Color *image, const int rows, const int cols);
int readText2RGB(const std::string filename, Color **image, int *rows, int *cols);

struct CpuTimer{
    clock_t start;
    clock_t stop;
    void Start(){
        start = clock();
    }
    void Stop(){
        stop = clock();
    }
    float Elapsed(){
        return ((float)stop-start)/CLOCKS_PER_SEC * 1000.0f;
    }
};

#endif

Utilities.cu

#include "Utilities.cuh"

int writeGrayscale2Text(const std::string filename, const Grayscale *image, const int rows, const int cols){    
    std::ofstream fileWriter(filename.c_str());
    if (!fileWriter.is_open()) {
        std::cerr << "** writeGrayscale2Text() ** : Unable to open file." << std::endl;
        return FAILURE;
    }
    fileWriter << rows << "\n";
    fileWriter << cols << "\n";
    for (int i = 0; i < rows; i++) {
        for (int j = 0; j < cols; j++) {
            fileWriter << (int)image[i*cols+j] << "\n";
        }
    }
    fileWriter.close();
    return SUCCESS;
}

int readText2Grayscale(const std::string filename, Grayscale **image, int *rows, int *cols){
    std::ifstream fileReader(filename.c_str());
    if (!fileReader.is_open()) {
        std::cerr << "** readText2Grayscale() ** : Unable to open file." << std::endl;
        return FAILURE;
    }
    fileReader >> *rows;
    fileReader >> *cols;
    *image = new Grayscale[(*rows)*(*cols)];
    int value;
    for (int i = 0; i < *rows; i++) {
        for (int j = 0; j < *cols; j++) {
            fileReader >> value;
            (*image)[i*(*cols)+j] = (Grayscale)value;
        }
    }
    fileReader.close();
    return SUCCESS;
}

int writeRGB2Text(const std::string filename, const Color *image, const int rows, const int cols){
    std::ofstream fileWriter(filename.c_str());
    if (!fileWriter.is_open()) {
        std::cerr << "** writeRGB2Text() ** : Unable to open file." << std::endl;
        return FAILURE;
    }
    fileWriter << rows << "\n";
    fileWriter << cols << "\n";
    for (int k = 0; k < 3; k++) {
        for (int i = 0; i < rows; i++) {
            for (int j = 0; j < cols; j++) {
                switch (k) {
                    case 0:
                        fileWriter << image[i*cols+j].r << "\n";
                        break;
                    case 1:
                        fileWriter << image[i*cols+j].g << "\n";
                        break;
                    case 2:
                        fileWriter << image[i*cols+j].b << "\n";
                        break;
                }
            }
        }
    }
    fileWriter.close();
    return SUCCESS;
}

int readText2RGB(const std::string filename, Color **image, int *rows, int *cols){
    std::ifstream fileReader(filename.c_str());
    if (!fileReader.is_open()) {
        std::cerr << "** readText2Grayscale() ** : Unable to open file." << std::endl;
        return FAILURE;
    }
    fileReader >> *rows;
    fileReader >> *cols;
    *image = new Color[(*rows)*(*cols)];
    for (int k = 0; k < 3; k++) {
        for (int i = 0; i < *rows; i++) {
            for (int j = 0; j < *cols; j++) {
                switch (k) {
                    case 0:
                        fileReader >> (*image)[i*(*cols)+j].r;
                        break;
                    case 1:
                        fileReader >> (*image)[i*(*cols)+j].g;
                        break;
                    case 2:
                        fileReader >> (*image)[i*(*cols)+j].b;
                        break;
                }
            }
        }
    }
    fileReader.close();
    return SUCCESS;
}

最佳答案

常量内存具有隐式局部作用域链接 - answer to this on stack overflow . 这意味着 cudaMemcpyToSymbol 必须位于要使用它的内核生成的同一个 .obj 文件中。 您在 Main.cu 中执行 memcopy,但您使用常量内存的内核在 Imageproc.cu 中。因此对于内核 chunche 来说,常量值是未知的。

解决您的问题的一个选项是实现包装器。只需在执行 cudaMemcpyToSymbolImagepro.cu 中添加一个函数,然后在 Main.cu 中调用包装器并传递所需的常量值内存在那里。

关于CUDA 写入常量内存错误值,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/29827072/

相关文章:

cuda - 通过共享内存传递内核参数是否值得?

optimization - 在一维网格中计算扭曲ID/车道ID的最有效方法是什么?

c++ - 如何在 CUDA 中实现压缩表?

c++ - 如何使用cmake将cuda文件正确链接到cpp文件

c++ - Tensorflow GPU 新运算内存分配

c - 使用 CUDA 5 的 cudaMemcpyToSymbol 出错

cuda - 本地、全局、常量和共享内存

memory - 解释 ptxas 的详细输出,第一部分

cuda - 将参数按值传递给内核函数时,参数复制到哪里?