c++ - 二维循环 OpenCl 程序不工作

标签 c++ loops parallel-processing opencl

这个程序是一个简单的并行程序,它把 2 个数组的对应元素相加。程序可以编译成功,但结果不对。

程序从 2 个文件中读取数组,然后把它们的对应元素相加。

#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <iomanip>
#include <array>
#include <fstream>
#include <sstream>
#include <string>
#include <algorithm>
#include <iterator>

#ifdef __APPLE__
#include <OpenCL/opencl.h>
#else
#include <CL/cl.h>
#include <time.h>
#endif



const int number_of_points = 12;  // row count of the host arrays that hold the contents of input files A and B
const int number_of_axis = 3;     // column count of those arrays (values stored per row)


using namespace std;

int main(int argc, char *argv[]) {
    clock_t tStart = clock();
    // Create the two input vectors
    // working variables
    int i,j;
    ifstream input_fileA, input_fileB;  // input files
    string line;    // transfer row from file to array
    float x;        // transfer word from file to array
    int row = 0;    // number of rows of file A,B (= array)
    int col = 0;    // number of rows of file A,B (= array)

    // working arrays
    int mem_size_InoutA = number_of_points * number_of_axis;
    int mem_size_InoutB = number_of_points * number_of_axis;
    int mem_size_Output = number_of_points * number_of_axis;

    float inputAArray[number_of_points][number_of_axis]={{0}};  // array contains file A data
    float inputBArray[number_of_points][number_of_axis]={{0}};  // array contains file B data
    float outputArray[number_of_points][number_of_axis]={{0}};  // array contains file B data


    // import input files
    input_fileA.open(argv[1]);
    input_fileB.open(argv[2]);


    // transfer input files data to array
    // input file A to arrayA
    row = 0;
    while (getline(input_fileA, line))
    {

        istringstream streamA(line);
        col = 0;
        while(streamA >> x){
            inputAArray[row][col] = x;
            col++;
        }
        row++;
    }

    // input file B to arrayB
    row = 0;
    while (getline(input_fileB, line))
    {

        istringstream streamB(line);
        col = 0;
        while(streamB >> x){
            inputBArray[row][col] = x;
            col++;
        }
        row++;
    }

    // switch columns of B array
    for(int row_of_arrayB = 0; row_of_arrayB < number_of_points; row_of_arrayB++ )
    {
        float temporary = inputBArray[row_of_arrayB][2];
        inputBArray[row_of_arrayB][2] = inputBArray[row_of_arrayB][1];
        inputBArray[row_of_arrayB][1] = temporary;
    }

    // close input files
    input_fileA.close();
    input_fileB.close();




    // Load the kernel source code into the array source_str
    FILE *fp;
    char *source_str;
    size_t source_size;

    fp = fopen("calculate_bottom_SNM_kernel.cl", "r");
    if (!fp) {
        fprintf(stderr, "Failed to load kernel.\n");
        exit(1);
    }
    source_str = (char*)malloc(number_of_points);
    source_size = fread( source_str, 1, number_of_points, fp);
    fclose( fp );

    // Get platform and device information
    cl_platform_id platform_id = NULL;
    cl_device_id device_id = NULL;
    cl_uint ret_num_devices;
    cl_uint ret_num_platforms;
    cl_int ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
    ret = clGetDeviceIDs( platform_id, CL_DEVICE_TYPE_ALL, 1,
            &device_id, &ret_num_devices);

    // Create an OpenCL context
    cl_context context = clCreateContext( NULL, 1, &device_id, NULL, NULL, &ret);

    // Create a command queue
    cl_command_queue command_queue = clCreateCommandQueue(context, device_id, 0, &ret);

    // Create memory buffers on the device for each vector
    cl_mem inputa_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY,
            mem_size_InoutA , NULL, &ret);
    cl_mem inputb_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY,
            mem_size_InoutB, NULL, &ret);

    cl_mem output_mem_obj = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
            mem_size_Output, NULL, &ret);


    // Copy the lists A and B to their respective memory buffers
    ret = clEnqueueWriteBuffer(command_queue, inputa_mem_obj, CL_TRUE, 0,
            mem_size_InoutA, inputAArray, 0, NULL, NULL);
    ret = clEnqueueWriteBuffer(command_queue, inputb_mem_obj, CL_TRUE, 0,
            mem_size_InoutB, inputBArray, 0, NULL, NULL);


    // Create a program from the kernel source
    cl_program program = clCreateProgramWithSource(context, 1,
            (const char **)&source_str, (const size_t *)&source_size, &ret);

    // Build the program
    ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);

    // Create the OpenCL kernel
    cl_kernel kernel = clCreateKernel(program, "calculate_bottom_SNM", &ret);

    // Set the arguments of the kernel
    ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&inputa_mem_obj);
    ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&inputb_mem_obj);
    ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&output_mem_obj);

    // Execute the OpenCL kernel on the list
    size_t global_item_size[2], local_item_size[2];
    global_item_size[0] = number_of_points; // Process the entire lists
    global_item_size[1] = number_of_points; // Process the entire lists
    local_item_size[0] = 3; // Process in groups of 64
    local_item_size[1] = 3; // Process in groups of 64

    ret = clEnqueueNDRangeKernel(command_queue, kernel, 2, NULL,
            global_item_size, local_item_size, 0, NULL, NULL);

    // Read the memory buffer C on the device to the local variable C
//    int *C = (int*)malloc(sizeof(int)*number_of_points);


//    float *C = (float*)malloc(sizeof(float)*number_of_points);
    ret = clEnqueueReadBuffer(command_queue, output_mem_obj, CL_TRUE, 0,
            number_of_points * sizeof(float), outputArray, 0, NULL, NULL);


    // Display the result to the screen
    float buttomSNM = 0;
    for(i = 0; i < number_of_points; i++)
    {
        for(j= 0; j < number_of_axis; j++)
        {
            printf("%f + %f = %f\n", inputAArray[i][j], inputBArray[i][j], outputArray[i][j]);
        }
    }


    // Clean up
    ret = clFlush(command_queue);
    ret = clFinish(command_queue);
    ret = clReleaseKernel(kernel);
    ret = clReleaseProgram(program);
    ret = clReleaseMemObject(inputa_mem_obj);
    ret = clReleaseMemObject(inputb_mem_obj);
    ret = clReleaseMemObject(output_mem_obj);
    ret = clReleaseCommandQueue(command_queue);
    ret = clReleaseContext(context);


printf("ALL Time taken: %.2fs\n", (double)(clock() - tStart)/CLOCKS_PER_SEC);
    return 0;
}

内核文件是:

// Element-wise sum of two tables stored as flat float buffers:
// outputArray[k] = inputAArray[k] + inputBArray[k].
//
// Must be launched over a 2-D NDRange whose global size equals the table shape
// (rows in dimension 0, columns in dimension 1). OpenCL buffers are
// one-dimensional, so each work-item converts its (row, column) id into a
// row-major flat index. The column count is taken from get_global_size(1),
// which keeps the kernel signature free of extra size parameters.
// A larger global size would index past the end of the buffers.
__kernel void calculate_bottom_SNM(__global float *inputAArray, __global float *inputBArray,
                         __global float *outputArray) {

    // Position of the current element
    const size_t row = get_global_id(0);
    const size_t col = get_global_id(1);

    // Row-major offset of (row, col) inside the flat buffers
    const size_t index = row * get_global_size(1) + col;

    outputArray[index] = inputAArray[index] + inputBArray[index];
}

第一个文件(第一个数组)

0   0.000000e+00    9.998994e-01    
1   1.000000e-03    9.998981e-01    
2   2.000000e-03    9.998967e-01    
3   3.000000e-03    9.998953e-01    
4   4.000000e-03    9.998939e-01    
5   5.000000e-03    9.998925e-01    
6   6.000000e-03    9.998911e-01    
7   7.000000e-03    9.998896e-01    
8   8.000000e-03    9.998881e-01    
9   9.000000e-03    9.998865e-01    
10  1.000000e-02    9.998850e-01    
11  1.100000e-02    9.998834e-01

第二个文件(第二个数组)

0   0.000000e+00    9.998966e-01    
1   1.000000e-03    9.998953e-01    
2   2.000000e-03    9.998939e-01    
3   3.000000e-03    9.998925e-01    
4   4.000000e-03    9.998911e-01    
5   5.000000e-03    9.998896e-01    
6   6.000000e-03    9.998881e-01    
7   7.000000e-03    9.998866e-01    
8   8.000000e-03    9.998850e-01    
9   9.000000e-03    9.998834e-01    
10  1.000000e-02    9.998818e-01    

结果:

0.000000 + 0.000000 = 0.000000
0.000000 + 0.999897 = 0.000000
0.999899 + 0.000000 = 0.000000
1.000000 + 1.000000 = 0.000000
0.001000 + 0.999895 = 0.000000
0.999898 + 0.001000 = 0.000000
2.000000 + 2.000000 = 0.000000
0.002000 + 0.999894 = 0.000000
0.999897 + 0.002000 = 0.000000
3.000000 + 3.000000 = 0.000000
0.003000 + 0.999892 = 0.000000
0.999895 + 0.003000 = 0.000000
4.000000 + 4.000000 = 0.000000
0.004000 + 0.999891 = 0.000000
0.999894 + 0.004000 = 0.000000
5.000000 + 5.000000 = 0.000000
0.005000 + 0.999890 = 0.000000
0.999892 + 0.005000 = 0.000000
6.000000 + 6.000000 = 0.000000
0.006000 + 0.999888 = 0.000000
0.999891 + 0.006000 = 0.000000
7.000000 + 7.000000 = 0.000000
0.007000 + 0.999887 = 0.000000
0.999890 + 0.007000 = 0.000000
8.000000 + 8.000000 = 0.000000
0.008000 + 0.999885 = 0.000000
0.999888 + 0.008000 = 0.000000
9.000000 + 9.000000 = 0.000000
0.009000 + 0.999883 = 0.000000
0.999887 + 0.009000 = 0.000000
10.000000 + 10.000000 = 0.000000
0.010000 + 0.999882 = 0.000000
0.999885 + 0.010000 = 0.000000
11.000000 + 0.000000 = 0.000000
0.011000 + 0.000000 = 0.000000
0.999883 + 0.000000 = 0.000000
ALL Time taken: 0.06s

结果显然不对,正确的结果应该是对应元素之和。谢谢。

最佳答案

您再次未能检查 OpenCL API 调用的返回码。如果不这样做,就不可能知道问题何时发生。每次调用 OpenCL 函数时,您应该执行如下操作:

ret = clDoSomething(...);
if (ret != CL_SUCCESS)
{
  printf("Failed on function clDoSomething: %d\n", ret);
  exit(1); // Or do whatever cleanup needs to be done before exiting
}

您可以通过定义一个简单的效用函数来简化此操作:

// Terminate the program when an OpenCL call reported a failure.
//   err       - status code returned by the OpenCL call
//   operation - short description of the call, included in the message
// Returns normally only when err is CL_SUCCESS; otherwise prints the
// description and the numeric code to stderr and exits with status 1.
void checkError(cl_int err, const char *operation)
{
  if (err == CL_SUCCESS)
  {
    return;
  }

  fprintf(stderr, "Error during operation '%s': %d\n", operation, err);
  exit(1);
}

...

ret = clDoSomething(...);
checkError(ret, "calling clDoSomething");

这一次,问题似乎来自 clBuildProgram 调用(它返回 -54,对应于 CL_BUILD_PROGRAM_FAILURE)。在这种情况下,您还需要获取构建日志以查看完整错误:

  ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
  if (ret == CL_BUILD_PROGRAM_FAILURE)
  {
    // Get size of build log
    size_t logSize;
    ret = clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG,
                                0, NULL, &logSize);
    checkError(ret, "getting build log size");

    // Get build log
    char log[logSize];
    ret = clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG,
                                logSize, log, NULL);
    checkError(ret, "getting build log");

    printf("OpenCL program build log:\n%s\n", log);
    exit(1);
  }

如果将此添加到代码中,您将获得如下所示的构建日志:

input.cl:1:10: error: unknown type name 'voi'; did you mean 'void'?
__kernel voi
         ^~~
         void
input.cl:1:13: error: expected identifier or '('
__kernel voi
            ^

这看起来很奇怪,但它表明您的程序源码可能在几个字符之后就被截断了。如果您查看为从文件中读取 OpenCL 程序而编写的代码,您会发现:

source_str = (char*)malloc(number_of_points);
source_size = fread( source_str, 1, number_of_points, fp);

所以,您只读取了程序的前 12 个字符!您可以使用 fseek 和 ftell 来获取文件的实际长度:

fseek(fp, 0, SEEK_END);
size_t programLength = ftell(fp);
rewind(fp);

source_str = (char*)malloc(programLength+1);
source_size = fread( source_str, 1, programLength, fp);
source_str[programLength] = '\0';

如果你这样做,你会得到一个不同的程序构建错误:

input.cl:8:17: error: subscripted value is not an array, pointer, or vector
  outputArray[i][j] = inputAArray[i][j] + inputBArray[i][j];
  ~~~~~~~~~~~~~~^~
input.cl:8:37: error: subscripted value is not an array, pointer, or vector
  outputArray[i][j] = inputAArray[i][j] + inputBArray[i][j];
                      ~~~~~~~~~~~~~~^~
input.cl:8:57: error: subscripted value is not an array, pointer, or vector
  outputArray[i][j] = inputAArray[i][j] + inputBArray[i][j];
                                          ~~~~~~~~~~~~~~^~

这是因为您试图将数组索引为二维的,而实际上它们只是一维的(就像所有 OpenCL 缓冲区一样)。您需要手动计算一维数组中的偏移量以解决此问题,例如:

outputArray[i + j*number_of_points] = inputAArray[i + j*number_of_points] + inputBArray[i + j*number_of_points];

(这需要您将 number_of_points 作为参数传递给您的内核)。

最后,还有一些其他错误:

  1. 正如在另一个答案中指出的,内存对象的大小需要乘以 sizeof(cl_float)(并且 clEnqueueReadBuffer 调用需要使用这个).

  2. 您的全局工作规模可能应该是这样的:

    global_item_size[0] = number_of_points;

    global_item_size[1] = number_of_axis;

这个答案的主要收获是你真的,真的需要检查每个 OpenCL API 函数调用返回的错误代码,否则你将永远无法调试这些问题。

关于c++ - 二维循环 OpenCl 程序不工作,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/29306693/

相关文章:

c++ - 是否有必要销毁工具提示?

c++ - 如何将成员函数作为参数传递给不需要它的函数?

c++ - 在 VS2013 与 VS2017 中的 printf 函数中使用宏

javascript - 通过循环访问数组中对象的属性时,仅获取第一个元素

c# - 是否可以在核心之间划分工作?

c++ - 为什么一个类允许拥有自己的静态成员,而不是非静态成员?

c# - Razor 中 foreach 中的 html 和代码的混合

python - 在具有特定数字的python循环中生成数字

c++ - 使用蒙特卡罗方法多线程计算 pi 值

c - 使用 SIMD 和 openMP 加速矩阵乘法