c++ - OpenCl:示例 float4 程序 - 段错误(核心已转储)

标签 c++ floating-point opencl

这是一个简单的程序,从文件中读取两个 float4 vector ,然后计算相反数的和。 我找不到问题: 主文件:

#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <iomanip>
#include <array>
#include <fstream>
#include <sstream>
#include <string>
#include <algorithm>
#include <iterator>

#ifdef __APPLE__
#include <OpenCL/opencl.h>
#else
#include <CL/cl.h>
#include <time.h>
#endif



const int number_of_points = 16;  // number of points in Both  A and B files (number of rows)
const int number_of_axis = 4;     // number of points axis in Both  A and B files (number of Columns)


using namespace std;

void checkError(cl_int err, const char *operation)
{
  if (err != CL_SUCCESS)
  {
    fprintf(stderr, "Error during operation '%s': %d\n", operation, err);
    exit(1);
  }
}

int main(int argc, char *argv[]) {
    clock_t tStart = clock();
    // Create the two input vectors
    // working variables
    int i;
    ifstream input_fileA, input_fileB;  // input files
    string line;    // transfer row from file to array
    float x;        // transfer word from file to array
    int row = 0;    // number of rows of file A,B (= array)
    int col = 0;    // number of rows of file A,B (= array)

    // working arrays

    // working arrays
//  int mem_size_TempA = number_of_points * number_of_axis * sizeof(cl_float);
//  int mem_size_TempB = number_of_points * number_of_axis * sizeof(cl_float);

    float tempAArray[number_of_points][number_of_axis]={{0}};   // array contains file A data
    float tempBArray[number_of_points][number_of_axis]={{0}};   // array contains file B data



    int mem_size_InputA = number_of_points * number_of_axis ;
    int mem_size_InputB = number_of_points * number_of_axis ;
    int mem_size_Output = number_of_points * number_of_axis ;

    float *inputAArray = (float*) malloc(number_of_points*sizeof(cl_float4));   // array contains file A data
    float *inputBArray = (float*) malloc(number_of_points*sizeof(cl_float4));   // array contains file B data
    float *outputArray = (float*) malloc(number_of_points*sizeof(cl_float4));   // array contains file B data


    // import input files
    input_fileA.open(argv[1]);
    input_fileB.open(argv[2]);


    // transfer input files data to array
    // input file A to arrayA
    row = 0;
    while (getline(input_fileA, line))
    {

        istringstream streamA(line);
        col = 0;
        while(streamA >> x){
            tempAArray[row][col] = x;
            col++;
        }
        row++;
    }

    // input file B to arrayB
    row = 0;
    while (getline(input_fileB, line))
    {

        istringstream streamB(line);
        col = 0;
        while(streamB >> x){
            tempBArray[row][col] = x;
            col++;
        }
        row++;
    }

    // switch columns of B array
    for(int row_of_arrayB = 0; row_of_arrayB < number_of_points; row_of_arrayB++ )
    {
        float temporary = tempBArray[row_of_arrayB][2];
        tempBArray[row_of_arrayB][2] = tempBArray[row_of_arrayB][1];
        tempBArray[row_of_arrayB][1] = temporary;
    }

    // from Array to 3d vectors
//    for (int row_of_array = 0; row_of_array<number_of_points; row_of_array++)
//    {
//      inputAArray[row_of_array] = (tempAArray[row_of_array][0], tempAArray[row_of_array][1], tempAArray[row_of_array][2],0);
//      inputBArray[row_of_array] = (tempBArray[row_of_array][0], tempBArray[row_of_array][1], tempBArray[row_of_array][2],0);
//    }

    for (int row_of_array=0; row_of_array < number_of_points; row_of_array++)
    {

        inputAArray[row_of_array*number_of_points+0] = tempAArray[row_of_array][0];
        inputAArray[row_of_array*number_of_points+1] = tempAArray[row_of_array][1];
        inputAArray[row_of_array*number_of_points+2] = tempAArray[row_of_array][2];
        inputAArray[row_of_array*number_of_points+3] = 0.0f;

        inputBArray[row_of_array*number_of_points+0] = tempBArray[row_of_array][0];
        inputBArray[row_of_array*number_of_points+1] = tempBArray[row_of_array][1];
        inputBArray[row_of_array*number_of_points+2] = tempBArray[row_of_array][2];
        inputBArray[row_of_array*number_of_points+3] = tempBArray[row_of_array][3];

        outputArray[row_of_array*number_of_points+0] = 0.0f;
        outputArray[row_of_array*number_of_points+1] = 0.0f;
        outputArray[row_of_array*number_of_points+2] = 0.0f;
        outputArray[row_of_array*number_of_points+3] = 0.0f;
//      inputBArray[row_of_array] = (tempBArray[row_of_array][0], tempBArray[row_of_array][1], tempBArray[row_of_array][2],0);

    }
//    for (int row_of_array=0; row_of_array < number_of_points; row_of_array++)
//    {
//      printf("0: %f, 1: %f, 2: %f, 3: %f \n", inputAArray[row_of_array*number_of_points+0], inputAArray[row_of_array*number_of_points+1],
//              inputAArray[row_of_array*number_of_points+2], inputAArray[row_of_array*number_of_points+3]);
//    }
    // close input files
    input_fileA.close();
    input_fileB.close();




    // Load the kernel source code into the array source_str
    FILE *fp;
    char *source_str;
    size_t source_size;

    fp = fopen("calculate_bottom_SNM_kernel.cl", "r");
    if (!fp) {
        fprintf(stderr, "Failed to load kernel.\n");
        exit(1);
    }

    fseek(fp, 0, SEEK_END);
    size_t programLength = ftell(fp);
    rewind(fp);

    source_str = (char*)malloc(programLength+1);
    source_size = fread( source_str, 1, programLength, fp);
    source_str[programLength] = '\0';
    fclose( fp );

    // Get platform and device information
    cl_platform_id platform_id = NULL;
    cl_device_id device_id = NULL;
    cl_uint ret_num_devices;
    cl_uint ret_num_platforms;
    cl_int ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
    ret = clGetDeviceIDs( platform_id, CL_DEVICE_TYPE_ALL, 1,
            &device_id, &ret_num_devices);

    // Create an OpenCL context
    cl_context context = clCreateContext( NULL, 1, &device_id, NULL, NULL, &ret);

    // Create a command queue
    cl_command_queue command_queue = clCreateCommandQueue(context, device_id, 0, &ret);

    // Create memory buffers on the device for each vector
    cl_mem inputa_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY,
            mem_size_InputA*sizeof(cl_float4) , NULL, &ret);
    cl_mem inputb_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY,
            mem_size_InputB*sizeof(cl_float4), NULL, &ret);

    cl_mem output_mem_obj = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
            mem_size_Output*sizeof(cl_float4), NULL, &ret);


    // Copy the lists A and B to their respective memory buffers
    ret = clEnqueueWriteBuffer(command_queue, inputa_mem_obj, CL_TRUE, 0,
            mem_size_InputA*sizeof(cl_float4), inputAArray, 0, NULL, NULL);
    ret = clEnqueueWriteBuffer(command_queue, inputb_mem_obj, CL_TRUE, 0,
            mem_size_InputB*sizeof(cl_float4), inputBArray, 0, NULL, NULL);


    // Create a program from the kernel source
    cl_program program = clCreateProgramWithSource(context, 1,
            (const char **)&source_str, (const size_t *)&source_size, &ret);

    // Build the program

    ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
    if (ret == CL_BUILD_PROGRAM_FAILURE)
      {
        // Get size of build log
        size_t logSize;
        ret = clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG,
                                    0, NULL, &logSize);
        checkError(ret, "getting build log size");

        // Get build log
        char log[logSize];
        ret = clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG,
                                    logSize, log, NULL);
        checkError(ret, "getting build log");

        printf("OpenCL program build log:\n%s\n", log);
        exit(1);
      }


    // Create the OpenCL kernel
    cl_kernel kernel = clCreateKernel(program, "calculate_bottom_SNM", &ret);

    // Set the arguments of the kernel
    ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&inputa_mem_obj);
    ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&inputb_mem_obj);
    ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&output_mem_obj);

    // Execute the OpenCL kernel on the list
    size_t global_item_size = number_of_points; // Process the entire lists
    size_t local_item_size = 4; // Process in groups of 64

    ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL,
            &global_item_size, &local_item_size, 0, NULL, NULL);

    // Read the memory buffer C on the device to the local variable C
//    int *C = (int*)malloc(sizeof(int)*number_of_points);


//    float *C = (float*)malloc(sizeof(float)*number_of_points);
    clEnqueueReadBuffer(command_queue, output_mem_obj, CL_TRUE, 0,
            mem_size_Output, outputArray, 0, NULL, NULL);


    // Display the result to the screen
//    float buttomSNM = 0;
//    for(i = 0; i < number_of_points; i++)
//    {
//      for (int t=0; t<4; t++)
//      {
//          cout << "h" ;
////            printf("%f, \n", outputArray[i*number_of_points+t]);
//      }
//  }

    // Clean up
    ret = clFlush(command_queue);
    ret = clFinish(command_queue);
    ret = clReleaseKernel(kernel);
    ret = clReleaseProgram(program);
    ret = clReleaseMemObject(inputa_mem_obj);
    ret = clReleaseMemObject(inputb_mem_obj);
    ret = clReleaseMemObject(output_mem_obj);
    ret = clReleaseCommandQueue(command_queue);
    ret = clReleaseContext(context);
    free (inputAArray);
    free (inputBArray);
    free (outputArray);
printf("ALL Time taken: %.2fs\n", (double)(clock() - tStart)/CLOCKS_PER_SEC);
    return 0;
}

内核:

__kernel void calculate_bottom_SNM(__global float4 *inputAArray, __global float4 *inputBArray,
                         __global float4 *outputArray) {

    // Get the index of the current element
    int i = get_global_id(0);
    int number_of_points = 16;


    outputArray[i*number_of_points+0] = inputAArray[i*number_of_points+0] + inputBArray[i*number_of_points+0];
    outputArray[i*number_of_points+1] = inputAArray[i*number_of_points+1] + inputBArray[i*number_of_points+1];
    outputArray[i*number_of_points+2] = inputAArray[i*number_of_points+2] + inputBArray[i*number_of_points+2];
    outputArray[i*number_of_points+3] = inputAArray[i*number_of_points+3] + inputBArray[i*number_of_points+3];


}

第一个输入文件:A.txt

0   0.000000e+00    9.998994e-01    
1   1.000000e-03    9.998981e-01    
2   2.000000e-03    9.998967e-01    
3   3.000000e-03    9.998953e-01    
4   4.000000e-03    9.998939e-01    
5   5.000000e-03    9.998925e-01    
6   6.000000e-03    9.998911e-01    
7   7.000000e-03    9.998896e-01    
8   8.000000e-03    9.998881e-01    
9   9.000000e-03    9.998865e-01    
10  1.000000e-02    9.998850e-01    
11  1.100000e-02    9.998834e-01    
12  1.200000e-02    9.998817e-01    
13  1.300000e-02    9.998800e-01    
14  1.400000e-02    9.998783e-01    
15  1.500000e-02    9.998766e-01

第二个输入文件B:

0   0.000000e+00    9.998966e-01    
1   1.000000e-03    9.998953e-01    
2   2.000000e-03    9.998939e-01    
3   3.000000e-03    9.998925e-01    
4   4.000000e-03    9.998911e-01    
5   5.000000e-03    9.998896e-01    
6   6.000000e-03    9.998881e-01    
7   7.000000e-03    9.998866e-01    
8   8.000000e-03    9.998850e-01    
9   9.000000e-03    9.998834e-01    
10  1.000000e-02    9.998818e-01    
11  1.100000e-02    9.998801e-01    
12  1.200000e-02    9.998785e-01    
13  1.300000e-02    9.998767e-01    
14  1.400000e-02    9.998750e-01    
15  1.500000e-02    9.998732e-01

提前致谢

最佳答案

您正在以一种相当奇怪的方式计算内核中的数组索引:

i*number_of_points+0
i*number_of_points+1
i*number_of_points+2
i*number_of_points+3

想想对于不同的 i 值,这实际上意味着什么(假设 number_of_points=16):

 i     array indices (i*16 + (0,1,2,3))
 --------------------------------------
 0     0,   1,  2,  3
 1     16, 17, 18, 19
 2     32, 33, 34, 35
 ...
 etc

这肯定不是你想要的!您的示例代码似乎只是在尝试执行矢量化 vector 加法。如果是这种情况,您的内核代码只需要看起来像这样:

__kernel void vecadd(__global float4 *inputA,
                     __global float4 *inputB,
                     __global float4 *output)
{
    int i = get_global_id(0);
    output[i] = inputA[i] + inputB[i];
}

之所以可行,是因为我们对 vector 的每个元素执行相同的操作。如果你有一个内核需要单独使用这些元素,你会写这样的代码:

float4 valueA = inputA[i];
float4 valueB = inputB[i];

float4 result;
result.x = valueA.x + valueB.x; // Do something with first component
result.y = valueA.y * valueB.y; // Do something with second component
result.z = valueA.z / valueB.z; // Do something with third component
result.w = valueA.w - valueB.w; // Do something with fourth component

关于c++ - OpenCl:示例 float4 程序 - 段错误(核心已转储),我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/29364437/

相关文章:

c++ - 如何编写一个错误结构,它可以包含不同的强类型枚举作为错误代码?

python - 小数点表现不佳 - ValueError : could not convert string to float: '.'

opencl - Nvidia 硬件上的 clEnqueueNDRange 阻塞? (还有多 GPU)

c++ - std::atomic 中的自定义类型

C++ 数据结构和 CUDA

Javascript 数学不精确错误? (0.1+0.01+0.01+0.01+0.01+0.01 == 0.15000000000000002)

Python 中的 C# BitConverter.ToSingle 等价物

python - 对于多平台 GPGPU 计算,是否有 OpenCL+PyOpenCL 的替代方案?

android - 带有Android OpenCV SDK的OpenCL和GPU

c++ - 多播大数据包包含到多个客户端的所有信息与到目标客户端的单个数据包