memory-management - OpenCl 清理导致段错误

标签 memory-management segmentation-fault opencl

我使用网络上的不同来源构建了自己的 Opencl 小示例。实际的内核工作,我得到了我想要的输出,但是我在其中一个示例中发现的清理函数导致了段错误。我做错了什么?

#include <stdio.h>
#include <stdlib.h>
#include <errno.h>
#include <CL/cl.h> //opencl

#define CL_CHECK(_expr)                                                         \
   do {                                                                         \
     cl_int _err = _expr;                                                       \
     if (_err == CL_SUCCESS)                                                    \
       break;                                                                   \
     fprintf(stderr, "OpenCL Error: '%s' returned %d!\n", #_expr, (int)_err);   \
     abort();                                                                   \
   } while (0)

#define CL_CHECK_ERR(_expr)                                                     \
   ({                                                                           \
     cl_int _err = CL_INVALID_VALUE;                                            \
     typeof(_expr) _ret = _expr;                                                \
     if (_err != CL_SUCCESS) {                                                  \
       fprintf(stderr, "OpenCL Error: '%s' returned %d!\n", #_expr, (int)_err); \
       abort();                                                                 \
     }                                                                          \
     _ret;                                                                      \
   })

const char* OpenCLSource[] = {
       "__kernel void VectorAdd(__global int* c, __global int* a,__global int* b)",
       "{",
       "      // Index of the elements to add \n",
       "      unsigned int n = get_global_id(0);",
       "      // Sum the n’th element of vectors a and b and store in c \n",
       "      c[n] = a[n] + b[n];",
       "}"
};

cl_device_id* init_opencl(cl_context *GPUContext,cl_command_queue *GPUCommandQueue, cl_kernel* cl_forward1,cl_program* OpenCLProgram){

    // Create a context to run OpenCL on our CUDA-enabled NVIDIA GPU
    cl_int _err;
    *GPUContext = clCreateContextFromType(0, CL_DEVICE_TYPE_GPU, NULL, NULL, &_err) ;
    printf("\n1-%i\n",_err);
    // Get the list of GPU devices associated with this context
    size_t ParmDataBytes;
    CL_CHECK(clGetContextInfo(*GPUContext, CL_CONTEXT_DEVICES, 0, NULL, &ParmDataBytes));
    cl_device_id* GPUDevices;
    GPUDevices = (cl_device_id*)malloc(ParmDataBytes);
    CL_CHECK(clGetContextInfo(*GPUContext, CL_CONTEXT_DEVICES, ParmDataBytes, GPUDevices, NULL));
    // Create a command-queue on the first GPU device
    *GPUCommandQueue = clCreateCommandQueue(*GPUContext, GPUDevices[0], 0, &_err);
    printf("\n2-%i\n",_err);
    // Create OpenCL program with source code
    *OpenCLProgram = clCreateProgramWithSource(*GPUContext, 7, OpenCLSource, NULL, &_err);
    printf("\n3-%i\n",_err);

    CL_CHECK(clBuildProgram(*OpenCLProgram, 0, 
              NULL, NULL, NULL, NULL));


     cl_int errcode;
    *cl_forward1 = clCreateKernel(*OpenCLProgram, 
               "VectorAdd", &errcode);
               printf("\n7-%i\n",errcode);

    return GPUDevices;
}


int main(int argc, char** argv)
{
    cl_context GPUContext;
    cl_command_queue GPUCommandQueue;
    cl_program OpenCLProgram;
    cl_kernel OpenCLVectorAdd;
    cl_device_id* GPUDevices;

    GPUDevices=init_opencl(&GPUContext,&GPUCommandQueue,&OpenCLVectorAdd,&OpenCLProgram);

    // Two integer source vectors in Host memory
    int n=5 ;
    int x[5]={1,2,4,6,8};
    int y[5]={1,2,4,6,8};
    int output[n];
    int size_x = n*sizeof(x);
    int size_y = n*sizeof(y);

    int size_output = n*sizeof(output); // this changes for the second forward1
    cl_int _err;
    // Allocate GPU memory for source vectors AND initialize from CPU memory
    cl_mem x_cl = clCreateBuffer(GPUContext, CL_MEM_READ_ONLY |
                    CL_MEM_COPY_HOST_PTR, size_x, x, &_err);
                     printf("\n4-%i\n",_err);
    cl_mem y_cl = clCreateBuffer(GPUContext, CL_MEM_READ_ONLY |
                    CL_MEM_COPY_HOST_PTR, size_y, y, &_err);
                     printf("\n5-%i\n",_err);


    // Allocate output memory on GPU
    cl_mem total_cl = clCreateBuffer(GPUContext, CL_MEM_WRITE_ONLY,
                                          size_output, NULL, &_err);
                                           printf("\n6-%i\n",_err);

     // In the next step we associate the GPU memory with the Kernel arguments
    clSetKernelArg(OpenCLVectorAdd, 0, sizeof(cl_mem),(void*)&total_cl);
    clSetKernelArg(OpenCLVectorAdd, 1, sizeof(cl_mem), (void*)&x_cl);
    clSetKernelArg(OpenCLVectorAdd, 2, sizeof(cl_mem), (void*)&y_cl);


    // 7. Launch OpenCL kernel
    size_t localWorkSize[1], globalWorkSize[1];
    //localWorkSize = ;
    globalWorkSize[0] = n;

    // Launch the Kernel on the GPU
    CL_CHECK(clEnqueueNDRangeKernel(GPUCommandQueue, OpenCLVectorAdd, 1, NULL, globalWorkSize, NULL, 0, NULL, NULL));
    // Copy the output in GPU memory back to CPU memory

    //float* h_C = (float*) malloc(size_output);
    CL_CHECK(clEnqueueReadBuffer(GPUCommandQueue, 
              total_cl, CL_TRUE, 0, size_output, 
                output, 0, NULL, NULL));
    for (int i=0; i<n;i++){
        printf("\n%i",output[i]);
    }

    // Cleanup (each of the following lines causes a seg fault
    // ******************************
    CL_CHECK(free(GPUDevices)); 
    CL_CHECK(clReleaseKernel(OpenCLVectorAdd));
    CL_CHECK(clReleaseProgram(OpenCLProgram));
    CL_CHECK(clReleaseCommandQueue(GPUCommandQueue));
    CL_CHECK(clReleaseContext(GPUContext));
    CL_CHECK(clReleaseMemObject(total_cl));
    CL_CHECK(clReleaseMemObject(x_cl));
    CL_CHECK(clReleaseMemObject(y_cl));
    /* ****************

    return 0;
}

谢谢!

最佳答案

对于 future 来到这里的人:

正如 Brafford 所建议的,这可以通过在 clEnqueueNDRangeKernel 和 clEnqueueReadBuffer 之后添加 clFinish(GPUCommandQueue) 来解决。

显然,尝试清理仍在执行中的任何对象(例如释放队列)会产生段错误。

关于memory-management - OpenCl 清理导致段错误,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/3727534/

相关文章:

c++ - 在 C++ 中为数组正确分配和释放内存

opengl - 我必须在程序结束时使用 glDeleteTextures() 吗?

c++ - 我们可以通过 IPC 传递具有动态管理成员的类的对象吗?

opencl - 私有(private)内存比本地内存慢吗?

android - 如何释放 TabHost 占用的内存

c - 初始化大 vector 时出现段错误

C 编程 - 段错误(核心转储)

c - X509_STORE_add_lookup() 中的段错误

cuda - 在 opencl 中 CPU 作为主机,intel HD 4000 作为设备 1,离散 GPU 作为设备 2

opencl - 寄存器和专用缓冲区