c - OpenCL 命令队列 (CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE) 不工作 (MacOS)

标签 c macos opencl nvidia

我正在研究 Fixstars 提供的示例和源代码。具体来说,我在尝试第 5 章中的最后一段代码(两条移动平均线,又称"黄金交叉"):

http://www.fixstars.com/en/opencl/book/OpenCLProgrammingBook/opencl-programming-practice/

代码在这里可用:

http://www.fixstars.com/en/opencl/book/sample/

我将在下面贴出具体示例。简而言之,按如下方式创建命令队列:

command_queue = clCreateCommandQueue(context, device_id, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &ret);

会得到无效的 command_queue(clError)。因此,返回的数据没有被正确处理,即全为零。

但是,如果我将代码设置为仅计算一个移动平均线,而不使用 CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE,那么一切正常。相应地,本章之前的 Fixstars 代码似乎都可以正常工作。

我正在开发配备 NVIDIA 芯片的全新 MacBook Pro(视网膜)。所以我想知道这是否是 NVIDIA 实现或其他方面的问题。

无论如何,这对我来说是一个阻碍,因为我最终想做的是启动多个(相同的)进程,每个进程使用不同的参数,本质上与这个例子通过同时计算两条股票价格移动平均线所做的事情相同。

下面的代码片段。我在他们的源代码中添加了调试打印输出功能。所以我在程序执行时看到以下内容:

Return code from clGetPlatformIDs: Success!
Return code from clGetDeviceIDs: Success!
Return code from clCreateContext: Success!
Return code from clCreateCommandQueue: Invalid value
Return code from clBuildProgram: Success!
Return code from clCreateKernel(13): Success!
Return code from clCreateKernel(26): Success!
Return code from clEnqueueTask(13): Invalid command queue
Return code from clEnqueueTask(26): Invalid command queue
result[25]:[0] (0.000000,0.000000)[0] (0.000000,0.000000)[0] (0.000000,0.000000)[0] (0.000000,0.000000)
result[26]:[0] (0.000000,0.000000)[0] (0.000000,0.000000)[0] (0.000000,0.000000)[0] (0.000000,0.000000)
result[27]:[0] (0.000000,0.000000)[0] (0.000000,0.000000)[0] (0.000000,0.000000)[0] (0.000000,0.000000)
result[28]:[0] (0.000000,0.000000)[0] (0.000000,0.000000)[0] (0.000000,0.000000)[0] (0.000000,0.000000)

……其余数据也全为零。我使用以下命令编译:

gcc -O2 -c moving_average_vec4p.c
gcc moving_average_vec4p.o -o moving_average_vec4p -framework opencl

----(主机代码)moving_average_vec4p.c ----

#include <stdlib.h>
#ifdef __APPLE__
#include <OpenCL/opencl.h>
#else
#include <CL/cl.h>
#endif
#include <stdio.h>

#define NAME_NUM (4) /* Number of stocks */
#define DATA_NUM (100) /* Number of data to process for each stock */

/*
 * Stock price samples used as the kernel input.
 *
 * The initializer is pulled in at compile time from "stock_array_4.txt",
 * which must therefore contain a comma-separated list of at most
 * NAME_NUM * DATA_NUM integer values. main() copies this array verbatim
 * into the device input buffer.
 *
 * NOTE(review): the kernel reads the buffer as int4 elements, so the data is
 * presumably laid out with the NAME_NUM (4) stocks interleaved per time
 * step - confirm against the data file.
 */
int stock_array_4[NAME_NUM*DATA_NUM]= {
#include "stock_array_4.txt"
};

/* Moving average width */
#define WINDOW_SIZE_13 (13)
#define WINDOW_SIZE_26 (26)

#define MAX_SOURCE_SIZE (0x100000)

/* DT: added to aid in debugging */

/*
 * Write a human-readable description of an OpenCL status code to stdout.
 *
 * err: status code returned by an OpenCL API call.
 *
 * Each message is terminated by a newline. Codes that have no dedicated
 * message are reported as "Unknown".
 */
void printCLError (int err) {
  const char *text;

  /* Select the message first, then emit it in a single place below. */
  switch (err) {
  case CL_SUCCESS:                            text = "Success!\n"; break;
  case CL_DEVICE_NOT_FOUND:                   text = "Device not found.\n"; break;
  case CL_DEVICE_NOT_AVAILABLE:               text = "Device not available\n"; break;
  case CL_COMPILER_NOT_AVAILABLE:             text = "Compiler not available\n"; break;
  case CL_MEM_OBJECT_ALLOCATION_FAILURE:      text = "Memory object allocation failure\n"; break;
  case CL_OUT_OF_RESOURCES:                   text = "Out of resources\n"; break;
  case CL_OUT_OF_HOST_MEMORY:                 text = "Out of host memory\n"; break;
  case CL_PROFILING_INFO_NOT_AVAILABLE:       text = "Profiling information not available\n"; break;
  case CL_MEM_COPY_OVERLAP:                   text = "Memory copy overlap\n"; break;
  case CL_IMAGE_FORMAT_MISMATCH:              text = "Image format mismatch\n"; break;
  case CL_IMAGE_FORMAT_NOT_SUPPORTED:         text = "Image format not supported\n"; break;
  case CL_BUILD_PROGRAM_FAILURE:              text = "Program build failure\n"; break;
  case CL_MAP_FAILURE:                        text = "Map failure\n"; break;
  case CL_INVALID_VALUE:                      text = "Invalid value\n"; break;
  case CL_INVALID_DEVICE_TYPE:                text = "Invalid device type\n"; break;
  case CL_INVALID_PLATFORM:                   text = "Invalid platform\n"; break;
  case CL_INVALID_DEVICE:                     text = "Invalid device\n"; break;
  case CL_INVALID_CONTEXT:                    text = "Invalid context\n"; break;
  case CL_INVALID_QUEUE_PROPERTIES:           text = "Invalid queue properties\n"; break;
  case CL_INVALID_COMMAND_QUEUE:              text = "Invalid command queue\n"; break;
  case CL_INVALID_HOST_PTR:                   text = "Invalid host pointer\n"; break;
  case CL_INVALID_MEM_OBJECT:                 text = "Invalid memory object\n"; break;
  case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR:    text = "Invalid image format descriptor\n"; break;
  case CL_INVALID_IMAGE_SIZE:                 text = "Invalid image size\n"; break;
  case CL_INVALID_SAMPLER:                    text = "Invalid sampler\n"; break;
  case CL_INVALID_BINARY:                     text = "Invalid binary\n"; break;
  case CL_INVALID_BUILD_OPTIONS:              text = "Invalid build options\n"; break;
  case CL_INVALID_PROGRAM:                    text = "Invalid program\n"; break;
  case CL_INVALID_PROGRAM_EXECUTABLE:         text = "Invalid program executable\n"; break;
  case CL_INVALID_KERNEL_NAME:                text = "Invalid kernel name\n"; break;
  case CL_INVALID_KERNEL_DEFINITION:          text = "Invalid kernel definition\n"; break;
  case CL_INVALID_KERNEL:                     text = "Invalid kernel\n"; break;
  case CL_INVALID_ARG_INDEX:                  text = "Invalid argument index\n"; break;
  case CL_INVALID_ARG_VALUE:                  text = "Invalid argument value\n"; break;
  case CL_INVALID_ARG_SIZE:                   text = "Invalid argument size\n"; break;
  case CL_INVALID_KERNEL_ARGS:                text = "Invalid kernel arguments\n"; break;
  case CL_INVALID_WORK_DIMENSION:             text = "Invalid work dimension\n"; break;
  case CL_INVALID_WORK_GROUP_SIZE:            text = "Invalid work group size\n"; break;
  case CL_INVALID_WORK_ITEM_SIZE:             text = "Invalid work item size\n"; break;
  case CL_INVALID_GLOBAL_OFFSET:              text = "Invalid global offset\n"; break;
  case CL_INVALID_EVENT_WAIT_LIST:            text = "Invalid event wait list\n"; break;
  case CL_INVALID_EVENT:                      text = "Invalid event\n"; break;
  case CL_INVALID_OPERATION:                  text = "Invalid operation\n"; break;
  case CL_INVALID_GL_OBJECT:                  text = "Invalid OpenGL object\n"; break;
  case CL_INVALID_BUFFER_SIZE:                text = "Invalid buffer size\n"; break;
  case CL_INVALID_MIP_LEVEL:                  text = "Invalid mip-map level\n"; break;
  default:                                    text = "Unknown\n"; break;
  }

  fputs(text, stdout);
}


int main(void)
{
  cl_platform_id platform_id = NULL;
  cl_uint ret_num_platforms;
  cl_device_id device_id = NULL;
  cl_uint ret_num_devices;
  cl_context context = NULL;
  cl_command_queue command_queue = NULL;
  cl_mem memobj_in = NULL;
  cl_mem memobj_out13 = NULL;
  cl_mem memobj_out26 = NULL;
  cl_program program = NULL;
  cl_kernel kernel13 = NULL;
  cl_kernel kernel26 = NULL;
  cl_event event13, event26;
  size_t kernel_code_size;
  char *kernel_src_str;
  float *result13;
  float *result26;
  cl_int ret;
  FILE *fp;

  int window_num_13 = (int)WINDOW_SIZE_13;
  int window_num_26 = (int)WINDOW_SIZE_26;
  int point_num = (NAME_NUM * DATA_NUM);
  int data_num = (int)DATA_NUM;
  int name_num = (int)NAME_NUM;

  int i, j;
  /* Allocate space to read in kernel code */
  kernel_src_str = (char *)malloc(MAX_SOURCE_SIZE);

  /* Allocate space for the result on the host side */
  result13 = (float *)malloc(point_num*sizeof(float)); /* average over13 weeks */
  result26 = (float *)malloc(point_num*sizeof(float)); /* average over26 weeks */

  /* Get Platform */
  ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
  printf("Return code from clGetPlatformIDs: ");
  printCLError(ret);

  /* Get Device */
  ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id,
               &ret_num_devices);
  printf("Return code from clGetDeviceIDs: ");
  printCLError(ret);

  /* Create Context */
  context = clCreateContext( NULL, 1, &device_id, NULL, NULL, &ret);
  printf("Return code from clCreateContext: ");
  printCLError(ret);

  /* Create Command Queue */

  // DT:  this seems to break it (ie., output is all zeros)
  command_queue = clCreateCommandQueue(context, device_id, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &ret);
  printf("Return code from clCreateCommandQueue: ");
  printCLError(ret);

  /* Read kernel source code */
  fp = fopen("moving_average_vec4.cl", "r");
  kernel_code_size = fread(kernel_src_str, 1, MAX_SOURCE_SIZE, fp);
  fclose(fp);

  /* Create Program Object */
  program = clCreateProgramWithSource(context, 1, (const char **)&kernel_src_str,
                      (const size_t *)&kernel_code_size, &ret);

  /* Compile kernel */
  ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
  printf("Return code from clBuildProgram: ");
  printCLError(ret);

  /* Create kernel */
  kernel13 = clCreateKernel(program, "moving_average_vec4", &ret); /* 13 weeks */
  printf("Return code from clCreateKernel(13): ");
  printCLError(ret);

  kernel26 = clCreateKernel(program, "moving_average_vec4", &ret); /* 26 weeks */
  printf("Return code from clCreateKernel(26): ");
  printCLError(ret);

  /* Create buffer for the input data on the device */
  memobj_in = clCreateBuffer(context, CL_MEM_READ_WRITE,
                 point_num * sizeof(int), NULL, &ret);

  /* Create buffer for the result on the device */
  memobj_out13 = clCreateBuffer(context, CL_MEM_READ_WRITE,
                point_num * sizeof(float), NULL, &ret); /* 13 weeks */
  memobj_out26 = clCreateBuffer(context, CL_MEM_READ_WRITE,
                point_num * sizeof(float), NULL, &ret); /* 26 weeks */

  /* Copy input data to the global memory on the device*/
  ret = clEnqueueWriteBuffer(command_queue, memobj_in, CL_TRUE, 0, point_num * sizeof(int), stock_array_4, 0, NULL, NULL);

  /* Set Kernel Arguments (13 weeks) */
  ret = clSetKernelArg(kernel13, 0, sizeof(cl_mem), (void *)&memobj_in);
  ret = clSetKernelArg(kernel13, 1, sizeof(cl_mem), (void *)&memobj_out13);
  ret = clSetKernelArg(kernel13, 2, sizeof(int), (void *)&data_num);
  ret = clSetKernelArg(kernel13, 3, sizeof(int), (void *)&window_num_13);

  /* Submit task to compute the moving average over 13 weeks */

  ret = clEnqueueTask(command_queue, kernel13, 0, NULL, NULL);
  printf("Return code from clEnqueueTask(13): ");
  printCLError(ret);

  /* Set Kernel Arguments (26 weeks) */
  ret = clSetKernelArg(kernel26, 0, sizeof(cl_mem), (void *)&memobj_in);
  ret = clSetKernelArg(kernel26, 1, sizeof(cl_mem), (void *)&memobj_out26);
  ret = clSetKernelArg(kernel26, 2, sizeof(int), (void *)&data_num);
  ret = clSetKernelArg(kernel26, 3, sizeof(int), (void *)&window_num_26);

  /* Submit task to compute the moving average over 26 weeks */
  ret = clEnqueueTask(command_queue, kernel26, 0, NULL, &event26);
  printf("Return code from clEnqueueTask(26): ");
  printCLError(ret);

  // DT:  doesn't seem to help ... ;-(
  ret = clFinish(command_queue);

  /* Copy result for the 13 weeks moving average from device to host */
  ret = clEnqueueReadBuffer(command_queue, memobj_out13, CL_TRUE, 0, point_num * sizeof(float), result13, 1, &event13, NULL);

  /* Copy result for the 26 weeks moving average from device to host */
  ret = clEnqueueReadBuffer(command_queue, memobj_out26, CL_TRUE, 0, point_num * sizeof(float), result26, 1, &event26, NULL);


  /* OpenCL Object Finalization */
  ret = clReleaseKernel(kernel13);
  ret = clReleaseKernel(kernel26);
  ret = clReleaseProgram(program);
  ret = clReleaseMemObject(memobj_in);
  ret = clReleaseMemObject(memobj_out13);
  ret = clReleaseMemObject(memobj_out26);
  ret = clReleaseCommandQueue(command_queue);
  ret = clReleaseContext(context);


  /* Display results */

  /* DT: also added printout to see if actual numeric values are passing through */

  for (i=window_num_26-1; i < data_num; i++) {
    printf("result[%d]:", i );
    for (j=0; j < name_num; j++ ) {
      /* Display whether the 13 week average is greater */
      printf( "[%d] (%f,%f)", (result13[i*NAME_NUM+j] > result26[i*NAME_NUM+j]),result13[i*NAME_NUM+j],result26[i*NAME_NUM+j] );
    }
    printf("\n");
  }

  /* Deallocate memory on the host */
  free(result13);
  free(result26);
  free(kernel_src_str);

  return 0;
}

----(OPENCL 内核代码)moving_average_vec4.cl ----

/*
 * Moving average of four interleaved integer series, computed by a single
 * work-item.
 *
 * values:  input samples; element i holds sample i of all four series.
 * average: output; element i receives the mean of samples
 *          i-width+1 .. i for each series, or zero when fewer than
 *          "width" samples are available (i < width-1).
 * length:  number of int4 samples in "values" / float4 slots in "average".
 * width:   size of the averaging window, in samples.
 *
 * When no complete window fits (width <= 0 or width > length) the whole
 * output is set to zero instead of reading or writing out of bounds.
 */
__kernel void moving_average_vec4(__global int4 *values,
                  __global float4 *average,
                  int length,
                  int width)
{
  int i;
  int4 add_value; /* Running window sum, one component per series */

  /* Reject windows that cannot be formed from the available samples */
  if (width <= 0 || width > length) {
    for (i=0; i < length; i++) {
      average[i] = (float4)(0.0f);
    }
    return;
  }

  /* Insert zeros to 0th ~ (width-2)th element for 4 stocks */
  for (i=0; i < width-1; i++) {
    average[i] = (float4)(0.0f);
  }

  /* Compute sum for the first "width" elements for 4 stocks */
  add_value = (int4)0;
  for (i=0; i < width; i++) {
    add_value += values[i];
  }
  average[width-1] = convert_float4(add_value) / (float4)width;

  /*
   * Slide the window over the (width)th ~ (length-1)th elements: drop the
   * sample that leaves the window, add the one that enters, then average.
   */
  for (i=width; i < length; i++) {
    add_value = add_value - values[i-width] + values[i];
    average[i] = convert_float4(add_value) / (float4)width;
  }
}

最佳答案

很少有 OpenCL 实现支持乱序命令队列。若要让工作重叠执行,请改用多个命令队列,并(与使用乱序队列时一样)通过事件进行同步。

关于c - OpenCL 命令队列 (CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE) 不工作 (MacOS),我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/16821293/

相关文章:

c - 如何实现服务器名称指示(SNI)

objective-c - 您可以将任何具有 i386 架构的库用于 Cocoa/Mac OSX 应用程序吗?

c - 如何在套接字服务器应用程序中重用端口?

macos - 我可以在 bash 脚本中与 OSX `say` 命令的输出交互吗?

c++ - libtiff.3.dylib : unknown required load command 0x80000022

opencl - OpenCL SubSlice 是否类似于 CUDA Warp (SIMD)?

opencl - CPU/Intel OpenCL 性能问题、实现问题

gpu - AMD Polaris 上某些尺寸的矩阵乘法性能下降

c - 在 C 项目的 .c 文件内的函数中,如何重用分配的内存,这样我就不会经常使用 malloc-free?

c - Doxygen 不一致地扩展 C 宏