c - OpenCL 内核仅部分写入输出缓冲区

标签 c opencl

我正在从一个包含超过一百万个元素的数组中读取大整数值。获取的值是使用 libsndfile 库从 wav 文件中获取的。现在,如果我不使用内核,我可以将原始数组写入我的输出文件并毫无问题地收听音频。然而,当我决定使用内核做完全相同的事情时,它只写了不到一秒钟的歌曲。

起初,我认为这是内存问题,所以我尝试调整缓冲区大小,但仍然没有成功。然后我认为问题可能出在内核中的循环上,于是又尝试修改循环的取值,但结论相同(仍然不起作用)。我现在很困惑,不知道该怎么办。下面是我的代码。其中一部分是我自己写的,但主体结构是我在网上找到的,用来帮助我设置内核。

在此代码的最底部,如果我将写入输出文件的 OutputData 换成 Array,得到的音频与原文件完全相同。我很确定是内核有问题,所以它才没有把整首歌写回来。

我知道这段代码很乱,但要试用和测试它,您只需复制并粘贴这段代码,然后更改输入 wav 文件和输出 wav 文件的路径即可。

为了明确目标,我将尝试修改 wav 文件中的每个值,看看会发生什么。到目前为止,如果我将内核中的输出值乘以 2,它就会失真。但同样,只持续了大约 1 秒,剪辑的其余部分是空的。请注意,输入和输出文件的大小相同。

另外,我的 for 循环总共进行 120 万次迭代,因为这是我的示例 wav 文件中的项目数。

const char* prog = "__kernel void exchange(__global int *Array, __global int *Output) { for(int j = 0; j < 100000; j++){ for(int i = 0; i < 12; i++){ Output[j+i] = (Array[j+i]);}  }  }";

int main() {
// This code executes on the OpenCL host
SNDFILE *sf;
SF_INFO info;
int num_channels;
int num, num_items;
//input and output data
int *Array;
int *OutputData;

int f, sr, c;
int i, j;
FILE *out;

/* Open the WAV file. */
info.format = 0;
sf = sf_open("Yourwavfilepathhere", SFM_READ, &info);
if (sf == NULL)
{
    printf("Failed to open the file.\n");
    perror("Error");
    exit(-1);
}
/* Print some of the info, and figure out how much data to read. */
f = info.frames;
sr = info.samplerate;
int format = info.format;
c = info.channels;
printf("frames=%d\n", f);
printf("samplerate=%d\n", sr);
printf("channels=%d\n", c);
printf("format %i\n", format);
num_items = f*c;
printf("num_items=%d\n", num_items);
/* Allocate space for the data to be read, then read it. */
Array = (int *)malloc(num_items*sizeof(int));
OutputData = (int*)malloc(num_items*sizeof(int));;
num = sf_read_int(sf, Array, num_items);
sf_close(sf);
printf("Read %d items\n", num);

//Time variables for performance execution. Event variable needed for timing constraint 
cl_event someEvent;
cl_ulong start = (cl_ulong)0;
cl_ulong end = (cl_ulong)0;
cl_ulong finalTime = (cl_ulong)0;

//Number of sampling points 
int sampleSize = 100;
float h = 0;

//Coefficient used to multiply the values entering the FIFO buffer implemented inside the kernel
float coefficient = 1 / sampleSize;

//Signal Frequency in Hz
float signalFreq = 10;

//Number of points between 0 and max val (T_Sample)
float freqSample = sampleSize*signalFreq;

//Step = max value or T_Sample. ******Either 1/freqSample or 1/sampleSize for the stepSize******
float stepSize = 1.0 / freqSample;

/*
  This is a different Example
*/


// Use this to check the output of each API call
cl_int status;

//-----------------------------------------------------
// STEP 1: Discover and initialize the platforms
//-----------------------------------------------------

cl_uint numPlatforms = 0;

cl_platform_id *platforms = NULL;

// Use clGetPlatformIDs() to retrieve the number of 
// platforms
status = clGetPlatformIDs(0, NULL, &numPlatforms);

// Allocate enough space for each platform
platforms =
    (cl_platform_id*)malloc(
        numPlatforms*sizeof(cl_platform_id));

// Fill in platforms with clGetPlatformIDs()
status = clGetPlatformIDs(numPlatforms, platforms,
    NULL);

//-----------------------------------------------------
// STEP 2: Discover and initialize the devices
//----------------------------------------------------- 

cl_uint numDevices = 0;
cl_device_id *devices = NULL;

// Use clGetDeviceIDs() to retrieve the number of 
// devices present
status = clGetDeviceIDs(
    platforms[0],
    CL_DEVICE_TYPE_CPU,
    0,
    NULL,
    &numDevices);

// Allocate enough space for each device
devices =
    (cl_device_id*)malloc(
        numDevices*sizeof(cl_device_id));

// Fill in devices with clGetDeviceIDs()
status = clGetDeviceIDs(
    platforms[0],
    CL_DEVICE_TYPE_CPU,
    numDevices,
    devices,
    NULL);


//-----------------------------------------------------
// STEP 3: Create a context
//----------------------------------------------------- 

cl_context context = NULL;

// Create a context using clCreateContext() and 
// associate it with the devices
context = clCreateContext(
    NULL,
    numDevices,
    devices,
    NULL,
    NULL,
    &status);

//-----------------------------------------------------
// STEP 4: Create a command queue
//----------------------------------------------------- 

cl_command_queue cmdQueue;

// Create a command queue using clCreateCommandQueue(),
// and associate it with the device you want to execute 
// on
cmdQueue = clCreateCommandQueue(
    context,
    devices[0],
    CL_QUEUE_PROFILING_ENABLE,
    &status);

//-----------------------------------------------------
// STEP 5: Create device buffers
//----------------------------------------------------- 

cl_mem input;
cl_mem output;
cl_float coeff;

input = clCreateBuffer(
    context,
    CL_MEM_READ_ONLY,
    num_items,
    NULL,
    &status);

output = clCreateBuffer(
    context,
    CL_MEM_WRITE_ONLY,
    num_items,
    NULL,
    &status);

//-----------------------------------------------------
// STEP 6: Write host data to device buffers
//----------------------------------------------------- 

// Use clEnqueueWriteBuffer() to write input array Array to
// the device buffer input
status = clEnqueueWriteBuffer(
    cmdQueue,
    input,
    CL_FALSE,
    0,
    num_items,
    Array,
    0,
    NULL,
    NULL);

printf("status %i \n", status);

//-----------------------------------------------------
// STEP 7: Create and compile the program
//----------------------------------------------------- 

// Create a program using clCreateProgramWithSource()
cl_program program = clCreateProgramWithSource(
    context,
    1,
    (const char**)&prog,
    NULL,
    &status);
printf("status %i \n", status);

// Build (compile) the program for the devices with
// clBuildProgram()
status = clBuildProgram(
    program,
    numDevices,
    devices,
    NULL,
    NULL,
    NULL);

//-----------------------------------------------------
// STEP 8: Create the kernel
//----------------------------------------------------- 

cl_kernel kernel = NULL;

kernel = clCreateKernel(program, "exchange", &status);

//-----------------------------------------------------
// STEP 9: Set the kernel arguments
//----------------------------------------------------- 

// Associate the input and output buffers with the 
// kernel 
// using clSetKernelArg()
status = clSetKernelArg(
    kernel,
    0,
    sizeof(cl_mem),
    &input);
printf("Status %i \n",status);

status |= clSetKernelArg(
    kernel,
    1,
    sizeof(cl_mem),
    &output);


//-----------------------------------------------------
// STEP 10: Configure the work-item structure
//----------------------------------------------------- 

// Define an index space (global work size) of work 
// items for 
// execution. A workgroup size (local work size) is not 
// required, 
// but can be used.
size_t globalWorkSize[1];
// There are 'elements' work-items 
globalWorkSize[0] = sampleSize;

//-----------------------------------------------------
// STEP 11: Enqueue the kernel for execution
//----------------------------------------------------- 

// Execute the kernel by using 
// clEnqueueNDRangeKernel().
// 'globalWorkSize' is the 1D dimension of the 
// work-items
status = clEnqueueNDRangeKernel(
    cmdQueue,
    kernel,
    1,
    NULL,
    globalWorkSize,
    NULL,
    0,
    NULL,
    &someEvent);

clFinish(cmdQueue);

clGetEventProfilingInfo(someEvent, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL);
clGetEventProfilingInfo(someEvent, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL);

double totalTime = end - start;

printf("Total time is: %f ms \n", totalTime / 1000000.0);
//-----------------------------------------------------
// STEP 12: Read the output buffer back to the host
//----------------------------------------------------- 

// Use clEnqueueReadBuffer() to read the OpenCL output  
// buffer (bufferC) 
// to the host output array (C)
printf("Made it here! %i \n", status);
clEnqueueReadBuffer(
    cmdQueue,
    output,
    CL_TRUE,
    0,
    num_items,
    OutputData,
    0,
    NULL,
    NULL);
printf("Made it here2! %i \n", status);


SNDFILE * outfile = sf_open("outputwavfilepathhere", SFM_WRITE, &info);
sf_count_t count = sf_write_int(outfile, OutputData, num_items);
sf_write_sync(outfile);
sf_close(outfile);

//-----------------------------------------------------
// STEP 13: Release OpenCL resources
//----------------------------------------------------- 

// Free OpenCL resources
clReleaseKernel(kernel);
clReleaseProgram(program);
clReleaseCommandQueue(cmdQueue);
clReleaseMemObject(input);
clReleaseMemObject(output);
clReleaseContext(context);

// Free host resources
free(OutputData);
free(platforms);
free(devices);
free(Array);
}

最佳答案

试试这个:

/*
 * Copies the first 1200000 ints of Array into Output.
 *
 * Each work-item starts at its own global ID and advances by the global
 * work size, so the work-items together touch every index below the bound
 * exactly once. The bound is hard-coded here and must not exceed the
 * number of elements in either buffer.
 */
__kernel void exchange(__global int *Array, __global int *Output)
{
    int globalSize = get_global_size(0);
    int globalId = get_global_id(0);

    for(int i = globalId; i < 1200000; i += globalSize){
        Output[i] = (Array[i]);
    }
}

请确保 for 循环使用正确的上限。理想情况下,应将该上限作为额外的内核参数传入。

您最初的错误在于:每个工作项都在重复写入同样的前 100011 个元素(索引 0 至 100010)。请查阅工作项函数(work-item functions,如 get_global_id、get_global_size)的说明,以了解这些变量的含义。参见 OpenCL 1.2 参考文档。

关于c - OpenCL 内核仅部分写入输出缓冲区,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/37326783/

相关文章:

c - 打开文件时出现段错误

opencl - 在OpenCL内核中存储小型恒定值数组的最佳实践?

python - PyOpenCL 程序未返回预期输出

c++ - OpenCL 找不到平台?

opencv - 如何确保 OpenCV 实际运行 OpenCL 内核?

c - FORTRAN 比 C 快 - 对于在同一处理器上运行的矩阵乘法程序,为什么?

c - 如何在共享对象库中共享变量

c - 按位循环遍历大数据 block 的最快方法是什么

c - 两个函数同时执行

c - linux 上的 opencl 问题