我正在从一个包含超过一百万个元素的数组中读取大整数值。获取的值是使用 libsndfile 库从 wav 文件中获取的。现在,如果我不使用内核,我可以将原始数组写入我的输出文件并毫无问题地收听音频。然而,当我决定使用内核做完全相同的事情时,它只写了不到一秒钟的歌曲。
起初,我认为这是内存问题,所以尝试调整了缓冲区大小,但仍然没有成功。然后我认为问题可能出在内核里的循环上,于是又调整了循环的次数,结果还是一样(仍然不起作用)。我现在很困惑,不知道该怎么办。下面是我的代码。其中一部分是我自己写的,但主体结构是我在网上找到的,用来帮助我设置内核。
在此代码的最底部,如果我将 OutputData 更改为 Array,我会得到完全相同的音频。我很确定内核有问题,这就是为什么它没有写回整首歌。
我知道这段代码很乱,但如果想试用和测试它,您只需复制并粘贴,然后把输入 wav 文件和输出 wav 文件的路径改掉即可。
为了明确目标,我将尝试修改 wav 文件中的每个值,看看会发生什么。到目前为止,如果我将内核中的输出值乘以 2,它就会失真。但同样,只持续了大约 1 秒,剪辑的其余部分是空的。请注意,输入和输出文件的大小相同。
另外,我的 for 循环总共进行 120 万次迭代,因为这正是我的示例 wav 文件中的样本数。
const char* prog = "__kernel void exchange(__global int *Array, __global int *Output) { for(int j = 0; j < 100000; j++){ for(int i = 0; i < 12; i++){ Output[j+i] = (Array[j+i]);} } }";
int main() {
// This code executes on the OpenCL host
SNDFILE *sf;
SF_INFO info;
int num_channels;
int num, num_items;
//input and output data
int *Array;
int *OutputData;
int f, sr, c;
int i, j;
FILE *out;
/* Open the WAV file. */
info.format = 0;
sf = sf_open("Yourwavfilepathhere", SFM_READ, &info);
if (sf == NULL)
{
printf("Failed to open the file.\n");
perror("Error");
exit(-1);
}
/* Print some of the info, and figure out how much data to read. */
f = info.frames;
sr = info.samplerate;
int format = info.format;
c = info.channels;
printf("frames=%d\n", f);
printf("samplerate=%d\n", sr);
printf("channels=%d\n", c);
printf("format %i\n", format);
num_items = f*c;
printf("num_items=%d\n", num_items);
/* Allocate space for the data to be read, then read it. */
Array = (int *)malloc(num_items*sizeof(int));
OutputData = (int*)malloc(num_items*sizeof(int));;
num = sf_read_int(sf, Array, num_items);
sf_close(sf);
printf("Read %d items\n", num);
//Time variables for performance execution. Event variable needed for timing constraint
cl_event someEvent;
cl_ulong start = (cl_ulong)0;
cl_ulong end = (cl_ulong)0;
cl_ulong finalTime = (cl_ulong)0;
//Number of sampling points
int sampleSize = 100;
float h = 0;
//Coefficient used to multiply the values entering the FIFO buffer implemented inside the kernel
float coefficient = 1 / sampleSize;
//Signal Frequency in Hz
float signalFreq = 10;
//Number of points between 0 and max val (T_Sample)
float freqSample = sampleSize*signalFreq;
//Step = max value or T_Sample. ******Either 1/freqSample or 1/sampleSize for the stepSize******
float stepSize = 1.0 / freqSample;
/*
This is a different Example
*/
// Use this to check the output of each API call
cl_int status;
//-----------------------------------------------------
// STEP 1: Discover and initialize the platforms
//-----------------------------------------------------
cl_uint numPlatforms = 0;
cl_platform_id *platforms = NULL;
// Use clGetPlatformIDs() to retrieve the number of
// platforms
status = clGetPlatformIDs(0, NULL, &numPlatforms);
// Allocate enough space for each platform
platforms =
(cl_platform_id*)malloc(
numPlatforms*sizeof(cl_platform_id));
// Fill in platforms with clGetPlatformIDs()
status = clGetPlatformIDs(numPlatforms, platforms,
NULL);
//-----------------------------------------------------
// STEP 2: Discover and initialize the devices
//-----------------------------------------------------
cl_uint numDevices = 0;
cl_device_id *devices = NULL;
// Use clGetDeviceIDs() to retrieve the number of
// devices present
status = clGetDeviceIDs(
platforms[0],
CL_DEVICE_TYPE_CPU,
0,
NULL,
&numDevices);
// Allocate enough space for each device
devices =
(cl_device_id*)malloc(
numDevices*sizeof(cl_device_id));
// Fill in devices with clGetDeviceIDs()
status = clGetDeviceIDs(
platforms[0],
CL_DEVICE_TYPE_CPU,
numDevices,
devices,
NULL);
//-----------------------------------------------------
// STEP 3: Create a context
//-----------------------------------------------------
cl_context context = NULL;
// Create a context using clCreateContext() and
// associate it with the devices
context = clCreateContext(
NULL,
numDevices,
devices,
NULL,
NULL,
&status);
//-----------------------------------------------------
// STEP 4: Create a command queue
//-----------------------------------------------------
cl_command_queue cmdQueue;
// Create a command queue using clCreateCommandQueue(),
// and associate it with the device you want to execute
// on
cmdQueue = clCreateCommandQueue(
context,
devices[0],
CL_QUEUE_PROFILING_ENABLE,
&status);
//-----------------------------------------------------
// STEP 5: Create device buffers
//-----------------------------------------------------
cl_mem input;
cl_mem output;
cl_float coeff;
input = clCreateBuffer(
context,
CL_MEM_READ_ONLY,
num_items,
NULL,
&status);
output = clCreateBuffer(
context,
CL_MEM_WRITE_ONLY,
num_items,
NULL,
&status);
//-----------------------------------------------------
// STEP 6: Write host data to device buffers
//-----------------------------------------------------
// Use clEnqueueWriteBuffer() to write input array Array to
// the device buffer input
status = clEnqueueWriteBuffer(
cmdQueue,
input,
CL_FALSE,
0,
num_items,
Array,
0,
NULL,
NULL);
printf("status %i \n", status);
//-----------------------------------------------------
// STEP 7: Create and compile the program
//-----------------------------------------------------
// Create a program using clCreateProgramWithSource()
cl_program program = clCreateProgramWithSource(
context,
1,
(const char**)&prog,
NULL,
&status);
printf("status %i \n", status);
// Build (compile) the program for the devices with
// clBuildProgram()
status = clBuildProgram(
program,
numDevices,
devices,
NULL,
NULL,
NULL);
//-----------------------------------------------------
// STEP 8: Create the kernel
//-----------------------------------------------------
cl_kernel kernel = NULL;
kernel = clCreateKernel(program, "exchange", &status);
//-----------------------------------------------------
// STEP 9: Set the kernel arguments
//-----------------------------------------------------
// Associate the input and output buffers with the
// kernel
// using clSetKernelArg()
status = clSetKernelArg(
kernel,
0,
sizeof(cl_mem),
&input);
printf("Status %i \n",status);
status |= clSetKernelArg(
kernel,
1,
sizeof(cl_mem),
&output);
//-----------------------------------------------------
// STEP 10: Configure the work-item structure
//-----------------------------------------------------
// Define an index space (global work size) of work
// items for
// execution. A workgroup size (local work size) is not
// required,
// but can be used.
size_t globalWorkSize[1];
// There are 'elements' work-items
globalWorkSize[0] = sampleSize;
//-----------------------------------------------------
// STEP 11: Enqueue the kernel for execution
//-----------------------------------------------------
// Execute the kernel by using
// clEnqueueNDRangeKernel().
// 'globalWorkSize' is the 1D dimension of the
// work-items
status = clEnqueueNDRangeKernel(
cmdQueue,
kernel,
1,
NULL,
globalWorkSize,
NULL,
0,
NULL,
&someEvent);
clFinish(cmdQueue);
clGetEventProfilingInfo(someEvent, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL);
clGetEventProfilingInfo(someEvent, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL);
double totalTime = end - start;
printf("Total time is: %f ms \n", totalTime / 1000000.0);
//-----------------------------------------------------
// STEP 12: Read the output buffer back to the host
//-----------------------------------------------------
// Use clEnqueueReadBuffer() to read the OpenCL output
// buffer (bufferC)
// to the host output array (C)
printf("Made it here! %i \n", status);
clEnqueueReadBuffer(
cmdQueue,
output,
CL_TRUE,
0,
num_items,
OutputData,
0,
NULL,
NULL);
printf("Made it here2! %i \n", status);
SNDFILE * outfile = sf_open("outputwavfilepathhere", SFM_WRITE, &info);
sf_count_t count = sf_write_int(outfile, OutputData, num_items);
sf_write_sync(outfile);
sf_close(outfile);
//-----------------------------------------------------
// STEP 13: Release OpenCL resources
//-----------------------------------------------------
// Free OpenCL resources
clReleaseKernel(kernel);
clReleaseProgram(program);
clReleaseCommandQueue(cmdQueue);
clReleaseMemObject(input);
clReleaseMemObject(output);
clReleaseContext(context);
// Free host resources
free(OutputData);
free(platforms);
free(devices);
free(Array);
}
最佳答案
试试这个:
/*
 * Copies Array into Output element by element.
 *
 * Each work-item starts at its own global id and steps by the total number
 * of work-items, so together the work-items visit every index below the
 * upper bound exactly once.
 *
 * NOTE(review): the bound 1200000 is hard-coded to the sample file's item
 * count; both buffers must hold at least that many ints. Ideally it would
 * be passed in as an extra kernel argument.
 */
__kernel void exchange(__global int *Array, __global int *Output)
{
    int globalSize = get_global_size(0);
    int globalId = get_global_id(0);
    for (int i = globalId; i < 1200000; i += globalSize) {
        Output[i] = Array[i];
    }
}
确保在 for 循环中使用正确的上限。理想情况下,您将其作为另一个参数传入。
您最初的错误在于:每个工作项都在重复写入同样的前 10 万个左右的元素(索引 0 到 100010),其余部分从未被写入。请查阅工作项函数(如 get_global_id、get_global_size)以了解这些变量的含义,详见 OpenCL 1.2 参考文档。
关于c - OpenCL 内核仅部分写入输出缓冲区,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/37326783/