我正在致力于在 pyOpenCL 中优化我的 Mandelbrot 渲染器,并希望将迭代分成 block ,以便我可以更好地利用我的 GPU。
最大迭代次数 = 1000 和 2 个“ block ”的示例:
1. 运行 mandelbrot 转义算法迭代 0-500。
2. 保存迭代次数 < 500 的每个点所需的迭代,并针对迭代次数为 500 - 1000 的所有其他点再次运行。
第一个循环按预期工作,但之后的每个 block 都会导致错误的结果。我真的很想更具体,但我不知道真正的问题出在哪里(现在盯着代码超过 2 天)。
我怀疑从内核复制旧的 x,y(实数,虚数)部分时出了问题,但我不知道如何调试它。
我在 GPU 和 CPU 上运行时得到相同的结果,所以我猜它与 GPU 无关。
iterations=2000 和 10 个 block 的示例图像:
这几乎只是第一个 block (加上一些“错误”的像素)。
全部在一个 block 中完成(迭代=200 和 1 block ):
iterations=2000 且 chunks = 1 的预期结果:
import pyopencl as cl
import numpy as np
from PIL import Image
from decimal import Decimal
def mandel(ctx, x, y, zoom, max_iter=1000, iter_steps=1, width=500, height=500, use_double=False):
mf = cl.mem_flags
cl_queue = cl.CommandQueue(ctx)
# build program
code = """
#if real_t == double
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
#endif
kernel void mandel(
__global real_t *coords,
__global uint *output,
__global real_t *output_coord,
const uint max_iter,
const uint start_iter
){
uint id = get_global_id(0);
real_t2 my_coords = vload2(id, coords);
real_t x = my_coords.x;
real_t y = my_coords.y;
uint iter = 0;
for(iter=start_iter; iter<max_iter; ++iter){
if(x*x + y*y > 4.0f){
break;
}
real_t xtemp = x*x - y*y + my_coords.x;
y = 2*x*y + my_coords.y;
x = xtemp;
}
// copy the current x,y pair back
real_t2 val = (real_t2){x, y};
vstore2(val, id, output_coord);
output[id] = iter;
}
"""
_cltype, _nptype = ("double",np.float64) if use_double else ("float", np.float32)
prg = cl.Program(ctx, code).build("-cl-opt-disable -D real_t=%s -D real_t2=%s2" % (_cltype, _cltype))
# Calculate the "viewport".
x0 = x - ((Decimal(3) * zoom)/Decimal(2.))
y0 = y - ((Decimal(2) * zoom)/Decimal(2.))
x1 = x + ((Decimal(3) * zoom)/Decimal(2.))
y1 = y + ((Decimal(2) * zoom)/Decimal(2.))
# Create index map in x,y pairs
xx = np.arange(0, width, 1, dtype=np.uint32)
yy = np.arange(0, height, 1, dtype=np.uint32)
index_map = np.dstack(np.meshgrid(xx, yy))
# and local "coordinates" (real, imaginary parts)
coord_map = np.ndarray(index_map.shape, dtype=_nptype)
coord_map[:] = index_map
coord_map[:] *= (_nptype((x1-x0)/Decimal(width)), _nptype((y1-y0)/Decimal(height)))
coord_map[:] += (_nptype(x0), _nptype(y0))
coord_map = coord_map.flatten()
index_map = index_map.flatten().astype(dtype=np.uint32)
# Create input and output buffer
buffer_in_cl = cl.Buffer(ctx, mf.READ_ONLY, size=coord_map.nbytes)
buffer_out = np.zeros(width*height, dtype=np.uint32) # This will contain the iteration values of that run
buffer_out_cl = cl.Buffer(ctx, mf.WRITE_ONLY, size=buffer_out.nbytes)
buffer_out_coords = np.zeros(width*height*2, dtype=_nptype) # This the last x,y values
buffer_out_coords_cl = cl.Buffer(ctx, mf.WRITE_ONLY, size=buffer_out_coords.nbytes)
# 2D Buffer to collect the iterations needed per pixel
#iter_map = np.zeros(width*height, dtype=np.uint32).reshape((width, height)) #.reshape((height, width))
iter_map = np.zeros(width*height, dtype=np.uint32).reshape((height, width))
start_max_iter = 0
to_do = coord_map.size / 2
steps_size = int(max_iter / float(iter_steps))
while to_do > 0 and start_max_iter < max_iter:
end_max_iter = min(max_iter, start_max_iter + steps_size )
print "Iterations from iteration %i to %i for %i numbers" % (start_max_iter, end_max_iter, to_do)
# copy x/y pairs to device
cl.enqueue_copy(cl_queue, buffer_in_cl, coord_map[:to_do*2]).wait()
# and finally call the ocl function
prg.mandel(cl_queue, (to_do,), None,
buffer_in_cl,
buffer_out_cl,
buffer_out_coords_cl,
np.uint32(end_max_iter),
np.uint32(start_max_iter)
).wait()
# Copy the output back
cl.enqueue_copy(cl_queue, buffer_out_coords, buffer_out_coords_cl).wait()
cl.enqueue_copy(cl_queue, buffer_out, buffer_out_cl).wait()
# Get indices of "found" escapes
done = np.where(buffer_out[:to_do]<end_max_iter)[0]
# and write the iterations to the coresponding cell
index_reshaped = index_map[:to_do*2].reshape((to_do, 2))
tmp = index_reshaped[done]
iter_map[tmp[:,1], tmp[:,0]] = buffer_out[done]
#iter_map[tmp[:,0], tmp[:,1]] = buffer_out[done]
# Get the indices of non escapes
undone = np.where(buffer_out[:to_do]==end_max_iter)[0]
# and write them back to our "job" maps for the next loop
tmp = buffer_out_coords[:to_do*2].reshape((to_do, 2))
coord_map[:undone.size*2] = tmp[undone].flatten()
index_map[:undone.size*2] = index_reshaped[undone].flatten()
to_do = undone.size
start_max_iter = end_max_iter
print "%i done. %i unknown" % (done.size, undone.size)
# simple coloring by modulo 255 on the iter_map
return (iter_map % 255).astype(np.uint8).reshape((height, width))
if __name__ == '__main__':
ctx = cl.create_some_context(interactive=True)
img = mandel(ctx,
x=Decimal("-0.7546546453361122021732941811"),
y=Decimal("0.05020518634419688663435986387"),
zoom=Decimal("0.0002046859427855630601247281079"),
max_iter=2000,
iter_steps=1,
width=500,
height=400,
use_double=False
)
Image.fromarray(img).show()
编辑:Here是另一个版本,其中实部/虚部永远不会离开 GPU 内存。
结果是一样的。
我完全没有主意了。
最佳答案
在进行 Z 平方加 c
计算时,您将使用 buffer_out_coords
中更新的“坐标”而不是原始坐标作为 c 值。 buffer_out_coords 包含当前的 Z
值,而不是原始的 c
坐标,因此这些是您想要开始的值,但您也需要原始坐标。
您只需要 4 处更改:
- 使 buffer_out_coords_cl READ_WRITE
- 每次运行前将 buffer_out_coords 复制到 buffer_out_coords_cl
- 按“undone”过滤 buffer_out_coords 和 coord_map
- 在opencl代码中,从output_coord而不是coords加载起始x和y
我没有得到与您提供的代码相同的输出,所以我不确定这里是否还有其他问题,但此更改给了我一致的输出:
1 block = 153052 未知
PYOPENCL_COMPILER_OUTPUT=1 PYOPENCL_CTX='0' oclgrind python testmand.py
Iterations from iteration 0 to 500 for 200000 numbers
46948 done. 153052 unknown
5 block = 153052 未知
PYOPENCL_COMPILER_OUTPUT=1 PYOPENCL_CTX='0' oclgrind python testmand.py
Iterations from iteration 0 to 100 for 200000 numbers
0 done. 200000 unknown
Iterations from iteration 100 to 200 for 200000 numbers
11181 done. 188819 unknown
Iterations from iteration 200 to 300 for 188819 numbers
9627 done. 179192 unknown
Iterations from iteration 300 to 400 for 179192 numbers
16878 done. 162314 unknown
Iterations from iteration 400 to 500 for 162314 numbers
9262 done. 153052 unknown
代码如下:
import pyopencl as cl
import numpy as np
from PIL import Image
from decimal import Decimal
def mandel(ctx, x, y, zoom, max_iter=1000, iter_steps=1, width=500, height=500, use_double=False):
mf = cl.mem_flags
cl_queue = cl.CommandQueue(ctx)
# build program
code = """
#if real_t == double
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
#endif
kernel void mandel(
__global real_t *coords,
__global uint *output,
__global real_t *output_coord,
const uint max_iter,
const uint start_iter
){
uint id = get_global_id(0);
real_t2 my_coords = vload2(id, coords);
real_t2 my_value_coords = vload2(id, output_coord);
real_t x = my_value_coords.x;
real_t y = my_value_coords.y;
uint iter = 0;
for(iter=start_iter; iter<max_iter; ++iter){
if(x*x + y*y > 4.0f){
break;
}
real_t xtemp = x*x - y*y + my_coords.x;
y = 2*x*y + my_coords.y;
x = xtemp;
}
// copy the current x,y pair back
real_t2 val = (real_t2){x, y};
vstore2(val, id, output_coord);
output[id] = iter;
}
"""
_cltype, _nptype = ("double",np.float64) if use_double else ("float", np.float32)
prg = cl.Program(ctx, code).build("-cl-opt-disable -D real_t=%s -D real_t2=%s2" % (_cltype, _cltype))
# Calculate the "viewport".
x0 = x - ((Decimal(3) * zoom)/Decimal(2.))
y0 = y - ((Decimal(2) * zoom)/Decimal(2.))
x1 = x + ((Decimal(3) * zoom)/Decimal(2.))
y1 = y + ((Decimal(2) * zoom)/Decimal(2.))
# Create index map in x,y pairs
xx = np.arange(0, width, 1, dtype=np.uint32)
yy = np.arange(0, height, 1, dtype=np.uint32)
index_map = np.dstack(np.meshgrid(xx, yy))
# and local "coordinates" (real, imaginary parts)
coord_map = np.ndarray(index_map.shape, dtype=_nptype)
coord_map[:] = index_map
coord_map[:] *= (_nptype((x1-x0)/Decimal(width)), _nptype((y1-y0)/Decimal(height)))
coord_map[:] += (_nptype(x0), _nptype(y0))
coord_map = coord_map.flatten()
index_map = index_map.flatten().astype(dtype=np.uint32)
# Create input and output buffer
buffer_in_cl = cl.Buffer(ctx, mf.READ_ONLY, size=coord_map.nbytes)
buffer_out = np.zeros(width*height, dtype=np.uint32) # This will contain the iteration values of that run
buffer_out_cl = cl.Buffer(ctx, mf.WRITE_ONLY, size=buffer_out.nbytes)
buffer_out_coords = np.zeros(width*height*2, dtype=_nptype) # This the last x,y values
buffer_out_coords_cl = cl.Buffer(ctx, mf.READ_WRITE, size=buffer_out_coords.nbytes)
# 2D Buffer to collect the iterations needed per pixel
#iter_map = np.zeros(width*height, dtype=np.uint32).reshape((width, height)) #.reshape((height, width))
iter_map = np.zeros(width*height, dtype=np.uint32).reshape((height, width))
start_max_iter = 0
to_do = coord_map.size / 2
steps_size = int(max_iter / float(iter_steps))
while to_do > 0 and start_max_iter < max_iter:
end_max_iter = min(max_iter, start_max_iter + steps_size )
print "Iterations from iteration %i to %i for %i numbers" % (start_max_iter, end_max_iter, to_do)
# copy x/y pairs to device
cl.enqueue_copy(cl_queue, buffer_in_cl, coord_map[:to_do*2]).wait()
cl.enqueue_copy(cl_queue, buffer_out_coords_cl, buffer_out_coords[:to_do*2]).wait()
# and finally call the ocl function
prg.mandel(cl_queue, (to_do,), None,
buffer_in_cl,
buffer_out_cl,
buffer_out_coords_cl,
np.uint32(end_max_iter),
np.uint32(start_max_iter)
).wait()
# Copy the output back
cl.enqueue_copy(cl_queue, buffer_out_coords, buffer_out_coords_cl).wait()
cl.enqueue_copy(cl_queue, buffer_out, buffer_out_cl).wait()
# Get indices of "found" escapes
done = np.where(buffer_out[:to_do]<end_max_iter)[0]
# and write the iterations to the coresponding cell
index_reshaped = index_map[:to_do*2].reshape((to_do, 2))
tmp = index_reshaped[done]
iter_map[tmp[:,1], tmp[:,0]] = buffer_out[done]
#iter_map[tmp[:,0], tmp[:,1]] = buffer_out[done]
# Get the indices of non escapes
undone = np.where(buffer_out[:to_do]==end_max_iter)[0]
# and write them back to our "job" maps for the next loop
tmp = buffer_out_coords[:to_do*2].reshape((to_do, 2))
buffer_out_coords[:undone.size*2] = tmp[undone].flatten()
tmp = coord_map[:to_do*2].reshape((to_do, 2))
coord_map[:undone.size*2] = tmp[undone].flatten()
index_map[:undone.size*2] = index_reshaped[undone].flatten()
to_do = undone.size
start_max_iter = end_max_iter
print "%i done. %i unknown" % (done.size, undone.size)
# simple coloring by modulo 255 on the iter_map
return (iter_map % 255).astype(np.uint8).reshape((height, width))
if __name__ == '__main__':
ctx = cl.create_some_context(interactive=True)
img = mandel(ctx,
x=Decimal("-0.7546546453361122021732941811"),
y=Decimal("0.05020518634419688663435986387"),
zoom=Decimal("0.0002046859427855630601247281079"),
max_iter=2000,
iter_steps=1,
width=500,
height=400,
use_double=False
)
Image.fromarray(img).show()
关于python - 无法在 pyopencl 中渲染 mandelbrot,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/43299613/