OpenCL 遍历内核——进一步优化

目前，我有一个 OpenCL 内核，用于如下遍历。如果有人对这个相当大的内核的优化有一些看法，我会很高兴。

问题是，我正在使用 SAH BVH 运行此代码，并且我想获得与 Timo Aila 在他的论文(Understanding the Efficiency of Ray Traversal on GPUs)中的遍历相似的性能，当然他的代码使用 SplitBVH(其中我可能会考虑使用 SAH BVH 来代替，但在我看来，它的构建时间真的很慢)。但我问的是遍历，而不是 BVH(而且我到目前为止只处理场景，其中 SplitBVH 不会给你带来比 SAH BVH 太多的优势)。

首先，这是我到目前为止所拥有的(标准的 while-while 遍历内核)。

__constant sampler_t sampler = CLK_FILTER_NEAREST;

// Inline definition of horizontal max
inline float max4(float a, float b, float c, float d)
{
    return max(max(max(a, b), c), d);
}

// Inline definition of horizontal min
inline float min4(float a, float b, float c, float d)
{
    return min(min(min(a, b), c), d);
}

// Traversal kernel
__kernel void traverse( __read_only image2d_t nodes,
                        __global const float4* triangles,
                        __global const float4* rays,
                        __global float4* result,
                        const int num,
                        const int w,
                        const int h)
{
    // Ray index
    int idx = get_global_id(0);

    if(idx < num)
    {
        // Stack
        int todo[32];
        int todoOffset = 0;

        // Current node
        int nodeNum = 0;

        float tmin = 0.0f;
        float depth = 2e30f;

        // Fetch ray origin, direction and compute invdirection
        float4 origin = rays[2 * idx + 0];
        float4 direction = rays[2 * idx + 1];
        float4 invdir = native_recip(direction);

        float4 temp = (float4)(0.0f, 0.0f, 0.0f, 1.0f);

        // Traversal loop
        while(true)
        {
            // Fetch node information
            int2 nodeCoord = (int2)((nodeNum << 2) % w, (nodeNum << 2) / w);
            int4 specs = read_imagei(nodes, sampler, nodeCoord + (int2)(3, 0));

            // While node isn't leaf
            while(specs.z == 0)
            {
                // Fetch child bounding boxes
                float4 n0xy = read_imagef(nodes, sampler, nodeCoord);
                float4 n1xy = read_imagef(nodes, sampler, nodeCoord + (int2)(1, 0));
                float4 nz = read_imagef(nodes, sampler, nodeCoord + (int2)(2, 0));

                // Test ray against child bounding boxes
                float oodx = origin.x * invdir.x;
                float oody = origin.y * invdir.y;
                float oodz = origin.z * invdir.z;
                float c0lox = n0xy.x * invdir.x - oodx;
                float c0hix = n0xy.y * invdir.x - oodx;
                float c0loy = n0xy.z * invdir.y - oody;
                float c0hiy = n0xy.w * invdir.y - oody;
                float c0loz = nz.x * invdir.z - oodz;
                float c0hiz = nz.y * invdir.z - oodz;
                float c1loz = nz.z * invdir.z - oodz;
                float c1hiz = nz.w * invdir.z - oodz;
                float c0min = max4(min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz), tmin);
                float c0max = min4(max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz), depth);
                float c1lox = n1xy.x * invdir.x - oodx;
                float c1hix = n1xy.y * invdir.x - oodx;
                float c1loy = n1xy.z * invdir.y - oody;
                float c1hiy = n1xy.w * invdir.y - oody;
                float c1min = max4(min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz), tmin);
                float c1max = min4(max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz), depth);

                bool traverseChild0 = (c0max >= c0min);
                bool traverseChild1 = (c1max >= c1min);

                nodeNum = specs.x;
                int nodeAbove = specs.y;

                // We hit just one out of 2 childs
                if(traverseChild0 != traverseChild1)
                {
                    if(traverseChild1)
                    {
                        nodeNum = nodeAbove;
                    }
                }
                // We hit either both or none
                else
                {
                    // If we hit none, pop node from stack (or exit traversal, if stack is empty)
                    if (!traverseChild0)
                    {
                        if(todoOffset == 0)
                        {
                            break;
                        }
                        nodeNum = todo[--todoOffset];
                    }
                    // If we hit both
                    else
                    {
                        // Sort them (so nearest goes 1st, further 2nd)
                        if(c1min < c0min)
                        {
                            unsigned int tmp = nodeNum;
                            nodeNum = nodeAbove;
                            nodeAbove = tmp;
                        }

                        // Push further on stack
                        todo[todoOffset++] = nodeAbove;
                    }
                }

                // Fetch next node information
                nodeCoord = (int2)((nodeNum << 2) % w, (nodeNum << 2) / w);
                specs = read_imagei(nodes, sampler, nodeCoord + (int2)(3, 0));
            }

            // If node is leaf & has some primitives
            if(specs.z > 0)
            {
                // Loop through primitives & perform intersection with them (Woop triangles)
                for(int i = specs.x; i < specs.y; i++)
                {
                    // Fetch first point from global memory
                    float4 v0 = triangles[i * 4 + 0];

                    float o_z = v0.w - origin.x * v0.x - origin.y * v0.y - origin.z * v0.z;
                    float i_z = 1.0f / (direction.x * v0.x + direction.y * v0.y + direction.z * v0.z);
                    float t = o_z * i_z;

                    if(t > 0.0f && t < depth)
                    {
                        // Fetch second point from global memory
                        float4 v1 = triangles[i * 4 + 1];

                        float o_x = v1.w + origin.x * v1.x + origin.y * v1.y + origin.z * v1.z;
                        float d_x = direction.x * v1.x + direction.y * v1.y + direction.z * v1.z;
                        float u = o_x + t * d_x;

                        if(u >= 0.0f && u <= 1.0f)
                        {
                            // Fetch third point from global memory
                            float4 v2 = triangles[i * 4 + 2];

                            float o_y = v2.w + origin.x * v2.x + origin.y * v2.y + origin.z * v2.z;
                            float d_y = direction.x * v2.x + direction.y * v2.y + direction.z * v2.z;
                            float v = o_y + t * d_y;

                            if(v >= 0.0f && u + v <= 1.0f)
                            {
                                // We got successful hit, store the information
                                depth = t;
                                temp.x = u;
                                temp.y = v;
                                temp.z = t;
                                temp.w = as_float(i);
                            }
                        }
                    }
                }
            }

            // Pop node from stack (if empty, finish traversal)
            if(todoOffset == 0)
            {
                break;
            }

            nodeNum = todo[--todoOffset];
        }

        // Store the ray traversal result in global memory
        result[idx] = temp;
    }
}

今天的第一个问题是，如何在 OpenCL 中编写他的 Persistent while-while 和 Speculative while-while 内核？

广告持续一段时间 ，我说得对吗，我实际上只是以与本地工作大小相等的全局工作大小启动内核，并且这两个数字都应该等于 GPU 的扭曲/波前大小？
我知道使用 CUDA 持久线程实现如下所示:

  do
  {
        volatile int& jobIndexBase = nextJobArray[threadIndex.y];

        if(threadIndex.x == 0)
        {
              jobIndexBase = atomicAdd(&warpCounter, WARP_SIZE);
        }

        index = jobIndexBase + threadIndex.x;

        if(index >= totalJobs)
              return;

        /* Perform work for task numbered 'index' */
  }
  while(true);

OpenCL 中的等价物如何看起来像，我知道我必须在那里做一些障碍，我也知道一个应该在我原子地将 WARP_SIZE 添加到 warpCounter 的分数之后。

广告投机遍历 - 好吧，我可能没有任何想法应该如何在 OpenCL 中实现，因此欢迎提供任何提示。我也不知道把障碍放在哪里(因为把它们放在模拟的 __any 周围会导致驱动程序崩溃)。

如果您在这里成功，感谢您的阅读，欢迎您提供任何提示、答案等!

最佳答案

您可以做的优化是使用向量变量和融合乘加函数来加速设置数学。至于内核的其余部分，它很慢，因为它是分支的。如果您可以对信号数据进行假设，则可以通过减少代码分支来减少执行时间。我还没有检查 float4 swizles(浮点 4 变量之后的 .xxyy 和 .x .y .z .w)，所以只需检查一下。

            float4 n0xy = read_imagef(nodes, sampler, nodeCoord);
            float4 n1xy = read_imagef(nodes, sampler, nodeCoord + (int2)(1, 0));
            float4 nz = read_imagef(nodes, sampler, nodeCoord + (int2)(2, 0));

            float4 oodf4 = -origin * invdir;

            float4 c0xyf4 = fma(n0xy,invdir.xxyy,oodf4);

            float4 c0zc1z = fma(nz,(float4)(invdir.z),oodf4);

            float c0min = max4(min(c0xyf4.x, c0xyf4.y), min(c0xyf4.z, c0xyf4.w), min(c0zc1z.z, c0zc1z.w), tmin);
            float c0max = min4(max(c0xyf4.x, c0xyf4.y), max(c0xyf4.z, c0xyf4.w), max(c0zc1z.z, c0zc1z.w), depth);

            float4 c1xy = fma(n1xy,invdir.xxyy,oodf4);

            float c1min = max4(min(c1xy.x, c1xy.y), min(c1xy.z, c1xy.w), min(c0zc1z.z, c0zc1z.w), tmin);
            float c1max = min4(max(c1xy.x, c1xy.y), max(c1xy.z, c1xy.w), max(c0zc1z.z, c0zc1z.w), depth);

关于OpenCL 遍历内核——进一步优化，我们在Stack Overflow上找到一个类似的问题： https://stackoverflow.com/questions/15933974/

OpenCL 遍历内核——进一步优化

上一篇：printf - Notepad++ 特殊字符串字符高亮

下一篇：linux - dup2 真的可以返回 EINTR 吗？