c++ - SSE 双线性插值

我在一个紧密的循环中实现双线性插值并尝试使用 SSE 对其进行优化，但我从中得到的加速为零。

这是代码，非 SIMD 版本使用一个简单的 vector 结构，可以定义为 struct Vec3f { float x, y, z; 实现了乘法和加法运算符:

#ifdef USE_SIMD
    const Color c11 = pixelCache[y1 * size.x + x1];
    const Color c12 = pixelCache[y2 * size.x + x1];
    const Color c22 = pixelCache[y2 * size.x + x2];
    const Color c21 = pixelCache[y1 * size.x + x2];

    __declspec(align(16)) float mc11[4] = { 1.0, c11.GetB(), c11.GetG(), c11.GetR() };
    __declspec(align(16)) float mc12[4] = { 1.0, c12.GetB(), c12.GetG(), c12.GetR() };
    __declspec(align(16)) float mc22[4] = { 1.0, c22.GetB(), c22.GetG(), c22.GetR() };
    __declspec(align(16)) float mc21[4] = { 1.0, c21.GetB(), c21.GetG(), c21.GetR() };

    // scalars in vector form for SSE
    const float s11 = (x2-x)*(y2-y);
    const float s12 = (x2-x)*(y-y1);
    const float s22 = (x-x1)*(y-y1);
    const float s21 = (x-x1)*(y2-y);

    __declspec(align(16)) float ms11[4] = {1.0, s11, s11, s11};
    __declspec(align(16)) float ms12[4] = {1.0, s12, s12, s12};
    __declspec(align(16)) float ms22[4] = {1.0, s22, s22, s22};
    __declspec(align(16)) float ms21[4] = {1.0, s21, s21, s21};

    __asm {
        movaps xmm0, mc11
        movaps xmm1, mc12
        movaps xmm2, mc22
        movaps xmm3, mc21

        movaps xmm4, ms11
        movaps xmm5, ms12
        movaps xmm6, ms22
        movaps xmm7, ms21

        mulps xmm0, xmm4
        mulps xmm1, xmm5
        mulps xmm2, xmm6
        mulps xmm3, xmm7

        addps xmm0, xmm1
        addps xmm0, xmm2
        addps xmm0, xmm3

        movaps mc11, xmm0
    }
#else
    const Vec3f c11 = toFloat(pixelCache[y1 * size.x + x1]);
    const Vec3f c12 = toFloat(pixelCache[y2 * size.x + x1]);
    const Vec3f c22 = toFloat(pixelCache[y2 * size.x + x2]);
    const Vec3f c21 = toFloat(pixelCache[y1 * size.x + x2]);

    const Vec3f colour =
            c11*(x2-x)*(y2-y) +
            c21*(x-x1)*(y2-y) +
            c12*(x2-x)*(y-y1) +
            c22*(x-x1)*(y-y1);
#endif

重新排列 asm 代码以重用寄存器(最后只有三个 xmm 寄存器)没有产生任何效果。我也尝试过使用内在函数:

// perform bilinear interpolation
const Vec3f c11 = toFloat(pixelCache[y1 * size.x + x1]);
const Vec3f c12 = toFloat(pixelCache[y2 * size.x + x1]);
const Vec3f c22 = toFloat(pixelCache[y2 * size.x + x2]);
const Vec3f c21 = toFloat(pixelCache[y1 * size.x + x2]);

// scalars in vector form for SSE
const float s11 = (x2-x)*(y2-y);
const float s12 = (x2-x)*(y-y1);
const float s22 = (x-x1)*(y-y1);
const float s21 = (x-x1)*(y2-y);

__m128 mc11 = _mm_set_ps(1.f, c11.b, c11.g, c11.r);
__m128 mc12 = _mm_set_ps(1.f, c12.b, c12.g, c12.r);
__m128 mc22 = _mm_set_ps(1.f, c22.b, c22.g, c22.r);
__m128 mc21 = _mm_set_ps(1.f, c21.b, c21.g, c21.r);

__m128 ms11 = _mm_set_ps(1.f, s11, s11, s11);
__m128 ms12 = _mm_set_ps(1.f, s12, s12, s12);
__m128 ms22 = _mm_set_ps(1.f, s22, s22, s22);
__m128 ms21 = _mm_set_ps(1.f, s21, s21, s21);

mc11 = _mm_mul_ps(mc11, ms11);
mc12 = _mm_mul_ps(mc12, ms12);
mc22 = _mm_mul_ps(mc22, ms22);
mc21 = _mm_mul_ps(mc21, ms21);

mc11 = _mm_add_ps(mc11, mc12);
mc11 = _mm_add_ps(mc11, mc22);
mc11 = _mm_add_ps(mc11, mc21);

Vec3f colour;
_mm_storeu_ps(colour.array, mc11);

但无济于事。我是不是错过了什么，或者不可能在这里获得任何额外的速度？

最佳答案

为什么是 float ？给定 a、b、c、d 和 xerr、yerr 在 0-256 范围内的打包像素 argb，一个简单的例子是:

// =================================================================================================================
// xs_Bilerp
// =================================================================================================================
finline uint32 xs_Bilerp (uint32 a, uint32 b, uint32 c, uint32 d, uint32 xerr, uint32 yerr)
{
    #define xs_rbmask    0x00ff00ff
    #define xs_agmask    0xff00ff00

    if (a==b && c==d && a==d)   return a;

    const uint32 arb        =   a & xs_rbmask;
    const uint32 crb        =   c & xs_rbmask;
    const uint32 aag        =   a & xs_agmask;
    const uint32 cag        =   c & xs_agmask;

    const uint32 rbdx1      =  (b & xs_rbmask) - arb;
    const uint32 rbdx2      =  (d & xs_rbmask) - crb;
    const uint32 agdx1      = ((b & xs_agmask)>>8) - (aag >> 8);
    const uint32 agdx2      = ((d & xs_agmask)>>8) - (cag >> 8);

    const uint32 rb1        = (arb      + ((rbdx1 * xerr) >> 8)) & xs_rbmask;
    const uint32 ag1        = (aag      + ((agdx1 * xerr)     )) & xs_agmask;
    const uint32 rbdy       = ((crb     + ((rbdx2 * xerr) >> 8)) & xs_rbmask)       - rb1;
    const uint32 agdy       = (((cag    + ((agdx2 * xerr)     )) & xs_agmask)>>8)   - (ag1 >> 8);

    const uint32 rb         = (rb1 + ((rbdy * yerr) >> 8)) & xs_rbmask;
    const uint32 ag         = (ag1 + ((agdy * yerr)     )) & xs_agmask;

    return ag | rb;
}

关于c++ - SSE 双线性插值，我们在Stack Overflow上找到一个类似的问题： https://stackoverflow.com/questions/14659612/

c++ - SSE 双线性插值

上一篇：c++ - ADL 不查找静态成员函数吗？

下一篇：c++ - try_lock_for 未按预期工作