performance - 快速 SSE 阈值算法

标签 performance algorithm optimization sse simd

我正在尝试用 SSE 写出一个非常快速的阈值算法,用来替换下面这段标量代码:

uint8_t *pSrc, *pDst;

// Assume pSrc and pDst point to valid data

// Handle left edge: the first element is copied unconditionally
*pDst++ = *pSrc++;

// Likeness filter: overwrite a destination element only when it differs
// from the source by more than the threshold. The comparison is done on the
// squared difference; the uint8_t operands are promoted to int before the
// subtraction, so the difference may be negative without wrapping.
for (uint32_t k = 2; k < width; k++, pSrc++, pDst++) {
    if ((*pDst - *pSrc) * (*pDst - *pSrc) > 100 /*THRESHOLD_SQUARED*/) {
        *pDst = *pSrc;
    }
}

// Handle right edge: the last element is copied unconditionally
*pDst++ = *pSrc++;

到目前为止我有这个:

const uint8_t THRESHOLD = 10;

__attribute__((aligned (16))) static const uint8_t mask[16] = {
    THRESHOLD, THRESHOLD, THRESHOLD, THRESHOLD,
    THRESHOLD, THRESHOLD, THRESHOLD, THRESHOLD,
    THRESHOLD, THRESHOLD, THRESHOLD, THRESHOLD,
    THRESHOLD, THRESHOLD, THRESHOLD, THRESHOLD
};

__m128i xmm1, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9;
xmm1 = _mm_load_si128((__m128i const *)mask);
xmm6 = _mm_setzero_si128();

uint8_t *pSrc, *pDst;

// Assume pSrc and pDst point to valid data

// I have other code with another mask for the first 16 entries

for (uint32_t k = 16; k < (width - 16); k += 16, pSrc += 16, pDst += 16) {
    xmm3 = _mm_load_si128((__m128i const *)pDst);
    xmm4 = _mm_load_si128((__m128i const *)pSrc);
    xmm5 = _mm_unpacklo_epi8(xmm3, xmm6);
    xmm7 = _mm_unpackhi_epi8(xmm3, xmm6);
    xmm8 = _mm_unpacklo_epi8(xmm4, xmm6);
    xmm9 = _mm_unpackhi_epi8(xmm4, xmm6);
    xmm5 = _mm_sub_epi16(xmm5, xmm8);
    xmm7 = _mm_sub_epi16(xmm7, xmm9);
    xmm5 = _mm_abs_epi16(xmm5);
    xmm7 = _mm_abs_epi16(xmm7);
    xmm5 = _mm_packs_epi16(xmm5, xmm7);
    xmm5 = _mm_cmpgt_epi8(xmm5, xmm1);
    xmm3 = _mm_blendv_epi8(xmm3, xmm4, xmm5);
    _mm_store_si128((__m128i *)pDst, xmm3);
}

// I have other code with another mask for the last 16 entries

我想到可以用另一种算法来计算两个值之差的绝对值(主要是为了始终停留在 U8(uchar)范围内):

a' = a >> 1;
b' = b >> 1;
diff = (abs(sub(a' - b')) << 1) + ((a ^ b) & 1);

这将需要 8 条 SSE 指令,而不是上面的 9 条(不包括编译器生成的任何额外寄存器移动),但由于依赖延迟,我不确定它是否更快。

有没有其他SSE专家有更好的建议(最高使用SSE 4.2)?

更新 1 - 感谢 Yves 的建议!

const uint8_t THRESHOLD = 10;

__attribute__((aligned (16))) static const uint8_t mask[16] = {
    THRESHOLD, THRESHOLD, THRESHOLD, THRESHOLD,
    THRESHOLD, THRESHOLD, THRESHOLD, THRESHOLD,
    THRESHOLD, THRESHOLD, THRESHOLD, THRESHOLD,
    THRESHOLD, THRESHOLD, THRESHOLD, THRESHOLD
};

__m128i xmm1, xmm3, xmm4, xmm5, xmm6, xmm7;
xmm1 = _mm_load_si128((__m128i const *)mask);
xmm6 = _mm_setzero_si128();

uint8_t *pSrc, *pDst;

// Assume pSrc and pDst point to valid data

// I have other code with another mask for the first 16 entries

for (uint32_t k = 16; k < (width - 16); k += 16, pSrc += 16, pDst += 16) {
    xmm3 = _mm_load_si128((__m128i const *)pDst);
    xmm4 = _mm_load_si128((__m128i const *)pSrc);
    xmm5 = _mm_subs_epu8(xmm3, xmm4);
    xmm7 = _mm_subs_epu8(xmm4, xmm3);
    xmm5 = _mm_adds_epu8(xmm5, xmm7);
    xmm5 = _mm_subs_epu8(xmm5, xmm1);
    xmm5 = _mm_cmpeq_epi8(xmm5, xmm6);
    xmm4 = _mm_blendv_epi8(xmm4, xmm3, xmm5);
    _mm_store_si128((__m128i *)pDst, xmm4);
}

// I have other code with another mask for the last 16 entries

最佳答案

有一种高效的替代方法,可以利用饱和算术(saturation arithmetic)来计算绝对差。

实际上,无符号饱和减法计算的是 A ⊖ B = Max(A − B, 0),因此 |A − B| = (A ⊖ B) + (B ⊖ A)

Diff= _mm_adds_epu8(_mm_subs_epu8(A, B), _mm_subs_epu8(B, A));

这个和不会饱和(两个饱和差中至少有一个为 0)。这样,数据始终保持为 16 个 8 位无符号数,从而获得最大吞吐量。

关于performance - 快速 SSE 阈值算法,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/26570178/

相关文章:

c++ - 函数大小与执行速度

python - Numpy:将行值广播到 channel

python - 关灯算法

performance - 从 Windows Server 2003 迁移到 2008(IIS 6 到 IIS 7)时性能显着下降

java - 当服务器 JIT 激活时,什么会导致我的代码运行速度变慢?

algorithm - opencv:检测棋盘角点的最佳方法

algorithm - 用 1 种颜色对图表进行部分着色

c - 非平凡的 goto 使用(可能击败编译器)

c - 指针地址不递增

c++ - 如何优化以下公共(public)循环?