c++ - 如何利用 SIMD 功能计算 RGBA 像素 8 位分量之间的平方差之和?

标签 c++ simd sse intrinsics avx

下面的代码尝试提取像素值的红色、绿色和蓝色 channel ,并与另一组 RGB 值执行算术运算。 看起来代码在尝试执行平方和加法的逻辑方面速度很慢。

是否有可能用更快的版本替换它,因为此逻辑似乎根本不使用 SIMD 功能。

typedef struct {
        unsigned char b, g, r, a;
    } pixel;

            register pixel *pPixel;
            register int i, red1, green1, blue1, alpha1;
            register int red2, green2, blue2, alpha2;
            register long oldD, newD;

            red1 = GetRed( *pPixel );
            green1 = GetGreen( *pPixel );
            blue1 = GetBlue( *pPixel );
            alpha1 = GetAlpha( *pPixel );
            oldD = 2000000000;
            for ( i = 0; i < newcolors; ++i ) {
                red2 = GetRed( mycolormap[i].acolor );
                green2 = GetGreen( mycolormap[i].acolor );
                blue2 = GetBlue( mycolormap[i].acolor );
                alpha2 = GetAlpha( mycolormap[i].acolor );

                newD = ( red1 - red2 ) * ( red1 - red2 ) +  
                          ( green1 - green2 ) * ( green1 - green2 ) +
                          ( blue1 - blue2 ) * ( blue1 - blue2 ) +
                          ( alpha1 - alpha2 ) * ( alpha1 - alpha2 );
                if ( newD < oldD ) {
                    oldD = newD;
                }
            }

下面的代码部分似乎需要改进

  newD = ( red1 - red2 ) * ( red1 - red2 ) +  
                              ( green1 - green2 ) * ( green1 - green2 ) +
                              ( blue1 - blue2 ) * ( blue1 - blue2 ) +
                              ( alpha1 - alpha2 ) * ( alpha1 - alpha2 );

最佳答案

这比看起来更难。对您来说不幸的是,C++ 编译器中的自动向量化器很少能像您那样在整数运算方面做得很好。

以下实现仅需要SSE4.1。如果您可以通过将所有这些 vector 升级为 32 字节 vector 来大幅改进 AVX2,但这将使一些事情变得复杂,包括余数和最终的减少。

我假设您不仅需要最小点积,还需要像素的索引。如果您只需要最小点积,请删除 bestIndices 字段以及处理该字段的代码。

struct alignas( 4 ) Pixel
{
    uint8_t b, g, r, a;
};

// Define __SSE4_1__ macro when building with MSVC for AVX1 or newer ISA
#if defined( _MSC_VER ) && defined( __AVX__ ) && !defined( __SSE4_1__ )
#define __SSE4_1__ 1
#endif

size_t findClosestPixel( const Pixel& ref, const Pixel* rsi, size_t length, int& bestValue )
{
    if( 0 == length )
    {
        bestValue = INT_MAX;
        return ~(size_t)0;
    }

    class Acc
    {
        // The reference pixel we're after, broadcasted and split into low/high pieces in 16-bit lanes
        __m128i lowRef, highRef;
        // The best dot product so far
        __m128i bestSquares = _mm_set1_epi32( INT_MAX );
        // Index of the pixels currently in bestSquares
        __m128i bestIndices = _mm_set1_epi32( -1 );

        const __m128i lowMask = _mm_set1_epi16( 0xFF );

        // For lanes where dp < bestSquares, update bestSquares and bestIndices vectors
        void updateFields( __m128i dp, __m128i indices )
        {
            const __m128i lt = _mm_cmplt_epi32( dp, bestSquares );
#ifndef __SSE4_1__
            bestSquares = _mm_or_si128( _mm_and_si128( lt, dp ), _mm_andnot_si128( lt, bestSquares ) );
            bestIndices = _mm_or_si128( _mm_and_si128( lt, indices ), _mm_andnot_si128( lt, bestIndices ) );
#else
            bestSquares = _mm_min_epi32( dp, bestSquares );
            bestIndices = _mm_blendv_epi8( bestIndices, indices, lt );
#endif
        }

    public:

        Acc( const Pixel& ref )
        {
            __m128i tmp = _mm_set1_epi32( *(const int*)( &ref ) );
            lowRef = _mm_and_si128( tmp, lowMask );
            highRef = _mm_srli_epi16( tmp, 8 );
        }

        // Update the accumulator with another 4 pixels
        void update( __m128i pixels, __m128i indices )
        {
            // Split into two vectors with 16-bit lanes:
            // low contains blue and red channels, high contains green and alpha
            __m128i low = _mm_and_si128( pixels, lowMask );
            __m128i high = _mm_srli_epi16( pixels, 8 );

            // Compute difference with the reference value we're after
            low = _mm_sub_epi16( low, lowRef );
            high = _mm_sub_epi16( high, highRef );

            // Compute squares as 32-bit numbers, add adjacent pairs
            low = _mm_madd_epi16( low, low );
            high = _mm_madd_epi16( high, high );
            // Adding them results in the dot product (sum of squares) for all 4 channels
            __m128i dp = _mm_add_epi32( low, high );

            // Update the state
            updateFields( dp, indices );
        }

        // Compute horizontal minimum across lanes in these accumulators
        uint32_t reduce( int& bestDp )
        {
            // Swap low/high halves
            __m128i s2 = _mm_shuffle_epi32( bestSquares, _MM_SHUFFLE( 1, 0, 3, 2 ) );
            __m128i i2 = _mm_shuffle_epi32( bestIndices, _MM_SHUFFLE( 1, 0, 3, 2 ) );
            updateFields( s2, i2 );

            // Swap even/odd lanes
            s2 = _mm_shuffle_epi32( bestSquares, _MM_SHUFFLE( 2, 3, 0, 1 ) );
            i2 = _mm_shuffle_epi32( bestIndices, _MM_SHUFFLE( 2, 3, 0, 1 ) );
            updateFields( s2, i2 );

            // Return lowest lanes from both vectors
            bestDp = _mm_cvtsi128_si32( bestSquares );
            return (uint32_t)_mm_cvtsi128_si32( bestIndices );
        }
    };

    Acc impl{ ref };

    const size_t lengthAligned = ( length / 4 ) * 4;
    size_t i;
    __m128i currentIndices = _mm_setr_epi32( 0, 1, 2, 3 );
    for( i = 0; i < lengthAligned; i += 4 )
    {
        // Load 4 source pixels
        __m128i src = _mm_loadu_si128( ( const __m128i* )( rsi + i ) );
        // Update things
        impl.update( src, currentIndices );
        // Increment index vector by 4 pixels
        currentIndices = _mm_add_epi32( currentIndices, _mm_set1_epi32( 4 ) );
    }

    const size_t remainder = length % 4;
    if( remainder == 0 )
    {
        // The input was a multiple of 4 pixels
        return impl.reduce( bestValue );
    }

    const int* const pi = (const int*)( rsi + i );
    __m128i rv;
    if( lengthAligned > 0 )
    {
        // We had at least 4 elements on input, can do unaligned load with negative offset
        size_t offset = 4 - remainder;
        currentIndices = _mm_sub_epi32( currentIndices, _mm_set1_epi32( (int)offset ) );
        rv = _mm_loadu_si128( ( const __m128i* )( pi - offset ) );
    }
    else
    {
        // Less than 4 elements on input, doing partial load and broadcasting the last element
        const size_t remainder = length % 4;
        switch( remainder )
        {
        case 1:
            rv = _mm_set1_epi32( pi[ 0 ] );
            break;
        case 2:
            rv = _mm_loadl_epi64( ( const __m128i* )pi );
            rv = _mm_shuffle_epi32( rv, _MM_SHUFFLE( 1, 1, 1, 0 ) );
            break;
        case 3:
            rv = _mm_loadl_epi64( ( const __m128i* )pi );
#ifndef __SSE4_1__
            rv = _mm_unpacklo_epi64( rv, _mm_set1_epi32( pi[ 2 ] ) );
#else
            rv = _mm_insert_epi32( rv, pi[ 2 ], 2 );
            rv = _mm_shuffle_epi32( rv, _MM_SHUFFLE( 2, 2, 1, 0 ) );
#endif
            break;
        }
    }

    impl.update( rv, currentIndices );
    return impl.reduce( bestValue );
}

关于c++ - 如何利用 SIMD 功能计算 RGBA 像素 8 位分量之间的平方差之和?,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/72136987/

相关文章:

针对 SIMD : Making an SoA less of a PiTA 的 C++ 设计

x86 - SSE:将短整数转换为 float

c++ - vector * matrix 乘积效率问题

c++ - 在 C++ 中打印链表的元素(参见代码)

c++ - 通过安装 intel parallel X studio 使用 Cilk 库

c++ - C++ 中的 size_t 和 int 有什么区别?

c - __m256d 的广播条目

c++ - 尽管 try-catch 异常仍泄漏到系统

c++ - SSE 加载/存储内存事务

c - 内存操作 : set every n-th bit (C/C++) in modern CPUs/GPUs