下面的代码尝试提取像素值的红色、绿色和蓝色 channel ,并与另一组 RGB 值执行算术运算。 看起来代码在尝试执行平方和加法的逻辑方面速度很慢。
是否有可能用更快的版本替换它,因为此逻辑似乎根本不使用 SIMD 功能。
typedef struct {
unsigned char b, g, r, a;
} pixel;
register pixel *pPixel;
register int i, red1, green1, blue1, alpha1;
register int red2, green2, blue2, alpha2;
register long oldD, newD;
red1 = GetRed( *pPixel );
green1 = GetGreen( *pPixel );
blue1 = GetBlue( *pPixel );
alpha1 = GetAlpha( *pPixel );
oldD = 2000000000;
for ( i = 0; i < newcolors; ++i ) {
red2 = GetRed( mycolormap[i].acolor );
green2 = GetGreen( mycolormap[i].acolor );
blue2 = GetBlue( mycolormap[i].acolor );
alpha2 = GetAlpha( mycolormap[i].acolor );
newD = ( red1 - red2 ) * ( red1 - red2 ) +
( green1 - green2 ) * ( green1 - green2 ) +
( blue1 - blue2 ) * ( blue1 - blue2 ) +
( alpha1 - alpha2 ) * ( alpha1 - alpha2 );
if ( newD < oldD ) {
oldD = newD;
}
}
下面的代码部分似乎需要改进
newD = ( red1 - red2 ) * ( red1 - red2 ) +
( green1 - green2 ) * ( green1 - green2 ) +
( blue1 - blue2 ) * ( blue1 - blue2 ) +
( alpha1 - alpha2 ) * ( alpha1 - alpha2 );
最佳答案
这比看起来更难。对您来说不幸的是,C++ 编译器中的自动向量化器很少能像您那样在整数运算方面做得很好。
以下实现仅需要SSE4.1。如果您可以通过将所有这些 vector 升级为 32 字节 vector 来大幅改进 AVX2,但这将使一些事情变得复杂,包括余数和最终的减少。
我假设您不仅需要最小点积,还需要像素的索引。如果您只需要最小点积,请删除 bestIndices
字段以及处理该字段的代码。
struct alignas( 4 ) Pixel
{
uint8_t b, g, r, a;
};
// Define __SSE4_1__ macro when building with MSVC for AVX1 or newer ISA
#if defined( _MSC_VER ) && defined( __AVX__ ) && !defined( __SSE4_1__ )
#define __SSE4_1__ 1
#endif
size_t findClosestPixel( const Pixel& ref, const Pixel* rsi, size_t length, int& bestValue )
{
if( 0 == length )
{
bestValue = INT_MAX;
return ~(size_t)0;
}
class Acc
{
// The reference pixel we're after, broadcasted and split into low/high pieces in 16-bit lanes
__m128i lowRef, highRef;
// The best dot product so far
__m128i bestSquares = _mm_set1_epi32( INT_MAX );
// Index of the pixels currently in bestSquares
__m128i bestIndices = _mm_set1_epi32( -1 );
const __m128i lowMask = _mm_set1_epi16( 0xFF );
// For lanes where dp < bestSquares, update bestSquares and bestIndices vectors
void updateFields( __m128i dp, __m128i indices )
{
const __m128i lt = _mm_cmplt_epi32( dp, bestSquares );
#ifndef __SSE4_1__
bestSquares = _mm_or_si128( _mm_and_si128( lt, dp ), _mm_andnot_si128( lt, bestSquares ) );
bestIndices = _mm_or_si128( _mm_and_si128( lt, indices ), _mm_andnot_si128( lt, bestIndices ) );
#else
bestSquares = _mm_min_epi32( dp, bestSquares );
bestIndices = _mm_blendv_epi8( bestIndices, indices, lt );
#endif
}
public:
Acc( const Pixel& ref )
{
__m128i tmp = _mm_set1_epi32( *(const int*)( &ref ) );
lowRef = _mm_and_si128( tmp, lowMask );
highRef = _mm_srli_epi16( tmp, 8 );
}
// Update the accumulator with another 4 pixels
void update( __m128i pixels, __m128i indices )
{
// Split into two vectors with 16-bit lanes:
// low contains blue and red channels, high contains green and alpha
__m128i low = _mm_and_si128( pixels, lowMask );
__m128i high = _mm_srli_epi16( pixels, 8 );
// Compute difference with the reference value we're after
low = _mm_sub_epi16( low, lowRef );
high = _mm_sub_epi16( high, highRef );
// Compute squares as 32-bit numbers, add adjacent pairs
low = _mm_madd_epi16( low, low );
high = _mm_madd_epi16( high, high );
// Adding them results in the dot product (sum of squares) for all 4 channels
__m128i dp = _mm_add_epi32( low, high );
// Update the state
updateFields( dp, indices );
}
// Compute horizontal minimum across lanes in these accumulators
uint32_t reduce( int& bestDp )
{
// Swap low/high halves
__m128i s2 = _mm_shuffle_epi32( bestSquares, _MM_SHUFFLE( 1, 0, 3, 2 ) );
__m128i i2 = _mm_shuffle_epi32( bestIndices, _MM_SHUFFLE( 1, 0, 3, 2 ) );
updateFields( s2, i2 );
// Swap even/odd lanes
s2 = _mm_shuffle_epi32( bestSquares, _MM_SHUFFLE( 2, 3, 0, 1 ) );
i2 = _mm_shuffle_epi32( bestIndices, _MM_SHUFFLE( 2, 3, 0, 1 ) );
updateFields( s2, i2 );
// Return lowest lanes from both vectors
bestDp = _mm_cvtsi128_si32( bestSquares );
return (uint32_t)_mm_cvtsi128_si32( bestIndices );
}
};
Acc impl{ ref };
const size_t lengthAligned = ( length / 4 ) * 4;
size_t i;
__m128i currentIndices = _mm_setr_epi32( 0, 1, 2, 3 );
for( i = 0; i < lengthAligned; i += 4 )
{
// Load 4 source pixels
__m128i src = _mm_loadu_si128( ( const __m128i* )( rsi + i ) );
// Update things
impl.update( src, currentIndices );
// Increment index vector by 4 pixels
currentIndices = _mm_add_epi32( currentIndices, _mm_set1_epi32( 4 ) );
}
const size_t remainder = length % 4;
if( remainder == 0 )
{
// The input was a multiple of 4 pixels
return impl.reduce( bestValue );
}
const int* const pi = (const int*)( rsi + i );
__m128i rv;
if( lengthAligned > 0 )
{
// We had at least 4 elements on input, can do unaligned load with negative offset
size_t offset = 4 - remainder;
currentIndices = _mm_sub_epi32( currentIndices, _mm_set1_epi32( (int)offset ) );
rv = _mm_loadu_si128( ( const __m128i* )( pi - offset ) );
}
else
{
// Less than 4 elements on input, doing partial load and broadcasting the last element
const size_t remainder = length % 4;
switch( remainder )
{
case 1:
rv = _mm_set1_epi32( pi[ 0 ] );
break;
case 2:
rv = _mm_loadl_epi64( ( const __m128i* )pi );
rv = _mm_shuffle_epi32( rv, _MM_SHUFFLE( 1, 1, 1, 0 ) );
break;
case 3:
rv = _mm_loadl_epi64( ( const __m128i* )pi );
#ifndef __SSE4_1__
rv = _mm_unpacklo_epi64( rv, _mm_set1_epi32( pi[ 2 ] ) );
#else
rv = _mm_insert_epi32( rv, pi[ 2 ], 2 );
rv = _mm_shuffle_epi32( rv, _MM_SHUFFLE( 2, 2, 1, 0 ) );
#endif
break;
}
}
impl.update( rv, currentIndices );
return impl.reduce( bestValue );
}
关于c++ - 如何利用 SIMD 功能计算 RGBA 像素 8 位分量之间的平方差之和?,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/72136987/