我想尽可能高效地将 64 次双重比较的结果加载/比较/打包到 uint64_t 位掩码中。
我当前的方法是通过 AVX2
比较 2*2 对使用_mm256_cmp_pd
。两次 (=8) 比较的结果均使用 _mm256_movemask_pd
转换为位图。并通过a|b<<4
作为字节分配到 union 中(1x uint64_t/8 uint8_t)以保存一些移位/或操作。
这个例子可能有助于可视化
union ui64 {
uint64_t i64;
struct { uint8_t i0,i1,i2,i3,i4,i5,i6,i7; } i8;
};
static inline uint64_t cmp64 (double* in0, double* in1) {
ui64 result;
// +0
result.i8.i0 =
_mm256_movemask_pd(
_mm256_cmp_pd(
_mm256_load_pd(in0 + 0),
_mm256_load_pd(in1 + 0),
_CMP_LT_OQ)) |
_mm256_movemask_pd(
_mm256_cmp_pd(
_mm256_load_pd(in0 + 4),
_mm256_load_pd(in1 + 4),
_CMP_LT_OQ)) << 4;
// +8
// load, compare, pack n+8 each 'iteration' using result.i8.i1,...i7
// ...
return result.i64;
}
compress&set 的变体看起来很直接,但使用较慢的指令:1x _mm256_set_m128
和 2x _mm256_cvtpd_ps
与标量 <<
和|
像这样
_mm256_movemask_ps(_mm256_set_m128(
_mm256_cvtpd_ps(v0),
_mm256_cvtpd_ps(v1)));
使用的CPU是Zen 1
(最大 AVX2
),不确定 GPU 使用(Nvidia)是否是一个选项。
请分享您的想法。
最佳答案
这是一个例子。它使用最有效的指令将比较结果打包到字节中,每 32 个数字一次改组为正确的顺序,并使用 _mm256_movemask_epi8
一次生成 32 位。
// Compare 4 numbers, return 32 bytes with results of 4 comparisons:
// 00000000 11111111 22222222 33333333
inline __m256d compare4( const double* a, const double* b )
{
return _mm256_cmp_pd( _mm256_load_pd( a ), _mm256_load_pd( b ), _CMP_LT_OQ );
}
// Compare 8 numbers, combine 8 results into the following bytes:
// 0000 4444 1111 5555 2222 6666 3333 7777
inline __m256i compare8( const double* a, const double* b )
{
__m256 c0 = _mm256_castpd_ps( compare4( a, b ) );
__m256 c1 = _mm256_castpd_ps( compare4( a + 4, b + 4 ) );
return _mm256_castps_si256( _mm256_blend_ps( c0, c1, 0b10101010 ) );
}
// Compare 16 numbers, combine 16 results into following bytes:
// 00 44 11 55 88 CC 99 DD 22 66 33 77 AA EE BB FF
inline __m256i compare16( const double* a, const double* b )
{
__m256i c0 = compare8( a, b );
__m256i c1 = compare8( a + 8, b + 8 );
return _mm256_packs_epi32( c0, c1 );
}
inline uint32_t compare32( const double* a, const double* b )
{
// Compare 32 numbers and merge them into a single vector
__m256i c0 = compare16( a, b );
__m256i c1 = compare16( a + 16, b + 16 );
__m256i src = _mm256_packs_epi16( c0, c1 );
// We got the 32 bytes, but the byte order is screwed up in that vector:
// 0 4 1 5 8 12 9 13 16 20 17 21 24 28 25 29
// 2 6 3 7 10 14 11 15 18 22 19 23 26 30 27 31
// The following 2 instructions are fixing the order
// Shuffle 8-byte pieces across the complete vector
// That instruction is relatively expensive on most CPUs, but we only doing it once per 32 numbers
src = _mm256_permute4x64_epi64( src, _MM_SHUFFLE( 3, 1, 2, 0 ) );
// The order of bytes in the vector is still wrong:
// 0 4 1 5 8 12 9 13 2 6 3 7 10 14 11 15
// 13 16 20 17 21 24 28 25 18 22 19 23 26 30 27 31
// Need one more shuffle instruction
const __m128i perm16 = _mm_setr_epi8(
0, 2, 8, 10, 1, 3, 9, 11, 4, 6, 12, 14, 5, 7, 13, 15 );
// If you calling this in a loop and everything is inlined,
// the shuffling vector should be pre-loaded outside of the loop.
const __m256i perm = _mm256_broadcastsi128_si256( perm16 );
// Shuffle the bytes
src = _mm256_shuffle_epi8( src, perm );
// The order of bytes is now correct, can use _mm256_movemask_epi8 to make 32 bits of the result
return (uint32_t)_mm256_movemask_epi8( src );
}
uint64_t compareAndPack64( const double* a, const double* b )
{
uint64_t low = compare32( a, b );
uint64_t high = compare32( a + 32, b + 32 );
return low | ( high << 32 );
}
关于c++ - 在 uint64_t 位掩码中高效加载/计算/打包 64 个双重比较结果,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/70974476/