c++ - 使用 SSE 比较两个 vector<bool>

标签 c++ x86 sse simd

我有两个 vector<bool> A和B。

我想比较它们并计算相等的元素数:

例如:

A = {0,1,0,1}
B = {0,0,1,1}

结果将等于 2。

我可以使用 _mm_cmpeq_epi8 但它只比较 16 个元素(即我应该将 0 和 1 转换为 char 然后进行比较)。 是否可以用SSE(或SIMD指令)每次比较128个元素?

最佳答案

如果可以假设 vector<bool> 使用连续的、每个元素占一个字节的方式存储,或者可以考虑改用 vector<uint8_t> 之类的容器,那么下面的示例应该是一个不错的起点:

// Counts the positions at which two byte vectors hold equal values, using
// SSE2 to compare 16 elements per instruction.
//
// Parameters:
//   vec1, vec2 - byte vectors to compare. They must have the same size; this
//                is checked with assert(), i.e. only when NDEBUG is not defined.
// Returns:
//   The number of indices i for which vec1[i] == vec2[i].
static size_t count_equal(const std::vector<uint8_t> &vec1, const std::vector<uint8_t> &vec2)
{
    assert(vec1.size() == vec2.size());         // vectors must be same size

    const size_t n = vec1.size();
    // Every 8-bit lane grows by at most 1 per 16-byte vector, so a block of at
    // most 255 vectors can never overflow a lane.
    const size_t max_block_size = 255 * 16;

    size_t i = 0;
    size_t count = 0;

    while (i + 16 <= n)                         // for each block
    {
        // m >= i + 16 here, so the inner loop always makes progress.
        const size_t m = std::min(n, i + max_block_size);
        __m128i vcount = _mm_setzero_si128();   // per-lane match counters for this block

        for ( ; i + 16 <= m; i += 16)           // for each vector in block
        {
            const __m128i v1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&vec1[i]));
            const __m128i v2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&vec2[i]));
            const __m128i vcmp = _mm_cmpeq_epi8(v1, v2);
            // Equal lanes are 0xFF (-1), so subtracting adds 1 per match.
            vcount = _mm_sub_epi8(vcount, vcmp);
        }

        // Update count from current block: the sum of absolute differences
        // against zero adds up each group of 8 lanes into 16-bit words 0 and 4.
        const __m128i vsum = _mm_sad_epu8(vcount, _mm_setzero_si128());
        count += _mm_extract_epi16(vsum, 0) + _mm_extract_epi16(vsum, 4);
    }

    for ( ; i < n; ++i)                         // deal with any remaining partial vector
    {
        count += (vec1[i] == vec2[i]);
    }
    return count;
}

请注意,这里使用的是 vector<uint8_t>。如果你确实必须使用 vector<bool>,并且能够保证其元素始终是连续的、且每个元素占一个字节,那么只需将 vector<bool> 的底层数据强制转换为 const uint8_t *(或采用类似的方式)即可。(需要注意:主流标准库实现中的 vector<bool> 通常按位压缩存储,因此这一假设往往并不成立。)

测试工具:

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <ctime>
#include <iostream>
#include <vector>

#include <emmintrin.h>    // SSE2

using std::vector;

// Scalar reference implementation: walks both vectors in lockstep and counts
// the positions whose elements compare equal. Both vectors are expected to
// have the same size (enforced by assert()).
static size_t count_equal_ref(const vector<uint8_t> &vec1, const vector<uint8_t> &vec2)
{
    assert(vec1.size() == vec2.size());

    size_t matches = 0;
    vector<uint8_t>::const_iterator rhs = vec2.begin();

    for (vector<uint8_t>::const_iterator lhs = vec1.begin(); lhs != vec1.end(); ++lhs, ++rhs)
    {
        if (*lhs == *rhs)
        {
            ++matches;
        }
    }
    return matches;
}

// Counts the positions at which two byte vectors hold equal values, using
// SSE2 to compare 16 elements per instruction.
//
// Parameters:
//   vec1, vec2 - byte vectors to compare. They must have the same size; this
//                is checked with assert(), i.e. only when NDEBUG is not defined.
// Returns:
//   The number of indices i for which vec1[i] == vec2[i].
static size_t count_equal(const std::vector<uint8_t> &vec1, const std::vector<uint8_t> &vec2)
{
    assert(vec1.size() == vec2.size());         // vectors must be same size

    const size_t n = vec1.size();
    // Every 8-bit lane grows by at most 1 per 16-byte vector, so a block of at
    // most 255 vectors can never overflow a lane.
    const size_t max_block_size = 255 * 16;

    size_t i = 0;
    size_t count = 0;

    while (i + 16 <= n)                         // for each block
    {
        // m >= i + 16 here, so the inner loop always makes progress.
        const size_t m = std::min(n, i + max_block_size);
        __m128i vcount = _mm_setzero_si128();   // per-lane match counters for this block

        for ( ; i + 16 <= m; i += 16)           // for each vector in block
        {
            const __m128i v1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&vec1[i]));
            const __m128i v2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&vec2[i]));
            const __m128i vcmp = _mm_cmpeq_epi8(v1, v2);
            // Equal lanes are 0xFF (-1), so subtracting adds 1 per match.
            vcount = _mm_sub_epi8(vcount, vcmp);
        }

        // Update count from current block: the sum of absolute differences
        // against zero adds up each group of 8 lanes into 16-bit words 0 and 4.
        const __m128i vsum = _mm_sad_epu8(vcount, _mm_setzero_si128());
        count += _mm_extract_epi16(vsum, 0) + _mm_extract_epi16(vsum, 4);
    }

    for ( ; i < n; ++i)                         // deal with any remaining partial vector
    {
        count += (vec1[i] == vec2[i]);
    }
    return count;
}

// Test harness: fills two byte vectors with random 0/1 values, counts the
// matching positions with both the scalar reference and the SSE version, and
// prints PASS or FAIL.
//
// Usage: a.out [n]   - n is the number of elements (default 100).
// Exit status: EXIT_SUCCESS when both counts agree; EXIT_FAILURE when they
// differ or when the argument is not a non-negative decimal integer.
int main(int argc, char * argv[])
{
    size_t n = 100;

    if (argc > 1)
    {
        // Parse strictly: a negative value would wrap to a huge size_t and
        // non-numeric text would silently become 0.
        char *end = NULL;
        const long requested = std::strtol(argv[1], &end, 10);

        if (end == argv[1] || *end != '\0' || requested < 0)
        {
            std::cerr << "usage: " << argv[0] << " [non-negative element count]" << std::endl;
            return EXIT_FAILURE;
        }
        n = static_cast<size_t>(requested);
    }

    vector<uint8_t> vec1(n);
    vector<uint8_t> vec2(n);

    std::srand(static_cast<unsigned int>(std::time(NULL)));

    for (size_t i = 0; i < n; ++i)
    {
        vec1[i] = std::rand() & 1;
        vec2[i] = std::rand() & 1;
    }

    const size_t n_ref = count_equal_ref(vec1, vec2);
    const size_t n_test = count_equal(vec1, vec2);
    const bool ok = (n_ref == n_test);

    if (ok)
    {
        std::cout << "PASS" << std::endl;
    }
    else
    {
        std::cout << "FAIL: n_ref = " << n_ref << ", n_test = " << n_test << std::endl;
    }

    // Report a mismatch through the exit status so scripts can detect it.
    return ok ? EXIT_SUCCESS : EXIT_FAILURE;
}

编译运行:

$ g++ -Wall -msse3 -O3 test.cpp && ./a.out
PASS

关于c++ - 比较两个 vector<bool> 与 SSE,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/34257708/

相关文章:

c++ - 如何在没有重复代码的情况下处理 getter 的 const/non const 组合?

java - 在Java中,构造函数中的final字段赋值涉及到哪些操作?

c++ - 将浮点 vector 转换为 16 位 int 而不饱和

simd - 如何使用avx(但没有avx-512)将int 64转换为int 32

c - SIMD以下代码

c++ - 尝试处理嵌套对象/结构和动态数组时发生内存泄漏或内存错误。可能的 Xcode/malloc 问题

c++ - CMake 无法使用 C++ 确定链接器语言

c++ - 切换到 clang 3.4 和 libc++ 时找不到标准 header

linux - 在没有 printf 的 NASM 中打印 ARGC

assembly - 为什么在执行 mov eax,0FFFFFFFFh 后,寄存器 eax 在调试器中显示为 0xccffffffh