假设我想为 16 位 block 中的 64 位整数创建一个编译时构造的位计数查找表。我知道这样做的唯一方法是以下代码:
#define B4(n) n, n + 1, n + 1, n + 2
#define B6(n) B4(n), B4(n + 1), B4(n + 1), B4(n + 2)
#define B8(n) B6(n), B6(n + 1), B6(n + 1), B6(n + 2)
#define B10(n) B8(n), B8(n + 1), B8(n + 1), B8(n + 2)
#define B12(n) B10(n), B10(n + 1), B10(n + 1), B10(n + 2)
#define B14(n) B12(n), B12(n + 1), B12(n + 1), B12(n + 2)
#define B16(n) B14(n), B14(n + 1), B14(n + 1), B14(n + 2)
#define COUNT_BITS B16(0), B16(1), B16(1), B16(2)
unsigned int lookup[65536] = { COUNT_BITS };
是否有现代 (C++11/14) 方法可以获得相同的结果?
最佳答案
为什么不使用标准库?
#include <bitset>
int bits_in(std::uint64_t u)
{
auto bs = std::bitset<64>(u);
return bs.count();
}
生成的汇编器(使用 -O2 -march=native
编译):
bits_in(unsigned long):
xor eax, eax
popcnt rax, rdi
ret
此时值得一提的是,并非所有 x86 处理器都有此指令,因此(至少使用 gcc)您需要让它知道要编译的架构。
@tambre 提到,实际上,如果可以,优化器会走得更远:
volatile int results[3];
int main()
{
results[0] = bits_in(255);
results[1] = bits_in(1023);
results[2] = bits_in(0x8000800080008000);
}
生成的汇编器:
main:
mov DWORD PTR results[rip], 8
xor eax, eax
mov DWORD PTR results[rip+4], 10
mov DWORD PTR results[rip+8], 4
ret
像我这样的老派小玩意儿需要找到新的问题来解决:)
更新
并不是每个人都对解决方案依赖 cpu 帮助来计算位数感到高兴。那么如果我们使用自动生成的表格但允许开发人员配置它的大小呢? (警告 - 16 位表版本的编译时间长)
#include <utility>
#include <cstdint>
#include <array>
#include <numeric>
#include <bitset>
template<std::size_t word_size, std::size_t...Is>
constexpr auto generate(std::integral_constant<std::size_t, word_size>, std::index_sequence<Is...>) {
struct popcount_type {
constexpr auto operator()(int i) const {
int bits = 0;
while (i) {
i &= i - 1;
++bits;
}
return bits;
}
};
constexpr auto popcnt = popcount_type();
return std::array<int, sizeof...(Is)>
{
{popcnt(Is)...}
};
}
template<class T>
constexpr auto power2(T x) {
T result = 1;
for (T i = 0; i < x; ++i)
result *= 2;
return result;
}
template<class TableWord>
struct table {
static constexpr auto word_size = std::numeric_limits<TableWord>::digits;
static constexpr auto table_length = power2(word_size);
using array_type = std::array<int, table_length>;
static const array_type& get_data() {
static const array_type data = generate(std::integral_constant<std::size_t, word_size>(),
std::make_index_sequence<table_length>());
return data;
};
};
template<class Word>
struct use_table_word {
};
template<class Word, class TableWord = std::uint8_t>
int bits_in(Word val, use_table_word<TableWord> = use_table_word<TableWord>()) {
constexpr auto table_word_size = std::numeric_limits<TableWord>::digits;
constexpr auto word_size = std::numeric_limits<Word>::digits;
constexpr auto times = word_size / table_word_size;
static_assert(times > 0, "incompatible");
auto reduce = [val](auto times) {
return (val >> (table_word_size * times)) & (power2(table_word_size) - 1);
};
auto const& data = table<TableWord>::get_data();
auto result = 0;
for (int i = 0; i < times; ++i) {
result += data[reduce(i)];
}
return result;
}
volatile int results[3];
#include <iostream>
int main() {
auto input = std::uint64_t(1023);
results[0] = bits_in(input);
results[0] = bits_in(input, use_table_word<std::uint16_t>());
results[1] = bits_in(0x8000800080008000);
results[2] = bits_in(34567890);
for (int i = 0; i < 3; ++i) {
std::cout << results[i] << std::endl;
}
return 0;
}
最终更新
此版本允许使用查找表中的任意位数并支持任何输入类型,即使它小于查找表中的位数。
如果高位为零,它也会短路。
#include <utility>
#include <cstdint>
#include <array>
#include <numeric>
#include <algorithm>
namespace detail {
template<std::size_t bits, typename = void>
struct smallest_word;
template<std::size_t bits>
struct smallest_word<bits, std::enable_if_t<(bits <= 8)>>
{
using type = std::uint8_t;
};
template<std::size_t bits>
struct smallest_word<bits, std::enable_if_t<(bits > 8 and bits <= 16)>>
{
using type = std::uint16_t;
};
template<std::size_t bits>
struct smallest_word<bits, std::enable_if_t<(bits > 16 and bits <= 32)>>
{
using type = std::uint32_t;
};
template<std::size_t bits>
struct smallest_word<bits, std::enable_if_t<(bits > 32 and bits <= 64)>>
{
using type = std::uint64_t;
};
}
template<std::size_t bits> using smallest_word = typename detail::smallest_word<bits>::type;
template<class WordType, std::size_t bits, std::size_t...Is>
constexpr auto generate(std::index_sequence<Is...>) {
using word_type = WordType;
struct popcount_type {
constexpr auto operator()(word_type i) const {
int result = 0;
while (i) {
i &= i - 1;
++result;
}
return result;
}
};
constexpr auto popcnt = popcount_type();
return std::array<word_type, sizeof...(Is)>
{
{popcnt(Is)...}
};
}
template<class T>
constexpr auto power2(T x) {
return T(1) << x;
}
template<std::size_t word_size>
struct table {
static constexpr auto table_length = power2(word_size);
using word_type = smallest_word<word_size>;
using array_type = std::array<word_type, table_length>;
static const array_type& get_data() {
static const array_type data = generate<word_type, word_size>(std::make_index_sequence<table_length>());
return data;
};
template<class Type, std::size_t bits>
static constexpr auto n_bits() {
auto result = Type();
auto b = bits;
while(b--) {
result = (result << 1) | Type(1);
}
return result;
};
template<class Uint>
int operator()(Uint i) const {
constexpr auto mask = n_bits<Uint, word_size>();
return get_data()[i & mask];
}
};
template<int bits>
struct use_bits {
static constexpr auto digits = bits;
};
template<class T>
constexpr auto minimum(T x, T y) { return x < y ? x : y; }
template<class Word, class UseBits = use_bits<8>>
int bits_in(Word val, UseBits = UseBits()) {
using word_type = std::make_unsigned_t<Word>;
auto uval = static_cast<word_type>(val);
constexpr auto table_word_size = UseBits::digits;
constexpr auto word_size = std::numeric_limits<word_type>::digits;
auto const& mytable = table<table_word_size>();
int result = 0;
while (uval)
{
result += mytable(uval);
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wshift-count-overflow"
uval >>= minimum(table_word_size, word_size);
#pragma clang diagnostic pop
}
return result;
}
volatile int results[4];
#include <iostream>
int main() {
auto input = std::uint8_t(127);
results[0] = bits_in(input);
results[1] = bits_in(input, use_bits<4>());
results[2] = bits_in(input, use_bits<11>());
results[3] = bits_in(input, use_bits<15>());
for (auto&& i : results) {
std::cout << i << std::endl;
}
auto input2 = 0xabcdef;
results[0] = bits_in(input2);
results[1] = bits_in(input2, use_bits<4>());
results[2] = bits_in(input2, use_bits<11>());
results[3] = bits_in(input2, use_bits<15>());
for (auto&& i : results) {
std::cout << i << std::endl;
}
auto input3 = -1;
results[0] = bits_in(input3);
results[1] = bits_in(input3, use_bits<4>());
results[2] = bits_in(input3, use_bits<11>());
results[3] = bits_in(input3, use_bits<15>());
for (auto&& i : results) {
std::cout << i << std::endl;
}
return 0;
}
示例输出:
7
7
7
7
17
17
17
17
32
32
32
32
例如,调用 bits_in(int, use_bits<11>())
得到的汇编输出变为:
.L16:
mov edx, edi
and edx, 2047
movzx edx, WORD PTR table<11ul>::get_data()::data[rdx+rdx]
add eax, edx
shr edi, 11
jne .L16
这在我看来是合理的。
关于c++ - 位数 : preprocessor magic vs modern C++,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/45188786/