From f823df364e3558ba31cc12c409f5306b0451e971 Mon Sep 17 00:00:00 2001
From: goldsteinn <35538541+goldsteinn@users.noreply.github.com>
Date: Fri, 22 Apr 2022 06:54:35 -0500
Subject: [PATCH] Refactor: use popcnt in bitfield and cleanup warnings (#2950)

Co-authored-by: Noah Goldstein
---
 libtransmission/bitfield.cc   |  59 ++++++----
 libtransmission/tr-popcount.h | 214 ++++++++++++++++++++++++++++++++++
 2 files changed, 250 insertions(+), 23 deletions(-)
 create mode 100644 libtransmission/tr-popcount.h

diff --git a/libtransmission/bitfield.cc b/libtransmission/bitfield.cc
index a57b4d9c6..0218215e8 100644
--- a/libtransmission/bitfield.cc
+++ b/libtransmission/bitfield.cc
@@ -7,6 +7,8 @@
 #include <array>
 #include <vector>
 
+#include "tr-popcount.h"
+
 #include "transmission.h"
 
 #include "bitfield.h"
@@ -27,35 +29,40 @@ namespace
 {
     return (bit_count >> 3) + ((bit_count & 7) != 0 ? 1 : 0);
 }
 
+/* Used only in cases where it can be guaranteed bit_count <= SIZE_MAX - 8 */
+[[nodiscard]] constexpr size_t getBytesNeededSafe(size_t bit_count) noexcept
+{
+    return ((bit_count + 7) >> 3);
+}
+
 void setAllTrue(uint8_t* array, size_t bit_count)
 {
     uint8_t constexpr Val = 0xFF;
-    size_t const n = getBytesNeeded(bit_count);
+    /* Only ever called internally with in-use bit counts. Impossible
+       for bit_count > SIZE_MAX - 8. */
+    size_t const n = getBytesNeededSafe(bit_count);
 
     if (n > 0)
     {
         std::fill_n(array, n, Val);
-        array[n - 1] = Val << ((-bit_count) & 7U);
+        /* -bit_count & 7U. Since bit_count is unsigned, use ~bit_count +
+           1 in place of -bit_count, as linters warn about negating
+           unsigned types. Any compiler will optimize ~x + 1 to -x in
+           the backend. */
+        uint32_t shift = ((~bit_count) + 1) & 7U;
+        array[n - 1] = Val << shift;
     }
 }
 
-auto constexpr TrueBitCount = std::array{
-    0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 1, 2, 2, 3, 2,
-    3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3,
-    3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5,
-    6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4,
-    3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4,
-    5, 5, 6, 5, 6, 6, 7, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6,
-    6, 7, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
-};
-
 /* Switch to std::popcount if project upgrades to c++20 or newer */
-[[nodiscard]] constexpr int doPopcount(uint8_t flags) noexcept
+[[nodiscard]] uint32_t doPopcount(uint8_t flags) noexcept
 {
-    return TrueBitCount[flags];
+    /* If flags are ever expanded to use machine words instead of
+       uint8_t, a 64-bit popcount is also available. */
+    return tr_popcnt<uint8_t>::count(flags);
 }
 
-[[nodiscard]] constexpr size_t rawCountFlags(uint8_t const* flags, size_t n) noexcept
+[[nodiscard]] size_t rawCountFlags(uint8_t const* flags, size_t n) noexcept
 {
     auto ret = size_t{};
 
@@ -120,8 +127,8 @@ size_t tr_bitfield::countFlags(size_t begin, size_t end) const noexcept
 
     /* middle bytes */
 
-    /* Use second accumulator because loads generally have high
-       latency but fast throughput. */
+    /* Use 2x accumulators to help alleviate the high latency of the
+       popcnt instruction on many architectures. */
     size_t tmp_accum = 0;
     for (size_t i = first_byte + 1; i < walk_end;)
     {
@@ -137,7 +144,11 @@ size_t tr_bitfield::countFlags(size_t begin, size_t end) const noexcept
     /* last byte */
     if (last_byte < std::size(flags_))
     {
-        size_t const last_shift = (-end) & 7U;
+        /* -end & 7U. Since end is unsigned, use ~end + 1 in place
+           of -end, as linters warn about negating unsigned
+           types. Any compiler will optimize ~x + 1 to -x in the
+           backend. */
+        uint32_t const last_shift = (~end + 1) & 7U;
         val = flags_[last_byte];
         val >>= last_shift;
         /* No need to shift back val for correct popcount. */
@@ -175,7 +186,8 @@ bool tr_bitfield::isValid() const
 
 std::vector<uint8_t> tr_bitfield::raw() const
 {
-    auto const n = getBytesNeeded(bit_count_);
+    /* Impossible for bit_count_ to exceed SIZE_MAX - 8 */
+    auto const n = getBytesNeededSafe(bit_count_);
 
     if (!std::empty(flags_))
     {
@@ -196,12 +208,12 @@ void tr_bitfield::ensureBitsAlloced(size_t n)
 {
     bool const has_all = hasAll();
 
+    /* Can't use getBytesNeededSafe as n can be > SIZE_MAX - 8. */
     size_t const bytes_needed = has_all ? getBytesNeeded(std::max(n, true_count_)) : getBytesNeeded(n);
 
     if (std::size(flags_) < bytes_needed)
     {
         flags_.resize(bytes_needed);
-
         if (has_all)
         {
             setAllTrue(std::data(flags_), true_count_);
@@ -298,7 +310,7 @@ void tr_bitfield::setRaw(uint8_t const* raw, size_t byte_count)
     flags_.assign(raw, raw + byte_count);
 
     // ensure any excess bits at the end of the array are set to '0'.
-    if (byte_count == getBytesNeeded(bit_count_))
+    if (byte_count == getBytesNeededSafe(bit_count_))
     {
         auto const excess_bit_count = byte_count * 8 - bit_count_;
 
@@ -389,9 +401,10 @@ void tr_bitfield::setSpan(size_t begin, size_t end, bool value)
     auto const last_byte = end >> 3;
     unsigned char first_mask = 0xff >> (begin & 7U);
+    unsigned char last_mask = 0xff << ((~end) & 7U);
 
     if (value)
     {
-        unsigned char last_mask = 0xff << ((~end) & 7U);
+
         if (walk == last_byte)
         {
             flags_[walk] |= first_mask & last_mask;
@@ -413,7 +426,7 @@ void tr_bitfield::setSpan(size_t begin, size_t end, bool value)
     else
     {
         first_mask = ~first_mask;
-        unsigned char last_mask = 0xff >> ((end & 7U) + 1);
+        last_mask = ~last_mask;
         if (walk == last_byte)
         {
             flags_[walk] &= first_mask | last_mask;
diff --git a/libtransmission/tr-popcount.h b/libtransmission/tr-popcount.h
new file mode 100644
index 000000000..9e6068ebb
--- /dev/null
+++ b/libtransmission/tr-popcount.h
@@ -0,0 +1,214 @@
+// This file Copyright © 2022 Mnemosyne LLC.
+// It may be used under GPLv2 (SPDX: GPL-2.0-only), GPLv3 (SPDX: GPL-3.0-only),
+// or any future license endorsed by Mnemosyne LLC.
+// License text can be found in the licenses/ folder.
+
+#ifndef TR_POPCNT_H
+#define TR_POPCNT_H
+
+#include <cstdint>
+
+/* Avoid defining irrelevant helpers that might interfere with other
+ * preprocessor logic. */
+#if (defined(_MSVC_LANG) && _MSVC_LANG >= 202002L) || __cplusplus >= 202002L
+#define TR_HAVE_STD_POPCOUNT
+#endif
+
+#if (defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 2))) || \
+    (defined(__clang__) && (__clang_major__ > 3 || (__clang_major__ == 3 && __clang_minor__ >= 0)))
+
+#ifdef __has_builtin
+#if __has_builtin(__builtin_popcount)
+#define TR_HAVE_BUILTIN_POPCOUNT
+#endif
+#endif
+#if defined(__x86_64__) && defined(__SSE__)
+#define TR_HAVE_ASM_POPCNT
+#endif
+#endif
+
+#if defined(_MSC_VER)
+#if defined(_M_X64) || defined(_M_IX86)
+#include <nmmintrin.h>
+#endif
+#endif
+
+#if defined(TR_HAVE_STD_POPCOUNT)
+#include <bit>
+#endif
+
+template<typename T>
+struct tr_popcnt
+{
+    static_assert(std::is_integral<T>::value != 0, "Can only popcnt integral types");
+
+    static_assert(sizeof(T) <= sizeof(uint64_t), "Unsupported size");
+
+    /* Needed regularly to avoid sign extension / to get unsigned shift
+       behavior. */
+    using unsigned_T = typename std::make_unsigned<T>::type;
+
+    /* Sanity tests. */
+    static_assert(sizeof(unsigned_T) == sizeof(T), "Unsigned type somehow smaller than signed type");
+    static_assert(std::is_integral<unsigned_T>::value != 0, "Unsigned type somehow non-integral");
+
+#if defined(TR_HAVE_STD_POPCOUNT)
+    /* If we have std::popcount just use that. */
+    static inline unsigned count(T v)
+    {
+        unsigned_T unsigned_v = static_cast<unsigned_T>(v);
+        return static_cast<unsigned>(std::popcount(unsigned_v));
+    }
+
+#elif defined(TR_HAVE_BUILTIN_POPCOUNT) && defined(TR_HAVE_ASM_POPCNT)
+    /* Use __builtin as opposed to straight assembly to avoid any
+       strain the latter puts on the optimizer. */
+    static inline unsigned count(T v)
+    {
+        /* Make sure we know how to find the right __builtin_popcount. */
+        static_assert(
+            sizeof(T) == sizeof(long long) || sizeof(T) == sizeof(long) || sizeof(T) <= sizeof(int),
+            "Unknown type size!");
+
+        if constexpr (sizeof(T) == sizeof(long long))
+        {
+            return __builtin_popcountll(v);
+        }
+        else if constexpr (sizeof(T) == sizeof(long))
+        {
+            return __builtin_popcountl(v);
+        }
+        else if constexpr (sizeof(T) == sizeof(int))
+        {
+            return __builtin_popcount(v);
+        }
+        else
+        {
+            static_assert(sizeof(T) < sizeof(int));
+            /* Need to avoid sign extension. */
+            unsigned_T unsigned_v = static_cast<unsigned_T>(v);
+            return __builtin_popcount(unsigned_v);
+        }
+    }
+
+#elif defined(_MSC_VER) && defined(_M_X64)
+    /* 64-bit x86 intrinsics. */
+    static inline unsigned count(T v)
+    {
+        if constexpr (sizeof(T) == sizeof(uint64_t))
+        {
+            return _mm_popcnt_u64(v);
+        }
+        else if constexpr (sizeof(T) == sizeof(uint32_t))
+        {
+            return _mm_popcnt_u32(v);
+        }
+        else
+        {
+            static_assert(sizeof(T) < sizeof(uint32_t));
+            /* Need to avoid sign extension. */
+            unsigned_T unsigned_v = static_cast<unsigned_T>(v);
+            return _mm_popcnt_u32(unsigned_v);
+        }
+    }
+
+#elif defined(_MSC_VER) && defined(_M_IX86)
+    /* 32-bit x86 intrinsics. */
+    static inline unsigned count(T v)
+    {
+        if constexpr (sizeof(T) == sizeof(uint64_t))
+        {
+            /* Avoid signed shift. */
+            unsigned_T unsigned_v = static_cast<unsigned_T>(v);
+
+            return _mm_popcnt_u32(static_cast<uint32_t>(unsigned_v)) + _mm_popcnt_u32(static_cast<uint32_t>(unsigned_v >> 32));
+        }
+        else if constexpr (sizeof(T) == sizeof(uint32_t))
+        {
+            return _mm_popcnt_u32(v);
+        }
+        else
+        {
+            static_assert(sizeof(T) < sizeof(uint32_t));
+            /* Need to avoid sign extension. */
+            unsigned_T unsigned_v = static_cast<unsigned_T>(v);
+            return _mm_popcnt_u32(unsigned_v);
+        }
+    }
+
+#elif defined(TR_HAVE_ASM_POPCNT)
+    /* Raw assembly. This prohibits constexpr so may be worth dropping if there
+     * is a need for count() to be constexpr. */
+    static inline unsigned count(T v)
+    {
+        unsigned_T unsigned_v = static_cast<unsigned_T>(v);
+        if constexpr (sizeof(T) >= sizeof(uint32_t))
+        {
+            /* Make sure dst == src to avoid false dependency on many x86
+             * implementations. */
+            __asm__ __volatile__("popcnt %1, %0" : "=r"(unsigned_v) : "0"(unsigned_v) : "cc");
+
+            return unsigned_v;
+        }
+        else
+        {
+            /* No popcnt instruction for register sizes < 32. */
+            uint32_t unsigned_v32 = static_cast<uint32_t>(unsigned_v);
+
+            /* Make sure dst == src to avoid false dependency on many x86
+             * implementations. */
+            __asm__ __volatile__("popcnt %1, %0" : "=r"(unsigned_v32) : "0"(unsigned_v32) : "cc");
+            return unsigned_v32;
+        }
+    }
+#else
+    /* Generic implementation. */
+    static inline unsigned count(T v)
+    {
+        /* To avoid signed shifts. */
+        unsigned_T unsigned_v = static_cast<unsigned_T>(v);
+
+        if constexpr (sizeof(T) <= sizeof(uint16_t))
+        {
+            /* Use a LUT for small sizes. In reality we only need half a
+             * byte for each value; if we ever hit a case where perf is
+             * limited by a severe bottleneck on L1D this can be
+             * optimized. */
+            static constexpr uint8_t popcnt_lut[256] = {
+                0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 1, 2, 2, 3, 2,
+                3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3,
+                3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5,
+                6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4,
+                3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4,
+                5, 5, 6, 5, 6, 6, 7, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6,
+                6, 7, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
+            };
+            if constexpr (sizeof(T) == sizeof(uint8_t))
+            {
+                return popcnt_lut[unsigned_v];
+            }
+            else
+            {
+                return popcnt_lut[unsigned_v & 0xFF] + popcnt_lut[unsigned_v >> 8];
+            }
+        }
+        else
+        {
+            /* For larger sizes use the implementation described here:
+             * http://en.wikipedia.org/wiki/Hamming_weight#Efficient_implementation
+             */
+            static constexpr unsigned_T m1 = static_cast<unsigned_T>(0x5555555555555555ll);
+            static constexpr unsigned_T m2 = static_cast<unsigned_T>(0x3333333333333333ll);
+            static constexpr unsigned_T m4 = static_cast<unsigned_T>(0x0F0F0F0F0F0F0F0Fll);
+            static constexpr unsigned_T h01 = static_cast<unsigned_T>(0x0101010101010101ll);
+
+            unsigned_v = unsigned_v - ((unsigned_v >> 1) & m1);
+            unsigned_v = (unsigned_v & m2) + ((unsigned_v >> 2) & m2);
+            unsigned_v = (unsigned_v + (unsigned_v >> 4)) & m4;
+
+            unsigned_v = unsigned_v * h01;
+            return unsigned_v >> (8 * sizeof(T) - 8);
+        }
+    }
+#endif
+};
+
+#endif
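
Reviewer note, not part of the patch: below is a minimal, self-contained C++17 sketch of the two tricks the diff relies on -- the Hamming-weight reduction used by tr_popcnt's generic fallback, and the "~x + 1" idiom that stands in for unary minus on unsigned values in setAllTrue() and countFlags(). The names fallback_popcount() and excess_bit_shift() are hypothetical helpers for illustration only; they do not appear in the patch.

// Illustrative sketch only (assumes C++17). fallback_popcount() and
// excess_bit_shift() are hypothetical names mirroring logic from the patch.
#include <cassert>
#include <cstddef>
#include <cstdint>

// Same Hamming-weight reduction as tr_popcnt's generic fallback path.
constexpr unsigned fallback_popcount(uint64_t v)
{
    constexpr uint64_t m1 = 0x5555555555555555ULL;
    constexpr uint64_t m2 = 0x3333333333333333ULL;
    constexpr uint64_t m4 = 0x0F0F0F0F0F0F0F0FULL;
    constexpr uint64_t h01 = 0x0101010101010101ULL;

    v = v - ((v >> 1) & m1);        // fold bit pairs into 2-bit counts
    v = (v & m2) + ((v >> 2) & m2); // 2-bit counts into 4-bit counts
    v = (v + (v >> 4)) & m4;        // 4-bit counts into per-byte counts
    return static_cast<unsigned>((v * h01) >> 56); // sum of all byte counts
}

// The "~x + 1" idiom from setAllTrue()/countFlags(): identical to -x for
// unsigned x, but avoids linter warnings about negating an unsigned type.
constexpr size_t excess_bit_shift(size_t bit_count)
{
    return (~bit_count + 1) & 7U;
}

int main()
{
    // Agrees with the removed TrueBitCount table on a few spot checks.
    static_assert(fallback_popcount(0x00) == 0);
    static_assert(fallback_popcount(0xFF) == 8);
    static_assert(fallback_popcount(0x8000000000000001ULL) == 2);

    // 13 bits occupy two bytes; the last byte keeps its top 5 bits,
    // so setAllTrue() shifts the 0xFF fill value left by 3.
    static_assert(excess_bit_shift(13) == 3);
    static_assert(excess_bit_shift(16) == 0); // exact byte multiple: no shift

    assert(fallback_popcount(0x1234) == 5);
    return 0;
}

Because everything above is constexpr, simply compiling the sketch checks that the generic fallback matches the old lookup table on these inputs.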