Refactor: use popcnt in bitfield and cleanup warnings (#2950)

Co-authored-by: Noah Goldstein <goldstein.w.n@gmail.com>
2024-12-25 01:03:01 +00:00 · 2022-04-22 06:54:35 -05:00 · 2022-04-22 06:54:35 -05:00 · f823df364e
commit f823df364e
parent 4d7d35ad57
2 changed files with 250 additions and 23 deletions
--- a/libtransmission/bitfield.cc
+++ b/libtransmission/bitfield.cc
@ -7,6 +7,8 @@
 #include <array>
 #include <vector>

+#include "tr-popcount.h"
+
 #include "transmission.h"

 #include "bitfield.h"
@ -27,35 +29,40 @@ namespace
    return (bit_count >> 3) + ((bit_count & 7) != 0 ? 1 : 0);
 }

+/* Used only in cases where it can be guranteed bit_count <= SIZE_MAX - 8 */
+[[nodiscard]] constexpr size_t getBytesNeededSafe(size_t bit_count) noexcept
+{
+    return ((bit_count + 7) >> 3);
+}
+
 void setAllTrue(uint8_t* array, size_t bit_count)
 {
    uint8_t constexpr Val = 0xFF;
-    size_t const n = getBytesNeeded(bit_count);
+    /* Only ever called internally with in-use bit counts. Impossible
+       for bitcount > SIZE_MAX - 8. */
+    size_t const n = getBytesNeededSafe(bit_count);

    if (n > 0)
    {
        std::fill_n(array, n, Val);
-        array[n - 1] = Val << ((-bit_count) & 7U);
+        /* -bit_count & 7U. Since bitcount is unsigned do ~bitcount +
+           1 to replace -bitcount as linters warn about negating
+           unsigned types. Any compiler will optimize ~x + 1 to -x in
+           the backend. */
+        uint32_t shift = ((~bit_count) + 1) & 7U;
+        array[n - 1] = Val << shift;
    }
 }

-auto constexpr TrueBitCount = std::array<size_t, 256>{
-    0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 1, 2, 2, 3, 2,
-    3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3,
-    3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5,
-    6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4,
-    3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4,
-    5, 5, 6, 5, 6, 6, 7, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6,
-    6, 7, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
-};
-
 /* Switch to std::popcount if project upgrades to c++20 or newer */
-[[nodiscard]] constexpr int doPopcount(uint8_t flags) noexcept
+[[nodiscard]] uint32_t doPopcount(uint8_t flags) noexcept
 {
-    return TrueBitCount[flags];
+    /* If flags are ever expanded to use machine words instead of
+       uint8_t popcnt64 is also available */
+    return tr_popcnt<uint8_t>::count(flags);
 }

-[[nodiscard]] constexpr size_t rawCountFlags(uint8_t const* flags, size_t n) noexcept
+[[nodiscard]] size_t rawCountFlags(uint8_t const* flags, size_t n) noexcept
 {
    auto ret = size_t{};

@ -120,8 +127,8 @@ size_t tr_bitfield::countFlags(size_t begin, size_t end) const noexcept

        /* middle bytes */

-        /* Use second accumulator because loads generally have high
-           latency but fast throughput. */
+        /* Use 2x accumulators to help alleviate high latency of
+           popcnt instruction on many architectures. */
        size_t tmp_accum = 0;
        for (size_t i = first_byte + 1; i < walk_end;)
        {
@ -137,7 +144,11 @@ size_t tr_bitfield::countFlags(size_t begin, size_t end) const noexcept
        /* last byte */
        if (last_byte < std::size(flags_))
        {
-            size_t const last_shift = (-end) & 7U;
+            /* -end & 7U. Since bitcount is unsigned do ~end + 1 to
+               replace -end as linters warn about negating unsigned
+               types. Any compiler will optimize ~x + 1 to -x in the
+               backend. */
+            uint32_t const last_shift = (~end + 1) & 7U;
            val = flags_[last_byte];
            val >>= last_shift;
            /* No need to shift back val for correct popcount. */
@ -175,7 +186,8 @@ bool tr_bitfield::isValid() const

 std::vector<uint8_t> tr_bitfield::raw() const
 {
-    auto const n = getBytesNeeded(bit_count_);
+    /* Impossible for bit_count_ to exceed SIZE_MAX - 8 */
+    auto const n = getBytesNeededSafe(bit_count_);

    if (!std::empty(flags_))
    {
@ -196,12 +208,12 @@ void tr_bitfield::ensureBitsAlloced(size_t n)
 {
    bool const has_all = hasAll();

+    /* Cant use getBytesNeededSafe as n can be > SIZE_MAX - 8. */
    size_t const bytes_needed = has_all ? getBytesNeeded(std::max(n, true_count_)) : getBytesNeeded(n);

    if (std::size(flags_) < bytes_needed)
    {
        flags_.resize(bytes_needed);
-
        if (has_all)
        {
            setAllTrue(std::data(flags_), true_count_);
@ -298,7 +310,7 @@ void tr_bitfield::setRaw(uint8_t const* raw, size_t byte_count)
    flags_.assign(raw, raw + byte_count);

    // ensure any excess bits at the end of the array are set to '0'.
-    if (byte_count == getBytesNeeded(bit_count_))
+    if (byte_count == getBytesNeededSafe(bit_count_))
    {
        auto const excess_bit_count = byte_count * 8 - bit_count_;

@ -389,9 +401,10 @@ void tr_bitfield::setSpan(size_t begin, size_t end, bool value)
    auto const last_byte = end >> 3;

    unsigned char first_mask = 0xff >> (begin & 7U);
+    unsigned char last_mask = 0xff << ((~end) & 7U);
    if (value)
    {
-        unsigned char last_mask = 0xff << ((~end) & 7U);
+
        if (walk == last_byte)
        {
            flags_[walk] |= first_mask & last_mask;
@ -413,7 +426,7 @@ void tr_bitfield::setSpan(size_t begin, size_t end, bool value)
    else
    {
        first_mask = ~first_mask;
-        unsigned char last_mask = 0xff >> ((end & 7U) + 1);
+        last_mask = ~last_mask;
        if (walk == last_byte)
        {
            flags_[walk] &= first_mask | last_mask;
--- a/libtransmission/tr-popcount.h
+++ b/libtransmission/tr-popcount.h
@ -0,0 +1,214 @@
+// This file Copyright © 2022 Mnemosyne LLC.
+// It may be used under GPLv2 (SPDX: GPL-2.0-only), GPLv3 (SPDX: GPL-3.0-only),
+// or any future license endorsed by Mnemosyne LLC.
+// License text can be found in the licenses/ folder.
+
+#ifndef TR_POPCNT_H
+#define TR_POPCNT_H
+
+#include <stdint.h>
+
+/* Avoid defining irrelevant helpers that might interfere with other
+ * preprocessor logic. */
+#if (defined(_MSVC_LANG) && _MSVC_LANG >= 202002L) || __cplusplus >= 202002L
+#define TR_HAVE_STD_POPCOUNT
+#endif
+
+#if (defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 2))) || \
+    (defined(__clang__) && (__clang_major__ > 3 || (__clang_major__ == 3 && __clang_minor__ >= 0)))
+
+#ifdef __has_builtin
+#if __has_builtin(__builtin_popcount)
+#define TR_HAVE_BUILTIN_POPCOUNT
+#endif
+#endif
+#if defined(__x86_64__) && defined(__SSE__)
+#define TR_HAVE_ASM_POPCNT
+#endif
+#endif
+
+#if defined(_MSC_VER)
+#if defined(_M_X64) || defined(_M_IX86)
+#include <nmmintrin.h>
+#endif
+#endif
+
+#if defined(TR_HAVE_STD_POPCOUNT)
+#include <bit>
+#endif
+
+template<typename T>
+struct tr_popcnt
+{
+    static_assert(std::is_integral<T>::value != 0, "Can only popcnt integral types");
+
+    static_assert(sizeof(T) <= sizeof(uint64_t), "Unsupported size");
+
+    /* Needed regularly to avoid sign extension / get unsigned shift behavior.
+     */
+    using unsigned_T = typename std::make_unsigned<T>::type;
+
+    /* Sanity tests. */
+    static_assert(sizeof(unsigned_T) == sizeof(T), "Unsigned type somehow smaller than signed type");
+    static_assert(std::is_integral<unsigned_T>::value != 0, "Unsigned type somehow non integral");
+
+#if defined(TR_HAVE_STD_POPCOUNT)
+    /* If we have std::popcount just use that. */
+    static inline unsigned count(T v)
+    {
+        unsigned_T unsigned_v = static_cast<unsigned_T>(v);
+        return static_cast<unsigned>(std::popcount(unsigned_v));
+    }
+
+#elif defined(TR_HAVE_BUILTIN_POPCOUNT) && defined(TR_HAVE_ASM_POPCNT)
+    /* Use __builtin as opposed to straight assembly to avoid any
+       strain the latter puts on the optimized. */
+    static inline unsigned count(T v)
+    {
+        /* Make sure we know how to find that right __builtin_popcount. */
+        static_assert(
+            sizeof(T) == sizeof(long long) || sizeof(T) == sizeof(long) || sizeof(T) <= sizeof(int),
+            "Unkown type size!");
+
+        if constexpr (sizeof(T) == sizeof(long long))
+        {
+            return __builtin_popcountll(v);
+        }
+        else if constexpr (sizeof(T) == sizeof(long))
+        {
+            return __builtin_popcountl(v);
+        }
+        else if constexpr (sizeof(T) == sizeof(int))
+        {
+            return __builtin_popcount(v);
+        }
+        else
+        {
+            static_assert(sizeof(T) < sizeof(int));
+            /* Need to avoid sign extension. */
+            unsigned_T unsigned_v = static_cast<unsigned_T>(v);
+            return __builtin_popcount(unsigned_v);
+        }
+    }
+
+#elif defined(_MSC_VER) && defined(_M_X64)
+    /* 64-bit x86 intrinsics. */
+    static inline unsigned count(T v)
+    {
+        if constexpr (sizeof(T) == sizeof(uint64_t))
+        {
+            return _mm_popcnt_u64(v);
+        }
+        else if constexpr (sizeof(T) == sizeof(uint32_t))
+        {
+            return _mm_popcnt_u32(v);
+        }
+        else
+        {
+            static_assert(sizeof(T) < sizeof(uint32_t));
+            /* Need to avoid sign extension. */
+            unsigned_T unsigned_v = static_cast<unsigned_T>(v);
+            return _mm_popcnt_u32(unsigned_v);
+        }
+    }
+#elif defined(_MSC_VER) && defined(_M_IX86)
+    /* 32-bit x86 intrinsics. */
+    static inline unsigned count(T v)
+    {
+        if constexpr (sizeof(T) == sizeof(uint64_t))
+        {
+            /* Avoid signed shift. */
+            unsigned_T unsigned_v = static_cast<unsigned_T>(v);
+
+            return _mm_popcnt_u32(static_cast<uint32_t>(unsigned_v)) + _mm_popcnt_u32(static_cast<uint32_t>(unsigned_v >> 32));
+        }
+        else if constexpr (sizeof(T) == sizeof(uint32_t))
+        {
+            return _mm_popcnt_u32(v);
+        }
+        else
+        {
+            static_assert(sizeof(T) < sizeof(uint32_t));
+            /* Need to avoid sign extension. */
+            unsigned_T unsigned_v = static_cast<unsigned_T>(v);
+            return _mm_popcnt_u32(unsigned_v);
+        }
+    }
+#elif defined(TR_HAVE_ASM_POPCNT)
+    /* raw assembly. This prohibits constexpr so may be worth dropping if there
+     * is need for count() to be constexpr. */
+    static inline unsigned count(T v)
+    {
+        unsigned_T unsigned_v = static_cast<unsigned_T>(v);
+        if constexpr (sizeof(T) >= sizeof(uint32_t))
+        {
+            /* Make sure dst == src to avoid false dependency on many x86
+             * implementations. */
+            __asm__ __volatile__("popcnt %1, %0" : "=r"(unsigned_v) : "0"(unsigned_v) : "cc");
+
+            return unsigned_v;
+        }
+        else
+        {
+            /* No popcnt instruct for register size < 32. */
+            uint32_t unsigned_v32 = static_cast<uint32_t>(unsigned_v);
+
+            /* Make sure dst == src to avoid false dependency on many x86
+             * implementations. */
+            __asm__ __volatile__("popcnt %1, %0" : "=r"(unsigned_v32) : "0"(unsigned_v32) : "cc");
+            return unsigned_v32;
+        }
+    }
+#else
+    /* Generic implementation. */
+    static inline unsigned count(T v)
+    {
+        /* To avoid signed shifts. */
+        unsigned_T unsigned_v = static_cast<unsigned_T>(v);
+
+        if constexpr (sizeof(T) <= sizeof(uint16_t))
+        {
+            /* Use LUT for small sizes. In reality we only need half a
+             * byte for each value if ever hit a case where perf is
+             * limitted by severe bottleneck on L1D this can be
+             * optimized. */
+            static constexpr uint8_t popcnt_lut[256] = {
+                0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 1, 2, 2, 3, 2,
+                3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3,
+                3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5,
+                6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4,
+                3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4,
+                5, 5, 6, 5, 6, 6, 7, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6,
+                6, 7, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
+            };
+            if constexpr (sizeof(T) == sizeof(uint8_t))
+            {
+                return popcnt_lut[unsigned_v];
+            }
+            else
+            {
+                return popcnt_lut[unsigned_v & 0xFF] + popcnt_lut[unsigned_v >> 8];
+            }
+        }
+        else
+        {
+            /* for larger sizes use implementation described here:
+             * http://en.wikipedia.org/wiki/Hamming_weight#Efficient_implementation
+             */
+            static constexpr unsigned_T m1 = static_cast<unsigned_T>(0x5555555555555555ll);
+            static constexpr unsigned_T m2 = static_cast<unsigned_T>(0x3333333333333333ll);
+            static constexpr unsigned_T m4 = static_cast<unsigned_T>(0x0F0F0F0F0F0F0F0Fll);
+            static constexpr unsigned_T h01 = static_cast<unsigned_T>(0x0101010101010101ll);
+
+            unsigned_v = unsigned_v - ((unsigned_v >> 1) & m1);
+            unsigned_v = (unsigned_v & m2) + ((unsigned_v >> 2) & m2);
+            unsigned_v = (unsigned_v + (unsigned_v >> 4)) & m4;
+
+            unsigned_v = unsigned_v * h01;
+            return unsigned_v >> (8 * sizeof(T) - 8);
+        }
+    }
+#endif
+};
+
+#endif