diff --git a/dwio/nimble/common/Varint.cpp b/dwio/nimble/common/Varint.cpp index 6c374e02..be9e2bd6 100644 --- a/dwio/nimble/common/Varint.cpp +++ b/dwio/nimble/common/Varint.cpp @@ -21,9 +21,14 @@ #include "common/aarch64/compat.h" #endif //__aarch64__ +#include +#include + +#include + #include "dwio/nimble/common/Exceptions.h" #include "dwio/nimble/common/Varint.h" -#include "folly/CpuId.h" +#include "folly/Likely.h" namespace facebook::nimble::varint { @@ -69,760 +74,215 @@ __attribute__((__target__("bmi2"))) // __attribute__ ((optimize("Os"))) const char* bulkVarintDecodeBmi2(uint64_t n, const char* pos, T* output); -const char* bulkVarintDecode32(uint64_t n, const char* pos, uint32_t* output) { - static bool hasBmi2 = folly::CpuId().bmi2(); - if (hasBmi2) { - return bulkVarintDecodeBmi2(n, pos, output); - } - for (uint64_t i = 0; i < n; ++i) { - *output++ = readVarint32(&pos); +// Zero-extend 8 consecutive bytes into T-sized output elements using xsimd +// batch construction and store. +template +inline void expandByteWord(const uint8_t* bytes, T* output) { + using batch_type = xsimd::batch; + constexpr auto kBatchSize = batch_type::size; + + if constexpr (kBatchSize >= 8) { + batch_type( + static_cast(bytes[0]), + static_cast(bytes[1]), + static_cast(bytes[2]), + static_cast(bytes[3]), + static_cast(bytes[4]), + static_cast(bytes[5]), + static_cast(bytes[6]), + static_cast(bytes[7])) + .store_unaligned(output); + } else if constexpr (kBatchSize == 4) { + batch_type( + static_cast(bytes[0]), + static_cast(bytes[1]), + static_cast(bytes[2]), + static_cast(bytes[3])) + .store_unaligned(output); + batch_type( + static_cast(bytes[4]), + static_cast(bytes[5]), + static_cast(bytes[6]), + static_cast(bytes[7])) + .store_unaligned(output + 4); + } else if constexpr (kBatchSize == 2) { + batch_type(static_cast(bytes[0]), static_cast(bytes[1])) + .store_unaligned(output); + batch_type(static_cast(bytes[2]), static_cast(bytes[3])) + .store_unaligned(output + 2); + batch_type(static_cast(bytes[4]), static_cast(bytes[5])) + .store_unaligned(output + 4); + batch_type(static_cast(bytes[6]), static_cast(bytes[7])) + .store_unaligned(output + 6); } - return pos; } -const char* bulkVarintDecode64(uint64_t n, const char* pos, uint64_t* output) { - static bool hasBmi2 = folly::CpuId().bmi2(); - if (hasBmi2) { - return bulkVarintDecodeBmi2(n, pos, output); +// Process runs of single-byte varints using xsimd for both the high-bit +// check and byte-to-element widening. Works with uint8_t* throughout, +// avoiding reinterpret_cast to uint64_t* (alignment/strict-aliasing issues). +// Returns the number of elements remaining after processing. +template +inline uint64_t +bulkDecodeSingleByteRun(uint64_t n, const char*& pos, T*& output) { + using u8_batch = xsimd::batch; + constexpr auto kU8Size = u8_batch::size; + constexpr uint64_t wordSize = 8; + constexpr uint64_t kHighBits = 0x8080808080808080ULL; + + const auto* src = reinterpret_cast(pos); + + // Process kU8BatchSize bytes at a time. + // Single wide load + vptest + while (n >= kU8Size) { + auto bytes = u8_batch::load_unaligned(src); + if (xsimd::any((bytes & u8_batch(0x80)) != u8_batch(0))) { + break; + } + for (size_t i = 0; i < kU8Size; i += wordSize) { + expandByteWord(src + i, output + i); + } + src += kU8Size; + output += kU8Size; + n -= kU8Size; } - for (uint64_t i = 0; i < n; ++i) { - *output++ = readVarint64(&pos); + + // Process 8 bytes at a time. Use memcpy for the high-bit check to avoid + // reinterpret_cast strict-aliasing/alignment issues. + while (n >= wordSize) { + uint64_t word; + std::memcpy(&word, src, sizeof(word)); + + if (word & kHighBits) { + break; + } + expandByteWord(src, output); + src += wordSize; + output += wordSize; + n -= wordSize; } - return pos; + + // Handle trailing single-byte varints one at a time. + while (n > 0 && !(src[0] & 0x80)) { + *output++ = static_cast(src[0]); + ++src; + --n; + } + + pos = reinterpret_cast(src); + return n; } -// Codegen for the cases below. Useful if we want to try to tweak something -// (different mask length, etc) in the future. - -// std::string codegenVarintMask(int endByte, int len) { -// CHECK_GE(endByte + 1, len); -// std::string s = "0x0000000000000000ULL"; -// int offset = 5 + endByte * 2; -// for (int i = 0; i < len; ++i) { -// *(s.end() - offset) = '7'; -// *(s.end() - offset + 1) = 'f'; -// offset -= 2; -// } -// return s; -// } - -// std::string codegen32(uint64_t controlBits, int maskLength) { -// CHECK(controlBits < (1 << maskLength)); -// std::string s = absl::Substitute( -// " case $0ULL: {", controlBits); -// int lastZero = -1; -// int numVariants = 0; -// bool carryoverUsed = false; -// for (int nextBit = 0; nextBit < maskLength; ++nextBit) { -// // A zero control bit means we detected the end of a varint, so -// // we can construct a mask of the bottom 7 bits starting at the end -// // of the nextBit byte and going back (nextBit - lastZero) bytes. -// if ((controlBits & (1ULL << nextBit)) == 0) { -// if (carryoverUsed) { -// s += absl::Substitute("\n *output++ = _pext_u64(word, $0);", -// CodegenVarintMask(nextBit, nextBit - -// lastZero)); -// } else { -// s += absl::Substitute("\n const uint64_t firstValue = " -// "_pext_u64(word, $0);", -// CodegenVarintMask(nextBit, nextBit - -// lastZero)); -// s += "\n *output++ = (firstValue << carryoverBits) | -// carryover;"; carryoverUsed = true; -// } -// lastZero = nextBit; -// ++numVariants; -// } -// } -// // Ending on a complete varint, not completing any varint, and completing -// // at least 1 varint but no ending on one are all distinct cases. -// if (lastZero == -1) { -// s += absl::Substitute("\n carryover |= " -// "_pext_u64(word, $0) << carryoverBits;", -// CodegenVarintMask(maskLength - 1, maskLength)); -// s += absl::Substitute("\n carryoverBits += $0;", 7 * maskLength); -// } else if (lastZero == maskLength - 1) { -// s += "\n carryover = 0ULL;"; -// s += "\n carryoverBits = 0;"; -// s += absl::Substitute("\n n -= $0;", numVariants); -// } else { -// s += absl::Substitute("\n carryover = _pext_u64(word, $0);", -// CodegenVarintMask(maskLength - 1, -// maskLength - 1 - lastZero)); -// s += absl::Substitute("\n carryoverBits = $0;", -// 7 * (maskLength - 1 - lastZero)); -// s += absl::Substitute("\n n -= $0;", numVariants); -// } -// // s += absl::Substitute("\n pos += $0;", maskLength); -// s += "\n continue;"; -// s += "\n }"; -// return s; -// } +constexpr std::size_t kCacheLineBytes = 64; +constexpr std::size_t kMaxControlBitsValue = 64; +constexpr std::size_t kMaskLength = 6; + +// Lookup table entry for table-driven BMI2 varint decode. +struct alignas(kCacheLineBytes) VarintLookupEntry { + // Extraction masks for up to 6 completed varints. Unused slots are 0 + uint64_t valueMasks[kMaskLength]; + // Extraction mask for carryover bytes (partial varint at end of chunk), and + // zero when the chunk ends on a clean varint boundary. + uint64_t carryOverMask; + uint8_t numCompleted; + uint8_t carryOverBits; + uint8_t padding[6]; +}; + +static_assert( + sizeof(VarintLookupEntry) == kCacheLineBytes, + "Must fit one cache line"); + +// We build the full 64-entry/kMaxControlBitsValue for control bit lookup table +// at compile time. +static constexpr auto kDecodeTable = [] { + std::array table{}; + for (int i = 0; i < kMaxControlBitsValue; ++i) { + VarintLookupEntry entry{}; + uint64_t currentMask = 0; + + int lastZero = -1, numCompleted = 0; + const uint8_t controlBits = static_cast(i); + for (int j = 0; j < kMaskLength; ++j) { + currentMask |= uint64_t(0x7f) << (j * 8); + if (!((controlBits >> j) & 1)) { + entry.valueMasks[numCompleted] = currentMask; + ++numCompleted; + + lastZero = j; + currentMask = 0; + } + } + + entry.numCompleted = static_cast(numCompleted); + + entry.carryOverMask = 0; + entry.carryOverBits = 0; + if (lastZero < 5) { + // Partial varint at end of chunk. + entry.carryOverMask = currentMask; + entry.carryOverBits = static_cast(7 * (5 - lastZero)); + } else if (lastZero == -1) { + // All 6 bytes are continuation bytes (case 63). Accumulate carryover. + // This is a rare case + entry.carryOverMask = currentMask; + entry.carryOverBits = 42; + } + table[i] = entry; + } + return table; +}(); + +// Table-driven BMI2 varint decode. Reads extraction masks from a lookup table. template -const char* bulkVarintDecodeBmi2(uint64_t n, const char* pos, T* output) { - constexpr uint64_t mask = 0x0000808080808080; - // Note that we could of course use a maskLength of up to 8. But I found - // that with maskLength > 6 we start to spill out of the l1i cache in - // opt mode and that counterbalances the gain. Plus the first run and/or - // small n are more expensive as we have to load more instructions. - constexpr int maskLength = 6; +__attribute__((__target__("bmi2"))) const char* +bulkVarintDecodeBmi2Table(uint64_t n, const char* pos, T* output) { + constexpr uint64_t kControlMask = 0x0000808080808080ULL; + constexpr int kChunkLen = 6; + uint64_t carryover = 0; int carryoverBits = 0; - pos -= maskLength; - // Also note that a handful of these cases are impossible for 32-bit varints. - // We could save a tiny bit of program size by pruning them out. + pos -= kChunkLen; + while (n >= 8) { - pos += maskLength; - uint64_t word = *reinterpret_cast(pos); - const uint64_t controlBits = _pext_u64(word, mask); - switch (controlBits) { - case 0ULL: { - const uint64_t firstValue = _pext_u64(word, 0x000000000000007fULL); - *output++ = (firstValue << carryoverBits) | carryover; - *output++ = _pext_u64(word, 0x0000000000007f00ULL); - *output++ = _pext_u64(word, 0x00000000007f0000ULL); - *output++ = _pext_u64(word, 0x000000007f000000ULL); - *output++ = _pext_u64(word, 0x0000007f00000000ULL); - *output++ = _pext_u64(word, 0x00007f0000000000ULL); - carryover = 0ULL; - carryoverBits = 0; - n -= 6; - continue; - } - case 1ULL: { - const uint64_t firstValue = _pext_u64(word, 0x0000000000007f7fULL); - *output++ = (firstValue << carryoverBits) | carryover; - *output++ = _pext_u64(word, 0x00000000007f0000ULL); - *output++ = _pext_u64(word, 0x000000007f000000ULL); - *output++ = _pext_u64(word, 0x0000007f00000000ULL); - *output++ = _pext_u64(word, 0x00007f0000000000ULL); - carryover = 0ULL; - carryoverBits = 0; - n -= 5; - continue; - } - case 2ULL: { - const uint64_t firstValue = _pext_u64(word, 0x000000000000007fULL); - *output++ = (firstValue << carryoverBits) | carryover; - *output++ = _pext_u64(word, 0x00000000007f7f00ULL); - *output++ = _pext_u64(word, 0x000000007f000000ULL); - *output++ = _pext_u64(word, 0x0000007f00000000ULL); - *output++ = _pext_u64(word, 0x00007f0000000000ULL); - carryover = 0ULL; - carryoverBits = 0; - n -= 5; - continue; - } - case 3ULL: { - const uint64_t firstValue = _pext_u64(word, 0x00000000007f7f7fULL); - *output++ = (firstValue << carryoverBits) | carryover; - *output++ = _pext_u64(word, 0x000000007f000000ULL); - *output++ = _pext_u64(word, 0x0000007f00000000ULL); - *output++ = _pext_u64(word, 0x00007f0000000000ULL); - carryover = 0ULL; - carryoverBits = 0; - n -= 4; - continue; - } - case 4ULL: { - const uint64_t firstValue = _pext_u64(word, 0x000000000000007fULL); - *output++ = (firstValue << carryoverBits) | carryover; - *output++ = _pext_u64(word, 0x0000000000007f00ULL); - *output++ = _pext_u64(word, 0x000000007f7f0000ULL); - *output++ = _pext_u64(word, 0x0000007f00000000ULL); - *output++ = _pext_u64(word, 0x00007f0000000000ULL); - carryover = 0ULL; - carryoverBits = 0; - n -= 5; - continue; - } - case 5ULL: { - const uint64_t firstValue = _pext_u64(word, 0x0000000000007f7fULL); - *output++ = (firstValue << carryoverBits) | carryover; - *output++ = _pext_u64(word, 0x000000007f7f0000ULL); - *output++ = _pext_u64(word, 0x0000007f00000000ULL); - *output++ = _pext_u64(word, 0x00007f0000000000ULL); - carryover = 0ULL; - carryoverBits = 0; - n -= 4; - continue; - } - case 6ULL: { - const uint64_t firstValue = _pext_u64(word, 0x000000000000007fULL); - *output++ = (firstValue << carryoverBits) | carryover; - *output++ = _pext_u64(word, 0x000000007f7f7f00ULL); - *output++ = _pext_u64(word, 0x0000007f00000000ULL); - *output++ = _pext_u64(word, 0x00007f0000000000ULL); - carryover = 0ULL; - carryoverBits = 0; - n -= 4; - continue; - } - case 7ULL: { - const uint64_t firstValue = _pext_u64(word, 0x000000007f7f7f7fULL); - *output++ = (firstValue << carryoverBits) | carryover; - *output++ = _pext_u64(word, 0x0000007f00000000ULL); - *output++ = _pext_u64(word, 0x00007f0000000000ULL); - carryover = 0ULL; - carryoverBits = 0; - n -= 3; - continue; - } - case 8ULL: { - const uint64_t firstValue = _pext_u64(word, 0x000000000000007fULL); - *output++ = (firstValue << carryoverBits) | carryover; - *output++ = _pext_u64(word, 0x0000000000007f00ULL); - *output++ = _pext_u64(word, 0x00000000007f0000ULL); - *output++ = _pext_u64(word, 0x0000007f7f000000ULL); - *output++ = _pext_u64(word, 0x00007f0000000000ULL); - carryover = 0ULL; - carryoverBits = 0; - n -= 5; - continue; - } - case 9ULL: { - const uint64_t firstValue = _pext_u64(word, 0x0000000000007f7fULL); - *output++ = (firstValue << carryoverBits) | carryover; - *output++ = _pext_u64(word, 0x00000000007f0000ULL); - *output++ = _pext_u64(word, 0x0000007f7f000000ULL); - *output++ = _pext_u64(word, 0x00007f0000000000ULL); - carryover = 0ULL; - carryoverBits = 0; - n -= 4; - continue; - } - case 10ULL: { - const uint64_t firstValue = _pext_u64(word, 0x000000000000007fULL); - *output++ = (firstValue << carryoverBits) | carryover; - *output++ = _pext_u64(word, 0x00000000007f7f00ULL); - *output++ = _pext_u64(word, 0x0000007f7f000000ULL); - *output++ = _pext_u64(word, 0x00007f0000000000ULL); - carryover = 0ULL; - carryoverBits = 0; - n -= 4; - continue; - } - case 11ULL: { - const uint64_t firstValue = _pext_u64(word, 0x00000000007f7f7fULL); - *output++ = (firstValue << carryoverBits) | carryover; - *output++ = _pext_u64(word, 0x0000007f7f000000ULL); - *output++ = _pext_u64(word, 0x00007f0000000000ULL); - carryover = 0ULL; - carryoverBits = 0; - n -= 3; - continue; - } - case 12ULL: { - const uint64_t firstValue = _pext_u64(word, 0x000000000000007fULL); - *output++ = (firstValue << carryoverBits) | carryover; - *output++ = _pext_u64(word, 0x0000000000007f00ULL); - *output++ = _pext_u64(word, 0x0000007f7f7f0000ULL); - *output++ = _pext_u64(word, 0x00007f0000000000ULL); - carryover = 0ULL; - carryoverBits = 0; - n -= 4; - continue; - } - case 13ULL: { - const uint64_t firstValue = _pext_u64(word, 0x0000000000007f7fULL); - *output++ = (firstValue << carryoverBits) | carryover; - *output++ = _pext_u64(word, 0x0000007f7f7f0000ULL); - *output++ = _pext_u64(word, 0x00007f0000000000ULL); - carryover = 0ULL; - carryoverBits = 0; - n -= 3; - continue; - } - case 14ULL: { - const uint64_t firstValue = _pext_u64(word, 0x000000000000007fULL); - *output++ = (firstValue << carryoverBits) | carryover; - *output++ = _pext_u64(word, 0x0000007f7f7f7f00ULL); - *output++ = _pext_u64(word, 0x00007f0000000000ULL); - carryover = 0ULL; - carryoverBits = 0; - n -= 3; - continue; - } - case 15ULL: { - const uint64_t firstValue = _pext_u64(word, 0x0000007f7f7f7f7fULL); - *output++ = (firstValue << carryoverBits) | carryover; - *output++ = _pext_u64(word, 0x00007f0000000000ULL); - carryover = 0ULL; - carryoverBits = 0; - n -= 2; - continue; - } - case 16ULL: { - const uint64_t firstValue = _pext_u64(word, 0x000000000000007fULL); - *output++ = (firstValue << carryoverBits) | carryover; - *output++ = _pext_u64(word, 0x0000000000007f00ULL); - *output++ = _pext_u64(word, 0x00000000007f0000ULL); - *output++ = _pext_u64(word, 0x000000007f000000ULL); - *output++ = _pext_u64(word, 0x00007f7f00000000ULL); - carryover = 0ULL; - carryoverBits = 0; - n -= 5; - continue; - } - case 17ULL: { - const uint64_t firstValue = _pext_u64(word, 0x0000000000007f7fULL); - *output++ = (firstValue << carryoverBits) | carryover; - *output++ = _pext_u64(word, 0x00000000007f0000ULL); - *output++ = _pext_u64(word, 0x000000007f000000ULL); - *output++ = _pext_u64(word, 0x00007f7f00000000ULL); - carryover = 0ULL; - carryoverBits = 0; - n -= 4; - continue; - } - case 18ULL: { - const uint64_t firstValue = _pext_u64(word, 0x000000000000007fULL); - *output++ = (firstValue << carryoverBits) | carryover; - *output++ = _pext_u64(word, 0x00000000007f7f00ULL); - *output++ = _pext_u64(word, 0x000000007f000000ULL); - *output++ = _pext_u64(word, 0x00007f7f00000000ULL); - carryover = 0ULL; - carryoverBits = 0; - n -= 4; - continue; - } - case 19ULL: { - const uint64_t firstValue = _pext_u64(word, 0x00000000007f7f7fULL); - *output++ = (firstValue << carryoverBits) | carryover; - *output++ = _pext_u64(word, 0x000000007f000000ULL); - *output++ = _pext_u64(word, 0x00007f7f00000000ULL); - carryover = 0ULL; - carryoverBits = 0; - n -= 3; - continue; - } - case 20ULL: { - const uint64_t firstValue = _pext_u64(word, 0x000000000000007fULL); - *output++ = (firstValue << carryoverBits) | carryover; - *output++ = _pext_u64(word, 0x0000000000007f00ULL); - *output++ = _pext_u64(word, 0x000000007f7f0000ULL); - *output++ = _pext_u64(word, 0x00007f7f00000000ULL); - carryover = 0ULL; - carryoverBits = 0; - n -= 4; - continue; - } - case 21ULL: { - const uint64_t firstValue = _pext_u64(word, 0x0000000000007f7fULL); - *output++ = (firstValue << carryoverBits) | carryover; - *output++ = _pext_u64(word, 0x000000007f7f0000ULL); - *output++ = _pext_u64(word, 0x00007f7f00000000ULL); - carryover = 0ULL; - carryoverBits = 0; - n -= 3; - continue; - } - case 22ULL: { - const uint64_t firstValue = _pext_u64(word, 0x000000000000007fULL); - *output++ = (firstValue << carryoverBits) | carryover; - *output++ = _pext_u64(word, 0x000000007f7f7f00ULL); - *output++ = _pext_u64(word, 0x00007f7f00000000ULL); - carryover = 0ULL; - carryoverBits = 0; - n -= 3; - continue; - } - case 23ULL: { - const uint64_t firstValue = _pext_u64(word, 0x000000007f7f7f7fULL); - *output++ = (firstValue << carryoverBits) | carryover; - *output++ = _pext_u64(word, 0x00007f7f00000000ULL); - carryover = 0ULL; - carryoverBits = 0; - n -= 2; - continue; - } - case 24ULL: { - const uint64_t firstValue = _pext_u64(word, 0x000000000000007fULL); - *output++ = (firstValue << carryoverBits) | carryover; - *output++ = _pext_u64(word, 0x0000000000007f00ULL); - *output++ = _pext_u64(word, 0x00000000007f0000ULL); - *output++ = _pext_u64(word, 0x00007f7f7f000000ULL); - carryover = 0ULL; - carryoverBits = 0; - n -= 4; - continue; - } - case 25ULL: { - const uint64_t firstValue = _pext_u64(word, 0x0000000000007f7fULL); - *output++ = (firstValue << carryoverBits) | carryover; - *output++ = _pext_u64(word, 0x00000000007f0000ULL); - *output++ = _pext_u64(word, 0x00007f7f7f000000ULL); - carryover = 0ULL; - carryoverBits = 0; - n -= 3; - continue; - } - case 26ULL: { - const uint64_t firstValue = _pext_u64(word, 0x000000000000007fULL); - *output++ = (firstValue << carryoverBits) | carryover; - *output++ = _pext_u64(word, 0x00000000007f7f00ULL); - *output++ = _pext_u64(word, 0x00007f7f7f000000ULL); - carryover = 0ULL; - carryoverBits = 0; - n -= 3; - continue; - } - case 27ULL: { - const uint64_t firstValue = _pext_u64(word, 0x00000000007f7f7fULL); - *output++ = (firstValue << carryoverBits) | carryover; - *output++ = _pext_u64(word, 0x00007f7f7f000000ULL); - carryover = 0ULL; - carryoverBits = 0; - n -= 2; - continue; - } - case 28ULL: { - const uint64_t firstValue = _pext_u64(word, 0x000000000000007fULL); - *output++ = (firstValue << carryoverBits) | carryover; - *output++ = _pext_u64(word, 0x0000000000007f00ULL); - *output++ = _pext_u64(word, 0x00007f7f7f7f0000ULL); - carryover = 0ULL; - carryoverBits = 0; - n -= 3; - continue; - } - case 29ULL: { - const uint64_t firstValue = _pext_u64(word, 0x0000000000007f7fULL); - *output++ = (firstValue << carryoverBits) | carryover; - *output++ = _pext_u64(word, 0x00007f7f7f7f0000ULL); - carryover = 0ULL; - carryoverBits = 0; - n -= 2; - continue; - } - case 30ULL: { - const uint64_t firstValue = _pext_u64(word, 0x000000000000007fULL); - *output++ = (firstValue << carryoverBits) | carryover; - *output++ = _pext_u64(word, 0x00007f7f7f7f7f00ULL); - carryover = 0ULL; - carryoverBits = 0; - n -= 2; - continue; - } - case 31ULL: { - const uint64_t firstValue = _pext_u64(word, 0x00007f7f7f7f7f7fULL); - *output++ = (firstValue << carryoverBits) | carryover; - carryover = 0ULL; - carryoverBits = 0; - n -= 1; - continue; - } - case 32ULL: { - const uint64_t firstValue = _pext_u64(word, 0x000000000000007fULL); - *output++ = (firstValue << carryoverBits) | carryover; - *output++ = _pext_u64(word, 0x0000000000007f00ULL); - *output++ = _pext_u64(word, 0x00000000007f0000ULL); - *output++ = _pext_u64(word, 0x000000007f000000ULL); - *output++ = _pext_u64(word, 0x0000007f00000000ULL); - carryover = _pext_u64(word, 0x00007f0000000000ULL); - carryoverBits = 7; - n -= 5; - continue; - } - case 33ULL: { - const uint64_t firstValue = _pext_u64(word, 0x0000000000007f7fULL); - *output++ = (firstValue << carryoverBits) | carryover; - *output++ = _pext_u64(word, 0x00000000007f0000ULL); - *output++ = _pext_u64(word, 0x000000007f000000ULL); - *output++ = _pext_u64(word, 0x0000007f00000000ULL); - carryover = _pext_u64(word, 0x00007f0000000000ULL); - carryoverBits = 7; - n -= 4; - continue; - } - case 34ULL: { - const uint64_t firstValue = _pext_u64(word, 0x000000000000007fULL); - *output++ = (firstValue << carryoverBits) | carryover; - *output++ = _pext_u64(word, 0x00000000007f7f00ULL); - *output++ = _pext_u64(word, 0x000000007f000000ULL); - *output++ = _pext_u64(word, 0x0000007f00000000ULL); - carryover = _pext_u64(word, 0x00007f0000000000ULL); - carryoverBits = 7; - n -= 4; - continue; - } - case 35ULL: { - const uint64_t firstValue = _pext_u64(word, 0x00000000007f7f7fULL); - *output++ = (firstValue << carryoverBits) | carryover; - *output++ = _pext_u64(word, 0x000000007f000000ULL); - *output++ = _pext_u64(word, 0x0000007f00000000ULL); - carryover = _pext_u64(word, 0x00007f0000000000ULL); - carryoverBits = 7; - n -= 3; - continue; - } - case 36ULL: { - const uint64_t firstValue = _pext_u64(word, 0x000000000000007fULL); - *output++ = (firstValue << carryoverBits) | carryover; - *output++ = _pext_u64(word, 0x0000000000007f00ULL); - *output++ = _pext_u64(word, 0x000000007f7f0000ULL); - *output++ = _pext_u64(word, 0x0000007f00000000ULL); - carryover = _pext_u64(word, 0x00007f0000000000ULL); - carryoverBits = 7; - n -= 4; - continue; - } - case 37ULL: { - const uint64_t firstValue = _pext_u64(word, 0x0000000000007f7fULL); - *output++ = (firstValue << carryoverBits) | carryover; - *output++ = _pext_u64(word, 0x000000007f7f0000ULL); - *output++ = _pext_u64(word, 0x0000007f00000000ULL); - carryover = _pext_u64(word, 0x00007f0000000000ULL); - carryoverBits = 7; - n -= 3; - continue; - } - case 38ULL: { - const uint64_t firstValue = _pext_u64(word, 0x000000000000007fULL); - *output++ = (firstValue << carryoverBits) | carryover; - *output++ = _pext_u64(word, 0x000000007f7f7f00ULL); - *output++ = _pext_u64(word, 0x0000007f00000000ULL); - carryover = _pext_u64(word, 0x00007f0000000000ULL); - carryoverBits = 7; - n -= 3; - continue; - } - case 39ULL: { - const uint64_t firstValue = _pext_u64(word, 0x000000007f7f7f7fULL); - *output++ = (firstValue << carryoverBits) | carryover; - *output++ = _pext_u64(word, 0x0000007f00000000ULL); - carryover = _pext_u64(word, 0x00007f0000000000ULL); - carryoverBits = 7; - n -= 2; - continue; - } - case 40ULL: { - const uint64_t firstValue = _pext_u64(word, 0x000000000000007fULL); - *output++ = (firstValue << carryoverBits) | carryover; - *output++ = _pext_u64(word, 0x0000000000007f00ULL); - *output++ = _pext_u64(word, 0x00000000007f0000ULL); - *output++ = _pext_u64(word, 0x0000007f7f000000ULL); - carryover = _pext_u64(word, 0x00007f0000000000ULL); - carryoverBits = 7; - n -= 4; - continue; - } - case 41ULL: { - const uint64_t firstValue = _pext_u64(word, 0x0000000000007f7fULL); - *output++ = (firstValue << carryoverBits) | carryover; - *output++ = _pext_u64(word, 0x00000000007f0000ULL); - *output++ = _pext_u64(word, 0x0000007f7f000000ULL); - carryover = _pext_u64(word, 0x00007f0000000000ULL); - carryoverBits = 7; - n -= 3; - continue; - } - case 42ULL: { - const uint64_t firstValue = _pext_u64(word, 0x000000000000007fULL); - *output++ = (firstValue << carryoverBits) | carryover; - *output++ = _pext_u64(word, 0x00000000007f7f00ULL); - *output++ = _pext_u64(word, 0x0000007f7f000000ULL); - carryover = _pext_u64(word, 0x00007f0000000000ULL); - carryoverBits = 7; - n -= 3; - continue; - } - case 43ULL: { - const uint64_t firstValue = _pext_u64(word, 0x00000000007f7f7fULL); - *output++ = (firstValue << carryoverBits) | carryover; - *output++ = _pext_u64(word, 0x0000007f7f000000ULL); - carryover = _pext_u64(word, 0x00007f0000000000ULL); - carryoverBits = 7; - n -= 2; - continue; - } - case 44ULL: { - const uint64_t firstValue = _pext_u64(word, 0x000000000000007fULL); - *output++ = (firstValue << carryoverBits) | carryover; - *output++ = _pext_u64(word, 0x0000000000007f00ULL); - *output++ = _pext_u64(word, 0x0000007f7f7f0000ULL); - carryover = _pext_u64(word, 0x00007f0000000000ULL); - carryoverBits = 7; - n -= 3; - continue; - } - case 45ULL: { - const uint64_t firstValue = _pext_u64(word, 0x0000000000007f7fULL); - *output++ = (firstValue << carryoverBits) | carryover; - *output++ = _pext_u64(word, 0x0000007f7f7f0000ULL); - carryover = _pext_u64(word, 0x00007f0000000000ULL); - carryoverBits = 7; - n -= 2; - continue; - } - case 46ULL: { - const uint64_t firstValue = _pext_u64(word, 0x000000000000007fULL); - *output++ = (firstValue << carryoverBits) | carryover; - *output++ = _pext_u64(word, 0x0000007f7f7f7f00ULL); - carryover = _pext_u64(word, 0x00007f0000000000ULL); - carryoverBits = 7; - n -= 2; - continue; - } - case 47ULL: { - const uint64_t firstValue = _pext_u64(word, 0x0000007f7f7f7f7fULL); - *output++ = (firstValue << carryoverBits) | carryover; - carryover = _pext_u64(word, 0x00007f0000000000ULL); - carryoverBits = 7; - n -= 1; - continue; - } - case 48ULL: { - const uint64_t firstValue = _pext_u64(word, 0x000000000000007fULL); - *output++ = (firstValue << carryoverBits) | carryover; - *output++ = _pext_u64(word, 0x0000000000007f00ULL); - *output++ = _pext_u64(word, 0x00000000007f0000ULL); - *output++ = _pext_u64(word, 0x000000007f000000ULL); - carryover = _pext_u64(word, 0x00007f7f00000000ULL); - carryoverBits = 14; - n -= 4; - continue; - } - case 49ULL: { - const uint64_t firstValue = _pext_u64(word, 0x0000000000007f7fULL); - *output++ = (firstValue << carryoverBits) | carryover; - *output++ = _pext_u64(word, 0x00000000007f0000ULL); - *output++ = _pext_u64(word, 0x000000007f000000ULL); - carryover = _pext_u64(word, 0x00007f7f00000000ULL); - carryoverBits = 14; - n -= 3; - continue; - } - case 50ULL: { - const uint64_t firstValue = _pext_u64(word, 0x000000000000007fULL); - *output++ = (firstValue << carryoverBits) | carryover; - *output++ = _pext_u64(word, 0x00000000007f7f00ULL); - *output++ = _pext_u64(word, 0x000000007f000000ULL); - carryover = _pext_u64(word, 0x00007f7f00000000ULL); - carryoverBits = 14; - n -= 3; - continue; - } - case 51ULL: { - const uint64_t firstValue = _pext_u64(word, 0x00000000007f7f7fULL); - *output++ = (firstValue << carryoverBits) | carryover; - *output++ = _pext_u64(word, 0x000000007f000000ULL); - carryover = _pext_u64(word, 0x00007f7f00000000ULL); - carryoverBits = 14; - n -= 2; - continue; - } - case 52ULL: { - const uint64_t firstValue = _pext_u64(word, 0x000000000000007fULL); - *output++ = (firstValue << carryoverBits) | carryover; - *output++ = _pext_u64(word, 0x0000000000007f00ULL); - *output++ = _pext_u64(word, 0x000000007f7f0000ULL); - carryover = _pext_u64(word, 0x00007f7f00000000ULL); - carryoverBits = 14; - n -= 3; - continue; - } - case 53ULL: { - const uint64_t firstValue = _pext_u64(word, 0x0000000000007f7fULL); - *output++ = (firstValue << carryoverBits) | carryover; - *output++ = _pext_u64(word, 0x000000007f7f0000ULL); - carryover = _pext_u64(word, 0x00007f7f00000000ULL); - carryoverBits = 14; - n -= 2; - continue; - } - case 54ULL: { - const uint64_t firstValue = _pext_u64(word, 0x000000000000007fULL); - *output++ = (firstValue << carryoverBits) | carryover; - *output++ = _pext_u64(word, 0x000000007f7f7f00ULL); - carryover = _pext_u64(word, 0x00007f7f00000000ULL); - carryoverBits = 14; - n -= 2; - continue; - } - case 55ULL: { - const uint64_t firstValue = _pext_u64(word, 0x000000007f7f7f7fULL); - *output++ = (firstValue << carryoverBits) | carryover; - carryover = _pext_u64(word, 0x00007f7f00000000ULL); - carryoverBits = 14; - n -= 1; - continue; - } - case 56ULL: { - const uint64_t firstValue = _pext_u64(word, 0x000000000000007fULL); - *output++ = (firstValue << carryoverBits) | carryover; - *output++ = _pext_u64(word, 0x0000000000007f00ULL); - *output++ = _pext_u64(word, 0x00000000007f0000ULL); - carryover = _pext_u64(word, 0x00007f7f7f000000ULL); - carryoverBits = 21; - n -= 3; - continue; - } - case 57ULL: { - const uint64_t firstValue = _pext_u64(word, 0x0000000000007f7fULL); - *output++ = (firstValue << carryoverBits) | carryover; - *output++ = _pext_u64(word, 0x00000000007f0000ULL); - carryover = _pext_u64(word, 0x00007f7f7f000000ULL); - carryoverBits = 21; - n -= 2; - continue; - } - case 58ULL: { - const uint64_t firstValue = _pext_u64(word, 0x000000000000007fULL); - *output++ = (firstValue << carryoverBits) | carryover; - *output++ = _pext_u64(word, 0x00000000007f7f00ULL); - carryover = _pext_u64(word, 0x00007f7f7f000000ULL); - carryoverBits = 21; - n -= 2; - continue; - } - case 59ULL: { - const uint64_t firstValue = _pext_u64(word, 0x00000000007f7f7fULL); - *output++ = (firstValue << carryoverBits) | carryover; - carryover = _pext_u64(word, 0x00007f7f7f000000ULL); - carryoverBits = 21; - n -= 1; - continue; - } - case 60ULL: { - const uint64_t firstValue = _pext_u64(word, 0x000000000000007fULL); - *output++ = (firstValue << carryoverBits) | carryover; - *output++ = _pext_u64(word, 0x0000000000007f00ULL); - carryover = _pext_u64(word, 0x00007f7f7f7f0000ULL); - carryoverBits = 28; - n -= 2; - continue; - } - case 61ULL: { - const uint64_t firstValue = _pext_u64(word, 0x0000000000007f7fULL); - *output++ = (firstValue << carryoverBits) | carryover; - carryover = _pext_u64(word, 0x00007f7f7f7f0000ULL); - carryoverBits = 28; - n -= 1; - continue; - } - case 62ULL: { - const uint64_t firstValue = _pext_u64(word, 0x000000000000007fULL); - *output++ = (firstValue << carryoverBits) | carryover; - carryover = _pext_u64(word, 0x00007f7f7f7f7f00ULL); - carryoverBits = 35; - n -= 1; - continue; - } - case 63ULL: { - carryover |= _pext_u64(word, 0x00007f7f7f7f7f7fULL) << carryoverBits; - carryoverBits += 42; - continue; - } - default: { - NIMBLE_UNREACHABLE("Control bits must be < 64"); - } + pos += kChunkLen; + + uint64_t word; + std::memcpy(&word, pos, sizeof(word)); + const uint64_t cb = _pext_u64(word, kControlMask); + + // Case 63 (all continuation bytes) requires accumulating carryover + // rather than replacing it. This case is extremely rare + if (FOLLY_UNLIKELY(cb == 63)) { + carryover |= _pext_u64(word, 0x00007f7f7f7f7f7fULL) << carryoverBits; + carryoverBits += 42; + continue; } + + const auto& info = kDecodeTable[cb]; + + // Extract and store up to 6 values. Unused mask slots are 0, producing + // harmless zero writes that will be overwritten by subsequent iterations + output[0] = static_cast( + (_pext_u64(word, info.valueMasks[0]) << carryoverBits) | carryover); + output[1] = static_cast(_pext_u64(word, info.valueMasks[1])); + output[2] = static_cast(_pext_u64(word, info.valueMasks[2])); + output[3] = static_cast(_pext_u64(word, info.valueMasks[3])); + output[4] = static_cast(_pext_u64(word, info.valueMasks[4])); + output[5] = static_cast(_pext_u64(word, info.valueMasks[5])); + + output += info.numCompleted; + n -= info.numCompleted; + + // Update carryover. When carryoverMask is 0, _pext returns 0 and + // carryoverBits is 0, effectively clearing the carryover state. + carryover = _pext_u64(word, info.carryOverMask); + carryoverBits = info.carryOverBits; } - pos += maskLength; + + pos += kChunkLen; if (n > 0) { - if constexpr (std::is_same::value) { + if constexpr (std::is_same_v) { *output++ = readVarint32(&pos) << carryoverBits | carryover; for (uint64_t i = 1; i < n; ++i) { *output++ = readVarint32(&pos); @@ -837,4 +297,20 @@ const char* bulkVarintDecodeBmi2(uint64_t n, const char* pos, T* output) { return pos; } +const char* bulkVarintDecode32(uint64_t n, const char* pos, uint32_t* output) { + n = bulkDecodeSingleByteRun(n, pos, output); + if (n == 0) { + return pos; + } + return bulkVarintDecodeBmi2Table(n, pos, output); +} + +const char* bulkVarintDecode64(uint64_t n, const char* pos, uint64_t* output) { + n = bulkDecodeSingleByteRun(n, pos, output); + if (n == 0) { + return pos; + } + return bulkVarintDecodeBmi2Table(n, pos, output); +} + } // namespace facebook::nimble::varint diff --git a/dwio/nimble/common/benchmarks/VarintBenchmark.cpp b/dwio/nimble/common/benchmarks/VarintBenchmark.cpp index 6d095fde..b2c6d61a 100644 --- a/dwio/nimble/common/benchmarks/VarintBenchmark.cpp +++ b/dwio/nimble/common/benchmarks/VarintBenchmark.cpp @@ -117,57 +117,66 @@ std::vector MakeSkewedData(int num_elements = kNumElements) { return data; } -BENCHMARK(Encode, iters) { - std::vector data; - std::unique_ptr buf; - BENCHMARK_SUSPEND { - data = MakeUniformData(); - buf = std::make_unique(kNumElements * folly::kMaxVarintLength32); +// Makes data where all values fit in exactly `numBytes` varint bytes. +std::vector MakeFixedWidthData32( + int numBytes, + int num_elements = kNumElements) { + std::vector data(num_elements); + uint32_t lo = (numBytes == 1) ? 0 : (1u << (7 * (numBytes - 1))); + uint32_t hi = (1u << (7 * numBytes)) - 1; + if (numBytes == 5) { + hi = UINT32_MAX; } - while (iters--) { - char* pos = buf.get(); - for (int i = 0; i < kNumElements; ++i) { - nimble::varint::writeVarint(data[i], &pos); - } - CHECK_GE(pos - buf.get(), kNumElements); + for (int i = 0; i < num_elements; ++i) { + data[i] = lo + folly::Random::secureRand32() % (hi - lo + 1); } + return data; } -BENCHMARK(FollyEncode, iters) { - std::vector data; - std::unique_ptr buf; - BENCHMARK_SUSPEND { - data = MakeUniformData(); - buf = std::make_unique(kNumElements * folly::kMaxVarintLength32); +// Makes 64-bit data where all values fit in exactly `numBytes` varint bytes. +std::vector MakeFixedWidthData64( + int numBytes, + int num_elements = kNumElements) { + std::vector data(num_elements); + uint64_t lo = (numBytes == 1) ? 0 : (1ull << (7 * (numBytes - 1))); + uint64_t hi = (numBytes >= 10) ? UINT64_MAX : ((1ull << (7 * numBytes)) - 1); + for (int i = 0; i < num_elements; ++i) { + data[i] = lo + folly::Random::secureRand64() % (hi - lo + 1); } - while (iters--) { - uint8_t* pos = buf.get(); - for (int i = 0; i < kNumElements; ++i) { - pos += folly::encodeVarint(data[i], pos); - } - CHECK_GE(pos - buf.get(), kNumElements); + return data; +} + +// Encode data into a varint buffer, returns total encoded size. +template +std::unique_ptr EncodeData( + const std::vector& data, + uint64_t& encodedSize) { + auto buf = std::make_unique(data.size() * folly::kMaxVarintLength64); + char* pos = buf.get(); + for (auto val : data) { + nimble::varint::writeVarint(val, &pos); } + encodedSize = pos - buf.get(); + return buf; } -BENCHMARK(NimbleDecodeUniform, iters) { +// ============================================================================ +// Original benchmarks (uniform + skewed, 32-bit) +// ============================================================================ + +BENCHMARK(Encode, iters) { std::vector data; std::unique_ptr buf; - std::vector recovered; BENCHMARK_SUSPEND { - recovered.resize(kNumElements); data = MakeUniformData(); buf = std::make_unique(kNumElements * folly::kMaxVarintLength32); - char* pos = buf.get(); - for (int i = 0; i < kNumElements; ++i) { - nimble::varint::writeVarint(data[i], &pos); - } } while (iters--) { - const char* cpos = buf.get(); + char* pos = buf.get(); for (int i = 0; i < kNumElements; ++i) { - recovered[i] = nimble::varint::readVarint32(&cpos); + nimble::varint::writeVarint(data[i], &pos); } - CHECK_EQ(recovered.back(), data.back()); + CHECK_GE(pos - buf.get(), kNumElements); } } @@ -191,90 +200,40 @@ BENCHMARK(NimbleBulkDecodeUniform, iters) { } } -BENCHMARK(FollyDecodeUniform, iters) { - std::vector data; - std::unique_ptr buf; - uint8_t* pos; - std::vector recovered; - BENCHMARK_SUSPEND { - recovered.resize(kNumElements); - data = MakeUniformData(); - buf = std::make_unique(kNumElements * folly::kMaxVarintLength32); - pos = buf.get(); - for (int i = 0; i < kNumElements; ++i) { - pos += folly::encodeVarint(data[i], pos); - } - } - while (iters--) { - const uint8_t* fstart = buf.get(); - const uint8_t* fend = buf.get() + (pos - buf.get()); - folly::Range frange(fstart, fend); - for (int i = 0; i < kNumElements; ++i) { - recovered[i] = folly::decodeVarint(frange); - } - CHECK_EQ(recovered.back(), data.back()); - } -} +// ============================================================================ +// Fixed byte-width benchmarks (32-bit): isolate per-width performance +// ============================================================================ -BENCHMARK(DwrfDecodeUniform, iters) { - std::vector data; - std::unique_ptr buf; - std::vector recovered; - uint64_t varint_bytes; - BENCHMARK_SUSPEND { - recovered.resize(kNumElements); - data = MakeUniformData(); - buf = std::make_unique(kNumElements * folly::kMaxVarintLength32); - char* pos = buf.get(); - for (int i = 0; i < kNumElements; ++i) { - nimble::varint::writeVarint(data[i], &pos); - } - varint_bytes = pos - buf.get(); - } - while (iters--) { - const char* cpos = buf.get(); - const char* end = cpos + varint_bytes; - for (int i = 0; i < kNumElements; ++i) { - recovered[i] = DwrfRead(&cpos, end); - } - CHECK_EQ(recovered.back(), data.back()); - } -} +BENCHMARK_DRAW_LINE(); -BENCHMARK(NimbleDecodeSkewed, iters) { +BENCHMARK(BulkDecode_1byte, iters) { std::vector data; std::unique_ptr buf; std::vector recovered; BENCHMARK_SUSPEND { recovered.resize(kNumElements); - data = MakeSkewedData(); - buf = std::make_unique(kNumElements * folly::kMaxVarintLength32); - char* pos = buf.get(); - for (int i = 0; i < kNumElements; ++i) { - nimble::varint::writeVarint(data[i], &pos); - } + data = MakeFixedWidthData32(1); + uint64_t sz; + buf = EncodeData(data, sz); } while (iters--) { const char* cpos = buf.get(); - for (int i = 0; i < kNumElements; ++i) { - recovered[i] = nimble::varint::readVarint32(&cpos); - } + nimble::varint::bulkVarintDecode32(kNumElements, cpos, recovered.data()); CHECK_EQ(recovered.back(), data.back()); } } -BENCHMARK(NimbleBulkDecodeSkewed, iters) { +BENCHMARK_DRAW_LINE(); + +BENCHMARK(BulkDecode_2byte, iters) { std::vector data; std::unique_ptr buf; std::vector recovered; BENCHMARK_SUSPEND { recovered.resize(kNumElements); - data = MakeSkewedData(); - buf = std::make_unique(kNumElements * folly::kMaxVarintLength32); - char* pos = buf.get(); - for (int i = 0; i < kNumElements; ++i) { - nimble::varint::writeVarint(data[i], &pos); - } + data = MakeFixedWidthData32(2); + uint64_t sz; + buf = EncodeData(data, sz); } while (iters--) { const char* cpos = buf.get(); @@ -283,55 +242,39 @@ BENCHMARK(NimbleBulkDecodeSkewed, iters) { } } -BENCHMARK(FollyDecodeSkewed, iters) { - std::vector data; - std::unique_ptr buf; - uint8_t* pos; - std::vector recovered; - BENCHMARK_SUSPEND { - recovered.resize(kNumElements); - data = MakeSkewedData(); - buf = std::make_unique(kNumElements * folly::kMaxVarintLength32); - pos = buf.get(); - for (int i = 0; i < kNumElements; ++i) { - pos += folly::encodeVarint(data[i], pos); - } - } - while (iters--) { - const uint8_t* fstart = buf.get(); - const uint8_t* fend = buf.get() + (pos - buf.get()); - folly::Range frange(fstart, fend); - for (int i = 0; i < kNumElements; ++i) { - recovered[i] = folly::decodeVarint(frange); - } - CHECK_EQ(recovered.back(), data.back()); - } -} +BENCHMARK_DRAW_LINE(); -BENCHMARK(DwrfDecodeSkewed, iters) { - std::vector data; - std::unique_ptr buf; - std::vector recovered; - uint64_t varint_bytes; - BENCHMARK_SUSPEND { - recovered.resize(kNumElements); - data = MakeSkewedData(); - buf = std::make_unique(kNumElements * folly::kMaxVarintLength32); - char* pos = buf.get(); - for (int i = 0; i < kNumElements; ++i) { - nimble::varint::writeVarint(data[i], &pos); - } - varint_bytes = pos - buf.get(); - } - while (iters--) { - const char* cpos = buf.get(); - const char* end = cpos + varint_bytes; - for (int i = 0; i < kNumElements; ++i) { - recovered[i] = DwrfRead(&cpos, end); - } - CHECK_EQ(recovered.back(), data.back()); +// ============================================================================ +// Batch size benchmarks: how does bulk decode scale with n? +// ============================================================================ + +BENCHMARK_DRAW_LINE(); + +#define BATCH_SIZE_BENCH(N) \ + BENCHMARK(BulkDecode_batch##N, iters) { \ + std::vector data; \ + std::unique_ptr buf; \ + std::vector recovered; \ + BENCHMARK_SUSPEND { \ + recovered.resize(N); \ + data = MakeUniformData(N); \ + uint64_t sz; \ + buf = EncodeData(data, sz); \ + } \ + while (iters--) { \ + const char* cpos = buf.get(); \ + nimble::varint::bulkVarintDecode32(N, cpos, recovered.data()); \ + folly::doNotOptimizeAway(recovered.back()); \ + } \ } -} + +BATCH_SIZE_BENCH(4) +BATCH_SIZE_BENCH(8) +BATCH_SIZE_BENCH(16) +BATCH_SIZE_BENCH(64) +BATCH_SIZE_BENCH(256) +BATCH_SIZE_BENCH(1024) +BATCH_SIZE_BENCH(4096) int main() { folly::runBenchmarks(); diff --git a/dwio/nimble/common/tests/VarintTests.cpp b/dwio/nimble/common/tests/VarintTests.cpp index 95fea935..a06266a6 100644 --- a/dwio/nimble/common/tests/VarintTests.cpp +++ b/dwio/nimble/common/tests/VarintTests.cpp @@ -16,6 +16,8 @@ #include #include +#include + #include "dwio/nimble/common/Varint.h" #include "folly/Random.h" #include "folly/Range.h" @@ -25,8 +27,40 @@ using namespace ::facebook; namespace { const int kNumElements = 10000; + +// Encode a vector of values into a varint buffer, returning the buffer and its +// size. +template +std::pair, size_t> encodeValues( + const std::vector& values) { + auto buf = + std::make_unique(values.size() * folly::kMaxVarintLength64); + char* pos = buf.get(); + for (auto val : values) { + nimble::varint::writeVarint(val, &pos); + } + return {std::move(buf), static_cast(pos - buf.get())}; } +// Bulk-decode and verify the result matches the expected values. +template +void verifyBulkDecode(const std::vector& expected, const char* encoded) { + std::vector decoded(expected.size()); + if constexpr (sizeof(T) == 4) { + nimble::varint::bulkVarintDecode32( + expected.size(), encoded, decoded.data()); + } else { + nimble::varint::bulkVarintDecode64( + expected.size(), encoded, decoded.data()); + } + for (size_t i = 0; i < expected.size(); ++i) { + ASSERT_EQ(expected[i], decoded[i]) + << "mismatch at index " << i << " of " << expected.size(); + } +} + +} // namespace + TEST(VarintTests, varintSize32) { // Boundary values for varint encoding. EXPECT_EQ(nimble::varint::varintSize(uint32_t{0}), 1); @@ -181,3 +215,227 @@ TEST(VarintTests, WriteRead64) { ASSERT_EQ(data[i], bulk[i]); } } + +// ============================================================================ +// Single-byte varint tests: exercise the SIMD decodeSingleByteRun path. +// The function has three loops: +// 1. Wide loop: processes kU8BatchSize bytes (32 on AVX2, 16 on SSE/NEON) +// 2. 8-byte loop: processes 8 bytes at a time +// 3. Tail loop: processes 1 byte at a time +// These tests cover boundary conditions for all three loops. +// ============================================================================ + +// All 128 single-byte values (0-127) decode correctly for uint32_t. +TEST(VarintTests, SingleByte32_AllValues) { + std::vector data(128); + std::iota(data.begin(), data.end(), 0); + auto [buf, size] = encodeValues(data); + ASSERT_EQ(size, 128u); + verifyBulkDecode(data, buf.get()); +} + +// All 128 single-byte values (0-127) decode correctly for uint64_t. +TEST(VarintTests, SingleByte64_AllValues) { + std::vector data(128); + std::iota(data.begin(), data.end(), 0); + auto [buf, size] = encodeValues(data); + ASSERT_EQ(size, 128u); + verifyBulkDecode(data, buf.get()); +} + +// Test every count from 0 to 100 with all-zero values. +// Exercises exact boundary transitions between wide/8-byte/tail loops. +TEST(VarintTests, SingleByte32_AllCountsZero) { + for (int count = 0; count <= 100; ++count) { + std::vector data(count, 0); + auto [buf, size] = encodeValues(data); + verifyBulkDecode(data, buf.get()); + } +} + +// Test every count from 0 to 100 with value 127 (max single-byte varint). +TEST(VarintTests, SingleByte32_AllCountsMax) { + for (int count = 0; count <= 100; ++count) { + std::vector data(count, 127); + auto [buf, size] = encodeValues(data); + verifyBulkDecode(data, buf.get()); + } +} + +// Test counts at specific SIMD boundaries with uint64_t. +TEST(VarintTests, SingleByte64_SimdBoundaries) { + for (int count : {0, 1, 2, 7, 8, 9, 15, 16, 17, 31, + 32, 33, 63, 64, 65, 96, 127, 128, 256, 1000}) { + std::vector data(count); + for (int i = 0; i < count; ++i) { + data[i] = i % 128; + } + auto [buf, size] = encodeValues(data); + ASSERT_EQ(size, static_cast(count)); + verifyBulkDecode(data, buf.get()); + } +} + +// A multi-byte varint (>=128) interrupts the single-byte run at each position +// within a 64-element window. Verifies the SIMD path correctly bails out and +// the remaining elements are decoded by the fallback path. +TEST(VarintTests, SingleByte32_MultiByteInterrupt) { + for (int interruptPos = 0; interruptPos < 64; ++interruptPos) { + const int count = 64; + std::vector data(count); + for (int i = 0; i < count; ++i) { + data[i] = (i == interruptPos) ? 200 : (i % 128); + } + auto [buf, size] = encodeValues(data); + verifyBulkDecode(data, buf.get()); + } +} + +TEST(VarintTests, SingleByte64_MultiByteInterrupt) { + for (int interruptPos = 0; interruptPos < 64; ++interruptPos) { + const int count = 64; + std::vector data(count); + for (int i = 0; i < count; ++i) { + data[i] = (i == interruptPos) ? 200 : (i % 128); + } + auto [buf, size] = encodeValues(data); + verifyBulkDecode(data, buf.get()); + } +} + +// Single-byte values followed by progressively longer multi-byte varints. +// Tests the transition from decodeSingleByteRun into the BMI2/scalar path. +TEST(VarintTests, SingleByte32_TransitionToMultiByte) { + for (int singleCount : {0, 1, 7, 8, 15, 16, 31, 32, 33, 64}) { + for (int multiCount : {0, 1, 5, 10}) { + std::vector data; + data.reserve(singleCount + multiCount); + for (int i = 0; i < singleCount; ++i) { + data.push_back(i % 128); + } + for (int i = 0; i < multiCount; ++i) { + data.push_back(128 + i * 1000); + } + auto [buf, size] = encodeValues(data); + verifyBulkDecode(data, buf.get()); + } + } +} + +TEST(VarintTests, SingleByte64_TransitionToMultiByte) { + for (int singleCount : {0, 1, 7, 8, 15, 16, 31, 32, 33, 64}) { + for (int multiCount : {0, 1, 5, 10}) { + std::vector data; + data.reserve(singleCount + multiCount); + for (int i = 0; i < singleCount; ++i) { + data.push_back(i % 128); + } + for (int i = 0; i < multiCount; ++i) { + data.push_back(128 + static_cast(i) * 1000); + } + auto [buf, size] = encodeValues(data); + verifyBulkDecode(data, buf.get()); + } + } +} + +// Alternating single-byte and multi-byte varints. The SIMD path must +// correctly handle frequent bail-outs and re-entries. +TEST(VarintTests, SingleByte32_AlternatingSingleMulti) { + std::vector data; + data.reserve(200); + for (int i = 0; i < 200; ++i) { + data.push_back(i % 2 == 0 ? (i % 128) : (128 + i)); + } + auto [buf, size] = encodeValues(data); + verifyBulkDecode(data, buf.get()); +} + +// Large run of single-byte varints to stress the wide SIMD loop. +TEST(VarintTests, SingleByte32_LargeRun) { + const int count = 100000; + std::vector data(count); + for (int i = 0; i < count; ++i) { + data[i] = i % 128; + } + auto [buf, size] = encodeValues(data); + ASSERT_EQ(size, static_cast(count)); + verifyBulkDecode(data, buf.get()); +} + +TEST(VarintTests, SingleByte64_LargeRun) { + const int count = 100000; + std::vector data(count); + for (int i = 0; i < count; ++i) { + data[i] = i % 128; + } + auto [buf, size] = encodeValues(data); + ASSERT_EQ(size, static_cast(count)); + verifyBulkDecode(data, buf.get()); +} + +// Random mix: ~80% single-byte, ~20% multi-byte, with a random seed. +TEST(VarintTests, SingleByte32_RandomMix) { + auto seed = folly::Random::rand32(); + LOG(INFO) << "seed: " << seed; + std::mt19937 rng(seed); + + std::vector data(kNumElements); + for (int i = 0; i < kNumElements; ++i) { + if (folly::Random::rand32(rng) % 5 != 0) { + data[i] = folly::Random::rand32(rng) % 128; + } else { + data[i] = 128 + folly::Random::rand32(rng) % 10000; + } + } + auto [buf, size] = encodeValues(data); + verifyBulkDecode(data, buf.get()); +} + +TEST(VarintTests, SingleByte64_RandomMix) { + auto seed = folly::Random::rand32(); + LOG(INFO) << "seed: " << seed; + std::mt19937 rng(seed); + + std::vector data(kNumElements); + for (int i = 0; i < kNumElements; ++i) { + if (folly::Random::rand32(rng) % 5 != 0) { + data[i] = folly::Random::rand32(rng) % 128; + } else { + data[i] = 128 + folly::Random::rand64(rng) % 1000000; + } + } + auto [buf, size] = encodeValues(data); + verifyBulkDecode(data, buf.get()); +} + +// Constant value runs for each single-byte value. +TEST(VarintTests, SingleByte32_ConstantRuns) { + for (uint32_t val = 0; val < 128; ++val) { + std::vector data(37, val); + auto [buf, size] = encodeValues(data); + verifyBulkDecode(data, buf.get()); + } +} + +// Verify that a single multi-byte varint at the very start works. +TEST(VarintTests, SingleByte32_MultiByteFirst) { + std::vector data = {300}; + for (int i = 0; i < 50; ++i) { + data.push_back(i % 128); + } + auto [buf, size] = encodeValues(data); + verifyBulkDecode(data, buf.get()); +} + +// Verify that a single multi-byte varint at the very end works. +TEST(VarintTests, SingleByte32_MultiByteLast) { + std::vector data; + data.reserve(51); + for (int i = 0; i < 50; ++i) { + data.push_back(i % 128); + } + data.push_back(300); + auto [buf, size] = encodeValues(data); + verifyBulkDecode(data, buf.get()); +}