Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
f06434a
encodings: add frequency partitioning to encoding identifiers
David-C-L Apr 2, 2026
16e4559
fix compilation issues wrt mem pool in FreqPartitionEncoding
David-C-L Jan 20, 2026
81b1dd7
encodings/tests: align encoding tests with config templatisation and …
David-C-L Apr 2, 2026
a415df6
extend flatbuffers finding in CMakeLists
David-C-L Jan 20, 2026
9d1601a
encoding/tests: update import name to FrequencyPartitionEncoding.hpp
David-C-L Apr 3, 2026
f9e5589
fix decode dictionary indexing
David-C-L Jan 20, 2026
b50d24c
update ForEncoding to use 64bit and include in nimble framework
David-C-L Jan 20, 2026
dc25009
add tests for ForEncoding with selective and bulk reads
David-C-L Jan 20, 2026
aa268ac
remove unnecessary comments
David-C-L Jan 20, 2026
f7fc4dc
refactor encoding selection to use zstd when meta internal is disabled
David-C-L Feb 9, 2026
42e5597
restore metainternal use in tests instead of zstd
David-C-L Feb 9, 2026
7052d4a
update for and freq interfaces and tests to use string buffer factory
David-C-L Feb 9, 2026
e537fb3
fix conflicting NUMERIC macros in EncodingFactor
David-C-L Feb 9, 2026
6602782
parameterise forward defs of encoding type traits
David-C-L Feb 9, 2026
b64ceb9
add string buffer factory default initialiser for Freq and For tests
David-C-L Feb 9, 2026
d5a5461
fix compilation bug in VeloxReaderTests comparing non-numeric to int 0
David-C-L Feb 9, 2026
c5a06b0
encodings: thread options through factory and prefix serialization
David-C-L Mar 31, 2026
0df6fd6
tests: make optional list/map literals explicit in selective reader t…
David-C-L Mar 31, 2026
349c885
[encodings/FreqPart] replace for loop vector assignment with memcpy
David-C-L Apr 13, 2026
8f75f1c
[encodings/FreqPart] replace value-to-tier assignments via vectors wi…
David-C-L Apr 13, 2026
bb9a094
[encodings/FreqPart] add pre-allocation for serialised Dict and Key s…
David-C-L Apr 13, 2026
b798880
[encodings/FreqPart] refactor encoding buffer creation to use options…
David-C-L Apr 13, 2026
610ecc2
[encodings/FOR] refactor encoding buffer creation to use options factory
David-C-L Apr 13, 2026
825c731
[encodings/FOR] implement bulk decode for residuals
David-C-L Apr 13, 2026
b53cf5a
[encodings/tests] remove redundant EncodingTypeTraits system
David-C-L Apr 14, 2026
5a3e411
[velox/tests] reorder parameters of velox writer test initialisation …
David-C-L Apr 14, 2026
7c7ccef
[encoding/tests] add guard for pool initialisation to avoid test setu…
David-C-L Apr 14, 2026
8289202
[encodings/Encoding] add frequencyPartitionIndex var to distinguish b…
David-C-L May 7, 2026
442e724
[encodings/FPE] extend FPE to support indexes reconstituting the orig…
David-C-L May 7, 2026
56972bc
[encodings/tests/FPE] add FPE tests for each new index type: PerTierB…
David-C-L May 7, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 15 additions & 7 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -119,16 +119,24 @@ add_link_options(-Wl,--gc-sections)

include(CTest) # include after project() but before add_subdirectory()

find_package(flatbuffers)
set(NIMBLE_NEED_BUNDLED_BUILD_FLAT_BUFFERS FLASE)
if(flatbuffers_FOUND)
if(flatbuffers_VERSION VERSION_LESS "22.9.4")
# Try to find FlatBuffers with uppercase name first (newer versions)
find_package(FlatBuffers QUIET)
set(NIMBLE_NEED_BUNDLED_BUILD_FLAT_BUFFERS FALSE)
if(FlatBuffers_FOUND)
if(FlatBuffers_VERSION VERSION_LESS "22.9.4")
set(NIMBLE_NEED_BUNDLED_BUILD_FLAT_BUFFERS TRUE)
endif()
else()
# Fallback to old FlatBuffers (< 2.0.0).
find_package(Flatbuffers REQUIRED)
set(NIMBLE_NEED_BUNDLED_BUILD_FLAT_BUFFERS TRUE)
# Try lowercase variant
find_package(flatbuffers QUIET)
if(flatbuffers_FOUND)
if(flatbuffers_VERSION VERSION_LESS "22.9.4")
set(NIMBLE_NEED_BUNDLED_BUILD_FLAT_BUFFERS TRUE)
endif()
else()
# No system FlatBuffers found, use bundled
set(NIMBLE_NEED_BUNDLED_BUILD_FLAT_BUFFERS TRUE)
endif()
endif()
if(NIMBLE_NEED_BUNDLED_BUILD_FLAT_BUFFERS)
# Old FlatBuffers (< 22.9.4) doesn't provide build_flatbuffers(). So
Expand Down
4 changes: 4 additions & 0 deletions dwio/nimble/common/Types.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,10 @@ std::string toString(EncodingType encodingType) {
return "Constant";
case EncodingType::MainlyConstant:
return "MainlyConstant";
case EncodingType::FrequencyPartition:
return "FrequencyPartition";
case EncodingType::FOR:
return "FOR";
case EncodingType::Sentinel:
return "Sentinel";
case EncodingType::Prefix:
Expand Down
6 changes: 6 additions & 0 deletions dwio/nimble/common/Types.h
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,12 @@ enum class EncodingType {
// shared across consecutive entries to reduce storage. Supports seek
// operations for efficient random access.
Prefix = 11,
// Partitions data by value frequency. Frequent values get shorter bit-width
// codes. Rows are reordered to group values with same code length.
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@David-C-L Can we see if we can achieve better performance without re-ordering the rows?

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The row reordering enables efficient value-granularity random access. Without reordering, the encoding would be limited to O(n) bulk decoding (similar to Huffman encoding) because of the variable-sized keys. We did explore some indexes (they should be explained in the initial PR description) that could serve as a view onto the original order, so you get the benefit of reordering for random access while still allowing access through the original ordering.

Do you think it's worth implementing these indexes as an option for the encoding?

FrequencyPartition = 12,
// Frame of Reference: stores offsets from per-frame minimum values.
// Supports O(1) random access. Preserves row order.
FOR = 13,
};
std::string toString(EncodingType encodingType);
std::ostream& operator<<(std::ostream& out, EncodingType encodingType);
Expand Down
57 changes: 57 additions & 0 deletions dwio/nimble/encodings/Compression.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -43,13 +43,60 @@ struct CompressorRegistry {

// Looks up the compressor registered for `compressionType` and returns a
// reference to it.
//
// The registry is a function-local static, so it is constructed the first
// time this function is called. When DISABLE_META_INTERNAL_COMPRESSOR is
// defined, a MetaInternal request is rewritten to Zstd before the lookup.
// The lookup is guarded by NIMBLE_CHECK, which fails with the message below
// when no compressor is registered for the (possibly rewritten) type.
ICompressor& getCompressor(CompressionType compressionType) {
static CompressorRegistry registry;

#ifdef DISABLE_META_INTERNAL_COMPRESSOR
// When MetaInternal is not available, redirect to Zstd
if (compressionType == CompressionType::MetaInternal) {
compressionType = CompressionType::Zstd;
}
#endif

// NOTE(review): keep the condition expression as written; check macros often
// embed the expression text in the failure message -- confirm before renaming.
auto it = registry.compressors.find(compressionType);
NIMBLE_CHECK(
it != registry.compressors.end(),
"Compressor for type {} is not registered.",
toString(compressionType));
return *it->second;
}

#ifdef DISABLE_META_INTERNAL_COMPRESSOR
/// Adapter around an existing CompressionPolicy that presents MetaInternal
/// as Zstd. Any other compression type reported by the wrapped policy is
/// passed through untouched.
///
/// The adapter only holds a reference to the wrapped policy, so the wrapped
/// policy must outlive the adapter.
class MetaInternalToZstdPolicy : public CompressionPolicy {
 public:
  explicit MetaInternalToZstdPolicy(const CompressionPolicy& base)
      : base_(base) {}

  /// Returns the wrapped policy's compression information, rewritten to Zstd
  /// when the wrapped policy asks for MetaInternal.
  CompressionInformation compression() const override {
    auto original = base_.compression();
    if (original.compressionType != CompressionType::MetaInternal) {
      return original;
    }

    // Carry over the minimum compression size and reuse the MetaInternal
    // compression level as the Zstd compression level.
    CompressionInformation redirected{
        .compressionType = CompressionType::Zstd,
        .minCompressionSize = original.minCompressionSize};
    redirected.parameters.zstd.compressionLevel =
        original.parameters.metaInternal.compressionLevel;
    return redirected;
  }

  /// Forwards the acceptance decision to the wrapped policy, reporting
  /// MetaInternal as Zstd.
  bool shouldAccept(
      CompressionType compressionType,
      uint64_t uncompressedSize,
      uint64_t compressedSize) const override {
    const CompressionType effectiveType =
        compressionType == CompressionType::MetaInternal
        ? CompressionType::Zstd
        : compressionType;
    return base_.shouldAccept(
        effectiveType, uncompressedSize, compressedSize);
  }

 private:
  const CompressionPolicy& base_;
};
#endif

} // namespace

/* static */ CompressionResult Compression::compress(
Expand All @@ -60,6 +107,16 @@ ICompressor& getCompressor(CompressionType compressionType) {
const CompressionPolicy& compressionPolicy) {
auto compression = compressionPolicy.compression();

#ifdef DISABLE_META_INTERNAL_COMPRESSOR
// Wrap the policy to redirect MetaInternal to Zstd
if (compression.compressionType == CompressionType::MetaInternal) {
MetaInternalToZstdPolicy wrapper(compressionPolicy);
compression = wrapper.compression();
return getCompressor(compression.compressionType)
.compress(memoryPool, data, dataType, bitWidth, wrapper);
}
#endif

return getCompressor(compression.compressionType)
.compress(memoryPool, data, dataType, bitWidth, compressionPolicy);
}
Expand Down
15 changes: 9 additions & 6 deletions dwio/nimble/encodings/Encoding.h
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,10 @@ class Encoding {
/// fixed 4-byte uint32. Determined from file format version or serializer
/// version.
bool useVarintRowCount;
/// FrequencyPartitionEncoding index type (cast to FreqPartIndexType).
/// 0 = NoIndex (default, backward-compatible), 1 = PerTierBitmaps,
/// 2 = TierTagArray, 3 = EliasFano.
/// Explicitly initialized to 0 so that an options object created without
/// setting this field selects NoIndex instead of holding an indeterminate
/// value.
uint8_t frequencyPartitionIndex{0};
};

/// The binary layout for each Encoding begins with the same prefix:
Expand Down Expand Up @@ -249,16 +253,15 @@ class Encoding {

protected:
static void serializePrefix(
EncodingType encodingType,
DataType dataType,
uint32_t rowCount,
bool useVarint,
char*& pos);
EncodingType encodingType,
DataType dataType,
uint32_t rowCount,
bool useVarint,
char*& pos);

// Compute the prefix size for serialization. Returns kPrefixSize for fixed
// format, or 2 + varintSize(rowCount) for varint format.
static uint32_t serializePrefixSize(uint32_t rowCount, bool useVarint);

// Static helpers for initializer list computation.
static DataType readDataType(std::string_view data);
static uint32_t readRowCount(std::string_view data, bool useVarint);
Expand Down
49 changes: 49 additions & 0 deletions dwio/nimble/encodings/EncodingFactory.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@
#include "dwio/nimble/encodings/DictionaryEncoding.h"
#include "dwio/nimble/encodings/EncodingSelection.h"
#include "dwio/nimble/encodings/FixedBitWidthEncoding.h"
#include "dwio/nimble/encodings/ForEncoding.h"
#include "dwio/nimble/encodings/FrequencyPartitionEncoding.h"
#include "dwio/nimble/encodings/MainlyConstantEncoding.h"
#include "dwio/nimble/encodings/NullableEncoding.h"
#include "dwio/nimble/encodings/PrefixEncoding.h"
Expand Down Expand Up @@ -199,6 +201,30 @@ std::unique_ptr<Encoding> EncodingFactory::create(
toString(dataType)); \
}

// Switches on `dataType` and returns a std::unique_ptr to `Encoding<T>` for
// the matching 8/16/32/64-bit signed or unsigned integer type. Any other data
// type reaches NIMBLE_UNREACHABLE.
//
// `Encoding` is the encoding class template to instantiate. The expansion
// refers to `memoryPool` and `data`, which must be in scope at the use site.
// The failure message names the encoding by stringizing the macro argument,
// so it stays accurate for whichever encoding the macro is used with.
#define RETURN_ENCODING_BY_INTEGER_TYPE(Encoding, dataType)            \
  switch (dataType) {                                                  \
    case DataType::Int8:                                               \
      return std::make_unique<Encoding<int8_t>>(memoryPool, data);     \
    case DataType::Uint8:                                              \
      return std::make_unique<Encoding<uint8_t>>(memoryPool, data);    \
    case DataType::Int16:                                              \
      return std::make_unique<Encoding<int16_t>>(memoryPool, data);    \
    case DataType::Uint16:                                             \
      return std::make_unique<Encoding<uint16_t>>(memoryPool, data);   \
    case DataType::Int32:                                              \
      return std::make_unique<Encoding<int32_t>>(memoryPool, data);    \
    case DataType::Uint32:                                             \
      return std::make_unique<Encoding<uint32_t>>(memoryPool, data);   \
    case DataType::Int64:                                              \
      return std::make_unique<Encoding<int64_t>>(memoryPool, data);    \
    case DataType::Uint64:                                             \
      return std::make_unique<Encoding<uint64_t>>(memoryPool, data);   \
    default:                                                           \
      NIMBLE_UNREACHABLE(                                              \
          #Encoding " only supports integer types, got {}.",           \
          toString(dataType));                                         \
  }

switch (encodingType) {
case EncodingType::Trivial: {
RETURN_ENCODING_BY_LEAF_TYPE(TrivialEncoding, dataType);
Expand Down Expand Up @@ -243,6 +269,12 @@ std::unique_ptr<Encoding> EncodingFactory::create(
case EncodingType::Delta: {
RETURN_ENCODING_BY_NUMERIC_TYPE(DeltaEncoding, dataType);
}
case EncodingType::FOR: {
RETURN_ENCODING_BY_NUMERIC_TYPE(ForEncoding, dataType);
}
case EncodingType::FrequencyPartition: {
RETURN_ENCODING_BY_NUMERIC_TYPE(FrequencyPartitionEncoding, dataType);
}
default: {
NIMBLE_UNREACHABLE(
"Trying to deserialize invalid EncodingType:{} -- garbage input?",
Expand Down Expand Up @@ -349,6 +381,23 @@ std::string_view EncodingFactory::encode(
selection, castedValues, buffer, options);
}
}
case EncodingType::FrequencyPartition: {
if constexpr (std::is_same<T, bool>::value) {
NIMBLE_INCOMPATIBLE_ENCODING(
"FrequencyPartition encoding should not be selected for bool data types.");
} else {
return FrequencyPartitionEncoding<T>::encode(
selection, castedValues, buffer, options);
}
}
case EncodingType::FOR: {
if constexpr (std::is_integral<physicalType>::value && !std::is_same<T, bool>::value) {
return ForEncoding<T>::encode(selection, castedValues, buffer, options);
} else {
NIMBLE_INCOMPATIBLE_ENCODING(
"For encoding can only be selected for integral data types (not bool).");
}
}
case EncodingType::SparseBool: {
if constexpr (!std::is_same<T, bool>::value) {
NIMBLE_INCOMPATIBLE_ENCODING(
Expand Down
22 changes: 22 additions & 0 deletions dwio/nimble/encodings/EncodingIdentifier.h
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,28 @@ struct EncodingIdentifiers {
static constexpr NestedEncodingIdentifier Restatements = 1;
static constexpr NestedEncodingIdentifier IsRestatements = 2;
};

// Nested-stream identifiers for the FrequencyPartition encoding. The values
// are consecutive (0-14) and grouped as: partition metadata, one dictionary
// stream per tier, one key stream per tier, and a final stream for values
// that are stored unencoded.
struct FrequencyPartition {
// Partition metadata
static constexpr NestedEncodingIdentifier PartitionOffsets = 0;
static constexpr NestedEncodingIdentifier PartitionSizes = 1;
// Per-tier dictionaries, one per key width: 1, 2, 4, 8, 16 and 32 bits.
static constexpr NestedEncodingIdentifier Dict1Bit = 2;
static constexpr NestedEncodingIdentifier Dict2Bit = 3;
static constexpr NestedEncodingIdentifier Dict4Bit = 4;
static constexpr NestedEncodingIdentifier Dict8Bit = 5;
static constexpr NestedEncodingIdentifier Dict16Bit = 6;
static constexpr NestedEncodingIdentifier Dict32Bit = 7;
// Per-tier encoded keys, in the same tier order as the dictionaries above.
static constexpr NestedEncodingIdentifier Keys1Bit = 8;
static constexpr NestedEncodingIdentifier Keys2Bit = 9;
static constexpr NestedEncodingIdentifier Keys4Bit = 10;
static constexpr NestedEncodingIdentifier Keys8Bit = 11;
static constexpr NestedEncodingIdentifier Keys16Bit = 12;
static constexpr NestedEncodingIdentifier Keys32Bit = 13;
// Unencoded partition (raw values that don't fit in any tier)
static constexpr NestedEncodingIdentifier UnencodedValues = 14;
};
};

} // namespace facebook::nimble
10 changes: 9 additions & 1 deletion dwio/nimble/encodings/EncodingLayout.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,15 @@ std::pair<EncodingLayout, uint32_t> EncodingLayout::create(

auto pos = encoding.data();
const auto encodingType = encoding::read<uint8_t, EncodingType>(pos);
const auto compressionType = encoding::read<uint8_t, CompressionType>(pos);
auto compressionType = encoding::read<uint8_t, CompressionType>(pos);

#ifdef DISABLE_META_INTERNAL_COMPRESSOR
// When MetaInternal is not available, map it to Zstd
if (compressionType == CompressionType::MetaInternal) {
compressionType = CompressionType::Zstd;
}
#endif

const auto childrenCount = encoding::read<uint8_t>(pos);
[[maybe_unused]] const auto extraDataSize = encoding::read<uint16_t>(pos);

Expand Down
7 changes: 7 additions & 0 deletions dwio/nimble/encodings/EncodingUtils.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
#include "dwio/nimble/encodings/DeltaEncoding.h"
#include "dwio/nimble/encodings/DictionaryEncoding.h"
#include "dwio/nimble/encodings/FixedBitWidthEncoding.h"
#include "dwio/nimble/encodings/ForEncoding.h"
#include "dwio/nimble/encodings/MainlyConstantEncoding.h"
#include "dwio/nimble/encodings/NullableEncoding.h"
#include "dwio/nimble/encodings/PrefixEncoding.h"
Expand Down Expand Up @@ -130,6 +131,12 @@ auto encodingTypeDispatchNonString(Encoding& encoding, F&& f) {
return f(static_cast<MainlyConstantEncoding<T>&>(encoding));
case EncodingType::Delta:
return f(static_cast<DeltaEncoding<T>&>(encoding));
case EncodingType::FOR:
if constexpr (std::is_integral_v<T> && !std::is_same_v<T, bool>) {
return f(static_cast<ForEncoding<T>&>(encoding));
} else {
NIMBLE_UNREACHABLE(toString(encoding.dataType()));
}
default:
NIMBLE_UNSUPPORTED(toString(encoding.encodingType()));
}
Expand Down
Loading