diff --git a/Makefile b/Makefile index 7d9384ef3b1..3fc69c3c820 100644 --- a/Makefile +++ b/Makefile @@ -32,9 +32,9 @@ _VENV_PIP := $(VENV_DIR)/$(VENV_NAME)/bin/pip # ── Phony targets ──────────────────────────────────────────────────────────── .PHONY: help setup protoc venv activate lint lint-check \ build build-debug configure configure-debug \ - test-cpp test-cpp-debug symlink symlink-debug \ + test-cpp test-cpp-debug test-cpp-rapidcheck test-cpp-rapidcheck-debug symlink symlink-debug \ test-py build-and-test-py build-and-test-py-debug \ - wheel bench-cpp bench-py install-editable + wheel bench-cpp bench-cpp-build bench-py install-editable # ── help ───────────────────────────────────────────────────────────────────── help: ## Show this help @@ -136,6 +136,15 @@ test-cpp-debug: $(_DEBUG_BUILD_DIR)/.configure-stamp ## Build and run C++ unit t cmake --build $(_DEBUG_BUILD_DIR) -j $(CMAKE_JOBS) --target test_unit_arcticdb $(_DEBUG_BUILD_DIR)/arcticdb/test_unit_arcticdb $(if $(FILTER),--gtest_filter=$(FILTER)) +# ── test-cpp-rapidcheck ────────────────────────────────────────────────────── +test-cpp-rapidcheck: $(_RELEASE_BUILD_DIR)/.configure-stamp ## Build and run C++ rapidcheck tests (release, FILTER= for gtest_filter) + cmake --build $(_RELEASE_BUILD_DIR) -j $(CMAKE_JOBS) --target arcticdb_rapidcheck_tests + $(_RELEASE_BUILD_DIR)/arcticdb/arcticdb_rapidcheck_tests $(if $(FILTER),--gtest_filter=$(FILTER)) + +test-cpp-rapidcheck-debug: $(_DEBUG_BUILD_DIR)/.configure-stamp ## Build and run C++ rapidcheck tests (debug, FILTER= for gtest_filter) + cmake --build $(_DEBUG_BUILD_DIR) -j $(CMAKE_JOBS) --target arcticdb_rapidcheck_tests + $(_DEBUG_BUILD_DIR)/arcticdb/arcticdb_rapidcheck_tests $(if $(FILTER),--gtest_filter=$(FILTER)) + # ── symlink ────────────────────────────────────────────────────────────────── _EXT_SUFFIX := $(shell python3 -c "import sysconfig; print(sysconfig.get_config_var('EXT_SUFFIX'))") @@ -169,8 +178,10 @@ wheel: ## Build a pip wheel $(PROXY_CMD) $(_VENV_PIP) wheel . --no-deps -w dist/ # ── bench-cpp ──────────────────────────────────────────────────────────────── -bench-cpp: $(_RELEASE_BUILD_DIR)/.configure-stamp ## Build and run C++ benchmarks (release, FILTER= for benchmark_filter) +bench-cpp-build: $(_RELEASE_BUILD_DIR)/.configure-stamp ## Build C++ benchmarks without running (release) cmake --build $(_RELEASE_BUILD_DIR) -j $(CMAKE_JOBS) --target benchmarks + +bench-cpp: bench-cpp-build ## Build and run C++ benchmarks (release, FILTER= for benchmark_filter) $(_RELEASE_BUILD_DIR)/arcticdb/benchmarks $(if $(FILTER),--benchmark_filter=$(FILTER)) # ── install-editable ───────────────────────────────────────────────────────── diff --git a/cpp/arcticdb/column_store/column_algorithms.hpp b/cpp/arcticdb/column_store/column_algorithms.hpp index 4c2d59a6ef5..80509657c2e 100644 --- a/cpp/arcticdb/column_store/column_algorithms.hpp +++ b/cpp/arcticdb/column_store/column_algorithms.hpp @@ -355,14 +355,17 @@ typename TDT::DataTypeTag::raw_type value_at(const ColumnData::ColumnDataIterato // For lower_bound that is `probe < value`; for upper_bound it is `probe <= value`. // `within_block_bisect` is std::lower_bound or std::upper_bound run on the contiguous block memory. template +requires(ID == IteratorDensity::DENSE) && (TDT::dimension() == Dimension::Dim0) && + std::predicate && + std::invocable< + WithinBlockBisect, const typename TDT::DataTypeTag::raw_type*, + const typename TDT::DataTypeTag::raw_type*, typename TDT::DataTypeTag::raw_type> ColumnData::ColumnDataIterator bound_search( const ColumnData::ColumnDataIterator& begin, const ColumnData::ColumnDataIterator& end, typename TDT::DataTypeTag::raw_type value, - IsBeforeAnswer is_before, WithinBlockBisect bisect + IsBeforeAnswer&& is_before, WithinBlockBisect&& bisect ) { using RawType = typename TDT::DataTypeTag::raw_type; - static_assert(ID == IteratorDensity::DENSE, "Sorted search currently supports DENSE only"); - static_assert(TDT::dimension() == Dimension::Dim0, "Sorted search supports Dim0 only"); util::check(begin.parent() == end.parent(), "bound_search: begin and end have different parents"); if (begin == end) { @@ -414,11 +417,12 @@ ColumnData::ColumnDataIterator bound_search( // Gallop forward from `begin` in steps of 2**n until an element after value is reached. // Returns the exponential range known to contain the first element for which `!is_before`. template -std::pair, ColumnData::ColumnDataIterator> -gallop_bracket( +requires(ID == IteratorDensity::DENSE) && (TDT::dimension() == Dimension::Dim0) && + std::predicate +std::pair, ColumnData::ColumnDataIterator> gallop_bracket( const ColumnData::ColumnDataIterator& begin, const ColumnData::ColumnDataIterator& end, typename TDT::DataTypeTag::raw_type value, - IsBeforeAnswer is_before + IsBeforeAnswer&& is_before ) { using RawType = typename TDT::DataTypeTag::raw_type; if (begin == end) { @@ -481,8 +485,7 @@ gallop_bracket( // We iterate until `first_offset+step < up_to - 1` because we'll later explicitly probe at // the last element of the first block const size_t up_to = end_block_idx > first_block_idx ? first_block_row_count : end_in_block_offset; - size_t step = 1; - for (; first_offset + step + 1 < up_to; step *= 2) { + for (size_t step = 1; first_offset + step + 1 < up_to; step *= 2) { const size_t probe_offset = first_offset + step; if (!record_probe_in_first_block(probe_offset + 1, first_block_data[probe_offset])) { return {make_iter_in_first_block(prev_offset), make_iter_in_first_block(cur_offset)}; @@ -500,8 +503,7 @@ gallop_bracket( } // Answer is after the first block — probe the last elements of blocks at first_idx + 2**n - step = 1; - for (; first_block_idx + step < end_block_idx; step *= 2) { + for (size_t step = 1; first_block_idx + step < end_block_idx; step *= 2) { const size_t block_idx = first_block_idx + step; const RawType last_in_block = block_data_at(block_idx)[block_row_count_at(block_idx) - 1]; if (!record_probe(block_idx + 1, 0, last_in_block)) { diff --git a/cpp/arcticdb/column_store/test/benchmark_column.cpp b/cpp/arcticdb/column_store/test/benchmark_column.cpp index b4d3efa4278..83aa7509528 100644 --- a/cpp/arcticdb/column_store/test/benchmark_column.cpp +++ b/cpp/arcticdb/column_store/test/benchmark_column.cpp @@ -13,6 +13,7 @@ #include #include #include +#include using namespace arcticdb; @@ -25,7 +26,7 @@ static std::mt19937 gen(rd()); // ─── Sorted-search benchmarks across block layouts ──────────────────────────────────────────────── // -// Four column shapes — single-block (PRESIZED memcpy), regular blocks (presized_in_blocks), +// Four column shapes — single-block (PRESIZED), regular blocks (presized_in_blocks), // irregular blocks of size 1000 (DETACHABLE), irregular blocks of size 1 (DETACHABLE). namespace { @@ -41,51 +42,21 @@ std::vector make_sorted_data(size_t num_rows, std::mt19937& rng) { return data; } -void populate(Column& col, const std::vector& data) { - for (size_t i = 0; i < data.size(); ++i) { - col.reference_at(i) = data[i]; - } -} - -Column make_single_block(const std::vector& data) { - Column col( - make_scalar_type(DataType::NANOSECONDS_UTC64), - data.size(), - AllocationType::PRESIZED, - Sparsity::NOT_PERMITTED - ); - memcpy(col.ptr(), data.data(), data.size() * sizeof(timestamp)); - col.set_row_data(data.size() - 1); - return col; -} - -Column make_regular_blocks(const std::vector& data) { - Column col( - make_scalar_type(DataType::NANOSECONDS_UTC64), - Sparsity::NOT_PERMITTED, - ChunkedBuffer::presized_in_blocks(data.size() * sizeof(timestamp)) - ); - populate(col, data); - return col; -} - -// DETACHABLE allocation routes lookups through ChunkedBuffer::block_offsets_ even with uniform -// block sizes, so these stress the irregular path while keeping block sizes consistent. -Column make_irregular_blocks(const std::vector& data, size_t block_size) { - Column col(make_scalar_type(DataType::NANOSECONDS_UTC64), 0, AllocationType::DETACHABLE, Sparsity::NOT_PERMITTED); - size_t remaining = data.size(); - while (remaining > 0) { - const size_t alloc = std::min(remaining, block_size); - col.allocate_data(alloc * sizeof(timestamp)); - col.advance_data(alloc * sizeof(timestamp)); - remaining -= alloc; - } - populate(col, data); - return col; -} - -auto make_irregular_blocks_1000 = [](const std::vector& data) { return make_irregular_blocks(data, 1000); }; -auto make_irregular_blocks_1 = [](const std::vector& data) { return make_irregular_blocks(data, 1); }; +// NANOSECONDS_UTC64 keeps the column type consistent with BenchTDT. +constexpr DataType index_data_type = DataType::NANOSECONDS_UTC64; + +auto make_single_block = [](const std::vector& data) { + return make_single_block_column(data, index_data_type); +}; +auto make_regular_blocks = [](const std::vector& data) { + return make_regular_blocks_column(data, index_data_type); +}; +auto make_irregular_blocks_1000 = [](const std::vector& data) { + return make_irregular_blocks_column(data, uniform_block_sizes(data.size(), 1000), index_data_type); +}; +auto make_irregular_blocks_1 = [](const std::vector& data) { + return make_irregular_blocks_column(data, uniform_block_sizes(data.size(), 1), index_data_type); +}; } // namespace diff --git a/cpp/arcticdb/column_store/test/rapidcheck_column.cpp b/cpp/arcticdb/column_store/test/rapidcheck_column.cpp index 9862d8fe01f..3ca8054fdd1 100644 --- a/cpp/arcticdb/column_store/test/rapidcheck_column.cpp +++ b/cpp/arcticdb/column_store/test/rapidcheck_column.cpp @@ -151,30 +151,35 @@ RC_GTEST_PROP(Column, SearchSorted, (const std::vector& input, int64_t auto n = sorted_input.size(); auto smallest_value = sorted_input[0]; auto largest_value = sorted_input[n - 1]; - using TDT = TypeDescriptorTag, DimensionTag>; - Column column(static_cast(TDT{}), 0, AllocationType::DYNAMIC, Sparsity::NOT_PERMITTED); - for (size_t idx = 0; idx < n; ++idx) { - column.set_scalar(idx, sorted_input[idx]); - } - auto left_idx = lower_bound_idx(column, value_to_find); - auto right_idx = upper_bound_idx(column, value_to_find); - RC_ASSERT(left_idx <= n); - RC_ASSERT(right_idx <= n); - if (left_idx == 0) { - RC_ASSERT(value_to_find <= smallest_value); - } else if (left_idx == n) { - RC_ASSERT(value_to_find > largest_value); - } else { - RC_ASSERT(value_to_find > sorted_input[left_idx - 1]); - RC_ASSERT(value_to_find <= sorted_input[left_idx]); - } - if (right_idx == 0) { - RC_ASSERT(value_to_find <= smallest_value); - } else if (right_idx == n) { - RC_ASSERT(value_to_find >= largest_value); - } else { - RC_ASSERT(value_to_find >= sorted_input[right_idx - 1]); - RC_ASSERT(value_to_find < sorted_input[right_idx]); + + // Run against single / regular / irregular block layouts so block-jumping is exercised, not just + // the contiguous case. + std::vector columns; + columns.push_back(make_single_block_column(sorted_input, DataType::INT64)); + columns.push_back(make_regular_blocks_column(sorted_input, DataType::INT64)); + columns.push_back(make_irregular_blocks_column(sorted_input, DataType::INT64)); + + for (const auto& column : columns) { + auto left_idx = lower_bound_idx(column, value_to_find); + auto right_idx = upper_bound_idx(column, value_to_find); + RC_ASSERT(left_idx <= n); + RC_ASSERT(right_idx <= n); + if (left_idx == 0) { + RC_ASSERT(value_to_find <= smallest_value); + } else if (left_idx == n) { + RC_ASSERT(value_to_find > largest_value); + } else { + RC_ASSERT(value_to_find > sorted_input[left_idx - 1]); + RC_ASSERT(value_to_find <= sorted_input[left_idx]); + } + if (right_idx == 0) { + RC_ASSERT(value_to_find <= smallest_value); + } else if (right_idx == n) { + RC_ASSERT(value_to_find >= largest_value); + } else { + RC_ASSERT(value_to_find >= sorted_input[right_idx - 1]); + RC_ASSERT(value_to_find < sorted_input[right_idx]); + } } } diff --git a/cpp/arcticdb/column_store/test/test_column.cpp b/cpp/arcticdb/column_store/test/test_column.cpp index cf03cf415a5..afc6942a9ad 100644 --- a/cpp/arcticdb/column_store/test/test_column.cpp +++ b/cpp/arcticdb/column_store/test/test_column.cpp @@ -279,12 +279,11 @@ TEST(ColumnData, Iterator) { } } -TEST(ColumnData, IteratorSkipsEmptyBlocks) { +TEST(ColumnData, IteratorSkipsTrailingEmptyBlock) { using namespace arcticdb; using TDT = TypeDescriptorTag, DimensionTag>; - // Trailing empty block Column col(static_cast(TDT{}), 0, AllocationType::DYNAMIC, Sparsity::PERMITTED); std::array data{10, 20, 30}; col.set_external_block(0, data.data(), data.size()); @@ -302,8 +301,14 @@ TEST(ColumnData, IteratorSkipsEmptyBlocks) { visited.push_back(*it); } EXPECT_EQ(visited, (std::vector{10, 20, 30})); +} + +TEST(ColumnData, IteratorOnAllEmptyColumn) { + using namespace arcticdb; - // All-empty column: a single zero-size external block. begin must compare equal to end. + using TDT = TypeDescriptorTag, DimensionTag>; + + // A single zero-size external block. begin must compare equal to end. Column empty_col(static_cast(TDT{}), 0, AllocationType::DYNAMIC, Sparsity::PERMITTED); empty_col.set_external_block(0, static_cast(nullptr), 0); ASSERT_EQ(empty_col.buffer().num_blocks(), 1u); @@ -477,56 +482,20 @@ namespace { using namespace arcticdb; using SearchTDT = TypeDescriptorTag, DimensionTag>; -void populate(Column& col, const std::vector& values) { - for (size_t i = 0; i < values.size(); ++i) { - col.reference_at(i) = values[i]; - } -} - -// Three column shapes exercise the three random_accessor paths: SINGLE / REGULAR / IRREGULAR. Column make_single_block(const std::vector& values) { - Column col( - static_cast(SearchTDT{}), values.size(), AllocationType::PRESIZED, Sparsity::NOT_PERMITTED - ); - populate(col, values); - return col; + return make_single_block_column(values, DataType::INT64); } Column make_regular_blocks(const std::vector& values) { - Column col( - static_cast(SearchTDT{}), - Sparsity::NOT_PERMITTED, - ChunkedBuffer::presized_in_blocks(values.size() * sizeof(int64_t)) - ); - populate(col, values); - return col; + return make_regular_blocks_column(values, DataType::INT64); } Column make_irregular_blocks(const std::vector& values, const std::vector& block_sizes) { - Column col(static_cast(SearchTDT{}), 0, AllocationType::DETACHABLE, Sparsity::NOT_PERMITTED); - for (size_t block_size : block_sizes) { - col.allocate_data(block_size * sizeof(int64_t)); - col.advance_data(block_size * sizeof(int64_t)); - } - populate(col, values); - return col; -} - -// Default irregular pattern: [1, 1, 1, 3, 1, 5, 1, 7, ...] — alternates 1-element and i-element blocks. -std::vector default_irregular_sizes(size_t total) { - std::vector sizes; - size_t remaining = total; - for (size_t i = 0; remaining > 0; ++i) { - size_t current = i % 2 == 0 ? 1 : i; - current = std::min(current, remaining); - sizes.push_back(current); - remaining -= current; - } - return sizes; + return make_irregular_blocks_column(values, block_sizes, DataType::INT64); } Column make_irregular_blocks(const std::vector& values) { - return make_irregular_blocks(values, default_irregular_sizes(values.size())); + return make_irregular_blocks_column(values, DataType::INT64); } // Cross-checks our search functions against std::lower_bound / upper_bound on the reference vector. @@ -585,10 +554,14 @@ TEST(ColumnSearch, BasicRegular) { auto column_data = col.data(); auto begin = column_data.cbegin(); auto end = column_data.cend(); + // 20 is duplicated at indices 4 and 5; lower_bound must land on the first (4) and upper_bound past + // the last (6). auto lb = lower_bound(begin, end, int64_t{20}); ASSERT_EQ(*lb, 20); + ASSERT_EQ(std::distance(begin, lb), 4); auto ub = upper_bound(begin, end, int64_t{20}); ASSERT_EQ(*ub, 25); + ASSERT_EQ(std::distance(begin, ub), 6); } TEST(ColumnSearch, BasicEnumerated) { diff --git a/cpp/arcticdb/util/test/test_utils.hpp b/cpp/arcticdb/util/test/test_utils.hpp index c9ec7d19b84..e00e996fc67 100644 --- a/cpp/arcticdb/util/test/test_utils.hpp +++ b/cpp/arcticdb/util/test/test_utils.hpp @@ -275,4 +275,74 @@ Column create_dense_column(const Input& data) { } result.set_row_data(data.size()); return result; +} + +// Column builders that lay the same values out across different block topologies, exercising the +// SINGLE / REGULAR / IRREGULAR random_accessor paths. `dt` selects the column's data type (e.g. INT64 +// or NANOSECONDS_UTC64); the raw type `T` must match its width. + +template +void populate_column_from_vector(Column& col, const std::vector& values) { + for (size_t i = 0; i < values.size(); ++i) { + col.reference_at(i) = values[i]; + } +} + +template +Column make_single_block_column(const std::vector& values, DataType dt) { + Column col(make_scalar_type(dt), values.size(), AllocationType::PRESIZED, Sparsity::NOT_PERMITTED); + populate_column_from_vector(col, values); + return col; +} + +template +Column make_regular_blocks_column(const std::vector& values, DataType dt) { + Column col( + make_scalar_type(dt), Sparsity::NOT_PERMITTED, ChunkedBuffer::presized_in_blocks(values.size() * sizeof(T)) + ); + populate_column_from_vector(col, values); + return col; +} + +// DETACHABLE allocation routes lookups through ChunkedBuffer::block_offsets_, so this stresses the +// irregular path regardless of whether the supplied block sizes are uniform. +template +Column make_irregular_blocks_column(const std::vector& values, const std::vector& block_sizes, DataType dt) { + Column col(make_scalar_type(dt), 0, AllocationType::DETACHABLE, Sparsity::NOT_PERMITTED); + for (size_t block_size : block_sizes) { + col.allocate_data(block_size * sizeof(T)); + col.advance_data(block_size * sizeof(T)); + } + populate_column_from_vector(col, values); + return col; +} + +// Default irregular pattern: [1, 1, 1, 3, 1, 5, 1, 7, ...] — alternates 1-element and i-element blocks. +inline std::vector default_irregular_block_sizes(size_t total) { + std::vector sizes; + size_t remaining = total; + for (size_t i = 0; remaining > 0; ++i) { + size_t current = i % 2 == 0 ? 1 : i; + current = std::min(current, remaining); + sizes.push_back(current); + remaining -= current; + } + return sizes; +} + +// Blocks of `block_size` elements each, with a smaller trailing block for the remainder. +inline std::vector uniform_block_sizes(size_t total, size_t block_size) { + std::vector sizes; + size_t remaining = total; + while (remaining > 0) { + const size_t current = std::min(remaining, block_size); + sizes.push_back(current); + remaining -= current; + } + return sizes; +} + +template +Column make_irregular_blocks_column(const std::vector& values, DataType dt) { + return make_irregular_blocks_column(values, default_irregular_block_sizes(values.size()), dt); } \ No newline at end of file