Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 14 additions & 3 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -32,9 +32,9 @@ _VENV_PIP := $(VENV_DIR)/$(VENV_NAME)/bin/pip
# ── Phony targets ────────────────────────────────────────────────────────────
.PHONY: help setup protoc venv activate lint lint-check \
build build-debug configure configure-debug \
test-cpp test-cpp-debug symlink symlink-debug \
test-cpp test-cpp-debug test-cpp-rapidcheck test-cpp-rapidcheck-debug symlink symlink-debug \
test-py build-and-test-py build-and-test-py-debug \
wheel bench-cpp bench-py install-editable
wheel bench-cpp bench-cpp-build bench-py install-editable

# ── help ─────────────────────────────────────────────────────────────────────
help: ## Show this help
Expand Down Expand Up @@ -136,6 +136,15 @@ test-cpp-debug: $(_DEBUG_BUILD_DIR)/.configure-stamp ## Build and run C++ unit t
cmake --build $(_DEBUG_BUILD_DIR) -j $(CMAKE_JOBS) --target test_unit_arcticdb
$(_DEBUG_BUILD_DIR)/arcticdb/test_unit_arcticdb $(if $(FILTER),--gtest_filter=$(FILTER))

# ── test-cpp-rapidcheck ──────────────────────────────────────────────────────
# Requires the configure stamp under $(_RELEASE_BUILD_DIR), builds the
# `arcticdb_rapidcheck_tests` CMake target there with $(CMAKE_JOBS) parallel jobs, then runs the
# resulting binary. When FILTER is non-empty it is forwarded as --gtest_filter=$(FILTER);
# when FILTER is empty no filter argument is passed.
test-cpp-rapidcheck: $(_RELEASE_BUILD_DIR)/.configure-stamp ## Build and run C++ rapidcheck tests (release, FILTER= for gtest_filter)
cmake --build $(_RELEASE_BUILD_DIR) -j $(CMAKE_JOBS) --target arcticdb_rapidcheck_tests
$(_RELEASE_BUILD_DIR)/arcticdb/arcticdb_rapidcheck_tests $(if $(FILTER),--gtest_filter=$(FILTER))

# Requires the configure stamp under $(_DEBUG_BUILD_DIR), builds the
# `arcticdb_rapidcheck_tests` CMake target there with $(CMAKE_JOBS) parallel jobs, then runs the
# resulting binary. When FILTER is non-empty it is forwarded as --gtest_filter=$(FILTER);
# when FILTER is empty no filter argument is passed.
test-cpp-rapidcheck-debug: $(_DEBUG_BUILD_DIR)/.configure-stamp ## Build and run C++ rapidcheck tests (debug, FILTER= for gtest_filter)
cmake --build $(_DEBUG_BUILD_DIR) -j $(CMAKE_JOBS) --target arcticdb_rapidcheck_tests
$(_DEBUG_BUILD_DIR)/arcticdb/arcticdb_rapidcheck_tests $(if $(FILTER),--gtest_filter=$(FILTER))

# ── symlink ──────────────────────────────────────────────────────────────────
_EXT_SUFFIX := $(shell python3 -c "import sysconfig; print(sysconfig.get_config_var('EXT_SUFFIX'))")

Expand Down Expand Up @@ -169,8 +178,10 @@ wheel: ## Build a pip wheel
$(PROXY_CMD) $(_VENV_PIP) wheel . --no-deps -w dist/

# ── bench-cpp ────────────────────────────────────────────────────────────────
bench-cpp: $(_RELEASE_BUILD_DIR)/.configure-stamp ## Build and run C++ benchmarks (release, FILTER= for benchmark_filter)
bench-cpp-build: $(_RELEASE_BUILD_DIR)/.configure-stamp ## Build C++ benchmarks without running (release)
cmake --build $(_RELEASE_BUILD_DIR) -j $(CMAKE_JOBS) --target benchmarks

bench-cpp: bench-cpp-build ## Build and run C++ benchmarks (release, FILTER= for benchmark_filter)
$(_RELEASE_BUILD_DIR)/arcticdb/benchmarks $(if $(FILTER),--benchmark_filter=$(FILTER))

# ── install-editable ─────────────────────────────────────────────────────────
Expand Down
22 changes: 12 additions & 10 deletions cpp/arcticdb/column_store/column_algorithms.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -355,14 +355,17 @@ typename TDT::DataTypeTag::raw_type value_at(const ColumnData::ColumnDataIterato
// For lower_bound that is `probe < value`; for upper_bound it is `probe <= value`.
// `within_block_bisect` is std::lower_bound or std::upper_bound run on the contiguous block memory.
template<typename TDT, IteratorType IT, IteratorDensity ID, typename IsBeforeAnswer, typename WithinBlockBisect>
requires(ID == IteratorDensity::DENSE) && (TDT::dimension() == Dimension::Dim0) &&
std::predicate<IsBeforeAnswer, typename TDT::DataTypeTag::raw_type, typename TDT::DataTypeTag::raw_type> &&
std::invocable<
WithinBlockBisect, const typename TDT::DataTypeTag::raw_type*,
const typename TDT::DataTypeTag::raw_type*, typename TDT::DataTypeTag::raw_type>
ColumnData::ColumnDataIterator<TDT, IT, ID, true> bound_search(
const ColumnData::ColumnDataIterator<TDT, IT, ID, true>& begin,
const ColumnData::ColumnDataIterator<TDT, IT, ID, true>& end, typename TDT::DataTypeTag::raw_type value,
IsBeforeAnswer is_before, WithinBlockBisect bisect
IsBeforeAnswer&& is_before, WithinBlockBisect&& bisect
) {
using RawType = typename TDT::DataTypeTag::raw_type;
static_assert(ID == IteratorDensity::DENSE, "Sorted search currently supports DENSE only");
static_assert(TDT::dimension() == Dimension::Dim0, "Sorted search supports Dim0 only");
util::check(begin.parent() == end.parent(), "bound_search: begin and end have different parents");

if (begin == end) {
Expand Down Expand Up @@ -414,11 +417,12 @@ ColumnData::ColumnDataIterator<TDT, IT, ID, true> bound_search(
// Gallop forward from `begin` in steps of 2**n until an element after value is reached.
// Returns the exponential range known to contain the first element for which `!is_before`.
template<typename TDT, IteratorType IT, IteratorDensity ID, typename IsBeforeAnswer>
std::pair<ColumnData::ColumnDataIterator<TDT, IT, ID, true>, ColumnData::ColumnDataIterator<TDT, IT, ID, true>>
gallop_bracket(
requires(ID == IteratorDensity::DENSE) && (TDT::dimension() == Dimension::Dim0) &&
std::predicate<IsBeforeAnswer, typename TDT::DataTypeTag::raw_type, typename TDT::DataTypeTag::raw_type>
std::pair<ColumnData::ColumnDataIterator<TDT, IT, ID, true>, ColumnData::ColumnDataIterator<TDT, IT, ID, true>> gallop_bracket(
const ColumnData::ColumnDataIterator<TDT, IT, ID, true>& begin,
const ColumnData::ColumnDataIterator<TDT, IT, ID, true>& end, typename TDT::DataTypeTag::raw_type value,
IsBeforeAnswer is_before
IsBeforeAnswer&& is_before
) {
using RawType = typename TDT::DataTypeTag::raw_type;
if (begin == end) {
Expand Down Expand Up @@ -481,8 +485,7 @@ gallop_bracket(
// We iterate while `first_offset + step < up_to - 1`, i.e. we stop short of the last element,
// because we'll later explicitly probe at the last element of the first block
const size_t up_to = end_block_idx > first_block_idx ? first_block_row_count : end_in_block_offset;
size_t step = 1;
for (; first_offset + step + 1 < up_to; step *= 2) {
for (size_t step = 1; first_offset + step + 1 < up_to; step *= 2) {
const size_t probe_offset = first_offset + step;
if (!record_probe_in_first_block(probe_offset + 1, first_block_data[probe_offset])) {
return {make_iter_in_first_block(prev_offset), make_iter_in_first_block(cur_offset)};
Expand All @@ -500,8 +503,7 @@ gallop_bracket(
}

// Answer is after the first block — probe the last elements of blocks at first_idx + 2**n
step = 1;
for (; first_block_idx + step < end_block_idx; step *= 2) {
for (size_t step = 1; first_block_idx + step < end_block_idx; step *= 2) {
const size_t block_idx = first_block_idx + step;
const RawType last_in_block = block_data_at(block_idx)[block_row_count_at(block_idx) - 1];
if (!record_probe(block_idx + 1, 0, last_in_block)) {
Expand Down
63 changes: 17 additions & 46 deletions cpp/arcticdb/column_store/test/benchmark_column.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
#include <benchmark/benchmark.h>
#include <arcticdb/column_store/column.hpp>
#include <arcticdb/column_store/column_algorithms.hpp>
#include <arcticdb/util/test/test_utils.hpp>

using namespace arcticdb;

Expand All @@ -25,7 +26,7 @@ static std::mt19937 gen(rd());

// ─── Sorted-search benchmarks across block layouts ────────────────────────────────────────────────
//
// Four column shapes — single-block (PRESIZED memcpy), regular blocks (presized_in_blocks),
// Four column shapes — single-block (PRESIZED), regular blocks (presized_in_blocks),
// irregular blocks of size 1000 (DETACHABLE), irregular blocks of size 1 (DETACHABLE).

namespace {
Expand All @@ -41,51 +42,21 @@ std::vector<timestamp> make_sorted_data(size_t num_rows, std::mt19937& rng) {
return data;
}

void populate(Column& col, const std::vector<timestamp>& data) {
for (size_t i = 0; i < data.size(); ++i) {
col.reference_at<timestamp>(i) = data[i];
}
}

Column make_single_block(const std::vector<timestamp>& data) {
Column col(
make_scalar_type(DataType::NANOSECONDS_UTC64),
data.size(),
AllocationType::PRESIZED,
Sparsity::NOT_PERMITTED
);
memcpy(col.ptr(), data.data(), data.size() * sizeof(timestamp));
col.set_row_data(data.size() - 1);
return col;
}

Column make_regular_blocks(const std::vector<timestamp>& data) {
Column col(
make_scalar_type(DataType::NANOSECONDS_UTC64),
Sparsity::NOT_PERMITTED,
ChunkedBuffer::presized_in_blocks(data.size() * sizeof(timestamp))
);
populate(col, data);
return col;
}

// DETACHABLE allocation routes lookups through ChunkedBuffer::block_offsets_ even with uniform
// block sizes, so these stress the irregular path while keeping block sizes consistent.
Column make_irregular_blocks(const std::vector<timestamp>& data, size_t block_size) {
Column col(make_scalar_type(DataType::NANOSECONDS_UTC64), 0, AllocationType::DETACHABLE, Sparsity::NOT_PERMITTED);
size_t remaining = data.size();
while (remaining > 0) {
const size_t alloc = std::min(remaining, block_size);
col.allocate_data(alloc * sizeof(timestamp));
col.advance_data(alloc * sizeof(timestamp));
remaining -= alloc;
}
populate(col, data);
return col;
}

auto make_irregular_blocks_1000 = [](const std::vector<timestamp>& data) { return make_irregular_blocks(data, 1000); };
auto make_irregular_blocks_1 = [](const std::vector<timestamp>& data) { return make_irregular_blocks(data, 1); };
// NANOSECONDS_UTC64 keeps the column type consistent with BenchTDT.
constexpr DataType index_data_type = DataType::NANOSECONDS_UTC64;

// Column factory for the "single-block" benchmark shape: forwards `data` to
// make_single_block_column<timestamp> with index_data_type as the column's data type.
// NOTE(review): the helper is defined outside this file; its exact allocation behaviour is
// assumed from its name — confirm against its definition.
auto make_single_block = [](const std::vector<timestamp>& data) {
return make_single_block_column<timestamp>(data, index_data_type);
};
// Column factory for the "regular blocks" benchmark shape: forwards `data` to
// make_regular_blocks_column<timestamp> with index_data_type as the column's data type.
// NOTE(review): the helper is defined outside this file; the resulting block layout is
// assumed from its name — confirm against its definition.
auto make_regular_blocks = [](const std::vector<timestamp>& data) {
return make_regular_blocks_column<timestamp>(data, index_data_type);
};
// Column factory for the "irregular blocks of size 1000" benchmark shape: forwards `data` to
// make_irregular_blocks_column<timestamp>, passing the block sizes produced by
// uniform_block_sizes(data.size(), 1000) and index_data_type as the column's data type.
// NOTE(review): presumably uniform_block_sizes splits data.size() rows into blocks of at most
// 1000 rows — it is defined outside this file; confirm against its definition.
auto make_irregular_blocks_1000 = [](const std::vector<timestamp>& data) {
return make_irregular_blocks_column<timestamp>(data, uniform_block_sizes(data.size(), 1000), index_data_type);
};
// Column factory for the "irregular blocks of size 1" benchmark shape: forwards `data` to
// make_irregular_blocks_column<timestamp>, passing the block sizes produced by
// uniform_block_sizes(data.size(), 1) and index_data_type as the column's data type.
// NOTE(review): presumably this yields one block per row — uniform_block_sizes is defined
// outside this file; confirm against its definition.
auto make_irregular_blocks_1 = [](const std::vector<timestamp>& data) {
return make_irregular_blocks_column<timestamp>(data, uniform_block_sizes(data.size(), 1), index_data_type);
};

} // namespace

Expand Down
53 changes: 29 additions & 24 deletions cpp/arcticdb/column_store/test/rapidcheck_column.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -151,30 +151,35 @@ RC_GTEST_PROP(Column, SearchSorted, (const std::vector<int64_t>& input, int64_t
auto n = sorted_input.size();
auto smallest_value = sorted_input[0];
auto largest_value = sorted_input[n - 1];
using TDT = TypeDescriptorTag<DataTypeTag<DataType::INT64>, DimensionTag<Dimension::Dim0>>;
Column column(static_cast<TypeDescriptor>(TDT{}), 0, AllocationType::DYNAMIC, Sparsity::NOT_PERMITTED);
for (size_t idx = 0; idx < n; ++idx) {
column.set_scalar<int64_t>(idx, sorted_input[idx]);
}
auto left_idx = lower_bound_idx<int64_t>(column, value_to_find);
auto right_idx = upper_bound_idx<int64_t>(column, value_to_find);
RC_ASSERT(left_idx <= n);
RC_ASSERT(right_idx <= n);
if (left_idx == 0) {
RC_ASSERT(value_to_find <= smallest_value);
} else if (left_idx == n) {
RC_ASSERT(value_to_find > largest_value);
} else {
RC_ASSERT(value_to_find > sorted_input[left_idx - 1]);
RC_ASSERT(value_to_find <= sorted_input[left_idx]);
}
if (right_idx == 0) {
RC_ASSERT(value_to_find <= smallest_value);
} else if (right_idx == n) {
RC_ASSERT(value_to_find >= largest_value);
} else {
RC_ASSERT(value_to_find >= sorted_input[right_idx - 1]);
RC_ASSERT(value_to_find < sorted_input[right_idx]);

// Run against single / regular / irregular block layouts so block-jumping is exercised, not just
// the contiguous case.
std::vector<Column> columns;
columns.push_back(make_single_block_column<int64_t>(sorted_input, DataType::INT64));
columns.push_back(make_regular_blocks_column<int64_t>(sorted_input, DataType::INT64));
columns.push_back(make_irregular_blocks_column<int64_t>(sorted_input, DataType::INT64));

for (const auto& column : columns) {
auto left_idx = lower_bound_idx<int64_t>(column, value_to_find);
auto right_idx = upper_bound_idx<int64_t>(column, value_to_find);
RC_ASSERT(left_idx <= n);
RC_ASSERT(right_idx <= n);
if (left_idx == 0) {
RC_ASSERT(value_to_find <= smallest_value);
} else if (left_idx == n) {
RC_ASSERT(value_to_find > largest_value);
} else {
RC_ASSERT(value_to_find > sorted_input[left_idx - 1]);
RC_ASSERT(value_to_find <= sorted_input[left_idx]);
}
if (right_idx == 0) {
RC_ASSERT(value_to_find <= smallest_value);
} else if (right_idx == n) {
RC_ASSERT(value_to_find >= largest_value);
} else {
RC_ASSERT(value_to_find >= sorted_input[right_idx - 1]);
RC_ASSERT(value_to_find < sorted_input[right_idx]);
}
}
}

Expand Down
59 changes: 16 additions & 43 deletions cpp/arcticdb/column_store/test/test_column.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -279,12 +279,11 @@ TEST(ColumnData, Iterator) {
}
}

TEST(ColumnData, IteratorSkipsEmptyBlocks) {
TEST(ColumnData, IteratorSkipsTrailingEmptyBlock) {
using namespace arcticdb;

using TDT = TypeDescriptorTag<DataTypeTag<DataType::INT64>, DimensionTag<Dimension::Dim0>>;

// Trailing empty block
Column col(static_cast<TypeDescriptor>(TDT{}), 0, AllocationType::DYNAMIC, Sparsity::PERMITTED);
std::array<int64_t, 3> data{10, 20, 30};
col.set_external_block(0, data.data(), data.size());
Expand All @@ -302,8 +301,14 @@ TEST(ColumnData, IteratorSkipsEmptyBlocks) {
visited.push_back(*it);
}
EXPECT_EQ(visited, (std::vector<int64_t>{10, 20, 30}));
}

TEST(ColumnData, IteratorOnAllEmptyColumn) {
using namespace arcticdb;

// All-empty column: a single zero-size external block. begin must compare equal to end.
using TDT = TypeDescriptorTag<DataTypeTag<DataType::INT64>, DimensionTag<Dimension::Dim0>>;

// A single zero-size external block. begin must compare equal to end.
Column empty_col(static_cast<TypeDescriptor>(TDT{}), 0, AllocationType::DYNAMIC, Sparsity::PERMITTED);
empty_col.set_external_block(0, static_cast<int64_t*>(nullptr), 0);
ASSERT_EQ(empty_col.buffer().num_blocks(), 1u);
Expand Down Expand Up @@ -477,56 +482,20 @@ namespace {
using namespace arcticdb;
using SearchTDT = TypeDescriptorTag<DataTypeTag<DataType::INT64>, DimensionTag<Dimension::Dim0>>;

void populate(Column& col, const std::vector<int64_t>& values) {
for (size_t i = 0; i < values.size(); ++i) {
col.reference_at<int64_t>(i) = values[i];
}
}

// Three column shapes exercise the three random_accessor paths: SINGLE / REGULAR / IRREGULAR.
Column make_single_block(const std::vector<int64_t>& values) {
Column col(
static_cast<TypeDescriptor>(SearchTDT{}), values.size(), AllocationType::PRESIZED, Sparsity::NOT_PERMITTED
);
populate(col, values);
return col;
return make_single_block_column<int64_t>(values, DataType::INT64);
}

Column make_regular_blocks(const std::vector<int64_t>& values) {
Column col(
static_cast<TypeDescriptor>(SearchTDT{}),
Sparsity::NOT_PERMITTED,
ChunkedBuffer::presized_in_blocks(values.size() * sizeof(int64_t))
);
populate(col, values);
return col;
return make_regular_blocks_column<int64_t>(values, DataType::INT64);
}

Column make_irregular_blocks(const std::vector<int64_t>& values, const std::vector<size_t>& block_sizes) {
Column col(static_cast<TypeDescriptor>(SearchTDT{}), 0, AllocationType::DETACHABLE, Sparsity::NOT_PERMITTED);
for (size_t block_size : block_sizes) {
col.allocate_data(block_size * sizeof(int64_t));
col.advance_data(block_size * sizeof(int64_t));
}
populate(col, values);
return col;
}

// Default irregular pattern: [1, 1, 1, 3, 1, 5, 1, 7, ...] — alternates 1-element and i-element blocks.
std::vector<size_t> default_irregular_sizes(size_t total) {
std::vector<size_t> sizes;
size_t remaining = total;
for (size_t i = 0; remaining > 0; ++i) {
size_t current = i % 2 == 0 ? 1 : i;
current = std::min(current, remaining);
sizes.push_back(current);
remaining -= current;
}
return sizes;
return make_irregular_blocks_column<int64_t>(values, block_sizes, DataType::INT64);
}

Column make_irregular_blocks(const std::vector<int64_t>& values) {
return make_irregular_blocks(values, default_irregular_sizes(values.size()));
return make_irregular_blocks_column<int64_t>(values, DataType::INT64);
}

// Cross-checks our search functions against std::lower_bound / upper_bound on the reference vector.
Expand Down Expand Up @@ -585,10 +554,14 @@ TEST(ColumnSearch, BasicRegular) {
auto column_data = col.data();
auto begin = column_data.cbegin<SearchTDT, IteratorType::REGULAR, IteratorDensity::DENSE>();
auto end = column_data.cend<SearchTDT, IteratorType::REGULAR, IteratorDensity::DENSE>();
// 20 is duplicated at indices 4 and 5; lower_bound must land on the first (4) and upper_bound past
// the last (6).
auto lb = lower_bound<SearchTDT, IteratorType::REGULAR, IteratorDensity::DENSE>(begin, end, int64_t{20});
ASSERT_EQ(*lb, 20);
ASSERT_EQ(std::distance(begin, lb), 4);
auto ub = upper_bound<SearchTDT, IteratorType::REGULAR, IteratorDensity::DENSE>(begin, end, int64_t{20});
ASSERT_EQ(*ub, 25);
ASSERT_EQ(std::distance(begin, ub), 6);
}

TEST(ColumnSearch, BasicEnumerated) {
Expand Down
Loading
Loading