From c20e253906b2c20e50d394a03761504ac4a27be3 Mon Sep 17 00:00:00 2001 From: Arne Juul Date: Sun, 19 Oct 2025 22:23:31 +0000 Subject: [PATCH 1/9] Add FieldSplitter to normalize multi-field query terms into per-field instances After view resolution, query terms may reference multiple index fields. Downstream blueprint building prefers single-field terms. FieldSplitter bridges this gap by splitting multi-field terms into OR of per-field copies early in the query processing pipeline. Handles all query node types: simple terms, Phrase, Equiv, SameElement, NEAR/ONEAR, and multi-term nodes (WeightedSet, DotProduct, WandTerm, InTerm, WordAlternatives). Falls back to the original tree on errors. Add debug logging in BlueprintBuilder for empty blueprint cases. Includes comprehensive unit tests for all node types and edge cases. --- .../src/tests/proton/matching/CMakeLists.txt | 8 + .../proton/matching/field_splitter_test.cpp | 651 ++++++++++++++ .../searchcore/proton/matching/CMakeLists.txt | 1 + .../proton/matching/blueprintbuilder.cpp | 6 + .../proton/matching/field_splitter.cpp | 810 ++++++++++++++++++ .../proton/matching/field_splitter.h | 52 ++ 6 files changed, 1528 insertions(+) create mode 100644 searchcore/src/tests/proton/matching/field_splitter_test.cpp create mode 100644 searchcore/src/vespa/searchcore/proton/matching/field_splitter.cpp create mode 100644 searchcore/src/vespa/searchcore/proton/matching/field_splitter.h diff --git a/searchcore/src/tests/proton/matching/CMakeLists.txt b/searchcore/src/tests/proton/matching/CMakeLists.txt index 52b3579a0a9..cbaaeca6ad8 100644 --- a/searchcore/src/tests/proton/matching/CMakeLists.txt +++ b/searchcore/src/tests/proton/matching/CMakeLists.txt @@ -74,3 +74,11 @@ vespa_add_executable(searchcore_tag_needed_handles_test_app TEST GTest::gtest ) vespa_add_test(NAME searchcore_tag_needed_handles_test_app COMMAND searchcore_tag_needed_handles_test_app) +vespa_add_executable(searchcore_field_splitter_test_app TEST + SOURCES + field_splitter_test.cpp + DEPENDS + searchcore_matching + GTest::gtest +) +vespa_add_test(NAME searchcore_field_splitter_test_app COMMAND searchcore_field_splitter_test_app) diff --git a/searchcore/src/tests/proton/matching/field_splitter_test.cpp b/searchcore/src/tests/proton/matching/field_splitter_test.cpp new file mode 100644 index 00000000000..6f17690d90b --- /dev/null +++ b/searchcore/src/tests/proton/matching/field_splitter_test.cpp @@ -0,0 +1,651 @@ +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +// Comprehensive unit tests for FieldSplitter + +#include +LOG_SETUP("field_splitter_test"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using search::fef::FieldInfo; +using search::fef::FieldType; +using search::fef::test::IndexEnvironment; +using search::query::Node; +using search::query::QueryBuilder; +using search::query::Weight; +using search::query::WeightedStringTermVector; +using std::string; +using namespace proton::matching; +using CollectionType = FieldInfo::CollectionType; + +namespace { + +// Test constants +const string TERM = "test_term"; +const string VIEW = "test_view"; +const string FIELD1 = "field1"; +const string FIELD2 = "field2"; +const string FIELD3 = "field3"; +const uint32_t TERM_ID = 42; +const Weight TERM_WEIGHT(100); + +//============================================================================== +// Test Fixture +//============================================================================== + +class FieldSplitterTest : public ::testing::Test { +protected: + IndexEnvironment index_env; + ViewResolver view_resolver; + + FieldSplitterTest() { + // Set up index environment with fields + index_env.getFields().emplace_back(FieldType::INDEX, CollectionType::SINGLE, FIELD1, 0); + index_env.getFields().emplace_back(FieldType::INDEX, CollectionType::SINGLE, FIELD2, 1); + index_env.getFields().emplace_back(FieldType::INDEX, CollectionType::SINGLE, FIELD3, 2); + + // Set up view resolver to map VIEW to multiple fields + view_resolver.add(VIEW, FIELD1); + view_resolver.add(VIEW, FIELD2); + view_resolver.add(VIEW, FIELD3); + } + + ~FieldSplitterTest() override = default; + + // Helper to resolve views on a node + void resolveViews(Node& node) { + ResolveViewVisitor visitor(view_resolver, index_env); + node.accept(visitor); + } + + // Helper to build and split a query + Node::UP buildAndSplit(Node::UP node) { + resolveViews(*node); + return FieldSplitter::split_terms(std::move(node)); + } +}; + +//============================================================================== +// Simple Term Tests +//============================================================================== + +TEST_F(FieldSplitterTest, single_field_string_term_not_split) { + QueryBuilder builder; + builder.addStringTerm(TERM, FIELD1, TERM_ID, TERM_WEIGHT); + Node::UP root = builder.build(); + + Node::UP result = buildAndSplit(std::move(root)); + + ASSERT_TRUE(result); + auto* term_node = dynamic_cast(result.get()); + ASSERT_TRUE(term_node); + EXPECT_EQ(1u, term_node->numFields()); + EXPECT_EQ(FIELD1, term_node->field(0).getName()); +} + +TEST_F(FieldSplitterTest, multi_field_view_splits_term_into_or) { + QueryBuilder builder; + builder.addStringTerm(TERM, VIEW, TERM_ID, TERM_WEIGHT); + Node::UP root = builder.build(); + + Node::UP result = buildAndSplit(std::move(root)); + + ASSERT_TRUE(result); + auto* or_node = dynamic_cast(result.get()); + ASSERT_TRUE(or_node); + EXPECT_EQ(3u, or_node->getChildren().size()); // VIEW maps to 3 fields + + // Check first child + auto* term1 = dynamic_cast(or_node->getChildren()[0]); + ASSERT_TRUE(term1); + EXPECT_EQ(TERM, term1->getTerm()); + EXPECT_EQ(1u, term1->numFields()); + EXPECT_EQ(FIELD1, term1->field(0).getName()); + + // Check second child + auto* term2 = dynamic_cast(or_node->getChildren()[1]); + ASSERT_TRUE(term2); + EXPECT_EQ(TERM, term2->getTerm()); + EXPECT_EQ(1u, term2->numFields()); + EXPECT_EQ(FIELD2, term2->field(0).getName()); + + // Check third child + auto* term3 = dynamic_cast(or_node->getChildren()[2]); + ASSERT_TRUE(term3); + EXPECT_EQ(TERM, term3->getTerm()); + EXPECT_EQ(1u, term3->numFields()); + EXPECT_EQ(FIELD3, term3->field(0).getName()); +} + +TEST_F(FieldSplitterTest, number_term_with_multi_field_view_splits) { + QueryBuilder builder; + builder.addNumberTerm("123", VIEW, TERM_ID, TERM_WEIGHT); + Node::UP root = builder.build(); + + Node::UP result = buildAndSplit(std::move(root)); + + ASSERT_TRUE(result); + auto* or_node = dynamic_cast(result.get()); + ASSERT_TRUE(or_node); + EXPECT_EQ(3u, or_node->getChildren().size()); +} + +//============================================================================== +// Phrase Tests +//============================================================================== + +TEST_F(FieldSplitterTest, phrase_with_single_field_not_split) { + QueryBuilder builder; + builder.addPhrase(2, FIELD1, TERM_ID, TERM_WEIGHT); + builder.addStringTerm("hello", FIELD1, 1, TERM_WEIGHT); + builder.addStringTerm("world", FIELD1, 2, TERM_WEIGHT); + Node::UP root = builder.build(); + + Node::UP result = buildAndSplit(std::move(root)); + + ASSERT_TRUE(result); + auto* phrase_node = dynamic_cast(result.get()); + ASSERT_TRUE(phrase_node); + EXPECT_EQ(1u, phrase_node->numFields()); + EXPECT_EQ(2u, phrase_node->getChildren().size()); +} + +TEST_F(FieldSplitterTest, phrase_with_multi_field_view_splits) { + QueryBuilder builder; + builder.addPhrase(2, VIEW, TERM_ID, TERM_WEIGHT); + builder.addStringTerm("hello", VIEW, 1, TERM_WEIGHT); + builder.addStringTerm("world", VIEW, 2, TERM_WEIGHT); + Node::UP root = builder.build(); + + Node::UP result = buildAndSplit(std::move(root)); + + ASSERT_TRUE(result); + auto* or_node = dynamic_cast(result.get()); + ASSERT_TRUE(or_node); + EXPECT_EQ(3u, or_node->getChildren().size()); // VIEW maps to 3 fields + + // Check first phrase + auto* phrase1 = dynamic_cast(or_node->getChildren()[0]); + ASSERT_TRUE(phrase1); + EXPECT_EQ(1u, phrase1->numFields()); + EXPECT_EQ(FIELD1, phrase1->field(0).getName()); + EXPECT_EQ(2u, phrase1->getChildren().size()); + + // Check second phrase + auto* phrase2 = dynamic_cast(or_node->getChildren()[1]); + ASSERT_TRUE(phrase2); + EXPECT_EQ(1u, phrase2->numFields()); + EXPECT_EQ(FIELD2, phrase2->field(0).getName()); + EXPECT_EQ(2u, phrase2->getChildren().size()); +} + +//============================================================================== +// Intermediate Node Tests (AND, OR, etc.) +//============================================================================== + +TEST_F(FieldSplitterTest, and_node_preserves_structure) { + QueryBuilder builder; + builder.addAnd(2); + builder.addStringTerm("term1", VIEW, 1, TERM_WEIGHT); // Will split + builder.addStringTerm("term2", FIELD1, 2, TERM_WEIGHT); // Won't split + Node::UP root = builder.build(); + + Node::UP result = buildAndSplit(std::move(root)); + + ASSERT_TRUE(result); + auto* and_node = dynamic_cast(result.get()); + ASSERT_TRUE(and_node); + EXPECT_EQ(2u, and_node->getChildren().size()); + + // First child should be OR (split term with multi-field VIEW) + auto* or_node = dynamic_cast(and_node->getChildren()[0]); + ASSERT_TRUE(or_node); + EXPECT_EQ(3u, or_node->getChildren().size()); + + // Second child should be single term (not split) + auto* term_node = dynamic_cast(and_node->getChildren()[1]); + ASSERT_TRUE(term_node); + EXPECT_EQ(1u, term_node->numFields()); +} + +//============================================================================== +// Equiv Node Tests +//============================================================================== + +TEST_F(FieldSplitterTest, equiv_with_single_field_not_split) { + QueryBuilder builder; + builder.addEquiv(2, TERM_ID, TERM_WEIGHT); + builder.addStringTerm("synonym1", FIELD1, 1, TERM_WEIGHT); + builder.addStringTerm("synonym2", FIELD1, 2, TERM_WEIGHT); + Node::UP root = builder.build(); + + Node::UP result = buildAndSplit(std::move(root)); + + ASSERT_TRUE(result); + auto* equiv_node = dynamic_cast(result.get()); + ASSERT_TRUE(equiv_node); + EXPECT_EQ(1u, equiv_node->numFields()); + EXPECT_EQ(2u, equiv_node->getChildren().size()); +} + +TEST_F(FieldSplitterTest, equiv_with_multi_field_view_splits) { + QueryBuilder builder; + builder.addEquiv(2, TERM_ID, TERM_WEIGHT); + builder.addStringTerm("term1", VIEW, 1, TERM_WEIGHT); + builder.addStringTerm("term2", VIEW, 2, TERM_WEIGHT); + Node::UP root = builder.build(); + + Node::UP result = buildAndSplit(std::move(root)); + + ASSERT_TRUE(result); + // Should create OR with 3 Equiv nodes (one per field) + auto* or_node = dynamic_cast(result.get()); + ASSERT_TRUE(or_node); + EXPECT_EQ(3u, or_node->getChildren().size()); + + // Each child should be an Equiv node + for (auto* child : or_node->getChildren()) { + auto* equiv = dynamic_cast(child); + ASSERT_TRUE(equiv); + EXPECT_EQ(1u, equiv->numFields()); + } +} + +//============================================================================== +// SameElement Node Tests +//============================================================================== + +TEST_F(FieldSplitterTest, same_element_with_single_field_not_split) { + QueryBuilder builder; + builder.addSameElement(2, FIELD1, TERM_ID, TERM_WEIGHT); + builder.addStringTerm("term1", FIELD1, 1, TERM_WEIGHT); + builder.addStringTerm("term2", FIELD1, 2, TERM_WEIGHT); + Node::UP root = builder.build(); + + Node::UP result = buildAndSplit(std::move(root)); + + ASSERT_TRUE(result); + auto* same_elem_node = dynamic_cast(result.get()); + ASSERT_TRUE(same_elem_node); + EXPECT_EQ(1u, same_elem_node->numFields()); + EXPECT_EQ(2u, same_elem_node->getChildren().size()); +} + +TEST_F(FieldSplitterTest, same_element_with_multi_field_view_splits) { + QueryBuilder builder; + builder.addSameElement(2, VIEW, TERM_ID, TERM_WEIGHT); + builder.addStringTerm("term1", VIEW, 1, TERM_WEIGHT); + builder.addStringTerm("term2", VIEW, 2, TERM_WEIGHT); + Node::UP root = builder.build(); + + Node::UP result = buildAndSplit(std::move(root)); + + ASSERT_TRUE(result); + auto* or_node = dynamic_cast(result.get()); + ASSERT_TRUE(or_node); + EXPECT_EQ(3u, or_node->getChildren().size()); + + // Each child should be a SameElement + for (auto* child : or_node->getChildren()) { + auto* same_elem = dynamic_cast(child); + ASSERT_TRUE(same_elem); + EXPECT_EQ(2u, same_elem->getChildren().size()); + } +} + +//============================================================================== +// Multi-term Node Tests +//============================================================================== + +TEST_F(FieldSplitterTest, weighted_set_term_with_multi_field_view_splits) { + QueryBuilder builder; + + auto terms = std::make_unique(2); + terms->addTerm("value1", Weight(10)); + terms->addTerm("value2", Weight(20)); + + builder.addWeightedSetTerm( + std::move(terms), + search::query::MultiTerm::Type::WEIGHTED_STRING, + VIEW, TERM_ID, TERM_WEIGHT); + + Node::UP root = builder.build(); + + Node::UP result = buildAndSplit(std::move(root)); + + ASSERT_TRUE(result); + auto* or_node = dynamic_cast(result.get()); + ASSERT_TRUE(or_node); + EXPECT_EQ(3u, or_node->getChildren().size()); + + for (auto* child : or_node->getChildren()) { + auto* wset_node = dynamic_cast(child); + ASSERT_TRUE(wset_node); + EXPECT_EQ(1u, wset_node->numFields()); + EXPECT_EQ(2u, wset_node->getNumTerms()); + } +} + +TEST_F(FieldSplitterTest, dot_product_with_single_field_not_split) { + QueryBuilder builder; + + auto terms = std::make_unique(1); + terms->addTerm("value1", Weight(10)); + + builder.addDotProduct( + std::move(terms), + search::query::MultiTerm::Type::WEIGHTED_STRING, + FIELD1, TERM_ID, TERM_WEIGHT); + + Node::UP root = builder.build(); + + Node::UP result = buildAndSplit(std::move(root)); + + ASSERT_TRUE(result); + auto* dotprod_node = dynamic_cast(result.get()); + ASSERT_TRUE(dotprod_node); + EXPECT_EQ(1u, dotprod_node->numFields()); +} + +//============================================================================== +// Near and ONear Node Tests +//============================================================================== + +TEST_F(FieldSplitterTest, near_with_single_field_not_split) { + QueryBuilder builder; + builder.addNear(2, 10, 0, 0); // distance=10, no negative terms + builder.addStringTerm("term1", FIELD1, 1, TERM_WEIGHT); + builder.addStringTerm("term2", FIELD1, 2, TERM_WEIGHT); + Node::UP root = builder.build(); + + Node::UP result = buildAndSplit(std::move(root)); + + ASSERT_TRUE(result); + auto* near_node = dynamic_cast(result.get()); + ASSERT_TRUE(near_node); + EXPECT_EQ(2u, near_node->getChildren().size()); + EXPECT_EQ(10u, near_node->getDistance()); +} + +TEST_F(FieldSplitterTest, near_with_multi_field_view_splits) { + QueryBuilder builder; + builder.addNear(2, 10, 0, 0); // distance=10, no negative terms + builder.addStringTerm("term1", VIEW, 1, TERM_WEIGHT); + builder.addStringTerm("term2", VIEW, 2, TERM_WEIGHT); + Node::UP root = builder.build(); + + Node::UP result = buildAndSplit(std::move(root)); + + ASSERT_TRUE(result); + // Should create OR with 3 Near nodes (one per field) + auto* or_node = dynamic_cast(result.get()); + ASSERT_TRUE(or_node); + EXPECT_EQ(3u, or_node->getChildren().size()); + + // Each child should be a Near node with correct distance + for (auto* child : or_node->getChildren()) { + auto* near = dynamic_cast(child); + ASSERT_TRUE(near); + EXPECT_EQ(2u, near->getChildren().size()); + EXPECT_EQ(10u, near->getDistance()); + } +} + +TEST_F(FieldSplitterTest, onear_with_single_field_not_split) { + QueryBuilder builder; + builder.addONear(2, 10, 0, 0); // distance=10, no negative terms + builder.addStringTerm("term1", FIELD1, 1, TERM_WEIGHT); + builder.addStringTerm("term2", FIELD1, 2, TERM_WEIGHT); + Node::UP root = builder.build(); + + Node::UP result = buildAndSplit(std::move(root)); + + ASSERT_TRUE(result); + auto* onear_node = dynamic_cast(result.get()); + ASSERT_TRUE(onear_node); + EXPECT_EQ(2u, onear_node->getChildren().size()); + EXPECT_EQ(10u, onear_node->getDistance()); +} + +TEST_F(FieldSplitterTest, onear_with_multi_field_view_splits) { + QueryBuilder builder; + builder.addONear(2, 10, 0, 0); // distance=10, no negative terms + builder.addStringTerm("term1", VIEW, 1, TERM_WEIGHT); + builder.addStringTerm("term2", VIEW, 2, TERM_WEIGHT); + Node::UP root = builder.build(); + + Node::UP result = buildAndSplit(std::move(root)); + + ASSERT_TRUE(result); + // Should create OR with 3 ONear nodes (one per field) + auto* or_node = dynamic_cast(result.get()); + ASSERT_TRUE(or_node); + EXPECT_EQ(3u, or_node->getChildren().size()); + + // Each child should be an ONear node with correct distance + for (auto* child : or_node->getChildren()) { + auto* onear = dynamic_cast(child); + ASSERT_TRUE(onear); + EXPECT_EQ(2u, onear->getChildren().size()); + EXPECT_EQ(10u, onear->getDistance()); + } +} + +TEST_F(FieldSplitterTest, near_with_mixed_fields_splits_correctly) { + // Create a NEAR where children have overlapping but different field sets + // This tests that only children with the same field are grouped together + QueryBuilder builder; + builder.addNear(3, 10, 0, 0); // distance=10, no negative terms + builder.addStringTerm("term1", FIELD1, 1, TERM_WEIGHT); + builder.addStringTerm("term2", FIELD1, 2, TERM_WEIGHT); + builder.addStringTerm("term3", FIELD2, 3, TERM_WEIGHT); + Node::UP root = builder.build(); + + Node::UP result = buildAndSplit(std::move(root)); + + ASSERT_TRUE(result); + // Should create OR with 2 Near nodes (one for field1, one for field2) + auto* or_node = dynamic_cast(result.get()); + ASSERT_TRUE(or_node); + EXPECT_EQ(2u, or_node->getChildren().size()); + + // First Near should have 2 children (term1 and term2 for field1) + auto* near1 = dynamic_cast(or_node->getChildren()[0]); + ASSERT_TRUE(near1); + EXPECT_EQ(2u, near1->getChildren().size()); + + // Second Near should have 1 child (term3 for field2) + auto* near2 = dynamic_cast(or_node->getChildren()[1]); + ASSERT_TRUE(near2); + EXPECT_EQ(1u, near2->getChildren().size()); +} + +TEST_F(FieldSplitterTest, onear_with_three_terms_multi_field_splits) { + QueryBuilder builder; + builder.addONear(3, 5, 0, 0); // distance=5, 3 terms, no negative terms + builder.addStringTerm("term1", VIEW, 1, TERM_WEIGHT); + builder.addStringTerm("term2", VIEW, 2, TERM_WEIGHT); + builder.addStringTerm("term3", VIEW, 3, TERM_WEIGHT); + Node::UP root = builder.build(); + + Node::UP result = buildAndSplit(std::move(root)); + + ASSERT_TRUE(result); + auto* or_node = dynamic_cast(result.get()); + ASSERT_TRUE(or_node); + EXPECT_EQ(3u, or_node->getChildren().size()); + + // Each child should be an ONear with 3 children + for (auto* child : or_node->getChildren()) { + auto* onear = dynamic_cast(child); + ASSERT_TRUE(onear); + EXPECT_EQ(3u, onear->getChildren().size()); + EXPECT_EQ(5u, onear->getDistance()); + } +} + +TEST_F(FieldSplitterTest, near_with_word_alternatives_and_equiv_splits) { + QueryBuilder builder; + + // Create NEAR with WordAlternatives and Equiv children + builder.addNear(2, 10, 0, 0); // distance=10, no negative terms + { + // First child: WordAlternatives + std::vector> alternatives; + alternatives.push_back(std::make_unique("alt1", VIEW, 1, TERM_WEIGHT)); + alternatives.push_back(std::make_unique("alt2", VIEW, 2, TERM_WEIGHT)); + builder.add_word_alternatives(std::move(alternatives), VIEW, 3, TERM_WEIGHT); + } + { + // Second child: Equiv + builder.addEquiv(2, 4, TERM_WEIGHT); + builder.addStringTerm("syn1", VIEW, 5, TERM_WEIGHT); + builder.addStringTerm("syn2", VIEW, 6, TERM_WEIGHT); + } + + Node::UP root = builder.build(); + + Node::UP result = buildAndSplit(std::move(root)); + + ASSERT_TRUE(result); + // Should create OR with 3 Near nodes (one per field) + auto* or_node = dynamic_cast(result.get()); + ASSERT_TRUE(or_node); + EXPECT_EQ(3u, or_node->getChildren().size()); + + // Each child should be a Near node with 2 children (WordAlternatives and Equiv) + for (auto* child : or_node->getChildren()) { + auto* near = dynamic_cast(child); + ASSERT_TRUE(near); + EXPECT_EQ(2u, near->getChildren().size()); + EXPECT_EQ(10u, near->getDistance()); + + // First child should be WordAlternatives with single field + auto* word_alt = dynamic_cast(near->getChildren()[0]); + ASSERT_TRUE(word_alt); + EXPECT_EQ(1u, word_alt->numFields()); + EXPECT_EQ(2u, word_alt->getNumTerms()); + + // Second child should be Equiv with single field + auto* equiv = dynamic_cast(near->getChildren()[1]); + ASSERT_TRUE(equiv); + EXPECT_EQ(1u, equiv->numFields()); + EXPECT_EQ(2u, equiv->getChildren().size()); + } +} + +//============================================================================== +// Complex Scenarios +//============================================================================== + +TEST_F(FieldSplitterTest, complex_query_with_and_or_phrases) { + QueryBuilder builder; + builder.addAnd(2); + { + // First child: multi-field term + builder.addStringTerm("term1", VIEW, 1, TERM_WEIGHT); + } + { + // Second child: phrase with multi-field view + builder.addPhrase(2, VIEW, 2, TERM_WEIGHT); + builder.addStringTerm("hello", VIEW, 3, TERM_WEIGHT); + builder.addStringTerm("world", VIEW, 4, TERM_WEIGHT); + } + + Node::UP root = builder.build(); + + Node::UP result = buildAndSplit(std::move(root)); + + ASSERT_TRUE(result); + auto* and_node = dynamic_cast(result.get()); + ASSERT_TRUE(and_node); + EXPECT_EQ(2u, and_node->getChildren().size()); + + // Both children should be OR nodes + auto* or1 = dynamic_cast(and_node->getChildren()[0]); + ASSERT_TRUE(or1); + EXPECT_EQ(3u, or1->getChildren().size()); + + auto* or2 = dynamic_cast(and_node->getChildren()[1]); + ASSERT_TRUE(or2); + EXPECT_EQ(3u, or2->getChildren().size()); +} + +//============================================================================== +// Edge Cases and Error Handling +//============================================================================== + +TEST_F(FieldSplitterTest, term_with_no_fields_handled) { + QueryBuilder builder; + builder.addStringTerm(TERM, "nonexistent_view", TERM_ID, TERM_WEIGHT); + Node::UP root = builder.build(); + + Node::UP result = buildAndSplit(std::move(root)); + + ASSERT_TRUE(result); + auto* term_node = dynamic_cast(result.get()); + ASSERT_TRUE(term_node); + EXPECT_EQ(0u, term_node->numFields()); +} + +TEST_F(FieldSplitterTest, empty_and_node_preserved) { + QueryBuilder builder; + builder.addAnd(0); + Node::UP root = builder.build(); + + Node::UP result = buildAndSplit(std::move(root)); + + ASSERT_TRUE(result); + auto* and_node = dynamic_cast(result.get()); + ASSERT_TRUE(and_node); + EXPECT_EQ(0u, and_node->getChildren().size()); +} + +TEST_F(FieldSplitterTest, true_and_false_nodes_preserved) { + QueryBuilder builder; + builder.addOr(2); + builder.add_true_node(); + builder.add_false_node(); + Node::UP root = builder.build(); + + Node::UP result = buildAndSplit(std::move(root)); + + ASSERT_TRUE(result); + auto* or_node = dynamic_cast(result.get()); + ASSERT_TRUE(or_node); + EXPECT_EQ(2u, or_node->getChildren().size()); +} + +//============================================================================== +// Regression Tests +//============================================================================== + +TEST_F(FieldSplitterTest, deeply_nested_structure_handled) { + QueryBuilder builder; + builder.addAnd(1); + builder.addOr(1); + builder.addAnd(1); + builder.addOr(1); + builder.addStringTerm(TERM, VIEW, TERM_ID, TERM_WEIGHT); + Node::UP root = builder.build(); + + Node::UP result = buildAndSplit(std::move(root)); + + ASSERT_TRUE(result); + // Should successfully navigate the deep structure + auto* and_node = dynamic_cast(result.get()); + ASSERT_TRUE(and_node); +} + +} // namespace + +GTEST_MAIN_RUN_ALL_TESTS() diff --git a/searchcore/src/vespa/searchcore/proton/matching/CMakeLists.txt b/searchcore/src/vespa/searchcore/proton/matching/CMakeLists.txt index 2299802e01f..b4361809f68 100644 --- a/searchcore/src/vespa/searchcore/proton/matching/CMakeLists.txt +++ b/searchcore/src/vespa/searchcore/proton/matching/CMakeLists.txt @@ -8,6 +8,7 @@ vespa_add_library(searchcore_matching STATIC document_scorer.cpp extract_features.cpp fakesearchcontext.cpp + field_splitter.cpp handlerecorder.cpp i_match_loop_communicator.cpp indexenvironment.cpp diff --git a/searchcore/src/vespa/searchcore/proton/matching/blueprintbuilder.cpp b/searchcore/src/vespa/searchcore/proton/matching/blueprintbuilder.cpp index cd1487e18cb..a688ada2de2 100644 --- a/searchcore/src/vespa/searchcore/proton/matching/blueprintbuilder.cpp +++ b/searchcore/src/vespa/searchcore/proton/matching/blueprintbuilder.cpp @@ -13,6 +13,9 @@ #include #include +#include +LOG_SETUP(".proton.matching.blueprintbuilder"); + using namespace search::queryeval; using search::query::Node; @@ -35,6 +38,7 @@ struct Mixer { Blueprint::UP mix(Blueprint::UP indexes) { if ( ! attributes) { if ( ! indexes) { + LOG(debug, "EmptyBlueprint: term has no attributes and no indexes"); return std::make_unique(); } return indexes; @@ -136,6 +140,7 @@ class BlueprintBuilderVisitor : } _result = std::move(se); } else { + LOG(debug, "EmptyBlueprint: SameElement operator has unexpected number of fields (expected 1, got %zu)", n.numFields()); vespalib::Issue::report("SameElement operator searches in unexpected number of fields. Expected 1 but was %zu", n.numFields()); _result = std::make_unique(); } @@ -199,6 +204,7 @@ class BlueprintBuilderVisitor : _result = std::make_unique(); } void visit(ProtonFalse &) override { + LOG(debug, "EmptyBlueprint: ProtonFalse query node (expected behavior)"); _result = std::make_unique(); } void visit(ProtonFuzzyTerm &n) override { buildTerm(n); } diff --git a/searchcore/src/vespa/searchcore/proton/matching/field_splitter.cpp b/searchcore/src/vespa/searchcore/proton/matching/field_splitter.cpp new file mode 100644 index 00000000000..ed6b576aa5c --- /dev/null +++ b/searchcore/src/vespa/searchcore/proton/matching/field_splitter.cpp @@ -0,0 +1,810 @@ +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include "field_splitter.h" +#include "querynodes.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +LOG_SETUP(".proton.matching.field_splitter"); + +using search::query::Node; +using search::query::QueryBuilder; +using search::query::MultiTerm; +using search::query::TermVector; +using search::query::IntegerTermVector; +using search::query::StringTermVector; +using search::query::WeightedIntegerTermVector; +using search::query::WeightedStringTermVector; + +namespace proton::matching { + +namespace { + +using ProtonBuilder = QueryBuilder; + +template +T& genericAddSimpleTerm(ProtonBuilder& builder, + const std::string & term, + const std::string & view, + int32_t id, + search::query::Weight weight); + +template<> +ProtonNodeTypes::StringTerm& +genericAddSimpleTerm(ProtonBuilder& builder, + const std::string & term, + const std::string & view, + int32_t id, + search::query::Weight weight) +{ + return builder.addStringTerm(term, view, id, weight); +} + +template<> +ProtonNodeTypes::NumberTerm& +genericAddSimpleTerm(ProtonBuilder& builder, + const std::string & term, + const std::string & view, + int32_t id, + search::query::Weight weight) +{ + return builder.addNumberTerm(term, view, id, weight); +} + +template<> +ProtonNodeTypes::PrefixTerm& +genericAddSimpleTerm(ProtonBuilder& builder, + const std::string & term, + const std::string & view, + int32_t id, + search::query::Weight weight) +{ + return builder.addPrefixTerm(term, view, id, weight); +} + +template<> +ProtonNodeTypes::SubstringTerm& +genericAddSimpleTerm(ProtonBuilder& builder, + const std::string & term, + const std::string & view, + int32_t id, + search::query::Weight weight) +{ + return builder.addSubstringTerm(term, view, id, weight); +} + +template<> +ProtonNodeTypes::SuffixTerm& +genericAddSimpleTerm(ProtonBuilder& builder, + const std::string & term, + const std::string & view, + int32_t id, + search::query::Weight weight) +{ + return builder.addSuffixTerm(term, view, id, weight); +} + +template<> +ProtonNodeTypes::RegExpTerm& +genericAddSimpleTerm(ProtonBuilder& builder, + const std::string & term, + const std::string & view, + int32_t id, + search::query::Weight weight) +{ + return builder.addRegExpTerm(term, view, id, weight); +} + + +/** + * FieldSplitterVisitor - Visitor that splits multi-field query terms into separate per-field terms + * + * This visitor transforms a query tree where terms can have multiple fields into a normalized form + * where each term operates on a single field. When a term has multiple fields, it's split into + * separate term instances connected with OR nodes. + * + * Purpose: + * Query terms in Vespa can reference multiple fields (e.g., search in both "title" and "body"). + * This visitor splits such terms into field-specific variants to simplify query execution. + * + * Transformation examples: + * + * 1. Simple term with two fields: + * StringTerm("foo", fields=[title, body]) + * => + * OR(StringTerm("foo", field=title), StringTerm("foo", field=body)) + * + * 2. Phrase with multiple fields: + * Phrase(fields=[title, body], children=[term1, term2]) + * => + * OR(Phrase(field=title, children=[term1_title, term2_title]), + * Phrase(field=body, children=[term1_body, term2_body])) + * + * 3. Equiv with children having different fields: + * Equiv(child1[fields=title,body], child2[fields=body,author]) + * => + * OR(Equiv(child1_title), + * Equiv(child1_body, child2_body), + * Equiv(child2_author)) + * + * Special handling: + * - Phrase nodes: Children won't have fields, so they are visited as-is + * - SameElement nodes: Children are forced to use the same field as the SameElement + * - Equiv nodes: Gathers children by field, creating one Equiv per field + * - Multi-term nodes: WeightedSet, DotProduct, WandTerm, InTerm, WordAlternatives + * - Forced field mode: When inside a SameElement, children must use the SameElement's field + * + * State: + * - _builder: QueryBuilder for constructing the transformed tree + * - _force_field_id: When set, forces children to use this specific field (for SameElement) + * - _has_error: Set when field splitting fails (e.g., forced field not found) + * + * Usage: + * FieldSplitterVisitor visitor; + * query_node->accept(visitor); + * Node::UP result = visitor.build(); + */ +class FieldSplitterVisitor : public search::query::CustomTypeVisitor +{ +private: + ProtonBuilder _builder; + uint32_t _force_field_id = search::fef::IllegalFieldId; + bool _has_error = false; + + // ===== Node Traversal Helpers ===== + + void visitNodes(const std::vector &nodes) { + for (auto node : nodes) { + node->accept(*this); + } + } + + // Helper to visit children with a forced field (used for SameElement) + // Sets _force_field_id temporarily so that child terms only use the specified field + void splitAndVisitChildrenForField(const std::vector &nodes, uint32_t field_id) { + uint32_t saved_field_id = _force_field_id; + _force_field_id = field_id; + visitNodes(nodes); + _force_field_id = saved_field_id; + } + + // ===== Field Analysis Helpers ===== + + // Helper to get field IDs from a ProtonTermData node + static std::set getFieldIds(const ProtonTermData &term_data) { + std::set fields; + for (size_t i = 0; i < term_data.numFields(); ++i) { + fields.insert(term_data.field(i).getFieldId()); + } + return fields; + } + + // Helper to check if all children have the same field set + static bool allChildrenHaveSameFields(const std::vector &children, + const std::set &expected_fields) + { + for (Node *child : children) { + auto* term_data = dynamic_cast(child); + if (!term_data || term_data->numFields() != expected_fields.size()) { + return false; + } + if (getFieldIds(*term_data) != expected_fields) { + return false; + } + } + return true; + } + + // ===== Node-Specific Helpers ===== + + // Helper to create SameElement replica with common properties + ProtonSameElement& createSameElementReplica(ProtonSameElement &node) { + auto &replica = _builder.addSameElement(node.getChildren().size(), node.getView(), + node.getId(), node.getWeight()); + replica.set_expensive(node.is_expensive()); + return replica; + } + + // Helper to create a non-split SameElement (pass-through) + void handleWithoutSplit(ProtonSameElement &node) { + auto &replica = createSameElementReplica(node); + // Copy ProtonTermData state - should have exactly one field when not splitting + if (node.numFields() == 1) { + copyProtonTermDataForField(node, replica, 0); + } + visitNodes(node.getChildren()); + } + + // Helper to split SameElement across multiple fields + // Children are forced to use each specific field via splitAndVisitChildrenForField + void splitSameElementByFields(ProtonSameElement &node, const std::set &fields) { + _builder.addOr(fields.size()); + for (uint32_t field_id : fields) { + createSameElementReplica(node); + splitAndVisitChildrenForField(node.getChildren(), field_id); + } + } + + // ===== Term State Copying Helpers ===== + + static void copyState(const search::query::Term &original, search::query::Term &replica) { + replica.setRanked(original.isRanked()); + replica.setPositionData(original.usePositionData()); + replica.set_prefix_match(original.prefix_match()); + } + + // Helper to copy ProtonTermData state for a specific field + template + static void copyProtonTermDataForField(TermType &original, TermType &replica, size_t field_idx) { + // Copy the specific field entry from original to replica + if (field_idx < original.numFields()) { + replica.useFieldEntry(original.field(field_idx)); + } + } + + // Helper to replicate subterms for multi-term nodes + static std::unique_ptr replicate_subterms(const MultiTerm& original) { + uint32_t num_terms = original.getNumTerms(); + switch (original.getType()) { + case MultiTerm::Type::STRING: { + auto replica = std::make_unique(num_terms); + for (uint32_t i = 0; i < num_terms; i++) { + replica->addTerm(original.getAsString(i).first); + } + return replica; + } + case MultiTerm::Type::WEIGHTED_STRING: { + auto replica = std::make_unique(num_terms); + for (uint32_t i = 0; i < num_terms; i++) { + auto [term, weight] = original.getAsString(i); + replica->addTerm(term, weight); + } + return replica; + } + case MultiTerm::Type::INTEGER: { + auto replica = std::make_unique(num_terms); + for (uint32_t i = 0; i < num_terms; i++) { + replica->addTerm(original.getAsInteger(i).first); + } + return replica; + } + case MultiTerm::Type::WEIGHTED_INTEGER: { + auto replica = std::make_unique(num_terms); + for (uint32_t i = 0; i < num_terms; i++) { + auto [value, weight] = original.getAsInteger(i); + replica->addTerm(value, weight); + } + return replica; + } + case MultiTerm::Type::UNKNOWN: + assert(num_terms == 0); + } + return std::make_unique(num_terms); + } + + // ===== Error Handling Helpers ===== + + // Helper to get node type name for error reporting + template + const char* getNodeTypeName() { + if constexpr (std::is_same_v) return "StringTerm"; + else if constexpr (std::is_same_v) return "NumberTerm"; + else if constexpr (std::is_same_v) return "PrefixTerm"; + else if constexpr (std::is_same_v) return "SubstringTerm"; + else if constexpr (std::is_same_v) return "SuffixTerm"; + else if constexpr (std::is_same_v) return "RangeTerm"; + else if constexpr (std::is_same_v) return "LocationTerm"; + else if constexpr (std::is_same_v) return "RegExpTerm"; + else if constexpr (std::is_same_v) return "FuzzyTerm"; + else if constexpr (std::is_same_v) return "Phrase"; + else if constexpr (std::is_same_v) return "WeightedSetTerm"; + else if constexpr (std::is_same_v) return "DotProduct"; + else if constexpr (std::is_same_v) return "WandTerm"; + else if constexpr (std::is_same_v) return "InTerm"; + else if constexpr (std::is_same_v) return "WordAlternatives"; + else return "UnknownNode"; + } + + // ===== Term Splitting Logic ===== + + // Helper to get the view/field name for a term being replicated + // Uses the field name from field_idx if available, otherwise falls back to the node's view + template + const std::string& getFieldNameOrView(NodeType &node, size_t field_idx) const { + if (field_idx < node.numFields()) { + return node.field(field_idx).getName(); + } + return node.getView(); + } + + // Helper to replicate a term for a specific field + template + void replicateTermForField(NodeType &node, size_t field_idx) { + // Default implementation for simple term types that work with genericAddSimpleTerm + auto &replica = genericAddSimpleTerm(_builder, + node.getTerm(), getFieldNameOrView(node, field_idx), + node.getId(), node.getWeight()); + copyState(node, replica); + copyProtonTermDataForField(node, replica, field_idx); + } + + template + void splitTerm(NodeType &node) { + // If we're forced to use a specific field (inside SameElement), use only that field + if (_force_field_id != search::fef::IllegalFieldId) { + // Find the matching field index + for (size_t i = 0; i < node.numFields(); ++i) { + if (node.field(i).getFieldId() == _force_field_id) { + replicateTermForField(node, i); + return; + } + } + // Field not found - report error and set error flag + LOG(debug, "field splitting for %s failed: forced field_id %u not found in node's %zu fields", + getNodeTypeName(), _force_field_id, node.numFields()); + vespalib::Issue::report("field splitting for %s failed: forced field_id %u not found in node's %zu fields", + getNodeTypeName(), _force_field_id, node.numFields()); + _has_error = true; + return; + } + + // Normal case: split across all fields + size_t num_fields = node.numFields(); + + if (num_fields <= 1) { + // No splitting needed - just replicate the node as-is + replicateTermForField(node, 0); + } else { + // Multiple fields - create OR with one term per field + _builder.addOr(num_fields); + for (size_t i = 0; i < num_fields; ++i) { + replicateTermForField(node, i); + } + } + } + +public: + Node::UP build() { + if (_has_error) { + return Node::UP(); + } + return _builder.build(); + } + + // Intermediate nodes - recurse on children + void visit(ProtonAnd &node) override { + _builder.addAnd(node.getChildren().size()); + visitNodes(node.getChildren()); + } + + void visit(ProtonAndNot &node) override { + _builder.addAndNot(node.getChildren().size()); + visitNodes(node.getChildren()); + } + + void visit(ProtonOr &node) override { + _builder.addOr(node.getChildren().size()); + visitNodes(node.getChildren()); + } + + void visit(ProtonRank &node) override { + _builder.addRank(node.getChildren().size()); + visitNodes(node.getChildren()); + } + + void visit(ProtonWeakAnd &node) override { + _builder.addWeakAnd(node.getChildren().size(), node.getTargetNumHits(), node.getView()); + visitNodes(node.getChildren()); + } + + void visit(ProtonNear &node) override { + // Build map: field_id -> list of original child indices that have this field + auto field_to_children = buildFieldToChildrenMap(node.getChildren()); + + if (field_to_children.empty()) { + LOG(debug, "field splitting for Near node failed: " + "no fields found in any children (distance=%zu, num_children=%zu)", + node.getDistance(), node.getChildren().size()); + vespalib::Issue::report("field splitting for Near node failed: " + "no fields found in any children " + "(distance=%zu, num_children=%zu)", + node.getDistance(), node.getChildren().size()); + _has_error = true; + return; + } + + if (field_to_children.size() == 1) { + // Only one field - create single Near with all children + const auto& [field_id, child_indices] = *field_to_children.begin(); + createNearForField(node, field_id, child_indices); + } else { + // Multiple fields - create OR with one Near per field + _builder.addOr(field_to_children.size()); + for (const auto& [field_id, child_indices] : field_to_children) { + createNearForField(node, field_id, child_indices); + } + } + } + + void visit(ProtonONear &node) override { + // Build map: field_id -> list of original child indices that have this field + auto field_to_children = buildFieldToChildrenMap(node.getChildren()); + + if (field_to_children.empty()) { + LOG(debug, "field splitting for ONear node failed: " + "no fields found in any children (distance=%zu, num_children=%zu)", + node.getDistance(), node.getChildren().size()); + vespalib::Issue::report("field splitting for ONear node failed: " + "no fields found in any children " + "(distance=%zu, num_children=%zu)", + node.getDistance(), node.getChildren().size()); + _has_error = true; + return; + } + + if (field_to_children.size() == 1) { + // Only one field - create single ONear with all children + const auto& [field_id, child_indices] = *field_to_children.begin(); + createONearForField(node, field_id, child_indices); + } else { + // Multiple fields - create OR with one ONear per field + _builder.addOr(field_to_children.size()); + for (const auto& [field_id, child_indices] : field_to_children) { + createONearForField(node, field_id, child_indices); + } + } + } + + // Helper to build field-to-children mapping for Equiv nodes + std::map> buildFieldToChildrenMap(const std::vector &children) { + std::map> field_to_children; + for (size_t child_idx = 0; child_idx < children.size(); ++child_idx) { + if (auto* term_data = dynamic_cast(children[child_idx])) { + for (size_t i = 0; i < term_data->numFields(); ++i) { + field_to_children[term_data->field(i).getFieldId()].push_back(child_idx); + } + } + } + return field_to_children; + } + + // Helper to create and populate an Equiv node for a specific field + void createEquivForField(ProtonEquiv &node, uint32_t field_id, + const std::vector &child_indices) { + auto &replica = _builder.addEquiv(child_indices.size(), node.getId(), node.getWeight()); + + // Visit each child with forced field + uint32_t saved_field_id = _force_field_id; + _force_field_id = field_id; + for (size_t idx : child_indices) { + node.getChildren()[idx]->accept(*this); + } + _force_field_id = saved_field_id; + + // Resolve field metadata from children + replica.resolveFromChildren(replica.getChildren()); + } + + // Helper to create and populate a Near node for a specific field + void createNearForField(ProtonNear &node, uint32_t field_id, + const std::vector &child_indices) { + _builder.addNear(child_indices.size(), node.getDistance(), + node.num_negative_terms(), node.exclusion_distance()); + + // Visit each child with forced field + uint32_t saved_field_id = _force_field_id; + _force_field_id = field_id; + for (size_t idx : child_indices) { + node.getChildren()[idx]->accept(*this); + } + _force_field_id = saved_field_id; + } + + // Helper to create and populate an ONear node for a specific field + void createONearForField(ProtonONear &node, uint32_t field_id, + const std::vector &child_indices) { + _builder.addONear(child_indices.size(), node.getDistance(), + node.num_negative_terms(), node.exclusion_distance()); + + // Visit each child with forced field + uint32_t saved_field_id = _force_field_id; + _force_field_id = field_id; + for (size_t idx : child_indices) { + node.getChildren()[idx]->accept(*this); + } + _force_field_id = saved_field_id; + } + + void visit(ProtonEquiv &node) override { + // Build map: field_id -> list of original child indices that have this field + auto field_to_children = buildFieldToChildrenMap(node.getChildren()); + + if (field_to_children.empty()) { + LOG(debug, "field splitting for Equiv node failed: " + "no fields found in any children (id=%d, weight=%d, num_children=%zu)", + node.getId(), node.getWeight().percent(), node.getChildren().size()); + vespalib::Issue::report("field splitting for Equiv node failed: " + "no fields found in any children " + "(id=%d, weight=%d, num_children=%zu)", + node.getId(), node.getWeight().percent(), + node.getChildren().size()); + _has_error = true; + return; + } + + // If we're forced to use a specific field (e.g., inside a NEAR/ONEAR), use only that field + if (_force_field_id != search::fef::IllegalFieldId) { + auto it = field_to_children.find(_force_field_id); + if (it != field_to_children.end()) { + createEquivForField(node, it->first, it->second); + } else { + LOG(debug, "field splitting for Equiv node failed: forced field_id %u not found", + _force_field_id); + vespalib::Issue::report("field splitting for Equiv node failed: " + "forced field_id %u not found", + _force_field_id); + _has_error = true; + } + return; + } + + if (field_to_children.size() == 1) { + // Only one field - create single Equiv with all children + const auto& [field_id, child_indices] = *field_to_children.begin(); + createEquivForField(node, field_id, child_indices); + } else { + // Multiple fields - create OR with one Equiv per field + _builder.addOr(field_to_children.size()); + for (const auto& [field_id, child_indices] : field_to_children) { + createEquivForField(node, field_id, child_indices); + } + } + } + + void visit(ProtonPhrase &node) override { + splitTerm(node); + } + + // Helper to determine if SameElement can be split by fields + bool canSplitSameElement(ProtonSameElement &node) const { + // Can split if: + // 1. SameElement has multiple fields + // 2. All children have the same set of fields as the SameElement + if (node.numFields() <= 1) { + return false; + } + return allChildrenHaveSameFields(node.getChildren(), getFieldIds(node)); + } + + void visit(ProtonSameElement &node) override { + if (!canSplitSameElement(node)) { + LOG(debug, "SameElement not split: has %zu field(s), children have incompatible fields", + node.numFields()); + handleWithoutSplit(node); + return; + } + + // All children have the same multiple fields as SameElement - split like Phrase + auto fields = getFieldIds(node); + LOG(debug, "Splitting SameElement across %zu fields", fields.size()); + splitSameElementByFields(node, fields); + } + + // Terms that need splitting + void visit(ProtonNumberTerm &node) override { + splitTerm(node); + } + + void visit(ProtonStringTerm &node) override { + splitTerm(node); + } + + void visit(ProtonPrefixTerm &node) override { + splitTerm(node); + } + + void visit(ProtonSubstringTerm &node) override { + splitTerm(node); + } + + void visit(ProtonSuffixTerm &node) override { + splitTerm(node); + } + + void visit(ProtonRangeTerm &node) override { + splitTerm(node); + } + + void visit(ProtonLocationTerm &node) override { + splitTerm(node); + } + + void visit(ProtonRegExpTerm &node) override { + splitTerm(node); + } + + void visit(ProtonFuzzyTerm &node) override { + splitTerm(node); + } + + // Multi-terms - split by field if needed + void visit(ProtonWeightedSetTerm &node) override { + splitTerm(node); + } + + void visit(ProtonDotProduct &node) override { + splitTerm(node); + } + + void visit(ProtonWandTerm &node) override { + splitTerm(node); + } + + void visit(ProtonInTerm &node) override { + splitTerm(node); + } + + void visit(ProtonWordAlternatives &node) override { + splitTerm(node); + } + + void visit(ProtonPredicateQuery &node) override { + copyState(node, _builder.addPredicateQuery( + std::make_unique(*node.getTerm()), + node.getView(), node.getId(), node.getWeight())); + } + + void visit(ProtonNearestNeighborTerm &node) override { + copyState(node, _builder.add_nearest_neighbor_term( + node.get_query_tensor_name(), node.getView(), + node.getId(), node.getWeight(), node.get_target_num_hits(), + node.get_allow_approximate(), + node.get_hnsw_params())); + } + + void visit(ProtonTrue &) override { + _builder.add_true_node(); + } + + void visit(ProtonFalse &) override { + _builder.add_false_node(); + } +}; + +// Template specializations for replicateTermForField +// Only specialized for term types that don't fit the default pattern + +template <> +void FieldSplitterVisitor::replicateTermForField(ProtonRangeTerm &node, size_t field_idx) { + auto &replica = _builder.addRangeTerm( + node.getTerm(), getFieldNameOrView(node, field_idx), + node.getId(), node.getWeight()); + copyState(node, replica); + copyProtonTermDataForField(node, replica, field_idx); +} + +template <> +void FieldSplitterVisitor::replicateTermForField(ProtonLocationTerm &node, size_t field_idx) { + auto &replica = _builder.addLocationTerm( + node.getTerm(), getFieldNameOrView(node, field_idx), + node.getId(), node.getWeight()); + copyState(node, replica); + copyProtonTermDataForField(node, replica, field_idx); +} + +template <> +void FieldSplitterVisitor::replicateTermForField(ProtonFuzzyTerm &node, size_t field_idx) { + auto &replica = _builder.addFuzzyTerm( + node.getTerm(), getFieldNameOrView(node, field_idx), + node.getId(), node.getWeight(), + node.max_edit_distance(), node.prefix_lock_length(), node.prefix_match()); + copyState(node, replica); + copyProtonTermDataForField(node, replica, field_idx); +} + +template <> +void FieldSplitterVisitor::replicateTermForField(ProtonPhrase &node, size_t field_idx) { + auto &replica = _builder.addPhrase(node.getChildren().size(), getFieldNameOrView(node, field_idx), + node.getId(), node.getWeight()); + replica.set_expensive(node.is_expensive()); + copyState(node, replica); + copyProtonTermDataForField(node, replica, field_idx); + + // Process children normally - they won't have fields + visitNodes(node.getChildren()); +} + +template <> +void FieldSplitterVisitor::replicateTermForField(ProtonWordAlternatives &node, + size_t field_idx) +{ + // Create ProtonStringTerm children for each alternative + std::vector> children; + const auto& original_children = node.getChildren(); + children.reserve(original_children.size()); + + for (const auto& original_child : original_children) { + // Create a new ProtonStringTerm for this alternative + auto child = std::make_unique( + original_child->getTerm(), + getFieldNameOrView(node, field_idx), + original_child->getId(), + original_child->getWeight() + ); + copyState(*original_child, *child); + + // Copy field entry from parent WordAlternatives to child StringTerm + if (field_idx < node.numFields()) { + child->useFieldEntry(node.field(field_idx)); + } + + children.push_back(std::move(child)); + } + + // Create the WordAlternatives replica with ProtonStringTerm children + auto &replica = _builder.add_word_alternatives( + std::move(children), getFieldNameOrView(node, field_idx), node.getId(), node.getWeight()); + copyState(node, replica); + copyProtonTermDataForField(node, replica, field_idx); +} + +// Helper macro for multi-term nodes with subterms +#define REPLICATE_MULTITERM(TermType, builderMethod) \ +template <> \ +void FieldSplitterVisitor::replicateTermForField(TermType &node, size_t field_idx) { \ + auto &replica = _builder.builderMethod( \ + replicate_subterms(node), node.getType(), getFieldNameOrView(node, field_idx), \ + node.getId(), node.getWeight()); \ + copyState(node, replica); \ + copyProtonTermDataForField(node, replica, field_idx); \ +} + +REPLICATE_MULTITERM(ProtonWeightedSetTerm, addWeightedSetTerm) +REPLICATE_MULTITERM(ProtonDotProduct, addDotProduct) +REPLICATE_MULTITERM(ProtonInTerm, add_in_term) + +#undef REPLICATE_MULTITERM + +template <> +void FieldSplitterVisitor::replicateTermForField(ProtonWandTerm &node, size_t field_idx) { + auto &replica = _builder.addWandTerm( + replicate_subterms(node), node.getType(), getFieldNameOrView(node, field_idx), + node.getId(), node.getWeight(), + node.getTargetNumHits(), node.getScoreThreshold(), node.getThresholdBoostFactor()); + copyState(node, replica); + copyProtonTermDataForField(node, replica, field_idx); +} + +} + +Node::UP FieldSplitter::split_terms(Node::UP root) { + if (LOG_WOULD_LOG(debug)) { + search::query::QueryToProtobuf converter; + auto proto_tree = converter.serialize(*root); + LOG(debug, "field splitting input tree:\n%s", search::common::protobuf_message_to_json(proto_tree).c_str()); + } + FieldSplitterVisitor visitor; + root->accept(visitor); + Node::UP result = visitor.build(); + if (!result) { + // Error during splitting, return original tree + LOG(debug, "field splitting failed, returning original tree"); + return root; + } + if (LOG_WOULD_LOG(debug)) { + search::query::QueryToProtobuf converter; + auto proto_tree = converter.serialize(*result); + LOG(debug, "field splitting completed, result tree:\n%s", search::common::protobuf_message_to_json(proto_tree).c_str()); + } + return result; +} + +} diff --git a/searchcore/src/vespa/searchcore/proton/matching/field_splitter.h b/searchcore/src/vespa/searchcore/proton/matching/field_splitter.h new file mode 100644 index 00000000000..2147c0ed4b1 --- /dev/null +++ b/searchcore/src/vespa/searchcore/proton/matching/field_splitter.h @@ -0,0 +1,52 @@ +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include + +namespace proton::matching { + +/** + * FieldSplitter - Splits multi-field query terms into separate per-field term instances + * + * This utility transforms query trees to normalize field references. When a query term + * can match multiple fields (e.g., searching "foo" in both "title" and "body"), this + * splitter creates separate term instances for each field and combines them with OR nodes. + * + * Purpose: + * - Simplifies query execution by ensuring each term operates on exactly one field + * - Enables field-specific optimizations and statistics + * - Resolves field references early in the query processing pipeline + * + * Key behaviors: + * - Terms with single field: Pass through unchanged + * - Terms with multiple fields: Split into OR(term_field1, term_field2, ...) + * - Phrase nodes: Split into per-field phrases, forcing children to use phrase's field + * - Equiv nodes: Group children by field, creating one Equiv per field + * - Multi-term nodes: WeightedSet, DotProduct, WandTerm, InTerm, WordAlternatives + * + * Error handling: + * - Returns original tree if splitting fails + * - Reports issues via vespalib::Issue when errors occur + * - Logs the transformed tree for debugging + * + * Example transformation: + * Input: StringTerm("search", fields=[title, body]) + * Output: OR(StringTerm("search", field=title), StringTerm("search", field=body)) + */ +struct FieldSplitter { + using Node = search::query::Node; + /** + * Splits multi-field terms in the query tree into separate per-field instances. + * + * Takes a query tree and transforms it so that each term operates on a single field. + * Terms with multiple fields are split and connected with OR nodes. Returns the + * transformed tree, or the original tree if splitting fails. + * + * @param node The query tree to transform (ownership is transferred) + * @return The transformed query tree (or original if splitting fails) + */ + static Node::UP split_terms(Node::UP node); +}; + +} From 931c3049500af2c8c4124066d31af0173fa163fa Mon Sep 17 00:00:00 2001 From: Arne Juul Date: Mon, 9 Mar 2026 18:34:03 +0000 Subject: [PATCH 2/9] Clear _force_field_id for phrase children in FieldSplitter Phrase children (individual words) don't carry field metadata - they inherit the field from their parent Phrase node. When a Phrase is nested inside an Equiv, Near, ONear, or SameElement, _force_field_id is set, and visiting phrase children with it set would cause field lookup to fail. Fix by saving/clearing/restoring _force_field_id around the child visit, consistent with the same pattern used elsewhere in the visitor. --- .../src/vespa/searchcore/proton/matching/field_splitter.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/searchcore/src/vespa/searchcore/proton/matching/field_splitter.cpp b/searchcore/src/vespa/searchcore/proton/matching/field_splitter.cpp index ed6b576aa5c..24f3ef00abe 100644 --- a/searchcore/src/vespa/searchcore/proton/matching/field_splitter.cpp +++ b/searchcore/src/vespa/searchcore/proton/matching/field_splitter.cpp @@ -718,8 +718,11 @@ void FieldSplitterVisitor::replicateTermForField(ProtonPhrase &nod copyState(node, replica); copyProtonTermDataForField(node, replica, field_idx); - // Process children normally - they won't have fields + // Phrase children don't have fields - clear _force_field_id so they pass through as-is + uint32_t saved_field_id = _force_field_id; + _force_field_id = search::fef::IllegalFieldId; visitNodes(node.getChildren()); + _force_field_id = saved_field_id; } template <> From a252336b5449366cc118907baae75d8dcc1521f3 Mon Sep 17 00:00:00 2001 From: Arne Juul Date: Mon, 9 Mar 2026 18:47:08 +0000 Subject: [PATCH 3/9] Handle SameElement in 3 different ways When a SameElement has multiple fields but children have different field sets, split the SameElement into per-field copies without forcing the children, letting each child handle its own field splitting independently. --- .../proton/matching/field_splitter.cpp | 57 +++++++++++-------- 1 file changed, 34 insertions(+), 23 deletions(-) diff --git a/searchcore/src/vespa/searchcore/proton/matching/field_splitter.cpp b/searchcore/src/vespa/searchcore/proton/matching/field_splitter.cpp index 24f3ef00abe..a06ac11a32f 100644 --- a/searchcore/src/vespa/searchcore/proton/matching/field_splitter.cpp +++ b/searchcore/src/vespa/searchcore/proton/matching/field_splitter.cpp @@ -137,10 +137,16 @@ genericAddSimpleTerm(ProtonBuilder& builder, * * Special handling: * - Phrase nodes: Children won't have fields, so they are visited as-is - * - SameElement nodes: Children are forced to use the same field as the SameElement + * - SameElement nodes have three variants: + * a) Single field: pass through as-is, children are visited normally + * b) Multiple fields, all children have the same field set as the SameElement: + * split into OR of per-field SameElements, forcing each child to use that field + * c) Multiple fields, children have different/mixed fields: + * split into OR of per-field SameElements, but children are visited normally + * without forcing, so each child handles its own field splitting independently * - Equiv nodes: Gathers children by field, creating one Equiv per field * - Multi-term nodes: WeightedSet, DotProduct, WandTerm, InTerm, WordAlternatives - * - Forced field mode: When inside a SameElement, children must use the SameElement's field + * - Forced field mode: When inside a SameElement (variant b), children must use the SameElement's field * * State: * - _builder: QueryBuilder for constructing the transformed tree @@ -213,7 +219,7 @@ class FieldSplitterVisitor : public search::query::CustomTypeVisitor &fields) { _builder.addOr(fields.size()); for (uint32_t field_id : fields) { @@ -233,6 +239,17 @@ class FieldSplitterVisitor : public search::query::CustomTypeVisitor &fields) { + _builder.addOr(fields.size()); + for ([[maybe_unused]] uint32_t field_id : fields) { + createSameElementReplica(node); + visitNodes(node.getChildren()); + } + } + // ===== Term State Copying Helpers ===== static void copyState(const search::query::Term &original, search::query::Term &replica) { @@ -573,29 +590,23 @@ class FieldSplitterVisitor : public search::query::CustomTypeVisitor Date: Sun, 19 Oct 2025 22:31:59 +0000 Subject: [PATCH 4/9] Integrate FieldSplitter into the query processing pipeline Wire up FieldSplitter::split_terms in the query build step, placed after field resolution (ResolveViewVisitor) and before ranking term extraction (NeedsRankingVisitor). This ensures multi-field query terms are normalized into per-field instances before the rest of the matching pipeline sees them, enabling field-specific optimizations. --- searchcore/src/vespa/searchcore/proton/matching/query.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/searchcore/src/vespa/searchcore/proton/matching/query.cpp b/searchcore/src/vespa/searchcore/proton/matching/query.cpp index c01785a8aa4..5227f7b380c 100644 --- a/searchcore/src/vespa/searchcore/proton/matching/query.cpp +++ b/searchcore/src/vespa/searchcore/proton/matching/query.cpp @@ -2,6 +2,7 @@ #include "query.h" #include "blueprintbuilder.h" +#include "field_splitter.h" #include "matchdatareservevisitor.h" #include "resolveviewvisitor.h" #include "sameelementmodifier.h" @@ -198,6 +199,7 @@ Query::buildTree(const SerializedQueryTree &queryTree, const string &location, _query_tree = UnpackingIteratorsOptimizer::optimize(std::move(_query_tree), bool(_whiteListBlueprint)); ResolveViewVisitor resolve_visitor(resolver, indexEnv); _query_tree->accept(resolve_visitor); + _query_tree = FieldSplitter::split_terms(std::move(_query_tree)); NeedsRankingVisitor need_ranking_visitor; _query_tree->accept(need_ranking_visitor); _needs_ranking = need_ranking_visitor.needs_ranking(); From 9bcd3e4f7fdc96278443cfa2efe22afb4c3e080a Mon Sep 17 00:00:00 2001 From: Arne Juul Date: Wed, 11 Mar 2026 10:55:23 +0000 Subject: [PATCH 5/9] Address PR review feedback in FieldSplitter - Copy element_filter and expose_match_data_for_same_element in createSameElementReplica - Copy elementwise flag in visit(ProtonAndNot) - Generalize comment on buildFieldToChildrenMap --- .../vespa/searchcore/proton/matching/field_splitter.cpp | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/searchcore/src/vespa/searchcore/proton/matching/field_splitter.cpp b/searchcore/src/vespa/searchcore/proton/matching/field_splitter.cpp index a06ac11a32f..f70ca07354c 100644 --- a/searchcore/src/vespa/searchcore/proton/matching/field_splitter.cpp +++ b/searchcore/src/vespa/searchcore/proton/matching/field_splitter.cpp @@ -214,8 +214,10 @@ class FieldSplitterVisitor : public search::query::CustomTypeVisitor(_builder.addAndNot(node.getChildren().size())); + replica.elementwise = node.elementwise; visitNodes(node.getChildren()); } @@ -480,7 +483,7 @@ class FieldSplitterVisitor : public search::query::CustomTypeVisitor> buildFieldToChildrenMap(const std::vector &children) { std::map> field_to_children; for (size_t child_idx = 0; child_idx < children.size(); ++child_idx) { From 67296c18681aa33ce14d58ae71f939bb21e40efb Mon Sep 17 00:00:00 2001 From: Arne Juul Date: Wed, 11 Mar 2026 10:56:51 +0000 Subject: [PATCH 6/9] reformat --- .../proton/matching/field_splitter.cpp | 436 ++++++++---------- .../proton/matching/field_splitter.h | 2 +- 2 files changed, 187 insertions(+), 251 deletions(-) diff --git a/searchcore/src/vespa/searchcore/proton/matching/field_splitter.cpp b/searchcore/src/vespa/searchcore/proton/matching/field_splitter.cpp index f70ca07354c..e81ceb7bcff 100644 --- a/searchcore/src/vespa/searchcore/proton/matching/field_splitter.cpp +++ b/searchcore/src/vespa/searchcore/proton/matching/field_splitter.cpp @@ -1,13 +1,16 @@ // Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #include "field_splitter.h" + #include "querynodes.h" + #include #include -#include #include +#include #include #include + #include #include #include @@ -15,12 +18,12 @@ #include LOG_SETUP(".proton.matching.field_splitter"); +using search::query::IntegerTermVector; +using search::query::MultiTerm; using search::query::Node; using search::query::QueryBuilder; -using search::query::MultiTerm; -using search::query::TermVector; -using search::query::IntegerTermVector; using search::query::StringTermVector; +using search::query::TermVector; using search::query::WeightedIntegerTermVector; using search::query::WeightedStringTermVector; @@ -30,80 +33,53 @@ namespace { using ProtonBuilder = QueryBuilder; -template -T& genericAddSimpleTerm(ProtonBuilder& builder, - const std::string & term, - const std::string & view, - int32_t id, +template +T& genericAddSimpleTerm(ProtonBuilder& builder, const std::string& term, const std::string& view, int32_t id, search::query::Weight weight); -template<> +template <> ProtonNodeTypes::StringTerm& -genericAddSimpleTerm(ProtonBuilder& builder, - const std::string & term, - const std::string & view, - int32_t id, - search::query::Weight weight) -{ +genericAddSimpleTerm(ProtonBuilder& builder, const std::string& term, + const std::string& view, int32_t id, search::query::Weight weight) { return builder.addStringTerm(term, view, id, weight); } -template<> +template <> ProtonNodeTypes::NumberTerm& -genericAddSimpleTerm(ProtonBuilder& builder, - const std::string & term, - const std::string & view, - int32_t id, - search::query::Weight weight) -{ +genericAddSimpleTerm(ProtonBuilder& builder, const std::string& term, + const std::string& view, int32_t id, search::query::Weight weight) { return builder.addNumberTerm(term, view, id, weight); } -template<> +template <> ProtonNodeTypes::PrefixTerm& -genericAddSimpleTerm(ProtonBuilder& builder, - const std::string & term, - const std::string & view, - int32_t id, - search::query::Weight weight) -{ +genericAddSimpleTerm(ProtonBuilder& builder, const std::string& term, + const std::string& view, int32_t id, search::query::Weight weight) { return builder.addPrefixTerm(term, view, id, weight); } -template<> +template <> ProtonNodeTypes::SubstringTerm& -genericAddSimpleTerm(ProtonBuilder& builder, - const std::string & term, - const std::string & view, - int32_t id, - search::query::Weight weight) -{ +genericAddSimpleTerm(ProtonBuilder& builder, const std::string& term, + const std::string& view, int32_t id, + search::query::Weight weight) { return builder.addSubstringTerm(term, view, id, weight); } -template<> +template <> ProtonNodeTypes::SuffixTerm& -genericAddSimpleTerm(ProtonBuilder& builder, - const std::string & term, - const std::string & view, - int32_t id, - search::query::Weight weight) -{ +genericAddSimpleTerm(ProtonBuilder& builder, const std::string& term, + const std::string& view, int32_t id, search::query::Weight weight) { return builder.addSuffixTerm(term, view, id, weight); } -template<> +template <> ProtonNodeTypes::RegExpTerm& -genericAddSimpleTerm(ProtonBuilder& builder, - const std::string & term, - const std::string & view, - int32_t id, - search::query::Weight weight) -{ +genericAddSimpleTerm(ProtonBuilder& builder, const std::string& term, + const std::string& view, int32_t id, search::query::Weight weight) { return builder.addRegExpTerm(term, view, id, weight); } - /** * FieldSplitterVisitor - Visitor that splits multi-field query terms into separate per-field terms * @@ -158,16 +134,15 @@ genericAddSimpleTerm(ProtonBuilder& builder, * query_node->accept(visitor); * Node::UP result = visitor.build(); */ -class FieldSplitterVisitor : public search::query::CustomTypeVisitor -{ +class FieldSplitterVisitor : public search::query::CustomTypeVisitor { private: ProtonBuilder _builder; - uint32_t _force_field_id = search::fef::IllegalFieldId; - bool _has_error = false; + uint32_t _force_field_id = search::fef::IllegalFieldId; + bool _has_error = false; // ===== Node Traversal Helpers ===== - void visitNodes(const std::vector &nodes) { + void visitNodes(const std::vector& nodes) { for (auto node : nodes) { node->accept(*this); } @@ -175,7 +150,7 @@ class FieldSplitterVisitor : public search::query::CustomTypeVisitor &nodes, uint32_t field_id) { + void splitAndVisitChildrenForField(const std::vector& nodes, uint32_t field_id) { uint32_t saved_field_id = _force_field_id; _force_field_id = field_id; visitNodes(nodes); @@ -185,7 +160,7 @@ class FieldSplitterVisitor : public search::query::CustomTypeVisitor getFieldIds(const ProtonTermData &term_data) { + static std::set getFieldIds(const ProtonTermData& term_data) { std::set fields; for (size_t i = 0; i < term_data.numFields(); ++i) { fields.insert(term_data.field(i).getFieldId()); @@ -194,10 +169,9 @@ class FieldSplitterVisitor : public search::query::CustomTypeVisitor &children, - const std::set &expected_fields) - { - for (Node *child : children) { + static bool allChildrenHaveSameFields(const std::vector& children, + const std::set& expected_fields) { + for (Node* child : children) { auto* term_data = dynamic_cast(child); if (!term_data || term_data->numFields() != expected_fields.size()) { return false; @@ -212,18 +186,17 @@ class FieldSplitterVisitor : public search::query::CustomTypeVisitor &fields) { + void splitSameElementByFields(ProtonSameElement& node, const std::set& fields) { _builder.addOr(fields.size()); for (uint32_t field_id : fields) { createSameElementReplica(node); @@ -244,7 +217,7 @@ class FieldSplitterVisitor : public search::query::CustomTypeVisitor &fields) { + void splitSameElementByFieldsNoForce(ProtonSameElement& node, const std::set& fields) { _builder.addOr(fields.size()); for ([[maybe_unused]] uint32_t field_id : fields) { createSameElementReplica(node); @@ -254,7 +227,7 @@ class FieldSplitterVisitor : public search::query::CustomTypeVisitor - static void copyProtonTermDataForField(TermType &original, TermType &replica, size_t field_idx) { + static void copyProtonTermDataForField(TermType& original, TermType& replica, size_t field_idx) { // Copy the specific field entry from original to replica if (field_idx < original.numFields()) { replica.useFieldEntry(original.field(field_idx)); @@ -312,32 +285,46 @@ class FieldSplitterVisitor : public search::query::CustomTypeVisitor - const char* getNodeTypeName() { - if constexpr (std::is_same_v) return "StringTerm"; - else if constexpr (std::is_same_v) return "NumberTerm"; - else if constexpr (std::is_same_v) return "PrefixTerm"; - else if constexpr (std::is_same_v) return "SubstringTerm"; - else if constexpr (std::is_same_v) return "SuffixTerm"; - else if constexpr (std::is_same_v) return "RangeTerm"; - else if constexpr (std::is_same_v) return "LocationTerm"; - else if constexpr (std::is_same_v) return "RegExpTerm"; - else if constexpr (std::is_same_v) return "FuzzyTerm"; - else if constexpr (std::is_same_v) return "Phrase"; - else if constexpr (std::is_same_v) return "WeightedSetTerm"; - else if constexpr (std::is_same_v) return "DotProduct"; - else if constexpr (std::is_same_v) return "WandTerm"; - else if constexpr (std::is_same_v) return "InTerm"; - else if constexpr (std::is_same_v) return "WordAlternatives"; - else return "UnknownNode"; + template const char* getNodeTypeName() { + if constexpr (std::is_same_v) + return "StringTerm"; + else if constexpr (std::is_same_v) + return "NumberTerm"; + else if constexpr (std::is_same_v) + return "PrefixTerm"; + else if constexpr (std::is_same_v) + return "SubstringTerm"; + else if constexpr (std::is_same_v) + return "SuffixTerm"; + else if constexpr (std::is_same_v) + return "RangeTerm"; + else if constexpr (std::is_same_v) + return "LocationTerm"; + else if constexpr (std::is_same_v) + return "RegExpTerm"; + else if constexpr (std::is_same_v) + return "FuzzyTerm"; + else if constexpr (std::is_same_v) + return "Phrase"; + else if constexpr (std::is_same_v) + return "WeightedSetTerm"; + else if constexpr (std::is_same_v) + return "DotProduct"; + else if constexpr (std::is_same_v) + return "WandTerm"; + else if constexpr (std::is_same_v) + return "InTerm"; + else if constexpr (std::is_same_v) + return "WordAlternatives"; + else + return "UnknownNode"; } // ===== Term Splitting Logic ===== // Helper to get the view/field name for a term being replicated // Uses the field name from field_idx if available, otherwise falls back to the node's view - template - const std::string& getFieldNameOrView(NodeType &node, size_t field_idx) const { + template const std::string& getFieldNameOrView(NodeType& node, size_t field_idx) const { if (field_idx < node.numFields()) { return node.field(field_idx).getName(); } @@ -345,18 +332,15 @@ class FieldSplitterVisitor : public search::query::CustomTypeVisitor - void replicateTermForField(NodeType &node, size_t field_idx) { + template void replicateTermForField(NodeType& node, size_t field_idx) { // Default implementation for simple term types that work with genericAddSimpleTerm - auto &replica = genericAddSimpleTerm(_builder, - node.getTerm(), getFieldNameOrView(node, field_idx), - node.getId(), node.getWeight()); + auto& replica = genericAddSimpleTerm(_builder, node.getTerm(), getFieldNameOrView(node, field_idx), + node.getId(), node.getWeight()); copyState(node, replica); copyProtonTermDataForField(node, replica, field_idx); } - template - void splitTerm(NodeType &node) { + template void splitTerm(NodeType& node) { // If we're forced to use a specific field (inside SameElement), use only that field if (_force_field_id != search::fef::IllegalFieldId) { // Find the matching field index @@ -369,8 +353,9 @@ class FieldSplitterVisitor : public search::query::CustomTypeVisitor(), _force_field_id, node.numFields()); - vespalib::Issue::report("field splitting for %s failed: forced field_id %u not found in node's %zu fields", - getNodeTypeName(), _force_field_id, node.numFields()); + vespalib::Issue::report( + "field splitting for %s failed: forced field_id %u not found in node's %zu fields", + getNodeTypeName(), _force_field_id, node.numFields()); _has_error = true; return; } @@ -399,44 +384,45 @@ class FieldSplitterVisitor : public search::query::CustomTypeVisitor(_builder.addAndNot(node.getChildren().size())); + void visit(ProtonAndNot& node) override { + auto& replica = static_cast(_builder.addAndNot(node.getChildren().size())); replica.elementwise = node.elementwise; visitNodes(node.getChildren()); } - void visit(ProtonOr &node) override { + void visit(ProtonOr& node) override { _builder.addOr(node.getChildren().size()); visitNodes(node.getChildren()); } - void visit(ProtonRank &node) override { + void visit(ProtonRank& node) override { _builder.addRank(node.getChildren().size()); visitNodes(node.getChildren()); } - void visit(ProtonWeakAnd &node) override { + void visit(ProtonWeakAnd& node) override { _builder.addWeakAnd(node.getChildren().size(), node.getTargetNumHits(), node.getView()); visitNodes(node.getChildren()); } - void visit(ProtonNear &node) override { + void visit(ProtonNear& node) override { // Build map: field_id -> list of original child indices that have this field auto field_to_children = buildFieldToChildrenMap(node.getChildren()); if (field_to_children.empty()) { - LOG(debug, "field splitting for Near node failed: " + LOG(debug, + "field splitting for Near node failed: " "no fields found in any children (distance=%zu, num_children=%zu)", node.getDistance(), node.getChildren().size()); vespalib::Issue::report("field splitting for Near node failed: " - "no fields found in any children " - "(distance=%zu, num_children=%zu)", - node.getDistance(), node.getChildren().size()); + "no fields found in any children " + "(distance=%zu, num_children=%zu)", + node.getDistance(), node.getChildren().size()); _has_error = true; return; } @@ -454,18 +440,19 @@ class FieldSplitterVisitor : public search::query::CustomTypeVisitor list of original child indices that have this field auto field_to_children = buildFieldToChildrenMap(node.getChildren()); if (field_to_children.empty()) { - LOG(debug, "field splitting for ONear node failed: " + LOG(debug, + "field splitting for ONear node failed: " "no fields found in any children (distance=%zu, num_children=%zu)", node.getDistance(), node.getChildren().size()); vespalib::Issue::report("field splitting for ONear node failed: " - "no fields found in any children " - "(distance=%zu, num_children=%zu)", - node.getDistance(), node.getChildren().size()); + "no fields found in any children " + "(distance=%zu, num_children=%zu)", + node.getDistance(), node.getChildren().size()); _has_error = true; return; } @@ -484,7 +471,7 @@ class FieldSplitterVisitor : public search::query::CustomTypeVisitor> buildFieldToChildrenMap(const std::vector &children) { + std::map> buildFieldToChildrenMap(const std::vector& children) { std::map> field_to_children; for (size_t child_idx = 0; child_idx < children.size(); ++child_idx) { if (auto* term_data = dynamic_cast(children[child_idx])) { @@ -497,9 +484,8 @@ class FieldSplitterVisitor : public search::query::CustomTypeVisitor &child_indices) { - auto &replica = _builder.addEquiv(child_indices.size(), node.getId(), node.getWeight()); + void createEquivForField(ProtonEquiv& node, uint32_t field_id, const std::vector& child_indices) { + auto& replica = _builder.addEquiv(child_indices.size(), node.getId(), node.getWeight()); // Visit each child with forced field uint32_t saved_field_id = _force_field_id; @@ -514,10 +500,9 @@ class FieldSplitterVisitor : public search::query::CustomTypeVisitor &child_indices) { - _builder.addNear(child_indices.size(), node.getDistance(), - node.num_negative_terms(), node.exclusion_distance()); + void createNearForField(ProtonNear& node, uint32_t field_id, const std::vector& child_indices) { + _builder.addNear(child_indices.size(), node.getDistance(), node.num_negative_terms(), + node.exclusion_distance()); // Visit each child with forced field uint32_t saved_field_id = _force_field_id; @@ -529,10 +514,9 @@ class FieldSplitterVisitor : public search::query::CustomTypeVisitor &child_indices) { - _builder.addONear(child_indices.size(), node.getDistance(), - node.num_negative_terms(), node.exclusion_distance()); + void createONearForField(ProtonONear& node, uint32_t field_id, const std::vector& child_indices) { + _builder.addONear(child_indices.size(), node.getDistance(), node.num_negative_terms(), + node.exclusion_distance()); // Visit each child with forced field uint32_t saved_field_id = _force_field_id; @@ -543,19 +527,19 @@ class FieldSplitterVisitor : public search::query::CustomTypeVisitor list of original child indices that have this field auto field_to_children = buildFieldToChildrenMap(node.getChildren()); if (field_to_children.empty()) { - LOG(debug, "field splitting for Equiv node failed: " + LOG(debug, + "field splitting for Equiv node failed: " "no fields found in any children (id=%d, weight=%d, num_children=%zu)", node.getId(), node.getWeight().percent(), node.getChildren().size()); vespalib::Issue::report("field splitting for Equiv node failed: " - "no fields found in any children " - "(id=%d, weight=%d, num_children=%zu)", - node.getId(), node.getWeight().percent(), - node.getChildren().size()); + "no fields found in any children " + "(id=%d, weight=%d, num_children=%zu)", + node.getId(), node.getWeight().percent(), node.getChildren().size()); _has_error = true; return; } @@ -566,11 +550,10 @@ class FieldSplitterVisitor : public search::query::CustomTypeVisitorfirst, it->second); } else { - LOG(debug, "field splitting for Equiv node failed: forced field_id %u not found", - _force_field_id); + LOG(debug, "field splitting for Equiv node failed: forced field_id %u not found", _force_field_id); vespalib::Issue::report("field splitting for Equiv node failed: " - "forced field_id %u not found", - _force_field_id); + "forced field_id %u not found", + _force_field_id); _has_error = true; } return; @@ -589,11 +572,9 @@ class FieldSplitterVisitor : public search::query::CustomTypeVisitor(*node.getTerm()), - node.getView(), node.getId(), node.getWeight())); + void visit(ProtonPredicateQuery& node) override { + copyState(node, + _builder.addPredicateQuery(std::make_unique(*node.getTerm()), + node.getView(), node.getId(), node.getWeight())); } - void visit(ProtonNearestNeighborTerm &node) override { - copyState(node, _builder.add_nearest_neighbor_term( - node.get_query_tensor_name(), node.getView(), - node.getId(), node.getWeight(), node.get_target_num_hits(), - node.get_allow_approximate(), - node.get_hnsw_params())); + void visit(ProtonNearestNeighborTerm& node) override { + copyState(node, _builder.add_nearest_neighbor_term(node.get_query_tensor_name(), node.getView(), node.getId(), + node.getWeight(), node.get_target_num_hits(), + node.get_allow_approximate(), node.get_hnsw_params())); } - void visit(ProtonTrue &) override { - _builder.add_true_node(); - } + void visit(ProtonTrue&) override { _builder.add_true_node(); } - void visit(ProtonFalse &) override { - _builder.add_false_node(); - } + void visit(ProtonFalse&) override { _builder.add_false_node(); } }; // Template specializations for replicateTermForField // Only specialized for term types that don't fit the default pattern template <> -void FieldSplitterVisitor::replicateTermForField(ProtonRangeTerm &node, size_t field_idx) { - auto &replica = _builder.addRangeTerm( - node.getTerm(), getFieldNameOrView(node, field_idx), - node.getId(), node.getWeight()); +void FieldSplitterVisitor::replicateTermForField(ProtonRangeTerm& node, size_t field_idx) { + auto& replica = + _builder.addRangeTerm(node.getTerm(), getFieldNameOrView(node, field_idx), node.getId(), node.getWeight()); copyState(node, replica); copyProtonTermDataForField(node, replica, field_idx); } template <> -void FieldSplitterVisitor::replicateTermForField(ProtonLocationTerm &node, size_t field_idx) { - auto &replica = _builder.addLocationTerm( - node.getTerm(), getFieldNameOrView(node, field_idx), - node.getId(), node.getWeight()); +void FieldSplitterVisitor::replicateTermForField(ProtonLocationTerm& node, size_t field_idx) { + auto& replica = + _builder.addLocationTerm(node.getTerm(), getFieldNameOrView(node, field_idx), node.getId(), node.getWeight()); copyState(node, replica); copyProtonTermDataForField(node, replica, field_idx); } template <> -void FieldSplitterVisitor::replicateTermForField(ProtonFuzzyTerm &node, size_t field_idx) { - auto &replica = _builder.addFuzzyTerm( - node.getTerm(), getFieldNameOrView(node, field_idx), - node.getId(), node.getWeight(), - node.max_edit_distance(), node.prefix_lock_length(), node.prefix_match()); +void FieldSplitterVisitor::replicateTermForField(ProtonFuzzyTerm& node, size_t field_idx) { + auto& replica = + _builder.addFuzzyTerm(node.getTerm(), getFieldNameOrView(node, field_idx), node.getId(), node.getWeight(), + node.max_edit_distance(), node.prefix_lock_length(), node.prefix_match()); copyState(node, replica); copyProtonTermDataForField(node, replica, field_idx); } -template <> -void FieldSplitterVisitor::replicateTermForField(ProtonPhrase &node, size_t field_idx) { - auto &replica = _builder.addPhrase(node.getChildren().size(), getFieldNameOrView(node, field_idx), - node.getId(), node.getWeight()); +template <> void FieldSplitterVisitor::replicateTermForField(ProtonPhrase& node, size_t field_idx) { + auto& replica = _builder.addPhrase(node.getChildren().size(), getFieldNameOrView(node, field_idx), node.getId(), + node.getWeight()); replica.set_expensive(node.is_expensive()); copyState(node, replica); copyProtonTermDataForField(node, replica, field_idx); @@ -740,22 +683,18 @@ void FieldSplitterVisitor::replicateTermForField(ProtonPhrase &nod } template <> -void FieldSplitterVisitor::replicateTermForField(ProtonWordAlternatives &node, - size_t field_idx) -{ +void FieldSplitterVisitor::replicateTermForField(ProtonWordAlternatives& node, + size_t field_idx) { // Create ProtonStringTerm children for each alternative std::vector> children; - const auto& original_children = node.getChildren(); + const auto& original_children = node.getChildren(); children.reserve(original_children.size()); for (const auto& original_child : original_children) { // Create a new ProtonStringTerm for this alternative - auto child = std::make_unique( - original_child->getTerm(), - getFieldNameOrView(node, field_idx), - original_child->getId(), - original_child->getWeight() - ); + auto child = + std::make_unique(original_child->getTerm(), getFieldNameOrView(node, field_idx), + original_child->getId(), original_child->getWeight()); copyState(*original_child, *child); // Copy field entry from parent WordAlternatives to child StringTerm @@ -767,22 +706,20 @@ void FieldSplitterVisitor::replicateTermForField(ProtonW } // Create the WordAlternatives replica with ProtonStringTerm children - auto &replica = _builder.add_word_alternatives( - std::move(children), getFieldNameOrView(node, field_idx), node.getId(), node.getWeight()); + auto& replica = _builder.add_word_alternatives(std::move(children), getFieldNameOrView(node, field_idx), + node.getId(), node.getWeight()); copyState(node, replica); copyProtonTermDataForField(node, replica, field_idx); } // Helper macro for multi-term nodes with subterms -#define REPLICATE_MULTITERM(TermType, builderMethod) \ -template <> \ -void FieldSplitterVisitor::replicateTermForField(TermType &node, size_t field_idx) { \ - auto &replica = _builder.builderMethod( \ - replicate_subterms(node), node.getType(), getFieldNameOrView(node, field_idx), \ - node.getId(), node.getWeight()); \ - copyState(node, replica); \ - copyProtonTermDataForField(node, replica, field_idx); \ -} +#define REPLICATE_MULTITERM(TermType, builderMethod) \ + template <> void FieldSplitterVisitor::replicateTermForField(TermType & node, size_t field_idx) { \ + auto& replica = _builder.builderMethod(replicate_subterms(node), node.getType(), \ + getFieldNameOrView(node, field_idx), node.getId(), node.getWeight()); \ + copyState(node, replica); \ + copyProtonTermDataForField(node, replica, field_idx); \ + } REPLICATE_MULTITERM(ProtonWeightedSetTerm, addWeightedSetTerm) REPLICATE_MULTITERM(ProtonDotProduct, addDotProduct) @@ -790,22 +727,20 @@ REPLICATE_MULTITERM(ProtonInTerm, add_in_term) #undef REPLICATE_MULTITERM -template <> -void FieldSplitterVisitor::replicateTermForField(ProtonWandTerm &node, size_t field_idx) { - auto &replica = _builder.addWandTerm( - replicate_subterms(node), node.getType(), getFieldNameOrView(node, field_idx), - node.getId(), node.getWeight(), +template <> void FieldSplitterVisitor::replicateTermForField(ProtonWandTerm& node, size_t field_idx) { + auto& replica = _builder.addWandTerm( + replicate_subterms(node), node.getType(), getFieldNameOrView(node, field_idx), node.getId(), node.getWeight(), node.getTargetNumHits(), node.getScoreThreshold(), node.getThresholdBoostFactor()); copyState(node, replica); copyProtonTermDataForField(node, replica, field_idx); } -} +} // namespace Node::UP FieldSplitter::split_terms(Node::UP root) { if (LOG_WOULD_LOG(debug)) { search::query::QueryToProtobuf converter; - auto proto_tree = converter.serialize(*root); + auto proto_tree = converter.serialize(*root); LOG(debug, "field splitting input tree:\n%s", search::common::protobuf_message_to_json(proto_tree).c_str()); } FieldSplitterVisitor visitor; @@ -818,10 +753,11 @@ Node::UP FieldSplitter::split_terms(Node::UP root) { } if (LOG_WOULD_LOG(debug)) { search::query::QueryToProtobuf converter; - auto proto_tree = converter.serialize(*result); - LOG(debug, "field splitting completed, result tree:\n%s", search::common::protobuf_message_to_json(proto_tree).c_str()); + auto proto_tree = converter.serialize(*result); + LOG(debug, "field splitting completed, result tree:\n%s", + search::common::protobuf_message_to_json(proto_tree).c_str()); } return result; } -} +} // namespace proton::matching diff --git a/searchcore/src/vespa/searchcore/proton/matching/field_splitter.h b/searchcore/src/vespa/searchcore/proton/matching/field_splitter.h index 2147c0ed4b1..a5cfda024f4 100644 --- a/searchcore/src/vespa/searchcore/proton/matching/field_splitter.h +++ b/searchcore/src/vespa/searchcore/proton/matching/field_splitter.h @@ -49,4 +49,4 @@ struct FieldSplitter { static Node::UP split_terms(Node::UP node); }; -} +} // namespace proton::matching From 0b69f905167069890cb7ce0c6da6b55c6e2e9102 Mon Sep 17 00:00:00 2001 From: Arne Juul Date: Wed, 11 Mar 2026 11:42:23 +0000 Subject: [PATCH 7/9] Replace splitSameElementByFieldsNoForce with prefix-based child filtering For SameElement variant (c) - multiple fields with mixed-field children - create one SameElement replica per parent field and filter each child's fields to only those whose names match the parent field name (exact or dot-separated sub-field prefix). Co-Authored-By: Claude Sonnet 4.6 (1M context) --- .../proton/matching/field_splitter.cpp | 66 +++++++++++++------ 1 file changed, 45 insertions(+), 21 deletions(-) diff --git a/searchcore/src/vespa/searchcore/proton/matching/field_splitter.cpp b/searchcore/src/vespa/searchcore/proton/matching/field_splitter.cpp index e81ceb7bcff..da03ae9fe32 100644 --- a/searchcore/src/vespa/searchcore/proton/matching/field_splitter.cpp +++ b/searchcore/src/vespa/searchcore/proton/matching/field_splitter.cpp @@ -138,6 +138,7 @@ class FieldSplitterVisitor : public search::query::CustomTypeVisitor _field_name_prefix.size() && + field_name[_field_name_prefix.size()] == '.' && + field_name.compare(0, _field_name_prefix.size(), _field_name_prefix) == 0; + } + // ===== Field Analysis Helpers ===== // Helper to get field IDs from a ProtonTermData node @@ -215,13 +226,18 @@ class FieldSplitterVisitor : public search::query::CustomTypeVisitor& fields) { - _builder.addOr(fields.size()); - for ([[maybe_unused]] uint32_t field_id : fields) { + // Creates OR of per-field SameElements. For each replica, children are visited with a + // field-name prefix filter so only child fields whose names start with the parent field name + // (exact match or "parent." prefix) are included in that replica. + void splitSameElementByFieldsWithPrefix(ProtonSameElement& node) { + size_t num_fields = node.numFields(); + _builder.addOr(num_fields); + for (size_t fi = 0; fi < num_fields; ++fi) { createSameElementReplica(node); + std::string saved_prefix = _field_name_prefix; + _field_name_prefix = node.field(fi).getName(); visitNodes(node.getChildren()); + _field_name_prefix = saved_prefix; } } @@ -360,16 +376,21 @@ class FieldSplitterVisitor : public search::query::CustomTypeVisitor matching_indices; + for (size_t i = 0; i < node.numFields(); ++i) { + if (fieldNameMatchesPrefix(node.field(i).getName())) { + matching_indices.push_back(i); + } + } - if (num_fields <= 1) { - // No splitting needed - just replicate the node as-is - replicateTermForField(node, 0); + if (matching_indices.size() <= 1) { + // No splitting needed - replicate using first matching field (or field 0 as fallback) + replicateTermForField(node, matching_indices.empty() ? 0 : matching_indices[0]); } else { - // Multiple fields - create OR with one term per field - _builder.addOr(num_fields); - for (size_t i = 0; i < num_fields; ++i) { + // Multiple matching fields - create OR with one term per field + _builder.addOr(matching_indices.size()); + for (size_t i : matching_indices) { replicateTermForField(node, i); } } @@ -587,9 +608,9 @@ class FieldSplitterVisitor : public search::query::CustomTypeVisitor void FieldSplitterVisitor::replicateTermForField(Proto copyState(node, replica); copyProtonTermDataForField(node, replica, field_idx); - // Phrase children don't have fields - clear _force_field_id so they pass through as-is + // Phrase children don't have fields - clear _force_field_id and _field_name_prefix so they pass through as-is uint32_t saved_field_id = _force_field_id; + std::string saved_prefix = _field_name_prefix; _force_field_id = search::fef::IllegalFieldId; + _field_name_prefix.clear(); visitNodes(node.getChildren()); _force_field_id = saved_field_id; + _field_name_prefix = saved_prefix; } template <> @@ -738,10 +762,10 @@ template <> void FieldSplitterVisitor::replicateTermForField(Pro } // namespace Node::UP FieldSplitter::split_terms(Node::UP root) { - if (LOG_WOULD_LOG(debug)) { + if (LOG_WOULD_LOG(info)) { search::query::QueryToProtobuf converter; auto proto_tree = converter.serialize(*root); - LOG(debug, "field splitting input tree:\n%s", search::common::protobuf_message_to_json(proto_tree).c_str()); + LOG(error, "field splitting input tree:\n%s", search::common::protobuf_message_to_json(proto_tree).c_str()); } FieldSplitterVisitor visitor; root->accept(visitor); @@ -751,10 +775,10 @@ Node::UP FieldSplitter::split_terms(Node::UP root) { LOG(debug, "field splitting failed, returning original tree"); return root; } - if (LOG_WOULD_LOG(debug)) { + if (LOG_WOULD_LOG(info)) { search::query::QueryToProtobuf converter; auto proto_tree = converter.serialize(*result); - LOG(debug, "field splitting completed, result tree:\n%s", + LOG(error, "field splitting completed, result tree:\n%s", search::common::protobuf_message_to_json(proto_tree).c_str()); } return result; From 43c1ca2119f4ac8049bc09f85bcd57a275c7a8f0 Mon Sep 17 00:00:00 2001 From: Arne Juul Date: Wed, 11 Mar 2026 13:13:48 +0000 Subject: [PATCH 8/9] Revert "Replace splitSameElementByFieldsNoForce with prefix-based child filtering" This reverts commit 0b69f905167069890cb7ce0c6da6b55c6e2e9102. --- .../proton/matching/field_splitter.cpp | 66 ++++++------------- 1 file changed, 21 insertions(+), 45 deletions(-) diff --git a/searchcore/src/vespa/searchcore/proton/matching/field_splitter.cpp b/searchcore/src/vespa/searchcore/proton/matching/field_splitter.cpp index da03ae9fe32..e81ceb7bcff 100644 --- a/searchcore/src/vespa/searchcore/proton/matching/field_splitter.cpp +++ b/searchcore/src/vespa/searchcore/proton/matching/field_splitter.cpp @@ -138,7 +138,6 @@ class FieldSplitterVisitor : public search::query::CustomTypeVisitor _field_name_prefix.size() && - field_name[_field_name_prefix.size()] == '.' && - field_name.compare(0, _field_name_prefix.size(), _field_name_prefix) == 0; - } - // ===== Field Analysis Helpers ===== // Helper to get field IDs from a ProtonTermData node @@ -226,18 +215,13 @@ class FieldSplitterVisitor : public search::query::CustomTypeVisitor& fields) { + _builder.addOr(fields.size()); + for ([[maybe_unused]] uint32_t field_id : fields) { createSameElementReplica(node); - std::string saved_prefix = _field_name_prefix; - _field_name_prefix = node.field(fi).getName(); visitNodes(node.getChildren()); - _field_name_prefix = saved_prefix; } } @@ -376,21 +360,16 @@ class FieldSplitterVisitor : public search::query::CustomTypeVisitor matching_indices; - for (size_t i = 0; i < node.numFields(); ++i) { - if (fieldNameMatchesPrefix(node.field(i).getName())) { - matching_indices.push_back(i); - } - } + // Normal case: split across all fields + size_t num_fields = node.numFields(); - if (matching_indices.size() <= 1) { - // No splitting needed - replicate using first matching field (or field 0 as fallback) - replicateTermForField(node, matching_indices.empty() ? 0 : matching_indices[0]); + if (num_fields <= 1) { + // No splitting needed - just replicate the node as-is + replicateTermForField(node, 0); } else { - // Multiple matching fields - create OR with one term per field - _builder.addOr(matching_indices.size()); - for (size_t i : matching_indices) { + // Multiple fields - create OR with one term per field + _builder.addOr(num_fields); + for (size_t i = 0; i < num_fields; ++i) { replicateTermForField(node, i); } } @@ -608,9 +587,9 @@ class FieldSplitterVisitor : public search::query::CustomTypeVisitor void FieldSplitterVisitor::replicateTermForField(Proto copyState(node, replica); copyProtonTermDataForField(node, replica, field_idx); - // Phrase children don't have fields - clear _force_field_id and _field_name_prefix so they pass through as-is + // Phrase children don't have fields - clear _force_field_id so they pass through as-is uint32_t saved_field_id = _force_field_id; - std::string saved_prefix = _field_name_prefix; _force_field_id = search::fef::IllegalFieldId; - _field_name_prefix.clear(); visitNodes(node.getChildren()); _force_field_id = saved_field_id; - _field_name_prefix = saved_prefix; } template <> @@ -762,10 +738,10 @@ template <> void FieldSplitterVisitor::replicateTermForField(Pro } // namespace Node::UP FieldSplitter::split_terms(Node::UP root) { - if (LOG_WOULD_LOG(info)) { + if (LOG_WOULD_LOG(debug)) { search::query::QueryToProtobuf converter; auto proto_tree = converter.serialize(*root); - LOG(error, "field splitting input tree:\n%s", search::common::protobuf_message_to_json(proto_tree).c_str()); + LOG(debug, "field splitting input tree:\n%s", search::common::protobuf_message_to_json(proto_tree).c_str()); } FieldSplitterVisitor visitor; root->accept(visitor); @@ -775,10 +751,10 @@ Node::UP FieldSplitter::split_terms(Node::UP root) { LOG(debug, "field splitting failed, returning original tree"); return root; } - if (LOG_WOULD_LOG(info)) { + if (LOG_WOULD_LOG(debug)) { search::query::QueryToProtobuf converter; auto proto_tree = converter.serialize(*result); - LOG(error, "field splitting completed, result tree:\n%s", + LOG(debug, "field splitting completed, result tree:\n%s", search::common::protobuf_message_to_json(proto_tree).c_str()); } return result; From f0f57ff9226f11a603907a0df56e900c5cf5ef8e Mon Sep 17 00:00:00 2001 From: Arne Juul Date: Wed, 11 Mar 2026 13:39:57 +0000 Subject: [PATCH 9/9] Fix SameElement: copy field data and drop mixed-field case Two bugs fixed in FieldSplitter's SameElement handling: 1. splitSameElementByFields was creating per-field SameElement replicas without copying the field entry (ProtonTermData) from the original node. Added the missing copyProtonTermDataForField call so each replica gets its correct field. The method now derives the field list from node.numFields() directly instead of requiring a separate fields set parameter. 2. The variant-(c) path (splitSameElementByFieldsNoForce) was structurally wrong: it split the SameElement into per-field replicas but then visited children without forcing a field, so children would be split across all their own fields independently -- producing query trees where the per-field SameElements contained children bound to the wrong fields. Since this case (mixed child field sets inside a multi-field SameElement) should not arise in valid queries, it is now treated as an error with a safe pass-through fallback. --- .../proton/matching/field_splitter.cpp | 34 +++++++------------ 1 file changed, 13 insertions(+), 21 deletions(-) diff --git a/searchcore/src/vespa/searchcore/proton/matching/field_splitter.cpp b/searchcore/src/vespa/searchcore/proton/matching/field_splitter.cpp index e81ceb7bcff..e2215c284b5 100644 --- a/searchcore/src/vespa/searchcore/proton/matching/field_splitter.cpp +++ b/searchcore/src/vespa/searchcore/proton/matching/field_splitter.cpp @@ -118,8 +118,7 @@ genericAddSimpleTerm(ProtonBuilder& builder, const * b) Multiple fields, all children have the same field set as the SameElement: * split into OR of per-field SameElements, forcing each child to use that field * c) Multiple fields, children have different/mixed fields: - * split into OR of per-field SameElements, but children are visited normally - * without forcing, so each child handles its own field splitting independently + * not handled. * - Equiv nodes: Gathers children by field, creating one Equiv per field * - Multi-term nodes: WeightedSet, DotProduct, WandTerm, InTerm, WordAlternatives * - Forced field mode: When inside a SameElement (variant b), children must use the SameElement's field @@ -206,22 +205,14 @@ class FieldSplitterVisitor : public search::query::CustomTypeVisitor& fields) { - _builder.addOr(fields.size()); - for (uint32_t field_id : fields) { - createSameElementReplica(node); - splitAndVisitChildrenForField(node.getChildren(), field_id); - } - } - - // Helper for SameElement variant (c): multiple fields, children have different/mixed fields. - // Creates OR of per-field SameElements, but children are visited without forcing, - // so each child independently handles its own field splitting. - void splitSameElementByFieldsNoForce(ProtonSameElement& node, const std::set& fields) { - _builder.addOr(fields.size()); - for ([[maybe_unused]] uint32_t field_id : fields) { - createSameElementReplica(node); - visitNodes(node.getChildren()); + void splitSameElementByFields(ProtonSameElement& node) { + // Normal case: split across all fields + size_t num_fields = node.numFields(); + _builder.addOr(num_fields); + for (size_t i = 0; i < num_fields; ++i) { + auto& replica = createSameElementReplica(node); + copyProtonTermDataForField(node, replica, i); + splitAndVisitChildrenForField(node.getChildren(), node.field(i).getFieldId()); } } @@ -585,11 +576,12 @@ class FieldSplitterVisitor : public search::query::CustomTypeVisitor