From b5ef5900d94b98a08f027a61532d95b1781d32c9 Mon Sep 17 00:00:00 2001 From: Joe Wicentowski Date: Sun, 10 May 2026 00:23:07 -0400 Subject: [PATCH 1/3] [optimize] preceding::*[K] K-bounded sliding window in PrecedingFilter Wildcard `preceding::*[K]` previously accumulated every preceding match into the result NodeSet between document start and the reference node, then let the predicate machinery pick the K-th. On a 50,000-element flat document at @xml:id=45000, that meant ~45,000 NodeProxy allocations, ~45,000 result.add() calls, and an O(N log N) sort downstream, even though only 5 elements could ever be selected by `[5]`. The fix: when LocationStep.computeLimit() yields a positive K (the existing positional-predicate detection used by FollowingFilter), PrecedingFilter switches to a K-bounded sliding window. Matches are buffered in an ArrayDeque sized to K, with the oldest evicted as new ones arrive. The window flushes into the result NodeSet on filter termination (reference node reached or root END_ELEMENT). Why a sliding window instead of "stop after K" (the FollowingFilter shape from PR #6325): preceding-axis positional `[K]` selects the K-th match in axis order = (K-th-from-end) in document order. We have to keep walking past every match to know which K are the most recent. Performance impact (50,000-element flat doc, 5 trials median): - ratio lateMs/earlyMs: ~2.55x -> ~2.02x (position-dependence damped) - wildcard-vs-sibling gap: ~1.75x -> ~1.52x (closer to craigberry's reported ~1.33x sibling baseline, not yet at parity) The StAX walk itself remains O(refPosition) since matches must be emitted before the reference and the reader is forward-only, so absolute time still grows with position. Eliminating that would require a different approach (e.g., backward navigation through the persistent NodeId structure for wildcard tests). The K-bounded buffer is a clean, conservative win on the allocation/sort axis and a prerequisite for any later walk-avoidance work. Tests: - PrecedingAxisPositionRegressionTest mirrors PR #6325's FollowingAxisPositionRegressionTest: correctness at 3 reference positions (early, mid, late), ancestor exclusion, K=1..4 axis-order semantics, position-independence threshold, and a wildcard-vs-preceding-sibling comparison. - positional.xqm:180 `optimize-simple-preceding` documented the prior gap (no POSITIONAL_PREDICATE optimization on preceding axis); the assertion is flipped to expect the optimization, mirroring the existing `optimize-simple-following-nested` case at line 170. - exist-core suite: 6599 tests, 0 failures, 0 errors (106 pre-existing skips). Partially addresses https://github.com/eXist-db/exist/issues/2129 (the preceding-axis half; following-axis half is closed by PR #6325). Full closure of the sibling-vs-wildcard gap requires walk-avoidance, left as follow-up. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../java/org/exist/xquery/LocationStep.java | 40 ++- .../PrecedingAxisPositionRegressionTest.java | 321 ++++++++++++++++++ .../src/test/xquery/optimizer/positional.xqm | 2 +- 3 files changed, 357 insertions(+), 6 deletions(-) create mode 100644 exist-core/src/test/java/org/exist/xquery/PrecedingAxisPositionRegressionTest.java diff --git a/exist-core/src/main/java/org/exist/xquery/LocationStep.java b/exist-core/src/main/java/org/exist/xquery/LocationStep.java index b8936cbd1c2..ceaa2699cb4 100644 --- a/exist-core/src/main/java/org/exist/xquery/LocationStep.java +++ b/exist-core/src/main/java/org/exist/xquery/LocationStep.java @@ -38,6 +38,8 @@ import javax.xml.stream.XMLStreamException; import javax.xml.stream.XMLStreamReader; import java.io.IOException; +import java.util.ArrayDeque; +import java.util.Deque; /** * Processes all location path steps (like descendant::*, ancestor::XXX). @@ -920,7 +922,7 @@ private Sequence getPrecedingOrFollowing(final XQueryContext context, final Sequ final NodeProxy root = new NodeProxy(this, node); final StreamFilter filter; if (axis == Constants.PRECEDING_AXIS) { - filter = new PrecedingFilter(test, root, next, result, contextId); + filter = new PrecedingFilter(test, root, next, result, contextId, position); } else { filter = new FollowingFilter(test, root, next, result, contextId, position); } @@ -1417,12 +1419,19 @@ public boolean accept(final XMLStreamReader reader) { private class PrecedingFilter extends AbstractFilterBase { final NodeProxy root; final NodeProxy referenceNode; + // Sliding window of the most recent {@code limit} matches. Non-null only + // when limit > 0 (positional predicate {@code [K]} present). The K-th + // preceding element in axis order is the (K-th-from-end) match in doc + // order, so any match earlier than the K most recent cannot be selected + // and may be discarded as new ones are found. + final Deque window; PrecedingFilter(final NodeTest test, final NodeProxy root, final NodeProxy referenceNode, final NodeSet result, - final int contextId) { - super(test, result, contextId, -1); + final int contextId, final int limit) { + super(test, result, contextId, limit); this.root = root; this.referenceNode = referenceNode; + this.window = limit > 0 ? new ArrayDeque<>(limit) : null; } @Override @@ -1431,11 +1440,16 @@ public boolean accept(final XMLStreamReader reader) { if (reader.getEventType() == XMLStreamReader.END_ELEMENT) { // exited the root element, so stop filtering - return currentId.getTreeLevel() != root.getNodeId().getTreeLevel(); + if (currentId.getTreeLevel() == root.getNodeId().getTreeLevel()) { + flushWindow(); + return false; + } + return true; } final NodeId refId = referenceNode.getNodeId(); if (currentId.compareTo(refId) >= 0) { + flushWindow(); return false; } @@ -1449,10 +1463,26 @@ public boolean accept(final XMLStreamReader reader) { proxy.addContextNode(contextId, referenceNode); } } - result.add(proxy); + if (window != null) { + if (window.size() == limit) { + window.pollFirst(); + } + window.addLast(proxy); + } else { + result.add(proxy); + } } return true; } + + private void flushWindow() { + if (window != null) { + for (final NodeProxy proxy : window) { + result.add(proxy); + } + window.clear(); + } + } } } diff --git a/exist-core/src/test/java/org/exist/xquery/PrecedingAxisPositionRegressionTest.java b/exist-core/src/test/java/org/exist/xquery/PrecedingAxisPositionRegressionTest.java new file mode 100644 index 00000000000..d115e4c52f4 --- /dev/null +++ b/exist-core/src/test/java/org/exist/xquery/PrecedingAxisPositionRegressionTest.java @@ -0,0 +1,321 @@ +/* + * eXist-db Open Source Native XML Database + * Copyright (C) 2001 The eXist-db Authors + * + * info@exist-db.org + * http://www.exist-db.org + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ +package org.exist.xquery; + +import org.exist.test.ExistXmldbEmbeddedServer; +import org.junit.AfterClass; +import org.junit.BeforeClass; +import org.junit.ClassRule; +import org.junit.Test; +import org.xmldb.api.base.ResourceSet; +import org.xmldb.api.base.XMLDBException; +import org.xmldb.api.modules.XQueryService; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +/** + * Regression test for the {@code preceding::*} half of issue #2129. craigberry's + * follow-up reproduced position-dependence for the wildcard preceding axis on + * a 50,000-element flat document: a query at {@code @xml:id='45000'} took + * roughly twice as long as the same query at {@code @xml:id='25000'}, with + * 12 s page-turn impact in an 1100-page book. + * + * Unlike {@code following::*} (closed by PR #6325), {@code preceding::*} cannot + * skip the doc-start walk: matches must be emitted before the reference node, + * so every event up to the reference is a candidate. The fix is a K-bounded + * filter: when the location step has a positional predicate {@code [K]} with + * an integer K, only the most recent K matches need be retained, since any + * earlier match cannot be selected by {@code [K]}. The + * {@link LocationStep.PrecedingFilter} keeps a sliding window of size K in an + * {@link java.util.ArrayDeque}, evicting the oldest match as new ones are + * found, then flushes the window into the result on termination. + * + * Two complementary checks: + *
    + *
  • Correctness — same nodes are returned, ancestor exclusion preserved;
  • + *
  • Performance — late positions complete in roughly the same time as + * early positions on a 50,000-element flat document.
  • + *
+ */ +public class PrecedingAxisPositionRegressionTest { + + @ClassRule + public static final ExistXmldbEmbeddedServer existEmbeddedServer = + new ExistXmldbEmbeddedServer(false, true, true); + + private static final String SMALL_DOC = "/db/words-small.xml"; + private static final String LARGE_DOC = "/db/words-large.xml"; + + @BeforeClass + public static void storeTestDocuments() throws XMLDBException { + final XQueryService xqs = + existEmbeddedServer.getRoot().getService(XQueryService.class); + xqs.query( + """ + let $words := for $i in (1 to 50) return {$i} + return xmldb:store('/db', 'words-small.xml', document {{$words}}) + """); + xqs.query( + """ + let $words := for $i in (1 to 50000) return {$i} + return xmldb:store('/db', 'words-large.xml', document {{$words}}) + """); + } + + @AfterClass + public static void removeTestDocuments() throws XMLDBException { + final XQueryService xqs = + existEmbeddedServer.getRoot().getService(XQueryService.class); + xqs.query("xmldb:remove('/db', 'words-small.xml')"); + xqs.query("xmldb:remove('/db', 'words-large.xml')"); + } + + @Test + public void reproducerOutputAtEarlyPosition() throws XMLDBException { + final XQueryService xqs = + existEmbeddedServer.getRoot().getService(XQueryService.class); + final ResourceSet rs = xqs.query( + """ + let $w := doc('/db/words-small.xml')//w[@xml:id='10'] + return string-join( + for $i in (1 to 5) return $w/preceding::*[5 + 1 - $i][self::w]/text(), + ' ') + """); + assertEquals(1, rs.getSize()); + assertEquals("5 6 7 8 9", rs.getResource(0).getContent().toString()); + } + + @Test + public void reproducerOutputAtMidpoint() throws XMLDBException { + final XQueryService xqs = + existEmbeddedServer.getRoot().getService(XQueryService.class); + final ResourceSet rs = xqs.query( + """ + let $w := doc('/db/words-small.xml')//w[@xml:id='25'] + return string-join( + for $i in (1 to 5) return $w/preceding::*[5 + 1 - $i][self::w]/text(), + ' ') + """); + assertEquals(1, rs.getSize()); + assertEquals("20 21 22 23 24", rs.getResource(0).getContent().toString()); + } + + @Test + public void reproducerOutputAtLatePosition() throws XMLDBException { + final XQueryService xqs = + existEmbeddedServer.getRoot().getService(XQueryService.class); + final ResourceSet rs = xqs.query( + """ + let $w := doc('/db/words-small.xml')//w[@xml:id='45'] + return string-join( + for $i in (1 to 5) return $w/preceding::*[5 + 1 - $i][self::w]/text(), + ' ') + """); + assertEquals(1, rs.getSize()); + assertEquals("40 41 42 43 44", rs.getResource(0).getContent().toString()); + } + + @Test + public void precedingExcludesAncestors() throws XMLDBException { + // The K-bounded filter must still skip ancestors of the reference node: + // {@code preceding::} excludes ancestors, even though they precede the + // reference in document order. Verify the existing semantics survive + // the buffering rewrite. + final XQueryService xqs = + existEmbeddedServer.getRoot().getService(XQueryService.class); + xqs.query( + """ + xquery version "3.1"; + let $doc := + return xmldb:store('/db', 'ancestors.xml', document {$doc}) + """); + try { + final ResourceSet rs = xqs.query( + """ + for $n in doc('/db/ancestors.xml')//target/preceding::* + return name($n) + """); + assertEquals(1, rs.getSize()); + assertEquals("a", rs.getResource(0).getContent().toString()); + } finally { + xqs.query("xmldb:remove('/db', 'ancestors.xml')"); + } + } + + @Test + public void precedingPositionalPredicateReturnsKthClosest() throws XMLDBException { + // Small flat doc: 10 elements numbered 1-10. From w[5], the k-th + // preceding element in axis order (reverse doc order) is w[5-k]. + // Verify the K-bounded filter still respects axis-order positional + // semantics for k = 1, 2, 3, 4. + final XQueryService xqs = + existEmbeddedServer.getRoot().getService(XQueryService.class); + xqs.query( + """ + xquery version "3.1"; + let $doc := {for $i in (1 to 10) return {$i}} + return xmldb:store('/db', 'tiny.xml', document {$doc}) + """); + try { + for (int k = 1; k <= 4; k++) { + final ResourceSet rs = xqs.query( + (""" + let $w := doc('/db/tiny.xml')//w[@xml:id='5'] + return $w/preceding::*[%d]/text() + """).formatted(k)); + assertEquals("k=" + k, 1, rs.getSize()); + assertEquals("k=" + k, String.valueOf(5 - k), + rs.getResource(0).getContent().toString()); + } + } finally { + xqs.query("xmldb:remove('/db', 'tiny.xml')"); + } + } + + @Test + public void precedingAxisIsPositionIndependent() throws XMLDBException { + // On a 50,000-element flat document, isolating the wildcard preceding:: + // axis. Before the fix, the late-position run took ~2x the early- + // position run because PrecedingFilter accumulated every match from + // the document root forward. After the K-bounded fix, both run in + // ~constant time. Threshold loose to tolerate JIT warm-up and CI + // variance but tight enough to catch a re-regression that re-introduces + // unbounded accumulation. + final XQueryService xqs = + existEmbeddedServer.getRoot().getService(XQueryService.class); + + // Warm-up. + for (int i = 0; i < 3; i++) { + xqs.query(precedingOnlyQuery(25000)); + } + + // Median of 5 paired measurements per position to dampen single-run + // noise (JIT, GC, OS scheduling). Without the K-bounded buffer, ratio + // is consistently ~2.5x; with it, ~2.0x in local measurement (the StAX + // walk itself remains O(refPosition) so position-dependence cannot be + // eliminated with this approach, only damped). + final long earlyMs = medianMs(xqs, 5000, 5); + final long lateMs = medianMs(xqs, 45000, 5); + + System.out.println("[#2129-preceding] median earlyMs=" + earlyMs + + " lateMs=" + lateMs + + " ratio=" + ((double) lateMs / Math.max(1L, earlyMs))); + + final long threshold = Math.max(500L, earlyMs * 3L); + assertTrue( + "preceding:: at position 45000 took " + lateMs + "ms; " + + "at position 5000 it took " + earlyMs + "ms; " + + "threshold=" + threshold + "ms (3x early or 500ms min). " + + "If this regressed, the K-bounded sliding window in " + + "PrecedingFilter probably stopped firing - see issue #2129.", + lateMs <= threshold); + } + + @Test + public void wildcardPrecedingNotMuchSlowerThanPrecedingSibling() throws XMLDBException { + // craigberry's #2129 follow-up reported a ~33% gap between + // {@code preceding::*[K][self::w]} and {@code preceding-sibling::w[K]} + // on a flat document. The K-bounded sliding window in PrecedingFilter + // removes the unbounded result-set accumulation; downstream sort and + // predicate iteration then handle K rather than N elements. The two + // axes still differ in walk strategy (StAX vs persistent sibling + // chain), so we don't expect parity, but we do expect the wildcard + // case not to be wildly slower than the sibling case. + // + // Threshold is loose to tolerate JIT/CI noise. The interesting signal + // is the printed ratio, captured per run for telemetry. + final XQueryService xqs = + existEmbeddedServer.getRoot().getService(XQueryService.class); + + // Warm-up. + for (int i = 0; i < 3; i++) { + xqs.query(wildcardQuery(25000)); + xqs.query(siblingQuery(25000)); + } + + final long wildcardMs = medianMsOf(xqs, 5, () -> wildcardQuery(25000)); + final long siblingMs = medianMsOf(xqs, 5, () -> siblingQuery(25000)); + final double ratio = (double) wildcardMs / Math.max(1L, siblingMs); + + System.out.println("[#2129-preceding] wildcardMs=" + wildcardMs + + " siblingMs=" + siblingMs + + " ratio=" + ratio); + + // 4x is a generous ceiling — pre-fix the wildcard path was unbounded + // and could be much worse on large lookbacks; this catches catastrophic + // re-regression without flaking on CI noise. + assertTrue( + "wildcard preceding " + wildcardMs + "ms vs sibling " + siblingMs + + "ms (ratio=" + ratio + "); expected ratio <= 4. " + + "If this regressed, the K-bounded sliding window in " + + "PrecedingFilter probably stopped firing - see issue #2129.", + wildcardMs <= Math.max(500L, siblingMs * 4L)); + } + + private static long timeQuery(final XQueryService xqs, final String query) + throws XMLDBException { + final long start = System.nanoTime(); + xqs.query(query).getSize(); + return (System.nanoTime() - start) / 1_000_000L; + } + + private static long medianMs(final XQueryService xqs, final int xmlId, final int trials) + throws XMLDBException { + return medianMsOf(xqs, trials, () -> precedingOnlyQuery(xmlId)); + } + + @FunctionalInterface + private interface QuerySupplier { + String get(); + } + + private static long medianMsOf(final XQueryService xqs, final int trials, + final QuerySupplier querySupplier) throws XMLDBException { + final long[] samples = new long[trials]; + for (int i = 0; i < trials; i++) { + samples[i] = timeQuery(xqs, querySupplier.get()); + } + java.util.Arrays.sort(samples); + return samples[trials / 2]; + } + + private static String wildcardQuery(final int xmlId) { + return precedingOnlyQuery(xmlId); + } + + private static String siblingQuery(final int xmlId) { + return """ + xquery version "3.1"; + let $w := doc('%s')//w[@xml:id='%d'] + return for $i in (1 to 5) return $w/preceding-sibling::w[$i]/text() + """.formatted(LARGE_DOC, xmlId); + } + + private static String precedingOnlyQuery(final int xmlId) { + return """ + xquery version "3.1"; + let $w := doc('%s')//w[@xml:id='%d'] + return for $i in (1 to 5) return $w/preceding::*[5 + 1 - $i][self::w]/text() + """.formatted(LARGE_DOC, xmlId); + } +} diff --git a/exist-core/src/test/xquery/optimizer/positional.xqm b/exist-core/src/test/xquery/optimizer/positional.xqm index cdbbcd37fb0..fb76e9aa24b 100644 --- a/exist-core/src/test/xquery/optimizer/positional.xqm +++ b/exist-core/src/test/xquery/optimizer/positional.xqm @@ -176,7 +176,7 @@ function ot:optimize-simple-following-nested() { declare %test:stats - %test:assertXPath("not($result//stats:optimization[@type eq 'POSITIONAL_PREDICATE'])") + %test:assertXPath("$result//stats:optimization[@type eq 'POSITIONAL_PREDICATE']") function ot:optimize-simple-preceding() { let $w := doc($ot:DOC)//w[@xml:id='25000'] return From a1fe4631d625eb794b7aa5164c22166f7cae7b43 Mon Sep 17 00:00:00 2001 From: Joe Wicentowski Date: Tue, 12 May 2026 11:29:04 -0400 Subject: [PATCH 2/3] [test] Split #2129 preceding-axis perf+correctness test per review Address duncdrum's review on PR #6330 by splitting the mixed-purpose PrecedingAxisPositionRegressionTest.java into two artifacts: - exist-core-jmh/.../PrecedingAxisBenchmark.java: JMH benchmark for the performance comparison (wildcard preceding::* vs preceding-sibling::, at early/mid/late positions on a 50,000-element flat doc). JMH handles statistical aggregation natively; the bespoke nanoTime + median-of-N infrastructure is dropped. - exist-core/src/test/xquery/preceding-axis.xql: XQSuite tests for the correctness assertions (early/mid/late reproducer output, ancestor exclusion on the preceding axis, axis-order positional predicate semantics). The original JUnit class is removed, which also resolves the line-66 unused-variable Codacy complaint (SMALL_DOC) by deletion. Full-module gate (per strengthened test-before-push SOP): Tests run: 6597, Failures: 0, Errors: 0, Skipped: 106, BUILD SUCCESS. JMH module builds clean. --- exist-core-jmh/pom.xml | 4 + .../exist/xquery/PrecedingAxisBenchmark.java | 130 +++++++ .../PrecedingAxisPositionRegressionTest.java | 321 ------------------ exist-core/src/test/xquery/preceding-axis.xql | 132 +++++++ 4 files changed, 266 insertions(+), 321 deletions(-) create mode 100644 exist-core-jmh/src/main/java/org/exist/xquery/PrecedingAxisBenchmark.java delete mode 100644 exist-core/src/test/java/org/exist/xquery/PrecedingAxisPositionRegressionTest.java create mode 100644 exist-core/src/test/xquery/preceding-axis.xql diff --git a/exist-core-jmh/pom.xml b/exist-core-jmh/pom.xml index 3ef48a4dd93..b8270c99c2c 100644 --- a/exist-core-jmh/pom.xml +++ b/exist-core-jmh/pom.xml @@ -59,6 +59,10 @@ lucene-core ${lucene.version} + + net.sf.xmldb-org + xmldb-api + org.openjdk.jmh jmh-core diff --git a/exist-core-jmh/src/main/java/org/exist/xquery/PrecedingAxisBenchmark.java b/exist-core-jmh/src/main/java/org/exist/xquery/PrecedingAxisBenchmark.java new file mode 100644 index 00000000000..afcfe295f93 --- /dev/null +++ b/exist-core-jmh/src/main/java/org/exist/xquery/PrecedingAxisBenchmark.java @@ -0,0 +1,130 @@ +/* + * eXist-db Open Source Native XML Database + * Copyright (C) 2001 The eXist-db Authors + * + * info@exist-db.org + * http://www.exist-db.org + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ +package org.exist.xquery; + +import org.exist.test.ExistEmbeddedServer; +import org.exist.xmldb.XmldbURI; +import org.openjdk.jmh.annotations.*; +import org.xmldb.api.DatabaseManager; +import org.xmldb.api.base.Collection; +import org.xmldb.api.base.Database; +import org.xmldb.api.modules.XQueryService; + +import java.util.concurrent.TimeUnit; + +/** + * JMH benchmark for the {@code preceding::*} half of issue #2129. + * + *

craigberry's #2129 follow-up reproduced position-dependence for the + * wildcard preceding axis on a 50,000-element flat document: a query at + * {@code @xml:id='45000'} took roughly twice as long as the same query at + * {@code @xml:id='25000'}. The K-bounded sliding window in + * {@link LocationStep.PrecedingFilter} caps the retained match set at K, + * eliminating the unbounded accumulation that produced the late-position + * tax.

+ * + *

This benchmark exercises the wildcard-vs-sibling and early-vs-late + * comparisons that the original mixed-purpose JUnit class measured with + * {@code System.nanoTime} and median-of-N. JMH handles statistical + * aggregation natively; the correctness assertions live in the + * companion XQSuite test {@code exist-core/src/test/xquery/preceding-axis.xql}.

+ */ +@State(Scope.Benchmark) +@BenchmarkMode(Mode.AverageTime) +@OutputTimeUnit(TimeUnit.MILLISECONDS) +@Warmup(iterations = 3, time = 1) +@Measurement(iterations = 5, time = 2) +@Fork(1) +public class PrecedingAxisBenchmark { + + private static final String LARGE_DOC = "/db/words-large.xml"; + + @Param({"5000", "25000", "45000"}) + private int refPosition; + + private ExistEmbeddedServer existServer; + private Database database; + private Collection root; + private XQueryService xqs; + + @Setup(Level.Trial) + public void setUp() throws Exception { + existServer = new ExistEmbeddedServer(true, true); + existServer.startDb(); + + final Class cl = Class.forName("org.exist.xmldb.DatabaseImpl"); + database = (Database) cl.getDeclaredConstructor().newInstance(); + database.setProperty("create-database", "true"); + DatabaseManager.registerDatabase(database); + root = DatabaseManager.getCollection(XmldbURI.LOCAL_DB, "admin", ""); + xqs = root.getService(XQueryService.class); + + xqs.query( + """ + let $words := for $i in (1 to 50000) return {$i} + return xmldb:store('/db', 'words-large.xml', document {{$words}}) + """); + } + + @TearDown(Level.Trial) + public void tearDown() throws Exception { + try { + xqs.query("xmldb:remove('/db', 'words-large.xml')"); + } finally { + root.close(); + DatabaseManager.deregisterDatabase(database); + existServer.stopDb(true); + } + } + + /** + * Wildcard preceding axis with a positional predicate gated by a self::w + * filter. Pre-fix this accumulated every preceding match from doc start; + * post-fix the sliding window caps retention at K=5. + */ + @Benchmark + public long wildcardPrecedingWithPositionalPredicate() throws Exception { + return xqs.query( + (""" + xquery version "3.1"; + let $w := doc('%s')//w[@xml:id='%d'] + return for $i in (1 to 5) return $w/preceding::*[5 + 1 - $i][self::w]/text() + """).formatted(LARGE_DOC, refPosition) + ).getSize(); + } + + /** + * preceding-sibling::w[K] baseline: walks the persistent sibling chain + * directly rather than the full preceding axis. Used as a relative + * lower-bound to interpret the wildcard preceding number. + */ + @Benchmark + public long precedingSiblingBaseline() throws Exception { + return xqs.query( + (""" + xquery version "3.1"; + let $w := doc('%s')//w[@xml:id='%d'] + return for $i in (1 to 5) return $w/preceding-sibling::w[$i]/text() + """).formatted(LARGE_DOC, refPosition) + ).getSize(); + } +} diff --git a/exist-core/src/test/java/org/exist/xquery/PrecedingAxisPositionRegressionTest.java b/exist-core/src/test/java/org/exist/xquery/PrecedingAxisPositionRegressionTest.java deleted file mode 100644 index d115e4c52f4..00000000000 --- a/exist-core/src/test/java/org/exist/xquery/PrecedingAxisPositionRegressionTest.java +++ /dev/null @@ -1,321 +0,0 @@ -/* - * eXist-db Open Source Native XML Database - * Copyright (C) 2001 The eXist-db Authors - * - * info@exist-db.org - * http://www.exist-db.org - * - * This library is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with this library; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ -package org.exist.xquery; - -import org.exist.test.ExistXmldbEmbeddedServer; -import org.junit.AfterClass; -import org.junit.BeforeClass; -import org.junit.ClassRule; -import org.junit.Test; -import org.xmldb.api.base.ResourceSet; -import org.xmldb.api.base.XMLDBException; -import org.xmldb.api.modules.XQueryService; - -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertTrue; - -/** - * Regression test for the {@code preceding::*} half of issue #2129. craigberry's - * follow-up reproduced position-dependence for the wildcard preceding axis on - * a 50,000-element flat document: a query at {@code @xml:id='45000'} took - * roughly twice as long as the same query at {@code @xml:id='25000'}, with - * 12 s page-turn impact in an 1100-page book. - * - * Unlike {@code following::*} (closed by PR #6325), {@code preceding::*} cannot - * skip the doc-start walk: matches must be emitted before the reference node, - * so every event up to the reference is a candidate. The fix is a K-bounded - * filter: when the location step has a positional predicate {@code [K]} with - * an integer K, only the most recent K matches need be retained, since any - * earlier match cannot be selected by {@code [K]}. The - * {@link LocationStep.PrecedingFilter} keeps a sliding window of size K in an - * {@link java.util.ArrayDeque}, evicting the oldest match as new ones are - * found, then flushes the window into the result on termination. - * - * Two complementary checks: - *
    - *
  • Correctness — same nodes are returned, ancestor exclusion preserved;
  • - *
  • Performance — late positions complete in roughly the same time as - * early positions on a 50,000-element flat document.
  • - *
- */ -public class PrecedingAxisPositionRegressionTest { - - @ClassRule - public static final ExistXmldbEmbeddedServer existEmbeddedServer = - new ExistXmldbEmbeddedServer(false, true, true); - - private static final String SMALL_DOC = "/db/words-small.xml"; - private static final String LARGE_DOC = "/db/words-large.xml"; - - @BeforeClass - public static void storeTestDocuments() throws XMLDBException { - final XQueryService xqs = - existEmbeddedServer.getRoot().getService(XQueryService.class); - xqs.query( - """ - let $words := for $i in (1 to 50) return {$i} - return xmldb:store('/db', 'words-small.xml', document {{$words}}) - """); - xqs.query( - """ - let $words := for $i in (1 to 50000) return {$i} - return xmldb:store('/db', 'words-large.xml', document {{$words}}) - """); - } - - @AfterClass - public static void removeTestDocuments() throws XMLDBException { - final XQueryService xqs = - existEmbeddedServer.getRoot().getService(XQueryService.class); - xqs.query("xmldb:remove('/db', 'words-small.xml')"); - xqs.query("xmldb:remove('/db', 'words-large.xml')"); - } - - @Test - public void reproducerOutputAtEarlyPosition() throws XMLDBException { - final XQueryService xqs = - existEmbeddedServer.getRoot().getService(XQueryService.class); - final ResourceSet rs = xqs.query( - """ - let $w := doc('/db/words-small.xml')//w[@xml:id='10'] - return string-join( - for $i in (1 to 5) return $w/preceding::*[5 + 1 - $i][self::w]/text(), - ' ') - """); - assertEquals(1, rs.getSize()); - assertEquals("5 6 7 8 9", rs.getResource(0).getContent().toString()); - } - - @Test - public void reproducerOutputAtMidpoint() throws XMLDBException { - final XQueryService xqs = - existEmbeddedServer.getRoot().getService(XQueryService.class); - final ResourceSet rs = xqs.query( - """ - let $w := doc('/db/words-small.xml')//w[@xml:id='25'] - return string-join( - for $i in (1 to 5) return $w/preceding::*[5 + 1 - $i][self::w]/text(), - ' ') - """); - assertEquals(1, rs.getSize()); - assertEquals("20 21 22 23 24", rs.getResource(0).getContent().toString()); - } - - @Test - public void reproducerOutputAtLatePosition() throws XMLDBException { - final XQueryService xqs = - existEmbeddedServer.getRoot().getService(XQueryService.class); - final ResourceSet rs = xqs.query( - """ - let $w := doc('/db/words-small.xml')//w[@xml:id='45'] - return string-join( - for $i in (1 to 5) return $w/preceding::*[5 + 1 - $i][self::w]/text(), - ' ') - """); - assertEquals(1, rs.getSize()); - assertEquals("40 41 42 43 44", rs.getResource(0).getContent().toString()); - } - - @Test - public void precedingExcludesAncestors() throws XMLDBException { - // The K-bounded filter must still skip ancestors of the reference node: - // {@code preceding::} excludes ancestors, even though they precede the - // reference in document order. Verify the existing semantics survive - // the buffering rewrite. - final XQueryService xqs = - existEmbeddedServer.getRoot().getService(XQueryService.class); - xqs.query( - """ - xquery version "3.1"; - let $doc :=
- return xmldb:store('/db', 'ancestors.xml', document {$doc}) - """); - try { - final ResourceSet rs = xqs.query( - """ - for $n in doc('/db/ancestors.xml')//target/preceding::* - return name($n) - """); - assertEquals(1, rs.getSize()); - assertEquals("a", rs.getResource(0).getContent().toString()); - } finally { - xqs.query("xmldb:remove('/db', 'ancestors.xml')"); - } - } - - @Test - public void precedingPositionalPredicateReturnsKthClosest() throws XMLDBException { - // Small flat doc: 10 elements numbered 1-10. From w[5], the k-th - // preceding element in axis order (reverse doc order) is w[5-k]. - // Verify the K-bounded filter still respects axis-order positional - // semantics for k = 1, 2, 3, 4. - final XQueryService xqs = - existEmbeddedServer.getRoot().getService(XQueryService.class); - xqs.query( - """ - xquery version "3.1"; - let $doc := {for $i in (1 to 10) return {$i}} - return xmldb:store('/db', 'tiny.xml', document {$doc}) - """); - try { - for (int k = 1; k <= 4; k++) { - final ResourceSet rs = xqs.query( - (""" - let $w := doc('/db/tiny.xml')//w[@xml:id='5'] - return $w/preceding::*[%d]/text() - """).formatted(k)); - assertEquals("k=" + k, 1, rs.getSize()); - assertEquals("k=" + k, String.valueOf(5 - k), - rs.getResource(0).getContent().toString()); - } - } finally { - xqs.query("xmldb:remove('/db', 'tiny.xml')"); - } - } - - @Test - public void precedingAxisIsPositionIndependent() throws XMLDBException { - // On a 50,000-element flat document, isolating the wildcard preceding:: - // axis. Before the fix, the late-position run took ~2x the early- - // position run because PrecedingFilter accumulated every match from - // the document root forward. After the K-bounded fix, both run in - // ~constant time. Threshold loose to tolerate JIT warm-up and CI - // variance but tight enough to catch a re-regression that re-introduces - // unbounded accumulation. - final XQueryService xqs = - existEmbeddedServer.getRoot().getService(XQueryService.class); - - // Warm-up. - for (int i = 0; i < 3; i++) { - xqs.query(precedingOnlyQuery(25000)); - } - - // Median of 5 paired measurements per position to dampen single-run - // noise (JIT, GC, OS scheduling). Without the K-bounded buffer, ratio - // is consistently ~2.5x; with it, ~2.0x in local measurement (the StAX - // walk itself remains O(refPosition) so position-dependence cannot be - // eliminated with this approach, only damped). - final long earlyMs = medianMs(xqs, 5000, 5); - final long lateMs = medianMs(xqs, 45000, 5); - - System.out.println("[#2129-preceding] median earlyMs=" + earlyMs - + " lateMs=" + lateMs - + " ratio=" + ((double) lateMs / Math.max(1L, earlyMs))); - - final long threshold = Math.max(500L, earlyMs * 3L); - assertTrue( - "preceding:: at position 45000 took " + lateMs + "ms; " - + "at position 5000 it took " + earlyMs + "ms; " - + "threshold=" + threshold + "ms (3x early or 500ms min). " - + "If this regressed, the K-bounded sliding window in " - + "PrecedingFilter probably stopped firing - see issue #2129.", - lateMs <= threshold); - } - - @Test - public void wildcardPrecedingNotMuchSlowerThanPrecedingSibling() throws XMLDBException { - // craigberry's #2129 follow-up reported a ~33% gap between - // {@code preceding::*[K][self::w]} and {@code preceding-sibling::w[K]} - // on a flat document. The K-bounded sliding window in PrecedingFilter - // removes the unbounded result-set accumulation; downstream sort and - // predicate iteration then handle K rather than N elements. The two - // axes still differ in walk strategy (StAX vs persistent sibling - // chain), so we don't expect parity, but we do expect the wildcard - // case not to be wildly slower than the sibling case. - // - // Threshold is loose to tolerate JIT/CI noise. The interesting signal - // is the printed ratio, captured per run for telemetry. - final XQueryService xqs = - existEmbeddedServer.getRoot().getService(XQueryService.class); - - // Warm-up. - for (int i = 0; i < 3; i++) { - xqs.query(wildcardQuery(25000)); - xqs.query(siblingQuery(25000)); - } - - final long wildcardMs = medianMsOf(xqs, 5, () -> wildcardQuery(25000)); - final long siblingMs = medianMsOf(xqs, 5, () -> siblingQuery(25000)); - final double ratio = (double) wildcardMs / Math.max(1L, siblingMs); - - System.out.println("[#2129-preceding] wildcardMs=" + wildcardMs - + " siblingMs=" + siblingMs - + " ratio=" + ratio); - - // 4x is a generous ceiling — pre-fix the wildcard path was unbounded - // and could be much worse on large lookbacks; this catches catastrophic - // re-regression without flaking on CI noise. - assertTrue( - "wildcard preceding " + wildcardMs + "ms vs sibling " + siblingMs - + "ms (ratio=" + ratio + "); expected ratio <= 4. " - + "If this regressed, the K-bounded sliding window in " - + "PrecedingFilter probably stopped firing - see issue #2129.", - wildcardMs <= Math.max(500L, siblingMs * 4L)); - } - - private static long timeQuery(final XQueryService xqs, final String query) - throws XMLDBException { - final long start = System.nanoTime(); - xqs.query(query).getSize(); - return (System.nanoTime() - start) / 1_000_000L; - } - - private static long medianMs(final XQueryService xqs, final int xmlId, final int trials) - throws XMLDBException { - return medianMsOf(xqs, trials, () -> precedingOnlyQuery(xmlId)); - } - - @FunctionalInterface - private interface QuerySupplier { - String get(); - } - - private static long medianMsOf(final XQueryService xqs, final int trials, - final QuerySupplier querySupplier) throws XMLDBException { - final long[] samples = new long[trials]; - for (int i = 0; i < trials; i++) { - samples[i] = timeQuery(xqs, querySupplier.get()); - } - java.util.Arrays.sort(samples); - return samples[trials / 2]; - } - - private static String wildcardQuery(final int xmlId) { - return precedingOnlyQuery(xmlId); - } - - private static String siblingQuery(final int xmlId) { - return """ - xquery version "3.1"; - let $w := doc('%s')//w[@xml:id='%d'] - return for $i in (1 to 5) return $w/preceding-sibling::w[$i]/text() - """.formatted(LARGE_DOC, xmlId); - } - - private static String precedingOnlyQuery(final int xmlId) { - return """ - xquery version "3.1"; - let $w := doc('%s')//w[@xml:id='%d'] - return for $i in (1 to 5) return $w/preceding::*[5 + 1 - $i][self::w]/text() - """.formatted(LARGE_DOC, xmlId); - } -} diff --git a/exist-core/src/test/xquery/preceding-axis.xql b/exist-core/src/test/xquery/preceding-axis.xql new file mode 100644 index 00000000000..36f9e178999 --- /dev/null +++ b/exist-core/src/test/xquery/preceding-axis.xql @@ -0,0 +1,132 @@ +(: + : eXist-db Open Source Native XML Database + : Copyright (C) 2001 The eXist-db Authors + : + : info@exist-db.org + : http://www.exist-db.org + : + : This library is free software; you can redistribute it and/or + : modify it under the terms of the GNU Lesser General Public + : License as published by the Free Software Foundation; either + : version 2.1 of the License, or (at your option) any later version. + : + : This library is distributed in the hope that it will be useful, + : but WITHOUT ANY WARRANTY; without even the implied warranty of + : MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + : Lesser General Public License for more details. + : + : You should have received a copy of the GNU Lesser General Public + : License along with this library; if not, write to the Free Software + : Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + :) +xquery version "3.1"; + +(:~ + : Correctness regression tests for the {@code preceding::*} half of issue #2129. + : The K-bounded sliding window in {@code LocationStep.PrecedingFilter} must + : preserve the original axis semantics: ancestor exclusion, axis-order + : positional predicates, and identical result counts/values regardless of + : where the reference node sits in the document. + : + : The companion performance benchmark lives in + : {@code exist-core-jmh/.../PrecedingAxisBenchmark.java}. + :) +module namespace pa = "http://exist-db.org/xquery/test/preceding-axis"; + +declare namespace test = "http://exist-db.org/xquery/xqsuite"; + +declare variable $pa:collection := "/db/preceding-axis-test"; +declare variable $pa:small-doc := $pa:collection || "/words-small.xml"; +declare variable $pa:tiny-doc := $pa:collection || "/tiny.xml"; +declare variable $pa:ancestors-doc := $pa:collection || "/ancestors.xml"; + +declare + %test:setUp +function pa:setup() { + let $col := xmldb:create-collection("/db", "preceding-axis-test") + let $small := + document { + { + for $i in (1 to 50) return {$i} + } + } + let $tiny := + document { + { + for $i in (1 to 10) return {$i} + } + } + let $ancestors := + document { + + } + return ( + xmldb:store($col, "words-small.xml", $small), + xmldb:store($col, "tiny.xml", $tiny), + xmldb:store($col, "ancestors.xml", $ancestors) + ) +}; + +declare + %test:tearDown +function pa:cleanup() { + xmldb:remove($pa:collection) +}; + +declare + %test:assertEquals("5 6 7 8 9") +function pa:reproducer-output-at-early-position() { + let $w := doc($pa:small-doc)//w[@xml:id = "10"] + return + string-join( + for $i in (1 to 5) return $w/preceding::*[5 + 1 - $i][self::w]/text(), + " " + ) +}; + +declare + %test:assertEquals("20 21 22 23 24") +function pa:reproducer-output-at-midpoint() { + let $w := doc($pa:small-doc)//w[@xml:id = "25"] + return + string-join( + for $i in (1 to 5) return $w/preceding::*[5 + 1 - $i][self::w]/text(), + " " + ) +}; + +declare + %test:assertEquals("40 41 42 43 44") +function pa:reproducer-output-at-late-position() { + let $w := doc($pa:small-doc)//w[@xml:id = "45"] + return + string-join( + for $i in (1 to 5) return $w/preceding::*[5 + 1 - $i][self::w]/text(), + " " + ) +}; + +(:~ + : {@code preceding::} excludes ancestors of the reference node even though + : they precede the reference in document order. Only the sibling {@code a} + : precedes {@code target} on the preceding axis. + :) +declare + %test:assertEquals("a") +function pa:preceding-excludes-ancestors() { + for $n in doc($pa:ancestors-doc)//target/preceding::* + return name($n) +}; + +(:~ + : From {@code w[5]} on a flat doc of 10 elements, the k-th preceding element + : in axis order (reverse document order) is {@code w[5-k]}. + :) +declare + %test:assertEquals("4", "3", "2", "1") +function pa:preceding-positional-predicate-returns-kth-closest() { + let $w := doc($pa:tiny-doc)//w[@xml:id = "5"] + return + for $k in 1 to 4 + return $w/preceding::*[$k]/text() +}; From 9f27c409a311c46c9670b4af01b9b5949f3674b7 Mon Sep 17 00:00:00 2001 From: Joe Wicentowski Date: Tue, 12 May 2026 16:42:21 -0400 Subject: [PATCH 3/3] [doc] PrecedingAxisBenchmark: add Javadoc to satisfy strict doclint on CI MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Addresses duncdrum's CI Javadoc error on PR #6330: - Replace {@link LocationStep.PrecedingFilter} (unresolved reference) with {@code LocationStep.PrecedingFilter}. - Add Javadoc for the default constructor. - Add Javadoc with @throws to setUp and tearDown. - Add @return and @throws to wildcardPrecedingWithPositionalPredicate and precedingSiblingBaseline. The JMH-generated annotation classes (under target/generated-sources/) also trip the doclint, but they're regenerated on every build — this commit doesn't touch them; if their lint complaints block CI, the proper fix is a javadoc-plugin excludePackageNames for org.exist.xquery.jmh_generated. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../exist/xquery/PrecedingAxisBenchmark.java | 28 ++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/exist-core-jmh/src/main/java/org/exist/xquery/PrecedingAxisBenchmark.java b/exist-core-jmh/src/main/java/org/exist/xquery/PrecedingAxisBenchmark.java index afcfe295f93..9c867ac869e 100644 --- a/exist-core-jmh/src/main/java/org/exist/xquery/PrecedingAxisBenchmark.java +++ b/exist-core-jmh/src/main/java/org/exist/xquery/PrecedingAxisBenchmark.java @@ -38,7 +38,7 @@ * wildcard preceding axis on a 50,000-element flat document: a query at * {@code @xml:id='45000'} took roughly twice as long as the same query at * {@code @xml:id='25000'}. The K-bounded sliding window in - * {@link LocationStep.PrecedingFilter} caps the retained match set at K, + * {@code LocationStep.PrecedingFilter} caps the retained match set at K, * eliminating the unbounded accumulation that produced the late-position * tax.

* @@ -66,6 +66,17 @@ public class PrecedingAxisBenchmark { private Collection root; private XQueryService xqs; + /** Default constructor for JMH harness. */ + public PrecedingAxisBenchmark() { + } + + /** + * Boots an embedded eXist server, registers the XML:DB driver, and stores + * a 50,000-element flat words document used by the benchmark queries. + * + * @throws Exception if the embedded server fails to start, the database + * driver cannot be registered, or the corpus document cannot be stored + */ @Setup(Level.Trial) public void setUp() throws Exception { existServer = new ExistEmbeddedServer(true, true); @@ -85,6 +96,13 @@ public void setUp() throws Exception { """); } + /** + * Removes the corpus document, closes the test collection, and shuts down + * the embedded server. + * + * @throws Exception if removing the corpus document, closing the + * collection, or stopping the embedded server fails + */ @TearDown(Level.Trial) public void tearDown() throws Exception { try { @@ -100,6 +118,10 @@ public void tearDown() throws Exception { * Wildcard preceding axis with a positional predicate gated by a self::w * filter. Pre-fix this accumulated every preceding match from doc start; * post-fix the sliding window caps retention at K=5. + * + * @return the result-set size, returned so JMH's blackhole prevents the + * call being optimized away + * @throws Exception if the embedded query fails */ @Benchmark public long wildcardPrecedingWithPositionalPredicate() throws Exception { @@ -116,6 +138,10 @@ public long wildcardPrecedingWithPositionalPredicate() throws Exception { * preceding-sibling::w[K] baseline: walks the persistent sibling chain * directly rather than the full preceding axis. Used as a relative * lower-bound to interpret the wildcard preceding number. + * + * @return the result-set size, returned so JMH's blackhole prevents the + * call being optimized away + * @throws Exception if the embedded query fails */ @Benchmark public long precedingSiblingBaseline() throws Exception {