diff --git a/exist-core/src/main/antlr/org/exist/xquery/parser/XQueryTree.g b/exist-core/src/main/antlr/org/exist/xquery/parser/XQueryTree.g index 98520186e65..872301461bd 100644 --- a/exist-core/src/main/antlr/org/exist/xquery/parser/XQueryTree.g +++ b/exist-core/src/main/antlr/org/exist/xquery/parser/XQueryTree.g @@ -1108,8 +1108,6 @@ throws XPathException STAR | ( - // TODO: parameter types are collected, but not used! - // Change SequenceType accordingly. { List paramTypes = new ArrayList(5); } ( { SequenceType paramType = new SequenceType(); } @@ -1118,6 +1116,10 @@ throws XPathException )* { SequenceType returnType = new SequenceType(); } "as" sequenceType [returnType] + { + type.setFunctionParamTypes(paramTypes.toArray(new SequenceType[0])); + type.setFunctionReturnType(returnType); + } ) ) ) @@ -1128,14 +1130,15 @@ throws XPathException STAR | ( - // TODO: parameter types are collected, but not used! - // Change SequenceType accordingly. { List paramTypes = new ArrayList(5); } ( { SequenceType paramType = new SequenceType(); } sequenceType [paramType] { paramTypes.add(paramType); } )* + { + type.setFunctionParamTypes(paramTypes.toArray(new SequenceType[0])); + } ) ) ) @@ -1146,14 +1149,15 @@ throws XPathException STAR | ( - // TODO: parameter types are collected, but not used! - // Change SequenceType accordingly. 
{ List paramTypes = new ArrayList(5); } ( { SequenceType paramType = new SequenceType(); } sequenceType [paramType] { paramTypes.add(paramType); } )* + { + type.setFunctionParamTypes(paramTypes.toArray(new SequenceType[0])); + } ) ) ) diff --git a/exist-core/src/main/java/org/exist/xquery/DynamicTypeCheck.java b/exist-core/src/main/java/org/exist/xquery/DynamicTypeCheck.java index 52a237ba128..c8058c6c791 100644 --- a/exist-core/src/main/java/org/exist/xquery/DynamicTypeCheck.java +++ b/exist-core/src/main/java/org/exist/xquery/DynamicTypeCheck.java @@ -149,6 +149,15 @@ private void check(Sequence result, Item item) throws XPathException { } else if (type == Type.ANY_URI && requiredType == Type.STRING) { item = item.convertTo(Type.STRING); type = Type.STRING; + //Binary type promotion (XQuery 4.0): xs:base64Binary ↔ xs:hexBinary + } else if ((type == Type.BASE64_BINARY && requiredType == Type.HEX_BINARY) + || (type == Type.HEX_BINARY && requiredType == Type.BASE64_BINARY)) { + try { + item = item.convertTo(requiredType); + } catch (final XPathException e) { + throw new XPathException(expression, ErrorCodes.XPTY0004, + "cannot convert " + Type.getTypeName(type) + " to " + Type.getTypeName(requiredType)); + } } else { if (!(Type.subTypeOf(type, requiredType))) { throw new XPathException(expression, typeMismatchError, diff --git a/exist-core/src/main/java/org/exist/xquery/ErrorCodes.java b/exist-core/src/main/java/org/exist/xquery/ErrorCodes.java index 903edab957a..0d2682721d3 100644 --- a/exist-core/src/main/java/org/exist/xquery/ErrorCodes.java +++ b/exist-core/src/main/java/org/exist/xquery/ErrorCodes.java @@ -90,6 +90,7 @@ public class ErrorCodes { public static final ErrorCode XQST0052 = new W3CErrorCode("XQST0052", "It is a static error if the type-name in a single-type or sequence-type for a cast or castable expression does not refer to a defined atomic type."); public static final ErrorCode XQST0053 = new W3CErrorCode("XQST0053", "(Not currently used.)"); 
public static final ErrorCode XQST0054 = new W3CErrorCode("XQST0054", "It is a static error if a variable depends on itself."); + public static final ErrorCode XQDY0054 = new W3CErrorCode("XQDY0054", "It is a dynamic error if a variable depends on itself."); public static final ErrorCode XQST0055 = new W3CErrorCode("XQST0055", "It is a static error if a Prolog contains more than one copy-namespaces declaration."); public static final ErrorCode XQST0056 = new W3CErrorCode("XQST0056", "(Not currently used.)"); public static final ErrorCode XQST0057 = new W3CErrorCode("XQST0057", "It is a static error if a schema import binds a namespace prefix but does not specify a target namespace other than a zero-length string."); diff --git a/exist-core/src/main/java/org/exist/xquery/FunctionFactory.java b/exist-core/src/main/java/org/exist/xquery/FunctionFactory.java index 87b721751af..59f5679c252 100644 --- a/exist-core/src/main/java/org/exist/xquery/FunctionFactory.java +++ b/exist-core/src/main/java/org/exist/xquery/FunctionFactory.java @@ -24,6 +24,7 @@ import java.util.ArrayList; import java.util.Iterator; import java.util.List; +import java.util.Set; import org.exist.Namespaces; import org.exist.dom.QName; @@ -48,6 +49,17 @@ public class FunctionFactory { public static final String PROPERTY_DISABLE_DEPRECATED_FUNCTIONS = "xquery.disable-deprecated-functions"; public static final boolean DISABLE_DEPRECATED_FUNCTIONS_BY_DEFAULT = false; + /** + * Reserved function names per XQuery 3.1/4.0 spec. + * These names must not be used as unprefixed function calls (XPST0003). 
+ */ + private static final Set RESERVED_FUNCTION_NAMES = Set.of( + "array", "attribute", "comment", "document-node", "element", + "function", "if", "item", "map", "namespace-node", "node", + "processing-instruction", "schema-attribute", "schema-element", + "switch", "text", "typeswitch" + ); + public static Expression createFunction(XQueryContext context, XQueryAST ast, PathExpr parent, List params) throws XPathException { QName qname = null; try { @@ -55,6 +67,19 @@ public static Expression createFunction(XQueryContext context, XQueryAST ast, Pa } catch(final QName.IllegalQNameException xpe) { throw new XPathException(ast, ErrorCodes.XPST0081, "Invalid qname " + ast.getText() + ". " + xpe.getMessage()); } + + // Check for reserved function names — unprefixed reserved names cannot be + // used as function calls (XPST0003). Prefixed names like fn:item() are not + // subject to the reserved name restriction (they just won't be found → XPST0017). + final String rawName = ast.getText(); + if (rawName != null && !rawName.contains(":") && !rawName.contains("{")) { + final String local = qname.getLocalPart(); + if (RESERVED_FUNCTION_NAMES.contains(local)) { + throw new XPathException(ast.getLine(), ast.getColumn(), ErrorCodes.XPST0003, + "'" + local + "' is a reserved function name and cannot be used as a function call"); + } + } + return createFunction(context, qname, ast, parent, params); } @@ -508,12 +533,13 @@ public static FunctionCall wrap(XQueryContext context, Function call) throws XPa newSignature.setArgumentTypes(newParamArray); final UserDefinedFunction func = new UserDefinedFunction(context, newSignature); + func.setPassContextToBody(true); for (final QName varName: variables) { func.addVariable(varName); } - + call.setArguments(innerArgs); - + func.setFunctionBody(call); final FunctionCall wrappedCall = new FunctionCall(context, func); diff --git a/exist-core/src/main/java/org/exist/xquery/GeneralComparison.java 
b/exist-core/src/main/java/org/exist/xquery/GeneralComparison.java index 4c523ea4277..575447e1d74 100644 --- a/exist-core/src/main/java/org/exist/xquery/GeneralComparison.java +++ b/exist-core/src/main/java/org/exist/xquery/GeneralComparison.java @@ -1082,7 +1082,17 @@ private AtomicValue convertForValueComparison(final AtomicValue value, final int } /* - * d. Otherwise, a type error is raised [err:XPTY0004]. + * d. (XQuery 4.0) If each operand is an instance of one of the types + * xs:hexBinary or xs:base64Binary, then both operands are cast to + * type xs:base64Binary. + */ + if ((thisType == Type.HEX_BINARY || thisType == Type.BASE64_BINARY) + && (otherType == Type.HEX_BINARY || otherType == Type.BASE64_BINARY)) { + return value.convertTo(Type.BASE64_BINARY); + } + + /* + * e. Otherwise, a type error is raised [err:XPTY0004]. */ throw new XPathException(this, ErrorCodes.XPTY0004, "Incompatible primitive types"); } diff --git a/exist-core/src/main/java/org/exist/xquery/NamedFunctionReference.java b/exist-core/src/main/java/org/exist/xquery/NamedFunctionReference.java index f95d8100ab2..642868318cc 100644 --- a/exist-core/src/main/java/org/exist/xquery/NamedFunctionReference.java +++ b/exist-core/src/main/java/org/exist/xquery/NamedFunctionReference.java @@ -24,6 +24,8 @@ import java.util.ArrayList; import java.util.List; +import java.util.Set; + import org.exist.dom.QName; import org.exist.xquery.parser.XQueryAST; import org.exist.xquery.util.ExpressionDumper; @@ -52,7 +54,29 @@ public void analyze(AnalyzeContextInfo contextInfo) throws XPathException { resolvedFunction.analyze(contextInfo); } + /** + * Reserved function names per XQuery 3.1/4.0 spec. + * These names must not be used as unprefixed named function references (XPST0003). 
+ */ + private static final Set RESERVED_FUNCTION_NAMES = Set.of( + "array", "attribute", "comment", "document-node", "element", + "function", "if", "item", "map", "namespace-node", "node", + "processing-instruction", "schema-attribute", "schema-element", + "switch", "text", "typeswitch" + ); + public static FunctionCall lookupFunction(Expression self, XQueryContext context, QName funcName, int arity) throws XPathException { + // Check for reserved function names — these cannot be used as named function references + final String localPart = funcName.getLocalPart(); + final String nsURI = funcName.getNamespaceURI(); + if (RESERVED_FUNCTION_NAMES.contains(localPart) && + (nsURI == null || nsURI.isEmpty() || + Function.BUILTIN_FUNCTION_NS.equals(nsURI) || + context.getDefaultFunctionNamespace().equals(nsURI))) { + throw new XPathException(self, ErrorCodes.XPST0003, + "'" + localPart + "' is a reserved function name and cannot be used as a named function reference"); + } + if (Function.BUILTIN_FUNCTION_NS.equals(funcName.getNamespaceURI()) && "concat".equals(funcName.getLocalPart()) && arity < 2) { diff --git a/exist-core/src/main/java/org/exist/xquery/UserDefinedFunction.java b/exist-core/src/main/java/org/exist/xquery/UserDefinedFunction.java index 839cbb6bc44..d8d0529f764 100644 --- a/exist-core/src/main/java/org/exist/xquery/UserDefinedFunction.java +++ b/exist-core/src/main/java/org/exist/xquery/UserDefinedFunction.java @@ -47,6 +47,7 @@ public class UserDefinedFunction extends Function implements Cloneable { private FunctionCall call; private boolean hasBeenReset = false; private List closureVariables = null; + private boolean passContextToBody = false; public UserDefinedFunction(XQueryContext context, FunctionSignature signature) { super(context, signature); @@ -60,6 +61,17 @@ public void setFunctionBody(Expression body) { this.body = body.simplify(); } + /** + * Mark this UDF as a wrapper for an internal function (created by + * {@link FunctionFactory#wrap}). 
Wrapper functions pass the evaluation + * context through to their body so that context-dependent built-in + * functions (like fn:id, fn:idref, fn:string, etc.) can access the + * focus when called via function references. + */ + public void setPassContextToBody(boolean passContext) { + this.passContextToBody = passContext; + } + public void addVariable(final String varName) throws XPathException { try { final QName qname = QName.parse(context, varName, null); @@ -155,7 +167,15 @@ public Sequence eval(Sequence contextSequence, Item contextItem) throws XPathExc ", got " + currentArguments[j].getItemCount()); } } - result = body.eval(null, null); + // For wrapper functions (created by FunctionFactory.wrap for internal + // function references), pass the context through so context-dependent + // built-in functions can access the focus. For regular user-declared + // functions, the focus is absent per the XQuery spec. + if (passContextToBody) { + result = body.eval(contextSequence, contextItem); + } else { + result = body.eval(null, null); + } return result; } finally { // restore the local variable stack diff --git a/exist-core/src/main/java/org/exist/xquery/VariableImpl.java b/exist-core/src/main/java/org/exist/xquery/VariableImpl.java index a6c9ee6cd2c..0c9219a6600 100644 --- a/exist-core/src/main/java/org/exist/xquery/VariableImpl.java +++ b/exist-core/src/main/java/org/exist/xquery/VariableImpl.java @@ -120,26 +120,26 @@ public void setSequenceType(SequenceType type) throws XPathException { else {actualCardinality = Cardinality.EXACTLY_ONE;} //Type.EMPTY is *not* a subtype of other types ; checking cardinality first if (!getSequenceType().getCardinality().isSuperCardinalityOrEqualOf(actualCardinality)) - {throw new XPathException(getValue(), "XPTY0004: Invalid cardinality for variable $" + getQName() + + {throw new XPathException(getValue(), ErrorCodes.XPTY0004, "Invalid cardinality for variable $" + getQName() + ". 
Expected " + getSequenceType().getCardinality().getHumanDescription() + ", got " + actualCardinality.getHumanDescription());} //TODO : ignore nodes right now ; they are returned as xs:untypedAtomicType if (!Type.subTypeOf(getSequenceType().getPrimaryType(), Type.NODE)) { if (!getValue().isEmpty() && !Type.subTypeOf(getValue().getItemType(), getSequenceType().getPrimaryType())) - {throw new XPathException(getValue(), "XPTY0004: Invalid type for variable $" + getQName() + + {throw new XPathException(getValue(), ErrorCodes.XPTY0004, "Invalid type for variable $" + getQName() + ". Expected " + Type.getTypeName(getSequenceType().getPrimaryType()) + ", got " +Type.getTypeName(getValue().getItemType()));} //Here is an attempt to process the nodes correctly } else { - //Same as above : we probably may factorize + //Same as above : we probably may factorize if (!getValue().isEmpty() && !Type.subTypeOf(getValue().getItemType(), getSequenceType().getPrimaryType())) - {throw new XPathException(getValue(), "XPTY0004: Invalid type for variable $" + getQName() + + {throw new XPathException(getValue(), ErrorCodes.XPTY0004, "Invalid type for variable $" + getQName() + ". 
Expected " + Type.getTypeName(getSequenceType().getPrimaryType()) + ", got " +Type.getTypeName(getValue().getItemType()));} - + } } diff --git a/exist-core/src/main/java/org/exist/xquery/VariableReference.java b/exist-core/src/main/java/org/exist/xquery/VariableReference.java index 41c87ba7a99..2beb0ccb292 100644 --- a/exist-core/src/main/java/org/exist/xquery/VariableReference.java +++ b/exist-core/src/main/java/org/exist/xquery/VariableReference.java @@ -64,7 +64,7 @@ public void analyze(final AnalyzeContextInfo contextInfo) throws XPathException "Variable '$" + qname + "' is not declared."); } if (!var.isInitialized()) { - throw new XPathException(this, ErrorCodes.XQST0054, + throw new XPathException(this, ErrorCodes.XQDY0054, "variable declaration of '$" + qname + "' cannot " + "be executed because of a circularity."); } diff --git a/exist-core/src/main/java/org/exist/xquery/XQueryContext.java b/exist-core/src/main/java/org/exist/xquery/XQueryContext.java index cb06e2a9c70..cd88b4435a3 100644 --- a/exist-core/src/main/java/org/exist/xquery/XQueryContext.java +++ b/exist-core/src/main/java/org/exist/xquery/XQueryContext.java @@ -1346,6 +1346,31 @@ public DocumentSet getStaticDocs() { return textResourceSupplier.apply(getBroker(), getBroker().getCurrentTransaction(), uri, charset); } + /** + * Gets a text resource from the "Available text resources" of the + * dynamic context, matching by URI only. This is used when no encoding + * is specified, allowing the resource to be found regardless of what + * charset it was registered with. 
+ * + * @param uri the URI of the resource to retrieve + * @return a reader to read the resource content from, or null if not found + * @throws XPathException in case of a dynamic error + */ + public @Nullable Reader getDynamicallyAvailableTextResourceByUri(final String uri) + throws XPathException { + if (dynamicTextResources == null) { + return null; + } + + for (final Map.Entry, QuadFunctionE> entry : dynamicTextResources.entrySet()) { + if (entry.getKey()._1.equals(uri)) { + final Charset registeredCharset = entry.getKey()._2; + return entry.getValue().apply(getBroker(), getBroker().getCurrentTransaction(), uri, registeredCharset); + } + } + return null; + } + /** * Gets a collection from the "Available collections" of the * dynamic context. diff --git a/exist-core/src/main/java/org/exist/xquery/functions/fn/FnModule.java b/exist-core/src/main/java/org/exist/xquery/functions/fn/FnModule.java index 5006a848ce8..f11ab5baa0e 100644 --- a/exist-core/src/main/java/org/exist/xquery/functions/fn/FnModule.java +++ b/exist-core/src/main/java/org/exist/xquery/functions/fn/FnModule.java @@ -78,6 +78,7 @@ public class FnModule extends AbstractInternalModule { new FunctionDef(FunDocumentURI.FS_DOCUMENT_URI_1, FunDocumentURI.class), new FunctionDef(FunElementWithId.FS_ELEMENT_WITH_ID_SIGNATURES[0], FunElementWithId.class), new FunctionDef(FunElementWithId.FS_ELEMENT_WITH_ID_SIGNATURES[1], FunElementWithId.class), + new FunctionDef(FunElementWithId.FS_ELEMENT_WITH_ID_SIGNATURES[2], FunElementWithId.class), new FunctionDef(FunEmpty.signature, FunEmpty.class), new FunctionDef(FunEncodeForURI.signature, FunEncodeForURI.class), new FunctionDef(FunEndsWith.signatures[0], FunEndsWith.class), diff --git a/exist-core/src/main/java/org/exist/xquery/functions/fn/FunAnalyzeString.java b/exist-core/src/main/java/org/exist/xquery/functions/fn/FunAnalyzeString.java index 67332bffa4f..5e0decbcb84 100644 --- a/exist-core/src/main/java/org/exist/xquery/functions/fn/FunAnalyzeString.java +++ 
b/exist-core/src/main/java/org/exist/xquery/functions/fn/FunAnalyzeString.java @@ -125,9 +125,23 @@ public Sequence eval(final Sequence[] args, final Sequence contextSequence) thro } } - private void analyzeString(final MemTreeBuilder builder, final String input, String pattern, final String flags) throws XPathException { + private void analyzeString(final MemTreeBuilder builder, final String input, final String pattern, final String flags) throws XPathException { final Configuration config = context.getBroker().getBrokerPool().getSaxonConfiguration(); + // XPath 4.0 lookaround syntax is not yet implemented in eXist's XQuery 3.1 runtime. + // When XQuery 4.0 lands (v2/xq4-core-functions), replace this guard with the + // translateXPath4Lookaround() dispatch path. + if (org.exist.xquery.regex.RegexUtil.hasXPath4Lookaround(pattern)) { + throw new XPathException(this, ErrorCodes.XPST0017, + "XPath 4.0 lookaround syntax in regex patterns (e.g. (*positive_lookahead:...)) " + + "is not yet implemented in this XQuery 3.1 build. 
Rewrite the regex without lookaround."); + } + + // Pre-validate: reject constructs not valid in XPath 3.1 regex + if (!org.exist.xquery.regex.RegexUtil.hasLiteral(flags)) { + org.exist.xquery.regex.RegexUtil.validateXPathRegex(this, pattern, false); + } + final List warnings = new ArrayList<>(1); try { diff --git a/exist-core/src/main/java/org/exist/xquery/functions/fn/FunContainsToken.java b/exist-core/src/main/java/org/exist/xquery/functions/fn/FunContainsToken.java index aebf6cf6dc6..362ab3dd137 100644 --- a/exist-core/src/main/java/org/exist/xquery/functions/fn/FunContainsToken.java +++ b/exist-core/src/main/java/org/exist/xquery/functions/fn/FunContainsToken.java @@ -44,7 +44,7 @@ public class FunContainsToken extends BasicFunction { private final static FunctionParameterSequenceType FS_INPUT = optManyParam("input", Type.STRING, "The input string"); private final static FunctionParameterSequenceType FS_TOKEN = param("token", Type.STRING, "The token to be searched for"); - private final static FunctionParameterSequenceType FS_COLLATION = param("pattern", Type.STRING, "Collation to use"); + private final static FunctionParameterSequenceType FS_COLLATION = optParam("collation", Type.STRING, "Collation to use; an empty sequence selects the default collation"); public final static FunctionSignature[] FS_CONTAINS_TOKEN = functionSignatures( FS_CONTAINS_TOKEN_NAME, diff --git a/exist-core/src/main/java/org/exist/xquery/functions/fn/FunElementWithId.java b/exist-core/src/main/java/org/exist/xquery/functions/fn/FunElementWithId.java index dd289251062..ec615e6aac9 100644 --- a/exist-core/src/main/java/org/exist/xquery/functions/fn/FunElementWithId.java +++ b/exist-core/src/main/java/org/exist/xquery/functions/fn/FunElementWithId.java @@ -45,13 +45,15 @@ public class FunElementWithId extends BasicFunction { "If none is matching or $idrefs is the empty sequence, returns the empty sequence."; private static final FunctionReturnSequenceType FN_RETURN = 
returnsOptMany(Type.STRING, "the elements with IDs matching IDREFs from $idref-sequence"); private static final FunctionParameterSequenceType PARAM_ID_REFS_STRING = optManyParam("idrefs", Type.STRING, "The IDREF sequence"); + private static final FunctionParameterSequenceType PARAM_NODE = param("node", Type.NODE, "A node in the document to search"); public static final FunctionSignature[] FS_ELEMENT_WITH_ID_SIGNATURES = functionSignatures( FN_NAME, FN_DESCRIPTION, FN_RETURN, arities( arity(), - arity(PARAM_ID_REFS_STRING) + arity(PARAM_ID_REFS_STRING), + arity(PARAM_ID_REFS_STRING, PARAM_NODE) ) ); diff --git a/exist-core/src/main/java/org/exist/xquery/functions/fn/FunMatches.java b/exist-core/src/main/java/org/exist/xquery/functions/fn/FunMatches.java index 289cada28d2..1852ecf703c 100644 --- a/exist-core/src/main/java/org/exist/xquery/functions/fn/FunMatches.java +++ b/exist-core/src/main/java/org/exist/xquery/functions/fn/FunMatches.java @@ -525,7 +525,22 @@ private Sequence evalGeneric(final Sequence contextSequence, final Item contextI } - private boolean matchXmlRegex(final String string, final String pattern, final String flags) throws XPathException { + private boolean matchXmlRegex(String string, final String pattern, final String flags) throws XPathException { + // XPath 4.0 lookaround syntax is not yet implemented in eXist's XQuery 3.1 runtime. + // When XQuery 4.0 lands (v2/xq4-core-functions), replace this guard with the + // translateXPath4Lookaround / Java-regex dispatch path. + if (org.exist.xquery.regex.RegexUtil.hasXPath4Lookaround(pattern)) { + throw new XPathException(this, ErrorCodes.XPST0017, + "XPath 4.0 lookaround syntax in regex patterns (e.g. (*positive_lookahead:...)) " + + "is not yet implemented in this XQuery 3.1 build. 
Rewrite the regex without lookaround."); + } + + // Pre-validate: reject constructs that are not valid in XPath 3.1 regex + // but that Saxon's XP30 mode accepts (Java/Perl extensions) + if (!hasLiteral(flags)) { + org.exist.xquery.regex.RegexUtil.validateXPathRegex(this, pattern, false); + } + try { List warnings = new ArrayList<>(1); RegularExpression regex = context.getBroker().getBrokerPool() diff --git a/exist-core/src/main/java/org/exist/xquery/functions/fn/FunReplace.java b/exist-core/src/main/java/org/exist/xquery/functions/fn/FunReplace.java index e8a9e81d378..bf6946cab99 100644 --- a/exist-core/src/main/java/org/exist/xquery/functions/fn/FunReplace.java +++ b/exist-core/src/main/java/org/exist/xquery/functions/fn/FunReplace.java @@ -128,9 +128,21 @@ public Sequence eval(final Sequence[] args, final Sequence contextSequence) thro flags = ""; } final String string = stringArg.getStringValue(); - final String pattern = args[1].itemAt(0).getStringValue(); + String pattern = args[1].itemAt(0).getStringValue(); final String replace = args[2].itemAt(0).getStringValue(); + final boolean isXQuery40 = context.getXQueryVersion() >= 40; + + // XQ4: translate (*positive_lookahead:...) etc. to Java regex (?=...) 
syntax + if (isXQuery40 && hasXPath4Lookaround(pattern)) { + pattern = translateXPath4Lookaround(pattern); + } + + // Pre-validate: reject constructs not valid in XPath regex + if (!hasLiteral(flags)) { + org.exist.xquery.regex.RegexUtil.validateXPathRegex(this, pattern, isXQuery40); + } + final Configuration config = context.getBroker().getBrokerPool().getSaxonConfiguration(); final List warnings = new ArrayList<>(1); diff --git a/exist-core/src/main/java/org/exist/xquery/functions/fn/FunTokenize.java b/exist-core/src/main/java/org/exist/xquery/functions/fn/FunTokenize.java index f31b8b645f0..246f7881cb7 100644 --- a/exist-core/src/main/java/org/exist/xquery/functions/fn/FunTokenize.java +++ b/exist-core/src/main/java/org/exist/xquery/functions/fn/FunTokenize.java @@ -89,14 +89,27 @@ public Sequence eval(final Sequence[] args, final Sequence contextSequence) thro flags = 0; } + final boolean isXQuery40 = context.getXQueryVersion() >= 40; final String pattern; if (args.length == 1) { pattern = " "; string = FunNormalizeSpace.normalize(string); } else { - if(hasLiteral(flags)) { + String rawPattern = args[1].itemAt(0).getStringValue(); + + // XQ4: translate (*positive_lookahead:...) etc. 
to Java regex + if (isXQuery40 && hasXPath4Lookaround(rawPattern)) { + rawPattern = translateXPath4Lookaround(rawPattern); + } + + // Pre-validate: reject constructs not valid in XPath regex + if (!hasLiteral(flags)) { + validateXPathRegex(this, rawPattern, isXQuery40); + } + + if (hasLiteral(flags)) { // no need to change anything - pattern = args[1].itemAt(0).getStringValue(); + pattern = rawPattern; } else { final boolean ignoreWhitespace = hasIgnoreWhitespace(flags); final boolean caseBlind = hasCaseInsensitive(flags); diff --git a/exist-core/src/main/java/org/exist/xquery/functions/fn/FunUnparsedText.java b/exist-core/src/main/java/org/exist/xquery/functions/fn/FunUnparsedText.java index 6ffe442176c..a624cc8a71f 100644 --- a/exist-core/src/main/java/org/exist/xquery/functions/fn/FunUnparsedText.java +++ b/exist-core/src/main/java/org/exist/xquery/functions/fn/FunUnparsedText.java @@ -45,7 +45,7 @@ public class FunUnparsedText extends BasicFunction { private final static FunctionParameterSequenceType PARAM_HREF = optParam("href", Type.STRING, "the URI to load text from"); - private final static FunctionParameterSequenceType PARAM_ENCODING = param("encoding", Type.STRING, "character encoding of the resource"); + private final static FunctionParameterSequenceType PARAM_ENCODING = optParam("encoding", Type.STRING, "character encoding of the resource"); static final FunctionSignature [] FS_UNPARSED_TEXT = functionSignatures( new QName("unparsed-text", Function.BUILTIN_FUNCTION_NS), @@ -80,61 +80,102 @@ public FunUnparsedText(final XQueryContext context, final FunctionSignature sign @Override public Sequence eval(Sequence[] args, Sequence contextSequence) throws XPathException { - @Nullable final String encoding = args.length == 2 ? 
args[1].getStringValue() : null; - if (!args[0].isEmpty()) { - final String href = args[0].getStringValue(); - if (isCalledAs("unparsed-text-lines")) { - return readLines(href, encoding); - } else if (isCalledAs("unparsed-text-available")) { - return BooleanValue.valueOf(contentAvailable(href, encoding)); - } else { - return new StringValue(this, stripBOM(readContent(href, encoding))); + @Nullable final String encoding = (args.length == 2 && !args[1].isEmpty()) ? args[1].getStringValue() : null; + if (args[0].isEmpty()) { + // Per spec: if $href is empty, unparsed-text-available returns false, + // unparsed-text and unparsed-text-lines return empty sequence + if (isCalledAs("unparsed-text-available")) { + return BooleanValue.FALSE; } + return Sequence.EMPTY_SEQUENCE; + } + final String href = args[0].getStringValue(); + if (isCalledAs("unparsed-text-lines")) { + return readLines(href, encoding); + } else if (isCalledAs("unparsed-text-available")) { + return BooleanValue.valueOf(contentAvailable(href, encoding)); + } else { + return new StringValue(this, stripBOM(readContent(href, encoding))); } - return Sequence.EMPTY_SEQUENCE; } private boolean contentAvailable(final String uri, final String encoding) { - final Charset charset; try { - charset = encoding != null ? 
resolveCharset(encoding) : UTF_8; - } catch (final IllegalArgumentException e) { - return false; - } + final String resolvedUri = toUri(uri).toString(); - try (final Reader dynamicTextResource = context.getDynamicallyAvailableTextResource(toUri(uri).toString(), charset)) { - if (dynamicTextResource != null) { - return true; - } else { + if (encoding != null) { + final Charset charset; try { - readContent(getSource(uri), encoding); - return true; - } catch (final XPathException e) { + charset = resolveCharset(encoding); + } catch (final IllegalArgumentException e) { return false; } + try (final Reader dynamicTextResource = context.getDynamicallyAvailableTextResource(resolvedUri, charset)) { + if (dynamicTextResource != null) { + return true; + } + } + } else { + // No encoding — try URI-only lookup + try (final Reader dynamicTextResource = context.getDynamicallyAvailableTextResourceByUri(resolvedUri)) { + if (dynamicTextResource != null) { + return true; + } + } + try (final Reader dynamicTextResource = context.getDynamicallyAvailableTextResource(resolvedUri, UTF_8)) { + if (dynamicTextResource != null) { + return true; + } + } } + + readContent(getSource(uri), encoding); + return true; } catch (final XPathException | IOException e) { return false; } } private String readContent(final String uri, final String encoding) throws XPathException { - final Charset charset; - try { - charset = encoding != null ? 
resolveCharset(encoding) : UTF_8; - } catch (final IllegalArgumentException e) { - throw new XPathException(this, ErrorCodes.FOUT1190, e.getMessage()); - } + final String resolvedUri = toUri(uri).toString(); - try (final Reader dynamicTextResource = context.getDynamicallyAvailableTextResource(toUri(uri).toString(), charset)) { - if (dynamicTextResource != null) { - return readAll(dynamicTextResource); - } else { - return readContent(getSource(uri), encoding); + if (encoding != null) { + // Explicit encoding specified — look up with exact charset + final Charset charset; + try { + charset = resolveCharset(encoding); + } catch (final IllegalArgumentException e) { + throw new XPathException(this, ErrorCodes.FOUT1190, e.getMessage()); + } + + try (final Reader dynamicTextResource = context.getDynamicallyAvailableTextResource(resolvedUri, charset)) { + if (dynamicTextResource != null) { + return readAll(dynamicTextResource); + } + } catch (final IOException e) { + throw new XPathException(this, ErrorCodes.FOUT1170, "Cannot read text resource"); + } + } else { + // No encoding specified — try URI-only lookup (any registered charset) + try (final Reader dynamicTextResource = context.getDynamicallyAvailableTextResourceByUri(resolvedUri)) { + if (dynamicTextResource != null) { + return readAll(dynamicTextResource); + } + } catch (final IOException e) { + throw new XPathException(this, ErrorCodes.FOUT1170, "Cannot read text resource"); + } + + // Also try with UTF-8 (in case registered with exact UTF-8 key) + try (final Reader dynamicTextResource = context.getDynamicallyAvailableTextResource(resolvedUri, UTF_8)) { + if (dynamicTextResource != null) { + return readAll(dynamicTextResource); + } + } catch (final IOException e) { + throw new XPathException(this, ErrorCodes.FOUT1170, "Cannot read text resource"); } - } catch (final IOException e) { - throw new XPathException(this, ErrorCodes.FOUT1170, "Cannot read text resource"); } + + return readContent(getSource(uri), 
encoding); } private String readAll(final Reader reader) throws IOException { @@ -147,6 +188,35 @@ private String readAll(final Reader reader) throws IOException { return builder.toString(); } + /** + * Validate that a string contains only XML-legal characters. + * Per XQuery spec, FOUT1190 is raised if the text contains characters + * that are not permitted in XML. + */ + private void validateXmlCharacters(final String text) throws XPathException { + for (int i = 0; i < text.length(); i++) { + final char c = text.charAt(i); + // XML 1.0 legal characters: #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF] + // Surrogate pairs (0xD800-0xDFFF) are handled as pairs for supplementary chars + if (c == 0x9 || c == 0xA || c == 0xD) { + continue; + } + if (c >= 0x20 && c <= 0xD7FF) { + continue; + } + if (c >= 0xE000 && c <= 0xFFFD) { + continue; + } + // Check for valid surrogate pairs (supplementary characters U+10000 to U+10FFFF) + if (Character.isHighSurrogate(c) && i + 1 < text.length() && Character.isLowSurrogate(text.charAt(i + 1))) { + i++; // skip the low surrogate + continue; + } + throw new XPathException(this, ErrorCodes.FOUT1190, + "Text resource contains character not permitted in XML: U+" + String.format("%04X", (int) c)); + } + } + private String readContent(final Source source, final String encoding) throws XPathException { try { final Charset charset = getCharset(encoding, source); @@ -155,26 +225,61 @@ private String readContent(final Source source, final String encoding) throws XP // InputStream can have value NULL for data retrieved from URL IOUtils.copy(is, output, charset); } - return output.toString(); + final String result = output.toString(); + validateXmlCharacters(result); + return result; } catch (final IOException | NullPointerException e) { throw new XPathException(this, ErrorCodes.FOUT1170, e.getMessage()); } } private Sequence readLines(final String uriParam, final String encoding) throws XPathException { + final String 
resolvedUri = toUri(uriParam).toString(); + + // Try dynamic text resources first (same as readContent) + if (encoding != null) { + final Charset charset; + try { + charset = resolveCharset(encoding); + } catch (final IllegalArgumentException e) { + throw new XPathException(this, ErrorCodes.FOUT1190, e.getMessage()); + } + try (final Reader dynamicTextResource = context.getDynamicallyAvailableTextResource(resolvedUri, charset)) { + if (dynamicTextResource != null) { + return readLinesFromReader(new BufferedReader(dynamicTextResource)); + } + } catch (final IOException | RuntimeException e) { + throw new XPathException(this, ErrorCodes.FOUT1170, "Cannot read text resource"); + } + } else { + try (final Reader dynamicTextResource = context.getDynamicallyAvailableTextResourceByUri(resolvedUri)) { + if (dynamicTextResource != null) { + return readLinesFromReader(new BufferedReader(dynamicTextResource)); + } + } catch (final IOException | RuntimeException e) { + throw new XPathException(this, ErrorCodes.FOUT1170, "Cannot read text resource"); + } + try (final Reader dynamicTextResource = context.getDynamicallyAvailableTextResource(resolvedUri, UTF_8)) { + if (dynamicTextResource != null) { + return readLinesFromReader(new BufferedReader(dynamicTextResource)); + } + } catch (final IOException | RuntimeException e) { + throw new XPathException(this, ErrorCodes.FOUT1170, "Cannot read text resource"); + } + } + + // Fall back to source resolution try { final Sequence result = new ValueSequence(); final Source source = getSource(uriParam); - final Charset charset = getCharset(encoding, source); + final Charset sourceCharset = getCharset(encoding, source); try (final InputStream inputStream = source.getInputStream()) { - - // Nested try() as inputStream can be null if (inputStream == null) { throw new XPathException(this, ErrorCodes.FOUT1170, "Unable to retrieve bytestream from " + uriParam); } - try (final BufferedReader reader = new BufferedReader(new 
InputStreamReader(inputStream, charset))) { + try (final BufferedReader reader = new BufferedReader(new InputStreamReader(inputStream, sourceCharset))) { String line; boolean firstLine = true; while ((line = reader.readLine()) != null) { @@ -192,6 +297,20 @@ private Sequence readLines(final String uriParam, final String encoding) throws } } + private Sequence readLinesFromReader(final BufferedReader reader) throws XPathException, IOException { + final Sequence result = new ValueSequence(); + String line; + boolean firstLine = true; + while ((line = reader.readLine()) != null) { + if (firstLine) { + line = stripBOM(line); + firstLine = false; + } + result.add(new StringValue(this, line)); + } + return result; + } + private Charset getCharset(final String encoding, final Source source) throws XPathException { Charset charset; if (encoding == null) { @@ -308,7 +427,7 @@ private URI toUri(final String uriStr) throws XPathException { } return uri; } catch (final URISyntaxException e) { - throw new XPathException(context.getRootExpression(), ErrorCodes.FODC0005, e); + throw new XPathException(this, ErrorCodes.FOUT1170, e.getMessage()); } } } diff --git a/exist-core/src/main/java/org/exist/xquery/functions/fn/LoadXQueryModule.java b/exist-core/src/main/java/org/exist/xquery/functions/fn/LoadXQueryModule.java index f2d409ebeb9..2ea185a3eb4 100644 --- a/exist-core/src/main/java/org/exist/xquery/functions/fn/LoadXQueryModule.java +++ b/exist-core/src/main/java/org/exist/xquery/functions/fn/LoadXQueryModule.java @@ -170,9 +170,19 @@ public Sequence eval(Sequence[] args, Sequence contextSequence) throws XPathExce throw new XPathException(this, ErrorCodes.FOQM0002, "Module with URI " + targetNamespace + " not found"); } - if (!xqVersion.equals(getXQueryVersion(tempContext.getXQueryVersion()))) { - throw new XPathException(this, ErrorCodes.FOQM0003, "Imported module has wrong XQuery version: " + - getXQueryVersion(tempContext.getXQueryVersion())); + // The version declared in 
the loaded module is recorded on the module's + // own context, not on tempContext (which only hosts the import). + for (final Module loadedModule : loadedModules) { + if (loadedModule instanceof ExternalModule extMod) { + final XQueryContext modCtx = extMod.getContext(); + if (modCtx != null) { + final String moduleVersion = getXQueryVersion(modCtx.getXQueryVersion()); + if (!xqVersion.equals(moduleVersion)) { + throw new XPathException(this, ErrorCodes.FOQM0003, + "Imported module has wrong XQuery version: " + moduleVersion); + } + } + } } final IMap variables = newLinearMap(null); diff --git a/exist-core/src/main/java/org/exist/xquery/regex/RegexUtil.java b/exist-core/src/main/java/org/exist/xquery/regex/RegexUtil.java index d54ca496c01..b9050874c1f 100644 --- a/exist-core/src/main/java/org/exist/xquery/regex/RegexUtil.java +++ b/exist-core/src/main/java/org/exist/xquery/regex/RegexUtil.java @@ -32,6 +32,7 @@ import javax.annotation.Nullable; import java.util.ArrayList; import java.util.List; +import java.util.regex.Matcher; import java.util.regex.Pattern; /** @@ -158,4 +159,593 @@ public static String translateRegexp(final Expression context, final String patt throw new XPathException(context, ErrorCodes.FORX0002, "Conversion from XPath F&O 3.0 regular expression syntax to Java regular expression syntax failed: " + e.getMessage(), new StringValue(pattern), e); } } + + /** + * Convert XML Schema/XPath \p{Is} and \P{Is} Unicode block + * property escapes to Java's \p{In} and \P{In} syntax. + */ + private static String convertUnicodeBlockNames(final String pattern) { + return pattern + .replaceAll("\\\\p\\{Is([^}]+)}", "\\\\p{In$1}") + .replaceAll("\\\\P\\{Is([^}]+)}", "\\\\P{In$1}"); + } + + /** + * Validates that a regex pattern only uses constructs allowed by the XPath + * regular expression specification (F&O 3.1, Section 5.6.1), with + * extensions for XPath 4.0 (Section 5.6.1.1). + * + *

 <p>Saxon's XP30 regex compiler accepts many Java/Perl regex constructs + * that are not part of the XPath regex specification. This method rejects + * such constructs with FORX0002 before they reach the Saxon compiler.</p>
+ * + * @param context the calling expression, for error reporting + * @param pattern the regex pattern to validate + * @param isXQuery40 true if running in XQuery 4.0+ mode + * @throws XPathException with FORX0002 if the pattern uses non-XPath constructs + */ + public static void validateXPathRegex(final Expression context, final String pattern, final boolean isXQuery40) throws XPathException { + final int len = pattern.length(); + // Total capturing groups in the pattern, used as a maximum-digits cap + // when greedily parsing back-references like \11 vs \1+'1'. + final int totalGroups = countCapturingGroups(pattern); + // Closed capturing groups encountered so far. Back-references must + // refer to a group that has already CLOSED at the reference position + // — forward references like \1(abc) are invalid. + int closedGroupCount = 0; + // Stack tracks whether each currently-open group is capturing. + final java.util.Deque groupStack = new java.util.ArrayDeque<>(); + for (int i = 0; i < len; i++) { + final char c = pattern.charAt(i); + + if (c == '\\') { + if (i + 1 >= len) { + throw new XPathException(context, ErrorCodes.FORX0002, + "Invalid regular expression: trailing backslash", + new StringValue(pattern)); + } + final char next = pattern.charAt(i + 1); + switch (next) { + // Valid XPath single-character escapes + case 'n': case 'r': case 't': + case '\\': case '|': case '.': case '-': case '^': + case '?': case '*': case '+': + case '{': case '}': case '(': case ')': + case '[': case ']': case '$': + // Space: not strictly in the XPath spec but Saxon allows + // \ in free-spacing mode ('x' flag) for literal space + case ' ': + // Valid XPath multi-character escape shortcuts + case 'd': case 'D': case 's': case 'S': + case 'w': case 'W': case 'i': case 'I': + case 'c': case 'C': + i++; // skip the escaped character + break; + case 'b': case 'B': + // Word boundaries: valid in XPath 4.0, invalid in 3.1 + if (!isXQuery40) { + throw new XPathException(context, 
ErrorCodes.FORX0002, + "Invalid regular expression: \\" + next + + " is not a recognized escape sequence in XPath 3.1 regular expressions", + new StringValue(pattern)); + } + // Quantifier after zero-width boundary assertion is not allowed + if (i + 2 < len) { + final char q = pattern.charAt(i + 2); + if (q == '?' || q == '+' || q == '*' || q == '{') { + throw new XPathException(context, ErrorCodes.FORX0002, + "Invalid regular expression: quantifier '" + q + + "' after \\" + next + " boundary assertion is not allowed", + new StringValue(pattern)); + } + } + i++; + break; + case '0': + // Octal escapes (\0nn) are not part of the XPath regex spec + throw new XPathException(context, ErrorCodes.FORX0002, + "Invalid regular expression: \\0 (octal escape) is not supported in XPath regular expressions", + new StringValue(pattern)); + case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': { + // Back-reference \N. N is parsed greedily but capped at + // the total number of capturing groups in the pattern, + // so '\19' in a 1-group pattern is '\1' + literal '9', + // while '\11' in an 11-group pattern is back-ref 11. + // The chosen N must also reference a group that has + // already CLOSED at this position; forward references + // (\1(abc)) and self-references ((.)\2) are invalid. 
+ int j = i + 2; + int num = next - '0'; + while (j < len && pattern.charAt(j) >= '0' && pattern.charAt(j) <= '9') { + final int candidate = num * 10 + (pattern.charAt(j) - '0'); + if (candidate > totalGroups) { + break; + } + num = candidate; + j++; + } + if (num < 1 || num > closedGroupCount) { + throw new XPathException(context, ErrorCodes.FORX0002, + "Invalid regular expression: back-reference \\" + num + + " refers to a capturing group that does not exist or has not been closed at this position", + new StringValue(pattern)); + } + i = j - 1; + break; + } + case 'p': case 'P': + // \p{...} or \P{...} — must be followed by {Name} + if (i + 2 < len && pattern.charAt(i + 2) == '{') { + final int close = pattern.indexOf('}', i + 3); + if (close < 0) { + throw new XPathException(context, ErrorCodes.FORX0002, + "Invalid regular expression: unclosed \\p{ property escape", + new StringValue(pattern)); + } + i = close; // advance past the closing } + } else { + throw new XPathException(context, ErrorCodes.FORX0002, + "Invalid regular expression: \\p or \\P must be followed by {Name}", + new StringValue(pattern)); + } + break; + default: + // Any other backslash escape is invalid in XPath regex. + // This catches: \x (hex), \\u (Java unicode), + // \A \Z \z (Java anchors), + // \a \e \f \v (special chars), \Q \E (literal mode), + // \G \k \g (named backrefs) + throw new XPathException(context, ErrorCodes.FORX0002, + "Invalid regular expression: \\" + next + + " is not a recognized escape sequence in XPath regular expressions", + new StringValue(pattern)); + } + } else if (c == '(' && i + 1 < len && pattern.charAt(i + 1) == '?') { + // In XPath 3.1, only (?:...) is valid. + // In XPath 4.0, lookaround is also valid: (?=...) (?!...) (?<=...) 
(?= len) { + throw new XPathException(context, ErrorCodes.FORX0002, + "Invalid regular expression: incomplete group syntax at position " + i, + new StringValue(pattern)); + } + final char groupType = pattern.charAt(i + 2); + if (groupType == ':') { + // (?:...) — always valid + } else if (isXQuery40 && (groupType == '=' || groupType == '!')) { + // (?=...) (?!...) — valid in XPath 4.0 + } else if (isXQuery40 && groupType == '<' && i + 3 < len + && (pattern.charAt(i + 3) == '=' || pattern.charAt(i + 3) == '!')) { + // (?<=...) (? 0) { + final char cj = pattern.charAt(j); + if (cj == '\\') { + j += 2; + continue; + } + if (cj == '[') { + depth++; + } else if (cj == ']') { + depth--; + } + j++; + } + i = j; + continue; + } + if (c == '(' && (i + 1 >= len + || (pattern.charAt(i + 1) != '?' && pattern.charAt(i + 1) != '*'))) { + count++; + } + i++; + } + return count; + } + + /** + * Validates XPath 4.0 lookaround constructs: + *
 <ul> + *
 <li>Lookbehind body must be fixed-length (no {@code *}, {@code +}, + * {@code ?}, or unbounded {@code {n,\u00a0}} quantifiers).</li> + *
 <li>A lookaround group cannot itself be quantified.</li> + *
 </ul>
+ */ + private static void validateLookaroundConstraints(final Expression context, final String pattern) + throws XPathException { + final int len = pattern.length(); + int i = 0; + while (i < len) { + final boolean isLookaround; + final boolean isLookbehind; + final int bodyStart; + if (i + 3 < len && pattern.charAt(i) == '(' && pattern.charAt(i + 1) == '?') { + final char gt = pattern.charAt(i + 2); + if (gt == '=' || gt == '!') { + isLookaround = true; + isLookbehind = false; + bodyStart = i + 3; + } else if (gt == '<' && i + 4 < len + && (pattern.charAt(i + 3) == '=' || pattern.charAt(i + 3) == '!')) { + isLookaround = true; + isLookbehind = true; + bodyStart = i + 4; + } else { + isLookaround = false; + isLookbehind = false; + bodyStart = -1; + } + } else if (i + 1 < len && pattern.charAt(i) == '(' && pattern.charAt(i + 1) == '*') { + final int colon = pattern.indexOf(':', i + 2); + if (colon > 0 && colon < len) { + final String name = pattern.substring(i + 2, colon); + if ("positive_lookahead".equals(name) || "negative_lookahead".equals(name)) { + isLookaround = true; + isLookbehind = false; + bodyStart = colon + 1; + } else if ("positive_lookbehind".equals(name) || "negative_lookbehind".equals(name)) { + isLookaround = true; + isLookbehind = true; + bodyStart = colon + 1; + } else { + isLookaround = false; + isLookbehind = false; + bodyStart = -1; + } + } else { + isLookaround = false; + isLookbehind = false; + bodyStart = -1; + } + } else { + isLookaround = false; + isLookbehind = false; + bodyStart = -1; + } + if (!isLookaround) { + i++; + continue; + } + // Find matching closing ')' for this lookaround group + int depth = 1; + int j = bodyStart; + boolean bodyHasUnboundedQuantifier = false; + while (j < len && depth > 0) { + final char cj = pattern.charAt(j); + if (cj == '\\') { + j += 2; + continue; + } + if (cj == '[') { + final int closeBracket = pattern.indexOf(']', j + 1); + if (closeBracket < 0) { + return; // malformed; let outer error handling 
take over + } + j = closeBracket + 1; + continue; + } + if (cj == '(') { + depth++; + } else if (cj == ')') { + depth--; + if (depth == 0) { + break; + } + } else if (depth == 1 && (cj == '*' || cj == '+' || cj == '?')) { + bodyHasUnboundedQuantifier = true; + } + j++; + } + if (depth != 0) { + return; // unbalanced parens; outer machinery handles + } + if (isLookbehind && bodyHasUnboundedQuantifier) { + throw new XPathException(context, ErrorCodes.FORX0002, + "Invalid regular expression: lookbehind assertion must be fixed-length " + + "(unbounded quantifier in body)", + new StringValue(pattern)); + } + // Quantifier after the lookaround group is also invalid + if (j + 1 < len) { + final char after = pattern.charAt(j + 1); + if (after == '?' || after == '+' || after == '*' || after == '{') { + throw new XPathException(context, ErrorCodes.FORX0002, + "Invalid regular expression: lookaround assertion cannot be quantified ('" + + after + "' after closing ')')", + new StringValue(pattern)); + } + } + i = j + 1; // continue past closing ')' + } + } + + /** + * Scans a character class starting at the '[' at position {@code start}, + * validating its contents and returning the position of the matching ']'. + * Rejects POSIX-style {@code [:name:]} classes and invalid escapes inside + * the class (e.g. backslash-x or backslash-u) that the outer scanner would + * otherwise miss because it never enters the class body. + */ + private static int scanCharClass(final Expression context, final String pattern, + final int start, final int len) throws XPathException { + int j = start + 1; // skip '[' + if (j < len && pattern.charAt(j) == '^') { + j++; + } + // Track unescaped '-' immediately preceding the current position to + // disambiguate subtraction (charClassSub) from a literal hyphen. + // The XPath/XSD grammar requires the left side of subtraction to be + // a non-empty (pos|neg)CharGroup, i.e. partsBeforeLastHyphen >= 1. 
+ int partsCount = 0; + boolean lastWasUnescapedHyphen = false; + int partsBeforeLastHyphen = 0; + while (j < len) { + final char cc = pattern.charAt(j); + if (cc == ']') { + if (partsCount == 0) { + throw new XPathException(context, ErrorCodes.FORX0002, + "Invalid regular expression: empty character class is not allowed", + new StringValue(pattern)); + } + return j; + } + if (cc == '\\') { + if (j + 1 >= len) { + throw new XPathException(context, ErrorCodes.FORX0002, + "Invalid regular expression: trailing backslash inside character class", + new StringValue(pattern)); + } + final char ec = pattern.charAt(j + 1); + switch (ec) { + case 'n', 'r', 't', + '\\', '|', '.', '-', '^', + '?', '*', '+', + '{', '}', '(', ')', + '[', ']', '$', + ' ', + 'd', 'D', 's', 'S', + 'w', 'W', 'i', 'I', + 'c', 'C', + // \b inside class is backspace, allowed + 'b', 'B' -> j += 2; + case 'p', 'P' -> { + if (j + 2 < len && pattern.charAt(j + 2) == '{') { + final int close = pattern.indexOf('}', j + 3); + if (close < 0) { + throw new XPathException(context, ErrorCodes.FORX0002, + "Invalid regular expression: unclosed \\p{ inside character class", + new StringValue(pattern)); + } + j = close + 1; + } else { + throw new XPathException(context, ErrorCodes.FORX0002, + "Invalid regular expression: \\p or \\P must be followed by {Name}", + new StringValue(pattern)); + } + } + default -> throw new XPathException(context, ErrorCodes.FORX0002, + "Invalid regular expression: \\" + ec + + " is not a recognized escape sequence inside character class", + new StringValue(pattern)); + } + partsCount++; + lastWasUnescapedHyphen = false; + continue; + } + if (cc == '[') { + if (j + 1 < len && pattern.charAt(j + 1) == ':') { + throw new XPathException(context, ErrorCodes.FORX0002, + "Invalid regular expression: POSIX character class [:...:] is not supported in XPath regular expressions", + new StringValue(pattern)); + } + // Nested '[' is only valid as the start of a subtraction class, + // which requires 
a preceding unescaped '-' separator AND a + // non-empty (pos|neg)CharGroup before that '-'. + if (!lastWasUnescapedHyphen) { + throw new XPathException(context, ErrorCodes.FORX0002, + "Invalid regular expression: '[' inside character class is only allowed after a '-' subtraction separator", + new StringValue(pattern)); + } + if (partsBeforeLastHyphen < 1) { + throw new XPathException(context, ErrorCodes.FORX0002, + "Invalid regular expression: character class subtraction requires a non-empty character group before the '-' separator", + new StringValue(pattern)); + } + final int innerEnd = scanCharClass(context, pattern, j, len); + j = innerEnd + 1; + if (j >= len || pattern.charAt(j) != ']') { + throw new XPathException(context, ErrorCodes.FORX0002, + "Invalid regular expression: closing ']' expected after subtraction character class", + new StringValue(pattern)); + } + return j; + } + // Regular char (including literal '-') + if (cc == '-') { + partsBeforeLastHyphen = partsCount; + lastWasUnescapedHyphen = true; + } else { + lastWasUnescapedHyphen = false; + } + partsCount++; + j++; + } + throw new XPathException(context, ErrorCodes.FORX0002, + "Invalid regular expression: unclosed character class", + new StringValue(pattern)); + } + + private static final Pattern XPATH4_LOOKAROUND = Pattern.compile( + "\\(\\*(" + + "positive_lookahead|negative_lookahead|" + + "positive_lookbehind|negative_lookbehind" + + "):"); + + /** + * Translates XPath 4.0 lookaround syntax to Java regex syntax. + * + *

 <p>XPath 4.0 defines named lookaround groups:</p> + *
 <ul> + *
 <li>{@code (*positive_lookahead:...)} → {@code (?=...)}</li> + *
 <li>{@code (*negative_lookahead:...)} → {@code (?!...)}</li> + *
 <li>{@code (*positive_lookbehind:...)} → {@code (?<=...)}</li> + *
 <li>{@code (*negative_lookbehind:...)} → {@code (?<!...)}</li> + *
 </ul>
+ * + * @param pattern the XPath regex pattern + * @return the pattern with any XPath 4.0 lookaround translated to Java syntax, + * or the original pattern if no lookaround is present + */ + public static String translateXPath4Lookaround(final String pattern) { + if (!pattern.contains("(*")) { + return pattern; + } + + final Matcher m = XPATH4_LOOKAROUND.matcher(pattern); + if (!m.find()) { + return pattern; + } + + final StringBuilder sb = new StringBuilder(); + m.reset(); + while (m.find()) { + final String replacement = switch (m.group(1)) { + case "positive_lookahead" -> "(?="; + case "negative_lookahead" -> "(?!"; + case "positive_lookbehind" -> "(?<="; + case "negative_lookbehind" -> "(? m.group(0); // shouldn't happen + }; + m.appendReplacement(sb, Matcher.quoteReplacement(replacement)); + } + m.appendTail(sb); + return sb.toString(); + } + + /** + * Checks whether a pattern contains XPath 4.0 lookaround syntax. + * + * @param pattern the regex pattern + * @return true if the pattern contains (*positive_lookahead:...) or similar + */ + public static boolean hasXPath4Lookaround(final String pattern) { + return pattern.contains("(*") && XPATH4_LOOKAROUND.matcher(pattern).find(); + } + + /** + * Checks whether a pattern uses XPath 4.0 regex extensions that Saxon's + * XP30 mode cannot handle, requiring Java regex compilation instead. + * + *

 <p>This includes:</p> + *
 <ul> + *
 <li>Word boundaries: {@code \b}, {@code \B}</li> + *
 <li>Java-style lookaround: {@code (?=...)}, {@code (?!...)}, {@code (?<=...)}, {@code (?<!...)}</li> + *
 <li>Named lookaround: {@code (*positive_lookahead:...)}, etc.</li> + *
 </ul>
+ * + * @param pattern the regex pattern + * @return true if the pattern needs Java regex handling for XQ4 extensions + */ + public static boolean needsXQuery40JavaRegex(final String pattern) { + if (pattern.contains("\\b") || pattern.contains("\\B")) { + return true; + } + if (pattern.contains("(?=") || pattern.contains("(?!") + || pattern.contains("(?<=") || pattern.contains("(?