diff --git a/exist-core/pom.xml b/exist-core/pom.xml index c4163dbfc05..60b285c8878 100644 --- a/exist-core/pom.xml +++ b/exist-core/pom.xml @@ -1206,6 +1206,7 @@ The BaseX Team. The original license statement is also included below.]]>${project.build.testOutputDirectory}/log4j2.xml + 180 + + + org.exist.storage.lock.DeadlockIT + org.exist.xmldb.RemoveCollectionIT + @{jacocoArgLine} --add-modules jdk.incubator.vector --enable-native-access=ALL-UNNAMED -Dfile.encoding=${project.build.sourceEncoding} -Dexist.recovery.progressbar.hide=true ${project.basedir}/../exist-jetty-config/target/classes/org/exist/jetty diff --git a/exist-core/src/main/java/org/exist/storage/serializers/EXistOutputKeys.java b/exist-core/src/main/java/org/exist/storage/serializers/EXistOutputKeys.java index ca85a06f5fe..7c727e6ab16 100644 --- a/exist-core/src/main/java/org/exist/storage/serializers/EXistOutputKeys.java +++ b/exist-core/src/main/java/org/exist/storage/serializers/EXistOutputKeys.java @@ -28,6 +28,11 @@ public class EXistOutputKeys { */ public static final String ITEM_SEPARATOR = "item-separator"; + // --- QT4 Serialization 4.0 parameters --- + public static final String CANONICAL = "canonical"; + public static final String ESCAPE_SOLIDUS = "escape-solidus"; + public static final String JSON_LINES = "json-lines"; + public static final String OMIT_ORIGINAL_XML_DECLARATION = "omit-original-xml-declaration"; public static final String OUTPUT_DOCTYPE = "output-doctype"; diff --git a/exist-core/src/main/java/org/exist/util/CharSlice.java b/exist-core/src/main/java/org/exist/util/CharSlice.java index 8175eb76ab1..b0df423a3d3 100644 --- a/exist-core/src/main/java/org/exist/util/CharSlice.java +++ b/exist-core/src/main/java/org/exist/util/CharSlice.java @@ -198,6 +198,19 @@ public void copyTo(final char[] destination, final int destOffset) { public void write(final Writer writer) throws java.io.IOException { writer.write(array, offset, len); } + + /** + * Write a sub-range of this slice 
to a writer using a single bulk + * {@link Writer#write(char[], int, int)} call. + * + * @param writer the writer + * @param start the start index within this slice (inclusive) + * @param length the number of characters to write + * @throws java.io.IOException if an error occurs whilst writing + */ + public void write(final Writer writer, final int start, final int length) throws java.io.IOException { + writer.write(array, offset + start, length); + } } // diff --git a/exist-core/src/main/java/org/exist/util/serializer/AbstractSerializer.java b/exist-core/src/main/java/org/exist/util/serializer/AbstractSerializer.java index 758ccee130a..a1b7c9890b3 100644 --- a/exist-core/src/main/java/org/exist/util/serializer/AbstractSerializer.java +++ b/exist-core/src/main/java/org/exist/util/serializer/AbstractSerializer.java @@ -81,13 +81,27 @@ protected SerializerWriter getDefaultWriter() { public void setOutput(Writer writer, Properties properties) { outputProperties = Objects.requireNonNullElseGet(properties, () -> new Properties(defaultProperties)); final String method = outputProperties.getProperty(OutputKeys.METHOD, "xml"); - final String htmlVersionProp = outputProperties.getProperty(EXistOutputKeys.HTML_VERSION, "1.0"); - + // For html/xhtml methods, determine HTML version: + // 1. Use html-version if explicitly set + // 2. Otherwise use version (W3C spec: version controls HTML version for html method) + // 3. 
Default to 5.0 double htmlVersion; - try { - htmlVersion = Double.parseDouble(htmlVersionProp); - } catch (NumberFormatException e) { - htmlVersion = 1.0; + final String explicitHtmlVersion = outputProperties.getProperty(EXistOutputKeys.HTML_VERSION); + if (explicitHtmlVersion != null) { + try { + htmlVersion = Double.parseDouble(explicitHtmlVersion); + } catch (NumberFormatException e) { + htmlVersion = 5.0; + } + } else if (("html".equalsIgnoreCase(method) || "xhtml".equalsIgnoreCase(method)) + && outputProperties.getProperty(OutputKeys.VERSION) != null) { + try { + htmlVersion = Double.parseDouble(outputProperties.getProperty(OutputKeys.VERSION)); + } catch (NumberFormatException e) { + htmlVersion = 5.0; + } + } else { + htmlVersion = 5.0; } final SerializerWriter baseSerializerWriter = getBaseSerializerWriter(method, htmlVersion); diff --git a/exist-core/src/main/java/org/exist/util/serializer/AdaptiveWriter.java b/exist-core/src/main/java/org/exist/util/serializer/AdaptiveWriter.java index 22ab6dfca23..717ec83ab07 100644 --- a/exist-core/src/main/java/org/exist/util/serializer/AdaptiveWriter.java +++ b/exist-core/src/main/java/org/exist/util/serializer/AdaptiveWriter.java @@ -190,10 +190,15 @@ private void writeAtomic(AtomicValue value) throws IOException, SAXException, XP } private void writeDouble(final DoubleValue item) throws SAXException { - final DecimalFormatSymbols symbols = DecimalFormatSymbols.getInstance(Locale.US); - symbols.setExponentSeparator("e"); - final DecimalFormat df = new DecimalFormat("0.0##########################E0", symbols); - writeText(df.format(item.getDouble())); + final double d = item.getDouble(); + if (Double.isInfinite(d) || Double.isNaN(d)) { + writeText(item.getStringValue()); + } else { + final DecimalFormatSymbols symbols = DecimalFormatSymbols.getInstance(Locale.US); + symbols.setExponentSeparator("e"); + final DecimalFormat df = new DecimalFormat("0.0##########################E0", symbols); + writeText(df.format(d)); + 
} } private void writeArray(final ArrayType array) throws XPathException, SAXException, TransformerException { @@ -215,9 +220,7 @@ private void writeArray(final ArrayType array) throws XPathException, SAXExcepti private void writeMap(final AbstractMapType map) throws SAXException, XPathException, TransformerException { try { - writer.write("map"); - addSpaceIfIndent(); - writer.write('{'); + writer.write("map{"); addIndent(); indent(); for (final Iterator> i = map.iterator(); i.hasNext(); ) { diff --git a/exist-core/src/main/java/org/exist/util/serializer/HTML5Writer.java b/exist-core/src/main/java/org/exist/util/serializer/HTML5Writer.java index 1dffc3029b7..da7aede709a 100644 --- a/exist-core/src/main/java/org/exist/util/serializer/HTML5Writer.java +++ b/exist-core/src/main/java/org/exist/util/serializer/HTML5Writer.java @@ -118,6 +118,13 @@ public class HTML5Writer extends XHTML5Writer { BOOLEAN_ATTRIBUTE_NAMES.add("willValidate"); } + private static final ObjectSet BOOLEAN_ATTRIBUTE_NAMES_LOWER = new ObjectOpenHashSet<>(BOOLEAN_ATTRIBUTE_NAMES.size()); + static { + for (final String n : BOOLEAN_ATTRIBUTE_NAMES) { + BOOLEAN_ATTRIBUTE_NAMES_LOWER.add(n.toLowerCase(java.util.Locale.ROOT)); + } + } + private static final ObjectSet EMPTY_TAGS = new ObjectOpenHashSet<>(31); static { EMPTY_TAGS.add("area"); @@ -156,8 +163,15 @@ public void endElement(QName qname) throws TransformerException { if (!isEmptyTag(qname.getLocalPart())) { super.endElement(qname); } else { + // HTML5 omits the close tag for void elements; we still need to + // honor the meta-in-head dedup that XHTMLWriter sets up at startElement + // time. Capture the buffered-meta flag before closeStartTag flips state. 
+ final boolean wasBufferedMeta = isBufferedMeta(qname.getLocalPart()); closeStartTag(true); endIndent(qname.getNamespaceURI(), qname.getLocalPart()); + if (wasBufferedMeta) { + endMetaBuffer(); + } } } @@ -166,24 +180,33 @@ public void endElement(String namespaceURI, String localName, String qname) thro if (!isEmptyTag(localName)) { super.endElement(namespaceURI, localName, qname); } else { + final boolean wasBufferedMeta = isBufferedMeta(localName); closeStartTag(true); endIndent(namespaceURI, localName); + if (wasBufferedMeta) { + endMetaBuffer(); + } } } @Override public void attribute(String qname, CharSequence value) throws TransformerException { + // Strip prefix for the meta-dedup redundancy check + final int colon = qname.indexOf(':'); + final String localName = colon < 0 ? qname : qname.substring(colon + 1); + noteMetaAttribute(localName, value); + final CharSequence effectiveValue = maybeEscapeUriHtml5(localName, value); try { if(!tagIsOpen) { - characters(value); + characters(effectiveValue); return; } final Writer writer = getWriter(); writer.write(' '); writer.write(qname); - if (!(BOOLEAN_ATTRIBUTE_NAMES.contains(qname) && qname.contentEquals(value))) { + if (!isBooleanAttributeMatch(qname, effectiveValue)) { writer.write("=\""); - writeChars(value, true); + writeChars(effectiveValue, true); writer.write('"'); } } catch(final IOException ioe) { @@ -193,9 +216,12 @@ public void attribute(String qname, CharSequence value) throws TransformerExcept @Override public void attribute(QName qname, CharSequence value) throws TransformerException { + noteMetaAttribute(qname.getLocalPart(), value); + final String localPart = qname.getLocalPart(); + final CharSequence effectiveValue = maybeEscapeUriHtml5(localPart, value); try { if(!tagIsOpen) { - characters(value); + characters(effectiveValue); return; // throw new TransformerException("Found an attribute outside an // element"); @@ -206,11 +232,10 @@ public void attribute(QName qname, CharSequence value) throws 
TransformerExcepti writer.write(qname.getPrefix()); writer.write(':'); } - final String localPart = qname.getLocalPart(); writer.write(localPart); - if (!(BOOLEAN_ATTRIBUTE_NAMES.contains(localPart) && localPart.contentEquals(value))) { + if (!isBooleanAttributeMatch(localPart, effectiveValue)) { writer.write("=\""); - writeChars(value, true); + writeChars(effectiveValue, true); writer.write('"'); } } catch(final IOException ioe) { @@ -218,26 +243,76 @@ public void attribute(QName qname, CharSequence value) throws TransformerExcepti } } + /** + * URI-attribute escaping for the HTML5 writer. Mirrors + * {@link XHTMLWriter#shouldEscapeUriAttribute(String, String)} but unwraps + * the prefixed form of {@link #currentTag} so the (element, attribute) + * lookup uses local names only. + */ + private CharSequence maybeEscapeUriHtml5(final String attrLocal, final CharSequence value) { + if (currentTag == null) { + return value; + } + final String elementLocal = currentTag.contains(":") + ? currentTag.substring(currentTag.indexOf(':') + 1) + : currentTag; + if (!shouldEscapeUriAttribute(elementLocal, attrLocal)) { + return value; + } + return escapeUriAttribute(value); + } + + /** + * HTML5 boolean attribute minimization: emit just the bare name when the + * value is empty or matches the attribute name case-insensitively + * (per W3C XSLT/XQuery Serialization 3.1, section 7.2.2). + */ + private static boolean isBooleanAttributeMatch(final String name, final CharSequence value) { + if (!BOOLEAN_ATTRIBUTE_NAMES_LOWER.contains(name.toLowerCase(java.util.Locale.ROOT))) { + return false; + } + if (value == null || value.length() == 0) { + return true; + } + return name.equalsIgnoreCase(value.toString()); + } + @Override public void namespace(String prefix, String nsURI) throws TransformerException { - // no namespaces allowed in HTML5 + // HTML5 elements never carry an explicit xmlns since the parser puts + // them in the HTML namespace implicitly. 
Foreign content (anything + // outside the XHTML namespace, e.g. SVG, MathML, custom XML) keeps + // its namespace declarations so the receiver can re-parse it as XML. + if (nsURI == null || nsURI.isEmpty()) { + return; + } + if (org.exist.Namespaces.XHTML_NS.equals(nsURI)) { + return; + } + super.namespace(prefix, nsURI); } @Override protected void closeStartTag(boolean isEmpty) throws TransformerException { try { if (tagIsOpen) { + final Writer w = getWriter(); if (isEmpty) { if (isEmptyTag(currentTag)) { - getWriter().write(">"); + w.write('>'); + } else if (isForeignContent()) { + // Foreign content (SVG, MathML, custom XML namespace) + // embedded in HTML5 is serialized with XML self-close + // syntax so the receiver can re-parse it as XML. + w.write("/>"); } else { - getWriter().write('>'); - getWriter().write("'); + // Coalesce ">", "" into 2 writer calls instead of 4 + w.write(">'); } } else { - getWriter().write('>'); + w.write('>'); } tagIsOpen = false; } @@ -246,6 +321,39 @@ protected void closeStartTag(boolean isEmpty) throws TransformerException { } } + /** + * The current element is "foreign content" when its namespace is neither + * the XHTML namespace nor the empty (no-namespace) HTML namespace; that + * is the trigger for XML-style self-closing per HTML5's foreign-content + * serialization rule. + */ + private boolean isForeignContent() { + final String ns = currentElementNamespaceURI(); + return ns != null && !ns.isEmpty() && !org.exist.Namespaces.XHTML_NS.equals(ns); + } + + @Override + public void processingInstruction(final String target, final String data) throws TransformerException { + // QT4 PR2372: HTML5 has no PI syntax, so the serializer renders + // processing instructions as comments of the form ``, + // matching the HTML5 parser's coercion of `` content. 
+ try { + if (tagIsOpen) { + closeStartTag(false); + } + final Writer writer = getWriter(); + writer.write(""); + } catch (final IOException e) { + throw new TransformerException(e.getMessage(), e); + } + } + @Override protected boolean needsEscape(char ch) { if (RAW_TEXT_ELEMENTS.contains(currentTag)) { @@ -253,4 +361,28 @@ protected boolean needsEscape(char ch) { } return super.needsEscape(ch); } + + @Override + protected boolean needsEscape(final char ch, final boolean inAttribute) { + // In raw text elements (script, style), suppress escaping for TEXT content only. + // Attribute values must always be escaped, even on raw text elements. + if (!inAttribute && RAW_TEXT_ELEMENTS.contains(currentTag)) { + return false; + } + // For attributes, always return true (bypass the 1-arg override + // which returns false for all script/style content) + if (inAttribute) { + return true; + } + return super.needsEscape(ch, inAttribute); + } + + @Override + protected boolean needsEscaping(final boolean inAttribute) { + // Mirror the per-char rule above: TEXT content inside script/style is + // raw text and never needs escaping. Lets writeChars() bulk-stream + // the entire block in one Writer.write() call. 
+ return inAttribute || !RAW_TEXT_ELEMENTS.contains(currentTag); + } + } diff --git a/exist-core/src/main/java/org/exist/util/serializer/IndentingXMLWriter.java b/exist-core/src/main/java/org/exist/util/serializer/IndentingXMLWriter.java index c336d8b2943..99df54c3e19 100644 --- a/exist-core/src/main/java/org/exist/util/serializer/IndentingXMLWriter.java +++ b/exist-core/src/main/java/org/exist/util/serializer/IndentingXMLWriter.java @@ -25,7 +25,9 @@ import java.io.Writer; import java.util.ArrayDeque; import java.util.Deque; +import java.util.HashSet; import java.util.Properties; +import java.util.Set; import javax.xml.transform.OutputKeys; import javax.xml.transform.TransformerException; @@ -48,6 +50,8 @@ public class IndentingXMLWriter extends XMLWriter { private boolean sameline = false; private boolean whitespacePreserve = false; private final Deque whitespacePreserveStack = new ArrayDeque<>(); + private Set suppressIndentation = null; + private int suppressIndentDepth = 0; public IndentingXMLWriter() { super(); @@ -75,6 +79,9 @@ public void startElement(final String namespaceURI, final String localName, fina indent(); } super.startElement(namespaceURI, localName, qname); + if (isSuppressIndentation(localName)) { + suppressIndentDepth++; + } addIndent(); afterTag = true; sameline = true; @@ -86,6 +93,9 @@ public void startElement(final QName qname) throws TransformerException { indent(); } super.startElement(qname); + if (isSuppressIndentation(qname.getLocalPart())) { + suppressIndentDepth++; + } addIndent(); afterTag = true; sameline = true; @@ -95,6 +105,9 @@ public void startElement(final QName qname) throws TransformerException { public void endElement(final String namespaceURI, final String localName, final String qname) throws TransformerException { endIndent(namespaceURI, localName); super.endElement(namespaceURI, localName, qname); + if (isSuppressIndentation(localName) && suppressIndentDepth > 0) { + suppressIndentDepth--; + } popWhitespacePreserve(); 
// apply ancestor's xml:space value _after_ end element sameline = isInlineTag(namespaceURI, localName); afterTag = true; @@ -104,6 +117,9 @@ public void endElement(final String namespaceURI, final String localName, final public void endElement(final QName qname) throws TransformerException { endIndent(qname.getNamespaceURI(), qname.getLocalPart()); super.endElement(qname); + if (isSuppressIndentation(qname.getLocalPart()) && suppressIndentDepth > 0) { + suppressIndentDepth--; + } popWhitespacePreserve(); // apply ancestor's xml:space value _after_ end element sameline = isInlineTag(qname.getNamespaceURI(), qname.getLocalPart()); afterTag = true; @@ -164,7 +180,29 @@ public void setOutputProperties(final Properties properties) { } catch (final NumberFormatException e) { LOG.warn("Invalid indentation value: '{}'", option); } - indent = "yes".equals(outputProperties.getProperty(OutputKeys.INDENT, "no")); + final String indentValue = outputProperties.getProperty(OutputKeys.INDENT, "no").trim(); + indent = "yes".equals(indentValue) || "true".equals(indentValue) || "1".equals(indentValue); + final String suppressProp = outputProperties.getProperty("suppress-indentation"); + if (suppressProp != null && !suppressProp.isEmpty()) { + suppressIndentation = new HashSet<>(); + for (final String name : suppressProp.split("\\s+")) { + if (!name.isEmpty()) { + // Handle URI-qualified names: Q{ns}local or {ns}local → extract local part + if (name.startsWith("Q{") || name.startsWith("{")) { + final int closeBrace = name.indexOf('}'); + if (closeBrace > 0 && closeBrace < name.length() - 1) { + suppressIndentation.add(name.substring(closeBrace + 1)); + } else { + suppressIndentation.add(name); + } + } else { + suppressIndentation.add(name); + } + } + } + } else { + suppressIndentation = null; + } } @Override @@ -220,8 +258,12 @@ protected void addSpaceIfIndent() throws IOException { writer.write(' '); } + private boolean isSuppressIndentation(final String localName) { + return 
suppressIndentation != null && suppressIndentation.contains(localName); + } + protected void indent() throws TransformerException { - if (!indent || whitespacePreserve) { + if (!indent || whitespacePreserve || suppressIndentDepth > 0) { return; } final int spaces = indentAmount * level; diff --git a/exist-core/src/main/java/org/exist/util/serializer/TEXTWriter.java b/exist-core/src/main/java/org/exist/util/serializer/TEXTWriter.java index 85c5c4cf5a6..5ea206a25da 100644 --- a/exist-core/src/main/java/org/exist/util/serializer/TEXTWriter.java +++ b/exist-core/src/main/java/org/exist/util/serializer/TEXTWriter.java @@ -206,16 +206,10 @@ protected void writeDoctype(final String rootElement) throws TransformerExceptio @Override protected void writeChars(final CharSequence s, final boolean inAttribute) throws IOException { - final int len = s.length(); - writeCharSeq(s, 0, len); + writeCharSeq(s, 0, s.length()); } - - private void writeCharSeq(final CharSequence ch, final int start, final int end) throws IOException { - for (int i = start; i < end; i++) { - writer.write(ch.charAt(i)); - } - } - + + @Override protected void writeCharacterReference(final char charval) throws IOException { int o = 0; diff --git a/exist-core/src/main/java/org/exist/util/serializer/XHTML5Writer.java b/exist-core/src/main/java/org/exist/util/serializer/XHTML5Writer.java index e89e7119d19..bc4990eb5eb 100644 --- a/exist-core/src/main/java/org/exist/util/serializer/XHTML5Writer.java +++ b/exist-core/src/main/java/org/exist/util/serializer/XHTML5Writer.java @@ -22,7 +22,6 @@ package org.exist.util.serializer; import java.io.Writer; -import javax.xml.transform.TransformerException; import it.unimi.dsi.fastutil.objects.ObjectOpenHashSet; import it.unimi.dsi.fastutil.objects.ObjectSet; @@ -121,14 +120,4 @@ public XHTML5Writer(ObjectSet emptyTags, ObjectSet inlineTags) { public XHTML5Writer(Writer writer, ObjectSet emptyTags, ObjectSet inlineTags) { super(writer, emptyTags, inlineTags); } - - 
@Override - protected void writeDoctype(String rootElement) throws TransformerException { - if (doctypeWritten) { - return; - } - - documentType("html", null, null); - doctypeWritten = true; - } } diff --git a/exist-core/src/main/java/org/exist/util/serializer/XHTMLWriter.java b/exist-core/src/main/java/org/exist/util/serializer/XHTMLWriter.java index b0006f7f51c..d01a062fde4 100644 --- a/exist-core/src/main/java/org/exist/util/serializer/XHTMLWriter.java +++ b/exist-core/src/main/java/org/exist/util/serializer/XHTMLWriter.java @@ -23,6 +23,7 @@ import java.io.IOException; import java.io.Writer; +import javax.xml.transform.OutputKeys; import javax.xml.transform.TransformerException; import it.unimi.dsi.fastutil.objects.ObjectOpenHashSet; @@ -36,12 +37,85 @@ */ public class XHTMLWriter extends IndentingXMLWriter { + /** + * HTML boolean attributes per HTML 4.01 and HTML5 spec. + * When method="html" and the attribute value equals the attribute name + * (case-insensitive), the attribute is minimized to just the name. + */ + protected static final ObjectSet BOOLEAN_ATTRIBUTES = new ObjectOpenHashSet<>(31); + static { + BOOLEAN_ATTRIBUTES.add("checked"); + BOOLEAN_ATTRIBUTES.add("compact"); + BOOLEAN_ATTRIBUTES.add("declare"); + BOOLEAN_ATTRIBUTES.add("defer"); + BOOLEAN_ATTRIBUTES.add("disabled"); + BOOLEAN_ATTRIBUTES.add("ismap"); + BOOLEAN_ATTRIBUTES.add("multiple"); + BOOLEAN_ATTRIBUTES.add("nohref"); + BOOLEAN_ATTRIBUTES.add("noresize"); + BOOLEAN_ATTRIBUTES.add("noshade"); + BOOLEAN_ATTRIBUTES.add("nowrap"); + BOOLEAN_ATTRIBUTES.add("readonly"); + BOOLEAN_ATTRIBUTES.add("selected"); + } + + /** + * URI-valued attributes that must be %-escaped when escape-uri-attributes=yes + * (default for HTML/XHTML output methods, per W3C XSLT and XQuery + * Serialization 3.1 § 7.2.5). Keys are element local name + "/" + attribute + * local name, both lowercase. The synthetic key "*/href" matches any + * element bearing an href attribute (covers both a/@href and area/@href etc. 
+ * in a single check while still letting non-URI attributes through). + */ + private static final ObjectSet URI_VALUED_ATTRIBUTES = new ObjectOpenHashSet<>(48); + static { + URI_VALUED_ATTRIBUTES.add("a/href"); + URI_VALUED_ATTRIBUTES.add("a/name"); + URI_VALUED_ATTRIBUTES.add("applet/codebase"); + URI_VALUED_ATTRIBUTES.add("area/href"); + URI_VALUED_ATTRIBUTES.add("base/href"); + URI_VALUED_ATTRIBUTES.add("blockquote/cite"); + URI_VALUED_ATTRIBUTES.add("body/background"); + URI_VALUED_ATTRIBUTES.add("button/formaction"); + URI_VALUED_ATTRIBUTES.add("del/cite"); + URI_VALUED_ATTRIBUTES.add("form/action"); + URI_VALUED_ATTRIBUTES.add("frame/longdesc"); + URI_VALUED_ATTRIBUTES.add("frame/src"); + URI_VALUED_ATTRIBUTES.add("head/profile"); + URI_VALUED_ATTRIBUTES.add("html/manifest"); + URI_VALUED_ATTRIBUTES.add("iframe/longdesc"); + URI_VALUED_ATTRIBUTES.add("iframe/src"); + URI_VALUED_ATTRIBUTES.add("img/longdesc"); + URI_VALUED_ATTRIBUTES.add("img/src"); + URI_VALUED_ATTRIBUTES.add("img/usemap"); + URI_VALUED_ATTRIBUTES.add("input/formaction"); + URI_VALUED_ATTRIBUTES.add("input/src"); + URI_VALUED_ATTRIBUTES.add("input/usemap"); + URI_VALUED_ATTRIBUTES.add("ins/cite"); + URI_VALUED_ATTRIBUTES.add("link/href"); + URI_VALUED_ATTRIBUTES.add("object/archive"); + URI_VALUED_ATTRIBUTES.add("object/classid"); + URI_VALUED_ATTRIBUTES.add("object/codebase"); + URI_VALUED_ATTRIBUTES.add("object/data"); + URI_VALUED_ATTRIBUTES.add("object/usemap"); + URI_VALUED_ATTRIBUTES.add("q/cite"); + URI_VALUED_ATTRIBUTES.add("script/src"); + URI_VALUED_ATTRIBUTES.add("source/src"); + URI_VALUED_ATTRIBUTES.add("track/src"); + URI_VALUED_ATTRIBUTES.add("video/poster"); + URI_VALUED_ATTRIBUTES.add("video/src"); + URI_VALUED_ATTRIBUTES.add("audio/src"); + } + + private static final char[] HEX = "0123456789ABCDEF".toCharArray(); + protected static final ObjectSet EMPTY_TAGS = new ObjectOpenHashSet<>(31); static { EMPTY_TAGS.add("area"); EMPTY_TAGS.add("base"); EMPTY_TAGS.add("br"); 
EMPTY_TAGS.add("col"); + EMPTY_TAGS.add("embed"); EMPTY_TAGS.add("hr"); EMPTY_TAGS.add("img"); EMPTY_TAGS.add("input"); @@ -52,48 +126,71 @@ public class XHTMLWriter extends IndentingXMLWriter { EMPTY_TAGS.add("isindex"); EMPTY_TAGS.add("param"); } - + protected static final ObjectSet INLINE_TAGS = new ObjectOpenHashSet<>(31); - static { - INLINE_TAGS.add("a"); - INLINE_TAGS.add("abbr"); - INLINE_TAGS.add("acronym"); - INLINE_TAGS.add("b"); - INLINE_TAGS.add("bdo"); - INLINE_TAGS.add("big"); - INLINE_TAGS.add("br"); - INLINE_TAGS.add("button"); - INLINE_TAGS.add("cite"); - INLINE_TAGS.add("code"); - INLINE_TAGS.add("del"); - INLINE_TAGS.add("dfn"); - INLINE_TAGS.add("em"); - INLINE_TAGS.add("i"); - INLINE_TAGS.add("img"); - INLINE_TAGS.add("input"); - INLINE_TAGS.add("kbd"); - INLINE_TAGS.add("label"); - INLINE_TAGS.add("q"); - INLINE_TAGS.add("samp"); - INLINE_TAGS.add("select"); - INLINE_TAGS.add("small"); - INLINE_TAGS.add("span"); - INLINE_TAGS.add("strong"); - INLINE_TAGS.add("sub"); - INLINE_TAGS.add("sup"); - INLINE_TAGS.add("textarea"); - INLINE_TAGS.add("tt"); - INLINE_TAGS.add("var"); - } - - protected String currentTag; + INLINE_TAGS.add("a"); + INLINE_TAGS.add("abbr"); + INLINE_TAGS.add("acronym"); + INLINE_TAGS.add("b"); + INLINE_TAGS.add("bdo"); + INLINE_TAGS.add("big"); + INLINE_TAGS.add("br"); + INLINE_TAGS.add("button"); + INLINE_TAGS.add("cite"); + INLINE_TAGS.add("code"); + INLINE_TAGS.add("del"); + INLINE_TAGS.add("dfn"); + INLINE_TAGS.add("em"); + INLINE_TAGS.add("i"); + INLINE_TAGS.add("img"); + INLINE_TAGS.add("input"); + INLINE_TAGS.add("kbd"); + INLINE_TAGS.add("label"); + INLINE_TAGS.add("q"); + INLINE_TAGS.add("samp"); + INLINE_TAGS.add("select"); + INLINE_TAGS.add("small"); + INLINE_TAGS.add("span"); + INLINE_TAGS.add("strong"); + INLINE_TAGS.add("sub"); + INLINE_TAGS.add("sup"); + INLINE_TAGS.add("textarea"); + INLINE_TAGS.add("tt"); + INLINE_TAGS.add("var"); + } + + private static final String SVG_NS = "http://www.w3.org/2000/svg"; + 
private static final String MATHML_NS = "http://www.w3.org/1998/Math/MathML"; + + private static final ObjectSet RAW_TEXT_ELEMENTS_HTML = new ObjectOpenHashSet<>(4); + static { + RAW_TEXT_ELEMENTS_HTML.add("script"); + RAW_TEXT_ELEMENTS_HTML.add("style"); + } protected final ObjectSet emptyTags; protected final ObjectSet inlineTags; + protected String currentTag; + protected boolean inHead = false; + protected boolean contentTypeMetaWritten = false; + + // Meta-tag dedup state: when a `` element is encountered inside + // `` AFTER the auto-generated content-type meta has been emitted, + // its bytes are diverted to {@link #metaScratch}. If, while buffering, + // we observe a {@code charset} or {@code http-equiv="Content-Type"} + // attribute, the buffered meta is dropped (the auto-meta replaces it); + // otherwise the buffer is flushed verbatim at endElement time. + private Writer metaSuspendedWriter = null; + private java.io.StringWriter metaScratch = null; + private boolean metaIsContentTypeOrCharset = false; + + boolean haveCollapsedXhtmlPrefix = false; + private String collapsedForeignNs = null; // SVG or MathML ns being normalized + /** - * + * */ public XHTMLWriter() { this(EMPTY_TAGS, INLINE_TAGS); @@ -120,97 +217,311 @@ public XHTMLWriter(final Writer writer, ObjectSet emptyTags, ObjectSet> 6)); + appendHexByte(sb, 0x80 | (codepoint & 0x3F)); + } else if (codepoint < 0x10000) { + appendHexByte(sb, 0xE0 | (codepoint >> 12)); + appendHexByte(sb, 0x80 | ((codepoint >> 6) & 0x3F)); + appendHexByte(sb, 0x80 | (codepoint & 0x3F)); + } else { + appendHexByte(sb, 0xF0 | (codepoint >> 18)); + appendHexByte(sb, 0x80 | ((codepoint >> 12) & 0x3F)); + appendHexByte(sb, 0x80 | ((codepoint >> 6) & 0x3F)); + appendHexByte(sb, 0x80 | (codepoint & 0x3F)); + } + } + + private static void appendHexByte(final StringBuilder sb, final int b) { + sb.append('%'); + sb.append(HEX[(b >> 4) & 0xF]); + sb.append(HEX[b & 0xF]); + } + + @Override + protected void resetObjectState() { + 
super.resetObjectState(); + inHead = false; + contentTypeMetaWritten = false; + metaSuspendedWriter = null; + metaScratch = null; + metaIsContentTypeOrCharset = false; + } + + private boolean shouldBufferDuplicateMeta(final String localName) { + return inHead && contentTypeMetaWritten && metaSuspendedWriter == null + && "meta".equalsIgnoreCase(localName); + } + + /** True when the writer is currently diverting bytes for a candidate-duplicate meta. */ + protected boolean isBufferedMeta(final String localName) { + return metaSuspendedWriter != null && "meta".equalsIgnoreCase(localName); + } + + private void beginMetaBuffer() { + metaSuspendedWriter = writer; + metaScratch = new java.io.StringWriter(); + writer = metaScratch; + metaIsContentTypeOrCharset = false; + } + + protected void endMetaBuffer() throws TransformerException { + if (metaSuspendedWriter == null) { + return; + } + final Writer original = metaSuspendedWriter; + final String buffered = metaScratch.toString(); + final boolean dropDuplicate = metaIsContentTypeOrCharset; + metaSuspendedWriter = null; + metaScratch = null; + metaIsContentTypeOrCharset = false; + writer = original; + if (!dropDuplicate) { + try { + writer.write(buffered); + } catch (final IOException ioe) { + throw new TransformerException(ioe.getMessage(), ioe); + } + } + } + + protected void noteMetaAttribute(final String localName, final CharSequence value) { + if (metaSuspendedWriter == null) { + return; + } + if ("charset".equalsIgnoreCase(localName)) { + metaIsContentTypeOrCharset = true; + } else if ("http-equiv".equalsIgnoreCase(localName) + && value != null && "Content-Type".equalsIgnoreCase(value.toString())) { + metaIsContentTypeOrCharset = true; + } + } + protected boolean isEmptyTag(final String tag) { return emptyTags.contains(tag); } - boolean haveCollapsedXhtmlPrefix = false; - @Override public void startElement(final QName qname) throws TransformerException { - + final QName xhtmlQName = removeXhtmlPrefix(qname); - + + if 
(shouldBufferDuplicateMeta(xhtmlQName.getLocalPart())) { + beginMetaBuffer(); + } super.startElement(xhtmlQName); currentTag = xhtmlQName.getStringValue(); + if ("head".equalsIgnoreCase(xhtmlQName.getLocalPart())) { + inHead = true; + writeContentTypeMeta(); + } } - + @Override public void endElement(final QName qname) throws TransformerException { final QName xhtmlQName = removeXhtmlPrefix(qname); - + final boolean isMetaInHead = metaSuspendedWriter != null + && "meta".equalsIgnoreCase(xhtmlQName.getLocalPart()); + if (inHead && "head".equalsIgnoreCase(xhtmlQName.getLocalPart())) { + inHead = false; + } + super.endElement(xhtmlQName); - + + if (isMetaInHead) { + endMetaBuffer(); + } + haveCollapsedXhtmlPrefix = false; + collapsedForeignNs = null; } - + protected QName removeXhtmlPrefix(final QName qname) { final String prefix = qname.getPrefix(); final String namespaceURI = qname.getNamespaceURI(); - if(prefix != null && !prefix.isEmpty() && namespaceURI != null && namespaceURI.equals(Namespaces.XHTML_NS)) { - haveCollapsedXhtmlPrefix = true; - return new QName(qname.getLocalPart(), namespaceURI); + if (prefix != null && !prefix.isEmpty() && namespaceURI != null) { + if (namespaceURI.equals(Namespaces.XHTML_NS)) { + haveCollapsedXhtmlPrefix = true; + return new QName(qname.getLocalPart(), namespaceURI); + } + // XHTML5: normalize SVG and MathML prefixes to default namespace + if (isHtml5Version() && (namespaceURI.equals(SVG_NS) || namespaceURI.equals(MATHML_NS))) { + collapsedForeignNs = namespaceURI; + return new QName(qname.getLocalPart(), namespaceURI); + } } - return qname; } @Override public void startElement(final String namespaceURI, final String localName, final String qname) throws TransformerException { - + final String xhtmlQName = removeXhtmlPrefix(namespaceURI, qname); - + + if (shouldBufferDuplicateMeta(localName)) { + beginMetaBuffer(); + } super.startElement(namespaceURI, localName, xhtmlQName); currentTag = xhtmlQName; + if 
("head".equalsIgnoreCase(localName)) { + inHead = true; + writeContentTypeMeta(); + } } - + @Override public void endElement(final String namespaceURI, final String localName, final String qname) throws TransformerException { - + final boolean isMetaInHead = metaSuspendedWriter != null + && "meta".equalsIgnoreCase(localName); + if (inHead && "head".equalsIgnoreCase(localName)) { + inHead = false; + } + final String xhtmlQName = removeXhtmlPrefix(namespaceURI, qname); - + super.endElement(namespaceURI, localName, xhtmlQName); - + + if (isMetaInHead) { + endMetaBuffer(); + } + haveCollapsedXhtmlPrefix = false; + collapsedForeignNs = null; } - + protected String removeXhtmlPrefix(final String namespaceURI, final String qname) { - final int pos = qname.indexOf(':'); - if(pos > 0 && namespaceURI != null && namespaceURI.equals(Namespaces.XHTML_NS)) { - haveCollapsedXhtmlPrefix = true; - return qname.substring(pos+1); - + if (pos > 0 && namespaceURI != null) { + if (namespaceURI.equals(Namespaces.XHTML_NS)) { + haveCollapsedXhtmlPrefix = true; + return qname.substring(pos + 1); + } + // XHTML5: normalize SVG and MathML prefixes + if (isHtml5Version() && (namespaceURI.equals(SVG_NS) || namespaceURI.equals(MATHML_NS))) { + collapsedForeignNs = namespaceURI; + return qname.substring(pos + 1); + } } - return qname; } @Override public void namespace(final String prefix, final String nsURI) throws TransformerException { - if(haveCollapsedXhtmlPrefix && prefix != null && !prefix.isEmpty() && nsURI.equals(Namespaces.XHTML_NS)) { - return; //dont output the xmlns:prefix for the collapsed nodes prefix + if (haveCollapsedXhtmlPrefix && prefix != null && !prefix.isEmpty() && nsURI.equals(Namespaces.XHTML_NS)) { + return; // don't output the xmlns:prefix for the collapsed node's prefix + } + // When a foreign namespace prefix was collapsed, replace the prefixed + // declaration with a default namespace declaration + if (collapsedForeignNs != null && prefix != null && !prefix.isEmpty() 
+ && nsURI.equals(collapsedForeignNs)) { + super.namespace("", nsURI); // emit xmlns="..." instead of xmlns:prefix="..." + return; } - super.namespace(prefix, nsURI); } - - + + @Override protected void closeStartTag(final boolean isEmpty) throws TransformerException { try { if (tagIsOpen) { + // Flush canonical buffers (sorted namespaces + attributes) if active + if (isCanonical()) { + flushCanonicalBuffersXhtml(); + } + final Writer w = getWriter(); if (isEmpty) { - if (isEmptyTag(currentTag)) { - getWriter().write(" />"); + if (isCanonical()) { + // Canonical: always expand empty elements — coalesce 4 writes into 2 + w.write(">'); + } else if (isEmptyTag(currentTag)) { + // For method="html", use HTML-style void tags (
<br>) + // For method="xhtml", use XHTML-style (<br />
) + if (isHtmlMethod()) { + w.write('>'); + } else { + w.write(" />"); + } } else { - getWriter().write('>'); - getWriter().write("'); + // Coalesce ">", "" into 2 writer calls instead of 4 + w.write(">'); } } else { - getWriter().write('>'); + w.write('>'); } tagIsOpen = false; } @@ -218,10 +529,282 @@ protected void closeStartTag(final boolean isEmpty) throws TransformerException throw new TransformerException(ioe.getMessage(), ioe); } } - + + /** + * Returns true if the output method is "html" (not "xhtml"). + * HTML uses void element syntax (
<br>) while XHTML uses self-closing (<br />
). + */ + protected boolean isHtmlMethod() { + if (outputProperties != null) { + final String method = outputProperties.getProperty(OutputKeys.METHOD); + return "html".equalsIgnoreCase(method); + } + return false; + } + + /** + * Returns true if the HTML version is 5.0 or higher. + * Checks html-version first, then falls back to version (per W3C spec for html method). + */ + protected boolean isHtml5Version() { + if (outputProperties == null) { + return true; // default to HTML5 + } + final String htmlVersion = outputProperties.getProperty(org.exist.storage.serializers.EXistOutputKeys.HTML_VERSION); + if (htmlVersion != null) { + try { + return Double.parseDouble(htmlVersion) >= 5.0; + } catch (final NumberFormatException e) { + // fall through + } + } + final String version = outputProperties.getProperty(OutputKeys.VERSION); + if (version != null) { + try { + return Double.parseDouble(version) >= 5.0; + } catch (final NumberFormatException e) { + // ignore + } + } + return true; // default to HTML5 + } + + /** + * DOCTYPE emission for XHTML/HTML output methods, per + * W3C XSLT and XQuery Serialization 3.1 sections 7.1 and 7.2. + * + *
<ul>
+ *   <li>doctype-system set: emit DOCTYPE with PUBLIC/SYSTEM ids</li>
+ *   <li>doctype-system absent, html method, doctype-public set: emit DOCTYPE PUBLIC</li>
+ *   <li>doctype-system absent, html-version ≥ 5: emit {@code <!DOCTYPE html>}</li>
+ *   <li>otherwise: no DOCTYPE</li>
+ * </ul>
+ * + * Only emitted when the root element is {@code html} (case-insensitive); for + * fragments rooted on any other element the DOCTYPE is suppressed. + */ + @Override + protected void writeDoctype(final String rootElement) throws TransformerException { + if (doctypeWritten) { + return; + } + if (isCanonical() || !isHtmlRoot(rootElement)) { + doctypeWritten = true; + return; + } + emitHtmlDoctype(); + doctypeWritten = true; + } + + private static boolean isHtmlRoot(final String rootElement) { + final int colon = rootElement.indexOf(':'); + final String localName = colon < 0 ? rootElement : rootElement.substring(colon + 1); + return "html".equalsIgnoreCase(localName); + } + + private String getDoctypeProperty(final String key) { + return outputProperties != null ? outputProperties.getProperty(key) : null; + } + + private void emitHtmlDoctype() throws TransformerException { + final String publicId = getDoctypeProperty(OutputKeys.DOCTYPE_PUBLIC); + final String systemId = getDoctypeProperty(OutputKeys.DOCTYPE_SYSTEM); + if (systemId != null) { + documentType("html", publicId, systemId); + } else if (isHtmlMethod() && publicId != null) { + documentType("html", publicId, null); + } else if (isHtml5Version()) { + documentType("html", null, null); + } + } + + @Override + public void attribute(final QName qname, final CharSequence value) throws TransformerException { + noteMetaAttribute(qname.getLocalPart(), value); + final CharSequence effectiveValue = maybeEscapeUri(qname.getLocalPart(), value); + // For method="html", minimize boolean attributes when value matches name + if (isHtmlMethod() && isBooleanAttribute(qname.getLocalPart(), effectiveValue)) { + try { + if (!tagIsOpen) { + characters(value); + return; + } + final Writer w = getWriter(); + w.write(' '); + w.write(qname.getLocalPart()); + // Don't write ="value" — minimized form + } catch (final IOException ioe) { + throw new TransformerException(ioe.getMessage(), ioe); + } + return; + } + super.attribute(qname, 
effectiveValue); + } + + @Override + public void attribute(final String qname, final CharSequence value) throws TransformerException { + // Strip prefix for the redundancy check (we want the local name). + final int colon = qname.indexOf(':'); + final String localName = colon < 0 ? qname : qname.substring(colon + 1); + noteMetaAttribute(localName, value); + final CharSequence effectiveValue = maybeEscapeUri(localName, value); + if (isHtmlMethod() && isBooleanAttribute(qname, effectiveValue)) { + try { + if (!tagIsOpen) { + characters(value); + return; + } + final Writer w = getWriter(); + w.write(' '); + w.write(qname); + } catch (final IOException ioe) { + throw new TransformerException(ioe.getMessage(), ioe); + } + return; + } + super.attribute(qname, effectiveValue); + } + + /** + * Apply escape-uri-attributes when the current element/attribute names + * a URI-valued attribute; otherwise return the value unchanged. Escaping + * is applied for both HTML and XHTML output methods, so URI-valued + * attributes round-trip in XHTML 1.0 / 5 output too. + */ + private CharSequence maybeEscapeUri(final String attrLocal, final CharSequence value) { + if (currentTag == null) { + return value; + } + final String elementLocal = currentTag.contains(":") + ? 
currentTag.substring(currentTag.indexOf(':') + 1) + : currentTag; + if (!shouldEscapeUriAttribute(elementLocal, attrLocal)) { + return value; + } + return escapeUriAttribute(value); + } + + private boolean isBooleanAttribute(final String attrName, final CharSequence value) { + return BOOLEAN_ATTRIBUTES.contains(attrName.toLowerCase(java.util.Locale.ROOT)) + && attrName.equalsIgnoreCase(value.toString()); + } + + @Override + protected boolean needsEscape(final char ch, final boolean inAttribute) { + // For HTML method, script and style content should not be escaped + if (!inAttribute && isHtmlMethod() + && currentTag != null && RAW_TEXT_ELEMENTS_HTML.contains(currentTag.toLowerCase(java.util.Locale.ROOT))) { + return false; + } + return super.needsEscape(ch, inAttribute); + } + + @Override + protected boolean needsEscaping(final boolean inAttribute) { + if (!inAttribute && isHtmlMethod() + && currentTag != null && RAW_TEXT_ELEMENTS_HTML.contains(currentTag.toLowerCase(java.util.Locale.ROOT))) { + return false; + } + return super.needsEscaping(inAttribute); + } + + /** + * Per W3C XSLT and XQuery Serialization 3.1 § 7.2.7, the html method + * ignores cdata-section-elements for HTML elements (CDATA sections are + * not valid HTML syntax) but DOES apply them to foreign content + * (e.g. SVG, MathML, or any element in a non-HTML namespace embedded + * in the document). For foreign content the rule is unconditional — + * the xdm-serialization gate that the XML writer otherwise applies + * does not gate HTML's foreign-content CDATA emission. + */ + @Override + protected boolean shouldUseCdataSections() { + if (isHtmlMethod()) { + final String ns = currentElementNamespaceURI(); + return ns != null && !ns.isEmpty() && !Namespaces.XHTML_NS.equals(ns); + } + return super.shouldUseCdataSections(); + } + + /** + * Processing-instruction serialization for HTML method (pre-HTML5). 
+ * Per W3C XSLT and XQuery Serialization 3.1 § 7.1.5, the HTML output + * method emits PIs as {@code } (no closing {@code ?>}); + * XHTML uses the regular XML form which the parent already provides. + * The HTML5 (PR2372) variant lives in {@link HTML5Writer}. + */ + @Override + public void processingInstruction(final String target, final String data) throws TransformerException { + if (!isHtmlMethod()) { + super.processingInstruction(target, data); + return; + } + try { + if (tagIsOpen) { + closeStartTag(false); + } + final Writer w = getWriter(); + w.write("'); + } catch (final IOException ioe) { + throw new TransformerException(ioe.getMessage(), ioe); + } + } + + @Override + protected boolean escapeAmpersandBeforeBrace() { + // HTML spec: & before { in attribute values should not be escaped + return false; + } + @Override protected boolean isInlineTag(final String namespaceURI, final String localName) { - return (namespaceURI == null || namespaceURI.isEmpty() || Namespaces.XHTML_NS.equals(namespaceURI)) - && inlineTags.contains(localName); + return (namespaceURI == null || namespaceURI.isEmpty() || Namespaces.XHTML_NS.equals(namespaceURI)) + && inlineTags.contains(localName); + } + + /** + * Write a meta content-type tag as the first child of head when + * include-content-type is enabled (the default per W3C Serialization 3.1). 
+ */ + protected void writeContentTypeMeta() throws TransformerException { + if (contentTypeMetaWritten || outputProperties == null) { + return; + } + final String includeContentType = outputProperties.getProperty("include-content-type", "yes"); + if (!"yes".equals(includeContentType)) { + return; + } + contentTypeMetaWritten = true; + try { + final String encoding = outputProperties.getProperty(OutputKeys.ENCODING, "UTF-8"); + closeStartTag(false); + final Writer writer = getWriter(); + + // HTML5 method uses + // XHTML and HTML4 use + // XHTML mode requires self-closing tags (/>) for valid XML output — + // the URL rewrite pipeline re-parses this as XML in the view step. + final boolean selfClose = !isHtmlMethod(); + if (isHtmlMethod() && isHtml5Version()) { + writer.write("" : "\">"); + } else { + final String mediaType = outputProperties.getProperty(OutputKeys.MEDIA_TYPE, "text/html"); + writer.write("" : "\">"); + } + } catch (IOException e) { + throw new TransformerException(e.getMessage(), e); + } } } diff --git a/exist-core/src/main/java/org/exist/util/serializer/XMLWriter.java b/exist-core/src/main/java/org/exist/util/serializer/XMLWriter.java index 763aaf52ef6..410f723f8fb 100644 --- a/exist-core/src/main/java/org/exist/util/serializer/XMLWriter.java +++ b/exist-core/src/main/java/org/exist/util/serializer/XMLWriter.java @@ -78,6 +78,11 @@ public class XMLWriter implements SerializerWriter { private String defaultNamespace = ""; + // Namespace stack (BaseX-style): flat list of (prefix, uri) pairs for all in-scope bindings. + // nstack records the list size at each startElement so endElement can roll back declarations. + private final List nspaces = new ArrayList<>(); + private final Deque nstack = new ArrayDeque<>(); + /** * When serializing an XDM this should be true, * otherwise false. @@ -86,8 +91,33 @@ public class XMLWriter implements SerializerWriter { * compared to retrieving resources from the database. 
*/ private boolean xdmSerialization = false; + private boolean xml11 = false; + private boolean canonical = false; + @Nullable private java.text.Normalizer.Form normalizationForm = null; + + // Canonical XML: buffer namespaces and attributes for sorting + private final List canonicalNamespaces = new ArrayList<>(); // [prefix, uri] + private final List canonicalAttributes = new ArrayList<>(); // [nsUri, localName, qname, value] private final Deque elementName = new ArrayDeque<>(); + + /** + * Returns true if cdata-section-elements should be applied. + * Subclasses (e.g., XHTMLWriter for HTML method) can override + * to suppress CDATA sections. + */ + protected boolean shouldUseCdataSections() { + return xdmSerialization; + } + + /** + * Returns the namespace URI of the current (innermost) element, + * or null if no element is on the stack. + */ + protected String currentElementNamespaceURI() { + final QName top = elementName.peek(); + return top != null ? top.getNamespaceURI() : null; + } private LazyVal> cdataSectionElements = new LazyVal<>(this::parseCdataSectionElementNames); private boolean cdataSetionElement = false; @@ -96,8 +126,9 @@ public class XMLWriter implements SerializerWriter { Arrays.fill(textSpecialChars, false); textSpecialChars['<'] = true; textSpecialChars['>'] = true; - // textSpecialChars['\r'] = true; + textSpecialChars['\r'] = true; textSpecialChars['&'] = true; + textSpecialChars[0x7F] = true; // DEL must be escaped as  attrSpecialChars = new boolean[128]; Arrays.fill(attrSpecialChars, false); @@ -108,6 +139,7 @@ public class XMLWriter implements SerializerWriter { attrSpecialChars['\t'] = true; attrSpecialChars['&'] = true; attrSpecialChars['"'] = true; + attrSpecialChars[0x7F] = true; // DEL must be escaped as  } @Nullable private XMLDeclaration originalXmlDecl; @@ -139,6 +171,10 @@ public void setOutputProperties(final Properties properties) { } this.xdmSerialization = 
"yes".equals(outputProperties.getProperty(EXistOutputKeys.XDM_SERIALIZATION, "no")); + this.xml11 = "1.1".equals(outputProperties.getProperty(OutputKeys.VERSION)); + this.normalizationForm = parseNormalizationForm(outputProperties.getProperty("normalization-form", "none")); + final String canonicalProp = outputProperties.getProperty(EXistOutputKeys.CANONICAL); + this.canonical = "yes".equals(canonicalProp) || "true".equals(canonicalProp) || "1".equals(canonicalProp); } private Set parseCdataSectionElementNames() { @@ -166,6 +202,8 @@ protected void resetObjectState() { originalXmlDecl = null; doctypeWritten = false; defaultNamespace = ""; + nspaces.clear(); + nstack.clear(); cdataSectionElements = new LazyVal<>(this::parseCdataSectionElementNames); } @@ -184,12 +222,35 @@ public Writer getWriter() { } public String getDefaultNamespace() { - return defaultNamespace.isEmpty() ? null : defaultNamespace; + final String fromStack = nsLookup(""); + return (fromStack == null || fromStack.isEmpty()) ? null : fromStack; } public void setDefaultNamespace(final String namespace) { + // Keep the baseline field in sync; nsLookup() falls back to it when the + // namespace stack has no in-scope binding for the default prefix. defaultNamespace = namespace == null ? "" : namespace; } + + /** + * Looks up the currently in-scope URI for {@code prefix} by scanning the flat + * namespace list from innermost to outermost scope. + * For the default-namespace prefix ({@code ""}), falls back to the + * {@link #defaultNamespace} baseline field when the stack has no binding. + * + * @return the in-scope URI, or {@code null} if {@code prefix} is unbound + */ + private String nsLookup(final String prefix) { + for (int i = nspaces.size() - 2; i >= 0; i -= 2) { + if (nspaces.get(i).equals(prefix)) { + return nspaces.get(i + 1); + } + } + if (prefix.isEmpty()) { + return defaultNamespace.isEmpty() ? 
null : defaultNamespace; + } + return null; + } public void startDocument() throws TransformerException { resetObjectState(); @@ -207,15 +268,16 @@ public void startElement(final String namespaceUri, final String localName, fina if(!declarationWritten) { writeDeclaration(); } - + if(!doctypeWritten) { writeDoctype(qname); } - + try { if(tagIsOpen) { closeStartTag(false); } + nstack.push(nspaces.size()); writer.write('<'); writer.write(qname); tagIsOpen = true; @@ -233,21 +295,22 @@ public void startElement(final QName qname) throws TransformerException { if(!declarationWritten) { writeDeclaration(); } - + if(!doctypeWritten) { writeDoctype(qname.getStringValue()); } - + try { if(tagIsOpen) { closeStartTag(false); } + nstack.push(nspaces.size()); writer.write('<'); if(qname.getPrefix() != null && !qname.getPrefix().isEmpty()) { writer.write(qname.getPrefix()); writer.write(':'); } - + writer.write(qname.getLocalPart()); tagIsOpen = true; elementName.push(qname); @@ -266,6 +329,9 @@ public void endElement(final String namespaceURI, final String localName, final writer.write('>'); } elementName.pop(); + if (!nstack.isEmpty()) { + nspaces.subList(nstack.pop(), nspaces.size()).clear(); + } } catch(final IOException ioe) { throw new TransformerException(ioe.getMessage(), ioe); } @@ -285,40 +351,74 @@ public void endElement(final QName qname) throws TransformerException { writer.write('>'); } elementName.pop(); + if (!nstack.isEmpty()) { + nspaces.subList(nstack.pop(), nspaces.size()).clear(); + } } catch(final IOException ioe) { throw new TransformerException(ioe.getMessage(), ioe); } } public void namespace(final String prefix, final String nsURI) throws TransformerException { - if((nsURI == null) && (prefix == null || prefix.isEmpty())) { + final String normPrefix = prefix != null ? prefix : ""; + final String normUri = nsURI != null ? 
nsURI : ""; + + // The xml namespace is implicitly declared and never needs explicit serialization + if ("xml".equals(normPrefix)) { return; } - try { - if(!tagIsOpen) { + try { + if (!tagIsOpen) { + // An xmlns="" outside a start tag is harmless — just skip it + if (normUri.isEmpty() && normPrefix.isEmpty()) { + return; + } throw new TransformerException("Found a namespace declaration outside an element"); } - if(prefix != null && !prefix.isEmpty()) { - writer.write(' '); - writer.write("xmlns"); - writer.write(':'); - writer.write(prefix); - writer.write("=\""); - writeChars(nsURI, true); - writer.write('"'); - } else { - if(defaultNamespace.equals(nsURI)) { - return; + if (canonical) { + // Buffer for sorting — emitted in closeStartTag + // Validate: reject relative namespace URIs (SERE0024) + if (!normUri.isEmpty() && isRelativeUri(normUri)) { + throw new TransformerException("err:SERE0024 Canonical serialization does not allow relative namespace URIs: " + normUri); } - writer.write(' '); - writer.write("xmlns"); + if (normPrefix.isEmpty() && normUri.isEmpty()) { + return; // Skip xmlns="" in canonical (not meaningful for no-namespace elements) + } + // Deduplicate: replace existing binding for same prefix + canonicalNamespaces.removeIf(ns -> ns[0].equals(normPrefix)); + canonicalNamespaces.add(new String[]{normPrefix, normUri}); + // Track in namespace stack so getDefaultNamespace() stays accurate + nspaces.add(normPrefix); + nspaces.add(normUri); + return; + } + + // Look up what is currently in scope for this prefix. + // nsLookup scans nspaces from innermost to outermost and falls back to the + // defaultNamespace baseline field for the default-namespace prefix. + final String inScope = nsLookup(normPrefix); + final String effective = inScope != null ? 
inScope : ""; + if (normUri.equals(effective)) { + return; // Binding unchanged — no declaration needed + } + + // Record the new binding so descendants can see it via nsLookup + nspaces.add(normPrefix); + nspaces.add(normUri); + + // Write the namespace declaration + writer.write(' '); + if (normPrefix.isEmpty()) { + writer.write("xmlns=\""); + } else { + writer.write("xmlns:"); + writer.write(normPrefix); writer.write("=\""); - writeChars(nsURI, true); - writer.write('"'); - defaultNamespace= nsURI; } + writeChars(normUri, true); + writer.write('"'); } catch(final IOException ioe) { throw new TransformerException(ioe.getMessage(), ioe); } @@ -329,12 +429,18 @@ public void attribute(String qname, CharSequence value) throws TransformerExcept if(!tagIsOpen) { characters(value); return; - // throw new TransformerException("Found an attribute outside an - // element"); } - writer.write(' '); - writer.write(qname); - writer.write("=\""); + if (canonical) { + // Buffer for sorting — extract namespace URI from qname if prefixed + final int colon = qname.indexOf(':'); + final String nsUri = colon > 0 ? "" : ""; // string qname doesn't carry namespace + canonicalAttributes.add(new String[]{nsUri, colon > 0 ? qname.substring(colon + 1) : qname, qname, value.toString()}); + return; + } + // Coalesce ' ' + qname + '="' into a single bulk write when the + // qname fits in the scratch buffer (typical case for short HTML + // attribute names like class, href, style). 
+ writeAttributePrefix(qname); writeChars(value, true); writer.write('"'); } catch(final IOException ioe) { @@ -347,16 +453,26 @@ public void attribute(final QName qname, final CharSequence value) throws Transf if(!tagIsOpen) { characters(value); return; - // throw new TransformerException("Found an attribute outside an - // element"); } - writer.write(' '); - if(qname.getPrefix() != null && !qname.getPrefix().isEmpty()) { - writer.write(qname.getPrefix()); - writer.write(':'); + if (canonical) { + final String nsUri = qname.getNamespaceURI() != null ? qname.getNamespaceURI() : ""; + final String localName = qname.getLocalPart(); + final String fullName; + if (qname.getPrefix() != null && !qname.getPrefix().isEmpty()) { + fullName = qname.getPrefix() + ":" + localName; + } else { + fullName = localName; + } + canonicalAttributes.add(new String[]{nsUri, localName, fullName, value.toString()}); + return; + } + final String prefix = qname.getPrefix(); + final String localPart = qname.getLocalPart(); + if (prefix != null && !prefix.isEmpty()) { + writePrefixedAttributePrefix(prefix, localPart); + } else { + writeAttributePrefix(localPart); } - writer.write(qname.getLocalPart()); - writer.write("=\""); writeChars(value, true); writer.write('"'); } catch(final IOException ioe) { @@ -364,6 +480,55 @@ public void attribute(final QName qname, final CharSequence value) throws Transf } } + /** + * Write {@code ' ' + qname + '="'} as a single {@code Writer.write(char[], + * int, int)} call when {@code qname} fits in the scratch buffer. Reduces + * 3 writer calls per attribute to 1. 
+ */ + private void writeAttributePrefix(final String qname) throws IOException { + final int qlen = qname.length(); + final int needed = qlen + 3; // ' ' + qname + '="' + if (needed <= ATTR_SCRATCH_LEN) { + attrScratch[0] = ' '; + qname.getChars(0, qlen, attrScratch, 1); + attrScratch[qlen + 1] = '='; + attrScratch[qlen + 2] = '"'; + writer.write(attrScratch, 0, needed); + } else { + writer.write(' '); + writer.write(qname); + writer.write("=\""); + } + } + + /** + * Write {@code ' ' + prefix + ':' + localPart + '="'} as a single bulk + * write when it fits the scratch buffer. + */ + private void writePrefixedAttributePrefix(final String prefix, final String localPart) throws IOException { + final int plen = prefix.length(); + final int llen = localPart.length(); + final int needed = plen + llen + 4; // ' ' + prefix + ':' + localPart + '="' + if (needed <= ATTR_SCRATCH_LEN) { + attrScratch[0] = ' '; + prefix.getChars(0, plen, attrScratch, 1); + attrScratch[plen + 1] = ':'; + localPart.getChars(0, llen, attrScratch, plen + 2); + attrScratch[plen + llen + 2] = '='; + attrScratch[plen + llen + 3] = '"'; + writer.write(attrScratch, 0, needed); + } else { + writer.write(' '); + writer.write(prefix); + writer.write(':'); + writer.write(localPart); + writer.write("=\""); + } + } + + private static final int ATTR_SCRATCH_LEN = 96; + private final char[] attrScratch = new char[ATTR_SCRATCH_LEN]; + public void characters(final CharSequence chars) throws TransformerException { if(!declarationWritten) { writeDeclaration(); @@ -373,12 +538,68 @@ public void characters(final CharSequence chars) throws TransformerException { if(tagIsOpen) { closeStartTag(false); } - writeChars(chars, false); + // When xdmSerialization is active and current element is in cdata-section-elements, + // wrap text content in CDATA instead of escaping it (per W3C Serialization 3.1) + if (shouldUseCdataSections() && !elementName.isEmpty() + && cdataSectionElements.get().contains(elementName.peek())) { + 
writeCdataContent(chars); + } else { + writeChars(chars, false); + } } catch(final IOException ioe) { throw new TransformerException(ioe.getMessage(), ioe); } } + private void writeCdataContent(final CharSequence chars) throws IOException { + // CDATA sections must be split when: + // 1. The content contains "]]>" (which would end the CDATA prematurely) + // 2. A character cannot be represented in the output encoding (must be escaped as &#xNN;) + final String s = normalize(chars).toString(); + boolean inCdata = false; + for (int i = 0; i < s.length(); ) { + final int cp = s.codePointAt(i); + final int cpLen = Character.charCount(cp); + + // Check for "]]>" sequence + if (cp == ']' && i + 2 < s.length() && s.charAt(i + 1) == ']' && s.charAt(i + 2) == '>') { + if (!inCdata) { + writer.write(""); + inCdata = false; + i += 2; // skip "]]", the ">" will be picked up next + continue; + } + + // Check if character is encodable in the output charset + if (!charSet.inCharacterSet((char) cp)) { + // Close any open CDATA section + if (inCdata) { + writer.write("]]>"); + inCdata = false; + } + // Write as character reference + writer.write("&#x"); + writer.write(Integer.toHexString(cp)); + writer.write(';'); + } else { + // Encodable character — write inside CDATA + if (!inCdata) { + writer.write(""); + } + } + public void characters(final char[] ch, final int start, final int len) throws TransformerException { if(!declarationWritten) { writeDeclaration(); @@ -510,8 +731,23 @@ public void documentType(final String name, final String publicId, final String protected void closeStartTag(final boolean isEmpty) throws TransformerException { try { if(tagIsOpen) { - if(isEmpty) { + if (canonical) { + flushCanonicalBuffers(); + } + if(isEmpty && !canonical) { + // Canonical XML: empty elements expanded to writer.write("/>"); + } else if (isEmpty) { + // Canonical: write > for empty elements + writer.write('>'); + final QName currentElem = elementName.peek(); + writer.write("'); } else 
{ writer.write('>'); } @@ -522,6 +758,52 @@ protected void closeStartTag(final boolean isEmpty) throws TransformerException } } + protected boolean isCanonical() { + return canonical; + } + + protected void flushCanonicalBuffersXhtml() throws TransformerException { + try { + flushCanonicalBuffers(); + } catch (final IOException ioe) { + throw new TransformerException(ioe.getMessage(), ioe); + } + } + + private void flushCanonicalBuffers() throws IOException { + // Sort namespaces by prefix (default namespace first, then alphabetical) + canonicalNamespaces.sort((a, b) -> a[0].compareTo(b[0])); + // Write sorted namespaces + for (final String[] ns : canonicalNamespaces) { + writer.write(' '); + if (ns[0].isEmpty()) { + writer.write("xmlns=\""); + } else { + writer.write("xmlns:"); + writer.write(ns[0]); + writer.write("=\""); + } + writeChars(ns[1], true); + writer.write('"'); + } + canonicalNamespaces.clear(); + + // Sort attributes by namespace URI (primary), then local name (secondary) + canonicalAttributes.sort((a, b) -> { + final int cmp = a[0].compareTo(b[0]); + return cmp != 0 ? cmp : a[1].compareTo(b[1]); + }); + // Write sorted attributes + for (final String[] attr : canonicalAttributes) { + writer.write(' '); + writer.write(attr[2]); // qualified name + writer.write("=\""); + writeChars(attr[3], true); + writer.write('"'); + } + canonicalAttributes.clear(); + } + protected void writeDeclaration() throws TransformerException { if(declarationWritten) { return; @@ -537,7 +819,9 @@ protected void writeDeclaration() throws TransformerException { // get the fields of the persisted xml declaration, but overridden with any properties from the serialization properties final String version = outputProperties.getProperty(OutputKeys.VERSION, (originalXmlDecl.version != null ? originalXmlDecl.version : DEFAULT_XML_VERSION)); final String encoding = outputProperties.getProperty(OutputKeys.ENCODING, (originalXmlDecl.encoding != null ? 
originalXmlDecl.encoding : DEFAULT_XML_ENCODING)); - @Nullable final String standalone = outputProperties.getProperty(OutputKeys.STANDALONE, originalXmlDecl.standalone); + @Nullable final String standaloneOrig = outputProperties.getProperty(OutputKeys.STANDALONE, originalXmlDecl.standalone); + // "omit" means standalone should be absent from the declaration + @Nullable final String standalone = (standaloneOrig != null && "omit".equalsIgnoreCase(standaloneOrig.trim())) ? null : standaloneOrig; writeDeclaration(version, encoding, standalone); @@ -545,11 +829,15 @@ protected void writeDeclaration() throws TransformerException { } final String omitXmlDecl = outputProperties.getProperty(OutputKeys.OMIT_XML_DECLARATION, "yes"); - if ("no".equals(omitXmlDecl)) { + @Nullable final String standaloneRaw = outputProperties.getProperty(OutputKeys.STANDALONE); + // "omit" means standalone should be absent from the declaration + @Nullable final String standalone = (standaloneRaw != null && "omit".equalsIgnoreCase(standaloneRaw.trim())) ? 
null : standaloneRaw; + // Per W3C Serialization 3.1: output declaration if omit-xml-declaration is false/no/0, + // or if standalone is explicitly set (the declaration is required to carry standalone) + if (isBooleanFalse(omitXmlDecl) || standalone != null) { // get the fields of the declaration from the serialization properties final String version = outputProperties.getProperty(OutputKeys.VERSION, DEFAULT_XML_VERSION); final String encoding = outputProperties.getProperty(OutputKeys.ENCODING, DEFAULT_XML_ENCODING); - @Nullable final String standalone = outputProperties.getProperty(OutputKeys.STANDALONE); writeDeclaration(version, encoding, standalone); } @@ -564,7 +852,15 @@ private void writeDeclaration(final String version, final String encoding, @Null writer.write('"'); if(standalone != null) { writer.write(" standalone=\""); - writer.write(standalone); + // Normalize boolean values to yes/no for XML declaration + final String standaloneVal = standalone.trim(); + if ("true".equals(standaloneVal) || "1".equals(standaloneVal)) { + writer.write("yes"); + } else if ("false".equals(standaloneVal) || "0".equals(standaloneVal)) { + writer.write("no"); + } else { + writer.write(standaloneVal); + } writer.write('"'); } writer.write("?>\n"); @@ -589,60 +885,112 @@ protected void writeDoctype(final String rootElement) throws TransformerExceptio protected boolean needsEscape(final char ch) { return true; } - + + /** + * Whether & before { should be escaped. HTML output returns false + * per W3C HTML serialization spec. XML output returns true (always escape &). + */ + protected boolean escapeAmpersandBeforeBrace() { + return true; + } + + /** + * Check if a serialization boolean parameter value is false. + * W3C Serialization 3.1 accepts "no", "false", "0" (with optional whitespace) as false. 
+ */ + protected static boolean isBooleanFalse(final String value) { + if (value == null) { + return false; + } + final String trimmed = value.trim(); + return "no".equals(trimmed) || "false".equals(trimmed) || "0".equals(trimmed); + } + + /** + * Whether the given character needs escaping. Subclasses can override + * to suppress escaping for specific contexts (e.g., HTML raw text elements). + * + * @param ch the character to check + * @param inAttribute true if we're writing an attribute value + */ + protected boolean needsEscape(final char ch, final boolean inAttribute) { + return needsEscape(ch); + } + + /** + * Whether the current context requires character escaping at all. + * Subclasses (e.g., HTML5Writer for {@code ", + "html", "5.0"); + assertTrue("Script attribute & should be escaped: " + result, + result.contains("language=\"Jack&Jill\"")); + assertTrue("Script body && should NOT be escaped: " + result, + result.contains("go && run()")); + } + + @Test + public void html40NoDoctypeWithoutPublicSystem() throws Exception { + // HTML 4.0 without doctype-public/doctype-system should not emit DOCTYPE + final String result = serialize("

hello

", "html", "4.0"); + assertFalse("HTML 4.0 without public/system should NOT have DOCTYPE: " + result, + result.contains("\n"; + final String expected = ""; final QName elQName = new QName("input"); writer.startElement(elQName); writer.attribute("checked", "checked"); @@ -54,7 +54,7 @@ public void testAttributeWithBooleanValue() throws Exception { @Test public void testAttributeWithNonBooleanValue() throws Exception { - final String expected = "\n"; + final String expected = ""; final QName elQName = new QName("input"); writer.startElement(elQName); writer.attribute("name", "name"); @@ -66,7 +66,7 @@ public void testAttributeWithNonBooleanValue() throws Exception { @Test public void testAttributeQNameWithBooleanValue() throws Exception { - final String expected = "\n"; + final String expected = ""; final QName elQName = new QName("input"); final QName attrQName = new QName("checked"); writer.startElement(elQName); @@ -79,7 +79,7 @@ public void testAttributeQNameWithBooleanValue() throws Exception { @Test public void testAttributeQNameWithNonBooleanValue() throws Exception { - final String expected = "\n"; + final String expected = ""; final QName elQName = new QName("input"); final QName attrQName = new QName("name"); writer.startElement(elQName); diff --git a/exist-core/src/test/java/org/exist/util/serializer/HtmlSerializerBenchmark.java b/exist-core/src/test/java/org/exist/util/serializer/HtmlSerializerBenchmark.java new file mode 100644 index 00000000000..932b2f2b8a4 --- /dev/null +++ b/exist-core/src/test/java/org/exist/util/serializer/HtmlSerializerBenchmark.java @@ -0,0 +1,291 @@ +/* + * eXist-db Open Source Native XML Database + * Copyright (C) 2001 The eXist-db Authors + * + * info@exist-db.org + * http://www.exist-db.org + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at 
your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ +package org.exist.util.serializer; + +import org.exist.dom.QName; +import org.junit.Test; + +import javax.xml.transform.OutputKeys; +import javax.xml.transform.TransformerException; +import java.io.IOException; +import java.io.Writer; +import java.util.Properties; + +import static org.junit.Assert.assertTrue; + +/** + * Microbenchmark for HTML serialization that exercises the writeChars/writeCharSeq + * hot path. Builds a representative HTML document with paragraphs of plain text + * (no special chars in the safe runs) and serializes it many times. + * + * Compares two configurations: + * - bulk writes via {@link Writer#write(char[], int, int)} (current code) + * - per-char writes via {@link Writer#write(int)} (the previous behaviour) + * + * The "per-char" baseline is simulated by wrapping the writer in one that + * counts only charAt-based calls — this lets us prove the algorithmic + * improvement without having to revert the patch. + */ +public class HtmlSerializerBenchmark { + + private static final String LOREM = + "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do " + + "eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim " + + "ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut " + + "aliquip ex ea commodo consequat. 
Duis aute irure dolor in reprehenderit " + + "in voluptate velit esse cillum dolore eu fugiat nulla pariatur."; + + private static final int PARAGRAPH_COUNT = 80; + private static final int ITERATIONS = 200; + + /** + * Counts both bulk and per-char writes so we can verify the hot path is + * actually using bulk operations. + */ + private static final class CountingWriter extends Writer { + long bulkWriteCalls; + long bulkCharsWritten; + long perCharWriteCalls; + long stringWriteCalls; + long stringCharsWritten; + + @Override + public void write(int c) { + perCharWriteCalls++; + } + + @Override + public void write(char[] cbuf, int off, int len) { + bulkWriteCalls++; + bulkCharsWritten += len; + } + + @Override + public void write(String str, int off, int len) { + stringWriteCalls++; + stringCharsWritten += len; + } + + @Override + public void flush() { /* no-op: metrics live in fields, nothing to flush */ } + + @Override + public void close() { /* no-op: counting writer holds no resources */ } + } + + /** + * Forwards every write to the underlying writer one char at a time, + * simulating a writer that has no efficient bulk path. Wrapping a + * {@link java.io.StringWriter} in this is the closest we can come + * to measuring the *previous* writeCharSeq behaviour without reverting. 
+ */ + private static final class PerCharWriter extends Writer { + private final Writer delegate; + PerCharWriter(final Writer delegate) { this.delegate = delegate; } + @Override public void write(int c) throws IOException { delegate.write(c); } + @Override public void write(char[] cbuf, int off, int len) throws IOException { + for (int i = 0; i < len; i++) delegate.write(cbuf[off + i]); + } + @Override public void write(String str, int off, int len) throws IOException { + for (int i = 0; i < len; i++) delegate.write(str.charAt(off + i)); + } + @Override public void flush() throws IOException { delegate.flush(); } + @Override public void close() throws IOException { delegate.close(); } + } + + /** Discards bytes — simulates a network sink with no I/O cost. */ + private static final class NullOutputStream extends java.io.OutputStream { + @Override public void write(int b) { /* no-op sink: byte intentionally discarded */ } + @Override public void write(byte[] b, int off, int len) { /* no-op sink: bytes intentionally discarded */ } + } + + private static java.io.OutputStreamWriter newProductionLikeWriter() { + // Mirrors the typical HTTP-response chain: OutputStreamWriter(UTF-8) over + // a stream sink. No BufferedWriter — eXist's serializer pipeline does its + // own buffering at higher levels. + return new java.io.OutputStreamWriter(new NullOutputStream(), java.nio.charset.StandardCharsets.UTF_8); + } + + @Test + public void rawTextFastPath() throws TransformerException, IOException { + // Compare per-char writes between an empty " while non-empty + // splits the close across two writers.write() calls.) 
+ final long stringCharsDelta = withScript.stringCharsWritten - empty.stringCharsWritten; + assertTrue("Script body should add bulk string output close to its size; " + + "empty=" + empty.stringCharsWritten + " withScript=" + + withScript.stringCharsWritten + " delta=" + stringCharsDelta + + " script.length()=" + script.length(), + stringCharsDelta >= script.length() - 5); + } + + private CountingWriter serializeWithScript(final String script) throws TransformerException { + final CountingWriter counter = new CountingWriter(); + final XHTMLWriter w = new HTML5Writer(counter); + final Properties props = new Properties(); + props.setProperty(OutputKeys.METHOD, "html"); + w.setOutputProperties(props); + w.startDocument(); + w.startElement(null, "html", "html"); + w.startElement(null, "body", "body"); + w.startElement(null, "script", "script"); + if (!script.isEmpty()) { + w.characters(script); + } + w.endElement(null, "script", "script"); + w.endElement(null, "body", "body"); + w.endElement(null, "html", "html"); + w.endDocument(); + return counter; + } + + @Test + public void compareAgainstPerCharWriter() throws TransformerException, IOException { + // Warm-up — let JIT compile the hot path + for (int i = 0; i < 5; i++) { + try (java.io.OutputStreamWriter w = newProductionLikeWriter()) { run(w); } + try (java.io.OutputStreamWriter w = newProductionLikeWriter()) { + run(new PerCharWriter(w)); + } + } + + // Bulk path (current code) + long bulkStart = System.nanoTime(); + for (int i = 0; i < ITERATIONS; i++) { + try (java.io.OutputStreamWriter w = newProductionLikeWriter()) { run(w); } + } + long bulkMs = (System.nanoTime() - bulkStart) / 1_000_000L; + + // Per-char path: wraps the OutputStreamWriter so every char goes through + // OutputStreamWriter.write(int) — same path the previous writeCharSeq used. 
+ long perCharStart = System.nanoTime(); + for (int i = 0; i < ITERATIONS; i++) { + try (java.io.OutputStreamWriter w = newProductionLikeWriter()) { + run(new PerCharWriter(w)); + } + } + long perCharMs = (System.nanoTime() - perCharStart) / 1_000_000L; + + System.out.println("[HtmlSerializerBenchmark] " + ITERATIONS + " iters of " + + PARAGRAPH_COUNT + "-paragraph HTML doc to OutputStreamWriter(UTF-8):"); + System.out.println("[HtmlSerializerBenchmark] bulk path: " + bulkMs + " ms (" + + String.format("%.3f", bulkMs * 1.0 / ITERATIONS) + " ms/doc)"); + System.out.println("[HtmlSerializerBenchmark] per-char path: " + perCharMs + " ms (" + + String.format("%.3f", perCharMs * 1.0 / ITERATIONS) + " ms/doc)"); + System.out.println("[HtmlSerializerBenchmark] speedup: " + + String.format("%.2fx", perCharMs * 1.0 / Math.max(1, bulkMs))); + + assertTrue("Bulk path should be faster than per-char path; bulk=" + + bulkMs + "ms perChar=" + perCharMs + "ms", bulkMs < perCharMs); + } + + @Test + public void htmlSerializationHotPath() throws TransformerException, IOException { + // Warm-up + for (int i = 0; i < 3; i++) { + run(new CountingWriter()); + } + + final CountingWriter counter = new CountingWriter(); + final long start = System.nanoTime(); + for (int i = 0; i < ITERATIONS; i++) { + run(counter); + } + final long elapsedMs = (System.nanoTime() - start) / 1_000_000L; + + final long totalChars = counter.bulkCharsWritten + counter.stringCharsWritten + counter.perCharWriteCalls; + final long bulkChars = counter.bulkCharsWritten + counter.stringCharsWritten; + final double bulkPct = bulkChars * 100.0 / totalChars; + + System.out.println("[HtmlSerializerBenchmark] " + ITERATIONS + " iterations of " + + PARAGRAPH_COUNT + "-paragraph HTML doc in " + elapsedMs + " ms" + + " (" + (elapsedMs * 1.0 / ITERATIONS) + " ms/doc)"); + System.out.println("[HtmlSerializerBenchmark] bulk writes: " + + counter.bulkWriteCalls + " (chars: " + counter.bulkCharsWritten + ")"); + 
System.out.println("[HtmlSerializerBenchmark] string writes: " + + counter.stringWriteCalls + " (chars: " + counter.stringCharsWritten + ")"); + System.out.println("[HtmlSerializerBenchmark] per-char writes: " + + counter.perCharWriteCalls); + System.out.println("[HtmlSerializerBenchmark] " + String.format("%.2f", bulkPct) + + "% of output bytes flushed in bulk"); + + // We expect the vast majority of safe-character output to flow through + // bulk writes (Writer.write(char[],int,int) or Writer.write(String,int,int)). + // Special-character escapes still go through per-char writes, but those + // are a tiny minority of output for typical HTML. + assertTrue("Expected >90% of chars to be flushed in bulk, but got " + bulkPct + "%", + bulkPct > 90.0); + } + + private void run(final Writer out) throws TransformerException { + final XHTMLWriter w = new XHTMLWriter(out); + final Properties props = new Properties(); + props.setProperty(OutputKeys.METHOD, "html"); + props.setProperty(OutputKeys.INDENT, "yes"); + w.setOutputProperties(props); + w.startDocument(); + w.startElement(null, "html", "html"); + w.startElement(null, "body", "body"); + for (int i = 0; i < PARAGRAPH_COUNT; i++) { + w.startElement(null, "p", "p"); + w.attribute("class", "para"); + w.characters(LOREM); + w.endElement(null, "p", "p"); + } + w.endElement(null, "body", "body"); + w.endElement(null, "html", "html"); + w.endDocument(); + } +} diff --git a/exist-core/src/test/xquery/xquery3/fnSerializeCharacterMaps.xqm b/exist-core/src/test/xquery/xquery3/fnSerializeCharacterMaps.xqm index e971e7a5a93..64fd0d5267e 100644 --- a/exist-core/src/test/xquery/xquery3/fnSerializeCharacterMaps.xqm +++ b/exist-core/src/test/xquery/xquery3/fnSerializeCharacterMaps.xqm @@ -59,3 +59,62 @@ function testSerialize:use_character_maps-032-params-as-map() { let $result := serialize($testSerialize:atomic, $params) return contains($result, "foo:a$$name") }; + +(: JSON serialization with use-character-maps :) + +declare + 
%test:assertEquals('{"name":"hello ©orld"}') +function testSerialize:json_character_map_string() { + let $params := map { + "method": "json", + "use-character-maps": map { "w": "©" } + } + return serialize(map { "name": "hello world" }, $params) +}; + +declare + %test:assertEquals('{"price":"$100"}') +function testSerialize:json_character_map_special() { + (: Map # to $ in JSON string values :) + let $params := map { + "method": "json", + "use-character-maps": map { "#": "$" } + } + return serialize(map { "price": "#100" }, $params) +}; + +declare + %test:assertTrue +function testSerialize:json_character_map_raw_output() { + (: Character map replacements bypass JSON escaping — raw output :) + let $params := map { + "method": "json", + "use-character-maps": map { "*": "" } + } + let $result := serialize(map { "text": "hello *world*" }, $params) + (: The should appear raw, not escaped :) + return contains($result, "") +}; + +declare + %test:assertEquals('"(c) 2024"') +function testSerialize:json_character_map_copyright() { + (: Map © to (c) in JSON output :) + let $params := map { + "method": "json", + "use-character-maps": map { "©": "(c)" } + } + return serialize("© 2024", $params) +}; + +declare + %test:assertEquals('(c) symbol') +function testSerialize:xml_character_map_element_text() { + (: XML character maps in element text :) + let $params := map { + "method": "xml", + "omit-xml-declaration": true(), + "use-character-maps": map { "©": "(c)" } + } + return serialize(© symbol, $params) +}; diff --git a/exist-core/src/test/xquery/xquery3/serialize.xql b/exist-core/src/test/xquery/xquery3/serialize.xql index bea438d425f..4ac541f0f16 100644 --- a/exist-core/src/test/xquery/xquery3/serialize.xql +++ b/exist-core/src/test/xquery/xquery3/serialize.xql @@ -847,7 +847,7 @@ function ser:serialize-xml-134() { }; declare - %test:assertEquals(' ') + %test:assertEquals('') function ser:serialize-html-5-boolean-attribute-names() {