diff --git a/exist-core/pom.xml b/exist-core/pom.xml
index c4163dbfc05..60b285c8878 100644
--- a/exist-core/pom.xml
+++ b/exist-core/pom.xml
@@ -1206,6 +1206,7 @@ The BaseX Team. The original license statement is also included below.]]>${project.build.testOutputDirectory}/log4j2.xml
+ 180
+
+
+ org.exist.storage.lock.DeadlockIT
+ org.exist.xmldb.RemoveCollectionIT
+
@{jacocoArgLine} --add-modules jdk.incubator.vector --enable-native-access=ALL-UNNAMED -Dfile.encoding=${project.build.sourceEncoding} -Dexist.recovery.progressbar.hide=true
${project.basedir}/../exist-jetty-config/target/classes/org/exist/jetty
diff --git a/exist-core/src/main/java/org/exist/storage/serializers/EXistOutputKeys.java b/exist-core/src/main/java/org/exist/storage/serializers/EXistOutputKeys.java
index ca85a06f5fe..7c727e6ab16 100644
--- a/exist-core/src/main/java/org/exist/storage/serializers/EXistOutputKeys.java
+++ b/exist-core/src/main/java/org/exist/storage/serializers/EXistOutputKeys.java
@@ -28,6 +28,11 @@ public class EXistOutputKeys {
*/
public static final String ITEM_SEPARATOR = "item-separator";
+ // --- QT4 Serialization 4.0 parameters ---
+ public static final String CANONICAL = "canonical";
+ public static final String ESCAPE_SOLIDUS = "escape-solidus";
+ public static final String JSON_LINES = "json-lines";
+
public static final String OMIT_ORIGINAL_XML_DECLARATION = "omit-original-xml-declaration";
public static final String OUTPUT_DOCTYPE = "output-doctype";
diff --git a/exist-core/src/main/java/org/exist/util/CharSlice.java b/exist-core/src/main/java/org/exist/util/CharSlice.java
index 8175eb76ab1..b0df423a3d3 100644
--- a/exist-core/src/main/java/org/exist/util/CharSlice.java
+++ b/exist-core/src/main/java/org/exist/util/CharSlice.java
@@ -198,6 +198,19 @@ public void copyTo(final char[] destination, final int destOffset) {
public void write(final Writer writer) throws java.io.IOException {
writer.write(array, offset, len);
}
+
+    /**
+     * Write a sub-range of this slice to a writer using a single bulk
+     * {@link Writer#write(char[], int, int)} call.
+     *
+     * The requested range is validated against the length of this slice before
+     * anything is written, so an out-of-range request fails fast instead of
+     * emitting characters of the backing array that lie outside the slice.
+     *
+     * @param writer the writer
+     * @param start the start index within this slice (inclusive)
+     * @param length the number of characters to write
+     * @throws IndexOutOfBoundsException if {@code start} or {@code length} is negative,
+     *         or if {@code start + length} exceeds the length of this slice
+     * @throws java.io.IOException if an error occurs whilst writing
+     */
+    public void write(final Writer writer, final int start, final int length) throws java.io.IOException {
+        // 'start' is relative to the slice, so check against 'len' (the slice length),
+        // not against the size of the shared backing array.
+        java.util.Objects.checkFromIndexSize(start, length, len);
+        writer.write(array, offset + start, length);
+    }
}
//
diff --git a/exist-core/src/main/java/org/exist/util/serializer/AbstractSerializer.java b/exist-core/src/main/java/org/exist/util/serializer/AbstractSerializer.java
index 758ccee130a..a1b7c9890b3 100644
--- a/exist-core/src/main/java/org/exist/util/serializer/AbstractSerializer.java
+++ b/exist-core/src/main/java/org/exist/util/serializer/AbstractSerializer.java
@@ -81,13 +81,27 @@ protected SerializerWriter getDefaultWriter() {
public void setOutput(Writer writer, Properties properties) {
outputProperties = Objects.requireNonNullElseGet(properties, () -> new Properties(defaultProperties));
final String method = outputProperties.getProperty(OutputKeys.METHOD, "xml");
- final String htmlVersionProp = outputProperties.getProperty(EXistOutputKeys.HTML_VERSION, "1.0");
-
+ // For html/xhtml methods, determine HTML version:
+ // 1. Use html-version if explicitly set
+ // 2. Otherwise use version (W3C spec: version controls HTML version for html method)
+ // 3. Default to 5.0
double htmlVersion;
- try {
- htmlVersion = Double.parseDouble(htmlVersionProp);
- } catch (NumberFormatException e) {
- htmlVersion = 1.0;
+ final String explicitHtmlVersion = outputProperties.getProperty(EXistOutputKeys.HTML_VERSION);
+ if (explicitHtmlVersion != null) {
+ try {
+ htmlVersion = Double.parseDouble(explicitHtmlVersion);
+ } catch (NumberFormatException e) {
+ htmlVersion = 5.0;
+ }
+ } else if (("html".equalsIgnoreCase(method) || "xhtml".equalsIgnoreCase(method))
+ && outputProperties.getProperty(OutputKeys.VERSION) != null) {
+ try {
+ htmlVersion = Double.parseDouble(outputProperties.getProperty(OutputKeys.VERSION));
+ } catch (NumberFormatException e) {
+ htmlVersion = 5.0;
+ }
+ } else {
+ htmlVersion = 5.0;
}
final SerializerWriter baseSerializerWriter = getBaseSerializerWriter(method, htmlVersion);
diff --git a/exist-core/src/main/java/org/exist/util/serializer/AdaptiveWriter.java b/exist-core/src/main/java/org/exist/util/serializer/AdaptiveWriter.java
index 22ab6dfca23..717ec83ab07 100644
--- a/exist-core/src/main/java/org/exist/util/serializer/AdaptiveWriter.java
+++ b/exist-core/src/main/java/org/exist/util/serializer/AdaptiveWriter.java
@@ -190,10 +190,15 @@ private void writeAtomic(AtomicValue value) throws IOException, SAXException, XP
}
+ /**
+ * Writes a double value as text.
+ *
+ * Infinite and NaN values are written using the item's own string value;
+ * every other value is formatted in exponent notation with a lower-case
+ * "e" as the exponent separator, using US decimal format symbols.
+ *
+ * @param item the double value to write
+ * @throws SAXException declared for the text output performed via writeText
+ */
private void writeDouble(final DoubleValue item) throws SAXException {
- final DecimalFormatSymbols symbols = DecimalFormatSymbols.getInstance(Locale.US);
- symbols.setExponentSeparator("e");
- final DecimalFormat df = new DecimalFormat("0.0##########################E0", symbols);
- writeText(df.format(item.getDouble()));
+ final double d = item.getDouble();
+ // Non-finite values bypass DecimalFormat - presumably because it would render
+ // them with the symbol set's infinity/NaN strings rather than the item's
+ // lexical form (NOTE(review): confirm the intended output).
+ if (Double.isInfinite(d) || Double.isNaN(d)) {
+ writeText(item.getStringValue());
+ } else {
+ // A new DecimalFormat is created on every call rather than shared.
+ final DecimalFormatSymbols symbols = DecimalFormatSymbols.getInstance(Locale.US);
+ symbols.setExponentSeparator("e");
+ final DecimalFormat df = new DecimalFormat("0.0##########################E0", symbols);
+ writeText(df.format(d));
+ }
}
private void writeArray(final ArrayType array) throws XPathException, SAXException, TransformerException {
@@ -215,9 +220,7 @@ private void writeArray(final ArrayType array) throws XPathException, SAXExcepti
private void writeMap(final AbstractMapType map) throws SAXException, XPathException, TransformerException {
try {
- writer.write("map");
- addSpaceIfIndent();
- writer.write('{');
+ writer.write("map{");
addIndent();
indent();
for (final Iterator> i = map.iterator(); i.hasNext(); ) {
diff --git a/exist-core/src/main/java/org/exist/util/serializer/HTML5Writer.java b/exist-core/src/main/java/org/exist/util/serializer/HTML5Writer.java
index 1dffc3029b7..da7aede709a 100644
--- a/exist-core/src/main/java/org/exist/util/serializer/HTML5Writer.java
+++ b/exist-core/src/main/java/org/exist/util/serializer/HTML5Writer.java
@@ -118,6 +118,13 @@ public class HTML5Writer extends XHTML5Writer {
BOOLEAN_ATTRIBUTE_NAMES.add("willValidate");
}
+ // Lower-cased copy of BOOLEAN_ATTRIBUTE_NAMES, filled once in the static
+ // initializer below. Names are folded with Locale.ROOT so the result does not
+ // depend on the JVM's default locale; isBooleanAttributeMatch folds the
+ // attribute name the same way before looking it up here.
+ private static final ObjectSet BOOLEAN_ATTRIBUTE_NAMES_LOWER = new ObjectOpenHashSet<>(BOOLEAN_ATTRIBUTE_NAMES.size());
+ static {
+ for (final String n : BOOLEAN_ATTRIBUTE_NAMES) {
+ BOOLEAN_ATTRIBUTE_NAMES_LOWER.add(n.toLowerCase(java.util.Locale.ROOT));
+ }
+ }
+
private static final ObjectSet EMPTY_TAGS = new ObjectOpenHashSet<>(31);
static {
EMPTY_TAGS.add("area");
@@ -156,8 +163,15 @@ public void endElement(QName qname) throws TransformerException {
if (!isEmptyTag(qname.getLocalPart())) {
super.endElement(qname);
} else {
+ // HTML5 omits the close tag for void elements; we still need to
+ // honor the meta-in-head dedup that XHTMLWriter sets up at startElement
+ // time. Capture the buffered-meta flag before closeStartTag flips state.
+ final boolean wasBufferedMeta = isBufferedMeta(qname.getLocalPart());
closeStartTag(true);
endIndent(qname.getNamespaceURI(), qname.getLocalPart());
+ if (wasBufferedMeta) {
+ endMetaBuffer();
+ }
}
}
@@ -166,24 +180,33 @@ public void endElement(String namespaceURI, String localName, String qname) thro
if (!isEmptyTag(localName)) {
super.endElement(namespaceURI, localName, qname);
} else {
+ final boolean wasBufferedMeta = isBufferedMeta(localName);
closeStartTag(true);
endIndent(namespaceURI, localName);
+ if (wasBufferedMeta) {
+ endMetaBuffer();
+ }
}
}
@Override
public void attribute(String qname, CharSequence value) throws TransformerException {
+ // Strip prefix for the meta-dedup redundancy check
+ final int colon = qname.indexOf(':');
+ final String localName = colon < 0 ? qname : qname.substring(colon + 1);
+ noteMetaAttribute(localName, value);
+ final CharSequence effectiveValue = maybeEscapeUriHtml5(localName, value);
try {
if(!tagIsOpen) {
- characters(value);
+ characters(effectiveValue);
return;
}
final Writer writer = getWriter();
writer.write(' ');
writer.write(qname);
- if (!(BOOLEAN_ATTRIBUTE_NAMES.contains(qname) && qname.contentEquals(value))) {
+ if (!isBooleanAttributeMatch(qname, effectiveValue)) {
writer.write("=\"");
- writeChars(value, true);
+ writeChars(effectiveValue, true);
writer.write('"');
}
} catch(final IOException ioe) {
@@ -193,9 +216,12 @@ public void attribute(String qname, CharSequence value) throws TransformerExcept
@Override
public void attribute(QName qname, CharSequence value) throws TransformerException {
+ noteMetaAttribute(qname.getLocalPart(), value);
+ final String localPart = qname.getLocalPart();
+ final CharSequence effectiveValue = maybeEscapeUriHtml5(localPart, value);
try {
if(!tagIsOpen) {
- characters(value);
+ characters(effectiveValue);
return;
// throw new TransformerException("Found an attribute outside an
// element");
@@ -206,11 +232,10 @@ public void attribute(QName qname, CharSequence value) throws TransformerExcepti
writer.write(qname.getPrefix());
writer.write(':');
}
- final String localPart = qname.getLocalPart();
writer.write(localPart);
- if (!(BOOLEAN_ATTRIBUTE_NAMES.contains(localPart) && localPart.contentEquals(value))) {
+ if (!isBooleanAttributeMatch(localPart, effectiveValue)) {
writer.write("=\"");
- writeChars(value, true);
+ writeChars(effectiveValue, true);
writer.write('"');
}
} catch(final IOException ioe) {
@@ -218,26 +243,76 @@ public void attribute(QName qname, CharSequence value) throws TransformerExcepti
}
}
+    /**
+     * Applies URI-attribute escaping for the HTML5 writer.
+     *
+     * Any prefix is stripped from {@link #currentTag} so that
+     * {@link XHTMLWriter#shouldEscapeUriAttribute(String, String)} is asked about
+     * the (element, attribute) pair by local names only.
+     *
+     * @param attrLocal the local name of the attribute being written
+     * @param value the attribute value
+     * @return the result of {@code escapeUriAttribute(value)} when the pair qualifies,
+     *         otherwise {@code value} itself (also when no element is current)
+     */
+    private CharSequence maybeEscapeUriHtml5(final String attrLocal, final CharSequence value) {
+        if (currentTag != null) {
+            final int colonPos = currentTag.indexOf(':');
+            final String elementLocal = colonPos >= 0 ? currentTag.substring(colonPos + 1) : currentTag;
+            if (shouldEscapeUriAttribute(elementLocal, attrLocal)) {
+                return escapeUriAttribute(value);
+            }
+        }
+        return value;
+    }
+
+
+ /**
+ * HTML5 boolean attribute minimization: emit just the bare name when the
+ * value is empty or matches the attribute name case-insensitively
+ * (per W3C XSLT/XQuery Serialization 3.1, section 7.2.2).
+ */
+ private static boolean isBooleanAttributeMatch(final String name, final CharSequence value) {
+ if (!BOOLEAN_ATTRIBUTE_NAMES_LOWER.contains(name.toLowerCase(java.util.Locale.ROOT))) {
+ return false;
+ }
+ if (value == null || value.length() == 0) {
+ return true;
+ }
+ return name.equalsIgnoreCase(value.toString());
+ }
+
@Override
public void namespace(String prefix, String nsURI) throws TransformerException {
- // no namespaces allowed in HTML5
+        // Declarations for the empty namespace and for the XHTML namespace are
+        // dropped: HTML5 elements never carry an explicit xmlns since the parser
+        // puts them in the HTML namespace implicitly. Everything else (foreign
+        // content such as SVG, MathML or custom XML) keeps its declaration so the
+        // receiver can re-parse it as XML.
+        final boolean implicitHtmlNamespace = nsURI == null
+                || nsURI.isEmpty()
+                || org.exist.Namespaces.XHTML_NS.equals(nsURI);
+        if (!implicitHtmlNamespace) {
+            super.namespace(prefix, nsURI);
+        }
}
@Override
protected void closeStartTag(boolean isEmpty) throws TransformerException {
try {
if (tagIsOpen) {
+ final Writer w = getWriter();
if (isEmpty) {
if (isEmptyTag(currentTag)) {
- getWriter().write(">");
+ w.write('>');
+ } else if (isForeignContent()) {
+ // Foreign content (SVG, MathML, custom XML namespace)
+ // embedded in HTML5 is serialized with XML self-close
+ // syntax so the receiver can re-parse it as XML.
+ w.write("/>");
} else {
- getWriter().write('>');
- getWriter().write("");
- getWriter().write(currentTag);
- getWriter().write('>');
+ // Coalesce ">", "", tag, ">" into 2 writer calls instead of 4
+ w.write(">");
+ w.write(currentTag);
+ w.write('>');
}
} else {
- getWriter().write('>');
+ w.write('>');
}
tagIsOpen = false;
}
@@ -246,6 +321,39 @@ protected void closeStartTag(boolean isEmpty) throws TransformerException {
}
}
+    /**
+     * Tells whether the current element is "foreign content": an element whose
+     * namespace URI, as reported by {@code currentElementNamespaceURI()}, is
+     * non-null, non-empty and different from the XHTML namespace. Foreign
+     * content is what triggers XML-style self-closing in
+     * {@code closeStartTag}.
+     *
+     * @return {@code true} if the current element is in a non-HTML namespace
+     */
+    private boolean isForeignContent() {
+        final String ns = currentElementNamespaceURI();
+        if (ns == null || ns.isEmpty()) {
+            return false;
+        }
+        return !org.exist.Namespaces.XHTML_NS.equals(ns);
+    }
+
+
+ @Override
+ public void processingInstruction(final String target, final String data) throws TransformerException {
+ // QT4 PR2372: HTML5 has no PI syntax, so the serializer renders
+ // processing instructions as comments of the form ``,
+ // matching the HTML5 parser's coercion of `...?>` content.
+ try {
+ if (tagIsOpen) {
+ closeStartTag(false);
+ }
+ final Writer writer = getWriter();
+ writer.write("");
+ } catch (final IOException e) {
+ throw new TransformerException(e.getMessage(), e);
+ }
+ }
+
@Override
protected boolean needsEscape(char ch) {
if (RAW_TEXT_ELEMENTS.contains(currentTag)) {
@@ -253,4 +361,28 @@ protected boolean needsEscape(char ch) {
}
return super.needsEscape(ch);
}
+
+    @Override
+    protected boolean needsEscape(final char ch, final boolean inAttribute) {
+        // Attribute values are always subject to escaping, even on raw text
+        // elements; answering here bypasses the 1-arg override, which returns
+        // false for all script/style content.
+        if (inAttribute) {
+            return true;
+        }
+        // TEXT content of raw text elements (script, style) is never escaped.
+        if (RAW_TEXT_ELEMENTS.contains(currentTag)) {
+            return false;
+        }
+        return super.needsEscape(ch, inAttribute);
+    }
+
+
+    @Override
+    protected boolean needsEscaping(final boolean inAttribute) {
+        // Same rule as the per-character check: attribute values may always need
+        // escaping, whereas TEXT content inside script/style is raw text and
+        // never does. Answering false for that case lets writeChars()
+        // bulk-stream the entire block in one Writer.write() call.
+        if (inAttribute) {
+            return true;
+        }
+        return !RAW_TEXT_ELEMENTS.contains(currentTag);
+    }
+
}
diff --git a/exist-core/src/main/java/org/exist/util/serializer/IndentingXMLWriter.java b/exist-core/src/main/java/org/exist/util/serializer/IndentingXMLWriter.java
index c336d8b2943..99df54c3e19 100644
--- a/exist-core/src/main/java/org/exist/util/serializer/IndentingXMLWriter.java
+++ b/exist-core/src/main/java/org/exist/util/serializer/IndentingXMLWriter.java
@@ -25,7 +25,9 @@
import java.io.Writer;
import java.util.ArrayDeque;
import java.util.Deque;
+import java.util.HashSet;
import java.util.Properties;
+import java.util.Set;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.TransformerException;
@@ -48,6 +50,8 @@ public class IndentingXMLWriter extends XMLWriter {
private boolean sameline = false;
private boolean whitespacePreserve = false;
private final Deque whitespacePreserveStack = new ArrayDeque<>();
+ private Set suppressIndentation = null;
+ private int suppressIndentDepth = 0;
public IndentingXMLWriter() {
super();
@@ -75,6 +79,9 @@ public void startElement(final String namespaceURI, final String localName, fina
indent();
}
super.startElement(namespaceURI, localName, qname);
+ if (isSuppressIndentation(localName)) {
+ suppressIndentDepth++;
+ }
addIndent();
afterTag = true;
sameline = true;
@@ -86,6 +93,9 @@ public void startElement(final QName qname) throws TransformerException {
indent();
}
super.startElement(qname);
+ if (isSuppressIndentation(qname.getLocalPart())) {
+ suppressIndentDepth++;
+ }
addIndent();
afterTag = true;
sameline = true;
@@ -95,6 +105,9 @@ public void startElement(final QName qname) throws TransformerException {
public void endElement(final String namespaceURI, final String localName, final String qname) throws TransformerException {
endIndent(namespaceURI, localName);
super.endElement(namespaceURI, localName, qname);
+ if (isSuppressIndentation(localName) && suppressIndentDepth > 0) {
+ suppressIndentDepth--;
+ }
popWhitespacePreserve(); // apply ancestor's xml:space value _after_ end element
sameline = isInlineTag(namespaceURI, localName);
afterTag = true;
@@ -104,6 +117,9 @@ public void endElement(final String namespaceURI, final String localName, final
public void endElement(final QName qname) throws TransformerException {
endIndent(qname.getNamespaceURI(), qname.getLocalPart());
super.endElement(qname);
+ if (isSuppressIndentation(qname.getLocalPart()) && suppressIndentDepth > 0) {
+ suppressIndentDepth--;
+ }
popWhitespacePreserve(); // apply ancestor's xml:space value _after_ end element
sameline = isInlineTag(qname.getNamespaceURI(), qname.getLocalPart());
afterTag = true;
@@ -164,7 +180,29 @@ public void setOutputProperties(final Properties properties) {
} catch (final NumberFormatException e) {
LOG.warn("Invalid indentation value: '{}'", option);
}
- indent = "yes".equals(outputProperties.getProperty(OutputKeys.INDENT, "no"));
+ final String indentValue = outputProperties.getProperty(OutputKeys.INDENT, "no").trim();
+ indent = "yes".equals(indentValue) || "true".equals(indentValue) || "1".equals(indentValue);
+ final String suppressProp = outputProperties.getProperty("suppress-indentation");
+ if (suppressProp != null && !suppressProp.isEmpty()) {
+ suppressIndentation = new HashSet<>();
+ for (final String name : suppressProp.split("\\s+")) {
+ if (!name.isEmpty()) {
+ // Handle URI-qualified names: Q{ns}local or {ns}local → extract local part
+ if (name.startsWith("Q{") || name.startsWith("{")) {
+ final int closeBrace = name.indexOf('}');
+ if (closeBrace > 0 && closeBrace < name.length() - 1) {
+ suppressIndentation.add(name.substring(closeBrace + 1));
+ } else {
+ suppressIndentation.add(name);
+ }
+ } else {
+ suppressIndentation.add(name);
+ }
+ }
+ }
+ } else {
+ suppressIndentation = null;
+ }
}
@Override
@@ -220,8 +258,12 @@ protected void addSpaceIfIndent() throws IOException {
writer.write(' ');
}
+    /**
+     * Checks whether the given element local name was listed in the
+     * suppress-indentation set built by {@code setOutputProperties}.
+     *
+     * @param localName the local name of the element
+     * @return {@code false} when no set is configured, otherwise whether the
+     *         set contains {@code localName}
+     */
+    private boolean isSuppressIndentation(final String localName) {
+        if (suppressIndentation == null) {
+            return false;
+        }
+        return suppressIndentation.contains(localName);
+    }
+
protected void indent() throws TransformerException {
- if (!indent || whitespacePreserve) {
+ if (!indent || whitespacePreserve || suppressIndentDepth > 0) {
return;
}
final int spaces = indentAmount * level;
diff --git a/exist-core/src/main/java/org/exist/util/serializer/TEXTWriter.java b/exist-core/src/main/java/org/exist/util/serializer/TEXTWriter.java
index 85c5c4cf5a6..5ea206a25da 100644
--- a/exist-core/src/main/java/org/exist/util/serializer/TEXTWriter.java
+++ b/exist-core/src/main/java/org/exist/util/serializer/TEXTWriter.java
@@ -206,16 +206,10 @@ protected void writeDoctype(final String rootElement) throws TransformerExceptio
@Override
protected void writeChars(final CharSequence s, final boolean inAttribute) throws IOException {
- final int len = s.length();
- writeCharSeq(s, 0, len);
+ // The whole sequence is handed to writeCharSeq; inAttribute is not consulted here.
+ // NOTE(review): writeCharSeq is no longer declared in this class after this
+ // change - presumably it is inherited from the superclass; confirm.
+ writeCharSeq(s, 0, s.length());
}
-
- private void writeCharSeq(final CharSequence ch, final int start, final int end) throws IOException {
- for (int i = start; i < end; i++) {
- writer.write(ch.charAt(i));
- }
- }
-
+
+
@Override
protected void writeCharacterReference(final char charval) throws IOException {
int o = 0;
diff --git a/exist-core/src/main/java/org/exist/util/serializer/XHTML5Writer.java b/exist-core/src/main/java/org/exist/util/serializer/XHTML5Writer.java
index e89e7119d19..bc4990eb5eb 100644
--- a/exist-core/src/main/java/org/exist/util/serializer/XHTML5Writer.java
+++ b/exist-core/src/main/java/org/exist/util/serializer/XHTML5Writer.java
@@ -22,7 +22,6 @@
package org.exist.util.serializer;
import java.io.Writer;
-import javax.xml.transform.TransformerException;
import it.unimi.dsi.fastutil.objects.ObjectOpenHashSet;
import it.unimi.dsi.fastutil.objects.ObjectSet;
@@ -121,14 +120,4 @@ public XHTML5Writer(ObjectSet emptyTags, ObjectSet inlineTags) {
public XHTML5Writer(Writer writer, ObjectSet emptyTags, ObjectSet inlineTags) {
super(writer, emptyTags, inlineTags);
}
-
- @Override
- protected void writeDoctype(String rootElement) throws TransformerException {
- if (doctypeWritten) {
- return;
- }
-
- documentType("html", null, null);
- doctypeWritten = true;
- }
}
diff --git a/exist-core/src/main/java/org/exist/util/serializer/XHTMLWriter.java b/exist-core/src/main/java/org/exist/util/serializer/XHTMLWriter.java
index b0006f7f51c..d01a062fde4 100644
--- a/exist-core/src/main/java/org/exist/util/serializer/XHTMLWriter.java
+++ b/exist-core/src/main/java/org/exist/util/serializer/XHTMLWriter.java
@@ -23,6 +23,7 @@
import java.io.IOException;
import java.io.Writer;
+import javax.xml.transform.OutputKeys;
import javax.xml.transform.TransformerException;
import it.unimi.dsi.fastutil.objects.ObjectOpenHashSet;
@@ -36,12 +37,85 @@
*/
public class XHTMLWriter extends IndentingXMLWriter {
+ /**
+ * HTML boolean attributes per HTML 4.01 and HTML5 spec.
+ * When method="html" and the attribute value equals the attribute name
+ * (case-insensitive), the attribute is minimized to just the name.
+ *
+ * All names are stored in lower case, so a caller that wants a
+ * case-insensitive lookup has to lower-case the attribute name first.
+ */
+ protected static final ObjectSet BOOLEAN_ATTRIBUTES = new ObjectOpenHashSet<>(31);
+ static {
+ BOOLEAN_ATTRIBUTES.add("checked");
+ BOOLEAN_ATTRIBUTES.add("compact");
+ BOOLEAN_ATTRIBUTES.add("declare");
+ BOOLEAN_ATTRIBUTES.add("defer");
+ BOOLEAN_ATTRIBUTES.add("disabled");
+ BOOLEAN_ATTRIBUTES.add("ismap");
+ BOOLEAN_ATTRIBUTES.add("multiple");
+ BOOLEAN_ATTRIBUTES.add("nohref");
+ BOOLEAN_ATTRIBUTES.add("noresize");
+ BOOLEAN_ATTRIBUTES.add("noshade");
+ BOOLEAN_ATTRIBUTES.add("nowrap");
+ BOOLEAN_ATTRIBUTES.add("readonly");
+ BOOLEAN_ATTRIBUTES.add("selected");
+ }
+
+    /**
+     * URI-valued attributes that must be %-escaped when escape-uri-attributes=yes
+     * (default for HTML/XHTML output methods, per W3C XSLT and XQuery
+     * Serialization 3.1 § 7.2.5). Keys are element local name + "/" + attribute
+     * local name, both lowercase.
+     *
+     * NOTE(review): this comment previously described a synthetic wildcard key
+     * "{@literal *}/href" that matches any element bearing an href attribute.
+     * No such key is added by the initializer below - only the a, area, base
+     * and link href keys are registered - so confirm whether the lookup code
+     * handles the wildcard itself or whether that description was stale.
+     * The asterisk is written with {@literal} on purpose: a literal asterisk
+     * directly followed by a slash would terminate this comment.
+     */
+    private static final ObjectSet URI_VALUED_ATTRIBUTES = new ObjectOpenHashSet<>(48);
+    static {
+        URI_VALUED_ATTRIBUTES.add("a/href");
+        URI_VALUED_ATTRIBUTES.add("a/name");
+        URI_VALUED_ATTRIBUTES.add("applet/codebase");
+        URI_VALUED_ATTRIBUTES.add("area/href");
+        URI_VALUED_ATTRIBUTES.add("base/href");
+        URI_VALUED_ATTRIBUTES.add("blockquote/cite");
+        URI_VALUED_ATTRIBUTES.add("body/background");
+        URI_VALUED_ATTRIBUTES.add("button/formaction");
+        URI_VALUED_ATTRIBUTES.add("del/cite");
+        URI_VALUED_ATTRIBUTES.add("form/action");
+        URI_VALUED_ATTRIBUTES.add("frame/longdesc");
+        URI_VALUED_ATTRIBUTES.add("frame/src");
+        URI_VALUED_ATTRIBUTES.add("head/profile");
+        URI_VALUED_ATTRIBUTES.add("html/manifest");
+        URI_VALUED_ATTRIBUTES.add("iframe/longdesc");
+        URI_VALUED_ATTRIBUTES.add("iframe/src");
+        URI_VALUED_ATTRIBUTES.add("img/longdesc");
+        URI_VALUED_ATTRIBUTES.add("img/src");
+        URI_VALUED_ATTRIBUTES.add("img/usemap");
+        URI_VALUED_ATTRIBUTES.add("input/formaction");
+        URI_VALUED_ATTRIBUTES.add("input/src");
+        URI_VALUED_ATTRIBUTES.add("input/usemap");
+        URI_VALUED_ATTRIBUTES.add("ins/cite");
+        URI_VALUED_ATTRIBUTES.add("link/href");
+        URI_VALUED_ATTRIBUTES.add("object/archive");
+        URI_VALUED_ATTRIBUTES.add("object/classid");
+        URI_VALUED_ATTRIBUTES.add("object/codebase");
+        URI_VALUED_ATTRIBUTES.add("object/data");
+        URI_VALUED_ATTRIBUTES.add("object/usemap");
+        URI_VALUED_ATTRIBUTES.add("q/cite");
+        URI_VALUED_ATTRIBUTES.add("script/src");
+        URI_VALUED_ATTRIBUTES.add("source/src");
+        URI_VALUED_ATTRIBUTES.add("track/src");
+        URI_VALUED_ATTRIBUTES.add("video/poster");
+        URI_VALUED_ATTRIBUTES.add("video/src");
+        URI_VALUED_ATTRIBUTES.add("audio/src");
+    }
+
+ private static final char[] HEX = "0123456789ABCDEF".toCharArray();
+
protected static final ObjectSet EMPTY_TAGS = new ObjectOpenHashSet<>(31);
static {
EMPTY_TAGS.add("area");
EMPTY_TAGS.add("base");
EMPTY_TAGS.add("br");
EMPTY_TAGS.add("col");
+ EMPTY_TAGS.add("embed");
EMPTY_TAGS.add("hr");
EMPTY_TAGS.add("img");
EMPTY_TAGS.add("input");
@@ -52,48 +126,71 @@ public class XHTMLWriter extends IndentingXMLWriter {
EMPTY_TAGS.add("isindex");
EMPTY_TAGS.add("param");
}
-
+
protected static final ObjectSet INLINE_TAGS = new ObjectOpenHashSet<>(31);
-
static {
- INLINE_TAGS.add("a");
- INLINE_TAGS.add("abbr");
- INLINE_TAGS.add("acronym");
- INLINE_TAGS.add("b");
- INLINE_TAGS.add("bdo");
- INLINE_TAGS.add("big");
- INLINE_TAGS.add("br");
- INLINE_TAGS.add("button");
- INLINE_TAGS.add("cite");
- INLINE_TAGS.add("code");
- INLINE_TAGS.add("del");
- INLINE_TAGS.add("dfn");
- INLINE_TAGS.add("em");
- INLINE_TAGS.add("i");
- INLINE_TAGS.add("img");
- INLINE_TAGS.add("input");
- INLINE_TAGS.add("kbd");
- INLINE_TAGS.add("label");
- INLINE_TAGS.add("q");
- INLINE_TAGS.add("samp");
- INLINE_TAGS.add("select");
- INLINE_TAGS.add("small");
- INLINE_TAGS.add("span");
- INLINE_TAGS.add("strong");
- INLINE_TAGS.add("sub");
- INLINE_TAGS.add("sup");
- INLINE_TAGS.add("textarea");
- INLINE_TAGS.add("tt");
- INLINE_TAGS.add("var");
- }
-
- protected String currentTag;
+ INLINE_TAGS.add("a");
+ INLINE_TAGS.add("abbr");
+ INLINE_TAGS.add("acronym");
+ INLINE_TAGS.add("b");
+ INLINE_TAGS.add("bdo");
+ INLINE_TAGS.add("big");
+ INLINE_TAGS.add("br");
+ INLINE_TAGS.add("button");
+ INLINE_TAGS.add("cite");
+ INLINE_TAGS.add("code");
+ INLINE_TAGS.add("del");
+ INLINE_TAGS.add("dfn");
+ INLINE_TAGS.add("em");
+ INLINE_TAGS.add("i");
+ INLINE_TAGS.add("img");
+ INLINE_TAGS.add("input");
+ INLINE_TAGS.add("kbd");
+ INLINE_TAGS.add("label");
+ INLINE_TAGS.add("q");
+ INLINE_TAGS.add("samp");
+ INLINE_TAGS.add("select");
+ INLINE_TAGS.add("small");
+ INLINE_TAGS.add("span");
+ INLINE_TAGS.add("strong");
+ INLINE_TAGS.add("sub");
+ INLINE_TAGS.add("sup");
+ INLINE_TAGS.add("textarea");
+ INLINE_TAGS.add("tt");
+ INLINE_TAGS.add("var");
+ }
+
+ private static final String SVG_NS = "http://www.w3.org/2000/svg";
+ private static final String MATHML_NS = "http://www.w3.org/1998/Math/MathML";
+
+ private static final ObjectSet RAW_TEXT_ELEMENTS_HTML = new ObjectOpenHashSet<>(4);
+ static {
+ RAW_TEXT_ELEMENTS_HTML.add("script");
+ RAW_TEXT_ELEMENTS_HTML.add("style");
+ }
protected final ObjectSet emptyTags;
protected final ObjectSet inlineTags;
+ protected String currentTag;
+ protected boolean inHead = false;
+ protected boolean contentTypeMetaWritten = false;
+
+ // Meta-tag dedup state: when a `` element is encountered inside
+ // `` AFTER the auto-generated content-type meta has been emitted,
+ // its bytes are diverted to {@link #metaScratch}. If, while buffering,
+ // we observe a {@code charset} or {@code http-equiv="Content-Type"}
+ // attribute, the buffered meta is dropped (the auto-meta replaces it);
+ // otherwise the buffer is flushed verbatim at endElement time.
+ private Writer metaSuspendedWriter = null;
+ private java.io.StringWriter metaScratch = null;
+ private boolean metaIsContentTypeOrCharset = false;
+
+ boolean haveCollapsedXhtmlPrefix = false;
+ private String collapsedForeignNs = null; // SVG or MathML ns being normalized
+
/**
- *
+ *
*/
public XHTMLWriter() {
this(EMPTY_TAGS, INLINE_TAGS);
@@ -120,97 +217,311 @@ public XHTMLWriter(final Writer writer, ObjectSet emptyTags, ObjectSet> 6));
+ appendHexByte(sb, 0x80 | (codepoint & 0x3F));
+ } else if (codepoint < 0x10000) {
+ appendHexByte(sb, 0xE0 | (codepoint >> 12));
+ appendHexByte(sb, 0x80 | ((codepoint >> 6) & 0x3F));
+ appendHexByte(sb, 0x80 | (codepoint & 0x3F));
+ } else {
+ appendHexByte(sb, 0xF0 | (codepoint >> 18));
+ appendHexByte(sb, 0x80 | ((codepoint >> 12) & 0x3F));
+ appendHexByte(sb, 0x80 | ((codepoint >> 6) & 0x3F));
+ appendHexByte(sb, 0x80 | (codepoint & 0x3F));
+ }
+ }
+
+ private static void appendHexByte(final StringBuilder sb, final int b) {
+ sb.append('%');
+ sb.append(HEX[(b >> 4) & 0xF]);
+ sb.append(HEX[b & 0xF]);
+ }
+
+    /**
+     * Clears all per-serialization state held by this writer, in addition to
+     * whatever the superclass resets: the current tag, the head and
+     * content-type-meta flags, the meta-tag dedup buffer and the
+     * prefix-collapsing markers.
+     */
+    @Override
+    protected void resetObjectState() {
+        // If the reset arrives while a meta element is still being buffered
+        // (e.g. serialization was aborted mid-element), 'writer' still points at
+        // the scratch buffer. Put the real writer back before dropping the
+        // reference to it, otherwise later output would be swallowed.
+        if (metaSuspendedWriter != null) {
+            writer = metaSuspendedWriter;
+        }
+        super.resetObjectState();
+        currentTag = null;
+        inHead = false;
+        contentTypeMetaWritten = false;
+        metaSuspendedWriter = null;
+        metaScratch = null;
+        metaIsContentTypeOrCharset = false;
+        // These two are normally cleared by endElement; clear them here as well
+        // so an interrupted element cannot leak into the next use of the writer.
+        haveCollapsedXhtmlPrefix = false;
+        collapsedForeignNs = null;
+    }
+
+
+    /**
+     * Decides whether output for the element about to start should be diverted
+     * to the scratch buffer: it must be a {@code meta} element (name compared
+     * ignoring case) inside {@code head}, the content-type meta must already
+     * have been written, and no other meta may currently be buffered.
+     */
+    private boolean shouldBufferDuplicateMeta(final String localName) {
+        if (!inHead || !contentTypeMetaWritten) {
+            return false;
+        }
+        if (metaSuspendedWriter != null) {
+            return false;
+        }
+        return "meta".equalsIgnoreCase(localName);
+    }
+
+    /**
+     * Tells whether output is currently being diverted for a candidate-duplicate
+     * meta element and {@code localName} names a {@code meta} element
+     * (compared ignoring case).
+     */
+    protected boolean isBufferedMeta(final String localName) {
+        if (metaSuspendedWriter == null) {
+            return false;
+        }
+        return "meta".equalsIgnoreCase(localName);
+    }
+
+ private void beginMetaBuffer() {
+ metaSuspendedWriter = writer;
+ metaScratch = new java.io.StringWriter();
+ writer = metaScratch;
+ metaIsContentTypeOrCharset = false;
+ }
+
+    /**
+     * Stops diverting output. The suspended writer is reinstated and the buffer
+     * state cleared; the buffered text is then written to the reinstated writer
+     * unless a charset or http-equiv="Content-Type" attribute was noted while
+     * buffering, in which case it is discarded. Does nothing when no buffering
+     * is in progress.
+     *
+     * @throws TransformerException if writing the buffered text fails
+     */
+    protected void endMetaBuffer() throws TransformerException {
+        final Writer suspended = metaSuspendedWriter;
+        if (suspended == null) {
+            return;
+        }
+        final String pending = metaScratch.toString();
+        final boolean keep = !metaIsContentTypeOrCharset;
+
+        // Leave buffering mode completely before touching the real writer.
+        writer = suspended;
+        metaSuspendedWriter = null;
+        metaScratch = null;
+        metaIsContentTypeOrCharset = false;
+
+        if (keep) {
+            try {
+                writer.write(pending);
+            } catch (final IOException ioe) {
+                throw new TransformerException(ioe.getMessage(), ioe);
+            }
+        }
+    }
+
+ protected void noteMetaAttribute(final String localName, final CharSequence value) {
+ if (metaSuspendedWriter == null) {
+ return;
+ }
+ if ("charset".equalsIgnoreCase(localName)) {
+ metaIsContentTypeOrCharset = true;
+ } else if ("http-equiv".equalsIgnoreCase(localName)
+ && value != null && "Content-Type".equalsIgnoreCase(value.toString())) {
+ metaIsContentTypeOrCharset = true;
+ }
+ }
+
protected boolean isEmptyTag(final String tag) {
return emptyTags.contains(tag);
}
- boolean haveCollapsedXhtmlPrefix = false;
-
@Override
public void startElement(final QName qname) throws TransformerException {
-
+
final QName xhtmlQName = removeXhtmlPrefix(qname);
-
+
+ if (shouldBufferDuplicateMeta(xhtmlQName.getLocalPart())) {
+ beginMetaBuffer();
+ }
super.startElement(xhtmlQName);
currentTag = xhtmlQName.getStringValue();
+ if ("head".equalsIgnoreCase(xhtmlQName.getLocalPart())) {
+ inHead = true;
+ writeContentTypeMeta();
+ }
}
-
+
@Override
public void endElement(final QName qname) throws TransformerException {
final QName xhtmlQName = removeXhtmlPrefix(qname);
-
+ final boolean isMetaInHead = metaSuspendedWriter != null
+ && "meta".equalsIgnoreCase(xhtmlQName.getLocalPart());
+ if (inHead && "head".equalsIgnoreCase(xhtmlQName.getLocalPart())) {
+ inHead = false;
+ }
+
super.endElement(xhtmlQName);
-
+
+ if (isMetaInHead) {
+ endMetaBuffer();
+ }
+
haveCollapsedXhtmlPrefix = false;
+ collapsedForeignNs = null;
}
-
+
protected QName removeXhtmlPrefix(final QName qname) {
final String prefix = qname.getPrefix();
final String namespaceURI = qname.getNamespaceURI();
- if(prefix != null && !prefix.isEmpty() && namespaceURI != null && namespaceURI.equals(Namespaces.XHTML_NS)) {
- haveCollapsedXhtmlPrefix = true;
- return new QName(qname.getLocalPart(), namespaceURI);
+ if (prefix != null && !prefix.isEmpty() && namespaceURI != null) {
+ if (namespaceURI.equals(Namespaces.XHTML_NS)) {
+ haveCollapsedXhtmlPrefix = true;
+ return new QName(qname.getLocalPart(), namespaceURI);
+ }
+ // XHTML5: normalize SVG and MathML prefixes to default namespace
+ if (isHtml5Version() && (namespaceURI.equals(SVG_NS) || namespaceURI.equals(MATHML_NS))) {
+ collapsedForeignNs = namespaceURI;
+ return new QName(qname.getLocalPart(), namespaceURI);
+ }
}
-
return qname;
}
@Override
public void startElement(final String namespaceURI, final String localName, final String qname) throws TransformerException {
-
+
final String xhtmlQName = removeXhtmlPrefix(namespaceURI, qname);
-
+
+ if (shouldBufferDuplicateMeta(localName)) {
+ beginMetaBuffer();
+ }
super.startElement(namespaceURI, localName, xhtmlQName);
currentTag = xhtmlQName;
+ if ("head".equalsIgnoreCase(localName)) {
+ inHead = true;
+ writeContentTypeMeta();
+ }
}
-
+
@Override
public void endElement(final String namespaceURI, final String localName, final String qname) throws TransformerException {
-
+ final boolean isMetaInHead = metaSuspendedWriter != null
+ && "meta".equalsIgnoreCase(localName);
+ if (inHead && "head".equalsIgnoreCase(localName)) {
+ inHead = false;
+ }
+
final String xhtmlQName = removeXhtmlPrefix(namespaceURI, qname);
-
+
super.endElement(namespaceURI, localName, xhtmlQName);
-
+
+ if (isMetaInHead) {
+ endMetaBuffer();
+ }
+
haveCollapsedXhtmlPrefix = false;
+ collapsedForeignNs = null;
}
-
+
protected String removeXhtmlPrefix(final String namespaceURI, final String qname) {
-
final int pos = qname.indexOf(':');
- if(pos > 0 && namespaceURI != null && namespaceURI.equals(Namespaces.XHTML_NS)) {
- haveCollapsedXhtmlPrefix = true;
- return qname.substring(pos+1);
-
+ if (pos > 0 && namespaceURI != null) {
+ if (namespaceURI.equals(Namespaces.XHTML_NS)) {
+ haveCollapsedXhtmlPrefix = true;
+ return qname.substring(pos + 1);
+ }
+ // XHTML5: normalize SVG and MathML prefixes
+ if (isHtml5Version() && (namespaceURI.equals(SVG_NS) || namespaceURI.equals(MATHML_NS))) {
+ collapsedForeignNs = namespaceURI;
+ return qname.substring(pos + 1);
+ }
}
-
return qname;
}
@Override
public void namespace(final String prefix, final String nsURI) throws TransformerException {
- if(haveCollapsedXhtmlPrefix && prefix != null && !prefix.isEmpty() && nsURI.equals(Namespaces.XHTML_NS)) {
- return; //dont output the xmlns:prefix for the collapsed nodes prefix
+ if (haveCollapsedXhtmlPrefix && prefix != null && !prefix.isEmpty() && nsURI.equals(Namespaces.XHTML_NS)) {
+ return; // don't output the xmlns:prefix for the collapsed node's prefix
+ }
+ // When a foreign namespace prefix was collapsed, replace the prefixed
+ // declaration with a default namespace declaration
+ if (collapsedForeignNs != null && prefix != null && !prefix.isEmpty()
+ && nsURI.equals(collapsedForeignNs)) {
+ super.namespace("", nsURI); // emit xmlns="..." instead of xmlns:prefix="..."
+ return;
}
-
super.namespace(prefix, nsURI);
}
-
-
+
+
@Override
protected void closeStartTag(final boolean isEmpty) throws TransformerException {
try {
if (tagIsOpen) {
+ // Flush canonical buffers (sorted namespaces + attributes) if active
+ if (isCanonical()) {
+ flushCanonicalBuffersXhtml();
+ }
+ final Writer w = getWriter();
if (isEmpty) {
- if (isEmptyTag(currentTag)) {
- getWriter().write(" />");
+ if (isCanonical()) {
+ // Canonical: always expand empty elements; ">" and "</" are merged into one write (3 writer calls instead of 4)
+ w.write("></");
+ w.write(currentTag);
+ w.write('>');
+ } else if (isEmptyTag(currentTag)) {
+ // For method="html", use HTML-style void tags (<br>)
+ // For method="xhtml", use XHTML-style (<br />)
+ if (isHtmlMethod()) {
+ w.write('>');
+ } else {
+ w.write(" />");
+ }
} else {
- getWriter().write('>');
- getWriter().write("</");
- getWriter().write(currentTag);
- getWriter().write('>');
+ // Coalesce ">" and "</" into a single write: 3 writer calls instead of 4
+ w.write("></");
+ w.write(currentTag);
+ w.write('>');
}
} else {
- getWriter().write('>');
+ w.write('>');
}
tagIsOpen = false;
}
@@ -218,10 +529,282 @@ protected void closeStartTag(final boolean isEmpty) throws TransformerException
throw new TransformerException(ioe.getMessage(), ioe);
}
}
-
+
+ /**
+ * Returns true if the output method is "html" (not "xhtml").
+ * HTML uses void element syntax ({@code <br>}) while XHTML uses self-closing ({@code <br />}).
+ *
+ * @return {@code true} when the {@code method} output property equals "html" ignoring case;
+ * {@code false} otherwise, including when no output properties are set
+ */
+ protected boolean isHtmlMethod() {
+ if (outputProperties != null) {
+ final String method = outputProperties.getProperty(OutputKeys.METHOD);
+ return "html".equalsIgnoreCase(method);
+ }
+ return false;
+ }
+
+ /**
+ * Returns true if the HTML version is 5.0 or higher.
+ * Checks html-version first, then falls back to version (per W3C spec for html method).
+ */
+ protected boolean isHtml5Version() {
+ if (outputProperties == null) {
+ return true; // default to HTML5
+ }
+ final String htmlVersion = outputProperties.getProperty(org.exist.storage.serializers.EXistOutputKeys.HTML_VERSION);
+ if (htmlVersion != null) {
+ try {
+ return Double.parseDouble(htmlVersion) >= 5.0;
+ } catch (final NumberFormatException e) {
+ // fall through
+ }
+ }
+ final String version = outputProperties.getProperty(OutputKeys.VERSION);
+ if (version != null) {
+ try {
+ return Double.parseDouble(version) >= 5.0;
+ } catch (final NumberFormatException e) {
+ // ignore
+ }
+ }
+ return true; // default to HTML5
+ }
+
+ /**
+ * DOCTYPE emission for XHTML/HTML output methods, per
+ * W3C XSLT and XQuery Serialization 3.1 sections 7.1 and 7.2.
+ *
+ * <ul>
+ * <li>doctype-system set: emit DOCTYPE with PUBLIC/SYSTEM ids</li>
+ * <li>doctype-system absent, html method, doctype-public set: emit DOCTYPE PUBLIC</li>
+ * <li>doctype-system absent, html-version ≥ 5: emit {@code <!DOCTYPE html>}</li>
+ * <li>otherwise: no DOCTYPE</li>
+ * </ul>
+ *
+ * Only emitted when the root element is {@code html} (case-insensitive); for
+ * fragments rooted on any other element the DOCTYPE is suppressed.
+ * Canonical output never carries a DOCTYPE. In every case the
+ * {@code doctypeWritten} flag is set, so the decision is made only once.
+ */
+ @Override
+ protected void writeDoctype(final String rootElement) throws TransformerException {
+ if (doctypeWritten) {
+ return;
+ }
+ if (isCanonical() || !isHtmlRoot(rootElement)) {
+ doctypeWritten = true;
+ return;
+ }
+ emitHtmlDoctype();
+ doctypeWritten = true;
+ }
+
+ private static boolean isHtmlRoot(final String rootElement) {
+ final int colon = rootElement.indexOf(':');
+ final String localName = colon < 0 ? rootElement : rootElement.substring(colon + 1);
+ return "html".equalsIgnoreCase(localName);
+ }
+
+ private String getDoctypeProperty(final String key) {
+ return outputProperties != null ? outputProperties.getProperty(key) : null;
+ }
+
+ private void emitHtmlDoctype() throws TransformerException {
+ final String publicId = getDoctypeProperty(OutputKeys.DOCTYPE_PUBLIC);
+ final String systemId = getDoctypeProperty(OutputKeys.DOCTYPE_SYSTEM);
+ if (systemId != null) {
+ documentType("html", publicId, systemId);
+ } else if (isHtmlMethod() && publicId != null) {
+ documentType("html", publicId, null);
+ } else if (isHtml5Version()) {
+ documentType("html", null, null);
+ }
+ }
+
+ @Override
+ public void attribute(final QName qname, final CharSequence value) throws TransformerException {
+ noteMetaAttribute(qname.getLocalPart(), value);
+ final CharSequence effectiveValue = maybeEscapeUri(qname.getLocalPart(), value);
+ // For method="html", minimize boolean attributes when value matches name
+ if (isHtmlMethod() && isBooleanAttribute(qname.getLocalPart(), effectiveValue)) {
+ try {
+ if (!tagIsOpen) {
+ characters(value);
+ return;
+ }
+ final Writer w = getWriter();
+ w.write(' ');
+ w.write(qname.getLocalPart());
+ // Don't write ="value" — minimized form
+ } catch (final IOException ioe) {
+ throw new TransformerException(ioe.getMessage(), ioe);
+ }
+ return;
+ }
+ super.attribute(qname, effectiveValue);
+ }
+
+ @Override
+ public void attribute(final String qname, final CharSequence value) throws TransformerException {
+ // Strip prefix for the redundancy check (we want the local name).
+ final int colon = qname.indexOf(':');
+ final String localName = colon < 0 ? qname : qname.substring(colon + 1);
+ noteMetaAttribute(localName, value);
+ final CharSequence effectiveValue = maybeEscapeUri(localName, value);
+ if (isHtmlMethod() && isBooleanAttribute(qname, effectiveValue)) {
+ try {
+ if (!tagIsOpen) {
+ characters(value);
+ return;
+ }
+ final Writer w = getWriter();
+ w.write(' ');
+ w.write(qname);
+ } catch (final IOException ioe) {
+ throw new TransformerException(ioe.getMessage(), ioe);
+ }
+ return;
+ }
+ super.attribute(qname, effectiveValue);
+ }
+
+ /**
+ * Apply escape-uri-attributes when the current element/attribute names
+ * a URI-valued attribute; otherwise return the value unchanged. Escaping
+ * is applied for both HTML and XHTML output methods, so URI-valued
+ * attributes round-trip in XHTML 1.0 / 5 output too.
+ */
+ private CharSequence maybeEscapeUri(final String attrLocal, final CharSequence value) {
+ if (currentTag == null) {
+ return value;
+ }
+ final String elementLocal = currentTag.contains(":")
+ ? currentTag.substring(currentTag.indexOf(':') + 1)
+ : currentTag;
+ if (!shouldEscapeUriAttribute(elementLocal, attrLocal)) {
+ return value;
+ }
+ return escapeUriAttribute(value);
+ }
+
+ private boolean isBooleanAttribute(final String attrName, final CharSequence value) {
+ return BOOLEAN_ATTRIBUTES.contains(attrName.toLowerCase(java.util.Locale.ROOT))
+ && attrName.equalsIgnoreCase(value.toString());
+ }
+
+ @Override
+ protected boolean needsEscape(final char ch, final boolean inAttribute) {
+ // For HTML method, script and style content should not be escaped
+ if (!inAttribute && isHtmlMethod()
+ && currentTag != null && RAW_TEXT_ELEMENTS_HTML.contains(currentTag.toLowerCase(java.util.Locale.ROOT))) {
+ return false;
+ }
+ return super.needsEscape(ch, inAttribute);
+ }
+
+ @Override
+ protected boolean needsEscaping(final boolean inAttribute) {
+ if (!inAttribute && isHtmlMethod()
+ && currentTag != null && RAW_TEXT_ELEMENTS_HTML.contains(currentTag.toLowerCase(java.util.Locale.ROOT))) {
+ return false;
+ }
+ return super.needsEscaping(inAttribute);
+ }
+
+ /**
+ * Per W3C XSLT and XQuery Serialization 3.1 § 7.2.7, the html method
+ * ignores cdata-section-elements for HTML elements (CDATA sections are
+ * not valid HTML syntax) but DOES apply them to foreign content
+ * (e.g. SVG, MathML, or any element in a non-HTML namespace embedded
+ * in the document). For foreign content the rule is unconditional —
+ * the xdm-serialization gate that the XML writer otherwise applies
+ * does not gate HTML's foreign-content CDATA emission.
+ */
+ @Override
+ protected boolean shouldUseCdataSections() {
+ if (isHtmlMethod()) {
+ final String ns = currentElementNamespaceURI();
+ return ns != null && !ns.isEmpty() && !Namespaces.XHTML_NS.equals(ns);
+ }
+ return super.shouldUseCdataSections();
+ }
+
+ /**
+ * Processing-instruction serialization for HTML method (pre-HTML5).
+ * Per W3C XSLT and XQuery Serialization 3.1 § 7.1.5, the HTML output
+ * method emits PIs as {@code <?target data>} (no closing {@code ?>});
+ * XHTML uses the regular XML form which the parent already provides.
+ * The HTML5 (PR2372) variant lives in {@link HTML5Writer}.
+ *
+ * @param target the processing-instruction target
+ * @param data the processing-instruction data; omitted (with its separating space) when null or empty
+ * @throws TransformerException if writing to the underlying writer fails
+ */
+ @Override
+ public void processingInstruction(final String target, final String data) throws TransformerException {
+ if (!isHtmlMethod()) {
+ super.processingInstruction(target, data);
+ return;
+ }
+ try {
+ // A pending start tag must be closed before the PI can be written as content
+ if (tagIsOpen) {
+ closeStartTag(false);
+ }
+ final Writer w = getWriter();
+ w.write("<?");
+ w.write(target);
+ if (data != null && !data.isEmpty()) {
+ w.write(' ');
+ w.write(data);
+ }
+ // HTML form: terminated by ">" alone, not "?>"
+ w.write('>');
+ } catch (final IOException ioe) {
+ throw new TransformerException(ioe.getMessage(), ioe);
+ }
+ }
+
+ @Override
+ protected boolean escapeAmpersandBeforeBrace() {
+ // HTML spec: & before { in attribute values should not be escaped
+ return false;
+ }
+
@Override
protected boolean isInlineTag(final String namespaceURI, final String localName) {
- return (namespaceURI == null || namespaceURI.isEmpty() || Namespaces.XHTML_NS.equals(namespaceURI))
- && inlineTags.contains(localName);
+ return (namespaceURI == null || namespaceURI.isEmpty() || Namespaces.XHTML_NS.equals(namespaceURI))
+ && inlineTags.contains(localName);
+ }
+
+ /**
+ * Write a meta content-type tag as the first child of head when
+ * include-content-type is enabled (the default per W3C Serialization 3.1).
+ */
+ protected void writeContentTypeMeta() throws TransformerException {
+ if (contentTypeMetaWritten || outputProperties == null) {
+ return;
+ }
+ // Only an explicit false value (no/false/0, surrounding whitespace ignored) disables the meta tag,
+ // so "true" and "1" enable it just like "yes"; omit-xml-declaration is parsed the same way.
+ final String includeContentType = outputProperties.getProperty("include-content-type", "yes");
+ if (isBooleanFalse(includeContentType)) {
+ return;
+ }
+ contentTypeMetaWritten = true;
+ try {
+ final String encoding = outputProperties.getProperty(OutputKeys.ENCODING, "UTF-8");
+ closeStartTag(false);
+ final Writer writer = getWriter();
+
+ // HTML5 method uses <meta charset="...">
+ // XHTML and HTML4 use <meta http-equiv="Content-Type" content="...; charset=...">
+ // XHTML mode requires self-closing tags (/>) for valid XML output —
+ // the URL rewrite pipeline re-parses this as XML in the view step.
+ final boolean selfClose = !isHtmlMethod();
+ if (isHtmlMethod() && isHtml5Version()) {
+ writer.write("" : "\">");
+ } else {
+ final String mediaType = outputProperties.getProperty(OutputKeys.MEDIA_TYPE, "text/html");
+ writer.write("" : "\">");
+ }
+ } catch (IOException e) {
+ throw new TransformerException(e.getMessage(), e);
+ }
}
}
diff --git a/exist-core/src/main/java/org/exist/util/serializer/XMLWriter.java b/exist-core/src/main/java/org/exist/util/serializer/XMLWriter.java
index 763aaf52ef6..410f723f8fb 100644
--- a/exist-core/src/main/java/org/exist/util/serializer/XMLWriter.java
+++ b/exist-core/src/main/java/org/exist/util/serializer/XMLWriter.java
@@ -78,6 +78,11 @@ public class XMLWriter implements SerializerWriter {
private String defaultNamespace = "";
+ // Namespace stack (BaseX-style): flat list of (prefix, uri) pairs for all in-scope bindings.
+ // nstack records the list size at each startElement so endElement can roll back declarations.
+ private final List<String> nspaces = new ArrayList<>();
+ private final Deque<Integer> nstack = new ArrayDeque<>();
+
/**
* When serializing an XDM this should be true,
* otherwise false.
@@ -86,8 +91,33 @@ public class XMLWriter implements SerializerWriter {
* compared to retrieving resources from the database.
*/
private boolean xdmSerialization = false;
+ private boolean xml11 = false;
+ private boolean canonical = false;
+ @Nullable private java.text.Normalizer.Form normalizationForm = null;
+
+ // Canonical XML: buffer namespaces and attributes for sorting
+ private final List<String[]> canonicalNamespaces = new ArrayList<>(); // [prefix, uri]
+ private final List<String[]> canonicalAttributes = new ArrayList<>(); // [nsUri, localName, qname, value]
private final Deque<QName> elementName = new ArrayDeque<>();
+
+ /**
+ * Returns true if cdata-section-elements should be applied.
+ * Subclasses (e.g., XHTMLWriter for HTML method) can override
+ * to suppress CDATA sections.
+ */
+ protected boolean shouldUseCdataSections() {
+ return xdmSerialization;
+ }
+
+ /**
+ * Returns the namespace URI of the current (innermost) element,
+ * or null if no element is on the stack.
+ */
+ protected String currentElementNamespaceURI() {
+ final QName top = elementName.peek();
+ return top != null ? top.getNamespaceURI() : null;
+ }
private LazyVal> cdataSectionElements = new LazyVal<>(this::parseCdataSectionElementNames);
private boolean cdataSetionElement = false;
@@ -96,8 +126,9 @@ public class XMLWriter implements SerializerWriter {
Arrays.fill(textSpecialChars, false);
textSpecialChars['<'] = true;
textSpecialChars['>'] = true;
- // textSpecialChars['\r'] = true;
+ textSpecialChars['\r'] = true;
textSpecialChars['&'] = true;
+ textSpecialChars[0x7F] = true; // DEL must be escaped as a numeric character reference (&#x7F;)
attrSpecialChars = new boolean[128];
Arrays.fill(attrSpecialChars, false);
@@ -108,6 +139,7 @@ public class XMLWriter implements SerializerWriter {
attrSpecialChars['\t'] = true;
attrSpecialChars['&'] = true;
attrSpecialChars['"'] = true;
+ attrSpecialChars[0x7F] = true; // DEL must be escaped as a numeric character reference (&#x7F;)
}
@Nullable private XMLDeclaration originalXmlDecl;
@@ -139,6 +171,10 @@ public void setOutputProperties(final Properties properties) {
}
this.xdmSerialization = "yes".equals(outputProperties.getProperty(EXistOutputKeys.XDM_SERIALIZATION, "no"));
+ this.xml11 = "1.1".equals(outputProperties.getProperty(OutputKeys.VERSION));
+ this.normalizationForm = parseNormalizationForm(outputProperties.getProperty("normalization-form", "none"));
+ final String canonicalProp = outputProperties.getProperty(EXistOutputKeys.CANONICAL);
+ this.canonical = "yes".equals(canonicalProp) || "true".equals(canonicalProp) || "1".equals(canonicalProp);
}
private Set parseCdataSectionElementNames() {
@@ -166,6 +202,8 @@ protected void resetObjectState() {
originalXmlDecl = null;
doctypeWritten = false;
defaultNamespace = "";
+ nspaces.clear();
+ nstack.clear();
cdataSectionElements = new LazyVal<>(this::parseCdataSectionElementNames);
}
@@ -184,12 +222,35 @@ public Writer getWriter() {
}
public String getDefaultNamespace() {
- return defaultNamespace.isEmpty() ? null : defaultNamespace;
+ final String fromStack = nsLookup("");
+ return (fromStack == null || fromStack.isEmpty()) ? null : fromStack;
}
public void setDefaultNamespace(final String namespace) {
+ // Keep the baseline field in sync; nsLookup() falls back to it when the
+ // namespace stack has no in-scope binding for the default prefix.
defaultNamespace = namespace == null ? "" : namespace;
}
+
+ /**
+ * Looks up the currently in-scope URI for {@code prefix} by scanning the flat
+ * namespace list from innermost to outermost scope.
+ * For the default-namespace prefix ({@code ""}), falls back to the
+ * {@link #defaultNamespace} baseline field when the stack has no binding.
+ *
+ * @return the in-scope URI, or {@code null} if {@code prefix} is unbound
+ */
+ private String nsLookup(final String prefix) {
+ for (int i = nspaces.size() - 2; i >= 0; i -= 2) {
+ if (nspaces.get(i).equals(prefix)) {
+ return nspaces.get(i + 1);
+ }
+ }
+ if (prefix.isEmpty()) {
+ return defaultNamespace.isEmpty() ? null : defaultNamespace;
+ }
+ return null;
+ }
public void startDocument() throws TransformerException {
resetObjectState();
@@ -207,15 +268,16 @@ public void startElement(final String namespaceUri, final String localName, fina
if(!declarationWritten) {
writeDeclaration();
}
-
+
if(!doctypeWritten) {
writeDoctype(qname);
}
-
+
try {
if(tagIsOpen) {
closeStartTag(false);
}
+ nstack.push(nspaces.size());
writer.write('<');
writer.write(qname);
tagIsOpen = true;
@@ -233,21 +295,22 @@ public void startElement(final QName qname) throws TransformerException {
if(!declarationWritten) {
writeDeclaration();
}
-
+
if(!doctypeWritten) {
writeDoctype(qname.getStringValue());
}
-
+
try {
if(tagIsOpen) {
closeStartTag(false);
}
+ nstack.push(nspaces.size());
writer.write('<');
if(qname.getPrefix() != null && !qname.getPrefix().isEmpty()) {
writer.write(qname.getPrefix());
writer.write(':');
}
-
+
writer.write(qname.getLocalPart());
tagIsOpen = true;
elementName.push(qname);
@@ -266,6 +329,9 @@ public void endElement(final String namespaceURI, final String localName, final
writer.write('>');
}
elementName.pop();
+ if (!nstack.isEmpty()) {
+ nspaces.subList(nstack.pop(), nspaces.size()).clear();
+ }
} catch(final IOException ioe) {
throw new TransformerException(ioe.getMessage(), ioe);
}
@@ -285,40 +351,74 @@ public void endElement(final QName qname) throws TransformerException {
writer.write('>');
}
elementName.pop();
+ if (!nstack.isEmpty()) {
+ nspaces.subList(nstack.pop(), nspaces.size()).clear();
+ }
} catch(final IOException ioe) {
throw new TransformerException(ioe.getMessage(), ioe);
}
}
public void namespace(final String prefix, final String nsURI) throws TransformerException {
- if((nsURI == null) && (prefix == null || prefix.isEmpty())) {
+ final String normPrefix = prefix != null ? prefix : "";
+ final String normUri = nsURI != null ? nsURI : "";
+
+ // The xml namespace is implicitly declared and never needs explicit serialization
+ if ("xml".equals(normPrefix)) {
return;
}
- try {
- if(!tagIsOpen) {
+ try {
+ if (!tagIsOpen) {
+ // An xmlns="" outside a start tag is harmless — just skip it
+ if (normUri.isEmpty() && normPrefix.isEmpty()) {
+ return;
+ }
throw new TransformerException("Found a namespace declaration outside an element");
}
- if(prefix != null && !prefix.isEmpty()) {
- writer.write(' ');
- writer.write("xmlns");
- writer.write(':');
- writer.write(prefix);
- writer.write("=\"");
- writeChars(nsURI, true);
- writer.write('"');
- } else {
- if(defaultNamespace.equals(nsURI)) {
- return;
+ if (canonical) {
+ // Buffer for sorting — emitted in closeStartTag
+ // Validate: reject relative namespace URIs (SERE0024)
+ if (!normUri.isEmpty() && isRelativeUri(normUri)) {
+ throw new TransformerException("err:SERE0024 Canonical serialization does not allow relative namespace URIs: " + normUri);
}
- writer.write(' ');
- writer.write("xmlns");
+ if (normPrefix.isEmpty() && normUri.isEmpty()) {
+ return; // Skip xmlns="" in canonical (not meaningful for no-namespace elements)
+ }
+ // Deduplicate: replace existing binding for same prefix
+ canonicalNamespaces.removeIf(ns -> ns[0].equals(normPrefix));
+ canonicalNamespaces.add(new String[]{normPrefix, normUri});
+ // Track in namespace stack so getDefaultNamespace() stays accurate
+ nspaces.add(normPrefix);
+ nspaces.add(normUri);
+ return;
+ }
+
+ // Look up what is currently in scope for this prefix.
+ // nsLookup scans nspaces from innermost to outermost and falls back to the
+ // defaultNamespace baseline field for the default-namespace prefix.
+ final String inScope = nsLookup(normPrefix);
+ final String effective = inScope != null ? inScope : "";
+ if (normUri.equals(effective)) {
+ return; // Binding unchanged — no declaration needed
+ }
+
+ // Record the new binding so descendants can see it via nsLookup
+ nspaces.add(normPrefix);
+ nspaces.add(normUri);
+
+ // Write the namespace declaration
+ writer.write(' ');
+ if (normPrefix.isEmpty()) {
+ writer.write("xmlns=\"");
+ } else {
+ writer.write("xmlns:");
+ writer.write(normPrefix);
writer.write("=\"");
- writeChars(nsURI, true);
- writer.write('"');
- defaultNamespace= nsURI;
}
+ writeChars(normUri, true);
+ writer.write('"');
} catch(final IOException ioe) {
throw new TransformerException(ioe.getMessage(), ioe);
}
@@ -329,12 +429,18 @@ public void attribute(String qname, CharSequence value) throws TransformerExcept
if(!tagIsOpen) {
characters(value);
return;
- // throw new TransformerException("Found an attribute outside an
- // element");
}
- writer.write(' ');
- writer.write(qname);
- writer.write("=\"");
+ if (canonical) {
+ // A string qname carries no namespace URI, so resolve its prefix against the in-scope bindings to get the primary sort key
+ final int colon = qname.indexOf(':');
+ final String boundUri = colon > 0 ? nsLookup(qname.substring(0, colon)) : null; // null when unprefixed or unbound (e.g. the implicit xml prefix, which namespace() never records)
+ canonicalAttributes.add(new String[]{boundUri != null ? boundUri : "", colon > 0 ? qname.substring(colon + 1) : qname, qname, value.toString()});
+ return;
+ }
+ // Coalesce ' ' + qname + '="' into a single bulk write when the
+ // qname fits in the scratch buffer (typical case for short HTML
+ // attribute names like class, href, style).
+ writeAttributePrefix(qname);
writeChars(value, true);
writer.write('"');
} catch(final IOException ioe) {
@@ -347,16 +453,26 @@ public void attribute(final QName qname, final CharSequence value) throws Transf
if(!tagIsOpen) {
characters(value);
return;
- // throw new TransformerException("Found an attribute outside an
- // element");
}
- writer.write(' ');
- if(qname.getPrefix() != null && !qname.getPrefix().isEmpty()) {
- writer.write(qname.getPrefix());
- writer.write(':');
+ if (canonical) {
+ final String nsUri = qname.getNamespaceURI() != null ? qname.getNamespaceURI() : "";
+ final String localName = qname.getLocalPart();
+ final String fullName;
+ if (qname.getPrefix() != null && !qname.getPrefix().isEmpty()) {
+ fullName = qname.getPrefix() + ":" + localName;
+ } else {
+ fullName = localName;
+ }
+ canonicalAttributes.add(new String[]{nsUri, localName, fullName, value.toString()});
+ return;
+ }
+ final String prefix = qname.getPrefix();
+ final String localPart = qname.getLocalPart();
+ if (prefix != null && !prefix.isEmpty()) {
+ writePrefixedAttributePrefix(prefix, localPart);
+ } else {
+ writeAttributePrefix(localPart);
}
- writer.write(qname.getLocalPart());
- writer.write("=\"");
writeChars(value, true);
writer.write('"');
} catch(final IOException ioe) {
@@ -364,6 +480,55 @@ public void attribute(final QName qname, final CharSequence value) throws Transf
}
}
+ /**
+ * Write {@code ' ' + qname + '="'} as a single {@code Writer.write(char[],
+ * int, int)} call when {@code qname} fits in the scratch buffer. Reduces
+ * 3 writer calls per attribute to 1.
+ */
+ private void writeAttributePrefix(final String qname) throws IOException {
+ final int qlen = qname.length();
+ final int needed = qlen + 3; // ' ' + qname + '="'
+ if (needed <= ATTR_SCRATCH_LEN) {
+ attrScratch[0] = ' ';
+ qname.getChars(0, qlen, attrScratch, 1);
+ attrScratch[qlen + 1] = '=';
+ attrScratch[qlen + 2] = '"';
+ writer.write(attrScratch, 0, needed);
+ } else {
+ writer.write(' ');
+ writer.write(qname);
+ writer.write("=\"");
+ }
+ }
+
+ /**
+ * Write {@code ' ' + prefix + ':' + localPart + '="'} as a single bulk
+ * write when it fits the scratch buffer.
+ */
+ private void writePrefixedAttributePrefix(final String prefix, final String localPart) throws IOException {
+ final int plen = prefix.length();
+ final int llen = localPart.length();
+ final int needed = plen + llen + 4; // ' ' + prefix + ':' + localPart + '="'
+ if (needed <= ATTR_SCRATCH_LEN) {
+ attrScratch[0] = ' ';
+ prefix.getChars(0, plen, attrScratch, 1);
+ attrScratch[plen + 1] = ':';
+ localPart.getChars(0, llen, attrScratch, plen + 2);
+ attrScratch[plen + llen + 2] = '=';
+ attrScratch[plen + llen + 3] = '"';
+ writer.write(attrScratch, 0, needed);
+ } else {
+ writer.write(' ');
+ writer.write(prefix);
+ writer.write(':');
+ writer.write(localPart);
+ writer.write("=\"");
+ }
+ }
+
+ private static final int ATTR_SCRATCH_LEN = 96;
+ private final char[] attrScratch = new char[ATTR_SCRATCH_LEN];
+
public void characters(final CharSequence chars) throws TransformerException {
if(!declarationWritten) {
writeDeclaration();
@@ -373,12 +538,68 @@ public void characters(final CharSequence chars) throws TransformerException {
if(tagIsOpen) {
closeStartTag(false);
}
- writeChars(chars, false);
+ // When xdmSerialization is active and current element is in cdata-section-elements,
+ // wrap text content in CDATA instead of escaping it (per W3C Serialization 3.1)
+ if (shouldUseCdataSections() && !elementName.isEmpty()
+ && cdataSectionElements.get().contains(elementName.peek())) {
+ writeCdataContent(chars);
+ } else {
+ writeChars(chars, false);
+ }
} catch(final IOException ioe) {
throw new TransformerException(ioe.getMessage(), ioe);
}
}
+ private void writeCdataContent(final CharSequence chars) throws IOException {
+ // CDATA sections must be split when:
+ // 1. The content contains "]]>" (which would end the CDATA prematurely)
+ // 2. A character cannot be represented in the output encoding (must be escaped as NN;)
+ final String s = normalize(chars).toString();
+ boolean inCdata = false;
+ for (int i = 0; i < s.length(); ) {
+ final int cp = s.codePointAt(i);
+ final int cpLen = Character.charCount(cp);
+
+ // Check for "]]>" sequence
+ if (cp == ']' && i + 2 < s.length() && s.charAt(i + 1) == ']' && s.charAt(i + 2) == '>') {
+ if (!inCdata) {
+ writer.write("");
+ inCdata = false;
+ i += 2; // skip "]]", the ">" will be picked up next
+ continue;
+ }
+
+ // Check if character is encodable in the output charset
+ if (!charSet.inCharacterSet((char) cp)) {
+ // Close any open CDATA section
+ if (inCdata) {
+ writer.write("]]>");
+ inCdata = false;
+ }
+ // Write as character reference
+ writer.write("");
+ writer.write(Integer.toHexString(cp));
+ writer.write(';');
+ } else {
+ // Encodable character — write inside CDATA
+ if (!inCdata) {
+ writer.write("");
+ }
+ }
+
public void characters(final char[] ch, final int start, final int len) throws TransformerException {
if(!declarationWritten) {
writeDeclaration();
@@ -510,8 +731,23 @@ public void documentType(final String name, final String publicId, final String
protected void closeStartTag(final boolean isEmpty) throws TransformerException {
try {
if(tagIsOpen) {
- if(isEmpty) {
+ if (canonical) {
+ flushCanonicalBuffers();
+ }
+ if(isEmpty && !canonical) {
+ // Non-canonical output keeps the short empty-element form; canonical XML expands it to <a></a> in the next branch
writer.write("/>");
+ } else if (isEmpty) {
+ // Canonical: write ></name> for empty elements
+ writer.write('>');
+ final QName currentElem = elementName.peek();
+ writer.write("</");
+ if (currentElem.getPrefix() != null && !currentElem.getPrefix().isEmpty()) {
+ writer.write(currentElem.getPrefix());
+ writer.write(':');
+ }
+ writer.write(currentElem.getLocalPart());
+ writer.write('>');
} else {
writer.write('>');
}
@@ -522,6 +758,52 @@ protected void closeStartTag(final boolean isEmpty) throws TransformerException
}
}
+ protected boolean isCanonical() {
+ return canonical;
+ }
+
+ protected void flushCanonicalBuffersXhtml() throws TransformerException {
+ try {
+ flushCanonicalBuffers();
+ } catch (final IOException ioe) {
+ throw new TransformerException(ioe.getMessage(), ioe);
+ }
+ }
+
+ private void flushCanonicalBuffers() throws IOException {
+ // Sort namespaces by prefix (default namespace first, then alphabetical)
+ canonicalNamespaces.sort((a, b) -> a[0].compareTo(b[0]));
+ // Write sorted namespaces
+ for (final String[] ns : canonicalNamespaces) {
+ writer.write(' ');
+ if (ns[0].isEmpty()) {
+ writer.write("xmlns=\"");
+ } else {
+ writer.write("xmlns:");
+ writer.write(ns[0]);
+ writer.write("=\"");
+ }
+ writeChars(ns[1], true);
+ writer.write('"');
+ }
+ canonicalNamespaces.clear();
+
+ // Sort attributes by namespace URI (primary), then local name (secondary)
+ canonicalAttributes.sort((a, b) -> {
+ final int cmp = a[0].compareTo(b[0]);
+ return cmp != 0 ? cmp : a[1].compareTo(b[1]);
+ });
+ // Write sorted attributes
+ for (final String[] attr : canonicalAttributes) {
+ writer.write(' ');
+ writer.write(attr[2]); // qualified name
+ writer.write("=\"");
+ writeChars(attr[3], true);
+ writer.write('"');
+ }
+ canonicalAttributes.clear();
+ }
+
protected void writeDeclaration() throws TransformerException {
if(declarationWritten) {
return;
@@ -537,7 +819,9 @@ protected void writeDeclaration() throws TransformerException {
// get the fields of the persisted xml declaration, but overridden with any properties from the serialization properties
final String version = outputProperties.getProperty(OutputKeys.VERSION, (originalXmlDecl.version != null ? originalXmlDecl.version : DEFAULT_XML_VERSION));
final String encoding = outputProperties.getProperty(OutputKeys.ENCODING, (originalXmlDecl.encoding != null ? originalXmlDecl.encoding : DEFAULT_XML_ENCODING));
- @Nullable final String standalone = outputProperties.getProperty(OutputKeys.STANDALONE, originalXmlDecl.standalone);
+ @Nullable final String standaloneOrig = outputProperties.getProperty(OutputKeys.STANDALONE, originalXmlDecl.standalone);
+ // "omit" means standalone should be absent from the declaration
+ @Nullable final String standalone = (standaloneOrig != null && "omit".equalsIgnoreCase(standaloneOrig.trim())) ? null : standaloneOrig;
writeDeclaration(version, encoding, standalone);
@@ -545,11 +829,15 @@ protected void writeDeclaration() throws TransformerException {
}
final String omitXmlDecl = outputProperties.getProperty(OutputKeys.OMIT_XML_DECLARATION, "yes");
- if ("no".equals(omitXmlDecl)) {
+ @Nullable final String standaloneRaw = outputProperties.getProperty(OutputKeys.STANDALONE);
+ // "omit" means standalone should be absent from the declaration
+ @Nullable final String standalone = (standaloneRaw != null && "omit".equalsIgnoreCase(standaloneRaw.trim())) ? null : standaloneRaw;
+ // Per W3C Serialization 3.1: output declaration if omit-xml-declaration is false/no/0,
+ // or if standalone is explicitly set (the declaration is required to carry standalone)
+ if (isBooleanFalse(omitXmlDecl) || standalone != null) {
// get the fields of the declaration from the serialization properties
final String version = outputProperties.getProperty(OutputKeys.VERSION, DEFAULT_XML_VERSION);
final String encoding = outputProperties.getProperty(OutputKeys.ENCODING, DEFAULT_XML_ENCODING);
- @Nullable final String standalone = outputProperties.getProperty(OutputKeys.STANDALONE);
writeDeclaration(version, encoding, standalone);
}
@@ -564,7 +852,15 @@ private void writeDeclaration(final String version, final String encoding, @Null
writer.write('"');
if(standalone != null) {
writer.write(" standalone=\"");
- writer.write(standalone);
+ // Normalize boolean values to yes/no for XML declaration
+ final String standaloneVal = standalone.trim();
+ if ("true".equals(standaloneVal) || "1".equals(standaloneVal)) {
+ writer.write("yes");
+ } else if ("false".equals(standaloneVal) || "0".equals(standaloneVal)) {
+ writer.write("no");
+ } else {
+ writer.write(standaloneVal);
+ }
writer.write('"');
}
writer.write("?>\n");
@@ -589,60 +885,112 @@ protected void writeDoctype(final String rootElement) throws TransformerExceptio
protected boolean needsEscape(final char ch) {
return true;
}
-
+
+ /**
+ * Whether & before { should be escaped. HTML output returns false
+ * per W3C HTML serialization spec. XML output returns true (always escape &).
+ *
+ * This base implementation unconditionally answers {@code true}.
+ * NOTE(review): the HTML behaviour described above has to come from an
+ * override in a subclass; confirm against the HTML writers.
+ *
+ * @return {@code true} if the ampersand must be escaped in that position
+ */
+ protected boolean escapeAmpersandBeforeBrace() {
+ return true;
+ }
+
+    /**
+     * Tests whether a serialization boolean parameter holds one of the lexical
+     * forms meaning "false". Per W3C Serialization 3.1 these are {@code "no"},
+     * {@code "false"} and {@code "0"}; surrounding whitespace is ignored and the
+     * match is case-sensitive.
+     *
+     * @param value the raw parameter value, may be {@code null}
+     * @return {@code true} only for one of the three "false" forms;
+     *         {@code false} for {@code null} and for every other value
+     */
+    protected static boolean isBooleanFalse(final String value) {
+        if (value == null) {
+            return false;
+        }
+        switch (value.trim()) {
+            case "no":
+            case "false":
+            case "0":
+                return true;
+            default:
+                return false;
+        }
+    }
+
+ /**
+ * Whether the given character needs escaping. Subclasses can override
+ * to suppress escaping for specific contexts (e.g., HTML raw text elements).
+ *
+ * This base implementation ignores {@code inAttribute} and returns whatever
+ * the single-argument {@code needsEscape(char)} returns for {@code ch}.
+ *
+ * @param ch the character to check
+ * @param inAttribute true if we're writing an attribute value
+ * @return {@code true} if {@code ch} must be escaped
+ */
+ protected boolean needsEscape(final char ch, final boolean inAttribute) {
+ return needsEscape(ch);
+ }
+
+ /**
+ * Whether the current context requires character escaping at all.
+ * Subclasses (e.g., HTML5Writer for {@code ",
+ "html", "5.0");
+ assertTrue("Script attribute & should be escaped: " + result,
+ result.contains("language=\"Jack&Jill\""));
+ assertTrue("Script body && should NOT be escaped: " + result,
+ result.contains("go && run()"));
+ }
+
+ @Test
+ public void html40NoDoctypeWithoutPublicSystem() throws Exception {
+ // HTML 4.0 without doctype-public/doctype-system should not emit DOCTYPE
+ final String result = serialize("hello
", "html", "4.0");
+ assertFalse("HTML 4.0 without public/system should NOT have DOCTYPE: " + result,
+ result.contains("\n";
+ final String expected = "";
final QName elQName = new QName("input");
writer.startElement(elQName);
writer.attribute("checked", "checked");
@@ -54,7 +54,7 @@ public void testAttributeWithBooleanValue() throws Exception {
@Test
public void testAttributeWithNonBooleanValue() throws Exception {
- final String expected = "\n";
+ final String expected = "";
final QName elQName = new QName("input");
writer.startElement(elQName);
writer.attribute("name", "name");
@@ -66,7 +66,7 @@ public void testAttributeWithNonBooleanValue() throws Exception {
@Test
public void testAttributeQNameWithBooleanValue() throws Exception {
- final String expected = "\n";
+ final String expected = "";
final QName elQName = new QName("input");
final QName attrQName = new QName("checked");
writer.startElement(elQName);
@@ -79,7 +79,7 @@ public void testAttributeQNameWithBooleanValue() throws Exception {
@Test
public void testAttributeQNameWithNonBooleanValue() throws Exception {
- final String expected = "\n";
+ final String expected = "";
final QName elQName = new QName("input");
final QName attrQName = new QName("name");
writer.startElement(elQName);
diff --git a/exist-core/src/test/java/org/exist/util/serializer/HtmlSerializerBenchmark.java b/exist-core/src/test/java/org/exist/util/serializer/HtmlSerializerBenchmark.java
new file mode 100644
index 00000000000..932b2f2b8a4
--- /dev/null
+++ b/exist-core/src/test/java/org/exist/util/serializer/HtmlSerializerBenchmark.java
@@ -0,0 +1,291 @@
+/*
+ * eXist-db Open Source Native XML Database
+ * Copyright (C) 2001 The eXist-db Authors
+ *
+ * info@exist-db.org
+ * http://www.exist-db.org
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+package org.exist.util.serializer;
+
+import org.exist.dom.QName;
+import org.junit.Test;
+
+import javax.xml.transform.OutputKeys;
+import javax.xml.transform.TransformerException;
+import java.io.IOException;
+import java.io.Writer;
+import java.util.Properties;
+
+import static org.junit.Assert.assertTrue;
+
+/**
+ * Microbenchmark for HTML serialization that exercises the writeChars/writeCharSeq
+ * hot path. Builds a representative HTML document with paragraphs of plain text
+ * (no special chars in the safe runs) and serializes it many times.
+ *
+ * Compares two configurations:
+ * - bulk writes via {@link Writer#write(char[], int, int)} (current code)
+ * - per-char writes via {@link Writer#write(int)} (the previous behaviour)
+ *
+ * The "per-char" baseline is simulated by wrapping the writer in one that
+ * forwards every write to its delegate one character at a time — this lets
+ * us show the effect of the bulk path without having to revert the patch.
+ * A separate counting writer records how many characters arrive through
+ * each {@link Writer} entry point.
+ */
+public class HtmlSerializerBenchmark {
+
+ // Filler text that run() writes as the character content of every paragraph.
+ private static final String LOREM =
+ "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do " +
+ "eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim " +
+ "ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut " +
+ "aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit " +
+ "in voluptate velit esse cillum dolore eu fugiat nulla pariatur.";
+
+ // Number of p elements run() emits per generated document.
+ private static final int PARAGRAPH_COUNT = 80;
+ // Number of documents serialized inside each measured loop.
+ private static final int ITERATIONS = 200;
+
+    /**
+     * A sink that discards everything it is given and only records which
+     * {@link Writer} entry point delivered it, so a test can check whether
+     * output arrived in bulk or one character at a time.
+     */
+    private static final class CountingWriter extends Writer {
+        /** Invocations of {@link #write(char[], int, int)}. */
+        long bulkWriteCalls;
+        /** Sum of the {@code len} arguments passed to {@link #write(char[], int, int)}. */
+        long bulkCharsWritten;
+        /** Invocations of {@link #write(int)}; each one stands for a single character. */
+        long perCharWriteCalls;
+        /** Invocations of {@link #write(String, int, int)}. */
+        long stringWriteCalls;
+        /** Sum of the {@code len} arguments passed to {@link #write(String, int, int)}. */
+        long stringCharsWritten;
+
+        @Override
+        public void write(final int c) {
+            perCharWriteCalls += 1;
+        }
+
+        @Override
+        public void write(final char[] cbuf, final int off, final int len) {
+            bulkWriteCalls += 1;
+            bulkCharsWritten = bulkCharsWritten + len;
+        }
+
+        @Override
+        public void write(final String str, final int off, final int len) {
+            stringWriteCalls += 1;
+            stringCharsWritten = stringCharsWritten + len;
+        }
+
+        @Override
+        public void flush() {
+            // nothing is buffered; the statistics live in the fields
+        }
+
+        @Override
+        public void close() {
+            // no underlying resource to release
+        }
+    }
+
+    /**
+     * Decorator that turns every bulk write into a sequence of single-character
+     * {@link Writer#write(int)} calls on the wrapped writer; {@code flush} and
+     * {@code close} are passed straight through. The benchmark uses it as a
+     * stand-in for a serializer that emits its output character by character.
+     */
+    private static final class PerCharWriter extends Writer {
+        private final Writer delegate;
+
+        PerCharWriter(final Writer delegate) {
+            this.delegate = delegate;
+        }
+
+        @Override
+        public void write(final int c) throws IOException {
+            delegate.write(c);
+        }
+
+        @Override
+        public void write(final char[] cbuf, final int off, final int len) throws IOException {
+            int pos = off;
+            for (int remaining = len; remaining > 0; remaining--) {
+                delegate.write(cbuf[pos++]);
+            }
+        }
+
+        @Override
+        public void write(final String str, final int off, final int len) throws IOException {
+            int pos = off;
+            for (int remaining = len; remaining > 0; remaining--) {
+                delegate.write(str.charAt(pos++));
+            }
+        }
+
+        @Override
+        public void flush() throws IOException {
+            delegate.flush();
+        }
+
+        @Override
+        public void close() throws IOException {
+            delegate.close();
+        }
+    }
+
+ /**
+ * Discards bytes — simulates a network sink with no I/O cost.
+ * Both the single-byte and the array write are overridden with empty bodies,
+ * so nothing is stored or forwarded.
+ */
+ private static final class NullOutputStream extends java.io.OutputStream {
+ @Override public void write(int b) { /* no-op sink: byte intentionally discarded */ }
+ @Override public void write(byte[] b, int off, int len) { /* no-op sink: bytes intentionally discarded */ }
+ }
+
+    /**
+     * Builds the writer used by the timing comparison: a UTF-8
+     * {@link java.io.OutputStreamWriter} placed directly on a discarding stream.
+     *
+     * @return a new writer; the caller is responsible for closing it
+     */
+    private static java.io.OutputStreamWriter newProductionLikeWriter() {
+        // Intended to mirror the typical HTTP-response chain: an
+        // OutputStreamWriter(UTF-8) over a stream sink, deliberately without a
+        // BufferedWriter in between (the serializer pipeline is said to buffer
+        // at higher levels).
+        final java.io.OutputStream sink = new NullOutputStream();
+        return new java.io.OutputStreamWriter(sink, java.nio.charset.StandardCharsets.UTF_8);
+    }
+
+ @Test
+ public void rawTextFastPath() throws TransformerException, IOException {
+ // Compare per-char writes between an empty " while non-empty
+ // splits the close across two writers.write() calls.)
+ final long stringCharsDelta = withScript.stringCharsWritten - empty.stringCharsWritten;
+ assertTrue("Script body should add bulk string output close to its size; "
+ + "empty=" + empty.stringCharsWritten + " withScript="
+ + withScript.stringCharsWritten + " delta=" + stringCharsDelta
+ + " script.length()=" + script.length(),
+ stringCharsDelta >= script.length() - 5);
+ }
+
+    /**
+     * Serializes a minimal html/body/script document through an
+     * {@code HTML5Writer} into a fresh {@code CountingWriter} and returns that
+     * writer so the caller can inspect its write statistics.
+     *
+     * @param script text sent as the content of the script element; when it is
+     *               empty no characters event is sent at all
+     * @return the counting sink that received the serialized output
+     * @throws TransformerException if the writer reports a serialization failure
+     */
+    private CountingWriter serializeWithScript(final String script) throws TransformerException {
+        final Properties outputProps = new Properties();
+        outputProps.setProperty(OutputKeys.METHOD, "html");
+
+        final CountingWriter sink = new CountingWriter();
+        final XHTMLWriter html = new HTML5Writer(sink);
+        html.setOutputProperties(outputProps);
+
+        html.startDocument();
+        html.startElement(null, "html", "html");
+        html.startElement(null, "body", "body");
+        html.startElement(null, "script", "script");
+        final boolean hasContent = !script.isEmpty();
+        if (hasContent) {
+            html.characters(script);
+        }
+        html.endElement(null, "script", "script");
+        html.endElement(null, "body", "body");
+        html.endElement(null, "html", "html");
+        html.endDocument();
+
+        return sink;
+    }
+
+    /**
+     * Times {@code ITERATIONS} serializations written straight to the
+     * production-like writer and the same number written through a
+     * {@code PerCharWriter} wrapper, prints both timings, and asserts that the
+     * unwrapped (bulk) run was the faster one.
+     *
+     * The pass/fail comparison uses the raw nanosecond totals. The millisecond
+     * figures are derived for printing only: comparing truncated milliseconds
+     * would report a tie, and fail, whenever both runs land in the same
+     * millisecond even though the bulk run was faster.
+     *
+     * @throws TransformerException if serialization fails
+     * @throws IOException if closing one of the writers fails
+     */
+    @Test
+    public void compareAgainstPerCharWriter() throws TransformerException, IOException {
+        // Warm-up — let JIT compile the hot path, alternating both variants
+        for (int i = 0; i < 5; i++) {
+            timeSerializations(1, false);
+            timeSerializations(1, true);
+        }
+
+        // Bulk path (current code)
+        final long bulkNanos = timeSerializations(ITERATIONS, false);
+
+        // Per-char path: wraps the OutputStreamWriter so every char goes through
+        // OutputStreamWriter.write(int)
+        final long perCharNanos = timeSerializations(ITERATIONS, true);
+
+        final long bulkMs = bulkNanos / 1_000_000L;
+        final long perCharMs = perCharNanos / 1_000_000L;
+
+        System.out.println("[HtmlSerializerBenchmark] " + ITERATIONS + " iters of "
+                + PARAGRAPH_COUNT + "-paragraph HTML doc to OutputStreamWriter(UTF-8):");
+        System.out.println("[HtmlSerializerBenchmark] bulk path: " + bulkMs + " ms ("
+                + String.format("%.3f", bulkMs * 1.0 / ITERATIONS) + " ms/doc)");
+        System.out.println("[HtmlSerializerBenchmark] per-char path: " + perCharMs + " ms ("
+                + String.format("%.3f", perCharMs * 1.0 / ITERATIONS) + " ms/doc)");
+        System.out.println("[HtmlSerializerBenchmark] speedup: "
+                + String.format("%.2fx", perCharMs * 1.0 / Math.max(1, bulkMs)));
+
+        // Compare at nanosecond resolution so sub-millisecond differences still count.
+        assertTrue("Bulk path should be faster than per-char path; bulk="
+                + bulkMs + "ms perChar=" + perCharMs + "ms", bulkNanos < perCharNanos);
+    }
+
+    /**
+     * Serializes the benchmark document {@code count} times, each time into a
+     * fresh production-like writer that is closed afterwards, and returns the
+     * elapsed wall-clock time.
+     *
+     * @param count   number of documents to serialize
+     * @param perChar when {@code true} the writer is wrapped in a
+     *                {@code PerCharWriter} before being handed to {@code run}
+     * @return elapsed time in nanoseconds for the whole loop
+     * @throws TransformerException if serialization fails
+     * @throws IOException if closing a writer fails
+     */
+    private long timeSerializations(final int count, final boolean perChar)
+            throws TransformerException, IOException {
+        final long start = System.nanoTime();
+        for (int i = 0; i < count; i++) {
+            try (java.io.OutputStreamWriter w = newProductionLikeWriter()) {
+                run(perChar ? new PerCharWriter(w) : w);
+            }
+        }
+        return System.nanoTime() - start;
+    }
+
+    /**
+     * Serializes the benchmark document {@code ITERATIONS} times into a single
+     * {@code CountingWriter}, prints the collected statistics, and asserts that
+     * more than 90% of all characters were delivered through one of the bulk
+     * {@code write} overloads rather than {@code write(int)}.
+     *
+     * @throws TransformerException if serialization fails
+     * @throws IOException declared for signature compatibility
+     */
+    @Test
+    public void htmlSerializationHotPath() throws TransformerException, IOException {
+        // Warm-up
+        int warmUpsLeft = 3;
+        while (warmUpsLeft-- > 0) {
+            run(new CountingWriter());
+        }
+
+        final CountingWriter stats = new CountingWriter();
+        final long startedAt = System.nanoTime();
+        for (int n = 0; n < ITERATIONS; n++) {
+            run(stats);
+        }
+        final long elapsedMs = (System.nanoTime() - startedAt) / 1_000_000L;
+
+        // Characters that arrived via write(char[],int,int) or write(String,int,int)
+        final long bulkChars = stats.bulkCharsWritten + stats.stringCharsWritten;
+        // Every write(int) call accounts for exactly one further character
+        final long totalChars = bulkChars + stats.perCharWriteCalls;
+        final double bulkPct = bulkChars * 100.0 / totalChars;
+        final double msPerDoc = (double) elapsedMs / ITERATIONS;
+
+        System.out.println("[HtmlSerializerBenchmark] " + ITERATIONS + " iterations of "
+                + PARAGRAPH_COUNT + "-paragraph HTML doc in " + elapsedMs + " ms"
+                + " (" + msPerDoc + " ms/doc)");
+        System.out.println("[HtmlSerializerBenchmark] bulk writes: "
+                + stats.bulkWriteCalls + " (chars: " + stats.bulkCharsWritten + ")");
+        System.out.println("[HtmlSerializerBenchmark] string writes: "
+                + stats.stringWriteCalls + " (chars: " + stats.stringCharsWritten + ")");
+        System.out.println("[HtmlSerializerBenchmark] per-char writes: "
+                + stats.perCharWriteCalls);
+        System.out.println("[HtmlSerializerBenchmark] " + String.format("%.2f", bulkPct)
+                + "% of output bytes flushed in bulk");
+
+        // The bulk overloads are expected to carry the vast majority of the
+        // safe-character output; escapes for special characters may still use
+        // write(int), but should be a small minority for typical HTML.
+        final boolean mostlyBulk = bulkPct > 90.0;
+        assertTrue("Expected >90% of chars to be flushed in bulk, but got " + bulkPct + "%",
+                mostlyBulk);
+    }
+
+    /**
+     * Serializes one benchmark document to {@code out}: an html/body skeleton
+     * holding {@code PARAGRAPH_COUNT} p elements, each with a
+     * {@code class="para"} attribute and {@code LOREM} as its text. The writer
+     * is configured with method "html" and indent "yes".
+     *
+     * @param out destination for the serialized document
+     * @throws TransformerException if the writer reports a serialization failure
+     */
+    private void run(final Writer out) throws TransformerException {
+        final Properties settings = new Properties();
+        settings.setProperty(OutputKeys.METHOD, "html");
+        settings.setProperty(OutputKeys.INDENT, "yes");
+
+        final XHTMLWriter serializer = new XHTMLWriter(out);
+        serializer.setOutputProperties(settings);
+
+        serializer.startDocument();
+        serializer.startElement(null, "html", "html");
+        serializer.startElement(null, "body", "body");
+
+        int paragraphsLeft = PARAGRAPH_COUNT;
+        while (paragraphsLeft-- > 0) {
+            serializer.startElement(null, "p", "p");
+            serializer.attribute("class", "para");
+            serializer.characters(LOREM);
+            serializer.endElement(null, "p", "p");
+        }
+
+        serializer.endElement(null, "body", "body");
+        serializer.endElement(null, "html", "html");
+        serializer.endDocument();
+    }
+}
diff --git a/exist-core/src/test/xquery/xquery3/fnSerializeCharacterMaps.xqm b/exist-core/src/test/xquery/xquery3/fnSerializeCharacterMaps.xqm
index e971e7a5a93..64fd0d5267e 100644
--- a/exist-core/src/test/xquery/xquery3/fnSerializeCharacterMaps.xqm
+++ b/exist-core/src/test/xquery/xquery3/fnSerializeCharacterMaps.xqm
@@ -59,3 +59,62 @@ function testSerialize:use_character_maps-032-params-as-map() {
let $result := serialize($testSerialize:atomic, $params)
return contains($result, "foo:a$$name")
};
+
+(: JSON serialization with use-character-maps :)
+
+declare
+    %test:assertEquals('{"name":"hello ©orld"}')
+function testSerialize:json_character_map_string() {
+    (: With method "json", the character map replaces the "w" in the string value by "©" :)
+    serialize(
+        map { "name": "hello world" },
+        map {
+            "method": "json",
+            "use-character-maps": map { "w": "©" }
+        }
+    )
+};
+
+declare
+    %test:assertEquals('{"price":"$100"}')
+function testSerialize:json_character_map_special() {
+    (: The character map turns "#" into "$" inside the JSON string value :)
+    let $character-map := map { "#": "$" }
+    let $options := map {
+        "method": "json",
+        "use-character-maps": $character-map
+    }
+    return serialize(map { "price": "#100" }, $options)
+};
+
+declare
+ %test:assertTrue
+function testSerialize:json_character_map_raw_output() {
+ (: Character map replacements bypass JSON escaping — raw output :)
+ let $params := map {
+ "method": "json",
+ "use-character-maps": map { "*": "" }
+ }
+ let $result := serialize(map { "text": "hello *world*" }, $params)
+ (: The should appear raw, not escaped :)
+ return contains($result, "")
+};
+
+declare
+    %test:assertEquals('"(c) 2024"')
+function testSerialize:json_character_map_copyright() {
+    (: A single mapped character may expand to several: the copyright sign is written as "(c)" :)
+    serialize(
+        "© 2024",
+        map {
+            "method": "json",
+            "use-character-maps": map { "©": "(c)" }
+        }
+    )
+};
+
+declare
+ %test:assertEquals('(c) symbol')
+function testSerialize:xml_character_map_element_text() {
+ (: XML character maps in element text :)
+ let $params := map {
+ "method": "xml",
+ "omit-xml-declaration": true(),
+ "use-character-maps": map { "©": "(c)" }
+ }
+ return serialize(© symbol, $params)
+};
diff --git a/exist-core/src/test/xquery/xquery3/serialize.xql b/exist-core/src/test/xquery/xquery3/serialize.xql
index bea438d425f..4ac541f0f16 100644
--- a/exist-core/src/test/xquery/xquery3/serialize.xql
+++ b/exist-core/src/test/xquery/xquery3/serialize.xql
@@ -847,7 +847,7 @@ function ser:serialize-xml-134() {
};
declare
- %test:assertEquals(' ')
+ %test:assertEquals('')
function ser:serialize-html-5-boolean-attribute-names() {
=> serialize($ser:opt-map-html5)
@@ -855,7 +855,7 @@ function ser:serialize-html-5-boolean-attribute-names() {
};
declare
- %test:assertEquals('
')
+ %test:assertEquals('
')
function ser:serialize-html-5-empty-tags() {
=> serialize($ser:opt-map-html5)
@@ -876,7 +876,7 @@ function ser:serialize-html-5-raw-text-elements-body() {
};
declare
- %test:assertEquals(' ')
+ %test:assertEquals(' ')
function ser:serialize-html-5-raw-text-elements-head() {
@@ -890,7 +890,7 @@ function ser:serialize-html-5-raw-text-elements-head() {
};
declare
- %test:assertEquals(' XML > JSON')
+ %test:assertEquals(' XML > JSON')
function ser:serialize-html-5-needs-escape-elements() {
@@ -952,3 +952,59 @@ declare
function ser:item-separator-applies-to-array-members() {
serialize([1,2], map { "item-separator": "|" })
};
+
+declare
+ %test:assertTrue
+function ser:cdata-section-elements-no-namespace() {
+ (: Simple unprefixed CDATA test :)
+ let $result := serialize(
+ bolditalic,
+ map {
+ "method": "xml",
+ "cdata-section-elements": QName("", "b"),
+ "omit-xml-declaration": true()
+ }
+ )
+ return contains($result, "CDATA[bold]") and not(contains($result, "CDATA[italic]"))
+};
+
+declare
+ %test:assertTrue
+function ser:cdata-section-elements-with-namespace() {
+ (: Namespaced CDATA test :)
+ let $result := serialize(
+ BOLDITALIC,
+ map {
+ "method": "xml",
+ "cdata-section-elements": QName("http://www.example.org/ns/p", "b"),
+ "omit-xml-declaration": true()
+ }
+ )
+ return contains($result, "CDATA[BOLD]") and not(contains($result, "CDATA[ITALIC]"))
+};
+
+declare
+    %test:assertEquals('1|2|3')
+function ser:item-separator-with-atomics() {
+    (: Three atomic values serialized with "|" as the item-separator :)
+    let $options := map {
+        "method": "xml",
+        "item-separator": "|",
+        "omit-xml-declaration": true()
+    }
+    return serialize((1, 2, 3), $options)
+};
+
+declare
+ %test:assertTrue
+function ser:cdata-section-elements-combined() {
+ (: Combined: both unprefixed and namespaced elements get CDATA :)
+ let $result := serialize(
+ bolditalicBOLDITALIC,
+ map {
+ "method": "xml",
+ "cdata-section-elements": (QName("", "b"), QName("http://www.example.org/ns/p", "b")),
+ "omit-xml-declaration": true()
+ }
+ )
+ return contains($result, "CDATA[bold]") and contains($result, "CDATA[BOLD]")
+ and not(contains($result, "CDATA[italic]")) and not(contains($result, "CDATA[ITALIC]"))
+};