diff --git a/crates/mdbook-html/src/html/tests.rs b/crates/mdbook-html/src/html/tests.rs index 58825c9c81..3d3f1956cd 100644 --- a/crates/mdbook-html/src/html/tests.rs +++ b/crates/mdbook-html/src/html/tests.rs @@ -37,6 +37,28 @@ if (3 < 5 > 10) assert_eq!(output, script); } +#[test] +fn parse_html_svg_with_xml_decl() { + let html = r#""#; + let ts = parse_html(html); + for t in &ts { + if let Token::ParseError(e) = t { + panic!("unexpected parse error: {e:?}"); + } + } +} + +#[test] +fn parse_html_pre_with_svg_xml_decl() { + let html = r#"
"#; + let ts = parse_html(html); + for t in &ts { + if let Token::ParseError(e) = t { + panic!("unexpected parse error: {e:?}"); + } + } +} + // What happens if a script doesn't end. #[test] fn parse_html_script_unclosed() { diff --git a/crates/mdbook-html/src/html/tokenizer.rs b/crates/mdbook-html/src/html/tokenizer.rs index e3d8e44f86..e60c7bb00b 100644 --- a/crates/mdbook-html/src/html/tokenizer.rs +++ b/crates/mdbook-html/src/html/tokenizer.rs @@ -9,6 +9,7 @@ use html5ever::tokenizer::states::RawKind; use html5ever::tokenizer::{ BufferQueue, TagKind, Token, TokenSink, TokenSinkResult, Tokenizer, TokenizerOpts, }; +use std::borrow::Cow; use std::cell::RefCell; /// Collector for HTML tokens. @@ -45,6 +46,15 @@ impl TokenSink for TokenCollector { TagKind::EndTag => {} } } + if tag_name == b"svg" { + match tag.kind { + TagKind::StartTag => { + self.tokens.borrow_mut().push(token); + return TokenSinkResult::RawData(RawKind::Rawtext); + } + TagKind::EndTag => {} + } + } self.tokens.borrow_mut().push(token); } Token::CommentToken(_) => { @@ -63,8 +73,32 @@ impl TokenSink for TokenCollector { } } +/// Strips XML processing instructions (e.g. ``) that are invalid in HTML +/// but commonly appear in inline SVG emitted by preprocessors. +fn strip_xml_processing_instructions(html: &str) -> Cow<'_, str> { + let mut out = String::new(); + let mut rest = html; + let mut changed = false; + while let Some(start) = rest.find("") else { + out.push_str(&rest[start..]); + return Cow::Owned(out); + }; + rest = &after[end + 2..]; + } + if !changed { + return Cow::Borrowed(html); + } + out.push_str(rest); + Cow::Owned(out) +} + /// Parse HTML into tokens. pub(crate) fn parse_html(html: &str) -> Vec { + let html = strip_xml_processing_instructions(html); let tendril: ByteTendril = html.as_bytes().into(); let mut queue = BufferQueue::default(); queue.push_back(tendril.try_reinterpret().unwrap()); diff --git a/crates/mdbook-html/src/html/tree.rs b/crates/mdbook-html/src/html/tree.rs index 5cb97ce378..e101de9369 100644 --- a/crates/mdbook-html/src/html/tree.rs +++ b/crates/mdbook-html/src/html/tree.rs @@ -662,7 +662,7 @@ where /// Adds an open HTML tag. fn start_html_tag(&mut self, tag: html5ever::tokenizer::Tag, is_raw: &mut bool) { let is_closed = is_void_element(&tag.name) || tag.self_closing; - *is_raw = matches!(&*tag.name, "script" | "style"); + *is_raw = matches!(&*tag.name, "script" | "style" | "svg"); let name = QualName::new(None, html5ever::ns!(html), tag.name); let attrs = tag .attrs