Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 24 additions & 4 deletions packages/markitdown/src/markitdown/converters/_html_converter.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import io
import warnings
from typing import Any, BinaryIO, Optional
from bs4 import BeautifulSoup

Expand Down Expand Up @@ -44,6 +45,10 @@ def convert(
stream_info: StreamInfo,
**kwargs: Any, # Options to pass to the converter
) -> DocumentConverterResult:
# Pop our own keyword before forwarding the rest to markdownify.
# strict=True raises RecursionError instead of falling back to plain text.
strict: bool = kwargs.pop("strict", False)

# Parse the stream
encoding = "utf-8" if stream_info.charset is None else stream_info.charset
soup = BeautifulSoup(file_stream, "html.parser", from_encoding=encoding)
Expand All @@ -55,10 +60,25 @@ def convert(
# Print only the main content
body_elm = soup.find("body")
webpage_text = ""
if body_elm:
webpage_text = _CustomMarkdownify(**kwargs).convert_soup(body_elm)
else:
webpage_text = _CustomMarkdownify(**kwargs).convert_soup(soup)
try:
if body_elm:
webpage_text = _CustomMarkdownify(**kwargs).convert_soup(body_elm)
else:
webpage_text = _CustomMarkdownify(**kwargs).convert_soup(soup)
except RecursionError:
if strict:
raise
# Large or deeply-nested HTML can exceed Python's recursion limit
# during markdownify's recursive DOM traversal. Fall back to
# BeautifulSoup's iterative get_text() so the caller still gets
# usable plain-text content instead of raw HTML.
warnings.warn(
"HTML document is too deeply nested for markdown conversion "
"(RecursionError). Falling back to plain-text extraction.",
stacklevel=2,
)
target = body_elm if body_elm else soup
webpage_text = target.get_text("\n", strip=True)

assert isinstance(webpage_text, str)

Expand Down
50 changes: 50 additions & 0 deletions packages/markitdown/tests/test_module_misc.py
Original file line number Diff line number Diff line change
Expand Up @@ -288,6 +288,56 @@ def test_input_as_strings() -> None:
assert "# Test" in result.text_content


def test_deeply_nested_html_fallback() -> None:
    """Deeply nested HTML must degrade to plain text, not raw markup.

    Regression test for issue #1636. The interpreter's recursion limit is
    lowered while the conversion runs so that a RecursionError is triggered
    regardless of the host's default limit, which keeps the test
    deterministic across platforms and CI configurations. The previous limit
    is always restored afterwards.
    """
    import sys
    import warnings

    nesting = 500
    # Well below markdownify's traversal depth for a document nested
    # `nesting` levels deep, so the fallback path is taken everywhere.
    reduced_limit = 200

    # `nesting` opening <div>s, a single paragraph, then `nesting` closers.
    document = "".join(
        (
            "<html><body>",
            '<div style="margin-left:10px">' * nesting,
            "<p>Deep content with <b>bold text</b></p>",
            "</div>" * nesting,
            "</body></html>",
        )
    )
    payload = document.encode("utf-8")

    converter = MarkItDown()

    # Remember the current limit so the finally clause can undo the change
    # and the lowered limit never leaks into other tests.
    saved_limit = sys.getrecursionlimit()
    sys.setrecursionlimit(reduced_limit)
    try:
        with warnings.catch_warnings(record=True) as captured:
            # Record every warning, even ones already emitted once elsewhere.
            warnings.simplefilter("always")
            result = converter.convert_stream(
                io.BytesIO(payload),
                file_extension=".html",
            )

        # The converter must announce that it fell back to plain text.
        assert any("deeply nested" in str(item.message) for item in captured)
    finally:
        sys.setrecursionlimit(saved_limit)

    # The text survives the fallback; the markup does not.
    rendered = result.markdown
    assert "Deep content" in rendered
    assert "bold text" in rendered
    assert "<div" not in rendered
    assert "<p>" not in rendered


def test_doc_rlink() -> None:
# Test for: CVE-2025-11849
markitdown = MarkItDown()
Expand Down