microsoft · jigangz · Mar 28, 2026 · Apr 5, 2026 · Apr 6, 2026
diff --git a/packages/markitdown/src/markitdown/converters/_html_converter.py b/packages/markitdown/src/markitdown/converters/_html_converter.py
@@ -1,4 +1,5 @@
 import io
+import warnings
 from typing import Any, BinaryIO, Optional
 from bs4 import BeautifulSoup
 
@@ -44,6 +45,10 @@ def convert(
         stream_info: StreamInfo,
         **kwargs: Any,  # Options to pass to the converter
     ) -> DocumentConverterResult:
+        # Pop our own keyword before forwarding the rest to markdownify.
+        # strict=True raises RecursionError instead of falling back to plain text.
+        strict: bool = kwargs.pop("strict", False)
+
         # Parse the stream
         encoding = "utf-8" if stream_info.charset is None else stream_info.charset
         soup = BeautifulSoup(file_stream, "html.parser", from_encoding=encoding)
@@ -55,10 +60,25 @@ def convert(
         # Print only the main content
         body_elm = soup.find("body")
         webpage_text = ""
-        if body_elm:
-            webpage_text = _CustomMarkdownify(**kwargs).convert_soup(body_elm)
-        else:
-            webpage_text = _CustomMarkdownify(**kwargs).convert_soup(soup)
+        try:
+            if body_elm:
+                webpage_text = _CustomMarkdownify(**kwargs).convert_soup(body_elm)
+            else:
+                webpage_text = _CustomMarkdownify(**kwargs).convert_soup(soup)
+        except RecursionError:
+            if strict:
+                raise
+            # Large or deeply-nested HTML can exceed Python's recursion limit
+            # during markdownify's recursive DOM traversal.  Fall back to
+            # BeautifulSoup's iterative get_text() so the caller still gets
+            # usable plain-text content instead of raw HTML.
+            warnings.warn(
+                "HTML document is too deeply nested for markdown conversion "
+                "(RecursionError). Falling back to plain-text extraction.",
+                stacklevel=2,
+            )
+            target = body_elm if body_elm else soup
+            webpage_text = target.get_text("\n", strip=True)
 
         assert isinstance(webpage_text, str)
 

diff --git a/packages/markitdown/tests/test_module_misc.py b/packages/markitdown/tests/test_module_misc.py
@@ -288,6 +288,56 @@ def test_input_as_strings() -> None:
     assert "# Test" in result.text_content
 
 
+def test_deeply_nested_html_fallback() -> None:
+    """Verify the plain-text fallback for HTML nested past the recursion limit.
+
+    When markdown conversion of a deeply nested document raises
+    RecursionError, the converter is expected to warn and return extracted
+    text rather than raw, unconverted HTML (issue #1636).
+
+    The interpreter's recursion limit is lowered for the duration of the
+    conversion so the error does not depend on the host's default limit; the
+    previous limit is always restored afterwards.
+    """
+    import sys
+    import warnings
+
+    nesting_depth = 500
+    reduced_limit = 200  # well below the traversal depth needed for 500 levels
+
+    # Wrap a single paragraph in `nesting_depth` levels of <div> elements.
+    document = (
+        "<html><body>"
+        + '<div style="margin-left:10px">' * nesting_depth
+        + "<p>Deep content with <b>bold text</b></p>"
+        + "</div>" * nesting_depth
+        + "</body></html>"
+    )
+
+    converter = MarkItDown()
+    saved_limit = sys.getrecursionlimit()
+    try:
+        sys.setrecursionlimit(reduced_limit)
+        with warnings.catch_warnings(record=True) as caught:
+            warnings.simplefilter("always")
+            result = converter.convert_stream(
+                io.BytesIO(document.encode("utf-8")),
+                file_extension=".html",
+            )
+
+            # The fallback path must announce itself with a warning.
+            assert any("deeply nested" in str(item.message) for item in caught)
+    finally:
+        # Restore the limit even if conversion or the assertion fails.
+        sys.setrecursionlimit(saved_limit)
+
+    # Only extracted text may remain; no markup should leak into the output.
+    assert "Deep content" in result.markdown
+    assert "bold text" in result.markdown
+    assert "<div" not in result.markdown
+    assert "<p>" not in result.markdown
+
+
 def test_doc_rlink() -> None:
     # Test for: CVE-2025-11849
     markitdown = MarkItDown()