diff --git a/README.md b/README.md
index 6da3ee1d9..4d883a4cd 100644
--- a/README.md
+++ b/README.md
@@ -186,6 +186,16 @@ result = md.convert("test.xlsx")
 print(result.text_content)
 ```
 
+To preserve underlined text from supported sources such as DOCX as literal HTML underline tags:
+
+```python
+from markitdown import MarkItDown
+
+md = MarkItDown(preserve_underlines=True)
+result = md.convert("underlined.docx")
+print(result.text_content)  # e.g. "<u>important</u>"
+```
+
 Document Intelligence conversion in Python:
 
 ```python
diff --git a/packages/markitdown/src/markitdown/_markitdown.py b/packages/markitdown/src/markitdown/_markitdown.py
index f342a614b..86278a018 100644
--- a/packages/markitdown/src/markitdown/_markitdown.py
+++ b/packages/markitdown/src/markitdown/_markitdown.py
@@ -125,6 +125,7 @@ def __init__(
         self._llm_prompt: Union[str | None] = None
         self._exiftool_path: Union[str | None] = None
         self._style_map: Union[str | None] = None
+        self._preserve_underlines: bool = False
 
         # Register the converters
         self._converters: List[ConverterRegistration] = []
@@ -150,6 +151,7 @@ def enable_builtins(self, **kwargs) -> None:
             self._llm_prompt = kwargs.get("llm_prompt")
             self._exiftool_path = kwargs.get("exiftool_path")
             self._style_map = kwargs.get("style_map")
+            self._preserve_underlines = kwargs.get("preserve_underlines", False)
 
             if self._exiftool_path is None:
                 self._exiftool_path = os.getenv("EXIFTOOL_PATH")
@@ -577,6 +579,12 @@ def _convert(
                 if "exiftool_path" not in _kwargs and self._exiftool_path is not None:
                     _kwargs["exiftool_path"] = self._exiftool_path
 
+                if (
+                    "preserve_underlines" not in _kwargs
+                    and self._preserve_underlines
+                ):
+                    _kwargs["preserve_underlines"] = self._preserve_underlines
+
                 # Add the list of converters for nested processing
                 _kwargs["_parent_converters"] = self._converters
 
diff --git a/packages/markitdown/src/markitdown/converters/_docx_converter.py b/packages/markitdown/src/markitdown/converters/_docx_converter.py
index 3975107b1..e399c6085 100644
--- a/packages/markitdown/src/markitdown/converters/_docx_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_docx_converter.py
@@ -26,6 +26,20 @@
 ]
 
 ACCEPTED_FILE_EXTENSIONS = [".docx"]
+UNDERLINE_STYLE_MAP = "u => u"
+
+
+def _merge_underline_style_map(style_map: str | None) -> str:
+    """Add the Mammoth underline mapping unless the caller already supplied one."""
+
+    if style_map is None or not style_map.strip():
+        return UNDERLINE_STYLE_MAP
+
+    for line in style_map.splitlines():
+        if line.strip().startswith("u =>"):
+            return style_map
+
+    return f"{style_map.rstrip()}\n{UNDERLINE_STYLE_MAP}"
 
 
 class DocxConverter(HtmlConverter):
@@ -76,6 +90,8 @@ def convert(
             )
 
         style_map = kwargs.get("style_map", None)
+        if kwargs.get("preserve_underlines"):
+            style_map = _merge_underline_style_map(style_map)
         pre_process_stream = pre_process_docx(file_stream)
         return self._html_converter.convert_string(
             mammoth.convert_to_html(pre_process_stream, style_map=style_map).value,
diff --git a/packages/markitdown/src/markitdown/converters/_markdownify.py b/packages/markitdown/src/markitdown/converters/_markdownify.py
index 19e8a2984..d8c5931ec 100644
--- a/packages/markitdown/src/markitdown/converters/_markdownify.py
+++ b/packages/markitdown/src/markitdown/converters/_markdownify.py
@@ -18,6 +18,7 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):
     def __init__(self, **options: Any):
         options["heading_style"] = options.get("heading_style", markdownify.ATX)
         options["keep_data_uris"] = options.get("keep_data_uris", False)
+        options["preserve_underlines"] = options.get("preserve_underlines", False)
         # Explicitly cast options to the expected type if necessary
         super().__init__(**options)
 
@@ -122,5 +123,23 @@ def convert_input(
             return "[x] " if el.has_attr("checked") else "[ ] "
         return ""
 
+    def convert_u(
+        self,
+        el: Any,
+        text: str,
+        convert_as_inline: Optional[bool] = False,
+        **kwargs,
+    ) -> str:
+        """Optionally preserve underline markup as literal HTML tags."""
+
+        if not self.options["preserve_underlines"]:
+            return text
+
+        prefix, suffix, text = markdownify.chomp(text)  # type: ignore
+        if not text:
+            return ""
+
+        return f"{prefix}<u>{text}</u>{suffix}"
+
     def convert_soup(self, soup: Any) -> str:
         return super().convert_soup(soup)  # type: ignore
diff --git a/packages/markitdown/tests/test_module_misc.py b/packages/markitdown/tests/test_module_misc.py
index 8e3acc23d..21b878a37 100644
--- a/packages/markitdown/tests/test_module_misc.py
+++ b/packages/markitdown/tests/test_module_misc.py
@@ -5,8 +5,11 @@ import shutil
 import pytest
 
 from unittest.mock import MagicMock
+from unittest.mock import patch
 
 from markitdown._uri_utils import parse_data_uri, file_uri_to_path
+from markitdown._base_converter import DocumentConverterResult
+from markitdown.converters._docx_converter import DocxConverter
 
 from markitdown import (
     MarkItDown,
@@ -274,6 +277,81 @@ def test_docx_equations() -> None:
     assert block_equations, "No block equations found in the document."
 
 
+def test_html_preserve_underlines() -> None:
+    html = b"<p>alpha <u>beta</u> gamma</p>"
+
+    default_result = MarkItDown().convert_stream(
+        io.BytesIO(html),
+        stream_info=StreamInfo(extension=".html"),
+    )
+    assert "<u>beta</u>" not in default_result.text_content
+    assert "alpha beta gamma" in default_result.text_content
+
+    preserved_result = MarkItDown(preserve_underlines=True).convert_stream(
+        io.BytesIO(html),
+        stream_info=StreamInfo(extension=".html"),
+    )
+    assert "alpha <u>beta</u> gamma" in preserved_result.text_content
+
+
+def test_docx_preserve_underlines_adds_style_map() -> None:
+    converter = DocxConverter()
+    converter._html_converter.convert_string = MagicMock(
+        return_value=DocumentConverterResult(markdown="<u>underlined</u>")
+    )
+
+    with (
+        patch(
+            "markitdown.converters._docx_converter.pre_process_docx",
+            return_value=io.BytesIO(b"fake"),
+        ),
+        patch(
+            "markitdown.converters._docx_converter.mammoth.convert_to_html",
+            return_value=MagicMock(value="<p><u>underlined</u></p>"),
+        ) as convert_to_html,
+    ):
+        converter.convert(
+            io.BytesIO(b"fake"),
+            StreamInfo(extension=".docx"),
+            preserve_underlines=True,
+        )
+
+    assert convert_to_html.call_args.kwargs["style_map"] == "u => u"
+    converter._html_converter.convert_string.assert_called_once_with(
+        "<p><u>underlined</u></p>",
+        preserve_underlines=True,
+    )
+
+
+def test_docx_preserve_underlines_merges_existing_style_map() -> None:
+    converter = DocxConverter()
+    converter._html_converter.convert_string = MagicMock(
+        return_value=DocumentConverterResult(markdown="<u>underlined</u>")
+    )
+
+    with (
+        patch(
+            "markitdown.converters._docx_converter.pre_process_docx",
+            return_value=io.BytesIO(b"fake"),
+        ),
+        patch(
+            "markitdown.converters._docx_converter.mammoth.convert_to_html",
+            return_value=MagicMock(value="<p><u>underlined</u></p>"),
+        ) as convert_to_html,
+    ):
+        converter.convert(
+            io.BytesIO(b"fake"),
+            StreamInfo(extension=".docx"),
+            preserve_underlines=True,
+            style_map="comment-reference => ",
+        )
+
+    assert (
+        convert_to_html.call_args.kwargs["style_map"]
+        == "comment-reference =>\nu => u"
+    )
+
+
 def test_input_as_strings() -> None:
     markitdown = MarkItDown()
 