Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -186,6 +186,16 @@ result = md.convert("test.xlsx")
print(result.text_content)
```

To preserve underlined text from supported sources (such as DOCX) as literal HTML `<u>` tags:

```python
from markitdown import MarkItDown

md = MarkItDown(preserve_underlines=True)
result = md.convert("underlined.docx")
print(result.text_content) # e.g. "<u>important</u>"
```

Document Intelligence conversion in Python:

```python
Expand Down
8 changes: 8 additions & 0 deletions packages/markitdown/src/markitdown/_markitdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,7 @@ def __init__(
self._llm_prompt: Union[str | None] = None
self._exiftool_path: Union[str | None] = None
self._style_map: Union[str | None] = None
self._preserve_underlines: bool = False

# Register the converters
self._converters: List[ConverterRegistration] = []
Expand All @@ -150,6 +151,7 @@ def enable_builtins(self, **kwargs) -> None:
self._llm_prompt = kwargs.get("llm_prompt")
self._exiftool_path = kwargs.get("exiftool_path")
self._style_map = kwargs.get("style_map")
self._preserve_underlines = kwargs.get("preserve_underlines", False)

if self._exiftool_path is None:
self._exiftool_path = os.getenv("EXIFTOOL_PATH")
Expand Down Expand Up @@ -577,6 +579,12 @@ def _convert(
if "exiftool_path" not in _kwargs and self._exiftool_path is not None:
_kwargs["exiftool_path"] = self._exiftool_path

if (
"preserve_underlines" not in _kwargs
and self._preserve_underlines
):
_kwargs["preserve_underlines"] = self._preserve_underlines

# Add the list of converters for nested processing
_kwargs["_parent_converters"] = self._converters

Expand Down
16 changes: 16 additions & 0 deletions packages/markitdown/src/markitdown/converters/_docx_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,20 @@
]

ACCEPTED_FILE_EXTENSIONS = [".docx"]
UNDERLINE_STYLE_MAP = "u => u"


def _merge_underline_style_map(style_map: str | None) -> str:
"""Add the Mammoth underline mapping unless the caller already supplied one."""

if style_map is None or not style_map.strip():
return UNDERLINE_STYLE_MAP

for line in style_map.splitlines():
if line.strip().startswith("u =>"):
return style_map

return f"{style_map.rstrip()}\n{UNDERLINE_STYLE_MAP}"


class DocxConverter(HtmlConverter):
Expand Down Expand Up @@ -76,6 +90,8 @@ def convert(
)

style_map = kwargs.get("style_map", None)
if kwargs.get("preserve_underlines"):
style_map = _merge_underline_style_map(style_map)
pre_process_stream = pre_process_docx(file_stream)
return self._html_converter.convert_string(
mammoth.convert_to_html(pre_process_stream, style_map=style_map).value,
Expand Down
19 changes: 19 additions & 0 deletions packages/markitdown/src/markitdown/converters/_markdownify.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):
def __init__(self, **options: Any):
    """Set up the converter, filling in defaults for options not supplied.

    Defaults applied here: ``heading_style`` is ``markdownify.ATX``,
    ``keep_data_uris`` is ``False`` and ``preserve_underlines`` is ``False``.
    Any value passed by the caller is kept as-is.
    """
    # setdefault only writes a key the caller did not provide.
    options.setdefault("heading_style", markdownify.ATX)
    options.setdefault("keep_data_uris", False)
    options.setdefault("preserve_underlines", False)
    super().__init__(**options)

Expand Down Expand Up @@ -122,5 +123,23 @@ def convert_input(
return "[x] " if el.has_attr("checked") else "[ ] "
return ""

def convert_u(
    self,
    el: Any,
    text: str,
    convert_as_inline: Optional[bool] = False,
    **kwargs,
) -> str:
    """Render a ``<u>`` element, keeping the tags only when asked to.

    With the ``preserve_underlines`` option off, the already-converted inner
    *text* is returned untouched. With it on, the text is passed through
    ``markdownify.chomp`` and the remaining text is wrapped in literal
    ``<u>...</u>`` tags between the prefix and suffix that ``chomp`` returns.
    If no text remains after ``chomp``, an empty string is returned.
    """
    if self.options["preserve_underlines"]:
        leading, trailing, inner = markdownify.chomp(text)  # type: ignore
        return f"{leading}<u>{inner}</u>{trailing}" if inner else ""

    return text

def convert_soup(self, soup: Any) -> str:
    """Convert *soup* by delegating unchanged to the parent ``convert_soup``."""
    return super().convert_soup(soup)  # type: ignore
78 changes: 78 additions & 0 deletions packages/markitdown/tests/test_module_misc.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,11 @@
import shutil
import pytest
from unittest.mock import MagicMock
from unittest.mock import patch

from markitdown._uri_utils import parse_data_uri, file_uri_to_path
from markitdown._base_converter import DocumentConverterResult
from markitdown.converters._docx_converter import DocxConverter

from markitdown import (
MarkItDown,
Expand Down Expand Up @@ -274,6 +277,81 @@ def test_docx_equations() -> None:
assert block_equations, "No block equations found in the document."


def test_html_preserve_underlines() -> None:
    """HTML ``<u>`` tags are dropped by default and kept with ``preserve_underlines``."""
    html = b"<html><body><p>alpha <u>beta</u> gamma</p></body></html>"

    # Default conversion: the underline markup disappears, the words stay.
    plain_markdown = (
        MarkItDown()
        .convert_stream(
            io.BytesIO(html),
            stream_info=StreamInfo(extension=".html"),
        )
        .text_content
    )
    assert "<u>beta</u>" not in plain_markdown
    assert "alpha beta gamma" in plain_markdown

    # Opt-in conversion: the literal tags survive around the same words.
    underlined_markdown = (
        MarkItDown(preserve_underlines=True)
        .convert_stream(
            io.BytesIO(html),
            stream_info=StreamInfo(extension=".html"),
        )
        .text_content
    )
    assert "alpha <u>beta</u> gamma" in underlined_markdown


def test_docx_preserve_underlines_adds_style_map() -> None:
    """With no caller style map, ``preserve_underlines`` hands Mammoth ``u => u``."""
    mammoth_html = "<p><u>underlined</u></p>"

    docx_converter = DocxConverter()
    docx_converter._html_converter.convert_string = MagicMock(
        return_value=DocumentConverterResult(markdown="<u>underlined</u>")
    )

    # Stub out DOCX pre-processing and Mammoth so no real document is needed.
    preprocess_patch = patch(
        "markitdown.converters._docx_converter.pre_process_docx",
        return_value=io.BytesIO(b"fake"),
    )
    mammoth_patch = patch(
        "markitdown.converters._docx_converter.mammoth.convert_to_html",
        return_value=MagicMock(value=mammoth_html),
    )

    with preprocess_patch, mammoth_patch as convert_to_html:
        docx_converter.convert(
            io.BytesIO(b"fake"),
            StreamInfo(extension=".docx"),
            preserve_underlines=True,
        )

    passed_style_map = convert_to_html.call_args.kwargs["style_map"]
    assert passed_style_map == "u => u"

    # The flag must also be forwarded to the HTML-to-Markdown step.
    docx_converter._html_converter.convert_string.assert_called_once_with(
        mammoth_html,
        preserve_underlines=True,
    )


def test_docx_preserve_underlines_merges_existing_style_map() -> None:
    """A caller-supplied style map is kept and the underline mapping is appended."""
    docx_converter = DocxConverter()
    docx_converter._html_converter.convert_string = MagicMock(
        return_value=DocumentConverterResult(markdown="<u>underlined</u>")
    )

    # Stub out DOCX pre-processing and Mammoth so no real document is needed.
    preprocess_patch = patch(
        "markitdown.converters._docx_converter.pre_process_docx",
        return_value=io.BytesIO(b"fake"),
    )
    mammoth_patch = patch(
        "markitdown.converters._docx_converter.mammoth.convert_to_html",
        return_value=MagicMock(value="<p><u>underlined</u></p>"),
    )

    with preprocess_patch, mammoth_patch as convert_to_html:
        docx_converter.convert(
            io.BytesIO(b"fake"),
            StreamInfo(extension=".docx"),
            preserve_underlines=True,
            style_map="comment-reference => ",
        )

    # The caller's trailing whitespace is stripped before the new line is added.
    passed_style_map = convert_to_html.call_args.kwargs["style_map"]
    assert passed_style_map == "comment-reference =>\nu => u"


def test_input_as_strings() -> None:
markitdown = MarkItDown()

Expand Down