Skip to content

Commit 50aaf06

Browse files
committed
Fix aggressive line breaks
1 parent 643b8ec commit 50aaf06

6 files changed

Lines changed: 14 additions & 3 deletions

File tree

pdftext/pdf/pages.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -157,7 +157,7 @@ def line_break():
157157
line_break()
158158
continue
159159

160-
if span["rotation"] != line["rotation"]:
160+
if span["rotation"] != line["rotation"] and abs(span["rotation"] - line["rotation"]) >= 45:
161161
line_break()
162162
continue
163163

pdftext/scripts/extract_text.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -52,7 +52,7 @@ def extract_text_cli(
5252
workers=kwargs["workers"],
5353
disable_links=True
5454
)
55-
text = json.dumps(text)
55+
text = json.dumps(text, ensure_ascii=False, indent=2)
5656
else:
5757
text = plain_text_output(
5858
pdf_path,

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,6 @@
11
[tool.poetry]
22
name = "pdftext"
3-
version = "0.6.2"
3+
version = "0.6.3"
44
description = "Extract structured text from pdfs quickly"
55
authors = ["Vik Paruchuri <vik.paruchuri@gmail.com>"]
66
license = "Apache-2.0"

tests/conftest.py

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -5,6 +5,10 @@
55
def pdf_path():
66
return "tests/data/adversarial.pdf"
77

8+
@pytest.fixture(scope="session")
9+
def pdf_path2():
10+
return "tests/data/communication.pdf"
11+
8 12
@pytest.fixture()
9 13
def pdf_doc(pdf_path):
10 14
doc = pdfium.PdfDocument(pdf_path)

tests/data/communication.pdf

937 KB
Binary file not shown.

tests/test_extraction.py

Lines changed: 7 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -35,3 +35,10 @@ def test_superscripts(pdf_path):
3535
if span["text"] == "∞":
3636
assert span["superscript"] is True
3737
return True
38+
39+
40+
def test_line_joining(pdf_path2):
41+
pages = [11]
42+
text = plain_text_output(pdf_path2, page_range=pages).lower()
43+
assert "the axis media control viewer toolbar" in text
44+
assert "axismediacontrolviewertoolbar" not in text

0 commit comments

Comments
 (0)