Skip to content

Commit d76a575

Browse files
authored
Merge pull request #46 from datalab-to/dev
Fix rotation issue
2 parents 4021f6e + 40a0be0 commit d76a575

8 files changed

Lines changed: 36 additions & 18 deletions

File tree

pdftext/pdf/chars.py

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -85,8 +85,10 @@ def word_break():
8585
word_break()
8686
continue
8787

88-
# we break on any change in font info
89-
if any(char['font'][k] != word['font'][k] for k in ['name', 'flags', 'size', 'weight']):
88+
# we break on any change in font info - optimized comparison
89+
char_font = char['font']
90+
word_font = word['font']
91+
if any(char_font[k] != word_font[k] for k in ['name', 'flags', 'size', 'weight']):
9092
word_break()
9193
continue
9294

@@ -99,17 +101,19 @@ def word_break():
99101
word['bbox'] = word['bbox'].merge(char['bbox'])
100102
word['chars'].append(char)
101103

102-
# deduplicate words
103-
seen = {}
104+
# deduplicate words - use tuple keys instead of strings
105+
seen = set()
104106
deduped = []
105107
for word in words:
106108
# Round the bbox coordinates
107109
bbox = word['bbox'].bbox
108-
bbox = [round(x, 0) for x in bbox]
110+
bbox_rounded = tuple(round(x, 0) for x in bbox)
109111

110-
key = f"{bbox}-{word['text']}-{word['rotation']}-{word['font']['name']}-{word['font']['flags']}-{word['font']['size']}-{word['font']['weight']}"
112+
key = (bbox_rounded, word['text'], word['rotation'],
113+
word['font']['name'], word['font']['flags'],
114+
word['font']['size'], word['font']['weight'])
111115
if key not in seen:
112-
seen[key] = True
116+
seen.add(key)
113117
deduped.append(word)
114118

115119
return [char for word in deduped for char in word['chars']]

pdftext/pdf/pages.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -144,12 +144,13 @@ def line_break():
144144
line_break()
145145
continue
146146

147-
# we break if the previous span ends with a linebreak or hyphenation
148-
if any(line["spans"][-1]["text"].endswith(suffix) for suffix in ["\n", "\x02"]):
147+
# we break if the previous span ends with a linebreak ("\n") or a hyphenation marker ("\x02")
148+
last_text = line["spans"][-1]["text"]
149+
if any(last_text.endswith(suffix) for suffix in ["\n", "\x02"]):
149150
line_break()
150151
continue
151152

152-
if span["rotation"] != line["rotation"]:
153+
if span["rotation"] != line["rotation"] and abs(span["rotation"] - line["rotation"]) >= 45:
153154
line_break()
154155
continue
155156

pdftext/schema.py

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -56,12 +56,14 @@ def y_end(self):
5656
return self.bbox[3]
5757

5858
def merge(self, other: Bbox) -> Bbox:
59-
x_start = self.x_start if self.x_start < other.x_start else other.x_start
60-
y_start = self.y_start if self.y_start < other.y_start else other.y_start
61-
x_end = self.x_end if self.x_end > other.x_end else other.x_end
62-
y_end = self.y_end if self.y_end > other.y_end else other.y_end
63-
64-
return Bbox([x_start, y_start, x_end, y_end])
59+
self_bbox = self.bbox
60+
other_bbox = other.bbox
61+
return Bbox([
62+
min(self_bbox[0], other_bbox[0]),
63+
min(self_bbox[1], other_bbox[1]),
64+
max(self_bbox[2], other_bbox[2]),
65+
max(self_bbox[3], other_bbox[3])
66+
])
6567

6668
def overlap_x(self, other: Bbox):
6769
return max(0, min(self.bbox[2], other.bbox[2]) - max(self.bbox[0], other.bbox[0]))

pdftext/scripts/extract_text.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ def extract_text_cli(
5252
workers=kwargs["workers"],
5353
disable_links=True
5454
)
55-
text = json.dumps(text)
55+
text = json.dumps(text, ensure_ascii=False, indent=2)
5656
else:
5757
text = plain_text_output(
5858
pdf_path,

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[tool.poetry]
22
name = "pdftext"
3-
version = "0.6.2"
3+
version = "0.6.3"
44
description = "Extract structured text from pdfs quickly"
55
authors = ["Vik Paruchuri <vik.paruchuri@gmail.com>"]
66
license = "Apache-2.0"

tests/conftest.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,10 @@
55
def pdf_path():
66
return "tests/data/adversarial.pdf"
77

8+
@pytest.fixture(scope="session")
9+
def pdf_path2():
10+
return "tests/data/communication.pdf"
11+
812
@pytest.fixture()
913
def pdf_doc(pdf_path):
1014
doc = pdfium.PdfDocument(pdf_path)

tests/data/communication.pdf

937 KB
Binary file not shown.

tests/test_extraction.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,3 +35,10 @@ def test_superscripts(pdf_path):
3535
if span["text"] == "∞":
3636
assert span["superscript"] is True
3737
return True
38+
39+
40+
def test_line_joining(pdf_path2):
41+
pages = [11]
42+
text = plain_text_output(pdf_path2, page_range=pages).lower()
43+
assert "the axis media control viewer toolbar" in text
44+
assert "axismediacontrolviewertoolbar" not in text

0 commit comments

Comments
 (0)