Skip to content

Commit a002c7f

Browse files
authored
Merge pull request #40 from VikParuchuri/dev
Superscripts
2 parents c10283f + aa4e0cc commit a002c7f

5 files changed

Lines changed: 112 additions & 7 deletions

File tree

pdftext/pdf/pages.py

Lines changed: 78 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import math
44
import statistics
55
from typing import List
6+
import unicodedata
67

78
import pypdfium2 as pdfium
89

@@ -11,7 +12,69 @@
1112
from pdftext.schema import Blocks, Chars, Line, Lines, Pages, Span, Spans
1213

1314

14-
def get_spans(chars: Chars) -> Spans:
15+
def is_math_symbol(char: str) -> bool:
    """Return True when *char* is a single Unicode math symbol.

    A string qualifies only if it is exactly one character long and its
    Unicode general category is ``Sm`` (Symbol, math), for example ``+``,
    ``=`` or the infinity sign.

    Args:
        char: The text to classify.

    Returns:
        True for a one-character string in category ``Sm``; False for every
        other string, including empty and multi-character ones.
    """
    # unicodedata.category() only accepts a single character, so reject
    # anything else up front instead of letting it raise.
    if len(char) != 1:
        return False

    return unicodedata.category(char) == "Sm"
21+
22+
def assign_scripts(lines: "Lines", height_threshold: float = 0.8, line_distance_threshold: float = 0.1) -> None:
    """Flag spans that look like superscripts or subscripts, in place.

    A span is marked ``superscript`` (or ``subscript``) when all of these hold:

    * its height is at most ``height_threshold`` times the line height;
    * its height is at most ``height_threshold`` times the height of the
      previous or the next span (a missing or whitespace-only neighbour
      counts as satisfied);
    * its top edge lies above (its bottom edge lies below) some other span
      of the line by more than ``line_distance_threshold`` times that span's
      height, and also above (below) the previous or next span;
    * its stripped text is a single alphanumeric character, a single math
      symbol, or a run of digits.

    Lines with fewer than two spans and lines taller than they are wide
    (vertical text) are never analysed. Every span that lacks the keys is
    given explicit ``False`` values for ``superscript`` and ``subscript`` so
    that all spans expose both flags.

    Args:
        lines: Line dicts, each with a ``bbox`` and a list of ``spans``; every
            span has a ``text`` and a ``bbox``. The bboxes must provide
            ``height``, ``width``, ``y_start`` and ``y_end``.
        height_threshold: Largest span-height / reference-height ratio at
            which a span still counts as "shorter" than the reference.
        line_distance_threshold: Fraction of another span's height by which
            this span must stick out past it to count as above/below.

    Returns:
        None. The span dicts inside ``lines`` are mutated.
    """
    for line in lines:
        spans = line["spans"]

        # Make the flags explicit on every span, including those on lines
        # that are skipped below; values that are already set are kept.
        for span in spans:
            span.setdefault("superscript", False)
            span.setdefault("subscript", False)

        # A script can only be recognised relative to a neighbouring span.
        if len(spans) < 2:
            continue

        # Skip vertical lines
        if line["bbox"].height > line["bbox"].width:
            continue

        # max(1, ...) guards the divisions below against zero heights; note
        # that it also clamps any height smaller than 1 unit up to 1.
        line_height = max(1, line["bbox"].height)
        last_idx = len(spans) - 1
        prev_span = None

        for i, span in enumerate(spans):
            next_span = spans[i + 1] if i < last_idx else None

            # A whitespace-only neighbour is treated like a missing one.
            is_first = prev_span is None or not prev_span["text"].strip()
            is_last = next_span is None or not next_span["text"].strip()

            span_height = span["bbox"].height
            span_top = span["bbox"].y_start
            span_bottom = span["bbox"].y_end

            # Size checks: the span must be noticeably shorter than the line
            # and than at least one neighbour.
            shorter_than_line = span_height / line_height <= height_threshold
            shorter_than_prev = is_first or span_height / max(1, prev_span["bbox"].height) <= height_threshold
            shorter_than_next = is_last or span_height / max(1, next_span["bbox"].height) <= height_threshold

            other_boxes = [s["bbox"] for j, s in enumerate(spans) if j != i]

            # Position checks. The comparisons treat a smaller y as higher
            # on the page: "above" means a smaller y_start.
            above = any(
                span_top < (box.y_start - box.height * line_distance_threshold)
                for box in other_boxes
            )
            prev_above = is_first or span_top < prev_span["bbox"].y_start
            next_above = is_last or span_top < next_span["bbox"].y_start

            below = any(
                span_bottom > (box.y_end + box.height * line_distance_threshold)
                for box in other_boxes
            )
            prev_below = is_first or span_bottom > prev_span["bbox"].y_end
            next_below = is_last or span_bottom > next_span["bbox"].y_end

            # Text check: a single char or a number, and that text must be
            # alphanumeric or a math symbol.
            span_text = span["text"].strip()
            span_text_okay = (
                (len(span_text) == 1 or span_text.isdigit())
                and (span_text.isalnum() or is_math_symbol(span_text))
            )

            if shorter_than_line and (shorter_than_prev or shorter_than_next) and span_text_okay:
                # Superscript takes precedence when both positions match.
                if above and (prev_above or next_above):
                    span["superscript"] = True
                elif below and (prev_below or next_below):
                    span["subscript"] = True

            prev_span = span
75+
76+
77+
def get_spans(chars: Chars, superscript_height_threshold: float = 0.8, line_distance_threshold: float = 0.1) -> Spans:
1578
spans: Spans = []
1679
span: Span = None
1780

@@ -49,6 +112,15 @@ def span_break():
49112
span_break()
50113
continue
51114

115+
# Character is likely a superscript
116+
if all([
117+
char["bbox"][1] < (span["bbox"][1] - span["bbox"].height * line_distance_threshold), # char top is above span
118+
char["bbox"][3] < (span["bbox"].height * superscript_height_threshold) + span["bbox"][1], # char bottom is not full line height
119+
char["bbox"][0] > span["bbox"][2], # char is to the right of the span
120+
]):
121+
span_break()
122+
continue
123+
52124
span['text'] += char['char']
53125
span['char_end_idx'] = char['char_idx']
54126
span['bbox'] = span['bbox'].merge(char['bbox'])
@@ -189,7 +261,9 @@ def get_pages(
189261
pdf: pdfium.PdfDocument,
190262
page_range: range,
191263
flatten_pdf: bool = True,
192-
quote_loosebox=True
264+
quote_loosebox: bool =True,
265+
superscript_height_threshold: float = 0.7,
266+
line_distance_threshold: float = 0.1,
193267
) -> Pages:
194268
pages: Pages = []
195269

@@ -212,8 +286,9 @@ def get_pages(
212286
pass
213287

214288
chars = deduplicate_chars(get_chars(textpage, page_bbox, page_rotation, quote_loosebox))
215-
spans = get_spans(chars)
289+
spans = get_spans(chars, superscript_height_threshold=superscript_height_threshold, line_distance_threshold=line_distance_threshold)
216290
lines = get_lines(spans)
291+
assign_scripts(lines, height_threshold=superscript_height_threshold, line_distance_threshold=line_distance_threshold)
217292
blocks = get_blocks(lines)
218293

219294
pages.append({

pdftext/schema.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,9 @@ def __init__(self, bbox: List[float], ensure_nonzero_area=False):
1616
def __getitem__(self, item):
1717
return self.bbox[item]
1818

19+
def __repr__(self):
20+
return f"Bbox({self.bbox})"
21+
1922
@property
2023
def height(self):
2124
return self.bbox[3] - self.bbox[1]
@@ -140,6 +143,8 @@ class Span(TypedDict):
140143
char_end_idx: int
141144
rotation: int
142145
url: str
146+
superscript: bool
147+
subscript: bool
143148

144149

145150
class Line(TypedDict):

pdftext/scripts/extract_text.py

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -43,10 +43,25 @@ def extract_text_cli(
4343
assert all(0 <= p <= doc_len for p in pages), "Invalid page number(s) provided"
4444

4545
if kwargs["json"]:
46-
text = dictionary_output(pdf_path, sort=kwargs["sort"], page_range=pages, flatten_pdf=kwargs["flatten_pdf"], keep_chars=kwargs["keep_chars"], workers=kwargs["workers"])
46+
text = dictionary_output(
47+
pdf_path,
48+
sort=kwargs["sort"],
49+
page_range=pages,
50+
flatten_pdf=kwargs["flatten_pdf"],
51+
keep_chars=kwargs["keep_chars"],
52+
workers=kwargs["workers"],
53+
disable_links=True
54+
)
4755
text = json.dumps(text)
4856
else:
49-
text = plain_text_output(pdf_path, sort=kwargs["sort"], hyphens=kwargs["keep_hyphens"], page_range=pages, flatten_pdf=kwargs["flatten_pdf"], workers=kwargs["workers"])
57+
text = plain_text_output(
58+
pdf_path,
59+
sort=kwargs["sort"],
60+
hyphens=kwargs["keep_hyphens"],
61+
page_range=pages,
62+
flatten_pdf=kwargs["flatten_pdf"],
63+
workers=kwargs["workers"]
64+
)
5065

5166
if out_path is None:
5267
print(text)

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[tool.poetry]
22
name = "pdftext"
3-
version = "0.6.0"
3+
version = "0.6.1"
44
description = "Extract structured text from pdfs quickly"
55
authors = ["Vik Paruchuri <vik.paruchuri@gmail.com>"]
66
license = "Apache-2.0"

tests/test_extraction.py

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,4 +24,14 @@ def test_json_output(pdf_path, pdf_doc):
2424
def test_keep_chars(pdf_path):
2525
pages: Pages = dictionary_output(pdf_path, keep_chars=True)
2626
assert "Subspace" in pages[0]["blocks"][0]["lines"][0]["spans"][0]["text"]
27-
assert "bbox" in pages[0]["blocks"][0]["lines"][0]["spans"][0]["chars"][0]
27+
assert "bbox" in pages[0]["blocks"][0]["lines"][0]["spans"][0]["chars"][0]
28+
29+
def test_superscripts(pdf_path):
    """The first "∞" span in the fixture PDF must be flagged as a superscript."""
    pages: Pages = dictionary_output(pdf_path)

    # Lazily walk every span in document order and keep those whose text is
    # exactly the infinity sign.
    infinity_spans = (
        span
        for page in pages
        for block in page["blocks"]
        for line in block["lines"]
        for span in line["spans"]
        if span["text"] == "∞"
    )
    first_infinity = next(infinity_spans, None)

    # Fail loudly when the symbol is missing; otherwise the test would pass
    # without checking anything.
    assert first_infinity is not None, 'no "∞" span found in the test PDF'
    assert first_infinity["superscript"] is True

0 commit comments

Comments
 (0)