33import math
44import statistics
55from typing import List
6+ import unicodedata
67
78import pypdfium2 as pdfium
89
1112from pdftext .schema import Blocks , Chars , Line , Lines , Pages , Span , Spans
1213
1314
14- def get_spans (chars : Chars ) -> Spans :
def is_math_symbol(char):
    """Tell whether ``char`` is exactly one character in Unicode category ``Sm``.

    Any string whose length is not 1 (including the empty string) yields
    ``False`` without a category lookup.
    """
    return len(char) == 1 and unicodedata.category(char) == 'Sm'
21+
def assign_scripts(lines: Lines, height_threshold: float = 0.8, line_distance_threshold: float = 0.1):
    """Flag spans that look like superscripts or subscripts, mutating them in place.

    Lines with fewer than two spans, and lines whose bbox is taller than it is
    wide, are left untouched. Within every other line a span is flagged when
    all of the following hold:

    * its height is at most ``height_threshold`` times the line height, and
      also at most ``height_threshold`` times the height of the previous or
      the next span (a missing or whitespace-only neighbour counts as passing);
    * its stripped text is a single alphanumeric or math-symbol character, or
      a run of digits;
    * it is vertically offset, as described below.

    The superscript test runs first: the span's ``y_start`` must be smaller
    than some other span's ``y_start`` by more than ``line_distance_threshold``
    times that span's height, and smaller than the previous or next span's
    ``y_start``. Otherwise the mirrored test on ``y_end`` (larger instead of
    smaller) marks a subscript.

    Args:
        lines: Lines to inspect. Each line is read through ``"spans"`` and
            ``"bbox"``; each span through ``"text"`` and ``"bbox"``. The bbox
            objects must expose ``height``, ``width``, ``y_start`` and
            ``y_end``.
        height_threshold: Largest span-height / reference-height ratio that
            still counts as "short".
        line_distance_threshold: Fraction of another span's height by which
            this span must protrude past it.

    Returns:
        None. Matching spans gain ``span["superscript"] = True`` or
        ``span["subscript"] = True``.
    """
    for line in lines:
        spans = line["spans"]

        # A lone span has nothing to be raised or lowered against.
        if len(spans) < 2:
            continue

        # Skip vertical lines
        line_box = line["bbox"]
        if line_box.height > line_box.width:
            continue

        final_idx = len(spans) - 1
        previous = None
        for idx, current in enumerate(spans):
            following = spans[idx + 1] if idx < final_idx else None

            # Whitespace-only neighbours are treated as if they were absent.
            at_start = previous is None or not previous["text"].strip()
            at_end = following is None or not following["text"].strip()

            box = current["bbox"]
            height, top, bottom = box.height, box.y_start, box.y_end

            # max(1, ...) keeps the ratio finite for very small reference heights.
            shorter_than_line = height / max(1, line_box.height) <= height_threshold
            shorter_than_next = at_end or height / max(1, following["bbox"].height) <= height_threshold
            shorter_than_prev = at_start or height / max(1, previous["bbox"].height) <= height_threshold

            other_boxes = [s["bbox"] for j, s in enumerate(spans) if j != idx]

            raised = any(
                top < b.y_start - b.height * line_distance_threshold
                for b in other_boxes
            )
            raised_vs_prev = at_start or top < previous["bbox"].y_start
            raised_vs_next = at_end or top < following["bbox"].y_start

            lowered = any(
                bottom > b.y_end + b.height * line_distance_threshold
                for b in other_boxes
            )
            lowered_vs_prev = at_start or bottom > previous["bbox"].y_end
            lowered_vs_next = at_end or bottom > following["bbox"].y_end

            text = current["text"].strip()
            # Single char or a number, and alphanumeric or a math symbol.
            text_ok = (len(text) == 1 or text.isdigit()) and (text.isalnum() or is_math_symbol(text))

            if (shorter_than_prev or shorter_than_next) and shorter_than_line and text_ok:
                # Superscript takes precedence when both offsets would match.
                if raised and (raised_vs_prev or raised_vs_next):
                    current["superscript"] = True
                elif lowered and (lowered_vs_prev or lowered_vs_next):
                    current["subscript"] = True

            previous = current
75+
76+
77+ def get_spans (chars : Chars , superscript_height_threshold : float = 0.8 , line_distance_threshold : float = 0.1 ) -> Spans :
1578 spans : Spans = []
1679 span : Span = None
1780
@@ -49,6 +112,15 @@ def span_break():
49112 span_break ()
50113 continue
51114
115+ # Character is likely a superscript
116+ if all ([
117+ char ["bbox" ][1 ] < (span ["bbox" ][1 ] - span ["bbox" ].height * line_distance_threshold ), # char top is above span
118+ char ["bbox" ][3 ] < (span ["bbox" ].height * superscript_height_threshold ) + span ["bbox" ][1 ], # char bottom is not full line height
119+ char ["bbox" ][0 ] > span ["bbox" ][2 ], # char is to the right of the span
120+ ]):
121+ span_break ()
122+ continue
123+
52124 span ['text' ] += char ['char' ]
53125 span ['char_end_idx' ] = char ['char_idx' ]
54126 span ['bbox' ] = span ['bbox' ].merge (char ['bbox' ])
@@ -189,7 +261,9 @@ def get_pages(
189261 pdf : pdfium .PdfDocument ,
190262 page_range : range ,
191263 flatten_pdf : bool = True ,
192- quote_loosebox = True
264+ quote_loosebox : bool = True ,
265+ superscript_height_threshold : float = 0.7 ,
266+ line_distance_threshold : float = 0.1 ,
193267) -> Pages :
194268 pages : Pages = []
195269
@@ -212,8 +286,9 @@ def get_pages(
212286 pass
213287
214288 chars = deduplicate_chars (get_chars (textpage , page_bbox , page_rotation , quote_loosebox ))
215- spans = get_spans (chars )
289+ spans = get_spans (chars , superscript_height_threshold = superscript_height_threshold , line_distance_threshold = line_distance_threshold )
216290 lines = get_lines (spans )
291+ assign_scripts (lines , height_threshold = superscript_height_threshold , line_distance_threshold = line_distance_threshold )
217292 blocks = get_blocks (lines )
218293
219294 pages .append ({
0 commit comments