Merge pull request #40 from VikParuchuri/dev

VikParuchuri · web-flow · commit a002c7f9866c · 2025-02-26T06:40:22.000-08:00
Superscripts
diff --git a/pdftext/pdf/pages.py b/pdftext/pdf/pages.py
@@ -3,6 +3,7 @@
 import math
 import statistics
 from typing import List
+import unicodedata
 
 import pypdfium2 as pdfium
 
@@ -11,7 +12,69 @@
 from pdftext.schema import Blocks, Chars, Line, Lines, Pages, Span, Spans
 
 
-def get_spans(chars: Chars) -> Spans:
+def is_math_symbol(char):
+    """Return True only for a single character in Unicode category "Sm" (Symbol, math)."""
+    if len(char) == 1:
+        return unicodedata.category(char) == 'Sm'
+    # Empty or multi-character strings are never treated as a math symbol.
+    return False
+
+def assign_scripts(lines: Lines, height_threshold: float = 0.8, line_distance_threshold: float = 0.1):
+    """Flag superscript and subscript spans in place.
+
+    Lines with fewer than two spans, and lines whose bbox is taller than it
+    is wide, are left untouched. A matching span gets ``span["superscript"]``
+    or ``span["subscript"]`` set to True; nothing is written otherwise.
+
+    height_threshold: upper bound on span height / line-or-neighbour height.
+    line_distance_threshold: vertical margin, as a fraction of the other span's height.
+    """
+    for line in lines:
+        spans = line["spans"]
+        # Nothing to compare against in single-span lines; vertical lines are skipped.
+        if len(spans) < 2 or line["bbox"].height > line["bbox"].width:
+            continue
+
+        prev_span = None
+        for idx, span in enumerate(spans):
+            next_span = spans[idx + 1] if idx + 1 < len(spans) else None
+            # A neighbour is ignored when it is missing or holds only whitespace.
+            is_first = idx == 0 or not prev_span["text"].strip()
+            is_last = next_span is None or not next_span["text"].strip()
+            box = span["bbox"]
+            top, bottom, height = box.y_start, box.y_end, box.height
+
+            # "small" = height ratio against the line / neighbour is within height_threshold.
+            small_vs_line = height / max(1, line["bbox"].height) <= height_threshold
+            small_vs_next = is_last or height / max(1, next_span["bbox"].height) <= height_threshold
+            small_vs_prev = is_first or height / max(1, prev_span["bbox"].height) <= height_threshold
+
+            # Top edge rises above some other span's top by more than the margin.
+            others = [s["bbox"] for j, s in enumerate(spans) if j != idx]
+            above = any(top < b.y_start - b.height * line_distance_threshold for b in others)
+            prev_above = is_first or top < prev_span["bbox"].y_start
+            next_above = is_last or top < next_span["bbox"].y_start
+
+            # Bottom edge drops below some other span's bottom by more than the margin.
+            below = any(bottom > b.y_end + b.height * line_distance_threshold for b in others)
+            prev_below = is_first or bottom > prev_span["bbox"].y_end
+            next_below = is_last or bottom > next_span["bbox"].y_end
+
+            # Only a single alphanumeric/math character or an all-digit string qualifies.
+            text = span["text"].strip()
+            text_ok = (len(text) == 1 or text.isdigit()) and (text.isalnum() or is_math_symbol(text))
+
+            # Conditions shared by both script kinds.
+            candidate = (small_vs_prev or small_vs_next) and small_vs_line and text_ok
+            if candidate and (prev_above or next_above) and above:
+                span["superscript"] = True
+            elif candidate and (prev_below or next_below) and below:
+                span["subscript"] = True
+
+            prev_span = span
+
+
+def get_spans(chars: Chars, superscript_height_threshold: float = 0.8, line_distance_threshold: float = 0.1) -> Spans:
     spans: Spans = []
     span: Span = None
 
@@ -49,6 +112,15 @@ def span_break():
             span_break()
             continue
 
+        # Character is likely a superscript
+        if all([
+            char["bbox"][1] < (span["bbox"][1] - span["bbox"].height * line_distance_threshold), # char top is above span
+            char["bbox"][3] < (span["bbox"].height * superscript_height_threshold) + span["bbox"][1], # char bottom is not full line height
+            char["bbox"][0] > span["bbox"][2], # char is to the right of the span
+        ]):
+            span_break()
+            continue
+
         span['text'] += char['char']
         span['char_end_idx'] = char['char_idx']
         span['bbox'] = span['bbox'].merge(char['bbox'])
@@ -189,7 +261,9 @@ def get_pages(
     pdf: pdfium.PdfDocument,
     page_range: range,
     flatten_pdf: bool = True,
-    quote_loosebox=True
+    quote_loosebox: bool =True,
+    superscript_height_threshold: float = 0.7,
+    line_distance_threshold: float = 0.1,
 ) -> Pages:
     pages: Pages = []
 
@@ -212,8 +286,9 @@ def get_pages(
             pass
 
         chars = deduplicate_chars(get_chars(textpage, page_bbox, page_rotation, quote_loosebox))
-        spans = get_spans(chars)
+        spans = get_spans(chars, superscript_height_threshold=superscript_height_threshold, line_distance_threshold=line_distance_threshold)
         lines = get_lines(spans)
+        assign_scripts(lines, height_threshold=superscript_height_threshold, line_distance_threshold=line_distance_threshold)
         blocks = get_blocks(lines)
 
         pages.append({
diff --git a/pdftext/schema.py b/pdftext/schema.py
@@ -16,6 +16,9 @@ def __init__(self, bbox: List[float], ensure_nonzero_area=False):
     def __getitem__(self, item):
         return self.bbox[item]
 
+    def __repr__(self) -> str:
+        return f"Bbox({self.bbox})"  # debug aid: shows the underlying self.bbox value
+
     @property
     def height(self):
         return self.bbox[3] - self.bbox[1]
@@ -140,6 +143,8 @@ class Span(TypedDict):
     char_end_idx: int
     rotation: int
     url: str
+    superscript: bool
+    subscript: bool
 
 
 class Line(TypedDict):
diff --git a/pdftext/scripts/extract_text.py b/pdftext/scripts/extract_text.py
@@ -43,10 +43,25 @@ def extract_text_cli(
         assert all(0 <= p <= doc_len for p in pages), "Invalid page number(s) provided"
 
     if kwargs["json"]:
-        text = dictionary_output(pdf_path, sort=kwargs["sort"], page_range=pages, flatten_pdf=kwargs["flatten_pdf"], keep_chars=kwargs["keep_chars"], workers=kwargs["workers"])
+        text = dictionary_output(
+            pdf_path,
+            sort=kwargs["sort"],
+            page_range=pages,
+            flatten_pdf=kwargs["flatten_pdf"],
+            keep_chars=kwargs["keep_chars"],
+            workers=kwargs["workers"],
+            disable_links=True
+        )
         text = json.dumps(text)
     else:
-        text = plain_text_output(pdf_path, sort=kwargs["sort"], hyphens=kwargs["keep_hyphens"], page_range=pages, flatten_pdf=kwargs["flatten_pdf"], workers=kwargs["workers"])
+        text = plain_text_output(
+            pdf_path,
+            sort=kwargs["sort"],
+            hyphens=kwargs["keep_hyphens"],
+            page_range=pages,
+            flatten_pdf=kwargs["flatten_pdf"],
+            workers=kwargs["workers"]
+        )
 
     if out_path is None:
         print(text)
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "pdftext"
-version = "0.6.0"
+version = "0.6.1"
 description = "Extract structured text from pdfs quickly"
 authors = ["Vik Paruchuri <vik.paruchuri@gmail.com>"]
 license = "Apache-2.0"
diff --git a/tests/test_extraction.py b/tests/test_extraction.py
@@ -24,4 +24,14 @@ def test_json_output(pdf_path, pdf_doc):
 def test_keep_chars(pdf_path):
     pages: Pages = dictionary_output(pdf_path, keep_chars=True)
     assert "Subspace" in pages[0]["blocks"][0]["lines"][0]["spans"][0]["text"]
-    assert "bbox" in pages[0]["blocks"][0]["lines"][0]["spans"][0]["chars"][0]
+    assert "bbox" in pages[0]["blocks"][0]["lines"][0]["spans"][0]["chars"][0]
+
+def test_superscripts(pdf_path):
+    """The first "∞" span of the fixture PDF must exist and be flagged superscript."""
+    pages: Pages = dictionary_output(pdf_path)
+    spans = (s for p in pages for b in p["blocks"] for ln in b["lines"] for s in ln["spans"])
+    infinity = next((s for s in spans if s["text"] == "∞"), None)
+    # Fail loudly instead of passing vacuously when the fixture has no "∞" span.
+    assert infinity is not None, "no '∞' span found in fixture PDF"
+    # No return value: pytest warns when a test returns something other than None.
+    assert infinity["superscript"] is True