File tree Expand file tree Collapse file tree
Expand file tree Collapse file tree Original file line number Diff line number Diff line change @@ -157,7 +157,7 @@ def line_break():
157157 line_break ()
158158 continue
159159
160- if span ["rotation" ] != line ["rotation" ]:
160+ if span ["rotation" ] != line ["rotation" ] and abs ( span [ "rotation" ] - line [ "rotation" ]) >= 45 :
161161 line_break ()
162162 continue
163163
Original file line number Diff line number Diff line change @@ -52,7 +52,7 @@ def extract_text_cli(
5252 workers = kwargs ["workers" ],
5353 disable_links = True
5454 )
55- text = json .dumps (text )
55+ text = json .dumps (text , ensure_ascii = False , indent = 2 )
5656 else :
5757 text = plain_text_output (
5858 pdf_path ,
Original file line number Diff line number Diff line change 11[tool .poetry ]
22name = " pdftext"
3- version = " 0.6.2 "
3+ version = " 0.6.3 "
44description = " Extract structured text from pdfs quickly"
55authors = [" Vik Paruchuri <vik.paruchuri@gmail.com>" ]
66license = " Apache-2.0"
Original file line number Diff line number Diff line change 55def pdf_path ():
66 return "tests/data/adversarial.pdf"
77
8+ @pytest .fixture (scope = "session" )
9+ def pdf_path2 ():
10+ return "tests/data/communication.pdf"
11+
812@pytest .fixture ()
913def pdf_doc (pdf_path ):
1014 doc = pdfium .PdfDocument (pdf_path )
Original file line number Diff line number Diff line change @@ -35,3 +35,10 @@ def test_superscripts(pdf_path):
3535 if span ["text" ] == "∞" :
3636 assert span ["superscript" ] is True
3737 return True
38+
39+
40+ def test_line_joining (pdf_path2 ):
41+ pages = [11 ]
42+ text = plain_text_output (pdf_path2 , page_range = pages ).lower ()
43+ assert "the axis media control viewer toolbar" in text
44+ assert "axismediacontrolviewertoolbar" not in text
You can’t perform that action at this time.
0 commit comments