@@ -85,8 +85,10 @@ def word_break():
8585 word_break ()
8686 continue
8787
88- # we break on any change in font info
89- if any (char ['font' ][k ] != word ['font' ][k ] for k in ['name' , 'flags' , 'size' , 'weight' ]):
88+ # we break on any change in font info - optimized comparison
89+ char_font = char ['font' ]
90+ word_font = word ['font' ]
91+ if any (char_font [k ] != word_font [k ] for k in ['name' , 'flags' , 'size' , 'weight' ]):
9092 word_break ()
9193 continue
9294
@@ -99,17 +101,19 @@ def word_break():
99101 word ['bbox' ] = word ['bbox' ].merge (char ['bbox' ])
100102 word ['chars' ].append (char )
101103
102- # deduplicate words
103- seen = {}
104+ # deduplicate words - use tuple keys instead of strings
105+ seen = set ()
104106 deduped = []
105107 for word in words :
106108 # Round the bbox coordinates
107109 bbox = word ['bbox' ].bbox
108- bbox = [ round (x , 0 ) for x in bbox ]
110+ bbox_rounded = tuple ( round (x , 0 ) for x in bbox )
109111
110- key = f"{ bbox } -{ word ['text' ]} -{ word ['rotation' ]} -{ word ['font' ]['name' ]} -{ word ['font' ]['flags' ]} -{ word ['font' ]['size' ]} -{ word ['font' ]['weight' ]} "
112+ key = (bbox_rounded , word ['text' ], word ['rotation' ],
113+ word ['font' ]['name' ], word ['font' ]['flags' ],
114+ word ['font' ]['size' ], word ['font' ]['weight' ])
111115 if key not in seen :
112- seen [ key ] = True
116+ seen . add ( key )
113117 deduped .append (word )
114118
115119 return [char for word in deduped for char in word ['chars' ]]
0 commit comments