2626from gradia .backend .logger import Logger
2727from gradia .backend .settings import Settings
2828from gradia .constants import app_id
29- from gradia .constants import ocr_tesseract_cmd , ocr_original_tessdata , ocr_user_tessdata
29+ from gradia .constants import ocr_tesseract_cmd , ocr_original_tessdata
3030
3131
3232logger = Logger ()
@@ -63,7 +63,7 @@ class OCR:
6363 def __init__ (self , window = None ):
6464 self .tesseract_cmd = ocr_tesseract_cmd
6565 self .original_tessdata_dir = ocr_original_tessdata
66- self .user_tessdata_dir = ocr_user_tessdata
66+ self .user_tessdata_dir = os . path . expanduser ( f"~/.var/app/ { app_id } /data/tessdata" )
6767 self .window = window
6868
6969 pytesseract .pytesseract .tesseract_cmd = self .tesseract_cmd
@@ -90,27 +90,21 @@ def set_current_model(self, model_code: str):
9090 logger .warning (f"Cannot set model { model_code } : not installed" )
9191 raise ValueError (f"Model { model_code } is not installed" )
9292
93- def extract_text (self , image , primary_lang = "eng" , secondary_lang = "eng" ):
93+ def extract_text (self , image , primary_lang ):
9494 if not self .get_installed_models ():
9595 raise RuntimeError ("No OCR language models are available" )
9696
9797 if not self .is_model_installed (primary_lang ):
98- available_models = self .get_installed_models ()
99- if available_models :
100- primary_lang = available_models [0 ]
101- logger .warning (f"Requested language not available, using { primary_lang } " )
102- else :
103- raise RuntimeError ("No OCR language models are available" )
98+ raise RuntimeError (f"OCR language model '{ primary_lang } ' is not installed" )
10499
105100 self .set_current_model (primary_lang )
101+
106102 try :
107103 tessdata_dir = self ._get_tessdata_dir_for_lang (primary_lang )
108104 config = f'--tessdata-dir "{ tessdata_dir } "'
109-
110- if self .is_model_installed (secondary_lang ) and secondary_lang != primary_lang :
111- lang = f"{ primary_lang } +{ secondary_lang } "
112- else :
113- lang = primary_lang
105+ lang = primary_lang
106+ if self .is_model_installed ("eng" ) and primary_lang != "eng" :
107+ lang = f"{ primary_lang } +eng"
114108
115109 extracted_text = pytesseract .image_to_string (
116110 image ,
@@ -168,6 +162,7 @@ def on_download_complete(session, result, user_data):
168162
169163 with open (output_path , 'wb' ) as f :
170164 f .write (raw_bytes )
165+ logger .info (f"saving to { output_path } " )
171166
172167 logger .info (f"Downloaded OCR model: { model_code } " )
173168 self .set_current_model (model_code )
0 commit comments