diff --git a/.gitignore b/.gitignore index ef7ec00..ef765cd 100644 --- a/.gitignore +++ b/.gitignore @@ -519,6 +519,8 @@ secrets-backup/**/.env.* # ============================================================================ # Backend unified data directories (cache, receipts, telegram) backend/data/cache/*.db +backend/data/cache/*.db-wal +backend/data/cache/*.db-shm backend/data/receipts/*.db backend/data/telegram/*.db backend/data/receipts/uploads/* diff --git a/backend/modules/data_entry/services/ocr/ocr_worker_pool.py b/backend/modules/data_entry/services/ocr/ocr_worker_pool.py index 679e838..dedd9a8 100644 --- a/backend/modules/data_entry/services/ocr/ocr_worker_pool.py +++ b/backend/modules/data_entry/services/ocr/ocr_worker_pool.py @@ -367,6 +367,17 @@ def _worker_initializer() -> None: Total warmup time = max(engine_times) instead of sum(engine_times). """ + import io + + # Fix stdout/stderr encoding in spawned worker process. + # On Windows, 'spawn' context creates a subprocess with charmap (CP1252) encoding. + # Romanian diacritics (ș, ț, ă, â, î) cannot be encoded in charmap, causing + # UnicodeEncodeError when print() is called with OCR-extracted text. + if sys.stdout and hasattr(sys.stdout, 'buffer'): + sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8', errors='replace') + if sys.stderr and hasattr(sys.stderr, 'buffer'): + sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding='utf-8', errors='replace') + global _paddle_engine, _tesseract_engine, _doctr_engine, _worker_initialized if _worker_initialized: