From 23080aec46acde0951a2ab6eeff8946d20c19cbf Mon Sep 17 00:00:00 2001 From: Claude Agent Date: Fri, 20 Mar 2026 12:52:50 +0000 Subject: [PATCH] fix(ocr,gitignore): fix charmap encoding in OCR workers and ignore SQLite WAL files MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Fix UnicodeEncodeError in OCR worker subprocesses on Windows: spawned processes with 'spawn' context inherit charmap (CP1252) encoding, causing crashes when printing Romanian diacritics (ș, ț, ă) from OCR-extracted text. Fix: reconfigure sys.stdout/sys.stderr to UTF-8 in _worker_initializer(). - Add *.db-wal and *.db-shm to .gitignore for SQLite WAL mode auxiliary files. Co-Authored-By: Claude Sonnet 4.6 --- .gitignore | 2 ++ .../data_entry/services/ocr/ocr_worker_pool.py | 11 +++++++++++ 2 files changed, 13 insertions(+) diff --git a/.gitignore b/.gitignore index ef7ec00..ef765cd 100644 --- a/.gitignore +++ b/.gitignore @@ -519,6 +519,8 @@ secrets-backup/**/.env.* # ============================================================================ # Backend unified data directories (cache, receipts, telegram) backend/data/cache/*.db +backend/data/cache/*.db-wal +backend/data/cache/*.db-shm backend/data/receipts/*.db backend/data/telegram/*.db backend/data/receipts/uploads/* diff --git a/backend/modules/data_entry/services/ocr/ocr_worker_pool.py b/backend/modules/data_entry/services/ocr/ocr_worker_pool.py index 679e838..dedd9a8 100644 --- a/backend/modules/data_entry/services/ocr/ocr_worker_pool.py +++ b/backend/modules/data_entry/services/ocr/ocr_worker_pool.py @@ -367,6 +367,17 @@ def _worker_initializer() -> None: Total warmup time = max(engine_times) instead of sum(engine_times). """ + import io + + # Fix stdout/stderr encoding in spawned worker process. + # On Windows, 'spawn' context creates a subprocess with charmap (CP1252) encoding. 
+ # Romanian diacritics ș, ț, ă (unlike â, î) cannot be encoded in charmap, causing + # UnicodeEncodeError when print() is called with OCR-extracted text. + if sys.stdout and hasattr(sys.stdout, 'buffer'): + sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8', errors='replace') + if sys.stderr and hasattr(sys.stderr, 'buffer'): + sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding='utf-8', errors='replace') + global _paddle_engine, _tesseract_engine, _doctr_engine, _worker_initialized if _worker_initialized: