fix(ocr,gitignore): fix charmap encoding in OCR workers and ignore SQLite WAL files

- Fix UnicodeEncodeError in OCR worker subprocesses on Windows: processes started
  with the 'spawn' context use charmap (CP1252) encoding for stdout/stderr,
  causing crashes when printing Romanian diacritics (ș, ț, ă) from OCR-extracted text.
  Fix: re-wrap sys.stdout/sys.stderr as UTF-8 text streams in _worker_initializer().

- Add backend/data/cache/*.db-wal and backend/data/cache/*.db-shm to .gitignore
  so SQLite WAL-mode auxiliary files in the cache directory are ignored.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Claude Agent
2026-03-20 12:52:50 +00:00
parent a1a47eeb7f
commit 23080aec46
2 changed files with 13 additions and 0 deletions

2
.gitignore vendored
View File

@@ -519,6 +519,8 @@ secrets-backup/**/.env.*
# ============================================================================
# Backend unified data directories (cache, receipts, telegram)
backend/data/cache/*.db
backend/data/cache/*.db-wal
backend/data/cache/*.db-shm
backend/data/receipts/*.db
backend/data/telegram/*.db
backend/data/receipts/uploads/*

View File

@@ -367,6 +367,17 @@ def _worker_initializer() -> None:
Total warmup time = max(engine_times) instead of sum(engine_times).
"""
import io
# Fix stdout/stderr encoding in spawned worker process.
# On Windows, 'spawn' context creates a subprocess with charmap (CP1252) encoding.
# Romanian diacritics such as ș, ț and ă cannot be encoded in CP1252 (â and î can),
# causing UnicodeEncodeError when print() is called with OCR-extracted text.
if sys.stdout and hasattr(sys.stdout, 'buffer'):
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8', errors='replace')
if sys.stderr and hasattr(sys.stderr, 'buffer'):
sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding='utf-8', errors='replace')
global _paddle_engine, _tesseract_engine, _doctr_engine, _worker_initialized
if _worker_initialized: