fix(ocr,gitignore): fix charmap encoding in OCR workers and ignore SQLite WAL files
- Fix UnicodeEncodeError in OCR worker subprocesses on Windows: processes spawned with the 'spawn' context inherit charmap (CP1252) encoding, which causes crashes when printing Romanian diacritics (ș, ț, ă) from OCR-extracted text. Fix: reconfigure sys.stdout/sys.stderr to UTF-8 in _worker_initializer().
- Add *.db-wal and *.db-shm to .gitignore for SQLite WAL mode auxiliary files.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -367,6 +367,17 @@ def _worker_initializer() -> None:
|
||||
|
||||
Total warmup time = max(engine_times) instead of sum(engine_times).
|
||||
"""
|
||||
import io
|
||||
|
||||
# Fix stdout/stderr encoding in spawned worker process.
|
||||
# On Windows, 'spawn' context creates a subprocess with charmap (CP1252) encoding.
|
||||
# Romanian diacritics (ș, ț, ă, â, î) cannot be encoded in charmap, causing
|
||||
# UnicodeEncodeError when print() is called with OCR-extracted text.
|
||||
if sys.stdout and hasattr(sys.stdout, 'buffer'):
|
||||
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8', errors='replace')
|
||||
if sys.stderr and hasattr(sys.stderr, 'buffer'):
|
||||
sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding='utf-8', errors='replace')
|
||||
|
||||
global _paddle_engine, _tesseract_engine, _doctr_engine, _worker_initialized
|
||||
|
||||
if _worker_initialized:
|
||||
|
||||
Reference in New Issue
Block a user