feat(ocr): Add modular store profiles with hot-reload support
## Store Profiles System
- Add ProfileRegistry for CUI-based profile lookup
- Add BaseStoreProfile with generic extraction patterns
- Implement hot-reload via POST /api/data-entry/ocr/profiles/reload
## 12 Store Profiles
- LIDL: Multi-rate TVA (A, B, C, D codes)
- OMV, SOCAR: B2B with client CUI, YYYY.MM.DD dates
- BRICK, DEDEMAN: Standard TVA, e-factura support
- KINETERRA, BEST PRINT: Non-VAT payers (returns [])
- STEPOUT MARKET: TVA 5% (books/reduced rate)
- UNLIMITED KEYS: NUMERAR payment detection
- GAMA INK, ELECTROBERING, PICTUS VELUM: Standard TVA
## Flexible TVA Patterns
- All patterns use (\d{1,2})% to accept any rate
- Supports historical (19%, 9%, 5%) and current (21%, 11%)
## Payment Methods Fix
- Fixed base.py to support multiple payments of same type
- Changed deduplication from method-only to (method, amount) tuple
- Returns separate entries for split payments
## Tools
- Add generate_store_profile.py for automatic profile generation
- Analyzes PDFs via OCR API and detects patterns
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
600
scripts/generate_store_profile.py
Executable file
600
scripts/generate_store_profile.py
Executable file
@@ -0,0 +1,600 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Store Profile Generator Script
|
||||
|
||||
Analyzes PDF receipts from a store and generates a Python profile class
|
||||
for the OCR extraction system.
|
||||
|
||||
Usage:
|
||||
python scripts/generate_store_profile.py \
|
||||
--name "Magazin Exemplu" \
|
||||
--cui "12345678" \
|
||||
--receipts "docs/data-entry/MagazinExemplu*.pdf" \
|
||||
--output "backend/modules/data_entry/services/ocr/profiles/magazin_exemplu.py"
|
||||
|
||||
Features:
|
||||
- Submits PDFs to OCR API
|
||||
- Analyzes extracted text for patterns (TVA, total, date, payment)
|
||||
- Generates a BaseStoreProfile subclass with detected patterns
|
||||
- Supports hot-reload via ProfileRegistry
|
||||
|
||||
Requirements:
|
||||
- Backend server running on localhost:8000
|
||||
- JWT authentication
|
||||
- python-jose, requests packages
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import glob
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import time
|
||||
from collections import Counter, defaultdict
|
||||
from datetime import datetime, timedelta, timezone
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
|
||||
try:
|
||||
import requests
|
||||
from jose import jwt
|
||||
except ImportError:
|
||||
print("Error: Required packages not installed.")
|
||||
print("Run: pip install python-jose requests")
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
# Configuration
# Base URL of the backend exposing the OCR endpoints; override via the
# API_BASE environment variable (also the default for --api-base).
API_BASE = os.getenv("API_BASE", "http://localhost:8000")
# Key used to sign the HS256 token built by create_jwt_token().
# NOTE(review): the fallback value is a secret committed to source control;
# presumably it mirrors the backend's development default -- confirm, and
# prefer supplying JWT_SECRET_KEY from the environment.
JWT_SECRET = os.getenv("JWT_SECRET_KEY", "GENERATE_NEW_SECRET_FOR_PRODUCTION3334!")
|
||||
|
||||
|
||||
def create_jwt_token() -> str:
    """
    Build a short-lived access token for calling the OCR API.

    The token is signed with JWT_SECRET using HS256 and expires one hour
    after creation.

    Returns:
        The encoded JWT as returned by ``jwt.encode``
    """
    # Evaluated in the same order as the claims appear in the payload.
    expires_at = datetime.now(timezone.utc) + timedelta(hours=1)
    issued_at = datetime.now(timezone.utc)

    claims = {
        "username": "PROFILE_GENERATOR",
        "user_id": 1,
        "companies": ["604"],
        "permissions": ["read", "write"],
        "exp": expires_at,
        "iat": issued_at,
        "type": "access",
    }
    token = jwt.encode(claims, JWT_SECRET, algorithm="HS256")
    return token
|
||||
|
||||
|
||||
def submit_ocr(pdf_path: str, token: str, api_base: str = API_BASE, timeout: int = 120) -> Optional[Dict]:
    """
    Upload one PDF to the OCR extract endpoint and poll until the job ends.

    Progress is printed on a single line: the file name first, then one of
    OK / FAILED / ERROR / TIMEOUT / EXCEPTION.

    Args:
        pdf_path: Path to the PDF file to upload
        token: JWT sent as a Bearer token on every request
        api_base: Base URL of the API
        timeout: Max seconds to keep polling for job completion

    Returns:
        The job's "result" value ({} if the key is absent) when the job
        completes, otherwise None
    """
    auth_headers = {"Authorization": f"Bearer {token}"}
    filename = os.path.basename(pdf_path)

    print(f" Submitting: {filename}...", end=" ", flush=True)

    try:
        # Upload the document; the API answers with a job descriptor.
        with open(pdf_path, "rb") as pdf_file:
            response = requests.post(
                f"{api_base}/api/data-entry/ocr/extract?engine=doctr_plus",
                files={"file": (filename, pdf_file, "application/pdf")},
                headers=auth_headers,
                timeout=30,
            )

        if response.status_code != 200:
            print(f"FAILED (HTTP {response.status_code})")
            return None

        job_id = response.json().get("job_id")
        if not job_id:
            print("FAILED (no job_id)")
            return None

        # Poll the wait endpoint until the job finishes or time runs out.
        started = time.time()
        while time.time() - started < timeout:
            poll_response = requests.get(
                f"{api_base}/api/data-entry/ocr/jobs/{job_id}/wait?timeout=30",
                headers=auth_headers,
                timeout=35,
            )

            if poll_response.status_code == 200:
                job_result = poll_response.json()
                status = job_result.get("status")

                if status == "completed":
                    print(f"OK ({time.time() - started:.1f}s)")
                    return job_result.get("result", {})
                if status == "error":
                    print(f"ERROR: {job_result.get('error', 'Unknown')}")
                    return None

            # Any other status (or a non-200 poll) is retried after a pause.
            time.sleep(2)

        print("TIMEOUT")
        return None

    except Exception as e:
        # Boundary handler: report the failure and let the caller move on
        # to the next file.
        print(f"EXCEPTION: {e}")
        return None
|
||||
|
||||
|
||||
def analyze_tva_patterns(results: List[Dict]) -> Dict:
    """
    Summarise the TVA (VAT) information found across extraction results.

    Counts the codes and percentages of the extracted TVA entries and
    classifies each raw text by the TVA layouts it contains.

    Returns:
        Dict with keys "codes", "percents", "formats", "has_multi_rate",
        "is_non_vat" and "dominant_format"
    """
    # (label, regex) pairs, checked in this order against the upper-cased text.
    detectors = (
        ("standard", r'TVA\s*:?\s*\d{1,2}%'),         # "TVA 19% 10.50" / "TVA: 19% 10.50"
        ("lidl_multi_rate", r'TVA\s+[A-D]\s+\d{1,2}'),  # "TVA A 21% 7.71"
        ("table", r'BAZA\s+TVA'),                     # "BAZA TVA | % TVA | VALOARE TVA"
        ("non_vat", r'NEPLATITOR|NON.?TVA'),          # non-VAT payer wording
    )

    entries = [
        entry
        for result in results
        if result.get("tva_entries")
        for entry in result["tva_entries"]
    ]
    texts = [result["raw_text"] for result in results if result.get("raw_text")]

    code_counts = Counter()
    percent_counts = Counter()
    for entry in entries:
        code = entry.get("code")
        if code:
            code_counts[code] += 1
        percent = entry.get("percent")
        if percent:
            percent_counts[percent] += 1

    # A text may match several layouts; each match counts once per text.
    format_hits = {}
    for raw in texts:
        upper = raw.upper()
        for label, pattern in detectors:
            if re.search(pattern, upper):
                format_hits[label] = format_hits.get(label, 0) + 1

    if format_hits:
        dominant = max(format_hits, key=format_hits.get)
    else:
        dominant = "standard"

    return {
        "codes": dict(code_counts),
        "percents": dict(percent_counts),
        "formats": format_hits,
        "has_multi_rate": len(code_counts) > 1,
        "is_non_vat": format_hits.get("non_vat", 0) > 0,
        "dominant_format": dominant,
    }
|
||||
|
||||
|
||||
def analyze_total_patterns(results: List[Dict]) -> Dict:
    """
    Analyze TOTAL patterns from extraction results.

    Args:
        results: Extraction result dicts; "amount" and "raw_text" are read
            when present

    Returns:
        Dict with "count" (number of results whose amount could be read as
        a number), "formats" (label -> number of texts containing that
        wording) and "dominant_format" (most frequent label, "TOTAL" when
        no wording was recognised)
    """
    totals = []
    raw_texts = []

    for r in results:
        amount = r.get("amount")
        if amount:
            try:
                totals.append(float(amount))
            except (TypeError, ValueError):
                # A malformed amount (e.g. "12,50") must not abort the
                # whole analysis; it is simply not counted.
                pass
        if r.get("raw_text"):
            raw_texts.append(r["raw_text"])

    total_formats = defaultdict(int)
    for text in raw_texts:
        text_upper = text.upper()

        if re.search(r'TOTAL\s*:?\s*[\d.,]+', text_upper):
            total_formats["TOTAL:"] += 1
        if re.search(r'TOTAL\s+DE\s+PLAT', text_upper):
            total_formats["TOTAL DE PLATA"] += 1
        if re.search(r'SUMA\s+TOTAL', text_upper):
            total_formats["SUMA TOTALA"] += 1
        if re.search(r'GRAND\s*TOTAL', text_upper):
            total_formats["GRAND TOTAL"] += 1

    return {
        "count": len(totals),
        "formats": dict(total_formats),
        # Ties resolve to the label that was seen first.
        "dominant_format": max(total_formats, key=total_formats.get) if total_formats else "TOTAL"
    }
|
||||
|
||||
|
||||
def analyze_date_patterns(results: List[Dict]) -> Dict:
    """
    Summarise the date layouts that occur in the extraction results.

    Returns:
        Dict with "extracted_dates" (every non-empty "receipt_date"),
        "formats" (layout label -> number of raw texts containing it) and
        "dominant_format" (most frequent label, "DD.MM.YYYY" when none
        was found)
    """
    # Layouts are probed in this order for every raw text.
    layouts = (
        ("DD.MM.YYYY", r'\d{2}\.\d{2}\.\d{4}'),
        ("YYYY.MM.DD", r'\d{4}\.\d{2}\.\d{2}'),  # OMV/SOCAR style
        ("DD-MM-YYYY", r'\d{2}-\d{2}-\d{4}'),
        ("DD/MM/YYYY", r'\d{2}/\d{2}/\d{4}'),
    )

    extracted = [item["receipt_date"] for item in results if item.get("receipt_date")]
    texts = [item["raw_text"] for item in results if item.get("raw_text")]

    hits = {}
    for raw in texts:
        for label, pattern in layouts:
            if re.search(pattern, raw):
                hits[label] = hits.get(label, 0) + 1

    dominant = max(hits, key=hits.get) if hits else "DD.MM.YYYY"

    return {
        "extracted_dates": extracted,
        "formats": hits,
        "dominant_format": dominant,
    }
|
||||
|
||||
|
||||
def analyze_payment_patterns(results: List[Dict]) -> Dict:
    """
    Analyze payment method patterns.

    Args:
        results: Extraction result dicts; "payment_methods" is expected to
            be a list of dicts carrying a "method" key, and may be missing
            or null

    Returns:
        Dict with "methods" (method name -> occurrences, "UNKNOWN" for
        entries without a method) and "has_mixed_payments" (True when more
        than one distinct method was seen)
    """
    payment_counts = Counter()

    for r in results:
        # `or []` also covers a key that is present but null, which
        # .get(key, []) would hand back as None and crash the loop.
        for m in r.get("payment_methods") or []:
            payment_counts[m.get("method") or "UNKNOWN"] += 1

    return {
        "methods": dict(payment_counts),
        "has_mixed_payments": len(payment_counts) > 1
    }
|
||||
|
||||
|
||||
def analyze_client_patterns(results: List[Dict]) -> Dict:
    """
    Summarise how often client (B2B) details appear in the results.

    Returns:
        Dict with "has_client_cui" / "has_client_name" (True when at least
        one result carries the field) and "b2b_ratio" (share of results
        with a client CUI, 0 for an empty input)
    """
    with_cui = sum(1 for item in results if item.get("client_cui"))
    with_name = sum(1 for item in results if item.get("client_name"))

    if results:
        ratio = with_cui / len(results)
    else:
        ratio = 0

    return {
        "has_client_cui": with_cui > 0,
        "has_client_name": with_name > 0,
        "b2b_ratio": ratio,
    }
|
||||
|
||||
|
||||
def generate_profile_code(
    store_name: str,
    cui: str,
    tva_analysis: Dict,
    total_analysis: Dict,
    date_analysis: Dict,
    payment_analysis: Dict,
    client_analysis: Dict
) -> str:
    """
    Generate Python profile class code.

    Args:
        store_name: Human-readable store name
        cui: CUI number (without RO prefix)
        tva_analysis: Reads "is_non_vat", "has_multi_rate", "dominant_format"
        total_analysis: Accepted for interface compatibility; not used
        date_analysis: Reads "dominant_format"
        payment_analysis: Accepted for interface compatibility; not used
        client_analysis: Reads "has_client_cui"

    Returns:
        Python source code for the profile class
    """
    # Generate class name from store name
    class_name = "".join(
        word.capitalize()
        for word in re.sub(r'[^a-zA-Z0-9\s]', '', store_name).split()
    ) + "Profile"
    if class_name[0].isdigit():
        # "5 To Go" would otherwise yield "5ToGoProfile", which is not a
        # valid Python identifier.
        class_name = "Store" + class_name

    # Escaped forms of the caller-supplied text, so that quotes or
    # backslashes in a store name cannot break the generated source.
    # json.dumps output is a valid double-quoted Python string literal and
    # is unchanged for plain names.
    store_literal = json.dumps(store_name, ensure_ascii=False)
    cui_literal = json.dumps(str(cui))
    doc_name = store_name.replace('\\', '\\\\').replace('"', '\\"')

    # Determine profile characteristics
    is_non_vat = bool(tva_analysis.get("is_non_vat", False))
    has_multi_rate = bool(tva_analysis.get("has_multi_rate", False))
    has_client_cui = bool(client_analysis.get("has_client_cui", False))
    uses_yyyy_mm_dd = date_analysis.get("dominant_format") == "YYYY.MM.DD"

    # Generate OCR name patterns
    name_words = store_name.upper().split()
    primary_word = name_words[0] if name_words else store_name.upper()
    name_patterns = [
        primary_word,
        store_name.upper().replace(".", "").replace(",", ""),
    ]
    # Add OCR error variants (letter commonly misread as a digit); only the
    # first occurrence of each letter is substituted.
    ocr_variants = {
        'O': '0', 'I': '1', 'L': '1', 'S': '5', 'B': '8', 'E': '3'
    }
    for char, replacement in ocr_variants.items():
        if char in primary_word:
            name_patterns.append(primary_word.replace(char, replacement, 1))

    name_patterns = list(dict.fromkeys(name_patterns))[:4]  # Unique, max 4

    # Build the code
    code_lines = [
        '"""',
        f'{doc_name} store profile for OCR extraction.',
        '',
        'Auto-generated by generate_store_profile.py',
        f'Generated: {datetime.now().strftime("%Y-%m-%d %H:%M")}',
        '"""',
        '',
        'import re',
        'from decimal import Decimal, InvalidOperation',
        'from typing import List, Dict, Any',
        '',
        'from .base import BaseStoreProfile',
        'from . import ProfileRegistry',
        '',
        '',
        '@ProfileRegistry.register',
        f'class {class_name}(BaseStoreProfile):',
        '    """',
        f'    {doc_name} - OCR extraction profile.',
        '    ',
    ]

    # Add characteristics to docstring
    characteristics = []
    if is_non_vat:
        characteristics.append("Non-VAT payer (neplatitor TVA)")
    if has_multi_rate:
        characteristics.append("Multi-rate TVA")
    if has_client_cui:
        characteristics.append("B2B receipts with client CUI")
    if uses_yyyy_mm_dd:
        characteristics.append("Date format: YYYY.MM.DD")

    if characteristics:
        code_lines.append('    Key characteristics:')
        for c in characteristics:
            code_lines.append(f'    - {c}')
        code_lines.append('    ')

    code_lines.extend([
        '    """',
        '',
        f'    CUI_LIST = [{cui_literal}]',
        f'    NAME_PATTERNS = {name_patterns}',
        f'    STORE_NAME = {store_literal}',
        '',
    ])

    # Add date patterns override for YYYY.MM.DD format
    if uses_yyyy_mm_dd:
        code_lines.extend([
            '    # Override date patterns for YYYY.MM.DD format',
            '    DATE_PATTERNS_OCR_SPACES = [',
            "        r'(\\d{4})[.,]\\s*(\\d{2})[.,]\\s*(\\d{2})',  # YYYY. MM. DD with spaces",
            "        r'(\\d{4})[.,](\\d{2})[.,](\\d{2})',  # YYYY.MM.DD",
            '    ]',
            '',
        ])

    # Add TVA extraction method for multi-rate or non-VAT
    if is_non_vat:
        code_lines.extend([
            '    def extract_tva_entries(self, text: str) -> List[dict]:',
            '        """Non-VAT payer - returns empty list."""',
            '        return []',
            '',
        ])
    elif has_multi_rate and tva_analysis.get("dominant_format") == "lidl_multi_rate":
        code_lines.extend([
            '    # Store-specific TVA patterns',
            '    TVA_PATTERNS = [',
            "        r'T[VU][AR]\\s+([A-D])\\s+(\\d{1,2})[.,]?\\d{0,2}\\s*%\\s+([\\d.,]+)',",
            '    ]',
            '',
            '    def extract_tva_entries(self, text: str) -> List[dict]:',
            '        """Extract multi-rate TVA entries."""',
            '        entries = []',
            '        seen = set()',
            '',
            '        for pattern in self.TVA_PATTERNS:',
            '            for match in re.finditer(pattern, text, re.IGNORECASE):',
            '                try:',
            '                    code = match.group(1).upper()',
            '                    percent = int(match.group(2))',
            '                    amount = self._parse_decimal(match.group(3))',
            '',
            '                    if amount and amount > 0:',
            '                        entry_key = (code, percent)',
            '                        if entry_key not in seen:',
            '                            entries.append({',
            "                                'code': code,",
            "                                'percent': percent,",
            "                                'amount': amount",
            '                            })',
            '                            seen.add(entry_key)',
            '                except (ValueError, InvalidOperation):',
            '                    continue',
            '',
            '        return entries',
            '',
        ])

    # Add validation hints method
    code_lines.extend([
        '    def get_validation_hints(self) -> Dict[str, Any]:',
        f'        """Return {doc_name}-specific validation hints."""',
        '        return {',
        f'            "has_multi_rate_tva": {has_multi_rate},',
        '            "card_equals_total": True,',
        f'            "has_client_cui": {has_client_cui},',
        '            "has_efactura": False,',
        f'            "is_non_vat_payer": {is_non_vat},',
        '        }',
    ])

    return '\n'.join(code_lines) + '\n'
|
||||
|
||||
|
||||
def main():
    """
    Command-line entry point.

    Parses the arguments, submits every matching PDF to the OCR API,
    analyses the extraction results and either prints the generated
    profile (--dry-run) or writes it to the output path and
    syntax-checks it.

    Exits with status 1 on an invalid CUI, when no PDF matches, when no
    extraction succeeds, or when the written file fails the syntax check.
    """
    parser = argparse.ArgumentParser(
        description="Generate store profile from PDF receipts",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Generate profile from a single PDF
  python scripts/generate_store_profile.py \\
    --name "Magazin Nou" --cui "12345678" \\
    --receipts "docs/data-entry/magazin_nou.pdf"

  # Generate profile from multiple PDFs (glob pattern)
  python scripts/generate_store_profile.py \\
    --name "Carrefour" --cui "2475489" \\
    --receipts "docs/data-entry/Carrefour*.pdf" \\
    --output backend/modules/data_entry/services/ocr/profiles/carrefour.py

  # Dry run (analyze only, don't write file)
  python scripts/generate_store_profile.py \\
    --name "Test Store" --cui "11111111" \\
    --receipts "docs/data-entry/test*.pdf" \\
    --dry-run
"""
    )

    parser.add_argument("--name", required=True, help="Store name (e.g., 'LIDL DISCOUNT S.R.L.')")
    parser.add_argument("--cui", required=True, help="CUI number without RO prefix")
    parser.add_argument("--receipts", required=True, help="PDF file path or glob pattern")
    parser.add_argument("--output", help="Output file path (default: auto-generated)")
    parser.add_argument("--dry-run", action="store_true", help="Analyze only, don't write file")
    parser.add_argument("--api-base", default=API_BASE, help=f"API base URL (default: {API_BASE})")

    args = parser.parse_args()

    # Update API base if provided
    api_base = args.api_base

    # Validate CUI format
    cui = args.cui.strip().replace("RO", "").replace(" ", "")
    if not cui.isdigit() or len(cui) < 6 or len(cui) > 10:
        print(f"Error: Invalid CUI format: {args.cui}")
        sys.exit(1)

    # Find PDF files; sorted so runs are reproducible (glob order is
    # filesystem-dependent).
    pdf_files = sorted(glob.glob(args.receipts))
    if not pdf_files:
        print(f"Error: No PDF files found matching: {args.receipts}")
        sys.exit(1)

    print(f"\n{'='*60}")
    print("Store Profile Generator")
    print(f"{'='*60}")
    print(f"Store: {args.name}")
    print(f"CUI: {cui}")
    print(f"PDFs: {len(pdf_files)} files")
    print(f"{'='*60}\n")

    # Generate JWT token
    token = create_jwt_token()

    # Submit PDFs to OCR
    print("Step 1: Submitting PDFs to OCR API...")
    results = []
    for pdf_path in pdf_files:
        result = submit_ocr(pdf_path, token, api_base=api_base)
        if result:
            results.append(result)

    if not results:
        print("\nError: No successful extractions. Check if backend is running.")
        sys.exit(1)

    print(f"\nSuccessfully extracted: {len(results)}/{len(pdf_files)} PDFs")

    # Analyze patterns
    print("\nStep 2: Analyzing patterns...")
    tva_analysis = analyze_tva_patterns(results)
    total_analysis = analyze_total_patterns(results)
    date_analysis = analyze_date_patterns(results)
    payment_analysis = analyze_payment_patterns(results)
    client_analysis = analyze_client_patterns(results)

    print(f" TVA: {tva_analysis['dominant_format']} format, multi-rate={tva_analysis['has_multi_rate']}")
    print(f" Date: {date_analysis['dominant_format']} format")
    print(f" Payments: {list(payment_analysis['methods'].keys())}")
    print(f" B2B: {client_analysis['has_client_cui']}")

    # Generate profile code
    print("\nStep 3: Generating profile code...")
    code = generate_profile_code(
        store_name=args.name,
        cui=cui,
        tva_analysis=tva_analysis,
        total_analysis=total_analysis,
        date_analysis=date_analysis,
        payment_analysis=payment_analysis,
        client_analysis=client_analysis
    )

    # Determine output path
    if args.output:
        output_path = args.output
    else:
        module_name = re.sub(r'[^a-z0-9]', '_', args.name.lower()).strip('_')
        output_path = f"backend/modules/data_entry/services/ocr/profiles/{module_name}.py"

    if args.dry_run:
        print(f"\n[DRY RUN] Would write to: {output_path}")
        print(f"\n{'='*60}")
        print("Generated code:")
        print(f"{'='*60}")
        print(code)
    else:
        # Write file. A bare file name has an empty dirname, and
        # os.makedirs("") raises, so only create a directory when there
        # is one.
        output_dir = os.path.dirname(output_path)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
        # Explicit UTF-8: store names may contain diacritics and the
        # platform default encoding is not guaranteed to handle them.
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(code)
        print(f" Written to: {output_path}")

        # Verify syntax
        import py_compile
        try:
            py_compile.compile(output_path, doraise=True)
            print(" Syntax check: OK")
        except py_compile.PyCompileError as e:
            print(f" Syntax check: FAILED - {e}")
            # The file is left in place for inspection, but a broken
            # profile must not be reported as a successful run.
            sys.exit(1)

    print(f"\n{'='*60}")
    print("Profile generation complete!")
    print(f"{'='*60}")

    if not args.dry_run:
        print("\nNext steps:")
        print(f"1. Review the generated code: {output_path}")
        print("2. Customize patterns if needed")
        # Point at the same server the receipts were submitted to.
        print(f"3. Hot-reload profiles: curl -X POST {api_base}/api/data-entry/ocr/profiles/reload")
        print("4. Test with a sample receipt")
|
||||
|
||||
|
||||
# Run the generator only when this file is executed as a script, not when
# it is imported.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user