feat(ocr): Add docTR OCR engine with metrics infrastructure
Add docTR as primary OCR engine with 2-tier sequential processing, OCR metrics tracking, and simplified engine selection. Features: - docTR OCR engine with light+medium preprocessing tiers - doctr_plus mode with early exit optimization (~65% fast path) - OCR metrics dashboard with per-engine statistics - User OCR preference persistence - Parallel worker pool for OCR processing - Cross-validation for extraction quality Engine options: tesseract, doctr, doctr_plus (recommended), paddleocr 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
629
tests/ocr-validation/expected_receipts.json
Normal file
629
tests/ocr-validation/expected_receipts.json
Normal file
@@ -0,0 +1,629 @@
|
||||
{
|
||||
"receipts": [
|
||||
{
|
||||
"id": "receipt_01",
|
||||
"filename": "abonament kineterra.pdf",
|
||||
"furnizor": "KINETERRA CONCEPT SRL",
|
||||
"cui_furnizor": "31180432",
|
||||
"client": null,
|
||||
"cui_client": null,
|
||||
"total": 1900.0,
|
||||
"tva_details": [],
|
||||
"total_tva": 0.0,
|
||||
"card": 1900.0,
|
||||
"numerar": 0.0,
|
||||
"data_bon": "2025-11-10",
|
||||
"numar_bon": "0039",
|
||||
"notes": "Neplatitor TVA - abonament terapie"
|
||||
},
|
||||
{
|
||||
"id": "receipt_02",
|
||||
"filename": "benzina 14 august.pdf",
|
||||
"furnizor": "OMV PETROM MARKETING S.R.L.",
|
||||
"cui_furnizor": "RO11201891",
|
||||
"client": "ROMFAST SRL",
|
||||
"cui_client": "RO1879855",
|
||||
"total": 318.16,
|
||||
"tva_details": [
|
||||
{
|
||||
"rate": 21,
|
||||
"value": 55.22
|
||||
}
|
||||
],
|
||||
"total_tva": 55.22,
|
||||
"card": 318.16,
|
||||
"numerar": 0.0,
|
||||
"data_bon": "2025-08-14",
|
||||
"numar_bon": "2850-00075",
|
||||
"notes": "Benzina standard 95"
|
||||
},
|
||||
{
|
||||
"id": "receipt_03",
|
||||
"filename": "benzina 27 octombrie .pdf",
|
||||
"furnizor": "OMV PETROM MARKETING S.R.L.",
|
||||
"cui_furnizor": "RO11201891",
|
||||
"client": "ROMFAST SRL",
|
||||
"cui_client": "RO1879855",
|
||||
"total": 285.66,
|
||||
"tva_details": [
|
||||
{
|
||||
"rate": 21,
|
||||
"value": 49.58
|
||||
}
|
||||
],
|
||||
"total_tva": 49.58,
|
||||
"card": 285.66,
|
||||
"numerar": 0.0,
|
||||
"data_bon": "2025-10-27",
|
||||
"numar_bon": "2857-00217",
|
||||
"notes": "Benzina standard 95"
|
||||
},
|
||||
{
|
||||
"id": "receipt_04",
|
||||
"filename": "igiena 11 octombrie .pdf",
|
||||
"furnizor": "FIVE-HOLDING S.A.",
|
||||
"cui_furnizor": "RO10562600",
|
||||
"client": null,
|
||||
"cui_client": "RO1879855",
|
||||
"total": 186.16,
|
||||
"tva_details": [
|
||||
{
|
||||
"rate": 21,
|
||||
"value": 32.31
|
||||
}
|
||||
],
|
||||
"total_tva": 32.31,
|
||||
"card": 186.16,
|
||||
"numerar": 0.0,
|
||||
"data_bon": "2025-10-11",
|
||||
"numar_bon": "0171",
|
||||
"notes": "BRICK - produse igiena"
|
||||
},
|
||||
{
|
||||
"id": "receipt_05",
|
||||
"filename": "igiena 14 decembrie five-holding.pdf",
|
||||
"furnizor": "FIVE-HOLDING S.A.",
|
||||
"cui_furnizor": "RO10562600",
|
||||
"client": null,
|
||||
"cui_client": "RO1879855",
|
||||
"total": 85.99,
|
||||
"tva_details": [
|
||||
{
|
||||
"rate": 21,
|
||||
"value": 14.92
|
||||
}
|
||||
],
|
||||
"total_tva": 14.92,
|
||||
"card": 85.99,
|
||||
"numerar": 0.0,
|
||||
"data_bon": "2025-12-14",
|
||||
"numar_bon": "0126",
|
||||
"notes": "BRICK - produse igiena"
|
||||
},
|
||||
{
|
||||
"id": "receipt_06",
|
||||
"filename": "rechizite 12 decembrie pictus.pdf",
|
||||
"furnizor": "PICTUS VELUM SRL",
|
||||
"cui_furnizor": "RO39634534",
|
||||
"client": null,
|
||||
"cui_client": "RO1879855",
|
||||
"total": 11.9,
|
||||
"tva_details": [
|
||||
{
|
||||
"rate": 21,
|
||||
"value": 2.07
|
||||
}
|
||||
],
|
||||
"total_tva": 2.07,
|
||||
"card": 11.9,
|
||||
"numerar": 0.0,
|
||||
"data_bon": "2025-12-12",
|
||||
"numar_bon": "0060",
|
||||
"notes": "Rechizite - creioane, radiera"
|
||||
},
|
||||
{
|
||||
"id": "receipt_07",
|
||||
"filename": "benzina 10 mai 2025.pdf",
|
||||
"furnizor": "OMV PETROM MARKETING S.R.L.",
|
||||
"cui_furnizor": "RO11201891",
|
||||
"client": "ROMFAST SRL",
|
||||
"cui_client": "RO1879855",
|
||||
"total": 231.83,
|
||||
"tva_details": [
|
||||
{
|
||||
"rate": 19,
|
||||
"value": 37.01
|
||||
}
|
||||
],
|
||||
"total_tva": 37.01,
|
||||
"card": 231.83,
|
||||
"numerar": 0.0,
|
||||
"data_bon": "2025-05-10",
|
||||
"numar_bon": "2863-00239",
|
||||
"notes": "Benzina standard 95 - Petrom Baia"
|
||||
},
|
||||
{
|
||||
"id": "receipt_08",
|
||||
"filename": "brick consumabil 604 50% deductibil 22 dec.pdf",
|
||||
"furnizor": "FIVE-HOLDING S.A.",
|
||||
"cui_furnizor": "RO10562600",
|
||||
"client": null,
|
||||
"cui_client": "RO1879855",
|
||||
"total": 21.18,
|
||||
"tva_details": [
|
||||
{
|
||||
"rate": 21,
|
||||
"value": 3.68
|
||||
}
|
||||
],
|
||||
"total_tva": 3.68,
|
||||
"card": 21.18,
|
||||
"numerar": 0.0,
|
||||
"data_bon": "2025-12-22",
|
||||
"numar_bon": "0159",
|
||||
"notes": "BRICK - lichid spalare parbriz"
|
||||
},
|
||||
{
|
||||
"id": "receipt_09",
|
||||
"filename": "benzina 13 septembrie .pdf",
|
||||
"furnizor": "OMV PETROM MARKETING S.R.L.",
|
||||
"cui_furnizor": "RO11201891",
|
||||
"client": "ROMFAST SRL",
|
||||
"cui_client": "RO1879855",
|
||||
"total": 275.91,
|
||||
"tva_details": [
|
||||
{
|
||||
"rate": 21,
|
||||
"value": 47.89
|
||||
}
|
||||
],
|
||||
"total_tva": 47.89,
|
||||
"card": 275.91,
|
||||
"numerar": 0.0,
|
||||
"data_bon": "2025-09-13",
|
||||
"numar_bon": "2813-00298",
|
||||
"notes": "Benzina standard 95"
|
||||
},
|
||||
{
|
||||
"id": "receipt_10",
|
||||
"filename": "brick consumabile 604 22 dec.pdf",
|
||||
"furnizor": "FIVE-HOLDING S.A.",
|
||||
"cui_furnizor": "RO10562600",
|
||||
"client": null,
|
||||
"cui_client": "RO1879855",
|
||||
"total": 5.27,
|
||||
"tva_details": [
|
||||
{
|
||||
"rate": 21,
|
||||
"value": 0.91
|
||||
}
|
||||
],
|
||||
"total_tva": 0.91,
|
||||
"card": 5.27,
|
||||
"numerar": 0.0,
|
||||
"data_bon": "2025-12-22",
|
||||
"numar_bon": "0175",
|
||||
"notes": "BRICK - suport polita"
|
||||
},
|
||||
{
|
||||
"id": "receipt_11",
|
||||
"filename": "benzina 20 dec.pdf",
|
||||
"furnizor": "OMV PETROM MARKETING S.R.L.",
|
||||
"cui_furnizor": "RO11201891",
|
||||
"client": "ROMFAST SRL",
|
||||
"cui_client": "RO1879855",
|
||||
"total": 282.79,
|
||||
"tva_details": [
|
||||
{
|
||||
"rate": 21,
|
||||
"value": 49.08
|
||||
}
|
||||
],
|
||||
"total_tva": 49.08,
|
||||
"card": 282.79,
|
||||
"numerar": 0.0,
|
||||
"data_bon": "2025-12-20",
|
||||
"numar_bon": "2820-00306",
|
||||
"notes": "Benzina standard 95 - Petrom 26 Constanta"
|
||||
},
|
||||
{
|
||||
"id": "receipt_12",
|
||||
"filename": "bon fiscal Dedeman - efactura.pdf",
|
||||
"furnizor": "DEDEMAN SRL",
|
||||
"cui_furnizor": "RO2816464",
|
||||
"client": "ROMFAST SRL CONSTANTA",
|
||||
"cui_client": "1879855",
|
||||
"total": 5.83,
|
||||
"tva_details": [
|
||||
{
|
||||
"rate": 19,
|
||||
"value": 0.93
|
||||
}
|
||||
],
|
||||
"total_tva": 0.93,
|
||||
"card": 5.83,
|
||||
"numerar": 0.0,
|
||||
"data_bon": "2024-12-18",
|
||||
"numar_bon": "0066",
|
||||
"notes": "Dedeman - garnituri, reductie"
|
||||
},
|
||||
{
|
||||
"id": "receipt_13",
|
||||
"filename": "factura 70005116259 20.09.2025 Dedeman.pdf",
|
||||
"furnizor": "DEDEMAN SRL",
|
||||
"cui_furnizor": "RO2816464",
|
||||
"client": "ONCR BLEUMARIN CONSTANTA",
|
||||
"cui_client": "46598884",
|
||||
"total": 53.7,
|
||||
"tva_details": [
|
||||
{
|
||||
"rate": 21,
|
||||
"value": 9.32
|
||||
}
|
||||
],
|
||||
"total_tva": 9.32,
|
||||
"card": 53.7,
|
||||
"numerar": 0.0,
|
||||
"data_bon": "2025-09-20",
|
||||
"numar_bon": "0164",
|
||||
"notes": "Dedeman - folie delimitare, baterii"
|
||||
},
|
||||
{
|
||||
"id": "receipt_14",
|
||||
"filename": "benzina 13 iulie.pdf",
|
||||
"furnizor": "SOCAR PETROLEUM S.A.",
|
||||
"cui_furnizor": "RO12546600",
|
||||
"client": "ROMFAST SRL",
|
||||
"cui_client": "RO1879855",
|
||||
"total": 252.4,
|
||||
"tva_details": [
|
||||
{
|
||||
"rate": 19,
|
||||
"value": 40.3
|
||||
}
|
||||
],
|
||||
"total_tva": 40.3,
|
||||
"card": 252.4,
|
||||
"numerar": 0.0,
|
||||
"data_bon": "2025-07-13",
|
||||
"numar_bon": "2443-00129",
|
||||
"notes": "NANO 95 - Socar Adjud Vrancea"
|
||||
},
|
||||
{
|
||||
"id": "receipt_15",
|
||||
"filename": "best print stampila .pdf",
|
||||
"furnizor": "BEST PRINT TRADE ACTIV SRL",
|
||||
"cui_furnizor": "45417955",
|
||||
"client": null,
|
||||
"cui_client": "RO1879855",
|
||||
"total": 100.0,
|
||||
"tva_details": [],
|
||||
"total_tva": 0.0,
|
||||
"card": 100.0,
|
||||
"numerar": 0.0,
|
||||
"data_bon": "2025-07-15",
|
||||
"numar_bon": "0018",
|
||||
"notes": "Neplatitor TVA - stampila"
|
||||
},
|
||||
{
|
||||
"id": "receipt_16",
|
||||
"filename": "electrobering telecomanda.pdf",
|
||||
"furnizor": "ELECTROBERING S.R.L.",
|
||||
"cui_furnizor": "RO2744937",
|
||||
"client": null,
|
||||
"cui_client": "1879855",
|
||||
"total": 35.0,
|
||||
"tva_details": [
|
||||
{
|
||||
"rate": 19,
|
||||
"value": 5.59
|
||||
}
|
||||
],
|
||||
"total_tva": 5.59,
|
||||
"card": 35.0,
|
||||
"numerar": 0.0,
|
||||
"data_bon": "2025-07-17",
|
||||
"numar_bon": "0073",
|
||||
"notes": "Telecomanda A.C."
|
||||
},
|
||||
{
|
||||
"id": "receipt_17a",
|
||||
"filename": "stepout market carti tva 5%.pdf",
|
||||
"page": 1,
|
||||
"furnizor": "STEPOUT MARKET SRL",
|
||||
"cui_furnizor": "RO35532655",
|
||||
"client": null,
|
||||
"cui_client": "1879855",
|
||||
"total": 156.0,
|
||||
"tva_details": [
|
||||
{
|
||||
"rate": 5,
|
||||
"value": 7.43
|
||||
}
|
||||
],
|
||||
"total_tva": 7.43,
|
||||
"card": 156.0,
|
||||
"numerar": 0.0,
|
||||
"data_bon": "2024-10-22",
|
||||
"numar_bon": "000009",
|
||||
"notes": "Carti - TVA 5%",
|
||||
"pages": 2
|
||||
},
|
||||
{
|
||||
"id": "receipt_17b",
|
||||
"filename": "stepout market carti tva 5%.pdf",
|
||||
"page": 2,
|
||||
"furnizor": "STEPOUT MARKET SRL",
|
||||
"cui_furnizor": "RO35532655",
|
||||
"client": null,
|
||||
"cui_client": "1879855",
|
||||
"total": 78.0,
|
||||
"tva_details": [
|
||||
{
|
||||
"rate": 5,
|
||||
"value": 3.71
|
||||
}
|
||||
],
|
||||
"total_tva": 3.71,
|
||||
"card": 78.0,
|
||||
"numerar": 0.0,
|
||||
"data_bon": "2024-10-24",
|
||||
"numar_bon": "000024",
|
||||
"notes": "Carti - TVA 5%",
|
||||
"pages": 2
|
||||
},
|
||||
{
|
||||
"id": "receipt_18",
|
||||
"filename": "brick igiena 8 octombrie 98.95 lei card.pdf",
|
||||
"furnizor": "FIVE-HOLDING S.A.",
|
||||
"cui_furnizor": "RO10562600",
|
||||
"client": null,
|
||||
"cui_client": "RO1879855",
|
||||
"total": 98.95,
|
||||
"tva_details": [
|
||||
{
|
||||
"rate": 19,
|
||||
"value": 15.8
|
||||
}
|
||||
],
|
||||
"total_tva": 15.8,
|
||||
"card": 98.95,
|
||||
"numerar": 0.0,
|
||||
"data_bon": "2024-10-08",
|
||||
"numar_bon": "0299",
|
||||
"notes": "BRICK - produse igiena"
|
||||
},
|
||||
{
|
||||
"id": "receipt_19",
|
||||
"filename": "gama ink refill toner imprimanta 17 sept 2024.pdf",
|
||||
"furnizor": "GAMA INK SERVICE SRL",
|
||||
"cui_furnizor": "RO17741882",
|
||||
"client": null,
|
||||
"cui_client": "RO1879855",
|
||||
"total": 45.0,
|
||||
"tva_details": [
|
||||
{
|
||||
"rate": 19,
|
||||
"value": 7.18
|
||||
}
|
||||
],
|
||||
"total_tva": 7.18,
|
||||
"card": 45.0,
|
||||
"numerar": 0.0,
|
||||
"data_bon": "2024-09-17",
|
||||
"numar_bon": "0041",
|
||||
"notes": "Incarcare toner HP"
|
||||
},
|
||||
{
|
||||
"id": "receipt_20",
|
||||
"filename": "kineterra fizioterapie 9 sept.pdf",
|
||||
"furnizor": "KINETERRA CONCEPT SRL",
|
||||
"cui_furnizor": "31180432",
|
||||
"client": null,
|
||||
"cui_client": null,
|
||||
"total": 650.0,
|
||||
"tva_details": [],
|
||||
"total_tva": 0.0,
|
||||
"card": 650.0,
|
||||
"numerar": 0.0,
|
||||
"data_bon": "2024-09-09",
|
||||
"numar_bon": "0024",
|
||||
"notes": "Neplatitor TVA - diatermie tecar"
|
||||
},
|
||||
{
|
||||
"id": "receipt_21",
|
||||
"filename": "brick igiena 1 sept.pdf",
|
||||
"furnizor": "FIVE-HOLDING S.A.",
|
||||
"cui_furnizor": "RO10562600",
|
||||
"client": null,
|
||||
"cui_client": "RO1879855",
|
||||
"total": 82.86,
|
||||
"tva_details": [
|
||||
{
|
||||
"rate": 19,
|
||||
"value": 13.23
|
||||
}
|
||||
],
|
||||
"total_tva": 13.23,
|
||||
"card": 82.86,
|
||||
"numerar": 0.0,
|
||||
"data_bon": "2024-09-01",
|
||||
"numar_bon": "0047",
|
||||
"notes": "BRICK - produse igiena, instalatii"
|
||||
},
|
||||
{
|
||||
"id": "receipt_22",
|
||||
"filename": "kineterra abonament terapie august 2024.pdf",
|
||||
"furnizor": "KINETERRA CONCEPT SRL",
|
||||
"cui_furnizor": "31180432",
|
||||
"client": null,
|
||||
"cui_client": null,
|
||||
"total": 750.0,
|
||||
"tva_details": [],
|
||||
"total_tva": 0.0,
|
||||
"card": 750.0,
|
||||
"numerar": 0.0,
|
||||
"data_bon": "2024-08-27",
|
||||
"numar_bon": "0029",
|
||||
"notes": "Neplatitor TVA - terapie acvatica"
|
||||
},
|
||||
{
|
||||
"id": "receipt_23a",
|
||||
"filename": "benzina 07 aug. 2024.pdf",
|
||||
"page": 1,
|
||||
"furnizor": "OMV PETROM MARKETING S.R.L.",
|
||||
"cui_furnizor": "RO11201891",
|
||||
"client": "ROMFAST SRL",
|
||||
"cui_client": "RO1879855",
|
||||
"total": 263.28,
|
||||
"tva_details": [
|
||||
{
|
||||
"rate": 19,
|
||||
"value": 42.04
|
||||
}
|
||||
],
|
||||
"total_tva": 42.04,
|
||||
"card": 263.28,
|
||||
"numerar": 0.0,
|
||||
"data_bon": "2024-08-01",
|
||||
"numar_bon": "2134-00220",
|
||||
"notes": "Benzina standard 95 - Petrom 1 Huedin",
|
||||
"pages": 2
|
||||
},
|
||||
{
|
||||
"id": "receipt_23b",
|
||||
"filename": "benzina 07 aug. 2024.pdf",
|
||||
"page": 2,
|
||||
"furnizor": "OMV PETROM MARKETING S.R.L.",
|
||||
"cui_furnizor": "RO11201891",
|
||||
"client": "ROMFAST SRL",
|
||||
"cui_client": "RO1879855",
|
||||
"total": 306.67,
|
||||
"tva_details": [
|
||||
{
|
||||
"rate": 19,
|
||||
"value": 48.96
|
||||
}
|
||||
],
|
||||
"total_tva": 48.96,
|
||||
"card": 306.67,
|
||||
"numerar": 0.0,
|
||||
"data_bon": "2024-08-02",
|
||||
"numar_bon": "2193-00699",
|
||||
"notes": "Benzina standard 95 - Petrom A2 KM66",
|
||||
"pages": 2
|
||||
},
|
||||
{
|
||||
"id": "receipt_24",
|
||||
"filename": "brick igiena, electrice consumabile 604.pdf",
|
||||
"furnizor": "FIVE-HOLDING S.A.",
|
||||
"cui_furnizor": "RO10562600",
|
||||
"client": null,
|
||||
"cui_client": "RO1879855",
|
||||
"total": 190.6,
|
||||
"tva_details": [
|
||||
{
|
||||
"rate": 19,
|
||||
"value": 30.43
|
||||
}
|
||||
],
|
||||
"total_tva": 30.43,
|
||||
"card": 190.6,
|
||||
"numerar": 0.0,
|
||||
"data_bon": "2024-08-05",
|
||||
"numar_bon": "0207",
|
||||
"notes": "BRICK - electrice, instalatii, igiena"
|
||||
},
|
||||
{
|
||||
"id": "receipt_25",
|
||||
"filename": "electrobering igiena iulie 604.pdf",
|
||||
"furnizor": "ELECTROBERING S.R.L.",
|
||||
"cui_furnizor": "RO2744937",
|
||||
"client": null,
|
||||
"cui_client": "RO1879855",
|
||||
"total": 62.0,
|
||||
"tva_details": [
|
||||
{
|
||||
"rate": 19,
|
||||
"value": 9.9
|
||||
}
|
||||
],
|
||||
"total_tva": 9.9,
|
||||
"card": 62.0,
|
||||
"numerar": 0.0,
|
||||
"data_bon": "2024-07-11",
|
||||
"numar_bon": "0059",
|
||||
"notes": "Filtru, spray detector"
|
||||
},
|
||||
{
|
||||
"id": "receipt_26",
|
||||
"filename": "Lidl papetarie 604 fara TVA. nu are cod fiscal.pdf",
|
||||
"furnizor": "LIDL DISCOUNT S.R.L.",
|
||||
"cui_furnizor": "RO22891860",
|
||||
"client": null,
|
||||
"cui_client": null,
|
||||
"total": 39.96,
|
||||
"tva_details": [
|
||||
{
|
||||
"rate": 19,
|
||||
"value": 6.38
|
||||
}
|
||||
],
|
||||
"total_tva": 6.38,
|
||||
"card": 39.96,
|
||||
"numerar": 0.0,
|
||||
"data_bon": "2024-07-01",
|
||||
"numar_bon": "00719",
|
||||
"notes": "Papetarie - agende, caiete. FARA CIF CLIENT!"
|
||||
},
|
||||
{
|
||||
"id": "receipt_27",
|
||||
"filename": "brick igiena 604.pdf",
|
||||
"furnizor": "FIVE-HOLDING S.A.",
|
||||
"cui_furnizor": "RO10562600",
|
||||
"client": null,
|
||||
"cui_client": "RO1879855",
|
||||
"total": 155.15,
|
||||
"tva_details": [
|
||||
{
|
||||
"rate": 19,
|
||||
"value": 24.77
|
||||
}
|
||||
],
|
||||
"total_tva": 24.77,
|
||||
"card": 155.15,
|
||||
"numerar": 0.0,
|
||||
"data_bon": "2024-06-28",
|
||||
"numar_bon": "0293",
|
||||
"notes": "BRICK - igiena, consumabile auto"
|
||||
},
|
||||
{
|
||||
"id": "receipt_28",
|
||||
"filename": "unlimited duplicat chei 23 mai.pdf",
|
||||
"furnizor": "UNLIMITED KEYS S.R.L.",
|
||||
"cui_furnizor": "RO18993187",
|
||||
"client": null,
|
||||
"cui_client": "1879855",
|
||||
"total": 80.0,
|
||||
"tva_details": [
|
||||
{
|
||||
"rate": 19,
|
||||
"value": 12.77
|
||||
}
|
||||
],
|
||||
"total_tva": 12.77,
|
||||
"card": 0.0,
|
||||
"numerar": 80.0,
|
||||
"data_bon": "2024-05-23",
|
||||
"numar_bon": "000004",
|
||||
"notes": "Duplicat cheie yala - NUMERAR"
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"total_receipts": 30,
|
||||
"total_files": 28,
|
||||
"extracted_by": "Claude - manual extraction",
|
||||
"extraction_date": "2026-01-01",
|
||||
"notes": "Some PDF files contain multiple receipts (pages)"
|
||||
}
|
||||
}
|
||||
127
tests/ocr-validation/get_raw_ocr_text.py
Normal file
127
tests/ocr-validation/get_raw_ocr_text.py
Normal file
@@ -0,0 +1,127 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Quick script to get raw OCR text for specific receipts.
|
||||
Usage: python get_raw_ocr_text.py <receipt_path>
|
||||
"""
|
||||
import sys
|
||||
import os
|
||||
import time
|
||||
import requests
|
||||
from pathlib import Path
|
||||
from datetime import datetime, timedelta
|
||||
|
||||
# Add project root to path
|
||||
project_root = Path(__file__).parent.parent.parent
|
||||
sys.path.insert(0, str(project_root))
|
||||
sys.path.insert(0, str(project_root / 'backend'))
|
||||
|
||||
from jose import jwt
|
||||
|
||||
API_BASE = "http://localhost:8000/api/data-entry"
|
||||
|
||||
def create_test_token() -> str:
    """Build a short-lived HS256 access token for the OCR test user.

    The signing secret is read from the ``JWT_SECRET_KEY`` environment
    variable, falling back to a placeholder value when it is unset. The
    token expires one hour after it is issued.

    Returns:
        The encoded JWT string.
    """
    issued_at = datetime.utcnow()

    # Claim order is kept stable so the encoded token layout does not change.
    claims = dict(
        username="ocr_test_user",
        user_id=999,
        companies=["TEST"],
        permissions=["read", "write"],
        exp=issued_at + timedelta(hours=1),
        iat=issued_at,
        type="access",
    )

    signing_key = os.getenv('JWT_SECRET_KEY', 'generate_with_secrets_token_urlsafe_32')
    return jwt.encode(claims, signing_key, algorithm="HS256")
|
||||
|
||||
|
||||
def get_raw_ocr_text(file_path: str, token: str) -> dict:
    """Submit a file to the OCR API and print the raw text it produced.

    The file is posted to the ``doctr_plus`` extract endpoint, then the job
    status endpoint is polled every two seconds until the job completes,
    fails, or 120 seconds have elapsed.

    Args:
        file_path: Path of the receipt file to upload.
        token: Bearer token sent in the ``Authorization`` header.

    Returns:
        The job's ``result`` dict on success; otherwise a dict with a single
        ``"error"`` key describing what went wrong (missing file, HTTP or
        transport failure, failed job, or timeout).
    """
    path = Path(file_path)
    if not path.exists():
        return {"error": f"File not found: {file_path}"}

    # Submit OCR job
    print(f"\n{'='*60}")
    print(f"Processing: {path.name}")
    print(f"{'='*60}")

    headers = {'Authorization': f'Bearer {token}'}

    try:
        with open(path, 'rb') as f:
            files = {'file': (path.name, f, 'application/pdf')}
            # requests waits indefinitely unless a timeout is given, which
            # would hang the script on a stalled server.
            response = requests.post(
                f"{API_BASE}/ocr/extract?engine=doctr_plus",
                files=files,
                headers=headers,
                timeout=60,
            )
    except requests.RequestException as exc:
        return {"error": f"Submit failed: {exc}"}

    if response.status_code != 200:
        return {"error": f"Submit failed: {response.status_code} - {response.text}"}

    job_id = response.json().get('job_id')
    if not job_id:
        # Without an id there is nothing to poll; bail out instead of
        # requesting ".../jobs/None".
        return {"error": "No job_id in response"}
    print(f"Job ID: {job_id}")

    # Poll for completion
    max_wait = 120
    start = time.time()

    while time.time() - start < max_wait:
        try:
            status_response = requests.get(
                f"{API_BASE}/ocr/jobs/{job_id}",
                headers=headers,
                timeout=30,
            )
        except requests.RequestException as exc:
            return {"error": f"Status check failed: {exc}"}

        if status_response.status_code != 200:
            return {"error": f"Status check failed: {status_response.status_code}"}

        status = status_response.json()
        job_status = status.get('status')

        if job_status == 'completed':
            # "or" guards against an explicit null in the response body.
            result = status.get('result') or {}

            # Print raw texts
            raw_texts = result.get('raw_texts') or []
            print(f"\n--- RAW OCR TEXT ({len(raw_texts)} passes) ---\n")

            for i, raw_text in enumerate(raw_texts):
                print(f"\n=== Pass {i+1} ===")
                # Cap the dump at 3000 characters per pass; slicing a shorter
                # string simply returns it whole.
                print(raw_text[:3000])
                print(f"\n[Text length: {len(raw_text)} chars]")

            # Print extracted fields
            print("\n--- EXTRACTED FIELDS ---")
            print(f"TOTAL: {result.get('amount')}")
            print(f"DATE: {result.get('receipt_date')}")
            print(f"CUI: {result.get('cui')}")
            print(f"TVA Total: {result.get('tva_total')}")
            print(f"TVA Entries: {result.get('tva_entries')}")
            print(f"Confidence: {result.get('overall_confidence')}")
            print(f"Engine: {result.get('ocr_engine')}")

            return result

        if job_status == 'failed':
            return {"error": f"OCR failed: {status.get('error')}"}

        print(f" Status: {job_status}, waiting...")
        time.sleep(2)

    return {"error": "Timeout waiting for OCR"}
|
||||
|
||||
if __name__ == "__main__":
    # Create test token
    token = create_test_token()
    print("Using JWT token for authentication")

    if len(sys.argv) < 2:
        # Default: the two sample receipts in <project root>/docs/data-entry.
        # The folder is located relative to this script (tests/ocr-validation/)
        # rather than through a machine-specific absolute path, so the default
        # run works from any checkout and any working directory.
        sample_dir = Path(__file__).resolve().parents[2] / "docs" / "data-entry"
        receipts = [
            str(sample_dir / "brick igiena 1 sept.pdf"),
            str(sample_dir / "brick igiena, electrice consumabile 604.pdf"),
        ]
    else:
        receipts = sys.argv[1:]

    for receipt in receipts:
        result = get_raw_ocr_text(receipt, token)
        if "error" in result:
            print(f"ERROR: {result['error']}")
|
||||
593
tests/ocr-validation/ocr-direct-validation.py
Normal file
593
tests/ocr-validation/ocr-direct-validation.py
Normal file
@@ -0,0 +1,593 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
OCR Direct Validation Tests
|
||||
|
||||
This script validates the OCR extraction accuracy by:
|
||||
1. Generating a test JWT token
|
||||
2. Calling the OCR API endpoint with PDF receipts
|
||||
3. Comparing extracted data with expected values from expected_receipts.json
|
||||
|
||||
Run:
|
||||
python tests/ocr-validation/ocr-direct-validation.py
|
||||
python tests/ocr-validation/ocr-direct-validation.py --engine doctr_plus
|
||||
python tests/ocr-validation/ocr-direct-validation.py --receipt receipt_01
|
||||
"""
|
||||
|
||||
import sys
|
||||
import os
|
||||
import json
|
||||
import argparse
|
||||
import time
|
||||
import requests
|
||||
from pathlib import Path
|
||||
from datetime import datetime, timedelta
|
||||
from typing import Optional, Dict, List, Any
|
||||
|
||||
# Add backend and project root to path
|
||||
project_root = Path(__file__).parent.parent.parent
|
||||
sys.path.insert(0, str(project_root))
|
||||
sys.path.insert(0, str(project_root / 'backend'))
|
||||
|
||||
# Import JWT handler to create test tokens
|
||||
from jose import jwt
|
||||
|
||||
|
||||
def create_test_token(secret_key: str) -> str:
    """Return an HS256-signed access token for the OCR test user.

    Args:
        secret_key: Secret used to sign the token.

    Returns:
        The encoded JWT string; it expires one hour after it is issued.
    """
    issued_at = datetime.utcnow()
    expires_at = issued_at + timedelta(hours=1)

    # Built pair-by-pair so the claim order stays exactly as before.
    claims = dict([
        ("username", "ocr_test_user"),
        ("user_id", 999),
        ("companies", ["TEST"]),
        ("permissions", ["read", "write"]),
        ("exp", expires_at),
        ("iat", issued_at),
        ("type", "access"),
    ])

    return jwt.encode(claims, secret_key, algorithm="HS256")
|
||||
|
||||
|
||||
def normalize_cui(cui: Optional[str]) -> Optional[str]:
    """Normalize a CUI so two spellings of the same code compare equal.

    The value is upper-cased, spaces are removed, and a single leading
    ``RO`` prefix is stripped. Only the prefix is removed: an "RO" sequence
    elsewhere in the value is left untouched.

    Args:
        cui: Raw CUI string, or None.

    Returns:
        The normalized CUI, or None when ``cui`` is None or empty.
    """
    if not cui:
        return None
    compact = cui.upper().replace(' ', '')
    if compact.startswith('RO'):
        compact = compact[2:]
    return compact
|
||||
|
||||
|
||||
def normalize_date(date: Optional[str]) -> Optional[str]:
    """Normalize an ISO-8601 date or datetime string to ``YYYY-MM-DD``.

    Args:
        date: Date string to normalize, or None.

    Returns:
        The date part formatted as ``YYYY-MM-DD`` when the value parses as
        ISO format; the input unchanged when it does not; None when the
        input is None or empty.
    """
    if not date:
        return None
    try:
        # datetime.fromisoformat() before Python 3.11 rejects a trailing 'Z',
        # so spell the UTC designator as an explicit offset first.
        parsed = datetime.fromisoformat(date.replace('Z', '+00:00'))
        return parsed.strftime('%Y-%m-%d')
    except (AttributeError, TypeError, ValueError):
        # Unparseable (or non-string) value: hand it back untouched so the
        # caller can still show what was extracted. Only parsing errors are
        # caught, so KeyboardInterrupt/SystemExit are not swallowed.
        return date
|
||||
|
||||
|
||||
def compare_with_tolerance(expected: float, actual, tolerance: float) -> bool:
    """Check whether ``actual`` is close enough to ``expected``.

    Args:
        expected: Reference value.
        actual: Value to check; may be a number, a numeric string, or None.
        tolerance: Allowed relative deviation as a fraction of ``expected``
            (e.g. 0.01 for 1%).

    Returns:
        True when the absolute difference is within
        ``abs(expected) * tolerance`` or within 0.01 (one cent); False
        otherwise, including when ``actual`` is None or not convertible to
        float.
    """
    if actual is None:
        return False
    # Handle string values from API
    try:
        actual_float = float(actual)
    except (ValueError, TypeError):
        return False
    diff = abs(expected - actual_float)
    # abs() keeps the allowance non-negative; with a negative expected value
    # a plain product would be negative and the relative check could never pass.
    threshold = abs(expected) * tolerance
    return diff <= threshold or diff <= 0.01  # Within tolerance or 1 cent
|
||||
|
||||
|
||||
def submit_ocr_job(api_base: str, token: str, pdf_path: Path, engine: str) -> dict:
    """Submit a PDF file for OCR processing and wait for the result.

    The file is posted to the extract endpoint, then the job's ``/wait``
    endpoint is polled until the job completes, fails, or 120 seconds have
    passed since the call started.

    Args:
        api_base: Base URL of the API (scheme and host, no trailing path).
        token: Bearer token sent in the ``Authorization`` header.
        pdf_path: Path of the PDF to upload.
        engine: OCR engine name passed as the ``engine`` query parameter.

    Returns:
        A dict with ``status`` (``'completed'`` or ``'failed'``), either
        ``result`` and ``processing_time_ms`` (on success) or ``error``
        (on failure), and always ``timing``:
        - timing.submit_duration: seconds to submit the job and get a response
        - timing.poll_count: number of poll requests made
        - timing.poll_duration: seconds spent polling
        - timing.wall_time: total elapsed seconds (submit + poll)
        - timing.api_reported_ms: processing time reported by the API
          (stays 0 unless the job completed)
        - timing.queue_wait_ms / ocr_time_ms / processing_time_ms: values
          reported by the API; present only when the job completed
    """
    headers = {'Authorization': f'Bearer {token}'}

    # Timing tracking
    timing = {
        'submit_duration': 0.0,
        'poll_count': 0,
        'poll_duration': 0.0,
        'wall_time': 0.0,
        'api_reported_ms': 0,
    }

    wall_start = time.time()

    # Submit job
    submit_start = time.time()
    with open(pdf_path, 'rb') as f:
        files = {'file': (pdf_path.name, f, 'application/pdf')}
        response = requests.post(
            f'{api_base}/api/data-entry/ocr/extract?engine={engine}',
            headers=headers,
            files=files,
            timeout=60
        )
    timing['submit_duration'] = time.time() - submit_start

    if not response.ok:
        timing['wall_time'] = time.time() - wall_start
        return {'status': 'failed', 'error': f'Submit failed: {response.status_code} - {response.text}', 'timing': timing}

    job = response.json()
    job_id = job.get('job_id')

    if not job_id:
        timing['wall_time'] = time.time() - wall_start
        return {'status': 'failed', 'error': 'No job_id in response', 'timing': timing}

    # Poll for result
    poll_start = time.time()
    max_wait = 120  # 2 minutes

    while time.time() - wall_start < max_wait:
        timing['poll_count'] += 1
        poll_response = requests.get(
            f'{api_base}/api/data-entry/ocr/jobs/{job_id}/wait?timeout=30',
            headers=headers,
            timeout=35
        )

        if not poll_response.ok:
            # Treat a bad poll as transient: back off briefly and retry
            # until the overall deadline expires.
            time.sleep(1)
            continue

        job_status = poll_response.json()

        if job_status.get('status') == 'completed':
            timing['poll_duration'] = time.time() - poll_start
            timing['wall_time'] = time.time() - wall_start
            # Detailed timing from API ("or 0" maps an explicit null to 0)
            timing['queue_wait_ms'] = job_status.get('queue_wait_ms', 0) or 0
            timing['ocr_time_ms'] = job_status.get('ocr_time_ms', 0) or 0
            timing['processing_time_ms'] = job_status.get('processing_time_ms', 0) or 0
            # Populate the documented summary field, which was previously
            # initialised to 0 and never updated.
            timing['api_reported_ms'] = timing['processing_time_ms']
            return {
                'status': 'completed',
                'result': job_status.get('result', {}),
                'processing_time_ms': job_status.get('processing_time_ms', 0),
                'timing': timing
            }

        if job_status.get('status') == 'failed':
            timing['poll_duration'] = time.time() - poll_start
            timing['wall_time'] = time.time() - wall_start
            return {'status': 'failed', 'error': job_status.get('error', 'Unknown error'), 'timing': timing}

        # Still pending - show status but don't spam
        if timing['poll_count'] <= 3 or timing['poll_count'] % 5 == 0:
            print(f" Status: {job_status.get('status')}, position: {job_status.get('queue_position')}, polls: {timing['poll_count']}")

    timing['poll_duration'] = time.time() - poll_start
    timing['wall_time'] = time.time() - wall_start
    return {'status': 'failed', 'error': 'Timeout waiting for OCR result', 'timing': timing}
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(description='OCR Direct Validation')
|
||||
parser.add_argument('--engine', default='doctr_plus',
|
||||
choices=['tesseract', 'doctr', 'doctr_plus', 'paddleocr'],
|
||||
help='OCR engine to use (doctr_plus recommended)')
|
||||
parser.add_argument('--receipt', help='Specific receipt ID to test (e.g., receipt_01)')
|
||||
parser.add_argument('--verbose', '-v', action='store_true', help='Verbose output')
|
||||
parser.add_argument('--api-base', default='http://localhost:8000', help='API base URL')
|
||||
parser.add_argument('--stop-on-issue', action='store_true',
|
||||
help='Stop at first receipt with wall_time > 7.5s or extraction errors')
|
||||
parser.add_argument('--include-multipage', action='store_true',
|
||||
help='Include multi-page PDFs (normally skipped)')
|
||||
args = parser.parse_args()
|
||||
|
||||
# Paths
|
||||
script_dir = Path(__file__).parent
|
||||
expected_path = script_dir / 'expected_receipts.json'
|
||||
pdf_base_path = script_dir.parent.parent / 'docs' / 'data-entry'
|
||||
|
||||
# JWT secret from environment or default
|
||||
jwt_secret = os.getenv('JWT_SECRET_KEY', 'generate_with_secrets_token_urlsafe_32')
|
||||
|
||||
# Create test token
|
||||
token = create_test_token(jwt_secret)
|
||||
|
||||
# Load expected data
|
||||
print(f"\n{'='*60}")
|
||||
print("OCR API VALIDATION")
|
||||
print(f"{'='*60}")
|
||||
print(f"Engine: {args.engine}")
|
||||
print(f"API Base: {args.api_base}")
|
||||
print(f"Expected data: {expected_path}")
|
||||
print(f"PDF folder: {pdf_base_path}")
|
||||
|
||||
with open(expected_path) as f:
|
||||
expected_data = json.load(f)
|
||||
|
||||
# Filter receipts
|
||||
receipts_to_test = expected_data['receipts']
|
||||
|
||||
# Skip multi-page PDFs unless explicitly included
|
||||
if not args.include_multipage:
|
||||
original_count = len(receipts_to_test)
|
||||
receipts_to_test = [r for r in receipts_to_test if r.get('page') is None]
|
||||
skipped = original_count - len(receipts_to_test)
|
||||
if skipped > 0:
|
||||
print(f"Skipping {skipped} multi-page PDF entries (use --include-multipage to include)")
|
||||
|
||||
# Filter by specific receipt ID if requested
|
||||
if args.receipt:
|
||||
receipts_to_test = [r for r in receipts_to_test if r['id'] == args.receipt]
|
||||
if not receipts_to_test:
|
||||
print(f"\nError: Receipt ID '{args.receipt}' not found")
|
||||
sys.exit(1)
|
||||
|
||||
print(f"Receipts to test: {len(receipts_to_test)}")
|
||||
print(f"{'='*60}\n")
|
||||
|
||||
# Results storage
|
||||
results: List[Dict[str, Any]] = []
|
||||
|
||||
# Test each receipt
|
||||
for expected in receipts_to_test:
|
||||
pdf_path = pdf_base_path / expected['filename']
|
||||
|
||||
if not pdf_path.exists():
|
||||
print(f"[SKIP] File not found: {expected['filename']}")
|
||||
continue
|
||||
|
||||
print(f"[TEST] Processing: {expected['filename']}")
|
||||
|
||||
try:
|
||||
# Submit OCR job via API
|
||||
start_time = datetime.now()
|
||||
ocr_result = submit_ocr_job(args.api_base, token, pdf_path, args.engine)
|
||||
# Handle processing_time which may be string or number
|
||||
raw_time = ocr_result.get('processing_time_ms')
|
||||
if raw_time is not None:
|
||||
processing_time = float(raw_time)
|
||||
else:
|
||||
processing_time = (datetime.now() - start_time).total_seconds() * 1000
|
||||
|
||||
if ocr_result.get('status') == 'failed':
|
||||
print(f" [ERROR] OCR failed: {ocr_result.get('error')}")
|
||||
results.append({
|
||||
'receipt_id': expected['id'],
|
||||
'filename': expected['filename'],
|
||||
'status': 'failed',
|
||||
'error': ocr_result.get('error'),
|
||||
})
|
||||
continue
|
||||
|
||||
# Get extracted values
|
||||
extracted = ocr_result.get('result', {})
|
||||
|
||||
# Safe float conversion helper
|
||||
def safe_float(value, default=0.0):
|
||||
if value is None:
|
||||
return default
|
||||
try:
|
||||
return float(value)
|
||||
except (ValueError, TypeError):
|
||||
return default
|
||||
|
||||
# Compare results
|
||||
comparison = {
|
||||
'receipt_id': expected['id'],
|
||||
'filename': expected['filename'],
|
||||
'status': 'completed',
|
||||
'total_expected': expected['total'],
|
||||
'total_extracted': safe_float(extracted.get('amount'), None),
|
||||
'total_match': False,
|
||||
'date_expected': expected['data_bon'],
|
||||
'date_extracted': extracted.get('receipt_date'),
|
||||
'date_match': False,
|
||||
'cui_expected': expected['cui_furnizor'],
|
||||
'cui_extracted': extracted.get('cui'),
|
||||
'cui_match': False,
|
||||
'tva_expected': expected['total_tva'],
|
||||
'tva_extracted': safe_float(extracted.get('tva_total'), None),
|
||||
'tva_match': False,
|
||||
'confidence': safe_float(extracted.get('overall_confidence'), 0),
|
||||
'processing_time_ms': processing_time,
|
||||
'ocr_engine': extracted.get('ocr_engine', args.engine),
|
||||
'errors': [],
|
||||
# NEW: Save full extraction for analysis
|
||||
'full_extraction': {
|
||||
'amount': extracted.get('amount'),
|
||||
'receipt_date': extracted.get('receipt_date'),
|
||||
'cui': extracted.get('cui'),
|
||||
'tva_total': extracted.get('tva_total'),
|
||||
'tva_entries': extracted.get('tva_entries', []),
|
||||
'supplier_name': extracted.get('supplier_name'),
|
||||
'receipt_number': extracted.get('receipt_number'),
|
||||
'payment_methods': extracted.get('payment_methods', []),
|
||||
'items_count': extracted.get('items_count'),
|
||||
'overall_confidence': extracted.get('overall_confidence'),
|
||||
'confidence_amount': extracted.get('confidence_amount'),
|
||||
'confidence_date': extracted.get('confidence_date'),
|
||||
'confidence_cui': extracted.get('confidence_cui'),
|
||||
},
|
||||
# NEW: Save raw OCR texts from each engine pass
|
||||
'raw_texts': extracted.get('raw_texts', []),
|
||||
}
|
||||
|
||||
# Compare TOTAL
|
||||
comparison['total_match'] = compare_with_tolerance(
|
||||
expected['total'],
|
||||
extracted.get('amount'),
|
||||
0.02 # 2% tolerance
|
||||
)
|
||||
if not comparison['total_match']:
|
||||
comparison['errors'].append(
|
||||
f"TOTAL: expected {expected['total']}, got {extracted.get('amount')}"
|
||||
)
|
||||
|
||||
# Compare DATE
|
||||
normalized_expected_date = normalize_date(expected['data_bon'])
|
||||
normalized_extracted_date = normalize_date(extracted.get('receipt_date'))
|
||||
comparison['date_match'] = normalized_expected_date == normalized_extracted_date
|
||||
if not comparison['date_match']:
|
||||
comparison['errors'].append(
|
||||
f"DATE: expected {normalized_expected_date}, got {normalized_extracted_date}"
|
||||
)
|
||||
|
||||
# Compare CUI
|
||||
normalized_expected_cui = normalize_cui(expected['cui_furnizor'])
|
||||
normalized_extracted_cui = normalize_cui(extracted.get('cui'))
|
||||
comparison['cui_match'] = normalized_expected_cui == normalized_extracted_cui
|
||||
if not comparison['cui_match']:
|
||||
comparison['errors'].append(
|
||||
f"CUI: expected {normalized_expected_cui}, got {normalized_extracted_cui}"
|
||||
)
|
||||
|
||||
# Compare TVA
|
||||
if expected['total_tva'] > 0:
|
||||
comparison['tva_match'] = compare_with_tolerance(
|
||||
expected['total_tva'],
|
||||
extracted.get('tva_total'),
|
||||
0.05 # 5% tolerance
|
||||
)
|
||||
if not comparison['tva_match']:
|
||||
comparison['errors'].append(
|
||||
f"TVA: expected {expected['total_tva']}, got {extracted.get('tva_total')}"
|
||||
)
|
||||
else:
|
||||
# No TVA expected (neplatitor TVA)
|
||||
tva_extracted = safe_float(extracted.get('tva_total'), None)
|
||||
comparison['tva_match'] = tva_extracted is None or tva_extracted == 0 or tva_extracted == 0.0
|
||||
|
||||
results.append(comparison)
|
||||
|
||||
# Get timing info from API (detailed breakdown)
|
||||
t = ocr_result.get('timing', {})
|
||||
wall_ms = t.get('wall_time', 0) * 1000
|
||||
queue_wait_ms = t.get('queue_wait_ms', 0)
|
||||
ocr_time_ms = t.get('ocr_time_ms', 0)
|
||||
processing_time_ms = t.get('processing_time_ms', 0)
|
||||
# Overhead = wall_time - processing_time (includes network, polling)
|
||||
overhead_ms = wall_ms - processing_time_ms if processing_time_ms else 0
|
||||
|
||||
# Print result
|
||||
status = 'PASS' if not comparison['errors'] else 'FAIL'
|
||||
print(f" [{status}] Total: {expected['total']} vs {extracted.get('amount')} ({comparison['total_match']})")
|
||||
print(f" Date: {expected['data_bon']} vs {extracted.get('receipt_date')} ({comparison['date_match']})")
|
||||
print(f" CUI: {expected['cui_furnizor']} vs {extracted.get('cui')} ({comparison['cui_match']})")
|
||||
print(f" TVA: {expected['total_tva']} vs {extracted.get('tva_total')} ({comparison['tva_match']})")
|
||||
print(f" Confidence: {comparison['confidence']*100:.1f}%")
|
||||
|
||||
# Print detailed timing breakdown
|
||||
print(f" TIMING: ocr={ocr_time_ms}ms, queue_wait={queue_wait_ms}ms, "
|
||||
f"job_total={processing_time_ms}ms, wall={wall_ms:.0f}ms")
|
||||
print(f" overhead={overhead_ms:.0f}ms (wall - job_total)")
|
||||
|
||||
if comparison['errors'] and args.verbose:
|
||||
for err in comparison['errors']:
|
||||
print(f" Error: {err}")
|
||||
|
||||
# Stop on issue if requested
|
||||
if args.stop_on_issue:
|
||||
has_errors = len(comparison['errors']) > 0
|
||||
# Use OCR time for threshold (actual processing, not queue wait)
|
||||
ocr_too_slow = ocr_time_ms > 10000 # 10s threshold for actual OCR
|
||||
|
||||
if has_errors or ocr_too_slow:
|
||||
print(f"\n{'='*60}")
|
||||
print(f"STOP: Issue detected on {expected['filename']}")
|
||||
print(f"{'='*60}")
|
||||
if ocr_too_slow:
|
||||
print(f" SLOW: ocr_time={ocr_time_ms}ms > 10000ms threshold")
|
||||
if has_errors:
|
||||
print(f" ERRORS: {comparison['errors']}")
|
||||
print(f"\n Full timing breakdown:")
|
||||
print(f" ocr_time_ms: {ocr_time_ms}ms (actual OCR engine time)")
|
||||
print(f" queue_wait_ms: {queue_wait_ms}ms (waiting in queue)")
|
||||
print(f" processing_time_ms: {processing_time_ms}ms (job total)")
|
||||
print(f" wall_time: {wall_ms:.0f}ms (client-side)")
|
||||
print(f" overhead: {overhead_ms:.0f}ms (network + polling)")
|
||||
print(f"\n Full extraction:")
|
||||
print(json.dumps(extracted, indent=4, default=str))
|
||||
sys.exit(1)
|
||||
|
||||
except Exception as e:
|
||||
import traceback
|
||||
print(f" [ERROR] {str(e)}")
|
||||
if args.verbose:
|
||||
traceback.print_exc()
|
||||
results.append({
|
||||
'receipt_id': expected['id'],
|
||||
'filename': expected['filename'],
|
||||
'status': 'error',
|
||||
'error': str(e),
|
||||
})
|
||||
|
||||
# Calculate statistics
|
||||
completed_results = [r for r in results if r.get('status') == 'completed']
|
||||
total_tests = len(completed_results)
|
||||
|
||||
if total_tests == 0:
|
||||
print("\nNo tests completed successfully!")
|
||||
sys.exit(1)
|
||||
|
||||
perfect_matches = len([r for r in completed_results
|
||||
if r['total_match'] and r['date_match'] and r['cui_match'] and r['tva_match']])
|
||||
total_match_rate = len([r for r in completed_results if r['total_match']]) / total_tests
|
||||
date_match_rate = len([r for r in completed_results if r['date_match']]) / total_tests
|
||||
cui_match_rate = len([r for r in completed_results if r['cui_match']]) / total_tests
|
||||
tva_match_rate = len([r for r in completed_results if r['tva_match']]) / total_tests
|
||||
avg_confidence = sum(r['confidence'] for r in completed_results) / total_tests
|
||||
avg_processing_time = sum(r['processing_time_ms'] for r in completed_results) / total_tests
|
||||
|
||||
# Print summary
|
||||
print(f"\n{'='*60}")
|
||||
print("OCR VALIDATION SUMMARY")
|
||||
print(f"{'='*60}")
|
||||
print(f"Total Receipts Tested: {total_tests}")
|
||||
print(f"Perfect Matches: {perfect_matches} ({perfect_matches/total_tests*100:.1f}%)")
|
||||
print("---")
|
||||
print(f"Total Amount Match Rate: {total_match_rate*100:.1f}%")
|
||||
print(f"Date Match Rate: {date_match_rate*100:.1f}%")
|
||||
print(f"CUI Match Rate: {cui_match_rate*100:.1f}%")
|
||||
print(f"TVA Match Rate: {tva_match_rate*100:.1f}%")
|
||||
print("---")
|
||||
print(f"Average Confidence: {avg_confidence*100:.1f}%")
|
||||
print(f"Average Processing Time: {avg_processing_time:.0f}ms")
|
||||
print(f"{'='*60}")
|
||||
|
||||
# Failed receipts
|
||||
failed_results = [r for r in completed_results if r.get('errors')]
|
||||
if failed_results:
|
||||
print(f"\nFAILED RECEIPTS ({len(failed_results)}):")
|
||||
for r in failed_results:
|
||||
print(f" - {r['filename']}: {'; '.join(r['errors'])}")
|
||||
|
||||
# Categorize problems for analysis
|
||||
problems_analysis = {
|
||||
'cui_issues': [],
|
||||
'tva_issues': [],
|
||||
'total_issues': [],
|
||||
'date_issues': [],
|
||||
'confidence_issues': [],
|
||||
}
|
||||
|
||||
for r in completed_results:
|
||||
# CUI issues
|
||||
if not r.get('cui_match'):
|
||||
cui_expected = normalize_cui(r.get('cui_expected'))
|
||||
cui_got = normalize_cui(r.get('cui_extracted'))
|
||||
issue_type = 'missing' if not cui_got else 'mismatch'
|
||||
|
||||
# Check if it's a digit substitution (same length, 1-2 chars different)
|
||||
if cui_expected and cui_got and len(cui_expected) == len(cui_got):
|
||||
diff_count = sum(1 for a, b in zip(cui_expected, cui_got) if a != b)
|
||||
if diff_count <= 2:
|
||||
issue_type = 'digit_substitution'
|
||||
|
||||
problems_analysis['cui_issues'].append({
|
||||
'file': r['filename'],
|
||||
'expected': r.get('cui_expected'),
|
||||
'got': r.get('cui_extracted'),
|
||||
'type': issue_type,
|
||||
'confidence': r.get('confidence', 0),
|
||||
})
|
||||
|
||||
# TVA issues
|
||||
if not r.get('tva_match'):
|
||||
tva_expected = r.get('tva_expected', 0)
|
||||
tva_got = r.get('tva_extracted')
|
||||
issue_type = 'missing' if tva_got is None else 'mismatch'
|
||||
|
||||
# Check for 5% rate (books)
|
||||
if tva_expected and tva_expected > 0:
|
||||
full_ext = r.get('full_extraction', {})
|
||||
total = full_ext.get('amount')
|
||||
if total and tva_expected:
|
||||
try:
|
||||
implied_rate = float(tva_expected) / float(total) * 100
|
||||
if 4 <= implied_rate <= 6:
|
||||
issue_type = 'low_rate_5pct'
|
||||
except:
|
||||
pass
|
||||
|
||||
problems_analysis['tva_issues'].append({
|
||||
'file': r['filename'],
|
||||
'expected': tva_expected,
|
||||
'got': tva_got,
|
||||
'type': issue_type,
|
||||
'tva_entries': r.get('full_extraction', {}).get('tva_entries', []),
|
||||
})
|
||||
|
||||
# TOTAL issues
|
||||
if not r.get('total_match'):
|
||||
problems_analysis['total_issues'].append({
|
||||
'file': r['filename'],
|
||||
'expected': r.get('total_expected'),
|
||||
'got': r.get('total_extracted'),
|
||||
'confidence': r.get('confidence', 0),
|
||||
'payment_methods': r.get('full_extraction', {}).get('payment_methods', []),
|
||||
})
|
||||
|
||||
# DATE issues
|
||||
if not r.get('date_match'):
|
||||
problems_analysis['date_issues'].append({
|
||||
'file': r['filename'],
|
||||
'expected': r.get('date_expected'),
|
||||
'got': r.get('date_extracted'),
|
||||
})
|
||||
|
||||
# Low confidence issues
|
||||
if r.get('confidence', 0) < 0.7:
|
||||
problems_analysis['confidence_issues'].append({
|
||||
'file': r['filename'],
|
||||
'confidence': r.get('confidence', 0),
|
||||
'errors': r.get('errors', []),
|
||||
})
|
||||
|
||||
# Save detailed report
|
||||
report = {
|
||||
'test_date': datetime.now().isoformat(),
|
||||
'engine': args.engine,
|
||||
'statistics': {
|
||||
'total_tests': total_tests,
|
||||
'perfect_matches': perfect_matches,
|
||||
'perfect_match_rate': perfect_matches / total_tests,
|
||||
'total_match_rate': total_match_rate,
|
||||
'date_match_rate': date_match_rate,
|
||||
'cui_match_rate': cui_match_rate,
|
||||
'tva_match_rate': tva_match_rate,
|
||||
'avg_confidence': avg_confidence,
|
||||
'avg_processing_time_ms': avg_processing_time,
|
||||
},
|
||||
'problems_analysis': problems_analysis,
|
||||
'failed_receipts': [
|
||||
{'filename': r['filename'], 'errors': r['errors']}
|
||||
for r in failed_results
|
||||
],
|
||||
'detailed_results': results,
|
||||
}
|
||||
|
||||
# Save report with engine name in filename
|
||||
report_path = script_dir / f'ocr_report_{args.engine.replace("-", "_")}_FULL.json'
|
||||
with open(report_path, 'w') as f:
|
||||
json.dump(report, f, indent=2, default=str)
|
||||
print(f"\nReport saved to: {report_path}")
|
||||
|
||||
# Exit with error if match rates are below threshold
|
||||
if total_match_rate < 0.8:
|
||||
print(f"\n[FAIL] Total match rate {total_match_rate*100:.1f}% is below 80% threshold")
|
||||
sys.exit(1)
|
||||
|
||||
print("\n[PASS] OCR validation completed successfully!")
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
135
tests/ocr-validation/test_receipts_parallel.py
Normal file
135
tests/ocr-validation/test_receipts_parallel.py
Normal file
@@ -0,0 +1,135 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Test receipts in PARALLEL to measure real worker benefit."""
|
||||
|
||||
import json
|
||||
import os
|
||||
import time
|
||||
import requests
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
from datetime import datetime, timedelta
|
||||
from jose import jwt
|
||||
|
||||
# Locations are derived from this script's own path instead of one developer's
# worktree, so the test runs from any checkout. Each value can be overridden
# through an environment variable.
_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
_REPO_ROOT = os.path.dirname(os.path.dirname(_SCRIPT_DIR))

API_BASE = os.getenv("OCR_API_BASE", "http://localhost:8000")
PDF_FOLDER = os.getenv("OCR_PDF_FOLDER", os.path.join(_REPO_ROOT, "docs", "data-entry"))
EXPECTED_FILE = os.getenv("OCR_EXPECTED_FILE", os.path.join(_SCRIPT_DIR, "expected_receipts.json"))
|
||||
|
||||
def get_jwt_token():
    """Build a signed HS256 access token for the hard-coded test user.

    The signing key comes from the ``JWT_SECRET_KEY`` environment variable
    (with a placeholder fallback); the token expires one hour after issue.
    """
    issued_at = datetime.utcnow()
    claims = {
        "username": "MARIUS",
        "user_id": 1,
        "companies": ["604"],
        "permissions": ["read", "write"],
        "exp": issued_at + timedelta(hours=1),
        "iat": issued_at,
        "type": "access",
    }
    signing_key = os.getenv('JWT_SECRET_KEY', 'generate_with_secrets_token_urlsafe_32')
    return jwt.encode(claims, signing_key, algorithm="HS256")
|
||||
|
||||
def submit_job(pdf_path, headers):
    """Upload one PDF to the OCR endpoint and return without waiting for OCR.

    Args:
        pdf_path: Path of the PDF file to upload.
        headers: HTTP headers (carrying the bearer token) sent with the request.

    Returns:
        A ``(job_id, filename, error)`` tuple. ``job_id`` is ``None`` when the
        submission failed, in which case ``error`` describes why.
    """
    filename = os.path.basename(pdf_path)
    try:
        with open(pdf_path, "rb") as f:
            files = {"file": (filename, f, "application/pdf")}
            response = requests.post(
                f"{API_BASE}/api/data-entry/ocr/extract?engine=doctr_plus",
                files=files, headers=headers, timeout=30
            )
        if response.status_code != 200:
            # Include the start of the body so server-side validation errors are visible.
            return None, filename, f"HTTP {response.status_code}: {response.text[:100]}"
        job_id = response.json().get("job_id")
        if not job_id:
            # A 200 without a job id would otherwise be reported with error=None.
            return None, filename, "response contained no job_id"
        return job_id, filename, None
    except Exception as e:
        # Best-effort boundary: any I/O or decoding problem is reported to the caller.
        return None, filename, str(e)
|
||||
|
||||
def wait_for_job(job_id, filename, headers, timeout=180):
    """Poll the job's wait endpoint until it finishes or ``timeout`` seconds pass.

    Args:
        job_id: Identifier returned when the job was submitted.
        filename: Name of the submitted file (accepted for call-site symmetry;
            not used here).
        headers: HTTP headers sent with every poll.
        timeout: Overall time budget in seconds.

    Returns:
        ``{"success": True, "conf": ..., "time": ...}`` on completion, otherwise
        ``{"success": False, "error": ..., "time": ...}`` where ``error`` is the
        server-reported error or ``"timeout"``.
    """
    start = time.time()
    while time.time() - start < timeout:
        try:
            resp = requests.get(
                f"{API_BASE}/api/data-entry/ocr/jobs/{job_id}/wait?timeout=30",
                headers=headers, timeout=35
            )
            if resp.status_code == 200:
                data = resp.json()
                status = data.get("status")
                if status == "completed":
                    result = data.get("result") or {}
                    # Fall back to 0 so callers can always format the confidence.
                    conf = result.get("overall_confidence") or 0
                    return {"success": True, "conf": conf, "time": time.time() - start}
                # Both terminal failure spellings are accepted; treating only
                # "error" as terminal would spin until the timeout on "failed".
                if status in ("error", "failed"):
                    return {"success": False, "error": data.get("error", "unknown"), "time": time.time() - start}
            time.sleep(1)
        except Exception:
            # Transient network/decoding problems: back off briefly and retry.
            time.sleep(1)
    return {"success": False, "error": "timeout", "time": time.time() - start}
|
||||
|
||||
def main():
    """Submit every expected receipt to the OCR API, then await all jobs in parallel.

    Phase 1 posts each PDF listed in ``EXPECTED_FILE`` that exists under
    ``PDF_FOLDER``; phase 2 waits on every accepted job from a thread pool.
    Per-job outcomes and a timing summary are printed to stdout.
    """
    # Load receipts; the file may be a bare list or a dict with a "receipts" key.
    with open(EXPECTED_FILE, encoding="utf-8") as f:
        data = json.load(f)
    receipts = data.get("receipts", data) if isinstance(data, dict) else data
    # NOTE(review): another OCR validation script in this commit filters
    # multi-page entries on a 'page' key - confirm 'pages' is the intended field.
    receipts = [r for r in receipts if r.get("pages", 1) == 1]

    token = get_jwt_token()
    headers = {"Authorization": f"Bearer {token}"}

    print(f"\n{'='*60}")
    print(f"PARALLEL TEST: {len(receipts)} receipts")
    print(f"{'='*60}\n")

    # PHASE 1: Submit ALL jobs rapidly
    print("Phase 1: Submitting all jobs...")
    total_start = time.time()
    jobs = []

    for r in receipts:
        pdf_path = os.path.join(PDF_FOLDER, r["filename"])
        if not os.path.exists(pdf_path):
            # Report instead of silently shrinking the test set.
            print(f"  File not found: {r['filename']}")
            continue
        job_id, filename, error = submit_job(pdf_path, headers)
        if job_id:
            jobs.append((job_id, filename))
        else:
            print(f"  Submit failed: {filename} - {error}")

    submit_time = time.time() - total_start
    print(f"Submitted {len(jobs)} jobs in {submit_time:.1f}s")

    # PHASE 2: Wait for ALL results in parallel
    print("\nPhase 2: Waiting for results...")
    wait_start = time.time()
    results = []

    with ThreadPoolExecutor(max_workers=26) as executor:
        futures = {executor.submit(wait_for_job, job_id, fn, headers): fn
                   for job_id, fn in jobs}

        for future in as_completed(futures):
            filename = futures[future]
            result = future.result()
            result["filename"] = filename
            results.append(result)

            if result["success"]:
                print(f"  OK: {filename[:45]:47} {result['time']:5.1f}s  conf={result['conf']:.0%}")
            else:
                print(f"  ERR: {filename[:45]:47} {result['time']:5.1f}s  {result.get('error','?')}")

    # Take both measurements at the same moment so the phases add up.
    now = time.time()
    total_time = now - total_start
    wait_time = now - wait_start

    # Summary
    print(f"\n{'='*60}")
    print("SUMMARY")
    print(f"{'='*60}")
    successful = [r for r in results if r["success"]]
    failed = [r for r in results if not r["success"]]

    print(f"Success: {len(successful)}/{len(results)}")
    print(f"Submit phase: {submit_time:.1f}s")
    print(f"Wait phase: {wait_time:.1f}s")
    print(f"TOTAL TIME: {total_time:.1f}s")

    if successful:
        times = [r["time"] for r in successful]
        print(f"\nPer-job: avg={sum(times)/len(times):.1f}s, min={min(times):.1f}s, max={max(times):.1f}s")

    if failed:
        print(f"\nFailed jobs ({len(failed)}):")
        for r in failed:
            print(f"  - {r['filename']}: {r.get('error', '?')}")
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
314
tests/ocr-validation/test_receipts_parallel_windows.py
Normal file
314
tests/ocr-validation/test_receipts_parallel_windows.py
Normal file
@@ -0,0 +1,314 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Parallel OCR test for Windows.
|
||||
Run from backend directory: python tests/ocr-validation/test_receipts_parallel_windows.py
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
import threading
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
from datetime import datetime, timedelta
|
||||
from pathlib import Path
|
||||
|
||||
import requests
|
||||
from jose import jwt
|
||||
|
||||
try:
|
||||
import psutil
|
||||
PSUTIL_AVAILABLE = True
|
||||
except ImportError:
|
||||
PSUTIL_AVAILABLE = False
|
||||
print("Warning: psutil not installed, memory tracking disabled")
|
||||
|
||||
# Paths - resolved from this script's location, not the current working directory
|
||||
SCRIPT_DIR = Path(__file__).parent
|
||||
BACKEND_DIR = SCRIPT_DIR.parent.parent / "backend"
|
||||
PDF_FOLDER = SCRIPT_DIR.parent.parent / "docs" / "data-entry"
|
||||
EXPECTED_FILE = SCRIPT_DIR / "expected_receipts.json"
|
||||
|
||||
|
||||
class MemoryMonitor:
    """Track peak resident memory of the process listening on a port, plus its children.

    A background thread samples the summed RSS of the listening process and
    all of its descendants every 0.5 s and remembers the highest value seen.
    When psutil is unavailable the monitor is a no-op and reports 0.
    """

    def __init__(self, port=8006):
        """Prepare a monitor for the process listening on ``port``."""
        self.port = port
        self.peak_memory_mb = 0
        self.current_memory_mb = 0
        self._stop_event = threading.Event()
        self._thread = None
        self._process = None

    def _find_backend_process(self):
        """Return the ``psutil.Process`` listening on ``self.port``, or ``None``."""
        if not PSUTIL_AVAILABLE:
            return None
        try:
            for conn in psutil.net_connections(kind='inet'):
                # Skip sockets without a local address.
                if not conn.laddr:
                    continue
                if conn.laddr.port != self.port or conn.status != 'LISTEN':
                    continue
                # psutil reports pid=None when the owner cannot be determined;
                # psutil.Process(None) would silently resolve to *this* script.
                if conn.pid is None:
                    continue
                return psutil.Process(conn.pid)
        except (psutil.AccessDenied, psutil.NoSuchProcess):
            pass
        return None

    def _get_total_memory(self):
        """Return RSS in MB of the backend plus all child processes, or 0 if unknown."""
        if not self._process:
            self._process = self._find_backend_process()
        if not self._process:
            return 0
        try:
            # Memory of the main process
            total = self._process.memory_info().rss
            # Add memory of all child processes (OCR workers)
            for child in self._process.children(recursive=True):
                try:
                    total += child.memory_info().rss
                except (psutil.NoSuchProcess, psutil.AccessDenied):
                    # A worker may exit between enumeration and sampling.
                    pass
            return total / (1024 * 1024)  # Convert to MB
        except (psutil.NoSuchProcess, psutil.AccessDenied):
            # Forget the handle so the next sample looks the process up again.
            self._process = None
            return 0

    def _monitor_loop(self):
        """Sample memory every 0.5 s until the stop event is set."""
        while not self._stop_event.is_set():
            mem = self._get_total_memory()
            if mem > 0:
                self.current_memory_mb = mem
                if mem > self.peak_memory_mb:
                    self.peak_memory_mb = mem
            self._stop_event.wait(0.5)

    def start(self):
        """Start sampling in a daemon thread (no-op without psutil)."""
        if not PSUTIL_AVAILABLE:
            return
        self._stop_event.clear()
        self._thread = threading.Thread(target=self._monitor_loop, daemon=True)
        self._thread.start()
        # Wait a bit to get an initial reading
        time.sleep(1)

    def stop(self):
        """Stop sampling and return the peak memory observed, in MB."""
        if self._thread:
            self._stop_event.set()
            self._thread.join(timeout=2)
        return self.peak_memory_mb
|
||||
|
||||
|
||||
def get_jwt_token():
    """Return an HS256-signed access token for the fixed test user.

    The key is taken from ``JWT_SECRET_KEY`` (placeholder fallback) and the
    token is valid for one hour from the moment of creation.
    """
    created = datetime.utcnow()
    key = os.getenv('JWT_SECRET_KEY', 'generate_with_secrets_token_urlsafe_32')
    return jwt.encode(
        {
            "username": "MARIUS",
            "user_id": 1,
            "companies": ["604"],
            "permissions": ["read", "write"],
            "exp": created + timedelta(hours=1),
            "iat": created,
            "type": "access",
        },
        key,
        algorithm="HS256",
    )
|
||||
|
||||
|
||||
def submit_job(pdf_path, headers, api_base):
    """Post one PDF to the OCR extract endpoint and hand back the job id at once.

    Returns a ``(job_id, filename, error)`` tuple; on any failure ``job_id``
    is ``None`` and ``error`` carries the HTTP status/body or exception text.
    """
    filename = os.path.basename(pdf_path)
    endpoint = f"{api_base}/api/data-entry/ocr/extract?engine=doctr_plus"
    try:
        with open(pdf_path, "rb") as pdf:
            upload = {"file": (filename, pdf, "application/pdf")}
            response = requests.post(endpoint, files=upload, headers=headers, timeout=30)
        if response.status_code != 200:
            return None, filename, f"HTTP {response.status_code}: {response.text[:100]}"
        return response.json().get("job_id"), filename, None
    except Exception as exc:
        return None, filename, str(exc)
|
||||
|
||||
|
||||
def wait_for_job(job_id, filename, headers, api_base, timeout=180):
    """Poll the job's wait endpoint until a terminal status or the time budget ends.

    Returns a dict with ``success``, ``time`` and ``filename``, plus ``conf``
    on completion or ``error`` (server message or ``"timeout"``) otherwise.
    """
    began = time.time()
    url = f"{api_base}/api/data-entry/ocr/jobs/{job_id}/wait?timeout=30"
    while time.time() - began < timeout:
        try:
            reply = requests.get(url, headers=headers, timeout=35)
            if reply.status_code == 200:
                payload = reply.json()
                state = payload.get("status")
                if state == "completed":
                    confidence = payload.get("result", {}).get("overall_confidence", 0)
                    return {"success": True, "conf": confidence, "time": time.time() - began, "filename": filename}
                if state in ("error", "failed"):
                    return {"success": False, "error": payload.get("error", "unknown"), "time": time.time() - began, "filename": filename}
            time.sleep(1)
        except Exception:
            # Any request/decoding failure is retried after a short pause.
            time.sleep(1)
    return {"success": False, "error": "timeout", "time": time.time() - began, "filename": filename}
|
||||
|
||||
|
||||
def run_test(api_base, workers, output_file=None, port=8006):
    """Run one parallel OCR pass and return a summary dict.

    Args:
        api_base: Base URL of the backend, e.g. ``http://localhost:8006``.
        workers: Number of OCR workers the backend runs; used only to label
            the output.
        output_file: Optional JSON file; the summary is appended to the list
            stored there.
        port: Port of the backend process whose memory is monitored.

    Returns:
        The summary dict (timings, counts, average confidence, peak memory),
        or ``None`` when the expected-receipts file is missing.
    """
    # Load receipts
    if not EXPECTED_FILE.exists():
        print(f"ERROR: {EXPECTED_FILE} not found!")
        return None

    with open(EXPECTED_FILE, encoding="utf-8") as f:
        data = json.load(f)
    # The file may be a bare list or a dict with a "receipts" key.
    receipts = data.get("receipts", data) if isinstance(data, dict) else data
    receipts = [r for r in receipts if r.get("pages", 1) == 1]

    token = get_jwt_token()
    headers = {"Authorization": f"Bearer {token}"}

    # Start memory monitoring
    memory_monitor = MemoryMonitor(port=port)
    memory_monitor.start()

    header = f"TEST: {len(receipts)} receipts, {workers} worker(s)"
    print()
    print("=" * 60)
    print(header)
    print(f"Backend: {api_base}")
    print("=" * 60)
    print()

    # PHASE 1: Submit ALL jobs rapidly
    print("Phase 1: Submitting all jobs...")
    total_start = time.time()
    jobs = []

    for r in receipts:
        pdf_path = PDF_FOLDER / r["filename"]
        if pdf_path.exists():
            job_id, filename, error = submit_job(str(pdf_path), headers, api_base)
            if job_id:
                jobs.append((job_id, filename))
            else:
                print(f"  Submit failed: {filename} - {error}")
        else:
            print(f"  File not found: {r['filename']}")

    submit_time = time.time() - total_start
    print(f"Submitted {len(jobs)} jobs in {submit_time:.1f}s")
    print()

    # PHASE 2: Wait for ALL results in parallel
    print("Phase 2: Waiting for results...")
    wait_start = time.time()
    results = []

    with ThreadPoolExecutor(max_workers=26) as executor:
        futures = {
            executor.submit(wait_for_job, job_id, fn, headers, api_base): fn
            for job_id, fn in jobs
        }

        for future in as_completed(futures):
            result = future.result()
            results.append(result)

            if result["success"]:
                print(f"  OK: {result['filename'][:45]:47} {result['time']:5.1f}s  conf={result['conf']:.0%}")
            else:
                print(f"  ERR: {result['filename'][:45]:47} {result['time']:5.1f}s  {result.get('error', '?')}")

    total_time = time.time() - total_start
    wait_time = time.time() - wait_start

    # Stop memory monitoring and get peak
    peak_memory_mb = memory_monitor.stop()

    # Summary
    print()
    print("=" * 60)
    print(f"SUMMARY - {workers} WORKER(S)")
    print("=" * 60)
    successful = [r for r in results if r["success"]]
    failed = [r for r in results if not r["success"]]

    print(f"Success: {len(successful)}/{len(results)}")
    print(f"Submit phase: {submit_time:.1f}s")
    print(f"Wait phase: {wait_time:.1f}s")
    print(f"TOTAL TIME: {total_time:.1f}s")
    if peak_memory_mb > 0:
        print(f"PEAK MEMORY: {peak_memory_mb:.0f} MB")

    avg_time = sum(r["time"] for r in successful) / len(successful) if successful else 0
    min_time = min(r["time"] for r in successful) if successful else 0
    max_time = max(r["time"] for r in successful) if successful else 0
    avg_conf = sum(r["conf"] for r in successful) / len(successful) if successful else 0

    if successful:
        print(f"\nPer-job: avg={avg_time:.1f}s, min={min_time:.1f}s, max={max_time:.1f}s")

    if failed:
        print(f"\nFailed jobs ({len(failed)}):")
        for r in failed:
            print(f"  - {r['filename']}: {r.get('error', '?')}")

    # Build result dict
    result_data = {
        "workers": workers,
        "total_receipts": len(receipts),
        "submitted": len(jobs),
        "successful": len(successful),
        "failed": len(failed),
        "submit_time": round(submit_time, 1),
        "wait_time": round(wait_time, 1),
        "total_time": round(total_time, 1),
        "avg_time": round(avg_time, 1),
        "min_time": round(min_time, 1),
        "max_time": round(max_time, 1),
        "avg_confidence": round(avg_conf * 100, 1),
        "peak_memory_mb": round(peak_memory_mb, 0),
        "timestamp": datetime.now().isoformat()
    }

    # Write to file if specified
    if output_file:
        # Append to existing results
        all_results = []
        if Path(output_file).exists():
            try:
                with open(output_file, encoding="utf-8") as f:
                    all_results = json.load(f)
            except (OSError, ValueError) as exc:
                # json.JSONDecodeError is a ValueError; start over rather than abort.
                print(f"Warning: could not read {output_file} ({exc}); starting a new results list")
                all_results = []
            if not isinstance(all_results, list):
                # A non-list document cannot be appended to.
                print(f"Warning: {output_file} does not contain a list; starting a new results list")
                all_results = []
        all_results.append(result_data)
        with open(output_file, 'w', encoding="utf-8") as f:
            json.dump(all_results, f, indent=2)
        print(f"\nResults saved to: {output_file}")

    return result_data
|
||||
|
||||
|
||||
def main():
    """Read the command-line options and launch a single parallel OCR run."""
    cli = argparse.ArgumentParser(description="Parallel OCR Test")
    option_specs = (
        ("--port", {"type": int, "default": 8006, "help": "Backend port"}),
        ("--host", {"default": "localhost", "help": "Backend host"}),
        ("--workers", {"type": int, "default": 1, "help": "Number of OCR workers (for labeling)"}),
        ("--output", {"type": str, "help": "Output JSON file for results"}),
    )
    for flag, spec in option_specs:
        cli.add_argument(flag, **spec)
    opts = cli.parse_args()

    # The port is passed twice: once inside the URL, once for memory monitoring.
    run_test(
        f"http://{opts.host}:{opts.port}",
        opts.workers,
        opts.output,
        port=opts.port,
    )
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
228
tests/ocr-validation/test_receipts_sequential.py
Normal file
228
tests/ocr-validation/test_receipts_sequential.py
Normal file
@@ -0,0 +1,228 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Test each receipt sequentially and report results."""
|
||||
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
import requests
|
||||
from datetime import datetime, timedelta
|
||||
from jose import jwt
|
||||
|
||||
# Locations are derived from this script's own path instead of one developer's
# worktree, so the test runs from any checkout. Each value can be overridden
# through an environment variable.
_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
_REPO_ROOT = os.path.dirname(os.path.dirname(_SCRIPT_DIR))

API_BASE = os.getenv("OCR_API_BASE", "http://localhost:8000")
PDF_FOLDER = os.getenv("OCR_PDF_FOLDER", os.path.join(_REPO_ROOT, "docs", "data-entry"))
EXPECTED_FILE = os.getenv("OCR_EXPECTED_FILE", os.path.join(_SCRIPT_DIR, "expected_receipts.json"))
|
||||
|
||||
def get_jwt_token():
    """Create a test JWT (HS256, one-hour lifetime) for API authentication."""
    signing_key = os.getenv('JWT_SECRET_KEY', 'generate_with_secrets_token_urlsafe_32')
    issued = datetime.utcnow()

    claims = {
        "username": "MARIUS",
        "user_id": 1,
        "companies": ["604"],
        "permissions": ["read", "write"],
        "exp": issued + timedelta(hours=1),
        "iat": issued,
        "type": "access",
    }
    token = jwt.encode(claims, signing_key, algorithm="HS256")
    return token
|
||||
|
||||
def test_receipt(pdf_path: str, expected: dict, headers: dict) -> dict:
    """Run one receipt through the OCR API and compare it with expected values.

    Args:
        pdf_path: Path of the receipt PDF to upload.
        expected: Expected values for this receipt; the keys ``total``,
            ``data_bon`` and ``cui_furnizor`` are read.
        headers: HTTP headers (carrying the bearer token) for every request.

    Returns:
        A dict with ``filename``, ``success``, ``time_ms``, ``error``,
        ``extracted``, ``matches`` and ``issues``. ``success`` is ``True`` only
        when the job completed and no compared field mismatched.
    """
    filename = os.path.basename(pdf_path)
    result = {
        "filename": filename,
        "success": False,
        "time_ms": 0,
        "error": None,
        "extracted": {},
        "matches": {},
        "issues": []
    }

    start_time = time.time()
    # Defined up front so the timeout check below still works when no poll
    # ever returns HTTP 200 (previously an unbound-variable error).
    job_result = {}

    try:
        with open(pdf_path, "rb") as f:
            files = {"file": (filename, f, "application/pdf")}
            response = requests.post(
                f"{API_BASE}/api/data-entry/ocr/extract?engine=doctr_plus",
                files=files,
                headers=headers,
                timeout=120
            )

        if response.status_code != 200:
            result["error"] = f"HTTP {response.status_code}"
            result["time_ms"] = int((time.time() - start_time) * 1000)
            return result

        job_data = response.json()
        job_id = job_data.get("job_id")
        if not job_id:
            # Without an id there is nothing to poll.
            result["error"] = "No job_id in response"
            result["time_ms"] = int((time.time() - start_time) * 1000)
            return result

        # Poll for completion: at most 60 requests.
        # NOTE(review): each request passes timeout=30, presumably a server-side
        # long poll, so the overall bound can exceed two minutes - confirm.
        for _ in range(60):
            poll_response = requests.get(
                f"{API_BASE}/api/data-entry/ocr/jobs/{job_id}/wait?timeout=30",
                headers=headers,
                timeout=35
            )
            if poll_response.status_code == 200:
                job_result = poll_response.json()
                status = job_result.get("status")
                if status == "completed":
                    break
                # Accept both terminal failure spellings instead of polling on.
                if status in ("error", "failed"):
                    result["error"] = job_result.get("error", "Unknown error")
                    result["time_ms"] = int((time.time() - start_time) * 1000)
                    return result
            time.sleep(1)

        result["time_ms"] = int((time.time() - start_time) * 1000)

        if job_result.get("status") != "completed":
            result["error"] = f"Timeout - status: {job_result.get('status')}"
            return result

        # Extract fields (correct field names from API)
        extraction = job_result.get("result") or {}
        result["extracted"] = {
            "total": extraction.get("amount"),  # API uses "amount" not "total"
            "date": extraction.get("receipt_date"),  # API uses "receipt_date" not "date"
            "cui": extraction.get("cui"),
            "tva_total": extraction.get("tva_total"),
            "confidence": extraction.get("overall_confidence")
        }

        # Compare with expected (use correct field names from expected_receipts.json)
        exp_total = expected.get("total")
        exp_date = expected.get("data_bon")
        exp_cui = expected.get("cui_furnizor")

        # Normalize for comparison
        def normalize_total(val):
            """Parse an amount, accepting a decimal comma; ``None`` stays ``None``."""
            if val is None:
                return None
            return float(str(val).replace(',', '.'))

        def normalize_cui(val):
            """Upper-case the CUI and drop the 'RO' prefix and spaces."""
            if val is None:
                return None
            return str(val).upper().replace('RO', '').replace(' ', '').strip()

        ext_total = normalize_total(result["extracted"]["total"])
        ext_cui = normalize_cui(result["extracted"]["cui"])
        exp_cui_norm = normalize_cui(exp_cui)
        exp_total_norm = normalize_total(exp_total)

        # A match value of None means "not comparable" (missing on either side).
        result["matches"]["total"] = abs(ext_total - exp_total_norm) < 0.01 if ext_total and exp_total_norm else None
        result["matches"]["date"] = result["extracted"]["date"] == exp_date if exp_date else None
        result["matches"]["cui"] = ext_cui == exp_cui_norm if exp_cui else None

        # Check for issues
        if exp_total and not result["matches"]["total"]:
            result["issues"].append(f"TOTAL: got {result['extracted']['total']}, expected {exp_total}")
        if exp_date and not result["matches"]["date"]:
            result["issues"].append(f"DATE: got {result['extracted']['date']}, expected {exp_date}")
        if exp_cui and not result["matches"]["cui"]:
            result["issues"].append(f"CUI: got {result['extracted']['cui']}, expected {exp_cui}")

        result["success"] = len(result["issues"]) == 0

    except Exception as e:
        # Boundary for one receipt: record the failure and let the run continue.
        result["error"] = str(e)
        result["time_ms"] = int((time.time() - start_time) * 1000)

    return result
|
||||
|
||||
def main():
    """Run the doctr_plus OCR validation over the single-page expected receipts.

    Loads the expected values from ``EXPECTED_FILE``, obtains a JWT token,
    sends every receipt PDF found in ``PDF_FOLDER`` through ``test_receipt``
    and prints one status line per receipt followed by a summary (counts,
    timing, slow / failed / errored receipts).

    Receipts whose PDF is missing are reported as SKIP and are not counted
    in the summary. Exits with status 1 when no JWT token can be obtained.
    """
    # Load expected data. JSON is UTF-8; do not depend on the locale encoding.
    with open(EXPECTED_FILE, encoding="utf-8") as f:
        expected_data = json.load(f)

    # Handle both formats: list or dict with "receipts" key
    if isinstance(expected_data, dict) and "receipts" in expected_data:
        all_receipts = expected_data["receipts"]
    else:
        all_receipts = expected_data

    # Get JWT token
    token = get_jwt_token()
    if not token:
        print("ERROR: Could not get JWT token")
        sys.exit(1)

    headers = {"Authorization": f"Bearer {token}"}

    # Filter single-page receipts (entries without a "pages" key count as one page)
    receipts = [r for r in all_receipts if r.get("pages", 1) == 1]
    print(f"\n{'='*60}")
    print(f"Testing {len(receipts)} single-page receipts with doctr_plus")
    print(f"{'='*60}\n")

    results = []
    times = []  # Durations of receipts that were processed without an error

    for i, receipt in enumerate(receipts, 1):
        filename = receipt["filename"]
        pdf_path = os.path.join(PDF_FOLDER, filename)

        if not os.path.exists(pdf_path):
            print(f"[{i:02d}/{len(receipts)}] SKIP: {filename} (not found)")
            continue

        print(f"[{i:02d}/{len(receipts)}] Testing: {filename[:50]}...", end=" ", flush=True)

        result = test_receipt(pdf_path, receipt, headers)
        results.append(result)

        if result["error"]:
            print(f"ERROR ({result['time_ms']}ms): {result['error']}")
        elif result["success"]:
            # The key can be present with a None value, which ".2f" cannot
            # format; fall back to 0 in that case as well.
            confidence = result["extracted"].get("confidence") or 0
            print(f"OK ({result['time_ms']}ms) conf={confidence:.2f}")
            times.append(result["time_ms"])
        else:
            print(f"FAIL ({result['time_ms']}ms): {'; '.join(result['issues'])}")
            times.append(result["time_ms"])

    # Summary
    print(f"\n{'='*60}")
    print("SUMMARY")
    print(f"{'='*60}")

    successful = [r for r in results if r["success"]]
    failed = [r for r in results if not r["success"] and not r["error"]]
    errors = [r for r in results if r["error"]]

    # Nothing may have been tested (every PDF missing, or an empty expected
    # list); avoid dividing by zero in that case.
    success_pct = len(successful) * 100 / len(results) if results else 0.0

    print(f"Total: {len(results)}")
    print(f"Success: {len(successful)} ({success_pct:.1f}%)")
    print(f"Failed: {len(failed)}")
    print(f"Errors: {len(errors)}")

    if times:
        avg_time = sum(times) / len(times)
        print(f"\nTiming: avg={avg_time:.0f}ms, min={min(times)}ms, max={max(times)}ms")

        # Flag slow ones: more than twice the average duration
        slow_threshold = avg_time * 2
        slow = [r for r in results if r["time_ms"] > slow_threshold and not r["error"]]
        if slow:
            print(f"\nSlow receipts (>{slow_threshold:.0f}ms):")
            for r in slow:
                print(f" - {r['filename']}: {r['time_ms']}ms")

    if failed:
        print("\nFailed receipts:")
        for r in failed:
            print(f" - {r['filename']}: {'; '.join(r['issues'])}")

    if errors:
        print("\nError receipts:")
        for r in errors:
            print(f" - {r['filename']}: {r['error']}")
|
||||
|
||||
# Script entry point: run the validation only when this file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user