refactor(docs): consolidate and cleanup documentation
- Delete 9 deprecated/obsolete docs (~6,300 lines removed)
- Move test PDFs to tests/fixtures/ocr-samples/
- Create docs/DEPLOYMENT.md as principal guide
- Create tests/ocr-validation/README.md
- Update all refs for ultrathin monolith architecture
- Update OCR tests to use relative paths

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
BIN
tests/fixtures/ocr-samples/Lidl papetarie 604 fara TVA. nu are cod fiscal.pdf
vendored
Normal file
BIN
tests/fixtures/ocr-samples/Lidl papetarie 604 fara TVA. nu are cod fiscal.pdf
vendored
Normal file
Binary file not shown.
1784
tests/fixtures/ocr-samples/Lidl personal 4 ianuarie .pdf
vendored
Normal file
1784
tests/fixtures/ocr-samples/Lidl personal 4 ianuarie .pdf
vendored
Normal file
File diff suppressed because it is too large
Load Diff
BIN
tests/fixtures/ocr-samples/abonament kineterra.pdf
vendored
Normal file
BIN
tests/fixtures/ocr-samples/abonament kineterra.pdf
vendored
Normal file
Binary file not shown.
BIN
tests/fixtures/ocr-samples/benzina 07 aug. 2024.pdf
vendored
Normal file
BIN
tests/fixtures/ocr-samples/benzina 07 aug. 2024.pdf
vendored
Normal file
Binary file not shown.
BIN
tests/fixtures/ocr-samples/benzina 10 mai 2025.pdf
vendored
Normal file
BIN
tests/fixtures/ocr-samples/benzina 10 mai 2025.pdf
vendored
Normal file
Binary file not shown.
BIN
tests/fixtures/ocr-samples/benzina 13 iulie.pdf
vendored
Normal file
BIN
tests/fixtures/ocr-samples/benzina 13 iulie.pdf
vendored
Normal file
Binary file not shown.
BIN
tests/fixtures/ocr-samples/benzina 13 septembrie .pdf
vendored
Normal file
BIN
tests/fixtures/ocr-samples/benzina 13 septembrie .pdf
vendored
Normal file
Binary file not shown.
BIN
tests/fixtures/ocr-samples/benzina 14 august.pdf
vendored
Normal file
BIN
tests/fixtures/ocr-samples/benzina 14 august.pdf
vendored
Normal file
Binary file not shown.
BIN
tests/fixtures/ocr-samples/benzina 20 dec.pdf
vendored
Normal file
BIN
tests/fixtures/ocr-samples/benzina 20 dec.pdf
vendored
Normal file
Binary file not shown.
BIN
tests/fixtures/ocr-samples/benzina 27 octombrie .pdf
vendored
Normal file
BIN
tests/fixtures/ocr-samples/benzina 27 octombrie .pdf
vendored
Normal file
Binary file not shown.
BIN
tests/fixtures/ocr-samples/best print stampila .pdf
vendored
Normal file
BIN
tests/fixtures/ocr-samples/best print stampila .pdf
vendored
Normal file
Binary file not shown.
BIN
tests/fixtures/ocr-samples/bon fiscal Dedeman - efactura.pdf
vendored
Normal file
BIN
tests/fixtures/ocr-samples/bon fiscal Dedeman - efactura.pdf
vendored
Normal file
Binary file not shown.
BIN
tests/fixtures/ocr-samples/brick consumabil 604 50% deductibil 22 dec.pdf
vendored
Normal file
BIN
tests/fixtures/ocr-samples/brick consumabil 604 50% deductibil 22 dec.pdf
vendored
Normal file
Binary file not shown.
BIN
tests/fixtures/ocr-samples/brick consumabile 604 22 dec.pdf
vendored
Normal file
BIN
tests/fixtures/ocr-samples/brick consumabile 604 22 dec.pdf
vendored
Normal file
Binary file not shown.
6740
tests/fixtures/ocr-samples/brick igiena 1 sept.pdf
vendored
Normal file
6740
tests/fixtures/ocr-samples/brick igiena 1 sept.pdf
vendored
Normal file
File diff suppressed because it is too large
Load Diff
2552
tests/fixtures/ocr-samples/brick igiena 604.pdf
vendored
Normal file
2552
tests/fixtures/ocr-samples/brick igiena 604.pdf
vendored
Normal file
File diff suppressed because it is too large
Load Diff
2292
tests/fixtures/ocr-samples/brick igiena 8 octombrie 98.95 lei card.pdf
vendored
Normal file
2292
tests/fixtures/ocr-samples/brick igiena 8 octombrie 98.95 lei card.pdf
vendored
Normal file
File diff suppressed because it is too large
Load Diff
2610
tests/fixtures/ocr-samples/brick igiena, electrice consumabile 604.pdf
vendored
Normal file
2610
tests/fixtures/ocr-samples/brick igiena, electrice consumabile 604.pdf
vendored
Normal file
File diff suppressed because it is too large
Load Diff
BIN
tests/fixtures/ocr-samples/electrobering igiena iulie 604.pdf
vendored
Normal file
BIN
tests/fixtures/ocr-samples/electrobering igiena iulie 604.pdf
vendored
Normal file
Binary file not shown.
BIN
tests/fixtures/ocr-samples/electrobering telecomanda.pdf
vendored
Normal file
BIN
tests/fixtures/ocr-samples/electrobering telecomanda.pdf
vendored
Normal file
Binary file not shown.
1370
tests/fixtures/ocr-samples/factura 70005116259 20.09.2025 Dedeman.pdf
vendored
Normal file
1370
tests/fixtures/ocr-samples/factura 70005116259 20.09.2025 Dedeman.pdf
vendored
Normal file
File diff suppressed because it is too large
Load Diff
BIN
tests/fixtures/ocr-samples/gama ink refill toner imprimanta 17 sept 2024.pdf
vendored
Normal file
BIN
tests/fixtures/ocr-samples/gama ink refill toner imprimanta 17 sept 2024.pdf
vendored
Normal file
Binary file not shown.
2086
tests/fixtures/ocr-samples/igiena 11 octombrie .pdf
vendored
Normal file
2086
tests/fixtures/ocr-samples/igiena 11 octombrie .pdf
vendored
Normal file
File diff suppressed because it is too large
Load Diff
2312
tests/fixtures/ocr-samples/igiena 14 decembrie five-holding.pdf
vendored
Normal file
2312
tests/fixtures/ocr-samples/igiena 14 decembrie five-holding.pdf
vendored
Normal file
File diff suppressed because it is too large
Load Diff
BIN
tests/fixtures/ocr-samples/kineterra abonament terapie august 2024.pdf
vendored
Normal file
BIN
tests/fixtures/ocr-samples/kineterra abonament terapie august 2024.pdf
vendored
Normal file
Binary file not shown.
BIN
tests/fixtures/ocr-samples/kineterra fizioterapie 9 sept.pdf
vendored
Normal file
BIN
tests/fixtures/ocr-samples/kineterra fizioterapie 9 sept.pdf
vendored
Normal file
Binary file not shown.
BIN
tests/fixtures/ocr-samples/rechizite 12 decembrie pictus.pdf
vendored
Normal file
BIN
tests/fixtures/ocr-samples/rechizite 12 decembrie pictus.pdf
vendored
Normal file
Binary file not shown.
BIN
tests/fixtures/ocr-samples/stepout-bon1-5.jpg
vendored
Normal file
BIN
tests/fixtures/ocr-samples/stepout-bon1-5.jpg
vendored
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 299 KiB |
BIN
tests/fixtures/ocr-samples/stepout-bon2-5.jpg
vendored
Normal file
BIN
tests/fixtures/ocr-samples/stepout-bon2-5.jpg
vendored
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 323 KiB |
BIN
tests/fixtures/ocr-samples/unlimited duplicat chei 23 mai.pdf
vendored
Normal file
BIN
tests/fixtures/ocr-samples/unlimited duplicat chei 23 mai.pdf
vendored
Normal file
Binary file not shown.
158
tests/ocr-validation/README.md
Normal file
158
tests/ocr-validation/README.md
Normal file
@@ -0,0 +1,158 @@
|
||||
# OCR Validation Tests
|
||||
|
||||
Teste pentru validarea acurateții extragerii OCR din bonuri fiscale.
|
||||
|
||||
---
|
||||
|
||||
## Prerequisites
|
||||
|
||||
1. **Backend-ul trebuie să ruleze** pe `http://localhost:8000`
|
||||
2. **Modulul Data Entry activat** în `.env`: `MODULE_DATA_ENTRY_ENABLED=true`
|
||||
3. **JWT_SECRET_KEY** setat (sau folosește default-ul de test)
|
||||
|
||||
```bash
|
||||
# Pornește backend-ul
|
||||
cd /workspace/roa2web
|
||||
./start-prod.sh
|
||||
# sau
|
||||
./start-test.sh
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Test Files
|
||||
|
||||
| Fișier | Scop |
|
||||
|--------|------|
|
||||
| `expected_receipts.json` | Expected values pentru fiecare bon (ground truth) |
|
||||
| `ocr-direct-validation.py` | Test individual cu comparare detaliată |
|
||||
| `test_receipts_sequential.py` | Rulează toate bonurile secvențial |
|
||||
| `test_receipts_parallel.py` | Rulează toate bonurile în paralel (performance test) |
|
||||
| `test_receipts_parallel_windows.py` | Versiune Windows cu memory tracking |
|
||||
| `get_raw_ocr_text.py` | Debug tool - afișează raw OCR text |
|
||||
|
||||
**Fixtures:** `tests/fixtures/ocr-samples/` - 30 de fișiere cu bonuri fiscale (28 PDF-uri și 2 imagini JPG)
|
||||
|
||||
---
|
||||
|
||||
## Cum să rulezi testele
|
||||
|
||||
### 1. Test Individual (Recomandat pentru debug)
|
||||
|
||||
```bash
|
||||
cd /workspace/roa2web
|
||||
|
||||
# Test toate bonurile cu engine doctr_plus
|
||||
python tests/ocr-validation/ocr-direct-validation.py
|
||||
|
||||
# Test cu engine specific
|
||||
python tests/ocr-validation/ocr-direct-validation.py --engine doctr_plus
|
||||
python tests/ocr-validation/ocr-direct-validation.py --engine tesseract
|
||||
|
||||
# Test doar un bon specific
|
||||
python tests/ocr-validation/ocr-direct-validation.py --receipt receipt_01
|
||||
|
||||
# Include și bonuri multi-page
|
||||
python tests/ocr-validation/ocr-direct-validation.py --include-multipage
|
||||
```
|
||||
|
||||
### 2. Test Secvențial (Toate bonurile, unul câte unul)
|
||||
|
||||
```bash
|
||||
python tests/ocr-validation/test_receipts_sequential.py
|
||||
```
|
||||
|
||||
Output:
|
||||
```
|
||||
Processing: abonament kineterra.pdf
|
||||
✓ Total: MATCH (1900.0 = 1900.0)
|
||||
✓ Date: MATCH (2025-11-10)
|
||||
✗ CUI: MISMATCH (expected: 31180432, got: 3118043)
|
||||
```
|
||||
|
||||
### 3. Test Paralel (Performance benchmark)
|
||||
|
||||
```bash
|
||||
python tests/ocr-validation/test_receipts_parallel.py
|
||||
```
|
||||
|
||||
Output:
|
||||
```
|
||||
PARALLEL TEST: 26 receipts
|
||||
Phase 1: Submitting all jobs...
|
||||
Submitted 26 jobs in 2.3s
|
||||
Phase 2: Waiting for results...
|
||||
OK: abonament kineterra.pdf 12.3s conf=95%
|
||||
OK: benzina 14 august.pdf 8.7s conf=92%
|
||||
TOTAL TIME: 45.2s
|
||||
```
|
||||
|
||||
### 4. Debug Raw OCR Text
|
||||
|
||||
```bash
|
||||
# Vezi textul raw extras de OCR
|
||||
python tests/ocr-validation/get_raw_ocr_text.py
|
||||
|
||||
# Sau pentru un fișier specific
|
||||
python tests/ocr-validation/get_raw_ocr_text.py tests/fixtures/ocr-samples/benzina\ 14\ august.pdf
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Expected Receipts Format
|
||||
|
||||
`expected_receipts.json` conține ground truth pentru fiecare bon:
|
||||
|
||||
```json
|
||||
{
|
||||
"receipts": [
|
||||
{
|
||||
"id": "receipt_01",
|
||||
"filename": "abonament kineterra.pdf",
|
||||
"furnizor": "KINETERRA CONCEPT SRL",
|
||||
"cui_furnizor": "31180432",
|
||||
"total": 1900.0,
|
||||
"tva_details": [],
|
||||
"total_tva": 0.0,
|
||||
"data_bon": "2025-11-10",
|
||||
"notes": "Neplatitor TVA - abonament terapie"
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Adaugă bonuri noi pentru testare
|
||||
|
||||
1. Pune PDF-ul în `tests/fixtures/ocr-samples/`
|
||||
2. Adaugă entry în `expected_receipts.json` cu valorile corecte
|
||||
3. Rulează testul:
|
||||
```bash
|
||||
python tests/ocr-validation/ocr-direct-validation.py --receipt receipt_XX
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### "Connection refused" sau "Failed to connect"
|
||||
- Backend-ul nu rulează. Pornește-l cu `./start-prod.sh` (sau `./start-test.sh`)
|
||||
|
||||
### "401 Unauthorized"
|
||||
- JWT token invalid. Verifică `JWT_SECRET_KEY` în `.env`
|
||||
|
||||
### "File not found"
|
||||
- Verifică dacă PDF-urile se află în `tests/fixtures/ocr-samples/`
|
||||
|
||||
### Rezultate incorecte
|
||||
- Folosește `get_raw_ocr_text.py` pentru a vedea ce text extrage OCR
|
||||
- Verifică dacă bonul e lizibil și de calitate bună
|
||||
|
||||
---
|
||||
|
||||
## Performance Notes
|
||||
|
||||
- **doctr_plus** engine: ~8-15 secunde per bon (GPU accelerated)
|
||||
- **tesseract** engine: ~3-5 secunde per bon (CPU only)
|
||||
- Testul paralel poate procesa ~26 bonuri în ~45 secunde (vs ~5 minute secvențial)
|
||||
@@ -113,10 +113,11 @@ if __name__ == "__main__":
|
||||
print(f"Using JWT token for authentication")
|
||||
|
||||
if len(sys.argv) < 2:
|
||||
# Default: process the two receipts user wants to see
|
||||
# Default: process sample receipts from fixtures
|
||||
fixtures_dir = Path(__file__).parent.parent / "fixtures" / "ocr-samples"
|
||||
receipts = [
|
||||
"/mnt/e/proiecte/ab-worktrees/doctr-ocr-metrics/docs/data-entry/brick igiena 1 sept.pdf",
|
||||
"/mnt/e/proiecte/ab-worktrees/doctr-ocr-metrics/docs/data-entry/brick igiena, electrice consumabile 604.pdf"
|
||||
str(fixtures_dir / "brick igiena 1 sept.pdf"),
|
||||
str(fixtures_dir / "brick igiena, electrice consumabile 604.pdf")
|
||||
]
|
||||
else:
|
||||
receipts = sys.argv[1:]
|
||||
|
||||
@@ -193,7 +193,7 @@ def main():
|
||||
# Paths
|
||||
script_dir = Path(__file__).parent
|
||||
expected_path = script_dir / 'expected_receipts.json'
|
||||
pdf_base_path = script_dir.parent.parent / 'docs' / 'data-entry'
|
||||
pdf_base_path = script_dir.parent.parent / 'tests' / 'fixtures' / 'ocr-samples'
|
||||
|
||||
# JWT secret from environment or default
|
||||
jwt_secret = os.getenv('JWT_SECRET_KEY', 'generate_with_secrets_token_urlsafe_32')
|
||||
|
||||
@@ -10,8 +10,13 @@ from datetime import datetime, timedelta
|
||||
from jose import jwt
|
||||
|
||||
API_BASE = "http://localhost:8000"
|
||||
PDF_FOLDER = "/mnt/e/proiecte/ab-worktrees/doctr-ocr-metrics/docs/data-entry"
|
||||
EXPECTED_FILE = "/mnt/e/proiecte/ab-worktrees/doctr-ocr-metrics/tests/ocr-validation/expected_receipts.json"
|
||||
|
||||
# Paths - relative to project root
|
||||
from pathlib import Path
|
||||
SCRIPT_DIR = Path(__file__).parent
|
||||
PROJECT_ROOT = SCRIPT_DIR.parent.parent
|
||||
PDF_FOLDER = str(PROJECT_ROOT / "tests" / "fixtures" / "ocr-samples")
|
||||
EXPECTED_FILE = str(SCRIPT_DIR / "expected_receipts.json")
|
||||
|
||||
def get_jwt_token():
|
||||
secret_key = os.getenv('JWT_SECRET_KEY', 'generate_with_secrets_token_urlsafe_32')
|
||||
|
||||
@@ -27,7 +27,7 @@ except ImportError:
|
||||
# Paths - resolved relative to this script's directory
|
||||
SCRIPT_DIR = Path(__file__).parent
|
||||
BACKEND_DIR = SCRIPT_DIR.parent.parent / "backend"
|
||||
PDF_FOLDER = SCRIPT_DIR.parent.parent / "docs" / "data-entry"
|
||||
PDF_FOLDER = SCRIPT_DIR.parent.parent / "tests" / "fixtures" / "ocr-samples"
|
||||
EXPECTED_FILE = SCRIPT_DIR / "expected_receipts.json"
|
||||
|
||||
|
||||
|
||||
@@ -10,8 +10,13 @@ from datetime import datetime, timedelta
|
||||
from jose import jwt
|
||||
|
||||
API_BASE = "http://localhost:8000"
|
||||
PDF_FOLDER = "/mnt/e/proiecte/ab-worktrees/doctr-ocr-metrics/docs/data-entry"
|
||||
EXPECTED_FILE = "/mnt/e/proiecte/ab-worktrees/doctr-ocr-metrics/tests/ocr-validation/expected_receipts.json"
|
||||
|
||||
# Paths - relative to project root
|
||||
from pathlib import Path
|
||||
SCRIPT_DIR = Path(__file__).parent
|
||||
PROJECT_ROOT = SCRIPT_DIR.parent.parent
|
||||
PDF_FOLDER = str(PROJECT_ROOT / "tests" / "fixtures" / "ocr-samples")
|
||||
EXPECTED_FILE = str(SCRIPT_DIR / "expected_receipts.json")
|
||||
|
||||
def get_jwt_token():
|
||||
"""Create a test JWT token for API authentication."""
|
||||
|
||||
Reference in New Issue
Block a user