Files
rar-autopass/tests/test_import_parse.py
Claude Agent 2c8367109c feat(parser): T14 — parser core 2-treceri xlsx/csv + T3 coercion + T10 data col-level + T13 robustete RO
Implementeaza app/import_parse.py (modul nou) cu toate cele 4 taskuri parser:

T14 (task #7): schelet parser + arhitectura 2-treceri (Issue 2 consens cross-model)
- Trecerea 1 read_only=True: dim-check FileTooLarge (>5000 randuri / >5MB) + detectie multi-sheet
- Trecerea 2 normal-mode: header + merged cells + body
- CSV delimiter sniff (csv.Sniffer + proba explicita {; , \t}) — export RO foloseste ;
- Encoding: UTF-8-sig -> UTF-8 -> cp1250 -> latin2 (fallback RO)
- Exceptii custom: FileTooLarge, HeaderError, MultipleSheets
- Coloane duplicate din merge: sufixate _2/_3 (nu HeaderError)

T3 (task #8): coercion guard + needs_review + mesaj formule-None
- VIN numeric (openpyxl citeste ca int/float) -> coercion_flags[row] needs_review
- Odometru float 123456.0 -> tunde .0 inainte de validare (§3.4bis, ordonare critica)
- Rata None > 60% pe coloana obligatorie -> formula_columns (Issue 3 mesaj specific)
- Datetime nativ -> convertit la YYYY-MM-DD (neambiguu)

T10 (task #9): dezambiguizare data la nivel de COLOANA (OV-8)
- Datetime nativ -> "native" (direct, fara ambiguitate)
- String: detectie format din intreaga coloana — daca oricare rand are token[1]>12 -> DD-first
- Daca toti zi<=12 -> "ambiguous" -> needs_review per rand
- parse_date_value() helper pentru preview resolve

T13 (task #10): robustete export RO
- Multi-sheet: >1 sheet non-gol -> MultipleSheets([...]) cu lista; sheet ales -> parse normal
- Merged header: _unmerge_header propaga valoarea topleft la toate coloanele din grup
- Footer trim: randuri trailing cu VIN + data ambele goale -> skip structural (nu needs_data)

Teste: 37 teste verzi in tests/test_import_parse.py (fixture-uri xlsx generate in-memory).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-06-16 20:31:27 +00:00

452 lines
19 KiB
Python

"""Teste pentru app/import_parse.py (T14 + T3 + T10 + T13).
Fixture-urile xlsx sunt generate in-memory cu openpyxl (nu fisiere binare commituite).
Fiecare sectiune acopera un task distinct.
"""
from __future__ import annotations
import csv
import io
from datetime import date, datetime
import openpyxl
import pytest
from app.import_parse import (
FileTooLarge,
HeaderError,
MultipleSheets,
ParsedFile,
parse_csv,
parse_file,
parse_xlsx,
_detect_date_formats,
_infer_date_format_from_column,
parse_date_value,
_trim_footer,
)
# --------------------------------------------------------------------------- #
# Helpere fixture #
# --------------------------------------------------------------------------- #
def _make_xlsx(rows: list[list], sheet_name: str = "Sheet1") -> bytes:
    """Build an in-memory xlsx workbook that holds a single worksheet.

    Args:
        rows: Cell values, one inner list per worksheet row, appended in order.
        sheet_name: Title assigned to the worksheet.

    Returns:
        The saved workbook as raw bytes.
    """
    workbook = openpyxl.Workbook()
    sheet = workbook.active
    sheet.title = sheet_name
    for values in rows:
        sheet.append(values)
    with io.BytesIO() as stream:
        workbook.save(stream)
        return stream.getvalue()
def _make_xlsx_multisheet(sheets: dict[str, list[list]]) -> bytes:
    """Build an in-memory xlsx workbook with one worksheet per mapping entry.

    Args:
        sheets: Maps worksheet title to its rows (one inner list per row).
            Worksheets are created in the mapping's iteration order.

    Returns:
        The saved workbook as raw bytes.
    """
    workbook = openpyxl.Workbook()
    for position, (title, rows) in enumerate(sheets.items()):
        if position == 0:
            # A fresh Workbook already has an active sheet; reuse it for the
            # first entry instead of creating an additional one.
            sheet = workbook.active
            sheet.title = title
        else:
            sheet = workbook.create_sheet(title)
        for values in rows:
            sheet.append(values)
    with io.BytesIO() as stream:
        workbook.save(stream)
        return stream.getvalue()
def _make_csv(rows: list[list], delimiter: str = ",", encoding: str = "utf-8") -> bytes:
    """Serialize rows to CSV text in memory and return it encoded as bytes.

    Args:
        rows: Records to write, one inner list per CSV line.
        delimiter: Field separator handed to ``csv.writer``.
        encoding: Codec used to turn the CSV text into bytes.

    Returns:
        The encoded CSV content.
    """
    text = io.StringIO()
    csv.writer(text, delimiter=delimiter).writerows(rows)
    return text.getvalue().encode(encoding)
# Shared fixture data: the column headers and two well-formed data rows that
# the tests in this module feed to the parser.
_HEADER = [
    "VIN",
    "Nr inmatriculare",
    "Data prestatie",
    "Odometru final",
    "Operatie",
]
_ROW1 = [
    "WVWZZZ1KZAW000123",
    "B999TST",
    "2026-06-15",
    "123456",
    "Revizie",
]
_ROW2 = [
    "WVWZZZ1KZAW000124",
    "CJ001AB",
    "2026-05-10",
    "98765",
    "Reparatie",
]
# =========================================================================== #
# T14 — CSV delimiter sniff + encoding + cap #
# =========================================================================== #
class TestCsvDelimiter:
    """CSV parsing tests: delimiter detection, encoding fallback and size caps."""

    def test_csv_semicolon_ro_export(self):
        """Romanian Excel exports use ';' as delimiter; it must be detected."""
        data = _make_csv([_HEADER, _ROW1, _ROW2], delimiter=";")
        result = parse_csv(data)
        assert result.columns == _HEADER
        assert len(result.rows) == 2
        assert result.rows[0]["VIN"] == "WVWZZZ1KZAW000123"

    def test_csv_comma_standard(self):
        """A standard comma-delimited file yields both data rows."""
        data = _make_csv([_HEADER, _ROW1, _ROW2], delimiter=",")
        result = parse_csv(data)
        assert len(result.rows) == 2

    def test_csv_tab_delimiter(self):
        """A tab-delimited file yields both data rows."""
        data = _make_csv([_HEADER, _ROW1, _ROW2], delimiter="\t")
        result = parse_csv(data)
        assert len(result.rows) == 2

    def test_csv_single_column_raises_header_error(self):
        """A single column after sniffing raises a clear HeaderError."""
        # CSV without any real delimiter -> exactly one column.
        text = "VIN\nWVWZZZ1KZAW000123\n"
        data = text.encode("utf-8")
        with pytest.raises(HeaderError) as exc:
            parse_csv(data)
        assert "coloana" in str(exc.value).lower()

    def test_csv_encoding_cp1250(self):
        """A cp1250-encoded export keeps its Romanian diacritics intact."""
        rows = [
            ["VIN", "Nr inmatriculare", "Data prestatie", "Odometru final", "Operatie"],
            ["WVWZZZ1KZAW000123", "B999TST", "2026-06-15", "123456", "Revizie periodică"],
        ]
        data = _make_csv(rows, delimiter=";", encoding="cp1250")
        # Guard the fixture itself: if these bytes were valid UTF-8 the test
        # would never exercise the non-UTF-8 decoding path.
        with pytest.raises(UnicodeDecodeError):
            data.decode("utf-8")
        result = parse_csv(data)
        assert len(result.rows) == 1
        # Counting rows alone would also pass with a wrong single-byte codec
        # (mojibake); pin the decoded text so the encoding is really checked.
        assert result.rows[0]["Operatie"] == "Revizie periodică"

    def test_csv_too_many_rows_raises(self):
        """More than 5000 data rows raises FileTooLarge."""
        header = ["VIN", "Data", "Odometru", "NrInm", "Op"]
        rows = [header] + [
            ["WVWZZZ1KZAW000123", "2026-01-01", "1000", "B1TST", "R"]
            for _ in range(5001)
        ]
        data = _make_csv(rows, delimiter=",")
        with pytest.raises(FileTooLarge):
            parse_csv(data)

    def test_csv_too_large_bytes_raises(self):
        """A payload of more than 5 MiB raises FileTooLarge."""
        data = b"X" * (5 * 1024 * 1024 + 1)
        with pytest.raises(FileTooLarge):
            parse_csv(data)
# =========================================================================== #
# T14 — XLSX read_only dim-check + cap #
# =========================================================================== #
class TestXlsxDimcheck:
    """XLSX parsing tests: happy path, size cap, empty workbook, dispatch."""

    def test_xlsx_parsat_corect(self):
        """A well-formed workbook yields the header and both data rows."""
        data = _make_xlsx([_HEADER, _ROW1, _ROW2])
        result = parse_xlsx(data)
        assert result.columns == _HEADER
        assert len(result.rows) == 2

    def test_xlsx_too_large_bytes_raises(self):
        """A workbook of more than 5 MiB raises FileTooLarge."""
        import zipfile

        # Start from a genuinely valid workbook and pad it with an extra
        # uncompressed archive member. Because the file stays a readable xlsx,
        # FileTooLarge is the only acceptable outcome; the previous version
        # fed garbage bytes and accepted *any* Exception, so it could never
        # fail even if the size cap was missing.
        buf = io.BytesIO(_make_xlsx([_HEADER, _ROW1]))
        with zipfile.ZipFile(buf, mode="a", compression=zipfile.ZIP_STORED) as archive:
            archive.writestr("padding.bin", b"\0" * (5 * 1024 * 1024 + 1))
        data = buf.getvalue()
        assert len(data) > 5 * 1024 * 1024
        with pytest.raises(FileTooLarge):
            parse_xlsx(data)

    def test_xlsx_empty_raises_header_error(self):
        """A workbook without any data raises HeaderError."""
        wb = openpyxl.Workbook()
        buf = io.BytesIO()
        wb.save(buf)
        with pytest.raises(HeaderError):
            parse_xlsx(buf.getvalue())

    def test_parse_file_dispatch_xlsx(self):
        """parse_file handles a '.xlsx' filename."""
        data = _make_xlsx([_HEADER, _ROW1])
        result = parse_file(data, "test.xlsx")
        assert len(result.rows) == 1

    def test_parse_file_dispatch_csv(self):
        """parse_file handles a '.csv' filename."""
        data = _make_csv([_HEADER, _ROW1], delimiter=";")
        result = parse_file(data, "test.csv")
        assert len(result.rows) == 1

    def test_parse_file_unsupported_ext(self):
        """An unsupported file extension raises HeaderError."""
        with pytest.raises(HeaderError):
            parse_file(b"data", "test.dbf")
# =========================================================================== #
# T3 — coercion guard + needs_review + mesaj formule-None #
# =========================================================================== #
class TestCoercionNeedsReview:
    """Coercion tests: numeric VIN flags, odometer floats, empty columns, dates."""

    @staticmethod
    def _parse_single_row(values):
        """Parse a workbook made of the standard header plus one data row.

        The data row is written cell by cell (row 2, columns 1..n) so that
        every value keeps the Python type it was given.

        Args:
            values: Cell values for the single data row, in column order.

        Returns:
            Whatever ``parse_xlsx`` returns for the generated workbook.
        """
        wb = openpyxl.Workbook()
        ws = wb.active
        ws.append(["VIN", "Nr inmatriculare", "Data prestatie", "Odometru final", "Operatie"])
        for column, value in enumerate(values, start=1):
            ws.cell(row=2, column=column).value = value
        buf = io.BytesIO()
        wb.save(buf)
        return parse_xlsx(buf.getvalue())

    def test_vin_numeric_xlsx_flagged(self):
        """A VIN stored as a number in xlsx is flagged in coercion_flags."""
        # The VIN is written as an int (not a string) to simulate a
        # numerically formatted cell.
        result = self._parse_single_row(
            [123456789012345, "B999TST", "2026-06-15", 123456, "Revizie"]
        )
        assert 0 in result.coercion_flags
        flags = result.coercion_flags[0]
        assert any("VIN" in f for f in flags)

    def test_odometru_float_coerced(self):
        """An odometer float 123456.0 is converted to the string '123456'."""
        result = self._parse_single_row(
            ["WVWZZZ1KZAW000123", "B999TST", "2026-06-15", 123456.0, "Revizie"]
        )
        odo_val = result.rows[0]["Odometru final"]
        assert odo_val == "123456", f"Asteptat '123456', primit '{odo_val}'"
        # Trimming a trailing .0 is a standard coercion, so it must not add an
        # odometer flag for the row (a missing entry counts as "no flags").
        assert not any("Odometru" in f for f in result.coercion_flags.get(0, []))

    def test_formula_column_detected(self):
        """A column that is entirely None is reported in formula_columns."""
        wb = openpyxl.Workbook()
        ws = wb.active
        ws.append(["VIN", "Nr inmatriculare", "Data prestatie", "Odometru final", "Operatie"])
        for _ in range(10):
            ws.append([
                "WVWZZZ1KZAW000123",
                "B999TST",
                "2026-06-15",
                None,  # stands in for an uncalculated formula
                "Revizie",
            ])
        buf = io.BytesIO()
        wb.save(buf)
        result = parse_xlsx(buf.getvalue())
        assert "Odometru final" in result.formula_columns

    def test_datetime_native_converted_to_iso(self):
        """A native datetime cell is converted to a YYYY-MM-DD string."""
        result = self._parse_single_row(
            ["WVWZZZ1KZAW000123", "B999TST", datetime(2026, 6, 15, 10, 30), 123456, "Revizie"]
        )
        assert result.rows[0]["Data prestatie"] == "2026-06-15"

    def test_odometru_via_canonicalize_row(self):
        """canonicalize_row normalizes odometru_final '123456.0' to '123456'."""
        from app.idempotency import canonicalize_row

        raw = {
            "vin": "WVWZZZ1KZAW000123",
            "nr_inmatriculare": "B1TST",
            "data_prestatie": "2026-06-15",
            "odometru_final": "123456.0",
            "prestatii": [],
        }
        canon = canonicalize_row(raw)
        assert canon["odometru_final"] == "123456"

    def test_vin_numeric_not_sent(self):
        """A numeric VIN leaves an entry in coercion_flags for its row."""
        result = self._parse_single_row(
            [1234567890, "B999TST", "2026-06-15", 123456, "Revizie"]
        )
        # Row 0 must carry flags (needs_review).
        assert 0 in result.coercion_flags
# =========================================================================== #
# T10 — dezambiguizare data la nivel de COLOANA (OV-8) #
# =========================================================================== #
class TestDateColumnDisambiguation:
    """Column-level date format inference and per-value date parsing tests."""

    def test_string_ambiguous_flagged(self):
        """Strings whose first two tokens are all <= 12 give 'ambiguous'."""
        column = ["03.04.2026", "05.06.2026", "01.02.2026"]
        assert _infer_date_format_from_column(column) == "ambiguous"

    def test_dd_first_detected_from_column(self):
        """One value with a first token > 12 makes the column DD.MM.YYYY."""
        # 15.04.2026: 15 cannot be a month, so the first token is the day.
        column = ["03.04.2026", "15.04.2026", "01.02.2026"]
        assert _infer_date_format_from_column(column) == "DD.MM.YYYY"

    def test_month_gt12_also_dd_first(self):
        """A second token > 12 is also expected to yield 'DD.MM.YYYY'."""
        # NOTE(review): for "04.13.2026" the only valid calendar reading is
        # month=4, day=13, i.e. month-first, yet the expected label here is
        # "DD.MM.YYYY". Confirm whether the helper is meant to return a
        # month-first label for this input, or whether the label only signals
        # "not ambiguous".
        detected = _infer_date_format_from_column(["04.13.2026"])
        assert detected == "DD.MM.YYYY"

    def test_iso_format_detected(self):
        """ISO-style strings are reported as 'YYYY-MM-DD'."""
        detected = _infer_date_format_from_column(["2026-06-15", "2026-05-10"])
        assert detected == "YYYY-MM-DD"

    def test_native_datetime_column_format(self):
        """A column holding only native datetimes maps to the 'native' format."""
        values_by_column = {
            "Data prestatie": [datetime(2026, 6, 15), datetime(2026, 5, 10)],
        }
        formats = _detect_date_formats(values_by_column, ["Data prestatie"])
        assert formats.get("Data prestatie") == "native"

    def test_parse_date_value_ambiguous_needs_review(self):
        """'03.04.2026' under 'ambiguous' parses day-first and is flagged."""
        iso, ambiguous = parse_date_value("03.04.2026", "ambiguous")
        assert ambiguous is True
        assert iso == "2026-04-03"  # read as DD.MM.YYYY

    def test_parse_date_value_native_already_iso(self):
        """An ISO string under the 'native' format is returned unflagged."""
        iso, ambiguous = parse_date_value("2026-06-15", "native")
        assert iso == "2026-06-15"
        assert ambiguous is False

    def test_parse_date_value_dd_mm_yyyy(self):
        """A DD.MM.YYYY string is converted to ISO without an ambiguity flag."""
        iso, ambiguous = parse_date_value("15.06.2026", "DD.MM.YYYY")
        assert iso == "2026-06-15"
        assert ambiguous is False

    def test_parse_date_value_empty(self):
        """A None value yields (None, False)."""
        iso, ambiguous = parse_date_value(None, "DD.MM.YYYY")
        assert iso is None
        assert ambiguous is False

    def test_column_uniform_mm_dd_with_day_gt12(self):
        """A format detected at column level disambiguates low-day rows."""
        # NOTE(review): the test name mentions MM.DD, but the data and the
        # assertions below cover a day-first column.
        # 03.04.2026 alone is ambiguous; 15.04.2026 (15 > 12) settles the
        # whole column as DD.MM.YYYY.
        detected = _infer_date_format_from_column(["03.04.2026", "15.04.2026"])
        assert detected == "DD.MM.YYYY"
        # The otherwise ambiguous row is resolved through the column format.
        iso, ambiguous = parse_date_value("03.04.2026", detected)
        assert iso == "2026-04-03"
        assert ambiguous is False
# =========================================================================== #
# T13 — robustete export RO (multi-sheet + merged header + footer trim) #
# =========================================================================== #
class TestRobustetExportRO:
    """Export robustness tests: multiple sheets, merged headers, footer rows."""

    def test_multisheet_raises_multiple_sheets(self):
        """Two non-empty sheets raise MultipleSheets listing both names."""
        payload = _make_xlsx_multisheet({
            "Iunie": [_HEADER, _ROW1],
            "Iulie": [_HEADER, _ROW2],
        })
        with pytest.raises(MultipleSheets) as exc:
            parse_xlsx(payload)
        reported = exc.value.sheet_names
        assert "Iunie" in reported
        assert "Iulie" in reported

    def test_multisheet_with_sheet_name_selected(self):
        """With an explicit sheet_name only that sheet is parsed."""
        payload = _make_xlsx_multisheet({
            "Iunie": [_HEADER, _ROW1],
            "Iulie": [_HEADER, _ROW2],
        })
        parsed = parse_xlsx(payload, sheet_name="Iulie")
        assert len(parsed.rows) == 1
        assert parsed.rows[0]["VIN"] == "WVWZZZ1KZAW000124"

    def test_merged_header_resolved(self):
        """A merged header cell is expanded into suffixed column names."""
        workbook = openpyxl.Workbook()
        sheet = workbook.active
        # Header with the first two columns merged under "Vehicul"; column 2
        # is left unset because it is covered by the merge.
        header_cells = {
            1: "Vehicul",
            3: "Data prestatie",
            4: "Odometru final",
            5: "Operatie",
        }
        for column, title in header_cells.items():
            sheet.cell(row=1, column=column).value = title
        sheet.merge_cells(start_row=1, start_column=1, end_row=1, end_column=2)
        data_row = ["WVWZZZ1KZAW000123", "B999TST", "2026-06-15", 123456, "Revizie"]
        for column, value in enumerate(data_row, start=1):
            sheet.cell(row=2, column=column).value = value
        stream = io.BytesIO()
        workbook.save(stream)
        parsed = parse_xlsx(stream.getvalue())
        # The merged value is expected on both columns, with the duplicate
        # disambiguated by a "_2" suffix.
        assert "Vehicul" in parsed.columns
        assert "Vehicul_2" in parsed.columns
        # Exactly two columns carry the merged name.
        assert sum("Vehicul" in name for name in parsed.columns) == 2

    def test_footer_rows_skipped(self):
        """A fully empty trailing row is dropped from the parsed rows."""
        sheet_rows = [
            _HEADER,
            _ROW1,
            _ROW2,
            ["TOTAL", "", "", "222221", ""],  # footer whose VIN cell is "TOTAL"
            ["", "", "", "", ""],  # completely empty row
        ]
        parsed = parse_xlsx(_make_xlsx(sheet_rows))
        # NOTE(review): the "TOTAL" row has a non-empty VIN, so presumably it
        # survives trimming and three rows remain; that count is not asserted
        # here - confirm and pin it if intended.
        vins = [row.get("VIN") for row in parsed.rows]
        assert "" not in vins
        # No row made only of None / blank values may be present.
        assert not any(
            all(value is None or str(value).strip() == "" for value in row.values())
            for row in parsed.rows
        )

    def test_footer_vin_and_date_both_empty_skipped(self):
        """A row whose VIN and date are both empty is removed by _trim_footer."""
        rows_list = [
            {"VIN": "WVWZZZ1KZAW000123", "Data prestatie": "2026-06-15", "Odometru final": "123456"},
            {"VIN": "", "Data prestatie": "", "Odometru final": "9999"},  # fake footer
        ]
        col_names = ["VIN", "Data prestatie", "Odometru final"]
        kept = _trim_footer(rows_list, col_names)
        assert len(kept) == 1
        assert kept[0]["VIN"] == "WVWZZZ1KZAW000123"

    def test_single_sheet_no_error(self):
        """A single-sheet workbook parses without raising MultipleSheets."""
        parsed = parse_xlsx(_make_xlsx([_HEADER, _ROW1]))
        assert len(parsed.rows) == 1
# =========================================================================== #
# Integrare parse_file #
# =========================================================================== #
class TestParseFileIntegration:
    """End-to-end checks of parse_file for xlsx and CSV inputs."""

    def test_xlsx_full_flow(self):
        """An xlsx file yields the header and both rows with string values."""
        payload = _make_xlsx([_HEADER, _ROW1, _ROW2])
        parsed = parse_file(payload, "prezentari_iunie.xlsx")
        first_row, second_row = parsed.rows[0], parsed.rows[1]
        assert parsed.columns == _HEADER
        assert len(parsed.rows) == 2
        assert first_row["VIN"] == "WVWZZZ1KZAW000123"
        assert second_row["Odometru final"] == "98765"

    def test_csv_semicolon_full_flow(self):
        """A ';'-delimited CSV yields the header and the first row's VIN."""
        payload = _make_csv([_HEADER, _ROW1], delimiter=";")
        parsed = parse_file(payload, "export_ro.csv")
        assert parsed.columns == _HEADER
        assert parsed.rows[0]["VIN"] == "WVWZZZ1KZAW000123"