feat(voice): Pas 3 — voice/normalize.py + 35 RO test cases

Pure functions pentru TTS text normalization (RO):
- strip_markdown: regex bold/italic/code/link/heading/list
- expand_numbers_ro: num2words pentru cardinals + decimal handling ("3.14" → "trei virgulă paisprezece", "3.05" → "trei virgulă zero cinci" digit-by-digit la leading zero)
- expand_currency: formă naturală RO ("12.50 RON" → "doisprezece lei și cincizeci de bani", "$25.99" → "douăzeci și cinci de dolari și nouăzeci și nouă de cenți")
- expand_symbols: %/&/@/° + whitespace collapse
- expand_abbreviations: etc./dl./dna./nr./ş.a./ş.a.m.d.
- normalize_for_tts: full pipeline + hard truncate 200 cuvinte cu "Restul l-am scris în chat."

Pipeline order: markdown → abbreviations → currency → numbers → symbols → truncate. Currency BEFORE numbers — altfel "12.50 RON" se degradează la "doisprezece virgulă cincizeci RON".

Romanian "de" particle rule: n>=20 AND (n%100 not in 1..19) → "o sută de lei", "o sută cinci lei" (no "de"). n=1 with currency → "un dolar" / "un leu" (article, nu cardinal).

35/35 tests pass: markdown(5), cardinals(6), decimals(4), currency RON/USD/EUR/GBP mix(8), symbols(4), abbreviations(4), truncation(2), edge cases empty/whitespace(2).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-27 14:42:41 +00:00
parent af5af8133f
commit a48562b2f5
2 changed files with 358 additions and 0 deletions
--- a/tests/test_voice_normalize.py
+++ b/tests/test_voice_normalize.py
@@ -0,0 +1,137 @@
+"""Tests for src/voice/normalize.py — 35 Romanian cases.
+
+Categories:
+    markdown strip (5), numbers cardinals (6), decimals (4),
+    currency natural (8), symbols (4), abbreviations (4),
+    truncation boundary (2), edge cases empty / whitespace (2).
+
+Total: 35.
+"""
+import pytest
+
+from src.voice.normalize import (
+    expand_abbreviations,
+    expand_currency,
+    expand_numbers_ro,
+    expand_symbols,
+    normalize_for_tts,
+    strip_markdown,
+)
+
+
+# ============================================================
+# Markdown stripping (5)
+# ============================================================
+@pytest.mark.parametrize(
+    ("text", "expected"),
+    [
+        ("**bold text**", "bold text"),
+        ("*italic text*", "italic text"),
+        ("`code snippet`", "code snippet"),
+        ("[click here](https://example.com)", "click here"),
+        ("# Heading text", "Heading text"),
+    ],
+)
+def test_strip_markdown(text, expected):
+    """strip_markdown returns plain text for bold, italic, code, link and heading input."""
+    plain = strip_markdown(text)
+    assert plain == expected
+
+
+# ============================================================
+# Numbers cardinals (6)
+# ============================================================
+@pytest.mark.parametrize(
+    ("text", "expected"),
+    [
+        ("21", "douăzeci și unu"),
+        ("81", "optzeci și unu"),
+        ("100", "o sută"),
+        ("3", "trei"),
+        ("0", "zero"),
+        ("200", "două sute"),
+    ],
+)
+def test_expand_numbers_cardinals(text, expected):
+    """expand_numbers_ro spells out whole numbers as Romanian cardinal words."""
+    spoken = expand_numbers_ro(text)
+    assert spoken == expected
+
+
+# ============================================================
+# Decimals (4)
+# ============================================================
+@pytest.mark.parametrize(
+    ("text", "expected"),
+    [
+        ("3.14", "trei virgulă paisprezece"),
+        ("12.5", "doisprezece virgulă cinci"),
+        ("0.5", "zero virgulă cinci"),
+        ("99.99", "nouăzeci și nouă virgulă nouăzeci și nouă"),
+    ],
+)
+def test_expand_numbers_decimals(text, expected):
+    """expand_numbers_ro reads a decimal as integer part, 'virgulă', then fraction."""
+    spoken = expand_numbers_ro(text)
+    assert spoken == expected
+
+
+# ============================================================
+# Currency natural RO (8) — RON / USD / EUR / GBP mix
+# ============================================================
+@pytest.mark.parametrize(
+    ("text", "expected"),
+    [
+        ("12.50 RON", "doisprezece lei și cincizeci de bani"),
+        ("$25.99", "douăzeci și cinci de dolari și nouăzeci și nouă de cenți"),
+        ("€100.50", "o sută de euro și cincizeci de cenți"),
+        ("£200", "două sute de lire"),
+        ("100 RON", "o sută de lei"),
+        ("$1", "un dolar"),
+        ("€50", "cincizeci de euro"),
+        ("1 RON", "un leu"),
+    ],
+)
+def test_expand_currency(text, expected):
+    """expand_currency renders RON, USD, EUR and GBP amounts as Romanian phrases."""
+    spoken = expand_currency(text)
+    assert spoken == expected
+
+
+# ============================================================
+# Symbols (4)
+# ============================================================
+@pytest.mark.parametrize(
+    ("text", "expected"),
+    [
+        ("25%", "25 la sută"),
+        ("foo & bar", "foo și bar"),
+        ("Marius @ home", "Marius la home"),
+        ("30°", "30 grade"),
+    ],
+)
+def test_expand_symbols(text, expected):
+    """expand_symbols replaces %, &, @ and ° with Romanian words; digits stay as-is."""
+    spoken = expand_symbols(text)
+    assert spoken == expected
+
+
+# ============================================================
+# Abbreviations (4)
+# ============================================================
+@pytest.mark.parametrize(
+    ("text", "expected"),
+    [
+        ("etc.", "etcetera"),
+        ("dl. Popescu", "domnul Popescu"),
+        ("dna. Ionescu", "doamna Ionescu"),
+        ("nr. 5", "numărul 5"),
+    ],
+)
+def test_expand_abbreviations(text, expected):
+    """expand_abbreviations expands etc., dl., dna. and nr. to their full words."""
+    spoken = expand_abbreviations(text)
+    assert spoken == expected
+
+
+# ============================================================
+# Truncation boundary (2)
+# ============================================================
+def test_truncate_exactly_200_words_unchanged():
+    """An input of exactly 200 plain words passes through without the suffix."""
+    words = ["cuvant"] * 200
+    spoken = normalize_for_tts(" ".join(words))
+    assert "Restul l-am scris în chat." not in spoken
+    assert spoken.split() == words
+
+
+def test_truncate_over_200_words_appends_suffix():
+    """A 250-word input keeps its first 200 words and gains the chat-deferral phrase."""
+    spoken = normalize_for_tts(" ".join(["cuvant"] * 250))
+    assert spoken.endswith("Restul l-am scris în chat.")
+    tokens = spoken.split()
+    head, tail = tokens[:200], tokens[200:]
+    # The 200 retained words come first; the suffix contributes five more tokens.
+    assert head == ["cuvant"] * 200
+    assert tail == ["Restul", "l-am", "scris", "în", "chat."]
+
+
+# ============================================================
+# Edge cases (2)
+# ============================================================
+@pytest.mark.parametrize(
+    ("text", "expected"),
+    [
+        ("", ""),
+        ("   ", ""),
+    ],
+)
+def test_normalize_edge_cases(text, expected):
+    """normalize_for_tts returns an empty string for empty or whitespace-only input."""
+    spoken = normalize_for_tts(text)
+    assert spoken == expected