# echo-core/tests/test_voice_normalize.py

"""Tests for src/voice/normalize.py — 35 Romanian cases.

Categories:
    markdown strip (5), numbers cardinals (6), decimals (4),
    currency natural (8), symbols (4), abbreviations (4),
    truncation boundary (2), edge cases empty / whitespace (2).

Total: 35.
"""
import pytest

from src.voice.normalize import (
    expand_abbreviations,
    expand_currency,
    expand_numbers_ro,
    expand_symbols,
    normalize_for_tts,
    strip_markdown,
)


# ============================================================
# Markdown stripping (5)
# ============================================================
@pytest.mark.parametrize(
    ("text", "expected"),
    [
        ("**bold text**", "bold text"),  # double-asterisk emphasis
        ("*italic text*", "italic text"),  # single-asterisk emphasis
        ("`code snippet`", "code snippet"),  # inline code backticks
        ("[click here](https://example.com)", "click here"),  # link keeps its label only
        ("# Heading text", "Heading text"),  # leading heading marker
    ],
)
def test_strip_markdown(text, expected):
    """Each markdown sample must be reduced to its bare text by strip_markdown."""
    stripped = strip_markdown(text)
    assert stripped == expected


# ============================================================
# Numbers cardinals (6)
# ============================================================
@pytest.mark.parametrize(
    ("text", "expected"),
    [
        ("21", "douăzeci și unu"),  # tens and unit joined with "și"
        ("81", "optzeci și unu"),
        ("100", "o sută"),  # singular hundred
        ("3", "trei"),  # single digit
        ("0", "zero"),
        ("200", "două sute"),  # plural hundreds
    ],
)
def test_expand_numbers_cardinals(text, expected):
    """Whole-number strings must be spelled out as the expected Romanian words."""
    spoken = expand_numbers_ro(text)
    assert spoken == expected


# ============================================================
# Decimals (4)
# ============================================================
@pytest.mark.parametrize(
    ("text", "expected"),
    [
        # The expected strings read the decimal point as "virgulă" and the
        # fractional digits as a single number.
        ("3.14", "trei virgulă paisprezece"),
        ("12.5", "doisprezece virgulă cinci"),
        ("0.5", "zero virgulă cinci"),
        ("99.99", "nouăzeci și nouă virgulă nouăzeci și nouă"),
    ],
)
def test_expand_numbers_decimals(text, expected):
    """Decimal strings must be spelled out as the expected Romanian words."""
    spoken = expand_numbers_ro(text)
    assert spoken == expected


# ============================================================
# Currency natural RO (8) — RON / USD / EUR / GBP mix
# ============================================================
@pytest.mark.parametrize(
    ("text", "expected"),
    [
        # Amounts with a fractional part.
        ("12.50 RON", "doisprezece lei și cincizeci de bani"),
        ("$25.99", "douăzeci și cinci de dolari și nouăzeci și nouă de cenți"),
        ("€100.50", "o sută de euro și cincizeci de cenți"),
        # Whole amounts.
        ("£200", "două sute de lire"),
        ("100 RON", "o sută de lei"),
        ("$1", "un dolar"),  # singular unit name
        ("€50", "cincizeci de euro"),
        ("1 RON", "un leu"),  # singular unit name
    ],
)
def test_expand_currency(text, expected):
    """RON / USD / EUR / GBP amounts must be read out as the expected phrase."""
    spoken = expand_currency(text)
    assert spoken == expected


# ============================================================
# Symbols (4)
# ============================================================
@pytest.mark.parametrize(
    ("text", "expected"),
    [
        ("25%", "25 la sută"),  # percent sign; the digits stay as digits here
        ("foo & bar", "foo și bar"),  # ampersand
        ("Marius @ home", "Marius la home"),  # at sign
        ("30°", "30 grade"),  # degree sign
    ],
)
def test_expand_symbols(text, expected):
    """Each symbol must be replaced by the expected Romanian word(s)."""
    replaced = expand_symbols(text)
    assert replaced == expected


# ============================================================
# Abbreviations (4)
# ============================================================
@pytest.mark.parametrize(
    ("text", "expected"),
    [
        ("etc.", "etcetera"),
        ("dl. Popescu", "domnul Popescu"),  # title followed by a surname
        ("dna. Ionescu", "doamna Ionescu"),
        ("nr. 5", "numărul 5"),  # the digit is left untouched by this step
    ],
)
def test_expand_abbreviations(text, expected):
    """Each abbreviation must be expanded to the expected full word."""
    expanded = expand_abbreviations(text)
    assert expanded == expected


# ============================================================
# Truncation boundary (2)
# ============================================================
def test_truncate_exactly_200_words_unchanged():
    """An input of exactly 200 plain words comes back whole, without the suffix."""
    expected_words = ["cuvant"] * 200
    spoken = normalize_for_tts(" ".join(expected_words))
    # The chat-deferral phrase must not appear at the 200-word boundary.
    assert "Restul l-am scris în chat." not in spoken
    assert spoken.split() == expected_words


def test_truncate_over_200_words_appends_suffix():
    """A 250-word input keeps its first 200 words, then the chat-deferral phrase."""
    suffix = "Restul l-am scris în chat."
    spoken = normalize_for_tts(" ".join(["cuvant"] * 250))
    assert spoken.endswith(suffix)

    tokens = spoken.split()
    kept, tail = tokens[:200], tokens[200:]
    # Nothing but the 200 retained words precedes the five suffix tokens.
    assert kept == ["cuvant"] * 200
    assert tail == ["Restul", "l-am", "scris", "în", "chat."]


# ============================================================
# Edge cases (2)
# ============================================================
@pytest.mark.parametrize(
    ("text", "expected"),
    [
        ("", ""),  # empty input
        ("   ", ""),  # whitespace-only input
    ],
)
def test_normalize_edge_cases(text, expected):
    """Empty and whitespace-only input must both normalize to the empty string."""
    normalized = normalize_for_tts(text)
    assert normalized == expected