"""Tests for src/voice/normalize.py — 35 Romanian cases. Categories: markdown strip (5), numbers cardinals (6), decimals (4), currency natural (8), symbols (4), abbreviations (4), truncation boundary (2), edge cases empty / whitespace (2). Total: 35. """ import pytest from src.voice.normalize import ( expand_abbreviations, expand_currency, expand_numbers_ro, expand_symbols, normalize_for_tts, strip_markdown, ) # ============================================================ # Markdown stripping (5) # ============================================================ @pytest.mark.parametrize("text,expected", [ ("**bold text**", "bold text"), ("*italic text*", "italic text"), ("`code snippet`", "code snippet"), ("[click here](https://example.com)", "click here"), ("# Heading text", "Heading text"), ]) def test_strip_markdown(text, expected): assert strip_markdown(text) == expected # ============================================================ # Numbers cardinals (6) # ============================================================ @pytest.mark.parametrize("text,expected", [ ("21", "douăzeci și unu"), ("81", "optzeci și unu"), ("100", "o sută"), ("3", "trei"), ("0", "zero"), ("200", "două sute"), ]) def test_expand_numbers_cardinals(text, expected): assert expand_numbers_ro(text) == expected # ============================================================ # Decimals (4) # ============================================================ @pytest.mark.parametrize("text,expected", [ ("3.14", "trei virgulă paisprezece"), ("12.5", "doisprezece virgulă cinci"), ("0.5", "zero virgulă cinci"), ("99.99", "nouăzeci și nouă virgulă nouăzeci și nouă"), ]) def test_expand_numbers_decimals(text, expected): assert expand_numbers_ro(text) == expected # ============================================================ # Currency natural RO (8) — RON / USD / EUR / GBP mix # ============================================================ @pytest.mark.parametrize("text,expected", [ ("12.50 RON", "doisprezece lei și cincizeci de bani"), ("$25.99", "douăzeci și cinci de dolari și nouăzeci și nouă de cenți"), ("€100.50", "o sută de euro și cincizeci de cenți"), ("£200", "două sute de lire"), ("100 RON", "o sută de lei"), ("$1", "un dolar"), ("€50", "cincizeci de euro"), ("1 RON", "un leu"), ]) def test_expand_currency(text, expected): assert expand_currency(text) == expected # ============================================================ # Symbols (4) # ============================================================ @pytest.mark.parametrize("text,expected", [ ("25%", "25 la sută"), ("foo & bar", "foo și bar"), ("Marius @ home", "Marius la home"), ("30°", "30 grade"), ]) def test_expand_symbols(text, expected): assert expand_symbols(text) == expected # ============================================================ # Abbreviations (4) # ============================================================ @pytest.mark.parametrize("text,expected", [ ("etc.", "etcetera"), ("dl. Popescu", "domnul Popescu"), ("dna. Ionescu", "doamna Ionescu"), ("nr. 5", "numărul 5"), ]) def test_expand_abbreviations(text, expected): assert expand_abbreviations(text) == expected # ============================================================ # Truncation boundary (2) # ============================================================ def test_truncate_exactly_200_words_unchanged(): """Exactly 200 simple word tokens — no truncation, no suffix.""" text = " ".join(["cuvant"] * 200) out = normalize_for_tts(text) assert "Restul l-am scris în chat." not in out assert out.split() == ["cuvant"] * 200 def test_truncate_over_200_words_appends_suffix(): """250 word tokens — keep first 200 then append the chat-deferral phrase.""" text = " ".join(["cuvant"] * 250) out = normalize_for_tts(text) assert out.endswith("Restul l-am scris în chat.") words = out.split() # First 200 are 'cuvant', followed by the 5-word suffix. assert words[:200] == ["cuvant"] * 200 assert words[200:] == ["Restul", "l-am", "scris", "în", "chat."] # ============================================================ # Edge cases (2) # ============================================================ @pytest.mark.parametrize("text,expected", [ ("", ""), (" ", ""), ]) def test_normalize_edge_cases(text, expected): assert normalize_for_tts(text) == expected