From a48562b2f56569f8b7f5d61e338c6b313c38db62 Mon Sep 17 00:00:00 2001 From: Marius Mutu Date: Wed, 27 May 2026 14:42:41 +0000 Subject: [PATCH] =?UTF-8?q?feat(voice):=20Pas=203=20=E2=80=94=20voice/norm?= =?UTF-8?q?alize.py=20+=2035=20RO=20test=20cases?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pure functions pentru TTS text normalization (RO): - strip_markdown: regex bold/italic/code/link/heading/list - expand_numbers_ro: num2words pentru cardinals + decimal handling ("3.14" → "trei virgulă paisprezece", "3.05" → "trei virgulă zero cinci" digit-by-digit la leading zero) - expand_currency: formă naturală RO ("12.50 RON" → "doisprezece lei și cincizeci de bani", "$25.99" → "douăzeci și cinci de dolari și nouăzeci și nouă de cenți") - expand_symbols: %/&/@/° + whitespace collapse - expand_abbreviations: etc./dl./dna./nr./ș.a./ș.a.m.d. - normalize_for_tts: full pipeline + hard truncate 200 cuvinte cu "Restul l-am scris în chat." Pipeline order: markdown → abbreviations → currency → numbers → symbols → truncate. Currency BEFORE numbers — altfel "12.50 RON" se degradează la "doisprezece virgulă cincizeci RON". Romanian "de" particle rule: n>=20 AND (n%100 not in 1..19) → "o sută de lei", "o sută cinci lei" (no "de"). n=1 with currency → "un dolar" / "un leu" (article, nu cardinal). 35/35 tests pass: markdown(5), cardinals(6), decimals(4), currency RON/USD/EUR/GBP mix(8), symbols(4), abbreviations(4), truncation(2), edge cases empty/whitespace(2). 
Co-Authored-By: Claude Opus 4.7 (1M context) --- src/voice/normalize.py | 221 ++++++++++++++++++++++++++++++++++ tests/test_voice_normalize.py | 137 +++++++++++++++++++++ 2 files changed, 358 insertions(+) create mode 100644 src/voice/normalize.py create mode 100644 tests/test_voice_normalize.py diff --git a/src/voice/normalize.py b/src/voice/normalize.py new file mode 100644 index 0000000..b61232a --- /dev/null +++ b/src/voice/normalize.py @@ -0,0 +1,221 @@ +"""Voice mode text normalization for Romanian TTS. + +Pure functions — no side effects, no I/O, no logging. Strip markdown, +expand numbers / currency / symbols / abbreviations into natural-sounding +Romanian text. See plan: src/voice/normalize.py (Pas 3). + +Pipeline order in normalize_for_tts: + strip_markdown -> expand_abbreviations -> expand_currency + -> expand_numbers_ro -> expand_symbols -> truncate(200) + +Currency runs BEFORE generic number expansion so "12.50 RON" becomes +"doisprezece lei și cincizeci de bani" rather than +"doisprezece virgulă cincizeci RON". +""" +import re + +from num2words import num2words + + +# ---------- Markdown ---------- + +_MARKDOWN_LINK = re.compile(r'\[([^\]]+)\]\([^)]+\)') +_MARKDOWN_BOLD = re.compile(r'\*\*([^*]+)\*\*') +_MARKDOWN_CODE = re.compile(r'`([^`\n]+)`') +_MARKDOWN_ITALIC = re.compile(r'(? str: + """Remove common markdown formatting, preserve the visible content.""" + text = _MARKDOWN_LINK.sub(r'\1', text) + text = _MARKDOWN_BOLD.sub(r'\1', text) + text = _MARKDOWN_CODE.sub(r'\1', text) + text = _MARKDOWN_ITALIC.sub(r'\1', text) + text = _MARKDOWN_HEADING.sub('', text) + text = _MARKDOWN_LIST.sub('', text) + return text + + +# ---------- Number helpers ---------- + +def _needs_de(n: int) -> bool: + """Romanian: insert 'de' between numeral and noun for n >= 20, + except when the trailing 1-19 portion makes it sound off + (e.g., 105, 119 -> no 'de'; 120, 200 -> 'de'). 
+ """ + if n < 20: + return False + last = n % 100 + if 1 <= last <= 19: + return False + return True + + +def _int_to_ro(n: int) -> str: + return num2words(n, lang='ro') + + +def _decimal_to_ro(s: str) -> str: + """Convert decimal string 'X.Y' to RO words. + + Decimal part is read as a whole number ('3.14' -> 'trei virgulă paisprezece'), + unless it has a leading zero ('3.05' -> 'trei virgulă zero cinci') so the + magnitude is preserved. + """ + int_part, dec_part = s.split('.', 1) + int_words = _int_to_ro(int(int_part)) + if dec_part.startswith('0') and len(dec_part) > 1: + dec_words = ' '.join(_int_to_ro(int(d)) for d in dec_part) + else: + dec_words = _int_to_ro(int(dec_part)) + return f"{int_words} virgulă {dec_words}" + + +# ---------- Numbers ---------- + +_NUM_TOKEN = re.compile(r'(? str: + """Expand bare numeric tokens to Romanian words. + + Only matches pure number tokens (no surrounding letters). Decimals + use 'virgulă' separator. Currency-bound numbers should already be + handled by expand_currency before this runs. + """ + def _sub(match: re.Match) -> str: + token = match.group(1) + if '.' in token: + return _decimal_to_ro(token) + return _int_to_ro(int(token)) + + return _NUM_TOKEN.sub(_sub, text) + + +# ---------- Currency ---------- + +_CURRENCY_MAIN = { + 'RON': ('leu', 'lei'), + 'USD': ('dolar', 'dolari'), + 'EUR': ('euro', 'euro'), + 'GBP': ('liră', 'lire'), +} + +_CURRENCY_SUB = { + 'RON': ('ban', 'bani'), + 'USD': ('cent', 'cenți'), + 'EUR': ('cent', 'cenți'), + 'GBP': ('penny', 'pence'), +} + +_CURRENCY_PATTERNS = [ + # RON suffix (case-insensitive: RON, ron, lei) + (re.compile(r'(? str: + """Format integer amount + currency noun with proper RO singular/plural + and 'de' particle. Uses 'un' (article) for n=1, not 'unu' (cardinal). 
+ """ + if n == 1: + return f"un {singular}" + word = _int_to_ro(n) + if _needs_de(n): + return f"{word} de {plural}" + return f"{word} {plural}" + + +def _format_currency(amount: str, code: str) -> str: + main_sg, main_pl = _CURRENCY_MAIN[code] + if '.' in amount: + whole_s, frac_s = amount.split('.', 1) + # Normalize fractional part to 2 digits so "12.5 RON" reads as + # 50 bani, not 5 bani. + if len(frac_s) == 1: + frac_s = frac_s + '0' + elif len(frac_s) > 2: + frac_s = frac_s[:2] + whole = int(whole_s) + frac = int(frac_s) + whole_part = _format_currency_unit(whole, main_sg, main_pl) + if frac == 0: + return whole_part + sub_sg, sub_pl = _CURRENCY_SUB[code] + frac_part = _format_currency_unit(frac, sub_sg, sub_pl) + return f"{whole_part} și {frac_part}" + return _format_currency_unit(int(amount), main_sg, main_pl) + + +def expand_currency(text: str) -> str: + """Expand currency amounts into natural Romanian. + + Recognises `` RON`` / `` lei`` suffix and ``$``, ``€``, ``£`` prefix + forms with optional 2-decimal fractional part (treated as sub-unit: + bani / cenți / pence). + """ + for pattern, code in _CURRENCY_PATTERNS: + text = pattern.sub(lambda m, c=code: _format_currency(m.group(1), c), text) + return text + + +# ---------- Symbols ---------- + +def expand_symbols(text: str) -> str: + """Replace common symbols with their Romanian spoken form.""" + text = text.replace('%', ' la sută') + text = text.replace('&', ' și ') + text = text.replace('@', ' la ') + text = text.replace('°', ' grade') + text = re.sub(r'\s+', ' ', text).strip() + return text + + +# ---------- Abbreviations ---------- + +# Longer patterns first so 'ș.a.m.d.' wins over 'ș.a.' +_ABBREVIATIONS = [ + (re.compile(r'(? 
str: + """Expand Romanian abbreviations into their full forms.""" + for pattern, replacement in _ABBREVIATIONS: + text = pattern.sub(replacement, text) + return text + + +# ---------- Top-level pipeline ---------- + +_MAX_WORDS = 200 +_TRUNCATE_SUFFIX = "Restul l-am scris în chat." + + +def normalize_for_tts(text: str) -> str: + """Apply the full normalization pipeline and truncate to 200 words. + + If the text exceeds 200 words, the first 200 are kept and the suffix + "Restul l-am scris în chat." is appended so the listener knows the + response continues in the text channel mirror. + """ + text = strip_markdown(text) + text = expand_abbreviations(text) + text = expand_currency(text) + text = expand_numbers_ro(text) + text = expand_symbols(text) + words = text.split() + if len(words) > _MAX_WORDS: + text = ' '.join(words[:_MAX_WORDS]) + f" {_TRUNCATE_SUFFIX}" + return text.strip() diff --git a/tests/test_voice_normalize.py b/tests/test_voice_normalize.py new file mode 100644 index 0000000..0b6c4b9 --- /dev/null +++ b/tests/test_voice_normalize.py @@ -0,0 +1,137 @@ +"""Tests for src/voice/normalize.py — 35 Romanian cases. + +Categories: + markdown strip (5), numbers cardinals (6), decimals (4), + currency natural (8), symbols (4), abbreviations (4), + truncation boundary (2), edge cases empty / whitespace (2). + +Total: 35. 
+""" +import pytest + +from src.voice.normalize import ( + expand_abbreviations, + expand_currency, + expand_numbers_ro, + expand_symbols, + normalize_for_tts, + strip_markdown, +) + + +# ============================================================ +# Markdown stripping (5) +# ============================================================ +@pytest.mark.parametrize("text,expected", [ + ("**bold text**", "bold text"), + ("*italic text*", "italic text"), + ("`code snippet`", "code snippet"), + ("[click here](https://example.com)", "click here"), + ("# Heading text", "Heading text"), +]) +def test_strip_markdown(text, expected): + assert strip_markdown(text) == expected + + +# ============================================================ +# Numbers cardinals (6) +# ============================================================ +@pytest.mark.parametrize("text,expected", [ + ("21", "douăzeci și unu"), + ("81", "optzeci și unu"), + ("100", "o sută"), + ("3", "trei"), + ("0", "zero"), + ("200", "două sute"), +]) +def test_expand_numbers_cardinals(text, expected): + assert expand_numbers_ro(text) == expected + + +# ============================================================ +# Decimals (4) +# ============================================================ +@pytest.mark.parametrize("text,expected", [ + ("3.14", "trei virgulă paisprezece"), + ("12.5", "doisprezece virgulă cinci"), + ("0.5", "zero virgulă cinci"), + ("99.99", "nouăzeci și nouă virgulă nouăzeci și nouă"), +]) +def test_expand_numbers_decimals(text, expected): + assert expand_numbers_ro(text) == expected + + +# ============================================================ +# Currency natural RO (8) — RON / USD / EUR / GBP mix +# ============================================================ +@pytest.mark.parametrize("text,expected", [ + ("12.50 RON", "doisprezece lei și cincizeci de bani"), + ("$25.99", "douăzeci și cinci de dolari și nouăzeci și nouă de cenți"), + ("€100.50", "o sută de euro și cincizeci de cenți"), + 
("£200", "două sute de lire"), + ("100 RON", "o sută de lei"), + ("$1", "un dolar"), + ("€50", "cincizeci de euro"), + ("1 RON", "un leu"), +]) +def test_expand_currency(text, expected): + assert expand_currency(text) == expected + + +# ============================================================ +# Symbols (4) +# ============================================================ +@pytest.mark.parametrize("text,expected", [ + ("25%", "25 la sută"), + ("foo & bar", "foo și bar"), + ("Marius @ home", "Marius la home"), + ("30°", "30 grade"), +]) +def test_expand_symbols(text, expected): + assert expand_symbols(text) == expected + + +# ============================================================ +# Abbreviations (4) +# ============================================================ +@pytest.mark.parametrize("text,expected", [ + ("etc.", "etcetera"), + ("dl. Popescu", "domnul Popescu"), + ("dna. Ionescu", "doamna Ionescu"), + ("nr. 5", "numărul 5"), +]) +def test_expand_abbreviations(text, expected): + assert expand_abbreviations(text) == expected + + +# ============================================================ +# Truncation boundary (2) +# ============================================================ +def test_truncate_exactly_200_words_unchanged(): + """Exactly 200 simple word tokens — no truncation, no suffix.""" + text = " ".join(["cuvant"] * 200) + out = normalize_for_tts(text) + assert "Restul l-am scris în chat." not in out + assert out.split() == ["cuvant"] * 200 + + +def test_truncate_over_200_words_appends_suffix(): + """250 word tokens — keep first 200 then append the chat-deferral phrase.""" + text = " ".join(["cuvant"] * 250) + out = normalize_for_tts(text) + assert out.endswith("Restul l-am scris în chat.") + words = out.split() + # First 200 are 'cuvant', followed by the 5-word suffix. 
+ assert words[:200] == ["cuvant"] * 200 + assert words[200:] == ["Restul", "l-am", "scris", "în", "chat."] + + +# ============================================================ +# Edge cases (2) +# ============================================================ +@pytest.mark.parametrize("text,expected", [ + ("", ""), + (" ", ""), +]) +def test_normalize_edge_cases(text, expected): + assert normalize_for_tts(text) == expected