feat(voice): Pas 3 — voice/normalize.py + 35 RO test cases

Pure functions pentru TTS text normalization (RO): - strip_markdown: regex bold/italic/code/link/heading/list - expand_numbers_ro: num2words pentru cardinals + decimal handling ("3.14" → "trei virgulă paisprezece", "3.05" → "trei virgulă zero cinci" digit-by-digit la leading zero) - expand_currency: formă naturală RO ("12.50 RON" → "doisprezece lei și cincizeci de bani", "$25.99" → "douăzeci și cinci de dolari și nouăzeci și nouă de cenți") - expand_symbols: %/&/@/° + whitespace collapse - expand_abbreviations: etc./dl./dna./nr./ş.a./ş.a.m.d. - normalize_for_tts: full pipeline + hard truncate 200 cuvinte cu "Restul l-am scris în chat." Pipeline order: markdown → abbreviations → currency → numbers → symbols → truncate. Currency BEFORE numbers — altfel "12.50 RON" se degradează la "doisprezece virgulă cincizeci RON". Romanian "de" particle rule: n>=20 AND (n%100 not in 1..19) → "o sută de lei", "o sută cinci lei" (no "de"). n=1 with currency → "un dolar" / "un leu" (article, nu cardinal). 35/35 tests pass: markdown(5), cardinals(6), decimals(4), currency RON/USD/EUR/GBP mix(8), symbols(4), abbreviations(4), truncation(2), edge cases empty/whitespace(2). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-27 14:42:41 +00:00
parent af5af8133f
commit a48562b2f5
2 changed files with 358 additions and 0 deletions
--- a/src/voice/normalize.py
+++ b/src/voice/normalize.py
@@ -0,0 +1,221 @@
+"""Voice mode text normalization for Romanian TTS.
+
+Pure functions — no side effects, no I/O, no logging. Strip markdown,
+expand numbers / currency / symbols / abbreviations into natural-sounding
+Romanian text. See plan: src/voice/normalize.py (Pas 3).
+
+Pipeline order in normalize_for_tts:
+    strip_markdown -> expand_abbreviations -> expand_currency
+    -> expand_numbers_ro -> expand_symbols -> truncate(200)
+
+Currency runs BEFORE generic number expansion so "12.50 RON" becomes
+"doisprezece lei și cincizeci de bani" rather than
+"doisprezece virgulă cincizeci RON".
+"""
+import re
+
+from num2words import num2words
+
+
+# ---------- Markdown ----------
+
+# [label](target) -> label
+_MARKDOWN_LINK = re.compile(r'\[([^\]]+)\]\([^)]+\)')
+# **strong** -> strong
+_MARKDOWN_BOLD = re.compile(r'\*\*([^*]+)\*\*')
+# `inline code` -> inline code (single line only)
+_MARKDOWN_CODE = re.compile(r'`([^`\n]+)`')
+# *emphasis* -> emphasis. The delimiters must hug their content: the opening
+# '*' is followed by a non-space and the closing '*' is preceded by one.
+# Without that, a '*' list bullet ("* item *with* emphasis") or spaced
+# arithmetic ("2 * 3 * 4") is mistaken for an emphasis delimiter and the
+# wrong asterisks are removed.
+_MARKDOWN_ITALIC = re.compile(r'(?<!\*)\*(?=\S)([^*\n]+)(?<=\S)\*(?!\*)')
+# Leading '#'..'######' heading marker at the start of a line.
+_MARKDOWN_HEADING = re.compile(r'^[ \t]*#{1,6}[ \t]+', re.MULTILINE)
+# Leading '-', '*' or '+' bullet marker at the start of a line.
+_MARKDOWN_LIST = re.compile(r'^[ \t]*[-*+][ \t]+', re.MULTILINE)
+
+
+def strip_markdown(text: str) -> str:
+    """Remove common markdown formatting, preserving the visible content.
+
+    Handles links, bold, inline code, italic, ATX headings and bullet
+    markers. Inline constructs are unwrapped first, then line-leading
+    heading and bullet markers are dropped.
+
+    Args:
+        text: Markdown-flavoured input.
+
+    Returns:
+        The text with the recognised markup removed. Line breaks and any
+        unrecognised markup are left untouched.
+    """
+    text = _MARKDOWN_LINK.sub(r'\1', text)
+    # Bold must run before italic so '**x**' is not half-consumed as '*x*'.
+    text = _MARKDOWN_BOLD.sub(r'\1', text)
+    text = _MARKDOWN_CODE.sub(r'\1', text)
+    text = _MARKDOWN_ITALIC.sub(r'\1', text)
+    text = _MARKDOWN_HEADING.sub('', text)
+    text = _MARKDOWN_LIST.sub('', text)
+    return text
+
+
+# ---------- Number helpers ----------
+
+def _needs_de(n: int) -> bool:
+    """Tell whether 'de' goes between the numeral *n* and the noun it counts.
+
+    Romanian inserts 'de' from 20 upwards, but not when the last two digits
+    of the number fall in 1..19 (105 and 119 take no 'de'; 120 and 200 do).
+
+    Args:
+        n: The integer amount being counted.
+
+    Returns:
+        True when the noun must be introduced by 'de', False otherwise.
+    """
+    tail = n % 100
+    return n >= 20 and not 1 <= tail <= 19
+
+
+def _int_to_ro(n: int) -> str:
+    """Spell the integer *n* out in Romanian words.
+
+    Thin wrapper over ``num2words`` with the language fixed to ``'ro'``, so
+    the rest of the module has a single place that picks the language.
+    """
+    spelled = num2words(n, lang='ro')
+    return spelled
+
+
+def _decimal_to_ro(s: str) -> str:
+    """Spell a decimal string 'X.Y' out in Romanian words.
+
+    The integer and fractional halves are joined by 'virgulă'. The fraction
+    is normally read as one whole number ('3.14' -> 'trei virgulă
+    paisprezece'). When it is longer than one digit and starts with '0', it
+    is read digit by digit instead ('3.05' -> 'trei virgulă zero cinci'), so
+    the leading zeros, and with them the magnitude, are not lost.
+
+    Args:
+        s: A string holding exactly one '.' between two runs of digits.
+
+    Returns:
+        The spoken Romanian form of the number.
+    """
+    whole_digits, frac_digits = s.split('.', 1)
+    spoken_whole = _int_to_ro(int(whole_digits))
+    has_leading_zero = len(frac_digits) > 1 and frac_digits[0] == '0'
+    if not has_leading_zero:
+        spoken_frac = _int_to_ro(int(frac_digits))
+    else:
+        # int('05') would collapse to 5, so each digit is spelled separately.
+        spoken_frac = ' '.join([_int_to_ro(int(digit)) for digit in frac_digits])
+    return f"{spoken_whole} virgulă {spoken_frac}"
+
+
+# ---------- Numbers ----------
+
+# A run of digits with an optional '.digits' tail, not touching any word
+# character (letter, digit or underscore) on either side.
+_NUM_TOKEN = re.compile(r'(?<!\w)(\d+(?:\.\d+)?)(?!\w)')
+
+
+def expand_numbers_ro(text: str) -> str:
+    """Replace stand-alone numeric tokens with their Romanian spelling.
+
+    A token qualifies only when no word character is glued to either side,
+    so digits embedded in identifiers are left alone. Tokens containing a
+    '.' are read as decimals with a 'virgulă' separator; the rest are read
+    as plain cardinals. Amounts attached to a currency are expected to have
+    been rewritten by expand_currency before this function runs.
+
+    Args:
+        text: Input that may contain digit tokens.
+
+    Returns:
+        The text with every qualifying token spelled out.
+    """
+    def _spell(found: re.Match) -> str:
+        digits = found.group(1)
+        if '.' not in digits:
+            return _int_to_ro(int(digits))
+        return _decimal_to_ro(digits)
+
+    return _NUM_TOKEN.sub(_spell, text)
+
+
+# ---------- Currency ----------
+
+# Main unit noun per currency code, as (singular, plural).
+_CURRENCY_MAIN = {
+    'RON': ('leu', 'lei'),
+    'USD': ('dolar', 'dolari'),
+    'EUR': ('euro', 'euro'),
+    'GBP': ('liră', 'lire'),
+}
+
+# Sub-unit noun (1/100 of the main unit) per currency code, as
+# (singular, plural). Keys mirror _CURRENCY_MAIN.
+_CURRENCY_SUB = {
+    'RON': ('ban', 'bani'),
+    'USD': ('cent', 'cenți'),
+    'EUR': ('cent', 'cenți'),
+    'GBP': ('penny', 'pence'),
+}
+
+# (compiled pattern, currency code) pairs. Group 1 of every pattern is the
+# numeric amount: digits with an optional '.digits' fraction.
+_CURRENCY_PATTERNS = [
+    # RON is written as a suffix: the amount, whitespace, then 'RON' or
+    # 'lei' in any letter case. The amount must not be glued to a
+    # preceding word character.
+    (re.compile(r'(?<!\w)(\d+(?:\.\d+)?)\s+(?:RON|lei)\b', re.IGNORECASE), 'RON'),
+    # The remaining currencies are written as a symbol immediately before
+    # the amount.
+    (re.compile(r'\$(\d+(?:\.\d+)?)'), 'USD'),
+    (re.compile(r'€(\d+(?:\.\d+)?)'), 'EUR'),
+    (re.compile(r'£(\d+(?:\.\d+)?)'), 'GBP'),
+]
+
+
+# Currency codes whose main unit is a grammatically feminine noun in
+# Romanian ('o liră', 'două lire'). Every other unit in the currency tables
+# keeps the numeral exactly as _int_to_ro returns it.
+_FEMININE_MAIN_UNITS = frozenset({'GBP'})
+
+# Numeral words that inflect for gender, mapped to their feminine form.
+# Only the last word of a spelled-out number agrees with the noun
+# ('douăzeci și două de lire'), so only that word is ever looked up here.
+# Assumes _int_to_ro yields the masculine forms, as in the module's own
+# 'doisprezece lei' example.
+_FEMININE_NUMERALS = {
+    'unu': 'una',
+    'doi': 'două',
+    'doisprezece': 'douăsprezece',
+}
+
+
+def _format_currency_unit(n: int, singular: str, plural: str, *,
+                          feminine: bool = False) -> str:
+    """Join an integer amount and a currency noun into natural Romanian.
+
+    Picks singular vs. plural, inserts the 'de' particle where _needs_de
+    asks for it, and uses the indefinite article rather than the cardinal
+    'unu' for an amount of one.
+
+    Args:
+        n: The amount, as a non-negative integer.
+        singular: Noun form used when n == 1.
+        plural: Noun form used for every other amount.
+        feminine: True when the noun is feminine. The article becomes 'o'
+            and a trailing 'unu' / 'doi' / 'doisprezece' is switched to
+            'una' / 'două' / 'douăsprezece'. Defaults to False, which keeps
+            the previous behaviour.
+
+    Returns:
+        The spoken phrase, e.g. 'un leu', 'cinci lei', 'o sută de lei',
+        'o liră', 'două lire'.
+    """
+    if n == 1:
+        article = 'o' if feminine else 'un'
+        return f"{article} {singular}"
+    word = _int_to_ro(n)
+    if feminine:
+        # Re-inflect only the final word; earlier words ('două mii', 'o
+        # sută') agree with the numeral nouns they modify, not with the
+        # currency.
+        head, gap, last = word.rpartition(' ')
+        word = f"{head}{gap}{_FEMININE_NUMERALS.get(last, last)}"
+    if _needs_de(n):
+        return f"{word} de {plural}"
+    return f"{word} {plural}"
+
+
+def _format_currency(amount: str, code: str) -> str:
+    """Spell a numeric currency amount out in Romanian.
+
+    Args:
+        amount: Digits with an optional '.digits' fraction, as captured by
+            the patterns in _CURRENCY_PATTERNS.
+        code: A key of _CURRENCY_MAIN / _CURRENCY_SUB ('RON', 'USD', ...).
+
+    Returns:
+        The main-unit phrase alone when there is no fraction or it is zero
+        ('doisprezece lei'); otherwise the main-unit and sub-unit phrases
+        joined by 'și' ('doisprezece lei și cincizeci de bani').
+
+    Raises:
+        KeyError: If *code* is not a known currency code.
+    """
+    main_sg, main_pl = _CURRENCY_MAIN[code]
+    feminine = code in _FEMININE_MAIN_UNITS
+    whole_s, _, frac_s = amount.partition('.')
+    whole_part = _format_currency_unit(
+        int(whole_s), main_sg, main_pl, feminine=feminine
+    )
+    # Normalise the fraction to exactly two digits: pad so '12.5' means 50
+    # sub-units rather than 5, and cut anything past hundredths.
+    frac = int((frac_s + '00')[:2])
+    if frac == 0:
+        return whole_part
+    sub_sg, sub_pl = _CURRENCY_SUB[code]
+    frac_part = _format_currency_unit(frac, sub_sg, sub_pl)
+    return f"{whole_part} și {frac_part}"
+
+
+def expand_currency(text: str) -> str:
+    """Rewrite currency amounts as natural spoken Romanian.
+
+    Each entry of _CURRENCY_PATTERNS is applied in turn. That covers the
+    ``<n> RON`` / ``<n> lei`` suffix form and the ``$``, ``€``, ``£`` prefix
+    forms. A fractional part, when present, is spoken as the sub-unit
+    (bani / cenți / pence).
+
+    Args:
+        text: Input that may contain currency amounts.
+
+    Returns:
+        The text with every recognised amount replaced by its spoken form.
+    """
+    def _speaker(currency_code: str):
+        # Bind the code now so each substitution callback keeps its own.
+        def _speak(found: re.Match) -> str:
+            return _format_currency(found.group(1), currency_code)
+        return _speak
+
+    for amount_re, currency_code in _CURRENCY_PATTERNS:
+        text = amount_re.sub(_speaker(currency_code), text)
+    return text
+
+
+# ---------- Symbols ----------
+
+def expand_symbols(text: str) -> str:
+    """Replace common symbols with their Romanian spoken form.
+
+    '%' becomes 'la sută', '&' becomes 'și', '@' becomes 'la' and '°'
+    becomes 'grade'. Runs of whitespace are then collapsed to single spaces
+    and the result is trimmed.
+
+    Args:
+        text: Input that may contain the symbols above.
+
+    Returns:
+        The text with symbols spoken out and whitespace normalised.
+    """
+    # '%' and '°' expand to a word with no trailing space, so a symbol glued
+    # to a following word would fuse with it ('20°C' -> '20 gradeC').
+    # Insert a space only in that case, so punctuation after the symbol
+    # ('5%,') still sits directly against the expansion.
+    text = re.sub(r'([%°])(?=\w)', r'\1 ', text)
+    text = text.replace('%', ' la sută')
+    text = text.replace('&', ' și ')
+    text = text.replace('@', ' la ')
+    text = text.replace('°', ' grade')
+    text = re.sub(r'\s+', ' ', text).strip()
+    return text
+
+
+# ---------- Abbreviations ----------
+
+# (pattern, spoken form) pairs, matched case-insensitively and only when the
+# abbreviation is not glued to a preceding word character. Longer
+# abbreviations come first so 'ș.a.m.d.' wins over its prefix 'ș.a.'.
+_ABBREVIATIONS = [
+    (re.compile(source, re.IGNORECASE), spoken)
+    for source, spoken in (
+        (r'(?<!\w)[șş]\.a\.m\.d\.', 'și așa mai departe'),
+        (r'(?<!\w)[șş]\.a\.', 'și altele'),
+        (r'(?<!\w)etc\.', 'etcetera'),
+        (r'(?<!\w)dl\.', 'domnul'),
+        (r'(?<!\w)dna\.', 'doamna'),
+        (r'(?<!\w)nr\.', 'numărul'),
+    )
+]
+
+
+def expand_abbreviations(text: str) -> str:
+    """Spell Romanian abbreviations out in full.
+
+    Applies every entry of _ABBREVIATIONS in order. The abbreviation's own
+    trailing period is part of the match and is consumed by the
+    replacement.
+
+    Args:
+        text: Input that may contain abbreviations.
+
+    Returns:
+        The text with each recognised abbreviation replaced.
+    """
+    expanded = text
+    for abbreviation_re, spoken in _ABBREVIATIONS:
+        expanded = abbreviation_re.sub(spoken, expanded)
+    return expanded
+
+
+# ---------- Top-level pipeline ----------
+
+# Upper bound on the number of whitespace-separated words that are spoken.
+_MAX_WORDS = 200
+# Spoken after a truncated reply ("I wrote the rest in the chat.").
+_TRUNCATE_SUFFIX = "Restul l-am scris în chat."
+
+
+def normalize_for_tts(text: str) -> str:
+    """Run the whole normalization pipeline and cap the result at 200 words.
+
+    Stages run in a fixed order: markdown stripping, abbreviations,
+    currency, generic numbers, symbols. Currency runs before generic number
+    expansion so that amounts are read with their unit. When the normalised
+    text is longer than 200 words, only the first 200 are kept and
+    "Restul l-am scris în chat." is appended, so the listener knows the
+    reply continues in the text channel.
+
+    Args:
+        text: Raw reply text, possibly containing markdown.
+
+    Returns:
+        Text ready for speech synthesis, without leading or trailing
+        whitespace.
+    """
+    stages = (
+        strip_markdown,
+        expand_abbreviations,
+        expand_currency,
+        expand_numbers_ro,
+        expand_symbols,
+    )
+    for stage in stages:
+        text = stage(text)
+    tokens = text.split()
+    if len(tokens) <= _MAX_WORDS:
+        return text.strip()
+    kept = ' '.join(tokens[:_MAX_WORDS])
+    return f"{kept} {_TRUNCATE_SUFFIX}".strip()