Files
echo-core/tools/tts.py
Marius Mutu d175d5ba5a chore: working-tree state — anaf snapshots, cron state, KB notes, tools
Pre-existing uncommitted changes swept in with the STT work:
anaf-monitor snapshots/versions, cron job + newsletter state, 9 youtube KB
notes, tools/ocr_bon.py, and tools/tts.py.

Note: the tts.py change breaks 2 truncation tests in test_voice_normalize.py
(sanitize word-count) — flagged for a separate follow-up.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
2026-06-27 18:16:31 +00:00

133 lines
4.2 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""Text-to-speech via Supertonic local server.
CLI:
python3 tools/tts.py --text "Salut Marius" [--voice M1] [--lang ro]
→ stdout: {"ok": true, "path": "/tmp/echo-tts-xxx.wav", "size_bytes": 12345}
→ stdout: {"ok": false, "error": "..."}
Module:
from tools.tts import synthesize
result = synthesize("text", voice="M1", lang="ro")
"""
import argparse
import json
import os
import sys
import tempfile

import httpx
# Base URL of the Supertonic TTS server (loopback, port 7788).
SUPERTONIC_URL = "http://127.0.0.1:7788"

# Accepted voice identifiers: M1-M5 (masculine) and F1-F5 (feminine).
VOICES = {f"{prefix}{index}" for prefix in ("M", "F") for index in range(1, 6)}

# Fallbacks used when the caller does not pick a voice / language.
DEFAULT_VOICE = "M2"
DEFAULT_LANG = "ro"
# Punctuation Supertonic synthesis rejects with HTTP 500 (Romanian curly quotes,
# smart dashes, ellipsis, angle quotes). Mapped to ASCII so a stray „foo" in
# any caller's text doesn't kill the whole request.
_TTS_PUNCT_MAP = {
'': '"', '': '"', '': '"',
'': "'", '': "'", '': "'",
'«': '"', '»': '"',
'': '-', '': '-',
'': '...',
}
# Supertonic ONNX model hard limit: inputs longer than this trigger
# Mul node dimension mismatches in attention layers.
_MAX_TTS_CHARS = 400
def sanitize_for_supertonic(text: str) -> str:
"""Replace Unicode punctuation and strip chars that crash Supertonic's ONNX model."""
for src, dst in _TTS_PUNCT_MAP.items():
text = text.replace(src, dst)
# Strip emoji and high-codepoint chars (keep ASCII printable + Latin/Romanian diacritice)
cleaned = []
for ch in text:
cp = ord(ch)
if (32 <= cp <= 126) or (128 <= cp <= 591):
cleaned.append(ch)
else:
cleaned.append(' ')
text = ' '.join(''.join(cleaned).split())
if len(text) > _MAX_TTS_CHARS:
text = text[:_MAX_TTS_CHARS]
return text
def synthesize(text: str, voice: str = DEFAULT_VOICE, lang: str = DEFAULT_LANG) -> dict:
    """Synthesize *text* through the Supertonic server and save it as a WAV file.

    Args:
        text: Text to speak. It is passed through ``sanitize_for_supertonic``
            before being sent.
        voice: Voice identifier, case-insensitive. Anything not in ``VOICES``
            (including ``None``) silently falls back to ``DEFAULT_VOICE``.
        lang: Language code sent to the server. If the server answers with an
            HTTP error status and *lang* is not ``"na"``, the request is
            retried once with ``lang="na"``.

    Returns:
        ``{"ok": True, "path": <temp wav path>, "size_bytes": N}`` on success,
        ``{"ok": False, "error": <message>}`` on any failure. The function
        does not raise for request or file errors; the caller owns (and must
        delete) the returned temp file.
    """
    if not text or not text.strip():
        return {"ok": False, "error": "Text gol."}

    text = sanitize_for_supertonic(text)
    # Sanitizing can strip everything (e.g. emoji-only input); don't send an
    # empty payload to the server.
    if not text:
        return {"ok": False, "error": "Text gol."}

    voice = (voice or DEFAULT_VOICE).upper()
    if voice not in VOICES:
        voice = DEFAULT_VOICE

    try:
        resp = httpx.post(
            f"{SUPERTONIC_URL}/v1/audio/speech",
            json={
                "model": "supertonic-3",
                "input": text,
                "voice": voice,
                "response_format": "wav",
                "lang": lang,
            },
            timeout=60.0,
        )
        resp.raise_for_status()
    except httpx.ConnectError:
        return {
            "ok": False,
            "error": (
                "Serverul Supertonic nu rulează pe :7788. "
                "Pornește cu: systemctl --user start supertonic-tts"
            ),
        }
    except httpx.HTTPStatusError as e:
        # Fallback: if the requested language fails, retry once with "na"
        # (language-agnostic). The recursion stops because lang is then "na".
        if lang != "na":
            return synthesize(text, voice=voice, lang="na")
        body = e.response.text[:300]
        return {"ok": False, "error": f"HTTP {e.response.status_code}: {body}"}
    except Exception as e:
        # Some exceptions (timeouts in particular) can carry an empty
        # message; fall back to the class name so the error is never blank.
        return {"ok": False, "error": str(e) or type(e).__name__}

    # Save the audio to a temp file.
    path = None
    try:
        fd, path = tempfile.mkstemp(prefix="echo-tts-", suffix=".wav")
        with open(fd, "wb") as f:
            f.write(resp.content)
    except Exception as e:
        # Don't leave a partial/empty temp file behind when the write fails.
        if path is not None:
            try:
                os.unlink(path)
            except OSError:
                pass
        return {"ok": False, "error": f"Scriere fișier: {e}"}
    return {"ok": True, "path": path, "size_bytes": len(resp.content)}
if __name__ == "__main__":
    # CLI entry point: prints one JSON object on stdout and exits 0 on
    # success, 1 on failure.
    parser = argparse.ArgumentParser(description="Supertonic TTS CLI")
    parser.add_argument("--text", required=True, help="Text de convertit în audio")
    parser.add_argument(
        "--voice", default=DEFAULT_VOICE,
        # %(default)s keeps the help text in sync with DEFAULT_VOICE (the
        # hard-coded text used to claim M1 while the real default is M2).
        help="Voce: M1-M5 (masculin) sau F1-F5 (feminin). Default: %(default)s"
    )
    parser.add_argument(
        "--lang", default=DEFAULT_LANG,
        help=(
            "Limbă (ro, en, na). Default: %(default)s. "
            "Fallback automat la na dacă ro eșuează."
        )
    )
    args = parser.parse_args()
    result = synthesize(args.text, voice=args.voice, lang=args.lang)
    # ensure_ascii=False so Romanian diacritics in error messages stay readable.
    print(json.dumps(result, ensure_ascii=False))
    sys.exit(0 if result.get("ok") else 1)