#!/usr/bin/env python3
"""Text-to-speech via a local Supertonic server.

CLI:
    python3 tools/tts.py --text "Salut Marius" [--voice M1] [--lang ro]
    → stdout: {"ok": true, "path": "/tmp/echo-tts-xxx.wav", "size_bytes": 12345}
    → stdout: {"ok": false, "error": "..."}

Module:
    from tools.tts import synthesize
    result = synthesize("text", voice="M1", lang="ro")
"""

import argparse
import json
import os
import sys
import tempfile
import unicodedata

try:
    import httpx
except ImportError:
    # Keep the module importable so callers still get the documented
    # {"ok": False, "error": ...} result instead of an import-time traceback.
    httpx = None

SUPERTONIC_URL = "http://127.0.0.1:7788"
VOICES = {"M1", "M2", "M3", "M4", "M5", "F1", "F2", "F3", "F4", "F5"}
DEFAULT_VOICE = "M2"
DEFAULT_LANG = "ro"

# Punctuation Supertonic synthesis rejects with HTTP 500 (Romanian curly quotes,
# smart dashes, ellipsis, angle quotes). Mapped to ASCII so a stray „foo" in
# any caller's text doesn't kill the whole request.
_TTS_PUNCT_MAP = {
    '„': '"',
    '“': '"',
    '”': '"',
    '‘': "'",
    '’': "'",
    '‚': "'",
    '«': '"',
    '»': '"',
    '–': '-',
    '—': '-',
    '…': '...',
}
# Pre-built table so all replacements happen in a single pass.
_TTS_PUNCT_TABLE = str.maketrans(_TTS_PUNCT_MAP)

# Supertonic ONNX model hard limit: inputs longer than this trigger
# Mul node dimension mismatches in attention layers.
_MAX_TTS_CHARS = 400


def _is_speakable(ch: str) -> bool:
    """Return True for printable ASCII and Latin-1/Latin Extended letters.

    The upper range starts at U+00A0 so the C1 control characters
    (U+0080-U+009F) are rejected, and ends at U+024F, which covers the
    Romanian diacritics.
    """
    cp = ord(ch)
    return 32 <= cp <= 126 or 160 <= cp <= 591


def sanitize_for_supertonic(text: str) -> str:
    """Return *text* reduced to what the Supertonic model accepts.

    Steps, in order:
      1. NFC-normalise, so decomposed diacritics (letter + combining mark)
         become single code points instead of losing their mark in step 3.
      2. Map the punctuation in ``_TTS_PUNCT_MAP`` to ASCII.
      3. Replace every other unsupported character (emoji, control chars,
         non-Latin scripts) with a space.
      4. Collapse runs of whitespace and trim.
      5. Truncate to ``_MAX_TTS_CHARS`` characters.

    The result may be an empty string.
    """
    text = unicodedata.normalize("NFC", text).translate(_TTS_PUNCT_TABLE)
    kept = "".join(ch if _is_speakable(ch) else " " for ch in text)
    collapsed = " ".join(kept.split())
    # rstrip: the cut may land right after a word, leaving a dangling space.
    return collapsed[:_MAX_TTS_CHARS].rstrip()


def _write_temp_wav(content: bytes) -> str:
    """Write *content* to a new ``echo-tts-*.wav`` temp file and return its path.

    If writing fails, the partially written file is removed before the
    exception propagates.
    """
    fd, path = tempfile.mkstemp(prefix="echo-tts-", suffix=".wav")
    try:
        with open(fd, "wb") as f:
            f.write(content)
    except Exception:
        try:
            os.unlink(path)
        except OSError:
            pass
        raise
    return path


def synthesize(text: str, voice: str = DEFAULT_VOICE, lang: str = DEFAULT_LANG) -> dict:
    """Call the Supertonic server and save the audio to a temp WAV file.

    Args:
        text: Text to speak. It is passed through ``sanitize_for_supertonic``
            first; if nothing is left afterwards, no request is made.
        voice: One of ``VOICES`` (case-insensitive). Unknown or empty values
            fall back to ``DEFAULT_VOICE``.
        lang: Language code sent to the server. If the request fails with an
            HTTP error status and ``lang`` is not ``"na"``, it is retried once
            with ``"na"``.

    Returns:
        {"ok": True, "path": "/tmp/echo-tts-xxx.wav", "size_bytes": N}
        {"ok": False, "error": "error message"}
    """
    # Validate AFTER sanitising: emoji-only input sanitises to an empty string.
    text = sanitize_for_supertonic(text) if text else ""
    if not text:
        return {"ok": False, "error": "Text gol."}

    if httpx is None:
        return {
            "ok": False,
            "error": "Pachetul httpx nu este instalat. Instalează cu: pip install httpx",
        }

    voice = (voice or DEFAULT_VOICE).upper()
    if voice not in VOICES:
        voice = DEFAULT_VOICE

    # Fallback: if the requested language fails with an HTTP error, try
    # "na" once before giving up.
    attempts = [lang] if lang == "na" else [lang, "na"]
    error = ""
    resp = None
    for attempt_lang in attempts:
        try:
            resp = httpx.post(
                f"{SUPERTONIC_URL}/v1/audio/speech",
                json={
                    "model": "supertonic-3",
                    "input": text,
                    "voice": voice,
                    "response_format": "wav",
                    "lang": attempt_lang,
                },
                timeout=60.0,
            )
            resp.raise_for_status()
            break
        except httpx.ConnectError:
            return {
                "ok": False,
                "error": (
                    "Serverul Supertonic nu rulează pe :7788. "
                    "Pornește cu: systemctl --user start supertonic-tts"
                ),
            }
        except httpx.HTTPStatusError as e:
            # Remember the failure; the next attempt (if any) may succeed.
            error = f"HTTP {e.response.status_code}: {e.response.text[:300]}"
        except Exception as e:
            # Some exceptions have an empty message; never return a blank error.
            return {"ok": False, "error": str(e) or type(e).__name__}
    else:
        # Every attempt ended in an HTTP error; report the last one.
        return {"ok": False, "error": error}

    # Save to a temp file.
    try:
        path = _write_temp_wav(resp.content)
    except Exception as e:
        return {"ok": False, "error": f"Scriere fișier: {e}"}
    return {"ok": True, "path": path, "size_bytes": len(resp.content)}


def main() -> None:
    """CLI entry point: synthesize ``--text``, print the JSON result, exit 0/1."""
    parser = argparse.ArgumentParser(description="Supertonic TTS CLI")
    parser.add_argument("--text", required=True, help="Text de convertit în audio")
    parser.add_argument(
        "--voice",
        default=DEFAULT_VOICE,
        # Interpolated so the help text cannot drift from the actual default.
        help=f"Voce: M1-M5 (masculin) sau F1-F5 (feminin). Default: {DEFAULT_VOICE}",
    )
    parser.add_argument(
        "--lang",
        default=DEFAULT_LANG,
        help="Limbă (ro, en, na). Default: ro. Fallback automat la na dacă ro eșuează.",
    )
    args = parser.parse_args()

    result = synthesize(args.text, voice=args.voice, lang=args.lang)
    print(json.dumps(result, ensure_ascii=False))
    sys.exit(0 if result.get("ok") else 1)


if __name__ == "__main__":
    main()