Update cron, dashboard, root +3 more (+1 ~11)

2026-05-28 20:21:28 +00:00
parent e79bed7afe
commit 0ce8a5a04d
12 changed files with 217 additions and 51 deletions
--- a/src/voice/pipeline.py
+++ b/src/voice/pipeline.py
@@ -53,6 +53,24 @@ NO_SPEECH_DROP_THRESHOLD = 0.6
 PROJECT_ROOT = Path(__file__).resolve().parent.parent.parent
 LOGS_DIR = PROJECT_ROOT / "logs"
 VOICE_METRICS_PATH = LOGS_DIR / "voice_metrics.jsonl"
+VOICE_STT_LOG_PATH = LOGS_DIR / "voice_stt_log.jsonl"
+_stt_log_lock = threading.Lock()
+
+
+def _append_stt_log(entry: dict) -> None:
+    """Append one Whisper transcript to ``voice_stt_log.jsonl``.
+
+    Separate from ``record_enabled``/``transcripts_jsonl_path`` (which feed
+    KB). This log is always-on, scoped to STT debugging — used to mine
+    code-switching mistranscriptions (English words in Romanian flow) over
+    several days and build a personal vocabulary correction table.
+    """
+    try:
+        LOGS_DIR.mkdir(parents=True, exist_ok=True)
+        with _stt_log_lock, VOICE_STT_LOG_PATH.open("a", encoding="utf-8") as f:
+            f.write(json.dumps(entry, ensure_ascii=False) + "\n")
+    except Exception as e:  # noqa: BLE001
+        log.debug("STT log write failed: %s", e)


 # ---------- Lazy model singletons ----------
@@ -100,24 +118,31 @@ def _get_silero_vad():
 def _pcm48_stereo_to_16_mono(pcm: bytes) -> np.ndarray:
    """Discord 48kHz s16le stereo bytes -> 16kHz mono float32 in [-1, 1].

-    Cheap downsample: average the two channels, then average every 3
-    samples (48k / 3 = 16k). faster-whisper + silero-vad accept the
-    resulting ``np.float32`` array directly.
+    Mix channels to mono, then resample 48k→16k with torchaudio's
+    windowed-sinc resampler (Hann window by default, here with
+    ``lowpass_filter_width=16``) instead of a naive every-3-samples
+    average. That 3-tap average is only a weak anti-aliasing filter, so HF
+    content (sibilants, fricatives) folded back into the speech band and
+    degraded Whisper's accuracy on short wake phrases like "Salut, Eco".
+    faster-whisper + silero-vad accept the ``np.float32`` result directly.
    """
    if not pcm:
        return np.zeros(0, dtype=np.float32)
    samples = np.frombuffer(pcm, dtype=np.int16)
    if samples.size % 2 != 0:
        samples = samples[:-1]
-    stereo = samples.reshape(-1, 2)
-    mono = stereo.mean(axis=1).astype(np.float32) / 32768.0
-    if mono.size == 0:
-        return mono
-    trim = (mono.size // 3) * 3
-    if trim == 0:
+    if samples.size == 0:
        return np.zeros(0, dtype=np.float32)
-    mono = mono[:trim].reshape(-1, 3).mean(axis=1)
-    return mono.astype(np.float32)
+    stereo = samples.reshape(-1, 2)
+    mono48 = stereo.mean(axis=1).astype(np.float32) / 32768.0
+    import torch
+    import torchaudio.functional as taF
+    wav = torch.from_numpy(mono48).unsqueeze(0)
+    mono16 = taF.resample(
+        wav, SAMPLE_RATE_DISCORD, SAMPLE_RATE_WHISPER,
+        lowpass_filter_width=16,
+    ).squeeze(0).numpy()
+    return np.ascontiguousarray(mono16, dtype=np.float32)


 # ---------- VoiceSession ----------
@@ -646,19 +671,25 @@ class EchoVoiceSink(AudioSink):
    def _flush_to_stt(self, user_id: int, pcm48_stereo: bytes) -> None:
        """Downsample, Whisper-transcribe RO, drop hallucinations, dispatch."""
        try:
+            t_start = time.monotonic()
            mono16 = _pcm48_stereo_to_16_mono(pcm48_stereo)
            if mono16.size == 0:
                return
+            audio_duration_s = float(mono16.size) / float(SAMPLE_RATE_WHISPER)
            model = _get_whisper_model()
            segments, _info = model.transcribe(
                mono16, language="ro", beam_size=5,
                initial_prompt=(
-                    "Echo Core, asistent personal AI românesc al lui Marius. "
-                    "Conversație colocvială în română. "
-                    "Comenzi voce recunoscute: schimbă vocea pe M1, M2, M3, M4, M5, "
-                    "F1, F2, F3, F4, F5. Exemple: vorbește cu vocea M5, voce F3, "
+                    "Conversatie in romana cu asistentul Eco (Echo Core). "
+                    "Marius i se adreseaza cu 'Salut, Eco', 'Eco' sau 'Echo Core' "
+                    "la inceputul mesajului. Exemple: 'Salut, Eco, ce mai faci?', "
+                    "'Eco, adauga pe agenda de maine sa sun la Bianca', "
+                    "'Echo Core, vreau sa-mi reamintesti diseara'. "
+                    "Comenzi voce recunoscute: schimba vocea pe M1, M2, M3, M4, M5, "
+                    "F1, F2, F3, F4, F5. Exemple: vorbeste cu vocea M5, voce F3, "
                    "treci pe vocea F1."
                ),
+                hotwords="Eco Echo Core Marius Bianca",
                condition_on_previous_text=False,
            )
            text_parts: list[str] = []
@@ -677,6 +708,16 @@ class EchoVoiceSink(AudioSink):
            text = " ".join(text_parts).strip()
            if not text:
                return
+            _append_stt_log({
+                "ts": time.time(),
+                "channel_id": self.session.voice_channel_id,
+                "user_id": int(user_id),
+                "text": text,
+                "no_speech_prob": round(worst_no_speech, 3),
+                "audio_duration_s": round(audio_duration_s, 3),
+                "stt_latency_s": round(time.monotonic() - t_start, 3),
+                "model": "small",
+            })
            self._schedule_segment_done(user_id, text, worst_no_speech)
        except Exception as e:  # noqa: BLE001
            log.warning("Whisper transcribe failed: %s", e)