Files
echo-core/dashboard/handlers/youtube.py
Marius Mutu 6e9dfd137c feat: youtube_subs + dashboard includ descrierea video ca index
- tools/youtube_subs.py: get_subtitles() returneaza acum (title, desc, transcript).
  Functii noi is_description_about_video() si extract_relevant_description()
  detecteaza daca descrierea contine capitole/timestamps (nu doar promotie autori)
  si curata trailing-urile promotionale inainte sa includa descrierea in output.
- dashboard/handlers/youtube.py: aceleasi functii adaugate; nota KB generata
  include acum un bloc "Descriere / Index" daca descrierea e relevanta pentru video.
- memory/kb/youtube: nota Jeremy Grantham (AI bubble, investitii, toxicitate)
  cu descrierea ca index de capitole.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-06-27 17:00:59 +00:00

201 lines
6.1 KiB
Python

"""YouTube subtitle-download + note-creation endpoint."""
import json
import logging
import os
import re
import subprocess
import sys
import tempfile
import traceback
from datetime import datetime
from pathlib import Path

import constants
log = logging.getLogger(__name__)
def _clean_vtt(content):
"""Convert VTT captions to plain text."""
lines = []
seen = set()
for line in content.split('\n'):
if any([
line.startswith('WEBVTT'),
line.startswith('Kind:'),
line.startswith('Language:'),
'-->' in line,
line.strip().startswith('<'),
not line.strip(),
re.match(r'^\d+$', line.strip()),
]):
continue
clean = re.sub(r'<[^>]+>', '', line).strip()
if clean and clean not in seen:
seen.add(clean)
lines.append(clean)
return ' '.join(lines)
def _is_description_about_video(description):
"""Return True if description contains info about the video (chapters/topics)."""
if not description or len(description.strip()) < 50:
return False
timestamp_pattern = re.compile(r'\b\d{1,2}:\d{2}(:\d{2})?\b')
if len(timestamp_pattern.findall(description)) >= 3:
return True
lines = description.strip().split('\n')
bullet_lines = [l for l in lines if re.match(r'^\s*[◼•\-\*▶►]\s+\S', l)]
if len(bullet_lines) >= 3:
return True
numbered_lines = [l for l in lines if re.match(r'^\s*\d+[\.\)]\s+\S', l)]
if len(numbered_lines) >= 3:
return True
return False
def _extract_relevant_description(description):
"""Strip promotional tails (links, social media) from description."""
if not description:
return ""
promo_patterns = [
re.compile(r'https?://\S+'),
re.compile(r'instagram|twitter|facebook|tiktok|linkedin|patreon|spotify', re.I),
re.compile(r'follow|subscribe|newsletter|merch|sponsor|affiliate', re.I),
re.compile(r'purchase|buy|order|shop|store', re.I),
]
result_lines = []
promo_streak = 0
for line in description.strip().split('\n'):
stripped = line.strip()
is_promo = any(p.search(stripped) for p in promo_patterns)
if is_promo:
promo_streak += 1
if promo_streak >= 2:
break
else:
promo_streak = 0
result_lines.append(line)
while result_lines and not result_lines[-1].strip():
result_lines.pop()
return '\n'.join(result_lines)
def _process_youtube(url):
    """Fetch metadata and English auto-subtitles for *url* and save a note.

    Runs ``yt-dlp`` twice (metadata as JSON, then auto-generated English
    subtitles in VTT format), converts the subtitles to plain text, writes a
    Markdown note into ``constants.NOTES_DIR``, runs the
    ``update_notes_index.py`` tool and finally tries to index the note via
    ``src.memory_search.index_file`` (best effort; failures are only logged).

    Args:
        url: Video URL handed to yt-dlp.

    Returns:
        The file name of the created note, or ``None`` when yt-dlp could not
        return video info or no subtitles were found.

    Raises:
        subprocess.TimeoutExpired: If a yt-dlp call exceeds its timeout.
        json.JSONDecodeError: If yt-dlp's metadata output is not valid JSON.
    """
    yt_dlp = os.path.expanduser('~/.local/bin/yt-dlp')
    # '--' ends option parsing so a URL beginning with '-' can never be read
    # as a yt-dlp flag; '--no-playlist' keeps the output to a single JSON
    # document when the URL also carries a playlist parameter.
    result = subprocess.run(
        [yt_dlp, '--dump-json', '--no-download', '--no-playlist', '--', url],
        capture_output=True, text=True, timeout=30,
    )
    if result.returncode != 0:
        print(f"Failed to get video info: {result.stderr}")
        return None
    info = json.loads(result.stdout)
    # yt-dlp may report these fields as null, and duration as a float;
    # normalise so the slug and the m:ss formatting below cannot fail.
    title = info.get('title') or 'Unknown'
    duration = int(info.get('duration') or 0)
    description = info.get('description') or ''

    # A private, auto-removed directory per call: a shared fixed path would
    # let concurrent requests delete or pick up each other's subtitle files.
    with tempfile.TemporaryDirectory(prefix='yt_subs_') as tmp:
        temp_dir = Path(tmp)
        subprocess.run([
            yt_dlp, '--write-auto-subs', '--sub-langs', 'en',
            '--skip-download', '--sub-format', 'vtt', '--no-playlist',
            '-o', str(temp_dir / '%(id)s'),
            '--', url,
        ], capture_output=True, timeout=120)
        transcript = None
        sub_file = next(iter(sorted(temp_dir.glob('*.vtt'))), None)
        if sub_file is not None:
            content = sub_file.read_text(encoding='utf-8', errors='replace')
            transcript = _clean_vtt(content)
    if not transcript:
        print("No subtitles found")
        return None

    date_str = datetime.now().strftime('%Y-%m-%d')
    slug = re.sub(r'[^\w\s-]', '', title.lower())[:50].strip().replace(' ', '-')
    if not slug:
        # Title consisted only of punctuation/emoji; fall back to the video id.
        slug = info.get('id') or 'video'
    filename = f"{date_str}_{slug}.md"

    # Build optional description block
    desc_block = ""
    if _is_description_about_video(description):
        relevant_desc = _extract_relevant_description(description)
        if relevant_desc:
            desc_block = f"\n## Descriere / Index\n\n{relevant_desc}\n\n---\n"

    # The blank line before each '---' matters: directly under a text line,
    # Markdown reads '---' as a setext heading underline and would turn the
    # Tags line / the whole transcript into a heading.
    note_content = (
        f"# {title}\n"
        f"**Video:** {url}\n"
        f"**Duration:** {duration // 60}:{duration % 60:02d}\n"
        f"**Saved:** {date_str}\n"
        "**Tags:** #youtube #to-summarize\n"
        "\n"
        "---\n"
        f"{desc_block}\n"
        "## Transcript\n"
        f"{transcript[:15000]}\n"
        "\n"
        "---\n"
        "*Notă: Sumarizarea va fi adăugată de Echo.*\n"
    )
    constants.NOTES_DIR.mkdir(parents=True, exist_ok=True)
    note_path = constants.NOTES_DIR / filename
    note_path.write_text(note_content, encoding='utf-8')

    subprocess.run(
        [sys.executable, str(constants.TOOLS_DIR / 'update_notes_index.py')],
        capture_output=True,
    )

    # Index new note with Ollama semantic embeddings (best effort).
    try:
        base_dir = str(constants.BASE_DIR)
        # Avoid growing sys.path by one entry on every processed video.
        if base_dir not in sys.path:
            sys.path.insert(0, base_dir)
        from src.memory_search import index_file
        n = index_file(note_path)
        log.info("Ollama indexed %s (%d chunks)", filename, n)
    except Exception as e:
        log.warning("Ollama indexing failed for %s: %s", filename, e)

    print(f"Created note: {filename}")
    return filename
class YoutubeHandlers:
    """Mixin for /api/youtube.

    The host class is expected to provide ``headers``, ``rfile`` and
    ``send_json(payload, status)``, which this mixin uses but does not define.
    """

    def handle_youtube(self):
        """Process a YouTube URL: download subs, save note.

        Reads a JSON body of the form ``{"url": "..."}`` and replies through
        ``send_json``:

        * 400 when the body cannot be read/parsed or the URL is missing, is
          not a string, starts with ``-`` or mentions neither ``youtube.com``
          nor ``youtu.be``;
        * 500 with ``status: error`` when processing raises or produces no
          note (video info unavailable, no subtitles);
        * ``status: done`` once the note has been written.
        """
        try:
            try:
                content_length = int(self.headers['Content-Length'])
                post_data = self.rfile.read(content_length).decode('utf-8')
                data = json.loads(post_data)
            except (KeyError, TypeError, ValueError) as e:
                # Missing/invalid Content-Length, bad UTF-8 or bad JSON are
                # client errors, not server failures.
                self.send_json({'error': str(e)}, 400)
                return

            url = data.get('url') if isinstance(data, dict) else None
            url = url.strip() if isinstance(url, str) else ''
            if (
                not url
                # A leading '-' would look like a command-line option to yt-dlp.
                or url.startswith('-')
                or ('youtube.com' not in url and 'youtu.be' not in url)
            ):
                self.send_json({'error': 'URL YouTube invalid'}, 400)
                return

            try:
                print(f"Processing YouTube URL: {url}")
                filename = _process_youtube(url)
            except Exception as e:
                print(f"YouTube processing error: {e}")
                traceback.print_exc()
                self.send_json({'status': 'error', 'message': f'Eroare: {str(e)}'}, 500)
                return

            if not filename:
                # _process_youtube returns None when it could not fetch video
                # info or subtitles; do not claim a note was created.
                self.send_json({
                    'status': 'error',
                    'message': 'Eroare: nu s-au putut obține informațiile video sau subtitrările.',
                }, 500)
                return

            self.send_json({
                'status': 'done',
                'message': 'Notița a fost creată! Refresh pagina Notes.',
            })
        except Exception as e:
            self.send_json({'error': str(e)}, 500)