- tools/youtube_subs.py: get_subtitles() returneaza acum (title, desc, transcript). Functii noi is_description_about_video() si extract_relevant_description() detecteaza daca descrierea contine capitole/timestamps (nu doar promotie autori) si curata trailing-urile promotionale inainte sa includa descrierea in output. - dashboard/handlers/youtube.py: aceleasi functii adaugate; nota KB generata include acum un bloc "Descriere / Index" daca descrierea e relevanta pentru video. - memory/kb/youtube: nota Jeremy Grantham (AI bubble, investitii, toxicitate) cu descrierea ca index de capitole. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
216 lines
6.9 KiB
Python
Executable File
216 lines
6.9 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""
|
|
Download YouTube subtitles/transcript for summarization.
|
|
Usage: python3 youtube_subs.py <video_url> [language]
|
|
"""
|
|
|
|
import subprocess
|
|
import sys
|
|
import os
|
|
import json
|
|
import re
|
|
from pathlib import Path
|
|
|
|
def clean_vtt(content):
    """Convert subtitle markup (VTT/SRT-style) to one line of plain text.

    Drops the ``WEBVTT`` / ``Kind:`` / ``Language:`` header lines, cue timing
    lines (anything containing ``-->``), purely numeric cue identifiers and
    blank lines. Inline tags such as ``<c>``, ``<i>``, ``<v Name>`` or
    ``<00:00:01.000>`` are removed while the text around them is kept.

    Only the first occurrence of each caption line is kept, so text that is
    repeated from cue to cue shows up once. Note that this de-duplication is
    global: a sentence that is genuinely spoken twice is also emitted once.

    Args:
        content: Raw text of a subtitle file.

    Returns:
        The surviving caption lines joined by single spaces ("" if none).
    """
    lines = []
    seen = set()

    # A leading UTF-8 BOM would hide the WEBVTT header from startswith()
    # and leak it into the transcript.
    for line in content.lstrip('\ufeff').split('\n'):
        stripped = line.strip()
        if not stripped:
            continue
        # File header lines.
        if line.startswith(('WEBVTT', 'Kind:', 'Language:')):
            continue
        # Cue timing line, e.g. "00:00:01.000 --> 00:00:02.000".
        if '-->' in line:
            continue
        # Numeric cue identifiers (SRT sequence numbers).
        if re.match(r'^\d+$', stripped):
            continue

        # Strip tags BEFORE deciding whether the line is empty: a caption
        # like "<i>Hello</i>" starts with "<" but still carries text, while a
        # tag-only line collapses to "" and is skipped below.
        clean = re.sub(r'<[^>]+>', '', line).strip()
        if clean and clean not in seen:
            seen.add(clean)
            lines.append(clean)

    return ' '.join(lines)
|
|
|
|
|
|
def is_description_about_video(description):
    """Decide whether a video description is worth including in the output.

    A description qualifies when it is at least 50 characters long (after
    trimming) and shows one of these structural signals:

    * three or more clock-style timestamps (``1:23``, ``00:00``, ``0:00:00``);
    * three or more bullet lines (``◼ • - * ▶ ►`` followed by text);
    * three or more numbered lines (``1.`` or ``1)`` followed by text).

    Bullet and numbered lines are counted separately; they do not add up.

    Args:
        description: The raw description text (may be empty or None).

    Returns:
        True if one of the signals above is present, otherwise False.
    """
    if not description:
        return False

    trimmed = description.strip()
    if len(trimmed) < 50:
        return False

    # Timestamps are counted over the whole text, not per line.
    stamp_total = sum(
        1 for _ in re.finditer(r'\b\d{1,2}:\d{2}(:\d{2})?\b', description)
    )
    if stamp_total >= 3:
        return True

    rows = trimmed.split('\n')
    line_signals = (
        r'^\s*[◼•\-\*▶►]\s+\S',   # bullet / chapter-like lines
        r'^\s*\d+[\.\)]\s+\S',     # numbered list entries
    )
    for signal in line_signals:
        matching = sum(1 for row in rows if re.match(signal, row))
        if matching >= 3:
            return True

    return False
|
|
|
|
|
|
def extract_relevant_description(description):
    """Return the leading, content-related part of a video description.

    Lines are scanned top to bottom. A line counts as promotional when it
    contains a URL, a social/streaming platform name, or a call-to-action
    keyword (follow, subscribe, buy, shop, ...). A single promotional line
    surrounded by normal lines is kept, e.g. one link inside a chapter list.
    Two promotional lines in a row are taken as the start of the promo
    footer: scanning stops there and neither of the two lines is returned.
    Trailing blank lines are removed from the result.

    Args:
        description: The raw description text (may be empty or None).

    Returns:
        The kept lines joined with newlines, or "" for an empty description.
    """
    if not description:
        return ""

    promo_patterns = [
        re.compile(r'https?://\S+'),  # URLs
        re.compile(r'instagram|twitter|facebook|tiktok|linkedin|patreon|spotify', re.I),
        re.compile(r'follow|subscribe|newsletter|merch|sponsor|affiliate', re.I),
        re.compile(r'purchase|buy|order|shop|store', re.I),
    ]

    result_lines = []
    promo_streak = 0

    for line in description.strip().split('\n'):
        stripped = line.strip()

        if any(p.search(stripped) for p in promo_patterns):
            promo_streak += 1
            if promo_streak >= 2:
                # The previous line opened this promo run and was kept
                # provisionally as "isolated". Now that the run is confirmed,
                # retract it so the footer does not leak into the result.
                result_lines.pop()
                break
        else:
            # Any non-promo line (including a blank one) ends the run.
            promo_streak = 0

        result_lines.append(line)

    # Blank separator lines that preceded the cut-off are not content.
    while result_lines and not result_lines[-1].strip():
        result_lines.pop()

    return '\n'.join(result_lines)
|
|
|
|
|
|
def _download_subs(yt_dlp, url, lang, out_dir, auto=False):
    """Run yt-dlp once to write subtitles for ``lang`` into ``out_dir``."""
    cmd = [
        yt_dlp,
        '--js-runtimes', 'node',
        '--write-auto-subs' if auto else '--write-subs',
        '--sub-langs', lang,
        '--skip-download',
        '-o', str(out_dir / '%(id)s.%(ext)s'),
        url,
    ]
    # The exit status is deliberately ignored: success is judged purely by
    # which subtitle files show up in out_dir afterwards.
    subprocess.run(cmd, capture_output=True, timeout=60)


def _first_transcript(out_dir, name_glob):
    """Return cleaned text of the first non-empty subtitle file, else None."""
    for ext in ('vtt', 'srt', 'ass'):
        for sub_file in out_dir.glob(f'{name_glob}.{ext}'):
            content = sub_file.read_text(encoding='utf-8', errors='replace')
            text = clean_vtt(content)
            if text:
                return text
    return None


def get_subtitles(url, lang='en'):
    """Download subtitles for a YouTube video and return them as plain text.

    Video metadata is fetched first with ``yt-dlp --dump-json``; progress and
    failures of that step are reported on stderr and do not abort the
    function. Subtitles are then requested language by language, preferring
    ``lang`` and falling back to ``ro``, ``en``, ``en-US`` and ``en-GB``:
    uploaded (manual) subtitles for every language first, and only then
    auto-generated captions.

    Subtitle files are written to ``/tmp/yt_subs``; files left there by a
    previous call are removed at the start.

    Args:
        url: Video URL passed to yt-dlp.
        lang: Preferred subtitle language code.

    Returns:
        A ``(title, description, transcript)`` tuple. ``title`` falls back to
        "Unknown" and ``description`` to "" when metadata is unavailable;
        ``transcript`` is None when no non-empty subtitles were found.

    Raises:
        subprocess.TimeoutExpired: If a yt-dlp call exceeds its timeout
            (30 s for metadata, 60 s per subtitle attempt). Errors raised by
            ``subprocess.run`` when launching yt-dlp are not caught either.
    """
    yt_dlp = os.path.expanduser('~/.local/bin/yt-dlp')
    temp_dir = Path('/tmp/yt_subs')
    temp_dir.mkdir(exist_ok=True)

    # Clean old files. Directories are skipped: unlink() would raise on them
    # and the subtitle globs below only look at direct children anyway.
    for stale in temp_dir.glob('*'):
        if stale.is_file() or stale.is_symlink():
            stale.unlink()

    # First, get video info
    title = "Unknown"
    description = ""
    info_cmd = [yt_dlp, '--js-runtimes', 'node', '--dump-json', '--no-download', url]
    result = subprocess.run(info_cmd, capture_output=True, text=True, timeout=30)
    print(f"INFO: returncode={result.returncode}, stderr={result.stderr[:200]}", file=sys.stderr)
    if result.returncode == 0:
        try:
            info = json.loads(result.stdout)
            # Metadata fields can be present but null (e.g. no duration for
            # a live stream); normalise so callers always get strings.
            title = info.get('title') or 'Unknown'
            description = info.get('description') or ''
            duration = int(info.get('duration') or 0)
            minutes, seconds = divmod(duration, 60)
            print(f"Title: {title}", file=sys.stderr)
            print(f"Duration: {minutes}:{seconds:02d}", file=sys.stderr)
        except (ValueError, TypeError, AttributeError) as e:
            # json.JSONDecodeError is a ValueError; the other two cover a
            # payload that parses but is not the expected JSON object.
            print(f"JSON parse error: {e}", file=sys.stderr)
    else:
        print(f"yt-dlp failed: {result.stderr[:500]}", file=sys.stderr)

    # Order of preference, without repeats: with the default lang='en' the
    # raw list would try 'en' twice and pay for a second yt-dlp run.
    lang_preferences = list(dict.fromkeys([lang, 'ro', 'en', 'en-US', 'en-GB']))

    # Try manual subtitles first
    for try_lang in lang_preferences:
        _download_subs(yt_dlp, url, try_lang, temp_dir)
        # An empty manual track must not end the search; fall through so the
        # remaining languages and the auto-generated captions still get a try.
        text = _first_transcript(temp_dir, f'*.{try_lang}*')
        if text:
            return title, description, text

    # Try auto-generated subtitles
    for try_lang in lang_preferences:
        _download_subs(yt_dlp, url, try_lang, temp_dir, auto=True)
        # Any subtitle file in the directory is accepted at this stage.
        text = _first_transcript(temp_dir, '*')
        if text:
            return title, description, text

    return title or "Unknown", description, None
|
|
|
|
if __name__ == '__main__':
    # CLI entry point: <video_url> is required, [language] defaults to 'en'.
    cli_args = sys.argv[1:]
    if not cli_args:
        print("Usage: python3 youtube_subs.py <video_url> [language]")
        sys.exit(1)

    url = cli_args[0]
    lang = cli_args[1] if len(cli_args) > 1 else 'en'

    title, description, transcript = get_subtitles(url, lang)

    # Nothing to print: report on stderr and signal failure to the caller.
    if not transcript:
        print(f"No subtitles found for: {title}", file=sys.stderr)
        sys.exit(1)

    print(f"\n=== {title} ===\n")

    # The description is shown only when it describes the video itself
    # (chapters, topic lists) and something is left after trimming promo text.
    relevant_desc = ""
    if description and is_description_about_video(description):
        relevant_desc = extract_relevant_description(description)

    if relevant_desc:
        print("--- Descriere / Index ---")
        print(relevant_desc)
        print("--- Transcript ---")

    print(transcript)
|