- dashboard/handlers/youtube.py: după descărcare transcriere, cheamă `claude -p` cu un prompt structurat care generează TL;DR + puncte cheie + citate + idei acționabile + secțiuni tematice în proze. Fallback la transcriptul brut dacă Claude eșuează. - nota Grantham: format complet — TL;DR, puncte cheie, citate, idei acționabile, secțiuni tematice în proze curgătoare. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
258 lines
8.2 KiB
Python
258 lines
8.2 KiB
Python
"""YouTube subtitle-download + note-creation endpoint."""
|
|
import json
|
|
import logging
|
|
import os
|
|
import re
|
|
import subprocess
|
|
import sys
|
|
import traceback
|
|
from datetime import datetime
|
|
from pathlib import Path
|
|
|
|
import constants
|
|
|
|
# Module-level logger, named after this module via __name__.
log = logging.getLogger(__name__)
|
|
|
|
|
|
def _clean_vtt(content):
|
|
"""Convert VTT captions to plain text."""
|
|
lines = []
|
|
seen = set()
|
|
for line in content.split('\n'):
|
|
if any([
|
|
line.startswith('WEBVTT'),
|
|
line.startswith('Kind:'),
|
|
line.startswith('Language:'),
|
|
'-->' in line,
|
|
line.strip().startswith('<'),
|
|
not line.strip(),
|
|
re.match(r'^\d+$', line.strip()),
|
|
]):
|
|
continue
|
|
clean = re.sub(r'<[^>]+>', '', line).strip()
|
|
if clean and clean not in seen:
|
|
seen.add(clean)
|
|
lines.append(clean)
|
|
return ' '.join(lines)
|
|
|
|
|
|
def _is_description_about_video(description):
|
|
"""Return True if description contains info about the video (chapters/topics)."""
|
|
if not description or len(description.strip()) < 50:
|
|
return False
|
|
timestamp_pattern = re.compile(r'\b\d{1,2}:\d{2}(:\d{2})?\b')
|
|
if len(timestamp_pattern.findall(description)) >= 3:
|
|
return True
|
|
lines = description.strip().split('\n')
|
|
bullet_lines = [l for l in lines if re.match(r'^\s*[◼•\-\*▶►]\s+\S', l)]
|
|
if len(bullet_lines) >= 3:
|
|
return True
|
|
numbered_lines = [l for l in lines if re.match(r'^\s*\d+[\.\)]\s+\S', l)]
|
|
if len(numbered_lines) >= 3:
|
|
return True
|
|
return False
|
|
|
|
|
|
def _extract_relevant_description(description):
|
|
"""Strip promotional tails (links, social media) from description."""
|
|
if not description:
|
|
return ""
|
|
promo_patterns = [
|
|
re.compile(r'https?://\S+'),
|
|
re.compile(r'instagram|twitter|facebook|tiktok|linkedin|patreon|spotify', re.I),
|
|
re.compile(r'follow|subscribe|newsletter|merch|sponsor|affiliate', re.I),
|
|
re.compile(r'purchase|buy|order|shop|store', re.I),
|
|
]
|
|
result_lines = []
|
|
promo_streak = 0
|
|
for line in description.strip().split('\n'):
|
|
stripped = line.strip()
|
|
is_promo = any(p.search(stripped) for p in promo_patterns)
|
|
if is_promo:
|
|
promo_streak += 1
|
|
if promo_streak >= 2:
|
|
break
|
|
else:
|
|
promo_streak = 0
|
|
result_lines.append(line)
|
|
while result_lines and not result_lines[-1].strip():
|
|
result_lines.pop()
|
|
return '\n'.join(result_lines)
|
|
|
|
|
|
ANALYSIS_PROMPT = """\
|
|
Ai primit transcriptul unui video YouTube și descrierea lui. Scrie o notiță KB în română, format Markdown.
|
|
|
|
Structura notei (în ordine):
|
|
1. ## TL;DR — un paragraf de 3-5 rânduri care surprinde esența
|
|
2. ## Puncte cheie — 6-10 puncte concise (pot fi bullets, dar scurte și dense)
|
|
3. ## Quote-uri memorabile — 4-6 citate directe din transcript, în limba originală, între ghilimele
|
|
4. ## Idei acționabile — 4-8 lucruri concrete pe care cititorul le poate face
|
|
5. Secțiuni tematice cu ## heading — câte teme apar natural, în proze curgătoare (NU bullets), fiecare cu conținut real din transcript: cifre, exemple, mecanisme, argumente
|
|
|
|
Nu scrie metadate (titlu, url, tags, dată) — vor fi adăugate separat.
|
|
Nu scrie fraze introductive despre tine sau despre video. Începe direct cu ## TL;DR.
|
|
Scrie în română. Citatele rămân în engleză dacă sursa e engleză.
|
|
"""
|
|
|
|
|
|
def _analyze_with_claude(title, description, transcript):
    """Run ``claude -p`` to generate the analysis section of a video note.

    The prompt is ANALYSIS_PROMPT followed by the title, at most the first
    3000 characters of the description (when there is one) and at most the
    first 40000 characters of the transcript.

    Args:
        title: Video title.
        description: Video description; may be empty.
        transcript: Plain-text transcript.

    Returns:
        The stripped stdout of the CLI on success, or None when the CLI
        cannot be started, times out, exits non-zero or prints nothing.
        Returning None (rather than raising) lets the caller fall back to
        saving the raw transcript.
    """
    # Prefer the per-user install; otherwise rely on PATH lookup.
    claude_bin = os.path.expanduser('~/.local/bin/claude')
    if not os.path.exists(claude_bin):
        claude_bin = 'claude'

    desc_section = f"DESCRIERE VIDEO:\n{description[:3000]}\n\n" if description else ""

    prompt = (
        f"{ANALYSIS_PROMPT}\n\n"
        f"TITLU: {title}\n\n"
        f"{desc_section}"
        f"TRANSCRIPT (primele 40000 caractere):\n{transcript[:40000]}"
    )

    timeout_seconds = 300
    try:
        result = subprocess.run(
            [claude_bin, '-p', prompt],
            capture_output=True,
            text=True,
            # Decode explicitly so diacritics survive a non-UTF-8 locale.
            encoding='utf-8',
            errors='replace',
            timeout=timeout_seconds,
        )
    except subprocess.TimeoutExpired:
        log.warning("Claude analysis timed out after %d seconds", timeout_seconds)
        return None
    except OSError as exc:
        # Binary missing / not executable, or argument list too long.
        log.warning("Claude analysis could not be started: %s", exc)
        return None

    output = result.stdout.strip()
    if result.returncode == 0 and output:
        return output
    log.warning("Claude analysis failed: %s", result.stderr[:300])
    return None
|
|
|
|
|
|
def _process_youtube(url):
    """Download English auto-subtitles for a video and save a Markdown note.

    Steps:
      1. Query video metadata with ``yt-dlp --dump-json``.
      2. Download English auto-generated subtitles (VTT) into a private
         temporary directory and convert them to plain text.
      3. Ask Claude for an analysis; if that yields nothing, fall back to
         embedding the first 15000 characters of the raw transcript and
         tagging the note ``#to-summarize``.
      4. Write the note to ``constants.NOTES_DIR``, refresh the notes index
         and try to index the note for semantic search (best effort).

    Args:
        url: Video URL, already validated by the caller.

    Returns:
        The filename of the created note, or None when the video metadata
        could not be fetched or no subtitles were found.

    Raises:
        subprocess.TimeoutExpired: if a yt-dlp call exceeds its timeout.
        json.JSONDecodeError: if yt-dlp prints invalid metadata JSON.
    """
    import tempfile  # local import: only this function needs it

    # Prefer the per-user install; otherwise rely on PATH lookup (the same
    # fallback that is applied to the Claude binary).
    yt_dlp = os.path.expanduser('~/.local/bin/yt-dlp')
    if not os.path.exists(yt_dlp):
        yt_dlp = 'yt-dlp'

    # '--' ends option parsing, so a URL that starts with '-' can never be
    # interpreted by yt-dlp as a command-line option.
    result = subprocess.run(
        [yt_dlp, '--dump-json', '--no-download', '--', url],
        capture_output=True, text=True, timeout=30,
    )
    if result.returncode != 0:
        print(f"Failed to get video info: {result.stderr}")
        return None

    info = json.loads(result.stdout)
    # yt-dlp may report these keys with a null value (e.g. live streams),
    # and duration is not guaranteed to be an int.
    title = info.get('title') or 'Unknown'
    duration = int(info.get('duration') or 0)
    description = info.get('description') or ''

    # A fresh private directory per call: no stale files to clean up and no
    # interference between concurrent requests.
    transcript = None
    with tempfile.TemporaryDirectory(prefix='yt_subs_') as tmp:
        temp_dir = Path(tmp)
        subprocess.run([
            yt_dlp, '--write-auto-subs', '--sub-langs', 'en',
            '--skip-download', '--sub-format', 'vtt',
            '-o', str(temp_dir / '%(id)s'),
            '--', url,
        ], capture_output=True, timeout=120)

        sub_file = next(iter(sorted(temp_dir.glob('*.vtt'))), None)
        if sub_file is not None:
            content = sub_file.read_text(encoding='utf-8', errors='replace')
            transcript = _clean_vtt(content)

    if not transcript:
        print("No subtitles found")
        return None

    date_str = datetime.now().strftime('%Y-%m-%d')
    slug = re.sub(r'[^\w\s-]', '', title.lower())[:50].strip().replace(' ', '-')
    if not slug:
        # Title made only of punctuation/emoji: fall back to the video id.
        slug = re.sub(r'[^\w-]', '', str(info.get('id') or '')) or 'video'
    filename = f"{date_str}_{slug}.md"

    # Description block: only when the description looks like chapters/topics.
    desc_block = ""
    if _is_description_about_video(description):
        relevant_desc = _extract_relevant_description(description)
        if relevant_desc:
            desc_block = f"\n## Descriere / Index\n\n{relevant_desc}\n\n---\n"

    # Claude analysis: TL;DR + key points + quotes + thematic prose sections.
    print("Running Claude analysis...")
    analysis = _analyze_with_claude(title, description, transcript)

    if analysis:
        tags = "#youtube"
        body = analysis
    else:
        # Fallback: keep the raw transcript so the note can be summarized later.
        tags = "#youtube #to-summarize"
        body = f"## Transcript\n\n{transcript[:15000]}"

    note_content = (
        f"# {title}\n"
        "\n"
        f"**Video:** {url}\n"
        f"**Duration:** {duration // 60}:{duration % 60:02d}\n"
        f"**Saved:** {date_str}\n"
        f"**Tags:** {tags}\n"
        "\n"
        "---\n"
        f"{desc_block}\n"
        f"{body}\n"
    )

    constants.NOTES_DIR.mkdir(parents=True, exist_ok=True)
    note_path = constants.NOTES_DIR / filename
    note_path.write_text(note_content, encoding='utf-8')

    subprocess.run(
        [sys.executable, str(constants.TOOLS_DIR / 'update_notes_index.py')],
        capture_output=True,
    )

    # Index the new note with Ollama semantic embeddings (best effort: a
    # failure here must not undo the already-written note).
    try:
        base_dir = str(constants.BASE_DIR)
        if base_dir not in sys.path:
            # Avoid growing sys.path by one entry per processed video.
            sys.path.insert(0, base_dir)
        from src.memory_search import index_file
        n = index_file(note_path)
        log.info("Ollama indexed %s (%d chunks)", filename, n)
    except Exception as e:
        log.warning("Ollama indexing failed for %s: %s", filename, e)

    print(f"Created note: {filename}")
    return filename
|
|
|
|
|
|
class YoutubeHandlers:
    """Mixin for /api/youtube.

    Expects the host class to provide ``headers``, ``rfile`` and
    ``send_json(payload, status)``.
    """

    @staticmethod
    def _is_youtube_url(url):
        """Return True when *url* points at a YouTube host over http(s).

        The host is taken from the parsed URL instead of searching the whole
        string, so values such as ``https://evil.example/?q=youtube.com`` are
        rejected. Anything starting with ``-`` or containing whitespace is
        rejected too, because the URL is later handed to a command-line tool.
        Scheme-less input like ``youtube.com/watch?v=...`` is still accepted.
        """
        from urllib.parse import urlparse  # local import: only needed here

        if not url or url.startswith('-') or any(ch.isspace() for ch in url):
            return False
        try:
            parsed = urlparse(url)
            if not parsed.scheme and not parsed.netloc:
                parsed = urlparse(f'https://{url}')
            host = (parsed.hostname or '').lower()
        except ValueError:
            return False
        if parsed.scheme not in ('http', 'https'):
            return False
        return (
            host in ('youtube.com', 'youtu.be')
            or host.endswith(('.youtube.com', '.youtu.be'))
        )

    def handle_youtube(self):
        """Process a YouTube URL: download subs, save note.

        Reads a JSON body of the form ``{"url": "..."}`` and responds with:

        * 400 ``{'error': ...}`` for an unreadable body or a non-YouTube URL,
        * 200 ``{'status': 'done', ...}`` when a note was created,
        * 500 ``{'status': 'error', ...}`` when processing raised or produced
          no note (video info or subtitles unavailable).
        """
        try:
            try:
                content_length = int(self.headers.get('Content-Length') or 0)
                post_data = self.rfile.read(content_length).decode('utf-8')
                data = json.loads(post_data)
            except (TypeError, ValueError) as e:
                # Bad Content-Length, invalid UTF-8 or invalid JSON: the
                # client sent a malformed request.
                self.send_json({'error': str(e)}, 400)
                return

            url = data.get('url', '') if isinstance(data, dict) else ''
            url = url.strip() if isinstance(url, str) else ''

            if not self._is_youtube_url(url):
                self.send_json({'error': 'URL YouTube invalid'}, 400)
                return

            try:
                print(f"Processing YouTube URL: {url}")
                filename = _process_youtube(url)
                if filename:
                    self.send_json({
                        'status': 'done',
                        'message': 'Notița a fost creată! Refresh pagina Notes.',
                    })
                else:
                    # The processor returns nothing when metadata or
                    # subtitles are unavailable; do not claim success.
                    self.send_json({
                        'status': 'error',
                        'message': 'Eroare: notița nu a putut fi creată '
                                   '(informații video sau subtitrări indisponibile).',
                    }, 500)
            except Exception as e:
                print(f"YouTube processing error: {e}")
                traceback.print_exc()
                self.send_json({'status': 'error', 'message': f'Eroare: {str(e)}'}, 500)
        except Exception as e:
            # Last resort so the request handler itself never raises.
            self.send_json({'error': str(e)}, 500)
|