"""YouTube subtitle-download + note-creation endpoint."""

import json
import logging
import os
import re
import subprocess
import sys
import tempfile
import traceback
from datetime import datetime
from pathlib import Path
from urllib.parse import urlparse

import constants

log = logging.getLogger(__name__)

# Domains accepted by the endpoint; subdomains (www., m., music.) also match.
_YOUTUBE_DOMAINS = ('youtube.com', 'youtu.be')
_URL_SCHEME_RE = re.compile(r'^[A-Za-z][A-Za-z0-9+.\-]*://')

# VTT parsing helpers.
_VTT_HEADER_PREFIXES = ('WEBVTT', 'Kind:', 'Language:')
_VTT_TAG_RE = re.compile(r'<[^>]+>')
_VTT_CUE_NUMBER_RE = re.compile(r'^\d+$')

# Description heuristics.
_TIMESTAMP_RE = re.compile(r'\b\d{1,2}:\d{2}(?::\d{2})?\b')
_BULLET_LINE_RE = re.compile(r'^\s*[◼•\-\*▶►]\s+\S')
_NUMBERED_LINE_RE = re.compile(r'^\s*\d+[\.\)]\s+\S')
_PROMO_PATTERNS = (
    re.compile(r'https?://\S+'),
    re.compile(r'instagram|twitter|facebook|tiktok|linkedin|patreon|spotify', re.I),
    re.compile(r'follow|subscribe|newsletter|merch|sponsor|affiliate', re.I),
    re.compile(r'purchase|buy|order|shop|store', re.I),
)


def _resolve_tool(name):
    """Return the path of CLI tool *name*.

    Prefers ``~/.local/bin/<name>`` when that file exists; otherwise returns
    the bare name so that ``subprocess`` resolves it through ``PATH``.
    """
    local_path = os.path.expanduser(f'~/.local/bin/{name}')
    return local_path if os.path.exists(local_path) else name


def _is_youtube_url(url):
    """Return True if *url* points at a YouTube host.

    The hostname is parsed instead of substring-matched, so strings such as
    ``https://example.com/?q=youtube.com`` are rejected. Strings starting with
    ``-`` are rejected as well, because the URL is later passed to yt-dlp as a
    command-line argument. A URL without a scheme (``youtube.com/watch?v=..``)
    is accepted.
    """
    if not url or url.startswith('-'):
        return False
    candidate = url if _URL_SCHEME_RE.match(url) else f'https://{url}'
    try:
        parts = urlparse(candidate)
        host = (parts.hostname or '').lower()
    except ValueError:
        # urlparse raises ValueError on malformed netlocs (e.g. bad IPv6).
        return False
    if parts.scheme not in ('http', 'https'):
        return False
    return any(
        host == domain or host.endswith('.' + domain)
        for domain in _YOUTUBE_DOMAINS
    )


def _clean_vtt(content):
    """Convert VTT captions to plain text.

    Header lines, cue timings, cue numbers, blank lines and lines that start
    with a tag are dropped; inline tags are stripped from the remaining lines.
    Each distinct text line is kept only once (first occurrence), and the
    result is joined with single spaces.
    """
    lines = []
    seen = set()
    for line in content.split('\n'):
        stripped = line.strip()
        if (
            not stripped
            or line.startswith(_VTT_HEADER_PREFIXES)
            or '-->' in line
            or stripped.startswith('<')
            or _VTT_CUE_NUMBER_RE.match(stripped)
        ):
            continue
        clean = _VTT_TAG_RE.sub('', line).strip()
        if clean and clean not in seen:
            seen.add(clean)
            lines.append(clean)
    return ' '.join(lines)


def _is_description_about_video(description):
    """Return True if description contains info about the video (chapters/topics).

    A description qualifies when it is at least 50 characters long (after
    stripping) and has at least three timestamps, three bullet lines, or
    three numbered lines.
    """
    if not description or len(description.strip()) < 50:
        return False
    if len(_TIMESTAMP_RE.findall(description)) >= 3:
        return True
    lines = description.strip().split('\n')
    for line_pattern in (_BULLET_LINE_RE, _NUMBERED_LINE_RE):
        if sum(1 for line in lines if line_pattern.match(line)) >= 3:
            return True
    return False


def _extract_relevant_description(description):
    """Strip promotional tails (links, social media) from description.

    Lines are kept until two consecutive lines match a promo pattern; the
    description is cut there and neither of those two lines is returned. A
    single promo-looking line surrounded by ordinary lines is kept, since a
    chapter title may contain one of the keywords. Trailing blank lines are
    removed.
    """
    if not description:
        return ""

    result_lines = []
    promo_streak = 0
    for line in description.strip().split('\n'):
        stripped = line.strip()
        if any(p.search(stripped) for p in _PROMO_PATTERNS):
            promo_streak += 1
            if promo_streak >= 2:
                # The previous line opened the promo tail; drop it too.
                result_lines.pop()
                break
        else:
            promo_streak = 0
        result_lines.append(line)

    while result_lines and not result_lines[-1].strip():
        result_lines.pop()
    return '\n'.join(result_lines)


ANALYSIS_PROMPT = """\
Ai primit transcriptul unui video YouTube și descrierea lui. Scrie o notiță KB în română, format Markdown.

Structura notei (în ordine):

1. ## TL;DR — un paragraf de 3-5 rânduri care surprinde esența
2. ## Puncte cheie — 6-10 puncte concise (pot fi bullets, dar scurte și dense)
3. ## Quote-uri memorabile — 4-6 citate directe din transcript, în limba originală, între ghilimele
4. ## Idei acționabile — 4-8 lucruri concrete pe care cititorul le poate face
5. Secțiuni tematice cu ## heading — câte teme apar natural, în proze curgătoare (NU bullets), fiecare cu conținut real din transcript: cifre, exemple, mecanisme, argumente

Nu scrie metadate (titlu, url, tags, dată) — vor fi adăugate separat.
Nu scrie fraze introductive despre tine sau despre video. Începe direct cu ## TL;DR.
Scrie în română. Citatele rămân în engleză dacă sursa e engleză.
"""


def _analyze_with_claude(title, description, transcript):
    """Call ``claude -p`` to generate a rich analysis of the video.

    The prompt contains the title, up to 3000 characters of the description
    and up to 40000 characters of the transcript.

    Returns the stripped stdout of the command, or None when the command
    cannot be started, times out (300 s), exits non-zero, or prints nothing.
    Returning None lets the caller fall back to saving the raw transcript.
    """
    claude_bin = _resolve_tool('claude')

    desc_section = ""
    if description:
        desc_section = f"DESCRIERE VIDEO:\n{description[:3000]}\n\n"

    prompt = (
        f"{ANALYSIS_PROMPT}\n\n"
        f"TITLU: {title}\n\n"
        f"{desc_section}"
        f"TRANSCRIPT (primele 40000 caractere):\n{transcript[:40000]}"
    )

    try:
        result = subprocess.run(
            [claude_bin, '-p', prompt],
            capture_output=True, text=True, timeout=300,
        )
    except (OSError, subprocess.TimeoutExpired) as e:
        # Missing binary, oversized argument, or timeout: not fatal.
        log.warning("Claude analysis failed: %s", e)
        return None

    if result.returncode == 0 and result.stdout.strip():
        return result.stdout.strip()
    log.warning("Claude analysis failed: %s", result.stderr[:300])
    return None


def _slugify(title, fallback):
    """Build a filename slug from *title* (lowercase, max 50 chars).

    Punctuation is removed and every whitespace run becomes a single dash.
    *fallback* is returned when nothing usable is left of the title.
    """
    cleaned = re.sub(r'[^\w\s-]', '', title.lower())[:50].strip()
    return re.sub(r'\s+', '-', cleaned) or fallback


def _download_transcript(yt_dlp, url):
    """Download English auto-subtitles for *url* and return them as text.

    Works in a private temporary directory that is removed afterwards, so
    concurrent requests cannot delete or read each other's files. Returns
    None when yt-dlp produced no ``.vtt`` file.
    """
    with tempfile.TemporaryDirectory(prefix='yt_subs_') as tmp:
        temp_dir = Path(tmp)
        subprocess.run([
            yt_dlp, '--write-auto-subs', '--sub-langs', 'en',
            '--skip-download', '--no-playlist', '--sub-format', 'vtt',
            '-o', str(temp_dir / '%(id)s'),
            '--', url,  # '--' stops option parsing: url is untrusted input
        ], capture_output=True, timeout=120)

        for sub_file in sorted(temp_dir.glob('*.vtt')):
            content = sub_file.read_text(encoding='utf-8', errors='replace')
            return _clean_vtt(content)
    return None


def _render_note(title, url, duration, date_str, tags, desc_block, body):
    """Assemble the Markdown note: metadata header, description block, body."""
    minutes, seconds = divmod(duration, 60)
    return f"""# {title}

**Video:** {url}
**Duration:** {minutes}:{seconds:02d}
**Saved:** {date_str}
**Tags:** {tags}

---
{desc_block}
{body}
"""


def _index_note(note_path, filename):
    """Refresh the notes index and the semantic index for a new note.

    Both steps are best-effort: the note is already on disk, so failures are
    logged and never propagated.
    """
    try:
        result = subprocess.run(
            [sys.executable, str(constants.TOOLS_DIR / 'update_notes_index.py')],
            capture_output=True,
        )
        if result.returncode != 0:
            log.warning(
                "update_notes_index.py failed (%d): %s",
                result.returncode,
                result.stderr[:300].decode('utf-8', errors='replace'),
            )
    except OSError as e:
        log.warning("update_notes_index.py could not be run: %s", e)

    # Index new note with Ollama semantic embeddings
    try:
        base_dir = str(constants.BASE_DIR)
        if not sys.path or sys.path[0] != base_dir:
            sys.path.insert(0, base_dir)
        from src.memory_search import index_file
        n = index_file(note_path)
        log.info("Ollama indexed %s (%d chunks)", filename, n)
    except Exception as e:
        log.warning("Ollama indexing failed for %s: %s", filename, e)


def _process_youtube(url):
    """Download subtitles for *url*, analyse them and save a note.

    Steps: fetch video metadata with yt-dlp, download English auto-subtitles,
    ask Claude for an analysis (falling back to the raw transcript, truncated
    to 15000 characters, when that fails), write the note into
    ``constants.NOTES_DIR`` and refresh the indexes.

    Returns the note's filename, or None when the metadata could not be
    fetched or no subtitles were found. Raises if yt-dlp cannot be started,
    times out, or prints output that is not a single JSON document.
    """
    yt_dlp = _resolve_tool('yt-dlp')

    result = subprocess.run(
        # '--' stops option parsing: url is untrusted input.
        [yt_dlp, '--dump-json', '--no-download', '--no-playlist', '--', url],
        capture_output=True, text=True, timeout=30,
    )
    if result.returncode != 0:
        print(f"Failed to get video info: {result.stderr}")
        return None

    info = json.loads(result.stdout)
    # yt-dlp may report these keys as null, and duration as a float.
    title = info.get('title') or 'Unknown'
    duration = int(info.get('duration') or 0)
    description = info.get('description') or ''

    transcript = _download_transcript(yt_dlp, url)
    if not transcript:
        print("No subtitles found")
        return None

    date_str = datetime.now().strftime('%Y-%m-%d')
    slug = _slugify(title, fallback=str(info.get('id') or 'video'))
    filename = f"{date_str}_{slug}.md"

    # Description block
    desc_block = ""
    if _is_description_about_video(description):
        relevant_desc = _extract_relevant_description(description)
        if relevant_desc:
            desc_block = f"\n## Descriere / Index\n\n{relevant_desc}\n\n---\n"

    # Claude analysis: TL;DR + key points + quotes + themed prose sections
    print("Running Claude analysis...")
    analysis = _analyze_with_claude(title, description, transcript)

    if analysis:
        note_content = _render_note(
            title, url, duration, date_str, '#youtube', desc_block, analysis,
        )
    else:
        # Fallback: save raw transcript if Claude fails
        note_content = _render_note(
            title, url, duration, date_str, '#youtube #to-summarize',
            desc_block, f"## Transcript\n\n{transcript[:15000]}",
        )

    constants.NOTES_DIR.mkdir(parents=True, exist_ok=True)
    note_path = constants.NOTES_DIR / filename
    note_path.write_text(note_content, encoding='utf-8')

    _index_note(note_path, filename)

    print(f"Created note: {filename}")
    return filename


def _url_from_payload(raw):
    """Extract the stripped ``url`` field from a JSON request body.

    Raises ValueError when the body is not UTF-8 JSON, is not a JSON object,
    or when ``url`` is not a string. A missing ``url`` yields ``''``.
    """
    data = json.loads(raw.decode('utf-8'))
    if not isinstance(data, dict):
        raise ValueError('JSON body must be an object')
    url = data.get('url', '')
    if not isinstance(url, str):
        raise ValueError("'url' must be a string")
    return url.strip()


class YoutubeHandlers:
    """Mixin for /api/youtube."""

    def handle_youtube(self):
        """Process a YouTube URL: download subs, save note.

        Reads a JSON body of the form ``{"url": ...}`` and answers through
        ``self.send_json``:

        * 400 when the request is malformed or the URL is not a YouTube URL;
        * 500 with ``status: error`` when processing raises or no note could
          be created;
        * ``status: done`` only when a note was actually written.
        """
        try:
            try:
                content_length = int(self.headers['Content-Length'])
                if content_length < 0:
                    raise ValueError('negative Content-Length')
                url = _url_from_payload(self.rfile.read(content_length))
            except (TypeError, ValueError) as e:
                # TypeError: header missing; ValueError: bad length/JSON/UTF-8.
                self.send_json({'error': f'Cerere invalidă: {e}'}, 400)
                return

            if not _is_youtube_url(url):
                self.send_json({'error': 'URL YouTube invalid'}, 400)
                return

            try:
                print(f"Processing YouTube URL: {url}")
                filename = _process_youtube(url)
            except Exception as e:
                print(f"YouTube processing error: {e}")
                traceback.print_exc()
                self.send_json({'status': 'error', 'message': f'Eroare: {str(e)}'}, 500)
                return

            if not filename:
                # _process_youtube signals "nothing created" by returning None.
                self.send_json({
                    'status': 'error',
                    'message': 'Eroare: nu s-a putut crea notița '
                               '(video inaccesibil sau fără subtitrări în engleză).',
                }, 500)
                return

            self.send_json({
                'status': 'done',
                'message': 'Notița a fost creată! Refresh pagina Notes.',
            })
        except Exception as e:
            self.send_json({'error': str(e)}, 500)