"""YouTube subtitle-download + note-creation endpoint.""" import json import logging import os import re import subprocess import sys import traceback from datetime import datetime from pathlib import Path import constants log = logging.getLogger(__name__) def _clean_vtt(content): """Convert VTT captions to plain text.""" lines = [] seen = set() for line in content.split('\n'): if any([ line.startswith('WEBVTT'), line.startswith('Kind:'), line.startswith('Language:'), '-->' in line, line.strip().startswith('<'), not line.strip(), re.match(r'^\d+$', line.strip()), ]): continue clean = re.sub(r'<[^>]+>', '', line).strip() if clean and clean not in seen: seen.add(clean) lines.append(clean) return ' '.join(lines) def _is_description_about_video(description): """Return True if description contains info about the video (chapters/topics).""" if not description or len(description.strip()) < 50: return False timestamp_pattern = re.compile(r'\b\d{1,2}:\d{2}(:\d{2})?\b') if len(timestamp_pattern.findall(description)) >= 3: return True lines = description.strip().split('\n') bullet_lines = [l for l in lines if re.match(r'^\s*[◼•\-\*▶►]\s+\S', l)] if len(bullet_lines) >= 3: return True numbered_lines = [l for l in lines if re.match(r'^\s*\d+[\.\)]\s+\S', l)] if len(numbered_lines) >= 3: return True return False def _extract_relevant_description(description): """Strip promotional tails (links, social media) from description.""" if not description: return "" promo_patterns = [ re.compile(r'https?://\S+'), re.compile(r'instagram|twitter|facebook|tiktok|linkedin|patreon|spotify', re.I), re.compile(r'follow|subscribe|newsletter|merch|sponsor|affiliate', re.I), re.compile(r'purchase|buy|order|shop|store', re.I), ] result_lines = [] promo_streak = 0 for line in description.strip().split('\n'): stripped = line.strip() is_promo = any(p.search(stripped) for p in promo_patterns) if is_promo: promo_streak += 1 if promo_streak >= 2: break else: promo_streak = 0 result_lines.append(line) while result_lines and not result_lines[-1].strip(): result_lines.pop() return '\n'.join(result_lines) def _process_youtube(url): """Download subtitles, save note.""" yt_dlp = os.path.expanduser('~/.local/bin/yt-dlp') result = subprocess.run( [yt_dlp, '--dump-json', '--no-download', url], capture_output=True, text=True, timeout=30, ) if result.returncode != 0: print(f"Failed to get video info: {result.stderr}") return info = json.loads(result.stdout) title = info.get('title', 'Unknown') duration = info.get('duration', 0) description = info.get('description', '') temp_dir = Path('/tmp/yt_subs') temp_dir.mkdir(exist_ok=True) for f in temp_dir.glob('*'): f.unlink() subprocess.run([ yt_dlp, '--write-auto-subs', '--sub-langs', 'en', '--skip-download', '--sub-format', 'vtt', '-o', str(temp_dir / '%(id)s'), url, ], capture_output=True, timeout=120) transcript = None for sub_file in temp_dir.glob('*.vtt'): content = sub_file.read_text(encoding='utf-8', errors='replace') transcript = _clean_vtt(content) break if not transcript: print("No subtitles found") return date_str = datetime.now().strftime('%Y-%m-%d') slug = re.sub(r'[^\w\s-]', '', title.lower())[:50].strip().replace(' ', '-') filename = f"{date_str}_{slug}.md" # Build optional description block desc_block = "" if _is_description_about_video(description): relevant_desc = _extract_relevant_description(description) if relevant_desc: desc_block = f"\n## Descriere / Index\n\n{relevant_desc}\n\n---\n" note_content = f"""# {title} **Video:** {url} **Duration:** {duration // 60}:{duration % 60:02d} **Saved:** {date_str} **Tags:** #youtube #to-summarize --- {desc_block} ## Transcript {transcript[:15000]} --- *Notă: Sumarizarea va fi adăugată de Echo.* """ constants.NOTES_DIR.mkdir(parents=True, exist_ok=True) note_path = constants.NOTES_DIR / filename note_path.write_text(note_content, encoding='utf-8') subprocess.run( [sys.executable, str(constants.TOOLS_DIR / 'update_notes_index.py')], capture_output=True, ) # Index new note with Ollama semantic embeddings try: sys.path.insert(0, str(constants.BASE_DIR)) from src.memory_search import index_file, MEMORY_DIR n = index_file(note_path) log.info("Ollama indexed %s (%d chunks)", filename, n) except Exception as e: log.warning("Ollama indexing failed for %s: %s", filename, e) print(f"Created note: {filename}") return filename class YoutubeHandlers: """Mixin for /api/youtube.""" def handle_youtube(self): """Process a YouTube URL: download subs, save note.""" try: content_length = int(self.headers['Content-Length']) post_data = self.rfile.read(content_length).decode('utf-8') data = json.loads(post_data) url = data.get('url', '').strip() if not url or ('youtube.com' not in url and 'youtu.be' not in url): self.send_json({'error': 'URL YouTube invalid'}, 400) return try: print(f"Processing YouTube URL: {url}") _process_youtube(url) self.send_json({ 'status': 'done', 'message': 'Notița a fost creată! Refresh pagina Notes.', }) except Exception as e: print(f"YouTube processing error: {e}") traceback.print_exc() self.send_json({'status': 'error', 'message': f'Eroare: {str(e)}'}, 500) except Exception as e: self.send_json({'error': str(e)}, 500)