feat: youtube_subs + dashboard includ descrierea video ca index

- tools/youtube_subs.py: get_subtitles() returneaza acum (title, desc, transcript).
  Functii noi is_description_about_video() si extract_relevant_description()
  detecteaza daca descrierea contine capitole/timestamps (nu doar promotie autori)
  si curata trailing-urile promotionale inainte sa includa descrierea in output.
- dashboard/handlers/youtube.py: aceleasi functii adaugate; nota KB generata
  include acum un bloc "Descriere / Index" daca descrierea e relevanta pentru video.
- memory/kb/youtube: nota Jeremy Grantham (AI bubble, investitii, toxicitate)
  cu descrierea ca index de capitole.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-06-27 17:00:59 +00:00
parent a8d024944d
commit 6e9dfd137c
4 changed files with 444 additions and 27 deletions

View File

@@ -36,6 +36,50 @@ def _clean_vtt(content):
return ' '.join(lines)
def _is_description_about_video(description):
"""Return True if description contains info about the video (chapters/topics)."""
if not description or len(description.strip()) < 50:
return False
timestamp_pattern = re.compile(r'\b\d{1,2}:\d{2}(:\d{2})?\b')
if len(timestamp_pattern.findall(description)) >= 3:
return True
lines = description.strip().split('\n')
bullet_lines = [l for l in lines if re.match(r'^\s*[◼•\-\*▶►]\s+\S', l)]
if len(bullet_lines) >= 3:
return True
numbered_lines = [l for l in lines if re.match(r'^\s*\d+[\.\)]\s+\S', l)]
if len(numbered_lines) >= 3:
return True
return False
def _extract_relevant_description(description):
"""Strip promotional tails (links, social media) from description."""
if not description:
return ""
promo_patterns = [
re.compile(r'https?://\S+'),
re.compile(r'instagram|twitter|facebook|tiktok|linkedin|patreon|spotify', re.I),
re.compile(r'follow|subscribe|newsletter|merch|sponsor|affiliate', re.I),
re.compile(r'purchase|buy|order|shop|store', re.I),
]
result_lines = []
promo_streak = 0
for line in description.strip().split('\n'):
stripped = line.strip()
is_promo = any(p.search(stripped) for p in promo_patterns)
if is_promo:
promo_streak += 1
if promo_streak >= 2:
break
else:
promo_streak = 0
result_lines.append(line)
while result_lines and not result_lines[-1].strip():
result_lines.pop()
return '\n'.join(result_lines)
def _process_youtube(url):
"""Download subtitles, save note."""
yt_dlp = os.path.expanduser('~/.local/bin/yt-dlp')
@@ -51,6 +95,7 @@ def _process_youtube(url):
info = json.loads(result.stdout)
title = info.get('title', 'Unknown')
duration = info.get('duration', 0)
description = info.get('description', '')
temp_dir = Path('/tmp/yt_subs')
temp_dir.mkdir(exist_ok=True)
@@ -78,6 +123,13 @@ def _process_youtube(url):
slug = re.sub(r'[^\w\s-]', '', title.lower())[:50].strip().replace(' ', '-')
filename = f"{date_str}_{slug}.md"
# Build optional description block
desc_block = ""
if _is_description_about_video(description):
relevant_desc = _extract_relevant_description(description)
if relevant_desc:
desc_block = f"\n## Descriere / Index\n\n{relevant_desc}\n\n---\n"
note_content = f"""# {title}
**Video:** {url}
@@ -86,7 +138,7 @@ def _process_youtube(url):
**Tags:** #youtube #to-summarize
---
{desc_block}
## Transcript
{transcript[:15000]}