diff --git a/cron/jobs.json b/cron/jobs.json
index cace59f..ff5a557 100644
--- a/cron/jobs.json
+++ b/cron/jobs.json
@@ -23,9 +23,9 @@
     "report_on": "changes",
     "timeout": 120,
     "enabled": true,
-    "last_run": "2026-05-29T10:00:00.003600+00:00",
+    "last_run": "2026-06-02T10:00:00.003082+00:00",
     "last_status": "ok",
-    "next_run": "2026-05-29T16:00:00+00:00"
+    "next_run": "2026-06-02T16:00:00+00:00"
   },
   {
     "name": "security-audit-daily",
@@ -39,9 +39,9 @@
     "report_on": "changes",
     "timeout": 180,
     "enabled": true,
-    "last_run": "2026-05-29T03:00:00.002043+00:00",
+    "last_run": "2026-06-02T03:00:00.003432+00:00",
     "last_status": "error",
-    "next_run": "2026-05-30T03:00:00+00:00"
+    "next_run": "2026-06-03T03:00:00+00:00"
   },
   {
     "name": "kb-index-refresh",
@@ -55,9 +55,9 @@
     "report_on": "never",
     "timeout": 120,
     "enabled": true,
-    "last_run": "2026-05-29T03:30:00.002139+00:00",
+    "last_run": "2026-06-02T03:30:00.001994+00:00",
     "last_status": "ok",
-    "next_run": "2026-05-30T03:30:00+00:00"
+    "next_run": "2026-06-03T03:30:00+00:00"
   },
   {
     "name": "archive-tasks-daily",
@@ -71,9 +71,9 @@
     "report_on": "changes",
     "timeout": 60,
     "enabled": true,
-    "last_run": "2026-05-29T03:00:00.001515+00:00",
+    "last_run": "2026-06-02T03:00:00.002967+00:00",
     "last_status": "ok",
-    "next_run": "2026-05-30T03:00:00+00:00"
+    "next_run": "2026-06-03T03:00:00+00:00"
   },
   {
     "name": "backup-config",
@@ -87,9 +87,9 @@
     "report_on": "never",
     "timeout": 120,
     "enabled": true,
-    "last_run": "2026-05-29T02:00:00.002948+00:00",
+    "last_run": "2026-06-02T02:00:00.001327+00:00",
     "last_status": "ok",
-    "next_run": "2026-05-30T02:00:00+00:00"
+    "next_run": "2026-06-03T02:00:00+00:00"
   },
   {
     "name": "insights-extract",
@@ -255,9 +255,9 @@
     "prompt": "Heartbeat check. Rulează src/heartbeat.py printr-un scurt raport de status.\nDacă nu e nimic de raportat (email=0, calendar nu are evenimente <2h, kb ok), răspunde doar cu HEARTBEAT_OK și oprește-te — nu trimite mesaj.\nDacă e ceva: raport scurt pe Discord #echo-work.",
     "allowed_tools": [],
     "enabled": true,
-    "last_run": "2026-05-29T12:00:00.002840+00:00",
+    "last_run": "2026-06-02T12:00:00.001410+00:00",
     "last_status": "ok",
-    "next_run": "2026-05-29T14:00:00+00:00"
+    "next_run": "2026-06-02T14:00:00+00:00"
   },
   {
     "name": "night-execute",
@@ -271,8 +271,8 @@
       "Read",
       "Write"
     ],
-    "last_run": "2026-05-28T23:00:00.002106+00:00",
+    "last_run": "2026-06-01T23:00:00.001878+00:00",
     "last_status": "ok",
-    "next_run": "2026-05-29T23:00:00+00:00"
+    "next_run": "2026-06-02T23:00:00+00:00"
   }
 ]
diff --git a/memory/kb/index.json b/memory/kb/index.json
index 7607fcb..72f7372 100644
--- a/memory/kb/index.json
+++ b/memory/kb/index.json
@@ -1,30 +1,124 @@
 {
   "notes": [
     {
-      "file": "notes-data/emails/2026-05-29_fwd-invitatie-festivalul-luminii-brasov-2026.md",
-      "title": "***SPAM***   Fwd: Invitație – Festivalul Luminii Brașov 2026",
-      "date": "2026-05-29",
+      "file": "notes-data/youtube/2026-06-01_agentic-engineering-workflow.md",
+      "title": "My Agentic Engineering Workflow (step by step workflow)",
+      "date": "2026-06-01",
       "tags": [],
-      "domains": [],
+      "domains": [
+        "work",
+        "growth"
+      ],
       "types": [],
-      "category": "emails",
+      "category": "youtube",
       "project": null,
       "subdir": null,
       "video": "",
-      "tldr": "<!-- Echo: completează cu rezumat -->"
+      "tldr": "Workflow complet de inginerie agentică: GPT-4.5 extra high fast în Cursor + Greptile pentru code review automat + GP Loop (skill Greptile care iterează autonom până la 5/5) + Whisper Flow pentru dicta..."
     },
     {
-      "file": "notes-data/emails/2026-05-28_fwd-newsletter-20-din-2026.md",
-      "title": "Newsletter 20 din 2026",
-      "date": "2026-05-28",
+      "file": "notes-data/youtube/2026-05-31_i-ran-a-1b-ai-agent-on-a-0-budget-100-tok-s-on-8gb.md",
+      "title": "I Ran a 1B AI Agent on a $0 Budget — 100+ tok/s on 8GB GPU",
+      "date": "2026-05-31",
       "tags": [],
-      "domains": [],
+      "domains": [
+        "work",
+        "growth"
+      ],
       "types": [],
-      "category": "emails",
+      "category": "youtube",
       "project": null,
       "subdir": null,
       "video": "",
-      "tldr": "<!-- Echo: completează cu rezumat -->"
+      "tldr": "MiniCPM 5 1B (2.17 GB, necesita 7-8 GB VRAM) rulează la 100+ tok/s pe un GPU de 8 GB. Videoul demonstrează 3 metode: Ollama (simplu, rapid), vLLM (throughput mai mare, necesar pentru apps publice, nec..."
+    },
+    {
+      "file": "notes-data/youtube/2026-05-31_agentic-engineering-100x-faster.md",
+      "title": "Why This Dev Ships 100x Faster Than 99% of Engineers",
+      "date": "2026-05-31",
+      "tags": [],
+      "domains": [
+        "work",
+        "growth"
+      ],
+      "types": [],
+      "category": "youtube",
+      "project": null,
+      "subdir": null,
+      "video": "",
+      "tldr": "Mickey, un senior developer, explică cum livrează de 100x mai rapid folosind **agentic engineering** — nu vibe coding. Diferența cheie: tu faci gândirea strategică, AI face execuția. Stack-ul lui: Cur..."
+    },
+    {
+      "file": "notes-data/youtube/2026-05-31_hormozi-robbins-game-of-life.md",
+      "title": "2026-05-31_hormozi-robbins-game-of-life",
+      "date": "2026-05-31",
+      "tags": [],
+      "domains": [
+        "growth"
+      ],
+      "types": [
+        "coaching"
+      ],
+      "category": "youtube",
+      "project": null,
+      "subdir": null,
+      "video": "",
+      "tldr": "Tony Robbins și Alex Hormozi poartă o conversație profundă despre ce înseamnă cu adevărat succesul și împlinirea. Robbins diagnostichează în timp real „blocajul\" lui Hormozi: știința realizărilor îl s..."
+    },
+    {
+      "file": "notes-data/youtube/2026-05-30_ex-google-recruiter-explains-why-lying-gets-you-hi.md",
+      "title": "Ex-Google Recruiter Explains Why \"Lying\" Gets You Hired",
+      "date": "2026-05-30",
+      "tags": [],
+      "domains": [],
+      "types": [
+        "coaching"
+      ],
+      "category": "youtube",
+      "project": null,
+      "subdir": null,
+      "video": "",
+      "tldr": "<!-- Completează un rezumat de 2-3 rânduri -->"
+    },
+    {
+      "file": "notes-data/youtube/2026-05-30_local-coding-agent-budget-gpu-llamacpp.md",
+      "title": "Build Powerful Local Coding Agent on Budget GPU with Llama.cpp and Pi",
+      "date": "2026-05-30",
+      "tags": [
+        "local-ai",
+        "llama-cpp",
+        "coding-agent",
+        "moe",
+        "hardware"
+      ],
+      "domains": [
+        "work",
+        "growth"
+      ],
+      "types": [],
+      "category": "youtube",
+      "project": null,
+      "subdir": null,
+      "video": "",
+      "tldr": "Cum rulezi un coding agent local la nivel \"mid-frontier\" (comparabil cu Claude Code) pe un GPU de buget (RTX 3060, 12GB VRAM) fără rate limit și fără abonament cloud. Ingredientele: modele MoE REAP cu..."
+    },
+    {
+      "file": "notes-data/youtube/2026-05-30_rebuilt-hermes-claude-code.md",
+      "title": "I Rebuilt Hermes in Claude Code (It's Ridiculously Good)",
+      "date": "2026-05-30",
+      "tags": [],
+      "domains": [
+        "work",
+        "growth"
+      ],
+      "types": [
+        "project"
+      ],
+      "category": "youtube",
+      "project": null,
+      "subdir": null,
+      "video": "",
+      "tldr": "Hermes e un sistem agentic cu 40k stele GitHub în 46 de zile — rapid de adoptat, dar vine cu costuri ascunse. Autorul a ales să **reconstruiască doar piesele relevante din Hermes** în propriul setup C..."
     },
     {
       "file": "notes-data/facebook/2026-05-26_500k-views-4-8k-reactions.md",
@@ -9664,11 +9758,11 @@
     }
   ],
   "stats": {
-    "total": 558,
+    "total": 563,
     "by_domain": {
-      "work": 182,
+      "work": 187,
       "health": 100,
-      "growth": 249,
+      "growth": 255,
       "sprijin": 39,
       "scout": 8
     },
@@ -9676,7 +9770,7 @@
       "articole": 1,
       "coaching": 51,
       "conversations": 0,
-      "emails": 24,
+      "emails": 22,
       "exercitii": 4,
       "facebook": 8,
       "health": 6,
@@ -9685,7 +9779,7 @@
       "reflectii": 3,
       "retete": 1,
       "tools": 7,
-      "youtube": 128,
+      "youtube": 135,
       "memory": 44
     }
   },
diff --git a/memory/kb/youtube/2026-05-30_ex-google-recruiter-explains-why-lying-gets-you-hi.md b/memory/kb/youtube/2026-05-30_ex-google-recruiter-explains-why-lying-gets-you-hi.md
new file mode 100644
index 0000000..123bbd1
--- /dev/null
+++ b/memory/kb/youtube/2026-05-30_ex-google-recruiter-explains-why-lying-gets-you-hi.md
@@ -0,0 +1,19 @@
+# Ex-Google Recruiter Explains Why "Lying" Gets You Hired
+
+**Sursa:** https://youtu.be/T__1QViXUxk?si=HCAmeFHi8bRw96Ax
+**Data:** 2026-05-30
+**Creator:** Farah Sharghi
+**Format:** Video (~11:54 min)
+**Tags:** @coaching
+
+---
+
+## TL;DR
+
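+Farah Sharghi, fost recruiter la Google, TikTok, Uber, Lyft și New York Times, trece prin nouă întrebări de interviu la care răspunsul „sincer” te poate costa oferta și arată ce răspuns se așteaptă de fapt de la tine. Ideea centrală: nu minți, ci alegi versiunea adevărului care servește conversația — fără nimic negativ despre angajatorul sau managerul actual, cu accent pe ce aduci tu echipei și cu întrebări pregătite pentru final.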
+
+---
+
+## Transcrierea
+
+ These are lies you can say in a job interview. At least that's what it feels like when you're the one saying them. Because what I'm going to share are the moments where your honest answer could hurt you, even though it shouldn't. And the version you give instead, it's not fake. It's what everyone in hiring expects you to say. But if no one's ever told you that, it can feel like you're being dishonest. I've spent over a decade on the other side of the hiring table at Google, TikTok, Uber, Lyft, and the New York Times. And I'm going to walk you through all nine of the answers that we expect you to give and why. So let's get into it. Number one, how are you? This feels like small talk, but it's not. It's an audition that starts before you think it does. I've watched candidates lose momentum in the first 10 seconds of an interview. They'll say something like, I'm just getting over a cold or honestly, it's been a rough week or traffic was a nightmare getting here. And what they don't realize is that I'm already taking a mental note, not because I'm judging them as a person, but because I'm asking myself, is this how they're going to show up on a hard day at work? You are always great, always. So say, I'm great. Thank you. How are you? That's it. You're not being fake. You're showing me that you can regulate your energy when it matters. That's a skill. And the interview starts the second you walk into the room, not when the first real question gets asked. Number two, why do you want to work here? Here's what most people say. I went to your website and I really love your mission and values. That's what everybody says. And I'll tell you what I'm thinking when I hear that. Nothing really. It tells me nothing about you. It tells me that you spent five minutes on our about page. So what you want to do is flip the question entirely. Stop making it about what you want from them and make it about what you're going to do for them. 
Something like, I've outgrown my current role and I'm looking for a new challenge. And based on what I've seen about this team's goals, I know it can help you solve and then insert the problem. You're not there to take. You are there to contribute. And here's the mindset shift. Stop treating interviews like auditions where you're hoping to be chosen. Treat them like a meeting between two parties who both have something to offer. That energy shift hiring managers can feel it immediately. Number three, where do you see yourself in five years? Let me tell you what we're actually asking because it's not about your five year plan. We don't actually care about your five year plan. What we want to know are two things. One, are you going to leave in six months? And two, are your goals aligned with this role? Or are you going to get bored and become a problem? Which is why you never say I'm going to grad school. Never say starting my own business. Never say hopefully in a leadership role somewhere because somewhere tells me probably not here. Even if those things are true and they might be, that's not what we want to hear in this moment. What we need to hear is, I see myself here. I see myself becoming an expert in this area and I'm a valuable part of this team. This role aligns with where I want to go because, and then you connect it to something specific about the job. You're not lying about who you are. You're telling the version of the truth that serves this conversation. There's a difference. Number four, why are you looking for a new opportunity? This is for when you're employed. We don't want to hear that you hate your boss or that your company is toxic, that you're being micromanaged or underpaid or undervalued. Even if every word of it's true, especially if it's true, that's not what you say. And here's why. When you trash a current employer, I'm not thinking, oh, that sounds terrible. Instead, I'm thinking, what is she going to say about us in two years? 
You become a risk. And in a stack of qualified candidates, risks get cut. So keep it clean. Say, I've learned a lot in my current role and now I'm ready for a new challenge. Done. You said nothing negative. You've shown growth and you've moved the conversation forward. One sentence, move on. Number five, how do you feel about your current manager? Now, this is different from the last one. And here's why, because it needs to be its own point, because sometimes the person interviewing you is about to become your next boss. And that changes everything. I've watched this happen in real time. A candidate was interviewing for a role on a team that I was recruiting for. And she mentioned that her current manager was a, quote, micromanager and too hands-on, and that she was looking for more autonomy. The hiring manager nodded politely, said nothing, but that hiring manager was known internally for being heavily involved in her team's work. Some people loved it, some people didn't. But the candidate had just told her without realizing it that her management style was a problem. She didn't get the offer. And she never really knew why. Say nothing negative about your current boss, nothing about your coworkers, nothing about leadership, ever. Because you don't know who you're talking to, and you don't know who they know. This isn't about being fake. It's about not handing someone a reason to say no. So instead of saying, he's a micromanager, you could say, my manager has been great at providing structure for our team's goals. I've learned a lot about process from him. And as I've grown, I've become more proactive in anticipating next steps. And I'm excited about the possibility of bringing that proactive energy to a new team. Number six, your hobbies. Now, this one's going to surprise you because most people just treat this as a throwaway, but it's not. I once had a candidate mention that she was restoring a vintage motorcycle in her garage. 
She wasn't interviewing for anything mechanical. It was a marketing role. But that detail made her memorable. It signaled curiosity, patience, willingness to figure things out. And the hiring manager brought it up three times in the debrief. On the other end, I've had candidates tell me their main hobby is watching TV or hanging out with friends. And there's nothing wrong with that. We all do it, but it doesn't give me anything to work with. So instead, you become forgettable. You're not listing activities. Instead, you're painting a picture of who you are when you're not at work. And the picture should make someone think, huh, that's interesting. I'd want to get coffee with that person. Reading, hiking, learning a new language, building something, mentoring, playing in a sports league. Those are examples of things that you can say. And these tell me that you're curious, engaged, and you have a life outside of work. Pick the version of yourself that's true and memorable. So instead of saying, I like Netflix and going out, you could say, outside of work, I'm an avid home baker. I actually run a small Instagram page where I document my attempts at mastering sourdough. It's taught me a lot about patience, process, and troubleshooting when things don't rise as expected. Number seven, your job title and description. This is the one that trips up high performers the most. And I need you to hear me on this. If you've been doing the work above your pay grade, you need to claim it. If you've been leading projects without the official title, you can say that you led them. If you've been doing the job of a senior person while being paid as a junior one, own the job. I've reviewed hundreds of thousands of resumes. And one of the most common mistakes I see is people underselling what they've actually done because they're waiting for permission. They're waiting for the promotion, the title change, the official recognition. 
And in the meantime, they're letting other candidates take credit for the level of work they're already doing. The title on your badge is what HR decided. The work you've done is what actually matters. And if you don't articulate that clearly, someone with less experience, but more confidence, is going to walk in and take the job that you were qualified for. That's not lying. That's accurately representing the value that you created. So for example, if your title is marketing coordinator, but you led the campaign, you could say, while my title was marketing coordinator, I was responsible for leading the Q3 email campaign from concept to execution. This involves setting the strategy, coordinating the design and copy, analyzing the A-B test results and presenting the 15% lift in engagement to leadership. Number eight, resume gaps. This one stresses people out way more than it should. And here's what I want you to understand. That gap on your resume already got you the interview. We saw it. We still called you, which means we've already decided that it's not a deal breaker. You just need to not make it weird. If you went back to school, got a certification or did volunteer work, talk about what you learned and how it applies to the job you're interviewing for. If you were laid off, say that there were layoffs. I've been laid off. It happens. We understand. If it was something personal, like health, family, caregiving, you can simply say, I took time off for personal reasons and now I'm ready to return to work. In the U.S., employers legally can't ask you about your family status and won't push. Here's what actually matters. The energy that you bring when you talk about it. If you're defensive or apologetic, I'm going to sense that something's wrong. If you're a matter of fact and confident, I'm going to move on. The gap isn't the problem, how you handle it is. For example, let's say you had a nine month gap for caregiving. 
You could say in your interview, I took a planned period away from my career to focus on a family commitment. During that time, I kept my skills sharp by completing a Google Analytics certification and doing some freelance content work. I'm now fully ready and eager to return to a full-time role where I can contribute deeply and say this with a calm, confident tone. Number nine, do you have any questions for us? Do not say, no, I think you covered everything. That's the fastest way to end an interview on a flat note. And here's what it actually communicates to me. You're not that interested. You haven't thought beyond today and you're just trying to get through this. The questions you ask, tell me how you think. They tell me whether you're already picturing yourself in this role or just hoping to survive the conversation. Have at least two questions ready. Why is this position open is a good one. And it tells you if someone got promoted, quit or was fired. And that's useful information for you. What does success look like in the first 90 days? This question shows that you're already thinking about how to deliver. Don't ask about vacation days or remote work flexibility in the first interview. That's a negotiation conversation, not an interview conversation. The best candidates ask questions that make the interviewer think, that's what you're aiming for. For example, don't say, no, instead ask, you mentioned the team is growing. Could you describe the dynamic between this role and the team it collaborates with the most closely? Or you could ask, based on our conversation today, what would you say is the most immediate challenge the person in this role would need to tackle in their first month? Interviews aren't about being the most honest person in the room. They're about being the most strategic. Same person, same skills, different answers. Now, if you've been laid off, check out this video where I lay out a 10 day plan to get you back on your feet.
diff --git a/memory/kb/youtube/2026-05-30_local-coding-agent-budget-gpu-llamacpp.md b/memory/kb/youtube/2026-05-30_local-coding-agent-budget-gpu-llamacpp.md
new file mode 100644
index 0000000..b28c427
--- /dev/null
+++ b/memory/kb/youtube/2026-05-30_local-coding-agent-budget-gpu-llamacpp.md
@@ -0,0 +1,83 @@
+# Build Powerful Local Coding Agent on Budget GPU with Llama.cpp and Pi
+
+**URL:** https://youtu.be/0AqpaFm11oI?si=LGIuBQD1ptTv7vGn
+**Data:** 2026-05-30
+**Durata:** 16:56
+**Tags:** @work @growth #local-ai #llama-cpp #coding-agent #moe #hardware
+
+---
+
+## TL;DR
+
+Cum rulezi un coding agent local la nivel "mid-frontier" (comparabil cu Claude Code) pe un GPU de buget (RTX 3060, 12GB VRAM) fără rate limit și fără abonament cloud. Ingredientele: modele MoE REAP cuantizate Q4, tuning agresiv llama.cpp (threads + ubatch + KV compression), agentul Pyi, și Tailscale pentru acces remote.
+
+---
+
+## Puncte cheie
+
+- **MoE > Dense la cost echivalent** — un model MoE de 30B rulează la viteza unui model dense de 3B. Toate modelele frontier (GPT, Claude) sunt MoE la trilioane de parametri. Sweet spot pentru muncă reală: 20-40B parametri.
+
+- **REAP pruning** — paper Cerebras: se pot elimina 20% din experții MoE neutilizați. Modelele pruned sunt mai mici + uneori *mai bune* pe benchmark-uri (HumanEval: 95.1 vs 94.5 nepruned). Unsloth oferă variante REAP pentru Qwen 3.6B MoE și GLM 4.7 Flash 23B.
+
+- **Ierarhia de performanță:** VRAM > RAM speed / PCI bandwidth > CPU cores. DDR4 = bottleneck ~54 GB/s → ~50 tokens/s maxim la decode dacă modelul e în RAM.
+
+- **Ubatch = cheia pentru prompt processing rapid** (critic pentru agenți):
+  - Ubatch 256 → 300 tokens/s prefill (Qwen)
+  - Ubatch 2048 → 1,142 tokens/s prefill — aproape 4x mai rapid
+  - TG (decode) rămâne neschimbat — ubatch afectează DOAR prefill-ul
+  - Trade-off: ubatch mare consumă VRAM
+
+- **Threads optim = CPU cores - 1**, nu maxim. La 4 core CPU: thread 3 = 39.5 tok/s, thread 4 = colapsat la 22 tok/s. Un core trebuie lăsat pentru scheduling + GPU management.
+
+- **KV Compression (TurboQuant):**
+  - Keys (K) → Turbo4 (near lossless)
+  - Values (V) → Turbo2 (forma vectorului, nu precizia exactă)
+  - GLM: +12% decode, -25% prefill — trade-off clasic
+  - Qwen: +4% prefill, +5% decode — win pur
+  - Cu cât modelul e mai mare față de VRAM, cu atât compression câștigă mai mult (VRAM eliberat → layere extra pe GPU)
+
+- **Cache reuse llama.cpp:** împarte prompt cache în chunk-uri de 256 tokens. La modificare parțială a promptului, reprocesează doar chunk-urile modificate → TTFT mai rapid pentru agenți.
+
+- **Model presets (models.ini):** llama.server poate gestiona mai multe modele configurate. Switch din Pyi (`/models`) → serverul unload + load automat. Nu mai trebuie restart manual.
+
+- **Tailscale pentru remote:** instalezi pe AI rig + laptop → accesezi llama.server cu IP Tailscale de oriunde. Experiență identică cu un agent cloud.
+
+- **Agentul recomandat: Pyi** — lightweight, customizabil, suport nativ llama.cpp fără middleware. `pip install mcp-pi-llama-cpp` + URL în settings.json.
+
+---
+
+## Quote-uri notabile
+
+> "It doesn't matter how well or how much we optimize, it will never beat a model that is totally loaded into the VRAM of a GPU."
+
+> "All the frontier models are trillion parameter models with an MoE architecture. Why do you think frontier labs are doing that? They don't have the hardware to run a dense 1 trillion parameter model."
+
+> "Agents are mostly pre-fill. Processing the long system prompt with instructions, MCP content, tool usage details, documents, and code files."
+
+> "A lot of the time we see people optimize for the token speed... but to run agents we actually need some prompt processing speed. It is much more important than the token speed that we are chasing."
+
+> "No subscription, no API key, and no rate limit. It's already yours and you can run it as much as you want as long as you can pay for the electricity bill."
+
+---
+
+## Setup recomandat (RTX 3060 12GB)
+
+| Component | Alegere |
+|-----------|---------|
+| GPU | RTX 3060 12GB (sau orice VRAM ≥ 8GB) |
+| Model 1 (cod) | Qwen 3.6B MoE REAP Q4_KM (Unsloth) |
+| Model 2 (general) | GLM 4.7 Flash REAP 23B Q4_KM |
+| Quantizare | Q4 KM sau Unsloth dynamic Q4 |
+| Threads | CPU cores - 1 (ex: 3 din 4 cores) |
+| Ubatch | 1024 (870 tok/s prefill cu VRAM headroom) |
+| KV Compression | K=Turbo4, V=Turbo2 |
+| Agent | Pyi (PyCode agent) |
+| Remote access | Tailscale |
+
+---
+
+## Relevanță pentru Echo / ROA
+
+- **Potențial:** Un setup local cu RTX 3060 + Pyi ar putea rula un coding agent autonom (similar Ralph) fără cost API. Rate limit = 0. Util dacă Anthropic limitează.
+- **Pragmatic (80/20):** Actualmente Echo + Ralph rulează pe subscription Anthropic Pro, cost OK. Setup local = efort hardware semnificativ. De monitorizat ca alternativă, nu de acționat imediat.
+- **Insight cheie pentru orice LLM local:** prefill speed > decode speed pentru use-case-uri agentic (routers, heartbeats, job-uri cron cu context mare).
diff --git a/memory/kb/youtube/2026-05-30_rebuilt-hermes-claude-code.md b/memory/kb/youtube/2026-05-30_rebuilt-hermes-claude-code.md
new file mode 100644
index 0000000..614703d
--- /dev/null
+++ b/memory/kb/youtube/2026-05-30_rebuilt-hermes-claude-code.md
@@ -0,0 +1,66 @@
+# I Rebuilt Hermes in Claude Code (It's Ridiculously Good)
+
+**URL:** https://youtu.be/wdc1OFWDxlU?si=0AqRf8_0stcSKrTi
+**Durata:** 12:56
+**Tags:** @work @growth @project
+
+---
+
+## TL;DR
+
+Hermes e un sistem agentic cu 40k stele GitHub în 46 de zile — rapid de adoptat, dar vine cu costuri ascunse. Autorul a ales să **reconstruiască doar piesele relevante din Hermes** în propriul setup Claude Code, în loc să instaleze ceva off-the-shelf. Concluzia: mai lent la start, dar infinit mai scalabil și mai ușor de înțeles și reparat.
+
+Extrem de relevant pentru Echo Core — confirmă că abordarea ta (custom, modular, controlat) e corectă strategic.
+
+---
+
+## Puncte cheie
+
+**1. Cele 3 costuri ascunse ale sistemelor off-the-shelf (OpenClaw/Hermes)**
+- **Moștenești asumpții pe care nu le-ai ales** — self-learning loop-ul Hermes nu are validare externă; modelul se autoevaluează (grade your own homework), poate suprascrie silențios skill-uri bune cu versiuni mai slabe
+- **Nu poți repara ce nu înțelegi** — OpenClaw: 200+ vulnerabilități identificate, 386 pachete malițioase descoperite de un cercetător de securitate
+- **Nu scalează pe business** — Hermes e proiectat pentru un singur client/brand; pentru agenții/multi-client trebuie instalări separate, fiecare cu propria memorie
+
+**2. Identity layer**
+- Hermes: `memory.md` + `user.md` injectate la fiecare conversație — simplu și eficace
+- Limitare: nu poți comuta între clienți/branduri fără instalări separate
+- Soluție custom: folder per client cu `brand voice`, `ICP`, `visual identity` + skills **shared** între toți clienții dintr-o singură instalare
+
+**3. Memory system**
+- Hermes: autosave + summarize la fiecare turn, injectare în conversație (cap ~1300 tokens), recall prin **keyword search** — slab pentru memorie pe termen lung
+- Soluție custom: același pattern de injectare (recent memory MD), dar recall prin **semantic search** (embeddings / mem search) — găsești informații după sens, nu după cuvinte exacte
+
+**4. Self-learning loop — controversat**
+- Hermes creează automat un skill nou după fiecare task — rapid la start
+- Problemă la scară: după 10-20 skill-uri, ajungi cu 15 versiuni ale aceluiași lucru (LinkedIn post V1, V2, V3...), greu de menținut
+- **Soluție custom: skill systems modulare** — fiecare skill face un singur lucru, stă într-un singur loc, se actualizează într-un singur loc; un skill system le înlănțuiește în ordinea corectă
+- Când vocea brandului se schimbă: un singur fișier de actualizat, toate sistemele trag din el
+
+**5. Concluzie strategică**
+- Hermes: mai rapid la start
+- Custom: mai rapid la a 10-a, 100-a iterație — fiecare strat e vizibil, editabil, reutilizabil
+- Alegerea depinde de context; nu există răspuns universal
+
+---
+
+## Citate relevante
+
+> "You can't fix what you don't understand underneath."
+
+> "The same model that writes the skill is also the sole judge of its correctness."
+
+> "When your brand voice shifts, you've got like 15 places to go and update."
+
+> "Hermes is faster to start, but your own setup is actually going to be faster to scale."
+
+---
+
+## Idei acționabile pentru Echo Core
+
+- [ ] **Skill systems modulare** — Echo are deja o structură similară (personality/*.md, tools separate). Verifică dacă skill-urile noi (pauze respirație, coaching etc.) urmează pattern-ul modular sau acumulează duplicate
+- [ ] **Semantic recall confirmat corect** — Echo folosește deja Ollama all-minilm embeddings pentru memory search semantic. Asta e exact ce autorul recomandă față de Hermes keyword search. Confirmăm că arhitectura e solidă.
+- [ ] **Validare externă pentru self-improvement** — Ralph scrie cod autonom; reviewul vine din skills gstack (/qa, /review). Dacă vrei un self-learning loop pentru Echo, adaugă un pas de validare externă (teste, comparare cu versiunea anterioară) înainte de a accepta skill-ul nou.
+
+---
+
+*Salvat: 2026-05-30*
diff --git a/memory/kb/youtube/2026-05-31_agentic-engineering-100x-faster.md b/memory/kb/youtube/2026-05-31_agentic-engineering-100x-faster.md
new file mode 100644
index 0000000..fd968e4
--- /dev/null
+++ b/memory/kb/youtube/2026-05-31_agentic-engineering-100x-faster.md
@@ -0,0 +1,106 @@
+# Why This Dev Ships 100x Faster Than 99% of Engineers
+
+**Sursa:** https://youtu.be/PzVV4X37ihg  
+**Canal:** David Andre Podcast  
+**Invitat:** Mickey (senior dev, 95% AI-generated code)  
+**Durata:** 53:52  
+**Data:** 2026-05-31  
+**Tags:** @work @growth @agentic-engineering @ai-tools @productivitate
+
+---
+
+## TL;DR
+
+Mickey, un senior developer, explică cum livrează de 100x mai rapid folosind **agentic engineering** — nu vibe coding. Diferența cheie: tu faci gândirea strategică, AI face execuția. Stack-ul lui: Cursor + GPT-5.5 (sau Opus 4.7 Max pentru UI) + 3 unelte specifice. Principiul central: context engineering — să dai agentului exact ce are nevoie, nu mai mult.
+
+---
+
+## Puncte cheie
+
+### 1. Harness > Model (dar modelul tot contează)
+- Harness-ul = tot ce înconjoară modelul: tools, system prompt, agenți, fișiere md
+- Cursor/Claude Code/Codex diferă nu prin model, ci prin uneltele pe care le dau agentului
+- Modelele top (GPT-5.5, Opus 4.7 Max) sunt mandatory — modelele gratuite/ieftine nu țin pasul
+- **Opus 4.7 Max** = ideal pentru UI/frontend; **GPT-5.5 Extra High** = codebase-uri mari/complexe
+
+### 2. Context Engineering — principiul #1
+- Ține context window-ul curat: agentul e "deștept" până la ~60% din context, după aceea degradează
+- Features mici, PR-uri mici = agent mai precis, mai puține erori
+- Planul nu e pentru agent — e pentru tine, să ții agentul accountable și să spargi task-ul în bucăți mici
+- Dacă planul pare prea mare → "Cum facem asta un PR mic, ușor de review?"
+
+### 3. Stack de 3 unelte concrete
+
+**Unealta 1: `open-source` (de la Vercel)**
+- Descarcă source code-ul oricărui pachet/repo în codebase-ul tău
+- În `agents.md` îi spui agentului să fetch-uiască codul oricărui pachet necunoscut
+- De ce: codul e cel mai bun "context" — mai bun decât documentația human-written
+- Cum: `npx open-source <repo-url>` → folder `open-source/repos/`
+
+**Unealta 2: Skill de refactorizare (service layer)**
+- Problema: agentul rescrie funcții existente în loc să le refolosească → code smell
+- Soluția: după fiecare feature, rulezi un skill care identifică cod duplicat și creează service layers
+- Cod curat = agentul poate relua lucrul pe un session nou fără confuzie
+- Alternativă: Matt Pocock's "improved code base structure" skill
+
+**Unealta 3: Greptile + `/grep-loop` skill**
+- Greptile face code review cu confidence score (1-5)
+- `/grep-loop`: agentul citește PR-ul + feedback-ul Greptile, fixează, retrimite la review, repetă până la 5/5
+- Merge automat, te ocupi de altceva între timp
+- Funcționează NUMAI pe PR-uri mici (sub câteva sute de linii)
+
+### 4. Agentic Engineering vs Vibe Coding
+- **Vibe coding**: delegi gândirea agentului → rezultate inconsistente, pierzi controlul
+- **Agentic engineering**: tu gândești strategic, agentul execută ca un "junior cracked care are nevoie de îndrumare"
+- Tratează modelul ca "un om deștept cu memorie fotografică dar care nu știe cum să folosească tot ce știe"
+- Nu te lăsa condus de agent — el va fi de acord cu orice și va inventa probleme inexistente
+
+### 5. Securitate în era agentică
+- Nu instala pachete mai noi de 14 zile — attack vector major prin pachete noi malițioase
+- Promptează agentul să refuze pachete sub 14 zile vechime
+- 2FA obligatoriu (nu prin SMS — SIM swapping real)
+- Password manager (1Password etc.)
+- Passphrase de familie pentru verificare identitate (voice cloning avansat)
+- La breach pe Twitter: paste tweet în Claude → "sunt afectat?" → verifică directoarele automat
+
+### 6. Lansează mai repede (mentalitate SF)
+- Oamenii din San Francisco lansează cu MVP semi-funcțional și câștigă market share
+- Cei care așteaptă „încă un feature" pierd față de competitori mai puțin tehnici, dar mai curajoși
+- "Construiește în public, nu în umbră" — feedback real > perfecționism intern
+- Dacă crezi în produs, orice obstacol e rezolvabil; dacă ești pe gard, renunți
+
+### 7. Viitorul: Knowledge Work > Agentic Engineering
+- Modelele sunt deja suficient de bune pentru knowledge work — lipsesc uneltele din jur
+- Anthropic + OpenAI lansează "consulting arms" pentru a ajuta companii să adopte AI
+- "Dacă ajuți compania ta să adopte AI → ești promovat" (exemplu: 24 de ani, prezentare Claude → manager)
+- Nimeni nu știe exact ce urmează — embrace uncertainty, nu o dread
+
+---
+
+## Quote-uri relevante
+
+> "In agentic engineering, you're doing the thinking and then you're just letting your minions do the work. You're letting a bunch of junior grads who are very cracked, but need a lot of guidance do the work."
+
+> "The model is just a predictor of next text. The model doesn't think. The model just predicts the next text."
+
+> "Context engineering might as well be a principle in engineering in and of itself — this is a make or break for how good things will be."
+
+> "Treat this like a really dumb person with photographic memory that knows everything but doesn't know how to use everything."
+
+> "Even if you don't understand the syntax — which syntax doesn't really matter nowadays — understanding how good code and architecture works helps."
+
+> "If it's hard for a human to read, it's probably going to be hard for the agent too."
+
+> "Never install a package younger than 14 days — that's how the big attack vectors are happening now."
+
+> "Don't take the change as 'this is happening against me' — if you have a little mindset shift and say 'this is happening for me', you'll grow with the industry."
+
+---
+
+## Relevanța pentru Marius / Echo Core
+
+- **Ralph**: principiul "plan mic → PR mic → loop de review" e exact ce face Ralph cu stories — validare că suntem pe drumul bun
+- **Context engineering**: motivul pentru care sesiunile de planning gstack sunt importante înainte de execuție (nu în timpul)
+- **Open-source tool**: potențial util pentru roa2web — dacă folosim librării Vue/FastAPI, putem da agentului source code-ul direct
+- **Skill de refactorizare post-feature**: ar putea fi integrat în ralph.sh după fiecare story completat
+- **Lansare rapidă**: lecție pentru proiectele lui Marius — MVP funcțional > perfecționism
diff --git a/memory/kb/youtube/2026-05-31_hormozi-robbins-game-of-life.md b/memory/kb/youtube/2026-05-31_hormozi-robbins-game-of-life.md
new file mode 100644
index 0000000..64baad2
--- /dev/null
+++ b/memory/kb/youtube/2026-05-31_hormozi-robbins-game-of-life.md
@@ -0,0 +1,48 @@
+---
+title: Alex Hormozi x Tony Robbins - O Conversație Brutală despre Jocul Vieții
+url: https://youtu.be/u1Aam_1NlRs
+date: 2026-05-31
+duration: "69:42"
+tags: "@growth @coaching"
+---
+
+## TL;DR
+Tony Robbins și Alex Hormozi poartă o conversație profundă despre ce înseamnă cu adevărat succesul și împlinirea. Robbins diagnostichează în timp real „blocajul" lui Hormozi: știința realizărilor îl stăpânește, dar arta împlinirii îi lipsește. Mesajul central: willpower-ul și datoria te duc până la un punct, dar pentru a trăi cu adevărat ai nevoie de o misiune mai mare decât tine, de conexiune emoțională reală și de identitate conștientă. Trecerea de la „trebuie să fac" la „am privilegiul să fac" este diferența dintre bogăție și sărăcie — nu ca bani, ci ca stare de viață.
+
+## Puncte cheie
+- **Motivație push vs. pull**: Motivația prin presiune (datorie, obligație) epuizează. Motivația prin atracție — ceva ce vrei să servești mai mult decât pe tine — îți explodează energia și rezistența.
+- **Contribuția este împlinirea maximă**: Tony nu distinge între datorie și plăcere — pentru el totul e plăcere, pentru că contribuția este scopul pentru care suntem făcuți. Dacă faci business doar pentru bani, ajungi la un plafon de împlinire.
+- **Știința realizărilor vs. arta împlinirii**: Realizarea e o știință — dacă urmezi sistemul, obții rezultate. Împlinirea e unică pentru fiecare om și nu poate fi copiată. Hormozi excelează la prima, o neglijează pe a doua.
+- **Vocabularul transformațional**: Cuvintele pe care le atașezi experiențelor devin experiențele tale. „Datorie" produce alte emoții decât „oportunitate". Dacă te antrenezi cu cuvinte de suferință, te vei simți în suferință, indiferent de circumstanțe.
+- **Identitatea este forța de control**: Cel mai puternic mecanism din personalitatea umană e nevoia de a rămâne consistent cu identitatea proprie. Ce crezi că ești — ești. Schimbă identitatea, schimbi comportamentul și rezultatele.
+- **Moonshot-ul contribuției**: Simpla contribuție de rutină devine banală prin legea familiarității. Ai nevoie de un obiectiv nerezonabil de mare, conectat emoțional la o cauză reală, care să te trezească dimineața și să te țină treaz noaptea.
+- **Capcanele astronautului**: Oamenii care au atins apogeul (mers pe lună, vândut compania cu miliarde) devin adesea alcoolici sau cad în depresie pentru că nu mai știu să găsească bucuria în lucruri mici. Soluția: reconectare cu stările vii, nu o nouă realizare externă.
+- **Stresul e din management, nu din dificultate**: Oamenii de succes sunt stresați nu pentru că viața e grea, ci pentru că gestionează — nu creează. Creierul pus în modul de management te bagă în supraviețuire.
+- **Limbajul NLP modifică biochimia**: Același eveniment neplăcut poate fi interpretat ca „umilitor", „enervant" sau „amuzant" — în funcție de cuvântul ales, emoția resimțită e complet diferită. Partenerul de negociere care spunea „sunt puțin deranjat" în loc de „sunt furios" se recupera instant.
+- **Ieșirea din cap, intrarea în inimă**: Creierul reduce și compară. Inima amplifică și conectează. Cunoașterea intelectuală a unui lucru bun nu produce emoție — prezența și implicarea directă o fac.
+- **Selecția în relații**: 80% din succesul unei relații intime vine din selecție — nu pe cine alegi, ci ce versiune din tine alegi să fie în relație. Versiunea care se dăruiește complet la început vs. versiunea tranzacțională care măsoară.
+- **Capitalismul și ownership-ul**: Dacă trăiești într-un sistem de liberă inițiativă și nu ești proprietar, vei suferi mereu de inflație și incertitudine. Tranziția de la angajat la proprietar schimbă fundamental relația cu sistemul economic.
+- **Alocarea activelor ca a doua afacere**: Nu-ți poți pune toate ouăle într-un singur coș (propria afacere). Ai nevoie de două „afaceri" paralele: cea pe care o construiești și un portofoliu de investiții care crește independent.
+- **Private equity bate orice**: Pe 39 de ani, S&P 500 a returnat 9% pe an (1M → 28.6M), iar private equity mediu 15.7% pe an (1M → 293M). Diferența de acces la aceste instrumente e cea mai mare inegalitate financiară ascunsă.
+
+## Quote-uri memorabile
+- "The only thing that makes us feel alive is growth. When you grow, then you have something to give." — Tony despre de ce oamenii bogați și faimoși ajung să se distrugă dacă se opresc din creștere.
+- "Get in your head, you're dead." — Robbins despre cum analiza excesivă blochează bucuria și conexiunea emoțională.
+- "The difference between have to, duty, and get to — that's the difference between rich and poor. And rich and poor is not money. Rich and poor is feeling fully alive." — Esența conversației, în două propoziții.
+- "There are two skills in life: the science of achievement, which you're unbelievably great at, and the art of fulfillment, which you're not so great at." — Robbins diagnosticând situația lui Hormozi.
+- "Pain is part of life. Suffering is an option." — Robbins separând realitatea dificultății de alegerea de a suferi.
+- "The words you attach to an experience become your experience." — Principiul vocabularului transformațional — cuvintele nu descriu realitatea, o creează.
+- "Transcend means end the trance. Whatever you say to yourself over and over again, sooner or later you believe it." — Despre auto-hipnoză și cum ieși din ea.
+- "I don't teach you shit. You've done everything you do. But what I could offer you is conscious choice to find Anabolic Alex and put him in charge." — Robbins refuzând să-l „antreneze" pe Hormozi, dar oferindu-i cheia.
+
+## Idei acționabile
+- **Numește-ți „sinele" de serviciu**: Creează un alter-ego clar pentru starea ta productivă și conectată (Hormozi a ales „Anabolic Alex" vs. „Analytical Alex"). Când ai nevoie de energie și conexiune, cheamă conștient acel alter-ego — nu willpower, ci comutare de identitate.
+- **Șterge cuvintele toxice din vocabular**: Identifică 2-3 cuvinte care îți intensifică suferința inutil (deprimat, obligat, trebuie, datorie) și înlocuiește-le cu variante care schimbă biochimia (provocat, oportunitate, privilegiu, vreau).
+- **Găsește-ți moonshot-ul de contribuție**: Nu orice cauză nobilă — cauza care îți aprinde ceva personal. Leagă-o de un moment real din viața ta (un prag, o transformare, o durere depășită). Stabilește un număr nerezonabil de mare și un termen clar.
+- **Fii prezent fizic la impactul tău**: Poți scrie un cec sau crea conținut, dar emoția nu apare de la distanță. Mergi acolo unde impactul se petrece, vorbește cu oamenii afectați, conectează-te direct. Asocierea emoțională se construiește prin prezență, nu prin date.
+- **Auditează-ți identitatea regulat**: Întreabă-te: „Când am decis că sunt genul ăsta de om?" Dacă răspunsul e „acum 10 ani", e momentul să extinzi identitatea. Nu o abandona — extinde-o. Upgrade identitar, nu restart.
+- **Construiește a doua „afacere" ca investitor**: Indiferent de nivelul tău, începe să diversifici în afara propriei afaceri. Minimul: S&P 500 index. Aspirațional: acces la private equity sau co-investiții. Nu lăsa toată averea în singurul coș pe care îl controlezi.
+- **Testează o stare de „get to" timp de 7 zile**: Pentru o săptămână, înlocuiește orice „trebuie să fac X" cu „am oportunitatea să fac X". Observă diferența de energie și motivație. Creierul se recalibrează prin repetiție lingvistică.
+
+## Sursa
+Alex Hormozi interviewing Tony Robbins
diff --git a/memory/kb/youtube/2026-05-31_i-ran-a-1b-ai-agent-on-a-0-budget-100-tok-s-on-8gb.md b/memory/kb/youtube/2026-05-31_i-ran-a-1b-ai-agent-on-a-0-budget-100-tok-s-on-8gb.md
new file mode 100644
index 0000000..3f6d52d
--- /dev/null
+++ b/memory/kb/youtube/2026-05-31_i-ran-a-1b-ai-agent-on-a-0-budget-100-tok-s-on-8gb.md
@@ -0,0 +1,34 @@
+# I Ran a 1B AI Agent on a $0 Budget — 100+ tok/s on 8GB GPU
+
+**Sursa:** https://youtu.be/i-Oq_CcFsT4?si=lClTPzCk3kMvEaAG
+**Data:** 2026-05-31
+**Creator:** Prompt Engineer
+**Format:** Video (~24:14 min)
+**Tags:** @work @growth
+
+---
+
+## TL;DR
+
+MiniCPM 5 1B (2.17 GB, necesită 7-8 GB VRAM) rulează la 100+ tok/s pe un GPU de 8 GB. Videoul demonstrează 3 metode: Ollama (simplu, rapid), vLLM (throughput mai mare, necesar pentru apps publice, necesită WSL pe Windows), și OpenCode (alternativa open-source la Claude Code). Concluzie: modelul e excelent pentru chat/rezumare/reasoning, slab la tool-calling și modificare fișiere. Cheia pentru frameworks agentice (OpenCode, Hermes): `--max-model-len 64000`.
+
+## Puncte cheie
+
+- **MiniCPM 5 1B** de la OpenBMB — 2.17 GB, GGUF disponibil pe HuggingFace, performant pe GPU de 8 GB
+- **100 tok/s** pe 8 GB VRAM — viteză impresionantă pentru un model local, gratuit
+- **3 metode de rulare**: Ollama (cel mai simplu), vLLM (throughput mai mare, multi-user), OpenCode CLI
+- **vLLM pe Windows**: necesită WSL + Ubuntu + Miniconda. Comanda: `vllm serve OpenBMB/MiniCPM-S-1B-sft --port 1234 --gpu-memory-utilization 0.65 --max-model-len 64000 --enable-auto-tool-choice --tool-call-parser hermes`
+- **64K context obligatoriu** pentru frameworks agentice (OpenCode, Hermes Agent) — fără el primești eroare de context prea mic
+- **Limitări**: tool-calling slab, nu poate modifica fișiere în agentic mode; bun pentru text/reasoning/chat
+- **OpenCode** = open-source Claude Code, configurabil cu orice model vLLM prin custom provider
+
+## Idei acționabile
+
+- [ ] **Test MiniCPM 5 1B pe Ollama (LXC 104)** — are deja Ollama, modelul e 2.17 GB, VRAM disponibil? Bun pentru taskuri de rezumare/reasoning locale @work
+- [ ] **vLLM ca alternativă la API Anthropic** pentru taskuri repetitive simple (rezumare bonuri, extragere date) — cost $0 @work
+
+---
+
+## Transcrierea
+
+ Hey guys, I'm gonna take you through a journey. So we have this new model, Mini CPM 1 billion. And I'm gonna show you different ways in which you can use this beautiful small model. The file size of this model is just 2.17 GB. But if you are trying to make it run, we do need about 7 to 8 gigs of VRAM. And this fits perfectly on my PC because I have 8 GB of VRAM and 16 GB of RAM. So let's go ahead and try to use this. Now I will show you how to use this on Olama. You can do LM Studio as well. I'm gonna show you how I do the UI part on Streamlit. I'm gonna show you VLM inferencing as well. So that we can ultimately patch everything up and try to run it with the open code. Which is an open source alternative of Cloud code. Now this is really one of the toughest jobs for any LLM. Now 1 billion model LLM, we really doubt if it's possible or not. Now if you want to see if it was possible to do it or not. Please stay till the later part of the video when I will show you if it was a success or not. But rest of the things like Olama and VLM was a success. LM Studio was a success as well. There are so many options to run this and I'm gonna touch upon a few and do this quickly. Let's go ahead. So we have this from OpenBMB. We have MiniCPM 5 1 billion model. And this is a very small model. You can see you can compare it with other Qen 3 0.6 billion model Qen 3.5 0.8 billion model. Or LLM 2.5 1.2 billion model. And you can see our new model which is MiniCPM 5 1 billion. It's actually performing pretty good in agendic works in logical reasoning and math reasoning. Again in coding and in general knowledge. So let's go ahead and let's see this. We have different options that you can choose. We have this base version of 1 billion. And then we have the SFT version. We have the base version here. And then we have this GGUF version which is for LLAMA CPP, Olama and LM Studio. Now I'm gonna use this. 
So what I'm gonna do is I'm gonna click here and go to Hugging Face and then download this. So I'm just gonna copy or click here and then download it here. So this is downloaded. This is the model. And for running it on Olama, we first need to install Olama. The installation process of Olama is pretty easy. You can go to PowerShell and run this command. Or you can go to Downloads here. And again have options for all these. You can go ahead and download the desktop app as well for Windows. Okay, so I already have installed Olama. So because of which I can do this. I can go to our command prompt here. And then I can do Olama list. And you can see the list of models that I have. But the important thing is, I don't want this. But the important thing is that I want to use this GGUF model. Now for using this model, we need to create a file called model file. Okay, just a model file. So what you need to do is you can go ahead and start up a text document, remove the extension and just name it model file. Okay, so once you do that. Now let's open up this in a VS code so that I can show you properly. So here we have this model file. So inside this model file, we need to do this. So we are taking the path where the GGUF file is. So this is my GGUF file. So I'm saying from the GGUF file and the template. I'm just leaving it like this. And I'm putting the parameter of temperature of 0.7. So this is the model file. And now using the model file, what you can do is you can go ahead and do cmd now to this particular location. And now you can do olama create the name of the model. So let's say I'm going to call this mini CPM 5 1b model. Okay. And then I'm going to say dash F and the model file. So I'm making a new model known as mini CPM 5 1b and we're using the model file. So inside the model file, as you can see, we have the things written here. So this model file. So inside the model file, we have the location of the GGUF because this is the actual model. 
We have the template and then we have the parameter of temperature. You can put other things as well. If you go to olama model file, you will see all the list of things that you can do. So I can go to model file reference and go to some examples here. So you can see from the model, set the temperature, set the number of context here, and then the system prompt. So even I can set this number of context. Let's set this context as well. So what I can do is I can put in like this, a context length could be, okay, could be anything between like 2000. It's okay. So now what we are going to do is I'm going to cmd here. And then now we are going to create this. I think I've already have written this. So this is the one olama create mini CPM 51B dash F and then the model file. That's going to press enter. And this is going to use the GGUF file, which you have downloaded. And it's going to make or save it as a new model. Now, if you say olama list, then you'll see that we have this new model mini CPM 51B latest. This is the new model that we have. Now what you can do is we can say olama run the name of the model. So mini CPM then 51B latest. So you can run this model now. So it's loading the two GB model. And if you want to see the memory utilization and everything, you can go ahead to WSL. So we'll see how to install WSL as well. But once you go to WSL, which is Windows subsystem for Linux, there you can run this command just a minute. There you can run this command. So I can say watch dash N one second. And I can say NVIDIA dash SMI. So this will show me, let me minimize this. And you can see we have the 8188 MB 8188 GB GPU. And out of which 1.50 has been used as of now. Okay, let's go ahead and ask some questions. What is the capital of India? You can see the speed. It's really amazing. The capital of India is New Delhi. Cool. Then if you want to see the number of tokens, I can just top this control D and then say clear. 
But again, if you run this with a verbose tag here and then say what is the capital of India? Then you can see the number of tokens here. You can see it's 100 tokens per second. And it's really amazing. So you can go ahead and use this on Olama. But the outputs are not that great when you use an Olama. But if you use in VLLM, I'm going to show you a better control on the outputs. Okay, so this is first step. You can go ahead and use this on Olama. Works completely fine. You can see you get 100 tokens per second. I have cap cut now running. If it doesn't run, then it goes even faster than this. But a local model, a powerful model giving you 100 tokens per second. It's really amazing. All right. The next part of the video would be to run it on VLLM using VLLM because it gives you higher throughput and it's better control. And again, multiple users requesting info and it's going to inference it and serve each one of them. And it's not going to confuse and send the outputs of one to another. So it's a better serving engine. VLLM is always best when you have public facing applications. So let's go ahead and use VLLM to host to run this one billion model here. So for using VLLM, what I'm going to do is, okay, so for using with VLLM, what we need to do is first go ahead and if you're on a Windows system, you need WSL because on Windows, it doesn't work much and it's really frustrating to get VLLM to work. So what are you going to do is we are going to use WSL, which is Windows subsystem for Linux. Basically a window, a Linux environment on a Windows machine. Okay. So we're going to run this on PowerShell and this is going to just get you started. So once you have that, you'll be able to open something called WSL here need to install Ubuntu as well. So you have Ubuntu and WSL here. And then what I'm going to do is I'm going to make one environment here using mini conda or conda. You need to install mini conda here as well. 
So when you first start this, you will have something like this. So you will have, sorry, you will have something like this. And then you need to install mini conda. So you go to mini conda, install here, to install mini conda here. And then you can go to Linux installer here, copy this, go back and paste it here. It'll download the files. And then you go down and run this, say enter, do the necessary requirements here. So enter, say yes, enter, and you are done. Okay. So it's not installed in my case because I already have a mini conda environment here. Okay. So once you install mini conda, then again, make a new environment which will be like conda, create dash and let's call it VLLM ENV and then say Python 3.12-Y. Okay. So this will create a VLLM environment here, VLLM underscore ENV environment on your WSL, on your Linux system. So this is going to install everything here. Okay. Now we need to do conda activate, I guess a conda activate VLLM underscore ENV. So this will activate the environment. Cool. Next, what you need to do is go ahead and install VLLM. So say pip install VLLM. And this is going to download the VLLM. This will take some time, but it will be worth it. And again, just keep it in this fashion and use it later as well. You don't need to download it every day and every time you use an LLM. This is a one-time setup that you can do on your system and you can keep it forever. Of course, you need to update the things when it is required. Okay. I'm not going to bore you with the installation here. So what I've done is I've already got this installed. So I can see the list of environments conda info dash dash ENVS. This will give me the list of environments. So the environment is this one open BNB. I can say conda activate open BNB. So this is my environment in which the VLLM is already installed. So now what I can do is I already have this command. I will share this with you, but this is the command that we need to use. 
So VLLM serve, we are serving this model open BNB. I can say mini CPM 1B. And then we will serve it in the port of 1, 2, 3, 4 GPU memory utilization is 0.65. Because if I keep it one, then it the 8GVGV that I have falls short. Therefore, I had to reduce this. Now the most important thing and the only important thing if you want to take from this video is this 64,000 value. We need a maximum model length of 64,000 value. If you want to work with open code or if you want to work with Hermos agent and cloud code. So if you want to use this model 1 billion model with even cloud code, you can do that. I will bring it a different video. You can obviously integrate with Hermos agent if it's possible with cloud code. And in this video, I'm just going to show you the open code way, but it's absolutely possible for Hermos agent and cloud code as well. I'll bring in a separate video for those. But today let's focus on open code. We need this 64,000 and it's essential. And I was trying this with Hermos agent. I will bring a separate video on this, but you can see the error that we have here. This is an internal error model open BNB mini CPM 1B has a context 20 of 16,000, but which is below the minimum 64,000 required by Hermos agent. And therefore I suggest to keep this value whenever you're working with agentic frameworks. Okay. So 64,000 now in order to have that 64,000, you need because it takes some GPUs. So we need to make some space in the GPU. So for which we need to reduce this value. Okay. And then we are putting enable auto tool choice. This will help us choose the tools. And then we're using the tool parser of Hermos and this will help you output tool calls. So let's run this VLM serve open VMB mini CPM 5, 1 billion. We got the same to open code. So this is running. You can see that it's pretty good. You can see the memory requirements. And that will be interesting. And it's not yet increased, but it will be in just a moment. 
So I have a GB GPU and it's going to increase. So I can see that it's increasing 32, 3.2 GB is used. It will go even more. Okay. It is downloading and making everything ready. So finally, everything has started up. It's VLM. Your 1 billion model is running here on the port 1, 2, 3, 4 on the local host. And I can see the usage here with a GB of GPU. I've used 6.4 GB is being used. And that includes my recording cap cut as well. So now let's go ahead and use this. How to use this now. What do you say we have this model? So if I go to a CMD and if I do a CURL and I can say HTTP and say local host, 1, 2, 3, 4, V1 and say models, then you'll get the list of models. So you can see that we have this open BMV mini CPM 1 billion. So this is the actual model. Okay. So if I use this, you can use this on the front end that I've shown you at the start. We're going to use trimlet. Now I have already made a code. I will share that with you as well. But this is a code app.py. You're using this endpoint of 1, 2, 3, 4 V1 chat completions. This is the model name. By the way, you can go ahead and try out the other models. I think the SFT would perform better as well. This is post train that we're using right now. But this is before the reinforcement learning and OPD. So you can go ahead and use this SFT model as well. But ultimately, the starting model was this pre training model. And then we have the SFT and then we have the RL plus OPD. So this is the final model that you can use. If you are on Apple, you can go ahead and use this MLX. We have seen how to use this on Olamma. But you can check out my other videos where I've shown you how to use this on Lama CPP and LM Studio as well. The Olamma and LM Studio are just have been made on the Lama CPP here. So let's go back enough of blabbering. This is the model that we're using setting the title of me CPM. And then we are putting this streamlit code here. I'll share the code. It's it's a normal code, nothing much. 
And there I need to install a streamlit streamlit and say requests. Sorry, it should be requests. Okay, seems like this is installed. Let's go ahead and run this now. Streamlit run app.py. So this will start up my app.py here. And you can see that we have this beautiful interface. Mini CPM 1 billion chat here. So let's go ahead and chat this. And I have deliberately not kept any memory as of now. So what is the capital of India? And you can see it's pretty fast. This is the thinking mode here till here and the output here. How cool is that? And I can say which is larger 9.11 or 9.9. 9.9 is larger because it's the first decimal digit larger than 9.11. And it's really great. Which is larger 2 to the power 3 or 3 to the power 2. And as a matter of fact, even chat GPT did this wrong. You can see I was asking some trick questions if I show you the exact conversation that we're having. So you can see that this wears some of the conversations that I was having with chat GPT so that you know I can ask some trick questions here. And you can see this one. You can see question number 10 which is larger 2 to the power 3 or 3 to the power 2. And the answer that chat GPT gave was equal. I mean come on. 3 to the power 2 is 9 and 2 to the power 3 is 8. And therefore 3 to the power 2 is larger. And you can see if we go to our favorite 1 billion model, let's see the output here. So you can see 3 to the power 2 is larger and it was able to solve this. It's really amazing how fast is this. I can go ahead and look at some other questions here. For example, write this number in words. Let's go ahead and see here. So you can see this one. Output. So all these are the thinking process and the output is 1024. Okay. And then other questions. For example, what comes next 248 1632. It's so fast. You know, like 100 tokens. It's really it's amazingly fast. You can see it's 32 here and harder reasoning. Let's say a doctor gives you three pills and say take one every 30 minutes. 
How long until all pills are taken? Okay. This is the question you got the question. I think a doctor gives you three pills and say take one pill every 30 minutes. So right now is let's say 12 o'clock and 12 30 you take one and one PM you take one and then one 30 you take one. So how long it would be like, you know, 90 minutes. So let's see the output. These are all the thinking process. I cannot complain, but I just want the final output. Okay. This is another thing which I even did wrong and says take one every 30 minutes. Okay. It considered that it took the first pill at the start the second 30 minutes later and the third 60 minutes later. What is the output here? Okay. First pill now second pill 30 minutes and the third pill 60 minutes. I was mistaken. Sorry. But what I want is that it's not, you know, it's not cutting the outputs and it's able to give the entire output here. And that is really amazing for me. And now thanks to the 64,000 tokens that we have put and then this code as well. And the, in the output tokens and the maximum allowable tokens also I have given a very good number, which is about 2048. And therefore it's not, it will not exclude now given this number considering that we will have the history as well. So if these questions would have been history and it remembered everything from the start, then it would easily cross, you know, 20k or 30k tokens. And then it will be useful when we have a 64k tokens here. But for this example, you can see it works really good. It's fun. It's fast and it's local. Now taking one step further, if you have been following and watching this video till now, let's go to the, the third criteria in which we are going to use this with open code. Now this is revolutionary. A one billion model trying to run on open code. If it was success, I don't think so. I don't want to say now, but let's go ahead and see for ourselves if it's a success or not. So for this, I can go and install open code using npm. 
So I can say npm, copy this, go to any cmd on your system. And then I can say npm i dash g, which means dash g is for global and this is done. Okay. So once we have this, what I'm going to do is I'm going to go to this directory that I was playing with. I'm going to go to cmd here and then I'm going to go and say open code. So open code and this will enter into your interface here. Pretty cool. Next, we want to have the models. So I can say models here, switch models and you can pick up the model from here. But let's go ahead and first use the desktop app in which I can show you more clearly how to set up a model. I'm going to close this. I'm going to go here and download the desktop app again. So this is the desktop app. I'm going to download four windows and once you download and install this, you'll have something like open code. So open code is opening. I'm going to go ahead and click here on the open projects. Go to that folder that we're working on. So select that folder and then say new session. And now we need to set our models here. Okay. So I click on the choose model here, click on the manage models here and then click on connect provider and then go to custom provider. So here what I'm going to say is YouTube. Let's say one billion. This is the provider ID. The display name could be the same thing like one billion. The base URL, this is important, HTTP. And then we have localhost or I can say 127. 001. And then I can say v1 and the port as well. Sorry. One to three, four. And then I'm going to say v1. Okay. And then there is no requirement of any API key, but the model name is really, really important here. And the model that we have seen is this one open BNB copied. No, open BNB, any CPM, be mini CPM five. So this will be like model ID is like open BNB. Then we have mini CPM five and one billion. The display name could be anything. I'm going to put like YouTube one billion. And let's go ahead and test this out. 
If it works or not, otherwise we need to change some things. So this is done. I'm going to close this. Let's open up again, open code. Okay. So we selected this model that we've used, which is this YouTube one B. And now we can go ahead and ask, hi, how are you cool? I can say what is the capital of India? Cool. And then if you remember, I had some questions that I asked. And if you remember, I have just one. And if you remember, I just asked some chat GPT for some trick questions here. So let's see, copy this and let's go ahead and let's go ahead and ask this question here. So divide 30 by half and then add 10. It doesn't have the mathematical abilities. But if you go to open code here and then maybe change the model, let's see. So we pick the model, which is this one, YouTube one B, and then ask the same question here. The same question that you asked just now, divide 30 by half. So you can see this is able to give me the answer because we're using on the code here. Now the difference between the CLI that I've observed and the web app is that in the web app, you're not able to make the files automatically. It's difficult to make the files automatically. But if you're using on the CLI, which is equivalent to cloud code, then you're able to make the files as well. But this one billion model is not it's able to make the entire files for yourself. But it's at least it's trying, but at least it's trying to get the results. And you can see that if I say, read the contents of the folder of the current folder, or you can say read the contents of the files of the current folder, then you see that it was able to get this file, this file app.txt. And we really do have that app.txt file here. And I can say, read the contents of the app.txt file. Go ahead. So you can see it lacks that it lacks that capabilities of using and changing the files in your system. 
But having said that, this is an amazing model for doing all other sorts of things without any file modification or shell commands running. So those are not available here. But still, you can go ahead and ask these questions as well. Which one is heavier? One kg of iron or one kg of cotton? And you can see that the weights are approximately iron this point 18 kg and cotton is 0.75. I want to expect that it performs well in the tool calling feature as well. So apart from the tool calling, I was able to find this as a very good model for using on llama.cpp, Ollama, LM Studio. And you can go ahead and use it and do the vLLM here as well. Now for any summarization task or the task which has words without any tool usage, you can go ahead and absolutely use this model. This is amazingly fast. It's like 100 tokens per second on the 8GB GPU that I have. So go ahead and test this out and let me know if you were able to connect this with all these things that I've mentioned here with Ollama, vLLM and OpenCode. I'm going to show in the next video on Claude Code as well. And maybe find a way to make that tool call usage. Okay. But in summary, it's a very good model. It works with Ollama. It works with vLLM. Even it works with OpenCode. We're able to connect. And there is no issue with the context as you have seen. 64K tokens. If you keep that 64K tokens, you know, if you have lesser GPU, if you have a 4GB GPU, then try to make this GPU utilization lesser. But you have to keep the 64K here as well. Now I've tested out so many things which I cut from the video. I've tested out with SGLang as well and it failed miserably. vLLM was the one which at least we were able to serve this. And at least we are able to use this with OpenCode. Okay. So in the next video, in the subsequent videos, I will play with this model and do all sorts of automations. Try it with Claude Code, try it with Hermes agent and I will show you the results. 
So stay tuned, stay subscribed and I will see you in the next one.
diff --git a/memory/kb/youtube/2026-06-01_agentic-engineering-workflow.md b/memory/kb/youtube/2026-06-01_agentic-engineering-workflow.md
new file mode 100644
index 0000000..dc7b8e3
--- /dev/null
+++ b/memory/kb/youtube/2026-06-01_agentic-engineering-workflow.md
@@ -0,0 +1,59 @@
+# My Agentic Engineering Workflow (step by step workflow)
+
+**URL:** https://youtu.be/WIDIV8oDDC8
+**Durata:** 35:53
+**Salvat:** 2026-06-01
+**Tags:** @work @growth
+
+---
+
+## TL;DR
+
+Workflow complet de inginerie agentică: GPT-4.5 extra high fast în Cursor + Greptile pentru code review automat + GP Loop (skill Greptile care iterează autonom până la 5/5) + Whisper Flow pentru dictare. Construiește o funcționalitate completă (artifacts preview, similar cu cel din Claude) fără să scrie manual aproape nicio linie de cod.
+
+---
+
+## Puncte Cheie
+
+- **Stack:** Cursor + GPT-4.5 extra high fast + Greptile (code review) + GP Loop skill + Whisper Flow (speech-to-text)
+- **Whisper Flow:** Gratis, speech-to-text — vorbești mai mult decât scrii, deci prompt-urile devin mai bogate
+- **Greptile GP Loop:** Skill care citește comentariile de review GitHub → face fix-uri → push → re-review, iterează autonom până la 5/5 sau 5 turns. Complet autonom.
+- **PR-uri mici:** Regula de aur — PR-uri sub 1000 linii, ideal câteva sute. >2000 linii = Greptile nu poate prinde toate problemele. A spart un PR de 2000 linii în 4 PR-uri stacked.
+- **Plan-ul e pentru tine, nu pentru agent:** Creează planul mai mult ca să ții minte ce construiești, mai ales când lucrezi pe mai multe features simultan.
+- **Subagenți non-blocking:** Agentul spawnează un subagent pentru research, iar thread-ul principal rămâne liber pentru alte întrebări.
+- **/code-structure skill:** Restructurează codebase-ul într-un service layer curat — ajută și agentul să citească și să înțeleagă codul.
+- **Confidence score Greptile:** 4-5/5 = safe to merge. Sub 4 = mai e de lucru. GP Loop se oprește la 5/5 sau 5 turns.
+- **Stack tehnic Pluto:** SvelteKit + Electron (desktop) + Convex (backend) + Daytona (agent cloud) + Super (memory) + Agent Mail + Plaid + Twilio
+
+---
+
+## Workflow pas cu pas
+
+1. **Dictează prompt-ul** cu Whisper Flow (vorbești liber, mai mult context)
+2. **Plan mode în Cursor** — agentul explorează codebase, propune plan cu PRs mici
+3. **Build feature** — back-and-forth cu agentul, testezi vizual
+4. **Push branch + PR** — agentul face push și creează PR automat
+5. **Greptile review** — obții confidence score + comentarii specifice
+6. **/gp loop** — agentul iterează autonom: citește feedback → fix → push → re-review
+7. **Merge** când 5/5 sau după review manual dacă se blochează
+
+---
+
+## Quote-uri
+
+> "The plan sometimes and actually most of the time is really for me because I'll work on multiple features at a time and I need to remember what it is that I was working on."
+
+> "Short, simple, concise, to the point, not too long. That's the sauce that I've seen success with."
+
+> "The smaller the PR, the more focused the PR, the better your life is. And I think the same applies to the agent as well."
+
+> "Engineering is not dead. In fact, it's become more alive because generating code has become so much easier."
+
+---
+
+## Idei Acționabile
+
+- [ ] Explorează **Greptile** pentru code review automat pe Gitea/GitHub — are skill GP Loop care poate fi integrat în workflow Ralph @work
+- [ ] **Speech-to-text** pentru prompt-uri mai bogate — Whisper Flow sau alternativă locală @work
+- [ ] Principiu: **PR-uri mici și focused** pentru Ralph — la fel ca pentru oameni, agentul produce calitate mai bună pe schimbări mici @work
+- [ ] **Plan mode** înainte de features mari — nu pentru agent, ci pentru Marius, ca să țină evidența @work