#!/usr/bin/env python3
"""
Lead Generator Minimal - finds companies that may need ERP/accounting solutions.

Uses the Brave Search API to find companies hiring accountants/economists.
Output: leads.csv with companies for manual review.

Usage: python find_leads.py [--limit N]
Requires: BRAVE_API_KEY in the environment or ~/.clawdbot/clawdbot.json
"""
import os
import re
import csv
import json
import argparse
from datetime import datetime
from pathlib import Path

OUTPUT_DIR = Path(__file__).parent / "output"
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Romanian company-name patterns, compiled once (hoisted out of the
# per-result loop in extract_companies_from_results).
_COMPANY_PATTERNS = [
    re.compile(r'([A-Z][A-Z\s\-\.&]+(?:S\.R\.L\.|SRL|S\.A\.|SA|S\.C\.))'),  # COMPANY S.R.L.
    re.compile(r'(SC\s+[A-Z][A-Z\s\-\.&]+(?:S\.R\.L\.|SRL|S\.A\.|SA))'),    # SC COMPANY SRL
    re.compile(r'([A-Z][a-zA-Z\s\-\.&]{2,30}(?:S\.R\.L\.|SRL|S\.A\.|SA))'), # Mixed case
]

# Prefix garbage frequently captured by the extraction regexes
# (job titles, currencies, city names, leading numbers).
_PREFIX_PATTERNS = [
    re.compile(p, re.IGNORECASE) for p in (
        r'^(?:Senior|Junior|Contabil|Economist|Director\s+Economic|Expert|Specialist)\s+',
        r'^(?:RON|EUR|USD)\s+',
        r'^(?:Bucuresti|Cluj|Iasi|Brasov|Constanta)\s+',
        r'^\d+[\s\-]+',
    )
]

# Obvious non-companies: job-board names and generic job titles.
_SKIP_PATTERNS = [
    re.compile(p, re.IGNORECASE) for p in (
        r'^emea\s',
        r'^staff\s',
        r'accountant',
        r'^bestjobs',
        r'^ejobs',
        r'^hipo',
    )
]


def get_brave_api_key():
    """Return the Brave Search API key, or "" if none is configured.

    Lookup order: clawdbot config (tools.web.search.apiKey, then
    brave.apiKey), then the BRAVE_API_KEY environment variable.
    """
    config_path = Path.home() / ".clawdbot" / "clawdbot.json"
    if config_path.exists():
        try:
            with open(config_path) as f:
                config = json.load(f)
        except (OSError, json.JSONDecodeError):
            # Unreadable/corrupt config: fall through to the env var
            # instead of crashing.
            config = {}
        # Try tools.web.search.apiKey (clawdbot format)
        api_key = (config.get("tools", {}).get("web", {})
                         .get("search", {}).get("apiKey", ""))
        if api_key:
            return api_key
        # Fallback to brave.apiKey
        api_key = config.get("brave", {}).get("apiKey", "")
        if api_key:
            return api_key
    # Fix: previously the env var was consulted only when the config file
    # did not exist; now it is the final fallback in every case.
    return os.getenv("BRAVE_API_KEY", "")


def search_brave(query, count=10):
    """Query the Brave Search API and return the list of web results.

    Returns [] on any failure (missing key, HTTP/network error, bad JSON),
    printing a diagnostic instead of raising.
    """
    # Deferred import: the pure helpers in this module stay importable
    # and testable even when requests is not installed.
    import requests

    api_key = get_brave_api_key()
    if not api_key:
        print("[!] Nu am găsit Brave API key")
        return []

    url = "https://api.search.brave.com/res/v1/web/search"
    headers = {
        "X-Subscription-Token": api_key,
        "Accept": "application/json"
    }
    params = {"q": query, "count": count}
    try:
        resp = requests.get(url, headers=headers, params=params, timeout=15)
        # Fix: surface 4xx/5xx (bad key, rate limit) instead of silently
        # parsing an error body and returning "no results".
        resp.raise_for_status()
        data = resp.json()
    except (requests.RequestException, ValueError) as e:
        print(f"[!] Brave search error: {e}")
        return []
    return data.get("web", {}).get("results", [])


def extract_companies_from_results(results):
    """Extract candidate company names from Brave search results.

    Scans each result's title + description with the Romanian company
    patterns; returns dicts with company, source_url and a short context
    snippet (used only during extraction/review).
    """
    companies = []
    for result in results:
        text = f"{result.get('title', '')} {result.get('description', '')}"
        for pattern in _COMPANY_PATTERNS:
            for match in pattern.findall(text):
                company = re.sub(r'\s+', ' ', match.strip())
                # Drop tiny fragments and run-on captures.
                if 5 < len(company) < 80:
                    companies.append({
                        "company": company,
                        "source_url": result.get("url", ""),
                        "context": text[:200],
                    })
    return companies


def clean_company_name(name):
    """Strip prefix garbage (job titles, cities, salaries) and trailing dashes."""
    result = name.strip()
    for pattern in _PREFIX_PATTERNS:
        result = pattern.sub('', result)
    # Clean trailing " - " left over from listing titles.
    result = re.sub(r'\s*-\s*$', '', result)
    return re.sub(r'\s+', ' ', result).strip()


def deduplicate(leads):
    """Remove duplicate leads by normalized company name.

    Also drops names that are too short after normalization and obvious
    non-companies (job boards, job titles). Mutates each lead's "company"
    field with its cleaned form.
    """
    seen = set()
    unique = []
    for lead in leads:
        lead["company"] = clean_company_name(lead["company"])
        # Compare on lowercase alphanumerics only, so "Alfa S.R.L." and
        # "ALFA SRL" collapse to the same key.
        company_norm = re.sub(r'[^a-z0-9]', '', lead["company"].lower())
        if len(company_norm) < 5:
            continue  # too short / invalid
        if any(p.search(lead["company"]) for p in _SKIP_PATTERNS):
            continue  # job-board or job-title noise
        if company_norm not in seen:
            seen.add(company_norm)
            unique.append(lead)
    return unique


def enrich_leads(leads):
    """Add empty tracking fields, to be filled in manually later."""
    # One timestamp for the whole batch (hoisted out of the loop).
    today = datetime.now().date().isoformat()
    for lead in leads:
        lead["found_date"] = today
        lead["cui"] = ""
        lead["email"] = ""
        lead["website"] = ""
        lead["phone"] = ""
        # Lifecycle: new, researched, contacted, replied, converted, rejected
        lead["status"] = "new"
        lead["notes"] = ""
        lead["industry"] = ""
    return leads


def save_leads(leads, filename="leads.csv"):
    """Write leads to OUTPUT_DIR/filename as CSV; return the output path.

    Extra keys such as the extraction-only "context" are skipped by the
    writer (extrasaction='ignore'), so the input dicts are not mutated.
    """
    output_file = OUTPUT_DIR / filename
    fieldnames = ["company", "industry", "source_url", "found_date",
                  "cui", "email", "website", "phone", "status", "notes"]
    with open(output_file, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames, extrasaction='ignore')
        writer.writeheader()
        writer.writerows(leads)
    return output_file


def main():
    """CLI entry point: search, extract, dedupe, enrich, save, report."""
    parser = argparse.ArgumentParser(description="Lead Generator Minimal")
    parser.add_argument("--limit", type=int, default=10,
                        help="Results per search query")
    args = parser.parse_args()

    print("🔍 Căutare leads via Brave Search...")

    # Search queries - companies currently hiring accountants/economists.
    queries = [
        'site:ejobs.ro contabil angajare 2026',
        'site:ejobs.ro economist angajare',
        'site:bestjobs.eu contabil Romania',
        'site:hipo.ro contabil angajare',
        '"angajam contabil" Romania firma',
        '"cautam economist" Romania SRL',
    ]

    all_leads = []
    for query in queries:
        print(f" → {query[:50]}...")
        results = search_brave(query, count=args.limit)
        companies = extract_companies_from_results(results)
        all_leads.extend(companies)
        print(f" Găsite: {len(companies)} companii")

    unique_leads = deduplicate(all_leads)
    print(f"\n📊 Total: {len(all_leads)} → {len(unique_leads)} unice")

    enriched = enrich_leads(unique_leads)
    output_file = save_leads(enriched)
    print(f"\n✅ Salvat: {output_file}")

    print(f"\n📋 {len(enriched)} companii găsite:")
    for i, lead in enumerate(enriched, 1):
        print(f" {i}. {lead['company']}")

    print("\n💡 Următorii pași:")
    print(f" 1. Deschide {output_file}")
    print(" 2. Completează CUI, email, website pentru cele interesante")
    print(" 3. Marchează status: researched → contacted → replied")

    return enriched


if __name__ == "__main__":
    main()