Update .trash, dashboard, root +1 more (+1 ~5 -18)
@@ -1,21 +1,67 @@
 #!/usr/bin/env python3
 """
-ANAF Monitor v2 - Extract and compare soft A/J versions from file names
+ANAF Monitor v2.2 - Hash detection + version extraction + text diff
+- Hash-based change detection (catches ANY change)
+- Extracts ALL soft A/J versions from page
+- Saves page text and shows diff on changes
 """
 
 import json
 import re
+import hashlib
 import urllib.request
 import ssl
+import difflib
 from datetime import datetime
 from pathlib import Path
+from html.parser import HTMLParser
 
 SCRIPT_DIR = Path(__file__).parent
 CONFIG_FILE = SCRIPT_DIR / "config.json"
 VERSIONS_FILE = SCRIPT_DIR / "versions.json"
+HASHES_FILE = SCRIPT_DIR / "hashes.json"
+SNAPSHOTS_DIR = SCRIPT_DIR / "snapshots"
 LOG_FILE = SCRIPT_DIR / "monitor.log"
 DASHBOARD_STATUS = SCRIPT_DIR.parent.parent / "dashboard" / "status.json"
 
+# Ensure snapshots directory exists
+SNAPSHOTS_DIR.mkdir(exist_ok=True)
+
+
+class TextExtractor(HTMLParser):
+    """Extract visible text from HTML"""
+    def __init__(self):
+        super().__init__()
+        self.text = []
+        self.skip_tags = {'script', 'style', 'head', 'meta', 'link'}
+        self.current_tag = None
+
+    def handle_starttag(self, tag, attrs):
+        self.current_tag = tag.lower()
+
+    def handle_endtag(self, tag):
+        self.current_tag = None
+
+    def handle_data(self, data):
+        if self.current_tag not in self.skip_tags:
+            text = data.strip()
+            if text:
+                self.text.append(text)
+
+    def get_text(self):
+        return '\n'.join(self.text)
+
+
+def html_to_text(html):
+    """Convert HTML to plain text"""
+    parser = TextExtractor()
+    try:
+        parser.feed(html)
+        return parser.get_text()
+    except Exception:
+        # Fallback: just strip tags
+        return re.sub(r'<[^>]+>', ' ', html)
+
 SSL_CTX = ssl.create_default_context()
 SSL_CTX.check_hostname = False
 SSL_CTX.verify_mode = ssl.CERT_NONE
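Note: the new TextExtractor feeds the visible page text into the snapshot/diff pipeline below. A minimal sketch of its behavior, assuming the class above is in scope (the HTML fragment is invented):

    html = '<html><head><style>p {}</style></head><body><p>soft A</p></body></html>'
    parser = TextExtractor()
    parser.feed(html)          # data under the most recently opened script/style/head/meta/link tag is dropped
    print(parser.get_text())   # -> 'soft A'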
@@ -39,14 +85,58 @@ def save_json(path, data):
 def fetch_page(url, timeout=30):
     try:
         req = urllib.request.Request(url, headers={
-            'User-Agent': 'Mozilla/5.0 (compatible; ANAF-Monitor/2.0)'
+            'User-Agent': 'Mozilla/5.0 (compatible; ANAF-Monitor/2.1)'
         })
         with urllib.request.urlopen(req, timeout=timeout, context=SSL_CTX) as resp:
-            return resp.read().decode('utf-8', errors='ignore')
+            return resp.read()
     except Exception as e:
         log(f"ERROR fetching {url}: {e}")
         return None
 
+def compute_hash(content):
+    """Compute SHA256 hash of content"""
+    return hashlib.sha256(content).hexdigest()
+
+
+def load_snapshot(page_id):
+    """Load previous page text snapshot"""
+    snapshot_file = SNAPSHOTS_DIR / f"{page_id}.txt"
+    try:
+        return snapshot_file.read_text(encoding='utf-8')
+    except Exception:
+        return None
+
+
+def save_snapshot(page_id, text):
+    """Save page text snapshot"""
+    snapshot_file = SNAPSHOTS_DIR / f"{page_id}.txt"
+    snapshot_file.write_text(text, encoding='utf-8')
+
+
+def generate_diff(old_text, new_text, context_lines=3):
+    """Generate unified diff between old and new text"""
+    if not old_text:
+        return None
+
+    old_lines = old_text.splitlines(keepends=True)
+    new_lines = new_text.splitlines(keepends=True)
+
+    diff = list(difflib.unified_diff(
+        old_lines, new_lines,
+        fromfile='anterior',
+        tofile='actual',
+        n=context_lines
+    ))
+
+    if not diff:
+        return None
+
+    # Limit the diff to at most 50 lines of output
+    if len(diff) > 50:
+        diff = diff[:50] + ['... (truncat)\n']
+
+    return ''.join(diff)
+
 def parse_date_from_filename(filename):
     """Extract the date from the file name (e.g. D394_26092025.pdf -> 26.09.2025)"""
     # Pattern: _DDMMYYYY. or _DDMMYYYY_ or _YYYYMMDD
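Note: fetch_page now returns raw bytes so the hash runs before any decoding; any byte-level edit to the page flips the hash. A toy illustration (payloads are made up):

    import hashlib

    old = b'<html>soft A 26.09.2025</html>'
    new = b'<html>soft A 03.10.2025</html>'
    print(hashlib.sha256(old).hexdigest() == hashlib.sha256(new).hexdigest())  # False -> page changed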
@@ -69,10 +159,10 @@ def parse_date_from_filename(filename):
     return None
 
 def extract_versions(html):
-    """Extract the first soft A and soft J from the HTML"""
+    """Extract soft A/J from the HTML - the first generic one + all labeled ones (S1002, etc.)"""
     versions = {}
 
-    # Find the first soft A link (PDF)
+    # Find the FIRST soft A link (PDF) - the current version
     soft_a_match = re.search(
         r'<a[^>]+href=["\']([^"\']*\.pdf)["\'][^>]*>\s*soft\s*A\s*</a>',
         html, re.IGNORECASE
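Note: the soft A pattern anchors on the link text, not just the .pdf extension. A quick check with an invented link:

    import re

    html = '<a href="/static/D394_26092025.pdf">soft A</a>'
    m = re.search(r'<a[^>]+href=["\']([^"\']*\.pdf)["\'][^>]*>\s*soft\s*A\s*</a>',
                  html, re.IGNORECASE)
    print(m.group(1))  # /static/D394_26092025.pdf -> parse_date_from_filename yields 26.09.2025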
@@ -84,17 +174,33 @@ def extract_versions(html):
     if date:
         versions['soft_a_date'] = date
 
-    # Find the first soft J link (ZIP)
-    soft_j_match = re.search(
-        r'<a[^>]+href=["\']([^"\']*\.zip)["\'][^>]*>\s*soft\s*J',
+    # Find ALL soft J links WITH A LABEL (e.g. "soft J - S1002")
+    soft_j_labeled = re.findall(
+        r'<a[^>]+href=["\']([^"\']*\.zip)["\'][^>]*>\s*soft\s*J\s*-\s*([^<]+)',
         html, re.IGNORECASE
     )
-    if soft_j_match:
-        url = soft_j_match.group(1)
-        versions['soft_j_url'] = url
-        date = parse_date_from_filename(url)
-        if date:
-            versions['soft_j_date'] = date
+
+    if soft_j_labeled:
+        # Page with named softs (balance-sheet page)
+        for url, label in soft_j_labeled:
+            label = label.strip()
+            key = f'soft_j_{label.replace(" ", "_")}'
+            versions[f'{key}_url'] = url
+            date = parse_date_from_filename(url)
+            if date:
+                versions[f'{key}_date'] = date
+    else:
+        # Page with a plain soft J - take only the first one
+        soft_j_match = re.search(
+            r'<a[^>]+href=["\']([^"\']*\.zip)["\'][^>]*>\s*soft\s*J',
+            html, re.IGNORECASE
+        )
+        if soft_j_match:
+            url = soft_j_match.group(1)
+            versions['soft_j_url'] = url
+            date = parse_date_from_filename(url)
+            if date:
+                versions['soft_j_date'] = date
 
     # Find the publication date in the text
     publish_match = re.search(
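Note: the labeled branch lets a single page report several soft J entries, keyed by label. A sketch with invented links:

    import re

    html = ('<a href="/j/S1002_26092025.zip">soft J - S1002</a>'
            '<a href="/j/S1003_26092025.zip">soft J - S1003</a>')
    pairs = re.findall(
        r'<a[^>]+href=["\']([^"\']*\.zip)["\'][^>]*>\s*soft\s*J\s*-\s*([^<]+)',
        html, re.IGNORECASE)
    print(pairs)  # [('/j/S1002_26092025.zip', 'S1002'), ('/j/S1003_26092025.zip', 'S1003')]
    # -> versions gets soft_j_S1002_url / soft_j_S1002_date, and so on per label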
@@ -106,71 +212,106 @@ def extract_versions(html):
 
     return versions
 
 def format_date(d):
     """Format the date for display"""
     if not d:
         return "N/A"
     return d
 
-def compare_versions(old, new, page_name):
+def compare_versions(old, new):
     """Compare the versions and return the differences"""
     changes = []
 
-    fields = [
-        ('soft_a_date', 'Soft A'),
-        ('soft_j_date', 'Soft J'),
-        ('published', 'Publicat')
-    ]
+    # Collect all unique keys
+    all_keys = set(old.keys()) | set(new.keys())
+    date_keys = sorted([k for k in all_keys if k.endswith('_date') or k == 'published'])
 
-    for field, label in fields:
-        old_val = old.get(field)
-        new_val = new.get(field)
+    for key in date_keys:
+        old_val = old.get(key)
+        new_val = new.get(key)
+
+        # Format the label
+        label = key.replace('_date', '').replace('_', ' ').title()
 
         if new_val and old_val != new_val:
             if old_val:
                 changes.append(f"{label}: {old_val} → {new_val}")
             else:
-                changes.append(f"{label}: {new_val} (nou)")
+                changes.append(f"{label}: {new_val} (NOU)")
 
     return changes
 
-def check_page(page, saved_versions):
+def format_current_versions(versions):
+    """Format the current versions for output"""
+    result = {}
+    for key, val in versions.items():
+        if key.endswith('_date'):
+            label = key.replace('_date', '')
+            result[label] = val
+    return result
+
+def check_page(page, saved_versions, saved_hashes):
     """Check a page and return the changes"""
     page_id = page["id"]
     name = page["name"]
     url = page["url"]
 
-    html = fetch_page(url)
-    if html is None:
+    content = fetch_page(url)
+    if content is None:
         return None
 
+    # 1. Check the hash first (detects ANY change)
+    new_hash = compute_hash(content)
+    old_hash = saved_hashes.get(page_id)
+
+    html = content.decode('utf-8', errors='ignore')
+    new_text = html_to_text(html)
     new_versions = extract_versions(html)
     old_versions = saved_versions.get(page_id, {})
 
-    # First run - just save, don't report
-    if not old_versions:
-        log(f"INIT: {page_id} - {new_versions}")
+    # Load the previous snapshot
+    old_text = load_snapshot(page_id)
+
+    # First run - initialization
+    if not old_hash:
+        log(f"INIT: {page_id}")
+        saved_hashes[page_id] = new_hash
         saved_versions[page_id] = new_versions
+        save_snapshot(page_id, new_text)
         return None
 
-    changes = compare_versions(old_versions, new_versions, name)
-    saved_versions[page_id] = new_versions
+    # Compare hashes
+    hash_changed = new_hash != old_hash
 
-    if changes:
-        log(f"CHANGES in {page_id}: {changes}")
-        return {
+    # Compare versions for the details
+    version_changes = compare_versions(old_versions, new_versions)
+
+    # Generate a diff if the page changed
+    diff = None
+    if hash_changed and old_text:
+        diff = generate_diff(old_text, new_text)
+
+    # Update the state
+    saved_hashes[page_id] = new_hash
+    saved_versions[page_id] = new_versions
+    save_snapshot(page_id, new_text)
+
+    if hash_changed:
+        if version_changes:
+            log(f"CHANGES in {page_id}: {version_changes}")
+        else:
+            log(f"HASH CHANGED in {page_id} (no version changes detected)")
+            version_changes = ["Pagina s-a modificat (vezi diff)"]
+
+        result = {
             "id": page_id,
             "name": name,
             "url": url,
-            "changes": changes,
-            "current": {
-                "soft_a": new_versions.get('soft_a_date', 'N/A'),
-                "soft_j": new_versions.get('soft_j_date', 'N/A')
-            }
+            "changes": version_changes,
+            "current": format_current_versions(new_versions)
         }
-    else:
-        log(f"OK: {page_id}")
-        return None
+
+        if diff:
+            result["diff"] = diff
+
+        return result
+
+    log(f"OK: {page_id}")
+    return None
 
 def update_dashboard_status(has_changes, changes_count):
     """Update status.json for the dashboard"""
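Note: compare_versions no longer hardcodes the field list; it walks every *_date key (plus 'published') found on either side, so the labeled soft J keys above are picked up automatically. Assuming the function from this patch is in scope (values invented):

    old = {'soft_a_date': '26.09.2025'}
    new = {'soft_a_date': '03.10.2025', 'soft_j_S1002_date': '03.10.2025'}
    print(compare_versions(old, new))
    # ['Soft A: 26.09.2025 → 03.10.2025', 'Soft J S1002: 03.10.2025 (NOU)']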
@@ -188,18 +329,20 @@ def update_dashboard_status(has_changes, changes_count):
         log(f"ERROR updating dashboard status: {e}")
 
 def main():
-    log("=== Starting ANAF monitor v2 ===")
+    log("=== Starting ANAF monitor v2.1 ===")
 
     config = load_json(CONFIG_FILE, {"pages": []})
     saved_versions = load_json(VERSIONS_FILE, {})
+    saved_hashes = load_json(HASHES_FILE, {})
 
     all_changes = []
     for page in config["pages"]:
-        result = check_page(page, saved_versions)
+        result = check_page(page, saved_versions, saved_hashes)
         if result:
             all_changes.append(result)
 
     save_json(VERSIONS_FILE, saved_versions)
+    save_json(HASHES_FILE, saved_hashes)
 
     # Update dashboard status
     update_dashboard_status(len(all_changes) > 0, len(all_changes))
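Note: after a run, the monitor's on-disk state splits across three places (the page id "d394" is illustrative, not taken from config.json):

    # hashes.json    -> {"d394": "<sha256 of last fetched bytes>"}
    # versions.json  -> {"d394": {"soft_a_url": "/static/D394_26092025.pdf", "soft_a_date": "26.09.2025"}}
    # snapshots/d394.txt holds the extracted page text used for the next diff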