#!/usr/bin/env python3
"""Scrape sport pages and local text dumps into ``dabi.knowledge``.

The script fetches two public web pages, strips their HTML, and stores the
resulting text as "facts" in PostgreSQL.  It then does the same for up to 30
local ``.txt`` files under ``/opt/pgz-sport/_downloads``.  Rows are
de-duplicated through the MD5 ``data_hash`` column.  When finished it posts
the number of newly inserted facts to a Telegram chat.
"""
import glob
import hashlib
import os
import re
import time
from datetime import datetime
from typing import Optional

import psycopg2
import requests

# The DB password must be present in the environment (KeyError otherwise).
DSN = f"host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password={os.environ['DB_PASSWORD']}"
LOG = "/opt/pgz-sport/logs/scrape_online.log"
# SECURITY: the literals below are kept only as backward-compatible fallbacks.
# Prefer setting TG_TOKEN / TG_CHAT in the environment; the committed token
# should be rotated and removed from source.
TG_TOKEN = os.environ.get("TG_TOKEN", "8535797835:AAFItT-92jzZ9NWFafLxn0dLa1_n2s-JE5Y")
TG_CHAT = os.environ.get("TG_CHAT", "7969491558")
H = {"User-Agent": "Ri.NET Civic Intelligence Bot 1.0"}

os.makedirs("/opt/pgz-sport/logs", exist_ok=True)


def log(msg: str) -> None:
    """Print *msg* with an HH:MM:SS prefix and append the bare message to LOG."""
    print(f"[{datetime.now().strftime('%H:%M:%S')}] {msg}", flush=True)
    # Only the console line carries the timestamp; the file gets the raw text.
    with open(LOG, "a") as f:
        f.write(msg + "\n")


def get(url: str) -> Optional[str]:
    """Fetch *url* and return the response body, or None on any failure.

    A non-200 status yields None.  Exceptions are logged and swallowed so a
    single unreachable page does not abort the run.
    """
    try:
        r = requests.get(url, headers=H, timeout=20)
        # Fixed pause after every completed request to throttle the scraper.
        time.sleep(1.5)
        return r.text if r.status_code == 200 else None
    except Exception as e:
        log(f"ERR {url}: {e}")
        return None


def clean(html: Optional[str]) -> str:
    """Strip tags from *html*, collapse whitespace, and cap at 1200 chars.

    None is treated as an empty string.  NUL characters are removed from the
    result.
    """
    txt = re.sub(r'<[^>]+>', ' ', html or '')
    txt = re.sub(r'\s+', ' ', txt).strip()[:1200]
    return txt.replace('\x00', '')


conn = psycopg2.connect(DSN)
conn.autocommit = True
cur = conn.cursor()
total = 0  # number of rows actually inserted during this run


def insert(fact: str, source: str) -> bool:
    """Insert *fact* into ``dabi.knowledge`` unless it is too short or known.

    Facts shorter than 50 characters (after NUL removal and stripping) are
    skipped.  The MD5 of the full cleaned fact is used as ``data_hash``; rows
    whose hash already exists are ignored via ``ON CONFLICT``.

    Returns True when a new row was written, False otherwise.  Increments the
    module-level ``total`` on every new row.
    """
    global total
    fact = fact.replace('\x00', '').strip()
    if len(fact) < 50:
        return False
    # The hash is taken over the whole fact, before the 2000-char truncation.
    dh = hashlib.md5(fact.encode()).hexdigest()
    cur.execute(
        "INSERT INTO dabi.knowledge (fact, source, confidence, category, data_hash) VALUES (%s,%s,%s,%s,%s) ON CONFLICT (data_hash) DO NOTHING",
        (fact[:2000], source[:200], 0.80, "pgz_sport_online", dh),
    )
    if cur.rowcount:
        total += 1
        return True
    return False


log("=== ONLINE SCRAPING ===")
for url in ["https://www.pgz.hr/o-zupaniji/sport/", "https://www.rijeka.hr/teme-za-gradane/sport/"]:
    html = get(url)
    if html:
        txt = clean(html)
        if len(txt) > 200:
            # Use insert()'s own result rather than peeking at shared cursor
            # state, so the logged value always belongs to this URL.
            inserted = insert(txt, url)
            log(f" {url}: {len(txt)} chars -> inserted={int(inserted)}")

# NOTE(review): glob() results are not sorted, so which 30 files are taken
# depends on directory listing order -- confirm this is acceptable.
for fpath in glob.glob("/opt/pgz-sport/_downloads/**/*.txt", recursive=True)[:30]:
    if "godisnjak" in fpath.lower():
        continue
    try:
        with open(fpath, errors="replace") as fh:
            txt = fh.read().replace('\x00', '')[:800].strip()
        if len(txt) > 100:
            insert(txt, f"zsp_local:{os.path.basename(fpath)}")
    except Exception as e:
        # Best-effort: one unreadable file or failed insert must not stop the
        # run, but the failure is recorded instead of silently discarded.
        log(f"ERR {fpath}: {e}")

log(f"DONE: {total} new facts")
cur.close()
conn.close()

try:
    requests.post(
        f"https://api.telegram.org/bot{TG_TOKEN}/sendMessage",
        data={"chat_id": TG_CHAT, "text": f"Scrape done: {total} new facts"},
        timeout=10,
    )
except requests.RequestException as e:
    # Log only the exception type: requests error messages can embed the
    # request URL, which contains the bot token.
    log(f"ERR telegram notify failed: {type(e).__name__}")