Files
pgz-sport/scrape_online.py
T

62 lines
2.2 KiB
Python

#!/usr/bin/env python3
import requests, re, time, hashlib, psycopg2, os, glob
from datetime import datetime
# --- Configuration -----------------------------------------------------------
# NOTE(security): the fallback literals below are live credentials committed to
# source. Rotate them and provide the values through the environment instead
# (PGZ_SPORT_DSN, PGZ_SPORT_TG_TOKEN, PGZ_SPORT_TG_CHAT). The literals are kept
# only so existing deployments keep working until that is done.

# libpq connection string handed to psycopg2.connect().
DSN = os.environ.get(
    "PGZ_SPORT_DSN",
    "host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password=R1net2026!SecureDB#v7",
)
# File that log() appends to.
LOG = "/opt/pgz-sport/logs/scrape_online.log"
# Telegram bot token and chat id used for the end-of-run notification.
TG_TOKEN = os.environ.get("PGZ_SPORT_TG_TOKEN", "8535797835:AAFItT-92jzZ9NWFafLxn0dLa1_n2s-JE5Y")
TG_CHAT = os.environ.get("PGZ_SPORT_TG_CHAT", "7969491558")
# HTTP headers sent with every page fetch.
H = {"User-Agent": "Ri.NET Civic Intelligence Bot 1.0"}
# Create the log directory up front so log() can always open LOG for append.
os.makedirs(os.path.dirname(LOG), exist_ok=True)
def log(msg):
    """Print *msg* with an HH:MM:SS prefix and append it to the LOG file.

    The console line carries the timestamp; the file line is the bare message
    followed by a newline.

    Args:
        msg: The message to record. Non-string values are converted with
            ``str()`` so the file write cannot fail after the console print
            has already succeeded.
    """
    text = str(msg)
    print(f"[{datetime.now().strftime('%H:%M:%S')}] {text}", flush=True)
    # Explicit UTF-8: with the locale default, non-ASCII text (e.g. Croatian
    # diacritics) raises UnicodeEncodeError under a C/POSIX locale.
    with open(LOG, "a", encoding="utf-8") as f:
        f.write(text + "\n")
def get(url):
    """Fetch *url* and return the response body as text, or None on failure.

    Args:
        url: Absolute URL to request with the module's ``H`` headers.

    Returns:
        The decoded response text when the server answers with HTTP 200;
        ``None`` when the request fails or any other status is returned.
        Both failure cases are written to the log.
    """
    try:
        r = requests.get(url, headers=H, timeout=20)
    except requests.RequestException as e:
        log(f"ERR {url}: {e}")
        return None
    # Pause after every completed request so consecutive fetches are spaced out.
    time.sleep(1.5)
    if r.status_code != 200:
        # Previously dropped silently, which made missing pages undiagnosable.
        log(f"ERR {url}: HTTP {r.status_code}")
        return None
    return r.text
def clean(html):
    """Reduce an HTML document to a single line of plain text.

    Comments and ``<script>``/``<style>`` blocks are removed together with
    their contents, remaining tags are replaced by spaces, HTML entities are
    decoded, NUL characters are dropped and whitespace runs are collapsed.

    Args:
        html: Markup to clean. ``None`` or an empty string is accepted.

    Returns:
        The extracted text, at most 1200 characters, without leading or
        trailing whitespace.
    """
    # Local import: the parameter name shadows the stdlib ``html`` module.
    from html import unescape

    txt = (html or '').replace('\x00', '')
    txt = re.sub(r'(?s)<!--.*?-->', ' ', txt)
    # Drop script/style bodies; a plain tag strip would keep their source code.
    txt = re.sub(r'(?is)<(script|style)\b[^>]*>.*?</\1\s*>', ' ', txt)
    txt = re.sub(r'<[^>]+>', ' ', txt)
    # Decode entities before collapsing whitespace so &nbsp; becomes a space.
    txt = unescape(txt)
    txt = re.sub(r'\s+', ' ', txt).strip()
    # The cut may land right after a separator; trim it.
    return txt[:1200].rstrip()
# Number of rows actually added during this run; incremented by insert().
total = 0

# One connection and one cursor are shared by the whole script. Autocommit is
# switched on, so no explicit commit() is issued anywhere.
conn = psycopg2.connect(DSN)
conn.autocommit = True
cur = conn.cursor()
def insert(fact, source):
    """Store one fact in ``dabi.knowledge`` unless it is too short or a duplicate.

    NUL characters are removed and the text is stripped; anything shorter than
    50 characters is ignored. Duplicates are detected through the ``data_hash``
    column (``ON CONFLICT ... DO NOTHING``). The hash is the MD5 of exactly the
    text that is stored (after the 2000-character cut), so the stored fact and
    its hash always correspond.

    Args:
        fact: Text to store.
        source: Origin label for the fact; truncated to 200 characters.

    Returns:
        ``True`` when a new row was inserted (the module-level ``total`` is
        incremented in that case), ``False`` when the fact was skipped or
        already present.
    """
    global total
    stored = fact.replace('\x00', '').strip()
    if len(stored) < 50:
        return False
    stored = stored[:2000]
    dh = hashlib.md5(stored.encode()).hexdigest()
    cur.execute(
        "INSERT INTO dabi.knowledge (fact, source, confidence, category, data_hash) VALUES (%s,%s,%s,%s,%s) ON CONFLICT (data_hash) DO NOTHING",
        (stored, source[:200], 0.80, "pgz_sport_online", dh),
    )
    # rowcount is 0 when the conflict clause suppressed the insert.
    inserted = cur.rowcount > 0
    if inserted:
        total += 1
    return inserted
# --- Script body: scrape the online pages, ingest local text files, report ---
log("=== ONLINE SCRAPING ===")
try:
    # 1) Online sources: fetch each page, reduce it to text, store it.
    for url in (
        "https://www.pgz.hr/o-zupaniji/sport/",
        "https://www.rijeka.hr/teme-za-gradane/sport/",
    ):
        html = get(url)
        if not html:
            continue
        txt = clean(html)
        if len(txt) <= 200:
            # Too little text left after cleaning to be worth storing.
            continue
        before = total
        insert(txt, url)
        # Report the outcome from the counter rather than cur.rowcount, which
        # only reflects whatever statement the cursor executed last.
        log(f" {url}: {len(txt)} chars -> inserted={total - before}")

    # 2) Local text files. Sorting makes the 30-file cap pick the same files on
    #    every run (glob order is otherwise arbitrary). The cap is applied
    #    before the "godisnjak" filter, as before.
    local_files = sorted(
        glob.glob("/opt/pgz-sport/_downloads/**/*.txt", recursive=True)
    )[:30]
    for fpath in local_files:
        if "godisnjak" in fpath.lower():
            continue
        try:
            # Context manager closes the handle; undecodable bytes are replaced.
            with open(fpath, encoding="utf-8", errors="replace") as fh:
                txt = fh.read().replace('\x00', '')[:800].strip()
            if len(txt) > 100:
                insert(txt, f"zsp_local:{os.path.basename(fpath)}")
        except Exception as e:
            # Best effort per file: record the problem and carry on with the
            # next one instead of swallowing it silently.
            log(f"ERR {fpath}: {e}")

    log(f"DONE: {total} new facts")
finally:
    # Release the database resources even if the run aborts part-way.
    cur.close()
    conn.close()

# 3) Notify via Telegram. A failed notification must not turn an otherwise
#    successful run into a crash, so it is only logged.
try:
    resp = requests.post(
        f"https://api.telegram.org/bot{TG_TOKEN}/sendMessage",
        data={"chat_id": TG_CHAT, "text": f"Scrape done: {total} new facts"},
        timeout=10,
    )
    if not resp.ok:
        log(f"ERR telegram: HTTP {resp.status_code}")
except requests.RequestException as e:
    log(f"ERR telegram: {e}")