PGŽ Sport Platform — Round 1+2 baseline (sport2.html + API)
This commit is contained in:
@@ -0,0 +1,61 @@
|
||||
#!/usr/bin/env python3
|
||||
import requests, re, time, hashlib, psycopg2, os, glob
|
||||
from datetime import datetime
|
||||
|
||||
# --- Runtime configuration ---------------------------------------------------
# SECURITY: the database password and the Telegram bot token were previously
# hard-coded in this file and committed to version control. Treat both as
# leaked: rotate them, then supply the new values through the environment.

# libpq connection string. No password is embedded on purpose; libpq reads it
# from the PGPASSWORD environment variable or from ~/.pgpass. Set
# PGZ_SPORT_DSN to override the whole string.
DSN = os.environ.get(
    "PGZ_SPORT_DSN",
    "host=127.0.0.1 port=5432 dbname=rinet_v3 user=rinet",
)

# Plain-text run log appended to by log().
LOG = "/opt/pgz-sport/logs/scrape_online.log"

# Telegram bot credentials for the end-of-run notification. The token is a
# secret and has no default; it is an empty string when PGZ_SPORT_TG_TOKEN
# is not set.
TG_TOKEN = os.environ.get("PGZ_SPORT_TG_TOKEN", "")
TG_CHAT = os.environ.get("PGZ_SPORT_TG_CHAT", "7969491558")

# Headers sent with every scraping request so site operators can identify us.
H = {"User-Agent": "Ri.NET Civic Intelligence Bot 1.0"}

# Make sure the log directory exists before the first log() call.
os.makedirs(os.path.dirname(LOG), exist_ok=True)
|
||||
|
||||
def log(msg):
    """Echo *msg* to stdout with a time prefix and append it to the log file.

    The console line is prefixed with the current local time as
    ``[HH:MM:SS]`` and flushed immediately. The line appended to ``LOG`` is
    the bare message followed by a newline, without the time prefix.

    Args:
        msg: Message text; must be a ``str`` because it is concatenated with
            the newline before being written.
    """
    stamp = datetime.now().strftime('%H:%M:%S')
    print(f"[{stamp}] {msg}", flush=True)
    with open(LOG, "a") as log_file:
        log_file.write(msg + "\n")
|
||||
|
||||
def get(url):
    """Download *url* and return its body text, or ``None`` on any failure.

    The request carries the module-level ``H`` headers and a 20 second
    timeout. After every completed request the function sleeps 1.5 seconds
    before returning, which throttles successive calls.

    Args:
        url: Absolute URL to fetch.

    Returns:
        The response text when the server answers with status 200, otherwise
        ``None``. Both non-200 answers and raised exceptions are logged so a
        missing page can be told apart from a page that was never requested.
    """
    try:
        r = requests.get(url, headers=H, timeout=20)
        time.sleep(1.5)
        if r.status_code != 200:
            # Previously dropped silently; record the status for diagnosis.
            log(f"ERR {url}: HTTP {r.status_code}")
            return None
        return r.text
    except Exception as e:
        # Best-effort scraper: one unreachable site must not abort the run.
        log(f"ERR {url}: {e}")
        return None
|
||||
|
||||
def clean(html):
    """Reduce an HTML document to a short plain-text snippet.

    Steps, in order: replace every ``<...>`` tag with a space, collapse each
    whitespace run to a single space, strip the ends, keep the first 1200
    characters, and finally delete NUL characters. Because NULs are removed
    after truncation, the result can be shorter than 1200 characters even
    for long input.

    Args:
        html: Markup to clean; ``None`` or an empty string yields ``""``.

    Returns:
        The cleaned text, at most 1200 characters long.
    """
    markup = html or ''
    without_tags = re.sub(r'<[^>]+>', ' ', markup)
    collapsed = re.sub(r'\s+', ' ', without_tags)
    snippet = collapsed.strip()[:1200]
    return snippet.replace('\x00', '')
|
||||
|
||||
# Running count of rows actually inserted during this run; insert() bumps it.
total = 0

# One shared connection and cursor for the whole script. Autocommit is on, so
# every statement is committed as soon as it executes.
conn = psycopg2.connect(DSN)
conn.autocommit = True
cur = conn.cursor()
|
||||
|
||||
def insert(fact, source):
    """Store one fact in ``dabi.knowledge`` unless it is too short or a duplicate.

    NUL characters are removed and the text is stripped; anything shorter
    than 50 characters after that is ignored. The MD5 hex digest of the full
    cleaned text is stored as ``data_hash``, and rows whose ``data_hash``
    already exists are skipped by ``ON CONFLICT (data_hash) DO NOTHING``.
    The stored fact is cut to 2000 characters and the source to 200. The
    module-level ``total`` counter is incremented when a row was written.

    Args:
        fact: Raw fact text.
        source: Origin label for the fact (a URL or a local file tag).

    Returns:
        None.
    """
    global total
    cleaned = fact.replace('\x00', '').strip()
    if len(cleaned) < 50:
        return
    digest = hashlib.md5(cleaned.encode()).hexdigest()
    row = (cleaned[:2000], source[:200], 0.80, "pgz_sport_online", digest)
    cur.execute(
        "INSERT INTO dabi.knowledge (fact, source, confidence, category, data_hash) VALUES (%s,%s,%s,%s,%s) ON CONFLICT (data_hash) DO NOTHING",
        row,
    )
    # rowcount is 0 when the conflict clause suppressed the insert.
    if cur.rowcount:
        total += 1
|
||||
|
||||
log("=== ONLINE SCRAPING ===")

# Step 1: public landing pages. Each page contributes at most one fact, made
# of its cleaned text, and only if more than 200 characters remain.
for url in ["https://www.pgz.hr/o-zupaniji/sport/", "https://www.rijeka.hr/teme-za-gradane/sport/"]:
    html = get(url)
    if not html:
        continue
    txt = clean(html)
    if len(txt) > 200:
        insert(txt, url)
        # cur.rowcount still reflects the INSERT that insert() just ran.
        log(f" {url}: {len(txt)} chars -> inserted={cur.rowcount}")

# Step 2: previously downloaded text files. Only the first 30 paths returned
# by glob are considered, and the limit is applied before the "godisnjak"
# filter, so fewer than 30 files may actually be read.
for fpath in glob.glob("/opt/pgz-sport/_downloads/**/*.txt", recursive=True)[:30]:
    if "godisnjak" in fpath.lower():
        continue
    try:
        # Context manager closes the handle even if reading fails.
        with open(fpath, errors="replace") as fh:
            txt = fh.read().replace('\x00', '')[:800].strip()
        if len(txt) > 100:
            insert(txt, f"zsp_local:{os.path.basename(fpath)}")
    except Exception as e:
        # Still best-effort per file, but no longer silent: a bare
        # "except: pass" hid unreadable files and database errors alike.
        log(f"ERR {fpath}: {e}")

log(f"DONE: {total} new facts")

# Step 3: completion notification. A Telegram outage must not turn a finished
# scrape into a failed run, so failures are logged instead of raised.
if TG_TOKEN:
    try:
        resp = requests.post(f"https://api.telegram.org/bot{TG_TOKEN}/sendMessage",
                             data={"chat_id": TG_CHAT, "text": f"Scrape done: {total} new facts"}, timeout=10)
        if resp.status_code != 200:
            log(f"ERR telegram: HTTP {resp.status_code}")
    except requests.RequestException as e:
        # Log only the exception type: the message can contain the request
        # URL, which embeds the bot token.
        log(f"ERR telegram: {type(e).__name__}")
else:
    log("Telegram notification skipped: no bot token configured")

# Release the database resources explicitly now that the run is over.
cur.close()
conn.close()
|
||||
Reference in New Issue
Block a user