65 lines
2.3 KiB
Python
65 lines
2.3 KiB
Python
#!/usr/bin/env python3
|
|
from dotenv import load_dotenv
|
|
load_dotenv('/opt/rinet-gpu/.env.master')
|
|
# auto-added by patch_scrapers_with_dotenv.sh
|
|
import requests, re, time, hashlib, psycopg2, os, glob
|
|
from datetime import datetime
|
|
|
|
# Connection string handed to psycopg2.connect(); the password is taken from
# the DB_PASSWORD environment variable (the .env file is loaded at the top of
# this script) and a missing variable raises KeyError at start-up.
DSN = f"host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password={os.environ['DB_PASSWORD']}"

# File that log() appends to.
LOG = "/opt/pgz-sport/logs/scrape_online.log"

# SECURITY: the Telegram bot token used to be a literal in this file. It is now
# read from the TG_TOKEN environment variable, like DB_PASSWORD above. The
# previously committed token must be treated as leaked and revoked.
# An unset variable yields "" so the scrape itself still runs; only the final
# Telegram notification will not be delivered.
TG_TOKEN = os.environ.get("TG_TOKEN", "")

# Chat that receives the end-of-run notification.
TG_CHAT = "7969491558"

# Headers sent with every outgoing page request.
H = {"User-Agent": "Ri.NET Civic Intelligence Bot 1.0"}

# Make sure the directory holding LOG exists before the first log() call.
os.makedirs("/opt/pgz-sport/logs", exist_ok=True)
|
|
|
|
def log(msg):
    """Print a timestamped message and append the same line to the log file.

    Args:
        msg: Text to record.

    The identical ``[HH:MM:SS] msg`` line goes to stdout and to ``LOG``, so
    entries in the append-only file can be placed in time. The file is opened
    as UTF-8 explicitly so non-ASCII text does not depend on the locale.
    """
    line = f"[{datetime.now().strftime('%H:%M:%S')}] {msg}"
    print(line, flush=True)
    with open(LOG, "a", encoding="utf-8") as f:
        f.write(line + "\n")
|
|
|
|
def get(url):
    """Fetch *url* and return the response body as text, or None on failure.

    Args:
        url: Address to request with the shared ``H`` headers.

    Returns:
        The response text when the server answers with status 200, otherwise
        None. Both transport errors and non-200 answers are written to the
        log so a skipped page can be diagnosed afterwards.
    """
    try:
        r = requests.get(url, headers=H, timeout=20)
        # Pause after every completed request to throttle the scraper.
        time.sleep(1.5)
        if r.status_code != 200:
            log(f"ERR {url}: HTTP {r.status_code}")
            return None
        return r.text
    except Exception as e:
        # Deliberate best-effort boundary: one failing page must not abort
        # the whole run, so every error is logged and reported as None.
        log(f"ERR {url}: {e}")
        return None
|
|
|
|
def clean(html):
    """Reduce an HTML document to a short plain-text excerpt.

    Args:
        html: Markup to convert; None or "" is treated as an empty document.

    Returns:
        At most 1200 characters of text with script/style bodies, comments
        and tags removed, character references decoded, NUL characters
        dropped and every whitespace run collapsed to a single space.
    """
    # Imported locally because the parameter name shadows the ``html`` module.
    from html import unescape

    txt = html or ''
    # Script/style bodies and comments are not page text; remove them whole,
    # otherwise their contents survive the tag stripping below.
    txt = re.sub(r'(?is)<(script|style)\b[^>]*>.*?</\1\s*>', ' ', txt)
    txt = re.sub(r'(?s)<!--.*?-->', ' ', txt)
    txt = re.sub(r'<[^>]+>', ' ', txt)
    # Decode entities (&amp;, &nbsp;, ...) and drop NULs before truncating so
    # the 1200-character budget is spent on real text.
    txt = unescape(txt).replace('\x00', '')
    txt = re.sub(r'\s+', ' ', txt).strip()
    return txt[:1200]
|
|
|
|
# Running count of rows actually added during this run; updated by insert().
total = 0

# One shared connection and cursor for the whole script. Autocommit is
# switched on, so no explicit commit is issued anywhere below.
conn = psycopg2.connect(DSN)
conn.autocommit = True
cur = conn.cursor()
|
|
|
|
def insert(fact, source):
    """Store one fact in dabi.knowledge unless it is too short or already known.

    Args:
        fact: Text to store; NUL characters and surrounding whitespace are
            removed first, and anything under 50 characters is ignored.
        source: Origin label, stored truncated to 200 characters.

    The MD5 of the cleaned text is used as ``data_hash``; rows whose hash
    already exists are skipped by the ON CONFLICT clause. The module-level
    ``total`` counter is incremented only when a row was really inserted.
    """
    global total

    cleaned = fact.replace('\x00', '').strip()
    if len(cleaned) < 50:
        return

    digest = hashlib.md5(cleaned.encode()).hexdigest()
    params = (cleaned[:2000], source[:200], 0.80, "pgz_sport_online", digest)
    cur.execute(
        "INSERT INTO dabi.knowledge (fact, source, confidence, category, data_hash) VALUES (%s,%s,%s,%s,%s) ON CONFLICT (data_hash) DO NOTHING",
        params,
    )
    # rowcount is 0 when the conflict clause skipped the row.
    if cur.rowcount:
        total += 1
|
|
|
|
# Pass 1: fetch the fixed list of public pages and store their cleaned text.
log("=== ONLINE SCRAPING ===")
for url in ("https://www.pgz.hr/o-zupaniji/sport/", "https://www.rijeka.hr/teme-za-gradane/sport/"):
    page = get(url)
    if not page:
        # Fetch failed or returned nothing usable.
        continue
    text = clean(page)
    if len(text) <= 200:
        # Too little text to be worth storing.
        continue
    insert(text, url)
    log(f" {url}: {len(text)} chars -> inserted={cur.rowcount}")
|
|
|
|
# Pass 2: ingest previously downloaded text files.
# Only the first 30 glob matches are considered, and the "godisnjak" filter is
# applied after that cut, exactly as before.
for fpath in glob.glob("/opt/pgz-sport/_downloads/**/*.txt", recursive=True)[:30]:
    if "godisnjak" in fpath.lower():
        continue
    try:
        # The context manager closes the handle even when reading fails.
        with open(fpath, errors="replace") as fh:
            txt = fh.read().replace('\x00', '')[:800].strip()
        if len(txt) > 100:
            insert(txt, f"zsp_local:{os.path.basename(fpath)}")
    except Exception as e:
        # Still best-effort (one bad file must not stop the run), but the
        # failure is now recorded and Ctrl-C / SystemExit are no longer eaten.
        log(f"ERR {fpath}: {e}")
|
|
|
|
log(f"DONE: {total} new facts")

# Best-effort completion notice via the Telegram Bot API. The scrape has
# already finished, so a notification problem is logged instead of ending the
# run with a traceback.
if TG_TOKEN:
    try:
        resp = requests.post(
            f"https://api.telegram.org/bot{TG_TOKEN}/sendMessage",
            data={"chat_id": TG_CHAT, "text": f"Scrape done: {total} new facts"},
            timeout=10,
        )
        if resp.status_code != 200:
            log(f"ERR telegram: HTTP {resp.status_code}")
    except requests.RequestException as e:
        # Log only the exception type: requests error messages can include
        # the request URL, and that URL embeds the bot token.
        log(f"ERR telegram: {type(e).__name__}")
else:
    log("ERR telegram: no bot token configured, notification skipped")
|