feat: /api/v2/analiza/* endpoints - sport analytics backend
This commit is contained in:
@@ -0,0 +1,139 @@
|
||||
#!/usr/bin/env python3
|
||||
# pgz_sport_deep.py — Deep scrape sport-pgz.hr + pgz.hr/sport
|
||||
import os, sys, time, hashlib, logging, re
|
||||
from urllib.parse import urljoin, urlparse
|
||||
import urllib.request
|
||||
import psycopg2
|
||||
from html import unescape
|
||||
|
||||
logging.basicConfig(level=logging.INFO, format='%(asctime)s [pgz_deep] %(message)s')
log = logging.getLogger("pgz_deep")


def _quote_conninfo(value):
    """Quote *value* for a libpq ``keyword=value`` connection string.

    libpq requires values that are empty or contain spaces to be wrapped in
    single quotes, with embedded single quotes and backslashes escaped by a
    backslash.  Quoting unconditionally is always valid.
    """
    escaped = value.replace("\\", "\\\\").replace("'", "\\'")
    return f"'{escaped}'"


# Connection string used by harvest().  The password is read from the
# DB_PASSWORD environment variable and quoted so that spaces, quotes or
# backslashes in it cannot corrupt the string.  When the variable is unset the
# password keyword is omitted, so merely importing this module does not
# require the secret.
_db_password = os.environ.get("DB_PASSWORD")
DSN = "host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet"
if _db_password is not None:
    DSN += f" password={_quote_conninfo(_db_password)}"

# User-Agent header sent with every request.
UA = "Mozilla/5.0 (Ri.NET Bot 1.0; contact: dradulic@outlook.com)"

# Seed URLs the crawl starts from.
ROOTS = [
    "https://sport-pgz.hr",
    "https://www.pgz.hr/teme/sport/",
    "https://www.pgz.hr/sport/",
    "https://www.pgz.hr/o-zupaniji/upravna-tijela/upravni-odjel-za-kulturu-sport-tehnicku-kulturu/",
]
|
||||
|
||||
def fetch(url, retries=3):
    """Download *url* and return ``(body, status)``.

    The request carries the crawler's User-Agent and uses a 20 second
    timeout.  The body is decoded as UTF-8 with undecodable bytes replaced.

    Every failed attempt is logged.  Failures are retried up to *retries*
    times in total with a growing pause between attempts (3 s, 6 s, ...);
    no pause follows the last attempt.  HTTP 4xx responses other than 408
    and 429 are treated as permanent and are not retried.

    Returns ``(None, 0)`` when no attempt succeeds (or *retries* is < 1).
    """
    for attempt in range(1, retries + 1):
        try:
            req = urllib.request.Request(url, headers={"User-Agent": UA})
            with urllib.request.urlopen(req, timeout=20) as r:
                return r.read().decode('utf-8', errors='replace'), r.status
        except urllib.error.HTTPError as e:
            log.warning("fetch failed (%d/%d) %s: HTTP %s", attempt, retries, url, e.code)
            # A missing or forbidden page will not appear on a retry.
            if 400 <= e.code < 500 and e.code not in (408, 429):
                break
        except Exception as e:
            # Crawl boundary: one bad URL must not abort the whole run.
            log.warning("fetch failed (%d/%d) %s: %s", attempt, retries, url, e)
        if attempt < retries:
            time.sleep(3 * attempt)
    return None, 0
|
||||
|
||||
def extract_text(html):
    """Return the text content of *html* as a single whitespace-normalised line.

    ``<script>`` and ``<style>`` elements and HTML comments are dropped
    entirely, every remaining tag is replaced by a space, character
    references are decoded, and runs of whitespace collapse to one space.

    Returns ``""`` for empty or ``None`` input.
    """
    if not html:
        return ""
    # Closing tags may legally carry whitespace before '>' ("</script >").
    text = re.sub(r'<script[^>]*>.*?</script\s*>', '', html, flags=re.S | re.I)
    text = re.sub(r'<style[^>]*>.*?</style\s*>', '', text, flags=re.S | re.I)
    # Comments must go before generic tag stripping: a '>' inside a comment
    # would otherwise end the "tag" early and leak the rest into the text.
    text = re.sub(r'<!--.*?-->', '', text, flags=re.S)
    text = re.sub(r'<[^>]+>', ' ', text)
    text = unescape(text)
    return re.sub(r'\s+', ' ', text).strip()
|
||||
|
||||
def find_links(html, base):
    """Return the in-scope links found in *html*, resolved against *base*.

    A link is in scope when it is http(s) and its host is ``pgz.hr`` or
    ``sport-pgz.hr`` or a subdomain of either.  Hosts are compared by exact
    name or dot-suffix, so look-alikes such as ``evilpgz.hr`` or
    ``pgz.hr.example.com`` are rejected.

    Character references in the href are decoded and fragments are removed,
    so ``page#a`` and ``page#b`` count as one URL.  The result has no
    duplicates and keeps first-seen order.  Malformed URLs are skipped.

    Returns ``[]`` for empty or ``None`` input.
    """
    if not html:
        return []
    allowed = ('pgz.hr', 'sport-pgz.hr')
    found = {}  # dict used as an insertion-ordered set
    for m in re.finditer(r'href=["\']([^"\']+)["\']', html, re.I):
        try:
            parts = urlparse(urljoin(base, unescape(m.group(1)).strip()))
            host = parts.hostname or ""
        except ValueError:
            # e.g. a broken IPv6 literal; not worth aborting the crawl for.
            continue
        if parts.scheme not in ('http', 'https'):
            continue
        if any(host == d or host.endswith('.' + d) for d in allowed):
            found[parts._replace(fragment="").geturl()] = None
    return list(found)
|
||||
|
||||
def find_pdf_links(html, base):
    """Return the PDF links found in *html*, resolved against *base*.

    A link counts as a PDF when its path ends in ``.pdf`` (any case), even
    if a query string or fragment follows, or when the whole URL without
    its fragment ends in ``.pdf``.  Character references in the href are
    decoded and fragments are removed.  The result has no duplicates and
    keeps first-seen order.  Malformed URLs are skipped.

    Returns ``[]`` for empty or ``None`` input.
    """
    if not html:
        return []
    found = {}  # dict used as an insertion-ordered set
    for m in re.finditer(r'href=["\']([^"\']+)["\']', html, re.I):
        try:
            parts = urlparse(urljoin(base, unescape(m.group(1)).strip()))
        except ValueError:
            continue
        target = parts._replace(fragment="")
        resolved = target.geturl()
        if target.path.lower().endswith('.pdf') or resolved.lower().endswith('.pdf'):
            found[resolved] = None
    return list(found)
|
||||
|
||||
# Substrings (matched against lower-cased page text) that mark a page as
# worth saving into dabi.knowledge.
_FACT_KEYWORDS = (
    'sport', 'klub', 'savez', 'sportaš', 'sportaši', 'natjecanj',
    'manifestacij', 'javne potrebe', 'sufinancir', 'kup', 'prvenstvo',
    'liga', 'utakm', 'igrač', 'trener', 'olimpij', 'paraolimpij', 'turn',
    'medalj', 'pobjed', 'gradonaceln', 'župan', 'rijeka', 'pgž',
    'primorsko', 'subvenc', 'natječaj', 'odluka', 'proračun', 'rebal',
)


def _insert(cur, what, sql, params):
    """Run one INSERT and return its row count; log and return 0 on failure."""
    try:
        cur.execute(sql, params)
        return cur.rowcount
    except (psycopg2.Error, ValueError) as e:
        # ValueError covers parameters the driver refuses to adapt.
        log.warning(f"{what} insert fail: {e}")
        return 0


def _store_page(cur, url, title, text):
    """Insert one crawled page into pgz_sport.dokumenti; return rows added."""
    sha1 = hashlib.sha1(text[:5000].encode()).hexdigest()
    return _insert(cur, "Doc", """
        INSERT INTO pgz_sport.dokumenti
            (url, fname, title, sadrzaj, vrsta, izvor_url, scraped_at, sha1, organizacija)
        VALUES (%s, %s, %s, %s, %s, %s, now(), %s, %s)
        ON CONFLICT DO NOTHING
    """, (
        url,
        url.split('/')[-1][:100],
        title[:200].replace('\x00', ''),
        # NUL bytes are stripped here just as they are for knowledge facts.
        text[:50000].replace('\x00', ''),
        'web',
        url,
        sha1,
        'PGŽ',
    ))


def _store_pdf_links(cur, page_url, pdf_urls):
    """Record each PDF URL found on *page_url* (the PDF itself is not fetched)."""
    inserted = 0
    for pdf_url in pdf_urls:
        fname = pdf_url.split('/')[-1]
        pdf_sha = hashlib.sha1(pdf_url.encode()).hexdigest()
        inserted += _insert(cur, "PDF", """
            INSERT INTO pgz_sport.dokumenti
                (url, pdf_url, fname, title, vrsta, izvor_url, scraped_at, sha1, organizacija)
            VALUES (%s, %s, %s, %s, %s, %s, now(), %s, %s)
            ON CONFLICT DO NOTHING
        """, (pdf_url, pdf_url, fname[:100], fname[:200], 'pdf', page_url, pdf_sha, 'PGŽ'))
    return inserted


def _store_facts(cur, url, text):
    """Save leading chunks of a keyword-matching page into dabi.knowledge."""
    lowered = text.lower()  # computed once, not once per keyword
    if not any(kw in lowered for kw in _FACT_KEYWORDS):
        return 0
    inserted = 0
    # 800-character chunks from the first 5000 characters; only the first
    # five chunks are used and chunks shorter than 200 characters are skipped.
    chunks = [text[i:i + 800] for i in range(0, min(len(text), 5000), 800)]
    for ci, chunk in enumerate(chunks[:5]):
        if len(chunk) < 200:
            continue
        fact_hash = hashlib.sha256((url + str(ci) + chunk[:100]).encode()).hexdigest()
        inserted += _insert(cur, "Fact", """
            INSERT INTO dabi.knowledge
                (fact, category, source, source_url, source_date, confidence, data_hash)
            VALUES (%s, 'pgz_sport_official', 'pgz_sport_deep', %s, CURRENT_DATE, 0.85, %s)
            ON CONFLICT (data_hash) DO NOTHING
        """, (chunk[:1500].replace('\x00', ''), url, fact_hash))
    return inserted


def harvest(max_pages=300, max_links_per_page=25):
    """Crawl breadth-first from ROOTS and store what is found in the database.

    For every page fetched with status 200 and at least 100 characters of
    extracted text, this stores the page in ``pgz_sport.dokumenti``, records
    the PDF links it contains, saves keyword-matching text chunks into
    ``dabi.knowledge``, and queues the page's in-scope links.

    Args:
        max_pages: stop once this many URLs have been visited (fetch
            failures count as visits).
        max_links_per_page: how many of a page's links are considered for
            queueing.

    A failed INSERT is logged and skipped.  The database connection is
    closed even when the crawl raises.
    """
    from collections import deque  # local import keeps this block self-contained

    conn = psycopg2.connect(DSN)
    try:
        # With autocommit each INSERT is committed on its own, so one failed
        # statement does not leave an aborted transaction behind.
        conn.autocommit = True
        with conn.cursor() as cur:
            visited = set()
            queue = deque(ROOTS)
            enqueued = set(ROOTS)  # every URL ever queued, for O(1) dedup
            docs = 0
            facts = 0
            pdfs_logged = 0

            while queue and len(visited) < max_pages:
                url = queue.popleft()
                if url in visited:
                    continue
                visited.add(url)

                html, status = fetch(url)
                if not html or status != 200:
                    time.sleep(1)
                    continue

                log.info(f"[{status}] {url[:80]} ({len(html)} bytes)")
                text = extract_text(html)
                if len(text) < 100:
                    continue

                title_m = re.search(r'<title[^>]*>([^<]+)</title>', html, re.I)
                # Decode entities so "&amp;" is not stored literally.
                title = unescape(title_m.group(1)).strip() if title_m else url[:80]

                docs += _store_page(cur, url, title, text)
                pdfs_logged += _store_pdf_links(cur, url, find_pdf_links(html, url))
                facts += _store_facts(cur, url, text)

                for link in find_links(html, url)[:max_links_per_page]:
                    if link not in enqueued:
                        enqueued.add(link)
                        queue.append(link)

            log.info(f"FINAL: visited={len(visited)} docs={docs} pdfs={pdfs_logged} facts={facts}")
    finally:
        conn.close()
|
||||
|
||||
# Entry point: start the crawl only when this file is run as a script, not
# when it is imported as a module.
if __name__ == "__main__":
    harvest()
|
||||
Reference in New Issue
Block a user