#!/usr/bin/env python3
"""pgz_sport_deep.py — Deep scrape sport-pgz.hr + pgz.hr/sport.

Breadth-first crawl of the sites listed in ROOTS. For every page that yields
enough text the script:

* stores the extracted text in ``pgz_sport.dokumenti`` (vrsta = 'web'),
* records every linked PDF in ``pgz_sport.dokumenti`` (vrsta = 'pdf'),
* stores the first few text chunks of keyword-matching pages in
  ``dabi.knowledge``,
* queues same-site links for later visits.

``harvest()`` needs the ``DB_PASSWORD`` environment variable and psycopg2;
the parsing helpers only need the standard library.
"""
import hashlib
import logging
import os
import re
import sys
import time
import urllib.error
import urllib.request
from collections import deque
from html import unescape
from urllib.parse import urldefrag, urljoin, urlparse

logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s [pgz_deep] %(message)s')
log = logging.getLogger("pgz_deep")

# The password is deliberately NOT part of the DSN string: it is passed to
# psycopg2.connect() as a keyword so that spaces/quotes in it cannot corrupt
# the DSN, and so that importing this module does not require DB_PASSWORD.
DSN = "host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet"
UA = "Mozilla/5.0 (Ri.NET Bot 1.0; contact: dradulic@outlook.com)"
ROOTS = [
    "https://sport-pgz.hr",
    "https://www.pgz.hr/teme/sport/",
    "https://www.pgz.hr/sport/",
    "https://www.pgz.hr/o-zupaniji/upravna-tijela/upravni-odjel-za-kulturu-sport-tehnicku-kulturu/",
]

# Registrable domains the crawler may follow (exact host or any subdomain).
ALLOWED_DOMAINS = ('pgz.hr', 'sport-pgz.hr')

# A page is turned into knowledge facts when its lower-cased text contains
# at least one of these substrings.
SPORT_KEYWORDS = [
    'sport', 'klub', 'savez', 'sportaš', 'sportaši', 'natjecanj',
    'manifestacij', 'javne potrebe', 'sufinancir', 'kup', 'prvenstvo',
    'liga', 'utakm', 'igrač', 'trener', 'olimpij', 'paraolimpij', 'turn',
    'medalj', 'pobjed', 'gradonaceln', 'župan', 'rijeka', 'pgž',
    'primorsko', 'subvenc', 'natječaj', 'odluka', 'proračun', 'rebal',
]

FACT_CHUNK_SIZE = 800    # characters per knowledge chunk
MAX_FACT_CHUNKS = 5      # chunks stored per page
MIN_FACT_CHUNK_LEN = 200  # shorter chunks are skipped

_SCRIPT_RE = re.compile(r'<script[^>]*>.*?</script>', re.S | re.I)
_STYLE_RE = re.compile(r'<style[^>]*>.*?</style>', re.S | re.I)
_TAG_RE = re.compile(r'<[^>]+>')
_WHITESPACE_RE = re.compile(r'\s+')
_TITLE_RE = re.compile(r'<title[^>]*>([^<]+)', re.I)
_HREF_RE = re.compile(r'href=["\']([^"\']+)["\']', re.I)
_PDF_HREF_RE = re.compile(r'href=["\']([^"\']+\.pdf)["\']', re.I)

_INSERT_PAGE_SQL = """
    INSERT INTO pgz_sport.dokumenti
        (url, fname, title, sadrzaj, vrsta, izvor_url, scraped_at, sha1, organizacija)
    VALUES (%s, %s, %s, %s, %s, %s, now(), %s, %s)
    ON CONFLICT DO NOTHING
"""
_INSERT_PDF_SQL = """
    INSERT INTO pgz_sport.dokumenti
        (url, pdf_url, fname, title, vrsta, izvor_url, scraped_at, sha1, organizacija)
    VALUES (%s, %s, %s, %s, %s, %s, now(), %s, %s)
    ON CONFLICT DO NOTHING
"""
_INSERT_FACT_SQL = """
    INSERT INTO dabi.knowledge
        (fact, category, source, source_url, source_date, confidence, data_hash)
    VALUES (%s, 'pgz_sport_official', 'pgz_sport_deep', %s, CURRENT_DATE, 0.85, %s)
    ON CONFLICT (data_hash) DO NOTHING
"""


def fetch(url, retries=3):
    """Download *url* and return ``(text, status)``.

    The body is decoded as UTF-8 with undecodable bytes replaced. On failure
    the result is ``(None, code)`` where ``code`` is the HTTP status of a
    non-retryable client error, or ``0`` when all attempts were exhausted.

    Transient failures are retried up to *retries* times with a linearly
    growing pause (3 s, 6 s, ...). Client errors (4xx) other than 408/429 are
    permanent, so they are not retried.
    """
    for attempt in range(1, retries + 1):
        try:
            req = urllib.request.Request(url, headers={"User-Agent": UA})
            with urllib.request.urlopen(req, timeout=20) as r:
                return r.read().decode('utf-8', errors='replace'), r.status
        except urllib.error.HTTPError as e:
            log.warning("HTTP %s (attempt %d/%d) %s",
                        e.code, attempt, retries, url[:80])
            if 400 <= e.code < 500 and e.code not in (408, 429):
                return None, e.code
        except Exception as e:
            # Network boundary: any failure (DNS, TLS, timeout, malformed
            # URL, ...) must not abort the crawl, but it must be visible.
            log.warning("Fetch failed (attempt %d/%d) %s: %s",
                        attempt, retries, url[:80], e)
        if attempt < retries:  # no point sleeping after the final attempt
            time.sleep(3 * attempt)
    return None, 0


def extract_text(html):
    """Return the visible text of *html* as a single whitespace-normalised line.

    ``<script>`` and ``<style>`` elements are dropped with their content, all
    remaining tags are replaced by spaces and HTML entities are decoded.
    Falsy input yields ``""``.
    """
    if not html:
        return ""
    text = _SCRIPT_RE.sub('', html)
    text = _STYLE_RE.sub('', text)
    text = _TAG_RE.sub(' ', text)
    text = unescape(text)
    return _WHITESPACE_RE.sub(' ', text).strip()


def _absolutize(base, href):
    """Resolve *href* against *base*; return it without fragment, or None.

    Entities in the attribute value are decoded first (``&amp;`` -> ``&``).
    ``None`` is returned for hrefs that urllib cannot parse.
    """
    try:
        return urldefrag(urljoin(base, unescape(href).strip())).url
    except ValueError:
        return None


def _is_allowed_host(host):
    """Return True if *host* is an allowed domain or a subdomain of one."""
    host = host.lower().rstrip('.')
    return any(host == d or host.endswith('.' + d) for d in ALLOWED_DOMAINS)


def find_links(html, base):
    """Return the absolute http(s) links in *html* that stay on allowed sites.

    Links are resolved against *base*, stripped of their ``#fragment`` and
    returned without duplicates in document order, so that callers slicing
    the result get a deterministic selection.
    """
    if not html:
        return []
    found = {}  # dict used as an insertion-ordered set
    for m in _HREF_RE.finditer(html):
        absolute = _absolutize(base, m.group(1))
        if absolute is None:
            continue
        try:
            parts = urlparse(absolute)
        except ValueError:
            continue
        if parts.scheme in ('http', 'https') and _is_allowed_host(parts.hostname or ""):
            found[absolute] = None
    return list(found)


def find_pdf_links(html, base):
    """Return absolute URLs of all ``href`` targets in *html* ending in ``.pdf``.

    The match is case-insensitive and not restricted to the allowed domains.
    Duplicates are removed; document order is kept.
    """
    if not html:
        return []
    found = {}
    for m in _PDF_HREF_RE.finditer(html):
        absolute = _absolutize(base, m.group(1))
        if absolute is not None:
            found[absolute] = None
    return list(found)


def _try_insert(cur, sql, params, what):
    """Run one INSERT and return the number of rows written (0 on failure).

    Inserts are best-effort: a single bad row is logged and skipped rather
    than aborting the whole crawl.
    """
    try:
        cur.execute(sql, params)
    except Exception as e:
        log.warning("%s insert fail: %s", what, e)
        return 0
    return cur.rowcount


def _store_page(cur, url, html, text):
    """Insert the page text as a 'web' document; return rows written."""
    title_m = _TITLE_RE.search(html)
    title = unescape(title_m.group(1)).strip() if title_m else url[:80]
    sha1 = hashlib.sha1(text[:5000].encode()).hexdigest()
    params = (
        url,
        url.split('/')[-1][:100],
        title[:200],
        text[:50000].replace('\x00', ''),  # PostgreSQL text cannot hold NUL
        'web',
        url,
        sha1,
        'PGŽ',
    )
    return _try_insert(cur, _INSERT_PAGE_SQL, params, "Doc")


def _store_pdf_links(cur, url, html):
    """Record every PDF linked from the page; return rows written."""
    written = 0
    for pdf_url in find_pdf_links(html, url):
        fname = pdf_url.split('/')[-1]
        pdf_sha = hashlib.sha1(pdf_url.encode()).hexdigest()
        params = (pdf_url, pdf_url, fname[:100], fname[:200],
                  'pdf', url, pdf_sha, 'PGŽ')
        written += _try_insert(cur, _INSERT_PDF_SQL, params, "PDF")
    return written


def _store_facts(cur, url, text):
    """Store leading text chunks of a keyword-matching page; return rows written."""
    lowered = text.lower()
    if not any(kw in lowered for kw in SPORT_KEYWORDS):
        return 0
    written = 0
    limit = min(len(text), FACT_CHUNK_SIZE * MAX_FACT_CHUNKS)
    for ci, start in enumerate(range(0, limit, FACT_CHUNK_SIZE)):
        chunk = text[start:start + FACT_CHUNK_SIZE]
        if len(chunk) < MIN_FACT_CHUNK_LEN:
            continue
        # Hash inputs are kept exactly as before so that re-runs still
        # de-duplicate against rows already in dabi.knowledge.
        fact_hash = hashlib.sha256(
            (url + str(ci) + chunk[:100]).encode()).hexdigest()
        params = (chunk[:1500].replace('\x00', ''), url, fact_hash)
        written += _try_insert(cur, _INSERT_FACT_SQL, params, "Fact")
    return written


def harvest(max_pages=300, links_per_page=25):
    """Crawl the ROOTS sites breadth-first and persist what is found.

    Args:
        max_pages: stop after this many URLs have been visited.
        links_per_page: at most this many links of each page are queued.

    Raises:
        KeyError: if the DB_PASSWORD environment variable is not set.
        ImportError: if psycopg2 is not installed.
    """
    # Imported here so that the parsing helpers above stay importable on
    # machines without the PostgreSQL driver.
    import psycopg2

    conn = psycopg2.connect(DSN, password=os.environ['DB_PASSWORD'])
    conn.autocommit = True

    visited = set()
    queue = deque(dict.fromkeys(ROOTS))
    enqueued = set(queue)  # everything ever queued: O(1) duplicate check
    docs = 0
    facts = 0
    pdfs_logged = 0

    try:
        with conn.cursor() as cur:
            while queue and len(visited) < max_pages:
                url = queue.popleft()
                if url in visited:
                    continue
                visited.add(url)

                html, status = fetch(url)
                if not html or status != 200:
                    time.sleep(1)
                    continue
                log.info("[%s] %s (%s bytes)", status, url[:80], len(html))

                text = extract_text(html)
                if len(text) < 100:
                    # Nearly empty pages are neither stored nor expanded.
                    continue

                docs += _store_page(cur, url, html, text)
                pdfs_logged += _store_pdf_links(cur, url, html)
                facts += _store_facts(cur, url, text)

                for link in find_links(html, url)[:links_per_page]:
                    if link not in enqueued:
                        enqueued.add(link)
                        queue.append(link)
    finally:
        conn.close()

    log.info("FINAL: visited=%s docs=%s pdfs=%s facts=%s",
             len(visited), docs, pdfs_logged, facts)


if __name__ == "__main__":
    harvest()