#!/usr/bin/env python3
"""D PGŽ news enrichment: scrape news-site search pages directly.

The original header names novilist.hr, glasistre.hr and sportske.jutarnji.hr
as sources (queried directly, not via DDG, which is blocked). Only the
novilist.hr search is implemented in this file.

For each selected athlete the script searches novilist.hr, extracts the
sentences that mention the athlete's surname from the first suitable article,
and stores them as the athlete's biography plus a knowledge-base fact.
"""
import html
import json
import re
import sys
import time
import urllib.parse
import urllib.request

# NOTE(review): database credentials are hard-coded in source. Consider
# loading them from the environment or a secrets store; the values are left
# unchanged here so existing deployments keep working.
DB = dict(
    host="localhost",
    port=5432,
    dbname="rinet_v3",
    user="rinet",
    password="R1net2026!SecureDB#v7",
)
UA = "Mozilla/5.0 (compatible; PGZBot/1.0)"

# Patterns are compiled once because html_to_text runs for every fetched page.
_SCRIPT_STYLE_RE = re.compile(r"<(script|style)\b.*?</\1\s*>", re.S | re.I)
_TAG_RE = re.compile(r"<[^>]+>")
_WHITESPACE_RE = re.compile(r"\s+")
_SENTENCE_SPLIT_RE = re.compile(r"(?<=[.!?])\s+")

# URL fragments marking listing/asset pages rather than articles.
_SKIP_URL_PARTS = ("autor", "kategorija", "tag", "wp-content", "feed", "page=")
_MAX_ARTICLES_PER_TARGET = 3
_MIN_BIO_CHARS = 200

_TARGETS_SQL = """SELECT DISTINCT c.id, c.ime, c.prezime, c.sport, c.kategorija_hoo, k.naziv
FROM pgz_sport.clanovi c
LEFT JOIN pgz_sport.klubovi k ON k.id = c.klub_id
WHERE (c.kategorija_hoo IN (1, 2, 3)
       OR c.id IN (SELECT DISTINCT clan_id FROM pgz_sport.clan_nagrada
                   WHERE clan_id IS NOT NULL
                     AND razina_natjecanja IN ('SP','EP','OI','SK')))
  AND (c.biografija IS NULL OR LENGTH(c.biografija) < 200)
ORDER BY c.kategorija_hoo NULLS LAST
LIMIT 50"""

_UPDATE_BIO_SQL = """UPDATE pgz_sport.clanovi
SET biografija = %s,
    source_url = COALESCE(source_url, %s),
    source_synced_at = now()
WHERE id = %s"""

_INSERT_FACT_SQL = """INSERT INTO dabi.knowledge (fact, source, confidence, category, created_at)
VALUES (%s, %s, %s, %s, now())
ON CONFLICT DO NOTHING"""


def out(msg):
    """Print *msg* and flush immediately so progress shows up in piped logs."""
    print(msg, flush=True)


def http_get(url, timeout=12):
    """Fetch *url* and return the body decoded as UTF-8.

    Returns None on any failure (bad URL, network error, HTTP error status,
    timeout). Fetching is deliberately best-effort: one unreachable page must
    not abort the whole run.
    """
    try:
        req = urllib.request.Request(
            url,
            headers={"User-Agent": UA, "Accept-Language": "hr,en"},
        )
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            return resp.read().decode("utf-8", errors="replace")
    except Exception:
        return None


def html_to_text(h):
    """Reduce an HTML document to whitespace-normalised plain text.

    Script and style elements are removed together with their contents, the
    remaining tags are replaced by spaces, and character references are
    decoded (so ``&cacute;`` becomes ``ć`` instead of being dropped, which
    matters when matching surnames against the text).
    """
    h = _SCRIPT_STYLE_RE.sub(" ", h)
    h = _TAG_RE.sub(" ", h)
    # Decode after stripping tags so an escaped "&lt;b&gt;" is kept as text.
    h = html.unescape(h)
    return _WHITESPACE_RE.sub(" ", h).strip()


def relevant_paragraph(text, ime, prez, sport):
    """Collect the sentences of *text* that mention the surname *prez*.

    Only sentences longer than 60 characters count. Collection stops once the
    joined result exceeds 700 characters, and the result is capped at 1300
    characters. *ime* and *sport* are accepted for interface compatibility
    but do not affect the result: a sentence containing the full name
    necessarily contains the surname.

    Returns an empty string when *text* or *prez* is empty or None, since an
    empty surname would otherwise match every sentence.
    """
    if not text or not prez:
        return ""
    surname = prez.lower()
    picked = []
    for sentence in _SENTENCE_SPLIT_RE.split(text):
        if len(sentence) > 60 and surname in sentence.lower():
            picked.append(sentence)
            if len(" ".join(picked)) > 700:
                break
    return " ".join(picked)[:1300]


def search_novilist(query):
    """Search novilist.hr for *query* via its direct search URL.

    Returns up to five distinct absolute novilist.hr links in page order, or
    an empty list when the search page could not be fetched.
    """
    enc = urllib.parse.quote(query)
    page = http_get(f"https://www.novilist.hr/?s={enc}")
    if not page:
        return []
    links = re.findall(r'href="(https://www\.novilist\.hr/[^"]+)"', page)
    # The same link can appear more than once on the page; keep the first of each.
    return list(dict.fromkeys(links))[:5]


def _find_bio(ime, prez, sport_kw):
    """Return ``(bio_text, url)`` from the first suitable article, else ``("", None)``."""
    if not prez:
        # No surname to match against; searching would be wasted requests.
        return "", None
    urls = search_novilist(f"{ime} {prez} {sport_kw}")
    time.sleep(0.4)
    # Filter before capping so skip-listed links do not use up the attempts.
    candidates = [
        u for u in urls if not any(part in u for part in _SKIP_URL_PARTS)
    ]
    for url in candidates[:_MAX_ARTICLES_PER_TARGET]:
        page = http_get(url, timeout=10)
        if not page:
            continue
        para = relevant_paragraph(html_to_text(page), ime, prez, sport_kw)
        if len(para) >= _MIN_BIO_CHARS:
            return para, url
        time.sleep(0.3)
    return "", None


def main():
    """Enrich up to 50 athletes that lack a biography with novilist.hr text.

    Selects athletes using the query in ``_TARGETS_SQL``, looks each one up on
    novilist.hr, and on success updates ``pgz_sport.clanovi`` and inserts a
    fact into ``dabi.knowledge``. Database errors for a single athlete are
    reported and skipped. The connection is always closed.
    """
    # Third-party driver, imported here (its only user) so the pure helpers
    # above stay importable on machines without psycopg2.
    import psycopg2

    conn = psycopg2.connect(**DB)
    try:
        conn.autocommit = True
        cr = conn.cursor()
        cr.execute(_TARGETS_SQL)
        targets = cr.fetchall()
        out(f"Targets: {len(targets)}")

        enriched = 0
        for tried, (tid, ime, prez, sport, _kat, klub) in enumerate(targets, start=1):
            full = f"{ime} {prez}"
            sport_kw = sport or "sportaš"

            # Try Novi list (Rijeka regional daily).
            bio_text, bio_url = _find_bio(ime, prez, sport_kw)

            if bio_text:
                try:
                    cr.execute(_UPDATE_BIO_SQL, (bio_text, bio_url, tid))
                    cr.execute(
                        _INSERT_FACT_SQL,
                        (
                            f"{full} ({sport or '?'}, {klub or 'PGŽ'}) — {bio_text[:600]} (izvor: novilist.hr)"[:2000],
                            "novilist",
                            0.85,
                            "biografija_sportasa",
                        ),
                    )
                    enriched += 1
                    out(f" ✓ {full} - {len(bio_text)} chars from {bio_url[:80]}")
                except Exception as e:
                    # Best-effort: report and move on to the next athlete.
                    out(f" ERR {full}: {e}")

            # Report progress for every tenth target, enriched or not.
            if tried % 10 == 0:
                out(f" Progress: tried={tried} enriched={enriched}")
            time.sleep(0.4 if bio_text else 0.3)

        out(f"=== DONE: tried={len(targets)} enriched={enriched} ===")
    finally:
        conn.close()


if __name__ == "__main__":
    main()