#!/usr/bin/env python3
"""
News enrichment: search DuckDuckGo's HTML endpoint (no API key needed) for
biographical news mentions of athletes.

For every athlete that still lacks a usable biography, fetch the first few
search hits, reduce each page to plain text and keep the sentences that
mention the athlete's surname. The text is stored in
``pgz_sport.clanovi.biografija`` and mirrored as a fact in ``dabi.knowledge``.
"""
import http.client
import json
import os
import re
import sys
import time
import urllib.parse
import urllib.request
from html import unescape

DB = dict(
    host="localhost",
    port=5432,
    dbname="rinet_v3",
    user="rinet",
    # SECURITY: prefer supplying the password through RINET_DB_PASSWORD.
    # The literal fallback only keeps existing deployments working and
    # should be removed once the environment variable is configured.
    password=os.environ.get("RINET_DB_PASSWORD", "R1net2026!SecureDB#v7"),
)
UA = "Mozilla/5.0 (X11; Linux x86_64; rv:121.0) Gecko/20100101 Firefox/121.0"

# Hosts that never carry usable article text. Matched against the URL's
# hostname (exact or sub-domain), not against the whole URL.
SKIP_DOMAINS = (
    "facebook.com",
    "instagram.com",
    "twitter.com",
    "youtube.com",
    "youtu.be",
    "x.com",
    "tiktok.com",
)

# A candidate paragraph shorter than this is not worth storing.
MIN_BIO_CHARS = 200

_RESULT_LINK_RE = re.compile(
    r'<a\b[^>]*\bclass="result__a"[^>]*\bhref="([^"]+)"[^>]*>(.*?)</a>',
    re.I | re.S,
)
_SCRIPT_STYLE_RE = re.compile(r'<(script|style)\b[^>]*>.*?</\1\s*>', re.I | re.S)
_COMMENT_RE = re.compile(r'<!--.*?-->', re.S)
_TAG_RE = re.compile(r'<[^>]+>')
_WHITESPACE_RE = re.compile(r'\s+')
_SENTENCE_SPLIT_RE = re.compile(r'(?<=[.!?])\s+')

TARGETS_SQL = """
    SELECT DISTINCT c.id, c.ime, c.prezime, c.sport, c.kategorija_hoo, k.naziv
    FROM pgz_sport.clanovi c
    LEFT JOIN pgz_sport.klubovi k ON k.id = c.klub_id
    WHERE (c.kategorija_hoo IN (1, 2, 3)
           OR c.id IN (SELECT DISTINCT clan_id
                       FROM pgz_sport.clan_nagrada
                       WHERE clan_id IS NOT NULL
                         AND razina_natjecanja IN ('SP','EP','OI','SK')))
      AND (c.biografija IS NULL OR LENGTH(c.biografija) < 200)
    ORDER BY c.kategorija_hoo NULLS LAST
    LIMIT 80"""

UPDATE_BIO_SQL = """
    UPDATE pgz_sport.clanovi
    SET biografija = %s,
        source_url = COALESCE(source_url, %s),
        source_synced_at = now()
    WHERE id = %s"""

INSERT_FACT_SQL = """
    INSERT INTO dabi.knowledge (fact, source, confidence, category, created_at)
    VALUES (%s, %s, %s, %s, now())
    ON CONFLICT DO NOTHING"""


def out(msg):
    """Print *msg* immediately (unbuffered) so progress shows up in piped logs."""
    print(msg, flush=True)


def http_get(url, timeout=12):
    """Fetch *url* and return the response body as text, or ``None`` on failure.

    This is a best-effort fetch: network errors, HTTP error statuses,
    malformed URLs and protocol errors all yield ``None`` rather than raising.
    The body is decoded with the charset declared in the ``Content-Type``
    header, falling back to UTF-8; undecodable bytes are replaced.
    """
    if not url:
        return None
    try:
        req = urllib.request.Request(url, headers={
            "User-Agent": UA,
            "Accept": "text/html,application/xhtml+xml",
            "Accept-Language": "hr,en",
        })
        with urllib.request.urlopen(req, timeout=timeout) as r:
            raw = r.read()
            charset = r.headers.get_content_charset() or "utf-8"
    except (OSError, ValueError, http.client.HTTPException):
        # OSError covers URLError/HTTPError/timeouts/TLS failures; ValueError
        # covers unsupported or malformed URLs.
        return None
    try:
        return raw.decode(charset, errors="replace")
    except LookupError:
        # The server declared a charset Python does not know.
        return raw.decode("utf-8", errors="replace")


def resolve_ddg_link(href):
    """Turn an ``href`` from a DuckDuckGo result into the real target URL.

    DuckDuckGo wraps results as ``//duckduckgo.com/l/?uddg=<encoded>&rut=...``.
    The query string is parsed *before* percent-decoding so that ``&`` inside
    the target URL is not mistaken for a parameter separator.
    """
    href = unescape(href)  # attribute values carry &amp; for '&'
    try:
        query = urllib.parse.urlsplit(href).query
    except ValueError:
        return href
    target = urllib.parse.parse_qs(query).get("uddg")
    if target:
        return target[0]
    if href.startswith("//"):
        # Protocol-relative link: urllib needs an explicit scheme.
        return "https:" + href
    return href


def parse_ddg_results(page, limit=3):
    """Extract up to *limit* ``(url, title)`` pairs from a DuckDuckGo HTML page."""
    results = []
    if not page or limit <= 0:
        return results
    for match in _RESULT_LINK_RE.finditer(page):
        link = resolve_ddg_link(match.group(1))
        # Titles may contain highlight markup such as <b>; drop the tags.
        title = unescape(_TAG_RE.sub("", match.group(2)))
        title = _WHITESPACE_RE.sub(" ", title).strip()
        results.append((link, title))
        if len(results) >= limit:
            break
    return results


def ddg_search(query, limit=3):
    """DuckDuckGo HTML search. Returns a list of ``(url, title)`` tuples.

    Returns an empty list when the search page cannot be fetched.
    """
    q = urllib.parse.quote(query)
    page = http_get(f"https://html.duckduckgo.com/html/?q={q}")
    if not page:
        return []
    return parse_ddg_results(page, limit)


def html_to_text(html):
    """Reduce an HTML document to whitespace-normalised plain text.

    Script and style blocks and HTML comments are removed together with their
    content, remaining tags are replaced by spaces, and character references
    (``&amp;``, ``&nbsp;``, ``&#39;`` ...) are decoded.
    """
    if not html:
        return ""
    h = _SCRIPT_STYLE_RE.sub(" ", html)
    h = _COMMENT_RE.sub(" ", h)
    h = _TAG_RE.sub(" ", h)
    h = unescape(h)
    return _WHITESPACE_RE.sub(" ", h).strip()


def relevant_paragraph(text, ime, prez, sport):
    """Return the first sentences of *text* that mention the athlete's surname.

    Sentences of 50 characters or fewer are ignored. Collection stops once
    more than 800 characters have been gathered, and the result is capped at
    1500 characters. Returns ``""`` when nothing matches or when *text* or
    *prez* is empty.

    ``ime`` and ``sport`` are accepted for interface compatibility. Matching
    is done on the surname alone: a full-name hit always implies a surname
    hit, and the sport keyword has never been part of the filter.
    """
    surname = (prez or "").strip().lower()
    if not text or not surname:
        # An empty surname would match every sentence.
        return ""
    picked = []
    total = 0  # length of " ".join(picked), maintained incrementally
    for sentence in _SENTENCE_SPLIT_RE.split(text):
        if len(sentence) > 50 and surname in sentence.lower():
            total += len(sentence) + (1 if picked else 0)
            picked.append(sentence)
            if total > 800:
                break
    return " ".join(picked)[:1500]


def is_social_link(link):
    """Return True if *link* points at a social-media host listed in SKIP_DOMAINS."""
    try:
        host = urllib.parse.urlsplit(link).hostname or ""
    except ValueError:
        return False
    return any(host == d or host.endswith("." + d) for d in SKIP_DOMAINS)


def find_biography(ime, prez, sport_kw):
    """Search the web for the athlete and return ``(text, url)``.

    Returns ``("", None)`` when no hit yields at least MIN_BIO_CHARS of
    relevant text.
    """
    query = f'"{ime} {prez}" {sport_kw} Rijeka'
    for link, _title in ddg_search(query, limit=3):
        if is_social_link(link):
            continue
        page = http_get(link, timeout=10)
        if not page:
            continue
        para = relevant_paragraph(html_to_text(page), ime, prez, sport_kw)
        if len(para) >= MIN_BIO_CHARS:
            return para, link
        time.sleep(0.3)  # be polite between article fetches
    return "", None


def main():
    """Enrich up to 80 athletes lacking a biography and report progress."""
    # Imported lazily so the text helpers above stay importable (and
    # testable) on machines without the PostgreSQL driver.
    import psycopg2

    conn = psycopg2.connect(**DB)
    try:
        # Each statement commits on its own. If the knowledge INSERT fails,
        # the biography UPDATE before it is already persisted.
        conn.autocommit = True
        with conn.cursor() as cr:
            cr.execute(TARGETS_SQL)
            targets = cr.fetchall()
            out(f"Targets: {len(targets)}")

            enriched = 0
            tried = 0
            for tried, row in enumerate(targets, start=1):
                tid, ime, prez, sport, _kat, klub = row
                full = f"{ime} {prez}"

                bio_text, bio_url = "", None
                if ime and prez:  # NULL names cannot be searched for
                    bio_text, bio_url = find_biography(ime, prez, sport or "sportaš")

                if bio_text:
                    fact = (
                        f"{full} ({sport or '?'}, {klub or 'PGŽ'}) — "
                        f"{bio_text[:600]} (izvor: {bio_url})"
                    )[:2000]
                    try:
                        cr.execute(UPDATE_BIO_SQL, (bio_text, bio_url, tid))
                        cr.execute(
                            INSERT_FACT_SQL,
                            (fact, "online_news", 0.85, "biografija_sportasa"),
                        )
                    except psycopg2.Error as e:
                        out(f" ERR {full}: {e}")
                    else:
                        enriched += 1
                        out(f" ✓ {full} - {len(bio_text)} chars from {bio_url[:80]}")

                # Report on every 15th target, including ones with no hit.
                if tried % 15 == 0:
                    out(f" Progress: tried={tried} enriched={enriched}")
                time.sleep(0.5)  # throttle search requests

            out(f"=== DONE: tried={tried} enriched={enriched} ===")
    finally:
        conn.close()


if __name__ == "__main__":
    main()