#!/usr/bin/env python3 """Fast godišnjak mining: tokenize text, then set-intersect with sportaši names.""" import os import psycopg2, re from collections import defaultdict DB = dict(host='localhost', port=5432, dbname='rinet_v3', user='rinet', password=os.environ["DB_PASSWORD"]) conn = psycopg2.connect(**DB); conn.autocommit = True cu = conn.cursor() cu.execute("SELECT id, godina, sadrzaj FROM pgz_sport.dokumenti WHERE vrsta='godisnjak' AND godina IS NOT NULL ORDER BY godina") godisnjaci = cu.fetchall() print(f"Loaded {len(godisnjaci)} godišnjaka", flush=True) # Build map: lowercase "ime prezime" → sportas_id cu.execute("SELECT id, ime, prezime FROM pgz_sport.clanovi WHERE ime IS NOT NULL AND prezime IS NOT NULL") sportasi = cu.fetchall() name_to_ids = defaultdict(set) for sid, ime, prezime in sportasi: if not ime or not prezime: continue full = f"{ime.strip()} {prezime.strip()}".lower() full2 = f"{prezime.strip()} {ime.strip()}".lower() if len(full) >= 8: name_to_ids[full].add(sid) name_to_ids[full2].add(sid) print(f"Indexed {len(name_to_ids)} name variants for {len(sportasi)} sportaša", flush=True) # Process each godišnjak: build n-gram set then check mentions = defaultdict(set) for did, godina, text in godisnjaci: if not text or len(text) < 5000: continue text_low = text.lower() # Substring search is fastest for this found_names = 0 for name, sids in name_to_ids.items(): if name in text_low: for sid in sids: mentions[sid].add(godina) found_names += 1 print(f" godišnjak {godina}: {found_names} matches", flush=True) print(f"\nTotal sportaša mentioned: {len(mentions)}", flush=True) # Update DB cu.execute("ALTER TABLE pgz_sport.clanovi ADD COLUMN IF NOT EXISTS godisnjak_godine INT[]") cu.execute("ALTER TABLE pgz_sport.clanovi ADD COLUMN IF NOT EXISTS godisnjak_prvi INT") cu.execute("ALTER TABLE pgz_sport.clanovi ADD COLUMN IF NOT EXISTS godisnjak_zadnji INT") updated = 0 for sid, godine in mentions.items(): g = sorted(godine) cu.execute("UPDATE pgz_sport.clanovi SET godisnjak_godine=%s, godisnjak_prvi=%s, godisnjak_zadnji=%s WHERE id=%s", (g, g[0], g[-1], sid)) updated += 1 print(f"\nUpdated {updated} sportaša", flush=True) # Top mentioned top = sorted(mentions.items(), key=lambda x: len(x[1]), reverse=True)[:25] print("\nTOP 25 sportaša po godinama:") for sid, godine in top: cu.execute("SELECT ime, prezime, sport, kategorija_hoo FROM pgz_sport.clanovi WHERE id=%s", (sid,)) r = cu.fetchone() if r: kh = f" KAT-{r[3]}" if r[3] else "" print(f" {len(godine):2}× {r[0]} {r[1]:<28} ({r[2] or '?'}{kh})") conn.close()