#!/usr/bin/env python3
"""Fast godišnjak mining: tokenize text, then set-intersect with sportaši names."""
import os
import psycopg2, re
from collections import defaultdict

DB = dict(host='localhost', port=5432, dbname='rinet_v3',
          user='rinet', password=os.environ["DB_PASSWORD"])
conn = psycopg2.connect(**DB); conn.autocommit = True
cu = conn.cursor()

cu.execute("SELECT id, godina, sadrzaj FROM pgz_sport.dokumenti WHERE vrsta='godisnjak' AND godina IS NOT NULL ORDER BY godina")
godisnjaci = cu.fetchall()
print(f"Loaded {len(godisnjaci)} godišnjaka", flush=True)

# Build map: lowercase "ime prezime" → sportas_id
cu.execute("SELECT id, ime, prezime FROM pgz_sport.clanovi WHERE ime IS NOT NULL AND prezime IS NOT NULL")
sportasi = cu.fetchall()
name_to_ids = defaultdict(set)
for sid, ime, prezime in sportasi:
    if not ime or not prezime: continue
    full = f"{ime.strip()} {prezime.strip()}".lower()
    full2 = f"{prezime.strip()} {ime.strip()}".lower()
    if len(full) >= 8: 
        name_to_ids[full].add(sid)
        name_to_ids[full2].add(sid)
print(f"Indexed {len(name_to_ids)} name variants for {len(sportasi)} sportaša", flush=True)

# Process each godišnjak: build n-gram set then check
mentions = defaultdict(set)
for did, godina, text in godisnjaci:
    if not text or len(text) < 5000: continue
    text_low = text.lower()
    # Substring search is fastest for this
    found_names = 0
    for name, sids in name_to_ids.items():
        if name in text_low:
            for sid in sids:
                mentions[sid].add(godina)
            found_names += 1
    print(f"  godišnjak {godina}: {found_names} matches", flush=True)

print(f"\nTotal sportaša mentioned: {len(mentions)}", flush=True)

# Update DB
cu.execute("ALTER TABLE pgz_sport.clanovi ADD COLUMN IF NOT EXISTS godisnjak_godine INT[]")
cu.execute("ALTER TABLE pgz_sport.clanovi ADD COLUMN IF NOT EXISTS godisnjak_prvi INT")
cu.execute("ALTER TABLE pgz_sport.clanovi ADD COLUMN IF NOT EXISTS godisnjak_zadnji INT")

updated = 0
for sid, godine in mentions.items():
    g = sorted(godine)
    cu.execute("UPDATE pgz_sport.clanovi SET godisnjak_godine=%s, godisnjak_prvi=%s, godisnjak_zadnji=%s WHERE id=%s",
               (g, g[0], g[-1], sid))
    updated += 1

print(f"\nUpdated {updated} sportaša", flush=True)

# Top mentioned
top = sorted(mentions.items(), key=lambda x: len(x[1]), reverse=True)[:25]
print("\nTOP 25 sportaša po godinama:")
for sid, godine in top:
    cu.execute("SELECT ime, prezime, sport, kategorija_hoo FROM pgz_sport.clanovi WHERE id=%s", (sid,))
    r = cu.fetchone()
    if r:
        kh = f" KAT-{r[3]}" if r[3] else ""
        print(f"  {len(godine):2}× {r[0]} {r[1]:<28} ({r[2] or '?'}{kh})")

conn.close()