69 lines
2.7 KiB
Python
Executable File
69 lines
2.7 KiB
Python
Executable File
#!/usr/bin/env python3
"""Fast godišnjak (yearbook) mining: search each yearbook's text for sportaši (athlete) names and record the years in which each one is mentioned."""
|
||
import psycopg2, re
|
||
from collections import defaultdict
|
||
|
||
# NOTE(review): the database password is hard-coded in source. It is kept
# unchanged here so the script keeps working, but it should be moved to an
# environment variable or ~/.pgpass and rotated.
DB = dict(host='localhost', port=5432, dbname='rinet_v3',
          user='rinet', password='R1net2026!SecureDB#v7')

# Name variants shorter than this many characters (the separating space
# included) are not indexed -- presumably because very short names would
# match too often; confirm with the data owner.
MIN_NAME_LEN = 8

# Yearbook texts shorter than this many characters are skipped.
MIN_TEXT_LEN = 5000


def build_name_index(sportasi):
    """Map lowercase name variants to the ids of the athletes carrying them.

    Args:
        sportasi: iterable of ``(id, ime, prezime)`` rows.

    Returns:
        A ``defaultdict(set)`` keyed by both ``"ime prezime"`` and
        ``"prezime ime"`` (lowercased); each value is the set of row ids
        sharing that variant.
    """
    name_to_ids = defaultdict(set)
    for sid, ime, prezime in sportasi:
        # Strip *before* the emptiness check: a whitespace-only first or last
        # name would otherwise yield variants such as " prezime", which match
        # any mention of the surname alone.
        ime = (ime or "").strip()
        prezime = (prezime or "").strip()
        if not ime or not prezime:
            continue
        full = f"{ime} {prezime}".lower()
        if len(full) < MIN_NAME_LEN:
            continue
        name_to_ids[full].add(sid)
        name_to_ids[f"{prezime} {ime}".lower()].add(sid)
    return name_to_ids


def _is_word_char(ch):
    """Return True if *ch* can be part of a word (letter, digit or ``_``)."""
    return ch.isalnum() or ch == "_"


def _contains_whole_name(text, name):
    """Return True if *name* occurs in *text* and is not part of a longer word.

    A plain substring test would accept "ana marić" inside "ivana marić" and
    "ivan horvat" inside "ivan horvatić"; here an occurrence only counts when
    the characters directly before and after it are not word characters.
    """
    if not name:
        return False
    size = len(name)
    start = text.find(name)
    while start != -1:
        end = start + size
        starts_clean = start == 0 or not _is_word_char(text[start - 1])
        ends_clean = end == len(text) or not _is_word_char(text[end])
        if starts_clean and ends_clean:
            return True
        # This occurrence was embedded in a longer word; look for the next.
        start = text.find(name, start + 1)
    return False


def find_mentions(godisnjaci, name_to_ids):
    """Collect, per athlete id, the years of the yearbooks that mention them.

    Args:
        godisnjaci: iterable of ``(id, godina, sadrzaj)`` rows.
        name_to_ids: mapping produced by :func:`build_name_index`.

    Returns:
        A ``defaultdict(set)`` mapping athlete id to the set of ``godina``
        values of the yearbooks whose text contains one of their name
        variants. Yearbooks with missing text or fewer than
        ``MIN_TEXT_LEN`` characters are skipped.
    """
    mentions = defaultdict(set)
    for _did, godina, text in godisnjaci:
        if not text or len(text) < MIN_TEXT_LEN:
            continue
        text_low = text.lower()
        found_names = 0
        for name, sids in name_to_ids.items():
            if _contains_whole_name(text_low, name):
                for sid in sids:
                    mentions[sid].add(godina)
                found_names += 1
        print(f" godišnjak {godina}: {found_names} matches", flush=True)
    return mentions


def store_mentions(cu, mentions):
    """Write the mention years to ``pgz_sport.clanovi``.

    Adds the ``godisnjak_godine``, ``godisnjak_prvi`` and
    ``godisnjak_zadnji`` columns if they do not exist, then stores for each
    athlete the sorted list of years, the first year and the last year.

    Args:
        cu: open database cursor.
        mentions: mapping of athlete id to a non-empty set of years.

    Returns:
        The number of UPDATE statements issued.
    """
    cu.execute("ALTER TABLE pgz_sport.clanovi ADD COLUMN IF NOT EXISTS godisnjak_godine INT[]")
    cu.execute("ALTER TABLE pgz_sport.clanovi ADD COLUMN IF NOT EXISTS godisnjak_prvi INT")
    cu.execute("ALTER TABLE pgz_sport.clanovi ADD COLUMN IF NOT EXISTS godisnjak_zadnji INT")

    updated = 0
    for sid, godine in mentions.items():
        g = sorted(godine)
        cu.execute("UPDATE pgz_sport.clanovi SET godisnjak_godine=%s, godisnjak_prvi=%s, godisnjak_zadnji=%s WHERE id=%s",
                   (g, g[0], g[-1], sid))
        updated += 1
    return updated


def print_top(cu, mentions):
    """Print the 25 athletes mentioned in the largest number of distinct years."""
    top = sorted(mentions.items(), key=lambda x: len(x[1]), reverse=True)[:25]
    print("\nTOP 25 sportaša po godinama:")
    for sid, godine in top:
        cu.execute("SELECT ime, prezime, sport, kategorija_hoo FROM pgz_sport.clanovi WHERE id=%s", (sid,))
        r = cu.fetchone()
        if r:
            kh = f" KAT-{r[3]}" if r[3] else ""
            print(f" {len(godine):2}× {r[0]} {r[1]:<28} ({r[2] or '?'}{kh})")


def main():
    """Load yearbooks and athletes, find mentions, store them and report."""
    conn = psycopg2.connect(**DB)
    conn.autocommit = True
    try:
        cu = conn.cursor()

        cu.execute("SELECT id, godina, sadrzaj FROM pgz_sport.dokumenti WHERE vrsta='godisnjak' AND godina IS NOT NULL ORDER BY godina")
        godisnjaci = cu.fetchall()
        print(f"Loaded {len(godisnjaci)} godišnjaka", flush=True)

        # Build map: lowercase "ime prezime" -> athlete ids
        cu.execute("SELECT id, ime, prezime FROM pgz_sport.clanovi WHERE ime IS NOT NULL AND prezime IS NOT NULL")
        sportasi = cu.fetchall()
        name_to_ids = build_name_index(sportasi)
        print(f"Indexed {len(name_to_ids)} name variants for {len(sportasi)} sportaša", flush=True)

        # Search every yearbook for every indexed name variant
        mentions = find_mentions(godisnjaci, name_to_ids)
        print(f"\nTotal sportaša mentioned: {len(mentions)}", flush=True)

        # Update DB
        updated = store_mentions(cu, mentions)
        print(f"\nUpdated {updated} sportaša", flush=True)

        # Top mentioned
        print_top(cu, mentions)
    finally:
        # Close the connection even when a query or the matching step fails.
        conn.close()


if __name__ == "__main__":
    main()
|