PGŽ Sport Platform — Round 1+2 baseline (sport2.html + API)

This commit is contained in:
Damir Radulić
2026-05-04 23:39:08 +02:00
commit a7ec0a86be
1820 changed files with 694455 additions and 0 deletions
+68
View File
@@ -0,0 +1,68 @@
#!/usr/bin/env python3
"""Fast godišnjak mining: tokenize text, then set-intersect with sportaši names."""
import os
import re
from collections import defaultdict

import psycopg2
# Full names ("first last") shorter than this many characters are not indexed
# (presumably because they are too ambiguous to match reliably).
MIN_NAME_LEN = 8
# Yearbooks whose text is shorter than this many characters are skipped.
MIN_TEXT_LEN = 5000

# Connection settings for psycopg2.connect().
# NOTE(review): the fallback password is a credential committed to the
# repository. Provide RINET_DB_PASSWORD in the environment, rotate the
# password, and then remove the fallback.
DB = dict(host='localhost', port=5432, dbname='rinet_v3',
          user='rinet',
          password=os.environ.get('RINET_DB_PASSWORD', 'R1net2026!SecureDB#v7'))


def normalize_text(text):
    """Return *text* lower-cased with every whitespace run collapsed to one space.

    Collapsing whitespace lets a name match even when the source text breaks
    the line (or uses several spaces) between first and last name.
    """
    return " ".join(text.lower().split())


def contains_name(text, name):
    """Return True if *name* occurs in *text* as a whole phrase.

    An occurrence only counts when the characters immediately before and
    after it are not alphanumeric, so "ana marić" is not found inside
    "ivana marić" and "kovač ivan" is not found inside "kovač ivana".
    Both arguments are expected to be normalized already (see normalize_text).
    """
    start = text.find(name)
    while start != -1:
        end = start + len(name)
        before_ok = start == 0 or not text[start - 1].isalnum()
        after_ok = end == len(text) or not text[end].isalnum()
        if before_ok and after_ok:
            return True
        # Rejected occurrence: keep looking further along the text.
        start = text.find(name, start + 1)
    return False


def build_name_index(sportasi, min_len=MIN_NAME_LEN):
    """Map normalized full names to the ids of the members carrying them.

    Args:
        sportasi: iterable of ``(id, ime, prezime)`` rows.
        min_len: minimum length of "ime prezime" for the name to be indexed.

    Returns:
        dict mapping both "ime prezime" and "prezime ime" (lower-cased,
        whitespace-normalized) to a set of member ids. Rows whose first or
        last name is missing or blank are ignored.
    """
    index = defaultdict(set)
    for sid, ime, prezime in sportasi:
        if not ime or not prezime:
            continue
        first = normalize_text(ime)
        last = normalize_text(prezime)
        # A whitespace-only value would otherwise yield a key such as
        # " prezime" that matches on the surname alone.
        if not first or not last:
            continue
        full = f"{first} {last}"
        if len(full) >= min_len:
            index[full].add(sid)
            index[f"{last} {first}"].add(sid)
    return dict(index)


def find_mentions(godisnjaci, name_to_ids, min_text_len=MIN_TEXT_LEN):
    """Find which members are named in which yearbooks.

    Args:
        godisnjaci: iterable of ``(id, godina, sadrzaj)`` rows.
        name_to_ids: mapping produced by build_name_index().
        min_text_len: yearbooks with empty or shorter text are skipped.

    Returns:
        dict mapping member id to the set of ``godina`` values of the
        yearbooks in which one of the member's name variants appears.

    Prints one progress line per scanned yearbook with the number of name
    variants that matched.
    """
    mentions = defaultdict(set)
    for _did, godina, text in godisnjaci:
        if not text or len(text) < min_text_len:
            continue
        haystack = normalize_text(text)
        found_names = 0
        for name, sids in name_to_ids.items():
            if contains_name(haystack, name):
                found_names += 1
                for sid in sids:
                    mentions[sid].add(godina)
        print(f" godišnjak {godina}: {found_names} matches", flush=True)
    return dict(mentions)


def main():
    """Load yearbooks and members, match names, persist the years, print a report."""
    conn = psycopg2.connect(**DB)
    conn.autocommit = True
    try:
        cu = conn.cursor()

        cu.execute("SELECT id, godina, sadrzaj FROM pgz_sport.dokumenti WHERE vrsta='godisnjak' AND godina IS NOT NULL ORDER BY godina")
        godisnjaci = cu.fetchall()
        print(f"Loaded {len(godisnjaci)} godišnjaka", flush=True)

        # Build map: lowercase "ime prezime" -> member ids
        cu.execute("SELECT id, ime, prezime FROM pgz_sport.clanovi WHERE ime IS NOT NULL AND prezime IS NOT NULL")
        sportasi = cu.fetchall()
        name_to_ids = build_name_index(sportasi)
        print(f"Indexed {len(name_to_ids)} name variants for {len(sportasi)} sportaša", flush=True)

        mentions = find_mentions(godisnjaci, name_to_ids)
        print(f"\nTotal sportaša mentioned: {len(mentions)}", flush=True)

        # Make sure the result columns exist before writing to them.
        cu.execute("ALTER TABLE pgz_sport.clanovi ADD COLUMN IF NOT EXISTS godisnjak_godine INT[]")
        cu.execute("ALTER TABLE pgz_sport.clanovi ADD COLUMN IF NOT EXISTS godisnjak_prvi INT")
        cu.execute("ALTER TABLE pgz_sport.clanovi ADD COLUMN IF NOT EXISTS godisnjak_zadnji INT")

        # NOTE(review): only members found in this run are written; rows that
        # matched in an earlier run but no longer match keep their old values.
        updated = 0
        for sid, godine in mentions.items():
            g = sorted(godine)
            cu.execute("UPDATE pgz_sport.clanovi SET godisnjak_godine=%s, godisnjak_prvi=%s, godisnjak_zadnji=%s WHERE id=%s",
                       (g, g[0], g[-1], sid))
            updated += 1
        print(f"\nUpdated {updated} sportaša", flush=True)

        # Report the members that appear in the largest number of yearbooks.
        top = sorted(mentions.items(), key=lambda x: len(x[1]), reverse=True)[:25]
        print("\nTOP 25 sportaša po godinama:")
        for sid, godine in top:
            cu.execute("SELECT ime, prezime, sport, kategorija_hoo FROM pgz_sport.clanovi WHERE id=%s", (sid,))
            r = cu.fetchone()
            if r:
                kh = f" KAT-{r[3]}" if r[3] else ""
                print(f" {len(godine):2}× {r[0]} {r[1]:<28} ({r[2] or '?'}{kh})")
    finally:
        # Release the connection even when a query above fails.
        conn.close()


if __name__ == "__main__":
    main()