feat: /api/v2/analiza/* endpoints - sport analytics backend
This commit is contained in:
+69
@@ -0,0 +1,69 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Fast godišnjak mining: tokenize text, then set-intersect with sportaši names."""
|
||||
import os
|
||||
import psycopg2, re
|
||||
from collections import defaultdict
|
||||
|
||||
DB = dict(host='localhost', port=5432, dbname='rinet_v3',
|
||||
user='rinet', password=os.environ["DB_PASSWORD"])
|
||||
conn = psycopg2.connect(**DB); conn.autocommit = True
|
||||
cu = conn.cursor()
|
||||
|
||||
cu.execute("SELECT id, godina, sadrzaj FROM pgz_sport.dokumenti WHERE vrsta='godisnjak' AND godina IS NOT NULL ORDER BY godina")
|
||||
godisnjaci = cu.fetchall()
|
||||
print(f"Loaded {len(godisnjaci)} godišnjaka", flush=True)
|
||||
|
||||
# Build map: lowercase "ime prezime" → sportas_id
|
||||
cu.execute("SELECT id, ime, prezime FROM pgz_sport.clanovi WHERE ime IS NOT NULL AND prezime IS NOT NULL")
|
||||
sportasi = cu.fetchall()
|
||||
name_to_ids = defaultdict(set)
|
||||
for sid, ime, prezime in sportasi:
|
||||
if not ime or not prezime: continue
|
||||
full = f"{ime.strip()} {prezime.strip()}".lower()
|
||||
full2 = f"{prezime.strip()} {ime.strip()}".lower()
|
||||
if len(full) >= 8:
|
||||
name_to_ids[full].add(sid)
|
||||
name_to_ids[full2].add(sid)
|
||||
print(f"Indexed {len(name_to_ids)} name variants for {len(sportasi)} sportaša", flush=True)
|
||||
|
||||
# Process each godišnjak: build n-gram set then check
|
||||
mentions = defaultdict(set)
|
||||
for did, godina, text in godisnjaci:
|
||||
if not text or len(text) < 5000: continue
|
||||
text_low = text.lower()
|
||||
# Substring search is fastest for this
|
||||
found_names = 0
|
||||
for name, sids in name_to_ids.items():
|
||||
if name in text_low:
|
||||
for sid in sids:
|
||||
mentions[sid].add(godina)
|
||||
found_names += 1
|
||||
print(f" godišnjak {godina}: {found_names} matches", flush=True)
|
||||
|
||||
print(f"\nTotal sportaša mentioned: {len(mentions)}", flush=True)
|
||||
|
||||
# Update DB
|
||||
cu.execute("ALTER TABLE pgz_sport.clanovi ADD COLUMN IF NOT EXISTS godisnjak_godine INT[]")
|
||||
cu.execute("ALTER TABLE pgz_sport.clanovi ADD COLUMN IF NOT EXISTS godisnjak_prvi INT")
|
||||
cu.execute("ALTER TABLE pgz_sport.clanovi ADD COLUMN IF NOT EXISTS godisnjak_zadnji INT")
|
||||
|
||||
updated = 0
|
||||
for sid, godine in mentions.items():
|
||||
g = sorted(godine)
|
||||
cu.execute("UPDATE pgz_sport.clanovi SET godisnjak_godine=%s, godisnjak_prvi=%s, godisnjak_zadnji=%s WHERE id=%s",
|
||||
(g, g[0], g[-1], sid))
|
||||
updated += 1
|
||||
|
||||
print(f"\nUpdated {updated} sportaša", flush=True)
|
||||
|
||||
# Top mentioned
|
||||
top = sorted(mentions.items(), key=lambda x: len(x[1]), reverse=True)[:25]
|
||||
print("\nTOP 25 sportaša po godinama:")
|
||||
for sid, godine in top:
|
||||
cu.execute("SELECT ime, prezime, sport, kategorija_hoo FROM pgz_sport.clanovi WHERE id=%s", (sid,))
|
||||
r = cu.fetchone()
|
||||
if r:
|
||||
kh = f" KAT-{r[3]}" if r[3] else ""
|
||||
print(f" {len(godine):2}× {r[0]} {r[1]:<28} ({r[2] or '?'}{kh})")
|
||||
|
||||
conn.close()
|
||||
Reference in New Issue
Block a user