Files
pgz-sport/scrapers/godisnjak_text_mine.py
T

69 lines
2.7 KiB
Python
Executable File
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""Fast godišnjak mining: tokenize text, then set-intersect with sportaši names."""
import psycopg2, re
from collections import defaultdict
DB = dict(host='localhost', port=5432, dbname='rinet_v3',
user='rinet', password='R1net2026!SecureDB#v7')
conn = psycopg2.connect(**DB); conn.autocommit = True
cu = conn.cursor()
cu.execute("SELECT id, godina, sadrzaj FROM pgz_sport.dokumenti WHERE vrsta='godisnjak' AND godina IS NOT NULL ORDER BY godina")
godisnjaci = cu.fetchall()
print(f"Loaded {len(godisnjaci)} godišnjaka", flush=True)
# Build map: lowercase "ime prezime" → sportas_id
cu.execute("SELECT id, ime, prezime FROM pgz_sport.clanovi WHERE ime IS NOT NULL AND prezime IS NOT NULL")
sportasi = cu.fetchall()
name_to_ids = defaultdict(set)
for sid, ime, prezime in sportasi:
if not ime or not prezime: continue
full = f"{ime.strip()} {prezime.strip()}".lower()
full2 = f"{prezime.strip()} {ime.strip()}".lower()
if len(full) >= 8:
name_to_ids[full].add(sid)
name_to_ids[full2].add(sid)
print(f"Indexed {len(name_to_ids)} name variants for {len(sportasi)} sportaša", flush=True)
# Process each godišnjak: build n-gram set then check
mentions = defaultdict(set)
for did, godina, text in godisnjaci:
if not text or len(text) < 5000: continue
text_low = text.lower()
# Substring search is fastest for this
found_names = 0
for name, sids in name_to_ids.items():
if name in text_low:
for sid in sids:
mentions[sid].add(godina)
found_names += 1
print(f" godišnjak {godina}: {found_names} matches", flush=True)
print(f"\nTotal sportaša mentioned: {len(mentions)}", flush=True)
# Update DB
cu.execute("ALTER TABLE pgz_sport.clanovi ADD COLUMN IF NOT EXISTS godisnjak_godine INT[]")
cu.execute("ALTER TABLE pgz_sport.clanovi ADD COLUMN IF NOT EXISTS godisnjak_prvi INT")
cu.execute("ALTER TABLE pgz_sport.clanovi ADD COLUMN IF NOT EXISTS godisnjak_zadnji INT")
updated = 0
for sid, godine in mentions.items():
g = sorted(godine)
cu.execute("UPDATE pgz_sport.clanovi SET godisnjak_godine=%s, godisnjak_prvi=%s, godisnjak_zadnji=%s WHERE id=%s",
(g, g[0], g[-1], sid))
updated += 1
print(f"\nUpdated {updated} sportaša", flush=True)
# Top mentioned
top = sorted(mentions.items(), key=lambda x: len(x[1]), reverse=True)[:25]
print("\nTOP 25 sportaša po godinama:")
for sid, godine in top:
cu.execute("SELECT ime, prezime, sport, kategorija_hoo FROM pgz_sport.clanovi WHERE id=%s", (sid,))
r = cu.fetchone()
if r:
kh = f" KAT-{r[3]}" if r[3] else ""
print(f" {len(godine):2}× {r[0]} {r[1]:<28} ({r[2] or '?'}{kh})")
conn.close()