PGŽ Sport Platform — Round 1+2 baseline (sport2.html + API)

This commit is contained in:
Damir Radulić
2026-05-04 23:39:08 +02:00
commit a7ec0a86be
1820 changed files with 694455 additions and 0 deletions
+129
View File
@@ -0,0 +1,129 @@
#!/usr/bin/env python3
"""
PGŽ Sport - Continuous Learning Loop
Scrape Sportilus, sport-pgz.hr and club websites, embed everything new, update Qdrant.
NOTE(review): in the code visible here the entry point only runs the full
re-embed; the Sportilus scraper is defined but never called.
Author: Damir Radulić | 25.04.2026
Schedule: cron 0 */6 * * * (every 6h)
"""
import psycopg2, psycopg2.extras, requests, time, logging, re
from datetime import datetime
# Configure logging once at import time; every message is tagged with [LEARN].
logging.basicConfig(level=logging.INFO, format='%(asctime)s [LEARN] %(message)s')
# Root logger (getLogger() without a name), shared by all functions in this script.
LOG = logging.getLogger()
# Keyword arguments passed straight to psycopg2.connect().
# NOTE(review): the database password is hard-coded and committed to source
# control - consider loading it from the environment or a secrets store, and
# rotating the exposed value.
DB = dict(host='localhost', port=5432, dbname='rinet_v3', user='rinet', password='R1net2026!SecureDB#v7')
# Base URL of the Qdrant REST API.
QDRANT = 'http://localhost:6333'
# Endpoint of the embedding service called by embed().
EMBED = 'http://localhost:9879/api/embeddings'
def embed(texts):
    """Request embedding vectors for a batch of texts from the embedding service.

    Each text is truncated to its first 2000 characters before being sent.

    Args:
        texts: list of strings to embed.

    Returns:
        A list with exactly one vector per input text (same order), an empty
        list when ``texts`` is empty, or ``None`` when the request fails, the
        response is not usable, or the number of returned vectors does not
        match the number of texts.
    """
    if not texts:
        # Nothing to embed; skip the pointless round trip.
        return []
    try:
        r = requests.post(EMBED, json={'texts': [t[:2000] for t in texts]}, timeout=60)
    except requests.RequestException as exc:
        # A timeout or refused connection must not abort the whole run;
        # callers already treat a falsy result as a failed batch.
        LOG.error(f"Embedding request failed: {exc}")
        return None
    if not r.ok:
        return None
    try:
        d = r.json()
    except ValueError:
        # Body was not valid JSON.
        LOG.error("Embedding service returned a non-JSON response")
        return None
    if not isinstance(d, dict):
        return None
    vectors = d.get('embeddings')
    if not vectors:
        # Fall back to the single-vector response shape, but never wrap a
        # missing value: [None] is truthy and would be indexed as a vector.
        single = d.get('embedding')
        vectors = [single] if single else None
    if not vectors:
        return None
    if len(vectors) != len(texts):
        # Callers zip vectors with their metadata; a count mismatch would
        # silently drop or misalign entities.
        LOG.error(f"Embedding count mismatch: sent {len(texts)}, got {len(vectors)}")
        return None
    return vectors
def _entity_sources():
    """Return (SQL, text formatter, payload builder) triples for every indexed entity type."""
    return [
        ("SELECT * FROM pgz_sport.savezi WHERE aktivan",
         lambda r: f"Savez: {r['naziv']}. {r.get('sport') or ''}. Predsj: {r.get('predsjednik') or ''}, tajnik: {r.get('tajnik') or ''}. {r.get('grad') or ''}. {r.get('email') or ''}. {r.get('napomena') or ''}",
         lambda r,t: {'tip':'savez','savez_id':r['id'],'naziv':r['naziv'],'sport':r.get('sport'),
                      'predsjednik':r.get('predsjednik'),'grad':r.get('grad'),'tekst':t[:500]}),
        ("""SELECT k.*, s.naziv AS savez_naziv FROM pgz_sport.klubovi k
            LEFT JOIN pgz_sport.savezi s ON s.id=k.savez_id WHERE k.aktivan""",
         lambda r: f"Klub: {r['naziv']}. Sport: {r.get('sport') or ''}. Razina: {r.get('razina') or ''}. Savez: {r.get('savez_naziv') or ''}. Predsj: {r.get('predsjednik') or ''}. Tajnik: {r.get('tajnik') or ''}. Trener: {r.get('trener_glavni') or ''}. OIB: {r.get('oib') or ''}. {r.get('grad') or ''}. {r.get('napomena') or ''}",
         lambda r,t: {'tip':'klub','klub_id':r['id'],'naziv':r['naziv'],'sport':r.get('sport'),
                      'razina':r.get('razina'),'savez':r.get('savez_naziv'),
                      'predsjednik':r.get('predsjednik'),'oib':r.get('oib'),'tekst':t[:500]}),
        ("""SELECT c.*, k.naziv AS klub FROM pgz_sport.clanovi c
            JOIN pgz_sport.klubovi k ON k.id=c.klub_id WHERE c.aktivan""",
         lambda r: f"Sportaš: {r['ime']} {r['prezime']}. Klub: {r.get('klub') or ''}. Pozicija: {r.get('pozicija') or r.get('kategorija') or ''}. {'Reprezentativac.' if r.get('reprezentativac') else ''} {'Kategoriziran.' if r.get('kategoriziran') else ''}",
         lambda r,t: {'tip':'clan','clan_id':r['id'],'klub_id':r['klub_id'],'klub':r.get('klub'),
                      'naziv':f"{r['ime']} {r['prezime']}",'pozicija':r.get('pozicija'),'tekst':t[:500]}),
        ("""SELECT m.*, s.naziv AS savez FROM pgz_sport.manifestacije m
            LEFT JOIN pgz_sport.savezi s ON s.id=m.savez_id WHERE m.aktivna""",
         lambda r: f"Manifestacija: {r['naziv']}. {r.get('mjesto') or ''}. {r.get('razina') or ''}. {r.get('savez') or ''}. Tradicija od {r.get('godina_od') or ''}.",
         lambda r,t: {'tip':'manifestacija','manifestacija_id':r['id'],'naziv':r['naziv'],
                      'razina':r.get('razina'),'mjesto':r.get('mjesto'),'tekst':t[:500]}),
        ("SELECT * FROM pgz_sport.potpore_nositelji ORDER BY iznos DESC",
         lambda r: f"Potpora PGŽ — {r['naziv_kluba']} {r['godina']}: {r['iznos']} EUR.",
         lambda r,t: {'tip':'potpora','klub':r['naziv_kluba'],'naziv':r['naziv_kluba'],
                      'godina':r['godina'],'iznos':float(r['iznos'] or 0),'tekst':t[:500]}),
        ("SELECT * FROM pgz_sport.proracun ORDER BY godina",
         lambda r: f"Proračun PGŽ za sport {r['godina']}: {r['ukupno']} EUR.",
         lambda r,t: {'tip':'proracun','godina':r['godina'],'naziv':f"Proračun {r['godina']}",
                      'ukupno':float(r['ukupno'] or 0),'tekst':t[:500]}),
        ("""SELECT st.*, s.naziv AS savez FROM pgz_sport.statistika_saveza st
            JOIN pgz_sport.savezi s ON s.id=st.savez_id ORDER BY godina DESC""",
         lambda r: f"Statistika {r['savez']} {r['godina']}: {r['klubova_clanica']} klub, {r['registriranih']} reg, {r['trenera']} trener, {r['reprezentativaca']} reprez.",
         lambda r,t: {'tip':'statistika','savez':r['savez'],'godina':r['godina'],
                      'naziv':f"{r['savez']} {r['godina']}",'tekst':t[:500]}),
    ]


def _load_entities():
    """Read every entity row from PostgreSQL and return parallel (texts, payloads) lists."""
    texts = []
    meta = []
    conn = psycopg2.connect(**DB)
    try:
        cur = conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor)
        for table_sql, formatter, mk_meta in _entity_sources():
            cur.execute(table_sql)
            for r in cur.fetchall():
                t = formatter(r)
                texts.append(t)
                meta.append(mk_meta(r, t))
    finally:
        # Always release the connection, even when a query or formatter fails.
        conn.close()
    return texts, meta


def _embed_points(texts, meta, batch_size=32):
    """Embed texts in batches and pair each vector with its payload as a Qdrant point."""
    points = []
    next_id = 1
    for i in range(0, len(texts), batch_size):
        embs = embed(texts[i:i+batch_size])
        if not embs:
            LOG.error(f"Embed batch {i} FAILED")
            continue
        for e, m in zip(embs, meta[i:i+batch_size]):
            if e is None:
                # A missing vector would make Qdrant reject the upsert chunk.
                continue
            points.append({'id': next_id, 'vector': e, 'payload': m})
            next_id += 1
    return points


def reembed_all():
    """Pull all entities from PostgreSQL, re-embed them, and replace the Qdrant collection.

    The rows are fetched and embedded *before* the collection is dropped, so
    an outage of the database or the embedding service no longer leaves the
    search index empty. If there was data to index but no batch could be
    embedded, the existing collection is kept as-is.

    Returns:
        The ``points_count`` reported by Qdrant for ``pgz_sport_v1`` after the
        run (0 when Qdrant reports no count).

    Raises:
        psycopg2.Error: if the database cannot be reached or a query fails.
        requests.RequestException: if Qdrant cannot be reached.
    """
    LOG.info("Starting full re-embed...")
    texts, meta = _load_entities()
    LOG.info(f"Embedding {len(texts)} entities...")
    points = _embed_points(texts, meta)

    collection_url = f"{QDRANT}/collections/pgz_sport_v1"
    if texts and not points:
        LOG.error("No embeddings were produced; keeping the existing Qdrant collection")
    else:
        # Wipe + recreate
        requests.delete(collection_url, timeout=30)
        created = requests.put(collection_url,
                               json={'vectors':{'size':1024,'distance':'Cosine'}},
                               timeout=30)
        if not created.ok:
            LOG.error(f"Qdrant create: {created.text[:200]}")
        time.sleep(1)
        # Upsert; wait=true makes Qdrant apply each chunk before responding so
        # the count read below reflects what was just written.
        for i in range(0, len(points), 100):
            r = requests.put(f"{collection_url}/points",
                             params={'wait': 'true'},
                             json={'points': points[i:i+100]}, timeout=30)
            if not r.ok:
                LOG.error(f"Qdrant {i}: {r.text[:200]}")

    info = requests.get(collection_url, timeout=30).json()
    # Tolerate an error response (e.g. the collection does not exist).
    count = (info.get('result') or {}).get('points_count') or 0
    LOG.info(f"Done: {count} points indexed")
    return count
# One-pass mapping used for URL slugs: spaces become hyphens and Croatian
# diacritics become their ASCII base letters.
_SLUG_TRANSLATION = str.maketrans({' ': '-', 'š': 's', 'ž': 'z', 'č': 'c', 'ć': 'c', 'đ': 'd'})


def scrape_sportilus_klub(naziv):
    """Scrape the Sportilus page of a club for new data.

    Args:
        naziv: club name; it is lower-cased and slugified to build the URL.

    Returns:
        ``{'predsjednik': <name or None>}`` when the page was fetched and
        mentions "PREDSJEDNIK" (case-insensitively); the name is ``None`` if
        the "<name> - PREDSJEDNIK" pattern is not found. Returns ``None`` when
        the name yields no usable slug, the request fails, or the page does
        not mention a president.
    """
    # Make slug from naziv
    slug = re.sub(r'[^a-z0-9-]', '', (naziv or '').lower().translate(_SLUG_TRANSLATION))
    if not slug.strip('-'):
        # Empty or symbol-only name: there is no club page to request.
        return None
    url = f"https://www.sportilus.com/klubovi/{slug}/"
    try:
        r = requests.get(url, timeout=10, headers={'User-Agent':'Mozilla/5.0 PGZSport-Bot'})
    except requests.RequestException as exc:
        # Scraping is best-effort: report the failure and carry on, without
        # swallowing unrelated errors such as KeyboardInterrupt.
        LOG.warning(f"Sportilus fetch failed for {url}: {exc}")
        return None
    if not r.ok or 'PREDSJEDNIK' not in r.text.upper():
        return None
    # Extract roles
    m = re.search(r'([\w\s]+?)\s*-\s*PREDSJEDNIK', r.text)
    return {'predsjednik': m.group(1).strip() if m else None}
if __name__ == '__main__':
    # Step 1: rebuild the whole Qdrant index from the database.
    indexed_count = reembed_all()
    # Step 2 (optional, not implemented yet): scrape new clubs from Sportilus
    # as the incremental-learning part of the cycle.
    LOG.info(f"Learn cycle complete: {indexed_count} entities indexed in Qdrant")