Files
pgz-sport/learn_loop.py_prije_env_deepseek

131 lines
6.7 KiB
Python
Executable File
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""
PGŽ Sport - Continuous Learning Loop
Scrape Sportilus, sport-pgz.hr and club websites, embed everything new, update Qdrant
Author: Damir Radulić | 25.04.2026
Schedule: cron 0 */6 * * * (every 6h)
"""
import os
import psycopg2, psycopg2.extras, requests, time, logging, re
from datetime import datetime
# Configure logging once at import time: INFO level, every record tagged [LEARN].
logging.basicConfig(level=logging.INFO, format='%(asctime)s [LEARN] %(message)s')
# Root logger, shared by every function in this script.
LOG = logging.getLogger()
# Keyword arguments passed to psycopg2.connect(). The password is read from the
# DB_PASSWORD environment variable, so importing this module raises KeyError
# when that variable is not set.
DB = dict(host='10.10.0.2', port=6432, dbname='rinet_v3', user='rinet', password=os.environ["DB_PASSWORD"])
# Base URL of the Qdrant REST API; collection paths are appended to it.
QDRANT = 'http://10.10.0.2:6333'
# HTTP endpoint that embed() POSTs batches of texts to.
EMBED = 'http://localhost:9879/api/embeddings'
def embed(texts):
    """Return one embedding vector per input text, or None on failure.

    Each text is cut to its first 2000 characters and the batch is POSTed as
    ``{'texts': [...]}`` to the ``EMBED`` endpoint.  The JSON response is
    expected to carry either an ``embeddings`` list or a single ``embedding``
    vector; a single vector is wrapped in a one-element list.

    Args:
        texts: Sequence of strings to embed.

    Returns:
        A list of embedding vectors; an empty list when ``texts`` is empty;
        ``None`` when the request fails, the HTTP status is not OK, the body
        is not a JSON object, or the body holds no embedding at all.
    """
    if not texts:
        # Nothing to embed: skip the round-trip.
        return []
    try:
        resp = requests.post(
            EMBED,
            json={'texts': [t[:2000] for t in texts]},
            timeout=60,
        )
    except requests.RequestException as exc:
        # Timeouts / connection errors are reported like any other failed
        # batch so the caller can log it and move on.
        LOG.error("Embedding request failed: %s", exc)
        return None
    if not resp.ok:
        return None
    try:
        data = resp.json()
    except ValueError:
        LOG.error("Embedding service returned a non-JSON body")
        return None
    if not isinstance(data, dict):
        return None
    embeddings = data.get('embeddings')
    if embeddings:
        return embeddings
    single = data.get('embedding')
    # Without this check a response lacking both keys became the truthy list
    # [None], which the caller would index as a vector of None.
    return [single] if single else None
# Name of the Qdrant collection rebuilt by reembed_all().
_COLLECTION = 'pgz_sport_v1'

# One entry per entity type:
#   (SQL query, row -> text to embed, (row, text) -> Qdrant payload).
# Rows are dict-like (RealDictCursor), so columns are read by name.
_ENTITY_SOURCES = (
    (
        "SELECT * FROM pgz_sport.savezi WHERE aktivan",
        lambda r: f"Savez: {r['naziv']}. {r.get('sport') or ''}. Predsj: {r.get('predsjednik') or ''}, tajnik: {r.get('tajnik') or ''}. {r.get('grad') or ''}. {r.get('email') or ''}. {r.get('napomena') or ''}",
        lambda r, t: {
            'tip': 'savez', 'savez_id': r['id'], 'naziv': r['naziv'],
            'sport': r.get('sport'), 'predsjednik': r.get('predsjednik'),
            'grad': r.get('grad'), 'tekst': t[:500],
        },
    ),
    (
        """SELECT k.*, s.naziv AS savez_naziv FROM pgz_sport.klubovi k
        LEFT JOIN pgz_sport.savezi s ON s.id=k.savez_id WHERE k.aktivan""",
        lambda r: f"Klub: {r['naziv']}. Sport: {r.get('sport') or ''}. Razina: {r.get('razina') or ''}. Savez: {r.get('savez_naziv') or ''}. Predsj: {r.get('predsjednik') or ''}. Tajnik: {r.get('tajnik') or ''}. Trener: {r.get('trener_glavni') or ''}. OIB: {r.get('oib') or ''}. {r.get('grad') or ''}. {r.get('napomena') or ''}",
        lambda r, t: {
            'tip': 'klub', 'klub_id': r['id'], 'naziv': r['naziv'],
            'sport': r.get('sport'), 'razina': r.get('razina'),
            'savez': r.get('savez_naziv'), 'predsjednik': r.get('predsjednik'),
            'oib': r.get('oib'), 'tekst': t[:500],
        },
    ),
    (
        """SELECT c.*, k.naziv AS klub FROM pgz_sport.clanovi c
        JOIN pgz_sport.klubovi k ON k.id=c.klub_id WHERE c.aktivan""",
        lambda r: f"Sportaš: {r['ime']} {r['prezime']}. Klub: {r.get('klub') or ''}. Pozicija: {r.get('pozicija') or r.get('kategorija') or ''}. {'Reprezentativac.' if r.get('reprezentativac') else ''} {'Kategoriziran.' if r.get('kategoriziran') else ''}",
        lambda r, t: {
            'tip': 'clan', 'clan_id': r['id'], 'klub_id': r['klub_id'],
            'klub': r.get('klub'), 'naziv': f"{r['ime']} {r['prezime']}",
            'pozicija': r.get('pozicija'), 'tekst': t[:500],
        },
    ),
    (
        """SELECT m.*, s.naziv AS savez FROM pgz_sport.manifestacije m
        LEFT JOIN pgz_sport.savezi s ON s.id=m.savez_id WHERE m.aktivna""",
        lambda r: f"Manifestacija: {r['naziv']}. {r.get('mjesto') or ''}. {r.get('razina') or ''}. {r.get('savez') or ''}. Tradicija od {r.get('godina_od') or ''}.",
        lambda r, t: {
            'tip': 'manifestacija', 'manifestacija_id': r['id'],
            'naziv': r['naziv'], 'razina': r.get('razina'),
            'mjesto': r.get('mjesto'), 'tekst': t[:500],
        },
    ),
    (
        "SELECT * FROM pgz_sport.potpore_nositelji ORDER BY iznos DESC",
        lambda r: f"Potpora PGŽ — {r['naziv_kluba']} {r['godina']}: {r['iznos']} EUR.",
        lambda r, t: {
            'tip': 'potpora', 'klub': r['naziv_kluba'], 'naziv': r['naziv_kluba'],
            'godina': r['godina'], 'iznos': float(r['iznos'] or 0),
            'tekst': t[:500],
        },
    ),
    (
        "SELECT * FROM pgz_sport.proracun ORDER BY godina",
        lambda r: f"Proračun PGŽ za sport {r['godina']}: {r['ukupno']} EUR.",
        lambda r, t: {
            'tip': 'proracun', 'godina': r['godina'],
            'naziv': f"Proračun {r['godina']}",
            'ukupno': float(r['ukupno'] or 0), 'tekst': t[:500],
        },
    ),
    (
        """SELECT st.*, s.naziv AS savez FROM pgz_sport.statistika_saveza st
        JOIN pgz_sport.savezi s ON s.id=st.savez_id ORDER BY godina DESC""",
        lambda r: f"Statistika {r['savez']} {r['godina']}: {r['klubova_clanica']} klub, {r['registriranih']} reg, {r['trenera']} trener, {r['reprezentativaca']} reprez.",
        lambda r, t: {
            'tip': 'statistika', 'savez': r['savez'], 'godina': r['godina'],
            'naziv': f"{r['savez']} {r['godina']}", 'tekst': t[:500],
        },
    ),
)


def _load_entities(cur):
    """Run every query in _ENTITY_SOURCES; return parallel (texts, payloads) lists."""
    texts, payloads = [], []
    for sql, formatter, mk_meta in _ENTITY_SOURCES:
        cur.execute(sql)
        for row in cur.fetchall():
            text = formatter(row)
            texts.append(text)
            payloads.append(mk_meta(row, text))
    return texts, payloads


def _embed_points(texts, payloads, batch_size=32):
    """Embed texts in batches and build Qdrant points with sequential ids from 1."""
    points = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        vectors = embed(batch)
        if not vectors:
            LOG.error(f"Embed batch {start} FAILED")
            continue
        if len(vectors) != len(batch):
            # zip() below silently drops the unmatched tail; make that visible.
            LOG.warning("Embed batch %d: got %d vectors for %d texts",
                        start, len(vectors), len(batch))
        for vector, payload in zip(vectors, payloads[start:start + batch_size]):
            if not vector:
                # A missing vector would make Qdrant reject the whole upsert chunk.
                continue
            points.append({'id': len(points) + 1, 'vector': vector, 'payload': payload})
    return points


def _recreate_collection(url):
    """Drop the collection at *url* (if any) and create it empty (1024-d, cosine)."""
    requests.delete(url, timeout=30)
    resp = requests.put(url, json={'vectors': {'size': 1024, 'distance': 'Cosine'}},
                        timeout=30)
    if not resp.ok:
        LOG.error("Qdrant collection create failed: %s", resp.text[:200])
        resp.raise_for_status()
    # Short pause after creation, kept from the original implementation.
    time.sleep(1)


def _upsert_points(url, points, chunk_size=100):
    """Upsert *points* into the collection at *url* in chunks, logging failed chunks."""
    for start in range(0, len(points), chunk_size):
        # wait=true makes Qdrant apply the write before answering, so the
        # points_count read afterwards reflects these points.
        resp = requests.put(f"{url}/points", params={'wait': 'true'},
                            json={'points': points[start:start + chunk_size]},
                            timeout=30)
        if not resp.ok:
            LOG.error(f"Qdrant {start}: {resp.text[:200]}")


def reembed_all():
    """Pull all entities from PostgreSQL, re-embed them and replace the Qdrant collection.

    Steps:
      1. Read every entity listed in ``_ENTITY_SOURCES`` from PostgreSQL and
         close the connection again.
      2. Embed the generated texts in batches of 32 via ``embed()``; failed
         batches are logged and skipped.
      3. Only then drop and recreate the ``pgz_sport_v1`` collection and
         upsert the points in chunks of 100.  Doing the wipe last means a
         database or embedding outage no longer leaves an empty index; if
         there were texts but no embedding succeeded, the existing
         collection is kept untouched.

    Returns:
        The ``points_count`` reported by Qdrant for the collection after the
        run (0 if Qdrant reports no result for the collection).

    Raises:
        psycopg2.Error: if connecting to or querying PostgreSQL fails.
        requests.RequestException: if Qdrant is unreachable or refuses to
            create the collection.
    """
    LOG.info("Starting full re-embed...")
    conn = psycopg2.connect(**DB)
    try:
        with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
            texts, payloads = _load_entities(cur)
    finally:
        # The connection is only needed for reading; release it before the
        # (slow) embedding phase and on any error.
        conn.close()

    LOG.info(f"Embedding {len(texts)} entities...")
    points = _embed_points(texts, payloads)

    url = f"{QDRANT}/collections/{_COLLECTION}"
    if texts and not points:
        LOG.error("No embeddings were produced; keeping the existing collection")
    else:
        _recreate_collection(url)
        _upsert_points(url, points)

    info = requests.get(url, timeout=30).json()
    count = (info.get('result') or {}).get('points_count', 0)
    LOG.info(f"Done: {count} points indexed")
    return count
def scrape_sportilus_klub(naziv):
    """Fetch a club's Sportilus page and extract the president's name.

    The club name is turned into a URL slug: lower-cased, spaces replaced by
    hyphens, the Croatian letters š, ž, č, ć, đ transliterated to s, z, c, c,
    d, and every remaining character outside ``[a-z0-9-]`` dropped.

    Args:
        naziv: Club name used to build the slug.

    Returns:
        ``{'predsjednik': <name or None>}`` when the page loads and contains
        the word "predsjednik" in any letter case (the name is None if the
        "<name> - PREDSJEDNIK" pattern is not found); ``None`` when the
        request fails, the status is not OK, or the word is absent.
    """
    slug = naziv.lower().replace(' ', '-').translate(str.maketrans('šžčćđ', 'szccd'))
    slug = re.sub(r'[^a-z0-9-]', '', slug)
    url = f"https://www.sportilus.com/klubovi/{slug}/"
    try:
        resp = requests.get(url, timeout=10,
                            headers={'User-Agent': 'Mozilla/5.0 PGZSport-Bot'})
    except requests.RequestException as exc:
        # Scraping is best-effort: report the failure and carry on, but do not
        # swallow unrelated errors or KeyboardInterrupt as a bare except did.
        LOG.warning("Sportilus request for %s failed: %s", url, exc)
        return None
    if not resp.ok or 'PREDSJEDNIK' not in resp.text.upper():
        return None
    # Match case-insensitively so extraction agrees with the guard above.
    match = re.search(r'([\w\s]+?)\s*-\s*PREDSJEDNIK', resp.text, re.IGNORECASE)
    return {'predsjednik': match.group(1).strip() if match else None}
if __name__ == '__main__':
    # Step 1: rebuild the whole Qdrant index from the database.
    indexed_count = reembed_all()
    # Step 2 (not implemented yet): incremental scraping of new clubs from
    # Sportilus would go here.
    LOG.info(f"Learn cycle complete: {indexed_count} entities indexed in Qdrant")