#!/usr/bin/env python3
"""
PGŽ Sport — Qdrant embedder.
Embeds savezi, klubovi, sportaši (clanovi) into BGE-M3 → Qdrant.
(natjecanja are not embedded yet.)

Collection: pgz_sport_v1 (1024 dim, BGE-M3)

Run modes:
  python embedder.py init          # create Qdrant collection
  python embedder.py savezi        # embed all savezi
  python embedder.py klubovi       # embed all klubovi
  python embedder.py sportasi      # embed all clanovi
  python embedder.py all           # full refresh
  python embedder.py incremental   # only items missing or stale (NOT IMPLEMENTED yet)
"""
import hashlib
import json
import logging
import os
import sys
import time
from contextlib import closing

import psycopg2
import psycopg2.extras
import requests

# NOTE(review): a database credential is committed in source. It is kept as the
# default so existing deployments keep working, but it should be supplied via
# the PGZ_SPORT_DB_PASSWORD environment variable and rotated.
DB = dict(
    host='localhost',
    port=5432,
    dbname='rinet_v3',
    user='rinet',
    password=os.environ.get('PGZ_SPORT_DB_PASSWORD', 'R1net2026!SecureDB#v7'),
)
EMBED = "http://localhost:9879/api/embeddings"
QDRANT = "http://10.10.0.2:6333"
COLLECTION = "pgz_sport_v1"
DIM = 1024
BATCH = 16          # texts sent to the embedding service per request
UPSERT_CHUNK = 64   # points buffered before they are flushed to Qdrant
HTTP_TIMEOUT = 120  # seconds, applied to every HTTP request

logging.basicConfig(
    format='%(asctime)s [%(levelname)s] %(message)s',
    level=logging.INFO,
    handlers=[
        logging.FileHandler('/opt/pgz-sport/_logs/embedder.log'),
        logging.StreamHandler(sys.stdout),
    ],
)
log = logging.getLogger("emb")


def conn():
    """Open a new PostgreSQL connection using the ``DB`` settings."""
    return psycopg2.connect(**DB)


def embed_batch(texts: list) -> list:
    """Return one embedding vector per entry of *texts*.

    The embedding service is accepted in two response shapes:
    ``{"data": [{"embedding": [...]}, ...]}`` and ``{"embeddings": [[...], ...]}``.

    Raises:
        requests.HTTPError: the service answered with an error status.
        RuntimeError: the response has neither known shape, or the number of
            returned vectors differs from the number of texts.
    """
    if not texts:
        return []
    resp = requests.post(EMBED, json={"input": texts}, timeout=HTTP_TIMEOUT)
    resp.raise_for_status()
    body = resp.json()
    if 'data' in body:
        vectors = [item['embedding'] for item in body['data']]
    elif 'embeddings' in body:
        vectors = body['embeddings']
    else:
        raise RuntimeError(f"unknown embed response shape: {list(body.keys())[:5]}")
    # Callers zip() rows with vectors; a short response would silently drop rows.
    if len(vectors) != len(texts):
        raise RuntimeError(
            f"embed service returned {len(vectors)} vectors for {len(texts)} texts"
        )
    return vectors


def cmd_init():
    """Create the Qdrant collection if it does not exist yet.

    Raises:
        requests.HTTPError: Qdrant answered the existence check with an error
            other than 404, or rejected the create request.
    """
    url = f"{QDRANT}/collections/{COLLECTION}"
    resp = requests.get(url, timeout=HTTP_TIMEOUT)
    if resp.status_code == 200:
        log.info("Collection %s already exists", COLLECTION)
        return
    if resp.status_code != 404:
        # Only "not found" means the collection is missing; anything else
        # (server error, auth failure, ...) must not trigger a create.
        resp.raise_for_status()
    resp = requests.put(
        url,
        json={
            "vectors": {"size": DIM, "distance": "Cosine"},
            "optimizers_config": {"indexing_threshold": 10000},
        },
        timeout=HTTP_TIMEOUT,
    )
    resp.raise_for_status()
    log.info("Collection %s created", COLLECTION)


def text_id(prefix: str, src_id: int) -> int:
    """Stable numeric ID from prefix + src — Qdrant accepts uint64."""
    digest = hashlib.sha1(f"{prefix}:{src_id}".encode()).digest()
    return int.from_bytes(digest[:8], 'big') >> 1  # ensure < 2^63


def upsert_points(points: list):
    """Upsert *points* into the collection; an empty list is a no-op.

    Raises:
        requests.HTTPError: Qdrant rejected the request (the response is also
            logged, truncated to 300 characters).
    """
    if not points:
        return
    resp = requests.put(
        f"{QDRANT}/collections/{COLLECTION}/points",
        json={"points": points},
        timeout=HTTP_TIMEOUT,
    )
    if not resp.ok:
        log.error("qdrant upsert failed: %s %s", resp.status_code, resp.text[:300])
        resp.raise_for_status()


def _fetch_rows(sql: str) -> list:
    """Run *sql* and return every row as a dict; the connection is always closed."""
    # psycopg2's ``with connection`` only ends the transaction, it does not
    # close the connection, hence the explicit closing().
    with closing(conn()) as db:
        with db.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
            cur.execute(sql)
            return cur.fetchall()


def _embed_and_upsert(rows: list, make_text, make_point):
    """Embed *rows* in groups of ``BATCH`` and upsert the resulting points.

    ``make_text(row)`` builds the text to embed and ``make_point(row, vector)``
    builds the Qdrant point. Points are flushed once at least ``UPSERT_CHUNK``
    are buffered, and once more at the end for the remainder.
    """
    pending = []
    for start in range(0, len(rows), BATCH):
        batch = rows[start:start + BATCH]
        vectors = embed_batch([make_text(row) for row in batch])
        pending.extend(make_point(row, vec) for row, vec in zip(batch, vectors))
        if len(pending) >= UPSERT_CHUNK:
            upsert_points(pending)
            pending = []
    upsert_points(pending)


def _savez_text(r) -> str:
    """Text embedded for one ``savezi`` row."""
    return (f"Sportski savez PGŽ: {r['naziv']}. Sport: {r['sport'] or ''}. "
            f"Predsjednik: {r['predsjednik'] or '?'}. Tajnik: {r['tajnik'] or '?'}. "
            f"{r['napomena'] or ''}")


def _savez_point(r, vec) -> dict:
    """Qdrant point for one ``savezi`` row."""
    return {"id": text_id('savez', r['id']), "vector": vec,
            "payload": {"type": "savez", "id": r['id'], "naziv": r['naziv'],
                        "sport": r['sport'], "predsjednik": r['predsjednik'],
                        "tajnik": r['tajnik'], "web": r['web']}}


def _klub_text(r) -> str:
    """Text embedded for one ``klubovi`` row."""
    return (f"Sportski klub PGŽ: {r['naziv']}. Sport: {r['sport'] or ''} ({r['razina'] or 'liga ?'}). "
            f"Grad: {r['grad'] or '?'} ({r['region'] or 'PGŽ'}). "
            f"Savez: {r['savez'] or '?'}. Predsjednik: {r['predsjednik'] or '?'}. "
            f"Tajnik: {r['tajnik'] or '?'}. {r['napomena'] or ''}")


def _klub_point(r, vec) -> dict:
    """Qdrant point for one ``klubovi`` row."""
    return {"id": text_id('klub', r['id']), "vector": vec,
            "payload": {"type": "klub", "id": r['id'], "naziv": r['naziv'],
                        "sport": r['sport'], "razina": r['razina'], "grad": r['grad'],
                        "region": r['region'], "hns_klub_id": r['hns_klub_id']}}


def _sportas_text(r) -> str:
    """Text embedded for one ``clanovi`` row."""
    return (f"Sportaš: {r['ime'] or ''} {r['prezime'] or ''}. "
            f"Klub: {r['klub_naziv'] or '?'}. Sport: {r['sport'] or '?'}. "
            f"Datum rođenja: {r['datum_rodenja'] or '?'}. Mjesto: {r['mjesto_rodenja'] or '?'}. "
            f"Pozicija: {r['pozicija'] or '?'}. "
            f"{'Reprezentativac.' if r['reprezentativac'] else ''}")


def _sportas_point(r, vec) -> dict:
    """Qdrant point for one ``clanovi`` row."""
    return {"id": text_id('sportas', r['id']), "vector": vec,
            "payload": {"type": "sportas", "id": r['id'],
                        "ime": r['ime'], "prezime": r['prezime'],
                        "klub_naziv": r['klub_naziv'], "sport": r['sport'],
                        "source": r['source']}}


def cmd_savezi():
    """Embed every active row of ``pgz_sport.savezi`` into the collection."""
    cmd_init()
    rows = _fetch_rows(
        """SELECT id, naziv, sport, predsjednik, tajnik, web, aktivan, napomena
           FROM pgz_sport.savezi WHERE aktivan=true""")
    log.info("Embedding %d savezi…", len(rows))
    _embed_and_upsert(rows, _savez_text, _savez_point)
    log.info("Saved %d savezi → %s", len(rows), COLLECTION)


def cmd_klubovi():
    """Embed every active row of ``pgz_sport.klubovi`` (with its savez name)."""
    cmd_init()
    rows = _fetch_rows(
        """SELECT k.id, k.naziv, k.sport, k.razina, k.grad, k.region,
                  k.predsjednik, k.tajnik, k.napomena, k.hns_klub_id,
                  s.naziv AS savez
           FROM pgz_sport.klubovi k
           LEFT JOIN pgz_sport.savezi s ON s.id=k.savez_id
           WHERE k.aktivan=true""")
    log.info("Embedding %d klubova…", len(rows))
    _embed_and_upsert(rows, _klub_text, _klub_point)
    log.info("Saved %d klubova → %s", len(rows), COLLECTION)


def cmd_sportasi():
    """Embed every row of ``pgz_sport.clanovi`` (with club name and sport)."""
    cmd_init()
    rows = _fetch_rows(
        """SELECT c.id, c.ime, c.prezime, c.datum_rodenja, c.mjesto_rodenja,
                  c.pozicija, c.broj_dresa, c.reprezentativac, c.source,
                  k.naziv AS klub_naziv, k.sport
           FROM pgz_sport.clanovi c
           LEFT JOIN pgz_sport.klubovi k ON k.id=c.klub_id""")
    log.info("Embedding %d sportaša…", len(rows))
    _embed_and_upsert(rows, _sportas_text, _sportas_point)
    log.info("Saved %d sportaša → %s", len(rows), COLLECTION)


def _cmd_all():
    """Full refresh: savezi, then klubovi, then sportaši."""
    cmd_savezi()
    cmd_klubovi()
    cmd_sportasi()


# Command-line mode name -> handler.
_COMMANDS = {
    'init': cmd_init,
    'savezi': cmd_savezi,
    'klubovi': cmd_klubovi,
    'sportasi': cmd_sportasi,
    'all': _cmd_all,
}


if __name__ == '__main__':
    if len(sys.argv) < 2:
        print(__doc__)
        sys.exit(1)
    cmd = sys.argv[1]
    handler = _COMMANDS.get(cmd)
    if handler is None:
        print(f"unknown: {cmd}")
        sys.exit(2)
    handler()