Files
pgz-sport/scrapers/embedder.py
T

166 lines
7.2 KiB
Python
Executable File

#!/usr/bin/env python3
"""
PGŽ Sport — Qdrant embedder.
Embeds savezi, klubovi, sportaši (clanovi) into BGE-M3 → Qdrant
(natjecanja: no embed command exists in this script yet).
Collection: pgz_sport_v1 (1024 dim, BGE-M3)
Run modes:
  python embedder.py init         # create Qdrant collection
  python embedder.py savezi       # embed all savezi
  python embedder.py klubovi      # embed all klubovi
  python embedder.py sportasi     # embed all clanovi
  python embedder.py all          # full refresh
  python embedder.py incremental  # only items missing or stale (not implemented yet)
"""
# NOTE: the docstring above must remain the first statement of the module.
# If any statement precedes it, Python no longer binds it to __doc__ and the
# usage output (print(__doc__)) shows "None".
# auto-added by patch_scrapers_with_dotenv.sh
from dotenv import load_dotenv
# Load the environment file before the configuration below reads os.environ.
load_dotenv('/opt/rinet-gpu/.env.master')
import os, sys, time, json, hashlib, logging
import psycopg2, psycopg2.extras, requests
# Keyword arguments handed to psycopg2.connect() by conn(). The password is
# read from the DB_PASSWORD environment variable at import time; a missing
# variable raises KeyError here (presumably it is supplied by the .env file
# loaded at the top of the module — confirm).
DB = dict(host='localhost', port=5432, dbname='rinet_v3', user='rinet', password=os.environ["DB_PASSWORD"])
# Endpoint that embed_batch() POSTs the texts to.
EMBED = "http://localhost:9879/api/embeddings"
# Base URL of the Qdrant REST API.
QDRANT = "http://10.10.0.2:6333"
# Name of the Qdrant collection every point is written to.
COLLECTION = "pgz_sport_v1"
# Vector size the collection is created with in cmd_init().
DIM = 1024
# Number of rows embedded per request to the embedding endpoint.
BATCH = 16
# Log to the embedder log file and to stdout. The FileHandler opens the file
# at import time.
logging.basicConfig(format='%(asctime)s [%(levelname)s] %(message)s', level=logging.INFO,
handlers=[logging.FileHandler('/opt/pgz-sport/_logs/embedder.log'),
logging.StreamHandler(sys.stdout)])
log = logging.getLogger("emb")
def conn():
    """Open a new psycopg2 connection using the module-level ``DB`` settings.

    Returns:
        The connection object returned by ``psycopg2.connect``.
    """
    connection = psycopg2.connect(**DB)
    return connection
def embed_batch(texts: list) -> list:
    """Embed a batch of texts with a single POST to the ``EMBED`` endpoint.

    Two response layouts are understood: ``{"data": [{"embedding": [...]}, ...]}``
    and ``{"embeddings": [[...], ...]}``.

    Args:
        texts: Strings to embed. An empty list returns ``[]`` without
            contacting the service.

    Returns:
        A list with one embedding vector per input text, in the order the
        service returned them.

    Raises:
        requests.HTTPError: The service answered with an error status.
        RuntimeError: The response has neither a ``data`` nor an
            ``embeddings`` key, or the number of vectors does not match the
            number of input texts.
    """
    if not texts:
        # Nothing to embed; avoid a pointless round trip.
        return []
    r = requests.post(EMBED, json={"input": texts}, timeout=120)
    r.raise_for_status()
    d = r.json()
    if 'data' in d:
        vecs = [item['embedding'] for item in d['data']]
    elif 'embeddings' in d:
        vecs = d['embeddings']
    else:
        raise RuntimeError(f"unknown embed response shape: {list(d.keys())[:5]}")
    # Callers pair rows and vectors with zip(), which silently drops the
    # surplus on a length mismatch, so fail loudly here instead.
    if len(vecs) != len(texts):
        raise RuntimeError(
            f"embed response has {len(vecs)} vectors for {len(texts)} texts")
    return vecs
def cmd_init():
    """Create the Qdrant collection ``COLLECTION`` unless it already exists.

    A GET on the collection URL that answers 200 is treated as "exists" and
    nothing is changed. Any other status leads to a PUT that creates the
    collection with ``DIM``-sized vectors and cosine distance.

    Raises:
        requests.HTTPError: The create request was rejected.
        requests.Timeout: Qdrant did not answer within the timeout.
    """
    url = f"{QDRANT}/collections/{COLLECTION}"
    # Explicit timeouts: requests waits indefinitely by default, which would
    # hang the whole run if Qdrant is unreachable.
    r = requests.get(url, timeout=30)
    if r.status_code == 200:
        log.info(f"Collection {COLLECTION} already exists")
        return
    r = requests.put(url, json={
        "vectors": {"size": DIM, "distance": "Cosine"},
        "optimizers_config": {"indexing_threshold": 10000},
    }, timeout=30)
    r.raise_for_status()
    log.info(f"Collection {COLLECTION} created")
def text_id(prefix: str, src_id: int) -> int:
    """Derive a stable numeric point ID from ``prefix`` and ``src_id``.

    The key ``"<prefix>:<src_id>"`` is hashed with SHA-1 and the first eight
    digest bytes are read as a big-endian unsigned integer. Dropping the
    lowest bit keeps the result below 2**63.

    Args:
        prefix: Entity kind used to namespace the ID.
        src_id: Source identifier of the entity.

    Returns:
        A non-negative integer smaller than 2**63.
    """
    key = f"{prefix}:{src_id}"
    digest = hashlib.sha1(key.encode()).digest()
    as_uint64 = int.from_bytes(digest[:8], byteorder='big')
    # Floor division by two equals a one-bit right shift for non-negative ints.
    return as_uint64 // 2
def upsert_points(points: list):
    """PUT ``points`` to the points endpoint of the Qdrant collection.

    An empty (or otherwise falsy) ``points`` value is a no-op. On a non-OK
    response the status code and the first 300 characters of the body are
    logged before the HTTP error is raised.

    Args:
        points: Point dicts sent as the ``points`` field of the request body.

    Raises:
        requests.HTTPError: Qdrant answered with an error status.
    """
    if points:
        endpoint = f"{QDRANT}/collections/{COLLECTION}/points"
        resp = requests.put(endpoint, json={"points": points}, timeout=120)
        if resp.ok:
            return
        log.error(f"qdrant upsert failed: {resp.status_code} {resp.text[:300]}")
        resp.raise_for_status()
def cmd_savezi() -> None:
    """Embed all active savezi rows and upsert them into the Qdrant collection.

    Ensures the collection exists, reads every ``pgz_sport.savezi`` row with
    ``aktivan=true``, builds one descriptive text per row, embeds the texts in
    batches of ``BATCH`` and upserts the resulting points. Points are flushed
    to Qdrant once at least 64 have accumulated, plus a final flush.

    Raises:
        psycopg2.Error: The database query failed.
        requests.HTTPError: The embedding service or Qdrant returned an error.
    """
    cmd_init()
    db = conn()
    try:
        # `with db` only scopes the transaction (commit/rollback); psycopg2
        # does not close the connection on exit, hence the explicit close().
        with db:
            with db.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cu:
                cu.execute("""SELECT id, naziv, sport, predsjednik, tajnik, web, aktivan, napomena
                              FROM pgz_sport.savezi WHERE aktivan=true""")
                rows = cu.fetchall()
    finally:
        db.close()
    log.info(f"Embedding {len(rows)} savezi…")
    pts = []
    for i in range(0, len(rows), BATCH):
        batch = rows[i:i+BATCH]
        texts = [f"Sportski savez PGŽ: {r['naziv']}. Sport: {r['sport'] or ''}. "
                 f"Predsjednik: {r['predsjednik'] or '?'}. Tajnik: {r['tajnik'] or '?'}. "
                 f"{r['napomena'] or ''}" for r in batch]
        vecs = embed_batch(texts)
        for r, v in zip(batch, vecs):
            pts.append({"id": text_id('savez', r['id']), "vector": v,
                        "payload": {"type": "savez", "id": r['id'], "naziv": r['naziv'],
                                    "sport": r['sport'], "predsjednik": r['predsjednik'],
                                    "tajnik": r['tajnik'], "web": r['web']}})
        # Flush periodically so a single request body stays small.
        if len(pts) >= 64:
            upsert_points(pts)
            pts = []
    upsert_points(pts)
    log.info(f"Saved {len(rows)} savezi → {COLLECTION}")
def cmd_klubovi() -> None:
    """Embed all active klubovi rows and upsert them into the Qdrant collection.

    Ensures the collection exists, reads every ``pgz_sport.klubovi`` row with
    ``aktivan=true`` (left-joined to its savez for the savez name), builds one
    descriptive text per row, embeds the texts in batches of ``BATCH`` and
    upserts the resulting points. Points are flushed to Qdrant once at least
    64 have accumulated, plus a final flush.

    Raises:
        psycopg2.Error: The database query failed.
        requests.HTTPError: The embedding service or Qdrant returned an error.
    """
    cmd_init()
    db = conn()
    try:
        # `with db` only scopes the transaction (commit/rollback); psycopg2
        # does not close the connection on exit, hence the explicit close().
        with db:
            with db.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cu:
                cu.execute("""SELECT k.id, k.naziv, k.sport, k.razina, k.grad, k.region,
                              k.predsjednik, k.tajnik, k.napomena, k.hns_klub_id, s.naziv AS savez
                              FROM pgz_sport.klubovi k LEFT JOIN pgz_sport.savezi s ON s.id=k.savez_id
                              WHERE k.aktivan=true""")
                rows = cu.fetchall()
    finally:
        db.close()
    log.info(f"Embedding {len(rows)} klubova…")
    pts = []
    for i in range(0, len(rows), BATCH):
        batch = rows[i:i+BATCH]
        texts = [f"Sportski klub PGŽ: {r['naziv']}. Sport: {r['sport'] or ''} ({r['razina'] or 'liga ?'}). "
                 f"Grad: {r['grad'] or '?'} ({r['region'] or 'PGŽ'}). "
                 f"Savez: {r['savez'] or '?'}. Predsjednik: {r['predsjednik'] or '?'}. "
                 f"Tajnik: {r['tajnik'] or '?'}. {r['napomena'] or ''}" for r in batch]
        vecs = embed_batch(texts)
        for r, v in zip(batch, vecs):
            pts.append({"id": text_id('klub', r['id']), "vector": v,
                        "payload": {"type": "klub", "id": r['id'], "naziv": r['naziv'],
                                    "sport": r['sport'], "razina": r['razina'], "grad": r['grad'],
                                    "region": r['region'], "hns_klub_id": r['hns_klub_id']}})
        # Flush periodically so a single request body stays small.
        if len(pts) >= 64:
            upsert_points(pts)
            pts = []
    upsert_points(pts)
    log.info(f"Saved {len(rows)} klubova → {COLLECTION}")
def cmd_sportasi() -> None:
    """Embed all clanovi rows and upsert them into the Qdrant collection.

    Ensures the collection exists, reads every ``pgz_sport.clanovi`` row
    (left-joined to its klub for the club name and sport), builds one
    descriptive text per row, embeds the texts in batches of ``BATCH`` and
    upserts the resulting points. Points are flushed to Qdrant once at least
    64 have accumulated, plus a final flush.

    Raises:
        psycopg2.Error: The database query failed.
        requests.HTTPError: The embedding service or Qdrant returned an error.
    """
    cmd_init()
    db = conn()
    try:
        # `with db` only scopes the transaction (commit/rollback); psycopg2
        # does not close the connection on exit, hence the explicit close().
        with db:
            with db.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cu:
                cu.execute("""SELECT c.id, c.ime, c.prezime, c.datum_rodenja, c.mjesto_rodenja,
                              c.pozicija, c.broj_dresa, c.reprezentativac, c.source,
                              k.naziv AS klub_naziv, k.sport
                              FROM pgz_sport.clanovi c LEFT JOIN pgz_sport.klubovi k ON k.id=c.klub_id""")
                rows = cu.fetchall()
    finally:
        db.close()
    log.info(f"Embedding {len(rows)} sportaša…")
    pts = []
    for i in range(0, len(rows), BATCH):
        batch = rows[i:i+BATCH]
        texts = [f"Sportaš: {r['ime'] or ''} {r['prezime'] or ''}. "
                 f"Klub: {r['klub_naziv'] or '?'}. Sport: {r['sport'] or '?'}. "
                 f"Datum rođenja: {r['datum_rodenja'] or '?'}. Mjesto: {r['mjesto_rodenja'] or '?'}. "
                 f"Pozicija: {r['pozicija'] or '?'}. "
                 f"{'Reprezentativac.' if r['reprezentativac'] else ''}" for r in batch]
        vecs = embed_batch(texts)
        for r, v in zip(batch, vecs):
            pts.append({"id": text_id('sportas', r['id']), "vector": v,
                        "payload": {"type": "sportas", "id": r['id'],
                                    "ime": r['ime'], "prezime": r['prezime'],
                                    "klub_naziv": r['klub_naziv'], "sport": r['sport'],
                                    "source": r['source']}})
        # Flush periodically so a single request body stays small.
        if len(pts) >= 64:
            upsert_points(pts)
            pts = []
    upsert_points(pts)
    log.info(f"Saved {len(rows)} sportaša → {COLLECTION}")
if __name__ == '__main__':
    # Command-line entry point: the first argument selects the run mode.
    # Each mode maps to the steps it runs, in order.
    commands = {
        'init': (cmd_init,),
        'savezi': (cmd_savezi,),
        'klubovi': (cmd_klubovi,),
        'sportasi': (cmd_sportasi,),
        'all': (cmd_savezi, cmd_klubovi, cmd_sportasi),
    }
    if len(sys.argv) < 2:
        # __doc__ is None whenever the usage string is not the first statement
        # of the module; fall back to a short usage line instead of printing
        # "None".
        print(__doc__ or "usage: embedder.py {init|savezi|klubovi|sportasi|all}")
        sys.exit(1)
    cmd = sys.argv[1]
    if cmd not in commands:
        print(f"unknown: {cmd}")
        sys.exit(2)
    for step in commands[cmd]:
        step()