PGŽ Sport Platform — Round 1+2 baseline (sport2.html + API)

This commit is contained in:
Damir Radulić
2026-05-04 23:39:08 +02:00
commit a7ec0a86be
1820 changed files with 694455 additions and 0 deletions
+128
View File
@@ -0,0 +1,128 @@
#!/usr/bin/env python3
"""
PGZ Godišnjaci embed pipeline - FIXED
- Batch size: 5 points max
- Chunk text: cut to 800 chars before embedding (not 1000); stored payload text: first 600 chars
- Use direct Qdrant REST API (not qdrant_client)
"""
import os, glob, json, time, re, requests
from datetime import datetime
# --- Configuration ----------------------------------------------------------
# Endpoints and Telegram credentials can be overridden through environment
# variables; the defaults reproduce the previously hard-coded values, so the
# script behaves exactly as before when no variable is set.
QDRANT = os.environ.get("PGZ_QDRANT_URL", "http://10.10.0.2:6333")  # Qdrant REST base URL
OLLAMA = os.environ.get("PGZ_OLLAMA_URL", "http://localhost:11434")  # Ollama base URL
COLL = "pgz_godisnjaci"  # Qdrant collection name used in every /collections/... URL
DATA = "/opt/pgz-sport/_data/godisnjaci"  # directory globbed for godisnjak_*.txt files
LOG = "/opt/pgz-sport/logs/godisnjak_pipeline.log"  # file that log() appends to
# SECURITY: a bot token committed to version control must be treated as
# leaked. The literal below is kept only as a backward-compatible fallback;
# rotate the token, supply the new one via PGZ_TG_TOKEN and delete the literal.
TG_TOKEN = os.environ.get("PGZ_TG_TOKEN", "8535797835:AAFItT-92jzZ9NWFafLxn0dLa1_n2s-JE5Y")
TG_CHAT = os.environ.get("PGZ_TG_CHAT", "7969491558")  # chat_id that receives notifications
# Create the directory that holds LOG (derived from LOG so the two cannot
# drift apart) before the first log() call tries to append to the file.
os.makedirs(os.path.dirname(LOG), exist_ok=True)
def log(msg):
    """Print a timestamped line and append the same line to the LOG file.

    The line has the form ``[HH:MM:SS] msg``; the timestamp comes from
    ``datetime.now()``.

    Args:
        msg: Text to record.
    """
    ts = datetime.now().strftime("%H:%M:%S")
    line = f"[{ts}] {msg}"
    print(line, flush=True)
    # The final summary logged by this script contains an emoji, so the file
    # is written as UTF-8 explicitly instead of relying on the locale's
    # default encoding (which can raise UnicodeEncodeError).
    with open(LOG, "a", encoding="utf-8") as f:
        f.write(line + "\n")
def tg(msg):
    """Send a best-effort Telegram notification.

    The text is prefixed with "📚 Godišnjaci: " and posted to the Bot API
    ``sendMessage`` method for chat TG_CHAT. A failed notification never
    stops the pipeline: errors are written to the log and the function
    returns normally.

    Args:
        msg: Message text to send.
    """
    try:
        r = requests.post(
            f"https://api.telegram.org/bot{TG_TOKEN}/sendMessage",
            data={"chat_id": TG_CHAT, "text": f"📚 Godišnjaci: {msg}"},
            timeout=10,
        )
    except Exception as exc:
        # ``Exception`` (not a bare ``except``) keeps the best-effort
        # behaviour but lets KeyboardInterrupt/SystemExit through.
        # Only the exception type is logged: the exception text may include
        # the request URL, and that URL embeds the bot token.
        log(f"Telegram notify failed: {type(exc).__name__}")
        return
    if r.status_code != 200:
        log(f"Telegram notify failed: HTTP {r.status_code}")
def embed(texts):
    """Request embeddings for *texts* from Ollama's ``/api/embed`` endpoint.

    Uses the ``nomic-embed-text`` model with a 60 second timeout.

    Args:
        texts: Value sent as the ``input`` field of the request; callers in
            this file pass a list of strings.

    Returns:
        The ``embeddings`` value of the JSON response, or ``[]`` when the
        request cannot be completed, the server answers with a status other
        than 200, or the body is not valid JSON. Failures are logged and
        never raised, so callers can rely on the empty-list check.
    """
    try:
        r = requests.post(
            f"{OLLAMA}/api/embed",
            json={"model": "nomic-embed-text", "input": texts},
            timeout=60,
        )
    except requests.RequestException as exc:
        # Connection refused / timeout: report it the same way as an HTTP
        # error instead of aborting the whole run with a traceback.
        log(f"Embed error: {type(exc).__name__}: {str(exc)[:80]}")
        return []
    if r.status_code != 200:
        log(f"Embed error: {r.status_code} {r.text[:50]}")
        return []
    try:
        return r.json().get("embeddings", [])
    except ValueError:
        # requests raises a ValueError subclass when the body is not JSON.
        log(f"Embed error: invalid JSON {r.text[:50]}")
        return []
def qdrant_upsert(points, batch_size=5):
    """Upsert *points* into collection COLL through Qdrant's REST API.

    Points are sent in small batches to keep each request body small (the
    original note cites a 33MB request limit). A batch that fails, either
    with an HTTP status other than 200/201 or with a network error, is
    logged as a warning and the remaining batches are still attempted.

    Args:
        points: List of point dicts (``id``, ``vector``, ``payload``).
        batch_size: Maximum number of points per request; defaults to 5.

    Returns:
        The number of points contained in batches that Qdrant accepted.

    Raises:
        ValueError: If *batch_size* is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be >= 1")
    stored = 0
    for start in range(0, len(points), batch_size):
        batch = points[start:start + batch_size]
        try:
            r = requests.put(
                f"{QDRANT}/collections/{COLL}/points",
                # wait=true makes Qdrant answer only after the points are
                # applied, so a later points_count query reflects them.
                params={"wait": "true"},
                json={"points": batch},
                timeout=30,
            )
        except requests.RequestException as exc:
            log(f"  Qdrant warn: {type(exc).__name__}: {str(exc)[:80]}")
        else:
            if r.status_code in (200, 201):
                stored += len(batch)
            else:
                log(f"  Qdrant warn: {r.status_code} {r.text[:80]}")
        # Short pause between consecutive requests.
        time.sleep(0.05)
    return stored
# Recreate the collection from scratch: drop whatever exists under COLL, give
# Qdrant a moment, then create it with 768-dimensional cosine vectors.
# The status of the DELETE is deliberately not checked.
requests.delete(f"{QDRANT}/collections/{COLL}", timeout=10)
time.sleep(1)
r = requests.put(
    f"{QDRANT}/collections/{COLL}",
    json={"vectors": {"size": 768, "distance": "Cosine"}},
    timeout=30,
)
if r.status_code not in (200, 201):
    # Without a collection every later upsert would fail, so stop here
    # instead of embedding all files for nothing.
    log(f"ERROR: collection create failed: {r.status_code} {r.text[:80]}")
    tg("ERROR: Qdrant collection could not be created!")
    raise SystemExit(1)
log(f"Collection created: {r.status_code}")
# Smoke-test the embedding service with one short input before touching any
# file; abort early (with a Telegram alert) when it does not answer.
test = embed(["NK Rijeka"])
if not test:
    log("ERROR: Ollama not responding")
    tg("ERROR: Ollama not responding!")
    # SystemExit instead of exit(): exit() is injected by the ``site`` module
    # and is not guaranteed to exist in every interpreter setup.
    raise SystemExit(1)
log(f"Embed test OK, dim={len(test[0])}")
if len(test[0]) != 768:
    # The collection is created with 768-dimensional vectors; vectors of any
    # other size would be rejected on upsert, so stop before processing files.
    log(f"ERROR: embedding dim {len(test[0])} != 768")
    tg("ERROR: embedding dimension mismatch!")
    raise SystemExit(1)
# Process files: every godisnjak_*.txt under DATA except the "_layout" variants.
files = sorted(f for f in glob.glob(f"{DATA}/godisnjak_*.txt") if "_layout" not in f)
log(f"Files: {len(files)}")
tg(f"Starting embed of {len(files)} godišnjaka")
total = 0      # chunks embedded and handed to Qdrant so far, across all files
point_id = 1   # next free point id; ids are assigned sequentially
for fname in files:
    # Take the year from the file name only, so digits elsewhere in the
    # directory path can never be mistaken for it.
    year = re.search(r'(\d{4})', os.path.basename(fname))
    if not year:
        continue
    year = int(year.group(1))
    # Explicit encoding so reading does not depend on the locale; undecodable
    # bytes are replaced rather than aborting the run.
    # NOTE(review): assumes the extracted text files are UTF-8 — confirm.
    with open(fname, encoding="utf-8", errors="replace") as f:
        text = f.read()
    # Clean: collapse runs of three or more newlines.
    text = re.sub(r'\n{3,}', '\n\n', text).strip()
    # Chunk: windows of 300 words starting every 270 words (30-word overlap).
    # Windows of 80 characters or fewer are dropped; the rest are cut to 800
    # characters before embedding.
    # NOTE(review): when a window is longer than 800 characters its tail is
    # neither embedded nor stored, and with a 270-word stride the next window
    # does not cover it either — confirm this loss is intended.
    words = text.split()
    chunks = []
    for start in range(0, len(words), 270):
        chunk = " ".join(words[start:start + 300])
        if len(chunk.strip()) > 80:
            chunks.append(chunk[:800])
    log(f"  {year}: {len(chunks)} chunks")
    # Embed in batches of 5
    sent = 0  # chunks of this file that were embedded and sent to Qdrant
    for i in range(0, len(chunks), 5):
        batch_texts = chunks[i:i + 5]
        embs = embed(batch_texts)
        if not embs:
            # embed() already logged the cause; record which chunks are missing.
            log(f"  {year}: embed failed, skipped chunks {i}-{i + len(batch_texts) - 1}")
            continue
        points = [{
            "id": point_id + j,
            "vector": emb,
            "payload": {
                "godina": year,
                "text": txt[:600],  # payload keeps only the first 600 characters
                "source": f"godisnjak_{year}",
                "chunk_idx": i + j
            }
        } for j, (txt, emb) in enumerate(zip(batch_texts, embs))]
        qdrant_upsert(points)
        point_id += len(points)
        sent += len(points)
        time.sleep(0.1)
    # Count only chunks that were actually embedded, not every chunk produced.
    total += sent
    log(f"  {year}: done ({total} total chunks so far)")
# Verify: read the collection info back from Qdrant and report how many
# points it holds, both in the log and via Telegram.
r = requests.get(
    f"{QDRANT}/collections/{COLL}",
    timeout=10,
)
pts = r.json().get("result", {}).get(
    "points_count", 0
)
msg = f"✅ Embed done: {pts} vectors in {COLL}"
for report in (log, tg):
    report(msg)