PGŽ Sport Platform — Round 1+2 baseline (sport2.html + API)

2026-05-04 23:39:08 +02:00
commit a7ec0a86be
1820 changed files with 694455 additions and 0 deletions
@@ -0,0 +1,128 @@
+#!/usr/bin/env python3
+"""
+PGZ Godišnjaci embed pipeline - FIXED
+- Batch size: 5 points max
+- Payload text: max 800 chars (not 1000)
+- Use direct Qdrant REST API (not qdrant_client)
+"""
+import os, glob, json, time, re, requests
+from datetime import datetime
+
+QDRANT = "http://10.10.0.2:6333"
+OLLAMA = "http://localhost:11434"
+COLL   = "pgz_godisnjaci"
+DATA   = "/opt/pgz-sport/_data/godisnjaci"
+LOG    = "/opt/pgz-sport/logs/godisnjak_pipeline.log"
+TG_TOKEN = "8535797835:AAFItT-92jzZ9NWFafLxn0dLa1_n2s-JE5Y"
+TG_CHAT = "7969491558"
+
+os.makedirs("/opt/pgz-sport/logs", exist_ok=True)
+
+def log(msg):
+    ts = datetime.now().strftime("%H:%M:%S")
+    line = f"[{ts}] {msg}"
+    print(line, flush=True)
+    with open(LOG, "a") as f: f.write(line+"\n")
+
+def tg(msg):
+    try:
+        requests.post(f"https://api.telegram.org/bot{TG_TOKEN}/sendMessage",
+            data={"chat_id": TG_CHAT, "text": f"📚 Godišnjaci: {msg}"}, timeout=10)
+    except: pass
+
+def embed(texts):
+    r = requests.post(f"{OLLAMA}/api/embed",
+        json={"model": "nomic-embed-text", "input": texts}, timeout=60)
+    if r.status_code == 200:
+        return r.json().get("embeddings", [])
+    log(f"Embed error: {r.status_code} {r.text[:50]}")
+    return []
+
+def qdrant_upsert(points):
+    """Upsert with small batches to avoid 33MB limit"""
+    BATCH = 5
+    for i in range(0, len(points), BATCH):
+        batch = points[i:i+BATCH]
+        r = requests.put(f"{QDRANT}/collections/{COLL}/points",
+            json={"points": batch}, timeout=30)
+        if r.status_code not in [200, 201]:
+            log(f"  Qdrant warn: {r.status_code} {r.text[:80]}")
+        time.sleep(0.05)
+
+# Create/verify collection
+requests.delete(f"{QDRANT}/collections/{COLL}", timeout=10)
+time.sleep(1)
+r = requests.put(f"{QDRANT}/collections/{COLL}",
+    json={"vectors": {"size": 768, "distance": "Cosine"}}, timeout=30)
+log(f"Collection created: {r.status_code}")
+
+# Test embed
+test = embed(["NK Rijeka"])
+if not test:
+    log("ERROR: Ollama not responding")
+    tg("ERROR: Ollama not responding!")
+    exit(1)
+log(f"Embed test OK, dim={len(test[0])}")
+
+# Process files
+files = sorted([f for f in glob.glob(f"{DATA}/godisnjak_*.txt") if "_layout" not in f])
+log(f"Files: {len(files)}")
+tg(f"Starting embed of {len(files)} godišnjaka")
+
+total = 0
+point_id = 1
+
+for fname in files:
+    year = re.search(r'(\d{4})', fname)
+    if not year: continue
+    year = int(year.group(1))
+    
+    with open(fname) as f:
+        text = f.read()
+    
+    # Clean
+    text = re.sub(r'\n{3,}', '\n\n', text).strip()
+    
+    # Chunk: 300 words, 30 overlap
+    words = text.split()
+    chunks = []
+    i = 0
+    while i < len(words):
+        chunk = " ".join(words[i:i+300])
+        if len(chunk.strip()) > 80:
+            chunks.append(chunk[:800])  # Max 800 chars in payload
+        i += 270
+    
+    log(f"  {year}: {len(chunks)} chunks")
+    
+    # Embed in batches of 5
+    for i in range(0, len(chunks), 5):
+        batch_texts = chunks[i:i+5]
+        embs = embed(batch_texts)
+        if not embs:
+            continue
+        
+        points = [{
+            "id": point_id + j,
+            "vector": emb,
+            "payload": {
+                "godina": year,
+                "text": txt[:600],  # Strictly limit payload size
+                "source": f"godisnjak_{year}",
+                "chunk_idx": i+j
+            }
+        } for j, (txt, emb) in enumerate(zip(batch_texts, embs))]
+        
+        qdrant_upsert(points)
+        point_id += len(points)
+        time.sleep(0.1)
+    
+    total += len(chunks)
+    log(f"  {year}: done ({total} total chunks so far)")
+
+# Verify
+r = requests.get(f"{QDRANT}/collections/{COLL}", timeout=10)
+pts = r.json().get("result",{}).get("points_count", 0)
+msg = f"✅ Embed done: {pts} vectors in {COLL}"
+log(msg)
+tg(msg)