PGŽ Sport Platform — Round 1+2 baseline (sport2.html + API)
This commit is contained in:
@@ -0,0 +1,128 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
PGZ Godišnjaci embed pipeline - FIXED
|
||||
- Batch size: 5 points max
|
||||
- Payload text: max 800 chars (not 1000)
|
||||
- Use direct Qdrant REST API (not qdrant_client)
|
||||
"""
|
||||
import os, glob, json, time, re, requests
|
||||
from datetime import datetime
|
||||
|
||||
QDRANT = "http://10.10.0.2:6333"
|
||||
OLLAMA = "http://localhost:11434"
|
||||
COLL = "pgz_godisnjaci"
|
||||
DATA = "/opt/pgz-sport/_data/godisnjaci"
|
||||
LOG = "/opt/pgz-sport/logs/godisnjak_pipeline.log"
|
||||
TG_TOKEN = "8535797835:AAFItT-92jzZ9NWFafLxn0dLa1_n2s-JE5Y"
|
||||
TG_CHAT = "7969491558"
|
||||
|
||||
os.makedirs("/opt/pgz-sport/logs", exist_ok=True)
|
||||
|
||||
def log(msg):
|
||||
ts = datetime.now().strftime("%H:%M:%S")
|
||||
line = f"[{ts}] {msg}"
|
||||
print(line, flush=True)
|
||||
with open(LOG, "a") as f: f.write(line+"\n")
|
||||
|
||||
def tg(msg):
|
||||
try:
|
||||
requests.post(f"https://api.telegram.org/bot{TG_TOKEN}/sendMessage",
|
||||
data={"chat_id": TG_CHAT, "text": f"📚 Godišnjaci: {msg}"}, timeout=10)
|
||||
except: pass
|
||||
|
||||
def embed(texts):
|
||||
r = requests.post(f"{OLLAMA}/api/embed",
|
||||
json={"model": "nomic-embed-text", "input": texts}, timeout=60)
|
||||
if r.status_code == 200:
|
||||
return r.json().get("embeddings", [])
|
||||
log(f"Embed error: {r.status_code} {r.text[:50]}")
|
||||
return []
|
||||
|
||||
def qdrant_upsert(points):
|
||||
"""Upsert with small batches to avoid 33MB limit"""
|
||||
BATCH = 5
|
||||
for i in range(0, len(points), BATCH):
|
||||
batch = points[i:i+BATCH]
|
||||
r = requests.put(f"{QDRANT}/collections/{COLL}/points",
|
||||
json={"points": batch}, timeout=30)
|
||||
if r.status_code not in [200, 201]:
|
||||
log(f" Qdrant warn: {r.status_code} {r.text[:80]}")
|
||||
time.sleep(0.05)
|
||||
|
||||
# Create/verify collection
|
||||
requests.delete(f"{QDRANT}/collections/{COLL}", timeout=10)
|
||||
time.sleep(1)
|
||||
r = requests.put(f"{QDRANT}/collections/{COLL}",
|
||||
json={"vectors": {"size": 768, "distance": "Cosine"}}, timeout=30)
|
||||
log(f"Collection created: {r.status_code}")
|
||||
|
||||
# Test embed
|
||||
test = embed(["NK Rijeka"])
|
||||
if not test:
|
||||
log("ERROR: Ollama not responding")
|
||||
tg("ERROR: Ollama not responding!")
|
||||
exit(1)
|
||||
log(f"Embed test OK, dim={len(test[0])}")
|
||||
|
||||
# Process files
|
||||
files = sorted([f for f in glob.glob(f"{DATA}/godisnjak_*.txt") if "_layout" not in f])
|
||||
log(f"Files: {len(files)}")
|
||||
tg(f"Starting embed of {len(files)} godišnjaka")
|
||||
|
||||
total = 0
|
||||
point_id = 1
|
||||
|
||||
for fname in files:
|
||||
year = re.search(r'(\d{4})', fname)
|
||||
if not year: continue
|
||||
year = int(year.group(1))
|
||||
|
||||
with open(fname) as f:
|
||||
text = f.read()
|
||||
|
||||
# Clean
|
||||
text = re.sub(r'\n{3,}', '\n\n', text).strip()
|
||||
|
||||
# Chunk: 300 words, 30 overlap
|
||||
words = text.split()
|
||||
chunks = []
|
||||
i = 0
|
||||
while i < len(words):
|
||||
chunk = " ".join(words[i:i+300])
|
||||
if len(chunk.strip()) > 80:
|
||||
chunks.append(chunk[:800]) # Max 800 chars in payload
|
||||
i += 270
|
||||
|
||||
log(f" {year}: {len(chunks)} chunks")
|
||||
|
||||
# Embed in batches of 5
|
||||
for i in range(0, len(chunks), 5):
|
||||
batch_texts = chunks[i:i+5]
|
||||
embs = embed(batch_texts)
|
||||
if not embs:
|
||||
continue
|
||||
|
||||
points = [{
|
||||
"id": point_id + j,
|
||||
"vector": emb,
|
||||
"payload": {
|
||||
"godina": year,
|
||||
"text": txt[:600], # Strictly limit payload size
|
||||
"source": f"godisnjak_{year}",
|
||||
"chunk_idx": i+j
|
||||
}
|
||||
} for j, (txt, emb) in enumerate(zip(batch_texts, embs))]
|
||||
|
||||
qdrant_upsert(points)
|
||||
point_id += len(points)
|
||||
time.sleep(0.1)
|
||||
|
||||
total += len(chunks)
|
||||
log(f" {year}: done ({total} total chunks so far)")
|
||||
|
||||
# Verify
|
||||
r = requests.get(f"{QDRANT}/collections/{COLL}", timeout=10)
|
||||
pts = r.json().get("result",{}).get("points_count", 0)
|
||||
msg = f"✅ Embed done: {pts} vectors in {COLL}"
|
||||
log(msg)
|
||||
tg(msg)
|
||||
Reference in New Issue
Block a user