# pgz-sport/godisnjak_embed.py

#!/usr/bin/env python3
"""
PGZ Godišnjaci embed pipeline - FIXED
- Batch size: 5 points max
- Payload text: max 800 chars (not 1000)
- Use direct Qdrant REST API (not qdrant_client)
"""
import os, glob, json, time, re, requests
from datetime import datetime

# Service endpoints and locations. Each value below that reads os.environ can
# be overridden per deployment; the defaults are the values the script has
# always used, so running without any PGZ_* variables behaves as before.
QDRANT = os.environ.get("PGZ_QDRANT_URL", "http://10.10.0.2:6333")
OLLAMA = os.environ.get("PGZ_OLLAMA_URL", "http://localhost:11434")
COLL   = os.environ.get("PGZ_QDRANT_COLLECTION", "pgz_godisnjaci")
DATA   = os.environ.get("PGZ_GODISNJACI_DIR", "/opt/pgz-sport/_data/godisnjaci")
LOG    = "/opt/pgz-sport/logs/godisnjak_pipeline.log"
# SECURITY(review): a Telegram bot token is hard-coded in source. It is kept
# only as a fallback so existing deployments keep working; set PGZ_TG_TOKEN,
# then rotate the token and remove this literal.
TG_TOKEN = os.environ.get("PGZ_TG_TOKEN", "8535797835:AAFItT-92jzZ9NWFafLxn0dLa1_n2s-JE5Y")
TG_CHAT = os.environ.get("PGZ_TG_CHAT", "7969491558")

# Create the directory that holds LOG, derived from LOG itself so the two
# cannot drift apart; "." covers a LOG value with no directory component.
os.makedirs(os.path.dirname(LOG) or ".", exist_ok=True)

def log(msg):
    """Print a timestamped line to stdout and append it to the LOG file.

    Args:
        msg: Text to record; it is prefixed with the local time as [HH:MM:SS].
    """
    ts = datetime.now().strftime("%H:%M:%S")
    line = f"[{ts}] {msg}"
    print(line, flush=True)
    # Explicit UTF-8: messages contain non-ASCII text ("✅", "godišnjaka"),
    # which raises UnicodeEncodeError when the locale's default encoding
    # cannot represent it.
    with open(LOG, "a", encoding="utf-8") as f:
        f.write(line + "\n")

def tg(msg):
    """Send a best-effort Telegram notification to TG_CHAT.

    Network failures and non-200 replies are logged, not raised, so a
    Telegram outage never stops the pipeline.

    Args:
        msg: Text to send; it is prefixed with "📚 Godišnjaci: ".
    """
    try:
        resp = requests.post(
            f"https://api.telegram.org/bot{TG_TOKEN}/sendMessage",
            data={"chat_id": TG_CHAT, "text": f"📚 Godišnjaci: {msg}"},
            timeout=10,
        )
    except requests.RequestException as exc:
        # Log only the exception type: the exception text can include the
        # request URL, and that URL contains the bot token.
        log(f"Telegram notify failed: {type(exc).__name__}")
        return
    if resp.status_code != 200:
        log(f"Telegram notify failed: HTTP {resp.status_code}")

def embed(texts):
    """Request embeddings for a list of texts from the Ollama server.

    Args:
        texts: List of strings, sent as "input" to OLLAMA's /api/embed with
            the "nomic-embed-text" model.

    Returns:
        The "embeddings" value of the JSON response, or [] when the request
        fails (connection error, timeout), the status is not 200, or the body
        is not valid JSON. Callers treat an empty list as "embedding failed".
    """
    try:
        r = requests.post(f"{OLLAMA}/api/embed",
            json={"model": "nomic-embed-text", "input": texts}, timeout=60)
    except requests.RequestException as exc:
        # Without this, a stopped Ollama raises here and the caller's
        # "Ollama not responding" handling is never reached.
        log(f"Embed error: {type(exc).__name__}: {exc}")
        return []
    if r.status_code != 200:
        log(f"Embed error: {r.status_code} {r.text[:50]}")
        return []
    try:
        return r.json().get("embeddings", [])
    except ValueError:
        log(f"Embed error: invalid JSON {r.text[:50]}")
        return []

def qdrant_upsert(points):
    """Upsert points into the COLL collection in small batches.

    Requests are kept to BATCH points each to keep request bodies small
    (the original note cites a 33MB limit). A failed batch is logged and
    the remaining batches are still attempted.

    Args:
        points: List of Qdrant point dicts ("id", "vector", "payload").

    Returns:
        Number of points in batches that Qdrant answered with 200/201.
    """
    BATCH = 5
    stored = 0
    for start in range(0, len(points), BATCH):
        batch = points[start:start + BATCH]
        try:
            # wait=true makes Qdrant apply the write before answering, so a
            # later points count reflects everything that was sent.
            r = requests.put(f"{QDRANT}/collections/{COLL}/points",
                params={"wait": "true"},
                json={"points": batch}, timeout=30)
        except requests.RequestException as exc:
            # One unreachable request must not abort the whole multi-file run.
            log(f"  Qdrant warn: request failed: {type(exc).__name__}: {exc}")
        else:
            if r.status_code in (200, 201):
                stored += len(batch)
            else:
                log(f"  Qdrant warn: {r.status_code} {r.text[:80]}")
        time.sleep(0.05)
    return stored

# Test the embedder first. The collection is dropped below, so checking Ollama
# afterwards would leave an empty index behind whenever the run aborts here.
test = embed(["NK Rijeka"])
if not test:
    log("ERROR: Ollama not responding")
    tg("ERROR: Ollama not responding!")
    raise SystemExit(1)
embed_dim = len(test[0])
log(f"Embed test OK, dim={embed_dim}")

# Drop and recreate the collection (this is a full rebuild, not a verify).
# The vector size comes from the test embedding so it always matches the model.
requests.delete(f"{QDRANT}/collections/{COLL}", timeout=10)
time.sleep(1)
r = requests.put(f"{QDRANT}/collections/{COLL}",
    json={"vectors": {"size": embed_dim, "distance": "Cosine"}}, timeout=30)
if r.status_code != 200:
    # Without a collection every later upsert would fail; stop now instead.
    log(f"ERROR: collection create failed: {r.status_code} {r.text[:80]}")
    tg(f"ERROR: collection create failed ({r.status_code})")
    raise SystemExit(1)
log(f"Collection created: {r.status_code}")

# Process files: every godisnjak_*.txt under DATA except the "_layout" variants.
files = sorted(f for f in glob.glob(f"{DATA}/godisnjak_*.txt") if "_layout" not in f)
log(f"Files: {len(files)}")
tg(f"Starting embed of {len(files)} godišnjaka")

CHUNK_WORDS = 300    # words per window
CHUNK_STRIDE = 270   # window step, i.e. 30 words of overlap
EMBED_BATCH = 5      # texts per embed request

total = 0      # points actually handed to Qdrant
skipped = 0    # chunks dropped because their embed batch failed
point_id = 1

for fname in files:
    # Take the year from the file name only, so digits elsewhere in the
    # directory path can never be mistaken for it.
    year_match = re.search(r'(\d{4})', os.path.basename(fname))
    if not year_match:
        continue
    year = int(year_match.group(1))

    # Explicit encoding so the read does not depend on the process locale.
    # NOTE(review): assumes the yearbook text files are UTF-8 — confirm.
    with open(fname, encoding="utf-8") as f:
        text = f.read()

    # split() discards every run of whitespace, so no newline cleanup is needed.
    words = text.split()

    # NOTE(review): each window is cut to 800 chars before embedding and to
    # 600 chars in the payload, so text beyond those limits in a window is
    # neither embedded nor stored. Kept as-is because changing it alters the
    # index contents; confirm whether this truncation is intended.
    chunks = []
    for start in range(0, len(words), CHUNK_STRIDE):
        chunk = " ".join(words[start:start + CHUNK_WORDS])
        if len(chunk.strip()) > 80:
            chunks.append(chunk[:800])  # Max 800 chars sent to the embedder

    log(f"  {year}: {len(chunks)} chunks")

    for start in range(0, len(chunks), EMBED_BATCH):
        batch_texts = chunks[start:start + EMBED_BATCH]
        embs = embed(batch_texts)
        # An empty or short result means texts and vectors cannot be paired
        # reliably; skip the batch instead of letting zip() truncate silently.
        if len(embs) != len(batch_texts):
            skipped += len(batch_texts)
            log(f"  {year}: skipped chunks {start}-{start + len(batch_texts) - 1} "
                f"(got {len(embs)} embeddings for {len(batch_texts)} texts)")
            continue

        points = [{
            "id": point_id + j,
            "vector": emb,
            "payload": {
                "godina": year,
                "text": txt[:600],  # Strictly limit payload size
                "source": f"godisnjak_{year}",
                "chunk_idx": start + j
            }
        } for j, (txt, emb) in enumerate(zip(batch_texts, embs))]

        qdrant_upsert(points)
        point_id += len(points)
        total += len(points)
        time.sleep(0.1)

    log(f"  {year}: done ({total} total chunks so far)")

# Verify: compare what Qdrant reports with what was sent.
r = requests.get(f"{QDRANT}/collections/{COLL}", timeout=10)
try:
    info = r.json()
except ValueError:
    info = {}
pts = (info.get("result") or {}).get("points_count") or 0
if skipped == 0 and pts >= total:
    msg = f"✅ Embed done: {pts} vectors in {COLL}"
else:
    # points_count is the figure Qdrant reports at this moment; it can trail
    # the number sent if some upserts failed or were not yet applied.
    msg = (f"⚠️ Embed done with problems: {pts} vectors in {COLL} "
           f"({total} sent, {skipped} chunks skipped)")
log(msg)
tg(msg)