#!/usr/bin/env python3
"""
PGZ Godišnjaci (yearbooks) embed pipeline - FIXED

- Batch size: 5 points max
- Embedded text: max 800 chars per chunk; stored payload text: max 600 chars
- Use direct Qdrant REST API (not qdrant_client)

Flow: check that Ollama can embed, recreate the Qdrant collection, then chunk,
embed and upsert every ``godisnjak_*.txt`` file, and finally report the point
count to the log and to Telegram.
"""
import glob
import json
import os
import re
import sys
import time
from datetime import datetime

import requests

QDRANT = "http://10.10.0.2:6333"
OLLAMA = "http://localhost:11434"
COLL = "pgz_godisnjaci"
DATA = "/opt/pgz-sport/_data/godisnjaci"
LOG = "/opt/pgz-sport/logs/godisnjak_pipeline.log"
# SECURITY: this bot token was committed in source. Rotate it and supply the
# new one via PGZ_TG_TOKEN; the literal fallback only keeps the existing
# deployment working until then.
TG_TOKEN = os.environ.get(
    "PGZ_TG_TOKEN", "8535797835:AAFItT-92jzZ9NWFafLxn0dLa1_n2s-JE5Y"
)
TG_CHAT = os.environ.get("PGZ_TG_CHAT", "7969491558")

EMBED_MODEL = "nomic-embed-text"
VECTOR_SIZE = 768         # vector size the collection is created with
BATCH = 5                 # texts per embed call / points per upsert call
CHUNK_WORDS = 300         # words per chunk window
CHUNK_STRIDE = 270        # window advance, i.e. 30 words of nominal overlap
MIN_CHUNK_CHARS = 80      # windows this short or shorter are dropped
EMBED_MAX_CHARS = 800     # chunk text is cut to this length before embedding
PAYLOAD_MAX_CHARS = 600   # text stored in the point payload is cut to this


def log(msg):
    """Print *msg* with an HH:MM:SS prefix and append the same line to LOG."""
    ts = datetime.now().strftime("%H:%M:%S")
    line = f"[{ts}] {msg}"
    print(line, flush=True)
    # Explicit UTF-8: messages contain non-ASCII characters.
    with open(LOG, "a", encoding="utf-8") as f:
        f.write(line + "\n")


def tg(msg):
    """Send *msg* to the Telegram chat; best effort, never raises on network errors."""
    try:
        requests.post(
            f"https://api.telegram.org/bot{TG_TOKEN}/sendMessage",
            data={"chat_id": TG_CHAT, "text": f"📚 Godišnjaci: {msg}"},
            timeout=10,
        )
    except requests.RequestException as exc:
        # Log only the exception type: its message embeds the request URL,
        # which contains the bot token.
        log(f"Telegram notify failed: {type(exc).__name__}")


def embed(texts):
    """Return one embedding per entry of *texts* from Ollama, or [] on any failure.

    Failures (connection error, non-200 status, unparsable body) are logged
    and reported as an empty list so callers can skip the batch.
    """
    try:
        r = requests.post(
            f"{OLLAMA}/api/embed",
            json={"model": EMBED_MODEL, "input": texts},
            timeout=60,
        )
    except requests.RequestException as exc:
        log(f"Embed error: {exc}")
        return []
    if r.status_code == 200:
        try:
            return r.json().get("embeddings", [])
        except ValueError as exc:
            log(f"Embed error: invalid JSON ({exc})")
            return []
    log(f"Embed error: {r.status_code} {r.text[:50]}")
    return []


def qdrant_upsert(points):
    """Upsert with small batches to avoid 33MB limit.

    Returns the number of points Qdrant accepted. Failed batches are logged
    as warnings and skipped rather than aborting the run.
    """
    stored = 0
    for start in range(0, len(points), BATCH):
        batch = points[start:start + BATCH]
        try:
            # wait=true makes Qdrant apply the write before answering, so the
            # status code reflects the result and the final count is current.
            r = requests.put(
                f"{QDRANT}/collections/{COLL}/points",
                params={"wait": "true"},
                json={"points": batch},
                timeout=30,
            )
        except requests.RequestException as exc:
            log(f" Qdrant warn: {exc}")
        else:
            if r.status_code in (200, 201):
                stored += len(batch)
            else:
                log(f" Qdrant warn: {r.status_code} {r.text[:80]}")
        time.sleep(0.05)
    return stored


def _chunk_text(text):
    """Split *text* into word windows, each cut to EMBED_MAX_CHARS characters.

    NOTE(review): a CHUNK_WORDS-word window is usually longer than
    EMBED_MAX_CHARS, so the tail of each window is cut off while the stride
    still advances CHUNK_STRIDE words; that tail is never embedded. Kept as
    is to preserve current output - confirm whether this is intended.
    """
    words = text.split()
    chunks = []
    for start in range(0, len(words), CHUNK_STRIDE):
        chunk = " ".join(words[start:start + CHUNK_WORDS])
        if len(chunk.strip()) > MIN_CHUNK_CHARS:
            chunks.append(chunk[:EMBED_MAX_CHARS])
    return chunks


def _recreate_collection():
    """Drop and recreate the collection; return True when creation succeeded."""
    try:
        requests.delete(f"{QDRANT}/collections/{COLL}", timeout=10)
        time.sleep(1)
        r = requests.put(
            f"{QDRANT}/collections/{COLL}",
            json={"vectors": {"size": VECTOR_SIZE, "distance": "Cosine"}},
            timeout=30,
        )
    except requests.RequestException as exc:
        log(f"Collection create failed: {exc}")
        return False
    log(f"Collection created: {r.status_code}")
    return r.status_code in (200, 201)


def _fail(msg):
    """Log *msg*, send it to Telegram and exit with status 1."""
    log(msg)
    tg(msg)
    sys.exit(1)


def main():
    """Run the whole pipeline; exits with status 1 if a precondition fails."""
    os.makedirs(os.path.dirname(LOG), exist_ok=True)

    # Test embed BEFORE touching Qdrant, so an Ollama outage does not wipe
    # the existing collection.
    test = embed(["NK Rijeka"])
    if not test:
        log("ERROR: Ollama not responding")
        tg("ERROR: Ollama not responding!")
        sys.exit(1)
    dim = len(test[0])
    log(f"Embed test OK, dim={dim}")
    if dim != VECTOR_SIZE:
        _fail(f"ERROR: embedding dim {dim} != collection size {VECTOR_SIZE}")

    # Create/verify collection
    if not _recreate_collection():
        _fail("ERROR: could not create Qdrant collection")

    # Process files
    files = sorted(
        f for f in glob.glob(f"{DATA}/godisnjak_*.txt")
        if "_layout" not in os.path.basename(f)
    )
    log(f"Files: {len(files)}")
    tg(f"Starting embed of {len(files)} godišnjaka")

    total = 0      # points actually accepted by Qdrant
    point_id = 1   # next free point id
    for fname in files:
        # Match on the file name only, so digits in DATA cannot be mistaken
        # for the year.
        year_match = re.search(r"(\d{4})", os.path.basename(fname))
        if not year_match:
            continue
        year = int(year_match.group(1))

        # errors="replace": one stray byte should not abort the whole run.
        with open(fname, encoding="utf-8", errors="replace") as f:
            text = f.read()

        # Clean
        text = re.sub(r"\n{3,}", "\n\n", text).strip()

        # Chunk: 300 words, 30 overlap
        chunks = _chunk_text(text)
        log(f" {year}: {len(chunks)} chunks")

        # Embed in batches of 5
        for start in range(0, len(chunks), BATCH):
            batch_texts = chunks[start:start + BATCH]
            embs = embed(batch_texts)
            if not embs:
                continue
            if len(embs) != len(batch_texts):
                log(f" {year}: got {len(embs)} embeddings "
                    f"for {len(batch_texts)} texts")
            points = [
                {
                    "id": point_id + j,
                    "vector": emb,
                    "payload": {
                        "godina": year,
                        # Strictly limit payload size
                        "text": txt[:PAYLOAD_MAX_CHARS],
                        "source": f"godisnjak_{year}",
                        "chunk_idx": start + j,
                    },
                }
                for j, (txt, emb) in enumerate(zip(batch_texts, embs))
            ]
            total += qdrant_upsert(points)
            point_id += len(points)
            time.sleep(0.1)

        log(f" {year}: done ({total} total chunks so far)")

    # Verify
    try:
        r = requests.get(f"{QDRANT}/collections/{COLL}", timeout=10)
        pts = (r.json().get("result") or {}).get("points_count", 0)
    except (requests.RequestException, ValueError) as exc:
        log(f"Verify failed: {exc}")
        pts = total  # fall back to the count of accepted upserts
    msg = f"✅ Embed done: {pts} vectors in {COLL}"
    log(msg)
    tg(msg)


if __name__ == "__main__":
    main()