129 lines
3.7 KiB
Python
129 lines
3.7 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
PGZ Godišnjaci embed pipeline - FIXED
|
|
- Batch size: 5 points max
|
|
- Payload text: max 800 chars (not 1000)
|
|
- Use direct Qdrant REST API (not qdrant_client)
|
|
"""
|
|
import os, glob, json, time, re, requests
|
|
from datetime import datetime
|
|
|
|
# --- Pipeline configuration -------------------------------------------------
# Base URL of the Qdrant REST API.
QDRANT = "http://10.10.0.2:6333"
# Base URL of the Ollama server used for embeddings.
OLLAMA = "http://localhost:11434"
# Name of the Qdrant collection the pipeline writes to.
COLL = "pgz_godisnjaci"
# Directory scanned for godisnjak_*.txt input files.
DATA = "/opt/pgz-sport/_data/godisnjaci"
# Append-only log file written by log().
LOG = "/opt/pgz-sport/logs/godisnjak_pipeline.log"

# SECURITY: a bot token committed to source is a leaked credential. Prefer
# supplying it through the PGZ_TG_TOKEN / PGZ_TG_CHAT environment variables;
# the literals below remain only as a backward-compatible fallback and the
# token should be rotated and then removed from this file.
TG_TOKEN = os.environ.get(
    "PGZ_TG_TOKEN", "8535797835:AAFItT-92jzZ9NWFafLxn0dLa1_n2s-JE5Y"
)
TG_CHAT = os.environ.get("PGZ_TG_CHAT", "7969491558")
|
|
|
|
# Make sure the directory that holds the log file exists before log() first
# appends to it. Derived from LOG so the two paths cannot drift apart.
os.makedirs(os.path.dirname(LOG), exist_ok=True)
|
|
|
|
def log(msg):
    """Print a timestamped message and append it to the pipeline log file.

    Args:
        msg: Text to record. It is prefixed with the local wall-clock time
            formatted as ``[HH:MM:SS]``.
    """
    ts = datetime.now().strftime("%H:%M:%S")
    line = f"[{ts}] {msg}"
    print(line, flush=True)
    # Explicit UTF-8: callers pass non-ASCII text (e.g. the final status
    # message), and the locale-dependent default encoding may not be able to
    # represent it, which would raise UnicodeEncodeError.
    with open(LOG, "a", encoding="utf-8") as f:
        f.write(line + "\n")
|
|
|
|
def tg(msg):
    """Send a best-effort Telegram notification.

    Posts ``msg`` (with a fixed prefix) to the configured chat through the
    Bot API ``sendMessage`` endpoint. A failed notification is logged and
    otherwise ignored so that it never interrupts the pipeline.

    Args:
        msg: Text to send.
    """
    try:
        requests.post(
            f"https://api.telegram.org/bot{TG_TOKEN}/sendMessage",
            data={"chat_id": TG_CHAT, "text": f"📚 Godišnjaci: {msg}"},
            timeout=10,
        )
    except Exception as exc:
        # Deliberately broad (notifications are optional), but unlike a bare
        # ``except`` this lets KeyboardInterrupt / SystemExit through.
        # Log only the exception type: the exception text may include the
        # request URL, which contains the bot token.
        log(f"Telegram notify failed: {type(exc).__name__}")
|
|
|
|
def embed(texts):
    """Embed a batch of texts with the ``nomic-embed-text`` model on Ollama.

    Args:
        texts: List of strings sent as ``input`` to ``POST /api/embed``.

    Returns:
        The ``embeddings`` list from the response, or ``[]`` on any failure
        (connection error, timeout, non-200 status, undecodable body).
        Callers treat an empty result as "nothing embedded".
    """
    try:
        r = requests.post(
            f"{OLLAMA}/api/embed",
            json={"model": "nomic-embed-text", "input": texts},
            timeout=60,
        )
    except requests.RequestException as exc:
        # An unreachable or slow server must surface as [] rather than a
        # traceback, otherwise callers' "not responding" handling never runs.
        log(f"Embed error: {type(exc).__name__} {str(exc)[:50]}")
        return []

    if r.status_code != 200:
        log(f"Embed error: {r.status_code} {r.text[:50]}")
        return []

    try:
        return r.json().get("embeddings") or []
    except ValueError:
        # Body was not valid JSON.
        log(f"Embed error: invalid JSON {r.text[:50]}")
        return []
|
|
|
|
def qdrant_upsert(points, batch_size=5):
    """Upsert points into the Qdrant collection in small batches.

    Requests are kept small on purpose; the original note on this function
    gives avoiding a 33MB limit as the reason.

    Args:
        points: List of point dicts sent as the ``points`` field of the
            request body.
        batch_size: Maximum number of points per PUT request. Defaults to 5.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        # A negative step would make range() empty and silently upsert nothing.
        raise ValueError("batch_size must be >= 1")

    for start in range(0, len(points), batch_size):
        batch = points[start:start + batch_size]
        r = requests.put(
            f"{QDRANT}/collections/{COLL}/points",
            json={"points": batch},
            timeout=30,
        )
        # A rejected batch is only reported; the remaining batches still run.
        if r.status_code not in (200, 201):
            log(f"  Qdrant warn: {r.status_code} {r.text[:80]}")
        # Short pause between consecutive requests.
        time.sleep(0.05)
|
|
|
|
# Test embed
# Probe Ollama BEFORE touching Qdrant: the existing collection is dropped
# below, so it must not be destroyed when no embeddings can be produced.
test = embed(["NK Rijeka"])
if not test:
    log("ERROR: Ollama not responding")
    tg("ERROR: Ollama not responding!")
    # SystemExit instead of exit(): exit() is injected by the site module and
    # is not guaranteed to exist.
    raise SystemExit(1)
dim = len(test[0])
log(f"Embed test OK, dim={dim}")

# Recreate collection
# The collection is dropped and rebuilt from scratch on every run.
requests.delete(f"{QDRANT}/collections/{COLL}", timeout=10)
time.sleep(1)
# Size the vectors from the probe embedding so the collection always matches
# what the embedding model actually returns.
r = requests.put(
    f"{QDRANT}/collections/{COLL}",
    json={"vectors": {"size": dim, "distance": "Cosine"}},
    timeout=30,
)
log(f"Collection created: {r.status_code}")
if r.status_code not in (200, 201):
    # Without a collection every later upsert would fail, so stop here.
    log(f"ERROR: collection create failed: {r.text[:80]}")
    tg("ERROR: Qdrant collection create failed!")
    raise SystemExit(1)
|
|
|
|
# Process files
def _chunk_text(text, max_chars=800, overlap_words=30, min_chars=80):
    """Split text into overlapping, whitespace-normalised chunks.

    Words are packed greedily until adding the next one would exceed
    ``max_chars``; the following chunk starts ``overlap_words`` words before
    the end of the previous one. Packing by character budget (instead of
    cutting a fixed 300-word window down to 800 chars while stepping 270
    words) guarantees that every word ends up in at least one chunk.

    Args:
        text: Raw document text.
        max_chars: Upper bound on the length of each chunk.
        overlap_words: Words repeated between consecutive chunks; capped at
            half of the previous chunk so progress is always made.
        min_chars: Chunks of this length or shorter are dropped.

    Returns:
        List of chunk strings, each at most ``max_chars`` long.
    """
    # split() collapses every whitespace run, so no separate newline clean-up
    # is needed before chunking.
    words = text.split()
    chunks = []
    start = 0
    while start < len(words):
        end = start
        size = 0
        while end < len(words):
            extra = len(words[end]) + (1 if end > start else 0)
            if end > start and size + extra > max_chars:
                break
            size += extra
            end += 1
        # The slice only matters for a single word longer than max_chars.
        chunk = " ".join(words[start:end])[:max_chars]
        if len(chunk) > min_chars:
            chunks.append(chunk)
        if end >= len(words):
            break
        taken = end - start
        start += max(1, taken - min(overlap_words, taken // 2))
    return chunks


files = sorted(
    f for f in glob.glob(f"{DATA}/godisnjak_*.txt") if "_layout" not in f
)
log(f"Files: {len(files)}")
tg(f"Starting embed of {len(files)} godišnjaka")

total = 0      # chunks produced so far, across all files
point_id = 1   # next free Qdrant point id; ids are sequential over the run

for fname in files:
    # Look for the year in the file name only, so digits elsewhere in the
    # path can never be mistaken for it.
    year = re.search(r'(\d{4})', os.path.basename(fname))
    if not year:
        continue
    year = int(year.group(1))

    # Explicit encoding: the default depends on the process locale.
    with open(fname, encoding="utf-8") as f:
        text = f.read()

    # Chunk: at most 800 chars each, 30 words overlap
    chunks = _chunk_text(text, max_chars=800, overlap_words=30, min_chars=80)

    log(f"  {year}: {len(chunks)} chunks")

    # Embed in batches of 5
    for i in range(0, len(chunks), 5):
        batch_texts = chunks[i:i+5]
        embs = embed(batch_texts)
        if not embs:
            # Report the gap instead of dropping the batch silently.
            log(f"  {year}: embed failed, skipped chunks {i}-{i + len(batch_texts) - 1}")
            continue

        points = [{
            "id": point_id + j,
            "vector": emb,
            "payload": {
                "godina": year,
                # Store exactly the text that was embedded (<= 800 chars).
                "text": txt,
                "source": f"godisnjak_{year}",
                "chunk_idx": i+j
            }
        } for j, (txt, emb) in enumerate(zip(batch_texts, embs))]

        qdrant_upsert(points)
        point_id += len(points)
        time.sleep(0.1)

    total += len(chunks)
    log(f"  {year}: done ({total} total chunks so far)")
|
|
|
|
# Verify: read the collection info back from Qdrant and report the number of
# stored points to both the log and Telegram.
r = requests.get(f"{QDRANT}/collections/{COLL}", timeout=10)
collection_info = r.json().get("result", {})
pts = collection_info.get("points_count", 0)
msg = f"✅ Embed done: {pts} vectors in {COLL}"
for report in (log, tg):
    report(msg)
|