#!/usr/bin/env python3
"""Embed dokumenti into the Qdrant pgz_sport_dokumenti_v1 collection.

Strategy:
  1. Use existing sadrzaj for docs that have scraped content (> 200 chars).
  2. For docs without usable sadrzaj, embed title + kratak_opis + kljucne_rijeci.
  3. Chunk into 800-char windows that overlap by 100 chars.
  4. Embed each chunk with BGE-M3 via the local embedding server.
  5. Store vectors in Qdrant and chunk rows in pgz_sport.dokument_chunks.

Every run is a full rebuild: pgz_sport.dokument_chunks is truncated and the
Qdrant collection is dropped and recreated before anything is embedded.
"""
import psycopg2
import psycopg2.extras
import requests
import json
import re
import sys
from datetime import datetime

# NOTE(review): credentials are hard-coded in source; consider moving them to
# the environment or a secret store.
DB = dict(host='localhost', port=5432, dbname='rinet_v3', user='rinet',
          password='R1net2026!SecureDB#v7')
EMBED_URL = 'http://localhost:9879/api/embeddings'  # BGE-M3
QDRANT = 'http://10.10.0.2:6333'
COLL = 'pgz_sport_dokumenti_v1'
DIM = 1024
CHUNK_SIZE = 800
OVERLAP = 100
QDRANT_TIMEOUT = 30  # seconds, for collection-level (admin/stats) requests


def ensure_collection():
    """Create the Qdrant collection COLL if it does not exist yet.

    Raises:
        requests.RequestException: if Qdrant is unreachable, answers the
            existence check with an error other than 404, or rejects the
            create request.
    """
    url = f'{QDRANT}/collections/{COLL}'
    r = requests.get(url, timeout=QDRANT_TIMEOUT)
    if r.status_code == 200:
        return
    if r.status_code != 404:
        # Anything other than "not found" is a real failure, not a missing
        # collection; do not paper over it by trying to create.
        r.raise_for_status()
    created = requests.put(
        url,
        json={"vectors": {"size": DIM, "distance": "Cosine"}},
        timeout=QDRANT_TIMEOUT,
    )
    created.raise_for_status()
    print(f" ✓ Created collection {COLL}")


def embed_text(text):
    """Return the BGE-M3 embedding for *text*, or None if the reply has none.

    The reply is accepted in two shapes: a top-level ``embedding`` key, or
    ``data[0].embedding``.

    Raises:
        requests.RequestException: on transport errors or a non-2xx status.
    """
    r = requests.post(EMBED_URL, json={"model": "bge-m3", "prompt": text},
                      timeout=30)
    r.raise_for_status()
    body = r.json()  # parse once; the original parsed the body twice
    vec = body.get('embedding')
    if vec:
        return vec
    # `or [{}]` also covers an explicit empty list, which would otherwise
    # raise IndexError on [0].
    data = body.get('data') or [{}]
    return data[0].get('embedding')


def chunk_text(text, size=CHUNK_SIZE, overlap=OVERLAP):
    """Split *text* into windows of at most *size* chars sharing *overlap* chars.

    Whitespace runs are collapsed to single spaces first. Chunking stops as
    soon as a window reaches the end of the text, so the result never ends
    with a chunk that is fully contained in the previous one.

    Returns:
        A list of chunk strings; empty for empty or whitespace-only input.

    Raises:
        ValueError: if *size* is not positive or *overlap* >= *size*
            (the window would never advance).
    """
    if size <= 0 or overlap >= size:
        raise ValueError(
            f"need size > 0 and overlap < size, got size={size}, overlap={overlap}"
        )
    if not text:
        return []
    text = re.sub(r'\s+', ' ', text).strip()
    if not text:
        return []
    step = size - overlap
    chunks = []
    for start in range(0, len(text), step):
        chunks.append(text[start:start + size])
        if start + size >= len(text):
            break  # this window already covers the tail
    return chunks


def _doc_meta(d):
    """Return the per-document Qdrant payload fields for row *d*."""
    return {
        "dokument_id": d['id'],
        "title": (d.get('title') or '').strip()[:200],
        "vrsta": d.get('vrsta') or '',
        "razina": d.get('razina') or '',
        "organizacija": d.get('organizacija') or '',
        "sport": d.get('sport') or '',
        "sluzbeni_glasnik": d.get('sluzbeni_glasnik') or '',
        "izvor_url": d.get('izvor_url') or '',
    }


def _build_header(meta):
    """Return the context header that is prepended to every chunk of a doc."""
    header = f"[{meta['vrsta'].upper()} · {meta['razina']} · {meta['organizacija']}]\n"
    if meta['sport']:
        header += f"Sport: {meta['sport']}\n"
    if meta['sluzbeni_glasnik']:
        header += f"Službeni glasnik: {meta['sluzbeni_glasnik']}\n"
    return header


def _doc_chunks(d):
    """Return the text chunks to embed for row *d* (possibly empty)."""
    sadrzaj = (d.get('sadrzaj') or '').strip()
    if len(sadrzaj) > 200:
        return chunk_text(sadrzaj)
    # Too little scraped content: fall back to title + summary + keywords.
    title = (d.get('title') or '').strip()
    opis = (d.get('kratak_opis') or '').strip()
    raw_kw = d.get('kljucne_rijeci') or []
    # Presumably kljucne_rijeci is a text[] column; a plain string is used
    # as-is instead of being joined character by character. TODO confirm.
    if isinstance(raw_kw, str):
        kljuc = raw_kw
    else:
        kljuc = ', '.join(str(k) for k in raw_kw if k)
    text_for_embed = f"{title}\n{opis}\n{kljuc}".strip()
    return [text_for_embed] if text_for_embed else []


def _reset_storage(cu):
    """Empty pgz_sport.dokument_chunks and recreate the Qdrant collection."""
    cu.execute("TRUNCATE pgz_sport.dokument_chunks RESTART IDENTITY")
    # Dropping and recreating the collection is the simplest way to clear it.
    dropped = requests.delete(f'{QDRANT}/collections/{COLL}',
                              timeout=QDRANT_TIMEOUT)
    dropped.raise_for_status()
    ensure_collection()


def _store_chunk(cu, meta, header, idx, chunk, point_id):
    """Embed one chunk and persist it; return False if no vector came back.

    Qdrant is written first and its status checked, and only then the DB row
    is inserted, so a failed upsert can never leave a row that points at a
    missing point. Any failure raises and the caller keeps *point_id*.
    """
    full_chunk = header + chunk[:CHUNK_SIZE]
    vec = embed_text(full_chunk)
    if not vec:
        return False
    payload = dict(meta, chunk_index=idx, preview=chunk[:200])
    resp = requests.put(
        f'{QDRANT}/collections/{COLL}/points',
        params={"wait": "true"},  # apply before returning so final counts are accurate
        json={"points": [{"id": point_id, "vector": vec, "payload": payload}]},
        timeout=10,
    )
    resp.raise_for_status()
    cu.execute(
        """INSERT INTO pgz_sport.dokument_chunks
               (dokument_id, chunk_index, chunk_text, chunk_tokens,
                embedded_at, qdrant_point_id)
           VALUES (%s, %s, %s, %s, now(), %s)""",
        (meta['dokument_id'], idx, full_chunk, len(full_chunk.split()), point_id),
    )
    return True


def main():
    """Rebuild all chunk rows and Qdrant points for the active dokumenti."""
    conn = psycopg2.connect(**DB)
    conn.autocommit = True
    try:
        cu = conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor)
        # Fail fast if Qdrant is unreachable, before anything is truncated.
        ensure_collection()

        # Get all docs
        cu.execute("""SELECT id, title, sadrzaj, kratak_opis, vrsta, razina,
                             organizacija, sport, sluzbeni_glasnik, izvor_url,
                             kljucne_rijeci
                      FROM pgz_sport.dokumenti
                      WHERE COALESCE(aktivan,true)=true""")
        rows = cu.fetchall()
        print(f"Embedding {len(rows)} dokumenata…")

        _reset_storage(cu)

        point_id = 1
        n_emb = 0
        for d in rows:
            meta = _doc_meta(d)
            header = _build_header(meta)
            for idx, chunk in enumerate(_doc_chunks(d)):
                try:
                    stored = _store_chunk(cu, meta, header, idx, chunk, point_id)
                except Exception as e:
                    # Best effort: one bad chunk must not abort the rebuild.
                    print(f" err doc {d['id']} chunk {idx}: {e}")
                    continue
                if not stored:
                    continue
                # Advance only after both stores succeeded, so ids stay unique
                # and in step between Qdrant and the DB.
                point_id += 1
                n_emb += 1
                if n_emb % 25 == 0:
                    print(f" embedded {n_emb} chunks…")

        # Final count
        qstats = requests.get(f'{QDRANT}/collections/{COLL}',
                              timeout=QDRANT_TIMEOUT).json()
        print(f"\n✓ Embedded {n_emb} chunks total")
        print(f" Qdrant {COLL}: {qstats.get('result',{}).get('points_count',0)} points")
        cu.execute("SELECT count(*) AS n FROM pgz_sport.dokument_chunks")
        print(f" DB chunks: {cu.fetchone()['n']}")
    finally:
        conn.close()


if __name__ == '__main__':
    main()