149 lines
5.4 KiB
Python
Executable File
149 lines
5.4 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""Embed dokumenti into Qdrant pgz_sport_dokumenti_v1 collection.
|
|
Strategy:
|
|
1. Use existing sadrzaj for docs that have content scraped
|
|
2. For docs whose sadrzaj is missing or at most 200 characters — embed title + kratak_opis + kljucne_rijeci
|
|
3. Chunk into 800-char overlapping windows
|
|
4. BGE-M3 embed via local server
|
|
5. Store in Qdrant + dokument_chunks
|
|
"""
|
|
import os
|
|
import psycopg2
|
|
import psycopg2.extras
|
|
import requests
|
|
import json
|
|
import re
|
|
import sys
|
|
from datetime import datetime
|
|
|
|
# PostgreSQL connection keyword arguments. The password is read from the
# DB_PASSWORD environment variable; a missing variable raises KeyError here.
DB = {
    'host': 'localhost',
    'port': 5432,
    'dbname': 'rinet_v3',
    'user': 'rinet',
    'password': os.environ["DB_PASSWORD"],
}

# Embedding endpoint (BGE-M3) and Qdrant base URL.
EMBED_URL = 'http://localhost:9879/api/embeddings'
QDRANT = 'http://10.10.0.2:6333'

# Target collection and the vector size it is created with.
COLL = 'pgz_sport_dokumenti_v1'
DIM = 1024

# Default window length and overlap (in characters) used by chunk_text.
CHUNK_SIZE = 800
OVERLAP = 100
|
|
|
|
def ensure_collection():
    """Create the Qdrant collection ``COLL`` unless it already exists.

    A GET on the collection URL that answers 200 means the collection is
    present and nothing is done. Any other status triggers a create request
    with ``DIM``-sized vectors and cosine distance.

    Raises:
        requests.HTTPError: if Qdrant rejects the create request.
        requests.RequestException: on connection errors or timeouts.
    """
    url = f'{QDRANT}/collections/{COLL}'

    lookup = requests.get(url, timeout=10)
    if lookup.status_code == 200:
        return

    created = requests.put(
        url,
        json={"vectors": {"size": DIM, "distance": "Cosine"}},
        timeout=30,
    )
    # Without this check a rejected create would still be reported as success.
    created.raise_for_status()
    print(f" ✓ Created collection {COLL}")
|
|
|
|
def embed_text(text):
    """Return the BGE-M3 embedding for ``text`` from the local embed server.

    The vector is read from a top-level ``embedding`` key or, failing that,
    from ``data[0]["embedding"]``.

    Args:
        text: The string to embed.

    Returns:
        The embedding exactly as parsed from the JSON response, or ``None``
        when the response contains no embedding in either location.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.RequestException: on connection errors or timeouts.
        ValueError: if the response body is not valid JSON.
    """
    r = requests.post(
        EMBED_URL,
        json={"model": "bge-m3", "prompt": text},
        timeout=30,
    )
    # Surface server errors instead of treating an error body as "no vector".
    r.raise_for_status()

    body = r.json()  # parse once; the original decoded the body twice
    vec = body.get('embedding')
    if vec:
        return vec

    # Fall back to data[0]; tolerate a missing, null or empty "data" list.
    data = body.get('data') or [{}]
    return data[0].get('embedding')
|
|
|
|
def chunk_text(text, size=CHUNK_SIZE, overlap=OVERLAP):
    """Split ``text`` into overlapping character windows.

    Whitespace runs are collapsed to single spaces and the text is stripped
    before splitting. Consecutive chunks start ``size - overlap`` characters
    apart, so each chunk repeats the last ``overlap`` characters of the
    previous one.

    Args:
        text: Input string; ``None`` or empty yields ``[]``.
        size: Maximum chunk length in characters.
        overlap: Number of characters shared by neighbouring chunks.

    Returns:
        A list of chunk strings. Text that fits in ``size`` is returned as a
        single chunk; whitespace-only text yields ``[]``.

    Raises:
        ValueError: if the text needs splitting and ``size - overlap`` is not
            positive (the window would never advance).
    """
    if not text:
        return []

    text = re.sub(r'\s+', ' ', text).strip()
    if not text:
        # Whitespace-only input must not produce an empty chunk.
        return []
    if len(text) <= size:
        return [text]

    step = size - overlap
    if step <= 0:
        # The original loop never terminated in this case.
        raise ValueError(
            f"overlap ({overlap}) must be smaller than size ({size})"
        )

    chunks = []
    for start in range(0, len(text), step):
        chunks.append(text[start:start + size])
        # Stop once a window reaches the end of the text; a further window
        # would only repeat the overlap already stored in this chunk.
        if start + size >= len(text):
            break
    return chunks
|
|
|
|
def _build_header(vrsta, razina, org, sport, glasnik):
    """Build the context header that is prepended to every chunk of a document."""
    header = f"[{vrsta.upper()} · {razina} · {org}]\n"
    if sport:
        header += f"Sport: {sport}\n"
    if glasnik:
        header += f"Službeni glasnik: {glasnik}\n"
    return header


def _document_chunks(title, opis, sadrzaj, kljuc):
    """Return the text pieces to embed for one document (possibly empty)."""
    # Only content longer than 200 characters is chunked; anything shorter
    # falls back to title + short description + keywords.
    if sadrzaj and len(sadrzaj) > 200:
        return chunk_text(sadrzaj)
    text_for_embed = f"{title}\n{opis}\n{kljuc}".strip()
    return [text_for_embed] if text_for_embed else []


def _store_chunk(cu, point_id, dokument_id, idx, full_chunk, vec, payload):
    """Upsert one point into Qdrant, then record the chunk row in Postgres."""
    # Qdrant goes first: if the upsert fails, no DB row is left pointing at a
    # point that does not exist.
    r = requests.put(
        f'{QDRANT}/collections/{COLL}/points',
        json={"points": [{"id": point_id, "vector": vec, "payload": payload}]},
        timeout=10,
    )
    r.raise_for_status()

    cu.execute(
        """INSERT INTO pgz_sport.dokument_chunks
           (dokument_id, chunk_index, chunk_text, chunk_tokens, embedded_at, qdrant_point_id)
           VALUES (%s, %s, %s, %s, now(), %s)""",
        (dokument_id, idx, full_chunk, len(full_chunk.split()), point_id),
    )


def main():
    """Rebuild all document embeddings in Qdrant and ``dokument_chunks``.

    Reads every active row from ``pgz_sport.dokumenti``, drops and recreates
    the Qdrant collection, truncates ``pgz_sport.dokument_chunks``, then
    embeds each document chunk and stores it in both places. A failure on a
    single chunk is reported and skipped; the run continues.

    Raises:
        psycopg2.Error: if connecting, selecting or truncating fails.
        requests.RequestException: if the collection cannot be reset or the
            final statistics cannot be fetched.
    """
    conn = psycopg2.connect(**DB)
    conn.autocommit = True
    try:
        cu = conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor)

        ensure_collection()

        # Get all docs
        cu.execute(
            """SELECT id, title, sadrzaj, kratak_opis, vrsta, razina, organizacija,
                      sport, sluzbeni_glasnik, izvor_url, kljucne_rijeci
               FROM pgz_sport.dokumenti WHERE COALESCE(aktivan,true)=true"""
        )
        rows = cu.fetchall()
        print(f"Embedding {len(rows)} dokumenata…")

        # Reset both stores. The collection is dropped first so that an
        # unreachable Qdrant aborts the run before the DB chunks are wiped.
        dropped = requests.delete(f'{QDRANT}/collections/{COLL}', timeout=30)
        dropped.raise_for_status()
        ensure_collection()
        cu.execute("TRUNCATE pgz_sport.dokument_chunks RESTART IDENTITY")

        point_id = 1
        n_emb = 0
        n_fail = 0
        for d in rows:
            title = (d.get('title') or '').strip()
            opis = (d.get('kratak_opis') or '').strip()
            sadrzaj = (d.get('sadrzaj') or '').strip()
            org = d.get('organizacija') or ''
            razina = d.get('razina') or ''
            vrsta = d.get('vrsta') or ''
            sport = d.get('sport') or ''
            glasnik = d.get('sluzbeni_glasnik') or ''

            # NOTE(review): the column type of kljucne_rijeci is not visible
            # here. A plain string is used as-is, because joining it would
            # insert ", " between its characters.
            raw_keywords = d.get('kljucne_rijeci') or []
            if isinstance(raw_keywords, str):
                kljuc = raw_keywords
            else:
                kljuc = ', '.join(raw_keywords)

            # Header injected into every chunk for context
            header = _build_header(vrsta, razina, org, sport, glasnik)

            chunks = _document_chunks(title, opis, sadrzaj, kljuc)
            if not chunks:
                continue

            for idx, chunk in enumerate(chunks):
                full_chunk = header + chunk[:CHUNK_SIZE]
                payload = {
                    "dokument_id": d['id'],
                    "chunk_index": idx,
                    "title": title[:200],
                    "vrsta": vrsta,
                    "razina": razina,
                    "organizacija": org,
                    "sport": sport,
                    "sluzbeni_glasnik": glasnik,
                    "izvor_url": d.get('izvor_url') or '',
                    "preview": chunk[:200],
                }
                try:
                    vec = embed_text(full_chunk)
                    if not vec:
                        n_fail += 1
                        print(f" skip doc {d['id']} chunk {idx}: no embedding returned")
                        continue
                    _store_chunk(cu, point_id, d['id'], idx, full_chunk, vec, payload)
                except Exception as e:
                    # Deliberately broad: one bad chunk must not abort the
                    # whole rebuild. point_id is not advanced, so the next
                    # successful upsert reuses (and overwrites) this id.
                    n_fail += 1
                    print(f" err doc {d['id']} chunk {idx}: {e}")
                    continue

                point_id += 1
                n_emb += 1
                if n_emb % 25 == 0:
                    print(f" embedded {n_emb} chunks…")

        # Final count
        qstats = requests.get(f'{QDRANT}/collections/{COLL}', timeout=10).json()
        print(f"\n✓ Embedded {n_emb} chunks total")
        if n_fail:
            print(f" ✗ {n_fail} chunks failed or returned no embedding")
        points = (qstats.get('result') or {}).get('points_count', 0)
        print(f" Qdrant {COLL}: {points} points")

        cu.execute("SELECT count(*) AS n FROM pgz_sport.dokument_chunks")
        print(f" DB chunks: {cu.fetchone()['n']}")
    finally:
        # Close the connection even when the run aborts part-way.
        conn.close()
|
|
|
|
# Run the full re-embedding only when executed as a script, not on import.
if __name__ == '__main__':
    main()
|