Files
pgz-sport/scrapers/dok_embedder.py_prije_env_deepseek

149 lines
5.4 KiB
Python
Executable File

#!/usr/bin/env python3
"""Embed dokumenti into Qdrant pgz_sport_dokumenti_v1 collection.
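Strategy:
1. Use the scraped sadrzaj for docs whose content is longer than 200 characters
2. For all other docs — embed title + kratak_opis + kljucne_rijeci
3. Chunk sadrzaj into 800-char windows overlapping by 100 chars; prefix every
   chunk with a vrsta / razina / organizacija header
4. BGE-M3 embed via local server
5. Store vectors in Qdrant + chunk rows in pgz_sport.dokument_chunks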
"""
import os
import psycopg2
import psycopg2.extras
import requests
import json
import re
import sys
from datetime import datetime
# PostgreSQL connection parameters. The password is read from the DB_PASSWORD
# environment variable, so loading this module raises KeyError when it is unset.
DB = dict(host='localhost', port=5432, dbname='rinet_v3', user='rinet', password=os.environ["DB_PASSWORD"])
# Embedding endpoint; embed_text() posts to it with model "bge-m3".
EMBED_URL = 'http://localhost:9879/api/embeddings' # BGE-M3
# Base URL of the Qdrant REST API.
QDRANT = 'http://10.10.0.2:6333'
# Name of the Qdrant collection that receives the chunk vectors.
COLL = 'pgz_sport_dokumenti_v1'
# Vector size the collection is created with (see ensure_collection()).
DIM = 1024
# Default window length and overlap, in characters, used by chunk_text().
CHUNK_SIZE = 800
OVERLAP = 100
def ensure_collection():
    """Create the Qdrant collection ``COLL`` if it does not exist yet.

    The collection is created with vectors of size ``DIM`` and cosine
    distance. Nothing happens when the collection is already present.

    Raises:
        requests.HTTPError: if the existence check fails with an error other
            than 404, or if Qdrant rejects the create request.
        requests.RequestException: on connection problems or timeouts.
    """
    url = f'{QDRANT}/collections/{COLL}'
    r = requests.get(url, timeout=10)
    if r.status_code == 200:
        return
    if r.status_code != 404:
        # Anything other than "not found" is a server or auth problem that
        # creating the collection would not fix, so surface it instead.
        r.raise_for_status()
    created = requests.put(url, json={
        "vectors": {"size": DIM, "distance": "Cosine"}
    }, timeout=30)
    # Only report success when Qdrant actually accepted the request.
    created.raise_for_status()
    print(f" ✓ Created collection {COLL}")
def embed_text(text):
    """Return the BGE-M3 embedding of *text* from the local embedding server.

    Two response shapes are accepted: a top-level ``"embedding"`` field, or a
    ``"data"`` list whose first item carries ``"embedding"``.

    Args:
        text: The string to embed.

    Returns:
        The embedding as returned by the server, or ``None`` when the
        response contains no embedding.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.RequestException: on connection problems or timeouts.
        ValueError: if the response body is not valid JSON.
    """
    r = requests.post(EMBED_URL, json={"model": "bge-m3", "prompt": text}, timeout=30)
    # Fail loudly on 4xx/5xx instead of trying to read an error body.
    r.raise_for_status()
    body = r.json()  # parse once; the original decoded the body twice
    vec = body.get('embedding')
    if vec:
        return vec
    # `or [{}]` also covers an explicit empty "data" list, which would
    # otherwise raise IndexError on the [0] lookup.
    data = body.get('data') or [{}]
    return data[0].get('embedding')
def chunk_text(text, size=CHUNK_SIZE, overlap=OVERLAP):
    """Split *text* into overlapping fixed-size character windows.

    Runs of whitespace are collapsed to a single space and the text is
    stripped before it is split.

    Args:
        text: The text to split; may be ``None`` or empty.
        size: Maximum number of characters per chunk.
        overlap: Number of characters shared by consecutive chunks.

    Returns:
        A list of chunks. It is empty for ``None``, empty or whitespace-only
        input, and holds a single item when the normalized text fits into
        one chunk.

    Raises:
        ValueError: if the text has to be split and ``overlap >= size``,
            which would never advance the window.
    """
    if not text:
        return []
    text = re.sub(r'\s+', ' ', text).strip()
    if not text:
        # Whitespace-only input normalizes to '', so there is nothing to embed.
        return []
    if len(text) <= size:
        return [text]
    step = size - overlap
    if step <= 0:
        raise ValueError(
            f"size - overlap must be positive (size={size}, overlap={overlap})")
    chunks = []
    for start in range(0, len(text), step):
        chunks.append(text[start:start + size])
        if start + size >= len(text):
            # This window already reaches the end of the text. A further
            # window would only repeat part of its overlap region.
            break
    return chunks
def _store_chunk(cu, point_id, doc_id, idx, text, vec, payload):
    """Upsert one embedded chunk into Qdrant, then record it in dokument_chunks.

    Qdrant is written first and must accept the upsert before the database
    row is inserted, so a failed upsert never leaves a row that points at a
    missing vector.

    Raises:
        requests.RequestException: if the upsert fails or is rejected.
        psycopg2.Error: if the insert fails.
    """
    r = requests.put(
        f'{QDRANT}/collections/{COLL}/points',
        json={"points": [{"id": point_id, "vector": vec, "payload": payload}]},
        timeout=10)
    r.raise_for_status()
    # chunk_tokens is a whitespace-separated word count, not a model token count.
    cu.execute(
        """INSERT INTO pgz_sport.dokument_chunks
        (dokument_id, chunk_index, chunk_text, chunk_tokens, embedded_at, qdrant_point_id)
        VALUES (%s, %s, %s, %s, now(), %s)""",
        (doc_id, idx, text, len(text.split()), point_id))


def main():
    """Rebuild the chunk table and the Qdrant collection from pgz_sport.dokumenti.

    This is a full, destructive rebuild: pgz_sport.dokument_chunks is
    truncated and the Qdrant collection is dropped and recreated before any
    document is embedded. Each active document is then split into chunks,
    every chunk is embedded and stored in both Qdrant and the chunk table,
    and the final counts are printed.

    A failure on a single chunk is reported and skipped; failures during
    setup (database, collection reset) propagate to the caller.
    """
    conn = psycopg2.connect(**DB)
    conn.autocommit = True
    try:
        cu = conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor)
        ensure_collection()

        # Get all docs
        cu.execute("""SELECT id, title, sadrzaj, kratak_opis, vrsta, razina, organizacija,
            sport, sluzbeni_glasnik, izvor_url, kljucne_rijeci
            FROM pgz_sport.dokumenti WHERE COALESCE(aktivan,true)=true""")
        rows = cu.fetchall()
        print(f"Embedding {len(rows)} dokumenata…")

        # Clear existing chunks, then drop and recreate the collection so no
        # stale points survive the rebuild.
        cu.execute("TRUNCATE pgz_sport.dokument_chunks RESTART IDENTITY")
        dropped = requests.delete(f'{QDRANT}/collections/{COLL}', timeout=30)
        dropped.raise_for_status()
        ensure_collection()

        point_id = 1
        n_emb = 0
        for d in rows:
            # Build embed text
            title = (d.get('title') or '').strip()
            opis = (d.get('kratak_opis') or '').strip()
            sadrzaj = (d.get('sadrzaj') or '').strip()
            org = d.get('organizacija') or ''
            razina = d.get('razina') or ''
            vrsta = d.get('vrsta') or ''
            sport = d.get('sport') or ''
            kljuc = ', '.join(d.get('kljucne_rijeci') or [])
            glasnik = d.get('sluzbeni_glasnik') or ''

            # Header injected into every chunk for context
            header = f"[{vrsta.upper()} · {razina} · {org}]\n"
            if sport:
                header += f"Sport: {sport}\n"
            if glasnik:
                header += f"Službeni glasnik: {glasnik}\n"

            # Strategy: if sadrzaj > 200 chars, chunk it. Else use title + kratak_opis + keywords.
            if len(sadrzaj) > 200:
                chunks = chunk_text(sadrzaj)
            else:
                text_for_embed = f"{title}\n{opis}\n{kljuc}".strip()
                chunks = [text_for_embed] if text_for_embed else []

            for idx, chunk in enumerate(chunks):
                # The slice caps the fallback text, which is not windowed.
                full_chunk = header + chunk[:CHUNK_SIZE]
                try:
                    vec = embed_text(full_chunk)
                    if not vec:
                        print(f" err doc {d['id']} chunk {idx}: no embedding returned")
                        continue
                    payload = {
                        "dokument_id": d['id'],
                        "chunk_index": idx,
                        "title": title[:200],
                        "vrsta": vrsta,
                        "razina": razina,
                        "organizacija": org,
                        "sport": sport,
                        "sluzbeni_glasnik": glasnik,
                        "izvor_url": d.get('izvor_url') or '',
                        "preview": chunk[:200],
                    }
                    _store_chunk(cu, point_id, d['id'], idx, full_chunk, vec, payload)
                except Exception as e:
                    # Best effort per chunk: report the failure and move on so
                    # one bad chunk does not abort the whole rebuild.
                    print(f" err doc {d['id']} chunk {idx}: {e}")
                    continue
                # Advance the id only after both stores succeeded, so ids stay
                # unique and match the rows in dokument_chunks.
                point_id += 1
                n_emb += 1
                if n_emb % 25 == 0:
                    print(f" embedded {n_emb} chunks…")

        # Final count
        qstats = requests.get(f'{QDRANT}/collections/{COLL}', timeout=10).json()
        print(f"\n✓ Embedded {n_emb} chunks total")
        print(f" Qdrant {COLL}: {qstats.get('result',{}).get('points_count',0)} points")
        cu.execute("SELECT count(*) AS n FROM pgz_sport.dokument_chunks")
        print(f" DB chunks: {cu.fetchone()['n']}")
    finally:
        # Release the connection even when setup or the final report fails.
        conn.close()
# Run the full re-embedding only when executed as a script, not on import.
if __name__ == '__main__':
    main()