feat: /api/v2/analiza/* endpoints - sport analytics backend
This commit is contained in:
Executable
+148
@@ -0,0 +1,148 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Embed dokumenti into Qdrant pgz_sport_dokumenti_v1 collection.
|
||||
Strategy:
|
||||
1. Use existing sadrzaj for docs that have content scraped
|
||||
2. For docs without sadrzaj — embed kratak_opis + naslov + organizacija
|
||||
3. Chunk into 800-char overlapping windows
|
||||
4. BGE-M3 embed via local server
|
||||
5. Store in Qdrant + dokument_chunks
|
||||
"""
|
||||
import os
|
||||
import psycopg2
|
||||
import psycopg2.extras
|
||||
import requests
|
||||
import json
|
||||
import re
|
||||
import sys
|
||||
from datetime import datetime
|
||||
|
||||
DB = dict(host='localhost', port=5432, dbname='rinet_v3', user='rinet', password=os.environ["DB_PASSWORD"])
|
||||
EMBED_URL = 'http://localhost:9879/api/embeddings' # BGE-M3
|
||||
QDRANT = 'http://10.10.0.2:6333'
|
||||
COLL = 'pgz_sport_dokumenti_v1'
|
||||
DIM = 1024
|
||||
CHUNK_SIZE = 800
|
||||
OVERLAP = 100
|
||||
|
||||
def ensure_collection():
|
||||
r = requests.get(f'{QDRANT}/collections/{COLL}')
|
||||
if r.status_code == 200:
|
||||
return
|
||||
requests.put(f'{QDRANT}/collections/{COLL}', json={
|
||||
"vectors": {"size": DIM, "distance": "Cosine"}
|
||||
})
|
||||
print(f" ✓ Created collection {COLL}")
|
||||
|
||||
def embed_text(text):
|
||||
"""BGE-M3 embedding."""
|
||||
r = requests.post(EMBED_URL, json={"model":"bge-m3","prompt":text}, timeout=30)
|
||||
return r.json().get('embedding') or r.json().get('data', [{}])[0].get('embedding')
|
||||
|
||||
def chunk_text(text, size=CHUNK_SIZE, overlap=OVERLAP):
|
||||
"""Split into overlapping chunks."""
|
||||
if not text: return []
|
||||
text = re.sub(r'\s+', ' ', text).strip()
|
||||
if len(text) <= size:
|
||||
return [text]
|
||||
chunks = []
|
||||
i = 0
|
||||
while i < len(text):
|
||||
chunks.append(text[i:i+size])
|
||||
i += size - overlap
|
||||
return chunks
|
||||
|
||||
def main():
|
||||
conn = psycopg2.connect(**DB); conn.autocommit = True
|
||||
cu = conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor)
|
||||
|
||||
ensure_collection()
|
||||
|
||||
# Get all docs
|
||||
cu.execute("""SELECT id, title, sadrzaj, kratak_opis, vrsta, razina, organizacija,
|
||||
sport, sluzbeni_glasnik, izvor_url, kljucne_rijeci
|
||||
FROM pgz_sport.dokumenti WHERE COALESCE(aktivan,true)=true""")
|
||||
rows = cu.fetchall()
|
||||
print(f"Embedding {len(rows)} dokumenata…")
|
||||
|
||||
# Clear existing chunks
|
||||
cu.execute("TRUNCATE pgz_sport.dokument_chunks RESTART IDENTITY")
|
||||
requests.delete(f'{QDRANT}/collections/{COLL}/points/delete',
|
||||
json={"filter":{"must":[{"key":"_dummy","match":{"value":"any"}}]}})
|
||||
# Easier — recreate
|
||||
requests.delete(f'{QDRANT}/collections/{COLL}')
|
||||
ensure_collection()
|
||||
|
||||
point_id = 1
|
||||
n_emb = 0
|
||||
for d in rows:
|
||||
# Build embed text
|
||||
title = (d.get('title') or '').strip()
|
||||
opis = (d.get('kratak_opis') or '').strip()
|
||||
sadrzaj = (d.get('sadrzaj') or '').strip()
|
||||
org = d.get('organizacija') or ''
|
||||
razina = d.get('razina') or ''
|
||||
vrsta = d.get('vrsta') or ''
|
||||
sport = d.get('sport') or ''
|
||||
kljuc = ', '.join(d.get('kljucne_rijeci') or [])
|
||||
glasnik = d.get('sluzbeni_glasnik') or ''
|
||||
|
||||
# Header injected into every chunk for context
|
||||
header = f"[{vrsta.upper()} · {razina} · {org}]\n"
|
||||
if sport: header += f"Sport: {sport}\n"
|
||||
if glasnik: header += f"Službeni glasnik: {glasnik}\n"
|
||||
|
||||
# Strategy: if sadrzaj > 200, chunk it. Else use kratak_opis+title.
|
||||
if sadrzaj and len(sadrzaj) > 200:
|
||||
chunks = chunk_text(sadrzaj)
|
||||
else:
|
||||
text_for_embed = f"{title}\n{opis}\n{kljuc}".strip()
|
||||
chunks = [text_for_embed] if text_for_embed else []
|
||||
|
||||
if not chunks: continue
|
||||
|
||||
for idx, chunk in enumerate(chunks):
|
||||
full_chunk = header + chunk[:CHUNK_SIZE]
|
||||
try:
|
||||
vec = embed_text(full_chunk)
|
||||
if not vec:
|
||||
continue
|
||||
# Save chunk to DB
|
||||
cu.execute("""INSERT INTO pgz_sport.dokument_chunks
|
||||
(dokument_id, chunk_index, chunk_text, chunk_tokens, embedded_at, qdrant_point_id)
|
||||
VALUES (%s, %s, %s, %s, now(), %s)""",
|
||||
(d['id'], idx, full_chunk, len(full_chunk.split()), point_id))
|
||||
# Upsert into Qdrant
|
||||
payload = {
|
||||
"dokument_id": d['id'],
|
||||
"chunk_index": idx,
|
||||
"title": title[:200],
|
||||
"vrsta": vrsta,
|
||||
"razina": razina,
|
||||
"organizacija": org,
|
||||
"sport": sport,
|
||||
"sluzbeni_glasnik": glasnik,
|
||||
"izvor_url": d.get('izvor_url') or '',
|
||||
"preview": chunk[:200],
|
||||
}
|
||||
requests.put(f'{QDRANT}/collections/{COLL}/points',
|
||||
json={"points":[{"id": point_id, "vector": vec, "payload": payload}]},
|
||||
timeout=10)
|
||||
point_id += 1
|
||||
n_emb += 1
|
||||
except Exception as e:
|
||||
print(f" err doc {d['id']} chunk {idx}: {e}")
|
||||
continue
|
||||
if n_emb % 25 == 0 and n_emb > 0:
|
||||
print(f" embedded {n_emb} chunks…")
|
||||
|
||||
# Final count
|
||||
qstats = requests.get(f'{QDRANT}/collections/{COLL}').json()
|
||||
print(f"\n✓ Embedded {n_emb} chunks total")
|
||||
print(f" Qdrant {COLL}: {qstats.get('result',{}).get('points_count',0)} points")
|
||||
|
||||
cu.execute("SELECT count(*) AS n FROM pgz_sport.dokument_chunks")
|
||||
print(f" DB chunks: {cu.fetchone()['n']}")
|
||||
conn.close()
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
Reference in New Issue
Block a user