PGŽ Sport Platform — Round 1+2 baseline (sport2.html + API)
This commit is contained in:
Executable
+193
@@ -0,0 +1,193 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
geocode_objekti_v2.py — precision geocoding for pgz_sport.sportski_objekti
|
||||
|
||||
Re-geocodes all objects via Nominatim using {naziv} + {grad} + ", Hrvatska" queries.
|
||||
Verifies result is within PGŽ bounds (44.5-45.6, 14.0-15.1) and NOT a duplicated
|
||||
"city centroid" (where multiple objects share identical coordinates from a previous
|
||||
fallback pass). Updates lat/lng only when a more precise result is found.
|
||||
|
||||
Usage: python3 geocode_objekti_v2.py [--dry-run] [--only-duplicates]
|
||||
"""
|
||||
import os, sys, time, json, urllib.parse, argparse
|
||||
import urllib.request
|
||||
import psycopg2
|
||||
|
||||
PG = dict(host=os.environ.get('PG_HOST','10.10.0.2'),
|
||||
port=int(os.environ.get('PG_PORT','6432')),
|
||||
dbname=os.environ.get('PG_DB','rinet_v3'),
|
||||
user=os.environ.get('PG_USER','rinet'),
|
||||
password=os.environ.get('PG_PASS',''))
|
||||
|
||||
PGZ_LAT = (44.5, 45.6)
|
||||
PGZ_LNG = (14.0, 15.1)
|
||||
|
||||
UA = 'pgz-sport/2.0 (dradulic@outlook.com)'
|
||||
|
||||
def nominatim(q, country='hr', limit=3):
|
||||
url = ('https://nominatim.openstreetmap.org/search?'
|
||||
'q='+urllib.parse.quote(q)+
|
||||
'&format=json&limit='+str(limit)+
|
||||
'&countrycodes='+country+
|
||||
'&addressdetails=1')
|
||||
req = urllib.request.Request(url, headers={'User-Agent': UA})
|
||||
try:
|
||||
with urllib.request.urlopen(req, timeout=10) as r:
|
||||
return json.loads(r.read().decode())
|
||||
except Exception as e:
|
||||
print(f' ! nominatim error: {e}')
|
||||
return []
|
||||
|
||||
def in_pgz(lat, lng):
|
||||
return PGZ_LAT[0] <= lat <= PGZ_LAT[1] and PGZ_LNG[0] <= lng <= PGZ_LNG[1]
|
||||
|
||||
def best_result(results):
|
||||
"""Pick best precision: prefer leisure/sports types, then building, then place."""
|
||||
if not results:
|
||||
return None
|
||||
type_priority = {
|
||||
'sports_centre': 100, 'stadium': 95, 'pitch': 90, 'swimming_pool': 90,
|
||||
'sports_hall': 95, 'leisure': 80, 'building': 70, 'tourism': 60,
|
||||
'highway': 30, 'place': 20,
|
||||
}
|
||||
best = None
|
||||
best_score = -1
|
||||
for r in results:
|
||||
try:
|
||||
lat = float(r['lat']); lng = float(r['lon'])
|
||||
except (KeyError, ValueError):
|
||||
continue
|
||||
if not in_pgz(lat, lng):
|
||||
continue
|
||||
cls = r.get('class','')
|
||||
typ = r.get('type','')
|
||||
# importance is Nominatim's intrinsic relevance score
|
||||
importance = float(r.get('importance', 0))
|
||||
score = type_priority.get(typ, type_priority.get(cls, 50)) + importance*10
|
||||
if score > best_score:
|
||||
best_score = score
|
||||
best = (lat, lng, r)
|
||||
return best
|
||||
|
||||
def queries_for(naziv, grad, adresa):
|
||||
"""Generate ordered queries from most specific to most general."""
|
||||
qs = []
|
||||
n = (naziv or '').strip()
|
||||
g = (grad or '').strip()
|
||||
a = (adresa or '').strip()
|
||||
if a and g:
|
||||
qs.append(f'{a}, {g}, Hrvatska')
|
||||
if n and g:
|
||||
qs.append(f'{n}, {g}, Hrvatska')
|
||||
# Strip common prefixes for a cleaner search
|
||||
short = n
|
||||
for prefix in ('Sportska dvorana ', 'Gradska sportska dvorana ',
|
||||
'Multifunkcionalna dvorana za sport i turizam ',
|
||||
'Stadion ', 'Bazen ', 'Bazeni ', 'Dvorana ',
|
||||
'Boćalište ', 'Kuglana ', 'Marina '):
|
||||
if short.startswith(prefix):
|
||||
short = short[len(prefix):].strip()
|
||||
break
|
||||
if short and short != n and g:
|
||||
qs.append(f'{short}, {g}, Hrvatska')
|
||||
if n:
|
||||
qs.append(f'{n}, Hrvatska')
|
||||
if g and a:
|
||||
qs.append(f'{a}, {g}')
|
||||
# dedup preserving order
|
||||
seen = set(); out = []
|
||||
for q in qs:
|
||||
if q not in seen:
|
||||
seen.add(q); out.append(q)
|
||||
return out
|
||||
|
||||
def main():
|
||||
ap = argparse.ArgumentParser()
|
||||
ap.add_argument('--dry-run', action='store_true')
|
||||
ap.add_argument('--only-duplicates', action='store_true',
|
||||
help='only re-geocode objects sharing coordinates with another object')
|
||||
ap.add_argument('--id', type=int, help='single object ID to re-geocode')
|
||||
args = ap.parse_args()
|
||||
|
||||
conn = psycopg2.connect(**PG)
|
||||
cur = conn.cursor()
|
||||
|
||||
if args.id:
|
||||
cur.execute("SELECT id, naziv, grad, adresa, lat, lng FROM pgz_sport.sportski_objekti WHERE id=%s", (args.id,))
|
||||
elif args.only_duplicates:
|
||||
cur.execute("""
|
||||
WITH dup AS (
|
||||
SELECT lat, lng FROM pgz_sport.sportski_objekti
|
||||
WHERE lat IS NOT NULL
|
||||
GROUP BY lat, lng HAVING count(*)>1
|
||||
)
|
||||
SELECT s.id, s.naziv, s.grad, s.adresa, s.lat, s.lng
|
||||
FROM pgz_sport.sportski_objekti s
|
||||
JOIN dup d USING (lat, lng)
|
||||
ORDER BY s.id
|
||||
""")
|
||||
else:
|
||||
cur.execute("SELECT id, naziv, grad, adresa, lat, lng FROM pgz_sport.sportski_objekti ORDER BY id")
|
||||
rows = cur.fetchall()
|
||||
print(f'== Processing {len(rows)} objects (dry_run={args.dry_run}) ==')
|
||||
|
||||
updated = 0
|
||||
skipped = 0
|
||||
failed = []
|
||||
for i, (oid, naziv, grad, adresa, oldlat, oldlng) in enumerate(rows, 1):
|
||||
print(f'[{i}/{len(rows)}] #{oid} {naziv} ({grad}) — current: {oldlat},{oldlng}')
|
||||
new_pos = None
|
||||
for q in queries_for(naziv, grad, adresa):
|
||||
results = nominatim(q)
|
||||
time.sleep(1.05) # Nominatim 1 req/s policy
|
||||
best = best_result(results)
|
||||
if best:
|
||||
lat, lng, raw = best
|
||||
# Skip queries that just resolve to a place/town center
|
||||
if raw.get('class') == 'place' and raw.get('type') in ('city','town','village','suburb','locality'):
|
||||
print(f' "{q}" -> {raw.get("display_name","")[:60]} (place type, skip)')
|
||||
continue
|
||||
print(f' "{q}" -> {lat},{lng} [{raw.get("class")}/{raw.get("type")}]')
|
||||
new_pos = (lat, lng, q)
|
||||
break
|
||||
else:
|
||||
print(f' "{q}" -> no result in PGŽ bounds')
|
||||
|
||||
if not new_pos:
|
||||
failed.append((oid, naziv, grad))
|
||||
print(' ✗ no precise match found')
|
||||
continue
|
||||
|
||||
nlat, nlng, nq = new_pos
|
||||
# Detect meaningful change (>50m). 0.0005° ≈ 55m at this latitude.
|
||||
if oldlat is not None and oldlng is not None:
|
||||
dlat = abs(float(oldlat) - nlat)
|
||||
dlng = abs(float(oldlng) - nlng)
|
||||
if dlat < 0.0005 and dlng < 0.0005:
|
||||
print(f' = unchanged (within 50m)')
|
||||
skipped += 1
|
||||
continue
|
||||
|
||||
if args.dry_run:
|
||||
print(f' [DRY] would UPDATE id={oid} -> {nlat},{nlng}')
|
||||
else:
|
||||
cur.execute("""
|
||||
UPDATE pgz_sport.sportski_objekti
|
||||
SET lat=%s, lng=%s
|
||||
WHERE id=%s
|
||||
""", (nlat, nlng, oid))
|
||||
conn.commit()
|
||||
print(f' ✓ UPDATED -> {nlat},{nlng}')
|
||||
updated += 1
|
||||
|
||||
print('')
|
||||
print(f'== Summary: {updated} updated, {skipped} unchanged, {len(failed)} failed ==')
|
||||
if failed:
|
||||
print('Failed:')
|
||||
for oid, n, g in failed:
|
||||
print(f' #{oid} {n} ({g})')
|
||||
|
||||
cur.close(); conn.close()
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
@@ -0,0 +1,209 @@
|
||||
#!/usr/bin/env python3
|
||||
# ═══════════════════════════════════════════════════════════════════
|
||||
# Fajl: godisnjak_extract.py
|
||||
# Verzija: 1.0.0
|
||||
# Datum: 03.05.2026
|
||||
# Autor: Damir Radulić <dradulic@outlook.com>
|
||||
# Lokacija: /opt/pgz-sport/scripts/godisnjak_extract.py
|
||||
# Svrha: LLM ekstrakcija osoba/uloga iz godisnjaka PGZ (Phase 2)
|
||||
# Zavisi od: httpx, psycopg2, rapidfuzz
|
||||
# Utječe na: pgz_sport.clanovi
|
||||
# ═══════════════════════════════════════════════════════════════════
|
||||
import asyncio, glob, json, logging, re, sys, time
|
||||
import httpx, psycopg2
|
||||
from psycopg2.extras import execute_batch
|
||||
from rapidfuzz import fuzz
|
||||
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format="[%(asctime)s] %(message)s",
|
||||
datefmt="%H:%M:%S",
|
||||
handlers=[
|
||||
logging.FileHandler("/opt/pgz-sport/logs/godisnjak_extract.log"),
|
||||
logging.StreamHandler(),
|
||||
],
|
||||
)
|
||||
log = logging.getLogger("extract")
|
||||
|
||||
DSN = "host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password=R1net2026!SecureDB#v7"
|
||||
VLLM_URL = "http://localhost:8001/v1/chat/completions"
|
||||
VLLM_MODEL = "Qwen/Qwen2.5-7B-Instruct-AWQ"
|
||||
DATA_DIR = "/opt/pgz-sport/_data/godisnjaci"
|
||||
MAX_WORKERS = 4
|
||||
CHUNK_SIZE = 1400
|
||||
|
||||
EXTRACT_PROMPT = """Ekstrahiraj iz teksta SVA imena osoba i njihove uloge.
|
||||
Vrati ISKLJUCIVO valid JSON (bez markdown, bez objasnjenja):
|
||||
{"osobe": [{"ime":"X","prezime":"Y","klub":"Z","uloga":"igrac","godina_rodenja":1990}]}
|
||||
|
||||
Dozvoljene uloge: predsjednik, dopredsjednik, tajnik, blagajnik, clan_uprave,
|
||||
igrac, sportas, glavni_trener, trener, pomocni_trener, kondicioni_trener,
|
||||
selektor, izbornik, team_manager, voditelj, lijecnik, fizioterapeut,
|
||||
kineziolog, maser, sudac, volonter
|
||||
|
||||
Pravila:
|
||||
1. Samo HRVATSKA osobe s punim imenom i prezimenom
|
||||
2. Ako klub nije eksplicitno naveden -> klub=""
|
||||
3. NE izmisljaj - samo jasno navedena imena u tekstu
|
||||
4. Godina rodenja samo ako eksplicitno u tekstu, inace izostavi"""
|
||||
|
||||
|
||||
def chunk_text(text, size=CHUNK_SIZE):
|
||||
paragraphs = re.split(r'\n\n+', text)
|
||||
chunks, cur = [], ""
|
||||
for p in paragraphs:
|
||||
if len(cur) + len(p) > size:
|
||||
if cur: chunks.append(cur.strip())
|
||||
cur = p
|
||||
else:
|
||||
cur += "\n\n" + p
|
||||
if cur: chunks.append(cur.strip())
|
||||
return [c for c in chunks if len(c) > 80]
|
||||
|
||||
|
||||
# Preload klub cache
|
||||
def load_klub_cache(conn):
|
||||
cur = conn.cursor()
|
||||
cur.execute("SELECT id, naziv FROM pgz_sport.klubovi WHERE aktivan=true OR aktivan IS NULL LIMIT 2000")
|
||||
return cur.fetchall()
|
||||
|
||||
|
||||
def fuzzy_klub(naziv, cache):
|
||||
if not naziv or len(naziv) < 3:
|
||||
return None
|
||||
best_id, best_score = None, 0
|
||||
for kid, kname in cache:
|
||||
score = fuzz.token_set_ratio(naziv.lower(), kname.lower())
|
||||
if score > best_score:
|
||||
best_score, best_id = score, kid
|
||||
return best_id if best_score > 72 else None
|
||||
|
||||
|
||||
async def extract_persons(chunk_text_str, semaphore):
|
||||
async with semaphore:
|
||||
try:
|
||||
async with httpx.AsyncClient(timeout=90.0) as c:
|
||||
r = await c.post(VLLM_URL, json={
|
||||
"model": VLLM_MODEL,
|
||||
"messages": [
|
||||
{"role": "system", "content": EXTRACT_PROMPT},
|
||||
{"role": "user", "content": chunk_text_str[:5000]},
|
||||
],
|
||||
"temperature": 0.05,
|
||||
"max_tokens": 2500,
|
||||
"response_format": {"type": "json_object"},
|
||||
})
|
||||
d = r.json()
|
||||
content = d["choices"][0]["message"]["content"]
|
||||
return json.loads(content)
|
||||
except Exception as e:
|
||||
log.debug(f"Extract fail: {e}")
|
||||
return {"osobe": []}
|
||||
|
||||
|
||||
VALID_ULOGE = {
|
||||
"predsjednik","dopredsjednik","tajnik","blagajnik","clan_uprave",
|
||||
"igrac","sportas","glavni_trener","trener","pomocni_trener","kondicioni_trener",
|
||||
"selektor","izbornik","team_manager","voditelj","lijecnik","fizioterapeut",
|
||||
"kineziolog","maser","sudac","volonter"
|
||||
}
|
||||
|
||||
|
||||
async def main():
|
||||
conn = psycopg2.connect(DSN)
|
||||
conn.autocommit = True
|
||||
cur = conn.cursor()
|
||||
|
||||
# Backup
|
||||
cur.execute("""CREATE TABLE IF NOT EXISTS pgz_sport.clanovi_pre_godisnjak_backup
|
||||
AS SELECT * FROM pgz_sport.clanovi WHERE 1=0""")
|
||||
cur.execute("""INSERT INTO pgz_sport.clanovi_pre_godisnjak_backup
|
||||
SELECT * FROM pgz_sport.clanovi""")
|
||||
log.info("Backup created")
|
||||
|
||||
klub_cache = load_klub_cache(conn)
|
||||
log.info(f"Klub cache: {len(klub_cache)} klubova")
|
||||
|
||||
files = sorted(glob.glob(f"{DATA_DIR}/godisnjak_*_layout.txt"))
|
||||
log.info(f"Files: {len(files)}")
|
||||
|
||||
semaphore = asyncio.Semaphore(MAX_WORKERS)
|
||||
total_inserted = 0
|
||||
total_skipped = 0
|
||||
|
||||
for f in files:
|
||||
m = re.search(r'godisnjak_(\d{4})_layout', f)
|
||||
year = m.group(1) if m else "?"
|
||||
with open(f) as fp:
|
||||
text = fp.read()
|
||||
|
||||
chunks = chunk_text(text)
|
||||
log.info(f"Year {year}: {len(chunks)} chunks")
|
||||
|
||||
tasks = [extract_persons(c, semaphore) for c in chunks]
|
||||
results = await asyncio.gather(*tasks)
|
||||
|
||||
year_ins = 0
|
||||
rows = []
|
||||
for res in results:
|
||||
for o in res.get("osobe", []):
|
||||
ime = (o.get("ime") or "").strip()
|
||||
prezime = (o.get("prezime") or "").strip()
|
||||
if not ime or not prezime or len(ime) < 2 or len(prezime) < 2:
|
||||
continue
|
||||
# Basic sanity — no numbers, no too-long names
|
||||
if re.search(r'\d', ime+prezime) or len(ime+prezime) > 60:
|
||||
continue
|
||||
|
||||
uloga = (o.get("uloga") or "igrac").lower().strip()
|
||||
if uloga not in VALID_ULOGE:
|
||||
uloga = "igrac"
|
||||
|
||||
klub_naziv = (o.get("klub") or "").strip()
|
||||
klub_id = fuzzy_klub(klub_naziv, klub_cache)
|
||||
|
||||
rows.append((
|
||||
ime, prezime, uloga, klub_id,
|
||||
"godisnjak",
|
||||
json.dumps({"year": int(year), "klub_naziv": klub_naziv}),
|
||||
"sportas",
|
||||
))
|
||||
|
||||
# Batch upsert — ON CONFLICT skip duplicates by ime+prezime+savez_izvor+year via metadata
|
||||
for row in rows:
|
||||
try:
|
||||
cur.execute("""
|
||||
INSERT INTO pgz_sport.clanovi
|
||||
(ime, prezime, uloga, klub_id, savez_izvor, metadata, kategorija)
|
||||
VALUES (%s,%s,%s,%s,%s,%s,%s)
|
||||
ON CONFLICT DO NOTHING
|
||||
""", row)
|
||||
if cur.rowcount:
|
||||
year_ins += 1
|
||||
except Exception as e:
|
||||
log.debug(f"Insert skip: {e}")
|
||||
|
||||
total_inserted += year_ins
|
||||
log.info(f" {year}: {year_ins} osoba inserted (running total: {total_inserted})")
|
||||
|
||||
cur.execute("SELECT count(*) FROM pgz_sport.clanovi WHERE savez_izvor='godisnjak'")
|
||||
final = cur.fetchone()[0]
|
||||
conn.close()
|
||||
|
||||
log.info(f"""
|
||||
=== EXTRACT DONE ===
|
||||
Inserted this run: {total_inserted}
|
||||
Total godisnjak u DB: {final}
|
||||
""")
|
||||
|
||||
import requests as rq
|
||||
rq.post(
|
||||
"https://api.telegram.org/bot8535797835:AAFItT-92jzZ9NWFafLxn0dLa1_n2s-JE5Y/sendMessage",
|
||||
data={"chat_id": "7969491558",
|
||||
"text": f"✅ Godisnjak LLM extract DONE: {total_inserted} novih osoba, {final} total"},
|
||||
timeout=10,
|
||||
)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
@@ -0,0 +1,209 @@
|
||||
#!/usr/bin/env python3
|
||||
# ═══════════════════════════════════════════════════════════════════
|
||||
# Fajl: godisnjak_extract.py
|
||||
# Verzija: 1.0.0
|
||||
# Datum: 03.05.2026
|
||||
# Autor: Damir Radulić <dradulic@outlook.com>
|
||||
# Lokacija: /opt/pgz-sport/scripts/godisnjak_extract.py
|
||||
# Svrha: LLM ekstrakcija osoba/uloga iz godisnjaka PGZ (Phase 2)
|
||||
# Zavisi od: httpx, psycopg2, rapidfuzz
|
||||
# Utječe na: pgz_sport.clanovi
|
||||
# ═══════════════════════════════════════════════════════════════════
|
||||
import asyncio, glob, json, logging, re, sys, time
|
||||
import httpx, psycopg2
|
||||
from psycopg2.extras import execute_batch
|
||||
from rapidfuzz import fuzz
|
||||
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format="[%(asctime)s] %(message)s",
|
||||
datefmt="%H:%M:%S",
|
||||
handlers=[
|
||||
logging.FileHandler("/opt/pgz-sport/logs/godisnjak_extract.log"),
|
||||
logging.StreamHandler(),
|
||||
],
|
||||
)
|
||||
log = logging.getLogger("extract")
|
||||
|
||||
DSN = "host=127.0.0.1 port=6432 dbname=rinet_v3 user=rinet password=R1net2026!SecureDB#v7"
|
||||
VLLM_URL = "http://localhost:8001/v1/chat/completions"
|
||||
VLLM_MODEL = "Qwen/Qwen2.5-7B-Instruct-AWQ"
|
||||
DATA_DIR = "/opt/pgz-sport/_data/godisnjaci"
|
||||
MAX_WORKERS = 4
|
||||
CHUNK_SIZE = 1400
|
||||
|
||||
EXTRACT_PROMPT = """Ekstrahiraj iz teksta SVA imena osoba i njihove uloge.
|
||||
Vrati ISKLJUCIVO valid JSON (bez markdown, bez objasnjenja):
|
||||
{"osobe": [{"ime":"X","prezime":"Y","klub":"Z","uloga":"igrac","godina_rodenja":1990}]}
|
||||
|
||||
Dozvoljene uloge: predsjednik, dopredsjednik, tajnik, blagajnik, clan_uprave,
|
||||
igrac, sportas, glavni_trener, trener, pomocni_trener, kondicioni_trener,
|
||||
selektor, izbornik, team_manager, voditelj, lijecnik, fizioterapeut,
|
||||
kineziolog, maser, sudac, volonter
|
||||
|
||||
Pravila:
|
||||
1. Samo HRVATSKA osobe s punim imenom i prezimenom
|
||||
2. Ako klub nije eksplicitno naveden -> klub=""
|
||||
3. NE izmisljaj - samo jasno navedena imena u tekstu
|
||||
4. Godina rodenja samo ako eksplicitno u tekstu, inace izostavi"""
|
||||
|
||||
|
||||
def chunk_text(text, size=CHUNK_SIZE):
|
||||
paragraphs = re.split(r'\n\n+', text)
|
||||
chunks, cur = [], ""
|
||||
for p in paragraphs:
|
||||
if len(cur) + len(p) > size:
|
||||
if cur: chunks.append(cur.strip())
|
||||
cur = p
|
||||
else:
|
||||
cur += "\n\n" + p
|
||||
if cur: chunks.append(cur.strip())
|
||||
return [c for c in chunks if len(c) > 80]
|
||||
|
||||
|
||||
# Preload klub cache
|
||||
def load_klub_cache(conn):
|
||||
cur = conn.cursor()
|
||||
cur.execute("SELECT id, naziv FROM pgz_sport.klubovi WHERE aktivan=true OR aktivan IS NULL LIMIT 2000")
|
||||
return cur.fetchall()
|
||||
|
||||
|
||||
def fuzzy_klub(naziv, cache):
|
||||
if not naziv or len(naziv) < 3:
|
||||
return None
|
||||
best_id, best_score = None, 0
|
||||
for kid, kname in cache:
|
||||
score = fuzz.token_set_ratio(naziv.lower(), kname.lower())
|
||||
if score > best_score:
|
||||
best_score, best_id = score, kid
|
||||
return best_id if best_score > 72 else None
|
||||
|
||||
|
||||
async def extract_persons(chunk_text_str, semaphore):
|
||||
async with semaphore:
|
||||
try:
|
||||
async with httpx.AsyncClient(timeout=90.0) as c:
|
||||
r = await c.post(VLLM_URL, json={
|
||||
"model": VLLM_MODEL,
|
||||
"messages": [
|
||||
{"role": "system", "content": EXTRACT_PROMPT},
|
||||
{"role": "user", "content": chunk_text_str[:5000]},
|
||||
],
|
||||
"temperature": 0.05,
|
||||
"max_tokens": 2500,
|
||||
"response_format": {"type": "json_object"},
|
||||
})
|
||||
d = r.json()
|
||||
content = d["choices"][0]["message"]["content"]
|
||||
return json.loads(content)
|
||||
except Exception as e:
|
||||
log.debug(f"Extract fail: {e}")
|
||||
return {"osobe": []}
|
||||
|
||||
|
||||
VALID_ULOGE = {
|
||||
"predsjednik","dopredsjednik","tajnik","blagajnik","clan_uprave",
|
||||
"igrac","sportas","glavni_trener","trener","pomocni_trener","kondicioni_trener",
|
||||
"selektor","izbornik","team_manager","voditelj","lijecnik","fizioterapeut",
|
||||
"kineziolog","maser","sudac","volonter"
|
||||
}
|
||||
|
||||
|
||||
async def main():
|
||||
conn = psycopg2.connect(DSN)
|
||||
conn.autocommit = True
|
||||
cur = conn.cursor()
|
||||
|
||||
# Backup
|
||||
cur.execute("""CREATE TABLE IF NOT EXISTS pgz_sport.clanovi_pre_godisnjak_backup
|
||||
AS SELECT * FROM pgz_sport.clanovi WHERE 1=0""")
|
||||
cur.execute("""INSERT INTO pgz_sport.clanovi_pre_godisnjak_backup
|
||||
SELECT * FROM pgz_sport.clanovi""")
|
||||
log.info("Backup created")
|
||||
|
||||
klub_cache = load_klub_cache(conn)
|
||||
log.info(f"Klub cache: {len(klub_cache)} klubova")
|
||||
|
||||
files = sorted(glob.glob(f"{DATA_DIR}/godisnjak_*_layout.txt"))
|
||||
log.info(f"Files: {len(files)}")
|
||||
|
||||
semaphore = asyncio.Semaphore(MAX_WORKERS)
|
||||
total_inserted = 0
|
||||
total_skipped = 0
|
||||
|
||||
for f in files:
|
||||
m = re.search(r'godisnjak_(\d{4})_layout', f)
|
||||
year = m.group(1) if m else "?"
|
||||
with open(f) as fp:
|
||||
text = fp.read()
|
||||
|
||||
chunks = chunk_text(text)
|
||||
log.info(f"Year {year}: {len(chunks)} chunks")
|
||||
|
||||
tasks = [extract_persons(c, semaphore) for c in chunks]
|
||||
results = await asyncio.gather(*tasks)
|
||||
|
||||
year_ins = 0
|
||||
rows = []
|
||||
for res in results:
|
||||
for o in res.get("osobe", []):
|
||||
ime = (o.get("ime") or "").strip()
|
||||
prezime = (o.get("prezime") or "").strip()
|
||||
if not ime or not prezime or len(ime) < 2 or len(prezime) < 2:
|
||||
continue
|
||||
# Basic sanity — no numbers, no too-long names
|
||||
if re.search(r'\d', ime+prezime) or len(ime+prezime) > 60:
|
||||
continue
|
||||
|
||||
uloga = (o.get("uloga") or "igrac").lower().strip()
|
||||
if uloga not in VALID_ULOGE:
|
||||
uloga = "igrac"
|
||||
|
||||
klub_naziv = (o.get("klub") or "").strip()
|
||||
klub_id = fuzzy_klub(klub_naziv, klub_cache)
|
||||
|
||||
rows.append((
|
||||
ime, prezime, uloga, klub_id,
|
||||
"godisnjak",
|
||||
json.dumps({"year": int(year), "klub_naziv": klub_naziv}),
|
||||
"sportas",
|
||||
))
|
||||
|
||||
# Batch upsert — ON CONFLICT skip duplicates by ime+prezime+savez_izvor+year via metadata
|
||||
for row in rows:
|
||||
try:
|
||||
cur.execute("""
|
||||
INSERT INTO pgz_sport.clanovi
|
||||
(ime, prezime, uloga, klub_id, savez_izvor, metadata, kategorija)
|
||||
VALUES (%s,%s,%s,%s,%s,%s,%s)
|
||||
ON CONFLICT DO NOTHING
|
||||
""", row)
|
||||
if cur.rowcount:
|
||||
year_ins += 1
|
||||
except Exception as e:
|
||||
log.debug(f"Insert skip: {e}")
|
||||
|
||||
total_inserted += year_ins
|
||||
log.info(f" {year}: {year_ins} osoba inserted (running total: {total_inserted})")
|
||||
|
||||
cur.execute("SELECT count(*) FROM pgz_sport.clanovi WHERE savez_izvor='godisnjak'")
|
||||
final = cur.fetchone()[0]
|
||||
conn.close()
|
||||
|
||||
log.info(f"""
|
||||
=== EXTRACT DONE ===
|
||||
Inserted this run: {total_inserted}
|
||||
Total godisnjak u DB: {final}
|
||||
""")
|
||||
|
||||
import requests as rq
|
||||
rq.post(
|
||||
"https://api.telegram.org/bot8535797835:AAFItT-92jzZ9NWFafLxn0dLa1_n2s-JE5Y/sendMessage",
|
||||
data={"chat_id": "7969491558",
|
||||
"text": f"✅ Godisnjak LLM extract DONE: {total_inserted} novih osoba, {final} total"},
|
||||
timeout=10,
|
||||
)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
@@ -0,0 +1,209 @@
|
||||
#!/usr/bin/env python3
|
||||
# ═══════════════════════════════════════════════════════════════════
|
||||
# Fajl: godisnjak_extract.py
|
||||
# Verzija: 1.0.0
|
||||
# Datum: 03.05.2026
|
||||
# Autor: Damir Radulić <dradulic@outlook.com>
|
||||
# Lokacija: /opt/pgz-sport/scripts/godisnjak_extract.py
|
||||
# Svrha: LLM ekstrakcija osoba/uloga iz godisnjaka PGZ (Phase 2)
|
||||
# Zavisi od: httpx, psycopg2, rapidfuzz
|
||||
# Utječe na: pgz_sport.clanovi
|
||||
# ═══════════════════════════════════════════════════════════════════
|
||||
import asyncio, glob, json, logging, re, sys, time
|
||||
import httpx, psycopg2
|
||||
from psycopg2.extras import execute_batch
|
||||
from rapidfuzz import fuzz
|
||||
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format="[%(asctime)s] %(message)s",
|
||||
datefmt="%H:%M:%S",
|
||||
handlers=[
|
||||
logging.FileHandler("/opt/pgz-sport/logs/godisnjak_extract.log"),
|
||||
logging.StreamHandler(),
|
||||
],
|
||||
)
|
||||
log = logging.getLogger("extract")
|
||||
|
||||
DSN = "host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password=R1net2026!SecureDB#v7"
|
||||
VLLM_URL = "http://localhost:8001/v1/chat/completions"
|
||||
VLLM_MODEL = "Qwen/Qwen2.5-7B-Instruct-AWQ"
|
||||
DATA_DIR = "/opt/pgz-sport/_data/godisnjaci"
|
||||
MAX_WORKERS = 4
|
||||
CHUNK_SIZE = 1400
|
||||
|
||||
EXTRACT_PROMPT = """Ekstrahiraj iz teksta SVA imena osoba i njihove uloge.
|
||||
Vrati ISKLJUCIVO valid JSON (bez markdown, bez objasnjenja):
|
||||
{"osobe": [{"ime":"X","prezime":"Y","klub":"Z","uloga":"igrac","godina_rodenja":1990}]}
|
||||
|
||||
Dozvoljene uloge: predsjednik, dopredsjednik, tajnik, blagajnik, clan_uprave,
|
||||
igrac, sportas, glavni_trener, trener, pomocni_trener, kondicioni_trener,
|
||||
selektor, izbornik, team_manager, voditelj, lijecnik, fizioterapeut,
|
||||
kineziolog, maser, sudac, volonter
|
||||
|
||||
Pravila:
|
||||
1. Samo HRVATSKA osobe s punim imenom i prezimenom
|
||||
2. Ako klub nije eksplicitno naveden -> klub=""
|
||||
3. NE izmisljaj - samo jasno navedena imena u tekstu
|
||||
4. Godina rodenja samo ako eksplicitno u tekstu, inace izostavi"""
|
||||
|
||||
|
||||
def chunk_text(text, size=CHUNK_SIZE):
|
||||
paragraphs = re.split(r'\n\n+', text)
|
||||
chunks, cur = [], ""
|
||||
for p in paragraphs:
|
||||
if len(cur) + len(p) > size:
|
||||
if cur: chunks.append(cur.strip())
|
||||
cur = p
|
||||
else:
|
||||
cur += "\n\n" + p
|
||||
if cur: chunks.append(cur.strip())
|
||||
return [c for c in chunks if len(c) > 80]
|
||||
|
||||
|
||||
# Preload klub cache
|
||||
def load_klub_cache(conn):
|
||||
cur = conn.cursor()
|
||||
cur.execute("SELECT id, naziv FROM pgz_sport.klubovi WHERE aktivan=true OR aktivan IS NULL LIMIT 2000")
|
||||
return cur.fetchall()
|
||||
|
||||
|
||||
def fuzzy_klub(naziv, cache):
|
||||
if not naziv or len(naziv) < 3:
|
||||
return None
|
||||
best_id, best_score = None, 0
|
||||
for kid, kname in cache:
|
||||
score = fuzz.token_set_ratio(naziv.lower(), kname.lower())
|
||||
if score > best_score:
|
||||
best_score, best_id = score, kid
|
||||
return best_id if best_score > 72 else None
|
||||
|
||||
|
||||
async def extract_persons(chunk_text_str, semaphore):
|
||||
async with semaphore:
|
||||
try:
|
||||
async with httpx.AsyncClient(timeout=90.0) as c:
|
||||
r = await c.post(VLLM_URL, json={
|
||||
"model": VLLM_MODEL,
|
||||
"messages": [
|
||||
{"role": "system", "content": EXTRACT_PROMPT},
|
||||
{"role": "user", "content": chunk_text_str[:5000]},
|
||||
],
|
||||
"temperature": 0.05,
|
||||
"max_tokens": 2500,
|
||||
"response_format": {"type": "json_object"},
|
||||
})
|
||||
d = r.json()
|
||||
content = d["choices"][0]["message"]["content"]
|
||||
return json.loads(content)
|
||||
except Exception as e:
|
||||
log.debug(f"Extract fail: {e}")
|
||||
return {"osobe": []}
|
||||
|
||||
|
||||
VALID_ULOGE = {
|
||||
"predsjednik","dopredsjednik","tajnik","blagajnik","clan_uprave",
|
||||
"igrac","sportas","glavni_trener","trener","pomocni_trener","kondicioni_trener",
|
||||
"selektor","izbornik","team_manager","voditelj","lijecnik","fizioterapeut",
|
||||
"kineziolog","maser","sudac","volonter"
|
||||
}
|
||||
|
||||
|
||||
async def main():
|
||||
conn = psycopg2.connect(DSN)
|
||||
conn.autocommit = True
|
||||
cur = conn.cursor()
|
||||
|
||||
# Backup
|
||||
cur.execute("""CREATE TABLE IF NOT EXISTS pgz_sport.clanovi_pre_godisnjak_backup
|
||||
AS SELECT * FROM pgz_sport.clanovi WHERE 1=0""")
|
||||
cur.execute("""INSERT INTO pgz_sport.clanovi_pre_godisnjak_backup
|
||||
SELECT * FROM pgz_sport.clanovi""")
|
||||
log.info("Backup created")
|
||||
|
||||
klub_cache = load_klub_cache(conn)
|
||||
log.info(f"Klub cache: {len(klub_cache)} klubova")
|
||||
|
||||
files = sorted(glob.glob(f"{DATA_DIR}/godisnjak_*_layout.txt"))
|
||||
log.info(f"Files: {len(files)}")
|
||||
|
||||
semaphore = asyncio.Semaphore(MAX_WORKERS)
|
||||
total_inserted = 0
|
||||
total_skipped = 0
|
||||
|
||||
for f in files:
|
||||
m = re.search(r'godisnjak_(\d{4})_layout', f)
|
||||
year = m.group(1) if m else "?"
|
||||
with open(f) as fp:
|
||||
text = fp.read()
|
||||
|
||||
chunks = chunk_text(text)
|
||||
log.info(f"Year {year}: {len(chunks)} chunks")
|
||||
|
||||
tasks = [extract_persons(c, semaphore) for c in chunks]
|
||||
results = await asyncio.gather(*tasks)
|
||||
|
||||
year_ins = 0
|
||||
rows = []
|
||||
for res in results:
|
||||
for o in res.get("osobe", []):
|
||||
ime = (o.get("ime") or "").strip()
|
||||
prezime = (o.get("prezime") or "").strip()
|
||||
if not ime or not prezime or len(ime) < 2 or len(prezime) < 2:
|
||||
continue
|
||||
# Basic sanity — no numbers, no too-long names
|
||||
if re.search(r'\d', ime+prezime) or len(ime+prezime) > 60:
|
||||
continue
|
||||
|
||||
uloga = (o.get("uloga") or "igrac").lower().strip()
|
||||
if uloga not in VALID_ULOGE:
|
||||
uloga = "igrac"
|
||||
|
||||
klub_naziv = (o.get("klub") or "").strip()
|
||||
klub_id = fuzzy_klub(klub_naziv, klub_cache)
|
||||
|
||||
rows.append((
|
||||
ime, prezime, uloga, klub_id,
|
||||
"godisnjak",
|
||||
json.dumps({"year": int(year), "klub_naziv": klub_naziv}),
|
||||
"sportas",
|
||||
))
|
||||
|
||||
# Batch upsert — ON CONFLICT skip duplicates by ime+prezime+savez_izvor+year via metadata
|
||||
for row in rows:
|
||||
try:
|
||||
cur.execute("""
|
||||
INSERT INTO pgz_sport.clanovi
|
||||
(ime, prezime, uloga, klub_id, savez_izvor, metadata, kategorija)
|
||||
VALUES (%s,%s,%s,%s,%s,%s,%s)
|
||||
ON CONFLICT DO NOTHING
|
||||
""", row)
|
||||
if cur.rowcount:
|
||||
year_ins += 1
|
||||
except Exception as e:
|
||||
log.debug(f"Insert skip: {e}")
|
||||
|
||||
total_inserted += year_ins
|
||||
log.info(f" {year}: {year_ins} osoba inserted (running total: {total_inserted})")
|
||||
|
||||
cur.execute("SELECT count(*) FROM pgz_sport.clanovi WHERE savez_izvor='godisnjak'")
|
||||
final = cur.fetchone()[0]
|
||||
conn.close()
|
||||
|
||||
log.info(f"""
|
||||
=== EXTRACT DONE ===
|
||||
Inserted this run: {total_inserted}
|
||||
Total godisnjak u DB: {final}
|
||||
""")
|
||||
|
||||
import requests as rq
|
||||
rq.post(
|
||||
"https://api.telegram.org/bot8535797835:AAFItT-92jzZ9NWFafLxn0dLa1_n2s-JE5Y/sendMessage",
|
||||
data={"chat_id": "7969491558",
|
||||
"text": f"✅ Godisnjak LLM extract DONE: {total_inserted} novih osoba, {final} total"},
|
||||
timeout=10,
|
||||
)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
@@ -0,0 +1,316 @@
|
||||
#!/usr/bin/env python3
|
||||
# ═══════════════════════════════════════════════════════════════════
|
||||
# Fajl: godisnjak_pipeline.py
|
||||
# Verzija: 1.0.0
|
||||
# Datum: 03.05.2026
|
||||
# Autor: Damir Radulić <dradulic@outlook.com>
|
||||
# Lokacija: /opt/pgz-sport/scripts/godisnjak_pipeline.py
|
||||
# Svrha: Embed godisnjaci PGZ u pgz_universe + LLM ekstrakcija osoba/uloga
|
||||
# Zavisi od: qdrant_client, httpx, psycopg2, rapidfuzz
|
||||
# Utječe na: pgz_universe (Qdrant), pgz_sport.clanovi (insert)
|
||||
# ═══════════════════════════════════════════════════════════════════
|
||||
"""Godisnjak PGZ embed + LLM person extraction pipeline."""
|
||||
import asyncio
|
||||
import glob
|
||||
import hashlib
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
import sys
|
||||
sys.path.insert(0, '/opt/rinet-gpu/lib')
|
||||
try:
|
||||
from tg_notify import notify as _tg_notify
|
||||
except ImportError:
|
||||
_tg_notify = None
|
||||
import time
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
|
||||
import httpx
|
||||
import psycopg2
|
||||
from psycopg2.extras import execute_batch
|
||||
from qdrant_client import QdrantClient
|
||||
from qdrant_client.models import PointStruct
|
||||
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format="%(asctime)s [godisnjak] %(levelname)s: %(message)s",
|
||||
handlers=[
|
||||
logging.FileHandler("/opt/pgz-sport/logs/godisnjak_pipeline.log"),
|
||||
logging.StreamHandler(),
|
||||
],
|
||||
)
|
||||
log = logging.getLogger("godisnjak")
|
||||
|
||||
DSN = "host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password=R1net2026!SecureDB#v7"
|
||||
EMBED_URL = "http://localhost:9879/api/embeddings"
|
||||
VLLM_URL = "http://localhost:8001/v1/chat/completions"
|
||||
VLLM_MODEL = "Qwen/Qwen2.5-7B-Instruct-AWQ"
|
||||
QDRANT_COLLECTION = "pgz_universe"
|
||||
DATA_DIR = "/opt/pgz-sport/_data/godisnjaci"
|
||||
MAX_WORKERS = 5
|
||||
CHUNK_SIZE = 1500 # < 2000 zbog BGE-M3 truncation
|
||||
|
||||
EXTRACT_PROMPT = """Ekstrahiraj iz teksta SVA imena osoba i njihove uloge.
|
||||
Format strogo JSON:
|
||||
{"osobe": [{"ime":"X","prezime":"Y","klub":"Z","uloga":"predsjednik|igrac|trener|tajnik|fizioterapeut|lijecnik","godina_rodenja":1990}]}
|
||||
|
||||
Uloge ISKLJUCIVO: predsjednik, dopredsjednik, tajnik, blagajnik, clan_uprave, igrac, sportas, glavni_trener, trener, pomocni_trener, kondicioni_trener, selektor, izbornik, team_manager, voditelj, lijecnik, fizioterapeut, kineziolog, maser, sudac, volonter
|
||||
|
||||
Pravila:
|
||||
1. Samo HRVATSKE osobe (ne strani sportasi koji su gostovali)
|
||||
2. Ako klub nije jasan -> ostavi prazan string
|
||||
3. NE izmisljaj imena -> samo ona JASNO IZRAZENA u tekstu
|
||||
4. Vrati VALID JSON bez markdown backtick-ova"""
|
||||
|
||||
|
||||
def chunk_text(text, size=CHUNK_SIZE):
|
||||
paragraphs = re.split(r"\n\n+", text)
|
||||
chunks, cur = [], ""
|
||||
for p in paragraphs:
|
||||
if len(cur) + len(p) > size:
|
||||
if cur:
|
||||
chunks.append(cur.strip())
|
||||
cur = p
|
||||
else:
|
||||
cur += "\n\n" + p
|
||||
if cur:
|
||||
chunks.append(cur.strip())
|
||||
return [c for c in chunks if len(c) > 100]
|
||||
|
||||
|
||||
async def embed_batch(texts):
|
||||
async with httpx.AsyncClient(timeout=120.0) as c:
|
||||
r = await c.post(EMBED_URL, json={"texts": texts})
|
||||
d = r.json()
|
||||
return d.get("embeddings", [])
|
||||
|
||||
|
||||
async def extract_persons(chunk_text_str):
|
||||
async with httpx.AsyncClient(timeout=120.0) as c:
|
||||
r = await c.post(
|
||||
VLLM_URL,
|
||||
json={
|
||||
"model": VLLM_MODEL,
|
||||
"messages": [
|
||||
{"role": "system", "content": EXTRACT_PROMPT},
|
||||
{"role": "user", "content": chunk_text_str[:5500]},
|
||||
],
|
||||
"temperature": 0.1,
|
||||
"max_tokens": 3000,
|
||||
"response_format": {"type": "json_object"},
|
||||
},
|
||||
)
|
||||
d = r.json()
|
||||
try:
|
||||
content = d["choices"][0]["message"]["content"]
|
||||
return json.loads(content)
|
||||
except Exception as e:
|
||||
log.warning(f"Parse fail: {e}")
|
||||
return {"osobe": []}
|
||||
|
||||
|
||||
def fuzzy_match_klub(naziv, conn):
|
||||
"""Fuzzy match klub name to pgz_sport.klubovi.id"""
|
||||
try:
|
||||
from rapidfuzz import fuzz
|
||||
cur = conn.cursor()
|
||||
cur.execute("SELECT id, naziv FROM pgz_sport.klubovi LIMIT 1000")
|
||||
rows = cur.fetchall()
|
||||
best_id, best_score = None, 0
|
||||
for kid, kname in rows:
|
||||
score = fuzz.token_set_ratio(naziv.lower(), kname.lower())
|
||||
if score > best_score:
|
||||
best_score = score
|
||||
best_id = kid
|
||||
return best_id if best_score > 75 else None
|
||||
except Exception as e:
|
||||
log.warning(f"Fuzzy match fail: {e}")
|
||||
return None
|
||||
|
||||
|
||||
def insert_persons(persons_data, year, conn):
|
||||
"""Insert extracted persons into pgz_sport.clanovi."""
|
||||
osobe = persons_data.get("osobe", [])
|
||||
if not osobe:
|
||||
return 0
|
||||
|
||||
inserted = 0
|
||||
cur = conn.cursor()
|
||||
|
||||
for o in osobe:
|
||||
ime = (o.get("ime") or "").strip()
|
||||
prezime = (o.get("prezime") or "").strip()
|
||||
if not ime or not prezime:
|
||||
continue
|
||||
|
||||
klub_naziv = (o.get("klub") or "").strip()
|
||||
klub_id = fuzzy_match_klub(klub_naziv, conn) if klub_naziv else None
|
||||
uloga = (o.get("uloga") or "igrac").strip()
|
||||
|
||||
# Validate uloga
|
||||
VALID_ULOGE = {
|
||||
"predsjednik", "dopredsjednik", "tajnik", "blagajnik", "clan_uprave",
|
||||
"igrac", "sportas", "glavni_trener", "trener", "pomocni_trener",
|
||||
"kondicioni_trener", "selektor", "izbornik", "team_manager", "voditelj",
|
||||
"lijecnik", "fizioterapeut", "kineziolog", "maser", "sudac", "volonter"
|
||||
}
|
||||
if uloga not in VALID_ULOGE:
|
||||
uloga = "igrac"
|
||||
|
||||
profile_key = f"godisnjak:{year}:{ime}:{prezime}:{klub_naziv}"
|
||||
|
||||
try:
|
||||
cur.execute("""
|
||||
INSERT INTO pgz_sport.clanovi
|
||||
(ime, prezime, uloga, klub_id, savez_izvor, metadata, kategorija)
|
||||
VALUES (%s, %s, %s, %s, %s, %s, %s)
|
||||
ON CONFLICT DO NOTHING
|
||||
RETURNING id
|
||||
""", (
|
||||
ime, prezime, uloga, klub_id,
|
||||
"godisnjak",
|
||||
json.dumps({"year": year, "klub_naziv": klub_naziv, "key": profile_key}),
|
||||
"sportas",
|
||||
))
|
||||
if cur.fetchone():
|
||||
inserted += 1
|
||||
except Exception as e:
|
||||
log.warning(f"Insert fail {ime} {prezime}: {e}")
|
||||
conn.rollback()
|
||||
|
||||
conn.commit()
|
||||
return inserted
|
||||
|
||||
|
||||
async def phase1_embed(files_layout):
|
||||
"""Embed sve godisnjake u pgz_universe."""
|
||||
log.info(f"Phase 1: Embed {len(files_layout)} godisnjaka")
|
||||
qdrant = QdrantClient(host="localhost", port=6333)
|
||||
|
||||
all_chunks = []
|
||||
all_meta = []
|
||||
for f in files_layout:
|
||||
m = re.search(r"godisnjak_(\d{4})_layout", f)
|
||||
year = m.group(1) if m else "unknown"
|
||||
with open(f) as fp:
|
||||
text = fp.read()
|
||||
chunks = chunk_text(text)
|
||||
for i, c in enumerate(chunks):
|
||||
all_chunks.append(c)
|
||||
all_meta.append({"year": year, "chunk_idx": i, "source": f.split("/")[-1]})
|
||||
|
||||
log.info(f"Total chunks: {len(all_chunks)}")
|
||||
|
||||
points = []
|
||||
BATCH = 32
|
||||
for i in range(0, len(all_chunks), BATCH):
|
||||
batch = all_chunks[i : i + BATCH]
|
||||
try:
|
||||
embeddings = await embed_batch(batch)
|
||||
for j, (text, emb) in enumerate(zip(batch, embeddings)):
|
||||
meta = all_meta[i + j]
|
||||
pid_key = f"godisnjak:{meta['source']}:{meta['chunk_idx']}"
|
||||
point_id = int(hashlib.md5(pid_key.encode()).hexdigest()[:15], 16)
|
||||
points.append(
|
||||
PointStruct(
|
||||
id=point_id,
|
||||
vector=emb,
|
||||
payload={**meta, "text": text[:1500], "type": "godisnjak_pgz"},
|
||||
)
|
||||
)
|
||||
except Exception as e:
|
||||
log.warning(f"Embed batch {i} fail: {e}")
|
||||
await asyncio.sleep(2)
|
||||
|
||||
if i % 200 == 0:
|
||||
log.info(f" Embed progress: {i}/{len(all_chunks)}")
|
||||
|
||||
qdrant.upsert(collection_name=QDRANT_COLLECTION, points=points)
|
||||
log.info(f"Phase 1 DONE: {len(points)} chunks → {QDRANT_COLLECTION}")
|
||||
return len(points)
|
||||
|
||||
|
||||
async def phase2_extract(files_layout):
|
||||
"""LLM ekstrakcija osoba/uloga iz godisnjaka."""
|
||||
log.info(f"Phase 2: LLM extract persons from {len(files_layout)} godisnjaka")
|
||||
|
||||
conn = psycopg2.connect(DSN)
|
||||
conn.autocommit = False
|
||||
|
||||
total_inserted = 0
|
||||
semaphore = asyncio.Semaphore(MAX_WORKERS)
|
||||
|
||||
async def process_file(f):
|
||||
nonlocal total_inserted
|
||||
m = re.search(r"godisnjak_(\d{4})_layout", f)
|
||||
year = m.group(1) if m else "unknown"
|
||||
|
||||
with open(f) as fp:
|
||||
text = fp.read()
|
||||
|
||||
chunks = chunk_text(text)
|
||||
log.info(f" Year {year}: {len(chunks)} chunks")
|
||||
|
||||
year_inserted = 0
|
||||
for i, chunk in enumerate(chunks):
|
||||
async with semaphore:
|
||||
try:
|
||||
persons = await extract_persons(chunk)
|
||||
n = insert_persons(persons, year, conn)
|
||||
year_inserted += n
|
||||
if i % 10 == 0:
|
||||
log.info(f" {year} chunk {i}/{len(chunks)}: {n} osoba")
|
||||
except Exception as e:
|
||||
log.warning(f"Extract/insert fail {year} chunk {i}: {e}")
|
||||
await asyncio.sleep(0.5)
|
||||
|
||||
total_inserted += year_inserted
|
||||
log.info(f" Year {year} DONE: {year_inserted} osoba inserted")
|
||||
|
||||
tasks = [process_file(f) for f in files_layout]
|
||||
await asyncio.gather(*tasks)
|
||||
|
||||
conn.close()
|
||||
log.info(f"Phase 2 DONE: {total_inserted} total osoba inserted")
|
||||
return total_inserted
|
||||
|
||||
|
||||
async def main():
|
||||
files_layout = sorted(glob.glob(f"{DATA_DIR}/godisnjak_*_layout.txt"))
|
||||
log.info(f"Found {len(files_layout)} layout files: {[f.split('/')[-1] for f in files_layout]}")
|
||||
|
||||
if not files_layout:
|
||||
log.error("Nema layout fajlova!")
|
||||
sys.exit(1)
|
||||
|
||||
# Phase 1: Embed
|
||||
n_embedded = await phase1_embed(files_layout)
|
||||
|
||||
# Phase 2: LLM extract
|
||||
n_persons = await phase2_extract(files_layout)
|
||||
|
||||
# Final stats
|
||||
conn = psycopg2.connect(DSN)
|
||||
cur = conn.cursor()
|
||||
cur.execute("SELECT count(*) FROM pgz_sport.clanovi WHERE savez_izvor='godisnjak'")
|
||||
total_godisnjak = cur.fetchone()[0]
|
||||
conn.close()
|
||||
|
||||
log.info(f"""
|
||||
=== GODISNJAK PIPELINE COMPLETE ===
|
||||
Chunks embedded: {n_embedded}
|
||||
Persons extracted: {n_persons}
|
||||
Total godisnjak clanovi u DB: {total_godisnjak}
|
||||
""")
|
||||
|
||||
# Telegram
|
||||
import requests as req_lib
|
||||
req_lib.post(
|
||||
"https://api.telegram.org/bot8535797835:AAFItT-92jzZ9NWFafLxn0dLa1_n2s-JE5Y/sendMessage",
|
||||
data={"chat_id": "7969491558", "text": f"✅ Godisnjak pipeline DONE: {n_embedded} chunks, {n_persons} osoba, {total_godisnjak} total u DB"},
|
||||
timeout=10,
|
||||
)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
@@ -0,0 +1,311 @@
|
||||
#!/usr/bin/env python3
|
||||
# ═══════════════════════════════════════════════════════════════════
|
||||
# Fajl: godisnjak_pipeline.py
|
||||
# Verzija: 1.0.0
|
||||
# Datum: 03.05.2026
|
||||
# Autor: Damir Radulić <dradulic@outlook.com>
|
||||
# Lokacija: /opt/pgz-sport/scripts/godisnjak_pipeline.py
|
||||
# Svrha: Embed godisnjaci PGZ u pgz_universe + LLM ekstrakcija osoba/uloga
|
||||
# Zavisi od: qdrant_client, httpx, psycopg2, rapidfuzz
|
||||
# Utječe na: pgz_universe (Qdrant), pgz_sport.clanovi (insert)
|
||||
# ═══════════════════════════════════════════════════════════════════
|
||||
"""Godisnjak PGZ embed + LLM person extraction pipeline."""
|
||||
import asyncio
|
||||
import glob
|
||||
import hashlib
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
import sys
|
||||
import time
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
|
||||
import httpx
|
||||
import psycopg2
|
||||
from psycopg2.extras import execute_batch
|
||||
from qdrant_client import QdrantClient
|
||||
from qdrant_client.models import PointStruct
|
||||
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format="%(asctime)s [godisnjak] %(levelname)s: %(message)s",
|
||||
handlers=[
|
||||
logging.FileHandler("/opt/pgz-sport/logs/godisnjak_pipeline.log"),
|
||||
logging.StreamHandler(),
|
||||
],
|
||||
)
|
||||
log = logging.getLogger("godisnjak")
|
||||
|
||||
DSN = "host=127.0.0.1 port=6432 dbname=rinet_v3 user=rinet password=R1net2026!SecureDB#v7"
|
||||
EMBED_URL = "http://localhost:9879/api/embeddings"
|
||||
VLLM_URL = "http://localhost:8001/v1/chat/completions"
|
||||
VLLM_MODEL = "Qwen/Qwen2.5-7B-Instruct-AWQ"
|
||||
QDRANT_COLLECTION = "pgz_universe"
|
||||
DATA_DIR = "/opt/pgz-sport/_data/godisnjaci"
|
||||
MAX_WORKERS = 5
|
||||
CHUNK_SIZE = 1500 # < 2000 zbog BGE-M3 truncation
|
||||
|
||||
EXTRACT_PROMPT = """Ekstrahiraj iz teksta SVA imena osoba i njihove uloge.
|
||||
Format strogo JSON:
|
||||
{"osobe": [{"ime":"X","prezime":"Y","klub":"Z","uloga":"predsjednik|igrac|trener|tajnik|fizioterapeut|lijecnik","godina_rodenja":1990}]}
|
||||
|
||||
Uloge ISKLJUCIVO: predsjednik, dopredsjednik, tajnik, blagajnik, clan_uprave, igrac, sportas, glavni_trener, trener, pomocni_trener, kondicioni_trener, selektor, izbornik, team_manager, voditelj, lijecnik, fizioterapeut, kineziolog, maser, sudac, volonter
|
||||
|
||||
Pravila:
|
||||
1. Samo HRVATSKE osobe (ne strani sportasi koji su gostovali)
|
||||
2. Ako klub nije jasan -> ostavi prazan string
|
||||
3. NE izmisljaj imena -> samo ona JASNO IZRAZENA u tekstu
|
||||
4. Vrati VALID JSON bez markdown backtick-ova"""
|
||||
|
||||
|
||||
def chunk_text(text, size=CHUNK_SIZE):
|
||||
paragraphs = re.split(r"\n\n+", text)
|
||||
chunks, cur = [], ""
|
||||
for p in paragraphs:
|
||||
if len(cur) + len(p) > size:
|
||||
if cur:
|
||||
chunks.append(cur.strip())
|
||||
cur = p
|
||||
else:
|
||||
cur += "\n\n" + p
|
||||
if cur:
|
||||
chunks.append(cur.strip())
|
||||
return [c for c in chunks if len(c) > 100]
|
||||
|
||||
|
||||
async def embed_batch(texts):
|
||||
async with httpx.AsyncClient(timeout=120.0) as c:
|
||||
r = await c.post(EMBED_URL, json={"texts": texts})
|
||||
d = r.json()
|
||||
return d.get("embeddings", [])
|
||||
|
||||
|
||||
async def extract_persons(chunk_text_str):
|
||||
async with httpx.AsyncClient(timeout=120.0) as c:
|
||||
r = await c.post(
|
||||
VLLM_URL,
|
||||
json={
|
||||
"model": VLLM_MODEL,
|
||||
"messages": [
|
||||
{"role": "system", "content": EXTRACT_PROMPT},
|
||||
{"role": "user", "content": chunk_text_str[:5500]},
|
||||
],
|
||||
"temperature": 0.1,
|
||||
"max_tokens": 3000,
|
||||
"response_format": {"type": "json_object"},
|
||||
},
|
||||
)
|
||||
d = r.json()
|
||||
try:
|
||||
content = d["choices"][0]["message"]["content"]
|
||||
return json.loads(content)
|
||||
except Exception as e:
|
||||
log.warning(f"Parse fail: {e}")
|
||||
return {"osobe": []}
|
||||
|
||||
|
||||
def fuzzy_match_klub(naziv, conn):
|
||||
"""Fuzzy match klub name to pgz_sport.klubovi.id"""
|
||||
try:
|
||||
from rapidfuzz import fuzz
|
||||
cur = conn.cursor()
|
||||
cur.execute("SELECT id, naziv FROM pgz_sport.klubovi LIMIT 1000")
|
||||
rows = cur.fetchall()
|
||||
best_id, best_score = None, 0
|
||||
for kid, kname in rows:
|
||||
score = fuzz.token_set_ratio(naziv.lower(), kname.lower())
|
||||
if score > best_score:
|
||||
best_score = score
|
||||
best_id = kid
|
||||
return best_id if best_score > 75 else None
|
||||
except Exception as e:
|
||||
log.warning(f"Fuzzy match fail: {e}")
|
||||
return None
|
||||
|
||||
|
||||
def insert_persons(persons_data, year, conn):
|
||||
"""Insert extracted persons into pgz_sport.clanovi."""
|
||||
osobe = persons_data.get("osobe", [])
|
||||
if not osobe:
|
||||
return 0
|
||||
|
||||
inserted = 0
|
||||
cur = conn.cursor()
|
||||
|
||||
for o in osobe:
|
||||
ime = (o.get("ime") or "").strip()
|
||||
prezime = (o.get("prezime") or "").strip()
|
||||
if not ime or not prezime:
|
||||
continue
|
||||
|
||||
klub_naziv = (o.get("klub") or "").strip()
|
||||
klub_id = fuzzy_match_klub(klub_naziv, conn) if klub_naziv else None
|
||||
uloga = (o.get("uloga") or "igrac").strip()
|
||||
|
||||
# Validate uloga
|
||||
VALID_ULOGE = {
|
||||
"predsjednik", "dopredsjednik", "tajnik", "blagajnik", "clan_uprave",
|
||||
"igrac", "sportas", "glavni_trener", "trener", "pomocni_trener",
|
||||
"kondicioni_trener", "selektor", "izbornik", "team_manager", "voditelj",
|
||||
"lijecnik", "fizioterapeut", "kineziolog", "maser", "sudac", "volonter"
|
||||
}
|
||||
if uloga not in VALID_ULOGE:
|
||||
uloga = "igrac"
|
||||
|
||||
profile_key = f"godisnjak:{year}:{ime}:{prezime}:{klub_naziv}"
|
||||
|
||||
try:
|
||||
cur.execute("""
|
||||
INSERT INTO pgz_sport.clanovi
|
||||
(ime, prezime, uloga, klub_id, savez_izvor, metadata, kategorija)
|
||||
VALUES (%s, %s, %s, %s, %s, %s, %s)
|
||||
ON CONFLICT DO NOTHING
|
||||
RETURNING id
|
||||
""", (
|
||||
ime, prezime, uloga, klub_id,
|
||||
"godisnjak",
|
||||
json.dumps({"year": year, "klub_naziv": klub_naziv, "key": profile_key}),
|
||||
"sportas",
|
||||
))
|
||||
if cur.fetchone():
|
||||
inserted += 1
|
||||
except Exception as e:
|
||||
log.warning(f"Insert fail {ime} {prezime}: {e}")
|
||||
conn.rollback()
|
||||
|
||||
conn.commit()
|
||||
return inserted
|
||||
|
||||
|
||||
async def phase1_embed(files_layout):
|
||||
"""Embed sve godisnjake u pgz_universe."""
|
||||
log.info(f"Phase 1: Embed {len(files_layout)} godisnjaka")
|
||||
qdrant = QdrantClient(host="localhost", port=6333)
|
||||
|
||||
all_chunks = []
|
||||
all_meta = []
|
||||
for f in files_layout:
|
||||
m = re.search(r"godisnjak_(\d{4})_layout", f)
|
||||
year = m.group(1) if m else "unknown"
|
||||
with open(f) as fp:
|
||||
text = fp.read()
|
||||
chunks = chunk_text(text)
|
||||
for i, c in enumerate(chunks):
|
||||
all_chunks.append(c)
|
||||
all_meta.append({"year": year, "chunk_idx": i, "source": f.split("/")[-1]})
|
||||
|
||||
log.info(f"Total chunks: {len(all_chunks)}")
|
||||
|
||||
points = []
|
||||
BATCH = 32
|
||||
for i in range(0, len(all_chunks), BATCH):
|
||||
batch = all_chunks[i : i + BATCH]
|
||||
try:
|
||||
embeddings = await embed_batch(batch)
|
||||
for j, (text, emb) in enumerate(zip(batch, embeddings)):
|
||||
meta = all_meta[i + j]
|
||||
pid_key = f"godisnjak:{meta['source']}:{meta['chunk_idx']}"
|
||||
point_id = int(hashlib.md5(pid_key.encode()).hexdigest()[:15], 16)
|
||||
points.append(
|
||||
PointStruct(
|
||||
id=point_id,
|
||||
vector=emb,
|
||||
payload={**meta, "text": text[:1500], "type": "godisnjak_pgz"},
|
||||
)
|
||||
)
|
||||
except Exception as e:
|
||||
log.warning(f"Embed batch {i} fail: {e}")
|
||||
await asyncio.sleep(2)
|
||||
|
||||
if i % 200 == 0:
|
||||
log.info(f" Embed progress: {i}/{len(all_chunks)}")
|
||||
|
||||
qdrant.upsert(collection_name=QDRANT_COLLECTION, points=points)
|
||||
log.info(f"Phase 1 DONE: {len(points)} chunks → {QDRANT_COLLECTION}")
|
||||
return len(points)
|
||||
|
||||
|
||||
async def phase2_extract(files_layout):
|
||||
"""LLM ekstrakcija osoba/uloga iz godisnjaka."""
|
||||
log.info(f"Phase 2: LLM extract persons from {len(files_layout)} godisnjaka")
|
||||
|
||||
conn = psycopg2.connect(DSN)
|
||||
conn.autocommit = False
|
||||
|
||||
total_inserted = 0
|
||||
semaphore = asyncio.Semaphore(MAX_WORKERS)
|
||||
|
||||
async def process_file(f):
|
||||
nonlocal total_inserted
|
||||
m = re.search(r"godisnjak_(\d{4})_layout", f)
|
||||
year = m.group(1) if m else "unknown"
|
||||
|
||||
with open(f) as fp:
|
||||
text = fp.read()
|
||||
|
||||
chunks = chunk_text(text)
|
||||
log.info(f" Year {year}: {len(chunks)} chunks")
|
||||
|
||||
year_inserted = 0
|
||||
for i, chunk in enumerate(chunks):
|
||||
async with semaphore:
|
||||
try:
|
||||
persons = await extract_persons(chunk)
|
||||
n = insert_persons(persons, year, conn)
|
||||
year_inserted += n
|
||||
if i % 10 == 0:
|
||||
log.info(f" {year} chunk {i}/{len(chunks)}: {n} osoba")
|
||||
except Exception as e:
|
||||
log.warning(f"Extract/insert fail {year} chunk {i}: {e}")
|
||||
await asyncio.sleep(0.5)
|
||||
|
||||
total_inserted += year_inserted
|
||||
log.info(f" Year {year} DONE: {year_inserted} osoba inserted")
|
||||
|
||||
tasks = [process_file(f) for f in files_layout]
|
||||
await asyncio.gather(*tasks)
|
||||
|
||||
conn.close()
|
||||
log.info(f"Phase 2 DONE: {total_inserted} total osoba inserted")
|
||||
return total_inserted
|
||||
|
||||
|
||||
async def main():
|
||||
files_layout = sorted(glob.glob(f"{DATA_DIR}/godisnjak_*_layout.txt"))
|
||||
log.info(f"Found {len(files_layout)} layout files: {[f.split('/')[-1] for f in files_layout]}")
|
||||
|
||||
if not files_layout:
|
||||
log.error("Nema layout fajlova!")
|
||||
sys.exit(1)
|
||||
|
||||
# Phase 1: Embed
|
||||
n_embedded = await phase1_embed(files_layout)
|
||||
|
||||
# Phase 2: LLM extract
|
||||
n_persons = await phase2_extract(files_layout)
|
||||
|
||||
# Final stats
|
||||
conn = psycopg2.connect(DSN)
|
||||
cur = conn.cursor()
|
||||
cur.execute("SELECT count(*) FROM pgz_sport.clanovi WHERE savez_izvor='godisnjak'")
|
||||
total_godisnjak = cur.fetchone()[0]
|
||||
conn.close()
|
||||
|
||||
log.info(f"""
|
||||
=== GODISNJAK PIPELINE COMPLETE ===
|
||||
Chunks embedded: {n_embedded}
|
||||
Persons extracted: {n_persons}
|
||||
Total godisnjak clanovi u DB: {total_godisnjak}
|
||||
""")
|
||||
|
||||
# Telegram
|
||||
import requests as req_lib
|
||||
req_lib.post(
|
||||
"https://api.telegram.org/bot8535797835:AAFItT-92jzZ9NWFafLxn0dLa1_n2s-JE5Y/sendMessage",
|
||||
data={"chat_id": "7969491558", "text": f"✅ Godisnjak pipeline DONE: {n_embedded} chunks, {n_persons} osoba, {total_godisnjak} total u DB"},
|
||||
timeout=10,
|
||||
)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
@@ -0,0 +1,311 @@
|
||||
#!/usr/bin/env python3
|
||||
# ═══════════════════════════════════════════════════════════════════
|
||||
# Fajl: godisnjak_pipeline.py
|
||||
# Verzija: 1.0.0
|
||||
# Datum: 03.05.2026
|
||||
# Autor: Damir Radulić <dradulic@outlook.com>
|
||||
# Lokacija: /opt/pgz-sport/scripts/godisnjak_pipeline.py
|
||||
# Svrha: Embed godisnjaci PGZ u pgz_universe + LLM ekstrakcija osoba/uloga
|
||||
# Zavisi od: qdrant_client, httpx, psycopg2, rapidfuzz
|
||||
# Utječe na: pgz_universe (Qdrant), pgz_sport.clanovi (insert)
|
||||
# ═══════════════════════════════════════════════════════════════════
|
||||
"""Godisnjak PGZ embed + LLM person extraction pipeline."""
|
||||
import asyncio
|
||||
import glob
|
||||
import hashlib
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
import sys
|
||||
import time
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
|
||||
import httpx
|
||||
import psycopg2
|
||||
from psycopg2.extras import execute_batch
|
||||
from qdrant_client import QdrantClient
|
||||
from qdrant_client.models import PointStruct
|
||||
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format="%(asctime)s [godisnjak] %(levelname)s: %(message)s",
|
||||
handlers=[
|
||||
logging.FileHandler("/opt/pgz-sport/logs/godisnjak_pipeline.log"),
|
||||
logging.StreamHandler(),
|
||||
],
|
||||
)
|
||||
log = logging.getLogger("godisnjak")
|
||||
|
||||
DSN = "host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password=R1net2026!SecureDB#v7"
|
||||
EMBED_URL = "http://localhost:9879/api/embeddings"
|
||||
VLLM_URL = "http://localhost:8001/v1/chat/completions"
|
||||
VLLM_MODEL = "Qwen/Qwen2.5-7B-Instruct-AWQ"
|
||||
QDRANT_COLLECTION = "pgz_universe"
|
||||
DATA_DIR = "/opt/pgz-sport/_data/godisnjaci"
|
||||
MAX_WORKERS = 5
|
||||
CHUNK_SIZE = 1500 # < 2000 zbog BGE-M3 truncation
|
||||
|
||||
EXTRACT_PROMPT = """Ekstrahiraj iz teksta SVA imena osoba i njihove uloge.
|
||||
Format strogo JSON:
|
||||
{"osobe": [{"ime":"X","prezime":"Y","klub":"Z","uloga":"predsjednik|igrac|trener|tajnik|fizioterapeut|lijecnik","godina_rodenja":1990}]}
|
||||
|
||||
Uloge ISKLJUCIVO: predsjednik, dopredsjednik, tajnik, blagajnik, clan_uprave, igrac, sportas, glavni_trener, trener, pomocni_trener, kondicioni_trener, selektor, izbornik, team_manager, voditelj, lijecnik, fizioterapeut, kineziolog, maser, sudac, volonter
|
||||
|
||||
Pravila:
|
||||
1. Samo HRVATSKE osobe (ne strani sportasi koji su gostovali)
|
||||
2. Ako klub nije jasan -> ostavi prazan string
|
||||
3. NE izmisljaj imena -> samo ona JASNO IZRAZENA u tekstu
|
||||
4. Vrati VALID JSON bez markdown backtick-ova"""
|
||||
|
||||
|
||||
def chunk_text(text, size=CHUNK_SIZE):
|
||||
paragraphs = re.split(r"\n\n+", text)
|
||||
chunks, cur = [], ""
|
||||
for p in paragraphs:
|
||||
if len(cur) + len(p) > size:
|
||||
if cur:
|
||||
chunks.append(cur.strip())
|
||||
cur = p
|
||||
else:
|
||||
cur += "\n\n" + p
|
||||
if cur:
|
||||
chunks.append(cur.strip())
|
||||
return [c for c in chunks if len(c) > 100]
|
||||
|
||||
|
||||
async def embed_batch(texts):
|
||||
async with httpx.AsyncClient(timeout=120.0) as c:
|
||||
r = await c.post(EMBED_URL, json={"texts": texts})
|
||||
d = r.json()
|
||||
return d.get("embeddings", [])
|
||||
|
||||
|
||||
async def extract_persons(chunk_text_str):
|
||||
async with httpx.AsyncClient(timeout=120.0) as c:
|
||||
r = await c.post(
|
||||
VLLM_URL,
|
||||
json={
|
||||
"model": VLLM_MODEL,
|
||||
"messages": [
|
||||
{"role": "system", "content": EXTRACT_PROMPT},
|
||||
{"role": "user", "content": chunk_text_str[:5500]},
|
||||
],
|
||||
"temperature": 0.1,
|
||||
"max_tokens": 3000,
|
||||
"response_format": {"type": "json_object"},
|
||||
},
|
||||
)
|
||||
d = r.json()
|
||||
try:
|
||||
content = d["choices"][0]["message"]["content"]
|
||||
return json.loads(content)
|
||||
except Exception as e:
|
||||
log.warning(f"Parse fail: {e}")
|
||||
return {"osobe": []}
|
||||
|
||||
|
||||
def fuzzy_match_klub(naziv, conn):
|
||||
"""Fuzzy match klub name to pgz_sport.klubovi.id"""
|
||||
try:
|
||||
from rapidfuzz import fuzz
|
||||
cur = conn.cursor()
|
||||
cur.execute("SELECT id, naziv FROM pgz_sport.klubovi LIMIT 1000")
|
||||
rows = cur.fetchall()
|
||||
best_id, best_score = None, 0
|
||||
for kid, kname in rows:
|
||||
score = fuzz.token_set_ratio(naziv.lower(), kname.lower())
|
||||
if score > best_score:
|
||||
best_score = score
|
||||
best_id = kid
|
||||
return best_id if best_score > 75 else None
|
||||
except Exception as e:
|
||||
log.warning(f"Fuzzy match fail: {e}")
|
||||
return None
|
||||
|
||||
|
||||
def insert_persons(persons_data, year, conn):
|
||||
"""Insert extracted persons into pgz_sport.clanovi."""
|
||||
osobe = persons_data.get("osobe", [])
|
||||
if not osobe:
|
||||
return 0
|
||||
|
||||
inserted = 0
|
||||
cur = conn.cursor()
|
||||
|
||||
for o in osobe:
|
||||
ime = (o.get("ime") or "").strip()
|
||||
prezime = (o.get("prezime") or "").strip()
|
||||
if not ime or not prezime:
|
||||
continue
|
||||
|
||||
klub_naziv = (o.get("klub") or "").strip()
|
||||
klub_id = fuzzy_match_klub(klub_naziv, conn) if klub_naziv else None
|
||||
uloga = (o.get("uloga") or "igrac").strip()
|
||||
|
||||
# Validate uloga
|
||||
VALID_ULOGE = {
|
||||
"predsjednik", "dopredsjednik", "tajnik", "blagajnik", "clan_uprave",
|
||||
"igrac", "sportas", "glavni_trener", "trener", "pomocni_trener",
|
||||
"kondicioni_trener", "selektor", "izbornik", "team_manager", "voditelj",
|
||||
"lijecnik", "fizioterapeut", "kineziolog", "maser", "sudac", "volonter"
|
||||
}
|
||||
if uloga not in VALID_ULOGE:
|
||||
uloga = "igrac"
|
||||
|
||||
profile_key = f"godisnjak:{year}:{ime}:{prezime}:{klub_naziv}"
|
||||
|
||||
try:
|
||||
cur.execute("""
|
||||
INSERT INTO pgz_sport.clanovi
|
||||
(ime, prezime, uloga, klub_id, savez_izvor, metadata, kategorija)
|
||||
VALUES (%s, %s, %s, %s, %s, %s, %s)
|
||||
ON CONFLICT DO NOTHING
|
||||
RETURNING id
|
||||
""", (
|
||||
ime, prezime, uloga, klub_id,
|
||||
"godisnjak",
|
||||
json.dumps({"year": year, "klub_naziv": klub_naziv, "key": profile_key}),
|
||||
"sportas",
|
||||
))
|
||||
if cur.fetchone():
|
||||
inserted += 1
|
||||
except Exception as e:
|
||||
log.warning(f"Insert fail {ime} {prezime}: {e}")
|
||||
conn.rollback()
|
||||
|
||||
conn.commit()
|
||||
return inserted
|
||||
|
||||
|
||||
async def phase1_embed(files_layout):
|
||||
"""Embed sve godisnjake u pgz_universe."""
|
||||
log.info(f"Phase 1: Embed {len(files_layout)} godisnjaka")
|
||||
qdrant = QdrantClient(host="localhost", port=6333)
|
||||
|
||||
all_chunks = []
|
||||
all_meta = []
|
||||
for f in files_layout:
|
||||
m = re.search(r"godisnjak_(\d{4})_layout", f)
|
||||
year = m.group(1) if m else "unknown"
|
||||
with open(f) as fp:
|
||||
text = fp.read()
|
||||
chunks = chunk_text(text)
|
||||
for i, c in enumerate(chunks):
|
||||
all_chunks.append(c)
|
||||
all_meta.append({"year": year, "chunk_idx": i, "source": f.split("/")[-1]})
|
||||
|
||||
log.info(f"Total chunks: {len(all_chunks)}")
|
||||
|
||||
points = []
|
||||
BATCH = 32
|
||||
for i in range(0, len(all_chunks), BATCH):
|
||||
batch = all_chunks[i : i + BATCH]
|
||||
try:
|
||||
embeddings = await embed_batch(batch)
|
||||
for j, (text, emb) in enumerate(zip(batch, embeddings)):
|
||||
meta = all_meta[i + j]
|
||||
pid_key = f"godisnjak:{meta['source']}:{meta['chunk_idx']}"
|
||||
point_id = int(hashlib.md5(pid_key.encode()).hexdigest()[:15], 16)
|
||||
points.append(
|
||||
PointStruct(
|
||||
id=point_id,
|
||||
vector=emb,
|
||||
payload={**meta, "text": text[:1500], "type": "godisnjak_pgz"},
|
||||
)
|
||||
)
|
||||
except Exception as e:
|
||||
log.warning(f"Embed batch {i} fail: {e}")
|
||||
await asyncio.sleep(2)
|
||||
|
||||
if i % 200 == 0:
|
||||
log.info(f" Embed progress: {i}/{len(all_chunks)}")
|
||||
|
||||
qdrant.upsert(collection_name=QDRANT_COLLECTION, points=points)
|
||||
log.info(f"Phase 1 DONE: {len(points)} chunks → {QDRANT_COLLECTION}")
|
||||
return len(points)
|
||||
|
||||
|
||||
async def phase2_extract(files_layout):
|
||||
"""LLM ekstrakcija osoba/uloga iz godisnjaka."""
|
||||
log.info(f"Phase 2: LLM extract persons from {len(files_layout)} godisnjaka")
|
||||
|
||||
conn = psycopg2.connect(DSN)
|
||||
conn.autocommit = False
|
||||
|
||||
total_inserted = 0
|
||||
semaphore = asyncio.Semaphore(MAX_WORKERS)
|
||||
|
||||
async def process_file(f):
|
||||
nonlocal total_inserted
|
||||
m = re.search(r"godisnjak_(\d{4})_layout", f)
|
||||
year = m.group(1) if m else "unknown"
|
||||
|
||||
with open(f) as fp:
|
||||
text = fp.read()
|
||||
|
||||
chunks = chunk_text(text)
|
||||
log.info(f" Year {year}: {len(chunks)} chunks")
|
||||
|
||||
year_inserted = 0
|
||||
for i, chunk in enumerate(chunks):
|
||||
async with semaphore:
|
||||
try:
|
||||
persons = await extract_persons(chunk)
|
||||
n = insert_persons(persons, year, conn)
|
||||
year_inserted += n
|
||||
if i % 10 == 0:
|
||||
log.info(f" {year} chunk {i}/{len(chunks)}: {n} osoba")
|
||||
except Exception as e:
|
||||
log.warning(f"Extract/insert fail {year} chunk {i}: {e}")
|
||||
await asyncio.sleep(0.5)
|
||||
|
||||
total_inserted += year_inserted
|
||||
log.info(f" Year {year} DONE: {year_inserted} osoba inserted")
|
||||
|
||||
tasks = [process_file(f) for f in files_layout]
|
||||
await asyncio.gather(*tasks)
|
||||
|
||||
conn.close()
|
||||
log.info(f"Phase 2 DONE: {total_inserted} total osoba inserted")
|
||||
return total_inserted
|
||||
|
||||
|
||||
async def main():
|
||||
files_layout = sorted(glob.glob(f"{DATA_DIR}/godisnjak_*_layout.txt"))
|
||||
log.info(f"Found {len(files_layout)} layout files: {[f.split('/')[-1] for f in files_layout]}")
|
||||
|
||||
if not files_layout:
|
||||
log.error("Nema layout fajlova!")
|
||||
sys.exit(1)
|
||||
|
||||
# Phase 1: Embed
|
||||
n_embedded = await phase1_embed(files_layout)
|
||||
|
||||
# Phase 2: LLM extract
|
||||
n_persons = await phase2_extract(files_layout)
|
||||
|
||||
# Final stats
|
||||
conn = psycopg2.connect(DSN)
|
||||
cur = conn.cursor()
|
||||
cur.execute("SELECT count(*) FROM pgz_sport.clanovi WHERE savez_izvor='godisnjak'")
|
||||
total_godisnjak = cur.fetchone()[0]
|
||||
conn.close()
|
||||
|
||||
log.info(f"""
|
||||
=== GODISNJAK PIPELINE COMPLETE ===
|
||||
Chunks embedded: {n_embedded}
|
||||
Persons extracted: {n_persons}
|
||||
Total godisnjak clanovi u DB: {total_godisnjak}
|
||||
""")
|
||||
|
||||
# Telegram
|
||||
import requests as req_lib
|
||||
req_lib.post(
|
||||
"https://api.telegram.org/bot8535797835:AAFItT-92jzZ9NWFafLxn0dLa1_n2s-JE5Y/sendMessage",
|
||||
data={"chat_id": "7969491558", "text": f"✅ Godisnjak pipeline DONE: {n_embedded} chunks, {n_persons} osoba, {total_godisnjak} total u DB"},
|
||||
timeout=10,
|
||||
)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
Reference in New Issue
Block a user