PGŽ Sport Platform — Round 1+2 baseline (sport2.html + API)

This commit is contained in:
Damir Radulić
2026-05-04 23:39:08 +02:00
commit a7ec0a86be
1820 changed files with 694455 additions and 0 deletions
+193
View File
@@ -0,0 +1,193 @@
#!/usr/bin/env python3
"""
geocode_objekti_v2.py — precision geocoding for pgz_sport.sportski_objekti
Re-geocodes all objects via Nominatim using {naziv} + {grad} + ", Hrvatska" queries.
Verifies result is within PGŽ bounds (44.5-45.6, 14.0-15.1) and NOT a duplicated
"city centroid" (where multiple objects share identical coordinates from a previous
fallback pass). Updates lat/lng only when a more precise result is found.
Usage: python3 geocode_objekti_v2.py [--dry-run] [--only-duplicates]
"""
import os, sys, time, json, urllib.parse, argparse
import urllib.request
import psycopg2
PG = dict(host=os.environ.get('PG_HOST','10.10.0.2'),
port=int(os.environ.get('PG_PORT','6432')),
dbname=os.environ.get('PG_DB','rinet_v3'),
user=os.environ.get('PG_USER','rinet'),
password=os.environ.get('PG_PASS',''))
PGZ_LAT = (44.5, 45.6)
PGZ_LNG = (14.0, 15.1)
UA = 'pgz-sport/2.0 (dradulic@outlook.com)'
def nominatim(q, country='hr', limit=3):
url = ('https://nominatim.openstreetmap.org/search?'
'q='+urllib.parse.quote(q)+
'&format=json&limit='+str(limit)+
'&countrycodes='+country+
'&addressdetails=1')
req = urllib.request.Request(url, headers={'User-Agent': UA})
try:
with urllib.request.urlopen(req, timeout=10) as r:
return json.loads(r.read().decode())
except Exception as e:
print(f' ! nominatim error: {e}')
return []
def in_pgz(lat, lng):
return PGZ_LAT[0] <= lat <= PGZ_LAT[1] and PGZ_LNG[0] <= lng <= PGZ_LNG[1]
def best_result(results):
"""Pick best precision: prefer leisure/sports types, then building, then place."""
if not results:
return None
type_priority = {
'sports_centre': 100, 'stadium': 95, 'pitch': 90, 'swimming_pool': 90,
'sports_hall': 95, 'leisure': 80, 'building': 70, 'tourism': 60,
'highway': 30, 'place': 20,
}
best = None
best_score = -1
for r in results:
try:
lat = float(r['lat']); lng = float(r['lon'])
except (KeyError, ValueError):
continue
if not in_pgz(lat, lng):
continue
cls = r.get('class','')
typ = r.get('type','')
# importance is Nominatim's intrinsic relevance score
importance = float(r.get('importance', 0))
score = type_priority.get(typ, type_priority.get(cls, 50)) + importance*10
if score > best_score:
best_score = score
best = (lat, lng, r)
return best
def queries_for(naziv, grad, adresa):
"""Generate ordered queries from most specific to most general."""
qs = []
n = (naziv or '').strip()
g = (grad or '').strip()
a = (adresa or '').strip()
if a and g:
qs.append(f'{a}, {g}, Hrvatska')
if n and g:
qs.append(f'{n}, {g}, Hrvatska')
# Strip common prefixes for a cleaner search
short = n
for prefix in ('Sportska dvorana ', 'Gradska sportska dvorana ',
'Multifunkcionalna dvorana za sport i turizam ',
'Stadion ', 'Bazen ', 'Bazeni ', 'Dvorana ',
'Boćalište ', 'Kuglana ', 'Marina '):
if short.startswith(prefix):
short = short[len(prefix):].strip()
break
if short and short != n and g:
qs.append(f'{short}, {g}, Hrvatska')
if n:
qs.append(f'{n}, Hrvatska')
if g and a:
qs.append(f'{a}, {g}')
# dedup preserving order
seen = set(); out = []
for q in qs:
if q not in seen:
seen.add(q); out.append(q)
return out
def main():
ap = argparse.ArgumentParser()
ap.add_argument('--dry-run', action='store_true')
ap.add_argument('--only-duplicates', action='store_true',
help='only re-geocode objects sharing coordinates with another object')
ap.add_argument('--id', type=int, help='single object ID to re-geocode')
args = ap.parse_args()
conn = psycopg2.connect(**PG)
cur = conn.cursor()
if args.id:
cur.execute("SELECT id, naziv, grad, adresa, lat, lng FROM pgz_sport.sportski_objekti WHERE id=%s", (args.id,))
elif args.only_duplicates:
cur.execute("""
WITH dup AS (
SELECT lat, lng FROM pgz_sport.sportski_objekti
WHERE lat IS NOT NULL
GROUP BY lat, lng HAVING count(*)>1
)
SELECT s.id, s.naziv, s.grad, s.adresa, s.lat, s.lng
FROM pgz_sport.sportski_objekti s
JOIN dup d USING (lat, lng)
ORDER BY s.id
""")
else:
cur.execute("SELECT id, naziv, grad, adresa, lat, lng FROM pgz_sport.sportski_objekti ORDER BY id")
rows = cur.fetchall()
print(f'== Processing {len(rows)} objects (dry_run={args.dry_run}) ==')
updated = 0
skipped = 0
failed = []
for i, (oid, naziv, grad, adresa, oldlat, oldlng) in enumerate(rows, 1):
print(f'[{i}/{len(rows)}] #{oid} {naziv} ({grad}) — current: {oldlat},{oldlng}')
new_pos = None
for q in queries_for(naziv, grad, adresa):
results = nominatim(q)
time.sleep(1.05) # Nominatim 1 req/s policy
best = best_result(results)
if best:
lat, lng, raw = best
# Skip queries that just resolve to a place/town center
if raw.get('class') == 'place' and raw.get('type') in ('city','town','village','suburb','locality'):
print(f' "{q}" -> {raw.get("display_name","")[:60]} (place type, skip)')
continue
print(f' "{q}" -> {lat},{lng} [{raw.get("class")}/{raw.get("type")}]')
new_pos = (lat, lng, q)
break
else:
print(f' "{q}" -> no result in PGŽ bounds')
if not new_pos:
failed.append((oid, naziv, grad))
print(' ✗ no precise match found')
continue
nlat, nlng, nq = new_pos
# Detect meaningful change (>50m). 0.0005° ≈ 55m at this latitude.
if oldlat is not None and oldlng is not None:
dlat = abs(float(oldlat) - nlat)
dlng = abs(float(oldlng) - nlng)
if dlat < 0.0005 and dlng < 0.0005:
print(f' = unchanged (within 50m)')
skipped += 1
continue
if args.dry_run:
print(f' [DRY] would UPDATE id={oid} -> {nlat},{nlng}')
else:
cur.execute("""
UPDATE pgz_sport.sportski_objekti
SET lat=%s, lng=%s
WHERE id=%s
""", (nlat, nlng, oid))
conn.commit()
print(f' ✓ UPDATED -> {nlat},{nlng}')
updated += 1
print('')
print(f'== Summary: {updated} updated, {skipped} unchanged, {len(failed)} failed ==')
if failed:
print('Failed:')
for oid, n, g in failed:
print(f' #{oid} {n} ({g})')
cur.close(); conn.close()
if __name__ == '__main__':
main()
+209
View File
@@ -0,0 +1,209 @@
#!/usr/bin/env python3
# ═══════════════════════════════════════════════════════════════════
# Fajl: godisnjak_extract.py
# Verzija: 1.0.0
# Datum: 03.05.2026
# Autor: Damir Radulić <dradulic@outlook.com>
# Lokacija: /opt/pgz-sport/scripts/godisnjak_extract.py
# Svrha: LLM ekstrakcija osoba/uloga iz godisnjaka PGZ (Phase 2)
# Zavisi od: httpx, psycopg2, rapidfuzz
# Utječe na: pgz_sport.clanovi
# ═══════════════════════════════════════════════════════════════════
import asyncio, glob, json, logging, re, sys, time
import httpx, psycopg2
from psycopg2.extras import execute_batch
from rapidfuzz import fuzz
logging.basicConfig(
level=logging.INFO,
format="[%(asctime)s] %(message)s",
datefmt="%H:%M:%S",
handlers=[
logging.FileHandler("/opt/pgz-sport/logs/godisnjak_extract.log"),
logging.StreamHandler(),
],
)
log = logging.getLogger("extract")
DSN = "host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password=R1net2026!SecureDB#v7"
VLLM_URL = "http://localhost:8001/v1/chat/completions"
VLLM_MODEL = "Qwen/Qwen2.5-7B-Instruct-AWQ"
DATA_DIR = "/opt/pgz-sport/_data/godisnjaci"
MAX_WORKERS = 4
CHUNK_SIZE = 1400
EXTRACT_PROMPT = """Ekstrahiraj iz teksta SVA imena osoba i njihove uloge.
Vrati ISKLJUCIVO valid JSON (bez markdown, bez objasnjenja):
{"osobe": [{"ime":"X","prezime":"Y","klub":"Z","uloga":"igrac","godina_rodenja":1990}]}
Dozvoljene uloge: predsjednik, dopredsjednik, tajnik, blagajnik, clan_uprave,
igrac, sportas, glavni_trener, trener, pomocni_trener, kondicioni_trener,
selektor, izbornik, team_manager, voditelj, lijecnik, fizioterapeut,
kineziolog, maser, sudac, volonter
Pravila:
1. Samo HRVATSKA osobe s punim imenom i prezimenom
2. Ako klub nije eksplicitno naveden -> klub=""
3. NE izmisljaj - samo jasno navedena imena u tekstu
4. Godina rodenja samo ako eksplicitno u tekstu, inace izostavi"""
def chunk_text(text, size=CHUNK_SIZE):
paragraphs = re.split(r'\n\n+', text)
chunks, cur = [], ""
for p in paragraphs:
if len(cur) + len(p) > size:
if cur: chunks.append(cur.strip())
cur = p
else:
cur += "\n\n" + p
if cur: chunks.append(cur.strip())
return [c for c in chunks if len(c) > 80]
# Preload klub cache
def load_klub_cache(conn):
cur = conn.cursor()
cur.execute("SELECT id, naziv FROM pgz_sport.klubovi WHERE aktivan=true OR aktivan IS NULL LIMIT 2000")
return cur.fetchall()
def fuzzy_klub(naziv, cache):
if not naziv or len(naziv) < 3:
return None
best_id, best_score = None, 0
for kid, kname in cache:
score = fuzz.token_set_ratio(naziv.lower(), kname.lower())
if score > best_score:
best_score, best_id = score, kid
return best_id if best_score > 72 else None
async def extract_persons(chunk_text_str, semaphore):
async with semaphore:
try:
async with httpx.AsyncClient(timeout=90.0) as c:
r = await c.post(VLLM_URL, json={
"model": VLLM_MODEL,
"messages": [
{"role": "system", "content": EXTRACT_PROMPT},
{"role": "user", "content": chunk_text_str[:5000]},
],
"temperature": 0.05,
"max_tokens": 2500,
"response_format": {"type": "json_object"},
})
d = r.json()
content = d["choices"][0]["message"]["content"]
return json.loads(content)
except Exception as e:
log.debug(f"Extract fail: {e}")
return {"osobe": []}
VALID_ULOGE = {
"predsjednik","dopredsjednik","tajnik","blagajnik","clan_uprave",
"igrac","sportas","glavni_trener","trener","pomocni_trener","kondicioni_trener",
"selektor","izbornik","team_manager","voditelj","lijecnik","fizioterapeut",
"kineziolog","maser","sudac","volonter"
}
async def main():
conn = psycopg2.connect(DSN)
conn.autocommit = True
cur = conn.cursor()
# Backup
cur.execute("""CREATE TABLE IF NOT EXISTS pgz_sport.clanovi_pre_godisnjak_backup
AS SELECT * FROM pgz_sport.clanovi WHERE 1=0""")
cur.execute("""INSERT INTO pgz_sport.clanovi_pre_godisnjak_backup
SELECT * FROM pgz_sport.clanovi""")
log.info("Backup created")
klub_cache = load_klub_cache(conn)
log.info(f"Klub cache: {len(klub_cache)} klubova")
files = sorted(glob.glob(f"{DATA_DIR}/godisnjak_*_layout.txt"))
log.info(f"Files: {len(files)}")
semaphore = asyncio.Semaphore(MAX_WORKERS)
total_inserted = 0
total_skipped = 0
for f in files:
m = re.search(r'godisnjak_(\d{4})_layout', f)
year = m.group(1) if m else "?"
with open(f) as fp:
text = fp.read()
chunks = chunk_text(text)
log.info(f"Year {year}: {len(chunks)} chunks")
tasks = [extract_persons(c, semaphore) for c in chunks]
results = await asyncio.gather(*tasks)
year_ins = 0
rows = []
for res in results:
for o in res.get("osobe", []):
ime = (o.get("ime") or "").strip()
prezime = (o.get("prezime") or "").strip()
if not ime or not prezime or len(ime) < 2 or len(prezime) < 2:
continue
# Basic sanity — no numbers, no too-long names
if re.search(r'\d', ime+prezime) or len(ime+prezime) > 60:
continue
uloga = (o.get("uloga") or "igrac").lower().strip()
if uloga not in VALID_ULOGE:
uloga = "igrac"
klub_naziv = (o.get("klub") or "").strip()
klub_id = fuzzy_klub(klub_naziv, klub_cache)
rows.append((
ime, prezime, uloga, klub_id,
"godisnjak",
json.dumps({"year": int(year), "klub_naziv": klub_naziv}),
"sportas",
))
# Batch upsert — ON CONFLICT skip duplicates by ime+prezime+savez_izvor+year via metadata
for row in rows:
try:
cur.execute("""
INSERT INTO pgz_sport.clanovi
(ime, prezime, uloga, klub_id, savez_izvor, metadata, kategorija)
VALUES (%s,%s,%s,%s,%s,%s,%s)
ON CONFLICT DO NOTHING
""", row)
if cur.rowcount:
year_ins += 1
except Exception as e:
log.debug(f"Insert skip: {e}")
total_inserted += year_ins
log.info(f" {year}: {year_ins} osoba inserted (running total: {total_inserted})")
cur.execute("SELECT count(*) FROM pgz_sport.clanovi WHERE savez_izvor='godisnjak'")
final = cur.fetchone()[0]
conn.close()
log.info(f"""
=== EXTRACT DONE ===
Inserted this run: {total_inserted}
Total godisnjak u DB: {final}
""")
import requests as rq
rq.post(
"https://api.telegram.org/bot8535797835:AAFItT-92jzZ9NWFafLxn0dLa1_n2s-JE5Y/sendMessage",
data={"chat_id": "7969491558",
"text": f"✅ Godisnjak LLM extract DONE: {total_inserted} novih osoba, {final} total"},
timeout=10,
)
if __name__ == "__main__":
asyncio.run(main())
@@ -0,0 +1,209 @@
#!/usr/bin/env python3
# ═══════════════════════════════════════════════════════════════════
# Fajl: godisnjak_extract.py
# Verzija: 1.0.0
# Datum: 03.05.2026
# Autor: Damir Radulić <dradulic@outlook.com>
# Lokacija: /opt/pgz-sport/scripts/godisnjak_extract.py
# Svrha: LLM ekstrakcija osoba/uloga iz godisnjaka PGZ (Phase 2)
# Zavisi od: httpx, psycopg2, rapidfuzz
# Utječe na: pgz_sport.clanovi
# ═══════════════════════════════════════════════════════════════════
import asyncio, glob, json, logging, re, sys, time
import httpx, psycopg2
from psycopg2.extras import execute_batch
from rapidfuzz import fuzz
logging.basicConfig(
level=logging.INFO,
format="[%(asctime)s] %(message)s",
datefmt="%H:%M:%S",
handlers=[
logging.FileHandler("/opt/pgz-sport/logs/godisnjak_extract.log"),
logging.StreamHandler(),
],
)
log = logging.getLogger("extract")
DSN = "host=127.0.0.1 port=6432 dbname=rinet_v3 user=rinet password=R1net2026!SecureDB#v7"
VLLM_URL = "http://localhost:8001/v1/chat/completions"
VLLM_MODEL = "Qwen/Qwen2.5-7B-Instruct-AWQ"
DATA_DIR = "/opt/pgz-sport/_data/godisnjaci"
MAX_WORKERS = 4
CHUNK_SIZE = 1400
EXTRACT_PROMPT = """Ekstrahiraj iz teksta SVA imena osoba i njihove uloge.
Vrati ISKLJUCIVO valid JSON (bez markdown, bez objasnjenja):
{"osobe": [{"ime":"X","prezime":"Y","klub":"Z","uloga":"igrac","godina_rodenja":1990}]}
Dozvoljene uloge: predsjednik, dopredsjednik, tajnik, blagajnik, clan_uprave,
igrac, sportas, glavni_trener, trener, pomocni_trener, kondicioni_trener,
selektor, izbornik, team_manager, voditelj, lijecnik, fizioterapeut,
kineziolog, maser, sudac, volonter
Pravila:
1. Samo HRVATSKA osobe s punim imenom i prezimenom
2. Ako klub nije eksplicitno naveden -> klub=""
3. NE izmisljaj - samo jasno navedena imena u tekstu
4. Godina rodenja samo ako eksplicitno u tekstu, inace izostavi"""
def chunk_text(text, size=CHUNK_SIZE):
paragraphs = re.split(r'\n\n+', text)
chunks, cur = [], ""
for p in paragraphs:
if len(cur) + len(p) > size:
if cur: chunks.append(cur.strip())
cur = p
else:
cur += "\n\n" + p
if cur: chunks.append(cur.strip())
return [c for c in chunks if len(c) > 80]
# Preload klub cache
def load_klub_cache(conn):
cur = conn.cursor()
cur.execute("SELECT id, naziv FROM pgz_sport.klubovi WHERE aktivan=true OR aktivan IS NULL LIMIT 2000")
return cur.fetchall()
def fuzzy_klub(naziv, cache):
if not naziv or len(naziv) < 3:
return None
best_id, best_score = None, 0
for kid, kname in cache:
score = fuzz.token_set_ratio(naziv.lower(), kname.lower())
if score > best_score:
best_score, best_id = score, kid
return best_id if best_score > 72 else None
async def extract_persons(chunk_text_str, semaphore):
async with semaphore:
try:
async with httpx.AsyncClient(timeout=90.0) as c:
r = await c.post(VLLM_URL, json={
"model": VLLM_MODEL,
"messages": [
{"role": "system", "content": EXTRACT_PROMPT},
{"role": "user", "content": chunk_text_str[:5000]},
],
"temperature": 0.05,
"max_tokens": 2500,
"response_format": {"type": "json_object"},
})
d = r.json()
content = d["choices"][0]["message"]["content"]
return json.loads(content)
except Exception as e:
log.debug(f"Extract fail: {e}")
return {"osobe": []}
VALID_ULOGE = {
"predsjednik","dopredsjednik","tajnik","blagajnik","clan_uprave",
"igrac","sportas","glavni_trener","trener","pomocni_trener","kondicioni_trener",
"selektor","izbornik","team_manager","voditelj","lijecnik","fizioterapeut",
"kineziolog","maser","sudac","volonter"
}
async def main():
conn = psycopg2.connect(DSN)
conn.autocommit = True
cur = conn.cursor()
# Backup
cur.execute("""CREATE TABLE IF NOT EXISTS pgz_sport.clanovi_pre_godisnjak_backup
AS SELECT * FROM pgz_sport.clanovi WHERE 1=0""")
cur.execute("""INSERT INTO pgz_sport.clanovi_pre_godisnjak_backup
SELECT * FROM pgz_sport.clanovi""")
log.info("Backup created")
klub_cache = load_klub_cache(conn)
log.info(f"Klub cache: {len(klub_cache)} klubova")
files = sorted(glob.glob(f"{DATA_DIR}/godisnjak_*_layout.txt"))
log.info(f"Files: {len(files)}")
semaphore = asyncio.Semaphore(MAX_WORKERS)
total_inserted = 0
total_skipped = 0
for f in files:
m = re.search(r'godisnjak_(\d{4})_layout', f)
year = m.group(1) if m else "?"
with open(f) as fp:
text = fp.read()
chunks = chunk_text(text)
log.info(f"Year {year}: {len(chunks)} chunks")
tasks = [extract_persons(c, semaphore) for c in chunks]
results = await asyncio.gather(*tasks)
year_ins = 0
rows = []
for res in results:
for o in res.get("osobe", []):
ime = (o.get("ime") or "").strip()
prezime = (o.get("prezime") or "").strip()
if not ime or not prezime or len(ime) < 2 or len(prezime) < 2:
continue
# Basic sanity — no numbers, no too-long names
if re.search(r'\d', ime+prezime) or len(ime+prezime) > 60:
continue
uloga = (o.get("uloga") or "igrac").lower().strip()
if uloga not in VALID_ULOGE:
uloga = "igrac"
klub_naziv = (o.get("klub") or "").strip()
klub_id = fuzzy_klub(klub_naziv, klub_cache)
rows.append((
ime, prezime, uloga, klub_id,
"godisnjak",
json.dumps({"year": int(year), "klub_naziv": klub_naziv}),
"sportas",
))
# Batch upsert — ON CONFLICT skip duplicates by ime+prezime+savez_izvor+year via metadata
for row in rows:
try:
cur.execute("""
INSERT INTO pgz_sport.clanovi
(ime, prezime, uloga, klub_id, savez_izvor, metadata, kategorija)
VALUES (%s,%s,%s,%s,%s,%s,%s)
ON CONFLICT DO NOTHING
""", row)
if cur.rowcount:
year_ins += 1
except Exception as e:
log.debug(f"Insert skip: {e}")
total_inserted += year_ins
log.info(f" {year}: {year_ins} osoba inserted (running total: {total_inserted})")
cur.execute("SELECT count(*) FROM pgz_sport.clanovi WHERE savez_izvor='godisnjak'")
final = cur.fetchone()[0]
conn.close()
log.info(f"""
=== EXTRACT DONE ===
Inserted this run: {total_inserted}
Total godisnjak u DB: {final}
""")
import requests as rq
rq.post(
"https://api.telegram.org/bot8535797835:AAFItT-92jzZ9NWFafLxn0dLa1_n2s-JE5Y/sendMessage",
data={"chat_id": "7969491558",
"text": f"✅ Godisnjak LLM extract DONE: {total_inserted} novih osoba, {final} total"},
timeout=10,
)
if __name__ == "__main__":
asyncio.run(main())
@@ -0,0 +1,209 @@
#!/usr/bin/env python3
# ═══════════════════════════════════════════════════════════════════
# Fajl: godisnjak_extract.py
# Verzija: 1.0.0
# Datum: 03.05.2026
# Autor: Damir Radulić <dradulic@outlook.com>
# Lokacija: /opt/pgz-sport/scripts/godisnjak_extract.py
# Svrha: LLM ekstrakcija osoba/uloga iz godisnjaka PGZ (Phase 2)
# Zavisi od: httpx, psycopg2, rapidfuzz
# Utječe na: pgz_sport.clanovi
# ═══════════════════════════════════════════════════════════════════
import asyncio, glob, json, logging, re, sys, time
import httpx, psycopg2
from psycopg2.extras import execute_batch
from rapidfuzz import fuzz
logging.basicConfig(
level=logging.INFO,
format="[%(asctime)s] %(message)s",
datefmt="%H:%M:%S",
handlers=[
logging.FileHandler("/opt/pgz-sport/logs/godisnjak_extract.log"),
logging.StreamHandler(),
],
)
log = logging.getLogger("extract")
DSN = "host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password=R1net2026!SecureDB#v7"
VLLM_URL = "http://localhost:8001/v1/chat/completions"
VLLM_MODEL = "Qwen/Qwen2.5-7B-Instruct-AWQ"
DATA_DIR = "/opt/pgz-sport/_data/godisnjaci"
MAX_WORKERS = 4
CHUNK_SIZE = 1400
EXTRACT_PROMPT = """Ekstrahiraj iz teksta SVA imena osoba i njihove uloge.
Vrati ISKLJUCIVO valid JSON (bez markdown, bez objasnjenja):
{"osobe": [{"ime":"X","prezime":"Y","klub":"Z","uloga":"igrac","godina_rodenja":1990}]}
Dozvoljene uloge: predsjednik, dopredsjednik, tajnik, blagajnik, clan_uprave,
igrac, sportas, glavni_trener, trener, pomocni_trener, kondicioni_trener,
selektor, izbornik, team_manager, voditelj, lijecnik, fizioterapeut,
kineziolog, maser, sudac, volonter
Pravila:
1. Samo HRVATSKA osobe s punim imenom i prezimenom
2. Ako klub nije eksplicitno naveden -> klub=""
3. NE izmisljaj - samo jasno navedena imena u tekstu
4. Godina rodenja samo ako eksplicitno u tekstu, inace izostavi"""
def chunk_text(text, size=CHUNK_SIZE):
paragraphs = re.split(r'\n\n+', text)
chunks, cur = [], ""
for p in paragraphs:
if len(cur) + len(p) > size:
if cur: chunks.append(cur.strip())
cur = p
else:
cur += "\n\n" + p
if cur: chunks.append(cur.strip())
return [c for c in chunks if len(c) > 80]
# Preload klub cache
def load_klub_cache(conn):
cur = conn.cursor()
cur.execute("SELECT id, naziv FROM pgz_sport.klubovi WHERE aktivan=true OR aktivan IS NULL LIMIT 2000")
return cur.fetchall()
def fuzzy_klub(naziv, cache):
if not naziv or len(naziv) < 3:
return None
best_id, best_score = None, 0
for kid, kname in cache:
score = fuzz.token_set_ratio(naziv.lower(), kname.lower())
if score > best_score:
best_score, best_id = score, kid
return best_id if best_score > 72 else None
async def extract_persons(chunk_text_str, semaphore):
async with semaphore:
try:
async with httpx.AsyncClient(timeout=90.0) as c:
r = await c.post(VLLM_URL, json={
"model": VLLM_MODEL,
"messages": [
{"role": "system", "content": EXTRACT_PROMPT},
{"role": "user", "content": chunk_text_str[:5000]},
],
"temperature": 0.05,
"max_tokens": 2500,
"response_format": {"type": "json_object"},
})
d = r.json()
content = d["choices"][0]["message"]["content"]
return json.loads(content)
except Exception as e:
log.debug(f"Extract fail: {e}")
return {"osobe": []}
VALID_ULOGE = {
"predsjednik","dopredsjednik","tajnik","blagajnik","clan_uprave",
"igrac","sportas","glavni_trener","trener","pomocni_trener","kondicioni_trener",
"selektor","izbornik","team_manager","voditelj","lijecnik","fizioterapeut",
"kineziolog","maser","sudac","volonter"
}
async def main():
conn = psycopg2.connect(DSN)
conn.autocommit = True
cur = conn.cursor()
# Backup
cur.execute("""CREATE TABLE IF NOT EXISTS pgz_sport.clanovi_pre_godisnjak_backup
AS SELECT * FROM pgz_sport.clanovi WHERE 1=0""")
cur.execute("""INSERT INTO pgz_sport.clanovi_pre_godisnjak_backup
SELECT * FROM pgz_sport.clanovi""")
log.info("Backup created")
klub_cache = load_klub_cache(conn)
log.info(f"Klub cache: {len(klub_cache)} klubova")
files = sorted(glob.glob(f"{DATA_DIR}/godisnjak_*_layout.txt"))
log.info(f"Files: {len(files)}")
semaphore = asyncio.Semaphore(MAX_WORKERS)
total_inserted = 0
total_skipped = 0
for f in files:
m = re.search(r'godisnjak_(\d{4})_layout', f)
year = m.group(1) if m else "?"
with open(f) as fp:
text = fp.read()
chunks = chunk_text(text)
log.info(f"Year {year}: {len(chunks)} chunks")
tasks = [extract_persons(c, semaphore) for c in chunks]
results = await asyncio.gather(*tasks)
year_ins = 0
rows = []
for res in results:
for o in res.get("osobe", []):
ime = (o.get("ime") or "").strip()
prezime = (o.get("prezime") or "").strip()
if not ime or not prezime or len(ime) < 2 or len(prezime) < 2:
continue
# Basic sanity — no numbers, no too-long names
if re.search(r'\d', ime+prezime) or len(ime+prezime) > 60:
continue
uloga = (o.get("uloga") or "igrac").lower().strip()
if uloga not in VALID_ULOGE:
uloga = "igrac"
klub_naziv = (o.get("klub") or "").strip()
klub_id = fuzzy_klub(klub_naziv, klub_cache)
rows.append((
ime, prezime, uloga, klub_id,
"godisnjak",
json.dumps({"year": int(year), "klub_naziv": klub_naziv}),
"sportas",
))
# Batch upsert — ON CONFLICT skip duplicates by ime+prezime+savez_izvor+year via metadata
for row in rows:
try:
cur.execute("""
INSERT INTO pgz_sport.clanovi
(ime, prezime, uloga, klub_id, savez_izvor, metadata, kategorija)
VALUES (%s,%s,%s,%s,%s,%s,%s)
ON CONFLICT DO NOTHING
""", row)
if cur.rowcount:
year_ins += 1
except Exception as e:
log.debug(f"Insert skip: {e}")
total_inserted += year_ins
log.info(f" {year}: {year_ins} osoba inserted (running total: {total_inserted})")
cur.execute("SELECT count(*) FROM pgz_sport.clanovi WHERE savez_izvor='godisnjak'")
final = cur.fetchone()[0]
conn.close()
log.info(f"""
=== EXTRACT DONE ===
Inserted this run: {total_inserted}
Total godisnjak u DB: {final}
""")
import requests as rq
rq.post(
"https://api.telegram.org/bot8535797835:AAFItT-92jzZ9NWFafLxn0dLa1_n2s-JE5Y/sendMessage",
data={"chat_id": "7969491558",
"text": f"✅ Godisnjak LLM extract DONE: {total_inserted} novih osoba, {final} total"},
timeout=10,
)
if __name__ == "__main__":
asyncio.run(main())
+316
View File
@@ -0,0 +1,316 @@
#!/usr/bin/env python3
# ═══════════════════════════════════════════════════════════════════
# Fajl: godisnjak_pipeline.py
# Verzija: 1.0.0
# Datum: 03.05.2026
# Autor: Damir Radulić <dradulic@outlook.com>
# Lokacija: /opt/pgz-sport/scripts/godisnjak_pipeline.py
# Svrha: Embed godisnjaci PGZ u pgz_universe + LLM ekstrakcija osoba/uloga
# Zavisi od: qdrant_client, httpx, psycopg2, rapidfuzz
# Utječe na: pgz_universe (Qdrant), pgz_sport.clanovi (insert)
# ═══════════════════════════════════════════════════════════════════
"""Godisnjak PGZ embed + LLM person extraction pipeline."""
import asyncio
import glob
import hashlib
import json
import logging
import re
import sys
sys.path.insert(0, '/opt/rinet-gpu/lib')
try:
from tg_notify import notify as _tg_notify
except ImportError:
_tg_notify = None
import time
from concurrent.futures import ThreadPoolExecutor
import httpx
import psycopg2
from psycopg2.extras import execute_batch
from qdrant_client import QdrantClient
from qdrant_client.models import PointStruct
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s [godisnjak] %(levelname)s: %(message)s",
handlers=[
logging.FileHandler("/opt/pgz-sport/logs/godisnjak_pipeline.log"),
logging.StreamHandler(),
],
)
log = logging.getLogger("godisnjak")
DSN = "host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password=R1net2026!SecureDB#v7"
EMBED_URL = "http://localhost:9879/api/embeddings"
VLLM_URL = "http://localhost:8001/v1/chat/completions"
VLLM_MODEL = "Qwen/Qwen2.5-7B-Instruct-AWQ"
QDRANT_COLLECTION = "pgz_universe"
DATA_DIR = "/opt/pgz-sport/_data/godisnjaci"
MAX_WORKERS = 5
CHUNK_SIZE = 1500 # < 2000 zbog BGE-M3 truncation
EXTRACT_PROMPT = """Ekstrahiraj iz teksta SVA imena osoba i njihove uloge.
Format strogo JSON:
{"osobe": [{"ime":"X","prezime":"Y","klub":"Z","uloga":"predsjednik|igrac|trener|tajnik|fizioterapeut|lijecnik","godina_rodenja":1990}]}
Uloge ISKLJUCIVO: predsjednik, dopredsjednik, tajnik, blagajnik, clan_uprave, igrac, sportas, glavni_trener, trener, pomocni_trener, kondicioni_trener, selektor, izbornik, team_manager, voditelj, lijecnik, fizioterapeut, kineziolog, maser, sudac, volonter
Pravila:
1. Samo HRVATSKE osobe (ne strani sportasi koji su gostovali)
2. Ako klub nije jasan -> ostavi prazan string
3. NE izmisljaj imena -> samo ona JASNO IZRAZENA u tekstu
4. Vrati VALID JSON bez markdown backtick-ova"""
def chunk_text(text, size=CHUNK_SIZE):
paragraphs = re.split(r"\n\n+", text)
chunks, cur = [], ""
for p in paragraphs:
if len(cur) + len(p) > size:
if cur:
chunks.append(cur.strip())
cur = p
else:
cur += "\n\n" + p
if cur:
chunks.append(cur.strip())
return [c for c in chunks if len(c) > 100]
async def embed_batch(texts):
async with httpx.AsyncClient(timeout=120.0) as c:
r = await c.post(EMBED_URL, json={"texts": texts})
d = r.json()
return d.get("embeddings", [])
async def extract_persons(chunk_text_str):
async with httpx.AsyncClient(timeout=120.0) as c:
r = await c.post(
VLLM_URL,
json={
"model": VLLM_MODEL,
"messages": [
{"role": "system", "content": EXTRACT_PROMPT},
{"role": "user", "content": chunk_text_str[:5500]},
],
"temperature": 0.1,
"max_tokens": 3000,
"response_format": {"type": "json_object"},
},
)
d = r.json()
try:
content = d["choices"][0]["message"]["content"]
return json.loads(content)
except Exception as e:
log.warning(f"Parse fail: {e}")
return {"osobe": []}
def fuzzy_match_klub(naziv, conn):
"""Fuzzy match klub name to pgz_sport.klubovi.id"""
try:
from rapidfuzz import fuzz
cur = conn.cursor()
cur.execute("SELECT id, naziv FROM pgz_sport.klubovi LIMIT 1000")
rows = cur.fetchall()
best_id, best_score = None, 0
for kid, kname in rows:
score = fuzz.token_set_ratio(naziv.lower(), kname.lower())
if score > best_score:
best_score = score
best_id = kid
return best_id if best_score > 75 else None
except Exception as e:
log.warning(f"Fuzzy match fail: {e}")
return None
def insert_persons(persons_data, year, conn):
"""Insert extracted persons into pgz_sport.clanovi."""
osobe = persons_data.get("osobe", [])
if not osobe:
return 0
inserted = 0
cur = conn.cursor()
for o in osobe:
ime = (o.get("ime") or "").strip()
prezime = (o.get("prezime") or "").strip()
if not ime or not prezime:
continue
klub_naziv = (o.get("klub") or "").strip()
klub_id = fuzzy_match_klub(klub_naziv, conn) if klub_naziv else None
uloga = (o.get("uloga") or "igrac").strip()
# Validate uloga
VALID_ULOGE = {
"predsjednik", "dopredsjednik", "tajnik", "blagajnik", "clan_uprave",
"igrac", "sportas", "glavni_trener", "trener", "pomocni_trener",
"kondicioni_trener", "selektor", "izbornik", "team_manager", "voditelj",
"lijecnik", "fizioterapeut", "kineziolog", "maser", "sudac", "volonter"
}
if uloga not in VALID_ULOGE:
uloga = "igrac"
profile_key = f"godisnjak:{year}:{ime}:{prezime}:{klub_naziv}"
try:
cur.execute("""
INSERT INTO pgz_sport.clanovi
(ime, prezime, uloga, klub_id, savez_izvor, metadata, kategorija)
VALUES (%s, %s, %s, %s, %s, %s, %s)
ON CONFLICT DO NOTHING
RETURNING id
""", (
ime, prezime, uloga, klub_id,
"godisnjak",
json.dumps({"year": year, "klub_naziv": klub_naziv, "key": profile_key}),
"sportas",
))
if cur.fetchone():
inserted += 1
except Exception as e:
log.warning(f"Insert fail {ime} {prezime}: {e}")
conn.rollback()
conn.commit()
return inserted
async def phase1_embed(files_layout):
"""Embed sve godisnjake u pgz_universe."""
log.info(f"Phase 1: Embed {len(files_layout)} godisnjaka")
qdrant = QdrantClient(host="localhost", port=6333)
all_chunks = []
all_meta = []
for f in files_layout:
m = re.search(r"godisnjak_(\d{4})_layout", f)
year = m.group(1) if m else "unknown"
with open(f) as fp:
text = fp.read()
chunks = chunk_text(text)
for i, c in enumerate(chunks):
all_chunks.append(c)
all_meta.append({"year": year, "chunk_idx": i, "source": f.split("/")[-1]})
log.info(f"Total chunks: {len(all_chunks)}")
points = []
BATCH = 32
for i in range(0, len(all_chunks), BATCH):
batch = all_chunks[i : i + BATCH]
try:
embeddings = await embed_batch(batch)
for j, (text, emb) in enumerate(zip(batch, embeddings)):
meta = all_meta[i + j]
pid_key = f"godisnjak:{meta['source']}:{meta['chunk_idx']}"
point_id = int(hashlib.md5(pid_key.encode()).hexdigest()[:15], 16)
points.append(
PointStruct(
id=point_id,
vector=emb,
payload={**meta, "text": text[:1500], "type": "godisnjak_pgz"},
)
)
except Exception as e:
log.warning(f"Embed batch {i} fail: {e}")
await asyncio.sleep(2)
if i % 200 == 0:
log.info(f" Embed progress: {i}/{len(all_chunks)}")
qdrant.upsert(collection_name=QDRANT_COLLECTION, points=points)
log.info(f"Phase 1 DONE: {len(points)} chunks → {QDRANT_COLLECTION}")
return len(points)
async def phase2_extract(files_layout):
"""LLM ekstrakcija osoba/uloga iz godisnjaka."""
log.info(f"Phase 2: LLM extract persons from {len(files_layout)} godisnjaka")
conn = psycopg2.connect(DSN)
conn.autocommit = False
total_inserted = 0
semaphore = asyncio.Semaphore(MAX_WORKERS)
async def process_file(f):
nonlocal total_inserted
m = re.search(r"godisnjak_(\d{4})_layout", f)
year = m.group(1) if m else "unknown"
with open(f) as fp:
text = fp.read()
chunks = chunk_text(text)
log.info(f" Year {year}: {len(chunks)} chunks")
year_inserted = 0
for i, chunk in enumerate(chunks):
async with semaphore:
try:
persons = await extract_persons(chunk)
n = insert_persons(persons, year, conn)
year_inserted += n
if i % 10 == 0:
log.info(f" {year} chunk {i}/{len(chunks)}: {n} osoba")
except Exception as e:
log.warning(f"Extract/insert fail {year} chunk {i}: {e}")
await asyncio.sleep(0.5)
total_inserted += year_inserted
log.info(f" Year {year} DONE: {year_inserted} osoba inserted")
tasks = [process_file(f) for f in files_layout]
await asyncio.gather(*tasks)
conn.close()
log.info(f"Phase 2 DONE: {total_inserted} total osoba inserted")
return total_inserted
async def main():
files_layout = sorted(glob.glob(f"{DATA_DIR}/godisnjak_*_layout.txt"))
log.info(f"Found {len(files_layout)} layout files: {[f.split('/')[-1] for f in files_layout]}")
if not files_layout:
log.error("Nema layout fajlova!")
sys.exit(1)
# Phase 1: Embed
n_embedded = await phase1_embed(files_layout)
# Phase 2: LLM extract
n_persons = await phase2_extract(files_layout)
# Final stats
conn = psycopg2.connect(DSN)
cur = conn.cursor()
cur.execute("SELECT count(*) FROM pgz_sport.clanovi WHERE savez_izvor='godisnjak'")
total_godisnjak = cur.fetchone()[0]
conn.close()
log.info(f"""
=== GODISNJAK PIPELINE COMPLETE ===
Chunks embedded: {n_embedded}
Persons extracted: {n_persons}
Total godisnjak clanovi u DB: {total_godisnjak}
""")
# Telegram
import requests as req_lib
req_lib.post(
"https://api.telegram.org/bot8535797835:AAFItT-92jzZ9NWFafLxn0dLa1_n2s-JE5Y/sendMessage",
data={"chat_id": "7969491558", "text": f"✅ Godisnjak pipeline DONE: {n_embedded} chunks, {n_persons} osoba, {total_godisnjak} total u DB"},
timeout=10,
)
if __name__ == "__main__":
asyncio.run(main())
@@ -0,0 +1,311 @@
#!/usr/bin/env python3
# ═══════════════════════════════════════════════════════════════════
# Fajl: godisnjak_pipeline.py
# Verzija: 1.0.0
# Datum: 03.05.2026
# Autor: Damir Radulić <dradulic@outlook.com>
# Lokacija: /opt/pgz-sport/scripts/godisnjak_pipeline.py
# Svrha: Embed godisnjaci PGZ u pgz_universe + LLM ekstrakcija osoba/uloga
# Zavisi od: qdrant_client, httpx, psycopg2, rapidfuzz
# Utječe na: pgz_universe (Qdrant), pgz_sport.clanovi (insert)
# ═══════════════════════════════════════════════════════════════════
"""Godisnjak PGZ embed + LLM person extraction pipeline."""
import asyncio
import glob
import hashlib
import json
import logging
import re
import sys
import time
from concurrent.futures import ThreadPoolExecutor
import httpx
import psycopg2
from psycopg2.extras import execute_batch
from qdrant_client import QdrantClient
from qdrant_client.models import PointStruct
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s [godisnjak] %(levelname)s: %(message)s",
handlers=[
logging.FileHandler("/opt/pgz-sport/logs/godisnjak_pipeline.log"),
logging.StreamHandler(),
],
)
log = logging.getLogger("godisnjak")
DSN = "host=127.0.0.1 port=6432 dbname=rinet_v3 user=rinet password=R1net2026!SecureDB#v7"
EMBED_URL = "http://localhost:9879/api/embeddings"
VLLM_URL = "http://localhost:8001/v1/chat/completions"
VLLM_MODEL = "Qwen/Qwen2.5-7B-Instruct-AWQ"
QDRANT_COLLECTION = "pgz_universe"
DATA_DIR = "/opt/pgz-sport/_data/godisnjaci"
MAX_WORKERS = 5
CHUNK_SIZE = 1500 # < 2000 zbog BGE-M3 truncation
EXTRACT_PROMPT = """Ekstrahiraj iz teksta SVA imena osoba i njihove uloge.
Format strogo JSON:
{"osobe": [{"ime":"X","prezime":"Y","klub":"Z","uloga":"predsjednik|igrac|trener|tajnik|fizioterapeut|lijecnik","godina_rodenja":1990}]}
Uloge ISKLJUCIVO: predsjednik, dopredsjednik, tajnik, blagajnik, clan_uprave, igrac, sportas, glavni_trener, trener, pomocni_trener, kondicioni_trener, selektor, izbornik, team_manager, voditelj, lijecnik, fizioterapeut, kineziolog, maser, sudac, volonter
Pravila:
1. Samo HRVATSKE osobe (ne strani sportasi koji su gostovali)
2. Ako klub nije jasan -> ostavi prazan string
3. NE izmisljaj imena -> samo ona JASNO IZRAZENA u tekstu
4. Vrati VALID JSON bez markdown backtick-ova"""
def chunk_text(text, size=CHUNK_SIZE):
paragraphs = re.split(r"\n\n+", text)
chunks, cur = [], ""
for p in paragraphs:
if len(cur) + len(p) > size:
if cur:
chunks.append(cur.strip())
cur = p
else:
cur += "\n\n" + p
if cur:
chunks.append(cur.strip())
return [c for c in chunks if len(c) > 100]
async def embed_batch(texts):
async with httpx.AsyncClient(timeout=120.0) as c:
r = await c.post(EMBED_URL, json={"texts": texts})
d = r.json()
return d.get("embeddings", [])
async def extract_persons(chunk_text_str):
async with httpx.AsyncClient(timeout=120.0) as c:
r = await c.post(
VLLM_URL,
json={
"model": VLLM_MODEL,
"messages": [
{"role": "system", "content": EXTRACT_PROMPT},
{"role": "user", "content": chunk_text_str[:5500]},
],
"temperature": 0.1,
"max_tokens": 3000,
"response_format": {"type": "json_object"},
},
)
d = r.json()
try:
content = d["choices"][0]["message"]["content"]
return json.loads(content)
except Exception as e:
log.warning(f"Parse fail: {e}")
return {"osobe": []}
def fuzzy_match_klub(naziv, conn):
"""Fuzzy match klub name to pgz_sport.klubovi.id"""
try:
from rapidfuzz import fuzz
cur = conn.cursor()
cur.execute("SELECT id, naziv FROM pgz_sport.klubovi LIMIT 1000")
rows = cur.fetchall()
best_id, best_score = None, 0
for kid, kname in rows:
score = fuzz.token_set_ratio(naziv.lower(), kname.lower())
if score > best_score:
best_score = score
best_id = kid
return best_id if best_score > 75 else None
except Exception as e:
log.warning(f"Fuzzy match fail: {e}")
return None
def insert_persons(persons_data, year, conn):
"""Insert extracted persons into pgz_sport.clanovi."""
osobe = persons_data.get("osobe", [])
if not osobe:
return 0
inserted = 0
cur = conn.cursor()
for o in osobe:
ime = (o.get("ime") or "").strip()
prezime = (o.get("prezime") or "").strip()
if not ime or not prezime:
continue
klub_naziv = (o.get("klub") or "").strip()
klub_id = fuzzy_match_klub(klub_naziv, conn) if klub_naziv else None
uloga = (o.get("uloga") or "igrac").strip()
# Validate uloga
VALID_ULOGE = {
"predsjednik", "dopredsjednik", "tajnik", "blagajnik", "clan_uprave",
"igrac", "sportas", "glavni_trener", "trener", "pomocni_trener",
"kondicioni_trener", "selektor", "izbornik", "team_manager", "voditelj",
"lijecnik", "fizioterapeut", "kineziolog", "maser", "sudac", "volonter"
}
if uloga not in VALID_ULOGE:
uloga = "igrac"
profile_key = f"godisnjak:{year}:{ime}:{prezime}:{klub_naziv}"
try:
cur.execute("""
INSERT INTO pgz_sport.clanovi
(ime, prezime, uloga, klub_id, savez_izvor, metadata, kategorija)
VALUES (%s, %s, %s, %s, %s, %s, %s)
ON CONFLICT DO NOTHING
RETURNING id
""", (
ime, prezime, uloga, klub_id,
"godisnjak",
json.dumps({"year": year, "klub_naziv": klub_naziv, "key": profile_key}),
"sportas",
))
if cur.fetchone():
inserted += 1
except Exception as e:
log.warning(f"Insert fail {ime} {prezime}: {e}")
conn.rollback()
conn.commit()
return inserted
async def phase1_embed(files_layout):
"""Embed sve godisnjake u pgz_universe."""
log.info(f"Phase 1: Embed {len(files_layout)} godisnjaka")
qdrant = QdrantClient(host="localhost", port=6333)
all_chunks = []
all_meta = []
for f in files_layout:
m = re.search(r"godisnjak_(\d{4})_layout", f)
year = m.group(1) if m else "unknown"
with open(f) as fp:
text = fp.read()
chunks = chunk_text(text)
for i, c in enumerate(chunks):
all_chunks.append(c)
all_meta.append({"year": year, "chunk_idx": i, "source": f.split("/")[-1]})
log.info(f"Total chunks: {len(all_chunks)}")
points = []
BATCH = 32
for i in range(0, len(all_chunks), BATCH):
batch = all_chunks[i : i + BATCH]
try:
embeddings = await embed_batch(batch)
for j, (text, emb) in enumerate(zip(batch, embeddings)):
meta = all_meta[i + j]
pid_key = f"godisnjak:{meta['source']}:{meta['chunk_idx']}"
point_id = int(hashlib.md5(pid_key.encode()).hexdigest()[:15], 16)
points.append(
PointStruct(
id=point_id,
vector=emb,
payload={**meta, "text": text[:1500], "type": "godisnjak_pgz"},
)
)
except Exception as e:
log.warning(f"Embed batch {i} fail: {e}")
await asyncio.sleep(2)
if i % 200 == 0:
log.info(f" Embed progress: {i}/{len(all_chunks)}")
qdrant.upsert(collection_name=QDRANT_COLLECTION, points=points)
log.info(f"Phase 1 DONE: {len(points)} chunks → {QDRANT_COLLECTION}")
return len(points)
async def phase2_extract(files_layout):
"""LLM ekstrakcija osoba/uloga iz godisnjaka."""
log.info(f"Phase 2: LLM extract persons from {len(files_layout)} godisnjaka")
conn = psycopg2.connect(DSN)
conn.autocommit = False
total_inserted = 0
semaphore = asyncio.Semaphore(MAX_WORKERS)
async def process_file(f):
nonlocal total_inserted
m = re.search(r"godisnjak_(\d{4})_layout", f)
year = m.group(1) if m else "unknown"
with open(f) as fp:
text = fp.read()
chunks = chunk_text(text)
log.info(f" Year {year}: {len(chunks)} chunks")
year_inserted = 0
for i, chunk in enumerate(chunks):
async with semaphore:
try:
persons = await extract_persons(chunk)
n = insert_persons(persons, year, conn)
year_inserted += n
if i % 10 == 0:
log.info(f" {year} chunk {i}/{len(chunks)}: {n} osoba")
except Exception as e:
log.warning(f"Extract/insert fail {year} chunk {i}: {e}")
await asyncio.sleep(0.5)
total_inserted += year_inserted
log.info(f" Year {year} DONE: {year_inserted} osoba inserted")
tasks = [process_file(f) for f in files_layout]
await asyncio.gather(*tasks)
conn.close()
log.info(f"Phase 2 DONE: {total_inserted} total osoba inserted")
return total_inserted
async def main():
files_layout = sorted(glob.glob(f"{DATA_DIR}/godisnjak_*_layout.txt"))
log.info(f"Found {len(files_layout)} layout files: {[f.split('/')[-1] for f in files_layout]}")
if not files_layout:
log.error("Nema layout fajlova!")
sys.exit(1)
# Phase 1: Embed
n_embedded = await phase1_embed(files_layout)
# Phase 2: LLM extract
n_persons = await phase2_extract(files_layout)
# Final stats
conn = psycopg2.connect(DSN)
cur = conn.cursor()
cur.execute("SELECT count(*) FROM pgz_sport.clanovi WHERE savez_izvor='godisnjak'")
total_godisnjak = cur.fetchone()[0]
conn.close()
log.info(f"""
=== GODISNJAK PIPELINE COMPLETE ===
Chunks embedded: {n_embedded}
Persons extracted: {n_persons}
Total godisnjak clanovi u DB: {total_godisnjak}
""")
# Telegram
import requests as req_lib
req_lib.post(
"https://api.telegram.org/bot8535797835:AAFItT-92jzZ9NWFafLxn0dLa1_n2s-JE5Y/sendMessage",
data={"chat_id": "7969491558", "text": f"✅ Godisnjak pipeline DONE: {n_embedded} chunks, {n_persons} osoba, {total_godisnjak} total u DB"},
timeout=10,
)
if __name__ == "__main__":
asyncio.run(main())
@@ -0,0 +1,311 @@
#!/usr/bin/env python3
# ═══════════════════════════════════════════════════════════════════
# Fajl: godisnjak_pipeline.py
# Verzija: 1.0.0
# Datum: 03.05.2026
# Autor: Damir Radulić <dradulic@outlook.com>
# Lokacija: /opt/pgz-sport/scripts/godisnjak_pipeline.py
# Svrha: Embed godisnjaci PGZ u pgz_universe + LLM ekstrakcija osoba/uloga
# Zavisi od: qdrant_client, httpx, psycopg2, rapidfuzz
# Utječe na: pgz_universe (Qdrant), pgz_sport.clanovi (insert)
# ═══════════════════════════════════════════════════════════════════
"""Godisnjak PGZ embed + LLM person extraction pipeline."""
import asyncio
import glob
import hashlib
import json
import logging
import re
import sys
import time
from concurrent.futures import ThreadPoolExecutor
import httpx
import psycopg2
from psycopg2.extras import execute_batch
from qdrant_client import QdrantClient
from qdrant_client.models import PointStruct
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s [godisnjak] %(levelname)s: %(message)s",
handlers=[
logging.FileHandler("/opt/pgz-sport/logs/godisnjak_pipeline.log"),
logging.StreamHandler(),
],
)
log = logging.getLogger("godisnjak")
DSN = "host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password=R1net2026!SecureDB#v7"
EMBED_URL = "http://localhost:9879/api/embeddings"
VLLM_URL = "http://localhost:8001/v1/chat/completions"
VLLM_MODEL = "Qwen/Qwen2.5-7B-Instruct-AWQ"
QDRANT_COLLECTION = "pgz_universe"
DATA_DIR = "/opt/pgz-sport/_data/godisnjaci"
MAX_WORKERS = 5
CHUNK_SIZE = 1500 # < 2000 zbog BGE-M3 truncation
EXTRACT_PROMPT = """Ekstrahiraj iz teksta SVA imena osoba i njihove uloge.
Format strogo JSON:
{"osobe": [{"ime":"X","prezime":"Y","klub":"Z","uloga":"predsjednik|igrac|trener|tajnik|fizioterapeut|lijecnik","godina_rodenja":1990}]}
Uloge ISKLJUCIVO: predsjednik, dopredsjednik, tajnik, blagajnik, clan_uprave, igrac, sportas, glavni_trener, trener, pomocni_trener, kondicioni_trener, selektor, izbornik, team_manager, voditelj, lijecnik, fizioterapeut, kineziolog, maser, sudac, volonter
Pravila:
1. Samo HRVATSKE osobe (ne strani sportasi koji su gostovali)
2. Ako klub nije jasan -> ostavi prazan string
3. NE izmisljaj imena -> samo ona JASNO IZRAZENA u tekstu
4. Vrati VALID JSON bez markdown backtick-ova"""
def chunk_text(text, size=CHUNK_SIZE):
paragraphs = re.split(r"\n\n+", text)
chunks, cur = [], ""
for p in paragraphs:
if len(cur) + len(p) > size:
if cur:
chunks.append(cur.strip())
cur = p
else:
cur += "\n\n" + p
if cur:
chunks.append(cur.strip())
return [c for c in chunks if len(c) > 100]
async def embed_batch(texts):
async with httpx.AsyncClient(timeout=120.0) as c:
r = await c.post(EMBED_URL, json={"texts": texts})
d = r.json()
return d.get("embeddings", [])
async def extract_persons(chunk_text_str):
async with httpx.AsyncClient(timeout=120.0) as c:
r = await c.post(
VLLM_URL,
json={
"model": VLLM_MODEL,
"messages": [
{"role": "system", "content": EXTRACT_PROMPT},
{"role": "user", "content": chunk_text_str[:5500]},
],
"temperature": 0.1,
"max_tokens": 3000,
"response_format": {"type": "json_object"},
},
)
d = r.json()
try:
content = d["choices"][0]["message"]["content"]
return json.loads(content)
except Exception as e:
log.warning(f"Parse fail: {e}")
return {"osobe": []}
def fuzzy_match_klub(naziv, conn):
"""Fuzzy match klub name to pgz_sport.klubovi.id"""
try:
from rapidfuzz import fuzz
cur = conn.cursor()
cur.execute("SELECT id, naziv FROM pgz_sport.klubovi LIMIT 1000")
rows = cur.fetchall()
best_id, best_score = None, 0
for kid, kname in rows:
score = fuzz.token_set_ratio(naziv.lower(), kname.lower())
if score > best_score:
best_score = score
best_id = kid
return best_id if best_score > 75 else None
except Exception as e:
log.warning(f"Fuzzy match fail: {e}")
return None
def insert_persons(persons_data, year, conn):
"""Insert extracted persons into pgz_sport.clanovi."""
osobe = persons_data.get("osobe", [])
if not osobe:
return 0
inserted = 0
cur = conn.cursor()
for o in osobe:
ime = (o.get("ime") or "").strip()
prezime = (o.get("prezime") or "").strip()
if not ime or not prezime:
continue
klub_naziv = (o.get("klub") or "").strip()
klub_id = fuzzy_match_klub(klub_naziv, conn) if klub_naziv else None
uloga = (o.get("uloga") or "igrac").strip()
# Validate uloga
VALID_ULOGE = {
"predsjednik", "dopredsjednik", "tajnik", "blagajnik", "clan_uprave",
"igrac", "sportas", "glavni_trener", "trener", "pomocni_trener",
"kondicioni_trener", "selektor", "izbornik", "team_manager", "voditelj",
"lijecnik", "fizioterapeut", "kineziolog", "maser", "sudac", "volonter"
}
if uloga not in VALID_ULOGE:
uloga = "igrac"
profile_key = f"godisnjak:{year}:{ime}:{prezime}:{klub_naziv}"
try:
cur.execute("""
INSERT INTO pgz_sport.clanovi
(ime, prezime, uloga, klub_id, savez_izvor, metadata, kategorija)
VALUES (%s, %s, %s, %s, %s, %s, %s)
ON CONFLICT DO NOTHING
RETURNING id
""", (
ime, prezime, uloga, klub_id,
"godisnjak",
json.dumps({"year": year, "klub_naziv": klub_naziv, "key": profile_key}),
"sportas",
))
if cur.fetchone():
inserted += 1
except Exception as e:
log.warning(f"Insert fail {ime} {prezime}: {e}")
conn.rollback()
conn.commit()
return inserted
async def phase1_embed(files_layout):
"""Embed sve godisnjake u pgz_universe."""
log.info(f"Phase 1: Embed {len(files_layout)} godisnjaka")
qdrant = QdrantClient(host="localhost", port=6333)
all_chunks = []
all_meta = []
for f in files_layout:
m = re.search(r"godisnjak_(\d{4})_layout", f)
year = m.group(1) if m else "unknown"
with open(f) as fp:
text = fp.read()
chunks = chunk_text(text)
for i, c in enumerate(chunks):
all_chunks.append(c)
all_meta.append({"year": year, "chunk_idx": i, "source": f.split("/")[-1]})
log.info(f"Total chunks: {len(all_chunks)}")
points = []
BATCH = 32
for i in range(0, len(all_chunks), BATCH):
batch = all_chunks[i : i + BATCH]
try:
embeddings = await embed_batch(batch)
for j, (text, emb) in enumerate(zip(batch, embeddings)):
meta = all_meta[i + j]
pid_key = f"godisnjak:{meta['source']}:{meta['chunk_idx']}"
point_id = int(hashlib.md5(pid_key.encode()).hexdigest()[:15], 16)
points.append(
PointStruct(
id=point_id,
vector=emb,
payload={**meta, "text": text[:1500], "type": "godisnjak_pgz"},
)
)
except Exception as e:
log.warning(f"Embed batch {i} fail: {e}")
await asyncio.sleep(2)
if i % 200 == 0:
log.info(f" Embed progress: {i}/{len(all_chunks)}")
qdrant.upsert(collection_name=QDRANT_COLLECTION, points=points)
log.info(f"Phase 1 DONE: {len(points)} chunks → {QDRANT_COLLECTION}")
return len(points)
async def phase2_extract(files_layout):
"""LLM ekstrakcija osoba/uloga iz godisnjaka."""
log.info(f"Phase 2: LLM extract persons from {len(files_layout)} godisnjaka")
conn = psycopg2.connect(DSN)
conn.autocommit = False
total_inserted = 0
semaphore = asyncio.Semaphore(MAX_WORKERS)
async def process_file(f):
nonlocal total_inserted
m = re.search(r"godisnjak_(\d{4})_layout", f)
year = m.group(1) if m else "unknown"
with open(f) as fp:
text = fp.read()
chunks = chunk_text(text)
log.info(f" Year {year}: {len(chunks)} chunks")
year_inserted = 0
for i, chunk in enumerate(chunks):
async with semaphore:
try:
persons = await extract_persons(chunk)
n = insert_persons(persons, year, conn)
year_inserted += n
if i % 10 == 0:
log.info(f" {year} chunk {i}/{len(chunks)}: {n} osoba")
except Exception as e:
log.warning(f"Extract/insert fail {year} chunk {i}: {e}")
await asyncio.sleep(0.5)
total_inserted += year_inserted
log.info(f" Year {year} DONE: {year_inserted} osoba inserted")
tasks = [process_file(f) for f in files_layout]
await asyncio.gather(*tasks)
conn.close()
log.info(f"Phase 2 DONE: {total_inserted} total osoba inserted")
return total_inserted
async def main():
files_layout = sorted(glob.glob(f"{DATA_DIR}/godisnjak_*_layout.txt"))
log.info(f"Found {len(files_layout)} layout files: {[f.split('/')[-1] for f in files_layout]}")
if not files_layout:
log.error("Nema layout fajlova!")
sys.exit(1)
# Phase 1: Embed
n_embedded = await phase1_embed(files_layout)
# Phase 2: LLM extract
n_persons = await phase2_extract(files_layout)
# Final stats
conn = psycopg2.connect(DSN)
cur = conn.cursor()
cur.execute("SELECT count(*) FROM pgz_sport.clanovi WHERE savez_izvor='godisnjak'")
total_godisnjak = cur.fetchone()[0]
conn.close()
log.info(f"""
=== GODISNJAK PIPELINE COMPLETE ===
Chunks embedded: {n_embedded}
Persons extracted: {n_persons}
Total godisnjak clanovi u DB: {total_godisnjak}
""")
# Telegram
import requests as req_lib
req_lib.post(
"https://api.telegram.org/bot8535797835:AAFItT-92jzZ9NWFafLxn0dLa1_n2s-JE5Y/sendMessage",
data={"chat_id": "7969491558", "text": f"✅ Godisnjak pipeline DONE: {n_embedded} chunks, {n_persons} osoba, {total_godisnjak} total u DB"},
timeout=10,
)
if __name__ == "__main__":
asyncio.run(main())