PGŽ Sport Platform — Round 1+2 baseline (sport2.html + API)
This commit is contained in:
@@ -0,0 +1,209 @@
|
||||
#!/usr/bin/env python3
|
||||
# ═══════════════════════════════════════════════════════════════════
|
||||
# Fajl: godisnjak_extract.py
|
||||
# Verzija: 1.0.0
|
||||
# Datum: 03.05.2026
|
||||
# Autor: Damir Radulić <dradulic@outlook.com>
|
||||
# Lokacija: /opt/pgz-sport/scripts/godisnjak_extract.py
|
||||
# Svrha: LLM ekstrakcija osoba/uloga iz godisnjaka PGZ (Phase 2)
|
||||
# Zavisi od: httpx, psycopg2, rapidfuzz
|
||||
# Utječe na: pgz_sport.clanovi
|
||||
# ═══════════════════════════════════════════════════════════════════
|
||||
import asyncio
import glob
import json
import logging
import os
import re
import sys
import time

import httpx
import psycopg2
from psycopg2.extras import execute_batch
from rapidfuzz import fuzz
|
||||
|
||||
# Send every record to both a persistent log file and the console, prefixed
# with a wall-clock timestamp.
_LOG_HANDLERS = [
    logging.FileHandler("/opt/pgz-sport/logs/godisnjak_extract.log"),
    logging.StreamHandler(),
]
logging.basicConfig(
    handlers=_LOG_HANDLERS,
    datefmt="%H:%M:%S",
    format="[%(asctime)s] %(message)s",
    level=logging.INFO,
)

# Logger shared by every function in this script.
log = logging.getLogger("extract")
|
||||
|
||||
# Connection settings and tuning knobs for the extraction run.
#
# SECURITY: the database password below is committed in plain text. Set the
# PGZ_SPORT_DSN environment variable to override it; once deployments do so,
# rotate the password and delete the literal fallback.
DSN = os.environ.get(
    "PGZ_SPORT_DSN",
    "host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password=R1net2026!SecureDB#v7",
)
VLLM_URL = "http://localhost:8001/v1/chat/completions"
VLLM_MODEL = "Qwen/Qwen2.5-7B-Instruct-AWQ"
DATA_DIR = "/opt/pgz-sport/_data/godisnjaci"
MAX_WORKERS = 4  # size of the semaphore that bounds concurrent LLM requests
CHUNK_SIZE = 1400  # default target chunk length, in characters, for chunk_text()
|
||||
|
||||
EXTRACT_PROMPT = """Ekstrahiraj iz teksta SVA imena osoba i njihove uloge.
|
||||
Vrati ISKLJUCIVO valid JSON (bez markdown, bez objasnjenja):
|
||||
{"osobe": [{"ime":"X","prezime":"Y","klub":"Z","uloga":"igrac","godina_rodenja":1990}]}
|
||||
|
||||
Dozvoljene uloge: predsjednik, dopredsjednik, tajnik, blagajnik, clan_uprave,
|
||||
igrac, sportas, glavni_trener, trener, pomocni_trener, kondicioni_trener,
|
||||
selektor, izbornik, team_manager, voditelj, lijecnik, fizioterapeut,
|
||||
kineziolog, maser, sudac, volonter
|
||||
|
||||
Pravila:
|
||||
1. Samo HRVATSKA osobe s punim imenom i prezimenom
|
||||
2. Ako klub nije eksplicitno naveden -> klub=""
|
||||
3. NE izmisljaj - samo jasno navedena imena u tekstu
|
||||
4. Godina rodenja samo ako eksplicitno u tekstu, inace izostavi"""
|
||||
|
||||
|
||||
def _split_oversize(paragraph, size):
    """Yield pieces of *paragraph* that are each at most *size* characters.

    A paragraph that already fits is yielded unchanged. Otherwise it is cut at
    line breaks where possible; a single line longer than *size* is sliced.
    """
    if size < 1 or len(paragraph) <= size:
        yield paragraph
        return
    cur = ""
    for line in paragraph.split("\n"):
        # A single line that cannot fit is hard-sliced into size-long pieces.
        while len(line) > size:
            if cur:
                yield cur
                cur = ""
            yield line[:size]
            line = line[size:]
        if cur and len(cur) + 1 + len(line) > size:
            yield cur
            cur = line
        else:
            cur = f"{cur}\n{line}" if cur else line
    if cur:
        yield cur


def chunk_text(text, size=CHUNK_SIZE):
    """Split *text* into chunks of roughly *size* characters on paragraph breaks.

    Paragraphs (separated by one or more blank lines) are packed greedily into
    a chunk until adding the next one would exceed *size*. A paragraph that is
    itself longer than *size* is first split at line breaks, so it no longer
    produces one oversize chunk that a downstream length cap would truncate.

    Args:
        text: Full document text.
        size: Target chunk length in characters.

    Returns:
        List of stripped chunks; chunks of 80 characters or fewer are dropped.
    """
    chunks, cur = [], ""
    for paragraph in re.split(r'\n\n+', text):
        for piece in _split_oversize(paragraph, size):
            if len(cur) + len(piece) > size:
                if cur:
                    chunks.append(cur.strip())
                cur = piece
            else:
                cur += "\n\n" + piece
    if cur:
        chunks.append(cur.strip())
    # Very short chunks are discarded.
    return [c for c in chunks if len(c) > 80]
|
||||
|
||||
|
||||
def load_klub_cache(conn):
    """Load the club list used for fuzzy matching.

    Args:
        conn: Open psycopg2 connection.

    Returns:
        List of ``(id, naziv)`` tuples for clubs whose ``aktivan`` flag is
        true or NULL, capped at 2000 rows.
    """
    # NOTE(review): LIMIT without ORDER BY returns an arbitrary subset if the
    # table ever holds more than 2000 matching clubs.
    # The context manager closes the cursor; the original left it open.
    with conn.cursor() as cur:
        cur.execute("SELECT id, naziv FROM pgz_sport.klubovi WHERE aktivan=true OR aktivan IS NULL LIMIT 2000")
        return cur.fetchall()
|
||||
|
||||
|
||||
def fuzzy_klub(naziv, cache):
    """Return the id of the cached club whose name best matches *naziv*.

    Args:
        naziv: Club name as extracted from the text; may be empty or None.
        cache: Iterable of ``(id, name)`` pairs, e.g. from load_klub_cache().

    Returns:
        The id of the best match when its ``fuzz.token_set_ratio`` score is
        above 72, otherwise None. Names shorter than 3 characters never match.
    """
    if not naziv or len(naziv) < 3:
        return None
    needle = naziv.lower()  # loop-invariant, computed once
    best_id, best_score = None, 0
    for kid, kname in cache:
        if not kname:
            # A NULL/empty club name cannot match and would crash on .lower().
            continue
        score = fuzz.token_set_ratio(needle, kname.lower())
        if score > best_score:
            best_score, best_id = score, kid
    return best_id if best_score > 72 else None
|
||||
|
||||
|
||||
async def extract_persons(chunk_text_str, semaphore):
    """Ask the LLM to extract persons and roles from one text chunk.

    Posts the first 5000 characters of the chunk, together with
    EXTRACT_PROMPT as the system message, to the chat-completions endpoint at
    VLLM_URL and parses the reply as JSON.

    Args:
        chunk_text_str: Text of the chunk.
        semaphore: asyncio semaphore bounding the number of concurrent requests.

    Returns:
        A dict whose ``"osobe"`` value is a list of dicts. Any failure
        (transport error, HTTP error status, malformed or unexpectedly shaped
        reply) is logged as a warning and yields ``{"osobe": []}``.
    """
    async with semaphore:
        try:
            async with httpx.AsyncClient(timeout=90.0) as c:
                r = await c.post(VLLM_URL, json={
                    "model": VLLM_MODEL,
                    "messages": [
                        {"role": "system", "content": EXTRACT_PROMPT},
                        {"role": "user", "content": chunk_text_str[:5000]},
                    ],
                    "temperature": 0.05,
                    "max_tokens": 2500,
                    "response_format": {"type": "json_object"},
                })
                # Fail with a clear HTTP error rather than a KeyError below.
                r.raise_for_status()
                content = r.json()["choices"][0]["message"]["content"]
                parsed = json.loads(content)
        except Exception as e:
            # Deliberately broad: one bad chunk must not abort the whole run.
            # Logged at WARNING because the logger is configured at INFO and a
            # DEBUG message would make dropped chunks invisible.
            log.warning("Extract fail: %s", e)
            return {"osobe": []}

    # The model output is untrusted: guarantee the shape callers iterate over.
    osobe = parsed.get("osobe") if isinstance(parsed, dict) else None
    if not isinstance(osobe, list):
        log.warning("Extract fail: unexpected JSON shape from model")
        return {"osobe": []}
    return {**parsed, "osobe": [o for o in osobe if isinstance(o, dict)]}
|
||||
|
||||
|
||||
# Closed set of role labels accepted from the model output. The same labels
# are listed in EXTRACT_PROMPT.
VALID_ULOGE = {
    # club officials
    "predsjednik",
    "dopredsjednik",
    "tajnik",
    "blagajnik",
    "clan_uprave",
    # athletes
    "igrac",
    "sportas",
    # coaching staff
    "glavni_trener",
    "trener",
    "pomocni_trener",
    "kondicioni_trener",
    "selektor",
    "izbornik",
    # team management
    "team_manager",
    "voditelj",
    # medical and support staff
    "lijecnik",
    "fizioterapeut",
    "kineziolog",
    "maser",
    # other
    "sudac",
    "volonter",
}
|
||||
|
||||
|
||||
def _clean_str(value):
    """Return *value* stripped of surrounding whitespace, or "" if it is not a string."""
    return value.strip() if isinstance(value, str) else ""


def _person_row(o, year, klub_cache):
    """Validate one extracted person and build its INSERT parameter tuple.

    Returns None when the entry is not a dict, a name part is missing or
    shorter than 2 characters, the name contains a digit, or the combined
    name is longer than 60 characters.
    """
    if not isinstance(o, dict):
        return None
    ime = _clean_str(o.get("ime"))
    prezime = _clean_str(o.get("prezime"))
    if len(ime) < 2 or len(prezime) < 2:
        return None
    # Basic sanity: no digits, no overly long names.
    if re.search(r'\d', ime + prezime) or len(ime + prezime) > 60:
        return None

    # Missing or unknown roles fall back to "igrac".
    uloga = _clean_str(o.get("uloga")).lower() or "igrac"
    if uloga not in VALID_ULOGE:
        uloga = "igrac"

    klub_naziv = _clean_str(o.get("klub"))
    klub_id = fuzzy_klub(klub_naziv, klub_cache)

    return (
        ime, prezime, uloga, klub_id,
        "godisnjak",
        json.dumps({"year": year, "klub_naziv": klub_naziv}),
        "sportas",
    )


async def main():
    """Run the yearbook extraction end to end.

    Steps: snapshot ``pgz_sport.clanovi`` into a backup table (first run
    only), load the club cache, then for every ``godisnjak_*_layout.txt`` file
    in DATA_DIR chunk the text, extract persons through the LLM, and insert
    them into ``pgz_sport.clanovi`` with ``savez_izvor='godisnjak'``. Finally
    log the totals and send a Telegram notification.
    """
    conn = psycopg2.connect(DSN)
    try:
        # Autocommit: each INSERT is its own transaction, so one failed row
        # does not abort the rows that follow.
        conn.autocommit = True
        cur = conn.cursor()

        # Backup. The snapshot is taken only while the backup table is empty;
        # re-appending on every run would duplicate rows and mix in records
        # this script inserted on earlier runs.
        cur.execute("""CREATE TABLE IF NOT EXISTS pgz_sport.clanovi_pre_godisnjak_backup
            AS SELECT * FROM pgz_sport.clanovi WHERE 1=0""")
        cur.execute("SELECT EXISTS (SELECT 1 FROM pgz_sport.clanovi_pre_godisnjak_backup)")
        if cur.fetchone()[0]:
            log.info("Backup already present, left untouched")
        else:
            cur.execute("""INSERT INTO pgz_sport.clanovi_pre_godisnjak_backup
                SELECT * FROM pgz_sport.clanovi""")
            log.info("Backup created")

        klub_cache = load_klub_cache(conn)
        log.info(f"Klub cache: {len(klub_cache)} klubova")

        files = sorted(glob.glob(f"{DATA_DIR}/godisnjak_*_layout.txt"))
        log.info(f"Files: {len(files)}")

        semaphore = asyncio.Semaphore(MAX_WORKERS)
        total_inserted = 0

        for f in files:
            m = re.search(r'godisnjak_(\d{4})_layout', f)
            # The glob is looser than this regex, so a file may carry no
            # parsable year; store null instead of crashing on int("?").
            year = int(m.group(1)) if m else None
            label = m.group(1) if m else "?"
            if year is None:
                log.warning("No 4-digit year in file name %s; storing year=null", f)

            # Explicit encoding so Croatian diacritics do not depend on locale.
            with open(f, encoding="utf-8") as fp:
                text = fp.read()

            chunks = chunk_text(text)
            log.info(f"Year {label}: {len(chunks)} chunks")

            results = await asyncio.gather(
                *[extract_persons(c, semaphore) for c in chunks]
            )

            # Dict keys give order-preserving de-duplication of identical rows
            # (the same person is often extracted from several chunks).
            rows = {}
            for res in results:
                osobe = res.get("osobe") if isinstance(res, dict) else None
                if not isinstance(osobe, list):
                    continue
                for o in osobe:
                    row = _person_row(o, year, klub_cache)
                    if row is not None:
                        rows[row] = None

            # NOTE(review): ON CONFLICT DO NOTHING only skips duplicates if the
            # table has a matching unique constraint; confirm against the schema.
            year_ins = 0
            for row in rows:
                try:
                    cur.execute("""
                        INSERT INTO pgz_sport.clanovi
                        (ime, prezime, uloga, klub_id, savez_izvor, metadata, kategorija)
                        VALUES (%s,%s,%s,%s,%s,%s,%s)
                        ON CONFLICT DO NOTHING
                    """, row)
                except psycopg2.Error as e:
                    # WARNING, not DEBUG: the logger runs at INFO and failed
                    # inserts must be visible.
                    log.warning("Insert skip: %s", e)
                else:
                    if cur.rowcount:
                        year_ins += 1

            total_inserted += year_ins
            log.info(f" {label}: {year_ins} osoba inserted (running total: {total_inserted})")

        cur.execute("SELECT count(*) FROM pgz_sport.clanovi WHERE savez_izvor='godisnjak'")
        final = cur.fetchone()[0]
    finally:
        # Close the connection even when a file or query fails mid-run.
        conn.close()

    log.info(
        "\n=== EXTRACT DONE ===\n"
        f"Inserted this run: {total_inserted}\n"
        f"Total godisnjak u DB: {final}\n"
    )

    # SECURITY: the bot token below is committed in plain text. Set
    # PGZ_TELEGRAM_BOT_TOKEN to override it; once deployments do so, revoke
    # this token and delete the literal fallback.
    bot_token = os.environ.get(
        "PGZ_TELEGRAM_BOT_TOKEN",
        "8535797835:AAFItT-92jzZ9NWFafLxn0dLa1_n2s-JE5Y",
    )
    # httpx is already a dependency of this script and is async-native, so the
    # notification neither blocks the event loop nor needs `requests`.
    try:
        async with httpx.AsyncClient(timeout=10) as client:
            resp = await client.post(
                f"https://api.telegram.org/bot{bot_token}/sendMessage",
                data={"chat_id": "7969491558",
                      "text": f"✅ Godisnjak LLM extract DONE: {total_inserted} novih osoba, {final} total"},
            )
            resp.raise_for_status()
    except httpx.HTTPError as e:
        # Log only the error type: the exception text contains the request
        # URL, which embeds the bot token.
        log.warning("Telegram notification failed: %s", type(e).__name__)
|
||||
|
||||
|
||||
# Script entry point: run the async pipeline on a fresh event loop when the
# file is executed directly (not when it is imported).
if __name__ == "__main__":
    asyncio.run(main())
|
||||
Reference in New Issue
Block a user