#!/usr/bin/env python3
from dotenv import load_dotenv
load_dotenv('/opt/rinet-gpu/.env.master')  # auto-added by patch_scrapers_with_dotenv.sh
import os
# ═══════════════════════════════════════════════════════════════════
# File:       godisnjak_extract.py
# Version:    1.0.0
# Date:       03.05.2026
# Author:     Damir Radulić
# Location:   /opt/pgz-sport/scripts/godisnjak_extract.py
# Purpose:    LLM extraction of persons/roles from the PGZ yearbooks (Phase 2)
# Depends on: httpx, psycopg2, rapidfuzz
# Affects:    pgz_sport.clanovi
# ═══════════════════════════════════════════════════════════════════
import asyncio
import glob
import json
import logging
import re
import sys
import time

import httpx
import psycopg2
from psycopg2.extras import execute_batch
from rapidfuzz import fuzz

logging.basicConfig(
    level=logging.INFO,
    format="[%(asctime)s] %(message)s",
    datefmt="%H:%M:%S",
    handlers=[
        logging.FileHandler("/opt/pgz-sport/logs/godisnjak_extract.log"),
        logging.StreamHandler(),
    ],
)
log = logging.getLogger("extract")

# load_dotenv() above must run before this line: DB_PASSWORD is read at import time.
DSN = f"host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password={os.environ['DB_PASSWORD']}"
VLLM_URL = "http://localhost:8001/v1/chat/completions"
VLLM_MODEL = "Qwen/Qwen2.5-7B-Instruct-AWQ"
DATA_DIR = "/opt/pgz-sport/_data/godisnjaci"
MAX_WORKERS = 4
CHUNK_SIZE = 1400

MIN_CHUNK_LEN = 80        # chunks of this length or shorter are dropped by chunk_text()
MAX_PROMPT_CHARS = 5000   # hard cap on the text sent to the model per request
KLUB_SCORE_CUTOFF = 72    # a fuzzy club match must score strictly above this

# SECURITY: these credentials were hard-coded in the script. The environment now
# takes precedence; the literals stay only as fallbacks so behaviour is unchanged.
# Rotate the bot token, put it in the .env file, then delete the fallback values.
TELEGRAM_BOT_TOKEN = os.environ.get(
    "TELEGRAM_BOT_TOKEN", "8535797835:AAFItT-92jzZ9NWFafLxn0dLa1_n2s-JE5Y"
)
TELEGRAM_CHAT_ID = os.environ.get("TELEGRAM_CHAT_ID", "7969491558")

# NOTE(review): the line breaks inside this prompt were reconstructed from a
# whitespace-collapsed copy of the file; the wording itself is unchanged.
EXTRACT_PROMPT = """Ekstrahiraj iz teksta SVA imena osoba i njihove uloge.
Vrati ISKLJUCIVO valid JSON (bez markdown, bez objasnjenja):
{"osobe": [{"ime":"X","prezime":"Y","klub":"Z","uloga":"igrac","godina_rodenja":1990}]}

Dozvoljene uloge: predsjednik, dopredsjednik, tajnik, blagajnik, clan_uprave,
igrac, sportas, glavni_trener, trener, pomocni_trener, kondicioni_trener,
selektor, izbornik, team_manager, voditelj, lijecnik, fizioterapeut,
kineziolog, maser, sudac, volonter

Pravila:
1. Samo HRVATSKA osobe s punim imenom i prezimenom
2. Ako klub nije eksplicitno naveden -> klub=""
3. NE izmisljaj - samo jasno navedena imena u tekstu
4. Godina rodenja samo ako eksplicitno u tekstu, inace izostavi"""

INSERT_SQL = """
    INSERT INTO pgz_sport.clanovi
        (ime, prezime, uloga, klub_id, savez_izvor, metadata, kategorija)
    VALUES (%s,%s,%s,%s,%s,%s,%s)
    ON CONFLICT DO NOTHING
"""


def chunk_text(text, size=CHUNK_SIZE):
    """Split *text* into paragraph-aligned chunks of roughly *size* characters.

    Paragraphs are separated by one or more blank lines and are never split:
    a single paragraph longer than *size* becomes one oversized chunk. The
    size check ignores the "\\n\\n" separators, so a chunk may slightly
    exceed *size*.

    Returns:
        The stripped chunks, excluding those of MIN_CHUNK_LEN characters
        or fewer.
    """
    paragraphs = re.split(r'\n\n+', text)
    chunks, cur = [], ""
    for p in paragraphs:
        if len(cur) + len(p) > size:
            # Adding this paragraph would overflow: flush what we have so far.
            if cur:
                chunks.append(cur.strip())
            cur = p
        else:
            cur += "\n\n" + p
    if cur:
        chunks.append(cur.strip())
    return [c for c in chunks if len(c) > MIN_CHUNK_LEN]


# Preload klub cache
def load_klub_cache(conn):
    """Return (id, naziv) rows for clubs that are active or have no active flag.

    Args:
        conn: An open psycopg2 connection.

    Returns:
        A list of (id, naziv) tuples, capped at 2000 rows by the query.
    """
    with conn.cursor() as cur:
        cur.execute(
            "SELECT id, naziv FROM pgz_sport.klubovi "
            "WHERE aktivan=true OR aktivan IS NULL LIMIT 2000"
        )
        return cur.fetchall()


def fuzzy_klub(naziv, cache):
    """Return the id of the cached club whose name best matches *naziv*.

    Args:
        naziv: Club name as extracted from the text.
        cache: Iterable of (id, naziv) pairs, e.g. from load_klub_cache().

    Returns:
        The id of the best match when its token_set_ratio score is strictly
        above KLUB_SCORE_CUTOFF, otherwise None. None is also returned when
        *naziv* is empty or shorter than 3 characters. On equal scores the
        earliest cache entry wins.
    """
    if not naziv or len(naziv) < 3:
        return None
    needle = naziv.lower()
    best_id, best_score = None, 0
    for kid, kname in cache:
        if not kname:
            # A NULL/empty club name in the DB cannot match anything.
            continue
        score = fuzz.token_set_ratio(needle, kname.lower())
        if score > best_score:
            best_score, best_id = score, kid
    return best_id if best_score > KLUB_SCORE_CUTOFF else None


async def extract_persons(chunk_text_str, semaphore):
    """Ask the vLLM endpoint to extract persons from one text chunk.

    Args:
        chunk_text_str: Chunk text; only the first MAX_PROMPT_CHARS characters
            are sent.
        semaphore: asyncio.Semaphore limiting the number of concurrent requests.

    Returns:
        The decoded JSON object from the model reply. Any failure (transport
        error, HTTP error status, unexpected response shape, invalid JSON,
        non-object JSON) is logged and yields {"osobe": []}, so this coroutine
        does not raise for ordinary errors.
    """
    async with semaphore:
        try:
            async with httpx.AsyncClient(timeout=90.0) as c:
                r = await c.post(VLLM_URL, json={
                    "model": VLLM_MODEL,
                    "messages": [
                        {"role": "system", "content": EXTRACT_PROMPT},
                        {"role": "user", "content": chunk_text_str[:MAX_PROMPT_CHARS]},
                    ],
                    "temperature": 0.05,
                    "max_tokens": 2500,
                    "response_format": {"type": "json_object"},
                })
                r.raise_for_status()
                d = r.json()
            content = d["choices"][0]["message"]["content"]
            parsed = json.loads(content)
        except Exception as e:
            # Deliberate best-effort boundary: one bad chunk must not abort the
            # run. Logged at WARNING because the logger runs at INFO, where a
            # DEBUG message would never be seen.
            log.warning(f"Extract fail: {e}")
            return {"osobe": []}
        if not isinstance(parsed, dict):
            log.warning(f"Extract fail: expected a JSON object, got {type(parsed).__name__}")
            return {"osobe": []}
        return parsed


VALID_ULOGE = {
    "predsjednik", "dopredsjednik", "tajnik", "blagajnik", "clan_uprave",
    "igrac", "sportas", "glavni_trener", "trener", "pomocni_trener", "kondicioni_trener",
    "selektor", "izbornik", "team_manager", "voditelj", "lijecnik", "fizioterapeut",
    "kineziolog", "maser", "sudac", "volonter",
}


def _as_text(value):
    """Return *value* stripped if it is a string, otherwise an empty string."""
    return value.strip() if isinstance(value, str) else ""


def _build_rows(results, year, klub_cache):
    """Turn raw extraction results into parameter tuples for INSERT_SQL.

    Entries that are malformed, lack a first or last name of at least two
    characters, contain digits in the name, or have a combined name longer
    than 60 characters are dropped. Missing or unknown roles fall back to
    "igrac". The model's "godina_rodenja" field is not stored.
    """
    rows = []
    for res in results:
        osobe = res.get("osobe") if isinstance(res, dict) else None
        if not isinstance(osobe, list):
            continue
        for o in osobe:
            if not isinstance(o, dict):
                continue
            ime = _as_text(o.get("ime"))
            prezime = _as_text(o.get("prezime"))
            if len(ime) < 2 or len(prezime) < 2:
                continue
            # Basic sanity — no numbers, no too-long names
            if re.search(r'\d', ime + prezime) or len(ime + prezime) > 60:
                continue
            uloga = _as_text(o.get("uloga")).lower()
            if uloga not in VALID_ULOGE:
                uloga = "igrac"
            klub_naziv = _as_text(o.get("klub"))
            klub_id = fuzzy_klub(klub_naziv, klub_cache)
            rows.append((
                ime, prezime, uloga, klub_id, "godisnjak",
                json.dumps({"year": year, "klub_naziv": klub_naziv}),
                "sportas",
            ))
    return rows


def _insert_rows(cur, rows):
    """Insert *rows* one at a time and return how many were actually inserted.

    Rows are executed individually so that cur.rowcount tells whether each
    one was inserted or skipped by ON CONFLICT DO NOTHING.
    NOTE(review): ON CONFLICT only deduplicates if pgz_sport.clanovi has a
    matching unique constraint — confirm one exists.
    """
    inserted = 0
    for row in rows:
        try:
            cur.execute(INSERT_SQL, row)
        except psycopg2.Error as e:
            log.warning(f"Insert skip: {e}")
            continue
        if cur.rowcount:
            inserted += 1
    return inserted


def _notify_telegram(text):
    """Send *text* to the configured Telegram chat; failures are only logged."""
    import requests as rq

    try:
        resp = rq.post(
            f"https://api.telegram.org/bot{TELEGRAM_BOT_TOKEN}/sendMessage",
            data={"chat_id": TELEGRAM_CHAT_ID, "text": text},
            timeout=10,
        )
        resp.raise_for_status()
    except rq.RequestException as e:
        # Log only the exception type: the message contains the request URL,
        # which embeds the bot token.
        log.warning(f"Telegram notification failed: {type(e).__name__}")


async def main():
    """Run the extraction over every yearbook file and store the persons.

    Steps: back up pgz_sport.clanovi, load the club cache, send each file's
    chunks to the LLM (at most MAX_WORKERS requests at a time), insert the
    extracted persons, then log a summary and send a Telegram notification.
    """
    total_inserted = 0
    conn = psycopg2.connect(DSN)
    try:
        conn.autocommit = True
        with conn.cursor() as cur:
            # Backup
            # NOTE(review): the INSERT is unconditional, so every run appends
            # another full copy of pgz_sport.clanovi to the backup table —
            # confirm that this is intended.
            cur.execute("""CREATE TABLE IF NOT EXISTS pgz_sport.clanovi_pre_godisnjak_backup AS
                SELECT * FROM pgz_sport.clanovi WHERE 1=0""")
            cur.execute("""INSERT INTO pgz_sport.clanovi_pre_godisnjak_backup
                SELECT * FROM pgz_sport.clanovi""")
            log.info("Backup created")

            klub_cache = load_klub_cache(conn)
            log.info(f"Klub cache: {len(klub_cache)} klubova")

            files = sorted(glob.glob(f"{DATA_DIR}/godisnjak_*_layout.txt"))
            log.info(f"Files: {len(files)}")

            semaphore = asyncio.Semaphore(MAX_WORKERS)

            for f in files:
                m = re.search(r'godisnjak_(\d{4})_layout', f)
                if not m:
                    # The glob is wider than the regex; without a year the
                    # metadata cannot be built, so skip instead of crashing.
                    log.warning(f"Skipping {f}: no 4-digit year in file name")
                    continue
                year = int(m.group(1))

                with open(f, encoding="utf-8") as fp:
                    text = fp.read()

                chunks = chunk_text(text)
                log.info(f"Year {year}: {len(chunks)} chunks")

                tasks = [extract_persons(c, semaphore) for c in chunks]
                results = await asyncio.gather(*tasks)

                rows = _build_rows(results, year, klub_cache)
                year_ins = _insert_rows(cur, rows)

                total_inserted += year_ins
                log.info(f"  {year}: {year_ins} osoba inserted (running total: {total_inserted})")

            cur.execute("SELECT count(*) FROM pgz_sport.clanovi WHERE savez_izvor='godisnjak'")
            final = cur.fetchone()[0]
    finally:
        conn.close()

    log.info(
        "\n=== EXTRACT DONE ===\n"
        f"Inserted this run: {total_inserted}\n"
        f"Total godisnjak u DB: {final}\n"
    )

    _notify_telegram(
        f"✅ Godisnjak LLM extract DONE: {total_inserted} novih osoba, {final} total"
    )


if __name__ == "__main__":
    asyncio.run(main())