#!/usr/bin/env python3 """LLM mining v6 - multi-godina, persist to clan_nagrada, name validation.""" import json, re, psycopg2, requests, time, os, sys DB = dict(host='localhost', port=5432, dbname='rinet_v3', user='rinet', password='R1net2026!SecureDB#v7') conn = psycopg2.connect(**DB); conn.autocommit = True cu = conn.cursor() OLLAMA = "http://localhost:11434/api/generate" MODEL = "qwen2.5:7b" PROMPT_TPL = """Iz teksta godišnjaka sporta ekstraktiraj sve KONKRETNE rezultate sportaša. Vrati ISKLJUČIVO valjani JSON niz. Format jednog objekta: {"sportas":"Ime Prezime","klub":"AK Kvarner","sport":"atletika","godina":2024,"natjecanje":"PH","disciplina":"100m","plasman":1,"medalja":"zlato"} KAKO ČITATI: - "1. mjesto" → plasman=1, medalja="zlato" - "2. mjesto" → plasman=2, medalja="srebro" - "3. mjesto" → plasman=3, medalja="bronca" - "4./5./6./7./8. mjesto" → plasman=N, medalja=null - Ako tekst eksplicitno kaže "zlato/srebro/bronca", uzmi to PRAVILA: - Vrati [] ako nema konkretnih rezultata - Sport mora biti pravi naziv: atletika, plivanje, košarka, rukomet, vaterpolo, nogomet, jedrenje, biciklizam, šah, taekwondo, karate, gimnastika, tenis, judo, streljaštvo, boćanje, kuglanje, veslanje, stolni tenis, plesovi itd. - "klub" - ekstraktiraj iz teksta ako je naveden - BEZ markdowna, BEZ ```, samo JSON niz - Ne izmišljaj imena - ako nije sigurno, preskoči TEKST: ___TXT___""" KW = re.compile(r'(zlat|srebr|bronc|prvenstv|prvak|EP\b|SP\b|olimp|medalj|svjetsk|europsk|\d\.\s*mjest)', re.IGNORECASE) NAMES_RE = re.compile(r'\b([A-ZČĆĐŠŽ][a-zčćđšžćžšđč]+\s+[A-ZČĆĐŠŽ][a-zčćđšžćžšđč]+)\b') # Pre-filter junk - non-name patterns that LLM can pick up NAME_BLACKLIST = re.compile(r'^(Sport Psychology|Endurance|HNS|HOO|HRS|HKS|MSO|HSS|SSP|TVS|HTS|RNK|ZSU|DVD|Liga HSS|HRP|Vikend|Posto|Visin|Razin|Klub|Sportski|Nogomet|Adresa|Email|Web|Tel|Mob|Fax)', re.IGNORECASE) def find_chunks(sadrzaj, chunk_size=2200, overlap=200): chunks = [] i = 0 while i < len(sadrzaj): chunk = sadrzaj[i:i+chunk_size] if KW.search(chunk) and len(NAMES_RE.findall(chunk)) >= 2: chunks.append((i, chunk)) i += chunk_size - overlap return chunks def call_llm(text): try: rsp = requests.post(OLLAMA, json={ "model": MODEL, "prompt": PROMPT_TPL.replace("___TXT___", text), "stream": False, "options": {"temperature": 0.0, "num_predict": 3000, "num_ctx": 4096} }, timeout=180) if rsp.status_code != 200: return None out = rsp.json().get('response', '').strip() out = re.sub(r'^```(?:json)?\s*', '', out) out = re.sub(r'\s*```$', '', out) m = re.search(r'\[\s*[\{\]]', out) if not m: return [] t = out[m.start():] try: return json.loads(t) except json.JSONDecodeError: for end in range(len(t)-1, max(len(t)-3000, 0), -50): if t[end] == ']': try: return json.loads(t[:end+1]) except: pass return [] except Exception: return None def insert_fact(f, godina_godisnjaka, doc_id): """Insert into clan_nagrada with dedup.""" sportas = f.get('sportas','').strip() if not sportas or len(sportas) < 4: return None if NAME_BLACKLIST.match(sportas): return 'blacklisted' if not f.get('plasman'): return None plasman = f.get('plasman') medalja = f.get('medalja') if plasman not in (1,2,3,4,5,6,7,8): return None # Auto-fix medalja if plasman == 1 and not medalja: medalja = 'zlato' elif plasman == 2 and not medalja: medalja = 'srebro' elif plasman == 3 and not medalja: medalja = 'bronca' god = f.get('godina') or godina_godisnjaka natj = (f.get('natjecanje') or 'unknown')[:200] disc = (f.get('disciplina') or 'unknown')[:200] # Try to match clan_id by ime+prezime parts = sportas.split(None, 1) clan_id = None if len(parts) == 2: cu.execute("""SELECT id FROM pgz_sport.clanovi WHERE LOWER(ime||' '||prezime) = LOWER(%s) OR LOWER(prezime||' '||ime) = LOWER(%s) LIMIT 1""", (sportas, sportas)) rec = cu.fetchone() if rec: clan_id = rec[0] # Try to match klub_id klub_naziv = (f.get('klub') or '').strip() klub_id = None if klub_naziv and len(klub_naziv) >= 4: cu.execute("""SELECT id FROM pgz_sport.klubovi WHERE LOWER(naziv) = LOWER(%s) OR LOWER(naziv) LIKE LOWER(%s) ORDER BY CASE WHEN aktivan THEN 0 ELSE 1 END, CASE WHEN region='PGŽ' THEN 0 ELSE 1 END, id ASC LIMIT 1""", (klub_naziv, f"%{klub_naziv}%")) rec = cu.fetchone() if rec: klub_id = rec[0] try: cu.execute("""INSERT INTO pgz_sport.clan_nagrada (clan_id, ime_prezime, klub_id, klub_naziv, godina, natjecanje, disciplina, plasman, medalja, source, source_url, last_updated) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, 'llm_godisnjak', %s, now()) ON CONFLICT ON CONSTRAINT clan_nagrada_uniq DO UPDATE SET clan_id = COALESCE(pgz_sport.clan_nagrada.clan_id, EXCLUDED.clan_id), klub_id = COALESCE(pgz_sport.clan_nagrada.klub_id, EXCLUDED.klub_id), last_updated = now()""", (clan_id, sportas, klub_id, klub_naziv or None, god, natj, disc, plasman, medalja, f"dokument:{doc_id}")) return 'inserted' except Exception as e: return f'err:{e}' GODINA_LIST = [int(x) for x in (sys.argv[1].split(',') if len(sys.argv) > 1 else ['2024'])] LIMIT = int(os.environ.get('LLM_LIMIT', 50)) total_inserted = 0 total_blacklisted = 0 total_facts = 0 t_global = time.time() for godina in GODINA_LIST: cu.execute("SELECT id, sadrzaj FROM pgz_sport.dokumenti WHERE vrsta='godisnjak' AND godina=%s LIMIT 1", (godina,)) rec = cu.fetchone() if not rec: print(f"=== Nema {godina} ==="); continue did, sadrzaj = rec print(f"\n=== Godišnjak {godina} (id={did}, {len(sadrzaj):,} chars) ===") chunks = find_chunks(sadrzaj) print(f"Chunks: {len(chunks)}, processing {min(LIMIT, len(chunks))}") god_inserted = 0 god_facts = 0 t_god = time.time() for idx, (off, chunk) in enumerate(chunks[:LIMIT]): t0 = time.time() facts = call_llm(chunk) el = time.time() - t0 if facts is None: print(f" [{idx+1}/{LIMIT}] ERR ({el:.1f}s)", flush=True) continue if not facts: continue for f in facts: if not isinstance(f, dict): continue res = insert_fact(f, godina, did) god_facts += 1 if res == 'inserted': god_inserted += 1 elif res == 'blacklisted': total_blacklisted += 1 if facts: print(f" [{idx+1}/{LIMIT}] {el:.1f}s {len(facts)} parsed", flush=True) print(f"\n=== {godina}: {god_facts} facts, {god_inserted} inserted, {time.time()-t_god:.0f}s ===") total_facts += god_facts total_inserted += god_inserted print(f"\n=== TOTAL ({time.time()-t_global:.0f}s): {total_facts} parsed, {total_inserted} inserted, {total_blacklisted} blacklisted ===") # Stats cu.execute("""SELECT godina, count(*) FROM pgz_sport.clan_nagrada WHERE source='llm_godisnjak' GROUP BY godina ORDER BY godina""") print("\n=== LLM clan_nagrada by year ===") for r in cu.fetchall(): print(f" {r[0]}: {r[1]}") cu.execute("""SELECT count(*) FILTER (WHERE clan_id IS NOT NULL), count(*) FILTER (WHERE clan_id IS NULL), count(*) FILTER (WHERE klub_id IS NOT NULL), count(*) FROM pgz_sport.clan_nagrada WHERE source='llm_godisnjak'""") r = cu.fetchone() print(f"\nLinking stats: {r[0]} clan_id linked, {r[1]} unlinked, {r[2]} klub_id linked of {r[3]} total") # Audit cu.execute("""INSERT INTO pgz_sport.audit_feed (table_name, action, source, source_url, details) VALUES ('clan_nagrada', 'llm_mining_v6_db', 'qwen2.5:7b', NULL, %s::jsonb)""", (json.dumps({"godine": GODINA_LIST, "facts": total_facts, "inserted": total_inserted, "chunk_limit": LIMIT}),))