Files
pgz-sport/scrapers/llm_godisnjak_mining.py

207 lines
8.5 KiB
Python
Executable File

#!/usr/bin/env python3
from dotenv import load_dotenv
load_dotenv('/opt/rinet-gpu/.env.master')
# auto-added by patch_scrapers_with_dotenv.sh
"""LLM mining v6 - multi-godina, persist to clan_nagrada, name validation."""
import json, re, psycopg2, requests, time, os, sys
DB = dict(host='localhost', port=5432, dbname='rinet_v3',
user='rinet', password=os.environ["DB_PASSWORD"])
conn = psycopg2.connect(**DB); conn.autocommit = True
cu = conn.cursor()
OLLAMA = "http://localhost:11434/api/generate"
MODEL = "qwen2.5:7b"
PROMPT_TPL = """Iz teksta godišnjaka sporta ekstraktiraj sve KONKRETNE rezultate sportaša. Vrati ISKLJUČIVO valjani JSON niz.
Format jednog objekta:
{"sportas":"Ime Prezime","klub":"AK Kvarner","sport":"atletika","godina":2024,"natjecanje":"PH","disciplina":"100m","plasman":1,"medalja":"zlato"}
KAKO ČITATI:
- "1. mjesto" → plasman=1, medalja="zlato"
- "2. mjesto" → plasman=2, medalja="srebro"
- "3. mjesto" → plasman=3, medalja="bronca"
- "4./5./6./7./8. mjesto" → plasman=N, medalja=null
- Ako tekst eksplicitno kaže "zlato/srebro/bronca", uzmi to
PRAVILA:
- Vrati [] ako nema konkretnih rezultata
- Sport mora biti pravi naziv: atletika, plivanje, košarka, rukomet, vaterpolo, nogomet, jedrenje, biciklizam, šah, taekwondo, karate, gimnastika, tenis, judo, streljaštvo, boćanje, kuglanje, veslanje, stolni tenis, plesovi itd.
- "klub" - ekstraktiraj iz teksta ako je naveden
- BEZ markdowna, BEZ ```, samo JSON niz
- Ne izmišljaj imena - ako nije sigurno, preskoči
TEKST:
___TXT___"""
KW = re.compile(r'(zlat|srebr|bronc|prvenstv|prvak|EP\b|SP\b|olimp|medalj|svjetsk|europsk|\d\.\s*mjest)', re.IGNORECASE)
NAMES_RE = re.compile(r'\b([A-ZČĆĐŠŽ][a-zčćđšžćžšđč]+\s+[A-ZČĆĐŠŽ][a-zčćđšžćžšđč]+)\b')
# Pre-filter junk - non-name patterns that LLM can pick up
NAME_BLACKLIST = re.compile(r'^(Sport Psychology|Endurance|HNS|HOO|HRS|HKS|MSO|HSS|SSP|TVS|HTS|RNK|ZSU|DVD|Liga HSS|HRP|Vikend|Posto|Visin|Razin|Klub|Sportski|Nogomet|Adresa|Email|Web|Tel|Mob|Fax)', re.IGNORECASE)
def find_chunks(sadrzaj, chunk_size=2200, overlap=200):
chunks = []
i = 0
while i < len(sadrzaj):
chunk = sadrzaj[i:i+chunk_size]
if KW.search(chunk) and len(NAMES_RE.findall(chunk)) >= 2:
chunks.append((i, chunk))
i += chunk_size - overlap
return chunks
def call_llm(text):
try:
rsp = requests.post(OLLAMA, json={
"model": MODEL,
"prompt": PROMPT_TPL.replace("___TXT___", text),
"stream": False,
"options": {"temperature": 0.0, "num_predict": 3000, "num_ctx": 4096}
}, timeout=180)
if rsp.status_code != 200: return None
out = rsp.json().get('response', '').strip()
out = re.sub(r'^```(?:json)?\s*', '', out)
out = re.sub(r'\s*```$', '', out)
m = re.search(r'\[\s*[\{\]]', out)
if not m: return []
t = out[m.start():]
try: return json.loads(t)
except json.JSONDecodeError:
for end in range(len(t)-1, max(len(t)-3000, 0), -50):
if t[end] == ']':
try: return json.loads(t[:end+1])
except: pass
return []
except Exception:
return None
def insert_fact(f, godina_godisnjaka, doc_id):
"""Insert into clan_nagrada with dedup."""
sportas = f.get('sportas','').strip()
if not sportas or len(sportas) < 4: return None
if NAME_BLACKLIST.match(sportas): return 'blacklisted'
if not f.get('plasman'): return None
plasman = f.get('plasman')
medalja = f.get('medalja')
if plasman not in (1,2,3,4,5,6,7,8): return None
# Auto-fix medalja
if plasman == 1 and not medalja: medalja = 'zlato'
elif plasman == 2 and not medalja: medalja = 'srebro'
elif plasman == 3 and not medalja: medalja = 'bronca'
god = f.get('godina') or godina_godisnjaka
natj = (f.get('natjecanje') or 'unknown')[:200]
disc = (f.get('disciplina') or 'unknown')[:200]
# Try to match clan_id by ime+prezime
parts = sportas.split(None, 1)
clan_id = None
if len(parts) == 2:
cu.execute("""SELECT id FROM pgz_sport.clanovi
WHERE LOWER(ime||' '||prezime) = LOWER(%s)
OR LOWER(prezime||' '||ime) = LOWER(%s)
LIMIT 1""", (sportas, sportas))
rec = cu.fetchone()
if rec: clan_id = rec[0]
# Try to match klub_id
klub_naziv = (f.get('klub') or '').strip()
klub_id = None
if klub_naziv and len(klub_naziv) >= 4:
cu.execute("""SELECT id FROM pgz_sport.klubovi
WHERE LOWER(naziv) = LOWER(%s)
OR LOWER(naziv) LIKE LOWER(%s)
ORDER BY
CASE WHEN aktivan THEN 0 ELSE 1 END,
CASE WHEN region='PGŽ' THEN 0 ELSE 1 END,
id ASC LIMIT 1""",
(klub_naziv, f"%{klub_naziv}%"))
rec = cu.fetchone()
if rec: klub_id = rec[0]
try:
cu.execute("""INSERT INTO pgz_sport.clan_nagrada
(clan_id, ime_prezime, klub_id, klub_naziv, godina, natjecanje,
disciplina, plasman, medalja, source, source_url, last_updated)
VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, 'llm_godisnjak', %s, now())
ON CONFLICT ON CONSTRAINT clan_nagrada_uniq DO UPDATE SET
clan_id = COALESCE(pgz_sport.clan_nagrada.clan_id, EXCLUDED.clan_id),
klub_id = COALESCE(pgz_sport.clan_nagrada.klub_id, EXCLUDED.klub_id),
last_updated = now()""",
(clan_id, sportas, klub_id, klub_naziv or None, god, natj, disc, plasman, medalja,
f"dokument:{doc_id}"))
return 'inserted'
except Exception as e:
return f'err:{e}'
GODINA_LIST = [int(x) for x in (sys.argv[1].split(',') if len(sys.argv) > 1 else ['2024'])]
LIMIT = int(os.environ.get('LLM_LIMIT', 50))
total_inserted = 0
total_blacklisted = 0
total_facts = 0
t_global = time.time()
for godina in GODINA_LIST:
cu.execute("SELECT id, sadrzaj FROM pgz_sport.dokumenti WHERE vrsta='godisnjak' AND godina=%s LIMIT 1", (godina,))
rec = cu.fetchone()
if not rec:
print(f"=== Nema {godina} ==="); continue
did, sadrzaj = rec
print(f"\n=== Godišnjak {godina} (id={did}, {len(sadrzaj):,} chars) ===")
chunks = find_chunks(sadrzaj)
print(f"Chunks: {len(chunks)}, processing {min(LIMIT, len(chunks))}")
god_inserted = 0
god_facts = 0
t_god = time.time()
for idx, (off, chunk) in enumerate(chunks[:LIMIT]):
t0 = time.time()
facts = call_llm(chunk)
el = time.time() - t0
if facts is None:
print(f" [{idx+1}/{LIMIT}] ERR ({el:.1f}s)", flush=True)
continue
if not facts: continue
for f in facts:
if not isinstance(f, dict): continue
res = insert_fact(f, godina, did)
god_facts += 1
if res == 'inserted': god_inserted += 1
elif res == 'blacklisted': total_blacklisted += 1
if facts:
print(f" [{idx+1}/{LIMIT}] {el:.1f}s {len(facts)} parsed", flush=True)
print(f"\n=== {godina}: {god_facts} facts, {god_inserted} inserted, {time.time()-t_god:.0f}s ===")
total_facts += god_facts
total_inserted += god_inserted
print(f"\n=== TOTAL ({time.time()-t_global:.0f}s): {total_facts} parsed, {total_inserted} inserted, {total_blacklisted} blacklisted ===")
# Stats
cu.execute("""SELECT godina, count(*) FROM pgz_sport.clan_nagrada
WHERE source='llm_godisnjak' GROUP BY godina ORDER BY godina""")
print("\n=== LLM clan_nagrada by year ===")
for r in cu.fetchall():
print(f" {r[0]}: {r[1]}")
cu.execute("""SELECT count(*) FILTER (WHERE clan_id IS NOT NULL),
count(*) FILTER (WHERE clan_id IS NULL),
count(*) FILTER (WHERE klub_id IS NOT NULL),
count(*)
FROM pgz_sport.clan_nagrada WHERE source='llm_godisnjak'""")
r = cu.fetchone()
print(f"\nLinking stats: {r[0]} clan_id linked, {r[1]} unlinked, {r[2]} klub_id linked of {r[3]} total")
# Audit
cu.execute("""INSERT INTO pgz_sport.audit_feed
(table_name, action, source, source_url, details)
VALUES ('clan_nagrada', 'llm_mining_v6_db', 'qwen2.5:7b', NULL, %s::jsonb)""",
(json.dumps({"godine": GODINA_LIST, "facts": total_facts, "inserted": total_inserted, "chunk_limit": LIMIT}),))