204 lines
8.4 KiB
Python
Executable File
204 lines
8.4 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""LLM mining v6 - multi-godina, persist to clan_nagrada, name validation."""
|
|
import json, re, psycopg2, requests, time, os, sys
|
|
|
|
DB = dict(host='localhost', port=5432, dbname='rinet_v3',
|
|
user='rinet', password=os.environ["DB_PASSWORD"])
|
|
conn = psycopg2.connect(**DB); conn.autocommit = True
|
|
cu = conn.cursor()
|
|
|
|
OLLAMA = "http://localhost:11434/api/generate"
|
|
MODEL = "qwen2.5:7b"
|
|
|
|
PROMPT_TPL = """Iz teksta godišnjaka sporta ekstraktiraj sve KONKRETNE rezultate sportaša. Vrati ISKLJUČIVO valjani JSON niz.
|
|
|
|
Format jednog objekta:
|
|
{"sportas":"Ime Prezime","klub":"AK Kvarner","sport":"atletika","godina":2024,"natjecanje":"PH","disciplina":"100m","plasman":1,"medalja":"zlato"}
|
|
|
|
KAKO ČITATI:
|
|
- "1. mjesto" → plasman=1, medalja="zlato"
|
|
- "2. mjesto" → plasman=2, medalja="srebro"
|
|
- "3. mjesto" → plasman=3, medalja="bronca"
|
|
- "4./5./6./7./8. mjesto" → plasman=N, medalja=null
|
|
- Ako tekst eksplicitno kaže "zlato/srebro/bronca", uzmi to
|
|
|
|
PRAVILA:
|
|
- Vrati [] ako nema konkretnih rezultata
|
|
- Sport mora biti pravi naziv: atletika, plivanje, košarka, rukomet, vaterpolo, nogomet, jedrenje, biciklizam, šah, taekwondo, karate, gimnastika, tenis, judo, streljaštvo, boćanje, kuglanje, veslanje, stolni tenis, plesovi itd.
|
|
- "klub" - ekstraktiraj iz teksta ako je naveden
|
|
- BEZ markdowna, BEZ ```, samo JSON niz
|
|
- Ne izmišljaj imena - ako nije sigurno, preskoči
|
|
|
|
TEKST:
|
|
___TXT___"""
|
|
|
|
KW = re.compile(r'(zlat|srebr|bronc|prvenstv|prvak|EP\b|SP\b|olimp|medalj|svjetsk|europsk|\d\.\s*mjest)', re.IGNORECASE)
|
|
NAMES_RE = re.compile(r'\b([A-ZČĆĐŠŽ][a-zčćđšžćžšđč]+\s+[A-ZČĆĐŠŽ][a-zčćđšžćžšđč]+)\b')
|
|
# Pre-filter junk - non-name patterns that LLM can pick up
|
|
NAME_BLACKLIST = re.compile(r'^(Sport Psychology|Endurance|HNS|HOO|HRS|HKS|MSO|HSS|SSP|TVS|HTS|RNK|ZSU|DVD|Liga HSS|HRP|Vikend|Posto|Visin|Razin|Klub|Sportski|Nogomet|Adresa|Email|Web|Tel|Mob|Fax)', re.IGNORECASE)
|
|
|
|
def find_chunks(sadrzaj, chunk_size=2200, overlap=200):
|
|
chunks = []
|
|
i = 0
|
|
while i < len(sadrzaj):
|
|
chunk = sadrzaj[i:i+chunk_size]
|
|
if KW.search(chunk) and len(NAMES_RE.findall(chunk)) >= 2:
|
|
chunks.append((i, chunk))
|
|
i += chunk_size - overlap
|
|
return chunks
|
|
|
|
def call_llm(text):
|
|
try:
|
|
rsp = requests.post(OLLAMA, json={
|
|
"model": MODEL,
|
|
"prompt": PROMPT_TPL.replace("___TXT___", text),
|
|
"stream": False,
|
|
"options": {"temperature": 0.0, "num_predict": 3000, "num_ctx": 4096}
|
|
}, timeout=180)
|
|
if rsp.status_code != 200: return None
|
|
out = rsp.json().get('response', '').strip()
|
|
out = re.sub(r'^```(?:json)?\s*', '', out)
|
|
out = re.sub(r'\s*```$', '', out)
|
|
m = re.search(r'\[\s*[\{\]]', out)
|
|
if not m: return []
|
|
t = out[m.start():]
|
|
try: return json.loads(t)
|
|
except json.JSONDecodeError:
|
|
for end in range(len(t)-1, max(len(t)-3000, 0), -50):
|
|
if t[end] == ']':
|
|
try: return json.loads(t[:end+1])
|
|
except: pass
|
|
return []
|
|
except Exception:
|
|
return None
|
|
|
|
def insert_fact(f, godina_godisnjaka, doc_id):
|
|
"""Insert into clan_nagrada with dedup."""
|
|
sportas = f.get('sportas','').strip()
|
|
if not sportas or len(sportas) < 4: return None
|
|
if NAME_BLACKLIST.match(sportas): return 'blacklisted'
|
|
if not f.get('plasman'): return None
|
|
|
|
plasman = f.get('plasman')
|
|
medalja = f.get('medalja')
|
|
if plasman not in (1,2,3,4,5,6,7,8): return None
|
|
|
|
# Auto-fix medalja
|
|
if plasman == 1 and not medalja: medalja = 'zlato'
|
|
elif plasman == 2 and not medalja: medalja = 'srebro'
|
|
elif plasman == 3 and not medalja: medalja = 'bronca'
|
|
|
|
god = f.get('godina') or godina_godisnjaka
|
|
natj = (f.get('natjecanje') or 'unknown')[:200]
|
|
disc = (f.get('disciplina') or 'unknown')[:200]
|
|
|
|
# Try to match clan_id by ime+prezime
|
|
parts = sportas.split(None, 1)
|
|
clan_id = None
|
|
if len(parts) == 2:
|
|
cu.execute("""SELECT id FROM pgz_sport.clanovi
|
|
WHERE LOWER(ime||' '||prezime) = LOWER(%s)
|
|
OR LOWER(prezime||' '||ime) = LOWER(%s)
|
|
LIMIT 1""", (sportas, sportas))
|
|
rec = cu.fetchone()
|
|
if rec: clan_id = rec[0]
|
|
|
|
# Try to match klub_id
|
|
klub_naziv = (f.get('klub') or '').strip()
|
|
klub_id = None
|
|
if klub_naziv and len(klub_naziv) >= 4:
|
|
cu.execute("""SELECT id FROM pgz_sport.klubovi
|
|
WHERE LOWER(naziv) = LOWER(%s)
|
|
OR LOWER(naziv) LIKE LOWER(%s)
|
|
ORDER BY
|
|
CASE WHEN aktivan THEN 0 ELSE 1 END,
|
|
CASE WHEN region='PGŽ' THEN 0 ELSE 1 END,
|
|
id ASC LIMIT 1""",
|
|
(klub_naziv, f"%{klub_naziv}%"))
|
|
rec = cu.fetchone()
|
|
if rec: klub_id = rec[0]
|
|
|
|
try:
|
|
cu.execute("""INSERT INTO pgz_sport.clan_nagrada
|
|
(clan_id, ime_prezime, klub_id, klub_naziv, godina, natjecanje,
|
|
disciplina, plasman, medalja, source, source_url, last_updated)
|
|
VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, 'llm_godisnjak', %s, now())
|
|
ON CONFLICT ON CONSTRAINT clan_nagrada_uniq DO UPDATE SET
|
|
clan_id = COALESCE(pgz_sport.clan_nagrada.clan_id, EXCLUDED.clan_id),
|
|
klub_id = COALESCE(pgz_sport.clan_nagrada.klub_id, EXCLUDED.klub_id),
|
|
last_updated = now()""",
|
|
(clan_id, sportas, klub_id, klub_naziv or None, god, natj, disc, plasman, medalja,
|
|
f"dokument:{doc_id}"))
|
|
return 'inserted'
|
|
except Exception as e:
|
|
return f'err:{e}'
|
|
|
|
GODINA_LIST = [int(x) for x in (sys.argv[1].split(',') if len(sys.argv) > 1 else ['2024'])]
|
|
LIMIT = int(os.environ.get('LLM_LIMIT', 50))
|
|
|
|
total_inserted = 0
|
|
total_blacklisted = 0
|
|
total_facts = 0
|
|
t_global = time.time()
|
|
|
|
for godina in GODINA_LIST:
|
|
cu.execute("SELECT id, sadrzaj FROM pgz_sport.dokumenti WHERE vrsta='godisnjak' AND godina=%s LIMIT 1", (godina,))
|
|
rec = cu.fetchone()
|
|
if not rec:
|
|
print(f"=== Nema {godina} ==="); continue
|
|
did, sadrzaj = rec
|
|
|
|
print(f"\n=== Godišnjak {godina} (id={did}, {len(sadrzaj):,} chars) ===")
|
|
chunks = find_chunks(sadrzaj)
|
|
print(f"Chunks: {len(chunks)}, processing {min(LIMIT, len(chunks))}")
|
|
|
|
god_inserted = 0
|
|
god_facts = 0
|
|
t_god = time.time()
|
|
|
|
for idx, (off, chunk) in enumerate(chunks[:LIMIT]):
|
|
t0 = time.time()
|
|
facts = call_llm(chunk)
|
|
el = time.time() - t0
|
|
if facts is None:
|
|
print(f" [{idx+1}/{LIMIT}] ERR ({el:.1f}s)", flush=True)
|
|
continue
|
|
if not facts: continue
|
|
|
|
for f in facts:
|
|
if not isinstance(f, dict): continue
|
|
res = insert_fact(f, godina, did)
|
|
god_facts += 1
|
|
if res == 'inserted': god_inserted += 1
|
|
elif res == 'blacklisted': total_blacklisted += 1
|
|
|
|
if facts:
|
|
print(f" [{idx+1}/{LIMIT}] {el:.1f}s {len(facts)} parsed", flush=True)
|
|
|
|
print(f"\n=== {godina}: {god_facts} facts, {god_inserted} inserted, {time.time()-t_god:.0f}s ===")
|
|
total_facts += god_facts
|
|
total_inserted += god_inserted
|
|
|
|
print(f"\n=== TOTAL ({time.time()-t_global:.0f}s): {total_facts} parsed, {total_inserted} inserted, {total_blacklisted} blacklisted ===")
|
|
|
|
# Stats
|
|
cu.execute("""SELECT godina, count(*) FROM pgz_sport.clan_nagrada
|
|
WHERE source='llm_godisnjak' GROUP BY godina ORDER BY godina""")
|
|
print("\n=== LLM clan_nagrada by year ===")
|
|
for r in cu.fetchall():
|
|
print(f" {r[0]}: {r[1]}")
|
|
|
|
cu.execute("""SELECT count(*) FILTER (WHERE clan_id IS NOT NULL),
|
|
count(*) FILTER (WHERE clan_id IS NULL),
|
|
count(*) FILTER (WHERE klub_id IS NOT NULL),
|
|
count(*)
|
|
FROM pgz_sport.clan_nagrada WHERE source='llm_godisnjak'""")
|
|
r = cu.fetchone()
|
|
print(f"\nLinking stats: {r[0]} clan_id linked, {r[1]} unlinked, {r[2]} klub_id linked of {r[3]} total")
|
|
|
|
# Audit
|
|
cu.execute("""INSERT INTO pgz_sport.audit_feed
|
|
(table_name, action, source, source_url, details)
|
|
VALUES ('clan_nagrada', 'llm_mining_v6_db', 'qwen2.5:7b', NULL, %s::jsonb)""",
|
|
(json.dumps({"godine": GODINA_LIST, "facts": total_facts, "inserted": total_inserted, "chunk_limit": LIMIT}),))
|