124 lines
5.3 KiB
Python
124 lines
5.3 KiB
Python
#!/usr/bin/env python3
|
|
# ═══════════════════════════════════════════════════════════════════
# File: trener_extractor.py | v1.0.0 | 05.05.2026
# Location: /opt/pgz-sport/scrapers/trener_extractor.py
# Purpose: Extracts trainer (coach) names from pgz_sport.dokumenti.sadrzaj
#          and stores the aggregated findings in dabi.knowledge
# - Regex patterns for "trener <name>", "glavni trener", "izbornik"
# - Cross-link with pgz_sport.osobe (if present), inserts new
#   (NOTE: not implemented in this version; main() only lists the
#   columns of pgz_sport.treneri)
# - Confidence derived from the number of mentions
#   (0.7 + 0.05 per mention, capped at 0.95)
# ═══════════════════════════════════════════════════════════════════
|
|
"""Trener extractor — pull names from documents."""
|
|
import os, re, time, json, hashlib
|
|
from collections import Counter
|
|
import psycopg2
|
|
from psycopg2.extras import execute_batch, RealDictCursor
|
|
|
|
# libpq keyword/value connection string. The password comes from the
# DB_PASSWORD environment variable; a missing variable raises KeyError at
# import time, so the script fails fast.
# The value is wrapped in single quotes with backslashes and single quotes
# escaped, as libpq requires, so that a password containing spaces, quotes
# or backslashes does not corrupt the connection string.
_DB_PASSWORD = os.environ["DB_PASSWORD"].replace("\\", "\\\\").replace("'", "\\'")
DSN = f"host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password='{_DB_PASSWORD}'"
|
|
|
|
# Regex patterns — Croatian morphology (trener, treneru, trenerom, izbornik).
#
# A candidate name is 2–3 capitalised words. The initial-letter class
# includes the Croatian capitals Č, Ć, Ž, Š, Đ so that names such as
# "Željko Čačić" are matched; a plain [A-Z] would skip them.
_NAME_UPPER = "A-ZČĆŽŠĐ"
_NAME_LOWER = "a-zčćžšđ"
_NAME = (
    rf"[{_NAME_UPPER}][{_NAME_LOWER}]+"
    rf"(?:\s+[{_NAME_UPPER}][{_NAME_LOWER}]+){{1,2}}"
)

# Each pattern captures the candidate name in group 1.
PATTERNS = [
    # "glavni trener <name>" / "trener <name>" (any case suffix on "trener")
    re.compile(rf"(?:glavni\s+)?trener[a-z]*\s+({_NAME})", re.U),
    # "izbornik <name>" (any case suffix on "izbornik")
    re.compile(rf"izbornik[a-z]*\s+({_NAME})", re.U),
    # "<name>, glavni trener" / "<name> trener"
    re.compile(rf"({_NAME}),?\s+(?:glavni\s+)?trener", re.U),
    # "<name>, izbornik"
    re.compile(rf"({_NAME}),?\s+izbornik", re.U),
    # "šef/voditelj stručnog stožera <name>" (with or without diacritics)
    re.compile(
        rf"\b(?:šef|voditelj)\s+stru(?:č|c)nog\s+sto(?:ž|z)era\s+({_NAME})",
        re.U,
    ),
]
|
|
|
|
# Filters — capitalised words that show up next to "trener"/"izbornik"
# but are not part of a personal name. A candidate containing any of
# these tokens is rejected.
EXCLUDED_TOKENS = {
    # country / institution
    "Hrvatska",
    "Hrvatske",
    "Republika",
    # club / season wording
    "Klubu",
    "Kluba",
    "Sezone",
    # competitions
    "Prvenstva",
    "Prvenstvo",
    "Liga",
    "Lige",
    # county abbreviation
    "PGŽ",
    "PG",
}
|
|
|
|
|
|
def extract_trainers_from_text(text, patterns=None, excluded=None):
    """Count candidate trainer names found in *text*.

    Every pattern is run over the whole text and group 1 of each match is
    taken as a candidate name. A mention matched by several patterns is
    counted once per pattern.

    Args:
        text: Document text to scan. ``None``, empty or shorter than 50
            characters yields an empty result.
        patterns: Iterable of compiled regexes whose group 1 captures the
            name. Defaults to the module-level ``PATTERNS``.
        excluded: Container of tokens that disqualify a candidate.
            Defaults to the module-level ``EXCLUDED_TOKENS``.

    Returns:
        A ``Counter`` mapping each accepted full name to its match count.
    """
    found = Counter()
    if not text or len(text) < 50:
        return found

    if patterns is None:
        patterns = PATTERNS
    if excluded is None:
        excluded = EXCLUDED_TOKENS

    for pat in patterns:
        for m in pat.finditer(text):
            tokens = m.group(1).split()
            # A plausible name has 2 to 4 words.
            if not 2 <= len(tokens) <= 4:
                continue
            # Reject known non-name words and very short tokens.
            if any(t in excluded or len(t) < 3 for t in tokens):
                continue
            # The patterns join words with \s+, so a match may contain
            # newlines or repeated spaces. Re-join with single spaces so
            # "Ivan\nHorvat" and "Ivan Horvat" are counted as one name.
            found[" ".join(tokens)] += 1
    return found
|
|
|
|
|
|
def main():
    """Scan stored documents for trainer names and record the findings.

    Steps:
      1. Load every row of ``pgz_sport.dokumenti`` whose ``sadrzaj`` is
         longer than 200 characters.
      2. Run ``extract_trainers_from_text`` on each document and aggregate
         the mention count and the set of document ids per name.
      3. Print the 20 most-mentioned names.
      4. For names mentioned at least twice (among the top 500), insert a
         fact row into ``dabi.knowledge``. Rows are de-duplicated on
         ``data_hash``, the MD5 of the fact text; the text includes the
         mention count, so a changed count produces a new row.
      5. Print the column list of ``pgz_sport.treneri`` (informational
         only; nothing is written to that table).

    Returns:
        dict with ``trainers_found`` (number of distinct names) and
        ``facts_inserted`` (rows actually inserted in this run).

    Raises:
        psycopg2.Error: if connecting or one of the SELECT queries fails.
            Errors on individual INSERTs are printed and skipped.
    """
    conn = psycopg2.connect(DSN)
    try:
        # Autocommit: every INSERT stands alone, so one failed row does not
        # abort the others.
        conn.autocommit = True
        with conn.cursor(cursor_factory=RealDictCursor) as cur, conn.cursor() as cur2:
            # Source: pgz_sport.dokumenti (sadrzaj column, exposed as "tekst")
            cur.execute("""
                SELECT id, COALESCE(title, '') AS title, COALESCE(sadrzaj, '') AS tekst
                FROM pgz_sport.dokumenti
                WHERE COALESCE(sadrzaj, '') != '' AND length(sadrzaj) > 200
            """)
            docs = cur.fetchall()
            print(f"Documents to scan: {len(docs)}")

            all_trainers = Counter()  # name -> total mentions
            trainer_docs = {}         # name -> set of document ids

            for d in docs:
                found = extract_trainers_from_text(d.get("tekst", ""))
                for name, cnt in found.items():
                    all_trainers[name] += cnt
                    trainer_docs.setdefault(name, set()).add(d.get("id"))

            print(f"Unique trainer names found: {len(all_trainers)}")
            print("Top 20 by mentions:")
            for name, cnt in all_trainers.most_common(20):
                doc_ids = trainer_docs.get(name, set())
                # The "klubova" label is kept for output compatibility; the
                # number printed is the count of distinct documents.
                print(f" {name:35} mentions={cnt:>3} klubova={len(doc_ids)}")

            # Insert into dabi.knowledge as forensic findings
            fact_inserted = 0
            for name, cnt in all_trainers.most_common(500):
                if cnt < 2:
                    # most_common() is sorted by descending count, so every
                    # remaining name is a one-time mention (noise) as well.
                    break
                doc_ids = [c for c in trainer_docs.get(name, set()) if c]

                fact = f"Trener {name} spomenut {cnt}x u {len(doc_ids)} dokumenata PGŽ klubova."
                h = hashlib.md5(fact.encode()).hexdigest()
                # The "clubs" key is kept for compatibility; it carries up to
                # 10 document ids. default=str keeps serialization from
                # failing on id types json cannot encode natively.
                refs = json.dumps(
                    {"name": name, "mentions": cnt, "clubs": doc_ids[:10]},
                    default=str,
                )

                try:
                    cur2.execute("""
                        INSERT INTO dabi.knowledge (fact, source, category, confidence, data_hash, source_refs)
                        VALUES (%s, 'trener_extract_pgz_sport', 'pgz_sport_treneri',
                                %s, %s, %s::jsonb)
                        ON CONFLICT (data_hash) DO NOTHING
                    """, (fact, min(0.7 + cnt*0.05, 0.95), h, refs))
                except psycopg2.Error as e:
                    # Best effort: report the failed row and continue.
                    print(f" err: {e}")
                else:
                    # rowcount is 0 when ON CONFLICT skipped the row.
                    if cur2.rowcount > 0:
                        fact_inserted += 1

            print(f"\nFacts inserted: {fact_inserted}")

            # Report the structure of pgz_sport.treneri (no rows are written)
            cur.execute("""
                SELECT column_name FROM information_schema.columns
                WHERE table_schema='pgz_sport' AND table_name='treneri'
                ORDER BY ordinal_position
            """)
            cols = [r["column_name"] for r in cur.fetchall()]
            print(f"\npgz_sport.treneri cols: {cols}")
    finally:
        # Release the connection even if a query above raised.
        conn.close()

    return {"trainers_found": len(all_trainers), "facts_inserted": fact_inserted}
|
|
|
|
|
|
# Script entry point: run the extraction and print its summary as JSON.
if __name__ == "__main__":
    summary = main()
    print(json.dumps(summary, default=str))
|