Sidebar: +ERP +CRM +Dokumenti, godišnjaci import (18 PDFs), filter helpers

- pgz nav now includes /erp/full, /crm/v2, /admin/users, /dokumenti
- 4 dokumenti endpoints: list, godišnjaci/list, godišnjak/{godina} PDF, detail
- 18 godišnjaka u pgz_sport.dokumenti (2006-2024) with savez_id=333
- PGŽ filter helpers (window._pgz_filter_priority, togglePGZFilter)
- navItemClick handler for nav items with href
2026-05-05 13:08:11 +02:00
parent 9fb512932a
commit 1d02c0897d
970 changed files with 268354 additions and 434 deletions
@@ -0,0 +1,169 @@
+#!/usr/bin/env python3
+# ═══════════════════════════════════════════════════════════════════
+# Fajl: pgz_savezi_deep.py | v1.0.0 | 05.05.2026
+# Lokacija: /opt/pgz-sport/scrapers/pgz_savezi_deep.py
+# Svrha: Deep crawl glavnih sportskih saveza za PGŽ klubove
+#   - HNS (nogomet) — hns-cff.hr, prvahnl.hr
+#   - HKS (košarka) — hks.hr, abaliga.com
+#   - HRS (rukomet) — hrs.hr
+#   - HOS (odbojka) — hos.hr
+#   - HBS (boćanje) — hbs.hr
+#   - HVS (vaterpolo) — hvs.hr
+#   Sve klube + utakmice + rezultate koji su u PGŽ
+# ═══════════════════════════════════════════════════════════════════
+"""Multi-savez deep scrape for PGŽ clubs."""
+import os, sys, re, time, hashlib, logging, json
+from urllib.parse import urljoin, urlparse
+import urllib.request
+from html import unescape
+import psycopg2
+from psycopg2.extras import execute_batch
+
+logging.basicConfig(level=logging.INFO, format="%(asctime)s [savezi] %(message)s")
+log = logging.getLogger("savezi")
+
+DSN = "host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password=R1net2026!SecureDB#v7"
+UA = "Mozilla/5.0 (Ri.NET Civic Bot 1.0; contact: dradulic@outlook.com)"
+
+# PGŽ municipalities — for filtering relevant clubs
+PGZ_TOWNS = ["Rijeka", "Opatija", "Crikvenica", "Krk", "Cres", "Mali Lošinj",
+             "Rab", "Delnice", "Vrbovsko", "Čabar", "Bakar", "Kraljevica",
+             "Kastav", "Viškovo", "Klana", "Mošćenička Draga", "Lovran",
+             "Matulji", "Omišalj", "Punat", "Vrbnik", "Baška", "Dobrinj",
+             "Malinska", "Jelenje", "Costrena", "Kostrena", "Čavle", "Lopar",
+             "Brod Moravice", "Mrkopalj", "Ravna Gora", "Lokve", "Skrad",
+             "Fužine", "Novi Vinodolski", "Vinodol"]
+
+ROOTS = {
+    "hns_nogomet":    ["https://hns-cff.hr/", "https://prvahnl.hr/", "https://hns-cff.hr/klubovi/"],
+    "hks_kosarka":    ["https://hks.hr/", "https://hks.hr/klubovi/"],
+    "hrs_rukomet":    ["https://hrs.hr/", "https://hrs.hr/klubovi/"],
+    "hos_odbojka":    ["https://hos.hr/", "https://hos.hr/klubovi/"],
+    "hbs_bocanje":    ["https://hbs.hr/", "https://hbs.hr/klubovi/"],
+    "hvs_vaterpolo":  ["https://hvs.hr/", "https://hvs.hr/klubovi/"],
+    "hps_plivanje":   ["https://hps.hr/"],
+    "haof_atletika":  ["https://haaf.hr/"],
+    "hgsf_gimnastika":["https://hgsf.hr/"],
+}
+
+
+def fetch(url, timeout=20, retries=2):
+    for i in range(retries):
+        try:
+            req = urllib.request.Request(url, headers={"User-Agent": UA})
+            with urllib.request.urlopen(req, timeout=timeout) as r:
+                return r.read().decode("utf-8", errors="replace"), r.status
+        except Exception:
+            time.sleep(2*(i+1))
+    return None, 0
+
+
+def extract_text(html):
+    h = re.sub(r"<script[^>]*>.*?</script>", "", html or "", flags=re.S|re.I)
+    h = re.sub(r"<style[^>]*>.*?</style>", "", h, flags=re.S|re.I)
+    t = re.sub(r"<[^>]+>", " ", h)
+    return re.sub(r"\s+", " ", unescape(t)).strip()
+
+
+def is_pgz_relevant(text):
+    """Check if text mentions PGŽ towns/clubs."""
+    return any(t in text for t in PGZ_TOWNS) or "Primorsko-goranska" in text or "PGŽ" in text
+
+
+def chunk(text, max_len=800):
+    if len(text) <= max_len: return [text] if text else []
+    out = []; start = 0
+    while start < len(text):
+        end = min(start + max_len, len(text))
+        if end < len(text):
+            for sep in [". ", "! ", "? ", "\n"]:
+                p = text.rfind(sep, start, end)
+                if p > start + max_len // 2:
+                    end = p + len(sep); break
+        out.append(text[start:end].strip())
+        start = end
+    return [c for c in out if len(c) > 80]
+
+
+def upsert(conn, facts, savez_key):
+    if not facts: return 0
+    cur = conn.cursor()
+    rows = []
+    for f in facts:
+        h = hashlib.md5(f["fact"].encode()).hexdigest()
+        rows.append((f["fact"], f"savezi_{savez_key}", "pgz_sport_savezi",
+                     f.get("confidence", 0.82), h,
+                     json.dumps({"url": f.get("url", "")})))
+    sql = """INSERT INTO dabi.knowledge (fact, source, category, confidence, data_hash, source_refs)
+             VALUES (%s, %s, %s, %s, %s, %s::jsonb) ON CONFLICT (data_hash) DO NOTHING"""
+    try:
+        execute_batch(cur, sql, rows, page_size=50)
+        n = cur.rowcount
+        cur.close()
+        return n
+    except Exception as e:
+        log.error(f"upsert: {e}")
+        return 0
+
+
+def crawl_savez(savez_key, urls, max_per=80):
+    log.info(f"=== {savez_key} ===")
+    conn = psycopg2.connect(DSN); conn.autocommit = True
+    
+    visited = set()
+    queue = list(urls)
+    total_facts = 0
+    pgz_relevant = 0
+    
+    while queue and len(visited) < max_per:
+        url = queue.pop(0)
+        if url in visited: continue
+        visited.add(url)
+        
+        html, _ = fetch(url, timeout=15)
+        if not html: continue
+        
+        text = extract_text(html)
+        if not text or len(text) < 100: continue
+        
+        # Add subpages
+        for m in re.finditer(r'href=["\']([^"\']+)["\']', html):
+            u = urljoin(url, m.group(1))
+            host = urlparse(u).hostname or ""
+            base_host = urlparse(url).hostname or ""
+            if host == base_host and u not in visited and len(queue) < 200:
+                queue.append(u.split("#")[0])
+        
+        # Only ingest PGŽ-relevant content
+        if not is_pgz_relevant(text):
+            continue
+        pgz_relevant += 1
+        
+        facts = [{"fact": c, "url": url, "confidence": 0.82}
+                 for c in chunk(text, 800) if len(c) > 100]
+        total_facts += upsert(conn, facts, savez_key)
+        
+        time.sleep(0.4)
+    
+    log.info(f"  {savez_key}: visited={len(visited)} pgz_relevant={pgz_relevant} facts={total_facts}")
+    conn.close()
+    return {"savez": savez_key, "visited": len(visited),
+            "pgz_relevant": pgz_relevant, "facts": total_facts}
+
+
+def main():
+    results = []
+    for savez, urls in ROOTS.items():
+        try:
+            r = crawl_savez(savez, urls, max_per=60)
+            results.append(r)
+        except Exception as e:
+            log.error(f"{savez} fail: {e}")
+            results.append({"savez": savez, "error": str(e)})
+    
+    print(json.dumps({"summary": results,
+                       "total_facts": sum(r.get("facts", 0) for r in results)}))
+
+
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,194 @@
+#!/usr/bin/env python3
+# ═══════════════════════════════════════════════════════════════════
+# Fajl: rss_hr_full.py | v1.0.0 | 05.05.2026
+# Lokacija: /opt/pgz-sport/scrapers/rss_hr_full.py
+# Svrha: rss.hr (Riječki sport savez) full crawl
+# ═══════════════════════════════════════════════════════════════════
+"""rss.hr complete corpus."""
+import os, sys, re, time, hashlib, logging, json
+from urllib.parse import urljoin, urlparse
+import urllib.request
+from html import unescape
+import psycopg2
+from psycopg2.extras import execute_batch
+
+logging.basicConfig(level=logging.INFO, format="%(asctime)s [rss_hr] %(message)s")
+log = logging.getLogger("rss_hr")
+
+DSN = "host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password=R1net2026!SecureDB#v7"
+UA = "Mozilla/5.0 (Ri.NET Civic Bot 1.0; contact: dradulic@outlook.com)"
+ROOT = "https://rss.hr"
+HTML_DIR = "/opt/pgz-sport/data/rss_hr_html"
+PDF_DIR = "/opt/pgz-sport/data/rss_hr_pdf"
+os.makedirs(HTML_DIR, exist_ok=True)
+os.makedirs(PDF_DIR, exist_ok=True)
+
+
+def fetch(url, timeout=20, retries=3, binary=False):
+    for i in range(retries):
+        try:
+            req = urllib.request.Request(url, headers={"User-Agent": UA})
+            with urllib.request.urlopen(req, timeout=timeout) as r:
+                d = r.read()
+                return d if binary else d.decode("utf-8", errors="replace"), r.status
+        except Exception:
+            time.sleep(3*(i+1))
+    return None, 0
+
+
+def extract_title(html):
+    m = re.search(r"<title>([^<]+)</title>", html or "", re.I)
+    return re.sub(r"\s+", " ", unescape(m.group(1))).strip() if m else ""
+
+
+def extract_text(html):
+    h = re.sub(r"<script[^>]*>.*?</script>", "", html or "", flags=re.S|re.I)
+    h = re.sub(r"<style[^>]*>.*?</style>", "", h, flags=re.S|re.I)
+    h = re.sub(r"<nav[^>]*>.*?</nav>", "", h, flags=re.S|re.I)
+    h = re.sub(r"<footer[^>]*>.*?</footer>", "", h, flags=re.S|re.I)
+    t = re.sub(r"<[^>]+>", " ", h)
+    return re.sub(r"\s+", " ", unescape(t)).strip()
+
+
+def find_internal_links(html, base):
+    if not html: return []
+    out = set()
+    for m in re.finditer(r'href=["\']([^"\']+)["\']', html):
+        u = urljoin(base, m.group(1))
+        host = urlparse(u).hostname or ""
+        if "rss.hr" in host:
+            # Strip query/fragment
+            u = u.split("#")[0]
+            out.add(u)
+    return list(out)
+
+
+def find_pdfs(html, base):
+    out = set()
+    for m in re.finditer(r'href=["\']([^"\']+\.pdf)["\']', html or "", re.I):
+        out.add(urljoin(base, m.group(1)))
+    return list(out)
+
+
+def chunk(text, max_len=800):
+    if len(text) <= max_len: return [text] if text else []
+    out = []
+    start = 0
+    while start < len(text):
+        end = min(start + max_len, len(text))
+        if end < len(text):
+            for sep in [". ", "! ", "? ", "\n"]:
+                p = text.rfind(sep, start, end)
+                if p > start + max_len // 2:
+                    end = p + len(sep); break
+        out.append(text[start:end].strip())
+        start = end
+    return [c for c in out if len(c) > 50]
+
+
+def upsert(conn, facts):
+    if not facts: return 0
+    cur = conn.cursor()
+    rows = []
+    for f in facts:
+        h = hashlib.md5(f["fact"].encode()).hexdigest()
+        rows.append((f["fact"], f["source"], f.get("category", "rss_hr"),
+                     f.get("confidence", 0.85), h,
+                     json.dumps({"url": f.get("url", "")})))
+    sql = """INSERT INTO dabi.knowledge (fact, source, category, confidence, data_hash, source_refs)
+             VALUES (%s, %s, %s, %s, %s, %s::jsonb) ON CONFLICT (data_hash) DO NOTHING"""
+    try:
+        execute_batch(cur, sql, rows, page_size=50)
+        n = cur.rowcount
+        cur.close()
+        return n
+    except Exception as e:
+        log.error(f"upsert: {e}")
+        return 0
+
+
+def crawl(max_pages=400):
+    log.info(f"=== rss.hr crawl (max {max_pages} pages) ===")
+    conn = psycopg2.connect(DSN); conn.autocommit = True
+    
+    visited = set()
+    queue = [ROOT, f"{ROOT}/clanovi/", f"{ROOT}/natjecanja/",
+             f"{ROOT}/dokumenti/", f"{ROOT}/o-nama/",
+             f"{ROOT}/sportasi-sezone/", f"{ROOT}/povjerenstva/",
+             f"{ROOT}/strucni-savjet/"]
+    
+    total_facts = 0
+    pdfs = set()
+    
+    while queue and len(visited) < max_pages:
+        url = queue.pop(0)
+        if url in visited: continue
+        visited.add(url)
+        
+        if len(visited) % 20 == 0:
+            log.info(f"  visited {len(visited)}, queue {len(queue)}, facts {total_facts}")
+        
+        result = fetch(url, timeout=15)
+        if not result or not result[0]:
+            continue
+        html = result[0]
+        
+        # Save html
+        try:
+            h = hashlib.md5(url.encode()).hexdigest()[:16]
+            with open(f"{HTML_DIR}/{h}.html", "w", encoding="utf-8") as f:
+                f.write(html)
+        except Exception:
+            pass
+        
+        title = extract_title(html)
+        text = extract_text(html)
+        
+        # PDFs
+        for p in find_pdfs(html, url):
+            pdfs.add(p)
+        
+        # Facts
+        facts = []
+        if title and len(title) > 8:
+            facts.append({"fact": f"rss.hr — {title}", "source": "rss.hr",
+                          "category": "rss_hr_riecki_sport_savez",
+                          "confidence": 0.90, "url": url})
+        for c in chunk(text, 800):
+            if len(c) < 80: continue
+            facts.append({"fact": c, "source": "rss.hr",
+                          "category": "rss_hr_riecki_sport_savez",
+                          "confidence": 0.85, "url": url})
+        
+        total_facts += upsert(conn, facts)
+        
+        # Discover more links
+        for l in find_internal_links(html, url):
+            if l not in visited and len(queue) < 1000:
+                queue.append(l)
+        
+        time.sleep(0.4)
+    
+    # Download PDFs
+    pdf_dl = 0
+    for p in list(pdfs)[:100]:
+        try:
+            h = hashlib.md5(p.encode()).hexdigest()[:16]
+            path = f"{PDF_DIR}/{h}.pdf"
+            if os.path.exists(path): continue
+            data, st = fetch(p, timeout=30, binary=True)
+            if data and st == 200:
+                with open(path, "wb") as f: f.write(data)
+                pdf_dl += 1
+                time.sleep(0.8)
+        except Exception:
+            pass
+    
+    log.info(f"=== DONE: {len(visited)} visited, {total_facts} facts, {pdf_dl} pdfs ===")
+    conn.close()
+    return {"visited": len(visited), "facts": total_facts, "pdfs": pdf_dl}
+
+
+if __name__ == "__main__":
+    r = crawl()
+    print(json.dumps(r))
@@ -0,0 +1,244 @@
+#!/usr/bin/env python3
+# ═══════════════════════════════════════════════════════════════════
+# Fajl: sport_pgz_full.py | v1.0.0 | 05.05.2026
+# Lokacija: /opt/pgz-sport/scrapers/sport_pgz_full.py
+# Svrha: Sitemap-driven full crawl of sport-pgz.hr
+#   - All 4 sitemaps: objave (1+2), natječaji, stranice
+#   - PDF download + OCR ingest
+#   - Article parsing → dabi.knowledge ingest
+# ═══════════════════════════════════════════════════════════════════
+"""sport-pgz.hr complete corpus via sitemap."""
+import os, sys, re, time, hashlib, logging, json
+from urllib.parse import urljoin, urlparse
+import urllib.request
+from html import unescape
+import psycopg2
+from psycopg2.extras import execute_batch
+
+logging.basicConfig(level=logging.INFO, format="%(asctime)s [sport_pgz] %(message)s")
+log = logging.getLogger("sport_pgz")
+
+DSN = "host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password=R1net2026!SecureDB#v7"
+UA = "Mozilla/5.0 (Ri.NET Civic Bot 1.0; contact: dradulic@outlook.com)"
+SITEMAP_INDEX = "https://sport-pgz.hr/sitemap.xml"
+PDF_DIR = "/opt/pgz-sport/data/sport_pgz_pdf"
+HTML_DIR = "/opt/pgz-sport/data/sport_pgz_html"
+
+os.makedirs(PDF_DIR, exist_ok=True)
+os.makedirs(HTML_DIR, exist_ok=True)
+
+
+def fetch(url, timeout=20, retries=3, binary=False):
+    for i in range(retries):
+        try:
+            req = urllib.request.Request(url, headers={"User-Agent": UA})
+            with urllib.request.urlopen(req, timeout=timeout) as r:
+                data = r.read()
+                if binary:
+                    return data, r.status
+                return data.decode("utf-8", errors="replace"), r.status
+        except Exception as e:
+            log.warning(f"fetch fail attempt {i+1} {url}: {e}")
+            time.sleep(3*(i+1))
+    return None, 0
+
+
+def parse_sitemap_index(xml):
+    """Return list of sub-sitemap URLs."""
+    return re.findall(r"<loc>(https?://[^<]+)</loc>", xml or "")
+
+
+def parse_sitemap_urls(xml):
+    """Return list of (url, lastmod) pairs."""
+    out = []
+    for m in re.finditer(r"<url>\s*<loc>([^<]+)</loc>(?:\s*<lastmod>([^<]*)</lastmod>)?", xml or ""):
+        out.append((m.group(1), m.group(2) or ""))
+    return out
+
+
+def extract_main_text(html):
+    if not html: return ""
+    h = re.sub(r"<script[^>]*>.*?</script>", "", html, flags=re.S|re.I)
+    h = re.sub(r"<style[^>]*>.*?</style>", "", h, flags=re.S|re.I)
+    h = re.sub(r"<nav[^>]*>.*?</nav>", "", h, flags=re.S|re.I)
+    h = re.sub(r"<footer[^>]*>.*?</footer>", "", h, flags=re.S|re.I)
+    text = re.sub(r"<[^>]+>", " ", h)
+    text = unescape(text)
+    return re.sub(r"\s+", " ", text).strip()
+
+
+def extract_title(html):
+    m = re.search(r"<title>([^<]+)</title>", html or "", re.I)
+    if m:
+        return re.sub(r"\s+", " ", unescape(m.group(1))).strip()
+    return ""
+
+
+def find_pdf_links(html, base):
+    if not html: return []
+    out = set()
+    for m in re.finditer(r'href=["\']([^"\']+\.pdf)["\']', html, re.I):
+        out.add(urljoin(base, m.group(1)))
+    return list(out)
+
+
+def chunk_text(text, max_len=800):
+    """Split into ~800 char chunks, prefer sentence boundaries."""
+    if len(text) <= max_len:
+        return [text] if text else []
+    chunks = []
+    start = 0
+    while start < len(text):
+        end = min(start + max_len, len(text))
+        # Find last . or ! or ? or newline
+        if end < len(text):
+            for sep in [". ", "! ", "? ", "\n"]:
+                p = text.rfind(sep, start, end)
+                if p > start + max_len // 2:
+                    end = p + len(sep)
+                    break
+        chunks.append(text[start:end].strip())
+        start = end
+    return [c for c in chunks if len(c) > 50]
+
+
+def upsert_facts(conn, facts):
+    """Bulk insert into dabi.knowledge."""
+    if not facts:
+        return 0
+    cur = conn.cursor()
+    rows = []
+    for f in facts:
+        h = hashlib.md5(f["fact"].encode()).hexdigest()
+        rows.append((
+            f["fact"], f["source"], f.get("category", "sport_pgz"),
+            f.get("confidence", 0.85), h,
+            json.dumps({"url": f.get("url", "")}),
+        ))
+    sql = """
+        INSERT INTO dabi.knowledge (fact, source, category, confidence, data_hash, source_refs)
+        VALUES (%s, %s, %s, %s, %s, %s::jsonb)
+        ON CONFLICT (data_hash) DO NOTHING
+    """
+    try:
+        execute_batch(cur, sql, rows, page_size=50)
+        cnt = cur.rowcount
+        cur.close()
+        return cnt
+    except Exception as e:
+        log.error(f"upsert err: {e}")
+        return 0
+
+
+def crawl():
+    """Main crawl entry."""
+    log.info(f"=== sport-pgz.hr full crawl ===")
+    conn = psycopg2.connect(DSN); conn.autocommit = True
+    
+    # 1. Get sitemap index
+    xml, _ = fetch(SITEMAP_INDEX)
+    if not xml:
+        log.error("sitemap fetch failed"); return
+    
+    sub_sitemaps = parse_sitemap_index(xml)
+    log.info(f"sub-sitemaps: {len(sub_sitemaps)}")
+    
+    all_urls = []
+    for sm in sub_sitemaps:
+        sub_xml, _ = fetch(sm)
+        if sub_xml:
+            urls = parse_sitemap_urls(sub_xml)
+            all_urls.extend(urls)
+            log.info(f"  {sm}: {len(urls)} urls")
+    
+    log.info(f"TOTAL URLs to crawl: {len(all_urls)}")
+    
+    # 2. Crawl each URL → text → facts
+    total_facts = 0
+    crawled = 0
+    pdfs_found = []
+    
+    for idx, (url, lastmod) in enumerate(all_urls, 1):
+        if idx % 20 == 0:
+            log.info(f"  progress: {idx}/{len(all_urls)} crawled, {total_facts} facts")
+        
+        try:
+            html, status = fetch(url, timeout=15)
+            if not html:
+                continue
+            
+            # Save HTML for replay
+            url_hash = hashlib.md5(url.encode()).hexdigest()[:16]
+            html_path = f"{HTML_DIR}/{url_hash}.html"
+            try:
+                with open(html_path, "w", encoding="utf-8") as f:
+                    f.write(html)
+            except Exception:
+                pass
+            
+            title = extract_title(html)
+            text = extract_main_text(html)
+            
+            # Collect PDFs
+            for pdf_url in find_pdf_links(html, url):
+                pdfs_found.append(pdf_url)
+            
+            # Build facts
+            facts = []
+            if title and len(title) > 10:
+                facts.append({
+                    "fact": f"sport-pgz.hr — {title}",
+                    "source": "sport-pgz.hr",
+                    "category": "sport_pgz_official",
+                    "confidence": 0.92,
+                    "url": url,
+                })
+            
+            for chunk in chunk_text(text, max_len=800):
+                if len(chunk) < 80:
+                    continue
+                facts.append({
+                    "fact": chunk,
+                    "source": "sport-pgz.hr",
+                    "category": "sport_pgz_official",
+                    "confidence": 0.88,
+                    "url": url,
+                })
+            
+            inserted = upsert_facts(conn, facts)
+            total_facts += inserted
+            crawled += 1
+            
+            time.sleep(0.5)  # rate limit
+        
+        except Exception as e:
+            log.warning(f"err {url}: {e}")
+    
+    # 3. Download PDFs
+    pdfs_set = list(set(pdfs_found))
+    log.info(f"PDF links found: {len(pdfs_set)}")
+    pdf_downloaded = 0
+    for pdf_url in pdfs_set[:200]:  # limit for first run
+        try:
+            url_hash = hashlib.md5(pdf_url.encode()).hexdigest()[:16]
+            pdf_path = f"{PDF_DIR}/{url_hash}.pdf"
+            if os.path.exists(pdf_path):
+                continue
+            data, status = fetch(pdf_url, timeout=30, binary=True)
+            if data and status == 200:
+                with open(pdf_path, "wb") as f:
+                    f.write(data)
+                pdf_downloaded += 1
+                time.sleep(1)
+        except Exception as e:
+            log.warning(f"pdf err {pdf_url}: {e}")
+    
+    log.info(f"=== DONE: {crawled} pages crawled, {total_facts} facts inserted, {pdf_downloaded} PDFs downloaded ===")
+    conn.close()
+    
+    return {"crawled": crawled, "facts": total_facts, "pdfs": pdf_downloaded}
+
+
+if __name__ == "__main__":
+    r = crawl()
+    print(json.dumps(r))
@@ -0,0 +1,147 @@
+#!/usr/bin/env python3
+# ═══════════════════════════════════════════════════════════════════
+# Fajl: wiki_pgz_sport.py | v1.0.0 | 05.05.2026
+# Lokacija: /opt/pgz-sport/scrapers/wiki_pgz_sport.py
+# Svrha: Wikipedia HR/EN scrape — PGŽ sport klubovi + sportaši
+#   - Iterate kroz sve known PGŽ klubove
+#   - Wiki API → page extract
+#   - Plus historical match results od Wikipedia season tables
+# ═══════════════════════════════════════════════════════════════════
+"""Wikipedia PGŽ sport corpus."""
+import os, sys, re, time, hashlib, logging, json
+import urllib.request, urllib.parse
+import psycopg2
+from psycopg2.extras import execute_batch
+
+logging.basicConfig(level=logging.INFO, format="%(asctime)s [wiki_sport] %(message)s")
+log = logging.getLogger("wiki_sport")
+
+DSN = "host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password=R1net2026!SecureDB#v7"
+UA = "Ri.NET Civic Bot 1.0 (contact: dradulic@outlook.com)"
+API_HR = "https://hr.wikipedia.org/w/api.php"
+API_EN = "https://en.wikipedia.org/w/api.php"
+
+
+def wiki_extract(api, title, sentences=None):
+    """Get plain text extract for a Wikipedia page."""
+    params = {
+        "action": "query", "prop": "extracts", "explaintext": "1",
+        "redirects": "1", "format": "json", "titles": title,
+    }
+    if sentences:
+        params["exsentences"] = str(sentences)
+    
+    url = api + "?" + urllib.parse.urlencode(params)
+    req = urllib.request.Request(url, headers={"User-Agent": UA})
+    try:
+        with urllib.request.urlopen(req, timeout=15) as r:
+            d = json.loads(r.read())
+        pages = d.get("query", {}).get("pages", {})
+        for pid, p in pages.items():
+            if pid == "-1": return None  # not found
+            return p.get("extract", "")
+    except Exception as e:
+        log.warning(f"wiki err {title}: {e}")
+        return None
+
+
+def wiki_search(api, query, limit=5):
+    """Search Wikipedia for related pages."""
+    params = {"action": "query", "list": "search", "srsearch": query,
+              "format": "json", "srlimit": str(limit)}
+    url = api + "?" + urllib.parse.urlencode(params)
+    req = urllib.request.Request(url, headers={"User-Agent": UA})
+    try:
+        with urllib.request.urlopen(req, timeout=10) as r:
+            d = json.loads(r.read())
+        return [p["title"] for p in d.get("query", {}).get("search", [])]
+    except Exception:
+        return []
+
+
+def get_pgz_clubs(conn):
+    """Fetch active PGŽ clubs from DB."""
+    cur = conn.cursor()
+    cur.execute("""
+        SELECT naziv, COALESCE(skraceni_naziv, '') 
+        FROM pgz_sport.klubovi 
+        WHERE COALESCE(aktivan, true) = true
+        ORDER BY naziv
+    """)
+    out = [(r[0], r[1]) for r in cur.fetchall()]
+    cur.close()
+    return out
+
+
+def chunk(text, max_len=700):
+    if len(text) <= max_len: return [text] if text else []
+    out = []; start = 0
+    while start < len(text):
+        end = min(start + max_len, len(text))
+        if end < len(text):
+            for sep in [". ", "! ", "? ", "\n"]:
+                p = text.rfind(sep, start, end)
+                if p > start + max_len // 2:
+                    end = p + len(sep); break
+        out.append(text[start:end].strip())
+        start = end
+    return [c for c in out if len(c) > 80]
+
+
+def upsert(conn, facts):
+    if not facts: return 0
+    cur = conn.cursor()
+    rows = []
+    for f in facts:
+        h = hashlib.md5(f["fact"].encode()).hexdigest()
+        rows.append((f["fact"], "wikipedia_pgz_sport", "pgz_sport_wiki",
+                     f.get("confidence", 0.84), h,
+                     json.dumps({"page": f.get("page", ""), "lang": f.get("lang", "hr")})))
+    sql = """INSERT INTO dabi.knowledge (fact, source, category, confidence, data_hash, source_refs)
+             VALUES (%s, %s, %s, %s, %s, %s::jsonb) ON CONFLICT (data_hash) DO NOTHING"""
+    try:
+        execute_batch(cur, sql, rows, page_size=50)
+        n = cur.rowcount; cur.close()
+        return n
+    except Exception as e:
+        log.error(f"upsert: {e}")
+        return 0
+
+
+def main():
+    conn = psycopg2.connect(DSN); conn.autocommit = True
+    clubs = get_pgz_clubs(conn)
+    log.info(f"PGŽ active clubs: {len(clubs)}")
+    
+    total_facts = 0
+    found_pages = 0
+    
+    for naziv, kraci in clubs[:200]:  # limit first run
+        # Try direct page first
+        text = wiki_extract(API_HR, naziv)
+        if not text:
+            # Try search
+            candidates = wiki_search(API_HR, naziv, limit=3)
+            for c in candidates:
+                if any(t.lower() in c.lower() for t in [naziv.split()[-1], "Rijeka", "Opatija", "Krk"]):
+                    text = wiki_extract(API_HR, c)
+                    if text:
+                        break
+        
+        if text and len(text) > 200:
+            found_pages += 1
+            facts = [{"fact": c, "page": naziv, "lang": "hr", "confidence": 0.85}
+                     for c in chunk(text, 700)]
+            total_facts += upsert(conn, facts)
+        
+        time.sleep(0.5)
+        if found_pages % 20 == 0 and found_pages > 0:
+            log.info(f"  progress: pages {found_pages}, facts {total_facts}")
+    
+    log.info(f"=== DONE: pages={found_pages} facts={total_facts} ===")
+    print(json.dumps({"pages": found_pages, "facts": total_facts}))
+    conn.close()
+
+
+if __name__ == "__main__":
+    main()