feat: /api/v2/analiza/* endpoints - sport analytics backend

2026-05-16 00:28:12 +02:00
parent 7ca5d7d94e
commit aca5051418
1355 changed files with 321891 additions and 4128 deletions
@@ -0,0 +1,169 @@
+#!/usr/bin/env python3
+# ═══════════════════════════════════════════════════════════════════
+# Fajl: pgz_savezi_deep.py | v1.0.0 | 05.05.2026
+# Lokacija: /opt/pgz-sport/scrapers/pgz_savezi_deep.py
+# Svrha: Deep crawl glavnih sportskih saveza za PGŽ klubove
+#   - HNS (nogomet) — hns-cff.hr, prvahnl.hr
+#   - HKS (košarka) — hks.hr, abaliga.com
+#   - HRS (rukomet) — hrs.hr
+#   - HOS (odbojka) — hos.hr
+#   - HBS (boćanje) — hbs.hr
+#   - HVS (vaterpolo) — hvs.hr
+#   Sve klube + utakmice + rezultate koji su u PGŽ
+# ═══════════════════════════════════════════════════════════════════
+"""Multi-savez deep scrape for PGŽ clubs."""
+import os, sys, re, time, hashlib, logging, json
+from urllib.parse import urljoin, urlparse
+import urllib.request
+from html import unescape
+import psycopg2
+from psycopg2.extras import execute_batch
+
logging.basicConfig(level=logging.INFO, format="%(asctime)s [savezi] %(message)s")
log = logging.getLogger("savezi")


def _quote_dsn_value(value):
    """Return *value* as a single-quoted libpq keyword/value literal.

    In a libpq key/value connection string an unquoted value ends at the
    first space, so a password containing a space, a single quote or a
    backslash would corrupt the DSN. Inside single quotes only the
    backslash and the single quote need escaping (with a backslash), and a
    quoted value is valid for every input, including the empty string.
    """
    escaped = value.replace("\\", "\\\\").replace("'", "\\'")
    return f"'{escaped}'"


# Key/value connection string passed to psycopg2.connect().
# The password is read from the DB_PASSWORD environment variable; when the
# variable is missing this line raises KeyError at import time, so the script
# fails before any crawling starts.
DSN = ("host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet "
       f"password={_quote_dsn_value(os.environ['DB_PASSWORD'])}")
# User-Agent header value sent by fetch() with every HTTP request.
UA = "Mozilla/5.0 (Ri.NET Civic Bot 1.0; contact: dradulic@outlook.com)"
+
# Place names of the Primorsko-goranska county (PGŽ). is_pgz_relevant() does a
# plain substring search for each of these to decide whether a page is worth
# ingesting.
# NOTE(review): "Costrena" looks like a spelling variant of "Kostrena" kept on
# purpose to catch misspellings - confirm before removing.
PGZ_TOWNS = [
    "Rijeka",
    "Opatija",
    "Crikvenica",
    "Krk",
    "Cres",
    "Mali Lošinj",
    "Rab",
    "Delnice",
    "Vrbovsko",
    "Čabar",
    "Bakar",
    "Kraljevica",
    "Kastav",
    "Viškovo",
    "Klana",
    "Mošćenička Draga",
    "Lovran",
    "Matulji",
    "Omišalj",
    "Punat",
    "Vrbnik",
    "Baška",
    "Dobrinj",
    "Malinska",
    "Jelenje",
    "Costrena",
    "Kostrena",
    "Čavle",
    "Lopar",
    "Brod Moravice",
    "Mrkopalj",
    "Ravna Gora",
    "Lokve",
    "Skrad",
    "Fužine",
    "Novi Vinodolski",
    "Vinodol",
]

# Seed URLs per federation. The key becomes part of the stored "source" value
# (see upsert()); main() walks the entries in this order.
ROOTS = {
    "hns_nogomet": [
        "https://hns-cff.hr/",
        "https://prvahnl.hr/",
        "https://hns-cff.hr/klubovi/",
    ],
    "hks_kosarka": [
        "https://hks.hr/",
        "https://hks.hr/klubovi/",
    ],
    "hrs_rukomet": [
        "https://hrs.hr/",
        "https://hrs.hr/klubovi/",
    ],
    "hos_odbojka": [
        "https://hos.hr/",
        "https://hos.hr/klubovi/",
    ],
    "hbs_bocanje": [
        "https://hbs.hr/",
        "https://hbs.hr/klubovi/",
    ],
    "hvs_vaterpolo": [
        "https://hvs.hr/",
        "https://hvs.hr/klubovi/",
    ],
    "hps_plivanje": [
        "https://hps.hr/",
    ],
    # NOTE(review): key says "haof" while the host is haaf.hr - confirm which
    # spelling is intended before renaming, the key is persisted in the DB.
    "haof_atletika": [
        "https://haaf.hr/",
    ],
    "hgsf_gimnastika": [
        "https://hgsf.hr/",
    ],
}
+
+
def fetch(url, timeout=20, retries=2):
    """Download *url* and return ``(text, status)``.

    Args:
        url: Absolute URL to request.
        timeout: Socket timeout in seconds for each attempt.
        retries: Maximum number of attempts; ``0`` performs no request.

    Returns:
        ``(body_text, http_status)`` on success, ``(None, 0)`` when every
        attempt failed. The body is decoded with the charset announced in the
        Content-Type header (UTF-8 when absent or unknown); undecodable bytes
        are replaced rather than raising.

    The function never raises for network or protocol errors: each failure is
    logged and the next attempt follows after a linearly growing pause.
    """
    # Local import: the file only imports urllib.request at module level.
    import urllib.error

    for attempt in range(retries):
        try:
            req = urllib.request.Request(url, headers={"User-Agent": UA})
            with urllib.request.urlopen(req, timeout=timeout) as resp:
                raw = resp.read()
                charset = resp.headers.get_content_charset() or "utf-8"
                try:
                    text = raw.decode(charset, errors="replace")
                except LookupError:
                    # Server announced a codec Python does not know.
                    text = raw.decode("utf-8", errors="replace")
                return text, resp.status
        except urllib.error.HTTPError as exc:
            # Client errors are permanent (missing page, forbidden, ...);
            # only timeout / rate-limit responses are worth another attempt.
            if 400 <= exc.code < 500 and exc.code not in (408, 429):
                log.warning("fetch %s: HTTP %s, not retrying", url, exc.code)
                break
            log.warning("fetch %s: HTTP %s (attempt %d/%d)",
                        url, exc.code, attempt + 1, retries)
        except Exception as exc:
            # Best-effort crawler: any other failure is logged and retried.
            log.warning("fetch %s: %s (attempt %d/%d)",
                        url, exc, attempt + 1, retries)
        # Back off only when another attempt will actually follow.
        if attempt + 1 < retries:
            time.sleep(2 * (attempt + 1))
    return None, 0
+
+
def extract_text(html):
    """Reduce an HTML document to its visible text on a single line.

    ``<script>`` and ``<style>`` elements are removed together with their
    content, every remaining tag becomes a space, HTML entities are decoded
    and all whitespace runs collapse to one space. ``None`` or an empty
    string yields ``""``.
    """
    flags = re.S | re.I
    markup = html or ""
    markup = re.sub(r"<script[^>]*>.*?</script>", "", markup, flags=flags)
    markup = re.sub(r"<style[^>]*>.*?</style>", "", markup, flags=flags)
    # Tags turn into spaces so words from adjacent elements do not fuse.
    tagless = re.sub(r"<[^>]+>", " ", markup)
    decoded = unescape(tagless)
    collapsed = re.sub(r"\s+", " ", decoded)
    return collapsed.strip()
+
+
def is_pgz_relevant(text):
    """Tell whether *text* mentions the PGŽ county or one of its towns.

    The check is a case-sensitive substring search: first for every entry of
    PGZ_TOWNS, then for the markers "Primorsko-goranska" and "PGŽ".
    """
    for town in PGZ_TOWNS:
        if town in text:
            return True
    if "Primorsko-goranska" in text:
        return True
    return "PGŽ" in text
+
+
def chunk(text, max_len=800):
    """Split *text* into pieces of at most *max_len* characters.

    Args:
        text: The string to split.
        max_len: Upper bound for the length of one piece; must be positive
            whenever *text* is non-empty.

    Returns:
        A list of stripped pieces. Text that already fits is returned as a
        single piece (or ``[]`` when empty). Longer text is cut at the last
        sentence separator found in the second half of each window, or hard
        at *max_len* when there is none; pieces of 80 characters or fewer
        are then discarded.

    Raises:
        ValueError: if *max_len* is not positive and *text* is non-empty.
            Without this guard the window never advances and the loop would
            not terminate.
    """
    if not text:
        return []
    if max_len < 1:
        raise ValueError("max_len must be a positive integer")
    if len(text) <= max_len:
        return [text]

    # NOTE: the cut positions are kept exactly as before on purpose -
    # upsert() dedups on the md5 of each piece, so moving a boundary would
    # make already stored text look new.
    separators = (". ", "! ", "? ", "\n")
    total = len(text)
    pieces = []
    start = 0
    while start < total:
        end = min(start + max_len, total)
        if end < total:
            # First separator type (in priority order) whose last occurrence
            # lies past the middle of the window wins.
            for sep in separators:
                cut = text.rfind(sep, start, end)
                if cut > start + max_len // 2:
                    end = cut + len(sep)
                    break
        pieces.append(text[start:end].strip())
        start = end
    return [piece for piece in pieces if len(piece) > 80]
+
+
def upsert(conn, facts, savez_key):
    """Insert *facts* into ``dabi.knowledge`` and return how many were new.

    Args:
        conn: Open psycopg2 connection. No commit is issued here; the caller
            in this file (crawl_savez) enables autocommit.
        facts: Iterable of dicts with a ``"fact"`` text and optional
            ``"confidence"`` (default 0.82) and ``"url"`` entries.
        savez_key: Federation key; stored as source ``savezi_<savez_key>``.

    Returns:
        Number of rows actually inserted. Rows whose md5 ``data_hash``
        already exists are skipped by ``ON CONFLICT DO NOTHING`` and not
        counted. Returns 0 for empty input and when a database error occurs
        (the error is logged, not raised).
    """
    if not facts:
        return 0

    source = f"savezi_{savez_key}"
    rows = [
        (
            f["fact"],
            source,
            "pgz_sport_savezi",
            f.get("confidence", 0.82),
            # Content hash is the dedup key targeted by ON CONFLICT.
            hashlib.md5(f["fact"].encode()).hexdigest(),
            json.dumps({"url": f.get("url", "")}),
        )
        for f in facts
    ]
    sql = """INSERT INTO dabi.knowledge (fact, source, category, confidence, data_hash, source_refs)
             VALUES (%s, %s, %s, %s, %s, %s::jsonb) ON CONFLICT (data_hash) DO NOTHING"""

    inserted = 0
    try:
        # Rows are executed one by one because cursor.rowcount after a
        # batched multi-statement execute only describes the last statement,
        # which made the previous return value meaningless. The context
        # manager also closes the cursor when a statement fails.
        with conn.cursor() as cur:
            for row in rows:
                cur.execute(sql, row)
                if cur.rowcount > 0:
                    inserted += cur.rowcount
    except Exception as e:
        log.error(f"upsert: {e}")
        return 0
    return inserted
+
+
# Path suffixes of resources that cannot yield page text; following them only
# burns the per-federation page budget (stylesheet and icon links sit at the
# top of most pages and would otherwise be queued first).
_ASSET_SUFFIXES = (
    ".css", ".js", ".ico", ".png", ".jpg", ".jpeg", ".gif", ".svg", ".webp",
    ".bmp", ".woff", ".woff2", ".ttf", ".eot", ".mp3", ".mp4", ".avi",
    ".zip", ".rar", ".pdf", ".doc", ".docx", ".xls", ".xlsx",
)


def _same_host_links(html, page_url):
    """Return the http(s) links in *html* that stay on *page_url*'s host.

    Relative links are resolved against *page_url*, fragments are dropped,
    static assets are skipped and each URL appears once, in document order.
    """
    base_host = urlparse(page_url).hostname or ""
    links = []
    seen = set()
    for match in re.finditer(r'href=["\']([^"\']+)["\']', html):
        try:
            target = urljoin(page_url, match.group(1)).split("#")[0]
            parts = urlparse(target)
            host = parts.hostname or ""
        except ValueError:
            # Malformed href (e.g. a broken IPv6 literal): skip this link
            # instead of aborting the whole crawl.
            continue
        if parts.scheme not in ("http", "https") or host != base_host:
            continue
        if parts.path.lower().endswith(_ASSET_SUFFIXES):
            continue
        if target not in seen:
            seen.add(target)
            links.append(target)
    return links


def crawl_savez(savez_key, urls, max_per=80):
    """Breadth-first crawl of one federation and ingestion of PGŽ pages.

    Args:
        savez_key: Federation key; used for logging and passed to upsert().
        urls: Seed URLs the crawl starts from.
        max_per: Maximum number of URLs to visit (failed fetches count).

    Returns:
        Dict with the keys ``savez``, ``visited``, ``pgz_relevant`` and
        ``facts`` (rows reported as inserted by upsert()).

    Raises:
        Whatever psycopg2.connect() raises when the database is unreachable.

    Every fetched page contributes its same-host links to the queue (capped
    at 200 pending entries); only pages for which is_pgz_relevant() is true
    are chunked and stored.
    """
    log.info(f"=== {savez_key} ===")
    conn = psycopg2.connect(DSN)
    conn.autocommit = True

    visited = set()
    queue = list(urls)
    # Everything ever queued, so one URL never takes two queue slots.
    queued = set(queue)
    total_facts = 0
    pgz_relevant = 0

    try:
        while queue and len(visited) < max_per:
            url = queue.pop(0)
            if url in visited:
                continue
            visited.add(url)

            html, _ = fetch(url, timeout=15)
            # Politeness pause after every request, not only after pages
            # that end up being ingested.
            time.sleep(0.4)
            if not html:
                continue

            text = extract_text(html)
            if not text or len(text) < 100:
                continue

            # Add subpages
            for link in _same_host_links(html, url):
                if len(queue) >= 200:
                    break
                if link not in queued:
                    queued.add(link)
                    queue.append(link)

            # Only ingest PGŽ-relevant content
            if not is_pgz_relevant(text):
                continue
            pgz_relevant += 1

            facts = [{"fact": c, "url": url, "confidence": 0.82}
                     for c in chunk(text, 800) if len(c) > 100]
            total_facts += upsert(conn, facts, savez_key)
    finally:
        # Release the connection even when the crawl aborts with an error.
        conn.close()

    log.info(f"  {savez_key}: visited={len(visited)} pgz_relevant={pgz_relevant} facts={total_facts}")
    return {"savez": savez_key, "visited": len(visited),
            "pgz_relevant": pgz_relevant, "facts": total_facts}
+
+
def main():
    """Crawl every federation listed in ROOTS and print a JSON summary.

    Each federation is crawled with a budget of 60 pages. A failure in one
    federation is logged and recorded as ``{"savez": ..., "error": ...}`` so
    the remaining ones still run. The summary written to stdout holds the
    per-federation results plus the overall ``total_facts`` count.
    """
    summary = []
    for key in ROOTS:
        seeds = ROOTS[key]
        try:
            outcome = crawl_savez(key, seeds, max_per=60)
        except Exception as exc:
            log.error(f"{key} fail: {exc}")
            outcome = {"savez": key, "error": str(exc)}
        summary.append(outcome)

    # Error entries carry no "facts" key and therefore count as zero.
    fact_total = 0
    for entry in summary:
        fact_total += entry.get("facts", 0)

    report = {"summary": summary, "total_facts": fact_total}
    print(json.dumps(report))


if __name__ == "__main__":
    main()