feat: /api/v2/analiza/* endpoints - sport analytics backend

2026-05-16 00:28:12 +02:00
parent 7ca5d7d94e
commit aca5051418
1355 changed files with 321891 additions and 4128 deletions
@@ -1,4 +1,7 @@
 #!/usr/bin/env python3
+from dotenv import load_dotenv
+load_dotenv("/opt/rinet-gpu/.env.master")
+# auto-added
 """Common scraper helpers."""
 import os, re, time, json, hashlib
 from urllib.parse import urljoin, urlparse, urlencode, quote
@@ -7,7 +10,7 @@ from html import unescape
 import psycopg2
 from psycopg2.extras import execute_batch

-DSN = "host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password=R1net2026!SecureDB#v7"
+DSN = f"host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password={os.environ['DB_PASSWORD']}"
 UA = "Mozilla/5.0 (Ri.NET Civic Bot 1.0; contact: dradulic@outlook.com)"


@@ -63,6 +66,7 @@ def upsert_facts(conn, facts, source_name, category, confidence=0.85):
    cur = conn.cursor()
    rows = []
    for f in facts:
+        f["fact"] = f["fact"].replace("\x00", "")
        h = hashlib.md5(f["fact"].encode()).hexdigest()
        rows.append((f["fact"], source_name, category, confidence, h,
                     json.dumps({"url": f.get("url", ""), "title": f.get("title", "")})))
@@ -0,0 +1,93 @@
+#!/usr/bin/env python3
+"""Common scraper helpers."""
+import os, re, time, json, hashlib
+from urllib.parse import urljoin, urlparse, urlencode, quote
+import urllib.request
+from html import unescape
+import psycopg2
+from psycopg2.extras import execute_batch
+
+# Key=value database connection string (presumably handed to psycopg2.connect
+# by callers -- confirm). The password is read from the DB_PASSWORD environment
+# variable at import time, so importing this module with DB_PASSWORD unset
+# raises KeyError.
+DSN = f"host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password={os.environ['DB_PASSWORD']}"
+# User-Agent header value sent with every request made by fetch().
+UA = "Mozilla/5.0 (Ri.NET Civic Bot 1.0; contact: dradulic@outlook.com)"
+
+
+def fetch(url, timeout=20, retries=3, binary=False):
+    """Download ``url`` and return ``(body, status)``, retrying on any failure.
+
+    Every request carries the module-level ``UA`` string as its User-Agent.
+
+    Args:
+        url: Address to request.
+        timeout: Per-attempt timeout in seconds, passed to ``urlopen``.
+        retries: Maximum number of attempts.
+        binary: When true the body is returned as raw ``bytes``; otherwise it
+            is decoded as UTF-8 with undecodable bytes replaced.
+
+    Returns:
+        ``(body, status)`` from the first attempt that succeeds, or
+        ``(None, 0)`` once every attempt has failed (or ``retries`` is 0).
+    """
+    for attempt in range(retries):
+        try:
+            req = urllib.request.Request(url, headers={"User-Agent": UA})
+            with urllib.request.urlopen(req, timeout=timeout) as r:
+                d = r.read()
+                return (d if binary else d.decode("utf-8", errors="replace")), r.status
+        except Exception:
+            # Deliberately best-effort: any error counts as a failed attempt.
+            # Back off linearly (2s, 4s, ...) before the next try, but do not
+            # sleep after the last attempt -- there is nothing left to wait for.
+            if attempt + 1 < retries:
+                time.sleep(2 * (attempt + 1))
+    return None, 0
+
+
+def extract_text(html):
+    """Reduce an HTML document to its text content on a single line.
+
+    ``<script>``, ``<style>``, ``<nav>`` and ``<footer>`` elements are removed
+    together with their content; every remaining tag is replaced by a space,
+    character references are decoded, and runs of whitespace are collapsed.
+
+    Returns an empty string for empty or ``None`` input.
+    """
+    if not html:
+        return ""
+    # Elements dropped wholesale (content included), in this order.
+    block_patterns = (
+        r"<script[^>]*>.*?</script>",
+        r"<style[^>]*>.*?</style>",
+        r"<nav[^>]*>.*?</nav>",
+        r"<footer[^>]*>.*?</footer>",
+    )
+    remaining = html
+    for pattern in block_patterns:
+        remaining = re.sub(pattern, "", remaining, flags=re.S | re.I)
+    without_tags = re.sub(r"<[^>]+>", " ", remaining)
+    decoded = unescape(without_tags)
+    collapsed = re.sub(r"\s+", " ", decoded)
+    return collapsed.strip()
+
+
+def extract_title(html):
+    """Return the text of the first ``<title>`` element, whitespace-normalised.
+
+    The opening tag may carry attributes (e.g. ``<title id="t">``); matching is
+    case-insensitive. Character references in the title are decoded and runs
+    of whitespace are collapsed to single spaces.
+
+    Returns an empty string when ``html`` is empty/``None`` or has no title.
+    """
+    # \b keeps longer tag names such as <titlebar> from matching.
+    m = re.search(r"<title\b[^>]*>([^<]+)</title>", html or "", re.I)
+    if m is None:
+        return ""
+    return re.sub(r"\s+", " ", unescape(m.group(1))).strip()
+
+
+def chunk_text(text, max_len=800, min_len=80):
+    """Split ``text`` into pieces of at most ``max_len`` characters.
+
+    Text that already fits in ``max_len`` is returned whole as a one-item
+    list. Longer text is cut window by window: each window is shortened to end
+    right after a sentence break when one lies past the window's midpoint.
+    Separators are tried in the order ". ", "! ", "? ", newline, and the first
+    one whose last occurrence qualifies wins. Pieces are stripped of
+    surrounding whitespace.
+
+    Args:
+        text: The string to split.
+        max_len: Maximum piece length; must be positive for non-empty text.
+        min_len: On the splitting path, pieces whose stripped length is not
+            greater than this are discarded (this includes a short tail).
+
+    Returns:
+        A list of pieces; empty for empty input.
+
+    Raises:
+        ValueError: If ``text`` is non-empty and ``max_len`` is not positive
+            (the window would never advance).
+    """
+    if not text:
+        return []
+    if max_len <= 0:
+        raise ValueError("max_len must be a positive integer")
+    if len(text) <= max_len:
+        return [text]
+    out = []
+    start = 0
+    while start < len(text):
+        end = min(start + max_len, len(text))
+        if end < len(text):
+            for sep in (". ", "! ", "? ", "\n"):
+                p = text.rfind(sep, start, end)
+                # Only accept a break in the second half of the window so
+                # pieces do not become needlessly short.
+                if p > start + max_len // 2:
+                    end = p + len(sep)
+                    break
+        out.append(text[start:end].strip())
+        start = end
+    return [c for c in out if len(c) > min_len]
+
+
+def upsert_facts(conn, facts, source_name, category, confidence=0.85):
+    """Insert ``facts`` into ``dabi.knowledge``, skipping rows already stored.
+
+    Each fact is keyed by the MD5 hex digest of its text (``data_hash``); rows
+    whose hash already exists are ignored via ``ON CONFLICT DO NOTHING``. The
+    fact's ``url`` and ``title`` (empty string when absent) are stored as JSON
+    in ``source_refs``. This function does not commit.
+
+    Args:
+        conn: Open database connection providing ``cursor()``.
+        facts: Iterable of dicts with a required ``"fact"`` string and
+            optional ``"url"`` / ``"title"`` entries. The dicts are not
+            modified.
+        source_name: Value stored in the ``source`` column.
+        category: Value stored in the ``category`` column.
+        confidence: Value stored in the ``confidence`` column.
+
+    Returns:
+        ``cur.rowcount`` after the batch, or 0 when ``facts`` is empty or the
+        insert fails.
+    """
+    if not facts:
+        return 0
+    rows = []
+    for f in facts:
+        # NUL bytes are not accepted in PostgreSQL text values and would make
+        # the whole batch fail, so strip them before hashing and inserting.
+        text = f["fact"].replace("\x00", "")
+        h = hashlib.md5(text.encode()).hexdigest()
+        rows.append((text, source_name, category, confidence, h,
+                     json.dumps({"url": f.get("url", ""), "title": f.get("title", "")})))
+    sql = ("INSERT INTO dabi.knowledge (fact, source, category, confidence, data_hash, source_refs) "
+           "VALUES (%s, %s, %s, %s, %s, %s::jsonb) ON CONFLICT (data_hash) DO NOTHING")
+    cur = conn.cursor()
+    try:
+        execute_batch(cur, sql, rows, page_size=50)
+        # NOTE(review): with execute_batch, rowcount likely reflects only the
+        # last statement executed, not the total inserted -- confirm against
+        # the psycopg2 documentation before relying on this number.
+        return cur.rowcount
+    except Exception:
+        # Best-effort by design: a failed batch is reported as 0 rows.
+        # NOTE(review): no rollback happens here; if the failure was a
+        # database error the caller presumably has to roll back -- confirm.
+        return 0
+    finally:
+        # Close the cursor on both the success and the failure path.
+        cur.close()
+
+
+# Matches a quoted href attribute and captures its raw (still entity-encoded) value.
+HREF_RE = re.compile("href=[\"']([^\"']+)[\"']")
+
+
+def find_internal_links(html, base_url):
+    """Collect the same-host links found in ``html``.
+
+    Every quoted ``href`` value is entity-decoded (so ``&amp;`` in a query
+    string becomes ``&``), resolved against ``base_url``, and kept when its
+    hostname equals that of ``base_url``. Fragments (``#...``) are removed.
+
+    Args:
+        html: Page markup; empty or ``None`` yields an empty list.
+        base_url: URL of the page, used to resolve relative links and to
+            decide which host counts as internal.
+
+    Returns:
+        A sorted list of unique absolute URLs.
+    """
+    if not html:
+        return []
+    base_host = urlparse(base_url).hostname or ""
+    out = set()
+    for m in HREF_RE.finditer(html):
+        # Attribute values are HTML-encoded; decode before treating as a URL.
+        u = urljoin(base_url, unescape(m.group(1)))
+        host = urlparse(u).hostname or ""
+        if host == base_host:
+            out.add(u.split("#")[0])
+    # Sorted so the result is deterministic regardless of set ordering.
+    return sorted(out)