feat: /api/v2/analiza/* endpoints - sport analytics backend
This commit is contained in:
@@ -1,4 +1,7 @@
|
||||
#!/usr/bin/env python3
|
||||
from dotenv import load_dotenv
|
||||
load_dotenv("/opt/rinet-gpu/.env.master")
|
||||
# auto-added
|
||||
"""Common scraper helpers."""
|
||||
import os, re, time, json, hashlib
|
||||
from urllib.parse import urljoin, urlparse, urlencode, quote
|
||||
@@ -7,7 +10,7 @@ from html import unescape
|
||||
import psycopg2
|
||||
from psycopg2.extras import execute_batch
|
||||
|
||||
DSN = "host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password=R1net2026!SecureDB#v7"
|
||||
DSN = f"host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password={os.environ['DB_PASSWORD']}"
|
||||
UA = "Mozilla/5.0 (Ri.NET Civic Bot 1.0; contact: dradulic@outlook.com)"
|
||||
|
||||
|
||||
@@ -63,6 +66,7 @@ def upsert_facts(conn, facts, source_name, category, confidence=0.85):
|
||||
cur = conn.cursor()
|
||||
rows = []
|
||||
for f in facts:
|
||||
f["fact"] = f["fact"].replace("\x00", "")
|
||||
h = hashlib.md5(f["fact"].encode()).hexdigest()
|
||||
rows.append((f["fact"], source_name, category, confidence, h,
|
||||
json.dumps({"url": f.get("url", ""), "title": f.get("title", "")})))
|
||||
|
||||
@@ -0,0 +1,93 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Common scraper helpers."""
|
||||
import os, re, time, json, hashlib
|
||||
from urllib.parse import urljoin, urlparse, urlencode, quote
|
||||
import urllib.request
|
||||
from html import unescape
|
||||
import psycopg2
|
||||
from psycopg2.extras import execute_batch
|
||||
|
||||
# Database connection string in key=value form. The password is read from
# the DB_PASSWORD environment variable at import time, so importing this
# module raises KeyError when that variable is not set.
# NOTE(review): presumably handed to psycopg2.connect() by callers - confirm.
DSN = f"host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password={os.environ['DB_PASSWORD']}"
|
||||
# User-Agent header value that fetch() sends with every request; it
# identifies the bot and carries a contact address.
UA = "Mozilla/5.0 (Ri.NET Civic Bot 1.0; contact: dradulic@outlook.com)"
|
||||
|
||||
|
||||
def fetch(url, timeout=20, retries=3, binary=False):
    """Download *url* and return ``(body, status)``.

    The request carries the module-level ``UA`` string as its User-Agent.
    Up to *retries* attempts are made. Any exception counts as a failed
    attempt and is followed by a linearly growing pause (2 s, 4 s, ...)
    before the next attempt. No pause is taken after the final attempt,
    because nothing would follow it.

    Args:
        url: Address to request.
        timeout: Per-attempt timeout in seconds, passed to ``urlopen``.
        retries: Maximum number of attempts.
        binary: When true, return the raw bytes; otherwise decode the body
            as UTF-8, replacing undecodable bytes.

    Returns:
        ``(body, status)`` on success, or ``(None, 0)`` once every attempt
        has failed.
    """
    for attempt in range(1, retries + 1):
        try:
            req = urllib.request.Request(url, headers={"User-Agent": UA})
            with urllib.request.urlopen(req, timeout=timeout) as resp:
                payload = resp.read()
                body = payload if binary else payload.decode("utf-8", errors="replace")
                return body, resp.status
        except Exception:
            # Deliberately broad: this is a best-effort scraper helper and
            # callers rely on the (None, 0) sentinel instead of exceptions.
            if attempt < retries:
                time.sleep(2 * attempt)
    return None, 0
|
||||
|
||||
|
||||
def extract_text(html):
    """Reduce an HTML document to whitespace-normalised plain text.

    ``<script>``, ``<style>``, ``<nav>`` and ``<footer>`` elements are
    removed together with their contents, every remaining tag is replaced
    by a space, character references are decoded and runs of whitespace are
    collapsed to a single space.

    Args:
        html: Markup to convert; any falsy value yields ``""``.

    Returns:
        The visible text with leading and trailing whitespace stripped.
    """
    if not html:
        return ""
    # Elements whose content is never useful text; matched case-insensitively
    # and across line breaks.
    drop_patterns = (
        r"<script[^>]*>.*?</script>",
        r"<style[^>]*>.*?</style>",
        r"<nav[^>]*>.*?</nav>",
        r"<footer[^>]*>.*?</footer>",
    )
    remaining = html
    for pattern in drop_patterns:
        remaining = re.sub(pattern, "", remaining, flags=re.S | re.I)
    without_tags = re.sub(r"<[^>]+>", " ", remaining)
    decoded = unescape(without_tags)
    return re.sub(r"\s+", " ", decoded).strip()
|
||||
|
||||
|
||||
def extract_title(html):
    """Return the text of the first ``<title>`` element, or ``""``.

    The opening tag may carry attributes (for example
    ``<title data-rh="true">``). Matching is case-insensitive. Character
    references in the title are decoded and whitespace runs are collapsed
    to a single space.

    Args:
        html: Markup to search; ``None`` or an empty string yields ``""``.

    Returns:
        The cleaned title text, or ``""`` when no non-empty title is found.
    """
    # "(?:\s[^>]*)?" admits attributes while still rejecting other tags
    # that merely start with "title", such as "<titlebar>".
    m = re.search(r"<title(?:\s[^>]*)?>([^<]+)</title>", html or "", re.I)
    if not m:
        return ""
    return re.sub(r"\s+", " ", unescape(m.group(1))).strip()
|
||||
|
||||
|
||||
def chunk_text(text, max_len=800):
    """Split *text* into pieces of at most *max_len* characters.

    Text that already fits is returned as a single-element list (or an
    empty list for empty text). Longer text is cut greedily. Each cut is
    moved back to just after a sentence boundary when one lies in the
    second half of the window; otherwise the cut is made at *max_len*.
    Pieces are stripped, and those of 80 characters or fewer are dropped
    from the result.

    Args:
        text: String to split.
        max_len: Upper bound on the length of each piece.

    Returns:
        List of text pieces in their original order.
    """
    total = len(text)
    if total <= max_len:
        return [text] if text else []

    # Tried in this order; the first separator that qualifies wins.
    separators = (". ", "! ", "? ", "\n")
    pieces = []
    pos = 0
    while pos < total:
        stop = min(pos + max_len, total)
        if stop < total:
            # Only accept a boundary past the window's midpoint so pieces
            # do not become needlessly short.
            threshold = pos + max_len // 2
            for sep in separators:
                cut = text.rfind(sep, pos, stop)
                if cut > threshold:
                    stop = cut + len(sep)
                    break
        pieces.append(text[pos:stop].strip())
        pos = stop
    return [piece for piece in pieces if len(piece) > 80]
|
||||
|
||||
|
||||
def upsert_facts(conn, facts, source_name, category, confidence=0.85):
    """Insert facts into ``dabi.knowledge``, skipping ones already stored.

    Each fact is keyed by the MD5 hex digest of its text. Rows whose
    ``data_hash`` already exists are ignored via ``ON CONFLICT DO NOTHING``.
    This function does not commit; transaction handling is left to the
    caller.

    Args:
        conn: Open database connection providing ``cursor()``.
        facts: Iterable of dicts with a required ``"fact"`` string and
            optional ``"url"`` and ``"title"`` entries.
        source_name: Value stored in the ``source`` column for every row.
        category: Value stored in the ``category`` column for every row.
        confidence: Value stored in the ``confidence`` column for every row.

    Returns:
        ``cur.rowcount`` after the batch, or ``0`` when *facts* is empty or
        the batch failed.
        NOTE(review): after ``execute_batch`` the row count likely reflects
        only the last executed statement, not the whole batch - confirm
        before relying on it as an insert count.
    """
    if not facts:
        return 0

    rows = []
    for f in facts:
        # NUL bytes cannot be stored in a PostgreSQL text value; one such
        # byte would make the whole batch fail and be swallowed below.
        text = f["fact"].replace("\x00", "")
        digest = hashlib.md5(text.encode()).hexdigest()
        refs = json.dumps({"url": f.get("url", ""), "title": f.get("title", "")})
        rows.append((text, source_name, category, confidence, digest, refs))

    sql = ("INSERT INTO dabi.knowledge (fact, source, category, confidence, data_hash, source_refs) "
           "VALUES (%s, %s, %s, %s, %s, %s::jsonb) ON CONFLICT (data_hash) DO NOTHING")

    cur = conn.cursor()
    try:
        execute_batch(cur, sql, rows, page_size=50)
        return cur.rowcount
    except Exception:
        # Best-effort by design: a failed batch is reported as zero rows.
        return 0
    finally:
        # Release the cursor on the failure path too.
        cur.close()
|
||||
|
||||
|
||||
# Matches a quoted href attribute and captures its raw value. Attribute
# names are case-insensitive in HTML, hence re.I.
HREF_RE = re.compile("href=[\"']([^\"']+)[\"']", re.I)


def find_internal_links(html, base_url):
    """Collect the links in *html* that point at the host of *base_url*.

    Every quoted ``href`` value is HTML-unescaped, resolved against
    *base_url* and kept when its hostname equals that of *base_url*.
    Fragments are stripped, duplicates are removed, and hrefs that cannot
    be parsed as URLs are skipped.

    Args:
        html: Markup to scan; any falsy value yields ``[]``.
        base_url: URL of the page, used to resolve relative links and to
            decide which host counts as internal.

    Returns:
        Sorted list of unique absolute URLs without fragments.
    """
    if not html:
        return []
    base_host = urlparse(base_url).hostname or ""
    links = set()
    for m in HREF_RE.finditer(html):
        try:
            # Attribute values are entity-encoded in markup ("&amp;" for
            # "&"); decode before treating the value as a URL.
            target = urljoin(base_url, unescape(m.group(1)))
            host = urlparse(target).hostname or ""
        except ValueError:
            # Malformed href (e.g. an unterminated IPv6 literal); one bad
            # link must not abort the scan of the whole page.
            continue
        if host == base_host:
            links.add(target.split("#")[0])
    # Sorted so that callers see a deterministic order.
    return sorted(links)
|
||||
Reference in New Issue
Block a user