feat: /api/v2/analiza/* endpoints - sport analytics backend

2026-05-16 00:28:12 +02:00
parent 7ca5d7d94e
commit aca5051418
1355 changed files with 321891 additions and 4128 deletions
@@ -0,0 +1,140 @@
+#!/usr/bin/env python3
+"""
+D news enrichment: search via DuckDuckGo HTML (no API key) for biographical news mentions.
+Gather first 2-3 hits, scrape, extract relevant paragraphs that mention the athlete + sport.
+"""
+import os
+import re, json, time, sys
+import urllib.request, urllib.parse
+import psycopg2
+
+# Keyword arguments handed to psycopg2.connect() in main(). The password is
+# read from the DB_PASSWORD environment variable at import time, so loading
+# this module raises KeyError when that variable is not set.
+DB = dict(host="10.10.0.2", port=6432, dbname="rinet_v3", user="rinet", password=os.environ["DB_PASSWORD"])
+# User-Agent header value sent by http_get() with every outgoing request.
+UA = "Mozilla/5.0 (X11; Linux x86_64; rv:121.0) Gecko/20100101 Firefox/121.0"
+
+def out(msg):
+    """Write *msg* to standard output as a single line and flush right away."""
+    sys.stdout.write(str(msg) + "\n")
+    sys.stdout.flush()
+
+def http_get(url, timeout=12):
+    """Fetch *url* over HTTP(S) and return the response body as text.
+
+    Args:
+        url: Absolute ``http://`` or ``https://`` URL. Every other scheme
+            (``file:``, ``ftp:``, ...) is refused: the URLs passed in come
+            from scraped search results, and ``urlopen`` would otherwise
+            read local files for a ``file:`` link.
+        timeout: Socket timeout in seconds.
+
+    Returns:
+        The decoded body (at most 5 MiB of it), or ``None`` when the URL is
+        rejected or the request fails. Fetching is best-effort: network and
+        protocol errors are reported as ``None`` instead of being raised.
+    """
+    import http.client
+
+    # Upper bound on how much of a response is read, so one huge page cannot
+    # exhaust memory.
+    max_bytes = 5 * 1024 * 1024
+
+    try:
+        scheme = urllib.parse.urlsplit(url).scheme
+    except (AttributeError, TypeError, ValueError):
+        return None
+    if not isinstance(scheme, str) or scheme.lower() not in ("http", "https"):
+        return None
+
+    try:
+        req = urllib.request.Request(url, headers={
+            "User-Agent": UA,
+            "Accept": "text/html,application/xhtml+xml",
+            "Accept-Language": "hr,en"
+        })
+        with urllib.request.urlopen(req, timeout=timeout) as r:
+            raw = r.read(max_bytes)
+            # Honour the charset declared by the server; many regional news
+            # sites are not UTF-8.
+            charset = r.headers.get_content_charset() or "utf-8"
+    except (OSError, ValueError, http.client.HTTPException):
+        # OSError covers URLError/HTTPError and socket timeouts; ValueError
+        # covers malformed URLs; HTTPException covers broken responses.
+        return None
+
+    try:
+        return raw.decode(charset, errors="replace")
+    except LookupError:
+        # The server announced a codec Python does not know.
+        return raw.decode("utf-8", errors="replace")
+
+def ddg_search(query, limit=3):
+    """Search DuckDuckGo's HTML endpoint and return the first hits.
+
+    Args:
+        query: Free-text search query.
+        limit: Maximum number of results to return.
+
+    Returns:
+        A list of ``(url, title)`` tuples in page order. The list is empty
+        when the request fails or no result anchors are found.
+    """
+    from html import unescape
+
+    page = http_get("https://html.duckduckgo.com/html/?q=" + urllib.parse.quote(query))
+    if not page:
+        return []
+
+    # Capture the whole attribute text of each result anchor and pull href out
+    # of it afterwards, so the order of class/href inside the tag is irrelevant.
+    anchor_re = re.compile(
+        r'<a\s([^>]*\bclass="[^"]*\bresult__a\b[^"]*"[^>]*)>(.*?)</a>',
+        re.S | re.I)
+
+    results = []
+    for m in anchor_re.finditer(page):
+        if len(results) >= limit:
+            break
+        href = re.search(r'\bhref="([^"]*)"', m.group(1))
+        if not href:
+            continue
+        # Attribute values are HTML-escaped ("&amp;" between query parameters).
+        link = unescape(href.group(1))
+        if link.startswith("//"):
+            link = "https:" + link
+        try:
+            parts = urllib.parse.urlsplit(link)
+            host = (parts.hostname or "").lower()
+        except ValueError:
+            continue
+        if host in ("", "duckduckgo.com") or host.endswith(".duckduckgo.com"):
+            # Redirect link: the real target is the percent-encoded "uddg"
+            # parameter. parse_qs decodes it exactly once and only after the
+            # query has been split, so "&" inside the target URL survives.
+            target = urllib.parse.parse_qs(parts.query).get("uddg")
+            if target:
+                link = target[0]
+        # Titles may contain inline markup such as <b>; keep the text only.
+        title = unescape(re.sub(r'<[^>]+>', '', m.group(2))).strip()
+        results.append((link, title))
+    return results
+
+def html_to_text(html):
+    """Reduce an HTML document to plain text with collapsed whitespace.
+
+    Script and style elements and HTML comments are dropped, every remaining
+    tag is replaced by a space, character references are decoded, and each
+    run of whitespace becomes a single space. Leading and trailing spaces are
+    not stripped.
+
+    Args:
+        html: Markup as a string; ``None`` or ``""`` yields ``""``.
+
+    Returns:
+        The extracted text.
+    """
+    from html import unescape
+
+    if not html:
+        return ""
+    # Case-insensitive so <SCRIPT>/<Style> blocks are removed as well.
+    text = re.sub(r'<(script|style)\b.*?</\1\s*>', ' ', html, flags=re.S | re.I)
+    text = re.sub(r'<!--.*?-->', ' ', text, flags=re.S)
+    text = re.sub(r'<[^>]+>', ' ', text)
+    # Decode entities only after the tags are gone, and in one pass: numeric
+    # references such as &#269; must become letters (not vanish), and
+    # "&amp;quot;" must stay "&quot;" rather than being unescaped twice.
+    text = unescape(text)
+    # \s also matches the no-break space produced by &nbsp;.
+    return re.sub(r'\s+', ' ', text)
+
+def relevant_paragraph(text, ime, prez, sport):
+    """Collect the sentences of *text* that mention the athlete's surname.
+
+    Sentences are taken in document order; only those longer than 50
+    characters count. Collection stops once the joined result exceeds 800
+    characters, and the return value is capped at 1500 characters.
+
+    Args:
+        text: Plain text to scan.
+        ime: Given name. Accepted for interface compatibility; a sentence
+            containing the full name necessarily contains the surname, so
+            matching on the surname alone selects the same sentences.
+        prez: Surname to look for (case-insensitive).
+        sport: Accepted for interface compatibility; not used for filtering.
+
+    Returns:
+        The matching sentences joined by single spaces, or ``""`` when there
+        is no text, no surname, or no match.
+    """
+    surname = (prez or "").strip()
+    if not text or not surname:
+        # An empty surname would otherwise "match" every sentence.
+        return ""
+    # Anchor only the left edge of the surname: this stops "Ban" from matching
+    # inside "urban" while still accepting inflected forms ("Horvata").
+    mention = re.compile(r'(?<!\w)' + re.escape(surname), re.IGNORECASE)
+    picked = []
+    for sentence in re.split(r'(?<=[.!?])\s+', text):
+        if len(sentence) > 50 and mention.search(sentence):
+            picked.append(sentence)
+            if len(" ".join(picked)) > 800:
+                break
+    return " ".join(picked)[:1500]
+
+def _is_social_link(link):
+    """Return True when *link* is hosted on a social-media site (or is unparseable)."""
+    try:
+        host = (urllib.parse.urlsplit(link).hostname or "").lower()
+    except ValueError:
+        return True
+    # Look at the host only: a news article whose path mentions "instagram"
+    # is still a news article, and "x.com" must not match "netflix.com".
+    if host == "x.com" or host.endswith(".x.com"):
+        return True
+    return any(name in host for name in ("facebook", "instagram", "twitter", "youtube", "tiktok"))
+
+
+def _find_biography(ime, prez, sport_kw):
+    """Search the web for one athlete; return ``(text, url)`` or ``("", None)``."""
+    results = ddg_search(f'"{ime} {prez}" {sport_kw} Rijeka', limit=3)
+    for link, _title in results[:3]:
+        if _is_social_link(link):
+            continue
+        page = http_get(link, timeout=10)
+        if page:
+            para = relevant_paragraph(html_to_text(page), ime, prez, sport_kw)
+            if para and len(para) >= 200:
+                return para, link
+        # Short pause before trying the next candidate page.
+        time.sleep(0.3)
+    return "", None
+
+
+def main():
+    """Fill in short or missing athlete biographies from online news pages.
+
+    Selects up to 80 members from ``pgz_sport.clanovi`` whose biography is
+    NULL or shorter than 200 characters, searches the web for each of them,
+    and for every usable hit updates the member row and inserts a fact into
+    ``dabi.knowledge``. Progress is printed through ``out``.
+
+    The connection runs in autocommit mode, so each statement is committed
+    on its own: if the knowledge insert fails, the biography update has
+    already been stored. The connection is closed even when an error escapes.
+    """
+    conn = psycopg2.connect(**DB)
+    try:
+        conn.autocommit = True
+        cr = conn.cursor()
+        cr.execute("""SELECT DISTINCT c.id, c.ime, c.prezime, c.sport, c.kategorija_hoo, k.naziv 
+                      FROM pgz_sport.clanovi c LEFT JOIN pgz_sport.klubovi k ON k.id = c.klub_id
+                      WHERE (c.kategorija_hoo IN (1, 2, 3) 
+                         OR c.id IN (SELECT DISTINCT clan_id FROM pgz_sport.clan_nagrada 
+                                     WHERE clan_id IS NOT NULL AND razina_natjecanja IN ('SP','EP','OI','SK')))
+                        AND (c.biografija IS NULL OR LENGTH(c.biografija) < 200)
+                      ORDER BY c.kategorija_hoo NULLS LAST LIMIT 80""")
+        targets = cr.fetchall()
+        out(f"Targets: {len(targets)}")
+
+        enriched = 0
+        tried = 0
+
+        for tid, ime, prez, sport, _kat, klub in targets:
+            tried += 1
+            full = f"{ime} {prez}"
+
+            bio_text, bio_url = "", None
+            # Rows without a usable name cannot be searched for; they still
+            # count as tried.
+            if ime and prez:
+                bio_text, bio_url = _find_biography(ime, prez, sport or "sportaš")
+
+            if bio_text:
+                try:
+                    cr.execute("""UPDATE pgz_sport.clanovi 
+                                  SET biografija = %s,
+                                      source_url = COALESCE(source_url, %s),
+                                      source_synced_at = now()
+                                  WHERE id = %s""", (bio_text, bio_url, tid))
+                    cr.execute("""INSERT INTO dabi.knowledge (fact, source, confidence, category, created_at)
+                                  VALUES (%s, %s, %s, %s, now()) ON CONFLICT DO NOTHING""",
+                               (f"{full} ({sport or '?'}, {klub or 'PGŽ'}) — {bio_text[:600]} (izvor: {bio_url})"[:2000],
+                                "online_news", 0.85, "biografija_sportasa"))
+                except psycopg2.Error as e:
+                    out(f"  ERR {full}: {e}")
+                else:
+                    enriched += 1
+                    out(f"  ✓ {full} - {len(bio_text)} chars from {bio_url[:80]}")
+
+            # Report on every 15th target, including ones that yielded nothing.
+            if tried % 15 == 0:
+                out(f"  Progress: tried={tried} enriched={enriched}")
+            time.sleep(0.5)
+
+        out(f"=== DONE: tried={tried} enriched={enriched} ===")
+    finally:
+        conn.close()
+
+# Run the enrichment only when this file is executed as a script, not on import.
+if __name__ == "__main__":
+    main()