feat: /api/v2/analiza/* endpoints - sport analytics backend
This commit is contained in:
Executable
+140
@@ -0,0 +1,140 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
D news enrichment: search via DuckDuckGo HTML (no API key) for biographical news mentions.
|
||||
Take the first 2-3 hits, scrape each page, and extract the sentences that mention the athlete's surname.
|
||||
"""
|
||||
import os
|
||||
import re, json, time, sys
|
||||
import urllib.request, urllib.parse
|
||||
import psycopg2
|
||||
|
||||
# PostgreSQL connection parameters, passed as keyword arguments to
# psycopg2.connect() in main(). The password is read from the DB_PASSWORD
# environment variable; a missing variable raises KeyError at import time.
DB = dict(host="10.10.0.2", port=6432, dbname="rinet_v3", user="rinet", password=os.environ["DB_PASSWORD"])
|
||||
# User-Agent header value sent with every request made by http_get().
UA = "Mozilla/5.0 (X11; Linux x86_64; rv:121.0) Gecko/20100101 Firefox/121.0"
|
||||
|
||||
def out(msg):
    """Write *msg* followed by a newline to standard output and flush at once.

    The explicit flush means each progress line is emitted immediately
    instead of waiting in the stream buffer.
    """
    print(msg, file=sys.stdout, flush=True)
|
||||
|
||||
def http_get(url, timeout=12):
    """Fetch *url* with browser-like headers and return the body as text.

    Args:
        url: URL to request.
        timeout: Timeout in seconds handed to ``urllib.request.urlopen``.

    Returns:
        The decoded response body, or ``None`` when the request fails for
        any reason (network error, HTTP error, malformed URL, ...).
    """
    headers = {
        "User-Agent": UA,
        "Accept": "text/html,application/xhtml+xml",
        "Accept-Language": "hr,en",
    }
    try:
        req = urllib.request.Request(url, headers=headers)
        with urllib.request.urlopen(req, timeout=timeout) as r:
            raw = r.read()
            # Use the charset announced in Content-Type when there is one;
            # blindly assuming UTF-8 garbles pages served in legacy encodings.
            charset = r.headers.get_content_charset() or "utf-8"
        try:
            return raw.decode(charset, errors="replace")
        except LookupError:
            # The server announced an encoding Python does not know.
            return raw.decode("utf-8", errors="replace")
    except Exception:
        # Deliberately best-effort: callers treat None as "page unavailable"
        # and move on to the next candidate.
        return None
|
||||
|
||||
def ddg_search(query, limit=3):
    """Search DuckDuckGo's HTML endpoint and return the first results.

    Args:
        query: Free-text search query.
        limit: Maximum number of results to return.

    Returns:
        A list of ``(url, title)`` tuples in page order. The list is empty
        when the result page could not be fetched, nothing matched, or
        *limit* is not positive.
    """
    # Imported locally so this function is self-contained.
    from html import unescape

    if limit <= 0:
        return []

    page = http_get("https://html.duckduckgo.com/html/?q=" + urllib.parse.quote(query))
    if not page:
        return []

    anchor_re = r'<a [^>]*class="result__a"[^>]*href="([^"]+)"[^>]*>([^<]+)</a>'
    results = []
    for m in re.finditer(anchor_re, page):
        # Attribute values are HTML-escaped ("&amp;" between query params).
        href = unescape(m.group(1))
        title = unescape(m.group(2)).strip()

        # Result links are redirects carrying the real target, percent-encoded,
        # in the "uddg" query parameter. Parse the query string properly so the
        # target is decoded exactly once and a "&" inside the target's own
        # query string does not cut the URL short.
        try:
            params = urllib.parse.parse_qs(urllib.parse.urlsplit(href).query)
        except ValueError:
            params = {}
        target = params.get("uddg", [href])[0]

        results.append((target, title))
        if len(results) >= limit:
            break
    return results
|
||||
|
||||
def html_to_text(html):
    """Reduce an HTML document to whitespace-normalised plain text.

    Script and style elements are dropped together with their content, every
    remaining tag is replaced by a space, character references are decoded,
    and runs of whitespace are collapsed to a single space.

    Args:
        html: HTML markup as a string.

    Returns:
        The plain-text content. Leading/trailing single spaces left by
        removed tags are not stripped.
    """
    # Local import: the parameter name `html` shadows the stdlib module name
    # inside this function, so only the needed function is brought in.
    from html import unescape

    text = re.sub(r"<script.*?</script>", "", html, flags=re.S | re.I)
    text = re.sub(r"<style.*?</style>", "", text, flags=re.S | re.I)
    text = re.sub(r"<[^>]+>", " ", text)
    # Decode named and numeric references (&amp;, &nbsp;, &#269;, ...) rather
    # than deleting them, so non-ASCII letters survive.
    text = unescape(text)
    # \s also matches the no-break space produced by &nbsp;.
    return re.sub(r"\s+", " ", text)
|
||||
|
||||
def relevant_paragraph(text, ime, prez, sport):
    """Collect the sentences of *text* that mention the athlete's surname.

    Args:
        text: Plain text to scan.
        ime: First name. Not needed for matching, because any sentence that
            contains the full name also contains the surname.
        prez: Surname, matched case-insensitively as a substring.
        sport: Accepted for interface compatibility; not used for filtering.

    Returns:
        The matching sentences (only those longer than 50 characters) joined
        by single spaces. Collection stops once the joined text exceeds 800
        characters and the result is capped at 1500 characters. An empty
        string is returned when nothing matches or the surname is empty.
    """
    needle = (prez or "").strip().lower()
    # An empty needle is a substring of everything and would select every
    # sentence, so treat a missing surname as "no match".
    if not text or not needle:
        return ""

    picked = []
    joined_len = 0  # length of " ".join(picked), tracked incrementally
    for sentence in re.split(r'(?<=[.!?])\s+', text):
        if len(sentence) > 50 and needle in sentence.lower():
            joined_len += len(sentence) + (1 if picked else 0)
            picked.append(sentence)
            if joined_len > 800:
                break
    return " ".join(picked)[:1500]
|
||||
|
||||
# Hosts whose pages are never used as a biography source.
_SKIP_DOMAINS = (
    "facebook.com",
    "instagram.com",
    "twitter.com",
    "youtube.com",
    "x.com",
    "tiktok.com",
)


def _is_skipped_link(link):
    """Return True when *link* is hosted on one of ``_SKIP_DOMAINS``."""
    try:
        host = urllib.parse.urlsplit(link).hostname or ""
    except ValueError:
        # Unparseable URL: nothing useful can be fetched from it.
        return True
    # Compare the host name only; a substring test on the whole URL would
    # also reject e.g. "linux.com" (contains "x.com") or any article whose
    # path merely mentions a social network.
    return any(host == d or host.endswith("." + d) for d in _SKIP_DOMAINS)


def _find_bio(results, ime, prez, sport_kw):
    """Return ``(text, url)`` from the first result page that yields at least
    200 characters of relevant text, or ``("", None)`` when none does."""
    for link, _title in results[:3]:
        if _is_skipped_link(link):
            continue
        page = http_get(link, timeout=10)
        if not page:
            continue
        para = relevant_paragraph(html_to_text(page), ime, prez, sport_kw)
        if para and len(para) >= 200:
            return para, link
        time.sleep(0.3)  # brief pause before trying the next hit
    return "", None


def main():
    """Enrich athlete biographies from web search results.

    Selects up to 80 rows from ``pgz_sport.clanovi`` whose biography is
    missing or shorter than 200 characters (restricted by the category /
    award filter in the query), searches DuckDuckGo for each athlete, and,
    when a result page yields enough relevant text, writes that text to
    ``biografija`` and records a row in ``dabi.knowledge``. Progress is
    reported through ``out``.
    """
    conn = psycopg2.connect(**DB)
    try:
        # Autocommit: each statement is persisted on its own, so one failing
        # athlete does not roll back the others.
        conn.autocommit = True
        cr = conn.cursor()
        cr.execute("""SELECT DISTINCT c.id, c.ime, c.prezime, c.sport, c.kategorija_hoo, k.naziv
            FROM pgz_sport.clanovi c LEFT JOIN pgz_sport.klubovi k ON k.id = c.klub_id
            WHERE (c.kategorija_hoo IN (1, 2, 3)
            OR c.id IN (SELECT DISTINCT clan_id FROM pgz_sport.clan_nagrada
            WHERE clan_id IS NOT NULL AND razina_natjecanja IN ('SP','EP','OI','SK')))
            AND (c.biografija IS NULL OR LENGTH(c.biografija) < 200)
            ORDER BY c.kategorija_hoo NULLS LAST LIMIT 80""")
        targets = cr.fetchall()
        out(f"Targets: {len(targets)}")

        enriched = 0
        tried = 0

        for tid, ime, prez, sport, _kat, klub in targets:
            tried += 1
            full = f"{ime} {prez}"
            sport_kw = sport or "sportaš"

            results = ddg_search(f'"{full}" {sport_kw} Rijeka', limit=3)
            if results:
                bio_text, bio_url = _find_bio(results, ime, prez, sport_kw)
            else:
                bio_text, bio_url = "", None

            if bio_text:
                try:
                    cr.execute("""UPDATE pgz_sport.clanovi
                        SET biografija = %s,
                        source_url = COALESCE(source_url, %s),
                        source_synced_at = now()
                        WHERE id = %s""", (bio_text, bio_url, tid))
                    cr.execute("""INSERT INTO dabi.knowledge (fact, source, confidence, category, created_at)
                        VALUES (%s, %s, %s, %s, now()) ON CONFLICT DO NOTHING""",
                               (f"{full} ({sport or '?'}, {klub or 'PGŽ'}) — {bio_text[:600]} (izvor: {bio_url})"[:2000],
                                "online_news", 0.85, "biografija_sportasa"))
                    enriched += 1
                    out(f" ✓ {full} - {len(bio_text)} chars from {bio_url[:80]}")
                except Exception as e:
                    # Kept broad on purpose: besides database errors, the
                    # driver can reject scraped text (e.g. embedded NUL
                    # characters) with a plain ValueError. Log and carry on.
                    out(f" ERR {full}: {e}")

            # Report on every 15th athlete regardless of outcome; the skip
            # paths no longer bypass this line.
            if tried % 15 == 0:
                out(f" Progress: tried={tried} enriched={enriched}")
            time.sleep(0.5)  # politeness delay between searches

        out(f"=== DONE: tried={tried} enriched={enriched} ===")
    finally:
        # Release the connection even when a query or the loop raises.
        conn.close()
|
||||
|
||||
# Run the enrichment only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user