feat: /api/v2/analiza/* endpoints - sport analytics backend
This commit is contained in:
Executable
+173
@@ -0,0 +1,173 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
D: Wikipedia/online enrichment for top athletes.
For each athlete: fetch the hr.wikipedia + en.wikipedia summary,
extract bio + medals + date/place of birth, populate clanovi.biografija + dabi.knowledge facts.
"""
|
||||
import os
|
||||
import re, json, time
|
||||
import urllib.request, urllib.parse
|
||||
import psycopg2
|
||||
|
||||
# PostgreSQL connection keyword arguments, unpacked into psycopg2.connect().
# The password is read from the DB_PASSWORD environment variable; a missing
# variable raises KeyError as soon as this module is loaded.
DB = {
    "host": "10.10.0.2",
    "port": 6432,
    "dbname": "rinet_v3",
    "user": "rinet",
    "password": os.environ["DB_PASSWORD"],
}

# Value of the User-Agent header sent with every outgoing HTTP request.
UA = "Mozilla/5.0 (PGZSportBot/1.0)"

# Pause, in seconds, handed to time.sleep() between lookups.
DELAY = 0.5
|
||||
|
||||
def http_get(url, timeout=15):
    """Download ``url`` and return the response body as text.

    The request carries the module's User-Agent and an ``Accept-Language``
    of ``hr,en``. The body is decoded as UTF-8, with undecodable bytes
    replaced.

    Returns:
        The decoded body, or ``None`` when anything goes wrong (HTTP error
        status such as 404, network failure, timeout, malformed URL, ...).
    """
    try:
        request = urllib.request.Request(
            url,
            headers={"User-Agent": UA, "Accept-Language": "hr,en"},
        )
        with urllib.request.urlopen(request, timeout=timeout) as response:
            payload = response.read()
        return payload.decode("utf-8", errors="replace")
    except Exception:
        # HTTPError (404 included) is an Exception subclass and is reported
        # exactly like every other failure: as None.
        return None
|
||||
|
||||
def wiki_summary(title, lang="hr"):
    """Fetch the Wikipedia REST API page summary for ``title``.

    Args:
        title: Page title; spaces are converted to underscores.
        lang: Wikipedia language subdomain, e.g. ``"hr"`` or ``"en"``.

    Returns:
        The decoded summary as a ``dict``, or ``None`` when the request
        fails, the body is not valid JSON, or the JSON is not an object.
    """
    # safe="" percent-encodes "/" as well: the title is a single path
    # segment, and an unescaped slash would be read as an extra segment.
    slug = urllib.parse.quote(title.replace(" ", "_"), safe="")
    url = f"https://{lang}.wikipedia.org/api/rest_v1/page/summary/{slug}"

    raw = http_get(url)
    if not raw:
        return None

    try:
        data = json.loads(raw)
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        return None

    # Callers use dict access (.get); anything else is unusable to them.
    return data if isinstance(data, dict) else None
|
||||
|
||||
def wiki_search(query, lang="hr"):
    """Look up candidate Wikipedia pages for ``query`` via the opensearch API.

    At most three results are requested.

    Returns:
        A list of ``(title, url)`` tuples, or an empty list when the request
        fails or the response does not have the expected shape.
    """
    endpoint = (
        f"https://{lang}.wikipedia.org/w/api.php"
        f"?action=opensearch&search={urllib.parse.quote(query)}&limit=3&format=json"
    )
    body = http_get(endpoint)
    if not body:
        return []

    try:
        payload = json.loads(body)
        # Index 1 holds the titles and index 3 the matching page URLs.
        titles, links = payload[1], payload[3]
        return [(name, link) for name, link in zip(titles, links)]
    except Exception:
        return []
|
||||
|
||||
# Substrings (Croatian and English stems) whose presence in a page summary
# marks the page as sport-related. Matching is by substring, so short stems
# such as "igra", "atlet" or "šahis" also cover longer forms ("igrač",
# "atletičar", "šahist").
_SPORT_KEYWORDS = (
    "sport", "igra", "klub", "natjecat", "atlet", "vater", "ronil",
    "kuglač", "boćar", "skij", "karat", "kickbox", "wushu", "ju-jitsu",
    "sahist", "šahis", "biciklist", "plivač", "plivat", "boxer", "olimpij",
    "bonifac", "athlete", "compete", "sportaš", "swimmer", "diver",
    "sailor", "wrestler",
)

# Target list: HOO categories 1-3 plus members holding an award at
# SP/EP/OI/SK competition level.
_TARGETS_SQL = """
    SELECT DISTINCT c.id, c.ime, c.prezime, c.sport, c.kategorija_hoo, c.biografija, k.naziv
    FROM pgz_sport.clanovi c
    LEFT JOIN pgz_sport.klubovi k ON k.id = c.klub_id
    WHERE c.kategorija_hoo IN (1, 2, 3)
       OR c.id IN (SELECT DISTINCT clan_id FROM pgz_sport.clan_nagrada
                   WHERE clan_id IS NOT NULL AND razina_natjecanja IN ('SP','EP','OI','SK'))
    ORDER BY c.kategorija_hoo NULLS LAST, c.prezime, c.ime
"""

_UPDATE_BIO_SQL = """UPDATE pgz_sport.clanovi
    SET biografija = %s,
        source = COALESCE(source, %s),
        source_url = COALESCE(source_url, %s),
        source_synced_at = now()
    WHERE id = %s"""

_INSERT_FACT_SQL = """INSERT INTO dabi.knowledge (fact, source, confidence, category, created_at)
    VALUES (%s, %s, %s, %s, now()) ON CONFLICT DO NOTHING"""

_SAMPLE_BIOS_SQL = """SELECT ime, prezime, sport, LENGTH(biografija) AS bio_len
    FROM pgz_sport.clanovi
    WHERE biografija IS NOT NULL AND LENGTH(biografija) > 100
      AND source LIKE 'wikipedia%'
    ORDER BY bio_len DESC LIMIT 15"""


def _summary_text(summary):
    """Return lowercased description + extract (first 2000 chars) for keyword checks."""
    description = summary.get("description") or ""
    extract = summary.get("extract") or ""
    return f"{description} {extract}"[:2000].lower()


def _is_standard_page(summary):
    """Return True when the summary is a regular article, not a disambiguation page."""
    return summary.get("type") == "standard" and not summary.get("disambiguation")


def _mentions_sport(text):
    """Return True when ``text`` contains at least one sport keyword."""
    return any(keyword in text for keyword in _SPORT_KEYWORDS)


def _page_url(summary):
    """Return the desktop page URL stored in a summary, or None if absent."""
    return summary.get("content_urls", {}).get("desktop", {}).get("page")


def _find_article(full_name, sport):
    """Locate a plausible Wikipedia article for an athlete.

    Croatian Wikipedia is tried first, then English. For each language the
    page titled exactly ``full_name`` is tried, then up to three search hits.

    Returns:
        ``(summary, lang, url)`` for the first accepted page, else ``None``.
    """
    for lang in ("hr", "en"):
        # Direct title: accept a standard page that mentions a sport keyword.
        candidate = wiki_summary(full_name, lang)
        if (
            isinstance(candidate, dict)
            and _is_standard_page(candidate)
            and _mentions_sport(_summary_text(candidate))
        ):
            return candidate, lang, _page_url(candidate)
        time.sleep(DELAY)

        # Search fallback: hits are only loosely related to the name, so the
        # page must mention a sport keyword AND the athlete's own sport
        # (compared on its first five characters).
        for title, _url in wiki_search(full_name, lang):
            lowered = title.lower()
            if "razdvojba" in lowered or "disambiguation" in lowered:
                continue
            candidate = wiki_summary(title, lang)
            if not candidate or not isinstance(candidate, dict):
                continue
            text = _summary_text(candidate)
            if (
                _is_standard_page(candidate)
                and _mentions_sport(text)
                and sport
                and sport.lower()[:5] in text
            ):
                return candidate, lang, _page_url(candidate)
            time.sleep(DELAY)
    return None


def main():
    """Enrich athlete biographies in ``pgz_sport.clanovi`` from Wikipedia.

    Selects the target athletes, skips those whose ``biografija`` is already
    longer than 200 characters, and looks each remaining one up on Wikipedia.
    For every usable extract (at least 80 characters) it updates the
    athlete's row and inserts a matching fact into ``dabi.knowledge``.
    Progress and a closing summary are printed to stdout.

    Per-athlete database errors are printed and do not stop the run. The
    connection runs in autocommit mode and is always closed on exit.
    """
    conn = psycopg2.connect(**DB)
    try:
        conn.autocommit = True
        cr = conn.cursor()

        cr.execute(_TARGETS_SQL)
        targets = cr.fetchall()
        print(f"Targets: {len(targets)}")

        enriched = 0
        fact_count = 0

        for tid, ime, prez, sport, _kat, bio, klub in targets:
            if bio and len(bio) > 200:
                continue  # already enriched
            full = f"{ime} {prez}"

            found = _find_article(full, sport)
            if found is None:
                continue
            summary, wiki_lang, wiki_url = found

            extract = (summary.get("extract") or "").strip()[:1500]
            if len(extract) < 80:
                continue  # missing or too short to serve as a biography

            # Update clanovi; an existing source / source_url is preserved.
            try:
                cr.execute(
                    _UPDATE_BIO_SQL,
                    (extract, f"wikipedia_{wiki_lang}", wiki_url, tid),
                )
                enriched += 1
            except Exception as e:
                print(f" ERR update {full}: {e}")
                continue

            # Insert as fact in dabi.knowledge (best effort: a failure here
            # is reported but does not undo the biography update).
            fact = f"{full} ({sport}, {klub or 'PGŽ'}) — {extract[:600]} (Wikipedia {wiki_lang.upper()})"
            try:
                cr.execute(
                    _INSERT_FACT_SQL,
                    (fact[:2000], f"wikipedia_{wiki_lang}", 0.9, "biografija_sportasa"),
                )
                if cr.rowcount:
                    fact_count += 1
            except Exception as e:
                print(f" ERR fact {full}: {e}")

            print(f" ✓ {full} ({wiki_lang}) {len(extract)} chars")
            time.sleep(DELAY)

        print(f"\n=== DONE: {enriched} enriched, {fact_count} new facts ===")

        # Sample bios
        cr.execute(_SAMPLE_BIOS_SQL)
        print("\nTop bios:")
        for r in cr.fetchall():
            print(f" {r[0]} {r[1]} ({r[2]}) - {r[3]} chars")
    finally:
        # Release the connection even when a query or lookup raises.
        conn.close()
|
||||
|
||||
# Entry point: run the enrichment only when the file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user