feat: /api/v2/analiza/* endpoints - sport analytics backend
This commit is contained in:
+229
@@ -0,0 +1,229 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
HBS PDF results scraper - Prvenstvo RH + Kup Hrvatske + Međunarodno natjecanje.

Parses the PDF result sheets, extracts the placements (1-3 = medals),
matches them against PGŽ players in the DB, and inserts them into clan_nagrada.
|
||||
"""
|
||||
import os, re, sys, time, json, html as ht
|
||||
import urllib.request, urllib.parse
|
||||
import subprocess
|
||||
import psycopg2
|
||||
import datetime as dt
|
||||
|
||||
# Keyword arguments passed to psycopg2.connect() by db(). The password is read
# from the DB_PASSWORD environment variable; importing this module raises
# KeyError when that variable is not set.
DB = {
    "host": "10.10.0.2",
    "port": 6432,
    "dbname": "rinet_v3",
    "user": "rinet",
    "password": os.environ["DB_PASSWORD"],
}

# Site root that the relative competition page paths are appended to.
BASE = "https://hrvatski-bocarski-savez.hr"

# User-Agent header sent with every HTTP request.
UA = "Mozilla/5.0 (PGZSportBot/1.0)"

# Local directory where downloaded PDFs are kept; created at import time.
TMP = "/tmp/hbs_pdf"
os.makedirs(TMP, exist_ok=True)

# Pause, in seconds, passed to time.sleep() between network requests.
DELAY = 1.0

# File that log() appends every message to.
LOG_FP = "/opt/pgz-sport/_logs/hbs_pdf_results.log"
|
||||
|
||||
def log(msg):
    """Print a timestamped message and append it to the log file at LOG_FP.

    The message is always written to stdout and flushed immediately. The
    file append is best-effort: if the log file cannot be opened or written
    the failure is ignored, so logging never aborts the scraper.

    Args:
        msg: Text to log. It is placed after a local ISO-8601 timestamp.
    """
    line = f"[{dt.datetime.now().isoformat()}] {msg}"
    print(line, flush=True)
    try:
        # Explicit UTF-8: messages carry Croatian letters and scraped labels,
        # which the locale's default encoding may not be able to represent.
        with open(LOG_FP, "a", encoding="utf-8") as f:
            f.write(line + "\n")
    except (OSError, UnicodeError):
        # Deliberately ignored (missing directory, permissions, ...). Unlike a
        # bare ``except`` this still lets KeyboardInterrupt/SystemExit through.
        pass
|
||||
|
||||
def fetch_html(url):
    """Download ``url`` and return its body as text.

    The request carries the module's User-Agent and times out after 20
    seconds. The body is decoded as UTF-8, with undecodable bytes replaced.

    Args:
        url: Absolute URL of the page to download.

    Returns:
        The decoded page text, or None if anything went wrong (the error is
        written to the log).
    """
    try:
        request = urllib.request.Request(url, headers={"User-Agent": UA})
        with urllib.request.urlopen(request, timeout=20) as response:
            raw = response.read()
        page = raw.decode("utf-8", errors="replace")
    except Exception as exc:
        log(f"FETCH err {url}: {exc}")
        page = None
    return page
|
||||
|
||||
def fetch_pdf(url, dst):
    """Download ``url`` and store it at the local path ``dst``.

    The response body is read completely first, written to a temporary
    sibling file (``dst`` + ".part") and only then moved into place. ``dst``
    therefore never holds an empty or half-written download, which matters
    because main() treats an existing file as "already downloaded" and never
    fetches it again.

    Args:
        url: Absolute URL of the PDF.
        dst: Destination file path.

    Returns:
        True when the file was stored, False on any failure (the error is
        written to the log and no file is left at ``dst`` by this call).
    """
    tmp = f"{dst}.part"
    try:
        req = urllib.request.Request(url, headers={"User-Agent": UA})
        with urllib.request.urlopen(req, timeout=30) as r:
            data = r.read()
        if not data:
            # An empty body can never be a valid PDF; do not cache it.
            raise ValueError("empty response body")
        with open(tmp, "wb") as f:
            f.write(data)
        # Atomic rename: readers see either no file or the complete file.
        os.replace(tmp, dst)
        return True
    except Exception as e:
        log(f"PDF fetch err {url}: {e}")
        try:
            os.remove(tmp)
        except OSError:
            # Nothing to clean up (the temp file was never created).
            pass
        return False
|
||||
|
||||
def pdf_text(path):
    """Extract the text of a PDF by running ``pdftotext -layout``.

    Args:
        path: Path of the local PDF file.

    Returns:
        The tool's standard output decoded as UTF-8 (undecodable bytes are
        replaced). An empty string is returned when the tool cannot be run
        or does not finish within 20 seconds; those failures are logged
        instead of being silently dropped.
    """
    try:
        out = subprocess.run(["pdftotext", "-layout", path, "-"],
                             capture_output=True, timeout=20, check=False)
    except (OSError, ValueError, subprocess.SubprocessError) as e:
        # OSError: binary missing / not executable; SubprocessError: timeout;
        # ValueError: path that cannot be passed to exec (e.g. NUL byte).
        log(f"pdftotext err {path}: {e}")
        return ""
    if out.returncode != 0:
        # Still return whatever was produced, but make the failure visible.
        err = out.stderr.decode("utf-8", errors="replace").strip()
        log(f"pdftotext exit {out.returncode} for {path}: {err}")
    return out.stdout.decode("utf-8", errors="replace")
|
||||
|
||||
def db():
    """Open a database connection from the ``DB`` settings.

    Returns:
        A new psycopg2 connection with autocommit switched on, so each
        statement takes effect without an explicit commit.
    """
    connection = psycopg2.connect(**DB)
    connection.autocommit = True
    return connection
|
||||
|
||||
def discover_pdfs(godina):
    """Discover all PDF result links for a given year from the competition pages.

    Three listing pages (Prvenstvo RH, Kup Hrvatske, Međunarodno) are fetched
    for ``godina``. Every link to a PDF under ``/cdn/content/`` is collected
    together with the nearest preceding headings, which serve as age
    category and discipline.

    Text taken from the HTML (link target, link label, headings) is
    entity-decoded, so values such as ``&amp;`` or ``&nbsp;`` are not stored
    in their escaped form.

    Args:
        godina: Year put into the ``?godina=`` query parameter.

    Returns:
        A list of dicts with the keys ``natj`` (competition name), ``url``,
        ``label`` (link text), ``kat`` (text of the last ``<h3>`` within the
        2000 characters before the link, or "?"), ``disc`` (text before the
        colon of the last ``<h4>`` in the same window, or the label) and
        ``godina``. Pages that fail to download are skipped.
    """
    # Compiled once instead of on every page / link.
    link_re = re.compile(
        r'href="(https://hrvatski-bocarski-savez\.hr/cdn/content/[^"]+\.pdf)"[^>]*>(?:<i[^>]*></i>\s*)?([^<]+)</a>'
    )
    h3_re = re.compile(r'<h3[^>]*>([^<]+)</h3>')
    h4_re = re.compile(r'<h4[^>]*>([^<:]+)[:]')

    pages = [
        ("Prvenstvo RH", f"/ostala-natjecanja/prvenstvo-rh/?godina={godina}"),
        ("Kup Hrvatske", f"/ostala-natjecanja/kup-hrvatske/?godina={godina}"),
        ("Međunarodno", f"/ostala-natjecanja/medunarodno-natjecanje/?godina={godina}"),
    ]

    pdfs = []
    for natj, slug in pages:
        h = fetch_html(BASE + slug)
        if not h:
            continue
        # PDF links go through /cdn/content/...pdf
        for m in link_re.finditer(h):
            url = ht.unescape(m.group(1))
            label = ht.unescape(m.group(2)).strip()
            # Look back a bounded distance for the headings that introduce
            # this link: last <h3> = age category (Seniori/Juniori/Kadeti...),
            # last <h4> = discipline.
            pos = m.start()
            ctx_window = h[max(0, pos - 2000):pos]
            kat_m = h3_re.findall(ctx_window)
            disc_m = h4_re.findall(ctx_window)
            kat = ht.unescape(kat_m[-1]).strip() if kat_m else "?"
            disc = ht.unescape(disc_m[-1]).strip() if disc_m else label
            pdfs.append({"natj": natj, "url": url, "label": label,
                         "kat": kat, "disc": disc, "godina": godina})
        # Be polite to the server between listing pages.
        time.sleep(DELAY)
    return pdfs
|
||||
|
||||
def parse_pdf_for_top3(text, pdf_meta):
    """Extract final placements from the text of a results PDF.

    Only the part of the text starting at the last occurrence of "poredak"
    or "konačni" (case-insensitive) is scanned; when neither word occurs,
    the final 30% of the text is used instead. Within that part a line
    counts as a placement when it looks like::

        <rank>[.] <Player name>  <Club>

    with at least two whitespace characters between player and club.

    Args:
        text: Plain text of the PDF.
        pdf_meta: Metadata of the PDF; accepted but not used here.

    Returns:
        A list of ``{"rank", "ime_full", "klub"}`` dicts in order of
        appearance. Despite the function name, ranks 1 through 8 are
        returned; for every rank only the first matching line is kept.
        Names shorter than 5 and clubs shorter than 3 characters are ignored.
    """
    ranked_line = re.compile(
        r"^(\d{1,2})\.?\s+([A-ZČĆŠŽĐ][\wčćšžđa-zA-Z\s\.\-]{4,40}?)\s{2,}([A-ZČĆŠŽĐ\.][\w\.\sčćšžđA-Za-z\-]+?)\s*$"
    )

    lowered = text.lower()
    start = max(lowered.rfind("poredak"), lowered.rfind("kona\u010dni"))
    if start < 0:
        # No heading found: fall back to the last 30% of the document.
        start = int(len(text) * 0.7)

    placements = []
    taken_ranks = set()
    for raw_line in text[start:].split("\n"):
        hit = ranked_line.match(raw_line.strip())
        if hit is None:
            continue
        rank = int(hit.group(1))
        player = hit.group(2).strip()
        club = hit.group(3).strip()
        if not 1 <= rank <= 8 or rank in taken_ranks:
            continue
        if len(player) < 5 or len(club) < 3:
            continue
        taken_ranks.add(rank)
        placements.append({"rank": rank, "ime_full": player, "klub": club})
    return placements
|
||||
|
||||
def find_clan(cr, ime_full):
    """Try to match a player's full name to ``pgz_sport.clanovi.id``.

    The name is split on whitespace and up to three case-insensitive
    lookups are run, stopping at the first one that returns a row:

    1. first word as ``ime``, all remaining words as ``prezime``,
       restricted to the listed spellings of the sport;
    2. first word as ``ime``, last word as ``prezime``;
    3. ``ime || ' ' || prezime`` equal to the whole name, restricted to
       sports containing "boć".

    Args:
        cr: Database cursor offering ``execute`` and ``fetchone``.
        ime_full: Full name as read from the PDF.

    Returns:
        The first column of the matching row (the member id), or None when
        the name has fewer than two words or nothing matches.
    """
    words = ime_full.split()
    if len(words) < 2:
        return None

    first_word, last_word = words[0], words[-1]
    # NOTE(review): the second lookup has no sport filter, so it can match a
    # member of any sport - confirm whether that is intended.
    attempts = (
        (
            """SELECT id FROM pgz_sport.clanovi
               WHERE LOWER(ime) = LOWER(%s) AND LOWER(prezime) = LOWER(%s)
               AND LOWER(sport) IN ('boćanje','bocanje','bo\u0107anje') LIMIT 1""",
            (first_word, " ".join(words[1:])),
        ),
        (
            """SELECT id FROM pgz_sport.clanovi
               WHERE LOWER(ime) = LOWER(%s) AND LOWER(prezime) = LOWER(%s) LIMIT 1""",
            (first_word, last_word),
        ),
        (
            """SELECT id FROM pgz_sport.clanovi
               WHERE LOWER(ime || ' ' || prezime) = LOWER(%s) AND LOWER(sport) LIKE '%%boć%%' LIMIT 1""",
            (ime_full,),
        ),
    )
    for sql, params in attempts:
        cr.execute(sql, params)
        row = cr.fetchone()
        if row:
            return row[0]
    return None
|
||||
|
||||
def find_klub(cr, klub_name):
    """Look up a club id by a case-insensitive substring match on its name.

    Among all rows of ``pgz_sport.klubovi`` whose ``naziv`` contains
    ``klub_name``, the one with the shortest ``naziv`` is chosen.

    Args:
        cr: Database cursor offering ``execute`` and ``fetchone``.
        klub_name: Club name as read from the PDF.

    Returns:
        The first column of the chosen row (the club id), or None when
        ``klub_name`` is empty/blank or no club matches.
    """
    if not klub_name:
        return None
    needle = klub_name.strip().lower()
    if not needle:
        return None
    # The name comes from scraped PDF text: escape the LIKE metacharacters
    # (backslash is PostgreSQL's default escape character) so that "_" or
    # "%" in a name is matched literally instead of acting as a wildcard.
    needle = needle.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")
    cr.execute("""SELECT id FROM pgz_sport.klubovi
                  WHERE LOWER(naziv) ILIKE %s ORDER BY LENGTH(naziv) ASC LIMIT 1""",
               (f"%{needle}%",))
    row = cr.fetchone()
    return row[0] if row else None
|
||||
|
||||
# Medal stored for a placement; placements other than 1-3 get no medal.
PLAS_TO_MED = {1: "ZLATO", 2: "SREBRO", 3: "BRONCA"}


def main(godine=(2025, 2024, 2023)):
    """Run the scraper: discover PDFs, parse them and store the placements.

    For every year the result PDFs are discovered, downloaded into ``TMP``
    (files already present are reused), converted to text and parsed. Each
    placement found is matched to a member and a club where possible and
    inserted into ``pgz_sport.clan_nagrada``; rows that conflict with an
    existing one are skipped by the database. Finally a sample of stored,
    member-matched rows is printed.

    Args:
        godine: Years to process. Defaults to the three years the script
            was written for.
    """
    # Local import: only needed here, for building cache file names.
    import hashlib

    conn = db()
    try:
        cr = conn.cursor()
        log("=== HBS PDF results scraper START ===")

        all_pdfs = []
        for godina in godine:
            log(f"Discovering year {godina}…")
            pdfs = discover_pdfs(godina)
            log(f" {godina}: {len(pdfs)} PDFs")
            all_pdfs.extend(pdfs)

        log(f"Total PDFs to process: {len(all_pdfs)}")

        inserted = 0
        matched_clan = 0
        failed = 0
        processed = 0

        for processed, pdf in enumerate(all_pdfs, start=1):
            # Reported up front so that skipped PDFs cannot suppress it.
            if processed % 30 == 0:
                log(f" Progress {processed}/{len(all_pdfs)}, inserted {inserted}, matched {matched_clan}")

            url = pdf["url"]
            # The basename alone is not unique across URLs; prefix a hash of
            # the full URL so two PDFs never share one cached file.
            digest = hashlib.sha256(url.encode("utf-8")).hexdigest()[:12]
            fname = re.sub(r'[^\w]', '_', url.split("/")[-1])[:80]
            local = f"{TMP}/{digest}_{fname}"
            if not os.path.exists(local):
                if not fetch_pdf(url, local):
                    continue
                time.sleep(DELAY)

            text = pdf_text(local)
            if not text or len(text) < 200:
                # Too little text to contain a results table.
                continue

            top3 = parse_pdf_for_top3(text, pdf)

            # Competition level code derived from the competition name.
            if "Prvenstvo" in pdf["natj"]:
                razina = "DP"
            elif "Kup" in pdf["natj"]:
                razina = "DK"
            else:
                razina = "OS"
            natjecanje = f"{pdf['natj']} {pdf['godina']}"

            for r in top3:
                clan_id = find_clan(cr, r["ime_full"])
                if clan_id:
                    matched_clan += 1
                klub_id = find_klub(cr, r["klub"])
                try:
                    cr.execute("""INSERT INTO pgz_sport.clan_nagrada
                        (clan_id, ime_prezime, klub_id, klub_naziv, godina, natjecanje,
                         razina_natjecanja, dobna_kategorija, disciplina, plasman, medalja,
                         napomena, source, source_url)
                        VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
                        ON CONFLICT DO NOTHING""",
                        (clan_id, r["ime_full"], klub_id, r["klub"], pdf["godina"],
                         natjecanje, razina, pdf["kat"][:30], pdf["disc"][:60],
                         r["rank"], PLAS_TO_MED.get(r["rank"]),
                         pdf["label"], "hbs_pdf_results", url))
                    if cr.rowcount:
                        inserted += 1
                except psycopg2.Error as e:
                    # Keep going with the next row, but leave a trace instead
                    # of dropping the error silently.
                    failed += 1
                    log(f"INSERT err {r['ime_full']} ({url}): {e}")

        log(f"=== DONE: {inserted} new nagrade inserted, {matched_clan} matched to clanovi, processed {processed} PDFs ===")
        if failed:
            log(f"{failed} inserts failed (see INSERT err lines above)")

        # Show a sample of the rows stored from this source that are linked
        # to a member (not only the ones inserted by this run).
        cr.execute("""SELECT ime_prezime, klub_naziv, godina, plasman, dobna_kategorija, disciplina, natjecanje, clan_id
            FROM pgz_sport.clan_nagrada
            WHERE source = 'hbs_pdf_results' AND clan_id IS NOT NULL
            ORDER BY plasman, godina DESC LIMIT 25""")
        print("\n=== SAMPLE matched ===")
        for r in cr.fetchall():
            # Guard the text columns: formatting or slicing None would raise.
            ime = r[0] or "-"
            klub = r[1][:30] if r[1] else "-"
            disc = (r[5] or "-")[:30]
            print(f" {ime:25} ({klub}) {r[2]} - {r[3]}. ({r[4]}, {disc}) {r[6]} clan={r[7]}")
    finally:
        # Release the connection even when a step above raised.
        conn.close()


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user