230 lines
8.9 KiB
Python
Executable File
230 lines
8.9 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""
|
|
HBS PDF results scraper - Croatian Championship (Prvenstvo RH), Croatian Cup (Kup Hrvatske) and international competitions.

Parses the PDF results, extracts placements (ranks 1-3 are medals),
matches them against PGŽ players in the DB and inserts them into clan_nagrada.
|
|
"""
|
|
import os, re, sys, time, json, html as ht
|
|
import urllib.request, urllib.parse
|
|
import subprocess
|
|
import psycopg2
|
|
import datetime as dt
|
|
|
|
# PostgreSQL connection settings passed to psycopg2.connect() by db().
# NOTE(review): the password used to be available only as a literal in this
# file. It can now be supplied through the PGZ_DB_PASSWORD environment
# variable; the literal remains solely as a backward-compatible fallback and
# should be removed (and the credential rotated) once deployments set the
# variable.
DB = dict(
    host="10.10.0.2",
    port=6432,
    dbname="rinet_v3",
    user="rinet",
    password=os.environ.get("PGZ_DB_PASSWORD", "R1net2026!SecureDB#v7"),
)

# Site root; discover_pdfs() appends the competition page paths to it.
BASE = "https://hrvatski-bocarski-savez.hr"

# User-Agent header sent with every HTTP request.
UA = "Mozilla/5.0 (PGZSportBot/1.0)"

# Local directory where downloaded PDFs are stored; created at import time.
TMP = "/tmp/hbs_pdf"
os.makedirs(TMP, exist_ok=True)

# Pause, in seconds, between consecutive requests to the site.
DELAY = 1.0

# File that log() appends to.
LOG_FP = "/opt/pgz-sport/_logs/hbs_pdf_results.log"
|
|
|
|
def log(msg):
    """Print a timestamped message and append it to the log file.

    Args:
        msg: Text to record. It is prefixed with the current local time in
            ISO format.

    The line is always printed to stdout (flushed immediately). Appending to
    LOG_FP is best-effort: a failure to open or write the file is ignored so
    that logging can never abort the scraper.
    """
    line = f"[{dt.datetime.now().isoformat()}] {msg}"
    print(line, flush=True)
    try:
        # Explicit UTF-8 so Croatian diacritics are written the same way
        # regardless of the process locale.
        with open(LOG_FP, "a", encoding="utf-8") as f:
            f.write(line + "\n")
    except (OSError, ValueError):
        # Missing directory, permissions, full disk, unencodable text...
        # Deliberately ignored, but no longer swallowing KeyboardInterrupt
        # or SystemExit the way a bare ``except:`` did.
        pass
|
|
|
|
def fetch_html(url):
    """Download *url* and return the response body as text.

    The request carries the module's User-Agent and times out after 20
    seconds. The body is decoded as UTF-8, with undecodable bytes replaced.

    Returns:
        The page text, or None if anything went wrong (the error is logged).
    """
    try:
        request = urllib.request.Request(url, headers={"User-Agent": UA})
        with urllib.request.urlopen(request, timeout=20) as response:
            raw = response.read()
        page = raw.decode("utf-8", errors="replace")
    except Exception as exc:
        log(f"FETCH err {url}: {exc}")
        return None
    return page
|
|
|
|
def fetch_pdf(url, dst):
    """Download *url* and store it at *dst*.

    Args:
        url: Address of the PDF.
        dst: Destination file path.

    Returns:
        True when the file was fully downloaded and written, False otherwise
        (the error is logged).

    The payload is written to ``<dst>.part`` and moved into place only after
    the write succeeded. Previously *dst* was opened before the response was
    read, so a failed download left an empty or truncated file behind, which
    callers that check ``os.path.exists(dst)`` would then treat as a valid
    cached copy.
    """
    tmp_path = f"{dst}.part"
    try:
        req = urllib.request.Request(url, headers={"User-Agent": UA})
        with urllib.request.urlopen(req, timeout=30) as resp:
            payload = resp.read()
        with open(tmp_path, "wb") as fh:
            fh.write(payload)
        # Atomic on the same filesystem: dst is either absent or complete.
        os.replace(tmp_path, dst)
        return True
    except Exception as e:
        log(f"PDF fetch err {url}: {e}")
        try:
            os.remove(tmp_path)
        except OSError:
            # Nothing to clean up (the temp file was never created).
            pass
        return False
|
|
|
|
def pdf_text(path):
    """Extract the text of a PDF by running ``pdftotext -layout``.

    Args:
        path: Path of the PDF file on disk.

    Returns:
        The tool's standard output decoded as UTF-8 (undecodable bytes
        replaced), or an empty string if the tool could not be run or did not
        finish within 20 seconds.

    Failures are still non-fatal, but they are now logged: before, a missing
    ``pdftotext`` binary made every PDF look empty without any hint why.
    """
    try:
        out = subprocess.run(["pdftotext", "-layout", path, "-"],
                             capture_output=True, timeout=20, check=False)
    except Exception as e:
        log(f"pdftotext err {path}: {e}")
        return ""
    if out.returncode != 0:
        # Whatever was written to stdout is still returned, as before.
        log(f"pdftotext exit {out.returncode} for {path}")
    return out.stdout.decode("utf-8", errors="replace")
|
|
|
|
def db():
    """Open a database connection from the module-level DB settings.

    Returns:
        A psycopg2 connection with autocommit switched on.
    """
    connection = psycopg2.connect(**DB)
    connection.autocommit = True
    return connection
|
|
|
|
def discover_pdfs(godina):
    """Collect the PDF result links published for one year.

    Three competition pages are fetched (Prvenstvo RH, Kup Hrvatske,
    Međunarodno). Every anchor pointing at a ``/cdn/content/...pdf`` file
    yields one entry.

    Args:
        godina: Year, inserted into the ``?godina=`` query parameter.

    Returns:
        A list of dicts with the keys ``natj`` (competition name), ``url``,
        ``label`` (anchor text), ``kat`` (text of the last ``<h3>`` within
        the 2000 characters before the link - the age category - or "?"),
        ``disc`` (text before the colon of the last ``<h4>`` in that same
        window, presumably the discipline; falls back to the label) and
        ``godina``.
    """
    link_re = re.compile(r'href="(https://hrvatski-bocarski-savez\.hr/cdn/content/[^"]+\.pdf)"[^>]*>(?:<i[^>]*></i>\s*)?([^<]+)</a>')
    h3_re = re.compile(r'<h3[^>]*>([^<]+)</h3>')
    h4_re = re.compile(r'<h4[^>]*>([^<:]+)[:]')

    sources = (
        ("Prvenstvo RH", f"/ostala-natjecanja/prvenstvo-rh/?godina={godina}"),
        ("Kup Hrvatske", f"/ostala-natjecanja/kup-hrvatske/?godina={godina}"),
        ("Međunarodno", f"/ostala-natjecanja/medunarodno-natjecanje/?godina={godina}"),
    )

    found = []
    for natj, path in sources:
        page = fetch_html(BASE + path)
        if not page:
            continue
        for link in link_re.finditer(page):
            href = link.group(1)
            label = link.group(2).strip()
            # Headings are looked up only in the 2000 characters that
            # precede the link; the closest (last) one wins.
            begin = link.start()
            preceding = page[max(0, begin - 2000):begin]
            h3_hits = h3_re.findall(preceding)
            h4_hits = h4_re.findall(preceding)
            found.append({
                "natj": natj,
                "url": href,
                "label": label,
                "kat": h3_hits[-1].strip() if h3_hits else "?",
                "disc": h4_hits[-1].strip() if h4_hits else label,
                "godina": godina,
            })
        # Be polite to the server between page fetches.
        time.sleep(DELAY)
    return found
|
|
|
|
def parse_pdf_for_top3(text, pdf_meta):
    """Extract final placements from the text of a results PDF.

    The standings are expected near the end of the document. Scanning starts
    at the last occurrence of "poredak" or "konačni" (case-insensitive); if
    neither word is present, only the last 30% of the text is scanned.

    A line counts as a placement when, after stripping, it looks like
    ``<rank>[.] <name><two or more spaces><club>``, e.g.
    ``1. Marko Markovic    K.K. KASTAV``.

    Args:
        text: Text of the PDF (as produced with a layout-preserving
            extraction, so columns are separated by runs of spaces).
        pdf_meta: Metadata of the PDF. Currently unused; kept so callers do
            not have to change.

    Returns:
        A list of ``{"rank", "ime_full", "klub"}`` dicts in document order,
        at most one per rank (first occurrence wins). Despite the function
        name, ranks 1 through 8 are returned.
    """
    max_rank = 8

    # Lower-case once and reuse it for both keyword searches.
    lowered = text.lower()
    start = max(lowered.rfind("poredak"), lowered.rfind("kona\u010dni"))
    if start < 0:
        # No standings heading found: fall back to the last 30% of the text.
        start = int(len(text) * 0.7)

    # Compiled once instead of once per line.
    rank_line = re.compile(r"^(\d{1,2})\.?\s+([A-ZČĆŠŽĐ][\wčćšžđa-zA-Z\s\.\-]{4,40}?)\s{2,}([A-ZČĆŠŽĐ\.][\w\.\sčćšžđA-Za-z\-]+?)\s*$")

    placements = []
    seen_ranks = set()
    for raw_line in text[start:].split("\n"):
        m = rank_line.match(raw_line.strip())
        if not m:
            continue
        rank = int(m.group(1))
        ime_full = m.group(2).strip()
        klub = m.group(3).strip()
        # The original filtered to 1..12 and then again to <= 8 while
        # de-duplicating; a single 1..8 check is equivalent.
        if not 1 <= rank <= max_rank:
            continue
        if len(ime_full) < 5 or len(klub) < 3:
            continue
        if rank in seen_ranks:
            continue
        seen_ranks.add(rank)
        placements.append({"rank": rank, "ime_full": ime_full, "klub": klub})
    return placements
|
|
|
|
def find_clan(cr, ime_full):
    """Look up the id of the member (pgz_sport.clanovi) with the given name.

    Three case-insensitive lookups are tried in order; the first hit wins:

    1. first word as ``ime``, all remaining words as ``prezime``;
    2. first word as ``ime``, last word as ``prezime``;
    3. ``ime || ' ' || prezime`` equal to the whole name.

    Every lookup is restricted to members whose ``sport`` is boćanje. The
    second lookup used to have no sport filter at all, so for a two-word name
    it matched any same-named member of any sport whenever lookup 1 failed.

    Args:
        cr: Open database cursor (``execute`` / ``fetchone``).
        ime_full: Full name as extracted from the PDF.

    Returns:
        The member id, or None when the name has fewer than two words or no
        lookup matched.
    """
    parts = ime_full.split()
    if len(parts) < 2:
        return None

    # 1) ime + everything else as prezime
    cr.execute(
        """SELECT id FROM pgz_sport.clanovi
           WHERE LOWER(ime) = LOWER(%s) AND LOWER(prezime) = LOWER(%s)
             AND LOWER(sport) IN ('boćanje','bocanje') LIMIT 1""",
        (parts[0], " ".join(parts[1:])),
    )
    row = cr.fetchone()
    if row:
        return row[0]

    # 2) first word as ime, last word as prezime (skips middle names).
    # The sport condition accepts both spellings/forms used by lookups 1 and 3.
    cr.execute(
        """SELECT id FROM pgz_sport.clanovi
           WHERE LOWER(ime) = LOWER(%s) AND LOWER(prezime) = LOWER(%s)
             AND (LOWER(sport) IN ('boćanje','bocanje')
                  OR LOWER(sport) LIKE '%%boć%%') LIMIT 1""",
        (parts[0], parts[-1]),
    )
    row = cr.fetchone()
    if row:
        return row[0]

    # 3) whole name against "ime prezime" (covers multi-word first names)
    cr.execute(
        """SELECT id FROM pgz_sport.clanovi
           WHERE LOWER(ime || ' ' || prezime) = LOWER(%s) AND LOWER(sport) LIKE '%%boć%%' LIMIT 1""",
        (ime_full,),
    )
    row = cr.fetchone()
    if row:
        return row[0]
    return None
|
|
|
|
def find_klub(cr, klub_name):
    """Look up the id of the club whose name contains *klub_name*.

    The match is a case-insensitive substring search on
    ``pgz_sport.klubovi.naziv``; when several clubs match, the one with the
    shortest name is returned.

    Args:
        cr: Open database cursor (``execute`` / ``fetchone``).
        klub_name: Club name as extracted from the PDF.

    Returns:
        The club id, or None for an empty name or when nothing matches.
    """
    if not klub_name:
        return None
    # The name comes from scraped text: escape LIKE metacharacters so that a
    # stray "%" or "_" is matched literally instead of acting as a wildcard.
    # Backslash is PostgreSQL's default LIKE escape character and must be
    # escaped first.
    needle = (
        klub_name.lower()
        .replace("\\", "\\\\")
        .replace("%", "\\%")
        .replace("_", "\\_")
    )
    cr.execute(
        """SELECT id FROM pgz_sport.klubovi
           WHERE LOWER(naziv) ILIKE %s ORDER BY LENGTH(naziv) ASC LIMIT 1""",
        (f"%{needle}%",),
    )
    row = cr.fetchone()
    return row[0] if row else None
|
|
|
|
# Medal label (gold / silver / bronze) for placements 1-3; main() looks ranks up with .get(), so any other rank yields None.
PLAS_TO_MED = {1: "ZLATO", 2: "SREBRO", 3: "BRONCA"}
|
|
|
|
def main():
    """Run the scraper: discover PDFs, parse them and store the placements.

    Steps:
      1. Discover result PDFs for 2025, 2024 and 2023.
      2. Download each PDF into TMP (skipped when a local copy exists) and
         extract its text.
      3. Parse the placements, try to match each one to a member and a club,
         and insert it into pgz_sport.clan_nagrada (conflicting rows are
         skipped by ``ON CONFLICT DO NOTHING``).
      4. Log totals and print a sample of rows that were matched to a member.

    A failing insert is logged and skipped instead of being silently ignored,
    and the database connection is closed even if the run aborts.
    """
    conn = db()
    try:
        cr = conn.cursor()
        log("=== HBS PDF results scraper START ===")

        all_pdfs = []
        for godina in [2025, 2024, 2023]:
            log(f"Discovering year {godina}…")
            pdfs = discover_pdfs(godina)
            log(f" {godina}: {len(pdfs)} PDFs")
            all_pdfs.extend(pdfs)

        log(f"Total PDFs to process: {len(all_pdfs)}")

        inserted = 0
        matched_clan = 0
        processed = 0

        for pdf in all_pdfs:
            processed += 1
            url = pdf["url"]
            # Local file name: last URL segment with non-word characters
            # replaced, capped at 80 characters.
            # NOTE(review): two URLs sharing the same (truncated) file name
            # would share one local copy - confirm this cannot happen.
            fname = re.sub(r'[^\w]', '_', url.split("/")[-1])[:80]
            local = f"{TMP}/{fname}"
            if not os.path.exists(local):
                if not fetch_pdf(url, local):
                    continue
                time.sleep(DELAY)

            text = pdf_text(local)
            if not text or len(text) < 200:
                # Nothing usable was extracted from this PDF.
                continue

            top3 = parse_pdf_for_top3(text, pdf)
            if processed % 30 == 0:
                log(f" Progress {processed}/{len(all_pdfs)}, inserted {inserted}, matched {matched_clan}")

            # Competition-level code derived from the competition name.
            if "Prvenstvo" in pdf["natj"]:
                razina = "DP"
            elif "Kup" in pdf["natj"]:
                razina = "DK"
            else:
                razina = "OS"
            natjecanje = f"{pdf['natj']} {pdf['godina']}"

            for r in top3:
                clan_id = find_clan(cr, r["ime_full"])
                if clan_id:
                    matched_clan += 1
                klub_id = find_klub(cr, r["klub"])
                try:
                    cr.execute("""INSERT INTO pgz_sport.clan_nagrada
                        (clan_id, ime_prezime, klub_id, klub_naziv, godina, natjecanje,
                         razina_natjecanja, dobna_kategorija, disciplina, plasman, medalja,
                         napomena, source, source_url)
                        VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
                        ON CONFLICT DO NOTHING""",
                        (clan_id, r["ime_full"], klub_id, r["klub"], pdf["godina"],
                         natjecanje, razina, pdf["kat"][:30], pdf["disc"][:60],
                         r["rank"], PLAS_TO_MED.get(r["rank"]),
                         pdf["label"], "hbs_pdf_results", url))
                    # rowcount is 0 when ON CONFLICT skipped the row.
                    if cr.rowcount > 0:
                        inserted += 1
                except Exception as e:
                    # Keep going with the next placement, but leave a trace.
                    log(f"INSERT err {r['ime_full']} ({url}): {e}")

        log(f"=== DONE: {inserted} new nagrade inserted, {matched_clan} matched to clanovi, processed {processed} PDFs ===")

        # Show a sample of rows from this source that are linked to a member.
        cr.execute("""SELECT ime_prezime, klub_naziv, godina, plasman, dobna_kategorija, disciplina, natjecanje, clan_id
            FROM pgz_sport.clan_nagrada
            WHERE source = 'hbs_pdf_results' AND clan_id IS NOT NULL
            ORDER BY plasman, godina DESC LIMIT 25""")
        print("\n=== SAMPLE matched ===")
        for ime, klub, godina, plasman, kat, disc, natj, clan_id in cr.fetchall():
            # disciplina may be NULL for rows written by other means.
            print(f" {ime:25} ({klub[:30] if klub else '-'}) {godina} - {plasman}. ({kat}, {(disc or '')[:30]}) {natj} clan={clan_id}")
    finally:
        conn.close()
|
|
|
|
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|