Files
pgz-sport/scrapers/hbs_pdf_results.py.pre_b_switch.1777897180
T

230 lines
8.9 KiB
Python
Executable File

#!/usr/bin/env python3
"""
HBS PDF results scraper - Prvenstvo RH + Kup Hrvatske + Međunarodno natjecanje.
Parsira PDF rezultate, ekstrahira plasmane (1-3 = medalje),
matcha s PGŽ igračima u DB-u, ubacuje u clan_nagrada.
"""
import os, re, sys, time, json, html as ht
import urllib.request, urllib.parse
import subprocess
import psycopg2
import datetime as dt
# PostgreSQL connection parameters, passed verbatim to psycopg2.connect().
# NOTE(review): a real password is committed in source. It can be overridden
# with the HBS_DB_PASSWORD environment variable; the literal fallback is kept
# only so existing deployments keep working and should be rotated/removed.
DB = dict(
    host="localhost",
    port=5432,
    dbname="rinet_v3",
    user="rinet",
    password=os.environ.get("HBS_DB_PASSWORD", "R1net2026!SecureDB#v7"),
)
# Site root that the relative competition-page paths are appended to.
BASE = "https://hrvatski-bocarski-savez.hr"
# User-Agent header sent with every HTTP request.
UA = "Mozilla/5.0 (PGZSportBot/1.0)"
# Local directory in which downloaded PDFs are cached; created at import time.
TMP = "/tmp/hbs_pdf"
os.makedirs(TMP, exist_ok=True)
# Pause (seconds) passed to time.sleep() between requests.
DELAY = 1.0
# File that log() appends every message to.
LOG_FP = "/opt/pgz-sport/_logs/hbs_pdf_results.log"
def log(msg):
    """Print a timestamped message and append it to the log file.

    The message is always written to stdout. Writing to ``LOG_FP`` is
    best-effort: if the file cannot be opened or written, the failure is
    ignored so that logging never aborts the scrape.

    Args:
        msg: Text to log; it is prefixed with the current local ISO timestamp.
    """
    line = f"[{dt.datetime.now().isoformat()}] {msg}"
    print(line, flush=True)
    try:
        # Explicit UTF-8: messages contain Croatian characters, which would
        # fail to encode (and be silently lost) under a C/POSIX locale.
        with open(LOG_FP, "a", encoding="utf-8") as f:
            f.write(line + "\n")
    except (OSError, UnicodeError):
        # Only I/O and encoding problems are suppressed; KeyboardInterrupt
        # and programming errors are no longer swallowed.
        pass
def fetch_html(url):
    """Download ``url`` and return its body as text.

    The request carries the module's User-Agent and times out after 20
    seconds. The body is decoded as UTF-8, with undecodable bytes replaced.

    Returns:
        The page text, or ``None`` if anything went wrong (the error is logged).
    """
    try:
        request = urllib.request.Request(url, headers={"User-Agent": UA})
        with urllib.request.urlopen(request, timeout=20) as response:
            raw = response.read()
            return raw.decode("utf-8", errors="replace")
    except Exception as exc:
        log(f"FETCH err {url}: {exc}")
    return None
def fetch_pdf(url, dst):
    """Download ``url`` and store it at the local path ``dst``.

    The body is first written to ``<dst>.part`` and then renamed onto ``dst``,
    so ``dst`` only ever exists when the download completed. The caller treats
    an existing ``dst`` as a valid cached copy, so a failed download must not
    leave an empty or truncated file behind.

    Args:
        url: Absolute URL of the PDF.
        dst: Destination file path.

    Returns:
        ``True`` on success, ``False`` on any failure (the error is logged).
    """
    part = f"{dst}.part"
    try:
        req = urllib.request.Request(url, headers={"User-Agent": UA})
        with urllib.request.urlopen(req, timeout=30) as r:
            payload = r.read()
        with open(part, "wb") as f:
            f.write(payload)
        # Atomic rename: dst appears only once the whole payload is on disk.
        os.replace(part, dst)
        return True
    except Exception as e:
        log(f"PDF fetch err {url}: {e}")
        # Clean up a possibly half-written temporary file.
        try:
            os.remove(part)
        except OSError:
            pass
        return False
def pdf_text(path):
    """Extract the text of a PDF by running the external ``pdftotext`` tool.

    ``pdftotext -layout <path> -`` is executed with a 20 second timeout and
    its stdout is decoded as UTF-8 (undecodable bytes are replaced).

    Args:
        path: Path of the PDF file on disk.

    Returns:
        The extracted text. An empty string is returned when the tool cannot
        be run or times out; the failure is logged instead of being dropped.
    """
    try:
        out = subprocess.run(["pdftotext", "-layout", path, "-"],
                             capture_output=True, timeout=20, check=False)
    except Exception as e:
        log(f"pdftotext err {path}: {e}")
        return ""
    if out.returncode != 0:
        # Still return whatever was produced, but make the failure visible.
        detail = out.stderr.decode("utf-8", errors="replace").strip()
        log(f"pdftotext exit {out.returncode} for {path}: {detail}")
    return out.stdout.decode("utf-8", errors="replace")
def db():
    """Open a psycopg2 connection using ``DB`` and switch it to autocommit."""
    connection = psycopg2.connect(**DB)
    connection.autocommit = True
    return connection
def discover_pdfs(godina):
    """Collect the result-PDF links published for one year.

    Three competition listing pages are fetched for ``godina``. Every link to
    a ``/cdn/content/...pdf`` file becomes one record. The age category and
    discipline are taken from the last ``<h3>`` / ``<h4>`` heading found in
    the 2000 characters preceding the link.

    Args:
        godina: Year passed to the listing pages as the ``godina`` parameter.

    Returns:
        A list of dicts with keys ``natj``, ``url``, ``label``, ``kat``,
        ``disc`` and ``godina``. ``kat`` falls back to ``"?"`` and ``disc``
        to the link label when no heading is found.
    """
    link_re = re.compile(r'href="(https://hrvatski-bocarski-savez\.hr/cdn/content/[^"]+\.pdf)"[^>]*>(?:<i[^>]*></i>\s*)?([^<]+)</a>')
    category_re = re.compile(r'<h3[^>]*>([^<]+)</h3>')
    discipline_re = re.compile(r'<h4[^>]*>([^<:]+)[:]')

    listing_pages = (
        ("Prvenstvo RH", f"/ostala-natjecanja/prvenstvo-rh/?godina={godina}"),
        ("Kup Hrvatske", f"/ostala-natjecanja/kup-hrvatske/?godina={godina}"),
        ("Međunarodno", f"/ostala-natjecanja/medunarodno-natjecanje/?godina={godina}"),
    )

    found = []
    for natj, path in listing_pages:
        page = fetch_html(BASE + path)
        if page:
            for link in link_re.finditer(page):
                label = link.group(2).strip()
                # Headings are searched only in the text just before the link.
                start = link.start()
                preceding = page[max(0, start - 2000):start]
                categories = category_re.findall(preceding)
                disciplines = discipline_re.findall(preceding)
                found.append({
                    "natj": natj,
                    "url": link.group(1),
                    "label": label,
                    "kat": categories[-1].strip() if categories else "?",
                    "disc": disciplines[-1].strip() if disciplines else label,
                    "godina": godina,
                })
            # Pause between listing pages that were fetched successfully.
            time.sleep(DELAY)
    return found
# One ranking row: "<rank>[.] <player name>  <club>", where name and club are
# separated by at least two whitespace characters.
_RANK_ROW_RE = re.compile(r"^(\d{1,2})\.?\s+([A-ZČĆŠŽĐ][\wčćšžđa-zA-Z\s\.\-]{4,40}?)\s{2,}([A-ZČĆŠŽĐ\.][\w\.\sčćšžđA-Za-z\-]+?)\s*$")


def parse_pdf_for_top3(text, pdf_meta):
    """Extract final placements from the text of a results PDF.

    Only the end of the document is scanned: from the last occurrence of
    "poredak" or "konačni" (case-insensitive), or from the 70% mark when
    neither word is present. Each line of the form
    ``"1. Marko Markovic    K.K. KASTAV"`` yields one placement.

    Despite the name, placements 1 through 8 are returned, and the first row
    seen for each rank wins.

    Args:
        text: Text extracted from the PDF.
        pdf_meta: Metadata of the PDF; accepted for the caller but not used.

    Returns:
        A list of ``{"rank", "ime_full", "klub"}`` dicts in order of first
        appearance.
    """
    lowered = text.lower()
    start = max(lowered.rfind("poredak"), lowered.rfind("kona\u010dni"))
    if start < 0:
        # No ranking heading found: fall back to the last 30% of the document.
        start = int(len(text) * 0.7)

    by_rank = {}
    for raw_line in text[start:].split("\n"):
        row = _RANK_ROW_RE.match(raw_line.strip())
        if row is None:
            continue
        rank = int(row.group(1))
        ime_full = row.group(2).strip()
        klub = row.group(3).strip()
        if rank < 1 or rank > 8:
            continue
        if len(ime_full) < 5 or len(klub) < 3:
            continue
        # setdefault keeps the first row seen for each rank.
        by_rank.setdefault(rank, {"rank": rank, "ime_full": ime_full, "klub": klub})
    return list(by_rank.values())
def find_clan(cr, ime_full):
    """Look up the ``pgz_sport.clanovi`` id for a player's full name.

    Three case-insensitive lookups are tried in order and the first hit wins:

    1. first word as ``ime``, the rest as ``prezime``, restricted to the
       boćanje sport spellings;
    2. first word as ``ime``, last word as ``prezime``;
    3. ``ime || ' ' || prezime`` equal to the full name, with a sport that
       contains "boć".

    Args:
        cr: Open database cursor.
        ime_full: Full name as parsed from the PDF.

    Returns:
        The matching id, or ``None`` when the name has fewer than two words or
        no lookup matches.
    """
    words = ime_full.split()
    if len(words) < 2:
        return None

    given = words[0]
    lookups = (
        (
            "SELECT id FROM pgz_sport.clanovi\n"
            "WHERE LOWER(ime) = LOWER(%s) AND LOWER(prezime) = LOWER(%s)\n"
            "AND LOWER(sport) IN ('boćanje','bocanje','bo\u0107anje') LIMIT 1",
            (given, " ".join(words[1:])),
        ),
        # NOTE(review): this fallback has no sport filter, so it can match a
        # same-named member of another sport - confirm that is intended.
        (
            "SELECT id FROM pgz_sport.clanovi\n"
            "WHERE LOWER(ime) = LOWER(%s) AND LOWER(prezime) = LOWER(%s) LIMIT 1",
            (given, words[-1]),
        ),
        (
            "SELECT id FROM pgz_sport.clanovi\n"
            "WHERE LOWER(ime || ' ' || prezime) = LOWER(%s) AND LOWER(sport) LIKE '%%boć%%' LIMIT 1",
            (ime_full,),
        ),
    )
    for sql, params in lookups:
        cr.execute(sql, params)
        row = cr.fetchone()
        if row:
            return row[0]
    return None
def find_klub(cr, klub_name):
    """Look up the ``pgz_sport.klubovi`` id whose name contains ``klub_name``.

    The match is a case-insensitive substring search. When several clubs
    match, the one with the shortest ``naziv`` is returned.

    Args:
        cr: Open database cursor.
        klub_name: Club name as parsed from the PDF.

    Returns:
        The matching id, or ``None`` when ``klub_name`` is empty or no club
        matches.
    """
    if not klub_name:
        return None
    # The name comes from scraped text, so escape the LIKE metacharacters.
    # Otherwise "_" or "%" in a club name would act as wildcards. Backslash
    # is PostgreSQL's default LIKE escape character and must be doubled first.
    needle = (
        klub_name.lower()
        .replace("\\", "\\\\")
        .replace("%", "\\%")
        .replace("_", "\\_")
    )
    cr.execute(
        "SELECT id FROM pgz_sport.klubovi\n"
        "WHERE LOWER(naziv) ILIKE %s ORDER BY LENGTH(naziv) ASC LIMIT 1",
        (f"%{needle}%",),
    )
    r = cr.fetchone()
    return r[0] if r else None
# Placement -> medal label written to the `medalja` column (ZLATO = gold,
# SREBRO = silver, BRONCA = bronze). Placements outside 1-3 have no entry,
# so a .get() lookup yields None for them.
PLAS_TO_MED = {1: "ZLATO", 2: "SREBRO", 3: "BRONCA"}
def main():
    """Run the full scrape: discover, download, parse and store placements.

    For the years 2025, 2024 and 2023 the result PDFs are discovered,
    downloaded into ``TMP`` (an existing local file is reused), converted to
    text and parsed. Every placement found is inserted into
    ``pgz_sport.clan_nagrada`` with ``ON CONFLICT DO NOTHING``. Finally up to
    25 stored rows that were matched to a member are printed as a sample.

    A failed insert is logged and skipped rather than silently ignored, and
    the database connection is closed even when the run aborts.
    """
    insert_sql = (
        "INSERT INTO pgz_sport.clan_nagrada\n"
        "(clan_id, ime_prezime, klub_id, klub_naziv, godina, natjecanje,\n"
        "razina_natjecanja, dobna_kategorija, disciplina, plasman, medalja,\n"
        "napomena, source, source_url)\n"
        "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)\n"
        "ON CONFLICT DO NOTHING"
    )
    sample_sql = (
        "SELECT ime_prezime, klub_naziv, godina, plasman, dobna_kategorija, disciplina, natjecanje, clan_id\n"
        "FROM pgz_sport.clan_nagrada\n"
        "WHERE source = 'hbs_pdf_results' AND clan_id IS NOT NULL\n"
        "ORDER BY plasman, godina DESC LIMIT 25"
    )

    conn = db()
    try:
        cr = conn.cursor()
        log("=== HBS PDF results scraper START ===")

        all_pdfs = []
        for godina in [2025, 2024, 2023]:
            log(f"Discovering year {godina}")
            pdfs = discover_pdfs(godina)
            log(f" {godina}: {len(pdfs)} PDFs")
            all_pdfs.extend(pdfs)
        log(f"Total PDFs to process: {len(all_pdfs)}")

        inserted = 0
        matched_clan = 0
        processed = 0
        failed = 0
        for pdf in all_pdfs:
            processed += 1
            url = pdf["url"]
            # Cache file name: the URL's last path segment, sanitised and
            # truncated to 80 characters.
            fname = re.sub(r'[^\w]', '_', url.split("/")[-1])[:80]
            local = f"{TMP}/{fname}"
            if not os.path.exists(local):
                if not fetch_pdf(url, local):
                    continue
                # Pause only after an actual download.
                time.sleep(DELAY)
            text = pdf_text(local)
            if not text or len(text) < 200:
                # Too little text to contain a ranking table.
                continue
            top3 = parse_pdf_for_top3(text, pdf)
            if processed % 30 == 0:
                log(f" Progress {processed}/{len(all_pdfs)}, inserted {inserted}, matched {matched_clan}")
            # Competition level code derived from the listing page name.
            razina = "DP" if "Prvenstvo" in pdf["natj"] else ("DK" if "Kup" in pdf["natj"] else "OS")
            natjecanje = f"{pdf['natj']} {pdf['godina']}"
            for r in top3:
                clan_id = find_clan(cr, r["ime_full"])
                if clan_id:
                    matched_clan += 1
                klub_id = find_klub(cr, r["klub"])
                try:
                    cr.execute(
                        insert_sql,
                        (clan_id, r["ime_full"], klub_id, r["klub"], pdf["godina"],
                         natjecanje, razina, pdf["kat"][:30], pdf["disc"][:60],
                         r["rank"], PLAS_TO_MED.get(r["rank"]),
                         pdf["label"], "hbs_pdf_results", url),
                    )
                    if cr.rowcount:
                        inserted += 1
                except Exception as e:
                    # Keep going, but make the failure visible.
                    failed += 1
                    log(f"INSERT err {url} rank={r['rank']} {r['ime_full']!r}: {e}")

        log(f"=== DONE: {inserted} new nagrade inserted, {matched_clan} matched to clanovi, processed {processed} PDFs ===")
        if failed:
            log(f"{failed} inserts failed (see INSERT err lines above)")

        # Show a sample of stored rows that were matched to a member.
        cr.execute(sample_sql)
        print("\n=== SAMPLE matched ===")
        for r in cr.fetchall():
            print(f" {r[0]:25} ({r[1][:30] if r[1] else '-'}) {r[2]} - {r[3]}. ({r[4]}, {r[5][:30]}) {r[6]} clan={r[7]}")
    finally:
        conn.close()
# Run the scraper only when executed as a script, not when imported.
if __name__ == "__main__":
    main()