Crisis V6: panel expand + klub matching + ne-klub filter + samo_klubovi default
DB: - pgz_sport.potpore_nositelji.je_klub flag (false za RSS programs/savezi) - Re-match klub_id case-insensitive trim normalize Endpoint: - /api/dashboard/top-primatelji: samo_klubovi=True default Frontend: - sport2.html #panel/#dpanel: 70vw / 1100px max-width za HNS karijera - mobile responsive za panel
This commit is contained in:
@@ -0,0 +1,462 @@
|
||||
#!/usr/bin/env python3
|
||||
# hns_api_client.py — HNS Semafor structured client (v1.0)
|
||||
# Author: Damir Radulić <dradulic@outlook.com> / <damir@rinet.one>
|
||||
# Date: 2026-05-05
|
||||
# Description:
|
||||
# Reverse-engineered client for https://semafor.hns.family.
|
||||
# The site is server-rendered ASP.NET (NOT Next.js — no __NEXT_DATA__,
|
||||
# no hydration JSON). The only XHR endpoints exposed are filter helpers
|
||||
# /handlers/getOrganizations/, getCompetitions/, getAgeCategories/,
|
||||
# getCalendarEvents/. All player & club data is rendered into stable
|
||||
# semantic HTML (classes: playerHeader, playerData, playerCompetitionStatsTable,
|
||||
# matchlist, clubHeader, basic_info, playerslist…).
|
||||
#
|
||||
# This module therefore implements a fast HTML→JSON parser using requests
|
||||
# + BeautifulSoup, with a connection-pooled session, polite UA, and a
|
||||
# per-hour cache of the build token only (pages are re-fetched on every call). It exposes the SDK surface SUB4 was asked for:
|
||||
# fetch_player(hns_id, slug) -> dict
|
||||
# fetch_klub(hns_id, slug) -> dict
|
||||
# fetch_klub_roster(hns_id, slug) -> list
|
||||
# get_buildid() -> str (no buildId on this stack — returns site CSS hash)
|
||||
#
|
||||
# Fallback chain inside _get_html():
|
||||
# 1. requests + polite UA (primary)
|
||||
# 2. same pooled session (cookies reused) retried with back-off on 403/503 or request errors
|
||||
# 3. Playwright (lazy import) for JS-only edge cases
|
||||
#
|
||||
import json
|
||||
import re
|
||||
import sys
|
||||
import time
|
||||
from dataclasses import dataclass, field, asdict
|
||||
from functools import lru_cache
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
# Root URL of the HNS Semafor site; every page URL in this module is built from it.
BASE = "https://semafor.hns.family"

# Desktop-Chrome user-agent string sent with every request (and reused for the
# Playwright fallback context).
UA = (
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
)

# Default headers for the shared session.
# "Accept-Encoding" is intentionally NOT overridden: requests/urllib3 advertise
# exactly the encodings they are able to decode in the current environment.
# Hard-coding "br" would invite brotli-compressed responses even when no brotli
# decoder is installed, leaving the response body undecoded.
HEADERS = {
    "User-Agent": UA,
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "hr,en-US;q=0.7,en;q=0.3",
    "Cache-Control": "no-cache",
}

# One module-wide session so connections and cookies are reused across calls.
_session = requests.Session()
_session.headers.update(HEADERS)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
def _txt(node, default: str = "") -> str:
|
||||
if node is None:
|
||||
return default
|
||||
return re.sub(r"\s+", " ", node.get_text(" ", strip=True)).strip()
|
||||
|
||||
|
||||
def _int(s: str | None, default: int | None = None) -> int | None:
|
||||
if s is None:
|
||||
return default
|
||||
m = re.search(r"-?\d+", s.replace("\xa0", " "))
|
||||
return int(m.group()) if m else default
|
||||
|
||||
|
||||
def _href_id(a, pattern: str) -> int | None:
|
||||
if a is None or not a.get("href"):
|
||||
return None
|
||||
m = re.search(pattern, a["href"])
|
||||
return int(m.group(1)) if m else None
|
||||
|
||||
|
||||
def _get_html(url: str, *, timeout: int = 20, retries: int = 2) -> str:
    """Fetch *url* and return its HTML.

    The shared session is tried ``retries + 1`` times with a linearly growing
    pause between attempts.  If no attempt yields a non-empty HTTP 200 body,
    a headless Chromium page load via Playwright (imported lazily) is used as
    a last resort.

    Args:
        url: Absolute URL to fetch.
        timeout: Per-request timeout in seconds for the session attempts.
        retries: Number of additional session attempts after the first.

    Returns:
        The response body (or rendered page content) as text.

    Raises:
        RuntimeError: when the session attempts and the Playwright fallback
            all fail; the message carries the last session error if any.
    """
    last_exc: Exception | None = None
    for attempt in range(retries + 1):
        try:
            r = _session.get(url, timeout=timeout)
            if r.status_code == 200 and r.text:
                return r.text
            # 4xx/5xx raise here and are recorded by the handler below.
            r.raise_for_status()
            # Reached only for a non-error status without a usable body;
            # record it so the final error explains why the fallback ran.
            last_exc = RuntimeError(f"empty or unusable response (HTTP {r.status_code})")
        except Exception as e:  # pragma: no cover — network paths
            # Deliberately broad: any failure falls through to retry/fallback.
            last_exc = e
        # Back off only when another attempt follows; pausing after the final
        # attempt would merely delay the fallback.
        if attempt < retries:
            time.sleep(1.0 + attempt)

    # Playwright fallback (last resort)
    try:
        from playwright.sync_api import sync_playwright  # type: ignore

        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True)
            try:
                ctx = browser.new_context(user_agent=UA, locale="hr-HR")
                page = ctx.new_page()
                page.goto(url, wait_until="domcontentloaded", timeout=30000)
                return page.content()
            finally:
                # Close the browser even when navigation fails.
                browser.close()
    except Exception as e:
        raise RuntimeError(f"_get_html failed for {url}: {last_exc or e}") from e
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# buildId-equivalent — NOT a Next.js app. We surface a deterministic version
|
||||
# token taken from the cached CSS asset hash; refreshes hourly.
|
||||
# ---------------------------------------------------------------------------
|
||||
@lru_cache(maxsize=1)
def _build_token_cached(hour_bucket: int) -> str:
    """Return the ``?v=`` value of ``common.min.css`` found on the home page.

    The result is memoised per *hour_bucket*; with ``maxsize=1`` only the most
    recently requested bucket is kept.  When the page cannot be fetched or the
    stylesheet reference is absent, ``"unknown-<hour_bucket>"`` is returned
    (and memoised like any other result).
    """
    fallback = f"unknown-{hour_bucket}"
    try:
        page = _get_html(BASE + "/", timeout=15)
        hit = re.search(r"common\.min\.css\?v=([A-Za-z0-9_\-]+)", page)
    except Exception:
        return fallback
    return hit.group(1) if hit else fallback
|
||||
|
||||
|
||||
def get_buildid() -> str:
    """Return the site's current asset-version token.

    HNS Semafor is server-rendered ASP.NET, so no Next.js buildId exists; the
    CSS asset hash is exposed instead.  The lookup is keyed on the current
    hour (epoch seconds // 3600), so the value is refreshed hourly.
    """
    current_hour = int(time.time()) // 3600
    return _build_token_cached(current_hour)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Player parser
|
||||
# ---------------------------------------------------------------------------
|
||||
def _parse_player_header(soup: BeautifulSoup) -> dict[str, Any]:
    """Read the ``div.block.playerHeader`` block of a player page.

    Returns an empty dict when the block is missing.  Otherwise returns name,
    surname, shirt number, photo URL, current club (id + name), date of birth
    (``dd.mm.yyyy`` string), age and place of birth; fields that cannot be
    found are ``None``.
    """
    header = soup.select_one("div.block.playerHeader")
    if header is None:
        return {}

    def text_of(selector: str) -> str:
        # Normalised text of the first element matching *selector*, "" if none.
        return _txt(header.select_one(selector))

    photo = header.select_one(".photo img")
    club_link = header.select_one(".playerData li.club a")
    birth_text = text_of(".playerData li.dob h4")

    # The birth cell holds the date first, then the age in parentheses
    # followed by the word "godina".
    date_hit = re.match(r"(\d{2}\.\d{2}\.\d{4})", birth_text)
    age_hit = re.search(r"\((\d+)\s+godina", birth_text)

    return {
        "name": text_of(".playerName .name") or None,
        "surname": text_of(".playerName .surname") or None,
        "shirt_number": _int(text_of(".playerName h3")),
        "photo_url": photo["src"] if photo and photo.get("src") else None,
        "current_club": {
            "id": _href_id(club_link, r"/klubovi/(\d+)/"),
            "name": text_of(".playerData li.club h4") or None,
        },
        "dob": date_hit.group(1) if date_hit else None,
        "age": int(age_hit.group(1)) if age_hit else None,
        "place_of_birth": text_of(".playerData li.pob h4") or None,
    }
|
||||
|
||||
|
||||
def _parse_season_recap(soup: BeautifulSoup) -> dict[str, Any]:
    """Read the ``div.player_season_stats_recap`` summary block.

    Returns an empty dict when the block is missing.  Otherwise the result
    holds ``"season"`` (text of the block's ``h2``, or ``None``) plus one
    entry per ``ul > li`` item, keyed by the item's first CSS class and valued
    with the integer parsed from its ``h4``.
    """
    block = soup.select_one("div.player_season_stats_recap")
    if block is None:
        return {}
    out: dict[str, Any] = {"season": _txt(block.select_one("h2")) or None}
    for li in block.select("ul > li"):
        classes = li.get("class") or []
        if not classes:
            # An item without a class has no usable key.  It used to be stored
            # under None, which breaks the str-keyed contract and lets several
            # such items overwrite each other.
            continue
        out[classes[0]] = _int(_txt(li.select_one("h4")))
    return out
|
||||
|
||||
|
||||
def _season_stat_rows(stats_table) -> list[dict[str, Any]]:
    """Convert one ``div.stats_table`` into per-competition stat dicts."""
    parsed: list[dict[str, Any]] = []
    for row in stats_table.select("ul > li.row"):
        if "header" in (row.get("class") or []):
            continue
        link = row.select_one(".title a")
        entry: dict[str, Any] = {
            "competition": _txt(row.select_one(".title")) or None,
            "competition_url": link["href"] if link and link.get("href") else None,
            "competition_id": _href_id(link, r"/natjecanja/(\d+)/"),
        }
        # Each numeric column lives in an element whose class equals the key.
        for key in ("apps", "starter", "sub", "minutes", "goals", "yellows", "reds"):
            entry[key] = _int(_txt(row.select_one("." + key)))
        parsed.append(entry)
    return parsed


def _season_match_rows(matchlist) -> list[dict[str, Any]]:
    """Convert one ``div.matchlist`` into match dicts, skipping empty rows."""
    parsed: list[dict[str, Any]] = []
    for row in matchlist.select("li.row, div.row"):
        if "header" in (row.get("class") or []):
            continue
        played_on = _txt(row.select_one(".date"))
        home = _txt(row.select_one(".club1"))
        away = _txt(row.select_one(".club2"))
        score = _txt(row.select_one(".result"))
        round_label = _txt(row.select_one(".competitionround"))
        # A row carrying none of date/home/away/result is layout-only.
        if not any((played_on, home, away, score)):
            continue
        parsed.append(
            {
                "date": played_on or None,
                "home": home or None,
                "away": away or None,
                "result": score or None,
                "competition_round": round_label or None,
            }
        )
    return parsed


def _parse_player_seasons(soup: BeautifulSoup) -> list[dict[str, Any]]:
    """Parse the per-season statistics of a player page.

    Inside ``div.player_profile_matches`` the i-th ``h2.seasonTitle`` is
    paired with the i-th ``div.tabbedContent``.  Each returned item has
    ``season`` (heading text), ``competitions`` (one ``{"rows": [...]}`` per
    non-empty stats table) and ``matches`` (rows of every match list).
    Returns an empty list when the container is missing.
    """
    container = soup.select_one("div.player_profile_matches")
    if container is None:
        return []

    headings = container.select("h2.seasonTitle")
    tabs = container.select("div.tabbedContent")

    seasons: list[dict[str, Any]] = []
    for idx, heading in enumerate(headings):
        competitions: list[dict[str, Any]] = []
        matches: list[dict[str, Any]] = []
        # A heading without a matching tab still yields a (empty) season entry.
        if idx < len(tabs):
            tab = tabs[idx]
            for stats_table in tab.select("div.stats_table"):
                stat_rows = _season_stat_rows(stats_table)
                if stat_rows:
                    competitions.append({"rows": stat_rows})
            for matchlist in tab.select("div.matchlist"):
                matches.extend(_season_match_rows(matchlist))
        seasons.append(
            {"season": _txt(heading), "competitions": competitions, "matches": matches}
        )
    return seasons
|
||||
|
||||
|
||||
def fetch_player(hns_id: int, slug: str) -> dict[str, Any]:
    """Fetch a player profile from semafor.hns.family and return structured JSON.

    Args:
        hns_id: Numeric player id used in the ``/igraci/<id>/<slug>/`` URL.
        slug: URL slug of the player; surrounding slashes are ignored.

    Returns:
        A dict with request metadata (``source_url``, ``hns_igrac_id``,
        ``slug``, ``build_token``, ``fetched_at``, ``fetch_ms``) and the parsed
        ``profile``, ``season_recap`` and ``seasons`` sections.

    Raises:
        RuntimeError: propagated from ``_get_html`` when the page cannot be
            retrieved.
    """
    clean_slug = slug.strip("/")
    url = f"{BASE}/igraci/{hns_id}/{clean_slug}/"

    # Time only the page download: measuring later would also include HTML
    # parsing and the build-token lookup, which may perform its own request.
    # perf_counter is monotonic, so the interval cannot come out negative.
    t0 = time.perf_counter()
    html = _get_html(url)
    fetch_ms = int((time.perf_counter() - t0) * 1000)

    soup = BeautifulSoup(html, "html.parser")
    return {
        "source_url": url,
        "hns_igrac_id": hns_id,
        "slug": clean_slug,
        "build_token": get_buildid(),
        "fetched_at": int(time.time()),
        "fetch_ms": fetch_ms,
        "profile": _parse_player_header(soup),
        "season_recap": _parse_season_recap(soup),
        "seasons": _parse_player_seasons(soup),
    }
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Klub parser
|
||||
# ---------------------------------------------------------------------------
|
||||
def _parse_klub_header(soup: BeautifulSoup) -> dict[str, Any]:
    """Read the ``div.block.clubHeader`` block of a club page.

    Returns an empty dict when the block is missing.  Otherwise the result
    holds ``name`` (``.title h1``), ``full_name`` (``.title h2``), ``logo_url``
    and one entry per ``.info ul > li`` item, keyed by the item's first CSS
    class and valued with the text of its ``h3`` (``None`` when empty).
    """
    hdr = soup.select_one("div.block.clubHeader")
    if hdr is None:
        return {}
    img = hdr.select_one(".logo img")
    info: dict[str, Any] = {
        "name": _txt(hdr.select_one(".title h1")) or None,
        "full_name": _txt(hdr.select_one(".title h2")) or None,
        "logo_url": img["src"] if img and img.get("src") else None,
    }
    for li in hdr.select(".info ul > li"):
        classes = li.get("class") or []
        if not classes:
            # No class means no usable key.  Such items used to land under
            # None, breaking the str-keyed contract and overwriting each other.
            continue
        info[classes[0]] = _txt(li.select_one("h3")) or None
    return info
|
||||
|
||||
|
||||
def fetch_klub(hns_id: int, slug: str) -> dict[str, Any]:
    """Fetch a club profile (header + meta) from semafor.hns.family.

    Args:
        hns_id: Numeric club id used in the ``/klubovi/<id>/<slug>/`` URL.
        slug: URL slug of the club; surrounding slashes are ignored.

    Returns:
        A dict with request metadata (``source_url``, ``hns_klub_id``,
        ``slug``, ``build_token``, ``fetched_at``, ``fetch_ms``) and the parsed
        ``info``, ``competitions`` and ``next_match`` sections.

    Raises:
        RuntimeError: propagated from ``_get_html`` when the page cannot be
            retrieved.
    """
    clean_slug = slug.strip("/")
    url = f"{BASE}/klubovi/{hns_id}/{clean_slug}/"

    # Time only the page download: measuring later would also include HTML
    # parsing and the build-token lookup, which may perform its own request.
    # perf_counter is monotonic, so the interval cannot come out negative.
    t0 = time.perf_counter()
    html = _get_html(url)
    fetch_ms = int((time.perf_counter() - t0) * 1000)

    soup = BeautifulSoup(html, "html.parser")
    return {
        "source_url": url,
        "hns_klub_id": hns_id,
        "slug": clean_slug,
        "build_token": get_buildid(),
        "fetched_at": int(time.time()),
        "fetch_ms": fetch_ms,
        "info": _parse_klub_header(soup),
        "competitions": _parse_klub_competitions(soup),
        "next_match": _parse_next_match(soup),
    }
|
||||
|
||||
|
||||
def _parse_klub_competitions(soup: BeautifulSoup) -> list[dict[str, Any]]:
    """List the competitions shown in every ``div.competition_table``.

    Each non-header row yields ``competition`` (text of ``.title``, else the
    text of the row's first link, else ``None``) and ``competition_id`` (taken
    from a ``/natjecanja/<id>/`` link, ``None`` if absent).  Rows are returned
    in document order across all tables.
    """
    found: list[dict[str, Any]] = []
    for table in soup.select("div.competition_table"):
        for item in table.select("li.row, div.row"):
            if "header" in (item.get("class") or []):
                continue
            link = item.select_one("a")
            label = _txt(item.select_one(".title")) or _txt(link) or None
            found.append(
                {
                    "competition": label,
                    "competition_id": _href_id(link, r"/natjecanja/(\d+)/"),
                }
            )
    return found
|
||||
|
||||
|
||||
def _parse_next_match(soup: BeautifulSoup) -> dict[str, Any] | None:
    """Return the raw text of the scoreboard block, truncated to 400 chars.

    Looks for ``div.scoreboard_next_previous`` or ``div.current_results``;
    returns ``None`` when neither exists.  ``raw_text`` is ``None`` when the
    block has no text.
    """
    node = soup.select_one("div.scoreboard_next_previous, div.current_results")
    if node is not None:
        snippet = _txt(node)[:400]
        return {"raw_text": snippet or None}
    return None
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Roster parser — the “Igrači” tab (server-rendered with the page)
|
||||
# ---------------------------------------------------------------------------
|
||||
def fetch_klub_roster(hns_id: int, slug: str) -> list[dict[str, Any]]:
    """Return the players listed in the club page's ``div.playerslist`` blocks.

    Every row linking to ``/igraci/<id>/`` contributes one entry (id, slug,
    name, absolute URL and the apps/minutes/goals/yellows/reds counters).
    A player id is reported only once, at its first occurrence.
    """
    url = f"{BASE}/klubovi/{hns_id}/{slug.strip('/')}/"
    soup = BeautifulSoup(_get_html(url), "html.parser")

    player_href = re.compile(r"/igraci/\d+/")
    stat_keys = ("apps", "minutes", "goals", "yellows", "reds")

    roster: list[dict[str, Any]] = []
    seen: set[int] = set()

    candidate_rows = (
        row
        for block in soup.select("div.playerslist")
        for row in block.select("li.row, div.row, tr")
    )
    for row in candidate_rows:
        if "header" in (row.get("class") or []):
            continue
        link = row.find("a", href=player_href)
        if link is None:
            continue
        player_id = _href_id(link, r"/igraci/(\d+)/")
        if player_id is None or player_id in seen:
            continue
        seen.add(player_id)

        href = link["href"]
        slug_hit = re.search(r"/igraci/\d+/([^/]+)/", href)
        entry: dict[str, Any] = {
            "hns_igrac_id": player_id,
            "slug": slug_hit.group(1) if slug_hit else None,
            "name": _txt(link) or None,
            # Site-relative links are made absolute; anything else is kept.
            "url": BASE + href if href.startswith("/") else href,
        }
        for key in stat_keys:
            entry[key] = _int(_txt(row.select_one("." + key)))
        roster.append(entry)
    return roster
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# CLI / smoke test
|
||||
# ---------------------------------------------------------------------------
|
||||
def _bench(label: str, fn, *args):
|
||||
t0 = time.time()
|
||||
out = fn(*args)
|
||||
dt = (time.time() - t0) * 1000
|
||||
print(f"[{label}] {dt:6.1f} ms args={args}", file=sys.stderr)
|
||||
return out, dt
|
||||
|
||||
|
||||
def _slugify(name: str) -> str:
|
||||
name = name.lower()
|
||||
repl = {"č": "c", "ć": "c", "ž": "z", "š": "s", "đ": "d"}
|
||||
for k, v in repl.items():
|
||||
name = name.replace(k, v)
|
||||
name = re.sub(r"[^a-z0-9]+", "-", name).strip("-")
|
||||
return name
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Smoke test / micro-benchmark: hits the live site for a few fixed
    # players and clubs, prints a one-line summary per fetch, then writes
    # three sample JSON files to disk.
    samples_players = [
        (88284, "zoran-kurjaga"),
        (134238, "roko-antesic"),
        (1223263, "anel-husic"),
    ]
    samples_klubovi = [
        (1589, "nk-zamet"),
        (107150, "nk-pomorac"),
        (1574, "hnk-lovran"),
    ]

    print(f"# build_token = {get_buildid()}")
    print()

    # --- player pages -----------------------------------------------------
    print("## fetch_player benchmark")
    p_times: list[float] = []
    for pid, slug in samples_players:
        data, dt = _bench("player", fetch_player, pid, slug)
        p_times.append(dt)
        prof = data.get("profile", {})
        recap = data.get("season_recap", {})
        seasons = data.get("seasons", [])
        print(
            f" - id={pid} slug={slug} -> {prof.get('name')} {prof.get('surname')}, "
            f"club={prof.get('current_club', {}).get('name')}, dob={prof.get('dob')}, "
            f"recap_season={recap.get('season')} apps={recap.get('apps')}, "
            f"seasons_count={len(seasons)}"
        )
    print(f" avg={sum(p_times)/len(p_times):.1f} ms")
    print()

    # --- club pages -------------------------------------------------------
    print("## fetch_klub benchmark")
    k_times: list[float] = []
    for kid, slug in samples_klubovi:
        data, dt = _bench("klub", fetch_klub, kid, slug)
        k_times.append(dt)
        info = data.get("info", {})
        print(
            f" - id={kid} slug={slug} -> {info.get('name')} | "
            f"founded={info.get('foundation_date')} | "
            f"address={info.get('address')} | "
            f"comps={len(data.get('competitions', []))}"
        )
    print(f" avg={sum(k_times)/len(k_times):.1f} ms")
    print()

    # --- club rosters (same club URL fetched again) -----------------------
    print("## fetch_klub_roster benchmark")
    r_times: list[float] = []
    for kid, slug in samples_klubovi:
        roster, dt = _bench("roster", fetch_klub_roster, kid, slug)
        r_times.append(dt)
        print(f" - id={kid} slug={slug} -> {len(roster)} players")
        # Show only the first three players of each roster.
        for r in roster[:3]:
            print(f" • {r['name']} (id={r['hns_igrac_id']}) apps={r.get('apps')} goals={r.get('goals')}")
    print(f" avg={sum(r_times)/len(r_times):.1f} ms")

    # Dump first sample to disk for inspection
    # NOTE(review): the output directory is a fixed absolute path and is
    # created if missing; the first player and club are fetched a second time.
    out_dir = Path("/opt/pgz-sport/cc_tasks/sub4")
    out_dir.mkdir(parents=True, exist_ok=True)
    pid0, slug0 = samples_players[0]
    sample = fetch_player(pid0, slug0)
    (out_dir / "sample_player.json").write_text(
        json.dumps(sample, ensure_ascii=False, indent=2)
    )
    kid0, kslug0 = samples_klubovi[0]
    ksample = fetch_klub(kid0, kslug0)
    (out_dir / "sample_klub.json").write_text(
        json.dumps(ksample, ensure_ascii=False, indent=2)
    )
    rsample = fetch_klub_roster(kid0, kslug0)
    (out_dir / "sample_roster.json").write_text(
        json.dumps(rsample, ensure_ascii=False, indent=2)
    )
    print()
    print(f"# sample JSONs written to {out_dir}/")
|
||||
@@ -0,0 +1,534 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
hns_player_deep.py — SUB3 deep HNS player scraper
|
||||
─────────────────────────────────────────────────
|
||||
Author: dradulic@outlook.com / damir@rinet.one
|
||||
Date: 2026-05-05
|
||||
Version: 1.0
|
||||
|
||||
Scrapes semafor.hns.family/igraci/{id}/{slug}/ for every clanovi.hns_igrac_id row,
|
||||
extracting:
|
||||
• profil meta (datum_rodenja, mjesto_rodenja, broj_dresa, current klub)
|
||||
• per-season stats per natjecanje (UPSERT pgz_sport.hns_player_seasons)
|
||||
• last 30+ matches (UPSERT pgz_sport.hns_player_matches)
|
||||
|
||||
Server-rendered HTML — no Playwright needed → uses requests for 5–10× speedup.
|
||||
Fallback to Playwright if --use-playwright is passed.
|
||||
|
||||
Resume-able: skips clanovi where last_scraped_at > now() - interval N days.
|
||||
|
||||
Usage:
|
||||
python3 hns_player_deep.py [--limit 200] [--days 7] [--player HNS_ID] [--use-playwright]
|
||||
"""
|
||||
import os, sys, re, time, json, argparse, traceback
|
||||
from datetime import datetime, date
|
||||
from urllib.parse import urljoin
|
||||
import requests
|
||||
import psycopg2
|
||||
from psycopg2.extras import RealDictCursor, execute_values
|
||||
|
||||
# SECURITY: no credentials are kept in source any more.  The database password
# and the Telegram bot token used to be embedded here as fallback defaults;
# both must be treated as leaked and rotated.
#
# RINET_DSN should carry the full connection string.  The fallback below has
# no password, so libpq's own mechanisms (PGPASSWORD, ~/.pgpass) apply.
DSN = os.getenv("RINET_DSN",
                "host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet")
# Empty token => log() skips Telegram delivery (it checks TG_TOKEN first).
TG_TOKEN = os.getenv("TG_BOT_TOKEN", "")
TG_CHAT = os.getenv("TG_CHAT", "7969491558")
# Numeric setting read from the SLEEP environment variable (default 0.8).
SLEEP = float(os.getenv("SLEEP", "0.8"))
UA = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"

# One log file per process start (minute resolution), opened in append mode
# at import time and kept open for log().
LOG_DIR = "/var/log/pgz-sport-debug"
os.makedirs(LOG_DIR, exist_ok=True)
LOG_FILE = f"{LOG_DIR}/sub3_{datetime.now().strftime('%Y%m%d_%H%M')}.log"
LOG_FH = open(LOG_FILE, "a", encoding="utf-8")
|
||||
|
||||
def log(msg: str, telegram: bool = False) -> None:
    """Write a timestamped line to stdout and the log file.

    When *telegram* is true and both ``TG_TOKEN`` and ``TG_CHAT`` are set, the
    message (first 4000 characters) is also posted to the Telegram Bot API.
    Delivery is best-effort: any error while posting is ignored.
    """
    stamp = datetime.now().isoformat(timespec='seconds')
    line = f"[{stamp}] {msg}"
    print(line, flush=True)
    LOG_FH.write(line + "\n")
    LOG_FH.flush()

    if not (telegram and TG_TOKEN and TG_CHAT):
        return
    try:
        requests.post(
            f"https://api.telegram.org/bot{TG_TOKEN}/sendMessage",
            data={"chat_id": TG_CHAT, "text": msg[:4000]},
            timeout=8,
        )
    except Exception:
        # A failed notification must never interrupt the scrape.
        pass
|
||||
|
||||
# ── HTTP session ──────────────────────────────────────────────────────────
|
||||
SESSION = requests.Session()
|
||||
SESSION.headers.update({"User-Agent": UA, "Accept-Language": "hr,en;q=0.7"})
|
||||
|
||||
def fetch_html(url: str, timeout: int = 20) -> str | None:
    """GET *url* through the shared session.

    Returns the response text on HTTP 200.  Any other status, or any exception
    raised along the way, is logged and reported as ``None``.
    """
    try:
        resp = SESSION.get(url, timeout=timeout)
        if resp.status_code == 200:
            return resp.text
        log(f" HTTP {resp.status_code} {url}")
    except Exception as err:
        log(f" fetch fail {url}: {err}")
    return None
|
||||
|
||||
# ── Parsers ───────────────────────────────────────────────────────────────
|
||||
def _strip_html(s: str) -> str:
|
||||
s = re.sub(r"<[^>]+>", " ", s)
|
||||
return re.sub(r"\s+", " ", s).strip()
|
||||
|
||||
def parse_profile(html: str) -> dict:
    """Extract profile metadata from a player page.

    Returns a dict with ``broj_dresa`` (int), ``datum_rodenja`` (``date``),
    ``mjesto_rodenja``, ``klub_hns_id`` (digits as a string) and
    ``klub_naziv``; every field that cannot be found stays ``None``.
    """
    out = dict.fromkeys(
        ("broj_dresa", "datum_rodenja", "mjesto_rodenja", "klub_hns_id", "klub_naziv")
    )

    # Work on the playerHeader block (up to the first HTML comment); fall back
    # to the whole document when the block is not found.
    header_hit = re.search(r'<div class="block playerHeader"[^>]*>(.*?)<!--', html, re.DOTALL)
    header_html = header_hit.group(1) if header_hit else html
    header_text = _strip_html(header_html)

    # Jersey number: explicit <span class="number">, else a leading 1-2 digit
    # number directly before a capitalised word in the plain text.
    span_hit = re.search(r'<span class="number"[^>]*>(\d+)</span>', header_html)
    if span_hit:
        out["broj_dresa"] = int(span_hit.group(1))
    else:
        lead_hit = re.match(r'^\s*(\d{1,2})\s+[A-ZČĆŠŽĐ]', header_text)
        if lead_hit:
            out["broj_dresa"] = int(lead_hit.group(1))

    # Current club: first /klubovi/<id>/<slug>/ link in the header.
    club_hit = re.search(r'<a[^>]+href="/klubovi/(\d+)/([\w-]+)/?"[^>]*>([^<]+)<', header_html)
    if club_hit:
        out["klub_hns_id"] = club_hit.group(1)
        out["klub_naziv"] = club_hit.group(3).strip()

    # Date of birth (dd.mm.yyyy.): three patterns tried in order, the first
    # two against the markup and the last against the tag-free text.
    dob_hit = (
        re.search(r'(\d{1,2})\.(\d{1,2})\.(\d{4})\.\s*(?:</[^>]+>\s*)?(?:<[^>]+>\s*)*\(?\s*\d+\s*godin', header_html)
        or re.search(r'<div[^>]*class="[^"]*birth[^"]*"[^>]*>(\d{1,2})\.(\d{1,2})\.(\d{4})', header_html)
        or re.search(r'(\d{1,2})\.(\d{1,2})\.(\d{4})\.\s*\(?\s*\d+\s*godin[ae]?\)?\s*Datum rođenja', header_text)
    )
    if dob_hit:
        try:
            day, month, year = (int(part) for part in dob_hit.group(1, 2, 3))
            out["datum_rodenja"] = date(year, month, day)
        except Exception:
            # An impossible calendar date simply leaves the field unset.
            pass

    # Place of birth: capitalised text immediately before the label.
    place_hit = re.search(r'([A-ZČĆŠŽĐ][\w\sčćšžđČĆŠŽĐ\-]{1,80}?)\s+Mjesto rođenja', header_text)
    if place_hit:
        out["mjesto_rodenja"] = place_hit.group(1).strip()

    return out
|
||||
|
||||
# Each season block: "{YYYY/YY} Statistika Utakmice ... <playerCompetitionStatsTable> ... <matchlist>"
# We split player_profile_matches by the recurring pattern.
# Matches a season label (20YY/YY), optionally wrapped in one tag, followed by
# the words "Statistika Utakmice" (case-insensitive).
# NOTE(review): nothing in the visible part of this file references this
# constant — confirm it is still needed.
SEASON_HEADER_RE = re.compile(
    r'(?:<[^>]+>\s*)?(20\d{2}/\d{2})(?:\s*<[^>]+>)?\s*Statistika\s+Utakmice',
    re.IGNORECASE,
)
|
||||
|
||||
def parse_seasons_and_matches(html: str) -> tuple[list[dict], list[dict]]:
    """Return (season_rows, match_rows) for ALL seasons on the profile page.

    The page's ``player_profile_matches`` block is cut into one section per
    ``<h2>`` season heading.  From each section the per-competition stats
    table and the match list are parsed with regular expressions.

    Returns:
        ``season_rows``: dicts with sezona, natjecanje, nastupi, startna,
        zamjena, golovi, zuti, crveni.
        ``match_rows``: dicts with datum (``date`` or None), domacin, gost,
        rezultat ("h:a" or None), natjecanje, golovi, zuti, crveni, minute_do.
        Both lists are empty when the block is not found; when no season
        heading is found the plain-text fallback ``_parse_plain`` is used,
        which yields season rows only.
    """
    # Limit to player_profile_matches block to avoid noise
    m = re.search(
        r'<div class="block w1280 matchlist style1 player_profile_matches"[^>]*>(.*?)(?=<!--|<footer)',
        html, re.DOTALL,
    )
    if not m:
        return [], []
    block = m.group(1)

    # Find season header positions: <h2 class="seasonTitle ...">YYYY/YY</h2>
    headers = list(re.finditer(
        r'<h2\s+class="seasonTitle[^"]*"[^>]*>\s*(20\d{2}/\d{2})\s*</h2>',
        block,
    ))
    if not headers:
        # Fallback: any <h2> with season label
        headers = list(re.finditer(r'<h2[^>]*>\s*(20\d{2}/\d{2})\s*</h2>', block))
    if not headers:
        # No heading at all: strip tags and parse the flattened text instead.
        plain = re.sub(r'<[^>]+>', ' ', block)
        plain = re.sub(r'\s+', ' ', plain)
        return _parse_plain(plain)

    # Each section runs from one heading to the next (or to the block's end).
    sections = []
    for i, h in enumerate(headers):
        sezona = h.group(1)
        start = h.start()
        end = headers[i + 1].start() if i + 1 < len(headers) else len(block)
        sections.append((sezona, block[start:end]))

    season_rows: list[dict] = []
    match_rows: list[dict] = []

    for sezona, sec in sections:
        # ── Per-season per-competition stats (playerCompetitionStatsTable) ──
        cs = re.search(
            r'<div class="block w1280 playerCompetitionStatsTable"[^>]*>(.*?)</div>\s*</div>\s*</div>',
            sec, re.DOTALL,
        )
        if cs:
            stab = cs.group(1)
            # Header row → identify columns; body rows have natjecanje + 6 ints
            # Extract: total row "Ukupno" + per-competition rows
            # Each row appears as <td>…</td>. Use table-agnostic approach: find every block of
            # "<td>NATJECANJE</td><td>N</td><td>S</td><td>Z</td><td>G</td><td>YEL</td><td>RED</td>"
            # but tables here use divs not td. Walk plain text per line.
            stext = _strip_html(stab)
            # Split by competition-row pattern: "<label> <int> <int> <int> <int> <int> <int>"
            # NOTE(review): the pattern assumes exactly six numeric columns per
            # row; a table with an extra column (e.g. minutes) would shift the
            # values — confirm against the live markup.
            for rm in re.finditer(
                r'([A-ZČĆŠŽĐa-zčćšžđ0-9][^|]*?)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)(?=\s|$)',
                stext,
            ):
                label = rm.group(1).strip()
                if label.lower().startswith("ukupno"):
                    continue  # we keep per-natjecanje rows only (UNIQUE prefers natjecanje)
                # Column captions caught by the pattern are not data rows.
                if "Nastupi" in label or "Započeo" in label or "Statistika" in label:
                    continue
                try:
                    season_rows.append({
                        "sezona": sezona,
                        "natjecanje": label[:200],
                        "nastupi": int(rm.group(2)),
                        "startna": int(rm.group(3)),
                        "zamjena": int(rm.group(4)),
                        "golovi": int(rm.group(5)),
                        "zuti": int(rm.group(6)),
                        "crveni": int(rm.group(7)),
                    })
                except Exception:
                    pass

        # ── Matches (matchlist style2) ──
        # Only the first match list of the section, up to its first </ul>.
        ml = re.search(
            r'<div class="matchlist style2 semafor player[^"]*"[^>]*>(.*?)</ul>',
            sec, re.DOTALL,
        )
        if ml:
            list_html = ml.group(1)
            # Only <li class="row…"> items carrying a data-match attribute count.
            for row in re.finditer(
                r'<li class="row[^"]*"[^>]*data-match="(\d+)"[^>]*>(.*?)</li>',
                list_html, re.DOTALL,
            ):
                row_html = row.group(2)
                # Date
                d = re.search(r'<div class="date">([^<]+)</div>', row_html)
                # club1 / club2
                c1 = re.search(r'<div class="club1"[^>]*>\s*<a[^>]*>([^<]+?)<', row_html)
                c2 = re.search(r'<div class="club2"[^>]*>\s*<a[^>]*>([^<]+?)<', row_html)
                # result
                r1 = re.search(r'<div class="res1">(\d+)</div>', row_html)
                r2 = re.search(r'<div class="res2">(\d+)</div>', row_html)
                # competition
                cr = re.search(r'<div class="competitionround">([^<]+)</div>', row_html)
                # goals
                gl = re.search(r'<div class="goals">(\d+)</div>', row_html)
                # cards "Y / R"
                ca = re.search(r'<div class="cards">.*?(\d+)\s*/\s*(\d+).*?</div>', row_html, re.DOTALL)
                # minutes
                mn = re.search(r'<div class="minutes">(\d+)</div>', row_html)

                # Parse date dd.mm.yyyy. HH:MM
                datum = None
                if d:
                    dm = re.search(r'(\d{1,2})\.(\d{1,2})\.(\d{4})', d.group(1))
                    if dm:
                        try:
                            datum = date(int(dm.group(3)), int(dm.group(2)), int(dm.group(1)))
                        except Exception:
                            pass
                # A result is reported only when both scores are present.
                rezultat = f"{r1.group(1)}:{r2.group(1)}" if r1 and r2 else None

                # Missing text fields become "", missing counters 0; only
                # minute_do stays None when absent.
                match_rows.append({
                    "datum": datum,
                    "domacin": (c1.group(1).strip() if c1 else "")[:120],
                    "gost": (c2.group(1).strip() if c2 else "")[:120],
                    "rezultat": rezultat,
                    "natjecanje": (cr.group(1).strip() if cr else "")[:200],
                    "golovi": int(gl.group(1)) if gl else 0,
                    "zuti": int(ca.group(1)) if ca else 0,
                    "crveni": int(ca.group(2)) if ca else 0,
                    "minute_do": int(mn.group(1)) if mn else None,
                })

    return season_rows, match_rows
|
||||
|
||||
|
||||
def _parse_plain(plain_text: str) -> tuple[list[dict], list[dict]]:
|
||||
"""Fallback: parse from already-stripped plain text (no match-row HTML access)."""
|
||||
# Best effort: extract season totals only
|
||||
season_rows: list[dict] = []
|
||||
# Split by season headers
|
||||
parts = re.split(r'(20\d{2}/\d{2})\s+Statistika\s+Utakmice', plain_text)
|
||||
# parts: [pre, season1, body1, season2, body2, ...]
|
||||
for i in range(1, len(parts), 2):
|
||||
sezona = parts[i]
|
||||
body = parts[i + 1] if i + 1 < len(parts) else ""
|
||||
# Find the "Ukupno N N N G Y R" then per-competition lines
|
||||
for rm in re.finditer(
|
||||
r'([A-ZČĆŠŽĐa-zčćšžđ0-9][^|]*?)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)(?=\s|$)',
|
||||
body[:3000],
|
||||
):
|
||||
label = rm.group(1).strip()
|
||||
if label.lower().startswith("ukupno"):
|
||||
continue
|
||||
if "Nastupi" in label or "Statistika" in label:
|
||||
continue
|
||||
season_rows.append({
|
||||
"sezona": sezona,
|
||||
"natjecanje": label[:200],
|
||||
"nastupi": int(rm.group(2)),
|
||||
"startna": int(rm.group(3)),
|
||||
"zamjena": int(rm.group(4)),
|
||||
"golovi": int(rm.group(5)),
|
||||
"zuti": int(rm.group(6)),
|
||||
"crveni": int(rm.group(7)),
|
||||
})
|
||||
return season_rows, []
|
||||
|
||||
# ── DB ────────────────────────────────────────────────────────────────────
|
||||
def db_conn():
    """Open a psycopg2 connection to ``DSN`` with autocommit switched on."""
    connection = psycopg2.connect(DSN)
    connection.autocommit = True
    return connection
|
||||
|
||||
def get_targets(conn, limit: int, days: int, force_player: str | None = None) -> list[dict]:
    """Select the ``pgz_sport.clanovi`` rows to scrape.

    With *force_player* set, at most one row with that ``hns_igrac_id`` is
    returned and *limit*/*days* are ignored.  Otherwise up to *limit* rows
    that have an ``hns_igrac_id`` and were never scraped, or were last
    scraped more than *days* days ago, are returned — never-scraped rows
    first, then by ascending id.
    """
    if force_player:
        query = """
            SELECT id, hns_igrac_id, ime, prezime, source_url, slug
            FROM pgz_sport.clanovi
            WHERE hns_igrac_id = %s
            LIMIT 1
        """
        params: tuple = (force_player,)
    else:
        query = """
            SELECT id, hns_igrac_id, ime, prezime, source_url, slug
            FROM pgz_sport.clanovi
            WHERE hns_igrac_id IS NOT NULL
            AND (last_scraped_at IS NULL OR last_scraped_at < now() - %s::interval)
            ORDER BY (last_scraped_at IS NULL) DESC, id ASC
            LIMIT %s
        """
        params = (f"{days} days", limit)

    with conn.cursor(cursor_factory=RealDictCursor) as cur:
        cur.execute(query, params)
        return cur.fetchall()
|
||||
|
||||
def update_clan(conn, clan_id: int, profile: dict, url: str) -> None:
    """Write scraped profile fields back to one ``pgz_sport.clanovi`` row.

    Birth date, birth place and jersey number only fill columns that are
    still empty (``COALESCE``); both spellings of the birth columns
    (``*_rodenja`` / ``*_rodjenja``) are written.  ``source_url`` and the two
    timestamps are always updated; ``source`` and ``sport`` get their default
    values only when blank.
    """
    clauses: list[str] = []
    params: list = []

    birth_date = profile.get("datum_rodenja")
    if birth_date:
        clauses.extend((
            "datum_rodenja = COALESCE(datum_rodenja, %s)",
            "datum_rodjenja = COALESCE(datum_rodjenja, %s)",
        ))
        params.extend((birth_date, birth_date))

    birth_place = profile.get("mjesto_rodenja")
    if birth_place:
        clauses.extend((
            "mjesto_rodenja = COALESCE(NULLIF(mjesto_rodenja,''), %s)",
            "mjesto_rodjenja = COALESCE(NULLIF(mjesto_rodjenja,''), %s)",
        ))
        params.extend((birth_place, birth_place))

    jersey = profile.get("broj_dresa")
    if jersey is not None:  # 0 is a legitimate jersey number
        clauses.append("broj_dresa = COALESCE(broj_dresa, %s)")
        params.append(jersey)

    clauses.append("source_url = %s")
    params.append(url)
    clauses.extend((
        "source = COALESCE(NULLIF(source,''), 'hns_semafor')",
        "sport = COALESCE(NULLIF(sport,''), 'nogomet')",
        "last_scraped_at = now()",
        "source_synced_at = now()",
    ))

    # The WHERE placeholder comes last, so its value is bound last.
    params.append(clan_id)
    sql = f"UPDATE pgz_sport.clanovi SET {', '.join(clauses)} WHERE id = %s"
    with conn.cursor() as cur:
        cur.execute(sql, tuple(params))
|
||||
|
||||
def upsert_seasons(conn, hns_id: str, clan_id: int, url: str, rows: list[dict]) -> int:
    """Insert or refresh per-season statistics for one player.

    Args:
        conn: open DB connection.
        hns_id: HNS player id, written to ``hns_igrac_id``.
        clan_id: ``pgz_sport.clanovi.id`` of the player.
        url: page the rows were scraped from (stored as ``source_url``).
        rows: parsed season dicts; ``sezona`` and ``natjecanje`` are
            required keys, the counters default to 0.

    Returns:
        The number of rows actually sent to the database, i.e. after
        de-duplication (same convention as ``upsert_matches``).
    """
    if not rows:
        return 0

    # De-duplicate on the table's UNIQUE key
    # (hns_igrac_id, sezona, klub_hns_id, natjecanje); the last occurrence
    # wins. Sending two rows with the same key in one INSERT ... ON CONFLICT
    # statement would make PostgreSQL reject the whole statement.
    dedup: dict[tuple, tuple] = {}
    for r in rows:
        natjecanje = r["natjecanje"][:200]
        # klub_hns_id / klub_naziv are not available from this parser.
        # NOTE(review): klub_hns_id is always NULL here; with a plain UNIQUE
        # constraint PostgreSQL treats NULLs as distinct, so the ON CONFLICT
        # clause presumably never fires and re-runs may accumulate duplicate
        # rows. Confirm against the table definition.
        dedup[(hns_id, r["sezona"], None, natjecanje)] = (
            hns_id, clan_id, r["sezona"], None, None, natjecanje,
            r.get("nastupi", 0), r.get("startna", 0), r.get("zamjena", 0),
            r.get("golovi", 0), 0, r.get("zuti", 0), r.get("crveni", 0), 0, url,
        )
    data = list(dedup.values())

    with conn.cursor() as cur:
        execute_values(cur, """
            INSERT INTO pgz_sport.hns_player_seasons
                (hns_igrac_id, clan_id, sezona, klub_hns_id, klub_naziv, natjecanje,
                 nastupi, startna, zamjena, golovi, asistencije, zuti, crveni, minute, source_url)
            VALUES %s
            ON CONFLICT (hns_igrac_id, sezona, klub_hns_id, natjecanje) DO UPDATE SET
                nastupi = EXCLUDED.nastupi,
                startna = EXCLUDED.startna,
                zamjena = EXCLUDED.zamjena,
                golovi = EXCLUDED.golovi,
                zuti = EXCLUDED.zuti,
                crveni = EXCLUDED.crveni,
                source_url = EXCLUDED.source_url,
                scraped_at = now()
        """, data)
    # Report what was written, not what was parsed.
    return len(data)
|
||||
|
||||
def upsert_matches(conn, hns_id: str, clan_id: int, url: str, rows: list[dict]) -> int:
    """Insert or refresh per-match rows for one player.

    Rows without a date, a home side or an away side are dropped. The rest
    are de-duplicated on the table's UNIQUE key
    (hns_igrac_id, datum, domacin, gost), keeping the last occurrence.

    Returns the number of rows sent to the database (0 when nothing usable
    remains).
    """
    if not rows:
        return 0

    by_key: dict[tuple, tuple] = {}
    for match in rows:
        if not (match["datum"] and match["domacin"] and match["gost"]):
            continue
        record = (
            hns_id, clan_id, match["datum"], match["natjecanje"],
            match["domacin"], match["gost"], match["rezultat"],
            None, None, None,  # pozicija, startna, minute_od: not parsed
            match.get("minute_do"),
            match.get("golovi", 0), 0, match.get("zuti", 0), match.get("crveni", 0),
            url,
        )
        by_key[(hns_id, match["datum"], match["domacin"], match["gost"])] = record

    if not by_key:
        return 0
    payload = list(by_key.values())

    with conn.cursor() as cur:
        execute_values(cur, """
            INSERT INTO pgz_sport.hns_player_matches
                (hns_igrac_id, clan_id, datum, natjecanje, domacin, gost,
                 rezultat, pozicija, startna, minute_od, minute_do,
                 golovi, asistencije, zuti, crveni, source_url)
            VALUES %s
            ON CONFLICT (hns_igrac_id, datum, domacin, gost) DO UPDATE SET
                rezultat = EXCLUDED.rezultat,
                natjecanje = EXCLUDED.natjecanje,
                minute_do = EXCLUDED.minute_do,
                golovi = EXCLUDED.golovi,
                zuti = EXCLUDED.zuti,
                crveni = EXCLUDED.crveni,
                source_url = EXCLUDED.source_url,
                scraped_at = now()
        """, payload)
    return len(payload)
|
||||
|
||||
# ── Slug helper ───────────────────────────────────────────────────────────
|
||||
def slugify(text: str) -> str:
    """Return a lowercase ASCII slug for a name (``"Šime Žagar"`` -> ``"sime-zagar"``).

    Croatian diacritics are folded to their base letters, every other
    character outside ``[a-z0-9]``, whitespace and ``-`` is dropped, and
    runs of whitespace and hyphens become a single ``-``. Empty or ``None``
    input yields ``""``.
    """
    if not text:
        return ""
    # The text is lowercased first, so only the lowercase letters need a
    # mapping. The previous table ("ccczsdcczsd"[:10]) was shifted by one
    # position and turned ž into c, š into z and đ into s.
    fold = str.maketrans("čćžšđ", "cczsd")
    t = text.lower().translate(fold)
    t = re.sub(r"[^a-z0-9\s-]", "", t)
    # Collapse mixed runs such as "a - b" into a single separator.
    return re.sub(r"[\s-]+", "-", t).strip("-")
|
||||
|
||||
def build_url(t: dict) -> str:
    """Return the HNS Semafor player-page URL for a clanovi row.

    A stored ``source_url`` that already points at a Semafor player page is
    reused verbatim. Otherwise the URL is assembled from ``hns_igrac_id``
    and a slug: the stored ``slug`` if present, else one derived from the
    first and last name, else the placeholder ``"x"``.
    """
    stored = t.get("source_url")
    if stored and "semafor.hns.family/igraci/" in stored:
        return stored

    slug = t.get("slug")
    if not slug:
        slug = slugify(f"{t['ime']} {t['prezime']}")
    if not slug:
        slug = "x"
    return f"https://semafor.hns.family/igraci/{t['hns_igrac_id']}/{slug}/"
|
||||
|
||||
# ── Driver ────────────────────────────────────────────────────────────────
|
||||
def process_one(conn, t: dict) -> dict:
    """Scrape one player page and persist profile, seasons and matches.

    Returns a dict with ``profile`` (whether a usable page was found),
    ``seasons`` and ``matches`` (counts reported by the upsert helpers) and
    ``fields`` (how many of the three profile fields the page supplied).
    """
    url = build_url(t)
    html = fetch_html(url)

    page_usable = bool(html) and "playerHeader" in html
    if not page_usable:
        log(f"  ✗ no playerHeader for {t['ime']} {t['prezime']} ({t['hns_igrac_id']}) → {url}")
        # Stamp the row anyway so a broken URL is not retried on every run.
        with conn.cursor() as cur:
            cur.execute(
                "UPDATE pgz_sport.clanovi SET last_scraped_at = now() WHERE id = %s",
                (t["id"],),
            )
        return {"profile": False, "seasons": 0, "matches": 0, "fields": 0}

    profile = parse_profile(html)
    seasons, matches = parse_seasons_and_matches(html)

    update_clan(conn, t["id"], profile, url)
    profile_keys = ("datum_rodenja", "mjesto_rodenja", "broj_dresa")
    fields_found = len([key for key in profile_keys if profile.get(key)])

    seasons_written = upsert_seasons(conn, t["hns_igrac_id"], t["id"], url, seasons)
    matches_written = upsert_matches(conn, t["hns_igrac_id"], t["id"], url, matches)

    return {
        "profile": True,
        "seasons": seasons_written,
        "matches": matches_written,
        "fields": fields_found,
    }
|
||||
|
||||
def main() -> int:
    """CLI entry point: pick targets, scrape each one, report a summary.

    Options: ``--limit`` (batch size), ``--days`` (re-scrape age threshold),
    ``--player`` (single HNS id), ``--missing-matches`` (only clanovi with
    no rows in hns_player_matches) and ``--no-telegram``.

    Per-player failures are counted and logged without aborting the batch.
    A run summary is appended to SUB3_RESULT.md. Returns the process exit
    code (always 0 when the run completes).
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--limit", type=int, default=200)
    ap.add_argument("--days", type=int, default=7)
    ap.add_argument("--player", help="Single HNS ID (debug)")
    ap.add_argument("--missing-matches", action="store_true",
                    help="Only target clanovi without rows in hns_player_matches")
    ap.add_argument("--no-telegram", action="store_true")
    args = ap.parse_args()

    log(f"SUB3 deep scraper start | limit={args.limit} | days={args.days} | "
        f"missing_matches={args.missing_matches} | log={LOG_FILE}",
        telegram=not args.no_telegram)

    stats = {"scraped": 0, "seasons": 0, "matches": 0, "fields": 0, "errors": 0}

    conn = db_conn()
    # The connection used to be left open until interpreter exit; release it
    # deterministically, including when target selection itself fails.
    try:
        if args.missing_matches:
            # NOTE: --player is ignored in this mode.
            with conn.cursor(cursor_factory=RealDictCursor) as cur:
                cur.execute("""
                    SELECT id, hns_igrac_id, ime, prezime, source_url, slug
                    FROM pgz_sport.clanovi
                    WHERE hns_igrac_id IS NOT NULL
                      AND id NOT IN (
                          SELECT clan_id FROM pgz_sport.hns_player_matches WHERE clan_id IS NOT NULL
                      )
                    ORDER BY id ASC
                    LIMIT %s
                """, (args.limit,))
                targets = cur.fetchall()
        else:
            targets = get_targets(conn, args.limit, args.days, args.player)
        log(f"Targets: {len(targets)}")

        t0 = time.time()

        for i, t in enumerate(targets, 1):
            try:
                r = process_one(conn, t)
                stats["scraped"] += 1
                stats["seasons"] += r["seasons"]
                stats["matches"] += r["matches"]
                stats["fields"] += r["fields"]
                # Progress line every 10th player, or whenever matches were found.
                if i % 10 == 0 or r["matches"] > 0:
                    log(f"  [{i}/{len(targets)}] {t['ime']} {t['prezime']} "
                        f"→ seasons +{r['seasons']} matches +{r['matches']} fields +{r['fields']} "
                        f"(totals: s={stats['seasons']} m={stats['matches']})")
            except Exception as e:
                # Batch boundary: one bad page must not stop the run.
                stats["errors"] += 1
                log(f"  ✗ ERROR {t['ime']} {t['prezime']} ({t['hns_igrac_id']}): {e}")
                log(traceback.format_exc()[:500])
            # Pause between players, also after a failure.
            time.sleep(SLEEP)

        dur = time.time() - t0
    finally:
        conn.close()

    summary = (
        f"SUB3 done in {dur:.0f}s | scraped={stats['scraped']} "
        f"seasons +{stats['seasons']} matches +{stats['matches']} "
        f"fields +{stats['fields']} errors={stats['errors']}"
    )
    log(summary, telegram=not args.no_telegram)

    # Result file (appended, one section per run)
    res_path = "/opt/pgz-sport/cc_tasks/SUB3_RESULT.md"
    with open(res_path, "a", encoding="utf-8") as f:
        f.write(f"\n## Run {datetime.now().isoformat(timespec='seconds')}\n")
        f.write(f"- batch_limit: {args.limit}\n")
        f.write(f"- targets: {len(targets)}\n")
        f.write(f"- scraped: {stats['scraped']}\n")
        f.write(f"- seasons +{stats['seasons']}\n")
        f.write(f"- matches +{stats['matches']}\n")
        f.write(f"- profile fields enriched: +{stats['fields']}\n")
        f.write(f"- errors: {stats['errors']}\n")
        f.write(f"- duration: {dur:.0f}s\n")
        f.write(f"- log: {LOG_FILE}\n")

    return 0


if __name__ == "__main__":
    sys.exit(main())
|
||||
@@ -0,0 +1,573 @@
|
||||
#!/usr/bin/env python3
|
||||
# hns_youth_categories.py — SUB5 — HNS Semafor youth team scraper (v1.0)
|
||||
# Author: Damir Radulić <dradulic@outlook.com> / <damir@rinet.one>
|
||||
# Date: 2026-05-05
|
||||
# Description:
|
||||
# Discovers per-club age categories (Seniori / U-19 juniori / U-17 kadeti /
|
||||
# U-15 stariji pioniri / U-13 mlađi pioniri / U-11/U-9 početnici) by
|
||||
# scraping HNS COMET Semafor competition pages and matching participating
|
||||
# klubovi with hns_klub_id in pgz_sport.klubovi. For each (klub, kategorija,
|
||||
# sezona) the per-club competition roster is fetched and players are
|
||||
# upserted into pgz_sport.clan_kategorije (M2M player x category x season).
|
||||
#
|
||||
# Strategy:
|
||||
# 1. Hardcoded list of per-season national + 2.NL competitions whose
|
||||
# cid → kategorija mapping is known (PGZ regional 3.NL/ŽNS leagues
|
||||
# are added as discovered).
|
||||
# 2. For each competition, fetch /natjecanja/{cid}/{slug}/ and extract
|
||||
# all participating /klubovi/{kid}/{slug}/ links.
|
||||
# 3. Match against pgz_sport.klubovi (hns_klub_id). For each match,
|
||||
# fetch /klubovi/{kid}/{slug}/?cid={cid} and parse player /igraci/
|
||||
# links — these are the players belonging to this age category.
|
||||
# 4. Upsert each player as clanovi (source=hns_semafor) and write
|
||||
# clan_kategorije(clan_id, klub_id, kategorija, sezona, source,
|
||||
# source_url, scraped_at).
|
||||
#
|
||||
# Run modes:
|
||||
# python hns_youth_categories.py discover # dry-run, only logs
|
||||
# python hns_youth_categories.py run # full scrape + DB upsert
|
||||
# python hns_youth_categories.py klub <db_kid> # one club only
|
||||
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import time
|
||||
import json
|
||||
import logging
|
||||
from datetime import datetime
|
||||
from urllib.parse import unquote
|
||||
from pathlib import Path
|
||||
|
||||
import psycopg2
|
||||
import psycopg2.extras
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
# Try to use SUB4's hns_api_client for shared session/UA
|
||||
# Make the sibling scripts importable, then try to reuse SUB4's client so
# both scrapers share one fetch function and one User-Agent.
SCRIPTS_DIR = Path(__file__).resolve().parent
sys.path.insert(0, str(SCRIPTS_DIR))
try:
    import hns_api_client as hns_api  # type: ignore
    _GET_HTML = hns_api._get_html
    _UA = hns_api.UA
except Exception:
    # Any failure (module missing, attribute missing, import-time error)
    # falls back to the local requests session defined further down.
    _GET_HTML = None
    _UA = (
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
    )

# Connection settings can be overridden through the environment; the
# defaults are the values that used to be hard-coded.
# SECURITY: the default password is a live credential committed to source
# control. Rotate it and drop the literal once PGZ_DB_PASSWORD is deployed.
DB_DSN = dict(
    host=os.environ.get("PGZ_DB_HOST", "10.10.0.2"),
    port=int(os.environ.get("PGZ_DB_PORT", "6432")),
    dbname=os.environ.get("PGZ_DB_NAME", "rinet_v3"),
    user=os.environ.get("PGZ_DB_USER", "rinet"),
    password=os.environ.get("PGZ_DB_PASSWORD", "R1net2026!SecureDB#v7"),
)
BASE = "https://semafor.hns.family"
RATE_S = 1.0   # seconds slept after each page fetch
TIMEOUT = 25   # seconds, HTTP timeout of the fallback session

LOG_DIR = Path("/var/log/pgz-sport-debug")
LOG_DIR.mkdir(parents=True, exist_ok=True)
LOG_FILE = LOG_DIR / f"sub5_{datetime.now().strftime('%Y%m%d_%H%M')}.log"

log = logging.getLogger("sub5")
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[
        logging.FileHandler(LOG_FILE, encoding="utf-8"),
        logging.StreamHandler(sys.stdout),
    ],
)
|
||||
|
||||
# ── Telegram ───────────────────────────────────────────────────────────────
|
||||
# SECURITY: the fallback bot token is a live credential committed to source
# control. Revoke it and rely on the TG_TOKEN environment variable only.
TG_TOKEN = os.environ.get("TG_TOKEN", "8535797835:AAFItT-92jzZ9NWFafLxn0dLa1_n2s-JE5Y")
TG_CHAT = os.environ.get("TG_CHAT", "7969491558")


def tg_send(msg: str):
    """Send ``msg`` to the configured Telegram chat (best effort).

    Does nothing when token or chat id is empty. The message is sent as
    Markdown first; if Telegram answers HTTP 400 (for instance because
    exception text contains an unbalanced ``_`` or ``*``) it is re-sent as
    plain text. Failures are logged as warnings and never raised.
    """
    if not TG_TOKEN or not TG_CHAT:
        return
    endpoint = f"https://api.telegram.org/bot{TG_TOKEN}/sendMessage"
    text = msg[:4000]  # stay under Telegram's 4096-character message limit
    try:
        resp = requests.post(
            endpoint,
            data={"chat_id": TG_CHAT, "text": text, "parse_mode": "Markdown"},
            timeout=10,
        )
        if resp.status_code == 400:
            resp = requests.post(
                endpoint,
                data={"chat_id": TG_CHAT, "text": text},
                timeout=10,
            )
        if not resp.ok:
            log.warning(f"telegram failed: HTTP {resp.status_code}")
    except Exception as e:
        # Log only the exception type: requests' error messages embed the
        # request URL, which contains the bot token.
        log.warning(f"telegram failed: {type(e).__name__}")
|
||||
|
||||
|
||||
# ── HTTP fallback ──────────────────────────────────────────────────────────
|
||||
# Fallback HTTP session, used only when the shared SUB4 client is unavailable.
_session = requests.Session()
_session.headers["User-Agent"] = _UA
_session.headers["Accept-Language"] = "hr,en;q=0.7"


def fetch(url: str) -> str:
    """Return the HTML of ``url``.

    Delegates to the shared client's ``_get_html`` when it was imported;
    otherwise performs a GET on the local session and raises for any HTTP
    error status.
    """
    if _GET_HTML is None:
        log.debug(f"GET {url}")
        response = _session.get(url, timeout=TIMEOUT)
        response.raise_for_status()
        return response.text
    return _GET_HTML(url)
|
||||
|
||||
|
||||
# ── Competition catalogue ─────────────────────────────────────────────────
|
||||
# Each entry: (cid, slug, kategorija, sezona). PGZ-relevant national /
|
||||
# 2.NL leagues per season. Regional ŽNS leagues are discovered dynamically
|
||||
# via discover_pgz_competitions() once we find them inside klub raspored.
|
||||
# Entries are (cid, slug, kategorija, sezona). Every competition currently
# listed belongs to the 2025/2026 season, so the season is attached once.
# The 2024/2025 season has the same structure with slightly different cids;
# those will be discovered dynamically per-klub as well.
COMP_CATALOG = [
    (cid, slug, kategorija, "2025/2026")
    for cid, slug, kategorija in (
        ("100454960", "1-nl-juniori", "juniori-u19"),
        ("100454979", "1-nl-kadeti", "kadeti-u17"),
        ("100454999", "1-nl-pioniri", "pioniri-u15"),
        ("100540163", "2-nl-juniori-a", "juniori-u19"),
        ("100540177", "2-nl-juniori-b", "juniori-u19"),
        ("100540032", "2-nl-kadeti-a", "kadeti-u17"),
        ("100540109", "2-nl-kadeti-b", "kadeti-u17"),
        ("100381663", "kvalifikacije-za-prvu-nl-juniori", "juniori-u19"),
        ("100381584", "kvalifikacije-za-prvu-nl-kadeti", "kadeti-u17"),
        ("100381484", "kvalifikacije-za-prvu-nl-pioniri", "pioniri-u15"),
        ("100569152", "treca-nl-istok", "seniori"),   # Treća NL Istok
        ("100585203", "treca-nl-zapad", "seniori"),   # Treća NL Zapad (PGŽ clubs)
        ("100391485", "supersport-hnl", "seniori"),
        ("100413651", "supersport-prva-nl", "seniori"),
        ("100418001", "supersport-druga-nl", "seniori"),
        ("100439118", "supersport-hnk", "seniori"),   # Cup, all seniori
        ("101411063", "hrvatski-nogometni-kup", "seniori"),
    )
]
|
||||
|
||||
# Map from acat dropdown values (HR semantic labels) → kategorija
|
||||
# Grouped by target kategorija; flattened into {dropdown label: kategorija}.
ACAT_MAP = {
    label: kategorija
    for kategorija, labels in (
        ("seniori", ("Seniors",)),
        ("juniori-u19", ("Juniors", "Juniors 2")),
        ("kadeti-u17", ("Cadets", "Cadets 2")),
        ("pioniri-u15", ("Pioneers", "Pioneers 2")),
        ("mladji-pioniri-u13", ("Young pioneers",)),
        ("pocetnici-u11", ("Beginners",)),
        ("pocetnici-u9", ("Pre-beginners (6+1, 20min)",)),
    )
    for label in labels
}
|
||||
|
||||
# Heuristic from competition name → kategorija
|
||||
def kategorija_from_name(name: str) -> str:
    """Guess the age category from a competition name.

    Matching is case-insensitive and tolerant of č/ć/ž/š, so
    ``"Početnici U-9"`` is recognised just like ``"pocetnici u-9"``
    (it previously fell through to ``"seniori"``). Checks run from the most
    specific label to the most general one. Anything unrecognised,
    including an empty or ``None`` name, is classified as ``"seniori"``.
    """
    nl = (name or "").lower()
    # đ is deliberately not folded: the checks below test both spellings
    # ("mladji" and "mlađi") explicitly.
    nl = nl.translate(str.maketrans("čćžš", "cczs"))

    if "juniori" in nl or "juniors" in nl:
        return "juniori-u19"
    if "kadeti" in nl or "cadets" in nl or "kadetkinje" in nl:
        return "kadeti-u17"
    if "stariji pioniri" in nl:
        return "pioniri-u15"
    # Must precede the generic "pioniri"/"pioneers" test, which it contains.
    if "mladji pioniri" in nl or "mlađi pioniri" in nl or "young pioneers" in nl:
        return "mladji-pioniri-u13"
    if "pioniri" in nl or "pioneers" in nl or "pionirke" in nl:
        return "pioniri-u15"
    # Must precede the generic "beginners" test, which "pre-beginners" contains.
    if "pre-beginners" in nl or ("pocetnici" in nl and ("u-9" in nl or "u9" in nl)):
        return "pocetnici-u9"
    # A beginners label without an explicit U-9 marker maps to U-11, the
    # same way the bare English "beginners" always has.
    if "beginners" in nl or "pocetnici" in nl:
        return "pocetnici-u11"
    return "seniori"
|
||||
|
||||
|
||||
# ── DB helpers ─────────────────────────────────────────────────────────────
|
||||
def conn():
    """Open and return a new psycopg2 connection built from ``DB_DSN``."""
    connection = psycopg2.connect(**DB_DSN)
    return connection
|
||||
|
||||
|
||||
def ensure_schema():
    """Make sure ``pgz_sport.clan_kategorije`` exists.

    If the table is already present nothing is changed. Otherwise the M2M
    table (player x club x category x season) and its three indexes are
    created and committed.
    """
    c = conn()
    # psycopg2's connection context manager only commits or rolls back; the
    # connection itself has to be closed explicitly.
    try:
        with c, c.cursor() as cu:
            cu.execute(
                """SELECT 1 FROM information_schema.tables
                   WHERE table_schema='pgz_sport' AND table_name='clan_kategorije'"""
            )
            if cu.fetchone():
                log.info("clan_kategorije table verified.")
                return
            cu.execute(
                """CREATE TABLE pgz_sport.clan_kategorije (
                       id SERIAL PRIMARY KEY,
                       clan_id INTEGER REFERENCES pgz_sport.clanovi(id) ON DELETE CASCADE,
                       klub_id INTEGER REFERENCES pgz_sport.klubovi(id),
                       kategorija TEXT NOT NULL,
                       sezona TEXT,
                       source TEXT,
                       source_url TEXT,
                       scraped_at TIMESTAMPTZ DEFAULT now(),
                       UNIQUE (clan_id, kategorija, sezona, klub_id)
                   );
                   CREATE INDEX IF NOT EXISTS idx_clan_kat_clan
                       ON pgz_sport.clan_kategorije(clan_id);
                   CREATE INDEX IF NOT EXISTS idx_clan_kat_sezona
                       ON pgz_sport.clan_kategorije(sezona);
                   CREATE INDEX IF NOT EXISTS idx_clan_kat_klub
                       ON pgz_sport.clan_kategorije(klub_id);
                """
            )
            c.commit()
            log.info("Created pgz_sport.clan_kategorije.")
    finally:
        c.close()
|
||||
|
||||
|
||||
def load_pgz_klubovi() -> dict[int, dict]:
    """Load all klubovi that carry an ``hns_klub_id``.

    Returns ``{hns_klub_id: {"db_id", "naziv", "slug"}}``. When several rows
    share one ``hns_klub_id`` the row with the lowest ``id`` wins. The slug
    is ``hns_slug`` if set, else ``slug``, else one derived from the name.
    """
    out: dict[int, dict] = {}
    c = conn()
    # The connection context manager does not close the connection; do it here.
    try:
        with c, c.cursor() as cu:
            cu.execute(
                """SELECT id, naziv, hns_klub_id, COALESCE(NULLIF(hns_slug,''), slug)
                   FROM pgz_sport.klubovi
                   WHERE hns_klub_id IS NOT NULL
                   ORDER BY id"""
            )
            # NOTE(review): callers look clubs up by int ids parsed from URLs;
            # this assumes hns_klub_id is an integer column — confirm.
            for kid_db, naziv, hns_id, slug in cu.fetchall():
                if hns_id in out:
                    continue  # rows are ordered by id, keep the first one
                out[hns_id] = {
                    "db_id": kid_db,
                    "naziv": naziv,
                    "slug": slug or _slugify(naziv),
                }
    finally:
        c.close()
    return out
|
||||
|
||||
|
||||
def _slugify(name: str) -> str:
|
||||
name = (name or "").lower()
|
||||
repl = {"č": "c", "ć": "c", "ž": "z", "š": "s", "đ": "d"}
|
||||
for k, v in repl.items():
|
||||
name = name.replace(k, v)
|
||||
name = re.sub(r"[^a-z0-9]+", "-", name).strip("-")
|
||||
return name
|
||||
|
||||
|
||||
def upsert_clan(klub_db_id: int, hns_pid: int, ime_prezime: str, slug: str) -> int:
    """Find or create the clanovi row for an HNS player and return its id.

    Lookup order:
      1. ``source='hns_semafor'`` and ``source_id = hns_pid``: returned as is.
      2. ``hns_igrac_id = hns_pid`` (rows from earlier runs): the row is
         re-tagged with source, source_id and source_url, then returned.
      3. Otherwise a new row is inserted for club ``klub_db_id``.

    ``ime_prezime`` is split on the first space into ``ime`` and ``prezime``;
    it is only used when a row is inserted.
    """
    ime, prezime = "", ""
    if ime_prezime:
        parts = ime_prezime.strip().split(" ", 1)
        ime = parts[0]
        prezime = parts[1] if len(parts) > 1 else ""
    url = f"{BASE}/igraci/{hns_pid}/{slug or 'x'}/"
    # source_id and hns_igrac_id are compared as text, so the same text form
    # is used for lookups and for the insert.
    pid_text = str(hns_pid)

    c = conn()
    # This runs once per roster entry; without an explicit close every call
    # would leave its connection open (the context manager only ends the
    # transaction).
    try:
        with c, c.cursor() as cu:
            cu.execute(
                """SELECT id FROM pgz_sport.clanovi
                   WHERE source='hns_semafor' AND source_id=%s LIMIT 1""",
                (pid_text,),
            )
            row = cu.fetchone()
            if row:
                return row[0]

            # Secondary lookup by hns_igrac_id (some rows from earlier runs).
            cu.execute(
                "SELECT id FROM pgz_sport.clanovi WHERE hns_igrac_id=%s LIMIT 1",
                (pid_text,),
            )
            row = cu.fetchone()
            if row:
                cu.execute(
                    """UPDATE pgz_sport.clanovi
                       SET source='hns_semafor', source_id=%s, source_url=%s,
                           source_synced_at=now()
                       WHERE id=%s""",
                    (pid_text, url, row[0]),
                )
                c.commit()
                return row[0]

            cu.execute(
                """INSERT INTO pgz_sport.clanovi
                       (klub_id, ime, prezime, source, source_id, source_url,
                        source_synced_at, slug, hns_igrac_id, sport, aktivan,
                        verified, created_at, updated_at)
                   VALUES (%s,%s,%s,'hns_semafor',%s,%s,now(),%s,%s,'nogomet',
                           true, false, now(), now())
                   RETURNING id""",
                (klub_db_id, ime, prezime, pid_text, url, slug or None, pid_text),
            )
            cid = cu.fetchone()[0]
            c.commit()
            return cid
    finally:
        c.close()
|
||||
|
||||
|
||||
def upsert_clan_kategorija(
    clan_id: int, klub_db_id: int, kategorija: str, sezona: str,
    source_url: str,
):
    """Record that a player belongs to a category of a club in a season.

    Inserts into ``pgz_sport.clan_kategorije``; if the
    (clan_id, kategorija, sezona, klub_id) row already exists only
    ``source_url`` and ``scraped_at`` are refreshed.
    """
    c = conn()
    # Close explicitly: the connection context manager only commits.
    try:
        with c, c.cursor() as cu:
            cu.execute(
                """INSERT INTO pgz_sport.clan_kategorije
                       (clan_id, klub_id, kategorija, sezona, source, source_url,
                        scraped_at)
                   VALUES (%s,%s,%s,%s,'hns_semafor',%s,now())
                   ON CONFLICT (clan_id, kategorija, sezona, klub_id) DO UPDATE
                   SET source_url=EXCLUDED.source_url,
                       scraped_at=now()""",
                (clan_id, klub_db_id, kategorija, sezona, source_url),
            )
            c.commit()
    finally:
        c.close()
|
||||
|
||||
|
||||
# ── Scrape primitives ─────────────────────────────────────────────────────
|
||||
def parse_competition_klubovi(html: str) -> list[tuple[int, str]]:
    """Collect the clubs linked from a /natjecanja/{cid}/ page.

    Only relative links of the form ``/klubovi/<id>/<slug>/`` are considered.
    Returns ``(hns_klub_id, slug)`` pairs in document order, one per club id
    (the first link seen for an id wins).
    """
    link_re = re.compile(r"^/klubovi/(\d+)/([a-z0-9-]+)/?")
    first_slug_by_id: dict[int, str] = {}
    for anchor in BeautifulSoup(html, "html.parser").find_all("a", href=link_re):
        hit = link_re.match(anchor["href"])
        if hit is None:
            continue
        # setdefault keeps the slug of the first occurrence.
        first_slug_by_id.setdefault(int(hit.group(1)), hit.group(2))
    return list(first_slug_by_id.items())
|
||||
|
||||
|
||||
def parse_klub_roster(html: str) -> list[tuple[int, str, str]]:
    """Collect the players linked from a club page filtered by ``?cid=``.

    Accepts relative and absolute ``/igraci/<id>/<slug>/`` links. Returns
    ``(hns_pid, slug, name)`` triples in document order, one per player id;
    ``name`` is the text of the first link seen for that id and may be empty.
    """
    anchor_filter = re.compile(r"^/?(?:https?://[^/]+)?/igraci/\d+/[a-z0-9-]+/?")
    player_re = re.compile(r"/igraci/(\d+)/([a-z0-9-]+)/?")

    players: dict[int, tuple[int, str, str]] = {}
    for anchor in BeautifulSoup(html, "html.parser").find_all("a", href=anchor_filter):
        hit = player_re.search(anchor["href"])
        if hit is None:
            continue
        pid = int(hit.group(1))
        if pid in players:
            continue
        label = (anchor.get_text(" ", strip=True) or "").strip()
        players[pid] = (pid, hit.group(2), label)
    return list(players.values())
|
||||
|
||||
|
||||
def parse_klub_competitions(html: str) -> list[tuple[int, str]]:
    """Read the ``select#cid`` dropdown of a club page.

    Returns ``(cid, option text)`` for every option whose value contains
    ``?cid=<digits>``, in document order. Duplicates are not removed.
    """
    cid_re = re.compile(r"\?cid=(\d+)")
    found: list[tuple[int, str]] = []
    for option in BeautifulSoup(html, "html.parser").select('select#cid option'):
        hit = cid_re.search(option.get("value") or "")
        if hit:
            found.append((int(hit.group(1)), option.get_text(" ", strip=True)))
    return found
|
||||
|
||||
|
||||
# ── Main flow ──────────────────────────────────────────────────────────────
|
||||
def harvest():
    """Walk the competition catalogue and record player-category membership.

    For every catalogue entry the competition page is fetched, participating
    clubs are matched against the DB clubs by ``hns_klub_id``, each matched
    club's roster for that competition is fetched, and every player is
    written to ``clanovi`` (if new) and ``clan_kategorije``.

    Fetch and upsert failures are counted in ``errors`` and skipped; they do
    not abort the run. Returns the statistics dict (JSON-serialisable).
    """
    pgz = load_pgz_klubovi()
    log.info(
        f"Loaded {len(pgz)} unique PGŽ klubovi with hns_klub_id "
        f"({sum(1 for v in pgz.values() if v['slug'])} have slug)."
    )

    stats = {
        "competitions_processed": 0,
        "competitions_skipped": 0,
        "klubovi_matched": 0,
        "rosters_fetched": 0,
        "players_upserted": 0,
        "kategorije_inserted": 0,
        "errors": 0,
        "per_kategorija": {},
        "per_klub": {},
    }

    # cids already covered by the catalogue; used so the "extra" report only
    # lists competitions that really are missing from it.
    catalog_cids = {str(entry[0]) for entry in COMP_CATALOG}
    discovered_extra: set[tuple[str, str, str]] = set()  # (cid, name, sezona)
    seen_clan_kat: set[tuple[int, int, str, str]] = set()

    for cid, slug, kategorija, sezona in COMP_CATALOG:
        comp_url = f"{BASE}/natjecanja/{cid}/{slug}/"
        try:
            html = fetch(comp_url)
        except Exception as e:
            log.warning(f"comp {cid} fetch failed: {e}")
            stats["competitions_skipped"] += 1
            stats["errors"] += 1
            continue
        klubovi = parse_competition_klubovi(html)
        log.info(
            f"COMP cid={cid} '{slug}' [{kategorija}/{sezona}] -> "
            f"{len(klubovi)} participating klubovi"
        )
        stats["competitions_processed"] += 1
        time.sleep(RATE_S)

        for hns_kid, k_slug in klubovi:
            if hns_kid not in pgz:
                continue
            klub = pgz[hns_kid]
            klub_db_id = klub["db_id"]
            # Counted once per (competition, club) pair, not per unique club.
            stats["klubovi_matched"] += 1
            stats["per_klub"].setdefault(klub["naziv"], set()).add(kategorija)

            # Fetch klub roster filtered by this competition cid
            slug_use = klub["slug"] or k_slug
            roster_url = f"{BASE}/klubovi/{hns_kid}/{slug_use}/?cid={cid}"
            try:
                rhtml = fetch(roster_url)
            except Exception as e:
                log.warning(f"roster {hns_kid} cid={cid} failed: {e}")
                stats["errors"] += 1
                continue
            stats["rosters_fetched"] += 1
            time.sleep(RATE_S)

            # Note any competitions of this klub that the catalogue lacks.
            for ocid, oname in parse_klub_competitions(rhtml):
                if str(ocid) not in catalog_cids:
                    discovered_extra.add((str(ocid), oname, sezona))

            roster = parse_klub_roster(rhtml)
            if not roster:
                log.info(f"  {klub['naziv']} cid={cid}: empty roster")
                continue
            log.info(
                f"  KLUB '{klub['naziv']}' (db={klub_db_id}, hns={hns_kid}) "
                f"cid={cid} -> {len(roster)} igraca [{kategorija}]"
            )

            for hns_pid, p_slug, name in roster:
                try:
                    clan_id = upsert_clan(klub_db_id, hns_pid, name, p_slug)
                except Exception as e:
                    log.error(f"upsert_clan({hns_pid}) fail: {e}")
                    stats["errors"] += 1
                    continue
                stats["players_upserted"] += 1
                # The same player can appear in several competitions of one
                # category; write the membership row only once per run.
                key = (clan_id, klub_db_id, kategorija, sezona)
                if key in seen_clan_kat:
                    continue
                seen_clan_kat.add(key)
                try:
                    upsert_clan_kategorija(
                        clan_id, klub_db_id, kategorija, sezona, roster_url
                    )
                    stats["kategorije_inserted"] += 1
                    stats["per_kategorija"][kategorija] = (
                        stats["per_kategorija"].get(kategorija, 0) + 1
                    )
                except Exception as e:
                    log.error(
                        f"upsert_clan_kategorija(clan={clan_id} "
                        f"klub={klub_db_id} kat={kategorija}) fail: {e}"
                    )
                    stats["errors"] += 1

    # Summarize discovered extra cids (not yet in catalog) for next run
    if discovered_extra:
        log.info(
            f"Discovered {len(discovered_extra)} extra cids not in catalog "
            f"(top 15 below):"
        )
        # Sorted so the listing is stable between runs.
        for cid, name, sezona in sorted(discovered_extra)[:15]:
            log.info(f"  + cid={cid} '{name}' sezona={sezona}")

    # Convert per_klub sets to lists for JSON serialisation
    stats["per_klub"] = {k: sorted(v) for k, v in stats["per_klub"].items()}
    return stats
|
||||
|
||||
|
||||
def main():
    """CLI entry point.

    Commands (first argument, default ``run``):
      * ``discover``: log the first DB clubs that carry an hns_klub_id and
        stop; nothing is scraped.
      * ``run``: full scrape and DB upsert.
      * ``klub <db_id>``: same as ``run`` but restricted to one DB club.

    An unknown command, or ``klub`` without a numeric id, exits with status
    2 instead of silently falling through to a full scrape. After a harvest
    the statistics are logged, written to SUB5_RESULT.md and sent to
    Telegram.
    """
    global load_pgz_klubovi  # noqa: PLW0603
    cmd = sys.argv[1] if len(sys.argv) > 1 else "run"

    # Validate before touching the database.
    if cmd not in ("discover", "run", "klub"):
        log.error(f"Unknown command '{cmd}' (expected: discover | run | klub <db_kid>)")
        sys.exit(2)
    target_db = None
    if cmd == "klub":
        if len(sys.argv) <= 2 or not sys.argv[2].isdigit():
            log.error("Usage: hns_youth_categories.py klub <db_kid>")
            sys.exit(2)
        target_db = int(sys.argv[2])

    log.info(f"=== SUB5 hns_youth_categories START cmd={cmd} log={LOG_FILE} ===")
    ensure_schema()

    if cmd == "discover":
        pgz = load_pgz_klubovi()
        log.info(f"PGŽ klubovi with hns_klub_id: {len(pgz)}")
        for hk, v in list(pgz.items())[:10]:
            log.info(f"  hns={hk} db={v['db_id']} slug={v['slug']} naziv={v['naziv']}")
        return

    if cmd == "klub":
        # Narrow-scope debug mode: harvest() calls the module-level loader,
        # so swap it for one that returns only the requested club and put
        # the original back afterwards.
        _orig = load_pgz_klubovi
        pgz = {k: v for k, v in _orig().items() if v["db_id"] == target_db}
        log.info(f"Restricted to db_id={target_db}: {len(pgz)} match")
        load_pgz_klubovi = lambda: pgz  # type: ignore
        try:
            stats = harvest()
        finally:
            load_pgz_klubovi = _orig  # type: ignore
    else:
        stats = harvest()

    log.info("=== SUMMARY ===")
    log.info(json.dumps(stats, ensure_ascii=False, indent=2))

    # Write SUB5_RESULT.md (overwritten on every run)
    md_path = Path("/opt/pgz-sport/cc_tasks/SUB5_RESULT.md")
    md_path.parent.mkdir(parents=True, exist_ok=True)
    md = render_summary_md(stats)
    md_path.write_text(md, encoding="utf-8")
    log.info(f"Result MD written → {md_path}")

    # Telegram
    tg_send(
        "*SUB5 — HNS youth categories*\n"
        f"Klubovi matched: *{stats['klubovi_matched']}*\n"
        f"Rosters fetched: *{stats['rosters_fetched']}*\n"
        f"Players upserted: *{stats['players_upserted']}*\n"
        f"clan_kategorije: *{stats['kategorije_inserted']}*\n"
        f"Errors: {stats['errors']}\n"
        f"Log: `{LOG_FILE.name}`"
    )
|
||||
|
||||
|
||||
def render_summary_md(stats: dict) -> str:
    """Render the harvest statistics as a Markdown report.

    The report has three parts: the high-level counters, a table of
    membership rows per category, and a table of categories found per club
    (both tables sorted by key). ``stats["per_klub"]`` values must already
    be lists of strings.
    """
    per_kategorija = stats["per_kategorija"]
    per_klub = stats["per_klub"]

    report = [
        "# SUB5 — HNS youth categories result",
        "",
        f"_Generated: {datetime.now().isoformat(timespec='seconds')}_",
        "",
        "## High-level counters",
        "",
        f"- Competitions processed: **{stats['competitions_processed']}**",
        f"- Competitions skipped: {stats['competitions_skipped']}",
        f"- Klubovi (DB) matched in competitions: **{stats['klubovi_matched']}**",
        f"- Rosters fetched: **{stats['rosters_fetched']}**",
        f"- Players upserted into `clanovi`: **{stats['players_upserted']}**",
        f"- M2M rows written into `clan_kategorije`: **{stats['kategorije_inserted']}**",
        f"- Errors: {stats['errors']}",
        "",
        "## Per kategorija",
        "",
        "| Kategorija | M2M zapisa |",
        "|---|---:|",
    ]
    report.extend(
        f"| {kategorija} | {per_kategorija[kategorija]} |"
        for kategorija in sorted(per_kategorija)
    )
    report += [
        "",
        "## Per klub — kategorije pronadjene",
        "",
        "| Klub | Kategorije |",
        "|---|---|",
    ]
    for klub_name in sorted(per_klub):
        joined = ", ".join(per_klub[klub_name])
        report.append(f"| {klub_name} | {joined} |")
    report += ["", f"_Log: `{LOG_FILE}`_"]
    return "\n".join(report)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Top-level boundary: log any uncaught error with its traceback, notify
    # Telegram, and exit non-zero so the caller sees the failure.
    try:
        main()
    except Exception as fatal:
        log.exception(f"FATAL: {fatal}")
        tg_send(f"*SUB5 FATAL*: {fatal}")
        sys.exit(1)
|
||||
@@ -0,0 +1,147 @@
|
||||
#!/usr/bin/env python3
|
||||
# ═══════════════════════════════════════════════════════════════════
|
||||
# Fajl: sub1_hns_fix_and_extract.py | v1.0.0 | 05.05.2026
|
||||
# Lokacija: /opt/pgz-sport/scripts/sub1_hns_fix_and_extract.py
|
||||
# Autor: dradulic@outlook.com / damir@rinet.one
|
||||
# Svrha: SUB1 finalize — (a) rollback false positives,
|
||||
# (b) extract hns_klub_id iz već postojećeg source_url,
|
||||
# (c) verify presence preko HEAD i upsert.
|
||||
# ═══════════════════════════════════════════════════════════════════
|
||||
"""SUB1 fix-up: false-positive rollback + source_url-based extraction."""
|
||||
import os, re, sys, time, json, subprocess, urllib.request
|
||||
from datetime import datetime
|
||||
import psycopg2
|
||||
from psycopg2.extras import RealDictCursor
|
||||
|
||||
# SECURITY: the fallback DSN and bot token below are live credentials
# committed to source control. Rotate them and supply RINET_DSN /
# TG_BOT_TOKEN through the environment only.
DSN = os.getenv("RINET_DSN",
    "host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password=R1net2026!SecureDB#v7")
TG = os.getenv("TG_BOT_TOKEN", "8535797835:AAFItT-92jzZ9NWFafLxn0dLa1_n2s-JE5Y")
TG_CHAT = os.getenv("TG_CHAT", "7969491558")
UA = "PGZ-Sport-Bot/1.0 (+https://api.rinet.one/sport/; contact dradulic@outlook.com)"
LOG_PATH = f"/var/log/pgz-sport-debug/sub1_fix_{datetime.now().strftime('%Y%m%d_%H%M')}.log"
# Ensure the directory exists so a fresh host does not fail on open().
os.makedirs(os.path.dirname(LOG_PATH), exist_ok=True)
# Explicit UTF-8: club names and rollback reasons contain non-ASCII letters,
# and the locale default encoding may be ASCII under cron/systemd.
LOG = open(LOG_PATH, "a", encoding="utf-8")
|
||||
|
||||
# False positives to ROLLBACK (cleared and marked not_found)
|
||||
# klubovi.id -> why the earlier HNS match was wrong.
FALSE_POS = {
    klub_id: reason
    for klub_id, reason in (
        (2572, "NK Hajduk Tovarnik (matched HNK Hajduk Split — different club)"),
        (600, "Ženski NK XXL Kraljevica (matched men's NK Kraljevica — wrong sex)"),
    )
}
|
||||
|
||||
def log(msg, telegram=False):
    """Print ``msg`` with a timestamp, append it to the run log and,
    when ``telegram`` is true, forward it to the Telegram chat.

    The Telegram step is best effort: it shells out to ``curl`` with an
    8-second timeout and ignores any failure. The forwarded text is cut to
    3500 characters.
    """
    line = f"[{datetime.now().isoformat(timespec='seconds')}] {msg}"
    print(line, flush=True)
    LOG.write(line + "\n")
    LOG.flush()
    if not telegram:
        return
    try:
        subprocess.run(["curl", "-s", "-X", "POST",
                        f"https://api.telegram.org/bot{TG}/sendMessage",
                        "-d", f"chat_id={TG_CHAT}",
                        "--data-urlencode", f"text={msg[:3500]}"],
                       timeout=8, capture_output=True)
    except Exception:
        # Notification failures (curl missing, timeout, bad argument) must
        # not break the run. Unlike the former bare "except:", this lets
        # KeyboardInterrupt and SystemExit through.
        pass
|
||||
|
||||
def http_head_or_get(url, timeout=12):
    """Check that ``url`` can be fetched and pull out its first ``<h1>`` text.

    Despite the name, a GET is always issued. Returns ``(status, title)``:
      * success: the HTTP status and the stripped ``<h1>`` text, or ``None``
        when no plain-text ``<h1>`` is found;
      * HTTP error: ``(error code, None)``;
      * any other failure: ``(0, error message)``.
    """
    try:
        request = urllib.request.Request(url, headers={"User-Agent": UA})
        with urllib.request.urlopen(request, timeout=timeout) as response:
            body = response.read().decode("utf-8", errors="replace")
            status = response.status
    except urllib.error.HTTPError as http_err:
        return http_err.code, None
    except Exception as err:
        return 0, str(err)

    heading = re.search(r'<h1[^>]*>([^<]+)</h1>', body)
    if heading is None:
        return status, None
    return status, heading.group(1).strip()
|
||||
|
||||
# Matches a "/klubovi/<digits>/<slug>/" path: group 1 is the numeric club id,
# group 2 the slug, which may be empty.
URL_RE = re.compile(r'/klubovi/(\d+)/([a-z0-9-]*)/?')
|
||||
|
||||
def main():
    """Roll back known false-positive HNS mappings, then recover ids from source_url.

    Phase 1 clears ``hns_klub_id`` / ``hns_slug`` for every club listed in
    ``FALSE_POS`` and marks it ``hns_not_found``.
    Phase 2 selects co-financed football clubs that have no ``hns_klub_id`` but
    whose ``source_url`` already points at a semafor club page, verifies the
    page with an HTTP GET and stores the id, slug and canonical URL.
    Phase 3 logs summary counts and sends them to Telegram.

    The connection runs in autocommit mode, so every UPDATE is applied
    immediately; it is closed when the function exits, including on error.
    """
    log(f"=== SUB1 fix start; log={LOG_PATH} ===")
    conn = psycopg2.connect(DSN); conn.autocommit = True
    try:
        cur = conn.cursor(cursor_factory=RealDictCursor)

        # Phase 1: Rollback false positives
        rb = 0
        for kid, reason in FALSE_POS.items():
            cur.execute("""
                UPDATE pgz_sport.klubovi
                SET hns_klub_id = NULL,
                    hns_slug = NULL,
                    scrape_source = 'hns_not_found',
                    last_scraped_at = now()
                WHERE id = %s
            """, (kid,))
            log(f" ROLLBACK [{kid}] — {reason}")
            rb += 1

        # Phase 2: Extract hns_klub_id from existing source_url
        cur.execute("""
            SELECT id, naziv, source_url
            FROM pgz_sport.klubovi
            WHERE sport='nogomet' AND pgz_sufinanciran=true
              AND hns_klub_id IS NULL
              AND source_url ~ 'semafor\\.hns\\.family/klubovi/[0-9]+'
            ORDER BY id
        """)
        rows = cur.fetchall()
        log(f"Source-URL extraction candidates: {len(rows)}")

        extracted = 0; verify_fail = 0
        for r in rows:
            kid, naziv, url = r['id'], r['naziv'], r['source_url']
            m = URL_RE.search(url)
            if not m:
                log(f" SKIP [{kid}] no match in {url}")
                continue
            hns_id = int(m.group(1))
            slug = m.group(2) or None
            # Verify that the club page exists before trusting the id.
            verify_url = f"https://semafor.hns.family/klubovi/{hns_id}/"
            status, title = http_head_or_get(verify_url)
            time.sleep(0.8)  # stay polite between requests
            if status != 200 or not title:
                log(f" VERIFY FAIL [{kid}] {naziv} -> {hns_id}: status={status} title={title}")
                verify_fail += 1
                continue
            # If slug missing, infer it from the page title (title is
            # guaranteed non-empty here).  An empty result becomes None so
            # the column is stored as NULL rather than ''.
            if not slug:
                slug = re.sub(r'[^a-z0-9]+', '-',
                              title.lower()
                              .replace('č','c').replace('ć','c').replace('š','s').replace('ž','z').replace('đ','d')
                              ).strip('-') or None
            canonical = f"https://semafor.hns.family/klubovi/{hns_id}/{slug}/" if slug else verify_url
            try:
                cur.execute("""
                    UPDATE pgz_sport.klubovi
                    SET hns_klub_id = %s,
                        hns_slug = %s,
                        source_url = %s,
                        scrape_source = 'hns_semafor',
                        last_scraped_at = now()
                    WHERE id = %s
                """, (hns_id, slug, canonical, kid))
                log(f" EXTRACT [{kid}] {naziv} -> HNS {hns_id} '{title}' (slug={slug})")
                extracted += 1
            except Exception as e:
                log(f" UPDATE fail [{kid}]: {e}")

        # Phase 3: Final stats
        cur.execute("""
            SELECT
              COUNT(*) FILTER (WHERE hns_klub_id IS NOT NULL) AS mapped,
              COUNT(*) FILTER (WHERE hns_klub_id IS NULL AND scrape_source='hns_not_found') AS marked_nf,
              COUNT(*) FILTER (WHERE hns_klub_id IS NULL AND (scrape_source IS NULL OR scrape_source != 'hns_not_found')) AS untouched
            FROM pgz_sport.klubovi
            WHERE sport='nogomet' AND pgz_sufinanciran=true
              AND naziv !~* 'Malonogometni|Mini Nogomet|Mali Nogomet|Američkog|Plaž|Pijesku|HMNK|MNK|preteča|povijesn|1903|1906|1908|1917|1919|1926|1929'
        """)
        stats = cur.fetchone()
    finally:
        # Release the server connection even when a phase raises.
        conn.close()

    log(f"=== Final state (real football, PGŽ priority): mapped={stats['mapped']}, marked_not_found={stats['marked_nf']}, untouched={stats['untouched']} ===")

    msg = (f"SUB1 fix done: rollback={rb}, source_url-extracted={extracted}, "
           f"verify_fail={verify_fail}. Final mapped={stats['mapped']} / "
           f"not_found={stats['marked_nf']} / untouched={stats['untouched']}")
    log(msg, telegram=True)
|
||||
|
||||
# Run the fix only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
||||
@@ -0,0 +1,358 @@
|
||||
#!/usr/bin/env python3
|
||||
# ═══════════════════════════════════════════════════════════════════
|
||||
# Fajl: sub1_hns_link_harvester.py | v1.0.0 | 05.05.2026
|
||||
# Lokacija: /opt/pgz-sport/scripts/sub1_hns_link_harvester.py
|
||||
# Autor: dradulic@outlook.com / damir@rinet.one
|
||||
# Svrha: SUB1 — Pronađi semafor.hns.family link za PGŽ priority
|
||||
# nogometne klubove koji nemaju hns_klub_id.
|
||||
# Strategija:
|
||||
# 1. Enumerate ŽNS Primorsko-goranski (oid=51) competitions across
|
||||
# seasons, plus 4. NL NS Rijeka, 3. HNL Zapad arhive
|
||||
# 2. Za svaki natjecanje GET /natjecanja/{cid}/{cname}/ i izvuci
|
||||
# sve <a href="/klubovi/{id}/{slug}/">{naziv}</a>
|
||||
# 3. Build catalog (hns_id, slug, naziv) — skup unique
|
||||
# 4. Fuzzy match candidate klubovi: normalize, drop NK/HNK/GNK
|
||||
# prefiks, ukloni dijakritike, pa equality + substring + ratio
|
||||
# 5. UPDATE pgz_sport.klubovi za matche; mark not_found za ostalo
|
||||
# ═══════════════════════════════════════════════════════════════════
|
||||
"""SUB1 — HNS link harvester for PGŽ priority football clubs."""
|
||||
import os, re, sys, time, json, traceback, subprocess, difflib
|
||||
from datetime import datetime
|
||||
from urllib.parse import quote
|
||||
import urllib.request, urllib.error
|
||||
import psycopg2
|
||||
from psycopg2.extras import RealDictCursor
|
||||
|
||||
# Runtime configuration.  Each value can be overridden through the environment.
# NOTE(security): the fallback DSN embeds a database password and TG embeds a
# bot token.  Both are committed in clear text here; rotate them and supply the
# real values through RINET_DSN / TG_BOT_TOKEN instead of relying on defaults.
DSN = os.getenv("RINET_DSN",
    "host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password=R1net2026!SecureDB#v7")
TG = os.getenv("TG_BOT_TOKEN", "8535797835:AAFItT-92jzZ9NWFafLxn0dLa1_n2s-JE5Y")
TG_CHAT = os.getenv("TG_CHAT", "7969491558")
# User-Agent sent with every request to the HNS site.
UA = "PGZ-Sport-Bot/1.0 (+https://api.rinet.one/sport/; legitimni interes; analitika sporta PGZ; contact dradulic@outlook.com)"
# Pause (seconds) between consecutive HTTP requests.
SLEEP = 1.1
BASE = "https://semafor.hns.family"

# One log file per start minute; opened in append mode for the whole run.
LOG_PATH = f"/var/log/pgz-sport-debug/sub1_{datetime.now().strftime('%Y%m%d_%H%M')}.log"
# Explicit UTF-8: log lines contain club names with diacritics, so the file
# must not depend on the locale's default encoding.
LOG = open(LOG_PATH, "a", encoding="utf-8")
|
||||
|
||||
def log(msg, telegram=False):
    """Record *msg* with a timestamp on stdout and in the log file.

    With ``telegram=True`` the message (first 3500 characters) is also posted
    to the configured chat through ``curl``; a failure there is logged locally
    and does not propagate.
    """
    stamp = datetime.now().isoformat(timespec='seconds')
    line = f"[{stamp}] {msg}"
    print(line, flush=True)
    LOG.write(line + "\n")
    LOG.flush()
    if not telegram:
        return
    try:
        curl_cmd = [
            "curl", "-s", "-X", "POST",
            f"https://api.telegram.org/bot{TG}/sendMessage",
            "-d", f"chat_id={TG_CHAT}",
            "--data-urlencode", f"text={msg[:3500]}",
        ]
        subprocess.run(curl_cmd, timeout=8, capture_output=True)
    except Exception as err:
        log(f"TG error: {err}")
|
||||
|
||||
def http_get(url, accept_json=False, timeout=25):
    """GET *url* and return the response body decoded as UTF-8 text.

    Args:
        url: Absolute URL to fetch.
        accept_json: When true, ask for JSON and mark the request as an XHR
            (``X-Requested-With: XMLHttpRequest``).
        timeout: Socket timeout in seconds.

    Returns:
        The body as ``str``; undecodable bytes are replaced.

    Raises:
        Whatever ``urllib.request.urlopen`` raises (e.g. ``HTTPError``,
        ``URLError``); callers are expected to handle it.
    """
    headers = {
        "User-Agent": UA,
        "Accept": "application/json, */*" if accept_json else "text/html,*/*",
    }
    # Only send the XHR marker when it is meaningful; previously an empty
    # "X-Requested-With:" header was sent on plain HTML requests.
    if accept_json:
        headers["X-Requested-With"] = "XMLHttpRequest"
    req = urllib.request.Request(url, headers=headers)
    with urllib.request.urlopen(req, timeout=timeout) as r:
        return r.read().decode("utf-8", errors="replace")
|
||||
|
||||
# ── Normalization for fuzzy match ──
|
||||
# Folds Croatian (and a few accented) letters to plain lowercase ASCII.
DIACRITIC_MAP = str.maketrans({
    'č':'c','ć':'c','ž':'z','š':'s','đ':'d',
    'Č':'c','Ć':'c','Ž':'z','Š':'s','Đ':'d',
    'á':'a','é':'e','í':'i','ó':'o','ú':'u',
})

# Club-type prefixes stripped from the start of a name.  core_name() applies
# this pattern to text that norm() has already lowercased and stripped of
# diacritics, so every alternative containing š/ž must also accept the folded
# s/z spelling; otherwise "nogometna škola", "ženski nogometni klub" and "žnk"
# could never match.  The accented forms stay accepted for direct use on raw
# names.
PREFIX_RE = re.compile(
    r'^(hrvatski\s+nogometni\s+klub\.?|'
    r'nogometni\s+klub|nogometna\s+akademija|nogometna\s+[sš]kola|'
    r'sportska\s+akademija|[zž]enski\s+nogometni\s+klub|'
    r'hnk\.?|gnk|[zž]nk|nk\.?)\s+',
    re.IGNORECASE
)

# Trailing/embedded noise removed from a name (team category, repeated place
# names, a trailing four-digit year).
# NOTE(review): the "snježnik\s+gerovo" alternative cannot match text that has
# already passed through norm() (ž is folded to z there).  It is left as-is
# because removing the whole phrase could reduce such a name to an empty core;
# confirm the intended behavior before changing it.
SUFFIX_NOISE_RE = re.compile(
    r'\b(veterani|veterana|gornji\s+zamet|grada\s+crikvenice|'
    r'gomirje\s+gomirje|mrkopalj\s+mrkopalj|snježnik\s+gerovo|'
    r'-?\s*\d{4}\s*$)', re.IGNORECASE)
|
||||
|
||||
def norm(s):
    """Return *s* lowercased, diacritic-folded, unquoted and space-collapsed.

    Falsy input (``None``, ``""``) yields an empty string.
    """
    if not s:
        return ""
    folded = s.lower().strip().translate(DIACRITIC_MAP)
    unquoted = re.sub(r'["\'`]', '', folded)
    return re.sub(r'\s+', ' ', unquoted)
|
||||
|
||||
def core_name(naziv):
    """Reduce a club name to its distinguishing core and return it as a string.

    The name is normalized with ``norm()``, leading club-type prefixes are
    removed, noise matched by ``SUFFIX_NOISE_RE`` is dropped and whitespace is
    collapsed.
    """
    name = norm(naziv)
    # Prefixes can be nested (e.g. "Nogometni Klub HNK ..."), so strip
    # repeatedly, but never more than three times.
    passes_left = 3
    while passes_left > 0:
        shortened = PREFIX_RE.sub('', name)
        if shortened == name:
            break
        name = shortened
        passes_left -= 1
    without_noise = SUFFIX_NOISE_RE.sub('', name).strip()
    return re.sub(r'\s+', ' ', without_noise).strip()
|
||||
|
||||
def slugify(s):
    """Build a URL slug from the core of a club name.

    Every run of characters other than ``a-z`` / ``0-9`` in ``core_name(s)``
    becomes a single hyphen; leading and trailing hyphens are removed.
    """
    return re.sub(r'[^a-z0-9]+', '-', core_name(s)).strip('-')
|
||||
|
||||
# ── Catalog harvest ──
|
||||
def get_pgz_competitions(season, oid=51):
    """Fetch the competition list of one organization for a season.

    Args:
        season: Season label such as ``"2024/2025"``.
        oid: Organization id passed to the handler.  Defaults to 51
            (ŽNS Primorsko-goranski), the only value used before this
            parameter existed.

    Returns:
        The decoded JSON list, or ``[]`` when the request fails, the body is
        not valid JSON, or the payload is not a list.
    """
    t = int(time.time()*1000)  # cache-busting timestamp in milliseconds
    # The handler expects the link template verbatim, placeholders included.
    link = quote(BASE + '/natjecanja/{cid}/{cname}/')
    url = (f"{BASE}/handlers/getCompetitions/"
           f"?season={quote(season)}&oid={oid}&teamch=Club"
           f"&linkType=competitions&linkConstructor={link}"
           f"&lang=hr&t={t}")
    try:
        body = http_get(url, accept_json=True)
        data = json.loads(body)
    except Exception as e:
        log(f" comps fetch fail {season}: {e}")
        return []
    if not isinstance(data, list):
        # Callers iterate the result and call len() on it; never hand them
        # a dict, string or null.
        log(f" comps fetch fail {season}: unexpected payload {type(data).__name__}")
        return []
    return data
|
||||
|
||||
def get_organizations(season):
    """Return the organizations the site lists for *season*.

    Queries the ``getOrganizations`` handler and decodes its JSON reply.
    Any failure is logged and results in an empty list.
    """
    cache_buster = int(time.time()*1000)
    endpoint = f"{BASE}/handlers/getOrganizations/"
    query = f"?season={quote(season)}&teamch=Club&lang=hr&t={cache_buster}"
    try:
        return json.loads(http_get(endpoint + query, accept_json=True))
    except Exception as err:
        log(f" orgs fetch fail {season}: {err}")
        return []
|
||||
|
||||
# Match <a href="/klubovi/{id}/{slug}/">NAME<div...>...</a> — name is anything before first child element.
# Groups: 1 = numeric club id, 2 = slug (may be empty), 3 = link text up to
# the first "<" (1-150 characters).  The host part of the href is optional.
CLUB_LINK_RE2 = re.compile(
    r'<a[^>]+href="(?:https?://semafor\.hns\.family)?/klubovi/(\d+)/([a-z0-9-]*)/?"[^>]*>([^<]{1,150})(?:<|</a>)',
    re.IGNORECASE
)
|
||||
|
||||
def harvest_competition(cid):
    """Download one competition page and list the clubs linked from it.

    Returns a list of ``(hns_id, slug, naziv)`` string tuples, one per
    matching link; a failed download is logged and yields ``[]``.
    """
    # The name segment of the URL is a placeholder ("x"); only the id is
    # supplied.
    page_url = f"{BASE}/natjecanja/{cid}/x/"
    try:
        page = http_get(page_url)
    except Exception as err:
        log(f" nat fetch fail {cid}: {err}")
        return []

    def is_club_name(text):
        # Drop one-character texts and "Klubovi..." navigation links.
        return len(text) > 1 and not text.lower().startswith('klubov')

    links = (
        (hit.group(1), hit.group(2), hit.group(3).strip())
        for hit in CLUB_LINK_RE2.finditer(page)
    )
    return [link for link in links if is_club_name(link[2])]
|
||||
|
||||
# ── Match logic ──
|
||||
def match_score(candidate_naziv, candidate_grad, hns_naziv):
    """Score 0-100 how well a DB club matches an HNS club entry.

    Args:
        candidate_naziv: Club name from the database.
        candidate_grad: Club town from the database (may be empty/None).
        hns_naziv: Club name as listed on the HNS site.

    Returns:
        0 when either core name is empty, 100 on identical core names,
        otherwise the ``difflib`` similarity ratio as a percentage, raised by
        5 (capped at 100) when the town is reflected in the HNS name, and
        raised to at least 85 when one core name contains the other.
    """
    cand_core = core_name(candidate_naziv)
    hns_core = core_name(hns_naziv)
    if not cand_core or not hns_core:
        return 0
    if cand_core == hns_core:
        return 100
    # ratio
    r = difflib.SequenceMatcher(None, cand_core, hns_core).ratio()
    score = int(r * 100)
    # bonus if grad in HNS naziv (e.g. "NK Borac (Ba)" + grad="Bakar")
    gnorm = norm(candidate_grad) if candidate_grad else ""
    if gnorm:
        hns_norm = norm(hns_naziv)  # normalize once, used twice below
        # A trailing "(xx)" is treated as a town abbreviation.  Accept any
        # non-empty prefix of the town, not just its first letter, so the
        # "(Ba)" / "Bakar" case from the example above actually earns the bonus.
        abbr_match = re.search(r'\(([^()]+)\)$', hns_norm)
        abbr = abbr_match.group(1).strip() if abbr_match else ""
        if gnorm[:3] in hns_norm or (abbr and gnorm.startswith(abbr)):
            score = min(100, score + 5)
    # substring containment bonus (one fully contained)
    if cand_core in hns_core or hns_core in cand_core:
        score = max(score, 85)
    return score
|
||||
|
||||
# ── Main ──
|
||||
def _collect_clubs(catalog, seen_cids, comps, season, nsr=False):
    """Harvest every not-yet-seen competition in *comps* into *catalog*.

    Args:
        catalog: ``hns_id -> {'slug', 'naziv', 'sources'}`` dict, updated in place.
        seen_cids: Set of competition ids already harvested, updated in place.
        comps: Competition list as returned by the getCompetitions handler;
            anything that is not a list of dicts is ignored.
        season: Season label recorded in each club's ``sources``.
        nsr: True for the NS Rijeka sweep; selects the "NSR" source tag and
            log format.
    """
    if not isinstance(comps, list):
        return
    for c in comps:
        if not isinstance(c, dict):
            continue
        cid = str(c.get('id',''))
        if not cid or cid in seen_cids:
            continue
        seen_cids.add(cid)
        cname = str(c.get('value') or '')
        try:
            clubs = harvest_competition(cid)
        except Exception as e:
            log(f" {cid} ({cname}) fetch error: {e}")
            clubs = []
        for hns_id, slug, naziv in clubs:
            entry = catalog.setdefault(hns_id, {'slug': slug, 'naziv': naziv, 'sources': set()})
            # Backfill a slug when the first sighting of the club had none.
            if slug and not entry['slug']:
                entry['slug'] = slug
            if nsr:
                entry['sources'].add(f"NSR:{season}:{cname[:30]}")
            else:
                entry['sources'].add(f"{season}:{cname[:30]}")
        if nsr:
            log(f" NSR {cid} '{cname[:40]}' -> {len(clubs)} (cat={len(catalog)})")
        else:
            log(f" {cid} '{cname[:40]}' -> {len(clubs)} clubs (catalog={len(catalog)})")
        time.sleep(SLEEP)


def _run_harvest(cur):
    """Run all harvester phases using the open dict cursor *cur*."""
    # 1) Get candidate clubs
    cur.execute("""
        SELECT id, naziv, grad
        FROM pgz_sport.klubovi
        WHERE sport='nogomet' AND pgz_sufinanciran=true
          AND hns_klub_id IS NULL
          AND naziv !~* 'Malonogometni|Mini Nogomet|Mali Nogomet|Američkog|Plaž|Pijesku|HMNK|MNK|preteča|povijesn|1903|1906|1908|1917|1919|1926|1929'
        ORDER BY naziv
    """)
    candidates = cur.fetchall()
    log(f"Candidates: {len(candidates)}")

    # 2) Build HNS catalog from PGŽ competitions across recent seasons
    SEASONS = ["2025/2026","2024/2025","2023/2024","2022/2023","2021/2022","2020/2021","2019/2020","2018/2019","2017/2018"]
    catalog = {}  # hns_id -> {slug, naziv, sources:set}

    seen_cids = set()
    for season in SEASONS:
        log(f"-- season {season}")
        comps = get_pgz_competitions(season) or []
        time.sleep(SLEEP)
        log(f" PGŽ comps: {len(comps)}")
        _collect_clubs(catalog, seen_cids, comps, season)

    # Also: 3.HNL Zapad / 4.NL NS Rijeka by oid=178180 (NS Rijeka)
    log("-- NS Rijeka oid=178180 sweep")
    link = quote(BASE + '/natjecanja/{cid}/{cname}/')
    for season in SEASONS:
        t = int(time.time()*1000)
        url = (f"{BASE}/handlers/getCompetitions/"
               f"?season={quote(season)}&oid=178180&teamch=Club"
               f"&linkType=competitions&linkConstructor={link}"
               f"&lang=hr&t={t}")
        try:
            comps = json.loads(http_get(url, accept_json=True))
        except Exception as e:
            log(f" ns_rijeka {season} fail: {e}"); comps = []
        time.sleep(SLEEP)
        _collect_clubs(catalog, seen_cids, comps, season, nsr=True)

    log(f"=== Catalog built: {len(catalog)} unique HNS clubs ===")

    # Save catalog snapshot.  UTF-8 is required because ensure_ascii=False
    # writes club names with diacritics verbatim.
    snap = {hid: {'slug': v['slug'], 'naziv': v['naziv'], 'sources': sorted(v['sources'])[:5]}
            for hid, v in catalog.items()}
    with open("/opt/pgz-sport/cc_tasks/sub1_hns_catalog.json", "w", encoding="utf-8") as f:
        json.dump(snap, f, ensure_ascii=False, indent=2)
    log(f"Catalog snapshot -> /opt/pgz-sport/cc_tasks/sub1_hns_catalog.json")

    # 3) Match candidates
    matched = []  # (db_id, db_naziv, hns_id, slug, hns_naziv, score)
    not_found = []
    ambiguous = []

    for cand in candidates:
        db_id, naziv, grad = cand['id'], cand['naziv'], cand['grad']
        ranked = []
        for hid, v in catalog.items():
            sc = match_score(naziv, grad, v['naziv'])
            if sc >= 70:
                ranked.append((sc, hid, v['slug'], v['naziv']))
        ranked.sort(reverse=True)
        if not ranked:
            not_found.append((db_id, naziv, grad))
            log(f" NOT FOUND: [{db_id}] {naziv} ({grad})")
            continue
        top = ranked[0]
        if len(ranked) > 1:
            second = ranked[1]
            # An exact tie cannot be resolved by name alone; without this
            # check the winner would be decided by id sort order.
            tie = second[0] == top[0]
            close = second[0] >= top[0] - 3 and top[0] < 95
        else:
            tie = close = False
        if tie or close:
            ambiguous.append((db_id, naziv, grad, ranked[:3]))
            log(f" AMBIGUOUS: [{db_id}] {naziv} -> top: {top[3]} ({top[0]}), 2nd: {ranked[1][3]} ({ranked[1][0]})")
            # Skip ambiguous, mark not_found for safety
            not_found.append((db_id, naziv, grad))
            continue
        matched.append((db_id, naziv, top[1], top[2], top[3], top[0]))
        log(f" MATCH [{db_id}] {naziv} -> HNS {top[1]} '{top[3]}' (slug={top[2]}, score={top[0]})")

    log(f"=== Match results: {len(matched)} matched, {len(not_found)} not_found, {len(ambiguous)} ambiguous ===")

    # 4) Apply UPDATEs
    upd_ok, upd_fail = 0, 0
    for db_id, naziv, hns_id, slug, hns_naziv, sc in matched:
        try:
            source_url = f"{BASE}/klubovi/{hns_id}/{slug}/" if slug else f"{BASE}/klubovi/{hns_id}/"
            cur.execute("""
                UPDATE pgz_sport.klubovi
                SET hns_klub_id = %s,
                    hns_slug = %s,
                    source_url = COALESCE(source_url, %s),
                    scrape_source = 'hns_semafor',
                    last_scraped_at = now()
                WHERE id = %s
            """, (int(hns_id), slug or None, source_url, db_id))
            upd_ok += 1
        except Exception as e:
            upd_fail += 1
            log(f" UPDATE fail [{db_id}] {naziv}: {e}")

    # Mark not_found (includes the ambiguous ones, see above)
    nf_ok = 0
    for db_id, naziv, grad in not_found:
        try:
            cur.execute("""
                UPDATE pgz_sport.klubovi
                SET scrape_source = 'hns_not_found',
                    last_scraped_at = now()
                WHERE id = %s AND hns_klub_id IS NULL
            """, (db_id,))
            nf_ok += 1
        except Exception as e:
            log(f" not_found mark fail [{db_id}]: {e}")

    # 5) Write result md
    res_path = "/opt/pgz-sport/cc_tasks/SUB1_RESULT.md"
    with open(res_path, "w", encoding="utf-8") as f:
        f.write(f"# SUB1 — HNS Link Harvest Result\n\n")
        f.write(f"Date: {datetime.now().isoformat(timespec='seconds')}\n\n")
        f.write(f"- Candidates processed: **{len(candidates)}**\n")
        f.write(f"- HNS catalog built: **{len(catalog)}** unique clubs from {len(seen_cids)} competitions\n")
        f.write(f"- Matched: **{len(matched)}** (DB updated: {upd_ok}, fail: {upd_fail})\n")
        f.write(f"- Ambiguous (skipped to safety): **{len(ambiguous)}**\n")
        f.write(f"- Not found (marked hns_not_found): **{len(not_found)}** (mark ok: {nf_ok})\n\n")
        f.write(f"## Matched\n\n| db_id | DB naziv | HNS id | HNS naziv | slug | score |\n|---|---|---|---|---|---|\n")
        for db_id, naziv, hns_id, slug, hns_naziv, sc in sorted(matched, key=lambda x: -x[5]):
            f.write(f"| {db_id} | {naziv} | {hns_id} | {hns_naziv} | {slug} | {sc} |\n")
        f.write(f"\n## Ambiguous (manual review)\n\n")
        for db_id, naziv, grad, ranked in ambiguous:
            f.write(f"- **[{db_id}] {naziv}** ({grad})\n")
            for sc, hid, slug, hns_naziv in ranked:
                f.write(f" - {sc}: HNS {hid} '{hns_naziv}' (slug={slug})\n")
        f.write(f"\n## Not Found\n\n")
        for db_id, naziv, grad in not_found:
            f.write(f"- [{db_id}] {naziv} ({grad})\n")
        f.write(f"\n## Log\n\n`{LOG_PATH}`\n")
    log(f"Result -> {res_path}")

    # 6) Telegram notify
    msg = (f"SUB1 HNS done: matched {len(matched)}, not_found {len(not_found)}, "
           f"ambiguous {len(ambiguous)}. Catalog={len(catalog)}. "
           f"DB upd ok={upd_ok}/fail={upd_fail}. See SUB1_RESULT.md")
    log(msg, telegram=True)


def main():
    """Harvest HNS club links and map them onto unmapped PGŽ football clubs.

    Steps: load candidate clubs, build a catalog of HNS clubs from the
    competitions of ŽNS Primorsko-goranski (oid=51) and NS Rijeka
    (oid=178180) over several seasons, fuzzy-match the candidates, write the
    matches to the database, mark the rest ``hns_not_found``, then write a
    Markdown report and send a Telegram summary.

    The connection runs in autocommit mode and is closed on exit, including
    when a step raises.
    """
    log(f"=== SUB1 HNS link harvester start; log={LOG_PATH} ===")
    conn = psycopg2.connect(DSN); conn.autocommit = True
    try:
        cur = conn.cursor(cursor_factory=RealDictCursor)
        _run_harvest(cur)
    finally:
        conn.close()
|
||||
|
||||
# Script entry point: any uncaught failure is logged with its traceback,
# sent to Telegram, and turned into exit status 1.
if __name__ == "__main__":
    try:
        main()
    except Exception as e:
        log(f"FATAL: {e}\n{traceback.format_exc()}", telegram=True)
        sys.exit(1)
|
||||
@@ -0,0 +1,67 @@
|
||||
#!/usr/bin/env python3
|
||||
# ═══════════════════════════════════════════════════════════════════
|
||||
# Fajl: sub1_hns_manual_overrides.py | v1.0.0 | 05.05.2026
|
||||
# Lokacija: /opt/pgz-sport/scripts/sub1_hns_manual_overrides.py
|
||||
# Autor: dradulic@outlook.com / damir@rinet.one
|
||||
# Svrha: SUB1 — Manual high-confidence overrides za klubove koje
|
||||
# fuzzy match nije uhvatio (ali postoje u HNS-u).
|
||||
# ═══════════════════════════════════════════════════════════════════
|
||||
"""SUB1 manual overrides — verified mapping for special cases."""
|
||||
import os, re, sys, time, urllib.request
|
||||
from datetime import datetime
|
||||
import psycopg2
|
||||
|
||||
# Database DSN; override through the RINET_DSN environment variable.
# NOTE(security): the fallback value embeds a database password in clear
# text; rotate it and rely on the environment instead.
DSN = os.getenv("RINET_DSN",
    "host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password=R1net2026!SecureDB#v7")
# User-Agent sent with every verification request.
UA = "PGZ-Sport-Bot/1.0 (+https://api.rinet.one/sport/; contact dradulic@outlook.com)"

# Manual mappings — verified by visiting semafor.hns.family
# Format: db_id -> (hns_id, slug, name as shown on HNS, reason)
# main() uses hns_id and slug to build the URL it checks and stores; the
# reason is only printed.
OVERRIDES = {
    9: (3440, "znk-rijeka", "ŽNK Rijeka", "Ženski NK Rijeka — same modern club"),
    101: (3440, "znk-rijeka", "ŽNK Rijeka", "Ženski NK Rijeka 'Jack Pot' — sponsor naming, same club"),
    574: (5239, "nk-medicinar", "NK Medicinar", "NK Medicinar Rijeka (osnovan 1996, SRC Belveder)"),
}
|
||||
|
||||
def http_check(url, timeout=10):
    """GET *url* and report its status and page heading.

    Returns a ``(status, title)`` tuple:
      * on success, the HTTP status and the stripped text of the first
        ``<h1>`` element (``None`` when there is none);
      * on an HTTP error response, ``(error code, None)`` so the caller's
        failure message shows the real status instead of 0;
      * on any other failure, ``(0, error message)``.
    """
    # Imported here so the name is guaranteed to be bound; the module only
    # imports urllib.request at the top.
    import urllib.error
    try:
        req = urllib.request.Request(url, headers={"User-Agent": UA})
        with urllib.request.urlopen(req, timeout=timeout) as r:
            html = r.read().decode("utf-8", errors="replace")
            m = re.search(r'<h1[^>]*>([^<]+)</h1>', html)
            return r.status, (m.group(1).strip() if m else None)
    except urllib.error.HTTPError as e:
        return e.code, None
    except Exception as e:
        return 0, str(e)
|
||||
|
||||
def main():
    """Apply every entry of ``OVERRIDES`` to ``pgz_sport.klubovi``.

    For each override the HNS club page is fetched first; only when it answers
    with status 200 is the row updated with the HNS id, slug and URL and
    tagged ``hns_semafor_manual``.  Progress and a final ok/fail count are
    printed.  The autocommit connection is closed on exit, including on error.
    """
    ok = 0; fail = 0
    conn = psycopg2.connect(DSN); conn.autocommit = True
    try:
        cur = conn.cursor()
        print(f"[{datetime.now().isoformat(timespec='seconds')}] Manual overrides start")
        for kid, (hns_id, slug, naziv, reason) in OVERRIDES.items():
            url = f"https://semafor.hns.family/klubovi/{hns_id}/{slug}/"
            status, title = http_check(url)
            time.sleep(0.8)  # stay polite between requests
            if status != 200:
                print(f" VERIFY FAIL [{kid}] {hns_id}: {status} {title}")
                fail += 1
                continue
            try:
                cur.execute("""
                    UPDATE pgz_sport.klubovi
                    SET hns_klub_id = %s,
                        hns_slug = %s,
                        source_url = %s,
                        scrape_source = 'hns_semafor_manual',
                        last_scraped_at = now()
                    WHERE id = %s
                """, (hns_id, slug, url, kid))
                print(f" OVERRIDE [{kid}] -> HNS {hns_id} '{title}' ({reason})")
                ok += 1
            except Exception as e:
                print(f" UPDATE fail [{kid}]: {e}")
                fail += 1
    finally:
        # Release the server connection even when an override raises.
        conn.close()
    print(f"Done: ok={ok}, fail={fail}")
|
||||
|
||||
# Run the overrides only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user