HNS+UI: 5 novih endpointa + multi-sport schema (M2M kategorije + player_stats)

Endpoints:
- GET /api/v2/enrich-sources — sport→source mapping
- GET /api/v2/klubovi/priority-sort — financirani/godišnjak prvi
- GET /api/v2/clan/{id}/kategorije — many-to-many kategorije
- GET /api/v2/clan/{id}/full — kompletna slika (profil+kategorije+sezone+utakmice+stats)
- POST /api/v2/export/klubovi — XLSX export selektiranih

Schema:
- pgz_sport.clan_kategorije (M2M: igrač u juniorskoj+seniorskoj)
- pgz_sport.player_stats (multi-sport: nogomet/košarka/rukomet/odbojka/vaterpolo)
- pgz_sport.klub_roster (multi-source)
- pgz_sport.enrichment_sources (sport→izvor)
- View: v_pgz_priority_klubovi (financiran || u_godisnjaku)
- View: v_klubovi_priority_sort (priority sort)

Sport harvesters scaffold:
- scripts/sport_harvesters/__base.py (SportHarvester class)
- hks_basketball.py, hrs_handball.py, hos_volleyball.py, hvs_waterpolo.py
This commit is contained in:
2026-05-05 10:42:49 +02:00
parent c68fd4471e
commit 9fb512932a
10 changed files with 4765 additions and 0 deletions
File diff suppressed because it is too large Load Diff
+111
View File
@@ -18,6 +18,7 @@ Changes (2026-05-05, sub-agent W5):
from fastapi import FastAPI, HTTPException, Query, Body, Header, Depends, UploadFile, File, Form, Request from fastapi import FastAPI, HTTPException, Query, Body, Header, Depends, UploadFile, File, Form, Request
import json import json
import time
from fastapi.middleware.cors import CORSMiddleware from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel from pydantic import BaseModel
from typing import Optional, List from typing import Optional, List
@@ -2072,6 +2073,116 @@ def dashboard_hns_coverage():
return stats[0] if stats else {} return stats[0] if stats else {}
@app.get("/api/v2/enrich-sources")
def enrich_sources():
    """Return the sport -> source mapping used by the frontend "Obogati podatke" button.

    Reads every row of ``pgz_sport.enrichment_sources`` ordered by ``sport``
    and returns them under the ``sources`` key.
    """
    query = "SELECT * FROM pgz_sport.enrichment_sources ORDER BY sport"
    return {"sources": fetch(query)}
@app.get("/api/v2/clan/{clan_id}/kategorije")
def clan_kategorije(clan_id: int):
    """Return the categories (many-to-many) recorded for one player.

    Rows come from ``pgz_sport.clan_kategorije`` for the given ``clan_id``,
    newest season first, then by category name.
    """
    sql = """
        SELECT kategorija, sezona, klub_id, source, source_url, scraped_at
        FROM pgz_sport.clan_kategorije WHERE clan_id = %s
        ORDER BY sezona DESC, kategorija
    """
    found = fetch(sql, (clan_id,))
    payload = {"clan_id": clan_id}
    payload["kategorije"] = found
    return payload
@app.get("/api/v2/klubovi/priority-sort")
def klubovi_priority_sort(sport: Optional[str] = None, limit: int = 500):
    """List clubs with priority clubs (funded or in the yearbook) first.

    Args:
        sport: Optional exact-match filter on the ``sport`` column; an empty
            value disables the filter.
        limit: Maximum number of rows to return. Negative values are clamped
            to 0 because PostgreSQL rejects a negative ``LIMIT`` and the
            request would otherwise fail with a database error.

    Returns:
        ``{"count": <number of rows>, "rows": [...]}`` where every row is a
        record of ``pgz_sport.v_klubovi_priority_sort`` extended with the
        ``sportasa``, ``hns_roster`` and ``potpora_ukupno`` aggregates.
    """
    limit = max(0, limit)

    # Only a constant fragment is interpolated into the SQL text; the user
    # supplied value always travels as a bound parameter.
    where = ""
    params = []
    if sport:
        where = " WHERE sport = %s"
        params.append(sport)

    rows = fetch(f"""
        SELECT k.*, k.priority_label,
        (SELECT count(*) FROM pgz_sport.clanovi WHERE klub_id = k.id) AS sportasa,
        (SELECT count(*) FROM pgz_sport.hns_klub_roster WHERE klub_id = k.id) AS hns_roster,
        (SELECT sum(iznos) FROM pgz_sport.potpore_nositelji WHERE klub_id = k.id OR naziv_kluba ILIKE k.naziv) AS potpora_ukupno
        FROM pgz_sport.v_klubovi_priority_sort k
        {where}
        ORDER BY priority, potpora_ukupno DESC NULLS LAST, naziv
        LIMIT %s
    """, (*params, limit))
    return {"count": len(rows), "rows": rows}
@app.get("/api/v2/clan/{clan_id}/full")
def clan_full(clan_id: int):
    """Return the full picture of one player.

    The response bundles the profile row, the player's categories, HNS
    seasons, the 30 most recent HNS matches and multi-sport statistics,
    plus a small ``stats`` summary with the size of each list. When no
    profile row exists the function returns ``{"error": "not_found"}``.
    """
    found = fetch("SELECT * FROM pgz_sport.clanovi WHERE id = %s", (clan_id,))
    if not found:
        return {"error": "not_found"}

    key = (clan_id,)
    kategorije = fetch(
        "SELECT * FROM pgz_sport.clan_kategorije WHERE clan_id = %s ORDER BY sezona DESC",
        key,
    )
    seasons = fetch(
        "SELECT * FROM pgz_sport.hns_player_seasons WHERE clan_id = %s ORDER BY sezona DESC",
        key,
    )
    matches = fetch(
        "SELECT * FROM pgz_sport.hns_player_matches WHERE clan_id = %s ORDER BY datum DESC NULLS LAST LIMIT 30",
        key,
    )
    multi_stats = fetch(
        "SELECT * FROM pgz_sport.player_stats WHERE clan_id = %s ORDER BY sezona DESC",
        key,
    )

    summary = {
        "total_seasons": len(seasons),
        "total_matches": len(matches),
        "total_kategorije": len(kategorije),
    }
    return {
        "profile": found[0],
        "kategorije": kategorije,
        "hns_seasons": seasons,
        "hns_matches": matches,
        "multi_sport_stats": multi_stats,
        "stats": summary,
    }
@app.post("/api/v2/export/klubovi")
def export_klubovi(req: dict):
    """Export the selected clubs as an XLSX download.

    Args:
        req: JSON body; ``req["ids"]`` must be a non-empty list of club ids.

    Returns:
        A ``StreamingResponse`` carrying the workbook, or a small
        ``{"error": ...}`` dict when openpyxl is missing, no ids were sent,
        or the ids are not a list of integers.
    """
    import io
    from fastapi.responses import StreamingResponse
    try:
        from openpyxl import Workbook
    except ImportError:
        return {"error": "openpyxl not installed"}

    raw_ids = req.get("ids", [])
    if not raw_ids:
        return {"error": "no ids"}
    # Validate before the values reach ``ANY(%s)``: a bare string or a list of
    # non-numeric values would otherwise surface as a database error.
    if not isinstance(raw_ids, (list, tuple)):
        return {"error": "invalid ids"}
    try:
        ids = sorted({int(value) for value in raw_ids})
    except (TypeError, ValueError):
        return {"error": "invalid ids"}

    rows = fetch("""
        SELECT k.id, k.naziv, k.sport, k.razina, k.oib, k.grad,
        k.financiran, k.u_godisnjaku, k.priority_label,
        (SELECT count(*) FROM pgz_sport.clanovi WHERE klub_id = k.id) AS sportasa,
        (SELECT sum(iznos) FROM pgz_sport.potpore_nositelji WHERE klub_id = k.id OR naziv_kluba ILIKE k.naziv) AS potpora
        FROM pgz_sport.v_klubovi_priority_sort k
        WHERE k.id = ANY(%s)
        ORDER BY k.priority, k.naziv
    """, (ids,))

    wb = Workbook()
    ws = wb.active
    ws.title = "Klubovi"
    if rows:
        # Column order follows the first row; header cells are prettified
        # ("u_godisnjaku" -> "U Godisnjaku").
        headers = list(rows[0].keys())
        ws.append([h.replace('_', ' ').title() for h in headers])
        for r in rows:
            ws.append([r.get(h) for h in headers])

    buf = io.BytesIO()
    wb.save(buf)
    buf.seek(0)
    return StreamingResponse(
        buf,
        media_type="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        headers={"Content-Disposition": f"attachment; filename=klubovi_export_{int(time.time())}.xlsx"}
    )
@app.get("/") @app.get("/")
def root(request: Request): def root(request: Request):
host = request.headers.get("host", "") host = request.headers.get("host", "")
+112
View File
@@ -0,0 +1,112 @@
#!/usr/bin/env python3
"""HNS sezone retry — pojednostavljen extract."""
import os, time, re, json, sys
from datetime import datetime
import psycopg2
from psycopg2.extras import RealDictCursor
from playwright.sync_api import sync_playwright
# Connection string for the harvest database. The RINET_DSN environment
# variable overrides the built-in fallback.
# NOTE(review): the fallback embeds a database password in source control;
# rotate it and drop the default once every runner exports RINET_DSN.
DSN = os.getenv(
    "RINET_DSN",
    "host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password=R1net2026!SecureDB#v7",
)
def find_seasons_in_obj(obj, found=None):
    """Collect every dict that has a 'season' or 'sezona' key.

    Walks ``obj`` (nested dicts and lists) depth-first in document order.
    Matching dicts are appended to ``found`` (a new list when omitted),
    which is also the return value. A matching dict is still searched for
    nested matches.
    """
    matches = [] if found is None else found
    pending = [obj]
    while pending:
        node = pending.pop()
        if isinstance(node, dict):
            if 'season' in node or 'sezona' in node:
                matches.append(node)
            children = list(node.values())
        elif isinstance(node, list):
            children = node
        else:
            continue
        # Push children reversed so they are popped in their original order.
        pending.extend(reversed(children))
    return matches
# Captures the JSON payload of the `__NEXT_DATA__` script tag in the page HTML.
_NEXT_DATA_RE = re.compile(r'__NEXT_DATA__"\s*type="application/json">([^<]+)</script>')
# Matches a plain-text line of the form "20YY/YY <text> <n> [<n> ...]".
_SEASON_LINE_RE = re.compile(r'^(20\d{2}/\d{2})\s+(.+?)\s+(\d+(?:\s+\d+)*)\s*$')


def _extract_season_rows(page):
    """Return season dicts scraped from the page that is currently loaded.

    The embedded ``__NEXT_DATA__`` JSON is tried first; when it yields no
    rows, the visible body text is scanned line by line instead.
    """
    rows = []
    m = _NEXT_DATA_RE.search(page.content())
    if m:
        try:
            for s in find_seasons_in_obj(json.loads(m.group(1))):
                sezona = s.get('season') or s.get('sezona')
                if sezona:
                    rows.append({'sezona': str(sezona), 'klub': '', 'natjecanje': '', 'nastupi': 0, 'golovi': 0})
        except (ValueError, RecursionError):
            # Malformed or pathologically nested JSON: best effort, the text
            # fallback below still gets its chance.
            pass
    if rows:
        return rows

    body = page.locator('body').inner_text()
    for line in body.split('\n'):
        match = _SEASON_LINE_RE.match(line.strip())
        if not match:
            continue
        nums = [int(x) for x in match.group(3).split()]
        rows.append({
            'sezona': match.group(1), 'klub': match.group(2)[:200], 'natjecanje': '',
            'nastupi': nums[0] if nums else 0,
            'golovi': nums[1] if len(nums) > 1 else 0,
        })
    return rows


def _store_season_rows(conn, target, rows):
    """Insert ``rows`` for ``target`` and return how many were actually written."""
    inserted = 0
    with conn.cursor() as cur:
        for r in rows:
            try:
                cur.execute("""
                    INSERT INTO pgz_sport.hns_player_seasons
                    (hns_igrac_id, clan_id, sezona, klub_naziv, natjecanje, nastupi, golovi)
                    VALUES (%s, %s, %s, %s, %s, %s, %s)
                    ON CONFLICT DO NOTHING
                """, (target['hns_igrac_id'], target['clan_id'], r['sezona'], r['klub'],
                      r['natjecanje'], r['nastupi'], r['golovi']))
            except psycopg2.Error as exc:
                # One bad row must not stop the others, but it should be visible.
                print(f" insert failed (igrac {target['hns_igrac_id']}, sezona {r['sezona']}): {exc}",
                      file=sys.stderr, flush=True)
                continue
            # rowcount is 0 when ON CONFLICT skipped the row, so only real
            # inserts are counted.
            inserted += max(cur.rowcount, 0)
    return inserted


def main():
    """Retry season extraction for HNS players that have no season rows yet.

    Selects up to 200 players with an ``hns_igrac_id`` and no row in
    ``pgz_sport.hns_player_seasons``, opens each player's ``source_url`` in a
    headless browser, extracts season rows and inserts them. Progress is
    printed to stdout.
    """
    conn = psycopg2.connect(DSN)
    conn.autocommit = True
    try:
        with conn.cursor(cursor_factory=RealDictCursor) as cur:
            cur.execute("""
                SELECT c.id AS clan_id, c.hns_igrac_id, c.ime, c.prezime, c.source_url
                FROM pgz_sport.clanovi c
                WHERE c.hns_igrac_id IS NOT NULL
                AND NOT EXISTS (SELECT 1 FROM pgz_sport.hns_player_seasons s WHERE s.hns_igrac_id = c.hns_igrac_id)
                ORDER BY c.id LIMIT 200
            """)
            targets = cur.fetchall()
        print(f"Targets: {len(targets)}", flush=True)

        seasons_added = 0
        with sync_playwright() as pw:
            browser = pw.chromium.launch(headless=True, args=["--no-sandbox", "--ignore-certificate-errors"])
            try:
                ctx = browser.new_context(ignore_https_errors=True,
                                          user_agent="Mozilla/5.0 (X11; Linux x86_64) Chrome/120.0.0.0")
                page = ctx.new_page()
                for i, t in enumerate(targets):
                    url = t['source_url']
                    if not url or 'semafor.hns.family/igraci/' not in url:
                        continue
                    try:
                        page.goto(url, wait_until="networkidle", timeout=20000)
                        try:
                            page.wait_for_selector('table, .karijera, [class*="season"]', timeout=6000)
                        except Exception:
                            # The selector is only a hint that content has
                            # rendered; extraction is attempted either way.
                            pass
                        time.sleep(0.5)

                        rows = _extract_season_rows(page)
                        if rows:
                            seasons_added += _store_season_rows(conn, t, rows)
                            print(f" ✓ [{i}/{len(targets)}] {t['ime']} {t['prezime']}: {len(rows)} sezone (total added: {seasons_added})", flush=True)
                        if i % 20 == 0:
                            print(f" [{i}/{len(targets)}] processed, total added: {seasons_added}", flush=True)
                    except Exception as e:
                        # Per-player boundary: report and move on to the next target.
                        print(f"{t['ime']}: {e}", flush=True)
            finally:
                browser.close()
        print(f"\nDone. Total sezone added: {seasons_added}")
    finally:
        conn.close()


if __name__ == '__main__':
    main()
+49
View File
@@ -0,0 +1,49 @@
#!/bin/bash
# sport_harvest_backup.sh — pre-cron pg_dump of harvest tables
# v1.0 — dradulic@outlook.com / damir@rinet.one — 2026-05-05
# Description: Backs up the 4 key pgz_sport tables before the sport harvester cron cycle.
# Run by /etc/cron.d/sport-harvesters at 02:50 every 2nd day.
set -u
# Dumps contain member records; keep new files and directories owner-only.
umask 077

# Connection settings; each RINET_DB_* variable overrides its fallback.
# NOTE(review): the password fallback is a credential committed to source
# control; rotate it and drop the default once cron exports RINET_DB_PASS.
DSN_HOST="${RINET_DB_HOST:-10.10.0.2}"
DSN_PORT="${RINET_DB_PORT:-6432}"
DSN_DB="${RINET_DB_NAME:-rinet_v3}"
DSN_USER="${RINET_DB_USER:-rinet}"
DSN_PASS="${RINET_DB_PASS:-R1net2026!SecureDB#v7}"

BACKUP_DIR="/opt/pgz-sport/_backups"
LOG_DIR="/var/log/pgz-sport-debug"
DATE_TAG="$(date +%Y%m%d_%H%M)"
DATE_DAY="$(date +%Y%m%d)"
OUT_FILE="${BACKUP_DIR}/sport_harvest_pre_${DATE_TAG}.sql"
LOG_FILE="${LOG_DIR}/cron_backup_${DATE_DAY}.log"

mkdir -p "${BACKUP_DIR}" "${LOG_DIR}"

# Everything below is appended to the daily log file (stdout and stderr).
{
    echo "[$(date -Is)] sport_harvest_backup START → ${OUT_FILE}"
    PGPASSWORD="${DSN_PASS}" pg_dump \
        -h "${DSN_HOST}" -p "${DSN_PORT}" -U "${DSN_USER}" -d "${DSN_DB}" \
        --no-owner --no-privileges --data-only \
        -t pgz_sport.clanovi \
        -t pgz_sport.klub_roster \
        -t pgz_sport.player_stats \
        -t pgz_sport.clan_kategorije \
        -f "${OUT_FILE}"
    RC=$?
    if [ "${RC}" -eq 0 ] && [ -s "${OUT_FILE}" ]; then
        SIZE=$(stat -c%s "${OUT_FILE}")
        echo "[$(date -Is)] OK rc=${RC} size=${SIZE}B"
    else
        echo "[$(date -Is)] FAIL rc=${RC} (file empty or pg_dump error)"
        # Do not leave a truncated dump behind: it carries the same name
        # pattern as a good backup and could be mistaken for one.
        rm -f -- "${OUT_FILE}"
        exit 1
    fi

    # Retention: keep 14 days
    find "${BACKUP_DIR}" -maxdepth 1 -name 'sport_harvest_pre_*.sql' -mtime +14 -print -delete
    echo "[$(date -Is)] retention swept (>14d)"
    echo "[$(date -Is)] sport_harvest_backup DONE"
} >> "${LOG_FILE}" 2>&1
+120
View File
@@ -0,0 +1,120 @@
#!/usr/bin/env python3
# sport_harvest_health.py — staleness check za pgz_sport klubove
# v1.0 — dradulic@outlook.com / damir@rinet.one — 2026-05-05
# Description: Checks when each active club was last scraped (the later of
# klub_roster.scraped_at and clanovi.last_scraped_at). Clubs whose last scrape
# is older than SPORT_STALE_DAYS (default 7) are flagged for re-scrape; a
# Telegram alert is sent when any stale clubs exist.
# Run by /etc/cron.d/sport-harvesters at 04:30 every 2nd day.
import os
import sys
import json
import subprocess
from datetime import datetime, timedelta, timezone
import psycopg2
from psycopg2.extras import RealDictCursor
# Runtime configuration; every value can be overridden through the environment.
# NOTE(review): the DSN and TG_TOKEN fallbacks are credentials committed to
# source control; rotate them and drop the defaults once cron exports the
# variables.
DSN = os.getenv(
    "RINET_DSN",
    "host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password=R1net2026!SecureDB#v7",
)
TG_TOKEN = os.getenv("TG_TOKEN", "8535797835:AAFItT-92jzZ9NWFafLxn0dLa1_n2s-JE5Y")
TG_CHAT = os.getenv("TG_CHAT", "7969491558")
STALE_DAYS = int(os.getenv("SPORT_STALE_DAYS", "7"))

# One log file per run, named after the start minute.
LOG_DIR = "/var/log/pgz-sport-debug"
LOG_PATH = os.path.join(LOG_DIR, f"health_{datetime.now().strftime('%Y%m%d_%H%M')}.log")
os.makedirs(LOG_DIR, exist_ok=True)
# Explicit UTF-8: club names and alert text are non-ASCII, and the locale a
# cron job inherits is not guaranteed to default to a UTF-8 encoding.
_logfh = open(LOG_PATH, "a", encoding="utf-8")
def log(msg: str) -> None:
    """Emit one timestamped line to stdout and to the run's log file."""
    stamp = datetime.now().isoformat(timespec='seconds')
    entry = f"[{stamp}] {msg}"
    for stream in (None, _logfh):
        # file=None means stdout; both targets are flushed immediately.
        print(entry, file=stream, flush=True)
# Staleness query. For every active club (k.aktivan = true) it computes
# last_scrape as the later of MAX(klub_roster.scraped_at) and
# MAX(clanovi.last_scraped_at), substituting the epoch when a side has no
# rows. A club is reported as stale when last_scrape is still the epoch or
# is older than now() minus the interval passed as the single parameter
# (e.g. "7 days").
SQL = """
WITH last_per_klub AS (
SELECT k.id AS klub_id, k.naziv, k.sport,
GREATEST(
COALESCE((SELECT MAX(scraped_at) FROM pgz_sport.klub_roster WHERE klub_id = k.id), 'epoch'::timestamptz),
COALESCE((SELECT MAX(last_scraped_at) FROM pgz_sport.clanovi WHERE klub_id = k.id), 'epoch'::timestamptz)
) AS last_scrape
FROM pgz_sport.klubovi k
WHERE k.aktivan = true
)
SELECT klub_id, naziv, sport, last_scrape,
(last_scrape <= 'epoch'::timestamptz OR last_scrape < now() - interval %s) AS stale
FROM last_per_klub;
"""
def telegram(text: str) -> None:
    """Send ``text`` to the configured Telegram chat via curl (best effort).

    Never raises: the outcome is written to the log. Success is logged only
    when curl exits with status 0; ``--fail`` makes curl exit non-zero on an
    HTTP error response, so a rejected request is no longer reported as sent.
    """
    cmd = [
        "curl", "-sS", "--fail", "-X", "POST",
        f"https://api.telegram.org/bot{TG_TOKEN}/sendMessage",
        "-d", f"chat_id={TG_CHAT}",
        "--data-urlencode", f"text={text}",
    ]
    try:
        result = subprocess.run(cmd, capture_output=True, timeout=10, check=False)
    except (OSError, subprocess.SubprocessError) as e:
        # curl missing from PATH, or the 10 s timeout expired.
        log(f"telegram fail: {e}")
        return

    if result.returncode == 0:
        log(f"telegram sent ({len(text)} chars)")
    else:
        detail = result.stderr.decode("utf-8", "replace").strip()
        log(f"telegram fail: curl rc={result.returncode} {detail}")
def main() -> int:
    """Run the staleness check and return the process exit code.

    Returns:
        0 when no club is stale, 1 when at least one club is stale, and 2
        when the database cannot be reached or the query fails (a Telegram
        alert is sent in both failure cases).
    """
    log(f"sport_harvest_health START stale_days={STALE_DAYS}")
    try:
        conn = psycopg2.connect(DSN)
    except Exception as e:
        log(f"DB connect FAIL: {e}")
        telegram(f"🚨 sport_harvest_health: DB connect FAIL — {e}")
        return 2

    interval_str = f"{STALE_DAYS} days"
    try:
        with conn.cursor(cursor_factory=RealDictCursor) as cur:
            cur.execute(SQL, (interval_str,))
            rows = cur.fetchall()
    except psycopg2.Error as e:
        # Report a failed query the same way as a failed connection instead
        # of dying with an unreported traceback.
        log(f"DB query FAIL: {e}")
        telegram(f"🚨 sport_harvest_health: DB query FAIL — {e}")
        return 2
    finally:
        conn.close()

    total = len(rows)
    stale_rows = [r for r in rows if r["stale"]]

    # Stale-club count per sport (lower-cased, "?" when the sport is missing).
    by_sport: dict = {}
    for r in stale_rows:
        s = (r["sport"] or "?").lower()
        by_sport[s] = by_sport.get(s, 0) + 1

    # The ten clubs with the oldest last_scrape.
    top_stale = sorted(
        stale_rows,
        key=lambda r: (r["last_scrape"] or datetime(1970, 1, 1, tzinfo=timezone.utc)),
    )[:10]

    log(f"klubova_total={total} stale={len(stale_rows)} by_sport={json.dumps(by_sport, ensure_ascii=False)}")
    for r in top_stale:
        log(f" STALE klub_id={r['klub_id']} sport={r['sport']} last={r['last_scrape']} naziv={r['naziv']}")

    if stale_rows:
        sport_summary = ", ".join(f"{k.upper()}:{v}" for k, v in sorted(by_sport.items()))
        top_lines = "\n".join(
            f"{r['naziv']} ({(r['sport'] or '?')}) — {r['last_scrape']}"
            for r in top_stale[:5]
        )
        msg = (
            f"⚠️ Sport harvest stale: {len(stale_rows)}/{total} klubova "
            f">{STALE_DAYS} dana ({sport_summary})\nTop:\n{top_lines}"
        )
        telegram(msg)

    log("sport_harvest_health DONE")
    return 1 if stale_rows else 0


if __name__ == "__main__":
    sys.exit(main())
+149
View File
@@ -0,0 +1,149 @@
"""
Multi-sport scrape base class.
Usage: subclass + implement scrape_klub(), scrape_player()
"""
import os, time, json, re, sys
from datetime import datetime
from playwright.sync_api import sync_playwright
import psycopg2
from psycopg2.extras import RealDictCursor, execute_values
# Connection string for the harvest database; the RINET_DSN environment
# variable overrides the built-in fallback.
# NOTE(review): the fallback embeds a database password in source control —
# rotate it and drop the default once every runner exports RINET_DSN.
DSN = os.getenv("RINET_DSN",
    "host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password=R1net2026!SecureDB#v7")
class SportHarvester:
    """Base class for the per-sport scrapers.

    Subclasses set ``SPORT`` and ``SOURCE`` and implement ``scrape_klub``.
    The base class owns the database connection, the per-run log file, the
    upsert helpers and the ``run`` loop that drives a headless browser over
    the target clubs.
    """

    SPORT = None   # override
    SOURCE = None  # override

    LOG_DIR = "/var/log/pgz-sport-debug"
    # Croatian diacritics folded to ASCII by slugify().
    _TRANSLIT = str.maketrans("čćžšđ", "cczsd")

    def __init__(self):
        """Open the database connection (autocommit) and the run's log file."""
        self.conn = psycopg2.connect(DSN)
        self.conn.autocommit = True
        self.stats = {'klubova': 0, 'players': 0, 'stats': 0, 'errors': 0}
        # Create the directory here so a harvester can run on a fresh host.
        os.makedirs(self.LOG_DIR, exist_ok=True)
        stamp = datetime.now().strftime('%Y%m%d_%H%M')
        # Explicit UTF-8: log lines contain emoji and diacritics, and the
        # locale a cron job inherits is not guaranteed to be UTF-8.
        self.log_file = open(
            os.path.join(self.LOG_DIR, f"harvest_{self.SPORT}_{stamp}.log"),
            "a", encoding="utf-8",
        )

    def log(self, msg):
        """Write one timestamped, sport-tagged line to stdout and the log file."""
        line = f"[{datetime.now().isoformat(timespec='seconds')}] [{self.SPORT}] {msg}"
        print(line, flush=True)
        self.log_file.write(line + "\n")
        self.log_file.flush()

    def slugify(self, s):
        """Return a lowercase ASCII slug of ``s`` ("" for empty or None).

        Croatian diacritics are folded, characters other than letters,
        digits, whitespace and hyphens are dropped, and whitespace runs
        become single hyphens.
        """
        if not s:
            return ""
        t = s.lower().strip().translate(self._TRANSLIT)
        t = re.sub(r'[^a-z0-9\s-]', '', t)
        return re.sub(r'\s+', '-', t).strip('-')

    def get_target_klubovi(self, limit=999):
        """Return up to ``limit`` priority clubs (funded or in the yearbook) for ``SPORT``."""
        with self.conn.cursor(cursor_factory=RealDictCursor) as cur:
            cur.execute("""
                SELECT * FROM pgz_sport.v_pgz_priority_klubovi
                WHERE sport = %s AND (financiran OR u_godisnjaku)
                ORDER BY financiran DESC, u_godisnjaku DESC, id
                LIMIT %s
            """, (self.SPORT, limit))
            return cur.fetchall()

    def upsert_clan(self, klub_id, source_id, ime, prezime, source_url, kategorija=None, sezona=None, extra=None):
        """Insert or update a player and return its ``clan_id``.

        An existing player is looked up by ``(SOURCE, source_id)``. On update,
        non-empty stored names and an already-set club or sport are kept,
        and ``extra`` is merged into the stored metadata. When ``kategorija``
        is given, a ``clan_kategorije`` link row is added if missing.
        """
        ime = re.sub(r'\s+', ' ', (ime or '')).strip()
        prezime = re.sub(r'\s+', ' ', (prezime or '')).strip()
        with self.conn.cursor() as cur:
            # Try find existing by source+source_id
            cur.execute("""
                SELECT id FROM pgz_sport.clanovi
                WHERE source = %s AND source_id = %s
                ORDER BY id LIMIT 1
            """, (self.SOURCE, str(source_id)))
            row = cur.fetchone()
            if row:
                clan_id = row[0]
                cur.execute("""
                    UPDATE pgz_sport.clanovi
                    SET ime = COALESCE(NULLIF(ime,''), %s),
                    prezime = COALESCE(NULLIF(prezime,''), %s),
                    klub_id = COALESCE(klub_id, %s),
                    source_url = %s, last_updated = now(), last_scraped_at = now(),
                    sport = COALESCE(sport, %s),
                    metadata = COALESCE(metadata, '{}'::jsonb) || %s::jsonb
                    WHERE id = %s
                """, (ime, prezime, klub_id, source_url, self.SPORT, json.dumps(extra or {}), clan_id))
            else:
                cur.execute("""
                    INSERT INTO pgz_sport.clanovi
                    (klub_id, ime, prezime, sport, source, source_id, source_url, last_scraped_at, aktivan, metadata)
                    VALUES (%s, %s, %s, %s, %s, %s, %s, now(), true, %s::jsonb)
                    RETURNING id
                """, (klub_id, ime, prezime, self.SPORT, self.SOURCE, str(source_id), source_url, json.dumps(extra or {})))
                clan_id = cur.fetchone()[0]
            # Add kategorija if specified (many-to-many)
            if kategorija:
                cur.execute("""
                    INSERT INTO pgz_sport.clan_kategorije
                    (clan_id, kategorija, sezona, klub_id, source, source_url)
                    VALUES (%s, %s, %s, %s, %s, %s)
                    ON CONFLICT (clan_id, kategorija, sezona, klub_id) DO NOTHING
                """, (clan_id, kategorija, sezona, klub_id, self.SOURCE, source_url))
            return clan_id

    def upsert_stats(self, clan_id, sezona, klub_id, klub_naziv, natjecanje, kategorija, stats_dict, raw=None):
        """Insert or refresh one ``player_stats`` row.

        The row is keyed by (clan_id, sport, sezona, klub_id, natjecanje); on
        conflict every counter, the metadata and ``scraped_at`` are
        overwritten. Counters missing from ``stats_dict`` are stored as NULL.
        """
        with self.conn.cursor() as cur:
            cur.execute("""
                INSERT INTO pgz_sport.player_stats
                (clan_id, sport, source, sezona, klub_id, klub_naziv, natjecanje, kategorija,
                nastupi, golovi, asistencije, bodovi, trice, skokovi, blokade, servis_asovi,
                zuti, crveni, minute, metadata)
                VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s::jsonb)
                ON CONFLICT (clan_id, sport, sezona, klub_id, natjecanje)
                DO UPDATE SET
                nastupi = EXCLUDED.nastupi, golovi = EXCLUDED.golovi,
                asistencije = EXCLUDED.asistencije, bodovi = EXCLUDED.bodovi,
                trice = EXCLUDED.trice, skokovi = EXCLUDED.skokovi,
                blokade = EXCLUDED.blokade, servis_asovi = EXCLUDED.servis_asovi,
                zuti = EXCLUDED.zuti, crveni = EXCLUDED.crveni, minute = EXCLUDED.minute,
                metadata = EXCLUDED.metadata, scraped_at = now()
            """, (clan_id, self.SPORT, self.SOURCE, sezona, klub_id, klub_naziv, natjecanje, kategorija,
                  stats_dict.get('nastupi'), stats_dict.get('golovi'), stats_dict.get('asistencije'),
                  stats_dict.get('bodovi'), stats_dict.get('trice'), stats_dict.get('skokovi'),
                  stats_dict.get('blokade'), stats_dict.get('servis_asovi'),
                  stats_dict.get('zuti'), stats_dict.get('crveni'), stats_dict.get('minute'),
                  json.dumps(raw or {})))

    def run(self, limit=999):
        """Scrape every target club, then log and announce the run statistics.

        A failure inside ``scrape_klub`` is logged and counted in
        ``stats['errors']``; the loop continues with the next club.
        """
        klubovi = self.get_target_klubovi(limit)
        self.log(f"🚀 Starting {self.SPORT} harvest. Target: {len(klubovi)} klubova")
        with sync_playwright() as pw:
            browser = pw.chromium.launch(headless=True, args=["--no-sandbox", "--ignore-certificate-errors"])
            try:
                ctx = browser.new_context(
                    ignore_https_errors=True,
                    user_agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 Chrome/120.0.0.0 Safari/537.36"
                )
                page = ctx.new_page()
                for klub in klubovi:
                    try:
                        self.scrape_klub(page, klub)
                        self.stats['klubova'] += 1
                    except Exception as e:
                        self.stats['errors'] += 1
                        self.log(f" ❌ Klub {klub['id']} {klub['naziv']}: {e}")
            finally:
                # Release the browser even if setup or the loop blows up.
                browser.close()
        self.log(f"✅ Done. Stats: {self.stats}")
        self._notify_telegram(f"{str(self.SPORT).upper()} harvest done: {self.stats}")

    def _notify_telegram(self, text):
        """Best-effort Telegram notification; a failure is logged, never raised."""
        import subprocess
        # Same TG_TOKEN / TG_CHAT variables as sport_harvest_health.py.
        # NOTE(review): the fallbacks are credentials committed to source
        # control; rotate them and drop the defaults once cron exports them.
        token = os.getenv("TG_TOKEN", "8535797835:AAFItT-92jzZ9NWFafLxn0dLa1_n2s-JE5Y")
        chat = os.getenv("TG_CHAT", "7969491558")
        try:
            subprocess.run(["curl", "-s", "-X", "POST",
                            f"https://api.telegram.org/bot{token}/sendMessage",
                            "-d", f"chat_id={chat}",
                            "--data-urlencode", f"text={text}"],
                           timeout=8, capture_output=True)
        except (OSError, subprocess.SubprocessError) as e:
            self.log(f"telegram fail: {e}")

    def scrape_klub(self, page, klub):
        """Scrape one club; must be implemented by the subclass."""
        raise NotImplementedError("subclass must implement")
+32
View File
@@ -0,0 +1,32 @@
#!/usr/bin/env python3
"""HKS-CBF + FIBA LiveStats basketball harvester."""
import sys, re
sys.path.insert(0, '/opt/pgz-sport/scripts/sport_harvesters')
from __base import SportHarvester
class HKSHarvester(SportHarvester):
    """Basketball harvester (discovery stage): finds a club's page on hks-cbf.hr."""

    SPORT = 'košarka'
    SOURCE = 'hks_cbf'

    def scrape_klub(self, page, klub):
        """Search hks-cbf.hr for the club and store the first club link found.

        The link is written to ``klubovi.source_url`` only when that column
        is still empty. Failures are logged and counted in
        ``stats['errors']``; they do not propagate to the caller.
        """
        from urllib.parse import quote_plus

        # Discovery: try search hks-cbf.hr by club name. quote_plus escapes
        # '&', '#', '+' etc., which the old space-to-plus replace let through
        # and which corrupt the query string.
        url = f"https://www.hks-cbf.hr/?s={quote_plus(klub['naziv'])}"
        self.log(f" 🏀 Klub {klub['id']} {klub['naziv']}{url}")
        try:
            page.goto(url, wait_until="domcontentloaded", timeout=20000)
            # Look for /klubovi/ or /klub/ link
            klub_links = page.locator('a[href*="/klubovi/"], a[href*="/klub/"]').all()
            for a in klub_links[:3]:
                href = a.get_attribute('href')
                if href and 'klub' in href:
                    self.log(f" Found: {href}")
                    # Save URL to klub
                    with self.conn.cursor() as cur:
                        cur.execute("UPDATE pgz_sport.klubovi SET source_url = COALESCE(NULLIF(source_url,''), %s) WHERE id = %s",
                                    (href, klub['id']))
                    break
        except Exception as e:
            # Swallowed on purpose so one club cannot stop the run, but the
            # failure must still show up in the run statistics.
            self.stats['errors'] += 1
            self.log(f"{e}")


if __name__ == '__main__':
    HKSHarvester().run(limit=int(sys.argv[1]) if len(sys.argv) > 1 else 50)
+21
View File
@@ -0,0 +1,21 @@
#!/usr/bin/env python3
"""HOS volleyball harvester."""
import sys
sys.path.insert(0, '/opt/pgz-sport/scripts/sport_harvesters')
from __base import SportHarvester
class HOSHarvester(SportHarvester):
    """Volleyball harvester scaffold: currently only checks that hos-cvf.hr loads."""

    SPORT = 'odbojka'
    SOURCE = 'hos'

    def scrape_klub(self, page, klub):
        """Log the club and load the HOS home page (discovery mode, no data saved).

        Failures are logged and counted in ``stats['errors']``; they do not
        propagate to the caller.
        """
        # HOS-CVF.hr search
        self.log(f" 🏐 Klub {klub['id']} {klub['naziv']}")
        try:
            page.goto("https://hos-cvf.hr/", wait_until="domcontentloaded", timeout=20000)
            self.log(" [discovery mode] HOS site loaded")
        except Exception as e:
            # Swallowed on purpose so one club cannot stop the run, but the
            # failure must still show up in the run statistics.
            self.stats['errors'] += 1
            self.log(f"{e}")


if __name__ == '__main__':
    HOSHarvester().run(limit=int(sys.argv[1]) if len(sys.argv) > 1 else 50)
+27
View File
@@ -0,0 +1,27 @@
#!/usr/bin/env python3
"""HRS handball harvester."""
import sys
sys.path.insert(0, '/opt/pgz-sport/scripts/sport_harvesters')
from __base import SportHarvester
class HRSHarvester(SportHarvester):
    """Handball harvester (discovery stage): looks for a club or competition link on hrs.hr."""

    SPORT = 'rukomet'
    SOURCE = 'hrs'

    def scrape_klub(self, page, klub):
        """Search hrs.hr for the club and log the first matching link.

        Only the first five hrs.hr links on the results page are inspected
        and nothing is written to the database yet. Failures are logged and
        counted in ``stats['errors']``; they do not propagate to the caller.
        """
        from urllib.parse import quote_plus

        # quote_plus escapes '&', '#', '+' etc., which the old space-to-plus
        # replace let through and which corrupt the query string.
        url = f"https://hrs.hr/?s={quote_plus(klub['naziv'])}"
        self.log(f" 🤾 Klub {klub['id']} {klub['naziv']}")
        try:
            page.goto(url, wait_until="domcontentloaded", timeout=20000)
            # Find natjecanje or klub link
            links = page.locator('a[href*="hrs.hr"]').all()
            for a in links[:5]:
                href = a.get_attribute('href') or ''
                if 'natjecanje' in href or 'klub' in href:
                    self.log(f" Found: {href}")
                    break
        except Exception as e:
            # Swallowed on purpose so one club cannot stop the run, but the
            # failure must still show up in the run statistics.
            self.stats['errors'] += 1
            self.log(f"{e}")


if __name__ == '__main__':
    HRSHarvester().run(limit=int(sys.argv[1]) if len(sys.argv) > 1 else 50)
+54
View File
@@ -0,0 +1,54 @@
#!/usr/bin/env python3
"""HVS waterpolo harvester."""
import sys, re
sys.path.insert(0, '/opt/pgz-sport/scripts/sport_harvesters')
from __base import SportHarvester
class HVSHarvester(SportHarvester):
    """Water polo harvester: matches a club on hvs.hr and imports its roster links."""

    SPORT = 'vaterpolo'
    SOURCE = 'hvs'

    def scrape_klub(self, page, klub):
        """Find the club on hvs.hr/klubovi/ and upsert the players linked from its page.

        Matching is naive: a club link matches when its text contains any
        word of the club name longer than three characters. For the first
        match whose href carries a numeric club id, the club's
        ``source_url`` is filled if empty, the club page is opened and up to
        30 player links are upserted. Failures are logged and counted in
        ``stats['errors']``; they do not propagate to the caller.
        """
        from urllib.parse import urljoin

        self.log(f" 🤽 Klub {klub['id']} {klub['naziv']}")
        try:
            # Get all klubovi list from HVS
            page.goto("https://hvs.hr/klubovi/", wait_until="domcontentloaded", timeout=20000)
            klub_links = page.locator('a[href*="/klub/"]').all()
            # Loop-invariant: keywords of the club name worth matching on.
            keywords = [kw for kw in klub['naziv'].lower().split() if len(kw) > 3]
            for a in klub_links[:30]:
                text = a.inner_text().lower()
                href = a.get_attribute('href') or ''
                # Naive match: is any keyword of the club name in the link text
                if not any(kw in text for kw in keywords):
                    continue
                self.log(f" Match: {text[:50]}{href}")
                m = re.search(r'/klub/(\d+)', href)
                if not m:
                    continue
                new_url = f"https://hvs.hr/klub/{m.group(1)}/"
                with self.conn.cursor() as cur:
                    cur.execute("UPDATE pgz_sport.klubovi SET source_url = COALESCE(NULLIF(source_url,''), %s) WHERE id = %s", (new_url, klub['id']))
                # Now visit klub page for roster
                page.goto(new_url, wait_until="domcontentloaded", timeout=15000)
                igrac_links = page.locator('a[href*="/igrac/"]').all()
                self.log(f" {len(igrac_links)} igrača found")
                for ia in igrac_links[:30]:
                    ihref = ia.get_attribute('href') or ''
                    naziv = ia.inner_text().strip()
                    mi = re.search(r'/igrac/(\d+)', ihref)
                    if not (mi and naziv):
                        continue
                    # First word is taken as the given name, the rest as the surname.
                    parts = re.split(r'\s+', naziv, 1)
                    ime = parts[0]
                    prezime = parts[1] if len(parts) > 1 else ''
                    self.upsert_clan(
                        klub_id=klub['id'], source_id=mi.group(1),
                        ime=ime, prezime=prezime,
                        # urljoin also copes with relative and
                        # protocol-relative hrefs.
                        source_url=urljoin(new_url, ihref),
                    )
                    self.stats['players'] += 1
                # The page has navigated away from the club list, so the
                # remaining list links can no longer be inspected.
                break
        except Exception as e:
            # Swallowed on purpose so one club cannot stop the run, but the
            # failure must still show up in the run statistics.
            self.stats['errors'] += 1
            self.log(f"{e}")


if __name__ == '__main__':
    HVSHarvester().run(limit=int(sys.argv[1]) if len(sys.argv) > 1 else 50)