HNS+UI: 4 nova endpointa + multi-sport schema (M2M kategorije + player_stats)
Endpoints:
- GET /api/v2/enrich-sources — sport→source mapping
- GET /api/v2/klubovi/priority-sort — financirani/godišnjak prvi
- GET /api/v2/clan/{id}/kategorije — many-to-many kategorije
- GET /api/v2/clan/{id}/full — kompletna slika (profil+kategorije+sezone+utakmice+stats)
- POST /api/v2/export/klubovi — XLSX export selektiranih
Schema:
- pgz_sport.clan_kategorije (M2M: igrač u juniorskoj+seniorskoj)
- pgz_sport.player_stats (multi-sport: nogomet/košarka/rukomet/odbojka/vaterpolo)
- pgz_sport.klub_roster (multi-source)
- pgz_sport.enrichment_sources (sport→izvor)
- View: v_pgz_priority_klubovi (financiran || u_godisnjaku)
- View: v_klubovi_priority_sort (priority sort)
Sport harvesters scaffold:
- scripts/sport_harvesters/__base.py (SportHarvester class)
- hks_basketball.py, hrs_handball.py, hos_volleyball.py, hvs_waterpolo.py
This commit is contained in:
File diff suppressed because it is too large
Load Diff
@@ -18,6 +18,7 @@ Changes (2026-05-05, sub-agent W5):
|
|||||||
|
|
||||||
from fastapi import FastAPI, HTTPException, Query, Body, Header, Depends, UploadFile, File, Form, Request
|
from fastapi import FastAPI, HTTPException, Query, Body, Header, Depends, UploadFile, File, Form, Request
|
||||||
import json
|
import json
|
||||||
|
import time
|
||||||
from fastapi.middleware.cors import CORSMiddleware
|
from fastapi.middleware.cors import CORSMiddleware
|
||||||
from pydantic import BaseModel
|
from pydantic import BaseModel
|
||||||
from typing import Optional, List
|
from typing import Optional, List
|
||||||
@@ -2072,6 +2073,116 @@ def dashboard_hns_coverage():
|
|||||||
return stats[0] if stats else {}
|
return stats[0] if stats else {}
|
||||||
|
|
||||||
|
|
||||||
|
@app.get("/api/v2/enrich-sources")
def enrich_sources():
    """Return every row of ``pgz_sport.enrichment_sources``, ordered by sport.

    This is the sport -> source mapping consumed by the frontend
    "Obogati podatke" (enrich data) button.
    """
    mapping = fetch("SELECT * FROM pgz_sport.enrichment_sources ORDER BY sport")
    return dict(sources=mapping)
|
||||||
|
|
||||||
|
|
||||||
|
@app.get("/api/v2/clan/{clan_id}/kategorije")
def clan_kategorije(clan_id: int):
    """List the category memberships (many-to-many) stored for one player.

    Rows come from ``pgz_sport.clan_kategorije``, newest season first and
    then by category name.
    """
    query = """
        SELECT kategorija, sezona, klub_id, source, source_url, scraped_at
        FROM pgz_sport.clan_kategorije WHERE clan_id = %s
        ORDER BY sezona DESC, kategorija
    """
    memberships = fetch(query, (clan_id,))
    return {"clan_id": clan_id, "kategorije": memberships}
|
||||||
|
|
||||||
|
|
||||||
|
@app.get("/api/v2/klubovi/priority-sort")
def klubovi_priority_sort(sport: Optional[str] = None, limit: int = 500):
    """List clubs with priority clubs (funded or in the yearbook) first.

    Args:
        sport: Optional exact-match filter on the view's ``sport`` column.
        limit: Maximum number of rows to return; must not be negative.

    Returns:
        ``{"count": <number of rows>, "rows": [...]}``. Each row carries the
        view columns plus ``sportasa``, ``hns_roster`` and ``potpora_ukupno``.

    Raises:
        HTTPException: 400 when ``limit`` is negative.
    """
    # PostgreSQL rejects a negative LIMIT; answer with a clear client error
    # instead of letting the database error surface as a 500.
    if limit < 0:
        raise HTTPException(status_code=400, detail="limit must be >= 0")

    where = ""
    params = []
    if sport:
        where = " WHERE sport = %s"
        params.append(sport)

    # Only the fixed WHERE fragment is interpolated into the SQL text;
    # all values are passed as bound parameters.
    rows = fetch(f"""
        SELECT k.*, k.priority_label,
        (SELECT count(*) FROM pgz_sport.clanovi WHERE klub_id = k.id) AS sportasa,
        (SELECT count(*) FROM pgz_sport.hns_klub_roster WHERE klub_id = k.id) AS hns_roster,
        (SELECT sum(iznos) FROM pgz_sport.potpore_nositelji WHERE klub_id = k.id OR naziv_kluba ILIKE k.naziv) AS potpora_ukupno
        FROM pgz_sport.v_klubovi_priority_sort k
        {where}
        ORDER BY priority, potpora_ukupno DESC NULLS LAST, naziv
        LIMIT %s
    """, tuple(params) + (limit,))
    return {"count": len(rows), "rows": rows}
|
||||||
|
|
||||||
|
|
||||||
|
@app.get("/api/v2/clan/{clan_id}/full")
def clan_full(clan_id: int):
    """Return the full picture of one player.

    The payload combines the profile row, category memberships, HNS seasons,
    the 30 most recent HNS matches and the multi-sport statistics rows.

    Returns:
        ``{"error": "not_found"}`` when no ``clanovi`` row has this id
        (returned as a normal response, not an HTTP error), otherwise a dict
        with ``profile``, ``kategorije``, ``hns_seasons``, ``hns_matches``,
        ``multi_sport_stats`` and a ``stats`` summary.
    """
    profile = fetch("SELECT * FROM pgz_sport.clanovi WHERE id = %s", (clan_id,))
    if not profile:
        return {"error": "not_found"}
    p = profile[0]

    kategorije = fetch("SELECT * FROM pgz_sport.clan_kategorije WHERE clan_id = %s ORDER BY sezona DESC", (clan_id,))
    seasons = fetch("SELECT * FROM pgz_sport.hns_player_seasons WHERE clan_id = %s ORDER BY sezona DESC", (clan_id,))
    matches = fetch("SELECT * FROM pgz_sport.hns_player_matches WHERE clan_id = %s ORDER BY datum DESC NULLS LAST LIMIT 30", (clan_id,))
    multi_stats = fetch("SELECT * FROM pgz_sport.player_stats WHERE clan_id = %s ORDER BY sezona DESC", (clan_id,))

    # ``matches`` is capped at 30 rows, so its length is not the real total;
    # count separately so ``total_matches`` stays correct for busy players.
    # Assumes fetch() returns mapping-like rows, as export_klubovi relies on.
    counted = fetch("SELECT count(*) AS n FROM pgz_sport.hns_player_matches WHERE clan_id = %s", (clan_id,))
    total_matches = counted[0]["n"] if counted else len(matches)

    return {
        "profile": p,
        "kategorije": kategorije,
        "hns_seasons": seasons,
        "hns_matches": matches,
        "multi_sport_stats": multi_stats,
        "stats": {
            "total_seasons": len(seasons),
            "total_matches": total_matches,
            "total_kategorije": len(kategorije),
        }
    }
|
||||||
|
|
||||||
|
|
||||||
|
@app.post("/api/v2/export/klubovi")
def export_klubovi(req: dict):
    """Export the selected clubs as an XLSX download.

    Args:
        req: JSON body; ``req["ids"]`` is the list of club ids to export.

    Returns:
        A ``StreamingResponse`` carrying the workbook, or an ``{"error": ...}``
        dict when openpyxl is missing, no ids were sent, or the ids are not a
        list of integers.
    """
    import io
    try:
        from openpyxl import Workbook
    except ImportError:
        return {"error": "openpyxl not installed"}

    # Validate before touching the database: a non-list or non-numeric value
    # would otherwise surface as an opaque database error.
    # NOTE(review): assumes club ids are integers — confirm against the view.
    raw_ids = req.get("ids") or []
    if not isinstance(raw_ids, (list, tuple)):
        return {"error": "invalid ids"}
    ids = []
    for raw in raw_ids:
        # bool is an int subclass; True/False are never meant as ids.
        if isinstance(raw, bool):
            return {"error": "invalid ids"}
        if isinstance(raw, int):
            ids.append(raw)
        elif isinstance(raw, str):
            try:
                ids.append(int(raw.strip()))
            except ValueError:
                return {"error": "invalid ids"}
        else:
            return {"error": "invalid ids"}
    if not ids:
        return {"error": "no ids"}

    rows = fetch("""
        SELECT k.id, k.naziv, k.sport, k.razina, k.oib, k.grad,
        k.financiran, k.u_godisnjaku, k.priority_label,
        (SELECT count(*) FROM pgz_sport.clanovi WHERE klub_id = k.id) AS sportasa,
        (SELECT sum(iznos) FROM pgz_sport.potpore_nositelji WHERE klub_id = k.id OR naziv_kluba ILIKE k.naziv) AS potpora
        FROM pgz_sport.v_klubovi_priority_sort k
        WHERE k.id = ANY(%s)
        ORDER BY k.priority, k.naziv
    """, (ids,))

    wb = Workbook()
    ws = wb.active
    ws.title = "Klubovi"

    # With no matching rows the workbook is returned with an empty sheet.
    if rows:
        headers = list(rows[0].keys())
        ws.append([h.replace('_', ' ').title() for h in headers])
        for r in rows:
            ws.append([r.get(h) for h in headers])

    buf = io.BytesIO()
    wb.save(buf)
    buf.seek(0)

    from fastapi.responses import StreamingResponse
    return StreamingResponse(
        buf,
        media_type="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        headers={"Content-Disposition": f"attachment; filename=klubovi_export_{int(time.time())}.xlsx"}
    )
|
||||||
|
|
||||||
|
|
||||||
@app.get("/")
|
@app.get("/")
|
||||||
def root(request: Request):
|
def root(request: Request):
|
||||||
host = request.headers.get("host", "")
|
host = request.headers.get("host", "")
|
||||||
|
|||||||
Executable
+112
@@ -0,0 +1,112 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""HNS sezone retry — pojednostavljen extract."""
|
||||||
|
import os, time, re, json, sys
|
||||||
|
from datetime import datetime
|
||||||
|
import psycopg2
|
||||||
|
from psycopg2.extras import RealDictCursor
|
||||||
|
from playwright.sync_api import sync_playwright
|
||||||
|
|
||||||
|
# Connection string; override with the RINET_DSN environment variable, the
# same variable the other harvest scripts read.
# SECURITY: the fallback below embeds a live password in source control.
# Rotate it and drop the default once RINET_DSN is set everywhere.
DSN = os.getenv(
    "RINET_DSN",
    "host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password=R1net2026!SecureDB#v7",
)
|
||||||
|
|
||||||
|
def find_seasons_in_obj(obj, found=None):
    """Collect every dict that has a 'season' or 'sezona' key.

    Walks nested dicts and lists depth-first, in document order. Matching
    dicts are appended to ``found`` (a new list when omitted), which is also
    the return value. A matching dict is still searched for nested matches.
    """
    if found is None:
        found = []
    pending = [obj]
    while pending:
        node = pending.pop()
        if isinstance(node, dict):
            if 'season' in node or 'sezona' in node:
                found.append(node)
            # Push children reversed so the first child is visited first.
            pending.extend(reversed(list(node.values())))
        elif isinstance(node, list):
            pending.extend(reversed(node))
    return found
|
||||||
|
|
||||||
|
def _extract_season_rows(page):
    """Return season rows scraped from the player page currently loaded."""
    rows = []

    # Preferred source: the JSON blob embedded in the page, if present.
    html = page.content()
    m = re.search(r'__NEXT_DATA__"\s*type="application/json">([^<]+)</script>', html)
    if m:
        try:
            data = json.loads(m.group(1))
            for s in find_seasons_in_obj(data):
                sezona = s.get('season') or s.get('sezona')
                if sezona:
                    rows.append({'sezona': str(sezona), 'klub': '', 'natjecanje': '', 'nastupi': 0, 'golovi': 0})
        except (ValueError, RecursionError):
            # Malformed or pathologically nested JSON: use the text fallback.
            pass

    # Fallback: lines of visible text shaped like "2023/24 <club> <numbers>".
    if not rows:
        body = page.locator('body').inner_text()
        for line in body.split('\n'):
            match = re.match(r'^(20\d{2}/\d{2})\s+(.+?)\s+(\d+(?:\s+\d+)*)\s*$', line.strip())
            if match:
                nums = [int(x) for x in match.group(3).split()]
                rows.append({
                    'sezona': match.group(1), 'klub': match.group(2)[:200], 'natjecanje': '',
                    'nastupi': nums[0] if nums else 0,
                    'golovi': nums[1] if len(nums) > 1 else 0,
                })
    return rows


def _store_season_rows(conn, target, rows):
    """Insert season rows for one player; return how many were really added."""
    inserted = 0
    with conn.cursor() as cur:
        for r in rows:
            try:
                cur.execute("""
                    INSERT INTO pgz_sport.hns_player_seasons
                    (hns_igrac_id, clan_id, sezona, klub_naziv, natjecanje, nastupi, golovi)
                    VALUES (%s, %s, %s, %s, %s, %s, %s)
                    ON CONFLICT DO NOTHING
                """, (target['hns_igrac_id'], target['clan_id'], r['sezona'], r['klub'],
                      r['natjecanje'], r['nastupi'], r['golovi']))
                # rowcount is 0 when ON CONFLICT skipped the row, so rows
                # that already existed are not counted as added.
                inserted += max(cur.rowcount, 0)
            except psycopg2.Error as e:
                print(f"   insert failed (sezona={r['sezona']}): {e}", flush=True)
    return inserted


def main():
    """Retry the HNS season scrape for players that have no season rows yet.

    Selects up to 200 players with an ``hns_igrac_id`` but no row in
    ``hns_player_seasons``, loads each player's ``source_url`` in headless
    Chromium, extracts season rows and inserts them. Progress goes to stdout.
    """
    conn = psycopg2.connect(DSN)
    # Autocommit keeps one failed INSERT from aborting the following ones.
    conn.autocommit = True
    try:
        with conn.cursor(cursor_factory=RealDictCursor) as cur:
            cur.execute("""
                SELECT c.id AS clan_id, c.hns_igrac_id, c.ime, c.prezime, c.source_url
                FROM pgz_sport.clanovi c
                WHERE c.hns_igrac_id IS NOT NULL
                AND NOT EXISTS (SELECT 1 FROM pgz_sport.hns_player_seasons s WHERE s.hns_igrac_id = c.hns_igrac_id)
                ORDER BY c.id LIMIT 200
            """)
            targets = cur.fetchall()

        print(f"Targets: {len(targets)}", flush=True)

        seasons_added = 0
        with sync_playwright() as pw:
            browser = pw.chromium.launch(headless=True, args=["--no-sandbox", "--ignore-certificate-errors"])
            try:
                ctx = browser.new_context(ignore_https_errors=True,
                                          user_agent="Mozilla/5.0 (X11; Linux x86_64) Chrome/120.0.0.0")
                page = ctx.new_page()

                for i, t in enumerate(targets):
                    url = t['source_url']
                    if not url or 'semafor.hns.family/igraci/' not in url:
                        continue
                    try:
                        page.goto(url, wait_until="networkidle", timeout=20000)
                        try:
                            page.wait_for_selector('table, .karijera, [class*="season"]', timeout=6000)
                        except Exception:
                            # The selector is only a hint; a page without it
                            # is still parsed. (Was a bare except, which also
                            # swallowed Ctrl-C.)
                            pass
                        time.sleep(0.5)

                        rows = _extract_season_rows(page)
                        if rows:
                            seasons_added += _store_season_rows(conn, t, rows)
                            print(f" ✓ [{i}/{len(targets)}] {t['ime']} {t['prezime']}: {len(rows)} sezone (total added: {seasons_added})", flush=True)

                        if i % 20 == 0:
                            print(f" [{i}/{len(targets)}] processed, total added: {seasons_added}", flush=True)
                    except Exception as e:
                        # One failing player must not stop the whole batch.
                        print(f" ❌ {t['ime']}: {e}", flush=True)
            finally:
                browser.close()
    finally:
        conn.close()

    print(f"\nDone. Total sezone added: {seasons_added}")
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
main()
|
||||||
Executable
+49
@@ -0,0 +1,49 @@
|
|||||||
|
#!/bin/bash
# sport_harvest_backup.sh — pre-cron pg_dump of harvest tables
# v1.0 — dradulic@outlook.com / damir@rinet.one — 2026-05-05
# Description: Backs up the 4 key pgz_sport tables before the sport harvester cron cycle.
#   Run by /etc/cron.d/sport-harvesters at 02:50 every 2nd day.

set -u

# Connection settings; every value can be overridden from the environment.
# SECURITY: the password fallback is committed to source control. Rotate it
# and supply RINET_DB_PASS (or a ~/.pgpass entry) instead.
DSN_HOST="${RINET_DB_HOST:-10.10.0.2}"
DSN_PORT="${RINET_DB_PORT:-6432}"
DSN_DB="${RINET_DB_NAME:-rinet_v3}"
DSN_USER="${RINET_DB_USER:-rinet}"
DSN_PASS="${RINET_DB_PASS:-R1net2026!SecureDB#v7}"

BACKUP_DIR="/opt/pgz-sport/_backups"
LOG_DIR="/var/log/pgz-sport-debug"
DATE_TAG="$(date +%Y%m%d_%H%M)"
DATE_DAY="$(date +%Y%m%d)"
OUT_FILE="${BACKUP_DIR}/sport_harvest_pre_${DATE_TAG}.sql"
LOG_FILE="${LOG_DIR}/cron_backup_${DATE_DAY}.log"

mkdir -p "${BACKUP_DIR}" "${LOG_DIR}"

{
    echo "[$(date -Is)] sport_harvest_backup START → ${OUT_FILE}"

    # Subshell so the restrictive umask applies to the dump file only:
    # the dump holds member data and should not be world-readable.
    (
        umask 077
        PGPASSWORD="${DSN_PASS}" pg_dump \
            -h "${DSN_HOST}" -p "${DSN_PORT}" -U "${DSN_USER}" -d "${DSN_DB}" \
            --no-owner --no-privileges --data-only \
            -t pgz_sport.clanovi \
            -t pgz_sport.klub_roster \
            -t pgz_sport.player_stats \
            -t pgz_sport.clan_kategorije \
            -f "${OUT_FILE}"
    )
    RC=$?

    if [ "${RC}" -eq 0 ] && [ -s "${OUT_FILE}" ]; then
        SIZE=$(stat -c%s "${OUT_FILE}")
        echo "[$(date -Is)] OK rc=${RC} size=${SIZE}B"
    else
        echo "[$(date -Is)] FAIL rc=${RC} (file empty or pg_dump error)"
        # Do not leave a partial or empty dump that could pass for a backup.
        rm -f -- "${OUT_FILE}"
        exit 1
    fi

    # Retention: keep 14 days
    find "${BACKUP_DIR}" -maxdepth 1 -name 'sport_harvest_pre_*.sql' -mtime +14 -print -delete
    echo "[$(date -Is)] retention swept (>14d)"
    echo "[$(date -Is)] sport_harvest_backup DONE"
} >> "${LOG_FILE}" 2>&1
|
||||||
Executable
+120
@@ -0,0 +1,120 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
# sport_harvest_health.py — staleness check za pgz_sport klubove
|
||||||
|
# v1.0 — dradulic@outlook.com / damir@rinet.one — 2026-05-05
|
||||||
|
# Description: Provjerava kad je svaki aktivan klub zadnji put scrape-an
|
||||||
|
# (klub_roster.scraped_at ∪ clanovi.last_scraped_at). Klubovi >7 dana
|
||||||
|
# flag-irani su za re-scrape; Telegram alert se šalje ako ima staleova.
|
||||||
|
# Pokreće ga /etc/cron.d/sport-harvesters u 04:30 svaki 2. dan.
|
||||||
|
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import json
|
||||||
|
import subprocess
|
||||||
|
from datetime import datetime, timedelta, timezone
|
||||||
|
import psycopg2
|
||||||
|
from psycopg2.extras import RealDictCursor
|
||||||
|
|
||||||
|
# Runtime configuration; every value can be overridden from the environment.
# SECURITY: the DSN and Telegram fallbacks below are live credentials
# committed to source control. Rotate them and drop the defaults once the
# environment variables are set for the cron job.
DSN = os.getenv(
    "RINET_DSN",
    "host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password=R1net2026!SecureDB#v7",
)
TG_TOKEN = os.getenv("TG_TOKEN", "8535797835:AAFItT-92jzZ9NWFafLxn0dLa1_n2s-JE5Y")
TG_CHAT = os.getenv("TG_CHAT", "7969491558")
STALE_DAYS = int(os.getenv("SPORT_STALE_DAYS", "7"))
LOG_DIR = "/var/log/pgz-sport-debug"

# One log file per run, named by start time.
LOG_PATH = os.path.join(LOG_DIR, f"health_{datetime.now().strftime('%Y%m%d_%H%M')}.log")
os.makedirs(LOG_DIR, exist_ok=True)
# Explicit UTF-8: log lines include club names with Croatian diacritics, and
# the locale under cron is not guaranteed to be UTF-8.
_logfh = open(LOG_PATH, "a", encoding="utf-8")
|
||||||
|
|
||||||
|
|
||||||
|
def log(msg: str) -> None:
    """Print a timestamped message and append the same line to the run log."""
    stamp = datetime.now().isoformat(timespec='seconds')
    entry = f"[{stamp}] {msg}"
    print(entry, flush=True)
    # Flush after every line so the file is current if the run dies.
    _logfh.write(entry + "\n")
    _logfh.flush()
|
||||||
|
|
||||||
|
|
||||||
|
SQL = """
|
||||||
|
WITH last_per_klub AS (
|
||||||
|
SELECT k.id AS klub_id, k.naziv, k.sport,
|
||||||
|
GREATEST(
|
||||||
|
COALESCE((SELECT MAX(scraped_at) FROM pgz_sport.klub_roster WHERE klub_id = k.id), 'epoch'::timestamptz),
|
||||||
|
COALESCE((SELECT MAX(last_scraped_at) FROM pgz_sport.clanovi WHERE klub_id = k.id), 'epoch'::timestamptz)
|
||||||
|
) AS last_scrape
|
||||||
|
FROM pgz_sport.klubovi k
|
||||||
|
WHERE k.aktivan = true
|
||||||
|
)
|
||||||
|
SELECT klub_id, naziv, sport, last_scrape,
|
||||||
|
(last_scrape <= 'epoch'::timestamptz OR last_scrape < now() - interval %s) AS stale
|
||||||
|
FROM last_per_klub;
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
def telegram(text: str) -> None:
    """Send ``text`` to the configured Telegram chat via curl (best effort).

    Never raises: every failure is written to the log instead. A message is
    reported as sent only when curl exits with status 0.
    """
    try:
        result = subprocess.run(
            [
                # --fail turns an HTTP error from Telegram (e.g. a revoked
                # token) into a non-zero exit status.
                "curl", "-sS", "--fail", "-X", "POST",
                f"https://api.telegram.org/bot{TG_TOKEN}/sendMessage",
                "-d", f"chat_id={TG_CHAT}",
                "--data-urlencode", f"text={text}",
            ],
            capture_output=True,
            timeout=10,
            check=False,
        )
    except (OSError, ValueError, subprocess.SubprocessError) as e:
        # curl missing, invalid argument, or timeout.
        log(f"telegram fail: {e}")
        return

    if result.returncode != 0:
        detail = result.stderr.decode("utf-8", "replace").strip()
        log(f"telegram fail: curl rc={result.returncode} {detail}")
        return
    log(f"telegram sent ({len(text)} chars)")
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> int:
    """Run the staleness check and alert when clubs are overdue for a scrape.

    Returns:
        0 when no active club is stale, 1 when at least one is (a Telegram
        summary is sent), 2 when the database cannot be reached or queried.
    """
    log(f"sport_harvest_health START stale_days={STALE_DAYS}")
    try:
        conn = psycopg2.connect(DSN)
    except psycopg2.Error as e:
        log(f"DB connect FAIL: {e}")
        telegram(f"🚨 sport_harvest_health: DB connect FAIL — {e}")
        return 2

    interval_str = f"{STALE_DAYS} days"
    try:
        with conn.cursor(cursor_factory=RealDictCursor) as cur:
            cur.execute(SQL, (interval_str,))
            rows = cur.fetchall()
    except psycopg2.Error as e:
        # A failing query is as alert-worthy as a failing connection.
        log(f"DB query FAIL: {e}")
        telegram(f"🚨 sport_harvest_health: DB query FAIL — {e}")
        return 2
    finally:
        # All rows are already in memory; release the connection on every path.
        conn.close()

    total = len(rows)
    stale_rows = [r for r in rows if r["stale"]]
    by_sport: dict = {}
    for r in stale_rows:
        s = (r["sport"] or "?").lower()
        by_sport[s] = by_sport.get(s, 0) + 1

    # Oldest scrape first; the ten worst offenders go to the log.
    top_stale = sorted(
        stale_rows,
        key=lambda r: (r["last_scrape"] or datetime(1970, 1, 1, tzinfo=timezone.utc)),
    )[:10]

    log(f"klubova_total={total} stale={len(stale_rows)} by_sport={json.dumps(by_sport, ensure_ascii=False)}")
    for r in top_stale:
        log(f"  STALE klub_id={r['klub_id']} sport={r['sport']} last={r['last_scrape']} naziv={r['naziv']}")

    if stale_rows:
        sport_summary = ", ".join(f"{k.upper()}:{v}" for k, v in sorted(by_sport.items()))
        top_lines = "\n".join(
            f" • {r['naziv']} ({(r['sport'] or '?')}) — {r['last_scrape']}"
            for r in top_stale[:5]
        )
        msg = (
            f"⚠️ Sport harvest stale: {len(stale_rows)}/{total} klubova "
            f">{STALE_DAYS} dana ({sport_summary})\nTop:\n{top_lines}"
        )
        telegram(msg)

    log("sport_harvest_health DONE")
    return 1 if stale_rows else 0
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
sys.exit(main())
|
||||||
Executable
+149
@@ -0,0 +1,149 @@
|
|||||||
|
"""
|
||||||
|
Multi-sport scrape base class.
|
||||||
|
Usage: subclass + implement scrape_klub(), scrape_player()
|
||||||
|
"""
|
||||||
|
import os, time, json, re, sys
|
||||||
|
from datetime import datetime
|
||||||
|
from playwright.sync_api import sync_playwright
|
||||||
|
import psycopg2
|
||||||
|
from psycopg2.extras import RealDictCursor, execute_values
|
||||||
|
|
||||||
|
DSN = os.getenv("RINET_DSN",
|
||||||
|
"host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password=R1net2026!SecureDB#v7")
|
||||||
|
|
||||||
|
class SportHarvester:
    """Base class for per-sport club and player harvesters.

    Subclasses set ``SPORT`` and ``SOURCE`` and implement ``scrape_klub``.
    ``run`` fetches the target clubs, drives one headless Chromium page
    through them and sends a Telegram summary when finished.
    """

    SPORT = None   # override
    SOURCE = None  # override

    LOG_DIR = "/var/log/pgz-sport-debug"
    # Croatian diacritics folded to ASCII by slugify().
    _DIACRITICS = str.maketrans("čćžšđ", "cczsd")
    # Numeric player_stats columns, in INSERT order.
    _STAT_COLUMNS = (
        'nastupi', 'golovi', 'asistencije', 'bodovi', 'trice', 'skokovi',
        'blokade', 'servis_asovi', 'zuti', 'crveni', 'minute',
    )

    def __init__(self):
        """Open the autocommit DB connection and this run's log file."""
        self.conn = psycopg2.connect(DSN)
        self.conn.autocommit = True
        self.stats = {'klubova': 0, 'players': 0, 'stats': 0, 'errors': 0}
        # Create the log directory if needed so a fresh host does not fail,
        # and write UTF-8 explicitly: messages contain emoji and diacritics.
        os.makedirs(self.LOG_DIR, exist_ok=True)
        stamp = datetime.now().strftime('%Y%m%d_%H%M')
        self.log_file = open(
            os.path.join(self.LOG_DIR, f"harvest_{self.SPORT}_{stamp}.log"),
            "a", encoding="utf-8",
        )

    def log(self, msg):
        """Print a timestamped, sport-tagged line and append it to the log."""
        line = f"[{datetime.now().isoformat(timespec='seconds')}] [{self.SPORT}] {msg}"
        print(line, flush=True)
        self.log_file.write(line + "\n")
        self.log_file.flush()

    def slugify(self, s):
        """Return a lowercase ASCII slug of ``s`` ("" for empty or None)."""
        if not s:
            return ""
        t = s.lower().strip().translate(self._DIACRITICS)
        t = re.sub(r'[^a-z0-9\s-]', '', t)
        return re.sub(r'\s+', '-', t).strip('-')

    def get_target_klubovi(self, limit=999):
        """Return up to ``limit`` priority clubs for this sport, funded first."""
        with self.conn.cursor(cursor_factory=RealDictCursor) as cur:
            cur.execute("""
                SELECT * FROM pgz_sport.v_pgz_priority_klubovi
                WHERE sport = %s AND (financiran OR u_godisnjaku)
                ORDER BY financiran DESC, u_godisnjaku DESC, id
                LIMIT %s
            """, (self.SPORT, limit))
            return cur.fetchall()

    def upsert_clan(self, klub_id, source_id, ime, prezime, source_url, kategorija=None, sezona=None, extra=None):
        """Insert or refresh a player and return its ``clanovi.id``.

        The player is looked up by (``SOURCE``, ``source_id``). An existing
        row keeps its non-empty name, club and sport; its URL, timestamps and
        merged metadata are refreshed. When ``kategorija`` is given, a
        membership row is added to ``clan_kategorije`` as well.
        """
        ime = re.sub(r'\s+', ' ', (ime or '')).strip()
        prezime = re.sub(r'\s+', ' ', (prezime or '')).strip()
        with self.conn.cursor() as cur:
            # Try to find an existing row by source + source_id.
            cur.execute("""
                SELECT id FROM pgz_sport.clanovi
                WHERE source = %s AND source_id = %s
                ORDER BY id LIMIT 1
            """, (self.SOURCE, str(source_id)))
            row = cur.fetchone()
            if row:
                clan_id = row[0]
                cur.execute("""
                    UPDATE pgz_sport.clanovi
                    SET ime = COALESCE(NULLIF(ime,''), %s),
                    prezime = COALESCE(NULLIF(prezime,''), %s),
                    klub_id = COALESCE(klub_id, %s),
                    source_url = %s, last_updated = now(), last_scraped_at = now(),
                    sport = COALESCE(sport, %s),
                    metadata = COALESCE(metadata, '{}'::jsonb) || %s::jsonb
                    WHERE id = %s
                """, (ime, prezime, klub_id, source_url, self.SPORT, json.dumps(extra or {}), clan_id))
            else:
                cur.execute("""
                    INSERT INTO pgz_sport.clanovi
                    (klub_id, ime, prezime, sport, source, source_id, source_url, last_scraped_at, aktivan, metadata)
                    VALUES (%s, %s, %s, %s, %s, %s, %s, now(), true, %s::jsonb)
                    RETURNING id
                """, (klub_id, ime, prezime, self.SPORT, self.SOURCE, str(source_id), source_url, json.dumps(extra or {})))
                clan_id = cur.fetchone()[0]

            # Add the category membership if specified (many-to-many).
            # NOTE(review): with sezona=None the conflict target may never
            # match (NULLs are distinct in a plain unique index), so repeated
            # runs could add duplicates — confirm against the table definition.
            if kategorija:
                cur.execute("""
                    INSERT INTO pgz_sport.clan_kategorije
                    (clan_id, kategorija, sezona, klub_id, source, source_url)
                    VALUES (%s, %s, %s, %s, %s, %s)
                    ON CONFLICT (clan_id, kategorija, sezona, klub_id) DO NOTHING
                """, (clan_id, kategorija, sezona, klub_id, self.SOURCE, source_url))
            return clan_id

    def upsert_stats(self, clan_id, sezona, klub_id, klub_naziv, natjecanje, kategorija, stats_dict, raw=None):
        """Insert or overwrite one ``player_stats`` row.

        ``stats_dict`` may hold any of the numeric columns; missing keys are
        stored as NULL. ``raw`` is stored as JSON in ``metadata``.
        """
        values = tuple(stats_dict.get(col) for col in self._STAT_COLUMNS)
        with self.conn.cursor() as cur:
            cur.execute("""
                INSERT INTO pgz_sport.player_stats
                (clan_id, sport, source, sezona, klub_id, klub_naziv, natjecanje, kategorija,
                nastupi, golovi, asistencije, bodovi, trice, skokovi, blokade, servis_asovi,
                zuti, crveni, minute, metadata)
                VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s::jsonb)
                ON CONFLICT (clan_id, sport, sezona, klub_id, natjecanje)
                DO UPDATE SET
                nastupi = EXCLUDED.nastupi, golovi = EXCLUDED.golovi,
                asistencije = EXCLUDED.asistencije, bodovi = EXCLUDED.bodovi,
                trice = EXCLUDED.trice, skokovi = EXCLUDED.skokovi,
                blokade = EXCLUDED.blokade, servis_asovi = EXCLUDED.servis_asovi,
                zuti = EXCLUDED.zuti, crveni = EXCLUDED.crveni, minute = EXCLUDED.minute,
                metadata = EXCLUDED.metadata, scraped_at = now()
            """, (clan_id, self.SPORT, self.SOURCE, sezona, klub_id, klub_naziv, natjecanje, kategorija)
                 + values + (json.dumps(raw or {}),))

    def run(self, limit=999):
        """Harvest up to ``limit`` target clubs and report the totals.

        A failure in one club is logged and counted in ``stats['errors']``
        without stopping the run.
        """
        klubovi = self.get_target_klubovi(limit)
        self.log(f"🚀 Starting {self.SPORT} harvest. Target: {len(klubovi)} klubova")

        with sync_playwright() as pw:
            browser = pw.chromium.launch(headless=True, args=["--no-sandbox", "--ignore-certificate-errors"])
            try:
                ctx = browser.new_context(
                    ignore_https_errors=True,
                    user_agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 Chrome/120.0.0.0 Safari/537.36"
                )
                page = ctx.new_page()

                for klub in klubovi:
                    try:
                        self.scrape_klub(page, klub)
                        self.stats['klubova'] += 1
                    except Exception as e:
                        self.stats['errors'] += 1
                        self.log(f" ❌ Klub {klub['id']} {klub['naziv']}: {e}")
            finally:
                # Close Chromium even when setup or the loop fails.
                browser.close()

        self.log(f"✅ Done. Stats: {self.stats}")
        self._notify(f"{str(self.SPORT).upper()} harvest done: {self.stats}")

    def _notify(self, text):
        """Send a best-effort Telegram message; failures are only logged."""
        import subprocess

        # Same variables as the health script.
        # SECURITY: the fallbacks are live credentials committed to source
        # control. Rotate them and drop the defaults once TG_TOKEN/TG_CHAT
        # are set in the cron environment.
        token = os.getenv("TG_TOKEN", "8535797835:AAFItT-92jzZ9NWFafLxn0dLa1_n2s-JE5Y")
        chat = os.getenv("TG_CHAT", "7969491558")
        try:
            subprocess.run(["curl", "-s", "-X", "POST",
                            f"https://api.telegram.org/bot{token}/sendMessage",
                            "-d", f"chat_id={chat}",
                            "--data-urlencode", f"text={text}"],
                           timeout=8, capture_output=True)
        except (OSError, ValueError, subprocess.SubprocessError) as e:
            self.log(f"telegram fail: {e}")

    def scrape_klub(self, page, klub):
        """Scrape one club; must be implemented by every subclass."""
        raise NotImplementedError("subclass must implement")
|
||||||
Executable
+32
@@ -0,0 +1,32 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""HKS-CBF + FIBA LiveStats basketball harvester."""
|
||||||
|
import sys, re
|
||||||
|
sys.path.insert(0, '/opt/pgz-sport/scripts/sport_harvesters')
|
||||||
|
from __base import SportHarvester
|
||||||
|
|
||||||
|
class HKSHarvester(SportHarvester):
    """Basketball harvester (discovery stage): finds a club's hks-cbf.hr page."""

    SPORT = 'košarka'
    SOURCE = 'hks_cbf'

    def scrape_klub(self, page, klub):
        """Search hks-cbf.hr for the club and store the first club link found.

        The link is saved to ``klubovi.source_url`` only when that column is
        still empty. Errors are logged, not raised.
        """
        from urllib.parse import quote_plus, urljoin

        # Discovery: search hks-cbf.hr by club name. quote_plus escapes
        # characters such as & or # that would otherwise break the query.
        url = f"https://www.hks-cbf.hr/?s={quote_plus(klub['naziv'])}"
        self.log(f" 🏀 Klub {klub['id']} {klub['naziv']} → {url}")
        try:
            page.goto(url, wait_until="domcontentloaded", timeout=20000)
            # Look for a /klubovi/ or /klub/ link.
            klub_links = page.locator('a[href*="/klubovi/"], a[href*="/klub/"]').all()
            for a in klub_links[:3]:
                href = a.get_attribute('href')
                if href and 'klub' in href:
                    # Store an absolute URL even if the site uses relative links.
                    href = urljoin("https://www.hks-cbf.hr/", href)
                    self.log(f" Found: {href}")
                    # Save the URL on the club, keeping any existing value.
                    with self.conn.cursor() as cur:
                        cur.execute("UPDATE pgz_sport.klubovi SET source_url = COALESCE(NULLIF(source_url,''), %s) WHERE id = %s",
                                    (href, klub['id']))
                    break
        except Exception as e:
            self.log(f" ❌ {e}")
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
HKSHarvester().run(limit=int(sys.argv[1]) if len(sys.argv) > 1 else 50)
|
||||||
Executable
+21
@@ -0,0 +1,21 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""HOS volleyball harvester."""
|
||||||
|
import sys
|
||||||
|
sys.path.insert(0, '/opt/pgz-sport/scripts/sport_harvesters')
|
||||||
|
from __base import SportHarvester
|
||||||
|
|
||||||
|
class HOSHarvester(SportHarvester):
    """Volleyball harvester; currently a discovery stub that only loads the site."""

    SPORT = 'odbojka'
    SOURCE = 'hos'

    def scrape_klub(self, page, klub):
        """Log the club and load the HOS home page; nothing is stored yet."""
        home_url = "https://hos-cvf.hr/"
        self.log(f" 🏐 Klub {klub['id']} {klub['naziv']}")
        try:
            page.goto(home_url, wait_until="domcontentloaded", timeout=20000)
            self.log(" [discovery mode] HOS site loaded")
        except Exception as err:
            self.log(f" ❌ {err}")
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
HOSHarvester().run(limit=int(sys.argv[1]) if len(sys.argv) > 1 else 50)
|
||||||
Executable
+27
@@ -0,0 +1,27 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""HRS handball harvester."""
|
||||||
|
import sys
|
||||||
|
sys.path.insert(0, '/opt/pgz-sport/scripts/sport_harvesters')
|
||||||
|
from __base import SportHarvester
|
||||||
|
|
||||||
|
class HRSHarvester(SportHarvester):
    """Handball harvester (discovery stage): looks for the club on hrs.hr."""

    SPORT = 'rukomet'
    SOURCE = 'hrs'

    def scrape_klub(self, page, klub):
        """Search hrs.hr for the club and log the first relevant link.

        Nothing is written to the database yet. Errors are logged, not raised.
        """
        from urllib.parse import quote_plus

        # quote_plus escapes characters such as & or # in club names that
        # would otherwise break the search query.
        url = f"https://hrs.hr/?s={quote_plus(klub['naziv'])}"
        self.log(f" 🤾 Klub {klub['id']} {klub['naziv']}")
        try:
            page.goto(url, wait_until="domcontentloaded", timeout=20000)
            # Find a competition ("natjecanje") or club link.
            links = page.locator('a[href*="hrs.hr"]').all()
            for a in links[:5]:
                href = a.get_attribute('href') or ''
                if 'natjecanje' in href or 'klub' in href:
                    self.log(f" Found: {href}")
                    break
        except Exception as e:
            self.log(f" ❌ {e}")
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
HRSHarvester().run(limit=int(sys.argv[1]) if len(sys.argv) > 1 else 50)
|
||||||
Executable
+54
@@ -0,0 +1,54 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""HVS waterpolo harvester."""
|
||||||
|
import sys, re
|
||||||
|
sys.path.insert(0, '/opt/pgz-sport/scripts/sport_harvesters')
|
||||||
|
from __base import SportHarvester
|
||||||
|
|
||||||
|
class HVSHarvester(SportHarvester):
    """Water polo harvester: matches a club on hvs.hr and imports its roster."""

    SPORT = 'vaterpolo'
    SOURCE = 'hvs'

    # Words found in most club names; matching on them would pair a club
    # with practically any link on the listing page.
    GENERIC_WORDS = frozenset({'vaterpolo', 'vaterpolski', 'klub', 'sportski', 'društvo'})

    def scrape_klub(self, page, klub):
        """Find the club on the HVS club list and upsert up to 30 players.

        A link matches when its text contains a distinctive word (longer than
        3 characters and not generic) from the club name. The first match with
        a numeric club id is used: its URL is saved to ``klubovi.source_url``
        if that is empty, then the roster page is scraped. Errors are logged,
        not raised.
        """
        self.log(f" 🤽 Klub {klub['id']} {klub['naziv']}")
        try:
            # Get the full club list from HVS.
            page.goto("https://hvs.hr/klubovi/", wait_until="domcontentloaded", timeout=20000)
            klub_links = page.locator('a[href*="/klub/"]').all()
            keywords = [
                kw for kw in klub['naziv'].lower().split()
                if len(kw) > 3 and kw not in self.GENERIC_WORDS
            ]
            for a in klub_links[:30]:
                text = a.inner_text().lower()
                href = a.get_attribute('href') or ''
                # Naive match: does any distinctive name word occur in the text?
                if not any(kw in text for kw in keywords):
                    continue
                self.log(f" Match: {text[:50]} → {href}")
                m = re.search(r'/klub/(\d+)', href)
                if not m:
                    # No numeric id in this link; keep looking.
                    continue
                new_url = f"https://hvs.hr/klub/{m.group(1)}/"
                with self.conn.cursor() as cur:
                    cur.execute("UPDATE pgz_sport.klubovi SET source_url = COALESCE(NULLIF(source_url,''), %s) WHERE id = %s", (new_url, klub['id']))
                self._scrape_roster(page, klub, new_url)
                # The page has navigated away from the listing, so the
                # remaining link handles are stale; stop here.
                break
        except Exception as e:
            self.log(f" ❌ {e}")

    def _scrape_roster(self, page, klub, klub_url):
        """Open the club page and upsert up to 30 linked players."""
        page.goto(klub_url, wait_until="domcontentloaded", timeout=15000)
        igrac_links = page.locator('a[href*="/igrac/"]').all()
        self.log(f" {len(igrac_links)} igrača found")
        for ia in igrac_links[:30]:
            ihref = ia.get_attribute('href') or ''
            naziv = ia.inner_text().strip()
            mi = re.search(r'/igrac/(\d+)', ihref)
            if mi and naziv:
                # First word is the given name, the rest is the surname.
                parts = re.split(r'\s+', naziv, 1)
                ime = parts[0]
                prezime = parts[1] if len(parts) > 1 else ''
                full_url = ihref if ihref.startswith('http') else f"https://hvs.hr{ihref}"
                self.upsert_clan(
                    klub_id=klub['id'], source_id=mi.group(1),
                    ime=ime, prezime=prezime,
                    source_url=full_url
                )
                self.stats['players'] += 1
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
HVSHarvester().run(limit=int(sys.argv[1]) if len(sys.argv) > 1 else 50)
|
||||||
Reference in New Issue
Block a user