HNS+UI: 5 novih endpointa + multi-sport schema (M2M kategorije + player_stats)
Endpoints:
- GET /api/v2/enrich-sources — sport→source mapping
- GET /api/v2/klubovi/priority-sort — financirani/godišnjak prvi
- GET /api/v2/clan/{id}/kategorije — many-to-many kategorije
- GET /api/v2/clan/{id}/full — kompletna slika (profil+kategorije+sezone+utakmice+stats)
- POST /api/v2/export/klubovi — XLSX export selektiranih
Schema:
- pgz_sport.clan_kategorije (M2M: igrač u juniorskoj+seniorskoj)
- pgz_sport.player_stats (multi-sport: nogomet/košarka/rukomet/odbojka/vaterpolo)
- pgz_sport.klub_roster (multi-source)
- pgz_sport.enrichment_sources (sport→izvor)
- View: v_pgz_priority_klubovi (financiran || u_godisnjaku)
- View: v_klubovi_priority_sort (priority sort)
Sport harvesters scaffold:
- scripts/sport_harvesters/__base.py (SportHarvester class)
- hks_basketball.py, hrs_handball.py, hos_volleyball.py, hvs_waterpolo.py
This commit is contained in:
Executable
+149
@@ -0,0 +1,149 @@
|
||||
"""
|
||||
Multi-sport scrape base class.
|
||||
Usage: subclass + implement scrape_klub(), scrape_player()
|
||||
"""
|
||||
import os, time, json, re, sys
|
||||
from datetime import datetime
|
||||
from playwright.sync_api import sync_playwright
|
||||
import psycopg2
|
||||
from psycopg2.extras import RealDictCursor, execute_values
|
||||
|
||||
# libpq connection string used by every harvester.
#
# SECURITY: the database password must not live in source control. Supply it
# through RINET_DSN (full connection string), or leave RINET_DSN unset and
# provide the password via the standard libpq mechanisms (PGPASSWORD or
# ~/.pgpass). The password that was previously committed here should be
# considered leaked and rotated.
DSN = os.getenv(
    "RINET_DSN",
    "host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet",
)
|
||||
|
||||
class SportHarvester:
    """Base class for the per-sport club/player scrapers.

    Subclasses set ``SPORT`` and ``SOURCE`` and implement ``scrape_klub()``.
    ``run()`` fetches the target clubs from the database, opens one headless
    Chromium page and calls ``scrape_klub(page, klub)`` for each club.

    The instance owns a PostgreSQL connection (autocommit) and an append-mode
    log file. Call ``close()`` (or use the instance as a context manager) to
    release both; ``run()`` itself does not close them so it can be called
    more than once on the same instance.

    Environment variables read by ``run()`` for the completion notification:
    ``TELEGRAM_BOT_TOKEN`` (required, otherwise the notification is skipped)
    and ``TELEGRAM_CHAT_ID`` (optional).
    """

    SPORT = None   # override in subclass: value matched against the "sport" column
    SOURCE = None  # override in subclass: value written to the "source" column

    # Directory that receives one log file per harvester start.
    LOG_DIR = "/var/log/pgz-sport-debug"

    # Per-player counters of pgz_sport.player_stats, in INSERT column order.
    STAT_COLUMNS = (
        'nastupi', 'golovi', 'asistencije', 'bodovi', 'trice', 'skokovi',
        'blokade', 'servis_asovi', 'zuti', 'crveni', 'minute',
    )

    # Croatian diacritics folded to ASCII by slugify().
    _SLUG_TRANSLATION = str.maketrans("čćžšđ", "cczsd")

    def __init__(self):
        """Connect to the database and open the per-run log file.

        Raises:
            OSError: if the log directory or file cannot be created; the
                database connection is closed again before re-raising.
        """
        self.conn = psycopg2.connect(DSN)
        self.conn.autocommit = True
        self.stats = {'klubova': 0, 'players': 0, 'stats': 0, 'errors': 0}

        stamp = datetime.now().strftime('%Y%m%d_%H%M')
        log_path = os.path.join(self.LOG_DIR, f"harvest_{self.SPORT}_{stamp}.log")
        try:
            os.makedirs(self.LOG_DIR, exist_ok=True)
            # Explicit UTF-8: log lines contain emoji and Croatian letters.
            self.log_file = open(log_path, "a", encoding="utf-8")
        except OSError:
            self.conn.close()
            raise

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc, tb):
        self.close()
        return False

    def close(self):
        """Close the log file and the database connection (idempotent)."""
        log_file = getattr(self, "log_file", None)
        if log_file is not None and not log_file.closed:
            log_file.close()
        conn = getattr(self, "conn", None)
        if conn is not None and not conn.closed:
            conn.close()

    def log(self, msg):
        """Print a timestamped line and append it to the log file."""
        line = f"[{datetime.now().isoformat(timespec='seconds')}] [{self.SPORT}] {msg}"
        print(line, flush=True)
        self.log_file.write(line + "\n")
        self.log_file.flush()

    def slugify(self, s):
        """Return a lowercase ASCII slug of *s* ('' for empty/None input).

        Croatian diacritics are folded to ASCII, every other character that
        is not a-z, 0-9, whitespace or '-' is dropped, and each whitespace
        run becomes a single '-'.
        """
        if not s:
            return ""
        t = s.lower().strip().translate(self._SLUG_TRANSLATION)
        t = re.sub(r'[^a-z0-9\s-]', '', t)
        return re.sub(r'\s+', '-', t).strip('-')

    def get_target_klubovi(self, limit=999):
        """Return up to *limit* priority clubs for ``self.SPORT``.

        Rows come from ``pgz_sport.v_pgz_priority_klubovi`` as dict-like
        records, funded clubs first, then yearbook clubs, then by id.
        """
        with self.conn.cursor(cursor_factory=RealDictCursor) as cur:
            cur.execute("""
                SELECT * FROM pgz_sport.v_pgz_priority_klubovi
                WHERE sport = %s AND (financiran OR u_godisnjaku)
                ORDER BY financiran DESC, u_godisnjaku DESC, id
                LIMIT %s
            """, (self.SPORT, limit))
            return cur.fetchall()

    def upsert_clan(self, klub_id, source_id, ime, prezime, source_url, kategorija=None, sezona=None, extra=None):
        """Insert or update a player and return its ``clanovi.id``.

        A player is identified by (``self.SOURCE``, ``source_id``). On update,
        existing non-empty name, club and sport values are kept; ``source_url``
        and the timestamps are overwritten and *extra* is merged into the
        ``metadata`` JSON. When *kategorija* is given, a row is also added to
        ``pgz_sport.clan_kategorije`` (conflicts are ignored).

        Raises:
            ValueError: if *source_id* is None or blank. ``str(None)`` would
                otherwise be stored as the literal id ``'None'`` and merge
                unrelated players into one row.
        """
        if source_id is None or not str(source_id).strip():
            raise ValueError("upsert_clan requires a non-empty source_id")

        # Collapse internal whitespace runs and trim both name parts.
        ime = re.sub(r'\s+', ' ', (ime or '')).strip()
        prezime = re.sub(r'\s+', ' ', (prezime or '')).strip()
        metadata_json = json.dumps(extra or {})

        with self.conn.cursor() as cur:
            # Try to find an existing player by source + source_id.
            cur.execute("""
                SELECT id FROM pgz_sport.clanovi
                WHERE source = %s AND source_id = %s
                ORDER BY id LIMIT 1
            """, (self.SOURCE, str(source_id)))
            row = cur.fetchone()
            if row:
                clan_id = row[0]
                cur.execute("""
                    UPDATE pgz_sport.clanovi
                    SET ime = COALESCE(NULLIF(ime,''), %s),
                        prezime = COALESCE(NULLIF(prezime,''), %s),
                        klub_id = COALESCE(klub_id, %s),
                        source_url = %s, last_updated = now(), last_scraped_at = now(),
                        sport = COALESCE(sport, %s),
                        metadata = COALESCE(metadata, '{}'::jsonb) || %s::jsonb
                    WHERE id = %s
                """, (ime, prezime, klub_id, source_url, self.SPORT, metadata_json, clan_id))
            else:
                cur.execute("""
                    INSERT INTO pgz_sport.clanovi
                    (klub_id, ime, prezime, sport, source, source_id, source_url, last_scraped_at, aktivan, metadata)
                    VALUES (%s, %s, %s, %s, %s, %s, %s, now(), true, %s::jsonb)
                    RETURNING id
                """, (klub_id, ime, prezime, self.SPORT, self.SOURCE, str(source_id), source_url, metadata_json))
                clan_id = cur.fetchone()[0]

            # Add the category membership if specified (many-to-many).
            # NOTE(review): with sezona=NULL a plain unique constraint treats
            # every row as distinct, so ON CONFLICT would not deduplicate --
            # confirm how the constraint on clan_kategorije is declared.
            if kategorija:
                cur.execute("""
                    INSERT INTO pgz_sport.clan_kategorije
                    (clan_id, kategorija, sezona, klub_id, source, source_url)
                    VALUES (%s, %s, %s, %s, %s, %s)
                    ON CONFLICT (clan_id, kategorija, sezona, klub_id) DO NOTHING
                """, (clan_id, kategorija, sezona, klub_id, self.SOURCE, source_url))
        return clan_id

    def upsert_stats(self, clan_id, sezona, klub_id, klub_naziv, natjecanje, kategorija, stats_dict, raw=None):
        """Insert or refresh one ``pgz_sport.player_stats`` row.

        *stats_dict* may contain any of ``STAT_COLUMNS``; missing keys (or a
        ``None`` dict) are stored as NULL. *raw* is stored as the ``metadata``
        JSON. On conflict the counters, metadata and ``scraped_at`` are
        overwritten. This method does not modify ``self.stats``.
        """
        stats = stats_dict or {}
        counters = tuple(stats.get(column) for column in self.STAT_COLUMNS)
        with self.conn.cursor() as cur:
            cur.execute("""
                INSERT INTO pgz_sport.player_stats
                (clan_id, sport, source, sezona, klub_id, klub_naziv, natjecanje, kategorija,
                 nastupi, golovi, asistencije, bodovi, trice, skokovi, blokade, servis_asovi,
                 zuti, crveni, minute, metadata)
                VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s::jsonb)
                ON CONFLICT (clan_id, sport, sezona, klub_id, natjecanje)
                DO UPDATE SET
                    nastupi = EXCLUDED.nastupi, golovi = EXCLUDED.golovi,
                    asistencije = EXCLUDED.asistencije, bodovi = EXCLUDED.bodovi,
                    trice = EXCLUDED.trice, skokovi = EXCLUDED.skokovi,
                    blokade = EXCLUDED.blokade, servis_asovi = EXCLUDED.servis_asovi,
                    zuti = EXCLUDED.zuti, crveni = EXCLUDED.crveni, minute = EXCLUDED.minute,
                    metadata = EXCLUDED.metadata, scraped_at = now()
            """, (clan_id, self.SPORT, self.SOURCE, sezona, klub_id, klub_naziv, natjecanje, kategorija)
                 + counters
                 + (json.dumps(raw or {}),))

    def run(self, limit=999):
        """Scrape up to *limit* target clubs and send a completion notice.

        A failure in one club is logged and counted in ``stats['errors']``;
        the remaining clubs are still processed.
        """
        klubovi = self.get_target_klubovi(limit)
        self.log(f"🚀 Starting {self.SPORT} harvest. Target: {len(klubovi)} klubova")

        with sync_playwright() as pw:
            # NOTE(review): certificate validation is disabled both at browser
            # and context level; kept as-is, but confirm it is still required.
            browser = pw.chromium.launch(headless=True, args=["--no-sandbox","--ignore-certificate-errors"])
            try:
                ctx = browser.new_context(
                    ignore_https_errors=True,
                    user_agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 Chrome/120.0.0.0 Safari/537.36"
                )
                page = ctx.new_page()

                for klub in klubovi:
                    try:
                        self.scrape_klub(page, klub)
                        self.stats['klubova'] += 1
                    except Exception as e:
                        # Per-club boundary: one broken club must not abort the run.
                        self.stats['errors'] += 1
                        self.log(f" ❌ Klub {klub['id']} {klub['naziv']}: {e}")
            finally:
                browser.close()

        self.log(f"✅ Done. Stats: {self.stats}")
        self._notify_telegram(f"{str(self.SPORT).upper()} harvest done: {self.stats}")

    def _notify_telegram(self, text):
        """Best-effort Telegram message; return True only if it was sent.

        The bot token is read from ``TELEGRAM_BOT_TOKEN`` so it is neither
        stored in source control nor exposed on a command line. Failures are
        logged and never raised.
        """
        import urllib.parse
        import urllib.request

        token = os.getenv("TELEGRAM_BOT_TOKEN")
        if not token:
            self.log("Telegram notification skipped: TELEGRAM_BOT_TOKEN not set")
            return False
        chat_id = os.getenv("TELEGRAM_CHAT_ID", "7969491558")
        payload = urllib.parse.urlencode({"chat_id": chat_id, "text": text}).encode("utf-8")
        try:
            with urllib.request.urlopen(
                f"https://api.telegram.org/bot{token}/sendMessage",
                data=payload,
                timeout=8,
            ):
                pass
        except Exception as exc:
            # Notification is optional; report the failure instead of hiding it.
            self.log(f"Telegram notification failed: {exc}")
            return False
        return True

    def scrape_klub(self, page, klub):
        """Scrape one club using the shared Playwright *page*; subclasses override."""
        raise NotImplementedError("subclass must implement")
|
||||
Executable
+32
@@ -0,0 +1,32 @@
|
||||
#!/usr/bin/env python3
|
||||
"""HKS-CBF + FIBA LiveStats basketball harvester."""
|
||||
import sys, re
|
||||
sys.path.insert(0, '/opt/pgz-sport/scripts/sport_harvesters')
|
||||
from __base import SportHarvester
|
||||
|
||||
class HKSHarvester(SportHarvester):
    """Basketball harvester: discovers a club's page on hks-cbf.hr."""

    SPORT = 'košarka'
    SOURCE = 'hks_cbf'

    def scrape_klub(self, page, klub):
        """Search hks-cbf.hr for the club name and store the first club link.

        The first of the top three ``/klubovi/`` or ``/klub/`` links that has
        an href is written to ``pgz_sport.klubovi.source_url`` unless that
        column already holds a non-empty value. Errors are logged and counted
        in ``stats['errors']`` instead of being raised.
        """
        from urllib.parse import quote_plus, urljoin

        # Discovery: search by club name. quote_plus also escapes characters
        # such as '&' or '#' that would otherwise truncate the query.
        url = f"https://www.hks-cbf.hr/?s={quote_plus(klub['naziv'])}"
        self.log(f" 🏀 Klub {klub['id']} {klub['naziv']} → {url}")
        try:
            page.goto(url, wait_until="domcontentloaded", timeout=20000)
            # Look for a /klubovi/ or /klub/ link.
            klub_links = page.locator('a[href*="/klubovi/"], a[href*="/klub/"]').all()
            for a in klub_links[:3]:
                href = a.get_attribute('href')
                if not href:
                    continue
                # Store an absolute URL even when the site emits a relative href.
                href = urljoin(url, href)
                self.log(f" Found: {href}")
                # Save the URL on the club row.
                with self.conn.cursor() as cur:
                    cur.execute(
                        "UPDATE pgz_sport.klubovi SET source_url = COALESCE(NULLIF(source_url,''), %s) WHERE id = %s",
                        (href, klub['id']),
                    )
                break
        except Exception as e:
            # Swallowed on purpose so the run continues, but the failure must
            # still show up in the final statistics.
            self.stats['errors'] += 1
            self.log(f" ❌ {e}")
|
||||
|
||||
if __name__ == '__main__':
    # Optional first CLI argument caps the number of clubs (default 50).
    harvester = HKSHarvester()
    max_klubova = 50 if len(sys.argv) <= 1 else int(sys.argv[1])
    harvester.run(limit=max_klubova)
|
||||
Executable
+21
@@ -0,0 +1,21 @@
|
||||
#!/usr/bin/env python3
|
||||
"""HOS volleyball harvester."""
|
||||
import sys
|
||||
sys.path.insert(0, '/opt/pgz-sport/scripts/sport_harvesters')
|
||||
from __base import SportHarvester
|
||||
|
||||
class HOSHarvester(SportHarvester):
    """Volleyball harvester (discovery mode only)."""

    SPORT = 'odbojka'
    SOURCE = 'hos'

    def scrape_klub(self, page, klub):
        """Load the hos-cvf.hr front page and log the outcome.

        Nothing is written to the database yet. Errors are logged and counted
        in ``stats['errors']`` instead of being raised.
        """
        self.log(f" 🏐 Klub {klub['id']} {klub['naziv']}")
        try:
            page.goto("https://hos-cvf.hr/", wait_until="domcontentloaded", timeout=20000)
            self.log(" [discovery mode] HOS site loaded")
        except Exception as e:
            # Swallowed on purpose so the run continues, but the failure must
            # still show up in the final statistics.
            self.stats['errors'] += 1
            self.log(f" ❌ {e}")
|
||||
|
||||
if __name__ == '__main__':
    # Optional first CLI argument caps the number of clubs (default 50).
    harvester = HOSHarvester()
    max_klubova = 50 if len(sys.argv) <= 1 else int(sys.argv[1])
    harvester.run(limit=max_klubova)
|
||||
Executable
+27
@@ -0,0 +1,27 @@
|
||||
#!/usr/bin/env python3
|
||||
"""HRS handball harvester."""
|
||||
import sys
|
||||
sys.path.insert(0, '/opt/pgz-sport/scripts/sport_harvesters')
|
||||
from __base import SportHarvester
|
||||
|
||||
class HRSHarvester(SportHarvester):
    """Handball harvester: looks for a club/competition link on hrs.hr."""

    SPORT = 'rukomet'
    SOURCE = 'hrs'

    def scrape_klub(self, page, klub):
        """Search hrs.hr for the club name and log the first relevant link.

        Only the first five ``hrs.hr`` links are inspected; the first one
        containing ``natjecanje`` or ``klub`` is logged. Nothing is written
        to the database yet. Errors are logged and counted in
        ``stats['errors']`` instead of being raised.
        """
        from urllib.parse import quote_plus

        # quote_plus also escapes characters such as '&' or '#' that would
        # otherwise truncate the search query.
        url = f"https://hrs.hr/?s={quote_plus(klub['naziv'])}"
        self.log(f" 🤾 Klub {klub['id']} {klub['naziv']}")
        try:
            page.goto(url, wait_until="domcontentloaded", timeout=20000)
            # Find a competition or club link.
            links = page.locator('a[href*="hrs.hr"]').all()
            for a in links[:5]:
                href = a.get_attribute('href') or ''
                if 'natjecanje' in href or 'klub' in href:
                    self.log(f" Found: {href}")
                    break
        except Exception as e:
            # Swallowed on purpose so the run continues, but the failure must
            # still show up in the final statistics.
            self.stats['errors'] += 1
            self.log(f" ❌ {e}")
|
||||
|
||||
if __name__ == '__main__':
    # Optional first CLI argument caps the number of clubs (default 50).
    harvester = HRSHarvester()
    max_klubova = 50 if len(sys.argv) <= 1 else int(sys.argv[1])
    harvester.run(limit=max_klubova)
|
||||
Executable
+54
@@ -0,0 +1,54 @@
|
||||
#!/usr/bin/env python3
|
||||
"""HVS waterpolo harvester."""
|
||||
import sys, re
|
||||
sys.path.insert(0, '/opt/pgz-sport/scripts/sport_harvesters')
|
||||
from __base import SportHarvester
|
||||
|
||||
class HVSHarvester(SportHarvester):
    """Water polo harvester: matches a club on hvs.hr and imports its roster."""

    SPORT = 'vaterpolo'
    SOURCE = 'hvs'

    def scrape_klub(self, page, klub):
        """Find the club on hvs.hr/klubovi/ and upsert its listed players.

        Only the first 30 ``/klub/`` links are inspected. A link matches when
        its text contains any word (longer than 3 characters) of the club
        name. A matching link without a numeric id is skipped and the search
        continues; the search stops after the first club whose roster was
        scraped. Errors are logged and counted in ``stats['errors']`` instead
        of being raised.
        """
        self.log(f" 🤽 Klub {klub['id']} {klub['naziv']}")
        try:
            # Get the list of all clubs from HVS.
            page.goto("https://hvs.hr/klubovi/", wait_until="domcontentloaded", timeout=20000)
            klub_links = page.locator('a[href*="/klub/"]').all()
            # Naive match: does the link text contain any word of the club name.
            # NOTE(review): generic words such as "klub" also pass the length
            # filter, so the first link may match unrelated clubs -- confirm
            # whether a stop-word list is needed.
            keywords = [kw for kw in klub['naziv'].lower().split() if len(kw) > 3]
            for a in klub_links[:30]:
                text = a.inner_text().lower()
                href = a.get_attribute('href') or ''
                if not any(kw in text for kw in keywords):
                    continue
                self.log(f" Match: {text[:50]} → {href}")
                m = re.search(r'/klub/(\d+)', href)
                if not m:
                    continue
                new_url = f"https://hvs.hr/klub/{m.group(1)}/"
                with self.conn.cursor() as cur:
                    cur.execute(
                        "UPDATE pgz_sport.klubovi SET source_url = COALESCE(NULLIF(source_url,''), %s) WHERE id = %s",
                        (new_url, klub['id']),
                    )
                self._scrape_roster(page, klub, new_url)
                # The page has navigated away from the club list; stop here.
                break
        except Exception as e:
            # Swallowed on purpose so the run continues, but the failure must
            # still show up in the final statistics.
            self.stats['errors'] += 1
            self.log(f" ❌ {e}")

    def _scrape_roster(self, page, klub, klub_url):
        """Open *klub_url* and upsert up to 30 linked players for *klub*."""
        from urllib.parse import urljoin

        page.goto(klub_url, wait_until="domcontentloaded", timeout=15000)
        igrac_links = page.locator('a[href*="/igrac/"]').all()
        self.log(f" {len(igrac_links)} igrača found")
        seen_ids = set()
        for ia in igrac_links[:30]:
            ihref = ia.get_attribute('href') or ''
            naziv = ia.inner_text().strip()
            mi = re.search(r'/igrac/(\d+)', ihref)
            if not (mi and naziv):
                continue
            igrac_id = mi.group(1)
            if igrac_id in seen_ids:
                # The same player can be linked more than once on the page.
                continue
            seen_ids.add(igrac_id)
            # First word is the given name, the remainder the surname.
            parts = re.split(r'\s+', naziv, maxsplit=1)
            ime = parts[0]
            prezime = parts[1] if len(parts) > 1 else ''
            self.upsert_clan(
                klub_id=klub['id'], source_id=igrac_id,
                ime=ime, prezime=prezime,
                # urljoin handles absolute, root-relative and relative hrefs.
                source_url=urljoin(klub_url, ihref),
            )
            self.stats['players'] += 1
|
||||
|
||||
if __name__ == '__main__':
    # Optional first CLI argument caps the number of clubs (default 50).
    harvester = HVSHarvester()
    max_klubova = 50 if len(sys.argv) <= 1 else int(sys.argv[1])
    harvester.run(limit=max_klubova)
|
||||
Reference in New Issue
Block a user