HNS endpoints: /clan/{id}/hns-career + /klubovi/pgz-financirani + /dashboard/hns-coverage
Backed by: pgz_sport.hns_player_seasons, hns_klub_roster, v_pgz_financirani_klubovi. Used by: cc-hns subagents for UI integration.
This commit is contained in:
Executable
+349
@@ -0,0 +1,349 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
HNS Master Harvester — Playwright-based scrape semafor.hns.family
|
||||
─────────────────────────────────────────────────────────────────
|
||||
1. List PGŽ financirani nogometni klubovi
|
||||
2. For each klub: scrape klub roster
|
||||
3. For each player: scrape full profile (sezone, utakmice)
|
||||
4. UPSERT u pgz_sport: hns_klub_roster, hns_player_seasons, hns_player_matches, clanovi
|
||||
5. Audit log
|
||||
|
||||
Usage: python3 hns_master_harvester.py [--limit N] [--klub-id X] [--single-player HNS_ID]
|
||||
"""
|
||||
import os, sys, time, json, re, argparse, traceback
|
||||
from datetime import datetime
|
||||
from urllib.parse import urlparse
|
||||
import psycopg2
|
||||
from psycopg2.extras import RealDictCursor, execute_values
|
||||
from playwright.sync_api import sync_playwright
|
||||
|
||||
# libpq connection string for the target Postgres database.
# The password is deliberately NOT embedded: pass a complete DSN through
# RINET_DSN, or let libpq read the password from PGPASSWORD / ~/.pgpass.
# NOTE(review): an earlier revision committed the DB password and the Telegram
# bot token as fallback defaults; treat both as leaked and rotate them.
DSN = os.getenv("RINET_DSN",
                "host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet")
# Telegram bot token used for notifications; empty when TG_BOT_TOKEN is unset.
TG = os.getenv("TG_BOT_TOKEN", "")
# Telegram chat that receives the notifications.
TG_CHAT = os.getenv("TG_CHAT", "7969491558")

# Per-run log file, named after the start minute and opened in append mode.
# The directory is created on demand, and UTF-8 is forced because log lines
# contain emoji and Croatian diacritics that a non-UTF-8 locale cannot encode.
_LOG_DIR = "/var/log/pgz-sport-debug"
os.makedirs(_LOG_DIR, exist_ok=True)
LOG = open(
    f"{_LOG_DIR}/hns_harvester_{datetime.now().strftime('%Y%m%d_%H%M')}.log",
    "a",
    encoding="utf-8",
)
|
||||
|
||||
def log(msg, telegram=False):
    """Write a timestamped line to stdout and the run log; optionally notify Telegram.

    Args:
        msg: Text to log.
        telegram: When true (and a bot token is configured), also send the
            first 2000 characters of ``msg`` to the ``TG_CHAT`` chat through
            the Telegram Bot API ``sendMessage`` endpoint.

    Telegram delivery is best-effort: a failure is recorded in the local log
    and never raised to the caller.
    """
    line = f"[{datetime.now().isoformat(timespec='seconds')}] {msg}"
    print(line, flush=True)
    LOG.write(line + "\n")
    LOG.flush()

    if not (telegram and TG):
        return

    # Function-scope imports keep this block self-contained.
    import http.client
    import urllib.parse
    import urllib.request

    # The request is made in-process instead of shelling out to curl, so the
    # bot token never appears in the process list and no curl binary is needed.
    payload = urllib.parse.urlencode(
        {"chat_id": TG_CHAT, "text": str(msg)[:2000]}
    ).encode("utf-8")
    try:
        with urllib.request.urlopen(
            f"https://api.telegram.org/bot{TG}/sendMessage",
            data=payload,  # a body makes urlopen issue a POST
            timeout=8,
        ):
            pass
    except (OSError, ValueError, http.client.HTTPException) as exc:
        # URLError, HTTPError and socket timeouts are all OSError subclasses.
        warning = (
            f"[{datetime.now().isoformat(timespec='seconds')}] "
            f"Telegram notify failed: {exc}"
        )
        print(warning, flush=True)
        LOG.write(warning + "\n")
        LOG.flush()
|
||||
|
||||
def db_conn():
    """Open a psycopg2 connection to ``DSN`` with autocommit switched on."""
    connection = psycopg2.connect(DSN)
    connection.autocommit = True
    return connection
|
||||
|
||||
# ── HNS slug: "Franko Andrijašević" → "franko-andrijasevic" ──
def slugify_hns(text):
    """Turn a display name into a lowercase, hyphen-separated ASCII slug.

    Croatian diacritics are mapped to their base letters, every other
    character outside ``a-z``, ``0-9``, whitespace and ``-`` is dropped, and
    whitespace runs become single hyphens. Falsy input yields ``""``.
    """
    if not text:
        return ""
    lowered = text.lower().strip()
    # The input is already lowercased, so only lowercase diacritics can occur.
    without_diacritics = lowered.translate(str.maketrans("čćžšđ", "cczsd"))
    allowed_only = re.sub(r'[^a-z0-9\s-]', '', without_diacritics)
    hyphenated = re.sub(r'\s+', '-', allowed_only)
    return hyphenated.strip('-')
|
||||
|
||||
def scrape_player(page, hns_id, slug):
    """Scrape one player's profile page and collect per-season rows.

    Args:
        page: Playwright (sync API) page used for navigation and queries.
        hns_id: Player id used in the ``/igraci/<id>/<slug>/`` URL.
        slug: Player slug used in the same URL.

    Returns:
        ``None`` when navigation fails (the failure is logged). Otherwise a
        dict with the keys ``hns_id``, ``slug``, ``naziv`` (text of the first
        ``<h1>``, or ``''``), ``url``, ``current_klub`` (dict built from the
        first ``/klubovi/`` link, or ``None``), ``sezone_count``, ``seasons``,
        ``matches`` and ``body_text_len``.

        ``seasons`` mixes two row shapes: rows parsed from the page text
        (keys ``sezona``, ``klub``, ``nastupi``, ``golovi``, with int stats)
        and rows taken from HTML tables (table headers as keys, string
        values, plus ``sezona``). The same season can therefore appear twice.
        ``matches`` is always empty; nothing here populates it.
    """
    # Function-scope import: the module itself only imports sync_playwright.
    from playwright.sync_api import TimeoutError as PlaywrightTimeoutError

    url = f"https://semafor.hns.family/igraci/{hns_id}/{slug}/"
    try:
        page.goto(url, wait_until="networkidle", timeout=30000)
    except Exception as e:
        log(f" ❌ Goto fail {url}: {e}")
        return None

    h1_locator = page.locator('h1')
    h1 = h1_locator.first.inner_text() if h1_locator.count() else ''

    # Current club: taken from the first /klubovi/ link on the page.
    current_klub = None
    klub_links = page.locator('a[href*="/klubovi/"]').all()
    if klub_links:
        href = klub_links[0].get_attribute('href') or ''
        m = re.search(r'/klubovi/(\d+)/([\w-]+)/', href)
        if m:
            current_klub = {
                'hns_id': m.group(1),
                'slug': m.group(2),
                'naziv': klub_links[0].inner_text().strip(),
            }

    seasons_data = []
    matches_data = []

    # Best-effort wait for content that may be rendered dynamically. A timeout
    # is not an error; the page is parsed with whatever is present.
    try:
        page.wait_for_selector('table, .karijera, .sezona, [class*="season"]', timeout=8000)
    except PlaywrightTimeoutError:
        pass
    time.sleep(1)

    # Read the body text only after the wait so late content is included.
    body_text = page.locator('body').inner_text()

    # Career section in plain text: "<season> <club text> <numbers>", e.g.
    # "2024/25 ... HNK Orijent ... 14 ... 2". The first number is taken as
    # appearances and the second as goals.
    season_blocks = re.findall(r'(20\d{2}/\d{2})\s+([\w\s\u017c-\u017e\u0107\u010d\u0161\u017d\u0110\.\-]+?)\s+([\d\.\s]+)(?=20\d{2}/\d{2}|$)', body_text)
    for sezona, klub_text, stats_text in season_blocks:
        nums = re.findall(r'\d+', stats_text)
        if not nums:
            continue
        seasons_data.append({
            'sezona': sezona,
            'klub': klub_text.strip()[:200],
            'nastupi': int(nums[0]),
            'golovi': int(nums[1]) if len(nums) > 1 else 0,
        })

    # HTML tables: the first row is used as the header, and every later row
    # that contains a season-looking cell ("2024/25") becomes a season row.
    for table in page.locator('table').all():
        rows = table.locator('tr').all()
        if len(rows) < 2:
            continue
        header = [c.inner_text().strip() for c in rows[0].locator('th, td').all()]
        for r in rows[1:]:
            cells = [c.inner_text().strip() for c in r.locator('th, td').all()]
            if not cells:
                continue
            row_dict = dict(zip(header, cells))
            sezona = next((v for v in row_dict.values() if re.match(r'\d{4}/\d{2}', v)), None)
            if sezona:
                seasons_data.append({**row_dict, 'sezona': sezona})

    return {
        'hns_id': hns_id,
        'slug': slug,
        'naziv': h1,
        'url': url,
        'current_klub': current_klub,
        'sezone_count': len(seasons_data),
        'seasons': seasons_data,
        'matches': matches_data,
        'body_text_len': len(body_text),
    }
|
||||
|
||||
def scrape_klub_roster(page, klub_hns_id, klub_slug):
    """Collect the players linked from a club page.

    Opens ``/klubovi/<klub_hns_id>/<klub_slug>/`` and returns one dict per
    distinct player id found in ``/igraci/`` links, in page order, with the
    keys ``hns_id``, ``slug``, ``naziv`` and ``url``. Returns an empty list
    when navigation fails (the failure is logged).
    """
    url = f"https://semafor.hns.family/klubovi/{klub_hns_id}/{klub_slug}/"
    try:
        page.goto(url, wait_until="networkidle", timeout=30000)
    except Exception as e:
        log(f" ❌ Goto fail {url}: {e}")
        return []

    # Keyed by player id so that only the first link per player is kept;
    # dict insertion order preserves page order.
    roster_by_id = {}
    for anchor in page.locator('a[href*="/igraci/"]').all():
        href = anchor.get_attribute('href') or ''
        found = re.search(r'/igraci/(\d+)/([\w-]+)', href)
        if found is None:
            continue
        player_id, player_slug = found.group(1), found.group(2)
        if player_id in roster_by_id:
            continue
        display_name = anchor.inner_text().strip()
        # Site-relative links are made absolute; anything else is kept as-is.
        if href.startswith('/'):
            player_url = f"https://semafor.hns.family{href}"
        else:
            player_url = href
        roster_by_id[player_id] = {
            'hns_id': player_id,
            'slug': player_slug,
            'naziv': display_name,
            'url': player_url,
        }
    return list(roster_by_id.values())
|
||||
|
||||
# One capitalised word using Croatian letters; used to split names that were
# rendered without a separator, e.g. "FrankoAndrijašević".
_CAMEL_NAME_PART = re.compile(r'[A-ZČĆŠŽĐ][a-zčćšžđ\']+')


def _split_player_name(naziv):
    """Split a display name into ``(ime, prezime)``."""
    naziv = re.sub(r'\s+', ' ', naziv or '').strip()
    if ' ' in naziv:
        # The name already has a separator: first token is the given name and
        # the rest is the surname, as in upsert_klub_roster. This keeps
        # hyphens, all-caps names and non-Croatian letters intact.
        ime, prezime = naziv.split(' ', 1)
        return ime, prezime
    # No whitespace: fall back to splitting on capitalised words.
    parts = _CAMEL_NAME_PART.findall(naziv)
    if len(parts) >= 2:
        return parts[0], ' '.join(parts[1:])
    return naziv, ''


def upsert_clan(conn, klub_id, player_data):
    """Insert or update the ``pgz_sport.clanovi`` row for a scraped player.

    Args:
        conn: Open psycopg2 connection.
        klub_id: Local club id stored on the member row.
        player_data: Dict with ``hns_id`` and ``url`` (required) and
            ``naziv`` (optional display name).

    Returns:
        The ``id`` of the existing or newly inserted ``clanovi`` row.

    An existing row is looked up by ``hns_igrac_id`` (lowest id wins). On
    update, ``ime``/``prezime`` are only filled when currently empty and
    ``klub_id``/``sport`` only when currently NULL.
    """
    ime, prezime = _split_player_name(player_data.get('naziv', ''))
    hns_id = player_data['hns_id']
    url = player_data['url']

    with conn.cursor() as cur:
        # Look for an existing member with this HNS player id.
        cur.execute("""
            SELECT id FROM pgz_sport.clanovi
            WHERE hns_igrac_id = %s
            ORDER BY id LIMIT 1
        """, (hns_id,))
        row = cur.fetchone()
        if row:
            clan_id = row[0]
            cur.execute("""
                UPDATE pgz_sport.clanovi
                SET ime = COALESCE(NULLIF(ime,''), %s),
                    prezime = COALESCE(NULLIF(prezime,''), %s),
                    klub_id = COALESCE(klub_id, %s),
                    hns_igrac_id = %s,
                    source = 'hns_semafor',
                    source_url = %s,
                    last_updated = now(),
                    last_scraped_at = now(),
                    sport = COALESCE(sport, 'nogomet')
                WHERE id = %s
            """, (ime, prezime, klub_id, hns_id, url, clan_id))
        else:
            cur.execute("""
                INSERT INTO pgz_sport.clanovi
                    (klub_id, ime, prezime, sport, source, source_url, hns_igrac_id, last_scraped_at, aktivan)
                VALUES (%s, %s, %s, 'nogomet', 'hns_semafor', %s, %s, now(), true)
                RETURNING id
            """, (klub_id, ime, prezime, url, hns_id))
            clan_id = cur.fetchone()[0]
    return clan_id
|
||||
|
||||
def _season_stat(season, key):
    """Return the integer stat from the first column whose name contains ``key``."""
    for column, value in season.items():
        if key in column.lower():
            # str() is required: seasons parsed from page text carry int
            # values, and re.sub() on an int raises TypeError. That error used
            # to be swallowed, silently turning those stats into 0.
            return int(re.sub(r'\D', '', str(value)) or 0)
    return 0


def upsert_seasons(conn, hns_id, clan_id, seasons):
    """Bulk-upsert a player's season rows into ``pgz_sport.hns_player_seasons``.

    Args:
        conn: Open psycopg2 connection.
        hns_id: HNS player id stored as ``hns_igrac_id``.
        clan_id: Local ``clanovi`` id of the player.
        seasons: Iterable of dicts as produced by ``scrape_player``. Each needs
            a non-empty ``sezona``; other columns are matched by substring of
            the lowercased key (``lub``, ``atjec``/``liga``, ``nastup``,
            ``start``, ``zamj``, ``gol``, ``asist``, ``žut``, ``crv``,
            ``minut``).

    Returns:
        Number of rows sent to the database. Returns 0 without touching the
        connection when there is nothing valid to write.
    """
    if not seasons:
        return 0

    rows = []
    for s in seasons:
        sezona = s.get('sezona', '')
        if not sezona:
            continue
        klub = next((v for k, v in s.items() if 'lub' in k.lower()), '')
        natjecanje = next(
            (v for k, v in s.items() if 'atjec' in k.lower() or 'liga' in k.lower()),
            '',
        )
        rows.append((
            hns_id, clan_id, sezona, None, klub, natjecanje,
            _season_stat(s, 'nastup'), _season_stat(s, 'start'), _season_stat(s, 'zamj'),
            _season_stat(s, 'gol'), _season_stat(s, 'asist'), _season_stat(s, 'žut'),
            _season_stat(s, 'crv'), _season_stat(s, 'minut'),
        ))

    if not rows:
        return 0

    # NOTE(review): klub_hns_id is always NULL here. If the conflict target is
    # a plain unique index, NULLs are presumably distinct, so ON CONFLICT would
    # never fire and re-runs would insert duplicates. Verify against the schema.
    with conn.cursor() as cur:
        execute_values(cur, """
            INSERT INTO pgz_sport.hns_player_seasons
                (hns_igrac_id, clan_id, sezona, klub_hns_id, klub_naziv, natjecanje,
                 nastupi, startna, zamjena, golovi, asistencije, zuti, crveni, minute)
            VALUES %s
            ON CONFLICT (hns_igrac_id, sezona, klub_hns_id, natjecanje)
            DO UPDATE SET
                nastupi = EXCLUDED.nastupi, startna = EXCLUDED.startna,
                zamjena = EXCLUDED.zamjena, golovi = EXCLUDED.golovi,
                asistencije = EXCLUDED.asistencije, zuti = EXCLUDED.zuti,
                crveni = EXCLUDED.crveni, minute = EXCLUDED.minute,
                scraped_at = now()
        """, rows)
    return len(rows)
|
||||
|
||||
def upsert_klub_roster(conn, klub_id, klub_hns_id, players):
    """Bulk-upsert a club's roster into ``pgz_sport.hns_klub_roster``.

    Each player's ``naziv`` is split on whitespace: the first token becomes
    ``ime`` and the remainder ``prezime``. On a
    ``(klub_hns_id, hns_igrac_id)`` conflict only ``klub_id`` and
    ``scraped_at`` are refreshed. Returns the number of rows sent, or 0 when
    ``players`` is empty.
    """
    if not players:
        return 0

    rows = []
    for player in players:
        display_name = player.get('naziv', '')
        if display_name:
            tokens = display_name.split()
            ime = tokens[0]
            prezime = ' '.join(tokens[1:])
        else:
            ime = ''
            prezime = ''
        rows.append((
            klub_id,
            klub_hns_id,
            player['hns_id'],
            ime,
            prezime,
            player.get('pozicija', ''),
            player.get('url', ''),
        ))

    with conn.cursor() as cur:
        execute_values(cur, """
            INSERT INTO pgz_sport.hns_klub_roster
                (klub_id, klub_hns_id, hns_igrac_id, ime, prezime, pozicija, source_url)
            VALUES %s
            ON CONFLICT (klub_hns_id, hns_igrac_id)
            DO UPDATE SET klub_id = EXCLUDED.klub_id, scraped_at = now()
        """, rows)
    return len(rows)
|
||||
|
||||
def _load_target_klubovi(conn, args):
    """Return the club rows to harvest: one club by id, or the funded football clubs."""
    with conn.cursor(cursor_factory=RealDictCursor) as cur:
        # "is not None" so that an explicit id of 0 is still honoured.
        if args.klub_id is not None:
            cur.execute("SELECT * FROM pgz_sport.klubovi WHERE id = %s", (args.klub_id,))
        else:
            cur.execute("""
                SELECT * FROM pgz_sport.v_pgz_financirani_klubovi
                WHERE sport = 'nogomet' AND source_url LIKE %s
                ORDER BY id LIMIT %s
            """, ('%semafor.hns.family/klubovi%', args.limit))
        return cur.fetchall()


def _harvest_klub(conn, page, klub, stats, max_players):
    """Scrape one club's roster and players, store them, and update ``stats``."""
    src = klub.get('source_url', '') or ''
    m = re.search(r'/klubovi/(\d+)/([^/]*)', src)
    if not m:
        log(f" ⏭ Klub {klub['id']} {klub['naziv']} — no HNS URL")
        return
    khns, kslug = m.group(1), m.group(2) or 'klub'
    log(f"\n🏟 Klub {klub['id']} {klub['naziv']} → HNS {khns}/{kslug}")

    roster = scrape_klub_roster(page, khns, kslug)
    log(f" Roster: {len(roster)} igrača")
    if roster:
        upsert_klub_roster(conn, klub['id'], khns, roster)

    # Safety cap on how many players are scraped per club.
    for p in roster[:max_players]:
        try:
            time.sleep(0.5)  # small pause between player pages
            pdata = scrape_player(page, p['hns_id'], p['slug'])
            if pdata:
                clan_id = upsert_clan(conn, klub['id'], pdata)
                n_seas = upsert_seasons(conn, pdata['hns_id'], clan_id, pdata.get('seasons', []))
                stats['players_scraped'] += 1
                stats['seasons_upserted'] += n_seas
                log(f" ✓ {pdata['naziv']} (clan_id={clan_id}, seasons={n_seas})")
        except Exception as e:
            # One bad player must not abort the rest of the club.
            stats['errors'] += 1
            log(f" ❌ Player {p['hns_id']}: {e}")

    stats['klubova'] += 1


def main():
    """Command-line entry point: harvest rosters and player seasons for the target clubs.

    Options:
        --limit N            Maximum number of funded clubs to process (default 999).
                             Not applied when --klub-id is given.
        --klub-id X          Process only the ``pgz_sport.klubovi`` row with this id.
        --single-player ID   Test mode: scrape one player by HNS id (slug
                             ``unknown``), log the result, write nothing.
        --max-players N      Maximum players scraped per club (default 30).

    The DB connection and the browser are always closed, also on failure.
    Start and end summaries are additionally sent to Telegram via ``log``.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument('--limit', type=int, default=999)
    ap.add_argument('--klub-id', type=int, default=None)
    ap.add_argument('--single-player', help='HNS ID of single player to scrape')
    ap.add_argument('--max-players', type=int, default=30,
                    help='Max players scraped per klub (default: 30)')
    args = ap.parse_args()

    conn = db_conn()
    try:
        # Single-player test mode needs no club list.
        klubovi = [] if args.single_player else _load_target_klubovi(conn, args)

        log(f"🚀 HNS Harvester starting. Target klubova: {len(klubovi)}", telegram=True)

        stats = {'klubova': 0, 'players_scraped': 0, 'seasons_upserted': 0, 'errors': 0}

        with sync_playwright() as pw:
            browser = pw.chromium.launch(headless=True, args=["--no-sandbox", "--ignore-certificate-errors"])
            try:
                ctx = browser.new_context(
                    ignore_https_errors=True,
                    user_agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
                )
                page = ctx.new_page()

                if args.single_player:
                    # Test mode
                    log(f"🔬 Single player mode: {args.single_player}")
                    data = scrape_player(page, args.single_player, 'unknown')
                    log(f" Data: {json.dumps(data, default=str, ensure_ascii=False)[:500]}")
                    return

                for klub in klubovi:
                    try:
                        _harvest_klub(conn, page, klub, stats, args.max_players)
                    except Exception as e:
                        # One bad club must not abort the whole run.
                        stats['errors'] += 1
                        log(f" ❌ Klub {klub['id']}: {e}\n{traceback.format_exc()[:500]}")
            finally:
                browser.close()

        summary = f"✅ HNS Harvester done. Klubova: {stats['klubova']}, Players: {stats['players_scraped']}, Seasons: {stats['seasons_upserted']}, Errors: {stats['errors']}"
        log(summary, telegram=True)
    finally:
        conn.close()
|
||||
|
||||
# Run the harvester only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
|
||||
Reference in New Issue
Block a user