Files
pgz-sport/scripts/hns_master_harvester.py
T
damir c68fd4471e HNS endpoints: /clan/{id}/hns-career + /klubovi/pgz-financirani + /dashboard/hns-coverage
Backed by: pgz_sport.hns_player_seasons, hns_klub_roster, v_pgz_financirani_klubovi
Used by: cc-hns subagents for UI integration
2026-05-05 10:22:36 +02:00

350 lines
14 KiB
Python
Executable File

#!/usr/bin/env python3
"""
HNS Master Harvester — Playwright-based scrape semafor.hns.family
─────────────────────────────────────────────────────────────────
1. List PGŽ financirani nogometni klubovi
2. For each klub: scrape klub roster
3. For each player: scrape full profile (sezone, utakmice)
4. UPSERT into pgz_sport: hns_klub_roster, hns_player_seasons, clanovi (hns_player_matches is not populated yet)
5. Audit log
Usage: python3 hns_master_harvester.py [--limit N] [--klub-id X] [--single-player HNS_ID]
"""
import os, sys, time, json, re, argparse, traceback
from datetime import datetime
from urllib.parse import urlparse
import psycopg2
from psycopg2.extras import RealDictCursor, execute_values
from playwright.sync_api import sync_playwright
# Runtime configuration is read from the environment so that no credential is
# committed to the repository. Any password/token that was previously committed
# here must be considered leaked and rotated.
#
# RINET_DSN: full libpq connection string. The fallback intentionally carries
# no password; libpq then takes it from PGPASSWORD or ~/.pgpass.
DSN = os.getenv("RINET_DSN", "host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet")
# TG_BOT_TOKEN: Telegram bot token. No fallback: it is a secret.
TG = os.getenv("TG_BOT_TOKEN", "")
TG_CHAT = os.getenv("TG_CHAT", "7969491558")

# One debug log file per run (minute resolution), opened in append mode.
# UTF-8 is forced because log lines contain emoji and Croatian letters, which
# would raise UnicodeEncodeError under an ASCII/C locale.
LOG_DIR = "/var/log/pgz-sport-debug"
os.makedirs(LOG_DIR, exist_ok=True)
LOG = open(
    f"{LOG_DIR}/hns_harvester_{datetime.now().strftime('%Y%m%d_%H%M')}.log",
    "a",
    encoding="utf-8",
)
def log(msg, telegram=False):
    """Write a timestamped line to stdout and the run log; optionally notify Telegram.

    Args:
        msg: Message to log. Converted with ``str()`` where a string is needed.
        telegram: When true, also POST the first 2000 characters of the
            message to the Telegram chat ``TG_CHAT`` via ``curl``.

    The Telegram notification is best-effort: a missing ``curl`` binary or a
    timeout is noted in the log file and never raised to the caller.
    """
    line = f"[{datetime.now().isoformat(timespec='seconds')}] {msg}"
    print(line, flush=True)
    LOG.write(line + "\n")
    LOG.flush()

    # Nothing to send, or no bot token configured.
    if not telegram or not TG:
        return

    import subprocess
    try:
        subprocess.run(
            ["curl", "-s", "-X", "POST",
             f"https://api.telegram.org/bot{TG}/sendMessage",
             "-d", f"chat_id={TG_CHAT}",
             "--data-urlencode", f"text={str(msg)[:2000]}"],
            timeout=8, capture_output=True)
    except (OSError, subprocess.SubprocessError) as exc:
        # Only the exception type is recorded: str(TimeoutExpired) embeds the
        # full command line, which contains the bot token.
        LOG.write(f"[{datetime.now().isoformat(timespec='seconds')}] "
                  f"telegram notify failed: {type(exc).__name__}\n")
        LOG.flush()
def db_conn():
    """Return a new psycopg2 connection to ``DSN`` with autocommit switched on."""
    connection = psycopg2.connect(DSN)
    # Every statement is committed immediately; callers never call commit().
    connection.autocommit = True
    return connection
# ── HNS slug: "Franko Andrijašević" → "franko-andrijasevic" ──
def slugify_hns(text):
    """Turn a display name into a lowercase, hyphen-separated ASCII slug.

    Args:
        text: Name to convert; ``None`` or empty yields ``""``.

    Returns:
        The slug: lowercased, diacritics folded to their base letter,
        every character other than ``a-z``, ``0-9``, whitespace and ``-``
        removed, and whitespace runs replaced by a single ``-``.
    """
    if not text:
        return ""
    import unicodedata

    lowered = text.lower().strip()
    # 'đ' has no Unicode decomposition, so it must be mapped explicitly.
    lowered = lowered.replace('đ', 'd')
    # NFKD splits accented letters into base letter + combining mark; dropping
    # the marks folds č/ć/ž/š as well as é, ü, ñ ... instead of deleting them.
    decomposed = unicodedata.normalize('NFKD', lowered)
    folded = ''.join(ch for ch in decomposed if not unicodedata.combining(ch))
    folded = re.sub(r'[^a-z0-9\s-]', '', folded)
    return re.sub(r'\s+', '-', folded).strip('-')
def scrape_player(page, hns_id, slug):
    """Scrape one player profile page and collect its season rows.

    Args:
        page: Playwright sync page object used for navigation and queries.
        hns_id: HNS player id, used in the profile URL.
        slug: URL slug of the player, used in the profile URL.

    Returns:
        ``None`` when navigation fails. Otherwise a dict with the keys
        ``hns_id``, ``slug``, ``naziv`` (text of the first ``h1``), ``url``,
        ``current_klub`` (dict with ``hns_id``/``slug``/``naziv`` or ``None``),
        ``sezone_count``, ``seasons``, ``matches`` and ``body_text_len``.

        ``seasons`` mixes two row shapes: rows parsed from the page text
        (keys ``sezona``, ``klub``, ``nastupi``, ``golovi`` with int stats)
        and rows parsed from HTML tables (keys are the table's header texts
        plus ``sezona``, all values strings). ``matches`` is always an empty
        list; match scraping is not implemented in this function.
    """
    url = f"https://semafor.hns.family/igraci/{hns_id}/{slug}/"
    try:
        page.goto(url, wait_until="networkidle", timeout=30000)
    except Exception as e:
        log(f" ❌ Goto fail {url}: {e}")
        return None

    h1 = page.locator('h1').first.inner_text() if page.locator('h1').count() else ''

    # Current club: taken from the first link on the page pointing at /klubovi/.
    current_klub = None
    klub_links = page.locator('a[href*="/klubovi/"]').all()
    if klub_links:
        href = klub_links[0].get_attribute('href') or ''
        m = re.search(r'/klubovi/(\d+)/([\w-]+)/', href)
        if m:
            current_klub = {'hns_id': m.group(1), 'slug': m.group(2), 'naziv': klub_links[0].inner_text().strip()}

    # Give dynamically rendered career content a chance to appear. Not finding
    # the selector is fine (the page may simply have no such content), so the
    # wait is best-effort.
    try:
        page.wait_for_selector('table, .karijera, .sezona, [class*="season"]', timeout=8000)
    except Exception:
        pass
    time.sleep(1)
    body_text = page.locator('body').inner_text()

    seasons_data = []
    matches_data = []

    # Text pass. Expected shape: "2024/25 <club text> <numbers>", e.g.
    # "2024/25 ... HNK Orijent ... 14 ... 2". The first number is read as
    # appearances, the second (if any) as goals.
    season_blocks = re.findall(r'(20\d{2}/\d{2})\s+([\w\s\u017c-\u017e\u0107\u010d\u0161\u017d\u0110\.\-]+?)\s+([\d\.\s]+)(?=20\d{2}/\d{2}|$)', body_text)
    for sezona, klub_text, stats_text in season_blocks:
        nums = re.findall(r'\d+', stats_text)
        if not nums:
            continue
        seasons_data.append({
            'sezona': sezona,
            'klub': klub_text.strip()[:200],
            'nastupi': int(nums[0]),
            'golovi': int(nums[1]) if len(nums) > 1 else 0,
        })

    # Table pass. The first row of each table is treated as the header; a data
    # row is kept only if one of its cells starts with a season like "2024/25".
    for t in page.locator('table').all():
        rows = t.locator('tr').all()
        if len(rows) < 2:
            continue
        header = [c.inner_text().strip() for c in rows[0].locator('th, td').all()]
        for r in rows[1:]:
            cells = [c.inner_text().strip() for c in r.locator('th, td').all()]
            if not cells:
                continue
            row_dict = dict(zip(header, cells))
            sezona = next((v for v in row_dict.values() if re.match(r'\d{4}/\d{2}', v)), None)
            if sezona:
                seasons_data.append({**row_dict, 'sezona': sezona})

    return {
        'hns_id': hns_id,
        'slug': slug,
        'naziv': h1,
        'url': url,
        'current_klub': current_klub,
        'sezone_count': len(seasons_data),
        'seasons': seasons_data,
        'matches': matches_data,
        'body_text_len': len(body_text),
    }
def scrape_klub_roster(page, klub_hns_id, klub_slug):
    """Scrape a club page and return the players linked from it.

    Args:
        page: Playwright sync page object used for navigation and queries.
        klub_hns_id: HNS club id, used in the club URL.
        klub_slug: URL slug of the club, used in the club URL.

    Returns:
        A list of dicts with the keys ``hns_id``, ``slug``, ``naziv`` and
        ``url``, one per distinct player id, in order of first appearance.
        An empty list when navigation fails.
    """
    url = f"https://semafor.hns.family/klubovi/{klub_hns_id}/{klub_slug}/"
    try:
        page.goto(url, wait_until="networkidle", timeout=30000)
    except Exception as e:
        log(f" ❌ Goto fail {url}: {e}")
        return []

    # Keyed by player id; dict insertion order keeps first-appearance order.
    players_by_id = {}
    for a in page.locator('a[href*="/igraci/"]').all():
        href = a.get_attribute('href') or ''
        m = re.search(r'/igraci/(\d+)/([\w-]+)', href)
        if not m:
            continue
        hns_id = m.group(1)
        known = players_by_id.get(hns_id)
        if known is not None:
            # A player can be linked several times; if the first anchor had no
            # text (e.g. it wrapped only an image), take the name from a later one.
            if not known['naziv']:
                known['naziv'] = a.inner_text().strip()
            continue
        players_by_id[hns_id] = {
            'hns_id': hns_id,
            'slug': m.group(2),
            'naziv': a.inner_text().strip(),
            # Site-relative links are made absolute; others are kept as found.
            'url': f"https://semafor.hns.family{href}" if href.startswith('/') else href,
        }
    return list(players_by_id.values())
def _split_ime_prezime(naziv):
    """Split a player display name into ``(ime, prezime)``."""
    naziv = re.sub(r'\s+', ' ', naziv or '').strip()
    if ' ' in naziv:
        # Whitespace is the reliable separator: first token is the given name,
        # the rest the surname. This keeps any alphabet and casing intact.
        ime, _, prezime = naziv.partition(' ')
        return ime, prezime
    # No whitespace: the heading may be glued together ("FrankoAndrijašević"),
    # so fall back to splitting on capitalised words.
    parts = re.findall(r'[A-ZČĆŠŽĐ][a-zčćšžđ\']+', naziv)
    if len(parts) >= 2:
        return parts[0], ' '.join(parts[1:])
    return naziv, ''


def upsert_clan(conn, klub_id, player_data):
    """Insert or update a ``pgz_sport.clanovi`` row from scraped HNS profile data.

    Args:
        conn: Open database connection (cursors are used as context managers).
        klub_id: ``pgz_sport`` club id to attach the member to.
        player_data: Dict with at least ``hns_id`` and ``url``; ``naziv`` is
            optional and is split into first name / surname.

    Returns:
        The ``id`` of the existing or newly inserted ``clanovi`` row.

    An existing row is looked up by ``hns_igrac_id``. On update, non-empty
    ``ime``/``prezime`` and non-null ``klub_id``/``sport`` already stored are
    kept; only missing values are filled in.
    """
    ime, prezime = _split_ime_prezime(player_data.get('naziv', ''))
    hns_id = player_data['hns_id']
    url = player_data['url']

    with conn.cursor() as cur:
        cur.execute("""
            SELECT id FROM pgz_sport.clanovi
            WHERE hns_igrac_id = %s
            ORDER BY id LIMIT 1
        """, (hns_id,))
        row = cur.fetchone()
        if row:
            clan_id = row[0]
            cur.execute("""
                UPDATE pgz_sport.clanovi
                SET ime = COALESCE(NULLIF(ime,''), %s),
                    prezime = COALESCE(NULLIF(prezime,''), %s),
                    klub_id = COALESCE(klub_id, %s),
                    hns_igrac_id = %s,
                    source = 'hns_semafor',
                    source_url = %s,
                    last_updated = now(),
                    last_scraped_at = now(),
                    sport = COALESCE(sport, 'nogomet')
                WHERE id = %s
            """, (ime, prezime, klub_id, hns_id, url, clan_id))
        else:
            cur.execute("""
                INSERT INTO pgz_sport.clanovi
                    (klub_id, ime, prezime, sport, source, source_url, hns_igrac_id, last_scraped_at, aktivan)
                VALUES (%s, %s, %s, 'nogomet', 'hns_semafor', %s, %s, now(), true)
                RETURNING id
            """, (klub_id, ime, prezime, url, hns_id))
            clan_id = cur.fetchone()[0]
    return clan_id
def _season_stat(season, key):
    """Return the integer stat from the first column whose name contains ``key``.

    Values may be ints (rows parsed from page text) or strings (rows parsed
    from tables); strings are reduced to their digits. Missing column or a
    value without digits yields 0.
    """
    for col, value in season.items():
        if key in col.lower():
            if isinstance(value, int):
                return int(value)
            return int(re.sub(r'\D', '', str(value)) or 0)
    return 0


def upsert_seasons(conn, hns_id, clan_id, seasons):
    """Upsert per-season statistics into ``pgz_sport.hns_player_seasons``.

    Args:
        conn: Open database connection.
        hns_id: HNS player id stored as ``hns_igrac_id``.
        clan_id: ``pgz_sport.clanovi`` id of the player.
        seasons: Iterable of dicts; each needs a non-empty ``sezona``. Other
            columns are matched by substring of their lowercased name
            (``lub`` → club, ``atjec``/``liga`` → competition, ``nastup``,
            ``start``, ``zamj``, ``gol``, ``asist``, ``žut``, ``crv``,
            ``minut``).

    Returns:
        Number of rows sent to the database (0 if none qualified; in that
        case no cursor is opened).
    """
    if not seasons:
        return 0

    rows = []
    for s in seasons:
        sezona = s.get('sezona', '')
        if not sezona:
            continue
        klub = next((v for k, v in s.items() if 'lub' in k.lower()), '')
        natjecanje = next((v for k, v in s.items() if 'atjec' in k.lower() or 'liga' in k.lower()), '')
        rows.append((
            hns_id, clan_id, sezona, None, klub, natjecanje,
            _season_stat(s, 'nastup'), _season_stat(s, 'start'), _season_stat(s, 'zamj'),
            _season_stat(s, 'gol'), _season_stat(s, 'asist'), _season_stat(s, 'žut'),
            _season_stat(s, 'crv'), _season_stat(s, 'minut'),
        ))

    if not rows:
        return 0

    # NOTE(review): klub_hns_id is always sent as NULL. With a plain unique
    # constraint PostgreSQL treats NULLs as distinct, so ON CONFLICT would
    # never fire and re-runs would insert duplicates — confirm how the
    # constraint on (hns_igrac_id, sezona, klub_hns_id, natjecanje) is defined.
    with conn.cursor() as cur:
        execute_values(cur, """
            INSERT INTO pgz_sport.hns_player_seasons
                (hns_igrac_id, clan_id, sezona, klub_hns_id, klub_naziv, natjecanje,
                 nastupi, startna, zamjena, golovi, asistencije, zuti, crveni, minute)
            VALUES %s
            ON CONFLICT (hns_igrac_id, sezona, klub_hns_id, natjecanje)
            DO UPDATE SET
                nastupi = EXCLUDED.nastupi, startna = EXCLUDED.startna,
                zamjena = EXCLUDED.zamjena, golovi = EXCLUDED.golovi,
                asistencije = EXCLUDED.asistencije, zuti = EXCLUDED.zuti,
                crveni = EXCLUDED.crveni, minute = EXCLUDED.minute,
                scraped_at = now()
        """, rows)
    return len(rows)
def upsert_klub_roster(conn, klub_id, klub_hns_id, players):
    """Upsert a club's scraped roster into ``pgz_sport.hns_klub_roster``.

    Args:
        conn: Open database connection.
        klub_id: ``pgz_sport`` club id.
        klub_hns_id: HNS club id.
        players: List of dicts with ``hns_id`` and optional ``naziv``,
            ``pozicija``, ``url``.

    Returns:
        Number of rows sent to the database (0 for an empty roster).

    On conflict of ``(klub_hns_id, hns_igrac_id)`` only ``klub_id`` and
    ``scraped_at`` are refreshed; names and position are left as stored.
    """
    if not players:
        return 0

    rows = []
    for p in players:
        # split() on a missing, empty or whitespace-only name yields [], so
        # both name parts fall back to '' instead of raising IndexError.
        name_parts = (p.get('naziv') or '').split()
        ime = name_parts[0] if name_parts else ''
        prezime = ' '.join(name_parts[1:])
        rows.append((klub_id, klub_hns_id, p['hns_id'], ime, prezime,
                     p.get('pozicija', ''), p.get('url', '')))

    with conn.cursor() as cur:
        execute_values(cur, """
            INSERT INTO pgz_sport.hns_klub_roster
                (klub_id, klub_hns_id, hns_igrac_id, ime, prezime, pozicija, source_url)
            VALUES %s
            ON CONFLICT (klub_hns_id, hns_igrac_id)
            DO UPDATE SET klub_id = EXCLUDED.klub_id, scraped_at = now()
        """, rows)
    return len(rows)
def _select_klubovi(conn, args):
    """Return the club rows to harvest (empty in single-player mode)."""
    if args.single_player:
        return []
    with conn.cursor(cursor_factory=RealDictCursor) as cur:
        if args.klub_id:
            cur.execute("SELECT * FROM pgz_sport.klubovi WHERE id = %s", (args.klub_id,))
        else:
            cur.execute("""
                SELECT * FROM pgz_sport.v_pgz_financirani_klubovi
                WHERE sport = 'nogomet' AND source_url LIKE %s
                ORDER BY id LIMIT %s
            """, ('%semafor.hns.family/klubovi%', args.limit))
        return cur.fetchall()


def _harvest_klub(page, conn, klub, stats, max_players):
    """Scrape one club's roster and players, upsert them, and update ``stats``."""
    src = klub.get('source_url', '') or ''
    m = re.search(r'/klubovi/(\d+)/([^/]*)', src)
    if not m:
        log(f" ⏭ Klub {klub['id']} {klub['naziv']} — no HNS URL")
        return
    khns, kslug = m.group(1), m.group(2) or 'klub'
    log(f"\n🏟 Klub {klub['id']} {klub['naziv']} → HNS {khns}/{kslug}")
    roster = scrape_klub_roster(page, khns, kslug)
    log(f" Roster: {len(roster)} igrača")
    if roster:
        upsert_klub_roster(conn, klub['id'], khns, roster)
    # Safety cap on the number of player pages fetched per club.
    for p in roster[:max(0, max_players)]:
        try:
            time.sleep(0.5)  # small pause between player page requests
            pdata = scrape_player(page, p['hns_id'], p['slug'])
            if pdata:
                clan_id = upsert_clan(conn, klub['id'], pdata)
                n_seas = upsert_seasons(conn, pdata['hns_id'], clan_id, pdata.get('seasons', []))
                stats['players_scraped'] += 1
                stats['seasons_upserted'] += n_seas
                log(f"{pdata['naziv']} (clan_id={clan_id}, seasons={n_seas})")
        except Exception as e:
            # One failing player must not abort the rest of the club.
            stats['errors'] += 1
            log(f" ❌ Player {p['hns_id']}: {e}")
    stats['klubova'] += 1


def main():
    """Command-line entry point of the harvester.

    Options:
        --limit N          Max number of clubs selected (default 999).
        --klub-id X        Harvest only the club with this pgz_sport id.
        --single-player ID Test mode: scrape one player profile, log the
                           first 500 characters of the result, write nothing.
        --max-players N    Max players scraped per club (default 30).

    The database connection and the browser are released even when an
    unexpected error interrupts the run.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument('--limit', type=int, default=999)
    ap.add_argument('--klub-id', type=int, default=None)
    ap.add_argument('--single-player', help='HNS ID of single player to scrape')
    ap.add_argument('--max-players', type=int, default=30,
                    help='Max players scraped per klub (safety cap)')
    args = ap.parse_args()

    conn = db_conn()
    try:
        klubovi = _select_klubovi(conn, args)
        log(f"🚀 HNS Harvester starting. Target klubova: {len(klubovi)}", telegram=True)
        stats = {'klubova': 0, 'players_scraped': 0, 'seasons_upserted': 0, 'errors': 0}

        with sync_playwright() as pw:
            browser = pw.chromium.launch(headless=True, args=["--no-sandbox","--ignore-certificate-errors"])
            try:
                ctx = browser.new_context(
                    ignore_https_errors=True,
                    user_agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
                )
                page = ctx.new_page()

                if args.single_player:
                    # Test mode: the slug is a placeholder.
                    log(f"🔬 Single player mode: {args.single_player}")
                    data = scrape_player(page, args.single_player, 'unknown')
                    log(f" Data: {json.dumps(data, default=str, ensure_ascii=False)[:500]}")
                    return

                for klub in klubovi:
                    try:
                        _harvest_klub(page, conn, klub, stats, args.max_players)
                    except Exception as e:
                        # One failing club must not abort the whole run.
                        stats['errors'] += 1
                        log(f" ❌ Klub {klub['id']}: {e}\n{traceback.format_exc()[:500]}")
            finally:
                browser.close()

        summary = f"✅ HNS Harvester done. Klubova: {stats['klubova']}, Players: {stats['players_scraped']}, Seasons: {stats['seasons_upserted']}, Errors: {stats['errors']}"
        log(summary, telegram=True)
    finally:
        conn.close()
# Run the harvester only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()