c68fd4471e
Backed by: pgz_sport.hns_player_seasons, hns_klub_roster, v_pgz_financirani_klubovi Used by: cc-hns subagents for UI integration
350 lines
14 KiB
Python
Executable File
350 lines
14 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""
|
|
HNS Master Harvester — Playwright-based scrape semafor.hns.family
|
|
─────────────────────────────────────────────────────────────────
|
|
1. List PGŽ financirani nogometni klubovi
|
|
2. For each klub: scrape klub roster
|
|
3. For each player: scrape full profile (sezone, utakmice)
|
|
4. UPSERT into pgz_sport: hns_klub_roster, hns_player_seasons, clanovi (hns_player_matches is not written yet)
|
|
5. Audit log
|
|
|
|
Usage: python3 hns_master_harvester.py [--limit N] [--klub-id X] [--single-player HNS_ID]
|
|
"""
|
|
import os, sys, time, json, re, argparse, traceback
|
|
from datetime import datetime
|
|
from urllib.parse import urlparse
|
|
import psycopg2
|
|
from psycopg2.extras import RealDictCursor, execute_values
|
|
from playwright.sync_api import sync_playwright
|
|
|
|
# PostgreSQL connection string, overridable through the RINET_DSN variable.
# NOTE(review): the fallback embeds a password in source control. Rotate it
# and make the environment variable mandatory once deployments set it.
DSN = os.getenv(
    "RINET_DSN",
    "host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password=R1net2026!SecureDB#v7",
)

# Telegram bot token and destination chat used for run notifications.
# NOTE(review): the default token is a hard-coded secret as well; same advice.
TG = os.getenv("TG_BOT_TOKEN", "8535797835:AAFItT-92jzZ9NWFafLxn0dLa1_n2s-JE5Y")
TG_CHAT = os.getenv("TG_CHAT", "7969491558")

# One append-mode log file per start minute. The directory is created on
# demand, and the encoding is pinned to UTF-8 because log lines contain emoji
# and Croatian letters that a non-UTF-8 locale could not encode.
_LOG_DIR = "/var/log/pgz-sport-debug"
os.makedirs(_LOG_DIR, exist_ok=True)
LOG = open(
    f"{_LOG_DIR}/hns_harvester_{datetime.now().strftime('%Y%m%d_%H%M')}.log",
    "a",
    encoding="utf-8",
)
|
|
|
|
def log(msg, telegram=False):
    """Record a timestamped message on stdout and in the run log file.

    Args:
        msg: Text to record.
        telegram: When true, the message (truncated to 2000 characters) is
            also posted to the Telegram chat given by TG / TG_CHAT via curl.

    Telegram delivery is best-effort: a failure is noted in the log file and
    never propagated, but KeyboardInterrupt / SystemExit are no longer
    swallowed.
    """
    line = f"[{datetime.now().isoformat(timespec='seconds')}] {msg}"
    print(line, flush=True)
    LOG.write(line + "\n")
    LOG.flush()

    if not telegram:
        return

    try:
        import subprocess

        subprocess.run(
            [
                "curl", "-s", "-X", "POST",
                f"https://api.telegram.org/bot{TG}/sendMessage",
                "-d", f"chat_id={TG_CHAT}",
                # str() so a non-string message is still delivered instead of
                # failing on the slice.
                "--data-urlencode", f"text={str(msg)[:2000]}",
            ],
            timeout=8,
            capture_output=True,
        )
    except Exception as exc:
        # Notification problems must not abort the harvest; keep a trace.
        LOG.write(f"[{datetime.now().isoformat(timespec='seconds')}] Telegram notify failed: {exc}\n")
        LOG.flush()
|
|
|
|
def db_conn():
    """Open a new psycopg2 connection to DSN with autocommit switched on."""
    connection = psycopg2.connect(DSN)
    connection.autocommit = True
    return connection
|
|
|
|
# ── Slug HNS = "Franko Andrijašević" → "franko-andrijasevic" ──
|
|
def slugify_hns(text):
    """Turn a display name into a lowercase, hyphen-separated ASCII slug.

    Croatian letters (č, ć, ž, š, đ) are folded to their ASCII base letter;
    every other character outside a-z, 0-9, whitespace and '-' is dropped.
    A falsy input yields an empty string.
    """
    if not text:
        return ""

    # Lowercasing first means only the lowercase diacritics need mapping.
    folded = text.lower().strip().translate(str.maketrans("čćžšđ", "cczsd"))
    kept = re.sub(r'[^a-z0-9\s-]', '', folded)
    hyphenated = re.sub(r'\s+', '-', kept)
    return hyphenated.strip('-')
|
|
|
|
def scrape_player(page, hns_id, slug):
    """Scrape one player profile: name, current club and per-season rows.

    Args:
        page: Playwright page used for navigation and DOM queries.
        hns_id: Player id placed in the profile URL.
        slug: URL slug that follows the id.

    Returns:
        None when navigation fails. Otherwise a dict with the keys
        'hns_id', 'slug', 'naziv' (first <h1> text), 'url', 'current_klub'
        (dict or None), 'sezone_count', 'seasons', 'matches' (always an empty
        list; match parsing is not implemented) and 'body_text_len'.
    """
    url = f"https://semafor.hns.family/igraci/{hns_id}/{slug}/"
    try:
        page.goto(url, wait_until="networkidle", timeout=30000)
    except Exception as e:
        log(f" ❌ Goto fail {url}: {e}")
        return None

    headings = page.locator('h1')
    h1 = headings.first.inner_text() if headings.count() else ''

    # Current club: the first link on the page that points at /klubovi/.
    current_klub = None
    klub_links = page.locator('a[href*="/klubovi/"]').all()
    if klub_links:
        first_link = klub_links[0]
        href = first_link.get_attribute('href') or ''
        m = re.search(r'/klubovi/(\d+)/([\w-]+)/', href)
        if m:
            current_klub = {
                'hns_id': m.group(1),
                'slug': m.group(2),
                'naziv': first_link.inner_text().strip(),
            }

    seasons_data = []
    matches_data = []

    # The career tables may be rendered dynamically; give them a chance to
    # appear, but carry on with whatever is present if they never do.
    try:
        page.wait_for_selector('table, .karijera, .sezona, [class*="season"]', timeout=8000)
    except Exception:
        pass
    time.sleep(1)

    # Read the body text only after the wait so late content is included.
    body_text = page.locator('body').inner_text()

    # Free-text pass: "<season> <club text> <numbers>" runs such as
    # "2024/25 HNK Orijent 14 2"; the first number is taken as appearances
    # and the second as goals.
    season_blocks = re.findall(r'(20\d{2}/\d{2})\s+([\w\s\u017c-\u017e\u0107\u010d\u0161\u017d\u0110\.\-]+?)\s+([\d\.\s]+)(?=20\d{2}/\d{2}|$)', body_text)
    for sezona, klub_text, stats_text in season_blocks:
        nums = re.findall(r'\d+', stats_text)
        if not nums:
            continue
        seasons_data.append({
            'sezona': sezona,
            'klub': klub_text.strip()[:200],
            'nastupi': int(nums[0]),
            'golovi': int(nums[1]) if len(nums) > 1 else 0,
        })

    # Table pass: every row containing a cell that looks like a season is kept
    # as a header->cell dict plus a normalised 'sezona' key.
    # NOTE(review): a season visible both in a table and in the body text is
    # appended by both passes - confirm whether de-duplication is wanted.
    for table in page.locator('table').all():
        rows = table.locator('tr').all()
        if len(rows) < 2:
            continue
        header = [c.inner_text().strip() for c in rows[0].locator('th, td').all()]
        for r in rows[1:]:
            cells = [c.inner_text().strip() for c in r.locator('th, td').all()]
            if not cells:
                continue
            row_dict = dict(zip(header, cells))
            sezona = next((v for v in row_dict.values() if re.match(r'\d{4}/\d{2}', v)), None)
            if sezona:
                seasons_data.append({**row_dict, 'sezona': sezona})

    return {
        'hns_id': hns_id,
        'slug': slug,
        'naziv': h1,
        'url': url,
        'current_klub': current_klub,
        'sezone_count': len(seasons_data),
        'seasons': seasons_data,
        'matches': matches_data,
        'body_text_len': len(body_text),
    }
|
|
|
|
def scrape_klub_roster(page, klub_hns_id, klub_slug):
    """Collect the players linked from a club page.

    Args:
        page: Playwright page used for navigation and DOM queries.
        klub_hns_id: Club id placed in the club URL.
        klub_slug: URL slug that follows the club id.

    Returns:
        A list of dicts with 'hns_id', 'slug', 'naziv' and 'url', one per
        distinct player id in page order; an empty list when navigation fails.
    """
    url = f"https://semafor.hns.family/klubovi/{klub_hns_id}/{klub_slug}/"
    try:
        page.goto(url, wait_until="networkidle", timeout=30000)
    except Exception as e:
        log(f" ❌ Goto fail {url}: {e}")
        return []

    # Keyed by player id; dict insertion order preserves the page order.
    players = {}
    for a in page.locator('a[href*="/igraci/"]').all():
        href = a.get_attribute('href') or ''
        m = re.search(r'/igraci/(\d+)/([\w-]+)', href)
        if not m:
            continue
        hns_id = m.group(1)

        known = players.get(hns_id)
        if known is not None:
            # A player may be linked more than once (e.g. an image link and a
            # text link). Keep the first entry, but take the name from a later
            # anchor when the first one had no text.
            if not known['naziv']:
                known['naziv'] = a.inner_text().strip()
            continue

        players[hns_id] = {
            'hns_id': hns_id,
            'slug': m.group(2),
            'naziv': a.inner_text().strip(),
            'url': f"https://semafor.hns.family{href}" if href.startswith('/') else href,
        }
    return list(players.values())
|
|
|
|
def upsert_clan(conn, klub_id, player_data):
    """Insert or update the pgz_sport.clanovi row for a scraped player.

    Args:
        conn: Open database connection (psycopg2-style cursor API).
        klub_id: Local club id stored on the member row.
        player_data: Dict with at least 'hns_id' and 'url'; 'naziv' is the
            display name and may be missing.

    Returns:
        The id of the existing row that was updated, or of the new row.

    Name handling: a name containing whitespace is split at the first space
    (given name / rest), the same way upsert_klub_roster splits names. Only a
    name without any whitespace (e.g. "FrankoAndrijašević") falls back to
    splitting on capitalised words.
    """
    naziv = re.sub(r'\s+', ' ', player_data.get('naziv') or '').strip()
    if ' ' in naziv:
        ime, _, prezime = naziv.partition(' ')
    else:
        parts = re.findall(r'[A-ZČĆŠŽĐ][a-zčćšžđ\']+', naziv)
        if len(parts) >= 2:
            ime = parts[0]
            prezime = ' '.join(parts[1:])
        else:
            ime = naziv
            prezime = ''

    hns_id = player_data['hns_id']
    url = player_data['url']

    with conn.cursor() as cur:
        # Look for a member already linked to this HNS player id.
        cur.execute("""
            SELECT id FROM pgz_sport.clanovi
            WHERE hns_igrac_id = %s
            ORDER BY id LIMIT 1
        """, (hns_id,))
        row = cur.fetchone()
        if row:
            clan_id = row[0]
            # Existing non-empty ime/prezime/klub_id/sport values are kept;
            # only blanks are filled from the scrape.
            cur.execute("""
                UPDATE pgz_sport.clanovi
                SET ime = COALESCE(NULLIF(ime,''), %s),
                    prezime = COALESCE(NULLIF(prezime,''), %s),
                    klub_id = COALESCE(klub_id, %s),
                    hns_igrac_id = %s,
                    source = 'hns_semafor',
                    source_url = %s,
                    last_updated = now(),
                    last_scraped_at = now(),
                    sport = COALESCE(sport, 'nogomet')
                WHERE id = %s
            """, (ime, prezime, klub_id, hns_id, url, clan_id))
        else:
            cur.execute("""
                INSERT INTO pgz_sport.clanovi
                    (klub_id, ime, prezime, sport, source, source_url, hns_igrac_id, last_scraped_at, aktivan)
                VALUES (%s, %s, %s, 'nogomet', 'hns_semafor', %s, %s, now(), true)
                RETURNING id
            """, (klub_id, ime, prezime, url, hns_id))
            clan_id = cur.fetchone()[0]
    return clan_id
|
|
|
|
def upsert_seasons(conn, hns_id, clan_id, seasons):
    """Write a player's per-season rows to pgz_sport.hns_player_seasons.

    Args:
        conn: Open database connection (psycopg2-style cursor API).
        hns_id: HNS player id stored on every row.
        clan_id: Local member id stored on every row.
        seasons: Iterable of dicts as produced by scrape_player. Each needs a
            non-empty 'sezona'; other columns are located by substring match
            on the (lowercased) keys, e.g. any key containing 'gol' feeds
            golovi.

    Returns:
        Number of rows sent to the database (0 when nothing is usable).
    """
    if not seasons:
        return 0

    def stat(season, needle):
        # First key containing `needle` wins. Values may be ints (free-text
        # parse) or strings (table parse), so stringify before stripping
        # non-digits; previously an int value raised and was stored as 0.
        for key, value in season.items():
            if needle in key.lower():
                digits = re.sub(r'\D', '', str(value))
                return int(digits) if digits else 0
        return 0

    rows = []
    for s in seasons:
        sezona = s.get('sezona', '')
        if not sezona:
            continue
        klub = next((v for k, v in s.items() if 'lub' in k.lower()), '')
        natjecanje = next(
            (v for k, v in s.items() if 'atjec' in k.lower() or 'liga' in k.lower()),
            '',
        )
        rows.append((
            hns_id, clan_id, sezona, None, klub, natjecanje,
            stat(s, 'nastup'), stat(s, 'start'), stat(s, 'zamj'),
            stat(s, 'gol'), stat(s, 'asist'), stat(s, 'žut'), stat(s, 'crv'), stat(s, 'minut'),
        ))

    if not rows:
        # Nothing usable: skip the database round-trip entirely.
        return 0

    # NOTE(review): klub_hns_id is always sent as NULL. PostgreSQL treats NULLs
    # as distinct in a unique index by default, so this ON CONFLICT target
    # presumably never matches and re-runs add duplicate rows - confirm against
    # the table definition.
    with conn.cursor() as cur:
        execute_values(cur, """
            INSERT INTO pgz_sport.hns_player_seasons
                (hns_igrac_id, clan_id, sezona, klub_hns_id, klub_naziv, natjecanje,
                 nastupi, startna, zamjena, golovi, asistencije, zuti, crveni, minute)
            VALUES %s
            ON CONFLICT (hns_igrac_id, sezona, klub_hns_id, natjecanje)
            DO UPDATE SET
                nastupi = EXCLUDED.nastupi, startna = EXCLUDED.startna,
                zamjena = EXCLUDED.zamjena, golovi = EXCLUDED.golovi,
                asistencije = EXCLUDED.asistencije, zuti = EXCLUDED.zuti,
                crveni = EXCLUDED.crveni, minute = EXCLUDED.minute,
                scraped_at = now()
        """, rows)
    return len(rows)
|
|
|
|
def upsert_klub_roster(conn, klub_id, klub_hns_id, players):
    """Upsert the scraped roster of one club into pgz_sport.hns_klub_roster.

    Each player's 'naziv' is split on whitespace: the first token becomes the
    given name and the remaining tokens the surname. On a conflict for
    (klub_hns_id, hns_igrac_id) only klub_id and scraped_at are refreshed.

    Returns the number of rows sent, or 0 when `players` is empty.
    """
    if not players:
        return 0

    def split_name(player):
        full_name = player.get('naziv')
        if not full_name:
            return '', ''
        tokens = full_name.split()
        return tokens[0], ' '.join(tokens[1:])

    rows = []
    for player in players:
        ime, prezime = split_name(player)
        rows.append((
            klub_id,
            klub_hns_id,
            player['hns_id'],
            ime,
            prezime,
            player.get('pozicija', ''),
            player.get('url', ''),
        ))

    with conn.cursor() as cur:
        execute_values(cur, """
            INSERT INTO pgz_sport.hns_klub_roster
                (klub_id, klub_hns_id, hns_igrac_id, ime, prezime, pozicija, source_url)
            VALUES %s
            ON CONFLICT (klub_hns_id, hns_igrac_id)
            DO UPDATE SET klub_id = EXCLUDED.klub_id, scraped_at = now()
        """, rows)
    return len(rows)
|
|
|
|
def main():
    """Command-line entry point: harvest rosters and player seasons.

    Options:
        --limit N            Maximum number of clubs read from the funded-clubs view.
        --klub-id X          Process only the club with this local id.
        --single-player ID   Test mode: scrape one player profile, log the
                             result and exit without writing to the database.

    Errors for a single club or player are counted and logged without aborting
    the run. The database connection and the browser are closed on every exit
    path.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument('--limit', type=int, default=999)
    ap.add_argument('--klub-id', type=int, default=None)
    ap.add_argument('--single-player', help='HNS ID of single player to scrape')
    args = ap.parse_args()

    conn = db_conn()
    try:
        # Target clubs: a single club by id, or the PGŽ-funded football clubs
        # whose source_url points at a semafor.hns.family club page.
        if args.single_player:
            klubovi = []
        else:
            with conn.cursor(cursor_factory=RealDictCursor) as cur:
                if args.klub_id is not None:
                    cur.execute("SELECT * FROM pgz_sport.klubovi WHERE id = %s", (args.klub_id,))
                else:
                    cur.execute("""
                        SELECT * FROM pgz_sport.v_pgz_financirani_klubovi
                        WHERE sport = 'nogomet' AND source_url LIKE %s
                        ORDER BY id LIMIT %s
                    """, ('%semafor.hns.family/klubovi%', args.limit))
                klubovi = cur.fetchall()

        log(f"🚀 HNS Harvester starting. Target klubova: {len(klubovi)}", telegram=True)

        stats = {'klubova': 0, 'players_scraped': 0, 'seasons_upserted': 0, 'errors': 0}

        with sync_playwright() as pw:
            browser = pw.chromium.launch(headless=True, args=["--no-sandbox","--ignore-certificate-errors"])
            try:
                ctx = browser.new_context(
                    ignore_https_errors=True,
                    user_agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
                )
                page = ctx.new_page()

                if args.single_player:
                    # Test mode: the slug is not known here, so a placeholder is used.
                    log(f"🔬 Single player mode: {args.single_player}")
                    data = scrape_player(page, args.single_player, 'unknown')
                    log(f" Data: {json.dumps(data, default=str, ensure_ascii=False)[:500]}")
                    return

                for klub in klubovi:
                    try:
                        src = klub.get('source_url', '') or ''
                        m = re.search(r'/klubovi/(\d+)/([^/]*)', src)
                        if not m:
                            log(f" ⏭ Klub {klub['id']} {klub['naziv']} — no HNS URL")
                            continue
                        khns, kslug = m.group(1), m.group(2) or 'klub'
                        log(f"\n🏟 Klub {klub['id']} {klub['naziv']} → HNS {khns}/{kslug}")

                        roster = scrape_klub_roster(page, khns, kslug)
                        log(f" Roster: {len(roster)} igrača")

                        if roster:
                            upsert_klub_roster(conn, klub['id'], khns, roster)

                        # Safety cap: at most 30 player profiles per club for now.
                        for p in roster[:30]:
                            try:
                                time.sleep(0.5)  # pause between profile requests
                                pdata = scrape_player(page, p['hns_id'], p['slug'])
                                if pdata:
                                    clan_id = upsert_clan(conn, klub['id'], pdata)
                                    n_seas = upsert_seasons(conn, pdata['hns_id'], clan_id, pdata.get('seasons', []))
                                    stats['players_scraped'] += 1
                                    stats['seasons_upserted'] += n_seas
                                    log(f" ✓ {pdata['naziv']} (clan_id={clan_id}, seasons={n_seas})")
                            except Exception as e:
                                stats['errors'] += 1
                                log(f" ❌ Player {p['hns_id']}: {e}")

                        stats['klubova'] += 1
                    except Exception as e:
                        stats['errors'] += 1
                        log(f" ❌ Klub {klub['id']}: {e}\n{traceback.format_exc()[:500]}")
            finally:
                browser.close()

        summary = f"✅ HNS Harvester done. Klubova: {stats['klubova']}, Players: {stats['players_scraped']}, Seasons: {stats['seasons_upserted']}, Errors: {stats['errors']}"
        log(summary, telegram=True)
    finally:
        conn.close()


if __name__ == '__main__':
    main()
|