#!/usr/bin/env python3
"""HNS Semafor scraper for PGŽ football clubs.

Strategy:
  1. Seed-map known PGŽ clubs to HNS COMET klub_id (manual list to start)
  2. For each klub: fetch /klubovi/{id}/{slug}/ and collect its match links
  3. For each match: parse both line-ups, upsert our players into clanovi
     and one utakmice_log row per player
  4. Respect the rate limit (one request per RATE_S seconds), record daily
     runs in scraper_runs

Run modes:
  python hns_semafor.py seed                              # set hns_klub_id for known clubs
  python hns_semafor.py klub <klub_id_db> [max_matches]   # scrape one klub's matches + players
  python hns_semafor.py klub_all                          # same, for every klub with hns_klub_id
  python hns_semafor.py player <hns_pid>                  # scrape one player profile
  python hns_semafor.py daily                             # re-seed + refresh known players
"""
# NOTE: the docstring above must remain the first statement of the module;
# it is printed as the usage text via ``__doc__``.

import json
import logging
import os
import re
import sys
import time
from contextlib import contextmanager
from datetime import datetime, date
from urllib.parse import urljoin

import psycopg2
import psycopg2.extras
import requests
from bs4 import BeautifulSoup
from dotenv import load_dotenv

# Must run before DB is built: it provides DB_PASSWORD.
load_dotenv('/opt/rinet-gpu/.env.master')  # auto-added by patch_scrapers_with_dotenv.sh

DB = dict(host='localhost', port=5432, dbname='rinet_v3', user='rinet',
          password=os.environ["DB_PASSWORD"])
BASE = "https://semafor.hns.family"
UA = "PGZ-Sport-Bot/1.0 (+https://api.rinet.one/sport/; legitimni interes; analitika sporta PGZ)"
RATE_S = 1.6  # seconds between requests
TIMEOUT = 25

log = logging.getLogger("hns")
logging.basicConfig(
    format="%(asctime)s [%(levelname)s] %(message)s",
    level=logging.INFO,
    handlers=[
        logging.FileHandler("/opt/pgz-sport/_logs/hns_scraper.log"),
        logging.StreamHandler(sys.stdout),
    ],
)

# ═══ Manual seed mapping — PGŽ clubs → HNS COMET id ═══
# Discovered from semafor.hns.family/igraci/1167145/marko-komadina/ matches
SEED_MAP = {
    # naziv → hns_klub_id
    "NK Klana": 1569,
    "NK Krk": 1558,
    "NK Mune": 1576,
    "NK Vihor": 4326,
    "NK Doker": 107415,
    "HNK Kozala": 3090,
    "HNK Lovran": 1574,
    "HNK Goranin": 1565,
    "NK Risnjak": 1583,
    "NK Lokomotiva": 1570,
    "NK Omladinac Vrata": 1579,
    "NK Draga": 1554,
    "NK Zamet": 1589,
    "NK Vrbovsko": 1588,
    "NK Rikard Benčić": 1582,
    "NK OŠK Omišalj": 3071,
}

# Croatian diacritics folded to ASCII when building URL slugs.
_SLUG_TRANSLATION = str.maketrans({'č': 'c', 'ć': 'c', 'š': 's', 'ž': 'z', 'đ': 'd'})

_PLAYER_HREF_RE = re.compile(r'/igraci/(\d+)/')
_KLUB_HREF_RE = re.compile(r'/klubovi/(\d+)/')
_MATCH_HREF_RE = re.compile(r'/utakmice/(\d+)/')
_MINUTE_RE = re.compile(r"(\d+(?:\+\d+)?)\s*'")

# CSS class on an event <li> → internal event kind.
_EVENT_KIND_MAP = {
    'goal': 'gol',
    'ownGoal': 'autogol',
    'penaltyGoal': 'gol',
    'yellow': 'zuti',
    'secondYellow': 'zuti2',  # second yellow → effectively red
    'red': 'crveni',
    'substitutionIn': 'subIn',
    'substitutionOut': 'subOut',
}

# Tokens too common in club names to count as evidence of a match.
_GENERIC_NAME_TOKENS = {'nk', 'hnk', 'klub', 'nogometni'}


def conn():
    """Open a new psycopg2 connection using the module-level DB settings."""
    return psycopg2.connect(**DB)


@contextmanager
def _db():
    """Yield a DB connection; commit on success, roll back on error, always close.

    psycopg2's own ``with connection:`` only ends the transaction and leaves
    the connection open, so using it directly leaks one connection per use.
    """
    c = conn()
    try:
        with c:  # commit / rollback
            yield c
    finally:
        c.close()


def fetch(url: str) -> str:
    """GET *url* and return the response body as text.

    Raises ``requests`` exceptions on network errors or non-2xx status.
    The rate-limit pause is taken after every request, including failed
    ones, so error bursts do not hammer the remote site.
    """
    log.info(f"GET {url}")
    try:
        r = requests.get(url, headers={"User-Agent": UA}, timeout=TIMEOUT)
        r.raise_for_status()
        return r.text
    finally:
        time.sleep(RATE_S)


def slugify(s: str) -> str:
    """Return a lowercase ASCII slug: diacritics folded, other runs → '-'."""
    s = s.lower().strip().translate(_SLUG_TRANSLATION)
    return re.sub(r'[^a-z0-9]+', '-', s).strip('-')


def cmd_seed():
    """Apply SEED_MAP to pgz_sport.klubovi.

    For each entry: set hns_klub_id/hns_slug on football clubs whose naziv
    matches (ILIKE) and whose hns_klub_id is unset or already equal; if no
    row was updated and no row carries that hns_klub_id, insert a minimal row.

    Returns a dict with the ``updated`` and ``inserted`` counts.
    """
    n_updated = 0
    n_inserted = 0
    with _db() as c:
        cu = c.cursor()
        for naziv, hns_id in SEED_MAP.items():
            cu.execute(
                """UPDATE pgz_sport.klubovi
                   SET hns_klub_id=%s, hns_slug=%s, source_synced_at=now()
                   WHERE naziv ILIKE %s AND sport='nogomet'
                     AND (hns_klub_id IS NULL OR hns_klub_id=%s)""",
                (hns_id, slugify(naziv), f"%{naziv}%", hns_id))
            if cu.rowcount > 0:
                n_updated += cu.rowcount
                continue
            # Try by hns_klub_id directly (already set elsewhere)
            cu.execute("SELECT id FROM pgz_sport.klubovi WHERE hns_klub_id=%s", (hns_id,))
            if cu.fetchone():
                continue
            # Insert new minimal row
            cu.execute(
                """INSERT INTO pgz_sport.klubovi
                   (naziv, sport, razina, hns_klub_id, hns_slug, aktivan, region,
                    source_synced_at, napomena)
                   VALUES (%s,'nogomet','3.HRL',%s,%s,true,'PGŽ',now(),
                           'Auto-seeded from HNS Semafor (legitimni interes — analitika)')""",
                (naziv, hns_id, slugify(naziv)))
            n_inserted += 1
    log.info(f"Seed: updated={n_updated}, inserted={n_inserted}")
    return {"updated": n_updated, "inserted": n_inserted}


def parse_player_profile(hns_pid: int, html: str) -> dict:
    """Parse a /igraci/{id}/{slug}/ page into a dict.

    Always contains ``hns_pid`` and ``matches`` (empty list). Optional keys,
    present only when found in the page: ``ime_prezime``, ``slika_url``,
    ``trenutni_klub_hns_id``, ``trenutni_klub``, ``datum_rodenja_raw``,
    ``datum_rodenja`` (ISO date) and ``mjesto_rodenja``.
    """
    soup = BeautifulSoup(html, 'html.parser')
    data = {"hns_pid": hns_pid, "matches": []}

    # Name: first <h1> on the page
    h1 = soup.find('h1')
    if h1:
        data['ime_prezime'] = h1.get_text(' ', strip=True)

    # Photo: the <img> whose alt text is the player's name. Skipped when no
    # name was found, otherwise alt='' would match an unrelated image.
    name = data.get('ime_prezime')
    if name:
        img = soup.find('img', alt=name)
        if img and img.get('src'):
            data['slika_url'] = img['src']

    # Current club: first link to /klubovi/{id}/; its <h4> holds the club name
    klub_link = soup.find('a', href=_KLUB_HREF_RE)
    if klub_link:
        m = _KLUB_HREF_RE.search(klub_link['href'])
        if m:
            data['trenutni_klub_hns_id'] = int(m.group(1))
        h = klub_link.find('h4')
        if h:
            data['trenutni_klub'] = h.get_text(' ', strip=True)

    # Date of birth: <li class="dob"> → <h4>, formatted d.m.yyyy
    li_dob = soup.find('li', class_='dob')
    h4 = li_dob.find('h4') if li_dob else None
    if h4:
        t = h4.get_text(' ', strip=True)
        data['datum_rodenja_raw'] = t
        m = re.match(r'(\d{1,2})\.(\d{1,2})\.(\d{4})', t)
        if m:
            day, month, year = (int(g) for g in m.groups())
            try:
                data['datum_rodenja'] = date(year, month, day).isoformat()
            except ValueError:
                # Impossible calendar date on the page; keep only the raw text.
                pass

    # Place of birth: <li class="pob"> → <h4>
    li_pob = soup.find('li', class_='pob')
    h4_m = li_pob.find('h4') if li_pob else None
    if h4_m:
        data['mjesto_rodenja'] = h4_m.get_text(strip=True)

    # Legacy fallback, used only when the lookup above found nothing:
    # the <h4> preceding an <h3> labelled "Mjesto rođenja".
    if 'mjesto_rodenja' not in data:
        for h3 in soup.find_all('h3'):
            if 'Mjesto rođenja' in h3.get_text():
                prev = h3.find_previous('h4')
                if prev:
                    data['mjesto_rodenja'] = prev.get_text(strip=True)
    return data


def upsert_player(klub_id_db: int, prof: dict) -> int:
    """Upsert a clanovi row from a parsed profile and return its id.

    The row is looked up by (source='hns_semafor', source_id=hns_pid).
    On update, fields missing from *prof* keep their stored value (COALESCE),
    matching how cmd_klub updates players.
    """
    full_name = prof.get('ime_prezime', '') or ''
    # NOTE(review): the name is split on the FIRST space here, while cmd_klub
    # splits on the LAST space; multi-word names end up divided differently
    # depending on which command touched the row last. Confirm intended rule.
    ime, _, prezime = full_name.partition(' ')
    slug = slugify(full_name)
    url = f"{BASE}/igraci/{prof['hns_pid']}/{slug}/"

    with _db() as c:
        cu = c.cursor()
        cu.execute(
            """SELECT id FROM pgz_sport.clanovi
               WHERE source='hns_semafor' AND source_id=%s""",
            (str(prof['hns_pid']),))
        row = cu.fetchone()
        if row:
            cid = row[0]
            cu.execute(
                """UPDATE pgz_sport.clanovi
                   SET ime=%s, prezime=%s,
                       datum_rodenja=COALESCE(%s, datum_rodenja),
                       mjesto_rodenja=COALESCE(%s, mjesto_rodenja),
                       slika_url=COALESCE(%s, slika_url),
                       klub_id=%s, source_url=%s, source_synced_at=now()
                   WHERE id=%s""",
                (ime, prezime, prof.get('datum_rodenja'), prof.get('mjesto_rodenja'),
                 prof.get('slika_url'), klub_id_db, url, cid))
        else:
            cu.execute(
                """INSERT INTO pgz_sport.clanovi
                   (klub_id, ime, prezime, datum_rodenja, mjesto_rodenja, slika_url,
                    source, source_id, source_url, source_synced_at, slug)
                   VALUES (%s,%s,%s,%s,%s,%s,'hns_semafor',%s,%s,now(),%s)
                   RETURNING id""",
                (klub_id_db, ime, prezime, prof.get('datum_rodenja'),
                 prof.get('mjesto_rodenja'), prof.get('slika_url'),
                 str(prof['hns_pid']), url, slug))
            cid = cu.fetchone()[0]
    return cid


def cmd_player(hns_pid: int, klub_id_db: int = None):
    """Scrape a single player by HNS id and upsert it.

    When *klub_id_db* is not given it is taken from the player's existing
    clanovi row, or else resolved from the club linked on the profile page.
    Returns the clanovi id, or None when no DB club could be determined.
    """
    if klub_id_db is None:
        # try to infer from current klub via DB if previously stored
        with _db() as c:
            cu = c.cursor()
            cu.execute(
                """SELECT klub_id FROM pgz_sport.clanovi
                   WHERE source='hns_semafor' AND source_id=%s""",
                (str(hns_pid),))
            r = cu.fetchone()
            if r:
                klub_id_db = r[0]

    url = f"{BASE}/igraci/{hns_pid}/dummy/"  # slug is forgiving; HNS redirects
    html = fetch(url)
    prof = parse_player_profile(hns_pid, html)
    log.info(f"Parsed: {prof.get('ime_prezime','?')} (HNS#{hns_pid}) klub={prof.get('trenutni_klub','?')}")

    # Resolve current_klub_hns_id → klub_id_db if not provided
    if klub_id_db is None and prof.get('trenutni_klub_hns_id'):
        with _db() as c:
            cu = c.cursor()
            cu.execute("SELECT id FROM pgz_sport.klubovi WHERE hns_klub_id=%s",
                       (prof['trenutni_klub_hns_id'],))
            r = cu.fetchone()
            if r:
                klub_id_db = r[0]

    if klub_id_db is None:
        log.warning(f"No DB klub_id for HNS player {hns_pid} — skipping upsert")
        return None
    return upsert_player(klub_id_db, prof)


def cmd_daily():
    """Re-apply the seed mapping and refresh up to 500 known players.

    Players are taken least-recently-synced first. The run is recorded in
    pgz_sport.scraper_runs with status "ok" or, if anything failed,
    "partial". A fatal error is recorded in the run's error list and then
    re-raised.
    """
    with _db() as c:
        cu = c.cursor()
        cu.execute("""INSERT INTO pgz_sport.scraper_runs (source, scope)
                      VALUES ('hns_semafor','daily') RETURNING id""")
        run_id = cu.fetchone()[0]

    inserted = 0
    updated = 0
    errors = []
    try:
        # Phase 1: ensure seed mapping is current
        cmd_seed()
        log.info("=== Daily HNS harvest start ===")
        # TODO: roster discovery requires per-klub roster page. For now, only re-fetch known players.
        with _db() as c:
            cu = c.cursor()
            cu.execute("""SELECT source_id FROM pgz_sport.clanovi
                          WHERE source='hns_semafor'
                          ORDER BY source_synced_at NULLS FIRST LIMIT 500""")
            pids = [r[0] for r in cu.fetchall()]
        for pid in pids:
            try:
                # cmd_player returns None when it skipped the upsert.
                if cmd_player(int(pid)) is not None:
                    updated += 1
            except Exception as e:  # per-player boundary: log and carry on
                log.error(f"player {pid}: {e}")
                errors.append({"pid": pid, "err": str(e)})
        log.info(f"=== Daily done: updated={updated} errors={len(errors)} ===")
    except Exception as e:
        # Without this a run that died early would be recorded as "ok".
        errors.append({"pid": None, "err": f"fatal: {e}"})
        raise
    finally:
        with _db() as c:
            cu = c.cursor()
            cu.execute(
                """UPDATE pgz_sport.scraper_runs
                   SET finished_at=now(), status=%s, rows_updated=%s,
                       errors=%s::jsonb, rows_inserted=%s
                   WHERE id=%s""",
                ("ok" if not errors else "partial", updated, json.dumps(errors),
                 inserted, run_id))


def _base_minute(minute):
    """Return the regulation minute of an event string ('45+2' → 45), or None."""
    m = re.match(r'\s*(\d+)', minute or '')
    return int(m.group(1)) if m else None


def _parse_events(li):
    """Collect recognised events from a line-up row's div.matchEvents > ul.events."""
    events = []
    me_div = li.find('div', class_='matchEvents')
    ev_ul = me_div.find('ul', class_='events') if me_div else None
    if not ev_ul:
        return events
    for ev_li in ev_ul.find_all('li', recursive=False):
        # First CSS class that names a known event kind wins.
        kind = next((_EVENT_KIND_MAP[k] for k in (ev_li.get('class') or [])
                     if k in _EVENT_KIND_MAP), None)
        if not kind:
            continue
        mm = _MINUTE_RE.search(ev_li.get_text(' ', strip=True))
        events.append({'kind': kind, 'minute': mm.group(1) if mm else None})
    return events


def _estimate_minutes(is_starter, events):
    """Estimate minutes played, assuming a 90-minute match.

    Stoppage-time minutes count as their regulation minute ('45+2' → 45);
    stripping non-digits would turn '45+2' into 452.
    """
    sub_in_min = next((e['minute'] for e in events if e['kind'] == 'subIn'), None)
    sub_out_min = next((e['minute'] for e in events if e['kind'] == 'subOut'), None)
    if is_starter:
        if not sub_out_min:
            return 90  # full game
        return _base_minute(sub_out_min)
    if not sub_in_min:
        return 0  # never came on
    came_on = _base_minute(sub_in_min)
    return None if came_on is None else max(0, 90 - came_on)


def _parse_lineup_row(li, is_starter):
    """Parse one player row (li.row.match_lineup); None if it has no player id."""
    pid = li.get('data-personid')
    if not pid:
        a = li.find('a', href=_PLAYER_HREF_RE)
        if not a:
            return None
        pid = _PLAYER_HREF_RE.search(a['href']).group(1)
    try:
        pid = int(pid)
    except (TypeError, ValueError):
        return None

    # Shirt number
    broj_dresa = None
    sn = li.find('div', class_='shirtNumber')
    if sn:
        bs = sn.get_text(' ', strip=True).strip()
        if bs.isdigit():
            broj_dresa = int(bs)

    # Image
    img = li.find('img')
    slika = img.get('src') if img else None

    # Name + position: the name is the link text inside div.playerName; the
    # rest of that div's text carries the captain mark and position label.
    ime_prezime = ''
    pozicija = None
    captain = False
    pn = li.find('div', class_='playerName')
    if pn:
        a2 = pn.find('a')
        if a2:
            ime_prezime = a2.get_text(' ', strip=True)
        rest = pn.get_text(' ', strip=True).replace(ime_prezime, '').strip()
        if '(C)' in rest:
            captain = True
            rest = rest.replace('(C)', '').strip()
        if 'Vratar' in rest:
            pozicija = 'Vratar'
        elif 'Igrač' in rest:
            pozicija = 'Igrač'

    events = _parse_events(li)
    return {
        'hns_pid': pid,
        'ime_prezime': ime_prezime,
        'broj_dresa': broj_dresa,
        'pozicija': pozicija,
        'slika_url': slika,
        'captain': captain,
        'starter': is_starter,
        'events': events,
        'pogodaka': sum(1 for e in events if e['kind'] == 'gol'),
        'zuti_kartoni': sum(1 for e in events if e['kind'] == 'zuti'),
        # 2nd yellow = red
        'crveni_kartoni': sum(1 for e in events if e['kind'] in ('crveni', 'zuti2')),
        'minute': _estimate_minutes(is_starter, events),
    }


def _parse_team_div(team_div):
    """Parse a homeTeam/awayTeam div into (team_name, players)."""
    if not team_div:
        return None, []
    # Prefer the direct-child <ul>; fall back to any nested one.
    ul = team_div.find('ul', recursive=False) or team_div.find('ul')
    if not ul:
        return None, []

    team_name = None
    players = []
    is_starter = True
    for li in ul.find_all('li', recursive=False):
        cls = li.get('class') or []
        if 'header' in cls and 'clubName' in cls:
            team_name = li.get_text(' ', strip=True)
            continue
        if 'header' in cls and 'separatorTitle' in cls:
            # Rows after the "Pričuvni" separator are substitutes.
            if 'Pričuvni' in li.get_text(' ', strip=True):
                is_starter = False
            continue
        if not ('row' in cls and 'match_lineup' in cls):
            continue
        player = _parse_lineup_row(li, is_starter)
        if player is not None:
            players.append(player)
    return team_name, players


def parse_match(html, match_url=None):
    """HNS match parser v4 — uses precise class signals.

    Each team lives in div.homeTeam / div.awayTeam as a <ul> whose <li>
    children are: li.header.clubName (team name), li.header.separatorTitle
    (a "Pričuvni" separator starts the substitutes) and li.row.match_lineup
    (one player). A player row provides:
      - data-personid, or else an <a href="/igraci/{id}/...">
      - div.shirtNumber (e.g. 9)
      - div.playerName with the name link, "(C)" and position text
      - div.matchEvents > ul.events > li, whose CSS class names the event
        and whose text carries the minute (e.g. 40', 87')

    Returns a dict with keys ``teams`` (team name → list of player dicts),
    ``match_url``, ``title`` and ``meta`` (klub_dom, klub_gost, logos, and
    when found: datum, vrijeme, gledatelja, rezultat, natjecanje).
    """
    soup = BeautifulSoup(html, 'html.parser')
    out = {"teams": {}, "match_url": match_url, "meta": {}, "title": ""}
    h1 = soup.find('h1')
    out['title'] = h1.get_text(' ', strip=True) if h1 else ''
    meta = out['meta']

    home_div = soup.find('div', class_='homeTeam')
    away_div = soup.find('div', class_='awayTeam')
    home_name, home_players = _parse_team_div(home_div)
    away_name, away_players = _parse_team_div(away_div)

    if home_name:
        out['teams'][home_name] = home_players
    if away_name:
        out['teams'][away_name] = away_players

    # Logo URLs
    if home_div:
        h_img = home_div.find('img')
        meta['klub_dom_logo'] = h_img.get('src') if h_img else None
    if away_div:
        a_img = away_div.find('img')
        meta['klub_gost_logo'] = a_img.get('src') if a_img else None

    meta['klub_dom'] = home_name
    meta['klub_gost'] = away_name

    # Date/time, viewership, score, competition
    body_text = soup.get_text(' ', strip=True)
    dm = re.search(r'(\d{1,2}\.\d{1,2}\.\d{4})\.?\s*(\d{1,2}:\d{2})', body_text)
    if dm:
        day, month, year = dm.group(1).split('.')
        meta['datum'] = f"{year}-{month.zfill(2)}-{day.zfill(2)}"
        meta['vrijeme'] = dm.group(2)
    gm = re.search(r'Gledatelja:\s*(\d+)', body_text)
    if gm:
        meta['gledatelja'] = int(gm.group(1))
    rm = re.search(r'(\d+):(\d+)', out['title'])
    if rm:
        meta['rezultat'] = f"{rm.group(1)}:{rm.group(2)}"
    title_parts = out['title'].split(',')
    if len(title_parts) > 1:
        meta['natjecanje'] = title_parts[-1].strip()

    return out


def _find_our_team(klub_naziv, teams):
    """Pick the team in *teams* that is our club; return (team_name, roster).

    Tries a case-insensitive substring match in either direction, then a
    token overlap that ignores generic tokens and additionally requires one
    name's last word to occur in the other. Returns (None, []) if no match.
    """
    klub_low = klub_naziv.lower()
    tokens_klub = set(re.findall(r'\w+', klub_low))
    for tn, players in teams.items():
        tn_low = tn.lower()
        # try exact substring both directions
        if klub_low in tn_low or tn_low in klub_low:
            return tn, players
        # token-set overlap (e.g. "NK Krk" vs "NK Krk Krk" or "NK Vihor" vs "NK Vihor (B)")
        common_strong = (tokens_klub & set(re.findall(r'\w+', tn_low))) - _GENERIC_NAME_TOKENS
        if common_strong and (klub_low.split()[-1] in tn_low
                              or tn_low.split()[-1] in klub_low):
            log.info(f"    fuzzy match: {klub_naziv} ↔ {tn}")
            return tn, players
    return None, []


def cmd_klub(klub_id_db: int, max_matches: int = 999):
    """Scrape one club: club page → match pages → players + match-log rows.

    For every match linked from the club page (up to *max_matches*), the
    line-up of the team identified as ours is upserted into clanovi, and one
    utakmice_log row per player is inserted or updated.

    Returns the number of distinct players seen; 0 when the club has no
    hns_klub_id or its page could not be fetched.
    """
    with _db() as c:
        cu = c.cursor(cursor_factory=psycopg2.extras.RealDictCursor)
        cu.execute("SELECT id, naziv, hns_klub_id, hns_slug FROM pgz_sport.klubovi WHERE id=%s",
                   (klub_id_db,))
        klub = cu.fetchone()
    if not klub or not klub['hns_klub_id']:
        log.error(f"Klub #{klub_id_db}: nema hns_klub_id")
        return 0

    klub_url = f"{BASE}/klubovi/{klub['hns_klub_id']}/{klub['hns_slug'] or 'k'}/"
    log.info(f"Klub: {klub['naziv']} → {klub_url}")
    try:
        html = fetch(klub_url)
    except Exception as e:
        log.error(f"klub fetch failed: {e}")
        return 0

    soup = BeautifulSoup(html, 'html.parser')
    # dict.fromkeys de-duplicates while keeping page order.
    match_ids = list(dict.fromkeys(
        _MATCH_HREF_RE.search(a['href']).group(1)
        for a in soup.find_all('a', href=_MATCH_HREF_RE)))
    log.info(f"  found {len(match_ids)} matches; processing up to {max_matches}")

    seen_pids = set()
    matches_logged = 0
    for mid in match_ids[:max_matches]:
        match_url = f"{BASE}/utakmice/{mid}/"
        try:
            md = parse_match(fetch(match_url), match_url=match_url)
        except Exception as e:
            log.error(f"  match {mid}: {e}")
            continue
        if not md.get('teams'):
            log.warning(f"  match {mid}: no teams parsed")
            continue

        # Find which team (home or away) is OURS
        _matched_team, roster = _find_our_team(klub['naziv'], md['teams'])
        if not roster:
            continue  # silently skip non-matching

        meta = md.get('meta', {})
        # Home/away names come from the parser's explicit fields; the order
        # of md['teams'] keys is wrong when only the away side was parsed.
        klub_dom = meta.get('klub_dom')
        klub_gost = meta.get('klub_gost')

        with _db() as c:
            cu = c.cursor()
            for pl in roster:
                if not pl.get('hns_pid'):
                    continue
                seen_pids.add(pl['hns_pid'])
                name = pl['ime_prezime'] or ''
                # Split on the LAST space (see the note in upsert_player).
                parts = name.rsplit(' ', 1)
                ime = parts[0] if len(parts) > 1 else name
                prezime = parts[1] if len(parts) > 1 else ''
                slug = slugify(name)
                src_url = f"{BASE}/igraci/{pl['hns_pid']}/{slug}/"

                cu.execute(
                    """SELECT id FROM pgz_sport.clanovi
                       WHERE source='hns_semafor' AND source_id=%s""",
                    (str(pl['hns_pid']),))
                row = cu.fetchone()
                if row:
                    cid = row[0]
                    cu.execute(
                        """UPDATE pgz_sport.clanovi
                           SET ime=%s, prezime=%s,
                               slika_url=COALESCE(NULLIF(%s,''), slika_url),
                               broj_dresa=COALESCE(%s, broj_dresa),
                               pozicija=COALESCE(%s, pozicija),
                               klub_id=%s, source_url=%s, source_synced_at=now(), slug=%s
                           WHERE id=%s""",
                        (ime, prezime, pl.get('slika_url') or '', pl.get('broj_dresa'),
                         pl.get('pozicija'), klub_id_db, src_url, slug, cid))
                else:
                    cu.execute(
                        """INSERT INTO pgz_sport.clanovi
                           (klub_id, ime, prezime, slika_url, broj_dresa, pozicija,
                            source, source_id, source_url, source_synced_at, slug)
                           VALUES (%s,%s,%s,%s,%s,%s,'hns_semafor',%s,%s,now(),%s)
                           RETURNING id""",
                        (klub_id_db, ime, prezime, pl.get('slika_url'), pl.get('broj_dresa'),
                         pl.get('pozicija'), str(pl['hns_pid']), src_url, slug))
                    cid = cu.fetchone()[0]

                cu.execute(
                    """INSERT INTO pgz_sport.utakmice_log
                       (clan_id, source, source_match_id, source_url, datum, vrijeme,
                        natjecanje, klub_dom, klub_dom_logo, klub_gost, klub_gost_logo,
                        rezultat, za_klub_id, pogodaka, zuti_kartoni, crveni_kartoni,
                        minute, zapocet_kao_starter)
                       VALUES (%s,'hns_semafor',%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
                       ON CONFLICT (source, source_match_id, clan_id) DO UPDATE SET
                           datum=EXCLUDED.datum, rezultat=EXCLUDED.rezultat,
                           za_klub_id=EXCLUDED.za_klub_id,
                           pogodaka=EXCLUDED.pogodaka,
                           zuti_kartoni=EXCLUDED.zuti_kartoni,
                           crveni_kartoni=EXCLUDED.crveni_kartoni,
                           minute=EXCLUDED.minute,
                           zapocet_kao_starter=EXCLUDED.zapocet_kao_starter,
                           klub_dom_logo=EXCLUDED.klub_dom_logo,
                           klub_gost_logo=EXCLUDED.klub_gost_logo""",
                    (cid, mid, match_url,
                     meta.get('datum'), meta.get('vrijeme'), meta.get('natjecanje'),
                     klub_dom, meta.get('klub_dom_logo'),
                     klub_gost, meta.get('klub_gost_logo'),
                     meta.get('rezultat'), klub_id_db,
                     pl.get('pogodaka', 0), pl.get('zuti_kartoni', 0),
                     pl.get('crveni_kartoni', 0),
                     pl.get('minute'), pl.get('starter', True)))
        matches_logged += 1

    log.info(f"Klub {klub['naziv']} done: {len(seen_pids)} unique players, {matches_logged} matches logged")
    return len(seen_pids)


if __name__ == '__main__':
    if len(sys.argv) < 2:
        print(__doc__)
        sys.exit(1)
    cmd = sys.argv[1]
    if cmd == 'seed':
        print(cmd_seed())
    elif cmd == 'player':
        if len(sys.argv) < 3:
            print("Usage: player <hns_pid>")
            sys.exit(2)
        cid = cmd_player(int(sys.argv[2]))
        print(f"clan_id={cid}")
    elif cmd == 'daily':
        cmd_daily()
    elif cmd == 'klub':
        if len(sys.argv) < 3:
            print("Usage: klub <klub_id_db> [max_matches]")
            sys.exit(2)
        # The CLI default is a single match unless a limit is given.
        max_m = int(sys.argv[3]) if len(sys.argv) > 3 else 1
        cmd_klub(int(sys.argv[2]), max_matches=max_m)
    elif cmd == 'klub_all':
        # Scrape all PGŽ klubovi with hns_klub_id set
        with _db() as c:
            cu = c.cursor()
            cu.execute("SELECT id FROM pgz_sport.klubovi WHERE hns_klub_id IS NOT NULL ORDER BY id")
            kids = [r[0] for r in cu.fetchall()]
        log.info(f"Scraping rosters for {len(kids)} klubova…")
        for kid in kids:
            try:
                cmd_klub(kid, max_matches=999)
            except Exception as e:  # per-club boundary: log and carry on
                log.error(f"klub {kid}: {e}")
    else:
        print(f"Unknown: {cmd}")
        sys.exit(2)