609 lines
25 KiB
Python
Executable File
609 lines
25 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""
|
|
HNS Semafor scraper for PGŽ football clubs.
|
|
|
|
Strategy:
|
|
1. Seed-map known PGŽ clubs to HNS COMET klub_id (manual list to start)
|
|
2. For each klub: fetch /klubovi/{id}/{slug}/ and extract roster (player list)
|
|
3. For each player: fetch /igraci/{id}/{slug}/ → store in clanovi + utakmice_log
|
|
4. Respect rate limit (1 req / 1.5s), record run in scraper_runs
|
|
|
|
Run modes:
|
|
python hns_semafor.py seed # set hns_klub_id for known clubs
|
|
python hns_semafor.py klub <db_klub_id> # scrape one klub roster + players
|
|
python hns_semafor.py player <hns_pid> # scrape one player
|
|
python hns_semafor.py daily # full daily harvest of seeded PGŽ clubs
|
|
"""
|
|
import os, re, sys, time, json, logging
from contextlib import closing
from datetime import datetime, date
from urllib.parse import urljoin

import psycopg2
import psycopg2.extras
import requests
from bs4 import BeautifulSoup
|
|
|
|
# PostgreSQL connection parameters, expanded into psycopg2.connect() by conn().
# The password comes from the environment; a missing DB_PASSWORD raises
# KeyError as soon as this module is loaded.
DB = {
    "host": "localhost",
    "port": 5432,
    "dbname": "rinet_v3",
    "user": "rinet",
    "password": os.environ["DB_PASSWORD"],
}
# Site root that every scraped URL is built from.
BASE = "https://semafor.hns.family"
# User-Agent header sent with every request.
UA = "PGZ-Sport-Bot/1.0 (+https://api.rinet.one/sport/; legitimni interes; analitika sporta PGZ)"
# Pause between requests, in seconds.
RATE_S = 1.6
# Value handed to requests.get(timeout=...).
TIMEOUT = 25
|
|
|
|
# Root logging setup: every record at INFO or above goes both to the scraper
# log file and to stdout.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[
        logging.FileHandler("/opt/pgz-sport/_logs/hns_scraper.log"),
        logging.StreamHandler(sys.stdout),
    ],
)
# Module logger used by every function in this file.
log = logging.getLogger("hns")
|
|
|
|
# ═══ Manual seed mapping — PGŽ clubs → HNS COMET id ═══
# Discovered from semafor.hns.family/igraci/1167145/marko-komadina/ matches
# cmd_seed() looks each name up in pgz_sport.klubovi.naziv with ILIKE '%name%'
# and inserts a minimal row when nothing matches.
SEED_MAP = {
    # club name (naziv) → hns_klub_id
    "NK Klana": 1569,
    "NK Krk": 1558,
    "NK Mune": 1576,
    "NK Vihor": 4326,
    "NK Doker": 107415,
    "HNK Kozala": 3090,
    "HNK Lovran": 1574,
    "HNK Goranin": 1565,
    "NK Risnjak": 1583,
    "NK Lokomotiva": 1570,
    "NK Omladinac Vrata": 1579,
    "NK Draga": 1554,
    "NK Zamet": 1589,
    "NK Vrbovsko": 1588,
    "NK Rikard Benčić": 1582,
    "NK OŠK Omišalj": 3071,
}
|
|
|
|
def conn():
    """Open and return a fresh psycopg2 connection built from the ``DB`` settings."""
    connection = psycopg2.connect(**DB)
    return connection
|
|
|
|
def fetch(url: str) -> str:
    """GET ``url`` with the bot User-Agent and return the response body as text.

    The rate-limit pause (``RATE_S`` seconds) is taken after every request,
    including ones that time out or return an error status. Callers such as
    the daily and per-club loops catch the exception and move straight on to
    the next URL, so skipping the pause on failure would hammer a server that
    is already struggling.

    Args:
        url: Absolute URL to fetch.

    Returns:
        The decoded response body.

    Raises:
        requests.RequestException: On connection errors, timeouts, or a
            4xx/5xx status (via ``raise_for_status``).
    """
    log.info(f"GET {url}")
    try:
        r = requests.get(url, headers={"User-Agent": UA}, timeout=TIMEOUT)
        r.raise_for_status()
        return r.text
    finally:
        # Runs on success and on failure alike.
        time.sleep(RATE_S)
|
|
|
|
def slugify(s: str) -> str:
    """Turn a name into a lowercase, hyphen-separated ASCII slug.

    The five Croatian diacritics (č, ć, š, ž, đ) are folded to their base
    letters; every other run of characters outside ``a-z0-9`` collapses into a
    single hyphen, and leading/trailing hyphens are removed.
    """
    croatian_to_ascii = str.maketrans(
        {"č": "c", "ć": "c", "š": "s", "ž": "z", "đ": "d"}
    )
    folded = s.lower().strip().translate(croatian_to_ascii)
    hyphenated = re.sub(r'[^a-z0-9]+', '-', folded)
    return hyphenated.strip('-')
|
|
|
|
def cmd_seed():
    """Write the SEED_MAP ids into pgz_sport.klubovi, inserting clubs that are missing.

    For every ``naziv → hns_klub_id`` pair:
      1. UPDATE football clubs whose ``naziv`` contains the seed name
         (case-insensitive) and whose ``hns_klub_id`` is unset or already equal.
      2. If nothing was updated but some row already carries that
         ``hns_klub_id``, leave it alone.
      3. Otherwise INSERT a minimal club row.

    Returns:
        dict: ``{"updated": <rows updated>, "inserted": <rows inserted>}``.
    """
    n_updated = 0
    n_inserted = 0
    # psycopg2's `with connection:` only commits or rolls back the transaction;
    # closing() is what actually releases the connection afterwards.
    with closing(conn()) as c, c:
        cu = c.cursor()
        for naziv, hns_id in SEED_MAP.items():
            slug = slugify(naziv)
            # NOTE(review): the '%name%' pattern is a substring match, so a
            # short seed name can hit several clubs (all get the same id).
            cu.execute("""UPDATE pgz_sport.klubovi
                             SET hns_klub_id=%s, hns_slug=%s, source_synced_at=now()
                           WHERE naziv ILIKE %s AND sport='nogomet'
                             AND (hns_klub_id IS NULL OR hns_klub_id=%s)""",
                       (hns_id, slug, f"%{naziv}%", hns_id))
            if cu.rowcount > 0:
                n_updated += cu.rowcount
                continue
            # Try by hns_klub_id directly (already set elsewhere)
            cu.execute("SELECT id FROM pgz_sport.klubovi WHERE hns_klub_id=%s", (hns_id,))
            if cu.fetchone():
                continue
            # Insert new minimal row
            cu.execute("""INSERT INTO pgz_sport.klubovi
                            (naziv, sport, razina, hns_klub_id, hns_slug, aktivan, region,
                             source_synced_at, napomena)
                          VALUES (%s,'nogomet','3.HRL',%s,%s,true,'PGŽ',now(),
                                  'Auto-seeded from HNS Semafor (legitimni interes — analitika)')""",
                       (naziv, hns_id, slug))
            n_inserted += 1
    log.info(f"Seed: updated={n_updated}, inserted={n_inserted}")
    return {"updated": n_updated, "inserted": n_inserted}
|
|
|
|
def parse_player_profile(hns_pid: int, html: str) -> dict:
    """Parse a /igraci/{id}/{slug}/ profile page into a dict.

    Args:
        hns_pid: HNS player id; copied into the result unchanged.
        html: Raw HTML of the profile page.

    Returns:
        dict that always has ``hns_pid`` and ``matches`` (an empty list; this
        function never fills it) and, when found on the page:
        ``ime_prezime``, ``slika_url``, ``trenutni_klub_hns_id``,
        ``trenutni_klub``, ``datum_rodenja_raw``, ``datum_rodenja``
        (ISO ``YYYY-MM-DD``) and ``mjesto_rodenja``.
    """
    soup = BeautifulSoup(html, 'html.parser')
    data = {"hns_pid": hns_pid, "matches": []}

    # Name in first <h1>
    h1 = soup.find('h1')
    if h1:
        data['ime_prezime'] = h1.get_text(' ', strip=True)

    # Photo: the <img> whose alt text equals the player's name. Skipped when
    # the name is unknown, because alt='' would match any image with an empty
    # alt attribute.
    name = data.get('ime_prezime')
    if name:
        img = soup.find('img', alt=name)
        if img and img.get('src'):
            data['slika_url'] = img['src']

    # Current club: first link to /klubovi/{id}/; the club name is the <h4>
    # inside that link.
    klub_link = soup.find('a', href=re.compile(r'/klubovi/(\d+)/'))
    if klub_link:
        m = re.search(r'/klubovi/(\d+)/', klub_link['href'])
        if m:
            data['trenutni_klub_hns_id'] = int(m.group(1))
        h = klub_link.find('h4')
        if h:
            data['trenutni_klub'] = h.get_text(' ', strip=True)

    # Date of birth: <li class="dob"><h4>DD.MM.YYYY</h4>
    li_dob = soup.find('li', class_='dob')
    h4_dob = li_dob.find('h4') if li_dob else None
    if h4_dob:
        raw = h4_dob.get_text(' ', strip=True)
        data['datum_rodenja_raw'] = raw
        m = re.match(r'(\d{1,2})\.(\d{1,2})\.(\d{4})', raw)
        if m:
            day, month, year = (int(g) for g in m.groups())
            try:
                data['datum_rodenja'] = date(year, month, day).isoformat()
            except ValueError:
                # Impossible calendar date (e.g. 31.02.); keep only the raw text.
                pass

    # Place of birth: <li class="pob"><h4>…</h4>
    li_pob = soup.find('li', class_='pob')
    h4_pob = li_pob.find('h4') if li_pob else None
    if h4_pob:
        data['mjesto_rodenja'] = h4_pob.get_text(strip=True)

    # Old layout fallback: the <h4> preceding an <h3> labelled "Mjesto rođenja".
    # Runs only when the targeted lookup above found nothing, so it can no
    # longer overwrite a good value.
    if not data.get('mjesto_rodenja'):
        for h3 in soup.find_all('h3'):
            if 'Mjesto rođenja' in h3.get_text():
                prev = h3.find_previous('h4')
                if prev:
                    data['mjesto_rodenja'] = prev.get_text(strip=True)

    return data
|
|
|
|
def upsert_player(klub_id_db: int, prof: dict) -> int:
    """Insert or update the pgz_sport.clanovi row for a parsed player profile.

    The row is looked up by ``source='hns_semafor'`` and ``source_id``
    (the HNS player id as text).

    On UPDATE, fields the parser could not extract (missing name, birth date,
    birthplace, photo) leave the stored value untouched instead of overwriting
    it with NULL or an empty string, the same COALESCE approach cmd_klub uses
    on this table.

    Args:
        klub_id_db: ``pgz_sport.klubovi.id`` to attach the player to.
        prof: Dict from ``parse_player_profile``; ``hns_pid`` is required.

    Returns:
        The ``clanovi.id`` of the inserted or updated row.
    """
    full_name = prof.get('ime_prezime', '') or ''
    # First token is `ime`, the remainder is `prezime`.
    # NOTE(review): cmd_klub splits on the LAST space instead, so names with
    # three or more words are stored differently depending on which command
    # ran last; confirm which split is intended.
    ime, _, prezime = full_name.partition(' ')
    slug = slugify(full_name)
    url = f"{BASE}/igraci/{prof['hns_pid']}/{slug}/"
    name_known = bool(full_name.strip())

    # psycopg2's `with connection:` only ends the transaction; closing()
    # releases the connection itself.
    with closing(conn()) as c, c:
        cu = c.cursor()
        # Try find existing by source_id
        cu.execute("""SELECT id FROM pgz_sport.clanovi
                       WHERE source='hns_semafor' AND source_id=%s""",
                   (str(prof['hns_pid']),))
        row = cu.fetchone()
        if row:
            cid = row[0]
            cu.execute("""UPDATE pgz_sport.clanovi
                             SET ime=COALESCE(%s, ime),
                                 prezime=COALESCE(%s, prezime),
                                 datum_rodenja=COALESCE(%s, datum_rodenja),
                                 mjesto_rodenja=COALESCE(%s, mjesto_rodenja),
                                 slika_url=COALESCE(%s, slika_url),
                                 klub_id=%s,
                                 source_url=COALESCE(%s, source_url),
                                 source_synced_at=now()
                           WHERE id=%s""",
                       (ime if name_known else None,
                        prezime if name_known else None,
                        prof.get('datum_rodenja'),
                        prof.get('mjesto_rodenja'),
                        prof.get('slika_url'),
                        klub_id_db,
                        # Without a name the slug part of the URL is empty.
                        url if name_known else None,
                        cid))
        else:
            cu.execute("""INSERT INTO pgz_sport.clanovi
                            (klub_id, ime, prezime, datum_rodenja, mjesto_rodenja, slika_url,
                             source, source_id, source_url, source_synced_at, slug)
                          VALUES (%s,%s,%s,%s,%s,%s,'hns_semafor',%s,%s,now(),%s)
                          RETURNING id""",
                       (klub_id_db, ime, prezime, prof.get('datum_rodenja'),
                        prof.get('mjesto_rodenja'), prof.get('slika_url'),
                        str(prof['hns_pid']), url, slug))
            cid = cu.fetchone()[0]
    return cid
|
|
|
|
def cmd_player(hns_pid: int, klub_id_db: "int | None" = None):
    """Scrape a single player by HNS id and upsert it into pgz_sport.clanovi.

    Club resolution order when ``klub_id_db`` is not given:
      1. the ``klub_id`` already stored for this player, if any;
      2. the club linked on the profile page, mapped through
         ``klubovi.hns_klub_id``.

    NOTE(review): because step 1 wins, a stored club is never replaced by the
    club shown on the profile; confirm that is intended for transfers.

    Args:
        hns_pid: HNS player id.
        klub_id_db: Optional ``pgz_sport.klubovi.id`` to attach the player to.

    Returns:
        The ``clanovi.id`` of the upserted row, or ``None`` when no club could
        be resolved (the player is then not written).
    """
    if klub_id_db is None:
        # try to infer from current klub via DB if previously stored
        # closing() releases the connection before the slow HTTP fetch below;
        # psycopg2's own `with connection:` only ends the transaction.
        with closing(conn()) as c, c:
            cu = c.cursor()
            cu.execute("""SELECT klub_id FROM pgz_sport.clanovi
                           WHERE source='hns_semafor' AND source_id=%s""",
                       (str(hns_pid),))
            r = cu.fetchone()
        if r:
            klub_id_db = r[0]

    url = f"{BASE}/igraci/{hns_pid}/dummy/"  # slug is forgiving; HNS redirects
    html = fetch(url)
    prof = parse_player_profile(hns_pid, html)
    log.info(f"Parsed: {prof.get('ime_prezime','?')} (HNS#{hns_pid}) klub={prof.get('trenutni_klub','?')}")

    # Resolve current_klub_hns_id → klub_id_db if not provided
    if klub_id_db is None and prof.get('trenutni_klub_hns_id'):
        with closing(conn()) as c, c:
            cu = c.cursor()
            cu.execute("SELECT id FROM pgz_sport.klubovi WHERE hns_klub_id=%s",
                       (prof['trenutni_klub_hns_id'],))
            r = cu.fetchone()
        if r:
            klub_id_db = r[0]

    if klub_id_db is None:
        log.warning(f"No DB klub_id for HNS player {hns_pid} — skipping upsert")
        return None

    return upsert_player(klub_id_db, prof)
|
|
|
|
def cmd_daily():
    """Daily run: refresh the seed mapping, then re-fetch already known players.

    A row is opened in ``pgz_sport.scraper_runs`` at the start and completed in
    a ``finally`` block, so the run is recorded even when it aborts.

    Up to 500 players with ``source='hns_semafor'`` are re-scraped, those with
    the oldest (or missing) ``source_synced_at`` first. A failure on one player
    is logged and collected; it does not stop the run.

    Recorded status is ``"ok"`` when nothing failed and ``"partial"``
    otherwise. A crash outside the per-player loop is added to the error list
    before being re-raised, so an aborted run is no longer recorded as "ok".
    """
    # psycopg2's `with connection:` only ends the transaction; closing()
    # releases the connection itself.
    with closing(conn()) as c, c:
        cu = c.cursor()
        cu.execute("""INSERT INTO pgz_sport.scraper_runs (source, scope)
                      VALUES ('hns_semafor','daily') RETURNING id""")
        run_id = cu.fetchone()[0]

    inserted = 0  # nothing is inserted by this run yet; kept for the run record
    updated = 0
    skipped = 0
    errors = []
    try:
        # Phase 1: ensure seed mapping is current
        cmd_seed()
        log.info("=== Daily HNS harvest start ===")
        # TODO: roster discovery requires per-klub roster page. For now, only re-fetch known players.
        with closing(conn()) as c, c:
            cu = c.cursor()
            cu.execute("""SELECT source_id FROM pgz_sport.clanovi
                           WHERE source='hns_semafor'
                           ORDER BY source_synced_at NULLS FIRST LIMIT 500""")
            pids = [r[0] for r in cu.fetchall()]
        for pid in pids:
            try:
                cid = cmd_player(int(pid))
            except Exception as e:
                log.error(f"player {pid}: {e}")
                errors.append({"pid": pid, "err": str(e)})
                continue
            # cmd_player returns None when it could not resolve a club and
            # therefore wrote nothing; that is not an update.
            if cid is None:
                skipped += 1
            else:
                updated += 1
        log.info(f"=== Daily done: updated={updated} skipped={skipped} errors={len(errors)} ===")
    except BaseException as e:
        # Anything escaping the loop (seed failure, DB outage, Ctrl-C) aborts
        # the run; note it so the status below is not "ok".
        errors.append({"pid": None, "err": f"run aborted: {e!r}"})
        raise
    finally:
        with closing(conn()) as c, c:
            cu = c.cursor()
            cu.execute("""UPDATE pgz_sport.scraper_runs
                             SET finished_at=now(), status=%s, rows_updated=%s,
                                 errors=%s::jsonb, rows_inserted=%s
                           WHERE id=%s""",
                       ("ok" if not errors else "partial", updated,
                        json.dumps(errors), inserted, run_id))
|
|
|
|
|
|
# CSS class on an event <li> → internal event kind.
_EVENT_KIND_MAP = {
    'goal': 'gol',
    'ownGoal': 'autogol',
    'penaltyGoal': 'gol',
    'yellow': 'zuti',
    'secondYellow': 'zuti2',  # second yellow → effectively red
    'red': 'crveni',
    'substitutionIn': 'subIn',
    'substitutionOut': 'subOut',
}

# Regulation length used for the minutes-played estimate.
_FULL_MATCH_MINUTES = 90


def _minute_to_int(minute):
    """Convert an event minute such as '87' or '45+2' to an int (87, 47); None if absent."""
    if not minute:
        return None
    try:
        return sum(int(part) for part in minute.split('+'))
    except ValueError:
        return None


def _estimate_minutes(is_starter, sub_in_min, sub_out_min):
    """Estimate minutes played from the lineup role and substitution minutes.

    Starters play from minute 0, substitutes from the minute they came on
    (0 minutes if they never did); play ends at the substitution-out minute or
    at 90. Stoppage time is capped at 90.
    """
    came_on = _minute_to_int(sub_in_min)
    went_off = _minute_to_int(sub_out_min)
    if is_starter:
        start = 0
    elif came_on is None:
        return 0  # never came on
    else:
        start = min(came_on, _FULL_MATCH_MINUTES)
    end = _FULL_MATCH_MINUTES if went_off is None else min(went_off, _FULL_MATCH_MINUTES)
    return max(0, end - start)


def _parse_events(li):
    """Return the recognised events of one lineup row as [{'kind', 'minute'}, ...]."""
    me_div = li.find('div', class_='matchEvents')
    ev_ul = me_div.find('ul', class_='events') if me_div else None
    if not ev_ul:
        return []
    events = []
    for ev_li in ev_ul.find_all('li', recursive=False):
        kind = next((_EVENT_KIND_MAP[k] for k in (ev_li.get('class') or [])
                     if k in _EVENT_KIND_MAP), None)
        if not kind:
            continue  # unknown event class
        mm = re.search(r"(\d+(?:\+\d+)?)\s*'", ev_li.get_text(' ', strip=True))
        events.append({'kind': kind, 'minute': mm.group(1) if mm else None})
    return events


def _parse_lineup_row(li, is_starter):
    """Parse one <li class='row match_lineup'> into a player dict; None if it has no player id."""
    pid = li.get('data-personid')
    if not pid:
        # Fall back to the id embedded in the profile link.
        a = li.find('a', href=re.compile(r'/igraci/(\d+)/'))
        if not a:
            return None
        pid = re.search(r'/igraci/(\d+)/', a['href']).group(1)
    try:
        pid = int(pid)
    except (TypeError, ValueError):
        return None

    # Shirt number
    broj_dresa = None
    sn = li.find('div', class_='shirtNumber')
    if sn:
        bs = sn.get_text(' ', strip=True).strip()
        if bs.isdigit():
            broj_dresa = int(bs)

    # Image
    img = li.find('img')
    slika = img.get('src') if img else None

    # Name + position; the position is the text that follows the <h3> name.
    ime_prezime = ''
    pozicija = None
    captain = False
    pn = li.find('div', class_='playerName')
    if pn:
        a2 = pn.find('a')
        if a2:
            ime_prezime = a2.get_text(' ', strip=True)
        rest = pn.get_text(' ', strip=True).replace(ime_prezime, '').strip()
        if '(C)' in rest:
            captain = True
        rest = rest.replace('(C)', '').strip()
        if 'Vratar' in rest:
            pozicija = 'Vratar'
        elif 'Igrač' in rest:
            pozicija = 'Igrač'

    events = _parse_events(li)
    sub_in_min = next((e['minute'] for e in events if e['kind'] == 'subIn'), None)
    sub_out_min = next((e['minute'] for e in events if e['kind'] == 'subOut'), None)

    return {
        'hns_pid': pid,
        'ime_prezime': ime_prezime,
        'broj_dresa': broj_dresa,
        'pozicija': pozicija,
        'slika_url': slika,
        'captain': captain,
        'starter': is_starter,
        'events': events,
        'pogodaka': sum(1 for e in events if e['kind'] == 'gol'),
        'zuti_kartoni': sum(1 for e in events if e['kind'] == 'zuti'),
        # 2nd yellow = red
        'crveni_kartoni': sum(1 for e in events if e['kind'] in ('crveni', 'zuti2')),
        'minute': _estimate_minutes(is_starter, sub_in_min, sub_out_min),
    }


def _parse_team_div(team_div):
    """Return (team_name, players) for a homeTeam/awayTeam <div>; (None, []) if unusable."""
    if not team_div:
        return None, []
    ul = team_div.find('ul', recursive=False) or team_div.find('ul')
    if not ul:
        return None, []
    team_name = None
    players = []
    is_starter = True
    for li in ul.find_all('li', recursive=False):
        cls = li.get('class') or []
        if 'header' in cls and 'clubName' in cls:
            team_name = li.get_text(' ', strip=True)
            continue
        if 'header' in cls and 'separatorTitle' in cls:
            # Rows after the "Pričuvni…" separator are substitutes.
            if 'Pričuvni' in li.get_text(' ', strip=True):
                is_starter = False
            continue
        if not ('row' in cls and 'match_lineup' in cls):
            continue
        player = _parse_lineup_row(li, is_starter)
        if player is not None:
            players.append(player)
    return team_name, players


def parse_match(html, match_url=None):
    """HNS match parser v4 — uses precise class signals.

    Player <li class='row match_lineup' data-personid='87561'>:
      <div class='shirtNumber'>9</div>
      <div class='playerPhoto'><div class='photo'><img src='...' /></div></div>
      <div class='playerName'><h3><a href='/igraci/.../'>Ivan Laginja</a></h3>Igrač</div>
      <div class='matchEvents'>
        <ul class='events'>
          <li class='goal'><div class='icon' title='Gol'></div>40'</li>
          <li class='substitutionOut'><div class='icon' title='Izmjena'></div>87'</li>
          <li class='yellow'>...</li>
          <li class='red'>...</li>
          <li class='ownGoal'>...</li>
          <li class='substitutionIn'>...</li>
        </ul>
      </div>

    Args:
        html: Raw HTML of a match page.
        match_url: URL the page came from; copied into the result.

    Returns:
        dict with keys:
          ``teams``      {team name: [player dict, ...]}, home team first
          ``match_url``  the ``match_url`` argument
          ``title``      text of the first <h1>
          ``meta``       ``klub_dom``, ``klub_gost`` (always present, may be
                         None) plus, when found, ``klub_dom_logo``,
                         ``klub_gost_logo``, ``datum`` (YYYY-MM-DD),
                         ``vrijeme``, ``gledatelja``, ``rezultat``,
                         ``natjecanje``

        Each player dict carries ``minute``, an estimate of minutes played.
        Stoppage-time minutes such as "45+2" are read as 47 (capped at 90),
        and a substitute who is later taken off is credited only for the time
        between coming on and going off.
    """
    soup = BeautifulSoup(html, 'html.parser')
    out = {"teams": {}, "match_url": match_url, "meta": {}, "title": ""}

    h1 = soup.find('h1')
    out['title'] = h1.get_text(' ', strip=True) if h1 else ''

    home_div = soup.find('div', class_='homeTeam')
    away_div = soup.find('div', class_='awayTeam')

    home_name, home_players = _parse_team_div(home_div)
    away_name, away_players = _parse_team_div(away_div)

    if home_name:
        out['teams'][home_name] = home_players
    if away_name:
        out['teams'][away_name] = away_players

    meta = out['meta']

    # Logo URLs: first <img> inside each team block.
    if home_div:
        h_img = home_div.find('img')
        meta['klub_dom_logo'] = h_img.get('src') if h_img else None
    if away_div:
        a_img = away_div.find('img')
        meta['klub_gost_logo'] = a_img.get('src') if a_img else None

    meta['klub_dom'] = home_name
    meta['klub_gost'] = away_name

    # Date/time and attendance are read from the page's plain text.
    body_text = soup.get_text(' ', strip=True)
    dm = re.search(r'(\d{1,2}\.\d{1,2}\.\d{4})\.?\s*(\d{1,2}:\d{2})', body_text)
    if dm:
        # The pattern guarantees exactly day.month.year.
        day, month, year = dm.group(1).split('.')
        meta['datum'] = f"{year}-{month.zfill(2)}-{day.zfill(2)}"
        meta['vrijeme'] = dm.group(2)
    gm = re.search(r'Gledatelja:\s*(\d+)', body_text)
    if gm:
        meta['gledatelja'] = int(gm.group(1))

    # Score and competition come from the title.
    rm = re.search(r'(\d+):(\d+)', out['title'])
    if rm:
        meta['rezultat'] = f"{rm.group(1)}:{rm.group(2)}"

    title_parts = out['title'].split(',')
    if len(title_parts) > 1:
        meta['natjecanje'] = title_parts[-1].strip()

    return out
|
|
|
|
|
|
# Name tokens too common to identify a club on their own.
_GENERIC_CLUB_TOKENS = frozenset({'nk', 'hnk', 'klub', 'nogometni'})


def _name_tokens(name_low):
    """Split a lowercased name into its set of word tokens (non-word characters act as separators)."""
    return {t for t in re.split(r'\s+', re.sub(r'[^\w]', ' ', name_low)) if t}


def _find_own_team(klub_naziv, teams):
    """Pick the entry of ``teams`` that is our club.

    Returns ``(team_name, players, fuzzy)``; ``(None, [], False)`` when no
    team matches. ``fuzzy`` is True when the match came from token overlap
    rather than a plain substring hit.
    """
    klub_low = klub_naziv.lower()
    klub_tokens = _name_tokens(klub_low)
    for tn, players in teams.items():
        tn_low = tn.lower()
        # try exact substring both directions
        if klub_low in tn_low or tn_low in klub_low:
            return tn, players, False
        # token-set overlap (e.g. "NK Krk" vs "NK Krk Krk" or "NK Vihor" vs "NK Vihor (B)")
        strong = (klub_tokens & _name_tokens(tn_low)) - _GENERIC_CLUB_TOKENS
        if strong and (klub_low.split()[-1] in tn_low or tn_low.split()[-1] in klub_low):
            return tn, players, True
    return None, [], False


def cmd_klub(klub_id_db: int, max_matches: int = 999):
    """Scrape one club: club page → match pages → upsert our players and their match stats.

    For each match linked from the club page (first ``max_matches`` of them,
    in page order) the lineup of our team is located by name, every player is
    upserted into ``pgz_sport.clanovi`` and one ``pgz_sport.utakmice_log`` row
    per player and match is inserted or refreshed. Matches that fail to
    download or parse, or in which our team cannot be identified, are skipped.

    Args:
        klub_id_db: ``pgz_sport.klubovi.id`` of the club to scrape.
        max_matches: Upper bound on the number of match pages processed.

    Returns:
        Number of distinct players seen; 0 when the club has no
        ``hns_klub_id`` or its page cannot be fetched.
    """
    # psycopg2's `with connection:` only ends the transaction; closing()
    # releases the connection itself.
    with closing(conn()) as c, c:
        cu = c.cursor(cursor_factory=psycopg2.extras.RealDictCursor)
        cu.execute("SELECT id, naziv, hns_klub_id, hns_slug FROM pgz_sport.klubovi WHERE id=%s", (klub_id_db,))
        klub = cu.fetchone()
    if not klub or not klub['hns_klub_id']:
        log.error(f"Klub #{klub_id_db}: nema hns_klub_id")
        return 0

    klub_url = f"{BASE}/klubovi/{klub['hns_klub_id']}/{klub['hns_slug'] or 'k'}/"
    log.info(f"Klub: {klub['naziv']} → {klub_url}")
    try:
        html = fetch(klub_url)
    except Exception as e:
        log.error(f"klub fetch failed: {e}")
        return 0

    # Collect match ids in page order without duplicates (dict keeps insertion order).
    soup = BeautifulSoup(html, 'html.parser')
    unique_ids = {}
    for a in soup.find_all('a', href=re.compile(r'/utakmice/(\d+)/')):
        mm = re.search(r'/utakmice/(\d+)/', a['href'])
        if mm:
            unique_ids.setdefault(mm.group(1), None)
    match_ids = list(unique_ids)
    log.info(f" found {len(match_ids)} matches; processing up to {max_matches}")

    seen_pids = set()
    matches_logged = 0

    for mid in match_ids[:max_matches]:
        match_url = f"{BASE}/utakmice/{mid}/"
        try:
            md = parse_match(fetch(match_url), match_url=match_url)
        except Exception as e:
            log.error(f" match {mid}: {e}")
            continue

        if not md.get('teams'):
            log.warning(f" match {mid}: no teams parsed")
            continue

        # Find which team (home or away) is OURS — use looser match (incl. token overlap)
        matched_team, roster, fuzzy = _find_own_team(klub['naziv'], md['teams'])
        if fuzzy:
            log.info(f" fuzzy match: {klub['naziv']} ↔ {matched_team}")
        if not roster:
            continue  # silently skip non-matching

        meta = md.get('meta', {})
        # Take home/away from the parser's explicit fields. The order of the
        # teams dict is only home-then-away when both names were parsed.
        klub_dom = meta.get('klub_dom')
        klub_gost = meta.get('klub_gost')

        with closing(conn()) as c, c:
            cu = c.cursor()
            for pl in roster:
                if not pl.get('hns_pid'):
                    continue
                seen_pids.add(pl['hns_pid'])

                # Last word is `prezime`, everything before it is `ime`.
                name = pl['ime_prezime'] or ''
                parts = name.rsplit(' ', 1)
                ime = parts[0] if len(parts) > 1 else name
                prezime = parts[1] if len(parts) > 1 else ''
                slug = slugify(name)
                src_url = f"{BASE}/igraci/{pl['hns_pid']}/{slug}/"

                cu.execute("""SELECT id FROM pgz_sport.clanovi WHERE source='hns_semafor' AND source_id=%s""",
                           (str(pl['hns_pid']),))
                row = cu.fetchone()
                if row:
                    cid = row[0]
                    # COALESCE keeps stored values when the lineup row lacks them.
                    cu.execute("""UPDATE pgz_sport.clanovi
                                     SET ime=%s, prezime=%s, slika_url=COALESCE(NULLIF(%s,''), slika_url),
                                         broj_dresa=COALESCE(%s, broj_dresa),
                                         pozicija=COALESCE(%s, pozicija),
                                         klub_id=%s, source_url=%s, source_synced_at=now(), slug=%s
                                   WHERE id=%s""",
                               (ime, prezime, pl.get('slika_url') or '', pl.get('broj_dresa'),
                                pl.get('pozicija'), klub_id_db, src_url, slug, cid))
                else:
                    cu.execute("""INSERT INTO pgz_sport.clanovi
                                    (klub_id, ime, prezime, slika_url, broj_dresa, pozicija,
                                     source, source_id, source_url, source_synced_at, slug)
                                  VALUES (%s,%s,%s,%s,%s,%s,'hns_semafor',%s,%s,now(),%s)
                                  RETURNING id""",
                               (klub_id_db, ime, prezime, pl.get('slika_url'), pl.get('broj_dresa'),
                                pl.get('pozicija'), str(pl['hns_pid']), src_url, slug))
                    cid = cu.fetchone()[0]

                cu.execute("""INSERT INTO pgz_sport.utakmice_log
                                (clan_id, source, source_match_id, source_url, datum, vrijeme,
                                 natjecanje, klub_dom, klub_dom_logo, klub_gost, klub_gost_logo, rezultat, za_klub_id,
                                 pogodaka, zuti_kartoni, crveni_kartoni, minute, zapocet_kao_starter)
                              VALUES (%s,'hns_semafor',%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
                              ON CONFLICT (source, source_match_id, clan_id) DO UPDATE SET
                                datum=EXCLUDED.datum, rezultat=EXCLUDED.rezultat,
                                za_klub_id=EXCLUDED.za_klub_id,
                                pogodaka=EXCLUDED.pogodaka, zuti_kartoni=EXCLUDED.zuti_kartoni,
                                crveni_kartoni=EXCLUDED.crveni_kartoni, minute=EXCLUDED.minute,
                                zapocet_kao_starter=EXCLUDED.zapocet_kao_starter,
                                klub_dom_logo=EXCLUDED.klub_dom_logo, klub_gost_logo=EXCLUDED.klub_gost_logo""",
                           (cid, mid, match_url,
                            meta.get('datum'), meta.get('vrijeme'),
                            meta.get('natjecanje'), klub_dom, meta.get('klub_dom_logo'),
                            klub_gost, meta.get('klub_gost_logo'),
                            meta.get('rezultat'), klub_id_db,
                            pl.get('pogodaka', 0), pl.get('zuti_kartoni', 0),
                            pl.get('crveni_kartoni', 0), pl.get('minute'),
                            pl.get('starter', True)))
        matches_logged += 1

    log.info(f"Klub {klub['naziv']} done: {len(seen_pids)} unique players, {matches_logged} matches logged")
    return len(seen_pids)
|
|
|
|
|
|
# Command-line entry point: dispatch on the first argument.
#   seed | player <hns_pid> | daily | klub <db_klub_id> [max_matches] | klub_all
# Exit code 1 = no command given, 2 = unknown command or missing argument.
if __name__ == '__main__':
    if len(sys.argv) < 2:
        print(__doc__)
        sys.exit(1)
    cmd = sys.argv[1]
    if cmd == 'seed':
        print(cmd_seed())
    elif cmd == 'player':
        # Without this check a missing id surfaced as a bare IndexError.
        if len(sys.argv) < 3:
            print("Usage: player <hns_pid>")
            sys.exit(2)
        cid = cmd_player(int(sys.argv[2]))
        print(f"clan_id={cid}")
    elif cmd == 'daily':
        cmd_daily()
    elif cmd == 'klub':
        if len(sys.argv) < 3:
            print("Usage: klub <db_klub_id> [max_matches]")
            sys.exit(2)
        # When max_matches is omitted only ONE match is processed.
        max_m = int(sys.argv[3]) if len(sys.argv) > 3 else 1
        cmd_klub(int(sys.argv[2]), max_matches=max_m)
    elif cmd == 'klub_all':
        # Scrape all PGŽ clubs with hns_klub_id set.
        # closing() releases the connection; psycopg2's own `with connection:`
        # only ends the transaction.
        with closing(conn()) as c, c:
            cu = c.cursor()
            cu.execute("SELECT id FROM pgz_sport.klubovi WHERE hns_klub_id IS NOT NULL ORDER BY id")
            kids = [r[0] for r in cu.fetchall()]
        log.info(f"Scraping rosters for {len(kids)} klubova…")
        for kid in kids:
            # One failing club must not stop the others.
            try:
                cmd_klub(kid, max_matches=999)
            except Exception as e:
                log.error(f"klub {kid}: {e}")
    else:
        print(f"Unknown: {cmd}")
        sys.exit(2)
|