Files
pgz-sport/scripts/sub1_hns_link_harvester.py
T

362 lines
15 KiB
Python

#!/usr/bin/env python3
from dotenv import load_dotenv
load_dotenv('/opt/rinet-gpu/.env.master')
# auto-added by patch_scrapers_with_dotenv.sh
# ═══════════════════════════════════════════════════════════════════
# Fajl: sub1_hns_link_harvester.py | v1.0.0 | 05.05.2026
# Lokacija: /opt/pgz-sport/scripts/sub1_hns_link_harvester.py
# Autor: dradulic@outlook.com / damir@rinet.one
# Svrha: SUB1 — Pronađi semafor.hns.family link za PGŽ priority
# nogometne klubove koji nemaju hns_klub_id.
# Strategija:
# 1. Enumerate ŽNS Primorsko-goranski (oid=51) competitions across
# seasons, plus 4. NL NS Rijeka, 3. HNL Zapad arhive
# 2. Za svaki natjecanje GET /natjecanja/{cid}/{cname}/ i izvuci
# sve <a href="/klubovi/{id}/{slug}/">{naziv}</a>
# 3. Build catalog (hns_id, slug, naziv) — skup unique
# 4. Fuzzy match candidate klubovi: normalize, drop NK/HNK/GNK
# prefiks, ukloni dijakritike, pa equality + substring + ratio
# 5. UPDATE pgz_sport.klubovi za matche; mark not_found za ostalo
# ═══════════════════════════════════════════════════════════════════
"""SUB1 — HNS link harvester for PGŽ priority football clubs."""
import os, re, sys, time, json, traceback, subprocess, difflib
from datetime import datetime
from urllib.parse import quote
import urllib.request, urllib.error
import psycopg2
from psycopg2.extras import RealDictCursor
# ── Runtime configuration ──
# PostgreSQL connection string. RINET_DSN wins when it is set; only when it is
# absent is the fallback built from DB_PASSWORD. The fallback is evaluated
# lazily so that supplying a complete RINET_DSN does not additionally require
# DB_PASSWORD to exist (the eager form raised KeyError in that case).
_DSN_OVERRIDE = os.getenv("RINET_DSN")
DSN = _DSN_OVERRIDE if _DSN_OVERRIDE is not None else (
    f"host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password={os.environ['DB_PASSWORD']}")
# NOTE(security): a live-looking bot token is committed here as the fallback
# value. It is kept so behavior does not change, but it should be rotated and
# supplied only through TG_BOT_TOKEN in the environment.
TG = os.getenv("TG_BOT_TOKEN", "8535797835:AAFItT-92jzZ9NWFafLxn0dLa1_n2s-JE5Y")
TG_CHAT = os.getenv("TG_CHAT", "7969491558")
# User-Agent sent with every request issued by http_get().
UA = "PGZ-Sport-Bot/1.0 (+https://api.rinet.one/sport/; legitimni interes; analitika sporta PGZ; contact dradulic@outlook.com)"
# Pause (seconds) between consecutive requests to the remote site.
SLEEP = 1.1
BASE = "https://semafor.hns.family"
# One log file per run, named by start minute.
LOG_PATH = f"/var/log/pgz-sport-debug/sub1_{datetime.now().strftime('%Y%m%d_%H%M')}.log"
os.makedirs(os.path.dirname(LOG_PATH), exist_ok=True)
# Explicit UTF-8: log lines contain non-ASCII club names, and the locale
# default encoding is not guaranteed to be able to represent them.
LOG = open(LOG_PATH, "a", encoding="utf-8")
def log(msg, telegram=False):
    """Write a timestamped message to stdout and the run log file.

    Args:
        msg: Text to record.
        telegram: When true, additionally POST the first 3500 characters of
            ``msg`` to the Telegram Bot API via ``curl``.

    A failure while invoking ``curl`` is itself logged (without Telegram) and
    never propagates to the caller.
    """
    stamp = datetime.now().isoformat(timespec='seconds')
    entry = f"[{stamp}] {msg}"
    print(entry, flush=True)
    LOG.write(entry + "\n")
    LOG.flush()
    if not telegram:
        return
    try:
        curl_cmd = [
            "curl", "-s", "-X", "POST",
            f"https://api.telegram.org/bot{TG}/sendMessage",
            "-d", f"chat_id={TG_CHAT}",
            "--data-urlencode", f"text={msg[:3500]}",
        ]
        subprocess.run(curl_cmd, timeout=8, capture_output=True)
    except Exception as err:
        # Re-enter with telegram=False so a broken notifier cannot recurse.
        log(f"TG error: {err}")
def http_get(url, accept_json=False, timeout=25):
    """Fetch ``url`` with the bot User-Agent and return the body as text.

    Args:
        url: Absolute URL to request.
        accept_json: When true, send JSON-oriented ``Accept`` and
            ``X-Requested-With: XMLHttpRequest`` headers; otherwise send an
            HTML ``Accept`` header and an empty ``X-Requested-With``.
        timeout: Timeout in seconds passed to ``urlopen``.

    Returns:
        The response body decoded as UTF-8; undecodable bytes are replaced.

    Errors raised by ``urllib.request.urlopen`` are not caught here.
    """
    if accept_json:
        accept, requested_with = "application/json, */*", "XMLHttpRequest"
    else:
        accept, requested_with = "text/html,*/*", ""
    headers = {
        "User-Agent": UA,
        "Accept": accept,
        "X-Requested-With": requested_with,
    }
    request = urllib.request.Request(url, headers=headers)
    with urllib.request.urlopen(request, timeout=timeout) as resp:
        payload = resp.read()
    return payload.decode("utf-8", errors="replace")
# ── Normalization for fuzzy match ──
# Single-character folding table used by norm(): Croatian diacritics (both
# cases) and a few acute-accented vowels are mapped to plain ASCII lower case.
DIACRITIC_MAP = str.maketrans({
    'č': 'c', 'ć': 'c', 'ž': 'z', 'š': 's', 'đ': 'd',
    'Č': 'c', 'Ć': 'c', 'Ž': 'z', 'Š': 's', 'Đ': 'd',
    'á': 'a', 'é': 'e', 'í': 'i', 'ó': 'o', 'ú': 'u',
})
# Organisational prefix at the start of a club name, including the whitespace
# that follows it. core_name() applies this to norm() output, which is already
# lower-cased and stripped of diacritics, so every alternative that contains a
# diacritic also accepts its ASCII-folded spelling ([sš], [zž]); with only the
# accented spelling those alternatives could never match normalized input.
PREFIX_RE = re.compile(
    r'^(hrvatski\s+nogometni\s+klub\.?|'
    r'nogometni\s+klub|nogometna\s+akademija|nogometna\s+[sš]kola|'
    r'sportska\s+akademija|[zž]enski\s+nogometni\s+klub|'
    r'hnk\.?|gnk|[zž]nk|nk\.?)\s+',
    re.IGNORECASE
)
# Known noise fragments (veteran sections, duplicated place names) and a
# trailing four-digit year, optionally preceded by a dash. As above, the
# accented alternative also accepts the folded spelling.
SUFFIX_NOISE_RE = re.compile(
    r'\b(veterani|veterana|gornji\s+zamet|grada\s+crikvenice|'
    r'gomirje\s+gomirje|mrkopalj\s+mrkopalj|snje[zž]nik\s+gerovo|'
    r'-?\s*\d{4}\s*$)', re.IGNORECASE)
def norm(s):
    """Return a comparison-friendly form of ``s``.

    The text is lower-cased, trimmed, folded through ``DIACRITIC_MAP``,
    stripped of double quotes, single quotes and backticks, and has every
    whitespace run collapsed to one space. Falsy input yields ``""``.
    """
    if not s:
        return ""
    folded = s.lower().strip().translate(DIACRITIC_MAP)
    unquoted = re.sub(r'["\'`]', '', folded)
    return re.sub(r'\s+', ' ', unquoted)
def core_name(naziv):
    """Return the normalized club name with prefixes and noise removed.

    The name is passed through ``norm()``, then ``PREFIX_RE`` is applied up to
    three times (prefixes can be nested, e.g. "Nogometni Klub HNK ..."), then
    ``SUFFIX_NOISE_RE`` matches are removed and whitespace is tidied.

    Returns:
        A single string (possibly empty).
    """
    name = norm(naziv)
    passes = 0
    while passes < 3:
        stripped = PREFIX_RE.sub('', name)
        if stripped == name:
            # Nothing left to peel off.
            break
        name = stripped
        passes += 1
    name = SUFFIX_NOISE_RE.sub('', name).strip()
    return re.sub(r'\s+', ' ', name).strip()
def slugify(s):
    """Return a dash-separated slug built from ``core_name(s)``.

    Every run of characters outside ``a-z0-9`` becomes a single dash, and
    leading/trailing dashes are removed.
    """
    core = core_name(s)
    dashed = re.sub(r'[^a-z0-9]+', '-', core)
    return dashed.strip('-')
# ── Catalog harvest ──
def get_pgz_competitions(season, oid=51):
    """Fetch the competition list of one organization for a season.

    Args:
        season: Season label such as ``"2024/2025"``.
        oid: Organization id sent to the handler. Defaults to 51
            (ŽNS Primorsko-goranski), which keeps existing callers unchanged;
            other ids (e.g. 178180, NS Rijeka) can be requested explicitly.

    Returns:
        A list of competition dicts as returned by the handler. An empty list
        is returned when the request or JSON decoding fails, or when the
        payload is not a list; non-dict items are dropped, because callers
        iterate the result and call ``.get`` on each item.
    """
    # Millisecond timestamp query parameter (presumably a cache-buster).
    t = int(time.time() * 1000)
    link_constructor = quote(BASE + '/natjecanja/{cid}/{cname}/')
    url = (f"{BASE}/handlers/getCompetitions/"
           f"?season={quote(season)}&oid={oid}&teamch=Club"
           f"&linkType=competitions&linkConstructor={link_constructor}"
           f"&lang=hr&t={t}")
    try:
        data = json.loads(http_get(url, accept_json=True))
    except Exception as e:
        # Best effort: one failed season must not abort the whole harvest.
        log(f" comps fetch fail {season}: {e}")
        return []
    if not isinstance(data, list):
        log(f" comps unexpected payload {season}: {type(data).__name__}")
        return []
    return [item for item in data if isinstance(item, dict)]
def get_organizations(season):
    """Return the decoded ``getOrganizations`` handler response for a season.

    Args:
        season: Season label such as ``"2024/2025"``.

    Returns:
        Whatever JSON value the handler returns, or ``[]`` when the request
        or decoding fails (the failure is logged).
    """
    stamp_ms = int(time.time() * 1000)
    endpoint = f"{BASE}/handlers/getOrganizations/"
    query = f"?season={quote(season)}&teamch=Club&lang=hr&t={stamp_ms}"
    try:
        return json.loads(http_get(endpoint + query, accept_json=True))
    except Exception as err:
        log(f" orgs fetch fail {season}: {err}")
        return []
# Matches <a href="/klubovi/{id}/{slug}/">NAME<...> with a relative or absolute
# (semafor.hns.family) href. Groups: 1 = numeric club id, 2 = slug (may be
# empty), 3 = the text that follows the opening tag up to the next "<".
CLUB_LINK_RE2 = re.compile(
    r'<a[^>]+href="(?:https?://semafor\.hns\.family)?/klubovi/(\d+)/([a-z0-9-]*)/?"[^>]*>([^<]{1,150})(?:<|</a>)',
    re.IGNORECASE
)


def parse_club_links(page):
    """Extract club references from an HTML page.

    Args:
        page: HTML text.

    Returns:
        A list of ``(hns_id, slug, naziv)`` string tuples in document order.
        ``naziv`` has HTML entities decoded (``&amp;``, ``&quot;``, ...) and
        surrounding whitespace removed. Links whose text is one character or
        shorter, or starts with "klubov" (navigation), are skipped.
    """
    from html import unescape  # local import: keeps this block self-contained

    found = []
    for m in CLUB_LINK_RE2.finditer(page):
        hns_id, slug = m.group(1), m.group(2)
        # The regex captures raw markup text, so entities must be decoded
        # before the name is compared against database names.
        naziv = unescape(m.group(3)).strip()
        if len(naziv) > 1 and not naziv.lower().startswith('klubov'):
            found.append((hns_id, slug, naziv))
    return found


def harvest_competition(cid):
    """GET a competition page and return every club reference found on it.

    Args:
        cid: Competition id; the page is requested at ``/natjecanja/{cid}/x/``
            with a placeholder in place of the competition name.

    Returns:
        A list of ``(hns_id, slug, naziv)`` tuples, or ``[]`` when the page
        could not be fetched (the failure is logged).
    """
    # The dynamic linkConstructor returned literal {cid}/{cname} — try direct id
    url = f"{BASE}/natjecanja/{cid}/x/"
    try:
        page = http_get(url)
    except Exception as e:
        log(f" nat fetch fail {cid}: {e}")
        return []
    return parse_club_links(page)
# ── Match logic ──
def match_score(candidate_naziv, candidate_grad, hns_naziv):
    """Score 0-100 how well a database club matches an HNS club entry.

    Args:
        candidate_naziv: Club name from the database.
        candidate_grad: Club town from the database (may be empty/None).
        hns_naziv: Club name harvested from the HNS site.

    Returns:
        0 when either core name is empty, 100 on identical core names,
        otherwise the ``difflib`` similarity ratio of the core names scaled to
        0-100, plus 5 (capped at 100) when the town is hinted at in the HNS
        name, and raised to at least 85 when one core name contains the other.
    """
    cand_core = core_name(candidate_naziv)
    hns_core = core_name(hns_naziv)
    if not cand_core or not hns_core:
        return 0
    if cand_core == hns_core:
        return 100
    ratio = difflib.SequenceMatcher(None, cand_core, hns_core).ratio()
    score = int(ratio * 100)
    # Town bonus, e.g. "NK Borac (Ba)" + grad="Bakar". The hint is either the
    # first three letters of the town appearing anywhere in the HNS name, or a
    # trailing parenthesised abbreviation that is a prefix of the town.
    # Accepting only a one-letter abbreviation missed the "(Ba)" form.
    gnorm = norm(candidate_grad)
    if gnorm:
        hns_norm = norm(hns_naziv)
        tail = re.search(r'\(([^()]+)\)\s*$', hns_norm)
        abbr = tail.group(1).strip() if tail else ""
        if gnorm[:3] in hns_norm or (abbr and gnorm.startswith(abbr)):
            score = min(100, score + 5)
    # Substring containment bonus (one core fully contained in the other).
    if cand_core in hns_core or hns_core in cand_core:
        score = max(score, 85)
    return score
# ── Main ──
def _merge_clubs(catalog, clubs, source):
    """Fold harvested (hns_id, slug, naziv) tuples into ``catalog`` in place, tagging ``source``."""
    for hns_id, slug, naziv in clubs:
        entry = catalog.setdefault(
            hns_id, {'slug': slug, 'naziv': naziv, 'sources': set()})
        # Backfill a slug if the first sighting of this club had none.
        if slug and not entry['slug']:
            entry['slug'] = slug
        entry['sources'].add(source)


def _run_harvest(cur):
    """Run candidate selection, catalog harvest, matching, DB updates and reporting on ``cur``."""
    # 1) Get candidate clubs
    cur.execute("""
        SELECT id, naziv, grad
        FROM pgz_sport.klubovi
        WHERE sport='nogomet' AND pgz_sufinanciran=true
        AND hns_klub_id IS NULL
        AND naziv !~* 'Malonogometni|Mini Nogomet|Mali Nogomet|Američkog|Plaž|Pijesku|HMNK|MNK|preteča|povijesn|1903|1906|1908|1917|1919|1926|1929'
        ORDER BY naziv
    """)
    candidates = cur.fetchall()
    log(f"Candidates: {len(candidates)}")

    # 2) Build HNS catalog from PGŽ competitions across recent seasons
    SEASONS = ["2025/2026", "2024/2025", "2023/2024", "2022/2023", "2021/2022",
               "2020/2021", "2019/2020", "2018/2019", "2017/2018"]
    catalog = {}  # hns_id -> {slug, naziv, sources:set}
    seen_cids = set()  # competition ids already harvested (shared by both sweeps)
    for season in SEASONS:
        log(f"-- season {season}")
        comps = get_pgz_competitions(season)
        time.sleep(SLEEP)
        log(f" PGŽ comps: {len(comps)}")
        for c in comps:
            cid = str(c.get('id', ''))
            if not cid or cid in seen_cids:
                continue
            seen_cids.add(cid)
            # "or ''" also covers an explicit null name, which cannot be sliced.
            cname = c.get('value') or ''
            try:
                clubs = harvest_competition(cid)
            except Exception as e:
                log(f" {cid} ({cname}) fetch error: {e}")
                clubs = []
            _merge_clubs(catalog, clubs, f"{season}:{cname[:30]}")
            log(f" {cid} '{cname[:40]}' -> {len(clubs)} clubs (catalog={len(catalog)})")
            time.sleep(SLEEP)

    # also sweep top-tier comps to catch HNK Rijeka-tier (though those usually mapped)
    # Also: 3.HNL Zapad / 4.NL NS Rijeka by oid=178180 (NS Rijeka)
    log("-- NS Rijeka oid=178180 sweep")
    for season in SEASONS:
        t = int(time.time() * 1000)
        url = (f"{BASE}/handlers/getCompetitions/"
               f"?season={quote(season)}&oid=178180&teamch=Club"
               f"&linkType=competitions&linkConstructor={quote(BASE+'/natjecanja/{cid}/{cname}/')}"
               f"&lang=hr&t={t}")
        try:
            comps = json.loads(http_get(url, accept_json=True))
        except Exception as e:
            log(f" ns_rijeka {season} fail: {e}")
            comps = []
        time.sleep(SLEEP)
        for c in comps:
            cid = str(c.get('id', ''))
            if not cid or cid in seen_cids:
                continue
            seen_cids.add(cid)
            cname = c.get('value') or ''
            try:
                clubs = harvest_competition(cid)
            except Exception as e:
                # Log instead of swallowing, matching the PGŽ sweep above.
                log(f" NSR {cid} ({cname}) fetch error: {e}")
                clubs = []
            _merge_clubs(catalog, clubs, f"NSR:{season}:{cname[:30]}")
            log(f" NSR {cid} '{cname[:40]}' -> {len(clubs)} (cat={len(catalog)})")
            time.sleep(SLEEP)
    log(f"=== Catalog built: {len(catalog)} unique HNS clubs ===")

    # An empty catalog means every fetch failed (outage, markup change).
    # Continuing would flag every candidate as hns_not_found in the database
    # and overwrite the previous snapshot, so stop before any write happens.
    if not catalog:
        raise RuntimeError(
            "HNS catalog is empty; refusing to mark candidates as hns_not_found")

    # Save catalog snapshot (UTF-8: names are written unescaped)
    snap = {hid: {'slug': v['slug'], 'naziv': v['naziv'], 'sources': sorted(v['sources'])[:5]}
            for hid, v in catalog.items()}
    with open("/opt/pgz-sport/cc_tasks/sub1_hns_catalog.json", "w", encoding="utf-8") as f:
        json.dump(snap, f, ensure_ascii=False, indent=2)
    log("Catalog snapshot -> /opt/pgz-sport/cc_tasks/sub1_hns_catalog.json")

    # 3) Match candidates
    matched = []  # (db_id, db_naziv, hns_id, slug, hns_naziv, score)
    not_found = []
    ambiguous = []
    for cand in candidates:
        db_id, naziv, grad = cand['id'], cand['naziv'], cand['grad']
        ranked = []
        for hid, v in catalog.items():
            sc = match_score(naziv, grad, v['naziv'])
            if sc >= 70:
                ranked.append((sc, hid, v['slug'], v['naziv']))
        ranked.sort(reverse=True)
        if not ranked:
            not_found.append((db_id, naziv, grad))
            log(f" NOT FOUND: [{db_id}] {naziv} ({grad})")
            continue
        top = ranked[0]
        # Runner-up within 3 points of a sub-95 winner: too close to trust.
        if len(ranked) > 1 and ranked[1][0] >= top[0] - 3 and top[0] < 95:
            ambiguous.append((db_id, naziv, grad, ranked[:3]))
            log(f" AMBIGUOUS: [{db_id}] {naziv} -> top: {top[3]} ({top[0]}), 2nd: {ranked[1][3]} ({ranked[1][0]})")
            # Skip ambiguous, mark not_found for safety
            not_found.append((db_id, naziv, grad))
            continue
        matched.append((db_id, naziv, top[1], top[2], top[3], top[0]))
        log(f" MATCH [{db_id}] {naziv} -> HNS {top[1]} '{top[3]}' (slug={top[2]}, score={top[0]})")
    log(f"=== Match results: {len(matched)} matched, {len(not_found)} not_found, {len(ambiguous)} ambiguous ===")

    # 4) Apply UPDATEs (connection is in autocommit mode, so each row stands alone)
    upd_ok, upd_fail = 0, 0
    for db_id, naziv, hns_id, slug, hns_naziv, sc in matched:
        try:
            source_url = f"{BASE}/klubovi/{hns_id}/{slug}/" if slug else f"{BASE}/klubovi/{hns_id}/"
            cur.execute("""
                UPDATE pgz_sport.klubovi
                SET hns_klub_id = %s,
                hns_slug = %s,
                source_url = COALESCE(source_url, %s),
                scrape_source = 'hns_semafor',
                last_scraped_at = now()
                WHERE id = %s
            """, (int(hns_id), slug or None, source_url, db_id))
            upd_ok += 1
        except Exception as e:
            upd_fail += 1
            log(f" UPDATE fail [{db_id}] {naziv}: {e}")

    # Mark not_found
    nf_ok = 0
    for db_id, naziv, grad in not_found:
        try:
            cur.execute("""
                UPDATE pgz_sport.klubovi
                SET scrape_source = 'hns_not_found',
                last_scraped_at = now()
                WHERE id = %s AND hns_klub_id IS NULL
            """, (db_id,))
            nf_ok += 1
        except Exception as e:
            log(f" not_found mark fail [{db_id}]: {e}")

    # 5) Write result md (UTF-8: contains club names)
    res_path = "/opt/pgz-sport/cc_tasks/SUB1_RESULT.md"
    with open(res_path, "w", encoding="utf-8") as f:
        f.write("# SUB1 — HNS Link Harvest Result\n\n")
        f.write(f"Date: {datetime.now().isoformat(timespec='seconds')}\n\n")
        f.write(f"- Candidates processed: **{len(candidates)}**\n")
        f.write(f"- HNS catalog built: **{len(catalog)}** unique clubs from {len(seen_cids)} competitions\n")
        f.write(f"- Matched: **{len(matched)}** (DB updated: {upd_ok}, fail: {upd_fail})\n")
        f.write(f"- Ambiguous (skipped to safety): **{len(ambiguous)}**\n")
        f.write(f"- Not found (marked hns_not_found): **{len(not_found)}** (mark ok: {nf_ok})\n\n")
        f.write("## Matched\n\n| db_id | DB naziv | HNS id | HNS naziv | slug | score |\n|---|---|---|---|---|---|\n")
        for db_id, naziv, hns_id, slug, hns_naziv, sc in sorted(matched, key=lambda x: -x[5]):
            f.write(f"| {db_id} | {naziv} | {hns_id} | {hns_naziv} | {slug} | {sc} |\n")
        f.write("\n## Ambiguous (manual review)\n\n")
        for db_id, naziv, grad, ranked in ambiguous:
            f.write(f"- **[{db_id}] {naziv}** ({grad})\n")
            for sc, hid, slug, hns_naziv in ranked:
                f.write(f" - {sc}: HNS {hid} '{hns_naziv}' (slug={slug})\n")
        f.write("\n## Not Found\n\n")
        for db_id, naziv, grad in not_found:
            f.write(f"- [{db_id}] {naziv} ({grad})\n")
        f.write(f"\n## Log\n\n`{LOG_PATH}`\n")
    log(f"Result -> {res_path}")

    # 6) Telegram notify
    msg = (f"SUB1 HNS done: matched {len(matched)}, not_found {len(not_found)}, "
           f"ambiguous {len(ambiguous)}. Catalog={len(catalog)}. "
           f"DB upd ok={upd_ok}/fail={upd_fail}. See SUB1_RESULT.md")
    log(msg, telegram=True)


def main():
    """Harvest HNS club links and write matches back to ``pgz_sport.klubovi``.

    Steps: select candidate clubs without ``hns_klub_id``; build a catalog of
    HNS clubs from competition pages of two organizations across the listed
    seasons; fuzzy-match candidates against the catalog; UPDATE matched rows
    and flag the rest as ``hns_not_found``; write a JSON catalog snapshot and
    a Markdown report; send a Telegram summary.

    Raises:
        RuntimeError: when no club could be harvested at all, so that a site
            outage does not flag every candidate as not found.
        Other exceptions from connecting, the candidate query or file writes
        propagate to the caller; per-row UPDATE failures are logged and
        counted instead.
    """
    log(f"=== SUB1 HNS link harvester start; log={LOG_PATH} ===")
    conn = psycopg2.connect(DSN)
    conn.autocommit = True
    try:
        with conn.cursor(cursor_factory=RealDictCursor) as cur:
            _run_harvest(cur)
    finally:
        # Always release the connection, including on a fatal error.
        conn.close()
# Script entry point: run the harvester; on any exception, record the message
# and full traceback (also sent to Telegram) and exit with status 1.
if __name__ == "__main__":
    try:
        main()
    except Exception as exc:
        trace = traceback.format_exc()
        log(f"FATAL: {exc}\n{trace}", telegram=True)
        sys.exit(1)