359 lines
15 KiB
Python
359 lines
15 KiB
Python
#!/usr/bin/env python3
|
|
# ═══════════════════════════════════════════════════════════════════
|
|
# Fajl: sub1_hns_link_harvester.py | v1.0.0 | 05.05.2026
|
|
# Lokacija: /opt/pgz-sport/scripts/sub1_hns_link_harvester.py
|
|
# Autor: dradulic@outlook.com / damir@rinet.one
|
|
# Svrha: SUB1 — Pronađi semafor.hns.family link za PGŽ priority
|
|
# nogometne klubove koji nemaju hns_klub_id.
|
|
# Strategija:
|
|
# 1. Enumerate ŽNS Primorsko-goranski (oid=51) competitions across
|
|
# seasons, plus 4. NL NS Rijeka, 3. HNL Zapad arhive
|
|
# 2. Za svaki natjecanje GET /natjecanja/{cid}/{cname}/ i izvuci
|
|
# sve <a href="/klubovi/{id}/{slug}/">{naziv}</a>
|
|
# 3. Build catalog (hns_id, slug, naziv) — skup unique
|
|
# 4. Fuzzy match candidate klubovi: normalize, drop NK/HNK/GNK
|
|
# prefiks, ukloni dijakritike, pa equality + substring + ratio
|
|
# 5. UPDATE pgz_sport.klubovi za matche; mark not_found za ostalo
|
|
# ═══════════════════════════════════════════════════════════════════
|
|
"""SUB1 — HNS link harvester for PGŽ priority football clubs."""
|
|
import os, re, sys, time, json, traceback, subprocess, difflib
|
|
from datetime import datetime
|
|
from urllib.parse import quote
|
|
import urllib.request, urllib.error
|
|
import psycopg2
|
|
from psycopg2.extras import RealDictCursor
|
|
|
|
# Database DSN. RINET_DSN wins when it is set; DB_PASSWORD is only needed for
# the built-in fallback. The fallback is built lazily because a getenv()
# default argument is evaluated eagerly and would raise KeyError for a missing
# DB_PASSWORD even when RINET_DSN is provided.
DSN = os.getenv("RINET_DSN")
if DSN is None:
    DSN = f"host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password={os.environ['DB_PASSWORD']}"

# Telegram notification settings (overridable through the environment).
# NOTE(review): the fallback below is a bot token committed to source control;
# it should be rotated and supplied through TG_BOT_TOKEN only.
TG = os.getenv("TG_BOT_TOKEN", "8535797835:AAFItT-92jzZ9NWFafLxn0dLa1_n2s-JE5Y")
TG_CHAT = os.getenv("TG_CHAT", "7969491558")

# User-Agent sent with every HTTP request made by http_get().
UA = "PGZ-Sport-Bot/1.0 (+https://api.rinet.one/sport/; legitimni interes; analitika sporta PGZ; contact dradulic@outlook.com)"

# Pause (seconds) between consecutive requests to the remote site.
SLEEP = 1.1
BASE = "https://semafor.hns.family"

# One log file per run start minute; opened in append mode at import time.
LOG_PATH = f"/var/log/pgz-sport-debug/sub1_{datetime.now().strftime('%Y%m%d_%H%M')}.log"
# Explicit UTF-8: log lines contain non-ASCII text (club names, "PGŽ"), which
# would fail to encode under a non-UTF-8 locale default.
LOG = open(LOG_PATH, "a", encoding="utf-8")
|
|
|
|
def log(msg, telegram=False):
    """Write a timestamped message to stdout and the run log file.

    Args:
        msg: Text to record.
        telegram: When true, additionally POST the first 3500 characters of
            ``msg`` to the Telegram Bot API by shelling out to ``curl``.

    Any exception raised while sending to Telegram is caught and recorded
    through a plain (non-Telegram) ``log`` call.
    """
    stamp = datetime.now().isoformat(timespec='seconds')
    line = f"[{stamp}] {msg}"
    print(line, flush=True)
    LOG.write(line + "\n")
    LOG.flush()

    if not telegram:
        return

    try:
        curl_cmd = [
            "curl", "-s", "-X", "POST",
            f"https://api.telegram.org/bot{TG}/sendMessage",
            "-d", f"chat_id={TG_CHAT}",
            "--data-urlencode", f"text={msg[:3500]}",
        ]
        subprocess.run(curl_cmd, timeout=8, capture_output=True)
    except Exception as exc:
        log(f"TG error: {exc}")
|
|
|
|
def http_get(url, accept_json=False, timeout=25):
    """Fetch ``url`` with the bot User-Agent and return the body as text.

    Args:
        url: Absolute URL to request.
        accept_json: Selects the JSON/XHR header set instead of the HTML one.
        timeout: Timeout in seconds passed to ``urlopen``.

    Returns:
        The response body decoded as UTF-8; undecodable bytes are replaced.

    Errors raised by ``urlopen`` propagate to the caller.
    """
    if accept_json:
        accept, requested_with = "application/json, */*", "XMLHttpRequest"
    else:
        accept, requested_with = "text/html,*/*", ""

    headers = {
        "User-Agent": UA,
        "Accept": accept,
        "X-Requested-With": requested_with,
    }
    request = urllib.request.Request(url, headers=headers)
    with urllib.request.urlopen(request, timeout=timeout) as response:
        raw = response.read()
        return raw.decode("utf-8", errors="replace")
|
|
|
|
# ── Normalization for fuzzy match ──
|
|
# Translation table folding Croatian letters (and a few acute-accented vowels)
# to their plain lowercase ASCII counterparts.
DIACRITIC_MAP = str.maketrans({
    'č':'c','ć':'c','ž':'z','š':'s','đ':'d',
    'Č':'c','Ć':'c','Ž':'z','Š':'s','Đ':'d',
    'á':'a','é':'e','í':'i','ó':'o','ú':'u',
})

# Leading organisational prefix ("NK", "HNK", "Nogometni klub", ...) followed
# by whitespace. core_name() applies this pattern to text that norm() has
# already stripped of diacritics, so every alternative containing š/ž accepts
# both the accented and the folded spelling; otherwise "nogometna skola" and
# "zenski nogometni klub" could never match.
PREFIX_RE = re.compile(
    r'^(hrvatski\s+nogometni\s+klub|hrvatski\s+nogometni\s+klub\.?|'
    r'nogometni\s+klub|nogometna\s+akademija|nogometna\s+[sš]kola|'
    r'sportska\s+akademija|[zž]enski\s+nogometni\s+klub|'
    r'hnk|nk|gnk|znk|žnk|nk\.?|hnk\.?)\s+',
    re.IGNORECASE
)

# Known noise inside or at the end of a name: veteran-team markers, a handful
# of specific place repetitions, and a trailing four-digit year. As above, the
# "snježnik" alternative also accepts the diacritic-free spelling.
SUFFIX_NOISE_RE = re.compile(
    r'\b(veterani|veterana|gornji\s+zamet|grada\s+crikvenice|'
    r'gomirje\s+gomirje|mrkopalj\s+mrkopalj|snje[zž]nik\s+gerovo|'
    r'-?\s*\d{4}\s*$)', re.IGNORECASE)
|
|
|
|
def norm(s):
    """Return ``s`` lower-cased, trimmed, de-accented and whitespace-collapsed.

    Straight quotes, apostrophes and backticks are removed. A falsy input
    (``None`` or an empty string) yields ``""``.
    """
    if not s:
        return ""
    folded = s.lower().strip().translate(DIACRITIC_MAP)
    without_quotes = re.sub(r'["\'`]', '', folded)
    return re.sub(r'\s+', ' ', without_quotes)
|
|
|
|
def core_name(naziv):
    """Return the normalised club name without prefixes and known noise.

    The name is passed through ``norm``, up to three leading organisational
    prefixes are removed, ``SUFFIX_NOISE_RE`` matches are deleted and
    whitespace is collapsed. The result is a single string.
    """
    name = norm(naziv)

    # Prefixes may be stacked (e.g. "Nogometni Klub HNK ..."), so peel off at
    # most three layers and stop as soon as nothing more is removed.
    remaining = 3
    while remaining:
        peeled = PREFIX_RE.sub('', name)
        if peeled == name:
            break
        name = peeled
        remaining -= 1

    name = SUFFIX_NOISE_RE.sub('', name).strip()
    return re.sub(r'\s+', ' ', name).strip()
|
|
|
|
def slugify(s):
    """Build a hyphen-separated slug from the core name of ``s``.

    Every run of characters outside ``a-z0-9`` becomes one hyphen; leading
    and trailing hyphens are dropped.
    """
    return re.sub(r'[^a-z0-9]+', '-', core_name(s)).strip('-')
|
|
|
|
# ── Catalog harvest ──
|
|
def get_pgz_competitions(season, oid=51):
    """Fetch the competition list of one organisation for a season.

    Args:
        season: Season label such as ``"2024/2025"``.
        oid: Organisation id sent to the handler. Defaults to 51
            (ŽNS Primorsko-goranski), which is what existing callers get.

    Returns:
        The decoded JSON list returned by ``/handlers/getCompetitions/``.
        An empty list is returned, after logging, when the request or the
        JSON decoding fails, or when the payload is not a list; callers
        iterate the result and call ``.get`` on each element.
    """
    t = int(time.time()*1000)  # millisecond timestamp sent as the "t" query parameter
    link_constructor = quote(BASE + '/natjecanja/{cid}/{cname}/')
    url = (f"{BASE}/handlers/getCompetitions/"
           f"?season={quote(season)}&oid={oid}&teamch=Club"
           f"&linkType=competitions&linkConstructor={link_constructor}"
           f"&lang=hr&t={t}")
    try:
        payload = json.loads(http_get(url, accept_json=True))
    except Exception as e:
        log(f" comps fetch fail {season}: {e}")
        return []
    if not isinstance(payload, list):
        # e.g. an error object instead of the expected array
        log(f" comps fetch fail {season}: unexpected payload type {type(payload).__name__}")
        return []
    return payload
|
|
|
|
def get_organizations(season):
    """Fetch the organisation list for ``season`` from the getOrganizations handler.

    Returns the decoded JSON payload, or an empty list (after logging) when
    the request or the JSON decoding raises.
    """
    stamp_ms = int(time.time()*1000)
    query = f"?season={quote(season)}&teamch=Club&lang=hr&t={stamp_ms}"
    url = f"{BASE}/handlers/getOrganizations/" + query
    try:
        return json.loads(http_get(url, accept_json=True))
    except Exception as exc:
        log(f" orgs fetch fail {season}: {exc}")
        return []
|
|
|
|
# Matches <a href="/klubovi/{id}/{slug}/">NAME<div...>...</a>; NAME is the text
# before the first child element. Groups: 1 = numeric club id, 2 = slug (may
# be empty), 3 = raw link text.
CLUB_LINK_RE2 = re.compile(
    r'<a[^>]+href="(?:https?://semafor\.hns\.family)?/klubovi/(\d+)/([a-z0-9-]*)/?"[^>]*>([^<]{1,150})(?:<|</a>)',
    re.IGNORECASE
)


def parse_club_links(page):
    """Extract ``(hns_id, slug, naziv)`` tuples from competition page markup.

    Link text is HTML-unescaped before use, so a name written in the markup
    as ``NK &quot;Naprijed&quot;`` is returned as ``NK "Naprijed"``. Without
    this, entity text would leak into the fuzzy name comparison.

    Args:
        page: HTML source of a competition page.

    Returns:
        A list of tuples in document order. Links whose text is at most one
        character long or starts with "klubov" (navigation links) are skipped.
    """
    # Imported locally so this block does not depend on the file-level imports.
    from html import unescape

    found = []
    for m in CLUB_LINK_RE2.finditer(page):
        naziv = unescape(m.group(3)).strip()
        # filter: real club name (not "Klubovi" navigation etc.)
        if len(naziv) > 1 and not naziv.lower().startswith('klubov'):
            found.append((m.group(1), m.group(2), naziv))
    return found


def harvest_competition(cid):
    """GET the competition page for ``cid`` and return all club references.

    The page is requested with a placeholder name segment ("x") in the URL.

    Returns:
        A list of ``(hns_id, slug, naziv)`` tuples, or an empty list (after
        logging) when the page cannot be fetched.
    """
    url = f"{BASE}/natjecanja/{cid}/x/"
    try:
        page = http_get(url)
    except Exception as e:
        log(f" nat fetch fail {cid}: {e}")
        return []
    return parse_club_links(page)
|
|
|
|
# ── Match logic ──
|
|
def match_score(candidate_naziv, candidate_grad, hns_naziv):
    """Score 0-100 how well a DB candidate matches an HNS club entry.

    Args:
        candidate_naziv: Club name from the database.
        candidate_grad: Town of the candidate club; may be empty or ``None``.
        hns_naziv: Club name as harvested from the HNS site.

    Returns:
        0 when either core name is empty, 100 when the core names are equal,
        otherwise the ``difflib`` similarity ratio scaled to 0-100, plus 5
        (capped at 100) when the town is reflected in the HNS name, and raised
        to at least 85 when one core name contains the other.
    """
    cand_core = core_name(candidate_naziv)
    hns_core = core_name(hns_naziv)
    if not cand_core or not hns_core:
        return 0
    if cand_core == hns_core:
        return 100

    score = int(difflib.SequenceMatcher(None, cand_core, hns_core).ratio() * 100)

    # Town bonus, e.g. "NK Borac (Ba)" + grad="Bakar". The HNS name counts as
    # mentioning the town when it contains the town's first three letters, or
    # ends with a parenthesised abbreviation that is a prefix of the town.
    # Only a one-letter "(b)" suffix was recognised before, so the "(Ba)"
    # example itself earned no bonus.
    if candidate_grad:
        gnorm = norm(candidate_grad)
        if gnorm:
            hnorm = norm(hns_naziv)
            abbr = re.search(r'\(([^()\s]+)\)$', hnorm)
            if gnorm[:3] in hnorm or (abbr and gnorm.startswith(abbr.group(1))):
                score = min(100, score + 5)

    # substring containment bonus (one fully contained in the other)
    if cand_core in hns_core or hns_core in cand_core:
        score = max(score, 85)
    return score
|
|
|
|
# ── Main ──
|
|
def _new_competitions(comps, seen_cids):
    """Yield ``(cid, cname)`` for competitions not swept yet, recording each id in ``seen_cids``."""
    for c in comps:
        if not isinstance(c, dict):
            continue  # unexpected element shape; nothing to read an id from
        cid = str(c.get('id',''))
        if not cid or cid in seen_cids:
            continue
        seen_cids.add(cid)
        yield cid, (c.get('value') or '')


def _harvest_clubs(cid, cname):
    """Harvest one competition page; log and return ``[]`` if harvesting raises."""
    try:
        return harvest_competition(cid)
    except Exception as e:
        log(f" {cid} ({cname}) fetch error: {e}")
        return []


def _merge_clubs(catalog, clubs, source):
    """Add harvested ``(hns_id, slug, naziv)`` tuples to ``catalog`` and tag them with ``source``."""
    for hns_id, slug, naziv in clubs:
        entry = catalog.get(hns_id)
        if entry is None:
            entry = catalog[hns_id] = {'slug': slug, 'naziv': naziv, 'sources': set()}
        elif slug and not entry['slug']:
            # An earlier page linked the club without a slug; keep the first one seen.
            entry['slug'] = slug
        entry['sources'].add(source)


def _fetch_ns_rijeka_competitions(season):
    """Fetch the competition list of organisation 178180 (NS Rijeka) for ``season``."""
    t = int(time.time()*1000)
    link_constructor = quote(BASE + '/natjecanja/{cid}/{cname}/')
    url = (f"{BASE}/handlers/getCompetitions/"
           f"?season={quote(season)}&oid=178180&teamch=Club"
           f"&linkType=competitions&linkConstructor={link_constructor}"
           f"&lang=hr&t={t}")
    try:
        comps = json.loads(http_get(url, accept_json=True))
    except Exception as e:
        log(f" ns_rijeka {season} fail: {e}")
        return []
    if not isinstance(comps, list):
        log(f" ns_rijeka {season} fail: unexpected payload type {type(comps).__name__}")
        return []
    return comps


def _build_catalog(seasons):
    """Sweep both organisations over ``seasons``; return ``(catalog, seen_cids)``.

    ``catalog`` maps hns_id -> {'slug', 'naziv', 'sources': set}.
    """
    catalog = {}
    seen_cids = set()

    for season in seasons:
        log(f"-- season {season}")
        comps = get_pgz_competitions(season)
        time.sleep(SLEEP)
        if not isinstance(comps, list):
            comps = []  # defensive: the loop below needs a list of dicts
        log(f" PGŽ comps: {len(comps)}")
        for cid, cname in _new_competitions(comps, seen_cids):
            clubs = _harvest_clubs(cid, cname)
            _merge_clubs(catalog, clubs, f"{season}:{cname[:30]}")
            log(f" {cid} '{cname[:40]}' -> {len(clubs)} clubs (catalog={len(catalog)})")
            time.sleep(SLEEP)

    # Also: 3.HNL Zapad / 4.NL NS Rijeka by oid=178180 (NS Rijeka)
    log("-- NS Rijeka oid=178180 sweep")
    for season in seasons:
        comps = _fetch_ns_rijeka_competitions(season)
        time.sleep(SLEEP)
        for cid, cname in _new_competitions(comps, seen_cids):
            clubs = _harvest_clubs(cid, cname)
            _merge_clubs(catalog, clubs, f"NSR:{season}:{cname[:30]}")
            log(f" NSR {cid} '{cname[:40]}' -> {len(clubs)} (cat={len(catalog)})")
            time.sleep(SLEEP)

    return catalog, seen_cids


def _match_candidates(candidates, catalog):
    """Classify each candidate row; return ``(matched, not_found, ambiguous)``.

    ``matched`` holds (db_id, db_naziv, hns_id, slug, hns_naziv, score).
    Ambiguous candidates are listed in ``ambiguous`` and also in ``not_found``.
    """
    matched, not_found, ambiguous = [], [], []
    for cand in candidates:
        db_id, naziv, grad = cand['id'], cand['naziv'], cand['grad']
        ranked = []
        for hid, v in catalog.items():
            sc = match_score(naziv, grad, v['naziv'])
            if sc >= 70:
                ranked.append((sc, hid, v['slug'], v['naziv']))
        ranked.sort(reverse=True)

        if not ranked:
            not_found.append((db_id, naziv, grad))
            log(f" NOT FOUND: [{db_id}] {naziv} ({grad})")
            continue

        top = ranked[0]
        # A runner-up within 3 points of a sub-95 winner is too close to call.
        if len(ranked) > 1 and ranked[1][0] >= top[0] - 3 and top[0] < 95:
            ambiguous.append((db_id, naziv, grad, ranked[:3]))
            log(f" AMBIGUOUS: [{db_id}] {naziv} -> top: {top[3]} ({top[0]}), 2nd: {ranked[1][3]} ({ranked[1][0]})")
            # Skip ambiguous, mark not_found for safety
            not_found.append((db_id, naziv, grad))
            continue

        matched.append((db_id, naziv, top[1], top[2], top[3], top[0]))
        log(f" MATCH [{db_id}] {naziv} -> HNS {top[1]} '{top[3]}' (slug={top[2]}, score={top[0]})")
    return matched, not_found, ambiguous


def _apply_updates(cur, matched, not_found):
    """Write matches and not-found markers to the DB; return ``(upd_ok, upd_fail, nf_ok)``."""
    upd_ok, upd_fail = 0, 0
    for db_id, naziv, hns_id, slug, hns_naziv, sc in matched:
        try:
            source_url = f"{BASE}/klubovi/{hns_id}/{slug}/" if slug else f"{BASE}/klubovi/{hns_id}/"
            cur.execute("""
                UPDATE pgz_sport.klubovi
                SET hns_klub_id = %s,
                    hns_slug = %s,
                    source_url = COALESCE(source_url, %s),
                    scrape_source = 'hns_semafor',
                    last_scraped_at = now()
                WHERE id = %s
            """, (int(hns_id), slug or None, source_url, db_id))
            upd_ok += 1
        except Exception as e:
            upd_fail += 1
            log(f" UPDATE fail [{db_id}] {naziv}: {e}")

    nf_ok = 0
    for db_id, naziv, grad in not_found:
        try:
            cur.execute("""
                UPDATE pgz_sport.klubovi
                SET scrape_source = 'hns_not_found',
                    last_scraped_at = now()
                WHERE id = %s AND hns_klub_id IS NULL
            """, (db_id,))
            nf_ok += 1
        except Exception as e:
            log(f" not_found mark fail [{db_id}]: {e}")
    return upd_ok, upd_fail, nf_ok


def _write_result_md(res_path, candidates, catalog, seen_cids, matched,
                     ambiguous, not_found, upd_ok, upd_fail, nf_ok):
    """Write the Markdown run report to ``res_path`` (UTF-8)."""
    with open(res_path, "w", encoding="utf-8") as f:
        f.write("# SUB1 — HNS Link Harvest Result\n\n")
        f.write(f"Date: {datetime.now().isoformat(timespec='seconds')}\n\n")
        f.write(f"- Candidates processed: **{len(candidates)}**\n")
        f.write(f"- HNS catalog built: **{len(catalog)}** unique clubs from {len(seen_cids)} competitions\n")
        f.write(f"- Matched: **{len(matched)}** (DB updated: {upd_ok}, fail: {upd_fail})\n")
        f.write(f"- Ambiguous (skipped to safety): **{len(ambiguous)}**\n")
        f.write(f"- Not found (marked hns_not_found): **{len(not_found)}** (mark ok: {nf_ok})\n\n")
        f.write("## Matched\n\n| db_id | DB naziv | HNS id | HNS naziv | slug | score |\n|---|---|---|---|---|---|\n")
        for db_id, naziv, hns_id, slug, hns_naziv, sc in sorted(matched, key=lambda x: -x[5]):
            f.write(f"| {db_id} | {naziv} | {hns_id} | {hns_naziv} | {slug} | {sc} |\n")
        f.write("\n## Ambiguous (manual review)\n\n")
        for db_id, naziv, grad, ranked in ambiguous:
            f.write(f"- **[{db_id}] {naziv}** ({grad})\n")
            for sc, hid, slug, hns_naziv in ranked:
                f.write(f" - {sc}: HNS {hid} '{hns_naziv}' (slug={slug})\n")
        f.write("\n## Not Found\n\n")
        for db_id, naziv, grad in not_found:
            f.write(f"- [{db_id}] {naziv} ({grad})\n")
        f.write(f"\n## Log\n\n`{LOG_PATH}`\n")


def main():
    """Run the harvest end to end.

    Steps:
      1. Load candidate football clubs without ``hns_klub_id`` from the DB.
      2. Build the HNS club catalog from competition pages across seasons.
      3. Fuzzy-match candidates against the catalog.
      4. UPDATE matched rows; mark the rest ``hns_not_found``.
      5. Write the catalog snapshot and the Markdown report.
      6. Send a Telegram summary.

    Raises:
        RuntimeError: when there are candidates but the catalog is empty
            (e.g. the site was unreachable). Continuing would mark every
            candidate as ``hns_not_found``, so nothing is written to the DB.
    """
    log(f"=== SUB1 HNS link harvester start; log={LOG_PATH} ===")
    conn = psycopg2.connect(DSN)
    conn.autocommit = True
    try:
        cur = conn.cursor(cursor_factory=RealDictCursor)

        # 1) Get candidate clubs
        cur.execute("""
            SELECT id, naziv, grad
            FROM pgz_sport.klubovi
            WHERE sport='nogomet' AND pgz_sufinanciran=true
            AND hns_klub_id IS NULL
            AND naziv !~* 'Malonogometni|Mini Nogomet|Mali Nogomet|Američkog|Plaž|Pijesku|HMNK|MNK|preteča|povijesn|1903|1906|1908|1917|1919|1926|1929'
            ORDER BY naziv
        """)
        candidates = cur.fetchall()
        log(f"Candidates: {len(candidates)}")

        # 2) Build HNS catalog from PGŽ competitions across recent seasons
        SEASONS = ["2025/2026","2024/2025","2023/2024","2022/2023","2021/2022","2020/2021","2019/2020","2018/2019","2017/2018"]
        catalog, seen_cids = _build_catalog(SEASONS)
        log(f"=== Catalog built: {len(catalog)} unique HNS clubs ===")

        if candidates and not catalog:
            raise RuntimeError(
                "HNS catalog is empty — refusing to mark candidates as hns_not_found")

        # Save catalog snapshot (at most five sources per club)
        snap = {hid: {'slug': v['slug'], 'naziv': v['naziv'], 'sources': sorted(v['sources'])[:5]}
                for hid, v in catalog.items()}
        with open("/opt/pgz-sport/cc_tasks/sub1_hns_catalog.json", "w", encoding="utf-8") as f:
            json.dump(snap, f, ensure_ascii=False, indent=2)
        log("Catalog snapshot -> /opt/pgz-sport/cc_tasks/sub1_hns_catalog.json")

        # 3) Match candidates
        matched, not_found, ambiguous = _match_candidates(candidates, catalog)
        log(f"=== Match results: {len(matched)} matched, {len(not_found)} not_found, {len(ambiguous)} ambiguous ===")

        # 4) Apply UPDATEs and mark not_found
        upd_ok, upd_fail, nf_ok = _apply_updates(cur, matched, not_found)
    finally:
        conn.close()

    # 5) Write result md
    res_path = "/opt/pgz-sport/cc_tasks/SUB1_RESULT.md"
    _write_result_md(res_path, candidates, catalog, seen_cids, matched,
                     ambiguous, not_found, upd_ok, upd_fail, nf_ok)
    log(f"Result -> {res_path}")

    # 6) Telegram notify
    msg = (f"SUB1 HNS done: matched {len(matched)}, not_found {len(not_found)}, "
           f"ambiguous {len(ambiguous)}. Catalog={len(catalog)}. "
           f"DB upd ok={upd_ok}/fail={upd_fail}. See SUB1_RESULT.md")
    log(msg, telegram=True)
|
|
|
|
if __name__ == "__main__":
    # Script entry point: any uncaught exception from main() is reported with
    # its traceback to the log and Telegram, then the process exits with 1.
    try:
        main()
    except Exception as exc:
        failure = f"FATAL: {exc}\n{traceback.format_exc()}"
        log(failure, telegram=True)
        sys.exit(1)
|