# pgz-sport/scripts/sub1_hns_link_harvester.py_prije_env_deepseek

#!/usr/bin/env python3
# ═══════════════════════════════════════════════════════════════════
# Fajl: sub1_hns_link_harvester.py | v1.0.0 | 05.05.2026
# Lokacija: /opt/pgz-sport/scripts/sub1_hns_link_harvester.py
# Autor: dradulic@outlook.com / damir@rinet.one
# Svrha: SUB1 — Pronađi semafor.hns.family link za PGŽ priority
#        nogometne klubove koji nemaju hns_klub_id.
# Strategija:
#   1. Enumerate ŽNS Primorsko-goranski (oid=51) competitions across
#      seasons, plus 4. NL NS Rijeka, 3. HNL Zapad arhive
#   2. Za svako natjecanje GET /natjecanja/{cid}/{cname}/ i izvuci
#      sve <a href="/klubovi/{id}/{slug}/">{naziv}</a>
#   3. Build catalog (hns_id, slug, naziv) — skup unique
#   4. Fuzzy match candidate klubovi: normalize, drop NK/HNK/GNK
#      prefiks, ukloni dijakritike, pa equality + substring + ratio
#   5. UPDATE pgz_sport.klubovi za matche; mark not_found za ostalo
# ═══════════════════════════════════════════════════════════════════
"""SUB1 — HNS link harvester for PGŽ priority football clubs."""
import os, re, sys, time, json, traceback, subprocess, difflib
from datetime import datetime
from urllib.parse import quote
import urllib.request, urllib.error
import psycopg2
from psycopg2.extras import RealDictCursor

# Database connection string. An explicit RINET_DSN wins; DB_PASSWORD is only
# required (and only read) when the default DSN has to be assembled.
DSN = os.getenv("RINET_DSN")
if DSN is None:
    DSN = f"host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password={os.environ['DB_PASSWORD']}"
# NOTE(review): the fallback below is a hard-coded bot credential. It is kept so
# that deployments without TG_BOT_TOKEN keep sending notifications, but it should
# be rotated and supplied through the environment only.
TG = os.getenv("TG_BOT_TOKEN", "8535797835:AAFItT-92jzZ9NWFafLxn0dLa1_n2s-JE5Y")
TG_CHAT = os.getenv("TG_CHAT", "7969491558")
# User-Agent sent with every request to the scraped site.
UA = "PGZ-Sport-Bot/1.0 (+https://api.rinet.one/sport/; legitimni interes; analitika sporta PGZ; contact dradulic@outlook.com)"
# Pause (seconds) between consecutive requests.
SLEEP = 1.1
BASE = "https://semafor.hns.family"

# One log file per run (minute resolution), opened in append mode.
LOG_PATH = f"/var/log/pgz-sport-debug/sub1_{datetime.now().strftime('%Y%m%d_%H%M')}.log"
os.makedirs(os.path.dirname(LOG_PATH), exist_ok=True)
# Explicit UTF-8: log lines contain Croatian diacritics and must not depend on the locale.
LOG = open(LOG_PATH, "a", encoding="utf-8")

def log(msg, telegram=False):
    """Write *msg* with a timestamp to stdout and the run log; optionally push it to Telegram.

    Args:
        msg: Text to record.
        telegram: When true, the first 3500 characters of *msg* are also sent
            to the configured Telegram chat through ``curl``.

    Telegram delivery is best effort: a failure is recorded in the local log
    and never raised to the caller.
    """
    line = f"[{datetime.now().isoformat(timespec='seconds')}] {msg}"
    print(line, flush=True)
    LOG.write(line + "\n")
    LOG.flush()
    if not telegram:
        return
    try:
        # --fail makes curl exit non-zero on HTTP errors (e.g. a rejected token),
        # which would otherwise be indistinguishable from success.
        proc = subprocess.run(
            ["curl", "-s", "--fail", "-X", "POST",
             f"https://api.telegram.org/bot{TG}/sendMessage",
             "-d", f"chat_id={TG_CHAT}",
             "--data-urlencode", f"text={msg[:3500]}"],
            timeout=8, capture_output=True)
    except Exception as e:
        log(f"TG error: {e}")
        return
    if proc.returncode != 0:
        # Only the exit code is logged so the bot token never ends up in the log file.
        log(f"TG error: curl exited with status {proc.returncode}")

def http_get(url, accept_json=False, timeout=25):
    """Fetch *url* with the bot's User-Agent and return the body as text.

    Args:
        url: Absolute URL to request.
        accept_json: When true, ask for JSON and mark the request as an
            XMLHttpRequest; otherwise ask for HTML.
        timeout: Timeout in seconds passed to ``urlopen``.

    Returns:
        The response body decoded as UTF-8; undecodable bytes are replaced.

    Raises:
        urllib.error.URLError: On network or HTTP errors (propagated from ``urlopen``).
    """
    headers = {
        "User-Agent": UA,
        "Accept": "application/json, */*" if accept_json else "text/html,*/*",
    }
    if accept_json:
        # Sent only for handler calls; plain page requests no longer carry an
        # X-Requested-With header with an empty value.
        headers["X-Requested-With"] = "XMLHttpRequest"
    req = urllib.request.Request(url, headers=headers)
    with urllib.request.urlopen(req, timeout=timeout) as r:
        return r.read().decode("utf-8", errors="replace")

# ── Normalization for fuzzy match ──
# Maps Croatian (and a few accented) letters to plain ASCII. The upper-case
# entries map straight to lower-case ASCII.
DIACRITIC_MAP = str.maketrans({
    'č':'c','ć':'c','ž':'z','š':'s','đ':'d',
    'Č':'c','Ć':'c','Ž':'z','Š':'s','Đ':'d',
    'á':'a','é':'e','í':'i','ó':'o','ú':'u',
})
# Leading club-type designations. core_name() applies this pattern to text that
# norm() has already stripped of diacritics, so every alternative must match the
# ASCII spelling; the [sš]/[zž] classes accept both raw and normalized input.
PREFIX_RE = re.compile(
    r'^(hrvatski\s+nogometni\s+klub\.?|'
    r'nogometni\s+klub|nogometna\s+akademija|nogometna\s+[sš]kola|'
    r'sportska\s+akademija|[zž]enski\s+nogometni\s+klub|'
    r'hnk\.?|gnk|[zž]nk|nk\.?)\s+',
    re.IGNORECASE
)
# Known noise fragments plus a trailing four-digit number (optionally after a dash).
# NOTE(review): `snježnik\s+gerovo` cannot match text normalized by norm()
# (ž has become z by then). It is left untouched on purpose: enabling it would
# reduce some club names to an empty string — confirm the intent before changing.
SUFFIX_NOISE_RE = re.compile(
    r'\b(veterani|veterana|gornji\s+zamet|grada\s+crikvenice|'
    r'gomirje\s+gomirje|mrkopalj\s+mrkopalj|snježnik\s+gerovo|'
    r'-?\s*\d{4}\s*$)', re.IGNORECASE)

def norm(s):
    """Return *s* lower-cased, ASCII-folded, without quote characters and with single spaces.

    Falsy input (``None`` or an empty string) yields an empty string.
    """
    if not s:
        return ""
    folded = s.lower().strip().translate(DIACRITIC_MAP)
    # Drop straight quotes, apostrophes and backticks entirely.
    unquoted = re.sub(r'["\'`]', '', folded)
    # Collapse every whitespace run into one space.
    return re.sub(r'\s+', ' ', unquoted)

def core_name(naziv):
    """Return the normalized club name with club-type prefixes and known noise removed.

    The result is a single string (possibly empty).
    """
    name = norm(naziv)
    # Prefixes may be stacked (e.g. "Nogometni Klub HNK ..."), so peel off up to three.
    passes_left = 3
    while passes_left > 0:
        shorter = PREFIX_RE.sub('', name)
        if shorter == name:
            break
        name = shorter
        passes_left -= 1
    name = SUFFIX_NOISE_RE.sub('', name).strip()
    # Removing noise can leave doubled spaces behind; squeeze them again.
    return re.sub(r'\s+', ' ', name).strip()

def slugify(s):
    """Turn a club name into a slug: its core name with every non ``a-z0-9`` run replaced by one hyphen."""
    hyphenated = re.sub(r'[^a-z0-9]+', '-', core_name(s))
    # No leading or trailing hyphens in the final slug.
    return hyphenated.strip('-')

# ── Catalog harvest ──
def get_pgz_competitions(season, oid=51):
    """Fetch the competition list of one organization for a season.

    Args:
        season: Season label as sent to the site, e.g. ``"2025/2026"``.
        oid: Organization id; defaults to 51 (ŽNS Primorsko-goranski).

    Returns:
        The decoded JSON list. An empty list is returned (and the problem
        logged) when the request fails, the body is not valid JSON, or the
        decoded payload is not a list.
    """
    # Millisecond timestamp sent as the `t` query parameter (presumably cache busting).
    t = int(time.time()*1000)
    # The placeholders {cid}/{cname} are sent literally, URL-encoded.
    link_tpl = quote(BASE + '/natjecanja/{cid}/{cname}/')
    url = (f"{BASE}/handlers/getCompetitions/"
           f"?season={quote(season)}&oid={oid}&teamch=Club"
           f"&linkType=competitions&linkConstructor={link_tpl}"
           f"&lang=hr&t={t}")
    try:
        body = http_get(url, accept_json=True)
        data = json.loads(body)
    except Exception as e:
        log(f"  comps fetch fail {season}: {e}")
        return []
    if not isinstance(data, list):
        # Callers iterate the result and call .get() on each item; anything
        # other than a list would crash them.
        log(f"  comps fetch fail {season}: unexpected payload type {type(data).__name__}")
        return []
    return data

def get_organizations(season):
    """Return the decoded organization list for *season*, or ``[]`` if the fetch or JSON decoding fails."""
    stamp = int(time.time()*1000)
    query = f"?season={quote(season)}&teamch=Club&lang=hr&t={stamp}"
    endpoint = f"{BASE}/handlers/getOrganizations/" + query
    try:
        return json.loads(http_get(endpoint, accept_json=True))
    except Exception as err:
        # Failures are logged and reported to the caller as "no organizations".
        log(f"  orgs fetch fail {season}: {err}")
    return []

# Club anchors look like <a href="/klubovi/{id}/{slug}/">NAME<div ...>...</a>;
# the host part of the href is optional.
# Groups: 1 = numeric club id, 2 = slug (may be empty), 3 = the text between
# the opening tag and the next "<" (at most 150 characters).
CLUB_LINK_RE2 = re.compile(
    r'<a[^>]+href="(?:https?://semafor\.hns\.family)?'
    r'/klubovi/(\d+)/([a-z0-9-]*)/?"'
    r'[^>]*>([^<]{1,150})(?:<|</a>)',
    flags=re.IGNORECASE,
)

def harvest_competition(cid):
    """Fetch one competition page and return every club link found on it.

    Args:
        cid: Competition id as used in the site's ``/natjecanja/{cid}/...`` URLs.

    Returns:
        A list of ``(hns_id, slug, naziv)`` string tuples in page order
        (duplicates are not removed). An empty list is returned, and the error
        logged, when the page cannot be fetched.
    """
    # Function-scope import keeps this block self-contained.
    from html import unescape

    # The handler's linkConstructor comes back with literal {cid}/{cname}
    # placeholders, so the page is requested by id with a dummy slug segment.
    url = f"{BASE}/natjecanja/{cid}/x/"
    try:
        page = http_get(url)
    except Exception as e:
        log(f"  nat fetch fail {cid}: {e}")
        return []
    found = []
    for m in CLUB_LINK_RE2.finditer(page):
        hns_id, slug = m.group(1), m.group(2)
        # Decode HTML entities (&amp;, &#269;, &nbsp; ...) and squeeze embedded
        # line breaks/indentation so the stored name is clean display text.
        naziv = " ".join(unescape(m.group(3)).split())
        # Keep real club names only (skips e.g. the "Klubovi" navigation link).
        if len(naziv) > 1 and not naziv.lower().startswith('klubov'):
            found.append((hns_id, slug, naziv))
    return found

# ── Match logic ──
def match_score(candidate_naziv, candidate_grad, hns_naziv):
    """Score 0-100 how well a DB club matches an HNS club name.

    Args:
        candidate_naziv: Club name from the database.
        candidate_grad: Town of the database club (may be empty/None).
        hns_naziv: Club name harvested from the HNS site.

    Returns:
        0 when either core name is empty, 100 when the core names are equal,
        otherwise the similarity ratio of the core names as a percentage,
        raised by 5 (capped at 100) when the town is hinted at in the HNS name
        and lifted to at least 85 when one core name contains the other.
    """
    cand_core = core_name(candidate_naziv)
    hns_core = core_name(hns_naziv)
    if not cand_core or not hns_core:
        return 0
    if cand_core == hns_core:
        return 100
    score = int(difflib.SequenceMatcher(None, cand_core, hns_core).ratio() * 100)
    if candidate_grad:
        gnorm = norm(candidate_grad)
        hns_norm = norm(hns_naziv)
        # A trailing "(..)" tag counts when it abbreviates the town, whatever
        # its length: "(b)" and "(ba)" both stand for "bakar". The previous
        # check accepted a one-letter tag only, so "NK Borac (Ba)" + "Bakar"
        # never received the bonus.
        tag = re.search(r'\(([^()]+)\)$', hns_norm)
        tag_hit = tag is not None and gnorm.startswith(tag.group(1).rstrip('.'))
        if gnorm and (gnorm[:3] in hns_norm or tag_hit):
            score = min(100, score + 5)
    # One core name fully contained in the other is treated as a strong hint.
    if cand_core in hns_core or hns_core in cand_core:
        score = max(score, 85)
    return score

# ── Main ──
def _merge_clubs(catalog, clubs, source_tag):
    """Fold harvested ``(hns_id, slug, naziv)`` tuples into *catalog* in place."""
    for hns_id, slug, naziv in clubs:
        entry = catalog.get(hns_id)
        if entry is None:
            entry = catalog[hns_id] = {'slug': slug, 'naziv': naziv, 'sources': set()}
        elif slug and not entry['slug']:
            # An earlier sighting had no slug in its link; keep the first non-empty one.
            entry['slug'] = slug
        entry['sources'].add(source_tag)


def _new_competitions(comps, seen_cids):
    """Yield ``(cid, cname)`` for competitions not seen before, recording each id in *seen_cids*."""
    for c in comps:
        cid = str(c.get('id', ''))
        if not cid or cid in seen_cids:
            continue
        seen_cids.add(cid)
        yield cid, c.get('value', '')


def _build_catalog(seasons):
    """Harvest clubs from all oid=51 and oid=178180 competitions; return ``(catalog, seen_cids)``."""
    catalog = {}  # hns_id -> {slug, naziv, sources:set}
    seen_cids = set()

    for season in seasons:
        log(f"-- season {season}")
        comps = get_pgz_competitions(season)
        time.sleep(SLEEP)
        log(f"   PGŽ comps: {len(comps)}")
        for cid, cname in _new_competitions(comps, seen_cids):
            try:
                clubs = harvest_competition(cid)
            except Exception as e:
                log(f"   {cid} ({cname}) fetch error: {e}")
                clubs = []
            _merge_clubs(catalog, clubs, f"{season}:{cname[:30]}")
            log(f"   {cid} '{cname[:40]}' -> {len(clubs)} clubs (catalog={len(catalog)})")
            time.sleep(SLEEP)

    # Also: 3.HNL Zapad / 4.NL NS Rijeka by oid=178180 (NS Rijeka)
    log("-- NS Rijeka oid=178180 sweep")
    link_tpl = quote(BASE + '/natjecanja/{cid}/{cname}/')
    for season in seasons:
        t = int(time.time()*1000)
        url = (f"{BASE}/handlers/getCompetitions/"
               f"?season={quote(season)}&oid=178180&teamch=Club"
               f"&linkType=competitions&linkConstructor={link_tpl}"
               f"&lang=hr&t={t}")
        try:
            comps = json.loads(http_get(url, accept_json=True))
        except Exception as e:
            log(f"   ns_rijeka {season} fail: {e}")
            comps = []
        time.sleep(SLEEP)
        for cid, cname in _new_competitions(comps, seen_cids):
            try:
                clubs = harvest_competition(cid)
            except Exception as e:
                # Previously swallowed silently; now logged like the first sweep.
                log(f"   NSR {cid} ({cname}) fetch error: {e}")
                clubs = []
            _merge_clubs(catalog, clubs, f"NSR:{season}:{cname[:30]}")
            log(f"   NSR {cid} '{cname[:40]}' -> {len(clubs)} (cat={len(catalog)})")
            time.sleep(SLEEP)

    return catalog, seen_cids


def _match_candidates(candidates, catalog):
    """Rank catalog entries per candidate; return ``(matched, not_found, ambiguous)`` lists."""
    matched = []   # (db_id, db_naziv, hns_id, slug, hns_naziv, score)
    not_found = []
    ambiguous = []

    for cand in candidates:
        db_id, naziv, grad = cand['id'], cand['naziv'], cand['grad']
        ranked = []
        for hid, v in catalog.items():
            sc = match_score(naziv, grad, v['naziv'])
            if sc >= 70:
                ranked.append((sc, hid, v['slug'], v['naziv']))
        ranked.sort(reverse=True)
        if not ranked:
            not_found.append((db_id, naziv, grad))
            log(f"  NOT FOUND: [{db_id}] {naziv} ({grad})")
            continue
        top = ranked[0]
        runner_up = ranked[1] if len(ranked) > 1 else None
        close_call = runner_up is not None and runner_up[0] >= top[0] - 3
        # A close runner-up is tolerated only when the top score is >= 95 AND
        # strictly better; two entries with the same score cannot be told apart
        # by name, so picking one would write an arbitrary id to the database.
        if close_call and (top[0] < 95 or runner_up[0] == top[0]):
            ambiguous.append((db_id, naziv, grad, ranked[:3]))
            log(f"  AMBIGUOUS: [{db_id}] {naziv} -> top: {top[3]} ({top[0]}), 2nd: {ranked[1][3]} ({ranked[1][0]})")
            # Skip ambiguous, mark not_found for safety
            not_found.append((db_id, naziv, grad))
            continue
        matched.append((db_id, naziv, top[1], top[2], top[3], top[0]))
        log(f"  MATCH [{db_id}] {naziv} -> HNS {top[1]} '{top[3]}' (slug={top[2]}, score={top[0]})")

    return matched, not_found, ambiguous


def _apply_updates(cur, matched, not_found):
    """Write matches and not-found marks to the database; return ``(upd_ok, upd_fail, nf_ok)``."""
    upd_ok, upd_fail = 0, 0
    for db_id, naziv, hns_id, slug, hns_naziv, sc in matched:
        try:
            source_url = f"{BASE}/klubovi/{hns_id}/{slug}/" if slug else f"{BASE}/klubovi/{hns_id}/"
            cur.execute("""
                UPDATE pgz_sport.klubovi
                SET hns_klub_id = %s,
                    hns_slug = %s,
                    source_url = COALESCE(source_url, %s),
                    scrape_source = 'hns_semafor',
                    last_scraped_at = now()
                WHERE id = %s
            """, (int(hns_id), slug or None, source_url, db_id))
            upd_ok += 1
        except Exception as e:
            upd_fail += 1
            log(f"  UPDATE fail [{db_id}] {naziv}: {e}")

    nf_ok = 0
    for db_id, naziv, grad in not_found:
        try:
            cur.execute("""
                UPDATE pgz_sport.klubovi
                SET scrape_source = 'hns_not_found',
                    last_scraped_at = now()
                WHERE id = %s AND hns_klub_id IS NULL
            """, (db_id,))
            nf_ok += 1
        except Exception as e:
            log(f"  not_found mark fail [{db_id}]: {e}")

    return upd_ok, upd_fail, nf_ok


def _write_report(res_path, candidates, catalog, seen_cids, matched, ambiguous,
                  not_found, upd_ok, upd_fail, nf_ok):
    """Write the Markdown summary of the run to *res_path*."""
    # Explicit UTF-8: club names contain diacritics and the header an em dash.
    with open(res_path, "w", encoding="utf-8") as f:
        f.write("# SUB1 — HNS Link Harvest Result\n\n")
        f.write(f"Date: {datetime.now().isoformat(timespec='seconds')}\n\n")
        f.write(f"- Candidates processed: **{len(candidates)}**\n")
        f.write(f"- HNS catalog built: **{len(catalog)}** unique clubs from {len(seen_cids)} competitions\n")
        f.write(f"- Matched: **{len(matched)}** (DB updated: {upd_ok}, fail: {upd_fail})\n")
        f.write(f"- Ambiguous (skipped to safety): **{len(ambiguous)}**\n")
        f.write(f"- Not found (marked hns_not_found): **{len(not_found)}** (mark ok: {nf_ok})\n\n")
        f.write("## Matched\n\n| db_id | DB naziv | HNS id | HNS naziv | slug | score |\n|---|---|---|---|---|---|\n")
        for db_id, naziv, hns_id, slug, hns_naziv, sc in sorted(matched, key=lambda x: -x[5]):
            f.write(f"| {db_id} | {naziv} | {hns_id} | {hns_naziv} | {slug} | {sc} |\n")
        f.write("\n## Ambiguous (manual review)\n\n")
        for db_id, naziv, grad, ranked in ambiguous:
            f.write(f"- **[{db_id}] {naziv}** ({grad})\n")
            for sc, hid, slug, hns_naziv in ranked:
                f.write(f"  - {sc}: HNS {hid} '{hns_naziv}' (slug={slug})\n")
        f.write("\n## Not Found\n\n")
        for db_id, naziv, grad in not_found:
            f.write(f"- [{db_id}] {naziv} ({grad})\n")
        f.write(f"\n## Log\n\n`{LOG_PATH}`\n")


def _run_pipeline(cur):
    """Run all harvest/match/update/report steps using the open cursor *cur*."""
    # 1) Get candidate clubs
    cur.execute("""
        SELECT id, naziv, grad
        FROM pgz_sport.klubovi
        WHERE sport='nogomet' AND pgz_sufinanciran=true
          AND hns_klub_id IS NULL
          AND naziv !~* 'Malonogometni|Mini Nogomet|Mali Nogomet|Američkog|Plaž|Pijesku|HMNK|MNK|preteča|povijesn|1903|1906|1908|1917|1919|1926|1929'
        ORDER BY naziv
    """)
    candidates = cur.fetchall()
    log(f"Candidates: {len(candidates)}")

    # 2) Build HNS catalog from PGŽ competitions across recent seasons
    SEASONS = ["2025/2026","2024/2025","2023/2024","2022/2023","2021/2022","2020/2021","2019/2020","2018/2019","2017/2018"]
    catalog, seen_cids = _build_catalog(SEASONS)
    log(f"=== Catalog built: {len(catalog)} unique HNS clubs ===")
    if not catalog:
        # With nothing harvested (site down, markup changed) every candidate
        # would be flagged 'hns_not_found'; stop before touching the database
        # or overwriting the previous snapshot.
        raise RuntimeError("HNS catalog is empty; aborting before any DB update")

    # Save catalog snapshot
    snap_path = "/opt/pgz-sport/cc_tasks/sub1_hns_catalog.json"
    snap = {hid: {'slug': v['slug'], 'naziv': v['naziv'], 'sources': sorted(v['sources'])[:5]}
            for hid, v in catalog.items()}
    # ensure_ascii=False emits raw diacritics, so the file encoding must be explicit.
    with open(snap_path, "w", encoding="utf-8") as f:
        json.dump(snap, f, ensure_ascii=False, indent=2)
    log(f"Catalog snapshot -> {snap_path}")

    # 3) Match candidates
    matched, not_found, ambiguous = _match_candidates(candidates, catalog)
    log(f"=== Match results: {len(matched)} matched, {len(not_found)} not_found, {len(ambiguous)} ambiguous ===")

    # 4) Apply UPDATEs and mark not_found
    upd_ok, upd_fail, nf_ok = _apply_updates(cur, matched, not_found)

    # 5) Write result md
    res_path = "/opt/pgz-sport/cc_tasks/SUB1_RESULT.md"
    _write_report(res_path, candidates, catalog, seen_cids, matched, ambiguous,
                  not_found, upd_ok, upd_fail, nf_ok)
    log(f"Result -> {res_path}")

    # 6) Telegram notify
    msg = (f"SUB1 HNS done: matched {len(matched)}, not_found {len(not_found)}, "
           f"ambiguous {len(ambiguous)}. Catalog={len(catalog)}. "
           f"DB upd ok={upd_ok}/fail={upd_fail}. See SUB1_RESULT.md")
    log(msg, telegram=True)


def main():
    """Link PGŽ football clubs that lack an HNS id to their semafor.hns.family entry.

    Steps: load candidate clubs from ``pgz_sport.klubovi``, harvest a club
    catalog from the competition pages, fuzzy-match candidates against it,
    write matches / not-found marks back to the database, then produce a
    catalog snapshot, a Markdown report and a Telegram summary.

    Raises:
        RuntimeError: If no club could be harvested; nothing is written to the
            database in that case.
    """
    log(f"=== SUB1 HNS link harvester start; log={LOG_PATH} ===")
    conn = psycopg2.connect(DSN)
    conn.autocommit = True
    try:
        with conn.cursor(cursor_factory=RealDictCursor) as cur:
            _run_pipeline(cur)
    finally:
        # Release the connection on every exit path, including failures.
        conn.close()

if __name__ == "__main__":
    # Any uncaught failure is logged with its traceback, pushed to Telegram,
    # and turned into exit status 1.
    try:
        main()
    except Exception as exc:
        trace = traceback.format_exc()
        log(f"FATAL: {exc}\n{trace}", telegram=True)
        sys.exit(1)