#!/usr/bin/env python3
from dotenv import load_dotenv
load_dotenv('/opt/rinet-gpu/.env.master')
# auto-added by patch_scrapers_with_dotenv.sh
"""Sports table scrapers - HOS, HKS, HRS, HVS, HBS"""
import os
import time, re, requests, psycopg2
from bs4 import BeautifulSoup

# key=value connection string handed to psycopg2.connect(). The password is
# read from the DB_PASSWORD environment variable (presumably supplied by the
# .env file loaded at the top of this file - verify); if the variable is not
# set, os.environ[...] raises KeyError as soon as this line runs.
DSN = f"host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password={os.environ['DB_PASSWORD']}"
# User-Agent header value sent with every HTTP request made by get().
UA = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36"

def get(url):
    """Download *url* and return it parsed as a BeautifulSoup tree.

    The request carries the module-level User-Agent and a 20 second timeout.
    A 4xx/5xx response raises via ``raise_for_status()``.  After a successful
    response the function pauses 1.5 s before parsing, which spaces out
    consecutive calls.
    """
    request_headers = {"User-Agent": UA}
    response = requests.get(url, headers=request_headers, timeout=20)
    response.raise_for_status()
    # Fixed pause between requests; only reached when the response was OK.
    time.sleep(1.5)
    markup = response.text
    return BeautifulSoup(markup, "html.parser")

def si(s):
    """Convert *s* to a non-negative int by keeping only its digits.

    ``str(s)`` is taken and every non-digit character (signs, dots, spaces,
    letters) is discarded before conversion, so ``"1.234"`` gives ``1234``
    and ``"-5"`` gives ``5``.

    Returns:
        The parsed integer, or 0 when no digits remain or the conversion
        fails.
    """
    try:
        digits = re.sub(r'\D', '', str(s))
        return int(digits) if digits else 0
    except (TypeError, ValueError):
        # Best-effort parser: an unconvertible cell counts as 0.  Only
        # conversion errors are swallowed, so KeyboardInterrupt and
        # SystemExit still propagate to the caller.
        return 0

def save(nat_id, rows, src):
    """Replace the stored standings for one competition/source pair.

    Deletes the rows of ``pgz_sport.natjecanje_tablica`` matching
    (*nat_id*, *src*) and then upserts *rows* in list order; ``rang`` is the
    1-based position in the list.  The delete and all inserts run in a single
    transaction.  Because the upsert conflicts on
    ``(natjecanje_id, klub_naziv)``, a club already stored for this
    competition under another source is overwritten.

    Args:
        nat_id: value written to ``natjecanje_id``.
        rows: list of dicts.  ``'k'`` (written to ``klub_naziv``) is
            required; ``'u'``, ``'p'``, ``'n'``, ``'i'``, ``'gf'``, ``'ga'``
            and ``'bod'`` are written to ``utakmica``, ``pobjede``,
            ``nerijeseno``, ``porazi``, ``golovi_za``, ``golovi_protiv`` and
            ``bodovi`` and default to 0.
        src: value written to ``source``.

    Returns:
        ``len(rows)``, or 0 without opening a connection when *rows* is
        empty.

    Raises:
        KeyError: if a row has no ``'k'`` key (raised before connecting).
        psycopg2.Error: on a database failure; the transaction is rolled
            back and the connection closed.
    """
    if not rows:
        return 0
    # Build all parameter tuples first so malformed rows fail before any
    # database work is started.
    params = [
        (nat_id, rank, r['k'], r.get('u', 0), r.get('p', 0), r.get('n', 0),
         r.get('i', 0), r.get('gf', 0), r.get('ga', 0), r.get('bod', 0), src)
        for rank, r in enumerate(rows, start=1)
    ]
    conn = psycopg2.connect(DSN)
    try:
        # `with conn` commits on success and rolls back on an exception; it
        # does not close the connection, hence the surrounding try/finally.
        with conn, conn.cursor() as cur:
            cur.execute("DELETE FROM pgz_sport.natjecanje_tablica WHERE natjecanje_id=%s AND source=%s", (nat_id, src))
            cur.executemany("""INSERT INTO pgz_sport.natjecanje_tablica
            (natjecanje_id,rang,klub_naziv,utakmica,pobjede,nerijeseno,porazi,golovi_za,golovi_protiv,bodovi,source)
            VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
            ON CONFLICT(natjecanje_id,klub_naziv) DO UPDATE SET
            rang=EXCLUDED.rang,utakmica=EXCLUDED.utakmica,pobjede=EXCLUDED.pobjede,
            nerijeseno=EXCLUDED.nerijeseno,porazi=EXCLUDED.porazi,
            golovi_za=EXCLUDED.golovi_za,golovi_protiv=EXCLUDED.golovi_protiv,
            bodovi=EXCLUDED.bodovi,source=EXCLUDED.source,scraped_at=NOW()""",
                params)
    finally:
        conn.close()
    return len(rows)

# ─── HOS Odbojka ─────────────────────────────
def scrape_hos(nat_id, url):
    """Scrape a HOS (volleyball) standings page and save it as 'hos_cvf'.

    Tables are examined in document order.  A table qualifies when its first
    row mentions 'ekipa' or 'klub'; the club column is the first header cell
    containing 'EKIPA', 'KLUB' or 'TIM' (tried in that order).  Scanning
    stops at the first table that yields at least one team.

    For each data row the club name is read from the club column, a leading
    rank prefix such as "12." is removed, and names shorter than three
    characters are skipped.  Every cell that looks like an integer is then
    collected left to right and mapped by position: the first six values go
    to 'u', 'p', 'n', 'i', 'gf', 'ga' and the last value to 'bod'.  Missing
    positions are stored as 0.  Header labels other than the club column are
    not consulted.

    Args:
        nat_id: competition id passed through to save().
        url: address of the standings page.

    Returns:
        The count returned by save() (0 when no team row was parsed).
    """
    print(f"HOS {nat_id}: {url}")
    s = get(url)
    rows = []
    for tbl in s.find_all('table'):
        trs = tbl.find_all('tr')
        if not trs:
            continue
        # Read the header row once; both the table filter and the column
        # lookup work from these texts.
        header = [c.get_text(strip=True) for c in trs[0].find_all(['th', 'td'])]
        lowered = ' '.join(h.lower() for h in header)
        if 'ekipa' not in lowered and 'klub' not in lowered:
            continue
        ths = [h.upper() for h in header]
        # Index of the club column: labels are tried in priority order and
        # the first header cell containing the label wins.
        ki = next(
            (i for name in ('EKIPA', 'KLUB', 'TIM')
             for i, h in enumerate(ths) if name in h),
            None,
        )
        if ki is None:
            continue
        for tr in trs[1:]:
            tds = tr.find_all(['td', 'th'])
            if len(tds) <= ki:
                continue
            texts = [td.get_text(strip=True) for td in tds]
            # Clean rank prefix
            klub = re.sub(r'^\d+\.?\s*', '', texts[ki]).strip()
            if len(klub) < 3:
                continue
            # Cells made only of digits and +/- signs are statistics.
            # NOTE(review): a rank cell holding bare digits (e.g. "1") also
            # passes this filter and would become nums[0] / 'u' - confirm
            # against the live table layout.
            nums = [
                si(t) for t in texts
                if t and (t.lstrip('-').lstrip('+').isdigit()
                          or re.match(r'^[\d\-+]+$', t))
            ]
            # Pad so the positional lookups below never go out of range.
            stats = nums + [0] * 6
            rows.append({
                'k': klub,
                'u': stats[0],
                'p': stats[1],
                'n': stats[2],
                'i': stats[3],
                'gf': stats[4],
                'ga': stats[5],
                'bod': nums[-1] if nums else 0,
            })
        if rows:
            break
    n = save(nat_id, rows, 'hos_cvf')
    print(f"  => {n} ekipa")
    return n

# ─── Generic standings scraper ─────────────────
def scrape_generic(nat_id, url, src):
    """Scrape a standings page of unknown layout and save the largest table.

    Every table with at least four rows is parsed, skipping its first row.
    A data row needs at least five cells.  The club name is the first cell
    that is longer than three characters, is not a plain (optionally signed)
    number, and is still longer than three characters after a leading rank
    prefix such as "12." is removed.  Cells shorter than five characters
    made only of digits and +/- signs are the statistics; a row needs at
    least two of them.  They are mapped by position: the first six to 'u',
    'p', 'n', 'i', 'gf', 'ga' and the last one to 'bod'.

    The table producing the most rows is saved; on a tie the earlier table
    is kept.

    Args:
        nat_id: competition id passed through to save().
        url: address of the standings page.
        src: source tag passed through to save().

    Returns:
        The count returned by save() (0 when no table produced rows).
    """
    print(f"{src} {nat_id}: {url}")
    s = get(url)
    best = []
    for tbl in s.find_all('table'):
        trs = tbl.find_all('tr')
        if len(trs) < 4:
            continue
        rows = []
        for tr in trs[1:]:
            tds = tr.find_all(['td', 'th'])
            if len(tds) < 5:
                continue
            texts = [t.get_text(strip=True) for t in tds]
            # Find club name (non-numeric, longer than 3 chars, not a rank).
            klub = None
            for t in texts:
                if len(t) <= 3 or t.lstrip('+-').lstrip('.').isdigit():
                    continue
                candidate = re.sub(r'^\d+\.?\s*', '', t).strip()
                # Assign only an accepted candidate, so a rejected short
                # remainder (e.g. ":30" from "18:30") can never be stored
                # as the club name when no later cell qualifies.
                if len(candidate) > 3:
                    klub = candidate
                    break
            if not klub:
                continue
            # NOTE(review): a rank cell holding bare digits (e.g. "1") also
            # matches and would become nums[0] / 'u' - confirm against the
            # scraped sites.
            nums = [si(t) for t in texts if re.match(r'^[\d\-+]+$', t) and len(t) < 5]
            if len(nums) < 2:
                continue
            # Pad so the positional lookups below never go out of range.
            stats = nums + [0] * 6
            rows.append({
                'k': klub,
                'u': stats[0],
                'p': stats[1],
                'n': stats[2],
                'i': stats[3],
                'gf': stats[4],
                'ga': stats[5],
                'bod': nums[-1],
            })
        if len(rows) > len(best):
            best = rows
    n = save(nat_id, best, src)
    print(f"  => {n} ekipa")
    return n

# ─── HBS Bocanje - get all leagues ─────────────
def scrape_hbs_all():
    """List league links on the HBS (bocce) leagues page relevant to PGZ.

    Collects every anchor whose href contains 'lig' (case-insensitive) and
    whose text is longer than three characters.  Hrefs not starting with
    'http' are resolved against the leagues page URL.  The links are then
    filtered to those whose upper-cased text contains one of the regional
    keywords.  Nothing is written to the database.

    Returns:
        A list of ``(link_text, absolute_url)`` tuples for the matching
        links, in page order.
    """
    # Local import keeps this function self-contained.
    from urllib.parse import urljoin

    index_url = "https://hrvatski-bocarski-savez.hr/lige/"
    # Link text is upper-cased before matching, so keywords are upper case.
    # 'RIJEČKA' is listed next to 'RIJECKA' because upper-casing keeps the
    # diacritic, so the plain-ASCII spelling alone cannot match "Riječka".
    keywords = ('PGZ', 'PGŽ', 'KVARN', 'RIJECKA', 'RIJEČKA', 'PRIMORJE')

    print("HBS Bocanje - all leagues")
    s = get(index_url)
    links = []
    for a in s.find_all('a', href=True):
        href = a['href']
        if 'lig' not in href.lower():
            continue
        if not href.startswith('http'):
            # urljoin handles both "/path" and "path" style hrefs; plain
            # concatenation would glue "path" directly onto the host name.
            href = urljoin(index_url, href)
        text = a.get_text(strip=True)
        if len(text) > 3:
            links.append((text, href))
    print(f"  Found {len(links)} HBS leagues")
    # Find PGZ-relevant ones
    pgz_links = [(t, h) for t, h in links if any(k in t.upper() for k in keywords)]
    print(f"  PGZ relevant: {pgz_links[:5]}")
    return pgz_links

# ─── Main ──────────────────────────────────────
# Load every competition that has an external standings URL.
conn = psycopg2.connect(DSN)
try:
    cur = conn.cursor()
    cur.execute("SELECT id,naziv,sport,external_url FROM pgz_sport.natjecanja WHERE external_url IS NOT NULL ORDER BY sport")
    nats = cur.fetchall()
finally:
    # Close even if the query fails so the connection is not leaked.
    conn.close()

# Scrape specific known URLs
# Guarded like the generic scrapes below: one unreachable page must not
# abort the rest of the run.
try:
    scrape_hos(409, "https://hos-cvf.hr/natjecanje.php?id=2807")
except Exception as e:
    print(f"  SKIP 409: {e}")

# Scrape all with external URLs
for nat_id, naziv, sport, url in nats:
    # 409 was already handled by the dedicated HOS scraper above.
    if not url or nat_id == 409:
        continue
    # Pick the source tag from a marker in the URL; the first match wins.
    src = None
    if 'hos-cvf' in url:
        src = 'hos_cvf'
    elif 'hks-cbf' in url:
        src = 'hks_cbf'
    elif 'hrs.hr' in url:
        src = 'hrs'
    elif 'hvs.hr' in url or 'hnvs' in url:
        src = 'hvs'
    elif 'bocarski' in url or 'hbs' in url:
        src = 'hbs'
    if src:
        try:
            scrape_generic(nat_id, url, src)
        except Exception as e:
            print(f"  SKIP {nat_id}: {e}")

# HBS all leagues
try:
    scrape_hbs_all()
except Exception as e:
    print(f"HBS all: {e}")

# Final count
conn = psycopg2.connect(DSN)
try:
    cur = conn.cursor()
    cur.execute("SELECT source, count(*) FROM pgz_sport.natjecanje_tablica GROUP BY source ORDER BY count DESC")
    for row in cur.fetchall():
        print(f"  {row[0]}: {row[1]} teams")
finally:
    conn.close()
print("ALL DONE")