#!/usr/bin/env python3
"""
HOS-CVF scraper - Hrvatski odbojkaški savez league standings.
Path: /opt/pgz-sport/scrapers/hos_scraper.py
"""
import html as ihtml
import json
import os
import re

import psycopg2
import requests
# PostgreSQL connection settings, passed as keyword arguments to
# psycopg2.connect(). Each value can be overridden with a RINET_DB_*
# environment variable so credentials do not have to live in source control.
# NOTE(security): the literal fallback password is kept only so existing
# deployments keep working; set RINET_DB_PASSWORD, then rotate and remove it.
DB = dict(
    host=os.environ.get("RINET_DB_HOST", "localhost"),
    port=int(os.environ.get("RINET_DB_PORT", "5432")),
    dbname=os.environ.get("RINET_DB_NAME", "rinet_v3"),
    user=os.environ.get("RINET_DB_USER", "rinet"),
    password=os.environ.get("RINET_DB_PASSWORD", "R1net2026!SecureDB#v7"),
)

# User-Agent string sent with every outgoing HTTP request.
UA = "RiNET-Civic/1.0 (https://rinet.one)"

# Default HTTP request headers.
HDR = {"User-Agent": UA}
def parse_table(html, table_idx):
    """Parse one HTML standings table and return its data rows.

    Args:
        html: Full HTML document as a string.
        table_idx: Zero-based index of the ``<table>`` element to parse, in
            document order.

    Returns:
        A list of dicts, one per parsed row, with the keys ``poz`` (position),
        ``klub`` (club name), ``uk`` (matches played), ``pob`` (wins),
        ``por`` (losses), ``bod`` (points, 0 when the row has no seventh
        cell) and ``ner`` (always 0). Returns an empty list when
        ``table_idx`` is past the last table. Header rows and rows that do
        not look like a standings line are skipped silently.
    """
    tables = re.findall(r'<table\b[^>]*>(.+?)</table>', html, re.DOTALL)
    if table_idx >= len(tables):
        return []

    rows = re.findall(r'<tr\b[^>]*>(.+?)</tr>', tables[table_idx], re.DOTALL)
    out = []
    for row in rows:
        # ``\b`` keeps <thead> from being mistaken for a <th> cell.
        cells = re.findall(r'<t[dh]\b[^>]*>(.*?)</t[dh]>', row, re.DOTALL)
        # Drop nested markup (links, images), decode entities, trim whitespace.
        clean = [ihtml.unescape(re.sub(r'<[^>]+>', '', c)).strip() for c in cells]
        if not clean or not clean[0]:
            continue

        # Skip header rows.
        if clean[0] in ('#', 'Pos'):
            continue
        if len(clean) > 1 and 'Utakmice' in clean[1]:
            continue

        # Expected format: ['1.', '', 'HAOK MLADOST', '18', '18', '0', '36']
        poz_match = re.match(r'(\d+)', clean[0])
        if not poz_match or len(clean) < 6:
            continue

        # The club name is normally in the third cell; fall back to the
        # second one when the third is empty.
        klub = clean[2] or clean[1]
        if not klub:
            continue

        try:
            uk = int(clean[3])
            pob = int(clean[4])
            por = int(clean[5])
            bod = int(clean[6]) if len(clean) > 6 else 0
        except ValueError:
            # A non-numeric statistics cell means this is not a standings row.
            continue

        out.append({
            "poz": int(poz_match.group(1)),
            "klub": klub,
            "uk": uk,
            "pob": pob,
            "por": por,
            "bod": bod,
            "ner": 0,
        })
    return out
def find_table_titles(html):
    """Locate h1-h5 headings so they can be associated with following tables.

    Args:
        html: Full HTML document as a string.

    Returns:
        A list of ``(offset, title)`` tuples in document order, where
        ``offset`` is the character index at which the heading's opening tag
        starts and ``title`` is the heading text with nested tags removed,
        HTML entities decoded and surrounding whitespace stripped. Headings
        whose text is five characters or shorter are ignored.
    """
    out = []
    # The backreference makes the closing tag match the opening heading level.
    for m in re.finditer(r'<(h[1-5])\b[^>]*>(.*?)</\1\s*>', html, re.DOTALL):
        title = ihtml.unescape(re.sub(r'<[^>]+>', '', m.group(2))).strip()
        if len(title) > 5:
            out.append((m.start(), title))
    return out
def main():
    """Scrape the hos-cvf.hr home page and refresh the stored league tables.

    The function:
      1. Downloads the home page and prints diagnostics: page length, number
         of tables, and the nearest preceding heading for the first eight
         tables.
      2. For each entry of the hard-coded league mapping, parses the table at
         that index, looks up (or creates) the matching row in
         ``pgz_sport.natjecanja``, deletes that competition's existing
         ``hos_cvf`` rows from ``pgz_sport.natjecanja_tablice`` and inserts
         the freshly parsed standings, linking each row to a club from
         ``pgz_sport.klubovi`` when a name match is found.
      3. Prints summary reports and records the run in
         ``pgz_sport.audit_feed``.

    Raises:
        requests.HTTPError: if the home page request returns an error status.
    """
    conn = psycopg2.connect(**DB)
    try:
        # NOTE(review): with autocommit every statement commits on its own, so
        # a failure between the DELETE and the INSERTs below leaves a league
        # table partially refreshed.
        conn.autocommit = True
        cu = conn.cursor()

        # Get main page
        resp = requests.get("https://hos-cvf.hr/", headers=HDR, timeout=20)
        # Do not parse (and store) an error page as if it were the standings.
        resp.raise_for_status()
        html = resp.text
        print(f"Length: {len(html)}")
        tables = re.findall(r'<table\b[^>]*>.+?</table>', html, re.DOTALL)
        print(f"Tables: {len(tables)}")

        # Tables 1-5 are visible on the home page (Superliga M, Superliga Ž, ?,
        # 1.M Liga, 1.Ž Liga or similar).
        # Best heuristic: the title closest before each table.
        title_positions = find_table_titles(html)
        table_positions = [m.start() for m in re.finditer(r'<table\b[^>]*>', html)]
        table_with_title = []
        for tp in table_positions:
            # Titles are in document order, so the last one located before the
            # table is the closest.
            preceding = [t for pos, t in title_positions if pos < tp]
            title = preceding[-1] if preceding else "Unknown"
            table_with_title.append((tp, title))

        print("\n=== Table titles ===")
        for i, (tp, t) in enumerate(table_with_title[:8]):
            print(f" Table {i+1}: {t[:80]}")

        # Manual mapping: extract all visible league tables. Judging by the
        # diagnostic output above, tables 1-5 contain club names.
        LEAGUES_2025_26 = [
            # Idx, Name, Level, Gender
            (0, "Supersport Superliga (M) 2025/26", "Superliga", "M"),
            (1, "Supersport Superliga (Ž) 2025/26", "Superliga", "Ž"),
            (2, "Liga doigravanje (M) 2025/26", "Doigravanje", "M"),  # Maybe
            (3, "Supersport Superliga 2 (M) 2025/26", "Superliga 2", "M"),
            (4, "Supersport Superliga 2 (Ž) 2025/26", "Superliga 2", "Ž"),
        ]

        total_inserted = 0
        pgz_seen = set()
        for idx, natj_naziv, razina, spol in LEAGUES_2025_26:
            rows = parse_table(html, idx)
            if not rows:
                continue
            print(f"\n=== {natj_naziv} ({len(rows)} klubova) ===")
            for row in rows[:3]:
                print(f" {row['poz']:>2}. {row['klub']:<30} {row['bod']:>2}b {row['pob']}p {row['por']}por")

            # Get/create the competition row
            cu.execute("SELECT id FROM pgz_sport.natjecanja WHERE naziv=%s LIMIT 1", (natj_naziv,))
            existing = cu.fetchone()
            if existing:
                natj_id = existing[0]
            else:
                cu.execute("""INSERT INTO pgz_sport.natjecanja (naziv, sport, razina, sezona, source, source_url)
                    VALUES (%s, 'odbojka', %s, '2025/26', 'hos_cvf', 'https://hos-cvf.hr/')
                    RETURNING id""", (natj_naziv, razina))
                natj_id = cu.fetchone()[0]
                print(f" Created natjecanje id={natj_id}")

            # Replace this competition's previously scraped standings.
            cu.execute("DELETE FROM pgz_sport.natjecanja_tablice WHERE natjecanje_id=%s AND source='hos_cvf'", (natj_id,))
            for row in rows:
                klub_id = None
                # Exact or substring name match; the ORDER BY prefers active
                # clubs, then volleyball clubs, then clubs from the PGŽ region.
                cu.execute("""SELECT id, region FROM pgz_sport.klubovi
                    WHERE LOWER(naziv) = LOWER(%s) OR LOWER(naziv) LIKE LOWER(%s)
                    ORDER BY
                    CASE WHEN aktivan THEN 0 ELSE 1 END,
                    CASE WHEN sport='odbojka' THEN 0 ELSE 1 END,
                    CASE WHEN region='PGŽ' THEN 0 ELSE 1 END,
                    id ASC LIMIT 1""", (row['klub'], f"%{row['klub']}%"))
                matched = cu.fetchone()
                if matched:
                    klub_id = matched[0]
                    if matched[1] == 'PGŽ':
                        pgz_seen.add(row['klub'])
                cu.execute("""INSERT INTO pgz_sport.natjecanja_tablice
                    (natjecanje_id, klub_id, klub_naziv, pozicija, odigrano, pobjede,
                    nerijeseno, porazi, gol_z, gol_p, gol_razlika, bodovi, source, source_url, updated_at, extra_data)
                    VALUES (%s, %s, %s, %s, %s, %s, 0, %s, 0, 0, 0, %s, 'hos_cvf', 'https://hos-cvf.hr/', now(), %s::jsonb)""",
                    (natj_id, klub_id, row['klub'], row['poz'], row['uk'], row['pob'], row['por'], row['bod'],
                     json.dumps({"spol": spol})))
                total_inserted += 1

        print(f"\n=== TOTAL: {total_inserted}, PGŽ klubovi: {pgz_seen} ===")

        # Per-competition report: stored rows and how many are linked to a club.
        cu.execute("""SELECT n.naziv, count(t.*), count(*) FILTER (WHERE t.klub_id IS NOT NULL)
            FROM pgz_sport.natjecanja n
            LEFT JOIN pgz_sport.natjecanja_tablice t ON n.id=t.natjecanje_id AND t.source='hos_cvf'
            WHERE n.source='hos_cvf' GROUP BY n.id, n.naziv ORDER BY n.id""")
        print("\n=== HOS lige ===")
        for rec in cu.fetchall():
            print(f" {rec[1]:>3} klubova ({rec[2]} matched) {rec[0]}")

        # Standings rows whose linked club belongs to the PGŽ region.
        cu.execute("""SELECT n.naziv, t.pozicija, t.klub_naziv, t.bodovi, k.id, k.naziv
            FROM pgz_sport.natjecanja_tablice t
            JOIN pgz_sport.natjecanja n ON n.id=t.natjecanje_id
            LEFT JOIN pgz_sport.klubovi k ON k.id=t.klub_id
            WHERE t.source='hos_cvf' AND k.region='PGŽ'
            ORDER BY n.naziv, t.pozicija""")
        print("\n=== PGŽ klubovi u HOS ===")
        for rec in cu.fetchall():
            print(f" {rec[0][:30]:<30} {rec[1]:>2}. {rec[2]:<25} {rec[3]:>3}b -> {rec[4]} '{rec[5]}'")

        # Record the run in the audit feed.
        cu.execute("""INSERT INTO pgz_sport.audit_feed (table_name, action, source, source_url, details)
            VALUES ('natjecanja_tablice', 'hos_scrape', 'hos_cvf', NULL, %s::jsonb)""",
            (json.dumps({"inserted": total_inserted, "pgz_seen": list(pgz_seen)}),))
    finally:
        # Closing the connection also releases its cursors.
        conn.close()
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()