#!/usr/bin/env python3
"""
HOS-CVF scraper - Hrvatski odbojkaški savez (Croatian Volleyball Federation) league standings.

Path: /opt/pgz-sport/scrapers/hos_scraper.py
"""
import requests, re, json, psycopg2, html as ihtml

# Connection parameters passed verbatim to psycopg2.connect(**DB).
# NOTE(review): the original call listed `port` twice (6432, then 5432), which
# Python rejects with "SyntaxError: keyword argument repeated". 6432 is kept on
# the assumption that it was the more recent edit - confirm against the server.
# NOTE(review): credentials are hard-coded in source; consider moving them to
# the environment or a secrets store.
DB = dict(host="10.10.0.2", port=6432, dbname='rinet_v3', user='rinet', password='R1net2026!SecureDB#v7')

# User-Agent string sent with every HTTP request made by this scraper.
UA = "RiNET-Civic/1.0 (https://rinet.one)"
HDR = {"User-Agent": UA}

def parse_table(html, table_idx):
    """Parse the ``table_idx``-th ``<table>`` of ``html`` into standings rows.

    Expected cell layout of a data row (taken from the live page):
    ``['1.', '', 'HAOK MLADOST', '18', '18', '0', '36']`` - position, an
    optional empty/logo cell, club name, played, won, lost, points.

    Args:
        html: Full HTML document as text.
        table_idx: Zero-based index of the table to parse.

    Returns:
        A list of dicts with keys ``poz``, ``klub``, ``uk``, ``pob``, ``por``,
        ``bod`` and ``ner`` (always 0). Returns ``[]`` when ``table_idx`` is
        out of range (including negative values). Rows that do not look like
        standings rows are skipped silently.
    """
    # HTML tag names are case-insensitive, so match <TABLE>/<TR>/<TD> as well.
    flags = re.DOTALL | re.IGNORECASE

    tables = re.findall(r'<table[^>]*>(.+?)</table>', html, flags)
    # Reject negative indexes explicitly; they would otherwise count from the end.
    if not 0 <= table_idx < len(tables):
        return []

    standings = []
    for row in re.findall(r'<tr[^>]*>(.+?)</tr>', tables[table_idx], flags):
        cells = re.findall(r'<t[hd][^>]*>(.*?)</t[hd]>', row, flags)
        # Strip nested markup, decode entities and collapse runs of whitespace
        # so that club names compare cleanly against database values.
        clean = [
            " ".join(ihtml.unescape(re.sub(r'<[^>]+>', '', cell)).split())
            for cell in cells
        ]

        # A standings row needs at least position, two name cells and three numbers.
        if len(clean) < 6:
            continue
        # Column-caption row.
        if 'Utakmice' in clean[1]:
            continue
        # Data rows start with the position ("1." / "12"); this also rejects
        # empty first cells and header rows such as "#" or "Pos".
        position = re.match(r'(\d+)', clean[0])
        if not position:
            continue

        # The club name normally sits in the third cell; fall back to the second.
        klub = clean[2] or clean[1]
        if not klub:
            continue

        try:
            played, won, lost = (int(value) for value in clean[3:6])
            points = int(clean[6]) if len(clean) > 6 else 0
        except ValueError:
            # Non-numeric statistics: not a standings row, skip it.
            continue

        standings.append({
            "poz": int(position.group(1)),
            "klub": klub,
            "uk": played,
            "pob": won,
            "por": lost,
            "bod": points,
            "ner": 0,
        })
    return standings


def find_table_titles(html):
    """Collect h1-h5 headings together with their offsets in ``html``.

    The offsets let a caller associate each table with the nearest heading
    that precedes it.

    Args:
        html: Full HTML document as text.

    Returns:
        A list of ``(offset, title)`` tuples in document order, where
        ``offset`` is the index of the heading's opening ``<`` and ``title``
        is the heading text with nested tags removed and entities decoded.
        Headings whose text is 5 characters or shorter are ignored.
    """
    # HTML tag names are case-insensitive; the backreference keeps the closing
    # tag paired with the same heading level.
    heading_re = r'<(h[1-5])[^>]*>(.*?)</\1>'
    titles = []
    for match in re.finditer(heading_re, html, re.DOTALL | re.IGNORECASE):
        title = ihtml.unescape(re.sub(r'<[^>]+>', '', match.group(2))).strip()
        if len(title) > 5:
            titles.append((match.start(), title))
    return titles


def main():
    """Scrape the hos-cvf.hr home page and refresh the stored league standings.

    Steps:
      1. Download the home page and print diagnostics (page length, table
         count, and the nearest preceding heading for the first 8 tables).
      2. For each configured league, parse its table, get or create the
         matching ``pgz_sport.natjecanja`` row and replace that competition's
         ``hos_cvf`` rows in ``pgz_sport.natjecanja_tablice``.
      3. Print summary reports and append one row to ``pgz_sport.audit_feed``.

    All database work runs in a single transaction: it is committed only if
    every step succeeds and rolled back otherwise, so a failure can no longer
    leave a competition with deleted or partially inserted standings.

    Raises:
        requests.HTTPError: the home page answered with a 4xx/5xx status.
        psycopg2.Error: any database failure (after rollback).
    """
    # Fetch before connecting so a network failure never holds a DB connection.
    resp = requests.get("https://hos-cvf.hr/", headers=HDR, timeout=20)
    # Do not parse an error page as if it were the standings page.
    resp.raise_for_status()
    html = resp.text
    print(f"Length: {len(html)}")

    tables = re.findall(r'<table[^>]*>(.+?)</table>', html, re.DOTALL)
    print(f"Tables: {len(tables)}")

    # Diagnostic only: the heading closest before each table is the best
    # available hint of what the table contains.
    print("\n=== Table titles ===")
    for i, (_, title) in enumerate(_tables_with_titles(html)[:8]):
        print(f" Table {i+1}: {title[:80]}")

    # Manual mapping of table index -> league, based on the titles printed
    # above (tables 1-5 on the home page contain club names).
    # NOTE(review): the entry for index 2 was marked "Maybe" by its author.
    leagues_2025_26 = [
        # idx, name, level (razina), gender (spol)
        (0, "Supersport Superliga (M) 2025/26", "Superliga", "M"),
        (1, "Supersport Superliga (Ž) 2025/26", "Superliga", "Ž"),
        (2, "Liga doigravanje (M) 2025/26", "Doigravanje", "M"),
        (3, "Supersport Superliga 2 (M) 2025/26", "Superliga 2", "M"),
        (4, "Supersport Superliga 2 (Ž) 2025/26", "Superliga 2", "Ž"),
    ]

    total_inserted = 0
    pgz_seen = set()

    conn = psycopg2.connect(**DB)
    try:
        # `with conn` commits on success and rolls back on an exception; it
        # does not close the connection, hence the surrounding try/finally.
        with conn, conn.cursor() as cu:
            for idx, natj_naziv, razina, spol in leagues_2025_26:
                rows = parse_table(html, idx)
                if not rows:
                    continue
                print(f"\n=== {natj_naziv} ({len(rows)} klubova) ===")
                for row in rows[:3]:
                    print(f" {row['poz']:>2}. {row['klub']:<30} {row['bod']:>2}b {row['pob']}p {row['por']}por")

                natj_id = _get_or_create_natjecanje(cu, natj_naziv, razina)
                total_inserted += _replace_standings(cu, natj_id, rows, spol, pgz_seen)

            print(f"\n=== TOTAL: {total_inserted}, PGŽ klubovi: {pgz_seen} ===")
            _print_report(cu)

            # Sorted so the audit payload is deterministic between runs.
            cu.execute("""INSERT INTO pgz_sport.audit_feed (table_name, action, source, source_url, details)
                VALUES ('natjecanja_tablice', 'hos_scrape', 'hos_cvf', NULL, %s::jsonb)""",
                (json.dumps({"inserted": total_inserted, "pgz_seen": sorted(pgz_seen)}),))
    finally:
        conn.close()


def _tables_with_titles(html):
    """Pair each ``<table`` start offset with the title of the closest preceding heading."""
    title_positions = find_table_titles(html)
    paired = []
    for match in re.finditer(r'<table[^>]*>', html):
        start = match.start()
        # title_positions is in document order, so the last one before the
        # table is the nearest.
        preceding = [title for pos, title in title_positions if pos < start]
        paired.append((start, preceding[-1] if preceding else "Unknown"))
    return paired


def _get_or_create_natjecanje(cu, natj_naziv, razina):
    """Return the id of the competition named ``natj_naziv``, inserting it if missing."""
    cu.execute("SELECT id FROM pgz_sport.natjecanja WHERE naziv=%s LIMIT 1", (natj_naziv,))
    found = cu.fetchone()
    if found:
        return found[0]
    cu.execute("""INSERT INTO pgz_sport.natjecanja (naziv, sport, razina, sezona, source, source_url)
        VALUES (%s, 'odbojka', %s, '2025/26', 'hos_cvf', 'https://hos-cvf.hr/')
        RETURNING id""", (natj_naziv, razina))
    natj_id = cu.fetchone()[0]
    print(f" Created natjecanje id={natj_id}")
    return natj_id


def _match_klub(cu, klub_naziv):
    """Return ``(id, region)`` of the best-matching club row, or ``None`` if nothing matches."""
    # Matches on case-insensitive equality or substring; among the matches,
    # active clubs come first, then volleyball clubs, then PGŽ clubs, then
    # the lowest id.
    cu.execute("""SELECT id, region FROM pgz_sport.klubovi
        WHERE LOWER(naziv) = LOWER(%s) OR LOWER(naziv) LIKE LOWER(%s)
        ORDER BY
        CASE WHEN aktivan THEN 0 ELSE 1 END,
        CASE WHEN sport='odbojka' THEN 0 ELSE 1 END,
        CASE WHEN region='PGŽ' THEN 0 ELSE 1 END,
        id ASC LIMIT 1""", (klub_naziv, f"%{klub_naziv}%"))
    return cu.fetchone()


def _replace_standings(cu, natj_id, rows, spol, pgz_seen):
    """Replace one competition's 'hos_cvf' standings rows; return the number inserted.

    Names of clubs matched to a 'PGŽ' club row are added to ``pgz_seen``.
    """
    cu.execute("DELETE FROM pgz_sport.natjecanja_tablice WHERE natjecanje_id=%s AND source='hos_cvf'", (natj_id,))

    inserted = 0
    for row in rows:
        klub_id = None
        matched = _match_klub(cu, row['klub'])
        if matched:
            klub_id = matched[0]
            if matched[1] == 'PGŽ':
                pgz_seen.add(row['klub'])

        # The draw and goal columns are written as 0.
        cu.execute("""INSERT INTO pgz_sport.natjecanja_tablice
            (natjecanje_id, klub_id, klub_naziv, pozicija, odigrano, pobjede,
            nerijeseno, porazi, gol_z, gol_p, gol_razlika, bodovi, source, source_url, updated_at, extra_data)
            VALUES (%s, %s, %s, %s, %s, %s, 0, %s, 0, 0, 0, %s, 'hos_cvf', 'https://hos-cvf.hr/', now(), %s::jsonb)""",
            (natj_id, klub_id, row['klub'], row['poz'], row['uk'], row['pob'], row['por'], row['bod'],
             json.dumps({"spol": spol})))
        inserted += 1
    return inserted


def _print_report(cu):
    """Print per-competition row counts and the PGŽ clubs present in the HOS standings."""
    cu.execute("""SELECT n.naziv, count(t.*), count(*) FILTER (WHERE t.klub_id IS NOT NULL)
        FROM pgz_sport.natjecanja n
        LEFT JOIN pgz_sport.natjecanja_tablice t ON n.id=t.natjecanje_id AND t.source='hos_cvf'
        WHERE n.source='hos_cvf' GROUP BY n.id, n.naziv ORDER BY n.id""")
    print("\n=== HOS lige ===")
    for rec in cu.fetchall():
        print(f" {rec[1]:>3} klubova ({rec[2]} matched) {rec[0]}")

    cu.execute("""SELECT n.naziv, t.pozicija, t.klub_naziv, t.bodovi, k.id, k.naziv
        FROM pgz_sport.natjecanja_tablice t
        JOIN pgz_sport.natjecanja n ON n.id=t.natjecanje_id
        LEFT JOIN pgz_sport.klubovi k ON k.id=t.klub_id
        WHERE t.source='hos_cvf' AND k.region='PGŽ'
        ORDER BY n.naziv, t.pozicija""")
    print("\n=== PGŽ klubovi u HOS ===")
    for rec in cu.fetchall():
        print(f" {rec[0][:30]:<30} {rec[1]:>2}. {rec[2]:<25} {rec[3]:>3}b -> {rec[4]} '{rec[5]}'")


# Run the scraper only when executed as a script, not when imported.
if __name__ == "__main__":
    main()