PGŽ Sport Platform — Round 1+2 baseline (sport2.html + API)
This commit is contained in:
+168
@@ -0,0 +1,168 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
HOS-CVF scraper - Hrvatski odbojkaški savez league standings.
|
||||
Path: /opt/pgz-sport/scrapers/hos_scraper.py
|
||||
"""
|
||||
import html as ihtml
import json
import os
import re

import psycopg2
import requests
|
||||
|
||||
# Connection settings for the PostgreSQL database the scraper writes to.
# Every value can be overridden with a PGZ_DB_* environment variable; the
# literals are the previously hard-coded values, kept only as fallbacks so
# existing deployments keep working unchanged.
# SECURITY: the fallback password looks like a live credential committed to
# source control - rotate it, supply it through PGZ_DB_PASSWORD, and then
# remove the literal from this file.
DB = dict(
    host=os.environ.get('PGZ_DB_HOST', 'localhost'),
    port=int(os.environ.get('PGZ_DB_PORT', '5432')),
    dbname=os.environ.get('PGZ_DB_NAME', 'rinet_v3'),
    user=os.environ.get('PGZ_DB_USER', 'rinet'),
    password=os.environ.get('PGZ_DB_PASSWORD', 'R1net2026!SecureDB#v7'),
)

# User-Agent string sent with every HTTP request made by the scraper.
UA = "RiNET-Civic/1.0 (https://rinet.one)"

# Default request headers passed to requests.get().
HDR = {"User-Agent": UA}
|
||||
|
||||
def parse_table(html, table_idx):
    """Extract standings rows from the ``table_idx``-th ``<table>`` in *html*.

    Tables, rows and cells are located with regular expressions; markup inside
    a cell is stripped and HTML entities are decoded.  A row is kept only when
    its first cell starts with a number (the position), it has at least six
    cells, a club name is present and the numeric columns parse as integers.
    Expected cell layout: ['1.', '', 'HAOK MLADOST', '18', '18', '0', '36'].

    Args:
        html: Page markup to search.
        table_idx: Zero-based index of the table among all matched tables.

    Returns:
        A list of dicts with keys ``poz`` (position), ``klub`` (club name,
        taken from the third cell or, if that is empty, the second), ``uk``,
        ``pob``, ``por`` (cells 4-6 as ints), ``bod`` (cell 7 as int, or 0
        when the row has no seventh cell) and ``ner`` (always 0).  Returns an
        empty list when ``table_idx`` is past the last table.
    """
    table_bodies = re.findall(r'<table[^>]*>(.+?)</table>', html, re.DOTALL)
    if table_idx >= len(table_bodies):
        return []

    standings = []
    for row_html in re.findall(r'<tr[^>]*>(.+?)</tr>', table_bodies[table_idx], re.DOTALL):
        # Reduce every cell to plain text: drop nested tags, decode entities.
        fields = []
        for raw_cell in re.findall(r'<t[hd][^>]*>(.*?)</t[hd]>', row_html, re.DOTALL):
            fields.append(ihtml.unescape(re.sub(r'<[^>]+>', '', raw_cell)).strip())

        if not fields or not fields[0]:
            continue

        # Header rows carry a '#'/'Pos' marker or an 'Utakmice' column title.
        second = fields[1] if len(fields) > 1 else ''
        if fields[0] in ('', '#', 'Pos') or 'Utakmice' in second:
            continue

        rank = re.match(r'(\d+)', fields[0])
        if rank is None or len(fields) < 6:
            continue

        club = fields[2] or fields[1]
        if not club:
            continue

        try:
            entry = {
                "poz": int(rank.group(1)),
                "klub": club,
                "uk": int(fields[3]),
                "pob": int(fields[4]),
                "por": int(fields[5]),
                "bod": int(fields[6]) if len(fields) > 6 else 0,
                "ner": 0,
            }
        except (ValueError, IndexError):
            # A non-numeric statistic means this is not a standings row.
            continue
        standings.append(entry)

    return standings
|
||||
|
||||
def find_table_titles(html):
    """Locate h1-h5 headings so they can be paired with the tables after them.

    Args:
        html: Page markup to search.

    Returns:
        A list of ``(offset, title)`` tuples in document order, where
        ``offset`` is the index of the heading's opening tag in *html* and
        ``title`` is its text with nested tags removed and entities decoded.
        Headings whose text is five characters or shorter are left out.
    """
    heading_re = re.compile(r'<(h[1-5])[^>]*>(.*?)</\1>', re.DOTALL)

    found = []
    for hit in heading_re.finditer(html):
        inner_text = re.sub(r'<[^>]+>', '', hit.group(2))
        text = ihtml.unescape(inner_text).strip()
        if len(text) <= 5:
            # Too short to be a meaningful league title.
            continue
        found.append((hit.start(), text))
    return found
|
||||
|
||||
# Leagues shown on the HOS home page for season 2025/26, as
# (table index on the page, competition name, level, gender).
# The index -> league mapping was assigned by hand from the printed table
# titles; the entry marked "Maybe" is unconfirmed.
LEAGUES_2025_26 = [
    (0, "Supersport Superliga (M) 2025/26", "Superliga", "M"),
    (1, "Supersport Superliga (Ž) 2025/26", "Superliga", "Ž"),
    (2, "Liga doigravanje (M) 2025/26", "Doigravanje", "M"),  # Maybe
    (3, "Supersport Superliga 2 (M) 2025/26", "Superliga 2", "M"),
    (4, "Supersport Superliga 2 (Ž) 2025/26", "Superliga 2", "Ž"),
]


def _print_table_titles(html):
    """Print the closest preceding heading for each of the first eight tables."""
    title_positions = find_table_titles(html)
    table_starts = [m.start() for m in re.finditer(r'<table[^>]*>', html)]

    print("\n=== Table titles ===")
    for i, start in enumerate(table_starts[:8]):
        title = "Unknown"
        # Headings arrive in document order, so the last one seen before the
        # table start is the nearest preceding heading.
        for pos, text in title_positions:
            if pos >= start:
                break
            title = text
        print(f" Table {i+1}: {title[:80]}")


def _get_or_create_natjecanje(cu, naziv, razina):
    """Return the id of the competition named *naziv*, inserting it if missing."""
    cu.execute("SELECT id FROM pgz_sport.natjecanja WHERE naziv=%s LIMIT 1", (naziv,))
    found = cu.fetchone()
    if found:
        return found[0]

    cu.execute(
        """INSERT INTO pgz_sport.natjecanja (naziv, sport, razina, sezona, source, source_url)
           VALUES (%s, 'odbojka', %s, '2025/26', 'hos_cvf', 'https://hos-cvf.hr/')
           RETURNING id""",
        (naziv, razina),
    )
    natj_id = cu.fetchone()[0]
    print(f" Created natjecanje id={natj_id}")
    return natj_id


def _find_klub(cu, klub_naziv):
    """Return ``(id, region)`` of the best-matching club row, or None.

    Matches on exact or substring name (case-insensitive), preferring active
    clubs, then volleyball clubs, then clubs in the PGŽ region, then lowest id.
    """
    cu.execute(
        """SELECT id, region FROM pgz_sport.klubovi
           WHERE LOWER(naziv) = LOWER(%s) OR LOWER(naziv) LIKE LOWER(%s)
           ORDER BY
             CASE WHEN aktivan THEN 0 ELSE 1 END,
             CASE WHEN sport='odbojka' THEN 0 ELSE 1 END,
             CASE WHEN region='PGŽ' THEN 0 ELSE 1 END,
             id ASC LIMIT 1""",
        (klub_naziv, f"%{klub_naziv}%"),
    )
    return cu.fetchone()


def _replace_league_table(cu, natj_id, rows, spol, pgz_seen):
    """Replace the stored 'hos_cvf' standings of one competition with *rows*.

    Adds the scraped name of every club matched to a PGŽ club to *pgz_seen*
    and returns the number of rows inserted.  The caller owns the transaction.
    """
    cu.execute(
        "DELETE FROM pgz_sport.natjecanja_tablice WHERE natjecanje_id=%s AND source='hos_cvf'",
        (natj_id,),
    )
    extra_data = json.dumps({"spol": spol})

    for row in rows:
        klub_id = None
        match = _find_klub(cu, row['klub'])
        if match:
            klub_id = match[0]
            if match[1] == 'PGŽ':
                pgz_seen.add(row['klub'])

        cu.execute(
            """INSERT INTO pgz_sport.natjecanja_tablice
               (natjecanje_id, klub_id, klub_naziv, pozicija, odigrano, pobjede,
                nerijeseno, porazi, gol_z, gol_p, gol_razlika, bodovi, source, source_url, updated_at, extra_data)
               VALUES (%s, %s, %s, %s, %s, %s, 0, %s, 0, 0, 0, %s, 'hos_cvf', 'https://hos-cvf.hr/', now(), %s::jsonb)""",
            (natj_id, klub_id, row['klub'], row['poz'], row['uk'], row['pob'],
             row['por'], row['bod'], extra_data),
        )
    return len(rows)


def _print_report(cu):
    """Print per-league row counts and the PGŽ clubs present in HOS standings."""
    cu.execute(
        """SELECT n.naziv, count(t.*), count(*) FILTER (WHERE t.klub_id IS NOT NULL)
           FROM pgz_sport.natjecanja n
           LEFT JOIN pgz_sport.natjecanja_tablice t ON n.id=t.natjecanje_id AND t.source='hos_cvf'
           WHERE n.source='hos_cvf' GROUP BY n.id, n.naziv ORDER BY n.id"""
    )
    print("\n=== HOS lige ===")
    for rec in cu.fetchall():
        print(f" {rec[1]:>3} klubova ({rec[2]} matched) {rec[0]}")

    cu.execute(
        """SELECT n.naziv, t.pozicija, t.klub_naziv, t.bodovi, k.id, k.naziv
           FROM pgz_sport.natjecanja_tablice t
           JOIN pgz_sport.natjecanja n ON n.id=t.natjecanje_id
           LEFT JOIN pgz_sport.klubovi k ON k.id=t.klub_id
           WHERE t.source='hos_cvf' AND k.region='PGŽ'
           ORDER BY n.naziv, t.pozicija"""
    )
    print("\n=== PGŽ klubovi u HOS ===")
    for rec in cu.fetchall():
        print(f" {rec[0][:30]:<30} {rec[1]:>2}. {rec[2]:<25} {rec[3]:>3}b -> {rec[4]} '{rec[5]}'")


def main():
    """Scrape the HOS home page and refresh the stored league standings.

    Downloads https://hos-cvf.hr/, parses the tables listed in
    ``LEAGUES_2025_26``, replaces each league's 'hos_cvf' rows in
    ``pgz_sport.natjecanja_tablice``, prints a summary and writes one
    ``pgz_sport.audit_feed`` entry.

    Raises:
        requests.HTTPError: if the home page responds with an error status.
        psycopg2.Error: on database failures; work not yet committed is
            discarded when the connection is closed.
    """
    # Get main page.  Fail fast on an HTTP error instead of parsing an error
    # page as if it were the standings.
    resp = requests.get("https://hos-cvf.hr/", headers=HDR, timeout=20)
    resp.raise_for_status()
    html = resp.text
    print(f"Length: {len(html)}")

    tables = re.findall(r'<table[^>]*>(.+?)</table>', html, re.DOTALL)
    print(f"Tables: {len(tables)}")

    # Diagnostic aid for maintaining the manual index mapping above.
    _print_table_titles(html)

    conn = psycopg2.connect(**DB)
    try:
        cu = conn.cursor()
        total_inserted = 0
        pgz_seen = set()

        for idx, natj_naziv, razina, spol in LEAGUES_2025_26:
            rows = parse_table(html, idx)
            if not rows:
                continue
            print(f"\n=== {natj_naziv} ({len(rows)} klubova) ===")
            for row in rows[:3]:
                print(f" {row['poz']:>2}. {row['klub']:<30} {row['bod']:>2}b {row['pob']}p {row['por']}por")

            # Get/create natjecanje, then swap its standings.  Committing once
            # per league makes the delete + re-insert atomic: a failure part
            # way through leaves the previous standings in place.
            natj_id = _get_or_create_natjecanje(cu, natj_naziv, razina)
            total_inserted += _replace_league_table(cu, natj_id, rows, spol, pgz_seen)
            conn.commit()

        print(f"\n=== TOTAL: {total_inserted}, PGŽ klubovi: {pgz_seen} ===")

        _print_report(cu)

        cu.execute(
            """INSERT INTO pgz_sport.audit_feed (table_name, action, source, source_url, details)
               VALUES ('natjecanja_tablice', 'hos_scrape', 'hos_cvf', NULL, %s::jsonb)""",
            (json.dumps({"inserted": total_inserted, "pgz_seen": sorted(pgz_seen)}),),
        )
        conn.commit()
    finally:
        # Closing also discards any uncommitted work if an error occurred.
        conn.close()
|
||||
|
||||
# Script entry point: run the scraper only when executed directly, not when
# the module is imported.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user