Sidebar: +ERP +CRM +Dokumenti, godišnjaci import (18 PDFs), filter helpers
- pgz nav now includes /erp/full, /crm/v2, /admin/users, /dokumenti
- 4 dokumenti endpoints: list, godišnjaci/list, godišnjak/{godina} PDF, detail
- 18 godišnjaka u pgz_sport.dokumenti (2006-2024) with savez_id=333
- PGŽ filter helpers (window._pgz_filter_priority, togglePGZFilter)
- navItemClick handler for nav items with href
This commit is contained in:
Regular → Executable
+92
-296
@@ -1,316 +1,112 @@
|
||||
#!/usr/bin/env python3
|
||||
# ═══════════════════════════════════════════════════════════════════
|
||||
# Fajl: godisnjak_pipeline.py
|
||||
# Verzija: 1.0.0
|
||||
# Datum: 03.05.2026
|
||||
# Autor: Damir Radulić <dradulic@outlook.com>
|
||||
# Lokacija: /opt/pgz-sport/scripts/godisnjak_pipeline.py
|
||||
# Svrha: Embed godisnjaci PGZ u pgz_universe + LLM ekstrakcija osoba/uloga
|
||||
# Zavisi od: qdrant_client, httpx, psycopg2, rapidfuzz
|
||||
# Utječe na: pgz_universe (Qdrant), pgz_sport.clanovi (insert)
|
||||
# ═══════════════════════════════════════════════════════════════════
|
||||
"""Godisnjak PGZ embed + LLM person extraction pipeline."""
|
||||
import asyncio
|
||||
import glob
|
||||
import hashlib
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
import sys
|
||||
sys.path.insert(0, '/opt/rinet-gpu/lib')
|
||||
try:
|
||||
from tg_notify import notify as _tg_notify
|
||||
except ImportError:
|
||||
_tg_notify = None
|
||||
import time
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
|
||||
import httpx
|
||||
"""
|
||||
Godišnjak pipeline:
|
||||
1. Find godišnjak PDFs in DB (table dokumenti) + scrape sport-pgz.hr
|
||||
2. Download PDF lokalno
|
||||
3. Parse text iz PDF
|
||||
4. UPDATE pgz_sport.dokumenti SET sadrzaj = parsed_text
|
||||
5. Save chunks za RAG
|
||||
"""
|
||||
import os, sys, hashlib, requests, re
|
||||
from pathlib import Path
|
||||
import psycopg2
|
||||
from psycopg2.extras import execute_batch
|
||||
from qdrant_client import QdrantClient
|
||||
from qdrant_client.models import PointStruct
|
||||
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format="%(asctime)s [godisnjak] %(levelname)s: %(message)s",
|
||||
handlers=[
|
||||
logging.FileHandler("/opt/pgz-sport/logs/godisnjak_pipeline.log"),
|
||||
logging.StreamHandler(),
|
||||
],
|
||||
)
|
||||
log = logging.getLogger("godisnjak")
|
||||
from psycopg2.extras import RealDictCursor
|
||||
import pypdf
|
||||
|
||||
DSN = "host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password=R1net2026!SecureDB#v7"
|
||||
EMBED_URL = "http://localhost:9879/api/embeddings"
|
||||
VLLM_URL = "http://localhost:8001/v1/chat/completions"
|
||||
VLLM_MODEL = "Qwen/Qwen2.5-7B-Instruct-AWQ"
|
||||
QDRANT_COLLECTION = "pgz_universe"
|
||||
DATA_DIR = "/opt/pgz-sport/_data/godisnjaci"
|
||||
MAX_WORKERS = 5
|
||||
CHUNK_SIZE = 1500 # < 2000 zbog BGE-M3 truncation
|
||||
UPLOAD_DIR = Path('/opt/pgz-sport/uploads/godisnjaci')
|
||||
UPLOAD_DIR.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
EXTRACT_PROMPT = """Ekstrahiraj iz teksta SVA imena osoba i njihove uloge.
|
||||
Format strogo JSON:
|
||||
{"osobe": [{"ime":"X","prezime":"Y","klub":"Z","uloga":"predsjednik|igrac|trener|tajnik|fizioterapeut|lijecnik","godina_rodenja":1990}]}
|
||||
|
||||
Uloge ISKLJUCIVO: predsjednik, dopredsjednik, tajnik, blagajnik, clan_uprave, igrac, sportas, glavni_trener, trener, pomocni_trener, kondicioni_trener, selektor, izbornik, team_manager, voditelj, lijecnik, fizioterapeut, kineziolog, maser, sudac, volonter
|
||||
|
||||
Pravila:
|
||||
1. Samo HRVATSKE osobe (ne strani sportasi koji su gostovali)
|
||||
2. Ako klub nije jasan -> ostavi prazan string
|
||||
3. NE izmisljaj imena -> samo ona JASNO IZRAZENA u tekstu
|
||||
4. Vrati VALID JSON bez markdown backtick-ova"""
|
||||
|
||||
|
||||
def chunk_text(text, size=CHUNK_SIZE):
    """Split *text* into paragraph-aligned chunks of roughly *size* characters.

    Paragraphs (separated by one or more blank lines) are packed greedily into
    the current chunk until adding the next one would exceed *size*. A single
    paragraph that is itself longer than *size* is cut into *size*-character
    pieces, so no chunk grows far beyond the limit and gets silently truncated
    by later consumers.

    Args:
        text: Full document text; ``None`` is treated as empty.
        size: Target maximum chunk length in characters.

    Returns:
        List of stripped chunks. Chunks of 100 characters or fewer are dropped.
    """
    chunks = []
    current = ""
    for paragraph in re.split(r"\n\n+", text or ""):
        if size > 0 and len(paragraph) > size:
            # Oversized paragraph: hard-split so each piece fits the budget.
            pieces = [paragraph[i:i + size] for i in range(0, len(paragraph), size)]
        else:
            pieces = [paragraph]
        for piece in pieces:
            if len(current) + len(piece) > size:
                if current:
                    chunks.append(current.strip())
                current = piece
            else:
                # The separator is not counted against the budget; the
                # leading one on an empty chunk is removed by strip().
                current += "\n\n" + piece
    if current:
        chunks.append(current.strip())
    # Very short fragments (headers, page numbers) are not worth keeping.
    return [c for c in chunks if len(c) > 100]
|
||||
|
||||
|
||||
async def embed_batch(texts):
    """Request embeddings for *texts* from the service at ``EMBED_URL``.

    Args:
        texts: List of strings to embed.

    Returns:
        The ``"embeddings"`` list from the JSON response, or ``[]`` when the
        key is absent or *texts* is empty.

    Raises:
        httpx.HTTPStatusError: The service answered with a 4xx/5xx status.
        httpx.HTTPError: Transport-level failure (timeout, connection error).
    """
    if not texts:
        # Nothing to embed; skip the round trip.
        return []
    async with httpx.AsyncClient(timeout=120.0) as client:
        response = await client.post(EMBED_URL, json={"texts": texts})
        # Surface HTTP errors instead of silently treating an error body as
        # "no embeddings", which would drop the whole batch without a trace.
        response.raise_for_status()
        return response.json().get("embeddings", [])
|
||||
|
||||
|
||||
async def extract_persons(chunk_text_str):
    """Ask the chat-completion endpoint at ``VLLM_URL`` to extract persons.

    The first 5500 characters of *chunk_text_str* are sent as the user
    message, with ``EXTRACT_PROMPT`` as the system message and a JSON-object
    response format requested.

    Returns:
        The parsed JSON content of the first choice, or ``{"osobe": []}`` when
        the response lacks that content or the content is not valid JSON.
    """
    payload = {
        "model": VLLM_MODEL,
        "messages": [
            {"role": "system", "content": EXTRACT_PROMPT},
            {"role": "user", "content": chunk_text_str[:5500]},
        ],
        "temperature": 0.1,
        "max_tokens": 3000,
        "response_format": {"type": "json_object"},
    }
    async with httpx.AsyncClient(timeout=120.0) as client:
        response = await client.post(VLLM_URL, json=payload)
        body = response.json()
    try:
        return json.loads(body["choices"][0]["message"]["content"])
    except Exception as exc:
        # Malformed or unexpected model output is logged and treated as empty.
        log.warning(f"Parse fail: {exc}")
        return {"osobe": []}
|
||||
|
||||
|
||||
def fuzzy_match_klub(naziv, conn):
|
||||
"""Fuzzy match klub name to pgz_sport.klubovi.id"""
|
||||
def download_pdf(url, dest):
|
||||
if dest.exists() and dest.stat().st_size > 1000:
|
||||
return dest
|
||||
try:
|
||||
from rapidfuzz import fuzz
|
||||
cur = conn.cursor()
|
||||
cur.execute("SELECT id, naziv FROM pgz_sport.klubovi LIMIT 1000")
|
||||
rows = cur.fetchall()
|
||||
best_id, best_score = None, 0
|
||||
for kid, kname in rows:
|
||||
score = fuzz.token_set_ratio(naziv.lower(), kname.lower())
|
||||
if score > best_score:
|
||||
best_score = score
|
||||
best_id = kid
|
||||
return best_id if best_score > 75 else None
|
||||
r = requests.get(url, headers={"User-Agent":"Mozilla/5.0"}, timeout=60, allow_redirects=True)
|
||||
if r.status_code == 200 and len(r.content) > 1000:
|
||||
dest.write_bytes(r.content)
|
||||
return dest
|
||||
except Exception as e:
|
||||
log.warning(f"Fuzzy match fail: {e}")
|
||||
return None
|
||||
print(f" ERR download {url}: {e}")
|
||||
return None
|
||||
|
||||
def parse_pdf(path):
    """Extract the text of every page of the PDF at *path*.

    Args:
        path: Location of the PDF file (anything ``str()`` turns into a path).

    Returns:
        ``(text, page_count)``. Pages whose extraction fails are skipped.
        ``('', 0)`` is returned, and the error printed, when the file cannot
        be opened or read at all.
    """
    try:
        reader = pypdf.PdfReader(str(path))
        parts = []
        for page in reader.pages:
            try:
                parts.append((page.extract_text() or '') + '\n')
            except Exception:
                # Best effort: one unreadable page must not lose the document.
                continue
        return ''.join(parts), len(reader.pages)
    except Exception as e:
        print(f" ERR parse {path}: {e}")
        return '', 0
|
||||
|
||||
def insert_persons(persons_data, year, conn):
|
||||
"""Insert extracted persons into pgz_sport.clanovi."""
|
||||
osobe = persons_data.get("osobe", [])
|
||||
if not osobe:
|
||||
return 0
|
||||
|
||||
inserted = 0
|
||||
cur = conn.cursor()
|
||||
def main():
|
||||
conn = psycopg2.connect(DSN); conn.autocommit = True
|
||||
|
||||
for o in osobe:
|
||||
ime = (o.get("ime") or "").strip()
|
||||
prezime = (o.get("prezime") or "").strip()
|
||||
if not ime or not prezime:
|
||||
# 1. Get all godišnjaci s pdf_url ili url s .pdf
|
||||
with conn.cursor(cursor_factory=RealDictCursor) as cur:
|
||||
cur.execute("""
|
||||
SELECT id, title, url, pdf_url, vrsta, sadrzaj
|
||||
FROM pgz_sport.dokumenti
|
||||
WHERE (
|
||||
title ILIKE '%sportski godi%njak%' OR title ILIKE '%godi%njak HNS%'
|
||||
OR title ILIKE 'ZSPGZ%' OR title ILIKE '%godi%njak ZSPGZ%'
|
||||
OR url ILIKE '%godisnjak%.pdf' OR pdf_url ILIKE '%godisnjak%.pdf'
|
||||
OR title ILIKE '%godi%njak%' AND (url ILIKE '%pdf' OR pdf_url IS NOT NULL)
|
||||
)
|
||||
ORDER BY id DESC
|
||||
""")
|
||||
targets = cur.fetchall()
|
||||
|
||||
print(f"Targets: {len(targets)}")
|
||||
|
||||
parsed_count = 0
|
||||
for t in targets:
|
||||
url = t['pdf_url'] or t['url']
|
||||
if not url or not url.lower().endswith('.pdf'):
|
||||
continue
|
||||
|
||||
klub_naziv = (o.get("klub") or "").strip()
|
||||
klub_id = fuzzy_match_klub(klub_naziv, conn) if klub_naziv else None
|
||||
uloga = (o.get("uloga") or "igrac").strip()
|
||||
if t['sadrzaj'] and len(t['sadrzaj']) > 500:
|
||||
print(f" ⏭ ID {t['id']}: already parsed ({len(t['sadrzaj'])} chars)")
|
||||
continue
|
||||
|
||||
# Validate uloga
|
||||
VALID_ULOGE = {
|
||||
"predsjednik", "dopredsjednik", "tajnik", "blagajnik", "clan_uprave",
|
||||
"igrac", "sportas", "glavni_trener", "trener", "pomocni_trener",
|
||||
"kondicioni_trener", "selektor", "izbornik", "team_manager", "voditelj",
|
||||
"lijecnik", "fizioterapeut", "kineziolog", "maser", "sudac", "volonter"
|
||||
}
|
||||
if uloga not in VALID_ULOGE:
|
||||
uloga = "igrac"
|
||||
print(f" 📄 ID {t['id']}: {t['title'][:60]}")
|
||||
fname = re.sub(r'[^\w.-]', '_', os.path.basename(url))[:100]
|
||||
dest = UPLOAD_DIR / f"{t['id']}_{fname}"
|
||||
|
||||
profile_key = f"godisnjak:{year}:{ime}:{prezime}:{klub_naziv}"
|
||||
downloaded = download_pdf(url, dest)
|
||||
if not downloaded:
|
||||
continue
|
||||
|
||||
try:
|
||||
text, pages = parse_pdf(downloaded)
|
||||
if not text:
|
||||
continue
|
||||
|
||||
print(f" ✓ {pages} pages, {len(text)} chars")
|
||||
|
||||
# UPDATE sadrzaj
|
||||
with conn.cursor() as cur:
|
||||
cur.execute("""
|
||||
INSERT INTO pgz_sport.clanovi
|
||||
(ime, prezime, uloga, klub_id, savez_izvor, metadata, kategorija)
|
||||
VALUES (%s, %s, %s, %s, %s, %s, %s)
|
||||
ON CONFLICT DO NOTHING
|
||||
RETURNING id
|
||||
""", (
|
||||
ime, prezime, uloga, klub_id,
|
||||
"godisnjak",
|
||||
json.dumps({"year": year, "klub_naziv": klub_naziv, "key": profile_key}),
|
||||
"sportas",
|
||||
))
|
||||
if cur.fetchone():
|
||||
inserted += 1
|
||||
except Exception as e:
|
||||
log.warning(f"Insert fail {ime} {prezime}: {e}")
|
||||
conn.rollback()
|
||||
|
||||
conn.commit()
|
||||
return inserted
|
||||
|
||||
|
||||
async def phase1_embed(files_layout):
|
||||
"""Embed sve godisnjake u pgz_universe."""
|
||||
log.info(f"Phase 1: Embed {len(files_layout)} godisnjaka")
|
||||
qdrant = QdrantClient(host="localhost", port=6333)
|
||||
|
||||
all_chunks = []
|
||||
all_meta = []
|
||||
for f in files_layout:
|
||||
m = re.search(r"godisnjak_(\d{4})_layout", f)
|
||||
year = m.group(1) if m else "unknown"
|
||||
with open(f) as fp:
|
||||
text = fp.read()
|
||||
chunks = chunk_text(text)
|
||||
for i, c in enumerate(chunks):
|
||||
all_chunks.append(c)
|
||||
all_meta.append({"year": year, "chunk_idx": i, "source": f.split("/")[-1]})
|
||||
|
||||
log.info(f"Total chunks: {len(all_chunks)}")
|
||||
|
||||
points = []
|
||||
BATCH = 32
|
||||
for i in range(0, len(all_chunks), BATCH):
|
||||
batch = all_chunks[i : i + BATCH]
|
||||
try:
|
||||
embeddings = await embed_batch(batch)
|
||||
for j, (text, emb) in enumerate(zip(batch, embeddings)):
|
||||
meta = all_meta[i + j]
|
||||
pid_key = f"godisnjak:{meta['source']}:{meta['chunk_idx']}"
|
||||
point_id = int(hashlib.md5(pid_key.encode()).hexdigest()[:15], 16)
|
||||
points.append(
|
||||
PointStruct(
|
||||
id=point_id,
|
||||
vector=emb,
|
||||
payload={**meta, "text": text[:1500], "type": "godisnjak_pgz"},
|
||||
)
|
||||
)
|
||||
except Exception as e:
|
||||
log.warning(f"Embed batch {i} fail: {e}")
|
||||
await asyncio.sleep(2)
|
||||
UPDATE pgz_sport.dokumenti
|
||||
SET sadrzaj = %s, last_updated = now()
|
||||
WHERE id = %s
|
||||
""", (text[:500000], t['id'])) # cap 500K
|
||||
|
||||
if i % 200 == 0:
|
||||
log.info(f" Embed progress: {i}/{len(all_chunks)}")
|
||||
|
||||
qdrant.upsert(collection_name=QDRANT_COLLECTION, points=points)
|
||||
log.info(f"Phase 1 DONE: {len(points)} chunks → {QDRANT_COLLECTION}")
|
||||
return len(points)
|
||||
|
||||
|
||||
async def phase2_extract(files_layout):
|
||||
"""LLM ekstrakcija osoba/uloga iz godisnjaka."""
|
||||
log.info(f"Phase 2: LLM extract persons from {len(files_layout)} godisnjaka")
|
||||
|
||||
conn = psycopg2.connect(DSN)
|
||||
conn.autocommit = False
|
||||
|
||||
total_inserted = 0
|
||||
semaphore = asyncio.Semaphore(MAX_WORKERS)
|
||||
|
||||
async def process_file(f):
|
||||
nonlocal total_inserted
|
||||
m = re.search(r"godisnjak_(\d{4})_layout", f)
|
||||
year = m.group(1) if m else "unknown"
|
||||
# Chunks za RAG (1000 chars each)
|
||||
with conn.cursor() as cur:
|
||||
cur.execute("DELETE FROM pgz_sport.dokument_chunks WHERE dokument_id = %s", (t['id'],))
|
||||
chunks = [text[i:i+1000] for i in range(0, len(text), 800)]
|
||||
for i, ch in enumerate(chunks[:200]):
|
||||
if len(ch.strip()) > 50:
|
||||
cur.execute("""
|
||||
INSERT INTO pgz_sport.dokument_chunks (dokument_id, chunk_idx, content)
|
||||
VALUES (%s, %s, %s)
|
||||
""", (t['id'], i, ch))
|
||||
|
||||
with open(f) as fp:
|
||||
text = fp.read()
|
||||
|
||||
chunks = chunk_text(text)
|
||||
log.info(f" Year {year}: {len(chunks)} chunks")
|
||||
|
||||
year_inserted = 0
|
||||
for i, chunk in enumerate(chunks):
|
||||
async with semaphore:
|
||||
try:
|
||||
persons = await extract_persons(chunk)
|
||||
n = insert_persons(persons, year, conn)
|
||||
year_inserted += n
|
||||
if i % 10 == 0:
|
||||
log.info(f" {year} chunk {i}/{len(chunks)}: {n} osoba")
|
||||
except Exception as e:
|
||||
log.warning(f"Extract/insert fail {year} chunk {i}: {e}")
|
||||
await asyncio.sleep(0.5)
|
||||
|
||||
total_inserted += year_inserted
|
||||
log.info(f" Year {year} DONE: {year_inserted} osoba inserted")
|
||||
parsed_count += 1
|
||||
|
||||
tasks = [process_file(f) for f in files_layout]
|
||||
await asyncio.gather(*tasks)
|
||||
|
||||
conn.close()
|
||||
log.info(f"Phase 2 DONE: {total_inserted} total osoba inserted")
|
||||
return total_inserted
|
||||
print(f"\nDone. Parsed: {parsed_count}")
|
||||
|
||||
|
||||
async def main():
|
||||
files_layout = sorted(glob.glob(f"{DATA_DIR}/godisnjak_*_layout.txt"))
|
||||
log.info(f"Found {len(files_layout)} layout files: {[f.split('/')[-1] for f in files_layout]}")
|
||||
|
||||
if not files_layout:
|
||||
log.error("Nema layout fajlova!")
|
||||
sys.exit(1)
|
||||
|
||||
# Phase 1: Embed
|
||||
n_embedded = await phase1_embed(files_layout)
|
||||
|
||||
# Phase 2: LLM extract
|
||||
n_persons = await phase2_extract(files_layout)
|
||||
|
||||
# Final stats
|
||||
conn = psycopg2.connect(DSN)
|
||||
cur = conn.cursor()
|
||||
cur.execute("SELECT count(*) FROM pgz_sport.clanovi WHERE savez_izvor='godisnjak'")
|
||||
total_godisnjak = cur.fetchone()[0]
|
||||
conn.close()
|
||||
|
||||
log.info(f"""
|
||||
=== GODISNJAK PIPELINE COMPLETE ===
|
||||
Chunks embedded: {n_embedded}
|
||||
Persons extracted: {n_persons}
|
||||
Total godisnjak clanovi u DB: {total_godisnjak}
|
||||
""")
|
||||
|
||||
# Telegram
|
||||
import requests as req_lib
|
||||
req_lib.post(
|
||||
"https://api.telegram.org/bot8535797835:AAFItT-92jzZ9NWFafLxn0dLa1_n2s-JE5Y/sendMessage",
|
||||
data={"chat_id": "7969491558", "text": f"✅ Godisnjak pipeline DONE: {n_embedded} chunks, {n_persons} osoba, {total_godisnjak} total u DB"},
|
||||
timeout=10,
|
||||
)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
|
||||
Executable
+150
@@ -0,0 +1,150 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Godišnjak pipeline v2 — popravljen za pravu shemu.
|
||||
"""
|
||||
import os, sys, hashlib, requests, re
|
||||
from pathlib import Path
|
||||
import psycopg2
|
||||
from psycopg2.extras import RealDictCursor
|
||||
import pypdf
|
||||
|
||||
DSN = "host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password=R1net2026!SecureDB#v7"
|
||||
UPLOAD_DIR = Path('/opt/pgz-sport/uploads/godisnjaci')
|
||||
UPLOAD_DIR.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# 18 godišnjaka 2006-2024 (otkriveni scrapeom)
|
||||
GODISNJAK_URLS = [
|
||||
("https://sport-pgz.hr/upload/dokumenti/publikacije/godisnjak-2006-print.pdf", 2006),
|
||||
("https://sport-pgz.hr/upload/dokumenti/publikacije/Sportski-godisnjak-2007.pdf", 2007),
|
||||
("https://sport-pgz.hr/upload/dokumenti/publikacije/Sportski-godisnjak-2008.pdf", 2008),
|
||||
("https://sport-pgz.hr/upload/dokumenti/publikacije/Sportski-godisnjak-2009.pdf", 2009),
|
||||
("https://sport-pgz.hr/upload/dokumenti/publikacije/Sportski-godisnjak-2010.pdf", 2010),
|
||||
("https://sport-pgz.hr/upload/dokumenti/publikacije/sportski-godisnjak-2011_2025-10-07-125709_xcqb.pdf", 2011),
|
||||
("https://sport-pgz.hr/upload/dokumenti/publikacije/Sportski-godisnjak-2012.pdf", 2012),
|
||||
("https://sport-pgz.hr/upload/dokumenti/publikacije/Sportski-godisnjak-2013.pdf", 2013),
|
||||
("https://sport-pgz.hr/upload/dokumenti/publikacije/Sportski-godisnjak-2014.pdf", 2014),
|
||||
("https://sport-pgz.hr/upload/dokumenti/publikacije/Sportski-godisnjak-2015.pdf", 2015),
|
||||
("https://sport-pgz.hr/upload/dokumenti/publikacije/sportski-godisnjak-2017.pdf", 2017),
|
||||
("https://sport-pgz.hr/upload/dokumenti/publikacije/Sportski-godisnjak-2018.pdf", 2018),
|
||||
("https://sport-pgz.hr/upload/dokumenti/publikacije/Sportski-godisnjak-2019.pdf", 2019),
|
||||
("https://sport-pgz.hr/upload/dokumenti/publikacije/Sportski-godisnjak-2020.pdf", 2020),
|
||||
("https://sport-pgz.hr/upload/dokumenti/publikacije/Sportski-godisnjak-2021.pdf", 2021),
|
||||
("https://sport-pgz.hr/upload/dokumenti/sportski-godisnjaci/ZSPGZ-Sportski-godisnjak-2022.pdf", 2022),
|
||||
("https://sport-pgz.hr/upload/dokumenti/publikacije/ZSPGZ-Sportski-godisnjak-2023-2024-06-07_web.pdf", 2023),
|
||||
("https://sport-pgz.hr/upload/dokumenti/publikacije/ZSPGZ-Sportski-godisnjak-2024-2025-09-18_web.pdf", 2024),
|
||||
]
|
||||
|
||||
def download_pdf(url, dest):
    """Download *url* to *dest* unless a plausible copy is already cached.

    Args:
        url: HTTP(S) address of the PDF.
        dest: ``pathlib.Path`` of the target file.

    Returns:
        *dest* when the file was already present (larger than 1000 bytes) or
        was downloaded successfully; ``None`` on a non-200 status, a body of
        1000 bytes or less, or any exception (printed, not raised).
    """
    if dest.exists() and dest.stat().st_size > 1000:
        print(f" [cached] {dest.name} ({dest.stat().st_size//1024}KB)")
        return dest
    try:
        r = requests.get(url, headers={"User-Agent":"Mozilla/5.0"}, timeout=120, allow_redirects=True)
        if r.status_code == 200 and len(r.content) > 1000:
            # Write to a sibling temp file and rename it into place. The cache
            # check above trusts any file > 1000 bytes, so a partially written
            # *dest* would otherwise be treated as a valid download forever.
            tmp = dest.with_name(dest.name + ".part")
            tmp.write_bytes(r.content)
            tmp.replace(dest)
            return dest
        # Include the size: a 200 with a tiny body is rejected here as well.
        print(f" ✗ HTTP {r.status_code} ({len(r.content)} bytes)")
    except Exception as e:
        print(f" ERR: {e}")
    return None
|
||||
|
||||
def parse_pdf(path):
    """Extract the text of every page of the PDF at *path*.

    Args:
        path: Location of the PDF file (anything ``str()`` turns into a path).

    Returns:
        ``(text, page_count)``. Pages whose extraction fails are skipped.
        ``('', 0)`` is returned when the file cannot be opened or read at all;
        the caller is expected to report that case.
    """
    try:
        reader = pypdf.PdfReader(str(path))
        parts = []
        for page in reader.pages:
            try:
                parts.append((page.extract_text() or '') + '\n')
            except Exception:
                # Best effort: one unreadable page must not lose the document.
                continue
        return ''.join(parts), len(reader.pages)
    except Exception:
        return '', 0
|
||||
|
||||
def main():
    """Download, parse and store every yearbook listed in ``GODISNJAK_URLS``.

    For each ``(url, year)`` pair the PDF is fetched (or taken from the local
    cache in ``UPLOAD_DIR``), its text is extracted, and the matching row in
    ``pgz_sport.dokumenti`` (looked up by the file's SHA-1) is updated, or a
    new row inserted. The text is then re-chunked into
    ``pgz_sport.dokument_chunks`` as 1000-character windows with a stride of
    800, capped at 300 chunks per document.
    """
    conn = psycopg2.connect(DSN); conn.autocommit = True
    try:
        # The chunk table's column names are discovered at runtime.
        with conn.cursor() as cur:
            cur.execute("""
                SELECT column_name FROM information_schema.columns
                WHERE table_schema='pgz_sport' AND table_name='dokument_chunks'
            """)
            cols = [r[0] for r in cur.fetchall()]

        print(f"dokument_chunks columns: {cols}")

        # Loop-invariant: pick the index/content column names once.
        # idx_col is None when no known index column exists, in which case
        # chunk writing is skipped entirely.
        idx_col = next((c for c in ('idx', 'chunk_idx', 'i', 'page') if c in cols), None)
        content_col = next((c for c in ('content', 'chunk', 'text') if c in cols), 'sadrzaj')

        parsed_count = 0
        for url, godina in GODISNJAK_URLS:
            title = f"Sportski godišnjak ZSPGZ {godina}"
            fname = f"sportski-godisnjak-{godina}.pdf"
            dest = UPLOAD_DIR / fname

            print(f"\n📄 {title}")
            downloaded = download_pdf(url, dest)
            if not downloaded:
                continue

            # SHA-1 of the file is the identity key for the dokumenti row.
            sha1 = hashlib.sha1(downloaded.read_bytes()).hexdigest()

            text, pages = parse_pdf(downloaded)
            if not text:
                print(f" ✗ parse failed")
                continue
            print(f" ✓ {pages} pages, {len(text)} chars")

            # Upsert into dokumenti (stored text is capped at 500 000 chars).
            with conn.cursor(cursor_factory=RealDictCursor) as cur:
                cur.execute("SELECT id FROM pgz_sport.dokumenti WHERE sha1 = %s LIMIT 1", (sha1,))
                existing = cur.fetchone()

                if existing:
                    doc_id = existing['id']
                    # NOTE(review): 'organizacija' is spelled out here but
                    # abbreviated ('Zajednica sportova PGŽ') in the INSERT
                    # below — confirm which form is canonical.
                    cur.execute("""
                        UPDATE pgz_sport.dokumenti
                        SET title = %s, godina = %s, vrsta = 'sportski-godisnjak',
                            url = %s, pdf_url = %s, sadrzaj = %s,
                            sluzbeni_glasnik = 'ZSPGZ', razina = 'zupanijska',
                            organizacija = 'Zajednica sportova Primorsko-goranske županije',
                            izvor_url = %s, last_updated = now()
                        WHERE id = %s
                    """, (title, godina, url, url, text[:500000], 'https://sport-pgz.hr', doc_id))
                    print(f" ↻ UPDATE id={doc_id}")
                else:
                    cur.execute("""
                        INSERT INTO pgz_sport.dokumenti
                            (title, fname, vrsta, godina, url, pdf_url, sha1, sadrzaj,
                             sluzbeni_glasnik, razina, organizacija, izvor_url)
                        VALUES (%s, %s, 'sportski-godisnjak', %s, %s, %s, %s, %s,
                                'ZSPGZ', 'zupanijska', 'Zajednica sportova PGŽ', 'https://sport-pgz.hr')
                        RETURNING id
                    """, (title, fname, godina, url, url, sha1, text[:500000]))
                    doc_id = cur.fetchone()['id']
                    print(f" + INSERT id={doc_id}")

            # Replace the document's chunks.
            if idx_col is not None:
                with conn.cursor() as cur:
                    cur.execute("DELETE FROM pgz_sport.dokument_chunks WHERE dokument_id = %s", (doc_id,))
                    chunks = [text[i:i+1000] for i in range(0, len(text), 800)]
                    for i, ch in enumerate(chunks[:300]):
                        if len(ch.strip()) > 50:
                            try:
                                # Column names come from the fixed candidate
                                # lists above, never from external input, so
                                # interpolating them into the SQL is safe.
                                cur.execute(f"""
                                    INSERT INTO pgz_sport.dokument_chunks (dokument_id, {idx_col}, {content_col})
                                    VALUES (%s, %s, %s)
                                """, (doc_id, i, ch))
                            except Exception as e:
                                # First failure aborts this document's chunks.
                                print(f" ERR chunk {i}: {e}"); break

            parsed_count += 1

        print(f"\n✅ Done. Parsed: {parsed_count}/{len(GODISNJAK_URLS)}")
    finally:
        # Release the connection even when a download or query raises.
        conn.close()
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
Executable
+141
@@ -0,0 +1,141 @@
|
||||
#!/usr/bin/env python3
|
||||
"""HNS sezone v3 — koristi __NEXT_DATA__ JSON parser primarily."""
|
||||
import os, time, re, json, sys
|
||||
import psycopg2
|
||||
from psycopg2.extras import RealDictCursor
|
||||
from playwright.sync_api import sync_playwright
|
||||
|
||||
DSN = "host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password=R1net2026!SecureDB#v7"
|
||||
|
||||
def find_seasons(obj, found=None, depth=0):
    """Collect season-like dicts from an arbitrarily nested JSON structure.

    A dict is collected when it has a ``'season'`` key whose value is a str or
    dict, or a ``'sezona'`` key. Dict items of a list stored under a
    career/seasons/statistics-style key are also collected when they carry any
    of ``season``/``sezona``/``year``/``godina``. Each dict object is collected
    at most once, even when both rules match it.

    Args:
        obj: Decoded JSON value (dict, list or scalar).
        found: Optional list to append to; a new list is created when ``None``.
        depth: Nesting level of *obj*; levels beyond 25 are not visited.

    Returns:
        The list of collected dicts, in discovery order.
    """
    if found is None:
        found = []
    # Identity-based de-dup: the same dict is reachable both through its
    # parent's list scan and through the recursive visit of the dict itself.
    seen = {id(item) for item in found}
    list_keys = ('careers', 'career', 'seasons', 'sezone', 'statistics', 'stats')
    item_keys = ('season', 'sezona', 'year', 'godina')

    def collect(candidate):
        if id(candidate) not in seen:
            seen.add(id(candidate))
            found.append(candidate)

    def walk(node, level):
        if level > 25:
            return
        if isinstance(node, dict):
            if ('season' in node and isinstance(node.get('season'), (str, dict))) or 'sezona' in node:
                collect(node)
            for key, value in node.items():
                if key.lower() in list_keys and isinstance(value, list):
                    for item in value:
                        if isinstance(item, dict) and any(k in item for k in item_keys):
                            collect(item)
                walk(value, level + 1)
        elif isinstance(node, list):
            for item in node:
                walk(item, level + 1)

    walk(obj, depth)
    return found
|
||||
|
||||
def normalize_season(s):
    """Convert a season dict into a flat row of strings and integer counters.

    Season, club and competition are read from the first present of several
    English/Croatian key spellings; when the value is itself a dict its
    name-like field is used. Counters are located by case-insensitive
    substring match on the dict's keys.

    Args:
        s: One season-like dict.

    Returns:
        Dict with keys ``sezona``, ``klub`` (max 200 chars), ``natjecanje``
        (max 100 chars) and the integer counters ``nastupi``, ``startna``,
        ``zamjena``, ``golovi``, ``asistencije``, ``zuti``, ``crveni``,
        ``minute``. A counter that is missing or unparsable is 0.
    """
    sezona = s.get('season') or s.get('sezona') or s.get('year') or s.get('godina') or ''
    if isinstance(sezona, dict):
        sezona = sezona.get('name') or sezona.get('label') or str(sezona.get('year', ''))
    sezona = str(sezona)

    klub = s.get('club') or s.get('klub') or s.get('team') or ''
    if isinstance(klub, dict):
        klub = klub.get('name') or klub.get('naziv') or ''

    natj = s.get('competition') or s.get('natjecanje') or s.get('league') or ''
    if isinstance(natj, dict):
        natj = natj.get('name') or natj.get('naziv') or ''

    def num(*keys):
        """Return the int value of the first key of *s* containing any of *keys*."""
        for k in keys:
            needle = k.lower()
            for kk in s.keys():
                if needle in kk.lower():
                    v = s[kk]
                    try:
                        return int(v)
                    except Exception:
                        # Not directly convertible (e.g. "2 (1)" or None):
                        # fall back to the digits contained in its string form.
                        pass
                    try:
                        return int(re.sub(r'\D', '', str(v)) or 0)
                    except Exception:
                        return 0
        return 0

    return {
        'sezona': sezona, 'klub': str(klub)[:200], 'natjecanje': str(natj)[:100],
        'nastupi': num('matches', 'nastup', 'appearance'),
        'startna': num('start'),
        'zamjena': num('sub', 'zamjen'),
        'golovi': num('goal', 'gol'),
        'asistencije': num('assist', 'asist'),
        'zuti': num('yellow', 'žut', 'zut'),
        'crveni': num('red', 'crv'),
        'minute': num('minute', 'minut', 'min'),
    }
|
||||
|
||||
def main():
    """Scrape per-season statistics for HNS players that have none stored yet.

    Selects up to 200 rows from ``pgz_sport.clanovi`` that have an
    ``hns_igrac_id`` but no row in ``pgz_sport.hns_player_seasons``, opens each
    player's ``source_url`` in headless Chromium, reads the embedded
    ``__NEXT_DATA__`` JSON and inserts one row per detected season. Failures
    for a single player or row are printed and skipped.
    """
    conn = psycopg2.connect(DSN); conn.autocommit = True
    seasons_added = 0
    try:
        with conn.cursor(cursor_factory=RealDictCursor) as cur:
            cur.execute("""
                SELECT c.id AS clan_id, c.hns_igrac_id, c.ime, c.prezime, c.source_url
                FROM pgz_sport.clanovi c
                WHERE c.hns_igrac_id IS NOT NULL
                  AND NOT EXISTS (SELECT 1 FROM pgz_sport.hns_player_seasons s WHERE s.hns_igrac_id = c.hns_igrac_id)
                ORDER BY c.id LIMIT 200
            """)
            targets = cur.fetchall()

        print(f"Targets: {len(targets)}", flush=True)

        with sync_playwright() as pw:
            browser = pw.chromium.launch(headless=True, args=["--no-sandbox","--ignore-certificate-errors"])
            try:
                page = browser.new_context(ignore_https_errors=True,
                                           user_agent="Mozilla/5.0 (X11; Linux x86_64) Chrome/120.0.0.0").new_page()

                for i, t in enumerate(targets):
                    url = t['source_url']
                    if not url or 'semafor.hns.family/igraci/' not in url:
                        continue
                    try:
                        page.goto(url, wait_until="networkidle", timeout=20000)
                        time.sleep(0.8)

                        html = page.content()
                        rows = []

                        # Extract __NEXT_DATA__
                        m = re.search(r'__NEXT_DATA__"\s*type="application/json">([^<]+)</script>', html)
                        if m:
                            try:
                                data = json.loads(m.group(1))
                                for s in find_seasons(data):
                                    n = normalize_season(s)
                                    if n['sezona']:
                                        rows.append(n)
                            except Exception as e:
                                print(f" ✗ [{i}/{len(targets)}] __NEXT_DATA__ parse: {e}", flush=True)

                        # Insert
                        if rows:
                            added = 0
                            with conn.cursor() as cur:
                                for r in rows:
                                    try:
                                        cur.execute("""
                                            INSERT INTO pgz_sport.hns_player_seasons
                                                (hns_igrac_id, clan_id, sezona, klub_naziv, natjecanje,
                                                 nastupi, startna, zamjena, golovi, asistencije, zuti, crveni, minute)
                                            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
                                            ON CONFLICT DO NOTHING
                                        """, (t['hns_igrac_id'], t['clan_id'], r['sezona'], r['klub'], r['natjecanje'],
                                              r['nastupi'], r['startna'], r['zamjena'], r['golovi'],
                                              r['asistencije'], r['zuti'], r['crveni'], r['minute']))
                                        # ON CONFLICT DO NOTHING reports 0
                                        # affected rows for a skipped
                                        # duplicate; count real inserts only.
                                        if cur.rowcount > 0:
                                            added += 1
                                    except Exception as e:
                                        print(f" ✗ insert {r['sezona']}: {e}", flush=True)
                            seasons_added += added
                            print(f" ✓ [{i}/{len(targets)}] {t['ime']} {t['prezime']}: +{added} sezone (total: {seasons_added})", flush=True)

                        if i % 30 == 0 and i > 0:
                            print(f" [{i}/{len(targets)}] processed, total: {seasons_added}", flush=True)
                    except Exception as e:
                        # Best effort per player: report and move on.
                        print(f" ✗ [{i}/{len(targets)}] {url}: {e}", flush=True)
            finally:
                browser.close()
    finally:
        conn.close()

    print(f"\n✅ Done. Total: {seasons_added}", flush=True)
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
@@ -1,32 +1,373 @@
|
||||
#!/usr/bin/env python3
|
||||
"""HKS-CBF + FIBA LiveStats basketball harvester."""
|
||||
import sys, re
|
||||
# hks_basketball.py — HKS-CBF + FIBA LiveStats košarka harvester
|
||||
# v2.0 — dradulic@outlook.com / damir@rinet.one — 2026-05-05
|
||||
# Harvests rosters + per-match player stats for PGŽ priority basketball clubs.
|
||||
# Path: HKS search (?s=naziv) → match recap articles → FIBA LiveStats matchid →
|
||||
# https://fibalivestats.dcd.shared.geniussports.com/data/{matchid}/data.json
|
||||
# (public JSON boxscore) → upsert clanovi + clan_kategorije + player_stats.
|
||||
|
||||
import sys, re, time, json, urllib.parse
|
||||
import requests
|
||||
sys.path.insert(0, '/opt/pgz-sport/scripts/sport_harvesters')
|
||||
from __base import SportHarvester
|
||||
|
||||
UA = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 Chrome/120.0.0.0 Safari/537.36"
|
||||
|
||||
LIGA_SECTIONS = {
|
||||
"supersport-premijer": "Seniori (Premijer)",
|
||||
"prva-muska-liga": "Seniori (1.HML)",
|
||||
"druga-muska-liga": "Seniori (2.HML)",
|
||||
"premijer-zenska-liga": "Seniorke (Premijer)",
|
||||
"prva-zenska-liga": "Seniorke (1.ŽKL)",
|
||||
"druga-zenska-liga": "Seniorke (2.ŽKL)",
|
||||
"jedinstvena-kadetska-liga":"Kadeti",
|
||||
"kadetska-liga": "Kadeti",
|
||||
"juniorska-liga": "Juniori",
|
||||
"mlade-kategorije": "Mladi",
|
||||
"mini-kosarka": "Mini",
|
||||
}
|
||||
|
||||
NOISE_TOKENS = {
|
||||
"kk", "zkk", "kosarkaski", "klub", "udruga", "savez", "skola",
|
||||
"primorsko-goranske", "primorsko", "goranske", "zupanije",
|
||||
"rijeka", "rijeke", # too generic on its own; only used if it's the longest token
|
||||
}
|
||||
|
||||
ARTICLE_RE = re.compile(
|
||||
r'href="(https://www\.hks-cbf\.hr/(' + '|'.join(re.escape(k) for k in LIGA_SECTIONS) + r')/(\d{4})/[^"]+/)"'
|
||||
)
|
||||
FIBA_MATCHID_RE = re.compile(
|
||||
r'fibalivestats\.dcd\.shared\.geniussports\.com/u/HKS/(\d+)/'
|
||||
)
|
||||
|
||||
MAX_ARTICLES_PER_KLUB = 8
|
||||
MAX_MATCHES_PER_KLUB = 30
|
||||
HTTP_TIMEOUT = 15
|
||||
HTTP_PAUSE_S = 0.4
|
||||
|
||||
|
||||
def parse_mm_ss(s):
    """Return the whole-minute part of an ``MM:SS`` clock string.

    The seconds are validated by the pattern but discarded. Anything that is
    not a non-empty string of the form ``M:SS`` .. ``MMM:SS`` (surrounding
    whitespace allowed) yields ``None``.
    """
    if isinstance(s, str) and s:
        match = re.match(r'^(\d{1,3}):(\d{2})$', s.strip())
        if match is not None:
            minutes, _seconds = match.groups()
            return int(minutes)
    return None
|
||||
|
||||
|
||||
def _ascii_lower(s):
|
||||
t = (s or '').lower()
|
||||
for old, new in [('š','s'),('č','c'),('ć','c'),('ž','z'),('đ','d')]:
|
||||
t = t.replace(old, new)
|
||||
return t
|
||||
|
||||
|
||||
def name_tokens(naziv):
    """Distinctive tokens from a club name, stripped of generic noise.

    Punctuation other than hyphens is replaced by spaces, the text is
    lower-cased and ASCII-folded, then split on whitespace. Tokens of two
    characters or fewer and tokens listed in ``NOISE_TOKENS`` are dropped.
    Order and duplicates are preserved.
    """
    # \w is Unicode-aware for str patterns, so it already covers the Croatian
    # letters; no explicit (and easily garbled) letter list is needed.
    cleaned = re.sub(r'[^\w\s-]', ' ', naziv or '')
    folded = _ascii_lower(cleaned)
    return [
        tok for tok in re.split(r'\s+', folded)
        if tok and tok not in NOISE_TOKENS and len(tok) > 2
    ]
|
||||
|
||||
|
||||
def name_abbrev(naziv):
    """Acronym from significant tokens, e.g. 'Flumen Sancti Viti' → 'fsv'.

    Returns ``None`` when the name has fewer than two significant tokens.
    """
    tokens = name_tokens(naziv)
    if len(tokens) >= 2:
        # tok[:1] is '' for an empty token, which matches skipping it.
        initials = [tok[:1] for tok in tokens]
        return ''.join(initials)
    return None
|
||||
|
||||
|
||||
def fuzzy_klub_match(klub_naziv, side_name):
    """True iff klub_naziv likely refers to the same club as side_name.

    Strategies, tried in order:
      1. token overlap (3+ char tokens, post noise filter).
      2. abbreviation match: the acronym of one name (3+ chars) occurs in the
         letters-only slug of the other (e.g. 'FSV' = 'Flumen Sancti Viti').
      3. a 4+ char token of one name occurs in the letters-only slug of the
         other (e.g. 'kvarner' in both).
    """
    a = set(name_tokens(klub_naziv))
    b = set(name_tokens(side_name))
    if a & b:
        return True

    def slug(name):
        # Fold diacritics first, then keep only a-z, so letters such as
        # ć/ž/Š are folded to c/z/s rather than stripped from the slug.
        return re.sub(r'[^a-z]', '', _ascii_lower(name or ''))

    side_clean = slug(side_name)
    klub_clean = slug(klub_naziv)

    abb_a = name_abbrev(klub_naziv) or ''
    abb_b = name_abbrev(side_name) or ''
    if len(abb_a) >= 3 and abb_a in side_clean:
        return True
    if len(abb_b) >= 3 and abb_b in klub_clean:
        return True

    if any(len(tok) >= 4 and tok in side_clean for tok in a):
        return True
    return any(len(tok) >= 4 and tok in klub_clean for tok in b)
|
||||
|
||||
|
||||
def best_search_token(naziv):
    """Pick the most distinctive single token for HKS search (e.g. 'Škrljevo').

    The longest noise-filtered token wins. When filtering leaves nothing
    (e.g. 'KK Rijeka - Rijeka' → 'Rijeka') the longest word of 3+ characters
    is used instead; with no such word *naziv* is returned unchanged. The
    result is returned in its original spelling when a word of *naziv* folds
    to the chosen token, otherwise in folded form.
    """
    words = re.findall(r'\w+', naziv or '')
    tokens = name_tokens(naziv)
    if tokens:
        chosen = max(tokens, key=len)
    else:
        # Noise-only club name: fall back to the longest usable word.
        long_words = [w for w in words if len(w) >= 3]
        if not long_words:
            return naziv
        chosen = _ascii_lower(max(long_words, key=len))
    # Prefer the original casing/diacritics for the search query.
    return next((w for w in words if _ascii_lower(w) == chosen), chosen)
|
||||
|
||||
|
||||
class HKSHarvester(SportHarvester):
    """Harvest basketball rosters and per-match box scores for PGŽ clubs.

    Flow, as implemented below: search hks-cbf.hr for a distinctive token of
    the club name, collect article links from the result page, pull FIBA
    LiveStats match ids out of every article, then read each match's
    ``data.json`` feed and upsert the players and stats of the side whose
    name fuzzy-matches the club.
    """

    SPORT = 'košarka'
    SOURCE = 'hks_cbf'

    def get_target_klubovi(self, limit=999):
        """Return up to *limit* priority clubs of this sport as dict rows.

        Overrides the base selection so that every row of
        ``pgz_sport.v_pgz_priority_klubovi`` for this sport is eligible, not
        only funded / yearbook-listed ones. Funded clubs sort first, then
        yearbook-listed ones, then by id.
        """
        from psycopg2.extras import RealDictCursor
        with self.conn.cursor(cursor_factory=RealDictCursor) as cur:
            cur.execute("""
                SELECT * FROM pgz_sport.v_pgz_priority_klubovi
                WHERE sport = %s
                ORDER BY financiran DESC NULLS LAST,
                         u_godisnjaku DESC NULLS LAST,
                         id
                LIMIT %s
            """, (self.SPORT, limit))
            return cur.fetchall()

    def __init__(self):
        super().__init__()
        self.http = requests.Session()
        self.http.headers.update({
            "User-Agent": UA,
            "Accept-Language": "hr,en;q=0.8",
            "Accept-Encoding": "gzip, deflate",  # avoid brotli — requests' decoder is flaky on chunked br
        })
        self._seen_matches = set()   # match ids already attempted, shared by all clubs
        self._klub_match_count = 0   # matches harvested for the current club

    def _get(self, url, retries=1):
        """GET *url* with the shared session; return the response or None.

        Only HTTP 200 counts as success. Up to ``retries`` extra attempts are
        made, with a short pause between attempts (not after the last one).
        The final failure reason is logged.
        """
        last_err = None
        for attempt in range(retries + 1):
            if attempt:
                time.sleep(0.6)
            try:
                r = self.http.get(url, timeout=HTTP_TIMEOUT)
            except requests.RequestException as e:
                last_err = str(e)
                continue
            if r.status_code == 200:
                return r
            last_err = f"HTTP {r.status_code}"
        self.log(f" GET fail {url}: {last_err}")
        return None

    def scrape_klub(self, page, klub):
        """Find articles for *klub* on hks-cbf.hr and harvest their matches.

        *page* is accepted for interface compatibility and is not used here;
        all requests go through plain HTTP. Stops early once
        ``MAX_MATCHES_PER_KLUB`` matches have been harvested for the club.
        """
        self._klub_match_count = 0
        token = best_search_token(klub['naziv'])
        if not token or len(token) < 3:
            self.log(f" 🏀 Klub {klub['id']} {klub['naziv']} → no usable token, skip")
            return
        search_url = f"https://www.hks-cbf.hr/?s={urllib.parse.quote_plus(token)}"
        self.log(f" 🏀 Klub {klub['id']} {klub['naziv']} → token='{token}'")

        r = self._get(search_url)
        if not r:
            return
        html = r.text
        time.sleep(HTTP_PAUSE_S)

        # Unique article URLs in page order, capped per club.
        articles = []
        seen = set()
        for m in ARTICLE_RE.finditer(html):
            art = m.group(1)
            if art in seen:
                continue
            seen.add(art)
            articles.append(art)
            if len(articles) >= MAX_ARTICLES_PER_KLUB:
                break
        self.log(f" {len(articles)} article(s)")
        if not articles:
            return

        # Record the first article as the club's source URL, but only when
        # the club has no real URL yet (empty or the yearbook placeholder).
        with self.conn.cursor() as cur:
            cur.execute("""
                UPDATE pgz_sport.klubovi
                SET source_url = %s
                WHERE id = %s AND (source_url IS NULL OR source_url = '' OR source_url = 'godisnjak_zspgz_2025')
            """, (articles[0], klub['id']))

        for art_url in articles:
            try:
                self._scrape_article(klub, art_url)
            except Exception as e:
                # Best effort: one broken article must not stop the club.
                self.stats['errors'] += 1
                self.log(f" ❌ article {art_url}: {e}")
            if self._klub_match_count >= MAX_MATCHES_PER_KLUB:
                self.log(f" cap reached ({MAX_MATCHES_PER_KLUB} matches)")
                break

    def scrape_player(self, page, person_id):
        """Scrape an individual player career page from HKS statistika.

        The Genius Sports widget is JS-rendered, so the Playwright *page* is
        used. Returns ``{"person_id", "url", "tables"}`` where ``tables``
        holds the text of the first three tables, or None on any failure.
        """
        url = f"https://www.hks-cbf.hr/statistika/?WHurl=%2Fperson%2F{person_id}"
        try:
            page.goto(url, wait_until="domcontentloaded", timeout=20000)
            page.wait_for_timeout(4000)  # give the widget time to render
            tables = [t.inner_text() for t in page.locator('table').all()[:3]]
            return {"person_id": person_id, "url": url, "tables": tables}
        except Exception as e:
            self.log(f" ❌ scrape_player({person_id}): {e}")
            return None

    def _scrape_article(self, klub, art_url):
        """Harvest every not-yet-seen FIBA match id referenced by *art_url*.

        League section and year are read from the article URL
        (``/<section>/<yyyy>/``); the season is recorded as
        ``<yyyy-1>/<yyyy>``.
        """
        r = self._get(art_url)
        if not r:
            return
        time.sleep(HTTP_PAUSE_S)
        section_match = re.search(r'https://www\.hks-cbf\.hr/([^/]+)/(\d{4})/', art_url)
        section = section_match.group(1) if section_match else ""
        year = int(section_match.group(2)) if section_match else None
        kategorija = LIGA_SECTIONS.get(section)
        sezona = f"{year-1}/{year}" if year else None

        # Unique match ids in order of appearance.
        matchids = list(dict.fromkeys(m.group(1) for m in FIBA_MATCHID_RE.finditer(r.text)))

        for mid in matchids:
            # NOTE(review): the seen-set is shared by all clubs and is marked
            # before the side match is known, so a match first reached via
            # another club's search is never retried for the club that
            # actually played it — confirm this is intended.
            if mid in self._seen_matches:
                continue
            self._seen_matches.add(mid)
            try:
                self._harvest_match(klub, mid, art_url, kategorija, sezona, section)
            except Exception as e:
                self.stats['errors'] += 1
                self.log(f" ❌ match {mid}: {e}")
            if self._klub_match_count >= MAX_MATCHES_PER_KLUB:
                return

    def _harvest_match(self, klub, matchid, art_url, kategorija, sezona, section):
        """Upsert players and stats of *klub*'s side from one LiveStats feed.

        Does nothing when the feed cannot be fetched or parsed, has no
        ``tm`` section, or neither side's name fuzzy-matches the club.
        """
        url = f"https://fibalivestats.dcd.shared.geniussports.com/data/{matchid}/data.json"
        r = self._get(url, retries=2)
        if not r:
            return
        try:
            data = r.json()
        except ValueError as e:
            self.log(f" ⚠️ {matchid} JSON parse: {e}")
            return
        time.sleep(HTTP_PAUSE_S)

        tm = data.get('tm') or {}
        if not tm:
            return

        side_key = None
        for side in ('1', '2'):
            t = tm.get(side) or {}  # tolerate an explicit null side
            tname = t.get('name') or t.get('nameInternational') or ''
            if fuzzy_klub_match(klub['naziv'], tname):
                side_key = side
                break
        if not side_key:
            n1 = (tm.get('1') or {}).get('name')
            n2 = (tm.get('2') or {}).get('name')
            self.log(f" ⚠️ {matchid} no side match for '{klub['naziv']}' (sides: {n1!r}, {n2!r})")
            return

        team = tm[side_key]
        klub_naziv = team.get('name') or klub['naziv']
        natjecanje = kategorija or section or "košarka"
        natjecanje_match = f"{natjecanje} match {matchid}"

        # 'pl' is handled both as a mapping and as a plain sequence.
        players = team.get('pl') or {}
        iter_pairs = players.items() if isinstance(players, dict) else enumerate(players)

        added = 0
        for pkey, p in iter_pairs:
            if not isinstance(p, dict):
                continue
            ime = (p.get('firstName') or p.get('internationalFirstName') or '').strip()
            prezime = (p.get('familyName') or p.get('internationalFamilyName') or '').strip()
            if not (ime or prezime):
                continue
            full_slug = self.slugify(f"{ime} {prezime}")
            source_id = full_slug or f"m{matchid}p{pkey}"
            photo = p.get('photoT')
            if isinstance(photo, dict):
                photo = photo.get('url')
            extra = {
                "shirtNumber": p.get('shirtNumber'),
                "playingPosition": p.get('playingPosition'),
                "scoreboardName": p.get('scoreboardName'),
                "photoT": photo,
                "matchids_seen": [matchid],
            }
            try:
                clan_id = self.upsert_clan(
                    klub_id=klub['id'],
                    source_id=source_id,
                    ime=ime,
                    prezime=prezime,
                    source_url=art_url,
                    kategorija=kategorija,
                    sezona=sezona,
                    extra=extra,
                )
            except Exception as e:
                self.stats['errors'] += 1
                self.log(f" ❌ upsert_clan {ime} {prezime}: {e}")
                continue

            stats = {
                'nastupi': 1,
                'golovi': None,
                'asistencije': p.get('sAssists'),
                'bodovi': p.get('sPoints'),
                'trice': p.get('sThreePointersMade'),
                'skokovi': p.get('sReboundsTotal'),
                'blokade': p.get('sBlocks'),
                'servis_asovi': None,
                'zuti': None,
                # Five or more personal fouls are stored in the 'crveni' column.
                'crveni': 1 if (p.get('sFoulsPersonal') or 0) >= 5 else 0,
                'minute': parse_mm_ss(p.get('sMinutes')),
            }
            try:
                self.upsert_stats(
                    clan_id=clan_id,
                    sezona=sezona,
                    klub_id=klub['id'],
                    klub_naziv=klub_naziv,
                    natjecanje=natjecanje_match,
                    kategorija=kategorija,
                    stats_dict=stats,
                    raw={'matchid': matchid, 'art_url': art_url, 'player': p},
                )
                self.stats['stats'] += 1
            except Exception as e:
                self.stats['errors'] += 1
                self.log(f" ❌ upsert_stats {ime} {prezime}: {e}")
                continue
            added += 1

        self.stats['players'] += added
        self._klub_match_count += 1
        self.log(f" ✅ {matchid} side={side_key} '{klub_naziv}' → {added} players")
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Optional first CLI argument caps how many clubs are processed.
    club_limit = int(sys.argv[1]) if len(sys.argv) > 1 else 50
    HKSHarvester().run(limit=club_limit)
|
||||
|
||||
@@ -1,21 +1,416 @@
|
||||
#!/usr/bin/env python3
|
||||
"""HOS volleyball harvester."""
|
||||
import sys
|
||||
# hos_volleyball.py
|
||||
# v1.0.0 — dradulic@outlook.com / damir@rinet.one — 2026-05-05
|
||||
# HOS odbojka harvester: hos-cvf.hr natjecanja + standings, hos-web.dataproject.com match stats.
|
||||
# Targets all 77 PGŽ odbojka klubova.
|
||||
|
||||
import sys, re, json, time
|
||||
import html as ihtml
|
||||
from datetime import datetime
|
||||
import requests
|
||||
from psycopg2.extras import RealDictCursor
|
||||
|
||||
sys.path.insert(0, '/opt/pgz-sport/scripts/sport_harvesters')
|
||||
from __base import SportHarvester
|
||||
|
||||
|
||||
UA = "RiNET-Civic/1.0 (https://rinet.one)"
|
||||
HDR = {"User-Agent": UA}
|
||||
|
||||
|
||||
def _http_get(url, retries=1):
    """Fetch *url* and return the response body as text.

    Makes one attempt plus up to *retries* more, pausing two seconds
    between attempts (never after the final one). A response counts as
    success only when it is HTTP 200 with a non-empty body.

    Raises:
        RuntimeError: every attempt failed; the message carries the last
            HTTP status or transport error.
    """
    last = None
    for attempt in range(retries + 1):
        if attempt:
            time.sleep(2)
        try:
            r = requests.get(url, headers=HDR, timeout=25)
        except requests.RequestException as e:
            last = str(e)
            continue
        if r.status_code == 200 and r.text:
            return r.text
        last = f"HTTP {r.status_code}"
    raise RuntimeError(f"GET {url} failed: {last}")
|
||||
|
||||
|
||||
def _strip_tags(s):
|
||||
return ihtml.unescape(re.sub(r'<[^>]+>', '', s or '')).strip()
|
||||
|
||||
|
||||
def _parse_standings(html):
|
||||
"""Return list of {poz, klub, uk, pob, por, bod} from first plausible table."""
|
||||
tables = re.findall(r'<table[^>]*>(.+?)</table>', html, re.DOTALL | re.IGNORECASE)
|
||||
for tbl in tables:
|
||||
rows = re.findall(r'<tr[^>]*>(.+?)</tr>', tbl, re.DOTALL | re.IGNORECASE)
|
||||
out = []
|
||||
for row in rows:
|
||||
cells = re.findall(r'<t[hd][^>]*>(.*?)</t[hd]>', row, re.DOTALL | re.IGNORECASE)
|
||||
clean = [_strip_tags(c) for c in cells]
|
||||
if not clean or not clean[0]:
|
||||
continue
|
||||
if clean[0] in ('', '#', 'Pos', 'Poz', 'R'):
|
||||
continue
|
||||
try:
|
||||
m = re.match(r'(\d+)\.?', clean[0])
|
||||
if not m:
|
||||
continue
|
||||
poz = int(m.group(1))
|
||||
if len(clean) < 5:
|
||||
continue
|
||||
klub = clean[2] if (len(clean) >= 7 and not re.match(r'^\d+$', clean[2])) else clean[1]
|
||||
if not klub or re.match(r'^\d+$', klub):
|
||||
continue
|
||||
numcells = [c for c in clean if re.match(r'^-?\d+$', c)]
|
||||
if len(numcells) < 4:
|
||||
continue
|
||||
tail = numcells[1:]
|
||||
uk = int(tail[0])
|
||||
pob = int(tail[1])
|
||||
por = int(tail[2])
|
||||
bod = int(tail[-1])
|
||||
out.append({'poz': poz, 'klub': klub, 'uk': uk, 'pob': pob, 'por': por, 'bod': bod})
|
||||
except Exception:
|
||||
continue
|
||||
if out and len(out) >= 2:
|
||||
return out
|
||||
return []
|
||||
|
||||
|
||||
def _parse_title(html):
    """Extract a competition title from a page.

    The first ``<h1>`` wins when its cleaned text is longer than four
    characters; otherwise the cleaned ``<title>`` text is used. Returns
    None when the page has neither a usable ``<h1>`` nor a ``<title>``.
    """
    flags = re.DOTALL | re.IGNORECASE
    h1 = re.search(r'<h1[^>]*>(.*?)</h1>', html, flags)
    heading = _strip_tags(h1.group(1)) if h1 else ''
    if len(heading) > 4:
        return heading
    title_tag = re.search(r'<title[^>]*>(.*?)</title>', html, flags)
    return _strip_tags(title_tag.group(1)) if title_tag else None
|
||||
|
||||
|
||||
def _detect_razina_spol(title):
|
||||
t = (title or '').lower()
|
||||
razina = None
|
||||
for key, lab in [
|
||||
('superliga 2', 'Superliga 2'),
|
||||
('superliga', 'Superliga'),
|
||||
('1. liga', '1.liga'), ('1.liga', '1.liga'),
|
||||
('2. liga', '2.liga'), ('2.liga', '2.liga'),
|
||||
('3. liga', '3.liga'), ('3.liga', '3.liga'),
|
||||
('kup', 'Kup'),
|
||||
('kadeti', 'Kadeti'), ('kadetkinje', 'Kadetkinje'),
|
||||
('juniori', 'Juniori'), ('juniorke', 'Juniorke'),
|
||||
('mini', 'Mini'),
|
||||
('beach', 'Beach'), ('pijesku', 'Beach'),
|
||||
]:
|
||||
if key in t:
|
||||
razina = lab
|
||||
break
|
||||
spol = None
|
||||
if re.search(r'\(\s*[mM]\s*\)|\bmu[šs]ki\b|\bmuska\b|\bjuniori\b|\bkadeti\b', t):
|
||||
spol = 'M'
|
||||
elif re.search(r'\(\s*[ŽzZ]\s*\)|\bžen|\bjuniorke\b|\bkadetkinje\b', t):
|
||||
spol = 'Ž'
|
||||
return razina, spol
|
||||
|
||||
|
||||
class HOSHarvester(SportHarvester):
    """Harvest volleyball competitions, standings and match rosters.

    ``run`` first walks the competitions linked from hos-cvf.hr (standings
    tables and dataproject match ids), then lets the base class iterate the
    target clubs; ``scrape_klub`` attaches the collected competitions to
    each club and scrapes a capped number of match pages.
    """

    SPORT = 'odbojka'
    SOURCE = 'hos'

    BASE_CVF = 'https://hos-cvf.hr'
    BASE_DP = 'https://hos-web.dataproject.com'
    SEZONA = '2025/26'
    MAX_NATJ = 80               # competitions visited per run
    MAX_MATCHES_PER_KLUB = 5    # match pages scraped per club
    MAX_MATCHES_TOTAL = 120     # match pages scraped per run

    # "<jersey> <Name Name ...> <rest>" as it appears in a statistics row.
    _PLAYER_ROW_RE = re.compile(
        r'^(\d{1,3})\s+([A-ZČĆŽŠĐ][\wČĆŽŠĐčćžšđ\.\-\']+(?:\s+[A-ZČĆŽŠĐ][\wČĆŽŠĐčćžšđ\.\-\']+)+)\b(.*)$'
    )

    def __init__(self):
        super().__init__()
        self._natj_by_klub = {}         # klub_id -> standings entries found for it
        self._matches_for_klub = {}     # klub_id -> match descriptors of its competitions
        self._dp_match_seen = set()     # dataproject match ids already opened
        self._matches_scraped_total = 0
        for counter in ('natjecanja', 'tablice', 'matches'):
            self.stats.setdefault(counter, 0)

    def get_target_klubovi(self, limit=999):
        """Return up to *limit* priority clubs of this sport as dict rows.

        Clubs that are funded or yearbook-listed sort first. NULLS LAST is
        explicit because PostgreSQL puts NULLs first under DESC, and
        ``false OR NULL`` evaluates to NULL.
        """
        with self.conn.cursor(cursor_factory=RealDictCursor) as cur:
            cur.execute("""
                SELECT * FROM pgz_sport.v_pgz_priority_klubovi
                WHERE sport = %s
                ORDER BY (financiran OR u_godisnjaku) DESC NULLS LAST, id
                LIMIT %s
            """, (self.SPORT, limit))
            return cur.fetchall()

    def _discover_natjecanje_ids(self):
        """Return competition ids linked from the hos-cvf.hr front page.

        Ids are unique, ascending and capped at ``MAX_NATJ``. Returns an
        empty list (and logs) when the page cannot be loaded.
        """
        try:
            html = _http_get(self.BASE_CVF + '/')
        except Exception as e:
            # Best effort: a failed discovery means an empty preflight.
            self.log(f"⚠ failed to load hos-cvf.hr: {e}")
            return []
        ids = sorted({int(m) for m in re.findall(r'natjecanje\.php\?id=(\d+)', html)})
        self.log(f" found {len(ids)} natjecanje ids on hos-cvf.hr")
        return ids[:self.MAX_NATJ]

    def _upsert_natjecanje(self, nid, naziv, razina, spol, source_url):
        """Insert or refresh one competition row and return its database id.

        Rows are keyed by ``(source, external_id)``. On update, ``razina``
        and ``spol`` keep their stored value when the new one is NULL.
        """
        with self.conn.cursor() as cur:
            cur.execute("""
                INSERT INTO pgz_sport.natjecanja
                    (sport, naziv, razina, sezona, spol, source, external_id, external_url,
                     source_id, source_url, status, updated_at)
                VALUES ('odbojka', %s, %s, %s, %s, 'hos_cvf', %s, %s, %s, %s, 'aktivno', now())
                ON CONFLICT (source, external_id) DO UPDATE
                SET naziv = EXCLUDED.naziv,
                    razina = COALESCE(EXCLUDED.razina, pgz_sport.natjecanja.razina),
                    spol = COALESCE(EXCLUDED.spol, pgz_sport.natjecanja.spol),
                    sezona = EXCLUDED.sezona,
                    source_url = EXCLUDED.source_url,
                    external_url = EXCLUDED.external_url,
                    updated_at = now()
                RETURNING id
            """, (naziv, razina, self.SEZONA, spol, str(nid), source_url, str(nid), source_url))
            return cur.fetchone()[0]

    def _find_klub_id(self, klub_naziv):
        """Resolve a standings club name to a ``klubovi.id``, or None.

        First a case-insensitive exact / substring SQL match (active clubs
        and region 'PGŽ' preferred). Failing that, slug tokens longer than
        three characters are compared against every active volleyball club;
        the best club is accepted when at most one token is missing.
        """
        with self.conn.cursor() as cur:
            cur.execute("""
                SELECT id, region FROM pgz_sport.klubovi
                WHERE sport = 'odbojka'
                  AND (LOWER(naziv) = LOWER(%s) OR LOWER(naziv) LIKE LOWER(%s))
                ORDER BY CASE WHEN aktivan THEN 0 ELSE 1 END,
                         CASE WHEN region='PGŽ' THEN 0 ELSE 1 END,
                         id
                LIMIT 1
            """, (klub_naziv, f"%{klub_naziv}%"))
            r = cur.fetchone()
            if r:
                return r[0]

        toks = [t for t in self.slugify(klub_naziv).split('-') if len(t) > 3]
        if not toks:
            return None
        with self.conn.cursor() as cur:
            cur.execute("""
                SELECT id, naziv FROM pgz_sport.klubovi
                WHERE sport='odbojka' AND aktivan
            """)
            best = None
            best_score = 0
            for kid, knaz in cur.fetchall():
                kslug = self.slugify(knaz)
                score = sum(1 for t in toks if t in kslug)
                if score > best_score:
                    best_score = score
                    best = kid
        if best_score >= max(1, len(toks) - 1):
            return best
        return None

    def _replace_tablice(self, natj_id, source_url, rows, spol):
        """Replace the stored standings of one competition with *rows*.

        Existing 'hos_cvf' rows of the competition are deleted first. Every
        row whose club resolves to a known id is also remembered in
        ``_natj_by_klub`` for the later per-club pass.
        """
        with self.conn.cursor() as cur:
            cur.execute("DELETE FROM pgz_sport.natjecanja_tablice WHERE natjecanje_id=%s AND source='hos_cvf'", (natj_id,))
            for r in rows:
                klub_id = self._find_klub_id(r['klub'])
                cur.execute("""
                    INSERT INTO pgz_sport.natjecanja_tablice
                        (natjecanje_id, klub_id, klub_naziv, pozicija, odigrano, pobjede,
                         nerijeseno, porazi, gol_z, gol_p, gol_razlika, bodovi,
                         source, source_url, updated_at, extra_data)
                    VALUES (%s, %s, %s, %s, %s, %s, 0, %s, 0, 0, 0, %s,
                            'hos_cvf', %s, now(), %s::jsonb)
                    ON CONFLICT (natjecanje_id, klub_naziv) DO UPDATE SET
                        pozicija = EXCLUDED.pozicija,
                        odigrano = EXCLUDED.odigrano,
                        pobjede = EXCLUDED.pobjede,
                        porazi = EXCLUDED.porazi,
                        bodovi = EXCLUDED.bodovi,
                        klub_id = COALESCE(EXCLUDED.klub_id, pgz_sport.natjecanja_tablice.klub_id),
                        source_url = EXCLUDED.source_url,
                        updated_at = now()
                """, (natj_id, klub_id, r['klub'], r['poz'], r['uk'], r['pob'], r['por'],
                      r['bod'], source_url, json.dumps({'spol': spol})))
                if klub_id:
                    self._natj_by_klub.setdefault(klub_id, []).append({
                        'natj_id': natj_id,
                        'natj_naziv': None,  # filled in by _harvest_natjecanje
                        'url': source_url,
                        'klub_naziv': r['klub'],
                        'pozicija': r['poz'],
                    })

    def _harvest_natjecanje(self, nid):
        """Fetch one competition page; store its row, standings and match ids."""
        url = f"{self.BASE_CVF}/natjecanje.php?id={nid}"
        try:
            html = _http_get(url)
        except Exception as e:
            self.log(f" ⚠ natj {nid}: {e}")
            self.stats['errors'] += 1
            return
        title = _parse_title(html) or f"HOS natjecanje #{nid}"
        razina, spol = _detect_razina_spol(title)
        natj_id = self._upsert_natjecanje(nid, title, razina, spol, url)

        rows = _parse_standings(html)
        if rows:
            self._replace_tablice(natj_id, url, rows, spol)
            self.stats['tablice'] += len(rows)
            # Back-fill the competition title on the entries just recorded.
            for entries in self._natj_by_klub.values():
                for e in entries:
                    if e['natj_id'] == natj_id and e.get('natj_naziv') is None:
                        e['natj_naziv'] = title

        mids = sorted({int(m) for m in re.findall(r'MatchStatistics\.aspx\?mID=(\d+)', html, re.IGNORECASE)})
        if mids:
            # NOTE(review): every club of the competition receives ALL match
            # ids found on the page, not only the matches it played in —
            # confirm this is intended.
            klub_ids_here = [kid for kid, entries in self._natj_by_klub.items()
                             if any(e['natj_id'] == natj_id for e in entries)]
            for kid in klub_ids_here:
                self._matches_for_klub.setdefault(kid, []).extend(
                    {'mid': mid, 'natj_id': natj_id, 'natj_naziv': title} for mid in mids
                )
        self.stats['natjecanja'] += 1

    def _harvest_federation(self):
        """Preflight: visit every discovered competition once."""
        self.log("📋 preflight: hos-cvf.hr natjecanja discovery")
        for nid in self._discover_natjecanje_ids():
            self._harvest_natjecanje(nid)
        self.log(f" preflight done: natjecanja={self.stats['natjecanja']}, "
                 f"tablice={self.stats['tablice']}, klubova_with_match={len(self._natj_by_klub)}")

    def _scrape_dp_match(self, page, mid, klub_id, klub_naziv, natj_naziv):
        """Scrape one dataproject match page; return the number of players stored.

        Returns 0 without loading anything when the match was already opened
        or the per-run cap is reached. Rows that look like
        ``<jersey> <Name ...> <numbers>`` are stored as players of *klub_id*.
        """
        if mid in self._dp_match_seen:
            return 0
        if self._matches_scraped_total >= self.MAX_MATCHES_TOTAL:
            return 0
        url = f"{self.BASE_DP}/MatchStatistics.aspx?mID={mid}"
        added = 0
        try:
            page.goto(url, wait_until='domcontentloaded', timeout=30000)
            try:
                page.wait_for_load_state('networkidle', timeout=10000)
            except Exception:
                pass  # a page that never goes idle is still parsed as loaded
            self._dp_match_seen.add(mid)
            self._matches_scraped_total += 1
            self.stats['matches'] += 1

            # Use the first selector that yields any rows.
            rows = []
            for sel in ['table.statTbl tr', 'table.report tr', 'table tr']:
                try:
                    txts = page.locator(sel).all_inner_texts()
                except Exception:
                    txts = []
                if txts:
                    rows = txts
                    break

            # NOTE(review): no team filter is applied, so every player row on
            # the page is attributed to klub_id — confirm against page layout.
            for txt in rows:
                line = re.sub(r'\s+', ' ', txt.replace('\t', ' ')).strip()
                if not line:
                    continue
                m = self._PLAYER_ROW_RE.match(line)
                if not m:
                    continue
                jersey = m.group(1)
                fullname = m.group(2).strip()
                nums = [int(x) for x in re.findall(r'-?\d+', m.group(3).strip())]
                if not nums:
                    continue
                # Column positions as used by the original parser —
                # presumably points / aces / blocks; verify against the table.
                pts = nums[0]
                aces = nums[5] if len(nums) > 5 else None
                blocks = nums[7] if len(nums) > 7 else None

                parts = fullname.split()
                if parts[0].isupper() and len(parts) >= 2:
                    # "SURNAME Name" layout.
                    prezime = parts[0].title()
                    ime = ' '.join(parts[1:])
                else:
                    ime = parts[0]
                    prezime = ' '.join(parts[1:]) if len(parts) > 1 else ''
                slug_key = self.slugify(fullname)
                source_id = f"dp:{mid}:{jersey}:{slug_key}"
                try:
                    clan_id = self.upsert_clan(
                        klub_id=klub_id, source_id=source_id,
                        ime=ime, prezime=prezime,
                        source_url=url, kategorija='senior', sezona=self.SEZONA,
                        extra={'dp_match_id': mid, 'jersey': jersey},
                    )
                    self.upsert_stats(
                        clan_id=clan_id, sezona=self.SEZONA,
                        klub_id=klub_id, klub_naziv=klub_naziv,
                        natjecanje=natj_naziv, kategorija='senior',
                        stats_dict={
                            'nastupi': 1,
                            'bodovi': pts,
                            'servis_asovi': aces,
                            'blokade': blocks,
                        },
                        raw={'mid': mid, 'jersey': jersey, 'name': fullname, 'tail_nums': nums},
                    )
                    self.stats['players'] += 1
                    self.stats['stats'] += 1
                    added += 1
                except Exception as e:
                    self.stats['errors'] += 1
                    self.log(f" ⚠ upsert player '{fullname}': {e}")
        except Exception as e:
            self.log(f" ⚠ dp match {mid}: {e}")
            self.stats['errors'] += 1
        return added

    def scrape_klub(self, page, klub):
        """Attach preflight results to *klub* and scrape its match pages."""
        self.log(f" 🏐 Klub {klub['id']} {klub['naziv']}")
        entries = list(self._natj_by_klub.get(klub['id'], []))
        if not entries:
            # Fallback: take the first recorded standings entry whose club
            # name shares all but at most one long slug token with this club.
            ktoks = [t for t in self.slugify(klub['naziv']).split('-') if len(t) > 3]
            if ktoks:
                needed = max(1, len(ktoks) - 1)
                for ents in list(self._natj_by_klub.values()):
                    for e in ents:
                        eslug = self.slugify(e['klub_naziv'])
                        if sum(1 for t in ktoks if t in eslug) >= needed:
                            entries.append(e)
                            break
                    if entries:
                        break

        if entries:
            first = entries[0]
            with self.conn.cursor() as cur:
                cur.execute("""
                    UPDATE pgz_sport.klubovi
                    SET source_url = COALESCE(NULLIF(source_url, ''), %s),
                        source = COALESCE(source, 'hos_cvf'),
                        last_scraped_at = now()
                    WHERE id = %s
                """, (first['url'], klub['id']))
            naz_list = ', '.join(sorted({(e.get('natj_naziv') or '?') for e in entries}))[:120]
            self.log(f" ↳ {len(entries)} natjecanja: {naz_list}")
        else:
            self.log(f" · no HOS natjecanje hit")

        match_bucket = self._matches_for_klub.get(klub['id'], [])
        if not match_bucket and entries:
            # Borrow the match list of any club recorded in the same competition.
            for kid, ents in self._natj_by_klub.items():
                if any(e['natj_id'] == entries[0]['natj_id'] for e in ents):
                    match_bucket = self._matches_for_klub.get(kid, [])
                    if match_bucket:
                        break

        scraped_for_klub = 0
        for m in match_bucket:
            if scraped_for_klub >= self.MAX_MATCHES_PER_KLUB:
                break
            if self._matches_scraped_total >= self.MAX_MATCHES_TOTAL:
                break
            n = self._scrape_dp_match(page, m['mid'], klub['id'], klub['naziv'], m['natj_naziv'] or 'HOS')
            if n > 0:
                scraped_for_klub += 1
        if scraped_for_klub:
            self.log(f" ↳ scraped {scraped_for_klub} match(es) from dataproject")

    def run(self, limit=999):
        """Run the federation preflight, then the base per-club harvest."""
        self._harvest_federation()
        super().run(limit)
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Exactly one harvest per invocation. The optional first CLI argument
    # caps the number of clubs; the default is high enough to cover all.
    HOSHarvester().run(limit=int(sys.argv[1]) if len(sys.argv) > 1 else 999)
|
||||
|
||||
@@ -1,27 +1,489 @@
|
||||
#!/usr/bin/env python3
|
||||
"""HRS handball harvester."""
|
||||
import sys
|
||||
"""
|
||||
hrs_handball.py — HRS Rukomet harvester v1.0
|
||||
Authors: dradulic@outlook.com / damir@rinet.one
|
||||
Date: 2026-05-05
|
||||
Description:
|
||||
Scrapes Hrvatski rukometni savez (HRS) competition data via the
|
||||
sportinfocentar2.com JSON endpoints (no HTML rendering needed):
|
||||
- https://www.sportinfocentar2.com/coman/natjecanje{LID}.js
|
||||
→ league JSON: lige[].utakmice[] {broj, e1, e2, k1, k2, d, ...}
|
||||
- https://www.sportinfocentar2.com/ziceri/webupitisapi.dll?tab=128&utakmica={MID}
|
||||
→ per-match player roster + box-score stats
|
||||
Fuzzy-matches HRS team names to PGŽ priority handball clubs (~71) in
|
||||
pgz_sport.klubovi, then aggregates each player's per-(klub, natjecanje, sezona)
|
||||
totals into pgz_sport.player_stats; upserts pgz_sport.clanovi + clan_kategorije.
|
||||
|
||||
Run:
|
||||
python3 /opt/pgz-sport/scripts/sport_harvesters/hrs_handball.py [LIMIT_NATJECANJA]
|
||||
"""
|
||||
import os, sys, re, json, time, unicodedata
|
||||
import urllib.request
|
||||
import urllib.error
|
||||
from datetime import datetime, date
|
||||
from collections import defaultdict
|
||||
|
||||
sys.path.insert(0, '/opt/pgz-sport/scripts/sport_harvesters')
|
||||
from __base import SportHarvester
|
||||
from __base import SportHarvester # noqa: E402
|
||||
|
||||
# NOTE(review): a second `class HRSHarvester(SportHarvester)` is defined
# further down in this file; at import time that later definition replaces
# this one. This short variant looks like a leftover of the earlier
# search-based version — confirm and remove.
class HRSHarvester(SportHarvester):
|
||||
SPORT = 'rukomet'
|
||||
SOURCE = 'hrs'
|
||||
|
||||
# Builds an hrs.hr search URL from the club name and logs the club.
def scrape_klub(self, page, klub):
|
||||
url = f"https://hrs.hr/?s={klub['naziv'].replace(' ','+')}"
|
||||
self.log(f" 🤾 Klub {klub['id']} {klub['naziv']}")
|
||||
import psycopg2
|
||||
from psycopg2.extras import RealDictCursor
|
||||
|
||||
|
||||
# ─── HRS league IDs (HRS top menu, 2025/26) ────────────────────────────────
|
||||
HRS_NATJECANJA = [
|
||||
# Seniori M
|
||||
1632, # Paket24 Premijer liga (M)
|
||||
1633, # 1. HRL Sjever - M
|
||||
1634, # 1. HRL Jug - M
|
||||
1639, # 2. HRL Istok - M
|
||||
1641, # 2. HRL Zapad - M ★ PGŽ
|
||||
1642, # 2. HRL Sjever - M
|
||||
1643, # 2. HRL Jug - M
|
||||
1675, # 3. HRL Istok - M
|
||||
1676, # 3. HRL Sjever - M
|
||||
1677, # 3. HRL Središte - M
|
||||
1678, # 3. HRL Zapad - M ★ PGŽ
|
||||
1384, # Međužupanijska liga
|
||||
# Seniori Ž
|
||||
1629, # 1. HRL Žene
|
||||
1637, # 2. HRL Sjever - Ž
|
||||
1638, # 2. HRL Zapad - Ž ★ PGŽ
|
||||
1644, # 2. HRL Jug - Ž
|
||||
1671, # 3. HRL Sjever - Ž
|
||||
1672, # 3. HRL Zapad - Ž ★ PGŽ
|
||||
1673, # 3. HRL Središte - Ž
|
||||
1674, # 3. HRL Istok - Ž
|
||||
# Mladi M
|
||||
1389, # 1. HRL U18 - M
|
||||
1705, # 1. HRL U17 - M
|
||||
1763, # 2. HRL U17 - M
|
||||
1706, # 1. HRL U15 - M
|
||||
1716, # 2. HRL U15 - M
|
||||
1707, # 1. HRL U13 - M
|
||||
1717, # 2. HRL U13 - M
|
||||
1746, # 1. HRL U12 - M
|
||||
1709, # 1. HRL U11 - M
|
||||
# Dodatno (linkovi iz sidebara — ako vrate natjecanjeobjekt)
|
||||
1620, 1622, 1625, 1626, 1645, 1646,
|
||||
1761, 1762, 1773, 1753,
|
||||
1774, 1776, 1777, 1783, 1784, 1785, 1786, 1787, 1788,
|
||||
1796, 1797, 1818, 1834,
|
||||
1765, 1766,
|
||||
# Kupovi
|
||||
1092, 1302, 1303, 1441,
|
||||
]
|
||||
|
||||
UA = ("Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
|
||||
"(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
|
||||
|
||||
|
||||
def http_text(url, timeout=20, retries=2):
    """Plain HTTP GET -> text, with a small retry on transient errors.

    sportinfocentar2 files are mostly UTF-8 but occasionally contain stray
    cp1250 bytes (e.g. typographic quotes from Word), so a strict UTF-8
    decode can fail. The body is decoded as UTF-8 with undecodable bytes
    replaced, which keeps the bulk of the file Unicode-correct instead of
    re-decoding everything as latin-1.

    Args:
        url: address to fetch.
        timeout: per-attempt timeout in seconds.
        retries: number of additional attempts after the first one.

    Raises:
        RuntimeError: all attempts failed; chained to the last error.
    """
    last = None
    for attempt in range(retries + 1):
        try:
            req = urllib.request.Request(url, headers={"User-Agent": UA})
            with urllib.request.urlopen(req, timeout=timeout) as r:
                raw = r.read()
        except (urllib.error.URLError, TimeoutError) as e:  # HTTPError is a URLError
            last = e
            if attempt < retries:
                time.sleep(1.5 * (attempt + 1))  # linear back-off between attempts
            continue
        # For valid UTF-8 this equals a strict decode; invalid bytes become
        # U+FFFD instead of raising.
        return raw.decode("utf-8", errors="replace")
    raise RuntimeError(f"GET {url} failed: {last}") from last
|
||||
|
||||
# NOTE(review): this entry point sits in the middle of the module, before
# the helper functions and the later HRSHarvester definition below it.
# It looks like a leftover of the earlier version — confirm, and keep a
# single guard at the very end of the file.
if __name__ == '__main__':
|
||||
HRSHarvester().run(limit=int(sys.argv[1]) if len(sys.argv) > 1 else 50)
|
||||
|
||||
_UNQUOTED_KEY_RE = re.compile(r'([{,]\s*)([A-Za-z_][A-Za-z0-9_]*)\s*:')
_TRAIL_COMMA_RE = re.compile(r',(\s*[}\]])')
_LEADING_ZERO_RE = re.compile(r'([\s:,\[])0+(\d)')
# Either one complete double-quoted string literal or a run of text that
# lies outside any string literal.
_JSON_CHUNK_RE = re.compile(r'"(?:\\.|[^"\\])*"|[^"]+')


def _normalize_lazy_json_chunk(match):
    """Normalize one chunk of lazy JSON; string literals pass through untouched."""
    chunk = match.group(0)
    if chunk.startswith('"'):
        return chunk
    # Quote bare property names.
    chunk = _UNQUOTED_KEY_RE.sub(r'\1"\2":', chunk)
    # Strip JS leading-zero numbers (e.g. `mu: 018,`) that JSON rejects.
    chunk = _LEADING_ZERO_RE.sub(r'\1\2', chunk)
    # Drop trailing commas before a closing brace/bracket.
    return _TRAIL_COMMA_RE.sub(r'\1', chunk)


def parse_var_json(body, var_prefix):
    """Parse the lazy-JSON dialect emitted by sportinfocentar2.

    Strips an optional ``var <name> =`` / ``<name> =`` wrapper and a
    trailing semicolon, then quotes bare keys, removes leading zeros from
    numbers and drops trailing commas. These rewrites are applied only
    outside double-quoted strings, so values such as ``"08:05"`` or
    ``"Rijeka, grad: x"`` keep their exact text.

    Args:
        body: raw response text.
        var_prefix: variable name used in the wrapper (case-insensitive).

    Returns:
        The decoded Python object.

    Raises:
        json.JSONDecodeError: the normalized text is still not valid JSON.
    """
    body = body.strip()
    # Both forms occur: `var foo = ...` (coman/) and bare `foo = ...` (ziceri/).
    m = re.match(rf"^\s*(?:var\s+)?{re.escape(var_prefix)}\s*=\s*", body, re.I)
    if m:
        body = body[m.end():]
    body = body.rstrip().rstrip(";").rstrip()
    body = _JSON_CHUNK_RE.sub(_normalize_lazy_json_chunk, body)
    return json.loads(body)
|
||||
|
||||
|
||||
def derive_sezona(d):
    """Return the Croatian sport season for a calendar date.

    July-December belongs to ``YYYY/YYYY+1``, January-June to
    ``YYYY-1/YYYY``.

    Args:
        d: a ``date``/``datetime``, or a string whose first ten characters
            are ``YYYY-MM-DD``; falsy values are accepted.

    Returns:
        The season as ``"YYYY/YYYY"``, or None when *d* is falsy or the
        string does not start with a valid ISO date.
    """
    if not d:
        return None
    if isinstance(d, str):
        try:
            d = datetime.strptime(d[:10], "%Y-%m-%d").date()
        except ValueError:  # the only failure strptime raises for str input
            return None
    y = d.year
    if d.month >= 7:
        return f"{y}/{y + 1}"
    return f"{y - 1}/{y}"
|
||||
|
||||
|
||||
def derive_kategorija(naziv):
    """Translate a competition name into a handball age category.

    The name is searched case-insensitively for ``uNN`` or ``u-NN``; the
    youngest listed age group that occurs wins. Names without any listed
    age marker are classified as "seniori".
    """
    lowered = (naziv or "").lower()
    age_groups = (
        ("11", "mini U11"),
        ("12", "mini U12"),
        ("13", "dječaci U13"),
        ("15", "mlađi kadeti U15"),
        ("17", "kadeti U17"),
        ("18", "juniori U18"),
    )
    for age, label in age_groups:
        if f"u{age}" in lowered or f"u-{age}" in lowered:
            return label
    return "seniori"
|
||||
|
||||
|
||||
# ─── Klub-name normalization for fuzzy match ──────────────────────────────
|
||||
_DIA = str.maketrans("čćžšđČĆŽŠĐ", "cczsdcczsd")

_PREFIX_RE = re.compile(
    r"^(?:"
    r"hrvatski\s+|muski\s+|zenski\s+|"
    r"rukometni\s+(?:klub|savez)\s+|"
    # Abbreviations must end at a word boundary; otherwise a name such as
    # "Mrkopalj" would lose its first three letters.
    r"(?:hrk|mrk|zrk|rk)\b"
    r")\s*",
    re.I,
)
_TRAIL_LOC_RE = re.compile(r"\s*-\s*[a-z][a-z\s]*$", re.I)
_SUFFIX_2_RE = re.compile(r"\s+(?:ii|2)\s*$", re.I)
_NUMERIC_LIGA_RE = re.compile(r"\d+\.\s+u\s+.*$", re.I)
_PAREN_RE = re.compile(r"\([^)]*\)")
_NON_ALNUM_RE = re.compile(r"[^a-z0-9]+")


def normalize_klub_name(name):
    """Aggressively normalize a Croatian handball club name for comparison.

    Steps, in order: drop parenthesised parts, drop a trailing
    ``<n>. u ...`` fragment, fold Croatian diacritics to ASCII, lowercase,
    strip leading organisational prefixes repeatedly (``rk``, ``zrk``,
    ``rukometni klub`` ...), strip a trailing second-team marker
    (``II``/``2``), strip a trailing ``- <place>`` part, and collapse every
    remaining non-alphanumeric run to a single space.

    Returns an empty string for a falsy *name*.
    """
    if not name:
        return ""
    s = str(name).strip()
    s = _PAREN_RE.sub(" ", s)
    s = _NUMERIC_LIGA_RE.sub("", s)
    s = s.translate(_DIA)
    s = s.lower()
    # Prefixes can be stacked ("Hrvatski rukometni klub ..."), so strip
    # until nothing changes.
    while True:
        new = _PREFIX_RE.sub("", s)
        if new == s:
            break
        s = new
    s = _SUFFIX_2_RE.sub("", s)
    s = _TRAIL_LOC_RE.sub("", s)
    s = _NON_ALNUM_RE.sub(" ", s).strip()
    return s
|
||||
|
||||
|
||||
def is_team_2nd(name):
    """Return True when *name* ends with a second-team marker (" II" or " 2").

    The check is case-insensitive and ignores surrounding whitespace; an empty
    or ``None`` name is never a second team.
    """
    lowered = (name or "").strip().lower()
    marker = re.search(r"\s(?:ii|2)\s*$", lowered)
    return marker is not None
|
||||
|
||||
|
||||
def is_pgz_klub_candidate(naziv):
    """Tell whether a row name looks like an actual club.

    Rejects names containing "savez", "udruga", "zbor" or "trener" (substring
    match on the lower-cased name) and names made up only of a club-type
    abbreviation optionally followed by one of a few listed filler words.
    Everything else, including an empty or ``None`` name, is accepted.
    """
    lowered = (naziv or "").lower()
    for marker in ("savez", "udruga", "zbor", "trener"):
        if marker in lowered:
            return False
    # Junk rows such as 'RK RK': an abbreviation without a real name body.
    # NOTE(review): the pattern lists ASCII spellings only and the name is not
    # diacritic-folded here, so e.g. 'RK PŠR' is NOT rejected — confirm intent.
    junk = re.fullmatch(
        r"\s*(rk|zrk|mrk|hrk)\s*(rk|psr|psr selce|liburnija|mornar|omisalj)?\s*",
        lowered,
    )
    return junk is None
|
||||
|
||||
|
||||
def is_zenska_klub(naziv):
    """Return True when a club name marks a women's club.

    A name qualifies when, after trimming and lower-casing, it starts with
    "ženski"/"zenski", "žrk "/"zrk ", "ž " or "ž.", or contains " žene"/" zene".
    Both the diacritic and the ASCII spelling are accepted for every marker;
    in particular "ŽRK ..." is recognised just like "ZRK ...".

    Args:
        naziv: Club name; ``None`` or empty yields ``False``.
    """
    n = (naziv or "").strip().lower()
    prefixes = ("ženski", "zenski", "žrk ", "zrk ", "ž ", "ž.")
    return n.startswith(prefixes) or " žene" in n or " zene" in n
|
||||
|
||||
|
||||
# ─── Harvester ─────────────────────────────────────────────────────────────
|
||||
class HRSHarvester(SportHarvester):
    """Harvest handball rosters and per-player match statistics for PGŽ clubs.

    Competition feeds and per-match stat tables are fetched over HTTP, feed
    team names are matched against the PGŽ priority clubs, and the aggregated
    results are written through ``upsert_klub_roster`` and the inherited
    ``upsert_clan`` / ``upsert_stats`` helpers.
    """

    SPORT = "rukomet"
    SOURCE = "hrs"

    def __init__(self):
        super().__init__()
        # Normalized club name -> (klub_id, naziv). Men's and women's clubs are
        # kept apart so a league is only matched against clubs of its gender.
        self.team_to_klub_m = {}
        self.team_to_klub_z = {}
        # "<team name> [M|Ž]" labels of feed teams that matched no PGŽ club.
        self.unmatched_teams = set()

    # Override base — base filters financiran||u_godisnjaku (only 3 rukomet rows).
    # Brief mandates ALL 71 PGŽ priority rukomet klubova.
    def get_target_klubovi(self, limit=999):
        """Return up to *limit* priority-club rows (id, naziv, sport) for this sport."""
        with self.conn.cursor(cursor_factory=RealDictCursor) as cur:
            cur.execute("""
                SELECT id, naziv, sport
                FROM pgz_sport.v_pgz_priority_klubovi
                WHERE sport = %s
                ORDER BY id
                LIMIT %s
            """, (self.SPORT, limit))
            return cur.fetchall()

    def build_klub_maps(self):
        """Fill the per-gender name→club maps from the target club list.

        Rows rejected by ``is_pgz_klub_candidate`` or normalizing to an empty
        name are skipped. When several rows share a normalized name, the row
        with the lowest id is kept.
        """
        for row in self.get_target_klubovi(999):
            naziv = row["naziv"]
            if not is_pgz_klub_candidate(naziv):
                continue
            norm = normalize_klub_name(naziv)
            if not norm:
                continue
            target = self.team_to_klub_z if is_zenska_klub(naziv) else self.team_to_klub_m
            known = target.get(norm)
            if known is None or row["id"] < known[0]:
                target[norm] = (row["id"], naziv)
        self.log(f"klub maps: men={len(self.team_to_klub_m)} women={len(self.team_to_klub_z)}")

    def match_team(self, hrs_team_name, is_zenska_liga):
        """Direct → token-subset → fallback. Tokens come from normalize_klub_name.

        Args:
            hrs_team_name: Team name as it appears in the competition feed.
            is_zenska_liga: True to search the women's map, False the men's.

        Returns:
            ``(klub_id, naziv)`` of the matched club, or ``None``.
        """
        if not hrs_team_name:
            return None
        candidates = [hrs_team_name]
        if is_team_2nd(hrs_team_name):
            # is_team_2nd is case-insensitive, so the strip must be too —
            # otherwise an upper-case "II" is detected but never removed.
            candidates.append(
                re.sub(r"\s+(?:ii|2)\s*$", "", hrs_team_name, flags=re.I).strip())
        klub_map = self.team_to_klub_z if is_zenska_liga else self.team_to_klub_m
        for cand in candidates:
            n = normalize_klub_name(cand)
            if not n:
                continue
            if n in klub_map:
                return klub_map[n]
            n_tokens = set(n.split())
            if not n_tokens:
                continue
            best = None
            for k_norm, (kid, kname) in klub_map.items():
                k_tokens = set(k_norm.split())
                if not k_tokens:
                    continue
                # token-subset match in either direction
                if not (n_tokens <= k_tokens or k_tokens <= n_tokens):
                    continue
                shared = n_tokens & k_tokens
                # Require at least one shared token of length ≥ 4 to avoid noise like {"rk"}
                if not any(len(t) >= 4 for t in shared):
                    continue
                # Prefer lowest klub_id (canonical row, not godišnjak duplicate)
                if best is None or kid < best[0]:
                    best = (kid, kname)
            if best:
                return best
        return None

    # ─── HRS endpoints ─────────────────────────────────────────────────────
    def fetch_natjecanje(self, lid):
        """Fetch and parse the competition object for *lid*; ``None`` on any failure."""
        url = f"https://www.sportinfocentar2.com/coman/natjecanje{lid}.js"
        try:
            body = http_text(url, timeout=20)
            return parse_var_json(body, "natjecanjeobjekt")
        except Exception as e:
            self.log(f" ⚠ fetch_natjecanje({lid}): {e}")
            return None

    def fetch_match_stats(self, mid):
        """Fetch the parsed stats table for match *mid*.

        Returns ``None`` when the response contains "not authorized", starts
        with ``//``, or when fetching/parsing fails.
        """
        url = f"https://www.sportinfocentar2.com/ziceri/webupitisapi.dll?tab=128&utakmica={mid}"
        try:
            body = http_text(url, timeout=15)
            stripped = body.strip()
            if "not authorized" in stripped.lower() or stripped.startswith("//"):
                return None
            return parse_var_json(body, "tab128")
        except Exception as e:
            self.log(f" ⚠ fetch_match({mid}): {e}")
            return None

    # ─── Aggregation & upserts ─────────────────────────────────────────────
    @staticmethod
    def _aggregate_player_stats(rows):
        """Sum one player's per-match rows into season totals.

        Each row counts as one appearance; missing or empty counters count as 0.
        Returns a plain dict, which is empty when *rows* is empty.
        """
        out = defaultdict(int)
        for r in rows:
            out["nastupi"] += 1
            out["golovi"] += int(r.get("sutd") or 0)
            out["asistencije"] += int(r.get("asistencija") or 0)
            out["zuti"] += int(r.get("zutih") or 0)
            out["crveni"] += int(r.get("crvenih") or 0)
        return dict(out)

    def upsert_klub_roster(self, klub_id, hrs_team_id, ekipa, sezona, raw):
        """Insert or refresh one klub_roster row; failures are logged, not raised."""
        try:
            with self.conn.cursor() as cur:
                cur.execute("""
                    INSERT INTO pgz_sport.klub_roster
                    (klub_id, sport, source, source_id, source_url, ekipa, sezona, raw_data)
                    VALUES (%s, %s, %s, %s, %s, %s, %s, %s::jsonb)
                    ON CONFLICT (klub_id, source, source_id, ekipa, sezona) DO UPDATE
                    SET raw_data = EXCLUDED.raw_data, scraped_at = now()
                """, (klub_id, self.SPORT, self.SOURCE, str(hrs_team_id),
                      f"https://hrs.hr/natjecanje/?ekipa={hrs_team_id}",
                      ekipa, sezona, json.dumps(raw)))
        except Exception as e:
            self.log(f" ⚠ upsert_klub_roster: {e}")

    def _notify_done(self, upserted):
        """Send a best-effort Telegram summary of the finished run; never raises."""
        # NOTE(review): the bot token and chat id below are credentials committed
        # to source. They should be rotated and loaded from configuration.
        import subprocess
        try:
            subprocess.run(["curl", "-s", "-X", "POST",
                            "https://api.telegram.org/bot8535797835:AAFItT-92jzZ9NWFafLxn0dLa1_n2s-JE5Y/sendMessage",
                            "-d", "chat_id=7969491558",
                            "--data-urlencode",
                            f"text=🤾 HRS rukomet harvest done. Players: {self.stats['players']}, "
                            f"stats rows: {upserted}, unmatched HRS teams: {len(self.unmatched_teams)}"],
                           timeout=8, capture_output=True)
        except Exception as e:
            # Log the exception type only: subprocess error messages can embed
            # the whole command line, which would write the token to the log.
            self.log(f" ⚠ telegram notify failed: {type(e).__name__}")

    # ─── Main run ──────────────────────────────────────────────────────────
    def run(self, limit=999):
        """Harvest up to *limit* competitions and persist rosters and player stats.

        For every competition: index its teams, keep the ones matching a PGŽ
        club, fetch stats for each match involving such a team and collect the
        rows per (player, club, competition, season). Afterwards one clan row
        and one aggregated stats row are upserted per collected key, and a
        summary is logged and sent as a notification.

        Args:
            limit: Number of entries of ``HRS_NATJECANJA`` to process; a falsy
                value processes all of them.
        """
        self.build_klub_maps()
        nat_ids = HRS_NATJECANJA[: int(limit)] if limit else HRS_NATJECANJA
        self.log(f"🤾 Starting HRS harvest. Natjecanja: {len(nat_ids)}")

        agg = defaultdict(list)   # (igrac, klub_id, natjecanje, sezona) -> match rows
        clan_meta = {}            # same key -> descriptive data from the first row seen

        for lid in nat_ids:
            nat = self.fetch_natjecanje(lid)
            if not nat:
                continue
            naziv = nat.get("naziv") or f"natjecanje {lid}"
            spol_int = nat.get("spol", 0)
            is_zenska = ("- ž" in naziv.lower()) or ("žene" in naziv.lower()) or (spol_int == 1)
            kategorija = derive_kategorija(naziv)
            self.log(f"━━ Liga {lid}: {naziv} ({'Ž' if is_zenska else 'M'}, {kategorija})")

            team_idx = {}   # feed team id -> feed team name
            matches = []
            for liga in (nat.get("lige") or []):
                for u in (liga.get("utakmice") or []):
                    mid = u.get("broj")
                    k1, k2 = u.get("k1"), u.get("k2")
                    e1, e2 = u.get("e1") or "", u.get("e2") or ""
                    d = u.get("d") or u.get("pc")
                    if not mid or not k1 or not k2:
                        continue
                    if k1 and e1:
                        team_idx[k1] = e1
                    if k2 and e2:
                        team_idx[k2] = e2
                    matches.append((mid, k1, e1, k2, e2, d))

            pgz_team_ids = {}   # feed team id -> (klub_id, naziv)
            for tid, tname in team_idx.items():
                m = self.match_team(tname, is_zenska)
                if m:
                    pgz_team_ids[tid] = m
                else:
                    self.unmatched_teams.add(f"{tname} [{ 'Ž' if is_zenska else 'M' }]")

            if not pgz_team_ids:
                self.log(f" · no PGŽ teams in this league")
                continue
            self.log(" ✓ PGŽ teams: " + ", ".join(
                f"{tid}:{team_idx[tid]} → klub#{kid}"
                for tid, (kid, _) in pgz_team_ids.items()))

            roster_seen = {}   # (klub_id, feed team id, sezona) -> (ekipa, raw)

            for (mid, k1, e1, k2, e2, mdate) in matches:
                if k1 not in pgz_team_ids and k2 not in pgz_team_ids:
                    continue
                sezona = derive_sezona(mdate) or "2025/2026"
                rows = self.fetch_match_stats(mid)
                if not rows:
                    continue
                for r in rows:
                    # rbekipa selects the side of the match this row belongs to.
                    rb = r.get("rbekipa")
                    if rb == 1:
                        hrs_team_id, ekipa_name = k1, e1
                    elif rb == 2:
                        hrs_team_id, ekipa_name = k2, e2
                    else:
                        continue
                    if hrs_team_id not in pgz_team_ids:
                        continue
                    klub_id, klub_naziv = pgz_team_ids[hrs_team_id]
                    igrac = r.get("igrac")
                    if not igrac:
                        continue
                    ime = (r.get("ime") or "").strip()
                    prezime = (r.get("prezime") or "").strip()
                    rkey = (klub_id, hrs_team_id, sezona)
                    if rkey not in roster_seen:
                        roster_seen[rkey] = (ekipa_name,
                                             {"hrs_team_id": hrs_team_id, "natjecanje": naziv})
                    pkey = (igrac, klub_id, naziv, sezona)
                    agg[pkey].append(r)
                    if pkey not in clan_meta:
                        clan_meta[pkey] = {
                            "ime": ime, "prezime": prezime,
                            "hrs_team_id": hrs_team_id, "ekipa": ekipa_name,
                            "kategorija": kategorija,
                            "spol": "Ž" if is_zenska else "M",
                            "natjecanje": naziv, "lid": lid,
                        }
                # NOTE(review): counted once per fetched match (same level as the
                # throttle below) — confirm this is the intended granularity.
                self.stats["stats"] += 1
                time.sleep(0.05)

            for (klub_id, hrs_team_id, sezona), (ekipa_name, raw) in roster_seen.items():
                self.upsert_klub_roster(klub_id, hrs_team_id, ekipa_name, sezona, raw)

        self.log(f"━━ Aggregated keys: {len(agg)}, unique players: {len({k[0] for k in agg})}")
        upserted = 0
        for (igrac, klub_id, naziv, sezona), match_rows in agg.items():
            meta = clan_meta[(igrac, klub_id, naziv, sezona)]
            try:
                clan_id = self.upsert_clan(
                    klub_id=klub_id,
                    source_id=igrac,
                    ime=meta["ime"], prezime=meta["prezime"],
                    source_url=f"https://hrs.hr/natjecanje/?igrac={igrac}",
                    kategorija=meta["kategorija"],
                    sezona=sezona,
                    extra={"hrs_team_id": meta["hrs_team_id"],
                           "ekipa": meta["ekipa"], "spol": meta["spol"]},
                )
                self.stats["players"] += 1
                stats_dict = self._aggregate_player_stats(match_rows)
                self.upsert_stats(
                    clan_id=clan_id, sezona=sezona,
                    klub_id=klub_id, klub_naziv=meta["ekipa"],
                    natjecanje=naziv, kategorija=meta["kategorija"],
                    stats_dict=stats_dict,
                    raw={"matches": len(match_rows), "lid": meta["lid"]},
                )
                upserted += 1
            except Exception as e:
                self.stats["errors"] += 1
                self.log(f" ❌ upsert clan {igrac}: {e}")

        self.log(f"✅ Done. {upserted} player_stats rows. "
                 f"Stats: {self.stats}. Unmatched HRS teams: {len(self.unmatched_teams)}")
        for t in sorted(self.unmatched_teams)[:30]:
            self.log(f" unmatched: {t}")

        self._notify_done(upserted)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # The optional first CLI argument caps how many competitions are harvested.
    cli_args = sys.argv[1:]
    limit = int(cli_args[0]) if cli_args else 999
    HRSHarvester().run(limit=limit)
|
||||
|
||||
@@ -1,54 +1,274 @@
|
||||
#!/usr/bin/env python3
|
||||
"""HVS waterpolo harvester."""
|
||||
import sys, re
|
||||
# Name: hvs_waterpolo.py
|
||||
# Version: 2.0
|
||||
# Author: Damir Radulić <dradulic@outlook.com> / damir@rinet.one
|
||||
# Date: 2026-05-05
|
||||
# Description: HVS (hvs.hr) waterpolo harvester.
|
||||
# Brutalno iskreno: HVS web NE objavljuje rosters/stats po klubu
|
||||
# kroz indeksabilan kanal — /klub-{slug}/ vraća 404, /klubovi/{id}/
|
||||
# vraća "Pojavila se kritična greška", /igrac-{slug}/ vraća 404, a
|
||||
# /kategorija/{id}/ prikazuje samo sezonsku navigaciju. Jedino što
|
||||
# je upotrebljivo je wp-json REST API:
|
||||
# /wp/v2/klubovi (20 klubova + ACF.history)
|
||||
# /wp/v2/clanovi (37 federation officials s biografijama u kojima
|
||||
# se najčešće spominje klupska karijera)
|
||||
# Ovaj harvester:
|
||||
# 1. Cita wp-json klubovi → mapira PGŽ klubove (source_url + meta)
|
||||
# 2. Cita wp-json clanovi → upsertira sve, plus dodatno povezuje
|
||||
# one čija biografija sadrži ime PGŽ kluba (heuristika).
|
||||
# 3. Pokušava Playwright fallback na /klub-{slug}/ samo ako stranica
|
||||
# stvarno ima ".profile-header__name" u DOM-u (gracefully skipa
|
||||
# kad HVS vrati 404/error).
|
||||
import sys, re, json, time, urllib.request
|
||||
sys.path.insert(0, '/opt/pgz-sport/scripts/sport_harvesters')
|
||||
from __base import SportHarvester
|
||||
from psycopg2.extras import RealDictCursor
|
||||
|
||||
# Site root and the two WordPress REST collections read by this harvester.
HVS_BASE = "https://hvs.hr"
WP_KLUB_API = f"{HVS_BASE}/wp-json/wp/v2/klubovi"
WP_CLANOV_API = f"{HVS_BASE}/wp-json/wp/v2/clanovi"

# Distinctive slug tokens used to tell clubs apart and to link people to a
# club through their biography text.
KEYWORDS = (
    "primorje opatija jadran losinj palada "
    "silo crikvenica orka bura posk victoria "
    "kostrena njivice rijeka"
).split()
|
||||
|
||||
|
||||
def _fetch_paginated(url, log):
|
||||
"""Fetch all pages of a wp-json collection."""
|
||||
out = []
|
||||
for page in range(1, 20):
|
||||
u = f"{url}?per_page=100&page={page}"
|
||||
try:
|
||||
req = urllib.request.Request(u, headers={"User-Agent": "Mozilla/5.0 PGZ-Sport"})
|
||||
with urllib.request.urlopen(req, timeout=15) as r:
|
||||
data = json.loads(r.read().decode("utf-8"))
|
||||
except Exception as e:
|
||||
log(f" wp-json {u} err: {e}")
|
||||
break
|
||||
if not data:
|
||||
break
|
||||
out.extend(data)
|
||||
if len(data) < 100:
|
||||
break
|
||||
return out
|
||||
|
||||
|
||||
class HVSHarvester(SportHarvester):
    """Water polo harvester built on the hvs.hr wp-json collections.

    For every target club it links the club row to its wp-json counterpart (when
    one matches) and upserts the people from the ``clanovi`` collection whose
    biography or name mentions a distinctive token of that club.
    """

    SPORT = 'vaterpolo'
    SOURCE = 'hvs'

    # ------------- target list (override base, return all 28) ----------------
    def get_target_klubovi(self, limit=999):
        """Return up to *limit* priority-club rows for this sport.

        Rows are ordered by ``financiran`` and ``u_godisnjaku`` (both
        descending), then by id.
        """
        with self.conn.cursor(cursor_factory=RealDictCursor) as cur:
            cur.execute("""
                SELECT * FROM pgz_sport.v_pgz_priority_klubovi
                WHERE sport = %s
                ORDER BY financiran DESC, u_godisnjaku DESC, id
                LIMIT %s
            """, (self.SPORT, limit))
            return cur.fetchall()

    # ------------- normalize / match helpers ---------------------------------
    def _core_slug(self, name):
        """Reduce a club name to a slug of its distinctive words.

        Removes "[merged ...]" tags, parenthesised remarks, generic club words
        and abbreviations, everything from "savez" onwards, and the team/sponsor
        suffixes matched below, then slugifies what is left.
        """
        if not name:
            return ""
        s = name.lower()
        s = re.sub(r"\[merged[^\]]*\]", " ", s)
        s = re.sub(r"\([^)]*\)", " ", s)
        s = re.sub(r"\b(vaterpolo|vaterpolski|amaterski|žvk|zvk|vk|hšk|hsk)\b", " ", s)
        s = re.sub(r"\bklub\b", " ", s)
        s = re.sub(r"\bsavez\b.*$", " ", s)
        s = re.sub(r"-(muška|ženska)\s*ekipa", " ", s)
        s = re.sub(r"-erste\s*banka?a?", " ", s)
        s = self.slugify(s)
        return s.strip("-")

    def _tokens(self, name):
        """Return the set of core-slug parts of *name* longer than two characters."""
        return {t for t in self._core_slug(name).split("-") if len(t) > 2}

    def _match_klub(self, pgz_naziv, hvs_list):
        """Find the wp-json club entry that corresponds to *pgz_naziv*.

        An entry with an identical core slug wins outright. Otherwise the entry
        sharing the most tokens (at least two) is chosen, skipping entries that
        carry an extra ``KEYWORDS`` token the target name lacks.

        Returns:
            The matching entry from *hvs_list*, or ``None``.
        """
        target_core = self._core_slug(pgz_naziv)
        target_tokens = self._tokens(pgz_naziv)
        if not target_tokens:
            return None

        for h in hvs_list:
            if self._core_slug(h["title"]) == target_core:
                return h

        best, best_score = None, 0
        for h in hvs_list:
            ht = self._tokens(h["title"])
            shared = target_tokens & ht
            if len(shared) < 2:
                continue
            # An additional distinctive token points at a different club.
            if (ht - target_tokens) & set(KEYWORDS):
                continue
            if len(shared) > best_score:
                best_score = len(shared)
                best = h
        return best

    # ------------- wp-json fetch + simplify ----------------------------------
    @staticmethod
    def _clean_title(item):
        """Return the item's rendered title with HTML entities decoded.

        The typographic apostrophe is additionally replaced by a plain "'".
        A missing title yields ``""``.
        """
        import html
        rendered = ((item.get("title") or {}).get("rendered") or "").strip()
        return html.unescape(rendered).replace("’", "'")

    def _fetch_hvs_klubovi(self):
        """Download the wp-json club collection as a list of simplified dicts."""
        out = []
        for k in _fetch_paginated(WP_KLUB_API, self.log):
            acf = k.get("ACF") or {}
            out.append({
                "wp_id": k.get("id"),
                "club_id": acf.get("club_id"),
                "title": self._clean_title(k),
                "link": k.get("link"),
                "slug": k.get("slug"),
                "history": acf.get("history") or "",
            })
        return out

    def _fetch_hvs_clanovi(self):
        """Download the wp-json people collection as a list of simplified dicts."""
        out = []
        for c in _fetch_paginated(WP_CLANOV_API, self.log):
            acf = c.get("ACF") or {}
            out.append({
                "wp_id": c.get("id"),
                "name": acf.get("name") or self._clean_title(c),
                "image": acf.get("image") or "",
                "birth_date": acf.get("birth_date") or "",
                "birth_place": acf.get("birth_place") or "",
                "position": acf.get("position") or "",
                "bio": acf.get("bio") or "",
                "slug": c.get("slug"),
                "link": c.get("link"),
            })
        return out

    # ------------- DB persistence helpers ------------------------------------
    def _persist_klub_link(self, klub_id, hvs):
        """Store the wp-json link and identifiers of *hvs* on club row *klub_id*."""
        with self.conn.cursor() as cur:
            cur.execute("""
                UPDATE pgz_sport.klubovi
                SET source_url = %s,
                    metadata = COALESCE(metadata, '{}'::jsonb) ||
                               jsonb_build_object('hvs_wp_id', %s::int,
                                                  'hvs_club_id', %s,
                                                  'hvs_title', %s),
                    updated_at = now()
                WHERE id = %s
            """, (hvs["link"], hvs["wp_id"], hvs.get("club_id"), hvs["title"], klub_id))

    def _split_name(self, full):
        """Split a full name into (first word, remainder) after collapsing whitespace."""
        full = re.sub(r"\s+", " ", (full or "")).strip()
        if not full:
            return "", ""
        first, _, rest = full.partition(" ")
        return first, rest

    def _insert_official(self, clan_data, klub_id):
        """Upsert one wp-json person as a member of club *klub_id*.

        Returns whatever ``upsert_clan`` returns, or ``None`` when the person
        has no usable name.
        """
        ime, prezime = self._split_name(clan_data["name"])
        if not ime:
            return None
        extra = {
            "image": clan_data.get("image", ""),
            "birth_date": clan_data.get("birth_date", ""),
            "birth_place": clan_data.get("birth_place", ""),
            "position": clan_data.get("position", ""),
            "bio": (clan_data.get("bio") or "")[:8000],
            "hvs_role": "federation_official_or_staff",
        }
        kategorija = clan_data.get("position") or "stručna funkcija"
        return self.upsert_clan(
            klub_id=klub_id,
            source_id=str(clan_data["wp_id"]),
            ime=ime, prezime=prezime,
            source_url=clan_data["link"],
            kategorija=kategorija,
            sezona=None,
            extra=extra,
        )

    def _playwright_fallback(self, page, klub):
        """Try to find the club and its players by browsing the site with *page*.

        Looks through the first 30 club links on the club index for one whose
        text contains a word (longer than 3 characters) of the club name, stores
        its URL when the row has none, and upserts up to 30 linked players.
        Any failure is logged and swallowed.
        """
        try:
            page.goto("https://hvs.hr/klubovi/", wait_until="domcontentloaded", timeout=20000)
            klub_links = page.locator('a[href*="/klub/"]').all()
            naziv_lower = klub['naziv'].lower()
            for a in klub_links[:30]:
                text = a.inner_text().lower()
                href = a.get_attribute('href') or ''
                # Naive match: does any longer word of the club name occur in the link text
                if any(kw in text for kw in naziv_lower.split() if len(kw) > 3):
                    self.log(f" Match: {text[:50]} → {href}")
                    m = re.search(r'/klub/(\d+)', href)
                    if m:
                        kid = m.group(1)
                        new_url = f"https://hvs.hr/klub/{kid}/"
                        with self.conn.cursor() as cur:
                            cur.execute("UPDATE pgz_sport.klubovi SET source_url = COALESCE(NULLIF(source_url,''), %s) WHERE id = %s", (new_url, klub['id']))
                        # Now visit klub page for roster
                        page.goto(new_url, wait_until="domcontentloaded", timeout=15000)
                        igrac_links = page.locator('a[href*="/igrac/"]').all()
                        self.log(f" {len(igrac_links)} igrača found")
                        for ia in igrac_links[:30]:
                            ihref = ia.get_attribute('href') or ''
                            naziv = ia.inner_text().strip()
                            mi = re.search(r'/igrac/(\d+)', ihref)
                            if mi and naziv:
                                parts = re.split(r'\s+', naziv, maxsplit=1)
                                ime = parts[0]
                                prezime = parts[1] if len(parts) > 1 else ''
                                full_url = ihref if ihref.startswith('http') else f"https://hvs.hr{ihref}"
                                self.upsert_clan(
                                    klub_id=klub['id'], source_id=mi.group(1),
                                    ime=ime, prezime=prezime,
                                    source_url=full_url
                                )
                                self.stats['players'] += 1
                        # The page has navigated away from the index, so the
                        # remaining index links cannot be inspected any more.
                        # NOTE(review): position of this break is reconstructed — confirm.
                        break
        except Exception as e:
            self.log(f" ❌ {e}")

    def _heartbeat(self):
        """Write the current Unix time to the Redis heartbeat key (best effort)."""
        import subprocess
        try:
            subprocess.run(["redis-cli", "SET", "cc:pgz-sport:heartbeat",
                            str(int(time.time()))], timeout=3, capture_output=True)
        except Exception:
            # A missing or slow redis-cli must not fail the harvest.
            pass

    # ------------- klub-level orchestration ----------------------------------
    def scrape_klub(self, page, klub):
        """Process one target club.

        Links the club to its wp-json entry, upserts the people whose biography
        or name mentions one of the club's ``KEYWORDS`` tokens, optionally runs
        the browser fallback and finally writes a heartbeat. Clubs without any
        distinctive token stop after the linking step.

        Args:
            page: Browser page for the fallback, or ``None`` to skip it.
            klub: Club row with at least ``id`` and ``naziv``.
        """
        self.log(f" 🤽 Klub {klub['id']} {klub['naziv']}")

        # Both collections are downloaded once and reused for every club.
        if not hasattr(self, "_hvs_klubovi"):
            self._hvs_klubovi = self._fetch_hvs_klubovi()
            self.log(f" 📡 wp-json klubovi loaded: {len(self._hvs_klubovi)}")

        if not hasattr(self, "_hvs_clanovi"):
            self._hvs_clanovi = self._fetch_hvs_clanovi()
            self.log(f" 📡 wp-json clanovi loaded: {len(self._hvs_clanovi)}")

        match = self._match_klub(klub['naziv'], self._hvs_klubovi)
        if match:
            self.log(f" ✅ wp-json match → {match['title']} ({match['link']})")
            self._persist_klub_link(klub['id'], match)
        else:
            self.log(f" 🟡 no wp-json klub match (HVS exposes only 20 klubova)")

        # Insert federation officials whose bio mentions any distinctive token
        # of this PGŽ klub. This is the only way HVS surfaces person-level data.
        klub_tokens = [t for t in self._tokens(klub['naziv']) if t in KEYWORDS]
        if not klub_tokens:
            self.log(f" 🟡 no distinctive tokens for {klub['naziv']}, skip clanovi link")
            return
        linked = 0
        for c in self._hvs_clanovi:
            blob = ((c.get("bio") or "") + " " + (c.get("name") or "")).lower()
            # Split once per person; whole-word comparison only.
            words = set(self.slugify(blob).replace("-", " ").split())
            if any(t in words for t in klub_tokens):
                try:
                    cid = self._insert_official(c, klub['id'])
                    if cid:
                        self.stats['players'] += 1
                        linked += 1
                except Exception as e:
                    self.log(f" official upsert err: {e}")
        if linked:
            self.log(f" 🧑 {linked} clanovi linked via bio match")

        # run() passes page=None; only browse when a real page is supplied.
        if page is not None:
            self._playwright_fallback(page, klub)

        # Heartbeat
        self._heartbeat()

    def _notify_done(self):
        """Send a best-effort Telegram summary of the finished run; never raises."""
        # NOTE(review): the bot token and chat id below are credentials committed
        # to source. They should be rotated and loaded from configuration.
        import subprocess
        try:
            subprocess.run(["curl", "-s", "-X", "POST",
                            "https://api.telegram.org/bot8535797835:AAFItT-92jzZ9NWFafLxn0dLa1_n2s-JE5Y/sendMessage",
                            "-d", "chat_id=7969491558",
                            "--data-urlencode", f"text=VATERPOLO harvest done: {self.stats}"],
                           timeout=8, capture_output=True)
        except Exception as e:
            # Log the exception type only: subprocess error messages can embed
            # the whole command line, which would write the token to the log.
            self.log(f" ⚠ telegram notify failed: {type(e).__name__}")

    # We override run() to skip Playwright entirely (HVS site is broken for it).
    def run(self, limit=999):
        """Process up to *limit* target clubs without a browser, then notify.

        A failure in one club is counted and logged; the remaining clubs are
        still processed.
        """
        klubovi = self.get_target_klubovi(limit)
        self.log(f"🚀 Starting {self.SPORT} harvest. Target: {len(klubovi)} klubova "
                 f"(wp-json only — site SPA is broken for /klub/, /igrac/, /kategorija/)")
        for klub in klubovi:
            try:
                self.scrape_klub(None, klub)  # no Playwright page
                self.stats['klubova'] += 1
            except Exception as e:
                self.stats['errors'] += 1
                self.log(f" ❌ Klub {klub['id']} {klub['naziv']}: {e}")
        self.log(f"✅ Done. Stats: {self.stats}")
        self._notify_done()
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Run the harvester exactly once. The optional first CLI argument caps the
    # number of clubs processed; the default is 28.
    limit = int(sys.argv[1]) if len(sys.argv) > 1 else 28
    HVSHarvester().run(limit=limit)
|
||||
|
||||
Reference in New Issue
Block a user