feat: /api/v2/analiza/* endpoints - sport analytics backend
This commit is contained in:
@@ -1,4 +1,7 @@
|
||||
#!/usr/bin/env python3
|
||||
from dotenv import load_dotenv
|
||||
load_dotenv('/opt/rinet-gpu/.env.master')
|
||||
# auto-added by patch_scrapers_with_dotenv.sh
|
||||
"""
|
||||
coverage_report.py — Per-entity coverage scoring across pgz_sport schema
|
||||
|
||||
@@ -14,7 +17,7 @@ from datetime import datetime, timezone
|
||||
import psycopg2, psycopg2.extras
|
||||
|
||||
PG = dict(host='10.10.0.2', port=6432, dbname='rinet_v3',
|
||||
user='rinet', password='R1net2026!SecureDB#v7')
|
||||
user='rinet', password=os.environ["DB_PASSWORD"])
|
||||
|
||||
# Per-type coverage definition: list of fields that count toward coverage
|
||||
DEFS = {
|
||||
|
||||
+197
@@ -0,0 +1,197 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
coverage_report.py — Per-entity coverage scoring across pgz_sport schema
|
||||
|
||||
Fills /opt/pgz-sport/data_quality_report.md with:
|
||||
- per-type aggregate (n, mean coverage, median, # zero-coverage, # complete)
|
||||
- distribution histogram
|
||||
- top 50 entities most needing manual review (lowest coverage AND non-empty name)
|
||||
- link to detail panel for each (so audit.html-style triage is one click away)
|
||||
"""
|
||||
import os, json
|
||||
from collections import Counter
|
||||
from datetime import datetime, timezone
|
||||
import psycopg2, psycopg2.extras
|
||||
|
||||
PG = dict(host='10.10.0.2', port=6432, dbname='rinet_v3',
|
||||
user='rinet', password=os.environ["DB_PASSWORD"])
|
||||
|
||||
# Per-type coverage definition: list of fields that count toward coverage
|
||||
def _panel_link(nav):
    """Return a function mapping an entity id to its detail-panel URL under *nav*."""
    def build(i):
        return f'/?nav={nav}&open={i}'
    return build


# One entry per entity kind:
#   table      - source table in the pgz_sport schema
#   name_col   - SQL expression yielding the display name
#   fields     - the fields that count toward coverage (their number is the
#                denominator of the coverage percentage)
#   panel_path - callable turning an entity id into its detail-panel URL
DEFS = dict(
    savez=dict(
        table='pgz_sport.savezi',
        name_col='naziv',
        fields=['naziv', 'sport', 'predsjednik', 'tajnik', 'email', 'telefon',
                'web', 'oib', 'adresa', 'godina_osnutka'],
        panel_path=_panel_link('savezi'),
    ),
    klub=dict(
        table='pgz_sport.klubovi',
        name_col='naziv',
        # Either-or slots: web OR web_stranica counts once; sjediste OR adresa counts once.
        fields=['naziv', 'sport', 'grad', 'oib', 'predsjednik', 'tajnik', 'email', 'telefon',
                'web_or_stranica', 'sjediste_or_adresa', 'ciljevi', 'opis_djelatnosti'],
        panel_path=_panel_link('klubovi'),
    ),
    sportas=dict(
        table='pgz_sport.clanovi',
        name_col="ime||' '||prezime",
        fields=['ime', 'prezime', 'sport', 'klub_id', 'datum_rodenja', 'slika_url',
                'oib', 'profile_url', 'biografija', 'hns_igrac_id'],
        panel_path=_panel_link('sportasi'),
    ),
    objekt=dict(
        table='pgz_sport.sportski_objekti',
        name_col='naziv',
        fields=['naziv', 'tip', 'grad', 'adresa', 'lat', 'lng', 'upravitelj',
                'kapacitet', 'sportovi', 'izgradeno'],
        panel_path=_panel_link('objekti'),
    ),
    manifestacija=dict(
        table='pgz_sport.manifestacije',
        name_col='naziv',
        fields=['naziv', 'mjesto', 'organizator', 'razina', 'broj_ucesnika',
                'godina_od', 'source_url'],
        panel_path=_panel_link('manifestacije'),
    ),
)
|
||||
|
||||
def _text_filled(expr):
    """SQL predicate: *expr* is neither NULL nor an empty string."""
    return f"{expr} IS NOT NULL AND {expr}<>''"


def _not_null(expr):
    """SQL predicate: *expr* is not NULL (used where the original did not compare to '')."""
    return f"{expr} IS NOT NULL"


def _first_text(*cols):
    """SQL expression: the first of *cols* that is neither NULL nor empty.

    A plain COALESCE(a, b) returns '' when a is an empty string, which hid a
    filled b; NULLIF turns the empty string into NULL first.
    """
    return "COALESCE(" + ", ".join(f"NULLIF({c},'')" for c in cols) + ")"


# kind -> (SQL expression for the display name, predicates that each add 1 to "filled").
_COVERAGE_SQL = {
    'klub': ("naziv", [
        *map(_text_filled, ('naziv', 'sport', 'grad', 'oib', 'predsjednik',
                            'tajnik', 'email', 'telefon')),
        _text_filled(_first_text('web', 'web_stranica')),
        _text_filled(_first_text('sjediste', 'adresa')),
        _text_filled('ciljevi'),
        _text_filled('opis_djelatnosti'),
    ]),
    'sportas': ("(COALESCE(ime,'')||' '||COALESCE(prezime,''))", [
        _text_filled('ime'),
        _text_filled('prezime'),
        _text_filled('sport'),
        _not_null('klub_id'),
        _not_null('datum_rodenja'),
        _text_filled('slika_url'),
        _text_filled('oib'),
        _text_filled('profile_url'),
        _text_filled('biografija'),
        _text_filled('hns_igrac_id'),
    ]),
    'objekt': ("naziv", [
        *map(_text_filled, ('naziv', 'tip', 'grad', 'adresa')),
        _not_null('lat'),
        _not_null('lng'),
        _text_filled('upravitelj'),
        _not_null('kapacitet'),
        "sportovi IS NOT NULL AND array_length(sportovi,1)>0",
        _not_null('izgradeno'),
    ]),
    'manifestacija': ("naziv", [
        *map(_text_filled, ('naziv', 'mjesto', 'organizator', 'razina')),
        "broj_ucesnika IS NOT NULL AND broj_ucesnika::text<>''",
        _not_null('godina_od'),
        _text_filled('source_url'),
    ]),
    'savez': ("naziv", [
        *map(_text_filled, ('naziv', 'sport', 'predsjednik', 'tajnik', 'email',
                            'telefon', 'web', 'oib', 'adresa')),
        _not_null('godina_osnutka'),
    ]),
}


def fetch_rows(cur, kind: str):
    """Fetch every entity of *kind* with the number of coverage fields it has filled.

    Args:
        cur: open DB cursor whose rows can be indexed by column name
            (the caller passes a psycopg2 RealDictCursor).
        kind: a key of DEFS.

    Returns:
        A list of dicts with keys 'kind', 'id', 'naziv' ('' when NULL),
        'filled' (int) and 'total' (the length of DEFS[kind]['fields']).

    Raises:
        KeyError: if *kind* is not defined in DEFS.
    """
    spec = DEFS[kind]
    # Any kind without its own entry is scored like 'savez', mirroring the
    # final else-branch of the previous if/elif chain.
    name_expr, checks = _COVERAGE_SQL.get(kind, _COVERAGE_SQL['savez'])
    filled_expr = " +\n        ".join(
        f"CASE WHEN {check} THEN 1 ELSE 0 END" for check in checks
    )
    sql = f"""
    SELECT id, {name_expr} AS naziv,
       ({filled_expr}
       ) AS filled
    FROM {spec['table']}
    """
    cur.execute(sql)
    total = len(spec['fields'])
    return [
        {'kind': kind, 'id': r['id'], 'naziv': r['naziv'] or '',
         'filled': int(r['filled']), 'total': total}
        for r in cur.fetchall()
    ]
|
||||
|
||||
|
||||
def stats(rows):
    """Summarise coverage percentages for a list of rows.

    Args:
        rows: dicts carrying numeric 'filled' and 'total' entries.

    Returns:
        {} for an empty input, otherwise a dict with:
          n            - number of rows
          mean         - mean coverage in percent, rounded to 1 decimal
          median       - median coverage in percent, rounded to 1 decimal
          zero         - rows with exactly 0 % coverage
          complete     - rows with coverage >= 99 %
          distribution - {bucket_start: count} in 10-point buckets; 100 % is
                         folded into the 90 bucket
    """
    if not rows:
        return {}
    # A row with total == 0 has nothing to fill; count it as 0 % instead of
    # dividing by zero.
    pcts = sorted(
        (r['filled'] / r['total'] * 100) if r['total'] else 0.0 for r in rows
    )
    n = len(pcts)
    mean = sum(pcts) / n
    mid = n // 2
    # For an even count the median is the average of the two middle values;
    # the previous code returned the upper one.
    median = pcts[mid] if n % 2 else (pcts[mid - 1] + pcts[mid]) / 2
    zero = sum(1 for p in pcts if p == 0)
    complete = sum(1 for p in pcts if p >= 99.0)
    bins = Counter(min(int(p // 10) * 10, 90) for p in pcts)
    return {'n': n, 'mean': round(mean, 1), 'median': round(median, 1),
            'zero': zero, 'complete': complete,
            'distribution': dict(sorted(bins.items()))}
|
||||
|
||||
|
||||
def main():
    """Score every entity kind in DEFS and dump the result to /tmp/coverage_data.json.

    Prints a one-line summary per kind and a five-row sample of the worst
    entities. The JSON holds the generation time, per-kind counts, per-kind
    stats and the 50 named entities with the lowest coverage.
    """
    all_rows = []
    by_kind = {}
    per_type_stats = {}

    conn = psycopg2.connect(**PG)
    try:
        with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
            for kind in DEFS:
                rows = fetch_rows(cur, kind)
                by_kind[kind] = rows
                all_rows.extend(rows)
                # stats() returns {} for an empty table, so read it defensively
                # (indexing ["mean"] used to raise KeyError there).
                summary = per_type_stats[kind] = stats(rows)
                print(f'{kind:14s} n={len(rows):5d} mean={summary.get("mean", 0.0):.1f}% complete={summary.get("complete", 0)}')
    finally:
        # Release the connection even when a query fails.
        conn.close()

    # Top 50 worst — exclude rows with empty naziv (those are flagged separately)
    valid = [r for r in all_rows if (r['naziv'] or '').strip()]
    # Sort by coverage ASC, then by total DESC
    worst = sorted(valid, key=lambda r: (r['filled'] / r['total'], -r['total']))[:50]
    out = {
        'generated_at': datetime.now(timezone.utc).isoformat(),
        'totals': {k: len(v) for k, v in by_kind.items()},
        'total_entities': len(all_rows),
        'per_type_stats': per_type_stats,
        'top50_review': worst,
    }
    print(f'\nTotal entities: {len(all_rows)}')
    print(f'Top 50 worst — sample:')
    for r in worst[:5]:
        pct = r['filled'] / r['total'] * 100
        print(f" {r['kind']:14s} id={r['id']:7d} {r['naziv'][:50]:50s} {r['filled']}/{r['total']} ({pct:.0f}%)")

    # ensure_ascii=False emits raw non-ASCII characters, so pin the file
    # encoding instead of relying on the locale; the with-block closes the handle.
    with open('/tmp/coverage_data.json', 'w', encoding='utf-8') as fh:
        json.dump(out, fh, ensure_ascii=False, default=str)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
@@ -1,4 +1,8 @@
|
||||
#!/usr/bin/env python3
|
||||
from dotenv import load_dotenv
|
||||
load_dotenv('/opt/rinet-gpu/.env.master')
|
||||
# auto-added by patch_scrapers_with_dotenv.sh
|
||||
import os
|
||||
# ═══════════════════════════════════════════════════════════════════
|
||||
# Fajl: godisnjak_extract.py
|
||||
# Verzija: 1.0.0
|
||||
@@ -25,7 +29,7 @@ logging.basicConfig(
|
||||
)
|
||||
log = logging.getLogger("extract")
|
||||
|
||||
DSN = "host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password=R1net2026!SecureDB#v7"
|
||||
DSN = f"host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password={os.environ['DB_PASSWORD']}"
|
||||
VLLM_URL = "http://localhost:8001/v1/chat/completions"
|
||||
VLLM_MODEL = "Qwen/Qwen2.5-7B-Instruct-AWQ"
|
||||
DATA_DIR = "/opt/pgz-sport/_data/godisnjaci"
|
||||
|
||||
@@ -0,0 +1,210 @@
|
||||
#!/usr/bin/env python3
|
||||
import os
|
||||
# ═══════════════════════════════════════════════════════════════════
|
||||
# Fajl: godisnjak_extract.py
|
||||
# Verzija: 1.0.0
|
||||
# Datum: 03.05.2026
|
||||
# Autor: Damir Radulić <dradulic@outlook.com>
|
||||
# Lokacija: /opt/pgz-sport/scripts/godisnjak_extract.py
|
||||
# Svrha: LLM ekstrakcija osoba/uloga iz godisnjaka PGZ (Phase 2)
|
||||
# Zavisi od: httpx, psycopg2, rapidfuzz
|
||||
# Utječe na: pgz_sport.clanovi
|
||||
# ═══════════════════════════════════════════════════════════════════
|
||||
import asyncio, glob, json, logging, re, sys, time
|
||||
import httpx, psycopg2
|
||||
from psycopg2.extras import execute_batch
|
||||
from rapidfuzz import fuzz
|
||||
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format="[%(asctime)s] %(message)s",
|
||||
datefmt="%H:%M:%S",
|
||||
handlers=[
|
||||
logging.FileHandler("/opt/pgz-sport/logs/godisnjak_extract.log"),
|
||||
logging.StreamHandler(),
|
||||
],
|
||||
)
|
||||
log = logging.getLogger("extract")
|
||||
|
||||
DSN = f"host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password={os.environ['DB_PASSWORD']}"
|
||||
VLLM_URL = "http://localhost:8001/v1/chat/completions"
|
||||
VLLM_MODEL = "Qwen/Qwen2.5-7B-Instruct-AWQ"
|
||||
DATA_DIR = "/opt/pgz-sport/_data/godisnjaci"
|
||||
MAX_WORKERS = 4
|
||||
CHUNK_SIZE = 1400
|
||||
|
||||
EXTRACT_PROMPT = """Ekstrahiraj iz teksta SVA imena osoba i njihove uloge.
|
||||
Vrati ISKLJUCIVO valid JSON (bez markdown, bez objasnjenja):
|
||||
{"osobe": [{"ime":"X","prezime":"Y","klub":"Z","uloga":"igrac","godina_rodenja":1990}]}
|
||||
|
||||
Dozvoljene uloge: predsjednik, dopredsjednik, tajnik, blagajnik, clan_uprave,
|
||||
igrac, sportas, glavni_trener, trener, pomocni_trener, kondicioni_trener,
|
||||
selektor, izbornik, team_manager, voditelj, lijecnik, fizioterapeut,
|
||||
kineziolog, maser, sudac, volonter
|
||||
|
||||
Pravila:
|
||||
1. Samo HRVATSKA osobe s punim imenom i prezimenom
|
||||
2. Ako klub nije eksplicitno naveden -> klub=""
|
||||
3. NE izmisljaj - samo jasno navedena imena u tekstu
|
||||
4. Godina rodenja samo ako eksplicitno u tekstu, inace izostavi"""
|
||||
|
||||
|
||||
def chunk_text(text, size=CHUNK_SIZE):
    """Group blank-line separated paragraphs of *text* into chunks of roughly *size* chars.

    Paragraphs are appended to the current chunk while the combined length
    stays within *size*; otherwise the current chunk is flushed and the
    paragraph starts a new one. A single paragraph longer than *size* is kept
    whole. Chunks of 80 characters or fewer (after stripping) are dropped.
    """
    pieces = []
    buf = ""
    for para in re.split(r'\n\n+', text):
        if len(buf) + len(para) <= size:
            buf = buf + "\n\n" + para
            continue
        if buf:
            pieces.append(buf.strip())
        buf = para
    if buf:
        pieces.append(buf.strip())
    return list(filter(lambda piece: len(piece) > 80, pieces))
|
||||
|
||||
|
||||
# Preload klub cache
|
||||
def load_klub_cache(conn):
    """Return up to 2000 (id, naziv) rows of clubs that are active or have no active flag.

    Args:
        conn: open psycopg2 connection.
    """
    # The with-block closes the cursor once the rows are fetched; it was
    # previously left open.
    with conn.cursor() as cur:
        cur.execute("SELECT id, naziv FROM pgz_sport.klubovi WHERE aktivan=true OR aktivan IS NULL LIMIT 2000")
        return cur.fetchall()
|
||||
|
||||
|
||||
def fuzzy_klub(naziv, cache):
    """Return the id of the cached club whose name best matches *naziv*, or None.

    Args:
        naziv: club name to look up; values shorter than 3 characters (or
            None) are rejected outright.
        cache: iterable of (id, name) pairs, as returned by load_klub_cache.

    Returns:
        The id with the highest rapidfuzz token_set_ratio, provided that
        score is above 72; otherwise None. On ties the first entry wins.
    """
    if not naziv or len(naziv) < 3:
        return None
    needle = naziv.lower()  # loop-invariant, lower-cased once
    best_id, best_score = None, 0
    for kid, kname in cache:
        if not kname:
            # A NULL/empty club name cannot match and used to crash on .lower().
            continue
        score = fuzz.token_set_ratio(needle, kname.lower())
        if score > best_score:
            best_score, best_id = score, kid
    return best_id if best_score > 72 else None
|
||||
|
||||
|
||||
async def extract_persons(chunk_text_str, semaphore):
    """Ask the vLLM endpoint to extract persons from one text chunk.

    Args:
        chunk_text_str: chunk text; only the first 5000 characters are sent.
        semaphore: asyncio semaphore limiting concurrent requests.

    Returns:
        A dict with an "osobe" list of dicts. Any failure (transport error,
        non-2xx status, malformed JSON, unexpected shape) yields
        {"osobe": []} so that one bad chunk never aborts the run.
    """
    async with semaphore:
        try:
            async with httpx.AsyncClient(timeout=90.0) as c:
                r = await c.post(VLLM_URL, json={
                    "model": VLLM_MODEL,
                    "messages": [
                        {"role": "system", "content": EXTRACT_PROMPT},
                        {"role": "user", "content": chunk_text_str[:5000]},
                    ],
                    "temperature": 0.05,
                    "max_tokens": 2500,
                    "response_format": {"type": "json_object"},
                })
            # Surface HTTP errors here instead of as a confusing KeyError on
            # the error payload below.
            r.raise_for_status()
            parsed = json.loads(r.json()["choices"][0]["message"]["content"])
        except Exception as e:
            # Deliberately best-effort, but logged at WARNING: the logger runs
            # at INFO, so the previous DEBUG message was never visible.
            log.warning("Extract fail: %s", e)
            return {"osobe": []}

    # The model may return valid JSON of the wrong shape; normalise it so the
    # caller can iterate "osobe" and call .get() on each entry.
    if not isinstance(parsed, dict) or not isinstance(parsed.get("osobe"), list):
        return {"osobe": []}
    parsed["osobe"] = [o for o in parsed["osobe"] if isinstance(o, dict)]
    return parsed
|
||||
|
||||
|
||||
VALID_ULOGE = {
|
||||
"predsjednik","dopredsjednik","tajnik","blagajnik","clan_uprave",
|
||||
"igrac","sportas","glavni_trener","trener","pomocni_trener","kondicioni_trener",
|
||||
"selektor","izbornik","team_manager","voditelj","lijecnik","fizioterapeut",
|
||||
"kineziolog","maser","sudac","volonter"
|
||||
}
|
||||
|
||||
|
||||
def _person_row(o, year, klub_cache):
    """Validate one extracted person dict; return its INSERT tuple or None to skip it."""
    ime = str(o.get("ime") or "").strip()
    prezime = str(o.get("prezime") or "").strip()
    if len(ime) < 2 or len(prezime) < 2:
        return None
    # Basic sanity — no digits, no overly long names
    if re.search(r'\d', ime + prezime) or len(ime + prezime) > 60:
        return None

    uloga = str(o.get("uloga") or "igrac").lower().strip()
    if uloga not in VALID_ULOGE:
        uloga = "igrac"

    klub_naziv = str(o.get("klub") or "").strip()
    klub_id = fuzzy_klub(klub_naziv, klub_cache)
    return (
        ime, prezime, uloga, klub_id,
        "godisnjak",
        json.dumps({"year": year, "klub_naziv": klub_naziv}),
        "sportas",
    )


def _notify_telegram(text):
    """Send *text* to Telegram if TELEGRAM_BOT_TOKEN is configured; never raises on send errors."""
    # The bot token used to be hard-coded here. It is a credential: read it
    # from the environment and rotate the token that was committed.
    token = os.environ.get("TELEGRAM_BOT_TOKEN")
    if not token:
        log.warning("TELEGRAM_BOT_TOKEN not set - skipping Telegram notification")
        return
    import requests as rq
    try:
        rq.post(
            f"https://api.telegram.org/bot{token}/sendMessage",
            data={"chat_id": os.environ.get("TELEGRAM_CHAT_ID", "7969491558"),
                  "text": text},
            timeout=10,
        )
    except rq.RequestException as e:
        # The extraction itself succeeded; a failed notification must not
        # turn the run into an error.
        log.warning("Telegram notify failed: %s", e)


async def main():
    """Extract persons from every godisnjak_*_layout.txt and insert them into pgz_sport.clanovi.

    Steps: snapshot clanovi into a backup table (first run only), load the
    club cache, run the LLM extraction per file, insert the validated rows,
    then log the totals and send a Telegram notification.
    """
    conn = psycopg2.connect(DSN)
    conn.autocommit = True
    total_inserted = 0
    try:
        cur = conn.cursor()

        # Backup
        cur.execute("""CREATE TABLE IF NOT EXISTS pgz_sport.clanovi_pre_godisnjak_backup
            AS SELECT * FROM pgz_sport.clanovi WHERE 1=0""")
        # Fill the snapshot only while it is empty: previously every run
        # appended another full copy of clanovi, including earlier godisnjak rows.
        cur.execute("SELECT EXISTS (SELECT 1 FROM pgz_sport.clanovi_pre_godisnjak_backup)")
        if cur.fetchone()[0]:
            log.info("Backup already present - keeping existing snapshot")
        else:
            cur.execute("""INSERT INTO pgz_sport.clanovi_pre_godisnjak_backup
                SELECT * FROM pgz_sport.clanovi""")
            log.info("Backup created")

        klub_cache = load_klub_cache(conn)
        log.info(f"Klub cache: {len(klub_cache)} klubova")

        files = sorted(glob.glob(f"{DATA_DIR}/godisnjak_*_layout.txt"))
        log.info(f"Files: {len(files)}")

        semaphore = asyncio.Semaphore(MAX_WORKERS)

        for f in files:
            m = re.search(r'godisnjak_(\d{4})_layout', f)
            if not m:
                # The glob is looser than the regex; int("?") used to crash here.
                log.warning("Skipping %s: no 4-digit year in file name", f)
                continue
            year = int(m.group(1))
            with open(f) as fp:
                text = fp.read()

            chunks = chunk_text(text)
            log.info(f"Year {year}: {len(chunks)} chunks")

            results = await asyncio.gather(
                *(extract_persons(c, semaphore) for c in chunks)
            )

            rows = []
            for res in results:
                for o in res.get("osobe", []):
                    row = _person_row(o, year, klub_cache)
                    if row is not None:
                        rows.append(row)

            # NOTE(review): ON CONFLICT DO NOTHING only de-duplicates if
            # pgz_sport.clanovi has a matching unique constraint — not visible
            # here; without one, re-running inserts the same persons again.
            year_ins = 0
            for row in rows:
                try:
                    cur.execute("""
                        INSERT INTO pgz_sport.clanovi
                        (ime, prezime, uloga, klub_id, savez_izvor, metadata, kategorija)
                        VALUES (%s,%s,%s,%s,%s,%s,%s)
                        ON CONFLICT DO NOTHING
                    """, row)
                    if cur.rowcount:
                        year_ins += 1
                except Exception as e:
                    # Still best-effort per row, but visible at the INFO log level.
                    log.warning("Insert skip: %s", e)

            total_inserted += year_ins
            log.info(f" {year}: {year_ins} osoba inserted (running total: {total_inserted})")

        cur.execute("SELECT count(*) FROM pgz_sport.clanovi WHERE savez_izvor='godisnjak'")
        final = cur.fetchone()[0]
    finally:
        # Close the connection on errors as well.
        conn.close()

    log.info(f"""
=== EXTRACT DONE ===
Inserted this run: {total_inserted}
Total godisnjak u DB: {final}
""")

    _notify_telegram(
        f"✅ Godisnjak LLM extract DONE: {total_inserted} novih osoba, {final} total"
    )
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
@@ -1,4 +1,7 @@
|
||||
#!/usr/bin/env python3
|
||||
from dotenv import load_dotenv
|
||||
load_dotenv('/opt/rinet-gpu/.env.master')
|
||||
# auto-added by patch_scrapers_with_dotenv.sh
|
||||
"""
|
||||
Godišnjak pipeline:
|
||||
1. Find godišnjak PDFs in DB (table dokumenti) + scrape sport-pgz.hr
|
||||
@@ -13,7 +16,7 @@ import psycopg2
|
||||
from psycopg2.extras import RealDictCursor
|
||||
import pypdf
|
||||
|
||||
DSN = "host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password=R1net2026!SecureDB#v7"
|
||||
DSN = f"host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password={os.environ['DB_PASSWORD']}"
|
||||
UPLOAD_DIR = Path('/opt/pgz-sport/uploads/godisnjaci')
|
||||
UPLOAD_DIR.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
|
||||
+112
@@ -0,0 +1,112 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Godišnjak pipeline:
|
||||
1. Find godišnjak PDFs in DB (table dokumenti) + scrape sport-pgz.hr
|
||||
2. Download PDF lokalno
|
||||
3. Parse text iz PDF
|
||||
4. UPDATE pgz_sport.dokumenti SET sadrzaj = parsed_text
|
||||
5. Save chunks za RAG
|
||||
"""
|
||||
import os, sys, hashlib, requests, re
|
||||
from pathlib import Path
|
||||
import psycopg2
|
||||
from psycopg2.extras import RealDictCursor
|
||||
import pypdf
|
||||
|
||||
DSN = f"host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password={os.environ['DB_PASSWORD']}"
|
||||
UPLOAD_DIR = Path('/opt/pgz-sport/uploads/godisnjaci')
|
||||
UPLOAD_DIR.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
def download_pdf(url, dest):
    """Download *url* to *dest* unless a plausible copy is already there.

    Args:
        url: address of the PDF.
        dest: pathlib.Path of the target file.

    Returns:
        *dest* when the file exists with more than 1000 bytes or was
        downloaded successfully; None on an HTTP status other than 200, a body
        of 1000 bytes or fewer, or any exception.
    """
    if dest.exists() and dest.stat().st_size > 1000:
        return dest
    try:
        r = requests.get(url, headers={"User-Agent":"Mozilla/5.0"}, timeout=60, allow_redirects=True)
        if r.status_code != 200 or len(r.content) <= 1000:
            print(f" ✗ HTTP {r.status_code} ({len(r.content)} bytes): {url}")
            return None
        # Write to a sibling file and rename: an interrupted write must not
        # leave a truncated file that the size check above would later accept
        # as a cached download.
        tmp = dest.with_name(dest.name + '.part')
        tmp.write_bytes(r.content)
        tmp.replace(dest)
        return dest
    except Exception as e:
        print(f" ERR download {url}: {e}")
    return None
|
||||
|
||||
def parse_pdf(path):
    """Extract the text of every page of the PDF at *path*.

    Returns:
        (text, page_count). Each successfully extracted page contributes its
        text followed by a newline; pages whose extraction fails are skipped.
        ('', 0) is returned when the file cannot be opened or read at all.
    """
    try:
        reader = pypdf.PdfReader(str(path))
        parts = []
        failed = 0
        for page in reader.pages:
            try:
                parts.append((page.extract_text() or '') + '\n')
            except Exception:
                # Best-effort per page, but no longer a bare except (which
                # also swallowed KeyboardInterrupt/SystemExit).
                failed += 1
        if failed:
            print(f" WARN parse {path}: {failed} page(s) could not be extracted")
        return ''.join(parts), len(reader.pages)
    except Exception as e:
        print(f" ERR parse {path}: {e}")
        return '', 0
|
||||
|
||||
def main():
    """Download, parse and index the yearbook PDFs registered in pgz_sport.dokumenti.

    For each matching document without parsed content (sadrzaj of 500 chars or
    fewer): download the PDF, extract its text, store up to 500,000 characters
    in dokumenti.sadrzaj and rebuild up to 200 overlapping 1000-character
    chunks in dokument_chunks.
    """
    conn = psycopg2.connect(DSN); conn.autocommit = True
    try:
        # 1. All yearbooks that have a pdf_url, or a url ending in .pdf
        with conn.cursor(cursor_factory=RealDictCursor) as cur:
            # AND binds tighter than OR in SQL; the parentheses around the last
            # alternative only make the existing precedence explicit.
            # No parameters are passed, so the % wildcards are taken literally.
            cur.execute("""
                SELECT id, title, url, pdf_url, vrsta, sadrzaj
                FROM pgz_sport.dokumenti
                WHERE (
                    title ILIKE '%sportski godi%njak%' OR title ILIKE '%godi%njak HNS%'
                    OR title ILIKE 'ZSPGZ%' OR title ILIKE '%godi%njak ZSPGZ%'
                    OR url ILIKE '%godisnjak%.pdf' OR pdf_url ILIKE '%godisnjak%.pdf'
                    OR (title ILIKE '%godi%njak%' AND (url ILIKE '%pdf' OR pdf_url IS NOT NULL))
                )
                ORDER BY id DESC
            """)
            targets = cur.fetchall()

        print(f"Targets: {len(targets)}")

        parsed_count = 0
        for t in targets:
            url = t['pdf_url'] or t['url']
            if not url or not url.lower().endswith('.pdf'):
                continue

            if t['sadrzaj'] and len(t['sadrzaj']) > 500:
                print(f" ⏭ ID {t['id']}: already parsed ({len(t['sadrzaj'])} chars)")
                continue

            # Rows matched through url/pdf_url may have a NULL title.
            print(f" 📄 ID {t['id']}: {(t['title'] or '')[:60]}")
            fname = re.sub(r'[^\w.-]', '_', os.path.basename(url))[:100]
            dest = UPLOAD_DIR / f"{t['id']}_{fname}"

            downloaded = download_pdf(url, dest)
            if not downloaded:
                continue

            text, pages = parse_pdf(downloaded)
            if not text:
                continue

            print(f" ✓ {pages} pages, {len(text)} chars")

            # UPDATE sadrzaj
            with conn.cursor() as cur:
                cur.execute("""
                    UPDATE pgz_sport.dokumenti
                    SET sadrzaj = %s, last_updated = now()
                    WHERE id = %s
                """, (text[:500000], t['id']))  # cap 500K

            # Chunks for RAG: 1000-char windows every 800 chars (200 overlap).
            with conn.cursor() as cur:
                cur.execute("DELETE FROM pgz_sport.dokument_chunks WHERE dokument_id = %s", (t['id'],))
                chunks = [text[i:i+1000] for i in range(0, len(text), 800)]
                for i, ch in enumerate(chunks[:200]):
                    if len(ch.strip()) > 50:
                        cur.execute("""
                            INSERT INTO pgz_sport.dokument_chunks (dokument_id, chunk_idx, content)
                            VALUES (%s, %s, %s)
                        """, (t['id'], i, ch))

            parsed_count += 1

        print(f"\nDone. Parsed: {parsed_count}")
    finally:
        # The connection was never closed before.
        conn.close()
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
@@ -1,4 +1,7 @@
|
||||
#!/usr/bin/env python3
|
||||
from dotenv import load_dotenv
|
||||
load_dotenv('/opt/rinet-gpu/.env.master')
|
||||
# auto-added by patch_scrapers_with_dotenv.sh
|
||||
"""
|
||||
Godišnjak pipeline v2 — popravljen za pravu shemu.
|
||||
"""
|
||||
@@ -8,7 +11,7 @@ import psycopg2
|
||||
from psycopg2.extras import RealDictCursor
|
||||
import pypdf
|
||||
|
||||
DSN = "host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password=R1net2026!SecureDB#v7"
|
||||
DSN = f"host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password={os.environ['DB_PASSWORD']}"
|
||||
UPLOAD_DIR = Path('/opt/pgz-sport/uploads/godisnjaci')
|
||||
UPLOAD_DIR.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
|
||||
+150
@@ -0,0 +1,150 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Godišnjak pipeline v2 — popravljen za pravu shemu.
|
||||
"""
|
||||
import os, sys, hashlib, requests, re
|
||||
from pathlib import Path
|
||||
import psycopg2
|
||||
from psycopg2.extras import RealDictCursor
|
||||
import pypdf
|
||||
|
||||
DSN = f"host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password={os.environ['DB_PASSWORD']}"
|
||||
UPLOAD_DIR = Path('/opt/pgz-sport/uploads/godisnjaci')
|
||||
UPLOAD_DIR.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# 18 godišnjaka 2006-2024 (otkriveni scrapeom)
|
||||
GODISNJAK_URLS = [
|
||||
("https://sport-pgz.hr/upload/dokumenti/publikacije/godisnjak-2006-print.pdf", 2006),
|
||||
("https://sport-pgz.hr/upload/dokumenti/publikacije/Sportski-godisnjak-2007.pdf", 2007),
|
||||
("https://sport-pgz.hr/upload/dokumenti/publikacije/Sportski-godisnjak-2008.pdf", 2008),
|
||||
("https://sport-pgz.hr/upload/dokumenti/publikacije/Sportski-godisnjak-2009.pdf", 2009),
|
||||
("https://sport-pgz.hr/upload/dokumenti/publikacije/Sportski-godisnjak-2010.pdf", 2010),
|
||||
("https://sport-pgz.hr/upload/dokumenti/publikacije/sportski-godisnjak-2011_2025-10-07-125709_xcqb.pdf", 2011),
|
||||
("https://sport-pgz.hr/upload/dokumenti/publikacije/Sportski-godisnjak-2012.pdf", 2012),
|
||||
("https://sport-pgz.hr/upload/dokumenti/publikacije/Sportski-godisnjak-2013.pdf", 2013),
|
||||
("https://sport-pgz.hr/upload/dokumenti/publikacije/Sportski-godisnjak-2014.pdf", 2014),
|
||||
("https://sport-pgz.hr/upload/dokumenti/publikacije/Sportski-godisnjak-2015.pdf", 2015),
|
||||
("https://sport-pgz.hr/upload/dokumenti/publikacije/sportski-godisnjak-2017.pdf", 2017),
|
||||
("https://sport-pgz.hr/upload/dokumenti/publikacije/Sportski-godisnjak-2018.pdf", 2018),
|
||||
("https://sport-pgz.hr/upload/dokumenti/publikacije/Sportski-godisnjak-2019.pdf", 2019),
|
||||
("https://sport-pgz.hr/upload/dokumenti/publikacije/Sportski-godisnjak-2020.pdf", 2020),
|
||||
("https://sport-pgz.hr/upload/dokumenti/publikacije/Sportski-godisnjak-2021.pdf", 2021),
|
||||
("https://sport-pgz.hr/upload/dokumenti/sportski-godisnjaci/ZSPGZ-Sportski-godisnjak-2022.pdf", 2022),
|
||||
("https://sport-pgz.hr/upload/dokumenti/publikacije/ZSPGZ-Sportski-godisnjak-2023-2024-06-07_web.pdf", 2023),
|
||||
("https://sport-pgz.hr/upload/dokumenti/publikacije/ZSPGZ-Sportski-godisnjak-2024-2025-09-18_web.pdf", 2024),
|
||||
]
|
||||
|
||||
def download_pdf(url, dest):
    """Download *url* to *dest*, reusing an existing file larger than 1000 bytes.

    Args:
        url: address of the PDF.
        dest: pathlib.Path of the target file.

    Returns:
        *dest* on a cache hit or successful download; None on an HTTP status
        other than 200, a body of 1000 bytes or fewer, or any exception.
    """
    if dest.exists() and dest.stat().st_size > 1000:
        print(f" [cached] {dest.name} ({dest.stat().st_size//1024}KB)")
        return dest
    try:
        r = requests.get(url, headers={"User-Agent":"Mozilla/5.0"}, timeout=120, allow_redirects=True)
        if r.status_code == 200 and len(r.content) > 1000:
            # Write to a sibling file and rename, so an interrupted write
            # cannot leave a truncated file that later counts as "[cached]".
            tmp = dest.with_name(dest.name + '.part')
            tmp.write_bytes(r.content)
            tmp.replace(dest)
            return dest
        print(f" ✗ HTTP {r.status_code}")
    except Exception as e:
        print(f" ERR: {e}")
    return None
|
||||
|
||||
def parse_pdf(path):
    """Extract the text of every page of the PDF at *path*.

    Returns:
        (text, page_count). Each successfully extracted page contributes its
        text followed by a newline; pages whose extraction fails are skipped.
        ('', 0) is returned when the file cannot be opened or read at all.
    """
    try:
        reader = pypdf.PdfReader(str(path))
        parts = []
        for page in reader.pages:
            try:
                parts.append((page.extract_text() or '') + '\n')
            except Exception:
                # Best-effort per page; narrowed from a bare except so that
                # KeyboardInterrupt/SystemExit are no longer swallowed.
                continue
        return ''.join(parts), len(reader.pages)
    except Exception as e:
        # Report the reason; the caller only prints a generic "parse failed".
        print(f" ERR parse {path}: {e}")
        return '', 0
|
||||
|
||||
def main():
    """Download every yearbook in GODISNJAK_URLS, upsert it into dokumenti and rebuild its chunks.

    Documents are matched by the SHA-1 of the downloaded file: an existing row
    is updated, otherwise a new one is inserted. Up to 300 overlapping
    1000-character chunks per document are written to dokument_chunks, whose
    index/content column names are discovered from information_schema.
    """
    conn = psycopg2.connect(DSN); conn.autocommit = True
    try:
        # Get chunks table column names
        with conn.cursor() as cur:
            cur.execute("""
                SELECT column_name FROM information_schema.columns
                WHERE table_schema='pgz_sport' AND table_name='dokument_chunks'
            """)
            cols = [r[0] for r in cur.fetchall()]

        print(f"dokument_chunks columns: {cols}")

        # Resolve the column names once (they do not change per document).
        # A missing content column now yields None instead of silently falling
        # back to 'sadrzaj', which deleted the old chunks and then failed on
        # the first INSERT.
        idx_col = next((c for c in ('idx', 'chunk_idx', 'i', 'page') if c in cols), None)
        content_col = next((c for c in ('content', 'chunk', 'text', 'sadrzaj') if c in cols), None)
        can_chunk = idx_col is not None and content_col is not None
        if not can_chunk:
            print(" ✗ dokument_chunks has no known index/content column - chunks are left untouched")

        parsed_count = 0
        for url, godina in GODISNJAK_URLS:
            title = f"Sportski godišnjak ZSPGZ {godina}"
            fname = f"sportski-godisnjak-{godina}.pdf"
            dest = UPLOAD_DIR / fname

            print(f"\n📄 {title}")
            downloaded = download_pdf(url, dest)
            if not downloaded:
                continue

            # Compute SHA1
            sha1 = hashlib.sha1(downloaded.read_bytes()).hexdigest()

            text, pages = parse_pdf(downloaded)
            if not text:
                print(f" ✗ parse failed")
                continue
            print(f" ✓ {pages} pages, {len(text)} chars")

            # UPSERT into dokumenti
            with conn.cursor(cursor_factory=RealDictCursor) as cur:
                # Check if exists by sha1
                cur.execute("SELECT id FROM pgz_sport.dokumenti WHERE sha1 = %s LIMIT 1", (sha1,))
                existing = cur.fetchone()

                # NOTE(review): the UPDATE and INSERT below write different
                # 'organizacija' values for the same publisher — confirm which
                # spelling is canonical and unify.
                if existing:
                    doc_id = existing['id']
                    cur.execute("""
                        UPDATE pgz_sport.dokumenti
                        SET title = %s, godina = %s, vrsta = 'sportski-godisnjak',
                            url = %s, pdf_url = %s, sadrzaj = %s,
                            sluzbeni_glasnik = 'ZSPGZ', razina = 'zupanijska',
                            organizacija = 'Zajednica sportova Primorsko-goranske županije',
                            izvor_url = %s, last_updated = now()
                        WHERE id = %s
                    """, (title, godina, url, url, text[:500000], 'https://sport-pgz.hr', doc_id))
                    print(f" ↻ UPDATE id={doc_id}")
                else:
                    cur.execute("""
                        INSERT INTO pgz_sport.dokumenti
                        (title, fname, vrsta, godina, url, pdf_url, sha1, sadrzaj,
                         sluzbeni_glasnik, razina, organizacija, izvor_url)
                        VALUES (%s, %s, 'sportski-godisnjak', %s, %s, %s, %s, %s,
                                'ZSPGZ', 'zupanijska', 'Zajednica sportova PGŽ', 'https://sport-pgz.hr')
                        RETURNING id
                    """, (title, fname, godina, url, url, sha1, text[:500000]))
                    doc_id = cur.fetchone()['id']
                    print(f" + INSERT id={doc_id}")

            # Chunks: 1000-char windows every 800 chars (200 overlap), max 300.
            if can_chunk:
                with conn.cursor() as cur:
                    cur.execute("DELETE FROM pgz_sport.dokument_chunks WHERE dokument_id = %s", (doc_id,))
                    # Column names come from information_schema and are limited
                    # to the fixed candidates above, so interpolating them is safe.
                    insert_sql = f"""
                        INSERT INTO pgz_sport.dokument_chunks (dokument_id, {idx_col}, {content_col})
                        VALUES (%s, %s, %s)
                    """
                    chunks = [text[i:i+1000] for i in range(0, len(text), 800)]
                    for i, ch in enumerate(chunks[:300]):
                        if len(ch.strip()) > 50:
                            try:
                                cur.execute(insert_sql, (doc_id, i, ch))
                            except Exception as e:
                                print(f" ERR chunk {i}: {e}"); break

            parsed_count += 1

        print(f"\n✅ Done. Parsed: {parsed_count}/{len(GODISNJAK_URLS)}")
    finally:
        # The connection was never closed before.
        conn.close()
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
@@ -1,4 +1,7 @@
|
||||
#!/usr/bin/env python3
|
||||
from dotenv import load_dotenv
|
||||
load_dotenv('/opt/rinet-gpu/.env.master')
|
||||
# auto-added by patch_scrapers_with_dotenv.sh
|
||||
# Fajl: hns_avatar_harvester.py | v1.0 | 05.05.2026
|
||||
# Author: Damir Radulić
|
||||
# Lokacija: /opt/pgz-sport/scripts/hns_avatar_harvester.py
|
||||
@@ -8,7 +11,7 @@ import psycopg2
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
DSN = os.environ.get("RINET_DSN", "host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password=R1net2026!SecureDB#v7")
|
||||
DSN = os.environ.get("RINET_DSN", f"host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password={os.environ['DB_PASSWORD']}")
|
||||
HEADERS = {"User-Agent": "Mozilla/5.0 (Ri.NET PGŽ Sport Bot)"}
|
||||
|
||||
conn = psycopg2.connect(DSN); conn.autocommit = True
|
||||
|
||||
@@ -0,0 +1,64 @@
|
||||
#!/usr/bin/env python3
|
||||
# Fajl: hns_avatar_harvester.py | v1.0 | 05.05.2026
|
||||
# Author: Damir Radulić
|
||||
# Lokacija: /opt/pgz-sport/scripts/hns_avatar_harvester.py
|
||||
# Svrha: Dohvati avatar URL za svakog igrača sa HNS profila
|
||||
import os, time, re, json, sys
|
||||
import psycopg2
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
# Prefer an explicit RINET_DSN. DB_PASSWORD is only required for the fallback:
# as the default argument of .get() it was evaluated eagerly and raised
# KeyError even when RINET_DSN was set.
DSN = os.environ.get("RINET_DSN")
if DSN is None:
    DSN = f"host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password={os.environ['DB_PASSWORD']}"

HEADERS = {"User-Agent": "Mozilla/5.0 (Ri.NET PGŽ Sport Bot)"}

conn = psycopg2.connect(DSN); conn.autocommit = True
|
||||
|
||||
def fetch_avatar(hns_id, slug=""):
    """Return the avatar image URL found on a player's HNS profile page.

    Args:
        hns_id: HNS player id used to build the profile URL.
        slug: Optional URL slug appended after the id.

    Returns:
        The image URL as a string, or None when the page does not answer
        with HTTP 200, no matching <img> is found, or the request/parsing
        fails.
    """
    def _absolute(src):
        # Protocol-relative URLs ("//host/img.jpg") also start with "/", so
        # they must be handled first or they get glued onto the site host.
        if src.startswith("//"):
            return "https:" + src
        if src.startswith("/"):
            return "https://hns.family" + src
        return src

    url = f"https://semafor.hns.family/igraci/{hns_id}/"
    if slug:
        url += f"{slug}/"
    try:
        r = requests.get(url, headers=HEADERS, timeout=15)
        if r.status_code != 200:
            return None
        soup = BeautifulSoup(r.text, "html.parser")
        # Dedicated player-photo selectors, tried in priority order.
        for sel in (".playerPhoto img", ".player-photo img", ".playerHeader img", "img.player_photo"):
            img = soup.select_one(sel)
            if img and img.get("src"):
                return _absolute(img["src"])
        # Generic fallback: first <img> inside a header/info container.
        hdr = soup.select_one(".playerHeader, .player-header, .basic_info")
        if hdr:
            img = hdr.find("img")
            if img and img.get("src"):
                return _absolute(img["src"])
        return None
    except Exception as e:
        # Best-effort: one bad profile must not stop the batch, but the
        # failure is reported instead of being swallowed silently.
        print(f"  fetch_avatar failed for {url}: {e}", file=sys.stderr)
        return None
|
||||
|
||||
# Select up to 200 members that have an HNS player id but no photo URL yet.
with conn.cursor() as cur:
    cur.execute("""
        SELECT id, hns_igrac_id, ime, prezime
        FROM pgz_sport.clanovi
        WHERE hns_igrac_id IS NOT NULL AND foto_url IS NULL
        LIMIT 200
    """)
    rows = cur.fetchall()

print(f"Total: {len(rows)} igrača za avatar fetch")
hits = 0
# Fold Croatian diacritics to ASCII in a single pass.
_DIACRITICS = str.maketrans("ćčšžđ", "ccszd")
try:
    for i, (cid, hns_id, ime, prezime) in enumerate(rows):
        # ime/prezime may be NULL (the query does not filter them); without
        # the fallback the slug would literally contain "none".
        slug = f"{ime or ''}-{prezime or ''}".lower().translate(_DIACRITICS).replace(" ", "-")
        slug = re.sub(r"[^a-z0-9-]", "", slug).strip("-")
        avatar = fetch_avatar(hns_id, slug)
        if avatar:
            with conn.cursor() as cur:
                cur.execute("UPDATE pgz_sport.clanovi SET foto_url=%s WHERE id=%s", (avatar, cid))
            hits += 1
            # Progress line only for hits, so avatar is never None when sliced.
            if i % 10 == 0:
                print(f" [{i+1}/{len(rows)}] {ime} {prezime} → {avatar[:80]}")
        time.sleep(0.5)  # be polite to the remote server
finally:
    # Release the module-level connection even if the loop aborts.
    conn.close()

print(f"\nDONE: {hits}/{len(rows)} avatar URL-ova spremljen")
|
||||
@@ -1,4 +1,7 @@
|
||||
#!/usr/bin/env python3
|
||||
from dotenv import load_dotenv
|
||||
load_dotenv('/opt/rinet-gpu/.env.master')
|
||||
# auto-added by patch_scrapers_with_dotenv.sh
|
||||
"""
|
||||
HNS Master Harvester — Playwright-based scrape semafor.hns.family
|
||||
─────────────────────────────────────────────────────────────────
|
||||
@@ -18,7 +21,7 @@ from psycopg2.extras import RealDictCursor, execute_values
|
||||
from playwright.sync_api import sync_playwright
|
||||
|
||||
DSN = os.getenv("RINET_DSN",
|
||||
"host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password=R1net2026!SecureDB#v7")
|
||||
f"host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password={os.environ['DB_PASSWORD']}")
|
||||
TG = os.getenv("TG_BOT_TOKEN", "8535797835:AAFItT-92jzZ9NWFafLxn0dLa1_n2s-JE5Y")
|
||||
TG_CHAT = os.getenv("TG_CHAT", "7969491558")
|
||||
LOG = open(f"/var/log/pgz-sport-debug/hns_harvester_{datetime.now().strftime('%Y%m%d_%H%M')}.log", "a")
|
||||
@@ -210,12 +213,30 @@ def upsert_clan(conn, klub_id, player_data):
|
||||
def upsert_seasons(conn, hns_id, clan_id, seasons):
|
||||
if not seasons: return 0
|
||||
rows = []
|
||||
skipped = 0
|
||||
# Reject rows where klub_naziv is obviously a misparsed HTML stat-block
|
||||
# (the parser at scrape_player_full() can produce these when a <table> row
|
||||
# has fewer cells than the header — dict(zip(...)) silently drops, leaving
|
||||
# whole-block dumps or bare numbers in the value).
|
||||
_BAD_PREFIXES = ('STATISTIKA', 'NASTUPI', 'ZAPOČEO', 'ZAMJENA',
|
||||
'POGOTCI', 'ŽUTI', 'CRVENI', 'UKUPNO', 'SUPERSPORT')
|
||||
def _looks_like_garbage(klub_text):
|
||||
if not klub_text: return True
|
||||
t = klub_text.strip()
|
||||
if not t: return True # whitespace only
|
||||
if re.match(r'^\d+$', t): return True # bare number (year, jersey #)
|
||||
if t.count('\n') >= 2: return True # multi-line label dump
|
||||
u = t.upper()
|
||||
return any(u.startswith(p) for p in _BAD_PREFIXES)
|
||||
for s in seasons:
|
||||
sezona = s.get('sezona', '')
|
||||
if not sezona: continue
|
||||
# Try extract klub iz row
|
||||
klub = next((v for k,v in s.items() if 'lub' in k.lower()), '')
|
||||
natjecanje = next((v for k,v in s.items() if 'atjec' in k.lower() or 'liga' in k.lower()), '')
|
||||
if _looks_like_garbage(klub):
|
||||
skipped += 1
|
||||
continue
|
||||
def num(key):
|
||||
for k in s.keys():
|
||||
if key in k.lower():
|
||||
@@ -227,6 +248,8 @@ def upsert_seasons(conn, hns_id, clan_id, seasons):
|
||||
num('nastup'), num('start'), num('zamj'),
|
||||
num('gol'), num('asist'), num('žut'), num('crv'), num('minut')
|
||||
))
|
||||
if skipped:
|
||||
print(f'[hns_master_harvester] upsert_seasons: skipped {skipped} garbage rows for hns_id={hns_id}')
|
||||
with conn.cursor() as cur:
|
||||
execute_values(cur, """
|
||||
INSERT INTO pgz_sport.hns_player_seasons
|
||||
|
||||
+369
@@ -0,0 +1,369 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
HNS Master Harvester — Playwright-based scrape semafor.hns.family
|
||||
─────────────────────────────────────────────────────────────────
|
||||
1. List PGŽ financirani nogometni klubovi
|
||||
2. For each klub: scrape klub roster
|
||||
3. For each player: scrape full profile (sezone, utakmice)
|
||||
4. UPSERT u pgz_sport: hns_klub_roster, hns_player_seasons, hns_player_matches, clanovi
|
||||
5. Audit log
|
||||
|
||||
Usage: python3 hns_master_harvester.py [--limit N] [--klub-id X] [--single-player HNS_ID]
|
||||
"""
|
||||
import os, sys, time, json, re, argparse, traceback
|
||||
from datetime import datetime
|
||||
from urllib.parse import urlparse
|
||||
import psycopg2
|
||||
from psycopg2.extras import RealDictCursor, execute_values
|
||||
from playwright.sync_api import sync_playwright
|
||||
|
||||
# An explicit RINET_DSN wins; DB_PASSWORD is only required for the fallback DSN.
# (Passing the f-string as the getenv default evaluated it eagerly, so a missing
# DB_PASSWORD raised KeyError even when RINET_DSN was provided.)
DSN = os.environ["RINET_DSN"] if "RINET_DSN" in os.environ else (
    f"host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password={os.environ['DB_PASSWORD']}")
# SECURITY: the Telegram bot token must come from the environment only. The
# token that used to be hard-coded here as a default is in the repository
# history and should be revoked/rotated.
TG = os.getenv("TG_BOT_TOKEN", "")
TG_CHAT = os.getenv("TG_CHAT", "7969491558")
|
||||
LOG = open(f"/var/log/pgz-sport-debug/hns_harvester_{datetime.now().strftime('%Y%m%d_%H%M')}.log", "a")
|
||||
|
||||
def log(msg, telegram=False):
    """Write a timestamped line to stdout and the run log file.

    Args:
        msg: Message text.
        telegram: When true, also POST the message (truncated to 2000
            characters) to the Telegram bot API via curl. Delivery is
            best-effort and never raises.
    """
    line = f"[{datetime.now().isoformat(timespec='seconds')}] {msg}"
    print(line, flush=True)
    LOG.write(line + "\n")
    LOG.flush()
    # Nothing to send, or no bot token configured.
    if not telegram or not TG:
        return
    import subprocess
    try:
        subprocess.run(["curl", "-s", "-X", "POST",
                        f"https://api.telegram.org/bot{TG}/sendMessage",
                        "-d", f"chat_id={TG_CHAT}",
                        "--data-urlencode", f"text={str(msg)[:2000]}"],
                       timeout=8, capture_output=True)
    except (OSError, subprocess.SubprocessError):
        # curl missing or timed out: notification failure must not break the
        # harvest. Unlike a bare except, this lets Ctrl-C/SystemExit through.
        pass
|
||||
|
||||
def db_conn():
    """Open a new psycopg2 connection to DSN with autocommit switched on."""
    connection = psycopg2.connect(DSN)
    connection.autocommit = True
    return connection
|
||||
|
||||
# ── Slug HNS = "Franko Andrijašević" → "franko-andrijasevic" ──
|
||||
def slugify_hns(text):
    """Turn a display name into an HNS-style URL slug.

    The text is lower-cased and trimmed, Croatian diacritics are folded to
    ASCII, every character other than a-z, 0-9, whitespace and "-" is
    dropped, and whitespace runs become single dashes. Falsy input gives "".
    """
    if not text:
        return ""
    # After lower() only the lower-case diacritics can still be present.
    folded = text.lower().strip().translate(str.maketrans("čćžšđ", "cczsd"))
    ascii_only = re.sub(r'[^a-z0-9\s-]', '', folded)
    return re.sub(r'\s+', '-', ascii_only).strip('-')
|
||||
|
||||
def scrape_player(page, hns_id, slug):
    """Scrape one player profile page and collect its per-season rows.

    Args:
        page: Playwright page used for navigation and DOM queries.
        hns_id: HNS player id.
        slug: URL slug of the player.

    Returns:
        A dict with keys hns_id, slug, naziv (h1 text), url, current_klub
        (dict or None), sezone_count, seasons (list of dicts), matches
        (currently always empty) and body_text_len; or None when the
        navigation fails.
    """
    url = f"https://semafor.hns.family/igraci/{hns_id}/{slug}/"
    try:
        page.goto(url, wait_until="networkidle", timeout=30000)
    except Exception as e:
        log(f" ❌ Goto fail {url}: {e}")
        return None

    h1 = page.locator('h1').first.inner_text() if page.locator('h1').count() else ''

    # Current club: taken from the first link that points at /klubovi/.
    current_klub = None
    klub_links = page.locator('a[href*="/klubovi/"]').all()
    if klub_links:
        href = klub_links[0].get_attribute('href') or ''
        m = re.search(r'/klubovi/(\d+)/([\w-]+)/', href)
        if m:
            current_klub = {'hns_id': m.group(1), 'slug': m.group(2), 'naziv': klub_links[0].inner_text().strip()}

    seasons_data = []
    matches_data = []  # not populated yet; kept so the result shape is stable

    # Tables may be rendered dynamically: wait briefly, but a missing
    # selector (timeout) is not an error.
    try:
        page.wait_for_selector('table, .karijera, .sezona, [class*="season"]', timeout=8000)
    except Exception:
        pass
    time.sleep(1)

    # Read the body text only after the wait so dynamic content is included.
    body_text = page.locator('body').inner_text()

    # Career section as plain text: "Sezona | Klub | Natjecanje | Nastupi | Golovi"
    # e.g. 2024/25 ... HNK Orijent ... 3.HNL ... 14 ... 2
    season_blocks = re.findall(r'(20\d{2}/\d{2})\s+([\w\s\u017c-\u017e\u0107\u010d\u0161\u017d\u0110\.\-]+?)\s+([\d\.\s]+)(?=20\d{2}/\d{2}|$)', body_text)
    for sezona, klub_text, stats_text in season_blocks:
        nums = re.findall(r'\d+', stats_text)
        if nums:
            seasons_data.append({
                'sezona': sezona,
                'klub': klub_text.strip()[:200],
                'nastupi': int(nums[0]),
                'golovi': int(nums[1]) if len(nums) > 1 else 0,
            })

    # Every HTML table: the first row is the header; a data row counts as a
    # season row when any of its cells starts with a "YYYY/YY" value.
    for t in page.locator('table').all():
        rows = t.locator('tr').all()
        if len(rows) < 2:
            continue
        header = [c.inner_text().strip() for c in rows[0].locator('th, td').all()]
        for r in rows[1:]:
            cells = [c.inner_text().strip() for c in r.locator('th, td').all()]
            if not cells:
                continue
            # NOTE: zip() truncates to the shorter side, so rows with fewer
            # cells than the header end up with misaligned/missing columns.
            row_dict = dict(zip(header, cells))
            sezona = next((v for v in row_dict.values() if re.match(r'\d{4}/\d{2}', v)), None)
            if sezona:
                seasons_data.append({**row_dict, 'sezona': sezona})

    return {
        'hns_id': hns_id,
        'slug': slug,
        'naziv': h1,
        'url': url,
        'current_klub': current_klub,
        'sezone_count': len(seasons_data),
        'seasons': seasons_data,
        'matches': matches_data,
        'body_text_len': len(body_text),
    }
|
||||
|
||||
def scrape_klub_roster(page, klub_hns_id, klub_slug):
    """Collect the players linked from a club page.

    Returns a list of dicts (hns_id, slug, naziv, url), one per distinct
    player id in page order, or an empty list when navigation fails.
    """
    url = f"https://semafor.hns.family/klubovi/{klub_hns_id}/{klub_slug}/"
    try:
        page.goto(url, wait_until="networkidle", timeout=30000)
    except Exception as e:
        log(f" ❌ Goto fail {url}: {e}")
        return []

    # Keyed by player id: the first link wins and insertion order is kept.
    roster = {}
    for anchor in page.locator('a[href*="/igraci/"]').all():
        href = anchor.get_attribute('href') or ''
        found = re.search(r'/igraci/(\d+)/([\w-]+)', href)
        if found is None:
            continue
        player_id, player_slug = found.groups()
        if player_id in roster:
            continue
        roster[player_id] = {
            'hns_id': player_id,
            'slug': player_slug,
            'naziv': anchor.inner_text().strip(),
            'url': f"https://semafor.hns.family{href}" if href.startswith('/') else href
        }
    return list(roster.values())
|
||||
|
||||
def upsert_clan(conn, klub_id, player_data):
    """Insert or update the pgz_sport.clanovi row for a scraped HNS player.

    An existing row is looked up by hns_igrac_id; it is updated when found
    (empty ime/prezime and NULL klub_id/sport are filled in), otherwise a
    new row is inserted. Returns the id of the affected row.
    """
    # The h1 text may come glued together ("FrankoAndrijašević"), so split on
    # capitalised words instead of on whitespace.
    full_name = re.sub(r'\s+', ' ', player_data.get('naziv', '')).strip()
    capitalised = re.findall(r'[A-ZČĆŠŽĐ][a-zčćšžđ\']+', full_name)
    if len(capitalised) < 2:
        ime, prezime = full_name, ''
    else:
        ime, prezime = capitalised[0], ' '.join(capitalised[1:])

    hns_id = player_data['hns_id']
    url = player_data['url']

    with conn.cursor() as cur:
        cur.execute("""
            SELECT id FROM pgz_sport.clanovi
            WHERE hns_igrac_id = %s
            ORDER BY id LIMIT 1
        """, (hns_id,))
        existing = cur.fetchone()
        if not existing:
            cur.execute("""
                INSERT INTO pgz_sport.clanovi
                (klub_id, ime, prezime, sport, source, source_url, hns_igrac_id, last_scraped_at, aktivan)
                VALUES (%s, %s, %s, 'nogomet', 'hns_semafor', %s, %s, now(), true)
                RETURNING id
            """, (klub_id, ime, prezime, url, hns_id))
            clan_id = cur.fetchone()[0]
        else:
            clan_id = existing[0]
            cur.execute("""
                UPDATE pgz_sport.clanovi
                SET ime = COALESCE(NULLIF(ime,''), %s),
                prezime = COALESCE(NULLIF(prezime,''), %s),
                klub_id = COALESCE(klub_id, %s),
                hns_igrac_id = %s,
                source = 'hns_semafor',
                source_url = %s,
                last_updated = now(),
                last_scraped_at = now(),
                sport = COALESCE(sport, 'nogomet')
                WHERE id = %s
            """, (ime, prezime, klub_id, hns_id, url, clan_id))
    return clan_id
|
||||
|
||||
def upsert_seasons(conn, hns_id, clan_id, seasons):
    """Upsert scraped season rows into pgz_sport.hns_player_seasons.

    Args:
        conn: Open psycopg2 connection.
        hns_id: HNS player id the rows belong to.
        clan_id: id of the matching pgz_sport.clanovi row.
        seasons: List of dicts as produced by scrape_player(); column names
            are matched by substring (e.g. any key containing "lub" is
            taken as the club name).

    Returns:
        Number of rows sent to the database (0 for empty input).
    """
    if not seasons:
        return 0
    rows = []
    skipped = 0
    # Reject rows where klub_naziv is obviously a misparsed HTML stat-block
    # (the table parser in scrape_player() can produce these when a <table>
    # row has fewer cells than the header — dict(zip(...)) silently drops,
    # leaving whole-block dumps or bare numbers in the value).
    _BAD_PREFIXES = ('STATISTIKA', 'NASTUPI', 'ZAPOČEO', 'ZAMJENA',
                     'POGOTCI', 'ŽUTI', 'CRVENI', 'UKUPNO', 'SUPERSPORT')

    def _looks_like_garbage(klub_text):
        if not klub_text:
            return True
        t = klub_text.strip()
        if not t:
            return True  # whitespace only
        if re.match(r'^\d+$', t):
            return True  # bare number (year, jersey #)
        if t.count('\n') >= 2:
            return True  # multi-line label dump
        return t.upper().startswith(_BAD_PREFIXES)

    def _num(row, key):
        """Integer value of the first column whose name contains *key*, else 0."""
        for k, v in row.items():
            if key in k.lower():
                # Values are ints for regex-parsed rows and strings for table
                # cells; str() keeps re.sub from raising on ints, which
                # previously turned every such stat into 0.
                digits = re.sub(r'\D', '', str(v))
                return int(digits) if digits else 0
        return 0

    for s in seasons:
        sezona = s.get('sezona', '')
        if not sezona:
            continue
        # Club and competition columns are located by header substring.
        klub = next((v for k, v in s.items() if 'lub' in k.lower()), '')
        natjecanje = next((v for k, v in s.items() if 'atjec' in k.lower() or 'liga' in k.lower()), '')
        if _looks_like_garbage(klub):
            skipped += 1
            continue
        rows.append((
            hns_id, clan_id, sezona, None, klub, natjecanje,
            _num(s, 'nastup'), _num(s, 'start'), _num(s, 'zamj'),
            _num(s, 'gol'), _num(s, 'asist'), _num(s, 'žut'), _num(s, 'crv'), _num(s, 'minut')
        ))
    if skipped:
        print(f'[hns_master_harvester] upsert_seasons: skipped {skipped} garbage rows for hns_id={hns_id}')
    # NOTE(review): klub_hns_id is always sent as NULL; with a plain unique
    # constraint NULLs never conflict, so reruns may insert duplicates —
    # confirm against the table definition.
    with conn.cursor() as cur:
        execute_values(cur, """
            INSERT INTO pgz_sport.hns_player_seasons
            (hns_igrac_id, clan_id, sezona, klub_hns_id, klub_naziv, natjecanje,
            nastupi, startna, zamjena, golovi, asistencije, zuti, crveni, minute)
            VALUES %s
            ON CONFLICT (hns_igrac_id, sezona, klub_hns_id, natjecanje)
            DO UPDATE SET
            nastupi = EXCLUDED.nastupi, startna = EXCLUDED.startna,
            zamjena = EXCLUDED.zamjena, golovi = EXCLUDED.golovi,
            asistencije = EXCLUDED.asistencije, zuti = EXCLUDED.zuti,
            crveni = EXCLUDED.crveni, minute = EXCLUDED.minute,
            scraped_at = now()
        """, rows)
    return len(rows)
|
||||
|
||||
def upsert_klub_roster(conn, klub_id, klub_hns_id, players):
    """Upsert a club's scraped roster into pgz_sport.hns_klub_roster.

    Args:
        conn: Open psycopg2 connection.
        klub_id: Local club id.
        klub_hns_id: HNS club id.
        players: List of dicts with 'hns_id' and optional 'naziv',
            'pozicija', 'url'.

    Returns:
        Number of rows sent to the database (0 for empty input).
    """
    if not players:
        return 0
    rows = []
    for p in players:
        # First word is the given name, the rest the surname. Splitting once
        # and checking the result avoids an IndexError on whitespace-only names.
        name_parts = (p.get('naziv') or '').split()
        rows.append((klub_id, klub_hns_id, p['hns_id'],
                     name_parts[0] if name_parts else '',
                     ' '.join(name_parts[1:]),
                     p.get('pozicija', ''), p.get('url', '')))
    with conn.cursor() as cur:
        # On conflict only klub_id and scraped_at are refreshed.
        execute_values(cur, """
            INSERT INTO pgz_sport.hns_klub_roster
            (klub_id, klub_hns_id, hns_igrac_id, ime, prezime, pozicija, source_url)
            VALUES %s
            ON CONFLICT (klub_hns_id, hns_igrac_id)
            DO UPDATE SET klub_id = EXCLUDED.klub_id, scraped_at = now()
        """, rows)
    return len(rows)
|
||||
|
||||
def _run_harvest(conn, args):
    """Scrape the target clubs (or one player in test mode) using an open connection."""
    # Target clubs: PGŽ-funded football clubs, or the single club given by id.
    if args.single_player:
        klubovi = []
    else:
        with conn.cursor(cursor_factory=RealDictCursor) as cur:
            if args.klub_id is not None:
                cur.execute("SELECT * FROM pgz_sport.klubovi WHERE id = %s", (args.klub_id,))
            else:
                cur.execute("""
                    SELECT * FROM pgz_sport.v_pgz_financirani_klubovi
                    WHERE sport = 'nogomet' AND source_url LIKE %s
                    ORDER BY id LIMIT %s
                """, ('%semafor.hns.family/klubovi%', args.limit))
            klubovi = cur.fetchall()

    log(f"🚀 HNS Harvester starting. Target klubova: {len(klubovi)}", telegram=True)

    stats = {'klubova': 0, 'players_scraped': 0, 'seasons_upserted': 0, 'errors': 0}

    with sync_playwright() as pw:
        browser = pw.chromium.launch(headless=True, args=["--no-sandbox","--ignore-certificate-errors"])
        ctx = browser.new_context(
            ignore_https_errors=True,
            user_agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
        )
        page = ctx.new_page()

        if args.single_player:
            # Test mode: scrape one profile, log a preview, write nothing.
            log(f"🔬 Single player mode: {args.single_player}")
            data = scrape_player(page, args.single_player, 'unknown')
            log(f" Data: {json.dumps(data, default=str, ensure_ascii=False)[:500]}")
            browser.close()
            return

        for klub in klubovi:
            try:
                src = klub.get('source_url', '') or ''
                m = re.search(r'/klubovi/(\d+)/([^/]*)', src)
                if not m:
                    log(f" ⏭ Klub {klub['id']} {klub['naziv']} — no HNS URL")
                    continue
                khns, kslug = m.group(1), m.group(2) or 'klub'
                log(f"\n🏟 Klub {klub['id']} {klub['naziv']} → HNS {khns}/{kslug}")

                roster = scrape_klub_roster(page, khns, kslug)
                log(f" Roster: {len(roster)} igrača")

                if roster:
                    upsert_klub_roster(conn, klub['id'], khns, roster)

                # Each player
                for p in roster[:30]:  # safety: max 30 per klub for now
                    try:
                        time.sleep(0.5)
                        pdata = scrape_player(page, p['hns_id'], p['slug'])
                        if pdata:
                            clan_id = upsert_clan(conn, klub['id'], pdata)
                            n_seas = upsert_seasons(conn, pdata['hns_id'], clan_id, pdata.get('seasons', []))
                            stats['players_scraped'] += 1
                            stats['seasons_upserted'] += n_seas
                            log(f" ✓ {pdata['naziv']} (clan_id={clan_id}, seasons={n_seas})")
                    except Exception as e:
                        # One broken profile must not abort the whole club.
                        stats['errors'] += 1
                        log(f" ❌ Player {p['hns_id']}: {e}")

                stats['klubova'] += 1
            except Exception as e:
                stats['errors'] += 1
                log(f" ❌ Klub {klub['id']}: {e}\n{traceback.format_exc()[:500]}")

        browser.close()

    summary = f"✅ HNS Harvester done. Klubova: {stats['klubova']}, Players: {stats['players_scraped']}, Seasons: {stats['seasons_upserted']}, Errors: {stats['errors']}"
    log(summary, telegram=True)


def main():
    """CLI entry point: parse arguments, run the harvest, always close the DB connection.

    Options:
        --limit N           maximum number of clubs taken from the funded-clubs view
        --klub-id X         harvest only the club with this local id
        --single-player ID  test mode: scrape one HNS player and log the result
    """
    ap = argparse.ArgumentParser()
    ap.add_argument('--limit', type=int, default=999)
    ap.add_argument('--klub-id', type=int, default=None)
    ap.add_argument('--single-player', help='HNS ID of single player to scrape')
    args = ap.parse_args()

    conn = db_conn()
    try:
        _run_harvest(conn, args)
    finally:
        # Previously the connection was never closed, including on the
        # single-player early return.
        conn.close()
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
Executable
+300
@@ -0,0 +1,300 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
HNS PGŽ FULL SCRAPER – svi klubovi, igrači, sezone, utakmice
|
||||
─────────────────────────────────────────────────────────────
|
||||
Sprema u:
|
||||
pgz_sport.clanovi
|
||||
pgz_sport.hns_player_seasons
|
||||
pgz_sport.hns_player_matches
|
||||
Autor: Damir Radulić / dradulic@outlook.com
|
||||
Datum: 2026-05-15 (robustna verzija)
|
||||
"""
|
||||
|
||||
import os, re, sys, time, logging, json
|
||||
from datetime import datetime, timedelta
|
||||
import requests
|
||||
import psycopg2
|
||||
from psycopg2.extras import execute_values
|
||||
|
||||
# ─── LOG ───────────────────────────────────────────
|
||||
LOG_DIR = "/var/log/pgz-sport-debug"
|
||||
os.makedirs(LOG_DIR, exist_ok=True)
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format="%(asctime)s [FULL] %(message)s",
|
||||
handlers=[
|
||||
logging.FileHandler(os.path.join(LOG_DIR, "hns_full.log"), encoding="utf-8"),
|
||||
logging.StreamHandler(sys.stdout)
|
||||
]
|
||||
)
|
||||
log = logging.getLogger("hns_full")
|
||||
|
||||
# ─── DB CONN ────────────────────────────────────────
|
||||
from dotenv import load_dotenv
|
||||
load_dotenv('/opt/rinet-gpu/.env.master')
|
||||
DSN = f"host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password={os.environ['DB_PASSWORD']}"
|
||||
|
||||
def get_conn():
    """Return a new autocommit psycopg2 connection built from DSN."""
    connection = psycopg2.connect(DSN)
    connection.autocommit = True
    return connection
|
||||
|
||||
# ─── HTTP FETCH ─────────────────────────────────────
|
||||
UA = "Mozilla/5.0 (Ri.NET PGŽ Sport Bot; contact dradulic@outlook.com)"
|
||||
def fetch(url, retries=3):
    """GET *url* and return the response body text.

    Args:
        url: Absolute URL to download.
        retries: Maximum number of attempts.

    Returns:
        The body text, or None on HTTP 404 or when every attempt failed.
    """
    for attempt in range(1, retries + 1):
        try:
            r = requests.get(url, headers={"User-Agent": UA}, timeout=15)
            if r.status_code == 404:
                return None  # a missing page will not appear on retry
            r.raise_for_status()
            return r.text
        except requests.RequestException as e:
            # Only network/HTTP failures are retried; programming errors
            # now surface instead of being swallowed.
            log.warning("fetch %s failed (attempt %d/%d): %s", url, attempt, retries, e)
            if attempt < retries:
                time.sleep(1.5 * attempt)  # linear back-off, skipped after the last try
    return None
|
||||
|
||||
# ─── PARSIRANJE ─────────────────────────────────────
|
||||
def parse_roster(html, klub_hns_id):
    """Extract the players linked from a club roster page.

    Args:
        html: Roster page HTML.
        klub_hns_id: HNS id of the club (currently unused; kept for callers).

    Returns:
        List of (hns_igrac_id, ime, prezime, url) tuples, one per distinct
        player id in page order.
    """
    igraci = []
    seen_ids = set()
    # Every anchor that points at a player profile.
    for m in re.finditer(r'<a\s+[^>]*href="(/igraci/(\d+)/[^"]*)"[^>]*>(.*?)</a>', html, re.DOTALL):
        url = "https://semafor.hns.family" + m.group(1)
        hns_id = int(m.group(2))
        raw_name = re.sub(r'<[^>]+>', ' ', m.group(3)).strip()
        raw_name = re.sub(r'\s+', ' ', raw_name)
        if not raw_name:
            continue
        ime, _, prezime = raw_name.partition(' ')
        prezime = prezime.strip()
        # Skip administrative links that do not look like "Name Surname".
        if len(prezime) < 2:
            continue
        # A player can be linked more than once; duplicates would put the same
        # conflict key twice into one INSERT ... ON CONFLICT DO UPDATE batch.
        if hns_id in seen_ids:
            continue
        seen_ids.add(hns_id)
        igraci.append((hns_id, ime, prezime, url))
    return igraci
|
||||
|
||||
def parse_player_seasons(html, hns_igrac_id):
    """Parse the per-season rows from a player profile page.

    The first table whose class contains "playerSeason" is used, then one
    whose class contains "career", then simply the first table. A row is
    kept only when its first cell starts with a season ("YYYY/YY") and the
    row contains a link to a club page.

    Returns a list of dicts with the keys hns_igrac_id, sezona, klub_hns_id,
    klub_naziv, natjecanje, nastupi, golovi, asistencije, zuti, crveni, minute.
    """
    def _text(fragment):
        # Drop markup and surrounding whitespace.
        return re.sub(r'<[^>]+>', '', fragment).strip()

    table = None
    for pattern in (
        r'<table[^>]*class="[^"]*playerSeason[^"]*"[^>]*>(.*?)</table>',
        r'<table[^>]*class="[^"]*career[^"]*"[^>]*>(.*?)</table>',
        r'<table[^>]*>(.*?)</table>',  # fallback: any table
    ):
        table = re.search(pattern, html, re.DOTALL)
        if table:
            break
    if not table:
        return []

    # Expected layout: season, club (link), competition, then the statistics
    # at fixed column positions.
    stat_by_column = {3: "nastupi", 4: "golovi", 5: "asistencije",
                      6: "zuti", 7: "crveni", 8: "minute"}
    result = []
    for tr in re.finditer(r'<tr[^>]*>(.*?)</tr>', table.group(1), re.DOTALL):
        row_html = tr.group(1)
        cells = re.findall(r'<td[^>]*>(.*?)</td>', row_html, re.DOTALL)
        if len(cells) < 3:
            continue

        first = _text(cells[0])
        sezona = first if re.match(r'\d{4}/\d{2,4}', first) else None

        club = re.search(r'<a[^>]*href="(/klubovi/(\d+)/[^"]*)"[^>]*>(.*?)</a>', row_html, re.DOTALL)
        klub_hns_id = int(club.group(2)) if club else None
        klub_naziv = _text(club.group(3)) if club else ""

        # Competition is the third cell, or the second when no club was found.
        natjecanje = _text(cells[2] if klub_hns_id else cells[1])

        stats = dict.fromkeys(stat_by_column.values(), 0)
        for idx, cell in enumerate(cells):
            value = _text(cell)
            if value.isdigit():
                number = int(value)
                column = stat_by_column.get(idx)
                if column:
                    stats[column] = number

        if not (sezona and klub_hns_id):
            continue
        result.append({
            "hns_igrac_id": hns_igrac_id,
            "sezona": sezona,
            "klub_hns_id": str(klub_hns_id),
            "klub_naziv": klub_naziv,
            "natjecanje": natjecanje,
            **stats,
        })
    return result
|
||||
|
||||
def parse_player_matches(html, hns_igrac_id):
    """Parse match rows from a player profile page.

    Uses the first table whose class contains "match", otherwise the first
    table on the page. Rows need at least five cells; the first four are
    read as date, home team, away team and result.

    Returns a list of dicts with the keys hns_igrac_id, datum, domacin,
    gost, rezultat.
    """
    table = (re.search(r'<table[^>]*class="[^"]*match[^"]*"[^>]*>(.*?)</table>', html, re.DOTALL)
             or re.search(r'<table[^>]*>(.*?)</table>', html, re.DOTALL))
    if table is None:
        return []

    found = []
    for tr in re.finditer(r'<tr[^>]*>(.*?)</tr>', table.group(1), re.DOTALL):
        raw_cells = re.findall(r'<td[^>]*>(.*?)</td>', tr.group(1), re.DOTALL)
        if len(raw_cells) < 5:
            continue
        # Layout: date, home, away, result, (possibly minutes, goals, ...)
        datum, domacin, gost, rezultat = (
            re.sub(r'<[^>]+>', '', cell).strip() for cell in raw_cells[:4]
        )
        if datum and domacin:
            found.append({
                "hns_igrac_id": hns_igrac_id,
                "datum": datum,
                "domacin": domacin,
                "gost": gost,
                "rezultat": rezultat
            })
    return found
|
||||
|
||||
# ─── UPSERT U BAZU ──────────────────────────────────
|
||||
def upsert_players(conn, players):
    """Bulk upsert players into pgz_sport.clanovi, keyed on hns_igrac_id.

    *players* is a sequence of (hns_igrac_id, ime, prezime, source_url,
    klub_hns_id) tuples; on conflict the existing row's name, URL and club
    id are overwritten.
    """
    statement = """
        INSERT INTO pgz_sport.clanovi (hns_igrac_id, ime, prezime, source_url, klub_hns_id)
        VALUES %s
        ON CONFLICT (hns_igrac_id) DO UPDATE SET
        ime = EXCLUDED.ime,
        prezime = EXCLUDED.prezime,
        source_url = EXCLUDED.source_url,
        klub_hns_id = EXCLUDED.klub_hns_id
    """
    with conn.cursor() as cursor:
        execute_values(cursor, statement, players)
|
||||
|
||||
def upsert_seasons(conn, seasons):
    """Bulk upsert season dicts into pgz_sport.hns_player_seasons.

    Args:
        conn: Open psycopg2 connection.
        seasons: List of dicts as returned by parse_player_seasons().
            Nothing is done for an empty list.
    """
    if not seasons:
        return
    # scraped_at is refreshed on conflict as well: main() uses
    # MAX(scraped_at) for its 7-day freshness check, so without this an
    # already-known player would be re-scraped on every run.
    sql = """
        INSERT INTO pgz_sport.hns_player_seasons
        (hns_igrac_id, sezona, natjecanje, klub_hns_id, klub_naziv,
        nastupi, golovi, asistencije, zuti, crveni, minute, source_url)
        VALUES %s
        ON CONFLICT (hns_igrac_id, sezona, klub_hns_id, natjecanje) DO UPDATE SET
        nastupi = EXCLUDED.nastupi,
        golovi = EXCLUDED.golovi,
        asistencije = EXCLUDED.asistencije,
        zuti = EXCLUDED.zuti,
        crveni = EXCLUDED.crveni,
        minute = EXCLUDED.minute,
        klub_naziv = EXCLUDED.klub_naziv,
        scraped_at = now()
    """
    # Column order must match the INSERT list; source_url is stored empty.
    columns = ("hns_igrac_id", "sezona", "natjecanje", "klub_hns_id", "klub_naziv",
               "nastupi", "golovi", "asistencije", "zuti", "crveni", "minute")
    vals = [tuple(s[c] for c in columns) + ("",) for s in seasons]
    with conn.cursor() as cur:
        execute_values(cur, sql, vals, page_size=100)
|
||||
|
||||
def upsert_matches(conn, matches):
    """Bulk insert match dicts into pgz_sport.hns_player_matches.

    Rows that already exist for (hns_igrac_id, datum, domacin, gost) are
    left untouched. Nothing is done for an empty list.
    """
    if not matches:
        return
    statement = """
        INSERT INTO pgz_sport.hns_player_matches
        (hns_igrac_id, datum, domacin, gost, rezultat)
        VALUES %s
        ON CONFLICT (hns_igrac_id, datum, domacin, gost) DO NOTHING
    """
    records = []
    for match in matches:
        records.append((match["hns_igrac_id"], match["datum"], match["domacin"],
                        match["gost"], match["rezultat"]))
    with conn.cursor() as cursor:
        execute_values(cursor, statement, records, page_size=100)
|
||||
|
||||
# ─── MAIN ───────────────────────────────────────────
|
||||
def main():
    """Scrape rosters, seasons and matches for all clubs with savez_id = 10.

    For every club with an hns_klub_id the roster page is fetched and its
    players are upserted; each player whose seasons were not scraped within
    the last 7 days is then fetched and its seasons/matches are upserted.
    """
    log.info("=== START FULL PGŽ HNS SCRAPE ===")
    conn = get_conn()
    total_players = 0
    total_seasons = 0
    total_matches = 0
    try:
        with conn.cursor() as cur:
            # 1. All PGŽ clubs that have an hns_klub_id
            cur.execute("""
                SELECT id, naziv, hns_klub_id
                FROM pgz_sport.klubovi
                WHERE savez_id = 10 AND hns_klub_id IS NOT NULL
            """)
            klubovi = cur.fetchall()
            log.info(f"Klubova za obradu: {len(klubovi)}")

            for klub_id, klub_naziv, hns_klub_id in klubovi:
                log.info(f"🏟️ {klub_naziv} (HNS {hns_klub_id})")
                # 2. Roster
                roster_url = f"https://semafor.hns.family/klubovi/{hns_klub_id}/igraci/"
                html = fetch(roster_url)
                if not html:
                    log.warning(f" ⚠️ Nema rostera za {klub_naziv}")
                    continue
                players = parse_roster(html, hns_klub_id)
                if not players:
                    log.warning(f" ⚠️ Nema igrača")
                    continue
                # Add klub_hns_id to every player tuple (needed by the upsert).
                players_with_klub = [(p[0], p[1], p[2], p[3], str(hns_klub_id)) for p in players]
                upsert_players(conn, players_with_klub)
                log.info(f" 👥 {len(players)} igrača")

                # 3. Seasons and matches for every player not scraped recently
                for hns_id, ime, prezime, url in players:
                    cur.execute("""
                        SELECT MAX(scraped_at) FROM pgz_sport.hns_player_seasons
                        WHERE hns_igrac_id = %s
                    """, (hns_id,))
                    last = cur.fetchone()[0]
                    # now(last.tzinfo) matches the awareness of the stored
                    # timestamp: naive minus aware raises TypeError.
                    if last and (datetime.now(last.tzinfo) - last) < timedelta(days=7):
                        continue  # skip freshly scraped players

                    html = fetch(url)
                    if not html:
                        continue
                    seasons = parse_player_seasons(html, hns_id)
                    if seasons:
                        upsert_seasons(conn, seasons)
                        total_seasons += len(seasons)
                    matches = parse_player_matches(html, hns_id)
                    if matches:
                        upsert_matches(conn, matches)
                        total_matches += len(matches)
                    time.sleep(0.3)  # be polite to the server

                total_players += len(players)
                time.sleep(1)  # short pause between clubs
    finally:
        # Close the connection even when a club/player raises.
        conn.close()
    log.info(f"=== GOTOVO: {total_players} igrača, {total_seasons} sezona, {total_matches} utakmica ===")
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
Executable
+219
@@ -0,0 +1,219 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
HNS PGŽ FULL SCRAPER v2 – ispravljen URL roster-a
|
||||
Koristi sub1_hns_catalog.json za točne URL-ove klubova
|
||||
"""
|
||||
|
||||
import os, re, sys, time, logging, json
|
||||
from datetime import datetime, timedelta
|
||||
import requests
|
||||
import psycopg2
|
||||
from psycopg2.extras import execute_values
|
||||
|
||||
# ─── LOG ───────────────────────────────────────────
|
||||
LOG_DIR = "/var/log/pgz-sport-debug"
|
||||
os.makedirs(LOG_DIR, exist_ok=True)
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format="%(asctime)s [FULL] %(message)s",
|
||||
handlers=[
|
||||
logging.FileHandler(os.path.join(LOG_DIR, "hns_full.log"), encoding="utf-8"),
|
||||
logging.StreamHandler(sys.stdout)
|
||||
]
|
||||
)
|
||||
log = logging.getLogger("hns_full")
|
||||
|
||||
# ─── DB ────────────────────────────────────────────
|
||||
from dotenv import load_dotenv
|
||||
load_dotenv('/opt/rinet-gpu/.env.master')
|
||||
DSN = f"host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password={os.environ['DB_PASSWORD']}"
|
||||
|
||||
def get_conn():
    """Open and return a new psycopg2 connection built from the module-level DSN."""
    connection = psycopg2.connect(DSN)
    return connection
|
||||
|
||||
UA = "Mozilla/5.0 (Ri.NET PGŽ Sport Bot)"
|
||||
|
||||
def fetch(url, retries=3):
    """Download ``url`` and return the response body as text.

    A 404 response returns ``None`` immediately without retrying. Any other
    request failure (connection error, timeout, non-2xx status) is retried,
    pausing ``1.5 * attempt`` seconds between attempts.

    Args:
        url: Absolute URL to fetch.
        retries: Maximum number of attempts.

    Returns:
        The response text, or ``None`` on 404 or when every attempt failed.
    """
    for attempt in range(1, retries + 1):
        try:
            r = requests.get(url, headers={"User-Agent": UA}, timeout=15)
            if r.status_code == 404:
                return None
            r.raise_for_status()
            return r.text
        except requests.RequestException:
            # Only request-level failures are retried; a bare ``except`` would
            # also swallow KeyboardInterrupt and hide programming errors.
            if attempt < retries:
                # No point in waiting after the final attempt.
                time.sleep(1.5 * attempt)
    return None
|
||||
|
||||
# ─── PARSIRANJE ─────────────────────────────────────
|
||||
def parse_roster(html):
    """Extract the players linked from a club page.

    Every ``<a>`` whose href has the form ``/igraci/<id>/...`` is inspected.
    The anchor text (nested tags removed, whitespace collapsed) is split at
    the first space into first name and surname. Anchors without text, or
    whose surname is shorter than two characters, are ignored.

    Args:
        html: Raw HTML of the club page.

    Returns:
        A list of ``(hns_igrac_id, ime, prezime, profil_url)`` tuples holding
        exactly one entry per player id, in order of first appearance.
    """
    by_id = {}
    for m in re.finditer(r'<a\s+[^>]*href="(/igraci/(\d+)/[^"]*)"[^>]*>(.*?)</a>', html, re.DOTALL):
        url = "https://semafor.hns.family" + m.group(1)
        hns_id = int(m.group(2))
        raw_name = re.sub(r'<[^>]+>', ' ', m.group(3)).strip()
        raw_name = re.sub(r'\s+', ' ', raw_name)
        if not raw_name:
            continue
        ime, _, prezime = raw_name.partition(' ')
        ime = ime.strip()
        prezime = prezime.strip()
        if len(prezime) < 2:
            continue
        # The same player can be linked more than once on a page. The result
        # feeds a single INSERT ... ON CONFLICT DO UPDATE statement, which
        # fails when one statement touches the same key twice, so keep only
        # one tuple per id (the latest text wins).
        by_id[hns_id] = (hns_id, ime, prezime, url)
    return list(by_id.values())
|
||||
|
||||
def _to_int(value):
    """Convert a JSON value to ``int``; null or non-numeric values count as 0."""
    try:
        return int(value)
    except (TypeError, ValueError):
        return 0


def parse_seasons(html, hns_igrac_id):
    """Return the player's season rows found in the page's JSON-LD block.

    Only the first ``<script type="application/ld+json">`` element is read and
    its ``playerSeason`` list is converted. A missing script, invalid JSON, or
    an unexpected JSON shape yields an empty list. A single null or
    non-numeric statistic is stored as 0 instead of discarding every season.

    Args:
        html: Raw HTML of the player profile page.
        hns_igrac_id: Player id copied into every returned row.

    Returns:
        A list of dicts with the keys ``hns_igrac_id``, ``sezona``,
        ``klub_hns_id``, ``klub_naziv``, ``natjecanje``, ``nastupi``,
        ``golovi``, ``asistencije``, ``zuti``, ``crveni`` and ``minute``.
    """
    json_match = re.search(r'<script[^>]*type="application/ld\+json"[^>]*>(.*?)</script>', html, re.DOTALL)
    if not json_match:
        return []
    try:
        data = json.loads(json_match.group(1))
    except ValueError:
        # json.JSONDecodeError is a ValueError subclass.
        return []
    if not isinstance(data, dict):
        return []
    seasons_data = data.get('playerSeason') or []
    if not isinstance(seasons_data, list):
        return []

    seasons = []
    for s in seasons_data:
        if not isinstance(s, dict):
            continue
        seasons.append({
            "hns_igrac_id": hns_igrac_id,
            "sezona": s.get("season", ""),
            "klub_hns_id": str(s.get("clubId", "")),
            "klub_naziv": s.get("clubName", ""),
            "natjecanje": s.get("competition", ""),
            "nastupi": _to_int(s.get("apps", 0)),
            "golovi": _to_int(s.get("goals", 0)),
            "asistencije": _to_int(s.get("assists", 0)),
            "zuti": _to_int(s.get("yellow", 0)),
            "crveni": _to_int(s.get("red", 0)),
            "minute": _to_int(s.get("minutes", 0)),
        })
    return seasons
|
||||
|
||||
def parse_matches(html, hns_igrac_id):
    """Return the player's match rows found in the page's JSON-LD block.

    Only the first ``<script type="application/ld+json">`` element is read and
    its ``playerMatch`` list is converted. A missing script, invalid JSON, a
    null ``playerMatch`` or an unexpected JSON shape yields an empty list;
    list entries that are not objects are skipped.

    Args:
        html: Raw HTML of the player profile page.
        hns_igrac_id: Player id copied into every returned row.

    Returns:
        A list of dicts with the keys ``hns_igrac_id``, ``datum``,
        ``domacin``, ``gost`` and ``rezultat``.
    """
    json_match = re.search(r'<script[^>]*type="application/ld\+json"[^>]*>(.*?)</script>', html, re.DOTALL)
    if not json_match:
        return []
    try:
        data = json.loads(json_match.group(1))
    except ValueError:
        # json.JSONDecodeError is a ValueError subclass.
        return []
    if not isinstance(data, dict):
        return []
    matches_data = data.get('playerMatch') or []
    if not isinstance(matches_data, list):
        return []
    return [
        {
            "hns_igrac_id": hns_igrac_id,
            "datum": m.get("date", ""),
            "domacin": m.get("homeTeam", ""),
            "gost": m.get("awayTeam", ""),
            "rezultat": m.get("result", ""),
        }
        for m in matches_data
        if isinstance(m, dict)
    ]
|
||||
|
||||
# ─── UPSERT ─────────────────────────────────────────
|
||||
def upsert_players(conn, players):
    """Insert or update players in ``pgz_sport.clanovi`` keyed by ``hns_igrac_id``.

    Args:
        conn: Open psycopg2 connection.
        players: Iterable of ``(hns_igrac_id, ime, prezime, source_url)`` tuples.

    Returns:
        None. Nothing is sent to the database when ``players`` is empty.
    """
    # One statement may not update the same conflict key twice, so collapse
    # repeated ids (the last tuple for an id wins).
    unique = list({p[0]: p for p in players}.values())
    if not unique:
        return
    sql = """INSERT INTO pgz_sport.clanovi (hns_igrac_id, ime, prezime, source_url)
             VALUES %s
             ON CONFLICT (hns_igrac_id) DO UPDATE SET
               ime = EXCLUDED.ime, prezime = EXCLUDED.prezime, source_url = EXCLUDED.source_url"""
    with conn.cursor() as cur:
        execute_values(cur, sql, unique)
|
||||
|
||||
def upsert_seasons(conn, seasons):
    """Insert or update season statistics in ``pgz_sport.hns_player_seasons``.

    Rows are keyed by ``(hns_igrac_id, sezona, klub_hns_id, natjecanje)``. On
    conflict the statistics and club name are overwritten and ``scraped_at``
    is refreshed, because ``main`` reads ``MAX(scraped_at)`` to decide whether
    a player was scraped recently.

    Args:
        conn: Open psycopg2 connection.
        seasons: List of season dicts as produced by ``parse_seasons``.

    Returns:
        None. Nothing is sent to the database when ``seasons`` is empty.
    """
    if not seasons:
        return
    sql = """INSERT INTO pgz_sport.hns_player_seasons
             (hns_igrac_id, sezona, natjecanje, klub_hns_id, klub_naziv,
              nastupi, golovi, asistencije, zuti, crveni, minute)
             VALUES %s
             ON CONFLICT (hns_igrac_id, sezona, klub_hns_id, natjecanje) DO UPDATE SET
               nastupi=EXCLUDED.nastupi, golovi=EXCLUDED.golovi,
               asistencije=EXCLUDED.asistencije, zuti=EXCLUDED.zuti,
               crveni=EXCLUDED.crveni, minute=EXCLUDED.minute,
               klub_naziv=EXCLUDED.klub_naziv,
               scraped_at=now()"""
    # One statement may not update the same conflict key twice, so keep only
    # the last row seen for each key.
    unique = {}
    for s in seasons:
        key = (s['hns_igrac_id'], s['sezona'], s['klub_hns_id'], s['natjecanje'])
        unique[key] = (s['hns_igrac_id'], s['sezona'], s['natjecanje'], s['klub_hns_id'],
                       s['klub_naziv'], s['nastupi'], s['golovi'], s['asistencije'],
                       s['zuti'], s['crveni'], s['minute'])
    with conn.cursor() as cur:
        execute_values(cur, sql, list(unique.values()), page_size=50)
|
||||
|
||||
def upsert_matches(conn, matches):
    """Insert match rows into ``pgz_sport.hns_player_matches``.

    Rows that collide on ``(hns_igrac_id, datum, domacin, gost)`` are left
    untouched. Returns ``None``; an empty ``matches`` list is a no-op.
    """
    if not matches:
        return
    columns = ('hns_igrac_id', 'datum', 'domacin', 'gost', 'rezultat')
    vals = [tuple(entry[col] for col in columns) for entry in matches]
    sql = """INSERT INTO pgz_sport.hns_player_matches
             (hns_igrac_id, datum, domacin, gost, rezultat)
             VALUES %s
             ON CONFLICT (hns_igrac_id, datum, domacin, gost) DO NOTHING"""
    with conn.cursor() as cur:
        execute_values(cur, sql, vals, page_size=50)
|
||||
|
||||
# ─── MAIN ───────────────────────────────────────────
|
||||
def main():
    """Scrape rosters, season statistics and match lists for the PGŽ clubs.

    Steps:
      1. Load the club catalog JSON and build a club-id -> club-page URL map.
      2. Read the clubs with ``savez_id = 10`` and a non-null ``hns_klub_id``.
      3. For each club, fetch its page, upsert the roster, then fetch every
         player profile that has no season row scraped in the last 7 days and
         upsert its seasons and matches.

    The connection runs in autocommit mode and is closed even when the run
    fails part-way through.
    """
    log.info("=== START FULL PGŽ HNS SCRAPE v2 ===")
    conn = get_conn()
    conn.autocommit = True
    total_players = total_seasons = total_matches = 0

    try:
        # 1. Load the catalog that holds the club page URLs.
        with open('/opt/pgz-sport/cc_tasks/sub1_hns_catalog.json', 'r', encoding='utf-8') as f:
            catalog = json.load(f)
        klub_url_map = {}
        for item in catalog:
            klub_url_map[item['id']] = f"https://semafor.hns.family/klubovi/{item['id']}/{item['slug']}/"
        log.info(f"Učitano {len(klub_url_map)} klubova iz kataloga.")

        # 2. Fetch the clubs that have an HNS id and belong to savez 10.
        cur = conn.cursor()
        cur.execute("SELECT id, naziv, hns_klub_id FROM pgz_sport.klubovi WHERE savez_id = 10 AND hns_klub_id IS NOT NULL")
        klubovi = cur.fetchall()
        log.info(f"Klubova za obradu: {len(klubovi)}")

        for klub_id, naziv, hns_id in klubovi:
            # NOTE(review): this lookup only hits when the catalog 'id' and the
            # DB hns_klub_id have the same Python type (int vs str) — confirm.
            klub_url = klub_url_map.get(hns_id)
            if not klub_url:
                log.warning(f" ⚠️ {naziv} (HNS {hns_id}) nema URL u katalogu, preskačem.")
                continue
            log.info(f"🏟️ {naziv} → {klub_url}")
            html = fetch(klub_url)
            if not html:
                log.warning(f" ❌ Ne mogu dohvatiti stranicu kluba.")
                continue

            players = parse_roster(html)
            if not players:
                log.warning(f" ⚠️ Nema igrača.")
                continue

            # Upsert the roster before fetching per-player details.
            upsert_players(conn, players)
            log.info(f" 👥 {len(players)} igrača")

            for hns_igrac_id, ime, prezime, profile_url in players:
                # Skip players whose seasons were scraped within the last week.
                cur.execute("SELECT MAX(scraped_at) FROM pgz_sport.hns_player_seasons WHERE hns_igrac_id = %s", (hns_igrac_id,))
                last = cur.fetchone()[0]
                # Build "now" with the same tz-awareness as the stored value;
                # subtracting an aware datetime from a naive one raises TypeError.
                if last and (datetime.now(last.tzinfo) - last) < timedelta(days=7):
                    continue
                html = fetch(profile_url)
                if not html:
                    continue
                seasons = parse_seasons(html, hns_igrac_id)
                if seasons:
                    upsert_seasons(conn, seasons)
                    total_seasons += len(seasons)
                matches = parse_matches(html, hns_igrac_id)
                if matches:
                    upsert_matches(conn, matches)
                    total_matches += len(matches)
                time.sleep(0.3)  # be polite to the remote server
            total_players += len(players)
    finally:
        conn.close()

    log.info(f"=== GOTOVO: {total_players} igrača, {total_seasons} sezona, {total_matches} utakmica ===")
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
Executable
+218
@@ -0,0 +1,218 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
import os, sys, re, time, logging
|
||||
import requests
|
||||
from requests.exceptions import RequestException
|
||||
import psycopg2
|
||||
from psycopg2.extras import execute_values
|
||||
from dotenv import load_dotenv
|
||||
|
||||
load_dotenv('/opt/.env.rinet')
|
||||
|
||||
# --- LOGGING ---
|
||||
LOG_DIR = "/var/log/pgz-sport-sync"
|
||||
os.makedirs(LOG_DIR, exist_ok=True)
|
||||
LOG_FILE = os.path.join(LOG_DIR, "sync_master.log")
|
||||
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format='%(asctime)s [%(levelname)s] %(message)s',
|
||||
handlers=[
|
||||
logging.FileHandler(LOG_FILE, encoding='utf-8'),
|
||||
logging.StreamHandler(sys.stdout)
|
||||
]
|
||||
)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# --- CONFIG ---
|
||||
db_pass = os.environ.get('PG_PASS')
|
||||
if not db_pass:
|
||||
logger.critical("PG_PASS nije pronađen u /opt/.env.rinet")
|
||||
sys.exit(1)
|
||||
|
||||
DSN = f"host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password={db_pass}"
|
||||
UA = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
|
||||
SESSION = requests.Session()
|
||||
SESSION.headers.update({"User-Agent": UA})
|
||||
|
||||
BASE_URL = "https://semafor.hns.family"
|
||||
|
||||
# Dodana sva natjecanja koja si naveo
|
||||
NATJECANJA_URLS = [
|
||||
"https://semafor.hns.family/natjecanja/101025334/1-nl-ns-rijeka-juniori-2526/",
|
||||
"https://semafor.hns.family/natjecanja/100585203/treca-nl-zapad-2526/",
|
||||
"https://semafor.hns.family/natjecanja/101555188/1-znl-seniori-2526/",
|
||||
"https://semafor.hns.family/natjecanja/102503486/1-zupanijska-omladinska-liga-kadeti-skupina-a-2526/"
|
||||
]
|
||||
|
||||
def strip_tags(text):
    """Return ``text`` as plain text.

    HTML tags are replaced by spaces, character references such as ``&amp;``
    or ``&nbsp;`` are decoded, and runs of whitespace are collapsed to a
    single space with leading/trailing whitespace removed.
    """
    # Local import: ``html`` is used as a variable name elsewhere in this file.
    from html import unescape

    text = re.sub(r'<[^>]+>', ' ', text)
    # Decode entities only after tags are gone, so an escaped "&lt;b&gt;" is
    # kept as literal text instead of being stripped as a tag.
    text = unescape(text)
    return re.sub(r'\s+', ' ', text).strip()
|
||||
|
||||
# --- DATABASE ---
|
||||
def db_conn():
    """Connect to PostgreSQL with the module-level DSN and enable autocommit.

    Logs a critical message and exits the process with status 1 when a
    psycopg2 error occurs.
    """
    try:
        connection = psycopg2.connect(DSN)
        connection.autocommit = True
        return connection
    except psycopg2.Error as e:
        logger.critical(f"DB Connection failed: {e}")
        sys.exit(1)
|
||||
|
||||
# --- HTTP FETCH ---
|
||||
def fetch(url, retries=3):
    """Download ``url`` through the shared session and return the body text.

    A 404 response is logged and returns ``None`` without retrying. Other
    request failures are logged and retried, pausing ``2 * attempt`` seconds
    between attempts.

    Args:
        url: Absolute URL to fetch.
        retries: Maximum number of attempts.

    Returns:
        The response text, or ``None`` on 404 or when every attempt failed.
    """
    for attempt in range(1, retries + 1):
        try:
            r = SESSION.get(url, timeout=15)
            if r.status_code == 404:
                logger.warning(f"HTTP 404 Not Found: {url}")
                return None
            r.raise_for_status()
            return r.text
        except RequestException as e:
            logger.warning(f"HTTP GET failed ({attempt}/{retries}) for {url}: {e}")
            if attempt < retries:
                # Back off only when another attempt follows.
                time.sleep(2 * attempt)
    logger.error(f"Gave up fetching {url} after {retries} attempts.")
    return None
|
||||
|
||||
# --- SYNC PROCEDURES ---
|
||||
def extract_klubovi(html):
    """Collect the clubs linked from a competition page.

    Every ``<a>`` pointing at ``/klubovi/<id>/<slug>...`` is inspected. The
    club name is the anchor text with tags removed; when the anchor has no
    text (e.g. it wraps only an image) the name is derived from the slug.
    Names of 50 characters or more are treated as stray links and ignored.

    Args:
        html: Raw HTML of the competition page, or a falsy value.

    Returns:
        A list of ``(hns_id, naziv, url)`` tuples, one per club id. Empty when
        ``html`` is falsy.
    """
    if not html:
        return []
    klubovi = {}
    named_from_text = set()

    # The pattern captures everything inside the <a> element, including
    # nested images and spans.
    for m in re.finditer(r'<a[^>]*href="(/klubovi/(\d+)/([^/"]+)[^"]*)"[^>]*>(.*?)</a>', html, re.DOTALL | re.IGNORECASE):
        hns_id = m.group(2)
        slug = m.group(3)
        naziv = strip_tags(m.group(4))
        from_text = bool(naziv)
        if not from_text:
            naziv = slug.replace('-', ' ').title()

        # Drop links whose text is too long to be a club name.
        if len(naziv) >= 50:
            continue
        # A slug-derived fallback must not replace a name that was read from
        # real anchor text earlier on the page.
        if not from_text and hns_id in named_from_text:
            continue
        klubovi[hns_id] = (hns_id, naziv, BASE_URL + m.group(1))
        if from_text:
            named_from_text.add(hns_id)

    return list(klubovi.values())
|
||||
|
||||
def upsert_klubovi(conn, klubovi):
    """Upsert clubs into ``pgz_sport.klubovi`` and read them back.

    Existing rows are rewritten only when the name or source URL differs.
    Returns the ``(id, hns_id, source_url)`` rows for the given clubs, or an
    empty list when ``klubovi`` is empty or a psycopg2 error occurs (the
    error is logged).
    """
    if not klubovi:
        return []
    upsert_sql = """
        INSERT INTO pgz_sport.klubovi (hns_id, naziv, source_url)
        VALUES %s
        ON CONFLICT (hns_id) DO UPDATE SET
            naziv = EXCLUDED.naziv,
            source_url = EXCLUDED.source_url
        WHERE pgz_sport.klubovi.naziv IS DISTINCT FROM EXCLUDED.naziv
           OR pgz_sport.klubovi.source_url IS DISTINCT FROM EXCLUDED.source_url;
    """
    hns_ids = [klub[0] for klub in klubovi]
    try:
        with conn.cursor() as cur:
            execute_values(cur, upsert_sql, klubovi)
            cur.execute("SELECT id, hns_id, source_url FROM pgz_sport.klubovi WHERE hns_id = ANY(%s)", (hns_ids,))
            return cur.fetchall()
    except psycopg2.Error as e:
        logger.error(f"DB Greška pri UPSERT klubova: {e}")
        return []
|
||||
|
||||
def sync_roster(conn, klub_hns_id, klub_url):
    """Fetch a club's ``igraci/`` page and upsert its players into ``clanovi``.

    Anchor text is split at the first space into first name and surname; when
    the text is a single word the surname is derived from the URL slug.
    Anchors with no text or with more than 60 characters are skipped.

    Returns the list of upserted player tuples, or an empty list when the
    page cannot be fetched, no players are found, or the database reports an
    error (which is logged).
    """
    base = klub_url
    if not base.endswith('/'):
        base += '/'

    html = fetch(base + "igraci/")
    if not html:
        return []

    igraci = {}
    link_re = r'<a[^>]*href="(/igraci/(\d+)/([^/"]+)[^"]*)"[^>]*>(.*?)</a>'
    for link in re.finditer(link_re, html, re.DOTALL | re.IGNORECASE):
        path, hns_igrac_id, slug = link.group(1), link.group(2), link.group(3)
        ime_prezime = strip_tags(link.group(4))

        if not ime_prezime or len(ime_prezime) > 60:
            continue

        ime, space, ostatak = ime_prezime.partition(' ')
        # Single-word link text: fall back to the slug for the surname.
        prezime = ostatak if space else slug.replace('-', ' ').title()

        igraci[hns_igrac_id] = (hns_igrac_id, ime, prezime, klub_hns_id, BASE_URL + path, slug)

    igraci_list = list(igraci.values())
    if not igraci_list:
        logger.debug(f"Klub {klub_hns_id} nema igrača (ili greška u parsiranju).")
        return []

    upsert_sql = """
        INSERT INTO pgz_sport.clanovi (hns_igrac_id, ime, prezime, klub_hns_id, source_url, slug)
        VALUES %s
        ON CONFLICT (hns_igrac_id) DO UPDATE SET
            ime = EXCLUDED.ime,
            prezime = EXCLUDED.prezime,
            klub_hns_id = EXCLUDED.klub_hns_id,
            source_url = EXCLUDED.source_url,
            slug = EXCLUDED.slug
        WHERE pgz_sport.clanovi.ime IS DISTINCT FROM EXCLUDED.ime
           OR pgz_sport.clanovi.prezime IS DISTINCT FROM EXCLUDED.prezime
           OR pgz_sport.clanovi.klub_hns_id IS DISTINCT FROM EXCLUDED.klub_hns_id
           OR pgz_sport.clanovi.source_url IS DISTINCT FROM EXCLUDED.source_url;
    """
    try:
        with conn.cursor() as cur:
            execute_values(cur, upsert_sql, igraci_list)
        logger.info(f"Roster za klub {klub_hns_id}: uspješno sinkronizirano {len(igraci_list)} igrača.")
        return igraci_list
    except psycopg2.Error as e:
        logger.error(f"DB Greška pri UPSERT rostera za klub {klub_hns_id}: {e}")
        return []
|
||||
|
||||
def get_all_db_clubs(conn):
    """Return ``(id, hns_id, source_url)`` for every club that has a source URL.

    Args:
        conn: Open psycopg2 connection.

    Returns:
        The fetched rows, or an empty list when the query fails. The failure
        is logged so an empty sync run can be told apart from a database
        problem.
    """
    try:
        with conn.cursor() as cur:
            cur.execute("SELECT id, hns_id, source_url FROM pgz_sport.klubovi WHERE source_url IS NOT NULL")
            return cur.fetchall()
    except psycopg2.Error as e:
        # Catch database errors only; a bare ``except`` would also swallow
        # KeyboardInterrupt and hide programming errors.
        logger.error(f"DB error while reading clubs: {e}")
        return []
|
||||
|
||||
# --- MAIN ENGINE ---
|
||||
def main():
    """Run the full sync: discover clubs per competition, then sync rosters.

    Steps:
      1. Fetch every URL in ``NATJECANJA_URLS``, extract the linked clubs and
         upsert the de-duplicated set.
      2. Read every club with a source URL from the database and sync its
         roster; a failure for one club is logged and does not stop the run.

    The database connection is closed even when the run is interrupted.
    """
    logger.info("=== START: HNS PGŽ FULL SYNC ===")
    conn = db_conn()
    try:
        all_extracted_klubovi = []

        # 1. Discover clubs league by league.
        for url in NATJECANJA_URLS:
            logger.info(f"Preuzimanje klubova iz natjecanja: {url}")
            html = fetch(url)
            extracted = extract_klubovi(html)
            logger.info(f"Pronađeno {len(extracted)} klubova u natjecanju.")
            all_extracted_klubovi.extend(extracted)
            time.sleep(1)

        # The same club can appear in several competitions; keep one per id.
        unique_klubovi = list({k[0]: k for k in all_extracted_klubovi}.values())
        logger.info(f"Ukupno jedinstvenih klubova za UPSERT: {len(unique_klubovi)}")
        upsert_klubovi(conn, unique_klubovi)

        # 2. Sync the roster of every club stored in the database.
        db_klubovi = get_all_db_clubs(conn)
        logger.info(f"Pokrećem sync rostera za {len(db_klubovi)} klubova iz baze...")

        for _, klub_hns_id, klub_url in db_klubovi:
            try:
                sync_roster(conn, klub_hns_id, klub_url)
                time.sleep(0.5)
            except Exception as e:
                logger.critical(f"Kritična greška kod kluba {klub_hns_id}: {e}")
                continue

        logger.info("=== KRAJ: HNS PGŽ FULL SYNC ===")
    finally:
        conn.close()
|
||||
|
||||
if __name__ == "__main__":
|
||||
try:
|
||||
main()
|
||||
except KeyboardInterrupt:
|
||||
logger.info("Skripta prekinuta.")
|
||||
sys.exit(0)
|
||||
except Exception as e:
|
||||
logger.critical(f"Neočekivani pad skripte: {e}", exc_info=True)
|
||||
sys.exit(1)
|
||||
@@ -1,4 +1,7 @@
|
||||
#!/usr/bin/env python3
|
||||
from dotenv import load_dotenv
|
||||
load_dotenv('/opt/rinet-gpu/.env.master')
|
||||
# auto-added by patch_scrapers_with_dotenv.sh
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
hns_player_deep.py — SUB3 deep HNS player scraper
|
||||
@@ -29,7 +32,7 @@ import psycopg2
|
||||
from psycopg2.extras import RealDictCursor, execute_values
|
||||
|
||||
DSN = os.getenv("RINET_DSN",
|
||||
"host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password=R1net2026!SecureDB#v7")
|
||||
f"host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password={os.environ['DB_PASSWORD']}")
|
||||
TG_TOKEN = os.getenv("TG_BOT_TOKEN", "8535797835:AAFItT-92jzZ9NWFafLxn0dLa1_n2s-JE5Y")
|
||||
TG_CHAT = os.getenv("TG_CHAT", "7969491558")
|
||||
SLEEP = float(os.getenv("SLEEP", "0.8"))
|
||||
|
||||
@@ -0,0 +1,534 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
hns_player_deep.py — SUB3 deep HNS player scraper
|
||||
─────────────────────────────────────────────────
|
||||
Author: dradulic@outlook.com / damir@rinet.one
|
||||
Date: 2026-05-05
|
||||
Version: 1.0
|
||||
|
||||
Scrapes semafor.hns.family/igraci/{id}/{slug}/ for every clanovi.hns_igrac_id row,
|
||||
extracting:
|
||||
• profil meta (datum_rodenja, mjesto_rodenja, broj_dresa, current klub)
|
||||
• per-season stats per natjecanje (UPSERT pgz_sport.hns_player_seasons)
|
||||
• last 30+ matches (UPSERT pgz_sport.hns_player_matches)
|
||||
|
||||
Server-rendered HTML — no Playwright needed → uses requests for 5–10× speedup.
|
||||
Fallback to Playwright if --use-playwright is passed.
|
||||
|
||||
Resume-able: skips clanovi where last_scraped_at > now() - interval N days.
|
||||
|
||||
Usage:
|
||||
python3 hns_player_deep.py [--limit 200] [--days 7] [--player HNS_ID] [--use-playwright]
|
||||
"""
|
||||
import os, sys, re, time, json, argparse, traceback
|
||||
from datetime import datetime, date
|
||||
from urllib.parse import urljoin
|
||||
import requests
|
||||
import psycopg2
|
||||
from psycopg2.extras import RealDictCursor, execute_values
|
||||
|
||||
# An explicit RINET_DSN wins. DB_PASSWORD is looked up only for the fallback
# DSN, so a missing DB_PASSWORD no longer raises KeyError when RINET_DSN is set.
DSN = os.getenv("RINET_DSN") or (
    f"host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password={os.environ['DB_PASSWORD']}"
)
# The Telegram bot token is a credential and must come from the environment.
# Without TG_BOT_TOKEN the Telegram branch of log() is skipped.
# NOTE(review): the token previously hard-coded here is in version control and
# should be treated as leaked and rotated.
TG_TOKEN = os.getenv("TG_BOT_TOKEN", "")
TG_CHAT = os.getenv("TG_CHAT", "7969491558")
# Value of the SLEEP environment variable as a float (default 0.8).
SLEEP = float(os.getenv("SLEEP", "0.8"))
|
||||
UA = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
|
||||
|
||||
LOG_DIR = "/var/log/pgz-sport-debug"
|
||||
os.makedirs(LOG_DIR, exist_ok=True)
|
||||
LOG_FILE = f"{LOG_DIR}/sub3_{datetime.now().strftime('%Y%m%d_%H%M')}.log"
|
||||
LOG_FH = open(LOG_FILE, "a", encoding="utf-8")
|
||||
|
||||
def log(msg: str, telegram: bool = False) -> None:
    """Write a timestamped line to stdout and the run log file.

    When ``telegram`` is true and both a bot token and chat id are configured,
    the message (truncated to 4000 characters) is also posted to Telegram;
    any failure of that request is ignored.
    """
    stamp = datetime.now().isoformat(timespec='seconds')
    line = f"[{stamp}] {msg}"
    print(line, flush=True)
    LOG_FH.write(line + "\n")
    LOG_FH.flush()

    if not (telegram and TG_TOKEN and TG_CHAT):
        return
    payload = {"chat_id": TG_CHAT, "text": msg[:4000]}
    try:
        requests.post(
            f"https://api.telegram.org/bot{TG_TOKEN}/sendMessage",
            data=payload,
            timeout=8,
        )
    except Exception:
        # Notification is best-effort; logging must never fail because of it.
        pass
|
||||
|
||||
# ── HTTP session ──────────────────────────────────────────────────────────
|
||||
SESSION = requests.Session()
|
||||
SESSION.headers.update({"User-Agent": UA, "Accept-Language": "hr,en;q=0.7"})
|
||||
|
||||
def fetch_html(url: str, timeout: int = 20) -> str | None:
    """GET ``url`` via the shared session and return the body on HTTP 200.

    Any other status code, or any exception raised by the request, is logged
    and yields ``None``.
    """
    try:
        response = SESSION.get(url, timeout=timeout)
        if response.status_code == 200:
            return response.text
        log(f" HTTP {response.status_code} {url}")
        return None
    except Exception as e:
        log(f" fetch fail {url}: {e}")
        return None
|
||||
|
||||
# ── Parsers ───────────────────────────────────────────────────────────────
|
||||
def _strip_html(s: str) -> str:
    """Replace HTML tags with spaces, collapse whitespace runs and trim the ends."""
    without_tags = re.sub(r"<[^>]+>", " ", s)
    collapsed = re.sub(r"\s+", " ", without_tags)
    return collapsed.strip()
|
||||
|
||||
def parse_profile(html: str) -> dict:
    """Extract profile metadata from a player page.

    Returns a dict with the keys ``broj_dresa`` (int), ``datum_rodenja``
    (``date``), ``mjesto_rodenja`` (str), ``klub_hns_id`` (str) and
    ``klub_naziv`` (str). A value that cannot be found stays ``None``.
    """
    out = dict.fromkeys(
        ("broj_dresa", "datum_rodenja", "mjesto_rodenja", "klub_hns_id", "klub_naziv")
    )

    # Work on the playerHeader block (up to the first HTML comment) when it is
    # present; otherwise search the whole page.
    header = re.search(r'<div class="block playerHeader"[^>]*>(.*?)<!--', html, re.DOTALL)
    header_html = header.group(1) if header else html
    plain = _strip_html(header_html)

    # Jersey number: dedicated span first, else a leading 1-2 digit number
    # followed by a capital letter in the flattened header text.
    jersey = re.search(r'<span class="number"[^>]*>(\d+)</span>', header_html)
    if jersey is None:
        jersey = re.match(r'^\s*(\d{1,2})\s+[A-ZČĆŠŽĐ]', plain)
    if jersey:
        out["broj_dresa"] = int(jersey.group(1))

    # Current club: the first /klubovi/ link in the header.
    club = re.search(r'<a[^>]+href="/klubovi/(\d+)/([\w-]+)/?"[^>]*>([^<]+)<', header_html)
    if club:
        out["klub_hns_id"] = club.group(1)
        out["klub_naziv"] = club.group(3).strip()

    # Birth date (dd.mm.yyyy.): three patterns tried from strictest to loosest.
    born = (
        re.search(r'(\d{1,2})\.(\d{1,2})\.(\d{4})\.\s*(?:</[^>]+>\s*)?(?:<[^>]+>\s*)*\(?\s*\d+\s*godin', header_html)
        or re.search(r'<div[^>]*class="[^"]*birth[^"]*"[^>]*>(\d{1,2})\.(\d{1,2})\.(\d{4})', header_html)
        or re.search(r'(\d{1,2})\.(\d{1,2})\.(\d{4})\.\s*\(?\s*\d+\s*godin[ae]?\)?\s*Datum rođenja', plain)
    )
    if born:
        day, month, year = (int(part) for part in born.groups())
        try:
            out["datum_rodenja"] = date(year, month, day)
        except Exception:
            # Out-of-range day or month: leave the birth date unset.
            pass

    # Birth place: the text immediately before the "Mjesto rođenja" label.
    place = re.search(r'([A-ZČĆŠŽĐ][\w\sčćšžđČĆŠŽĐ\-]{1,80}?)\s+Mjesto rođenja', plain)
    if place:
        out["mjesto_rodenja"] = place.group(1).strip()

    return out
|
||||
|
||||
# Each season block: "{YYYY/YY} Statistika Utakmice ... <playerCompetitionStatsTable> ... <matchlist>"
|
||||
# We split player_profile_matches by the recurring pattern.
|
||||
SEASON_HEADER_RE = re.compile(
|
||||
r'(?:<[^>]+>\s*)?(20\d{2}/\d{2})(?:\s*<[^>]+>)?\s*Statistika\s+Utakmice',
|
||||
re.IGNORECASE,
|
||||
)
|
||||
|
||||
def _section_season_rows(sezona: str, sec: str) -> list[dict]:
    """Read the per-competition statistics table of one season section."""
    table = re.search(
        r'<div class="block w1280 playerCompetitionStatsTable"[^>]*>(.*?)</div>\s*</div>\s*</div>',
        sec, re.DOTALL,
    )
    if not table:
        return []
    found: list[dict] = []
    # The table is built from divs, so it is read as flattened text in which
    # each row looks like "<label> <int> <int> <int> <int> <int> <int>".
    flat = _strip_html(table.group(1))
    for hit in re.finditer(
        r'([A-ZČĆŠŽĐa-zčćšžđ0-9][^|]*?)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)(?=\s|$)',
        flat,
    ):
        label = hit.group(1).strip()
        if label.lower().startswith("ukupno"):
            continue  # totals row; only per-competition rows are kept
        if any(word in label for word in ("Nastupi", "Započeo", "Statistika")):
            continue  # column-header text
        try:
            counts = [int(hit.group(i)) for i in range(2, 8)]
        except Exception:
            continue
        found.append({
            "sezona": sezona,
            "natjecanje": label[:200],
            "nastupi": counts[0],
            "startna": counts[1],
            "zamjena": counts[2],
            "golovi": counts[3],
            "zuti": counts[4],
            "crveni": counts[5],
        })
    return found


def _section_match_rows(sec: str) -> list[dict]:
    """Read the match list (``matchlist style2``) of one season section."""
    listing = re.search(
        r'<div class="matchlist style2 semafor player[^"]*"[^>]*>(.*?)</ul>',
        sec, re.DOTALL,
    )
    if not listing:
        return []
    found: list[dict] = []
    for item in re.finditer(
        r'<li class="row[^"]*"[^>]*data-match="(\d+)"[^>]*>(.*?)</li>',
        listing.group(1), re.DOTALL,
    ):
        cell = item.group(2)
        when = re.search(r'<div class="date">([^<]+)</div>', cell)
        home = re.search(r'<div class="club1"[^>]*>\s*<a[^>]*>([^<]+?)<', cell)
        away = re.search(r'<div class="club2"[^>]*>\s*<a[^>]*>([^<]+?)<', cell)
        home_score = re.search(r'<div class="res1">(\d+)</div>', cell)
        away_score = re.search(r'<div class="res2">(\d+)</div>', cell)
        competition = re.search(r'<div class="competitionround">([^<]+)</div>', cell)
        goals = re.search(r'<div class="goals">(\d+)</div>', cell)
        # Cards are rendered as "yellow / red".
        cards = re.search(r'<div class="cards">.*?(\d+)\s*/\s*(\d+).*?</div>', cell, re.DOTALL)
        minutes = re.search(r'<div class="minutes">(\d+)</div>', cell)

        # The date cell reads "dd.mm.yyyy. HH:MM"; only the date part is kept.
        datum = None
        if when:
            parts = re.search(r'(\d{1,2})\.(\d{1,2})\.(\d{4})', when.group(1))
            if parts:
                try:
                    datum = date(int(parts.group(3)), int(parts.group(2)), int(parts.group(1)))
                except Exception:
                    pass

        if home_score and away_score:
            rezultat = f"{home_score.group(1)}:{away_score.group(1)}"
        else:
            rezultat = None

        found.append({
            "datum": datum,
            "domacin": (home.group(1).strip() if home else "")[:120],
            "gost": (away.group(1).strip() if away else "")[:120],
            "rezultat": rezultat,
            "natjecanje": (competition.group(1).strip() if competition else "")[:200],
            "golovi": int(goals.group(1)) if goals else 0,
            "zuti": int(cards.group(1)) if cards else 0,
            "crveni": int(cards.group(2)) if cards else 0,
            "minute_do": int(minutes.group(1)) if minutes else None,
        })
    return found


def parse_seasons_and_matches(html: str) -> tuple[list[dict], list[dict]]:
    """Return ``(season_rows, match_rows)`` for every season on a profile page.

    Parsing is limited to the ``player_profile_matches`` block. The block is
    cut into sections at each season ``<h2>`` heading; each section yields its
    per-competition statistics rows and its match rows. When no heading is
    found the block is flattened to text and handed to ``_parse_plain``. A
    page without the block yields two empty lists.
    """
    container = re.search(
        r'<div class="block w1280 matchlist style1 player_profile_matches"[^>]*>(.*?)(?=<!--|<footer)',
        html, re.DOTALL,
    )
    if not container:
        return [], []
    block = container.group(1)

    # Prefer <h2 class="seasonTitle ...">YYYY/YY</h2>; fall back to any <h2>
    # that holds a season label.
    headers = list(re.finditer(
        r'<h2\s+class="seasonTitle[^"]*"[^>]*>\s*(20\d{2}/\d{2})\s*</h2>',
        block,
    ))
    if not headers:
        headers = list(re.finditer(r'<h2[^>]*>\s*(20\d{2}/\d{2})\s*</h2>', block))
    if not headers:
        plain = re.sub(r'\s+', ' ', re.sub(r'<[^>]+>', ' ', block))
        return _parse_plain(plain)

    # Each section runs from its heading to the next heading (or block end).
    starts = [h.start() for h in headers]
    ends = starts[1:] + [len(block)]

    season_rows: list[dict] = []
    match_rows: list[dict] = []
    for heading, begin, finish in zip(headers, starts, ends):
        sec = block[begin:finish]
        season_rows.extend(_section_season_rows(heading.group(1), sec))
        match_rows.extend(_section_match_rows(sec))

    return season_rows, match_rows
|
||||
|
||||
|
||||
def _parse_plain(plain_text: str) -> tuple[list[dict], list[dict]]:
    """Fallback parser for text that has already been stripped of HTML.

    The text is split at each "<YYYY/YY> Statistika Utakmice" heading and the
    first 3000 characters after a heading are scanned for rows made of a
    label followed by six integers. Match rows cannot be recovered from plain
    text, so the second element of the result is always an empty list.
    """
    row_re = r'([A-ZČĆŠŽĐa-zčćšžđ0-9][^|]*?)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)(?=\s|$)'
    stat_keys = ("nastupi", "startna", "zamjena", "golovi", "zuti", "crveni")

    # With one capture group the split yields [prefix, season1, body1, ...].
    pieces = re.split(r'(20\d{2}/\d{2})\s+Statistika\s+Utakmice', plain_text)
    season_rows: list[dict] = []
    for sezona, body in zip(pieces[1::2], pieces[2::2]):
        for hit in re.finditer(row_re, body[:3000]):
            label = hit.group(1).strip()
            if label.lower().startswith("ukupno"):
                continue  # totals row
            if "Nastupi" in label or "Statistika" in label:
                continue  # column-header text
            entry = {"sezona": sezona, "natjecanje": label[:200]}
            for key, raw in zip(stat_keys, hit.groups()[1:]):
                entry[key] = int(raw)
            season_rows.append(entry)
    return season_rows, []
|
||||
|
||||
# ── DB ────────────────────────────────────────────────────────────────────
|
||||
def db_conn():
    """Open a psycopg2 connection from the module-level DSN in autocommit mode."""
    connection = psycopg2.connect(DSN)
    connection.autocommit = True
    return connection
|
||||
|
||||
def get_targets(conn, limit: int, days: int, force_player: str | None = None) -> list[dict]:
    """Select the ``clanovi`` rows that should be scraped.

    With ``force_player`` set, at most the one row with that ``hns_igrac_id``
    is returned. Otherwise up to ``limit`` rows with an ``hns_igrac_id`` are
    returned whose ``last_scraped_at`` is NULL or older than ``days`` days,
    never-scraped rows first and then by ascending id.
    """
    if force_player:
        sql = """
            SELECT id, hns_igrac_id, ime, prezime, source_url, slug
            FROM pgz_sport.clanovi
            WHERE hns_igrac_id = %s
            LIMIT 1
        """
        params = (force_player,)
    else:
        sql = """
            SELECT id, hns_igrac_id, ime, prezime, source_url, slug
            FROM pgz_sport.clanovi
            WHERE hns_igrac_id IS NOT NULL
              AND (last_scraped_at IS NULL OR last_scraped_at < now() - %s::interval)
            ORDER BY (last_scraped_at IS NULL) DESC, id ASC
            LIMIT %s
        """
        params = (f"{days} days", limit)

    with conn.cursor(cursor_factory=RealDictCursor) as cur:
        cur.execute(sql, params)
        return cur.fetchall()
|
||||
|
||||
def update_clan(conn, clan_id: int, profile: dict, url: str) -> None:
    """Write scraped profile data onto one ``pgz_sport.clanovi`` row.

    Birth date, birth place and jersey number only fill columns that are
    still empty (COALESCE keeps an existing value). Both spellings of the
    birth columns (``rodenja`` / ``rodjenja``) are written. ``source_url`` is
    always overwritten and the scrape timestamps are set to ``now()``.
    """
    assignments: list[str] = []
    params: list = []

    def assign(fragment: str, value) -> None:
        assignments.append(fragment)
        params.append(value)

    birth_date = profile.get("datum_rodenja")
    if birth_date:
        assign("datum_rodenja = COALESCE(datum_rodenja, %s)", birth_date)
        assign("datum_rodjenja = COALESCE(datum_rodjenja, %s)", birth_date)

    birth_place = profile.get("mjesto_rodenja")
    if birth_place:
        assign("mjesto_rodenja = COALESCE(NULLIF(mjesto_rodenja,''), %s)", birth_place)
        assign("mjesto_rodjenja = COALESCE(NULLIF(mjesto_rodjenja,''), %s)", birth_place)

    jersey = profile.get("broj_dresa")
    if jersey is not None:
        assign("broj_dresa = COALESCE(broj_dresa, %s)", jersey)

    assign("source_url = %s", url)
    assignments += [
        "source = COALESCE(NULLIF(source,''), 'hns_semafor')",
        "sport = COALESCE(NULLIF(sport,''), 'nogomet')",
        "last_scraped_at = now()",
        "source_synced_at = now()",
    ]
    params.append(clan_id)

    sql = f"UPDATE pgz_sport.clanovi SET {', '.join(assignments)} WHERE id = %s"
    with conn.cursor() as cur:
        cur.execute(sql, tuple(params))
|
||||
|
||||
def upsert_seasons(conn, hns_id: str, clan_id: int, url: str, rows: list[dict]) -> int:
    """Upsert per-competition season statistics for one player.

    Rows are de-duplicated on ``(hns_igrac_id, sezona, klub_hns_id,
    natjecanje)`` before the insert, the last occurrence winning. The club
    columns are always written as NULL and assists and minutes as 0.

    Returns the number of input rows (counted before de-duplication), or 0
    when ``rows`` is empty.
    """
    if not rows:
        return 0

    # NOTE(review): klub_hns_id is always NULL here. With a plain UNIQUE
    # constraint PostgreSQL treats NULLs as distinct, so this ON CONFLICT
    # target may never fire and each run could insert duplicates — confirm
    # against the table definition.
    unique: dict[tuple, tuple] = {}
    for r in rows:
        natjecanje = r["natjecanje"][:200]
        unique[(hns_id, r["sezona"], None, natjecanje)] = (
            hns_id, clan_id, r["sezona"], None, None, natjecanje,
            r.get("nastupi", 0), r.get("startna", 0), r.get("zamjena", 0),
            r.get("golovi", 0), 0, r.get("zuti", 0), r.get("crveni", 0), 0, url,
        )
    data = list(unique.values())

    sql = """
        INSERT INTO pgz_sport.hns_player_seasons
            (hns_igrac_id, clan_id, sezona, klub_hns_id, klub_naziv, natjecanje,
             nastupi, startna, zamjena, golovi, asistencije, zuti, crveni, minute, source_url)
        VALUES %s
        ON CONFLICT (hns_igrac_id, sezona, klub_hns_id, natjecanje) DO UPDATE SET
            nastupi = EXCLUDED.nastupi,
            startna = EXCLUDED.startna,
            zamjena = EXCLUDED.zamjena,
            golovi = EXCLUDED.golovi,
            zuti = EXCLUDED.zuti,
            crveni = EXCLUDED.crveni,
            source_url = EXCLUDED.source_url,
            scraped_at = now()
    """
    with conn.cursor() as cur:
        execute_values(cur, sql, data)
    return len(rows)
|
||||
|
||||
def upsert_matches(conn, hns_id: str, clan_id: int, url: str, rows: list[dict]) -> int:
    """Upsert one player's match records into pgz_sport.hns_player_matches.

    Rows lacking a truthy ``datum``, ``domacin`` or ``gost`` are ignored.
    Rows sharing (hns_igrac_id, datum, domacin, gost) are collapsed and the
    last occurrence wins.

    Returns:
        Number of distinct rows sent to the database; 0 when nothing usable
        was supplied (the database is not touched in that case).
    """
    if not rows:
        return 0

    unique: dict[tuple, tuple] = {}
    for rec in rows:
        if not (rec["datum"] and rec["domacin"] and rec["gost"]):
            continue
        # pozicija, startna and minute_od are written as NULL, asistencije as 0.
        record = (
            hns_id, clan_id, rec["datum"], rec["natjecanje"], rec["domacin"], rec["gost"],
            rec["rezultat"], None, None, None, rec.get("minute_do"),
            rec.get("golovi", 0), 0, rec.get("zuti", 0), rec.get("crveni", 0), url,
        )
        unique[(record[0], record[2], record[4], record[5])] = record

    data = list(unique.values())
    if not data:
        return 0

    with conn.cursor() as cur:
        execute_values(cur, """
            INSERT INTO pgz_sport.hns_player_matches
                (hns_igrac_id, clan_id, datum, natjecanje, domacin, gost,
                 rezultat, pozicija, startna, minute_od, minute_do,
                 golovi, asistencije, zuti, crveni, source_url)
            VALUES %s
            ON CONFLICT (hns_igrac_id, datum, domacin, gost) DO UPDATE SET
                rezultat = EXCLUDED.rezultat,
                natjecanje = EXCLUDED.natjecanje,
                minute_do = EXCLUDED.minute_do,
                golovi = EXCLUDED.golovi,
                zuti = EXCLUDED.zuti,
                crveni = EXCLUDED.crveni,
                source_url = EXCLUDED.source_url,
                scraped_at = now()
        """, data)
    return len(data)
|
||||
|
||||
# ── Slug helper ───────────────────────────────────────────────────────────
|
||||
def slugify(text: str) -> str:
    """Return a lowercase, hyphen-separated ASCII slug for *text*.

    Croatian diacritics are transliterated (č, ć -> c, ž -> z, š -> s,
    đ -> d); any other character outside ``a-z``, ``0-9``, whitespace and
    ``-`` is dropped. Runs of whitespace become a single ``-`` and leading or
    trailing hyphens are stripped. Empty or falsy input yields ``""``.
    """
    if not text:
        return ""
    # The input is lowercased first, so only lowercase letters need mapping.
    # Source and target must stay aligned character by character.
    repl = str.maketrans("čćžšđ", "cczsd")
    t = text.lower().translate(repl)
    t = re.sub(r"[^a-z0-9\s-]", "", t)
    return re.sub(r"\s+", "-", t).strip("-")
|
||||
|
||||
def build_url(t: dict) -> str:
    """Return the semafor.hns.family player page URL for target row *t*.

    A stored ``source_url`` that already points at a player page is returned
    unchanged. Otherwise the URL is assembled from ``hns_igrac_id`` and a
    slug: the stored ``slug`` if present, else one derived from
    ``ime``/``prezime``, else the placeholder ``"x"``.
    """
    existing = t.get("source_url")
    if existing and "semafor.hns.family/igraci/" in existing:
        return existing

    slug = t.get("slug")
    if not slug:
        slug = slugify(f"{t['ime']} {t['prezime']}")
    if not slug:
        slug = "x"
    return f"https://semafor.hns.family/igraci/{t['hns_igrac_id']}/{slug}/"
|
||||
|
||||
# ── Driver ────────────────────────────────────────────────────────────────
|
||||
def process_one(conn, t: dict) -> dict:
    """Scrape one player page and persist profile, seasons and matches.

    Args:
        conn: Open database connection.
        t: Target row; reads ``id``, ``hns_igrac_id``, ``ime``, ``prezime``
            plus whatever ``build_url`` needs.

    Returns:
        Dict with ``profile`` (whether a player page was parsed), ``seasons``
        and ``matches`` (row counts from the upsert helpers) and ``fields``
        (how many of datum_rodenja / mjesto_rodenja / broj_dresa were parsed
        with a truthy value).
    """
    url = build_url(t)
    html = fetch_html(url)
    page_ok = bool(html) and "playerHeader" in html

    if page_ok:
        profile = parse_profile(html)
        seasons, matches = parse_seasons_and_matches(html)

        # Profile first, then the dependent season/match tables.
        update_clan(conn, t["id"], profile, url)
        tracked = ("datum_rodenja", "mjesto_rodenja", "broj_dresa")
        n_fields = len([key for key in tracked if profile.get(key)])

        season_count = upsert_seasons(conn, t["hns_igrac_id"], t["id"], url, seasons)
        match_count = upsert_matches(conn, t["hns_igrac_id"], t["id"], url, matches)
        return {"profile": True, "seasons": season_count,
                "matches": match_count, "fields": n_fields}

    log(f" ✗ no playerHeader for {t['ime']} {t['prezime']} ({t['hns_igrac_id']}) → {url}")
    # Stamp the row as scraped anyway so a broken URL is not retried in a hot loop.
    with conn.cursor() as cur:
        cur.execute(
            "UPDATE pgz_sport.clanovi SET last_scraped_at = now() WHERE id = %s",
            (t["id"],),
        )
    return {"profile": False, "seasons": 0, "matches": 0, "fields": 0}
|
||||
|
||||
def main() -> int:
    """Run the SUB3 deep scraper over a batch of players.

    Command-line options:
        --limit            maximum number of targets (default 200)
        --days             forwarded to ``get_targets`` (default 7)
        --player           single HNS id, forwarded to ``get_targets``
        --missing-matches  only target clanovi without hns_player_matches rows
        --no-telegram      do not forward start/summary messages to Telegram

    Each target is processed independently; a failure is logged and counted
    without aborting the batch. A run summary is appended to
    /opt/pgz-sport/cc_tasks/SUB3_RESULT.md.

    Returns:
        Process exit code (always 0).
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--limit", type=int, default=200)
    ap.add_argument("--days", type=int, default=7)
    ap.add_argument("--player", help="Single HNS ID (debug)")
    ap.add_argument("--missing-matches", action="store_true",
                    help="Only target clanovi without rows in hns_player_matches")
    ap.add_argument("--no-telegram", action="store_true")
    args = ap.parse_args()

    log(f"SUB3 deep scraper start | limit={args.limit} | days={args.days} | "
        f"missing_matches={args.missing_matches} | log={LOG_FILE}",
        telegram=not args.no_telegram)

    stats = {"scraped": 0, "seasons": 0, "matches": 0, "fields": 0, "errors": 0}
    conn = db_conn()
    try:
        if args.missing_matches:
            with conn.cursor(cursor_factory=RealDictCursor) as cur:
                cur.execute("""
                    SELECT id, hns_igrac_id, ime, prezime, source_url, slug
                    FROM pgz_sport.clanovi
                    WHERE hns_igrac_id IS NOT NULL
                    AND id NOT IN (
                        SELECT clan_id FROM pgz_sport.hns_player_matches WHERE clan_id IS NOT NULL
                    )
                    ORDER BY id ASC
                    LIMIT %s
                """, (args.limit,))
                targets = cur.fetchall()
        else:
            targets = get_targets(conn, args.limit, args.days, args.player)
        log(f"Targets: {len(targets)}")

        t0 = time.time()
        for i, t in enumerate(targets, 1):
            try:
                r = process_one(conn, t)
                stats["scraped"] += 1
                stats["seasons"] += r["seasons"]
                stats["matches"] += r["matches"]
                stats["fields"] += r["fields"]
                # Log every 10th player, and every player that yielded matches.
                if i % 10 == 0 or r["matches"] > 0:
                    log(f" [{i}/{len(targets)}] {t['ime']} {t['prezime']} "
                        f"→ seasons +{r['seasons']} matches +{r['matches']} fields +{r['fields']} "
                        f"(totals: s={stats['seasons']} m={stats['matches']})")
            except Exception as e:
                # One bad player must not abort the whole batch.
                stats["errors"] += 1
                log(f" ✗ ERROR {t['ime']} {t['prezime']} ({t['hns_igrac_id']}): {e}")
                log(traceback.format_exc()[:500])
            # Pause between players regardless of outcome.
            time.sleep(SLEEP)
        dur = time.time() - t0
    finally:
        # Release the connection even when target selection or the loop fails.
        conn.close()

    summary = (
        f"SUB3 done in {dur:.0f}s | scraped={stats['scraped']} "
        f"seasons +{stats['seasons']} matches +{stats['matches']} "
        f"fields +{stats['fields']} errors={stats['errors']}"
    )
    log(summary, telegram=not args.no_telegram)

    # Result file
    res_path = "/opt/pgz-sport/cc_tasks/SUB3_RESULT.md"
    with open(res_path, "a", encoding="utf-8") as f:
        f.write(f"\n## Run {datetime.now().isoformat(timespec='seconds')}\n")
        f.write(f"- batch_limit: {args.limit}\n")
        f.write(f"- targets: {len(targets)}\n")
        f.write(f"- scraped: {stats['scraped']}\n")
        f.write(f"- seasons +{stats['seasons']}\n")
        f.write(f"- matches +{stats['matches']}\n")
        f.write(f"- profile fields enriched: +{stats['fields']}\n")
        f.write(f"- errors: {stats['errors']}\n")
        f.write(f"- duration: {dur:.0f}s\n")
        f.write(f"- log: {LOG_FILE}\n")

    return 0
|
||||
|
||||
if __name__ == "__main__":
|
||||
sys.exit(main())
|
||||
@@ -0,0 +1,36 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Dohvaća sezone i utakmice za HNS igrača preko Playwrighta."""
|
||||
import json, sys, time
|
||||
from playwright.sync_api import sync_playwright
|
||||
|
||||
# A player id is mandatory; print the usage hint and exit otherwise.
if len(sys.argv) < 2:
    print("Koristi: hns_player_stats.py <hns_igrac_id>")
    sys.exit(1)

hns_id = sys.argv[1]
url = f"https://semafor.hns.family/igraci/{hns_id}/"

with sync_playwright() as p:
    browser = p.chromium.launch(headless=True)
    page = browser.new_page()
    page.goto(url, wait_until='networkidle', timeout=30000)
    # Read the JSON payload embedded in the #__NEXT_DATA__ element.
    next_data = page.inner_text('#__NEXT_DATA__')
    data = json.loads(next_data)
    browser.close()

# Extract seasons and matches from the page props.
props = data['props']['pageProps']
player = props.get('player', {})
seasons = player.get('seasons', [])
matches = props.get('matches', [])

full_name = f"{player.get('name', '')} {player.get('surname', '')}"
print(f"Igrač: {full_name}")
print(f"Sezona: {len(seasons)}")
for s in seasons:
    label = f" {s.get('season','?')} {s.get('competition','')} {s.get('clubName','')} "
    totals = f"N:{s.get('apps',0)} G:{s.get('goals',0)} A:{s.get('assists',0)}"
    print(label + totals)

print(f"\nUtakmica: {len(matches)}")
# Only the first five matches are shown.
for m in matches[:5]:
    print(f" {m.get('date','')} {m.get('homeTeam','')} vs {m.get('awayTeam','')} {m.get('result','')}")
|
||||
+50
-16
@@ -1,24 +1,58 @@
|
||||
#!/bin/bash
|
||||
# Wrapper za HNS harvester koji uvijek koristi sistemski python3 (ima psycopg2)
|
||||
# Damir-friendly - ignorira venv aktivaciju
|
||||
# Wrapper za HNS harvester - koristi sistemski python3 (psycopg2)
|
||||
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
|
||||
PYTHON="/usr/bin/python3"
|
||||
|
||||
case "$1" in
|
||||
master) /usr/bin/python3 "$SCRIPT_DIR/hns_master_harvester.py" "${@:2}" ;;
|
||||
deep) /usr/bin/python3 "$SCRIPT_DIR/hns_player_deep.py" "${@:2}" ;;
|
||||
avatar) /usr/bin/python3 "$SCRIPT_DIR/hns_avatar_harvester.py" "${@:2}" ;;
|
||||
season) /usr/bin/python3 "$SCRIPT_DIR/hns_season_retry.py" "${@:2}" ;;
|
||||
watchdog) /usr/bin/python3 "$SCRIPT_DIR/hns_watchdog.py" "${@:2}" ;;
|
||||
objekti) /usr/bin/python3 "$SCRIPT_DIR/objekti_enrich_address.py" "${@:2}" ;;
|
||||
"") echo "Usage: $0 {master|deep|avatar|season|watchdog|objekti} [args]"
|
||||
master) $PYTHON "$SCRIPT_DIR/hns_master_harvester.py" "${@:2}" ;;
|
||||
deep) $PYTHON "$SCRIPT_DIR/hns_player_deep.py" "${@:2}" ;;
|
||||
|
||||
# Pojedinačni igrač: sve u jednom potezu
|
||||
player)
|
||||
if [ -z "$2" ]; then
|
||||
echo "Greška: potreban ID igrača. Primjer: $0 player 86290"
|
||||
exit 1
|
||||
fi
|
||||
echo ">>> Osnovni podaci za igrača $2"
|
||||
$PYTHON "$SCRIPT_DIR/hns_master_harvester.py" --single-player "$2"
|
||||
echo ">>> Sezone i utakmice za igrača $2"
|
||||
$PYTHON "$SCRIPT_DIR/hns_player_deep.py" --player "$2"
|
||||
;;
|
||||
|
||||
# Svi klubovi (bez igrača? master svakako povlači i igrače iz tih klubova)
|
||||
all-clubs)
|
||||
echo "Dohvat svih klubova (limit 10000)..."
|
||||
$PYTHON "$SCRIPT_DIR/hns_master_harvester.py" --limit 10000
|
||||
;;
|
||||
|
||||
# Svi igrači sa svim detaljima (klubovi + sezone + utakmice)
|
||||
all-players)
|
||||
echo ">>> 1/2 Dohvat svih klubova i osnovnih podataka igrača"
|
||||
$PYTHON "$SCRIPT_DIR/hns_master_harvester.py" --limit 10000
|
||||
echo ">>> 2/2 Dohvat sezona i utakmica za sve igrače"
|
||||
$PYTHON "$SCRIPT_DIR/hns_player_deep.py" --limit 50000
|
||||
;;
|
||||
|
||||
# Kompletno: svi klubovi + svi igrači (all-in-one)
|
||||
all)
|
||||
echo "===== FULL HARVEST ====="
|
||||
$0 all-clubs
|
||||
$0 all-players
|
||||
;;
|
||||
|
||||
avatar) $PYTHON "$SCRIPT_DIR/hns_avatar_harvester.py" "${@:2}" ;;
|
||||
season) $PYTHON "$SCRIPT_DIR/hns_season_retry.py" "${@:2}" ;;
|
||||
watchdog) $PYTHON "$SCRIPT_DIR/hns_watchdog.py" "${@:2}" ;;
|
||||
objekti) $PYTHON "$SCRIPT_DIR/objekti_enrich_address.py" "${@:2}" ;;
|
||||
|
||||
"") echo "Usage: $0 {master|deep|player|all-clubs|all-players|all|avatar|season|watchdog|objekti} [args]"
|
||||
echo
|
||||
echo "Primjeri:"
|
||||
echo " $0 master --limit 100 # Sve PGŽ klubove (~59)"
|
||||
echo " $0 master --klub-id 2613 # Jedan klub"
|
||||
echo " $0 master --single-player 436387 # Jedan igrač"
|
||||
echo " $0 deep # DEEP scrape svih (sezone+utakmice)"
|
||||
echo " $0 avatar # Avatar slike"
|
||||
echo " $0 watchdog # Auto-recovery"
|
||||
echo " $0 player 86290 # Jedan igrač (osnovno+sezone+utakmice)"
|
||||
echo " $0 all-clubs # Svi klubovi i njihovi osnovni igrači"
|
||||
echo " $0 all-players # Svi klubovi + svi igrači sa svim sezonama"
|
||||
echo " $0 all # Kompletno: klubovi i svi detalji igrača"
|
||||
echo " $0 deep --limit 50000 # Osvježi sezone za 50000 igrača"
|
||||
;;
|
||||
*) echo "Unknown command: $1"; exit 1 ;;
|
||||
esac
|
||||
esac
|
||||
@@ -1,4 +1,7 @@
|
||||
#!/usr/bin/env python3
|
||||
from dotenv import load_dotenv
|
||||
load_dotenv('/opt/rinet-gpu/.env.master')
|
||||
# auto-added by patch_scrapers_with_dotenv.sh
|
||||
"""HNS sezone retry — pojednostavljen extract."""
|
||||
import os, time, re, json, sys
|
||||
from datetime import datetime
|
||||
@@ -6,7 +9,7 @@ import psycopg2
|
||||
from psycopg2.extras import RealDictCursor
|
||||
from playwright.sync_api import sync_playwright
|
||||
|
||||
DSN = "host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password=R1net2026!SecureDB#v7"
|
||||
DSN = f"host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password={os.environ['DB_PASSWORD']}"
|
||||
|
||||
def find_seasons_in_obj(obj, found=None):
|
||||
if found is None: found = []
|
||||
|
||||
+112
@@ -0,0 +1,112 @@
|
||||
#!/usr/bin/env python3
|
||||
"""HNS sezone retry — pojednostavljen extract."""
|
||||
import os, time, re, json, sys
|
||||
from datetime import datetime
|
||||
import psycopg2
|
||||
from psycopg2.extras import RealDictCursor
|
||||
from playwright.sync_api import sync_playwright
|
||||
|
||||
DSN = f"host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password={os.environ['DB_PASSWORD']}"
|
||||
|
||||
def find_seasons_in_obj(obj, found=None):
    """Collect every dict that has a 'season' or 'sezona' key.

    Walks *obj* (nested dicts and lists) depth-first in document order. A
    matching dict is recorded before its own values are visited. Results are
    appended to *found* when given (it is mutated and returned), otherwise to
    a fresh list.
    """
    hits = [] if found is None else found

    def walk(node):
        if isinstance(node, list):
            for child in node:
                walk(child)
        elif isinstance(node, dict):
            if 'season' in node or 'sezona' in node:
                hits.append(node)
            for child in node.values():
                walk(child)

    walk(obj)
    return hits
|
||||
|
||||
def _retry_rows_from_page(page):
    """Return season rows parsed from the loaded page (JSON first, body text as fallback)."""
    rows = []

    # Preferred source: the embedded __NEXT_DATA__ JSON payload.
    m = re.search(r'__NEXT_DATA__"\s*type="application/json">([^<]+)</script>', page.content())
    if m:
        try:
            for s in find_seasons_in_obj(json.loads(m.group(1))):
                sezona = s.get('season') or s.get('sezona')
                if sezona:
                    rows.append({'sezona': str(sezona), 'klub': '', 'natjecanje': '', 'nastupi': 0, 'golovi': 0})
        except Exception as e:
            # Best effort: report and fall through to the text fallback.
            print(f" ! __NEXT_DATA__ parse failed: {e}", flush=True)

    # Fallback: lines shaped like "<season> <text> <numbers>" in the visible body text.
    if not rows:
        body = page.locator('body').inner_text()
        for line in body.split('\n'):
            match = re.match(r'^(20\d{2}/\d{2})\s+(.+?)\s+(\d+(?:\s+\d+)*)\s*$', line.strip())
            if match:
                nums = [int(x) for x in match.group(3).split()]
                rows.append({
                    'sezona': match.group(1), 'klub': match.group(2)[:200], 'natjecanje': '',
                    'nastupi': nums[0] if nums else 0,
                    'golovi': nums[1] if len(nums) > 1 else 0,
                })
    return rows


def _retry_store_rows(conn, t, rows):
    """Insert *rows* for target *t*; return how many rows were actually inserted."""
    added = 0
    with conn.cursor() as cur:
        for r in rows:
            try:
                cur.execute("""
                    INSERT INTO pgz_sport.hns_player_seasons
                        (hns_igrac_id, clan_id, sezona, klub_naziv, natjecanje, nastupi, golovi)
                    VALUES (%s, %s, %s, %s, %s, %s, %s)
                    ON CONFLICT DO NOTHING
                """, (t['hns_igrac_id'], t['clan_id'], r['sezona'], r['klub'],
                      r['natjecanje'], r['nastupi'], r['golovi']))
                # rowcount is 0 when ON CONFLICT DO NOTHING skipped the row.
                added += max(cur.rowcount, 0)
            except psycopg2.Error as e:
                print(f" ! insert failed for {t['hns_igrac_id']} {r['sezona']}: {e}", flush=True)
    return added


def main():
    """Retry season extraction for up to 200 players that have no season rows yet.

    Players whose ``source_url`` is not a semafor.hns.family player page are
    skipped. A failure on one player is printed and the run continues. The
    browser and the database connection are released on exit.
    """
    conn = psycopg2.connect(DSN)
    conn.autocommit = True
    try:
        with conn.cursor(cursor_factory=RealDictCursor) as cur:
            cur.execute("""
                SELECT c.id AS clan_id, c.hns_igrac_id, c.ime, c.prezime, c.source_url
                FROM pgz_sport.clanovi c
                WHERE c.hns_igrac_id IS NOT NULL
                AND NOT EXISTS (SELECT 1 FROM pgz_sport.hns_player_seasons s WHERE s.hns_igrac_id = c.hns_igrac_id)
                ORDER BY c.id LIMIT 200
            """)
            targets = cur.fetchall()

        print(f"Targets: {len(targets)}", flush=True)

        seasons_added = 0
        with sync_playwright() as pw:
            browser = pw.chromium.launch(headless=True, args=["--no-sandbox","--ignore-certificate-errors"])
            try:
                ctx = browser.new_context(ignore_https_errors=True,
                                          user_agent="Mozilla/5.0 (X11; Linux x86_64) Chrome/120.0.0.0")
                page = ctx.new_page()

                # 1-based index so the progress display ends at len(targets).
                for i, t in enumerate(targets, 1):
                    url = t['source_url']
                    if not url or 'semafor.hns.family/igraci/' not in url:
                        continue
                    try:
                        page.goto(url, wait_until="networkidle", timeout=20000)
                        try:
                            page.wait_for_selector('table, .karijera, [class*="season"]', timeout=6000)
                        except Exception:
                            # The selector is optional; the extraction below is attempted regardless.
                            pass
                        time.sleep(0.5)

                        rows = _retry_rows_from_page(page)
                        if rows:
                            seasons_added += _retry_store_rows(conn, t, rows)
                            print(f" ✓ [{i}/{len(targets)}] {t['ime']} {t['prezime']}: {len(rows)} sezone (total added: {seasons_added})", flush=True)

                        if i % 20 == 0:
                            print(f" [{i}/{len(targets)}] processed, total added: {seasons_added}", flush=True)
                    except Exception as e:
                        print(f" ❌ {t['ime']}: {e}", flush=True)
            finally:
                browser.close()
    finally:
        conn.close()

    print(f"\nDone. Total sezone added: {seasons_added}")
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
@@ -1,11 +1,14 @@
|
||||
#!/usr/bin/env python3
|
||||
from dotenv import load_dotenv
|
||||
load_dotenv('/opt/rinet-gpu/.env.master')
|
||||
# auto-added by patch_scrapers_with_dotenv.sh
|
||||
"""HNS sezone v3 — koristi __NEXT_DATA__ JSON parser primarily."""
|
||||
import os, time, re, json, sys
|
||||
import psycopg2
|
||||
from psycopg2.extras import RealDictCursor
|
||||
from playwright.sync_api import sync_playwright
|
||||
|
||||
DSN = "host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password=R1net2026!SecureDB#v7"
|
||||
DSN = f"host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password={os.environ['DB_PASSWORD']}"
|
||||
|
||||
def find_seasons(obj, found=None, depth=0):
|
||||
if depth > 25: return found or []
|
||||
|
||||
Executable
+141
@@ -0,0 +1,141 @@
|
||||
#!/usr/bin/env python3
|
||||
"""HNS sezone v3 — koristi __NEXT_DATA__ JSON parser primarily."""
|
||||
import os, time, re, json, sys
|
||||
import psycopg2
|
||||
from psycopg2.extras import RealDictCursor
|
||||
from playwright.sync_api import sync_playwright
|
||||
|
||||
DSN = f"host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password={os.environ['DB_PASSWORD']}"
|
||||
|
||||
def find_seasons(obj, found=None, depth=0):
    """Collect season-like dicts from a nested JSON structure.

    A dict qualifies when it has a ``season`` key holding a str or dict, or a
    ``sezona`` key. Dict items of a list stored under a career-like key
    (careers, career, seasons, sezone, statistics, stats) also qualify when
    they carry ``season``, ``sezona``, ``year`` or ``godina``. Each dict is
    reported once even when both rules match it.

    Args:
        obj: Parsed JSON value to scan.
        found: Optional list to append to; mutated and returned.
        depth: Current nesting level; nothing deeper than 25 is scanned.

    Returns:
        The list of matching dicts in discovery order.
    """
    if depth > 25:
        return found or []
    if found is None:
        found = []
    # Identity set: the list scan and the recursion can reach the same dict.
    seen = {id(item) for item in found}
    _collect_seasons(obj, found, seen, depth)
    return found


def _collect_seasons(obj, found, seen, depth):
    """Recursive worker for find_seasons; appends unseen matches to *found*."""
    if depth > 25:
        return

    def add(item):
        if id(item) not in seen:
            seen.add(id(item))
            found.append(item)

    if isinstance(obj, dict):
        # Detect season-like dict
        if ('season' in obj and isinstance(obj.get('season'), (str, dict))) or 'sezona' in obj:
            add(obj)
        # Detect career object with seasons array
        for k, v in obj.items():
            if k.lower() in ('careers','career','seasons','sezone','statistics','stats') and isinstance(v, list):
                for item in v:
                    if isinstance(item, dict) and any(kk in item for kk in ('season','sezona','year','godina')):
                        add(item)
            _collect_seasons(v, found, seen, depth + 1)
    elif isinstance(obj, list):
        for item in obj:
            _collect_seasons(item, found, seen, depth + 1)
|
||||
|
||||
def normalize_season(s):
    """Convert a season dict to a flat row.

    ``sezona``, ``klub`` and ``natjecanje`` come from the first truthy
    candidate key; a nested dict is reduced to its name/label. Counters are
    looked up by case-insensitive substring match on the keys of *s*, and the
    first matching key decides the value.

    Returns:
        Dict with keys sezona, klub, natjecanje, nastupi, startna, zamjena,
        golovi, asistencije, zuti, crveni, minute.
    """
    sezona = s.get('season') or s.get('sezona') or s.get('year') or s.get('godina') or ''
    if isinstance(sezona, dict):
        sezona = sezona.get('name') or sezona.get('label') or str(sezona.get('year',''))
    sezona = str(sezona)

    klub = s.get('club') or s.get('klub') or s.get('team') or ''
    if isinstance(klub, dict):
        klub = klub.get('name') or klub.get('naziv') or ''

    natj = s.get('competition') or s.get('natjecanje') or s.get('league') or ''
    if isinstance(natj, dict):
        natj = natj.get('name') or natj.get('naziv') or ''

    def num(*keys):
        """Return the int value of the first key of *s* containing any of *keys*, else 0."""
        for k in keys:
            for kk in s.keys():
                if k.lower() in kk.lower():
                    v = s[kk]
                    try:
                        return int(v)
                    except (TypeError, ValueError, OverflowError):
                        # Not directly convertible: keep only the digits of its text form.
                        try:
                            return int(re.sub(r'\D','', str(v)) or 0)
                        except ValueError:
                            return 0
        return 0

    return {
        'sezona': sezona, 'klub': str(klub)[:200], 'natjecanje': str(natj)[:100],
        'nastupi': num('matches','nastup','appearance'),
        'startna': num('start'),
        'zamjena': num('sub','zamjen'),
        'golovi': num('goal','gol'),
        'asistencije': num('assist','asist'),
        'zuti': num('yellow','žut','zut'),
        'crveni': num('red','crv'),
        'minute': num('minute','minut','min'),
    }
|
||||
|
||||
def _v3_store_rows(conn, t, rows):
    """Insert normalized season *rows* for target *t*; return how many were actually inserted."""
    added = 0
    with conn.cursor() as cur:
        for r in rows:
            try:
                cur.execute("""
                    INSERT INTO pgz_sport.hns_player_seasons
                        (hns_igrac_id, clan_id, sezona, klub_naziv, natjecanje,
                         nastupi, startna, zamjena, golovi, asistencije, zuti, crveni, minute)
                    VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
                    ON CONFLICT DO NOTHING
                """, (t['hns_igrac_id'], t['clan_id'], r['sezona'], r['klub'], r['natjecanje'],
                      r['nastupi'], r['startna'], r['zamjena'], r['golovi'],
                      r['asistencije'], r['zuti'], r['crveni'], r['minute']))
                # rowcount is 0 when ON CONFLICT DO NOTHING skipped the row.
                added += max(cur.rowcount, 0)
            except psycopg2.Error as e:
                print(f" ! insert failed for {t['hns_igrac_id']} {r['sezona']}: {e}", flush=True)
    return added


def main():
    """Fill hns_player_seasons from the __NEXT_DATA__ JSON of each player page.

    Targets up to 200 players that have no season rows yet. Players whose
    ``source_url`` is not a semafor.hns.family player page are skipped. A
    failure on one player is printed and the run continues. The browser and
    the database connection are released on exit.
    """
    conn = psycopg2.connect(DSN)
    conn.autocommit = True
    try:
        with conn.cursor(cursor_factory=RealDictCursor) as cur:
            cur.execute("""
                SELECT c.id AS clan_id, c.hns_igrac_id, c.ime, c.prezime, c.source_url
                FROM pgz_sport.clanovi c
                WHERE c.hns_igrac_id IS NOT NULL
                AND NOT EXISTS (SELECT 1 FROM pgz_sport.hns_player_seasons s WHERE s.hns_igrac_id = c.hns_igrac_id)
                ORDER BY c.id LIMIT 200
            """)
            targets = cur.fetchall()

        print(f"Targets: {len(targets)}", flush=True)

        seasons_added = 0
        with sync_playwright() as pw:
            browser = pw.chromium.launch(headless=True, args=["--no-sandbox","--ignore-certificate-errors"])
            try:
                page = browser.new_context(ignore_https_errors=True,
                                           user_agent="Mozilla/5.0 (X11; Linux x86_64) Chrome/120.0.0.0").new_page()

                # 1-based index so the progress display ends at len(targets).
                for i, t in enumerate(targets, 1):
                    url = t['source_url']
                    if not url or 'semafor.hns.family/igraci/' not in url:
                        continue
                    try:
                        page.goto(url, wait_until="networkidle", timeout=20000)
                        time.sleep(0.8)

                        html = page.content()
                        rows = []

                        # Extract __NEXT_DATA__
                        m = re.search(r'__NEXT_DATA__"\s*type="application/json">([^<]+)</script>', html)
                        if m:
                            try:
                                for s in find_seasons(json.loads(m.group(1))):
                                    n = normalize_season(s)
                                    if n['sezona']:
                                        rows.append(n)
                            except Exception as e:
                                # Keep whatever was parsed so far, but report the failure.
                                print(f" ! __NEXT_DATA__ parse failed for {t['hns_igrac_id']}: {e}", flush=True)

                        # Insert
                        if rows:
                            seasons_added += _v3_store_rows(conn, t, rows)
                            print(f" ✓ [{i}/{len(targets)}] {t['ime']} {t['prezime']}: +{len(rows)} sezone (total: {seasons_added})", flush=True)

                        if i % 30 == 0:
                            print(f" [{i}/{len(targets)}] processed, total: {seasons_added}", flush=True)
                    except Exception as e:
                        # Report and continue with the next player.
                        print(f" ❌ {t['ime']}: {e}", flush=True)
            finally:
                browser.close()
    finally:
        conn.close()

    print(f"\n✅ Done. Total: {seasons_added}", flush=True)
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
@@ -1,4 +1,7 @@
|
||||
#!/usr/bin/env python3
|
||||
from dotenv import load_dotenv
|
||||
load_dotenv('/opt/rinet-gpu/.env.master')
|
||||
# auto-added by patch_scrapers_with_dotenv.sh
|
||||
# ──────────────────────────────────────────────────────────────────────────────
|
||||
# hns_watchdog.py — PGŽ Sport HNS pipeline watchdog (SUB7)
|
||||
# Author : Damir Radulić <dradulic@outlook.com> / <damir@rinet.one>
|
||||
@@ -32,7 +35,7 @@ import requests
|
||||
# ── Config ────────────────────────────────────────────────────────────────────
|
||||
DSN = os.getenv(
|
||||
"RINET_DSN",
|
||||
"host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password=R1net2026!SecureDB#v7",
|
||||
f"host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password={os.environ['DB_PASSWORD']}",
|
||||
)
|
||||
TG_TOKEN = os.getenv("TG_BOT_TOKEN", "8535797835:AAFItT-92jzZ9NWFafLxn0dLa1_n2s-JE5Y")
|
||||
TG_CHAT = os.getenv("TG_CHAT", "7969491558")
|
||||
|
||||
Executable
+340
@@ -0,0 +1,340 @@
|
||||
#!/usr/bin/env python3
|
||||
# ──────────────────────────────────────────────────────────────────────────────
|
||||
# hns_watchdog.py — PGŽ Sport HNS pipeline watchdog (SUB7)
|
||||
# Author : Damir Radulić <dradulic@outlook.com> / <damir@rinet.one>
|
||||
# Date : 2026-05-05
|
||||
# Version: 1.0.0
|
||||
# Purpose: Periodically poll DB progress for the HNS scraping pipeline,
|
||||
# detect stalls, restart fallen worker processes and send Telegram
|
||||
# status updates every 30 minutes. Fires a special "ALL DONE" alert
|
||||
# once the mission goal is reached.
|
||||
#
|
||||
# Modes : --once run a single check and exit (cron-friendly)
|
||||
# --daemon loop forever, sleeping CHECK_INTERVAL_SEC between checks
|
||||
#
|
||||
# Goal : 59/59 PGŽ financirani klubovi sa hns_klub_id, ≥80% igrača s
|
||||
# profile_complete=true (visina_cm IS NOT NULL), ≥1000 utakmica.
|
||||
# ──────────────────────────────────────────────────────────────────────────────
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
import json
|
||||
import argparse
|
||||
import logging
|
||||
import logging.handlers
|
||||
import subprocess
|
||||
from datetime import datetime, timedelta
|
||||
from pathlib import Path
|
||||
|
||||
import psycopg2
|
||||
import requests
|
||||
|
||||
# ── Config ────────────────────────────────────────────────────────────────────
|
||||
# Connection string: RINET_DSN wins when it is set; otherwise the DSN is built
# from DB_PASSWORD. The fallback is assembled only when needed, so a missing
# DB_PASSWORD no longer raises KeyError while RINET_DSN is provided.
DSN = os.environ.get("RINET_DSN")
if DSN is None:
    DSN = f"host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password={os.environ['DB_PASSWORD']}"
# SECURITY(review): the fallback below is a hard-coded bot token committed to
# the repository. Rotate it and supply TG_BOT_TOKEN via the environment only.
TG_TOKEN = os.getenv("TG_BOT_TOKEN", "8535797835:AAFItT-92jzZ9NWFafLxn0dLa1_n2s-JE5Y")
TG_CHAT = os.getenv("TG_CHAT", "7969491558")
|
||||
|
||||
LOG_DIR = Path("/var/log/pgz-sport-debug")
|
||||
LOG_FILE = LOG_DIR / "hns_watchdog.log"
|
||||
STATE_FILE = LOG_DIR / "hns_watchdog_state.json"
|
||||
|
||||
CHECK_INTERVAL_SEC = 30 * 60 # 30 min between daemon iterations
|
||||
STALL_WINDOW_SEC = 30 * 60 # consider stale if no growth in 30 min
|
||||
DONE_FLAG_FILE = LOG_DIR / "hns_watchdog_DONE.flag"
|
||||
|
||||
# Mission targets
|
||||
TARGET_KLUBOVI = 59
|
||||
TARGET_PROFILE_PCT = 0.80
|
||||
TARGET_MATCHES = 1000
|
||||
|
||||
# Worker processes to keep alive (process_name : restart_command)
|
||||
WORKERS = {
|
||||
"hns_master_harvester": [
|
||||
"python3", "/opt/pgz-sport/scripts/hns_master_harvester.py",
|
||||
],
|
||||
"hns_season_v3": [
|
||||
"python3", "/opt/pgz-sport/scripts/hns_season_v3.py",
|
||||
],
|
||||
}
|
||||
|
||||
# ── Logging ───────────────────────────────────────────────────────────────────
|
||||
LOG_DIR.mkdir(parents=True, exist_ok=True)
|
||||
logger = logging.getLogger("hns_watchdog")
|
||||
logger.setLevel(logging.INFO)
|
||||
if not logger.handlers:
|
||||
handler = logging.handlers.RotatingFileHandler(
|
||||
LOG_FILE, maxBytes=5_000_000, backupCount=5
|
||||
)
|
||||
handler.setFormatter(logging.Formatter(
|
||||
"%(asctime)s %(levelname)s %(message)s",
|
||||
datefmt="%Y-%m-%dT%H:%M:%S",
|
||||
))
|
||||
logger.addHandler(handler)
|
||||
logger.addHandler(logging.StreamHandler(sys.stdout))
|
||||
|
||||
|
||||
# ── Helpers ───────────────────────────────────────────────────────────────────
|
||||
PROGRESS_SQL = """
|
||||
SELECT
|
||||
(SELECT COUNT(*) FROM pgz_sport.klubovi
|
||||
WHERE sport='nogomet' AND pgz_sufinanciran=true
|
||||
AND hns_klub_id IS NOT NULL) AS klubovi_hns,
|
||||
(SELECT COUNT(DISTINCT klub_id) FROM pgz_sport.hns_klub_roster) AS roster_klubovi,
|
||||
(SELECT COUNT(*) FROM pgz_sport.clanovi
|
||||
WHERE hns_igrac_id IS NOT NULL) AS igraci_hns,
|
||||
(SELECT COUNT(*) FROM pgz_sport.clanovi
|
||||
WHERE hns_igrac_id IS NOT NULL AND visina_cm IS NOT NULL) AS igraci_profil,
|
||||
(SELECT COUNT(*) FROM pgz_sport.hns_player_seasons) AS seasons_rec,
|
||||
(SELECT COUNT(*) FROM pgz_sport.hns_player_matches) AS matches_rec
|
||||
;
|
||||
"""
|
||||
|
||||
PENDING_SQL = """
|
||||
SELECT COUNT(*) FROM pgz_sport.clanovi
|
||||
WHERE hns_igrac_id IS NOT NULL
|
||||
AND visina_cm IS NULL;
|
||||
"""
|
||||
|
||||
|
||||
def db_query():
    """Return a dict of progress counters, or None when the query fails.

    Keys: klubovi_hns, roster_klubovi, igraci_hns, igraci_profil, seasons_rec,
    matches_rec (from PROGRESS_SQL) and pending_players (from PENDING_SQL,
    None when that query alone fails). The connection is closed on every
    path, including failures.
    """
    conn = None
    try:
        conn = psycopg2.connect(DSN, connect_timeout=10)
        conn.autocommit = True
        with conn.cursor() as cur:
            cur.execute(PROGRESS_SQL)
            row = cur.fetchone()
            cols = ["klubovi_hns", "roster_klubovi", "igraci_hns",
                    "igraci_profil", "seasons_rec", "matches_rec"]
            counts = dict(zip(cols, row))
            try:
                cur.execute(PENDING_SQL)
                counts["pending_players"] = cur.fetchone()[0]
            except Exception as e:
                # The pending count is optional; keep the main counters.
                logger.warning("PENDING_SQL failed: %s", e)
                counts["pending_players"] = None
        return counts
    except Exception as e:
        logger.error("DB query failed: %s", e)
        return None
    finally:
        if conn is not None:
            try:
                conn.close()
            except Exception as e:
                logger.warning("DB close failed: %s", e)
|
||||
|
||||
|
||||
def telegram(text):
    """Send *text* (truncated to 4000 characters) to the configured Telegram chat.

    Returns a truthy value when the HTTP call succeeded and the API answered
    ok, a falsy value otherwise. Never raises; failures are logged.
    """
    try:
        endpoint = f"https://api.telegram.org/bot{TG_TOKEN}/sendMessage"
        payload = {
            "chat_id": TG_CHAT,
            "text": text[:4000],
            "parse_mode": "HTML",
            "disable_web_page_preview": "true",
        }
        resp = requests.post(endpoint, data=payload, timeout=10)
        delivered = resp.ok and resp.json().get("ok", False)
        if delivered:
            return delivered
        logger.warning("Telegram returned: %s", resp.text[:300])
        return delivered
    except Exception as err:
        logger.error("Telegram send failed: %s", err)
        return False
|
||||
|
||||
|
||||
def load_state():
    """Return the persisted watchdog state, or {} when it is missing or unreadable."""
    if not STATE_FILE.exists():
        return {}
    try:
        raw = STATE_FILE.read_text()
        return json.loads(raw)
    except Exception:
        # An unreadable or corrupt state file is treated like no state at all.
        return {}
|
||||
|
||||
|
||||
def save_state(state):
    """Write *state* to STATE_FILE as indented JSON; log a warning on failure."""
    try:
        # default=str stringifies values json cannot encode natively.
        serialized = json.dumps(state, indent=2, default=str)
        STATE_FILE.write_text(serialized)
    except Exception as err:
        logger.warning("Cannot persist state: %s", err)
|
||||
|
||||
|
||||
def proc_alive(name):
    """Return True if a process whose command line matches *name* is running.

    Uses ``pgrep -f``. Exit status 0 means at least one match and 1 means no
    match. Any other status, or a failure to run pgrep at all, is
    inconclusive and reported as alive so that no duplicate worker is
    spawned on uncertain information.
    """
    try:
        r = subprocess.run(
            ["pgrep", "-f", name],
            stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL,
            timeout=5,
        )
    except Exception as e:
        logger.warning("pgrep failed for %s: %s", name, e)
        return True  # err on caution: do not respawn if uncertain

    if r.returncode == 0:
        return True
    if r.returncode == 1:
        return False
    # Status >= 2 is pgrep's own error (bad usage / fatal), not "no such process".
    logger.warning("pgrep exited with status %s for %s; assuming alive", r.returncode, name)
    return True
|
||||
|
||||
|
||||
def restart_worker(name, cmd):
    """Start *cmd* as a detached process, logging its output to a timestamped file.

    Returns True when the process was launched, False when launching failed.
    """
    stamp = datetime.now().strftime("%Y%m%d_%H%M")
    log_path = LOG_DIR / f"{name}_respawn_{stamp}.log"
    try:
        with open(log_path, "ab") as sink:
            # start_new_session=True runs the worker in its own session.
            subprocess.Popen(
                cmd,
                stdout=sink,
                stderr=subprocess.STDOUT,
                cwd="/opt/pgz-sport/scripts",
                start_new_session=True,
            )
    except Exception as err:
        logger.error("Failed to respawn %s: %s", name, err)
        return False
    logger.info("Re-spawned worker %s -> %s", name, log_path)
    return True
|
||||
|
||||
|
||||
def check_workers():
    """Restart every configured worker that is not running.

    Returns the names of the workers that were successfully re-spawned.
    """
    return [
        worker
        for worker, command in WORKERS.items()
        if not proc_alive(worker) and restart_worker(worker, command)
    ]
|
||||
|
||||
|
||||
def detect_stale(prev, curr):
    """Return True when no tracked counter grew although players are pending.

    False is returned when either snapshot is missing, when there are no
    pending players, when ``prev["ts"]`` is not a parseable ISO timestamp, or
    when less than STALL_WINDOW_SEC has elapsed since it. Otherwise the
    pipeline counts as stale unless seasons_rec, matches_rec or igraci_profil
    increased.
    """
    if not (prev and curr):
        return False
    if curr.get("pending_players") in (None, 0):
        return False

    try:
        last_seen = datetime.fromisoformat(prev.get("ts"))
    except Exception:
        return False

    elapsed = datetime.utcnow() - last_seen
    if elapsed < timedelta(seconds=STALL_WINDOW_SEC):
        return False  # not enough time elapsed

    tracked = ("seasons_rec", "matches_rec", "igraci_profil")
    return not any(curr.get(key, 0) > prev.get(key, 0) for key in tracked)
|
||||
|
||||
|
||||
def goal_reached(c):
    """Return True when counters *c* meet every mission target.

    Requires at least TARGET_KLUBOVI clubs, at least TARGET_MATCHES matches,
    a positive player count, and a profiled share of players of at least
    TARGET_PROFILE_PCT. A missing or empty *c* yields False.
    """
    if not c:
        return False
    prerequisites_met = (
        c["klubovi_hns"] >= TARGET_KLUBOVI
        and c["matches_rec"] >= TARGET_MATCHES
        and c["igraci_hns"] > 0
    )
    if not prerequisites_met:
        return False
    share = c["igraci_profil"] / c["igraci_hns"]
    return share >= TARGET_PROFILE_PCT
|
||||
|
||||
|
||||
def fmt_status(c, respawned, stale, suffix=""):
    """Render the watchdog status message (HTML-style ``<b>`` markup).

    Args:
        c: Counts dict, or a falsy value when the DB query failed.
        respawned: Names of workers that were re-spawned this cycle.
        stale: Whether the stall detector fired.
        suffix: Optional extra text appended on its own line.

    Returns:
        A short failure notice when ``c`` is falsy, otherwise the full
        multi-line report followed by the optional respawn, stale and
        suffix lines.
    """
    if not c:
        return f"<b>HNS watchdog</b>\nDB query failed at {datetime.utcnow():%Y-%m-%d %H:%M}Z"

    players = c["igraci_hns"]
    pct = (c["igraci_profil"] / players * 100) if players else 0

    report = [
        f"<b>HNS watchdog</b> {datetime.utcnow():%Y-%m-%d %H:%MZ}",
        f"Klubovi (HNS id): <b>{c['klubovi_hns']}/{TARGET_KLUBOVI}</b>",
        f"Roster scraped: {c['roster_klubovi']}",
        f"Igrači (HNS id): {c['igraci_hns']}",
        f"Igrači s profilom: {c['igraci_profil']} ({pct:0.1f}%)",
        f"Sezone: {c['seasons_rec']}",
        f"Utakmice: <b>{c['matches_rec']}</b>/{TARGET_MATCHES}",
        f"Pending igrači: {c.get('pending_players')}",
        "",  # the fixed part of the report ends with a newline
    ]
    if respawned:
        report.append(f"Re-spawned: {', '.join(respawned)}")
    if stale:
        report.append("STALE: nema rasta u zadnjih 30 min")
    if suffix:
        report.append(f"{suffix}")
    return "\n".join(report)
|
||||
|
||||
|
||||
# ── Main check ────────────────────────────────────────────────────────────────
|
||||
def run_check(send_telegram=True):
    """Run one watchdog cycle: query counts, respawn workers, maybe notify.

    Args:
        send_telegram: When false, the Telegram message is never sent, though
            the notify decision is still computed and returned.

    Returns:
        dict with ``counts``, ``respawned``, ``stale``, ``done`` and
        ``notified``.
        NOTE(review): ``notified`` holds the *decision* to notify; it stays
        True even when ``send_telegram`` is false or delivery failed.
    """
    logger.info("=== watchdog cycle ===")
    state = load_state()
    prev = state.get("last_counts")

    counts = db_query()
    respawned = check_workers()
    # Stall detection needs current counts; without them it is skipped.
    stale = detect_stale(prev, counts) if counts else False

    done = goal_reached(counts)
    msg = fmt_status(counts, respawned, stale)

    notify = False
    suffix = ""

    if done and not DONE_FLAG_FILE.exists():
        # First cycle in which the target is reached: the flag file makes the
        # "ALL DONE" announcement a one-time event.
        # NOTE(review): the flag is written before delivery is attempted, so a
        # failed send is not retried on the next cycle.
        DONE_FLAG_FILE.write_text(datetime.utcnow().isoformat())
        suffix = "\nALL DONE — mission target reached!"
        msg = fmt_status(counts, respawned, stale, suffix=suffix)
        notify = True
    elif respawned or stale:
        notify = True
    else:
        # Routine heartbeat: send only if no notification was delivered in the
        # last 29 minutes, or if the stored timestamp is missing/unparsable.
        last_ts = state.get("last_notify_ts")
        if not last_ts:
            notify = True
        else:
            try:
                last_dt = datetime.fromisoformat(last_ts)
                if datetime.utcnow() - last_dt >= timedelta(minutes=29):
                    notify = True
            except Exception:
                notify = True

    logger.info("counts=%s respawned=%s stale=%s notify=%s done=%s",
                counts, respawned, stale, notify, done)

    if send_telegram and notify:
        # The heartbeat timestamp advances only on confirmed delivery.
        if telegram(msg):
            state["last_notify_ts"] = datetime.utcnow().isoformat()
        else:
            logger.warning("Telegram delivery failed")

    if counts:
        # Keep the previous snapshot when the DB query failed, so the next
        # cycle still has a baseline for stall detection.
        state["last_counts"] = {**counts, "ts": datetime.utcnow().isoformat()}
    save_state(state)

    return {"counts": counts, "respawned": respawned,
            "stale": stale, "done": done, "notified": notify}
|
||||
|
||||
|
||||
# ── Daemon loop ───────────────────────────────────────────────────────────────
|
||||
def run_daemon():
    """Run watchdog cycles forever, pausing CHECK_INTERVAL_SEC between them.

    An exception escaping a cycle is logged with its traceback and does not
    stop the loop. This function does not return.
    """
    pause = CHECK_INTERVAL_SEC
    logger.info("Starting watchdog daemon (interval=%ss)", pause)
    while True:
        try:
            run_check(send_telegram=True)
        except Exception as exc:
            logger.exception("cycle crashed: %s", exc)
        # Sleep after every cycle, whether it succeeded or crashed.
        time.sleep(CHECK_INTERVAL_SEC)
|
||||
|
||||
|
||||
# ── Entry point ───────────────────────────────────────────────────────────────
|
||||
def main():
    """Parse the command line and run the watchdog once or as a daemon.

    Exactly one of ``--once`` / ``--daemon`` is required. In ``--once`` mode
    the cycle result is printed as a single JSON document; ``--no-telegram``
    suppresses sending in that mode.
    """
    parser = argparse.ArgumentParser(description="HNS pipeline watchdog")
    mode = parser.add_mutually_exclusive_group(required=True)
    mode.add_argument(
        "--once", action="store_true",
        help="Run a single check and exit (cron-friendly)",
    )
    mode.add_argument(
        "--daemon", action="store_true",
        help="Run forever, sleeping 30 min between checks",
    )
    parser.add_argument(
        "--no-telegram", action="store_true",
        help="Skip Telegram notifications (debug)",
    )
    opts = parser.parse_args()

    if not opts.daemon:
        outcome = run_check(send_telegram=not opts.no_telegram)
        # Compact JSON on stdout for cron / shell consumers.
        print(json.dumps(outcome, default=str, ensure_ascii=False))
        return
    run_daemon()
|
||||
|
||||
|
||||
# Script entry point: run main() only when executed directly, not on import.
if __name__ == "__main__":
    main()
|
||||
@@ -1,4 +1,7 @@
|
||||
#!/usr/bin/env python3
|
||||
from dotenv import load_dotenv
|
||||
load_dotenv('/opt/rinet-gpu/.env.master')
|
||||
# auto-added by patch_scrapers_with_dotenv.sh
|
||||
# hns_youth_categories.py — SUB5 — HNS Semafor youth team scraper (v1.0)
|
||||
# Author: Damir Radulić <dradulic@outlook.com> / <damir@rinet.one>
|
||||
# Date: 2026-05-05
|
||||
@@ -59,7 +62,7 @@ except Exception:
|
||||
|
||||
DB_DSN = dict(
|
||||
host="10.10.0.2", port=6432, dbname="rinet_v3",
|
||||
user="rinet", password="R1net2026!SecureDB#v7",
|
||||
user="rinet", password=os.environ["DB_PASSWORD"],
|
||||
)
|
||||
BASE = "https://semafor.hns.family"
|
||||
RATE_S = 1.0
|
||||
|
||||
@@ -0,0 +1,581 @@
|
||||
#!/usr/bin/env python3
|
||||
# hns_youth_categories.py — SUB5 — HNS Semafor youth team scraper (v1.0)
|
||||
# Author: Damir Radulić <dradulic@outlook.com> / <damir@rinet.one>
|
||||
# Date: 2026-05-05
|
||||
# Description:
|
||||
# Discovers per-club age categories (Seniori / U-19 juniori / U-17 kadeti /
|
||||
# U-15 stariji pioniri / U-13 mlađi pioniri / U-11/U-9 početnici) by
|
||||
# scraping HNS COMET Semafor competition pages and matching participating
|
||||
# klubovi with hns_klub_id in pgz_sport.klubovi. For each (klub, kategorija,
|
||||
# sezona) the per-club competition roster is fetched and players are
|
||||
# upserted into pgz_sport.clan_kategorije (M2M player x category x season).
|
||||
#
|
||||
# Strategy:
|
||||
# 1. Hardcoded list of per-season national + 2.NL competitions whose
|
||||
# cid → kategorija mapping is known (PGZ regional 3.NL/ŽNS leagues
|
||||
# are added as discovered).
|
||||
# 2. For each competition, fetch /natjecanja/{cid}/{slug}/ and extract
|
||||
# all participating /klubovi/{kid}/{slug}/ links.
|
||||
# 3. Match against pgz_sport.klubovi (hns_klub_id). For each match,
|
||||
# fetch /klubovi/{kid}/{slug}/?cid={cid} and parse player /igraci/
|
||||
# links — these are the players belonging to this age category.
|
||||
# 4. Upsert each player as clanovi (source=hns_semafor) and write
|
||||
# clan_kategorije(clan_id, klub_id, kategorija, sezona, source,
|
||||
# source_url, scraped_at).
|
||||
#
|
||||
# Run modes:
|
||||
# python hns_youth_categories.py discover # dry-run, only logs
|
||||
# python hns_youth_categories.py run # full scrape + DB upsert
|
||||
# python hns_youth_categories.py klub <db_kid> # one club only
|
||||
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import time
|
||||
import json
|
||||
import logging
|
||||
from datetime import datetime
|
||||
from urllib.parse import unquote
|
||||
from pathlib import Path
|
||||
|
||||
import psycopg2
|
||||
import psycopg2.extras
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
# Try to use SUB4's hns_api_client for shared session/UA.
# The script's own directory is put first on sys.path so the sibling module
# can be imported regardless of the current working directory.
SCRIPTS_DIR = Path(__file__).resolve().parent
sys.path.insert(0, str(SCRIPTS_DIR))
try:
    import hns_api_client as hns_api  # type: ignore
    _GET_HTML = hns_api._get_html
    _UA = hns_api.UA
except Exception:
    # Client missing or broken: fetch() falls back to its own requests
    # session (signalled by _GET_HTML being None) with a browser-like UA.
    _GET_HTML = None
    _UA = (
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
    )

# Connection keyword arguments for psycopg2.connect(); the password is read
# from the DB_PASSWORD environment variable (KeyError at import if unset).
DB_DSN = dict(
    host="10.10.0.2", port=6432, dbname="rinet_v3",
    user="rinet", password=os.environ["DB_PASSWORD"],
)
BASE = "https://semafor.hns.family"
RATE_S = 1.0   # seconds slept after each successful page fetch
TIMEOUT = 25   # timeout passed to the fallback session's GET

# One log file per run, named with the start minute; the directory is created
# at import time.
LOG_DIR = Path("/var/log/pgz-sport-debug")
LOG_DIR.mkdir(parents=True, exist_ok=True)
LOG_FILE = LOG_DIR / f"sub5_{datetime.now().strftime('%Y%m%d_%H%M')}.log"

# Log to both the run's file and stdout.
log = logging.getLogger("sub5")
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[
        logging.FileHandler(LOG_FILE, encoding="utf-8"),
        logging.StreamHandler(sys.stdout),
    ],
)
|
||||
|
||||
# ── Telegram ───────────────────────────────────────────────────────────────
# The bot token is a credential, so it is read from the environment only and
# has no default in source. When it is empty, tg_send() returns without
# sending anything.
TG_TOKEN = os.environ.get("TG_TOKEN", "")
TG_CHAT = os.environ.get("TG_CHAT", "7969491558")
|
||||
|
||||
|
||||
def tg_send(msg: str):
    """Send ``msg`` to the configured Telegram chat, best-effort.

    Does nothing when TG_TOKEN or TG_CHAT is empty. The message is first sent
    with ``parse_mode=Markdown``; if the API answers HTTP 400 it is re-sent
    once as plain text, so that text containing unbalanced Markdown characters
    (for example an exception message with underscores) is still delivered.
    Failures are logged and never raised.
    """
    if not TG_TOKEN or not TG_CHAT:
        return
    url = f"https://api.telegram.org/bot{TG_TOKEN}/sendMessage"
    payload = {"chat_id": TG_CHAT, "text": msg, "parse_mode": "Markdown"}
    try:
        # Use bare requests (no shared session) with short timeout to avoid
        # hangs on flaky outbound paths.
        r = requests.post(url, data=payload, timeout=(5, 10))
        if r.status_code == 400:
            # Most likely a Markdown entity parse error; retry unformatted.
            payload.pop("parse_mode")
            r = requests.post(url, data=payload, timeout=(5, 10))
        log.info(f"telegram: {r.status_code}")
    except Exception as e:
        # Log only the exception type: the exception text can embed the
        # request URL, which contains the bot token.
        log.warning(f"telegram failed: {type(e).__name__}")
|
||||
|
||||
|
||||
# ── HTTP fallback ──────────────────────────────────────────────────────────
_session = requests.Session()
_session.headers.update({"User-Agent": _UA, "Accept-Language": "hr,en;q=0.7"})


def fetch(url: str) -> str:
    """Return the body of ``url`` as text.

    Delegates to the shared client's getter when it was imported; otherwise
    performs a GET on the module's own session and raises for HTTP error
    statuses.
    """
    if _GET_HTML is None:
        log.debug(f"GET {url}")
        response = _session.get(url, timeout=TIMEOUT)
        response.raise_for_status()
        return response.text
    return _GET_HTML(url)
|
||||
|
||||
|
||||
# ── Competition catalogue ─────────────────────────────────────────────────
|
||||
# Each entry: (cid, slug, kategorija, sezona). PGZ-relevant national /
|
||||
# 2.NL leagues per season. Regional ŽNS leagues are discovered dynamically
|
||||
# via discover_pgz_competitions() once we find them inside klub raspored.
|
||||
COMP_CATALOG = [
|
||||
# 2025/2026 season
|
||||
("100454960", "1-nl-juniori", "juniori-u19", "2025/2026"),
|
||||
("100454979", "1-nl-kadeti", "kadeti-u17", "2025/2026"),
|
||||
("100454999", "1-nl-pioniri", "pioniri-u15", "2025/2026"),
|
||||
("100540163", "2-nl-juniori-a", "juniori-u19", "2025/2026"),
|
||||
("100540177", "2-nl-juniori-b", "juniori-u19", "2025/2026"),
|
||||
("100540032", "2-nl-kadeti-a", "kadeti-u17", "2025/2026"),
|
||||
("100540109", "2-nl-kadeti-b", "kadeti-u17", "2025/2026"),
|
||||
("100381663", "kvalifikacije-za-prvu-nl-juniori", "juniori-u19", "2025/2026"),
|
||||
("100381584", "kvalifikacije-za-prvu-nl-kadeti", "kadeti-u17", "2025/2026"),
|
||||
("100381484", "kvalifikacije-za-prvu-nl-pioniri", "pioniri-u15", "2025/2026"),
|
||||
("100569152", "treca-nl-istok", "seniori", "2025/2026"), # Treća NL Istok
|
||||
("100585203", "treca-nl-zapad", "seniori", "2025/2026"), # Treća NL Zapad (PGŽ klubovi)
|
||||
# PGŽ-region ŽNL leagues discovered via klub raspored auto-discovery
|
||||
("101555188", "1-znl-seniori", "seniori", "2025/2026"), # 1.ŽNL PGŽ seniori
|
||||
("112195128", "kup-zns", "seniori", "2025/2026"), # Kup ŽNS Vinodolsko-Senjsko
|
||||
("104425442", "kup-mladezi-juniori", "juniori-u19", "2025/2026"),
|
||||
("104464435", "kup-mladezi-kadeti", "kadeti-u17", "2025/2026"),
|
||||
("100391485", "supersport-hnl", "seniori", "2025/2026"),
|
||||
("100413651", "supersport-prva-nl", "seniori", "2025/2026"),
|
||||
("100418001", "supersport-druga-nl", "seniori", "2025/2026"),
|
||||
("100439118", "supersport-hnk", "seniori", "2025/2026"), # Cup, all seniori
|
||||
("101411063", "hrvatski-nogometni-kup", "seniori", "2025/2026"),
|
||||
# 2024/2025 season — same structure, slightly different cids; will be
|
||||
# discovered dynamically per-klub as well.
|
||||
]
|
||||
|
||||
# Map from acat dropdown values (English labels, as keyed below) → kategorija slug
|
||||
ACAT_MAP = {
|
||||
"Seniors": "seniori",
|
||||
"Juniors": "juniori-u19",
|
||||
"Juniors 2": "juniori-u19",
|
||||
"Cadets": "kadeti-u17",
|
||||
"Cadets 2": "kadeti-u17",
|
||||
"Pioneers": "pioniri-u15",
|
||||
"Pioneers 2": "pioniri-u15",
|
||||
"Young pioneers": "mladji-pioniri-u13",
|
||||
"Beginners": "pocetnici-u11",
|
||||
"Pre-beginners (6+1, 20min)": "pocetnici-u9",
|
||||
}
|
||||
|
||||
# Heuristic from competition name → kategorija
|
||||
def kategorija_from_name(name: str) -> str:
    """Guess the age-category slug from a competition name or URL slug.

    The name is normalised before matching: lower-cased, Croatian diacritics
    transliterated (``đ`` → ``dj``), and every run of non-alphanumeric
    characters collapsed to one space. This makes display names
    ("Početnici U-9"), ASCII spellings ("pocetnici u-9") and URL slugs
    ("pocetnici-u-9", "mladji-pioniri") all match the same rules.

    Args:
        name: Competition name or slug; ``None``/empty is accepted.

    Returns:
        One of "juniori-u19", "kadeti-u17", "pioniri-u15",
        "mladji-pioniri-u13", "pocetnici-u9", "pocetnici-u11", or "seniori"
        when nothing matches.
    """
    text = (name or "").lower().translate(
        str.maketrans({"č": "c", "ć": "c", "ž": "z", "š": "s", "đ": "dj"})
    )
    text = re.sub(r"[^a-z0-9]+", " ", text).strip()

    def has(*needles: str) -> bool:
        return any(needle in text for needle in needles)

    if has("juniori", "juniors"):
        return "juniori-u19"
    if has("kadeti", "cadets", "kadetkinje"):
        return "kadeti-u17"
    if has("stariji pioniri"):
        return "pioniri-u15"
    # Must precede the generic "pioniri" rule, which it would otherwise hit.
    # "mladi pioniri" covers slugs where đ was flattened to a plain d.
    if has("mladji pioniri", "mladi pioniri", "young pioneers"):
        return "mladji-pioniri-u13"
    if has("pioniri", "pioneers", "pionirke"):
        return "pioniri-u15"
    if has("pocetnici", "beginners"):
        # U-9 is signalled by "pre-beginners" or an explicit "u 9"/"u9";
        # any other beginners competition is treated as U-11.
        if "pre beginners" in text or re.search(r"\bu ?9\b", text):
            return "pocetnici-u9"
        return "pocetnici-u11"
    return "seniori"
|
||||
|
||||
|
||||
# ── DB helpers ─────────────────────────────────────────────────────────────
|
||||
def conn():
    """Open and return a new psycopg2 connection built from DB_DSN."""
    connection = psycopg2.connect(**DB_DSN)
    return connection
|
||||
|
||||
|
||||
def ensure_schema():
    """Make sure pgz_sport.clan_kategorije exists, creating it if missing.

    When the table is already present only a log line is written. Otherwise
    the table and its three lookup indexes are created and committed.
    """
    from contextlib import closing

    # psycopg2's ``with connection`` only ends the transaction; closing() is
    # what actually releases the connection.
    with closing(conn()) as c, c, c.cursor() as cu:
        cu.execute(
            """SELECT 1 FROM information_schema.tables
               WHERE table_schema='pgz_sport' AND table_name='clan_kategorije'"""
        )
        if cu.fetchone():
            log.info("clan_kategorije table verified.")
            return
        cu.execute(
            """CREATE TABLE pgz_sport.clan_kategorije (
                   id SERIAL PRIMARY KEY,
                   clan_id INTEGER REFERENCES pgz_sport.clanovi(id) ON DELETE CASCADE,
                   klub_id INTEGER REFERENCES pgz_sport.klubovi(id),
                   kategorija TEXT NOT NULL,
                   sezona TEXT,
                   source TEXT,
                   source_url TEXT,
                   scraped_at TIMESTAMPTZ DEFAULT now(),
                   UNIQUE (clan_id, kategorija, sezona, klub_id)
               );
               CREATE INDEX IF NOT EXISTS idx_clan_kat_clan
                   ON pgz_sport.clan_kategorije(clan_id);
               CREATE INDEX IF NOT EXISTS idx_clan_kat_sezona
                   ON pgz_sport.clan_kategorije(sezona);
               CREATE INDEX IF NOT EXISTS idx_clan_kat_klub
                   ON pgz_sport.clan_kategorije(klub_id);
            """
        )
        c.commit()
        log.info("Created pgz_sport.clan_kategorije.")
|
||||
|
||||
|
||||
def load_pgz_klubovi() -> dict[int, dict]:
    """Load every club that has an HNS id, keyed by that id.

    Returns:
        ``{hns_klub_id: {"db_id", "naziv", "slug"}}``. Rows are read in
        ascending ``id`` order and only the first row per ``hns_klub_id`` is
        kept. ``slug`` is ``hns_slug`` when non-empty, else the ``slug``
        column, else a slug derived from the club name.
    """
    from contextlib import closing

    out: dict[int, dict] = {}
    # closing() releases the connection; psycopg2's own ``with`` does not.
    with closing(conn()) as c, c, c.cursor() as cu:
        cu.execute(
            """SELECT id, naziv, hns_klub_id, COALESCE(NULLIF(hns_slug,''), slug)
               FROM pgz_sport.klubovi
               WHERE hns_klub_id IS NOT NULL
               ORDER BY id"""
        )
        for kid_db, naziv, hns_id, slug in cu.fetchall():
            if hns_id in out:
                continue  # keep first occurrence
            out[hns_id] = {
                "db_id": kid_db,
                "naziv": naziv,
                "slug": slug or _slugify(naziv),
            }
    return out
|
||||
|
||||
|
||||
def _slugify(name: str) -> str:
|
||||
name = (name or "").lower()
|
||||
repl = {"č": "c", "ć": "c", "ž": "z", "š": "s", "đ": "d"}
|
||||
for k, v in repl.items():
|
||||
name = name.replace(k, v)
|
||||
name = re.sub(r"[^a-z0-9]+", "-", name).strip("-")
|
||||
return name
|
||||
|
||||
|
||||
def upsert_clan(klub_db_id: int, hns_pid: int, ime_prezime: str, slug: str) -> int:
    """Find or create the pgz_sport.clanovi row for an HNS player.

    Lookup order:
      1. a row with ``source='hns_semafor'`` and ``source_id`` equal to the
         player id, which is returned unchanged;
      2. a row whose ``hns_igrac_id`` equals the player id, which is
         re-tagged with the hns_semafor source fields and returned;
      3. otherwise a new row is inserted.

    Args:
        klub_db_id: DB id of the club, used only when inserting.
        hns_pid: HNS player id.
        ime_prezime: Display name; the text before the first space becomes
            ``ime`` and the remainder ``prezime``.
        slug: Player slug from the site URL; may be empty.

    Returns:
        The ``clanovi.id`` of the found or inserted row.
    """
    from contextlib import closing

    ime, prezime = "", ""
    if ime_prezime:
        parts = ime_prezime.strip().split(" ", 1)
        ime = parts[0]
        prezime = parts[1] if len(parts) > 1 else ""
    url = f"{BASE}/igraci/{hns_pid}/{slug or 'x'}/"
    # closing() releases the connection on every return path; psycopg2's own
    # ``with connection`` only commits or rolls back.
    with closing(conn()) as c, c, c.cursor() as cu:
        cu.execute(
            """SELECT id FROM pgz_sport.clanovi
               WHERE source='hns_semafor' AND source_id=%s LIMIT 1""",
            (str(hns_pid),),
        )
        row = cu.fetchone()
        if row:
            return row[0]
        # Try secondary lookup by hns_igrac_id (some rows from earlier runs)
        # NOTE: hns_igrac_id is varchar in DB, so the id is passed as text.
        cu.execute(
            "SELECT id FROM pgz_sport.clanovi WHERE hns_igrac_id=%s LIMIT 1",
            (str(hns_pid),),
        )
        row = cu.fetchone()
        if row:
            cu.execute(
                """UPDATE pgz_sport.clanovi
                   SET source='hns_semafor', source_id=%s, source_url=%s,
                       source_synced_at=now()
                   WHERE id=%s""",
                (str(hns_pid), url, row[0]),
            )
            c.commit()
            return row[0]
        cu.execute(
            """INSERT INTO pgz_sport.clanovi
               (klub_id, ime, prezime, source, source_id, source_url,
                source_synced_at, slug, hns_igrac_id, sport, aktivan,
                verified, created_at, updated_at)
               VALUES (%s,%s,%s,'hns_semafor',%s,%s,now(),%s,%s,'nogomet',
                       true, false, now(), now())
               RETURNING id""",
            (klub_db_id, ime, prezime, str(hns_pid), url, slug or None, str(hns_pid)),
        )
        cid = cu.fetchone()[0]
        c.commit()
        return cid
|
||||
|
||||
|
||||
def upsert_clan_kategorija(
    clan_id: int, klub_db_id: int, kategorija: str, sezona: str,
    source_url: str,
):
    """Insert or refresh one player × club × category × season link.

    On a conflict on (clan_id, kategorija, sezona, klub_id) the existing
    row's ``source_url`` and ``scraped_at`` are updated instead.
    """
    from contextlib import closing

    # closing() releases the connection; psycopg2's own ``with`` does not.
    with closing(conn()) as c, c, c.cursor() as cu:
        cu.execute(
            """INSERT INTO pgz_sport.clan_kategorije
               (clan_id, klub_id, kategorija, sezona, source, source_url,
                scraped_at)
               VALUES (%s,%s,%s,%s,'hns_semafor',%s,now())
               ON CONFLICT (clan_id, kategorija, sezona, klub_id) DO UPDATE
               SET source_url=EXCLUDED.source_url,
                   scraped_at=now()""",
            (clan_id, klub_db_id, kategorija, sezona, source_url),
        )
        c.commit()
|
||||
|
||||
|
||||
# ── Scrape primitives ─────────────────────────────────────────────────────
|
||||
def parse_competition_klubovi(html: str) -> list[tuple[int, str]]:
    """Collect the clubs linked from a competition page.

    Returns:
        ``(hns_klub_id, slug)`` pairs for every distinct club id found in a
        relative ``/klubovi/<id>/<slug>/`` link, in document order; for a
        repeated id the first slug seen is kept.
    """
    soup = BeautifulSoup(html, "html.parser")
    first_slug_by_id: dict[int, str] = {}
    for anchor in soup.find_all("a", href=re.compile(r"^/klubovi/\d+/[a-z0-9-]+/?")):
        hit = re.match(r"^/klubovi/(\d+)/([a-z0-9-]+)/?", anchor["href"])
        if not hit:
            continue
        # setdefault keeps the first slug; dicts preserve insertion order.
        first_slug_by_id.setdefault(int(hit.group(1)), hit.group(2))
    return list(first_slug_by_id.items())
|
||||
|
||||
|
||||
def parse_klub_roster(html: str) -> list[tuple[int, str, str]]:
    """Collect the players linked from a club page.

    Returns:
        ``(hns_pid, slug, name)`` triples for every distinct player id found
        in an ``/igraci/<id>/<slug>/`` link, in document order. ``name`` is
        the link text of the first link seen for that player.
    """
    soup = BeautifulSoup(html, "html.parser")
    link_filter = re.compile(r"^/?(?:https?://[^/]+)?/igraci/\d+/[a-z0-9-]+/?")
    players: dict[int, tuple[str, str]] = {}
    for anchor in soup.find_all("a", href=link_filter):
        hit = re.search(r"/igraci/(\d+)/([a-z0-9-]+)/?", anchor["href"])
        if not hit:
            continue
        pid = int(hit.group(1))
        if pid in players:
            continue
        label = (anchor.get_text(" ", strip=True) or "").strip()
        players[pid] = (hit.group(2), label)
    return [(pid, slug, label) for pid, (slug, label) in players.items()]
|
||||
|
||||
|
||||
def parse_klub_competitions(html: str) -> list[tuple[int, str]]:
    """List the competitions offered in a club page's ``select#cid`` dropdown.

    Returns:
        ``(cid, label)`` for every option whose value contains ``?cid=<n>``,
        in document order; other options are ignored.
    """
    soup = BeautifulSoup(html, "html.parser")
    candidates = (
        (re.search(r"\?cid=(\d+)", option.get("value") or ""), option)
        for option in soup.select('select#cid option')
    )
    return [
        (int(hit.group(1)), option.get_text(" ", strip=True))
        for hit, option in candidates
        if hit
    ]
|
||||
|
||||
|
||||
# ── Main flow ──────────────────────────────────────────────────────────────
|
||||
def harvest():
    """Scrape all catalogued competitions and store rosters of known clubs.

    For each COMP_CATALOG entry the competition page is fetched and its
    participating clubs are matched against load_pgz_klubovi(). For every
    match the club page filtered by the competition cid is fetched, each
    listed player is passed to upsert_clan(), and the player is linked to
    (club, kategorija, sezona) through upsert_clan_kategorija(). Fetch and DB
    errors are counted and logged; they skip the affected item only.

    Competition ids seen in a club's dropdown that are not in COMP_CATALOG
    are collected and logged so they can be added to the catalogue.

    Returns:
        dict of counters plus ``per_kategorija`` (link rows per category) and
        ``per_klub`` (sorted category list per club name).
    """
    pgz = load_pgz_klubovi()
    log.info(
        f"Loaded {len(pgz)} unique PGŽ klubovi with hns_klub_id "
        f"({sum(1 for v in pgz.values() if v['slug'])} have slug)."
    )

    stats = {
        "competitions_processed": 0,
        "competitions_skipped": 0,
        "klubovi_matched": 0,
        "rosters_fetched": 0,
        "players_upserted": 0,
        "kategorije_inserted": 0,
        "errors": 0,
        "per_kategorija": {},
        "per_klub": {},
    }

    # Ids already covered by the catalogue; only ids outside this set count
    # as "extra" discoveries.
    catalog_cids = {int(entry[0]) for entry in COMP_CATALOG}
    discovered_extra: set[tuple[str, str, str]] = set()  # (cid, name, sezona)
    seen_clan_kat: set[tuple[int, int, str, str]] = set()

    for cid, slug, kategorija, sezona in COMP_CATALOG:
        comp_url = f"{BASE}/natjecanja/{cid}/{slug}/"
        try:
            html = fetch(comp_url)
        except Exception as e:
            log.warning(f"comp {cid} fetch failed: {e}")
            stats["competitions_skipped"] += 1
            stats["errors"] += 1
            continue
        klubovi = parse_competition_klubovi(html)
        log.info(
            f"COMP cid={cid} '{slug}' [{kategorija}/{sezona}] -> "
            f"{len(klubovi)} participating klubovi"
        )
        stats["competitions_processed"] += 1
        time.sleep(RATE_S)

        for hns_kid, k_slug in klubovi:
            if hns_kid not in pgz:
                continue
            klub = pgz[hns_kid]
            klub_db_id = klub["db_id"]
            stats["klubovi_matched"] += 1
            stats["per_klub"].setdefault(klub["naziv"], set()).add(kategorija)

            # Fetch klub roster filtered by this competition cid
            slug_use = klub["slug"] or k_slug
            roster_url = f"{BASE}/klubovi/{hns_kid}/{slug_use}/?cid={cid}"
            try:
                rhtml = fetch(roster_url)
            except Exception as e:
                log.warning(f"roster {hns_kid} cid={cid} failed: {e}")
                stats["errors"] += 1
                continue
            stats["rosters_fetched"] += 1
            time.sleep(RATE_S)

            # Discover any other cids this klub plays in
            for ocid, oname in parse_klub_competitions(rhtml):
                if ocid not in catalog_cids:
                    discovered_extra.add((str(ocid), oname, sezona))

            roster = parse_klub_roster(rhtml)
            if not roster:
                log.info(f"  {klub['naziv']} cid={cid}: empty roster")
                continue
            log.info(
                f"  KLUB '{klub['naziv']}' (db={klub_db_id}, hns={hns_kid}) "
                f"cid={cid} -> {len(roster)} igraca [{kategorija}]"
            )

            for hns_pid, p_slug, name in roster:
                try:
                    clan_id = upsert_clan(klub_db_id, hns_pid, name, p_slug)
                except Exception as e:
                    log.error(f"upsert_clan({hns_pid}) fail: {e}")
                    stats["errors"] += 1
                    continue
                stats["players_upserted"] += 1
                # A player can appear in several competitions of the same
                # category and season; write the link row only once per run.
                key = (clan_id, klub_db_id, kategorija, sezona)
                if key in seen_clan_kat:
                    continue
                seen_clan_kat.add(key)
                try:
                    upsert_clan_kategorija(
                        clan_id, klub_db_id, kategorija, sezona, roster_url
                    )
                    stats["kategorije_inserted"] += 1
                    stats["per_kategorija"][kategorija] = (
                        stats["per_kategorija"].get(kategorija, 0) + 1
                    )
                except Exception as e:
                    log.error(
                        f"upsert_clan_kategorija(clan={clan_id} "
                        f"klub={klub_db_id} kat={kategorija}) fail: {e}"
                    )
                    stats["errors"] += 1

    # Summarize discovered extra cids (not yet in catalog) for next run
    if discovered_extra:
        log.info(
            f"Discovered {len(discovered_extra)} extra cids not in catalog "
            f"(top 15 below):"
        )
        # Sorted so the logged sample is the same from run to run.
        for cid, name, sezona in sorted(discovered_extra)[:15]:
            log.info(f"  + cid={cid} '{name}' sezona={sezona}")

    # Convert per_klub sets to lists for JSON serialisation
    stats["per_klub"] = {k: sorted(v) for k, v in stats["per_klub"].items()}
    return stats
|
||||
|
||||
|
||||
def main():
    """Command-line entry: ``discover``, ``run`` (default) or ``klub <db_kid>``.

    * ``discover`` logs the number of clubs with an HNS id and the first ten,
      then returns without scraping.
    * ``run`` performs the full harvest.
    * ``klub <db_kid>`` performs the harvest restricted to one club.

    After a harvest the statistics are logged, written to SUB5_RESULT.md and
    sent as a Telegram summary.

    Raises:
        SystemExit: for an unknown command, or ``klub`` without a valid
            integer id. Previously such input silently ran the full harvest.
    """
    global load_pgz_klubovi  # noqa: PLW0603
    cmd = sys.argv[1] if len(sys.argv) > 1 else "run"
    if cmd not in ("discover", "run", "klub"):
        raise SystemExit(
            f"unknown command {cmd!r}; expected: discover | run | klub <db_kid>"
        )
    target_db = None
    if cmd == "klub":
        try:
            target_db = int(sys.argv[2])
        except (IndexError, ValueError):
            raise SystemExit("usage: hns_youth_categories.py klub <db_kid>") from None

    log.info(f"=== SUB5 hns_youth_categories START cmd={cmd} log={LOG_FILE} ===")
    ensure_schema()

    if cmd == "discover":
        pgz = load_pgz_klubovi()
        log.info(f"PGŽ klubovi with hns_klub_id: {len(pgz)}")
        for hk, v in list(pgz.items())[:10]:
            log.info(f"  hns={hk} db={v['db_id']} slug={v['slug']} naziv={v['naziv']}")
        return

    if cmd == "klub":
        # Narrow-scope debug mode: harvest() looks up load_pgz_klubovi by its
        # global name, so it is temporarily replaced with a filtered loader.
        _orig = load_pgz_klubovi
        pgz = {k: v for k, v in _orig().items() if v["db_id"] == target_db}
        log.info(f"Restricted to db_id={target_db}: {len(pgz)} match")
        load_pgz_klubovi = lambda: pgz  # type: ignore
        try:
            stats = harvest()
        finally:
            load_pgz_klubovi = _orig  # type: ignore
    else:
        stats = harvest()

    log.info("=== SUMMARY ===")
    log.info(json.dumps(stats, ensure_ascii=False, indent=2))

    # Write SUB5_RESULT.md
    md_path = Path("/opt/pgz-sport/cc_tasks/SUB5_RESULT.md")
    md_path.parent.mkdir(parents=True, exist_ok=True)
    md = render_summary_md(stats)
    md_path.write_text(md, encoding="utf-8")
    log.info(f"Result MD written → {md_path}")

    # Telegram
    tg_send(
        "*SUB5 — HNS youth categories*\n"
        f"Klubovi matched: *{stats['klubovi_matched']}*\n"
        f"Rosters fetched: *{stats['rosters_fetched']}*\n"
        f"Players upserted: *{stats['players_upserted']}*\n"
        f"clan_kategorije: *{stats['kategorije_inserted']}*\n"
        f"Errors: {stats['errors']}\n"
        f"Log: `{LOG_FILE.name}`"
    )
|
||||
|
||||
|
||||
def render_summary_md(stats: dict) -> str:
    """Format harvest statistics as a Markdown report.

    The report has a counters list, a per-category table, a per-club table
    (both sorted by key) and a closing line naming the run's log file.
    ``stats["per_klub"]`` values must be iterables of strings.
    """
    per_kat = stats["per_kategorija"]
    per_klub = stats["per_klub"]

    report = [
        "# SUB5 — HNS youth categories result",
        "",
        f"_Generated: {datetime.now().isoformat(timespec='seconds')}_",
        "",
        "## High-level counters",
        "",
        f"- Competitions processed: **{stats['competitions_processed']}**",
        f"- Competitions skipped: {stats['competitions_skipped']}",
        f"- Klubovi (DB) matched in competitions: **{stats['klubovi_matched']}**",
        f"- Rosters fetched: **{stats['rosters_fetched']}**",
        f"- Players upserted into `clanovi`: **{stats['players_upserted']}**",
        f"- M2M rows written into `clan_kategorije`: **{stats['kategorije_inserted']}**",
        f"- Errors: {stats['errors']}",
        "",
        "## Per kategorija",
        "",
        "| Kategorija | M2M zapisa |",
        "|---|---:|",
    ]
    report.extend(f"| {kat} | {per_kat[kat]} |" for kat in sorted(per_kat.keys()))
    report += [
        "",
        "## Per klub — kategorije pronadjene",
        "",
        "| Klub | Kategorije |",
        "|---|---|",
    ]
    report.extend(
        f"| {klub} | {', '.join(per_klub[klub])} |" for klub in sorted(per_klub.keys())
    )
    report += ["", f"_Log: `{LOG_FILE}`_"]
    return "\n".join(report)
|
||||
|
||||
|
||||
# Script entry point. Any exception escaping main() is logged with its
# traceback, reported through tg_send(), and turned into exit status 1.
if __name__ == "__main__":
    try:
        main()
    except Exception as e:
        log.exception(f"FATAL: {e}")
        tg_send(f"*SUB5 FATAL*: {e}")
        sys.exit(1)
|
||||
@@ -1,11 +1,14 @@
|
||||
#!/usr/bin/env python3
|
||||
from dotenv import load_dotenv
|
||||
load_dotenv('/opt/rinet-gpu/.env.master')
|
||||
# auto-added by patch_scrapers_with_dotenv.sh
|
||||
# Fajl: objekti_enrich_address.py | v1.0 | 05.05.2026
|
||||
# Author: Damir Radulić
|
||||
# Svrha: Reverse-geocode lat/lng → adresa za sportski_objekti
|
||||
import os, time, json
|
||||
import psycopg2, requests
|
||||
|
||||
DSN = "host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password=R1net2026!SecureDB#v7"
|
||||
DSN = f"host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password={os.environ['DB_PASSWORD']}"
|
||||
HEADERS = {"User-Agent": "Ri.NET PGŽ Sport (dradulic@outlook.com)"}
|
||||
|
||||
conn = psycopg2.connect(DSN); conn.autocommit = True
|
||||
|
||||
@@ -0,0 +1,49 @@
|
||||
#!/usr/bin/env python3
|
||||
# Fajl: objekti_enrich_address.py | v1.0 | 05.05.2026
|
||||
# Author: Damir Radulić
|
||||
# Svrha: Reverse-geocode lat/lng → adresa za sportski_objekti
|
||||
import os, time, json
|
||||
import psycopg2, requests
|
||||
|
||||
# Reverse-geocode up to 60 active facilities that have coordinates but no
# address, using Nominatim, and store a short address for each.
DSN = f"host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password={os.environ['DB_PASSWORD']}"
HEADERS = {"User-Agent": "Ri.NET PGŽ Sport (dradulic@outlook.com)"}

conn = psycopg2.connect(DSN)
conn.autocommit = True  # each UPDATE is committed immediately

try:
    with conn.cursor() as cur:
        cur.execute("""
            SELECT id, naziv, lat, lng FROM pgz_sport.sportski_objekti
            WHERE aktivan = true AND lat IS NOT NULL AND lng IS NOT NULL
              AND (adresa IS NULL OR adresa = '')
            LIMIT 60
        """)
        rows = cur.fetchall()

    print(f"Total: {len(rows)} objekata bez adrese")

    for i, (oid, naziv, lat, lng) in enumerate(rows):
        try:
            # Nominatim reverse geocoding
            r = requests.get(
                "https://nominatim.openstreetmap.org/reverse",
                params={"lat": lat, "lon": lng, "format": "json", "accept-language": "hr"},
                headers=HEADERS, timeout=10
            )
            if r.status_code != 200:
                print(f"  [FAIL] {naziv}: HTTP {r.status_code}")
                continue
            d = r.json()
            addr = d.get("display_name", "")
            # Short form: street + number + locality, at most four parts;
            # falls back to the truncated display name.
            a = d.get("address", {})
            short = [a[k] for k in ["road", "house_number", "suburb", "city", "town", "village"] if a.get(k)]
            addr_short = ", ".join(short[:4]) or addr[:100]
            if not addr_short:
                # Nothing usable came back; leave the row untouched instead
                # of writing an empty address.
                print(f"  [FAIL] {naziv}: no address in response")
                continue

            with conn.cursor() as cur:
                cur.execute("UPDATE pgz_sport.sportski_objekti SET adresa = %s WHERE id = %s", (addr_short, oid))
            print(f"  [{i+1}/{len(rows)}] {naziv} → {addr_short}")
        except Exception as e:
            print(f"  [FAIL] {naziv}: {e}")
        finally:
            # Nominatim rate-limit 1 req/s. In ``finally`` so failed requests
            # are throttled too.
            time.sleep(1.1)
finally:
    conn.close()

print("DONE")
|
||||
@@ -10,7 +10,7 @@ DSN_HOST="${RINET_DB_HOST:-10.10.0.2}"
|
||||
DSN_PORT="${RINET_DB_PORT:-6432}"
|
||||
DSN_DB="${RINET_DB_NAME:-rinet_v3}"
|
||||
DSN_USER="${RINET_DB_USER:-rinet}"
|
||||
DSN_PASS="${RINET_DB_PASS:-R1net2026!SecureDB#v7}"
|
||||
DSN_PASS="${DB_PASSWORD:?DB_PASSWORD not set}"
|
||||
|
||||
BACKUP_DIR="/opt/pgz-sport/_backups"
|
||||
LOG_DIR="/var/log/pgz-sport-debug"
|
||||
|
||||
@@ -1,4 +1,7 @@
|
||||
#!/usr/bin/env python3
|
||||
from dotenv import load_dotenv
|
||||
load_dotenv('/opt/rinet-gpu/.env.master')
|
||||
# auto-added by patch_scrapers_with_dotenv.sh
|
||||
# sport_harvest_health.py — staleness check za pgz_sport klubove
|
||||
# v1.0 — dradulic@outlook.com / damir@rinet.one — 2026-05-05
|
||||
# Description: Provjerava kad je svaki aktivan klub zadnji put scrape-an
|
||||
@@ -16,7 +19,7 @@ from psycopg2.extras import RealDictCursor
|
||||
|
||||
DSN = os.getenv(
|
||||
"RINET_DSN",
|
||||
"host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password=R1net2026!SecureDB#v7",
|
||||
f"host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password={os.environ['DB_PASSWORD']}",
|
||||
)
|
||||
TG_TOKEN = os.getenv("TG_TOKEN", "8535797835:AAFItT-92jzZ9NWFafLxn0dLa1_n2s-JE5Y")
|
||||
TG_CHAT = os.getenv("TG_CHAT", "7969491558")
|
||||
|
||||
+120
@@ -0,0 +1,120 @@
|
||||
#!/usr/bin/env python3
|
||||
# sport_harvest_health.py — staleness check za pgz_sport klubove
|
||||
# v1.0 — dradulic@outlook.com / damir@rinet.one — 2026-05-05
|
||||
# Description: Provjerava kad je svaki aktivan klub zadnji put scrape-an
|
||||
# (klub_roster.scraped_at ∪ clanovi.last_scraped_at). Klubovi >7 dana
|
||||
# flag-irani su za re-scrape; Telegram alert se šalje ako ima staleova.
|
||||
# Pokreće ga /etc/cron.d/sport-harvesters u 04:30 svaki 2. dan.
|
||||
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
import subprocess
|
||||
from datetime import datetime, timedelta, timezone
|
||||
import psycopg2
|
||||
from psycopg2.extras import RealDictCursor
|
||||
|
||||
DSN = os.getenv(
|
||||
"RINET_DSN",
|
||||
f"host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password={os.environ['DB_PASSWORD']}",
|
||||
)
|
||||
TG_TOKEN = os.getenv("TG_TOKEN", "8535797835:AAFItT-92jzZ9NWFafLxn0dLa1_n2s-JE5Y")
|
||||
TG_CHAT = os.getenv("TG_CHAT", "7969491558")
|
||||
STALE_DAYS = int(os.getenv("SPORT_STALE_DAYS", "7"))
|
||||
LOG_DIR = "/var/log/pgz-sport-debug"
|
||||
|
||||
LOG_PATH = os.path.join(LOG_DIR, f"health_{datetime.now().strftime('%Y%m%d_%H%M')}.log")
|
||||
os.makedirs(LOG_DIR, exist_ok=True)
|
||||
_logfh = open(LOG_PATH, "a")
|
||||
|
||||
|
||||
def log(msg: str) -> None:
    """Echo a timestamped message to stdout and append it to the run's log file."""
    stamp = datetime.now().isoformat(timespec='seconds')
    entry = f"[{stamp}] {msg}"
    print(entry, flush=True)
    # Write and flush immediately so the file is up to date after every call.
    print(entry, file=_logfh, flush=True)
|
||||
|
||||
|
||||
SQL = """
|
||||
WITH last_per_klub AS (
|
||||
SELECT k.id AS klub_id, k.naziv, k.sport,
|
||||
GREATEST(
|
||||
COALESCE((SELECT MAX(scraped_at) FROM pgz_sport.klub_roster WHERE klub_id = k.id), 'epoch'::timestamptz),
|
||||
COALESCE((SELECT MAX(last_scraped_at) FROM pgz_sport.clanovi WHERE klub_id = k.id), 'epoch'::timestamptz)
|
||||
) AS last_scrape
|
||||
FROM pgz_sport.klubovi k
|
||||
WHERE k.aktivan = true
|
||||
)
|
||||
SELECT klub_id, naziv, sport, last_scrape,
|
||||
(last_scrape <= 'epoch'::timestamptz OR last_scrape < now() - interval %s) AS stale
|
||||
FROM last_per_klub;
|
||||
"""
|
||||
|
||||
|
||||
def telegram(text: str) -> None:
    """Send *text* to the configured Telegram chat via ``curl`` (best effort).

    Failures (curl missing, timeout, non-zero exit, HTTP error status) are
    written to the log instead of being raised, so a broken notification
    channel does not abort the health check.
    """
    try:
        result = subprocess.run(
            [
                # --fail makes curl exit non-zero on HTTP >= 400, so a rejected
                # request is no longer reported as "sent".
                "curl", "-sS", "--fail", "-X", "POST",
                f"https://api.telegram.org/bot{TG_TOKEN}/sendMessage",
                "-d", f"chat_id={TG_CHAT}",
                "--data-urlencode", f"text={text}",
            ],
            capture_output=True,
            timeout=10,
            check=False,
        )
    except Exception as e:
        log(f"telegram fail: {e}")
        return

    if result.returncode != 0:
        err = result.stderr.decode("utf-8", errors="replace").strip()
        log(f"telegram fail: curl exit {result.returncode}: {err}")
        return
    log(f"telegram sent ({len(text)} chars)")
|
||||
|
||||
|
||||
def main() -> int:
    """Run the staleness check and return the process exit code.

    Returns:
        0 when no active club is stale, 1 when at least one is, and 2 when
        the database could not be reached or queried.
    """
    log(f"sport_harvest_health START stale_days={STALE_DAYS}")
    try:
        conn = psycopg2.connect(DSN)
    except Exception as e:
        log(f"DB connect FAIL: {e}")
        telegram(f"🚨 sport_harvest_health: DB connect FAIL — {e}")
        return 2

    interval_str = f"{STALE_DAYS} days"
    try:
        with conn.cursor(cursor_factory=RealDictCursor) as cur:
            cur.execute(SQL, (interval_str,))
            rows = cur.fetchall()
    except Exception as e:
        # A failing query must raise an alert too, not just a traceback.
        log(f"DB query FAIL: {e}")
        telegram(f"🚨 sport_harvest_health: DB query FAIL — {e}")
        return 2
    finally:
        # Nothing below needs the database; release the connection on every path.
        conn.close()

    total = len(rows)
    stale_rows = [r for r in rows if r["stale"]]
    by_sport: dict = {}
    for r in stale_rows:
        s = (r["sport"] or "?").lower()
        by_sport[s] = by_sport.get(s, 0) + 1

    # Oldest scrape first; a missing timestamp sorts as the epoch.
    top_stale = sorted(
        stale_rows,
        key=lambda r: (r["last_scrape"] or datetime(1970, 1, 1, tzinfo=timezone.utc)),
    )[:10]

    log(f"klubova_total={total} stale={len(stale_rows)} by_sport={json.dumps(by_sport, ensure_ascii=False)}")
    for r in top_stale:
        log(f" STALE klub_id={r['klub_id']} sport={r['sport']} last={r['last_scrape']} naziv={r['naziv']}")

    if stale_rows:
        sport_summary = ", ".join(f"{k.upper()}:{v}" for k, v in sorted(by_sport.items()))
        top_lines = "\n".join(
            f" • {r['naziv']} ({(r['sport'] or '?')}) — {r['last_scrape']}"
            for r in top_stale[:5]
        )
        msg = (
            f"⚠️ Sport harvest stale: {len(stale_rows)}/{total} klubova "
            f">{STALE_DAYS} dana ({sport_summary})\nTop:\n{top_lines}"
        )
        telegram(msg)

    log("sport_harvest_health DONE")
    return 1 if stale_rows else 0
|
||||
|
||||
|
||||
# Script entry point: the value returned by main() becomes the exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|
||||
@@ -1,3 +1,6 @@
|
||||
from dotenv import load_dotenv
|
||||
load_dotenv('/opt/rinet-gpu/.env.master')
|
||||
# auto-added by patch_scrapers_with_dotenv.sh
|
||||
"""
|
||||
Multi-sport scrape base class.
|
||||
Usage: subclass + implement scrape_klub(), scrape_player()
|
||||
@@ -9,7 +12,7 @@ import psycopg2
|
||||
from psycopg2.extras import RealDictCursor, execute_values
|
||||
|
||||
DSN = os.getenv("RINET_DSN",
|
||||
"host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password=R1net2026!SecureDB#v7")
|
||||
f"host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password={os.environ['DB_PASSWORD']}")
|
||||
|
||||
class SportHarvester:
|
||||
SPORT = None # override
|
||||
|
||||
+149
@@ -0,0 +1,149 @@
|
||||
"""
|
||||
Multi-sport scrape base class.
|
||||
Usage: subclass + implement scrape_klub(), scrape_player()
|
||||
"""
|
||||
import os, time, json, re, sys
|
||||
from datetime import datetime
|
||||
from playwright.sync_api import sync_playwright
|
||||
import psycopg2
|
||||
from psycopg2.extras import RealDictCursor, execute_values
|
||||
|
||||
DSN = os.getenv("RINET_DSN",
|
||||
f"host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password={os.environ['DB_PASSWORD']}")
|
||||
|
||||
class SportHarvester:
    """Base class for per-sport scrapers writing into the ``pgz_sport`` schema.

    Subclasses set ``SPORT`` and ``SOURCE`` and implement ``scrape_klub()``.
    ``run()`` walks the target clubs with a single Playwright page and sends
    a Telegram summary when finished.
    """
    SPORT = None   # override
    SOURCE = None  # override

    # NOTE(review): this bot token is committed in source. It is kept only as a
    # backward-compatible fallback; rotate it and supply TG_BOT_TOKEN / TG_CHAT
    # through the environment instead.
    _TG_DEFAULT_TOKEN = "8535797835:AAFItT-92jzZ9NWFafLxn0dLa1_n2s-JE5Y"
    _TG_DEFAULT_CHAT = "7969491558"

    # Croatian diacritics folded to ASCII by slugify().
    _DIACRITICS = str.maketrans({'č': 'c', 'ć': 'c', 'ž': 'z', 'š': 's', 'đ': 'd'})

    # Stat columns of pgz_sport.player_stats, in INSERT order.
    _STAT_COLS = ('nastupi', 'golovi', 'asistencije', 'bodovi', 'trice', 'skokovi',
                  'blokade', 'servis_asovi', 'zuti', 'crveni', 'minute')

    def __init__(self):
        """Open the autocommit DB connection, reset counters and open the run log."""
        self.conn = psycopg2.connect(DSN)
        self.conn.autocommit = True
        self.stats = {'klubova': 0, 'players': 0, 'stats': 0, 'errors': 0}
        self.log_file = open(f"/var/log/pgz-sport-debug/harvest_{self.SPORT}_{datetime.now().strftime('%Y%m%d_%H%M')}.log", "a")

    def log(self, msg):
        """Print a timestamped, sport-tagged line and append it to the run log."""
        line = f"[{datetime.now().isoformat(timespec='seconds')}] [{self.SPORT}] {msg}"
        print(line, flush=True)
        self.log_file.write(line + "\n")
        self.log_file.flush()

    def slugify(self, s):
        """Return a lower-case ASCII slug of *s* ("" for empty or None input)."""
        if not s:
            return ""
        t = s.lower().strip().translate(self._DIACRITICS)
        t = re.sub(r'[^a-z0-9\s-]', '', t)
        return re.sub(r'\s+', '-', t).strip('-')

    def get_target_klubovi(self, limit=999):
        """Get PGŽ priority klubovi for this sport."""
        with self.conn.cursor(cursor_factory=RealDictCursor) as cur:
            cur.execute("""
                SELECT * FROM pgz_sport.v_pgz_priority_klubovi
                WHERE sport = %s AND (financiran OR u_godisnjaku)
                ORDER BY financiran DESC, u_godisnjaku DESC, id
                LIMIT %s
            """, (self.SPORT, limit))
            return cur.fetchall()

    def upsert_clan(self, klub_id, source_id, ime, prezime, source_url, kategorija=None, sezona=None, extra=None):
        """Upsert a player and return its ``clan_id``.

        An existing row is looked up by (source, source_id). On a hit, only
        empty ime/prezime and NULL klub_id/sport are filled in, source_url and
        timestamps are refreshed, and *extra* is merged into metadata;
        otherwise a new row is inserted. When *kategorija* is given, a
        clan_kategorije row is added unless it already exists.
        """
        ime = re.sub(r'\s+', ' ', (ime or '')).strip()
        prezime = re.sub(r'\s+', ' ', (prezime or '')).strip()
        with self.conn.cursor() as cur:
            # Try find existing by source+source_id
            cur.execute("""
                SELECT id FROM pgz_sport.clanovi
                WHERE source = %s AND source_id = %s
                ORDER BY id LIMIT 1
            """, (self.SOURCE, str(source_id)))
            row = cur.fetchone()
            if row:
                clan_id = row[0]
                cur.execute("""
                    UPDATE pgz_sport.clanovi
                    SET ime = COALESCE(NULLIF(ime,''), %s),
                        prezime = COALESCE(NULLIF(prezime,''), %s),
                        klub_id = COALESCE(klub_id, %s),
                        source_url = %s, last_updated = now(), last_scraped_at = now(),
                        sport = COALESCE(sport, %s),
                        metadata = COALESCE(metadata, '{}'::jsonb) || %s::jsonb
                    WHERE id = %s
                """, (ime, prezime, klub_id, source_url, self.SPORT, json.dumps(extra or {}), clan_id))
            else:
                cur.execute("""
                    INSERT INTO pgz_sport.clanovi
                    (klub_id, ime, prezime, sport, source, source_id, source_url, last_scraped_at, aktivan, metadata)
                    VALUES (%s, %s, %s, %s, %s, %s, %s, now(), true, %s::jsonb)
                    RETURNING id
                """, (klub_id, ime, prezime, self.SPORT, self.SOURCE, str(source_id), source_url, json.dumps(extra or {})))
                clan_id = cur.fetchone()[0]

            # Add kategorija if specified (many-to-many)
            if kategorija:
                cur.execute("""
                    INSERT INTO pgz_sport.clan_kategorije
                    (clan_id, kategorija, sezona, klub_id, source, source_url)
                    VALUES (%s, %s, %s, %s, %s, %s)
                    ON CONFLICT (clan_id, kategorija, sezona, klub_id) DO NOTHING
                """, (clan_id, kategorija, sezona, klub_id, self.SOURCE, source_url))
            return clan_id

    def upsert_stats(self, clan_id, sezona, klub_id, klub_naziv, natjecanje, kategorija, stats_dict, raw=None):
        """Upsert one player_stats row; stats missing from *stats_dict* are stored as NULL."""
        stat_values = tuple(stats_dict.get(col) for col in self._STAT_COLS)
        with self.conn.cursor() as cur:
            cur.execute("""
                INSERT INTO pgz_sport.player_stats
                (clan_id, sport, source, sezona, klub_id, klub_naziv, natjecanje, kategorija,
                 nastupi, golovi, asistencije, bodovi, trice, skokovi, blokade, servis_asovi,
                 zuti, crveni, minute, metadata)
                VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s::jsonb)
                ON CONFLICT (clan_id, sport, sezona, klub_id, natjecanje)
                DO UPDATE SET
                    nastupi = EXCLUDED.nastupi, golovi = EXCLUDED.golovi,
                    asistencije = EXCLUDED.asistencije, bodovi = EXCLUDED.bodovi,
                    trice = EXCLUDED.trice, skokovi = EXCLUDED.skokovi,
                    blokade = EXCLUDED.blokade, servis_asovi = EXCLUDED.servis_asovi,
                    zuti = EXCLUDED.zuti, crveni = EXCLUDED.crveni, minute = EXCLUDED.minute,
                    metadata = EXCLUDED.metadata, scraped_at = now()
            """, (clan_id, self.SPORT, self.SOURCE, sezona, klub_id, klub_naziv, natjecanje, kategorija,
                  *stat_values,
                  json.dumps(raw or {})))

    def _telegram_target(self):
        """Return (token, chat_id), preferring the environment over the built-in fallback."""
        token = os.getenv("TG_BOT_TOKEN") or os.getenv("TG_TOKEN") or self._TG_DEFAULT_TOKEN
        chat = os.getenv("TG_CHAT") or self._TG_DEFAULT_CHAT
        return token, chat

    def _notify_done(self):
        """Send the end-of-run summary to Telegram; failures are logged, never raised."""
        import subprocess  # local import: not among this module's top-level imports
        token, chat = self._telegram_target()
        try:
            subprocess.run(["curl", "-s", "-X", "POST",
                            f"https://api.telegram.org/bot{token}/sendMessage",
                            "-d", f"chat_id={chat}",
                            "--data-urlencode", f"text={self.SPORT.upper()} harvest done: {self.stats}"],
                           timeout=8, capture_output=True)
        except Exception as e:
            self.log(f"telegram fail: {e}")

    def run(self, limit=999):
        """Scrape up to *limit* target clubs, then log and announce the totals.

        A failure in one club is counted in ``stats['errors']`` and logged;
        the remaining clubs are still processed.
        """
        klubovi = self.get_target_klubovi(limit)
        self.log(f"🚀 Starting {self.SPORT} harvest. Target: {len(klubovi)} klubova")

        with sync_playwright() as pw:
            browser = pw.chromium.launch(headless=True, args=["--no-sandbox", "--ignore-certificate-errors"])
            try:
                ctx = browser.new_context(
                    ignore_https_errors=True,
                    user_agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 Chrome/120.0.0.0 Safari/537.36"
                )
                page = ctx.new_page()

                for klub in klubovi:
                    try:
                        self.scrape_klub(page, klub)
                        self.stats['klubova'] += 1
                    except Exception as e:
                        self.stats['errors'] += 1
                        self.log(f" ❌ Klub {klub['id']} {klub['naziv']}: {e}")
            finally:
                # Close the browser even if context or page creation failed.
                browser.close()

        self.log(f"✅ Done. Stats: {self.stats}")
        self._notify_done()

    def scrape_klub(self, page, klub):
        """Scrape a single club; must be provided by the subclass."""
        raise NotImplementedError("subclass must implement")
|
||||
@@ -1,4 +1,7 @@
|
||||
#!/usr/bin/env python3
|
||||
from dotenv import load_dotenv
|
||||
load_dotenv('/opt/rinet-gpu/.env.master')
|
||||
# auto-added by patch_scrapers_with_dotenv.sh
|
||||
# ═══════════════════════════════════════════════════════════════════
|
||||
# Fajl: sub1_hns_fix_and_extract.py | v1.0.0 | 05.05.2026
|
||||
# Lokacija: /opt/pgz-sport/scripts/sub1_hns_fix_and_extract.py
|
||||
@@ -14,7 +17,7 @@ import psycopg2
|
||||
from psycopg2.extras import RealDictCursor
|
||||
|
||||
DSN = os.getenv("RINET_DSN",
|
||||
"host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password=R1net2026!SecureDB#v7")
|
||||
f"host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password={os.environ['DB_PASSWORD']}")
|
||||
TG = os.getenv("TG_BOT_TOKEN", "8535797835:AAFItT-92jzZ9NWFafLxn0dLa1_n2s-JE5Y")
|
||||
TG_CHAT = os.getenv("TG_CHAT", "7969491558")
|
||||
UA = "PGZ-Sport-Bot/1.0 (+https://api.rinet.one/sport/; contact dradulic@outlook.com)"
|
||||
|
||||
@@ -0,0 +1,147 @@
|
||||
#!/usr/bin/env python3
|
||||
# ═══════════════════════════════════════════════════════════════════
|
||||
# Fajl: sub1_hns_fix_and_extract.py | v1.0.0 | 05.05.2026
|
||||
# Lokacija: /opt/pgz-sport/scripts/sub1_hns_fix_and_extract.py
|
||||
# Autor: dradulic@outlook.com / damir@rinet.one
|
||||
# Svrha: SUB1 finalize — (a) rollback false positives,
|
||||
# (b) extract hns_klub_id iz već postojećeg source_url,
|
||||
# (c) verify presence preko HEAD i upsert.
|
||||
# ═══════════════════════════════════════════════════════════════════
|
||||
"""SUB1 fix-up: false-positive rollback + source_url-based extraction."""
|
||||
import os, re, sys, time, json, subprocess, urllib.request
|
||||
from datetime import datetime
|
||||
import psycopg2
|
||||
from psycopg2.extras import RealDictCursor
|
||||
|
||||
DSN = os.getenv("RINET_DSN",
|
||||
f"host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password={os.environ['DB_PASSWORD']}")
|
||||
TG = os.getenv("TG_BOT_TOKEN", "8535797835:AAFItT-92jzZ9NWFafLxn0dLa1_n2s-JE5Y")
|
||||
TG_CHAT = os.getenv("TG_CHAT", "7969491558")
|
||||
UA = "PGZ-Sport-Bot/1.0 (+https://api.rinet.one/sport/; contact dradulic@outlook.com)"
|
||||
LOG_PATH = f"/var/log/pgz-sport-debug/sub1_fix_{datetime.now().strftime('%Y%m%d_%H%M')}.log"
|
||||
LOG = open(LOG_PATH, "a")
|
||||
|
||||
# False positives to ROLLBACK (cleared and marked not_found)
|
||||
FALSE_POS = {
|
||||
2572: "NK Hajduk Tovarnik (matched HNK Hajduk Split — different club)",
|
||||
600: "Ženski NK XXL Kraljevica (matched men's NK Kraljevica — wrong sex)",
|
||||
}
|
||||
|
||||
def log(msg, telegram=False):
    """Print and persist a timestamped line; optionally forward it to Telegram.

    Args:
        msg: Text to record.
        telegram: When true, also POST the first 3500 characters of *msg* to
            the Telegram chat through ``curl``. A failure there is logged
            locally and never raised.
    """
    line = f"[{datetime.now().isoformat(timespec='seconds')}] {msg}"
    print(line, flush=True)
    LOG.write(line + "\n")
    LOG.flush()
    if telegram:
        try:
            subprocess.run(["curl", "-s", "-X", "POST",
                            f"https://api.telegram.org/bot{TG}/sendMessage",
                            "-d", f"chat_id={TG_CHAT}",
                            "--data-urlencode", f"text={msg[:3500]}"],
                           timeout=8, capture_output=True)
        except Exception as e:
            # Report the failure instead of hiding it. The recursive call
            # uses telegram=False, so it cannot loop.
            log(f"TG error: {e}")
|
||||
|
||||
def http_head_or_get(url, timeout=12):
    """Fetch *url* with a GET and report ``(status, title)``.

    ``title`` is the stripped text of the first ``<h1>`` on the page, or None
    when there is none. An HTTP error yields ``(error_code, None)``; any other
    failure yields ``(0, <error text>)``.
    """
    try:
        request = urllib.request.Request(url, headers={"User-Agent": UA})
        with urllib.request.urlopen(request, timeout=timeout) as resp:
            page = resp.read().decode("utf-8", errors="replace")
            status = resp.status
    except urllib.error.HTTPError as err:
        return err.code, None
    except Exception as err:
        return 0, str(err)

    heading = re.search(r'<h1[^>]*>([^<]+)</h1>', page)
    if heading is None:
        return status, None
    return status, heading.group(1).strip()
|
||||
|
||||
URL_RE = re.compile(r'/klubovi/(\d+)/([a-z0-9-]*)/?')
|
||||
|
||||
def main():
    """Run the SUB1 fix-up in three phases and report the totals.

    Phase 1 clears the HNS mapping of every club listed in FALSE_POS.
    Phase 2 extracts hns_klub_id from an existing semafor source_url,
    verifies the club page answers with status 200 and a title, and stores
    the mapping. Phase 3 logs the final counts and sends them to Telegram.
    """
    log(f"=== SUB1 fix start; log={LOG_PATH} ===")
    conn = psycopg2.connect(DSN); conn.autocommit = True
    try:
        cur = conn.cursor(cursor_factory=RealDictCursor)

        # Phase 1: Rollback false positives
        rb = 0
        for kid, reason in FALSE_POS.items():
            cur.execute("""
                UPDATE pgz_sport.klubovi
                SET hns_klub_id = NULL,
                    hns_slug = NULL,
                    scrape_source = 'hns_not_found',
                    last_scraped_at = now()
                WHERE id = %s
            """, (kid,))
            log(f" ROLLBACK [{kid}] — {reason}")
            rb += 1

        # Phase 2: Extract hns_klub_id from existing source_url
        cur.execute("""
            SELECT id, naziv, source_url
            FROM pgz_sport.klubovi
            WHERE sport='nogomet' AND pgz_sufinanciran=true
              AND hns_klub_id IS NULL
              AND source_url ~ 'semafor\\.hns\\.family/klubovi/[0-9]+'
            ORDER BY id
        """)
        rows = cur.fetchall()
        log(f"Source-URL extraction candidates: {len(rows)}")

        extracted = 0; verify_fail = 0
        for r in rows:
            kid, naziv, url = r['id'], r['naziv'], r['source_url']
            m = URL_RE.search(url)
            if not m:
                log(f" SKIP [{kid}] no match in {url}")
                continue
            hns_id = int(m.group(1))
            slug = m.group(2) or None
            # Verify
            verify_url = f"https://semafor.hns.family/klubovi/{hns_id}/"
            status, title = http_head_or_get(verify_url)
            time.sleep(0.8)
            if status != 200 or not title:
                log(f" VERIFY FAIL [{kid}] {naziv} -> {hns_id}: status={status} title={title}")
                verify_fail += 1
                continue
            # If slug missing, try inferring from title
            if not slug and title:
                slug = re.sub(r'[^a-z0-9]+', '-',
                              title.lower()
                              .replace('č','c').replace('ć','c').replace('š','s').replace('ž','z').replace('đ','d')
                              ).strip('-')
            canonical = f"https://semafor.hns.family/klubovi/{hns_id}/{slug}/" if slug else verify_url
            try:
                cur.execute("""
                    UPDATE pgz_sport.klubovi
                    SET hns_klub_id = %s,
                        hns_slug = %s,
                        source_url = %s,
                        scrape_source = 'hns_semafor',
                        last_scraped_at = now()
                    WHERE id = %s
                """, (hns_id, slug, canonical, kid))
                log(f" EXTRACT [{kid}] {naziv} -> HNS {hns_id} '{title}' (slug={slug})")
                extracted += 1
            except Exception as e:
                log(f" UPDATE fail [{kid}]: {e}")

        # Phase 3: Final stats
        cur.execute("""
            SELECT
              COUNT(*) FILTER (WHERE hns_klub_id IS NOT NULL) AS mapped,
              COUNT(*) FILTER (WHERE hns_klub_id IS NULL AND scrape_source='hns_not_found') AS marked_nf,
              COUNT(*) FILTER (WHERE hns_klub_id IS NULL AND (scrape_source IS NULL OR scrape_source != 'hns_not_found')) AS untouched
            FROM pgz_sport.klubovi
            WHERE sport='nogomet' AND pgz_sufinanciran=true
              AND naziv !~* 'Malonogometni|Mini Nogomet|Mali Nogomet|Američkog|Plaž|Pijesku|HMNK|MNK|preteča|povijesn|1903|1906|1908|1917|1919|1926|1929'
        """)
        stats = cur.fetchone()
    finally:
        # Release the connection (and its cursor) on success and on failure.
        conn.close()

    log(f"=== Final state (real football, PGŽ priority): mapped={stats['mapped']}, marked_not_found={stats['marked_nf']}, untouched={stats['untouched']} ===")

    msg = (f"SUB1 fix done: rollback={rb}, source_url-extracted={extracted}, "
           f"verify_fail={verify_fail}. Final mapped={stats['mapped']} / "
           f"not_found={stats['marked_nf']} / untouched={stats['untouched']}")
    log(msg, telegram=True)
|
||||
|
||||
# Script entry point: run the fix-up only when executed directly, not on import.
if __name__ == "__main__":
    main()
|
||||
@@ -1,4 +1,7 @@
|
||||
#!/usr/bin/env python3
|
||||
from dotenv import load_dotenv
|
||||
load_dotenv('/opt/rinet-gpu/.env.master')
|
||||
# auto-added by patch_scrapers_with_dotenv.sh
|
||||
# ═══════════════════════════════════════════════════════════════════
|
||||
# Fajl: sub1_hns_link_harvester.py | v1.0.0 | 05.05.2026
|
||||
# Lokacija: /opt/pgz-sport/scripts/sub1_hns_link_harvester.py
|
||||
@@ -24,7 +27,7 @@ import psycopg2
|
||||
from psycopg2.extras import RealDictCursor
|
||||
|
||||
DSN = os.getenv("RINET_DSN",
|
||||
"host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password=R1net2026!SecureDB#v7")
|
||||
f"host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password={os.environ['DB_PASSWORD']}")
|
||||
TG = os.getenv("TG_BOT_TOKEN", "8535797835:AAFItT-92jzZ9NWFafLxn0dLa1_n2s-JE5Y")
|
||||
TG_CHAT = os.getenv("TG_CHAT", "7969491558")
|
||||
UA = "PGZ-Sport-Bot/1.0 (+https://api.rinet.one/sport/; legitimni interes; analitika sporta PGZ; contact dradulic@outlook.com)"
|
||||
|
||||
@@ -0,0 +1,358 @@
|
||||
#!/usr/bin/env python3
|
||||
# ═══════════════════════════════════════════════════════════════════
|
||||
# Fajl: sub1_hns_link_harvester.py | v1.0.0 | 05.05.2026
|
||||
# Lokacija: /opt/pgz-sport/scripts/sub1_hns_link_harvester.py
|
||||
# Autor: dradulic@outlook.com / damir@rinet.one
|
||||
# Svrha: SUB1 — Pronađi semafor.hns.family link za PGŽ priority
|
||||
# nogometne klubove koji nemaju hns_klub_id.
|
||||
# Strategija:
|
||||
# 1. Enumerate ŽNS Primorsko-goranski (oid=51) competitions across
|
||||
# seasons, plus 4. NL NS Rijeka, 3. HNL Zapad arhive
|
||||
# 2. Za svaki natjecanje GET /natjecanja/{cid}/{cname}/ i izvuci
|
||||
# sve <a href="/klubovi/{id}/{slug}/">{naziv}</a>
|
||||
# 3. Build catalog (hns_id, slug, naziv) — skup unique
|
||||
# 4. Fuzzy match candidate klubovi: normalize, drop NK/HNK/GNK
|
||||
# prefiks, ukloni dijakritike, pa equality + substring + ratio
|
||||
# 5. UPDATE pgz_sport.klubovi za matche; mark not_found za ostalo
|
||||
# ═══════════════════════════════════════════════════════════════════
|
||||
"""SUB1 — HNS link harvester for PGŽ priority football clubs."""
|
||||
import os, re, sys, time, json, traceback, subprocess, difflib
|
||||
from datetime import datetime
|
||||
from urllib.parse import quote
|
||||
import urllib.request, urllib.error
|
||||
import psycopg2
|
||||
from psycopg2.extras import RealDictCursor
|
||||
|
||||
DSN = os.getenv("RINET_DSN",
|
||||
f"host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password={os.environ['DB_PASSWORD']}")
|
||||
TG = os.getenv("TG_BOT_TOKEN", "8535797835:AAFItT-92jzZ9NWFafLxn0dLa1_n2s-JE5Y")
|
||||
TG_CHAT = os.getenv("TG_CHAT", "7969491558")
|
||||
UA = "PGZ-Sport-Bot/1.0 (+https://api.rinet.one/sport/; legitimni interes; analitika sporta PGZ; contact dradulic@outlook.com)"
|
||||
SLEEP = 1.1
|
||||
BASE = "https://semafor.hns.family"
|
||||
|
||||
LOG_PATH = f"/var/log/pgz-sport-debug/sub1_{datetime.now().strftime('%Y%m%d_%H%M')}.log"
|
||||
LOG = open(LOG_PATH, "a")
|
||||
|
||||
def log(msg, telegram=False):
    """Record a timestamped line on stdout and in the log file.

    With ``telegram=True`` the first 3500 characters of *msg* are also sent
    to the Telegram chat through ``curl``; an error while doing so is logged
    locally rather than raised.
    """
    stamp = datetime.now().isoformat(timespec='seconds')
    entry = f"[{stamp}] {msg}"
    print(entry, flush=True)
    LOG.write(entry + "\n")
    LOG.flush()

    if not telegram:
        return

    curl_cmd = [
        "curl", "-s", "-X", "POST",
        f"https://api.telegram.org/bot{TG}/sendMessage",
        "-d", f"chat_id={TG_CHAT}",
        "--data-urlencode", f"text={msg[:3500]}",
    ]
    try:
        subprocess.run(curl_cmd, timeout=8, capture_output=True)
    except Exception as err:
        log(f"TG error: {err}")
|
||||
|
||||
def http_get(url, accept_json=False, timeout=25):
    """GET *url* and return the body decoded as UTF-8 (bad bytes replaced).

    With ``accept_json=True`` the request asks for JSON and identifies itself
    as an XMLHttpRequest; otherwise it asks for HTML. Errors from ``urlopen``
    propagate to the caller.
    """
    if accept_json:
        accept, requested_with = "application/json, */*", "XMLHttpRequest"
    else:
        accept, requested_with = "text/html,*/*", ""
    headers = {
        "User-Agent": UA,
        "Accept": accept,
        "X-Requested-With": requested_with,
    }
    request = urllib.request.Request(url, headers=headers)
    with urllib.request.urlopen(request, timeout=timeout) as resp:
        payload = resp.read()
    return payload.decode("utf-8", errors="replace")
|
||||
|
||||
# ── Normalization for fuzzy match ──
|
||||
# Folds Croatian (and a few accented) letters to lower-case ASCII.
DIACRITIC_MAP = str.maketrans({
    'č':'c','ć':'c','ž':'z','š':'s','đ':'d',
    'Č':'c','Ć':'c','Ž':'z','Š':'s','Đ':'d',
    'á':'a','é':'e','í':'i','ó':'o','ú':'u',
})

# Club-type prefixes stripped by core_name(). The pattern runs on norm()
# output, where diacritics are already folded to ASCII, so every alternative
# containing š/ž must also accept the folded letter or it can never match.
PREFIX_RE = re.compile(
    r'^(hrvatski\s+nogometni\s+klub\.?|'
    r'nogometni\s+klub|nogometna\s+akademija|nogometna\s+[sš]kola|'
    r'sportska\s+akademija|[zž]enski\s+nogometni\s+klub|'
    r'hnk\.?|nk\.?|gnk|[zž]nk)\s+',
    re.IGNORECASE
)

# NOTE(review): 'snježnik' cannot match norm() output either (it becomes
# 'snjeznik'). Left unchanged because enabling it would strip the whole
# name "snjeznik gerovo" — confirm the intent before changing.
SUFFIX_NOISE_RE = re.compile(
    r'\b(veterani|veterana|gornji\s+zamet|grada\s+crikvenice|'
    r'gomirje\s+gomirje|mrkopalj\s+mrkopalj|snježnik\s+gerovo|'
    r'-?\s*\d{4}\s*$)', re.IGNORECASE)


def norm(s):
    """Lower-case *s*, fold diacritics, drop quote characters, collapse whitespace.

    Returns "" for empty or None input.
    """
    if not s:
        return ""
    s = s.lower().strip()
    s = s.translate(DIACRITIC_MAP)
    s = re.sub(r'["\'`]', '', s)
    s = re.sub(r'\s+', ' ', s)
    return s


def core_name(naziv):
    """Return the normalized club name with type prefixes and noise removed.

    The result is a single string, e.g. "HNK Rijeka" -> "rijeka".
    """
    s = norm(naziv)
    # remove prefix(es) (sometimes nested e.g. "Nogometni Klub HNK ...")
    for _ in range(3):
        s2 = PREFIX_RE.sub('', s)
        if s2 == s:
            break
        s = s2
    s = SUFFIX_NOISE_RE.sub('', s).strip()
    s = re.sub(r'\s+', ' ', s).strip()
    return s


def slugify(s):
    """Return the core name of *s* as a hyphen-separated ASCII slug."""
    s = core_name(s)
    s = re.sub(r'[^a-z0-9]+', '-', s).strip('-')
    return s
|
||||
|
||||
# ── Catalog harvest ──
|
||||
def get_pgz_competitions(season):
    """Return the competitions of ŽNS Primorsko-goranski (oid=51) for *season*.

    Calls the getCompetitions handler and parses its JSON reply. A fetch or
    parse failure is logged and an empty list is returned.
    """
    millis = int(time.time() * 1000)  # sent as the "t" query parameter
    link_template = quote(BASE + '/natjecanja/{cid}/{cname}/')
    query = "&".join([
        f"season={quote(season)}",
        "oid=51",
        "teamch=Club",
        "linkType=competitions",
        f"linkConstructor={link_template}",
        "lang=hr",
        f"t={millis}",
    ])
    endpoint = f"{BASE}/handlers/getCompetitions/?{query}"
    try:
        return json.loads(http_get(endpoint, accept_json=True))
    except Exception as err:
        log(f" comps fetch fail {season}: {err}")
        return []
|
||||
|
||||
def get_organizations(season):
    """Return the organizations (regional federations) listed for *season*.

    Calls the getOrganizations handler and parses its JSON reply. A fetch or
    parse failure is logged and an empty list is returned.
    """
    millis = int(time.time() * 1000)  # sent as the "t" query parameter
    query = "&".join([
        f"season={quote(season)}",
        "teamch=Club",
        "lang=hr",
        f"t={millis}",
    ])
    endpoint = f"{BASE}/handlers/getOrganizations/?{query}"
    try:
        return json.loads(http_get(endpoint, accept_json=True))
    except Exception as err:
        log(f" orgs fetch fail {season}: {err}")
        return []
|
||||
|
||||
# Match <a href="/klubovi/{id}/{slug}/">NAME<div...>...</a> — name is anything before first child element
|
||||
CLUB_LINK_RE2 = re.compile(
|
||||
r'<a[^>]+href="(?:https?://semafor\.hns\.family)?/klubovi/(\d+)/([a-z0-9-]*)/?"[^>]*>([^<]{1,150})(?:<|</a>)',
|
||||
re.IGNORECASE
|
||||
)
|
||||
|
||||
def harvest_competition(cid):
    """Fetch the page of competition *cid* and return its club references.

    Returns a list of ``(hns_id, slug, naziv)`` string tuples, one per club
    link found. Links whose text is one character or shorter, or starts with
    "klubov" (navigation), are dropped. A fetch failure is logged and gives [].
    """
    # The site's linkConstructor returned a literal {cid}/{cname} template,
    # so the page is requested with a placeholder name segment.
    page_url = f"{BASE}/natjecanja/{cid}/x/"
    try:
        markup = http_get(page_url)
    except Exception as err:
        log(f" nat fetch fail {cid}: {err}")
        return []

    hits = (
        (m.group(1), m.group(2), m.group(3).strip())
        for m in CLUB_LINK_RE2.finditer(markup)
    )
    return [
        (hns_id, slug, naziv)
        for hns_id, slug, naziv in hits
        if len(naziv) > 1 and not naziv.lower().startswith('klubov')
    ]
|
||||
|
||||
# ── Match logic ──
|
||||
def match_score(candidate_naziv, candidate_grad, hns_naziv):
    """Score 0-100 how well a candidate club matches an HNS club entry.

    Args:
        candidate_naziv: Club name from the database.
        candidate_grad: Club town from the database; may be empty or None.
        hns_naziv: Club name as listed by HNS.

    Returns:
        0 when either core name is empty, 100 when the core names are equal,
        otherwise the difflib similarity ratio as a percentage, plus 5
        (capped at 100) when the town is hinted at in the HNS name, and
        raised to at least 85 when one core name contains the other.
    """
    cand_core = core_name(candidate_naziv)
    hns_core = core_name(hns_naziv)
    if not cand_core or not hns_core:
        return 0
    if cand_core == hns_core:
        return 100
    # ratio
    r = difflib.SequenceMatcher(None, cand_core, hns_core).ratio()
    score = int(r * 100)
    # bonus if grad in HNS naziv (e.g. "NK Borac (Ba)" + grad="Bakar")
    if candidate_grad:
        gnorm = norm(candidate_grad)
        hnorm = norm(hns_naziv)
        # A trailing "(xx)" counts when it is a prefix of the town of any
        # length, so both "(b)" and "(ba)" match "bakar".
        abbrev = re.search(r'\(([a-z]+)\)$', hnorm)
        if gnorm and (gnorm[:3] in hnorm
                      or (abbrev is not None and gnorm.startswith(abbrev.group(1)))):
            score = min(100, score + 5)
    # substring containment bonus (one fully contained)
    if cand_core in hns_core or hns_core in cand_core:
        score = max(score, 85)
    return score
|
||||
|
||||
# ── Main ──
|
||||
def main():
    """Build the HNS club catalog, match candidate clubs and persist the result.

    Steps: (1) load football clubs that have no hns_klub_id, (2) harvest club
    links from the competitions of oid=51 and oid=178180 over the listed
    seasons, (3) score every candidate against the catalog, (4) write matches
    and not-found marks to the database, (5) write SUB1_RESULT.md,
    (6) send a Telegram summary.
    """
    log(f"=== SUB1 HNS link harvester start; log={LOG_PATH} ===")
    conn = psycopg2.connect(DSN); conn.autocommit = True
    try:
        cur = conn.cursor(cursor_factory=RealDictCursor)

        # 1) Get candidate clubs
        cur.execute("""
            SELECT id, naziv, grad
            FROM pgz_sport.klubovi
            WHERE sport='nogomet' AND pgz_sufinanciran=true
              AND hns_klub_id IS NULL
              AND naziv !~* 'Malonogometni|Mini Nogomet|Mali Nogomet|Američkog|Plaž|Pijesku|HMNK|MNK|preteča|povijesn|1903|1906|1908|1917|1919|1926|1929'
            ORDER BY naziv
        """)
        candidates = cur.fetchall()
        log(f"Candidates: {len(candidates)}")

        # 2) Build HNS catalog from PGŽ competitions across recent seasons
        SEASONS = ["2025/2026","2024/2025","2023/2024","2022/2023","2021/2022","2020/2021","2019/2020","2018/2019","2017/2018"]
        catalog = {}  # hns_id -> {slug, naziv, sources:set}

        seen_cids = set()
        for season in SEASONS:
            log(f"-- season {season}")
            comps = get_pgz_competitions(season)
            time.sleep(SLEEP)
            log(f" PGŽ comps: {len(comps)}")
            for c in comps:
                cid = str(c.get('id',''))
                if not cid or cid in seen_cids:
                    continue
                seen_cids.add(cid)
                cname = c.get('value','')
                try:
                    clubs = harvest_competition(cid)
                except Exception as e:
                    log(f" {cid} ({cname}) fetch error: {e}")
                    clubs = []
                for hns_id, slug, naziv in clubs:
                    if hns_id not in catalog:
                        catalog[hns_id] = {'slug': slug, 'naziv': naziv, 'sources': set()}
                    else:
                        # Backfill a slug missing from an earlier sighting.
                        if slug and not catalog[hns_id]['slug']:
                            catalog[hns_id]['slug'] = slug
                    catalog[hns_id]['sources'].add(f"{season}:{cname[:30]}")
                log(f" {cid} '{cname[:40]}' -> {len(clubs)} clubs (catalog={len(catalog)})")
                time.sleep(SLEEP)

        # also sweep top-tier comps to catch HNK Rijeka-tier (though those usually mapped)
        # Also: 3.HNL Zapad / 4.NL NS Rijeka by oid=178180 (NS Rijeka)
        log("-- NS Rijeka oid=178180 sweep")
        for season in SEASONS:
            t = int(time.time()*1000)
            url = (f"{BASE}/handlers/getCompetitions/"
                   f"?season={quote(season)}&oid=178180&teamch=Club"
                   f"&linkType=competitions&linkConstructor={quote(BASE+'/natjecanja/{cid}/{cname}/')}"
                   f"&lang=hr&t={t}")
            try:
                comps = json.loads(http_get(url, accept_json=True))
            except Exception as e:
                log(f" ns_rijeka {season} fail: {e}"); comps = []
            time.sleep(SLEEP)
            for c in comps:
                cid = str(c.get('id',''))
                if not cid or cid in seen_cids:
                    continue
                seen_cids.add(cid)
                cname = c.get('value','')
                try:
                    clubs = harvest_competition(cid)
                except Exception as e:
                    clubs = []
                for hns_id, slug, naziv in clubs:
                    if hns_id not in catalog:
                        catalog[hns_id] = {'slug': slug, 'naziv': naziv, 'sources': set()}
                    catalog[hns_id]['sources'].add(f"NSR:{season}:{cname[:30]}")
                log(f" NSR {cid} '{cname[:40]}' -> {len(clubs)} (cat={len(catalog)})")
                time.sleep(SLEEP)

        log(f"=== Catalog built: {len(catalog)} unique HNS clubs ===")

        # Save catalog snapshot
        snap = {hid: {'slug': v['slug'], 'naziv': v['naziv'], 'sources': sorted(v['sources'])[:5]}
                for hid,v in catalog.items()}
        with open("/opt/pgz-sport/cc_tasks/sub1_hns_catalog.json","w") as f:
            json.dump(snap, f, ensure_ascii=False, indent=2)
        log(f"Catalog snapshot -> /opt/pgz-sport/cc_tasks/sub1_hns_catalog.json")

        # 3) Match candidates
        matched = []  # (db_id, db_naziv, hns_id, slug, hns_naziv, score)
        not_found = []
        ambiguous = []

        for cand in candidates:
            db_id, naziv, grad = cand['id'], cand['naziv'], cand['grad']
            ranked = []
            for hid, v in catalog.items():
                sc = match_score(naziv, grad, v['naziv'])
                if sc >= 70:
                    ranked.append((sc, hid, v['slug'], v['naziv']))
            ranked.sort(reverse=True)
            if not ranked:
                not_found.append((db_id, naziv, grad))
                log(f" NOT FOUND: [{db_id}] {naziv} ({grad})")
                continue
            top = ranked[0]
            # A runner-up within 3 points of a sub-95 winner is too close to call.
            if len(ranked) > 1 and ranked[1][0] >= top[0] - 3 and top[0] < 95:
                ambiguous.append((db_id, naziv, grad, ranked[:3]))
                log(f" AMBIGUOUS: [{db_id}] {naziv} -> top: {top[3]} ({top[0]}), 2nd: {ranked[1][3]} ({ranked[1][0]})")
                # Skip ambiguous, mark not_found for safety
                not_found.append((db_id, naziv, grad))
                continue
            matched.append((db_id, naziv, top[1], top[2], top[3], top[0]))
            log(f" MATCH [{db_id}] {naziv} -> HNS {top[1]} '{top[3]}' (slug={top[2]}, score={top[0]})")

        log(f"=== Match results: {len(matched)} matched, {len(not_found)} not_found, {len(ambiguous)} ambiguous ===")

        # 4) Apply UPDATEs
        upd_ok, upd_fail = 0, 0
        for db_id, naziv, hns_id, slug, hns_naziv, sc in matched:
            try:
                source_url = f"{BASE}/klubovi/{hns_id}/{slug}/" if slug else f"{BASE}/klubovi/{hns_id}/"
                cur.execute("""
                    UPDATE pgz_sport.klubovi
                    SET hns_klub_id = %s,
                        hns_slug = %s,
                        source_url = COALESCE(source_url, %s),
                        scrape_source = 'hns_semafor',
                        last_scraped_at = now()
                    WHERE id = %s
                """, (int(hns_id), slug or None, source_url, db_id))
                upd_ok += 1
            except Exception as e:
                upd_fail += 1
                log(f" UPDATE fail [{db_id}] {naziv}: {e}")

        # Mark not_found
        nf_ok = 0
        for db_id, naziv, grad in not_found:
            try:
                cur.execute("""
                    UPDATE pgz_sport.klubovi
                    SET scrape_source = 'hns_not_found',
                        last_scraped_at = now()
                    WHERE id = %s AND hns_klub_id IS NULL
                """, (db_id,))
                nf_ok += 1
            except Exception as e:
                log(f" not_found mark fail [{db_id}]: {e}")
    finally:
        # All database work is done (or failed); release the connection.
        conn.close()

    # 5) Write result md
    res_path = "/opt/pgz-sport/cc_tasks/SUB1_RESULT.md"
    with open(res_path, "w") as f:
        f.write(f"# SUB1 — HNS Link Harvest Result\n\n")
        f.write(f"Date: {datetime.now().isoformat(timespec='seconds')}\n\n")
        f.write(f"- Candidates processed: **{len(candidates)}**\n")
        f.write(f"- HNS catalog built: **{len(catalog)}** unique clubs from {len(seen_cids)} competitions\n")
        f.write(f"- Matched: **{len(matched)}** (DB updated: {upd_ok}, fail: {upd_fail})\n")
        f.write(f"- Ambiguous (skipped to safety): **{len(ambiguous)}**\n")
        f.write(f"- Not found (marked hns_not_found): **{len(not_found)}** (mark ok: {nf_ok})\n\n")
        f.write(f"## Matched\n\n| db_id | DB naziv | HNS id | HNS naziv | slug | score |\n|---|---|---|---|---|---|\n")
        for db_id, naziv, hns_id, slug, hns_naziv, sc in sorted(matched, key=lambda x: -x[5]):
            f.write(f"| {db_id} | {naziv} | {hns_id} | {hns_naziv} | {slug} | {sc} |\n")
        f.write(f"\n## Ambiguous (manual review)\n\n")
        for db_id, naziv, grad, ranked in ambiguous:
            f.write(f"- **[{db_id}] {naziv}** ({grad})\n")
            for sc, hid, slug, hns_naziv in ranked:
                f.write(f" - {sc}: HNS {hid} '{hns_naziv}' (slug={slug})\n")
        f.write(f"\n## Not Found\n\n")
        for db_id, naziv, grad in not_found:
            f.write(f"- [{db_id}] {naziv} ({grad})\n")
        f.write(f"\n## Log\n\n`{LOG_PATH}`\n")
    log(f"Result -> {res_path}")

    # 6) Telegram notify
    msg = (f"SUB1 HNS done: matched {len(matched)}, not_found {len(not_found)}, "
           f"ambiguous {len(ambiguous)}. Catalog={len(catalog)}. "
           f"DB upd ok={upd_ok}/fail={upd_fail}. See SUB1_RESULT.md")
    log(msg, telegram=True)
|
||||
|
||||
# Script entry point: any uncaught error is logged with its traceback,
# forwarded to Telegram, and turned into exit status 1.
if __name__ == "__main__":
    try:
        main()
    except Exception as exc:
        log(f"FATAL: {exc}\n{traceback.format_exc()}", telegram=True)
        raise SystemExit(1)
|
||||
@@ -1,4 +1,7 @@
|
||||
#!/usr/bin/env python3
|
||||
from dotenv import load_dotenv
|
||||
load_dotenv('/opt/rinet-gpu/.env.master')
|
||||
# auto-added by patch_scrapers_with_dotenv.sh
|
||||
# ═══════════════════════════════════════════════════════════════════
|
||||
# Fajl: sub1_hns_manual_overrides.py | v1.0.0 | 05.05.2026
|
||||
# Lokacija: /opt/pgz-sport/scripts/sub1_hns_manual_overrides.py
|
||||
@@ -12,7 +15,7 @@ from datetime import datetime
|
||||
import psycopg2
|
||||
|
||||
DSN = os.getenv("RINET_DSN",
|
||||
"host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password=R1net2026!SecureDB#v7")
|
||||
f"host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password={os.environ['DB_PASSWORD']}")
|
||||
UA = "PGZ-Sport-Bot/1.0 (+https://api.rinet.one/sport/; contact dradulic@outlook.com)"
|
||||
|
||||
# Manual mappings — verified by visiting semafor.hns.family
|
||||
|
||||
@@ -0,0 +1,67 @@
|
||||
#!/usr/bin/env python3
|
||||
# ═══════════════════════════════════════════════════════════════════
|
||||
# Fajl: sub1_hns_manual_overrides.py | v1.0.0 | 05.05.2026
|
||||
# Lokacija: /opt/pgz-sport/scripts/sub1_hns_manual_overrides.py
|
||||
# Autor: dradulic@outlook.com / damir@rinet.one
|
||||
# Svrha: SUB1 — Manual high-confidence overrides za klubove koje
|
||||
# fuzzy match nije uhvatio (ali postoje u HNS-u).
|
||||
# ═══════════════════════════════════════════════════════════════════
|
||||
"""SUB1 manual overrides — verified mapping for special cases."""
|
||||
import os, re, sys, time, urllib.request
|
||||
from datetime import datetime
|
||||
import psycopg2
|
||||
|
||||
# Database connection string.
# RINET_DSN, when present in the environment, is used verbatim. Only when it
# is absent is a DSN assembled from DB_PASSWORD. The fallback is built lazily
# on purpose: passing it as the default argument of os.getenv() would evaluate
# os.environ['DB_PASSWORD'] unconditionally and raise KeyError even when
# RINET_DSN is set and DB_PASSWORD is not needed.
DSN = os.environ.get("RINET_DSN")
if DSN is None:
    DSN = (
        "host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet "
        f"password={os.environ['DB_PASSWORD']}"
    )

# User-Agent header sent with every request made by http_check().
UA = "PGZ-Sport-Bot/1.0 (+https://api.rinet.one/sport/; contact dradulic@outlook.com)"
|
||||
|
||||
# Hand-checked club mappings; each one was confirmed by visiting
# semafor.hns.family.
# Layout: db_id -> (hns_id, slug, club name on HNS, reason)
OVERRIDES = {
    9: (
        3440,
        "znk-rijeka",
        "ŽNK Rijeka",
        "Ženski NK Rijeka — same modern club",
    ),
    101: (
        3440,
        "znk-rijeka",
        "ŽNK Rijeka",
        "Ženski NK Rijeka 'Jack Pot' — sponsor naming, same club",
    ),
    574: (
        5239,
        "nk-medicinar",
        "NK Medicinar",
        "NK Medicinar Rijeka (osnovan 1996, SRC Belveder)",
    ),
}
|
||||
|
||||
def http_check(url, timeout=10):
    """Fetch *url* and report its status together with the first <h1> text.

    Args:
        url: Address to request; sent with the module's ``UA`` User-Agent.
        timeout: Timeout in seconds passed to ``urlopen``.

    Returns:
        ``(status, title)`` on success, where ``title`` is the stripped text
        of the first ``<h1>`` element or ``None`` when no such element is
        matched. On any exception ``(0, message)`` is returned instead, with
        ``message`` being ``str()`` of the exception.
    """
    try:
        request = urllib.request.Request(url, headers={"User-Agent": UA})
        with urllib.request.urlopen(request, timeout=timeout) as response:
            # Decode leniently so a stray byte cannot abort the check.
            page = response.read().decode("utf-8", errors="replace")
            heading = re.search(r'<h1[^>]*>([^<]+)</h1>', page)
            if heading:
                title = heading.group(1).strip()
            else:
                title = None
            return response.status, title
    except Exception as exc:
        # Best-effort probe: every failure is folded into status 0.
        return 0, str(exc)
|
||||
|
||||
def main():
    """Apply the manually verified HNS overrides to ``pgz_sport.klubovi``.

    For every entry in ``OVERRIDES`` the HNS club page is requested first;
    the database row is updated only when that page answers with status 200.
    Each outcome is printed, followed by a final ``ok``/``fail`` summary.

    An entry counts as failed when the page check does not return 200, when
    the UPDATE raises, or when the UPDATE matches no row (the ``db_id`` does
    not exist), so a silent no-op is never reported as a successful override.
    """
    conn = psycopg2.connect(DSN)
    conn.autocommit = True
    ok = 0
    fail = 0
    try:
        cur = conn.cursor()
        print(f"[{datetime.now().isoformat(timespec='seconds')}] Manual overrides start")
        # The HNS display name stored in OVERRIDES is informational only;
        # the title printed below is the one read from the live page.
        for kid, (hns_id, slug, _naziv, reason) in OVERRIDES.items():
            url = f"https://semafor.hns.family/klubovi/{hns_id}/{slug}/"
            status, title = http_check(url)
            # Pause between requests to stay polite towards the remote site.
            time.sleep(0.8)
            if status != 200:
                print(f" VERIFY FAIL [{kid}] {hns_id}: {status} {title}")
                fail += 1
                continue
            try:
                cur.execute(
                    """
                    UPDATE pgz_sport.klubovi
                    SET hns_klub_id = %s,
                        hns_slug = %s,
                        source_url = %s,
                        scrape_source = 'hns_semafor_manual',
                        last_scraped_at = now()
                    WHERE id = %s
                    """,
                    (hns_id, slug, url, kid),
                )
            except Exception as e:
                print(f" UPDATE fail [{kid}]: {e}")
                fail += 1
                continue
            if cur.rowcount == 0:
                # The statement ran but touched nothing: the id is unknown.
                print(f" UPDATE fail [{kid}]: no row with id={kid}")
                fail += 1
                continue
            print(f" OVERRIDE [{kid}] -> HNS {hns_id} '{title}' ({reason})")
            ok += 1
    finally:
        # Release the connection even if the loop aborts.
        conn.close()
    print(f"Done: ok={ok}, fail={fail}")


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user