feat: /api/v2/analiza/* endpoints - sport analytics backend

This commit is contained in:
Damir Radulic
2026-05-16 00:28:12 +02:00
parent 7ca5d7d94e
commit aca5051418
1355 changed files with 321891 additions and 4128 deletions
+4 -1
View File
@@ -1,4 +1,7 @@
#!/usr/bin/env python3
from dotenv import load_dotenv
load_dotenv('/opt/rinet-gpu/.env.master')
# auto-added by patch_scrapers_with_dotenv.sh
"""
coverage_report.py — Per-entity coverage scoring across pgz_sport schema
@@ -14,7 +17,7 @@ from datetime import datetime, timezone
import psycopg2, psycopg2.extras
PG = dict(host='10.10.0.2', port=6432, dbname='rinet_v3',
user='rinet', password='R1net2026!SecureDB#v7')
user='rinet', password=os.environ["DB_PASSWORD"])
# Per-type coverage definition: list of fields that count toward coverage
DEFS = {
+197
View File
@@ -0,0 +1,197 @@
#!/usr/bin/env python3
"""
coverage_report.py — Per-entity coverage scoring across pgz_sport schema
Fills /opt/pgz-sport/data_quality_report.md with:
- per-type aggregate (n, mean coverage, median, # zero-coverage, # complete)
- distribution histogram
- top 50 entities most needing manual review (lowest coverage AND non-empty name)
- link to detail panel for each (so audit.html-style triage is one click away)
"""
import os, json
from collections import Counter
from datetime import datetime, timezone
import psycopg2, psycopg2.extras
PG = dict(host='10.10.0.2', port=6432, dbname='rinet_v3',
user='rinet', password=os.environ["DB_PASSWORD"])
# Per-type coverage definition: list of fields that count toward coverage
# Coverage specification per entity kind.
#   table      - fully qualified source table
#   name_col   - SQL expression yielding the display name
#   fields     - logical fields that count toward coverage (their number is the
#                denominator of the coverage ratio)
#   panel_path - callable mapping an entity id to its detail-panel URL
DEFS = dict(
    savez=dict(
        table='pgz_sport.savezi',
        name_col='naziv',
        fields=[
            'naziv', 'sport', 'predsjednik', 'tajnik', 'email',
            'telefon', 'web', 'oib', 'adresa', 'godina_osnutka',
        ],
        panel_path=lambda i: f'/?nav=savezi&open={i}',
    ),
    klub=dict(
        table='pgz_sport.klubovi',
        name_col='naziv',
        # Two entries are logical rather than physical columns:
        # "web OR web_stranica" and "sjediste OR adresa" each count once.
        fields=[
            'naziv', 'sport', 'grad', 'oib', 'predsjednik', 'tajnik',
            'email', 'telefon', 'web_or_stranica', 'sjediste_or_adresa',
            'ciljevi', 'opis_djelatnosti',
        ],
        panel_path=lambda i: f'/?nav=klubovi&open={i}',
    ),
    sportas=dict(
        table='pgz_sport.clanovi',
        name_col="ime||' '||prezime",
        fields=[
            'ime', 'prezime', 'sport', 'klub_id', 'datum_rodenja',
            'slika_url', 'oib', 'profile_url', 'biografija', 'hns_igrac_id',
        ],
        panel_path=lambda i: f'/?nav=sportasi&open={i}',
    ),
    objekt=dict(
        table='pgz_sport.sportski_objekti',
        name_col='naziv',
        fields=[
            'naziv', 'tip', 'grad', 'adresa', 'lat', 'lng',
            'upravitelj', 'kapacitet', 'sportovi', 'izgradeno',
        ],
        panel_path=lambda i: f'/?nav=objekti&open={i}',
    ),
    manifestacija=dict(
        table='pgz_sport.manifestacije',
        name_col='naziv',
        fields=[
            'naziv', 'mjesto', 'organizator', 'razina',
            'broj_ucesnika', 'godina_od', 'source_url',
        ],
        panel_path=lambda i: f'/?nav=manifestacije&open={i}',
    ),
)
def fetch_rows(cur, kind: str):
    """Score every row of one entity table by how many coverage fields are filled.

    Args:
        cur: an open DB cursor whose rows are addressable by column name
            (``main`` passes a ``RealDictCursor``).
        kind: a key of ``DEFS``; an unknown kind raises ``KeyError``.

    Returns:
        A list of dicts with keys ``kind``, ``id``, ``naziv`` (never ``None``),
        ``filled`` (number of populated fields) and ``total`` (number of fields
        declared for the kind in ``DEFS``).
    """
    spec = DEFS[kind]
    table = spec['table']
    total = len(spec['fields'])
    if kind == 'klub':
        # NULLIF makes an empty string behave like NULL inside COALESCE, so a
        # club with web='' but a populated web_stranica still scores the field
        # (a plain COALESCE(web, web_stranica) would return '' and score 0).
        sql = f"""
            SELECT id, naziv,
              (CASE WHEN naziv IS NOT NULL AND naziv<>'' THEN 1 ELSE 0 END +
               CASE WHEN sport IS NOT NULL AND sport<>'' THEN 1 ELSE 0 END +
               CASE WHEN grad IS NOT NULL AND grad<>'' THEN 1 ELSE 0 END +
               CASE WHEN oib IS NOT NULL AND oib<>'' THEN 1 ELSE 0 END +
               CASE WHEN predsjednik IS NOT NULL AND predsjednik<>'' THEN 1 ELSE 0 END +
               CASE WHEN tajnik IS NOT NULL AND tajnik<>'' THEN 1 ELSE 0 END +
               CASE WHEN email IS NOT NULL AND email<>'' THEN 1 ELSE 0 END +
               CASE WHEN telefon IS NOT NULL AND telefon<>'' THEN 1 ELSE 0 END +
               CASE WHEN COALESCE(NULLIF(web,''), NULLIF(web_stranica,'')) IS NOT NULL THEN 1 ELSE 0 END +
               CASE WHEN COALESCE(NULLIF(sjediste,''), NULLIF(adresa,'')) IS NOT NULL THEN 1 ELSE 0 END +
               CASE WHEN ciljevi IS NOT NULL AND ciljevi<>'' THEN 1 ELSE 0 END +
               CASE WHEN opis_djelatnosti IS NOT NULL AND opis_djelatnosti<>'' THEN 1 ELSE 0 END
              ) AS filled
            FROM {table}
        """
    elif kind == 'sportas':
        sql = f"""
            SELECT id, (COALESCE(ime,'')||' '||COALESCE(prezime,'')) AS naziv,
              (CASE WHEN ime IS NOT NULL AND ime<>'' THEN 1 ELSE 0 END +
               CASE WHEN prezime IS NOT NULL AND prezime<>'' THEN 1 ELSE 0 END +
               CASE WHEN sport IS NOT NULL AND sport<>'' THEN 1 ELSE 0 END +
               CASE WHEN klub_id IS NOT NULL THEN 1 ELSE 0 END +
               CASE WHEN datum_rodenja IS NOT NULL THEN 1 ELSE 0 END +
               CASE WHEN slika_url IS NOT NULL AND slika_url<>'' THEN 1 ELSE 0 END +
               CASE WHEN oib IS NOT NULL AND oib<>'' THEN 1 ELSE 0 END +
               CASE WHEN profile_url IS NOT NULL AND profile_url<>'' THEN 1 ELSE 0 END +
               CASE WHEN biografija IS NOT NULL AND biografija<>'' THEN 1 ELSE 0 END +
               CASE WHEN hns_igrac_id IS NOT NULL AND hns_igrac_id<>'' THEN 1 ELSE 0 END
              ) AS filled
            FROM {table}
        """
    elif kind == 'objekt':
        sql = f"""
            SELECT id, naziv,
              (CASE WHEN naziv IS NOT NULL AND naziv<>'' THEN 1 ELSE 0 END +
               CASE WHEN tip IS NOT NULL AND tip<>'' THEN 1 ELSE 0 END +
               CASE WHEN grad IS NOT NULL AND grad<>'' THEN 1 ELSE 0 END +
               CASE WHEN adresa IS NOT NULL AND adresa<>'' THEN 1 ELSE 0 END +
               CASE WHEN lat IS NOT NULL THEN 1 ELSE 0 END +
               CASE WHEN lng IS NOT NULL THEN 1 ELSE 0 END +
               CASE WHEN upravitelj IS NOT NULL AND upravitelj<>'' THEN 1 ELSE 0 END +
               CASE WHEN kapacitet IS NOT NULL THEN 1 ELSE 0 END +
               CASE WHEN sportovi IS NOT NULL AND array_length(sportovi,1)>0 THEN 1 ELSE 0 END +
               CASE WHEN izgradeno IS NOT NULL THEN 1 ELSE 0 END
              ) AS filled
            FROM {table}
        """
    elif kind == 'manifestacija':
        sql = f"""
            SELECT id, naziv,
              (CASE WHEN naziv IS NOT NULL AND naziv<>'' THEN 1 ELSE 0 END +
               CASE WHEN mjesto IS NOT NULL AND mjesto<>'' THEN 1 ELSE 0 END +
               CASE WHEN organizator IS NOT NULL AND organizator<>'' THEN 1 ELSE 0 END +
               CASE WHEN razina IS NOT NULL AND razina<>'' THEN 1 ELSE 0 END +
               CASE WHEN broj_ucesnika IS NOT NULL AND broj_ucesnika::text<>'' THEN 1 ELSE 0 END +
               CASE WHEN godina_od IS NOT NULL THEN 1 ELSE 0 END +
               CASE WHEN source_url IS NOT NULL AND source_url<>'' THEN 1 ELSE 0 END
              ) AS filled
            FROM {table}
        """
    else:  # savez
        sql = f"""
            SELECT id, naziv,
              (CASE WHEN naziv IS NOT NULL AND naziv<>'' THEN 1 ELSE 0 END +
               CASE WHEN sport IS NOT NULL AND sport<>'' THEN 1 ELSE 0 END +
               CASE WHEN predsjednik IS NOT NULL AND predsjednik<>'' THEN 1 ELSE 0 END +
               CASE WHEN tajnik IS NOT NULL AND tajnik<>'' THEN 1 ELSE 0 END +
               CASE WHEN email IS NOT NULL AND email<>'' THEN 1 ELSE 0 END +
               CASE WHEN telefon IS NOT NULL AND telefon<>'' THEN 1 ELSE 0 END +
               CASE WHEN web IS NOT NULL AND web<>'' THEN 1 ELSE 0 END +
               CASE WHEN oib IS NOT NULL AND oib<>'' THEN 1 ELSE 0 END +
               CASE WHEN adresa IS NOT NULL AND adresa<>'' THEN 1 ELSE 0 END +
               CASE WHEN godina_osnutka IS NOT NULL THEN 1 ELSE 0 END
              ) AS filled
            FROM {table}
        """
    cur.execute(sql)
    return [
        {'kind': kind, 'id': r['id'], 'naziv': r['naziv'] or '',
         'filled': int(r['filled']), 'total': total}
        for r in cur.fetchall()
    ]
def stats(rows):
    """Aggregate coverage percentages for a list of scored rows.

    Args:
        rows: dicts carrying ``filled`` and ``total`` (as built by ``fetch_rows``).

    Returns:
        ``{}`` for empty input, otherwise a dict with:
        ``n`` (row count), ``mean`` and ``median`` (percent, 1 decimal),
        ``zero`` (rows at 0 %), ``complete`` (rows at >= 99 %) and
        ``distribution`` (10-point bins keyed by their lower bound; 100 % is
        folded into the 90 bin).
    """
    if not rows:
        return {}
    pcts = sorted(r['filled'] / r['total'] * 100 for r in rows)
    n = len(pcts)
    mean = sum(pcts) / n
    # True median: for an even count average the two middle values instead of
    # reporting the upper-middle element.
    mid = n // 2
    median = pcts[mid] if n % 2 else (pcts[mid - 1] + pcts[mid]) / 2
    zero = sum(1 for p in pcts if p == 0)
    complete = sum(1 for p in pcts if p >= 99.0)
    # min(..., 90) keeps fully covered rows in the top bin rather than opening
    # a separate "100" bucket.
    bins = Counter(min(int(p // 10) * 10, 90) for p in pcts)
    return {'n': n, 'mean': round(mean, 1), 'median': round(median, 1),
            'zero': zero, 'complete': complete,
            'distribution': dict(sorted(bins.items()))}
def main():
    """Collect coverage rows for every kind in DEFS and dump a JSON summary.

    Prints a one-line summary per kind and a five-row sample of the worst
    entities, then writes the full result to /tmp/coverage_data.json.
    NOTE(review): the module docstring mentions data_quality_report.md, but
    this function only produces the JSON file.
    """
    all_rows = []
    by_kind = {}
    conn = psycopg2.connect(**PG)
    try:
        with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
            for kind in DEFS:
                rows = fetch_rows(cur, kind)
                by_kind[kind] = rows
                all_rows.extend(rows)
                # stats() returns {} for an empty table; compute it once and
                # avoid indexing into an empty dict.
                summary = stats(rows)
                if summary:
                    print(f'{kind:14s} n={len(rows):5d} mean={summary["mean"]:.1f}% complete={summary["complete"]}')
                else:
                    print(f'{kind:14s} n={len(rows):5d} (no rows)')
    finally:
        # Release the connection even when a query fails.
        conn.close()

    # Top 50 worst — exclude rows with empty naziv (those are flagged separately)
    valid = [r for r in all_rows if (r['naziv'] or '').strip()]
    # Sort by coverage ASC, then by total DESC
    worst = sorted(valid, key=lambda r: (r['filled'] / r['total'], -r['total']))[:50]
    out = {
        'generated_at': datetime.now(timezone.utc).isoformat(),
        'totals': {k: len(v) for k, v in by_kind.items()},
        'total_entities': len(all_rows),
        'per_type_stats': {k: stats(v) for k, v in by_kind.items()},
        'top50_review': worst,
    }
    print(f'\nTotal entities: {len(all_rows)}')
    print('Top 50 worst — sample:')
    for r in worst[:5]:
        pct = r['filled'] / r['total'] * 100
        print(f"  {r['kind']:14s} id={r['id']:7d} {r['naziv'][:50]:50s} {r['filled']}/{r['total']} ({pct:.0f}%)")
    # Explicit UTF-8: ensure_ascii=False emits Croatian characters verbatim,
    # which would fail under an ASCII default locale.
    with open('/tmp/coverage_data.json', 'w', encoding='utf-8') as fh:
        json.dump(out, fh, ensure_ascii=False, default=str)
if __name__ == '__main__':
main()
+5 -1
View File
@@ -1,4 +1,8 @@
#!/usr/bin/env python3
from dotenv import load_dotenv
load_dotenv('/opt/rinet-gpu/.env.master')
# auto-added by patch_scrapers_with_dotenv.sh
import os
# ═══════════════════════════════════════════════════════════════════
# Fajl: godisnjak_extract.py
# Verzija: 1.0.0
@@ -25,7 +29,7 @@ logging.basicConfig(
)
log = logging.getLogger("extract")
DSN = "host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password=R1net2026!SecureDB#v7"
DSN = f"host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password={os.environ['DB_PASSWORD']}"
VLLM_URL = "http://localhost:8001/v1/chat/completions"
VLLM_MODEL = "Qwen/Qwen2.5-7B-Instruct-AWQ"
DATA_DIR = "/opt/pgz-sport/_data/godisnjaci"
@@ -0,0 +1,210 @@
#!/usr/bin/env python3
import os
# ═══════════════════════════════════════════════════════════════════
# Fajl: godisnjak_extract.py
# Verzija: 1.0.0
# Datum: 03.05.2026
# Autor: Damir Radulić <dradulic@outlook.com>
# Lokacija: /opt/pgz-sport/scripts/godisnjak_extract.py
# Svrha: LLM ekstrakcija osoba/uloga iz godisnjaka PGZ (Phase 2)
# Zavisi od: httpx, psycopg2, rapidfuzz
# Utječe na: pgz_sport.clanovi
# ═══════════════════════════════════════════════════════════════════
import asyncio, glob, json, logging, re, sys, time
import httpx, psycopg2
from psycopg2.extras import execute_batch
from rapidfuzz import fuzz
logging.basicConfig(
level=logging.INFO,
format="[%(asctime)s] %(message)s",
datefmt="%H:%M:%S",
handlers=[
logging.FileHandler("/opt/pgz-sport/logs/godisnjak_extract.log"),
logging.StreamHandler(),
],
)
log = logging.getLogger("extract")
DSN = f"host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password={os.environ['DB_PASSWORD']}"
VLLM_URL = "http://localhost:8001/v1/chat/completions"
VLLM_MODEL = "Qwen/Qwen2.5-7B-Instruct-AWQ"
DATA_DIR = "/opt/pgz-sport/_data/godisnjaci"
MAX_WORKERS = 4
CHUNK_SIZE = 1400
EXTRACT_PROMPT = """Ekstrahiraj iz teksta SVA imena osoba i njihove uloge.
Vrati ISKLJUCIVO valid JSON (bez markdown, bez objasnjenja):
{"osobe": [{"ime":"X","prezime":"Y","klub":"Z","uloga":"igrac","godina_rodenja":1990}]}
Dozvoljene uloge: predsjednik, dopredsjednik, tajnik, blagajnik, clan_uprave,
igrac, sportas, glavni_trener, trener, pomocni_trener, kondicioni_trener,
selektor, izbornik, team_manager, voditelj, lijecnik, fizioterapeut,
kineziolog, maser, sudac, volonter
Pravila:
1. Samo HRVATSKA osobe s punim imenom i prezimenom
2. Ako klub nije eksplicitno naveden -> klub=""
3. NE izmisljaj - samo jasno navedena imena u tekstu
4. Godina rodenja samo ako eksplicitno u tekstu, inace izostavi"""
def chunk_text(text, size=CHUNK_SIZE):
    """Group blank-line separated paragraphs into chunks of roughly *size* chars.

    Paragraphs are appended to the current chunk while the combined length
    stays within *size*; a paragraph that would overflow starts a new chunk
    (a single over-long paragraph is kept whole). Chunks of 80 characters or
    fewer are discarded.
    """
    pieces = []
    buffer = ""
    for para in re.split(r'\n\n+', text):
        if len(buffer) + len(para) <= size:
            buffer = buffer + "\n\n" + para
            continue
        if buffer:
            pieces.append(buffer.strip())
        buffer = para
    if buffer:
        pieces.append(buffer.strip())
    return list(filter(lambda piece: len(piece) > 80, pieces))
# Preload klub cache
def load_klub_cache(conn):
    """Return up to 2000 ``(id, naziv)`` rows of clubs that are active or unflagged.

    The cursor is closed before returning so repeated calls do not leak
    cursors on a long-lived connection.
    """
    cur = conn.cursor()
    try:
        cur.execute("SELECT id, naziv FROM pgz_sport.klubovi WHERE aktivan=true OR aktivan IS NULL LIMIT 2000")
        return cur.fetchall()
    finally:
        cur.close()
def fuzzy_klub(naziv, cache):
    """Return the id of the cached club whose name best matches *naziv*.

    Args:
        naziv: club name as extracted from text; may be empty or None.
        cache: iterable of ``(id, name)`` pairs.

    Returns:
        The id of the best match when its ``token_set_ratio`` score exceeds 72,
        otherwise ``None`` (also for names shorter than 3 characters).
    """
    if not naziv or len(naziv) < 3:
        return None
    needle = naziv.lower()  # invariant across the loop
    best_id, best_score = None, 0
    for kid, kname in cache:
        if not kname:
            # A NULL/empty club name cannot match and would crash on .lower().
            continue
        score = fuzz.token_set_ratio(needle, kname.lower())
        if score > best_score:
            best_score, best_id = score, kid
    return best_id if best_score > 72 else None
async def extract_persons(chunk_text_str, semaphore):
    """Ask the LLM endpoint to extract persons from one text chunk.

    Args:
        chunk_text_str: chunk of source text; only the first 5000 characters
            are sent to the model.
        semaphore: ``asyncio.Semaphore`` limiting concurrent requests.

    Returns:
        The parsed JSON object from the model, or ``{"osobe": []}`` when the
        request fails, the reply is not valid JSON, or it does not have the
        expected ``{"osobe": [...]}`` shape. Never raises.
    """
    async with semaphore:
        try:
            async with httpx.AsyncClient(timeout=90.0) as c:
                r = await c.post(VLLM_URL, json={
                    "model": VLLM_MODEL,
                    "messages": [
                        {"role": "system", "content": EXTRACT_PROMPT},
                        {"role": "user", "content": chunk_text_str[:5000]},
                    ],
                    "temperature": 0.05,
                    "max_tokens": 2500,
                    "response_format": {"type": "json_object"},
                })
            # Surface HTTP errors explicitly instead of failing later on a
            # missing "choices" key.
            r.raise_for_status()
            content = r.json()["choices"][0]["message"]["content"]
            parsed = json.loads(content)
        except Exception as e:
            # Best-effort per chunk, but logged at WARNING: the module logger
            # runs at INFO, so DEBUG messages would never be seen.
            log.warning(f"Extract fail: {e}")
            return {"osobe": []}
    if not isinstance(parsed, dict) or not isinstance(parsed.get("osobe", []), list):
        log.warning("Extract fail: unexpected response shape")
        return {"osobe": []}
    return parsed
# Roles accepted from the LLM output; anything else is coerced to "igrac".
VALID_ULOGE = {
    # management
    "predsjednik", "dopredsjednik", "tajnik", "blagajnik", "clan_uprave",
    # athletes
    "igrac", "sportas",
    # coaching staff
    "glavni_trener", "trener", "pomocni_trener", "kondicioni_trener",
    "selektor", "izbornik",
    # team support
    "team_manager", "voditelj",
    # medical
    "lijecnik", "fizioterapeut", "kineziolog", "maser",
    # other
    "sudac", "volonter",
}
async def main():
    """Extract persons from every yearbook layout file and insert them into clanovi.

    Steps: snapshot ``pgz_sport.clanovi`` into a backup table, load the club
    cache, run LLM extraction per chunk of each ``godisnjak_<year>_layout.txt``
    file, insert the sanitised rows, then optionally send a Telegram summary.

    The Telegram bot token is read from ``TG_BOT_TOKEN`` (chat id from
    ``TG_CHAT``); when the token is not set the notification is skipped.
    SECURITY: a token was previously hardcoded here — it should be rotated.
    """
    def _clean(value):
        """Return a stripped string, or '' for anything that is not a string."""
        return value.strip() if isinstance(value, str) else ""

    total_inserted = 0
    conn = psycopg2.connect(DSN)
    conn.autocommit = True
    try:
        cur = conn.cursor()

        # Backup
        # NOTE(review): the snapshot is appended on every run, so the backup
        # table accumulates one full copy per run — confirm this is intended.
        cur.execute("""CREATE TABLE IF NOT EXISTS pgz_sport.clanovi_pre_godisnjak_backup
            AS SELECT * FROM pgz_sport.clanovi WHERE 1=0""")
        cur.execute("""INSERT INTO pgz_sport.clanovi_pre_godisnjak_backup
            SELECT * FROM pgz_sport.clanovi""")
        log.info("Backup created")

        klub_cache = load_klub_cache(conn)
        log.info(f"Klub cache: {len(klub_cache)} klubova")

        files = sorted(glob.glob(f"{DATA_DIR}/godisnjak_*_layout.txt"))
        log.info(f"Files: {len(files)}")

        semaphore = asyncio.Semaphore(MAX_WORKERS)

        for f in files:
            m = re.search(r'godisnjak_(\d{4})_layout', f)
            if not m:
                # The glob also matches names without a 4-digit year; the year
                # is stored as an int, so such files cannot be processed.
                log.warning(f"Skipping {f}: no 4-digit year in file name")
                continue
            year = int(m.group(1))

            with open(f, encoding="utf-8") as fp:
                text = fp.read()

            chunks = chunk_text(text)
            log.info(f"Year {year}: {len(chunks)} chunks")

            tasks = [extract_persons(c, semaphore) for c in chunks]
            results = await asyncio.gather(*tasks)

            year_ins = 0
            rows = []
            for res in results:
                for o in res.get("osobe", []):
                    if not isinstance(o, dict):
                        # The model occasionally returns bare strings.
                        continue
                    ime = _clean(o.get("ime"))
                    prezime = _clean(o.get("prezime"))
                    if len(ime) < 2 or len(prezime) < 2:
                        continue
                    # Basic sanity — no numbers, no too-long names
                    if re.search(r'\d', ime + prezime) or len(ime + prezime) > 60:
                        continue
                    uloga = (_clean(o.get("uloga")) or "igrac").lower()
                    if uloga not in VALID_ULOGE:
                        uloga = "igrac"
                    klub_naziv = _clean(o.get("klub"))
                    klub_id = fuzzy_klub(klub_naziv, klub_cache)
                    rows.append((
                        ime, prezime, uloga, klub_id,
                        "godisnjak",
                        json.dumps({"year": year, "klub_naziv": klub_naziv}),
                        "sportas",
                    ))

            # Row-by-row insert; ON CONFLICT DO NOTHING skips rows that violate
            # whatever unique constraint the table defines.
            for row in rows:
                try:
                    cur.execute("""
                        INSERT INTO pgz_sport.clanovi
                        (ime, prezime, uloga, klub_id, savez_izvor, metadata, kategorija)
                        VALUES (%s,%s,%s,%s,%s,%s,%s)
                        ON CONFLICT DO NOTHING
                    """, row)
                    if cur.rowcount:
                        year_ins += 1
                except Exception as e:
                    # Keep going on a bad row, but make the failure visible
                    # (DEBUG is below the configured log level).
                    log.warning(f"Insert skip: {e}")

            total_inserted += year_ins
            log.info(f"  {year}: {year_ins} osoba inserted (running total: {total_inserted})")

        cur.execute("SELECT count(*) FROM pgz_sport.clanovi WHERE savez_izvor='godisnjak'")
        final = cur.fetchone()[0]
    finally:
        conn.close()

    log.info(f"""
=== EXTRACT DONE ===
Inserted this run: {total_inserted}
Total godisnjak u DB: {final}
""")

    tg_token = os.environ.get("TG_BOT_TOKEN")
    if not tg_token:
        log.info("TG_BOT_TOKEN not set, skipping Telegram notification")
        return
    import requests as rq
    try:
        rq.post(
            f"https://api.telegram.org/bot{tg_token}/sendMessage",
            data={"chat_id": os.environ.get("TG_CHAT", "7969491558"),
                  "text": f"✅ Godisnjak LLM extract DONE: {total_inserted} novih osoba, {final} total"},
            timeout=10,
        )
    except Exception as e:
        # The extraction already succeeded; a failed notification must not
        # turn the run into an error.
        log.warning(f"Telegram notify failed: {e}")
if __name__ == "__main__":
asyncio.run(main())
+4 -1
View File
@@ -1,4 +1,7 @@
#!/usr/bin/env python3
from dotenv import load_dotenv
load_dotenv('/opt/rinet-gpu/.env.master')
# auto-added by patch_scrapers_with_dotenv.sh
"""
Godišnjak pipeline:
1. Find godišnjak PDFs in DB (table dokumenti) + scrape sport-pgz.hr
@@ -13,7 +16,7 @@ import psycopg2
from psycopg2.extras import RealDictCursor
import pypdf
DSN = "host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password=R1net2026!SecureDB#v7"
DSN = f"host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password={os.environ['DB_PASSWORD']}"
UPLOAD_DIR = Path('/opt/pgz-sport/uploads/godisnjaci')
UPLOAD_DIR.mkdir(parents=True, exist_ok=True)
+112
View File
@@ -0,0 +1,112 @@
#!/usr/bin/env python3
"""
Godišnjak pipeline:
1. Find godišnjak PDFs in DB (table dokumenti) + scrape sport-pgz.hr
2. Download PDF lokalno
3. Parse text iz PDF
4. UPDATE pgz_sport.dokumenti SET sadrzaj = parsed_text
5. Save chunks za RAG
"""
import os, sys, hashlib, requests, re
from pathlib import Path
import psycopg2
from psycopg2.extras import RealDictCursor
import pypdf
DSN = f"host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password={os.environ['DB_PASSWORD']}"
UPLOAD_DIR = Path('/opt/pgz-sport/uploads/godisnjaci')
UPLOAD_DIR.mkdir(parents=True, exist_ok=True)
def download_pdf(url, dest):
    """Download *url* to *dest* unless a plausible copy is already on disk.

    Args:
        url: source URL of the PDF.
        dest: ``pathlib.Path`` of the target file.

    Returns:
        *dest* when the file exists (larger than 1000 bytes) or was downloaded,
        otherwise ``None``. Failures are printed, never raised.
    """
    if dest.exists() and dest.stat().st_size > 1000:
        return dest
    try:
        r = requests.get(url, headers={"User-Agent":"Mozilla/5.0"}, timeout=60, allow_redirects=True)
        if r.status_code == 200 and len(r.content) > 1000:
            dest.write_bytes(r.content)
            return dest
        # Report rejected responses instead of skipping the document silently.
        print(f"  ERR download {url}: HTTP {r.status_code}, {len(r.content)} bytes")
    except Exception as e:
        # Best-effort: one unreachable document must not stop the batch.
        print(f"  ERR download {url}: {e}")
    return None
def parse_pdf(path):
    """Extract the text of every page of the PDF at *path*.

    Returns:
        ``(text, page_count)``; each page's text is followed by a newline.
        Pages whose extraction fails are skipped. ``('', 0)`` is returned when
        the file cannot be opened or read at all.
    """
    try:
        r = pypdf.PdfReader(str(path))
        parts = []
        for p in r.pages:
            try:
                parts.append((p.extract_text() or '') + '\n')
            except Exception:
                # Skip an unreadable page, but do not swallow
                # KeyboardInterrupt/SystemExit as a bare except would.
                continue
        return ''.join(parts), len(r.pages)
    except Exception as e:
        print(f"  ERR parse {path}: {e}")
        return '', 0
def main():
    """Download and parse yearbook PDFs referenced in pgz_sport.dokumenti.

    For every matching document without substantial ``sadrzaj`` (more than 500
    characters counts as already parsed): download the PDF, extract its text,
    store up to 500K characters in ``sadrzaj`` and replace the document's rows
    in ``dokument_chunks`` with overlapping 1000-character chunks (step 800,
    at most 200 chunks).
    """
    conn = psycopg2.connect(DSN); conn.autocommit = True
    try:
        # 1. Get all yearbooks that have a pdf_url, or a url ending in .pdf
        with conn.cursor(cursor_factory=RealDictCursor) as cur:
            # The final AND clause is parenthesised to make the original
            # operator precedence explicit (AND binds tighter than OR).
            cur.execute("""
                SELECT id, title, url, pdf_url, vrsta, sadrzaj
                FROM pgz_sport.dokumenti
                WHERE (
                    title ILIKE '%sportski godi%njak%' OR title ILIKE '%godi%njak HNS%'
                    OR title ILIKE 'ZSPGZ%' OR title ILIKE '%godi%njak ZSPGZ%'
                    OR url ILIKE '%godisnjak%.pdf' OR pdf_url ILIKE '%godisnjak%.pdf'
                    OR (title ILIKE '%godi%njak%' AND (url ILIKE '%pdf' OR pdf_url IS NOT NULL))
                )
                ORDER BY id DESC
            """)
            targets = cur.fetchall()

        print(f"Targets: {len(targets)}")

        parsed_count = 0
        for t in targets:
            url = t['pdf_url'] or t['url']
            if not url or not url.lower().endswith('.pdf'):
                continue

            if t['sadrzaj'] and len(t['sadrzaj']) > 500:
                print(f"  ⏭ ID {t['id']}: already parsed ({len(t['sadrzaj'])} chars)")
                continue

            # title can be NULL for rows matched through url/pdf_url only
            print(f"  📄 ID {t['id']}: {(t['title'] or '')[:60]}")
            fname = re.sub(r'[^\w.-]', '_', os.path.basename(url))[:100]
            dest = UPLOAD_DIR / f"{t['id']}_{fname}"

            downloaded = download_pdf(url, dest)
            if not downloaded:
                continue

            text, pages = parse_pdf(downloaded)
            # psycopg2 refuses string parameters containing NUL characters,
            # which PDF text extraction can produce.
            text = text.replace('\x00', '')
            if not text:
                continue

            print(f"    ✓ {pages} pages, {len(text)} chars")

            # UPDATE sadrzaj
            with conn.cursor() as cur:
                cur.execute("""
                    UPDATE pgz_sport.dokumenti
                    SET sadrzaj = %s, last_updated = now()
                    WHERE id = %s
                """, (text[:500000], t['id']))  # cap 500K

            # Chunks for RAG (1000 chars each, overlapping by 200)
            with conn.cursor() as cur:
                cur.execute("DELETE FROM pgz_sport.dokument_chunks WHERE dokument_id = %s", (t['id'],))
                chunks = [text[i:i+1000] for i in range(0, len(text), 800)]
                for i, ch in enumerate(chunks[:200]):
                    if len(ch.strip()) > 50:
                        cur.execute("""
                            INSERT INTO pgz_sport.dokument_chunks (dokument_id, chunk_idx, content)
                            VALUES (%s, %s, %s)
                        """, (t['id'], i, ch))

            parsed_count += 1
    finally:
        conn.close()

    print(f"\nDone. Parsed: {parsed_count}")
if __name__ == '__main__':
main()
+4 -1
View File
@@ -1,4 +1,7 @@
#!/usr/bin/env python3
from dotenv import load_dotenv
load_dotenv('/opt/rinet-gpu/.env.master')
# auto-added by patch_scrapers_with_dotenv.sh
"""
Godišnjak pipeline v2 — popravljen za pravu shemu.
"""
@@ -8,7 +11,7 @@ import psycopg2
from psycopg2.extras import RealDictCursor
import pypdf
DSN = "host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password=R1net2026!SecureDB#v7"
DSN = f"host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password={os.environ['DB_PASSWORD']}"
UPLOAD_DIR = Path('/opt/pgz-sport/uploads/godisnjaci')
UPLOAD_DIR.mkdir(parents=True, exist_ok=True)
+150
View File
@@ -0,0 +1,150 @@
#!/usr/bin/env python3
"""
Godišnjak pipeline v2 — popravljen za pravu shemu.
"""
import os, sys, hashlib, requests, re
from pathlib import Path
import psycopg2
from psycopg2.extras import RealDictCursor
import pypdf
DSN = f"host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password={os.environ['DB_PASSWORD']}"
UPLOAD_DIR = Path('/opt/pgz-sport/uploads/godisnjaci')
UPLOAD_DIR.mkdir(parents=True, exist_ok=True)
# 18 yearbooks covering 2006–2024, discovered by scraping (no 2016 edition is listed)
GODISNJAK_URLS = [
("https://sport-pgz.hr/upload/dokumenti/publikacije/godisnjak-2006-print.pdf", 2006),
("https://sport-pgz.hr/upload/dokumenti/publikacije/Sportski-godisnjak-2007.pdf", 2007),
("https://sport-pgz.hr/upload/dokumenti/publikacije/Sportski-godisnjak-2008.pdf", 2008),
("https://sport-pgz.hr/upload/dokumenti/publikacije/Sportski-godisnjak-2009.pdf", 2009),
("https://sport-pgz.hr/upload/dokumenti/publikacije/Sportski-godisnjak-2010.pdf", 2010),
("https://sport-pgz.hr/upload/dokumenti/publikacije/sportski-godisnjak-2011_2025-10-07-125709_xcqb.pdf", 2011),
("https://sport-pgz.hr/upload/dokumenti/publikacije/Sportski-godisnjak-2012.pdf", 2012),
("https://sport-pgz.hr/upload/dokumenti/publikacije/Sportski-godisnjak-2013.pdf", 2013),
("https://sport-pgz.hr/upload/dokumenti/publikacije/Sportski-godisnjak-2014.pdf", 2014),
("https://sport-pgz.hr/upload/dokumenti/publikacije/Sportski-godisnjak-2015.pdf", 2015),
("https://sport-pgz.hr/upload/dokumenti/publikacije/sportski-godisnjak-2017.pdf", 2017),
("https://sport-pgz.hr/upload/dokumenti/publikacije/Sportski-godisnjak-2018.pdf", 2018),
("https://sport-pgz.hr/upload/dokumenti/publikacije/Sportski-godisnjak-2019.pdf", 2019),
("https://sport-pgz.hr/upload/dokumenti/publikacije/Sportski-godisnjak-2020.pdf", 2020),
("https://sport-pgz.hr/upload/dokumenti/publikacije/Sportski-godisnjak-2021.pdf", 2021),
("https://sport-pgz.hr/upload/dokumenti/sportski-godisnjaci/ZSPGZ-Sportski-godisnjak-2022.pdf", 2022),
("https://sport-pgz.hr/upload/dokumenti/publikacije/ZSPGZ-Sportski-godisnjak-2023-2024-06-07_web.pdf", 2023),
("https://sport-pgz.hr/upload/dokumenti/publikacije/ZSPGZ-Sportski-godisnjak-2024-2025-09-18_web.pdf", 2024),
]
def download_pdf(url, dest):
    """Fetch *url* into *dest*, reusing an existing file larger than 1000 bytes.

    Returns *dest* on a cache hit or successful download, ``None`` otherwise.
    Errors and rejected responses are printed, never raised.
    """
    already_there = dest.exists() and dest.stat().st_size > 1000
    if already_there:
        print(f"  [cached] {dest.name} ({dest.stat().st_size//1024}KB)")
        return dest

    try:
        response = requests.get(
            url,
            headers={"User-Agent":"Mozilla/5.0"},
            timeout=120,
            allow_redirects=True,
        )
        usable = response.status_code == 200 and len(response.content) > 1000
        if not usable:
            print(f"  ✗ HTTP {response.status_code}")
        else:
            dest.write_bytes(response.content)
            return dest
    except Exception as err:
        print(f"  ERR: {err}")
    return None
def parse_pdf(path):
    """Extract the text of every page of the PDF at *path*.

    Returns:
        ``(text, page_count)``; each page's text is followed by a newline.
        Pages whose extraction fails are skipped. ``('', 0)`` is returned when
        the file cannot be opened or read at all.
    """
    try:
        r = pypdf.PdfReader(str(path))
        parts = []
        for p in r.pages:
            try:
                parts.append((p.extract_text() or '') + '\n')
            except Exception:
                # Skip an unreadable page without catching
                # KeyboardInterrupt/SystemExit as a bare except would.
                continue
        return ''.join(parts), len(r.pages)
    except Exception as e:
        # Say why parsing failed; the caller only reports that it did.
        print(f"  ERR parse {path}: {e}")
        return '', 0
def main():
    """Download, parse and store every yearbook listed in GODISNJAK_URLS.

    Each PDF is downloaded (or reused from disk), hashed with SHA1 and parsed.
    The document row in ``pgz_sport.dokumenti`` is updated when a row with the
    same SHA1 exists, otherwise inserted. Its chunks in ``dokument_chunks``
    are then replaced with overlapping 1000-character chunks (step 800, at
    most 300), using whichever index/content column names the table exposes.
    """
    # One value for both INSERT and UPDATE: previously a fresh insert stored an
    # abbreviated name that the next run's UPDATE replaced with the full one.
    organizacija = 'Zajednica sportova Primorsko-goranske županije'

    conn = psycopg2.connect(DSN); conn.autocommit = True
    try:
        # Get chunks table column names
        with conn.cursor() as cur:
            cur.execute("""
                SELECT column_name FROM information_schema.columns
                WHERE table_schema='pgz_sport' AND table_name='dokument_chunks'
            """)
            cols = [r[0] for r in cur.fetchall()]
        print(f"dokument_chunks columns: {cols}")

        # Resolve the chunk-table column names once; idx_col is None when the
        # table has no recognised index column, in which case chunks are skipped.
        idx_col = next((c for c in ('idx', 'chunk_idx', 'i', 'page') if c in cols), None)
        content_col = next((c for c in ('content', 'chunk', 'text') if c in cols), 'sadrzaj')

        parsed_count = 0
        for url, godina in GODISNJAK_URLS:
            title = f"Sportski godišnjak ZSPGZ {godina}"
            fname = f"sportski-godisnjak-{godina}.pdf"
            dest = UPLOAD_DIR / fname

            print(f"\n📄 {title}")
            downloaded = download_pdf(url, dest)
            if not downloaded:
                continue

            # Compute SHA1 (used as the document identity, not for security)
            sha1 = hashlib.sha1(downloaded.read_bytes()).hexdigest()

            text, pages = parse_pdf(downloaded)
            # psycopg2 refuses string parameters containing NUL characters,
            # which PDF text extraction can produce.
            text = text.replace('\x00', '')
            if not text:
                print(f"  ✗ parse failed")
                continue
            print(f"  ✓ {pages} pages, {len(text)} chars")

            # UPSERT into dokumenti
            with conn.cursor(cursor_factory=RealDictCursor) as cur:
                # Check if exists by sha1
                cur.execute("SELECT id FROM pgz_sport.dokumenti WHERE sha1 = %s LIMIT 1", (sha1,))
                existing = cur.fetchone()

                if existing:
                    doc_id = existing['id']
                    cur.execute("""
                        UPDATE pgz_sport.dokumenti
                        SET title = %s, godina = %s, vrsta = 'sportski-godisnjak',
                            url = %s, pdf_url = %s, sadrzaj = %s,
                            sluzbeni_glasnik = 'ZSPGZ', razina = 'zupanijska',
                            organizacija = %s,
                            izvor_url = %s, last_updated = now()
                        WHERE id = %s
                    """, (title, godina, url, url, text[:500000], organizacija,
                          'https://sport-pgz.hr', doc_id))
                    print(f"  ↻ UPDATE id={doc_id}")
                else:
                    cur.execute("""
                        INSERT INTO pgz_sport.dokumenti
                        (title, fname, vrsta, godina, url, pdf_url, sha1, sadrzaj,
                         sluzbeni_glasnik, razina, organizacija, izvor_url)
                        VALUES (%s, %s, 'sportski-godisnjak', %s, %s, %s, %s, %s,
                                'ZSPGZ', 'zupanijska', %s, 'https://sport-pgz.hr')
                        RETURNING id
                    """, (title, fname, godina, url, url, sha1, text[:500000], organizacija))
                    doc_id = cur.fetchone()['id']
                    print(f"  + INSERT id={doc_id}")

            # Chunks (column names resolved from the live schema above)
            if idx_col is not None:
                with conn.cursor() as cur:
                    cur.execute("DELETE FROM pgz_sport.dokument_chunks WHERE dokument_id = %s", (doc_id,))
                    chunks = [text[i:i+1000] for i in range(0, len(text), 800)]
                    for i, ch in enumerate(chunks[:300]):
                        if len(ch.strip()) > 50:
                            try:
                                # Column names come from information_schema,
                                # not from user input.
                                cur.execute(f"""
                                    INSERT INTO pgz_sport.dokument_chunks (dokument_id, {idx_col}, {content_col})
                                    VALUES (%s, %s, %s)
                                """, (doc_id, i, ch))
                            except Exception as e:
                                print(f"  ERR chunk {i}: {e}"); break

            parsed_count += 1
    finally:
        conn.close()

    print(f"\n✅ Done. Parsed: {parsed_count}/{len(GODISNJAK_URLS)}")
if __name__ == '__main__':
main()
+4 -1
View File
@@ -1,4 +1,7 @@
#!/usr/bin/env python3
from dotenv import load_dotenv
load_dotenv('/opt/rinet-gpu/.env.master')
# auto-added by patch_scrapers_with_dotenv.sh
# Fajl: hns_avatar_harvester.py | v1.0 | 05.05.2026
# Author: Damir Radulić
# Lokacija: /opt/pgz-sport/scripts/hns_avatar_harvester.py
@@ -8,7 +11,7 @@ import psycopg2
import requests
from bs4 import BeautifulSoup
DSN = os.environ.get("RINET_DSN", "host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password=R1net2026!SecureDB#v7")
DSN = os.environ.get("RINET_DSN", f"host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password={os.environ['DB_PASSWORD']}")
HEADERS = {"User-Agent": "Mozilla/5.0 (Ri.NET PGŽ Sport Bot)"}
conn = psycopg2.connect(DSN); conn.autocommit = True
@@ -0,0 +1,64 @@
#!/usr/bin/env python3
# Fajl: hns_avatar_harvester.py | v1.0 | 05.05.2026
# Author: Damir Radulić
# Lokacija: /opt/pgz-sport/scripts/hns_avatar_harvester.py
# Svrha: Dohvati avatar URL za svakog igrača sa HNS profila
import os, time, re, json, sys
import psycopg2
import requests
from bs4 import BeautifulSoup
# RINET_DSN wins when set. The fallback is built lazily (`or`) so that
# DB_PASSWORD is only required when RINET_DSN is absent; passing it as the
# default argument of .get() evaluated it eagerly and raised KeyError even
# when a full DSN had been provided.
DSN = os.environ.get("RINET_DSN") or (
    f"host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password={os.environ['DB_PASSWORD']}"
)
HEADERS = {"User-Agent": "Mozilla/5.0 (Ri.NET PGŽ Sport Bot)"}
conn = psycopg2.connect(DSN); conn.autocommit = True
def _absolutize_src(src):
    """Turn an <img src> value into an absolute URL."""
    if src.startswith("//"):
        # Protocol-relative URL: it already names its host, only the scheme is missing.
        return "https:" + src
    if src.startswith("/"):
        # NOTE(review): root-relative paths are resolved against hns.family,
        # not the semafor.hns.family host the page is fetched from — confirm.
        return "https://hns.family" + src
    return src


def fetch_avatar(hns_id, slug=""):
    """Return the avatar image URL from a player's HNS profile page.

    Args:
        hns_id: HNS player id used in the profile URL.
        slug: optional name slug appended to the URL.

    Returns:
        The image URL, or ``None`` when the page is not HTTP 200, no image is
        found, or the request/parse fails (the failure is printed).
    """
    url = f"https://semafor.hns.family/igraci/{hns_id}/"
    if slug: url += f"{slug}/"
    try:
        r = requests.get(url, headers=HEADERS, timeout=15)
        if r.status_code != 200: return None
        soup = BeautifulSoup(r.text, "html.parser")
        # Player photo selectors
        for sel in [".playerPhoto img", ".player-photo img", ".playerHeader img", "img.player_photo"]:
            img = soup.select_one(sel)
            if img and img.get("src"):
                return _absolutize_src(img["src"])
        # Generic: first img inside header
        hdr = soup.select_one(".playerHeader, .player-header, .basic_info")
        if hdr:
            img = hdr.find("img")
            if img and img.get("src"):
                return _absolutize_src(img["src"])
        return None
    except Exception as e:
        # Best-effort per player, but leave a trace of what went wrong.
        print(f"  ERR avatar {url}: {e}")
        return None
# Script body: fetch an avatar for up to 200 players that have an HNS id but
# no stored photo URL, and save each hit.
# NOTE(review): players for whom no avatar is found keep foto_url NULL and are
# eligible again on the next run; with LIMIT 200 and no ORDER BY the same rows
# may be retried repeatedly — confirm whether a "checked" marker is wanted.
with conn.cursor() as cur:
    cur.execute("""
        SELECT id, hns_igrac_id, ime, prezime
        FROM pgz_sport.clanovi
        WHERE hns_igrac_id IS NOT NULL AND foto_url IS NULL
        LIMIT 200
    """)
    rows = cur.fetchall()

print(f"Total: {len(rows)} igrača za avatar fetch")
hits = 0
for i, (cid, hns_id, ime, prezime) in enumerate(rows):
    # `or ''` keeps a NULL name from being rendered as the literal "none".
    slug = f"{ime or ''}-{prezime or ''}".lower().replace("ć","c").replace("č","c").replace("š","s").replace("ž","z").replace("đ","d").replace(" ","-")
    slug = re.sub(r"[^a-z0-9-]", "", slug).strip("-")
    avatar = fetch_avatar(hns_id, slug)
    if avatar:
        with conn.cursor() as cur:
            cur.execute("UPDATE pgz_sport.clanovi SET foto_url=%s WHERE id=%s", (avatar, cid))
        hits += 1
    if i % 10 == 0:
        # avatar is None on a miss; slicing it directly raised TypeError.
        print(f"  [{i+1}/{len(rows)}] {ime} {prezime} → {(avatar or '—')[:80]}")
    time.sleep(0.5)  # stay polite towards the remote server

conn.close()
print(f"\nDONE: {hits}/{len(rows)} avatar URL-ova spremljen")
+24 -1
View File
@@ -1,4 +1,7 @@
#!/usr/bin/env python3
from dotenv import load_dotenv
load_dotenv('/opt/rinet-gpu/.env.master')
# auto-added by patch_scrapers_with_dotenv.sh
"""
HNS Master Harvester — Playwright-based scrape semafor.hns.family
─────────────────────────────────────────────────────────────────
@@ -18,7 +21,7 @@ from psycopg2.extras import RealDictCursor, execute_values
from playwright.sync_api import sync_playwright
DSN = os.getenv("RINET_DSN",
"host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password=R1net2026!SecureDB#v7")
f"host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password={os.environ['DB_PASSWORD']}")
TG = os.getenv("TG_BOT_TOKEN", "8535797835:AAFItT-92jzZ9NWFafLxn0dLa1_n2s-JE5Y")
TG_CHAT = os.getenv("TG_CHAT", "7969491558")
LOG = open(f"/var/log/pgz-sport-debug/hns_harvester_{datetime.now().strftime('%Y%m%d_%H%M')}.log", "a")
@@ -210,12 +213,30 @@ def upsert_clan(conn, klub_id, player_data):
def upsert_seasons(conn, hns_id, clan_id, seasons):
if not seasons: return 0
rows = []
skipped = 0
# Reject rows where klub_naziv is obviously a misparsed HTML stat-block
# (the parser at scrape_player_full() can produce these when a <table> row
# has fewer cells than the header — dict(zip(...)) silently drops, leaving
# whole-block dumps or bare numbers in the value).
_BAD_PREFIXES = ('STATISTIKA', 'NASTUPI', 'ZAPOČEO', 'ZAMJENA',
'POGOTCI', 'ŽUTI', 'CRVENI', 'UKUPNO', 'SUPERSPORT')
def _looks_like_garbage(klub_text):
if not klub_text: return True
t = klub_text.strip()
if not t: return True # whitespace only
if re.match(r'^\d+$', t): return True # bare number (year, jersey #)
if t.count('\n') >= 2: return True # multi-line label dump
u = t.upper()
return any(u.startswith(p) for p in _BAD_PREFIXES)
for s in seasons:
sezona = s.get('sezona', '')
if not sezona: continue
# Try extract klub iz row
klub = next((v for k,v in s.items() if 'lub' in k.lower()), '')
natjecanje = next((v for k,v in s.items() if 'atjec' in k.lower() or 'liga' in k.lower()), '')
if _looks_like_garbage(klub):
skipped += 1
continue
def num(key):
for k in s.keys():
if key in k.lower():
@@ -227,6 +248,8 @@ def upsert_seasons(conn, hns_id, clan_id, seasons):
num('nastup'), num('start'), num('zamj'),
num('gol'), num('asist'), num('žut'), num('crv'), num('minut')
))
if skipped:
print(f'[hns_master_harvester] upsert_seasons: skipped {skipped} garbage rows for hns_id={hns_id}')
with conn.cursor() as cur:
execute_values(cur, """
INSERT INTO pgz_sport.hns_player_seasons
+369
View File
@@ -0,0 +1,369 @@
#!/usr/bin/env python3
"""
HNS Master Harvester — Playwright-based scrape semafor.hns.family
─────────────────────────────────────────────────────────────────
1. List PGŽ financirani nogometni klubovi
2. For each klub: scrape klub roster
3. For each player: scrape full profile (sezone, utakmice)
4. UPSERT u pgz_sport: hns_klub_roster, hns_player_seasons, hns_player_matches, clanovi
5. Audit log
Usage: python3 hns_master_harvester.py [--limit N] [--klub-id X] [--players-only]
"""
import os, sys, time, json, re, argparse, traceback
from datetime import datetime
from urllib.parse import urlparse
import psycopg2
from psycopg2.extras import RealDictCursor, execute_values
from playwright.sync_api import sync_playwright
# RINET_DSN wins when set; the fallback is built lazily so DB_PASSWORD is only
# required when RINET_DSN is absent (as a getenv default it was evaluated
# eagerly and raised KeyError even when a full DSN had been provided).
DSN = os.getenv("RINET_DSN") or (
    f"host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password={os.environ['DB_PASSWORD']}"
)
# SECURITY: the bot token must come from the environment only. A real token was
# previously committed here as the default and should be rotated. With no token
# configured, Telegram notifications cannot be delivered.
TG = os.getenv("TG_BOT_TOKEN", "")
TG_CHAT = os.getenv("TG_CHAT", "7969491558")
LOG = open(f"/var/log/pgz-sport-debug/hns_harvester_{datetime.now().strftime('%Y%m%d_%H%M')}.log", "a")
def log(msg, telegram=False):
    """Write a timestamped line to stdout and the run log; optionally notify Telegram.

    Args:
        msg: message text.
        telegram: when true and a bot token is configured, also send the first
            2000 characters of *msg* to the configured chat. Delivery is
            best-effort: failures are ignored so logging never breaks a run.
    """
    line = f"[{datetime.now().isoformat(timespec='seconds')}] {msg}"
    print(line, flush=True)
    LOG.write(line + "\n"); LOG.flush()
    if telegram and TG:
        try:
            # Sent in-process rather than via `curl`, so the bot token does not
            # appear on a command line visible in the process list.
            from urllib import parse, request
            payload = parse.urlencode({"chat_id": TG_CHAT, "text": msg[:2000]}).encode()
            with request.urlopen(f"https://api.telegram.org/bot{TG}/sendMessage",
                                 data=payload, timeout=8) as resp:
                resp.read()
        except Exception:
            # Deliberately swallowed (network errors, timeouts); unlike a bare
            # except this lets KeyboardInterrupt/SystemExit through.
            pass
def db_conn():
    """Open a psycopg2 connection to ``DSN`` with autocommit switched on."""
    connection = psycopg2.connect(DSN)
    connection.autocommit = True
    return connection
# HNS slug: "Franko Andrijašević" -> "franko-andrijasevic"
def slugify_hns(text):
    """Return the lowercase, hyphen-separated ASCII slug for a display name.

    Croatian diacritics (č, ć, ž, š, đ) are folded to their base letters,
    every other character outside ``a-z0-9``, whitespace and ``-`` is
    dropped, and whitespace runs become a single hyphen. Falsy input gives
    an empty string.
    """
    if not text:
        return ""
    # Input is lowercased first, so only the lowercase letters need mapping.
    folded = text.lower().strip().translate(str.maketrans('čćžšđ', 'cczsd'))
    kept = re.sub(r'[^a-z0-9\s-]', '', folded)
    return re.sub(r'\s+', '-', kept).strip('-')
def scrape_player(page, hns_id, slug):
    """Scrape a player's profile page and collect per-season rows.

    Args:
        page: Playwright page used for navigation and DOM queries.
        hns_id: Player id placed in the profile URL.
        slug: URL slug of the player.

    Returns:
        ``None`` when navigation fails; otherwise a dict with the page
        heading (``naziv``), the URL, the first club link found
        (``current_klub`` or ``None``), the collected ``seasons`` rows,
        their count, an always-empty ``matches`` list and the length of the
        page body text.

    Season rows come from two sources and are simply concatenated: a regex
    over the visible body text (keys ``sezona``/``klub``/``nastupi``/``golovi``)
    and every HTML table row that contains a ``YYYY/YY``-looking cell (keys
    are the table's header texts plus ``sezona``).
    """
    url = f"https://semafor.hns.family/igraci/{hns_id}/{slug}/"
    try:
        page.goto(url, wait_until="networkidle", timeout=30000)
    except Exception as e:
        log(f" ❌ Goto fail {url}: {e}")
        return None
    h1 = page.locator('h1').first.inner_text() if page.locator('h1').count() else ''

    # Current club = first link on the page that points at /klubovi/<id>/<slug>/
    current_klub = None
    klub_links = page.locator('a[href*="/klubovi/"]').all()
    if klub_links:
        href = klub_links[0].get_attribute('href') or ''
        m = re.search(r'/klubovi/(\d+)/([\w-]+)/', href)
        if m:
            current_klub = {'hns_id': m.group(1), 'slug': m.group(2), 'naziv': klub_links[0].inner_text().strip()}

    seasons_data = []
    matches_data = []  # match scraping is not implemented here; stays empty

    # Tables may be rendered dynamically: wait briefly, but carry on if
    # nothing matching ever appears (best effort, hence the ignored error).
    try:
        page.wait_for_selector('table, .karijera, .sezona, [class*="season"]', timeout=8000)
    except Exception:
        pass
    time.sleep(1)
    body_text = page.locator('body').inner_text()

    # Career section in plain text, e.g. "2024/25 ... HNK Orijent ... 14 ... 2":
    # season, free-text club name, then a run of digits up to the next season.
    season_blocks = re.findall(r'(20\d{2}/\d{2})\s+([\w\s\u017c-\u017e\u0107\u010d\u0161\u017d\u0110\.\-]+?)\s+([\d\.\s]+)(?=20\d{2}/\d{2}|$)', body_text)
    for sezona, klub_text, stats_text in season_blocks:
        nums = re.findall(r'\d+', stats_text)
        if nums:
            seasons_data.append({
                'sezona': sezona,
                'klub': klub_text.strip()[:200],
                'nastupi': int(nums[0]),
                'golovi': int(nums[1]) if len(nums) > 1 else 0,
            })

    for t in page.locator('table').all():
        rows = t.locator('tr').all()
        if len(rows) < 2:
            continue
        # The first row is treated as the header; zip() silently truncates
        # when a data row has a different number of cells.
        header = [c.inner_text().strip() for c in rows[0].locator('th, td').all()]
        for r in rows[1:]:
            cells = [c.inner_text().strip() for c in r.locator('th, td').all()]
            if not cells:
                continue
            row_dict = dict(zip(header, cells))
            # A row counts as a season row when any cell starts with YYYY/YY.
            sezona = next((v for v in row_dict.values() if re.match(r'\d{4}/\d{2}', v)), None)
            if sezona:
                seasons_data.append({**row_dict, 'sezona': sezona})

    return {
        'hns_id': hns_id,
        'slug': slug,
        'naziv': h1,
        'url': url,
        'current_klub': current_klub,
        'sezone_count': len(seasons_data),
        'seasons': seasons_data,
        'matches': matches_data,
        'body_text_len': len(body_text),
    }
def scrape_klub_roster(page, klub_hns_id, klub_slug):
    """Collect the distinct player links found on a club page.

    Returns a list of dicts (``hns_id``, ``slug``, ``naziv``, ``url``), one
    per player id in first-seen order, or an empty list when the page
    cannot be loaded.
    """
    url = f"https://semafor.hns.family/klubovi/{klub_hns_id}/{klub_slug}/"
    try:
        page.goto(url, wait_until="networkidle", timeout=30000)
    except Exception as e:
        log(f" ❌ Goto fail {url}: {e}")
        return []

    # Keyed by player id so repeated links to the same player are ignored.
    by_id = {}
    for anchor in page.locator('a[href*="/igraci/"]').all():
        href = anchor.get_attribute('href') or ''
        found = re.search(r'/igraci/(\d+)/([\w-]+)', href)
        if found is None:
            continue
        player_id = found.group(1)
        if player_id in by_id:
            continue
        # Relative links are made absolute; anything else is kept verbatim.
        absolute = f"https://semafor.hns.family{href}" if href.startswith('/') else href
        by_id[player_id] = {
            'hns_id': player_id,
            'slug': found.group(2),
            'naziv': anchor.inner_text().strip(),
            'url': absolute,
        }
    return list(by_id.values())
def upsert_clan(conn, klub_id, player_data):
    """Insert or refresh the ``pgz_sport.clanovi`` row for a scraped player.

    The lookup key is ``hns_igrac_id``. An existing row only has its empty
    name fields / missing club and sport filled in, while the source
    metadata and timestamps are always refreshed. Returns the row id.
    """
    naziv = re.sub(r'\s+', ' ', player_data.get('naziv', '')).strip()
    # The heading may arrive joined ("FrankoAndrijašević"), so split on
    # capitalised words instead of whitespace.
    words = re.findall(r'[A-ZČĆŠŽĐ][a-zčćšžđ\']+', naziv)
    ime, prezime = (words[0], ' '.join(words[1:])) if len(words) >= 2 else (naziv, '')
    hns_id = player_data['hns_id']
    url = player_data['url']

    with conn.cursor() as cur:
        cur.execute("""
            SELECT id FROM pgz_sport.clanovi
            WHERE hns_igrac_id = %s
            ORDER BY id LIMIT 1
        """, (hns_id,))
        existing = cur.fetchone()
        if not existing:
            cur.execute("""
                INSERT INTO pgz_sport.clanovi
                (klub_id, ime, prezime, sport, source, source_url, hns_igrac_id, last_scraped_at, aktivan)
                VALUES (%s, %s, %s, 'nogomet', 'hns_semafor', %s, %s, now(), true)
                RETURNING id
            """, (klub_id, ime, prezime, url, hns_id))
            return cur.fetchone()[0]

        clan_id = existing[0]
        cur.execute("""
            UPDATE pgz_sport.clanovi
            SET ime = COALESCE(NULLIF(ime,''), %s),
            prezime = COALESCE(NULLIF(prezime,''), %s),
            klub_id = COALESCE(klub_id, %s),
            hns_igrac_id = %s,
            source = 'hns_semafor',
            source_url = %s,
            last_updated = now(),
            last_scraped_at = now(),
            sport = COALESCE(sport, 'nogomet')
            WHERE id = %s
        """, (ime, prezime, klub_id, hns_id, url, clan_id))
    return clan_id
def upsert_seasons(conn, hns_id, clan_id, seasons):
    """Upsert scraped season rows into ``pgz_sport.hns_player_seasons``.

    Args:
        conn: Open psycopg2 connection.
        hns_id: Player id stored as ``hns_igrac_id``.
        clan_id: Id of the matching ``clanovi`` row.
        seasons: Dicts produced by ``scrape_player``; columns are located by
            substring match on the dict keys (e.g. a key containing ``lub``
            is the club, ``nastup`` the appearances).

    Returns:
        Number of rows sent to the database (0 when nothing usable remains).

    NOTE(review): every row is written with ``klub_hns_id = NULL``. If the
    conflict target is an ordinary unique index, NULLs are distinct in
    PostgreSQL and the ``DO UPDATE`` branch would never fire — verify against
    the table definition.
    """
    if not seasons:
        return 0

    # Reject rows where the club text is obviously a misparsed stat block:
    # when a table row has fewer cells than the header, dict(zip(...))
    # misaligns and whole label dumps or bare numbers land in the value.
    _BAD_PREFIXES = ('STATISTIKA', 'NASTUPI', 'ZAPOČEO', 'ZAMJENA',
                     'POGOTCI', 'ŽUTI', 'CRVENI', 'UKUPNO', 'SUPERSPORT')

    def _looks_like_garbage(klub_text):
        if not klub_text:
            return True
        t = klub_text.strip()
        if not t:
            return True  # whitespace only
        if re.match(r'^\d+$', t):
            return True  # bare number (year, jersey #)
        if t.count('\n') >= 2:
            return True  # multi-line label dump
        return t.upper().startswith(_BAD_PREFIXES)

    def _stat(row, key):
        """Digits of the first column whose header contains ``key``, as int (0 if none)."""
        for header, value in row.items():
            if key in header.lower():
                # Values are str for table rows but already int for the
                # regex-derived rows; str() keeps both from raising in re.sub
                # (previously the int case was silently turned into 0).
                digits = re.sub(r'\D', '', str(value))
                return int(digits) if digits else 0
        return 0

    rows = []
    skipped = 0
    for s in seasons:
        sezona = s.get('sezona', '')
        if not sezona:
            continue
        klub = next((v for k, v in s.items() if 'lub' in k.lower()), '')
        natjecanje = next((v for k, v in s.items() if 'atjec' in k.lower() or 'liga' in k.lower()), '')
        if _looks_like_garbage(klub):
            skipped += 1
            continue
        rows.append((
            hns_id, clan_id, sezona, None, klub, natjecanje,
            _stat(s, 'nastup'), _stat(s, 'start'), _stat(s, 'zamj'),
            _stat(s, 'gol'), _stat(s, 'asist'), _stat(s, 'žut'), _stat(s, 'crv'), _stat(s, 'minut')
        ))
    if skipped:
        print(f'[hns_master_harvester] upsert_seasons: skipped {skipped} garbage rows for hns_id={hns_id}')
    if not rows:
        # Nothing survived filtering; avoid a pointless cursor round trip.
        return 0
    with conn.cursor() as cur:
        execute_values(cur, """
            INSERT INTO pgz_sport.hns_player_seasons
            (hns_igrac_id, clan_id, sezona, klub_hns_id, klub_naziv, natjecanje,
            nastupi, startna, zamjena, golovi, asistencije, zuti, crveni, minute)
            VALUES %s
            ON CONFLICT (hns_igrac_id, sezona, klub_hns_id, natjecanje)
            DO UPDATE SET
            nastupi = EXCLUDED.nastupi, startna = EXCLUDED.startna,
            zamjena = EXCLUDED.zamjena, golovi = EXCLUDED.golovi,
            asistencije = EXCLUDED.asistencije, zuti = EXCLUDED.zuti,
            crveni = EXCLUDED.crveni, minute = EXCLUDED.minute,
            scraped_at = now()
        """, rows)
    return len(rows)
def upsert_klub_roster(conn, klub_id, klub_hns_id, players):
    """Upsert a club's scraped roster into ``pgz_sport.hns_klub_roster``.

    Args:
        conn: Open psycopg2 connection.
        klub_id: Local club id.
        klub_hns_id: Club id on the HNS site.
        players: Dicts with ``hns_id`` and optionally ``naziv``,
            ``pozicija``, ``url``.

    Returns:
        Number of roster rows sent to the database.

    The first whitespace-separated word of ``naziv`` becomes ``ime`` and the
    rest ``prezime``. On conflict only ``klub_id`` and ``scraped_at`` are
    refreshed; names already stored are left untouched.
    """
    if not players:
        return 0
    rows = []
    for p in players:
        # split() on a missing or whitespace-only name yields [], which used
        # to raise IndexError on [0]; treat it as an empty name instead.
        name_parts = (p.get('naziv') or '').split()
        ime = name_parts[0] if name_parts else ''
        prezime = ' '.join(name_parts[1:])
        rows.append((klub_id, klub_hns_id, p['hns_id'], ime, prezime,
                     p.get('pozicija', ''), p.get('url', '')))
    with conn.cursor() as cur:
        execute_values(cur, """
            INSERT INTO pgz_sport.hns_klub_roster
            (klub_id, klub_hns_id, hns_igrac_id, ime, prezime, pozicija, source_url)
            VALUES %s
            ON CONFLICT (klub_hns_id, hns_igrac_id)
            DO UPDATE SET klub_id = EXCLUDED.klub_id, scraped_at = now()
        """, rows)
    return len(rows)
def main():
    """Run the harvest: pick target clubs, scrape rosters and player profiles, upsert.

    Command-line flags: ``--limit`` (max clubs from the funded-clubs view),
    ``--klub-id`` (a single local club id) and ``--single-player`` (scrape
    one player id and log the result without writing to the database).
    The database connection is always closed on exit.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument('--limit', type=int, default=999)
    ap.add_argument('--klub-id', type=int, default=None)
    ap.add_argument('--single-player', help='HNS ID of single player to scrape')
    args = ap.parse_args()
    conn = db_conn()
    try:
        # Target clubs: football clubs from the funded-clubs view whose
        # source URL points at the HNS site (or the one requested club).
        if args.single_player:
            klubovi = []
        else:
            with conn.cursor(cursor_factory=RealDictCursor) as cur:
                if args.klub_id:
                    cur.execute("SELECT * FROM pgz_sport.klubovi WHERE id = %s", (args.klub_id,))
                else:
                    cur.execute("""
                        SELECT * FROM pgz_sport.v_pgz_financirani_klubovi
                        WHERE sport = 'nogomet' AND source_url LIKE %s
                        ORDER BY id LIMIT %s
                    """, ('%semafor.hns.family/klubovi%', args.limit))
                klubovi = cur.fetchall()
        log(f"🚀 HNS Harvester starting. Target klubova: {len(klubovi)}", telegram=True)
        stats = {'klubova': 0, 'players_scraped': 0, 'seasons_upserted': 0, 'errors': 0}
        with sync_playwright() as pw:
            browser = pw.chromium.launch(headless=True, args=["--no-sandbox", "--ignore-certificate-errors"])
            ctx = browser.new_context(
                ignore_https_errors=True,
                user_agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
            )
            page = ctx.new_page()
            if args.single_player:
                # Test mode: the slug is unknown, so a placeholder is used.
                log(f"🔬 Single player mode: {args.single_player}")
                data = scrape_player(page, args.single_player, 'unknown')
                log(f" Data: {json.dumps(data, default=str, ensure_ascii=False)[:500]}")
                browser.close()
                return
            for klub in klubovi:
                try:
                    src = klub.get('source_url', '') or ''
                    m = re.search(r'/klubovi/(\d+)/([^/]*)', src)
                    if not m:
                        log(f" ⏭ Klub {klub['id']} {klub['naziv']} — no HNS URL")
                        continue
                    khns, kslug = m.group(1), m.group(2) or 'klub'
                    log(f"\n🏟 Klub {klub['id']} {klub['naziv']} → HNS {khns}/{kslug}")
                    roster = scrape_klub_roster(page, khns, kslug)
                    log(f" Roster: {len(roster)} igrača")
                    if roster:
                        upsert_klub_roster(conn, klub['id'], khns, roster)
                    for p in roster[:30]:  # safety: max 30 per klub for now
                        # One failing player must not abort the rest of the club.
                        try:
                            time.sleep(0.5)
                            pdata = scrape_player(page, p['hns_id'], p['slug'])
                            if pdata:
                                clan_id = upsert_clan(conn, klub['id'], pdata)
                                n_seas = upsert_seasons(conn, pdata['hns_id'], clan_id, pdata.get('seasons', []))
                                stats['players_scraped'] += 1
                                stats['seasons_upserted'] += n_seas
                                log(f" ✓ {pdata['naziv']} (clan_id={clan_id}, seasons={n_seas})")
                        except Exception as e:
                            stats['errors'] += 1
                            log(f" ❌ Player {p['hns_id']}: {e}")
                    stats['klubova'] += 1
                except Exception as e:
                    stats['errors'] += 1
                    log(f" ❌ Klub {klub['id']}: {e}\n{traceback.format_exc()[:500]}")
            browser.close()
        summary = f"✅ HNS Harvester done. Klubova: {stats['klubova']}, Players: {stats['players_scraped']}, Seasons: {stats['seasons_upserted']}, Errors: {stats['errors']}"
        log(summary, telegram=True)
    finally:
        # Previously the connection was never closed, including on the
        # single-player early return.
        conn.close()


if __name__ == '__main__':
    main()
+300
View File
@@ -0,0 +1,300 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
HNS PGŽ FULL SCRAPER svi klubovi, igrači, sezone, utakmice
─────────────────────────────────────────────────────────────
Sprema u:
pgz_sport.clanovi
pgz_sport.hns_player_seasons
pgz_sport.hns_player_matches
Autor: Damir Radulić / dradulic@outlook.com
Datum: 2026-05-15 (robustna verzija)
"""
import os, re, sys, time, logging, json
from datetime import datetime, timedelta
import requests
import psycopg2
from psycopg2.extras import execute_values
# ─── LOG ───────────────────────────────────────────
# Log directory is created at import time if it does not exist yet.
LOG_DIR = "/var/log/pgz-sport-debug"
os.makedirs(LOG_DIR, exist_ok=True)
# Every record goes both to hns_full.log (UTF-8) and to stdout.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [FULL] %(message)s",
    handlers=[
        logging.FileHandler(os.path.join(LOG_DIR, "hns_full.log"), encoding="utf-8"),
        logging.StreamHandler(sys.stdout)
    ]
)
log = logging.getLogger("hns_full")
# ─── DB CONN ────────────────────────────────────────
from dotenv import load_dotenv
# Load the env file before DSN is built; a missing DB_PASSWORD raises
# KeyError here, at import time.
load_dotenv('/opt/rinet-gpu/.env.master')
DSN = f"host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password={os.environ['DB_PASSWORD']}"
def get_conn():
    """Return a new autocommit psycopg2 connection built from ``DSN``."""
    connection = psycopg2.connect(DSN)
    connection.autocommit = True
    return connection
# ─── HTTP FETCH ─────────────────────────────────────
# Header values must be Latin-1 encodable: the previous "PGŽ" made
# http.client raise UnicodeEncodeError on every request, which the blanket
# except in fetch() then turned into a silent None.
UA = "Mozilla/5.0 (Ri.NET PGZ Sport Bot; contact dradulic@outlook.com)"


def fetch(url, retries=3):
    """GET ``url`` and return the response body, or ``None`` on failure.

    Args:
        url: Absolute URL to download.
        retries: Maximum number of attempts.

    Returns:
        Response text; ``None`` for a 404 (no retry) or when every attempt
        failed with a requests-level error.

    Failed attempts are logged and followed by a linearly growing pause
    (1.5 s, 3 s, ...), except after the final attempt.
    """
    for attempt in range(1, retries + 1):
        try:
            r = requests.get(url, headers={"User-Agent": UA}, timeout=15)
            if r.status_code == 404:
                return None
            r.raise_for_status()
            return r.text
        except requests.RequestException as e:
            log.warning(f"GET {url} failed ({attempt}/{retries}): {e}")
            if attempt < retries:
                time.sleep(1.5 * attempt)
    return None
# ─── PARSIRANJE ─────────────────────────────────────
def parse_roster(html, klub_hns_id):
    """Extract players from a roster page.

    Args:
        html: Page markup.
        klub_hns_id: Club id; accepted for call compatibility, not used.

    Returns:
        List of ``(hns_igrac_id, ime, prezime, url)`` tuples, one per player
        id, in first-seen order.

    Links whose text has no second word (or a one-character one) are treated
    as non-player links and skipped. Each player id is returned once: the
    same id twice in a single ``INSERT ... ON CONFLICT DO UPDATE`` batch makes
    PostgreSQL reject the whole statement.
    """
    igraci = []
    seen = set()
    for m in re.finditer(r'<a\s+[^>]*href="(/igraci/(\d+)/[^"]*)"[^>]*>(.*?)</a>', html, re.DOTALL):
        hns_id = int(m.group(2))
        if hns_id in seen:
            continue
        # Replace nested tags with spaces, then collapse whitespace.
        raw_name = re.sub(r'<[^>]+>', ' ', m.group(3)).strip()
        raw_name = re.sub(r'\s+', ' ', raw_name)
        if not raw_name:
            continue
        parts = raw_name.split(' ', 1)
        ime = parts[0].strip()
        prezime = parts[1].strip() if len(parts) > 1 else ''
        if not prezime or len(prezime) < 2:
            continue
        # Mark as seen only once accepted, so an image-only first link does
        # not hide a later text link for the same player.
        seen.add(hns_id)
        igraci.append((hns_id, ime, prezime, "https://semafor.hns.family" + m.group(1)))
    return igraci
def parse_player_seasons(html, hns_igrac_id):
    """Parse per-season statistics from a player profile page.

    The first table whose class contains ``playerSeason`` is used, then one
    whose class contains ``career``, then simply the first table on the
    page. A row is kept only when its first cell starts with a season
    (``YYYY/YY``) and the row links to a club. Statistics are read by fixed
    column position: 3=nastupi, 4=golovi, 5=asistencije, 6=zuti, 7=crveni,
    8=minute; non-numeric cells leave the value at 0.

    Returns a list of dicts with the keys sezona, natjecanje, klub_hns_id
    (as str), klub_naziv, the six statistics and hns_igrac_id.
    """
    def plain(fragment):
        return re.sub(r'<[^>]+>', '', fragment).strip()

    table_patterns = (
        r'<table[^>]*class="[^"]*playerSeason[^"]*"[^>]*>(.*?)</table>',
        r'<table[^>]*class="[^"]*career[^"]*"[^>]*>(.*?)</table>',
        r'<table[^>]*>(.*?)</table>',  # fallback: any table
    )
    table_match = None
    for pattern in table_patterns:
        table_match = re.search(pattern, html, re.DOTALL)
        if table_match:
            break
    if not table_match:
        return []

    stat_by_column = {3: "nastupi", 4: "golovi", 5: "asistencije",
                      6: "zuti", 7: "crveni", 8: "minute"}
    seasons = []
    for row in re.finditer(r'<tr[^>]*>(.*?)</tr>', table_match.group(1), re.DOTALL):
        row_html = row.group(1)
        cells = re.findall(r'<td[^>]*>(.*?)</td>', row_html, re.DOTALL)
        if len(cells) < 3:
            continue

        first_cell = plain(cells[0])
        sezona = first_cell if re.match(r'\d{4}/\d{2,4}', first_cell) else None

        klub_hns_id, klub_naziv = None, ""
        club_link = re.search(r'<a[^>]*href="(/klubovi/(\d+)/[^"]*)"[^>]*>(.*?)</a>', row_html, re.DOTALL)
        if club_link:
            klub_hns_id = int(club_link.group(2))
            klub_naziv = plain(club_link.group(3))

        # Competition sits in the third cell, or the second when no club was found.
        natjecanje = plain(cells[2]) if klub_hns_id else plain(cells[1])

        stats = dict.fromkeys(stat_by_column.values(), 0)
        for idx, cell in enumerate(cells):
            text = plain(cell)
            if text.isdigit():
                val = int(text)
                if idx in stat_by_column:
                    stats[stat_by_column[idx]] = val

        if sezona and klub_hns_id:
            seasons.append({
                "hns_igrac_id": hns_igrac_id,
                "sezona": sezona,
                "klub_hns_id": str(klub_hns_id),
                "klub_naziv": klub_naziv,
                "natjecanje": natjecanje,
                "nastupi": stats["nastupi"],
                "golovi": stats["golovi"],
                "asistencije": stats["asistencije"],
                "zuti": stats["zuti"],
                "crveni": stats["crveni"],
                "minute": stats["minute"]
            })
    return seasons
def parse_player_matches(html, hns_igrac_id):
    """Parse match rows from a player profile page.

    Uses the first table whose class contains ``match``, otherwise the first
    table on the page. Rows need at least five cells; the first four are
    read as date, home team, away team and result. Rows with an empty date
    or home team are dropped.

    NOTE(review): the fallback takes whatever table comes first, which may
    not be a match table at all — confirm against the real markup.
    """
    table = (re.search(r'<table[^>]*class="[^"]*match[^"]*"[^>]*>(.*?)</table>', html, re.DOTALL)
             or re.search(r'<table[^>]*>(.*?)</table>', html, re.DOTALL))
    if table is None:
        return []

    def plain(fragment):
        return re.sub(r'<[^>]+>', '', fragment).strip()

    found = []
    for row in re.finditer(r'<tr[^>]*>(.*?)</tr>', table.group(1), re.DOTALL):
        cells = re.findall(r'<td[^>]*>(.*?)</td>', row.group(1), re.DOTALL)
        if len(cells) < 5:
            continue
        datum, domacin, gost, rezultat = (plain(cell) for cell in cells[:4])
        if not (datum and domacin):
            continue
        found.append({
            "hns_igrac_id": hns_igrac_id,
            "datum": datum,
            "domacin": domacin,
            "gost": gost,
            "rezultat": rezultat
        })
    return found
# ─── UPSERT U BAZU ──────────────────────────────────
def upsert_players(conn, players):
    """Bulk upsert player tuples into ``pgz_sport.clanovi`` keyed on ``hns_igrac_id``.

    Each tuple is ``(hns_igrac_id, ime, prezime, source_url, klub_hns_id)``;
    on conflict the four non-key columns are overwritten.
    """
    statement = """
        INSERT INTO pgz_sport.clanovi (hns_igrac_id, ime, prezime, source_url, klub_hns_id)
        VALUES %s
        ON CONFLICT (hns_igrac_id) DO UPDATE SET
        ime = EXCLUDED.ime,
        prezime = EXCLUDED.prezime,
        source_url = EXCLUDED.source_url,
        klub_hns_id = EXCLUDED.klub_hns_id
    """
    with conn.cursor() as cursor:
        execute_values(cursor, statement, players)
def upsert_seasons(conn, seasons):
    """Bulk upsert season dicts into ``pgz_sport.hns_player_seasons``.

    Does nothing for an empty input. Rows are keyed on (hns_igrac_id,
    sezona, klub_hns_id, natjecanje); on conflict the statistics and the
    club name are overwritten. ``source_url`` is always written as ''.
    """
    if not seasons:
        return
    statement = """
        INSERT INTO pgz_sport.hns_player_seasons
        (hns_igrac_id, sezona, natjecanje, klub_hns_id, klub_naziv,
        nastupi, golovi, asistencije, zuti, crveni, minute, source_url)
        VALUES %s
        ON CONFLICT (hns_igrac_id, sezona, klub_hns_id, natjecanje) DO UPDATE SET
        nastupi = EXCLUDED.nastupi,
        golovi = EXCLUDED.golovi,
        asistencije = EXCLUDED.asistencije,
        zuti = EXCLUDED.zuti,
        crveni = EXCLUDED.crveni,
        minute = EXCLUDED.minute,
        klub_naziv = EXCLUDED.klub_naziv
    """
    # Dict keys in the same order as the INSERT column list (minus source_url).
    fields = ("hns_igrac_id", "sezona", "natjecanje", "klub_hns_id", "klub_naziv",
              "nastupi", "golovi", "asistencije", "zuti", "crveni", "minute")
    vals = [tuple(season[name] for name in fields) + ("",) for season in seasons]
    with conn.cursor() as cursor:
        execute_values(cursor, statement, vals, page_size=100)
def upsert_matches(conn, matches):
    """Bulk insert match dicts into ``pgz_sport.hns_player_matches``.

    Does nothing for an empty input. Rows that already exist for the same
    (hns_igrac_id, datum, domacin, gost) are left unchanged.
    """
    if not matches:
        return
    statement = """
        INSERT INTO pgz_sport.hns_player_matches
        (hns_igrac_id, datum, domacin, gost, rezultat)
        VALUES %s
        ON CONFLICT (hns_igrac_id, datum, domacin, gost) DO NOTHING
    """
    vals = []
    for match in matches:
        vals.append((match["hns_igrac_id"], match["datum"], match["domacin"],
                     match["gost"], match["rezultat"]))
    with conn.cursor() as cursor:
        execute_values(cursor, statement, vals, page_size=100)
# ─── MAIN ───────────────────────────────────────────
def main():
    """Scrape rosters, seasons and matches for every club of savez 10 that has an HNS id.

    For each club the roster page is fetched and its players upserted; each
    player whose seasons were scraped within the last 7 days is skipped, the
    rest have their profile fetched and parsed. The cursor and connection
    are released even when the run fails part-way.
    """
    def _is_fresh(last):
        # scraped_at may come back naive or timezone-aware depending on the
        # column type; build "now" with the same tzinfo so the subtraction
        # cannot raise TypeError.
        return bool(last) and (datetime.now(last.tzinfo) - last) < timedelta(days=7)

    log.info("=== START FULL PGŽ HNS SCRAPE ===")
    conn = get_conn()
    total_players = 0
    total_seasons = 0
    total_matches = 0
    try:
        with conn.cursor() as cur:
            # 1. All clubs of savez 10 that carry an HNS club id
            cur.execute("""
                SELECT id, naziv, hns_klub_id
                FROM pgz_sport.klubovi
                WHERE savez_id = 10 AND hns_klub_id IS NOT NULL
            """)
            klubovi = cur.fetchall()
            log.info(f"Klubova za obradu: {len(klubovi)}")
            for klub_id, klub_naziv, hns_klub_id in klubovi:
                log.info(f"🏟️ {klub_naziv} (HNS {hns_klub_id})")
                # 2. Roster
                roster_url = f"https://semafor.hns.family/klubovi/{hns_klub_id}/igraci/"
                html = fetch(roster_url)
                if not html:
                    log.warning(f" ⚠️ Nema rostera za {klub_naziv}")
                    continue
                players = parse_roster(html, hns_klub_id)
                if not players:
                    log.warning(f" ⚠️ Nema igrača")
                    continue
                # Attach the club id to each player tuple for the upsert.
                players_with_klub = [(p[0], p[1], p[2], p[3], str(hns_klub_id)) for p in players]
                upsert_players(conn, players_with_klub)
                log.info(f" 👥 {len(players)} igrača")
                # 3. Per player: seasons and matches, unless scraped recently
                for hns_id, ime, prezime, url in players:
                    cur.execute("""
                        SELECT MAX(scraped_at) FROM pgz_sport.hns_player_seasons
                        WHERE hns_igrac_id = %s
                    """, (hns_id,))
                    if _is_fresh(cur.fetchone()[0]):
                        continue
                    html = fetch(url)
                    if not html:
                        continue
                    seasons = parse_player_seasons(html, hns_id)
                    if seasons:
                        upsert_seasons(conn, seasons)
                        total_seasons += len(seasons)
                    matches = parse_player_matches(html, hns_id)
                    if matches:
                        upsert_matches(conn, matches)
                        total_matches += len(matches)
                    time.sleep(0.3)  # be polite to the server
                total_players += len(players)
                time.sleep(1)  # short pause between clubs
    finally:
        conn.close()
    log.info(f"=== GOTOVO: {total_players} igrača, {total_seasons} sezona, {total_matches} utakmica ===")


if __name__ == "__main__":
    main()
+219
View File
@@ -0,0 +1,219 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
HNS PGŽ FULL SCRAPER v2 ispravljen URL roster-a
Koristi sub1_hns_catalog.json za točne URL-ove klubova
"""
import os, re, sys, time, logging, json
from datetime import datetime, timedelta
import requests
import psycopg2
from psycopg2.extras import execute_values
# ─── LOG ───────────────────────────────────────────
# Log directory is created at import time if it does not exist yet.
LOG_DIR = "/var/log/pgz-sport-debug"
os.makedirs(LOG_DIR, exist_ok=True)
# Every record goes both to hns_full.log (UTF-8) and to stdout.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [FULL] %(message)s",
    handlers=[
        logging.FileHandler(os.path.join(LOG_DIR, "hns_full.log"), encoding="utf-8"),
        logging.StreamHandler(sys.stdout)
    ]
)
log = logging.getLogger("hns_full")
# ─── DB ────────────────────────────────────────────
from dotenv import load_dotenv
# Load the env file before DSN is built; a missing DB_PASSWORD raises
# KeyError here, at import time.
load_dotenv('/opt/rinet-gpu/.env.master')
DSN = f"host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password={os.environ['DB_PASSWORD']}"
def get_conn():
    """Return a new psycopg2 connection built from ``DSN`` (autocommit is set by the caller)."""
    connection = psycopg2.connect(DSN)
    return connection
# Header values must be Latin-1 encodable: the previous "PGŽ" made
# http.client raise UnicodeEncodeError on every request, which the bare
# except in fetch() then turned into a silent None.
UA = "Mozilla/5.0 (Ri.NET PGZ Sport Bot)"


def fetch(url, retries=3):
    """GET ``url`` and return the response body, or ``None`` on failure.

    Args:
        url: Absolute URL to download.
        retries: Maximum number of attempts.

    Returns:
        Response text; ``None`` for a 404 (no retry) or when every attempt
        failed with a requests-level error.

    Failed attempts are logged and followed by a linearly growing pause
    (1.5 s, 3 s, ...), except after the final attempt.
    """
    for attempt in range(1, retries + 1):
        try:
            r = requests.get(url, headers={"User-Agent": UA}, timeout=15)
            if r.status_code == 404:
                return None
            r.raise_for_status()
            return r.text
        except requests.RequestException as e:
            log.warning(f"GET {url} failed ({attempt}/{retries}): {e}")
            if attempt < retries:
                time.sleep(1.5 * attempt)
    return None
# ─── PARSIRANJE ─────────────────────────────────────
def parse_roster(html):
    """Extract players from a club page.

    Returns:
        List of ``(hns_igrac_id, ime, prezime, profil_url)`` tuples, one per
        player id, in first-seen order.

    Links whose text has no second word (or a one-character one) are
    skipped. Each player id is returned once: the same id twice in a single
    ``INSERT ... ON CONFLICT DO UPDATE`` batch makes PostgreSQL reject the
    whole statement.
    """
    igraci = []
    seen = set()
    for m in re.finditer(r'<a\s+[^>]*href="(/igraci/(\d+)/[^"]*)"[^>]*>(.*?)</a>', html, re.DOTALL):
        hns_id = int(m.group(2))
        if hns_id in seen:
            continue
        # Replace nested tags with spaces, then collapse whitespace.
        raw_name = re.sub(r'<[^>]+>', ' ', m.group(3)).strip()
        raw_name = re.sub(r'\s+', ' ', raw_name)
        if not raw_name:
            continue
        parts = raw_name.split(' ', 1)
        ime = parts[0].strip()
        prezime = parts[1].strip() if len(parts) > 1 else ''
        if not prezime or len(prezime) < 2:
            continue
        # Mark as seen only once accepted, so an image-only first link does
        # not hide a later text link for the same player.
        seen.add(hns_id)
        igraci.append((hns_id, ime, prezime, "https://semafor.hns.family" + m.group(1)))
    return igraci
def parse_seasons(html, hns_igrac_id):
    """Parse season statistics from the first JSON-LD script block of a profile page.

    The JSON object's ``playerSeason`` list is mapped to dicts with the keys
    hns_igrac_id, sezona, klub_hns_id (as str), klub_naziv, natjecanje and
    the integer statistics nastupi/golovi/asistencije/zuti/crveni/minute.

    Returns an empty list when there is no JSON-LD block, the JSON is
    invalid or has an unexpected shape, or any statistic cannot be turned
    into an int (the whole page is discarded in that case).
    """
    json_match = re.search(r'<script[^>]*type="application/ld\+json"[^>]*>(.*?)</script>', html, re.DOTALL)
    if not json_match:
        return []
    try:
        data = json.loads(json_match.group(1))
        seasons_data = data.get('playerSeason', [])
        if not seasons_data:
            return []
        seasons = []
        for s in seasons_data:
            seasons.append({
                "hns_igrac_id": hns_igrac_id,
                "sezona": s.get("season", ""),
                "klub_hns_id": str(s.get("clubId", "")),
                "klub_naziv": s.get("clubName", ""),
                "natjecanje": s.get("competition", ""),
                "nastupi": int(s.get("apps", 0)),
                "golovi": int(s.get("goals", 0)),
                "asistencije": int(s.get("assists", 0)),
                "zuti": int(s.get("yellow", 0)),
                "crveni": int(s.get("red", 0)),
                "minute": int(s.get("minutes", 0))
            })
        return seasons
    except (ValueError, TypeError, AttributeError):
        # Invalid JSON (JSONDecodeError is a ValueError), a non-dict payload
        # or a non-numeric statistic. A bare except here also swallowed
        # KeyboardInterrupt/SystemExit.
        return []
def parse_matches(html, hns_igrac_id):
    """Parse matches from the first JSON-LD script block of a profile page.

    The JSON object's ``playerMatch`` list is mapped to dicts with the keys
    hns_igrac_id, datum, domacin, gost and rezultat.

    Returns an empty list when there is no JSON-LD block or the payload is
    invalid JSON or has an unexpected shape.
    """
    json_match = re.search(r'<script[^>]*type="application/ld\+json"[^>]*>(.*?)</script>', html, re.DOTALL)
    if not json_match:
        return []
    try:
        data = json.loads(json_match.group(1))
        matches_data = data.get('playerMatch', [])
        matches = []
        for m in matches_data:
            matches.append({
                "hns_igrac_id": hns_igrac_id,
                "datum": m.get("date", ""),
                "domacin": m.get("homeTeam", ""),
                "gost": m.get("awayTeam", ""),
                "rezultat": m.get("result", "")
            })
        return matches
    except (ValueError, TypeError, AttributeError):
        # Invalid JSON or a payload that is not dict-of-list-of-dicts; a bare
        # except here also swallowed KeyboardInterrupt/SystemExit.
        return []
# ─── UPSERT ─────────────────────────────────────────
def upsert_players(conn, players):
    """Bulk upsert ``(hns_igrac_id, ime, prezime, source_url)`` tuples into ``pgz_sport.clanovi``.

    On a conflicting ``hns_igrac_id`` the name and source URL are overwritten.
    """
    statement = """INSERT INTO pgz_sport.clanovi (hns_igrac_id, ime, prezime, source_url)
        VALUES %s
        ON CONFLICT (hns_igrac_id) DO UPDATE SET
        ime = EXCLUDED.ime, prezime = EXCLUDED.prezime, source_url = EXCLUDED.source_url"""
    with conn.cursor() as cursor:
        execute_values(cursor, statement, players)
def upsert_seasons(conn, seasons):
    """Bulk upsert season dicts into ``pgz_sport.hns_player_seasons``.

    Does nothing for an empty input. Rows are keyed on (hns_igrac_id,
    sezona, klub_hns_id, natjecanje); on conflict the statistics and the
    club name are overwritten.
    """
    if not seasons:
        return
    statement = """INSERT INTO pgz_sport.hns_player_seasons
        (hns_igrac_id, sezona, natjecanje, klub_hns_id, klub_naziv,
        nastupi, golovi, asistencije, zuti, crveni, minute)
        VALUES %s
        ON CONFLICT (hns_igrac_id, sezona, klub_hns_id, natjecanje) DO UPDATE SET
        nastupi=EXCLUDED.nastupi, golovi=EXCLUDED.golovi,
        asistencije=EXCLUDED.asistencije, zuti=EXCLUDED.zuti,
        crveni=EXCLUDED.crveni, minute=EXCLUDED.minute,
        klub_naziv=EXCLUDED.klub_naziv"""
    # Dict keys in the same order as the INSERT column list.
    fields = ('hns_igrac_id', 'sezona', 'natjecanje', 'klub_hns_id', 'klub_naziv',
              'nastupi', 'golovi', 'asistencije', 'zuti', 'crveni', 'minute')
    vals = []
    for season in seasons:
        vals.append(tuple(season[name] for name in fields))
    with conn.cursor() as cursor:
        execute_values(cursor, statement, vals, page_size=50)
def upsert_matches(conn, matches):
    """Bulk insert match dicts into ``pgz_sport.hns_player_matches``.

    Does nothing for an empty input; rows that already exist for the same
    (hns_igrac_id, datum, domacin, gost) are left unchanged.
    """
    if not matches:
        return
    statement = """INSERT INTO pgz_sport.hns_player_matches
        (hns_igrac_id, datum, domacin, gost, rezultat)
        VALUES %s
        ON CONFLICT (hns_igrac_id, datum, domacin, gost) DO NOTHING"""
    vals = []
    for match in matches:
        vals.append((match['hns_igrac_id'], match['datum'], match['domacin'],
                     match['gost'], match['rezultat']))
    with conn.cursor() as cursor:
        execute_values(cursor, statement, vals, page_size=50)
# ─── MAIN ───────────────────────────────────────────
def main():
    """Scrape rosters, seasons and matches for every club of savez 10, using catalog URLs.

    Club page URLs come from ``sub1_hns_catalog.json`` (``id`` + ``slug``);
    clubs missing from the catalog are skipped. Players whose seasons were
    scraped within the last 7 days are skipped. The cursor and connection
    are released even when the run fails part-way.
    """
    def _is_fresh(last):
        # scraped_at may come back naive or timezone-aware depending on the
        # column type; build "now" with the same tzinfo so the subtraction
        # cannot raise TypeError.
        return bool(last) and (datetime.now(last.tzinfo) - last) < timedelta(days=7)

    log.info("=== START FULL PGŽ HNS SCRAPE v2 ===")
    conn = get_conn()
    conn.autocommit = True
    total_players = total_seasons = total_matches = 0
    try:
        # 1. Load the catalog that maps club ids to their page URLs
        with open('/opt/pgz-sport/cc_tasks/sub1_hns_catalog.json', 'r', encoding='utf-8') as f:
            catalog = json.load(f)
        klub_url_map = {}
        for item in catalog:
            klub_url_map[item['id']] = f"https://semafor.hns.family/klubovi/{item['id']}/{item['slug']}/"
        # Fallback index by string form: the catalog id and the DB column may
        # differ in type (int vs str), which would make the exact lookup miss.
        klub_url_by_str = {str(k): v for k, v in klub_url_map.items()}
        log.info(f"Učitano {len(klub_url_map)} klubova iz kataloga.")
        # 2. Clubs from the database with an HNS id and savez_id = 10
        with conn.cursor() as cur:
            cur.execute("SELECT id, naziv, hns_klub_id FROM pgz_sport.klubovi WHERE savez_id = 10 AND hns_klub_id IS NOT NULL")
            klubovi = cur.fetchall()
            log.info(f"Klubova za obradu: {len(klubovi)}")
            for klub_id, naziv, hns_id in klubovi:
                klub_url = klub_url_map.get(hns_id) or klub_url_by_str.get(str(hns_id))
                if not klub_url:
                    log.warning(f" ⚠️ {naziv} (HNS {hns_id}) nema URL u katalogu, preskačem.")
                    continue
                log.info(f"🏟️ {naziv}{klub_url}")
                html = fetch(klub_url)
                if not html:
                    log.warning(f" ❌ Ne mogu dohvatiti stranicu kluba.")
                    continue
                players = parse_roster(html)
                if not players:
                    log.warning(f" ⚠️ Nema igrača.")
                    continue
                # Upsert the players themselves
                player_tuples = [(p[0], p[1], p[2], p[3]) for p in players]
                upsert_players(conn, player_tuples)
                log.info(f" 👥 {len(players)} igrača")
                # Fetch details for each player not scraped recently
                for hns_igrac_id, ime, prezime, profile_url in players:
                    cur.execute("SELECT MAX(scraped_at) FROM pgz_sport.hns_player_seasons WHERE hns_igrac_id = %s", (hns_igrac_id,))
                    if _is_fresh(cur.fetchone()[0]):
                        continue
                    html = fetch(profile_url)
                    if not html:
                        continue
                    seasons = parse_seasons(html, hns_igrac_id)
                    if seasons:
                        upsert_seasons(conn, seasons)
                        total_seasons += len(seasons)
                    matches = parse_matches(html, hns_igrac_id)
                    if matches:
                        upsert_matches(conn, matches)
                        total_matches += len(matches)
                    time.sleep(0.3)
                total_players += len(players)
    finally:
        conn.close()
    log.info(f"=== GOTOVO: {total_players} igrača, {total_seasons} sezona, {total_matches} utakmica ===")


if __name__ == "__main__":
    main()
+218
View File
@@ -0,0 +1,218 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import os, sys, re, time, logging
import requests
from requests.exceptions import RequestException
import psycopg2
from psycopg2.extras import execute_values
from dotenv import load_dotenv
load_dotenv('/opt/.env.rinet')
# --- LOGGING ---
# Log directory is created at import time if it does not exist yet.
LOG_DIR = "/var/log/pgz-sport-sync"
os.makedirs(LOG_DIR, exist_ok=True)
LOG_FILE = os.path.join(LOG_DIR, "sync_master.log")
# Every record goes both to the UTF-8 log file and to stdout.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s [%(levelname)s] %(message)s',
    handlers=[
        logging.FileHandler(LOG_FILE, encoding='utf-8'),
        logging.StreamHandler(sys.stdout)
    ]
)
logger = logging.getLogger(__name__)
# --- CONFIG ---
# The password must come from the environment (loaded from /opt/.env.rinet
# above); without it the script logs a critical error and exits with status 1
# at import time.
db_pass = os.environ.get('PG_PASS')
if not db_pass:
    logger.critical("PG_PASS nije pronađen u /opt/.env.rinet")
    sys.exit(1)
DSN = f"host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password={db_pass}"
UA = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
# One shared HTTP session so every request carries the same User-Agent.
SESSION = requests.Session()
SESSION.headers.update({"User-Agent": UA})
BASE_URL = "https://semafor.hns.family"
# Competition pages that are scanned for club links.
NATJECANJA_URLS = [
    "https://semafor.hns.family/natjecanja/101025334/1-nl-ns-rijeka-juniori-2526/",
    "https://semafor.hns.family/natjecanja/100585203/treca-nl-zapad-2526/",
    "https://semafor.hns.family/natjecanja/101555188/1-znl-seniori-2526/",
    "https://semafor.hns.family/natjecanja/102503486/1-zupanijska-omladinska-liga-kadeti-skupina-a-2526/"
]
def strip_tags(text):
    """Replace every HTML tag with a space and return the whitespace-normalised text."""
    without_markup = re.sub(r'<[^>]+>', ' ', text)
    collapsed = re.sub(r'\s+', ' ', without_markup)
    return collapsed.strip()
# --- DATABASE ---
def db_conn():
    """Return an autocommit psycopg2 connection, or exit with status 1 if connecting fails."""
    try:
        connection = psycopg2.connect(DSN)
    except psycopg2.Error as e:
        logger.critical(f"DB Connection failed: {e}")
        sys.exit(1)
    try:
        connection.autocommit = True
    except psycopg2.Error as e:
        logger.critical(f"DB Connection failed: {e}")
        sys.exit(1)
    return connection
# --- HTTP FETCH ---
def fetch(url, retries=3):
    """GET ``url`` through the shared session and return the body text.

    A 404 returns ``None`` immediately. Any requests-level failure is
    logged and retried after ``2 * attempt`` seconds; once all attempts are
    used up an error is logged and ``None`` is returned.
    """
    attempt = 0
    while attempt < retries:
        attempt += 1
        try:
            response = SESSION.get(url, timeout=15)
            if response.status_code == 404:
                logger.warning(f"HTTP 404 Not Found: {url}")
                return None
            response.raise_for_status()
            return response.text
        except RequestException as e:
            logger.warning(f"HTTP GET failed ({attempt}/{retries}) for {url}: {e}")
            time.sleep(2 * attempt)
    logger.error(f"Gave up fetching {url} after {retries} attempts.")
    return None
# --- SYNC PROCEDURES ---
def extract_klubovi(html):
    """Collect ``(hns_id, naziv, url)`` tuples for every club link in ``html``.

    Clubs are keyed by id, so a later link to the same club replaces an
    earlier one while keeping its position. Links with no visible text use
    a title-cased slug as the name; names of 50+ characters are discarded.
    """
    if not html:
        return []
    # Matches the whole <a> element regardless of nested images or spans.
    link_pattern = r'<a[^>]*href="(/klubovi/(\d+)/([^/"]+)[^"]*)"[^>]*>(.*?)</a>'
    by_id = {}
    for link in re.finditer(link_pattern, html, re.DOTALL | re.IGNORECASE):
        path, hns_id, slug, inner = link.group(1), link.group(2), link.group(3), link.group(4)
        naziv = strip_tags(inner) or slug.replace('-', ' ').title()
        # Overlong "names" are most likely not club links at all.
        if len(naziv) >= 50 or not hns_id:
            continue
        by_id[hns_id] = (hns_id, naziv, BASE_URL + path)
    return list(by_id.values())
def upsert_klubovi(conn, klubovi):
    """Upsert ``(hns_id, naziv, source_url)`` tuples and return the stored rows.

    Existing rows are rewritten only when the name or URL actually differs.
    Returns the ``(id, hns_id, source_url)`` rows for the given club ids, or
    an empty list for empty input or when a database error occurs (the
    error is logged).
    """
    if not klubovi:
        return []
    hns_ids = [klub[0] for klub in klubovi]
    try:
        with conn.cursor() as cursor:
            execute_values(cursor, """
                INSERT INTO pgz_sport.klubovi (hns_id, naziv, source_url)
                VALUES %s
                ON CONFLICT (hns_id) DO UPDATE SET
                naziv = EXCLUDED.naziv,
                source_url = EXCLUDED.source_url
                WHERE pgz_sport.klubovi.naziv IS DISTINCT FROM EXCLUDED.naziv
                OR pgz_sport.klubovi.source_url IS DISTINCT FROM EXCLUDED.source_url;
            """, klubovi)
            cursor.execute("SELECT id, hns_id, source_url FROM pgz_sport.klubovi WHERE hns_id = ANY(%s)", (hns_ids,))
            stored = cursor.fetchall()
    except psycopg2.Error as e:
        logger.error(f"DB Greška pri UPSERT klubova: {e}")
        return []
    return stored
def sync_roster(conn, klub_hns_id, klub_url):
    """Fetch a club's ``igraci/`` page and upsert its players into ``pgz_sport.clanovi``.

    Returns the list of upserted tuples ``(hns_igrac_id, ime, prezime,
    klub_hns_id, source_url, slug)``, or an empty list when the page cannot
    be fetched, contains no usable player links, or the database call fails.

    NOTE(review): the conflict WHERE clause does not compare ``slug``, so a
    row whose only change is the slug is left untouched — confirm intended.
    """
    base = klub_url if klub_url.endswith('/') else klub_url + '/'
    html = fetch(base + "igraci/")
    if not html:
        return []

    link_pattern = r'<a[^>]*href="(/igraci/(\d+)/([^/"]+)[^"]*)"[^>]*>(.*?)</a>'
    by_id = {}
    for link in re.finditer(link_pattern, html, re.DOTALL | re.IGNORECASE):
        hns_igrac_id, slug = link.group(2), link.group(3)
        ime_prezime = strip_tags(link.group(4))
        if not ime_prezime or len(ime_prezime) > 60:
            continue
        ime, separator, rest = ime_prezime.partition(' ')
        # Single-word link text: derive the surname from the URL slug.
        prezime = rest if separator else slug.replace('-', ' ').title()
        by_id[hns_igrac_id] = (hns_igrac_id, ime, prezime, klub_hns_id, BASE_URL + link.group(1), slug)

    igraci_list = list(by_id.values())
    if not igraci_list:
        logger.debug(f"Klub {klub_hns_id} nema igrača (ili greška u parsiranju).")
        return []

    try:
        with conn.cursor() as cursor:
            execute_values(cursor, """
                INSERT INTO pgz_sport.clanovi (hns_igrac_id, ime, prezime, klub_hns_id, source_url, slug)
                VALUES %s
                ON CONFLICT (hns_igrac_id) DO UPDATE SET
                ime = EXCLUDED.ime,
                prezime = EXCLUDED.prezime,
                klub_hns_id = EXCLUDED.klub_hns_id,
                source_url = EXCLUDED.source_url,
                slug = EXCLUDED.slug
                WHERE pgz_sport.clanovi.ime IS DISTINCT FROM EXCLUDED.ime
                OR pgz_sport.clanovi.prezime IS DISTINCT FROM EXCLUDED.prezime
                OR pgz_sport.clanovi.klub_hns_id IS DISTINCT FROM EXCLUDED.klub_hns_id
                OR pgz_sport.clanovi.source_url IS DISTINCT FROM EXCLUDED.source_url;
            """, igraci_list)
    except psycopg2.Error as e:
        logger.error(f"DB Greška pri UPSERT rostera za klub {klub_hns_id}: {e}")
        return []
    logger.info(f"Roster za klub {klub_hns_id}: uspješno sinkronizirano {len(igraci_list)} igrača.")
    return igraci_list
def get_all_db_clubs(conn):
    """Return ``(id, hns_id, source_url)`` for every club that has a source URL.

    Args:
        conn: open DB connection (used through ``conn.cursor()``).

    Returns:
        The rows from ``cursor.fetchall()``, or an empty list when the
        database raised an error (the error is logged, not propagated).
    """
    try:
        with conn.cursor() as cur:
            cur.execute("SELECT id, hns_id, source_url FROM pgz_sport.klubovi WHERE source_url IS NOT NULL")
            return cur.fetchall()
    except psycopg2.Error as e:
        # Only database errors are swallowed; a bare except would also trap
        # KeyboardInterrupt/SystemExit and hide the reason for an empty result.
        logger.error(f"DB Greška pri dohvaćanju klubova: {e}")
        return []
# --- MAIN ENGINE ---
def main():
    """Run the full sync: collect clubs from every competition page, upsert
    them, then sync the roster of every club stored in the database.

    The connection returned by ``db_conn()`` is closed even when a step raises.
    """
    logger.info("=== START: HNS PGŽ FULL SYNC ===")
    conn = db_conn()
    try:
        all_extracted_klubovi = []

        # 1. Collect clubs from each competition page
        for url in NATJECANJA_URLS:
            logger.info(f"Preuzimanje klubova iz natjecanja: {url}")
            html = fetch(url)
            if not html:
                # fetch() yields a falsy value on failure; skip the page
                # instead of handing it to the parser.
                logger.warning(f"Preskačem natjecanje (stranica nije dohvaćena): {url}")
                time.sleep(1)
                continue
            extracted = extract_klubovi(html)
            logger.info(f"Pronađeno {len(extracted)} klubova u natjecanju.")
            all_extracted_klubovi.extend(extracted)
            time.sleep(1)

        # De-duplicate on the first tuple element (the last occurrence wins).
        unique_klubovi = list({k[0]: k for k in all_extracted_klubovi}.values())
        logger.info(f"Ukupno jedinstvenih klubova za UPSERT: {len(unique_klubovi)}")
        upsert_klubovi(conn, unique_klubovi)

        # 2. Fetch the roster of every club stored in the database
        db_klubovi = get_all_db_clubs(conn)
        logger.info(f"Pokrećem sync rostera za {len(db_klubovi)} klubova iz baze...")
        for _, klub_hns_id, klub_url in db_klubovi:
            try:
                sync_roster(conn, klub_hns_id, klub_url)
                time.sleep(0.5)
            except Exception as e:
                # One broken club must not abort the remaining ones.
                logger.critical(f"Kritična greška kod kluba {klub_hns_id}: {e}")
                continue

        logger.info("=== KRAJ: HNS PGŽ FULL SYNC ===")
    finally:
        conn.close()
if __name__ == "__main__":
    # Exit status: 0 after Ctrl-C, 1 after any other exception; a normal
    # return from main() falls through without an explicit exit call.
    exit_code = None
    try:
        main()
    except KeyboardInterrupt:
        logger.info("Skripta prekinuta.")
        exit_code = 0
    except Exception as e:
        logger.critical(f"Neočekivani pad skripte: {e}", exc_info=True)
        exit_code = 1
    if exit_code is not None:
        sys.exit(exit_code)
+4 -1
View File
@@ -1,4 +1,7 @@
#!/usr/bin/env python3
from dotenv import load_dotenv
load_dotenv('/opt/rinet-gpu/.env.master')
# auto-added by patch_scrapers_with_dotenv.sh
# -*- coding: utf-8 -*-
"""
hns_player_deep.py — SUB3 deep HNS player scraper
@@ -29,7 +32,7 @@ import psycopg2
from psycopg2.extras import RealDictCursor, execute_values
DSN = os.getenv("RINET_DSN",
"host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password=R1net2026!SecureDB#v7")
f"host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password={os.environ['DB_PASSWORD']}")
TG_TOKEN = os.getenv("TG_BOT_TOKEN", "8535797835:AAFItT-92jzZ9NWFafLxn0dLa1_n2s-JE5Y")
TG_CHAT = os.getenv("TG_CHAT", "7969491558")
SLEEP = float(os.getenv("SLEEP", "0.8"))
@@ -0,0 +1,534 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
hns_player_deep.py — SUB3 deep HNS player scraper
─────────────────────────────────────────────────
Author: dradulic@outlook.com / damir@rinet.one
Date: 2026-05-05
Version: 1.0
Scrapes semafor.hns.family/igraci/{id}/{slug}/ for every clanovi.hns_igrac_id row,
extracting:
• profil meta (datum_rodenja, mjesto_rodenja, broj_dresa, current klub)
• per-season stats per natjecanje (UPSERT pgz_sport.hns_player_seasons)
• last 30+ matches (UPSERT pgz_sport.hns_player_matches)
Server-rendered HTML — no Playwright needed → uses requests for 5–10× speedup.
Fallback to Playwright if --use-playwright is passed.
Resume-able: skips clanovi where last_scraped_at > now() - interval N days.
Usage:
python3 hns_player_deep.py [--limit 200] [--days 7] [--player HNS_ID] [--use-playwright]
"""
import os, sys, re, time, json, argparse, traceback
from datetime import datetime, date
from urllib.parse import urljoin
import requests
import psycopg2
from psycopg2.extras import RealDictCursor, execute_values
# Connection string: RINET_DSN wins. The DB_PASSWORD-based fallback is built
# (and DB_PASSWORD is required) only when RINET_DSN is unset or empty — as a
# getenv() default the f-string would be evaluated even when it is not needed.
DSN = os.getenv("RINET_DSN") or (
    f"host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password={os.environ['DB_PASSWORD']}"
)
# The bot token is a secret and must come from the environment; with an empty
# token log() skips the Telegram call.
TG_TOKEN = os.getenv("TG_BOT_TOKEN", "")
TG_CHAT = os.getenv("TG_CHAT", "7969491558")
# Pause between two players, in seconds.
SLEEP = float(os.getenv("SLEEP", "0.8"))
UA = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
LOG_DIR = "/var/log/pgz-sport-debug"
os.makedirs(LOG_DIR, exist_ok=True)
# One log file per start minute; the handle stays open for the process lifetime.
LOG_FILE = f"{LOG_DIR}/sub3_{datetime.now().strftime('%Y%m%d_%H%M')}.log"
LOG_FH = open(LOG_FILE, "a", encoding="utf-8")
def log(msg: str, telegram: bool = False) -> None:
    """Write a timestamped line to stdout and the log file; optionally push
    the raw message (truncated to 4000 characters) to Telegram.

    Args:
        msg: message text.
        telegram: when true and both TG_TOKEN and TG_CHAT are set, also send
            the message through the Telegram Bot API. Delivery is best-effort.
    """
    stamp = datetime.now().isoformat(timespec='seconds')
    entry = f"[{stamp}] {msg}"
    print(entry, flush=True)
    LOG_FH.write(entry + "\n")
    LOG_FH.flush()

    if not (telegram and TG_TOKEN and TG_CHAT):
        return
    endpoint = f"https://api.telegram.org/bot{TG_TOKEN}/sendMessage"
    payload = {"chat_id": TG_CHAT, "text": msg[:4000]}
    try:
        requests.post(endpoint, data=payload, timeout=8)
    except Exception:
        # A failed notification must never interrupt the scrape.
        pass
# ── HTTP session ──────────────────────────────────────────────────────────
# One shared session for all player pages, with the browser-like headers set once.
SESSION = requests.Session()
SESSION.headers.update({"User-Agent": UA, "Accept-Language": "hr,en;q=0.7"})


def fetch_html(url: str, timeout: int = 20) -> str | None:
    """GET *url* through the shared session.

    Returns:
        The response body for a 200 response; ``None`` for any other status
        or when the request raised (both cases are logged).
    """
    try:
        resp = SESSION.get(url, timeout=timeout)
        if resp.status_code == 200:
            return resp.text
        log(f" HTTP {resp.status_code} {url}")
        return None
    except Exception as e:
        log(f" fetch fail {url}: {e}")
        return None
# ── Parsers ───────────────────────────────────────────────────────────────
def _strip_html(s: str) -> str:
s = re.sub(r"<[^>]+>", " ", s)
return re.sub(r"\s+", " ", s).strip()
def parse_profile(html: str) -> dict:
"""Extract player profile meta (HNS exposes only birth date / city / jersey / current club)."""
out = {
"broj_dresa": None,
"datum_rodenja": None,
"mjesto_rodenja": None,
"klub_hns_id": None,
"klub_naziv": None,
}
# playerHeader block (everything from header to first <!--)
m = re.search(r'<div class="block playerHeader"[^>]*>(.*?)<!--', html, re.DOTALL)
header_html = m.group(1) if m else html
# Jersey number
m = re.search(r'<span class="number"[^>]*>(\d+)</span>', header_html)
if not m:
# fallback: number in playerHeader text region (first standalone digit before name)
text = _strip_html(header_html)
mm = re.match(r'^\s*(\d{1,2})\s+[A-ZČĆŠŽĐ]', text)
if mm:
out["broj_dresa"] = int(mm.group(1))
else:
out["broj_dresa"] = int(m.group(1))
# Trenutni klub (first /klubovi/ link in header)
m = re.search(r'<a[^>]+href="/klubovi/(\d+)/([\w-]+)/?"[^>]*>([^<]+)<', header_html)
if m:
out["klub_hns_id"] = m.group(1)
out["klub_naziv"] = m.group(3).strip()
# Datum rođenja (dd.mm.yyyy.)
m = re.search(r'(\d{1,2})\.(\d{1,2})\.(\d{4})\.\s*(?:</[^>]+>\s*)?(?:<[^>]+>\s*)*\(?\s*\d+\s*godin', header_html)
if not m:
# Looser pattern in playerData
m = re.search(r'<div[^>]*class="[^"]*birth[^"]*"[^>]*>(\d{1,2})\.(\d{1,2})\.(\d{4})', header_html)
if not m:
# Fallback: any dd.mm.yyyy. near "Datum rođenja"
text = _strip_html(header_html)
mm = re.search(r'(\d{1,2})\.(\d{1,2})\.(\d{4})\.\s*\(?\s*\d+\s*godin[ae]?\)?\s*Datum rođenja', text)
if mm:
m = mm
if m:
try:
out["datum_rodenja"] = date(int(m.group(3)), int(m.group(2)), int(m.group(1)))
except Exception:
pass
# Mjesto rođenja: text right before "Mjesto rođenja"
text_all = _strip_html(header_html)
mm = re.search(r'([A-ZČĆŠŽĐ][\w\sčćšžđČĆŠŽĐ\-]{1,80}?)\s+Mjesto rođenja', text_all)
if mm:
out["mjesto_rodenja"] = mm.group(1).strip()
return out
# Each season block: "{YYYY/YY} Statistika Utakmice ... <playerCompetitionStatsTable> ... <matchlist>"
# We split player_profile_matches by the recurring pattern.
# Matches a season header of the form "YYYY/YY Statistika Utakmice" (case-insensitive),
# with an optional single tag before and after the season label; group 1 is the season.
# NOTE(review): not referenced in the visible part of this file — confirm it is still needed.
SEASON_HEADER_RE = re.compile(
r'(?:<[^>]+>\s*)?(20\d{2}/\d{2})(?:\s*<[^>]+>)?\s*Statistika\s+Utakmice',
re.IGNORECASE,
)
def parse_seasons_and_matches(html: str) -> tuple[list[dict], list[dict]]:
    """Parse every season section of a player page.

    Args:
        html: full HTML of the player page.

    Returns:
        ``(season_rows, match_rows)``. Season rows carry ``sezona``,
        ``natjecanje``, ``nastupi``, ``startna``, ``zamjena``, ``golovi``,
        ``zuti``, ``crveni``; match rows carry ``datum``, ``domacin``, ``gost``,
        ``rezultat``, ``natjecanje``, ``golovi``, ``zuti``, ``crveni``,
        ``minute_do``. Both lists are empty when the matches block is missing.
    """
    # Restrict parsing to the player_profile_matches block to avoid noise.
    container = re.search(
        r'<div class="block w1280 matchlist style1 player_profile_matches"[^>]*>(.*?)(?=<!--|<footer)',
        html, re.DOTALL,
    )
    if container is None:
        return [], []
    block = container.group(1)

    # Season headers: <h2 class="seasonTitle ...">YYYY/YY</h2>, then any <h2>
    # carrying a season label, then the plain-text fallback parser.
    headers = list(re.finditer(
        r'<h2\s+class="seasonTitle[^"]*"[^>]*>\s*(20\d{2}/\d{2})\s*</h2>',
        block,
    ))
    if not headers:
        headers = list(re.finditer(r'<h2[^>]*>\s*(20\d{2}/\d{2})\s*</h2>', block))
    if not headers:
        flattened = re.sub(r'\s+', ' ', re.sub(r'<[^>]+>', ' ', block))
        return _parse_plain(flattened)

    # A section runs from one header to the next (or to the end of the block).
    cut_points = [h.start() for h in headers] + [len(block)]
    sections = [
        (h.group(1), block[cut_points[idx]:cut_points[idx + 1]])
        for idx, h in enumerate(headers)
    ]

    season_rows: list[dict] = []
    match_rows: list[dict] = []
    for sezona, section in sections:
        # ── Per-competition stats (playerCompetitionStatsTable) ──
        stats_block = re.search(
            r'<div class="block w1280 playerCompetitionStatsTable"[^>]*>(.*?)</div>\s*</div>\s*</div>',
            section, re.DOTALL,
        )
        if stats_block:
            # The table is built from divs, so rows are read from the
            # flattened text as "<label> <int> x6".
            stats_text = _strip_html(stats_block.group(1))
            for hit in re.finditer(
                r'([A-ZČĆŠŽĐa-zčćšžđ0-9][^|]*?)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)(?=\s|$)',
                stats_text,
            ):
                label = hit.group(1).strip()
                # Only per-competition rows are kept: drop the totals row
                # and anything that still contains header words.
                if label.lower().startswith("ukupno"):
                    continue
                if any(word in label for word in ("Nastupi", "Započeo", "Statistika")):
                    continue
                try:
                    season_rows.append({
                        "sezona": sezona,
                        "natjecanje": label[:200],
                        "nastupi": int(hit.group(2)),
                        "startna": int(hit.group(3)),
                        "zamjena": int(hit.group(4)),
                        "golovi": int(hit.group(5)),
                        "zuti": int(hit.group(6)),
                        "crveni": int(hit.group(7)),
                    })
                except Exception:
                    pass

        # ── Matches (matchlist style2) ──
        match_list = re.search(
            r'<div class="matchlist style2 semafor player[^"]*"[^>]*>(.*?)</ul>',
            section, re.DOTALL,
        )
        if not match_list:
            continue
        for item in re.finditer(
            r'<li class="row[^"]*"[^>]*data-match="(\d+)"[^>]*>(.*?)</li>',
            match_list.group(1), re.DOTALL,
        ):
            cell = item.group(2)
            date_m = re.search(r'<div class="date">([^<]+)</div>', cell)
            home_m = re.search(r'<div class="club1"[^>]*>\s*<a[^>]*>([^<]+?)<', cell)
            away_m = re.search(r'<div class="club2"[^>]*>\s*<a[^>]*>([^<]+?)<', cell)
            home_goals = re.search(r'<div class="res1">(\d+)</div>', cell)
            away_goals = re.search(r'<div class="res2">(\d+)</div>', cell)
            round_m = re.search(r'<div class="competitionround">([^<]+)</div>', cell)
            goals_m = re.search(r'<div class="goals">(\d+)</div>', cell)
            # cards are rendered as "Y / R"
            cards_m = re.search(r'<div class="cards">.*?(\d+)\s*/\s*(\d+).*?</div>', cell, re.DOTALL)
            minutes_m = re.search(r'<div class="minutes">(\d+)</div>', cell)

            # Date cell is "dd.mm.yyyy. HH:MM"; only the date part is kept.
            datum = None
            if date_m:
                dmy = re.search(r'(\d{1,2})\.(\d{1,2})\.(\d{4})', date_m.group(1))
                if dmy:
                    try:
                        datum = date(int(dmy.group(3)), int(dmy.group(2)), int(dmy.group(1)))
                    except Exception:
                        pass

            if home_goals and away_goals:
                rezultat = f"{home_goals.group(1)}:{away_goals.group(1)}"
            else:
                rezultat = None

            match_rows.append({
                "datum": datum,
                "domacin": (home_m.group(1).strip() if home_m else "")[:120],
                "gost": (away_m.group(1).strip() if away_m else "")[:120],
                "rezultat": rezultat,
                "natjecanje": (round_m.group(1).strip() if round_m else "")[:200],
                "golovi": int(goals_m.group(1)) if goals_m else 0,
                "zuti": int(cards_m.group(1)) if cards_m else 0,
                "crveni": int(cards_m.group(2)) if cards_m else 0,
                "minute_do": int(minutes_m.group(1)) if minutes_m else None,
            })
    return season_rows, match_rows
def _parse_plain(plain_text: str) -> tuple[list[dict], list[dict]]:
"""Fallback: parse from already-stripped plain text (no match-row HTML access)."""
# Best effort: extract season totals only
season_rows: list[dict] = []
# Split by season headers
parts = re.split(r'(20\d{2}/\d{2})\s+Statistika\s+Utakmice', plain_text)
# parts: [pre, season1, body1, season2, body2, ...]
for i in range(1, len(parts), 2):
sezona = parts[i]
body = parts[i + 1] if i + 1 < len(parts) else ""
# Find the "Ukupno N N N G Y R" then per-competition lines
for rm in re.finditer(
r'([A-ZČĆŠŽĐa-zčćšžđ0-9][^|]*?)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)(?=\s|$)',
body[:3000],
):
label = rm.group(1).strip()
if label.lower().startswith("ukupno"):
continue
if "Nastupi" in label or "Statistika" in label:
continue
season_rows.append({
"sezona": sezona,
"natjecanje": label[:200],
"nastupi": int(rm.group(2)),
"startna": int(rm.group(3)),
"zamjena": int(rm.group(4)),
"golovi": int(rm.group(5)),
"zuti": int(rm.group(6)),
"crveni": int(rm.group(7)),
})
return season_rows, []
# ── DB ────────────────────────────────────────────────────────────────────
def db_conn():
    """Open a psycopg2 connection to DSN with autocommit switched on."""
    connection = psycopg2.connect(DSN)
    connection.autocommit = True
    return connection
def get_targets(conn, limit: int, days: int, force_player: str | None = None) -> list[dict]:
    """Select the clanovi rows to scrape.

    Args:
        conn: open DB connection.
        limit: maximum number of rows (ignored when *force_player* is given).
        days: a row qualifies when it was never scraped or was last scraped
            more than this many days ago.
        force_player: when set, return just the row with this hns_igrac_id.

    Returns:
        Rows as dicts with ``id``, ``hns_igrac_id``, ``ime``, ``prezime``,
        ``source_url`` and ``slug``; never-scraped rows come first.
    """
    if force_player:
        query = """
            SELECT id, hns_igrac_id, ime, prezime, source_url, slug
            FROM pgz_sport.clanovi
            WHERE hns_igrac_id = %s
            LIMIT 1
            """
        params = (force_player,)
    else:
        query = """
            SELECT id, hns_igrac_id, ime, prezime, source_url, slug
            FROM pgz_sport.clanovi
            WHERE hns_igrac_id IS NOT NULL
            AND (last_scraped_at IS NULL OR last_scraped_at < now() - %s::interval)
            ORDER BY (last_scraped_at IS NULL) DESC, id ASC
            LIMIT %s
            """
        params = (f"{days} days", limit)

    with conn.cursor(cursor_factory=RealDictCursor) as cur:
        cur.execute(query, params)
        return cur.fetchall()
def update_clan(conn, clan_id: int, profile: dict, url: str) -> None:
    """Write scraped profile fields onto one clanovi row.

    Profile values only fill columns that are still empty (COALESCE), while
    ``source_url``, ``last_scraped_at`` and ``source_synced_at`` are always
    overwritten. Birth date and birthplace are written to both spellings of
    the column name (``rodenja`` / ``rodjenja``).

    Args:
        conn: open DB connection.
        clan_id: primary key of the row to update.
        profile: dict as produced by ``parse_profile``.
        url: page the data came from.
    """
    # Each entry is (SET fragment, values bound to its placeholders).
    assignments: list[tuple[str, list]] = []

    born = profile.get("datum_rodenja")
    if born:
        assignments.append(("datum_rodenja = COALESCE(datum_rodenja, %s)", [born]))
        assignments.append(("datum_rodjenja = COALESCE(datum_rodjenja, %s)", [born]))

    place = profile.get("mjesto_rodenja")
    if place:
        assignments.append(("mjesto_rodenja = COALESCE(NULLIF(mjesto_rodenja,''), %s)", [place]))
        assignments.append(("mjesto_rodjenja = COALESCE(NULLIF(mjesto_rodjenja,''), %s)", [place]))

    jersey = profile.get("broj_dresa")
    if jersey is not None:
        assignments.append(("broj_dresa = COALESCE(broj_dresa, %s)", [jersey]))

    assignments.extend([
        ("source_url = %s", [url]),
        ("source = COALESCE(NULLIF(source,''), 'hns_semafor')", []),
        ("sport = COALESCE(NULLIF(sport,''), 'nogomet')", []),
        ("last_scraped_at = now()", []),
        ("source_synced_at = now()", []),
    ])

    set_clause = ', '.join(fragment for fragment, _ in assignments)
    params = [value for _, values in assignments for value in values]
    params.append(clan_id)
    sql = f"UPDATE pgz_sport.clanovi SET {set_clause} WHERE id = %s"
    with conn.cursor() as cur:
        cur.execute(sql, tuple(params))
def upsert_seasons(conn, hns_id: str, clan_id: int, url: str, rows: list[dict]) -> int:
    """Upsert per-season, per-competition stats for one player.

    Args:
        conn: open DB connection.
        hns_id: HNS player id.
        clan_id: primary key of the player's clanovi row.
        url: page the rows were scraped from.
        rows: season dicts as produced by ``parse_seasons_and_matches``.

    Returns:
        Number of distinct rows sent to the database (after de-duplication),
        matching what ``upsert_matches`` reports; 0 for empty input.
    """
    if not rows:
        return 0
    # klub_hns_id / klub_naziv are not available from this parser (None);
    # asistencije and minute are sent as 0.
    raw = [
        (hns_id, clan_id, r["sezona"], None, None, r["natjecanje"][:200],
         r.get("nastupi", 0), r.get("startna", 0), r.get("zamjena", 0),
         r.get("golovi", 0), 0, r.get("zuti", 0), r.get("crveni", 0), 0, url)
        for r in rows
    ]
    # Dedupe on the conflict key (hns_igrac_id, sezona, klub_hns_id, natjecanje),
    # keeping the last occurrence: one INSERT ... ON CONFLICT DO UPDATE statement
    # must not touch the same row twice.
    dedup: dict[tuple, tuple] = {}
    for row in raw:
        dedup[(row[0], row[2], row[3], row[5])] = row
    data = list(dedup.values())
    # NOTE(review): klub_hns_id is always NULL here. With a plain UNIQUE
    # constraint NULLs never compare equal, so ON CONFLICT would not fire and
    # re-runs would insert duplicates — confirm how the unique index is defined.
    with conn.cursor() as cur:
        execute_values(cur, """
            INSERT INTO pgz_sport.hns_player_seasons
            (hns_igrac_id, clan_id, sezona, klub_hns_id, klub_naziv, natjecanje,
            nastupi, startna, zamjena, golovi, asistencije, zuti, crveni, minute, source_url)
            VALUES %s
            ON CONFLICT (hns_igrac_id, sezona, klub_hns_id, natjecanje) DO UPDATE SET
            nastupi = EXCLUDED.nastupi,
            startna = EXCLUDED.startna,
            zamjena = EXCLUDED.zamjena,
            golovi = EXCLUDED.golovi,
            zuti = EXCLUDED.zuti,
            crveni = EXCLUDED.crveni,
            source_url = EXCLUDED.source_url,
            scraped_at = now()
            """, data)
    # Count what was actually sent, not the raw input that may hold duplicates.
    return len(data)
def upsert_matches(conn, hns_id: str, clan_id: int, url: str, rows: list[dict]) -> int:
    """Upsert match rows for one player.

    Rows without a date, home team or away team are dropped. The remaining
    rows are de-duplicated on (hns_igrac_id, datum, domacin, gost), keeping
    the last occurrence.

    Returns:
        Number of distinct rows sent to the database; 0 when nothing qualified.
    """
    if not rows:
        return 0

    by_key: dict[tuple, tuple] = {}
    for r in rows:
        if not (r["datum"] and r["domacin"] and r["gost"]):
            continue
        # pozicija, startna and minute_od are unknown (None); asistencije is sent as 0.
        record = (
            hns_id, clan_id, r["datum"], r["natjecanje"], r["domacin"], r["gost"],
            r["rezultat"], None, None, None, r.get("minute_do"),
            r.get("golovi", 0), 0, r.get("zuti", 0), r.get("crveni", 0), url,
        )
        by_key[(hns_id, r["datum"], r["domacin"], r["gost"])] = record

    data = list(by_key.values())
    if not data:
        return 0

    with conn.cursor() as cur:
        execute_values(cur, """
            INSERT INTO pgz_sport.hns_player_matches
            (hns_igrac_id, clan_id, datum, natjecanje, domacin, gost,
            rezultat, pozicija, startna, minute_od, minute_do,
            golovi, asistencije, zuti, crveni, source_url)
            VALUES %s
            ON CONFLICT (hns_igrac_id, datum, domacin, gost) DO UPDATE SET
            rezultat = EXCLUDED.rezultat,
            natjecanje = EXCLUDED.natjecanje,
            minute_do = EXCLUDED.minute_do,
            golovi = EXCLUDED.golovi,
            zuti = EXCLUDED.zuti,
            crveni = EXCLUDED.crveni,
            source_url = EXCLUDED.source_url,
            scraped_at = now()
            """, data)
    return len(data)
# ── Slug helper ───────────────────────────────────────────────────────────
def slugify(text: str) -> str:
    """Turn a name into a lowercase ASCII, hyphen-separated URL slug.

    Croatian letters are transliterated (č, ć → c; ž → z; š → s; đ → d), every
    other character outside ``a-z0-9``, whitespace and ``-`` is dropped, and
    whitespace runs become a single hyphen.

    Args:
        text: free text, typically "<ime> <prezime>".

    Returns:
        The slug, or an empty string for empty input.
    """
    if not text:
        return ""
    # Text is lower-cased first, so only the lowercase letters need mapping.
    # Both arguments must line up character by character.
    repl = str.maketrans("čćžšđ", "cczsd")
    t = text.lower().translate(repl)
    t = re.sub(r"[^a-z0-9\s-]", "", t)
    return re.sub(r"\s+", "-", t).strip("-")
def build_url(t: dict) -> str:
    """Return the player page URL for a clanovi row.

    A stored ``source_url`` that already points at a semafor player page is
    used as is. Otherwise the URL is built from ``hns_igrac_id`` and the
    stored slug, falling back to a slug derived from the name and finally to "x".
    """
    stored = t.get("source_url")
    if stored and "semafor.hns.family/igraci/" in stored:
        return stored

    slug = t.get("slug")
    if not slug:
        slug = slugify(f"{t['ime']} {t['prezime']}")
    if not slug:
        slug = "x"
    return f"https://semafor.hns.family/igraci/{t['hns_igrac_id']}/{slug}/"
# ── Driver ────────────────────────────────────────────────────────────────
def process_one(conn, t: dict) -> dict:
    """Scrape one player page and store everything parsed from it.

    Args:
        conn: open DB connection.
        t: clanovi row (``id``, ``hns_igrac_id``, ``ime``, ``prezime``,
            ``source_url``, ``slug``).

    Returns:
        ``{"profile": bool, "seasons": int, "matches": int, "fields": int}`` —
        whether a profile page was found, how many season/match rows were
        upserted and how many profile fields carried a value.
    """
    url = build_url(t)
    html = fetch_html(url)

    page_ok = bool(html) and "playerHeader" in html
    if not page_ok:
        log(f" ✗ no playerHeader for {t['ime']} {t['prezime']} ({t['hns_igrac_id']}) → {url}")
        # Stamp the row anyway so a broken URL is not retried on every run.
        with conn.cursor() as cur:
            cur.execute(
                "UPDATE pgz_sport.clanovi SET last_scraped_at = now() WHERE id = %s",
                (t["id"],),
            )
        return {"profile": False, "seasons": 0, "matches": 0, "fields": 0}

    profile = parse_profile(html)
    seasons, matches = parse_seasons_and_matches(html)

    update_clan(conn, t["id"], profile, url)
    filled = [k for k in ("datum_rodenja", "mjesto_rodenja", "broj_dresa") if profile.get(k)]
    season_count = upsert_seasons(conn, t["hns_igrac_id"], t["id"], url, seasons)
    match_count = upsert_matches(conn, t["hns_igrac_id"], t["id"], url, matches)
    return {
        "profile": True,
        "seasons": season_count,
        "matches": match_count,
        "fields": len(filled),
    }
def main() -> int:
    """Command-line entry point of the deep player scraper.

    Selects target players (by staleness, by ``--player``, or with
    ``--missing-matches`` those without any match row), scrapes each one with
    a pause of SLEEP seconds in between, logs a summary and appends a run
    report to the SUB3 result file.

    Returns:
        0 (process exit status); per-player failures are counted, not raised.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--limit", type=int, default=200)
    ap.add_argument("--days", type=int, default=7)
    ap.add_argument("--player", help="Single HNS ID (debug)")
    ap.add_argument("--missing-matches", action="store_true",
                    help="Only target clanovi without rows in hns_player_matches")
    ap.add_argument("--no-telegram", action="store_true")
    args = ap.parse_args()

    log(f"SUB3 deep scraper start | limit={args.limit} | days={args.days} | "
        f"missing_matches={args.missing_matches} | log={LOG_FILE}",
        telegram=not args.no_telegram)

    conn = db_conn()
    try:
        if args.missing_matches:
            # --missing-matches takes precedence over --days / --player.
            with conn.cursor(cursor_factory=RealDictCursor) as cur:
                cur.execute("""
                    SELECT id, hns_igrac_id, ime, prezime, source_url, slug
                    FROM pgz_sport.clanovi
                    WHERE hns_igrac_id IS NOT NULL
                    AND id NOT IN (
                    SELECT clan_id FROM pgz_sport.hns_player_matches WHERE clan_id IS NOT NULL
                    )
                    ORDER BY id ASC
                    LIMIT %s
                    """, (args.limit,))
                targets = cur.fetchall()
        else:
            targets = get_targets(conn, args.limit, args.days, args.player)
        log(f"Targets: {len(targets)}")

        stats = {"scraped": 0, "seasons": 0, "matches": 0, "fields": 0, "errors": 0}
        t0 = time.time()
        for i, t in enumerate(targets, 1):
            try:
                r = process_one(conn, t)
                stats["scraped"] += 1
                stats["seasons"] += r["seasons"]
                stats["matches"] += r["matches"]
                stats["fields"] += r["fields"]
                # Progress line every 10th player and whenever matches were stored.
                if i % 10 == 0 or r["matches"] > 0:
                    log(f" [{i}/{len(targets)}] {t['ime']} {t['prezime']} "
                        f"→ seasons +{r['seasons']} matches +{r['matches']} fields +{r['fields']} "
                        f"(totals: s={stats['seasons']} m={stats['matches']})")
            except Exception as e:
                # One failing player must not stop the batch.
                stats["errors"] += 1
                log(f" ✗ ERROR {t['ime']} {t['prezime']} ({t['hns_igrac_id']}): {e}")
                log(traceback.format_exc()[:500])
            time.sleep(SLEEP)

        dur = time.time() - t0
        summary = (
            f"SUB3 done in {dur:.0f}s | scraped={stats['scraped']} "
            f"seasons +{stats['seasons']} matches +{stats['matches']} "
            f"fields +{stats['fields']} errors={stats['errors']}"
        )
        log(summary, telegram=not args.no_telegram)

        # Result file (appended to on every run)
        res_path = "/opt/pgz-sport/cc_tasks/SUB3_RESULT.md"
        with open(res_path, "a", encoding="utf-8") as f:
            f.write(f"\n## Run {datetime.now().isoformat(timespec='seconds')}\n")
            f.write(f"- batch_limit: {args.limit}\n")
            f.write(f"- targets: {len(targets)}\n")
            f.write(f"- scraped: {stats['scraped']}\n")
            f.write(f"- seasons +{stats['seasons']}\n")
            f.write(f"- matches +{stats['matches']}\n")
            f.write(f"- profile fields enriched: +{stats['fields']}\n")
            f.write(f"- errors: {stats['errors']}\n")
            f.write(f"- duration: {dur:.0f}s\n")
            f.write(f"- log: {LOG_FILE}\n")
        return 0
    finally:
        # Release the DB connection on every exit path.
        conn.close()
if __name__ == "__main__":
    # main() returns the process exit status.
    exit_status = main()
    sys.exit(exit_status)
+36
View File
@@ -0,0 +1,36 @@
#!/usr/bin/env python3
"""Dohvaća sezone i utakmice za HNS igrača preko Playwrighta."""
import json, sys, time
from playwright.sync_api import sync_playwright
# Usage: hns_player_stats.py <hns_igrac_id>
if len(sys.argv) < 2:
    print("Koristi: hns_player_stats.py <hns_igrac_id>")
    sys.exit(1)

hns_id = sys.argv[1]
url = f"https://semafor.hns.family/igraci/{hns_id}/"

with sync_playwright() as p:
    browser = p.chromium.launch(headless=True)
    page = browser.new_page()
    page.goto(url, wait_until='networkidle', timeout=30000)

    # Read the JSON payload embedded in the #__NEXT_DATA__ element
    next_data = page.inner_text('#__NEXT_DATA__')
    data = json.loads(next_data)
    browser.close()

# Pull seasons and matches out of the page props
props = data['props']['pageProps']
player = props.get('player', {})
seasons = player.get('seasons', [])
matches = props.get('matches', [])

print(f"Igrač: {player.get('name', '')} {player.get('surname', '')}")
print(f"Sezona: {len(seasons)}")
for season in seasons:
    identity = f" {season.get('season','?')} {season.get('competition','')} {season.get('clubName','')} "
    totals = f"N:{season.get('apps',0)} G:{season.get('goals',0)} A:{season.get('assists',0)}"
    print(identity + totals)

print(f"\nUtakmica: {len(matches)}")
# Only the first five matches are shown
for match in matches[:5]:
    print(f" {match.get('date','')} {match.get('homeTeam','')} vs {match.get('awayTeam','')} {match.get('result','')}")
+50 -16
View File
@@ -1,24 +1,58 @@
#!/bin/bash
# Wrapper za HNS harvester koji uvijek koristi sistemski python3 (ima psycopg2)
# Damir-friendly - ignorira venv aktivaciju
# Wrapper za HNS harvester - koristi sistemski python3 (psycopg2)
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
PYTHON="/usr/bin/python3"
case "$1" in
master) /usr/bin/python3 "$SCRIPT_DIR/hns_master_harvester.py" "${@:2}" ;;
deep) /usr/bin/python3 "$SCRIPT_DIR/hns_player_deep.py" "${@:2}" ;;
avatar) /usr/bin/python3 "$SCRIPT_DIR/hns_avatar_harvester.py" "${@:2}" ;;
season) /usr/bin/python3 "$SCRIPT_DIR/hns_season_retry.py" "${@:2}" ;;
watchdog) /usr/bin/python3 "$SCRIPT_DIR/hns_watchdog.py" "${@:2}" ;;
objekti) /usr/bin/python3 "$SCRIPT_DIR/objekti_enrich_address.py" "${@:2}" ;;
"") echo "Usage: $0 {master|deep|avatar|season|watchdog|objekti} [args]"
master) $PYTHON "$SCRIPT_DIR/hns_master_harvester.py" "${@:2}" ;;
deep) $PYTHON "$SCRIPT_DIR/hns_player_deep.py" "${@:2}" ;;
# Pojedinačni igrač: sve u jednom potezu
player)
if [ -z "$2" ]; then
echo "Greška: potreban ID igrača. Primjer: $0 player 86290"
exit 1
fi
echo ">>> Osnovni podaci za igrača $2"
$PYTHON "$SCRIPT_DIR/hns_master_harvester.py" --single-player "$2"
echo ">>> Sezone i utakmice za igrača $2"
$PYTHON "$SCRIPT_DIR/hns_player_deep.py" --player "$2"
;;
# Svi klubovi (bez igrača? master svakako povlači i igrače iz tih klubova)
all-clubs)
echo "Dohvat svih klubova (limit 10000)..."
$PYTHON "$SCRIPT_DIR/hns_master_harvester.py" --limit 10000
;;
# Svi igrači sa svim detaljima (klubovi + sezone + utakmice)
all-players)
echo ">>> 1/2 Dohvat svih klubova i osnovnih podataka igrača"
$PYTHON "$SCRIPT_DIR/hns_master_harvester.py" --limit 10000
echo ">>> 2/2 Dohvat sezona i utakmica za sve igrače"
$PYTHON "$SCRIPT_DIR/hns_player_deep.py" --limit 50000
;;
# Kompletno: svi klubovi + svi igrači (all-in-one)
all)
echo "===== FULL HARVEST ====="
$0 all-clubs
$0 all-players
;;
avatar) $PYTHON "$SCRIPT_DIR/hns_avatar_harvester.py" "${@:2}" ;;
season) $PYTHON "$SCRIPT_DIR/hns_season_retry.py" "${@:2}" ;;
watchdog) $PYTHON "$SCRIPT_DIR/hns_watchdog.py" "${@:2}" ;;
objekti) $PYTHON "$SCRIPT_DIR/objekti_enrich_address.py" "${@:2}" ;;
"") echo "Usage: $0 {master|deep|player|all-clubs|all-players|all|avatar|season|watchdog|objekti} [args]"
echo
echo "Primjeri:"
echo " $0 master --limit 100 # Sve PGŽ klubove (~59)"
echo " $0 master --klub-id 2613 # Jedan klub"
echo " $0 master --single-player 436387 # Jedan igrač"
echo " $0 deep # DEEP scrape svih (sezone+utakmice)"
echo " $0 avatar # Avatar slike"
echo " $0 watchdog # Auto-recovery"
echo " $0 player 86290 # Jedan igrač (osnovno+sezone+utakmice)"
echo " $0 all-clubs # Svi klubovi i njihovi osnovni igrači"
echo " $0 all-players # Svi klubovi + svi igrači sa svim sezonama"
echo " $0 all # Kompletno: klubovi i svi detalji igrača"
echo " $0 deep --limit 50000 # Osvježi sezone za 50000 igrača"
;;
*) echo "Unknown command: $1"; exit 1 ;;
esac
esac
+4 -1
View File
@@ -1,4 +1,7 @@
#!/usr/bin/env python3
from dotenv import load_dotenv
load_dotenv('/opt/rinet-gpu/.env.master')
# auto-added by patch_scrapers_with_dotenv.sh
"""HNS sezone retry — pojednostavljen extract."""
import os, time, re, json, sys
from datetime import datetime
@@ -6,7 +9,7 @@ import psycopg2
from psycopg2.extras import RealDictCursor
from playwright.sync_api import sync_playwright
DSN = "host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password=R1net2026!SecureDB#v7"
DSN = f"host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password={os.environ['DB_PASSWORD']}"
def find_seasons_in_obj(obj, found=None):
if found is None: found = []
+112
View File
@@ -0,0 +1,112 @@
#!/usr/bin/env python3
"""HNS sezone retry — pojednostavljen extract."""
import os, time, re, json, sys
from datetime import datetime
import psycopg2
from psycopg2.extras import RealDictCursor
from playwright.sync_api import sync_playwright
DSN = f"host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password={os.environ['DB_PASSWORD']}"
def find_seasons_in_obj(obj, found=None):
    """Collect every dict that has a 'season' or 'sezona' key, at any depth.

    Args:
        obj: decoded JSON value (dict, list or scalar).
        found: list to append to; a new list is created when omitted.

    Returns:
        The list of matching dicts in depth-first, parent-before-child order.
    """
    hits = [] if found is None else found
    if isinstance(obj, dict):
        if any(key in obj for key in ('season', 'sezona')):
            hits.append(obj)
        children = obj.values()
    elif isinstance(obj, list):
        children = obj
    else:
        # Scalars have nothing to descend into.
        children = ()
    for child in children:
        find_seasons_in_obj(child, hits)
    return hits
def main():
    """Retry season extraction for players that have no season rows yet.

    Picks up to 200 players without any row in hns_player_seasons, opens each
    player's page in headless Chromium and stores the seasons found either in
    the embedded __NEXT_DATA__ JSON or, failing that, in the visible page text.
    Progress is printed; per-player failures are reported and skipped.
    """
    conn = psycopg2.connect(DSN)
    conn.autocommit = True
    try:
        with conn.cursor(cursor_factory=RealDictCursor) as cur:
            cur.execute("""
                SELECT c.id AS clan_id, c.hns_igrac_id, c.ime, c.prezime, c.source_url
                FROM pgz_sport.clanovi c
                WHERE c.hns_igrac_id IS NOT NULL
                AND NOT EXISTS (SELECT 1 FROM pgz_sport.hns_player_seasons s WHERE s.hns_igrac_id = c.hns_igrac_id)
                ORDER BY c.id LIMIT 200
                """)
            targets = cur.fetchall()
        print(f"Targets: {len(targets)}", flush=True)

        seasons_added = 0
        with sync_playwright() as pw:
            browser = pw.chromium.launch(headless=True, args=["--no-sandbox","--ignore-certificate-errors"])
            ctx = browser.new_context(ignore_https_errors=True,
                                      user_agent="Mozilla/5.0 (X11; Linux x86_64) Chrome/120.0.0.0")
            page = ctx.new_page()

            for i, t in enumerate(targets):
                url = t['source_url']
                if not url or 'semafor.hns.family/igraci/' not in url:
                    continue
                try:
                    page.goto(url, wait_until="networkidle", timeout=20000)
                    try:
                        page.wait_for_selector('table, .karijera, [class*="season"]', timeout=6000)
                    except Exception:
                        # The selector is optional; parse whatever has loaded.
                        pass
                    time.sleep(0.5)

                    rows = []
                    # Extract from __NEXT_DATA__ if it exists
                    html = page.content()
                    m = re.search(r'__NEXT_DATA__"\s*type="application/json">([^<]+)</script>', html)
                    if m:
                        try:
                            data = json.loads(m.group(1))
                            sezone = find_seasons_in_obj(data)
                            for s in sezone:
                                sezona = s.get('season') or s.get('sezona')
                                if sezona:
                                    rows.append({'sezona': str(sezona), 'klub': '', 'natjecanje': '', 'nastupi': 0, 'golovi': 0})
                        except Exception:
                            # Unusable JSON: fall through to the text fallback.
                            pass

                    # Fallback: "<YYYY/YY> <text> <numbers…>" lines in the page text
                    if not rows:
                        body = page.locator('body').inner_text()
                        for line in body.split('\n'):
                            match = re.match(r'^(20\d{2}/\d{2})\s+(.+?)\s+(\d+(?:\s+\d+)*)\s*$', line.strip())
                            if match:
                                sezona = match.group(1)
                                rest = match.group(2)
                                nums = [int(x) for x in match.group(3).split()]
                                rows.append({
                                    'sezona': sezona, 'klub': rest[:200], 'natjecanje': '',
                                    'nastupi': nums[0] if nums else 0,
                                    'golovi': nums[1] if len(nums) > 1 else 0,
                                })

                    if rows:
                        with conn.cursor() as cur:
                            for r in rows:
                                try:
                                    cur.execute("""
                                        INSERT INTO pgz_sport.hns_player_seasons
                                        (hns_igrac_id, clan_id, sezona, klub_naziv, natjecanje, nastupi, golovi)
                                        VALUES (%s, %s, %s, %s, %s, %s, %s)
                                        ON CONFLICT DO NOTHING
                                        """, (t['hns_igrac_id'], t['clan_id'], r['sezona'], r['klub'],
                                              r['natjecanje'], r['nastupi'], r['golovi']))
                                    # rowcount is 0 when ON CONFLICT skipped the
                                    # row, so only real inserts are counted.
                                    seasons_added += max(cur.rowcount, 0)
                                except Exception as e:
                                    # Keep going with the next row, but say why this one failed.
                                    print(f" ⚠ insert failed {t['hns_igrac_id']} {r['sezona']}: {e}", flush=True)
                        print(f" ✓ [{i}/{len(targets)}] {t['ime']} {t['prezime']}: {len(rows)} sezone (total added: {seasons_added})", flush=True)
                    if i % 20 == 0:
                        print(f" [{i}/{len(targets)}] processed, total added: {seasons_added}", flush=True)
                except Exception as e:
                    print(f" ❌ {t['ime']}: {e}", flush=True)
            browser.close()
        print(f"\nDone. Total sezone added: {seasons_added}")
    finally:
        # Release the DB connection on every exit path.
        conn.close()
# Run the retry pass only when executed as a script.
if __name__ == '__main__':
    main()
+4 -1
View File
@@ -1,11 +1,14 @@
#!/usr/bin/env python3
from dotenv import load_dotenv
load_dotenv('/opt/rinet-gpu/.env.master')
# auto-added by patch_scrapers_with_dotenv.sh
"""HNS sezone v3 — koristi __NEXT_DATA__ JSON parser primarily."""
import os, time, re, json, sys
import psycopg2
from psycopg2.extras import RealDictCursor
from playwright.sync_api import sync_playwright
DSN = "host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password=R1net2026!SecureDB#v7"
DSN = f"host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password={os.environ['DB_PASSWORD']}"
def find_seasons(obj, found=None, depth=0):
if depth > 25: return found or []
+141
View File
@@ -0,0 +1,141 @@
#!/usr/bin/env python3
"""HNS sezone v3 — koristi __NEXT_DATA__ JSON parser primarily."""
import os, time, re, json, sys
import psycopg2
from psycopg2.extras import RealDictCursor
from playwright.sync_api import sync_playwright
DSN = f"host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password={os.environ['DB_PASSWORD']}"
def find_seasons(obj, found=None, depth=0):
    """Collect season-like dicts from a decoded JSON value.

    A dict qualifies when it has a 'sezona' key, or a 'season' key whose value
    is a string or dict. Dict items of a list stored under a career-style key
    (careers, career, seasons, sezone, statistics, stats) also qualify when
    they carry a season/sezona/year/godina key. Each dict object is reported
    once, even when it qualifies through both rules.

    Args:
        obj: decoded JSON value (dict, list or scalar).
        found: list to append to; a new list is created when omitted.
        depth: current nesting level; nothing deeper than 25 is visited.

    Returns:
        The list of qualifying dicts, parents before children.
    """
    if depth > 25:
        return found or []
    if found is None:
        found = []
    # Identity of what has been collected so far: the career-list rule and the
    # recursion would otherwise both append the very same dict.
    seen = {id(entry) for entry in found}

    def add(candidate):
        if id(candidate) not in seen:
            seen.add(id(candidate))
            found.append(candidate)

    def walk(node, level):
        if level > 25:
            return
        if isinstance(node, dict):
            # Detect a season-like dict
            if ('season' in node and isinstance(node.get('season'), (str, dict))) or 'sezona' in node:
                add(node)
            for k, v in node.items():
                # Detect a career object holding a list of seasons
                if k.lower() in ('careers','career','seasons','sezone','statistics','stats') and isinstance(v, list):
                    for item in v:
                        if isinstance(item, dict) and any(kk in item for kk in ('season','sezona','year','godina')):
                            add(item)
                walk(v, level + 1)
        elif isinstance(node, list):
            for item in node:
                walk(item, level + 1)

    walk(obj, depth)
    return found
def normalize_season(s):
    """Convert a season dict of unknown shape into a flat row.

    Season, club and competition are read from the first non-empty of several
    alternative keys; when the value is itself a dict its name-like field is
    used. Counters are taken from the first key whose name contains one of the
    given fragments (case-insensitive).

    Args:
        s: season-like dict taken from the page JSON.

    Returns:
        Dict with ``sezona``, ``klub`` (max 200 chars), ``natjecanje`` (max 100
        chars) and the int counters ``nastupi``, ``startna``, ``zamjena``,
        ``golovi``, ``asistencije``, ``zuti``, ``crveni``, ``minute``.
    """
    sezona = s.get('season') or s.get('sezona') or s.get('year') or s.get('godina') or ''
    if isinstance(sezona, dict):
        sezona = sezona.get('name') or sezona.get('label') or str(sezona.get('year',''))
    sezona = str(sezona)

    klub = s.get('club') or s.get('klub') or s.get('team') or ''
    if isinstance(klub, dict):
        klub = klub.get('name') or klub.get('naziv') or ''

    natj = s.get('competition') or s.get('natjecanje') or s.get('league') or ''
    if isinstance(natj, dict):
        natj = natj.get('name') or natj.get('naziv') or ''

    def num(*keys):
        """Return the int value of the first key containing one of *keys*, else 0."""
        for k in keys:
            for kk in s.keys():
                if k.lower() in kk.lower():
                    v = s[kk]
                    try:
                        return int(v)
                    except Exception:
                        # Not directly convertible: keep only the digits.
                        # (Exception, not a bare except, so Ctrl-C still works.)
                        try:
                            return int(re.sub(r'\D','', str(v)) or 0)
                        except Exception:
                            return 0
        return 0

    return {
        'sezona': sezona, 'klub': str(klub)[:200], 'natjecanje': str(natj)[:100],
        'nastupi': num('matches','nastup','appearance'),
        'startna': num('start'),
        'zamjena': num('sub','zamjen'),
        'golovi': num('goal','gol'),
        'asistencije': num('assist','asist'),
        'zuti': num('yellow','žut','zut'),
        'crveni': num('red','crv'),
        'minute': num('minute','minut','min'),
    }
def main():
    """Scrape season rows for players that have none yet and store them.

    Selects up to 200 ``pgz_sport.clanovi`` rows with an ``hns_igrac_id`` and
    no row in ``pgz_sport.hns_player_seasons``, loads each player's
    ``source_url`` in headless Chromium, extracts the ``__NEXT_DATA__`` JSON,
    and inserts every normalized season with a non-empty ``sezona``.

    Failures for a single player or row are reported on stdout and skipped, so
    one bad page cannot stop the run.
    """
    conn = psycopg2.connect(DSN); conn.autocommit = True
    try:
        with conn.cursor(cursor_factory=RealDictCursor) as cur:
            cur.execute("""
                SELECT c.id AS clan_id, c.hns_igrac_id, c.ime, c.prezime, c.source_url
                FROM pgz_sport.clanovi c
                WHERE c.hns_igrac_id IS NOT NULL
                AND NOT EXISTS (SELECT 1 FROM pgz_sport.hns_player_seasons s WHERE s.hns_igrac_id = c.hns_igrac_id)
                ORDER BY c.id LIMIT 200
            """)
            targets = cur.fetchall()
        print(f"Targets: {len(targets)}", flush=True)
        seasons_added = 0
        with sync_playwright() as pw:
            browser = pw.chromium.launch(headless=True, args=["--no-sandbox","--ignore-certificate-errors"])
            try:
                page = browser.new_context(ignore_https_errors=True,
                    user_agent="Mozilla/5.0 (X11; Linux x86_64) Chrome/120.0.0.0").new_page()
                for i, t in enumerate(targets):
                    url = t['source_url']
                    if not url or 'semafor.hns.family/igraci/' not in url:
                        continue
                    try:
                        page.goto(url, wait_until="networkidle", timeout=20000)
                        time.sleep(0.8)
                        html = page.content()
                        rows = []
                        # Extract __NEXT_DATA__
                        m = re.search(r'__NEXT_DATA__"\s*type="application/json">([^<]+)</script>', html)
                        if m:
                            try:
                                data = json.loads(m.group(1))
                                for s in find_seasons(data):
                                    n = normalize_season(s)
                                    if n['sezona']:
                                        rows.append(n)
                            except Exception as e:
                                print(f" ! [{i}/{len(targets)}] {url}: cannot parse __NEXT_DATA__: {e}", flush=True)
                        # Insert
                        if rows:
                            inserted = 0
                            with conn.cursor() as cur:
                                for r in rows:
                                    try:
                                        cur.execute("""
                                            INSERT INTO pgz_sport.hns_player_seasons
                                            (hns_igrac_id, clan_id, sezona, klub_naziv, natjecanje,
                                            nastupi, startna, zamjena, golovi, asistencije, zuti, crveni, minute)
                                            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
                                            ON CONFLICT DO NOTHING
                                        """, (t['hns_igrac_id'], t['clan_id'], r['sezona'], r['klub'], r['natjecanje'],
                                              r['nastupi'], r['startna'], r['zamjena'], r['golovi'],
                                              r['asistencije'], r['zuti'], r['crveni'], r['minute']))
                                        # rowcount is 0 when ON CONFLICT skipped
                                        # the row; count only real inserts.
                                        inserted += max(cur.rowcount, 0)
                                    except Exception as e:
                                        print(f" ! [{i}/{len(targets)}] insert {r['sezona']} failed: {e}", flush=True)
                            seasons_added += inserted
                            print(f" ✓ [{i}/{len(targets)}] {t['ime']} {t['prezime']}: +{inserted} sezone (total: {seasons_added})", flush=True)
                        if i % 30 == 0 and i > 0:
                            print(f" [{i}/{len(targets)}] processed, total: {seasons_added}", flush=True)
                    except Exception as e:
                        # Best-effort per player: report and move on.
                        print(f" ! [{i}/{len(targets)}] {url}: {e}", flush=True)
            finally:
                browser.close()
        print(f"\n✅ Done. Total: {seasons_added}", flush=True)
    finally:
        conn.close()


if __name__ == '__main__':
    main()
+4 -1
View File
@@ -1,4 +1,7 @@
#!/usr/bin/env python3
from dotenv import load_dotenv
load_dotenv('/opt/rinet-gpu/.env.master')
# auto-added by patch_scrapers_with_dotenv.sh
# ──────────────────────────────────────────────────────────────────────────────
# hns_watchdog.py — PGŽ Sport HNS pipeline watchdog (SUB7)
# Author : Damir Radulić <dradulic@outlook.com> / <damir@rinet.one>
@@ -32,7 +35,7 @@ import requests
# ── Config ────────────────────────────────────────────────────────────────────
DSN = os.getenv(
"RINET_DSN",
"host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password=R1net2026!SecureDB#v7",
f"host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password={os.environ['DB_PASSWORD']}",
)
TG_TOKEN = os.getenv("TG_BOT_TOKEN", "8535797835:AAFItT-92jzZ9NWFafLxn0dLa1_n2s-JE5Y")
TG_CHAT = os.getenv("TG_CHAT", "7969491558")
+340
View File
@@ -0,0 +1,340 @@
#!/usr/bin/env python3
# ──────────────────────────────────────────────────────────────────────────────
# hns_watchdog.py — PGŽ Sport HNS pipeline watchdog (SUB7)
# Author : Damir Radulić <dradulic@outlook.com> / <damir@rinet.one>
# Date : 2026-05-05
# Version: 1.0.0
# Purpose: Periodically poll DB progress for the HNS scraping pipeline,
# detect stalls, restart fallen worker processes and send Telegram
# status updates every 30 minutes. Fires a special "ALL DONE" alert
# once the mission goal is reached.
#
# Modes : --once run a single check and exit (cron-friendly)
# --daemon loop forever, sleeping CHECK_INTERVAL_SEC between checks
#
# Goal : 59/59 PGŽ financirani klubovi sa hns_klub_id, ≥80% igrača s
# profile_complete=true (visina_cm IS NOT NULL), ≥1000 utakmica.
# ──────────────────────────────────────────────────────────────────────────────
import os
import sys
import time
import json
import argparse
import logging
import logging.handlers
import subprocess
from datetime import datetime, timedelta
from pathlib import Path
import psycopg2
import requests
# ── Config ────────────────────────────────────────────────────────────────────
# Connection string: RINET_DSN wins when set; otherwise it is built from
# DB_PASSWORD.  The fallback is only evaluated when RINET_DSN is missing or
# empty, so a deployment that provides RINET_DSN does not also need DB_PASSWORD
# (a default passed to os.getenv() is always evaluated and would raise KeyError).
DSN = os.getenv("RINET_DSN") or (
    f"host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password={os.environ['DB_PASSWORD']}"
)
# SECURITY NOTE(review): the fallback below is a live bot token committed to the
# repository.  It is kept so existing deployments keep working, but it should
# be rotated and supplied only through TG_BOT_TOKEN.
TG_TOKEN = os.getenv("TG_BOT_TOKEN", "8535797835:AAFItT-92jzZ9NWFafLxn0dLa1_n2s-JE5Y")
TG_CHAT = os.getenv("TG_CHAT", "7969491558")
LOG_DIR = Path("/var/log/pgz-sport-debug")
LOG_FILE = LOG_DIR / "hns_watchdog.log"
STATE_FILE = LOG_DIR / "hns_watchdog_state.json"
CHECK_INTERVAL_SEC = 30 * 60 # 30 min between daemon iterations
STALL_WINDOW_SEC = 30 * 60 # consider stale if no growth in 30 min
DONE_FLAG_FILE = LOG_DIR / "hns_watchdog_DONE.flag"
# Mission targets
TARGET_KLUBOVI = 59
TARGET_PROFILE_PCT = 0.80
TARGET_MATCHES = 1000
# Worker processes to keep alive (process_name : restart_command)
WORKERS = {
"hns_master_harvester": [
"python3", "/opt/pgz-sport/scripts/hns_master_harvester.py",
],
"hns_season_v3": [
"python3", "/opt/pgz-sport/scripts/hns_season_v3.py",
],
}
# ── Logging ───────────────────────────────────────────────────────────────────
LOG_DIR.mkdir(parents=True, exist_ok=True)
logger = logging.getLogger("hns_watchdog")
logger.setLevel(logging.INFO)
if not logger.handlers:
handler = logging.handlers.RotatingFileHandler(
LOG_FILE, maxBytes=5_000_000, backupCount=5
)
handler.setFormatter(logging.Formatter(
"%(asctime)s %(levelname)s %(message)s",
datefmt="%Y-%m-%dT%H:%M:%S",
))
logger.addHandler(handler)
logger.addHandler(logging.StreamHandler(sys.stdout))
# ── Helpers ───────────────────────────────────────────────────────────────────
PROGRESS_SQL = """
SELECT
(SELECT COUNT(*) FROM pgz_sport.klubovi
WHERE sport='nogomet' AND pgz_sufinanciran=true
AND hns_klub_id IS NOT NULL) AS klubovi_hns,
(SELECT COUNT(DISTINCT klub_id) FROM pgz_sport.hns_klub_roster) AS roster_klubovi,
(SELECT COUNT(*) FROM pgz_sport.clanovi
WHERE hns_igrac_id IS NOT NULL) AS igraci_hns,
(SELECT COUNT(*) FROM pgz_sport.clanovi
WHERE hns_igrac_id IS NOT NULL AND visina_cm IS NOT NULL) AS igraci_profil,
(SELECT COUNT(*) FROM pgz_sport.hns_player_seasons) AS seasons_rec,
(SELECT COUNT(*) FROM pgz_sport.hns_player_matches) AS matches_rec
;
"""
PENDING_SQL = """
SELECT COUNT(*) FROM pgz_sport.clanovi
WHERE hns_igrac_id IS NOT NULL
AND visina_cm IS NULL;
"""
def db_query():
    """Return the pipeline progress counters as a dict, or None on failure.

    Runs PROGRESS_SQL and maps its single row onto ``klubovi_hns``,
    ``roster_klubovi``, ``igraci_hns``, ``igraci_profil``, ``seasons_rec`` and
    ``matches_rec``.  ``pending_players`` comes from PENDING_SQL and is set to
    None when only that query fails.  The connection is always closed, also
    when a query raises.
    """
    conn = None
    try:
        conn = psycopg2.connect(DSN, connect_timeout=10)
        conn.autocommit = True
        with conn.cursor() as cur:
            cur.execute(PROGRESS_SQL)
            row = cur.fetchone()
            cols = ["klubovi_hns", "roster_klubovi", "igraci_hns",
                    "igraci_profil", "seasons_rec", "matches_rec"]
            counts = dict(zip(cols, row))
            try:
                cur.execute(PENDING_SQL)
                counts["pending_players"] = cur.fetchone()[0]
            except Exception as e:
                logger.warning("PENDING_SQL failed: %s", e)
                counts["pending_players"] = None
        return counts
    except Exception as e:
        logger.error("DB query failed: %s", e)
        return None
    finally:
        if conn is not None:
            try:
                conn.close()
            except Exception as e:
                # Closing is best-effort; never mask the result above.
                logger.warning("DB close failed: %s", e)
def telegram(text):
    """Send ``text`` (HTML parse mode, cut to 4000 chars) to the configured chat.

    Returns:
        True when the HTTP call succeeded and the Bot API answered
        ``"ok": true``; False otherwise.  Never raises.
    """
    try:
        r = requests.post(
            f"https://api.telegram.org/bot{TG_TOKEN}/sendMessage",
            data={"chat_id": TG_CHAT, "text": text[:4000],
                  "parse_mode": "HTML", "disable_web_page_preview": "true"},
            timeout=10,
        )
        ok = r.ok and r.json().get("ok", False)
        if not ok:
            logger.warning("Telegram returned: %s", r.text[:300])
        return ok
    except Exception as e:
        # requests error messages embed the request URL, which contains the
        # bot token; keep the secret out of the log files.
        detail = str(e)
        if TG_TOKEN:
            detail = detail.replace(TG_TOKEN, "<redacted>")
        logger.error("Telegram send failed: %s", detail)
        return False
def load_state():
    """Return the JSON content of STATE_FILE, or ``{}`` if absent/unreadable."""
    if not STATE_FILE.exists():
        return {}
    try:
        raw = STATE_FILE.read_text()
        return json.loads(raw)
    except Exception:
        # A corrupt or unreadable state file is treated like a first run.
        return {}
def save_state(state):
    """Write ``state`` to STATE_FILE as indented JSON; log instead of raising.

    Values json cannot encode natively are stringified (``default=str``).
    """
    try:
        serialized = json.dumps(state, indent=2, default=str)
        STATE_FILE.write_text(serialized)
    except Exception as err:
        logger.warning("Cannot persist state: %s", err)
def proc_alive(name):
    """Report whether ``pgrep -f name`` finds at least one process.

    Returns True as well when pgrep itself cannot be run or times out (5 s),
    so that an uncertain answer never triggers a respawn.
    """
    probe = ["pgrep", "-f", name]
    try:
        # pgrep exits with 0 when at least one command line matches.
        exit_code = subprocess.call(
            probe,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
            timeout=5,
        )
    except Exception as err:
        logger.warning("pgrep failed for %s: %s", name, err)
        return True  # err on caution: do not respawn if uncertain
    return exit_code == 0
def restart_worker(name, cmd):
    """Start ``cmd`` detached in its own session, logging to a fresh file.

    Output (stdout and stderr) is appended to
    ``LOG_DIR/<name>_respawn_<YYYYmmdd_HHMM>.log``.

    Returns:
        True when the process was spawned, False when spawning raised.
    """
    stamp = f"{datetime.now():%Y%m%d_%H%M}"
    log_path = LOG_DIR / f"{name}_respawn_{stamp}.log"
    try:
        with open(log_path, "ab") as sink:
            subprocess.Popen(
                cmd,
                stdout=sink,
                stderr=subprocess.STDOUT,
                cwd="/opt/pgz-sport/scripts",
                start_new_session=True,
            )
        logger.info("Re-spawned worker %s -> %s", name, log_path)
        return True
    except Exception as err:
        logger.error("Failed to respawn %s: %s", name, err)
        return False
def check_workers():
    """Respawn every WORKERS entry that is not running.

    Returns:
        Names of the workers that were successfully re-spawned.
    """
    return [
        worker
        for worker, command in WORKERS.items()
        if not proc_alive(worker) and restart_worker(worker, command)
    ]
def detect_stale(prev, curr):
    """Tell whether the pipeline made no progress since the previous snapshot.

    Returns False when either snapshot is missing, when no players are pending
    (or the pending count is unknown), when ``prev["ts"]`` cannot be parsed, or
    when less than STALL_WINDOW_SEC has passed since ``prev["ts"]``.  Otherwise
    returns True exactly when none of ``seasons_rec``, ``matches_rec`` and
    ``igraci_profil`` increased.
    """
    if not (prev and curr):
        return False
    if curr.get("pending_players") in (None, 0):
        return False
    try:
        previous_at = datetime.fromisoformat(prev.get("ts"))
    except Exception:
        return False
    elapsed = datetime.utcnow() - previous_at
    if elapsed < timedelta(seconds=STALL_WINDOW_SEC):
        return False  # not enough time elapsed
    progressed = any(
        curr.get(counter, 0) > prev.get(counter, 0)
        for counter in ("seasons_rec", "matches_rec", "igraci_profil")
    )
    return not progressed
def goal_reached(c):
    """Return True when counters ``c`` satisfy all three mission targets.

    Requires at least TARGET_KLUBOVI clubs, at least TARGET_MATCHES matches,
    and a profiled share of players (``igraci_profil / igraci_hns``) of at
    least TARGET_PROFILE_PCT.  Missing counters or zero players give False.
    """
    if not c:
        return False
    below_target = (
        c["klubovi_hns"] < TARGET_KLUBOVI
        or c["matches_rec"] < TARGET_MATCHES
        or c["igraci_hns"] <= 0
    )
    if below_target:
        return False
    return c["igraci_profil"] / c["igraci_hns"] >= TARGET_PROFILE_PCT
def fmt_status(c, respawned, stale, suffix=""):
    """Build the HTML status message sent to Telegram.

    Args:
        c: Counters dict from db_query(), or a falsy value when it failed.
        respawned: Names of workers restarted in this cycle.
        stale: Whether a stall was detected.
        suffix: Optional extra text appended on its own line.

    Returns:
        The message text; a short "DB query failed" notice when ``c`` is falsy.
    """
    if not c:
        return f"<b>HNS watchdog</b>\nDB query failed at {datetime.utcnow():%Y-%m-%d %H:%M}Z"
    total_players = c["igraci_hns"]
    pct = (c["igraci_profil"] / total_players * 100) if total_players else 0
    parts = [
        f"<b>HNS watchdog</b> {datetime.utcnow():%Y-%m-%d %H:%MZ}\n",
        f"Klubovi (HNS id): <b>{c['klubovi_hns']}/{TARGET_KLUBOVI}</b>\n",
        f"Roster scraped: {c['roster_klubovi']}\n",
        f"Igrači (HNS id): {c['igraci_hns']}\n",
        f"Igrači s profilom: {c['igraci_profil']} ({pct:0.1f}%)\n",
        f"Sezone: {c['seasons_rec']}\n",
        f"Utakmice: <b>{c['matches_rec']}</b>/{TARGET_MATCHES}\n",
        f"Pending igrači: {c.get('pending_players')}\n",
    ]
    if respawned:
        parts.append(f"\nRe-spawned: {', '.join(respawned)}")
    if stale:
        parts.append("\nSTALE: nema rasta u zadnjih 30 min")
    if suffix:
        parts.append(f"\n{suffix}")
    return "".join(parts)
# ── Main check ────────────────────────────────────────────────────────────────
def run_check(send_telegram=True):
    """Run one watchdog cycle: poll the DB, respawn workers, maybe notify.

    A notification is due when the mission goal is reached for the first time
    (no DONE flag file yet), when a worker was respawned, when a stall was
    detected, or as a heartbeat once at least 29 minutes have passed since the
    last delivered message.

    The DONE flag is written only after the "ALL DONE" message was actually
    delivered, so a failed delivery (or a run with ``send_telegram=False``) is
    retried on the next cycle instead of silently losing the alert.

    Args:
        send_telegram: When False nothing is sent.

    Returns:
        Dict with ``counts``, ``respawned``, ``stale``, ``done`` and
        ``notified`` (whether a notification was due, not whether it was
        delivered).
    """
    logger.info("=== watchdog cycle ===")
    state = load_state()
    if not isinstance(state, dict):
        # A state file holding valid JSON of the wrong shape counts as empty.
        state = {}
    prev = state.get("last_counts")
    counts = db_query()
    respawned = check_workers()
    stale = detect_stale(prev, counts) if counts else False
    done = goal_reached(counts)

    announce_done = done and not DONE_FLAG_FILE.exists()
    notify = False
    if announce_done:
        suffix = "\nALL DONE — mission target reached!"
        msg = fmt_status(counts, respawned, stale, suffix=suffix)
        notify = True
    else:
        msg = fmt_status(counts, respawned, stale)
        if respawned or stale:
            notify = True
        else:
            # routine 30-min heartbeat: send only if last notify >= 30 min ago
            last_ts = state.get("last_notify_ts")
            if not last_ts:
                notify = True
            else:
                try:
                    last_dt = datetime.fromisoformat(last_ts)
                    if datetime.utcnow() - last_dt >= timedelta(minutes=29):
                        notify = True
                except Exception:
                    notify = True

    logger.info("counts=%s respawned=%s stale=%s notify=%s done=%s",
                counts, respawned, stale, notify, done)
    if send_telegram and notify:
        if telegram(msg):
            state["last_notify_ts"] = datetime.utcnow().isoformat()
            if announce_done:
                try:
                    DONE_FLAG_FILE.write_text(datetime.utcnow().isoformat())
                except OSError as e:
                    logger.warning("Cannot write DONE flag: %s", e)
        else:
            logger.warning("Telegram delivery failed")
    if counts:
        state["last_counts"] = {**counts, "ts": datetime.utcnow().isoformat()}
    save_state(state)
    return {"counts": counts, "respawned": respawned,
            "stale": stale, "done": done, "notified": notify}
# ── Daemon loop ───────────────────────────────────────────────────────────────
def run_daemon(send_telegram=True):
    """Run run_check() forever, sleeping CHECK_INTERVAL_SEC between cycles.

    A crashing cycle is logged with its traceback and the loop continues.

    Args:
        send_telegram: Passed to every run_check() call; defaults to True so
            existing callers keep their behavior.
    """
    logger.info("Starting watchdog daemon (interval=%ss)", CHECK_INTERVAL_SEC)
    while True:
        try:
            run_check(send_telegram=send_telegram)
        except Exception as e:
            logger.exception("cycle crashed: %s", e)
        time.sleep(CHECK_INTERVAL_SEC)


# ── Entry point ───────────────────────────────────────────────────────────────
def main():
    """Parse the command line and run one check (--once) or the daemon loop.

    ``--no-telegram`` is honored in both modes.
    """
    p = argparse.ArgumentParser(description="HNS pipeline watchdog")
    g = p.add_mutually_exclusive_group(required=True)
    g.add_argument("--once", action="store_true",
                   help="Run a single check and exit (cron-friendly)")
    g.add_argument("--daemon", action="store_true",
                   help="Run forever, sleeping 30 min between checks")
    p.add_argument("--no-telegram", action="store_true",
                   help="Skip Telegram notifications (debug)")
    args = p.parse_args()
    send_telegram = not args.no_telegram
    if args.daemon:
        run_daemon(send_telegram=send_telegram)
    else:
        result = run_check(send_telegram=send_telegram)
        # Print compact JSON for cron / shell usage
        print(json.dumps(result, default=str, ensure_ascii=False))


if __name__ == "__main__":
    main()
+4 -1
View File
@@ -1,4 +1,7 @@
#!/usr/bin/env python3
from dotenv import load_dotenv
load_dotenv('/opt/rinet-gpu/.env.master')
# auto-added by patch_scrapers_with_dotenv.sh
# hns_youth_categories.py — SUB5 — HNS Semafor youth team scraper (v1.0)
# Author: Damir Radulić <dradulic@outlook.com> / <damir@rinet.one>
# Date: 2026-05-05
@@ -59,7 +62,7 @@ except Exception:
DB_DSN = dict(
host="10.10.0.2", port=6432, dbname="rinet_v3",
user="rinet", password="R1net2026!SecureDB#v7",
user="rinet", password=os.environ["DB_PASSWORD"],
)
BASE = "https://semafor.hns.family"
RATE_S = 1.0
@@ -0,0 +1,581 @@
#!/usr/bin/env python3
# hns_youth_categories.py — SUB5 — HNS Semafor youth team scraper (v1.0)
# Author: Damir Radulić <dradulic@outlook.com> / <damir@rinet.one>
# Date: 2026-05-05
# Description:
# Discovers per-club age categories (Seniori / U-19 juniori / U-17 kadeti /
# U-15 stariji pioniri / U-13 mlađi pioniri / U-11/U-9 početnici) by
# scraping HNS COMET Semafor competition pages and matching participating
# klubovi with hns_klub_id in pgz_sport.klubovi. For each (klub, kategorija,
# sezona) the per-club competition roster is fetched and players are
# upserted into pgz_sport.clan_kategorije (M2M player x category x season).
#
# Strategy:
# 1. Hardcoded list of per-season national + 2.NL competitions whose
# cid → kategorija mapping is known (PGZ regional 3.NL/ŽNS leagues
# are added as discovered).
# 2. For each competition, fetch /natjecanja/{cid}/{slug}/ and extract
# all participating /klubovi/{kid}/{slug}/ links.
# 3. Match against pgz_sport.klubovi (hns_klub_id). For each match,
# fetch /klubovi/{kid}/{slug}/?cid={cid} and parse player /igraci/
# links — these are the players belonging to this age category.
# 4. Upsert each player as clanovi (source=hns_semafor) and write
# clan_kategorije(clan_id, klub_id, kategorija, sezona, source,
# source_url, scraped_at).
#
# Run modes:
# python hns_youth_categories.py discover # dry-run, only logs
# python hns_youth_categories.py run # full scrape + DB upsert
# python hns_youth_categories.py klub <db_kid> # one club only
import os
import re
import sys
import time
import json
import logging
from datetime import datetime
from urllib.parse import unquote
from pathlib import Path
import psycopg2
import psycopg2.extras
import requests
from bs4 import BeautifulSoup
# Try to use SUB4's hns_api_client for shared session/UA
SCRIPTS_DIR = Path(__file__).resolve().parent
sys.path.insert(0, str(SCRIPTS_DIR))
try:
import hns_api_client as hns_api # type: ignore
_GET_HTML = hns_api._get_html
_UA = hns_api.UA
except Exception:
_GET_HTML = None
_UA = (
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
"(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
)
DB_DSN = dict(
host="10.10.0.2", port=6432, dbname="rinet_v3",
user="rinet", password=os.environ["DB_PASSWORD"],
)
BASE = "https://semafor.hns.family"
RATE_S = 1.0
TIMEOUT = 25
LOG_DIR = Path("/var/log/pgz-sport-debug")
LOG_DIR.mkdir(parents=True, exist_ok=True)
LOG_FILE = LOG_DIR / f"sub5_{datetime.now().strftime('%Y%m%d_%H%M')}.log"
log = logging.getLogger("sub5")
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s [%(levelname)s] %(message)s",
handlers=[
logging.FileHandler(LOG_FILE, encoding="utf-8"),
logging.StreamHandler(sys.stdout),
],
)
# ── Telegram ───────────────────────────────────────────────────────────────
TG_TOKEN = os.environ.get("TG_TOKEN", "8535797835:AAFItT-92jzZ9NWFafLxn0dLa1_n2s-JE5Y")
TG_CHAT = os.environ.get("TG_CHAT", "7969491558")
def tg_send(msg: str):
    """Send ``msg`` to the configured Telegram chat; never raises.

    The message is sent with Markdown parse mode first.  If Telegram rejects
    it with HTTP 400 (which is how it answers unparsable entities, e.g. a stray
    ``_`` or ``*`` from an exception text), it is re-sent once as plain text so
    the alert is not lost.  Text is cut to 4000 characters to stay under the
    Bot API message size limit.  Does nothing when TG_TOKEN or TG_CHAT is empty.
    """
    if not TG_TOKEN or not TG_CHAT:
        return
    url = f"https://api.telegram.org/bot{TG_TOKEN}/sendMessage"
    text = msg[:4000]
    try:
        # Use bare requests (no shared session) with short timeout to avoid
        # hangs on flaky outbound paths.
        r = requests.post(
            url,
            data={"chat_id": TG_CHAT, "text": text, "parse_mode": "Markdown"},
            timeout=(5, 10),
        )
        if r.status_code == 400:
            log.warning("telegram rejected Markdown, retrying as plain text")
            r = requests.post(
                url,
                data={"chat_id": TG_CHAT, "text": text},
                timeout=(5, 10),
            )
        log.info(f"telegram: {r.status_code}")
    except Exception as e:
        # requests error messages contain the URL and therefore the bot token.
        detail = str(e).replace(TG_TOKEN, "<redacted>")
        log.warning(f"telegram failed: {detail}")
# ── HTTP fallback ──────────────────────────────────────────────────────────
_session = requests.Session()
_session.headers.update({"User-Agent": _UA, "Accept-Language": "hr,en;q=0.7"})
def fetch(url: str) -> str:
    """Return the body of ``url`` as text.

    Delegates to the shared ``hns_api_client`` getter when it was imported;
    otherwise uses the module session and raises on a non-success status.
    """
    if _GET_HTML is None:
        log.debug(f"GET {url}")
        response = _session.get(url, timeout=TIMEOUT)
        response.raise_for_status()
        return response.text
    return _GET_HTML(url)
# ── Competition catalogue ─────────────────────────────────────────────────
# Each entry: (cid, slug, kategorija, sezona). PGZ-relevant national /
# 2.NL leagues per season. Regional ŽNS leagues are discovered dynamically
# via discover_pgz_competitions() once we find them inside klub raspored.
COMP_CATALOG = [
# 2025/2026 season
("100454960", "1-nl-juniori", "juniori-u19", "2025/2026"),
("100454979", "1-nl-kadeti", "kadeti-u17", "2025/2026"),
("100454999", "1-nl-pioniri", "pioniri-u15", "2025/2026"),
("100540163", "2-nl-juniori-a", "juniori-u19", "2025/2026"),
("100540177", "2-nl-juniori-b", "juniori-u19", "2025/2026"),
("100540032", "2-nl-kadeti-a", "kadeti-u17", "2025/2026"),
("100540109", "2-nl-kadeti-b", "kadeti-u17", "2025/2026"),
("100381663", "kvalifikacije-za-prvu-nl-juniori", "juniori-u19", "2025/2026"),
("100381584", "kvalifikacije-za-prvu-nl-kadeti", "kadeti-u17", "2025/2026"),
("100381484", "kvalifikacije-za-prvu-nl-pioniri", "pioniri-u15", "2025/2026"),
("100569152", "treca-nl-istok", "seniori", "2025/2026"), # Treća NL Istok
("100585203", "treca-nl-zapad", "seniori", "2025/2026"), # Treća NL Zapad (PGŽ klubovi)
# PGŽ-region ŽNL leagues discovered via klub raspored auto-discovery
("101555188", "1-znl-seniori", "seniori", "2025/2026"), # 1.ŽNL PGŽ seniori
("112195128", "kup-zns", "seniori", "2025/2026"), # Kup ŽNS Vinodolsko-Senjsko
("104425442", "kup-mladezi-juniori", "juniori-u19", "2025/2026"),
("104464435", "kup-mladezi-kadeti", "kadeti-u17", "2025/2026"),
("100391485", "supersport-hnl", "seniori", "2025/2026"),
("100413651", "supersport-prva-nl", "seniori", "2025/2026"),
("100418001", "supersport-druga-nl", "seniori", "2025/2026"),
("100439118", "supersport-hnk", "seniori", "2025/2026"), # Cup, all seniori
("101411063", "hrvatski-nogometni-kup", "seniori", "2025/2026"),
# 2024/2025 season — same structure, slightly different cids; will be
# discovered dynamically per-klub as well.
]
# Map from acat dropdown values (HR semantic labels) → kategorija
ACAT_MAP = {
"Seniors": "seniori",
"Juniors": "juniori-u19",
"Juniors 2": "juniori-u19",
"Cadets": "kadeti-u17",
"Cadets 2": "kadeti-u17",
"Pioneers": "pioniri-u15",
"Pioneers 2": "pioniri-u15",
"Young pioneers": "mladji-pioniri-u13",
"Beginners": "pocetnici-u11",
"Pre-beginners (6+1, 20min)": "pocetnici-u9",
}
# Heuristic from competition name → kategorija
def kategorija_from_name(name: str) -> str:
    """Guess the age category slug from a competition name.

    Matching is case-insensitive substring search, evaluated in a fixed order
    (juniors, cadets, older pioneers, younger pioneers, pioneers, U-9
    beginners, U-11 beginners).  "č" is folded to "c" so "Početnici" is
    recognized like "Pocetnici".  Anything unmatched, including an empty or
    None name, yields ``"seniori"``.
    """
    nl = (name or "").lower().replace("č", "c")
    # Order matters: the specific "stariji/mladji pioniri" and "pre-beginners"
    # rules must be tried before the generic "pioniri" / "beginners" ones.
    rules = (
        (("juniori", "juniors"), "juniori-u19"),
        (("kadeti", "cadets", "kadetkinje"), "kadeti-u17"),
        (("stariji pioniri",), "pioniri-u15"),
        (("mladji pioniri", "mlađi pioniri", "young pioneers"), "mladji-pioniri-u13"),
        (("pioniri", "pioneers", "pionirke"), "pioniri-u15"),
        (("pocetnici u-9", "pre-beginners", "pocetnici-u-9"), "pocetnici-u9"),
        (("pocetnici u-11", "beginners", "pocetnici-u-11"), "pocetnici-u11"),
    )
    for needles, kategorija in rules:
        if any(needle in nl for needle in needles):
            return kategorija
    return "seniori"
# ── DB helpers ─────────────────────────────────────────────────────────────
def conn():
    """Open and return a new psycopg2 connection built from DB_DSN."""
    connection = psycopg2.connect(**DB_DSN)
    return connection
def ensure_schema():
    """Make sure ``pgz_sport.clan_kategorije`` exists.

    When the table is already present nothing is changed; otherwise the table
    and its three lookup indexes are created.  The connection is closed before
    returning (a psycopg2 ``with connection`` block only ends the transaction).
    """
    c = conn()
    try:
        # Leaving the `with c` block commits, or rolls back on an exception.
        with c, c.cursor() as cu:
            cu.execute(
                """SELECT 1 FROM information_schema.tables
                WHERE table_schema='pgz_sport' AND table_name='clan_kategorije'"""
            )
            if cu.fetchone():
                log.info("clan_kategorije table verified.")
                return
            cu.execute(
                """CREATE TABLE pgz_sport.clan_kategorije (
                id SERIAL PRIMARY KEY,
                clan_id INTEGER REFERENCES pgz_sport.clanovi(id) ON DELETE CASCADE,
                klub_id INTEGER REFERENCES pgz_sport.klubovi(id),
                kategorija TEXT NOT NULL,
                sezona TEXT,
                source TEXT,
                source_url TEXT,
                scraped_at TIMESTAMPTZ DEFAULT now(),
                UNIQUE (clan_id, kategorija, sezona, klub_id)
                );
                CREATE INDEX IF NOT EXISTS idx_clan_kat_clan
                ON pgz_sport.clan_kategorije(clan_id);
                CREATE INDEX IF NOT EXISTS idx_clan_kat_sezona
                ON pgz_sport.clan_kategorije(sezona);
                CREATE INDEX IF NOT EXISTS idx_clan_kat_klub
                ON pgz_sport.clan_kategorije(klub_id);
                """
            )
            c.commit()
            log.info("Created pgz_sport.clan_kategorije.")
    finally:
        c.close()
def load_pgz_klubovi() -> dict[int, dict]:
    """Return ``{hns_klub_id: {"db_id", "naziv", "slug"}}`` for mapped clubs.

    Only rows with a non-null ``hns_klub_id`` are read.  When several rows
    share one ``hns_klub_id`` the lowest-id row wins.  ``slug`` is
    ``hns_slug``, else ``slug``, else a slug derived from the club name.
    """
    out: dict[int, dict] = {}
    c = conn()
    try:
        with c, c.cursor() as cu:
            cu.execute(
                """SELECT id, naziv, hns_klub_id, COALESCE(NULLIF(hns_slug,''), slug)
                FROM pgz_sport.klubovi
                WHERE hns_klub_id IS NOT NULL
                ORDER BY id"""
            )
            for kid_db, naziv, hns_id, slug in cu.fetchall():
                # harvest() looks clubs up by int id.  NOTE(review): if
                # hns_klub_id is stored as text, coercing here is what makes
                # those lookups match — confirm the column type.
                if isinstance(hns_id, str) and hns_id.strip().isdigit():
                    hns_id = int(hns_id)
                if hns_id in out:
                    continue  # keep first occurrence
                out[hns_id] = {
                    "db_id": kid_db,
                    "naziv": naziv,
                    "slug": slug or _slugify(naziv),
                }
    finally:
        # `with c` only ends the transaction; close the connection explicitly.
        c.close()
    return out
def _slugify(name: str) -> str:
name = (name or "").lower()
repl = {"č": "c", "ć": "c", "ž": "z", "š": "s", "đ": "d"}
for k, v in repl.items():
name = name.replace(k, v)
name = re.sub(r"[^a-z0-9]+", "-", name).strip("-")
return name
def upsert_clan(klub_db_id: int, hns_pid: int, ime_prezime: str, slug: str) -> int:
    """Find or create the ``pgz_sport.clanovi`` row for an HNS player.

    Lookup order:
      1. ``source='hns_semafor'`` and ``source_id=hns_pid`` — returned as is.
      2. ``hns_igrac_id=hns_pid`` — the row is re-tagged with the hns_semafor
         source fields and returned.
      3. Otherwise a new row is inserted for club ``klub_db_id``.

    ``ime_prezime`` is split on the first space into first name and surname.

    Returns:
        The ``clanovi.id`` of the found or inserted row.
    """
    ime, prezime = "", ""
    if ime_prezime:
        parts = ime_prezime.strip().split(" ", 1)
        ime = parts[0]
        prezime = parts[1] if len(parts) > 1 else ""
    url = f"{BASE}/igraci/{hns_pid}/{slug or 'x'}/"
    c = conn()
    try:
        # Leaving the `with c` block commits (or rolls back on an exception);
        # it does not close the connection, hence the finally below.  This
        # function runs once per player, so unclosed connections add up fast.
        with c, c.cursor() as cu:
            cu.execute(
                """SELECT id FROM pgz_sport.clanovi
                WHERE source='hns_semafor' AND source_id=%s LIMIT 1""",
                (str(hns_pid),),
            )
            row = cu.fetchone()
            if row:
                return row[0]
            # Try secondary lookup by hns_igrac_id (some rows from earlier runs)
            # NOTE: hns_igrac_id is varchar in DB, cast to text
            cu.execute(
                "SELECT id FROM pgz_sport.clanovi WHERE hns_igrac_id=%s LIMIT 1",
                (str(hns_pid),),
            )
            row = cu.fetchone()
            if row:
                cu.execute(
                    """UPDATE pgz_sport.clanovi
                    SET source='hns_semafor', source_id=%s, source_url=%s,
                    source_synced_at=now()
                    WHERE id=%s""",
                    (str(hns_pid), url, row[0]),
                )
                c.commit()
                return row[0]
            cu.execute(
                """INSERT INTO pgz_sport.clanovi
                (klub_id, ime, prezime, source, source_id, source_url,
                source_synced_at, slug, hns_igrac_id, sport, aktivan,
                verified, created_at, updated_at)
                VALUES (%s,%s,%s,'hns_semafor',%s,%s,now(),%s,%s,'nogomet',
                true, false, now(), now())
                RETURNING id""",
                (klub_db_id, ime, prezime, str(hns_pid), url, slug or None, str(hns_pid)),
            )
            cid = cu.fetchone()[0]
            c.commit()
            return cid
    finally:
        c.close()
def upsert_clan_kategorija(
    clan_id: int, klub_db_id: int, kategorija: str, sezona: str,
    source_url: str,
):
    """Insert or refresh one (player, category, season, club) membership row.

    On a conflict with the ``(clan_id, kategorija, sezona, klub_id)`` unique
    key only ``source_url`` and ``scraped_at`` are updated.  The connection is
    closed before returning.
    """
    c = conn()
    try:
        with c, c.cursor() as cu:
            cu.execute(
                """INSERT INTO pgz_sport.clan_kategorije
                (clan_id, klub_id, kategorija, sezona, source, source_url,
                scraped_at)
                VALUES (%s,%s,%s,%s,'hns_semafor',%s,now())
                ON CONFLICT (clan_id, kategorija, sezona, klub_id) DO UPDATE
                SET source_url=EXCLUDED.source_url,
                scraped_at=now()""",
                (clan_id, klub_db_id, kategorija, sezona, source_url),
            )
            c.commit()
    finally:
        # `with c` only ends the transaction; close the connection explicitly.
        c.close()
# ── Scrape primitives ─────────────────────────────────────────────────────
def parse_competition_klubovi(html: str) -> list[tuple[int, str]]:
    """List the clubs linked from a ``/natjecanja/{cid}/`` page.

    Only site-relative ``/klubovi/<id>/<slug>`` links are considered.

    Returns:
        ``(hns_klub_id, slug)`` tuples, one per club id, in page order.
    """
    link_re = re.compile(r"^/klubovi/(\d+)/([a-z0-9-]+)/?")
    soup = BeautifulSoup(html, "html.parser")
    known_ids = set()
    clubs = []
    for anchor in soup.find_all("a", href=True):
        hit = link_re.match(anchor["href"])
        if hit is None:
            continue
        club_id = int(hit.group(1))
        if club_id not in known_ids:
            known_ids.add(club_id)
            clubs.append((club_id, hit.group(2)))
    return clubs
def parse_klub_roster(html: str) -> list[tuple[int, str, str]]:
    """Extract ``(hns_pid, slug, name)`` for each player linked from a club page.

    Each player id is reported once, in order of first appearance.  When a
    player is linked several times (for example a picture link without text
    followed by a name link), the first non-empty link text is used as the
    name instead of always taking the first link.
    """
    soup = BeautifulSoup(html, "html.parser")
    players: dict[int, list[str]] = {}  # pid -> [slug, name], insertion-ordered
    for a in soup.find_all("a", href=re.compile(r"^/?(?:https?://[^/]+)?/igraci/\d+/[a-z0-9-]+/?")):
        href = a["href"]
        m = re.search(r"/igraci/(\d+)/([a-z0-9-]+)/?", href)
        if not m:
            continue
        pid, slug = int(m.group(1)), m.group(2)
        name = (a.get_text(" ", strip=True) or "").strip()
        entry = players.get(pid)
        if entry is None:
            players[pid] = [slug, name]
        elif not entry[1] and name:
            # An earlier link for this player had no text; take this name.
            entry[1] = name
    return [(pid, slug, name) for pid, (slug, name) in players.items()]
def parse_klub_competitions(html: str) -> list[tuple[int, str]]:
    """Read the competition dropdown (``select#cid``) of a club page.

    Returns:
        ``(cid, label)`` for every option whose value contains ``?cid=<digits>``;
        other options are skipped.
    """
    soup = BeautifulSoup(html, "html.parser")
    competitions = []
    for option in soup.select('select#cid option'):
        hit = re.search(r"\?cid=(\d+)", option.get("value") or "")
        if hit:
            label = option.get_text(" ", strip=True)
            competitions.append((int(hit.group(1)), label))
    return competitions
# ── Main flow ──────────────────────────────────────────────────────────────
def harvest():
    """Scrape every catalogued competition and store player categories.

    For each COMP_CATALOG entry the competition page is fetched, participating
    clubs are matched against the clubs from load_pgz_klubovi(), the club's
    roster for that competition is fetched, and every player is upserted
    together with a (player, club, category, season) membership row.

    Competition ids seen in club dropdowns that are not in COMP_CATALOG at all
    are logged as candidates for a later run.  Requests are throttled by
    RATE_S, also after a failed fetch.

    Returns:
        Stats dict with counters, ``per_kategorija`` (category -> rows written)
        and ``per_klub`` (club name -> sorted list of categories).
    """
    pgz = load_pgz_klubovi()
    log.info(
        f"Loaded {len(pgz)} unique PGŽ klubovi with hns_klub_id "
        f"({sum(1 for v in pgz.values() if v['slug'])} have slug)."
    )
    stats = {
        "competitions_processed": 0,
        "competitions_skipped": 0,
        "klubovi_matched": 0,
        "rosters_fetched": 0,
        "players_upserted": 0,
        "kategorije_inserted": 0,
        "errors": 0,
        "per_kategorija": {},
        "per_klub": {},
    }
    # Every cid already covered by the catalogue, so that "discovered" really
    # means "not in catalog" rather than "different from the current cid".
    catalog_cids = {int(entry[0]) for entry in COMP_CATALOG}
    discovered_extra: set[tuple[str, str, str]] = set()  # (cid, slug, sezona)
    seen_clan_kat: set[tuple[int, int, str, str]] = set()
    for cid, slug, kategorija, sezona in COMP_CATALOG:
        comp_url = f"{BASE}/natjecanja/{cid}/{slug}/"
        try:
            html = fetch(comp_url)
        except Exception as e:
            log.warning(f"comp {cid} fetch failed: {e}")
            stats["competitions_skipped"] += 1
            stats["errors"] += 1
            time.sleep(RATE_S)
            continue
        klubovi = parse_competition_klubovi(html)
        log.info(
            f"COMP cid={cid} '{slug}' [{kategorija}/{sezona}] -> "
            f"{len(klubovi)} participating klubovi"
        )
        stats["competitions_processed"] += 1
        time.sleep(RATE_S)
        for hns_kid, k_slug in klubovi:
            if hns_kid not in pgz:
                continue
            klub = pgz[hns_kid]
            klub_db_id = klub["db_id"]
            stats["klubovi_matched"] += 1
            stats["per_klub"].setdefault(klub["naziv"], set()).add(kategorija)
            # Fetch klub roster filtered by this competition cid
            slug_use = klub["slug"] or k_slug
            roster_url = f"{BASE}/klubovi/{hns_kid}/{slug_use}/?cid={cid}"
            try:
                rhtml = fetch(roster_url)
            except Exception as e:
                log.warning(f"roster {hns_kid} cid={cid} failed: {e}")
                stats["errors"] += 1
                time.sleep(RATE_S)
                continue
            stats["rosters_fetched"] += 1
            time.sleep(RATE_S)
            # Discover any other cids this klub plays in
            for ocid, oname in parse_klub_competitions(rhtml):
                if ocid not in catalog_cids:
                    discovered_extra.add((str(ocid), oname, sezona))
            roster = parse_klub_roster(rhtml)
            if not roster:
                log.info(f" {klub['naziv']} cid={cid}: empty roster")
                continue
            log.info(
                f" KLUB '{klub['naziv']}' (db={klub_db_id}, hns={hns_kid}) "
                f"cid={cid} -> {len(roster)} igraca [{kategorija}]"
            )
            for hns_pid, p_slug, name in roster:
                try:
                    clan_id = upsert_clan(klub_db_id, hns_pid, name, p_slug)
                except Exception as e:
                    log.error(f"upsert_clan({hns_pid}) fail: {e}")
                    stats["errors"] += 1
                    continue
                stats["players_upserted"] += 1
                key = (clan_id, klub_db_id, kategorija, sezona)
                if key in seen_clan_kat:
                    continue
                seen_clan_kat.add(key)
                try:
                    upsert_clan_kategorija(
                        clan_id, klub_db_id, kategorija, sezona, roster_url
                    )
                    stats["kategorije_inserted"] += 1
                    stats["per_kategorija"][kategorija] = (
                        stats["per_kategorija"].get(kategorija, 0) + 1
                    )
                except Exception as e:
                    log.error(
                        f"upsert_clan_kategorija(clan={clan_id} "
                        f"klub={klub_db_id} kat={kategorija}) fail: {e}"
                    )
                    stats["errors"] += 1
    # Summarize discovered extra cids (not yet in catalog) for next run
    if discovered_extra:
        log.info(
            f"Discovered {len(discovered_extra)} extra cids not in catalog "
            f"(top 15 below):"
        )
        # Sorted so that repeated runs log the same sample.
        for cid, name, sezona in sorted(discovered_extra)[:15]:
            log.info(f" + cid={cid} '{name}' sezona={sezona}")
    # Convert per_klub sets to lists for JSON serialisation
    stats["per_klub"] = {k: sorted(v) for k, v in stats["per_klub"].items()}
    return stats
def main():
    """Command-line entry point.

    Commands (first argument, default ``run``):
      * ``discover`` — read-only: log up to 10 mapped clubs and return.
      * ``run`` — full scrape and DB upsert for all mapped clubs.
      * ``klub <db_id>`` — same as ``run`` but restricted to one club.

    Any other command, or ``klub`` without an id, exits with status 2 instead
    of silently starting a full scrape.  After a scrape the summary is logged,
    written to ``/opt/pgz-sport/cc_tasks/SUB5_RESULT.md`` and sent to Telegram.
    """
    global load_pgz_klubovi  # noqa: PLW0603
    cmd = sys.argv[1] if len(sys.argv) > 1 else "run"
    log.info(f"=== SUB5 hns_youth_categories START cmd={cmd} log={LOG_FILE} ===")
    if cmd not in ("run", "discover", "klub"):
        log.error(f"Unknown command '{cmd}'; expected run | discover | klub <db_kid>")
        sys.exit(2)
    if cmd == "klub" and len(sys.argv) <= 2:
        log.error("Command 'klub' needs a club db id: klub <db_kid>")
        sys.exit(2)
    if cmd == "discover":
        # Dry-run: nothing is written, so the schema check (which may create
        # the table) is deliberately not run here.
        pgz = load_pgz_klubovi()
        log.info(f"PGŽ klubovi with hns_klub_id: {len(pgz)}")
        for hk, v in list(pgz.items())[:10]:
            log.info(f" hns={hk} db={v['db_id']} slug={v['slug']} naziv={v['naziv']}")
        return
    ensure_schema()
    if cmd == "klub":
        # narrow-scope debug mode — monkey-patch loader before harvest()
        target_db = int(sys.argv[2])
        _orig = load_pgz_klubovi
        pgz = {k: v for k, v in _orig().items() if v["db_id"] == target_db}
        log.info(f"Restricted to db_id={target_db}: {len(pgz)} match")
        if not pgz:
            # Nothing can match; do not crawl every competition for nothing.
            log.error(f"No klub with db_id={target_db} has hns_klub_id; nothing to do")
            sys.exit(1)
        load_pgz_klubovi = lambda: pgz  # type: ignore
        try:
            stats = harvest()
        finally:
            load_pgz_klubovi = _orig  # type: ignore
    else:
        stats = harvest()
    log.info("=== SUMMARY ===")
    log.info(json.dumps(stats, ensure_ascii=False, indent=2))
    # Write SUB5_RESULT.md
    md_path = Path("/opt/pgz-sport/cc_tasks/SUB5_RESULT.md")
    md_path.parent.mkdir(parents=True, exist_ok=True)
    md = render_summary_md(stats)
    md_path.write_text(md, encoding="utf-8")
    log.info(f"Result MD written → {md_path}")
    # Telegram
    tg_send(
        "*SUB5 — HNS youth categories*\n"
        f"Klubovi matched: *{stats['klubovi_matched']}*\n"
        f"Rosters fetched: *{stats['rosters_fetched']}*\n"
        f"Players upserted: *{stats['players_upserted']}*\n"
        f"clan_kategorije: *{stats['kategorije_inserted']}*\n"
        f"Errors: {stats['errors']}\n"
        f"Log: `{LOG_FILE.name}`"
    )
def render_summary_md(stats: dict) -> str:
    """Render the harvest stats as a Markdown report.

    The report has a counters list, a per-category table, a per-club table
    (both sorted by key) and a footer naming the log file.

    Args:
        stats: The dict returned by harvest(); ``per_klub`` values must be
            lists of strings.
    """
    per_kat = stats["per_kategorija"]
    per_klub = stats["per_klub"]
    out = [
        "# SUB5 — HNS youth categories result",
        "",
        f"_Generated: {datetime.now().isoformat(timespec='seconds')}_",
        "",
        "## High-level counters",
        "",
        f"- Competitions processed: **{stats['competitions_processed']}**",
        f"- Competitions skipped: {stats['competitions_skipped']}",
        f"- Klubovi (DB) matched in competitions: **{stats['klubovi_matched']}**",
        f"- Rosters fetched: **{stats['rosters_fetched']}**",
        f"- Players upserted into `clanovi`: **{stats['players_upserted']}**",
        f"- M2M rows written into `clan_kategorije`: **{stats['kategorije_inserted']}**",
        f"- Errors: {stats['errors']}",
        "",
        "## Per kategorija",
        "",
        "| Kategorija | M2M zapisa |",
        "|---|---:|",
    ]
    out.extend(f"| {kat} | {per_kat[kat]} |" for kat in sorted(per_kat))
    out.extend([
        "",
        "## Per klub — kategorije pronadjene",
        "",
        "| Klub | Kategorije |",
        "|---|---|",
    ])
    out.extend(
        f"| {klub} | {', '.join(per_klub[klub])} |" for klub in sorted(per_klub)
    )
    out.extend(["", f"_Log: `{LOG_FILE}`_"])
    return "\n".join(out)
if __name__ == "__main__":
    # Top-level boundary: log any unexpected error with its traceback, alert
    # over Telegram, and exit non-zero.
    try:
        main()
    except Exception as fatal:
        log.exception(f"FATAL: {fatal}")
        tg_send(f"*SUB5 FATAL*: {fatal}")
        sys.exit(1)
+4 -1
View File
@@ -1,11 +1,14 @@
#!/usr/bin/env python3
from dotenv import load_dotenv
load_dotenv('/opt/rinet-gpu/.env.master')
# auto-added by patch_scrapers_with_dotenv.sh
# Fajl: objekti_enrich_address.py | v1.0 | 05.05.2026
# Author: Damir Radulić
# Svrha: Reverse-geocode lat/lng → adresa za sportski_objekti
import os, time, json
import psycopg2, requests
DSN = "host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password=R1net2026!SecureDB#v7"
DSN = f"host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password={os.environ['DB_PASSWORD']}"
HEADERS = {"User-Agent": "Ri.NET PGŽ Sport (dradulic@outlook.com)"}
conn = psycopg2.connect(DSN); conn.autocommit = True
@@ -0,0 +1,49 @@
#!/usr/bin/env python3
# Fajl: objekti_enrich_address.py | v1.0 | 05.05.2026
# Author: Damir Radulić
# Svrha: Reverse-geocode lat/lng → adresa za sportski_objekti
import os, time, json
import psycopg2, requests
# Connection string for the rinet_v3 database; the password is read from the
# DB_PASSWORD environment variable (a KeyError here means it is not set).
DSN = f"host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password={os.environ['DB_PASSWORD']}"
# Headers sent with every reverse-geocoding request.
HEADERS = {"User-Agent": "Ri.NET PGŽ Sport (dradulic@outlook.com)"}
NOMINATIM_REVERSE_URL = "https://nominatim.openstreetmap.org/reverse"
# Pause after every request, successful or not (Nominatim rate-limit 1 req/s).
REQUEST_PAUSE_S = 1.1
# Address components used, in this order, to build the short address.
ADDRESS_KEYS = ["road", "house_number", "suburb", "city", "town", "village"]

conn = psycopg2.connect(DSN); conn.autocommit = True
try:
    # Active objects that have coordinates but no address yet (at most 60 per run).
    with conn.cursor() as cur:
        cur.execute("""
            SELECT id, naziv, lat, lng FROM pgz_sport.sportski_objekti
            WHERE aktivan = true AND lat IS NOT NULL AND lng IS NOT NULL
            AND (adresa IS NULL OR adresa = '')
            LIMIT 60
        """)
        rows = cur.fetchall()
    print(f"Total: {len(rows)} objekata bez adrese")

    for i, (oid, naziv, lat, lng) in enumerate(rows):
        try:
            # Nominatim reverse geocoding
            r = requests.get(
                NOMINATIM_REVERSE_URL,
                params={"lat": lat, "lon": lng, "format": "json", "accept-language": "hr"},
                headers=HEADERS, timeout=10
            )
            if r.status_code != 200:
                # Previously a non-200 answer was skipped without any trace.
                print(f" [FAIL] {naziv}: HTTP {r.status_code}")
                continue
            d = r.json()
            addr = d.get("display_name", "")
            # Short form: street + house number + settlement (first four parts found).
            a = d.get("address", {})
            short = [a[k] for k in ADDRESS_KEYS if a.get(k)]
            addr_short = ", ".join(short[:4]) or addr[:100]
            if not addr_short:
                # Writing '' would leave the row in the "no address" state anyway.
                print(f" [FAIL] {naziv}: empty address in response")
                continue
            with conn.cursor() as cur:
                cur.execute("UPDATE pgz_sport.sportski_objekti SET adresa = %s WHERE id = %s", (addr_short, oid))
            print(f" [{i+1}/{len(rows)}] {naziv} → {addr_short}")
        except Exception as e:
            # Best effort: one failing object must not stop the batch.
            print(f" [FAIL] {naziv}: {e}")
        finally:
            # Runs on success, `continue` and failure alike, so the request
            # rate stays bounded even when calls error out.
            time.sleep(REQUEST_PAUSE_S)
finally:
    conn.close()
print("DONE")
+1 -1
View File
@@ -10,7 +10,7 @@ DSN_HOST="${RINET_DB_HOST:-10.10.0.2}"
DSN_PORT="${RINET_DB_PORT:-6432}"
DSN_DB="${RINET_DB_NAME:-rinet_v3}"
DSN_USER="${RINET_DB_USER:-rinet}"
DSN_PASS="${RINET_DB_PASS:-R1net2026!SecureDB#v7}"
DSN_PASS="${DB_PASSWORD:?DB_PASSWORD not set}"
BACKUP_DIR="/opt/pgz-sport/_backups"
LOG_DIR="/var/log/pgz-sport-debug"
+4 -1
View File
@@ -1,4 +1,7 @@
#!/usr/bin/env python3
from dotenv import load_dotenv
load_dotenv('/opt/rinet-gpu/.env.master')
# auto-added by patch_scrapers_with_dotenv.sh
# sport_harvest_health.py — staleness check za pgz_sport klubove
# v1.0 — dradulic@outlook.com / damir@rinet.one — 2026-05-05
# Description: Provjerava kad je svaki aktivan klub zadnji put scrape-an
@@ -16,7 +19,7 @@ from psycopg2.extras import RealDictCursor
DSN = os.getenv(
"RINET_DSN",
"host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password=R1net2026!SecureDB#v7",
f"host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password={os.environ['DB_PASSWORD']}",
)
TG_TOKEN = os.getenv("TG_TOKEN", "8535797835:AAFItT-92jzZ9NWFafLxn0dLa1_n2s-JE5Y")
TG_CHAT = os.getenv("TG_CHAT", "7969491558")
+120
View File
@@ -0,0 +1,120 @@
#!/usr/bin/env python3
# sport_harvest_health.py — staleness check za pgz_sport klubove
# v1.0 — dradulic@outlook.com / damir@rinet.one — 2026-05-05
# Description: Provjerava kad je svaki aktivan klub zadnji put scrape-an
# (klub_roster.scraped_at / clanovi.last_scraped_at). Klubovi >7 dana
# flag-irani su za re-scrape; Telegram alert se šalje ako ima staleova.
# Pokreće ga /etc/cron.d/sport-harvesters u 04:30 svaki 2. dan.
import os
import sys
import json
import subprocess
from datetime import datetime, timedelta, timezone
import psycopg2
from psycopg2.extras import RealDictCursor
# Connection string: an explicit RINET_DSN wins; only when it is absent is the
# default built from DB_PASSWORD. (Passing the f-string as the os.getenv()
# default evaluated it eagerly, so a missing DB_PASSWORD raised KeyError even
# when RINET_DSN was provided.)
DSN = os.getenv("RINET_DSN")
if DSN is None:
    DSN = f"host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password={os.environ['DB_PASSWORD']}"
# SECURITY(review): the Telegram bot token and chat id below are hard-coded
# fallbacks kept only for backward compatibility; they should be rotated and
# supplied exclusively through the environment.
TG_TOKEN = os.getenv("TG_TOKEN", "8535797835:AAFItT-92jzZ9NWFafLxn0dLa1_n2s-JE5Y")
TG_CHAT = os.getenv("TG_CHAT", "7969491558")
# Staleness threshold in days (SPORT_STALE_DAYS, default 7).
STALE_DAYS = int(os.getenv("SPORT_STALE_DAYS", "7"))
LOG_DIR = "/var/log/pgz-sport-debug"
LOG_PATH = os.path.join(LOG_DIR, f"health_{datetime.now().strftime('%Y%m%d_%H%M')}.log")
os.makedirs(LOG_DIR, exist_ok=True)
# Log lines contain non-ASCII text (club names, emoji), so the encoding is
# pinned instead of depending on the locale default.
_logfh = open(LOG_PATH, "a", encoding="utf-8")
def log(msg: str) -> None:
    """Print *msg* prefixed with an ISO timestamp and append it to the run log."""
    stamp = datetime.now().isoformat(timespec='seconds')
    entry = f"[{stamp}] {msg}"
    print(entry, flush=True)
    _logfh.write(f"{entry}\n")
    _logfh.flush()
# Staleness query. For every active club it takes the later of
# MAX(klub_roster.scraped_at) and MAX(clanovi.last_scraped_at), substituting
# the epoch when a club has no rows in a table. A club is flagged `stale` when
# that timestamp is still the epoch or is older than now() minus the interval
# passed as the single query parameter (e.g. '7 days').
SQL = """
WITH last_per_klub AS (
SELECT k.id AS klub_id, k.naziv, k.sport,
GREATEST(
COALESCE((SELECT MAX(scraped_at) FROM pgz_sport.klub_roster WHERE klub_id = k.id), 'epoch'::timestamptz),
COALESCE((SELECT MAX(last_scraped_at) FROM pgz_sport.clanovi WHERE klub_id = k.id), 'epoch'::timestamptz)
) AS last_scrape
FROM pgz_sport.klubovi k
WHERE k.aktivan = true
)
SELECT klub_id, naziv, sport, last_scrape,
(last_scrape <= 'epoch'::timestamptz OR last_scrape < now() - interval %s) AS stale
FROM last_per_klub;
"""
def telegram(text: str) -> None:
    """Send *text* to the configured Telegram chat via ``curl`` (best effort).

    Never raises; the outcome is written to the log. Success is judged by
    curl's exit status, and ``--fail`` makes HTTP error responses count as
    failures, so a rejected request is no longer logged as "sent".
    """
    try:
        proc = subprocess.run(
            [
                "curl", "-sS", "--fail", "-X", "POST",
                f"https://api.telegram.org/bot{TG_TOKEN}/sendMessage",
                "-d", f"chat_id={TG_CHAT}",
                "--data-urlencode", f"text={text}",
            ],
            capture_output=True,
            timeout=10,
            check=False,
        )
    except Exception as e:
        # curl missing, timeout, etc. — alerting must not break the caller.
        log(f"telegram fail: {e}")
        return
    if proc.returncode == 0:
        log(f"telegram sent ({len(text)} chars)")
    else:
        detail = proc.stderr.decode("utf-8", errors="replace").strip()
        log(f"telegram fail: curl exit {proc.returncode} {detail}")
def main() -> int:
    """Run the staleness check and return the process exit code.

    Returns:
        2 when the database cannot be reached or the query fails (a Telegram
        alert is sent in both cases), 1 when at least one active club is
        stale (a summary alert is sent), 0 otherwise.
    """
    log(f"sport_harvest_health START stale_days={STALE_DAYS}")
    try:
        conn = psycopg2.connect(DSN)
    except Exception as e:
        log(f"DB connect FAIL: {e}")
        telegram(f"🚨 sport_harvest_health: DB connect FAIL — {e}")
        return 2

    interval_str = f"{STALE_DAYS} days"
    try:
        with conn.cursor(cursor_factory=RealDictCursor) as cur:
            cur.execute(SQL, (interval_str,))
            rows = cur.fetchall()
    except Exception as e:
        # Previously this escaped as an uncaught traceback: no alert, and an
        # exit status of 1 that is indistinguishable from "stale clubs found".
        log(f"DB query FAIL: {e}")
        telegram(f"🚨 sport_harvest_health: DB query FAIL — {e}")
        return 2
    finally:
        # All rows are already fetched; release the connection on every path.
        conn.close()

    total = len(rows)
    stale_rows = [r for r in rows if r["stale"]]

    # Stale count per sport (lower-cased; "?" when the sport is missing).
    by_sport: dict = {}
    for r in stale_rows:
        s = (r["sport"] or "?").lower()
        by_sport[s] = by_sport.get(s, 0) + 1

    # Ten stale clubs with the oldest last scrape first.
    top_stale = sorted(
        stale_rows,
        key=lambda r: (r["last_scrape"] or datetime(1970, 1, 1, tzinfo=timezone.utc)),
    )[:10]

    log(f"klubova_total={total} stale={len(stale_rows)} by_sport={json.dumps(by_sport, ensure_ascii=False)}")
    for r in top_stale:
        log(f" STALE klub_id={r['klub_id']} sport={r['sport']} last={r['last_scrape']} naziv={r['naziv']}")

    if stale_rows:
        sport_summary = ", ".join(f"{k.upper()}:{v}" for k, v in sorted(by_sport.items()))
        top_lines = "\n".join(
            f" • {r['naziv']} ({(r['sport'] or '?')}) — {r['last_scrape']}"
            for r in top_stale[:5]
        )
        msg = (
            f"⚠️ Sport harvest stale: {len(stale_rows)}/{total} klubova "
            f">{STALE_DAYS} dana ({sport_summary})\nTop:\n{top_lines}"
        )
        telegram(msg)

    log("sport_harvest_health DONE")
    return 1 if stale_rows else 0
# The process exit status is whatever main() returns.
if __name__ == "__main__":
    sys.exit(main())
+4 -1
View File
@@ -1,3 +1,6 @@
from dotenv import load_dotenv
load_dotenv('/opt/rinet-gpu/.env.master')
# auto-added by patch_scrapers_with_dotenv.sh
"""
Multi-sport scrape base class.
Usage: subclass + implement scrape_klub(), scrape_player()
@@ -9,7 +12,7 @@ import psycopg2
from psycopg2.extras import RealDictCursor, execute_values
DSN = os.getenv("RINET_DSN",
"host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password=R1net2026!SecureDB#v7")
f"host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password={os.environ['DB_PASSWORD']}")
class SportHarvester:
SPORT = None # override
+149
View File
@@ -0,0 +1,149 @@
"""
Multi-sport scrape base class.
Usage: subclass + implement scrape_klub(), scrape_player()
"""
import os, time, json, re, sys
from datetime import datetime
from playwright.sync_api import sync_playwright
import psycopg2
from psycopg2.extras import RealDictCursor, execute_values
# Connection string: an explicit RINET_DSN wins; only when it is absent is the
# default built from DB_PASSWORD. (Passing the f-string as the os.getenv()
# default evaluated it eagerly, so a missing DB_PASSWORD raised KeyError even
# when RINET_DSN was provided.)
DSN = os.getenv("RINET_DSN")
if DSN is None:
    DSN = f"host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password={os.environ['DB_PASSWORD']}"
class SportHarvester:
    """Base class for per-sport scrapers writing into the ``pgz_sport`` schema.

    Subclasses set ``SPORT`` and ``SOURCE`` and implement ``scrape_klub()``.
    ``run()`` drives one headless Chromium page over the target clubs; the
    ``upsert_*`` helpers persist whatever the subclass scrapes.
    """
    SPORT = None   # override
    SOURCE = None  # override

    def __init__(self):
        """Open the autocommit DB connection, reset counters and open the run log."""
        self.conn = psycopg2.connect(DSN)
        self.conn.autocommit = True
        self.stats = {'klubova': 0, 'players': 0, 'stats': 0, 'errors': 0}
        # Log lines carry emoji and Croatian names, so the encoding is pinned
        # instead of depending on the locale default.
        self.log_file = open(
            f"/var/log/pgz-sport-debug/harvest_{self.SPORT}_{datetime.now().strftime('%Y%m%d_%H%M')}.log",
            "a", encoding="utf-8")

    def log(self, msg):
        """Print *msg* with timestamp and sport tag and append it to the run log."""
        line = f"[{datetime.now().isoformat(timespec='seconds')}] [{self.SPORT}] {msg}"
        print(line, flush=True)
        self.log_file.write(line + "\n"); self.log_file.flush()

    def slugify(self, s):
        """Return a lowercase ASCII slug of *s* ('' for empty/None input).

        Croatian diacritics are transliterated, every other character outside
        ``[a-z0-9\\s-]`` is dropped and whitespace runs become single hyphens.
        """
        if not s: return ""
        t = s.lower().strip()
        for old, new in [('č','c'),('ć','c'),('ž','z'),('š','s'),('đ','d')]:
            t = t.replace(old, new)
        t = re.sub(r'[^a-z0-9\s-]', '', t)
        return re.sub(r'\s+', '-', t).strip('-')

    def get_target_klubovi(self, limit=999):
        """Get PGŽ priority klubovi for this sport (at most *limit* rows, as dicts)."""
        with self.conn.cursor(cursor_factory=RealDictCursor) as cur:
            cur.execute("""
                SELECT * FROM pgz_sport.v_pgz_priority_klubovi
                WHERE sport = %s AND (financiran OR u_godisnjaku)
                ORDER BY financiran DESC, u_godisnjaku DESC, id
                LIMIT %s
            """, (self.SPORT, limit))
            return cur.fetchall()

    def upsert_clan(self, klub_id, source_id, ime, prezime, source_url, kategorija=None, sezona=None, extra=None):
        """Upsert player + return clan_id.

        A row is looked up by ``(source, source_id)``. An existing row only has
        empty/NULL name, club and sport fields filled in, gets its URL and
        timestamps refreshed and *extra* merged into ``metadata``; otherwise a
        new active row is inserted. When *kategorija* is given, the
        (clan, kategorija, sezona, klub) link is inserted if not yet present.
        """
        # Collapse internal whitespace in names before storing them.
        ime = re.sub(r'\s+', ' ', (ime or '')).strip()
        prezime = re.sub(r'\s+', ' ', (prezime or '')).strip()
        with self.conn.cursor() as cur:
            # Try find existing by source+source_id
            cur.execute("""
                SELECT id FROM pgz_sport.clanovi
                WHERE source = %s AND source_id = %s
                ORDER BY id LIMIT 1
            """, (self.SOURCE, str(source_id)))
            row = cur.fetchone()
            if row:
                clan_id = row[0]
                cur.execute("""
                    UPDATE pgz_sport.clanovi
                    SET ime = COALESCE(NULLIF(ime,''), %s),
                    prezime = COALESCE(NULLIF(prezime,''), %s),
                    klub_id = COALESCE(klub_id, %s),
                    source_url = %s, last_updated = now(), last_scraped_at = now(),
                    sport = COALESCE(sport, %s),
                    metadata = COALESCE(metadata, '{}'::jsonb) || %s::jsonb
                    WHERE id = %s
                """, (ime, prezime, klub_id, source_url, self.SPORT, json.dumps(extra or {}), clan_id))
            else:
                cur.execute("""
                    INSERT INTO pgz_sport.clanovi
                    (klub_id, ime, prezime, sport, source, source_id, source_url, last_scraped_at, aktivan, metadata)
                    VALUES (%s, %s, %s, %s, %s, %s, %s, now(), true, %s::jsonb)
                    RETURNING id
                """, (klub_id, ime, prezime, self.SPORT, self.SOURCE, str(source_id), source_url, json.dumps(extra or {})))
                clan_id = cur.fetchone()[0]
            # Add kategorija if specified (many-to-many)
            if kategorija:
                cur.execute("""
                    INSERT INTO pgz_sport.clan_kategorije
                    (clan_id, kategorija, sezona, klub_id, source, source_url)
                    VALUES (%s, %s, %s, %s, %s, %s)
                    ON CONFLICT (clan_id, kategorija, sezona, klub_id) DO NOTHING
                """, (clan_id, kategorija, sezona, klub_id, self.SOURCE, source_url))
            return clan_id

    def upsert_stats(self, clan_id, sezona, klub_id, klub_naziv, natjecanje, kategorija, stats_dict, raw=None):
        """Upsert player_stats row.

        Counters are read from *stats_dict* with ``.get`` (missing keys become
        NULL); *raw* is stored as JSON metadata. On a conflict on
        ``(clan_id, sport, sezona, klub_id, natjecanje)`` the counters and
        metadata are overwritten and ``scraped_at`` is refreshed.
        """
        with self.conn.cursor() as cur:
            cur.execute("""
                INSERT INTO pgz_sport.player_stats
                (clan_id, sport, source, sezona, klub_id, klub_naziv, natjecanje, kategorija,
                nastupi, golovi, asistencije, bodovi, trice, skokovi, blokade, servis_asovi,
                zuti, crveni, minute, metadata)
                VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s::jsonb)
                ON CONFLICT (clan_id, sport, sezona, klub_id, natjecanje)
                DO UPDATE SET
                nastupi = EXCLUDED.nastupi, golovi = EXCLUDED.golovi,
                asistencije = EXCLUDED.asistencije, bodovi = EXCLUDED.bodovi,
                trice = EXCLUDED.trice, skokovi = EXCLUDED.skokovi,
                blokade = EXCLUDED.blokade, servis_asovi = EXCLUDED.servis_asovi,
                zuti = EXCLUDED.zuti, crveni = EXCLUDED.crveni, minute = EXCLUDED.minute,
                metadata = EXCLUDED.metadata, scraped_at = now()
            """, (clan_id, self.SPORT, self.SOURCE, sezona, klub_id, klub_naziv, natjecanje, kategorija,
                  stats_dict.get('nastupi'), stats_dict.get('golovi'), stats_dict.get('asistencije'),
                  stats_dict.get('bodovi'), stats_dict.get('trice'), stats_dict.get('skokovi'),
                  stats_dict.get('blokade'), stats_dict.get('servis_asovi'),
                  stats_dict.get('zuti'), stats_dict.get('crveni'), stats_dict.get('minute'),
                  json.dumps(raw or {})))

    def run(self, limit=999):
        """Scrape every target club with one shared browser page, then notify.

        A failure in ``scrape_klub`` is counted and logged and the loop moves
        on to the next club. The final statistics are logged and sent to
        Telegram.
        """
        klubovi = self.get_target_klubovi(limit)
        self.log(f"🚀 Starting {self.SPORT} harvest. Target: {len(klubovi)} klubova")
        with sync_playwright() as pw:
            browser = pw.chromium.launch(headless=True, args=["--no-sandbox","--ignore-certificate-errors"])
            try:
                ctx = browser.new_context(
                    ignore_https_errors=True,
                    user_agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 Chrome/120.0.0.0 Safari/537.36"
                )
                page = ctx.new_page()
                for klub in klubovi:
                    try:
                        self.scrape_klub(page, klub)
                        self.stats['klubova'] += 1
                    except Exception as e:
                        self.stats['errors'] += 1
                        self.log(f" ❌ Klub {klub['id']} {klub['naziv']}: {e}")
            finally:
                # Close the browser even if context/page setup fails.
                browser.close()
        self.log(f"✅ Done. Stats: {self.stats}")
        self._notify_telegram(f"{str(self.SPORT).upper()} harvest done: {self.stats}")

    def _notify_telegram(self, text):
        """Send *text* to Telegram via curl; failures are logged, never raised."""
        import subprocess
        # SECURITY(review): the literal token/chat id remain only as a
        # backward-compatible fallback; they should be rotated and supplied
        # through TG_BOT_TOKEN / TG_TOKEN and TG_CHAT.
        token = (os.getenv("TG_BOT_TOKEN") or os.getenv("TG_TOKEN")
                 or "8535797835:AAFItT-92jzZ9NWFafLxn0dLa1_n2s-JE5Y")
        chat = os.getenv("TG_CHAT", "7969491558")
        try:
            subprocess.run(["curl","-s","-X","POST",
                            f"https://api.telegram.org/bot{token}/sendMessage",
                            "-d", f"chat_id={chat}",
                            "--data-urlencode", f"text={text}"],
                           timeout=8, capture_output=True)
        except Exception as e:
            # Was a bare `except: pass`, which hid every notification failure.
            self.log(f"TG error: {e}")

    def scrape_klub(self, page, klub):
        """Scrape one club; must be implemented by the sport-specific subclass."""
        raise NotImplementedError("subclass must implement")
+4 -1
View File
@@ -1,4 +1,7 @@
#!/usr/bin/env python3
from dotenv import load_dotenv
load_dotenv('/opt/rinet-gpu/.env.master')
# auto-added by patch_scrapers_with_dotenv.sh
# ═══════════════════════════════════════════════════════════════════
# Fajl: sub1_hns_fix_and_extract.py | v1.0.0 | 05.05.2026
# Lokacija: /opt/pgz-sport/scripts/sub1_hns_fix_and_extract.py
@@ -14,7 +17,7 @@ import psycopg2
from psycopg2.extras import RealDictCursor
DSN = os.getenv("RINET_DSN",
"host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password=R1net2026!SecureDB#v7")
f"host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password={os.environ['DB_PASSWORD']}")
TG = os.getenv("TG_BOT_TOKEN", "8535797835:AAFItT-92jzZ9NWFafLxn0dLa1_n2s-JE5Y")
TG_CHAT = os.getenv("TG_CHAT", "7969491558")
UA = "PGZ-Sport-Bot/1.0 (+https://api.rinet.one/sport/; contact dradulic@outlook.com)"
@@ -0,0 +1,147 @@
#!/usr/bin/env python3
# ═══════════════════════════════════════════════════════════════════
# Fajl: sub1_hns_fix_and_extract.py | v1.0.0 | 05.05.2026
# Lokacija: /opt/pgz-sport/scripts/sub1_hns_fix_and_extract.py
# Autor: dradulic@outlook.com / damir@rinet.one
# Svrha: SUB1 finalize — (a) rollback false positives,
# (b) extract hns_klub_id iz već postojećeg source_url,
# (c) verify presence preko HEAD i upsert.
# ═══════════════════════════════════════════════════════════════════
"""SUB1 fix-up: false-positive rollback + source_url-based extraction."""
import os, re, sys, time, json, subprocess, urllib.request
from datetime import datetime
import psycopg2
from psycopg2.extras import RealDictCursor
# Connection string: an explicit RINET_DSN wins; only when it is absent is the
# default built from DB_PASSWORD. (Passing the f-string as the os.getenv()
# default evaluated it eagerly, so a missing DB_PASSWORD raised KeyError even
# when RINET_DSN was provided.)
DSN = os.getenv("RINET_DSN")
if DSN is None:
    DSN = f"host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password={os.environ['DB_PASSWORD']}"
# SECURITY(review): the Telegram bot token and chat id below are hard-coded
# fallbacks kept only for backward compatibility; they should be rotated and
# supplied exclusively through the environment.
TG = os.getenv("TG_BOT_TOKEN", "8535797835:AAFItT-92jzZ9NWFafLxn0dLa1_n2s-JE5Y")
TG_CHAT = os.getenv("TG_CHAT", "7969491558")
UA = "PGZ-Sport-Bot/1.0 (+https://api.rinet.one/sport/; contact dradulic@outlook.com)"
LOG_PATH = f"/var/log/pgz-sport-debug/sub1_fix_{datetime.now().strftime('%Y%m%d_%H%M')}.log"
# Log lines contain non-ASCII text (club names, dashes), so the encoding is
# pinned instead of depending on the locale default.
LOG = open(LOG_PATH, "a", encoding="utf-8")
# False positives to ROLLBACK (cleared and marked not_found)
FALSE_POS = {
    2572: "NK Hajduk Tovarnik (matched HNK Hajduk Split — different club)",
    600: "Ženski NK XXL Kraljevica (matched men's NK Kraljevica — wrong sex)",
}
def log(msg, telegram=False):
    """Print *msg* with a timestamp, append it to the log file, optionally notify.

    With ``telegram=True`` the message (truncated to 3500 characters) is also
    sent through the Telegram bot via curl. A failure to launch curl is logged
    instead of being silently discarded; it never propagates to the caller.
    """
    line = f"[{datetime.now().isoformat(timespec='seconds')}] {msg}"
    print(line, flush=True); LOG.write(line+"\n"); LOG.flush()
    if telegram:
        try:
            subprocess.run(["curl","-s","-X","POST",
                            f"https://api.telegram.org/bot{TG}/sendMessage",
                            "-d", f"chat_id={TG_CHAT}",
                            "--data-urlencode", f"text={msg[:3500]}"],
                           timeout=8, capture_output=True)
        except Exception as e:
            # Was a bare `except: pass`. The nested call omits telegram=True,
            # so it cannot recurse.
            log(f"TG error: {e}")
def http_head_or_get(url, timeout=12):
    """Fetch *url* with a GET request and report ``(status, title)``.

    On a successful response, ``title`` is the stripped text of the first
    ``<h1>`` element found in the body, or None when there is none. An
    ``HTTPError`` yields ``(error_code, None)``; any other failure yields
    ``(0, error_message)``.
    """
    try:
        request = urllib.request.Request(url, headers={"User-Agent": UA})
        with urllib.request.urlopen(request, timeout=timeout) as resp:
            body = resp.read().decode("utf-8", errors="replace")
            status = resp.status
    except urllib.error.HTTPError as err:
        return err.code, None
    except Exception as err:
        return 0, str(err)

    heading = re.search(r'<h1[^>]*>([^<]+)</h1>', body)
    if heading is None:
        return status, None
    return status, heading.group(1).strip()
# Matches a "/klubovi/<id>/<slug>/" path: group 1 is the numeric club id,
# group 2 the (possibly empty) slug.
URL_RE = re.compile(r'/klubovi/(\d+)/([a-z0-9-]*)/?')
def main():
    """Run the SUB1 fix-up in three phases and report the outcome.

    Phase 1 clears the HNS mapping of every club listed in FALSE_POS.
    Phase 2 takes football clubs without ``hns_klub_id`` whose ``source_url``
    already points at semafor.hns.family, extracts the id/slug from that URL,
    verifies the club page over HTTP and stores the mapping.
    Phase 3 counts mapped / not-found / untouched clubs and sends the summary
    to the log and Telegram.
    """
    log(f"=== SUB1 fix start; log={LOG_PATH} ===")
    conn = psycopg2.connect(DSN); conn.autocommit = True
    cur = conn.cursor(cursor_factory=RealDictCursor)
    # Phase 1: Rollback false positives
    # `rb` counts the UPDATE statements issued, not the rows they affected.
    rb = 0
    for kid, reason in FALSE_POS.items():
        cur.execute("""
            UPDATE pgz_sport.klubovi
            SET hns_klub_id = NULL,
            hns_slug = NULL,
            scrape_source = 'hns_not_found',
            last_scraped_at = now()
            WHERE id = %s
        """, (kid,))
        log(f" ROLLBACK [{kid}] — {reason}")
        rb += 1
    # Phase 2: Extract hns_klub_id from existing source_url
    cur.execute("""
        SELECT id, naziv, source_url
        FROM pgz_sport.klubovi
        WHERE sport='nogomet' AND pgz_sufinanciran=true
        AND hns_klub_id IS NULL
        AND source_url ~ 'semafor\\.hns\\.family/klubovi/[0-9]+'
        ORDER BY id
    """)
    rows = cur.fetchall()
    log(f"Source-URL extraction candidates: {len(rows)}")
    extracted = 0; verify_fail = 0
    for r in rows:
        kid, naziv, url = r['id'], r['naziv'], r['source_url']
        m = URL_RE.search(url)
        if not m:
            log(f" SKIP [{kid}] no match in {url}")
            continue
        hns_id = int(m.group(1))
        slug = m.group(2) or None
        # Verify
        verify_url = f"https://semafor.hns.family/klubovi/{hns_id}/"
        status, title = http_head_or_get(verify_url)
        # Fixed pause after every verification request.
        time.sleep(0.8)
        # Accept only a 200 response that also yielded a page title.
        if status != 200 or not title:
            log(f" VERIFY FAIL [{kid}] {naziv} -> {hns_id}: status={status} title={title}")
            verify_fail += 1
            continue
        # If slug missing, try inferring from title
        if not slug and title:
            slug = re.sub(r'[^a-z0-9]+', '-',
                title.lower()
                .replace('č','c').replace('ć','c').replace('š','s').replace('ž','z').replace('đ','d')
            ).strip('-')
        canonical = f"https://semafor.hns.family/klubovi/{hns_id}/{slug}/" if slug else verify_url
        try:
            cur.execute("""
                UPDATE pgz_sport.klubovi
                SET hns_klub_id = %s,
                hns_slug = %s,
                source_url = %s,
                scrape_source = 'hns_semafor',
                last_scraped_at = now()
                WHERE id = %s
            """, (hns_id, slug, canonical, kid))
            log(f" EXTRACT [{kid}] {naziv} -> HNS {hns_id} '{title}' (slug={slug})")
            extracted += 1
        except Exception as e:
            # A failed update is logged and the loop continues with the next club.
            log(f" UPDATE fail [{kid}]: {e}")
    # Phase 3: Final stats
    cur.execute("""
        SELECT
        COUNT(*) FILTER (WHERE hns_klub_id IS NOT NULL) AS mapped,
        COUNT(*) FILTER (WHERE hns_klub_id IS NULL AND scrape_source='hns_not_found') AS marked_nf,
        COUNT(*) FILTER (WHERE hns_klub_id IS NULL AND (scrape_source IS NULL OR scrape_source != 'hns_not_found')) AS untouched
        FROM pgz_sport.klubovi
        WHERE sport='nogomet' AND pgz_sufinanciran=true
        AND naziv !~* 'Malonogometni|Mini Nogomet|Mali Nogomet|Američkog|Plaž|Pijesku|HMNK|MNK|preteča|povijesn|1903|1906|1908|1917|1919|1926|1929'
    """)
    stats = cur.fetchone()
    log(f"=== Final state (real football, PGŽ priority): mapped={stats['mapped']}, marked_not_found={stats['marked_nf']}, untouched={stats['untouched']} ===")
    msg = (f"SUB1 fix done: rollback={rb}, source_url-extracted={extracted}, "
           f"verify_fail={verify_fail}. Final mapped={stats['mapped']} / "
           f"not_found={stats['marked_nf']} / untouched={stats['untouched']}")
    log(msg, telegram=True)
# Run the fix-up only when the file is executed as a script.
if __name__ == "__main__":
    main()
+4 -1
View File
@@ -1,4 +1,7 @@
#!/usr/bin/env python3
from dotenv import load_dotenv
load_dotenv('/opt/rinet-gpu/.env.master')
# auto-added by patch_scrapers_with_dotenv.sh
# ═══════════════════════════════════════════════════════════════════
# Fajl: sub1_hns_link_harvester.py | v1.0.0 | 05.05.2026
# Lokacija: /opt/pgz-sport/scripts/sub1_hns_link_harvester.py
@@ -24,7 +27,7 @@ import psycopg2
from psycopg2.extras import RealDictCursor
DSN = os.getenv("RINET_DSN",
"host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password=R1net2026!SecureDB#v7")
f"host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password={os.environ['DB_PASSWORD']}")
TG = os.getenv("TG_BOT_TOKEN", "8535797835:AAFItT-92jzZ9NWFafLxn0dLa1_n2s-JE5Y")
TG_CHAT = os.getenv("TG_CHAT", "7969491558")
UA = "PGZ-Sport-Bot/1.0 (+https://api.rinet.one/sport/; legitimni interes; analitika sporta PGZ; contact dradulic@outlook.com)"
@@ -0,0 +1,358 @@
#!/usr/bin/env python3
# ═══════════════════════════════════════════════════════════════════
# Fajl: sub1_hns_link_harvester.py | v1.0.0 | 05.05.2026
# Lokacija: /opt/pgz-sport/scripts/sub1_hns_link_harvester.py
# Autor: dradulic@outlook.com / damir@rinet.one
# Svrha: SUB1 — Pronađi semafor.hns.family link za PGŽ priority
# nogometne klubove koji nemaju hns_klub_id.
# Strategija:
# 1. Enumerate ŽNS Primorsko-goranski (oid=51) competitions across
# seasons, plus 4. NL NS Rijeka, 3. HNL Zapad arhive
# 2. Za svaki natjecanje GET /natjecanja/{cid}/{cname}/ i izvuci
# sve <a href="/klubovi/{id}/{slug}/">{naziv}</a>
# 3. Build catalog (hns_id, slug, naziv) — skup unique
# 4. Fuzzy match candidate klubovi: normalize, drop NK/HNK/GNK
# prefiks, ukloni dijakritike, pa equality + substring + ratio
# 5. UPDATE pgz_sport.klubovi za matche; mark not_found za ostalo
# ═══════════════════════════════════════════════════════════════════
"""SUB1 — HNS link harvester for PGŽ priority football clubs."""
import os, re, sys, time, json, traceback, subprocess, difflib
from datetime import datetime
from urllib.parse import quote
import urllib.request, urllib.error
import psycopg2
from psycopg2.extras import RealDictCursor
# Connection string: an explicit RINET_DSN wins; only when it is absent is the
# default built from DB_PASSWORD. (Passing the f-string as the os.getenv()
# default evaluated it eagerly, so a missing DB_PASSWORD raised KeyError even
# when RINET_DSN was provided.)
DSN = os.getenv("RINET_DSN")
if DSN is None:
    DSN = f"host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password={os.environ['DB_PASSWORD']}"
# SECURITY(review): the Telegram bot token and chat id below are hard-coded
# fallbacks kept only for backward compatibility; they should be rotated and
# supplied exclusively through the environment.
TG = os.getenv("TG_BOT_TOKEN", "8535797835:AAFItT-92jzZ9NWFafLxn0dLa1_n2s-JE5Y")
TG_CHAT = os.getenv("TG_CHAT", "7969491558")
UA = "PGZ-Sport-Bot/1.0 (+https://api.rinet.one/sport/; legitimni interes; analitika sporta PGZ; contact dradulic@outlook.com)"
# Pause, in seconds, between consecutive requests to the HNS site.
SLEEP = 1.1
BASE = "https://semafor.hns.family"
LOG_PATH = f"/var/log/pgz-sport-debug/sub1_{datetime.now().strftime('%Y%m%d_%H%M')}.log"
# Log lines contain non-ASCII text (club names), so the encoding is pinned
# instead of depending on the locale default.
LOG = open(LOG_PATH, "a", encoding="utf-8")
def log(msg, telegram=False):
    """Emit *msg* to stdout and the log file; optionally forward it to Telegram.

    The Telegram copy is cut to 3500 characters and sent with curl. If that
    call raises, the error is logged (without Telegram) and swallowed.
    """
    stamp = datetime.now().isoformat(timespec='seconds')
    entry = f"[{stamp}] {msg}"
    print(entry, flush=True)
    LOG.write(entry + "\n")
    LOG.flush()
    if not telegram:
        return
    try:
        command = [
            "curl", "-s", "-X", "POST",
            f"https://api.telegram.org/bot{TG}/sendMessage",
            "-d", f"chat_id={TG_CHAT}",
            "--data-urlencode", f"text={msg[:3500]}",
        ]
        subprocess.run(command, timeout=8, capture_output=True)
    except Exception as err:
        log(f"TG error: {err}")
def http_get(url, accept_json=False, timeout=25):
    """GET *url* and return the body decoded as UTF-8 (invalid bytes replaced).

    With ``accept_json=True`` the request advertises JSON and identifies
    itself as an XMLHttpRequest; otherwise it asks for HTML and sends an
    empty ``X-Requested-With`` value. Network and HTTP errors propagate.
    """
    if accept_json:
        accept, requested_with = "application/json, */*", "XMLHttpRequest"
    else:
        accept, requested_with = "text/html,*/*", ""
    headers = {
        "User-Agent": UA,
        "Accept": accept,
        "X-Requested-With": requested_with,
    }
    request = urllib.request.Request(url, headers=headers)
    with urllib.request.urlopen(request, timeout=timeout) as resp:
        raw = resp.read()
    return raw.decode("utf-8", errors="replace")
# ── Normalization for fuzzy match ──
# Maps Croatian (and a few accented) letters to plain ASCII lowercase.
DIACRITIC_MAP = str.maketrans({
    'č':'c','ć':'c','ž':'z','š':'s','đ':'d',
    'Č':'c','Ć':'c','Ž':'z','Š':'s','Đ':'d',
    'á':'a','é':'e','í':'i','ó':'o','ú':'u',
})
# Club-type prefixes stripped by core_name(). core_name() applies this regex to
# text that norm() has already lowercased and stripped of diacritics, so every
# alternative containing a Croatian letter must also accept its ASCII spelling
# ("skola", "zenski", "znk"). Written only with diacritics, the "škola" and
# "Ženski nogometni klub" alternatives could never match.
PREFIX_RE = re.compile(
    r'^(hrvatski\s+nogometni\s+klub\.?|'
    r'nogometni\s+klub|nogometna\s+akademija|nogometna\s+[sš]kola|'
    r'sportska\s+akademija|[zž]enski\s+nogometni\s+klub|'
    r'hnk\.?|nk\.?|gnk|[zž]nk)\s+',
    re.IGNORECASE
)
# Noise fragments removed by core_name(), including a trailing 4-digit year.
# NOTE(review): "snježnik" is likewise matched against diacritic-free text and
# so presumably never fires; left unchanged because removing it would empty
# such a club's core name — confirm the intended behaviour.
SUFFIX_NOISE_RE = re.compile(
    r'\b(veterani|veterana|gornji\s+zamet|grada\s+crikvenice|'
    r'gomirje\s+gomirje|mrkopalj\s+mrkopalj|snježnik\s+gerovo|'
    r'-?\s*\d{4}\s*$)', re.IGNORECASE)


def norm(s):
    """Lowercase *s*, strip diacritics and quote characters, collapse whitespace.

    Returns '' for empty or None input.
    """
    if not s: return ""
    s = s.lower().strip()
    s = s.translate(DIACRITIC_MAP)
    s = re.sub(r'["\'`]', '', s)
    s = re.sub(r'\s+', ' ', s)
    return s


def core_name(naziv):
    """Return the normalized club name with type prefixes and noise removed.

    Up to three leading prefixes are stripped (names such as
    "Nogometni Klub HNK ..." nest them), then the SUFFIX_NOISE_RE fragments
    are removed and whitespace is collapsed again.
    """
    s = norm(naziv)
    # remove prefix(es) (sometimes nested e.g. "Nogometni Klub HNK ...")
    for _ in range(3):
        s2 = PREFIX_RE.sub('', s)
        if s2 == s: break
        s = s2
    s = SUFFIX_NOISE_RE.sub('', s).strip()
    s = re.sub(r'\s+', ' ', s).strip()
    return s


def slugify(s):
    """Return the hyphen-separated ``[a-z0-9-]`` slug of the core club name."""
    s = core_name(s)
    s = re.sub(r'[^a-z0-9]+', '-', s).strip('-')
    return s
# ── Catalog harvest ──
def get_pgz_competitions(season, oid=51):
    """Fetch the competition list of one HNS organisation for *season*.

    Args:
        season: Season label as used by the site, e.g. "2024/2025".
        oid: Organisation id; defaults to 51 (ŽNS Primorsko-goranski), so
            existing one-argument calls behave as before. Other organisations
            can now reuse this function instead of rebuilding the URL.

    Returns:
        The decoded JSON payload, or [] when the request or the JSON parsing
        fails (the failure is logged).
    """
    # Millisecond timestamp sent as the `t` query parameter.
    t = int(time.time()*1000)
    url = (f"{BASE}/handlers/getCompetitions/"
           f"?season={quote(season)}&oid={oid}&teamch=Club"
           f"&linkType=competitions&linkConstructor={quote(BASE+'/natjecanja/{cid}/{cname}/')}"
           f"&lang=hr&t={t}")
    try:
        body = http_get(url, accept_json=True)
        return json.loads(body)
    except Exception as e:
        log(f" comps fetch fail {season}: {e}")
        return []
def get_organizations(season):
    """Return the organisations (regional federations) listed for *season*.

    Yields the decoded JSON payload of the ``getOrganizations`` handler, or
    [] when fetching or parsing fails (the failure is logged).
    """
    stamp_ms = int(time.time()*1000)
    query = f"?season={quote(season)}&teamch=Club&lang=hr&t={stamp_ms}"
    endpoint = f"{BASE}/handlers/getOrganizations/" + query
    try:
        payload = json.loads(http_get(endpoint, accept_json=True))
    except Exception as err:
        log(f" orgs fetch fail {season}: {err}")
        return []
    return payload
# Match <a href="/klubovi/{id}/{slug}/">NAME<div...>...</a> — name is anything before first child element
# Groups: 1 = numeric club id, 2 = slug (may be empty), 3 = link text up to the
# first "<" (1-150 characters). The href may be relative or carry the absolute
# semafor.hns.family prefix.
CLUB_LINK_RE2 = re.compile(
    r'<a[^>]+href="(?:https?://semafor\.hns\.family)?/klubovi/(\d+)/([a-z0-9-]*)/?"[^>]*>([^<]{1,150})(?:<|</a>)',
    re.IGNORECASE
)
def harvest_competition(cid):
    """Download one competition page and list the clubs linked from it.

    Returns a list of ``(hns_id, slug, name)`` string tuples; an empty list
    when the page cannot be fetched (the failure is logged).
    """
    # The slug segment is a placeholder ("x"); the page is addressed by id only.
    page_url = f"{BASE}/natjecanja/{cid}/x/"
    try:
        page = http_get(page_url)
    except Exception as err:
        log(f" nat fetch fail {cid}: {err}")
        return []

    clubs = []
    for hit in CLUB_LINK_RE2.finditer(page):
        hns_id, slug, raw_name = hit.groups()
        name = raw_name.strip()
        # Drop one-character texts and navigation links such as "Klubovi".
        if len(name) <= 1 or name.lower().startswith('klubov'):
            continue
        clubs.append((hns_id, slug, name))
    return clubs
# ── Match logic ──
def match_score(candidate_naziv, candidate_grad, hns_naziv):
    """Rate from 0 to 100 how well a DB club matches an HNS club entry.

    0 when either core name is empty, 100 when the core names are equal.
    Otherwise the SequenceMatcher ratio (as a percentage) is the base score,
    raised by 5 (capped at 100) when the town is hinted at in the HNS name,
    and lifted to at least 85 when one core name contains the other.
    """
    ours = core_name(candidate_naziv)
    theirs = core_name(hns_naziv)
    if not (ours and theirs):
        return 0
    if ours == theirs:
        return 100

    similarity = difflib.SequenceMatcher(None, ours, theirs).ratio()
    score = int(similarity * 100)

    # Town bonus: first three letters of the town appear in the HNS name, or
    # the HNS name ends with "(<first letter of town>)".
    # NOTE(review): an abbreviation like "(Ba)" for grad="Bakar" is NOT caught
    # by the one-letter check below — confirm whether that is intended.
    town = norm(candidate_grad) if candidate_grad else ""
    if town:
        hns_full = norm(hns_naziv)
        if town[:3] in hns_full or hns_full.endswith('(' + town[:1] + ')'):
            score = min(100, score + 5)

    # Containment floor: one core name fully inside the other.
    if ours in theirs or theirs in ours:
        score = max(score, 85)
    return score
# ── Main ──
def main():
    """Build an HNS club catalog and link unmapped PGŽ football clubs to it.

    Steps: (1) load candidate clubs without ``hns_klub_id``; (2) harvest club
    links from the competitions of oid=51 and oid=178180 over SEASONS;
    (3) fuzzy-match each candidate against the catalog; (4) write matches to
    the DB and mark the rest as ``hns_not_found``; (5) write SUB1_RESULT.md;
    (6) send the summary to Telegram.
    """
    log(f"=== SUB1 HNS link harvester start; log={LOG_PATH} ===")
    conn = psycopg2.connect(DSN); conn.autocommit = True
    cur = conn.cursor(cursor_factory=RealDictCursor)
    # 1) Get candidate clubs
    cur.execute("""
        SELECT id, naziv, grad
        FROM pgz_sport.klubovi
        WHERE sport='nogomet' AND pgz_sufinanciran=true
        AND hns_klub_id IS NULL
        AND naziv !~* 'Malonogometni|Mini Nogomet|Mali Nogomet|Američkog|Plaž|Pijesku|HMNK|MNK|preteča|povijesn|1903|1906|1908|1917|1919|1926|1929'
        ORDER BY naziv
    """)
    candidates = cur.fetchall()
    log(f"Candidates: {len(candidates)}")
    # 2) Build HNS catalog from PGŽ competitions across recent seasons
    SEASONS = ["2025/2026","2024/2025","2023/2024","2022/2023","2021/2022","2020/2021","2019/2020","2018/2019","2017/2018"]
    catalog = {} # hns_id -> {slug, naziv, sources:set}
    # Competition ids already harvested; shared by both sweeps below.
    seen_cids = set()
    for season in SEASONS:
        log(f"-- season {season}")
        comps = get_pgz_competitions(season)
        time.sleep(SLEEP)
        log(f" PGŽ comps: {len(comps)}")
        for c in comps:
            cid = str(c.get('id',''))
            if not cid or cid in seen_cids: continue
            seen_cids.add(cid)
            cname = c.get('value','')
            try:
                clubs = harvest_competition(cid)
            except Exception as e:
                log(f" {cid} ({cname}) fetch error: {e}")
                clubs = []
            for hns_id, slug, naziv in clubs:
                if hns_id not in catalog:
                    catalog[hns_id] = {'slug': slug, 'naziv': naziv, 'sources': set()}
                else:
                    # Keep the first name seen; only fill in a missing slug.
                    if slug and not catalog[hns_id]['slug']:
                        catalog[hns_id]['slug'] = slug
                catalog[hns_id]['sources'].add(f"{season}:{cname[:30]}")
            log(f" {cid} '{cname[:40]}' -> {len(clubs)} clubs (catalog={len(catalog)})")
            time.sleep(SLEEP)
    # also sweep top-tier comps to catch HNK Rijeka-tier (though those usually mapped)
    # Also: 3.HNL Zapad / 4.NL NS Rijeka by oid=178180 (NS Rijeka)
    log("-- NS Rijeka oid=178180 sweep")
    for season in SEASONS:
        t = int(time.time()*1000)
        url = (f"{BASE}/handlers/getCompetitions/"
               f"?season={quote(season)}&oid=178180&teamch=Club"
               f"&linkType=competitions&linkConstructor={quote(BASE+'/natjecanja/{cid}/{cname}/')}"
               f"&lang=hr&t={t}")
        try:
            comps = json.loads(http_get(url, accept_json=True))
        except Exception as e:
            log(f" ns_rijeka {season} fail: {e}"); comps = []
        time.sleep(SLEEP)
        for c in comps:
            cid = str(c.get('id',''))
            if not cid or cid in seen_cids: continue
            seen_cids.add(cid)
            cname = c.get('value','')
            try:
                clubs = harvest_competition(cid)
            except Exception as e:
                clubs = []
            # NOTE(review): unlike the first sweep, a missing slug on an
            # already-known club is not filled in here — confirm if intended.
            for hns_id, slug, naziv in clubs:
                if hns_id not in catalog:
                    catalog[hns_id] = {'slug': slug, 'naziv': naziv, 'sources': set()}
                catalog[hns_id]['sources'].add(f"NSR:{season}:{cname[:30]}")
            log(f" NSR {cid} '{cname[:40]}' -> {len(clubs)} (cat={len(catalog)})")
            time.sleep(SLEEP)
    log(f"=== Catalog built: {len(catalog)} unique HNS clubs ===")
    # Save catalog snapshot
    # Only the first five sources (sorted) are kept per club in the snapshot.
    snap = {hid: {'slug': v['slug'], 'naziv': v['naziv'], 'sources': sorted(v['sources'])[:5]}
            for hid,v in catalog.items()}
    with open("/opt/pgz-sport/cc_tasks/sub1_hns_catalog.json","w") as f:
        json.dump(snap, f, ensure_ascii=False, indent=2)
    log(f"Catalog snapshot -> /opt/pgz-sport/cc_tasks/sub1_hns_catalog.json")
    # 3) Match candidates
    matched = [] # (db_id, db_naziv, hns_id, slug, hns_naziv, score)
    not_found = []
    ambiguous = []
    for cand in candidates:
        db_id, naziv, grad = cand['id'], cand['naziv'], cand['grad']
        # Catalog entries scoring at least 70, best first.
        ranked = []
        for hid, v in catalog.items():
            sc = match_score(naziv, grad, v['naziv'])
            if sc >= 70:
                ranked.append((sc, hid, v['slug'], v['naziv']))
        ranked.sort(reverse=True)
        if not ranked:
            not_found.append((db_id, naziv, grad))
            log(f" NOT FOUND: [{db_id}] {naziv} ({grad})")
            continue
        top = ranked[0]
        # Ambiguous: the runner-up is within 3 points and the best score is below 95.
        if len(ranked) > 1 and ranked[1][0] >= top[0] - 3 and top[0] < 95:
            ambiguous.append((db_id, naziv, grad, ranked[:3]))
            log(f" AMBIGUOUS: [{db_id}] {naziv} -> top: {top[3]} ({top[0]}), 2nd: {ranked[1][3]} ({ranked[1][0]})")
            # Skip ambiguous, mark not_found for safety
            not_found.append((db_id, naziv, grad))
            continue
        matched.append((db_id, naziv, top[1], top[2], top[3], top[0]))
        log(f" MATCH [{db_id}] {naziv} -> HNS {top[1]} '{top[3]}' (slug={top[2]}, score={top[0]})")
    log(f"=== Match results: {len(matched)} matched, {len(not_found)} not_found, {len(ambiguous)} ambiguous ===")
    # 4) Apply UPDATEs
    upd_ok, upd_fail = 0, 0
    for db_id, naziv, hns_id, slug, hns_naziv, sc in matched:
        try:
            source_url = f"{BASE}/klubovi/{hns_id}/{slug}/" if slug else f"{BASE}/klubovi/{hns_id}/"
            # An existing source_url is kept (COALESCE); only a NULL one is set.
            cur.execute("""
                UPDATE pgz_sport.klubovi
                SET hns_klub_id = %s,
                hns_slug = %s,
                source_url = COALESCE(source_url, %s),
                scrape_source = 'hns_semafor',
                last_scraped_at = now()
                WHERE id = %s
            """, (int(hns_id), slug or None, source_url, db_id))
            upd_ok += 1
        except Exception as e:
            upd_fail += 1
            log(f" UPDATE fail [{db_id}] {naziv}: {e}")
    # Mark not_found
    # `nf_ok` counts statements that ran without error, not rows affected.
    nf_ok = 0
    for db_id, naziv, grad in not_found:
        try:
            cur.execute("""
                UPDATE pgz_sport.klubovi
                SET scrape_source = 'hns_not_found',
                last_scraped_at = now()
                WHERE id = %s AND hns_klub_id IS NULL
            """, (db_id,))
            nf_ok += 1
        except Exception as e:
            log(f" not_found mark fail [{db_id}]: {e}")
    # 5) Write result md
    res_path = "/opt/pgz-sport/cc_tasks/SUB1_RESULT.md"
    with open(res_path, "w") as f:
        f.write(f"# SUB1 — HNS Link Harvest Result\n\n")
        f.write(f"Date: {datetime.now().isoformat(timespec='seconds')}\n\n")
        f.write(f"- Candidates processed: **{len(candidates)}**\n")
        f.write(f"- HNS catalog built: **{len(catalog)}** unique clubs from {len(seen_cids)} competitions\n")
        f.write(f"- Matched: **{len(matched)}** (DB updated: {upd_ok}, fail: {upd_fail})\n")
        f.write(f"- Ambiguous (skipped to safety): **{len(ambiguous)}**\n")
        f.write(f"- Not found (marked hns_not_found): **{len(not_found)}** (mark ok: {nf_ok})\n\n")
        f.write(f"## Matched\n\n| db_id | DB naziv | HNS id | HNS naziv | slug | score |\n|---|---|---|---|---|---|\n")
        # Highest score first.
        for db_id, naziv, hns_id, slug, hns_naziv, sc in sorted(matched, key=lambda x: -x[5]):
            f.write(f"| {db_id} | {naziv} | {hns_id} | {hns_naziv} | {slug} | {sc} |\n")
        f.write(f"\n## Ambiguous (manual review)\n\n")
        for db_id, naziv, grad, ranked in ambiguous:
            f.write(f"- **[{db_id}] {naziv}** ({grad})\n")
            for sc, hid, slug, hns_naziv in ranked:
                f.write(f" - {sc}: HNS {hid} '{hns_naziv}' (slug={slug})\n")
        f.write(f"\n## Not Found\n\n")
        for db_id, naziv, grad in not_found:
            f.write(f"- [{db_id}] {naziv} ({grad})\n")
        f.write(f"\n## Log\n\n`{LOG_PATH}`\n")
    log(f"Result -> {res_path}")
    # 6) Telegram notify
    msg = (f"SUB1 HNS done: matched {len(matched)}, not_found {len(not_found)}, "
           f"ambiguous {len(ambiguous)}. Catalog={len(catalog)}. "
           f"DB upd ok={upd_ok}/fail={upd_fail}. See SUB1_RESULT.md")
    log(msg, telegram=True)
# Script entry point: any exception escaping main() is logged together with
# its traceback, forwarded to Telegram, and turned into exit status 1.
if __name__ == "__main__":
    try:
        main()
    except Exception as e:
        log(f"FATAL: {e}\n{traceback.format_exc()}", telegram=True)
        sys.exit(1)
+4 -1
View File
@@ -1,4 +1,7 @@
#!/usr/bin/env python3
from dotenv import load_dotenv
load_dotenv('/opt/rinet-gpu/.env.master')
# auto-added by patch_scrapers_with_dotenv.sh
# ═══════════════════════════════════════════════════════════════════
# Fajl: sub1_hns_manual_overrides.py | v1.0.0 | 05.05.2026
# Lokacija: /opt/pgz-sport/scripts/sub1_hns_manual_overrides.py
@@ -12,7 +15,7 @@ from datetime import datetime
import psycopg2
DSN = os.getenv("RINET_DSN",
"host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password=R1net2026!SecureDB#v7")
f"host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password={os.environ['DB_PASSWORD']}")
UA = "PGZ-Sport-Bot/1.0 (+https://api.rinet.one/sport/; contact dradulic@outlook.com)"
# Manual mappings — verified by visiting semafor.hns.family
@@ -0,0 +1,67 @@
#!/usr/bin/env python3
# ═══════════════════════════════════════════════════════════════════
# Fajl: sub1_hns_manual_overrides.py | v1.0.0 | 05.05.2026
# Lokacija: /opt/pgz-sport/scripts/sub1_hns_manual_overrides.py
# Autor: dradulic@outlook.com / damir@rinet.one
# Svrha: SUB1 — Manual high-confidence overrides za klubove koje
# fuzzy match nije uhvatio (ali postoje u HNS-u).
# ═══════════════════════════════════════════════════════════════════
"""SUB1 manual overrides — verified mapping for special cases."""
import os, re, sys, time, urllib.request
from datetime import datetime
import psycopg2
def build_dsn(env=None):
    """Return the libpq connection string used by this script.

    Resolution order:

    1. ``RINET_DSN`` — used verbatim when set and non-empty.  It is checked
       first, so a missing ``DB_PASSWORD`` no longer breaks an explicit DSN.
    2. The built-in host/port/dbname/user, plus ``DB_PASSWORD`` when present.
       The password is single-quoted and its backslashes / single quotes are
       escaped, so spaces or quotes in it cannot corrupt the conninfo string.

    When neither variable is set the password keyword is omitted; libpq then
    falls back to its own sources (``PGPASSWORD``, the password file) and the
    connect call reports an authentication error if none applies.

    Args:
        env: Mapping to read the variables from; defaults to ``os.environ``.

    Returns:
        The connection string as a ``str``.
    """
    env = os.environ if env is None else env
    explicit = env.get("RINET_DSN")
    if explicit:
        return explicit
    dsn = "host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet"
    password = env.get("DB_PASSWORD")
    if password:
        # libpq conninfo quoting: escape backslashes first, then single quotes.
        escaped = password.replace("\\", "\\\\").replace("'", "\\'")
        dsn += f" password='{escaped}'"
    return dsn


DSN = build_dsn()
UA = "PGZ-Sport-Bot/1.0 (+https://api.rinet.one/sport/; contact dradulic@outlook.com)"

# Manual mappings — verified by visiting semafor.hns.family
# Format: db_id -> (hns_id, slug, name as shown on HNS, reason)
OVERRIDES = {
    9: (3440, "znk-rijeka", "ŽNK Rijeka", "Ženski NK Rijeka — same modern club"),
    101: (3440, "znk-rijeka", "ŽNK Rijeka", "Ženski NK Rijeka 'Jack Pot' — sponsor naming, same club"),
    574: (5239, "nk-medicinar", "NK Medicinar", "NK Medicinar Rijeka (osnovan 1996, SRC Belveder)"),
}
def http_check(url, timeout=10, user_agent=None):
    """Fetch *url* and return ``(status, detail)``.

    Args:
        url: Address to request.
        timeout: Timeout in seconds passed to ``urlopen``.
        user_agent: Optional ``User-Agent`` header value; when omitted the
            module-level ``UA`` constant is sent.

    Returns:
        * On a successful response: the response status and the stripped text
          of the first ``<h1>`` element in the body (``None`` if there is no
          such element).
        * On an HTTP error response (404, 500, ...): the real HTTP status
          code and the error text, so callers can tell a missing page from a
          network problem.
        * On any other failure (bad URL, DNS, timeout, ...): ``0`` and the
          error text.  The function never raises for these.
    """
    import urllib.error  # local import: the block only relies on urllib.request being loaded

    try:
        agent = UA if user_agent is None else user_agent
        req = urllib.request.Request(url, headers={"User-Agent": agent})
        with urllib.request.urlopen(req, timeout=timeout) as r:
            status = r.status
            html = r.read().decode("utf-8", errors="replace")
    except urllib.error.HTTPError as e:
        # HTTPError doubles as the response object: keep its code, then
        # release the underlying connection.
        detail = str(e)
        e.close()
        return e.code, detail
    except Exception as e:
        # Best-effort probe: any other failure is reported as status 0.
        return 0, str(e)
    m = re.search(r'<h1[^>]*>([^<]+)</h1>', html)
    return status, (m.group(1).strip() if m else None)
def main():
    """Apply every entry of OVERRIDES to pgz_sport.klubovi.

    For each ``db_id -> (hns_id, slug, name, reason)`` entry the HNS club
    page is requested first; the row is updated only when that request
    returns status 200.  Progress and a final ``ok``/``fail`` summary are
    printed to stdout.

    An entry counts as failed when the page check does not return 200, when
    the UPDATE raises, or when the UPDATE matches no row (unknown id), so a
    stale id in OVERRIDES is no longer reported as a successful override.
    The connection runs in autocommit mode and is closed on every exit path.
    """
    conn = psycopg2.connect(DSN)
    conn.autocommit = True
    ok = 0
    fail = 0
    try:
        cur = conn.cursor()
        print(f"[{datetime.now().isoformat(timespec='seconds')}] Manual overrides start")
        for kid, (hns_id, slug, _naziv, reason) in OVERRIDES.items():
            url = f"https://semafor.hns.family/klubovi/{hns_id}/{slug}/"
            status, title = http_check(url)
            # Pause between requests to the remote site.
            time.sleep(0.8)
            if status != 200:
                print(f" VERIFY FAIL [{kid}] {hns_id}: {status} {title}")
                fail += 1
                continue
            try:
                cur.execute("""
                    UPDATE pgz_sport.klubovi
                    SET hns_klub_id = %s,
                        hns_slug = %s,
                        source_url = %s,
                        scrape_source = 'hns_semafor_manual',
                        last_scraped_at = now()
                    WHERE id = %s
                """, (hns_id, slug, url, kid))
            except Exception as e:
                print(f" UPDATE fail [{kid}]: {e}")
                fail += 1
                continue
            if cur.rowcount == 0:
                # The statement succeeded but no row has this id.
                print(f" UPDATE no-op [{kid}]: no pgz_sport.klubovi row with this id")
                fail += 1
                continue
            print(f" OVERRIDE [{kid}] -> HNS {hns_id} '{title}' ({reason})")
            ok += 1
        print(f"Done: ok={ok}, fail={fail}")
    finally:
        conn.close()


if __name__ == "__main__":
    main()