From 448273945c74459e7d053f8ab7a7bfb98baf0ec7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Damir=20Raduli=C4=87?= Date: Tue, 5 May 2026 14:13:32 +0200 Subject: [PATCH] /sport/* aliases u app: admin, dokumenti, crm/v2, erp/full --- pgz_sport_api.py | 11 + scrapers/hks_hos_hbs_pw.py | 183 ++++++ scripts/hns_youth_categories.py | 2 +- static/dokumenti.html | 1028 ++++--------------------------- 4 files changed, 321 insertions(+), 903 deletions(-) create mode 100644 scrapers/hks_hos_hbs_pw.py diff --git a/pgz_sport_api.py b/pgz_sport_api.py index 8783f7d..39c765b 100644 --- a/pgz_sport_api.py +++ b/pgz_sport_api.py @@ -1788,6 +1788,8 @@ def serve_sport_3d(): @app.get("/admin") @app.get("/admin/") +@app.get("/sport/admin") +@app.get("/sport/admin/") def serve_admin(): p = HTML_DIR / "admin.html" if p.exists(): @@ -1806,6 +1808,8 @@ def serve_erp(): @app.get("/erp/full") @app.get("/erp/full/") +@app.get("/sport/erp/full") +@app.get("/sport/erp") def serve_erp_full(): p = HTML_DIR / "erp_full.html" if p.exists(): @@ -1843,6 +1847,9 @@ def serve_crm(): @app.get("/crm_v2/") @app.get("/crm/v2") @app.get("/crm") +@app.get("/sport/crm") +@app.get("/sport/crm/v2") +@app.get("/sport/crm_v2") def serve_crm_v2(): p = HTML_DIR / "crm_v2.html" if p.exists(): @@ -1859,6 +1866,8 @@ def serve_login(): @app.get("/admin/users") @app.get("/admin/users/") +@app.get("/sport/admin/users") +@app.get("/sport/admin/users/") def serve_admin_users(): p = HTML_DIR / "admin_users.html" if p.exists(): @@ -2097,6 +2106,8 @@ def serve_kpi(): @app.get("/dokumenti") @app.get("/dokumenti/") +@app.get("/sport/dokumenti") +@app.get("/sport/dokumenti/") def serve_dokumenti(): p = HTML_DIR / "dokumenti.html" return FileResponse(p) if p.exists() else {"error":"dokumenti.html not found"} diff --git a/scrapers/hks_hos_hbs_pw.py b/scrapers/hks_hos_hbs_pw.py new file mode 100644 index 0000000..c13d4e1 --- /dev/null +++ b/scrapers/hks_hos_hbs_pw.py @@ -0,0 +1,183 @@ +#!/usr/bin/env python3 +# ═══════════════════════════════════════════════════════════════════ +# Fajl: hks_playwright.py | v1.0.0 | 05.05.2026 +# Lokacija: /opt/pgz-sport/scrapers/hks_playwright.py +# Svrha: Cloudflare bypass za hks.hr (košarka) preko Playwright headless +# ═══════════════════════════════════════════════════════════════════ +"""HKS scraper — Playwright with stealth tricks.""" +import os, re, time, json, hashlib, sys +from urllib.parse import urljoin, urlparse +import psycopg2 +from psycopg2.extras import execute_batch +from playwright.sync_api import sync_playwright + +DSN = "host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password=R1net2026!SecureDB#v7" + +PGZ_TOKENS = ["Rijeka", "PGŽ", "Primorsko-goransk", "Kvarner", "Opatija", + "Crikvenica", "Krk", "Cres", "Lošinj", "Rab", "Bakar", "Kostrena", + "Kantrida", "Trsat", "Mlaka", "Viškovo", "Kastav"] + + +def is_pgz_relevant(text): + return any(t in text for t in PGZ_TOKENS) or "Primorsko-goransk" in text + + +def chunk(text, max_len=800): + if len(text) <= max_len: return [text] if text else [] + out = []; start = 0 + while start < len(text): + end = min(start + max_len, len(text)) + if end < len(text): + for sep in [". ", "! ", "? ", "\n"]: + p = text.rfind(sep, start, end) + if p > start + max_len // 2: + end = p + len(sep); break + out.append(text[start:end].strip()) + start = end + return [c for c in out if len(c) > 80] + + +def upsert(conn, facts, source): + if not facts: return 0 + cur = conn.cursor() + rows = [] + for f in facts: + h = hashlib.md5(f["fact"].encode()).hexdigest() + rows.append((f["fact"], source, "pgz_sport_savez_pw", + f.get("confidence", 0.85), h, + json.dumps({"url": f.get("url", ""), "scraped_via": "playwright"}))) + sql = """INSERT INTO dabi.knowledge (fact, source, category, confidence, data_hash, source_refs) + VALUES (%s, %s, %s, %s, %s, %s::jsonb) ON CONFLICT (data_hash) DO NOTHING""" + try: + execute_batch(cur, sql, rows, page_size=50) + n = cur.rowcount; cur.close() + return n + except Exception as e: + return 0 + + +SAVEZI = { + "savezi_hks_kosarka_pw": ["https://www.hks.hr/", "https://www.hks.hr/klubovi/", + "https://www.hks.hr/natjecanja/"], + "savezi_hos_odbojka_pw": ["https://hos.hr/", "https://hos.hr/klubovi/"], + "savezi_hbs_bocanje_pw": ["https://hbs.hr/", "https://hbs.hr/klubovi/"], +} + + +def crawl(): + conn = psycopg2.connect(DSN); conn.autocommit = True + + total = {} + + with sync_playwright() as p: + browser = p.chromium.launch( + headless=True, + args=["--no-sandbox", "--disable-blink-features=AutomationControlled", + "--disable-dev-shm-usage", "--disable-gpu"] + ) + ctx = browser.new_context( + user_agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36", + viewport={"width": 1280, "height": 800}, + locale="hr-HR", + extra_http_headers={"Accept-Language": "hr-HR,hr;q=0.9,en;q=0.8"}, + ) + # Stealth: hide webdriver + ctx.add_init_script(""" + Object.defineProperty(navigator, 'webdriver', {get: () => undefined}); + Object.defineProperty(navigator, 'plugins', {get: () => [1, 2, 3, 4, 5]}); + Object.defineProperty(navigator, 'languages', {get: () => ['hr-HR', 'hr', 'en']}); + """) + + for source, urls in SAVEZI.items(): + print(f"\n=== {source} ===") + visited = set() + queue = list(urls) + facts_total = 0 + + page = ctx.new_page() + + while queue and len(visited) < 40: + url = queue.pop(0) + if url in visited: continue + visited.add(url) + + try: + response = page.goto(url, timeout=30000, wait_until="domcontentloaded") + if not response: + continue + + # Wait for Cloudflare challenge if any + try: + page.wait_for_load_state("networkidle", timeout=8000) + except Exception: + pass + + # Check for Cloudflare challenge + title = page.title() or "" + if "Just a moment" in title or "Attention Required" in title: + print(f" CF challenge: {url[:60]}") + # Wait extra + page.wait_for_timeout(8000) + + text = page.evaluate("() => document.body.innerText || ''") or "" + if not text or len(text) < 200: + print(f" empty: {url[:60]}") + continue + + # PGŽ relevance filter + if not is_pgz_relevant(text): + # Still grab links + try: + hrefs = page.evaluate(""" + () => Array.from(document.querySelectorAll('a')) + .map(a => a.href).filter(h => h) + """) + for h in hrefs[:50]: + base_host = urlparse(url).hostname + href_host = urlparse(h).hostname + if href_host == base_host and h not in visited and len(queue) < 100: + queue.append(h.split("#")[0]) + except Exception: + pass + continue + + # Insert facts + facts = [{"fact": c, "url": url, "confidence": 0.85} + for c in chunk(text, 800) if len(c) > 100] + n = upsert(conn, facts, source) + facts_total += n + print(f" {url[:50]:50} → {n} facts (visit {len(visited)})") + + # Discover links + try: + hrefs = page.evaluate(""" + () => Array.from(document.querySelectorAll('a')) + .map(a => a.href).filter(h => h) + """) + for h in hrefs[:80]: + base_host = urlparse(url).hostname + href_host = urlparse(h).hostname + if href_host == base_host and h not in visited and len(queue) < 100: + queue.append(h.split("#")[0]) + except Exception: + pass + + page.wait_for_timeout(800) # rate limit + + except Exception as e: + print(f" err {url[:50]}: {str(e)[:100]}") + + page.close() + total[source] = {"visited": len(visited), "facts": facts_total} + + ctx.close() + browser.close() + + print(f"\n=== TOTAL ===") + print(json.dumps(total, default=str)) + conn.close() + return total + + +if __name__ == "__main__": + print(json.dumps(crawl(), default=str)) diff --git a/scripts/hns_youth_categories.py b/scripts/hns_youth_categories.py index 96a4e6d..74f9424 100644 --- a/scripts/hns_youth_categories.py +++ b/scripts/hns_youth_categories.py @@ -286,7 +286,7 @@ def upsert_clan(klub_db_id: int, hns_pid: int, ime_prezime: str, slug: str) -> i VALUES (%s,%s,%s,'hns_semafor',%s,%s,now(),%s,%s,'nogomet', true, false, now(), now()) RETURNING id""", - (klub_db_id, ime, prezime, str(hns_pid), url, slug or None, hns_pid), + (klub_db_id, ime, prezime, str(hns_pid), url, slug or None, str(hns_pid)), ) cid = cu.fetchone()[0] c.commit() diff --git a/static/dokumenti.html b/static/dokumenti.html index 824ca8e..3838a90 100644 --- a/static/dokumenti.html +++ b/static/dokumenti.html @@ -1,931 +1,155 @@ -Dokumenti — PGŽ Sport - +📚 Dokumenti — PGŽ Sport + - - - - -

📚 Dokumenti

-
Knjižnica svih sportskih publikacija PGŽ — godišnjaci ZSPGZ, pravilnici, programi, izvještaji. Klik na dokument otvara PDF + RAG citate.
- -
- + -
- -
- -
- - -