/sport/* aliases u app: admin, dokumenti, crm/v2, erp/full
This commit is contained in:
@@ -1788,6 +1788,8 @@ def serve_sport_3d():
|
|||||||
|
|
||||||
@app.get("/admin")
|
@app.get("/admin")
|
||||||
@app.get("/admin/")
|
@app.get("/admin/")
|
||||||
|
@app.get("/sport/admin")
|
||||||
|
@app.get("/sport/admin/")
|
||||||
def serve_admin():
|
def serve_admin():
|
||||||
p = HTML_DIR / "admin.html"
|
p = HTML_DIR / "admin.html"
|
||||||
if p.exists():
|
if p.exists():
|
||||||
@@ -1806,6 +1808,8 @@ def serve_erp():
|
|||||||
|
|
||||||
@app.get("/erp/full")
|
@app.get("/erp/full")
|
||||||
@app.get("/erp/full/")
|
@app.get("/erp/full/")
|
||||||
|
@app.get("/sport/erp/full")
|
||||||
|
@app.get("/sport/erp")
|
||||||
def serve_erp_full():
|
def serve_erp_full():
|
||||||
p = HTML_DIR / "erp_full.html"
|
p = HTML_DIR / "erp_full.html"
|
||||||
if p.exists():
|
if p.exists():
|
||||||
@@ -1843,6 +1847,9 @@ def serve_crm():
|
|||||||
@app.get("/crm_v2/")
|
@app.get("/crm_v2/")
|
||||||
@app.get("/crm/v2")
|
@app.get("/crm/v2")
|
||||||
@app.get("/crm")
|
@app.get("/crm")
|
||||||
|
@app.get("/sport/crm")
|
||||||
|
@app.get("/sport/crm/v2")
|
||||||
|
@app.get("/sport/crm_v2")
|
||||||
def serve_crm_v2():
|
def serve_crm_v2():
|
||||||
p = HTML_DIR / "crm_v2.html"
|
p = HTML_DIR / "crm_v2.html"
|
||||||
if p.exists():
|
if p.exists():
|
||||||
@@ -1859,6 +1866,8 @@ def serve_login():
|
|||||||
|
|
||||||
@app.get("/admin/users")
|
@app.get("/admin/users")
|
||||||
@app.get("/admin/users/")
|
@app.get("/admin/users/")
|
||||||
|
@app.get("/sport/admin/users")
|
||||||
|
@app.get("/sport/admin/users/")
|
||||||
def serve_admin_users():
|
def serve_admin_users():
|
||||||
p = HTML_DIR / "admin_users.html"
|
p = HTML_DIR / "admin_users.html"
|
||||||
if p.exists():
|
if p.exists():
|
||||||
@@ -2097,6 +2106,8 @@ def serve_kpi():
|
|||||||
|
|
||||||
@app.get("/dokumenti")
|
@app.get("/dokumenti")
|
||||||
@app.get("/dokumenti/")
|
@app.get("/dokumenti/")
|
||||||
|
@app.get("/sport/dokumenti")
|
||||||
|
@app.get("/sport/dokumenti/")
|
||||||
def serve_dokumenti():
|
def serve_dokumenti():
|
||||||
p = HTML_DIR / "dokumenti.html"
|
p = HTML_DIR / "dokumenti.html"
|
||||||
return FileResponse(p) if p.exists() else {"error":"dokumenti.html not found"}
|
return FileResponse(p) if p.exists() else {"error":"dokumenti.html not found"}
|
||||||
|
|||||||
@@ -0,0 +1,183 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
# ═══════════════════════════════════════════════════════════════════
|
||||||
|
# Fajl: hks_playwright.py | v1.0.0 | 05.05.2026
|
||||||
|
# Lokacija: /opt/pgz-sport/scrapers/hks_playwright.py
|
||||||
|
# Svrha: Cloudflare bypass za hks.hr (košarka) preko Playwright headless
|
||||||
|
# ═══════════════════════════════════════════════════════════════════
|
||||||
|
"""HKS scraper — Playwright with stealth tricks."""
|
||||||
|
import os, re, time, json, hashlib, sys
|
||||||
|
from urllib.parse import urljoin, urlparse
|
||||||
|
import psycopg2
|
||||||
|
from psycopg2.extras import execute_batch
|
||||||
|
from playwright.sync_api import sync_playwright
|
||||||
|
|
||||||
|
# libpq connection string passed to psycopg2.connect() by crawl().
# SECURITY: the fallback literal below embeds a database password in source
# control. Set PGZ_SPORT_DSN in the environment to override it; the literal is
# kept only so existing deployments keep working and should be removed once the
# credential has been rotated.
DSN = os.environ.get("PGZ_SPORT_DSN") or (
    "host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password=R1net2026!SecureDB#v7"
)
|
||||||
|
|
||||||
|
# Case-sensitive substrings checked by is_pgz_relevant(). "Primorsko-goransk"
# is presumably a stem so that inflected forms also match -- confirm with the
# author before tightening the match to whole words.
PGZ_TOKENS = ["Rijeka", "PGŽ", "Primorsko-goransk", "Kvarner", "Opatija",
              "Crikvenica", "Krk", "Cres", "Lošinj", "Rab", "Bakar", "Kostrena",
              "Kantrida", "Trsat", "Mlaka", "Viškovo", "Kastav"]


def is_pgz_relevant(text):
    """Return True when *text* contains at least one PGZ_TOKENS entry.

    Matching is a plain, case-sensitive substring test. Empty or None input
    is treated as not relevant instead of raising.
    """
    if not text:
        return False
    # "Primorsko-goransk" is already in PGZ_TOKENS, so no separate check is needed.
    return any(token in text for token in PGZ_TOKENS)
|
||||||
|
|
||||||
|
|
||||||
|
def chunk(text, max_len=800, min_len=80):
    """Split *text* into pieces of at most *max_len* characters.

    Text that already fits in *max_len* is returned unchanged as a single
    piece (no stripping, no minimum-length filter). Longer text is cut
    greedily; each cut is moved back to just after a sentence separator
    (". ", "! ", "? " or a newline, tried in that order) when one is found
    in the second half of the window. Pieces are stripped, and pieces of
    *min_len* characters or fewer are dropped.

    Args:
        text: The string to split. Empty or None yields an empty list.
        max_len: Maximum piece length; must be positive.
        min_len: Pieces from a split text must be longer than this to be kept.

    Returns:
        A list of string pieces.

    Raises:
        ValueError: If *max_len* is not positive (the loop could not advance).
    """
    if max_len <= 0:
        raise ValueError("max_len must be positive")
    if not text:
        return []
    if len(text) <= max_len:
        return [text]

    separators = (". ", "! ", "? ", "\n")
    total = len(text)
    pieces = []
    start = 0
    while start < total:
        end = min(start + max_len, total)
        if end < total:
            for sep in separators:
                cut = text.rfind(sep, start, end)
                # Only accept a boundary past the window midpoint so pieces
                # do not become needlessly short.
                if cut > start + max_len // 2:
                    end = cut + len(sep)
                    break
        pieces.append(text[start:end].strip())
        start = end
    return [piece for piece in pieces if len(piece) > min_len]
|
||||||
|
|
||||||
|
|
||||||
|
def upsert(conn, facts, source):
    """Insert *facts* into dabi.knowledge, skipping rows whose hash already exists.

    Each fact is keyed by the MD5 hex digest of its text (``data_hash``);
    rows that conflict on that column are left untouched.

    Args:
        conn: An open psycopg2 connection.
        facts: Iterable of dicts with a required "fact" key and optional
            "confidence" (default 0.85) and "url" (default "") keys.
        source: Value stored in the ``source`` column for every row.

    Returns:
        The number of rows actually inserted, or 0 when *facts* is empty or
        the insert fails (the failure is reported on stderr).
    """
    if not facts:
        return 0

    # Local import: the file only imports execute_batch at module level.
    from psycopg2.extras import execute_values

    rows = [
        (
            f["fact"],
            source,
            "pgz_sport_savez_pw",
            f.get("confidence", 0.85),
            hashlib.md5(f["fact"].encode()).hexdigest(),
            json.dumps({"url": f.get("url", ""), "scraped_via": "playwright"}),
        )
        for f in facts
    ]
    # RETURNING yields one row per inserted fact. cursor.rowcount is not
    # usable here: after a batched execute it only reflects the last statement.
    sql = """INSERT INTO dabi.knowledge (fact, source, category, confidence, data_hash, source_refs)
             VALUES %s ON CONFLICT (data_hash) DO NOTHING RETURNING 1"""
    try:
        # The context manager closes the cursor on success and on error.
        with conn.cursor() as cur:
            inserted = execute_values(
                cur,
                sql,
                rows,
                template="(%s, %s, %s, %s, %s, %s::jsonb)",
                page_size=50,
                fetch=True,
            )
        return len(inserted)
    except Exception as e:
        # Best-effort by design: a failed batch must not stop the crawl,
        # but the reason is reported instead of being dropped silently.
        print(f" upsert err ({source}): {str(e)[:200]}", file=sys.stderr)
        return 0
|
||||||
|
|
||||||
|
|
||||||
|
# Seed URLs per source label. crawl() iterates this mapping and passes each
# key to upsert() as the `source` value.
SAVEZI = {
    "savezi_hks_kosarka_pw": [
        "https://www.hks.hr/",
        "https://www.hks.hr/klubovi/",
        "https://www.hks.hr/natjecanja/",
    ],
    "savezi_hos_odbojka_pw": [
        "https://hos.hr/",
        "https://hos.hr/klubovi/",
    ],
    "savezi_hbs_bocanje_pw": [
        "https://hbs.hr/",
        "https://hbs.hr/klubovi/",
    ],
}
|
||||||
|
|
||||||
|
|
||||||
|
# Collects the href of every <a> element on the current page.
_LINKS_JS = """
    () => Array.from(document.querySelectorAll('a'))
        .map(a => a.href).filter(h => h)
"""


def _is_cf_challenge(page):
    """Return True when the page title matches a Cloudflare interstitial."""
    title = page.title() or ""
    return "Just a moment" in title or "Attention Required" in title


def _enqueue_links(page, url, visited, queue, limit, max_queue):
    """Append same-host links from the current page to *queue*.

    Only the first *limit* hrefs are considered. Fragments are stripped before
    the visited/queued checks so "#anchor" variants of a page are not queued
    again, and a single malformed href no longer aborts the whole page.
    """
    try:
        hrefs = page.evaluate(_LINKS_JS)
    except Exception as e:
        print(f" links err {url[:50]}: {str(e)[:100]}")
        return
    base_host = urlparse(url).hostname
    for href in hrefs[:limit]:
        if len(queue) >= max_queue:
            break
        if not isinstance(href, str):
            # e.g. an <a> inside SVG exposes a non-string href object.
            continue
        target = href.split("#")[0]
        if not target or target in visited or target in queue:
            continue
        try:
            host = urlparse(target).hostname
        except ValueError:
            continue
        if host == base_host:
            queue.append(target)


def _process_page(page, conn, source, url, visited, queue, max_queue):
    """Load *url*, store its relevant text as facts, and queue its links.

    Returns the number of facts reported by upsert() for this page.
    """
    response = page.goto(url, timeout=30000, wait_until="domcontentloaded")
    if not response:
        return 0

    # Give a possible Cloudflare challenge time to resolve. A timeout here is
    # expected on busy pages, so it is deliberately ignored.
    try:
        page.wait_for_load_state("networkidle", timeout=8000)
    except Exception:
        pass

    if _is_cf_challenge(page):
        print(f" CF challenge: {url[:60]}")
        page.wait_for_timeout(8000)
        if _is_cf_challenge(page):
            # Still on the interstitial: its text must not be stored as facts.
            print(f" CF blocked: {url[:60]}")
            return 0

    text = page.evaluate("() => document.body.innerText || ''") or ""
    if len(text) < 200:
        print(f" empty: {url[:60]}")
        return 0

    relevant = is_pgz_relevant(text)
    n = 0
    if relevant:
        facts = [{"fact": c, "url": url, "confidence": 0.85}
                 for c in chunk(text, 800) if len(c) > 100]
        n = upsert(conn, facts, source)
        print(f" {url[:50]:50} → {n} facts (visit {len(visited)})")

    # Non-relevant pages are still mined for links, with a smaller budget.
    _enqueue_links(page, url, visited, queue, 80 if relevant else 50, max_queue)
    page.wait_for_timeout(800)  # rate limit, applied to every fetched page
    return n


def _crawl_source(ctx, conn, source, urls, max_pages, max_queue):
    """Crawl one source breadth-first from its seed *urls*; return its stats."""
    visited = set()
    queue = list(urls)
    facts_total = 0
    page = ctx.new_page()
    try:
        while queue and len(visited) < max_pages:
            url = queue.pop(0)
            if url in visited:
                continue
            visited.add(url)
            try:
                facts_total += _process_page(
                    page, conn, source, url, visited, queue, max_queue)
            except Exception as e:
                # One failing page must not end the crawl of this source.
                print(f" err {url[:50]}: {str(e)[:100]}")
    finally:
        page.close()
    return {"visited": len(visited), "facts": facts_total}


def crawl(max_pages=40, max_queue=100):
    """Crawl every source in SAVEZI with headless Chromium and store facts.

    For each source, pages are visited breadth-first starting from the seed
    URLs and following same-host links. Text of pages that pass
    is_pgz_relevant() is chunked and written through upsert().

    Args:
        max_pages: Maximum number of URLs visited per source.
        max_queue: Maximum number of pending URLs kept per source.

    Returns:
        Dict mapping each source label to {"visited": int, "facts": int}.
    """
    conn = psycopg2.connect(DSN)
    conn.autocommit = True
    total = {}

    try:
        with sync_playwright() as p:
            browser = p.chromium.launch(
                headless=True,
                args=["--no-sandbox", "--disable-blink-features=AutomationControlled",
                      "--disable-dev-shm-usage", "--disable-gpu"]
            )
            try:
                ctx = browser.new_context(
                    user_agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
                    viewport={"width": 1280, "height": 800},
                    locale="hr-HR",
                    extra_http_headers={"Accept-Language": "hr-HR,hr;q=0.9,en;q=0.8"},
                )
                # Stealth: override the navigator properties that reveal automation.
                ctx.add_init_script("""
                    Object.defineProperty(navigator, 'webdriver', {get: () => undefined});
                    Object.defineProperty(navigator, 'plugins', {get: () => [1, 2, 3, 4, 5]});
                    Object.defineProperty(navigator, 'languages', {get: () => ['hr-HR', 'hr', 'en']});
                """)

                for source, urls in SAVEZI.items():
                    print(f"\n=== {source} ===")
                    total[source] = _crawl_source(
                        ctx, conn, source, urls, max_pages, max_queue)

                ctx.close()
            finally:
                # Runs on errors too, so the browser is always shut down.
                browser.close()

        print("\n=== TOTAL ===")
        print(json.dumps(total, default=str))
    finally:
        conn.close()
    return total
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Script entry point: run the crawl and echo its per-source summary as JSON.
    summary = crawl()
    print(json.dumps(summary, default=str))
|
||||||
@@ -286,7 +286,7 @@ def upsert_clan(klub_db_id: int, hns_pid: int, ime_prezime: str, slug: str) -> i
|
|||||||
VALUES (%s,%s,%s,'hns_semafor',%s,%s,now(),%s,%s,'nogomet',
|
VALUES (%s,%s,%s,'hns_semafor',%s,%s,now(),%s,%s,'nogomet',
|
||||||
true, false, now(), now())
|
true, false, now(), now())
|
||||||
RETURNING id""",
|
RETURNING id""",
|
||||||
(klub_db_id, ime, prezime, str(hns_pid), url, slug or None, hns_pid),
|
(klub_db_id, ime, prezime, str(hns_pid), url, slug or None, str(hns_pid)),
|
||||||
)
|
)
|
||||||
cid = cu.fetchone()[0]
|
cid = cu.fetchone()[0]
|
||||||
c.commit()
|
c.commit()
|
||||||
|
|||||||
+118
-894
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user