Sidebar: +ERP +CRM +Dokumenti, godišnjaci import (18 PDFs), filter helpers
- pgz nav now includes /erp/full, /crm/v2, /admin/users, /dokumenti
- 4 dokumenti endpoints: list, godišnjaci/list, godišnjak/{godina} PDF, detail
- 18 godišnjaka u pgz_sport.dokumenti (2006-2024) with savez_id=333
- PGŽ filter helpers (window._pgz_filter_priority, togglePGZFilter)
- navItemClick handler for nav items with href
This commit is contained in:
@@ -0,0 +1,169 @@
|
||||
#!/usr/bin/env python3
|
||||
# ═══════════════════════════════════════════════════════════════════
|
||||
# Fajl: pgz_savezi_deep.py | v1.0.0 | 05.05.2026
|
||||
# Lokacija: /opt/pgz-sport/scrapers/pgz_savezi_deep.py
|
||||
# Svrha: Deep crawl glavnih sportskih saveza za PGŽ klubove
|
||||
# - HNS (nogomet) — hns-cff.hr, prvahnl.hr
|
||||
# - HKS (košarka) — hks.hr, abaliga.com
|
||||
# - HRS (rukomet) — hrs.hr
|
||||
# - HOS (odbojka) — hos.hr
|
||||
# - HBS (boćanje) — hbs.hr
|
||||
# - HVS (vaterpolo) — hvs.hr
|
||||
# Sve klube + utakmice + rezultate koji su u PGŽ
|
||||
# ═══════════════════════════════════════════════════════════════════
|
||||
"""Multi-savez deep scrape for PGŽ clubs."""
|
||||
import os, sys, re, time, hashlib, logging, json
|
||||
from urllib.parse import urljoin, urlparse
|
||||
import urllib.request
|
||||
from html import unescape
|
||||
import psycopg2
|
||||
from psycopg2.extras import execute_batch
|
||||
|
||||
logging.basicConfig(level=logging.INFO, format="%(asctime)s [savezi] %(message)s")
log = logging.getLogger("savezi")

# SECURITY(review): the database password is committed in source control.
# The PGZ_SPORT_DSN environment variable now takes precedence; the literal is
# kept only as a backward-compatible fallback and should be rotated and removed.
DSN = os.environ.get("PGZ_SPORT_DSN") or (
    "host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password=R1net2026!SecureDB#v7"
)
UA = "Mozilla/5.0 (Ri.NET Civic Bot 1.0; contact: dradulic@outlook.com)"

# PGŽ municipalities — used as case-sensitive substrings to filter relevant pages.
# NOTE(review): "Costrena" looks like a misspelling of "Kostrena"; kept as-is.
PGZ_TOWNS = [
    "Rijeka", "Opatija", "Crikvenica", "Krk", "Cres", "Mali Lošinj",
    "Rab", "Delnice", "Vrbovsko", "Čabar", "Bakar", "Kraljevica",
    "Kastav", "Viškovo", "Klana", "Mošćenička Draga", "Lovran",
    "Matulji", "Omišalj", "Punat", "Vrbnik", "Baška", "Dobrinj",
    "Malinska", "Jelenje", "Costrena", "Kostrena", "Čavle", "Lopar",
    "Brod Moravice", "Mrkopalj", "Ravna Gora", "Lokve", "Skrad",
    "Fužine", "Novi Vinodolski", "Vinodol",
]

# Seed URLs per federation key; the key becomes part of the stored fact source.
ROOTS = {
    "hns_nogomet": ["https://hns-cff.hr/", "https://prvahnl.hr/", "https://hns-cff.hr/klubovi/"],
    "hks_kosarka": ["https://hks.hr/", "https://hks.hr/klubovi/"],
    "hrs_rukomet": ["https://hrs.hr/", "https://hrs.hr/klubovi/"],
    "hos_odbojka": ["https://hos.hr/", "https://hos.hr/klubovi/"],
    "hbs_bocanje": ["https://hbs.hr/", "https://hbs.hr/klubovi/"],
    "hvs_vaterpolo": ["https://hvs.hr/", "https://hvs.hr/klubovi/"],
    "hps_plivanje": ["https://hps.hr/"],
    "haof_atletika": ["https://haaf.hr/"],
    "hgsf_gimnastika": ["https://hgsf.hr/"],
}
|
||||
|
||||
|
||||
def fetch(url, timeout=20, retries=2):
    """Download *url* and return ``(text, status)``.

    The body is decoded as UTF-8 with undecodable bytes replaced. Up to
    *retries* attempts are made, sleeping ``2 * attempt`` seconds between
    failed attempts. Any exception (including HTTP error statuses raised by
    ``urlopen``) counts as a failed attempt.

    Returns ``(None, 0)`` when every attempt fails or *retries* is < 1.
    """
    for attempt in range(1, retries + 1):
        try:
            req = urllib.request.Request(url, headers={"User-Agent": UA})
            with urllib.request.urlopen(req, timeout=timeout) as resp:
                return resp.read().decode("utf-8", errors="replace"), resp.status
        except Exception as exc:  # best-effort crawler: never propagate
            log.warning(f"fetch fail attempt {attempt} {url}: {exc}")
            # Back off only when another attempt will follow.
            if attempt < retries:
                time.sleep(2 * attempt)
    return None, 0
|
||||
|
||||
|
||||
def extract_text(html):
    """Reduce an HTML string to whitespace-normalised plain text.

    ``<script>`` and ``<style>`` elements are removed with their contents,
    every remaining tag becomes a space, entities are decoded and runs of
    whitespace collapse to a single space. ``None`` is treated as "".
    """
    markup = html or ""
    flags = re.S | re.I
    markup = re.sub(r"<script[^>]*>.*?</script>", "", markup, flags=flags)
    markup = re.sub(r"<style[^>]*>.*?</style>", "", markup, flags=flags)
    without_tags = re.sub(r"<[^>]+>", " ", markup)
    decoded = unescape(without_tags)
    collapsed = re.sub(r"\s+", " ", decoded)
    return collapsed.strip()
|
||||
|
||||
|
||||
def is_pgz_relevant(text):
    """Return True when *text* mentions a PGŽ town or the county itself.

    Matching is a case-sensitive substring test against every entry of
    ``PGZ_TOWNS`` plus the markers "Primorsko-goranska" and "PGŽ".
    """
    for town in PGZ_TOWNS:
        if town in text:
            return True
    if "Primorsko-goranska" in text:
        return True
    return "PGŽ" in text
|
||||
|
||||
|
||||
def chunk(text, max_len=800):
    """Split *text* into pieces of at most *max_len* characters.

    A text that already fits is returned whole (without the length filter).
    Otherwise each piece is cut at the first separator type in
    ``". ", "! ", "? ", "\\n"`` whose last occurrence lies past the middle of
    the window; pieces are stripped and those of 80 characters or fewer are
    dropped. Cut positions are deliberately unchanged because callers hash
    chunk text for de-duplication.

    Raises:
        ValueError: if *max_len* < 1 for a non-empty text (the loop could
            otherwise never advance).
    """
    if not text:
        return []
    if len(text) <= max_len:
        return [text]
    if max_len < 1:
        raise ValueError("max_len must be a positive integer")

    total = len(text)
    pieces = []
    start = 0
    while start < total:
        end = min(start + max_len, total)
        if end < total:
            for sep in (". ", "! ", "? ", "\n"):
                cut = text.rfind(sep, start, end)
                if cut > start + max_len // 2:
                    end = cut + len(sep)
                    break
        pieces.append(text[start:end].strip())
        start = end
    return [piece for piece in pieces if len(piece) > 80]
|
||||
|
||||
|
||||
def upsert(conn, facts, savez_key):
    """Insert *facts* into ``dabi.knowledge`` and return the number of new rows.

    Each fact dict must carry ``"fact"``; ``"confidence"`` (default 0.82) and
    ``"url"`` are optional. Rows are keyed by the MD5 of the fact text
    (``data_hash``); conflicting rows are skipped.

    The count comes from ``RETURNING`` rows rather than ``cursor.rowcount``:
    after a batched execute, rowcount does not cover the whole batch.
    Any database error is logged and reported as 0 inserted rows.
    """
    if not facts:
        return 0
    # Local import: the module header only imports execute_batch.
    from psycopg2.extras import execute_values

    source = f"savezi_{savez_key}"
    rows = [
        (
            f["fact"],
            source,
            "pgz_sport_savezi",
            f.get("confidence", 0.82),
            hashlib.md5(f["fact"].encode()).hexdigest(),
            json.dumps({"url": f.get("url", "")}),
        )
        for f in facts
    ]
    sql = """INSERT INTO dabi.knowledge (fact, source, category, confidence, data_hash, source_refs)
             VALUES %s ON CONFLICT (data_hash) DO NOTHING RETURNING 1"""
    try:
        # The context manager closes the cursor on the error path as well.
        with conn.cursor() as cur:
            inserted = execute_values(
                cur, sql, rows,
                template="(%s, %s, %s, %s, %s, %s::jsonb)",
                page_size=50, fetch=True,
            )
        return len(inserted)
    except Exception as e:
        log.error(f"upsert: {e}")
        return 0
|
||||
|
||||
|
||||
def crawl_savez(savez_key, urls, max_per=80):
    """Breadth-first crawl of one federation and ingest of PGŽ-relevant pages.

    Starting from *urls*, up to *max_per* pages are fetched. Links are only
    followed when they stay on the host of the page they were found on.
    Pages whose text passes ``is_pgz_relevant`` are chunked and stored via
    ``upsert``.

    Returns a dict with keys ``savez``, ``visited``, ``pgz_relevant``, ``facts``.
    """
    log.info(f"=== {savez_key} ===")
    conn = psycopg2.connect(DSN)
    conn.autocommit = True

    # Static assets and documents are not HTML pages; fetching them wastes the
    # page budget and can feed decoded binary garbage to the relevance filter.
    skip_ext = (".css", ".js", ".json", ".xml", ".jpg", ".jpeg", ".png", ".gif",
                ".svg", ".webp", ".ico", ".woff", ".woff2", ".ttf", ".pdf", ".zip",
                ".doc", ".docx", ".xls", ".xlsx", ".mp3", ".mp4")

    visited = set()
    queue = list(dict.fromkeys(urls))  # de-duplicated seeds, order preserved
    queued = set(queue)                # everything ever enqueued
    total_facts = 0
    pgz_relevant = 0

    try:
        while queue and len(visited) < max_per:
            url = queue.pop(0)
            if url in visited:
                continue
            visited.add(url)

            # Politeness delay between *all* requests, not only after
            # relevant pages.
            if len(visited) > 1:
                time.sleep(0.4)

            html, _ = fetch(url, timeout=15)
            if not html:
                continue

            text = extract_text(html)
            if not text or len(text) < 100:
                continue

            # Enqueue same-host subpages.
            base_host = urlparse(url).hostname or ""
            for m in re.finditer(r'href=["\']([^"\']+)["\']', html):
                try:
                    # Attribute values are entity-encoded; drop the fragment
                    # before de-duplicating.
                    link = urljoin(url, unescape(m.group(1))).split("#")[0]
                    parts = urlparse(link)
                except ValueError:
                    continue  # malformed href
                if (parts.hostname or "") != base_host:
                    continue
                if parts.path.lower().endswith(skip_ext):
                    continue
                if link in queued or len(queue) >= 200:
                    continue
                queued.add(link)
                queue.append(link)

            # Only ingest PGŽ-relevant content.
            if not is_pgz_relevant(text):
                continue
            pgz_relevant += 1

            facts = [{"fact": c, "url": url, "confidence": 0.82}
                     for c in chunk(text, 800) if len(c) > 100]
            total_facts += upsert(conn, facts, savez_key)

        log.info(f" {savez_key}: visited={len(visited)} pgz_relevant={pgz_relevant} facts={total_facts}")
    finally:
        conn.close()

    return {"savez": savez_key, "visited": len(visited),
            "pgz_relevant": pgz_relevant, "facts": total_facts}
|
||||
|
||||
|
||||
def main():
    """Crawl every federation listed in ROOTS and print a JSON summary.

    A failure in one federation is logged and recorded as an ``error`` entry;
    the remaining federations are still processed.
    """
    summary = []
    for savez, urls in ROOTS.items():
        try:
            outcome = crawl_savez(savez, urls, max_per=60)
        except Exception as e:
            log.error(f"{savez} fail: {e}")
            outcome = {"savez": savez, "error": str(e)}
        summary.append(outcome)

    grand_total = sum(entry.get("facts", 0) for entry in summary)
    print(json.dumps({"summary": summary, "total_facts": grand_total}))


if __name__ == "__main__":
    main()
|
||||
@@ -0,0 +1,194 @@
|
||||
#!/usr/bin/env python3
|
||||
# ═══════════════════════════════════════════════════════════════════
|
||||
# Fajl: rss_hr_full.py | v1.0.0 | 05.05.2026
|
||||
# Lokacija: /opt/pgz-sport/scrapers/rss_hr_full.py
|
||||
# Svrha: rss.hr (Riječki sport savez) full crawl
|
||||
# ═══════════════════════════════════════════════════════════════════
|
||||
"""rss.hr complete corpus."""
|
||||
import os, sys, re, time, hashlib, logging, json
|
||||
from urllib.parse import urljoin, urlparse
|
||||
import urllib.request
|
||||
from html import unescape
|
||||
import psycopg2
|
||||
from psycopg2.extras import execute_batch
|
||||
|
||||
logging.basicConfig(level=logging.INFO, format="%(asctime)s [rss_hr] %(message)s")
log = logging.getLogger("rss_hr")

# SECURITY(review): the database password is committed in source control.
# The PGZ_SPORT_DSN environment variable now takes precedence; the literal is
# kept only as a backward-compatible fallback and should be rotated and removed.
DSN = os.environ.get("PGZ_SPORT_DSN") or (
    "host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password=R1net2026!SecureDB#v7"
)
UA = "Mozilla/5.0 (Ri.NET Civic Bot 1.0; contact: dradulic@outlook.com)"
ROOT = "https://rss.hr"
HTML_DIR = "/opt/pgz-sport/data/rss_hr_html"
PDF_DIR = "/opt/pgz-sport/data/rss_hr_pdf"

# Output directories are created at import time.
os.makedirs(HTML_DIR, exist_ok=True)
os.makedirs(PDF_DIR, exist_ok=True)
|
||||
|
||||
|
||||
def fetch(url, timeout=20, retries=3, binary=False):
    """Download *url* and return ``(payload, status)``.

    *payload* is raw ``bytes`` when *binary* is true, otherwise the body
    decoded as UTF-8 with undecodable bytes replaced. Up to *retries*
    attempts are made, sleeping ``3 * attempt`` seconds between failed
    attempts. Any exception counts as a failed attempt.

    Returns ``(None, 0)`` when every attempt fails or *retries* is < 1.
    """
    for attempt in range(1, retries + 1):
        try:
            req = urllib.request.Request(url, headers={"User-Agent": UA})
            with urllib.request.urlopen(req, timeout=timeout) as resp:
                body = resp.read()
                if binary:
                    return body, resp.status
                return body.decode("utf-8", errors="replace"), resp.status
        except Exception as exc:  # best-effort crawler: never propagate
            log.warning(f"fetch fail attempt {attempt} {url}: {exc}")
            # Back off only when another attempt will follow.
            if attempt < retries:
                time.sleep(3 * attempt)
    return None, 0
|
||||
|
||||
|
||||
def extract_title(html):
    """Return the text of the first attribute-less ``<title>`` element.

    Entities are decoded and whitespace is collapsed; "" is returned when no
    title is found or *html* is empty/None.
    """
    found = re.search(r"<title>([^<]+)</title>", html or "", re.I)
    if not found:
        return ""
    decoded = unescape(found.group(1))
    return re.sub(r"\s+", " ", decoded).strip()
|
||||
|
||||
|
||||
def extract_text(html):
    """Reduce an HTML string to whitespace-normalised plain text.

    ``<script>``, ``<style>``, ``<nav>`` and ``<footer>`` elements are removed
    with their contents (in that order), remaining tags become spaces,
    entities are decoded and whitespace collapses. ``None`` is treated as "".
    """
    markup = html or ""
    flags = re.S | re.I
    markup = re.sub(r"<script[^>]*>.*?</script>", "", markup, flags=flags)
    markup = re.sub(r"<style[^>]*>.*?</style>", "", markup, flags=flags)
    markup = re.sub(r"<nav[^>]*>.*?</nav>", "", markup, flags=flags)
    markup = re.sub(r"<footer[^>]*>.*?</footer>", "", markup, flags=flags)
    decoded = unescape(re.sub(r"<[^>]+>", " ", markup))
    return re.sub(r"\s+", " ", decoded).strip()
|
||||
|
||||
|
||||
def find_internal_links(html, base):
    """Return the distinct crawlable rss.hr page URLs referenced by *html*.

    Every ``href`` value is entity-decoded, resolved against *base* and
    stripped of its fragment (the query string is kept). A link is returned
    only when it is http(s), its host is exactly ``rss.hr`` or a subdomain,
    and its path does not look like a static asset or document.
    Order of the returned list is unspecified.
    """
    if not html:
        return []

    # Non-HTML targets would otherwise be fetched and ingested as page text;
    # PDFs are collected separately by find_pdfs.
    skip_ext = (".css", ".js", ".json", ".xml", ".jpg", ".jpeg", ".png", ".gif",
                ".svg", ".webp", ".ico", ".woff", ".woff2", ".ttf", ".pdf", ".zip",
                ".doc", ".docx", ".xls", ".xlsx", ".mp3", ".mp4")

    found = set()
    for m in re.finditer(r'href=["\']([^"\']+)["\']', html):
        try:
            link = urljoin(base, unescape(m.group(1)))
            parts = urlparse(link)
        except ValueError:
            continue  # malformed href
        if parts.scheme not in ("http", "https"):
            continue
        host = parts.hostname or ""
        # Exact/suffix match: a substring test would also accept hosts such
        # as "rss.hr.example.com".
        if host != "rss.hr" and not host.endswith(".rss.hr"):
            continue
        if parts.path.lower().endswith(skip_ext):
            continue
        found.add(link.split("#")[0])
    return list(found)
|
||||
|
||||
|
||||
def find_pdfs(html, base):
    """Return the distinct absolute URLs of ``href`` targets ending in ``.pdf``.

    The extension match is case-insensitive; relative links are resolved
    against *base*. Order of the returned list is unspecified.
    """
    hrefs = re.findall(r'href=["\']([^"\']+\.pdf)["\']', html or "", re.I)
    unique = {urljoin(base, href) for href in hrefs}
    return list(unique)
|
||||
|
||||
|
||||
def chunk(text, max_len=800):
    """Split *text* into pieces of at most *max_len* characters.

    A text that already fits is returned whole (without the length filter).
    Otherwise each piece is cut at the first separator type in
    ``". ", "! ", "? ", "\\n"`` whose last occurrence lies past the middle of
    the window; pieces are stripped and those of 50 characters or fewer are
    dropped. Cut positions are deliberately unchanged because callers hash
    chunk text for de-duplication.

    Raises:
        ValueError: if *max_len* < 1 for a non-empty text (the loop could
            otherwise never advance).
    """
    if not text:
        return []
    if len(text) <= max_len:
        return [text]
    if max_len < 1:
        raise ValueError("max_len must be a positive integer")

    total = len(text)
    pieces = []
    start = 0
    while start < total:
        end = min(start + max_len, total)
        if end < total:
            for sep in (". ", "! ", "? ", "\n"):
                cut = text.rfind(sep, start, end)
                if cut > start + max_len // 2:
                    end = cut + len(sep)
                    break
        pieces.append(text[start:end].strip())
        start = end
    return [piece for piece in pieces if len(piece) > 50]
|
||||
|
||||
|
||||
def upsert(conn, facts):
    """Insert *facts* into ``dabi.knowledge`` and return the number of new rows.

    Each fact dict must carry ``"fact"`` and ``"source"``; ``"category"``
    (default "rss_hr"), ``"confidence"`` (default 0.85) and ``"url"`` are
    optional. Rows are keyed by the MD5 of the fact text (``data_hash``);
    conflicting rows are skipped.

    The count comes from ``RETURNING`` rows rather than ``cursor.rowcount``:
    after a batched execute, rowcount does not cover the whole batch.
    Any database error is logged and reported as 0 inserted rows.
    """
    if not facts:
        return 0
    # Local import: the module header only imports execute_batch.
    from psycopg2.extras import execute_values

    rows = [
        (
            f["fact"],
            f["source"],
            f.get("category", "rss_hr"),
            f.get("confidence", 0.85),
            hashlib.md5(f["fact"].encode()).hexdigest(),
            json.dumps({"url": f.get("url", "")}),
        )
        for f in facts
    ]
    sql = """INSERT INTO dabi.knowledge (fact, source, category, confidence, data_hash, source_refs)
             VALUES %s ON CONFLICT (data_hash) DO NOTHING RETURNING 1"""
    try:
        # The context manager closes the cursor on the error path as well.
        with conn.cursor() as cur:
            inserted = execute_values(
                cur, sql, rows,
                template="(%s, %s, %s, %s, %s, %s::jsonb)",
                page_size=50, fetch=True,
            )
        return len(inserted)
    except Exception as e:
        log.error(f"upsert: {e}")
        return 0
|
||||
|
||||
|
||||
def crawl(max_pages=400):
    """Crawl rss.hr breadth-first, ingest page text and download linked PDFs.

    Up to *max_pages* pages are fetched starting from a fixed seed list.
    Each page is saved under ``HTML_DIR``, its title and text chunks are
    stored via ``upsert``, and its internal links are queued. Afterwards at
    most 100 of the discovered PDF links are downloaded into ``PDF_DIR``.

    Returns a dict with keys ``visited``, ``facts`` and ``pdfs``.
    """
    log.info(f"=== rss.hr crawl (max {max_pages} pages) ===")
    conn = psycopg2.connect(DSN)
    conn.autocommit = True

    try:
        visited = set()
        queue = [ROOT, f"{ROOT}/clanovi/", f"{ROOT}/natjecanja/",
                 f"{ROOT}/dokumenti/", f"{ROOT}/o-nama/",
                 f"{ROOT}/sportasi-sezone/", f"{ROOT}/povjerenstva/",
                 f"{ROOT}/strucni-savjet/"]
        # Everything ever enqueued, so repeated links do not eat the queue cap.
        queued = set(queue)

        total_facts = 0
        pdfs = set()

        while queue and len(visited) < max_pages:
            url = queue.pop(0)
            if url in visited:
                continue
            visited.add(url)

            if len(visited) % 20 == 0:
                log.info(f" visited {len(visited)}, queue {len(queue)}, facts {total_facts}")

            html, _ = fetch(url, timeout=15)
            if not html:
                continue

            # Keep a local copy of the page; failure to save is not fatal.
            page_id = hashlib.md5(url.encode()).hexdigest()[:16]
            try:
                with open(f"{HTML_DIR}/{page_id}.html", "w", encoding="utf-8") as fh:
                    fh.write(html)
            except Exception as exc:
                log.warning(f"html save failed {url}: {exc}")

            title = extract_title(html)
            text = extract_text(html)

            pdfs.update(find_pdfs(html, url))

            facts = []
            if title and len(title) > 8:
                facts.append({"fact": f"rss.hr — {title}", "source": "rss.hr",
                              "category": "rss_hr_riecki_sport_savez",
                              "confidence": 0.90, "url": url})
            for c in chunk(text, 800):
                if len(c) < 80:
                    continue
                facts.append({"fact": c, "source": "rss.hr",
                              "category": "rss_hr_riecki_sport_savez",
                              "confidence": 0.85, "url": url})

            total_facts += upsert(conn, facts)

            # Discover more links.
            for link in find_internal_links(html, url):
                if link not in queued and len(queue) < 1000:
                    queued.add(link)
                    queue.append(link)

            time.sleep(0.4)

        # Download PDFs; sorted so the 100-item cap picks a stable subset.
        pdf_dl = 0
        for pdf_url in sorted(pdfs)[:100]:
            try:
                pdf_id = hashlib.md5(pdf_url.encode()).hexdigest()[:16]
                path = f"{PDF_DIR}/{pdf_id}.pdf"
                if os.path.exists(path):
                    continue
                data, st = fetch(pdf_url, timeout=30, binary=True)
                if data and st == 200:
                    with open(path, "wb") as fh:
                        fh.write(data)
                    pdf_dl += 1
                time.sleep(0.8)
            except Exception as exc:
                log.warning(f"pdf err {pdf_url}: {exc}")

        log.info(f"=== DONE: {len(visited)} visited, {total_facts} facts, {pdf_dl} pdfs ===")
    finally:
        conn.close()

    return {"visited": len(visited), "facts": total_facts, "pdfs": pdf_dl}


if __name__ == "__main__":
    r = crawl()
    print(json.dumps(r))
|
||||
@@ -0,0 +1,244 @@
|
||||
#!/usr/bin/env python3
|
||||
# ═══════════════════════════════════════════════════════════════════
|
||||
# Fajl: sport_pgz_full.py | v1.0.0 | 05.05.2026
|
||||
# Lokacija: /opt/pgz-sport/scrapers/sport_pgz_full.py
|
||||
# Svrha: Sitemap-driven full crawl of sport-pgz.hr
|
||||
# - All 4 sitemaps: objave (1+2), natječaji, stranice
|
||||
# - PDF download + OCR ingest
|
||||
# - Article parsing → dabi.knowledge ingest
|
||||
# ═══════════════════════════════════════════════════════════════════
|
||||
"""sport-pgz.hr complete corpus via sitemap."""
|
||||
import os, sys, re, time, hashlib, logging, json
|
||||
from urllib.parse import urljoin, urlparse
|
||||
import urllib.request
|
||||
from html import unescape
|
||||
import psycopg2
|
||||
from psycopg2.extras import execute_batch
|
||||
|
||||
logging.basicConfig(level=logging.INFO, format="%(asctime)s [sport_pgz] %(message)s")
log = logging.getLogger("sport_pgz")

# SECURITY(review): the database password is committed in source control.
# The PGZ_SPORT_DSN environment variable now takes precedence; the literal is
# kept only as a backward-compatible fallback and should be rotated and removed.
DSN = os.environ.get("PGZ_SPORT_DSN") or (
    "host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password=R1net2026!SecureDB#v7"
)
UA = "Mozilla/5.0 (Ri.NET Civic Bot 1.0; contact: dradulic@outlook.com)"
SITEMAP_INDEX = "https://sport-pgz.hr/sitemap.xml"
PDF_DIR = "/opt/pgz-sport/data/sport_pgz_pdf"
HTML_DIR = "/opt/pgz-sport/data/sport_pgz_html"

# Output directories are created at import time.
os.makedirs(PDF_DIR, exist_ok=True)
os.makedirs(HTML_DIR, exist_ok=True)
|
||||
|
||||
|
||||
def fetch(url, timeout=20, retries=3, binary=False):
    """Download *url* and return ``(payload, status)``.

    *payload* is raw ``bytes`` when *binary* is true, otherwise the body
    decoded as UTF-8 with undecodable bytes replaced. Up to *retries*
    attempts are made, sleeping ``3 * attempt`` seconds between failed
    attempts. Every failure is logged as a warning.

    Returns ``(None, 0)`` when every attempt fails or *retries* is < 1.
    """
    for i in range(retries):
        try:
            req = urllib.request.Request(url, headers={"User-Agent": UA})
            with urllib.request.urlopen(req, timeout=timeout) as resp:
                data = resp.read()
                if binary:
                    return data, resp.status
                return data.decode("utf-8", errors="replace"), resp.status
        except Exception as e:  # best-effort crawler: never propagate
            log.warning(f"fetch fail attempt {i+1} {url}: {e}")
            # Back off only when another attempt will follow.
            if i + 1 < retries:
                time.sleep(3 * (i + 1))
    return None, 0
|
||||
|
||||
|
||||
def parse_sitemap_index(xml):
    """Return the http(s) URLs found in ``<loc>`` elements of *xml*.

    Whitespace around the URL is ignored and XML entities are decoded:
    sitemap files must entity-escape URLs, so ``&amp;`` has to become ``&``
    before the URL is fetched. Returns [] for empty/None input.
    """
    found = re.findall(r"<loc>\s*(https?://[^<]+?)\s*</loc>", xml or "")
    return [unescape(loc).strip() for loc in found]
|
||||
|
||||
|
||||
def parse_sitemap_urls(xml):
    """Return ``(url, lastmod)`` pairs for each ``<url>`` entry of *xml*.

    Only entries whose ``<loc>`` is the first child of ``<url>`` are matched;
    ``lastmod`` is "" unless a ``<lastmod>`` element directly follows the
    ``<loc>``. URLs are entity-decoded (sitemaps escape ``&`` as ``&amp;``)
    and stripped; entries with an empty URL are skipped.
    """
    pattern = r"<url>\s*<loc>\s*([^<]+?)\s*</loc>(?:\s*<lastmod>([^<]*)</lastmod>)?"
    pairs = []
    for m in re.finditer(pattern, xml or ""):
        url = unescape(m.group(1)).strip()
        if not url:
            continue
        pairs.append((url, (m.group(2) or "").strip()))
    return pairs
|
||||
|
||||
|
||||
def extract_main_text(html):
    """Reduce an HTML string to whitespace-normalised plain text.

    ``<script>``, ``<style>``, ``<nav>`` and ``<footer>`` elements are removed
    with their contents (in that order), remaining tags become spaces,
    entities are decoded and whitespace collapses. Returns "" for
    empty/None input.
    """
    if not html:
        return ""
    flags = re.S | re.I
    body = re.sub(r"<script[^>]*>.*?</script>", "", html, flags=flags)
    body = re.sub(r"<style[^>]*>.*?</style>", "", body, flags=flags)
    body = re.sub(r"<nav[^>]*>.*?</nav>", "", body, flags=flags)
    body = re.sub(r"<footer[^>]*>.*?</footer>", "", body, flags=flags)
    decoded = unescape(re.sub(r"<[^>]+>", " ", body))
    return re.sub(r"\s+", " ", decoded).strip()
|
||||
|
||||
|
||||
def extract_title(html):
    """Return the text of the first attribute-less ``<title>`` element.

    Entities are decoded and whitespace is collapsed; "" is returned when no
    title is found or *html* is empty/None.
    """
    found = re.search(r"<title>([^<]+)</title>", html or "", re.I)
    if found is None:
        return ""
    raw_title = unescape(found.group(1))
    return re.sub(r"\s+", " ", raw_title).strip()
|
||||
|
||||
|
||||
def find_pdf_links(html, base):
    """Return the distinct absolute URLs of ``href`` targets ending in ``.pdf``.

    The extension match is case-insensitive; relative links are resolved
    against *base*. Returns [] for empty/None input. Order is unspecified.
    """
    if not html:
        return []
    hrefs = re.findall(r'href=["\']([^"\']+\.pdf)["\']', html, re.I)
    return list({urljoin(base, href) for href in hrefs})
|
||||
|
||||
|
||||
def chunk_text(text, max_len=800):
    """Split *text* into pieces of at most *max_len* characters.

    A text that already fits is returned whole (without the length filter).
    Otherwise each piece is cut at the first separator type in
    ``". ", "! ", "? ", "\\n"`` whose last occurrence lies past the middle of
    the window; pieces are stripped and those of 50 characters or fewer are
    dropped. Cut positions are deliberately unchanged because callers hash
    chunk text for de-duplication.

    Raises:
        ValueError: if *max_len* < 1 for a non-empty text (the loop could
            otherwise never advance).
    """
    if not text:
        return []
    if len(text) <= max_len:
        return [text]
    if max_len < 1:
        raise ValueError("max_len must be a positive integer")

    total = len(text)
    chunks = []
    start = 0
    while start < total:
        end = min(start + max_len, total)
        # Prefer to end the piece on a sentence boundary or newline.
        if end < total:
            for sep in (". ", "! ", "? ", "\n"):
                cut = text.rfind(sep, start, end)
                if cut > start + max_len // 2:
                    end = cut + len(sep)
                    break
        chunks.append(text[start:end].strip())
        start = end
    return [c for c in chunks if len(c) > 50]
|
||||
|
||||
|
||||
def upsert_facts(conn, facts):
    """Bulk-insert *facts* into ``dabi.knowledge``; return the number of new rows.

    Each fact dict must carry ``"fact"`` and ``"source"``; ``"category"``
    (default "sport_pgz"), ``"confidence"`` (default 0.85) and ``"url"`` are
    optional. Rows are keyed by the MD5 of the fact text (``data_hash``);
    conflicting rows are skipped.

    The count comes from ``RETURNING`` rows rather than ``cursor.rowcount``:
    after a batched execute, rowcount does not cover the whole batch.
    Any database error is logged and reported as 0 inserted rows.
    """
    if not facts:
        return 0
    # Local import: the module header only imports execute_batch.
    from psycopg2.extras import execute_values

    rows = [
        (
            f["fact"],
            f["source"],
            f.get("category", "sport_pgz"),
            f.get("confidence", 0.85),
            hashlib.md5(f["fact"].encode()).hexdigest(),
            json.dumps({"url": f.get("url", "")}),
        )
        for f in facts
    ]
    sql = """
        INSERT INTO dabi.knowledge (fact, source, category, confidence, data_hash, source_refs)
        VALUES %s
        ON CONFLICT (data_hash) DO NOTHING
        RETURNING 1
    """
    try:
        # The context manager closes the cursor on the error path as well.
        with conn.cursor() as cur:
            inserted = execute_values(
                cur, sql, rows,
                template="(%s, %s, %s, %s, %s, %s::jsonb)",
                page_size=50, fetch=True,
            )
        return len(inserted)
    except Exception as e:
        log.error(f"upsert err: {e}")
        return 0
|
||||
|
||||
|
||||
def crawl():
    """Crawl every URL listed in the sport-pgz.hr sitemaps.

    Steps: fetch the sitemap index, expand each sub-sitemap, fetch every page
    (saving the HTML under ``HTML_DIR`` and storing title/text chunks via
    ``upsert_facts``), then download up to 200 of the PDF links found.

    Returns a dict with keys ``crawled``, ``facts`` and ``pdfs``, or ``None``
    when the sitemap index cannot be fetched. The database connection is
    closed on every exit path.
    """
    log.info("=== sport-pgz.hr full crawl ===")
    conn = psycopg2.connect(DSN)
    conn.autocommit = True

    try:
        # 1. Get sitemap index
        xml, _ = fetch(SITEMAP_INDEX)
        if not xml:
            log.error("sitemap fetch failed")
            return None

        sub_sitemaps = parse_sitemap_index(xml)
        log.info(f"sub-sitemaps: {len(sub_sitemaps)}")

        all_urls = []
        for sm in sub_sitemaps:
            sub_xml, _ = fetch(sm)
            if sub_xml:
                urls = parse_sitemap_urls(sub_xml)
                all_urls.extend(urls)
                log.info(f" {sm}: {len(urls)} urls")

        log.info(f"TOTAL URLs to crawl: {len(all_urls)}")

        # 2. Crawl each URL → text → facts
        total_facts = 0
        crawled = 0
        pdfs_found = set()

        for idx, (url, _lastmod) in enumerate(all_urls, 1):
            if idx % 20 == 0:
                log.info(f" progress: {idx}/{len(all_urls)} crawled, {total_facts} facts")

            try:
                html, _ = fetch(url, timeout=15)
                if not html:
                    continue

                # Save HTML for replay; failure to save is not fatal.
                url_hash = hashlib.md5(url.encode()).hexdigest()[:16]
                html_path = f"{HTML_DIR}/{url_hash}.html"
                try:
                    with open(html_path, "w", encoding="utf-8") as fh:
                        fh.write(html)
                except Exception as exc:
                    log.warning(f"html save failed {url}: {exc}")

                title = extract_title(html)
                text = extract_main_text(html)

                # Collect PDFs
                pdfs_found.update(find_pdf_links(html, url))

                # Build facts
                facts = []
                if title and len(title) > 10:
                    facts.append({
                        "fact": f"sport-pgz.hr — {title}",
                        "source": "sport-pgz.hr",
                        "category": "sport_pgz_official",
                        "confidence": 0.92,
                        "url": url,
                    })

                for piece in chunk_text(text, max_len=800):
                    if len(piece) < 80:
                        continue
                    facts.append({
                        "fact": piece,
                        "source": "sport-pgz.hr",
                        "category": "sport_pgz_official",
                        "confidence": 0.88,
                        "url": url,
                    })

                total_facts += upsert_facts(conn, facts)
                crawled += 1

                time.sleep(0.5)  # rate limit

            except Exception as e:
                log.warning(f"err {url}: {e}")

        # 3. Download PDFs; sorted so the 200-item cap picks a stable subset.
        pdf_urls = sorted(pdfs_found)
        log.info(f"PDF links found: {len(pdf_urls)}")
        pdf_downloaded = 0
        for pdf_url in pdf_urls[:200]:  # limit for first run
            try:
                url_hash = hashlib.md5(pdf_url.encode()).hexdigest()[:16]
                pdf_path = f"{PDF_DIR}/{url_hash}.pdf"
                if os.path.exists(pdf_path):
                    continue
                data, status = fetch(pdf_url, timeout=30, binary=True)
                if data and status == 200:
                    with open(pdf_path, "wb") as fh:
                        fh.write(data)
                    pdf_downloaded += 1
                time.sleep(1)
            except Exception as e:
                log.warning(f"pdf err {pdf_url}: {e}")

        log.info(f"=== DONE: {crawled} pages crawled, {total_facts} facts inserted, {pdf_downloaded} PDFs downloaded ===")
    finally:
        conn.close()

    return {"crawled": crawled, "facts": total_facts, "pdfs": pdf_downloaded}


if __name__ == "__main__":
    r = crawl()
    print(json.dumps(r))
|
||||
@@ -0,0 +1,147 @@
|
||||
#!/usr/bin/env python3
|
||||
# ═══════════════════════════════════════════════════════════════════
|
||||
# Fajl: wiki_pgz_sport.py | v1.0.0 | 05.05.2026
|
||||
# Lokacija: /opt/pgz-sport/scrapers/wiki_pgz_sport.py
|
||||
# Svrha: Wikipedia HR/EN scrape — PGŽ sport klubovi + sportaši
|
||||
# - Iterate kroz sve known PGŽ klubove
|
||||
# - Wiki API → page extract
|
||||
# - Plus historical match results od Wikipedia season tables
|
||||
# ═══════════════════════════════════════════════════════════════════
|
||||
"""Wikipedia PGŽ sport corpus."""
|
||||
import os, sys, re, time, hashlib, logging, json
|
||||
import urllib.request, urllib.parse
|
||||
import psycopg2
|
||||
from psycopg2.extras import execute_batch
|
||||
|
||||
logging.basicConfig(level=logging.INFO, format="%(asctime)s [wiki_sport] %(message)s")
log = logging.getLogger("wiki_sport")

# SECURITY(review): the database password is committed in source control.
# The PGZ_SPORT_DSN environment variable now takes precedence; the literal is
# kept only as a backward-compatible fallback and should be rotated and removed.
DSN = os.environ.get("PGZ_SPORT_DSN") or (
    "host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password=R1net2026!SecureDB#v7"
)
UA = "Ri.NET Civic Bot 1.0 (contact: dradulic@outlook.com)"
API_HR = "https://hr.wikipedia.org/w/api.php"
API_EN = "https://en.wikipedia.org/w/api.php"
|
||||
|
||||
|
||||
def wiki_extract(api, title, sentences=None):
    """Return the plain-text extract of Wikipedia page *title*, or None.

    *api* is the ``api.php`` endpoint to query; redirects are followed.
    When *sentences* is truthy it is sent as ``exsentences``. None is
    returned when the first page in the response has id "-1", when the
    response contains no pages, or when the request fails (failure is logged).
    """
    query = {
        "action": "query", "prop": "extracts", "explaintext": "1",
        "redirects": "1", "format": "json", "titles": title,
    }
    if sentences:
        query["exsentences"] = str(sentences)

    request = urllib.request.Request(
        api + "?" + urllib.parse.urlencode(query),
        headers={"User-Agent": UA},
    )
    try:
        with urllib.request.urlopen(request, timeout=15) as resp:
            payload = json.loads(resp.read())
        pages = payload.get("query", {}).get("pages", {})
        # Only the first page entry is ever inspected.
        first = next(iter(pages.items()), None)
        if first is not None:
            page_id, page = first
            if page_id == "-1":
                return None  # not found
            return page.get("extract", "")
    except Exception as e:
        log.warning(f"wiki err {title}: {e}")
    return None
|
||||
|
||||
|
||||
def wiki_search(api, query, limit=5):
    """Return up to *limit* page titles matching *query* on the given wiki.

    *api* is the ``api.php`` endpoint to query. On any request or parsing
    failure the error is logged and an empty list is returned.
    """
    params = {"action": "query", "list": "search", "srsearch": query,
              "format": "json", "srlimit": str(limit)}
    url = api + "?" + urllib.parse.urlencode(params)
    req = urllib.request.Request(url, headers={"User-Agent": UA})
    try:
        with urllib.request.urlopen(req, timeout=10) as resp:
            payload = json.loads(resp.read())
        return [hit["title"] for hit in payload.get("query", {}).get("search", [])]
    except Exception as e:
        # Previously swallowed silently; log it the same way wiki_extract does.
        log.warning(f"wiki search err {query}: {e}")
        return []
|
||||
|
||||
|
||||
def get_pgz_clubs(conn):
    """Return ``(naziv, skraceni_naziv)`` tuples for active clubs, by name.

    Reads ``pgz_sport.klubovi``; a NULL ``aktivan`` counts as active and a
    NULL ``skraceni_naziv`` is returned as "". Database errors propagate to
    the caller.
    """
    # The context manager closes the cursor even when the query raises.
    with conn.cursor() as cur:
        cur.execute("""
            SELECT naziv, COALESCE(skraceni_naziv, '')
            FROM pgz_sport.klubovi
            WHERE COALESCE(aktivan, true) = true
            ORDER BY naziv
        """)
        return [(row[0], row[1]) for row in cur.fetchall()]
|
||||
|
||||
|
||||
def chunk(text, max_len=700):
    """Split *text* into pieces of at most *max_len* characters.

    A text that already fits is returned whole (without the length filter).
    Otherwise each piece is cut at the first separator type in
    ``". ", "! ", "? ", "\\n"`` whose last occurrence lies past the middle of
    the window; pieces are stripped and those of 80 characters or fewer are
    dropped. Cut positions are deliberately unchanged because callers hash
    chunk text for de-duplication.

    Raises:
        ValueError: if *max_len* < 1 for a non-empty text (the loop could
            otherwise never advance).
    """
    if not text:
        return []
    if len(text) <= max_len:
        return [text]
    if max_len < 1:
        raise ValueError("max_len must be a positive integer")

    total = len(text)
    pieces = []
    start = 0
    while start < total:
        end = min(start + max_len, total)
        if end < total:
            for sep in (". ", "! ", "? ", "\n"):
                cut = text.rfind(sep, start, end)
                if cut > start + max_len // 2:
                    end = cut + len(sep)
                    break
        pieces.append(text[start:end].strip())
        start = end
    return [piece for piece in pieces if len(piece) > 80]
|
||||
|
||||
|
||||
def upsert(conn, facts):
    """Insert *facts* into ``dabi.knowledge`` and return the number of new rows.

    Each fact dict must carry ``"fact"``; ``"confidence"`` (default 0.84),
    ``"page"`` and ``"lang"`` (default "hr") are optional. Source and
    category are fixed to "wikipedia_pgz_sport" / "pgz_sport_wiki". Rows are
    keyed by the MD5 of the fact text (``data_hash``); conflicting rows are
    skipped.

    The count comes from ``RETURNING`` rows rather than ``cursor.rowcount``:
    after a batched execute, rowcount does not cover the whole batch.
    Any database error is logged and reported as 0 inserted rows.
    """
    if not facts:
        return 0
    # Local import: the module header only imports execute_batch.
    from psycopg2.extras import execute_values

    rows = [
        (
            f["fact"],
            "wikipedia_pgz_sport",
            "pgz_sport_wiki",
            f.get("confidence", 0.84),
            hashlib.md5(f["fact"].encode()).hexdigest(),
            json.dumps({"page": f.get("page", ""), "lang": f.get("lang", "hr")}),
        )
        for f in facts
    ]
    sql = """INSERT INTO dabi.knowledge (fact, source, category, confidence, data_hash, source_refs)
             VALUES %s ON CONFLICT (data_hash) DO NOTHING RETURNING 1"""
    try:
        # The context manager closes the cursor on the error path as well.
        with conn.cursor() as cur:
            inserted = execute_values(
                cur, sql, rows,
                template="(%s, %s, %s, %s, %s, %s::jsonb)",
                page_size=50, fetch=True,
            )
        return len(inserted)
    except Exception as e:
        log.error(f"upsert: {e}")
        return 0
|
||||
|
||||
|
||||
def main():
    """Ingest Croatian Wikipedia extracts for the first 200 active PGŽ clubs.

    For each club the page named exactly like the club is tried first; if
    that yields nothing, the top three search hits are tried when their title
    contains the last word of the club name or one of a few town names.
    Extracts longer than 200 characters are chunked and stored via ``upsert``.
    A JSON summary is printed to stdout.
    """
    conn = psycopg2.connect(DSN)
    conn.autocommit = True
    try:
        clubs = get_pgz_clubs(conn)
        log.info(f"PGŽ active clubs: {len(clubs)}")

        total_facts = 0
        found_pages = 0

        for naziv, _kraci in clubs[:200]:  # limit first run
            if not naziv or not naziv.strip():
                continue  # nothing to look up; also avoids split()[-1] on ""

            # Try direct page first
            text = wiki_extract(API_HR, naziv)
            if not text:
                # Fall back to search, keeping only plausible-looking titles.
                hints = [h.lower() for h in (naziv.split()[-1], "Rijeka", "Opatija", "Krk")]
                for candidate in wiki_search(API_HR, naziv, limit=3):
                    title_lc = candidate.lower()
                    if any(h in title_lc for h in hints):
                        text = wiki_extract(API_HR, candidate)
                        if text:
                            break

            if text and len(text) > 200:
                found_pages += 1
                # NOTE(review): "page" records the club name even when the
                # text came from a differently titled search hit.
                facts = [{"fact": c, "page": naziv, "lang": "hr", "confidence": 0.85}
                         for c in chunk(text, 700)]
                total_facts += upsert(conn, facts)
                # Log once per 20 pages found, not on every later iteration
                # while the counter sits on a multiple of 20.
                if found_pages % 20 == 0:
                    log.info(f" progress: pages {found_pages}, facts {total_facts}")

            time.sleep(0.5)

        log.info(f"=== DONE: pages={found_pages} facts={total_facts} ===")
        print(json.dumps({"pages": found_pages, "facts": total_facts}))
    finally:
        conn.close()


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user