Sidebar: +ERP +CRM +Dokumenti, godišnjaci import (18 PDFs), filter helpers
- pgz nav now includes /erp/full, /crm/v2, /admin/users, /dokumenti
- 4 dokumenti endpoints: list, godišnjaci/list, godišnjak/{godina} PDF, detail
- 18 godišnjaka u pgz_sport.dokumenti (2006-2024) with savez_id=333
- PGŽ filter helpers (window._pgz_filter_priority, togglePGZFilter)
- navItemClick handler for nav items with href
This commit is contained in:
@@ -0,0 +1,244 @@
|
||||
#!/usr/bin/env python3
|
||||
# ═══════════════════════════════════════════════════════════════════
|
||||
# Fajl: sport_pgz_full.py | v1.0.0 | 05.05.2026
|
||||
# Lokacija: /opt/pgz-sport/scrapers/sport_pgz_full.py
|
||||
# Svrha: Sitemap-driven full crawl of sport-pgz.hr
|
||||
# - All 4 sitemaps: objave (1+2), natječaji, stranice
|
||||
# - PDF download + OCR ingest
|
||||
# - Article parsing → dabi.knowledge ingest
|
||||
# ═══════════════════════════════════════════════════════════════════
|
||||
"""sport-pgz.hr complete corpus via sitemap."""
|
||||
import os, sys, re, time, hashlib, logging, json
|
||||
from urllib.parse import urljoin, urlparse
|
||||
import urllib.request
|
||||
from html import unescape
|
||||
import psycopg2
|
||||
from psycopg2.extras import execute_batch
|
||||
|
||||
# Process-wide logging: timestamped lines tagged with the scraper name.
logging.basicConfig(level=logging.INFO, format="%(asctime)s [sport_pgz] %(message)s")
log = logging.getLogger("sport_pgz")

# PostgreSQL connection string in libpq keyword/value form.
# NOTE(security): a credential is committed here in plain text. Set the
# SPORT_PGZ_DSN environment variable to override it; once every deployment
# does so, rotate the password and delete the literal fallback below.
DSN = os.environ.get(
    "SPORT_PGZ_DSN",
    "host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password=R1net2026!SecureDB#v7",
)

# User-Agent header sent with every HTTP request.
UA = "Mozilla/5.0 (Ri.NET Civic Bot 1.0; contact: dradulic@outlook.com)"

# Entry point of the crawl: the site's top-level sitemap.
SITEMAP_INDEX = "https://sport-pgz.hr/sitemap.xml"

# Local directories for downloaded PDFs and raw HTML snapshots.
PDF_DIR = "/opt/pgz-sport/data/sport_pgz_pdf"
HTML_DIR = "/opt/pgz-sport/data/sport_pgz_html"

# Create both output directories at import time so later writes cannot fail
# on a missing parent directory.
os.makedirs(PDF_DIR, exist_ok=True)
os.makedirs(HTML_DIR, exist_ok=True)
|
||||
|
||||
|
||||
def fetch(url, timeout=20, retries=3, binary=False):
    """Download *url*, retrying failed attempts with a linear back-off.

    Args:
        url: Absolute URL to request.
        timeout: Socket timeout in seconds applied to each attempt.
        retries: Maximum number of attempts; ``0`` performs no request.
        binary: When true return the raw ``bytes`` body, otherwise return
            the body decoded as UTF-8 with undecodable bytes replaced.

    Returns:
        ``(body, status)`` on success, ``(None, 0)`` when every attempt
        failed. Failures are logged and never raised.
    """
    # Local import keeps this function self-contained.
    from urllib.error import HTTPError

    for attempt in range(1, retries + 1):
        try:
            req = urllib.request.Request(url, headers={"User-Agent": UA})
            with urllib.request.urlopen(req, timeout=timeout) as resp:
                body = resp.read()
                status = resp.status
        except HTTPError as exc:
            log.warning("fetch fail attempt %s %s: %s", attempt, url, exc)
            # A client error (404, 403, ...) will not change on retry; only
            # 408 (request timeout) and 429 (rate limited) are transient.
            if 400 <= exc.code < 500 and exc.code not in (408, 429):
                break
        except Exception as exc:  # best-effort crawler: log and retry
            log.warning("fetch fail attempt %s %s: %s", attempt, url, exc)
        else:
            if binary:
                return body, status
            return body.decode("utf-8", errors="replace"), status

        # Back off before the next attempt, but not after the last one:
        # there is nothing left to wait for.
        if attempt < retries:
            time.sleep(3 * attempt)
    return None, 0
|
||||
|
||||
|
||||
def parse_sitemap_index(xml):
    """Return the http(s) URLs found in ``<loc>`` elements of *xml*.

    Whitespace around the URL is ignored and XML entity escapes such as
    ``&amp;`` are decoded, so the returned strings can be requested as-is.

    Args:
        xml: Sitemap document text; ``None`` or ``""`` yields ``[]``.

    Returns:
        List of URL strings in document order.
    """
    # XML-only decoding: html.unescape would also rewrite legacy HTML
    # entities such as "&copy" that can legitimately occur in query strings.
    from xml.sax.saxutils import unescape as xml_unescape

    extra_entities = {"&quot;": '"', "&apos;": "'"}
    return [
        xml_unescape(loc, extra_entities)
        for loc in re.findall(r"<loc>\s*(https?://[^<]+?)\s*</loc>", xml or "")
    ]
|
||||
|
||||
|
||||
def parse_sitemap_urls(xml):
    """Return ``(url, lastmod)`` pairs for each ``<url>`` entry in *xml*.

    The ``<loc>`` element must be the first child of ``<url>``; an optional
    ``<lastmod>`` directly after it is captured. URLs are stripped of
    surrounding whitespace and XML entity escapes (``&amp;`` etc.) are
    decoded. Entries whose ``<loc>`` is blank are skipped.

    Args:
        xml: Sitemap document text; ``None`` or ``""`` yields ``[]``.

    Returns:
        List of ``(url, lastmod)`` tuples in document order; ``lastmod`` is
        ``""`` when the entry has none.
    """
    # XML-only decoding: html.unescape would also rewrite legacy HTML
    # entities such as "&copy" that can legitimately occur in query strings.
    from xml.sax.saxutils import unescape as xml_unescape

    extra_entities = {"&quot;": '"', "&apos;": "'"}
    pattern = r"<url>\s*<loc>([^<]+)</loc>(?:\s*<lastmod>([^<]*)</lastmod>)?"
    pairs = []
    for m in re.finditer(pattern, xml or ""):
        url = xml_unescape(m.group(1), extra_entities).strip()
        if url:
            pairs.append((url, (m.group(2) or "").strip()))
    return pairs
|
||||
|
||||
|
||||
def extract_main_text(html):
    """Reduce an HTML document to its whitespace-normalised visible text.

    ``<script>``, ``<style>``, ``<nav>`` and ``<footer>`` elements are
    removed together with their content, HTML comments are dropped, every
    remaining tag becomes a space, character references are decoded and
    runs of whitespace collapse to a single space.

    Args:
        html: Document markup; ``None`` or ``""`` yields ``""``.

    Returns:
        The extracted text with no leading or trailing whitespace.
    """
    if not html:
        return ""

    flags = re.S | re.I
    stripped = html
    # Scripts and styles go first so that a "<!--" inside JavaScript or CSS
    # cannot start a bogus comment match.
    for tag in ("script", "style"):
        stripped = re.sub(rf"<{tag}[^>]*>.*?</{tag}>", "", stripped, flags=flags)
    # A comment containing ">" would survive the generic tag regex below and
    # leak its tail into the text, so remove comments explicitly.
    stripped = re.sub(r"<!--.*?-->", "", stripped, flags=re.S)
    for tag in ("nav", "footer"):
        stripped = re.sub(rf"<{tag}[^>]*>.*?</{tag}>", "", stripped, flags=flags)

    text = unescape(re.sub(r"<[^>]+>", " ", stripped))
    return re.sub(r"\s+", " ", text).strip()
|
||||
|
||||
|
||||
def extract_title(html):
    """Return the text of the first attribute-less ``<title>`` element.

    Character references are decoded and whitespace runs are collapsed to a
    single space. Returns ``""`` when *html* is empty/``None`` or no
    non-empty title is present.
    """
    found = re.search(r"<title>([^<]+)</title>", html or "", re.I)
    if found is None:
        return ""
    decoded = unescape(found.group(1))
    return re.sub(r"\s+", " ", decoded).strip()
|
||||
|
||||
|
||||
def find_pdf_links(html, base):
    """Return absolute URLs of the PDF links referenced by *html*.

    An ``href`` counts as a PDF link when its value ends in ``.pdf``
    (case-insensitive), optionally followed by a query string. ``&amp;`` in
    the attribute value is decoded and relative links are resolved against
    *base*.

    Args:
        html: Document markup; ``None`` or ``""`` yields ``[]``.
        base: URL of the page, used to resolve relative links.

    Returns:
        De-duplicated list of URLs in first-seen order.
    """
    if not html:
        return []

    pattern = r'href=["\']([^"\']+\.pdf(?:\?[^"\']*)?)["\']'
    # A dict is used as an insertion-ordered set so the result is
    # deterministic from run to run.
    links = {}
    for m in re.finditer(pattern, html, re.I):
        # Only "&amp;" is decoded: a full HTML unescape would also rewrite
        # raw query fragments such as "&copy=1".
        href = m.group(1).strip().replace("&amp;", "&")
        links.setdefault(urljoin(base, href), None)
    return list(links)
|
||||
|
||||
|
||||
def chunk_text(text, max_len=800):
    """Split *text* into pieces of at most *max_len* characters.

    A text that already fits in *max_len* is returned whole, whatever its
    length. Longer text is cut greedily: each cut is moved back to just
    after the last ``". "``, ``"! "``, ``"? "`` or newline (tried in that
    order) found in the second half of the window, or made at exactly
    *max_len* when none is found. Pieces are stripped, and pieces of 50
    characters or fewer are discarded.

    Args:
        text: Text to split; ``None`` or ``""`` yields ``[]``.
        max_len: Maximum piece length in characters.

    Returns:
        List of chunk strings in document order.

    Raises:
        ValueError: If *text* is longer than *max_len* and *max_len* is
            less than 1 (the cut position could never advance).
    """
    if not text:
        return []
    if len(text) <= max_len:
        return [text]
    if max_len < 1:
        # Without this guard the window never advances and the loop below
        # would spin forever.
        raise ValueError("max_len must be a positive integer")

    separators = (". ", "! ", "? ", "\n")
    total = len(text)
    pieces = []
    start = 0
    while start < total:
        end = min(start + max_len, total)
        if end < total:
            # Only accept a boundary in the second half of the window so
            # chunks do not become needlessly short.
            earliest_cut = start + max_len // 2
            for sep in separators:
                pos = text.rfind(sep, start, end)
                if pos > earliest_cut:
                    end = pos + len(sep)
                    break
        pieces.append(text[start:end].strip())
        start = end
    return [piece for piece in pieces if len(piece) > 50]
|
||||
|
||||
|
||||
def upsert_facts(conn, facts):
    """Insert *facts* into ``dabi.knowledge``, skipping known ones.

    Each fact is keyed by the MD5 hex digest of its text (``data_hash``);
    rows whose hash already exists are left untouched.

    Args:
        conn: Open psycopg2 connection. This function never commits, so the
            caller is responsible for autocommit or an explicit commit.
        facts: Iterable of dicts with required keys ``fact`` and ``source``
            and optional ``category`` (default ``"sport_pgz"``),
            ``confidence`` (default ``0.85``) and ``url`` (default ``""``).

    Returns:
        Number of rows actually inserted; ``0`` when *facts* is empty or the
        statement failed (the error is logged, not raised).
    """
    if not facts:
        return 0

    # Imported here so the empty-input fast path needs no database driver.
    from psycopg2.extras import execute_values

    rows = [
        (
            f["fact"],
            f["source"],
            f.get("category", "sport_pgz"),
            f.get("confidence", 0.85),
            hashlib.md5(f["fact"].encode()).hexdigest(),
            json.dumps({"url": f.get("url", "")}),
        )
        for f in facts
    ]
    # RETURNING yields one row per inserted record. cursor.rowcount cannot be
    # used for this: after a batched execution it only reflects the last
    # statement that ran, not the whole batch.
    sql = """
        INSERT INTO dabi.knowledge (fact, source, category, confidence, data_hash, source_refs)
        VALUES %s
        ON CONFLICT (data_hash) DO NOTHING
        RETURNING 1
    """
    try:
        # The context manager closes the cursor on the error path as well.
        with conn.cursor() as cur:
            inserted = execute_values(
                cur,
                sql,
                rows,
                template="(%s, %s, %s, %s, %s, %s::jsonb)",
                page_size=50,
                fetch=True,
            )
        return len(inserted)
    except Exception as e:
        log.error("upsert err: %s", e)
        return 0
|
||||
|
||||
|
||||
def _collect_sitemap_urls(index_xml):
    """Fetch every sub-sitemap listed in *index_xml*; return all (url, lastmod) pairs."""
    sub_sitemaps = parse_sitemap_index(index_xml)
    log.info("sub-sitemaps: %s", len(sub_sitemaps))

    all_urls = []
    for sm in sub_sitemaps:
        sub_xml, _ = fetch(sm)
        if sub_xml:
            urls = parse_sitemap_urls(sub_xml)
            all_urls.extend(urls)
            log.info(" %s: %s urls", sm, len(urls))
    return all_urls


def _save_html_snapshot(url, html):
    """Best-effort write of the raw page to HTML_DIR, named by a hash of *url*."""
    url_hash = hashlib.md5(url.encode()).hexdigest()[:16]
    html_path = f"{HTML_DIR}/{url_hash}.html"
    try:
        with open(html_path, "w", encoding="utf-8") as fh:
            fh.write(html)
    except (OSError, UnicodeError) as e:
        # The snapshot is only a replay aid; losing it must not skip the page.
        log.warning("html snapshot failed %s: %s", html_path, e)


def _build_facts(url, title, text):
    """Turn one page's title and body text into fact dicts for upsert_facts."""
    common = {"source": "sport-pgz.hr", "category": "sport_pgz_official", "url": url}
    facts = []
    if title and len(title) > 10:
        facts.append({**common, "fact": f"sport-pgz.hr — {title}", "confidence": 0.92})
    facts.extend(
        {**common, "fact": chunk, "confidence": 0.88}
        for chunk in chunk_text(text, max_len=800)
        if len(chunk) >= 80
    )
    return facts


def _download_pdfs(pdf_urls, limit=200):
    """Download up to *limit* of *pdf_urls* into PDF_DIR; return how many were saved."""
    downloaded = 0
    for pdf_url in pdf_urls[:limit]:
        try:
            url_hash = hashlib.md5(pdf_url.encode()).hexdigest()[:16]
            pdf_path = f"{PDF_DIR}/{url_hash}.pdf"
            if os.path.exists(pdf_path):
                continue
            data, status = fetch(pdf_url, timeout=30, binary=True)
            if data and status == 200:
                # Write to a temporary name and rename afterwards, so an
                # interrupted write never leaves a truncated file that the
                # exists() check above would treat as done on later runs.
                tmp_path = pdf_path + ".part"
                with open(tmp_path, "wb") as fh:
                    fh.write(data)
                os.replace(tmp_path, pdf_path)
                downloaded += 1
            time.sleep(1)  # rate limit
        except Exception as e:
            log.warning("pdf err %s: %s", pdf_url, e)
    return downloaded


def crawl():
    """Run the full sitemap-driven crawl of sport-pgz.hr.

    Steps: fetch the sitemap index and its sub-sitemaps, fetch every listed
    page, store a raw HTML snapshot, convert title and text chunks into
    facts and upsert them, then download the PDFs linked from those pages
    (at most 200 per run).

    Returns:
        ``{"crawled": int, "facts": int, "pdfs": int}`` on completion, or
        ``None`` when the sitemap index could not be fetched.
    """
    log.info("=== sport-pgz.hr full crawl ===")
    conn = psycopg2.connect(DSN)
    conn.autocommit = True
    try:
        # 1. Get sitemap index
        xml, _ = fetch(SITEMAP_INDEX)
        if not xml:
            log.error("sitemap fetch failed")
            return None

        all_urls = _collect_sitemap_urls(xml)
        log.info("TOTAL URLs to crawl: %s", len(all_urls))

        # 2. Crawl each URL -> text -> facts
        total_facts = 0
        crawled = 0
        pdfs_found = set()

        for idx, (url, _lastmod) in enumerate(all_urls, 1):
            if idx % 20 == 0:
                log.info(" progress: %s/%s crawled, %s facts", idx, len(all_urls), total_facts)

            try:
                html, _status = fetch(url, timeout=15)
                if not html:
                    continue

                _save_html_snapshot(url, html)
                pdfs_found.update(find_pdf_links(html, url))

                facts = _build_facts(url, extract_title(html), extract_main_text(html))
                total_facts += upsert_facts(conn, facts)
                crawled += 1

                time.sleep(0.5)  # rate limit
            except Exception as e:
                # One bad page must not abort the whole crawl.
                log.warning("err %s: %s", url, e)

        # 3. Download PDFs. Sorting makes the subset selected by the cap
        # reproducible instead of depending on set iteration order.
        pdf_urls = sorted(pdfs_found)
        log.info("PDF links found: %s", len(pdf_urls))
        pdf_downloaded = _download_pdfs(pdf_urls, limit=200)  # limit for first run

        log.info(
            "=== DONE: %s pages crawled, %s facts inserted, %s PDFs downloaded ===",
            crawled, total_facts, pdf_downloaded,
        )
        return {"crawled": crawled, "facts": total_facts, "pdfs": pdf_downloaded}
    finally:
        # Release the connection on every exit path, including the early
        # return above and unexpected exceptions.
        conn.close()
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: run the crawl once and print its summary
    # (whatever crawl() returned) as a single JSON line on stdout.
    summary = crawl()
    print(json.dumps(summary))
|
||||
Reference in New Issue
Block a user