#!/usr/bin/env python3
# ═══════════════════════════════════════════════════════════════════
# File: sport_pgz_full.py | v1.0.0 | 05.05.2026
# Location: /opt/pgz-sport/scrapers/sport_pgz_full.py
# Purpose: Sitemap-driven full crawl of sport-pgz.hr
#   - All 4 sitemaps: objave (posts, 1+2), natječaji (tenders), stranice (pages)
#   - PDF download + OCR ingest (this script only downloads; no OCR step is present here)
#   - Article parsing → dabi.knowledge ingest
# ═══════════════════════════════════════════════════════════════════
"""sport-pgz.hr complete corpus via sitemap."""

from dotenv import load_dotenv

# Must run before DSN is built below: DB_PASSWORD is read from the environment.
load_dotenv('/opt/rinet-gpu/.env.master')  # auto-added by patch_scrapers_with_dotenv.sh

import hashlib
import json
import logging
import os
import re
import sys
import time
import urllib.error
import urllib.request
from html import unescape
from urllib.parse import urljoin, urlparse

import psycopg2
from psycopg2.extras import execute_batch, execute_values

logging.basicConfig(level=logging.INFO, format="%(asctime)s [sport_pgz] %(message)s")
log = logging.getLogger("sport_pgz")

DSN = f"host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password={os.environ['DB_PASSWORD']}"
UA = "Mozilla/5.0 (Ri.NET Civic Bot 1.0; contact: dradulic@outlook.com)"
SITEMAP_INDEX = "https://sport-pgz.hr/sitemap.xml"
PDF_DIR = "/opt/pgz-sport/data/sport_pgz_pdf"
HTML_DIR = "/opt/pgz-sport/data/sport_pgz_html"
os.makedirs(PDF_DIR, exist_ok=True)
os.makedirs(HTML_DIR, exist_ok=True)

# <sitemap> entries of a sitemap index (the \b keeps <sitemapindex> from matching).
_SITEMAP_LOC_RE = re.compile(
    r"<sitemap\b[^>]*>.*?<loc>\s*(https?://[^<\s]+)\s*</loc>", re.S | re.I
)
# <url> entries of a urlset (the \b keeps <urlset> from matching).
_URL_ENTRY_RE = re.compile(r"<url\b[^>]*>(.*?)</url>", re.S | re.I)
_LOC_RE = re.compile(r"<loc>\s*([^<]+?)\s*</loc>", re.I)
_LASTMOD_RE = re.compile(r"<lastmod>\s*([^<]*?)\s*</lastmod>", re.I)
# NOTE(review): the element list (script/style/nav/footer) is a reconstruction —
# the previous four patterns had lost their tag names. Confirm against the
# deployed version, because changing it changes chunk text and therefore the
# data_hash values used for de-duplication.
_NON_CONTENT_RE = re.compile(
    r"<(script|style|nav|footer)\b[^>]*>.*?</\1\s*>", re.S | re.I
)
_TAG_RE = re.compile(r"<[^>]+>")
_WS_RE = re.compile(r"\s+")
_TITLE_RE = re.compile(r"<title\b[^>]*>([^<]+)</title>", re.I)
_PDF_HREF_RE = re.compile(r'href=["\']([^"\']+\.pdf(?:\?[^"\']*)?)["\']', re.I)


def fetch(url, timeout=20, retries=3, binary=False):
    """Download *url* with retries and linear back-off.

    Args:
        url: Absolute URL to request.
        timeout: Per-attempt socket timeout in seconds.
        retries: Maximum number of attempts.
        binary: When true return raw ``bytes``; otherwise return decoded text.

    Returns:
        ``(body, status)``. On failure ``body`` is ``None`` and ``status`` is
        the HTTP error code for a permanent 4xx response, else ``0``.
        This function never raises.
    """
    for attempt in range(1, retries + 1):
        try:
            req = urllib.request.Request(url, headers={"User-Agent": UA})
            with urllib.request.urlopen(req, timeout=timeout) as resp:
                payload = resp.read()
                status = resp.status
                charset = resp.headers.get_content_charset() or "utf-8"
        except urllib.error.HTTPError as e:
            log.warning(f"fetch fail attempt {attempt} {url}: {e}")
            # A 4xx answer will not change on retry (408/429 are transient).
            if 400 <= e.code < 500 and e.code not in (408, 429):
                return None, e.code
        except ValueError as e:
            # Malformed URL / unknown scheme: retrying cannot help.
            log.warning(f"fetch fail attempt {attempt} {url}: {e}")
            return None, 0
        except Exception as e:
            # Boundary catch: callers rely on fetch() never raising.
            log.warning(f"fetch fail attempt {attempt} {url}: {e}")
        else:
            if binary:
                return payload, status
            try:
                return payload.decode(charset, errors="replace"), status
            except LookupError:
                # Server declared a charset Python does not know.
                return payload.decode("utf-8", errors="replace"), status
        if attempt < retries:
            # Back off only when another attempt follows.
            time.sleep(3 * attempt)
    return None, 0


def parse_sitemap_index(xml):
    """Return list of sub-sitemap URLs."""
    return [unescape(loc) for loc in _SITEMAP_LOC_RE.findall(xml or "")]


def parse_sitemap_urls(xml):
    """Return list of (url, lastmod) pairs.

    ``lastmod`` is ``""`` when the entry has none. ``<loc>`` and ``<lastmod>``
    may appear in any order inside a ``<url>`` element.
    """
    out = []
    for entry in _URL_ENTRY_RE.finditer(xml or ""):
        body = entry.group(1)
        loc = _LOC_RE.search(body)
        if not loc:
            continue
        lastmod = _LASTMOD_RE.search(body)
        # Sitemap values are XML-escaped (e.g. "&amp;" inside query strings).
        out.append((unescape(loc.group(1)), lastmod.group(1) if lastmod else ""))
    return out


def extract_main_text(html):
    """Return the visible text of *html* as one whitespace-normalised string."""
    if not html:
        return ""
    stripped = _NON_CONTENT_RE.sub("", html)
    text = unescape(_TAG_RE.sub(" ", stripped))
    return _WS_RE.sub(" ", text).strip()


def extract_title(html):
    """Return the whitespace-normalised ``<title>`` text, or ``""`` if absent."""
    m = _TITLE_RE.search(html or "")
    if not m:
        return ""
    return _WS_RE.sub(" ", unescape(m.group(1))).strip()


def find_pdf_links(html, base):
    """Return absolute PDF URLs linked from *html*, de-duplicated, in page order.

    Args:
        html: Page markup (may be empty or ``None``).
        base: URL of the page, used to resolve relative links.
    """
    if not html:
        return []
    links = {}
    for m in _PDF_HREF_RE.finditer(html):
        # href values are HTML-escaped; a dict keeps first-seen order.
        links[urljoin(base, unescape(m.group(1)))] = None
    return list(links)


def chunk_text(text, max_len=800):
    """Split into ~800 char chunks, prefer sentence boundaries.

    The splitting rules are kept exactly as they were: chunk text is hashed
    into ``data_hash`` by ``upsert_facts``, so any change in boundaries would
    re-insert already stored pages under new hashes.

    Raises:
        ValueError: if ``max_len`` is smaller than 1 (the loop could not advance).
    """
    if max_len < 1:
        raise ValueError("max_len must be >= 1")
    if len(text) <= max_len:
        return [text] if text else []
    chunks = []
    start = 0
    while start < len(text):
        end = min(start + max_len, len(text))
        if end < len(text):
            # First separator type with a hit in the second half of the window wins.
            for sep in [". ", "! ", "? ", "\n"]:
                p = text.rfind(sep, start, end)
                if p > start + max_len // 2:
                    end = p + len(sep)
                    break
        chunks.append(text[start:end].strip())
        start = end
    return [c for c in chunks if len(c) > 50]


def upsert_facts(conn, facts):
    """Bulk insert into dabi.knowledge.

    Args:
        conn: Open psycopg2 connection.
        facts: Iterable of dicts with ``fact`` and ``source`` keys and optional
            ``category``, ``confidence`` and ``url`` keys.

    Returns:
        Number of rows actually inserted (rows whose ``data_hash`` already
        exists are skipped); ``0`` on database error.
    """
    if not facts:
        return 0
    rows = {}
    for f in facts:
        digest = hashlib.md5(f["fact"].encode()).hexdigest()
        # Identical facts in one call collapse to a single row.
        rows.setdefault(digest, (
            f["fact"],
            f["source"],
            f.get("category", "sport_pgz"),
            f.get("confidence", 0.85),
            digest,
            json.dumps({"url": f.get("url", "")}),
        ))
    sql = """
        INSERT INTO dabi.knowledge
            (fact, source, category, confidence, data_hash, source_refs)
        VALUES %s
        ON CONFLICT (data_hash) DO NOTHING
        RETURNING 1
    """
    try:
        with conn.cursor() as cur:
            # cursor.rowcount after execute_batch() only covers the last batch,
            # so count RETURNING rows instead to get the real insert total.
            inserted = execute_values(
                cur,
                sql,
                list(rows.values()),
                template="(%s, %s, %s, %s, %s, %s::jsonb)",
                page_size=50,
                fetch=True,
            )
        return len(inserted)
    except psycopg2.Error as e:
        log.error(f"upsert err: {e}")
        return 0


def _collect_page_urls():
    """Return de-duplicated (url, lastmod) pairs from all sitemaps, or None if the index is unreachable."""
    xml, _ = fetch(SITEMAP_INDEX)
    if not xml:
        return None
    sub_sitemaps = parse_sitemap_index(xml)
    log.info(f"sub-sitemaps: {len(sub_sitemaps)}")
    pages = {}
    if not sub_sitemaps:
        # The root document may itself be a plain <urlset> rather than an index.
        for url, lastmod in parse_sitemap_urls(xml):
            pages.setdefault(url, lastmod)
    for sm in sub_sitemaps:
        sub_xml, _ = fetch(sm)
        if not sub_xml:
            continue
        urls = parse_sitemap_urls(sub_xml)
        for url, lastmod in urls:
            pages.setdefault(url, lastmod)
        log.info(f"  {sm}: {len(urls)} urls")
    return list(pages.items())


def _save_html(url, html):
    """Best-effort save of the raw page under HTML_DIR for later replay."""
    url_hash = hashlib.md5(url.encode()).hexdigest()[:16]
    html_path = f"{HTML_DIR}/{url_hash}.html"
    try:
        with open(html_path, "w", encoding="utf-8") as fh:
            fh.write(html)
    except OSError as e:
        # The snapshot is optional; a full disk must not stop the crawl.
        log.warning(f"html save failed {html_path}: {e}")


def _build_facts(url, title, text):
    """Turn one page's title and text into fact dicts for upsert_facts()."""
    facts = []
    if title and len(title) > 10:
        facts.append({
            "fact": f"sport-pgz.hr — {title}",
            "source": "sport-pgz.hr",
            "category": "sport_pgz_official",
            "confidence": 0.92,
            "url": url,
        })
    for chunk in chunk_text(text, max_len=800):
        if len(chunk) < 80:
            continue
        facts.append({
            "fact": chunk,
            "source": "sport-pgz.hr",
            "category": "sport_pgz_official",
            "confidence": 0.88,
            "url": url,
        })
    return facts


def _download_pdfs(pdf_urls, limit):
    """Download up to *limit* PDFs not yet present in PDF_DIR; return how many were saved."""
    downloaded = 0
    for pdf_url in pdf_urls[:limit]:
        try:
            url_hash = hashlib.md5(pdf_url.encode()).hexdigest()[:16]
            pdf_path = f"{PDF_DIR}/{url_hash}.pdf"
            if os.path.exists(pdf_path):
                continue
            data, status = fetch(pdf_url, timeout=30, binary=True)
            if data and status == 200:
                with open(pdf_path, "wb") as fh:
                    fh.write(data)
                downloaded += 1
            time.sleep(1)  # rate limit
        except Exception as e:
            # Per-file boundary: one bad PDF must not abort the rest.
            log.warning(f"pdf err {pdf_url}: {e}")
    return downloaded


def crawl(max_pdfs=200):
    """Main crawl entry.

    Fetches every page listed in the sitemaps, stores its title and text
    chunks in ``dabi.knowledge`` and downloads linked PDFs.

    Args:
        max_pdfs: Upper bound on PDF links processed in one run.

    Returns:
        ``{"crawled": int, "facts": int, "pdfs": int}``, or ``None`` when the
        sitemap index cannot be fetched.
    """
    log.info("=== sport-pgz.hr full crawl ===")
    conn = psycopg2.connect(DSN)
    try:
        conn.autocommit = True

        # 1. Get sitemap index and expand it into page URLs
        all_urls = _collect_page_urls()
        if all_urls is None:
            log.error("sitemap fetch failed")
            return None
        log.info(f"TOTAL URLs to crawl: {len(all_urls)}")

        # 2. Crawl each URL → text → facts
        total_facts = 0
        crawled = 0
        pdfs_found = {}  # dict used as an ordered set: stable choice under max_pdfs
        for idx, (url, _lastmod) in enumerate(all_urls, 1):
            if idx % 20 == 0:
                log.info(f"  progress: {idx}/{len(all_urls)} crawled, {total_facts} facts")
            try:
                html, _ = fetch(url, timeout=15)
                if not html:
                    continue
                _save_html(url, html)
                for pdf_url in find_pdf_links(html, url):
                    pdfs_found[pdf_url] = None
                facts = _build_facts(url, extract_title(html), extract_main_text(html))
                total_facts += upsert_facts(conn, facts)
                crawled += 1
                time.sleep(0.5)  # rate limit
            except Exception as e:
                # Per-page boundary: one bad page must not abort the crawl.
                log.warning(f"err {url}: {e}")

        # 3. Download PDFs
        pdf_urls = list(pdfs_found)
        log.info(f"PDF links found: {len(pdf_urls)}")
        pdf_downloaded = _download_pdfs(pdf_urls, max_pdfs)

        log.info(
            f"=== DONE: {crawled} pages crawled, {total_facts} facts inserted, "
            f"{pdf_downloaded} PDFs downloaded ==="
        )
        return {"crawled": crawled, "facts": total_facts, "pdfs": pdf_downloaded}
    finally:
        # Also covers the early return above, which used to leak the connection.
        conn.close()


if __name__ == "__main__":
    r = crawl()
    print(json.dumps(r))