([^<]+)

#!/usr/bin/env python3 from dotenv import load_dotenv load_dotenv("/opt/rinet-gpu/.env.master") # auto-added # ═══════════════════════════════════════════════════════════════════ # Fajl: rss_hr_full.py | v1.0.0 | 05.05.2026 # Lokacija: /opt/pgz-sport/scrapers/rss_hr_full.py # Svrha: rss.hr (Riječki sport savez) full crawl # ═══════════════════════════════════════════════════════════════════ """rss.hr complete corpus.""" import os, sys, re, time, hashlib, logging, json from urllib.parse import urljoin, urlparse import urllib.request from html import unescape import psycopg2 from psycopg2.extras import execute_batch logging.basicConfig(level=logging.INFO, format="%(asctime)s [rss_hr] %(message)s") log = logging.getLogger("rss_hr") DSN = f"host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password={os.environ['DB_PASSWORD']}" UA = "Mozilla/5.0 (Ri.NET Civic Bot 1.0; contact: dradulic@outlook.com)" ROOT = "https://rss.hr" HTML_DIR = "/opt/pgz-sport/data/rss_hr_html" PDF_DIR = "/opt/pgz-sport/data/rss_hr_pdf" os.makedirs(HTML_DIR, exist_ok=True) os.makedirs(PDF_DIR, exist_ok=True) def fetch(url, timeout=20, retries=3, binary=False): for i in range(retries): try: req = urllib.request.Request(url, headers={"User-Agent": UA}) with urllib.request.urlopen(req, timeout=timeout) as r: d = r.read() return d if binary else d.decode("utf-8", errors="replace"), r.status except Exception: time.sleep(3*(i+1)) return None, 0 def extract_title(html): m = re.search(r"([^<]+)", html or "", re.I) return re.sub(r"\s+", " ", unescape(m.group(1))).strip() if m else "" def extract_text(html): h = re.sub(r"]*>.*?", "", html or "", flags=re.S|re.I) h = re.sub(r"]*>.*?", "", h, flags=re.S|re.I) h = re.sub(r"]*>.*?", "", h, flags=re.S|re.I) h = re.sub(r"]*>.*?", "", h, flags=re.S|re.I) t = re.sub(r"<[^>]+>", " ", h) return re.sub(r"\s+", " ", unescape(t)).strip() def find_internal_links(html, base): if not html: return [] out = set() for m in re.finditer(r'href=["\']([^"\']+)["\']', html): u = urljoin(base, m.group(1)) host = urlparse(u).hostname or "" if "rss.hr" in host: # Strip query/fragment u = u.split("#")[0] out.add(u) return list(out) def find_pdfs(html, base): out = set() for m in re.finditer(r'href=["\']([^"\']+\.pdf)["\']', html or "", re.I): out.add(urljoin(base, m.group(1))) return list(out) def chunk(text, max_len=800): if len(text) <= max_len: return [text] if text else [] out = [] start = 0 while start < len(text): end = min(start + max_len, len(text)) if end < len(text): for sep in [". ", "! ", "? ", "\n"]: p = text.rfind(sep, start, end) if p > start + max_len // 2: end = p + len(sep); break out.append(text[start:end].strip()) start = end return [c for c in out if len(c) > 50] def upsert(conn, facts): if not facts: return 0 cur = conn.cursor() rows = [] for f in facts: f["fact"] = f["fact"].replace("\x00", "") h = hashlib.md5(f["fact"].encode()).hexdigest() rows.append((f["fact"], f["source"], f.get("category", "rss_hr"), f.get("confidence", 0.85), h, json.dumps({"url": f.get("url", "")}))) sql = """INSERT INTO dabi.knowledge (fact, source, category, confidence, data_hash, source_refs) VALUES (%s, %s, %s, %s, %s, %s::jsonb) ON CONFLICT (data_hash) DO NOTHING""" try: execute_batch(cur, sql, rows, page_size=50) n = cur.rowcount cur.close() return n except Exception as e: log.error(f"upsert: {e}") return 0 def crawl(max_pages=400): log.info(f"=== rss.hr crawl (max {max_pages} pages) ===") conn = psycopg2.connect(DSN); conn.autocommit = True visited = set() queue = [ROOT, f"{ROOT}/clanovi/", f"{ROOT}/natjecanja/", f"{ROOT}/dokumenti/", f"{ROOT}/o-nama/", f"{ROOT}/sportasi-sezone/", f"{ROOT}/povjerenstva/", f"{ROOT}/strucni-savjet/"] total_facts = 0 pdfs = set() while queue and len(visited) < max_pages: url = queue.pop(0) if url in visited: continue visited.add(url) if len(visited) % 20 == 0: log.info(f" visited {len(visited)}, queue {len(queue)}, facts {total_facts}") result = fetch(url, timeout=15) if not result or not result[0]: continue html = result[0] # Save html try: h = hashlib.md5(url.encode()).hexdigest()[:16] with open(f"{HTML_DIR}/{h}.html", "w", encoding="utf-8") as f: f.write(html) except Exception: pass title = extract_title(html) text = extract_text(html) # PDFs for p in find_pdfs(html, url): pdfs.add(p) # Facts facts = [] if title and len(title) > 8: facts.append({"fact": f"rss.hr — {title}", "source": "rss.hr", "category": "rss_hr_riecki_sport_savez", "confidence": 0.90, "url": url}) for c in chunk(text, 800): if len(c) < 80: continue facts.append({"fact": c, "source": "rss.hr", "category": "rss_hr_riecki_sport_savez", "confidence": 0.85, "url": url}) total_facts += upsert(conn, facts) # Discover more links for l in find_internal_links(html, url): if l not in visited and len(queue) < 1000: queue.append(l) time.sleep(0.4) # Download PDFs pdf_dl = 0 for p in list(pdfs)[:100]: try: h = hashlib.md5(p.encode()).hexdigest()[:16] path = f"{PDF_DIR}/{h}.pdf" if os.path.exists(path): continue data, st = fetch(p, timeout=30, binary=True) if data and st == 200: with open(path, "wb") as f: f.write(data) pdf_dl += 1 time.sleep(0.8) except Exception: pass log.info(f"=== DONE: {len(visited)} visited, {total_facts} facts, {pdf_dl} pdfs ===") conn.close() return {"visited": len(visited), "facts": total_facts, "pdfs": pdf_dl} if __name__ == "__main__": r = crawl() print(json.dumps(r))