#!/usr/bin/env python3
# ═══════════════════════════════════════════════════════════════════
# File: rss_hr_full.py | v1.0.0 | 05.05.2026
# Location: /opt/pgz-sport/scrapers/rss_hr_full.py
# Purpose: full crawl of rss.hr (Riječki sport savez)
# ═══════════════════════════════════════════════════════════════════
"""rss.hr complete corpus."""
import os, sys, re, time, hashlib, logging, json
from urllib.parse import urljoin, urlparse
import urllib.request
from html import unescape
import psycopg2
from psycopg2.extras import execute_batch
# Module-wide logging: INFO level, every record tagged with "[rss_hr]".
logging.basicConfig(level=logging.INFO, format="%(asctime)s [rss_hr] %(message)s")
log = logging.getLogger("rss_hr")

# PostgreSQL connection string. The RSS_HR_DSN environment variable, when set
# and non-empty, takes precedence over the built-in value.
# SECURITY NOTE(review): the fallback below embeds a plaintext password in the
# source file. It is kept only so existing deployments keep working; move the
# secret to the environment (or a secrets store), rotate it, and then remove
# the literal.
DSN = os.environ.get("RSS_HR_DSN") or (
    "host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password=R1net2026!SecureDB#v7"
)

# User-Agent header sent with every HTTP request.
UA = "Mozilla/5.0 (Ri.NET Civic Bot 1.0; contact: dradulic@outlook.com)"

# Site root; seed URLs are built from it.
ROOT = "https://rss.hr"

# Output directories for raw page snapshots and downloaded PDFs.
HTML_DIR = "/opt/pgz-sport/data/rss_hr_html"
PDF_DIR = "/opt/pgz-sport/data/rss_hr_pdf"

# Created at import time so later writes do not fail on a missing directory.
os.makedirs(HTML_DIR, exist_ok=True)
os.makedirs(PDF_DIR, exist_ok=True)
def fetch(url, timeout=20, retries=3, binary=False):
    """Download *url* and return a ``(body, status)`` tuple.

    Args:
        url: Absolute URL to request.
        timeout: Per-attempt socket timeout in seconds.
        retries: Maximum number of attempts.
        binary: When true, return the raw ``bytes``; otherwise decode the
            body as UTF-8, replacing undecodable bytes.

    Returns:
        ``(body, http_status)`` on success, ``(None, 0)`` when every attempt
        failed or the server answered with a non-retryable client error.
    """
    from urllib.error import HTTPError

    for attempt in range(retries):
        try:
            req = urllib.request.Request(url, headers={"User-Agent": UA})
            with urllib.request.urlopen(req, timeout=timeout) as resp:
                payload = resp.read()
                body = payload if binary else payload.decode("utf-8", errors="replace")
                return body, resp.status
        except HTTPError as exc:
            # A 4xx answer (other than request-timeout / rate-limit) will not
            # change on retry; give up immediately instead of sleeping 3+6+9 s
            # on every dead link.
            if 400 <= exc.code < 500 and exc.code not in (408, 429):
                log.warning("fetch %s: HTTP %s, not retrying", url, exc.code)
                return None, 0
            log.debug("fetch %s: HTTP %s (attempt %d/%d)", url, exc.code, attempt + 1, retries)
        except Exception as exc:
            # Best-effort crawler: any other failure (DNS, timeout, reset, ...)
            # is treated as transient and retried.
            log.debug("fetch %s: %s (attempt %d/%d)", url, exc, attempt + 1, retries)
        # Linear back-off between attempts; no point sleeping after the last one.
        if attempt + 1 < retries:
            time.sleep(3 * (attempt + 1))
    log.warning("fetch %s: giving up after %d attempt(s)", url, retries)
    return None, 0
def extract_title(html):
    """Return the text of the first ``<title>`` element in *html*.

    HTML entities are decoded and runs of whitespace are collapsed to a single
    space. Returns ``""`` when *html* is empty/``None`` or has no non-empty
    title.
    """
    # Case-insensitive; tolerates attributes on the tag. The capture stops at
    # the next "<", i.e. at the closing tag.
    m = re.search(r"<title\b[^>]*>([^<]+)", html or "", re.I)
    if not m:
        return ""
    return re.sub(r"\s+", " ", unescape(m.group(1))).strip()
def extract_text(html):
    """Reduce an HTML document to plain, whitespace-normalised text.

    Comments and the full contents of ``<script>``, ``<style>`` and
    ``<noscript>`` elements are dropped first, then every remaining tag is
    replaced by a space, entities are decoded and whitespace is collapsed.
    Returns ``""`` for empty or ``None`` input.
    """
    body = html or ""
    # Comments go first: they may contain ">" and would otherwise survive the
    # generic tag stripper below as stray text.
    body = re.sub(r"<!--.*?-->", " ", body, flags=re.S)
    # Non-content elements are removed together with their bodies so that
    # JavaScript / CSS source never ends up in the extracted text.
    # NOTE(review): confirm whether further elements (e.g. nav/footer) should
    # also be excluded for this site.
    for tag in ("script", "style", "noscript"):
        body = re.sub(rf"<{tag}\b.*?</{tag}\s*>", " ", body, flags=re.S | re.I)
    body = re.sub(r"<[^>]+>", " ", body)
    return re.sub(r"\s+", " ", unescape(body)).strip()
def find_internal_links(html, base, domain="rss.hr"):
    """Collect same-site http(s) links from *html*.

    Args:
        html: Page markup; empty or ``None`` yields ``[]``.
        base: URL of the page, used to resolve relative hrefs.
        domain: Site domain; a link is internal when its host equals this
            domain or is a sub-domain of it.

    Returns:
        Sorted list of unique absolute URLs with the ``#fragment`` removed
        (the query string is kept).
    """
    if not html:
        return []
    subdomain_suffix = "." + domain
    links = set()
    for match in re.finditer(r'href=["\']([^"\']+)["\']', html):
        # Attribute values are HTML-escaped ("&amp;" -> "&").
        href = unescape(match.group(1)).strip()
        try:
            url = urljoin(base, href)
            parts = urlparse(url)
        except ValueError:
            # Malformed href (e.g. broken IPv6 literal) - ignore it.
            continue
        if parts.scheme not in ("http", "https"):
            continue
        host = parts.hostname or ""
        # Exact host / sub-domain match; a plain substring test would also
        # accept hosts such as "evilrss.hr" or "rss.hr.example.com".
        if host == domain or host.endswith(subdomain_suffix):
            links.add(url.split("#", 1)[0])
    # Sorted so the crawl order is reproducible between runs.
    return sorted(links)
def find_pdfs(html, base):
    """Return the absolute URLs of all hrefs in *html* that end in ``.pdf``.

    The extension match is case-insensitive, relative hrefs are resolved
    against *base*, and duplicates are removed. ``None`` is treated as an
    empty document. The order of the returned list is unspecified.
    """
    pdf_href = re.compile(r'href=["\']([^"\']+\.pdf)["\']', re.I)
    return list({urljoin(base, hit.group(1)) for hit in pdf_href.finditer(html or "")})
def chunk(text, max_len=800):
    """Split *text* into pieces of at most *max_len* characters.

    A text that already fits is returned unchanged as a single-element list.
    Longer text is cut preferably just after a sentence separator
    (``". "``, ``"! "``, ``"? "`` or a newline, tried in that order) lying in
    the second half of the current window; otherwise it is cut hard at
    *max_len*. Pieces are stripped, and pieces of 50 characters or fewer are
    discarded in the multi-piece case.

    Args:
        text: Text to split; empty or ``None`` yields ``[]``.
        max_len: Maximum piece length, must be >= 1.

    Raises:
        ValueError: if *max_len* is smaller than 1 (the window could never
            advance and the loop would not terminate).
    """
    if not text:
        return []
    if max_len < 1:
        raise ValueError("max_len must be >= 1")
    if len(text) <= max_len:
        return [text]

    pieces = []
    start = 0
    total = len(text)
    while start < total:
        end = min(start + max_len, total)
        if end < total:
            # Prefer a sentence boundary, but only past the window's midpoint
            # so pieces do not become needlessly short. The first separator
            # kind that qualifies wins.
            for sep in (". ", "! ", "? ", "\n"):
                cut = text.rfind(sep, start, end)
                if cut > start + max_len // 2:
                    end = cut + len(sep)
                    break
        pieces.append(text[start:end].strip())
        start = end
    return [piece for piece in pieces if len(piece) > 50]
def upsert(conn, facts):
    """Insert *facts* into ``dabi.knowledge``, skipping already-known ones.

    Each fact is keyed by the MD5 hex digest of its text (``data_hash``); rows
    whose hash already exists are ignored via ``ON CONFLICT DO NOTHING``.

    Args:
        conn: Open psycopg2 connection. This function never commits, so the
            caller is expected to run the connection in autocommit mode.
        facts: Iterable of dicts with required keys ``fact`` and ``source``
            and optional ``category``, ``confidence`` and ``url``.

    Returns:
        Number of rows actually inserted. On a database error the error is
        logged and the count reached so far is returned.
    """
    if not facts:
        return 0
    rows = [
        (
            fact["fact"],
            fact["source"],
            fact.get("category", "rss_hr"),
            fact.get("confidence", 0.85),
            hashlib.md5(fact["fact"].encode()).hexdigest(),
            json.dumps({"url": fact.get("url", "")}),
        )
        for fact in facts
    ]
    sql = """INSERT INTO dabi.knowledge (fact, source, category, confidence, data_hash, source_refs)
             VALUES (%s, %s, %s, %s, %s, %s::jsonb) ON CONFLICT (data_hash) DO NOTHING"""
    inserted = 0
    try:
        # Rows are executed one by one on purpose: a batched execute joins
        # many statements into one call, after which cursor.rowcount only
        # describes the last statement and cannot be used as an insert count.
        # The `with` block closes the cursor on the error path as well.
        with conn.cursor() as cur:
            for row in rows:
                cur.execute(sql, row)
                inserted += max(cur.rowcount, 0)
    except Exception as e:
        log.error(f"upsert: {e}")
    return inserted
# URL path extensions that never denote an HTML page; such links are not
# queued for crawling (PDF links are still collected via find_pdfs).
_NON_PAGE_EXTENSIONS = frozenset({
    ".pdf", ".jpg", ".jpeg", ".png", ".gif", ".webp", ".svg", ".ico", ".bmp",
    ".css", ".js", ".zip", ".rar", ".7z", ".gz",
    ".doc", ".docx", ".xls", ".xlsx", ".ppt", ".pptx", ".odt", ".ods",
    ".mp3", ".mp4", ".avi", ".mov", ".woff", ".woff2", ".ttf",
})

# Upper bound on the number of pending URLs.
_MAX_QUEUE = 1000


def _is_page_url(url):
    """Return True unless the URL path ends in a known non-HTML extension."""
    try:
        path = urlparse(url).path
    except ValueError:
        return False
    return os.path.splitext(path)[1].lower() not in _NON_PAGE_EXTENSIONS


def _save_html(url, html):
    """Best-effort: store the raw page in HTML_DIR, named by a hash of its URL."""
    name = hashlib.md5(url.encode()).hexdigest()[:16]
    try:
        with open(f"{HTML_DIR}/{name}.html", "w", encoding="utf-8") as fh:
            fh.write(html)
    except OSError as exc:
        # A failed snapshot must not stop the crawl, but should be visible.
        log.warning("could not save snapshot of %s: %s", url, exc)


def _page_facts(url, html):
    """Build the fact dicts (one title fact plus text chunks) for a page."""
    facts = []
    title = extract_title(html)
    if len(title) > 8:
        facts.append({"fact": f"rss.hr — {title}", "source": "rss.hr",
                      "category": "rss_hr_riecki_sport_savez",
                      "confidence": 0.90, "url": url})
    for piece in chunk(extract_text(html), 800):
        if len(piece) < 80:
            continue
        facts.append({"fact": piece, "source": "rss.hr",
                      "category": "rss_hr_riecki_sport_savez",
                      "confidence": 0.85, "url": url})
    return facts


def _download_pdfs(pdf_urls, limit):
    """Fetch up to *limit* not-yet-stored PDFs into PDF_DIR; return how many were saved."""
    saved = 0
    attempts = 0
    # Sorted for a reproducible order; files already on disk do not count
    # against the limit, so repeated runs make progress through the full set.
    for pdf_url in sorted(pdf_urls):
        if attempts >= limit:
            break
        name = hashlib.md5(pdf_url.encode()).hexdigest()[:16]
        path = f"{PDF_DIR}/{name}.pdf"
        if os.path.exists(path):
            continue
        attempts += 1
        data, status = fetch(pdf_url, timeout=30, binary=True)
        if data and status == 200:
            # Write to a temporary name and rename, so an interrupted write
            # never leaves a truncated file that later runs would skip.
            tmp_path = path + ".part"
            try:
                with open(tmp_path, "wb") as fh:
                    fh.write(data)
                os.replace(tmp_path, path)
                saved += 1
            except OSError as exc:
                log.warning("could not save %s: %s", pdf_url, exc)
        time.sleep(0.8)  # politeness delay between PDF requests
    return saved


def crawl(max_pages=400, max_pdfs=100):
    """Crawl rss.hr breadth-first, store page facts in the DB and download PDFs.

    Starting from a fixed list of seed URLs, each page is fetched, snapshotted
    to HTML_DIR, turned into facts (title + text chunks) that are upserted
    into the database, and scanned for further internal links and PDF links.
    After the page crawl, discovered PDFs are downloaded into PDF_DIR.

    Args:
        max_pages: Maximum number of URLs to visit (failed fetches count too).
        max_pdfs: Maximum number of PDF downloads attempted in this run.

    Returns:
        Dict with the keys ``visited``, ``facts`` and ``pdfs``.
    """
    from collections import deque

    log.info("=== rss.hr crawl (max %s pages) ===", max_pages)
    conn = psycopg2.connect(DSN)
    conn.autocommit = True
    try:
        seeds = [ROOT, f"{ROOT}/clanovi/", f"{ROOT}/natjecanja/",
                 f"{ROOT}/dokumenti/", f"{ROOT}/o-nama/",
                 f"{ROOT}/sportasi-sezone/", f"{ROOT}/povjerenstva/",
                 f"{ROOT}/strucni-savjet/"]
        queue = deque(seeds)
        # Every URL ever queued; prevents the same link from occupying
        # several slots of the bounded queue.
        enqueued = set(seeds)
        visited = set()
        pdfs = set()
        total_facts = 0

        while queue and len(visited) < max_pages:
            url = queue.popleft()
            visited.add(url)
            if len(visited) % 20 == 0:
                log.info(" visited %d, queue %d, facts %d",
                         len(visited), len(queue), total_facts)

            html, _status = fetch(url, timeout=15)
            if not html:
                continue

            _save_html(url, html)
            pdfs.update(find_pdfs(html, url))
            total_facts += upsert(conn, _page_facts(url, html))

            for link in find_internal_links(html, url):
                if len(queue) >= _MAX_QUEUE:
                    break
                if link in enqueued or not _is_page_url(link):
                    continue
                enqueued.add(link)
                queue.append(link)
            time.sleep(0.4)  # politeness delay between page requests

        pdf_dl = _download_pdfs(pdfs, max_pdfs)
        log.info("=== DONE: %d visited, %d facts, %d pdfs ===",
                 len(visited), total_facts, pdf_dl)
        return {"visited": len(visited), "facts": total_facts, "pdfs": pdf_dl}
    finally:
        # Release the DB connection even when the crawl aborts with an error.
        conn.close()
if __name__ == "__main__":
    # Script entry point: run a crawl with the default limits and print the
    # resulting summary dict as a single JSON line on stdout.
    print(json.dumps(crawl()))