#!/usr/bin/env python3
from dotenv import load_dotenv
load_dotenv('/opt/rinet-gpu/.env.master')  # auto-added by patch_scrapers_with_dotenv.sh
# gov_hr_sport_scraper.py — Ministarstvo turizma i sporta
# (Croatian Ministry of Tourism and Sport) site scraper.
import os
import time
import hashlib
import logging
import re
import json
from urllib.parse import urljoin, urlparse
import urllib.request
import psycopg2
from html import unescape

logging.basicConfig(level=logging.INFO, format='%(asctime)s [gov_sport] %(message)s')
log = logging.getLogger("gov_sport")

# DB_PASSWORD must be present in the environment (load_dotenv above runs first);
# a missing variable raises KeyError at import time.
DSN = f"host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password={os.environ['DB_PASSWORD']}"
UA = "Mozilla/5.0 (Ri.NET Bot 1.0; contact: dradulic@outlook.com)"

# Seed URLs for the crawl.
ROOTS = [
    "https://mint.gov.hr",
    "https://mint.gov.hr/sport-i-rekreacija/87",
    "https://mint.gov.hr/sport-i-rekreacija/javne-potrebe-u-sportu",
    "https://sport.gov.hr",
    "https://hoo.hr",
]

# Domains the crawler may follow links into (the domain itself or a subdomain).
_ALLOWED_DOMAINS = ('mint.gov.hr', 'sport.gov.hr', 'hoo.hr')

# Patterns are compiled once at import time; they are reused for every page.
_SCRIPT_RE = re.compile(r'<script\b[^>]*>.*?</script\s*>', re.S | re.I)
_STYLE_RE = re.compile(r'<style\b[^>]*>.*?</style\s*>', re.S | re.I)
_TAG_RE = re.compile(r'<[^>]+>')
_WHITESPACE_RE = re.compile(r'\s+')
_HREF_RE = re.compile(r'href=["\']([^"\']+)["\']', re.I)


def fetch(url):
    """Download *url* and return its body together with the HTTP status.

    Args:
        url: URL to request. The request carries the bot ``UA`` as its
            ``User-Agent`` header and uses a 20 second timeout.

    Returns:
        A ``(text, status)`` tuple. ``text`` is the response body decoded as
        UTF-8 with undecodable bytes replaced; ``status`` is the response
        status code. On any failure the error is logged as a warning and
        ``(None, 0)`` is returned instead of raising.
    """
    req = urllib.request.Request(url, headers={"User-Agent": UA})
    try:
        with urllib.request.urlopen(req, timeout=20) as r:
            return r.read().decode('utf-8', errors='replace'), r.status
    except Exception as e:
        # Deliberately broad: one unreachable or broken page must not stop
        # the crawl, so every failure is reported and turned into (None, 0).
        log.warning("Fail %s: %s", url, e)
        return None, 0


def extract_text(html):
    """Reduce an HTML document to plain, whitespace-normalised text.

    Args:
        html: HTML markup as a string; ``None`` or ``""`` is accepted.

    Returns:
        The text with script and style elements removed, remaining tags
        replaced by spaces, character references decoded, whitespace runs
        collapsed to single spaces, and NUL characters dropped. Returns
        ``""`` for empty input.
    """
    if not html:
        return ""
    # The previous code called re.sub with an EMPTY pattern here, which is a
    # no-op, so JavaScript and CSS source ended up in the extracted text.
    # Script and style elements are removed together with their contents.
    text = _SCRIPT_RE.sub('', html)
    text = _STYLE_RE.sub('', text)
    text = _TAG_RE.sub(' ', text)
    # Decode entities only after tag stripping so that an escaped "&lt;" in
    # the page text is never mistaken for markup.
    return _WHITESPACE_RE.sub(' ', unescape(text)).strip().replace('\x00', '')


def _is_allowed_host(host):
    """Return True if *host* is an allowed domain or a subdomain of one."""
    host = host.rstrip('.')
    return any(host == d or host.endswith('.' + d) for d in _ALLOWED_DOMAINS)


def find_links(html, base):
    """Collect the in-scope absolute URLs referenced by ``href`` attributes.

    Args:
        html: HTML markup to scan; ``None`` or ``""`` yields no links.
        base: URL of the page, used to resolve relative hrefs.

    Returns:
        A list of unique absolute URLs, in order of first appearance, whose
        host is one of the allowed domains or a subdomain of one.
    """
    if not html:
        return []
    found = {}  # dict keys give de-duplication with a stable, first-seen order
    for m in _HREF_RE.finditer(html):
        try:
            # Attribute values are HTML-escaped ("&amp;" for "&"), so decode
            # them before resolving against the page URL.
            u = urljoin(base, unescape(m.group(1)).strip())
            host = urlparse(u).hostname or ""
        except ValueError:
            # Malformed href (e.g. a broken IPv6 literal): skip this link.
            continue
        # Match on the host boundary; a plain substring test would also
        # accept look-alike hosts such as "evilhoo.hr".
        if _is_allowed_host(host):
            found[u] = None
    return list(found)


# NOTE(review): harvest() begins here but continues beyond the visible part of
# the file, so it is left untouched. Its whitespace-collapsed head is preserved
# verbatim on the next line; it must be re-joined with the source line that
# follows when the file's line breaks are restored.
# def harvest(): conn = psycopg2.connect(DSN); conn.autocommit = True cur = conn.cursor() visited = set(); queue = list(ROOTS) docs = facts = 0 while queue and len(visited) < 150: url = queue.pop(0) if url in visited: continue visited.add(url) time.sleep(2) html, status = fetch(url) if not html or status != 200: continue log.info(f"[{status}] {url[:80]}") text = 
extract_text(html) if len(text) < 200: continue title_m = re.search(r'