95 lines
3.9 KiB
Python
95 lines
3.9 KiB
Python
#!/usr/bin/env python3
|
|
# gov_hr_sport_scraper.py — crawler for the Croatian Ministry of Tourism and Sport (Ministarstvo turizma i sporta) and related sport sites
|
|
import os, time, hashlib, logging, re, json
|
|
from urllib.parse import urljoin, urlparse
|
|
import urllib.request
|
|
import psycopg2
|
|
from html import unescape
|
|
|
|
# Log INFO and above, each line stamped with the time and a fixed "[gov_sport]" tag.
logging.basicConfig(level=logging.INFO, format='%(asctime)s [gov_sport] %(message)s')
log = logging.getLogger("gov_sport")

# PostgreSQL connection string in libpq key/value form. The GOV_SPORT_DSN
# environment variable takes precedence so that credentials can be supplied
# without editing this file.
# NOTE(review): the fallback literal below embeds a password in source control.
# Rotate that credential and remove the fallback once every deployment sets
# GOV_SPORT_DSN.
DSN = os.environ.get(
    "GOV_SPORT_DSN",
    "host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password=R1net2026!SecureDB#v7",
)

# User-Agent header sent with every HTTP request.
UA = "Mozilla/5.0 (Ri.NET Bot 1.0; contact: dradulic@outlook.com)"

# Seed URLs the crawl starts from.
ROOTS = [
    "https://mint.gov.hr",
    "https://mint.gov.hr/sport-i-rekreacija/87",
    "https://mint.gov.hr/sport-i-rekreacija/javne-potrebe-u-sportu",
    "https://sport.gov.hr",
    "https://hoo.hr",
]
|
|
|
|
def fetch(url):
    """Download *url* and return ``(body_text, status)``.

    The body is decoded with the charset declared in the response's
    Content-Type header. When no charset is declared, or the declared name is
    not a codec Python knows, UTF-8 is used instead. Undecodable bytes are
    replaced rather than raising.

    This is deliberately best-effort: any exception raised while building the
    request, opening the URL or reading the body is logged as a warning and
    reported as ``(None, 0)`` so the caller can simply skip the page.
    """
    try:
        req = urllib.request.Request(url, headers={"User-Agent": UA})
        with urllib.request.urlopen(req, timeout=20) as r:
            raw = r.read()
            # Honour the server-declared encoding instead of assuming UTF-8.
            charset = r.headers.get_content_charset() or "utf-8"
            try:
                body = raw.decode(charset, errors="replace")
            except LookupError:
                # Unknown/bogus charset label: fall back to UTF-8.
                body = raw.decode("utf-8", errors="replace")
            return body, r.status
    except Exception as e:
        log.warning("Fail %s: %s", url, e)
        return None, 0
|
|
|
|
def extract_text(html):
    """Reduce an HTML document to a single line of plain text.

    Steps, in order: drop ``<script>`` and ``<style>`` elements together with
    their contents, replace every remaining tag with a space, decode HTML
    character references, remove NUL characters, then collapse every run of
    whitespace to one space and trim both ends.

    Returns ``""`` when *html* is empty or ``None``.
    """
    if not html:
        return ""
    text = re.sub(r'<script[^>]*>.*?</script>', '', html, flags=re.S | re.I)
    text = re.sub(r'<style[^>]*>.*?</style>', '', text, flags=re.S | re.I)
    text = re.sub(r'<[^>]+>', ' ', text)
    # Remove NULs *before* normalising whitespace: NUL is not matched by \s,
    # so stripping it afterwards would leave double spaces or leading/trailing
    # blanks behind where the NUL used to separate whitespace.
    text = unescape(text).replace('\x00', '')
    return re.sub(r'\s+', ' ', text).strip()
|
|
|
|
def find_links(html, base, domains=("mint.gov.hr", "sport.gov.hr", "hoo.hr")):
    """Return the in-scope links found in *html*, resolved against *base*.

    Every quoted ``href="..."`` value is resolved to an absolute URL, its
    fragment is dropped, and it is kept only when the scheme is http/https and
    the host is one of *domains* or a subdomain of one.

    Args:
        html: Page markup; empty or ``None`` yields ``[]``.
        base: URL of the page, used to resolve relative links.
        domains: Registrable hosts that are in scope for the crawl.

    Returns:
        A list of unique absolute URLs in order of first appearance.
    """
    if not html:
        return []
    found = {}  # dict used as an insertion-ordered set
    for m in re.finditer(r'href=["\']([^"\']+)["\']', html, re.I):
        # Only "&amp;" is decoded: a full html.unescape() would also rewrite
        # legacy semicolon-less entities inside query strings (e.g. "&copy=").
        href = m.group(1).strip().replace("&amp;", "&")
        try:
            # Fragments never change the fetched document, so drop them to
            # avoid crawling the same page once per anchor.
            u = urljoin(base, href).split('#', 1)[0]
            parts = urlparse(u)
            host = parts.hostname or ""
        except ValueError:
            # Malformed href (e.g. unbalanced IPv6 bracket); ignore it.
            continue
        if parts.scheme not in ("http", "https"):
            continue
        # Exact or subdomain match; a plain substring test would also accept
        # hosts such as "shoo.hr" or "mint.gov.hr.evil.example".
        if any(host == d or host.endswith("." + d) for d in domains):
            found[u] = None
    return list(found)
|
|
|
|
# Substrings that mark a page as sport-relevant (matched case-insensitively).
_SPORT_KEYWORDS = ('sport', 'klub', 'savez', 'sportaš', 'natjecanj', 'olimpij', 'paraolimp')
_CHUNK_SIZE = 800   # characters per knowledge chunk
_MAX_CHUNKS = 4     # only the first N chunks of a page are stored
_MIN_TEXT_LEN = 200  # shorter pages / chunks are ignored


def _store_document(cur, url, title, text):
    """Insert one page into pgz_sport.dokumenti; return the number of rows added."""
    sha1 = hashlib.sha1(text[:5000].encode()).hexdigest()
    try:
        cur.execute("""INSERT INTO pgz_sport.dokumenti
            (url, fname, title, sadrzaj, vrsta, izvor_url, scraped_at, sha1, organizacija)
            VALUES (%s,%s,%s,%s,%s,%s,now(),%s,%s) ON CONFLICT DO NOTHING""",
            (url, url.split('/')[-1][:100], title[:200], text[:50000], 'web', url, sha1, 'MTIS/HOO'))
        return cur.rowcount
    except Exception as e:
        # Best-effort: one bad row must not stop the crawl, but do not hide it.
        log.warning("Insert into pgz_sport.dokumenti failed for %s: %s", url, e)
        return 0


def _store_facts(cur, url, text):
    """Insert the leading chunks of a sport-relevant page into dabi.knowledge; return rows added."""
    lowered = text.lower()
    if not any(kw in lowered for kw in _SPORT_KEYWORDS):
        return 0
    inserted = 0
    refs = json.dumps([{"url": url}])
    starts = range(0, min(len(text), _MAX_CHUNKS * _CHUNK_SIZE), _CHUNK_SIZE)
    for ci, start in enumerate(starts):
        chunk = text[start:start + _CHUNK_SIZE]
        if len(chunk) < _MIN_TEXT_LEN:
            continue
        # Hash of url + chunk index + chunk prefix is the deduplication key.
        fact_hash = hashlib.sha256((url + str(ci) + chunk[:100]).encode()).hexdigest()[:32]
        try:
            cur.execute("""INSERT INTO dabi.knowledge
                (fact, category, source, source_refs, confidence, data_hash, created_at)
                VALUES (%s,'gov_hr_sport','gov_hr_sport_scraper',%s::jsonb,0.85,%s,now())
                ON CONFLICT (data_hash) DO NOTHING""",
                (chunk, refs, fact_hash))
            inserted += cur.rowcount
        except Exception as e:
            log.warning("Insert into dabi.knowledge failed for %s chunk %d: %s", url, ci, e)
    return inserted


def harvest(max_pages=150, delay=2):
    """Crawl breadth-first from ROOTS and store page text in PostgreSQL.

    For each page that is fetched with status 200 and yields at least 200
    characters of text, the page is inserted into ``pgz_sport.dokumenti`` and,
    if it mentions a sport keyword, its first chunks are inserted into
    ``dabi.knowledge``. Up to 25 in-scope links per stored page are queued.
    Pages that are skipped (failed fetch or too little text) do not contribute
    links.

    Args:
        max_pages: Stop once this many URLs have been attempted.
        delay: Seconds to sleep before every request.

    Database errors on individual inserts are logged and skipped. The
    connection is opened in autocommit mode and is always closed, even if the
    crawl raises.
    """
    from collections import deque

    conn = psycopg2.connect(DSN)
    try:
        conn.autocommit = True
        with conn.cursor() as cur:
            visited = set()
            queue = deque(ROOTS)
            # Everything ever enqueued; equals visited ∪ queue, so one O(1)
            # lookup replaces the two membership tests on set and list.
            queued = set(ROOTS)
            docs = facts = 0
            while queue and len(visited) < max_pages:
                url = queue.popleft()
                if url in visited:
                    continue
                visited.add(url)
                time.sleep(delay)
                html, status = fetch(url)
                if not html or status != 200:
                    continue
                log.info("[%s] %s", status, url[:80])
                text = extract_text(html)
                if len(text) < _MIN_TEXT_LEN:
                    continue
                title_m = re.search(r'<title[^>]*>([^<]+)</title>', html, re.I)
                title = title_m.group(1).strip() if title_m else url[:80]
                docs += _store_document(cur, url, title, text)
                # Knowledge extract — sport-relevant pages only.
                facts += _store_facts(cur, url, text)
                for link in find_links(html, url)[:25]:
                    if link not in queued:
                        queued.add(link)
                        queue.append(link)
            log.info(f"FINAL: visited={len(visited)} docs={docs} facts={facts}")
    finally:
        conn.close()
|
|
|
|
if __name__ == "__main__":
    # Executed as a script (not imported): run one full crawl.
    harvest()
|