#!/usr/bin/env python3
"""Fetch real legal texts from narodne-novine.nn.hr and key sources.
Update sadrzaj column for accurate RAG."""
import os
import psycopg2
import psycopg2.extras
import requests
import re
import time
# PostgreSQL connection parameters (keyword arguments for psycopg2.connect).
# The password is read from the DB_PASSWORD environment variable; a missing
# variable raises KeyError as soon as this module is loaded.
DB = {
    'host': 'localhost',
    'port': 5432,
    'dbname': 'rinet_v3',
    'user': 'rinet',
    'password': os.environ["DB_PASSWORD"],
}
# Top legal documents to fetch (ID matching is done via the title pattern).
# Each entry carries:
#   title_pattern - text used to match the document title
#   razina        - level tag for the document ('RH' or 'EU')
#   urls          - one or more source pages holding the text
TARGETS = [
    # --- narodne-novine.nn.hr ---
    {
        'title_pattern': 'Zakon o sportu',
        'razina': 'RH',
        'urls': [
            'https://narodne-novine.nn.hr/clanci/sluzbeni/2022_12_141_2151.html',
            'https://narodne-novine.nn.hr/clanci/sluzbeni/2024_10_122_2087.html',
        ],
    },
    {
        'title_pattern': 'Zakon o udrugama',
        'razina': 'RH',
        'urls': [
            'https://narodne-novine.nn.hr/clanci/sluzbeni/2014_06_74_1390.html',
        ],
    },
    {
        'title_pattern': 'Zakon o sprečavanju dopinga',
        'razina': 'RH',
        'urls': [
            'https://narodne-novine.nn.hr/clanci/sluzbeni/2017_07_70_1671.html',
        ],
    },
    {
        'title_pattern': 'Pravilnik o stručnim poslovima',
        'razina': 'RH',
        'urls': [
            'https://narodne-novine.nn.hr/clanci/sluzbeni/2023_08_89_1334.html',
        ],
    },
    {
        'title_pattern': 'Zakon o lovstvu',
        'razina': 'RH',
        'urls': [
            'https://narodne-novine.nn.hr/clanci/sluzbeni/full/2018_11_99_1955.html',
        ],
    },
    {
        'title_pattern': 'Zakon o volonterstvu',
        'razina': 'RH',
        'urls': [
            'https://narodne-novine.nn.hr/clanci/sluzbeni/2007_06_58_1813.html',
        ],
    },
    {
        'title_pattern': 'Zakon o pravu na pristup informacijama',
        'razina': 'RH',
        'urls': [
            'https://narodne-novine.nn.hr/clanci/sluzbeni/2013_02_25_403.html',
        ],
    },
    {
        'title_pattern': 'Zakon o sprječavanju nereda',
        'razina': 'RH',
        'urls': [
            'https://narodne-novine.nn.hr/clanci/sluzbeni/2003_07_117_1631.html',
        ],
    },
    # --- eur-lex.europa.eu ---
    {
        'title_pattern': 'GDPR',
        'razina': 'EU',
        'urls': [
            'https://eur-lex.europa.eu/legal-content/HR/TXT/HTML/?uri=CELEX:32016R0679',
        ],
    },
]
# HTTP request headers sent with every fetch: a browser-like User-Agent
# carrying the PGZSport/1.0 tag, and an Accept header asking for HTML/XHTML.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 PGZSport/1.0',
    'Accept': 'text/html,application/xhtml+xml',
}
def clean_html(html):
    """Strip HTML tags, scripts, styles and comments; return clean text.

    Args:
        html: Raw HTML markup as a string.

    Returns:
        Plain text in which <br> and closing block-level tags have become
        newlines, all remaining tags are removed, character entities are
        decoded, runs of spaces/tabs are collapsed to a single space, and
        runs of blank lines are collapsed to one blank line. Leading and
        trailing whitespace is stripped.
    """
    # Local import: the parameter name `html` shadows the stdlib module.
    from html import unescape

    # Remove scripts, styles and comments together with their contents
    html = re.sub(r'<script\b[^>]*>.*?</script\s*>', '', html, flags=re.DOTALL | re.I)
    html = re.sub(r'<style\b[^>]*>.*?</style\s*>', '', html, flags=re.DOTALL | re.I)
    html = re.sub(r'<!--.*?-->', '', html, flags=re.DOTALL)
    # Replace breaks and closing block-level tags with newlines
    html = re.sub(r'<br\s*/?>', '\n', html, flags=re.I)
    html = re.sub(r'</(p|div|h[1-6]|li|tr)\s*>', '\n', html, flags=re.I)
    # Strip remaining tags
    html = re.sub(r'<[^>]+>', '', html)
    # Decode entities only after tags are gone, so escaped markup such as
    # "&lt;b&gt;" survives as literal text. unescape() works in a single
    # pass ("&amp;lt;" becomes "&lt;", not "<") and also covers named and
    # numeric entities for diacritics. A non-breaking space becomes a plain
    # space so the whitespace collapse below treats it like any other.
    html = unescape(html).replace('\xa0', ' ')
    # Collapse whitespace
    html = re.sub(r'[ \t]+', ' ', html)
    html = re.sub(r'\n\s*\n+', '\n\n', html)
    return html.strip()
def fetch_url(url, max_size=200000):
try:
r = requests.get(url, headers=HEADERS, timeout=20)
if r.status_code != 200:
return None
text = r.text
# NN.hr structure: extract main article body
m = re.search(r'