#!/usr/bin/env python3
"""Fetch real legal texts from narodne-novine.nn.hr and key sources.

Update sadrzaj column for accurate RAG.
"""
import os
import re
import time
from html import unescape
from typing import Optional

import psycopg2
import psycopg2.extras
import requests

DB = dict(host='localhost', port=5432, dbname='rinet_v3', user='rinet',
          password=os.environ["DB_PASSWORD"])

# Top legal documents to fetch (rows are matched by ID via the title pattern).
# Each target lists candidate URLs; they are tried in order until one yields
# enough text.
TARGETS = [
    {
        'title_pattern': 'Zakon o sportu',
        'razina': 'RH',
        'urls': [
            'https://narodne-novine.nn.hr/clanci/sluzbeni/2022_12_141_2151.html',
            'https://narodne-novine.nn.hr/clanci/sluzbeni/2024_10_122_2087.html',
        ],
    },
    {
        'title_pattern': 'Zakon o udrugama',
        'razina': 'RH',
        'urls': ['https://narodne-novine.nn.hr/clanci/sluzbeni/2014_06_74_1390.html'],
    },
    {
        'title_pattern': 'Zakon o sprečavanju dopinga',
        'razina': 'RH',
        'urls': ['https://narodne-novine.nn.hr/clanci/sluzbeni/2017_07_70_1671.html'],
    },
    {
        'title_pattern': 'Pravilnik o stručnim poslovima',
        'razina': 'RH',
        'urls': ['https://narodne-novine.nn.hr/clanci/sluzbeni/2023_08_89_1334.html'],
    },
    {
        'title_pattern': 'Zakon o lovstvu',
        'razina': 'RH',
        'urls': ['https://narodne-novine.nn.hr/clanci/sluzbeni/full/2018_11_99_1955.html'],
    },
    {
        'title_pattern': 'Zakon o volonterstvu',
        'razina': 'RH',
        'urls': ['https://narodne-novine.nn.hr/clanci/sluzbeni/2007_06_58_1813.html'],
    },
    {
        'title_pattern': 'Zakon o pravu na pristup informacijama',
        'razina': 'RH',
        'urls': ['https://narodne-novine.nn.hr/clanci/sluzbeni/2013_02_25_403.html'],
    },
    {
        'title_pattern': 'Zakon o sprječavanju nereda',
        'razina': 'RH',
        'urls': ['https://narodne-novine.nn.hr/clanci/sluzbeni/2003_07_117_1631.html'],
    },
    {
        'title_pattern': 'GDPR',
        'razina': 'EU',
        'urls': ['https://eur-lex.europa.eu/legal-content/HR/TXT/HTML/?uri=CELEX:32016R0679'],
    },
]

HEADERS = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 PGZSport/1.0',
    'Accept': 'text/html,application/xhtml+xml',
}

# A fetched page must yield more than this many characters of clean text to
# be accepted; shorter results are treated as a failed fetch.
MIN_TEXT_LENGTH = 1000

# Patterns used by clean_html(), compiled once at import time.
_SCRIPT_RE = re.compile(r'<script[^>]*>.*?</script>', re.DOTALL | re.I)
_STYLE_RE = re.compile(r'<style[^>]*>.*?</style>', re.DOTALL | re.I)
_COMMENT_RE = re.compile(r'<!--.*?-->', re.DOTALL)
_BR_RE = re.compile(r'<br\s*/?>', re.I)
_BLOCK_END_RE = re.compile(r'</(?:p|div|h[1-6]|li|tr)\s*>', re.I)
_TAG_RE = re.compile(r'<[^>]+>')
_HSPACE_RE = re.compile(r'[ \t]+')
_BLANK_LINES_RE = re.compile(r'\n\s*\n+')

# Patterns used by fetch_url().
_ARTICLE_RE = re.compile(
    r'<div[^>]*class="[^"]*clanak[^"]*"[^>]*>(.*?)</div>\s*'
    r'<div[^>]*class="metapodaci"',
    re.DOTALL | re.I,
)
_CHROME_RES = tuple(
    re.compile(rf'<{tag}\b.*?</{tag}\s*>', re.DOTALL | re.I)
    for tag in ('nav', 'header', 'footer')
)


def clean_html(html: str) -> str:
    """Strip HTML tags, scripts, styles. Return clean text.

    Args:
        html: Raw HTML markup.

    Returns:
        Plain text with entities decoded, runs of spaces/tabs collapsed to a
        single space, runs of blank lines collapsed to one blank line, and
        leading/trailing whitespace removed.
    """
    # Remove scripts, styles and comments together with their content.
    html = _SCRIPT_RE.sub('', html)
    html = _STYLE_RE.sub('', html)
    html = _COMMENT_RE.sub('', html)
    # Replace line breaks and block-closing tags with newlines so paragraph
    # structure survives the tag stripping below.
    html = _BR_RE.sub('\n', html)
    html = _BLOCK_END_RE.sub('\n', html)
    # Strip remaining tags.
    html = _TAG_RE.sub('', html)
    # Decode entities in a single pass; chained str.replace calls would decode
    # "&amp;lt;" twice. Non-breaking spaces become plain spaces so that they
    # take part in the whitespace collapsing.
    html = unescape(html).replace('\xa0', ' ')
    # Collapse whitespace.
    html = _HSPACE_RE.sub(' ', html)
    html = _BLANK_LINES_RE.sub('\n\n', html)
    return html.strip()


def fetch_url(url: str, max_size: int = 200000) -> Optional[str]:
    """Download a page and return its cleaned text.

    Args:
        url: Address of the document to download.
        max_size: Maximum number of characters of clean text to return.

    Returns:
        The cleaned text truncated to ``max_size`` characters, or ``None``
        when the request fails or the response status is not 200.
    """
    try:
        r = requests.get(url, headers=HEADERS, timeout=20)
    except requests.RequestException as e:
        print(f" err fetch {url[:80]}: {e}")
        return None
    if r.status_code != 200:
        return None

    # Without a declared charset requests decodes text/* as ISO-8859-1, which
    # garbles Croatian diacritics; fall back to the detected encoding instead.
    if 'charset' not in r.headers.get('Content-Type', '').lower():
        r.encoding = r.apparent_encoding
    text = r.text

    # NN.hr structure: extract the main article body.
    m = _ARTICLE_RE.search(text)
    if m:
        text = m.group(1)
    else:
        # Fallback: remove navigation, headers, footers.
        for chrome_re in _CHROME_RES:
            text = chrome_re.sub('', text)

    return clean_html(text)[:max_size]


def main() -> None:
    """Fetch the text of every target and store it in its database row.

    For each entry in TARGETS the first matching row of pgz_sport.dokumenti
    is looked up by title pattern and level; the target's URLs are tried in
    order, and the first usable text is written to the row's sadrzaj column.
    """
    conn = psycopg2.connect(**DB)
    try:
        conn.autocommit = True
        cu = conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor)
        n_updated = 0
        n_failed = 0

        for tgt in TARGETS:
            cu.execute(
                """SELECT id, title FROM pgz_sport.dokumenti
                   WHERE title ILIKE %s AND razina = %s
                   ORDER BY id LIMIT 1""",
                (f"%{tgt['title_pattern']}%", tgt['razina']),
            )
            row = cu.fetchone()
            if not row:
                print(f" ⊘ Not found: {tgt['title_pattern']} ({tgt['razina']})")
                continue

            # Try urls in order until one works.
            full_text = ''
            used_url = None
            for url in tgt['urls']:
                text = fetch_url(url)
                if text and len(text) > MIN_TEXT_LENGTH:
                    full_text = text
                    used_url = url
                    break
                time.sleep(0.5)  # brief pause before trying the next URL

            if not full_text:
                print(f" ✗ {row['title'][:50]} — failed all URLs")
                n_failed += 1
                continue

            # Update DB; an already stored source URL is kept.
            cu.execute(
                """UPDATE pgz_sport.dokumenti
                   SET sadrzaj = %s, izvor_url = COALESCE(izvor_url, %s)
                   WHERE id = %s""",
                (full_text, used_url, row['id']),
            )
            n_updated += 1
            print(f" ✓ {row['title'][:60]} ({len(full_text)} bytes)")
            time.sleep(1)  # rate limit

        print(f"\nUpdated: {n_updated}, Failed: {n_failed}")
    finally:
        # Close the connection even when a query or fetch raises.
        conn.close()


if __name__ == '__main__':
    main()