feat: /api/v2/analiza/* endpoints - sport analytics backend

2026-05-16 00:28:12 +02:00
parent 7ca5d7d94e
commit aca5051418
1355 changed files with 321891 additions and 4128 deletions
@@ -0,0 +1,158 @@
+#!/usr/bin/env python3
+"""Fetch real legal texts from narodne-novine.nn.hr and key sources.
+Update sadrzaj column for accurate RAG."""
+import os
+import psycopg2
+import psycopg2.extras
+import requests
+import re
+import time
+
+# Keyword arguments passed to psycopg2.connect() in main().
+# The password is read from the DB_PASSWORD environment variable when this
+# module is loaded, so a missing variable raises KeyError before any work starts.
+DB = dict(host='localhost', port=5432, dbname='rinet_v3', user='rinet', password=os.environ["DB_PASSWORD"])
+
+# Top legal documents to fetch (rows are matched to DB ids via the title pattern).
+# Each entry has:
+#   title_pattern - substring matched case-insensitively (ILIKE) against
+#                   pgz_sport.dokumenti.title in main()
+#   razina        - value the row's razina column must equal ('RH' or 'EU' here)
+#   urls          - candidate source pages, tried in order until one yields
+#                   usable text
+TARGETS = [
+    {
+        'title_pattern': 'Zakon o sportu',
+        'razina': 'RH',
+        'urls': [
+            'https://narodne-novine.nn.hr/clanci/sluzbeni/2022_12_141_2151.html',
+            'https://narodne-novine.nn.hr/clanci/sluzbeni/2024_10_122_2087.html',
+        ],
+    },
+    {
+        'title_pattern': 'Zakon o udrugama',
+        'razina': 'RH',
+        'urls': ['https://narodne-novine.nn.hr/clanci/sluzbeni/2014_06_74_1390.html'],
+    },
+    {
+        'title_pattern': 'Zakon o sprečavanju dopinga',
+        'razina': 'RH',
+        'urls': ['https://narodne-novine.nn.hr/clanci/sluzbeni/2017_07_70_1671.html'],
+    },
+    {
+        'title_pattern': 'Pravilnik o stručnim poslovima',
+        'razina': 'RH',
+        'urls': ['https://narodne-novine.nn.hr/clanci/sluzbeni/2023_08_89_1334.html'],
+    },
+    {
+        'title_pattern': 'Zakon o lovstvu',
+        'razina': 'RH',
+        'urls': ['https://narodne-novine.nn.hr/clanci/sluzbeni/full/2018_11_99_1955.html'],
+    },
+    {
+        'title_pattern': 'Zakon o volonterstvu',
+        'razina': 'RH',
+        'urls': ['https://narodne-novine.nn.hr/clanci/sluzbeni/2007_06_58_1813.html'],
+    },
+    {
+        'title_pattern': 'Zakon o pravu na pristup informacijama',
+        'razina': 'RH',
+        'urls': ['https://narodne-novine.nn.hr/clanci/sluzbeni/2013_02_25_403.html'],
+    },
+    {
+        'title_pattern': 'Zakon o sprječavanju nereda',
+        'razina': 'RH',
+        'urls': ['https://narodne-novine.nn.hr/clanci/sluzbeni/2003_07_117_1631.html'],
+    },
+    {
+        'title_pattern': 'GDPR',
+        'razina': 'EU',
+        'urls': ['https://eur-lex.europa.eu/legal-content/HR/TXT/HTML/?uri=CELEX:32016R0679'],
+    },
+]
+
+# HTTP request headers sent by fetch_url() with every page download:
+# a browser-like User-Agent and an Accept header asking for HTML/XHTML.
+HEADERS = {
+    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 PGZSport/1.0',
+    'Accept': 'text/html,application/xhtml+xml',
+}
+
+def clean_html(html):
+    """Convert an HTML fragment to plain text.
+
+    Script/style elements and comments are dropped, ``<br>`` and the closing
+    tags of block elements (p, div, h1-h6, li, tr) become newlines, all other
+    tags are removed, character references are decoded, and whitespace is
+    collapsed (runs of spaces/tabs to one space, blank-line runs to one
+    blank line).
+
+    Args:
+        html: HTML source as a string. ``None`` or ``''`` yields ``''``.
+
+    Returns:
+        The stripped plain-text content.
+    """
+    # Imported by name because the parameter shadows the ``html`` module.
+    from html import unescape
+
+    if not html:
+        return ''
+
+    # Remove non-content elements first so their bodies never reach the text.
+    text = re.sub(r'<script[^>]*>.*?</script\s*>', '', html, flags=re.DOTALL | re.I)
+    text = re.sub(r'<style[^>]*>.*?</style\s*>', '', text, flags=re.DOTALL | re.I)
+    text = re.sub(r'<!--.*?-->', '', text, flags=re.DOTALL)
+    # Line breaks and block-element ends become newlines.
+    text = re.sub(r'<br\s*/?>', '\n', text, flags=re.I)
+    text = re.sub(r'</(p|div|h[1-6]|li|tr)\s*>', '\n', text, flags=re.I)
+    # Strip remaining tags BEFORE decoding entities, so a literal "&lt;b&gt;"
+    # in the text is not mistaken for markup afterwards.
+    text = re.sub(r'<[^>]+>', '', text)
+    # Decode every named/numeric character reference in a single pass
+    # (e.g. &scaron;, &#269;); a single pass also keeps "&amp;lt;" as "&lt;"
+    # instead of double-decoding it to "<".
+    # unescape() maps &nbsp; to U+00A0; turn that into a plain space so the
+    # whitespace collapsing below treats it like any other space.
+    text = unescape(text).replace('\xa0', ' ')
+    # Collapse whitespace.
+    text = re.sub(r'[ \t]+', ' ', text)
+    text = re.sub(r'\n\s*\n+', '\n\n', text)
+    return text.strip()
+
+def fetch_url(url, max_size=200000):
+    """Download a legal-text page and return its body as plain text.
+
+    The page is requested with the module-level HEADERS and a 20 s timeout.
+    If the HTML contains a ``<div>`` whose class includes ``clanak`` and that
+    is immediately followed by ``<div class="metapodaci"``, only that div's
+    content is kept; otherwise the whole page is used with ``<header>``,
+    ``<footer>`` and ``<nav>`` elements removed. The result is passed through
+    clean_html().
+
+    Args:
+        url: Page to fetch.
+        max_size: Maximum number of characters of cleaned text to return.
+
+    Returns:
+        The cleaned text truncated to ``max_size`` characters, or ``None``
+        when the request fails or the status code is not 200.
+    """
+    # Only the network call is guarded; parsing errors below should surface.
+    try:
+        r = requests.get(url, headers=HEADERS, timeout=20)
+    except requests.RequestException as e:
+        print(f"  err fetch {url[:80]}: {e}")
+        return None
+    if r.status_code != 200:
+        print(f"  err fetch {url[:80]}: HTTP {r.status_code}")
+        return None
+
+    # requests falls back to ISO-8859-1 for text/* responses that declare no
+    # charset, which mangles UTF-8 diacritics; detect from the body instead.
+    if 'charset' not in r.headers.get('Content-Type', '').lower():
+        r.encoding = r.apparent_encoding or 'utf-8'
+    text = r.text
+
+    # NN.hr structure: extract main article body
+    m = re.search(
+        r'<div[^>]*class="[^"]*clanak[^"]*"[^>]*>(.*?)</div>\s*<div[^>]*class="metapodaci"',
+        text,
+        re.DOTALL | re.I,
+    )
+    if m:
+        text = m.group(1)
+    else:
+        # Fallback: remove navigation, headers, footers
+        text = re.sub(r'<header.*?</header>', '', text, flags=re.DOTALL | re.I)
+        text = re.sub(r'<footer.*?</footer>', '', text, flags=re.DOTALL | re.I)
+        text = re.sub(r'<nav.*?</nav>', '', text, flags=re.DOTALL | re.I)
+    return clean_html(text)[:max_size]
+
+def main():
+    """Refresh the ``sadrzaj`` column for every document listed in TARGETS.
+
+    For each target, the lowest-id row of ``pgz_sport.dokumenti`` whose title
+    contains ``title_pattern`` (case-insensitive) and whose ``razina`` matches
+    is looked up. The target's URLs are then fetched in order until one
+    returns more than ``min_chars`` characters of text, and that text is
+    written to ``sadrzaj``. ``izvor_url`` is only filled in when it is
+    currently NULL. Progress lines and a final summary are printed.
+
+    The connection runs in autocommit mode, so each UPDATE is committed
+    immediately and rows already updated stay updated if a later target fails.
+    """
+    # Texts at or below this length are treated as a failed fetch
+    # (presumably error or placeholder pages; confirm the threshold).
+    min_chars = 1000
+
+    conn = psycopg2.connect(**DB)
+    conn.autocommit = True
+    n_updated = 0
+    n_failed = 0
+
+    # Close the connection even when a query or fetch raises.
+    try:
+        cu = conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor)
+
+        for tgt in TARGETS:
+            cu.execute("""SELECT id, title FROM pgz_sport.dokumenti
+                          WHERE title ILIKE %s AND razina = %s
+                          ORDER BY id LIMIT 1""",
+                       (f"%{tgt['title_pattern']}%", tgt['razina']))
+            row = cu.fetchone()
+            if not row:
+                print(f"  ⊘ Not found: {tgt['title_pattern']} ({tgt['razina']})")
+                continue
+
+            # Try urls in order until one works
+            full_text = ''
+            used_url = None
+            for url in tgt['urls']:
+                text = fetch_url(url)
+                if text and len(text) > min_chars:
+                    full_text = text
+                    used_url = url
+                    break
+                time.sleep(0.5)  # brief pause before the next request
+
+            if not full_text:
+                print(f"  ✗ {row['title'][:50]} — failed all URLs")
+                n_failed += 1
+                continue
+
+            # Update DB; COALESCE keeps an izvor_url that is already set.
+            cu.execute("""UPDATE pgz_sport.dokumenti
+                          SET sadrzaj = %s, izvor_url = COALESCE(izvor_url, %s)
+                          WHERE id = %s""",
+                       (full_text, used_url, row['id']))
+            n_updated += 1
+            # len() of a str counts characters, not bytes.
+            print(f"  ✓ {row['title'][:60]} ({len(full_text)} chars)")
+            time.sleep(1)  # rate limit
+    finally:
+        conn.close()
+
+    print(f"\nUpdated: {n_updated}, Failed: {n_failed}")
+
+# Run the update only when this file is executed as a script, not on import.
+if __name__ == '__main__':
+    main()