feat: /api/v2/analiza/* endpoints - sport analytics backend
This commit is contained in:
+158
@@ -0,0 +1,158 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Fetch real legal texts from narodne-novine.nn.hr and key sources.
|
||||
Update sadrzaj column for accurate RAG."""
|
||||
import os
|
||||
import psycopg2
|
||||
import psycopg2.extras
|
||||
import requests
|
||||
import re
|
||||
import time
|
||||
|
||||
# PostgreSQL connection keyword arguments, unpacked into psycopg2.connect().
# The password is read from the DB_PASSWORD environment variable; a missing
# variable raises KeyError as soon as the module is loaded.
DB = {
    'host': 'localhost',
    'port': 5432,
    'dbname': 'rinet_v3',
    'user': 'rinet',
    'password': os.environ["DB_PASSWORD"],
}
|
||||
|
||||
# Top legal documents to fetch.  Each entry is matched to a database row by a
# title pattern plus its ``razina`` value (no hard-coded row ids); ``urls``
# lists candidate sources in the order they should be tried.
TARGETS = [
    {
        'title_pattern': 'Zakon o sportu',
        'razina': 'RH',
        'urls': [
            'https://narodne-novine.nn.hr/clanci/sluzbeni/2022_12_141_2151.html',
            'https://narodne-novine.nn.hr/clanci/sluzbeni/2024_10_122_2087.html',
        ],
    },
    {
        'title_pattern': 'Zakon o udrugama',
        'razina': 'RH',
        'urls': ['https://narodne-novine.nn.hr/clanci/sluzbeni/2014_06_74_1390.html'],
    },
    {
        'title_pattern': 'Zakon o sprečavanju dopinga',
        'razina': 'RH',
        'urls': ['https://narodne-novine.nn.hr/clanci/sluzbeni/2017_07_70_1671.html'],
    },
    {
        'title_pattern': 'Pravilnik o stručnim poslovima',
        'razina': 'RH',
        'urls': ['https://narodne-novine.nn.hr/clanci/sluzbeni/2023_08_89_1334.html'],
    },
    {
        'title_pattern': 'Zakon o lovstvu',
        'razina': 'RH',
        'urls': ['https://narodne-novine.nn.hr/clanci/sluzbeni/full/2018_11_99_1955.html'],
    },
    {
        'title_pattern': 'Zakon o volonterstvu',
        'razina': 'RH',
        'urls': ['https://narodne-novine.nn.hr/clanci/sluzbeni/2007_06_58_1813.html'],
    },
    {
        'title_pattern': 'Zakon o pravu na pristup informacijama',
        'razina': 'RH',
        'urls': ['https://narodne-novine.nn.hr/clanci/sluzbeni/2013_02_25_403.html'],
    },
    {
        'title_pattern': 'Zakon o sprječavanju nereda',
        'razina': 'RH',
        'urls': ['https://narodne-novine.nn.hr/clanci/sluzbeni/2003_07_117_1631.html'],
    },
    {
        'title_pattern': 'GDPR',
        'razina': 'EU',
        'urls': ['https://eur-lex.europa.eu/legal-content/HR/TXT/HTML/?uri=CELEX:32016R0679'],
    },
]
|
||||
|
||||
# HTTP request headers sent with every fetch: a browser-like User-Agent that
# also carries the PGZSport/1.0 product token, and an Accept header asking
# for HTML/XHTML.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 PGZSport/1.0',
    'Accept': 'text/html,application/xhtml+xml',
}
|
||||
|
||||
def clean_html(html):
    """Strip HTML markup and return readable plain text.

    Processing order:
      1. ``<script>``/``<style>`` elements and HTML comments are removed
         together with their contents.
      2. ``<br>`` and the closing tags of block-level elements (``p``, ``div``,
         ``h1``-``h6``, ``li``, ``tr``) are turned into newlines.
      3. All remaining tags are dropped.
      4. Character references (named and numeric) are decoded exactly once;
         non-breaking spaces become ordinary spaces.
      5. Runs of spaces/tabs collapse to one space and runs of blank lines
         collapse to a single blank line.

    Args:
        html: Raw HTML markup.  Empty or ``None`` input yields ``''``.

    Returns:
        The cleaned text with leading/trailing whitespace removed.
    """
    # Function-scope import: the parameter name shadows the stdlib module.
    from html import unescape

    if not html:
        return ''

    # Remove scripts, styles and comments including their bodies.
    html = re.sub(r'<script[^>]*>.*?</script>', '', html, flags=re.DOTALL | re.I)
    html = re.sub(r'<style[^>]*>.*?</style>', '', html, flags=re.DOTALL | re.I)
    html = re.sub(r'<!--.*?-->', '', html, flags=re.DOTALL)

    # Preserve the visual line structure before the tags disappear.
    html = re.sub(r'<br\s*/?>', '\n', html, flags=re.I)
    html = re.sub(r'</(p|div|h[1-6]|li|tr)\s*>', '\n', html, flags=re.I)

    # Strip remaining tags.
    html = re.sub(r'<[^>]+>', '', html)

    # Decode entities after tag stripping so that an escaped "&lt;b&gt;" stays
    # in the text as a literal "<b>".  unescape() decodes in a single pass, so
    # "&amp;lt;" becomes "&lt;" rather than "<".  NBSP is mapped to a regular
    # space so the whitespace collapsing below treats it like any other space.
    html = unescape(html).replace('\xa0', ' ')

    # Collapse whitespace.
    html = re.sub(r'[ \t]+', ' ', html)
    html = re.sub(r'\n\s*\n+', '\n\n', html)
    return html.strip()
|
||||
|
||||
def fetch_url(url, max_size=200000):
    """Download *url* and return its text content with markup removed.

    Args:
        url: Address of the HTML page to download.
        max_size: Maximum number of characters of cleaned text to return.

    Returns:
        The cleaned text truncated to ``max_size`` characters, or ``None``
        when the request fails or the server answers with a status other
        than 200.
    """
    # Only the network call can raise a request error; keep the try minimal.
    try:
        r = requests.get(url, headers=HEADERS, timeout=20)
    except requests.RequestException as e:
        print(f" err fetch {url[:80]}: {e}")
        return None

    if r.status_code != 200:
        print(f" err fetch {url[:80]}: HTTP {r.status_code}")
        return None

    # requests assumes ISO-8859-1 for text/* responses that declare no
    # charset.  That charset cannot represent Croatian diacritics, so prefer
    # the encoding detected from the body in that case.
    if r.encoding is None or r.encoding.lower() == 'iso-8859-1':
        r.encoding = r.apparent_encoding
    text = r.text

    # NN.hr structure: extract the main article body, i.e. the "clanak" div
    # that is immediately followed by the "metapodaci" div.
    m = re.search(
        r'<div[^>]*class="[^"]*clanak[^"]*"[^>]*>(.*?)</div>\s*<div[^>]*class="metapodaci"',
        text,
        re.DOTALL | re.I,
    )
    if m:
        text = m.group(1)
    else:
        # Fallback: remove navigation, headers and footers from the full page.
        text = re.sub(r'<header.*?</header>', '', text, flags=re.DOTALL | re.I)
        text = re.sub(r'<footer.*?</footer>', '', text, flags=re.DOTALL | re.I)
        text = re.sub(r'<nav.*?</nav>', '', text, flags=re.DOTALL | re.I)

    clean = clean_html(text)
    return clean[:max_size]
|
||||
|
||||
def main():
    """Refresh the stored text of every document listed in ``TARGETS``.

    For each target, the first row of ``pgz_sport.dokumenti`` (lowest id)
    whose title matches the pattern case-insensitively and whose ``razina``
    matches is selected.  The target's URLs are tried in order until one
    yields more than 1000 characters of cleaned text, which is written to
    ``sadrzaj``.  ``izvor_url`` is set to the URL that worked only when the
    column is currently NULL.  Progress and a final summary are printed.
    """
    conn = psycopg2.connect(**DB)
    try:
        # Autocommit: every UPDATE is persisted immediately, so a failure on
        # a later target does not discard earlier successful updates.
        conn.autocommit = True
        with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cu:
            n_updated = 0
            n_failed = 0

            for tgt in TARGETS:
                cu.execute(
                    """SELECT id, title FROM pgz_sport.dokumenti
                       WHERE title ILIKE %s AND razina = %s
                       ORDER BY id LIMIT 1""",
                    (f"%{tgt['title_pattern']}%", tgt['razina']),
                )
                row = cu.fetchone()
                if not row:
                    # Not counted as a failure: there is no row to update.
                    print(f" ⊘ Not found: {tgt['title_pattern']} ({tgt['razina']})")
                    continue

                # Try urls in order until one works.
                full_text = ''
                used_url = None
                for url in tgt['urls']:
                    text = fetch_url(url)
                    # Very short results are treated as failed extractions.
                    if text and len(text) > 1000:
                        full_text = text
                        used_url = url
                        break
                    time.sleep(0.5)  # brief pause before the next candidate

                if not full_text:
                    print(f" ✗ {row['title'][:50]} — failed all URLs")
                    n_failed += 1
                    continue

                # Update DB; COALESCE keeps an already recorded source URL.
                cu.execute(
                    """UPDATE pgz_sport.dokumenti
                       SET sadrzaj = %s, izvor_url = COALESCE(izvor_url, %s)
                       WHERE id = %s""",
                    (full_text, used_url, row['id']),
                )
                n_updated += 1
                # len() of a str counts characters, not bytes.
                print(f" ✓ {row['title'][:60]} ({len(full_text)} chars)")
                time.sleep(1)  # rate limit

            print(f"\nUpdated: {n_updated}, Failed: {n_failed}")
    finally:
        # Release the connection even when a query or fetch raises.
        conn.close()
|
||||
|
||||
# Run the update only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
|
||||
Reference in New Issue
Block a user