#!/usr/bin/env python3
# sukob_sport_scraper.py — scraper for the Croatian Conflict-of-Interest
# Commission (Povjerenstvo za sukob interesa), filtered for sports officials.
"""Cross-check sports officials against the public asset-declaration register.

Names of club/federation presidents and secretaries are read from the
``pgz_sport`` schema, each name is searched on sukobinteresa.hr, and every
hit is stored as one row in ``dabi.knowledge``.
"""
import hashlib
import http.client
import json
import logging
import os
import re
import time
import urllib.request
from html import unescape
from urllib.parse import quote_plus, urljoin

import psycopg2

logging.basicConfig(level=logging.INFO, format='%(asctime)s [sukob] %(message)s')
log = logging.getLogger("sukob")

# NOTE(review): the fallback below embeds database credentials in source.
# Set RINET_DSN in the environment and remove the literal once deployments
# have been migrated; the literal is kept only so existing runs keep working.
DSN = os.environ.get(
    "RINET_DSN",
    "host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password=R1net2026!SecureDB#v7",
)
UA = "Mozilla/5.0 (Ri.NET Bot 1.0)"

SEARCH_URL = "https://www.sukobinteresa.hr/hr/imovinsko-stanje/imovinske-kartice?search="
MAX_NAMES = 20        # default number of names searched per run
REQUEST_DELAY_S = 2   # pause after every request to stay polite to the site

# Compiled once; they are applied to every downloaded page.
_SCRIPT_STYLE_RE = re.compile(r'<(script|style)\b[^>]*>.*?</\1\s*>', re.S | re.I)
_TAG_RE = re.compile(r'<[^>]+>')
_WS_RE = re.compile(r'\s+')

# Names (not OIBs) of presidents and secretaries used for the cross-check.
# NOTE(review): LIMIT without ORDER BY returns an arbitrary subset of names.
_NAMES_SQL = """
    SELECT DISTINCT ime FROM (
        SELECT predsjednik AS ime FROM pgz_sport.klubovi
            WHERE predsjednik IS NOT NULL AND length(predsjednik)>5
        UNION
        SELECT tajnik FROM pgz_sport.klubovi
            WHERE tajnik IS NOT NULL AND length(tajnik)>5
        UNION
        SELECT predsjednik FROM pgz_sport.savezi
            WHERE predsjednik IS NOT NULL AND length(predsjednik)>5
    ) t LIMIT 100
"""

_INSERT_SQL = """INSERT INTO dabi.knowledge
    (fact, category, source, source_refs, confidence, data_hash, created_at)
    VALUES (%s,'sukob_interesa_sport','sukob_scraper',%s::jsonb,0.90,%s,now())
    ON CONFLICT (data_hash) DO NOTHING"""


def fetch(url):
    """Download *url* and return ``(body, status)``.

    The body is decoded as UTF-8 with undecodable bytes replaced.  Any
    network, HTTP or malformed-URL failure is logged and reported as
    ``(None, 0)`` so that callers can simply skip the page.
    """
    try:
        req = urllib.request.Request(url, headers={"User-Agent": UA})
        with urllib.request.urlopen(req, timeout=20) as r:
            return r.read().decode('utf-8', errors='replace'), r.status
    except (OSError, ValueError, http.client.HTTPException) as exc:
        # URLError/HTTPError/timeouts are OSError subclasses; ValueError
        # covers malformed URLs; HTTPException covers broken responses.
        log.warning("fetch failed for %s: %s", url, exc)
        return None, 0


def _extract_context(html, ime):
    """Return the plain-text window around the first mention of *ime*.

    Script/style blocks and all remaining tags are removed, entities are
    unescaped and whitespace is collapsed.  The window spans up to 300
    characters before and 500 after the start of the match.  Returns
    ``None`` when the name does not occur in the cleaned text.
    """
    text = _SCRIPT_STYLE_RE.sub('', html)
    text = _TAG_RE.sub(' ', text)
    text = _WS_RE.sub(' ', unescape(text)).strip()
    idx = text.lower().find(ime.lower())
    if idx < 0:  # str.find returns -1 on a miss; index 0 is a valid hit
        return None
    return text[max(0, idx - 300):idx + 500]


def _scan_names(cur, names):
    """Search every name on the register and store hits; return rows inserted."""
    facts = 0
    for ime in names:
        url = SEARCH_URL + quote_plus(ime)
        html, status = fetch(url)
        time.sleep(REQUEST_DELAY_S)
        if not html or status != 200:
            continue
        # Cheap pre-check on the raw page before doing the cleanup work.
        if ime.lower() not in html.lower():
            continue
        ctx = _extract_context(html, ime)
        if ctx is None:
            continue
        fact = f"FORENSIČKI FLAG: {ime} se nalazi u registru imovinskih kartica Povjerenstva za sukob interesa. Kontekst: {ctx[:600]}"
        # One fact per name: the hash is the conflict key of the INSERT.
        fh = hashlib.sha256(f"sukob:{ime}".encode()).hexdigest()[:32]
        try:
            cur.execute(
                _INSERT_SQL,
                (fact[:2000], json.dumps([{"url": url, "ime": ime}]), fh),
            )
        except psycopg2.Error:
            # Best effort: one failed insert must not abort the whole run.
            log.exception("insert failed for %s", ime)
            continue
        facts += cur.rowcount
        log.info(f"✓ Match: {ime}")
    return facts


def harvest(max_names=MAX_NAMES):
    """Run one scrape: load names, search the register, store the hits.

    Args:
        max_names: maximum number of names searched in this run.

    Raises:
        psycopg2.Error: if connecting or the initial name query fails.
    """
    conn = psycopg2.connect(DSN)
    try:
        conn.autocommit = True
        cur = conn.cursor()
        try:
            cur.execute(_NAMES_SQL)
            stripped = (r[0].strip() for r in cur.fetchall() if r[0])
            # Drop blank names: an empty string would match every page.
            sport_imena = [ime for ime in stripped if ime]
            log.info(f"Sport imena za cross-check: {len(sport_imena)}")

            facts = _scan_names(cur, sport_imena[:max_names])
            log.info(f"FINAL: {facts} sukob facts")
        finally:
            cur.close()
    finally:
        conn.close()


if __name__ == "__main__":
    harvest()