Files
pgz-sport/scrapers/sukob_sport_scraper.py_prije_env_deepseek
T

75 lines
3.2 KiB
Python

#!/usr/bin/env python3
# sukob_sport_scraper.py — Povjerenstvo za sukob interesa, filter za sport funkcionere
import os, time, hashlib, logging, re, json
from urllib.parse import urljoin
import urllib.request
import psycopg2
from html import unescape
# Configure the root logger: INFO level, each record prefixed with a timestamp
# and the "[sukob]" tag so this scraper's output is easy to pick out.
logging.basicConfig(level=logging.INFO, format='%(asctime)s [sukob] %(message)s')
log = logging.getLogger("sukob")
# PostgreSQL connection string. The password is read from the DB_PASSWORD
# environment variable at import time, so a missing variable raises KeyError
# before any other code in this module runs.
# NOTE(review): the password is interpolated unquoted; a value containing
# spaces or quotes would presumably break DSN parsing — confirm.
DSN = f"host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password={os.environ['DB_PASSWORD']}"
# Value of the User-Agent header attached to outgoing HTTP requests.
UA = "Mozilla/5.0 (Ri.NET Bot 1.0)"
def fetch(url):
    """Download *url* and return ``(body, status)``.

    The request carries the module's ``UA`` User-Agent header and times out
    after 20 seconds. The body is decoded as UTF-8, with undecodable bytes
    replaced rather than raising.

    Returns:
        ``(text, status)`` on success, where ``status`` is the response's
        ``status`` attribute. ``(None, 0)`` on any failure (invalid URL,
        network error, timeout, HTTP error status, ...). Failures never
        raise; they are logged as warnings so they remain diagnosable.
    """
    try:
        req = urllib.request.Request(url, headers={"User-Agent": UA})
        with urllib.request.urlopen(req, timeout=20) as r:
            return r.read().decode('utf-8', errors='replace'), r.status
    except Exception as exc:
        # Best-effort fetch: keep the (None, 0) contract callers rely on, but
        # do not use a bare ``except`` -- that would also swallow
        # KeyboardInterrupt/SystemExit and make the scraper impossible to stop.
        log.warning("fetch failed for %s: %s", url, exc)
        return None, 0
def _extract_context(html, ime, before=300, after=500):
    """Return the visible text surrounding the first occurrence of *ime*.

    ``<script>`` blocks and all remaining tags are stripped from *html*,
    entities are unescaped and whitespace runs are collapsed to single
    spaces. The name is then searched case-insensitively.

    Returns:
        A slice of the cleaned text covering up to *before* characters ahead
        of the match and *after* characters from its start, or ``None`` when
        the name does not occur in the cleaned text.
    """
    text = re.sub(r'<script[^>]*>.*?</script>', '', html, flags=re.S | re.I)
    text = re.sub(r'<[^>]+>', ' ', text)
    text = re.sub(r'\s+', ' ', unescape(text)).strip()
    idx = text.lower().find(ime.lower())
    # str.find returns -1 when missing; 0 is a valid hit at the very start.
    if idx < 0:
        return None
    return text[max(0, idx - before):idx + after]


def harvest():
    """Cross-check sport officials against the conflict-of-interest registry.

    Steps:
      1. Read up to 100 distinct names (club presidents, club secretaries and
         federation presidents, each longer than 5 characters) from
         ``pgz_sport.klubovi`` and ``pgz_sport.savezi``.
      2. For the first 20 of them, query the asset-declaration search page on
         sukobinteresa.hr, pausing 2 seconds after every request.
      3. When the name appears in the page text, store a fact row in
         ``dabi.knowledge``. The row's ``data_hash`` is derived from the name
         only, so re-runs do not duplicate rows (``ON CONFLICT DO NOTHING``).

    A failed insert is logged and skipped. The cursor and connection are
    closed even if an unexpected error aborts the run.
    """
    # Local import: the file-level import only brings in ``urljoin``.
    from urllib.parse import quote_plus

    conn = psycopg2.connect(DSN)
    conn.autocommit = True
    try:
        cur = conn.cursor()
        try:
            # Names (not OIBs) of presidents/secretaries used for the cross-check.
            cur.execute("""
                SELECT DISTINCT ime FROM (
                    SELECT predsjednik AS ime FROM pgz_sport.klubovi WHERE predsjednik IS NOT NULL AND length(predsjednik)>5
                    UNION
                    SELECT tajnik FROM pgz_sport.klubovi WHERE tajnik IS NOT NULL AND length(tajnik)>5
                    UNION
                    SELECT predsjednik FROM pgz_sport.savezi WHERE predsjednik IS NOT NULL AND length(predsjednik)>5
                ) t LIMIT 100
            """)
            sport_imena = [r[0].strip() for r in cur.fetchall() if r[0]]
            log.info(f"Sport imena za cross-check: {len(sport_imena)}")

            facts = 0
            # Only the first 20 names are searched per run.
            for ime in sport_imena[:20]:
                url = f"https://www.sukobinteresa.hr/hr/imovinsko-stanje/imovinske-kartice?search={quote_plus(ime)}"
                html, status = fetch(url)
                # Throttle: wait after every request, successful or not.
                time.sleep(2)
                if not html or status != 200:
                    continue
                # Cheap pre-filter on the raw HTML before doing the cleanup work.
                if ime.lower() not in html.lower():
                    continue
                ctx = _extract_context(html, ime)
                if ctx is None:
                    # Name was only present in markup, not in the visible text.
                    continue

                fact = f"FORENSIČKI FLAG: {ime} se nalazi u registru imovinskih kartica Povjerenstva za sukob interesa. Kontekst: {ctx[:600]}"
                # Hash depends on the name only, so each person yields one row.
                fh = hashlib.sha256(f"sukob:{ime}".encode()).hexdigest()[:32]
                try:
                    cur.execute("""INSERT INTO dabi.knowledge
                        (fact, category, source, source_refs, confidence, data_hash, created_at)
                        VALUES (%s,'sukob_interesa_sport','sukob_scraper',%s::jsonb,0.90,%s,now())
                        ON CONFLICT (data_hash) DO NOTHING""",
                        (fact[:2000], json.dumps([{"url": url, "ime": ime}]), fh))
                except psycopg2.Error:
                    # Keep going with the remaining names, but leave a trace
                    # instead of silently discarding the database error.
                    log.exception("Insert failed for %s", ime)
                    continue
                # rowcount is 0 when the hash already existed (conflict).
                facts += cur.rowcount
                log.info(f"✓ Match: {ime}")

            log.info(f"FINAL: {facts} sukob facts")
        finally:
            cur.close()
    finally:
        conn.close()
# Script entry point: run one harvest pass when the file is executed directly
# (nothing happens on import).
if __name__ == "__main__":
    harvest()