78 lines
3.3 KiB
Python
78 lines
3.3 KiB
Python
#!/usr/bin/env python3
|
|
from dotenv import load_dotenv
|
|
load_dotenv('/opt/rinet-gpu/.env.master')
|
|
# auto-added by patch_scrapers_with_dotenv.sh
|
|
# sukob_sport_scraper.py — scraper for the Povjerenstvo za sukob interesa (Conflict of Interest Commission) site, filtered to sports officials
|
|
import os, time, hashlib, logging, re, json
|
|
from urllib.parse import urljoin
|
|
import urllib.request
|
|
import psycopg2
|
|
from html import unescape
|
|
|
|
# Module-wide logging: INFO level, every record tagged with "[sukob]".
logging.basicConfig(level=logging.INFO, format='%(asctime)s [sukob] %(message)s')
log = logging.getLogger("sukob")

# libpq connection string for the target database. It is assembled with
# psycopg2's make_dsn() instead of plain string interpolation so that a
# password containing spaces, single quotes or backslashes is quoted and
# escaped correctly rather than corrupting the keyword/value string.
# DB_PASSWORD must be present in the environment; a missing variable raises
# KeyError at import time, exactly as before.
DSN = psycopg2.extensions.make_dsn(
    host="10.10.0.2",
    port=6432,
    dbname="rinet_v3",
    user="rinet",
    password=os.environ['DB_PASSWORD'],
)

# User-Agent header sent with every outgoing HTTP request.
UA = "Mozilla/5.0 (Ri.NET Bot 1.0)"
|
|
|
|
def fetch(url):
    """Download *url* and return its body as text together with the status.

    The request carries the module-level ``UA`` user agent and uses a
    20-second timeout. The body is decoded as UTF-8; undecodable bytes are
    replaced instead of raising.

    Args:
        url: Absolute URL to request.

    Returns:
        A ``(text, status)`` tuple on success. On any failure (malformed
        URL, network error, timeout, an error raised by ``urlopen``) the
        function returns ``(None, 0)`` and logs the cause.
    """
    try:
        req = urllib.request.Request(url, headers={"User-Agent": UA})
        with urllib.request.urlopen(req, timeout=20) as r:
            return r.read().decode('utf-8', errors='replace'), r.status
    except Exception as exc:
        # Deliberately best-effort: one failed page must not abort the run.
        # ``Exception`` (rather than a bare ``except``) lets
        # KeyboardInterrupt and SystemExit propagate so the script can still
        # be stopped, and the warning keeps the failure reason visible.
        log.warning("fetch failed for %s: %s", url, exc)
        return None, 0
|
|
|
|
def _extract_context(html, ime, before=300, after=500):
    """Return the plain-text snippet surrounding *ime* in *html*, or ``None``.

    The markup is reduced to text (script/style bodies dropped, tags replaced
    by spaces, HTML entities decoded, whitespace collapsed) and searched
    case-insensitively for *ime*.

    Args:
        html: Raw HTML of the page.
        ime: Name to look for.
        before: Number of characters of context kept before the match.
        after: Number of characters kept from the start of the match onward.

    Returns:
        The context string, or ``None`` when *ime* does not occur in the
        visible text.
    """
    # Drop <script>/<style> bodies first so their contents do not end up in
    # the extracted text once the tags themselves are stripped.
    text = re.sub(r'<(script|style)[^>]*>.*?</\1>', '', html, flags=re.S | re.I)
    text = re.sub(r'<[^>]+>', ' ', text)
    text = re.sub(r'\s+', ' ', unescape(text)).strip()

    idx = text.lower().find(ime.lower())
    # find() reports "not found" as -1; offset 0 is a legitimate match.
    if idx < 0:
        return None
    return text[max(0, idx - before):idx + after]


def harvest():
    """Cross-check sports officials against the sukobinteresa.hr register.

    Reads distinct names of club presidents, club secretaries and federation
    presidents from ``pgz_sport.klubovi`` / ``pgz_sport.savezi``, queries the
    asset-declaration search page of sukobinteresa.hr for the first 20 of
    them (sleeping 2 seconds after each request), and for every page whose
    visible text contains the name inserts one row into ``dabi.knowledge``.
    Rows are de-duplicated through ``data_hash`` (SHA-256 of ``sukob:<name>``,
    truncated to 32 hex characters) with ``ON CONFLICT DO NOTHING``.

    The connection runs in autocommit mode and is always closed, even when
    an error interrupts the run.

    Returns:
        None. Progress and the final number of inserted facts are logged.
    """
    # Function-scope import (the module imports only ``urljoin`` from
    # urllib.parse); hoisted here so it is not re-executed per iteration.
    from urllib.parse import quote_plus

    conn = psycopg2.connect(DSN)
    conn.autocommit = True
    try:
        with conn.cursor() as cur:
            # Names (not OIBs) of club presidents/secretaries and federation
            # presidents used for the cross-check.
            # NOTE(review): LIMIT without ORDER BY returns an arbitrary
            # subset, and only the first 20 names are used below — confirm
            # this sampling is intended.
            cur.execute("""
                SELECT DISTINCT ime FROM (
                    SELECT predsjednik AS ime FROM pgz_sport.klubovi WHERE predsjednik IS NOT NULL AND length(predsjednik)>5
                    UNION
                    SELECT tajnik FROM pgz_sport.klubovi WHERE tajnik IS NOT NULL AND length(tajnik)>5
                    UNION
                    SELECT predsjednik FROM pgz_sport.savezi WHERE predsjednik IS NOT NULL AND length(predsjednik)>5
                ) t LIMIT 100
            """)
            sport_imena = [r[0].strip() for r in cur.fetchall() if r[0]]
            log.info(f"Sport imena za cross-check: {len(sport_imena)}")

            facts = 0
            for ime in sport_imena[:20]:
                # Search the register by name.
                url = f"https://www.sukobinteresa.hr/hr/imovinsko-stanje/imovinske-kartice?search={quote_plus(ime)}"
                html, status = fetch(url)
                time.sleep(2)  # throttle requests to the remote site
                if not html or status != 200:
                    continue

                # The name is matched against the cleaned, entity-decoded
                # text, so names written with HTML entities in the markup
                # are found as well.
                # NOTE(review): if the results page echoes the search term
                # in its visible text, every query would "match" — verify
                # against the live page.
                ctx = _extract_context(html, ime)
                if ctx is None:
                    continue

                fact = f"FORENSIČKI FLAG: {ime} se nalazi u registru imovinskih kartica Povjerenstva za sukob interesa. Kontekst: {ctx[:600]}"
                fh = hashlib.sha256(f"sukob:{ime}".encode()).hexdigest()[:32]
                try:
                    cur.execute("""INSERT INTO dabi.knowledge
                        (fact, category, source, source_refs, confidence, data_hash, created_at)
                        VALUES (%s,'sukob_interesa_sport','sukob_scraper',%s::jsonb,0.90,%s,now())
                        ON CONFLICT (data_hash) DO NOTHING""",
                        (fact[:2000], json.dumps([{"url": url, "ime": ime}]), fh))
                except psycopg2.Error:
                    # Still best-effort per name, but the failure is now
                    # recorded instead of being silently discarded.
                    log.exception("Insert failed for %s", ime)
                    continue
                # rowcount is 0 when the hash already existed (DO NOTHING).
                facts += cur.rowcount
                log.info(f"✓ Match: {ime}")

            log.info(f"FINAL: {facts} sukob facts")
    finally:
        conn.close()
|
|
|
|
# Script entry point: perform a single harvest pass when the file is executed
# directly; importing the module does not start a run.
if __name__ == "__main__":
    harvest()