PGŽ Sport Platform — Round 1+2 baseline (sport2.html + API)
This commit is contained in:
@@ -0,0 +1,74 @@
|
||||
#!/usr/bin/env python3
|
||||
# sukob_sport_scraper.py — Povjerenstvo za sukob interesa, filter za sport funkcionere
|
||||
import os, time, hashlib, logging, re, json
|
||||
from urllib.parse import urljoin
|
||||
import urllib.request
|
||||
import psycopg2
|
||||
from html import unescape
|
||||
|
||||
# Module-wide logging: INFO level, every line tagged with "[sukob]".
logging.basicConfig(level=logging.INFO, format='%(asctime)s [sukob] %(message)s')
log = logging.getLogger("sukob")

# PostgreSQL connection string (libpq keyword/value format).
# It can be supplied through the SUKOB_DSN environment variable so that
# credentials do not have to live in the repository.
# SECURITY NOTE(review): the literal below is kept only as a backward-compatible
# fallback; it embeds a password in source control and should be removed (and
# the credential rotated) once every deployment sets SUKOB_DSN.
DSN = os.environ.get(
    "SUKOB_DSN",
    "host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password=R1net2026!SecureDB#v7",
)

# User-Agent header sent with every outgoing HTTP request.
UA = "Mozilla/5.0 (Ri.NET Bot 1.0)"
|
||||
|
||||
def fetch(url):
    """Download *url* and return its decoded body with the response status.

    Args:
        url: Absolute URL to request. The module-level ``UA`` string is sent
            as the ``User-Agent`` header.

    Returns:
        A ``(text, status)`` tuple. On success ``text`` is the response body
        decoded as UTF-8 (undecodable bytes are replaced) and ``status`` is
        the response status. On any failure the tuple is ``(None, 0)``.
    """
    try:
        req = urllib.request.Request(url, headers={"User-Agent": UA})
        with urllib.request.urlopen(req, timeout=20) as r:
            return r.read().decode('utf-8', errors='replace'), r.status
    except Exception as exc:
        # Best-effort fetch: callers only check for (None, 0), so every
        # ordinary failure (bad URL, HTTP error, timeout, connection reset)
        # is reported and mapped to that sentinel. Unlike a bare ``except``
        # this lets KeyboardInterrupt / SystemExit propagate.
        log.warning("fetch failed for %s: %s", url, exc)
        return None, 0
|
||||
|
||||
def _extract_context(html, ime):
    """Return the plain-text snippet surrounding *ime* in *html*, or None.

    The raw HTML is first checked for the name (case-insensitive); only then
    are ``<script>`` blocks and tags stripped, entities unescaped and
    whitespace collapsed. The snippet spans up to 300 characters before the
    first occurrence of the name and 500 characters from its start.
    """
    needle = ime.lower()
    # An empty needle would "match" every page at offset 0.
    if not needle or needle not in html.lower():
        return None

    text = re.sub(r'<script[^>]*>.*?</script>', '', html, flags=re.S | re.I)
    text = re.sub(r'<[^>]+>', ' ', text)
    text = re.sub(r'\s+', ' ', unescape(text)).strip()

    idx = text.lower().find(needle)
    # find() returns -1 when absent; offset 0 is a legitimate match.
    if idx < 0:
        return None
    return text[max(0, idx - 300):idx + 500]


def harvest():
    """Cross-check sport officials against the sukobinteresa.hr card search.

    Reads up to 100 distinct president/secretary names from
    ``pgz_sport.klubovi`` and ``pgz_sport.savezi``, queries the
    sukobinteresa.hr search page for the first 20 of them and, for every
    result page whose text contains the name, inserts one flagged fact into
    ``dabi.knowledge``. Rows are deduplicated by ``data_hash`` (derived from
    the name), so re-running does not create duplicates.

    Returns:
        None. Progress and the number of inserted facts are logged.
    """
    # Imported here because the module header only imports ``urljoin``.
    from urllib.parse import quote_plus

    conn = psycopg2.connect(DSN)
    # Autocommit: each INSERT stands alone, so one failed statement does not
    # leave the connection in an aborted transaction.
    conn.autocommit = True
    try:
        with conn.cursor() as cur:
            # Names (not OIBs) of club/federation presidents and secretaries
            # used for the cross-check.
            cur.execute("""
                SELECT DISTINCT ime FROM (
                    SELECT predsjednik AS ime FROM pgz_sport.klubovi WHERE predsjednik IS NOT NULL AND length(predsjednik)>5
                    UNION
                    SELECT tajnik FROM pgz_sport.klubovi WHERE tajnik IS NOT NULL AND length(tajnik)>5
                    UNION
                    SELECT predsjednik FROM pgz_sport.savezi WHERE predsjednik IS NOT NULL AND length(predsjednik)>5
                ) t LIMIT 100
            """)
            # Drop values that are blank after stripping; they cannot be searched.
            sport_imena = [r[0].strip() for r in cur.fetchall() if r[0] and r[0].strip()]
            log.info(f"Sport imena za cross-check: {len(sport_imena)}")

            facts = 0
            # Only the first 20 names are searched per run.
            for ime in sport_imena[:20]:
                url = f"https://www.sukobinteresa.hr/hr/imovinsko-stanje/imovinske-kartice?search={quote_plus(ime)}"
                html, status = fetch(url)
                time.sleep(2)  # pause between requests to the remote site
                if not html or status != 200:
                    continue

                ctx = _extract_context(html, ime)
                if ctx is None:
                    continue

                fact = f"FORENSIČKI FLAG: {ime} se nalazi u registru imovinskih kartica Povjerenstva za sukob interesa. Kontekst: {ctx[:600]}"
                # Hash depends on the name only: one fact per person.
                fh = hashlib.sha256(f"sukob:{ime}".encode()).hexdigest()[:32]
                try:
                    cur.execute("""INSERT INTO dabi.knowledge
                        (fact, category, source, source_refs, confidence, data_hash, created_at)
                        VALUES (%s,'sukob_interesa_sport','sukob_scraper',%s::jsonb,0.90,%s,now())
                        ON CONFLICT (data_hash) DO NOTHING""",
                        (fact[:2000], json.dumps([{"url": url, "ime": ime}]), fh))
                except psycopg2.Error as exc:
                    # Keep going with the remaining names, but leave a trace.
                    log.warning("insert failed for %s: %s", ime, exc)
                    continue
                # rowcount is 0 when ON CONFLICT skipped an existing fact.
                facts += cur.rowcount
                log.info(f"✓ Match: {ime}")

            log.info(f"FINAL: {facts} sukob facts")
    finally:
        # Always release the connection, even if a query or fetch raised.
        conn.close()
|
||||
|
||||
# Script entry point: run a single harvest pass when executed directly
# (nothing happens on import).
if __name__ == "__main__":
    harvest()
|
||||
Reference in New Issue
Block a user