PGŽ Sport Platform — Round 1+2 baseline (sport2.html + API)
This commit is contained in:
@@ -0,0 +1,145 @@
|
||||
#!/usr/bin/env python3
|
||||
# ═══════════════════════════════════════════════════════════════════
|
||||
# Fajl: rss_rijeka_scraper.py | v1.0.0 | 04.05.2026
|
||||
# Autor: Damir Radulić <dradulic@outlook.com>
|
||||
# Lokacija: /opt/pgz-sport/scrapers/rss_rijeka_scraper.py
|
||||
# Svrha: RSS / Zajednica sportskih udruga grada Rijeke deep scraper
|
||||
# Cilj: financijski izvještaji, klubovi, sportaši, dokumenti
|
||||
# ═══════════════════════════════════════════════════════════════════
|
||||
"""RSS Rijeka scraper — klubovi, financiranje, dokumenti."""
|
||||
import os, sys, time, hashlib, logging, re, json
|
||||
from urllib.parse import urljoin, urlparse
|
||||
import urllib.request
|
||||
import psycopg2
|
||||
from psycopg2.extras import execute_batch
|
||||
from html import unescape
|
||||
|
||||
# Root logging setup for the scraper; every record is tagged "[rss]".
logging.basicConfig(level=logging.INFO, format='%(asctime)s [rss] %(message)s')
log = logging.getLogger("rss")

# PostgreSQL connection string handed to psycopg2.connect().
# The PGZ_SPORT_DSN environment variable, when set, takes precedence.
# NOTE(review): the fallback below embeds a credential in source control. It is
# kept only so existing deployments keep working; rotate it and supply the DSN
# through the environment instead.
DSN = os.environ.get(
    "PGZ_SPORT_DSN",
    "host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password=R1net2026!SecureDB#v7",
)

# User-Agent header sent with every HTTP request (names the bot and a contact).
UA = "Mozilla/5.0 (Ri.NET Civic Intelligence Bot 1.0; contact: dradulic@outlook.com)"

# Seed URLs for the crawl. These are candidate locations to probe; any that
# cannot be fetched are simply skipped by the crawler.
RSS_DOMAINS = [
    "https://rijeckisportskisavez.hr",
    "https://www.zsus-rijeka.hr",
    "https://sport.rijeka.hr",
    "https://rss-rijeka.hr",
    "https://www.rijeka.hr/teme-za-gradane/sport-i-rekreacija/",
]
|
||||
|
||||
def fetch(url, retries=3):
    """Download *url* and return ``(text, status)``, retrying on failure.

    Args:
        url: Address to request; sent with the module-level ``UA`` User-Agent.
        retries: Maximum number of attempts. Zero or a negative value means
            no request is made at all.

    Returns:
        ``(content, status)`` where *content* is the decoded response body and
        *status* is the response's ``status`` attribute, or ``(None, 0)`` when
        every attempt failed.

    The body is decoded with the charset declared in the response headers,
    falling back to UTF-8 (undecodable bytes are replaced). After a successful
    download the function sleeps 2 seconds to throttle the crawl; after a
    failed attempt it sleeps ``3 * attempt`` seconds before retrying.
    """
    for attempt in range(1, retries + 1):
        try:
            req = urllib.request.Request(url, headers={"User-Agent": UA})
            with urllib.request.urlopen(req, timeout=20) as resp:
                charset = resp.headers.get_content_charset() or "utf-8"
                raw = resp.read()
                status = resp.status
            try:
                content = raw.decode(charset, errors="replace")
            except LookupError:
                # The server announced a codec Python does not know.
                content = raw.decode("utf-8", errors="replace")
        except Exception as e:
            # Best-effort fetch: any failure (network, HTTP error, bad URL)
            # is logged and retried rather than aborting the crawl.
            log.warning(f"Fetch fail {attempt}: {url} {e}")
            if attempt < retries:
                # Back off only when another attempt will actually follow.
                time.sleep(3 * attempt)
        else:
            time.sleep(2.0)
            return content, status
    return None, 0
|
||||
|
||||
def find_links(html, base_url):
    """Extract crawlable links from *html* that stay on the target sites.

    Args:
        html: Page markup to scan for ``href="..."`` attributes. ``None`` or
            an empty string yields an empty list.
        base_url: URL of the page, used to resolve relative hrefs.

    Returns:
        A list of absolute http(s) URLs, without ``#fragment`` parts, with
        duplicates removed, in order of first appearance. Only URLs whose
        host contains one of the known site markers are kept.

    Hrefs that cannot be parsed as URLs are skipped instead of raising.
    """
    if not html:
        return []

    # Substring markers a hostname must contain to count as "internal".
    host_markers = ('rijeckisportskisavez', 'zsus-rijeka', 'rijeka.hr', 'rss-rijeka')
    skipped_prefixes = ('#', 'mailto:', 'javascript:')

    # dict keys give de-duplication while preserving first-seen order, so the
    # caller's slice of the result is deterministic across runs.
    found = {}
    for m in re.finditer(r'href=["\']([^"\']+)["\']', html, re.I):
        href = m.group(1)
        if href.startswith(skipped_prefixes):
            continue
        try:
            # Both urljoin() and urlparse() raise ValueError on malformed
            # authorities such as "http://[bad".
            parsed = urlparse(urljoin(base_url, href))
            host = parsed.hostname or ""
        except ValueError:
            continue
        if parsed.scheme not in ("http", "https"):
            continue
        if any(marker in host for marker in host_markers):
            # Drop the fragment: "/page#a" and "/page#b" are the same document.
            found[parsed._replace(fragment="").geturl()] = None
    return list(found)
|
||||
|
||||
def extract_text(html):
    """Reduce an HTML document to plain, whitespace-normalised text.

    ``<script>`` and ``<style>`` elements are removed together with their
    contents, every remaining tag is replaced by a space, HTML entities are
    decoded, and runs of whitespace are collapsed to single spaces. A falsy
    *html* (``None`` or ``""``) yields an empty string.
    """
    if not html:
        return ""

    block_flags = re.S | re.I
    remaining = html
    # Elements whose contents are code or styling, not readable text.
    for element_pattern in (r'<script[^>]*>.*?</script>', r'<style[^>]*>.*?</style>'):
        remaining = re.sub(element_pattern, '', remaining, flags=block_flags)

    without_tags = re.sub(r'<[^>]+>', ' ', remaining)
    decoded = unescape(without_tags)
    return re.sub(r'\s+', ' ', decoded).strip()
|
||||
|
||||
def extract_oibs(text):
    """Return every standalone 11-digit number found in *text*.

    These are OIB candidates only: the digits are not checksum-validated.
    Matches are returned as strings in order of appearance, duplicates
    included. Longer digit runs are not split into 11-digit pieces.
    """
    oib_pattern = re.compile(r'\b(\d{11})\b')
    return [hit.group(1) for hit in oib_pattern.finditer(text)]
|
||||
|
||||
def extract_money(text):
    """Return the numeric part of currency amounts found in *text*.

    An amount is matched only when it has at least one thousands group
    (``.`` or ``,`` followed by three digits), optionally followed by a
    two-digit decimal part, and is then followed by one of the markers
    ``EUR``, ``€``, ``kn`` or ``HRK``. The marker itself is not returned.
    """
    amount_pattern = r'(\d{1,3}(?:[.,]\d{3})+(?:[.,]\d{2})?)\s*(?:EUR|€|kn|HRK)'
    amounts = []
    for hit in re.finditer(amount_pattern, text):
        amounts.append(hit.group(1))
    return amounts
|
||||
|
||||
def _store_document(cur, url, title, text):
    """Insert one scraped page into pgz_sport.dokumenti; return rows added."""
    sha1 = hashlib.sha1(text.encode()).hexdigest()
    try:
        cur.execute("""
            INSERT INTO pgz_sport.dokumenti
            (url, fname, title, sadrzaj, vrsta, izvor_url, scraped_at, sha1, organizacija)
            VALUES (%s, %s, %s, %s, %s, %s, now(), %s, %s)
            ON CONFLICT DO NOTHING
        """, (url, url.split('/')[-1][:100], title[:200], text[:50000], 'web', url, sha1, 'RSS Rijeka'))
        return cur.rowcount
    except Exception as e:
        # Best-effort: one bad row must not abort the whole crawl.
        log.warning(f"Insert fail: {e}")
        return 0


def _store_oib_facts(cur, url, title, text):
    """Record one dabi.knowledge fact per OIB candidate on the page; return rows added."""
    added = 0
    # sorted() only makes the insert order reproducible between runs.
    for oib in sorted(set(extract_oibs(text))):
        # Skip digit runs with four leading zeros, as the original filter did.
        if oib.startswith('0000'):
            continue
        fact = f"OIB {oib} pojavljuje se na RSS Rijeka stranici: {title[:100]}"
        fact_hash = hashlib.sha256((url + fact).encode()).hexdigest()
        try:
            cur.execute("""
                INSERT INTO dabi.knowledge
                (fact, category, source, source_url, source_date, confidence, data_hash)
                VALUES (%s, 'rss_rijeka', 'rss_rijeka_scraper', %s, CURRENT_DATE, 0.75, %s)
                ON CONFLICT (data_hash) DO NOTHING
            """, (fact[:500], url, fact_hash))
            added += cur.rowcount
        except Exception as e:
            # Previously swallowed silently; still non-fatal, but now visible.
            log.warning(f"Fact insert fail: {e}")
    return added


def harvest(max_pages=200, max_links_per_page=30):
    """Crawl the seed sites breadth-first and store pages and OIB facts.

    Starting from ``RSS_DOMAINS``, each page is fetched, reduced to text and
    (when it has at least 100 characters of text) stored as a document; every
    OIB candidate on it becomes a knowledge fact. Links found on the page are
    queued for later visits.

    Args:
        max_pages: Stop once this many URLs have been visited (default 200,
            the previously hard-coded limit).
        max_links_per_page: At most this many links from each page are
            considered for queueing (default 30, as before).

    Returns:
        None. Totals are written to the log.

    The database connection is always closed, even when the crawl fails
    part-way through.
    """
    from collections import deque  # local import: O(1) pops from the queue head

    conn = psycopg2.connect(DSN)
    try:
        # Each INSERT commits on its own, so a failed row cannot poison
        # the statements that follow it.
        conn.autocommit = True

        visited = set()
        queue = deque(RSS_DOMAINS)
        # Everything ever queued; replaces the O(n) "link not in queue" scan.
        enqueued = set(RSS_DOMAINS)
        docs_inserted = 0
        facts_inserted = 0

        with conn.cursor() as cur:
            while queue and len(visited) < max_pages:
                url = queue.popleft()
                if url in visited:
                    continue
                visited.add(url)

                html, status = fetch(url)
                if not html or status != 200:
                    continue

                log.info(f"[{status}] {url} ({len(html)} bytes)")

                text = extract_text(html)
                if len(text) < 100:
                    # Too little readable content to be worth storing.
                    continue

                title_m = re.search(r'<title[^>]*>([^<]+)</title>', html, re.I)
                title = title_m.group(1).strip() if title_m else url[:80]

                docs_inserted += _store_document(cur, url, title, text)
                facts_inserted += _store_oib_facts(cur, url, title, text)

                # Find more links to follow.
                for link in find_links(html, url)[:max_links_per_page]:
                    if link not in enqueued:
                        enqueued.add(link)
                        queue.append(link)

        log.info(f"FINAL: visited={len(visited)} docs={docs_inserted} facts={facts_inserted}")
    finally:
        conn.close()
|
||||
|
||||
# Script entry point: start the crawl only when the file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    harvest()
|
||||
Reference in New Issue
Block a user