PGŽ Sport Platform — Round 1+2 baseline (sport2.html + API)

2026-05-04 23:39:08 +02:00
commit a7ec0a86be
1820 changed files with 694455 additions and 0 deletions
@@ -0,0 +1,145 @@
+#!/usr/bin/env python3
+# ═══════════════════════════════════════════════════════════════════
+# Fajl: rss_rijeka_scraper.py | v1.0.0 | 04.05.2026
+# Autor: Damir Radulić <dradulic@outlook.com>
+# Lokacija: /opt/pgz-sport/scrapers/rss_rijeka_scraper.py
+# Svrha: RSS / Zajednica sportskih udruga grada Rijeke deep scraper
+# Cilj: financijski izvještaji, klubovi, sportaši, dokumenti
+# ═══════════════════════════════════════════════════════════════════
+"""RSS Rijeka scraper — klubovi, financiranje, dokumenti."""
+import os, sys, time, hashlib, logging, re, json
+from urllib.parse import urljoin, urlparse
+import urllib.request
+import psycopg2
+from psycopg2.extras import execute_batch
+from html import unescape
+
+logging.basicConfig(level=logging.INFO, format='%(asctime)s [rss] %(message)s')
+log = logging.getLogger("rss")
+
+# PostgreSQL connection string in keyword=value form, passed to psycopg2.connect().
+# SECURITY: a database password is committed here in plain text. The
+# PGZ_SPORT_DSN environment variable takes precedence so that deployments can
+# stop depending on the literal; the fallback is kept only so existing runs
+# keep working unchanged.
+# TODO(review): rotate this password and remove the hard-coded fallback.
+DSN = os.environ.get(
+    "PGZ_SPORT_DSN",
+    "host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password=R1net2026!SecureDB#v7",
+)
+# User-Agent header value sent with every HTTP request; includes a contact address.
+UA = "Mozilla/5.0 (Ri.NET Civic Intelligence Bot 1.0; contact: dradulic@outlook.com)"
+
+# Seed URLs for the crawl. These are candidate locations that are probed;
+# the ones that do not answer are simply skipped.
+RSS_DOMAINS = [
+    "https://rijeckisportskisavez.hr",
+    "https://www.zsus-rijeka.hr",
+    "https://sport.rijeka.hr",
+    "https://rss-rijeka.hr",
+    "https://www.rijeka.hr/teme-za-gradane/sport-i-rekreacija/",
+]
+
+def fetch(url, retries=3, *, delay=2.0, backoff=3.0):
+    """Download *url* and return ``(text, status)``.
+
+    The request carries the bot User-Agent and uses a 20 second timeout. The
+    body is decoded with the charset declared in the Content-Type header
+    (UTF-8 when none is declared or the name is unknown); undecodable bytes
+    are replaced rather than raising.
+
+    Args:
+        url: Absolute URL to download.
+        retries: Maximum number of attempts.
+        delay: Seconds to sleep after a successful download (rate limiting).
+        backoff: Base pause in seconds between attempts; attempt ``k`` is
+            followed by ``backoff * k`` seconds.
+
+    Returns:
+        ``(content, status)`` on success, ``(None, 0)`` once every attempt
+        has failed or the server reported a permanent client error.
+    """
+    for attempt in range(1, retries + 1):
+        try:
+            req = urllib.request.Request(url, headers={"User-Agent": UA})
+            with urllib.request.urlopen(req, timeout=20) as r:
+                raw = r.read()
+                status = r.status
+                charset = r.headers.get_content_charset() or 'utf-8'
+        except urllib.error.HTTPError as e:
+            log.warning(f"Fetch fail {attempt}: {url} {e}")
+            # 4xx means the request itself is wrong (404, 403, ...); retrying
+            # only wastes time. 408 and 429 are transient, so those do retry.
+            if 400 <= e.code < 500 and e.code not in (408, 429):
+                return None, 0
+        except Exception as e:
+            # Best-effort boundary: any network/parsing failure is logged and
+            # retried instead of aborting the crawl.
+            log.warning(f"Fetch fail {attempt}: {url} {e}")
+        else:
+            try:
+                content = raw.decode(charset, errors='replace')
+            except LookupError:
+                # The server declared a charset Python does not know.
+                content = raw.decode('utf-8', errors='replace')
+            # Politeness pause, taken after the connection has been closed.
+            time.sleep(delay)
+            return content, status
+        # No point in sleeping after the final attempt.
+        if attempt < retries:
+            time.sleep(backoff * attempt)
+    return None, 0
+
+def find_links(html, base_url):
+    """Return the unique in-scope links found in *html*.
+
+    Every ``href="..."`` value is HTML-unescaped, resolved against
+    *base_url* and stripped of its ``#fragment``. A link is kept only when it
+    is http(s) and its host is one of the crawl domains or a subdomain of one.
+
+    Args:
+        html: Page markup; ``None`` or an empty string yields ``[]``.
+        base_url: URL the page was loaded from, used to resolve relative hrefs.
+
+    Returns:
+        Absolute URLs without duplicates, in order of first appearance.
+    """
+    if not html:
+        return []
+    # Registered domains the crawler may follow; subdomains are accepted too.
+    allowed = (
+        'rijeckisportskisavez.hr',
+        'zsus-rijeka.hr',
+        'rijeka.hr',
+        'rss-rijeka.hr',
+    )
+    found = {}  # dict used as an ordered set
+    for m in re.finditer(r'href=["\']([^"\']+)["\']', html, re.I):
+        href = unescape(m.group(1)).strip()
+        if href.startswith(('#', 'mailto:', 'javascript:')):
+            continue
+        try:
+            # urljoin/urlparse raise ValueError on malformed input such as
+            # an unterminated IPv6 literal; skip that link, keep crawling.
+            full = urljoin(base_url, href).split('#', 1)[0]
+            parts = urlparse(full)
+            host = parts.hostname or ""
+        except ValueError:
+            continue
+        if parts.scheme not in ('http', 'https'):
+            continue
+        # Exact or dot-suffix match, so that look-alike hosts such as
+        # "evil-rijeka.hr.example.com" are rejected.
+        if any(host == d or host.endswith('.' + d) for d in allowed):
+            found[full] = None
+    return list(found)
+
+def extract_text(html):
+    """Reduce an HTML document to plain text.
+
+    ``<script>`` and ``<style>`` elements are dropped together with their
+    content, every remaining tag becomes a single space, HTML entities are
+    decoded and runs of whitespace collapse to one space. A falsy *html*
+    gives an empty string.
+    """
+    if not html:
+        return ""
+    no_scripts = re.sub(r'<script[^>]*>.*?</script>', '', html, flags=re.S | re.I)
+    no_styles = re.sub(r'<style[^>]*>.*?</style>', '', no_scripts, flags=re.S | re.I)
+    untagged = re.sub(r'<[^>]+>', ' ', no_styles)
+    decoded = unescape(untagged)
+    return re.sub(r'\s+', ' ', decoded).strip()
+
+def extract_oibs(text):
+    """Return every standalone 11-digit number in *text* as an OIB candidate.
+
+    Matches are returned as strings in order of appearance, duplicates
+    included. Only the length is checked; no checksum validation is done.
+    """
+    oib_pattern = re.compile(r'\b(\d{11})\b')
+    return [hit.group(1) for hit in oib_pattern.finditer(text)]
+
+def extract_money(text):
+    """Return monetary amounts that are followed by EUR, €, kn or HRK.
+
+    Only figures written with at least one thousands group (for example
+    ``12.500`` or ``1.234.567,89``) are recognised; the currency marker is
+    not part of the returned strings.
+    """
+    amount_pattern = re.compile(
+        r'(\d{1,3}(?:[.,]\d{3})+(?:[.,]\d{2})?)\s*(?:EUR|€|kn|HRK)'
+    )
+    return [hit.group(1) for hit in amount_pattern.finditer(text)]
+
+def harvest(max_pages=200, links_per_page=30):
+    """Crawl the seed sites breadth-first and store what is found.
+
+    Starting from ``RSS_DOMAINS``, each page is downloaded with ``fetch``.
+    Pages that return status 200 and contain at least 100 characters of
+    text are stored in ``pgz_sport.dokumenti``; every 11-digit OIB candidate
+    on such a page is recorded as a fact in ``dabi.knowledge``. Links
+    returned by ``find_links`` are queued for later visits.
+
+    Individual insert failures are logged and skipped. The cursor and the
+    connection are always closed, even when the crawl aborts with an error.
+
+    Args:
+        max_pages: Stop once this many URLs have been attempted.
+        links_per_page: Queue at most this many links from a single page.
+    """
+    # Function-scope import: deque gives O(1) pops from the front of the queue.
+    from collections import deque
+
+    conn = psycopg2.connect(DSN)
+    cur = None
+    try:
+        # Autocommit makes every INSERT its own transaction, so one failed
+        # statement does not block the statements that follow it.
+        conn.autocommit = True
+        cur = conn.cursor()
+
+        visited = set()
+        queue = deque(RSS_DOMAINS)
+        # Every URL that was ever queued; replaces the O(n) scan of the queue.
+        enqueued = set(RSS_DOMAINS)
+        docs_inserted = 0
+        facts_inserted = 0
+
+        while queue and len(visited) < max_pages:
+            url = queue.popleft()
+            if url in visited:
+                continue
+            visited.add(url)
+
+            html, status = fetch(url)
+            if not html or status != 200:
+                continue
+
+            log.info(f"[{status}] {url} ({len(html)} bytes)")
+
+            text = extract_text(html)
+            if len(text) < 100:
+                # Too little content to be worth storing; its links are
+                # not followed either.
+                continue
+
+            # Store the page as a document.
+            title_m = re.search(r'<title[^>]*>([^<]+)</title>', html, re.I)
+            title = title_m.group(1).strip() if title_m else url[:80]
+
+            sha1 = hashlib.sha1(text.encode()).hexdigest()
+            try:
+                cur.execute("""
+                    INSERT INTO pgz_sport.dokumenti 
+                        (url, fname, title, sadrzaj, vrsta, izvor_url, scraped_at, sha1, organizacija)
+                    VALUES (%s, %s, %s, %s, %s, %s, now(), %s, %s)
+                    ON CONFLICT DO NOTHING
+                """, (url, url.split('/')[-1][:100], title[:200], text[:50000], 'web', url, sha1, 'RSS Rijeka'))
+                docs_inserted += cur.rowcount
+            except Exception as e:
+                log.warning(f"Insert fail: {e}")
+
+            # Record one fact per distinct OIB candidate found on the page.
+            for oib in sorted(set(extract_oibs(text))):
+                if oib.startswith('0000'):
+                    continue
+                fact = f"OIB {oib} pojavljuje se na RSS Rijeka stranici: {title[:100]}"
+                fact_hash = hashlib.sha256((url + fact).encode()).hexdigest()
+                try:
+                    cur.execute("""
+                        INSERT INTO dabi.knowledge 
+                            (fact, category, source, source_url, source_date, confidence, data_hash)
+                        VALUES (%s, 'rss_rijeka', 'rss_rijeka_scraper', %s, CURRENT_DATE, 0.75, %s)
+                        ON CONFLICT (data_hash) DO NOTHING
+                    """, (fact[:500], url, fact_hash))
+                    facts_inserted += cur.rowcount
+                except Exception as e:
+                    # Still best-effort, but no longer silent.
+                    log.warning(f"Fact insert fail: {e}")
+
+            # Queue links that have not been seen before.
+            for link in find_links(html, url)[:links_per_page]:
+                if link not in enqueued:
+                    enqueued.add(link)
+                    queue.append(link)
+
+        log.info(f"FINAL: visited={len(visited)} docs={docs_inserted} facts={facts_inserted}")
+    finally:
+        if cur is not None:
+            cur.close()
+        conn.close()
+
+# Run one full crawl when the file is executed as a script (not on import).
+if __name__ == "__main__":
+    harvest()