#!/usr/bin/env python3
# ═══════════════════════════════════════════════════════════════════
# File:     news_rss_pgz_sport.py | v1.0.0 | 05.05.2026
# Location: /opt/pgz-sport/scrapers/news_rss_pgz_sport.py
# Purpose:  Croatian news RSS feeds — filtered for PGŽ + sport
#   - Portals polled: Novi list, HRT, 24sata, T-Portal, Index, Rijeka Danas
#   - Only articles whose title/description mention a PGŽ keyword are stored
#     (sport relevance comes from the feed selection; there is no separate
#     sport keyword test)
# ═══════════════════════════════════════════════════════════════════
"""News RSS feeds — PGŽ sport filter."""

import hashlib
import json
import os
import re
import sys
import time
import urllib.request
from html import unescape
from typing import Dict, List, Optional, Tuple

# SECURITY: the fallback below is a live credential committed to source.
# Set PGZ_SPORT_DSN in the environment instead; rotate this password and
# delete the fallback once every deployment provides the variable.
DSN = os.environ.get(
    "PGZ_SPORT_DSN",
    "host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password=R1net2026!SecureDB#v7",
)
UA = "Ri.NET Civic Bot 1.0"

# (portal label, feed URL). The label becomes part of the stored `source`.
FEEDS = [
    ("novi_list", "https://www.novilist.hr/rss/sport.xml"),
    ("novi_list", "https://www.novilist.hr/rss/rijeka.xml"),
    ("hrt", "https://www.hrt.hr/rss/sport"),
    ("24sata_sport", "https://www.24sata.hr/feeds/sport.xml"),
    ("tportal", "https://www.tportal.hr/feed/sport"),
    ("index_sport", "https://www.index.hr/sport/rss"),
    ("rijeka_danas", "https://rijekadanas.com/feed/"),
]

# Case-sensitive substrings; an article is kept if any of them occurs in
# its title or description.
PGZ_KEYWORDS = [
    "Rijeka", "PGŽ", "Primorsko-goransk", "Kvarner", "HNK Rijeka",
    "Opatija", "Crikvenica", "Krk", "Cres", "Lošinj", "Rab",
    "Kantrida", "Trsat", "Orijent", "Pomorac", "Zamet", "Mladost",
    "Mlaka", "Bakar", "Kostrena", "Viškovo", "Kastav",
]

INSERT_SQL = """INSERT INTO dabi.knowledge
    (fact, source, category, confidence, data_hash, source_refs)
    VALUES (%s, %s, %s, %s, %s, %s::jsonb)
    ON CONFLICT (data_hash) DO NOTHING"""

# Patterns are compiled once; parse_rss() runs them for every item.
_ITEM_FIELDS = ("title", "link", "description", "pubDate")
_ITEM_RE = re.compile(r"<item(?:\s[^>]*)?>(.*?)</item>", re.S | re.I)
_FIELD_RES = {
    tag: re.compile(rf"<{tag}(?:\s[^>]*)?>(.*?)</{tag}>", re.S | re.I)
    for tag in _ITEM_FIELDS
}
_CDATA_RE = re.compile(r"<!\[CDATA\[(.*?)\]\]>", re.S)
_MARKUP_RE = re.compile(r"<[^>]+>")
_SPACE_RE = re.compile(r"\s+")

# One row of INSERT_SQL parameters.
Row = Tuple[str, str, str, float, str, str]


def fetch(url: str, timeout: float = 15) -> Optional[str]:
    """Download *url* and return the body decoded as UTF-8.

    Undecodable bytes are replaced rather than raising. This is best-effort:
    any failure (bad URL, network error, HTTP error, timeout) is reported on
    stderr and ``None`` is returned so the caller can skip the feed.
    """
    try:
        req = urllib.request.Request(url, headers={"User-Agent": UA})
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            return resp.read().decode("utf-8", errors="replace")
    except Exception as exc:  # boundary: one bad feed must not stop the run
        print(f"fetch error for {url}: {exc}", file=sys.stderr)
        return None


def _clean_field(item_xml: str, tag: str) -> str:
    """Return the plain text of the first ``<tag>`` in *item_xml*, or ""."""
    found = _FIELD_RES[tag].search(item_xml)
    if not found:
        return ""
    text = _CDATA_RE.sub(r"\1", found.group(1))  # unwrap CDATA first
    text = _MARKUP_RE.sub(" ", text)             # then drop embedded markup
    return unescape(_SPACE_RE.sub(" ", text).strip())


def parse_rss(xml: str) -> List[Dict[str, str]]:
    """Extract entries — title, link, description, pubDate.

    Each ``<item>`` element yields one dict with exactly those four keys;
    a field that is absent from the item is returned as an empty string.
    Parsing is regex-based, so malformed XML degrades to fewer/emptier
    items instead of raising.
    """
    return [
        {tag: _clean_field(match.group(1), tag) for tag in _ITEM_FIELDS}
        for match in _ITEM_RE.finditer(xml)
    ]


def is_pgz_relevant(text: str) -> bool:
    """Return True if *text* contains any PGZ_KEYWORDS entry (case-sensitive)."""
    return any(keyword in text for keyword in PGZ_KEYWORDS)


def _build_rows(portal: str, items: List[Dict[str, str]]) -> List[Row]:
    """Turn the PGŽ-relevant *items* into INSERT_SQL parameter tuples."""
    rows = []
    for it in items:
        if not is_pgz_relevant(it["title"] + " " + it["description"]):
            continue
        fact = f"{it['title']} — {it['description'][:400]}"
        # md5 is only a dedup key for ON CONFLICT (data_hash), not security.
        digest = hashlib.md5(fact.encode()).hexdigest()
        refs = json.dumps({"link": it["link"], "pubDate": it["pubDate"]})
        rows.append(
            (fact, f"news_rss_{portal}", "news_pgz_sport", 0.85, digest, refs)
        )
    return rows


def main() -> Dict[str, int]:
    """Poll every feed in FEEDS and store PGŽ-relevant articles.

    Prints one progress line per feed and a final summary, and returns
    ``{"total", "pgz_relevant", "inserted"}`` counters. ``inserted`` counts
    rows actually written (conflicting hashes are skipped by the database).
    """
    # Imported here so the fetch/parse helpers stay importable on hosts
    # that do not have the database driver installed.
    import psycopg2

    total_articles = 0
    pgz_relevant = 0
    inserted = 0

    conn = psycopg2.connect(DSN)
    try:
        conn.autocommit = True
        with conn.cursor() as cur:
            for portal, url in FEEDS:
                xml = fetch(url)
                if not xml:
                    print(f" {portal:20} fetch FAIL")
                    continue

                items = parse_rss(xml)
                total_articles += len(items)
                rows = _build_rows(portal, items)
                relevant_for_portal = len(rows)
                pgz_relevant += relevant_for_portal

                if rows:
                    # Execute row by row and sum rowcount: after a batched
                    # execute, rowcount only describes the last statement,
                    # which made the reported total wrong.
                    n = 0
                    try:
                        for row in rows:
                            cur.execute(INSERT_SQL, row)
                            n += cur.rowcount
                    except psycopg2.Error as exc:
                        print(f" {portal:20} insert err: {exc}")
                    else:
                        print(f" {portal:20} items={len(items):>3} relevant={relevant_for_portal:>3} inserted={n:>3}")
                    # Autocommit: rows written before an error are persisted.
                    inserted += n
                else:
                    print(f" {portal:20} items={len(items):>3} relevant=0")

                time.sleep(1)  # be polite to the portals between requests
    finally:
        conn.close()

    print(f"\n=== DONE: {total_articles} total / {pgz_relevant} pgz-relevant / {inserted} inserted ===")
    return {"total": total_articles, "pgz_relevant": pgz_relevant, "inserted": inserted}


if __name__ == "__main__":
    print(json.dumps(main()))