Files
pgz-sport/scrapers/news_rss_pgz_sport.py
T

133 lines
5.0 KiB
Python

#!/usr/bin/env python3
from dotenv import load_dotenv
# Runs before the rest of the module so that environment variables defined in
# the shared .env file are available when module-level constants read
# os.environ below (presumably this file provides DB_PASSWORD - verify).
load_dotenv('/opt/rinet-gpu/.env.master')
# auto-added by patch_scrapers_with_dotenv.sh
import os
# ═══════════════════════════════════════════════════════════════════
# Fajl: news_rss_pgz_sport.py | v1.0.0 | 05.05.2026
# Lokacija: /opt/pgz-sport/scrapers/news_rss_pgz_sport.py
# Svrha: Hrvatski news RSS feeds — filter po PGŽ + sport
# - Novi list, HRT, 24sata, T-Portal, Index, Rijeka Danas
# - Filter samo članci koji spominju PGŽ + sport entitete
# ═══════════════════════════════════════════════════════════════════
"""News RSS feeds — PGŽ sport filter."""
import re, json, time, hashlib
import urllib.request
from html import unescape
import psycopg2
from psycopg2.extras import execute_batch
def _dsn_quote(value):
    """Quote *value* for use in a libpq keyword/value connection string.

    libpq requires values containing spaces (or empty values) to be wrapped
    in single quotes, with embedded backslashes and single quotes escaped by
    a backslash. Quoting unconditionally is always valid.
    """
    escaped = value.replace("\\", "\\\\").replace("'", "\\'")
    return f"'{escaped}'"


# PostgreSQL connection string. The password comes from the environment and
# is quoted so that spaces, quotes or backslashes in it cannot corrupt the
# DSN. A missing DB_PASSWORD still raises KeyError at import time.
DSN = (
    "host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet "
    f"password={_dsn_quote(os.environ['DB_PASSWORD'])}"
)
# User-Agent header sent with every feed request.
UA = "Ri.NET Civic Bot 1.0"
# RSS endpoints to poll, as (portal label, feed URL) pairs. The label becomes
# part of the stored source name; a portal may appear more than once when it
# publishes several relevant feeds.
FEEDS = [
    ("novi_list",    "https://www.novilist.hr/rss/sport.xml"),
    ("novi_list",    "https://www.novilist.hr/rss/rijeka.xml"),
    ("hrt",          "https://www.hrt.hr/rss/sport"),
    ("24sata_sport", "https://www.24sata.hr/feeds/sport.xml"),
    ("tportal",      "https://www.tportal.hr/feed/sport"),
    ("index_sport",  "https://www.index.hr/sport/rss"),
    ("rijeka_danas", "https://rijekadanas.com/feed/"),
]
# Place and club names used to decide whether an article concerns the
# Primorje-Gorski Kotar county (PGŽ). Matching elsewhere in this file is a
# case-sensitive substring test, so stems such as "Primorsko-goransk" also
# cover inflected forms. Each keyword is listed exactly once.
PGZ_KEYWORDS = [
    "Rijeka", "PGŽ", "Primorsko-goransk", "Kvarner", "HNK Rijeka",
    "Opatija", "Crikvenica", "Krk", "Cres", "Lošinj", "Rab",
    "Kantrida", "Trsat", "Orijent", "Pomorac", "Zamet", "Mladost",
    "Mlaka", "Bakar", "Kostrena", "Viškovo", "Kastav",
]
def fetch(url, timeout=15):
    """Download *url* and return the response body as text.

    The body is decoded with the charset declared in the response's
    Content-Type header, falling back to UTF-8 when none is declared or the
    declared name is unknown. Undecodable bytes are replaced rather than
    raising.

    Args:
        url: Address of the resource to download.
        timeout: Socket timeout in seconds passed to ``urlopen``.

    Returns:
        The decoded body, or ``None`` if the request failed for any reason.
        Failures are reported on stderr; this function never raises.
    """
    import sys  # local import: the module does not import sys at file level

    try:
        req = urllib.request.Request(url, headers={"User-Agent": UA})
        with urllib.request.urlopen(req, timeout=timeout) as r:
            raw = r.read()
            charset = r.headers.get_content_charset() or "utf-8"
    except Exception as e:
        # Deliberately best-effort: one broken feed must not stop the run,
        # but the reason is surfaced instead of being silently dropped.
        print(f"fetch failed for {url}: {e!r}", file=sys.stderr)
        return None
    try:
        return raw.decode(charset, errors="replace")
    except LookupError:
        # The server declared a charset Python does not know.
        return raw.decode("utf-8", errors="replace")
# Patterns are compiled once at import instead of once per item/field.
# The item pattern also accepts attributes on the opening tag (for example
# RSS 1.0 ``<item rdf:about="...">``) while still rejecting ``<items>``.
_ITEM_RE = re.compile(r"<item(?:\s[^>]*)?>(.*?)</item>", re.S | re.I)
_CDATA_RE = re.compile(r"<!\[CDATA\[(.*?)\]\]>", re.S)
_TAG_RE = re.compile(r"<[^>]+>")
_WS_RE = re.compile(r"\s+")
_FIELD_RES = {
    tag: re.compile(f"<{tag}[^>]*>(.*?)</{tag}>", re.S | re.I)
    for tag in ("title", "link", "description", "pubDate")
}


def _item_field(item, tag):
    """Return the plain-text content of the first *tag* element in *item*."""
    found = _FIELD_RES[tag].search(item)
    if not found:
        return ""
    # Unwrap CDATA sections, drop literal markup, normalise whitespace.
    txt = _CDATA_RE.sub(lambda m: m.group(1), found.group(1))
    txt = _TAG_RE.sub(" ", txt)
    txt = unescape(_WS_RE.sub(" ", txt).strip())
    # Feeds often ship HTML entity-escaped (``&lt;p&gt;...``); unescaping
    # turns that back into tags, so strip once more. Text without escaped
    # markup passes through exactly as it did before this second pass.
    if _TAG_RE.search(txt):
        txt = _WS_RE.sub(" ", _TAG_RE.sub(" ", txt)).strip()
    return txt


def parse_rss(xml):
    """Extract the ``<item>`` entries of an RSS document.

    This is a tolerant regex-based scan, not a validating XML parse.

    Args:
        xml: The feed document as text.

    Returns:
        A list with one dict per item, in document order, with the keys
        ``title``, ``link``, ``description`` and ``pubDate``. Each value is
        plain text (markup removed, entities decoded, whitespace collapsed)
        or ``""`` when the element is absent.
    """
    return [
        {
            "title": _item_field(body, "title"),
            "link": _item_field(body, "link"),
            "description": _item_field(body, "description"),
            "pubDate": _item_field(body, "pubDate"),
        }
        for body in (m.group(1) for m in _ITEM_RE.finditer(xml))
    ]
def is_pgz_relevant(text):
    """Return True if *text* contains at least one PGŽ keyword.

    The check is a case-sensitive substring test against each entry of
    ``PGZ_KEYWORDS`` and stops at the first hit.
    """
    for keyword in PGZ_KEYWORDS:
        if keyword in text:
            return True
    return False
_INSERT_SQL = """
    INSERT INTO dabi.knowledge
        (fact, source, category, confidence, data_hash, source_refs)
    VALUES (%s, %s, %s, %s, %s, %s::jsonb)
    ON CONFLICT (data_hash) DO NOTHING
"""


def _build_rows(portal, items):
    """Build insert tuples for the PGŽ-relevant entries of one feed.

    Returns ``(rows, relevant_count)`` where ``relevant_count`` is the number
    of items whose title or description matched a PGŽ keyword.
    """
    rows = []
    relevant = 0
    for it in items:
        if not is_pgz_relevant(it["title"] + " " + it["description"]):
            continue
        relevant += 1
        # NOTE(review): title and description are joined without a separator.
        # Left unchanged on purpose: data_hash is derived from this exact
        # text, so altering it would defeat deduplication against rows that
        # are already stored.
        fact = f"{it['title']}{it['description'][:400]}"
        if not fact.strip():
            continue
        digest = hashlib.md5(fact.encode()).hexdigest()
        refs = json.dumps({"link": it["link"], "pubDate": it["pubDate"]})
        rows.append(
            (fact, f"news_rss_{portal}", "news_pgz_sport", 0.85, digest, refs)
        )
    return rows, relevant


def main():
    """Poll every feed in ``FEEDS`` and store PGŽ-relevant articles.

    For each feed the document is fetched, parsed, filtered by PGŽ keywords
    and inserted into ``dabi.knowledge``; rows whose ``data_hash`` already
    exists are skipped by the database. A one-line summary per feed and a
    final total are printed.

    Returns:
        A dict with the keys ``total`` (items seen), ``pgz_relevant`` (items
        that matched a keyword) and ``inserted`` (rows actually added).
    """
    conn = psycopg2.connect(DSN)
    conn.autocommit = True
    total_articles = 0
    pgz_relevant = 0
    inserted = 0
    try:
        for portal, url in FEEDS:
            xml = fetch(url)
            if not xml:
                print(f" {portal:20} fetch FAIL")
                continue
            items = parse_rss(xml)
            total_articles += len(items)
            rows, relevant_for_portal = _build_rows(portal, items)
            pgz_relevant += relevant_for_portal
            if rows:
                n = 0
                try:
                    with conn.cursor() as cur:
                        # One statement per row: after execute_batch the
                        # cursor's rowcount only reflects the last statement,
                        # so it cannot be used to count inserted rows.
                        for row in rows:
                            cur.execute(_INSERT_SQL, row)
                            n += cur.rowcount
                except psycopg2.Error as e:
                    print(f" {portal:20} insert err: {e}")
                else:
                    print(f" {portal:20} items={len(items):>3} relevant={relevant_for_portal:>3} inserted={n:>3}")
                # Autocommit is on, so rows written before a failure persist
                # and are counted.
                inserted += n
            else:
                print(f" {portal:20} items={len(items):>3} relevant=0")
            # Pause between feeds to stay polite towards the portals.
            time.sleep(1)
    finally:
        # Release the connection even if a feed raised unexpectedly.
        conn.close()
    print(f"\n=== DONE: {total_articles} total / {pgz_relevant} pgz-relevant / {inserted} inserted ===")
    return {"total": total_articles, "pgz_relevant": pgz_relevant, "inserted": inserted}
# Script entry point: run the scraper and emit its summary as one JSON line.
if __name__ == "__main__":
    summary = main()
    print(json.dumps(summary))