129 lines
4.9 KiB
Python
129 lines
4.9 KiB
Python
#!/usr/bin/env python3
|
|
# ═══════════════════════════════════════════════════════════════════
|
|
# Fajl: news_rss_pgz_sport.py | v1.0.0 | 05.05.2026
|
|
# Lokacija: /opt/pgz-sport/scrapers/news_rss_pgz_sport.py
|
|
# Svrha: Hrvatski news RSS feeds — filter po PGŽ + sport
|
|
# - Novi list, HRT, 24sata, T-Portal, Index, Rijeka Danas
|
|
# - Filter samo članci koji spominju PGŽ + sport entitete
|
|
# ═══════════════════════════════════════════════════════════════════
|
|
"""News RSS feeds — PGŽ sport filter."""
|
|
import re, json, time, hashlib
import http.client
import os
import sys
import urllib.request
from html import unescape

import psycopg2
from psycopg2.extras import execute_batch
from psycopg2.extras import execute_values
|
|
|
|
# PostgreSQL connection string. Set PGZ_SPORT_DSN in the environment to
# override it without editing this file.
# NOTE(security): the embedded fallback still carries live credentials and is
# kept only for backward compatibility — rotate the password and drop the
# fallback once PGZ_SPORT_DSN is deployed.
DSN = os.environ.get(
    "PGZ_SPORT_DSN",
    "host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password=R1net2026!SecureDB#v7",
)

# User-Agent header sent with every feed request.
UA = "Ri.NET Civic Bot 1.0"

# Croatian news RSS feeds to scan, as (portal_id, feed_url) pairs. Most are
# sport sections; the Rijeka-local feeds are general news and rely on the
# keyword filter below. The portal_id becomes part of the stored `source`.
FEEDS = [
    ("novi_list", "https://www.novilist.hr/rss/sport.xml"),
    ("novi_list", "https://www.novilist.hr/rss/rijeka.xml"),
    ("hrt", "https://www.hrt.hr/rss/sport"),
    ("24sata_sport", "https://www.24sata.hr/feeds/sport.xml"),
    ("tportal", "https://www.tportal.hr/feed/sport"),
    ("index_sport", "https://www.index.hr/sport/rss"),
    ("rijeka_danas", "https://rijekadanas.com/feed/"),
]

# Case-sensitive substrings that mark an article as PGŽ-relevant
# (places, venues and clubs of Primorje-Gorski Kotar County).
PGZ_KEYWORDS = [
    "Rijeka", "PGŽ", "Primorsko-goransk", "Kvarner", "HNK Rijeka",
    "Opatija", "Crikvenica", "Krk", "Cres", "Lošinj", "Rab",
    "Kantrida", "Trsat", "Orijent", "Pomorac", "Zamet", "Mladost",
    "Mlaka", "Bakar", "Kostrena", "Viškovo", "Kastav",
]
|
|
|
|
|
|
def fetch(url, timeout=15):
    """Download *url* and return the response body as text.

    The request carries the module's ``UA`` string as its User-Agent header.
    The body is decoded as UTF-8, with undecodable bytes replaced rather than
    raising.

    Args:
        url: Address of the feed to download.
        timeout: Socket timeout in seconds passed to ``urlopen``.

    Returns:
        The decoded body, or ``None`` when the download fails. Failures are
        best-effort: the reason is written to stderr and the caller decides
        how to proceed.
    """
    try:
        request = urllib.request.Request(url, headers={"User-Agent": UA})
        with urllib.request.urlopen(request, timeout=timeout) as response:
            return response.read().decode("utf-8", errors="replace")
    except (OSError, ValueError, http.client.HTTPException) as exc:
        # OSError covers URLError/HTTPError, timeouts, connection and TLS
        # errors; ValueError covers malformed URLs; HTTPException covers
        # protocol-level failures such as truncated responses. Anything else
        # is a programming error and should surface instead of being hidden.
        print(f" fetch error for {url}: {exc}", file=sys.stderr)
        return None
|
|
|
|
|
|
def parse_rss(xml):
    """Extract ``<item>`` entries from RSS markup as plain-text dicts.

    Parsing is regex-based and deliberately tolerant of malformed feeds.
    Each field is cleaned in two layers:

    1. XML layer: CDATA sections are taken literally, everything else has
       its character references resolved.
    2. HTML layer: markup is removed, and for ``title``/``description`` any
       remaining HTML entities are resolved.

    Whitespace (including non-breaking spaces) is collapsed at the end.

    NOTE: feeds that ship entity-escaped HTML (``&lt;p&gt;...``) now yield
    clean text instead of literal tags, so text derived from such items
    differs from what earlier versions of this parser produced.

    Args:
        xml: Feed document as text. Empty or ``None`` input yields ``[]``.

    Returns:
        A list of dicts with the keys ``title``, ``link``, ``description``
        and ``pubDate``; fields missing from an item are empty strings.
    """
    if not xml:
        return []

    cdata_re = re.compile(r"<!\[CDATA\[(.*?)\]\]>", re.S)
    markup_re = re.compile(r"<[^>]+>")
    space_re = re.compile(r"\s+")

    def xml_decode(raw):
        # Resolve the XML layer only: CDATA bodies are literal text, while
        # everything outside them is entity-encoded.
        pieces = []
        pos = 0
        for section in cdata_re.finditer(raw):
            pieces.append(unescape(raw[pos:section.start()]))
            pieces.append(section.group(1))
            pos = section.end()
        pieces.append(unescape(raw[pos:]))
        return "".join(pieces)

    def grab(item, tag, html=False):
        # \b keeps e.g. "link" from matching a longer tag name.
        found = re.search(rf"<{tag}\b[^>]*>(.*?)</{tag}>", item, re.S | re.I)
        if not found:
            return ""
        text = markup_re.sub(" ", xml_decode(found.group(1)))
        if html:
            # Entities that belonged to the embedded HTML (e.g. &nbsp;).
            # Not applied to links, where a second pass could corrupt
            # query strings such as "?a=1&reg=2".
            text = unescape(text)
        return space_re.sub(" ", text).strip()

    items = []
    # [^>]* admits attributes (RSS 1.0 uses <item rdf:about="...">); \b keeps
    # the RDF <items> container from being mistaken for an item.
    for entry in re.finditer(r"<item\b[^>]*>(.*?)</item>", xml, re.S | re.I):
        body = entry.group(1)
        items.append({
            "title": grab(body, "title", html=True),
            "link": grab(body, "link"),
            "description": grab(body, "description", html=True),
            "pubDate": grab(body, "pubDate"),
        })
    return items
|
|
|
|
|
|
def is_pgz_relevant(text):
    """Return True when *text* contains at least one PGŽ keyword.

    Matching is a plain, case-sensitive substring test against every entry
    of ``PGZ_KEYWORDS``.
    """
    for keyword in PGZ_KEYWORDS:
        if keyword in text:
            return True
    return False
|
|
|
|
|
|
def _build_rows(portal, items):
    """Turn the PGŽ-relevant items of one feed into ``dabi.knowledge`` rows."""
    rows = []
    for it in items:
        if not is_pgz_relevant(f"{it['title']} {it['description']}"):
            continue
        # The fact is the title plus at most 400 characters of description.
        fact = f"{it['title']} — {it['description'][:400]}"
        # MD5 serves only as the dedup key (data_hash), not for security.
        digest = hashlib.md5(fact.encode()).hexdigest()
        refs = json.dumps({"link": it["link"], "pubDate": it["pubDate"]})
        rows.append((fact, f"news_rss_{portal}", "news_pgz_sport", 0.85, digest, refs))
    return rows


def main():
    """Scan every feed in ``FEEDS`` and store PGŽ-relevant articles.

    For each feed: download it, parse the items, keep those matching the PGŽ
    keyword filter and insert them into ``dabi.knowledge``. Rows whose
    ``data_hash`` already exists are skipped by the database. A progress line
    is printed per feed and a summary line at the end.

    Returns:
        A dict with the counters ``total`` (items parsed), ``pgz_relevant``
        (items passing the filter) and ``inserted`` (rows actually written).
    """
    # RETURNING yields one row per insert that was not skipped by ON CONFLICT,
    # which gives an exact insert count. cursor.rowcount is not usable for
    # that after a batched execution.
    sql = """INSERT INTO dabi.knowledge (fact, source, category, confidence, data_hash, source_refs)
             VALUES %s ON CONFLICT (data_hash) DO NOTHING RETURNING 1"""
    row_template = "(%s, %s, %s, %s, %s, %s::jsonb)"

    total_articles = 0
    pgz_relevant = 0
    inserted = 0

    conn = psycopg2.connect(DSN)
    conn.autocommit = True
    try:
        for portal, url in FEEDS:
            xml = fetch(url)
            if not xml:
                print(f" {portal:20} fetch FAIL")
                continue

            items = parse_rss(xml)
            total_articles += len(items)

            rows = _build_rows(portal, items)
            pgz_relevant += len(rows)

            if rows:
                try:
                    with conn.cursor() as cur:
                        written = execute_values(
                            cur, sql, rows,
                            template=row_template, page_size=50, fetch=True,
                        )
                    n = len(written)
                    inserted += n
                    print(f" {portal:20} items={len(items):>3} relevant={len(rows):>3} inserted={n:>3}")
                except psycopg2.Error as exc:
                    # One failing feed must not abort the remaining feeds.
                    print(f" {portal:20} insert err: {exc}")
            else:
                print(f" {portal:20} items={len(items):>3} relevant=0")

            # Pause between feeds to stay polite towards the portals.
            time.sleep(1)
    finally:
        # Release the connection even when a feed raises unexpectedly.
        conn.close()

    print(f"\n=== DONE: {total_articles} total / {pgz_relevant} pgz-relevant / {inserted} inserted ===")
    return {"total": total_articles, "pgz_relevant": pgz_relevant, "inserted": inserted}
|
|
|
|
|
|
if __name__ == "__main__":
    # Run the scraper and emit its counters as one JSON line on stdout.
    summary = main()
    print(json.dumps(summary))
|