#!/usr/bin/env python3
"""Local news RSS harvester for PGZ.

Fetches a fixed list of regional news RSS feeds, turns every feed item into
a short "title - description" fact and stores the facts through
``upsert_facts`` from the shared ``_common`` harvester module.
"""
import sys
import json
import time
import re

# The shared harvester helpers are not installed as a package; make them
# importable before the ``_common`` import below.
sys.path.insert(0, "/opt/pgz-sport/scrapers/harvesters")
from _common import (fetch, extract_text, chunk_text, upsert_facts, DSN)  # noqa: E402
from html import unescape  # noqa: E402
import psycopg2  # noqa: E402

# (portal key, feed URL) pairs; the portal key becomes part of the fact's
# source name ("news_<portal>").
FEEDS = [
    ("novi_list", "https://www.novilist.hr/rss/rijeka.xml"),
    ("novi_list_pgz", "https://www.novilist.hr/rss/regija.xml"),
    ("rijeka_danas", "https://rijekadanas.com/feed/"),
    ("rijeka_in", "https://rijekain.hr/feed/"),
    ("primorske_novice", "https://primorskenovice.hr/feed/"),
    ("kvarner_news", "https://www.kvarner.news/feed/"),
    ("oradio", "https://otvoreniradio.hr/rss/sve.xml"),
    ("rijeka_today", "https://www.rijekatoday.com/feed/"),
]

# Precompiled once: these are applied to every item of every feed.
_ITEM_RE = re.compile(r"<item\b[^>]*>(.*?)</item>", re.S | re.I)
_CDATA_RE = re.compile(r"<!\[CDATA\[(.*?)\]\]>", re.S)
_MARKUP_RE = re.compile(r"<[^>]+>")
_WHITESPACE_RE = re.compile(r"\s+")


def _grab(item, tag):
    """Return the cleaned text of the first ``<tag>`` element in *item*.

    CDATA wrappers are unwrapped, nested markup is replaced by spaces,
    whitespace runs are collapsed and HTML entities are decoded. Returns
    ``""`` when the element is absent.
    """
    name = re.escape(tag)
    found = re.search(rf"<{name}\b[^>]*>(.*?)</{name}>", item, re.S | re.I)
    if not found:
        return ""
    text = _CDATA_RE.sub(r"\1", found.group(1))
    text = _MARKUP_RE.sub(" ", text)
    return unescape(_WHITESPACE_RE.sub(" ", text).strip())


def parse_rss(xml):
    """Extract the ``<item>`` entries of an RSS document.

    This is a deliberately tolerant regex scan rather than a strict XML
    parse, so slightly malformed feeds still yield items.

    Args:
        xml: Raw feed text.

    Returns:
        A list of dicts with the keys ``title``, ``link``, ``description``
        and ``pubDate``; a missing element yields an empty string.
    """
    return [
        {
            "title": _grab(body, "title"),
            "link": _grab(body, "link"),
            "description": _grab(body, "description"),
            "pubDate": _grab(body, "pubDate"),
        }
        for body in (m.group(1) for m in _ITEM_RE.finditer(xml))
    ]


def _build_facts(items):
    """Convert parsed feed items into fact dicts for ``upsert_facts``.

    Items with neither title nor description are skipped, as are items whose
    combined text is shorter than 30 characters. The description is cut to
    400 characters.
    """
    facts = []
    for it in items:
        title = it.get("title", "")
        desc = it.get("description", "")
        if not title and not desc:
            continue
        fact = f"{title} - {desc[:400]}".strip()
        if len(fact) < 30:
            continue
        facts.append({"fact": fact, "url": it.get("link", ""), "title": title})
    return facts


def main():
    """Harvest every feed in ``FEEDS`` and print a per-feed and total summary.

    The last line printed is a JSON object with the number of feeds and the
    number of inserted facts.
    """
    conn = psycopg2.connect(DSN)
    conn.autocommit = True
    total_inserted = 0
    try:
        for portal, url in FEEDS:
            xml, status = fetch(url, timeout=15)
            if not xml:
                print(f" {portal:20} fetch FAIL")
                continue
            items = parse_rss(xml)
            if not items:
                print(f" {portal:20} parse 0 items")
                continue
            n = upsert_facts(conn, _build_facts(items),
                             source_name=f"news_{portal}",
                             category="news_pgz_rss", confidence=0.84)
            total_inserted += n
            print(f" {portal:20} items={len(items):>3} inserted={n:>3}")
            # Pause between feeds that were actually processed.
            time.sleep(1)
    finally:
        # Release the connection even when a feed raises mid-run.
        conn.close()
    print(f"=== TOTAL inserted: {total_inserted} ===")
    print(json.dumps({"feeds": len(FEEDS), "inserted": total_inserted}))


if __name__ == "__main__":
    main()