007825acee
DB: - Aggressive je_klub=false flag for programs/treninzi/totals (>100K€ no klub_id) - 53 ne-klubovi flagged false (RSS Rijeka ukupni, Stručni rad, Potpora loptačkim, etc) Frontend (sport2.html): - Panel back button (← Natrag) + history stack - window._panelHistory + pushPanelState + panelBack functions - closePanel resets history
74 lines
2.5 KiB
Python
74 lines
2.5 KiB
Python
#!/usr/bin/env python3
|
|
"""Lokalni news RSS PGZ."""
|
|
import sys, json, time, re
|
|
sys.path.insert(0, "/opt/pgz-sport/scrapers/harvesters")
|
|
from _common import (fetch, extract_text, chunk_text, upsert_facts, DSN)
|
|
from html import unescape
|
|
import psycopg2
|
|
|
|
# RSS feeds to harvest, as (portal key, feed URL) pairs. The portal key is
# used to build the per-feed source name ("news_<portal>") when facts are stored.
FEEDS = [
    (
        "novi_list",
        "https://www.novilist.hr/rss/rijeka.xml",
    ),
    (
        "novi_list_pgz",
        "https://www.novilist.hr/rss/regija.xml",
    ),
    (
        "rijeka_danas",
        "https://rijekadanas.com/feed/",
    ),
    (
        "rijeka_in",
        "https://rijekain.hr/feed/",
    ),
    (
        "primorske_novice",
        "https://primorskenovice.hr/feed/",
    ),
    (
        "kvarner_news",
        "https://www.kvarner.news/feed/",
    ),
    (
        "oradio",
        "https://otvoreniradio.hr/rss/sve.xml",
    ),
    (
        "rijeka_today",
        "https://www.rijekatoday.com/feed/",
    ),
]
|
|
|
|
|
|
# Pre-compiled patterns used by the RSS parsing helpers below.
# ``\b[^>]*`` lets <item> carry attributes (e.g. RSS 1.0 ``<item rdf:about=...>``)
# without also matching the RDF ``<items>`` container element.
_ITEM_RE = re.compile(r"<item\b[^>]*>(.*?)</item>", re.S | re.I)
_CDATA_RE = re.compile(r"<!\[CDATA\[(.*?)\]\]>", re.S)
_TAG_RE = re.compile(r"<[^>]+>")
_WS_RE = re.compile(r"\s+")

# Fields extracted from every item, in the key order of the returned dicts.
_RSS_FIELDS = ("title", "link", "description", "pubDate")


def _rss_field(item, tag):
    """Return the cleaned text of the first ``<tag>`` element in *item*.

    Returns "" when the element is absent. CDATA wrappers are removed,
    markup is replaced by spaces, character references are decoded and
    runs of whitespace are collapsed to a single space.
    """
    found = re.search(f"<{tag}[^>]*>(.*?)</{tag}>", item, re.S | re.I)
    if not found:
        return ""
    text = _CDATA_RE.sub(r"\1", found.group(1))
    text = _TAG_RE.sub(" ", text)
    text = unescape(text)
    # Many feeds ship entity-escaped HTML (``&lt;p&gt;...``) instead of CDATA;
    # that markup only becomes visible after decoding, so strip tags again.
    text = _TAG_RE.sub(" ", text)
    # Collapse whitespace last so characters produced by decoding
    # (e.g. ``&nbsp;``) are normalised as well.
    return _WS_RE.sub(" ", text).strip()


def parse_rss(xml):
    """Extract the items of an RSS document with regular expressions.

    This is a tolerant, regex-based scan rather than a real XML parse, so it
    also copes with slightly malformed feeds.

    Args:
        xml: The feed document as text. ``None`` or an empty string yields
            an empty list.

    Returns:
        A list with one dict per ``<item>`` element, in document order. Each
        dict has the keys ``"title"``, ``"link"``, ``"description"`` and
        ``"pubDate"``; a missing element is represented by ``""``.
    """
    if not xml:
        return []
    return [
        {field: _rss_field(match.group(1), field) for field in _RSS_FIELDS}
        for match in _ITEM_RE.finditer(xml)
    ]
|
|
|
|
|
|
def main():
    """Harvest every feed in FEEDS and store the resulting facts.

    For each (portal, url) pair the feed is fetched and parsed, every usable
    item is turned into a fact dict and the batch is handed to
    ``upsert_facts`` under the source name ``news_<portal>``. A per-feed
    status line is printed, followed by a total and a one-line JSON summary.

    Feeds that cannot be fetched or that yield no items are reported and
    skipped. The database connection is closed even if a feed raises.
    """
    conn = psycopg2.connect(DSN)
    conn.autocommit = True
    total_inserted = 0

    try:
        for portal, url in FEEDS:
            # fetch() returns a (body, status) pair; only the body is used here.
            xml, _status = fetch(url, timeout=15)
            if not xml:
                print(f" {portal:20} fetch FAIL")
                continue
            items = parse_rss(xml)
            if not items:
                print(f" {portal:20} parse 0 items")
                continue

            ff = []
            for it in items:
                title = it.get("title", "")
                desc = it.get("description", "")
                if not title and not desc:
                    continue
                # Keep the fact compact: title plus at most 400 chars of description.
                fact = f"{title} - {desc[:400]}".strip()
                # Very short facts are skipped.
                if len(fact) < 30:
                    continue
                ff.append({"fact": fact, "url": it.get("link", ""), "title": title})

            # The return value is treated as the number of facts stored.
            n = upsert_facts(conn, ff, source_name=f"news_{portal}",
                             category="news_pgz_rss", confidence=0.84)
            total_inserted += n
            print(f" {portal:20} items={len(items):>3} inserted={n:>3}")
            # Pause between feeds that were actually processed.
            time.sleep(1)
    finally:
        # Release the connection even when a fetch/parse/upsert step raises.
        conn.close()

    print(f"=== TOTAL inserted: {total_inserted} ===")
    print(json.dumps({"feeds": len(FEEDS), "inserted": total_inserted}))
|
|
|
|
|
|
# Run the harvester only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|