Bug hunt V7:
DB:
- Aggressive je_klub=false flag for programs/treninzi/totals (>100K€, no klub_id)
- 53 ne-klubovi flagged false (RSS Rijeka ukupni, Stručni rad, Potpora loptačkim, etc.)
Frontend (sport2.html):
- Panel back button (← Natrag) + history stack
- window._panelHistory + pushPanelState + panelBack functions
- closePanel resets history
This commit is contained in:
@@ -0,0 +1,73 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Lokalni news RSS PGZ."""
|
||||
import sys, json, time, re
|
||||
sys.path.insert(0, "/opt/pgz-sport/scrapers/harvesters")
|
||||
from _common import (fetch, extract_text, chunk_text, upsert_facts, DSN)
|
||||
from html import unescape
|
||||
import psycopg2
|
||||
|
||||
# (portal key, feed URL) pairs harvested by main(); the portal key is used
# there to build the fact source name ("news_<portal>").
FEEDS = [
    ("novi_list", "https://www.novilist.hr/rss/rijeka.xml"),
    ("novi_list_pgz", "https://www.novilist.hr/rss/regija.xml"),
    ("rijeka_danas", "https://rijekadanas.com/feed/"),
    ("rijeka_in", "https://rijekain.hr/feed/"),
    ("primorske_novice", "https://primorskenovice.hr/feed/"),
    ("kvarner_news", "https://www.kvarner.news/feed/"),
    ("oradio", "https://otvoreniradio.hr/rss/sve.xml"),
    ("rijeka_today", "https://www.rijekatoday.com/feed/"),
]
|
||||
|
||||
|
||||
# Patterns are compiled once at import time; parse_rss runs them per item.
_ITEM_RE = re.compile(r"<item(?:\s[^>]*)?>(.*?)</item\s*>", re.S | re.I)
_CDATA_RE = re.compile(r"<!\[CDATA\[(.*?)\]\]>", re.S)
_ANY_TAG_RE = re.compile(r"<[^>]+>")
# Stricter than _ANY_TAG_RE: requires a letter right after "<" or "</", so
# plain text such as "a < b" is not mistaken for markup.
_MARKUP_TAG_RE = re.compile(r"</?[A-Za-z][^>]*>")
_WS_RE = re.compile(r"\s+")


def _rss_field(item, tag):
    """Return the cleaned text of the first ``<tag>`` element in *item*.

    CDATA wrappers are removed, markup is replaced by spaces, whitespace is
    collapsed and entities are decoded. Returns "" when the element is
    missing.
    """
    name = re.escape(tag)
    # "(?:\s[^>]*)?" allows attributes but prevents "<title" from matching
    # a longer tag name such as "<titles>".
    found = re.search(
        rf"<{name}(?:\s[^>]*)?>(.*?)</{name}\s*>", item, re.S | re.I
    )
    if not found:
        return ""

    text = _CDATA_RE.sub(r"\1", found.group(1))
    text = _ANY_TAG_RE.sub(" ", text)
    text = unescape(_WS_RE.sub(" ", text).strip())

    # Feeds that do not use CDATA ship HTML entity-escaped ("&lt;p&gt;...");
    # decoding above turns that into real markup, so strip it once more and
    # decode the entities that belonged to the embedded HTML.
    if _MARKUP_TAG_RE.search(text):
        text = unescape(_MARKUP_TAG_RE.sub(" ", text))
        text = _WS_RE.sub(" ", text).strip()
    return text


def parse_rss(xml):
    """Extract items from an RSS document using regular expressions.

    Args:
        xml: Feed document as text. A falsy value (``None`` or "") yields
            an empty list.

    Returns:
        A list with one dict per ``<item>`` element, each holding the keys
        "title", "link", "description" and "pubDate"; a field that is not
        present in the item is an empty string.
    """
    if not xml:
        return []

    items = []
    for match in _ITEM_RE.finditer(xml):
        item = match.group(1)
        items.append({
            "title": _rss_field(item, "title"),
            "link": _rss_field(item, "link"),
            "description": _rss_field(item, "description"),
            "pubDate": _rss_field(item, "pubDate"),
        })
    return items
|
||||
|
||||
|
||||
def _build_facts(items):
    """Convert parsed feed items into the fact dicts passed to upsert_facts.

    Items with neither title nor description are skipped, as are items whose
    combined "title - description" text is shorter than 30 characters. The
    description is cut to its first 400 characters.
    """
    facts = []
    for it in items:
        title = it.get("title", "")
        desc = it.get("description", "")
        if not title and not desc:
            continue
        fact = f"{title} - {desc[:400]}".strip()
        if len(fact) < 30:
            continue
        facts.append({"fact": fact, "url": it.get("link", ""), "title": title})
    return facts


def main():
    """Fetch every feed in FEEDS, store its items as facts and print a summary.

    Prints one status line per feed, then the overall total and a JSON
    summary with the number of feeds and the accumulated insert count.
    """
    conn = psycopg2.connect(DSN)
    conn.autocommit = True
    total_inserted = 0

    # try/finally so the connection is released even if a feed raises.
    try:
        for portal, url in FEEDS:
            # fetch returns a (body, status) pair; only the body is used.
            xml, _status = fetch(url, timeout=15)
            if not xml:
                print(f" {portal:20} fetch FAIL")
                continue
            items = parse_rss(xml)
            if not items:
                print(f" {portal:20} parse 0 items")
                continue

            ff = _build_facts(items)

            # The return value is treated as the number of inserted rows
            # (presumably a count - confirm against _common.upsert_facts).
            n = upsert_facts(conn, ff, source_name=f"news_{portal}",
                             category="news_pgz_rss", confidence=0.84)
            total_inserted += n
            print(f" {portal:20} items={len(items):>3} inserted={n:>3}")
            # Pause between successfully processed feeds.
            time.sleep(1)
    finally:
        conn.close()

    print(f"=== TOTAL inserted: {total_inserted} ===")
    print(json.dumps({"feeds": len(FEEDS), "inserted": total_inserted}))
|
||||
|
||||
|
||||
# Run the harvester only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user