Bug hunt V7:
DB: aggressive je_klub=false flag for programs/treninzi/totals (>100K€, no klub_id); 53 ne-klubovi (non-clubs) flagged false (RSS Rijeka ukupni, Stručni rad, Potpora loptačkim, etc.). Frontend (sport2.html): panel back button (← Natrag) with a history stack; window._panelHistory plus the pushPanelState and panelBack functions; closePanel resets the history.
This commit is contained in:
@@ -0,0 +1,93 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Common scraper helpers."""
|
||||
import os, re, time, json, hashlib
|
||||
from urllib.parse import urljoin, urlparse, urlencode, quote
|
||||
import urllib.request
|
||||
from html import unescape
|
||||
import psycopg2
|
||||
from psycopg2.extras import execute_batch
|
||||
|
||||
# SECURITY(review): the fallback below commits a live database password to the
# repository.  Set RINET_DSN in the environment to override it, then rotate the
# credential and remove the literal once every deployment supplies the variable.
DSN = os.environ.get("RINET_DSN") or (
    "host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password=R1net2026!SecureDB#v7"
)

# User-Agent header sent by fetch(); names the bot and a contact address.
UA = "Mozilla/5.0 (Ri.NET Civic Bot 1.0; contact: dradulic@outlook.com)"
|
||||
|
||||
|
||||
def fetch(url, timeout=20, retries=3, binary=False):
    """Download ``url`` and return ``(body, status)``, retrying transient failures.

    Args:
        url: Absolute URL to request.
        timeout: Socket timeout in seconds for each attempt.
        retries: Maximum number of attempts; values below 1 make no request.
        binary: Return the raw bytes when true, decoded text otherwise.

    Returns:
        ``(body, status)`` on success.  ``(None, code)`` when the server
        answers with a 4xx status that a retry cannot fix, and ``(None, 0)``
        when every attempt failed for any other reason.
    """
    # Function-scope imports keep the block self-contained.
    from http.client import HTTPException
    from urllib.error import HTTPError

    for attempt in range(retries):
        try:
            req = urllib.request.Request(url, headers={"User-Agent": UA})
            with urllib.request.urlopen(req, timeout=timeout) as resp:
                raw = resp.read()
                status = resp.status
                charset = resp.headers.get_content_charset()
        except HTTPError as err:
            err.close()
            # A client error will not change on retry; 408 and 429 are the
            # exceptions because they explicitly invite a later attempt.
            if 400 <= err.code < 500 and err.code not in (408, 429):
                return None, err.code
        except (OSError, ValueError, HTTPException):
            # Network failure, timeout, malformed URL or truncated response.
            pass
        else:
            if binary:
                return raw, status
            try:
                # Honour the charset declared by the server; fall back to the
                # previous behaviour (UTF-8) when none is declared.
                return raw.decode(charset or "utf-8", errors="replace"), status
            except LookupError:
                # The server declared a codec name Python does not know.
                return raw.decode("utf-8", errors="replace"), status
        # Linear back-off, but do not stall after the final attempt.
        if attempt < retries - 1:
            time.sleep(2 * (attempt + 1))
    return None, 0
|
||||
|
||||
|
||||
def extract_text(html):
    """Reduce an HTML document to whitespace-normalised plain text.

    ``<script>``, ``<style>``, ``<nav>`` and ``<footer>`` elements are removed
    together with their content, every remaining tag is replaced by a space,
    HTML entities are decoded and runs of whitespace collapse to one space.

    Returns:
        The cleaned text, or ``""`` when ``html`` is empty or ``None``.
    """
    if not html:
        return ""
    remaining = html
    # Same removal order as before: script, style, nav, footer.
    for tag in ("script", "style", "nav", "footer"):
        remaining = re.sub(
            rf"<{tag}[^>]*>.*?</{tag}>", "", remaining, flags=re.S | re.I
        )
    without_tags = re.sub(r"<[^>]+>", " ", remaining)
    decoded = unescape(without_tags)
    return re.sub(r"\s+", " ", decoded).strip()
|
||||
|
||||
|
||||
def extract_title(html):
    """Return the text of the first ``<title>`` element, or ``""`` if none.

    The opening tag may carry attributes (for example ``<title lang="hr">``),
    which the previous pattern did not match.  Entities are decoded and
    whitespace runs collapse to a single space.

    Args:
        html: HTML document text; ``None`` or ``""`` yields ``""``.
    """
    m = re.search(r"<title\b[^>]*>([^<]+)</title>", html or "", re.I)
    if not m:
        return ""
    return re.sub(r"\s+", " ", unescape(m.group(1))).strip()
|
||||
|
||||
|
||||
def chunk_text(text, max_len=800):
    """Split ``text`` into chunks of at most ``max_len`` characters.

    Text that already fits is returned as a single chunk (or ``[]`` when it
    is empty).  Longer text is cut preferably just after a sentence separator
    found in the second half of the current window; chunks of 80 characters
    or fewer are then discarded.

    The boundary rules are intentionally unchanged: upsert_facts() hashes the
    fact text for de-duplication, so moving a boundary would change the hash
    of chunks that were stored earlier.

    Args:
        text: The string to split.
        max_len: Maximum chunk length; must be at least 1.

    Raises:
        ValueError: If ``max_len`` is below 1 (this used to loop forever on
            non-empty text because the window never advanced).
    """
    if max_len < 1:
        raise ValueError("max_len must be a positive integer")
    if len(text) <= max_len:
        return [text] if text else []
    out = []
    start = 0
    while start < len(text):
        end = min(start + max_len, len(text))
        if end < len(text):
            # The first separator type with a hit past the window midpoint
            # wins; order matters and is kept as it was.
            for sep in [". ", "! ", "? ", "\n"]:
                p = text.rfind(sep, start, end)
                if p > start + max_len // 2:
                    end = p + len(sep)
                    break
        out.append(text[start:end].strip())
        start = end
    return [c for c in out if len(c) > 80]
|
||||
|
||||
|
||||
def upsert_facts(conn, facts, source_name, category, confidence=0.85):
    """Insert ``facts`` into ``dabi.knowledge`` and return how many were new.

    Each fact is keyed by the MD5 hex digest of its text (``data_hash``);
    rows whose hash already exists are skipped by ``ON CONFLICT DO NOTHING``.

    The statements are executed one row at a time so that ``cur.rowcount``
    can be summed.  The earlier ``execute_batch`` call sent several
    statements per round trip, after which ``rowcount`` only described the
    last one, so the returned total was wrong.

    Args:
        conn: Open DB-API connection (callers in this file use autocommit).
        facts: Iterable of dicts with a ``"fact"`` key and optional
            ``"url"`` / ``"title"`` keys.
        source_name: Value stored in the ``source`` column.
        category: Value stored in the ``category`` column.
        confidence: Value stored in the ``confidence`` column.

    Returns:
        Number of rows reported as inserted.  On a database error the error
        is logged and the count reached so far is returned (best effort).
    """
    if not facts:
        return 0
    rows = [
        (
            f["fact"],
            source_name,
            category,
            confidence,
            hashlib.md5(f["fact"].encode()).hexdigest(),
            json.dumps({"url": f.get("url", ""), "title": f.get("title", "")}),
        )
        for f in facts
    ]
    sql = ("INSERT INTO dabi.knowledge (fact, source, category, confidence, data_hash, source_refs) "
           "VALUES (%s, %s, %s, %s, %s, %s::jsonb) ON CONFLICT (data_hash) DO NOTHING")
    inserted = 0
    cur = conn.cursor()
    try:
        for row in rows:
            cur.execute(sql, row)
            # 1 for a new row, 0 when the hash already existed.
            inserted += max(cur.rowcount, 0)
    except psycopg2.Error as exc:
        import logging

        logging.getLogger(__name__).warning(
            "upsert_facts failed for source %s: %s", source_name, exc
        )
    finally:
        cur.close()
    return inserted
|
||||
|
||||
|
||||
# Quoted href attribute values; case-insensitive and tolerant of whitespace
# around "=" so that HREF="..." and href = '...' are found too.
HREF_RE = re.compile(r"""href\s*=\s*["']([^"']+)["']""", re.I)


def find_internal_links(html, base_url):
    """Return the same-host http(s) links found in ``html``.

    Each href is entity-decoded, resolved against ``base_url`` and stripped
    of its fragment.  Duplicates are removed and the links are returned in
    document order, so repeated crawls enqueue pages in the same order.

    Args:
        html: HTML document text; empty or ``None`` yields ``[]``.
        base_url: URL the document was fetched from.

    Returns:
        List of absolute URLs whose hostname equals that of ``base_url``.
    """
    if not html:
        return []
    base_host = urlparse(base_url).hostname or ""
    found = {}  # dict keeps insertion order and gives O(1) de-duplication
    for m in HREF_RE.finditer(html):
        # Attribute values are HTML-escaped in the markup ("&amp;" for "&").
        href = unescape(m.group(1)).strip()
        u = urljoin(base_url, href)
        parts = urlparse(u)
        if parts.scheme not in ("http", "https"):
            continue
        if (parts.hostname or "") == base_host:
            found[u.split("#")[0]] = None
    return list(found)
|
||||
@@ -0,0 +1,68 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Gospodarstvo PGZ — luke, brodogradilista, komora."""
|
||||
import sys, json, time
|
||||
sys.path.insert(0, "/opt/pgz-sport/scrapers/harvesters")
|
||||
from _common import (fetch, extract_text, extract_title, chunk_text,
|
||||
upsert_facts, find_internal_links, DSN)
|
||||
from urllib.parse import urlparse
|
||||
import psycopg2
|
||||
|
||||
# Source name -> list of seed URLs crawled by main().
GOSPOD = {
    "luka_rijeka": ["https://www.lukarijeka.hr/"],
    "brodogradiliste_3maj": ["https://www.3maj.hr/"],
    "viktor_lenac": ["https://www.lenac.hr/"],
    "ina_rafinerija": ["https://www.ina.hr/"],
    "rrif": ["https://www.rrif.hr/"],
    "hgk_rijeka": ["https://www.hgk.hr/zk-rijeka"],
    "porin": ["https://www.porin.hr/"],
    "tehnopolis": ["https://www.tehnopolis.hr/"],
    "step_ri": ["https://step-ri.hr/"],
    "luka_ploce": ["https://www.luka-ploce.hr/"],
    "rijeka_gateway": ["https://www.rijeka-gateway.com/"],
}
|
||||
|
||||
|
||||
def crawl(name, urls, max_pages=12):
    """Breadth-first crawl starting at ``urls``, storing page text as facts.

    Args:
        name: Source label; stored with every fact and used as the prefix of
            the per-page title fact.
        urls: Seed URLs.
        max_pages: Maximum number of URLs attempted (failed fetches count).

    Returns:
        Dict with ``name``, ``visited`` (URLs attempted) and ``facts`` (sum
        of the counts returned by upsert_facts()).
    """
    from collections import deque

    conn = psycopg2.connect(DSN)
    conn.autocommit = True
    visited = set()
    queue = deque(urls)
    # Every URL ever enqueued, so repeated links cannot use up the queue cap.
    queued = set(urls)
    facts = 0
    try:
        while queue and len(visited) < max_pages:
            url = queue.popleft()
            if url in visited:
                continue
            visited.add(url)
            html, status = fetch(url, timeout=15)
            if not html or status != 200:
                continue
            title = extract_title(html)
            text = extract_text(html)
            if not text or len(text) < 200:
                continue
            ff = []
            if title and len(title) > 8:
                ff.append({"fact": f"{name} - {title}", "url": url, "title": title})
            ff.extend(
                {"fact": c, "url": url, "title": title}
                for c in chunk_text(text, 800)
                if len(c) > 100
            )
            facts += upsert_facts(conn, ff, source_name=name,
                                  category="gospodarstvo_pgz", confidence=0.86)
            base = urlparse(url).hostname
            for link in find_internal_links(html, url):
                if len(queue) >= 40:
                    break
                if link in queued or (urlparse(link).hostname or "") != base:
                    continue
                queued.add(link)
                queue.append(link)
            # Politeness delay between successfully processed pages.
            time.sleep(0.5)
    finally:
        # Release the connection even when a page raises.
        conn.close()
    return {"name": name, "visited": len(visited), "facts": facts}
|
||||
|
||||
|
||||
def main():
    """Crawl every GOSPOD source, printing one line per source and a total.

    A source whose crawl raises is reported as FAIL and left out of the
    totals; the final line is a JSON summary.
    """
    results = []
    for name, urls in GOSPOD.items():
        try:
            outcome = crawl(name, urls, max_pages=12)
            print(f" {name:25} {outcome['visited']:>3}p {outcome['facts']:>5}f")
            results.append(outcome)
        except Exception as exc:
            print(f" {name:25} FAIL: {str(exc)[:60]}")
    total = 0
    for outcome in results:
        total += outcome.get("facts", 0)
    print(f"=== TOTAL: {total} ===")
    print(json.dumps({"gospod_count": len(results), "total_facts": total}))


if __name__ == "__main__":
    main()
|
||||
@@ -0,0 +1,94 @@
|
||||
#!/usr/bin/env python3
|
||||
"""JLS PGZ — 36 jedinica."""
|
||||
import sys, json, time
|
||||
sys.path.insert(0, "/opt/pgz-sport/scrapers/harvesters")
|
||||
from _common import (fetch, extract_text, extract_title, chunk_text,
|
||||
upsert_facts, find_internal_links, DSN)
|
||||
from urllib.parse import urlparse
|
||||
import psycopg2
|
||||
|
||||
# Local-government unit name -> root URL of its official site.  The last
# entry is the county site itself.
JLS_PGZ = {
    "Rijeka": "https://www.rijeka.hr/",
    "Opatija": "https://www.opatija.hr/",
    "Crikvenica": "https://www.crikvenica.hr/",
    "Krk": "https://www.grad-krk.hr/",
    "Kraljevica": "https://www.kraljevica.hr/",
    "Rab": "https://www.rab.hr/",
    "Cres": "https://www.cres.hr/",
    "Mali_Losinj": "https://www.mali-losinj.hr/",
    "Delnice": "https://www.delnice.hr/",
    "Vrbovsko": "https://www.vrbovsko.hr/",
    "Cabar": "https://www.cabar.hr/",
    "Bakar": "https://www.bakar.hr/",
    "Kastav": "https://www.kastav.hr/",
    "Novi_Vinodolski": "https://www.novi-vinodolski.hr/",
    "Viskovo": "https://www.opcina-viskovo.hr/",
    "Klana": "https://www.klana.hr/",
    "Moscenicka_Draga": "https://www.moscenicka-draga.hr/",
    "Lovran": "https://www.opcinalovran.hr/",
    "Matulji": "https://www.matulji.hr/",
    "Omisalj": "https://www.omisalj.hr/",
    "Punat": "https://www.punat.hr/",
    "Vrbnik": "https://www.vrbnik.hr/",
    "Baska": "https://www.baska.hr/",
    "Dobrinj": "https://www.opcina-dobrinj.hr/",
    "Malinska": "https://www.malinska.hr/",
    "Jelenje": "https://www.jelenje.hr/",
    "Kostrena": "https://www.kostrena.hr/",
    "Cavle": "https://www.cavle.hr/",
    "Lopar": "https://www.opcina-lopar.hr/",
    "Brod_Moravice": "https://www.brod-moravice.hr/",
    "Mrkopalj": "https://www.mrkopalj.hr/",
    "Ravna_Gora": "https://www.ravnagora.hr/",
    "Lokve": "https://www.lokve.hr/",
    "Skrad": "https://www.skrad.hr/",
    "Fuzine": "https://www.fuzine.hr/",
    "Vinodolska": "https://www.vinodolska-opcina.hr/",
    "PGZ_zupanija": "https://www.pgz.hr/",
}
|
||||
|
||||
|
||||
def crawl(name, root, max_pages=25):
    """Breadth-first crawl of one site starting at ``root``.

    Only links whose hostname equals that of ``root`` are followed.

    Args:
        name: Unit name; lower-cased into the stored source label
            ``jls_pgz_<name>`` and used as the title-fact prefix.
        root: Seed URL.
        max_pages: Maximum number of URLs attempted (failed fetches count).

    Returns:
        Dict with ``name``, ``visited`` (URLs attempted) and ``facts`` (sum
        of the counts returned by upsert_facts()).
    """
    from collections import deque

    conn = psycopg2.connect(DSN)
    conn.autocommit = True
    visited = set()
    queue = deque([root])
    # Every URL ever enqueued, so repeated links cannot use up the queue cap.
    queued = {root}
    facts = 0
    base_host = urlparse(root).hostname or ""
    try:
        while queue and len(visited) < max_pages:
            url = queue.popleft()
            if url in visited:
                continue
            visited.add(url)
            html, status = fetch(url, timeout=15)
            if not html or status != 200:
                continue
            title = extract_title(html)
            text = extract_text(html)
            if not text or len(text) < 200:
                continue
            ff = []
            if title and len(title) > 8:
                ff.append({"fact": f"{name} - {title}", "url": url, "title": title})
            ff.extend(
                {"fact": c, "url": url, "title": title}
                for c in chunk_text(text, 800)
                if len(c) > 100
            )
            facts += upsert_facts(conn, ff, source_name=f"jls_pgz_{name.lower()}",
                                  category="jls_pgz_official", confidence=0.90)
            for link in find_internal_links(html, url):
                if len(queue) >= 200:
                    break
                if link in queued or (urlparse(link).hostname or "") != base_host:
                    continue
                queued.add(link)
                queue.append(link)
            # Politeness delay between successfully processed pages.
            time.sleep(0.4)
    finally:
        # Release the connection even when a page raises.
        conn.close()
    return {"name": name, "visited": len(visited), "facts": facts}
|
||||
|
||||
|
||||
def main():
    """Crawl every JLS_PGZ site, printing one line per site and a total.

    A site whose crawl raises is reported as FAIL and left out of the
    totals; the final line is a JSON summary.
    """
    results = []
    for name, url in JLS_PGZ.items():
        try:
            outcome = crawl(name, url, max_pages=25)
            results.append(outcome)
            print(f" {name:25} {outcome['visited']:>3}p {outcome['facts']:>5}f")
        except Exception as exc:
            print(f" {name:25} FAIL: {str(exc)[:60]}")
    total = 0
    for outcome in results:
        total += outcome.get("facts", 0)
    print(f"=== TOTAL: {total} ===")
    print(json.dumps({"jls_count": len(results), "total_facts": total}))


if __name__ == "__main__":
    main()
|
||||
@@ -0,0 +1,69 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Kultura PGZ — muzeji, kazalista, knjiznice, festival."""
|
||||
import sys, json, time
|
||||
sys.path.insert(0, "/opt/pgz-sport/scrapers/harvesters")
|
||||
from _common import (fetch, extract_text, extract_title, chunk_text,
|
||||
upsert_facts, find_internal_links, DSN)
|
||||
from urllib.parse import urlparse
|
||||
import psycopg2
|
||||
|
||||
# Source name -> list of seed URLs crawled by main().
KULTURA = {
    "muzej_pomorski": ["https://ppmhp.hr/"],
    "muzej_grada_rijeke": ["https://www.muzej-rijeka.hr/"],
    "muzej_marine": ["https://www.maritime-museum-rijeka.com/"],
    "muzej_grada_krka": ["https://www.gradkrk.hr/"],
    "kazalist_zajca": ["https://www.hnk-zajc.hr/"],
    "knjiznica_rijeka": ["https://gkri.hr/"],
    "knjiznica_opatija": ["https://www.gradskaknjiznica-opatija.hr/"],
    "rijecki_karneval": ["https://www.rijecki-karneval.hr/"],
    "rijeka_ekc2020": ["https://rijeka2020.eu/"],
    "art_kino_rijeka": ["https://art-kino.hr/"],
    "filodrammatica": ["https://www.filodrammatica.eu/"],
    "drustvo_pisaca": ["https://drustvohrvatskihknjizevnika.hr/"],
}
|
||||
|
||||
|
||||
def crawl(name, urls, max_pages=15):
    """Breadth-first crawl starting at ``urls``, storing page text as facts.

    Args:
        name: Source label; stored with every fact and used as the prefix of
            the per-page title fact.
        urls: Seed URLs.
        max_pages: Maximum number of URLs attempted (failed fetches count).

    Returns:
        Dict with ``name``, ``visited`` (URLs attempted) and ``facts`` (sum
        of the counts returned by upsert_facts()).
    """
    from collections import deque

    conn = psycopg2.connect(DSN)
    conn.autocommit = True
    visited = set()
    queue = deque(urls)
    # Every URL ever enqueued, so repeated links cannot use up the queue cap.
    queued = set(urls)
    facts = 0
    try:
        while queue and len(visited) < max_pages:
            url = queue.popleft()
            if url in visited:
                continue
            visited.add(url)
            html, status = fetch(url, timeout=15)
            if not html or status != 200:
                continue
            title = extract_title(html)
            text = extract_text(html)
            if not text or len(text) < 200:
                continue
            ff = []
            if title and len(title) > 8:
                ff.append({"fact": f"{name} - {title}", "url": url, "title": title})
            ff.extend(
                {"fact": c, "url": url, "title": title}
                for c in chunk_text(text, 800)
                if len(c) > 100
            )
            facts += upsert_facts(conn, ff, source_name=name,
                                  category="kultura_pgz", confidence=0.86)
            base = urlparse(url).hostname
            for link in find_internal_links(html, url):
                if len(queue) >= 50:
                    break
                if link in queued or (urlparse(link).hostname or "") != base:
                    continue
                queued.add(link)
                queue.append(link)
            # Politeness delay between successfully processed pages.
            time.sleep(0.5)
    finally:
        # Release the connection even when a page raises.
        conn.close()
    return {"name": name, "visited": len(visited), "facts": facts}
|
||||
|
||||
|
||||
def main():
    """Crawl every KULTURA source, printing one line per source and a total.

    A source whose crawl raises is reported as FAIL and left out of the
    totals; the final line is a JSON summary.
    """
    results = []
    for name, urls in KULTURA.items():
        try:
            outcome = crawl(name, urls, max_pages=15)
            print(f" {name:25} {outcome['visited']:>3}p {outcome['facts']:>5}f")
            results.append(outcome)
        except Exception as exc:
            print(f" {name:25} FAIL: {str(exc)[:60]}")
    total = 0
    for outcome in results:
        total += outcome.get("facts", 0)
    print(f"=== TOTAL: {total} ===")
    print(json.dumps({"kultura_count": len(results), "total_facts": total}))


if __name__ == "__main__":
    main()
|
||||
@@ -0,0 +1,73 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Lokalni news RSS PGZ."""
|
||||
import sys, json, time, re
|
||||
sys.path.insert(0, "/opt/pgz-sport/scrapers/harvesters")
|
||||
from _common import (fetch, extract_text, chunk_text, upsert_facts, DSN)
|
||||
from html import unescape
|
||||
import psycopg2
|
||||
|
||||
# (portal label, RSS feed URL) pairs processed in this order by main().
FEEDS = [
    ("novi_list", "https://www.novilist.hr/rss/rijeka.xml"),
    ("novi_list_pgz", "https://www.novilist.hr/rss/regija.xml"),
    ("rijeka_danas", "https://rijekadanas.com/feed/"),
    ("rijeka_in", "https://rijekain.hr/feed/"),
    ("primorske_novice", "https://primorskenovice.hr/feed/"),
    ("kvarner_news", "https://www.kvarner.news/feed/"),
    ("oradio", "https://otvoreniradio.hr/rss/sve.xml"),
    ("rijeka_today", "https://www.rijekatoday.com/feed/"),
]
|
||||
|
||||
|
||||
def parse_rss(xml):
    """Extract title, link, description and pubDate from each RSS ``<item>``.

    Parsing is regex-based.  ``<item>`` tags with attributes (for example
    ``<item rdf:about="...">``) are matched as well, and child tags are
    matched on a word boundary so ``<link>`` cannot match a longer tag name.

    Args:
        xml: Feed document text; empty or ``None`` yields ``[]``.

    Returns:
        List of dicts with the keys ``title``, ``link``, ``description`` and
        ``pubDate``; a missing element is returned as ``""``.
    """
    if not xml:
        return []

    def _grab(item, tag):
        """Return the cleaned text of the first ``tag`` element in ``item``."""
        mt = re.search(rf"<{tag}\b[^>]*>(.*?)</{tag}>", item, re.S | re.I)
        if not mt:
            return ""
        t = mt.group(1)
        # Unwrap CDATA first so markup inside it is stripped like any other.
        t = re.sub(r"<!\[CDATA\[(.*?)\]\]>", r"\1", t, flags=re.S)
        t = re.sub(r"<[^>]+>", " ", t)
        return unescape(re.sub(r"\s+", " ", t).strip())

    items = []
    for m in re.finditer(r"<item\b[^>]*>(.*?)</item>", xml, re.S | re.I):
        item = m.group(1)
        items.append({"title": _grab(item, "title"), "link": _grab(item, "link"),
                      "description": _grab(item, "description"),
                      "pubDate": _grab(item, "pubDate")})
    return items
|
||||
|
||||
|
||||
def main():
    """Fetch every feed in FEEDS, store its items as facts and print a summary.

    One status line is printed per feed; feeds that cannot be fetched or
    yield no items are reported and skipped.  The final line is a JSON
    summary.  The database connection is closed even if a feed raises.
    """
    conn = psycopg2.connect(DSN)
    conn.autocommit = True
    total_inserted = 0
    try:
        for portal, url in FEEDS:
            xml, _status = fetch(url, timeout=15)
            if not xml:
                print(f" {portal:20} fetch FAIL")
                continue
            items = parse_rss(xml)
            if not items:
                print(f" {portal:20} parse 0 items")
                continue

            ff = []
            for it in items:
                title = it.get("title", "")
                desc = it.get("description", "")
                if not title and not desc:
                    continue
                # The fact text is hashed for de-duplication, so its format
                # is kept exactly as before.
                fact = f"{title} - {desc[:400]}".strip()
                if len(fact) < 30:
                    continue
                ff.append({"fact": fact, "url": it.get("link", ""), "title": title})

            n = upsert_facts(conn, ff, source_name=f"news_{portal}",
                             category="news_pgz_rss", confidence=0.84)
            total_inserted += n
            print(f" {portal:20} items={len(items):>3} inserted={n:>3}")
            time.sleep(1)
    finally:
        conn.close()

    print(f"=== TOTAL inserted: {total_inserted} ===")
    print(json.dumps({"feeds": len(FEEDS), "inserted": total_inserted}))


if __name__ == "__main__":
    main()
|
||||
@@ -0,0 +1,69 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Obrazovanje PGZ — Sveuciliste + fakulteti + skole."""
|
||||
import sys, json, time
|
||||
sys.path.insert(0, "/opt/pgz-sport/scrapers/harvesters")
|
||||
from _common import (fetch, extract_text, extract_title, chunk_text,
|
||||
upsert_facts, find_internal_links, DSN)
|
||||
from urllib.parse import urlparse
|
||||
import psycopg2
|
||||
|
||||
# Source name -> list of seed URLs crawled by main().
EDU = {
    "uniri": ["https://www.uniri.hr/"],
    "ffri": ["https://www.ffri.uniri.hr/"],
    "tfr": ["https://www.tehnickifakultet.uniri.hr/"],
    "pfri": ["https://www.pfri.uniri.hr/"],
    "med_fri": ["https://medri.uniri.hr/"],
    "efri": ["https://www.efri.uniri.hr/"],
    "pravniri": ["https://www.pravri.uniri.hr/"],
    "ufri": ["https://www.ufri.uniri.hr/"],
    "akademija_pri": ["https://www.apuri.uniri.hr/"],
    # NOTE(review): same URL as "ufri" above, so the site is crawled twice
    # under two source names; confirm whether this is intended.
    "ucitelji_ri": ["https://www.ufri.uniri.hr/"],
    "vss_ri": ["https://www.veleri.hr/"],
    "rkc_pgz": ["https://www.rkcrijeka.hr/"],
}
|
||||
|
||||
|
||||
def crawl(name, urls, max_pages=15):
    """Breadth-first crawl starting at ``urls``, storing page text as facts.

    Args:
        name: Source label; stored with every fact and used as the prefix of
            the per-page title fact.
        urls: Seed URLs.
        max_pages: Maximum number of URLs attempted (failed fetches count).

    Returns:
        Dict with ``name``, ``visited`` (URLs attempted) and ``facts`` (sum
        of the counts returned by upsert_facts()).
    """
    from collections import deque

    conn = psycopg2.connect(DSN)
    conn.autocommit = True
    visited = set()
    queue = deque(urls)
    # Every URL ever enqueued, so repeated links cannot use up the queue cap.
    queued = set(urls)
    facts = 0
    try:
        while queue and len(visited) < max_pages:
            url = queue.popleft()
            if url in visited:
                continue
            visited.add(url)
            html, status = fetch(url, timeout=15)
            if not html or status != 200:
                continue
            title = extract_title(html)
            text = extract_text(html)
            if not text or len(text) < 200:
                continue
            ff = []
            if title and len(title) > 8:
                ff.append({"fact": f"{name} - {title}", "url": url, "title": title})
            ff.extend(
                {"fact": c, "url": url, "title": title}
                for c in chunk_text(text, 800)
                if len(c) > 100
            )
            facts += upsert_facts(conn, ff, source_name=name,
                                  category="obrazovanje_pgz", confidence=0.88)
            base = urlparse(url).hostname
            for link in find_internal_links(html, url):
                if len(queue) >= 40:
                    break
                if link in queued or (urlparse(link).hostname or "") != base:
                    continue
                queued.add(link)
                queue.append(link)
            # Politeness delay between successfully processed pages.
            time.sleep(0.5)
    finally:
        # Release the connection even when a page raises.
        conn.close()
    return {"name": name, "visited": len(visited), "facts": facts}
|
||||
|
||||
|
||||
def main():
    """Crawl every EDU source, printing one line per source and a total.

    A source whose crawl raises is reported as FAIL and left out of the
    totals; the final line is a JSON summary.
    """
    results = []
    for name, urls in EDU.items():
        try:
            outcome = crawl(name, urls, max_pages=15)
            print(f" {name:25} {outcome['visited']:>3}p {outcome['facts']:>5}f")
            results.append(outcome)
        except Exception as exc:
            print(f" {name:25} FAIL: {str(exc)[:60]}")
    total = 0
    for outcome in results:
        total += outcome.get("facts", 0)
    print(f"=== TOTAL: {total} ===")
    print(json.dumps({"edu_count": len(results), "total_facts": total}))


if __name__ == "__main__":
    main()
|
||||
@@ -0,0 +1,70 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Servisne usluge PGZ — komunalije, transport."""
|
||||
import sys, json, time
|
||||
sys.path.insert(0, "/opt/pgz-sport/scrapers/harvesters")
|
||||
from _common import (fetch, extract_text, extract_title, chunk_text,
|
||||
upsert_facts, find_internal_links, DSN)
|
||||
from urllib.parse import urlparse
|
||||
import psycopg2
|
||||
|
||||
# Source name -> list of seed URLs crawled by main().
SERVIS = {
    "kd_komunalc": ["https://www.kd-komunalac.hr/"],
    "kd_kozala": ["https://www.kd-kozala.hr/"],
    "rijekapromet": ["https://www.rijekapromet.hr/"],
    "vodovod_pgz": ["https://www.kdvik-rijeka.hr/"],
    "rgcc_plin": ["https://www.energo.hr/"],
    # NOTE(review): the key says "rijeka" but the URL path is
    # "elektrodalmacija"; confirm this is the intended page.
    "hep_rijeka": ["https://www.hep.hr/elektrodalmacija/"],
    "rijeka_parking": ["https://www.rijekaplus.hr/"],
    "ana_aerodrom": ["https://rijeka-airport.hr/"],
    "rijeka_busplus": ["https://www.autotrans.hr/"],
    "jadrolinija": ["https://www.jadrolinija.hr/"],
    "kbc_rijeka": ["https://www.kbc-rijeka.hr/"],
    "thalassotherapia": ["https://thalassotherapia-opatija.hr/"],
    "klinika_opatija": ["https://www.opatija.medicus.hr/"],
}
|
||||
|
||||
|
||||
def crawl(name, urls, max_pages=15):
    """Breadth-first crawl starting at ``urls``, storing page text as facts.

    Args:
        name: Source label; stored with every fact and used as the prefix of
            the per-page title fact.
        urls: Seed URLs.
        max_pages: Maximum number of URLs attempted (failed fetches count).

    Returns:
        Dict with ``name``, ``visited`` (URLs attempted) and ``facts`` (sum
        of the counts returned by upsert_facts()).
    """
    from collections import deque

    conn = psycopg2.connect(DSN)
    conn.autocommit = True
    visited = set()
    queue = deque(urls)
    # Every URL ever enqueued, so repeated links cannot use up the queue cap.
    queued = set(urls)
    facts = 0
    try:
        while queue and len(visited) < max_pages:
            url = queue.popleft()
            if url in visited:
                continue
            visited.add(url)
            html, status = fetch(url, timeout=15)
            if not html or status != 200:
                continue
            title = extract_title(html)
            text = extract_text(html)
            if not text or len(text) < 200:
                continue
            ff = []
            if title and len(title) > 8:
                ff.append({"fact": f"{name} - {title}", "url": url, "title": title})
            ff.extend(
                {"fact": c, "url": url, "title": title}
                for c in chunk_text(text, 800)
                if len(c) > 100
            )
            facts += upsert_facts(conn, ff, source_name=name,
                                  category="servisne_pgz", confidence=0.86)
            base = urlparse(url).hostname
            for link in find_internal_links(html, url):
                if len(queue) >= 40:
                    break
                if link in queued or (urlparse(link).hostname or "") != base:
                    continue
                queued.add(link)
                queue.append(link)
            # Politeness delay between successfully processed pages.
            time.sleep(0.5)
    finally:
        # Release the connection even when a page raises.
        conn.close()
    return {"name": name, "visited": len(visited), "facts": facts}
|
||||
|
||||
|
||||
def main():
    """Crawl every SERVIS source, printing one line per source and a total.

    Each crawl is capped at 12 pages here.  A source whose crawl raises is
    reported as FAIL and left out of the totals; the final line is a JSON
    summary.
    """
    results = []
    for name, urls in SERVIS.items():
        try:
            outcome = crawl(name, urls, max_pages=12)
            print(f" {name:25} {outcome['visited']:>3}p {outcome['facts']:>5}f")
            results.append(outcome)
        except Exception as exc:
            print(f" {name:25} FAIL: {str(exc)[:60]}")
    total = 0
    for outcome in results:
        total += outcome.get("facts", 0)
    print(f"=== TOTAL: {total} ===")
    print(json.dumps({"servis_count": len(results), "total_facts": total}))


if __name__ == "__main__":
    main()
|
||||
@@ -0,0 +1,69 @@
|
||||
#!/usr/bin/env python3
|
||||
"""TZ Kvarner + sve TZ PGZ."""
|
||||
import sys, json, time
|
||||
sys.path.insert(0, "/opt/pgz-sport/scrapers/harvesters")
|
||||
from _common import (fetch, extract_text, extract_title, chunk_text,
|
||||
upsert_facts, find_internal_links, DSN)
|
||||
from urllib.parse import urlparse
|
||||
import psycopg2
|
||||
|
||||
# Source name -> list of seed URLs crawled by main().
TZ_SITES = {
    "tz_kvarner": ["https://www.kvarner.hr/"],
    "tz_rijeka": ["https://www.visitrijeka.hr/"],
    "tz_opatija": ["https://www.visitopatija.com/"],
    "tz_crikvenica": ["https://www.tz-crikvenica.hr/"],
    "tz_krk": ["https://www.krk.hr/"],
    "tz_rab": ["https://www.rab-visit.com/"],
    "tz_cres": ["https://www.tzg-cres.hr/"],
    "tz_losinj": ["https://www.visitlosinj.hr/"],
    "tz_gorski_kotar": ["https://www.gorskikotar.hr/"],
    "tz_baska": ["https://www.tz-baska.hr/"],
    "tz_lovran": ["https://www.tz-lovran.hr/"],
    "tz_kastav": ["https://www.tz-kastav.hr/"],
}
|
||||
|
||||
|
||||
def crawl(name, urls, max_pages=20):
    """Breadth-first crawl starting at ``urls``, storing page text as facts.

    Args:
        name: Source label; stored with every fact and used as the prefix of
            the per-page title fact.
        urls: Seed URLs.
        max_pages: Maximum number of URLs attempted (failed fetches count).

    Returns:
        Dict with ``name``, ``visited`` (URLs attempted) and ``facts`` (sum
        of the counts returned by upsert_facts()).
    """
    from collections import deque

    conn = psycopg2.connect(DSN)
    conn.autocommit = True
    visited = set()
    queue = deque(urls)
    # Every URL ever enqueued, so repeated links cannot use up the queue cap.
    queued = set(urls)
    facts = 0
    try:
        while queue and len(visited) < max_pages:
            url = queue.popleft()
            if url in visited:
                continue
            visited.add(url)
            html, status = fetch(url, timeout=15)
            if not html or status != 200:
                continue
            title = extract_title(html)
            text = extract_text(html)
            if not text or len(text) < 200:
                continue
            ff = []
            if title and len(title) > 8:
                ff.append({"fact": f"{name} - {title}", "url": url, "title": title})
            ff.extend(
                {"fact": c, "url": url, "title": title}
                for c in chunk_text(text, 800)
                if len(c) > 100
            )
            facts += upsert_facts(conn, ff, source_name=name,
                                  category="turizam_pgz", confidence=0.85)
            base = urlparse(url).hostname
            for link in find_internal_links(html, url):
                if len(queue) >= 80:
                    break
                if link in queued or (urlparse(link).hostname or "") != base:
                    continue
                queued.add(link)
                queue.append(link)
            # Politeness delay between successfully processed pages.
            time.sleep(0.4)
    finally:
        # Release the connection even when a page raises.
        conn.close()
    return {"name": name, "visited": len(visited), "facts": facts}
|
||||
|
||||
|
||||
def main():
    """Crawl every TZ_SITES source, printing one line per source and a total.

    A source whose crawl raises is reported as FAIL and left out of the
    totals; the final line is a JSON summary.
    """
    results = []
    for name, urls in TZ_SITES.items():
        try:
            outcome = crawl(name, urls, max_pages=20)
            print(f" {name:25} {outcome['visited']:>3}p {outcome['facts']:>5}f")
            results.append(outcome)
        except Exception as exc:
            print(f" {name:25} FAIL: {str(exc)[:60]}")
    total = 0
    for outcome in results:
        total += outcome.get("facts", 0)
    print(f"=== TOTAL: {total} ===")
    print(json.dumps({"tz_count": len(results), "total_facts": total}))


if __name__ == "__main__":
    main()
|
||||
@@ -0,0 +1,80 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Wikipedia deep PGZ encyclopedia."""
|
||||
import sys, json, time
|
||||
sys.path.insert(0, "/opt/pgz-sport/scrapers/harvesters")
|
||||
from _common import chunk_text, upsert_facts, DSN, UA
|
||||
from urllib.parse import urlencode, quote
|
||||
import urllib.request
|
||||
import psycopg2
|
||||
|
||||
# MediaWiki API endpoints queried by main() for each title.
API_HR = "https://hr.wikipedia.org/w/api.php"
API_EN = "https://en.wikipedia.org/w/api.php"


def wiki_extract(api, title, timeout=15):
    """Fetch the plain-text extract of ``title`` from a MediaWiki API.

    Args:
        api: Base URL of the ``api.php`` endpoint.
        title: Page title to look up; redirects are followed by the API.
        timeout: Socket timeout in seconds.

    Returns:
        The extract of the first page in the response (``""`` when the page
        has no ``extract`` field).  ``None`` when that page has id ``"-1"``,
        when the response lists no pages, or when the request or decoding
        raises any exception.
    """
    query = urlencode({
        "action": "query",
        "prop": "extracts",
        "explaintext": "1",
        "redirects": "1",
        "format": "json",
        "titles": title,
    })
    request_url = api + "?" + query
    try:
        request = urllib.request.Request(request_url, headers={"User-Agent": UA})
        with urllib.request.urlopen(request, timeout=timeout) as response:
            payload = json.loads(response.read())
        pages = payload.get("query", {}).get("pages", {})
        first = next(iter(pages.items()), None)
        if first is None:
            return None
        page_id, page = first
        if page_id == "-1":
            return None
        return page.get("extract", "")
    except Exception:
        return None
|
||||
|
||||
|
||||
# Fact category -> page titles looked up by main() on both API endpoints.
# "Susak" and "Unije" appear under both wiki_pgz_grad and wiki_pgz_otok, so
# they are requested once per category.
PAGES = {
    "wiki_pgz_grad": [
        "Rijeka", "Opatija", "Crikvenica", "Krk_(grad)", "Kraljevica",
        "Rab_(grad)", "Cres_(grad)", "Mali_Losinj", "Delnice", "Vrbovsko",
        "Cabar", "Bakar", "Kastav", "Novi_Vinodolski", "Susak", "Unije",
    ],
    "wiki_pgz_opcina": [
        "Opcina_Viskovo", "Opcina_Klana", "Opcina_Lovran", "Opcina_Matulji",
        "Opcina_Omisalj", "Opcina_Punat", "Opcina_Vrbnik", "Opcina_Baska",
        "Opcina_Dobrinj", "Opcina_Jelenje", "Opcina_Kostrena", "Opcina_Cavle",
        "Opcina_Lopar", "Opcina_Brod_Moravice", "Opcina_Mrkopalj",
        "Opcina_Ravna_Gora", "Opcina_Lokve", "Opcina_Skrad", "Opcina_Fuzine",
    ],
    "wiki_pgz_otok": [
        "Krk", "Cres", "Losinj", "Rab", "Susak", "Unije", "Ilovik", "Ist",
        "Goli_otok", "Sveti_Grgur",
    ],
    "wiki_pgz_povijest": [
        "Vinodolski_zakonik", "Frankopani", "Krcki_knezovi",
        "Liburnija", "Liburni", "Trsat", "Tvrdjava_Trsat",
        "Slobodna_Drzava_Rijeka", "Rijecka_rezolucija",
    ],
    "wiki_pgz_kultura": [
        "Glagoljica", "Bascanska_ploca", "Rijecki_karneval",
        "Halubajski_zvoncari", "Hrvatsko_narodno_kazaliste_Ivana_pl._Zajca",
    ],
    "wiki_pgz_priroda": [
        "Ucka", "Risnjak", "Park_prirode_Ucka",
        "Nacionalni_park_Risnjak", "Velebit", "Kvarnerski_zaljev",
    ],
    "wiki_pgz_gospodarstvo": [
        "Luka_Rijeka", "Brodogradiliste_3._maj",
        "Brodogradiliste_Viktor_Lenac", "Rafinerija_nafte_Rijeka",
    ],
    "wiki_pgz_obrazovanje": [
        "Sveuciliste_u_Rijeci", "Tehnicki_fakultet_u_Rijeci",
        "Pomorski_fakultet_u_Rijeci", "Filozofski_fakultet_u_Rijeci",
        "Medicinski_fakultet_u_Rijeci",
    ],
    "wiki_pgz_osobe": ["Janica_Kostelic", "Ivica_Kostelic", "Janko_Polic_Kamov"],
}
|
||||
|
||||
|
||||
def main():
    """Store Wikipedia extracts for every title in PAGES as facts.

    Each title is looked up on the Croatian and then the English endpoint.
    Extracts shorter than 250 characters are skipped; the rest are chunked
    and inserted under the category they are listed in.  One line is printed
    per category, followed by a total and a JSON summary.  The database
    connection is closed even if a lookup or insert raises.
    """
    conn = psycopg2.connect(DSN)
    conn.autocommit = True
    total = 0
    found = 0
    try:
        for category, titles in PAGES.items():
            cnt = 0
            for title in titles:
                for api, lang in [(API_HR, "hr"), (API_EN, "en")]:
                    text = wiki_extract(api, title)
                    if not text or len(text) < 250:
                        continue
                    found += 1
                    facts = [
                        {"fact": c,
                         "url": f"https://{lang}.wikipedia.org/wiki/{quote(title)}",
                         "title": title}
                        for c in chunk_text(text, 700)
                        if len(c) > 100
                    ]
                    n = upsert_facts(conn, facts, source_name=f"wikipedia_pgz_{lang}",
                                     category=category, confidence=0.88)
                    total += n
                    cnt += n
                    # Pause after each stored page before the next request.
                    time.sleep(0.4)
            print(f" {category:25} +{cnt:>5}f")
    finally:
        conn.close()
    print(f"=== TOTAL pages={found} facts={total} ===")
    print(json.dumps({"pages_found": found, "total_facts": total}))


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user