Crisis V7 MEGA: sufinanciranje_sport + panel + CRM auth
DB: - pgz_sport.sufinanciranje_sport.je_klub flag (RSS programi/totals false) - pgz_sport.sufinanciranje_sport.klub_id matched Endpoints: - /v2/potpore/by-year: samo_klubovi=True default + davatelj filter Frontend: - sport2.html PANEL FORCE HIDE CSS (right:-100vw default) - crm_v2.html: redirect to /login only on actual 401, not on page load
This commit is contained in:
@@ -0,0 +1,70 @@
|
||||
#!/usr/bin/env python3
|
||||
"""data.gov.hr — Open Data PGZ."""
|
||||
import sys, json, time
|
||||
sys.path.insert(0, "/opt/pgz-sport/scrapers/harvesters")
|
||||
from _common import upsert_facts, DSN, UA
|
||||
import urllib.request
|
||||
import psycopg2
|
||||
|
||||
API = "https://data.gov.hr/api/3/action"
|
||||
|
||||
|
||||
def search(query, rows=50):
    """Run a ``package_search`` query against the data.gov.hr action API.

    Args:
        query: Free-text search term; URL-quoted before being sent.
        rows: Value of the ``rows`` query parameter.

    Returns:
        The list stored under ``result.results`` in the JSON reply, or an
        empty list if the request or the decoding fails (the error is
        printed, not raised).
    """
    encoded_query = urllib.parse.quote(query)
    endpoint = f"{API}/package_search?q={encoded_query}&rows={rows}"
    try:
        request = urllib.request.Request(endpoint, headers={"User-Agent": UA})
        with urllib.request.urlopen(request, timeout=30) as response:
            payload = json.loads(response.read())
        result_section = payload.get("result", {})
        return result_section.get("results", [])
    except Exception as e:
        # Best-effort harvester: report the problem and yield no packages.
        print(f"search err: {e}")
        return []
|
||||
|
||||
import urllib.parse
|
||||
|
||||
|
||||
def _package_fact(pkg):
    """Turn one search-result package into a fact dict, or None if too short.

    Fields that are absent *or* JSON ``null`` are treated as empty, so a
    package without description, organization or tags no longer raises.
    """
    title = pkg.get("title") or ""
    notes = (pkg.get("notes") or "")[:600]
    org = (pkg.get("organization") or {}).get("title") or ""
    tags = ", ".join((t.get("name") or "") for t in (pkg.get("tags") or []))

    fact = f"[OpenData] {title} | Org: {org} | {notes} | Tags: {tags}"[:1200]
    # The fixed labels alone are ~30 characters; skip near-empty packages.
    if len(fact) <= 50:
        return None
    return {
        "fact": fact,
        "url": f"https://data.gov.hr/dataset/{pkg.get('name') or ''}",
        "title": title,
    }


def main():
    """Search data.gov.hr for PGZ-related datasets and store them as facts.

    Each query term is searched once; packages are de-duplicated across
    queries by their ``id``. Per-query progress, a total line and a final
    JSON summary are printed.
    """
    conn = psycopg2.connect(DSN)
    conn.autocommit = True

    queries = [
        "Primorsko-goranska", "Rijeka", "Opatija", "Crikvenica", "Krk",
        "Cres", "Lošinj", "Rab", "Delnice", "Bakar", "Kvarner",
    ]

    total_inserted = 0
    seen = set()

    try:
        for q in queries:
            results = search(q, rows=50)
            ff = []
            for pkg in results:
                pkg_id = pkg.get("id", "")
                if pkg_id in seen:
                    continue
                seen.add(pkg_id)

                fact = _package_fact(pkg)
                if fact is not None:
                    ff.append(fact)

            # NOTE(review): the return value is reported as "new facts" below;
            # presumably upsert_facts returns the number of rows stored.
            n = upsert_facts(conn, ff, source_name="data_gov_hr_pgz",
                             category="opendata_pgz", confidence=0.85)
            total_inserted += n
            print(f" query='{q}' -> {len(results)} results, {n} new facts")
            time.sleep(1)
    finally:
        # Release the connection even if a query or an upsert fails.
        conn.close()

    print(f"=== TOTAL: {total_inserted} ===")
    print(json.dumps({"queries": len(queries), "total_facts": total_inserted}))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -0,0 +1,70 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Public events i festivali PGZ."""
|
||||
import sys, json, time
|
||||
sys.path.insert(0, "/opt/pgz-sport/scrapers/harvesters")
|
||||
from _common import (fetch, extract_text, extract_title, chunk_text,
|
||||
upsert_facts, find_internal_links, DSN)
|
||||
from urllib.parse import urlparse
|
||||
import psycopg2
|
||||
|
||||
EVENTS = {
|
||||
"rijeka_karneval": ["https://www.rijecki-karneval.hr/", "https://rijekakarneval.hr/"],
|
||||
"ljeto_kvarner": ["https://www.kvarner-ljeto.hr/"],
|
||||
"rijeka_film_fest": ["https://www.riff.hr/"],
|
||||
"vinski_festival_op": ["https://opatijawine.hr/"],
|
||||
"festival_culture": ["https://www.festival-of-cultures.hr/"],
|
||||
"muzicki_kvarner": ["https://www.kvarnermusic.hr/"],
|
||||
"porto_etno": ["https://www.portoetno.eu/"],
|
||||
"ri_rock": ["https://rockkonferencija.com/"],
|
||||
"fjeshta_lovran": ["https://www.fjeshta.hr/"],
|
||||
"njanje_zvoncari": ["https://halubajskizvoncari.hr/"],
|
||||
"skoljka_festival": ["https://www.kostrena.hr/"],
|
||||
"sumski_film": ["https://www.lifftrijeka.hr/"],
|
||||
"zicfest": ["https://www.zicfest.hr/"],
|
||||
}
|
||||
|
||||
|
||||
def crawl(name, urls, max_pages=12):
    """Crawl one event/festival source breadth-first and store its text as facts.

    Args:
        name: Source label; passed to ``upsert_facts`` as ``source_name`` and
            used as the prefix of the per-page title fact.
        urls: Seed URLs that initialise the crawl queue.
        max_pages: Upper bound on URLs attempted; a URL counts as soon as it
            is dequeued, even if its fetch fails or the page is skipped.

    Returns:
        dict with ``name``, ``visited`` (URLs attempted) and ``facts`` (sum of
        the values returned by ``upsert_facts`` — presumably rows stored).
    """
    conn = psycopg2.connect(DSN)
    conn.autocommit = True
    visited = set()
    queue = list(urls)
    facts = 0
    try:
        while queue and len(visited) < max_pages:
            url = queue.pop(0)
            if url in visited:
                continue
            visited.add(url)
            html, status = fetch(url, timeout=15)
            if not html or status != 200:
                continue
            title = extract_title(html)
            text = extract_text(html)
            # Pages with too little text are skipped without storing anything.
            if not text or len(text) < 200:
                continue
            ff = []
            if title and len(title) > 8:
                ff.append({"fact": f"{name} - {title}", "url": url, "title": title})
            ff.extend({"fact": c, "url": url, "title": title}
                      for c in chunk_text(text, 800) if len(c) > 100)
            facts += upsert_facts(conn, ff, source_name=name,
                                  category="events_pgz", confidence=0.85)
            # Only follow links on the same host as the page just fetched.
            base = urlparse(url).hostname
            for link in find_internal_links(html, url):
                if len(queue) >= 30:
                    break  # pending queue is capped
                if link in visited or link in queue:
                    # Already fetched or already pending: a duplicate would
                    # only waste one of the capped queue slots.
                    continue
                if (urlparse(link).hostname or "") == base:
                    queue.append(link)
            # Delay between successfully processed pages.
            time.sleep(0.5)
    finally:
        # Close the connection even when fetch/parse/upsert raises; main()
        # catches the exception and moves on to the next source.
        conn.close()
    return {"name": name, "visited": len(visited), "facts": facts}
|
||||
|
||||
|
||||
def main():
    """Crawl every entry of EVENTS and print per-source and overall totals."""
    summaries = []
    for label, seeds in EVENTS.items():
        try:
            outcome = crawl(label, seeds, max_pages=10)
            print(f" {label:25} {outcome['visited']:>3}p {outcome['facts']:>5}f")
            summaries.append(outcome)
        except Exception as exc:
            # One failing source must not stop the remaining ones.
            print(f" {label:25} FAIL: {str(exc)[:60]}")

    grand_total = 0
    for summary in summaries:
        grand_total += summary.get("facts", 0)

    print(f"=== TOTAL: {grand_total} ===")
    report = {"events_count": len(summaries), "total_facts": grand_total}
    print(json.dumps(report))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -0,0 +1,69 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Gorski kotar deep — Risnjak, Delnice, Cabar, Lokve."""
|
||||
import sys, json, time
|
||||
sys.path.insert(0, "/opt/pgz-sport/scrapers/harvesters")
|
||||
from _common import (fetch, extract_text, extract_title, chunk_text,
|
||||
upsert_facts, find_internal_links, DSN)
|
||||
from urllib.parse import urlparse
|
||||
import psycopg2
|
||||
|
||||
GORSKI = {
|
||||
"tz_gorski_kotar": ["https://www.gorskikotar.hr/", "https://www.tz-gorskikotar.hr/"],
|
||||
"np_risnjak": ["https://www.np-risnjak.hr/"],
|
||||
"delnice_info": ["https://delnice.com/"],
|
||||
"cabar_info": ["https://www.cabar.hr/"],
|
||||
"lokvarsko_jezero": ["https://www.lokve.hr/"],
|
||||
"fuzine_info": ["https://www.fuzine.hr/"],
|
||||
"ravnagora_blog": ["https://www.ravnagora.hr/"],
|
||||
"skrad_info": ["https://www.skrad.hr/"],
|
||||
"vrbovsko_info": ["https://www.vrbovsko.hr/"],
|
||||
"spilja_lokvarka": ["https://www.spilja-lokvarka.com/"],
|
||||
"skijanje_platak": ["https://www.platak.hr/"],
|
||||
"snowboard_klub": ["https://www.snowboardklub-rijeka.hr/"],
|
||||
}
|
||||
|
||||
|
||||
def crawl(name, urls, max_pages=15):
    """Crawl one Gorski kotar source breadth-first and store its text as facts.

    Args:
        name: Source label; passed to ``upsert_facts`` as ``source_name`` and
            used as the prefix of the per-page title fact.
        urls: Seed URLs that initialise the crawl queue.
        max_pages: Upper bound on URLs attempted; a URL counts as soon as it
            is dequeued, even if its fetch fails or the page is skipped.

    Returns:
        dict with ``name``, ``visited`` (URLs attempted) and ``facts`` (sum of
        the values returned by ``upsert_facts`` — presumably rows stored).
    """
    conn = psycopg2.connect(DSN)
    conn.autocommit = True
    visited = set()
    queue = list(urls)
    facts = 0
    try:
        while queue and len(visited) < max_pages:
            url = queue.pop(0)
            if url in visited:
                continue
            visited.add(url)
            html, status = fetch(url, timeout=15)
            if not html or status != 200:
                continue
            title = extract_title(html)
            text = extract_text(html)
            # Pages with too little text are skipped without storing anything.
            if not text or len(text) < 200:
                continue
            ff = []
            if title and len(title) > 8:
                ff.append({"fact": f"{name} - {title}", "url": url, "title": title})
            ff.extend({"fact": c, "url": url, "title": title}
                      for c in chunk_text(text, 800) if len(c) > 100)
            facts += upsert_facts(conn, ff, source_name=name,
                                  category="gorski_kotar_pgz", confidence=0.85)
            # Only follow links on the same host as the page just fetched.
            base = urlparse(url).hostname
            for link in find_internal_links(html, url):
                if len(queue) >= 40:
                    break  # pending queue is capped
                if link in visited or link in queue:
                    # Already fetched or already pending: a duplicate would
                    # only waste one of the capped queue slots.
                    continue
                if (urlparse(link).hostname or "") == base:
                    queue.append(link)
            # Delay between successfully processed pages.
            time.sleep(0.5)
    finally:
        # Close the connection even when fetch/parse/upsert raises; main()
        # catches the exception and moves on to the next source.
        conn.close()
    return {"name": name, "visited": len(visited), "facts": facts}
|
||||
|
||||
|
||||
def main():
    """Crawl every entry of GORSKI and print per-source and overall totals."""
    summaries = []
    for label, seeds in GORSKI.items():
        try:
            outcome = crawl(label, seeds, max_pages=12)
            print(f" {label:25} {outcome['visited']:>3}p {outcome['facts']:>5}f")
            summaries.append(outcome)
        except Exception as exc:
            # One failing source must not stop the remaining ones.
            print(f" {label:25} FAIL: {str(exc)[:60]}")

    grand_total = 0
    for summary in summaries:
        grand_total += summary.get("facts", 0)

    print(f"=== TOTAL: {grand_total} ===")
    report = {"gorski_count": len(summaries), "total_facts": grand_total}
    print(json.dumps(report))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -0,0 +1,65 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Media deep crawl — full pages of local portals."""
|
||||
import sys, json, time
|
||||
sys.path.insert(0, "/opt/pgz-sport/scrapers/harvesters")
|
||||
from _common import (fetch, extract_text, extract_title, chunk_text,
|
||||
upsert_facts, find_internal_links, DSN)
|
||||
from urllib.parse import urlparse
|
||||
import psycopg2
|
||||
|
||||
MEDIA = {
|
||||
"novilist_rijeka": ["https://www.novilist.hr/rijeka/", "https://www.novilist.hr/regija/"],
|
||||
"rijekadanas_full": ["https://rijekadanas.com/category/rijeka/", "https://rijekadanas.com/category/pgz/"],
|
||||
"rijekain_full": ["https://rijekain.hr/category/rijeka/"],
|
||||
"primorske_full": ["https://primorskenovice.hr/"],
|
||||
"rkc_blog": ["https://www.rkcrijeka.hr/blog/"],
|
||||
"rijeka2020_arhiva": ["https://rijeka2020.eu/category/news/"],
|
||||
"kulturpunkt_ri": ["https://www.kulturpunkt.hr/tag/rijeka"],
|
||||
"5portala_hr_pgz": ["https://www.5portala.hr/regije/primorsko-goranska/"],
|
||||
}
|
||||
|
||||
|
||||
def crawl(name, urls, max_pages=20):
    """Crawl one local media portal breadth-first and store its text as facts.

    Args:
        name: Portal label; facts are stored under ``source_name``
            ``media_<name>`` and the title fact is prefixed with ``[<name>]``.
        urls: Seed URLs that initialise the crawl queue.
        max_pages: Upper bound on URLs attempted; a URL counts as soon as it
            is dequeued, even if its fetch fails or the page is skipped.

    Returns:
        dict with ``name``, ``visited`` (URLs attempted) and ``facts`` (sum of
        the values returned by ``upsert_facts`` — presumably rows stored).
    """
    conn = psycopg2.connect(DSN)
    conn.autocommit = True
    visited = set()
    queue = list(urls)
    facts = 0
    try:
        while queue and len(visited) < max_pages:
            url = queue.pop(0)
            if url in visited:
                continue
            visited.add(url)
            html, status = fetch(url, timeout=15)
            if not html or status != 200:
                continue
            title = extract_title(html)
            text = extract_text(html)
            # Pages with too little text are skipped without storing anything.
            if not text or len(text) < 300:
                continue
            ff = []
            if title and len(title) > 12:
                ff.append({"fact": f"[{name}] {title}", "url": url, "title": title})
            ff.extend({"fact": c, "url": url, "title": title}
                      for c in chunk_text(text, 800) if len(c) > 120)
            facts += upsert_facts(conn, ff, source_name=f"media_{name}",
                                  category="media_pgz_deep", confidence=0.82)
            # Only follow links on the same host as the page just fetched.
            base = urlparse(url).hostname
            for link in find_internal_links(html, url):
                if len(queue) >= 60:
                    break  # pending queue is capped
                if link in visited or link in queue:
                    # Already fetched or already pending: a duplicate would
                    # only waste one of the capped queue slots.
                    continue
                if (urlparse(link).hostname or "") == base:
                    queue.append(link)
            # Delay between successfully processed pages.
            time.sleep(0.6)
    finally:
        # Close the connection even when fetch/parse/upsert raises; main()
        # catches the exception and moves on to the next portal.
        conn.close()
    return {"name": name, "visited": len(visited), "facts": facts}
|
||||
|
||||
|
||||
def main():
    """Crawl every entry of MEDIA and print per-source and overall totals."""
    summaries = []
    for label, seeds in MEDIA.items():
        try:
            outcome = crawl(label, seeds, max_pages=18)
            print(f" {label:25} {outcome['visited']:>3}p {outcome['facts']:>5}f")
            summaries.append(outcome)
        except Exception as exc:
            # One failing portal must not stop the remaining ones.
            print(f" {label:25} FAIL: {str(exc)[:60]}")

    grand_total = 0
    for summary in summaries:
        grand_total += summary.get("facts", 0)

    print(f"=== TOTAL: {grand_total} ===")
    report = {"media_count": len(summaries), "total_facts": grand_total}
    print(json.dumps(report))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -0,0 +1,83 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Naselja PGZ — sela, zaseoci, otocna mjesta."""
|
||||
import sys, json, time
|
||||
sys.path.insert(0, "/opt/pgz-sport/scrapers/harvesters")
|
||||
from _common import chunk_text, upsert_facts, DSN, UA
|
||||
from urllib.parse import urlencode, quote
|
||||
import urllib.request
|
||||
import psycopg2
|
||||
|
||||
API_HR = "https://hr.wikipedia.org/w/api.php"
|
||||
|
||||
|
||||
def wiki_cat_members(cat, limit=200):
    """Return the titles listed in one category via the ``API_HR`` endpoint.

    Args:
        cat: Category title sent as ``cmtitle``.
        limit: Value sent as ``cmlimit``. Only this single batch is
            requested; no continuation requests are made.

    Returns:
        List of member titles, or an empty list if the request or the
        decoding fails (the error is reported on stderr).
    """
    params = {"action": "query", "list": "categorymembers", "cmtitle": cat,
              "cmlimit": str(limit), "format": "json"}
    url = API_HR + "?" + urlencode(params)
    try:
        req = urllib.request.Request(url, headers={"User-Agent": UA})
        with urllib.request.urlopen(req, timeout=20) as r:
            d = json.loads(r.read())
        return [m["title"] for m in d.get("query", {}).get("categorymembers", [])]
    except Exception as e:
        # Best-effort: keep harvesting the other categories, but make the
        # failure visible instead of silently reporting zero members.
        print(f"wiki_cat_members err ({cat}): {e}", file=sys.stderr)
        return []
|
||||
|
||||
|
||||
def wiki_extract(title, timeout=15):
    """Fetch the plain-text extract of one page via the ``API_HR`` endpoint.

    Args:
        title: Page title sent as ``titles`` (redirects are followed by the
            API because ``redirects=1`` is set).
        timeout: Socket timeout in seconds for the HTTP request.

    Returns:
        The ``extract`` text of the first page in the reply (``""`` when the
        page carries no extract), or ``None`` when the page id is ``"-1"``,
        the reply has no pages, or the request fails (reported on stderr).
    """
    params = {"action": "query", "prop": "extracts", "explaintext": "1",
              "redirects": "1", "format": "json", "titles": title}
    url = API_HR + "?" + urlencode(params)
    try:
        req = urllib.request.Request(url, headers={"User-Agent": UA})
        with urllib.request.urlopen(req, timeout=timeout) as r:
            d = json.loads(r.read())
        for pid, p in d.get("query", {}).get("pages", {}).items():
            # Only the first entry is inspected; a single title was requested.
            if pid == "-1":
                return None
            return p.get("extract", "")
    except Exception as e:
        # Best-effort: skip this page, but make the failure visible.
        print(f"wiki_extract err ({title}): {e}", file=sys.stderr)
        return None
    # Reply contained no pages at all.
    return None
|
||||
|
||||
|
||||
CATEGORIES = [
|
||||
"Kategorija:Naselja_u_Primorsko-goranskoj_županiji",
|
||||
"Kategorija:Naselja_u_Hrvatskoj_(otok_Krk)",
|
||||
"Kategorija:Naselja_u_Hrvatskoj_(otok_Cres)",
|
||||
"Kategorija:Naselja_u_Hrvatskoj_(otok_Lošinj)",
|
||||
"Kategorija:Naselja_u_Hrvatskoj_(otok_Rab)",
|
||||
"Kategorija:Gorski_kotar",
|
||||
]
|
||||
|
||||
|
||||
def main():
    """Harvest the text of pages in CATEGORIES and store it as facts.

    Titles are de-duplicated across categories. Pages whose extract is
    missing or shorter than 200 characters are skipped. Per-category member
    counts, a total line and a final JSON summary are printed.
    """
    db = psycopg2.connect(DSN)
    db.autocommit = True
    fact_count = 0
    page_count = 0
    processed = set()

    for category in CATEGORIES:
        titles = wiki_cat_members(category, limit=200)
        print(f" {category[:50]:50} {len(titles):>3} members")

        for page_title in titles:
            if page_title in processed:
                continue
            processed.add(page_title)

            body = wiki_extract(page_title)
            if not body or len(body) < 200:
                continue
            page_count += 1

            page_url = f"https://hr.wikipedia.org/wiki/{quote(page_title)}"
            rows = []
            for piece in chunk_text(body, 600):
                if len(piece) > 100:
                    rows.append({"fact": piece, "url": page_url,
                                 "title": page_title})
            fact_count += upsert_facts(db, rows,
                                       source_name="wikipedia_pgz_naselja",
                                       category="naselja_pgz", confidence=0.86)
            # Pause only after a page that was actually stored.
            time.sleep(0.3)

    db.close()
    print(f"=== TOTAL pages={page_count} facts={fact_count} ===")
    print(json.dumps({"pages": page_count, "total_facts": fact_count}))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -0,0 +1,63 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Real estate + housing PGZ."""
|
||||
import sys, json, time
|
||||
sys.path.insert(0, "/opt/pgz-sport/scrapers/harvesters")
|
||||
from _common import (fetch, extract_text, extract_title, chunk_text,
|
||||
upsert_facts, find_internal_links, DSN)
|
||||
from urllib.parse import urlparse
|
||||
import psycopg2
|
||||
|
||||
REAL = {
|
||||
"rijeka_najam": ["https://www.rijeka.hr/javnu-najam/"],
|
||||
"stanovanje_pgz": ["https://www.dom-rijeka.hr/"],
|
||||
"katastar_pgz": ["https://geoportal.dgu.hr/"],
|
||||
"uprava_imovine": ["https://www.rijeka.hr/imovinsko-pravna/"],
|
||||
"rijeka_arhitekt": ["https://www.rijeka.hr/arhitektonska/"],
|
||||
"drzavna_imovina": ["https://www.drzavnaimovina.hr/"],
|
||||
}
|
||||
|
||||
|
||||
def crawl(name, urls, max_pages=8):
    """Crawl one real-estate/housing source breadth-first and store its text as facts.

    Args:
        name: Source label; passed to ``upsert_facts`` as ``source_name`` and
            used as the prefix of the per-page title fact.
        urls: Seed URLs that initialise the crawl queue.
        max_pages: Upper bound on URLs attempted; a URL counts as soon as it
            is dequeued, even if its fetch fails or the page is skipped.

    Returns:
        dict with ``name``, ``visited`` (URLs attempted) and ``facts`` (sum of
        the values returned by ``upsert_facts`` — presumably rows stored).
    """
    conn = psycopg2.connect(DSN)
    conn.autocommit = True
    visited = set()
    queue = list(urls)
    facts = 0
    try:
        while queue and len(visited) < max_pages:
            url = queue.pop(0)
            if url in visited:
                continue
            visited.add(url)
            html, status = fetch(url, timeout=15)
            if not html or status != 200:
                continue
            title = extract_title(html)
            text = extract_text(html)
            # Pages with too little text are skipped without storing anything.
            if not text or len(text) < 200:
                continue
            ff = []
            if title and len(title) > 8:
                ff.append({"fact": f"{name} - {title}", "url": url, "title": title})
            ff.extend({"fact": c, "url": url, "title": title}
                      for c in chunk_text(text, 800) if len(c) > 100)
            facts += upsert_facts(conn, ff, source_name=name,
                                  category="nekretnine_pgz", confidence=0.83)
            # Only follow links on the same host as the page just fetched.
            base = urlparse(url).hostname
            for link in find_internal_links(html, url):
                if len(queue) >= 25:
                    break  # pending queue is capped
                if link in visited or link in queue:
                    # Already fetched or already pending: a duplicate would
                    # only waste one of the capped queue slots.
                    continue
                if (urlparse(link).hostname or "") == base:
                    queue.append(link)
            # Delay between successfully processed pages.
            time.sleep(0.5)
    finally:
        # Close the connection even when fetch/parse/upsert raises; main()
        # catches the exception and moves on to the next source.
        conn.close()
    return {"name": name, "visited": len(visited), "facts": facts}
|
||||
|
||||
|
||||
def main():
    """Crawl every entry of REAL and print per-source and overall totals."""
    summaries = []
    for label, seeds in REAL.items():
        try:
            outcome = crawl(label, seeds, max_pages=8)
            print(f" {label:25} {outcome['visited']:>3}p {outcome['facts']:>5}f")
            summaries.append(outcome)
        except Exception as exc:
            # One failing source must not stop the remaining ones.
            print(f" {label:25} FAIL: {str(exc)[:60]}")

    grand_total = 0
    for summary in summaries:
        grand_total += summary.get("facts", 0)

    print(f"=== TOTAL: {grand_total} ===")
    report = {"real_count": len(summaries), "total_facts": grand_total}
    print(json.dumps(report))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -0,0 +1,68 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Otoci PGZ deep — Krk, Cres, Losinj, Rab portali."""
|
||||
import sys, json, time
|
||||
sys.path.insert(0, "/opt/pgz-sport/scrapers/harvesters")
|
||||
from _common import (fetch, extract_text, extract_title, chunk_text,
|
||||
upsert_facts, find_internal_links, DSN)
|
||||
from urllib.parse import urlparse
|
||||
import psycopg2
|
||||
|
||||
OTOCI = {
|
||||
"krk_info": ["https://www.krk.com/"],
|
||||
"krkonline": ["https://www.krkonline.com/"],
|
||||
"krkinfo_news": ["https://www.krk-info.com/"],
|
||||
"cres_info": ["https://www.cres.info/"],
|
||||
"cres_lapis": ["https://lapis.cres.hr/"],
|
||||
"losinj_info": ["https://www.losinj.info/"],
|
||||
"losinj_centar": ["https://www.muzejmalilosinj.hr/"],
|
||||
"rab_info": ["https://www.rab.com/"],
|
||||
"rab_news": ["https://rabnews.com/"],
|
||||
"susak_info": ["https://www.susakisland.com/"],
|
||||
"ilovik_info": ["https://www.ilovik.eu/"],
|
||||
}
|
||||
|
||||
|
||||
def crawl(name, urls, max_pages=12):
    """Crawl one island portal breadth-first and store its text as facts.

    Args:
        name: Source label; passed to ``upsert_facts`` as ``source_name`` and
            used as the prefix of the per-page title fact.
        urls: Seed URLs that initialise the crawl queue.
        max_pages: Upper bound on URLs attempted; a URL counts as soon as it
            is dequeued, even if its fetch fails or the page is skipped.

    Returns:
        dict with ``name``, ``visited`` (URLs attempted) and ``facts`` (sum of
        the values returned by ``upsert_facts`` — presumably rows stored).
    """
    conn = psycopg2.connect(DSN)
    conn.autocommit = True
    visited = set()
    queue = list(urls)
    facts = 0
    try:
        while queue and len(visited) < max_pages:
            url = queue.pop(0)
            if url in visited:
                continue
            visited.add(url)
            html, status = fetch(url, timeout=15)
            if not html or status != 200:
                continue
            title = extract_title(html)
            text = extract_text(html)
            # Pages with too little text are skipped without storing anything.
            if not text or len(text) < 200:
                continue
            ff = []
            if title and len(title) > 8:
                ff.append({"fact": f"{name} - {title}", "url": url, "title": title})
            ff.extend({"fact": c, "url": url, "title": title}
                      for c in chunk_text(text, 800) if len(c) > 100)
            facts += upsert_facts(conn, ff, source_name=name,
                                  category="otoci_pgz", confidence=0.85)
            # Only follow links on the same host as the page just fetched.
            base = urlparse(url).hostname
            for link in find_internal_links(html, url):
                if len(queue) >= 30:
                    break  # pending queue is capped
                if link in visited or link in queue:
                    # Already fetched or already pending: a duplicate would
                    # only waste one of the capped queue slots.
                    continue
                if (urlparse(link).hostname or "") == base:
                    queue.append(link)
            # Delay between successfully processed pages.
            time.sleep(0.5)
    finally:
        # Close the connection even when fetch/parse/upsert raises; main()
        # catches the exception and moves on to the next source.
        conn.close()
    return {"name": name, "visited": len(visited), "facts": facts}
|
||||
|
||||
|
||||
def main():
    """Crawl every entry of OTOCI and print per-source and overall totals."""
    summaries = []
    for label, seeds in OTOCI.items():
        try:
            outcome = crawl(label, seeds, max_pages=12)
            print(f" {label:25} {outcome['visited']:>3}p {outcome['facts']:>5}f")
            summaries.append(outcome)
        except Exception as exc:
            # One failing source must not stop the remaining ones.
            print(f" {label:25} FAIL: {str(exc)[:60]}")

    grand_total = 0
    for summary in summaries:
        grand_total += summary.get("facts", 0)

    print(f"=== TOTAL: {grand_total} ===")
    report = {"otoci_count": len(summaries), "total_facts": grand_total}
    print(json.dumps(report))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -0,0 +1,68 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Povijesni izvori PGZ — Liburnija, arhivi."""
|
||||
import sys, json, time
|
||||
sys.path.insert(0, "/opt/pgz-sport/scrapers/harvesters")
|
||||
from _common import (fetch, extract_text, extract_title, chunk_text,
|
||||
upsert_facts, find_internal_links, DSN)
|
||||
from urllib.parse import urlparse
|
||||
import psycopg2
|
||||
|
||||
HISTORY = {
|
||||
"drzavni_arhiv_ri": ["https://www.dari.hr/"],
|
||||
"arhiv_pazin": ["https://www.dapa.hr/"],
|
||||
"muzej_glagoljice": ["https://glagoljica.hr/"],
|
||||
"glagoljaska_alea": ["https://www.aleja-glagoljasa.hr/"],
|
||||
"frankopani": ["https://www.frankopani.eu/"],
|
||||
"trsatske_legende": ["https://www.trsat-svetiste.com/povijest/"],
|
||||
"rijeka_povijest": ["https://rijeka-history.eu/"],
|
||||
"stare_rijeke": ["https://www.stararijeka.com/"],
|
||||
"kvarner_arhiv": ["https://www.kvarnerheritage.eu/"],
|
||||
"muzeji_pgz_arhiv": ["https://www.muzeji-pgz.hr/"],
|
||||
"razno_pomorski": ["https://www.kpu.hr/"],
|
||||
}
|
||||
|
||||
|
||||
def crawl(name, urls, max_pages=12):
    """Crawl one historical source breadth-first and store its text as facts.

    Args:
        name: Source label; passed to ``upsert_facts`` as ``source_name`` and
            used as the prefix of the per-page title fact.
        urls: Seed URLs that initialise the crawl queue.
        max_pages: Upper bound on URLs attempted; a URL counts as soon as it
            is dequeued, even if its fetch fails or the page is skipped.

    Returns:
        dict with ``name``, ``visited`` (URLs attempted) and ``facts`` (sum of
        the values returned by ``upsert_facts`` — presumably rows stored).
    """
    conn = psycopg2.connect(DSN)
    conn.autocommit = True
    visited = set()
    queue = list(urls)
    facts = 0
    try:
        while queue and len(visited) < max_pages:
            url = queue.pop(0)
            if url in visited:
                continue
            visited.add(url)
            html, status = fetch(url, timeout=15)
            if not html or status != 200:
                continue
            title = extract_title(html)
            text = extract_text(html)
            # Pages with too little text are skipped without storing anything.
            if not text or len(text) < 200:
                continue
            ff = []
            if title and len(title) > 8:
                ff.append({"fact": f"{name} - {title}", "url": url, "title": title})
            ff.extend({"fact": c, "url": url, "title": title}
                      for c in chunk_text(text, 800) if len(c) > 100)
            facts += upsert_facts(conn, ff, source_name=name,
                                  category="povijest_pgz", confidence=0.86)
            # Only follow links on the same host as the page just fetched.
            base = urlparse(url).hostname
            for link in find_internal_links(html, url):
                if len(queue) >= 30:
                    break  # pending queue is capped
                if link in visited or link in queue:
                    # Already fetched or already pending: a duplicate would
                    # only waste one of the capped queue slots.
                    continue
                if (urlparse(link).hostname or "") == base:
                    queue.append(link)
            # Delay between successfully processed pages.
            time.sleep(0.5)
    finally:
        # Close the connection even when fetch/parse/upsert raises; main()
        # catches the exception and moves on to the next source.
        conn.close()
    return {"name": name, "visited": len(visited), "facts": facts}
|
||||
|
||||
|
||||
def main():
    """Crawl every entry of HISTORY and print per-source and overall totals."""
    summaries = []
    for label, seeds in HISTORY.items():
        try:
            outcome = crawl(label, seeds, max_pages=10)
            print(f" {label:25} {outcome['visited']:>3}p {outcome['facts']:>5}f")
            summaries.append(outcome)
        except Exception as exc:
            # One failing source must not stop the remaining ones.
            print(f" {label:25} FAIL: {str(exc)[:60]}")

    grand_total = 0
    for summary in summaries:
        grand_total += summary.get("facts", 0)

    print(f"=== TOTAL: {grand_total} ===")
    report = {"hist_count": len(summaries), "total_facts": grand_total}
    print(json.dumps(report))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -0,0 +1,66 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Lokalni propisi PGZ — sluzbene novine, statuti."""
|
||||
import sys, json, time
|
||||
sys.path.insert(0, "/opt/pgz-sport/scrapers/harvesters")
|
||||
from _common import (fetch, extract_text, extract_title, chunk_text,
|
||||
upsert_facts, find_internal_links, DSN)
|
||||
from urllib.parse import urlparse
|
||||
import psycopg2
|
||||
|
||||
PROPISI = {
|
||||
"sluzbene_novine_pgz": ["https://www.sn.pgz.hr/"],
|
||||
"sluzbene_glasnik_rijeka": ["https://www.sluzbene-novine.com/"],
|
||||
"sn_opatija": ["https://www.opatija.hr/sluzbene-novine"],
|
||||
"pgz_dokumenti": ["https://www.pgz.hr/dokumenti"],
|
||||
"pgz_skupstina": ["https://www.pgz.hr/skupstina"],
|
||||
"rijeka_grad_v_savjet":["https://www.rijeka.hr/gradska-uprava/"],
|
||||
"pgz_javnatime": ["https://www.pgz.hr/javna-nabava/"],
|
||||
"rijeka_javna_nabava": ["https://www.rijeka.hr/javna-nabava/"],
|
||||
"narodne_novine_pgz": ["https://narodne-novine.nn.hr/"],
|
||||
}
|
||||
|
||||
|
||||
def crawl(name, urls, max_pages=15):
    """Crawl one local-regulations source breadth-first and store its text as facts.

    Args:
        name: Source label; passed to ``upsert_facts`` as ``source_name`` and
            used as the prefix of the per-page title fact.
        urls: Seed URLs that initialise the crawl queue.
        max_pages: Upper bound on URLs attempted; a URL counts as soon as it
            is dequeued, even if its fetch fails or the page is skipped.

    Returns:
        dict with ``name``, ``visited`` (URLs attempted) and ``facts`` (sum of
        the values returned by ``upsert_facts`` — presumably rows stored).
    """
    conn = psycopg2.connect(DSN)
    conn.autocommit = True
    visited = set()
    queue = list(urls)
    facts = 0
    try:
        while queue and len(visited) < max_pages:
            url = queue.pop(0)
            if url in visited:
                continue
            visited.add(url)
            html, status = fetch(url, timeout=15)
            if not html or status != 200:
                continue
            title = extract_title(html)
            text = extract_text(html)
            # Pages with too little text are skipped without storing anything.
            if not text or len(text) < 200:
                continue
            ff = []
            if title and len(title) > 8:
                ff.append({"fact": f"{name} - {title}", "url": url, "title": title})
            ff.extend({"fact": c, "url": url, "title": title}
                      for c in chunk_text(text, 800) if len(c) > 100)
            facts += upsert_facts(conn, ff, source_name=name,
                                  category="propisi_pgz", confidence=0.88)
            # Only follow links on the same host as the page just fetched.
            base = urlparse(url).hostname
            for link in find_internal_links(html, url):
                if len(queue) >= 60:
                    break  # pending queue is capped
                if link in visited or link in queue:
                    # Already fetched or already pending: a duplicate would
                    # only waste one of the capped queue slots.
                    continue
                if (urlparse(link).hostname or "") == base:
                    queue.append(link)
            # Delay between successfully processed pages.
            time.sleep(0.5)
    finally:
        # Close the connection even when fetch/parse/upsert raises; main()
        # catches the exception and moves on to the next source.
        conn.close()
    return {"name": name, "visited": len(visited), "facts": facts}
|
||||
|
||||
|
||||
def main():
    """Crawl every entry of PROPISI and print per-source and overall totals."""
    summaries = []
    for label, seeds in PROPISI.items():
        try:
            outcome = crawl(label, seeds, max_pages=15)
            print(f" {label:25} {outcome['visited']:>3}p {outcome['facts']:>5}f")
            summaries.append(outcome)
        except Exception as exc:
            # One failing source must not stop the remaining ones.
            print(f" {label:25} FAIL: {str(exc)[:60]}")

    grand_total = 0
    for summary in summaries:
        grand_total += summary.get("facts", 0)

    print(f"=== TOTAL: {grand_total} ===")
    report = {"propisi_count": len(summaries), "total_facts": grand_total}
    print(json.dumps(report))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -0,0 +1,68 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Sport infrastruktura PGZ — dvorane, baze, skole."""
|
||||
import sys, json, time
|
||||
sys.path.insert(0, "/opt/pgz-sport/scrapers/harvesters")
|
||||
from _common import (fetch, extract_text, extract_title, chunk_text,
|
||||
upsert_facts, find_internal_links, DSN)
|
||||
from urllib.parse import urlparse
|
||||
import psycopg2
|
||||
|
||||
INFRA = {
|
||||
"dvorana_zamet": ["https://www.dvoranazamet.hr/"],
|
||||
"stadion_kantrida": ["https://www.kantrida.hr/"],
|
||||
"stadion_rujevica": ["https://www.nk-rijeka.hr/stadion-rujevica/"],
|
||||
"ck_kantrida_pliv": ["https://kantridapool.hr/"],
|
||||
"ri_sport_centar": ["https://www.ri-sport.hr/"],
|
||||
"delta_jumbo": ["https://www.deltajumbo.hr/"],
|
||||
"skolski_sport": ["https://www.hsss.hr/"],
|
||||
"platak_skijanje": ["https://www.platak.hr/"],
|
||||
"rec_velebit": ["https://www.velebit.hr/"],
|
||||
"platak_ski_klub": ["https://www.skiclub-platak.hr/"],
|
||||
"rijeka_marina": ["https://www.aci-marinas.com/"],
|
||||
}
|
||||
|
||||
|
||||
def crawl(name, urls, max_pages=15):
    """Crawl one sports-infrastructure source breadth-first and store its text as facts.

    Args:
        name: Source label; passed to ``upsert_facts`` as ``source_name`` and
            used as the prefix of the per-page title fact.
        urls: Seed URLs that initialise the crawl queue.
        max_pages: Upper bound on URLs attempted; a URL counts as soon as it
            is dequeued, even if its fetch fails or the page is skipped.

    Returns:
        dict with ``name``, ``visited`` (URLs attempted) and ``facts`` (sum of
        the values returned by ``upsert_facts`` — presumably rows stored).
    """
    conn = psycopg2.connect(DSN)
    conn.autocommit = True
    visited = set()
    queue = list(urls)
    facts = 0
    try:
        while queue and len(visited) < max_pages:
            url = queue.pop(0)
            if url in visited:
                continue
            visited.add(url)
            html, status = fetch(url, timeout=15)
            if not html or status != 200:
                continue
            title = extract_title(html)
            text = extract_text(html)
            # Pages with too little text are skipped without storing anything.
            if not text or len(text) < 200:
                continue
            ff = []
            if title and len(title) > 8:
                ff.append({"fact": f"{name} - {title}", "url": url, "title": title})
            ff.extend({"fact": c, "url": url, "title": title}
                      for c in chunk_text(text, 800) if len(c) > 100)
            facts += upsert_facts(conn, ff, source_name=name,
                                  category="sport_infra_pgz", confidence=0.85)
            # Only follow links on the same host as the page just fetched.
            base = urlparse(url).hostname
            for link in find_internal_links(html, url):
                if len(queue) >= 40:
                    break  # pending queue is capped
                if link in visited or link in queue:
                    # Already fetched or already pending: a duplicate would
                    # only waste one of the capped queue slots.
                    continue
                if (urlparse(link).hostname or "") == base:
                    queue.append(link)
            # Delay between successfully processed pages.
            time.sleep(0.5)
    finally:
        # Close the connection even when fetch/parse/upsert raises; main()
        # catches the exception and moves on to the next source.
        conn.close()
    return {"name": name, "visited": len(visited), "facts": facts}
|
||||
|
||||
|
||||
def main():
    """Crawl every entry of INFRA and print per-source and overall totals."""
    summaries = []
    for label, seeds in INFRA.items():
        try:
            outcome = crawl(label, seeds, max_pages=12)
            print(f" {label:25} {outcome['visited']:>3}p {outcome['facts']:>5}f")
            summaries.append(outcome)
        except Exception as exc:
            # One failing source must not stop the remaining ones.
            print(f" {label:25} FAIL: {str(exc)[:60]}")

    grand_total = 0
    for summary in summaries:
        grand_total += summary.get("facts", 0)

    print(f"=== TOTAL: {grand_total} ===")
    report = {"infra_count": len(summaries), "total_facts": grand_total}
    print(json.dumps(report))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -0,0 +1,72 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Sport klubovi PGZ — direktno s web stranica."""
|
||||
import sys, json, time
|
||||
sys.path.insert(0, "/opt/pgz-sport/scrapers/harvesters")
|
||||
from _common import (fetch, extract_text, extract_title, chunk_text,
|
||||
upsert_facts, find_internal_links, DSN)
|
||||
from urllib.parse import urlparse
|
||||
import psycopg2
|
||||
|
||||
KLUBOVI = {
|
||||
"hnk_rijeka": ["https://www.nk-rijeka.hr/"],
|
||||
"kk_kvarner": ["https://www.kk-kvarner.hr/"],
|
||||
"rk_zamet": ["https://www.rk-zamet.hr/"],
|
||||
"vk_primorje": ["https://www.vkprimorje.hr/"],
|
||||
"ok_rijeka": ["https://www.ok-rijeka.hr/"],
|
||||
"haok_mladost": ["https://www.haok-mladost.hr/"],
|
||||
"abc_rijeka": ["https://www.abc-rijeka.hr/"],
|
||||
"rugby_rijeka": ["https://www.rugbyrijeka.hr/"],
|
||||
"pliva_klub_primorje":["https://www.primorje-aquarius.hr/"],
|
||||
"judo_kvarner": ["https://www.judokvarner.hr/"],
|
||||
"kuglacki_savez_pgz": ["https://www.kuglacki-savez-pgz.hr/"],
|
||||
"tenis_kvarner": ["https://www.tk-kvarner.hr/"],
|
||||
"atletika_rijeka": ["https://www.akrijeka.hr/"],
|
||||
"biciklisticki": ["https://www.bk-rijeka.hr/"],
|
||||
"stoljecesporta": ["https://stoljecesporta.com/"],
|
||||
}
|
||||
|
||||
|
||||
def crawl(name, urls, max_pages=12):
    """Crawl one sports-club website breadth-first and store its text as facts.

    Args:
        name: Source label; passed to ``upsert_facts`` as ``source_name`` and
            used as the prefix of the per-page title fact.
        urls: Seed URLs that initialise the crawl queue.
        max_pages: Upper bound on URLs attempted; a URL counts as soon as it
            is dequeued, even if its fetch fails or the page is skipped.

    Returns:
        dict with ``name``, ``visited`` (URLs attempted) and ``facts`` (sum of
        the values returned by ``upsert_facts`` — presumably rows stored).
    """
    conn = psycopg2.connect(DSN)
    conn.autocommit = True
    visited = set()
    queue = list(urls)
    facts = 0
    try:
        while queue and len(visited) < max_pages:
            url = queue.pop(0)
            if url in visited:
                continue
            visited.add(url)
            html, status = fetch(url, timeout=15)
            if not html or status != 200:
                continue
            title = extract_title(html)
            text = extract_text(html)
            # Pages with too little text are skipped without storing anything.
            if not text or len(text) < 200:
                continue
            ff = []
            if title and len(title) > 8:
                ff.append({"fact": f"{name} - {title}", "url": url, "title": title})
            ff.extend({"fact": c, "url": url, "title": title}
                      for c in chunk_text(text, 800) if len(c) > 100)
            facts += upsert_facts(conn, ff, source_name=name,
                                  category="sport_klub_pgz", confidence=0.88)
            # Only follow links on the same host as the page just fetched.
            base = urlparse(url).hostname
            for link in find_internal_links(html, url):
                if len(queue) >= 30:
                    break  # pending queue is capped
                if link in visited or link in queue:
                    # Already fetched or already pending: a duplicate would
                    # only waste one of the capped queue slots.
                    continue
                if (urlparse(link).hostname or "") == base:
                    queue.append(link)
            # Delay between successfully processed pages.
            time.sleep(0.5)
    finally:
        # Close the connection even when fetch/parse/upsert raises; main()
        # catches the exception and moves on to the next source.
        conn.close()
    return {"name": name, "visited": len(visited), "facts": facts}
|
||||
|
||||
|
||||
def main():
    """Crawl every entry of KLUBOVI and print per-source and overall totals."""
    summaries = []
    for label, seeds in KLUBOVI.items():
        try:
            outcome = crawl(label, seeds, max_pages=10)
            print(f" {label:25} {outcome['visited']:>3}p {outcome['facts']:>5}f")
            summaries.append(outcome)
        except Exception as exc:
            # One failing source must not stop the remaining ones.
            print(f" {label:25} FAIL: {str(exc)[:60]}")

    grand_total = 0
    for summary in summaries:
        grand_total += summary.get("facts", 0)

    print(f"=== TOTAL: {grand_total} ===")
    report = {"klub_count": len(summaries), "total_facts": grand_total}
    print(json.dumps(report))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -0,0 +1,67 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Crkve i vjerske institucije PGZ."""
|
||||
import sys, json, time
|
||||
sys.path.insert(0, "/opt/pgz-sport/scrapers/harvesters")
|
||||
from _common import (fetch, extract_text, extract_title, chunk_text,
|
||||
upsert_facts, find_internal_links, DSN)
|
||||
from urllib.parse import urlparse
|
||||
import psycopg2
|
||||
|
||||
CRKVA = {
|
||||
"rijecka_nadbiskupija":["https://www.ri-nadbiskupija.hr/"],
|
||||
"krcka_biskupija": ["https://www.biskupija-krk.hr/"],
|
||||
"isusovci_rijeka": ["https://isusovci.hr/"],
|
||||
"trsat_svetiste": ["https://trsat-svetiste.com/"],
|
||||
"katedrala_rijeka": ["https://katedrala-rijeka.hr/"],
|
||||
"samostan_kosljun": ["https://www.kosljun.hr/"],
|
||||
"samostan_glavotok": ["https://www.glavotok.hr/"],
|
||||
"katedrala_krk": ["https://www.biskupija-krk.hr/katedrala/"],
|
||||
"crkva_opatija": ["https://www.zupa-opatija.hr/"],
|
||||
"rijecka_eparhija": ["https://www.eparhija-zagrebackoljubljanska.com/"],
|
||||
}
|
||||
|
||||
|
||||
def crawl(name, urls, max_pages=10):
    """Crawl one site breadth-first and store its page text as facts.

    Args:
        name: Source label. Passed to ``upsert_facts`` as ``source_name``
            and used as the prefix of the per-page title fact.
        urls: Seed URLs that initialise the crawl queue.
        max_pages: Upper bound on the number of URLs attempted. A URL is
            marked visited before it is fetched, so failed fetches count.

    Returns:
        A dict with ``name``, ``visited`` (number of URLs attempted) and
        ``facts`` (sum of the values returned by ``upsert_facts``).

    Raises:
        Whatever ``psycopg2.connect``, ``fetch``, the extract helpers or
        ``upsert_facts`` raise; the connection is closed before the
        exception propagates.
    """
    conn = psycopg2.connect(DSN)
    try:
        conn.autocommit = True
        visited = set()
        queue = list(urls)
        facts = 0

        while queue and len(visited) < max_pages:
            url = queue.pop(0)
            if url in visited:
                continue
            visited.add(url)

            html, status = fetch(url, timeout=15)
            if not html or status != 200:
                continue

            title = extract_title(html)
            text = extract_text(html)
            # Pages with little extracted text are skipped entirely:
            # nothing is stored and their links are not followed.
            if not text or len(text) < 200:
                continue

            ff = []
            if title and len(title) > 8:
                ff.append({"fact": f"{name} - {title}", "url": url, "title": title})
            for c in chunk_text(text, 800):
                # Drop chunks of 100 characters or fewer.
                if len(c) > 100:
                    ff.append({"fact": c, "url": url, "title": title})
            facts += upsert_facts(conn, ff, source_name=name,
                                  category="vjera_pgz", confidence=0.84)

            # Follow only links on the same hostname; cap the pending queue at 25.
            base = urlparse(url).hostname
            for link in find_internal_links(html, url):
                if link not in visited and (urlparse(link).hostname or "") == base and len(queue) < 25:
                    queue.append(link)

            # Pause between successfully processed pages.
            time.sleep(0.5)
    finally:
        # Close even when a fetch/parse/upsert raises: the caller catches
        # the exception and continues with the next site, so a missing
        # close here would leak one connection per failed site.
        conn.close()

    return {"name": name, "visited": len(visited), "facts": facts}
|
||||
|
||||
|
||||
def main():
    """Crawl every site listed in CRKVA and print a summary.

    Sites are crawled one after another. An exception raised while
    crawling or reporting a site is printed as a ``FAIL`` line (message
    truncated to 60 characters) and the loop moves on to the next site.
    The final line printed is a JSON object holding the number of sites
    crawled without error and the total number of facts they reported.
    """
    reports = []
    for site, seed_urls in CRKVA.items():
        try:
            report = crawl(site, seed_urls, max_pages=10)
            pages, stored = report["visited"], report["facts"]
            print(f" {site:25} {pages:>3}p {stored:>5}f")
            reports.append(report)
        except Exception as exc:
            # Best-effort harvest: one broken site must not abort the run.
            print(f" {site:25} FAIL: {str(exc)[:60]}")

    fact_total = 0
    for report in reports:
        fact_total += report.get("facts", 0)

    print(f"=== TOTAL: {fact_total} ===")
    summary = {"crkva_count": len(reports), "total_facts": fact_total}
    print(json.dumps(summary))


# Run the harvest only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|
||||
@@ -0,0 +1,68 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Zdravstvo + udruge PGZ."""
|
||||
import sys, json, time
|
||||
sys.path.insert(0, "/opt/pgz-sport/scrapers/harvesters")
|
||||
from _common import (fetch, extract_text, extract_title, chunk_text,
|
||||
upsert_facts, find_internal_links, DSN)
|
||||
from urllib.parse import urlparse
|
||||
import psycopg2
|
||||
|
||||
# Seed URLs per source: key is the source name passed to crawl(),
# value is the list of start URLs for that source. Iteration order
# is the crawl order.
ZDRAVSTVO = {
    "kbc_rijeka": ["https://www.kbc-rijeka.hr/"],
    "thalassotherapia": ["https://thalassotherapia-opatija.hr/"],
    "klinika_lovran": ["https://www.tnz-lovran.hr/"],
    "klinika_crikvenica": ["https://www.thalassotherapia-crikvenica.hr/"],
    "dom_zdravlja_pgz": ["https://www.dom-zdravlja-pgz.hr/"],
    "zavod_javno_zdravlje": ["https://www.zzjzpgz.hr/"],
    "crveni_kriz_pgz": ["https://www.crveni-kriz-rijeka.hr/"],
    "hzzo_rijeka": ["https://hzzo.hr/"],
    # Same host as "zavod_javno_zdravlje", but seeded from a sub-path.
    "savjetovaliste_ri": ["https://www.zzjzpgz.hr/savjetovaliste/"],
    "deinstitucionalizacija": ["https://www.cczg-rijeka.hr/"],
    "centar_socijalne": ["https://www.czss.rijeka.hr/"],
}
|
||||
|
||||
|
||||
def crawl(name, urls, max_pages=12):
    """Crawl one site breadth-first and store its page text as facts.

    Args:
        name: Source label. Passed to ``upsert_facts`` as ``source_name``
            and used as the prefix of the per-page title fact.
        urls: Seed URLs that initialise the crawl queue.
        max_pages: Upper bound on the number of URLs attempted. A URL is
            marked visited before it is fetched, so failed fetches count.

    Returns:
        A dict with ``name``, ``visited`` (number of URLs attempted) and
        ``facts`` (sum of the values returned by ``upsert_facts``).

    Raises:
        Whatever ``psycopg2.connect``, ``fetch``, the extract helpers or
        ``upsert_facts`` raise; the connection is closed before the
        exception propagates.
    """
    conn = psycopg2.connect(DSN)
    try:
        conn.autocommit = True
        visited = set()
        queue = list(urls)
        facts = 0

        while queue and len(visited) < max_pages:
            url = queue.pop(0)
            if url in visited:
                continue
            visited.add(url)

            html, status = fetch(url, timeout=15)
            if not html or status != 200:
                continue

            title = extract_title(html)
            text = extract_text(html)
            # Pages with little extracted text are skipped entirely:
            # nothing is stored and their links are not followed.
            if not text or len(text) < 200:
                continue

            ff = []
            if title and len(title) > 8:
                ff.append({"fact": f"{name} - {title}", "url": url, "title": title})
            for c in chunk_text(text, 800):
                # Drop chunks of 100 characters or fewer.
                if len(c) > 100:
                    ff.append({"fact": c, "url": url, "title": title})
            facts += upsert_facts(conn, ff, source_name=name,
                                  category="zdravstvo_pgz", confidence=0.86)

            # Follow only links on the same hostname; cap the pending queue at 40.
            base = urlparse(url).hostname
            for link in find_internal_links(html, url):
                if link not in visited and (urlparse(link).hostname or "") == base and len(queue) < 40:
                    queue.append(link)

            # Pause between successfully processed pages.
            time.sleep(0.5)
    finally:
        # Close even when a fetch/parse/upsert raises: the caller catches
        # the exception and continues with the next site, so a missing
        # close here would leak one connection per failed site.
        conn.close()

    return {"name": name, "visited": len(visited), "facts": facts}
|
||||
|
||||
|
||||
def main():
    """Crawl every site listed in ZDRAVSTVO and print a summary.

    Sites are crawled one after another. An exception raised while
    crawling or reporting a site is printed as a ``FAIL`` line (message
    truncated to 60 characters) and the loop moves on to the next site.
    The final line printed is a JSON object holding the number of sites
    crawled without error and the total number of facts they reported.
    """
    reports = []
    for site, seed_urls in ZDRAVSTVO.items():
        try:
            # NOTE(review): the explicit max_pages=10 overrides crawl()'s
            # own default of 12 — confirm which limit is intended.
            report = crawl(site, seed_urls, max_pages=10)
            pages, stored = report["visited"], report["facts"]
            print(f" {site:25} {pages:>3}p {stored:>5}f")
            reports.append(report)
        except Exception as exc:
            # Best-effort harvest: one broken site must not abort the run.
            print(f" {site:25} FAIL: {str(exc)[:60]}")

    fact_total = 0
    for report in reports:
        fact_total += report.get("facts", 0)

    print(f"=== TOTAL: {fact_total} ===")
    summary = {"zdr_count": len(reports), "total_facts": fact_total}
    print(json.dumps(summary))


# Run the harvest only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user