Sidebar: +ERP +CRM +Dokumenti, godišnjaci import (18 PDFs), filter helpers

- pgz nav now includes /erp/full, /crm/v2, /admin/users, /dokumenti
- 4 dokumenti endpoints: list, godišnjaci/list, godišnjak/{godina} PDF, detail
- 18 godišnjaka u pgz_sport.dokumenti (2006-2024) with savez_id=333
- PGŽ filter helpers (window._pgz_filter_priority, togglePGZFilter)
- navItemClick handler for nav items with href
This commit is contained in:
2026-05-05 13:08:11 +02:00
parent 9fb512932a
commit 1d02c0897d
970 changed files with 268354 additions and 434 deletions
+169
View File
@@ -0,0 +1,169 @@
#!/usr/bin/env python3
# ═══════════════════════════════════════════════════════════════════
# Fajl: pgz_savezi_deep.py | v1.0.0 | 05.05.2026
# Lokacija: /opt/pgz-sport/scrapers/pgz_savezi_deep.py
# Svrha: Deep crawl glavnih sportskih saveza za PGŽ klubove
# - HNS (nogomet) — hns-cff.hr, prvahnl.hr
# - HKS (košarka) — hks.hr, abaliga.com
# - HRS (rukomet) — hrs.hr
# - HOS (odbojka) — hos.hr
# - HBS (boćanje) — hbs.hr
# - HVS (vaterpolo) — hvs.hr
# Sve klubove + utakmice + rezultate koji su u PGŽ
# ═══════════════════════════════════════════════════════════════════
"""Multi-savez deep scrape for PGŽ clubs."""
import os, sys, re, time, hashlib, logging, json
from urllib.parse import urljoin, urlparse
import urllib.request
from html import unescape
import psycopg2
from psycopg2.extras import execute_batch
logging.basicConfig(level=logging.INFO, format="%(asctime)s [savezi] %(message)s")
log = logging.getLogger("savezi")

# Database connection string. The PGZ_SPORT_DSN environment variable, when set
# and non-empty, overrides the built-in value so credentials can be rotated
# without editing this file.
# NOTE(review): the fallback below embeds a password in source control; rotate
# it and remove the literal once PGZ_SPORT_DSN is provisioned on the host.
DSN = os.environ.get("PGZ_SPORT_DSN") or (
    "host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password=R1net2026!SecureDB#v7"
)

# User-Agent header sent with every HTTP request made by this script.
UA = "Mozilla/5.0 (Ri.NET Civic Bot 1.0; contact: dradulic@outlook.com)"

# PGŽ municipalities — a page is treated as relevant when its text contains
# any of these names (plain substring match, see is_pgz_relevant).
PGZ_TOWNS = ["Rijeka", "Opatija", "Crikvenica", "Krk", "Cres", "Mali Lošinj",
             "Rab", "Delnice", "Vrbovsko", "Čabar", "Bakar", "Kraljevica",
             "Kastav", "Viškovo", "Klana", "Mošćenička Draga", "Lovran",
             "Matulji", "Omišalj", "Punat", "Vrbnik", "Baška", "Dobrinj",
             "Malinska", "Jelenje", "Costrena", "Kostrena", "Čavle", "Lopar",
             "Brod Moravice", "Mrkopalj", "Ravna Gora", "Lokve", "Skrad",
             "Fužine", "Novi Vinodolski", "Vinodol"]

# Seed URLs per federation. The key is also used to build the `source`
# column value ("savezi_<key>") when facts are stored.
ROOTS = {
    "hns_nogomet": ["https://hns-cff.hr/", "https://prvahnl.hr/", "https://hns-cff.hr/klubovi/"],
    "hks_kosarka": ["https://hks.hr/", "https://hks.hr/klubovi/"],
    "hrs_rukomet": ["https://hrs.hr/", "https://hrs.hr/klubovi/"],
    "hos_odbojka": ["https://hos.hr/", "https://hos.hr/klubovi/"],
    "hbs_bocanje": ["https://hbs.hr/", "https://hbs.hr/klubovi/"],
    "hvs_vaterpolo": ["https://hvs.hr/", "https://hvs.hr/klubovi/"],
    "hps_plivanje": ["https://hps.hr/"],
    "haof_atletika": ["https://haaf.hr/"],
    "hgsf_gimnastika": ["https://hgsf.hr/"],
}
def fetch(url, timeout=20, retries=2):
    """Download *url* and return ``(text, status)``.

    The body is decoded as UTF-8 with undecodable bytes replaced.

    Args:
        url: Absolute URL to request.
        timeout: Socket timeout in seconds for each attempt.
        retries: Total number of attempts before giving up.

    Returns:
        ``(body_text, http_status)`` on success, ``(None, 0)`` when every
        attempt failed (or when ``retries`` is 0).
    """
    for i in range(retries):
        try:
            req = urllib.request.Request(url, headers={"User-Agent": UA})
            with urllib.request.urlopen(req, timeout=timeout) as r:
                return r.read().decode("utf-8", errors="replace"), r.status
        except Exception as e:
            # Best-effort crawler: never raise, but leave a trace of the failure.
            log.warning(f"fetch fail attempt {i+1} {url}: {e}")
            # Linear back-off between attempts; waiting after the last one
            # would only delay the caller.
            if i + 1 < retries:
                time.sleep(2*(i+1))
    return None, 0
def extract_text(html):
    """Reduce an HTML document to whitespace-normalised plain text.

    ``<script>`` and ``<style>`` elements are dropped together with their
    content, every remaining tag becomes a space, HTML entities are decoded
    and whitespace runs collapse to a single space. ``None`` yields ``""``.
    """
    markup = html or ""
    for pattern in (r"<script[^>]*>.*?</script>", r"<style[^>]*>.*?</style>"):
        markup = re.sub(pattern, "", markup, flags=re.S|re.I)
    plain = unescape(re.sub(r"<[^>]+>", " ", markup))
    return re.sub(r"\s+", " ", plain).strip()
def is_pgz_relevant(text):
    """Return True when *text* mentions a PGŽ town or the county itself.

    Matching is a case-sensitive substring test against every entry of
    ``PGZ_TOWNS`` plus the literals "Primorsko-goranska" and "PGŽ".
    """
    if any(town in text for town in PGZ_TOWNS):
        return True
    return "Primorsko-goranska" in text or "PGŽ" in text
def chunk(text, max_len=800):
    """Split *text* into pieces of at most *max_len* characters.

    Text that already fits is returned as a single-element list (empty text
    gives ``[]``). Longer text is cut at the last sentence separator found in
    the second half of each window, falling back to a hard cut at *max_len*.
    Pieces of 80 characters or fewer are dropped from multi-piece results.
    """
    total = len(text)
    if total <= max_len:
        return [text] if text else []
    pieces = []
    pos = 0
    while pos < total:
        cut = min(pos + max_len, total)
        if cut < total:
            # Only accept a separator past the midpoint of the window so
            # pieces do not become very short.
            threshold = pos + max_len // 2
            for sep in (". ", "! ", "? ", "\n"):
                hit = text.rfind(sep, pos, cut)
                if hit > threshold:
                    cut = hit + len(sep)
                    break
        pieces.append(text[pos:cut].strip())
        pos = cut
    return [piece for piece in pieces if len(piece) > 80]
def upsert(conn, facts, savez_key):
    """Insert facts into ``dabi.knowledge`` and return how many rows were new.

    Rows are de-duplicated by the MD5 of the fact text (``data_hash``);
    conflicting rows are skipped by ``ON CONFLICT DO NOTHING``.

    Args:
        conn: Open psycopg2 connection. The caller in this file opens it in
            autocommit mode, so every successful row is committed immediately.
        facts: Iterable of dicts with a ``"fact"`` key and optional ``"url"``
            and ``"confidence"`` keys.
        savez_key: Federation key; stored as ``source = "savezi_<key>"``.

    Returns:
        Number of rows actually inserted. On a database error the error is
        logged and the count reached so far is returned.
    """
    if not facts:
        return 0
    rows = []
    for f in facts:
        h = hashlib.md5(f["fact"].encode()).hexdigest()
        rows.append((f["fact"], f"savezi_{savez_key}", "pgz_sport_savezi",
                     f.get("confidence", 0.82), h,
                     json.dumps({"url": f.get("url", "")})))
    sql = """INSERT INTO dabi.knowledge (fact, source, category, confidence, data_hash, source_refs)
             VALUES (%s, %s, %s, %s, %s, %s::jsonb) ON CONFLICT (data_hash) DO NOTHING"""
    inserted = 0
    cur = conn.cursor()
    try:
        # Execute row by row: after execute_batch() the cursor's rowcount only
        # describes the last statement sent, so it cannot be used to count
        # inserts. Batches here are small (one page worth of chunks).
        for row in rows:
            cur.execute(sql, row)
            inserted += max(cur.rowcount, 0)
    except Exception as e:
        log.error(f"upsert: {e}")
    finally:
        # Always release the cursor, including on the error path.
        cur.close()
    return inserted
def crawl_savez(savez_key, urls, max_per=80):
    """Breadth-first crawl of one federation's sites, storing PGŽ-relevant text.

    Starting from *urls*, same-host links are followed until *max_per* pages
    have been visited. Pages whose text passes ``is_pgz_relevant`` are split
    into chunks and written through ``upsert``.

    Args:
        savez_key: Federation key (used for logging and as the fact source).
        urls: Seed URLs.
        max_per: Maximum number of pages to visit.

    Returns:
        Dict with keys ``savez``, ``visited``, ``pgz_relevant`` and ``facts``.
    """
    log.info(f"=== {savez_key} ===")
    conn = psycopg2.connect(DSN)
    conn.autocommit = True
    visited = set()
    queue = list(urls)
    # Everything ever enqueued; keeps duplicates from using up the queue cap.
    queued = set(queue)
    total_facts = 0
    pgz_relevant = 0
    try:
        while queue and len(visited) < max_per:
            url = queue.pop(0)
            if url in visited:
                continue
            visited.add(url)
            html, _ = fetch(url, timeout=15)
            # Throttle after every request, not only after relevant pages,
            # so non-relevant pages are not fetched back-to-back.
            time.sleep(0.4)
            if not html:
                continue
            text = extract_text(html)
            if not text or len(text) < 100:
                continue
            # Add subpages (same host only, fragment removed before de-duping)
            base_host = urlparse(url).hostname or ""
            for m in re.finditer(r'href=["\']([^"\']+)["\']', html):
                if len(queue) >= 200:
                    break
                try:
                    u = urljoin(url, m.group(1)).split("#")[0]
                    host = urlparse(u).hostname or ""
                except ValueError:
                    # Malformed href (e.g. broken IPv6 literal) — ignore it.
                    continue
                if host == base_host and u not in queued:
                    queued.add(u)
                    queue.append(u)
            # Only ingest PGŽ-relevant content
            if not is_pgz_relevant(text):
                continue
            pgz_relevant += 1
            facts = [{"fact": c, "url": url, "confidence": 0.82}
                     for c in chunk(text, 800) if len(c) > 100]
            total_facts += upsert(conn, facts, savez_key)
    finally:
        # Release the DB connection even when the crawl aborts with an error.
        conn.close()
    log.info(f" {savez_key}: visited={len(visited)} pgz_relevant={pgz_relevant} facts={total_facts}")
    return {"savez": savez_key, "visited": len(visited),
            "pgz_relevant": pgz_relevant, "facts": total_facts}
def main():
    """Crawl every federation in ``ROOTS`` and print a JSON summary to stdout.

    A failure in one federation is logged and recorded as an ``error`` entry;
    the remaining federations are still crawled.
    """
    results = []
    for savez, urls in ROOTS.items():
        try:
            outcome = crawl_savez(savez, urls, max_per=60)
        except Exception as e:
            log.error(f"{savez} fail: {e}")
            outcome = {"savez": savez, "error": str(e)}
        results.append(outcome)
    # Error entries carry no "facts" key and count as zero.
    total = sum(item.get("facts", 0) for item in results)
    print(json.dumps({"summary": results,
                      "total_facts": total}))


if __name__ == "__main__":
    main()
+194
View File
@@ -0,0 +1,194 @@
#!/usr/bin/env python3
# ═══════════════════════════════════════════════════════════════════
# Fajl: rss_hr_full.py | v1.0.0 | 05.05.2026
# Lokacija: /opt/pgz-sport/scrapers/rss_hr_full.py
# Svrha: rss.hr (Riječki sport savez) full crawl
# ═══════════════════════════════════════════════════════════════════
"""rss.hr complete corpus."""
import os, sys, re, time, hashlib, logging, json
from urllib.parse import urljoin, urlparse
import urllib.request
from html import unescape
import psycopg2
from psycopg2.extras import execute_batch
logging.basicConfig(level=logging.INFO, format="%(asctime)s [rss_hr] %(message)s")
log = logging.getLogger("rss_hr")

# Database connection string. The PGZ_SPORT_DSN environment variable, when set
# and non-empty, overrides the built-in value so credentials can be rotated
# without editing this file.
# NOTE(review): the fallback below embeds a password in source control; rotate
# it and remove the literal once PGZ_SPORT_DSN is provisioned on the host.
DSN = os.environ.get("PGZ_SPORT_DSN") or (
    "host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password=R1net2026!SecureDB#v7"
)

# User-Agent header sent with every HTTP request made by this script.
UA = "Mozilla/5.0 (Ri.NET Civic Bot 1.0; contact: dradulic@outlook.com)"
ROOT = "https://rss.hr"

# Local archive directories for raw pages and downloaded PDFs; created at
# import time so the crawl can write into them immediately.
HTML_DIR = "/opt/pgz-sport/data/rss_hr_html"
PDF_DIR = "/opt/pgz-sport/data/rss_hr_pdf"
os.makedirs(HTML_DIR, exist_ok=True)
os.makedirs(PDF_DIR, exist_ok=True)
def fetch(url, timeout=20, retries=3, binary=False):
    """Download *url* and return ``(payload, status)``.

    Args:
        url: Absolute URL to request.
        timeout: Socket timeout in seconds for each attempt.
        retries: Total number of attempts before giving up.
        binary: When true the raw bytes are returned; otherwise the body is
            decoded as UTF-8 with undecodable bytes replaced.

    Returns:
        ``(payload, http_status)`` on success, ``(None, 0)`` when every
        attempt failed (or when ``retries`` is 0).
    """
    for i in range(retries):
        try:
            req = urllib.request.Request(url, headers={"User-Agent": UA})
            with urllib.request.urlopen(req, timeout=timeout) as r:
                d = r.read()
                if binary:
                    return d, r.status
                return d.decode("utf-8", errors="replace"), r.status
        except Exception as e:
            # Best-effort crawler: never raise, but leave a trace of the failure.
            log.warning(f"fetch fail attempt {i+1} {url}: {e}")
            # Linear back-off between attempts; waiting after the last one
            # would only delay the caller.
            if i + 1 < retries:
                time.sleep(3*(i+1))
    return None, 0
def extract_title(html):
    """Return the text of the first attribute-less ``<title>`` element.

    Entities are decoded and whitespace runs collapsed; ``""`` is returned
    when no such element is present or *html* is empty/``None``.
    """
    found = re.search(r"<title>([^<]+)</title>", html or "", re.I)
    if not found:
        return ""
    decoded = unescape(found.group(1))
    return re.sub(r"\s+", " ", decoded).strip()
def extract_text(html):
    """Reduce an HTML document to whitespace-normalised plain text.

    ``<script>``, ``<style>``, ``<nav>`` and ``<footer>`` elements are dropped
    together with their content (in that order), every remaining tag becomes
    a space, entities are decoded and whitespace runs collapse to one space.
    ``None`` yields ``""``.
    """
    markup = html or ""
    for tag in ("script", "style", "nav", "footer"):
        markup = re.sub(rf"<{tag}[^>]*>.*?</{tag}>", "", markup, flags=re.S|re.I)
    plain = unescape(re.sub(r"<[^>]+>", " ", markup))
    return re.sub(r"\s+", " ", plain).strip()
def find_internal_links(html, base):
    """Collect the distinct rss.hr links referenced by ``href`` attributes.

    Relative links are resolved against *base*. Only the ``#fragment`` is
    removed; the query string is kept, so URLs that differ only in their
    query are treated as different.

    Args:
        html: Page markup (may be empty or ``None``).
        base: URL of the page, used to resolve relative hrefs.

    Returns:
        List of unique absolute URLs (order unspecified) whose host is
        ``rss.hr`` or one of its subdomains.
    """
    if not html:
        return []
    out = set()
    for m in re.finditer(r'href=["\']([^"\']+)["\']', html):
        try:
            u = urljoin(base, m.group(1)).split("#")[0]
            host = (urlparse(u).hostname or "").lower()
        except ValueError:
            # Malformed href (e.g. broken IPv6 literal) — ignore it.
            continue
        # Exact host or subdomain only: a substring test would also accept
        # foreign hosts such as "evilrss.hr" or "rss.hr.example.com".
        if host == "rss.hr" or host.endswith(".rss.hr"):
            out.add(u)
    return list(out)
def find_pdfs(html, base):
    """Return the distinct absolute URLs of hrefs ending in ``.pdf``.

    The extension match is case-insensitive; relative links are resolved
    against *base*. The order of the returned list is unspecified.
    """
    hrefs = re.findall(r'href=["\']([^"\']+\.pdf)["\']', html or "", re.I)
    return list({urljoin(base, href) for href in hrefs})
def chunk(text, max_len=800):
    """Split *text* into pieces of at most *max_len* characters.

    Text that already fits is returned as a single-element list (empty text
    gives ``[]``). Longer text is cut at the last sentence separator found in
    the second half of each window, falling back to a hard cut at *max_len*.
    Pieces of 50 characters or fewer are dropped from multi-piece results.
    """
    size = len(text)
    if size <= max_len:
        return [text] if text else []
    parts = []
    begin = 0
    while begin < size:
        stop = min(begin + max_len, size)
        if stop < size:
            # Only a separator past the window midpoint is accepted.
            midpoint = begin + max_len // 2
            for sep in (". ", "! ", "? ", "\n"):
                at = text.rfind(sep, begin, stop)
                if at > midpoint:
                    stop = at + len(sep)
                    break
        parts.append(text[begin:stop].strip())
        begin = stop
    return [part for part in parts if len(part) > 50]
def upsert(conn, facts):
    """Insert facts into ``dabi.knowledge`` and return how many rows were new.

    Rows are de-duplicated by the MD5 of the fact text (``data_hash``);
    conflicting rows are skipped by ``ON CONFLICT DO NOTHING``.

    Args:
        conn: Open psycopg2 connection. The caller in this file opens it in
            autocommit mode, so every successful row is committed immediately.
        facts: Iterable of dicts with ``"fact"`` and ``"source"`` keys and
            optional ``"category"``, ``"confidence"`` and ``"url"`` keys.

    Returns:
        Number of rows actually inserted. On a database error the error is
        logged and the count reached so far is returned.
    """
    if not facts:
        return 0
    rows = []
    for f in facts:
        h = hashlib.md5(f["fact"].encode()).hexdigest()
        rows.append((f["fact"], f["source"], f.get("category", "rss_hr"),
                     f.get("confidence", 0.85), h,
                     json.dumps({"url": f.get("url", "")})))
    sql = """INSERT INTO dabi.knowledge (fact, source, category, confidence, data_hash, source_refs)
             VALUES (%s, %s, %s, %s, %s, %s::jsonb) ON CONFLICT (data_hash) DO NOTHING"""
    inserted = 0
    cur = conn.cursor()
    try:
        # Execute row by row: after execute_batch() the cursor's rowcount only
        # describes the last statement sent, so it cannot be used to count
        # inserts. Batches here are small (one page worth of chunks).
        for row in rows:
            cur.execute(sql, row)
            inserted += max(cur.rowcount, 0)
    except Exception as e:
        log.error(f"upsert: {e}")
    finally:
        # Always release the cursor, including on the error path.
        cur.close()
    return inserted
def crawl(max_pages=400):
    """Crawl rss.hr, store page text as facts and archive linked PDFs.

    Phase 1 walks internal links breadth-first from a fixed set of seed
    sections, saves each page's HTML under ``HTML_DIR`` and writes the page
    title plus text chunks through ``upsert``. Phase 2 downloads up to 100
    of the PDF links seen during phase 1 into ``PDF_DIR``.

    Args:
        max_pages: Maximum number of pages to visit in phase 1.

    Returns:
        Dict with keys ``visited``, ``facts`` and ``pdfs`` (newly downloaded).
    """
    log.info(f"=== rss.hr crawl (max {max_pages} pages) ===")
    conn = psycopg2.connect(DSN)
    conn.autocommit = True
    visited = set()
    queue = [ROOT, f"{ROOT}/clanovi/", f"{ROOT}/natjecanja/",
             f"{ROOT}/dokumenti/", f"{ROOT}/o-nama/",
             f"{ROOT}/sportasi-sezone/", f"{ROOT}/povjerenstva/",
             f"{ROOT}/strucni-savjet/"]
    # Everything ever enqueued; keeps duplicates from using up the queue cap.
    queued = set(queue)
    # Link targets that are never HTML pages. Fetching them as text would
    # store decoded binary junk as facts; PDFs are handled in phase 2.
    skip_ext = (".pdf", ".jpg", ".jpeg", ".png", ".gif", ".webp", ".svg",
                ".ico", ".css", ".js", ".zip", ".rar", ".doc", ".docx",
                ".xls", ".xlsx", ".ppt", ".pptx", ".mp3", ".mp4")
    total_facts = 0
    pdfs = set()
    try:
        while queue and len(visited) < max_pages:
            url = queue.pop(0)
            if url in visited:
                continue
            visited.add(url)
            if len(visited) % 20 == 0:
                log.info(f" visited {len(visited)}, queue {len(queue)}, facts {total_facts}")
            html, _ = fetch(url, timeout=15)
            if not html:
                continue
            # Save html (best effort — a failed write must not stop the crawl)
            try:
                h = hashlib.md5(url.encode()).hexdigest()[:16]
                with open(f"{HTML_DIR}/{h}.html", "w", encoding="utf-8") as f:
                    f.write(html)
            except Exception as e:
                log.warning(f"html save {url}: {e}")
            title = extract_title(html)
            text = extract_text(html)
            # PDFs
            pdfs.update(find_pdfs(html, url))
            # Facts
            facts = []
            if title and len(title) > 8:
                facts.append({"fact": f"rss.hr — {title}", "source": "rss.hr",
                              "category": "rss_hr_riecki_sport_savez",
                              "confidence": 0.90, "url": url})
            for c in chunk(text, 800):
                if len(c) < 80:
                    continue
                facts.append({"fact": c, "source": "rss.hr",
                              "category": "rss_hr_riecki_sport_savez",
                              "confidence": 0.85, "url": url})
            total_facts += upsert(conn, facts)
            # Discover more links
            for l in find_internal_links(html, url):
                if l in queued or len(queue) >= 1000:
                    continue
                if urlparse(l).path.lower().endswith(skip_ext):
                    continue
                queued.add(l)
                queue.append(l)
            time.sleep(0.4)
    finally:
        # The database is only needed for phase 1; release it even on error.
        conn.close()
    # Download PDFs (sorted so the 100-item cap selects a reproducible subset)
    pdf_dl = 0
    for p in sorted(pdfs)[:100]:
        try:
            h = hashlib.md5(p.encode()).hexdigest()[:16]
            path = f"{PDF_DIR}/{h}.pdf"
            if os.path.exists(path):
                continue
            data, st = fetch(p, timeout=30, binary=True)
            if data and st == 200:
                with open(path, "wb") as f:
                    f.write(data)
                pdf_dl += 1
            time.sleep(0.8)
        except Exception as e:
            log.warning(f"pdf err {p}: {e}")
    log.info(f"=== DONE: {len(visited)} visited, {total_facts} facts, {pdf_dl} pdfs ===")
    return {"visited": len(visited), "facts": total_facts, "pdfs": pdf_dl}


if __name__ == "__main__":
    r = crawl()
    print(json.dumps(r))
+244
View File
@@ -0,0 +1,244 @@
#!/usr/bin/env python3
# ═══════════════════════════════════════════════════════════════════
# Fajl: sport_pgz_full.py | v1.0.0 | 05.05.2026
# Lokacija: /opt/pgz-sport/scrapers/sport_pgz_full.py
# Svrha: Sitemap-driven full crawl of sport-pgz.hr
# - All 4 sitemaps: objave (1+2), natječaji, stranice
# - PDF download (OCR ingest is not implemented in this script yet)
# - Article parsing → dabi.knowledge ingest
# ═══════════════════════════════════════════════════════════════════
"""sport-pgz.hr complete corpus via sitemap."""
import os, sys, re, time, hashlib, logging, json
from urllib.parse import urljoin, urlparse
import urllib.request
from html import unescape
import psycopg2
from psycopg2.extras import execute_batch
logging.basicConfig(level=logging.INFO, format="%(asctime)s [sport_pgz] %(message)s")
log = logging.getLogger("sport_pgz")

# Database connection string. The PGZ_SPORT_DSN environment variable, when set
# and non-empty, overrides the built-in value so credentials can be rotated
# without editing this file.
# NOTE(review): the fallback below embeds a password in source control; rotate
# it and remove the literal once PGZ_SPORT_DSN is provisioned on the host.
DSN = os.environ.get("PGZ_SPORT_DSN") or (
    "host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password=R1net2026!SecureDB#v7"
)

# User-Agent header sent with every HTTP request made by this script.
UA = "Mozilla/5.0 (Ri.NET Civic Bot 1.0; contact: dradulic@outlook.com)"
SITEMAP_INDEX = "https://sport-pgz.hr/sitemap.xml"

# Local archive directories for downloaded PDFs and raw pages; created at
# import time so the crawl can write into them immediately.
PDF_DIR = "/opt/pgz-sport/data/sport_pgz_pdf"
HTML_DIR = "/opt/pgz-sport/data/sport_pgz_html"
os.makedirs(PDF_DIR, exist_ok=True)
os.makedirs(HTML_DIR, exist_ok=True)
def fetch(url, timeout=20, retries=3, binary=False):
    """Download *url* and return ``(payload, status)``.

    Args:
        url: Absolute URL to request.
        timeout: Socket timeout in seconds for each attempt.
        retries: Total number of attempts before giving up.
        binary: When true the raw bytes are returned; otherwise the body is
            decoded as UTF-8 with undecodable bytes replaced.

    Returns:
        ``(payload, http_status)`` on success, ``(None, 0)`` when every
        attempt failed (or when ``retries`` is 0).
    """
    for i in range(retries):
        try:
            req = urllib.request.Request(url, headers={"User-Agent": UA})
            with urllib.request.urlopen(req, timeout=timeout) as r:
                data = r.read()
                if binary:
                    return data, r.status
                return data.decode("utf-8", errors="replace"), r.status
        except Exception as e:
            log.warning(f"fetch fail attempt {i+1} {url}: {e}")
            # Linear back-off between attempts; waiting after the last one
            # would only delay the caller.
            if i + 1 < retries:
                time.sleep(3*(i+1))
    return None, 0
def parse_sitemap_index(xml):
    """Return the http(s) URLs found in ``<loc>`` elements of *xml*.

    Whitespace around the URL is ignored and the XML entities ``&amp;``,
    ``&lt;``, ``&gt;``, ``&quot;`` and ``&apos;`` are decoded, so the result
    can be requested directly. Empty or ``None`` input gives ``[]``.
    """
    from xml.sax.saxutils import unescape as xml_unescape

    # Sitemap files carry entity-escaped URLs; decode only the XML entities
    # so a stray raw "&name" in a sloppy file is left untouched.
    extra = {"&quot;": '"', "&apos;": "'"}
    locs = re.findall(r"<loc>\s*(https?://[^<]+?)\s*</loc>", xml or "")
    return [xml_unescape(loc, extra) for loc in locs]
def parse_sitemap_urls(xml):
    """Return ``(url, lastmod)`` pairs for every ``<url>`` entry of *xml*.

    Only entries whose ``<loc>`` is the first child of ``<url>`` are matched.
    Whitespace around the values is ignored and XML entities in the URL are
    decoded. ``lastmod`` is ``""`` when the element is absent or does not
    directly follow ``<loc>``. Empty or ``None`` input gives ``[]``.
    """
    from xml.sax.saxutils import unescape as xml_unescape

    # Sitemap files carry entity-escaped URLs (e.g. "&amp;" in query strings).
    extra = {"&quot;": '"', "&apos;": "'"}
    pattern = (r"<url>\s*<loc>\s*([^<]+?)\s*</loc>"
               r"(?:\s*<lastmod>\s*([^<]*?)\s*</lastmod>)?")
    out = []
    for m in re.finditer(pattern, xml or ""):
        out.append((xml_unescape(m.group(1), extra), m.group(2) or ""))
    return out
def extract_main_text(html):
    """Reduce an HTML document to whitespace-normalised plain text.

    ``<script>``, ``<style>``, ``<nav>`` and ``<footer>`` elements are dropped
    together with their content (in that order), every remaining tag becomes
    a space, entities are decoded and whitespace runs collapse to one space.
    Empty or ``None`` input yields ``""``.
    """
    if not html:
        return ""
    stripped = html
    for tag in ("script", "style", "nav", "footer"):
        stripped = re.sub(rf"<{tag}[^>]*>.*?</{tag}>", "", stripped, flags=re.S|re.I)
    without_tags = re.sub(r"<[^>]+>", " ", stripped)
    return re.sub(r"\s+", " ", unescape(without_tags)).strip()
def extract_title(html):
    """Return the text of the first attribute-less ``<title>`` element.

    Entities are decoded and whitespace runs collapsed; ``""`` is returned
    when no such element is present or *html* is empty/``None``.
    """
    hit = re.search(r"<title>([^<]+)</title>", html or "", re.I)
    return re.sub(r"\s+", " ", unescape(hit.group(1))).strip() if hit else ""
def find_pdf_links(html, base):
    """Return the distinct absolute URLs of hrefs ending in ``.pdf``.

    The extension match is case-insensitive; relative links are resolved
    against *base*. Empty or ``None`` markup gives ``[]``. The order of the
    returned list is unspecified.
    """
    if not html:
        return []
    hrefs = re.findall(r'href=["\']([^"\']+\.pdf)["\']', html, re.I)
    return list(set(urljoin(base, href) for href in hrefs))
def chunk_text(text, max_len=800):
    """Split *text* into pieces of at most *max_len* characters.

    Text that already fits is returned as a single-element list (empty text
    gives ``[]``). Longer text is cut at the last sentence separator found in
    the second half of each window, falling back to a hard cut at *max_len*.
    Pieces of 50 characters or fewer are dropped from multi-piece results.
    """
    length = len(text)
    if length <= max_len:
        return [text] if text else []
    pieces = []
    cursor = 0
    while cursor < length:
        boundary = min(cursor + max_len, length)
        if boundary < length:
            # Prefer ". ", then "! ", "? ", newline — but only past the
            # midpoint of the window so pieces do not become very short.
            floor = cursor + max_len // 2
            for sep in (". ", "! ", "? ", "\n"):
                found = text.rfind(sep, cursor, boundary)
                if found > floor:
                    boundary = found + len(sep)
                    break
        pieces.append(text[cursor:boundary].strip())
        cursor = boundary
    return [piece for piece in pieces if len(piece) > 50]
def upsert_facts(conn, facts):
    """Insert facts into ``dabi.knowledge`` and return how many rows were new.

    Rows are de-duplicated by the MD5 of the fact text (``data_hash``);
    conflicting rows are skipped by ``ON CONFLICT DO NOTHING``.

    Args:
        conn: Open psycopg2 connection. The caller in this file opens it in
            autocommit mode, so every successful row is committed immediately.
        facts: Iterable of dicts with ``"fact"`` and ``"source"`` keys and
            optional ``"category"``, ``"confidence"`` and ``"url"`` keys.

    Returns:
        Number of rows actually inserted. On a database error the error is
        logged and the count reached so far is returned.
    """
    if not facts:
        return 0
    rows = []
    for f in facts:
        h = hashlib.md5(f["fact"].encode()).hexdigest()
        rows.append((
            f["fact"], f["source"], f.get("category", "sport_pgz"),
            f.get("confidence", 0.85), h,
            json.dumps({"url": f.get("url", "")}),
        ))
    sql = """
        INSERT INTO dabi.knowledge (fact, source, category, confidence, data_hash, source_refs)
        VALUES (%s, %s, %s, %s, %s, %s::jsonb)
        ON CONFLICT (data_hash) DO NOTHING
    """
    cnt = 0
    cur = conn.cursor()
    try:
        # Execute row by row: after execute_batch() the cursor's rowcount only
        # describes the last statement sent, so it cannot be used to count
        # inserts. Batches here are small (one page worth of chunks).
        for row in rows:
            cur.execute(sql, row)
            cnt += max(cur.rowcount, 0)
    except Exception as e:
        log.error(f"upsert err: {e}")
    finally:
        # Always release the cursor, including on the error path.
        cur.close()
    return cnt
def crawl():
    """Crawl sport-pgz.hr from its sitemap, store page text and archive PDFs.

    Steps: (1) read the sitemap index and every sub-sitemap it lists,
    (2) fetch each listed page, save its HTML under ``HTML_DIR`` and write the
    title plus text chunks through ``upsert_facts``, (3) download up to 200 of
    the PDF links seen in step 2 into ``PDF_DIR``.

    Returns:
        Dict with keys ``crawled``, ``facts`` and ``pdfs`` (newly downloaded),
        or ``None`` when the sitemap index could not be fetched.
    """
    log.info("=== sport-pgz.hr full crawl ===")
    conn = psycopg2.connect(DSN)
    conn.autocommit = True
    total_facts = 0
    crawled = 0
    pdfs_found = set()
    try:
        # 1. Get sitemap index
        xml, _ = fetch(SITEMAP_INDEX)
        if not xml:
            log.error("sitemap fetch failed")
            return None
        sub_sitemaps = parse_sitemap_index(xml)
        log.info(f"sub-sitemaps: {len(sub_sitemaps)}")
        all_urls = []
        for sm in sub_sitemaps:
            sub_xml, _ = fetch(sm)
            if sub_xml:
                urls = parse_sitemap_urls(sub_xml)
                all_urls.extend(urls)
                log.info(f" {sm}: {len(urls)} urls")
        log.info(f"TOTAL URLs to crawl: {len(all_urls)}")
        # 2. Crawl each URL → text → facts
        for idx, (url, _lastmod) in enumerate(all_urls, 1):
            if idx % 20 == 0:
                log.info(f" progress: {idx}/{len(all_urls)} crawled, {total_facts} facts")
            try:
                html, _ = fetch(url, timeout=15)
                if not html:
                    continue
                # Save HTML for replay (best effort — a failed write must
                # not stop the page from being ingested)
                url_hash = hashlib.md5(url.encode()).hexdigest()[:16]
                html_path = f"{HTML_DIR}/{url_hash}.html"
                try:
                    with open(html_path, "w", encoding="utf-8") as f:
                        f.write(html)
                except Exception as e:
                    log.warning(f"html save {url}: {e}")
                title = extract_title(html)
                text = extract_main_text(html)
                # Collect PDFs
                pdfs_found.update(find_pdf_links(html, url))
                # Build facts
                facts = []
                if title and len(title) > 10:
                    facts.append({
                        "fact": f"sport-pgz.hr — {title}",
                        "source": "sport-pgz.hr",
                        "category": "sport_pgz_official",
                        "confidence": 0.92,
                        "url": url,
                    })
                for chunk in chunk_text(text, max_len=800):
                    if len(chunk) < 80:
                        continue
                    facts.append({
                        "fact": chunk,
                        "source": "sport-pgz.hr",
                        "category": "sport_pgz_official",
                        "confidence": 0.88,
                        "url": url,
                    })
                inserted = upsert_facts(conn, facts)
                total_facts += inserted
                crawled += 1
                time.sleep(0.5)  # rate limit
            except Exception as e:
                log.warning(f"err {url}: {e}")
    finally:
        # Release the DB connection on every exit path, including the early
        # return above and unexpected errors.
        conn.close()
    # 3. Download PDFs (sorted so the cap selects a reproducible subset)
    pdfs_set = sorted(pdfs_found)
    log.info(f"PDF links found: {len(pdfs_set)}")
    pdf_downloaded = 0
    for pdf_url in pdfs_set[:200]:  # limit for first run
        try:
            url_hash = hashlib.md5(pdf_url.encode()).hexdigest()[:16]
            pdf_path = f"{PDF_DIR}/{url_hash}.pdf"
            if os.path.exists(pdf_path):
                continue
            data, status = fetch(pdf_url, timeout=30, binary=True)
            if data and status == 200:
                with open(pdf_path, "wb") as f:
                    f.write(data)
                pdf_downloaded += 1
            time.sleep(1)
        except Exception as e:
            log.warning(f"pdf err {pdf_url}: {e}")
    log.info(f"=== DONE: {crawled} pages crawled, {total_facts} facts inserted, {pdf_downloaded} PDFs downloaded ===")
    return {"crawled": crawled, "facts": total_facts, "pdfs": pdf_downloaded}


if __name__ == "__main__":
    r = crawl()
    print(json.dumps(r))
+147
View File
@@ -0,0 +1,147 @@
#!/usr/bin/env python3
# ═══════════════════════════════════════════════════════════════════
# Fajl: wiki_pgz_sport.py | v1.0.0 | 05.05.2026
# Lokacija: /opt/pgz-sport/scrapers/wiki_pgz_sport.py
# Svrha: Wikipedia HR/EN scrape — PGŽ sport klubovi + sportaši
# - Iterate kroz sve known PGŽ klubove
# - Wiki API → page extract
# - TODO: historical match results od Wikipedia season tables (not implemented in this script yet)
# ═══════════════════════════════════════════════════════════════════
"""Wikipedia PGŽ sport corpus."""
import os, sys, re, time, hashlib, logging, json
import urllib.request, urllib.parse
import psycopg2
from psycopg2.extras import execute_batch
logging.basicConfig(level=logging.INFO, format="%(asctime)s [wiki_sport] %(message)s")
log = logging.getLogger("wiki_sport")

# Database connection string. The PGZ_SPORT_DSN environment variable, when set
# and non-empty, overrides the built-in value so credentials can be rotated
# without editing this file.
# NOTE(review): the fallback below embeds a password in source control; rotate
# it and remove the literal once PGZ_SPORT_DSN is provisioned on the host.
DSN = os.environ.get("PGZ_SPORT_DSN") or (
    "host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password=R1net2026!SecureDB#v7"
)

# User-Agent header sent with every Wikipedia API request.
UA = "Ri.NET Civic Bot 1.0 (contact: dradulic@outlook.com)"

# MediaWiki API endpoints for the Croatian and English Wikipedia.
API_HR = "https://hr.wikipedia.org/w/api.php"
API_EN = "https://en.wikipedia.org/w/api.php"
def wiki_extract(api, title, sentences=None):
    """Fetch the plain-text extract of one Wikipedia page.

    Args:
        api: MediaWiki ``api.php`` endpoint URL.
        title: Page title; redirects are followed by the API.
        sentences: Optional cap passed as ``exsentences``.

    Returns:
        The extract text (possibly ``""``) of the first page in the response,
        or ``None`` when the page is missing, the response lists no pages, or
        the request/parse fails (the failure is logged).
    """
    query = {
        "action": "query", "prop": "extracts", "explaintext": "1",
        "redirects": "1", "format": "json", "titles": title,
    }
    if sentences:
        query["exsentences"] = str(sentences)
    request = urllib.request.Request(
        api + "?" + urllib.parse.urlencode(query),
        headers={"User-Agent": UA},
    )
    try:
        with urllib.request.urlopen(request, timeout=15) as resp:
            payload = json.loads(resp.read())
        pages = payload.get("query", {}).get("pages", {})
        # Only the first page entry is considered; id "-1" marks "not found".
        for page_id, page in pages.items():
            return None if page_id == "-1" else page.get("extract", "")
    except Exception as e:
        log.warning(f"wiki err {title}: {e}")
    return None
def wiki_search(api, query, limit=5):
    """Search Wikipedia and return the titles of the matching pages.

    Args:
        api: MediaWiki ``api.php`` endpoint URL.
        query: Search string passed as ``srsearch``.
        limit: Maximum number of results requested (``srlimit``).

    Returns:
        List of page titles in the order returned by the API; ``[]`` when
        nothing matched or the request/parse failed (the failure is logged).
    """
    params = {"action": "query", "list": "search", "srsearch": query,
              "format": "json", "srlimit": str(limit)}
    url = api + "?" + urllib.parse.urlencode(params)
    req = urllib.request.Request(url, headers={"User-Agent": UA})
    try:
        with urllib.request.urlopen(req, timeout=10) as r:
            d = json.loads(r.read())
        return [p["title"] for p in d.get("query", {}).get("search", [])]
    except Exception as e:
        # Stay best-effort, but log like wiki_extract does so that a network
        # or API failure is distinguishable from "no results".
        log.warning(f"wiki search err {query}: {e}")
        return []
def get_pgz_clubs(conn):
    """Load the active clubs from ``pgz_sport.klubovi``.

    A club counts as active when ``aktivan`` is true or NULL. A NULL
    ``skraceni_naziv`` is returned as ``""``.

    Returns:
        List of ``(naziv, skraceni_naziv)`` tuples ordered by ``naziv``.
    """
    cursor = conn.cursor()
    cursor.execute("""
        SELECT naziv, COALESCE(skraceni_naziv, '')
        FROM pgz_sport.klubovi
        WHERE COALESCE(aktivan, true) = true
        ORDER BY naziv
    """)
    fetched = cursor.fetchall()
    clubs = [(row[0], row[1]) for row in fetched]
    cursor.close()
    return clubs
def chunk(text, max_len=700):
    """Split *text* into pieces of at most *max_len* characters.

    Text that already fits is returned as a single-element list (empty text
    gives ``[]``). Longer text is cut at the last sentence separator found in
    the second half of each window, falling back to a hard cut at *max_len*.
    Pieces of 80 characters or fewer are dropped from multi-piece results.
    """
    n_chars = len(text)
    if n_chars <= max_len:
        return [text] if text else []
    segments = []
    left = 0
    while left < n_chars:
        right = min(left + max_len, n_chars)
        if right < n_chars:
            # Only a separator past the window midpoint is accepted.
            half = left + max_len // 2
            for sep in (". ", "! ", "? ", "\n"):
                idx = text.rfind(sep, left, right)
                if idx > half:
                    right = idx + len(sep)
                    break
        segments.append(text[left:right].strip())
        left = right
    return [seg for seg in segments if len(seg) > 80]
def upsert(conn, facts):
    """Insert Wikipedia facts into ``dabi.knowledge``; return how many were new.

    Every row is stored with source ``wikipedia_pgz_sport`` and category
    ``pgz_sport_wiki``. Rows are de-duplicated by the MD5 of the fact text
    (``data_hash``); conflicting rows are skipped by ``ON CONFLICT DO NOTHING``.

    Args:
        conn: Open psycopg2 connection. The caller in this file opens it in
            autocommit mode, so every successful row is committed immediately.
        facts: Iterable of dicts with a ``"fact"`` key and optional
            ``"confidence"``, ``"page"`` and ``"lang"`` keys.

    Returns:
        Number of rows actually inserted. On a database error the error is
        logged and the count reached so far is returned.
    """
    if not facts:
        return 0
    rows = []
    for f in facts:
        h = hashlib.md5(f["fact"].encode()).hexdigest()
        rows.append((f["fact"], "wikipedia_pgz_sport", "pgz_sport_wiki",
                     f.get("confidence", 0.84), h,
                     json.dumps({"page": f.get("page", ""), "lang": f.get("lang", "hr")})))
    sql = """INSERT INTO dabi.knowledge (fact, source, category, confidence, data_hash, source_refs)
             VALUES (%s, %s, %s, %s, %s, %s::jsonb) ON CONFLICT (data_hash) DO NOTHING"""
    inserted = 0
    cur = conn.cursor()
    try:
        # Execute row by row: after execute_batch() the cursor's rowcount only
        # describes the last statement sent, so it cannot be used to count
        # inserts. Batches here are small (one page worth of chunks).
        for row in rows:
            cur.execute(sql, row)
            inserted += max(cur.rowcount, 0)
    except Exception as e:
        log.error(f"upsert: {e}")
    finally:
        # Always release the cursor, including on the error path.
        cur.close()
    return inserted
def main():
    """Look up each active PGŽ club on Croatian Wikipedia and store the text.

    For every club (first 200) the page with the club's exact name is tried
    first; if that yields nothing, up to three search results are considered
    and the first one whose title contains the last word of the club name or
    one of "Rijeka", "Opatija", "Krk" is used. Extracts longer than 200
    characters are chunked and written through ``upsert``. A JSON summary
    ``{"pages": ..., "facts": ...}`` is printed to stdout at the end.
    """
    conn = psycopg2.connect(DSN)
    conn.autocommit = True
    total_facts = 0
    found_pages = 0
    try:
        clubs = get_pgz_clubs(conn)
        log.info(f"PGŽ active clubs: {len(clubs)}")
        for naziv, _kraci in clubs[:200]:  # limit first run
            # A blank name cannot be looked up and would break split()[-1].
            if not naziv or not naziv.split():
                continue
            # Try direct page first
            text = wiki_extract(API_HR, naziv)
            if not text:
                # Try search
                hints = [naziv.split()[-1], "Rijeka", "Opatija", "Krk"]
                for c in wiki_search(API_HR, naziv, limit=3):
                    if any(t.lower() in c.lower() for t in hints):
                        text = wiki_extract(API_HR, c)
                        if text:
                            break
            if text and len(text) > 200:
                found_pages += 1
                facts = [{"fact": c, "page": naziv, "lang": "hr", "confidence": 0.85}
                         for c in chunk(text, 700)]
                total_facts += upsert(conn, facts)
                # Report only when the counter advances, so the line is not
                # repeated for every following club without a page.
                if found_pages % 20 == 0:
                    log.info(f" progress: pages {found_pages}, facts {total_facts}")
            # Throttle after every club — lookups that find nothing hit the
            # API as well (extract + search + candidate extracts).
            time.sleep(0.5)
    finally:
        # Release the DB connection even when the run aborts with an error.
        conn.close()
    log.info(f"=== DONE: pages={found_pages} facts={total_facts} ===")
    print(json.dumps({"pages": found_pages, "facts": total_facts}))


if __name__ == "__main__":
    main()