#!/usr/bin/env python3
"""PGZ sports clubs — harvested directly from the clubs' web sites."""
import json
import sys
import time
from collections import deque

# The shared harvester helpers live outside the normal import path, so the
# directory has to be registered before `_common` can be imported.
sys.path.insert(0, "/opt/pgz-sport/scrapers/harvesters")
from _common import (fetch, extract_text, extract_title, chunk_text,
                     upsert_facts, find_internal_links, DSN)
from urllib.parse import urlparse
import psycopg2

# Source name -> list of seed URLs the crawl for that source starts from.
KLUBOVI = {
    "hnk_rijeka": ["https://www.nk-rijeka.hr/"],
    "kk_kvarner": ["https://www.kk-kvarner.hr/"],
    "rk_zamet": ["https://www.rk-zamet.hr/"],
    "vk_primorje": ["https://www.vkprimorje.hr/"],
    "ok_rijeka": ["https://www.ok-rijeka.hr/"],
    "haok_mladost": ["https://www.haok-mladost.hr/"],
    "abc_rijeka": ["https://www.abc-rijeka.hr/"],
    "rugby_rijeka": ["https://www.rugbyrijeka.hr/"],
    "pliva_klub_primorje": ["https://www.primorje-aquarius.hr/"],
    "judo_kvarner": ["https://www.judokvarner.hr/"],
    "kuglacki_savez_pgz": ["https://www.kuglacki-savez-pgz.hr/"],
    "tenis_kvarner": ["https://www.tk-kvarner.hr/"],
    "atletika_rijeka": ["https://www.akrijeka.hr/"],
    "biciklisticki": ["https://www.bk-rijeka.hr/"],
    "stoljecesporta": ["https://stoljecesporta.com/"],
}


def crawl(name, urls, max_pages=12):
    """Crawl one source breadth-first and store the text of its pages as facts.

    Args:
        name: Source identifier; passed to ``upsert_facts`` as ``source_name``
            and used as the prefix of the per-page title fact.
        urls: Seed URLs the crawl starts from.
        max_pages: Upper bound on the number of URLs attempted. URLs whose
            fetch fails or whose text is too short still count towards it.

    Returns:
        A dict with the keys ``name``, ``visited`` (number of URLs attempted)
        and ``facts`` (sum of the values returned by ``upsert_facts`` —
        presumably the number of rows written; confirm against ``_common``).

    Raises:
        Whatever ``psycopg2.connect`` or the ``_common`` helpers raise; the
        database connection is closed before the exception propagates.
    """
    conn = psycopg2.connect(DSN)
    try:
        conn.autocommit = True
        visited = set()
        queue = deque(urls)
        # Every URL that was ever put on the queue. Checking against this
        # instead of `visited` alone keeps links that are still waiting in the
        # queue from being appended again and eating into the 30-slot cap.
        enqueued = set(queue)
        facts = 0

        while queue and len(visited) < max_pages:
            url = queue.popleft()
            if url in visited:
                continue
            visited.add(url)

            html, status = fetch(url, timeout=15)
            if not html or status != 200:
                continue

            title = extract_title(html)
            text = extract_text(html)
            # Pages with (almost) no text are skipped entirely, links included.
            if not text or len(text) < 200:
                continue

            page_facts = []
            if title and len(title) > 8:
                page_facts.append(
                    {"fact": f"{name} - {title}", "url": url, "title": title}
                )
            # Chunks of 100 characters or fewer are dropped.
            page_facts.extend(
                {"fact": chunk, "url": url, "title": title}
                for chunk in chunk_text(text, 800)
                if len(chunk) > 100
            )
            facts += upsert_facts(conn, page_facts, source_name=name,
                                  category="sport_klub_pgz", confidence=0.88)

            # Only follow links that stay on the host of the current page.
            base = urlparse(url).hostname
            for link in find_internal_links(html, url):
                if (
                    link not in enqueued
                    and (urlparse(link).hostname or "") == base
                    and len(queue) < 30
                ):
                    queue.append(link)
                    enqueued.add(link)

            # Pause between successfully processed pages.
            time.sleep(0.5)
    finally:
        # Close the connection even when a helper raises; main() swallows the
        # exception and carries on, so a leaked connection would pile up.
        conn.close()

    return {"name": name, "visited": len(visited), "facts": facts}


def main():
    """Crawl every source in ``KLUBOVI`` and print per-source and total counts.

    A source whose crawl raises is reported on its own ``FAIL`` line and left
    out of the totals; the remaining sources are still processed. The last
    line printed is a JSON object with ``klub_count`` and ``total_facts``.
    """
    results = []
    for name, urls in KLUBOVI.items():
        try:
            r = crawl(name, urls, max_pages=10)
            print(f" {name:25} {r['visited']:>3}p {r['facts']:>5}f")
            results.append(r)
        except Exception as e:
            # Top-level boundary: one broken site must not stop the whole run.
            print(f" {name:25} FAIL: {str(e)[:60]}")
    total = sum(r.get("facts", 0) for r in results)
    print(f"=== TOTAL: {total} ===")
    print(json.dumps({"klub_count": len(results), "total_facts": total}))


if __name__ == "__main__":
    main()