#!/usr/bin/env python3
"""Federation deep scrape for the sport federations listed in ``ROOTS``.

For every federation the script crawls its root sites breadth-first
(same-host links only), stores each page's extracted text in
``pgz_sport.dokumenti`` and, for pages whose text mentions a sport keyword,
stores up to three text chunks as facts in ``dabi.knowledge``.

The original header mentioned only HNS, HPS and HRS; ``ROOTS`` also contains
HOK and HKS.
"""
from dotenv import load_dotenv
load_dotenv('/opt/rinet-gpu/.env.master')  # auto-added by patch_scrapers_with_dotenv.sh

import hashlib
import json
import logging
import os
import re
import sys
import time
from collections import deque
from html import unescape  # noqa: F401  (kept: was imported by the original file)
from urllib.parse import urldefrag, urljoin, urlparse

import psycopg2

# The shared scraper helpers live outside this package, so the path must be
# extended before they can be imported.
sys.path.insert(0, '/opt/pgz-sport/scrapers')
from gov_hr_sport_scraper import fetch, extract_text, find_links  # noqa: E402,F401

logging.basicConfig(level=logging.INFO, format='%(asctime)s [fed] %(message)s')
log = logging.getLogger("fed")

# DB_PASSWORD is read from the environment (populated by load_dotenv above);
# a missing variable raises KeyError at import time.
DSN = f"host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password={os.environ['DB_PASSWORD']}"

# Federation code -> list of root URLs where the crawl starts.
ROOTS = {
    "HNS": ["https://hns-cff.hr", "https://www.nspgz.hr"],
    "HPS": ["https://www.hps.hr"],
    "HRS": ["https://www.hrs.hr"],
    "HOK": ["https://www.hok.hr"],  # Croatian Olympic Committee
    # NOTE(review): the original comment read "Hrvatski karatraski savez",
    # which looks like a typo -- confirm which federation HKS refers to.
    "HKS": ["https://www.hks.hr"],
}

# Pages whose lower-cased text contains any of these substrings yield facts.
_SPORT_KEYWORDS = ('klub', 'sportaš', 'natjecanj', 'liga', 'kup', 'prvenstvo')

# Compiled once; both are applied to every fetched page.
_TITLE_RE = re.compile(r'<title[^>]*>([^<]+)', re.I)
_HREF_RE = re.compile(r'href=["\']([^"\']+)["\']', re.I)


def _store_document(cur, fed, url, html, text):
    """Insert one page into ``pgz_sport.dokumenti``; return rows inserted (0 or 1).

    The title is taken from the page's ``<title>`` element and falls back to
    the first 80 characters of the URL. Database errors are logged and
    reported as 0 so that one bad page does not abort the crawl.
    """
    title_m = _TITLE_RE.search(html)
    title = (title_m.group(1).strip() if title_m else '') or url[:80]
    # Content hash over the first 5000 characters of the extracted text.
    sha1 = hashlib.sha1(text[:5000].encode()).hexdigest()
    try:
        cur.execute(
            """INSERT INTO pgz_sport.dokumenti
               (url, fname, title, sadrzaj, vrsta, izvor_url, scraped_at, sha1, organizacija)
               VALUES (%s,%s,%s,%s,%s,%s,now(),%s,%s)
               ON CONFLICT DO NOTHING""",
            (url, url.split('/')[-1][:100], title[:200], text[:50000],
             'web', url, sha1, fed),
        )
    except psycopg2.Error as exc:
        log.warning("document insert failed for %s: %s", url[:80], exc)
        return 0
    return cur.rowcount


def _store_facts(cur, fed, url, text):
    """Insert up to three 800-character chunks of *text* into ``dabi.knowledge``.

    Chunks shorter than 200 characters are skipped. Returns the number of rows
    actually inserted (rows hitting ``ON CONFLICT (data_hash)`` count as 0).
    """
    inserted = 0
    chunks = [text[i:i + 800] for i in range(0, min(len(text), 3000), 800)]
    for ci, chunk in enumerate(chunks[:3]):
        if len(chunk) < 200:
            continue
        # The dedupe key combines URL, chunk index and the chunk's first 80
        # characters; the recipe must stay stable or old rows stop matching.
        fh = hashlib.sha256((url + str(ci) + chunk[:80]).encode()).hexdigest()[:32]
        try:
            cur.execute(
                """INSERT INTO dabi.knowledge
                   (fact, category, source, source_refs, confidence, data_hash, created_at)
                   VALUES (%s,%s,'fed_deep_scraper',%s::jsonb,0.80,%s,now())
                   ON CONFLICT (data_hash) DO NOTHING""",
                (chunk[:1500], f'fed_{fed.lower()}', json.dumps([{"url": url}]), fh),
            )
        except psycopg2.Error as exc:
            log.warning("fact insert failed for %s chunk %s: %s", url[:80], ci, exc)
            continue
        inserted += cur.rowcount
    return inserted


def _internal_links(url, html):
    """Yield absolute, fragment-free links in *html* that stay on *url*'s host."""
    host = urlparse(url).hostname
    for m in _HREF_RE.finditer(html):
        # Strip "#fragment" so anchors within one page are not crawled as
        # separate pages.
        link = urldefrag(urljoin(url, m.group(1))).url
        if urlparse(link).hostname == host:
            yield link


def _crawl_federation(cur, fed, roots, max_pages, max_queue, delay):
    """Breadth-first crawl of one federation; return ``(docs, facts)`` inserted."""
    visited = set()
    queue = deque(roots)
    queued = set(roots)  # mirrors the queue contents for O(1) membership tests
    docs = facts = 0

    while queue and len(visited) < max_pages:
        url = queue.popleft()
        queued.discard(url)
        if url in visited:
            continue
        visited.add(url)

        time.sleep(delay)  # throttle requests between page fetches
        html, status = fetch(url)
        if not html or status != 200:
            continue
        log.info(" [%s] %s", status, url[:80])

        text = extract_text(html)
        # Pages with very little text are neither stored nor followed.
        if len(text) < 200:
            continue

        docs += _store_document(cur, fed, url, html, text)

        # Extract facts only when the page looks sport-relevant.
        lowered = text.lower()
        if any(kw in lowered for kw in _SPORT_KEYWORDS):
            facts += _store_facts(cur, fed, url, text)

        # Follow internal links until the pending queue grows past max_queue.
        for link in _internal_links(url, html):
            if link in visited or link in queued:
                continue
            queue.append(link)
            queued.add(link)
            if len(queue) > max_queue:
                break

    return docs, facts


def harvest(max_pages=80, max_queue=100, delay=2):
    """Crawl every federation in ``ROOTS`` and persist documents and facts.

    Args:
        max_pages: Maximum number of URLs visited per federation.
        max_queue: Link collection on a page stops once the pending queue
            holds more than this many URLs.
        delay: Seconds to sleep before each fetch.

    Raises:
        psycopg2.Error: If the database connection cannot be established.
            Errors from individual inserts are logged and skipped.
    """
    conn = psycopg2.connect(DSN)
    # Autocommit keeps a failed INSERT from poisoning later statements.
    conn.autocommit = True
    total_docs = total_facts = 0
    try:
        with conn.cursor() as cur:
            for fed, roots in ROOTS.items():
                log.info("=== %s deep ===", fed)
                docs, facts = _crawl_federation(
                    cur, fed, roots, max_pages, max_queue, delay)
                total_docs += docs
                total_facts += facts
        log.info("TOTAL: docs=%s facts=%s", total_docs, total_facts)
    finally:
        conn.close()


if __name__ == "__main__":
    harvest()