#!/usr/bin/env python3
from dotenv import load_dotenv
load_dotenv('/opt/rinet-gpu/.env.master')  # auto-added by patch_scrapers_with_dotenv.sh
# ═══════════════════════════════════════════════════════════════════
# wiki_hr_scraper.py | v1.0.0 | 04.05.2026
# Purpose: Croatian Wikipedia — extract relevant pages for the HR knowledge base
# Strategy: API search over HR-relevant categories + fetch top results
# ═══════════════════════════════════════════════════════════════════
"""Croatian Wikipedia scraper (via the API)."""
import hashlib
import json
import logging
import os
import re
import sys
import time
import urllib.parse
import urllib.request

import psycopg2

logging.basicConfig(level=logging.INFO, format='%(asctime)s [wiki_hr] %(message)s')
log = logging.getLogger("wiki_hr")

# DB_PASSWORD must be present in the environment (loaded from the .env file
# above); a missing variable raises KeyError at import time.
DSN = f"host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password={os.environ['DB_PASSWORD']}"
UA = "Ri.NET Bot 1.0 (contact: dradulic@outlook.com)"
API = "https://hr.wikipedia.org/w/api.php"

# Categories — a broad Croatian knowledge base
CATEGORIES = [
    "Gradovi_u_Hrvatskoj", "Hrvatski_otoci", "Planine_u_Hrvatskoj", "Rijeke_u_Hrvatskoj",
    "Primorsko-goranska_županija", "Naselja_u_Primorsko-goranskoj_županiji",
    "Hrvatski_političari", "Hrvatski_športaši", "Hrvatski_glazbenici",
    "Hrvatski_književnici", "Hrvatski_glumci",
    "Hrvatska_povijest", "Hrvatska_kuhinja", "Hrvatska_kultura",
    "Domovinski_rat", "Gospodarstvo_Hrvatske",
    "Hrvatski_nogometni_klubovi", "Hrvatski_košarkaški_klubovi",
    "Hrvatski_rukometni_klubovi", "Hrvatski_odbojkaški_klubovi",
    "Hrvatske_političke_stranke",
    "Rijeka", "Krk", "Cres", "Lošinj", "Rab", "Pag", "Učka", "HNK_Rijeka"
]

# Separator placed between the page title and the chunk text in a stored fact.
_TITLE_SEP = " — "
# Pages whose extract is shorter than this are skipped.
_MIN_EXTRACT_CHARS = 200
# Each fact holds at most this many characters of the extract.
_CHUNK_CHARS = 1500
# Only this many leading characters of an extract are chunked.
_MAX_EXTRACT_CHARS = 6000
# Stored fact text (title + separator + chunk) is truncated to this length.
_MAX_FACT_CHARS = 2000
# Confidence for the first chunk and for every following chunk.
_FIRST_CHUNK_CONF = 0.92
_LATER_CHUNK_CONF = 0.85
# Upper bound the code puts on `cmlimit` for a single categorymembers request.
_API_MAX_BATCH = 500
# Only the first few insert errors are logged individually; the rest are counted.
_MAX_LOGGED_INSERT_ERRORS = 5
# Escape character used in the LIKE pattern of the "already stored" check.
_LIKE_ESC = "!"


def api_get(params):
    """Perform a GET request against the Wikipedia API and return parsed JSON.

    Args:
        params: Mapping of query parameters. It is copied, and ``format=json``
            and ``utf8=1`` are set on the copy.

    Returns:
        The decoded JSON object as a dict, or an empty dict when the request
        or decoding fails, or when the payload is not a JSON object.
    """
    query = dict(params)
    query['format'] = 'json'
    query['utf8'] = '1'
    url = API + '?' + urllib.parse.urlencode(query)
    try:
        req = urllib.request.Request(url, headers={"User-Agent": UA})
        with urllib.request.urlopen(req, timeout=20) as resp:
            data = json.loads(resp.read().decode('utf-8'))
    except Exception as e:
        # Deliberately best-effort: any network or decode failure is logged
        # and reported to the caller as "no data".
        log.warning("API fail: %s", e)
        return {}
    # Callers use .get() on the result, so never hand back a non-dict payload.
    return data if isinstance(data, dict) else {}


def category_members(cat, limit=500):
    """Return titles of the pages that belong to a category.

    Follows the API's ``cmcontinue`` token until ``limit`` titles have been
    collected, the category is exhausted, or a request fails (in which case
    the titles gathered so far are returned).

    Args:
        cat: Category name without the ``Kategorija:`` prefix.
        limit: Stop once this many titles have been collected.

    Returns:
        List of page titles.
    """
    pages = []
    cont = ''
    # No point asking for more members per request than the caller wants.
    batch = str(max(1, min(limit, _API_MAX_BATCH)))
    while True:
        params = {
            'action': 'query',
            'list': 'categorymembers',
            'cmtitle': f'Kategorija:{cat}',
            'cmlimit': batch,
            'cmtype': 'page',
        }
        if cont:
            params['cmcontinue'] = cont
        d = api_get(params)
        if not d.get('query'):
            break
        for member in d['query'].get('categorymembers', []):
            pages.append(member['title'])
            if len(pages) >= limit:
                return pages
        cont = d.get('continue', {}).get('cmcontinue')
        if not cont:
            break
        time.sleep(0.5)  # pause between paginated requests
    return pages


def fetch_page_extract(title):
    """Fetch the plain-text extract and the full URL of a page.

    Args:
        title: Page title.

    Returns:
        ``(extract, fullurl)`` for the first existing page in the response,
        or ``(None, None)`` when the request fails or the page is missing.
    """
    params = {
        'action': 'query',
        'prop': 'extracts|info',
        # NOTE: 'exintro' is intentionally not sent. MediaWiki boolean
        # parameters are true whenever they are present, whatever the value,
        # so the former 'exintro': '0' limited the extract to the lead section.
        'explaintext': '1',
        'inprop': 'url',
        'titles': title,
        'exsectionformat': 'plain',
        'exlimit': '1',
    }
    d = api_get(params)
    if not d.get('query'):
        return None, None
    for pid, page in d['query'].get('pages', {}).items():
        # Missing pages are keyed by a negative id and carry a 'missing' flag.
        if str(pid).startswith('-') or 'missing' in page:
            continue
        return page.get('extract', ''), page.get('fullurl', '')
    return None, None


def _like_escape(text):
    """Escape LIKE metacharacters in *text* using ``_LIKE_ESC``."""
    return (text.replace(_LIKE_ESC, _LIKE_ESC * 2)
                .replace('%', _LIKE_ESC + '%')
                .replace('_', _LIKE_ESC + '_'))


def _page_already_stored(cur, title):
    """Return True when a 'wiki_hr' fact starting with ``title — `` exists."""
    # Match the exact stored prefix (title + separator) so that a title which
    # is a prefix of another title, or which contains % or _, cannot produce
    # a false "already have it".
    pattern = _like_escape(title + _TITLE_SEP) + '%'
    cur.execute(
        "SELECT 1 FROM dabi.knowledge "
        "WHERE source = 'wiki_hr' AND fact LIKE %s ESCAPE '!' LIMIT 1",
        (pattern,),
    )
    return cur.fetchone() is not None


def _split_into_chunks(title, extract):
    """Split an extract into ``(fact_text, confidence)`` pairs.

    The first chunk (the intro, the most important part) gets the higher
    confidence; the following chunks get the lower one. Only the first
    ``_MAX_EXTRACT_CHARS`` characters of the extract are used.
    """
    chunks = [(title + _TITLE_SEP + extract[:_CHUNK_CHARS], _FIRST_CHUNK_CONF)]
    stop = min(len(extract), _MAX_EXTRACT_CHARS)
    for start in range(_CHUNK_CHARS, stop, _CHUNK_CHARS):
        piece = extract[start:start + _CHUNK_CHARS]
        chunks.append((title + _TITLE_SEP + piece, _LATER_CHUNK_CONF))
    return chunks


def _fact_hash(title, chunk_text):
    """Return the 32-hex-character dedup hash stored in ``data_hash``."""
    # NOTE(review): only the first 80 characters of chunk_text (which begins
    # with the title) feed the hash, so for titles of roughly 77+ characters
    # every chunk of a page hashes the same and only the first is inserted.
    # Kept unchanged so hashes stay compatible with rows already stored.
    return hashlib.sha256(f"wiki:{title}:{chunk_text[:80]}".encode()).hexdigest()[:32]


def harvest():
    """Scrape every category in ``CATEGORIES`` into ``dabi.knowledge``.

    For each category up to 200 page titles are listed. Pages that already
    have a stored fact, or whose extract is shorter than
    ``_MIN_EXTRACT_CHARS``, are skipped. Every other page is split into chunks
    and each chunk is inserted as one fact; rows whose ``data_hash`` already
    exists are left untouched. A failure on one page or one insert is logged
    and does not stop the run. The cursor and connection are always closed.
    """
    conn = psycopg2.connect(DSN)
    conn.autocommit = True
    cur = conn.cursor()
    total_facts = 0
    total_pages = 0
    insert_errors = 0
    try:
        for cat in CATEGORIES:
            log.info("=== Kategorija: %s ===", cat)
            pages = category_members(cat, limit=200)
            log.info(" pages: %s", len(pages))
            for title in pages:
                try:
                    # Skip if we already have it
                    if _page_already_stored(cur, title):
                        continue
                    extract, url = fetch_page_extract(title)
                    time.sleep(0.5)  # pause after each page fetch
                    if not extract or len(extract) < _MIN_EXTRACT_CHARS:
                        continue
                    # The same source reference is attached to every chunk.
                    refs = json.dumps([{"url": url, "title": title, "wikipedia": "hr"}])
                    # Split into chunks (each chunk = one fact)
                    for chunk_text, conf in _split_into_chunks(title, extract):
                        fh = _fact_hash(title, chunk_text)
                        try:
                            cur.execute("""
                                INSERT INTO dabi.knowledge
                                    (fact, category, source, source_refs, confidence, data_hash, created_at)
                                VALUES (%s, %s, 'wiki_hr', %s::jsonb, %s, %s, now())
                                ON CONFLICT (data_hash) DO NOTHING
                            """, (chunk_text[:_MAX_FACT_CHARS], f'wiki_{cat[:30]}', refs, conf, fh))
                            # rowcount is added as-is; a skipped conflict adds 0.
                            total_facts += cur.rowcount
                        except Exception as e:
                            # Best-effort per chunk: count every failure but
                            # log only the first few to avoid flooding the log.
                            insert_errors += 1
                            if insert_errors <= _MAX_LOGGED_INSERT_ERRORS:
                                log.warning("insert: %s", e)
                    total_pages += 1
                    if total_pages % 20 == 0:
                        log.info(" Progress: %s pages, %s facts", total_pages, total_facts)
                except Exception as e:
                    log.warning(" Page fail '%s': %s", title, e)
                    continue
            log.info(" Done %s: total facts=%s", cat, total_facts)
        if insert_errors:
            log.warning("insert errors: %s (first %s logged)",
                        insert_errors, min(insert_errors, _MAX_LOGGED_INSERT_ERRORS))
        log.info("═══ FINAL: %s pages, %s facts ═══", total_pages, total_facts)
    finally:
        cur.close()
        conn.close()


if __name__ == "__main__":
    harvest()