172 lines
6.1 KiB
Python
172 lines
6.1 KiB
Python
#!/usr/bin/env python3
|
|
# ═══════════════════════════════════════════════════════════════════
|
|
# wiki_hr_scraper.py | v1.0.0 | 04.05.2026
|
|
# Purpose: Croatian Wikipedia — extract relevant pages for the HR (Croatian) knowledge base
|
|
# Strategy: list the pages of HR-relevant categories via the API, then fetch each page's plain text
|
|
# ═══════════════════════════════════════════════════════════════════
|
|
"""Hrvatska Wikipedia scraper (preko API)."""
|
|
import os, time, hashlib, logging, re, json
|
|
import urllib.request, urllib.parse
|
|
import psycopg2
|
|
import sys
|
|
|
|
logging.basicConfig(level=logging.INFO, format='%(asctime)s [wiki_hr] %(message)s')
log = logging.getLogger("wiki_hr")


def _conninfo_quote(value):
    """Return *value* single-quoted for a libpq keyword/value connection string.

    Inside a quoted value, backslashes and single quotes must be escaped with
    a backslash; quoting also lets the value contain spaces or be empty.
    """
    escaped = value.replace("\\", "\\\\").replace("'", "\\'")
    return f"'{escaped}'"


# DB_PASSWORD is read at import time; a missing variable raises KeyError here,
# before any network or database work starts.
# The password is quoted so that spaces, quotes or backslashes in it cannot
# break (or silently alter) the connection string.
DSN = (
    "host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet "
    f"password={_conninfo_quote(os.environ['DB_PASSWORD'])}"
)

# User-Agent header sent with every Wikipedia API request.
UA = "Ri.NET Bot 1.0 (contact: dradulic@outlook.com)"

# Endpoint of the Croatian Wikipedia Action API.
API = "https://hr.wikipedia.org/w/api.php"
|
|
|
|
# Categories to harvest — a broad Croatian knowledge base.
# Names are written without the "Kategorija:" prefix; order is the crawl order.
CATEGORIES = [
    # Geography
    "Gradovi_u_Hrvatskoj", "Hrvatski_otoci",
    "Planine_u_Hrvatskoj", "Rijeke_u_Hrvatskoj",
    # Primorje-Gorski Kotar County
    "Primorsko-goranska_županija", "Naselja_u_Primorsko-goranskoj_županiji",
    # People
    "Hrvatski_političari", "Hrvatski_športaši", "Hrvatski_glazbenici",
    "Hrvatski_književnici", "Hrvatski_glumci",
    # History, culture, economy
    "Hrvatska_povijest", "Hrvatska_kuhinja", "Hrvatska_kultura",
    "Domovinski_rat", "Gospodarstvo_Hrvatske",
    # Sports clubs
    "Hrvatski_nogometni_klubovi", "Hrvatski_košarkaški_klubovi",
    "Hrvatski_rukometni_klubovi", "Hrvatski_odbojkaški_klubovi",
    # Politics
    "Hrvatske_političke_stranke",
    # Rijeka and its surroundings
    "Rijeka", "Krk", "Cres", "Lošinj", "Rab", "Pag", "Učka", "HNK_Rijeka",
]
|
|
|
|
def api_get(params):
    """Send a GET request to the Wikipedia API and return the decoded JSON.

    ``params`` is copied before ``format=json`` and ``utf8=1`` are added, so
    the caller's mapping is not modified. Any error raised while requesting,
    reading or decoding the response is logged as a warning and reported to
    the caller as an empty dict.
    """
    query = dict(params)
    query.update(format='json', utf8='1')
    target = f"{API}?{urllib.parse.urlencode(query)}"

    try:
        request = urllib.request.Request(target, headers={"User-Agent": UA})
        with urllib.request.urlopen(request, timeout=20) as response:
            payload = response.read()
            return json.loads(payload.decode('utf-8'))
    except Exception as exc:
        # Best effort: callers treat an empty dict as "no data".
        log.warning(f"API fail: {exc}")
        return {}
|
|
|
|
def category_members(cat, limit=500):
    """Return the titles of up to ``limit`` pages in a category.

    Args:
        cat: Category name without the ``Kategorija:`` prefix.
        limit: Maximum number of titles to return. A value of zero or less
            returns an empty list without contacting the API.

    Returns:
        A list of page titles in the order the API returned them. The list is
        shorter than ``limit`` when the category has fewer pages or when a
        request fails part-way through (``api_get`` returns an empty dict).
    """
    if limit <= 0:
        return []

    pages = []
    cont = ''
    while True:
        remaining = limit - len(pages)
        params = {
            'action': 'query',
            'list': 'categorymembers',
            'cmtitle': f'Kategorija:{cat}',
            # Ask only for what is still needed, capped at 500 per request.
            'cmlimit': str(min(500, remaining)),
            'cmtype': 'page',
        }
        if cont:
            params['cmcontinue'] = cont

        d = api_get(params)
        if not d.get('query'):
            break

        members = d['query'].get('categorymembers', [])
        pages.extend(m['title'] for m in members[:remaining])
        if len(pages) >= limit:
            break

        cont = d.get('continue', {}).get('cmcontinue')
        if not cont:
            break
        # Pause between continuation requests to stay polite to the API.
        time.sleep(0.5)
    return pages
|
|
|
|
def fetch_page_extract(title):
    """Fetch the plain-text extract and canonical URL of one page.

    Args:
        title: Page title as returned by the API.

    Returns:
        A ``(extract, url)`` tuple. Either element is an empty string when
        the API response lacks that field. ``(None, None)`` is returned when
        the request fails or the page does not exist.
    """
    params = {
        'action': 'query',
        'prop': 'extracts|info',
        # 'exintro' is deliberately NOT sent: MediaWiki boolean parameters are
        # true whenever they are present (even as '0'), which would limit the
        # extract to the lead section instead of the whole article.
        'explaintext': '1',
        'inprop': 'url',
        'titles': title,
        'exsectionformat': 'plain',
        'exlimit': '1',
    }
    d = api_get(params)
    if not d.get('query'):
        return None, None

    for pid, page in d['query'].get('pages', {}).items():
        # Non-existent titles come back under id "-1" / flagged as missing.
        if pid == '-1' or 'missing' in page:
            continue
        return page.get('extract', ''), page.get('fullurl', '')
    return None, None
|
|
|
|
def _like_prefix(title):
    """Build a LIKE pattern matching facts stored for exactly ``title``.

    Facts are stored as ``"<title> — <text>"``, so the separator is part of
    the pattern; without it "Rab" would also match facts of "Rabac".
    ``%``, ``_`` and the escape character itself are escaped so they are
    matched literally (backslash is PostgreSQL's default LIKE escape).
    """
    escaped = (
        title.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")
    )
    return f"{escaped} — %"


def _split_extract(title, extract):
    """Split an extract into ``(fact_text, confidence)`` chunks.

    The first 1500 characters get confidence 0.92; up to three further
    1500-character chunks (text up to character 6000) get 0.85. Every chunk
    is prefixed with ``"<title> — "``.
    """
    chunks = [(f"{title} — {extract[:1500]}", 0.92)]
    chunks.extend(
        (f"{title} — {extract[start:start + 1500]}", 0.85)
        for start in range(1500, min(len(extract), 6000), 1500)
    )
    return chunks


def _store_page(cur, cat, title):
    """Fetch one page and insert its chunks into ``dabi.knowledge``.

    Returns ``None`` when the page is skipped (facts already stored, fetch
    failed, or extract shorter than 200 characters); otherwise a tuple
    ``(inserted, failed)`` with the number of rows inserted and the number
    of inserts that raised a database error.
    """
    # Skip pages we already have.
    cur.execute(
        "SELECT 1 FROM dabi.knowledge WHERE source = 'wiki_hr' AND fact LIKE %s LIMIT 1",
        (_like_prefix(title),),
    )
    if cur.fetchone():
        return None

    extract, url = fetch_page_extract(title)
    time.sleep(0.5)  # throttle: one page request per half second
    if not extract or len(extract) < 200:
        return None

    refs = json.dumps([{"url": url, "title": title, "wikipedia": "hr"}])
    inserted = failed = 0
    for chunk_text, conf in _split_extract(title, extract):
        # NOTE(review): the hash covers only the first 80 characters of the
        # chunk, which start with the title; for titles of ~77+ characters
        # all chunks of a page hash alike and only the first is stored.
        # Left unchanged so hashes stay compatible with rows already stored.
        fh = hashlib.sha256(f"wiki:{title}:{chunk_text[:80]}".encode()).hexdigest()[:32]
        try:
            cur.execute("""
                INSERT INTO dabi.knowledge
                (fact, category, source, source_refs, confidence, data_hash, created_at)
                VALUES (%s, %s, 'wiki_hr', %s::jsonb, %s, %s, now())
                ON CONFLICT (data_hash) DO NOTHING
            """, (chunk_text[:2000], f'wiki_{cat[:30]}', refs, conf, fh))
            inserted += cur.rowcount
        except psycopg2.Error as e:
            # autocommit is on, so one failed insert does not affect the next.
            failed += 1
            log.warning(f"insert: {e}")
    return inserted, failed


def harvest():
    """Harvest all pages of ``CATEGORIES`` into ``dabi.knowledge``.

    For each category up to 200 page titles are listed; every page not yet
    stored is fetched, split into chunks and inserted (one row per chunk,
    duplicates ignored via ``ON CONFLICT (data_hash)``). A failure on one
    page is logged and does not stop the run. The cursor and connection are
    closed even when the run is aborted by an exception.

    Raises:
        psycopg2.Error: if the database connection cannot be established.
    """
    conn = psycopg2.connect(DSN)
    conn.autocommit = True
    cur = conn.cursor()

    total_facts = 0
    total_pages = 0
    insert_errors = 0

    try:
        for cat in CATEGORIES:
            log.info(f"=== Kategorija: {cat} ===")
            pages = category_members(cat, limit=200)
            log.info(f"  pages: {len(pages)}")

            for title in pages:
                try:
                    result = _store_page(cur, cat, title)
                except Exception as e:
                    # Best effort: one bad page must not abort the whole run.
                    log.warning(f"  Page fail '{title}': {e}")
                    continue
                if result is None:
                    continue

                inserted, failed = result
                total_facts += inserted
                insert_errors += failed
                total_pages += 1
                if total_pages % 20 == 0:
                    log.info(f"  Progress: {total_pages} pages, {total_facts} facts")

            log.info(f"  Done {cat}: total facts={total_facts}")

        if insert_errors:
            log.warning(f"insert errors: {insert_errors}")
        log.info(f"═══ FINAL: {total_pages} pages, {total_facts} facts ═══")
    finally:
        cur.close()
        conn.close()
|
|
|
|
# Script entry point: run the harvest only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    harvest()
|