Files
pgz-sport/scrapers/wiki_hr_scraper.py
T

177 lines
6.2 KiB
Python

#!/usr/bin/env python3
# ═══════════════════════════════════════════════════════════════════
# wiki_hr_scraper.py | v1.0.0 | 04.05.2026
# Purpose: Croatian Wikipedia — extract relevant pages for the HR knowledge base
# Strategy: query the API by HR-relevant categories, then fetch the top results
# ═══════════════════════════════════════════════════════════════════
"""Hrvatska Wikipedia scraper (preko API)."""
import os, time, hashlib, logging, re, json
import urllib.request, urllib.parse
import psycopg2
import sys
# Root logging setup for the scraper: timestamped lines tagged with [wiki_hr].
logging.basicConfig(level=logging.INFO, format='%(asctime)s [wiki_hr] %(message)s')
log = logging.getLogger("wiki_hr")

# PostgreSQL connection string handed to psycopg2.connect().
# SECURITY: credentials should not live in source control.  When the
# WIKI_HR_DSN environment variable is set it takes precedence; the literal
# below is kept only as a backward-compatible fallback.
# TODO(review): rotate this password and drop the fallback once every
# deployment supplies WIKI_HR_DSN.
DSN = os.environ.get(
    "WIKI_HR_DSN",
    "host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password=R1net2026!SecureDB#v7",
)

# User-Agent header sent with every API request.
UA = "Ri.NET Bot 1.0 (contact: dradulic@outlook.com)"

# API endpoint of hr.wikipedia.org that all requests are sent to.
API = "https://hr.wikipedia.org/w/api.php"
# Categories to harvest — a broad base of Croatia-related knowledge.
# Names are given without the "Kategorija:" namespace prefix and are
# processed in the order listed.
CATEGORIES = [
    # Geography
    "Hrvatski_gradovi",
    "Hrvatske_općine",
    "Hrvatski_otoci",
    "Hrvatske_planine",
    "Hrvatske_rijeke",
    # Primorje-Gorski Kotar County
    "Primorsko-goranska_županija",
    "Naselja_u_Primorsko-goranskoj_županiji",
    # People
    "Hrvatski_političari",
    "Hrvatski_sportaši",
    "Hrvatski_glazbenici",
    "Hrvatski_pisci",
    "Hrvatski_glumci",
    # History, culture, science, economy
    "Hrvatska_povijest",
    "Hrvatska_arhitektura",
    "Hrvatska_kuhinja",
    "Hrvatska_kultura",
    "Hrvatska_znanost",
    "Domovinski_rat",
    "Hrvatska_ekonomija",
    # Clubs and political parties
    "Hrvatski_klubovi",
    "Hrvatski_nogometni_klubovi",
    "Hrvatski_košarkaški_klubovi",
    "Hrvatske_političke_stranke",
    # Heads of state and government
    "Predsjednici_Hrvatske",
    "Premijeri_Hrvatske",
    # Rijeka and the Kvarner area
    "Rijeka",
    "Kvarner",
    "Krk",
    "Cres",
    "Lošinj",
    "Rab",
    "Pag",
    "Učka",
    "Risnjak",
]
def api_get(params):
    """Send a GET request to the Wikipedia API and return the decoded JSON.

    ``format=json`` and ``utf8=1`` are set on a copy of ``params``, so the
    caller's mapping is left untouched.  Any exception raised while sending
    the request, reading the body or parsing the JSON is logged as a warning
    and an empty dict is returned instead of propagating.
    """
    query = dict(params)
    query.update({'format': 'json', 'utf8': '1'})
    url = f"{API}?{urllib.parse.urlencode(query)}"
    request_headers = {"User-Agent": UA}
    try:
        request = urllib.request.Request(url, headers=request_headers)
        with urllib.request.urlopen(request, timeout=20) as response:
            body = response.read()
            return json.loads(body.decode('utf-8'))
    except Exception as exc:
        log.warning(f"API fail: {exc}")
        return {}
def category_members(cat, limit=500):
    """Return up to ``limit`` page titles that belong to a category.

    Args:
        cat: Category name without the ``Kategorija:`` namespace prefix.
        limit: Maximum number of titles to return.  Zero or a negative value
            yields an empty list without contacting the API.

    Returns:
        A list of page titles in the order the API returned them.  It may be
        shorter than ``limit`` when the category has fewer pages, or when a
        request fails part-way: ``api_get`` then returns an empty dict and
        pagination stops with whatever was collected so far.
    """
    if limit <= 0:
        return []

    pages = []
    cont = ''
    while True:
        params = {
            'action': 'query',
            'list': 'categorymembers',
            'cmtitle': f'Kategorija:{cat}',
            # Request only what is still needed (at most 500 per call) rather
            # than always pulling 500 members and discarding the surplus.
            'cmlimit': str(min(limit - len(pages), 500)),
            'cmtype': 'page',
        }
        if cont:
            params['cmcontinue'] = cont
        data = api_get(params)
        if not data.get('query'):
            break
        for member in data['query'].get('categorymembers', []):
            pages.append(member['title'])
            if len(pages) >= limit:
                return pages
        cont = data.get('continue', {}).get('cmcontinue')
        if not cont:
            break
        # Pause between continuation requests.
        time.sleep(0.5)
    return pages
def fetch_page_extract(title):
    """Fetch the plain-text extract and the URL of a single page.

    Args:
        title: Title of the page to look up.

    Returns:
        ``(extract, fullurl)`` for the first usable page entry in the
        response (a missing key is returned as ``''``), or ``(None, None)``
        when the response has no ``query`` section or only contains an entry
        with page id ``'-1'``.
    """
    params = {
        'action': 'query',
        'prop': 'extracts|info',
        # 'exintro' is intentionally NOT sent.  The MediaWiki API treats a
        # boolean parameter as true whenever it is present, whatever its
        # value, so the previous 'exintro': '0' limited the extract to the
        # intro section instead of returning the whole page text.
        'explaintext': '1',
        'inprop': 'url',
        'titles': title,
        'exsectionformat': 'plain',
        'exlimit': '1',
    }
    data = api_get(params)
    if not data.get('query'):
        return None, None
    for page_id, page in data['query'].get('pages', {}).items():
        # Skip the '-1' placeholder entry (title the API could not resolve).
        if page_id == '-1':
            continue
        return page.get('extract', ''), page.get('fullurl', '')
    return None, None
def _like_prefix_pattern(title, length=50):
    """Return a LIKE pattern matching text that starts with ``title[:length]``."""
    prefix = title[:length]
    # Escape the backslash first so the escapes added for % and _ survive.
    for special in ('\\', '%', '_'):
        prefix = prefix.replace(special, '\\' + special)
    return prefix + '%'


def _build_chunks(title, extract):
    """Split an extract into ``(fact_text, confidence)`` pairs of <=1500 chars."""
    # NOTE(review): the title is glued directly onto the text with no
    # separator, exactly as before; a separator may have been intended, but
    # adding one would change stored facts and their hashes — confirm first.
    chunks = [(title + extract[:1500], 0.92)]  # leading chunk gets top confidence
    chunks.extend(
        (title + extract[start:start + 1500], 0.85)
        for start in range(1500, min(len(extract), 6000), 1500)
    )
    return chunks


def _insert_fact(cur, fact, category, refs, confidence, data_hash):
    """Insert one fact row; return the row count (0 when the hash already exists)."""
    cur.execute(
        """
        INSERT INTO dabi.knowledge
            (fact, category, source, source_refs, confidence, data_hash, created_at)
        VALUES (%s, %s, 'wiki_hr', %s::jsonb, %s, %s, now())
        ON CONFLICT (data_hash) DO NOTHING
        """,
        (fact, category, refs, confidence, data_hash),
    )
    return cur.rowcount


def harvest():
    """Harvest every category in ``CATEGORIES`` into ``dabi.knowledge``.

    For each category up to 200 page titles are listed.  A page is skipped
    when a ``wiki_hr`` fact starting with the first 50 characters of its
    title already exists, or when its extract is empty or shorter than 200
    characters.  Otherwise the first 6000 characters of the extract are
    split into 1500-character chunks and each chunk is inserted as one fact
    (duplicates by ``data_hash`` are ignored).

    Failures are best-effort: an error on one page or one insert is logged
    and processing continues.  The cursor and connection are always closed,
    even if an unexpected exception escapes.
    """
    conn = psycopg2.connect(DSN)
    cur = None
    total_facts = 0
    total_pages = 0
    insert_failures = 0
    try:
        # Autocommit keeps one failed INSERT from aborting later statements.
        conn.autocommit = True
        cur = conn.cursor()
        for cat in CATEGORIES:
            log.info(f"=== Kategorija: {cat} ===")
            pages = category_members(cat, limit=200)
            log.info(f" pages: {len(pages)}")
            for title in pages:
                try:
                    # Skip pages we already have.  LIKE wildcards in the title
                    # are escaped so '%' / '_' only match themselves.
                    # NOTE(review): this is still a prefix match, so a short
                    # title (e.g. 'Rab') also matches facts of other pages
                    # whose text begins with the same characters.
                    cur.execute(
                        "SELECT 1 FROM dabi.knowledge WHERE source = 'wiki_hr' AND fact LIKE %s LIMIT 1",
                        (_like_prefix_pattern(title),),
                    )
                    if cur.fetchone():
                        continue
                    extract, url = fetch_page_extract(title)
                    time.sleep(0.5)
                    if not extract or len(extract) < 200:
                        continue
                    # Identical for every chunk of the page, so build it once.
                    refs = json.dumps([{"url": url, "title": title, "wikipedia": "hr"}])
                    for chunk_text, conf in _build_chunks(title, extract):
                        fh = hashlib.sha256(
                            f"wiki:{title}:{chunk_text[:80]}".encode()
                        ).hexdigest()[:32]
                        try:
                            total_facts += _insert_fact(
                                cur, chunk_text[:2000], f'wiki_{cat[:30]}', refs, conf, fh
                            )
                        except Exception as e:
                            # Count every failure; log only the first few to
                            # avoid flooding the log.
                            insert_failures += 1
                            if insert_failures <= 5:
                                log.warning(f"insert: {e}")
                    total_pages += 1
                    if total_pages % 20 == 0:
                        log.info(f" Progress: {total_pages} pages, {total_facts} facts")
                except Exception as e:
                    log.warning(f" Page fail '{title}': {e}")
                    continue
            log.info(f" Done {cat}: total facts={total_facts}")
        if insert_failures:
            log.warning(f"insert failures: {insert_failures}")
        log.info(f"═══ FINAL: {total_pages} pages, {total_facts} facts ═══")
    finally:
        if cur is not None:
            cur.close()
        conn.close()
# Run the harvest only when this file is executed as a script.
if __name__ == "__main__":
    harvest()