148 lines
5.5 KiB
Python
148 lines
5.5 KiB
Python
#!/usr/bin/env python3
|
|
# ═══════════════════════════════════════════════════════════════════
|
|
# File:     wiki_pgz_sport.py | v1.0.0 | 05.05.2026
# Location: /opt/pgz-sport/scrapers/wiki_pgz_sport.py
# Purpose:  Wikipedia HR/EN scrape — PGŽ sports clubs and athletes
#   - Iterate over all known PGŽ clubs
#   - Wiki API → page extract
#   - Plus historical match results from Wikipedia season tables
|
|
# ═══════════════════════════════════════════════════════════════════
|
|
"""Wikipedia PGŽ sport corpus."""
|
|
import hashlib
import json
import logging
import os
import re
import sys
import time
import urllib.parse
import urllib.request

import psycopg2
from psycopg2.extras import execute_batch, execute_values
|
|
|
|
# Every log record carries a fixed "[wiki_sport]" tag after the timestamp.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [wiki_sport] %(message)s",
)
log = logging.getLogger("wiki_sport")

# Connection string handed to psycopg2.connect() in main(). The password is
# read from the DB_PASSWORD environment variable when the module is loaded,
# so a missing variable raises KeyError at import time.
DSN = f"host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password={os.environ['DB_PASSWORD']}"

# User-Agent header sent with every Wikipedia API request.
UA = "Ri.NET Civic Bot 1.0 (contact: dradulic@outlook.com)"

# MediaWiki API endpoints for the Croatian and English Wikipedias.
API_HR = "https://hr.wikipedia.org/w/api.php"
API_EN = "https://en.wikipedia.org/w/api.php"
|
|
|
|
|
|
def wiki_extract(api, title, sentences=None):
    """Return the plain-text extract of one Wikipedia page.

    Args:
        api: Base URL of a MediaWiki ``api.php`` endpoint.
        title: Page title to look up; redirects are followed by the API.
        sentences: Optional cap on the number of sentences requested
            (sent as ``exsentences``); omitted when falsy.

    Returns:
        The page's ``extract`` text ("" when the page has no such field),
        or None when the page is reported as not found (page id "-1"),
        the response contains no pages, or the request/parsing fails.
        Failures are logged as warnings and never raised.
    """
    query = {
        "action": "query",
        "prop": "extracts",
        "explaintext": "1",
        "redirects": "1",
        "format": "json",
        "titles": title,
    }
    if sentences:
        query["exsentences"] = str(sentences)

    request = urllib.request.Request(
        api + "?" + urllib.parse.urlencode(query),
        headers={"User-Agent": UA},
    )
    try:
        with urllib.request.urlopen(request, timeout=15) as response:
            payload = json.loads(response.read())
        page_map = payload.get("query", {}).get("pages", {})
        # Only a single title is requested, so only the first entry matters.
        first_entry = next(iter(page_map.items()), None)
        if first_entry is not None:
            page_id, page = first_entry
            if page_id == "-1":  # not found
                return None
            return page.get("extract", "")
    except Exception as e:
        log.warning(f"wiki err {title}: {e}")
    return None
|
|
|
|
|
|
def wiki_search(api, query, limit=5):
    """Search Wikipedia and return the titles of the matching pages.

    Args:
        api: Base URL of a MediaWiki ``api.php`` endpoint.
        query: Free-text search string (sent as ``srsearch``).
        limit: Maximum number of results to request (sent as ``srlimit``).

    Returns:
        A list of page titles in the order the API returned them. An empty
        list is returned when there are no hits or when the request or
        response parsing fails; failures are logged as warnings so that a
        network or API problem is distinguishable from "no results".
    """
    params = {
        "action": "query",
        "list": "search",
        "srsearch": query,
        "format": "json",
        "srlimit": str(limit),
    }
    url = api + "?" + urllib.parse.urlencode(params)
    req = urllib.request.Request(url, headers={"User-Agent": UA})
    try:
        with urllib.request.urlopen(req, timeout=10) as resp:
            data = json.loads(resp.read())
        return [hit["title"] for hit in data.get("query", {}).get("search", [])]
    except Exception as e:
        # Best-effort lookup: callers treat [] as "nothing found", but the
        # failure is still recorded, as wiki_extract does for its errors.
        log.warning("wiki search err %s: %s", query, e)
        return []
|
|
|
|
|
|
def get_pgz_clubs(conn):
    """Fetch the active PGŽ clubs from the database.

    Args:
        conn: Open database connection whose cursors support the context
            manager protocol (psycopg2 cursors do).

    Returns:
        A list of ``(naziv, skraceni_naziv)`` tuples ordered by ``naziv``.
        The short name is '' when the column is NULL. Rows whose ``aktivan``
        flag is NULL are treated as active.
    """
    query = """
        SELECT naziv, COALESCE(skraceni_naziv, '')
        FROM pgz_sport.klubovi
        WHERE COALESCE(aktivan, true) = true
        ORDER BY naziv
    """
    # The context manager closes the cursor even if the query raises.
    with conn.cursor() as cur:
        cur.execute(query)
        return [(row[0], row[1]) for row in cur.fetchall()]
|
|
|
|
|
|
def chunk(text, max_len=700):
    """Split text into pieces of at most ``max_len`` characters.

    A piece is cut at the last sentence-like separator (". ", "! ", "? " or
    a newline, tried in that order) found in the second half of the current
    window; when none is found the piece is cut hard at ``max_len``.

    Args:
        text: The text to split.
        max_len: Maximum piece length in characters; must be at least 1.

    Returns:
        A list of pieces. Text that already fits in ``max_len`` is returned
        unchanged as a single piece ([] for empty text). For longer text
        every piece is whitespace-stripped and pieces of 80 characters or
        fewer are dropped.

    Raises:
        ValueError: If ``max_len`` is smaller than 1 and ``text`` is not
            empty (the split could never make progress).
    """
    if not text:
        return []
    if max_len < 1:
        # A non-positive window never advances `start`; fail fast instead
        # of looping forever.
        raise ValueError(f"max_len must be >= 1, got {max_len}")
    if len(text) <= max_len:
        return [text]

    separators = (". ", "! ", "? ", "\n")
    total = len(text)
    pieces = []
    start = 0
    while start < total:
        end = min(start + max_len, total)
        if end < total:
            # Only accept a break in the second half of the window so that
            # pieces do not become needlessly short.
            threshold = start + max_len // 2
            for sep in separators:
                pos = text.rfind(sep, start, end)
                if pos > threshold:
                    end = pos + len(sep)
                    break
        pieces.append(text[start:end].strip())
        start = end
    return [piece for piece in pieces if len(piece) > 80]
|
|
|
|
|
|
def upsert(conn, facts):
    """Insert facts into ``dabi.knowledge``, skipping ones already stored.

    Each fact is keyed by the MD5 hex digest of its text (``data_hash``);
    rows whose hash already exists are left untouched.

    Args:
        conn: Open psycopg2 connection.
        facts: Iterable of dicts with a required ``"fact"`` string and
            optional ``"confidence"`` (default 0.84), ``"page"`` (default "")
            and ``"lang"`` (default "hr").

    Returns:
        The number of rows actually inserted. 0 when ``facts`` is empty or
        when the insert fails (the error is logged, not raised).

    Raises:
        KeyError: If a fact dict has no ``"fact"`` key.
    """
    if not facts:
        return 0

    rows = [
        (
            f["fact"],
            "wikipedia_pgz_sport",
            "pgz_sport_wiki",
            f.get("confidence", 0.84),
            hashlib.md5(f["fact"].encode()).hexdigest(),
            json.dumps({"page": f.get("page", ""), "lang": f.get("lang", "hr")}),
        )
        for f in facts
    ]
    # cursor.rowcount after execute_batch only reflects the last statement
    # sent, so it cannot be used as an insert count. RETURNING yields one
    # row per record that was really inserted (conflicts return nothing).
    sql = """INSERT INTO dabi.knowledge (fact, source, category, confidence, data_hash, source_refs)
             VALUES %s
             ON CONFLICT (data_hash) DO NOTHING
             RETURNING 1"""
    with conn.cursor() as cur:  # closed on both the success and error path
        try:
            inserted = execute_values(
                cur,
                sql,
                rows,
                template="(%s, %s, %s, %s, %s, %s::jsonb)",
                page_size=50,
                fetch=True,
            )
        except Exception as e:
            log.error(f"upsert: {e}")
            return 0
    return len(inserted)
|
|
|
|
|
|
def _find_club_text(naziv):
    """Return the HR Wikipedia extract for a club name, or a falsy value.

    The exact title is tried first. If that yields nothing, the top three
    search hits are tried in order, but only those whose title contains the
    last word of the club name or one of "Rijeka", "Opatija", "Krk"
    (case-insensitive).
    """
    text = wiki_extract(API_HR, naziv)
    if text:
        return text

    words = naziv.split()
    # Guard against blank names: there may be no "last word" to match on.
    keywords = [w.lower() for w in words[-1:] + ["Rijeka", "Opatija", "Krk"]]
    for candidate in wiki_search(API_HR, naziv, limit=3):
        title = candidate.lower()
        if any(keyword in title for keyword in keywords):
            text = wiki_extract(API_HR, candidate)
            if text:
                return text
    return text


def main():
    """Scrape Croatian Wikipedia for every active PGŽ club and store the text.

    For each of the first 200 clubs returned by get_pgz_clubs(), looks up a
    Wikipedia extract, splits extracts longer than 200 characters into
    chunks of up to 700 characters and upserts them as facts. Waits 0.5 s
    between clubs. Prints a JSON summary ``{"pages": ..., "facts": ...}``
    on stdout when done. The database connection is always closed, even if
    the run aborts with an exception.
    """
    conn = psycopg2.connect(DSN)
    try:
        conn.autocommit = True
        clubs = get_pgz_clubs(conn)
        log.info(f"PGŽ active clubs: {len(clubs)}")

        total_facts = 0
        found_pages = 0

        for naziv, _kraci in clubs[:200]:  # limit first run
            if not naziv or not naziv.strip():
                continue  # nothing to look up for a blank club name

            text = _find_club_text(naziv)
            if text and len(text) > 200:
                found_pages += 1
                facts = [
                    {"fact": piece, "page": naziv, "lang": "hr", "confidence": 0.85}
                    for piece in chunk(text, 700)
                ]
                total_facts += upsert(conn, facts)
                # Report only when the counter has just reached a multiple
                # of 20; otherwise the same line repeats for every following
                # club that has no page.
                if found_pages % 20 == 0:
                    log.info(f"  progress: pages {found_pages}, facts {total_facts}")

            time.sleep(0.5)

        log.info(f"=== DONE: pages={found_pages} facts={total_facts} ===")
        print(json.dumps({"pages": found_pages, "facts": total_facts}))
    finally:
        conn.close()


if __name__ == "__main__":
    main()
|