Files
pgz-sport/scrapers/wikidata_enrich.py.pre_b_switch.1777897180
T

133 lines
4.8 KiB
Python
Executable File

#!/usr/bin/env python3
"""D.2b — Better URL handling for HR Wikipedia + Wikidata."""
import datetime
import json
import os
import re
import time
import urllib.parse

import psycopg2
import requests
# PostgreSQL connection settings passed to psycopg2.connect(**DB).
# Each value can be overridden with the standard libpq environment variable
# (PGHOST, PGPORT, PGDATABASE, PGUSER, PGPASSWORD); the fallbacks keep the
# script working unchanged when none of them is set.
# SECURITY: the fallback password is a credential embedded in source. Supply
# it through PGPASSWORD instead, then rotate it and remove the literal below.
DB = dict(
    host=os.environ.get("PGHOST") or "localhost",
    port=int(os.environ.get("PGPORT") or 5432),
    dbname=os.environ.get("PGDATABASE") or "rinet_v3",
    user=os.environ.get("PGUSER") or "rinet",
    password=os.environ.get("PGPASSWORD") or "R1net2026!SecureDB#v7",
)

# User-Agent header sent with every HTTP request made through the session.
UA = "RiNET-Civic/1.0 (https://rinet.one)"
# Timeout handed to each requests call.
TIMEOUT = 20
# Pause (passed to time.sleep) inserted between consecutive remote calls.
DELAY = 0.4

# One shared session so all requests carry the same headers.
s = requests.Session()
s.headers.update({"User-Agent": UA, "Accept-Language": "hr,en"})
def get_wikidata_id(title, lang):
    """Look up the Wikidata Q-id linked to a Wikipedia article.

    Queries the ``pageprops`` of *title* through the ``{lang}.wikipedia.org``
    API and returns the first ``wikibase_item`` value found.

    Args:
        title: Article title to look up.
        lang: Wikipedia language subdomain, e.g. ``"hr"``.

    Returns:
        The Q-id string, or ``None`` when the page is missing (page id
        ``"-1"``), has no linked item, or the request/response could not be
        processed.
    """
    try:
        r = s.get(
            f"https://{lang}.wikipedia.org/w/api.php",
            params={
                "action": "query",
                "format": "json",
                "prop": "pageprops",
                "ppprop": "wikibase_item",
                "titles": title,
            },
            timeout=TIMEOUT,
        )
        pages = r.json().get("query", {}).get("pages", {})
        for pid, page in pages.items():
            if pid == "-1":
                continue
            qid = page.get("pageprops", {}).get("wikibase_item")
            if qid:
                return qid
    except (requests.RequestException, ValueError, AttributeError):
        # Best-effort lookup: transport failures, non-JSON bodies and
        # unexpectedly shaped payloads all mean "no Q-id". Unlike a bare
        # ``except``, this lets KeyboardInterrupt/SystemExit propagate.
        pass
    return None
def get_wikidata_entity(qid):
    """Fetch the JSON record of a Wikidata entity.

    Args:
        qid: Wikidata entity id, e.g. ``"Q42"``.

    Returns:
        The value stored under *qid* in the response's ``entities`` mapping,
        or ``None`` when it is absent or the request/response could not be
        processed.
    """
    try:
        r = s.get(
            f"https://www.wikidata.org/wiki/Special:EntityData/{qid}.json",
            timeout=TIMEOUT,
        )
        return r.json().get("entities", {}).get(qid)
    except (requests.RequestException, ValueError, AttributeError):
        # Best-effort fetch: transport failures, non-JSON bodies and
        # unexpectedly shaped payloads yield None. Unlike a bare ``except``,
        # this lets KeyboardInterrupt/SystemExit propagate.
        return None
def get_label(qid, lang="hr"):
    """Return a human-readable label for a Wikidata entity.

    The entity is fetched with ``get_wikidata_entity`` and its labels are
    tried in the order *lang*, ``"en"``, ``"hr"``.

    Args:
        qid: Wikidata entity id.
        lang: Preferred label language code.

    Returns:
        The first non-empty label found, or ``None`` when the entity could
        not be fetched or has no label in any of the tried languages.
    """
    entity = get_wikidata_entity(qid)
    if not isinstance(entity, dict):
        return None
    labels = entity.get("labels")
    if not isinstance(labels, dict):
        return None
    for code in (lang, "en", "hr"):
        entry = labels.get(code)
        if isinstance(entry, dict) and entry.get("value"):
            return entry["value"]
    return None
def _birth_date_from_claim(claim):
    """Return the ISO ``YYYY-MM-DD`` date held by a P569 claim, or ``None``."""
    try:
        value = claim["mainsnak"]["datavalue"]["value"]
        stamp = value["time"]
        precision = value.get("precision", 11)
        # Only a leading "+" is accepted so a BCE year is never read as CE.
        m = re.match(r"^\+?(\d{4})-(\d{2})-(\d{2})", stamp)
    except (KeyError, TypeError, AttributeError):
        # Claims without a concrete value (or oddly shaped ones) are skipped.
        return None
    if not m:
        return None
    # Wikibase precision 11 means "day". Coarser values may carry placeholder
    # month/day digits that must not be stored as an exact birth date.
    if isinstance(precision, int) and precision < 11:
        return None
    year, month, day = (int(part) for part in m.groups())
    try:
        born = datetime.date(year, month, day)
    except ValueError:
        # Rejects month/day 00 and impossible dates such as Feb 30.
        return None
    if year <= 1900 or born > datetime.date.today():
        return None
    return born.isoformat()


def parse_birth(entity):
    """Extract birth date and birth place from a Wikidata entity record.

    Args:
        entity: Entity dict as returned by ``get_wikidata_entity``; may be
            ``None`` or empty.

    Returns:
        A dict with up to two keys:

        * ``"datum_rodenja"``: first usable P569 value as ``YYYY-MM-DD``
          (day precision, a real calendar date, year after 1900, not in the
          future);
        * ``"mjesto_rodenja"``: label of the first P19 value that resolves to
          a label, truncated to 100 characters.

        Keys are omitted when no usable value is found.
    """
    out = {}
    if not entity:
        return out
    claims = entity.get("claims") or {}

    for claim in claims.get("P569", []):
        iso_date = _birth_date_from_claim(claim)
        if iso_date:
            out["datum_rodenja"] = iso_date
            break

    for claim in claims.get("P19", []):
        try:
            place_qid = claim["mainsnak"]["datavalue"]["value"]["id"]
        except (KeyError, TypeError):
            continue
        label = get_label(place_qid, "hr")
        # Pause after every label lookup, successful or not.
        time.sleep(DELAY)
        if label:
            out["mjesto_rodenja"] = label[:100]
            break
    return out
# === MAIN ===
# Fills missing datum_rodenja / mjesto_rodenja in pgz_sport.clanovi for rows
# whose source_url points at a Wikipedia article, using Wikidata P569 / P19.

# Accepts http and https, desktop and mobile ("<lang>.m.") hosts, and stops
# the title at "?" or "#" so query strings and section anchors are not
# treated as part of the article title.
_WIKI_URL_RE = re.compile(
    r"https?://([a-z0-9-]+)(?:\.m)?\.wikipedia\.org/wiki/([^?#]+)",
    re.IGNORECASE,
)

conn = psycopg2.connect(**DB)
conn.autocommit = True  # every UPDATE is committed as soon as it executes
try:
    cr = conn.cursor()
    cr.execute("""
        SELECT id, ime, prezime, source_url, datum_rodenja, mjesto_rodenja
        FROM pgz_sport.clanovi
        WHERE source_url LIKE '%wikipedia.org/wiki/%'
        AND (datum_rodenja IS NULL OR mjesto_rodenja IS NULL)
    """)
    todo = cr.fetchall()
    print(f"Pending: {len(todo)}")

    success = 0
    for cid, ime, prezime, source_url, dob, mjesto in todo:
        m = _WIKI_URL_RE.match(source_url)
        if not m:
            continue
        lang = m.group(1).lower()
        title = urllib.parse.unquote(m.group(2)).replace("_", " ")

        qid = get_wikidata_id(title, lang)
        time.sleep(DELAY)
        if not qid:
            # Try alternate lang
            alt = "en" if lang == "hr" else "hr"
            qid = get_wikidata_id(title, alt)
            time.sleep(DELAY)
        if not qid:
            print(f"{ime} {prezime}: no Q-id")
            continue

        entity = get_wikidata_entity(qid)
        time.sleep(DELAY)
        parsed = parse_birth(entity)
        if not parsed:
            print(f"{ime} {prezime} ({qid}): no birth data")
            continue

        # Only columns that are currently empty are written; existing values
        # are never overwritten.
        sets, vals = [], []
        if parsed.get("datum_rodenja") and not dob:
            sets.append("datum_rodenja = %s")
            vals.append(parsed["datum_rodenja"])
        if parsed.get("mjesto_rodenja") and not mjesto:
            sets.append("mjesto_rodenja = %s")
            vals.append(parsed["mjesto_rodenja"])
        if sets:
            sets.append("source_synced_at = now()")
            vals.append(cid)
            # The SET clause is built from the fixed fragments above only;
            # all values are passed as query parameters.
            cr.execute(
                f"UPDATE pgz_sport.clanovi SET {', '.join(sets)} WHERE id = %s",
                tuple(vals),
            )
            success += 1
            # NOTE(review): the original indentation was lost; this progress
            # line is assumed to belong to the "row updated" branch — confirm.
            flags = []
            if parsed.get("datum_rodenja"):
                flags.append(f"DOB={parsed['datum_rodenja']}")
            if parsed.get("mjesto_rodenja"):
                flags.append(f"M={parsed['mjesto_rodenja']}")
            print(f"{ime} {prezime} ({qid}): {' '.join(flags)}")

    print(f"\n=== Round 2 Updated: {success}/{len(todo)} ===")

    # Re-stats
    cr.execute("""SELECT
        count(*) FILTER (WHERE datum_rodenja IS NOT NULL AND source_url LIKE '%wikipedia%') AS dob,
        count(*) FILTER (WHERE mjesto_rodenja IS NOT NULL AND source_url LIKE '%wikipedia%') AS mjesto,
        count(*) FILTER (WHERE source_url LIKE '%wikipedia%') AS total
        FROM pgz_sport.clanovi""")
    r = cr.fetchone()
    print(f"Final wiki: dob={r[0]} mjesto={r[1]} of total {r[2]}")
finally:
    # Release the connection even when a query or lookup fails part-way.
    conn.close()