133 lines
4.8 KiB
Python
Executable File
133 lines
4.8 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""D.2b — Better URL handling for HR Wikipedia + Wikidata."""
|
|
import datetime
import json
import os
import re
import time
import urllib.parse

import psycopg2
import requests
|
|
|
|
# PostgreSQL connection settings. Every value can be overridden with the
# standard libpq environment variables so that credentials do not have to be
# edited into (or read out of) this file.
# NOTE(review): the literal password is kept only as a backward-compatible
# fallback. It is stored in plain text in the source and should be rotated and
# removed once PGPASSWORD is provisioned wherever this script runs.
DB = dict(
    host=os.environ.get("PGHOST", "localhost"),
    port=int(os.environ.get("PGPORT", "5432")),
    dbname=os.environ.get("PGDATABASE", "rinet_v3"),
    user=os.environ.get("PGUSER", "rinet"),
    password=os.environ.get("PGPASSWORD", "R1net2026!SecureDB#v7"),
)

# User-Agent header sent with every HTTP request made through the session.
UA = "RiNET-Civic/1.0 (https://rinet.one)"

# Per-request timeout passed to requests, in seconds.
TIMEOUT = 20

# Pause, in seconds, that callers insert between consecutive HTTP requests.
DELAY = 0.4

# One shared session so all requests carry the same default headers.
s = requests.Session()
s.headers.update({"User-Agent": UA, "Accept-Language": "hr,en"})
|
|
|
|
def get_wikidata_id(title, lang):
    """Return the Wikidata Q-id linked to a Wikipedia page, or None.

    Asks the MediaWiki API of ``https://{lang}.wikipedia.org`` for the
    ``wikibase_item`` page property of ``title``.

    Args:
        title: Page title to look up.
        lang: Wikipedia language subdomain, e.g. ``"hr"`` or ``"en"``.

    Returns:
        The first non-empty ``wikibase_item`` value found, or None when the
        page is missing, has no linked item, the request fails, or the
        response cannot be decoded. Lookup is best-effort and never raises
        for network or parsing problems.
    """
    params = {
        "action": "query",
        "format": "json",
        "prop": "pageprops",
        "ppprop": "wikibase_item",
        "titles": title,
    }
    try:
        resp = s.get(
            f"https://{lang}.wikipedia.org/w/api.php",
            params=params,
            timeout=TIMEOUT,
        )
        resp.raise_for_status()
        pages = resp.json().get("query", {}).get("pages", {})
    except (requests.RequestException, ValueError, AttributeError):
        # Network/HTTP failure, invalid JSON, or a JSON document that is not
        # the expected nested object: treat all of them as "not found".
        return None

    if not isinstance(pages, dict):
        return None

    for page_id, page in pages.items():
        # The API reports a missing title under the page id "-1".
        if page_id == "-1" or not isinstance(page, dict):
            continue
        qid = (page.get("pageprops") or {}).get("wikibase_item")
        if qid:
            return qid
    return None
|
|
|
|
def get_wikidata_entity(qid):
    """Fetch the JSON description of a Wikidata entity.

    Args:
        qid: Wikidata entity id, e.g. ``"Q42"``.

    Returns:
        The value stored under ``entities[qid]`` in the
        ``Special:EntityData/{qid}.json`` response, or None when the request
        fails, the body is not valid JSON, or that key is absent.
    """
    url = f"https://www.wikidata.org/wiki/Special:EntityData/{qid}.json"
    try:
        resp = s.get(url, timeout=TIMEOUT)
        resp.raise_for_status()
        return resp.json().get("entities", {}).get(qid)
    except (requests.RequestException, ValueError, AttributeError):
        # Best-effort lookup: network/HTTP errors, undecodable JSON, or an
        # unexpected JSON shape all mean "no entity available".
        return None
|
|
|
|
def get_label(qid, lang="hr"):
    """Return a human-readable label for a Wikidata entity.

    The entity is fetched with ``get_wikidata_entity`` so that the HTTP and
    error handling live in one place.

    Args:
        qid: Wikidata entity id, e.g. ``"Q1647"``.
        lang: Preferred label language code. Defaults to ``"hr"``.

    Returns:
        The label in ``lang`` if present, otherwise the ``"en"`` label,
        otherwise the ``"hr"`` label. None when the entity cannot be fetched
        or none of those labels exists.
    """
    entity = get_wikidata_entity(qid)
    if not isinstance(entity, dict):
        return None

    labels = entity.get("labels")
    if not isinstance(labels, dict):
        # No label map to read (missing, or not a JSON object).
        return None

    # dict.fromkeys de-duplicates while preserving the preference order.
    for code in dict.fromkeys((lang, "en", "hr")):
        entry = labels.get(code)
        if isinstance(entry, dict) and entry.get("value"):
            return entry["value"]
    return None
|
|
|
|
def parse_birth(entity):
    """Extract birth date and birth place from a Wikidata entity.

    Reads the ``P569`` (date of birth) and ``P19`` (place of birth) claims.

    Args:
        entity: Entity dict as returned by ``get_wikidata_entity``, or a
            falsy value when no entity is available.

    Returns:
        A dict with zero, one or both of these keys:

        * ``"datum_rodenja"``: ``YYYY-MM-DD`` string from the first P569
          claim that holds a real calendar date known to day precision,
          later than the year 1900 and not in the future.
        * ``"mjesto_rodenja"``: label of the first P19 target for which
          ``get_label`` returns a value, truncated to 100 characters.

    Side effects:
        Calls ``get_label`` (an HTTP request) and sleeps ``DELAY`` seconds
        for each P19 claim it inspects.
    """
    out = {}
    if not entity:
        return out
    claims = entity.get("claims", {})

    # Wikibase time values carry a precision code; 11 means "day". Coarser
    # values (year, month, ...) do not describe an exact birthday.
    day_precision = 11

    for claim in claims.get("P569", []):
        try:
            value = claim["mainsnak"]["datavalue"]["value"]
            stamp = value["time"]
            precision = value.get("precision", day_precision)
        except (KeyError, TypeError, AttributeError):
            # "novalue"/"somevalue" snaks have no datavalue; skip them.
            continue
        if isinstance(precision, int) and precision < day_precision:
            continue
        if not isinstance(stamp, str):
            continue

        # Only an optional "+" sign is accepted: a leading "-" marks a BCE
        # year and must not be read as the same year CE.
        found = re.match(r"^\+?(\d{4})-(\d{2})-(\d{2})", stamp)
        if not found:
            continue
        year, month, day = (int(part) for part in found.groups())
        try:
            born = datetime.date(year, month, day)
        except ValueError:
            # Month/day 00 or an impossible date such as Feb 30.
            continue
        if year > 1900 and born <= datetime.date.today():
            out["datum_rodenja"] = born.isoformat()
            break

    for claim in claims.get("P19", []):
        try:
            place_qid = claim["mainsnak"]["datavalue"]["value"]["id"]
        except (KeyError, TypeError):
            continue
        label = get_label(place_qid, "hr")
        # Pause after every label request, successful or not.
        time.sleep(DELAY)
        if label:
            out["mjesto_rodenja"] = label[:100]
            break

    return out
|
|
|
|
# === MAIN ===
# For every member whose source_url points at a Wikipedia article and who is
# still missing a birth date or birth place, resolve the article to a Wikidata
# item and fill in whichever of the two fields is empty.

# Matches http and https article URLs on any language edition, including
# hyphenated language codes and the mobile "m." host. The title stops at "?"
# or "#" so query strings and section anchors are not sent as part of it.
WIKI_URL_RE = re.compile(
    r"https?://([a-z0-9-]+)(?:\.m)?\.wikipedia\.org/wiki/([^?#]+)",
    re.IGNORECASE,
)

conn = psycopg2.connect(**DB)
conn.autocommit = True
try:
    cr = conn.cursor()

    # No parameters are passed here, so the "%" wildcards reach the server
    # unchanged.
    cr.execute("""
        SELECT id, ime, prezime, source_url, datum_rodenja, mjesto_rodenja
        FROM pgz_sport.clanovi
        WHERE source_url LIKE '%wikipedia.org/wiki/%'
        AND (datum_rodenja IS NULL OR mjesto_rodenja IS NULL)
    """)
    todo = cr.fetchall()
    print(f"Pending: {len(todo)}")

    success = 0
    for cid, ime, prezime, source_url, dob, mjesto in todo:
        m = WIKI_URL_RE.match(source_url.strip())
        if not m:
            continue
        lang = m.group(1).lower()
        # URL form -> page title: percent-decode, underscores become spaces.
        title = urllib.parse.unquote(m.group(2)).replace("_", " ")

        qid = get_wikidata_id(title, lang)
        time.sleep(DELAY)
        if not qid:
            # Try alternate lang
            alt = "en" if lang == "hr" else "hr"
            qid = get_wikidata_id(title, alt)
            time.sleep(DELAY)
        if not qid:
            print(f" ✗ {ime} {prezime}: no Q-id")
            continue

        entity = get_wikidata_entity(qid)
        time.sleep(DELAY)
        parsed = parse_birth(entity)
        if not parsed:
            print(f" ✗ {ime} {prezime} ({qid}): no birth data")
            continue

        # Only fill columns that are currently empty; never overwrite data.
        sets, vals = [], []
        if parsed.get("datum_rodenja") and not dob:
            sets.append("datum_rodenja = %s")
            vals.append(parsed["datum_rodenja"])
        if parsed.get("mjesto_rodenja") and not mjesto:
            sets.append("mjesto_rodenja = %s")
            vals.append(parsed["mjesto_rodenja"])
        if sets:
            sets.append("source_synced_at = now()")
            vals.append(cid)
            # The SET fragments are fixed strings built above; every value
            # goes through query parameters.
            cr.execute(
                f"UPDATE pgz_sport.clanovi SET {', '.join(sets)} WHERE id = %s",
                tuple(vals),
            )
            success += 1
            flags = []
            if parsed.get("datum_rodenja"):
                flags.append(f"DOB={parsed['datum_rodenja']}")
            if parsed.get("mjesto_rodenja"):
                flags.append(f"M={parsed['mjesto_rodenja']}")
            print(f" ✓ {ime} {prezime} ({qid}): {' '.join(flags)}")

    print(f"\n=== Round 2 Updated: {success}/{len(todo)} ===")

    # Re-stats
    cr.execute("""SELECT
        count(*) FILTER (WHERE datum_rodenja IS NOT NULL AND source_url LIKE '%wikipedia%') AS dob,
        count(*) FILTER (WHERE mjesto_rodenja IS NOT NULL AND source_url LIKE '%wikipedia%') AS mjesto,
        count(*) FILTER (WHERE source_url LIKE '%wikipedia%') AS total
        FROM pgz_sport.clanovi""")
    r = cr.fetchone()
    print(f"Final wiki: dob={r[0]} mjesto={r[1]} of total {r[2]}")
finally:
    # Release the connection even when a request or query raises mid-run.
    conn.close()
|