#!/usr/bin/env python3
"""D.2b — Better URL handling for HR Wikipedia + Wikidata.

For every row of ``pgz_sport.clanovi`` whose ``source_url`` points at a
Wikipedia article and that is still missing a birth date or birth place, the
script resolves the article to its Wikidata item and fills the missing
columns from the item's P569 (date of birth) and P19 (place of birth) claims.

All network helpers are best-effort: they return ``None`` on any HTTP or
decoding failure so that a single bad article never aborts the whole run.
"""
import datetime
import json
import os
import re
import time
import urllib.parse

import psycopg2
import requests

# NOTE(security): the database password is embedded in this source file.
# It can be overridden through the RINET_DB_PASSWORD environment variable;
# the literal below is kept only as a backward-compatible fallback and should
# be rotated and removed from version control.
DB = dict(
    host="10.10.0.2",
    port=6432,
    dbname="rinet_v3",
    user="rinet",
    password=os.environ.get("RINET_DB_PASSWORD", "R1net2026!SecureDB#v7"),
)
UA = "RiNET-Civic/1.0 (https://rinet.one)"
TIMEOUT = 20   # seconds, per HTTP request
DELAY = 0.4    # seconds to pause after each remote call

s = requests.Session()
s.headers.update({"User-Agent": UA, "Accept-Language": "hr,en"})

# Accepts http/https, hyphenated language codes and the mobile ".m." host;
# the title stops at the first "?" or "#" so query strings and section
# anchors never leak into the page title.
_WIKI_URL_RE = re.compile(
    r"^https?://([\w-]+)(?:\.m)?\.wikipedia\.org/wiki/([^?#]+)"
)
_TIME_RE = re.compile(r"^[+-]?(\d{4})-(\d{2})-(\d{2})")
_BIRTH_YEAR_BOUNDS = (1900, 2026)  # exclusive lower / upper bound
_DAY_PRECISION = 11                # Wikidata time precision code for "day"
_RANK_ORDER = {"preferred": 0, "normal": 1}


def _dig(obj, *keys):
    """Walk nested dicts along *keys*; return None as soon as a level is missing."""
    for key in keys:
        if not isinstance(obj, dict):
            return None
        obj = obj.get(key)
    return obj


def _fetch_json(url, params=None):
    """GET *url* with the shared session and return the decoded JSON object.

    Returns None on any transport error, non-2xx status, undecodable body,
    or when the top-level JSON value is not an object.
    """
    try:
        resp = s.get(url, params=params, timeout=TIMEOUT)
        resp.raise_for_status()
        data = resp.json()
    except (requests.RequestException, ValueError):
        # ValueError covers JSON decoding failures on older requests versions.
        return None
    return data if isinstance(data, dict) else None


def get_wikidata_id(title, lang):
    """Return the Wikidata Q-id linked to Wikipedia page *title*, or None.

    *lang* is the Wikipedia language subdomain (e.g. ``"hr"``). Redirects are
    resolved server-side so that a URL pointing at a redirect page still
    yields the target article's item.
    """
    data = _fetch_json(
        f"https://{lang}.wikipedia.org/w/api.php",
        params={
            "action": "query",
            "format": "json",
            "prop": "pageprops",
            "ppprop": "wikibase_item",
            "redirects": 1,
            "titles": title,
        },
    )
    pages = _dig(data, "query", "pages")
    if not isinstance(pages, dict):
        return None
    for page_id, page in pages.items():
        if page_id == "-1":  # the API reports a missing page under id "-1"
            continue
        qid = _dig(page, "pageprops", "wikibase_item")
        if qid:
            return qid
    return None


def get_wikidata_entity(qid):
    """Return the Wikidata entity dict for *qid*, or None if unavailable."""
    data = _fetch_json(
        f"https://www.wikidata.org/wiki/Special:EntityData/{qid}.json"
    )
    return _dig(data, "entities", qid)


def get_label(qid, lang="hr"):
    """Return the label of item *qid* in *lang*, falling back to en, then hr.

    Returns None when the entity cannot be fetched or has none of these labels.
    """
    labels = _dig(get_wikidata_entity(qid), "labels")
    for code in (lang, "en", "hr"):
        value = _dig(labels, code, "value")
        if value:
            return value
    return None


def _ranked_claims(entity, prop):
    """Return the usable statements for *prop*: preferred first, deprecated dropped."""
    claims = _dig(entity, "claims", prop)
    if not isinstance(claims, list):
        return []
    usable = [
        c for c in claims
        if isinstance(c, dict) and c.get("rank") != "deprecated"
    ]
    # sorted() is stable, so statements of equal rank keep their source order.
    return sorted(usable, key=lambda c: _RANK_ORDER.get(c.get("rank"), 1))


def _parse_birth_date(value):
    """Turn a Wikidata time value dict into ``YYYY-MM-DD``, or None if unusable.

    Rejected: BCE dates, years outside ``_BIRTH_YEAR_BOUNDS``, values whose
    declared precision is coarser than a day, and impossible calendar dates
    (including the ``-00-00`` placeholders used for imprecise dates).
    """
    stamp = _dig(value, "time")
    if not isinstance(stamp, str) or stamp.startswith("-"):
        return None
    precision = _dig(value, "precision")
    if isinstance(precision, int) and precision < _DAY_PRECISION:
        return None
    m = _TIME_RE.match(stamp)
    if not m:
        return None
    year, month, day = (int(g) for g in m.groups())
    low, high = _BIRTH_YEAR_BOUNDS
    if not low < year < high:
        return None
    try:
        return datetime.date(year, month, day).isoformat()
    except ValueError:
        return None


def parse_birth(entity):
    """Extract birth date and birth place from a Wikidata *entity* dict.

    Returns a dict with zero, one or both of the keys ``datum_rodenja``
    (ISO date string) and ``mjesto_rodenja`` (place label, at most 100
    characters). Resolving the place label performs a network request per
    candidate claim, followed by a ``DELAY`` pause.
    """
    out = {}
    if not entity:
        return out

    for claim in _ranked_claims(entity, "P569"):
        dob = _parse_birth_date(_dig(claim, "mainsnak", "datavalue", "value"))
        if dob:
            out["datum_rodenja"] = dob
            break

    for claim in _ranked_claims(entity, "P19"):
        place_qid = _dig(claim, "mainsnak", "datavalue", "value", "id")
        if not place_qid:  # "no value" / "unknown value" snaks carry no id
            continue
        lbl = get_label(place_qid, "hr")
        time.sleep(DELAY)
        if lbl:
            out["mjesto_rodenja"] = lbl[:100]
            break
    return out


def _parse_wiki_url(source_url):
    """Split a Wikipedia article URL into ``(lang, title)``; None if unsupported."""
    m = _WIKI_URL_RE.match((source_url or "").strip())
    if not m:
        return None
    title = urllib.parse.unquote(m.group(2)).replace("_", " ").strip()
    if not title:
        return None
    return m.group(1), title


def _resolve_qid(title, lang):
    """Look *title* up on the *lang* wiki, then on the alternate hr/en wiki."""
    qid = get_wikidata_id(title, lang)
    time.sleep(DELAY)
    if not qid:
        # Try alternate lang
        alt = "en" if lang == "hr" else "hr"
        qid = get_wikidata_id(title, alt)
        time.sleep(DELAY)
    return qid


def main():
    """Fill missing birth data for all pending members and print statistics."""
    conn = psycopg2.connect(**DB)
    try:
        conn.autocommit = True
        with conn.cursor() as cr:
            cr.execute("""
                SELECT id, ime, prezime, source_url, datum_rodenja, mjesto_rodenja
                FROM pgz_sport.clanovi
                WHERE source_url LIKE '%wikipedia.org/wiki/%'
                  AND (datum_rodenja IS NULL OR mjesto_rodenja IS NULL)
            """)
            todo = cr.fetchall()
            print(f"Pending: {len(todo)}")

            success = 0
            for cid, ime, prezime, source_url, dob, mjesto in todo:
                parts = _parse_wiki_url(source_url)
                if not parts:
                    print(f"  ✗ {ime} {prezime}: unsupported URL {source_url!r}")
                    continue
                lang, title = parts

                qid = _resolve_qid(title, lang)
                if not qid:
                    print(f"  ✗ {ime} {prezime}: no Q-id")
                    continue

                entity = get_wikidata_entity(qid)
                time.sleep(DELAY)
                parsed = parse_birth(entity)
                if not parsed:
                    print(f"  ✗ {ime} {prezime} ({qid}): no birth data")
                    continue

                # Only columns that are still empty are overwritten.
                sets, vals = [], []
                if parsed.get("datum_rodenja") and not dob:
                    sets.append("datum_rodenja = %s")
                    vals.append(parsed["datum_rodenja"])
                if parsed.get("mjesto_rodenja") and not mjesto:
                    sets.append("mjesto_rodenja = %s")
                    vals.append(parsed["mjesto_rodenja"])
                if not sets:
                    continue

                sets.append("source_synced_at = now()")
                vals.append(cid)
                # Column fragments are fixed literals; values are bound parameters.
                cr.execute(
                    f"UPDATE pgz_sport.clanovi SET {', '.join(sets)} WHERE id = %s",
                    tuple(vals),
                )
                success += 1

                flags = []
                if parsed.get("datum_rodenja"):
                    flags.append(f"DOB={parsed['datum_rodenja']}")
                if parsed.get("mjesto_rodenja"):
                    flags.append(f"M={parsed['mjesto_rodenja']}")
                print(f"  ✓ {ime} {prezime} ({qid}): {' '.join(flags)}")

            print(f"\n=== Round 2 Updated: {success}/{len(todo)} ===")

            # Re-stats
            cr.execute("""SELECT
                count(*) FILTER (WHERE datum_rodenja IS NOT NULL AND source_url LIKE '%wikipedia%') AS dob,
                count(*) FILTER (WHERE mjesto_rodenja IS NOT NULL AND source_url LIKE '%wikipedia%') AS mjesto,
                count(*) FILTER (WHERE source_url LIKE '%wikipedia%') AS total
                FROM pgz_sport.clanovi""")
            r = cr.fetchone()
            print(f"Final wiki: dob={r[0]} mjesto={r[1]} of total {r[2]}")
    finally:
        # Release the connection even when a query or remote call raises.
        conn.close()


# === MAIN ===
if __name__ == "__main__":
    main()