#!/usr/bin/env python3
"""
geocode_objekti_v2.py — precision geocoding for pgz_sport.sportski_objekti

Re-geocodes all objects via Nominatim using {naziv} + {grad} + ", Hrvatska" queries.
Verifies result is within PGŽ bounds (44.5-45.6, 14.0-15.1) and NOT a duplicated
"city centroid" (where multiple objects share identical coordinates from a previous
fallback pass): hits whose Nominatim class is "place" (city/town/village/suburb/
locality) are rejected. Updates lat/lng only when a more precise result is found.

Usage: python3 geocode_objekti_v2.py [--dry-run] [--only-duplicates] [--id ID]
"""
import argparse
import json
import os
import sys
import time
import urllib.parse
import urllib.request
from contextlib import closing

# Connection parameters, overridable through PG_* environment variables.
PG = dict(host=os.environ.get('PG_HOST', '10.10.0.2'),
          port=int(os.environ.get('PG_PORT', '6432')),
          dbname=os.environ.get('PG_DB', 'rinet_v3'),
          user=os.environ.get('PG_USER', 'rinet'),
          password=os.environ.get('PG_PASS', ''))

# Bounding box (min, max) used to reject hits outside PGŽ.
PGZ_LAT = (44.5, 45.6)
PGZ_LNG = (14.0, 15.1)

UA = 'pgz-sport/2.0 (dradulic@outlook.com)'

NOMINATIM_URL = 'https://nominatim.openstreetmap.org/search'

# Pause after every request so we stay under Nominatim's 1 req/s policy.
REQUEST_PAUSE_S = 1.05

# Coordinate deltas below this (in degrees, per axis) count as "unchanged".
# 0.0005° ≈ 55m at this latitude.
MIN_DELTA_DEG = 0.0005

# Score per Nominatim "type" (looked up first) or "class" (fallback).
_TYPE_PRIORITY = {
    'sports_centre': 100,
    'stadium': 95,
    'pitch': 90,
    'swimming_pool': 90,
    'sports_hall': 95,
    'leisure': 80,
    'building': 70,
    'tourism': 60,
    'highway': 30,
    'place': 20,
}
_DEFAULT_PRIORITY = 50

# Leading words stripped from a name to produce a shorter alternative query.
_NAME_PREFIXES = (
    'Sportska dvorana ',
    'Gradska sportska dvorana ',
    'Multifunkcionalna dvorana za sport i turizam ',
    'Stadion ',
    'Bazen ',
    'Bazeni ',
    'Dvorana ',
    'Boćalište ',
    'Kuglana ',
    'Marina ',
)

# class == 'place' hits of these types are settlement centroids, not venues.
_CENTROID_TYPES = ('city', 'town', 'village', 'suburb', 'locality')


def nominatim(q, country='hr', limit=3):
    """Run one Nominatim search and return the list of raw result dicts.

    Args:
        q: free-text search query.
        country: value for the ``countrycodes`` filter.
        limit: maximum number of results requested.

    Returns:
        The decoded JSON list, or ``[]`` when the request fails or the
        response is not a JSON list.
    """
    params = urllib.parse.urlencode(
        {
            'q': q,
            'format': 'json',
            'limit': limit,
            'countrycodes': country,
            'addressdetails': 1,
        },
        quote_via=urllib.parse.quote,  # spaces as %20, same as before
    )
    req = urllib.request.Request(f'{NOMINATIM_URL}?{params}',
                                 headers={'User-Agent': UA})
    try:
        with urllib.request.urlopen(req, timeout=10) as r:
            data = json.loads(r.read().decode())
    except Exception as e:
        # Deliberately best-effort: a failed lookup must not abort the batch.
        print(f' ! nominatim error: {e}')
        return []
    # Anything that is not a list of hits (e.g. an error object) is unusable.
    return data if isinstance(data, list) else []


def in_pgz(lat, lng):
    """Return True when (lat, lng) lies inside the PGŽ bounding box (inclusive)."""
    return PGZ_LAT[0] <= lat <= PGZ_LAT[1] and PGZ_LNG[0] <= lng <= PGZ_LNG[1]


def best_result(results):
    """Pick best precision: prefer leisure/sports types, then building, then place.

    Args:
        results: iterable of Nominatim result dicts (may be empty or None).

    Returns:
        ``(lat, lng, raw_result)`` for the highest-scoring hit inside the PGŽ
        bounds, or ``None`` when no hit qualifies. Ties keep the earlier hit.
    """
    if not results:
        return None
    best = None
    best_score = -1
    for r in results:
        try:
            lat = float(r['lat'])
            lng = float(r['lon'])
        except (KeyError, TypeError, ValueError):
            # Missing, null or unparsable coordinates (or a non-dict entry).
            continue
        if not in_pgz(lat, lng):
            continue
        cls = r.get('class', '')
        typ = r.get('type', '')
        # importance is Nominatim's intrinsic relevance score; treat a
        # missing/null/garbled value as 0 instead of crashing.
        try:
            importance = float(r.get('importance') or 0)
        except (TypeError, ValueError):
            importance = 0.0
        priority = _TYPE_PRIORITY.get(
            typ, _TYPE_PRIORITY.get(cls, _DEFAULT_PRIORITY))
        score = priority + importance * 10
        if score > best_score:
            best_score = score
            best = (lat, lng, r)
    return best


def queries_for(naziv, grad, adresa):
    """Generate ordered queries from most specific to most general.

    Args:
        naziv: object name (may be None/empty).
        grad: town name (may be None/empty).
        adresa: street address (may be None/empty).

    Returns:
        De-duplicated list of query strings, order preserved.
    """
    n = (naziv or '').strip()
    g = (grad or '').strip()
    a = (adresa or '').strip()

    qs = []
    if a and g:
        qs.append(f'{a}, {g}, Hrvatska')
    if n and g:
        qs.append(f'{n}, {g}, Hrvatska')

    # Strip common prefixes for a cleaner search (first matching prefix only).
    short = n
    for prefix in _NAME_PREFIXES:
        if short.startswith(prefix):
            short = short[len(prefix):].strip()
            break
    if short and short != n and g:
        qs.append(f'{short}, {g}, Hrvatska')

    if n:
        qs.append(f'{n}, Hrvatska')
    if g and a:
        qs.append(f'{a}, {g}')

    # dedup preserving order (dict keys keep insertion order)
    return list(dict.fromkeys(qs))


def _fetch_rows(cur, args):
    """Select the (id, naziv, grad, adresa, lat, lng) rows chosen by the CLI flags."""
    # `is not None` so that an explicit --id 0 is honoured rather than
    # silently falling through to "process everything".
    if args.id is not None:
        cur.execute(
            "SELECT id, naziv, grad, adresa, lat, lng "
            "FROM pgz_sport.sportski_objekti WHERE id=%s",
            (args.id,))
    elif args.only_duplicates:
        cur.execute("""
            WITH dup AS (
                SELECT lat, lng
                FROM pgz_sport.sportski_objekti
                WHERE lat IS NOT NULL
                GROUP BY lat, lng
                HAVING count(*)>1
            )
            SELECT s.id, s.naziv, s.grad, s.adresa, s.lat, s.lng
            FROM pgz_sport.sportski_objekti s
            JOIN dup d USING (lat, lng)
            ORDER BY s.id
        """)
    else:
        cur.execute(
            "SELECT id, naziv, grad, adresa, lat, lng "
            "FROM pgz_sport.sportski_objekti ORDER BY id")
    return cur.fetchall()


def _find_position(naziv, grad, adresa):
    """Try each query in turn; return (lat, lng, query) for the first usable hit, else None."""
    for q in queries_for(naziv, grad, adresa):
        results = nominatim(q)
        time.sleep(REQUEST_PAUSE_S)  # Nominatim 1 req/s policy
        best = best_result(results)
        if not best:
            print(f' "{q}" -> no result in PGŽ bounds')
            continue
        lat, lng, raw = best
        # Skip queries that just resolve to a place/town center
        if raw.get('class') == 'place' and raw.get('type') in _CENTROID_TYPES:
            print(f' "{q}" -> {raw.get("display_name","")[:60]} (place type, skip)')
            continue
        print(f' "{q}" -> {lat},{lng} [{raw.get("class")}/{raw.get("type")}]')
        return (lat, lng, q)
    return None


def _process_rows(conn, cur, rows, dry_run):
    """Re-geocode every row, write changes, and print the run summary."""
    updated = 0  # in dry-run mode this counts the updates that would be made
    skipped = 0
    failed = []
    total = len(rows)

    for i, (oid, naziv, grad, adresa, oldlat, oldlng) in enumerate(rows, 1):
        print(f'[{i}/{total}] #{oid} {naziv} ({grad}) — current: {oldlat},{oldlng}')

        new_pos = _find_position(naziv, grad, adresa)
        if not new_pos:
            failed.append((oid, naziv, grad))
            print(' ✗ no precise match found')
            continue
        nlat, nlng, _ = new_pos

        # Detect meaningful change (>50m). 0.0005° ≈ 55m at this latitude.
        if oldlat is not None and oldlng is not None:
            dlat = abs(float(oldlat) - nlat)
            dlng = abs(float(oldlng) - nlng)
            if dlat < MIN_DELTA_DEG and dlng < MIN_DELTA_DEG:
                print(' = unchanged (within 50m)')
                skipped += 1
                continue

        if dry_run:
            print(f' [DRY] would UPDATE id={oid} -> {nlat},{nlng}')
        else:
            cur.execute("""
                UPDATE pgz_sport.sportski_objekti
                SET lat=%s, lng=%s
                WHERE id=%s
            """, (nlat, nlng, oid))
            # Commit per row so an interrupted run keeps its progress.
            conn.commit()
            print(f' ✓ UPDATED -> {nlat},{nlng}')
        updated += 1

    print('')
    print(f'== Summary: {updated} updated, {skipped} unchanged, {len(failed)} failed ==')
    if failed:
        print('Failed:')
        for oid, n, g in failed:
            print(f' #{oid} {n} ({g})')


def main():
    """CLI entry point: parse flags, load the target rows and re-geocode them."""
    ap = argparse.ArgumentParser()
    ap.add_argument('--dry-run', action='store_true')
    ap.add_argument('--only-duplicates', action='store_true',
                    help='only re-geocode objects sharing coordinates with another object')
    ap.add_argument('--id', type=int, help='single object ID to re-geocode')
    args = ap.parse_args()

    # Imported here so the pure helpers above remain importable (and
    # testable) on machines without the PostgreSQL driver installed.
    import psycopg2

    # closing() guarantees cursor and connection are released even when a
    # query or geocoding step raises part-way through the batch.
    with closing(psycopg2.connect(**PG)) as conn, closing(conn.cursor()) as cur:
        rows = _fetch_rows(cur, args)
        print(f'== Processing {len(rows)} objects (dry_run={args.dry_run}) ==')
        _process_rows(conn, cur, rows, args.dry_run)


if __name__ == '__main__':
    main()