#!/usr/bin/env python3
# Provenance (recovered from the git patch envelope this script was shipped in):
#   commit   64082d0642d45f06e1342b0a9674a0dd75acd785
#   author   claude-cc1
#   date     Tue, 5 May 2026 00:04:50 +0200
#   subject  CC1 R3B-P3 — geocoding precision (Crikvenica + OSM cross-check)
#   notes
#     - New scripts/geocode_v3_osm.py: matches DB objekti against OSM Overpass
#       sports facilities
#     - Applied 53 OSM updates, then reverted bad cross-city matches to
#       hand-curated coords
#     - Crikvenica venues now precise (Gradska dvorana, SS Antun Barac,
#       Stadion, Sport+ Centar)
#     - Atletska dvorana Luciano Sušanj fixed to Kantrida
#     - Skate park Delta, Boulder dvorana, Boćarski Podvežica reverted from
#       wrong matches
#     - Google Places API not available (project disabled), Overpass + curated
#       fallback used
#   Co-Authored-By: Claude Opus 4.7 (1M context)
"""
geocode_v3_osm.py — fuzzy-match objekti against OSM sports facilities

Strategy:
1) Pull all named sports leisure objects from OSM via Overpass API in PGŽ bounds.
2) For each pgz_sport.sportski_objekti row, compute a similarity match against
   OSM names.
3) When a confident match is found AND new coords differ from current by >100m,
   update the DB.

Usage:
    geocode_v3_osm.py [--dry-run] [--max-jump-m METERS]

    --dry-run      print the proposed updates without writing anything.
    --max-jump-m   reject a match that would move an already-geocoded row by
                   more than METERS (off by default). Useful against matches
                   that land in a different city.
"""
import argparse
import json
import math
import os
import re
import time  # noqa: F401  (kept: was part of the original import block)
import unicodedata
import urllib.parse
import urllib.request
from difflib import SequenceMatcher

# Connection settings; every value can be overridden through the environment.
PG = dict(host=os.environ.get('PG_HOST', '10.10.0.2'),
          port=int(os.environ.get('PG_PORT', '6432')),
          dbname=os.environ.get('PG_DB', 'rinet_v3'),
          user=os.environ.get('PG_USER', 'rinet'),
          password=os.environ.get('PG_PASS', ''))

UA = 'pgz-sport/2.0 (dradulic@outlook.com)'

OVERPASS_URL = 'https://overpass-api.de/api/interpreter'

OVERPASS = """[out:json][timeout:60];
(
  node["leisure"~"sports_centre|sports_hall|stadium|pitch|swimming_pool|ice_rink"](44.5,14.0,45.6,15.1);
  way["leisure"~"sports_centre|sports_hall|stadium|pitch|swimming_pool|ice_rink"](44.5,14.0,45.6,15.1);
  node["sport"]["name"](44.5,14.0,45.6,15.1);
  way["sport"]["name"](44.5,14.0,45.6,15.1);
  node["amenity"~"sports_centre|gymnasium"](44.5,14.0,45.6,15.1);
  way["amenity"~"sports_centre|gymnasium"](44.5,14.0,45.6,15.1);
);
out center tags;"""

MIN_SIMILARITY = 0.55   # a best score below this is not trusted
CITY_BOOST = 0.05       # bonus when the row's city appears in the OSM name/addr:city
MIN_MOVE_M = 100        # rows already within this distance of the match are left alone

# Generic facility words that carry no identifying information. They are
# compared against diacritic-folded text, hence the ASCII spellings.
_STOP_PHRASES = (
    'sportska dvorana', 'gradska sportska dvorana', 'multifunkcionalna dvorana',
    'sportski centar', 'gradski stadion', 'sportski kompleks', 'srednja skola',
    'srednje skole', 'osnovna skola', 'os', 'ss', 'dr', 'prof',
    'centar', 'stadion', 'dvorana', 'bazen', 'bazeni',
)
# Longest phrase first so 'gradska sportska dvorana' wins over its own suffix;
# \b anchors so short tokens such as 'os' never bite into longer words.
_STOP_RE = re.compile(
    r'\b(?:'
    + '|'.join(re.escape(p) for p in sorted(_STOP_PHRASES, key=len, reverse=True))
    + r')\b'
)
_PUNCT_RE = re.compile(r'[^\w\s]')
_SPACE_RE = re.compile(r'\s+')
# 'đ' has no Unicode decomposition, so NFKD alone would leave it untouched.
_NO_DECOMPOSITION = str.maketrans({'đ': 'd'})


def fetch_osm(url=OVERPASS_URL, timeout=120):
    """POST the OVERPASS query to *url* and return the decoded JSON response.

    Network and HTTP errors from ``urllib`` propagate to the caller.
    """
    req = urllib.request.Request(
        url,
        data=urllib.parse.urlencode({'data': OVERPASS}).encode(),
        headers={'User-Agent': UA,
                 'Content-Type': 'application/x-www-form-urlencoded'})
    with urllib.request.urlopen(req, timeout=timeout) as r:
        return json.loads(r.read().decode())


def _fold(s):
    """Lower-case *s* and strip diacritics (š→s, č→c, ć→c, ž→z, đ→d)."""
    lowered = (s or '').lower().translate(_NO_DECOMPOSITION)
    decomposed = unicodedata.normalize('NFKD', lowered)
    return ''.join(ch for ch in decomposed if not unicodedata.combining(ch))


def normalize(s):
    """Reduce a facility name to the tokens that actually identify it.

    The name is lower-cased, diacritics are folded, punctuation becomes
    whitespace, and generic facility words (see ``_STOP_PHRASES``) are removed
    as whole words. ``None`` and names made only of generic words yield ``''``.
    """
    s = _PUNCT_RE.sub(' ', _fold(s))
    # Collapse first so multi-word phrases match across former punctuation.
    s = _SPACE_RE.sub(' ', s).strip()
    s = _STOP_RE.sub(' ', s)
    return _SPACE_RE.sub(' ', s).strip()


def _ratio(norm_a, norm_b):
    """SequenceMatcher ratio of two already-normalized names.

    Returns 0.0 if either side is empty: two names that both consist solely of
    generic words ("Stadion" vs "Dvorana") must not count as a perfect match.
    """
    if not norm_a or not norm_b:
        return 0.0
    return SequenceMatcher(None, norm_a, norm_b).ratio()


def similarity(a, b):
    """Return a 0.0–1.0 similarity score between two raw facility names."""
    return _ratio(normalize(a), normalize(b))


def haversine(lat1, lng1, lat2, lng2):
    """Distance in meters."""
    R = 6371000
    p1 = math.radians(lat1)
    p2 = math.radians(lat2)
    dp = math.radians(lat2 - lat1)
    dl = math.radians(lng2 - lng1)
    a = math.sin(dp / 2) ** 2 + math.cos(p1) * math.cos(p2) * math.sin(dl / 2) ** 2
    # Rounding can push `a` marginally outside [0, 1] for (near-)antipodal
    # points, which would make asin() raise a domain error.
    a = min(1.0, max(0.0, a))
    return 2 * R * math.asin(math.sqrt(a))


def _named_elements(osm):
    """Extract named, located elements from an Overpass response.

    Each result carries the raw name/tags plus pre-computed matching keys so
    the per-row matching loop does not redo the normalization for every pair.
    """
    elems = []
    for e in osm.get('elements', []):
        tags = e.get('tags') or {}
        name = tags.get('name')
        if not name:
            continue
        # Nodes carry lat/lon directly; ways expose them under 'center'.
        center = e.get('center') or {}
        lat = e.get('lat') if e.get('lat') is not None else center.get('lat')
        lon = e.get('lon') if e.get('lon') is not None else center.get('lon')
        if lat is None or lon is None:
            continue
        elems.append({
            'name': name, 'lat': lat, 'lng': lon, 'tags': tags,
            'norm': normalize(name),
            'name_fold': _fold(name),
            'city_fold': _fold(tags.get('addr:city')),
        })
    return elems


def _best_match(naziv_norm, grad, elems):
    """Return ``(element, score)`` for the best candidate, or ``(None, 0.0)``.

    The score is the name ratio plus ``CITY_BOOST`` when the row's city occurs
    in the OSM name or its ``addr:city`` tag. The boost is deliberately not
    capped at 1.0 so it also works as a tie-breaker between perfect matches.
    """
    grad_fold = _fold(grad)
    best, best_sim = None, 0.0
    for e in elems:
        sim = _ratio(naziv_norm, e['norm'])
        if grad_fold and (grad_fold in e['name_fold'] or grad_fold in e['city_fold']):
            sim += CITY_BOOST
        if sim > best_sim:
            best, best_sim = e, sim
    return best, best_sim


def main(argv=None):
    """Match every DB facility against OSM and update drifted coordinates.

    argv: optional argument list (defaults to ``sys.argv[1:]``). With no
    arguments the behaviour is: update every row whose best match scores at
    least ``MIN_SIMILARITY`` and lies ``MIN_MOVE_M`` or more from the stored
    coordinates (rows without coordinates are always updated on a match).

    All updates are committed together at the end; if anything fails midway
    the connection is closed without committing, so no partial rewrite of the
    table is left behind.
    """
    parser = argparse.ArgumentParser(
        description='Fuzzy-match sportski_objekti against OSM sports facilities.')
    parser.add_argument('--dry-run', action='store_true',
                        help='print proposed updates without writing to the DB')
    parser.add_argument('--max-jump-m', type=float, default=None,
                        help='skip matches that would move an already-geocoded '
                             'row by more than this many meters')
    args = parser.parse_args(argv)

    # Imported here so the pure helpers above stay importable (and testable)
    # on machines without the PostgreSQL driver; still fails before any I/O.
    import psycopg2
    import psycopg2.extras

    print('Fetching OSM sports data...')
    elems = _named_elements(fetch_osm())
    print(f'OSM named sports elements: {len(elems)}')

    updated = 0
    skipped_close = 0
    skipped_low = 0
    skipped_far = 0

    conn = psycopg2.connect(**PG)
    try:
        with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
            cur.execute("SELECT id, naziv, grad, lat, lng FROM pgz_sport.sportski_objekti ORDER BY id")
            objekti = cur.fetchall()
            print(f'DB objekti: {len(objekti)}')

            for o in objekti:
                nname = normalize(o['naziv'])
                if not nname:
                    continue

                best, best_sim = _best_match(nname, o['grad'], elems)
                # Require strong match
                if best is None or best_sim < MIN_SIMILARITY:
                    skipped_low += 1
                    continue

                if o['lat'] is not None and o['lng'] is not None:
                    d = haversine(float(o['lat']), float(o['lng']),
                                  best['lat'], best['lng'])
                    # Skip if already within 100m
                    if d < MIN_MOVE_M:
                        skipped_close += 1
                        continue
                    # Optional guard against same-name venues in another city.
                    if args.max_jump_m is not None and d > args.max_jump_m:
                        skipped_far += 1
                        continue

                print(f" #{o['id']:3} {o['naziv'][:55]:55} -> '{best['name'][:40]}' sim={best_sim:.2f} {best['lat']:.6f},{best['lng']:.6f}")
                if not args.dry_run:
                    cur.execute("UPDATE pgz_sport.sportski_objekti SET lat=%s, lng=%s WHERE id=%s",
                                (best['lat'], best['lng'], o['id']))
                updated += 1

        if not args.dry_run:
            conn.commit()
    finally:
        conn.close()

    print(f'\nUpdated: {updated} Already-close: {skipped_close} Low-similarity: {skipped_low}')
    if args.max_jump_m is not None:
        print(f'Too-far (> {args.max_jump_m:.0f} m): {skipped_far}')
    if args.dry_run:
        print('Dry run: no rows were written.')


if __name__ == '__main__':
    main()