Files
pgz-sport/scripts/geocode_v3_osm.py
claude-cc1 64082d0642 CC1 R3B-P3 — geocoding precision (Crikvenica + OSM cross-check)
- New scripts/geocode_v3_osm.py: matches DB objekti against OSM Overpass sports facilities
- Applied 53 OSM updates, then reverted bad cross-city matches to hand-curated coords
- Crikvenica venues now precise (Gradska dvorana, SS Antun Barac, Stadion, Sport+ Centar)
- Atletska dvorana Luciano Sušanj fixed to Kantrida
- Skate park Delta, Boulder dvorana, Boćarski Podvežica reverted from wrong matches
- Google Places API not available (project disabled), Overpass + curated fallback used

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-05 00:04:50 +02:00

129 lines
4.9 KiB
Python
Executable File

#!/usr/bin/env python3
"""
geocode_v3_osm.py — fuzzy-match objekti against OSM sports facilities
Strategy:
1) Pull all named sports leisure objects from OSM via Overpass API in PGŽ bounds.
2) For each pgz_sport.sportski_objekti row, compute a similarity match against OSM names.
3) When a confident match is found AND new coords differ from current by >100m,
update the DB.
"""
import os, time, json, urllib.parse, urllib.request
import psycopg2, psycopg2.extras
import re
from difflib import SequenceMatcher
# PostgreSQL connection settings. Every value can be overridden through the
# matching PG_* environment variable; the literals below are the fallbacks.
PG = {
    'host': os.getenv('PG_HOST', '10.10.0.2'),
    'port': int(os.getenv('PG_PORT', '6432')),
    'dbname': os.getenv('PG_DB', 'rinet_v3'),
    'user': os.getenv('PG_USER', 'rinet'),
    'password': os.getenv('PG_PASS', ''),
}

# User-Agent header sent with the Overpass request.
UA = 'pgz-sport/2.0 (dradulic@outlook.com)'

# Overpass QL query: sports-related nodes and ways inside one fixed bounding
# box. The leisure/amenity clauses do not require a name tag; unnamed results
# are discarded later by the caller. "out center" makes ways carry a single
# representative point.
OVERPASS = """[out:json][timeout:60];
(
node["leisure"~"sports_centre|sports_hall|stadium|pitch|swimming_pool|ice_rink"](44.5,14.0,45.6,15.1);
way["leisure"~"sports_centre|sports_hall|stadium|pitch|swimming_pool|ice_rink"](44.5,14.0,45.6,15.1);
node["sport"]["name"](44.5,14.0,45.6,15.1);
way["sport"]["name"](44.5,14.0,45.6,15.1);
node["amenity"~"sports_centre|gymnasium"](44.5,14.0,45.6,15.1);
way["amenity"~"sports_centre|gymnasium"](44.5,14.0,45.6,15.1);
);
out center tags;"""
def fetch_osm():
    """Send the OVERPASS query to the Overpass interpreter and return parsed JSON.

    The query is POSTed form-encoded under the ``data`` field with the
    script's User-Agent. The call blocks for up to 120 seconds. Network
    and JSON decoding errors propagate to the caller.
    """
    payload = urllib.parse.urlencode({'data': OVERPASS}).encode()
    request = urllib.request.Request(
        'https://overpass-api.de/api/interpreter',
        data=payload,
        headers={
            'User-Agent': UA,
            'Content-Type': 'application/x-www-form-urlencoded',
        },
    )
    with urllib.request.urlopen(request, timeout=120) as response:
        body = response.read()
        return json.loads(body.decode())
def normalize(s):
    """Reduce a venue name to a comparable core for fuzzy matching.

    Steps, in order:
      1. ``None``/empty becomes ``''``; text is lower-cased.
      2. Croatian diacritics are folded to ASCII (``š`` -> ``s``, ``đ`` -> ``d``)
         so the ASCII stop phrases below can actually match real names.
      3. Punctuation becomes whitespace and whitespace runs are collapsed.
      4. Generic facility words/phrases are removed as whole tokens, longest
         phrase first, so ``gradska sportska dvorana`` is removed in full and
         short tokens such as ``os`` never bite into longer words.

    Returns the remaining text, single-spaced and stripped (possibly ``''``).
    """
    import unicodedata

    stop_phrases = (
        'sportska dvorana', 'gradska sportska dvorana', 'multifunkcionalna dvorana',
        'sportski centar', 'gradski stadion', 'sportski kompleks', 'srednja skola',
        'srednje skole', 'osnovna skola', 'os', 'ss', 'dr', 'prof',
        'centar', 'stadion', 'dvorana', 'bazen', 'bazeni',
    )

    s = (s or '').lower()
    # 'đ' has no Unicode decomposition, so NFKD alone would leave it untouched.
    s = s.replace('đ', 'd')
    s = ''.join(ch for ch in unicodedata.normalize('NFKD', s)
                if not unicodedata.combining(ch))
    s = re.sub(r'[^\w\s]', ' ', s, flags=re.UNICODE)
    # Collapse before phrase matching so "sportska  dvorana" still matches.
    s = re.sub(r'\s+', ' ', s).strip()

    # Longest alternative first: regex alternation is ordered, and a shorter
    # phrase must not pre-empt a longer one that contains it.
    alternatives = '|'.join(
        re.escape(p) for p in sorted(stop_phrases, key=len, reverse=True))
    s = re.sub(r'\b(?:' + alternatives + r')\b', ' ', s)

    return re.sub(r'\s+', ' ', s).strip()
def similarity(a, b):
    """Return the SequenceMatcher ratio (0.0-1.0) of the two normalized names."""
    left = normalize(a)
    right = normalize(b)
    matcher = SequenceMatcher(None, left, right)
    return matcher.ratio()
def haversine(lat1, lng1, lat2, lng2):
    """Great-circle distance in meters between two points given in degrees.

    Uses the haversine formula on a sphere of radius 6,371,000 m.
    """
    import math

    earth_radius_m = 6371000
    phi_a = math.radians(lat1)
    phi_b = math.radians(lat2)
    delta_phi = math.radians(lat2 - lat1)
    delta_lambda = math.radians(lng2 - lng1)

    hav = (math.sin(delta_phi / 2) ** 2
           + math.cos(phi_a) * math.cos(phi_b) * math.sin(delta_lambda / 2) ** 2)
    return 2 * earth_radius_m * math.asin(math.sqrt(hav))
def main():
    """Match DB venues against named OSM sports features and update coordinates.

    Workflow:
      1. Fetch OSM elements with fetch_osm() and keep those that have a name
         and a position (``lat``/``lon`` on the element, else its ``center``).
      2. Load every row of pgz_sport.sportski_objekti.
      3. For each row with a non-empty normalized name, pick the OSM element
         with the highest name similarity; the score gets +0.05 when the
         row's city appears in the OSM name or its ``addr:city`` tag.
      4. Skip rows whose best score is below 0.55, and rows whose stored
         position is already within 100 m of the match. Otherwise write the
         OSM coordinates and commit immediately, so earlier updates survive
         a later failure.

    Prints one line per update and a final summary. The database connection
    is closed even if an error interrupts the run.

    NOTE(review): there is no upper bound on how far a row may be moved, so a
    same-named venue in another town can win on name alone; review the
    printed updates before trusting them.
    """
    min_similarity = 0.55  # best score below this is treated as "no match"
    city_boost = 0.05      # bonus when the row's city appears on the OSM side
    min_move_m = 100       # rows already this close to the match are left alone

    print('Fetching OSM sports data...')
    osm = fetch_osm()
    elems = []
    for e in osm.get('elements', []):
        t = e.get('tags', {})
        name = t.get('name')
        if not name:
            continue
        center = e.get('center') or {}
        # Explicit None checks: a coordinate of exactly 0.0 is still a value.
        lat = e.get('lat') if e.get('lat') is not None else center.get('lat')
        lon = e.get('lon') if e.get('lon') is not None else center.get('lon')
        if lat is None or lon is None:
            continue
        # Normalize each OSM name once here instead of once per DB row.
        elems.append({'name': name, 'norm': normalize(name),
                      'lat': lat, 'lng': lon, 'tags': t})
    print(f'OSM named sports elements: {len(elems)}')

    conn = psycopg2.connect(**PG)
    try:
        cur = conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor)
        try:
            cur.execute("SELECT id, naziv, grad, lat, lng FROM pgz_sport.sportski_objekti ORDER BY id")
            objekti = cur.fetchall()
            print(f'DB objekti: {len(objekti)}')

            updated = 0
            skipped_close = 0
            skipped_low = 0
            for o in objekti:
                nname = normalize(o['naziv'])
                if not nname:
                    continue
                grad = (o['grad'] or '').lower()

                # Find the best fuzzy match among all OSM elements.
                best = None
                best_sim = 0.0
                for e in elems:
                    sim = SequenceMatcher(None, nname, e['norm']).ratio()
                    if grad and (grad in e['name'].lower() or
                                 grad in (e['tags'].get('addr:city', '') or '').lower()):
                        sim += city_boost
                    if sim > best_sim:
                        best_sim = sim
                        best = e

                # Require a strong match.
                if best is None or best_sim < min_similarity:
                    skipped_low += 1
                    continue

                # Leave rows that are already close to the matched element.
                if o['lat'] is not None and o['lng'] is not None:
                    d = haversine(float(o['lat']), float(o['lng']),
                                  best['lat'], best['lng'])
                    if d < min_move_m:
                        skipped_close += 1
                        continue

                print(f" #{o['id']:3} {o['naziv'][:55]:55} -> '{best['name'][:40]}' sim={best_sim:.2f} {best['lat']:.6f},{best['lng']:.6f}")
                cur.execute("UPDATE pgz_sport.sportski_objekti SET lat=%s, lng=%s WHERE id=%s",
                            (best['lat'], best['lng'], o['id']))
                conn.commit()
                updated += 1

            print(f'\nUpdated: {updated} Already-close: {skipped_close} Low-similarity: {skipped_low}')
        finally:
            cur.close()
    finally:
        conn.close()


if __name__ == '__main__':
    main()