CC1 R3B-P3 — geocoding precision (Crikvenica + OSM cross-check)
- New scripts/geocode_v3_osm.py: matches DB objekti against OSM Overpass sports facilities
- Applied 53 OSM updates, then reverted bad cross-city matches to hand-curated coords
- Crikvenica venues now precise (Gradska dvorana, SS Antun Barac, Stadion, Sport+ Centar)
- Atletska dvorana Luciano Sušanj fixed to Kantrida
- Skate park Delta, Boulder dvorana, Boćarski Podvežica reverted from wrong matches
- Google Places API not available (project disabled), Overpass + curated fallback used

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Executable
+128
@@ -0,0 +1,128 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
geocode_v3_osm.py — fuzzy-match objekti against OSM sports facilities
|
||||
|
||||
Strategy:
|
||||
1) Pull all named sports leisure objects from OSM via Overpass API in PGŽ bounds.
|
||||
2) For each pgz_sport.sportski_objekti row, compute a similarity match against OSM names.
|
||||
3) When a confident match is found AND new coords differ from current by >100m,
|
||||
update the DB.
|
||||
"""
|
||||
import os, time, json, urllib.parse, urllib.request
|
||||
import psycopg2, psycopg2.extras
|
||||
import re
|
||||
from difflib import SequenceMatcher
|
||||
|
||||
# PostgreSQL connection keyword arguments passed to psycopg2.connect().
# Every value can be overridden through the matching PG_* environment
# variable; the literals below are the fallbacks used when it is unset.
PG = {
    'host': os.environ.get('PG_HOST', '10.10.0.2'),
    'port': int(os.environ.get('PG_PORT', '6432')),
    'dbname': os.environ.get('PG_DB', 'rinet_v3'),
    'user': os.environ.get('PG_USER', 'rinet'),
    'password': os.environ.get('PG_PASS', ''),
}
|
||||
|
||||
# Value of the User-Agent header sent with the Overpass request in fetch_osm().
UA = 'pgz-sport/2.0 (dradulic@outlook.com)'
|
||||
|
||||
OVERPASS = """[out:json][timeout:60];
|
||||
(
|
||||
node["leisure"~"sports_centre|sports_hall|stadium|pitch|swimming_pool|ice_rink"](44.5,14.0,45.6,15.1);
|
||||
way["leisure"~"sports_centre|sports_hall|stadium|pitch|swimming_pool|ice_rink"](44.5,14.0,45.6,15.1);
|
||||
node["sport"]["name"](44.5,14.0,45.6,15.1);
|
||||
way["sport"]["name"](44.5,14.0,45.6,15.1);
|
||||
node["amenity"~"sports_centre|gymnasium"](44.5,14.0,45.6,15.1);
|
||||
way["amenity"~"sports_centre|gymnasium"](44.5,14.0,45.6,15.1);
|
||||
);
|
||||
out center tags;"""
|
||||
|
||||
def fetch_osm():
    """POST the OVERPASS query to the Overpass interpreter and return the parsed JSON.

    The query is sent form-encoded under the ``data`` key with the module's
    User-Agent. Network and JSON decoding errors propagate to the caller.
    """
    form_body = urllib.parse.urlencode({'data': OVERPASS}).encode()
    request_headers = {
        'User-Agent': UA,
        'Content-Type': 'application/x-www-form-urlencoded',
    }
    request = urllib.request.Request(
        'https://overpass-api.de/api/interpreter',
        data=form_body,
        headers=request_headers,
    )
    with urllib.request.urlopen(request, timeout=120) as response:
        raw = response.read()
    return json.loads(raw.decode())
|
||||
|
||||
# Generic Croatian venue words, school prefixes and titles that carry no
# identifying information and would otherwise dominate the fuzzy score.
# Written without diacritics because normalize() folds them before matching.
_GENERIC_PHRASES = (
    'gradska sportska dvorana', 'multifunkcionalna dvorana', 'sportska dvorana',
    'sportski centar', 'gradski stadion', 'sportski kompleks', 'srednja skola',
    'srednje skole', 'osnovna skola', 'os', 'ss', 'dr', 'prof',
    'centar', 'stadion', 'dvorana', 'bazeni', 'bazen',
)

# One whole-word alternation, longest phrase first, so that e.g.
# "gradska sportska dvorana" is removed as a unit instead of leaving "gradska"
# behind, and "os" never matches inside another word such as "kos".
_GENERIC_RE = re.compile(
    r'\b(?:'
    + '|'.join(re.escape(p) for p in sorted(_GENERIC_PHRASES, key=len, reverse=True))
    + r')\b'
)


def normalize(s):
    """Return a comparison key for a venue name.

    The name is lower-cased, diacritics are folded to plain letters
    (so "OŠ" and "OS", "škola" and "skola" compare equal), punctuation is
    replaced by spaces, generic venue words are removed as whole words, and
    whitespace is collapsed.

    Args:
        s: Venue name; ``None`` or an empty string is accepted.

    Returns:
        The normalized name, or ``''`` when nothing identifying is left.
    """
    import unicodedata

    text = (s or '').lower()
    # "đ" has no Unicode decomposition, so NFKD alone would leave it untouched.
    text = text.replace('đ', 'd')
    text = ''.join(
        ch for ch in unicodedata.normalize('NFKD', text)
        if not unicodedata.combining(ch)
    )
    text = re.sub(r'[^\w\s]', ' ', text, flags=re.UNICODE)
    # Collapse first so multi-word phrases are separated by exactly one space.
    text = re.sub(r'\s+', ' ', text).strip()
    text = _GENERIC_RE.sub(' ', text)
    return re.sub(r'\s+', ' ', text).strip()
|
||||
|
||||
def similarity(a, b):
    """Return a 0.0-1.0 fuzzy similarity between two venue names.

    Both names are passed through normalize() and compared with
    difflib.SequenceMatcher.

    Args:
        a: First venue name (may be ``None``).
        b: Second venue name (may be ``None``).

    Returns:
        The SequenceMatcher ratio of the normalized names, or 0.0 when either
        name normalizes to an empty string.
    """
    norm_a = normalize(a)
    norm_b = normalize(b)
    # SequenceMatcher reports 1.0 for two empty sequences, which would make two
    # purely generic names ("Stadion" vs "Dvorana") look like a perfect match.
    if not norm_a or not norm_b:
        return 0.0
    return SequenceMatcher(None, norm_a, norm_b).ratio()
|
||||
|
||||
def haversine(lat1, lng1, lat2, lng2):
    """Return the great-circle distance in meters between two points.

    Args:
        lat1, lng1: Latitude and longitude of the first point, in degrees.
        lat2, lng2: Latitude and longitude of the second point, in degrees.

    Returns:
        Distance in meters on a sphere of radius 6,371,000 m.
    """
    import math

    R = 6371000
    p1 = math.radians(lat1)
    p2 = math.radians(lat2)
    dp = math.radians(lat2 - lat1)
    dl = math.radians(lng2 - lng1)
    a = math.sin(dp/2)**2 + math.cos(p1)*math.cos(p2)*math.sin(dl/2)**2
    # Round-off can push `a` a few ulps outside [0, 1] for (near-)antipodal
    # points, which would make sqrt/asin raise ValueError.
    a = min(1.0, max(0.0, a))
    return 2*R*math.asin(math.sqrt(a))
|
||||
|
||||
def _named_osm_elements(osm):
    """Return the OSM elements that have a name and coordinates.

    Each returned dict holds the original name, its normalized form and
    lower-cased variants precomputed once, so the matching loop does not
    redo that work for every database row.
    """
    elems = []
    for e in osm.get('elements', []):
        tags = e.get('tags', {})
        name = tags.get('name')
        if not name:
            continue
        # Nodes carry lat/lon directly; ways only have the "center" that the
        # query's `out center` adds.
        center = e.get('center', {})
        lat = e.get('lat')
        if lat is None:
            lat = center.get('lat')
        lon = e.get('lon')
        if lon is None:
            lon = center.get('lon')
        if lat is None or lon is None:
            continue
        elems.append({
            'name': name,
            'norm': normalize(name),
            'name_lc': name.lower(),
            'city_lc': (tags.get('addr:city', '') or '').lower(),
            'lat': lat,
            'lng': lon,
            'tags': tags,
        })
    return elems


def _best_match(nname, grad, elems):
    """Return ``(element, score)`` for the best-scoring OSM element, or ``(None, 0.0)``.

    The score is the SequenceMatcher ratio of the normalized names (the same
    quantity similarity() computes), plus 0.05 when the row's city appears in
    the OSM name or in its addr:city tag.
    """
    grad_lc = (grad or '').lower()
    best, best_sim = None, 0.0
    for e in elems:
        sim = SequenceMatcher(None, nname, e['norm']).ratio()
        if grad_lc and (grad_lc in e['name_lc'] or grad_lc in e['city_lc']):
            sim += 0.05
        if sim > best_sim:
            best_sim = sim
            best = e
    return best, best_sim


def main(*, min_similarity=0.55, min_shift_m=100.0):
    """Update sportski_objekti coordinates from fuzzy-matched OSM sports facilities.

    Fetches the OSM elements, finds the best name match for every row of
    pgz_sport.sportski_objekti and writes the OSM coordinates back when the
    match is strong enough and the row is not already close to that position.
    Each update is committed on its own, and a summary is printed at the end.

    Args:
        min_similarity: Minimum match score (including the city bonus) needed
            to accept a match.
        min_shift_m: Rows whose current coordinates are closer than this many
            meters to the matched position are left untouched.

    NOTE(review): matching is by name only, with a small city bonus, so venues
    with similar names in different towns can match each other. Review the
    printed updates before trusting them.
    """
    print('Fetching OSM sports data...')
    elems = _named_osm_elements(fetch_osm())
    print(f'OSM named sports elements: {len(elems)}')

    conn = psycopg2.connect(**PG)
    try:
        cur = conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor)
        try:
            cur.execute("SELECT id, naziv, grad, lat, lng FROM pgz_sport.sportski_objekti ORDER BY id")
            objekti = cur.fetchall()
            print(f'DB objekti: {len(objekti)}')

            updated = 0
            skipped_close = 0
            skipped_low = 0
            for o in objekti:
                nname = normalize(o['naziv'])
                # Rows whose name is empty or purely generic cannot be matched.
                if not nname:
                    continue
                best, best_sim = _best_match(nname, o['grad'], elems)
                # Require a strong match.
                if best is None or best_sim < min_similarity:
                    skipped_low += 1
                    continue
                # Leave rows alone that are already near the matched position.
                if o['lat'] and o['lng']:
                    d = haversine(float(o['lat']), float(o['lng']), best['lat'], best['lng'])
                    if d < min_shift_m:
                        skipped_close += 1
                        continue
                print(f" #{o['id']:3} {o['naziv'][:55]:55} -> '{best['name'][:40]}' sim={best_sim:.2f} {best['lat']:.6f},{best['lng']:.6f}")
                cur.execute("UPDATE pgz_sport.sportski_objekti SET lat=%s, lng=%s WHERE id=%s",
                            (best['lat'], best['lng'], o['id']))
                # Commit per row so earlier updates survive a later failure.
                conn.commit()
                updated += 1

            print(f'\nUpdated: {updated} Already-close: {skipped_close} Low-similarity: {skipped_low}')
        finally:
            cur.close()
    finally:
        # Release the connection even when the fetch or an update fails.
        conn.close()
|
||||
|
||||
# Run the geocoding pass only when the file is executed as a script, so that
# importing the module has no network or database side effects.
if __name__ == '__main__':
    main()
|
||||
Reference in New Issue
Block a user