Merge pull request #496 from yuyangshu/reverse-geocoder

whosonfirst neighbourhood reverse geocoder
This commit is contained in:
Al B
2025-02-08 12:18:52 -05:00
committed by GitHub
3 changed files with 105 additions and 39 deletions

View File

@@ -1,5 +1,6 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
import argparse import argparse
import fnmatch
import logging import logging
import operator import operator
import os import os
@@ -24,7 +25,7 @@ from geodata.osm.components import osm_address_components
from geodata.osm.definitions import osm_definitions from geodata.osm.definitions import osm_definitions
from geodata.osm.extract import parse_osm, osm_type_and_id, NODE, WAY, RELATION, OSM_NAME_TAGS from geodata.osm.extract import parse_osm, osm_type_and_id, NODE, WAY, RELATION, OSM_NAME_TAGS
from geodata.polygons.index import * from geodata.polygons.index import *
from geodata.polygons.reverse_geocode import QuattroshapesReverseGeocoder, OSMCountryReverseGeocoder, OSMReverseGeocoder from geodata.polygons.reverse_geocode import OSMCountryReverseGeocoder, OSMReverseGeocoder
from geodata.statistics.tf_idf import IDFIndex from geodata.statistics.tf_idf import IDFIndex
@@ -212,6 +213,9 @@ class NeighborhoodReverseGeocoder(RTreePolygonIndex):
(ClickThatHood > OSM > Quattroshapes) to provide unified point-in-polygon (ClickThatHood > OSM > Quattroshapes) to provide unified point-in-polygon
tests for neighborhoods. The properties vary by source but each has tests for neighborhoods. The properties vary by source but each has
source has least a "name" key which in practice is what we care about. source has least a "name" key which in practice is what we care about.
Quattroshapes data is no longer accessible and has been replaced by
WhosOnFirst.
''' '''
PRIORITIES_FILENAME = 'priorities.json' PRIORITIES_FILENAME = 'priorities.json'
@@ -224,9 +228,9 @@ class NeighborhoodReverseGeocoder(RTreePolygonIndex):
source_priorities = { source_priorities = {
'osm': 0, # Best names/polygons, same coordinate system 'osm': 0, # Best names/polygons, same coordinate system
'osm_cth': 1, # Prefer the OSM names if possible 'osm_cth': 1, # Prefer the OSM names if possible
'clickthathood': 2, # Better names/polygons than Quattroshapes 'clickthathood': 2, # Better names/polygons than WhosOnFirst
'osm_quattro': 3, # Prefer OSM names matched with Quattroshapes polygon 'osm_wof': 3, # Prefer OSM names matched with WhosOnFirst polygon
'quattroshapes': 4, # Good results in some countries/areas 'wof': 4, # Replacement of Quattroshapes
} }
level_priorities = { level_priorities = {
@@ -235,7 +239,7 @@ class NeighborhoodReverseGeocoder(RTreePolygonIndex):
} }
regex_replacements = [ regex_replacements = [
# Paris arrondissements, listed like "PARIS-1ER-ARRONDISSEMENT" in Quqttroshapes # Paris arrondissements, listed like "PARIS-1ER-ARRONDISSEMENT" in Quattroshapes
(re.compile('^paris-(?=[\d])', re.I), ''), (re.compile('^paris-(?=[\d])', re.I), ''),
(re.compile('^prague(?= [\d]+$)', re.I), 'Praha'), (re.compile('^prague(?= [\d]+$)', re.I), 'Praha'),
] ]
@@ -254,7 +258,7 @@ class NeighborhoodReverseGeocoder(RTreePolygonIndex):
return doc return doc
@classmethod @classmethod
def create_from_osm_and_quattroshapes(cls, filename, quattroshapes_dir, country_rtree_dir, osm_rtree_dir, osm_neighborhood_borders_file, output_dir): def create_from_osm_and_wof(cls, filename, wof_dir, country_rtree_dir, osm_rtree_dir, osm_neighborhood_borders_file, output_dir):
''' '''
Given an OSM file (planet or some other bounds) containing neighborhoods Given an OSM file (planet or some other bounds) containing neighborhoods
as points (some suburbs have boundaries) as points (some suburbs have boundaries)
@@ -270,17 +274,14 @@ class NeighborhoodReverseGeocoder(RTreePolygonIndex):
logger = logging.getLogger('neighborhoods') logger = logging.getLogger('neighborhoods')
qs_scratch_dir = os.path.join(quattroshapes_dir, 'qs_neighborhoods')
ensure_dir(qs_scratch_dir)
logger.info('Creating ClickThatHood neighborhoods') logger.info('Creating ClickThatHood neighborhoods')
cth = ClickThatHoodReverseGeocoder.create_neighborhoods_index() cth = ClickThatHoodReverseGeocoder.create_neighborhoods_index()
logger.info('Creating OSM neighborhoods') logger.info('Creating OSM neighborhoods')
osmn = OSMNeighborhoodReverseGeocoder.create_neighborhoods_index(osm_neighborhood_borders_file) osmn = OSMNeighborhoodReverseGeocoder.create_neighborhoods_index(osm_neighborhood_borders_file)
logger.info('Creating Quattroshapes neighborhoods') logger.info('Creating WhosOnFirst neighborhoods')
qs = QuattroshapesNeighborhoodsReverseGeocoder.create_neighborhoods_index(quattroshapes_dir, qs_scratch_dir) wof = WhosOnFirstNeighborhoodsReverseGeocoder.create_neighborhoods_index(wof_dir, os.path.join(wof_dir, "wof_neighbourhoods"))
country_rtree = OSMCountryReverseGeocoder.load(country_rtree_dir) country_rtree = OSMCountryReverseGeocoder.load(country_rtree_dir)
@@ -292,7 +293,7 @@ class NeighborhoodReverseGeocoder(RTreePolygonIndex):
char_scripts = get_chars_by_script() char_scripts = get_chars_by_script()
for idx in (cth, qs, osmn): for idx in (cth, wof, osmn):
for i in xrange(idx.i): for i in xrange(idx.i):
props = idx.get_properties(i) props = idx.get_properties(i)
name = props.get('name') name = props.get('name')
@@ -317,11 +318,11 @@ class NeighborhoodReverseGeocoder(RTreePolygonIndex):
index.index_polygon(poly.context) index.index_polygon(poly.context)
index.add_polygon(poly.context, props) index.add_polygon(poly.context, props)
qs.matched = [False] * qs.i wof.matched = [False] * wof.i
cth.matched = [False] * cth.i cth.matched = [False] * cth.i
logger.info('Matching OSM points to neighborhood polygons') logger.info('Matching OSM points to neighborhood polygons')
# Parse OSM and match neighborhood/suburb points to Quattroshapes/ClickThatHood polygons # Parse OSM and match neighborhood/suburb points to ClickThatHood/WhosOnFirst polygons
num_polys = 0 num_polys = 0
for element_id, attrs, deps in parse_osm(filename): for element_id, attrs, deps in parse_osm(filename):
try: try:
@@ -359,14 +360,14 @@ class NeighborhoodReverseGeocoder(RTreePolygonIndex):
for name_key in OSM_NAME_TAGS: for name_key in OSM_NAME_TAGS:
osm_names.extend([v for k, v in six.iteritems(attrs) if k.startswith('{}:'.format(name_key))]) osm_names.extend([v for k, v in six.iteritems(attrs) if k.startswith('{}:'.format(name_key))])
for idx in (cth, qs): for idx in (cth, wof):
candidates = idx.get_candidate_polygons(lat, lon, return_all=True) candidates = idx.get_candidate_polygons(lat, lon, return_all=True)
if candidates: if candidates:
max_sim = 0.0 max_sim = 0.0
arg_max = None arg_max = None
normalized_qs_names = {} normalized_wof_names = {}
for osm_name in osm_names: for osm_name in osm_names:
@@ -375,16 +376,16 @@ class NeighborhoodReverseGeocoder(RTreePolygonIndex):
for i in candidates: for i in candidates:
props = idx.get_properties(i) props = idx.get_properties(i)
name = normalized_qs_names.get(i) name = normalized_wof_names.get(i)
if not name: if not name:
name = props.get('name') name = props.get('name')
if not name: if not name:
continue continue
for pattern, repl in cls.regex_replacements: for pattern, repl in cls.regex_replacements:
name = pattern.sub(repl, name) name = pattern.sub(repl, name)
normalized_qs_names[i] = name normalized_wof_names[i] = name
if is_neighborhood and idx is qs and props.get(QuattroshapesReverseGeocoder.LEVEL) != 'neighborhood': if is_neighborhood and idx is wof and props.get(WhosOnFirstNeighborhoodsReverseGeocoder.LEVEL) != 'neighborhood':
continue continue
if not contains_ideographs: if not contains_ideographs:
@@ -446,7 +447,7 @@ class NeighborhoodReverseGeocoder(RTreePolygonIndex):
continue continue
source = 'osm_cth' source = 'osm_cth'
else: else:
level = props.get(QuattroshapesReverseGeocoder.LEVEL, None) level = props.get(WhosOnFirstNeighborhoodsReverseGeocoder.LEVEL, None)
source = 'osm_quattro' source = 'osm_quattro'
if level == 'neighborhood': if level == 'neighborhood':
@@ -467,7 +468,7 @@ class NeighborhoodReverseGeocoder(RTreePolygonIndex):
if num_polys % 1000 == 0 and num_polys > 0: if num_polys % 1000 == 0 and num_polys > 0:
logger.info('did {} neighborhoods'.format(num_polys)) logger.info('did {} neighborhoods'.format(num_polys))
for idx, source in ((cth, 'clickthathood'), (qs, 'quattroshapes')): for idx, source in ((cth, 'clickthathood'), (wof, 'whosonfirst')):
for i in xrange(idx.i): for i in xrange(idx.i):
props = idx.get_properties(i) props = idx.get_properties(i)
poly = idx.get_polygon(i) poly = idx.get_polygon(i)
@@ -482,7 +483,7 @@ class NeighborhoodReverseGeocoder(RTreePolygonIndex):
props['polygon_type'] = 'local_admin' props['polygon_type'] = 'local_admin'
else: else:
continue continue
elif props.get(QuattroshapesReverseGeocoder.LEVEL, None) == 'neighborhood': elif props.get(WhosOnFirstNeighborhoodsReverseGeocoder.LEVEL, None) == 'neighborhood':
component = AddressFormatter.SUBURB component = AddressFormatter.SUBURB
name = props.get('name') name = props.get('name')
if not name: if not name:
@@ -525,28 +526,67 @@ class NeighborhoodReverseGeocoder(RTreePolygonIndex):
return sorted(candidates, key=self.priority) return sorted(candidates, key=self.priority)
class QuattroshapesNeighborhoodsReverseGeocoder(GeohashPolygonIndex, QuattroshapesReverseGeocoder): class WhosOnFirstNeighborhoodsReverseGeocoder(GeohashPolygonIndex):
persistent_polygons = False persistent_polygons = False
cache_size = None cache_size = None
NAME = "wof:name"
ASCII_NAME = "gn:asciiname"
LEVEL = "wof:placetype"
GEONAMES_ID = "gn:geonameid"
SUPERSEDED = "wof:superseded_by"
NEIGHBOURHOOD_TYPES = {"localadmin", "locality", "neighbourhood"}
POLYGON_TYPES = {"Polygon", "MultiPolygon"}
@classmethod @classmethod
def create_neighborhoods_index(cls, quattroshapes_dir, def is_valid_neighbourhood(cls, geojson):
output_dir, validity = not geojson["properties"].get(cls.SUPERSEDED)
index_filename=None, for field in {cls.NAME, cls.ASCII_NAME, cls.GEONAMES_ID}:
polys_filename=DEFAULT_POLYS_FILENAME): validity &= geojson["properties"].get(field)
local_admin_filename = os.path.join(quattroshapes_dir, cls.LOCAL_ADMIN_FILENAME) return validity and geojson["properties"].get(cls.LEVEL) in cls.NEIGHBOURHOOD_TYPES and geojson["geometry"]["type"] in cls.POLYGON_TYPES
neighborhoods_filename = os.path.join(quattroshapes_dir, cls.NEIGHBORHOODS_FILENAME)
return cls.create_from_shapefiles([local_admin_filename, neighborhoods_filename], @classmethod
output_dir, index_filename=index_filename, def create_neighborhoods_index(cls, wof_dir, output_dir, index_filename=None):
polys_filename=polys_filename) index = cls(save_dir=output_dir, index_filename=index_filename)
for root, dirnames, filenames in os.walk(wof_dir):
for fname in fnmatch.filter(filenames, "*.geojson"):
with open(os.path.join(root, fname)) as f:
geojson = json.load(f)
if cls.is_valid_neighbourhood(geojson):
properties = {
"name": safe_decode(geojson["properties"].get(cls.NAME)),
"name_en": safe_decode(geojson["properties"].get(cls.ASCII_NAME)),
"qs_level": safe_decode(geojson["properties"].get(cls.LEVEL)),
"gn_id": safe_decode(geojson["properties"].get(cls.GEONAMES_ID))
}
poly_type = geojson['geometry']['type']
if poly_type == 'Polygon':
poly = cls.to_polygon(geojson['geometry']['coordinates'][0])
index.index_polygon(poly)
poly = index.simplify_polygon(poly)
index.add_polygon(poly, dict(geojson['properties']), include_only_properties=include_props)
elif poly_type == 'MultiPolygon':
polys = []
for coords in geojson['geometry']['coordinates']:
poly = cls.to_polygon(coords[0])
polys.append(poly)
index.index_polygon(poly)
multi_poly = index.simplify_polygon(MultiPolygon(polys))
index.add_polygon(multi_poly, dict(geojson['properties']))
return index
if __name__ == '__main__': if __name__ == '__main__':
# Handle argument parsing here # Handle argument parsing here
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
parser.add_argument('-q', '--quattroshapes-dir', parser.add_argument('-w', '--wof-dir',
help='Path to quattroshapes dir') help='Path to WhosOnFirst dir')
parser.add_argument('-a', '--osm-admin-rtree-dir', parser.add_argument('-a', '--osm-admin-rtree-dir',
help='Path to OSM admin rtree dir') help='Path to OSM admin rtree dir')
@@ -567,16 +607,16 @@ if __name__ == '__main__':
logging.basicConfig(level=logging.INFO) logging.basicConfig(level=logging.INFO)
args = parser.parse_args() args = parser.parse_args()
if args.osm_neighborhoods_file and args.quattroshapes_dir and args.osm_admin_rtree_dir and args.country_rtree_dir and args.osm_neighborhood_borders_file: if args.osm_neighborhoods_file and args.wof_dir and args.osm_admin_rtree_dir and args.country_rtree_dir and args.osm_neighborhood_borders_file:
index = NeighborhoodReverseGeocoder.create_from_osm_and_quattroshapes( index = NeighborhoodReverseGeocoder.create_from_osm_and_wof(
args.osm_neighborhoods_file, args.osm_neighborhoods_file,
args.quattroshapes_dir, args.wof_dir,
args.country_rtree_dir, args.country_rtree_dir,
args.osm_admin_rtree_dir, args.osm_admin_rtree_dir,
args.osm_neighborhood_borders_file, args.osm_neighborhood_borders_file,
args.out_dir args.out_dir
) )
else: else:
parser.error('Must specify quattroshapes dir or osm admin borders file') parser.error('Must specify whosonfirst dir, osm-admin, country rtrees, and osm-neighbourhood-border file')
index.save() index.save()

View File

@@ -226,7 +226,6 @@ class PolygonIndex(object):
@classmethod @classmethod
def create_from_geojson_files(cls, inputs, output_dir, def create_from_geojson_files(cls, inputs, output_dir,
index_filename=None, index_filename=None,
polys_filename=DEFAULT_POLYS_FILENAME,
include_only_properties=None): include_only_properties=None):
index = cls(save_dir=output_dir, index_filename=index_filename or cls.INDEX_FILENAME) index = cls(save_dir=output_dir, index_filename=index_filename or cls.INDEX_FILENAME)
for input_file in inputs: for input_file in inputs:

View File

@@ -0,0 +1,27 @@
import os
import pycountry
import subprocess
import sys
this_dir = os.path.realpath(os.path.dirname(__file__))
sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir)))
WOF_DATA_ADMIN_REPO_URL_PREFIX = "https://github.com/whosonfirst-data/whosonfirst-data/"
WOF_DATA_ADMIN_REPO_PREFIX = "whosonfirst-data-admin-"
def download_wof_data_admin(wof_dir):
for country_object in pycountry.countries:
repo_name = WOF_DATA_ADMIN_REPO_PREFIX + country_object.alpha2.lower()
repo_location = os.path.join(wof_dir, repo_name)
if not os.path.exists(repo_location):
subprocess.call(["git", "clone", WOF_DATA_ADMIN_REPO_URL_PREFIX + repo_name])
if __name__ == '__main__':
if len(sys.argv) < 2:
sys.exit('Usage: python download_whosonfirst_data.py wof_dir')
download_wof_data_admin(sys.argv[1])