From b3dcfe610b3646679f911ff3bd292d9421338a85 Mon Sep 17 00:00:00 2001 From: Yuyang Shu Date: Tue, 5 May 2020 11:50:30 +1000 Subject: [PATCH] whosonfirst neighbourhood reverse geocoder --- .../geodata/neighborhoods/reverse_geocode.py | 116 ++++++++++++------ scripts/geodata/polygons/index.py | 1 - .../whosonfirst/download_wof_admin_polygon.py | 27 ++++ 3 files changed, 105 insertions(+), 39 deletions(-) create mode 100644 scripts/geodata/whosonfirst/download_wof_admin_polygon.py diff --git a/scripts/geodata/neighborhoods/reverse_geocode.py b/scripts/geodata/neighborhoods/reverse_geocode.py index 4c699813..ec8083a1 100644 --- a/scripts/geodata/neighborhoods/reverse_geocode.py +++ b/scripts/geodata/neighborhoods/reverse_geocode.py @@ -1,5 +1,6 @@ # -*- coding: utf-8 -*- import argparse +import fnmatch import logging import operator import os @@ -24,7 +25,7 @@ from geodata.osm.components import osm_address_components from geodata.osm.definitions import osm_definitions from geodata.osm.extract import parse_osm, osm_type_and_id, NODE, WAY, RELATION, OSM_NAME_TAGS from geodata.polygons.index import * -from geodata.polygons.reverse_geocode import QuattroshapesReverseGeocoder, OSMCountryReverseGeocoder, OSMReverseGeocoder +from geodata.polygons.reverse_geocode import OSMCountryReverseGeocoder, OSMReverseGeocoder from geodata.statistics.tf_idf import IDFIndex @@ -212,6 +213,9 @@ class NeighborhoodReverseGeocoder(RTreePolygonIndex): (ClickThatHood > OSM > Quattroshapes) to provide unified point-in-polygon tests for neighborhoods. The properties vary by source but each has source has least a "name" key which in practice is what we care about. + + Quattroshapes data is no longer accessible and has been replaced by + WhosOnFirst. ''' PRIORITIES_FILENAME = 'priorities.json' @@ -224,9 +228,9 @@ class NeighborhoodReverseGeocoder(RTreePolygonIndex): source_priorities = { 'osm': 0, # Best names/polygons, same coordinate system 'osm_cth': 1, # Prefer the OSM names if possible - 'clickthathood': 2, # Better names/polygons than Quattroshapes - 'osm_quattro': 3, # Prefer OSM names matched with Quattroshapes polygon - 'quattroshapes': 4, # Good results in some countries/areas + 'clickthathood': 2, # Better names/polygons than WhosOnFirst + 'osm_wof': 3, # Prefer OSM names matched with WhosOnFirst polygon + 'wof': 4, # Replacement of Quattroshapes } level_priorities = { @@ -235,7 +239,7 @@ class NeighborhoodReverseGeocoder(RTreePolygonIndex): } regex_replacements = [ - # Paris arrondissements, listed like "PARIS-1ER-ARRONDISSEMENT" in Quqttroshapes + # Paris arrondissements, listed like "PARIS-1ER-ARRONDISSEMENT" in Quattroshapes (re.compile('^paris-(?=[\d])', re.I), ''), (re.compile('^prague(?= [\d]+$)', re.I), 'Praha'), ] @@ -254,7 +258,7 @@ class NeighborhoodReverseGeocoder(RTreePolygonIndex): return doc @classmethod - def create_from_osm_and_quattroshapes(cls, filename, quattroshapes_dir, country_rtree_dir, osm_rtree_dir, osm_neighborhood_borders_file, output_dir): + def create_from_osm_and_wof(cls, filename, wof_dir, country_rtree_dir, osm_rtree_dir, osm_neighborhood_borders_file, output_dir): ''' Given an OSM file (planet or some other bounds) containing neighborhoods as points (some suburbs have boundaries) @@ -270,17 +274,14 @@ class NeighborhoodReverseGeocoder(RTreePolygonIndex): logger = logging.getLogger('neighborhoods') - qs_scratch_dir = os.path.join(quattroshapes_dir, 'qs_neighborhoods') - ensure_dir(qs_scratch_dir) - logger.info('Creating ClickThatHood neighborhoods') cth = ClickThatHoodReverseGeocoder.create_neighborhoods_index() logger.info('Creating OSM neighborhoods') osmn = OSMNeighborhoodReverseGeocoder.create_neighborhoods_index(osm_neighborhood_borders_file) - logger.info('Creating Quattroshapes neighborhoods') - qs = QuattroshapesNeighborhoodsReverseGeocoder.create_neighborhoods_index(quattroshapes_dir, qs_scratch_dir) + logger.info('Creating WhosOnFirst neighborhoods') + wof = WhosOnFirstNeighborhoodsReverseGeocoder.create_neighborhoods_index(wof_dir, os.path.join(wof_dir, "wof_neighbourhoods")) country_rtree = OSMCountryReverseGeocoder.load(country_rtree_dir) @@ -292,7 +293,7 @@ class NeighborhoodReverseGeocoder(RTreePolygonIndex): char_scripts = get_chars_by_script() - for idx in (cth, qs, osmn): + for idx in (cth, wof, osmn): for i in xrange(idx.i): props = idx.get_properties(i) name = props.get('name') @@ -317,11 +318,11 @@ class NeighborhoodReverseGeocoder(RTreePolygonIndex): index.index_polygon(poly.context) index.add_polygon(poly.context, props) - qs.matched = [False] * qs.i + wof.matched = [False] * wof.i cth.matched = [False] * cth.i logger.info('Matching OSM points to neighborhood polygons') - # Parse OSM and match neighborhood/suburb points to Quattroshapes/ClickThatHood polygons + # Parse OSM and match neighborhood/suburb points to ClickThatHood/WhosOnFirst polygons num_polys = 0 for element_id, attrs, deps in parse_osm(filename): try: @@ -359,14 +360,14 @@ class NeighborhoodReverseGeocoder(RTreePolygonIndex): for name_key in OSM_NAME_TAGS: osm_names.extend([v for k, v in six.iteritems(attrs) if k.startswith('{}:'.format(name_key))]) - for idx in (cth, qs): + for idx in (cth, wof): candidates = idx.get_candidate_polygons(lat, lon, return_all=True) if candidates: max_sim = 0.0 arg_max = None - normalized_qs_names = {} + normalized_wof_names = {} for osm_name in osm_names: @@ -375,16 +376,16 @@ class NeighborhoodReverseGeocoder(RTreePolygonIndex): for i in candidates: props = idx.get_properties(i) - name = normalized_qs_names.get(i) + name = normalized_wof_names.get(i) if not name: name = props.get('name') if not name: continue for pattern, repl in cls.regex_replacements: name = pattern.sub(repl, name) - normalized_qs_names[i] = name + normalized_wof_names[i] = name - if is_neighborhood and idx is qs and props.get(QuattroshapesReverseGeocoder.LEVEL) != 'neighborhood': + if is_neighborhood and idx is wof and props.get(WhosOnFirstNeighborhoodsReverseGeocoder.LEVEL) != 'neighborhood': continue if not contains_ideographs: @@ -446,7 +447,7 @@ class NeighborhoodReverseGeocoder(RTreePolygonIndex): continue source = 'osm_cth' else: - level = props.get(QuattroshapesReverseGeocoder.LEVEL, None) + level = props.get(WhosOnFirstNeighborhoodsReverseGeocoder.LEVEL, None) source = 'osm_quattro' if level == 'neighborhood': @@ -467,7 +468,7 @@ class NeighborhoodReverseGeocoder(RTreePolygonIndex): if num_polys % 1000 == 0 and num_polys > 0: logger.info('did {} neighborhoods'.format(num_polys)) - for idx, source in ((cth, 'clickthathood'), (qs, 'quattroshapes')): + for idx, source in ((cth, 'clickthathood'), (wof, 'whosonfirst')): for i in xrange(idx.i): props = idx.get_properties(i) poly = idx.get_polygon(i) @@ -482,7 +483,7 @@ class NeighborhoodReverseGeocoder(RTreePolygonIndex): props['polygon_type'] = 'local_admin' else: continue - elif props.get(QuattroshapesReverseGeocoder.LEVEL, None) == 'neighborhood': + elif props.get(WhosOnFirstNeighborhoodsReverseGeocoder.LEVEL, None) == 'neighborhood': component = AddressFormatter.SUBURB name = props.get('name') if not name: @@ -525,28 +526,67 @@ class NeighborhoodReverseGeocoder(RTreePolygonIndex): return sorted(candidates, key=self.priority) -class QuattroshapesNeighborhoodsReverseGeocoder(GeohashPolygonIndex, QuattroshapesReverseGeocoder): +class WhosOnFirstNeighborhoodsReverseGeocoder(GeohashPolygonIndex): persistent_polygons = False cache_size = None + NAME = "wof:name" + ASCII_NAME = "gn:asciiname" + LEVEL = "wof:placetype" + GEONAMES_ID = "gn:geonameid" + SUPERSEDED = "wof:superseded_by" + + NEIGHBOURHOOD_TYPES = {"localadmin", "locality", "neighbourhood"} + POLYGON_TYPES = {"Polygon", "MultiPolygon"} + @classmethod - def create_neighborhoods_index(cls, quattroshapes_dir, - output_dir, - index_filename=None, - polys_filename=DEFAULT_POLYS_FILENAME): - local_admin_filename = os.path.join(quattroshapes_dir, cls.LOCAL_ADMIN_FILENAME) - neighborhoods_filename = os.path.join(quattroshapes_dir, cls.NEIGHBORHOODS_FILENAME) - return cls.create_from_shapefiles([local_admin_filename, neighborhoods_filename], - output_dir, index_filename=index_filename, - polys_filename=polys_filename) + def is_valid_neighbourhood(cls, geojson): + validity = not geojson["properties"].get(cls.SUPERSEDED) + for field in {cls.NAME, cls.ASCII_NAME, cls.GEONAMES_ID}: + validity &= geojson["properties"].get(field) + return validity and geojson["properties"].get(cls.LEVEL) in cls.NEIGHBOURHOOD_TYPES and geojson["geometry"]["type"] in cls.POLYGON_TYPES + + @classmethod + def create_neighborhoods_index(cls, wof_dir, output_dir, index_filename=None): + index = cls(save_dir=output_dir, index_filename=index_filename) + + for root, dirnames, filenames in os.walk(wof_dir): + for fname in fnmatch.filter(filenames, "*.geojson"): + with open(os.path.join(root, fname)) as f: + geojson = json.load(f) + if cls.is_valid_neighbourhood(geojson): + properties = { + "name": safe_decode(geojson["properties"].get(cls.NAME)), + "name_en": safe_decode(geojson["properties"].get(cls.ASCII_NAME)), + "qs_level": safe_decode(geojson["properties"].get(cls.LEVEL)), + "gn_id": safe_decode(geojson["properties"].get(cls.GEONAMES_ID)) + } + + poly_type = geojson['geometry']['type'] + if poly_type == 'Polygon': + poly = cls.to_polygon(geojson['geometry']['coordinates'][0]) + index.index_polygon(poly) + poly = index.simplify_polygon(poly) + index.add_polygon(poly, dict(geojson['properties']), include_only_properties=include_props) + elif poly_type == 'MultiPolygon': + polys = [] + for coords in geojson['geometry']['coordinates']: + poly = cls.to_polygon(coords[0]) + polys.append(poly) + index.index_polygon(poly) + + multi_poly = index.simplify_polygon(MultiPolygon(polys)) + index.add_polygon(multi_poly, dict(geojson['properties'])) + + return index if __name__ == '__main__': # Handle argument parsing here parser = argparse.ArgumentParser() - parser.add_argument('-q', '--quattroshapes-dir', - help='Path to quattroshapes dir') + parser.add_argument('-w', '--wof-dir', + help='Path to WhosOnFirst dir') parser.add_argument('-a', '--osm-admin-rtree-dir', help='Path to OSM admin rtree dir') @@ -567,16 +607,16 @@ if __name__ == '__main__': logging.basicConfig(level=logging.INFO) args = parser.parse_args() - if args.osm_neighborhoods_file and args.quattroshapes_dir and args.osm_admin_rtree_dir and args.country_rtree_dir and args.osm_neighborhood_borders_file: - index = NeighborhoodReverseGeocoder.create_from_osm_and_quattroshapes( + if args.osm_neighborhoods_file and args.wof_dir and args.osm_admin_rtree_dir and args.country_rtree_dir and args.osm_neighborhood_borders_file: + index = NeighborhoodReverseGeocoder.create_from_osm_and_wof( args.osm_neighborhoods_file, - args.quattroshapes_dir, + args.wof_dir, args.country_rtree_dir, args.osm_admin_rtree_dir, args.osm_neighborhood_borders_file, args.out_dir ) else: - parser.error('Must specify quattroshapes dir or osm admin borders file') + parser.error('Must specify whosonfirst dir, osm-admin, country rtrees, and osm-neighbourhood-border file') index.save() diff --git a/scripts/geodata/polygons/index.py b/scripts/geodata/polygons/index.py index 59010800..41521f81 100644 --- a/scripts/geodata/polygons/index.py +++ b/scripts/geodata/polygons/index.py @@ -226,7 +226,6 @@ class PolygonIndex(object): @classmethod def create_from_geojson_files(cls, inputs, output_dir, index_filename=None, - polys_filename=DEFAULT_POLYS_FILENAME, include_only_properties=None): index = cls(save_dir=output_dir, index_filename=index_filename or cls.INDEX_FILENAME) for input_file in inputs: diff --git a/scripts/geodata/whosonfirst/download_wof_admin_polygon.py b/scripts/geodata/whosonfirst/download_wof_admin_polygon.py new file mode 100644 index 00000000..51d029d1 --- /dev/null +++ b/scripts/geodata/whosonfirst/download_wof_admin_polygon.py @@ -0,0 +1,27 @@ +import os +import pycountry +import subprocess +import sys + + +this_dir = os.path.realpath(os.path.dirname(__file__)) +sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir))) + + +WOF_DATA_ADMIN_REPO_URL_PREFIX = "https://github.com/whosonfirst-data/whosonfirst-data/" +WOF_DATA_ADMIN_REPO_PREFIX = "whosonfirst-data-admin-" + + +def download_wof_data_admin(wof_dir): + for country_object in pycountry.countries: + repo_name = WOF_DATA_ADMIN_REPO_PREFIX + country_object.alpha2.lower() + repo_location = os.path.join(wof_dir, repo_name) + if not os.path.exists(repo_location): + subprocess.call(["git", "clone", WOF_DATA_ADMIN_REPO_URL_PREFIX + repo_name]) + + +if __name__ == '__main__': + if len(sys.argv) < 2: + sys.exit('Usage: python download_whosonfirst_data.py wof_dir') + + download_wof_data_admin(sys.argv[1])