diff --git a/scripts/geodata/graph/scc.py b/scripts/geodata/graph/scc.py index e1db8707..7c0d5a1f 100644 --- a/scripts/geodata/graph/scc.py +++ b/scripts/geodata/graph/scc.py @@ -2,6 +2,13 @@ VISIT, VISIT_EDGE, POST_VISIT = range(3) def strongly_connected_components(graph): + ''' + Find strongly connected components in a graph using iterative + depth-first search. + + Based on: + http://code.activestate.com/recipes/578507-strongly-connected-components-of-a-directed-graph/ + ''' identified = set() stack = [] index = {} diff --git a/scripts/geodata/names/deduping.py b/scripts/geodata/names/deduping.py index a5a0e358..00911bbc 100644 --- a/scripts/geodata/names/deduping.py +++ b/scripts/geodata/names/deduping.py @@ -70,7 +70,7 @@ class NameDeduper(object): tokens2 = cls.content_tokens(s2) if not cls.possible_match(tokens1, tokens2): - return max(cls.dupe_threshold - 0.1, 0.0) + return 0.0 tokens1_only = [t for t, c in tokens1] tokens2_only = [t for t, c in tokens2] @@ -87,7 +87,7 @@ class NameDeduper(object): tokens2 = cls.content_tokens(s2) if not cls.possible_match(tokens1, tokens2): - return max(cls.dupe_threshold - 0.1, 0.0) + return 0.0 tokens1_only = [t for t, c in tokens1] tokens2_only = [t for t, c in tokens2] diff --git a/scripts/geodata/polygons/reverse_geocode.py b/scripts/geodata/polygons/reverse_geocode.py index cd92e3a9..b852644c 100644 --- a/scripts/geodata/polygons/reverse_geocode.py +++ b/scripts/geodata/polygons/reverse_geocode.py @@ -12,18 +12,33 @@ Usage: ''' import argparse import logging +import operator import os +import re +import requests +import shutil +import subprocess import sys +import tempfile from functools import partial this_dir = os.path.realpath(os.path.dirname(__file__)) sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir))) +from geodata.coordinates.conversion import latlon_to_decimal from geodata.encoding import safe_decode +from geodata.file_utils import ensure_dir +from geodata.i18n.unicode_properties import get_chars_by_script +from geodata.i18n.word_breaks import ideographic_scripts +from geodata.names.similarity import soft_tfidf +from geodata.osm.extract import parse_osm, OSM_NAME_TAGS from geodata.osm.osm_admin_boundaries import OSMAdminPolygonReader from geodata.polygons.index import * +from geodata.statistics.tf_idf import IDFIndex +from postal.text.tokenize import tokenize, token_types +from postal.text.normalize import * decode_latin1 = partial(safe_decode, encoding='latin1') @@ -35,7 +50,345 @@ def str_id(v): return str(v) -class QuattroshapesReverseGeocoder(RTreePolygonIndex): +class NeighborhoodDeduper(NameDeduper): + # Lossless conversions only + replacements = { + u'saint': u'st', + u'and': u'&', + } + + discriminative_words = set([ + # Han numbers + u'〇', u'一', + u'二', u'三', + u'四', u'五', + u'六', u'七', + u'八', u'九', + u'十', u'百', + u'千', u'万', + u'億', u'兆', + u'京', u'第', + + # Roman numerals + u'i', u'ii', + u'iii', u'iv', + u'v', u'vi', + u'vii', u'viii', + u'ix', u'x', + u'xi', u'xii', + u'xiii', u'xiv', + u'xv', u'xvi', + u'xvii', u'xviii', + u'xix', u'xx', + + # English directionals + u'north', u'south', + u'east', u'west', + u'northeast', u'northwest', + u'southeast', u'southwest', + + # Spanish, Portguese and Italian directionals + u'norte', u'nord', u'sur', u'sul', u'sud', + u'est', u'este', u'leste', u'oeste', u'ovest', + + # New in various languages + u'new', + u'nova', + u'novo', + u'nuevo', + u'nueva', + u'nuovo', + u'nuova', + + # Qualifiers + u'heights', + u'hills', + + u'upper', u'lower', + u'little', u'great', + + u'park', + u'parque', + + u'village', + + ]) + + stopwords = set([ + u'cp', + u'de', + u'la', + u'urbanizacion', + u'do', + u'da', + u'dos', + u'del', + u'community', + u'bairro', + u'barrio', + u'le', + u'el', + u'mah', + u'раион', + u'vila', + u'villa', + u'kampung', + u'ahupua`a', + + ]) + + +class NeighborhoodReverseGeocoder(RTreePolygonIndex): + ''' + Neighborhoods are very important in cities like NYC, SF, Chicago, London + and many others. We want the address parser to be trained with addresses + that sufficiently capture variations in address patterns, including + neighborhoods. Quattroshapes neighborhood data (in the US at least) + is not great in terms of names, mostly becasue GeoPlanet has so many + incorrect names. The neighborhoods project, also known as Zetashapes + has very accurate polygons with correct names, but only for a handful + of cities. OSM usually lists neighborhoods and some other local admin + areas like boroughs as points rather than polygons. + + This index merges all of the above data sets in prioritized order + (Zetashapes > OSM > Quattroshapes) to provide unified point-in-polygon + tests for neighborhoods. The properties vary by source but each has + source has least a "name" key which in practice is what we care about. + ''' + NEIGHBORHOODS_REPO = 'https://github.com/blackmad/neighborhoods' + + SCRATCH_DIR = '/tmp' + + DUPE_THRESHOLD = 0.9 + + source_priorities = { + 'zetashapes': 0, # Best names/polygons + 'osm': 1, # OSM names with Quattroshapes/Zetashapes polygon + 'quattroshapes': 2, # Good results in some countries/areas + } + + level_priorities = { + 'neighborhood': 0, + 'local_admin': 1, + } + + @classmethod + def clone_repo(cls, path): + subprocess.check_call(['rm', '-rf', path]) + subprocess.check_call(['git', 'clone', cls.NEIGHBORHOODS_REPO, path]) + + @classmethod + def create_zetashapes_neighborhoods_index(cls): + scratch_dir = cls.SCRATCH_DIR + repo_path = os.path.join(scratch_dir, 'neighborhoods') + cls.clone_repo(repo_path) + + neighborhoods_dir = os.path.join(scratch_dir, 'neighborhoods', 'index') + ensure_dir(neighborhoods_dir) + + index = GeohashPolygonIndex() + + have_geonames = set() + is_neighborhood = set() + + for filename in os.listdir(repo_path): + path = os.path.join(repo_path, filename) + base_name = filename.split('.')[0].split('gn-')[-1] + if filename.endswith('.geojson') and filename.startswith('gn-'): + have_geonames.add(base_name) + elif filename.endswith('metadata.json'): + data = json.load(open(os.path.join(repo_path, filename))) + if data.get('neighborhoodNoun', [None])[0] in (None, 'rione'): + is_neighborhood.add(base_name) + + for filename in os.listdir(repo_path): + if not filename.endswith('.geojson'): + continue + base_name = filename.rsplit('.geojson')[0] + if base_name in have_geonames: + f = open(os.path.join(repo_path, 'gn-{}'.format(filename))) + elif base_name in is_neighborhood: + f = open(os.path.join(repo_path, filename)) + else: + continue + index.add_geojson_like_file(json.load(f)['features']) + + return index + + @classmethod + def count_words(cls, s): + doc = defaultdict(int) + for t, c in NeighborhoodDeduper.content_tokens(s): + doc[t] += 1 + return doc + + @classmethod + def create_from_osm_and_quattroshapes(cls, filename, quattroshapes_dir, output_dir, scratch_dir=SCRATCH_DIR): + ''' + Given an OSM file (planet or some other bounds) containing neighborhoods + as points (some suburbs have boundaries) + + and their dependencies, create an R-tree index for coarse-grained + reverse geocoding. + + Note: the input file is expected to have been created using + osmfilter. Use fetch_osm_address_data.sh for planet or copy the + admin borders commands if using other geometries. + ''' + index = cls(save_dir=output_dir) + + ensure_dir(scratch_dir) + + logger = logging.getLogger('neighborhoods') + + qs_scratch_dir = os.path.join(scratch_dir, 'qs_neighborhoods') + ensure_dir(qs_scratch_dir) + logger.info('Creating Quattroshapes neighborhoods') + + qs = QuattroshapesReverseGeocoder.create_neighborhoods_index(quattroshapes_dir, qs_scratch_dir) + logger.info('Creating Zetashapes neighborhoods') + zs = cls.create_zetashapes_neighborhoods_index() + + logger.info('Creating IDF index') + idf = IDFIndex() + + char_scripts = get_chars_by_script() + + for idx in (zs, qs): + for i, (props, poly) in enumerate(idx.polygons): + name = props.get('name') + if name is not None: + doc = cls.count_words(name) + idf.update(doc) + + for key, attrs, deps in parse_osm(filename): + for k, v in attrs.iteritems(): + if any((k.startswith(name_key) for name_key in OSM_NAME_TAGS)): + doc = cls.count_words(v) + idf.update(doc) + + qs.matched = [False] * qs.i + zs.matched = [False] * zs.i + + logger.info('Matching OSM points to neighborhood polygons') + # Parse OSM and match neighborhood/suburb points to Quattroshapes/Zetashapes polygons + num_polys = 0 + for node_id, attrs, deps in parse_osm(filename): + try: + lat, lon = latlon_to_decimal(attrs['lat'], attrs['lon']) + except ValueError: + continue + + osm_name = attrs.get('name') + if not osm_name: + continue + + is_neighborhood = attrs.get('place') == 'neighbourhood' + + ranks = [] + osm_names = [] + + for key in OSM_NAME_TAGS: + name = attrs.get(key) + if name: + osm_names.append(name) + + for name_key in OSM_NAME_TAGS: + osm_names.extend([v for k, v in attrs.iteritems() if k.startswith('{}:'.format(name_key))]) + + for idx in (zs, qs): + candidates = idx.get_candidate_polygons(lat, lon, all_levels=True) + + if candidates: + max_sim = 0.0 + arg_max = None + + for osm_name in osm_names: + + contains_ideographs = any(((char_scripts[ord(c)] or '').lower() in ideographic_scripts + for c in safe_decode(osm_name))) + + for i in candidates: + props, poly = idx.polygons[i] + name = props.get('name') + if not name: + continue + + level = props.get(QuattroshapesReverseGeocoder.LEVEL) + if is_neighborhood and level != 'neighborhood': + continue + + if not contains_ideographs: + sim = NeighborhoodDeduper.compare(osm_name, name, idf) + else: + # Many Han/Hangul characters are common, shouldn't use IDF + sim = NeighborhoodDeduper.compare_ideographs(osm_name, name) + + if sim > max_sim: + max_sim = sim + arg_max = (max_sim, props, poly.context, idx, i) + + if arg_max: + ranks.append(arg_max) + + ranks.sort(key=operator.itemgetter(0), reverse=True) + if ranks and ranks[0][0] >= cls.DUPE_THRESHOLD: + score, props, poly, idx, i = ranks[0] + matches.append((score, [safe_decode(attrs[k]) for k in OSM_NAME_TAGS if k in attrs], safe_decode(props['name']))) + + if idx is zs: + attrs['polygon_type'] = 'neighborhood' + else: + level = props.get(QuattroshapesReverseGeocoder.LEVEL, None) + if level == 'neighborhood': + attrs['polygon_type'] = 'neighborhood' + else: + attrs['polygon_type'] = 'local_admin' + + attrs['source'] = 'osm' + index.index_polygon(poly) + index.add_polygon(poly, attrs) + idx.matched[i] = True + else: + if ranks: + score, props, poly, idx, i = ranks[0] + top_matches.append((ranks[0][0], [safe_decode(attrs[k]) for k in OSM_NAME_TAGS if k in attrs], safe_decode(props['name']))) + + non_match.append((node_id, attrs)) + + num_polys += 1 + if num_polys % 1000 == 0 and num_polys > 0: + logger.info('did {} neighborhoods'.format(num_polys)) + + for idx, source in ((zs, 'zetashapes'), (qs, 'quattroshapes')): + for i, (props, poly) in enumerate(idx.polygons): + if idx.matched[i]: + continue + props['source'] = source + if idx is zs or props.get(QuattroshapesReverseGeocoder.LEVEL, None) == 'neighborhood': + props['polygon_type'] = 'neighborhood' + else: + props['polygon_type'] = 'local_admin' + index.index_polygon(poly.context) + index.add_polygon(poly.context, props) + + return index + + def priority(self, i): + props, p = self.polygons[i] + return (self.level_priorities[props['polygon_type']], self.source_priorities[props['source']]) + + def get_candidate_polygons(self, lat, lon): + candidates = super(NeighborhoodReverseGeocoder, self).get_candidate_polygons(lat, lon) + return sorted(candidates, key=self.priority) + + +class QuattroshapesReverseGeocoder(GeohashPolygonIndex): + ''' + Quattroshapes polygons, for levels up to localities, are relatively + accurate and provide concordance with GeoPlanet and in some cases + GeoNames (which is used in other parts of this project). + ''' COUNTRIES_FILENAME = 'qs_adm0.shp' ADMIN1_FILENAME = 'qs_adm1.shp' ADMIN1_REGION_FILENAME = 'qs_adm1_region.shp' @@ -134,18 +487,23 @@ class QuattroshapesReverseGeocoder(RTreePolygonIndex): def create_from_shapefiles(cls, input_files, output_dir, - index_filename=DEFAULT_INDEX_FILENAME, - polys_filename=DEFAULT_POLYS_FILENAME): + index_filename=None, + polys_filename=DEFAULT_POLYS_FILENAME, + use_all_props=False): index = cls(save_dir=output_dir, index_filename=index_filename) - i = 0 - for input_file in input_files: f = fiona.open(input_file) filename = os.path.split(input_file)[-1] - include_props = cls.polygon_properties.get(filename) + + aliases = cls.polygon_properties.get(filename) + + if not use_all_props: + include_props = aliases + else: + include_props = None for rec in f: if not rec or not rec.get('geometry') or 'type' not in rec['geometry']: @@ -156,24 +514,23 @@ class QuattroshapesReverseGeocoder(RTreePolygonIndex): if filename == cls.NEIGHBORHOODS_FILENAME: properties['qs_level'] = 'neighborhood' - if include_props: - have_all_props = False - for k, (prop, func) in include_props.iteritems(): - v = properties.get(prop, None) - if v is not None: - try: - properties[k] = func(v) - except Exception: - break - else: - have_all_props = True - if not have_all_props or not properties.get(cls.NAME): - continue + have_all_props = False + for k, (prop, func) in aliases.iteritems(): + v = properties.get(prop, None) + if v is not None: + try: + properties[k] = func(v) + except Exception: + break + else: + have_all_props = True + if not have_all_props or not properties.get(cls.NAME): + continue poly_type = rec['geometry']['type'] if poly_type == 'Polygon': poly = Polygon(rec['geometry']['coordinates'][0]) - index.index_polygon(i, poly) + index.index_polygon(poly) poly = index.simplify_polygon(poly) index.add_polygon(poly, dict(rec['properties']), include_only_properties=include_props) elif poly_type == 'MultiPolygon': @@ -181,20 +538,19 @@ class QuattroshapesReverseGeocoder(RTreePolygonIndex): for coords in rec['geometry']['coordinates']: poly = Polygon(coords[0]) polys.append(poly) - index.index_polygon(i, poly) + index.index_polygon(poly) multi_poly = index.simplify_polygon(MultiPolygon(polys)) index.add_polygon(multi_poly, dict(rec['properties']), include_only_properties=include_props) else: continue - i += 1 return index @classmethod def create_with_quattroshapes(cls, quattroshapes_dir, output_dir, - index_filename=DEFAULT_INDEX_FILENAME, + index_filename=None, polys_filename=DEFAULT_POLYS_FILENAME): admin0_filename = os.path.join(quattroshapes_dir, cls.COUNTRIES_FILENAME) @@ -212,16 +568,41 @@ class QuattroshapesReverseGeocoder(RTreePolygonIndex): output_dir, index_filename=index_filename, polys_filename=polys_filename) + @classmethod + def create_neighborhoods_index(cls, quattroshapes_dir, + output_dir, + index_filename=None, + polys_filename=DEFAULT_POLYS_FILENAME): + local_admin_filename = os.path.join(quattroshapes_dir, cls.LOCAL_ADMIN_FILENAME) + neighborhoods_filename = os.path.join(quattroshapes_dir, cls.NEIGHBORHOODS_FILENAME) + return cls.create_from_shapefiles([local_admin_filename, neighborhoods_filename], + output_dir, index_filename=index_filename, + polys_filename=polys_filename) + def sort_level(self, i): props, p = self.polygons[i] return self.sort_levels.get(props[self.LEVEL], 0) - def get_candidate_polygons(self, lat, lon): - candidates = OrderedDict.fromkeys(self.index.intersection((lon, lat, lon, lat))).keys() + def get_candidate_polygons(self, lat, lon, all_levels=False): + candidates = super(QuattroshapesReverseGeocoder, self).get_candidate_polygons(lat, lon, all_levels=all_levels) return sorted(candidates, key=self.sort_level, reverse=True) class OSMReverseGeocoder(RTreePolygonIndex): + ''' + OSM has among the best, most carefully-crafted, accurate administrative + polygons in the business in addition to using local language naming + conventions which is desirable for creating a truly multilingual address + parser. + + The storage of these polygons is byzantine. See geodata.osm.osm_admin_boundaries + for more details. + + Suffice to say, this reverse geocoder builds an R-tree index on OSM planet + in a reasonable amount of memory using arrays of C integers and binary search + for the dependency lookups and Tarjan's algorithm for finding strongly connected + components to stitch together the polygons. + ''' include_property_patterns = set([ 'name', 'name:*', @@ -237,27 +618,6 @@ class OSMReverseGeocoder(RTreePolygonIndex): 'wikipedia:*', ]) - @classmethod - def fix_polygon(cls, poly): - ''' - Coerce to valid polygon - ''' - if not poly.is_valid: - poly = poly.buffer(0) - if not poly.is_valid: - return None - return poly - - @classmethod - def to_polygon(cls, coords): - ''' - Create shapely polygon from list of coordinate tuples if valid - ''' - if not coords or len(coords) < 3: - return None - poly = Polygon(coords) - return cls.fix_polygon(poly) - @classmethod def create_from_osm_file(cls, filename, output_dir, index_filename=DEFAULT_INDEX_FILENAME, @@ -269,7 +629,7 @@ class OSMReverseGeocoder(RTreePolygonIndex): Note: the input file is expected to have been created using osmfilter. Use fetch_osm_address_data.sh for planet or copy the - admin borders commands if using other geometries. + admin borders commands if using other bounds. ''' index = cls(save_dir=output_dir, index_filename=index_filename) @@ -282,8 +642,6 @@ class OSMReverseGeocoder(RTreePolygonIndex): logger = logging.getLogger('osm.reverse_geocode') - polygon_index = 0 - for relation_id, props, outer_polys, inner_polys in polygons: props = {k: v for k, v in props.iteritems() if k in cls.include_property_patterns or (':' in k and '{}:*'.format(k.split(':', 1)[0]) in cls.include_property_patterns)} @@ -298,10 +656,10 @@ class OSMReverseGeocoder(RTreePolygonIndex): if poly is None or not poly.bounds or len(poly.bounds) != 4: continue if poly.type != 'MultiPolygon': - index.index_polygon(polygon_index, poly) + index.index_polygon(poly) else: for p in poly: - index.index_polygon(polygon_index, p) + index.index_polygon(p) else: multi = [] inner = [] @@ -331,11 +689,11 @@ class OSMReverseGeocoder(RTreePolygonIndex): continue # R-tree only stores the bounding box, so add the whole polygon if poly.type != 'MultiPolygon': - index.index_polygon(polygon_index, poly) + index.index_polygon(poly) multi.append(poly) else: for p in poly: - index.index_polygon(polygon_index, p) + index.index_polygon(p) multi.extend(poly) if len(multi) > 1: @@ -351,6 +709,7 @@ class OSMReverseGeocoder(RTreePolygonIndex): return index + if __name__ == '__main__': # Handle argument parsing here parser = argparse.ArgumentParser() @@ -361,15 +720,25 @@ if __name__ == '__main__': parser.add_argument('-a', '--osm-admin-file', help='Path to OSM borders file (with dependencies, .osm format)') + parser.add_argument('-n', '--osm-neighborhoods-file', + help='Path to OSM neighborhoods file (no dependencies, .osm format)') + parser.add_argument('-o', '--out-dir', default=os.getcwd(), help='Output directory') args = parser.parse_args() - if args.quattroshapes_dir: + if args.osm_admin_file: + index = OSMReverseGeocoder.create_from_osm_file(args.osm_admin_file, args.out_dir, + quattroshapes_dir=args.quattroshapes_dir) + elif args.osm_neighborhoods_filename and args.quattroshapes_dir: + index = NeighborhoodReverseGeocoder.create_from_osm_and_quattroshapes( + args.osm_neighorhoods_file, + args.quattroshapes_dir, + args.out_dir + ) + elif args.quattroshapes_dir: index = QuattroshapesReverseGeocoder.create_with_quattroshapes(args.quattroshapes_dir, args.out_dir) - elif args.osm_admin_file: - index = OSMReverseGeocoder.create_from_osm_file(args.osm_admin_file, args.out_dir) else: parser.error('Must specify quattroshapes dir or osm admin borders file')