From a7b3403bf7c67bcc844a0ec4b74122dc4cab2408 Mon Sep 17 00:00:00 2001 From: Al Date: Thu, 14 Jul 2016 17:03:23 -0400 Subject: [PATCH] [osm] Neighborhood index now uses the OSM admin R-tree to ensure that nodes which are classified as non-suburbs (e.g. Santa Monica is a city), but which may still match one of the neighborhood data sets, are excluded from the final neighborhood index --- .../geodata/neighborhoods/reverse_geocode.py | 51 +++++++++++++++++-- 1 file changed, 47 insertions(+), 4 deletions(-) diff --git a/scripts/geodata/neighborhoods/reverse_geocode.py b/scripts/geodata/neighborhoods/reverse_geocode.py index e0751a7b..4fcc7265 100644 --- a/scripts/geodata/neighborhoods/reverse_geocode.py +++ b/scripts/geodata/neighborhoods/reverse_geocode.py @@ -11,6 +11,7 @@ import sys this_dir = os.path.realpath(os.path.dirname(__file__)) sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir))) +from geodata.address_formatting.formatter import AddressFormatter from geodata.coordinates.conversion import latlon_to_decimal from geodata.encoding import safe_decode from geodata.file_utils import ensure_dir, download_file @@ -20,7 +21,8 @@ from geodata.names.deduping import NameDeduper from geodata.osm.definitions import osm_definitions from geodata.osm.extract import parse_osm, osm_type_and_id, NODE, WAY, RELATION, OSM_NAME_TAGS from geodata.polygons.index import * -from geodata.polygons.reverse_geocode import QuattroshapesReverseGeocoder +from geodata.polygons.language_polys import LanguagePolygonIndex +from geodata.polygons.reverse_geocode import QuattroshapesReverseGeocoder, OSMReverseGeocoder from geodata.statistics.tf_idf import IDFIndex @@ -233,7 +235,7 @@ class NeighborhoodReverseGeocoder(RTreePolygonIndex): return doc @classmethod - def create_from_osm_and_quattroshapes(cls, filename, quattroshapes_dir, output_dir, scratch_dir=SCRATCH_DIR): + def create_from_osm_and_quattroshapes(cls, filename, quattroshapes_dir, language_rtree_dir, osm_rtree_dir, 
output_dir, scratch_dir=SCRATCH_DIR): ''' Given an OSM file (planet or some other bounds) containing neighborhoods as points (some suburbs have boundaries) @@ -259,6 +261,10 @@ class NeighborhoodReverseGeocoder(RTreePolygonIndex): logger.info('Creating ClickThatHood neighborhoods') cth = ClickThatHoodReverseGeocoder.create_neighborhoods_index() + language_rtree = LanguagePolygonIndex.load(language_rtree_dir) + + osm_admin_rtree = OSMReverseGeocoder.load(osm_rtree_dir) + logger.info('Creating IDF index') idf = IDFIndex() @@ -300,7 +306,16 @@ class NeighborhoodReverseGeocoder(RTreePolygonIndex): props['type'] = id_type props['id'] = element_id - possible_neighborhood = osm_definitions.meets_definition(attrs, osm_defintiions.LOCALITY) + possible_neighborhood = osm_definitions.meets_definition(attrs, osm_defintions.NEIGHBORHOOD) + + country, candidate_languages, language_props = language_rtree.country_and_languages(lat, lon) + component_name = None + for k, v in six.iteritems(attrs): + component_name = osm_address_components.get_component(country, k, v) + if component_name: + break + else: + component_name = None ranks = [] osm_names = [] @@ -313,6 +328,26 @@ class NeighborhoodReverseGeocoder(RTreePolygonIndex): for name_key in OSM_NAME_TAGS: osm_names.extend([v for k, v in six.iteritems(attrs) if k.startswith('{}:'.format(name_key))]) + if component_name and component_name != AddressFormatter.SUBURB: + existing_osm_candidates = osm_admin_rtree.get_candidate_polygons(lat, lon) + skip_node = False + for i in existing_osm_candidates: + props = osm_admin_rtree.get_properties(i) + containing_component = None + name = props.get('name') + # Only exact name matches here since we're comparing OSM to OSM + if name and name == attrs.get('name'): + continue + + containing_component = osm_components.get_first_component(country, props) + + if containing_component != AddressFormatter.SUBURB: + skip_node = True + break + # Skip this element + if skip_node: + continue + for idx in 
(cth, qs): candidates = idx.get_candidate_polygons(lat, lon, return_all=True) @@ -439,6 +474,12 @@ if __name__ == '__main__': parser.add_argument('-q', '--quattroshapes-dir', help='Path to quattroshapes dir') + parser.add_argument('-a', '--osm-admin-rtree-dir', + help='Path to OSM admin rtree dir') + + parser.add_argument('-l', '--language-rtree-dir', + help='Path to language rtree dir') + parser.add_argument('-n', '--osm-neighborhoods-file', help='Path to OSM neighborhoods file (no dependencies, .osm format)') @@ -449,10 +490,12 @@ if __name__ == '__main__': logging.basicConfig(level=logging.INFO) args = parser.parse_args() - if args.osm_neighborhoods_file and args.quattroshapes_dir: + if args.osm_neighborhoods_file and args.quattroshapes_dir and args.osm_admin_rtree_dir and args.language_rtree_dir: index = NeighborhoodReverseGeocoder.create_from_osm_and_quattroshapes( args.osm_neighborhoods_file, args.quattroshapes_dir, + args.language_rtree_dir, + args.osm_rtree_dir, args.out_dir ) else: