From da36b718292cbc65e51a6d7b010002bc0e361583 Mon Sep 17 00:00:00 2001 From: Al Date: Mon, 5 Dec 2016 18:34:09 -0500 Subject: [PATCH] [addresses] adding new places index in OSM and OpenAddresses training data --- scripts/geodata/addresses/components.py | 2 +- scripts/geodata/openaddresses/formatter.py | 2 +- .../openaddresses_training_data.py | 11 ++++++++++- scripts/geodata/osm/formatter.py | 3 +++ .../geodata/osm/osm_address_training_data.py | 19 +++++++++++++++---- scripts/geodata/places/reverse_geocode.py | 2 +- 6 files changed, 31 insertions(+), 8 deletions(-) diff --git a/scripts/geodata/addresses/components.py b/scripts/geodata/addresses/components.py index 817e95f1..2b4b8d7a 100644 --- a/scripts/geodata/addresses/components.py +++ b/scripts/geodata/addresses/components.py @@ -62,7 +62,7 @@ class AddressComponents(object): prefixes like "London Borough of", pruning duplicates like "Antwerpen, Antwerpen, Antwerpen". Usage: - >>> components = AddressComponents(osm_admin_rtree, neighborhoods_rtree, buildings_rtree, subdivisions_rtree, quattroshapes_rtree, geonames) + >>> components = AddressComponents(osm_admin_rtree, neighborhoods_rtree, places_index, quattroshapes_rtree, geonames) >>> components.expand({'name': 'Hackney Empire'}, 51.54559, -0.05567) Returns (results vary because of randomness): diff --git a/scripts/geodata/openaddresses/formatter.py b/scripts/geodata/openaddresses/formatter.py index 8e2f1d00..97758e84 100644 --- a/scripts/geodata/openaddresses/formatter.py +++ b/scripts/geodata/openaddresses/formatter.py @@ -430,7 +430,7 @@ class OpenAddressesFormatter(object): unambiguous_city = False if add_osm_boundaries or AddressFormatter.CITY not in components: osm_components = self.components.osm_reverse_geocoded_components(latitude, longitude) - self.components.add_admin_boundaries(components, osm_components, country, language) + self.components.add_admin_boundaries(components, osm_components, country, language, latitude, longitude) categorized = self.components.categorized_osm_components(country, osm_components) for component, label in categorized: if label == AddressFormatter.CITY: diff --git a/scripts/geodata/openaddresses/openaddresses_training_data.py b/scripts/geodata/openaddresses/openaddresses_training_data.py index fae064fc..334b8c71 100644 --- a/scripts/geodata/openaddresses/openaddresses_training_data.py +++ b/scripts/geodata/openaddresses/openaddresses_training_data.py @@ -15,6 +15,7 @@ from geodata.addresses.components import AddressComponents from geodata.geonames.db import GeoNamesDB from geodata.polygons.language_polys import LanguagePolygonIndex from geodata.neighborhoods.reverse_geocode import NeighborhoodReverseGeocoder +from geodata.places.reverse_geocode import PlaceReverseGeocoder from geodata.polygons.reverse_geocode import OSMReverseGeocoder, OSMCountryReverseGeocoder, QuattroshapesReverseGeocoder @@ -47,6 +48,10 @@ if __name__ == '__main__': default=None, help='Quattroshapes reverse geocoder RTree directory') + parser.add_argument('--places-index-dir', + default=None, + help='Places index directory') + parser.add_argument('--geonames-db', default=None, help='GeoNames db file') @@ -76,6 +81,10 @@ if __name__ == '__main__': if args.neighborhoods_rtree_dir: neighborhoods_rtree = NeighborhoodReverseGeocoder.load(args.neighborhoods_rtree_dir) + places_index = None + if args.places_index_dir: + places_index = PlaceReverseGeocoder.load(args.places_index_dir) + quattroshapes_rtree = None if args.quattroshapes_rtree_dir: quattroshapes_rtree = QuattroshapesReverseGeocoder.load(args.quattroshapes_rtree_dir) @@ -86,7 +95,7 @@ if __name__ == '__main__': geonames = GeoNamesDB(args.geonames_db) if args.openaddresses_dir and args.format: - components = AddressComponents(osm_rtree, neighborhoods_rtree, quattroshapes_rtree, geonames) + components = AddressComponents(osm_rtree, neighborhoods_rtree, places_index, quattroshapes_rtree, geonames) oa_formatter = OpenAddressesFormatter(components, country_rtree, debug=args.debug) oa_formatter.build_training_data(args.openaddresses_dir, args.out_dir, tag_components=not args.untagged) diff --git a/scripts/geodata/osm/formatter.py b/scripts/geodata/osm/formatter.py index d9ac3dad..df4d9f76 100644 --- a/scripts/geodata/osm/formatter.py +++ b/scripts/geodata/osm/formatter.py @@ -646,6 +646,7 @@ class OSMAddressFormatter(object): address_components = {component_name: name} self.components.add_admin_boundaries(address_components, osm_components, country, UNKNOWN_LANGUAGE, + latitude, longitude, random_key=num_references > 1, language_suffix=language_suffix, drop_duplicate_city_names=False) @@ -689,6 +690,7 @@ class OSMAddressFormatter(object): for i in xrange(n): address_components = {component_name: name} self.components.add_admin_boundaries(address_components, osm_components, country, language, + latitude, longitude, random_key=is_default, language_suffix=language_suffix, drop_duplicate_city_names=False) @@ -725,6 +727,7 @@ class OSMAddressFormatter(object): for i in xrange(num_references / 2 if language == ENGLISH else min_references / 2): address_components = {component_name: name} self.components.add_admin_boundaries(address_components, osm_components, country, language, + latitude, longitude, random_key=False, non_local_language=language, language_suffix=language_suffix, diff --git a/scripts/geodata/osm/osm_address_training_data.py b/scripts/geodata/osm/osm_address_training_data.py index d33e6bbc..64fca501 100644 --- a/scripts/geodata/osm/osm_address_training_data.py +++ b/scripts/geodata/osm/osm_address_training_data.py @@ -65,6 +65,7 @@ from geodata.metro_stations.reverse_geocode import MetroStationReverseGeocoder from geodata.neighborhoods.reverse_geocode import NeighborhoodReverseGeocoder from geodata.osm.extract import * from geodata.osm.formatter import OSMAddressFormatter +from geodata.places.reverse_geocode import PlaceReverseGeocoder from geodata.polygons.language_polys import * from geodata.polygons.reverse_geocode import * from geodata.i18n.unicode_paths import DATA_DIR @@ -471,6 +472,10 @@ if __name__ == '__main__': default=None, help='Quattroshapes reverse geocoder RTree directory') + parser.add_argument('--places-index-dir', + default=None, + help='Places index directory') + parser.add_argument('--metro-stations-index-dir', default=None, help='Metro stations reverse geocoder directory') @@ -507,6 +512,10 @@ if __name__ == '__main__': if args.neighborhoods_rtree_dir: neighborhoods_rtree = NeighborhoodReverseGeocoder.load(args.neighborhoods_rtree_dir) + places_index = None + if args.places_index_dir: + places_index = PlaceReverseGeocoder.load(args.places_index_dir) + quattroshapes_rtree = None if args.quattroshapes_rtree_dir: quattroshapes_rtree = QuattroshapesReverseGeocoder.load(args.quattroshapes_rtree_dir) @@ -545,22 +554,24 @@ if __name__ == '__main__': parser.error('--quattroshapes-rtree-dir required for formatted addresses') elif geonames is None: parser.error('--geonames-db required for formatted addresses') + elif places_index is None: + parser.error('--places-index-dir required for formatted addresses') if args.address_file and args.format: - components = AddressComponents(osm_rtree, neighborhoods_rtree, quattroshapes_rtree, geonames) + components = AddressComponents(osm_rtree, neighborhoods_rtree, places_index, quattroshapes_rtree, geonames) osm_formatter = OSMAddressFormatter(components, country_rtree, subdivisions_rtree, buildings_rtree, metro_stations_index) osm_formatter.build_training_data(args.address_file, args.out_dir, tag_components=not args.untagged) if args.address_file and args.limited_addresses: - components = AddressComponents(osm_rtree, neighborhoods_rtree, quattroshapes_rtree, geonames) + components = AddressComponents(osm_rtree, neighborhoods_rtree, places_index, quattroshapes_rtree, geonames) osm_formatter = OSMAddressFormatter(components, country_rtree, subdivisions_rtree, buildings_rtree, metro_stations_index, splitter=u' ') osm_formatter.build_limited_training_data(args.address_file, args.out_dir) if args.place_nodes_file and args.format: - components = AddressComponents(osm_rtree, neighborhoods_rtree, quattroshapes_rtree, geonames) + components = AddressComponents(osm_rtree, neighborhoods_rtree, places_index, quattroshapes_rtree, geonames) osm_formatter = OSMAddressFormatter(components, country_rtree, subdivisions_rtree, buildings_rtree, metro_stations_index) osm_formatter.build_place_training_data(args.place_nodes_file, args.out_dir, tag_components=not args.untagged) if args.intersections_file and args.format: - components = AddressComponents(osm_rtree, neighborhoods_rtree, quattroshapes_rtree, geonames) + components = AddressComponents(osm_rtree, neighborhoods_rtree, places_index, quattroshapes_rtree, geonames) osm_formatter = OSMAddressFormatter(components, country_rtree, subdivisions_rtree, buildings_rtree, metro_stations_index) osm_formatter.build_intersections_training_data(args.intersections_file, args.out_dir, tag_components=not args.untagged) diff --git a/scripts/geodata/places/reverse_geocode.py b/scripts/geodata/places/reverse_geocode.py index b3645d16..db9b0d78 100644 --- a/scripts/geodata/places/reverse_geocode.py +++ b/scripts/geodata/places/reverse_geocode.py @@ -94,7 +94,7 @@ if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument('-i', '--osm-places-file', - help='Path to OSM metro stations file') + help='Path to OSM places file') parser.add_argument('-p', '--precision', type=int,