[addresses] remove Quattroshapes/GeoNames cities as they may have problematic names, and in any case we have point-based cities from OSM now

This commit is contained in:
Al
2016-12-10 02:08:33 -05:00
parent 18c5fd0855
commit 5098599ed6
3 changed files with 8 additions and 106 deletions

View File

@@ -63,7 +63,7 @@ class AddressComponents(object):
prefixes like "London Borough of", pruning duplicates like "Antwerpen, Antwerpen, Antwerpen". prefixes like "London Borough of", pruning duplicates like "Antwerpen, Antwerpen, Antwerpen".
Usage: Usage:
>>> components = AddressComponents(osm_admin_rtree, neighborhoods_rtree, places_index, quattroshapes_rtree, geonames) >>> components = AddressComponents(osm_admin_rtree, neighborhoods_rtree, places_index)
>>> components.expand({'name': 'Hackney Empire'}, 51.54559, -0.05567) >>> components.expand({'name': 'Hackney Empire'}, 51.54559, -0.05567)
Returns (results vary because of randomness): Returns (results vary because of randomness):
@@ -142,7 +142,7 @@ class AddressComponents(object):
AddressFormatter.UNIT: Unit, AddressFormatter.UNIT: Unit,
} }
def __init__(self, osm_admin_rtree, neighborhoods_rtree, places_index, quattroshapes_rtree, geonames): def __init__(self, osm_admin_rtree, neighborhoods_rtree, places_index):
self.config = yaml.load(open(PARSER_DEFAULT_CONFIG)) self.config = yaml.load(open(PARSER_DEFAULT_CONFIG))
self.setup_component_dependencies() self.setup_component_dependencies()
@@ -152,8 +152,6 @@ class AddressComponents(object):
self.osm_admin_rtree = osm_admin_rtree self.osm_admin_rtree = osm_admin_rtree
self.neighborhoods_rtree = neighborhoods_rtree self.neighborhoods_rtree = neighborhoods_rtree
self.places_index = places_index self.places_index = places_index
self.quattroshapes_rtree = quattroshapes_rtree
self.geonames = geonames
def setup_component_dependencies(self): def setup_component_dependencies(self):
self.component_dependencies = {} self.component_dependencies = {}
@@ -880,57 +878,6 @@ class AddressComponents(object):
address_components.update(new_admin_components) address_components.update(new_admin_components)
def quattroshapes_city(self, address_components,
latitude, longitude,
language, non_local_language=None,
always_use_full_names=False):
'''
Quattroshapes/GeoNames cities
-----------------------------
Quattroshapes isn't great for everything, but it has decent city boundaries
in places where OSM sometimes does not (or at least in places where we aren't
currently able to create valid polygons). While Quattroshapes itself doesn't
reliably use local names, which we'll want for consistency, Quattroshapes cities
are linked with GeoNames, which has per-language localized names for most places.
'''
# Returns a city name string on success, otherwise None. The caller is
# responsible for storing the returned name; the only mutation performed
# here is the optional pop() of the existing CITY entry further below.
city = None
# Both probabilities are read from the 'city' section of self.config and
# coerced to float.
qs_add_city_prob = float(nested_get(self.config, ('city', 'quattroshapes_geonames_backup_city_probability')))
abbreviated_name_prob = float(nested_get(self.config, ('city', 'quattroshapes_geonames_abbreviated_probability')))
# Only act as a backup: a city must not already be present, and a random
# draw must fall under the configured backup probability.
if AddressFormatter.CITY not in address_components and random.random() < qs_add_city_prob:
# A non-local language, when given, takes precedence over the local one.
lang = non_local_language or language
quattroshapes_cities = self.quattroshapes_rtree.point_in_poly(latitude, longitude, return_all=True)
for result in quattroshapes_cities:
# Consider only locality-level polygons that carry a GeoNames id.
if result.get(self.quattroshapes_rtree.LEVEL) == self.quattroshapes_rtree.LOCALITY and self.quattroshapes_rtree.GEONAMES_ID in result:
# The id field is split on ',' and only the first entry is used.
geonames_id = int(result[self.quattroshapes_rtree.GEONAMES_ID].split(',')[0])
names = self.geonames.get_alternate_names(geonames_id)
# Skip places with no alternate names or none in the target language.
if not names or lang not in names:
continue
city = None
if 'abbr' not in names or non_local_language:
# Use the common city name in the target language
city = names[lang][0][0]
elif not always_use_full_names and random.random() < abbreviated_name_prob:
# Use an abbreviation: NYC, BK, SF, etc.
city = random.choice(names['abbr'])[0]
# NOTE(review): when 'abbr' is present, no non-local language is set, and
# either always_use_full_names is True or the random draw fails, city stays
# None and this result is skipped; there is no fall-back to the full name.
# Confirm whether that is intended.
if not city or not city.strip():
continue
return city
# NOTE(review): this break directly follows a return and is unreachable.
break
# NOTE(review): indentation is not preserved in this view, so it cannot be
# determined here whether this else pairs with the for loop or with the outer
# if above. Verify against the real file before changing this function.
else:
# With a non-local language and a finer-grained component (city district
# or suburb) present, remove the existing CITY entry.
if non_local_language and AddressFormatter.CITY in address_components and (
AddressFormatter.CITY_DISTRICT in address_components or
AddressFormatter.SUBURB in address_components):
address_components.pop(AddressFormatter.CITY)
return city
generic_wiki_name_regex = re.compile('^[a-z]{2,3}:') generic_wiki_name_regex = re.compile('^[a-z]{2,3}:')
@classmethod @classmethod
@@ -1484,12 +1431,6 @@ class AddressComponents(object):
non_local_language=non_local_language, non_local_language=non_local_language,
language_suffix=language_suffix) language_suffix=language_suffix)
city = self.quattroshapes_city(address_components, latitude, longitude, language, non_local_language=non_local_language)
if city:
city = self.normalized_place_name(city, AddressFormatter.CITY, all_osm_components, country=country, languages=all_languages)
if city:
address_components[AddressFormatter.CITY] = city
self.add_neighborhoods(address_components, neighborhoods, self.add_neighborhoods(address_components, neighborhoods,
language_suffix=language_suffix) language_suffix=language_suffix)
@@ -1596,14 +1537,6 @@ class AddressComponents(object):
random_key=False, random_key=False,
always_use_full_names=True) always_use_full_names=True)
city = self.quattroshapes_city(address_components, latitude, longitude, language, non_local_language=non_local_language,
always_use_full_names=True)
if city:
city = self.normalized_place_name(city, AddressFormatter.CITY, all_osm_components, country=country, languages=all_languages)
if city:
address_components[AddressFormatter.CITY] = city
self.add_neighborhoods(address_components, neighborhoods, self.add_neighborhoods(address_components, neighborhoods,
language_suffix=language_suffix) language_suffix=language_suffix)

View File

@@ -12,11 +12,10 @@ import os
from geodata.openaddresses.formatter import OpenAddressesFormatter from geodata.openaddresses.formatter import OpenAddressesFormatter
from geodata.addresses.components import AddressComponents from geodata.addresses.components import AddressComponents
from geodata.geonames.db import GeoNamesDB
from geodata.polygons.language_polys import LanguagePolygonIndex from geodata.polygons.language_polys import LanguagePolygonIndex
from geodata.neighborhoods.reverse_geocode import NeighborhoodReverseGeocoder from geodata.neighborhoods.reverse_geocode import NeighborhoodReverseGeocoder
from geodata.places.reverse_geocode import PlaceReverseGeocoder from geodata.places.reverse_geocode import PlaceReverseGeocoder
from geodata.polygons.reverse_geocode import OSMReverseGeocoder, OSMCountryReverseGeocoder, QuattroshapesReverseGeocoder from geodata.polygons.reverse_geocode import OSMReverseGeocoder, OSMCountryReverseGeocoder
if __name__ == '__main__': if __name__ == '__main__':
@@ -44,18 +43,10 @@ if __name__ == '__main__':
default=None, default=None,
help='OSM reverse geocoder RTree directory') help='OSM reverse geocoder RTree directory')
parser.add_argument('--quattroshapes-rtree-dir',
default=None,
help='Quattroshapes reverse geocoder RTree directory')
parser.add_argument('--places-index-dir', parser.add_argument('--places-index-dir',
default=None, default=None,
help='Places index directory') help='Places index directory')
parser.add_argument('--geonames-db',
default=None,
help='GeoNames db file')
parser.add_argument('--neighborhoods-rtree-dir', parser.add_argument('--neighborhoods-rtree-dir',
default=None, default=None,
help='Neighborhoods reverse geocoder RTree directory') help='Neighborhoods reverse geocoder RTree directory')
@@ -85,17 +76,8 @@ if __name__ == '__main__':
if args.places_index_dir: if args.places_index_dir:
places_index = PlaceReverseGeocoder.load(args.places_index_dir) places_index = PlaceReverseGeocoder.load(args.places_index_dir)
quattroshapes_rtree = None
if args.quattroshapes_rtree_dir:
quattroshapes_rtree = QuattroshapesReverseGeocoder.load(args.quattroshapes_rtree_dir)
geonames = None
if args.geonames_db:
geonames = GeoNamesDB(args.geonames_db)
if args.openaddresses_dir and args.format: if args.openaddresses_dir and args.format:
components = AddressComponents(osm_rtree, neighborhoods_rtree, places_index, quattroshapes_rtree, geonames) components = AddressComponents(osm_rtree, neighborhoods_rtree, places_index)
oa_formatter = OpenAddressesFormatter(components, country_rtree, debug=args.debug) oa_formatter = OpenAddressesFormatter(components, country_rtree, debug=args.debug)
oa_formatter.build_training_data(args.openaddresses_dir, args.out_dir, tag_components=not args.untagged) oa_formatter.build_training_data(args.openaddresses_dir, args.out_dir, tag_components=not args.untagged)

View File

@@ -516,10 +516,6 @@ if __name__ == '__main__':
if args.places_index_dir: if args.places_index_dir:
places_index = PlaceReverseGeocoder.load(args.places_index_dir) places_index = PlaceReverseGeocoder.load(args.places_index_dir)
quattroshapes_rtree = None
if args.quattroshapes_rtree_dir:
quattroshapes_rtree = QuattroshapesReverseGeocoder.load(args.quattroshapes_rtree_dir)
metro_stations_index = None metro_stations_index = None
if args.metro_stations_index_dir: if args.metro_stations_index_dir:
metro_stations_index = MetroStationReverseGeocoder.load(args.metro_stations_index_dir) metro_stations_index = MetroStationReverseGeocoder.load(args.metro_stations_index_dir)
@@ -532,11 +528,6 @@ if __name__ == '__main__':
if args.buildings_rtree_dir: if args.buildings_rtree_dir:
buildings_rtree = OSMBuildingReverseGeocoder.load(args.buildings_rtree_dir) buildings_rtree = OSMBuildingReverseGeocoder.load(args.buildings_rtree_dir)
geonames = None
if args.geonames_db:
geonames = GeoNamesDB(args.geonames_db)
# Can parallelize # Can parallelize
if args.streets_file: if args.streets_file:
build_ways_training_data(country_rtree, args.streets_file, args.out_dir, abbreviate_streets=not args.unabbreviated) build_ways_training_data(country_rtree, args.streets_file, args.out_dir, abbreviate_streets=not args.unabbreviated)
@@ -550,28 +541,24 @@ if __name__ == '__main__':
parser.error('--rtree-dir required for formatted addresses') parser.error('--rtree-dir required for formatted addresses')
elif neighborhoods_rtree is None: elif neighborhoods_rtree is None:
parser.error('--neighborhoods-rtree-dir required for formatted addresses') parser.error('--neighborhoods-rtree-dir required for formatted addresses')
elif quattroshapes_rtree is None:
parser.error('--quattroshapes-rtree-dir required for formatted addresses')
elif geonames is None:
parser.error('--geonames-db required for formatted addresses')
elif places_index is None: elif places_index is None:
parser.error('--places-index-dir required for formatted addresses') parser.error('--places-index-dir required for formatted addresses')
if args.address_file and args.format: if args.address_file and args.format:
components = AddressComponents(osm_rtree, neighborhoods_rtree, places_index, quattroshapes_rtree, geonames) components = AddressComponents(osm_rtree, neighborhoods_rtree, places_index)
osm_formatter = OSMAddressFormatter(components, country_rtree, subdivisions_rtree, buildings_rtree, metro_stations_index) osm_formatter = OSMAddressFormatter(components, country_rtree, subdivisions_rtree, buildings_rtree, metro_stations_index)
osm_formatter.build_training_data(args.address_file, args.out_dir, tag_components=not args.untagged) osm_formatter.build_training_data(args.address_file, args.out_dir, tag_components=not args.untagged)
if args.address_file and args.limited_addresses: if args.address_file and args.limited_addresses:
components = AddressComponents(osm_rtree, neighborhoods_rtree, places_index, quattroshapes_rtree, geonames) components = AddressComponents(osm_rtree, neighborhoods_rtree, places_index)
osm_formatter = OSMAddressFormatter(components, country_rtree, subdivisions_rtree, buildings_rtree, metro_stations_index, splitter=u' ') osm_formatter = OSMAddressFormatter(components, country_rtree, subdivisions_rtree, buildings_rtree, metro_stations_index, splitter=u' ')
osm_formatter.build_limited_training_data(args.address_file, args.out_dir) osm_formatter.build_limited_training_data(args.address_file, args.out_dir)
if args.place_nodes_file and args.format: if args.place_nodes_file and args.format:
components = AddressComponents(osm_rtree, neighborhoods_rtree, places_index, quattroshapes_rtree, geonames) components = AddressComponents(osm_rtree, neighborhoods_rtree, places_index)
osm_formatter = OSMAddressFormatter(components, country_rtree, subdivisions_rtree, buildings_rtree, metro_stations_index) osm_formatter = OSMAddressFormatter(components, country_rtree, subdivisions_rtree, buildings_rtree, metro_stations_index)
osm_formatter.build_place_training_data(args.place_nodes_file, args.out_dir, tag_components=not args.untagged) osm_formatter.build_place_training_data(args.place_nodes_file, args.out_dir, tag_components=not args.untagged)
if args.intersections_file and args.format: if args.intersections_file and args.format:
components = AddressComponents(osm_rtree, neighborhoods_rtree, places_index, quattroshapes_rtree, geonames) components = AddressComponents(osm_rtree, neighborhoods_rtree, places_index)
osm_formatter = OSMAddressFormatter(components, country_rtree, subdivisions_rtree, buildings_rtree, metro_stations_index) osm_formatter = OSMAddressFormatter(components, country_rtree, subdivisions_rtree, buildings_rtree, metro_stations_index)
osm_formatter.build_intersections_training_data(args.intersections_file, args.out_dir, tag_components=not args.untagged) osm_formatter.build_intersections_training_data(args.intersections_file, args.out_dir, tag_components=not args.untagged)