[addresses] Remove Quattroshapes/GeoNames cities: they may have problematic names, and in any case we now have point-based cities from OSM
This commit is contained in:
@@ -63,7 +63,7 @@ class AddressComponents(object):
|
|||||||
prefixes like "London Borough of", pruning duplicates like "Antwerpen, Antwerpen, Antwerpen".
|
prefixes like "London Borough of", pruning duplicates like "Antwerpen, Antwerpen, Antwerpen".
|
||||||
|
|
||||||
Usage:
|
Usage:
|
||||||
>>> components = AddressComponents(osm_admin_rtree, neighborhoods_rtree, places_index, quattroshapes_rtree, geonames)
|
>>> components = AddressComponents(osm_admin_rtree, neighborhoods_rtree, places_index)
|
||||||
>>> components.expand({'name': 'Hackney Empire'}, 51.54559, -0.05567)
|
>>> components.expand({'name': 'Hackney Empire'}, 51.54559, -0.05567)
|
||||||
|
|
||||||
Returns (results vary because of randomness):
|
Returns (results vary because of randomness):
|
||||||
@@ -142,7 +142,7 @@ class AddressComponents(object):
|
|||||||
AddressFormatter.UNIT: Unit,
|
AddressFormatter.UNIT: Unit,
|
||||||
}
|
}
|
||||||
|
|
||||||
def __init__(self, osm_admin_rtree, neighborhoods_rtree, places_index, quattroshapes_rtree, geonames):
|
def __init__(self, osm_admin_rtree, neighborhoods_rtree, places_index):
|
||||||
self.config = yaml.load(open(PARSER_DEFAULT_CONFIG))
|
self.config = yaml.load(open(PARSER_DEFAULT_CONFIG))
|
||||||
|
|
||||||
self.setup_component_dependencies()
|
self.setup_component_dependencies()
|
||||||
@@ -152,8 +152,6 @@ class AddressComponents(object):
|
|||||||
self.osm_admin_rtree = osm_admin_rtree
|
self.osm_admin_rtree = osm_admin_rtree
|
||||||
self.neighborhoods_rtree = neighborhoods_rtree
|
self.neighborhoods_rtree = neighborhoods_rtree
|
||||||
self.places_index = places_index
|
self.places_index = places_index
|
||||||
self.quattroshapes_rtree = quattroshapes_rtree
|
|
||||||
self.geonames = geonames
|
|
||||||
|
|
||||||
def setup_component_dependencies(self):
|
def setup_component_dependencies(self):
|
||||||
self.component_dependencies = {}
|
self.component_dependencies = {}
|
||||||
@@ -880,57 +878,6 @@ class AddressComponents(object):
|
|||||||
|
|
||||||
address_components.update(new_admin_components)
|
address_components.update(new_admin_components)
|
||||||
|
|
||||||
def quattroshapes_city(self, address_components,
|
|
||||||
latitude, longitude,
|
|
||||||
language, non_local_language=None,
|
|
||||||
always_use_full_names=False):
|
|
||||||
'''
|
|
||||||
Quattroshapes/GeoNames cities
|
|
||||||
-----------------------------
|
|
||||||
|
|
||||||
Quattroshapes isn't great for everything, but it has decent city boundaries
|
|
||||||
in places where OSM sometimes does not (or at least in places where we aren't
|
|
||||||
currently able to create valid polygons). While Quattroshapes itself doesn't
|
|
||||||
reliably use local names, which we'll want for consistency, Quattroshapes cities
|
|
||||||
are linked with GeoNames, which has per-language localized names for most places.
|
|
||||||
'''
|
|
||||||
|
|
||||||
city = None
|
|
||||||
|
|
||||||
qs_add_city_prob = float(nested_get(self.config, ('city', 'quattroshapes_geonames_backup_city_probability')))
|
|
||||||
abbreviated_name_prob = float(nested_get(self.config, ('city', 'quattroshapes_geonames_abbreviated_probability')))
|
|
||||||
|
|
||||||
if AddressFormatter.CITY not in address_components and random.random() < qs_add_city_prob:
|
|
||||||
lang = non_local_language or language
|
|
||||||
quattroshapes_cities = self.quattroshapes_rtree.point_in_poly(latitude, longitude, return_all=True)
|
|
||||||
for result in quattroshapes_cities:
|
|
||||||
if result.get(self.quattroshapes_rtree.LEVEL) == self.quattroshapes_rtree.LOCALITY and self.quattroshapes_rtree.GEONAMES_ID in result:
|
|
||||||
geonames_id = int(result[self.quattroshapes_rtree.GEONAMES_ID].split(',')[0])
|
|
||||||
names = self.geonames.get_alternate_names(geonames_id)
|
|
||||||
|
|
||||||
if not names or lang not in names:
|
|
||||||
continue
|
|
||||||
|
|
||||||
city = None
|
|
||||||
if 'abbr' not in names or non_local_language:
|
|
||||||
# Use the common city name in the target language
|
|
||||||
city = names[lang][0][0]
|
|
||||||
elif not always_use_full_names and random.random() < abbreviated_name_prob:
|
|
||||||
# Use an abbreviation: NYC, BK, SF, etc.
|
|
||||||
city = random.choice(names['abbr'])[0]
|
|
||||||
|
|
||||||
if not city or not city.strip():
|
|
||||||
continue
|
|
||||||
return city
|
|
||||||
break
|
|
||||||
else:
|
|
||||||
if non_local_language and AddressFormatter.CITY in address_components and (
|
|
||||||
AddressFormatter.CITY_DISTRICT in address_components or
|
|
||||||
AddressFormatter.SUBURB in address_components):
|
|
||||||
address_components.pop(AddressFormatter.CITY)
|
|
||||||
|
|
||||||
return city
|
|
||||||
|
|
||||||
generic_wiki_name_regex = re.compile('^[a-z]{2,3}:')
|
generic_wiki_name_regex = re.compile('^[a-z]{2,3}:')
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
@@ -1484,12 +1431,6 @@ class AddressComponents(object):
|
|||||||
non_local_language=non_local_language,
|
non_local_language=non_local_language,
|
||||||
language_suffix=language_suffix)
|
language_suffix=language_suffix)
|
||||||
|
|
||||||
city = self.quattroshapes_city(address_components, latitude, longitude, language, non_local_language=non_local_language)
|
|
||||||
if city:
|
|
||||||
city = self.normalized_place_name(city, AddressFormatter.CITY, all_osm_components, country=country, languages=all_languages)
|
|
||||||
if city:
|
|
||||||
address_components[AddressFormatter.CITY] = city
|
|
||||||
|
|
||||||
self.add_neighborhoods(address_components, neighborhoods,
|
self.add_neighborhoods(address_components, neighborhoods,
|
||||||
language_suffix=language_suffix)
|
language_suffix=language_suffix)
|
||||||
|
|
||||||
@@ -1596,14 +1537,6 @@ class AddressComponents(object):
|
|||||||
random_key=False,
|
random_key=False,
|
||||||
always_use_full_names=True)
|
always_use_full_names=True)
|
||||||
|
|
||||||
city = self.quattroshapes_city(address_components, latitude, longitude, language, non_local_language=non_local_language,
|
|
||||||
always_use_full_names=True)
|
|
||||||
|
|
||||||
if city:
|
|
||||||
city = self.normalized_place_name(city, AddressFormatter.CITY, all_osm_components, country=country, languages=all_languages)
|
|
||||||
if city:
|
|
||||||
address_components[AddressFormatter.CITY] = city
|
|
||||||
|
|
||||||
self.add_neighborhoods(address_components, neighborhoods,
|
self.add_neighborhoods(address_components, neighborhoods,
|
||||||
language_suffix=language_suffix)
|
language_suffix=language_suffix)
|
||||||
|
|
||||||
|
|||||||
@@ -12,11 +12,10 @@ import os
|
|||||||
from geodata.openaddresses.formatter import OpenAddressesFormatter
|
from geodata.openaddresses.formatter import OpenAddressesFormatter
|
||||||
|
|
||||||
from geodata.addresses.components import AddressComponents
|
from geodata.addresses.components import AddressComponents
|
||||||
from geodata.geonames.db import GeoNamesDB
|
|
||||||
from geodata.polygons.language_polys import LanguagePolygonIndex
|
from geodata.polygons.language_polys import LanguagePolygonIndex
|
||||||
from geodata.neighborhoods.reverse_geocode import NeighborhoodReverseGeocoder
|
from geodata.neighborhoods.reverse_geocode import NeighborhoodReverseGeocoder
|
||||||
from geodata.places.reverse_geocode import PlaceReverseGeocoder
|
from geodata.places.reverse_geocode import PlaceReverseGeocoder
|
||||||
from geodata.polygons.reverse_geocode import OSMReverseGeocoder, OSMCountryReverseGeocoder, QuattroshapesReverseGeocoder
|
from geodata.polygons.reverse_geocode import OSMReverseGeocoder, OSMCountryReverseGeocoder
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
@@ -44,18 +43,10 @@ if __name__ == '__main__':
|
|||||||
default=None,
|
default=None,
|
||||||
help='OSM reverse geocoder RTree directory')
|
help='OSM reverse geocoder RTree directory')
|
||||||
|
|
||||||
parser.add_argument('--quattroshapes-rtree-dir',
|
|
||||||
default=None,
|
|
||||||
help='Quattroshapes reverse geocoder RTree directory')
|
|
||||||
|
|
||||||
parser.add_argument('--places-index-dir',
|
parser.add_argument('--places-index-dir',
|
||||||
default=None,
|
default=None,
|
||||||
help='Places index directory')
|
help='Places index directory')
|
||||||
|
|
||||||
parser.add_argument('--geonames-db',
|
|
||||||
default=None,
|
|
||||||
help='GeoNames db file')
|
|
||||||
|
|
||||||
parser.add_argument('--neighborhoods-rtree-dir',
|
parser.add_argument('--neighborhoods-rtree-dir',
|
||||||
default=None,
|
default=None,
|
||||||
help='Neighborhoods reverse geocoder RTree directory')
|
help='Neighborhoods reverse geocoder RTree directory')
|
||||||
@@ -85,17 +76,8 @@ if __name__ == '__main__':
|
|||||||
if args.places_index_dir:
|
if args.places_index_dir:
|
||||||
places_index = PlaceReverseGeocoder.load(args.places_index_dir)
|
places_index = PlaceReverseGeocoder.load(args.places_index_dir)
|
||||||
|
|
||||||
quattroshapes_rtree = None
|
|
||||||
if args.quattroshapes_rtree_dir:
|
|
||||||
quattroshapes_rtree = QuattroshapesReverseGeocoder.load(args.quattroshapes_rtree_dir)
|
|
||||||
|
|
||||||
geonames = None
|
|
||||||
|
|
||||||
if args.geonames_db:
|
|
||||||
geonames = GeoNamesDB(args.geonames_db)
|
|
||||||
|
|
||||||
if args.openaddresses_dir and args.format:
|
if args.openaddresses_dir and args.format:
|
||||||
components = AddressComponents(osm_rtree, neighborhoods_rtree, places_index, quattroshapes_rtree, geonames)
|
components = AddressComponents(osm_rtree, neighborhoods_rtree, places_index)
|
||||||
|
|
||||||
oa_formatter = OpenAddressesFormatter(components, country_rtree, debug=args.debug)
|
oa_formatter = OpenAddressesFormatter(components, country_rtree, debug=args.debug)
|
||||||
oa_formatter.build_training_data(args.openaddresses_dir, args.out_dir, tag_components=not args.untagged)
|
oa_formatter.build_training_data(args.openaddresses_dir, args.out_dir, tag_components=not args.untagged)
|
||||||
|
|||||||
@@ -516,10 +516,6 @@ if __name__ == '__main__':
|
|||||||
if args.places_index_dir:
|
if args.places_index_dir:
|
||||||
places_index = PlaceReverseGeocoder.load(args.places_index_dir)
|
places_index = PlaceReverseGeocoder.load(args.places_index_dir)
|
||||||
|
|
||||||
quattroshapes_rtree = None
|
|
||||||
if args.quattroshapes_rtree_dir:
|
|
||||||
quattroshapes_rtree = QuattroshapesReverseGeocoder.load(args.quattroshapes_rtree_dir)
|
|
||||||
|
|
||||||
metro_stations_index = None
|
metro_stations_index = None
|
||||||
if args.metro_stations_index_dir:
|
if args.metro_stations_index_dir:
|
||||||
metro_stations_index = MetroStationReverseGeocoder.load(args.metro_stations_index_dir)
|
metro_stations_index = MetroStationReverseGeocoder.load(args.metro_stations_index_dir)
|
||||||
@@ -532,11 +528,6 @@ if __name__ == '__main__':
|
|||||||
if args.buildings_rtree_dir:
|
if args.buildings_rtree_dir:
|
||||||
buildings_rtree = OSMBuildingReverseGeocoder.load(args.buildings_rtree_dir)
|
buildings_rtree = OSMBuildingReverseGeocoder.load(args.buildings_rtree_dir)
|
||||||
|
|
||||||
geonames = None
|
|
||||||
|
|
||||||
if args.geonames_db:
|
|
||||||
geonames = GeoNamesDB(args.geonames_db)
|
|
||||||
|
|
||||||
# Can parallelize
|
# Can parallelize
|
||||||
if args.streets_file:
|
if args.streets_file:
|
||||||
build_ways_training_data(country_rtree, args.streets_file, args.out_dir, abbreviate_streets=not args.unabbreviated)
|
build_ways_training_data(country_rtree, args.streets_file, args.out_dir, abbreviate_streets=not args.unabbreviated)
|
||||||
@@ -550,28 +541,24 @@ if __name__ == '__main__':
|
|||||||
parser.error('--rtree-dir required for formatted addresses')
|
parser.error('--rtree-dir required for formatted addresses')
|
||||||
elif neighborhoods_rtree is None:
|
elif neighborhoods_rtree is None:
|
||||||
parser.error('--neighborhoods-rtree-dir required for formatted addresses')
|
parser.error('--neighborhoods-rtree-dir required for formatted addresses')
|
||||||
elif quattroshapes_rtree is None:
|
|
||||||
parser.error('--quattroshapes-rtree-dir required for formatted addresses')
|
|
||||||
elif geonames is None:
|
|
||||||
parser.error('--geonames-db required for formatted addresses')
|
|
||||||
elif places_index is None:
|
elif places_index is None:
|
||||||
parser.error('--places-index-dir required for formatted addresses')
|
parser.error('--places-index-dir required for formatted addresses')
|
||||||
|
|
||||||
if args.address_file and args.format:
|
if args.address_file and args.format:
|
||||||
components = AddressComponents(osm_rtree, neighborhoods_rtree, places_index, quattroshapes_rtree, geonames)
|
components = AddressComponents(osm_rtree, neighborhoods_rtree, places_index)
|
||||||
osm_formatter = OSMAddressFormatter(components, country_rtree, subdivisions_rtree, buildings_rtree, metro_stations_index)
|
osm_formatter = OSMAddressFormatter(components, country_rtree, subdivisions_rtree, buildings_rtree, metro_stations_index)
|
||||||
osm_formatter.build_training_data(args.address_file, args.out_dir, tag_components=not args.untagged)
|
osm_formatter.build_training_data(args.address_file, args.out_dir, tag_components=not args.untagged)
|
||||||
if args.address_file and args.limited_addresses:
|
if args.address_file and args.limited_addresses:
|
||||||
components = AddressComponents(osm_rtree, neighborhoods_rtree, places_index, quattroshapes_rtree, geonames)
|
components = AddressComponents(osm_rtree, neighborhoods_rtree, places_index)
|
||||||
osm_formatter = OSMAddressFormatter(components, country_rtree, subdivisions_rtree, buildings_rtree, metro_stations_index, splitter=u' ')
|
osm_formatter = OSMAddressFormatter(components, country_rtree, subdivisions_rtree, buildings_rtree, metro_stations_index, splitter=u' ')
|
||||||
osm_formatter.build_limited_training_data(args.address_file, args.out_dir)
|
osm_formatter.build_limited_training_data(args.address_file, args.out_dir)
|
||||||
|
|
||||||
if args.place_nodes_file and args.format:
|
if args.place_nodes_file and args.format:
|
||||||
components = AddressComponents(osm_rtree, neighborhoods_rtree, places_index, quattroshapes_rtree, geonames)
|
components = AddressComponents(osm_rtree, neighborhoods_rtree, places_index)
|
||||||
osm_formatter = OSMAddressFormatter(components, country_rtree, subdivisions_rtree, buildings_rtree, metro_stations_index)
|
osm_formatter = OSMAddressFormatter(components, country_rtree, subdivisions_rtree, buildings_rtree, metro_stations_index)
|
||||||
osm_formatter.build_place_training_data(args.place_nodes_file, args.out_dir, tag_components=not args.untagged)
|
osm_formatter.build_place_training_data(args.place_nodes_file, args.out_dir, tag_components=not args.untagged)
|
||||||
|
|
||||||
if args.intersections_file and args.format:
|
if args.intersections_file and args.format:
|
||||||
components = AddressComponents(osm_rtree, neighborhoods_rtree, places_index, quattroshapes_rtree, geonames)
|
components = AddressComponents(osm_rtree, neighborhoods_rtree, places_index)
|
||||||
osm_formatter = OSMAddressFormatter(components, country_rtree, subdivisions_rtree, buildings_rtree, metro_stations_index)
|
osm_formatter = OSMAddressFormatter(components, country_rtree, subdivisions_rtree, buildings_rtree, metro_stations_index)
|
||||||
osm_formatter.build_intersections_training_data(args.intersections_file, args.out_dir, tag_components=not args.untagged)
|
osm_formatter.build_intersections_training_data(args.intersections_file, args.out_dir, tag_components=not args.untagged)
|
||||||
|
|||||||
Reference in New Issue
Block a user