From faf418decb527a913a9820745acf2d3e3e05967f Mon Sep 17 00:00:00 2001 From: Al Date: Wed, 5 Oct 2016 02:49:55 -0400 Subject: [PATCH] [languages] using country_and_languages method in OSM, neighborhoods and OpenAddresses --- scripts/geodata/addresses/components.py | 33 +++++----- .../geodata/neighborhoods/reverse_geocode.py | 16 ++--- scripts/geodata/openaddresses/formatter.py | 6 +- .../openaddresses_training_data.py | 12 ++-- scripts/geodata/osm/formatter.py | 47 +++++++------- .../geodata/osm/osm_address_training_data.py | 63 ++++++++++--------- scripts/geodata/polygons/reverse_geocode.py | 6 +- 7 files changed, 95 insertions(+), 88 deletions(-) diff --git a/scripts/geodata/addresses/components.py b/scripts/geodata/addresses/components.py index 4b264860..563ad9db 100644 --- a/scripts/geodata/addresses/components.py +++ b/scripts/geodata/addresses/components.py @@ -35,6 +35,7 @@ from geodata.math.sampling import cdf, weighted_choice from geodata.names.normalization import name_affixes from geodata.osm.components import osm_address_components from geodata.places.config import place_config +from geodata.polygons.reverse_geocode import OSMCountryReverseGeocoder from geodata.states.state_abbreviations import state_abbreviations from geodata.text.utils import is_numeric @@ -69,7 +70,7 @@ class AddressComponents(object): prefixes like "London Borough of", pruning duplicates like "Antwerpen, Antwerpen, Antwerpen". Usage: - >>> components = AddressComponents(osm_admin_rtree, language_rtree, neighborhoods_rtree, buildings_rtree, subdivisions_rtree, quattroshapes_rtree, geonames) + >>> components = AddressComponents(osm_admin_rtree, neighborhoods_rtree, buildings_rtree, subdivisions_rtree, quattroshapes_rtree, geonames) >>> components.expand({'name': 'Hackney Empire'}, 51.54559, -0.05567) Returns (results vary because of randomness): @@ -145,7 +146,7 @@ class AddressComponents(object): AddressFormatter.UNIT: Unit, } - def __init__(self, osm_admin_rtree, language_rtree, neighborhoods_rtree, quattroshapes_rtree, geonames): + def __init__(self, osm_admin_rtree, neighborhoods_rtree, quattroshapes_rtree, geonames): self.config = yaml.load(open(PARSER_DEFAULT_CONFIG)) self.setup_component_dependencies() @@ -153,7 +154,6 @@ class AddressComponents(object): self.address_level_dropout_probabilities = {k: v['probability'] for k, v in six.iteritems(self.config['dropout'])} self.osm_admin_rtree = osm_admin_rtree - self.language_rtree = language_rtree self.neighborhoods_rtree = neighborhoods_rtree self.quattroshapes_rtree = quattroshapes_rtree self.geonames = geonames @@ -249,6 +249,9 @@ class AddressComponents(object): def osm_reverse_geocoded_components(self, latitude, longitude): return self.osm_admin_rtree.point_in_poly(latitude, longitude, return_all=True) + def osm_country_and_languages(self, osm_components): + return OSMCountryReverseGeocoder.country_and_languages_from_components(osm_components) + def categorize_osm_component(self, country, props, containing_components): containing_ids = [(c['type'], c['id']) for c in containing_components if 'type' in c and 'id' in c] @@ -288,17 +291,16 @@ class AddressComponents(object): language = None if len(candidate_languages) == 1: - language = candidate_languages[0]['lang'] + language = candidate_languages[0][0] else: street = components.get(AddressFormatter.ROAD, None) - lang_tuples = [(l['lang'], l['default']) for l in candidate_languages] if street is not None: - language = disambiguate_language(street, lang_tuples) + language = disambiguate_language(street, candidate_languages) else: - if has_non_latin_script(lang_tuples): + if has_non_latin_script(candidate_languages): for component, value in six.iteritems(components): - language, script_langs = disambiguate_language_script(value, lang_tuples) + language, script_langs = disambiguate_language_script(value, candidate_languages) if language is not UNKNOWN_LANGUAGE: break else: @@ -1247,16 +1249,14 @@ class AddressComponents(object): except Exception: return None, None, None - country, candidate_languages, language_props = self.language_rtree.country_and_languages(latitude, longitude) - if not (country and candidate_languages): - return None, None, None + osm_components = self.osm_reverse_geocoded_components(latitude, longitude) + country, candidate_languages = self.osm_country_and_languages(osm_components) more_than_one_official_language = len(candidate_languages) > 1 non_local_language = None language_suffix = '' - osm_components = self.osm_reverse_geocoded_components(latitude, longitude) neighborhoods = self.neighborhood_components(latitude, longitude) all_osm_components = osm_components + neighborhoods @@ -1272,7 +1272,7 @@ class AddressComponents(object): if address_state: address_components[AddressFormatter.STATE] = address_state - all_languages = set([l['lang'] for l in candidate_languages]) + all_languages = set([l for l, d in candidate_languages]) self.normalize_place_names(address_components, all_osm_components, country=country, languages=all_languages) @@ -1329,7 +1329,9 @@ class AddressComponents(object): except Exception: return None, None, None - country, candidate_languages, language_props = self.language_rtree.country_and_languages(latitude, longitude) + osm_components = self.osm_reverse_geocoded_components(latitude, longitude) + country, candidate_languages = self.osm_country_and_languages(osm_components) + if not (country and candidate_languages): return None, None, None @@ -1355,10 +1357,9 @@ class AddressComponents(object): street = address_components.get(AddressFormatter.ROAD) - osm_components = self.osm_reverse_geocoded_components(latitude, longitude) neighborhoods = self.neighborhood_components(latitude, longitude) - all_languages = set([l['lang'] for l in candidate_languages]) + all_languages = set([l for l, d in candidate_languages]) all_osm_components = osm_components + neighborhoods language_suffix = self.pick_language_suffix(all_osm_components, language, non_local_language, more_than_one_official_language) diff --git a/scripts/geodata/neighborhoods/reverse_geocode.py b/scripts/geodata/neighborhoods/reverse_geocode.py index e5928b78..c25079be 100644 --- a/scripts/geodata/neighborhoods/reverse_geocode.py +++ b/scripts/geodata/neighborhoods/reverse_geocode.py @@ -23,7 +23,7 @@ from geodata.osm.definitions import osm_definitions from geodata.osm.extract import parse_osm, osm_type_and_id, NODE, WAY, RELATION, OSM_NAME_TAGS from geodata.polygons.index import * from geodata.polygons.language_polys import LanguagePolygonIndex -from geodata.polygons.reverse_geocode import QuattroshapesReverseGeocoder, OSMReverseGeocoder +from geodata.polygons.reverse_geocode import QuattroshapesReverseGeocoder, OSMCountryReverseGeocoder, OSMReverseGeocoder from geodata.statistics.tf_idf import IDFIndex @@ -234,7 +234,7 @@ class NeighborhoodReverseGeocoder(RTreePolygonIndex): return doc @classmethod - def create_from_osm_and_quattroshapes(cls, filename, quattroshapes_dir, language_rtree_dir, osm_rtree_dir, output_dir): + def create_from_osm_and_quattroshapes(cls, filename, quattroshapes_dir, country_rtree_dir, osm_rtree_dir, output_dir): ''' Given an OSM file (planet or some other bounds) containing neighborhoods as points (some suburbs have boundaries) @@ -259,7 +259,7 @@ class NeighborhoodReverseGeocoder(RTreePolygonIndex): logger.info('Creating ClickThatHood neighborhoods') cth = ClickThatHoodReverseGeocoder.create_neighborhoods_index() - language_rtree = LanguagePolygonIndex.load(language_rtree_dir) + country_rtree = OSMCountryReverseGeocoder.load(country_rtree_dir) osm_admin_rtree = OSMReverseGeocoder.load(osm_rtree_dir) @@ -307,7 +307,8 @@ class NeighborhoodReverseGeocoder(RTreePolygonIndex): possible_neighborhood = osm_definitions.meets_definition(attrs, osm_definitions.NEIGHBORHOOD) is_neighborhood = attrs.get('place') in ('neighbourhood', 'neighborhood') - country, candidate_languages, language_props = language_rtree.country_and_languages(lat, lon) + country, candidate_languages = country_rtree.country_and_languages(lat, lon) + component_name = None component_name = osm_address_components.component_from_properties(country, attrs) @@ -473,8 +474,8 @@ if __name__ == '__main__': parser.add_argument('-a', '--osm-admin-rtree-dir', help='Path to OSM admin rtree dir') - parser.add_argument('-l', '--language-rtree-dir', - help='Path to language rtree dir') + parser.add_argument('-c', '--country-rtree-dir', + help='Path to country rtree dir') parser.add_argument('-n', '--osm-neighborhoods-file', help='Path to OSM neighborhoods file (no dependencies, .osm format)') @@ -486,10 +487,11 @@ if __name__ == '__main__': logging.basicConfig(level=logging.INFO) args = parser.parse_args() - if args.osm_neighborhoods_file and args.quattroshapes_dir and args.osm_admin_rtree_dir and args.language_rtree_dir: + if args.osm_neighborhoods_file and args.quattroshapes_dir and args.osm_admin_rtree_dir and args.country_rtree_dir: index = NeighborhoodReverseGeocoder.create_from_osm_and_quattroshapes( args.osm_neighborhoods_file, args.quattroshapes_dir, + args.country_rtree_dir, args.language_rtree_dir, args.osm_admin_rtree_dir, args.out_dir diff --git a/scripts/geodata/openaddresses/formatter.py b/scripts/geodata/openaddresses/formatter.py index 87461f65..cdeec563 100644 --- a/scripts/geodata/openaddresses/formatter.py +++ b/scripts/geodata/openaddresses/formatter.py @@ -67,9 +67,9 @@ class OpenAddressesFormatter(object): re.I | re.UNICODE) unit_type_regexes[lang] = pattern - def __init__(self, components, debug=False): + def __init__(self, components, country_rtree, debug=False): self.components = components - self.language_rtree = components.language_rtree + self.country_rtree = country_rtree self.debug = debug @@ -309,7 +309,7 @@ class OpenAddressesFormatter(object): continue if components: - country, candidate_languages, language_props = self.language_rtree.country_and_languages(latitude, longitude) + country, candidate_languages = self.country_rtree.country_and_languages(latitude, longitude) if not (country and candidate_languages): continue diff --git a/scripts/geodata/openaddresses/openaddresses_training_data.py b/scripts/geodata/openaddresses/openaddresses_training_data.py index 9f9a5f1f..fae064fc 100644 --- a/scripts/geodata/openaddresses/openaddresses_training_data.py +++ b/scripts/geodata/openaddresses/openaddresses_training_data.py @@ -15,7 +15,7 @@ from geodata.addresses.components import AddressComponents from geodata.geonames.db import GeoNamesDB from geodata.polygons.language_polys import LanguagePolygonIndex from geodata.neighborhoods.reverse_geocode import NeighborhoodReverseGeocoder -from geodata.polygons.reverse_geocode import OSMReverseGeocoder, QuattroshapesReverseGeocoder +from geodata.polygons.reverse_geocode import OSMReverseGeocoder, OSMCountryReverseGeocoder, QuattroshapesReverseGeocoder if __name__ == '__main__': @@ -35,9 +35,9 @@ if __name__ == '__main__': default=False, help='Save untagged formatted addresses (slow)') - parser.add_argument('--language-rtree-dir', + parser.add_argument('--country-rtree-dir', required=True, - help='Language RTree directory') + help='Country RTree directory') parser.add_argument('--rtree-dir', default=None, @@ -66,7 +66,7 @@ if __name__ == '__main__': args = parser.parse_args() - language_rtree = LanguagePolygonIndex.load(args.language_rtree_dir) + country_rtree = OSMCountryReverseGeocoder.load(args.country_rtree_dir) osm_rtree = None if args.rtree_dir: @@ -86,7 +86,7 @@ if __name__ == '__main__': geonames = GeoNamesDB(args.geonames_db) if args.openaddresses_dir and args.format: - components = AddressComponents(osm_rtree, language_rtree, neighborhoods_rtree, quattroshapes_rtree, geonames) + components = AddressComponents(osm_rtree, neighborhoods_rtree, quattroshapes_rtree, geonames) - oa_formatter = OpenAddressesFormatter(components, debug=args.debug) + oa_formatter = OpenAddressesFormatter(components, country_rtree, debug=args.debug) oa_formatter.build_training_data(args.openaddresses_dir, args.out_dir, tag_components=not args.untagged) diff --git a/scripts/geodata/osm/formatter.py b/scripts/geodata/osm/formatter.py index 570b9995..2415c6cc 100644 --- a/scripts/geodata/osm/formatter.py +++ b/scripts/geodata/osm/formatter.py @@ -7,7 +7,7 @@ import six import sys import yaml -from collections import OrderedDict +from collections import defaultdict, OrderedDict, Counter from six import itertools this_dir = os.path.realpath(os.path.dirname(__file__)) @@ -109,8 +109,8 @@ class OSMAddressFormatter(object): ('is_in:region', AddressFormatter.STATE), # Used in Tunisia ('addr:governorate', AddressFormatter.STATE), - ('addr:postal_code', AddressFormatter.POSTCODE), ('addr:postcode', AddressFormatter.POSTCODE), + ('addr:postal_code', AddressFormatter.POSTCODE), ('addr:zipcode', AddressFormatter.POSTCODE), ('postal_code', AddressFormatter.POSTCODE), ('addr:country', AddressFormatter.COUNTRY), @@ -138,6 +138,8 @@ class OSMAddressFormatter(object): 'commercial': AddressComponents.zones.COMMERCIAL, 'industrial': AddressComponents.zones.INDUSTRIAL, 'residential': AddressComponents.zones.RESIDENTIAL, + 'university': AddressComponents.zones.UNIVERSITY, + 'college': AddressComponents.zones.UNIVERSITY, }, 'amenity': { 'university': AddressComponents.zones.UNIVERSITY, @@ -147,10 +149,10 @@ class OSMAddressFormatter(object): boundary_component_priorities = {k: i for i, k in enumerate(AddressFormatter.BOUNDARY_COMPONENTS_ORDERED)} - def __init__(self, components, subdivisions_rtree=None, buildings_rtree=None, metro_stations_index=None): + def __init__(self, components, country_rtree, subdivisions_rtree=None, buildings_rtree=None, metro_stations_index=None): # Instance of AddressComponents, contains structures for reverse geocoding, etc. self.components = components - self.language_rtree = components.language_rtree + self.country_rtree = country_rtree self.subdivisions_rtree = subdivisions_rtree self.buildings_rtree = buildings_rtree @@ -168,7 +170,7 @@ class OSMAddressFormatter(object): if len(candidate_languages) > 1: street = tags.get('addr:street', None) - namespaced = [l['lang'] for l in candidate_languages if 'addr:street:{}'.format(l['lang']) in tags] + namespaced = [l for l, d in candidate_languages if 'addr:street:{}'.format(l) in tags] if namespaced and random.random() < pick_namespaced_language_prob: language = random.choice(namespaced) @@ -344,7 +346,6 @@ class OSMAddressFormatter(object): return True return False - def add_metro_station(self, address_components, latitude, longitude, language=None, default_language=None): ''' Metro stations @@ -476,19 +477,10 @@ class OSMAddressFormatter(object): return (), None osm_components = self.components.osm_reverse_geocoded_components(latitude, longitude) - country, candidate_languages, language_props = self.language_rtree.country_and_languages(latitude, longitude) - if country and candidate_languages: - local_languages = [(l['lang'], bool(int(l['default']))) for l in candidate_languages] - else: - for c in reversed(osm_components): - country = c.get('ISO3166-1:alpha2') - if country: - country = country.lower() - break - else: - return (), None - local_languages = [(lang, bool(int(default))) for lang, default in get_country_languages(country).iteritems()] + country, candidate_languages = OSMCountryReverseGeocoder.country_and_languages_from_components(osm_components) + + local_languages = candidate_languages all_local_languages = set([l for l, d in local_languages]) random_languages = set(INTERNET_LANGUAGE_DISTRIBUTION) @@ -551,12 +543,17 @@ class OSMAddressFormatter(object): # Calculate how many records to produce for this place given its population population_divisor = 10000 # Add one record for every 10k in population min_references = 5 # Every place gets at least 5 reference to account for variations + if component_name == AddressFormatter.CITY: + # Cities get a few extra references over e.g. a state_district with the same name + # so that if the population is unknown, hopefully the city will have more references + # and the parser will prefer that meaning + min_references += 2 max_references = 1000 # Cap the number of references e.g. for India and China country nodes num_references = min(population / population_divisor + min_references, max_references) cldr_country_prob = float(nested_get(self.config, ('places', 'cldr_country_probability'), default=0.0)) - for name_tag in ('name', 'alt_name', 'loc_name', 'short_name', 'int_name'): + for name_tag in ('name', 'alt_name', 'loc_name', 'short_name', 'int_name', 'name:simple', 'official_name'): if more_than_one_official_language: name = tags.get(name_tag) language_suffix = '' @@ -757,7 +754,7 @@ class OSMAddressFormatter(object): except Exception: return None, None, None - country, candidate_languages, language_props = self.language_rtree.country_and_languages(latitude, longitude) + country, candidate_languages = self.country_rtree.country_and_languages(latitude, longitude) if not (country and candidate_languages): return None, None, None @@ -880,7 +877,7 @@ class OSMAddressFormatter(object): except Exception: return None, None, None - country, candidate_languages, language_props = self.language_rtree.country_and_languages(latitude, longitude) + country, candidate_languages = self.country_rtree.country_and_languages(latitude, longitude) if not (country and candidate_languages): return None, None, None @@ -986,8 +983,10 @@ class OSMAddressFormatter(object): for node_id, tags, deps in parse_osm(infile): tags['type'], tags['id'] = node_id.split(':') place_tags, country = self.node_place_tags(tags) + for address_components, language, is_default in place_tags: addresses = self.formatted_places(address_components, country, language) + if language is None: language = UNKNOWN_LANGUAGE @@ -1083,11 +1082,11 @@ class OSMAddressFormatter(object): except Exception: continue - country, candidate_languages, language_props = self.language_rtree.country_and_languages(latitude, longitude) + country, candidate_languages = self.country_rtree.country_and_languages(latitude, longitude) if not (country and candidate_languages): continue - more_than_one_official_language = sum((1 for l in candidate_languages if int(l['default']))) > 1 + more_than_one_official_language = sum((1 for l, d in candidate_languages if d)) > 1 base_name_tag = None for t in all_base_name_tags: @@ -1103,7 +1102,7 @@ class OSMAddressFormatter(object): names = defaultdict(list) if len(candidate_languages) == 1: - default_language = candidate_languages[0]['lang'] + default_language = candidate_languages[0][0] elif not more_than_one_official_language: default_language = None name = way['name'] diff --git a/scripts/geodata/osm/osm_address_training_data.py b/scripts/geodata/osm/osm_address_training_data.py index 6252b068..c02434d7 100644 --- a/scripts/geodata/osm/osm_address_training_data.py +++ b/scripts/geodata/osm/osm_address_training_data.py @@ -12,25 +12,25 @@ plenty of disk space. The following commands can be used in parallel to create all the training sets: Ways: -python osm_address_training_data.py -s $(OSM_DIR)/planet-ways.osm --language-rtree-dir=$(RTREE_DIR) -o $(OUT_DIR) +python osm_address_training_data.py -s $(OSM_DIR)/planet-ways.osm --country-rtree-dir=$(RTREE_DIR) -o $(OUT_DIR) Venues: -python osm_address_training_data.py -v $(OSM_DIR)/planet-venues.osm --language-rtree-dir=$(RTREE_DIR) -o $(OUT_DIR) +python osm_address_training_data.py -v $(OSM_DIR)/planet-venues.osm --country-rtree-dir=$(RTREE_DIR) -o $(OUT_DIR) Limited formatted addresses: -python osm_address_training_data.py -a -l $(OSM_DIR)/planet-addresses.osm --language-rtree-dir=$(LANG_RTREE_DIR) --rtree-dir=$(RTREE_DIR) --neighborhoods-rtree-dir=$(NEIGHBORHOODS_RTREE_DIR) -o $(OUT_DIR) +python osm_address_training_data.py -a -l $(OSM_DIR)/planet-addresses.osm --country-rtree-dir=$(COUNTRY_RTREE_DIR) --rtree-dir=$(RTREE_DIR) --neighborhoods-rtree-dir=$(NEIGHBORHOODS_RTREE_DIR) -o $(OUT_DIR) Formatted addresses (tagged): -python osm_address_training_data.py -a $(OSM_DIR)/planet-addresses.osm -f --language-rtree-dir=$(LANG_RTREE_DIR) --neighborhoods-rtree-dir=$(NEIGHBORHOODS_RTREE_DIR) --rtree-dir=$(RTREE_DIR) --quattroshapes-rtree-dir=$(QS_TREE_DIR) --geonames-db=$(GEONAMES_DB_PATH) -o $(OUT_DIR) +python osm_address_training_data.py -a $(OSM_DIR)/planet-addresses.osm -f --country-rtree-dir=$(COUNTRY_RTREE_DIR) --neighborhoods-rtree-dir=$(NEIGHBORHOODS_RTREE_DIR) --rtree-dir=$(RTREE_DIR) --quattroshapes-rtree-dir=$(QS_TREE_DIR) --geonames-db=$(GEONAMES_DB_PATH) -o $(OUT_DIR) Formatted addresses (untagged): -python osm_address_training_data.py -a $(OSM_DIR)/planet-addresses.osm -f -u --language-rtree-dir=$(LANG_RTREE_DIR) --neighborhoods-rtree-dir=$(NEIGHBORHOODS_RTREE_DIR) --rtree-dir=$(RTREE_DIR) --quattroshapes-rtree-dir=$(QS_TREE_DIR) --geonames-db=$(GEONAMES_DB_PATH) -o $(OUT_DIR) +python osm_address_training_data.py -a $(OSM_DIR)/planet-addresses.osm -f -u --country-rtree-dir=$(COUNTRY_RTREE_DIR) --neighborhoods-rtree-dir=$(NEIGHBORHOODS_RTREE_DIR) --rtree-dir=$(RTREE_DIR) --quattroshapes-rtree-dir=$(QS_TREE_DIR) --geonames-db=$(GEONAMES_DB_PATH) -o $(OUT_DIR) Intersections (after running intersections.py to create the JSON file): -python osm_address_training_data -x $(OSM_DIR)/intersections.json -f --language-rtree-dir=$(LANG_RTREE_DIR) --neighborhoods-rtree-dir=$(NEIGHBORHOODS_RTREE_DIR) --rtree-dir=$(RTREE_DIR) --quattroshapes-rtree-dir=$(QS_TREE_DIR) --geonames-db=$(GEONAMES_DB_PATH) -o $(OUT_DIR) +python osm_address_training_data -x $(OSM_DIR)/intersections.json -f --country-rtree-dir=$(COUNTRY_RTREE_DIR) --neighborhoods-rtree-dir=$(NEIGHBORHOODS_RTREE_DIR) --rtree-dir=$(RTREE_DIR) --quattroshapes-rtree-dir=$(QS_TREE_DIR) --geonames-db=$(GEONAMES_DB_PATH) -o $(OUT_DIR) Toponyms: -python osm_address_training_data.py -b $(OSM_DIR)/planet-borders.osm --language-rtree-dir=$(LANG_RTREE_DIR) -o $(OUT_DIR) +python osm_address_training_data.py -b $(OSM_DIR)/planet-borders.osm --country-rtree-dir=$(COUNTRY_RTREE_DIR) -o $(OUT_DIR) ''' import argparse @@ -91,7 +91,7 @@ def normalize_osm_name_tag(tag, script=False): return norm.split('_', 1)[0] -def get_language_names(language_rtree, key, value, tag_prefix='name'): +def get_language_names(country_rtree, key, value, tag_prefix='name'): if not ('lat' in value and 'lon' in value): return None, None @@ -104,7 +104,7 @@ def get_language_names(language_rtree, key, value, tag_prefix='name'): except Exception: return None, None - country, candidate_languages, language_props = language_rtree.country_and_languages(latitude, longitude) + country, candidate_languages = country_rtree.country_and_languages(latitude, longitude) if not (country and candidate_languages): return None, None @@ -177,7 +177,7 @@ def get_language_names(language_rtree, key, value, tag_prefix='name'): return country, name_language -def build_ways_training_data(language_rtree, infile, out_dir, abbreviate_streets=True): +def build_ways_training_data(country_rtree, infile, out_dir, abbreviate_streets=True): ''' Creates a training set for language classification using most OSM ways (streets) under a fairly lengthy osmfilter definition which attempts to @@ -193,7 +193,7 @@ def build_ways_training_data(language_rtree, infile, out_dir, abbreviate_streets writer = csv.writer(f, 'tsv_no_quote') for key, value, deps in parse_osm(infile, allowed_types=WAYS_RELATIONS): - country, name_language = get_language_names(language_rtree, key, value, tag_prefix='name') + country, name_language = get_language_names(country_rtree, key, value, tag_prefix='name') if not name_language: continue @@ -242,7 +242,7 @@ POSTAL_KEYS = ( ) -def build_toponym_training_data(language_rtree, infile, out_dir): +def build_toponym_training_data(country_rtree, infile, out_dir): ''' Data set of toponyms by language and country which should assist in language classification. OSM tends to use the native language @@ -268,7 +268,7 @@ def build_toponym_training_data(language_rtree, infile, out_dir): except Exception: continue - country, candidate_languages, language_props = language_rtree.country_and_languages(latitude, longitude) + country, candidate_languages = country_rtree.country_and_languages(latitude, longitude) if not (country and candidate_languages): continue @@ -340,7 +340,7 @@ def build_toponym_training_data(language_rtree, infile, out_dir): f.close() -def build_address_training_data(langauge_rtree, infile, out_dir, format=False): +def build_address_training_data(country_rtree, infile, out_dir, format=False): ''' Creates training set similar to the ways data but using addr:street tags instead. These may be slightly closer to what we'd see in real live addresses, containing @@ -354,7 +354,7 @@ def build_address_training_data(langauge_rtree, infile, out_dir, format=False): writer = csv.writer(f, 'tsv_no_quote') for key, value, deps in parse_osm(infile): - country, street_language = get_language_names(language_rtree, key, value, tag_prefix='addr:street') + country, street_language = get_language_names(country_rtree, key, value, tag_prefix='addr:street') if not street_language: continue @@ -374,14 +374,14 @@ def build_address_training_data(langauge_rtree, infile, out_dir, format=False): VENUE_LANGUAGE_DATA_FILENAME = 'names_by_language.tsv' -def build_venue_training_data(language_rtree, infile, out_dir): +def build_venue_training_data(country_rtree, infile, out_dir): i = 0 f = open(os.path.join(out_dir, VENUE_LANGUAGE_DATA_FILENAME), 'w') writer = csv.writer(f, 'tsv_no_quote') for key, value, deps in parse_osm(infile): - country, name_language = get_language_names(language_rtree, key, value, tag_prefix='name') + country, name_language = get_language_names(country_rtree, key, value, tag_prefix='name') if not name_language: continue @@ -455,9 +455,9 @@ if __name__ == '__main__': parser.add_argument('-x', '--intersections-file', help='Path to planet-ways-latlons.osm') - parser.add_argument('--language-rtree-dir', + parser.add_argument('--country-rtree-dir', required=True, - help='Language RTree directory') + help='Country RTree directory') parser.add_argument('--rtree-dir', default=None, @@ -493,7 +493,8 @@ if __name__ == '__main__': args = parser.parse_args() - language_rtree = LanguagePolygonIndex.load(args.language_rtree_dir) + country_rtree = OSMCountryReverseGeocoder.load(args.country_rtree_dir) + osm_rtree = None if args.rtree_dir: osm_rtree = OSMReverseGeocoder.load(args.rtree_dir) @@ -525,11 +526,11 @@ if __name__ == '__main__': # Can parallelize if args.streets_file: - build_ways_training_data(language_rtree, args.streets_file, args.out_dir, abbreviate_streets=not args.unabbreviated) + build_ways_training_data(country_rtree, args.streets_file, args.out_dir, abbreviate_streets=not args.unabbreviated) if args.borders_file: - build_toponym_training_data(language_rtree, args.borders_file, args.out_dir) + build_toponym_training_data(country_rtree, args.borders_file, args.out_dir) if args.venues_file: - build_venue_training_data(language_rtree, args.venues_file, args.out_dir) + build_venue_training_data(country_rtree, args.venues_file, args.out_dir) if args.address_file or args.intersections_file: if osm_rtree is None: @@ -542,20 +543,20 @@ if __name__ == '__main__': parser.error('--geonames-db required for formatted addresses') if args.address_file and args.format: - components = AddressComponents(osm_rtree, language_rtree, neighborhoods_rtree, quattroshapes_rtree, geonames) - osm_formatter = OSMAddressFormatter(components, subdivisions_rtree, buildings_rtree, metro_stations_index) + components = AddressComponents(osm_rtree, neighborhoods_rtree, quattroshapes_rtree, geonames) + osm_formatter = OSMAddressFormatter(components, country_rtree, subdivisions_rtree, buildings_rtree, metro_stations_index) osm_formatter.build_training_data(args.address_file, args.out_dir, tag_components=not args.untagged) if args.address_file and args.limited_addresses: - components = AddressComponents(osm_rtree, language_rtree, neighborhoods_rtree, quattroshapes_rtree, geonames) - osm_formatter = OSMAddressFormatter(components, subdivisions_rtree, buildings_rtree, metro_stations_index, splitter=u' ') + components = AddressComponents(osm_rtree, neighborhoods_rtree, quattroshapes_rtree, geonames) + osm_formatter = OSMAddressFormatter(components, country_rtree, subdivisions_rtree, buildings_rtree, metro_stations_index, splitter=u' ') osm_formatter.build_limited_training_data(args.address_file, args.out_dir) if args.place_nodes_file and args.format: - components = AddressComponents(osm_rtree, language_rtree, neighborhoods_rtree, quattroshapes_rtree, geonames) - osm_formatter = OSMAddressFormatter(components, subdivisions_rtree, buildings_rtree, metro_stations_index) + components = AddressComponents(osm_rtree, neighborhoods_rtree, quattroshapes_rtree, geonames) + osm_formatter = OSMAddressFormatter(components, country_rtree, subdivisions_rtree, buildings_rtree, metro_stations_index) osm_formatter.build_place_training_data(args.place_nodes_file, args.out_dir, tag_components=not args.untagged) if args.intersections_file and args.format: - components = AddressComponents(osm_rtree, language_rtree, neighborhoods_rtree, quattroshapes_rtree, geonames) - osm_formatter = OSMAddressFormatter(components, subdivisions_rtree, buildings_rtree, metro_stations_index) + components = AddressComponents(osm_rtree, neighborhoods_rtree, quattroshapes_rtree, geonames) + osm_formatter = OSMAddressFormatter(components, country_rtree, subdivisions_rtree, buildings_rtree, metro_stations_index) osm_formatter.build_intersections_training_data(args.intersections_file, args.out_dir, tag_components=not args.untagged) diff --git a/scripts/geodata/polygons/reverse_geocode.py b/scripts/geodata/polygons/reverse_geocode.py index 5a414aad..e695483b 100644 --- a/scripts/geodata/polygons/reverse_geocode.py +++ b/scripts/geodata/polygons/reverse_geocode.py @@ -465,7 +465,7 @@ class OSMCountryReverseGeocoder(OSMReverseGeocoder): polygon_reader = OSMCountryPolygonReader @classmethod - def country_and_languages(cls, osm_components): + def country_and_languages_from_components(cls, osm_components): country = None for c in reversed(osm_components): country = c.get('ISO3166-1:alpha2') @@ -501,6 +501,10 @@ class OSMCountryReverseGeocoder(OSMReverseGeocoder): return country, default_languages + def country_and_languages(self, lat, lon): + osm_components = self.point_in_poly(lat, lon, return_all=True) + return self.country_and_languages_from_components(osm_components) + if __name__ == '__main__': # Handle argument parsing here