[languages] using country_and_languages method in OSM, neighborhoods and OpenAddresses

This commit is contained in:
Al
2016-10-05 02:49:55 -04:00
parent 98a8d898a1
commit faf418decb
7 changed files with 95 additions and 88 deletions

View File

@@ -35,6 +35,7 @@ from geodata.math.sampling import cdf, weighted_choice
from geodata.names.normalization import name_affixes from geodata.names.normalization import name_affixes
from geodata.osm.components import osm_address_components from geodata.osm.components import osm_address_components
from geodata.places.config import place_config from geodata.places.config import place_config
from geodata.polygons.reverse_geocode import OSMCountryReverseGeocoder
from geodata.states.state_abbreviations import state_abbreviations from geodata.states.state_abbreviations import state_abbreviations
from geodata.text.utils import is_numeric from geodata.text.utils import is_numeric
@@ -69,7 +70,7 @@ class AddressComponents(object):
prefixes like "London Borough of", pruning duplicates like "Antwerpen, Antwerpen, Antwerpen". prefixes like "London Borough of", pruning duplicates like "Antwerpen, Antwerpen, Antwerpen".
Usage: Usage:
>>> components = AddressComponents(osm_admin_rtree, language_rtree, neighborhoods_rtree, buildings_rtree, subdivisions_rtree, quattroshapes_rtree, geonames) >>> components = AddressComponents(osm_admin_rtree, neighborhoods_rtree, buildings_rtree, subdivisions_rtree, quattroshapes_rtree, geonames)
>>> components.expand({'name': 'Hackney Empire'}, 51.54559, -0.05567) >>> components.expand({'name': 'Hackney Empire'}, 51.54559, -0.05567)
Returns (results vary because of randomness): Returns (results vary because of randomness):
@@ -145,7 +146,7 @@ class AddressComponents(object):
AddressFormatter.UNIT: Unit, AddressFormatter.UNIT: Unit,
} }
def __init__(self, osm_admin_rtree, language_rtree, neighborhoods_rtree, quattroshapes_rtree, geonames): def __init__(self, osm_admin_rtree, neighborhoods_rtree, quattroshapes_rtree, geonames):
self.config = yaml.load(open(PARSER_DEFAULT_CONFIG)) self.config = yaml.load(open(PARSER_DEFAULT_CONFIG))
self.setup_component_dependencies() self.setup_component_dependencies()
@@ -153,7 +154,6 @@ class AddressComponents(object):
self.address_level_dropout_probabilities = {k: v['probability'] for k, v in six.iteritems(self.config['dropout'])} self.address_level_dropout_probabilities = {k: v['probability'] for k, v in six.iteritems(self.config['dropout'])}
self.osm_admin_rtree = osm_admin_rtree self.osm_admin_rtree = osm_admin_rtree
self.language_rtree = language_rtree
self.neighborhoods_rtree = neighborhoods_rtree self.neighborhoods_rtree = neighborhoods_rtree
self.quattroshapes_rtree = quattroshapes_rtree self.quattroshapes_rtree = quattroshapes_rtree
self.geonames = geonames self.geonames = geonames
@@ -249,6 +249,9 @@ class AddressComponents(object):
def osm_reverse_geocoded_components(self, latitude, longitude): def osm_reverse_geocoded_components(self, latitude, longitude):
return self.osm_admin_rtree.point_in_poly(latitude, longitude, return_all=True) return self.osm_admin_rtree.point_in_poly(latitude, longitude, return_all=True)
def osm_country_and_languages(self, osm_components):
return OSMCountryReverseGeocoder.country_and_languages_from_components(osm_components)
def categorize_osm_component(self, country, props, containing_components): def categorize_osm_component(self, country, props, containing_components):
containing_ids = [(c['type'], c['id']) for c in containing_components if 'type' in c and 'id' in c] containing_ids = [(c['type'], c['id']) for c in containing_components if 'type' in c and 'id' in c]
@@ -288,17 +291,16 @@ class AddressComponents(object):
language = None language = None
if len(candidate_languages) == 1: if len(candidate_languages) == 1:
language = candidate_languages[0]['lang'] language = candidate_languages[0][0]
else: else:
street = components.get(AddressFormatter.ROAD, None) street = components.get(AddressFormatter.ROAD, None)
lang_tuples = [(l['lang'], l['default']) for l in candidate_languages]
if street is not None: if street is not None:
language = disambiguate_language(street, lang_tuples) language = disambiguate_language(street, candidate_languages)
else: else:
if has_non_latin_script(lang_tuples): if has_non_latin_script(candidate_languages):
for component, value in six.iteritems(components): for component, value in six.iteritems(components):
language, script_langs = disambiguate_language_script(value, lang_tuples) language, script_langs = disambiguate_language_script(value, candidate_languages)
if language is not UNKNOWN_LANGUAGE: if language is not UNKNOWN_LANGUAGE:
break break
else: else:
@@ -1247,16 +1249,14 @@ class AddressComponents(object):
except Exception: except Exception:
return None, None, None return None, None, None
country, candidate_languages, language_props = self.language_rtree.country_and_languages(latitude, longitude) osm_components = self.osm_reverse_geocoded_components(latitude, longitude)
if not (country and candidate_languages): country, candidate_languages = self.osm_country_and_languages(osm_components)
return None, None, None
more_than_one_official_language = len(candidate_languages) > 1 more_than_one_official_language = len(candidate_languages) > 1
non_local_language = None non_local_language = None
language_suffix = '' language_suffix = ''
osm_components = self.osm_reverse_geocoded_components(latitude, longitude)
neighborhoods = self.neighborhood_components(latitude, longitude) neighborhoods = self.neighborhood_components(latitude, longitude)
all_osm_components = osm_components + neighborhoods all_osm_components = osm_components + neighborhoods
@@ -1272,7 +1272,7 @@ class AddressComponents(object):
if address_state: if address_state:
address_components[AddressFormatter.STATE] = address_state address_components[AddressFormatter.STATE] = address_state
all_languages = set([l['lang'] for l in candidate_languages]) all_languages = set([l for l, d in candidate_languages])
self.normalize_place_names(address_components, all_osm_components, country=country, languages=all_languages) self.normalize_place_names(address_components, all_osm_components, country=country, languages=all_languages)
@@ -1329,7 +1329,9 @@ class AddressComponents(object):
except Exception: except Exception:
return None, None, None return None, None, None
country, candidate_languages, language_props = self.language_rtree.country_and_languages(latitude, longitude) osm_components = self.osm_reverse_geocoded_components(latitude, longitude)
country, candidate_languages = self.osm_country_and_languages(osm_components)
if not (country and candidate_languages): if not (country and candidate_languages):
return None, None, None return None, None, None
@@ -1355,10 +1357,9 @@ class AddressComponents(object):
street = address_components.get(AddressFormatter.ROAD) street = address_components.get(AddressFormatter.ROAD)
osm_components = self.osm_reverse_geocoded_components(latitude, longitude)
neighborhoods = self.neighborhood_components(latitude, longitude) neighborhoods = self.neighborhood_components(latitude, longitude)
all_languages = set([l['lang'] for l in candidate_languages]) all_languages = set([l for l, d in candidate_languages])
all_osm_components = osm_components + neighborhoods all_osm_components = osm_components + neighborhoods
language_suffix = self.pick_language_suffix(all_osm_components, language, non_local_language, more_than_one_official_language) language_suffix = self.pick_language_suffix(all_osm_components, language, non_local_language, more_than_one_official_language)

View File

@@ -23,7 +23,7 @@ from geodata.osm.definitions import osm_definitions
from geodata.osm.extract import parse_osm, osm_type_and_id, NODE, WAY, RELATION, OSM_NAME_TAGS from geodata.osm.extract import parse_osm, osm_type_and_id, NODE, WAY, RELATION, OSM_NAME_TAGS
from geodata.polygons.index import * from geodata.polygons.index import *
from geodata.polygons.language_polys import LanguagePolygonIndex from geodata.polygons.language_polys import LanguagePolygonIndex
from geodata.polygons.reverse_geocode import QuattroshapesReverseGeocoder, OSMReverseGeocoder from geodata.polygons.reverse_geocode import QuattroshapesReverseGeocoder, OSMCountryReverseGeocoder, OSMReverseGeocoder
from geodata.statistics.tf_idf import IDFIndex from geodata.statistics.tf_idf import IDFIndex
@@ -234,7 +234,7 @@ class NeighborhoodReverseGeocoder(RTreePolygonIndex):
return doc return doc
@classmethod @classmethod
def create_from_osm_and_quattroshapes(cls, filename, quattroshapes_dir, language_rtree_dir, osm_rtree_dir, output_dir): def create_from_osm_and_quattroshapes(cls, filename, quattroshapes_dir, country_rtree_dir, osm_rtree_dir, output_dir):
''' '''
Given an OSM file (planet or some other bounds) containing neighborhoods Given an OSM file (planet or some other bounds) containing neighborhoods
as points (some suburbs have boundaries) as points (some suburbs have boundaries)
@@ -259,7 +259,7 @@ class NeighborhoodReverseGeocoder(RTreePolygonIndex):
logger.info('Creating ClickThatHood neighborhoods') logger.info('Creating ClickThatHood neighborhoods')
cth = ClickThatHoodReverseGeocoder.create_neighborhoods_index() cth = ClickThatHoodReverseGeocoder.create_neighborhoods_index()
language_rtree = LanguagePolygonIndex.load(language_rtree_dir) country_rtree = OSMCountryReverseGeocoder.load(country_rtree_dir)
osm_admin_rtree = OSMReverseGeocoder.load(osm_rtree_dir) osm_admin_rtree = OSMReverseGeocoder.load(osm_rtree_dir)
@@ -307,7 +307,8 @@ class NeighborhoodReverseGeocoder(RTreePolygonIndex):
possible_neighborhood = osm_definitions.meets_definition(attrs, osm_definitions.NEIGHBORHOOD) possible_neighborhood = osm_definitions.meets_definition(attrs, osm_definitions.NEIGHBORHOOD)
is_neighborhood = attrs.get('place') in ('neighbourhood', 'neighborhood') is_neighborhood = attrs.get('place') in ('neighbourhood', 'neighborhood')
country, candidate_languages, language_props = language_rtree.country_and_languages(lat, lon) country, candidate_languages = country_rtree.country_and_languages(lat, lon)
component_name = None component_name = None
component_name = osm_address_components.component_from_properties(country, attrs) component_name = osm_address_components.component_from_properties(country, attrs)
@@ -473,8 +474,8 @@ if __name__ == '__main__':
parser.add_argument('-a', '--osm-admin-rtree-dir', parser.add_argument('-a', '--osm-admin-rtree-dir',
help='Path to OSM admin rtree dir') help='Path to OSM admin rtree dir')
parser.add_argument('-l', '--language-rtree-dir', parser.add_argument('-c', '--country-rtree-dir',
help='Path to language rtree dir') help='Path to country rtree dir')
parser.add_argument('-n', '--osm-neighborhoods-file', parser.add_argument('-n', '--osm-neighborhoods-file',
help='Path to OSM neighborhoods file (no dependencies, .osm format)') help='Path to OSM neighborhoods file (no dependencies, .osm format)')
@@ -486,10 +487,11 @@ if __name__ == '__main__':
logging.basicConfig(level=logging.INFO) logging.basicConfig(level=logging.INFO)
args = parser.parse_args() args = parser.parse_args()
if args.osm_neighborhoods_file and args.quattroshapes_dir and args.osm_admin_rtree_dir and args.language_rtree_dir: if args.osm_neighborhoods_file and args.quattroshapes_dir and args.osm_admin_rtree_dir and args.country_rtree_dir:
index = NeighborhoodReverseGeocoder.create_from_osm_and_quattroshapes( index = NeighborhoodReverseGeocoder.create_from_osm_and_quattroshapes(
args.osm_neighborhoods_file, args.osm_neighborhoods_file,
args.quattroshapes_dir, args.quattroshapes_dir,
args.country_rtree_dir,
args.language_rtree_dir, args.language_rtree_dir,
args.osm_admin_rtree_dir, args.osm_admin_rtree_dir,
args.out_dir args.out_dir

View File

@@ -67,9 +67,9 @@ class OpenAddressesFormatter(object):
re.I | re.UNICODE) re.I | re.UNICODE)
unit_type_regexes[lang] = pattern unit_type_regexes[lang] = pattern
def __init__(self, components, debug=False): def __init__(self, components, country_rtree, debug=False):
self.components = components self.components = components
self.language_rtree = components.language_rtree self.country_rtree = country_rtree
self.debug = debug self.debug = debug
@@ -309,7 +309,7 @@ class OpenAddressesFormatter(object):
continue continue
if components: if components:
country, candidate_languages, language_props = self.language_rtree.country_and_languages(latitude, longitude) country, candidate_languages = self.country_rtree.country_and_languages(latitude, longitude)
if not (country and candidate_languages): if not (country and candidate_languages):
continue continue

View File

@@ -15,7 +15,7 @@ from geodata.addresses.components import AddressComponents
from geodata.geonames.db import GeoNamesDB from geodata.geonames.db import GeoNamesDB
from geodata.polygons.language_polys import LanguagePolygonIndex from geodata.polygons.language_polys import LanguagePolygonIndex
from geodata.neighborhoods.reverse_geocode import NeighborhoodReverseGeocoder from geodata.neighborhoods.reverse_geocode import NeighborhoodReverseGeocoder
from geodata.polygons.reverse_geocode import OSMReverseGeocoder, QuattroshapesReverseGeocoder from geodata.polygons.reverse_geocode import OSMReverseGeocoder, OSMCountryReverseGeocoder, QuattroshapesReverseGeocoder
if __name__ == '__main__': if __name__ == '__main__':
@@ -35,9 +35,9 @@ if __name__ == '__main__':
default=False, default=False,
help='Save untagged formatted addresses (slow)') help='Save untagged formatted addresses (slow)')
parser.add_argument('--language-rtree-dir', parser.add_argument('--country-rtree-dir',
required=True, required=True,
help='Language RTree directory') help='Country RTree directory')
parser.add_argument('--rtree-dir', parser.add_argument('--rtree-dir',
default=None, default=None,
@@ -66,7 +66,7 @@ if __name__ == '__main__':
args = parser.parse_args() args = parser.parse_args()
language_rtree = LanguagePolygonIndex.load(args.language_rtree_dir) country_rtree = OSMCountryReverseGeocoder.load(args.country_rtree_dir)
osm_rtree = None osm_rtree = None
if args.rtree_dir: if args.rtree_dir:
@@ -86,7 +86,7 @@ if __name__ == '__main__':
geonames = GeoNamesDB(args.geonames_db) geonames = GeoNamesDB(args.geonames_db)
if args.openaddresses_dir and args.format: if args.openaddresses_dir and args.format:
components = AddressComponents(osm_rtree, language_rtree, neighborhoods_rtree, quattroshapes_rtree, geonames) components = AddressComponents(osm_rtree, neighborhoods_rtree, quattroshapes_rtree, geonames)
oa_formatter = OpenAddressesFormatter(components, debug=args.debug) oa_formatter = OpenAddressesFormatter(components, country_rtree, debug=args.debug)
oa_formatter.build_training_data(args.openaddresses_dir, args.out_dir, tag_components=not args.untagged) oa_formatter.build_training_data(args.openaddresses_dir, args.out_dir, tag_components=not args.untagged)

View File

@@ -7,7 +7,7 @@ import six
import sys import sys
import yaml import yaml
from collections import OrderedDict from collections import defaultdict, OrderedDict, Counter
from six import itertools from six import itertools
this_dir = os.path.realpath(os.path.dirname(__file__)) this_dir = os.path.realpath(os.path.dirname(__file__))
@@ -109,8 +109,8 @@ class OSMAddressFormatter(object):
('is_in:region', AddressFormatter.STATE), ('is_in:region', AddressFormatter.STATE),
# Used in Tunisia # Used in Tunisia
('addr:governorate', AddressFormatter.STATE), ('addr:governorate', AddressFormatter.STATE),
('addr:postal_code', AddressFormatter.POSTCODE),
('addr:postcode', AddressFormatter.POSTCODE), ('addr:postcode', AddressFormatter.POSTCODE),
('addr:postal_code', AddressFormatter.POSTCODE),
('addr:zipcode', AddressFormatter.POSTCODE), ('addr:zipcode', AddressFormatter.POSTCODE),
('postal_code', AddressFormatter.POSTCODE), ('postal_code', AddressFormatter.POSTCODE),
('addr:country', AddressFormatter.COUNTRY), ('addr:country', AddressFormatter.COUNTRY),
@@ -138,6 +138,8 @@ class OSMAddressFormatter(object):
'commercial': AddressComponents.zones.COMMERCIAL, 'commercial': AddressComponents.zones.COMMERCIAL,
'industrial': AddressComponents.zones.INDUSTRIAL, 'industrial': AddressComponents.zones.INDUSTRIAL,
'residential': AddressComponents.zones.RESIDENTIAL, 'residential': AddressComponents.zones.RESIDENTIAL,
'university': AddressComponents.zones.UNIVERSITY,
'college': AddressComponents.zones.UNIVERSITY,
}, },
'amenity': { 'amenity': {
'university': AddressComponents.zones.UNIVERSITY, 'university': AddressComponents.zones.UNIVERSITY,
@@ -147,10 +149,10 @@ class OSMAddressFormatter(object):
boundary_component_priorities = {k: i for i, k in enumerate(AddressFormatter.BOUNDARY_COMPONENTS_ORDERED)} boundary_component_priorities = {k: i for i, k in enumerate(AddressFormatter.BOUNDARY_COMPONENTS_ORDERED)}
def __init__(self, components, subdivisions_rtree=None, buildings_rtree=None, metro_stations_index=None): def __init__(self, components, country_rtree, subdivisions_rtree=None, buildings_rtree=None, metro_stations_index=None):
# Instance of AddressComponents, contains structures for reverse geocoding, etc. # Instance of AddressComponents, contains structures for reverse geocoding, etc.
self.components = components self.components = components
self.language_rtree = components.language_rtree self.country_rtree = country_rtree
self.subdivisions_rtree = subdivisions_rtree self.subdivisions_rtree = subdivisions_rtree
self.buildings_rtree = buildings_rtree self.buildings_rtree = buildings_rtree
@@ -168,7 +170,7 @@ class OSMAddressFormatter(object):
if len(candidate_languages) > 1: if len(candidate_languages) > 1:
street = tags.get('addr:street', None) street = tags.get('addr:street', None)
namespaced = [l['lang'] for l in candidate_languages if 'addr:street:{}'.format(l['lang']) in tags] namespaced = [l for l, d in candidate_languages if 'addr:street:{}'.format(l) in tags]
if namespaced and random.random() < pick_namespaced_language_prob: if namespaced and random.random() < pick_namespaced_language_prob:
language = random.choice(namespaced) language = random.choice(namespaced)
@@ -344,7 +346,6 @@ class OSMAddressFormatter(object):
return True return True
return False return False
def add_metro_station(self, address_components, latitude, longitude, language=None, default_language=None): def add_metro_station(self, address_components, latitude, longitude, language=None, default_language=None):
''' '''
Metro stations Metro stations
@@ -476,19 +477,10 @@ class OSMAddressFormatter(object):
return (), None return (), None
osm_components = self.components.osm_reverse_geocoded_components(latitude, longitude) osm_components = self.components.osm_reverse_geocoded_components(latitude, longitude)
country, candidate_languages, language_props = self.language_rtree.country_and_languages(latitude, longitude)
if country and candidate_languages:
local_languages = [(l['lang'], bool(int(l['default']))) for l in candidate_languages]
else:
for c in reversed(osm_components):
country = c.get('ISO3166-1:alpha2')
if country:
country = country.lower()
break
else:
return (), None
local_languages = [(lang, bool(int(default))) for lang, default in get_country_languages(country).iteritems()] country, candidate_languages = OSMCountryReverseGeocoder.country_and_languages_from_components(osm_components)
local_languages = candidate_languages
all_local_languages = set([l for l, d in local_languages]) all_local_languages = set([l for l, d in local_languages])
random_languages = set(INTERNET_LANGUAGE_DISTRIBUTION) random_languages = set(INTERNET_LANGUAGE_DISTRIBUTION)
@@ -551,12 +543,17 @@ class OSMAddressFormatter(object):
# Calculate how many records to produce for this place given its population # Calculate how many records to produce for this place given its population
population_divisor = 10000 # Add one record for every 10k in population population_divisor = 10000 # Add one record for every 10k in population
min_references = 5 # Every place gets at least 5 reference to account for variations min_references = 5 # Every place gets at least 5 reference to account for variations
if component_name == AddressFormatter.CITY:
# Cities get a few extra references over e.g. a state_district with the same name
# so that if the population is unknown, hopefully the city will have more references
# and the parser will prefer that meaning
min_references += 2
max_references = 1000 # Cap the number of references e.g. for India and China country nodes max_references = 1000 # Cap the number of references e.g. for India and China country nodes
num_references = min(population / population_divisor + min_references, max_references) num_references = min(population / population_divisor + min_references, max_references)
cldr_country_prob = float(nested_get(self.config, ('places', 'cldr_country_probability'), default=0.0)) cldr_country_prob = float(nested_get(self.config, ('places', 'cldr_country_probability'), default=0.0))
for name_tag in ('name', 'alt_name', 'loc_name', 'short_name', 'int_name'): for name_tag in ('name', 'alt_name', 'loc_name', 'short_name', 'int_name', 'name:simple', 'official_name'):
if more_than_one_official_language: if more_than_one_official_language:
name = tags.get(name_tag) name = tags.get(name_tag)
language_suffix = '' language_suffix = ''
@@ -757,7 +754,7 @@ class OSMAddressFormatter(object):
except Exception: except Exception:
return None, None, None return None, None, None
country, candidate_languages, language_props = self.language_rtree.country_and_languages(latitude, longitude) country, candidate_languages = self.country_rtree.country_and_languages(latitude, longitude)
if not (country and candidate_languages): if not (country and candidate_languages):
return None, None, None return None, None, None
@@ -880,7 +877,7 @@ class OSMAddressFormatter(object):
except Exception: except Exception:
return None, None, None return None, None, None
country, candidate_languages, language_props = self.language_rtree.country_and_languages(latitude, longitude) country, candidate_languages = self.country_rtree.country_and_languages(latitude, longitude)
if not (country and candidate_languages): if not (country and candidate_languages):
return None, None, None return None, None, None
@@ -986,8 +983,10 @@ class OSMAddressFormatter(object):
for node_id, tags, deps in parse_osm(infile): for node_id, tags, deps in parse_osm(infile):
tags['type'], tags['id'] = node_id.split(':') tags['type'], tags['id'] = node_id.split(':')
place_tags, country = self.node_place_tags(tags) place_tags, country = self.node_place_tags(tags)
for address_components, language, is_default in place_tags: for address_components, language, is_default in place_tags:
addresses = self.formatted_places(address_components, country, language) addresses = self.formatted_places(address_components, country, language)
if language is None: if language is None:
language = UNKNOWN_LANGUAGE language = UNKNOWN_LANGUAGE
@@ -1083,11 +1082,11 @@ class OSMAddressFormatter(object):
except Exception: except Exception:
continue continue
country, candidate_languages, language_props = self.language_rtree.country_and_languages(latitude, longitude) country, candidate_languages = self.country_rtree.country_and_languages(latitude, longitude)
if not (country and candidate_languages): if not (country and candidate_languages):
continue continue
more_than_one_official_language = sum((1 for l in candidate_languages if int(l['default']))) > 1 more_than_one_official_language = sum((1 for l, d in candidate_languages if d)) > 1
base_name_tag = None base_name_tag = None
for t in all_base_name_tags: for t in all_base_name_tags:
@@ -1103,7 +1102,7 @@ class OSMAddressFormatter(object):
names = defaultdict(list) names = defaultdict(list)
if len(candidate_languages) == 1: if len(candidate_languages) == 1:
default_language = candidate_languages[0]['lang'] default_language = candidate_languages[0][0]
elif not more_than_one_official_language: elif not more_than_one_official_language:
default_language = None default_language = None
name = way['name'] name = way['name']

View File

@@ -12,25 +12,25 @@ plenty of disk space. The following commands can be used in parallel to create
all the training sets: all the training sets:
Ways: Ways:
python osm_address_training_data.py -s $(OSM_DIR)/planet-ways.osm --language-rtree-dir=$(RTREE_DIR) -o $(OUT_DIR) python osm_address_training_data.py -s $(OSM_DIR)/planet-ways.osm --country-rtree-dir=$(RTREE_DIR) -o $(OUT_DIR)
Venues: Venues:
python osm_address_training_data.py -v $(OSM_DIR)/planet-venues.osm --language-rtree-dir=$(RTREE_DIR) -o $(OUT_DIR) python osm_address_training_data.py -v $(OSM_DIR)/planet-venues.osm --country-rtree-dir=$(RTREE_DIR) -o $(OUT_DIR)
Limited formatted addresses: Limited formatted addresses:
python osm_address_training_data.py -a -l $(OSM_DIR)/planet-addresses.osm --language-rtree-dir=$(LANG_RTREE_DIR) --rtree-dir=$(RTREE_DIR) --neighborhoods-rtree-dir=$(NEIGHBORHOODS_RTREE_DIR) -o $(OUT_DIR) python osm_address_training_data.py -a -l $(OSM_DIR)/planet-addresses.osm --country-rtree-dir=$(COUNTRY_RTREE_DIR) --rtree-dir=$(RTREE_DIR) --neighborhoods-rtree-dir=$(NEIGHBORHOODS_RTREE_DIR) -o $(OUT_DIR)
Formatted addresses (tagged): Formatted addresses (tagged):
python osm_address_training_data.py -a $(OSM_DIR)/planet-addresses.osm -f --language-rtree-dir=$(LANG_RTREE_DIR) --neighborhoods-rtree-dir=$(NEIGHBORHOODS_RTREE_DIR) --rtree-dir=$(RTREE_DIR) --quattroshapes-rtree-dir=$(QS_TREE_DIR) --geonames-db=$(GEONAMES_DB_PATH) -o $(OUT_DIR) python osm_address_training_data.py -a $(OSM_DIR)/planet-addresses.osm -f --country-rtree-dir=$(COUNTRY_RTREE_DIR) --neighborhoods-rtree-dir=$(NEIGHBORHOODS_RTREE_DIR) --rtree-dir=$(RTREE_DIR) --quattroshapes-rtree-dir=$(QS_TREE_DIR) --geonames-db=$(GEONAMES_DB_PATH) -o $(OUT_DIR)
Formatted addresses (untagged): Formatted addresses (untagged):
python osm_address_training_data.py -a $(OSM_DIR)/planet-addresses.osm -f -u --language-rtree-dir=$(LANG_RTREE_DIR) --neighborhoods-rtree-dir=$(NEIGHBORHOODS_RTREE_DIR) --rtree-dir=$(RTREE_DIR) --quattroshapes-rtree-dir=$(QS_TREE_DIR) --geonames-db=$(GEONAMES_DB_PATH) -o $(OUT_DIR) python osm_address_training_data.py -a $(OSM_DIR)/planet-addresses.osm -f -u --country-rtree-dir=$(COUNTRY_RTREE_DIR) --neighborhoods-rtree-dir=$(NEIGHBORHOODS_RTREE_DIR) --rtree-dir=$(RTREE_DIR) --quattroshapes-rtree-dir=$(QS_TREE_DIR) --geonames-db=$(GEONAMES_DB_PATH) -o $(OUT_DIR)
Intersections (after running intersections.py to create the JSON file): Intersections (after running intersections.py to create the JSON file):
python osm_address_training_data -x $(OSM_DIR)/intersections.json -f --language-rtree-dir=$(LANG_RTREE_DIR) --neighborhoods-rtree-dir=$(NEIGHBORHOODS_RTREE_DIR) --rtree-dir=$(RTREE_DIR) --quattroshapes-rtree-dir=$(QS_TREE_DIR) --geonames-db=$(GEONAMES_DB_PATH) -o $(OUT_DIR) python osm_address_training_data -x $(OSM_DIR)/intersections.json -f --country-rtree-dir=$(COUNTRY_RTREE_DIR) --neighborhoods-rtree-dir=$(NEIGHBORHOODS_RTREE_DIR) --rtree-dir=$(RTREE_DIR) --quattroshapes-rtree-dir=$(QS_TREE_DIR) --geonames-db=$(GEONAMES_DB_PATH) -o $(OUT_DIR)
Toponyms: Toponyms:
python osm_address_training_data.py -b $(OSM_DIR)/planet-borders.osm --language-rtree-dir=$(LANG_RTREE_DIR) -o $(OUT_DIR) python osm_address_training_data.py -b $(OSM_DIR)/planet-borders.osm --country-rtree-dir=$(COUNTRY_RTREE_DIR) -o $(OUT_DIR)
''' '''
import argparse import argparse
@@ -91,7 +91,7 @@ def normalize_osm_name_tag(tag, script=False):
return norm.split('_', 1)[0] return norm.split('_', 1)[0]
def get_language_names(language_rtree, key, value, tag_prefix='name'): def get_language_names(country_rtree, key, value, tag_prefix='name'):
if not ('lat' in value and 'lon' in value): if not ('lat' in value and 'lon' in value):
return None, None return None, None
@@ -104,7 +104,7 @@ def get_language_names(language_rtree, key, value, tag_prefix='name'):
except Exception: except Exception:
return None, None return None, None
country, candidate_languages, language_props = language_rtree.country_and_languages(latitude, longitude) country, candidate_languages = country_rtree.country_and_languages(latitude, longitude)
if not (country and candidate_languages): if not (country and candidate_languages):
return None, None return None, None
@@ -177,7 +177,7 @@ def get_language_names(language_rtree, key, value, tag_prefix='name'):
return country, name_language return country, name_language
def build_ways_training_data(language_rtree, infile, out_dir, abbreviate_streets=True): def build_ways_training_data(country_rtree, infile, out_dir, abbreviate_streets=True):
''' '''
Creates a training set for language classification using most OSM ways Creates a training set for language classification using most OSM ways
(streets) under a fairly lengthy osmfilter definition which attempts to (streets) under a fairly lengthy osmfilter definition which attempts to
@@ -193,7 +193,7 @@ def build_ways_training_data(language_rtree, infile, out_dir, abbreviate_streets
writer = csv.writer(f, 'tsv_no_quote') writer = csv.writer(f, 'tsv_no_quote')
for key, value, deps in parse_osm(infile, allowed_types=WAYS_RELATIONS): for key, value, deps in parse_osm(infile, allowed_types=WAYS_RELATIONS):
country, name_language = get_language_names(language_rtree, key, value, tag_prefix='name') country, name_language = get_language_names(country_rtree, key, value, tag_prefix='name')
if not name_language: if not name_language:
continue continue
@@ -242,7 +242,7 @@ POSTAL_KEYS = (
) )
def build_toponym_training_data(language_rtree, infile, out_dir): def build_toponym_training_data(country_rtree, infile, out_dir):
''' '''
Data set of toponyms by language and country which should assist Data set of toponyms by language and country which should assist
in language classification. OSM tends to use the native language in language classification. OSM tends to use the native language
@@ -268,7 +268,7 @@ def build_toponym_training_data(language_rtree, infile, out_dir):
except Exception: except Exception:
continue continue
country, candidate_languages, language_props = language_rtree.country_and_languages(latitude, longitude) country, candidate_languages = country_rtree.country_and_languages(latitude, longitude)
if not (country and candidate_languages): if not (country and candidate_languages):
continue continue
@@ -340,7 +340,7 @@ def build_toponym_training_data(language_rtree, infile, out_dir):
f.close() f.close()
def build_address_training_data(langauge_rtree, infile, out_dir, format=False): def build_address_training_data(country_rtree, infile, out_dir, format=False):
''' '''
Creates training set similar to the ways data but using addr:street tags instead. Creates training set similar to the ways data but using addr:street tags instead.
These may be slightly closer to what we'd see in real live addresses, containing These may be slightly closer to what we'd see in real live addresses, containing
@@ -354,7 +354,7 @@ def build_address_training_data(langauge_rtree, infile, out_dir, format=False):
writer = csv.writer(f, 'tsv_no_quote') writer = csv.writer(f, 'tsv_no_quote')
for key, value, deps in parse_osm(infile): for key, value, deps in parse_osm(infile):
country, street_language = get_language_names(language_rtree, key, value, tag_prefix='addr:street') country, street_language = get_language_names(country_rtree, key, value, tag_prefix='addr:street')
if not street_language: if not street_language:
continue continue
@@ -374,14 +374,14 @@ def build_address_training_data(langauge_rtree, infile, out_dir, format=False):
VENUE_LANGUAGE_DATA_FILENAME = 'names_by_language.tsv' VENUE_LANGUAGE_DATA_FILENAME = 'names_by_language.tsv'
def build_venue_training_data(language_rtree, infile, out_dir): def build_venue_training_data(country_rtree, infile, out_dir):
i = 0 i = 0
f = open(os.path.join(out_dir, VENUE_LANGUAGE_DATA_FILENAME), 'w') f = open(os.path.join(out_dir, VENUE_LANGUAGE_DATA_FILENAME), 'w')
writer = csv.writer(f, 'tsv_no_quote') writer = csv.writer(f, 'tsv_no_quote')
for key, value, deps in parse_osm(infile): for key, value, deps in parse_osm(infile):
country, name_language = get_language_names(language_rtree, key, value, tag_prefix='name') country, name_language = get_language_names(country_rtree, key, value, tag_prefix='name')
if not name_language: if not name_language:
continue continue
@@ -455,9 +455,9 @@ if __name__ == '__main__':
parser.add_argument('-x', '--intersections-file', parser.add_argument('-x', '--intersections-file',
help='Path to planet-ways-latlons.osm') help='Path to planet-ways-latlons.osm')
parser.add_argument('--language-rtree-dir', parser.add_argument('--country-rtree-dir',
required=True, required=True,
help='Language RTree directory') help='Country RTree directory')
parser.add_argument('--rtree-dir', parser.add_argument('--rtree-dir',
default=None, default=None,
@@ -493,7 +493,8 @@ if __name__ == '__main__':
args = parser.parse_args() args = parser.parse_args()
language_rtree = LanguagePolygonIndex.load(args.language_rtree_dir) country_rtree = OSMCountryReverseGeocoder.load(args.country_rtree_dir)
osm_rtree = None osm_rtree = None
if args.rtree_dir: if args.rtree_dir:
osm_rtree = OSMReverseGeocoder.load(args.rtree_dir) osm_rtree = OSMReverseGeocoder.load(args.rtree_dir)
@@ -525,11 +526,11 @@ if __name__ == '__main__':
# Can parallelize # Can parallelize
if args.streets_file: if args.streets_file:
build_ways_training_data(language_rtree, args.streets_file, args.out_dir, abbreviate_streets=not args.unabbreviated) build_ways_training_data(country_rtree, args.streets_file, args.out_dir, abbreviate_streets=not args.unabbreviated)
if args.borders_file: if args.borders_file:
build_toponym_training_data(language_rtree, args.borders_file, args.out_dir) build_toponym_training_data(country_rtree, args.borders_file, args.out_dir)
if args.venues_file: if args.venues_file:
build_venue_training_data(language_rtree, args.venues_file, args.out_dir) build_venue_training_data(country_rtree, args.venues_file, args.out_dir)
if args.address_file or args.intersections_file: if args.address_file or args.intersections_file:
if osm_rtree is None: if osm_rtree is None:
@@ -542,20 +543,20 @@ if __name__ == '__main__':
parser.error('--geonames-db required for formatted addresses') parser.error('--geonames-db required for formatted addresses')
if args.address_file and args.format: if args.address_file and args.format:
components = AddressComponents(osm_rtree, language_rtree, neighborhoods_rtree, quattroshapes_rtree, geonames) components = AddressComponents(osm_rtree, neighborhoods_rtree, quattroshapes_rtree, geonames)
osm_formatter = OSMAddressFormatter(components, subdivisions_rtree, buildings_rtree, metro_stations_index) osm_formatter = OSMAddressFormatter(components, country_rtree, subdivisions_rtree, buildings_rtree, metro_stations_index)
osm_formatter.build_training_data(args.address_file, args.out_dir, tag_components=not args.untagged) osm_formatter.build_training_data(args.address_file, args.out_dir, tag_components=not args.untagged)
if args.address_file and args.limited_addresses: if args.address_file and args.limited_addresses:
components = AddressComponents(osm_rtree, language_rtree, neighborhoods_rtree, quattroshapes_rtree, geonames) components = AddressComponents(osm_rtree, neighborhoods_rtree, quattroshapes_rtree, geonames)
osm_formatter = OSMAddressFormatter(components, subdivisions_rtree, buildings_rtree, metro_stations_index, splitter=u' ') osm_formatter = OSMAddressFormatter(components, country_rtree, subdivisions_rtree, buildings_rtree, metro_stations_index, splitter=u' ')
osm_formatter.build_limited_training_data(args.address_file, args.out_dir) osm_formatter.build_limited_training_data(args.address_file, args.out_dir)
if args.place_nodes_file and args.format: if args.place_nodes_file and args.format:
components = AddressComponents(osm_rtree, language_rtree, neighborhoods_rtree, quattroshapes_rtree, geonames) components = AddressComponents(osm_rtree, neighborhoods_rtree, quattroshapes_rtree, geonames)
osm_formatter = OSMAddressFormatter(components, subdivisions_rtree, buildings_rtree, metro_stations_index) osm_formatter = OSMAddressFormatter(components, country_rtree, subdivisions_rtree, buildings_rtree, metro_stations_index)
osm_formatter.build_place_training_data(args.place_nodes_file, args.out_dir, tag_components=not args.untagged) osm_formatter.build_place_training_data(args.place_nodes_file, args.out_dir, tag_components=not args.untagged)
if args.intersections_file and args.format: if args.intersections_file and args.format:
components = AddressComponents(osm_rtree, language_rtree, neighborhoods_rtree, quattroshapes_rtree, geonames) components = AddressComponents(osm_rtree, neighborhoods_rtree, quattroshapes_rtree, geonames)
osm_formatter = OSMAddressFormatter(components, subdivisions_rtree, buildings_rtree, metro_stations_index) osm_formatter = OSMAddressFormatter(components, country_rtree, subdivisions_rtree, buildings_rtree, metro_stations_index)
osm_formatter.build_intersections_training_data(args.intersections_file, args.out_dir, tag_components=not args.untagged) osm_formatter.build_intersections_training_data(args.intersections_file, args.out_dir, tag_components=not args.untagged)

View File

@@ -465,7 +465,7 @@ class OSMCountryReverseGeocoder(OSMReverseGeocoder):
polygon_reader = OSMCountryPolygonReader polygon_reader = OSMCountryPolygonReader
@classmethod @classmethod
def country_and_languages(cls, osm_components): def country_and_languages_from_components(cls, osm_components):
country = None country = None
for c in reversed(osm_components): for c in reversed(osm_components):
country = c.get('ISO3166-1:alpha2') country = c.get('ISO3166-1:alpha2')
@@ -501,6 +501,10 @@ class OSMCountryReverseGeocoder(OSMReverseGeocoder):
return country, default_languages return country, default_languages
def country_and_languages(self, lat, lon):
osm_components = self.point_in_poly(lat, lon, return_all=True)
return self.country_and_languages_from_components(osm_components)
if __name__ == '__main__': if __name__ == '__main__':
# Handle argument parsing here # Handle argument parsing here