[osm] Adding place training set. Every place in OSM, even nodes, will get population / 10000 + 1 simple place queries (e.g. city + state) included in the training set, even if there are no OSM addresses for that city. Where postcodes are available, they'll also be added to the training examples.

This commit is contained in:
Al
2016-07-24 20:09:56 -04:00
parent 39c193d52d
commit 09b77b52a6

View File

@@ -41,9 +41,11 @@ from geodata.file_utils import *
# Path to the default OSM parser data-set config (osm.yaml), built by walking
# three directories up from this_dir into resources/parser/data_sets.
OSM_PARSER_DATA_DEFAULT_CONFIG = os.path.join(this_dir, os.pardir, os.pardir, os.pardir,
'resources', 'parser', 'data_sets', 'osm.yaml')
# Names of the formatted-address output files; they are opened for writing
# inside the caller-supplied out_dir.
# NOTE(review): the ADDRESS_FORMAT_* and FORMATTED_ADDRESS_* constants hold
# identical values -- presumably the former are the pre-rename spellings;
# confirm before relying on either set.
ADDRESS_FORMAT_DATA_TAGGED_FILENAME = 'formatted_addresses_tagged.tsv'
ADDRESS_FORMAT_DATA_FILENAME = 'formatted_addresses.tsv'
ADDRESS_FORMAT_DATA_LANGUAGE_FILENAME = 'formatted_addresses_by_language.tsv'
FORMATTED_ADDRESS_DATA_TAGGED_FILENAME = 'formatted_addresses_tagged.tsv'
FORMATTED_ADDRESS_DATA_FILENAME = 'formatted_addresses.tsv'
FORMATTED_ADDRESS_DATA_LANGUAGE_FILENAME = 'formatted_addresses_by_language.tsv'
# Names of the place training-set output files (tagged and untagged variants),
# written by build_place_training_data.
FORMATTED_PLACE_DATA_TAGGED_FILENAME = 'formatted_places_tagged.tsv'
FORMATTED_PLACE_DATA_FILENAME = 'formatted_places.tsv'
# Names of the intersection output files.
# NOTE(review): presumably written by build_intersections_training_data; the
# code that opens them is not visible here.
INTERSECTIONS_FILENAME = 'intersections.tsv'
INTERSECTIONS_TAGGED_FILENAME = 'intersections_tagged.tsv'
@@ -129,6 +131,8 @@ class OSMAddressFormatter(object):
}
}
boundary_component_priorities = {k: i for i, k in enumerate(AddressFormatter.BOUNDARY_COMPONENTS_ORDERED)}
def __init__(self, components, subdivisions_rtree=None, buildings_rtree=None):
# Instance of AddressComponents, contains structures for reverse geocoding, etc.
self.components = components
@@ -357,6 +361,102 @@ class OSMAddressFormatter(object):
formatted_addresses.append(formatted_address)
return formatted_addresses
def node_place_tags(self, tags):
    '''
    Build simple place-name training examples for one OSM place.

    For each of the name tags ('name', 'alt_name', 'loc_name', 'short_name')
    a components dict is created for the untranslated name and for every
    language-suffixed variant present in `tags`, then handed to
    add_admin_boundaries and normalize_place_names. If the place has a
    postcode tag, one of its parsed postcodes is attached to every example.

    tags: dict of OSM tags. Reads 'lat', 'lon', 'population', the name tags
          above and, when filtering containing boundaries, 'type' and 'id'.

    Returns a 4-tuple (place_tags, num_references, country, language):
        place_tags     -- list of (address_components, language, is_default);
                          language is None for the untranslated name in
                          places with more than one default language
        num_references -- population // 10000 + 1
        country        -- country reported by the language rtree
        language       -- the language iterated last; callers should use the
                          per-entry language from place_tags instead

    When the coordinates cannot be parsed or no country/languages are found,
    ([], 0, None, None) is returned so that callers can always unpack the
    result.
    '''
    try:
        latitude, longitude = latlon_to_decimal(tags['lat'], tags['lon'])
    except Exception:
        # Missing or unparseable coordinates: the place cannot be located.
        return [], 0, None, None

    country, candidate_languages, _ = self.language_rtree.country_and_languages(latitude, longitude)
    if not (country and candidate_languages):
        return [], 0, None, None

    local_languages = [(l['lang'], int(l['default'])) for l in candidate_languages]
    all_local_languages = set(lang for lang, _ in local_languages)
    random_languages = set(INTERNET_LANGUAGE_DISTRIBUTION)

    more_than_one_official_language = sum(1 for _, is_default in local_languages if is_default) > 1

    osm_components = self.components.osm_reverse_geocoded_components(latitude, longitude)
    containing_ids = [(b['type'], b['id']) for b in osm_components]

    component_name = osm_address_components.component_from_properties(country, tags, containing=containing_ids)
    component_index = self.boundary_component_priorities.get(component_name)

    # NOTE(review): a priority of 0 is falsy and therefore skips this filter,
    # exactly like an unknown component does -- confirm that is intended.
    if component_index:
        # Keep only containing boundaries ranked at or above this place's own
        # component type, and never the place itself.
        place_id = (tags['type'], tags['id'])
        osm_components = [
            c for i, c in enumerate(osm_components)
            if self.boundary_component_priorities.get(
                osm_address_components.component_from_properties(country, c, containing=containing_ids[i:]),
                -1) >= component_index
            and (c['type'], c['id']) != place_id
        ]

    # Do addr:postcode, postcode, postal_code, etc.
    revised_tags = self.normalize_address_components(tags)

    place_tags = []

    # Always bound, so the postcode step below is safe without a postcode tag.
    postal_codes = []
    postal_code = revised_tags.get(AddressFormatter.POSTCODE, None)
    if postal_code:
        postal_codes = parse_osm_number_range(postal_code, parse_letter_range=False)

    try:
        population = int(tags.get('population', 0))
    except (ValueError, TypeError):
        population = 0

    # Floor division keeps this an integer count on Python 2 and Python 3.
    num_references = population // 10000 + 1

    # Bound up front so the final return never hits an undefined name.
    language = None

    for name_tag in ('name', 'alt_name', 'loc_name', 'short_name'):
        if more_than_one_official_language:
            # The untranslated name cannot be attributed to one language.
            name = tags.get(name_tag)
            language_suffix = None

            if name and name.strip():
                address_components = {component_name: name.strip()}
                # Pass None explicitly rather than a loop variable that is
                # unbound (first name tag) or stale (later name tags); this
                # matches the None language recorded in the tuple below.
                self.components.add_admin_boundaries(address_components, osm_components, country, None,
                                                     language_suffix=language_suffix)
                self.components.normalize_place_names(address_components, osm_components,
                                                      country=country, languages=all_local_languages)
                place_tags.append((address_components, None, True))

        for language, is_default in local_languages:
            if is_default and not more_than_one_official_language:
                # The single default language owns the unsuffixed tag.
                language_suffix = None
                name = tags.get(name_tag)
            else:
                language_suffix = ':{}'.format(language)
                name = tags.get('{}{}'.format(name_tag, language_suffix))

            if not name or not name.strip():
                continue

            address_components = {component_name: name.strip()}
            self.components.add_admin_boundaries(address_components, osm_components, country, language,
                                                 language_suffix=language_suffix)
            self.components.normalize_place_names(address_components, osm_components,
                                                  country=country, languages=all_local_languages)
            place_tags.append((address_components, language, is_default))

        for language in random_languages - all_local_languages:
            language_suffix = ':{}'.format(language)
            name = tags.get('{}{}'.format(name_tag, language_suffix))

            if not name or not name.strip():
                continue

            address_components = {component_name: name.strip()}
            self.components.add_admin_boundaries(address_components, osm_components, country, language,
                                                 non_local_language=language,
                                                 language_suffix=language_suffix)
            self.components.normalize_place_names(address_components, osm_components,
                                                  country=country, languages=set([language]))
            place_tags.append((address_components, language, False))

    if postal_codes:
        # place_tags holds (components, language, is_default) tuples; the
        # postcode belongs in the components dict, not in the tuple.
        for address_components, _, _ in place_tags:
            address_components[AddressFormatter.POSTCODE] = random.choice(postal_codes)

    return place_tags, num_references, country, language
def category_queries(self, tags, address_components, language, country=None, tag_components=True):
formatted_addresses = []
possible_category_keys = category_config.has_keys(language, tags)
@@ -607,10 +707,10 @@ class OSMAddressFormatter(object):
i = 0
if tag_components:
formatted_tagged_file = open(os.path.join(out_dir, ADDRESS_FORMAT_DATA_TAGGED_FILENAME), 'w')
formatted_tagged_file = open(os.path.join(out_dir, FORMATTED_ADDRESS_DATA_TAGGED_FILENAME), 'w')
writer = csv.writer(formatted_tagged_file, 'tsv_no_quote')
else:
formatted_file = open(os.path.join(out_dir, ADDRESS_FORMAT_DATA_FILENAME), 'w')
formatted_file = open(os.path.join(out_dir, FORMATTED_ADDRESS_DATA_FILENAME), 'w')
writer = csv.writer(formatted_file, 'tsv_no_quote')
for node_id, value, deps in parse_osm(infile):
@@ -635,6 +735,40 @@ class OSMAddressFormatter(object):
if i % 1000 == 0 and i > 0:
print('did {} formatted addresses'.format(i))
def build_place_training_data(self, infile, out_dir, tag_components=True):
    '''
    Write the place training set for every element parsed from `infile`.

    infile:         OSM input passed straight to parse_osm
    out_dir:        directory in which the output TSV is created
    tag_components: when True, rows are (language, country, place) and go to
                    FORMATTED_PLACE_DATA_TAGGED_FILENAME; otherwise rows are
                    (place,) and go to FORMATTED_PLACE_DATA_FILENAME

    Examples flagged as default by node_place_tags are written
    num_references times each; all other examples are written once.
    '''
    if tag_components:
        filename = FORMATTED_PLACE_DATA_TAGGED_FILENAME
    else:
        filename = FORMATTED_PLACE_DATA_FILENAME

    i = 0

    # The context manager closes (and flushes) the output even if parsing
    # fails part-way through.
    with open(os.path.join(out_dir, filename), 'w') as out_file:
        writer = csv.writer(out_file, 'tsv_no_quote')

        for node_id, tags, deps in parse_osm(infile):
            result = self.node_place_tags(tags)
            if result is None:
                # No usable coordinates, country or languages for this place.
                continue
            place_tags, num_references, country, _ = result

            for address_components, language, is_default in place_tags:
                addresses = self.formatted_places(address_components, country, language)

                # Only the emitted label is replaced; formatting above still
                # receives the original (possibly None) language.
                if language is None:
                    language = UNKNOWN_LANGUAGE

                repeats = num_references if is_default else 1

                for address in addresses:
                    if not address or not address.strip():
                        continue

                    address = tsv_string(address)
                    if tag_components:
                        row = (language, country, address)
                    else:
                        row = (address, )

                    writer.writerows([row] * repeats)

            i += 1
            if i % 1000 == 0:
                print('did {} formatted places'.format(i))
def build_intersections_training_data(self, infile, out_dir, way_db_dir, tag_components=True):
'''
Intersection addresses like "4th & Main Street" are represented in OSM
@@ -750,7 +884,7 @@ class OSMAddressFormatter(object):
'''
i = 0
f = open(os.path.join(out_dir, ADDRESS_FORMAT_DATA_LANGUAGE_FILENAME), 'w')
f = open(os.path.join(out_dir, FORMATTED_ADDRESS_DATA_LANGUAGE_FILENAME), 'w')
writer = csv.writer(f, 'tsv_no_quote')
for node_id, value, deps in parse_osm(infile):