[osm] hyphenating and de-hyphenating place names in places training data

This commit is contained in:
Al
2016-10-19 00:33:10 -04:00
parent 562caba31c
commit 98ac232eea

View File

@@ -512,6 +512,7 @@ class OSMAddressFormatter(object):
for i, c in enumerate(osm_components): for i, c in enumerate(osm_components):
c_name = osm_address_components.component_from_properties(country, c, containing=containing_ids[i + 1:]) c_name = osm_address_components.component_from_properties(country, c, containing=containing_ids[i + 1:])
c_index = self.boundary_component_priorities.get(c_name, -1) c_index = self.boundary_component_priorities.get(c_name, -1)
if c_index >= component_index and (c['type'], c['id']) != (tags.get('type', 'node'), tags.get('id')): if c_index >= component_index and (c['type'], c['id']) != (tags.get('type', 'node'), tags.get('id')):
revised_osm_components.append(c) revised_osm_components.append(c)
@@ -555,6 +556,8 @@ class OSMAddressFormatter(object):
cldr_country_prob = float(nested_get(self.config, ('places', 'cldr_country_probability'), default=0.0)) cldr_country_prob = float(nested_get(self.config, ('places', 'cldr_country_probability'), default=0.0))
cldr_country_prob = float(nested_get(self.config, ('places', 'cldr_country_probability'), default=0.0))
for name_tag in ('name', 'alt_name', 'loc_name', 'short_name', 'int_name', 'name:simple', 'official_name'): for name_tag in ('name', 'alt_name', 'loc_name', 'short_name', 'int_name', 'name:simple', 'official_name'):
if more_than_one_official_language: if more_than_one_official_language:
name = tags.get(name_tag) name = tags.get(name_tag)
@@ -569,14 +572,29 @@ class OSMAddressFormatter(object):
if six.u('|') in name: if six.u('|') in name:
name = name.replace(six.u('|'), six.u('')) name = name.replace(six.u('|'), six.u(''))
name = self.components.strip_whitespace_and_hyphens(name)
sans_hyphens = self.components.dehyphenate_multiword_name(name)
with_hyphens = self.components.hyphenate_multiword_name(name)
for i in xrange(num_references if name_tag == 'name' else 1): for i in xrange(num_references if name_tag == 'name' else 1):
address_components = {component_name: name.strip()} address_components = {component_name: name}
self.components.add_admin_boundaries(address_components, osm_components, country, UNKNOWN_LANGUAGE, self.components.add_admin_boundaries(address_components, osm_components, country, UNKNOWN_LANGUAGE,
random_key=num_references > 1, random_key=num_references > 1,
language_suffix=language_suffix, language_suffix=language_suffix,
drop_duplicate_city_names=False) drop_duplicate_city_names=False)
place_tags.append((address_components, None, True)) place_tags.append((address_components, None, True))
if sans_hyphens != name:
address_components = address_components.copy()
address_components[component_name] = sans_hyphens
place_tags.append((address_components, None, True))
if with_hyphens != name:
address_components = address_components.copy()
address_components[component_name] = with_hyphens
place_tags.append((address_components, None, True))
for language, is_default in local_languages: for language, is_default in local_languages:
if is_default and not more_than_one_official_language: if is_default and not more_than_one_official_language:
@@ -604,8 +622,13 @@ class OSMAddressFormatter(object):
else: else:
n = num_references / 2 n = num_references / 2
name = self.components.strip_whitespace_and_hyphens(name)
sans_hyphens = self.components.dehyphenate_multiword_name(name)
with_hyphens = self.components.hyphenate_multiword_name(name)
for i in xrange(n): for i in xrange(n):
address_components = {component_name: name.strip()} address_components = {component_name: name}
self.components.add_admin_boundaries(address_components, osm_components, country, language, self.components.add_admin_boundaries(address_components, osm_components, country, language,
random_key=is_default, random_key=is_default,
language_suffix=language_suffix, language_suffix=language_suffix,
@@ -613,6 +636,16 @@ class OSMAddressFormatter(object):
place_tags.append((address_components, language, is_default)) place_tags.append((address_components, language, is_default))
if sans_hyphens != name:
address_components = address_components.copy()
address_components[component_name] = sans_hyphens
place_tags.append((address_components, language, is_default))
if with_hyphens != name:
address_components = address_components.copy()
address_components[component_name] = with_hyphens
place_tags.append((address_components, language, is_default))
for language in random_languages - all_local_languages: for language in random_languages - all_local_languages:
language_suffix = ':{}'.format(language) language_suffix = ':{}'.format(language)
@@ -631,9 +664,14 @@ class OSMAddressFormatter(object):
if six.u('|') in name: if six.u('|') in name:
name = name.replace(six.u('|'), six.u('')) name = name.replace(six.u('|'), six.u(''))
name = self.components.strip_whitespace_and_hyphens(name)
sans_hyphens = self.components.dehyphenate_multiword_name(name)
with_hyphens = self.components.hyphenate_multiword_name(name)
# Add half as many English records as the local language, every other language gets min_references / 2 # Add half as many English records as the local language, every other language gets min_references / 2
for i in xrange(num_references / 2 if language == ENGLISH else min_references / 2): for i in xrange(num_references / 2 if language == ENGLISH else min_references / 2):
address_components = {component_name: name.strip()} address_components = {component_name: name}
self.components.add_admin_boundaries(address_components, osm_components, country, language, self.components.add_admin_boundaries(address_components, osm_components, country, language,
random_key=False, random_key=False,
non_local_language=language, non_local_language=language,
@@ -642,6 +680,16 @@ class OSMAddressFormatter(object):
place_tags.append((address_components, language, False)) place_tags.append((address_components, language, False))
if sans_hyphens != name:
address_components = address_components.copy()
address_components[component_name] = sans_hyphens
place_tags.append((address_components, language, False))
if with_hyphens != name:
address_components = address_components.copy()
address_components[component_name] = with_hyphens
place_tags.append((address_components, language, False))
if postal_codes: if postal_codes:
extra_place_tags = [] extra_place_tags = []
num_existing_place_tags = len(place_tags) num_existing_place_tags = len(place_tags)