[boundaries] Adding regex replacements for boundary names such as "Lyon 2e Arrondissement", where including "Lyon" is the OSM convention, but where we might sometimes want just "2e Arrondissement" to appear in the training data next to Lyon

This commit is contained in:
Al
2016-08-11 13:09:08 -04:00
parent 10a41309b8
commit 48755ec218
3 changed files with 38 additions and 3 deletions

View File

@@ -694,9 +694,6 @@ class AddressComponents(object):
include these qualifiers in the training data.
'''
simple_name_key = 'name:simple'
international_name_key = 'int_name'
if osm_components:
name_key = ''.join((boundary_names.DEFAULT_NAME_KEY, language_suffix))
raw_name_key = boundary_names.DEFAULT_NAME_KEY
@@ -721,6 +718,9 @@ class AddressComponents(object):
for k in (key, name_key, raw_key, raw_name_key):
name = component_value.get(k)
if name:
name = boundary_names.name(country, name)
if name and not (name == existing_city_name and component != AddressFormatter.CITY and drop_duplicate_city_names):
if six.u(';') in name:
name = random.choice(name.split(six.u(';'))).strip()

View File

@@ -1,4 +1,6 @@
import os
import random
import re
import six
import yaml
@@ -36,6 +38,20 @@ class BoundaryNames(object):
component_name_keys, component_probs = alternative_probabilities(component_names)
self.component_name_keys[component] = (component_name_keys, cdf(component_probs))
# Regex-based name replacements grouped by country. Entries whose config
# has no 'country' key are stored under the None key.
self.country_regex_replacements = defaultdict(list)
for props in nested_get(config, ('names', 'regex_replacements',), default=[]):
    country = props.get('country')

    # Patterns are case-insensitive unless the config explicitly sets
    # case_insensitive to a false value. Build the flags on the local
    # variable (the previous code tried to modify an attribute of the
    # re module instead, so the option could never take effect).
    re_flags = re.UNICODE
    if props.get('case_insensitive', True):
        re_flags |= re.I

    pattern = re.compile(props['pattern'], re_flags)
    replace_group = props['replace_with_group']
    replace_probability = props['replace_probability']

    self.country_regex_replacements[country].append((pattern, replace_group, replace_probability))

# Convert to a plain dict so missing countries are not auto-created on item access.
self.country_regex_replacements = dict(self.country_regex_replacements)
self.exceptions = {}
for props in nested_get(config, ('names', 'exceptions'), default=[]):
@@ -61,4 +77,17 @@ class BoundaryNames(object):
name_keys, probs = self.component_name_keys.get(component, (self.name_keys, self.name_key_probs))
return weighted_choice(name_keys, probs)
def name(self, country, name):
    '''
    Apply the configured regex replacements to a boundary name.

    Replacements registered for the given country are tried first, followed
    by the replacements registered under the None key (config entries with
    no country). Each pattern is matched at the start of the current name
    (re.match). When it matches, the name is replaced by the configured
    capture group with that replacement's configured probability.

    :param country: key into country_regex_replacements; may have no entry
    :param name: the boundary name to transform
    :return: the transformed name, or the input unchanged when nothing
             matched or no replacement was selected
    '''
    replacements = list(self.country_regex_replacements.get(country, []))
    # Only append the global list when it is distinct from the country list,
    # otherwise country=None would apply every global replacement twice.
    if country is not None:
        replacements.extend(self.country_regex_replacements.get(None, []))

    for regex, group, prob in replacements:
        match = regex.match(name)
        if match and random.random() < prob:
            replacement = match.group(group)
            # A group that did not participate in the match yields None.
            # Keep the current name in that case so later patterns (and the
            # caller) never receive None.
            if replacement is not None:
                name = replacement

    return name
# Module-level BoundaryNames instance, constructed once at import time.
boundary_names = BoundaryNames()