[boundaries] Adding regex replacements for boundary names such as "Lyon 2e Arrondissement", where including "Lyon" is the OSM convention, but we might sometimes want just "2e Arrondissement" to appear in the training data next to "Lyon".
This commit is contained in:
@@ -694,9 +694,6 @@ class AddressComponents(object):
|
||||
include these qualifiers in the training data.
|
||||
'''
|
||||
|
||||
simple_name_key = 'name:simple'
|
||||
international_name_key = 'int_name'
|
||||
|
||||
if osm_components:
|
||||
name_key = ''.join((boundary_names.DEFAULT_NAME_KEY, language_suffix))
|
||||
raw_name_key = boundary_names.DEFAULT_NAME_KEY
|
||||
@@ -721,6 +718,9 @@ class AddressComponents(object):
|
||||
for k in (key, name_key, raw_key, raw_name_key):
|
||||
name = component_value.get(k)
|
||||
|
||||
if name:
|
||||
name = boundary_names.name(country, name)
|
||||
|
||||
if name and not (name == existing_city_name and component != AddressFormatter.CITY and drop_duplicate_city_names):
|
||||
if six.u(';') in name:
|
||||
name = random.choice(name.split(six.u(';'))).strip()
|
||||
|
||||
@@ -1,4 +1,6 @@
|
||||
import os
|
||||
import random
|
||||
import re
|
||||
import six
|
||||
import yaml
|
||||
|
||||
@@ -36,6 +38,20 @@ class BoundaryNames(object):
|
||||
component_name_keys, component_probs = alternative_probabilities(component_names)
|
||||
self.component_name_keys[component] = (component_name_keys, cdf(component_probs))
|
||||
|
||||
# Regex-based boundary name replacements, grouped by the 'country' value of
# each config entry. Entries without a 'country' are stored under the None
# key, which name() looks up in addition to the country-specific list.
self.country_regex_replacements = defaultdict(list)

for props in nested_get(config, ('names', 'regex_replacements',), default=[]):
    country = props.get('country')

    # Patterns are case-insensitive unless the entry sets case_insensitive
    # to a false value.
    re_flags = re.I | re.UNICODE
    if not props.get('case_insensitive', True):
        # Clear the IGNORECASE bit on the local flags variable. Touching
        # an attribute of the re module here would raise AttributeError.
        re_flags &= ~re.I

    pattern = re.compile(props['pattern'], re_flags)
    replace_group = props['replace_with_group']
    replace_probability = props['replace_probability']

    # Each entry is (compiled pattern, group to keep, probability of applying).
    self.country_regex_replacements[country].append((pattern, replace_group, replace_probability))

# Convert to a plain dict so that later lookups of countries with no
# replacements do not insert empty lists as a side effect.
self.country_regex_replacements = dict(self.country_regex_replacements)
|
||||
|
||||
self.exceptions = {}
|
||||
|
||||
for props in nested_get(config, ('names', 'exceptions'), default=[]):
|
||||
@@ -61,4 +77,17 @@ class BoundaryNames(object):
|
||||
name_keys, probs = self.component_name_keys.get(component, (self.name_keys, self.name_key_probs))
|
||||
return weighted_choice(name_keys, probs)
|
||||
|
||||
def name(self, country, name):
    '''
    Randomly rewrite a boundary name using the configured regex replacements.

    The replacements registered for the given country are tried first,
    followed by those registered under the None key. For each pattern that
    matches the start of the current name, the name is replaced by the
    configured match group with the configured probability. Later patterns
    see the result of earlier replacements.

    :param country: key into self.country_regex_replacements
    :param name: the boundary name to (possibly) rewrite
    :return: the rewritten name, or the input name if nothing applied
    '''
    # Nothing to match against; hand the value back untouched.
    if not name:
        return name

    replacements = self.country_regex_replacements
    all_replacements = replacements.get(country, []) + replacements.get(None, [])
    if not all_replacements:
        return name

    for regex, group, prob in all_replacements:
        match = regex.match(name)
        # random.random() is only drawn when the pattern actually matched.
        if match and random.random() < prob:
            replacement = match.group(group)
            # An optional group that did not participate yields None, and
            # an empty capture would erase the name. In both cases keep the
            # current name so the next pattern (and the caller) still get
            # a usable string.
            if replacement:
                name = replacement

    return name
|
||||
|
||||
|
||||
|
||||
# Shared BoundaryNames instance; other code in this diff uses it as
# boundary_names.name(...) and boundary_names.DEFAULT_NAME_KEY.
boundary_names = BoundaryNames()
|
||||
|
||||
Reference in New Issue
Block a user