diff --git a/resources/boundaries/names/global.yaml b/resources/boundaries/names/global.yaml index a741f1f2..2b24e209 100644 --- a/resources/boundaries/names/global.yaml +++ b/resources/boundaries/names/global.yaml @@ -29,6 +29,12 @@ names: - alternative: official_name # e.g. United Kingdom of Great Britain and Northern Ireland probability: 0.01 + regex_replacements: + - country: fr + pattern: "(?:lyon|paris|marseilles) ([\\d]+er? arrondissement)" + replace_with_group: 1 + replace_probability: 0.5 + # This section overrides place names exceptions: # Boroughs of New York City diff --git a/scripts/geodata/addresses/components.py b/scripts/geodata/addresses/components.py index a27f8e7e..18d135e4 100644 --- a/scripts/geodata/addresses/components.py +++ b/scripts/geodata/addresses/components.py @@ -694,9 +694,6 @@ class AddressComponents(object): include these qualifiers in the training data. ''' - simple_name_key = 'name:simple' - international_name_key = 'int_name' - if osm_components: name_key = ''.join((boundary_names.DEFAULT_NAME_KEY, language_suffix)) raw_name_key = boundary_names.DEFAULT_NAME_KEY @@ -721,6 +718,9 @@ class AddressComponents(object): for k in (key, name_key, raw_key, raw_name_key): name = component_value.get(k) + if name: + name = boundary_names.name(country, name) + if name and not (name == existing_city_name and component != AddressFormatter.CITY and drop_duplicate_city_names): if six.u(';') in name: name = random.choice(name.split(six.u(';'))).strip() diff --git a/scripts/geodata/boundaries/names.py b/scripts/geodata/boundaries/names.py index f337b246..e7401594 100644 --- a/scripts/geodata/boundaries/names.py +++ b/scripts/geodata/boundaries/names.py @@ -1,4 +1,6 @@ import os +import random +import re import six import yaml @@ -36,6 +38,20 @@ class BoundaryNames(object): component_name_keys, component_probs = alternative_probabilities(component_names) self.component_name_keys[component] = (component_name_keys, cdf(component_probs)) + 
+        self.country_regex_replacements = defaultdict(list)
+        for props in nested_get(config, ('names', 'regex_replacements',), default=[]):
+            country = props.get('country')
+            re_flags = re.I | re.UNICODE
+            if not props.get('case_insensitive', True):
+                re_flags &= ~re.I  # clear IGNORECASE for case-sensitive patterns
+
+            pattern = re.compile(props['pattern'], re_flags)
+            replace_group = props['replace_with_group']
+            replace_probability = props['replace_probability']
+            self.country_regex_replacements[country].append((pattern, replace_group, replace_probability))
+
+        self.country_regex_replacements = dict(self.country_regex_replacements)
+
         self.exceptions = {}
 
         for props in nested_get(config, ('names', 'exceptions'), default=[]):
@@ -61,4 +77,33 @@ class BoundaryNames(object):
         name_keys, probs = self.component_name_keys.get(component, (self.name_keys, self.name_key_probs))
         return weighted_choice(name_keys, probs)
 
+    def name(self, country, name):
+        '''
+        Apply the configured regex replacements to a boundary name.
+
+        Rules registered for the given country are tried first, followed
+        by rules registered without a country. Each pattern is matched at
+        the start of the name; on a match, the name is replaced by the
+        rule's capture group with the rule's probability, e.g.
+        "Paris 1er Arrondissement" may become "1er Arrondissement".
+        '''
+        all_replacements = list(self.country_regex_replacements.get(country, []))
+        if country is not None:
+            # Country-agnostic rules live under the None key; skip them when
+            # country is None so the same rules are not applied twice.
+            all_replacements.extend(self.country_regex_replacements.get(None, []))
+        if not all_replacements:
+            return name
+
+        for regex, group, prob in all_replacements:
+            match = regex.match(name)
+            if match and random.random() < prob:
+                replacement = match.group(group)
+                # A group that did not participate in the match yields None;
+                # keep the current name instead of discarding it.
+                if replacement:
+                    name = replacement
+        return name
+
+
 boundary_names = BoundaryNames()