From 48755ec218b7523be5cf8a507d99688b67b1d3c8 Mon Sep 17 00:00:00 2001
From: Al
Date: Thu, 11 Aug 2016 13:09:08 -0400
Subject: [PATCH] [boundaries] Adding regex replacements for boundary names
 such as Lyon 2e Arrondissement where putting Lyon is the OSM convention but
 we might sometimes want just 2e Arrondissement to appear in the training
 data next to Lyon

---
Review notes (ignored by git-am):
 * names.py: the case-sensitivity opt-out wrote to `re.flags`, which is not an
   attribute of the `re` module and raised AttributeError whenever a rule set
   `case_insensitive: false`. It now clears the bit on the local `re_flags`.
 * global.yaml: the pattern only accepted the English spelling "marseilles";
   it now also accepts "marseille" (`marseilles?`).
 * NOTE(review): line breaks and indentation were reconstructed from a
   whitespace-collapsed copy of this patch. Verify context-line indentation
   against the tree (or apply with --ignore-whitespace). The post-image blob
   hashes on the `index` lines predate the two fixes above.

 resources/boundaries/names/global.yaml  |  6 +++++
 scripts/geodata/addresses/components.py |  6 ++---
 scripts/geodata/boundaries/names.py     | 29 +++++++++++++++++++++++++
 3 files changed, 38 insertions(+), 3 deletions(-)

diff --git a/resources/boundaries/names/global.yaml b/resources/boundaries/names/global.yaml
index a741f1f2..2b24e209 100644
--- a/resources/boundaries/names/global.yaml
+++ b/resources/boundaries/names/global.yaml
@@ -29,6 +29,12 @@ names:
             - alternative: official_name # e.g. United Kingdom of Great Britain and Northern Ireland
               probability: 0.01
 
+    regex_replacements:
+        - country: fr
+          pattern: "(?:lyon|paris|marseilles?) ([\\d]+er? arrondissement)"
+          replace_with_group: 1
+          replace_probability: 0.5
+
     # This section overrides place names
     exceptions:
         # Boroughs of New York City
diff --git a/scripts/geodata/addresses/components.py b/scripts/geodata/addresses/components.py
index a27f8e7e..18d135e4 100644
--- a/scripts/geodata/addresses/components.py
+++ b/scripts/geodata/addresses/components.py
@@ -694,9 +694,6 @@ class AddressComponents(object):
         include these qualifiers in the training data.
         '''
 
-        simple_name_key = 'name:simple'
-        international_name_key = 'int_name'
-
         if osm_components:
             name_key = ''.join((boundary_names.DEFAULT_NAME_KEY, language_suffix))
             raw_name_key = boundary_names.DEFAULT_NAME_KEY
@@ -721,6 +718,9 @@ class AddressComponents(object):
                     for k in (key, name_key, raw_key, raw_name_key):
                         name = component_value.get(k)
 
+                        if name:
+                            name = boundary_names.name(country, name)
+
                         if name and not (name == existing_city_name and component != AddressFormatter.CITY and drop_duplicate_city_names):
                             if six.u(';') in name:
                                 name = random.choice(name.split(six.u(';'))).strip()
diff --git a/scripts/geodata/boundaries/names.py b/scripts/geodata/boundaries/names.py
index f337b246..e7401594 100644
--- a/scripts/geodata/boundaries/names.py
+++ b/scripts/geodata/boundaries/names.py
@@ -1,4 +1,6 @@
 import os
+import random
+import re
 import six
 import yaml
 
@@ -36,6 +38,20 @@ class BoundaryNames(object):
             component_name_keys, component_probs = alternative_probabilities(component_names)
             self.component_name_keys[component] = (component_name_keys, cdf(component_probs))
 
+        self.country_regex_replacements = defaultdict(list)
+        for props in nested_get(config, ('names', 'regex_replacements',), default=[]):
+            country = props.get('country')
+            re_flags = re.I | re.UNICODE
+            if not props.get('case_insensitive', True):
+                re_flags &= ~re.I
+
+            pattern = re.compile(props['pattern'], re_flags)
+            replace_group = props['replace_with_group']
+            replace_probability = props['replace_probability']
+            self.country_regex_replacements[country].append((pattern, replace_group, replace_probability))
+
+        self.country_regex_replacements = dict(self.country_regex_replacements)
+
         self.exceptions = {}
 
         for props in nested_get(config, ('names', 'exceptions'), default=[]):
@@ -61,4 +77,17 @@ class BoundaryNames(object):
         name_keys, probs = self.component_name_keys.get(component, (self.name_keys, self.name_key_probs))
         return weighted_choice(name_keys, probs)
 
+    def name(self, country, name):
+        all_replacements = self.country_regex_replacements.get(country, []) + self.country_regex_replacements.get(None, [])
+        if not all_replacements:
+            return name
+
+        for regex, group, prob in all_replacements:
+            match = regex.match(name)
+            if match and random.random() < prob:
+                name = match.group(group)
+        return name
+
+
+
 boundary_names = BoundaryNames()