From 48755ec218b7523be5cf8a507d99688b67b1d3c8 Mon Sep 17 00:00:00 2001
From: Al <albarrentine@gmail.com>
Date: Thu, 11 Aug 2016 13:09:08 -0400
Subject: [PATCH] [boundaries] Adding regex replacements for boundary names
 such as Lyon 2e Arrondissement where putting Lyon is the OSM convention but
 we might sometimes want just 2e Arrondissement to appear in the training data
 next to Lyon

---
 resources/boundaries/names/global.yaml  |  6 +++++
 scripts/geodata/addresses/components.py |  6 ++---
 scripts/geodata/boundaries/names.py     | 29 +++++++++++++++++++++++++
 3 files changed, 38 insertions(+), 3 deletions(-)

diff --git a/resources/boundaries/names/global.yaml b/resources/boundaries/names/global.yaml
index a741f1f2..2b24e209 100644
--- a/resources/boundaries/names/global.yaml
+++ b/resources/boundaries/names/global.yaml
@@ -29,6 +29,12 @@ names:
                     - alternative: official_name # e.g. United Kingdom of Great Britain and Northern Ireland
                       probability: 0.01
 
+    regex_replacements:
+        - country: fr  # rule is only applied to names looked up for this country
+          pattern: "(?:lyon|paris|marseilles?) ([\\d]+er? arrondissement)"  # "s" optional: the French spelling is Marseille
+          replace_with_group: 1  # keep only the "<N>e(r) arrondissement" part
+          replace_probability: 0.5
+
     # This section overrides place names
     exceptions:
         # Boroughs of New York City
diff --git a/scripts/geodata/addresses/components.py b/scripts/geodata/addresses/components.py
index a27f8e7e..18d135e4 100644
--- a/scripts/geodata/addresses/components.py
+++ b/scripts/geodata/addresses/components.py
@@ -694,9 +694,6 @@ class AddressComponents(object):
         include these qualifiers in the training data.
         '''
 
-        simple_name_key = 'name:simple'
-        international_name_key = 'int_name'
-
         if osm_components:
             name_key = ''.join((boundary_names.DEFAULT_NAME_KEY, language_suffix))
             raw_name_key = boundary_names.DEFAULT_NAME_KEY
@@ -721,6 +718,9 @@ class AddressComponents(object):
                     for k in (key, name_key, raw_key, raw_name_key):
                         name = component_value.get(k)
 
+                        if name:
+                            name = boundary_names.name(country, name)
+
                         if name and not (name == existing_city_name and component != AddressFormatter.CITY and drop_duplicate_city_names):
                             if six.u(';') in name:
                                 name = random.choice(name.split(six.u(';'))).strip()
diff --git a/scripts/geodata/boundaries/names.py b/scripts/geodata/boundaries/names.py
index f337b246..e7401594 100644
--- a/scripts/geodata/boundaries/names.py
+++ b/scripts/geodata/boundaries/names.py
@@ -1,4 +1,6 @@
 import os
+import random
+import re
 import six
 import yaml
 
@@ -36,6 +38,20 @@ class BoundaryNames(object):
             component_name_keys, component_probs = alternative_probabilities(component_names)
             self.component_name_keys[component] = (component_name_keys, cdf(component_probs))
 
+        self.country_regex_replacements = defaultdict(list)
+        for props in nested_get(config, ('names', 'regex_replacements',), default=[]):
+            country = props.get('country')  # rules without a country are stored under the None key
+            re_flags = re.I | re.UNICODE
+            if not props.get('case_insensitive', True):
+                re_flags &= ~re.I  # fixed: "re.flags ^= re.I" raised AttributeError on the re module
+
+            pattern = re.compile(props['pattern'], re_flags)  # compiled once here, reused on every lookup
+            replace_group = props['replace_with_group']
+            replace_probability = props['replace_probability']
+            self.country_regex_replacements[country].append((pattern, replace_group, replace_probability))
+
+        self.country_regex_replacements = dict(self.country_regex_replacements)
+
         self.exceptions = {}
 
         for props in nested_get(config, ('names', 'exceptions'), default=[]):
@@ -61,4 +77,17 @@ class BoundaryNames(object):
         name_keys, probs = self.component_name_keys.get(component, (self.name_keys, self.name_key_probs))
         return weighted_choice(name_keys, probs)
 
+    def name(self, country, name):
+        '''Return name, possibly rewritten by the regex replacements configured for country.'''
+        by_country = self.country_regex_replacements
+        candidates = by_country.get(country, []) + by_country.get(None, [])  # country rules first, then None-key rules
+        for pattern, group_index, probability in candidates:
+            matched = pattern.match(name)
+            # The random draw only happens after a successful match.
+            if matched and random.random() < probability:
+                name = matched.group(group_index)
+        return name
+
+
+
 boundary_names = BoundaryNames()