From e6e4b28e434f4e717ee2d0b5e79b6f11d06f0bba Mon Sep 17 00:00:00 2001
From: Al <albarrentine@gmail.com>
Date: Wed, 28 Dec 2016 13:26:19 -0500
Subject: [PATCH] =?UTF-8?q?[addresses]=20making=20the=20=D0=B3=D0=BE=D1=80?=
 =?UTF-8?q?=D0=BE=D0=B4/=D0=B3.=20prefix=20apply=20to=20the=20Russian=20la?=
 =?UTF-8?q?nguage=20rather=20than=20the=20country?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 resources/boundaries/names/global.yaml  | 25 +++++++++++++------------
 scripts/geodata/addresses/components.py |  5 ++++-
 scripts/geodata/boundaries/names.py     | 24 ++++++++++++------------
 3 files changed, 29 insertions(+), 25 deletions(-)

diff --git a/resources/boundaries/names/global.yaml b/resources/boundaries/names/global.yaml
index 81bee822..2bea2dca 100644
--- a/resources/boundaries/names/global.yaml
+++ b/resources/boundaries/names/global.yaml
@@ -36,18 +36,19 @@ names:
           replace_probability: 0.5
 
     prefixes:
-        ru:
-            city:
-                default:
-                    prefix: г.
-                probability: 0.35
-                alternatives:
-                    - alternative:
-                          prefix: г
-                      probability: 0.1
-                    - alternative:
-                          prefix: город
-                      probability: 0.05
+        language:
+            ru:
+                city:
+                    default:
+                        prefix: г.
+                    probability: 0.35
+                    alternatives:
+                        - alternative:
+                              prefix: г
+                          probability: 0.1
+                        - alternative:
+                              prefix: город
+                          probability: 0.05
 
     # This section overrides place names
     exceptions:
diff --git a/scripts/geodata/addresses/components.py b/scripts/geodata/addresses/components.py
index 8d7d7e67..a62aa114 100644
--- a/scripts/geodata/addresses/components.py
+++ b/scripts/geodata/addresses/components.py
@@ -962,6 +962,8 @@ class AddressComponents(object):
         include these qualifiers in the training data.
         '''
 
+        suffix_lang = None if not language_suffix else language_suffix.lstrip(':')
+
         if osm_components:
             name_key = ''.join((boundary_names.DEFAULT_NAME_KEY, language_suffix))
             raw_name_key = boundary_names.DEFAULT_NAME_KEY
@@ -1018,7 +1020,8 @@ class AddressComponents(object):
                         name = component_value.get(k)
 
                         if name:
-                            name = boundary_names.name(country, component, name)
+                            name_lang = language if not suffix_lang or not k.endswith(language_suffix) else suffix_lang
+                            name = boundary_names.name(country, name_lang, component, name)
 
                         if name and not (name == existing_city_name and component != AddressFormatter.CITY and drop_duplicate_city_names):
                             name = self.cleaned_name(name, first_comma_delimited_phrase=True)
diff --git a/scripts/geodata/boundaries/names.py b/scripts/geodata/boundaries/names.py
index 6b560942..d8fa7326 100644
--- a/scripts/geodata/boundaries/names.py
+++ b/scripts/geodata/boundaries/names.py
@@ -59,39 +59,39 @@ class BoundaryNames(object):
         self.suffixes = {}
         self.suffix_regexes = {}
 
-        for country, components in six.iteritems(nested_get(config, ('names', 'prefixes',), default={}) ):
+        for language, components in six.iteritems(nested_get(config, ('names', 'prefixes', 'language'), default={}) ):
             for component, affixes in six.iteritems(components):
                 affix_values, probs = alternative_probabilities(affixes)
 
                 for val in affix_values:
                     if 'prefix' not in val:
-                        raise AssertionError(six.u('Invalid prefix value for (country={}, component={}): {} ').format(country, component, val))
+                        raise AssertionError(six.u('Invalid prefix value for (language={}, component={}): {} ').format(language, component, val))
 
                 prefix_regex = six.u('|').join([six.u('(?:{} )').format(self._string_as_regex(v['prefix'])) if v.get('whitespace') else self._string_as_regex(v['prefix']) for v in affix_values])
-                self.prefix_regexes[(country, component)] = re.compile(six.u('^{}').format(prefix_regex), re.I | re.U)
+                self.prefix_regexes[(language, component)] = re.compile(six.u('^{}').format(prefix_regex), re.I | re.U)
 
                 if not isclose(sum(probs), 1.0):
                     affix_values.append(None)
                     probs.append(1.0 - sum(probs))
                 affix_probs_cdf = cdf(probs)
-                self.prefixes[(country, component)] = affix_values, affix_probs_cdf
+                self.prefixes[(language, component)] = affix_values, affix_probs_cdf
 
-        for country, components in six.iteritems(nested_get(config, ('names', 'suffixes',), default={}) ):
+        for language, components in six.iteritems(nested_get(config, ('names', 'suffixes', 'language'), default={}) ):
             for component, affixes in six.iteritems(components):
                 affix_values, probs = alternative_probabilities(affixes)
 
                 for val in affix_values:
                     if 'suffix' not in val:
-                        raise AssertionError(six.u('Invalid suffix value for (country={}, component={}): {} ').format(country, component, val))
+                        raise AssertionError(six.u('Invalid suffix value for (language={}, component={}): {} ').format(language, component, val))
 
                 suffix_regex = six.u('|').join([six.u('(?: {})').format(self._string_as_regex(v['suffix'])) if v.get('whitespace') else self._string_as_regex(v['suffix']) for v in affix_values])
-                self.suffix_regexes[(country, component)] = re.compile(six.u('{}$').format(suffix_regex), re.I | re.U)
+                self.suffix_regexes[(language, component)] = re.compile(six.u('{}$').format(suffix_regex), re.I | re.U)
 
                 if not isclose(sum(probs), 1.0):
                     affix_values.append(None)
                     probs.append(1.0 - sum(probs))
                 affix_probs_cdf = cdf(probs)
-                self.suffixes[(country, component)] = affix_values, affix_probs_cdf
+                self.suffixes[(language, component)] = affix_values, affix_probs_cdf
 
         self.exceptions = {}
 
@@ -125,11 +125,11 @@ class BoundaryNames(object):
         name_keys, probs = self.name_key_dist(props, component)
         return weighted_choice(name_keys, probs)
 
-    def name(self, country, component, name):
+    def name(self, country, language, component, name):
         all_replacements = self.country_regex_replacements.get(country, []) + self.country_regex_replacements.get(None, [])
 
-        prefixes, prefix_probs = self.prefixes.get((country, component), (None, None))
-        suffixes, suffix_probs = self.suffixes.get((country, component), (None, None))
+        prefixes, prefix_probs = self.prefixes.get((language, component), (None, None))
+        suffixes, suffix_probs = self.suffixes.get((language, component), (None, None))
 
         if not all_replacements and not prefixes and not suffixes:
             return name
@@ -142,7 +142,7 @@ class BoundaryNames(object):
         for affixes, affix_probs, regexes, key, direction in ((prefixes, prefix_probs, self.prefix_regexes, 'prefix', 0),
                                                               (suffixes, suffix_probs, self.suffix_regexes, 'suffix', 1)):
             if affixes is not None:
-                regex = regexes[country, component]
+                regex = regexes[language, component]
                 if regex.match(name):
                     continue