From 9386a999f67371d22b48a3795260d56c10e00864 Mon Sep 17 00:00:00 2001
From: Al <albarrentine@gmail.com>
Date: Wed, 7 Dec 2016 05:37:25 -0500
Subject: [PATCH] [names] adding country-specific affixes and only normalizing
 the word City as a suffix in UK/Ireland

---
 resources/boundaries/names/languages/en.yaml | 13 +++-
 scripts/geodata/names/normalization.py       | 76 ++++++++++++--------
 2 files changed, 58 insertions(+), 31 deletions(-)

diff --git a/resources/boundaries/names/languages/en.yaml b/resources/boundaries/names/languages/en.yaml
index 7f0cff70..c432ba64 100644
--- a/resources/boundaries/names/languages/en.yaml
+++ b/resources/boundaries/names/languages/en.yaml
@@ -18,18 +18,27 @@ prefixes:
     - free state of
     - autonomous city of
     - territorial waters of
-    - county
     - county of
+    - county
     - district of
 
 # Suffixes which can be stripped to normalize a place name
 suffixes:
     - township
     - municipality
-    - city
     - cp
     - civil parish
     - community development council
     - cdc
     - council
+    - borough council
     - city council
+
+countries:
+    gb:
+        suffixes:
+            - city
+
+    ie:
+        suffixes:
+            - city
diff --git a/scripts/geodata/names/normalization.py b/scripts/geodata/names/normalization.py
index ee397dff..0a68679f 100644
--- a/scripts/geodata/names/normalization.py
+++ b/scripts/geodata/names/normalization.py
@@ -30,61 +30,79 @@ class NameAffixes(object):
             lang = filename.rsplit('.yaml')[0]
 
             conf = yaml.load(open(os.path.join(config_dir, filename)))
+            self.add_affixes(lang, conf)
 
-            prefixes = [safe_decode(phrase).lower() for phrase in conf.get('prefixes', [])]
-            prefixes_no_whitespace = [safe_decode(phrase).lower() for phrase in conf.get('prefixes_no_whitespace', [])]
+            for country, country_conf in six.iteritems(conf.get('countries', {})):
+                # The (country, lang) regex is used *instead of* the language one, so build it from both configs
+                self.add_affixes((country, lang), conf, country_conf)
 
-            self.language_prefixes[lang] = prefixes + prefixes_no_whitespace
+    def add_affixes(self, lang, *confs):
+        prefixes = [safe_decode(phrase).lower() for conf in confs for phrase in conf.get('prefixes', [])]
+        prefixes_no_whitespace = [safe_decode(phrase).lower() for conf in confs for phrase in conf.get('prefixes_no_whitespace', [])]
 
-            suffixes = [safe_decode(phrase).lower() for phrase in conf.get('suffixes', [])]
-            suffixes_no_whitespace = [safe_decode(phrase).lower() for phrase in conf.get('suffixes_no_whitespace', [])]
+        self.language_prefixes[lang] = prefixes + prefixes_no_whitespace
 
-            self.language_suffixes[lang] = suffixes + suffixes_no_whitespace
+        suffixes = [safe_decode(phrase).lower() for conf in confs for phrase in conf.get('suffixes', [])]
+        suffixes_no_whitespace = [safe_decode(phrase).lower() for conf in confs for phrase in conf.get('suffixes_no_whitespace', [])]
 
-            whitespace_phrase = six.u('[ \-]')
+        self.language_suffixes[lang] = suffixes + suffixes_no_whitespace
 
-            all_prefixes = [six.u('{}{}').format(s, whitespace_phrase) for s in prefixes] + prefixes_no_whitespace
-            all_suffixes = [six.u('{}{}').format(whitespace_phrase, s) for s in suffixes] + suffixes_no_whitespace
+        whitespace_phrase = six.u('[ \-]')
 
-            if all_prefixes:
-                prefix_regex = six.u('^(?:{})').format(six.u('|').join(all_prefixes))
-                self.language_prefix_regexes[lang] = re.compile(prefix_regex, re.I | re.UNICODE)
+        all_prefixes = [six.u('{}{}').format(s, whitespace_phrase) for s in prefixes] + prefixes_no_whitespace
+        all_suffixes = [six.u('{}{}').format(whitespace_phrase, s) for s in suffixes] + suffixes_no_whitespace
 
-            if all_suffixes:
-                suffix_regex = six.u('(?:{})$').format(six.u('|').join(all_suffixes))
-                self.language_suffix_regexes[lang] = re.compile(suffix_regex, re.I | re.UNICODE)
+        if all_prefixes:
+            prefix_regex = six.u('^(?:{})').format(six.u('|').join(all_prefixes))
+            self.language_prefix_regexes[lang] = re.compile(prefix_regex, re.I | re.UNICODE)
 
-            sim_only_prefixes = [six.u('{}{}').format(safe_decode(phrase.lower()), whitespace_phrase) for phrase in conf.get('prefixes_similarity_only', [])]
-            if sim_only_prefixes:
-                sim_only_prefix_regex = six.u('^(?:{})').format(six.u('|').join(sim_only_prefixes + all_prefixes))
-                self.language_prefix_sim_only_regexes[lang] = re.compile(sim_only_prefix_regex, re.I | re.UNICODE)
+        if all_suffixes:
+            suffix_regex = six.u('(?:{})$').format(six.u('|').join(all_suffixes))
+            self.language_suffix_regexes[lang] = re.compile(suffix_regex, re.I | re.UNICODE)
 
-            sim_only_suffixes = [six.u('(?:{})$').format(whitespace_phrase, safe_decode(phrase.lower())) for phrase in conf.get('suffixes_similarity_only', [])]
-            if sim_only_suffixes:
-                sim_only_suffix_regex = six.u('(?:{})$').format(six.u('|').join(sim_only_suffixes + all_suffixes))
+        sim_only_prefixes = [six.u('{}{}').format(safe_decode(phrase.lower()), whitespace_phrase) for conf in confs for phrase in conf.get('prefixes_similarity_only', [])]
+        if sim_only_prefixes:
+            sim_only_prefix_regex = six.u('^(?:{})').format(six.u('|').join(sim_only_prefixes + all_prefixes))
+            self.language_prefix_sim_only_regexes[lang] = re.compile(sim_only_prefix_regex, re.I | re.UNICODE)
 
-                self.language_suffix_sim_only_regexes[lang] = re.compile(sim_only_suffix_regex, re.I | re.UNICODE)
+        sim_only_suffixes = [six.u('{}{}').format(whitespace_phrase, safe_decode(phrase.lower())) for conf in confs for phrase in conf.get('suffixes_similarity_only', [])]  # separator + phrase, same shape as all_suffixes
+        if sim_only_suffixes:
+            sim_only_suffix_regex = six.u('(?:{})$').format(six.u('|').join(sim_only_suffixes + all_suffixes))
 
-    def replace_prefixes(self, name, lang, sim_only=False):
+            self.language_suffix_sim_only_regexes[lang] = re.compile(sim_only_suffix_regex, re.I | re.UNICODE)
+
+    def replace_prefixes(self, name, lang, country=None, sim_only=False):
         name = safe_decode(name).strip()
 
         if not sim_only or lang not in self.language_prefix_sim_only_regexes:
-            re = self.language_prefix_regexes.get(lang)
+            d = self.language_prefix_regexes
         else:
-            re = self.language_prefix_sim_only_regexes.get(lang)
+            d = self.language_prefix_sim_only_regexes
+
+        re = None
+        if country is not None:
+            re = d.get((country, lang))
+        if re is None:
+            re = d.get(lang)
 
         if not re:
             return name
 
         return re.sub(six.u(''), name)
 
-    def replace_suffixes(self, name, lang, sim_only=False):
+    def replace_suffixes(self, name, lang, country=None, sim_only=False):
         name = safe_decode(name).strip()
 
         if not sim_only or lang not in self.language_suffix_sim_only_regexes:
-            re = self.language_suffix_regexes.get(lang)
+            d = self.language_suffix_regexes
         else:
-            re = self.language_suffix_sim_only_regexes.get(lang)
+            d = self.language_suffix_sim_only_regexes
+
+        re = None
+        if country is not None:
+            re = d.get((country, lang))
+        if re is None:
+            re = d.get(lang)
 
         if not re:
             return name