From 9386a999f67371d22b48a3795260d56c10e00864 Mon Sep 17 00:00:00 2001 From: Al Date: Wed, 7 Dec 2016 05:37:25 -0500 Subject: [PATCH] [names] adding country-specific affixes and only normalizing the word City as a suffix in UK/Ireland --- resources/boundaries/names/languages/en.yaml | 13 +++- scripts/geodata/names/normalization.py | 76 ++++++++++++-------- 2 files changed, 58 insertions(+), 31 deletions(-) diff --git a/resources/boundaries/names/languages/en.yaml b/resources/boundaries/names/languages/en.yaml index 7f0cff70..c432ba64 100644 --- a/resources/boundaries/names/languages/en.yaml +++ b/resources/boundaries/names/languages/en.yaml @@ -18,18 +18,27 @@ prefixes: - free state of - autonomous city of - territorial waters of - - county - county of + - county - district of # Suffixes which can be stripped to normalize a place name suffixes: - township - municipality - - city - cp - civil parish - community development council - cdc - council + - borough council - city council + +countries: + gb: + suffixes: + - city + + ie: + suffixes: + - city \ No newline at end of file diff --git a/scripts/geodata/names/normalization.py b/scripts/geodata/names/normalization.py index ee397dff..0a68679f 100644 --- a/scripts/geodata/names/normalization.py +++ b/scripts/geodata/names/normalization.py @@ -30,61 +30,79 @@ class NameAffixes(object): lang = filename.rsplit('.yaml')[0] conf = yaml.load(open(os.path.join(config_dir, filename))) + self.add_affixes(lang, conf) - prefixes = [safe_decode(phrase).lower() for phrase in conf.get('prefixes', [])] - prefixes_no_whitespace = [safe_decode(phrase).lower() for phrase in conf.get('prefixes_no_whitespace', [])] + for country, country_conf in six.iteritems(conf.get('countries', {})): + country_lang = (country, lang) + self.add_affixes(country_lang, country_conf) - self.language_prefixes[lang] = prefixes + prefixes_no_whitespace + def add_affixes(self, lang, *confs): + prefixes = [safe_decode(phrase).lower() for conf in confs for phrase in conf.get('prefixes', [])] + prefixes_no_whitespace = [safe_decode(phrase).lower() for conf in confs for phrase in conf.get('prefixes_no_whitespace', [])] - suffixes = [safe_decode(phrase).lower() for phrase in conf.get('suffixes', [])] - suffixes_no_whitespace = [safe_decode(phrase).lower() for phrase in conf.get('suffixes_no_whitespace', [])] + self.language_prefixes[lang] = prefixes + prefixes_no_whitespace - self.language_suffixes[lang] = suffixes + suffixes_no_whitespace + suffixes = [safe_decode(phrase).lower() for conf in confs for phrase in conf.get('suffixes', [])] + suffixes_no_whitespace = [safe_decode(phrase).lower() for conf in confs for phrase in conf.get('suffixes_no_whitespace', [])] - whitespace_phrase = six.u('[ \-]') + self.language_suffixes[lang] = suffixes + suffixes_no_whitespace - all_prefixes = [six.u('{}{}').format(s, whitespace_phrase) for s in prefixes] + prefixes_no_whitespace - all_suffixes = [six.u('{}{}').format(whitespace_phrase, s) for s in suffixes] + suffixes_no_whitespace + whitespace_phrase = six.u('[ \-]') - if all_prefixes: - prefix_regex = six.u('^(?:{})').format(six.u('|').join(all_prefixes)) - self.language_prefix_regexes[lang] = re.compile(prefix_regex, re.I | re.UNICODE) + all_prefixes = [six.u('{}{}').format(s, whitespace_phrase) for s in prefixes] + prefixes_no_whitespace + all_suffixes = [six.u('{}{}').format(whitespace_phrase, s) for s in suffixes] + suffixes_no_whitespace - if all_suffixes: - suffix_regex = six.u('(?:{})$').format(six.u('|').join(all_suffixes)) - self.language_suffix_regexes[lang] = re.compile(suffix_regex, re.I | re.UNICODE) + if all_prefixes: + prefix_regex = six.u('^(?:{})').format(six.u('|').join(all_prefixes)) + self.language_prefix_regexes[lang] = re.compile(prefix_regex, re.I | re.UNICODE) - sim_only_prefixes = [six.u('{}{}').format(safe_decode(phrase.lower()), whitespace_phrase) for phrase in conf.get('prefixes_similarity_only', [])] - if sim_only_prefixes: - sim_only_prefix_regex = six.u('^(?:{})').format(six.u('|').join(sim_only_prefixes + all_prefixes)) - self.language_prefix_sim_only_regexes[lang] = re.compile(sim_only_prefix_regex, re.I | re.UNICODE) + if all_suffixes: + suffix_regex = six.u('(?:{})$').format(six.u('|').join(all_suffixes)) + self.language_suffix_regexes[lang] = re.compile(suffix_regex, re.I | re.UNICODE) - sim_only_suffixes = [six.u('(?:{})$').format(whitespace_phrase, safe_decode(phrase.lower())) for phrase in conf.get('suffixes_similarity_only', [])] - if sim_only_suffixes: - sim_only_suffix_regex = six.u('(?:{})$').format(six.u('|').join(sim_only_suffixes + all_suffixes)) + sim_only_prefixes = [six.u('{}{}').format(safe_decode(phrase.lower()), whitespace_phrase) for conf in confs for phrase in conf.get('prefixes_similarity_only', [])] + if sim_only_prefixes: + sim_only_prefix_regex = six.u('^(?:{})').format(six.u('|').join(sim_only_prefixes + all_prefixes)) + self.language_prefix_sim_only_regexes[lang] = re.compile(sim_only_prefix_regex, re.I | re.UNICODE) - self.language_suffix_sim_only_regexes[lang] = re.compile(sim_only_suffix_regex, re.I | re.UNICODE) + sim_only_suffixes = [six.u('(?:{})$').format(whitespace_phrase, safe_decode(phrase.lower())) for conf in confs for phrase in conf.get('suffixes_similarity_only', [])] + if sim_only_suffixes: + sim_only_suffix_regex = six.u('(?:{})$').format(six.u('|').join(sim_only_suffixes + all_suffixes)) - def replace_prefixes(self, name, lang, sim_only=False): + self.language_suffix_sim_only_regexes[lang] = re.compile(sim_only_suffix_regex, re.I | re.UNICODE) + + def replace_prefixes(self, name, lang, country=None, sim_only=False): name = safe_decode(name).strip() if not sim_only or lang not in self.language_prefix_sim_only_regexes: - re = self.language_prefix_regexes.get(lang) + d = self.language_prefix_regexes else: - re = self.language_prefix_sim_only_regexes.get(lang) + d = self.language_prefix_sim_only_regexes + + re = None + if country is not None: + re = d.get((country, lang)) + if re is None: + re = d.get(lang) if not re: return name return re.sub(six.u(''), name) - def replace_suffixes(self, name, lang, sim_only=False): + def replace_suffixes(self, name, lang, country=None, sim_only=False): name = safe_decode(name).strip() if not sim_only or lang not in self.language_suffix_sim_only_regexes: - re = self.language_suffix_regexes.get(lang) + d = self.language_suffix_regexes else: - re = self.language_suffix_sim_only_regexes.get(lang) + d = self.language_suffix_sim_only_regexes + + re = None + if country is not None: + re = d.get((country, lang)) + if re is None: + re = d.get(lang) if not re: return name