diff --git a/resources/boundaries/names/languages/es.yaml b/resources/boundaries/names/languages/es.yaml index 3fa3e211..c12488f5 100644 --- a/resources/boundaries/names/languages/es.yaml +++ b/resources/boundaries/names/languages/es.yaml @@ -2,8 +2,9 @@ prefixes: # Note: "ciudad de" should not be included as it's part of proper names - colonia - municipio nuestra senora de - - ciudad de +prefixes_similarity_only: + - ciudad de suffixes: - colonia \ No newline at end of file diff --git a/scripts/geodata/names/normalization.py b/scripts/geodata/names/normalization.py index c9bd8d97..ee397dff 100644 --- a/scripts/geodata/names/normalization.py +++ b/scripts/geodata/names/normalization.py @@ -21,6 +21,9 @@ class NameAffixes(object): self.language_prefix_regexes = {} self.language_suffix_regexes = {} + self.language_prefix_sim_only_regexes = {} + self.language_suffix_sim_only_regexes = {} + for filename in os.listdir(config_dir): if not filename.endswith('.yaml'): continue @@ -43,25 +46,46 @@ class NameAffixes(object): all_prefixes = [six.u('{}{}').format(s, whitespace_phrase) for s in prefixes] + prefixes_no_whitespace all_suffixes = [six.u('{}{}').format(whitespace_phrase, s) for s in suffixes] + suffixes_no_whitespace - prefix_regex = six.u('^(?:{})').format(six.u('|').join(all_prefixes)) - suffix_regex = six.u('(?:{})$').format(six.u('|').join(all_suffixes)) + if all_prefixes: + prefix_regex = six.u('^(?:{})').format(six.u('|').join(all_prefixes)) + self.language_prefix_regexes[lang] = re.compile(prefix_regex, re.I | re.UNICODE) - self.language_prefix_regexes[lang] = re.compile(prefix_regex, re.I | re.UNICODE) - self.language_suffix_regexes[lang] = re.compile(suffix_regex, re.I | re.UNICODE) + if all_suffixes: + suffix_regex = six.u('(?:{})$').format(six.u('|').join(all_suffixes)) + self.language_suffix_regexes[lang] = re.compile(suffix_regex, re.I | re.UNICODE) - def replace_prefixes(self, name, lang): + sim_only_prefixes = 
[six.u('{}{}').format(safe_decode(phrase.lower()), whitespace_phrase) for phrase in conf.get('prefixes_similarity_only', [])] + if sim_only_prefixes: + sim_only_prefix_regex = six.u('^(?:{})').format(six.u('|').join(sim_only_prefixes + all_prefixes)) + self.language_prefix_sim_only_regexes[lang] = re.compile(sim_only_prefix_regex, re.I | re.UNICODE) + + sim_only_suffixes = [six.u('{}{}').format(whitespace_phrase, safe_decode(phrase.lower())) for phrase in conf.get('suffixes_similarity_only', [])] + if sim_only_suffixes: + sim_only_suffix_regex = six.u('(?:{})$').format(six.u('|').join(sim_only_suffixes + all_suffixes)) + + self.language_suffix_sim_only_regexes[lang] = re.compile(sim_only_suffix_regex, re.I | re.UNICODE) + + def replace_prefixes(self, name, lang, sim_only=False): name = safe_decode(name).strip() - re = self.language_prefix_regexes.get(lang) + if not sim_only or lang not in self.language_prefix_sim_only_regexes: + re = self.language_prefix_regexes.get(lang) + else: + re = self.language_prefix_sim_only_regexes.get(lang) + if not re: return name return re.sub(six.u(''), name) - def replace_suffixes(self, name, lang): + def replace_suffixes(self, name, lang, sim_only=False): name = safe_decode(name).strip() - re = self.language_suffix_regexes.get(lang) + if not sim_only or lang not in self.language_suffix_sim_only_regexes: + re = self.language_suffix_regexes.get(lang) + else: + re = self.language_suffix_sim_only_regexes.get(lang) + if not re: return name