[names] Allowing for similarity-only normalization in name affixes

This commit is contained in:
Al
2016-08-22 03:47:03 -04:00
parent 72b5f6b55a
commit 79c9694e2d
2 changed files with 34 additions and 9 deletions

View File

@@ -2,8 +2,9 @@ prefixes:
# Note: "ciudad de" should not be listed under `prefixes`, as it is part of proper names; it belongs under `prefixes_similarity_only` instead
- colonia
- municipio nuestra senora de
- ciudad de
prefixes_similarity_only:
- ciudad de
suffixes:
- colonia

View File

@@ -21,6 +21,9 @@ class NameAffixes(object):
self.language_prefix_regexes = {}
self.language_suffix_regexes = {}
self.language_prefix_sim_only_regexes = {}
self.language_suffix_sim_only_regexes = {}
for filename in os.listdir(config_dir):
if not filename.endswith('.yaml'):
continue
@@ -43,25 +46,46 @@ class NameAffixes(object):
all_prefixes = [six.u('{}{}').format(s, whitespace_phrase) for s in prefixes] + prefixes_no_whitespace
all_suffixes = [six.u('{}{}').format(whitespace_phrase, s) for s in suffixes] + suffixes_no_whitespace
prefix_regex = six.u('^(?:{})').format(six.u('|').join(all_prefixes))
suffix_regex = six.u('(?:{})$').format(six.u('|').join(all_suffixes))
if all_prefixes:
prefix_regex = six.u('^(?:{})').format(six.u('|').join(all_prefixes))
self.language_prefix_regexes[lang] = re.compile(prefix_regex, re.I | re.UNICODE)
self.language_prefix_regexes[lang] = re.compile(prefix_regex, re.I | re.UNICODE)
self.language_suffix_regexes[lang] = re.compile(suffix_regex, re.I | re.UNICODE)
if all_suffixes:
suffix_regex = six.u('(?:{})$').format(six.u('|').join(all_suffixes))
self.language_suffix_regexes[lang] = re.compile(suffix_regex, re.I | re.UNICODE)
def replace_prefixes(self, name, lang):
sim_only_prefixes = [six.u('{}{}').format(safe_decode(phrase.lower()), whitespace_phrase) for phrase in conf.get('prefixes_similarity_only', [])]
if sim_only_prefixes:
sim_only_prefix_regex = six.u('^(?:{})').format(six.u('|').join(sim_only_prefixes + all_prefixes))
self.language_prefix_sim_only_regexes[lang] = re.compile(sim_only_prefix_regex, re.I | re.UNICODE)
sim_only_suffixes = [six.u('(?:{})$').format(whitespace_phrase, safe_decode(phrase.lower())) for phrase in conf.get('suffixes_similarity_only', [])]
if sim_only_suffixes:
sim_only_suffix_regex = six.u('(?:{})$').format(six.u('|').join(sim_only_suffixes + all_suffixes))
self.language_suffix_sim_only_regexes[lang] = re.compile(sim_only_suffix_regex, re.I | re.UNICODE)
def replace_prefixes(self, name, lang, sim_only=False):
    """Remove a configured leading affix for ``lang`` from ``name``.

    :param name: the name to normalize; it is passed through ``safe_decode``
        and stripped of surrounding whitespace before matching.
    :param lang: key used to look up the compiled prefix regex.
    :param sim_only: when true, and a similarity-only regex is registered
        for ``lang``, that regex is used instead of the regular one.  As
        built in the constructor, the similarity-only pattern is compiled
        from the similarity-only phrases plus the regular prefixes.
    :return: ``name`` with the matched prefix replaced by an empty string,
        or the decoded/stripped ``name`` itself when no regex is
        registered for ``lang``.
    """
    name = safe_decode(name).strip()

    # The local is called ``pattern`` (not ``re``) so that it does not
    # shadow the ``re`` module inside this method.
    if sim_only and lang in self.language_prefix_sim_only_regexes:
        pattern = self.language_prefix_sim_only_regexes[lang]
    else:
        # Either similarity-only matching was not requested, or this
        # language has no similarity-only prefixes: use the regular set.
        pattern = self.language_prefix_regexes.get(lang)

    if not pattern:
        return name

    return pattern.sub(six.u(''), name)
def replace_suffixes(self, name, lang):
def replace_suffixes(self, name, lang, sim_only=False):
name = safe_decode(name).strip()
re = self.language_suffix_regexes.get(lang)
if not sim_only or lang not in self.language_suffix_sim_only_regexes:
re = self.language_suffix_regexes.get(lang)
else:
re = self.language_suffix_sim_only_regexes.get(lang)
if not re:
return name