[names] adding country-specific affixes and only normalizing the word City as a suffix in UK/Ireland

This commit is contained in:
Al
2016-12-07 05:37:25 -05:00
parent a9209fae37
commit 9386a999f6
2 changed files with 58 additions and 31 deletions

View File

@@ -18,18 +18,27 @@ prefixes:
- free state of - free state of
- autonomous city of - autonomous city of
- territorial waters of - territorial waters of
- county
- county of - county of
- county
- district of - district of
# Suffixes which can be stripped to normalize a place name # Suffixes which can be stripped to normalize a place name
suffixes: suffixes:
- township - township
- municipality - municipality
- city
- cp - cp
- civil parish - civil parish
- community development council - community development council
- cdc - cdc
- council - council
- borough council
- city council - city council
countries:
gb:
suffixes:
- city
ie:
suffixes:
- city

View File

@@ -30,61 +30,79 @@ class NameAffixes(object):
lang = filename.rsplit('.yaml')[0] lang = filename.rsplit('.yaml')[0]
conf = yaml.load(open(os.path.join(config_dir, filename))) conf = yaml.load(open(os.path.join(config_dir, filename)))
self.add_affixes(lang, conf)
prefixes = [safe_decode(phrase).lower() for phrase in conf.get('prefixes', [])] for country, country_conf in six.iteritems(conf.get('countries', {})):
prefixes_no_whitespace = [safe_decode(phrase).lower() for phrase in conf.get('prefixes_no_whitespace', [])] country_lang = (country, lang)
self.add_affixes(country_lang, country_conf)
def add_affixes(self, lang, *confs):
    """Register affix phrases and their compiled regexes under the key ``lang``.

    ``lang`` is used purely as a dictionary key; callers pass either a
    language code or a ``(country, lang)`` tuple for country-specific
    overrides. Each item of ``confs`` is a mapping that may contain any of
    the keys ``prefixes``, ``prefixes_no_whitespace``, ``suffixes``,
    ``suffixes_no_whitespace``, ``prefixes_similarity_only`` and
    ``suffixes_similarity_only``; the phrase lists of all confs are
    concatenated in the order given.

    Populates ``language_prefixes`` / ``language_suffixes`` unconditionally,
    and the four ``language_*_regexes`` dictionaries only when there is at
    least one phrase to compile for them.
    """
    def collect(key):
        # Flatten the phrases stored under ``key`` across every conf,
        # decoding first so that lowercasing operates on text, not bytes.
        return [safe_decode(phrase).lower()
                for conf in confs
                for phrase in conf.get(key, [])]

    prefixes = collect('prefixes')
    prefixes_no_whitespace = collect('prefixes_no_whitespace')
    self.language_prefixes[lang] = prefixes + prefixes_no_whitespace

    suffixes = collect('suffixes')
    suffixes_no_whitespace = collect('suffixes_no_whitespace')
    self.language_suffixes[lang] = suffixes + suffixes_no_whitespace

    # Separator required between a regular affix and the rest of the name:
    # a single space or hyphen. Same value as the literal '[ \-]', written
    # with an escaped backslash to avoid an invalid-escape warning.
    whitespace_phrase = six.u('[ \\-]')

    # NOTE: phrases are inserted into the patterns as-is (not re.escape'd).
    all_prefixes = [six.u('{}{}').format(phrase, whitespace_phrase) for phrase in prefixes] + prefixes_no_whitespace
    all_suffixes = [six.u('{}{}').format(whitespace_phrase, phrase) for phrase in suffixes] + suffixes_no_whitespace

    if all_prefixes:
        prefix_regex = six.u('^(?:{})').format(six.u('|').join(all_prefixes))
        self.language_prefix_regexes[lang] = re.compile(prefix_regex, re.I | re.UNICODE)

    if all_suffixes:
        suffix_regex = six.u('(?:{})$').format(six.u('|').join(all_suffixes))
        self.language_suffix_regexes[lang] = re.compile(suffix_regex, re.I | re.UNICODE)

    # Similarity-only regexes match the similarity-only phrases in addition
    # to the regular ones, so one substitution handles both sets.
    sim_only_prefixes = [six.u('{}{}').format(phrase, whitespace_phrase)
                         for phrase in collect('prefixes_similarity_only')]
    if sim_only_prefixes:
        sim_only_prefix_regex = six.u('^(?:{})').format(six.u('|').join(sim_only_prefixes + all_prefixes))
        self.language_prefix_sim_only_regexes[lang] = re.compile(sim_only_prefix_regex, re.I | re.UNICODE)

    # Each alternative is "<separator><phrase>"; the group and the '$' anchor
    # are added once around the joined alternatives below. (A one-placeholder
    # template here would silently discard the phrase, because str.format
    # ignores surplus positional arguments.)
    sim_only_suffixes = [six.u('{}{}').format(whitespace_phrase, phrase)
                         for phrase in collect('suffixes_similarity_only')]
    if sim_only_suffixes:
        sim_only_suffix_regex = six.u('(?:{})$').format(six.u('|').join(sim_only_suffixes + all_suffixes))
        self.language_suffix_sim_only_regexes[lang] = re.compile(sim_only_suffix_regex, re.I | re.UNICODE)
def replace_prefixes(self, name, lang, country=None, sim_only=False):
    """Return ``name`` with a leading affix phrase removed, if one matches.

    The name is decoded and stripped of surrounding whitespace first. The
    similarity-only regexes are consulted only when ``sim_only`` is set and
    an entry for ``lang`` exists among them; otherwise the regular prefix
    regexes are used. Within the chosen table, a ``(country, lang)`` entry
    takes precedence over the plain ``lang`` entry when ``country`` is
    given. If no regex is registered, the stripped name is returned
    unchanged.
    """
    name = safe_decode(name).strip()

    use_sim_only = sim_only and lang in self.language_prefix_sim_only_regexes
    if use_sim_only:
        regexes = self.language_prefix_sim_only_regexes
    else:
        regexes = self.language_prefix_regexes

    # Prefer the country-specific pattern, falling back to the language one.
    pattern = regexes.get((country, lang)) if country is not None else None
    if pattern is None:
        pattern = regexes.get(lang)

    return pattern.sub(six.u(''), name) if pattern else name
def replace_suffixes(self, name, lang, sim_only=False): def replace_suffixes(self, name, lang, country=None, sim_only=False):
name = safe_decode(name).strip() name = safe_decode(name).strip()
if not sim_only or lang not in self.language_suffix_sim_only_regexes: if not sim_only or lang not in self.language_suffix_sim_only_regexes:
re = self.language_suffix_regexes.get(lang) d = self.language_suffix_regexes
else: else:
re = self.language_suffix_sim_only_regexes.get(lang) d = self.language_suffix_sim_only_regexes
re = None
if country is not None:
re = d.get((country, lang))
if re is None:
re = d.get(lang)
if not re: if not re:
return name return name