[names] adding country-specific affixes and only normalizing the word City as a suffix in UK/Ireland
This commit is contained in:
@@ -18,18 +18,27 @@ prefixes:
|
|||||||
- free state of
|
- free state of
|
||||||
- autonomous city of
|
- autonomous city of
|
||||||
- territorial waters of
|
- territorial waters of
|
||||||
- county
|
|
||||||
- county of
|
- county of
|
||||||
|
- county
|
||||||
- district of
|
- district of
|
||||||
|
|
||||||
# Suffixes which can be stripped to normalize a place name
|
# Suffixes which can be stripped to normalize a place name
|
||||||
suffixes:
|
suffixes:
|
||||||
- township
|
- township
|
||||||
- municipality
|
- municipality
|
||||||
- city
|
|
||||||
- cp
|
- cp
|
||||||
- civil parish
|
- civil parish
|
||||||
- community development council
|
- community development council
|
||||||
- cdc
|
- cdc
|
||||||
- council
|
- council
|
||||||
|
- borough council
|
||||||
- city council
|
- city council
|
||||||
|
|
||||||
|
countries:
|
||||||
|
gb:
|
||||||
|
suffixes:
|
||||||
|
- city
|
||||||
|
|
||||||
|
ie:
|
||||||
|
suffixes:
|
||||||
|
- city
|
||||||
@@ -30,61 +30,79 @@ class NameAffixes(object):
|
|||||||
lang = filename.rsplit('.yaml')[0]
|
lang = filename.rsplit('.yaml')[0]
|
||||||
|
|
||||||
conf = yaml.load(open(os.path.join(config_dir, filename)))
|
conf = yaml.load(open(os.path.join(config_dir, filename)))
|
||||||
|
self.add_affixes(lang, conf)
|
||||||
|
|
||||||
prefixes = [safe_decode(phrase).lower() for phrase in conf.get('prefixes', [])]
|
for country, country_conf in six.iteritems(conf.get('countries', {})):
|
||||||
prefixes_no_whitespace = [safe_decode(phrase).lower() for phrase in conf.get('prefixes_no_whitespace', [])]
|
country_lang = (country, lang)
|
||||||
|
self.add_affixes(country_lang, country_conf)
|
||||||
|
|
||||||
self.language_prefixes[lang] = prefixes + prefixes_no_whitespace
|
def add_affixes(self, lang, *confs):
|
||||||
|
prefixes = [safe_decode(phrase).lower() for conf in confs for phrase in conf.get('prefixes', [])]
|
||||||
|
prefixes_no_whitespace = [safe_decode(phrase).lower() for conf in confs for phrase in conf.get('prefixes_no_whitespace', [])]
|
||||||
|
|
||||||
suffixes = [safe_decode(phrase).lower() for phrase in conf.get('suffixes', [])]
|
self.language_prefixes[lang] = prefixes + prefixes_no_whitespace
|
||||||
suffixes_no_whitespace = [safe_decode(phrase).lower() for phrase in conf.get('suffixes_no_whitespace', [])]
|
|
||||||
|
|
||||||
self.language_suffixes[lang] = suffixes + suffixes_no_whitespace
|
suffixes = [safe_decode(phrase).lower() for conf in confs for phrase in conf.get('suffixes', [])]
|
||||||
|
suffixes_no_whitespace = [safe_decode(phrase).lower() for conf in confs for phrase in conf.get('suffixes_no_whitespace', [])]
|
||||||
|
|
||||||
whitespace_phrase = six.u('[ \-]')
|
self.language_suffixes[lang] = suffixes + suffixes_no_whitespace
|
||||||
|
|
||||||
all_prefixes = [six.u('{}{}').format(s, whitespace_phrase) for s in prefixes] + prefixes_no_whitespace
|
whitespace_phrase = six.u('[ \-]')
|
||||||
all_suffixes = [six.u('{}{}').format(whitespace_phrase, s) for s in suffixes] + suffixes_no_whitespace
|
|
||||||
|
|
||||||
if all_prefixes:
|
all_prefixes = [six.u('{}{}').format(s, whitespace_phrase) for s in prefixes] + prefixes_no_whitespace
|
||||||
prefix_regex = six.u('^(?:{})').format(six.u('|').join(all_prefixes))
|
all_suffixes = [six.u('{}{}').format(whitespace_phrase, s) for s in suffixes] + suffixes_no_whitespace
|
||||||
self.language_prefix_regexes[lang] = re.compile(prefix_regex, re.I | re.UNICODE)
|
|
||||||
|
|
||||||
if all_suffixes:
|
if all_prefixes:
|
||||||
suffix_regex = six.u('(?:{})$').format(six.u('|').join(all_suffixes))
|
prefix_regex = six.u('^(?:{})').format(six.u('|').join(all_prefixes))
|
||||||
self.language_suffix_regexes[lang] = re.compile(suffix_regex, re.I | re.UNICODE)
|
self.language_prefix_regexes[lang] = re.compile(prefix_regex, re.I | re.UNICODE)
|
||||||
|
|
||||||
sim_only_prefixes = [six.u('{}{}').format(safe_decode(phrase.lower()), whitespace_phrase) for phrase in conf.get('prefixes_similarity_only', [])]
|
if all_suffixes:
|
||||||
if sim_only_prefixes:
|
suffix_regex = six.u('(?:{})$').format(six.u('|').join(all_suffixes))
|
||||||
sim_only_prefix_regex = six.u('^(?:{})').format(six.u('|').join(sim_only_prefixes + all_prefixes))
|
self.language_suffix_regexes[lang] = re.compile(suffix_regex, re.I | re.UNICODE)
|
||||||
self.language_prefix_sim_only_regexes[lang] = re.compile(sim_only_prefix_regex, re.I | re.UNICODE)
|
|
||||||
|
|
||||||
sim_only_suffixes = [six.u('(?:{})$').format(whitespace_phrase, safe_decode(phrase.lower())) for phrase in conf.get('suffixes_similarity_only', [])]
|
sim_only_prefixes = [six.u('{}{}').format(safe_decode(phrase.lower()), whitespace_phrase) for conf in confs for phrase in conf.get('prefixes_similarity_only', [])]
|
||||||
if sim_only_suffixes:
|
if sim_only_prefixes:
|
||||||
sim_only_suffix_regex = six.u('(?:{})$').format(six.u('|').join(sim_only_suffixes + all_suffixes))
|
sim_only_prefix_regex = six.u('^(?:{})').format(six.u('|').join(sim_only_prefixes + all_prefixes))
|
||||||
|
self.language_prefix_sim_only_regexes[lang] = re.compile(sim_only_prefix_regex, re.I | re.UNICODE)
|
||||||
|
|
||||||
self.language_suffix_sim_only_regexes[lang] = re.compile(sim_only_suffix_regex, re.I | re.UNICODE)
|
sim_only_suffixes = [six.u('(?:{})$').format(whitespace_phrase, safe_decode(phrase.lower())) for conf in confs for phrase in conf.get('suffixes_similarity_only', [])]
|
||||||
|
if sim_only_suffixes:
|
||||||
|
sim_only_suffix_regex = six.u('(?:{})$').format(six.u('|').join(sim_only_suffixes + all_suffixes))
|
||||||
|
|
||||||
def replace_prefixes(self, name, lang, sim_only=False):
|
self.language_suffix_sim_only_regexes[lang] = re.compile(sim_only_suffix_regex, re.I | re.UNICODE)
|
||||||
|
|
||||||
|
def replace_prefixes(self, name, lang, country=None, sim_only=False):
|
||||||
name = safe_decode(name).strip()
|
name = safe_decode(name).strip()
|
||||||
|
|
||||||
if not sim_only or lang not in self.language_prefix_sim_only_regexes:
|
if not sim_only or lang not in self.language_prefix_sim_only_regexes:
|
||||||
re = self.language_prefix_regexes.get(lang)
|
d = self.language_prefix_regexes
|
||||||
else:
|
else:
|
||||||
re = self.language_prefix_sim_only_regexes.get(lang)
|
d = self.language_prefix_sim_only_regexes
|
||||||
|
|
||||||
|
re = None
|
||||||
|
if country is not None:
|
||||||
|
re = d.get((country, lang))
|
||||||
|
if re is None:
|
||||||
|
re = d.get(lang)
|
||||||
|
|
||||||
if not re:
|
if not re:
|
||||||
return name
|
return name
|
||||||
|
|
||||||
return re.sub(six.u(''), name)
|
return re.sub(six.u(''), name)
|
||||||
|
|
||||||
def replace_suffixes(self, name, lang, sim_only=False):
|
def replace_suffixes(self, name, lang, country=None, sim_only=False):
|
||||||
name = safe_decode(name).strip()
|
name = safe_decode(name).strip()
|
||||||
|
|
||||||
if not sim_only or lang not in self.language_suffix_sim_only_regexes:
|
if not sim_only or lang not in self.language_suffix_sim_only_regexes:
|
||||||
re = self.language_suffix_regexes.get(lang)
|
d = self.language_suffix_regexes
|
||||||
else:
|
else:
|
||||||
re = self.language_suffix_sim_only_regexes.get(lang)
|
d = self.language_suffix_sim_only_regexes
|
||||||
|
|
||||||
|
re = None
|
||||||
|
if country is not None:
|
||||||
|
re = d.get((country, lang))
|
||||||
|
if re is None:
|
||||||
|
re = d.get(lang)
|
||||||
|
|
||||||
if not re:
|
if not re:
|
||||||
return name
|
return name
|
||||||
|
|||||||
Reference in New Issue
Block a user