[names] Adding name affix normalizations to a YAML config
This commit is contained in:
15
resources/boundaries/names/languages/en.yaml
Normal file
15
resources/boundaries/names/languages/en.yaml
Normal file
@@ -0,0 +1,15 @@
|
|||||||
|
# Prefixes which can be stripped to normalize a place name
|
||||||
|
prefixes:
|
||||||
|
- city of
|
||||||
|
- township of
|
||||||
|
- regional municipality of
|
||||||
|
- rural city of
|
||||||
|
- municipality of
|
||||||
|
- borough of
|
||||||
|
- london borough of
|
||||||
|
- shire of
|
||||||
|
|
||||||
|
# Suffixes which can be stripped to normalize a place name
|
||||||
|
suffixes:
|
||||||
|
- township
|
||||||
|
- municipality
|
||||||
7
resources/boundaries/names/languages/es.yaml
Normal file
7
resources/boundaries/names/languages/es.yaml
Normal file
@@ -0,0 +1,7 @@
|
|||||||
|
prefixes:
|
||||||
|
# Note: "ciudad de" should not be included as it's part of proper names
|
||||||
|
- colonia
|
||||||
|
|
||||||
|
|
||||||
|
suffixes:
|
||||||
|
- colonia
|
||||||
@@ -1,32 +1,65 @@
|
|||||||
from __future__ import unicode_literals
|
import os
|
||||||
import re
|
import re
|
||||||
|
import six
|
||||||
|
import yaml
|
||||||
|
|
||||||
from geodata.encoding import safe_decode
|
from geodata.encoding import safe_decode
|
||||||
|
|
||||||
name_prefixes = ['{} '.format(s) for s in (
|
this_dir = os.path.realpath(os.path.dirname(__file__))
|
||||||
'city of',
|
|
||||||
'township of',
|
|
||||||
'regional municipality of',
|
|
||||||
'municipality of',
|
|
||||||
'borough of',
|
|
||||||
'london borough of',
|
|
||||||
'town of',
|
|
||||||
)]
|
|
||||||
|
|
||||||
name_suffixes = [' {}'.format(s) for s in (
|
AFFIX_CONFIG_DIR = os.path.join(this_dir, os.pardir, os.pardir, os.pardir,
|
||||||
'township',
|
'resources', 'boundaries', 'names', 'languages')
|
||||||
'municipality',
|
|
||||||
)]
|
|
||||||
|
|
||||||
name_prefix_regex = re.compile('^(?:{})'.format('|'.join(name_prefixes)), re.I | re.UNICODE)
|
|
||||||
name_suffix_regex = re.compile('(?:{})$'.format('|'.join(name_suffixes)), re.I | re.UNICODE)
|
|
||||||
|
|
||||||
|
|
||||||
def replace_name_prefixes(name):
|
class NameAffixes(object):
    """Per-language prefix/suffix phrases that can be stripped from place names.

    Every ``<lang>.yaml`` file found in ``config_dir`` is read once, at
    construction time. A config may define:

    * ``prefixes`` -- list of phrases removable from the start of a name
    * ``suffixes`` -- list of phrases removable from the end of a name
    * ``whitespace`` -- optional boolean (default ``True``); when true an
      affix only matches if a single space separates it from the rest of
      the name.

    Attributes:
        config_dir: directory the configs were loaded from.
        language_prefixes / language_suffixes: ``lang -> [phrase, ...]``,
            lowercased and decoded with ``safe_decode``.
        language_prefix_regexes / language_suffix_regexes:
            ``lang -> compiled pattern``, case-insensitive and anchored at
            the start / end of the name respectively.
    """

    def __init__(self, config_dir=AFFIX_CONFIG_DIR):
        self.config_dir = config_dir

        self.language_prefixes = {}
        self.language_suffixes = {}

        self.language_prefix_regexes = {}
        self.language_suffix_regexes = {}

        for filename in os.listdir(config_dir):
            # The language code is the file name without its extension.
            lang, ext = os.path.splitext(filename)
            if ext != '.yaml':
                continue

            # Binary mode lets PyYAML detect the file encoding itself.
            # safe_load only builds plain Python objects and, unlike a bare
            # yaml.load(stream), does not depend on a default Loader.
            # The context manager closes the handle the old code leaked.
            with open(os.path.join(config_dir, filename), 'rb') as f:
                conf = yaml.safe_load(f) or {}  # empty file -> None

            # "or []" also covers a key that is present but left empty.
            name_prefixes = [safe_decode(phrase).lower()
                             for phrase in conf.get('prefixes') or []]
            name_suffixes = [safe_decode(phrase).lower()
                             for phrase in conf.get('suffixes') or []]

            self.language_prefixes[lang] = name_prefixes
            self.language_suffixes[lang] = name_suffixes

            whitespace_phrase = six.u(' ') if conf.get('whitespace', True) else six.u('')

            self.language_prefix_regexes[lang] = self._compile_affixes(
                six.u('^(?:{})'),
                [phrase + whitespace_phrase for phrase in name_prefixes])
            self.language_suffix_regexes[lang] = self._compile_affixes(
                six.u('(?:{})$'),
                [whitespace_phrase + phrase for phrase in name_suffixes])

    @staticmethod
    def _compile_affixes(template, phrases):
        """Compile ``template`` filled with an alternation of literal phrases.

        Phrases are escaped so that punctuation in a config entry (e.g. a
        trailing ".") is matched literally instead of as regex syntax.
        """
        alternation = six.u('|').join(re.escape(phrase) for phrase in phrases)
        return re.compile(template.format(alternation), re.I | re.UNICODE)

    def replace_prefixes(self, name, lang):
        """Return ``name`` stripped of a leading affix configured for ``lang``.

        The name is decoded and surrounding whitespace is trimmed first. If
        no config was loaded for ``lang`` the trimmed name is returned as is.
        """
        name = safe_decode(name).strip()

        # Named "pattern" so the imported ``re`` module is not shadowed.
        pattern = self.language_prefix_regexes.get(lang)
        if pattern is None:
            return name

        return pattern.sub(six.u(''), name)

    def replace_suffixes(self, name, lang):
        """Return ``name`` stripped of a trailing affix configured for ``lang``.

        The name is decoded and surrounding whitespace is trimmed first. If
        no config was loaded for ``lang`` the trimmed name is returned as is.
        """
        name = safe_decode(name).strip()

        # Named "pattern" so the imported ``re`` module is not shadowed.
        pattern = self.language_suffix_regexes.get(lang)
        if pattern is None:
            return name

        return pattern.sub(six.u(''), name)
|
||||||
|
|
||||||
|
# Shared module-level instance. Constructing it with the default argument
# reads every .yaml config under AFFIX_CONFIG_DIR when this module is imported.
name_affixes = NameAffixes()
|
||||||
|
|||||||
Reference in New Issue
Block a user