From 88b25a2d2207b6e27ab4d770cea4d561d65c0b55 Mon Sep 17 00:00:00 2001 From: Al Date: Thu, 5 May 2016 18:26:45 -0400 Subject: [PATCH] [names] Adding name affix normalizations to a YAML config --- resources/boundaries/names/languages/en.yaml | 15 ++++ resources/boundaries/names/languages/es.yaml | 7 ++ scripts/geodata/names/normalization.py | 79 ++++++++++++++------ 3 files changed, 78 insertions(+), 23 deletions(-) create mode 100644 resources/boundaries/names/languages/en.yaml create mode 100644 resources/boundaries/names/languages/es.yaml diff --git a/resources/boundaries/names/languages/en.yaml b/resources/boundaries/names/languages/en.yaml new file mode 100644 index 00000000..3abfa21e --- /dev/null +++ b/resources/boundaries/names/languages/en.yaml @@ -0,0 +1,15 @@ +# Prefixes which can be stripped to normalize a place name +prefixes: + - city of + - township of + - regional municipality of + - rural city of + - municipality of + - borough of + - london borough of + - shire of + +# Suffixes which can be stripped to normalize a place name +suffixes: + - township + - municipality diff --git a/resources/boundaries/names/languages/es.yaml b/resources/boundaries/names/languages/es.yaml new file mode 100644 index 00000000..21576254 --- /dev/null +++ b/resources/boundaries/names/languages/es.yaml @@ -0,0 +1,7 @@ +prefixes: + # Note: "ciudad de" should not be included as it's part of proper names + - colonia + + +suffixes: + - colonia \ No newline at end of file diff --git a/scripts/geodata/names/normalization.py b/scripts/geodata/names/normalization.py index 03442559..fd940e13 100644 --- a/scripts/geodata/names/normalization.py +++ b/scripts/geodata/names/normalization.py @@ -1,32 +1,65 @@ -from __future__ import unicode_literals +import os import re +import six +import yaml from geodata.encoding import safe_decode -name_prefixes = ['{} '.format(s) for s in ( - 'city of', - 'township of', - 'regional municipality of', - 'municipality of', - 'borough of', - 'london 
this_dir = os.path.realpath(os.path.dirname(__file__))

# Directory holding one "<language>.yaml" affix config per language, located
# three levels above this module under resources/boundaries/names/languages.
AFFIX_CONFIG_DIR = os.path.join(this_dir, os.pardir, os.pardir, os.pardir,
                                'resources', 'boundaries', 'names', 'languages')


class NameAffixes(object):
    """Strip configured prefixes/suffixes from place names, per language.

    Every ``*.yaml`` file in ``config_dir`` is read once at construction; the
    file name without its extension is used as the language key.  Recognized
    top-level keys in each file:

    * ``prefixes``   -- list of phrases removable from the start of a name
    * ``suffixes``   -- list of phrases removable from the end of a name
    * ``whitespace`` -- optional boolean (default ``True``); when true, a
      single space must separate the affix from the rest of the name

    Attributes:
        config_dir: the directory the configs were loaded from.
        language_prefixes / language_suffixes: ``{lang: [lowercased phrase]}``.
        language_prefix_regexes / language_suffix_regexes:
            ``{lang: compiled case-insensitive pattern}``.
    """

    def __init__(self, config_dir=AFFIX_CONFIG_DIR):
        """Load every YAML affix config found in *config_dir*.

        Raises whatever ``os.listdir``, ``open`` or the YAML parser raise for
        a missing directory or an unreadable/malformed file.
        """
        self.config_dir = config_dir

        self.language_prefixes = {}
        self.language_suffixes = {}

        self.language_prefix_regexes = {}
        self.language_suffix_regexes = {}

        for filename in os.listdir(config_dir):
            lang, extension = os.path.splitext(filename)
            if extension != '.yaml':
                continue

            # Binary mode lets the YAML parser detect the encoding itself
            # instead of depending on the locale; the context manager closes
            # the handle.  safe_load is enough for plain lists/scalars and,
            # unlike bare yaml.load, cannot construct arbitrary objects.
            with open(os.path.join(config_dir, filename), 'rb') as f:
                conf = yaml.safe_load(f) or {}  # empty file parses to None

            name_prefixes = self._phrases(conf, 'prefixes')
            name_suffixes = self._phrases(conf, 'suffixes')
            self.language_prefixes[lang] = name_prefixes
            self.language_suffixes[lang] = name_suffixes

            separator = six.u(' ') if conf.get('whitespace', True) else six.u('')

            # Plain concatenation keeps everything unicode (a byte-string
            # '{}'.format() would try to ASCII-encode on Python 2), and
            # re.escape makes phrases match literally even if they contain
            # regex metacharacters.
            prefix_regex = six.u('^(?:{})').format(
                six.u('|').join([re.escape(phrase + separator) for phrase in name_prefixes]))
            suffix_regex = six.u('(?:{})$').format(
                six.u('|').join([re.escape(separator + phrase) for phrase in name_suffixes]))

            self.language_prefix_regexes[lang] = re.compile(prefix_regex, re.I | re.UNICODE)
            self.language_suffix_regexes[lang] = re.compile(suffix_regex, re.I | re.UNICODE)

    @staticmethod
    def _phrases(conf, key):
        """Return the lowercased phrases listed under *key*; missing or empty keys give []."""
        return [safe_decode(phrase).lower() for phrase in conf.get(key) or []]

    @staticmethod
    def _strip_affix(regexes, name, lang):
        """Strip surrounding whitespace from *name*, then remove the match of *lang*'s pattern, if any."""
        name = safe_decode(name).strip()

        regex = regexes.get(lang)
        if regex is None:
            # No config was loaded for this language: leave the name alone.
            return name

        return regex.sub(six.u(''), name)

    def replace_prefixes(self, name, lang):
        """Return *name* without a leading affix configured for *lang*.

        The match is case-insensitive, e.g. with a config listing ``city of``,
        ``'City of Boston'`` becomes ``'Boston'``.  Names in a language with
        no config are returned unchanged apart from whitespace stripping.
        """
        return self._strip_affix(self.language_prefix_regexes, name, lang)

    def replace_suffixes(self, name, lang):
        """Return *name* without a trailing affix configured for *lang*.

        The match is case-insensitive, e.g. with a config listing
        ``township``, ``'Springfield Township'`` becomes ``'Springfield'``.
        Names in a language with no config are returned unchanged apart from
        whitespace stripping.
        """
        return self._strip_affix(self.language_suffix_regexes, name, lang)


# Module-level instance built from the default config directory at import time.
name_affixes = NameAffixes()