From ba7c9b0818b3f6c6d6eea3291f21fe766e264c17 Mon Sep 17 00:00:00 2001 From: Al Date: Mon, 16 May 2016 10:22:09 -0400 Subject: [PATCH] [names] Name affixes respect hyphens and lack of whitespace (for ideographic languages) --- scripts/geodata/names/normalization.py | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/scripts/geodata/names/normalization.py b/scripts/geodata/names/normalization.py index fd940e13..c9bd8d97 100644 --- a/scripts/geodata/names/normalization.py +++ b/scripts/geodata/names/normalization.py @@ -28,18 +28,23 @@ class NameAffixes(object): conf = yaml.load(open(os.path.join(config_dir, filename))) - prefixes = conf.get('prefixes', []) - name_prefixes = [safe_decode(phrase).lower() for phrase in prefixes] - self.language_prefixes[lang] = name_prefixes + prefixes = [safe_decode(phrase).lower() for phrase in conf.get('prefixes', [])] + prefixes_no_whitespace = [safe_decode(phrase).lower() for phrase in conf.get('prefixes_no_whitespace', [])] - suffixes = conf.get('suffixes', []) - name_suffixes = [safe_decode(phrase).lower() for phrase in suffixes] - self.language_suffixes[lang] = name_suffixes + self.language_prefixes[lang] = prefixes + prefixes_no_whitespace - whitespace_phrase = six.u(' ') if conf.get('whitespace', True) else six.u('') + suffixes = [safe_decode(phrase).lower() for phrase in conf.get('suffixes', [])] + suffixes_no_whitespace = [safe_decode(phrase).lower() for phrase in conf.get('suffixes_no_whitespace', [])] - prefix_regex = six.u('^(?:{})').format(six.u('|').join(['{}{}'.format(s, whitespace_phrase) for s in name_prefixes])) - suffix_regex = six.u('(?:{})$').format(six.u('|').join(['{}{}'.format(whitespace_phrase, s) for s in name_suffixes])) + self.language_suffixes[lang] = suffixes + suffixes_no_whitespace + + whitespace_phrase = six.u('[ \-]') + + all_prefixes = [six.u('{}{}').format(s, whitespace_phrase) for s in prefixes] + prefixes_no_whitespace + all_suffixes = [six.u('{}{}').format(whitespace_phrase, s) for s in suffixes] + suffixes_no_whitespace + + prefix_regex = six.u('^(?:{})').format(six.u('|').join(all_prefixes)) + suffix_regex = six.u('(?:{})$').format(six.u('|').join(all_suffixes)) self.language_prefix_regexes[lang] = re.compile(prefix_regex, re.I | re.UNICODE) self.language_suffix_regexes[lang] = re.compile(suffix_regex, re.I | re.UNICODE)