diff --git a/scripts/geodata/address_expansions/address_dictionaries.py b/scripts/geodata/address_expansions/address_dictionaries.py
index 50faacd7..40a96b84 100644
--- a/scripts/geodata/address_expansions/address_dictionaries.py
+++ b/scripts/geodata/address_expansions/address_dictionaries.py
@@ -66,6 +66,8 @@ gazetteer_types = {
     'academic_degrees': 'DICTIONARY_ACADEMIC_DEGREE',
     'ambiguous_expansions': 'DICTIONARY_AMBIGUOUS_EXPANSION',
     'building_types': 'DICTIONARY_BUILDING_TYPE',
+    'categories': 'DICTIONARY_CATEGORY',
+    'categories_plural': 'DICTIONARY_CATEGORY_PLURAL',
     'company_types': 'DICTIONARY_COMPANY_TYPE',
     'concatenated_prefixes_separable': 'DICTIONARY_CONCATENATED_PREFIX_SEPARABLE',
     'concatenated_suffixes_inseparable': 'DICTIONARY_CONCATENATED_SUFFIX_INSEPARABLE',
@@ -95,7 +97,7 @@ gazetteer_types = {
     'surnames': 'DICTIONARY_SURNAME',
     'synonyms': 'DICTIONARY_SYNONYM',
     'toponyms': 'DICTIONARY_TOPONYM',
-    'unit_direction': 'DICTIONARY_UNIT_DIRECTION',
+    'unit_directions': 'DICTIONARY_UNIT_DIRECTION',
     'unit_types_numbered': 'DICTIONARY_UNIT_NUMBERED',
     'unit_types_standalone': 'DICTIONARY_UNIT_STANDALONE',
 
@@ -106,10 +108,75 @@ class InvalidAddressFileException(Exception):
     pass
 
 
+def read_dictionary_file(path):
+    """Yield the list of '|'-separated phrases on each non-blank line of path.
+
+    Raises InvalidAddressFileException if a line contains '}' or has an
+    empty phrase between separators.
+    """
+    with open(path) as f:
+        for i, line in enumerate(f):
+            line = safe_decode(line.rstrip())
+            if not line.strip():
+                continue
+
+            if u'}' in line:
+                raise InvalidAddressFileException(u'Found }} in file: {}, line {}'.format(path, i+1))
+            phrases = line.split(u'|')
+
+            if any(not p.strip() for p in phrases):
+                raise InvalidAddressFileException(u'Found blank synonym in: {}, line {}'.format(path, i+1))
+
+            yield phrases
+
+
 def quote_string(s):
     return u'"{}"'.format(safe_decode(s).replace('"', '\\"'))
 
 
+class AddressPhraseDictionaries(object):
+    """Loads every language's phrase dictionaries from base_dir into memory.
+
+    languages: names of the language subdirectories found in base_dir.
+    language_dictionaries: language -> list of dictionary names.
+    phrases: (language, dictionary name) -> list of phrase lists.
+    """
+
+    def __init__(self, base_dir=ADDRESS_EXPANSIONS_DIR):
+        self.base_dir = base_dir
+        self.languages = []
+
+        self.language_dictionaries = defaultdict(list)
+        self.phrases = defaultdict(list)
+
+        for language in os.listdir(base_dir):
+            language_dir = os.path.join(base_dir, language)
+            if not os.path.isdir(language_dir):
+                continue
+
+            self.languages.append(language)
+
+            for filename in os.listdir(language_dir):
+                if not filename.endswith('.txt'):
+                    raise InvalidAddressFileException(u'Invalid extension for file {}/{}, must be .txt'.format(language_dir, filename))
+                dictionary_name = filename.split('.')[0].lower()
+
+                if dictionary_name not in gazetteer_types:
+                    raise InvalidAddressFileException(u'Invalid filename for file {}/{}. Must be one of {{{}}}'.format(language_dir, filename, ', '.join(sorted(gazetteer_types))))
+                self.language_dictionaries[language].append(dictionary_name)
+
+                # Parsing and validation live in read_dictionary_file so the
+                # error messages always have a real path to report.
+                path = os.path.join(language_dir, filename)
+                self.phrases[(language, dictionary_name)].extend(read_dictionary_file(path))
+
+        self.language_dictionaries = dict(self.language_dictionaries)
+        self.phrases = dict(self.phrases)
+
+
+address_phrase_dictionaries = AddressPhraseDictionaries()
+
+
 def create_address_expansion_rules_file(base_dir=ADDRESS_EXPANSIONS_DIR, output_file=ADDRESS_DATA_FILE, header_file=ADDRESS_HEADER_FILE):
     address_languages = []
     expansion_rules = []
@@ -117,39 +184,17 @@ def create_address_expansion_rules_file(base_dir=ADDRESS_EXPANSIONS_DIR, output_
 
     max_dictionary_types = 0
 
-    for language in os.listdir(base_dir):
-        language_dir = os.path.join(base_dir, language)
-        if not os.path.isdir(language_dir):
-            continue
-
+    for language in address_phrase_dictionaries.languages:
         num_language_rules = 0
         language_index = len(expansion_rules)
 
         language_canonical_dictionaries = defaultdict(list)
         canonical_indices = {}
 
-        for filename in os.listdir(language_dir):
-            dictionary_name = filename.rstrip('.txt').lower()
-            if '.' in dictionary_name:
-                raise InvalidAddressFileException(u'Invalid extension for file {}/{}, must be .txt'.format(language, filename))
-
-            if dictionary_name not in gazetteer_types:
-                raise InvalidAddressFileException(u'Invalid filename for file {}/{}. Must be one of {{{}}}'.format(language, filename, ', '.join(gazetteer_types)))
-
+        for dictionary_name in address_phrase_dictionaries.language_dictionaries[language]:
             dictionary_type = gazetteer_types[dictionary_name]
 
-            f = open(os.path.join(language_dir, filename))
-            for i, line in enumerate(f):
-                line = safe_decode(line.rstrip())
-                if not line.strip():
-                    continue
-
-                if u'}' in line:
-                    raise InvalidAddressFileException(u'Found }} in file: {}/{}, line {}'.format(language, filename, i+1))
-                phrases = line.split(u'|')
-                if sum((1 for p in phrases if len(p.strip()) == 0)) > 0:
-                    raise InvalidAddressFileException(u'Found blank synonym in: {}/{}, line {}'.format(language, filename, i+1))
-
+            for phrases in address_phrase_dictionaries.phrases[(language, dictionary_name)]:
                 canonical = phrases[0]
                 if len(phrases) > 1:
                     canonical_index = canonical_indices.get(canonical, None)
@@ -197,7 +242,6 @@ def create_address_expansion_rules_file(base_dir=ADDRESS_EXPANSIONS_DIR, output_
     out.close()
 
 
-
 if __name__ == '__main__':
     if len(sys.argv) > 1:
         input_dir = sys.argv[1]