diff --git a/scripts/geodata/address_expansions/address_dictionaries.py b/scripts/geodata/address_expansions/address_dictionaries.py
index cf8a183b..0cef633c 100644
--- a/scripts/geodata/address_expansions/address_dictionaries.py
+++ b/scripts/geodata/address_expansions/address_dictionaries.py
@@ -31,6 +31,37 @@ address_language_index_t languages[] = {{
 '''
 
 
+# Dictionary file base name (lowercase, no extension) -> constant passed to the rule template
+gazetteer_types = {
+    'academic_degrees': 'DICTIONARY_ACADEMIC_DEGREE',
+    'ambiguous_expansions': 'DICTIONARY_AMBIGUOUS_EXPANSION',
+    'building_types': 'DICTIONARY_BUILDING_TYPE',
+    'company_types': 'DICTIONARY_COMPANY_TYPE',
+    'concatenated_prefixes_separable': 'DICTIONARY_CONCATENATED_PREFIX_SEPARABLE',
+    'concatenated_suffixes_inseparable': 'DICTIONARY_CONCATENATED_SUFFIX_INSEPARABLE',
+    'concatenated_suffixes_separable': 'DICTIONARY_CONCATENATED_SUFFIX_SEPARABLE',
+    'directionals': 'DICTIONARY_DIRECTIONAL',
+    'elisions': 'DICTIONARY_ELISION',
+    'given_names': 'DICTIONARY_GIVEN_NAME',
+    'level_types': 'DICTIONARY_LEVEL',
+    'no_number': 'DICTIONARY_NO_ADDRESS',
+    'nulls': 'DICTIONARY_NULL',
+    'organizations': 'DICTIONARY_NAMED_ORGANIZATION',
+    'people': 'DICTIONARY_NAMED_PERSON',
+    'personal_suffixes': 'DICTIONARY_PERSONAL_SUFFIX',
+    'personal_titles': 'DICTIONARY_PERSONAL_TITLE',
+    'place_names': 'DICTIONARY_PLACE_NAME',
+    'post_office': 'DICTIONARY_POST_OFFICE',
+    'qualifiers': 'DICTIONARY_QUALIFIER',
+    'stopwords': 'DICTIONARY_STOPWORD',
+    'street_types': 'DICTIONARY_STREET_TYPE',
+    'surnames': 'DICTIONARY_SURNAME',
+    'synonyms': 'DICTIONARY_SYNONYM',
+    'toponyms': 'DICTIONARY_TOPONYM',
+    'unit_types': 'DICTIONARY_UNIT',
+}
+
+
 class InvalidAddressFileException(Exception):
     pass
 
@@ -51,8 +82,14 @@ def create_address_expansion_rules_file(base_dir=ADDRESS_EXPANSIONS_DIR, output_
         language_index = len(expansion_rules)
 
         for filename in os.listdir(language_dir):
-            dictionary_name = filename.rstrip('.txt')
-            assert '.' not in dictionary_name
+            # splitext rather than rstrip('.txt'): rstrip removes a character set, not a suffix
+            dictionary_name, extension = os.path.splitext(filename)
+            if extension != '.txt' or '.' in dictionary_name:
+                raise InvalidAddressFileException(u'Invalid extension for file {}/{}, must be .txt'.format(language, filename))
+            dictionary_name = dictionary_name.lower()
+            if dictionary_name not in gazetteer_types:
+                raise InvalidAddressFileException(u'Invalid filename for file {}/{}. Must be one of {{{}}}'.format(language, filename, ', '.join(gazetteer_types)))
+            dictionary_type = gazetteer_types[dictionary_name]
 
             f = open(os.path.join(language_dir, filename))
             for i, line in enumerate(f):
@@ -61,10 +98,10 @@ def create_address_expansion_rules_file(base_dir=ADDRESS_EXPANSIONS_DIR, output_
                     continue
 
                 if u'}' in line:
-                    raise InvalidAddressFileException(u'found }} in file: {}/{}, line {}'.format(language, filename, i+1))
+                    raise InvalidAddressFileException(u'Found }} in file: {}/{}, line {}'.format(language, filename, i+1))
                 phrases = line.split(u'|')
                 if sum((1 for p in phrases if len(p.strip()) == 0)) > 0:
-                    raise InvalidAddressFileException(u'found blank synonym in: {}/{}, line {}'.format(language, filename, i+1))
+                    raise InvalidAddressFileException(u'Found blank synonym in: {}/{}, line {}'.format(language, filename, i+1))
 
                 canonical = phrases[0]
                 if len(phrases) > 1:
@@ -75,7 +112,7 @@ def create_address_expansion_rules_file(base_dir=ADDRESS_EXPANSIONS_DIR, output_
 
                 for p in phrases:
                     rule_template = address_expansion_rule_template.format(phrase=quote_string(p),
-                                                                           dictionary=quote_string(dictionary_name),
+                                                                           dictionary=dictionary_type,
                                                                            canonical_index=canonical_index)
                     expansion_rules.append(rule_template)
                     num_language_rules += 1