# Generates the C sources holding the address expansion rules
# (address_expansion_rule.h and address_expansion_data.c) from the
# per-language dictionary text files under resources/dictionaries.
import os
import sys

from collections import defaultdict

this_dir = os.path.realpath(os.path.dirname(__file__))
# Make the package two directories up importable before importing geodata.
sys.path.append(os.path.realpath(os.path.join(this_dir, os.pardir, os.pardir)))

from geodata.encoding import safe_encode, safe_decode

ADDRESS_EXPANSIONS_DIR = os.path.join(this_dir, os.pardir, os.pardir, os.pardir,
                                      'resources', 'dictionaries')

ADDRESS_HEADER_FILE = os.path.join(this_dir, os.pardir, os.pardir, os.pardir,
                                   'src', 'address_expansion_rule.h')
ADDRESS_DATA_FILE = os.path.join(this_dir, os.pardir, os.pardir, os.pardir,
                                 'src', 'address_expansion_data.c')

# Extension expected on every dictionary file.
DICTIONARY_FILE_SUFFIX = '.txt'

# One C initializer per language: {"lang", first_rule_index, rule_count}
address_language_index_template = u'{{{language}, {index}, {length}}}'
# One C initializer per rule: {"phrase", n, {DICTIONARY_A, ...}, canonical_index}
address_expansion_rule_template = u'{{{phrase}, {num_dictionaries}, {{{dictionaries}}}, {canonical_index}}}'

# NOTE(review): the two system headers below supply size_t and the
# uint32_t/int32_t types used by the structs; confirm they match the headers
# the C sources expect.
address_expansion_rule_header_template = u'''
#ifndef ADDRESS_EXPANSION_RULE_H
#define ADDRESS_EXPANSION_RULE_H

#include <stdlib.h>
#include <stdint.h>
#include "constants.h"
#include "gazetteers.h"

#define MAX_DICTIONARY_TYPES {max_dictionary_types}

typedef struct address_expansion_rule {{
    char *phrase;
    uint32_t num_dictionaries;
    dictionary_type_t dictionaries[MAX_DICTIONARY_TYPES];
    int32_t canonical_index;
}} address_expansion_rule_t;

typedef struct address_language_index {{
    char language[MAX_LANGUAGE_LEN];
    uint32_t index;
    size_t len;
}} address_language_index_t;

#endif
'''

address_expansion_data_file_template = u'''
char *canonical_strings[] = {{
    {canonical_strings}
}};

address_expansion_rule_t expansion_rules[] = {{
    {expansion_rules}
}};

address_language_index_t expansion_languages[] = {{
    {address_languages}
}};
'''

# Maps a dictionary file's base name to the C dictionary-type constant that is
# written into the generated rules.
gazetteer_types = {
    'academic_degrees': 'DICTIONARY_ACADEMIC_DEGREE',
    'ambiguous_expansions': 'DICTIONARY_AMBIGUOUS_EXPANSION',
    'building_types': 'DICTIONARY_BUILDING_TYPE',
    'company_types': 'DICTIONARY_COMPANY_TYPE',
    'concatenated_prefixes_separable': 'DICTIONARY_CONCATENATED_PREFIX_SEPARABLE',
    'concatenated_suffixes_inseparable': 'DICTIONARY_CONCATENATED_SUFFIX_INSEPARABLE',
    'concatenated_suffixes_separable': 'DICTIONARY_CONCATENATED_SUFFIX_SEPARABLE',
    'directionals': 'DICTIONARY_DIRECTIONAL',
    'elisions': 'DICTIONARY_ELISION',
    'given_names': 'DICTIONARY_GIVEN_NAME',
    'level_types': 'DICTIONARY_LEVEL',
    'no_number': 'DICTIONARY_NO_ADDRESS',
    'nulls': 'DICTIONARY_NULL',
    'organizations': 'DICTIONARY_NAMED_ORGANIZATION',
    'people': 'DICTIONARY_NAMED_PERSON',
    'personal_suffixes': 'DICTIONARY_PERSONAL_SUFFIX',
    'personal_titles': 'DICTIONARY_PERSONAL_TITLE',
    'place_names': 'DICTIONARY_PLACE_NAME',
    'post_office': 'DICTIONARY_POST_OFFICE',
    'qualifiers': 'DICTIONARY_QUALIFIER',
    'stopwords': 'DICTIONARY_STOPWORD',
    'street_types': 'DICTIONARY_STREET_TYPE',
    'surnames': 'DICTIONARY_SURNAME',
    'synonyms': 'DICTIONARY_SYNONYM',
    'toponyms': 'DICTIONARY_TOPONYM',
    'unit_types': 'DICTIONARY_UNIT',
}


class InvalidAddressFileException(Exception):
    """Raised when a dictionary file has a bad name or malformed content."""
    pass


def quote_string(s):
    """Return *s* as a double-quoted C string literal.

    Backslashes are escaped before double quotes so that a backslash in the
    input cannot combine with the following character into a C escape
    sequence.
    """
    escaped = safe_decode(s).replace(u'\\', u'\\\\').replace(u'"', u'\\"')
    return u'"{}"'.format(escaped)


def _dictionary_type_for_file(language, filename):
    """Validate a dictionary file name and return its C dictionary constant.

    Raises InvalidAddressFileException if the file has an extension other
    than .txt or if its base name is not a key of gazetteer_types.
    """
    # Remove the literal suffix; str.rstrip('.txt') would instead strip any
    # trailing run of the characters '.', 't' and 'x'.
    if filename.endswith(DICTIONARY_FILE_SUFFIX):
        base_name = filename[:-len(DICTIONARY_FILE_SUFFIX)]
    else:
        base_name = filename
    dictionary_name = base_name.lower()

    if '.' in dictionary_name:
        raise InvalidAddressFileException(
            u'Invalid extension for file {}/{}, must be .txt'.format(language, filename))
    if dictionary_name not in gazetteer_types:
        raise InvalidAddressFileException(
            u'Invalid filename for file {}/{}. Must be one of {{{}}}'.format(
                language, filename, ', '.join(gazetteer_types)))

    return gazetteer_types[dictionary_name]


def create_address_expansion_rules_file(base_dir=ADDRESS_EXPANSIONS_DIR,
                                        output_file=ADDRESS_DATA_FILE,
                                        header_file=ADDRESS_HEADER_FILE):
    """Generate the C header and data file for the address expansion rules.

    base_dir holds one sub-directory per language, each containing dictionary
    files named after the keys of gazetteer_types. Every non-blank line of a
    dictionary file is a '|'-separated list of phrases whose first entry is
    the canonical form.

    Writes header_file (struct definitions and MAX_DICTIONARY_TYPES) and
    output_file (the canonical strings, the expansion rules and the
    per-language index into the rules).

    Raises InvalidAddressFileException for a badly named dictionary file, a
    line containing '}', or a line containing a blank phrase.
    """
    address_languages = []
    expansion_rules = []
    canonical_strings = []

    max_dictionary_types = 0

    for language in os.listdir(base_dir):
        language_dir = os.path.join(base_dir, language)
        # Stray files next to the language directories are not dictionaries.
        if not os.path.isdir(language_dir):
            continue

        num_language_rules = 0
        # This language's rules start where the previous language's ended.
        language_index = len(expansion_rules)

        # (phrase, canonical_index) -> dictionary types the pair appears in
        language_canonical_dictionaries = defaultdict(list)
        # canonical phrase -> its index in canonical_strings, for this language
        canonical_indices = {}

        for filename in os.listdir(language_dir):
            dictionary_type = _dictionary_type_for_file(language, filename)

            with open(os.path.join(language_dir, filename)) as f:
                for line_number, line in enumerate(f, 1):
                    line = safe_decode(line.rstrip())
                    if not line.strip():
                        continue

                    if u'}' in line:
                        raise InvalidAddressFileException(
                            u'Found }} in file: {}/{}, line {}'.format(
                                language, filename, line_number))

                    phrases = line.split(u'|')
                    if any(not p.strip() for p in phrases):
                        raise InvalidAddressFileException(
                            u'Found blank synonym in: {}/{}, line {}'.format(
                                language, filename, line_number))

                    canonical = phrases[0]
                    if len(phrases) > 1:
                        canonical_index = canonical_indices.get(canonical, None)
                        if canonical_index is None:
                            canonical_index = len(canonical_strings)
                            canonical_strings.append(quote_string(canonical))
                            canonical_indices[canonical] = canonical_index
                    else:
                        # A lone phrase has nothing to expand to.
                        canonical_index = -1

                    for position, phrase in enumerate(phrases):
                        # The canonical phrase itself (position 0) is stored
                        # with index -1; only its variants point back at it.
                        key = (phrase, canonical_index if position > 0 else -1)
                        language_canonical_dictionaries[key].append(dictionary_type)

        for (phrase, canonical_index), dictionary_types in language_canonical_dictionaries.items():
            max_dictionary_types = max(max_dictionary_types, len(dictionary_types))
            rule_template = address_expansion_rule_template.format(
                phrase=quote_string(phrase),
                num_dictionaries=str(len(dictionary_types)),
                dictionaries=', '.join(dictionary_types),
                canonical_index=canonical_index)
            expansion_rules.append(rule_template)
            num_language_rules += 1

        address_languages.append(address_language_index_template.format(
            language=quote_string(language),
            index=language_index,
            length=num_language_rules))

    header = address_expansion_rule_header_template.format(
        max_dictionary_types=str(max_dictionary_types)
    )
    with open(header_file, 'w') as out:
        out.write(safe_encode(header))

    # One initializer per line, indented to sit inside the array braces.
    separator = u',\n    '
    data_file = address_expansion_data_file_template.format(
        canonical_strings=separator.join(canonical_strings),
        expansion_rules=separator.join(expansion_rules),
        address_languages=separator.join(address_languages),
    )
    with open(output_file, 'w') as out:
        out.write(safe_encode(data_file))


if __name__ == '__main__':
    # Optional first argument overrides the dictionaries directory.
    if len(sys.argv) > 1:
        input_dir = sys.argv[1]
    else:
        input_dir = ADDRESS_EXPANSIONS_DIR

    create_address_expansion_rules_file(base_dir=input_dir, output_file=ADDRESS_DATA_FILE)