[expansion] Moving filename=>dictionary type mapping to the Python generation script and validating there
This commit is contained in:
@@ -31,6 +31,37 @@ address_language_index_t languages[] = {{
|
|||||||
'''
|
'''
|
||||||
|
|
||||||
|
|
||||||
|
gazetteer_types = {
|
||||||
|
'academic_degrees': 'DICTIONARY_ACADEMIC_DEGREE',
|
||||||
|
'ambiguous_expansions': 'DICTIONARY_AMBIGUOUS_EXPANSION',
|
||||||
|
'building_types': 'DICTIONARY_BUILDING_TYPE',
|
||||||
|
'company_types': 'DICTIONARY_COMPANY_TYPE',
|
||||||
|
'concatenated_prefixes_separable': 'DICTIONARY_CONCATENATED_PREFIX_SEPARABLE',
|
||||||
|
'concatenated_suffixes_inseparable': 'DICTIONARY_CONCATENATED_SUFFIX_INSEPARABLE',
|
||||||
|
'concatenated_suffixes_separable': 'DICTIONARY_CONCATENATED_SUFFIX_SEPARABLE',
|
||||||
|
'directionals': 'DICTIONARY_DIRECTIONAL',
|
||||||
|
'elisions': 'DICTIONARY_ELISION',
|
||||||
|
'given_names': 'DICTIONARY_GIVEN_NAME',
|
||||||
|
'level_types': 'DICTIONARY_LEVEL',
|
||||||
|
'no_number': 'DICTIONARY_NO_ADDRESS',
|
||||||
|
'nulls': 'DICTIONARY_NULL',
|
||||||
|
'organizations': 'DICTIONARY_NAMED_ORGANIZATION',
|
||||||
|
'people': 'DICTIONARY_NAMED_PERSON',
|
||||||
|
'personal_suffixes': 'DICTIONARY_PERSONAL_SUFFIX',
|
||||||
|
'personal_titles': 'DICTIONARY_PERSONAL_TITLE',
|
||||||
|
'place_names': 'DICTIONARY_PLACE_NAME',
|
||||||
|
'post_office': 'DICTIONARY_POST_OFFICE',
|
||||||
|
'qualifiers': 'DICTIONARY_QUALIFIER',
|
||||||
|
'stopwords': 'DICTIONARY_STOPWORD',
|
||||||
|
'street_types': 'DICTIONARY_STREET_TYPE',
|
||||||
|
'surnames': 'DICTIONARY_SURNAME',
|
||||||
|
'synonyms': 'DICTIONARY_SYNONYM',
|
||||||
|
'toponyms': 'DICTIONARY_TOPONYM',
|
||||||
|
'unit_types': 'DICTIONARY_UNIT',
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
class InvalidAddressFileException(Exception):
|
class InvalidAddressFileException(Exception):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
@@ -51,8 +82,14 @@ def create_address_expansion_rules_file(base_dir=ADDRESS_EXPANSIONS_DIR, output_
|
|||||||
language_index = len(expansion_rules)
|
language_index = len(expansion_rules)
|
||||||
|
|
||||||
for filename in os.listdir(language_dir):
|
for filename in os.listdir(language_dir):
|
||||||
dictionary_name = filename.rstrip('.txt')
|
dictionary_name = filename.rstrip('.txt').lower()
|
||||||
assert '.' not in dictionary_name
|
if '.' in dictionary_name:
|
||||||
|
raise InvalidAddressFileException(u'Invalid extension for file {}/{}, must be .txt'.format(language, filename))
|
||||||
|
|
||||||
|
if dictionary_name not in gazetteer_types:
|
||||||
|
raise InvalidAddressFileException(u'Invalid filename for file {}/{}. Must be one of {{{}}}'.format(language, filename, ', '.join(gazetteer_types)))
|
||||||
|
|
||||||
|
dictionary_type = gazetteer_types[dictionary_name]
|
||||||
|
|
||||||
f = open(os.path.join(language_dir, filename))
|
f = open(os.path.join(language_dir, filename))
|
||||||
for i, line in enumerate(f):
|
for i, line in enumerate(f):
|
||||||
@@ -61,10 +98,10 @@ def create_address_expansion_rules_file(base_dir=ADDRESS_EXPANSIONS_DIR, output_
|
|||||||
continue
|
continue
|
||||||
|
|
||||||
if u'}' in line:
|
if u'}' in line:
|
||||||
raise InvalidAddressFileException(u'found }} in file: {}/{}, line {}'.format(language, filename, i+1))
|
raise InvalidAddressFileException(u'Found }} in file: {}/{}, line {}'.format(language, filename, i+1))
|
||||||
phrases = line.split(u'|')
|
phrases = line.split(u'|')
|
||||||
if sum((1 for p in phrases if len(p.strip()) == 0)) > 0:
|
if sum((1 for p in phrases if len(p.strip()) == 0)) > 0:
|
||||||
raise InvalidAddressFileException(u'found blank synonym in: {}/{}, line {}'.format(language, filename, i+1))
|
raise InvalidAddressFileException(u'Found blank synonym in: {}/{}, line {}'.format(language, filename, i+1))
|
||||||
|
|
||||||
canonical = phrases[0]
|
canonical = phrases[0]
|
||||||
if len(phrases) > 1:
|
if len(phrases) > 1:
|
||||||
@@ -75,7 +112,7 @@ def create_address_expansion_rules_file(base_dir=ADDRESS_EXPANSIONS_DIR, output_
|
|||||||
|
|
||||||
for p in phrases:
|
for p in phrases:
|
||||||
rule_template = address_expansion_rule_template.format(phrase=quote_string(p),
|
rule_template = address_expansion_rule_template.format(phrase=quote_string(p),
|
||||||
dictionary=quote_string(dictionary_name),
|
dictionary=dictionary_type,
|
||||||
canonical_index=canonical_index)
|
canonical_index=canonical_index)
|
||||||
expansion_rules.append(rule_template)
|
expansion_rules.append(rule_template)
|
||||||
num_language_rules += 1
|
num_language_rules += 1
|
||||||
|
|||||||
Reference in New Issue
Block a user