[dictionaries] Encapsulate the reading of address dictionaries so that it's easy to implement sampling for the address training data

This commit is contained in:
Al
2016-04-08 18:12:30 -04:00
parent 02e82e5342
commit d38de71854

View File

@@ -66,6 +66,8 @@ gazetteer_types = {
'academic_degrees': 'DICTIONARY_ACADEMIC_DEGREE',
'ambiguous_expansions': 'DICTIONARY_AMBIGUOUS_EXPANSION',
'building_types': 'DICTIONARY_BUILDING_TYPE',
'categories': 'DICTIONARY_CATEGORY',
'categories_plural': 'DICTIONARY_CATEGORY_PLURAL',
'company_types': 'DICTIONARY_COMPANY_TYPE',
'concatenated_prefixes_separable': 'DICTIONARY_CONCATENATED_PREFIX_SEPARABLE',
'concatenated_suffixes_inseparable': 'DICTIONARY_CONCATENATED_SUFFIX_INSEPARABLE',
@@ -95,7 +97,7 @@ gazetteer_types = {
'surnames': 'DICTIONARY_SURNAME',
'synonyms': 'DICTIONARY_SYNONYM',
'toponyms': 'DICTIONARY_TOPONYM',
'unit_direction': 'DICTIONARY_UNIT_DIRECTION',
'unit_directions': 'DICTIONARY_UNIT_DIRECTION',
'unit_types_numbered': 'DICTIONARY_UNIT_NUMBERED',
'unit_types_standalone': 'DICTIONARY_UNIT_STANDALONE',
@@ -106,10 +108,71 @@ class InvalidAddressFileException(Exception):
pass
def read_dictionary_file(path):
    """Yield the phrase lists stored in one address dictionary file.

    Every non-blank line is decoded with ``safe_decode`` and split on ``|``;
    the resulting list of phrases is yielded unchanged (individual phrases
    are not stripped).

    :param path: path of the dictionary file to read.
    :raises InvalidAddressFileException: if a line contains ``}`` or if any
        ``|``-separated phrase on a line is empty or whitespace-only. The
        message carries ``path`` and the 1-based line number.
    """
    # The context manager closes the file when the generator finishes, when
    # validation raises, or when the caller stops iterating early.
    with open(path) as f:
        for i, line in enumerate(f):
            line = safe_decode(line.rstrip())
            if not line.strip():
                continue

            if u'}' in line:
                raise InvalidAddressFileException(u'Found }} in file: {}, line {}'.format(path, i + 1))

            phrases = line.split(u'|')
            if any(not p.strip() for p in phrases):
                raise InvalidAddressFileException(u'Found blank synonym in: {}, line {}'.format(path, i + 1))

            yield phrases
def quote_string(s):
    """Return ``s`` (decoded via ``safe_decode``) wrapped in double quotes.

    Any double-quote character inside the text is preceded by a backslash.
    """
    escaped = safe_decode(s).replace('"', '\\"')
    return u'"{}"'.format(escaped)
class AddressPhraseDictionaries(object):
    """In-memory copy of the per-language address dictionary files.

    Attributes set by ``__init__``:
      base_dir: the directory that was scanned.
      languages: names of the sub-directories of ``base_dir``, in
          ``os.listdir`` order.
      language_dictionaries: dict mapping language -> list of dictionary
          names (lower-cased file name up to its first ``.``).
      phrases: dict mapping ``(language, dictionary_name)`` -> list of
          phrase lists as produced by ``read_dictionary_file``.
    """

    def __init__(self, base_dir=ADDRESS_EXPANSIONS_DIR):
        """Load every dictionary file found under ``base_dir``.

        :param base_dir: directory holding one sub-directory per language;
            entries that are not directories are ignored.
        :raises InvalidAddressFileException: if a file in a language
            directory does not end in ``.txt``, if its name is not a key of
            ``gazetteer_types``, or if ``read_dictionary_file`` rejects one
            of its lines.
        """
        self.base_dir = base_dir
        self.languages = []
        self.language_dictionaries = defaultdict(list)
        self.phrases = defaultdict(list)

        for language in os.listdir(base_dir):
            language_dir = os.path.join(base_dir, language)
            if not os.path.isdir(language_dir):
                continue

            self.languages.append(language)
            # Touch the entry so a language with no files still maps to an
            # empty list once the defaultdict is frozen into a plain dict.
            dictionary_names = self.language_dictionaries[language]

            for filename in os.listdir(language_dir):
                if not filename.endswith('.txt'):
                    raise InvalidAddressFileException(u'Invalid extension for file {}/{}, must be .txt'.format(language_dir, filename))

                dictionary_name = filename.split('.')[0].lower()
                if dictionary_name not in gazetteer_types:
                    raise InvalidAddressFileException(u'Invalid filename for file {}/{}. Must be one of {{{}}}'.format(language_dir, filename, ', '.join(sorted(gazetteer_types))))

                dictionary_names.append(dictionary_name)

                # Delegate parsing/validation to the shared reader so error
                # messages carry the real path. extend() also creates the key
                # for a file without phrases, keeping later lookups safe.
                path = os.path.join(language_dir, filename)
                self.phrases[(language, dictionary_name)].extend(read_dictionary_file(path))

        # Freeze into plain dicts so lookups of unknown keys raise KeyError
        # instead of silently inserting empty lists.
        self.language_dictionaries = dict(self.language_dictionaries)
        self.phrases = dict(self.phrases)
# Shared module-level instance. Constructing it here scans the default
# base_dir (ADDRESS_EXPANSIONS_DIR) as soon as this module is imported.
address_phrase_dictionaries = AddressPhraseDictionaries()
def create_address_expansion_rules_file(base_dir=ADDRESS_EXPANSIONS_DIR, output_file=ADDRESS_DATA_FILE, header_file=ADDRESS_HEADER_FILE):
address_languages = []
expansion_rules = []
@@ -117,39 +180,17 @@ def create_address_expansion_rules_file(base_dir=ADDRESS_EXPANSIONS_DIR, output_
max_dictionary_types = 0
for language in os.listdir(base_dir):
language_dir = os.path.join(base_dir, language)
if not os.path.isdir(language_dir):
continue
for language in address_phrase_dictionaries.languages:
num_language_rules = 0
language_index = len(expansion_rules)
language_canonical_dictionaries = defaultdict(list)
canonical_indices = {}
for filename in os.listdir(language_dir):
dictionary_name = filename.rstrip('.txt').lower()
if '.' in dictionary_name:
raise InvalidAddressFileException(u'Invalid extension for file {}/{}, must be .txt'.format(language, filename))
if dictionary_name not in gazetteer_types:
raise InvalidAddressFileException(u'Invalid filename for file {}/{}. Must be one of {{{}}}'.format(language, filename, ', '.join(gazetteer_types)))
for dictionary_name in address_phrase_dictionaries.language_dictionaries[language]:
dictionary_type = gazetteer_types[dictionary_name]
f = open(os.path.join(language_dir, filename))
for i, line in enumerate(f):
line = safe_decode(line.rstrip())
if not line.strip():
continue
if u'}' in line:
raise InvalidAddressFileException(u'Found }} in file: {}/{}, line {}'.format(language, filename, i+1))
phrases = line.split(u'|')
if sum((1 for p in phrases if len(p.strip()) == 0)) > 0:
raise InvalidAddressFileException(u'Found blank synonym in: {}/{}, line {}'.format(language, filename, i+1))
for phrases in address_phrase_dictionaries.phrases[(language, dictionary_name)]:
canonical = phrases[0]
if len(phrases) > 1:
canonical_index = canonical_indices.get(canonical, None)
@@ -197,7 +238,6 @@ def create_address_expansion_rules_file(base_dir=ADDRESS_EXPANSIONS_DIR, output_
out.close()
if __name__ == '__main__':
if len(sys.argv) > 1:
input_dir = sys.argv[1]