[expansion] Adding an array of dictionaries to each (phrase, canonical) pair
This commit is contained in:
@@ -1,6 +1,8 @@
|
|||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
|
from collections import defaultdict
|
||||||
|
|
||||||
this_dir = os.path.realpath(os.path.dirname(__file__))
|
this_dir = os.path.realpath(os.path.dirname(__file__))
|
||||||
sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir)))
|
sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir)))
|
||||||
|
|
||||||
@@ -9,13 +11,41 @@ from geodata.encoding import safe_encode, safe_decode
|
|||||||
ADDRESS_EXPANSIONS_DIR = os.path.join(this_dir, os.pardir, os.pardir, os.pardir,
|
ADDRESS_EXPANSIONS_DIR = os.path.join(this_dir, os.pardir, os.pardir, os.pardir,
|
||||||
'resources', 'dictionaries')
|
'resources', 'dictionaries')
|
||||||
|
|
||||||
|
ADDRESS_HEADER_FILE = os.path.join(os.pardir, os.pardir, os.pardir, 'src', 'address_expansion_rule.h')
|
||||||
ADDRESS_DATA_FILE = os.path.join(os.pardir, os.pardir, os.pardir, 'src', 'address_expansion_data.c')
|
ADDRESS_DATA_FILE = os.path.join(os.pardir, os.pardir, os.pardir, 'src', 'address_expansion_data.c')
|
||||||
|
|
||||||
|
|
||||||
address_language_index_template = u'{{{language}, {index}, {length}}}'
|
address_language_index_template = u'{{{language}, {index}, {length}}}'
|
||||||
address_expansion_rule_template = u'{{{phrase}, {dictionary}, {canonical_index}}}'
|
address_expansion_rule_template = u'{{{phrase}, {num_dictionaries}, {{{dictionaries}}}, {canonical_index}}}'
|
||||||
|
|
||||||
|
|
||||||
|
address_expansion_rule_header_template = u'''
|
||||||
|
#ifndef ADDRESS_EXPANSION_RULE_H
|
||||||
|
#define ADDRESS_EXPANSION_RULE_H
|
||||||
|
|
||||||
|
#include <stdlib.h>
|
||||||
|
|
||||||
|
#include "constants.h"
|
||||||
|
#include "gazetteers.h"
|
||||||
|
|
||||||
|
#define MAX_DICTIONARY_TYPES {max_dictionary_types}
|
||||||
|
|
||||||
|
typedef struct address_expansion_rule {{
|
||||||
|
char *phrase;
|
||||||
|
uint32_t num_dictionaries;
|
||||||
|
dictionary_type_t dictionaries[MAX_DICTIONARY_TYPES];
|
||||||
|
int32_t canonical_index;
|
||||||
|
}} address_expansion_rule_t;
|
||||||
|
|
||||||
|
typedef struct address_language_index {{
|
||||||
|
char language[MAX_LANGUAGE_LEN];
|
||||||
|
uint32_t index;
|
||||||
|
size_t len;
|
||||||
|
}} address_language_index_t;
|
||||||
|
|
||||||
|
|
||||||
|
#endif
|
||||||
|
'''
|
||||||
|
|
||||||
address_expansion_data_file_template = u'''
|
address_expansion_data_file_template = u'''
|
||||||
char *canonical_strings[] = {{
|
char *canonical_strings[] = {{
|
||||||
{canonical_strings}
|
{canonical_strings}
|
||||||
@@ -70,17 +100,22 @@ def quote_string(s):
|
|||||||
return u'"{}"'.format(safe_decode(s).replace('"', '\\"'))
|
return u'"{}"'.format(safe_decode(s).replace('"', '\\"'))
|
||||||
|
|
||||||
|
|
||||||
def create_address_expansion_rules_file(base_dir=ADDRESS_EXPANSIONS_DIR, output_file=ADDRESS_DATA_FILE):
|
def create_address_expansion_rules_file(base_dir=ADDRESS_EXPANSIONS_DIR, output_file=ADDRESS_DATA_FILE, header_file=ADDRESS_HEADER_FILE):
|
||||||
address_languages = []
|
address_languages = []
|
||||||
expansion_rules = []
|
expansion_rules = []
|
||||||
canonical_strings = []
|
canonical_strings = []
|
||||||
|
|
||||||
|
max_dictionary_types = 0
|
||||||
|
|
||||||
for language in os.listdir(base_dir):
|
for language in os.listdir(base_dir):
|
||||||
language_dir = os.path.join(base_dir, language)
|
language_dir = os.path.join(base_dir, language)
|
||||||
|
|
||||||
num_language_rules = 0
|
num_language_rules = 0
|
||||||
language_index = len(expansion_rules)
|
language_index = len(expansion_rules)
|
||||||
|
|
||||||
|
language_canonical_dictionaries = defaultdict(list)
|
||||||
|
canonical_indices = {}
|
||||||
|
|
||||||
for filename in os.listdir(language_dir):
|
for filename in os.listdir(language_dir):
|
||||||
dictionary_name = filename.rstrip('.txt').lower()
|
dictionary_name = filename.rstrip('.txt').lower()
|
||||||
if '.' in dictionary_name:
|
if '.' in dictionary_name:
|
||||||
@@ -105,22 +140,38 @@ def create_address_expansion_rules_file(base_dir=ADDRESS_EXPANSIONS_DIR, output_
|
|||||||
|
|
||||||
canonical = phrases[0]
|
canonical = phrases[0]
|
||||||
if len(phrases) > 1:
|
if len(phrases) > 1:
|
||||||
canonical_index = len(canonical_strings)
|
canonical_index = canonical_indices.get(canonical, None)
|
||||||
canonical_strings.append(quote_string(canonical))
|
if canonical_index is None:
|
||||||
|
canonical_index = len(canonical_strings)
|
||||||
|
canonical_strings.append(quote_string(canonical))
|
||||||
|
canonical_indices[canonical] = canonical_index
|
||||||
else:
|
else:
|
||||||
canonical_index = -1
|
canonical_index = -1
|
||||||
|
|
||||||
for p in phrases:
|
for p in phrases:
|
||||||
rule_template = address_expansion_rule_template.format(phrase=quote_string(p),
|
language_canonical_dictionaries[(p, canonical_index)].append(dictionary_type)
|
||||||
dictionary=dictionary_type,
|
|
||||||
canonical_index=canonical_index)
|
for (phrase, canonical_index), dictionary_types in language_canonical_dictionaries.iteritems():
|
||||||
expansion_rules.append(rule_template)
|
max_dictionary_types = max(max_dictionary_types, len(dictionary_types))
|
||||||
num_language_rules += 1
|
rule_template = address_expansion_rule_template.format(phrase=quote_string(phrase),
|
||||||
|
num_dictionaries=str(len(dictionary_types)),
|
||||||
|
dictionaries=', '.join(dictionary_types),
|
||||||
|
canonical_index=canonical_index)
|
||||||
|
expansion_rules.append(rule_template)
|
||||||
|
num_language_rules += 1
|
||||||
|
|
||||||
|
|
||||||
address_languages.append(address_language_index_template.format(language=quote_string(language),
|
address_languages.append(address_language_index_template.format(language=quote_string(language),
|
||||||
index=language_index,
|
index=language_index,
|
||||||
length=num_language_rules))
|
length=num_language_rules))
|
||||||
|
|
||||||
|
header = address_expansion_rule_header_template.format(
|
||||||
|
max_dictionary_types=str(max_dictionary_types)
|
||||||
|
)
|
||||||
|
out = open(header_file, 'w')
|
||||||
|
out.write(safe_encode(header))
|
||||||
|
out.close()
|
||||||
|
|
||||||
data_file = address_expansion_data_file_template.format(
|
data_file = address_expansion_data_file_template.format(
|
||||||
canonical_strings=u''',
|
canonical_strings=u''',
|
||||||
'''.join(canonical_strings),
|
'''.join(canonical_strings),
|
||||||
@@ -135,6 +186,7 @@ def create_address_expansion_rules_file(base_dir=ADDRESS_EXPANSIONS_DIR, output_
|
|||||||
out.close()
|
out.close()
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
if len(sys.argv) > 1:
|
if len(sys.argv) > 1:
|
||||||
input_dir = sys.argv[1]
|
input_dir = sys.argv[1]
|
||||||
|
|||||||
Reference in New Issue
Block a user