[expansion] Prefix/suffix expansions by default can apply to ADDRESS_ANY but also inherit the types of any dictionary that lists their canonical form (so we can add suffixes without worrying about whether they're for streets or place names, etc.)

This commit is contained in:
Al
2016-06-28 02:37:38 -04:00
parent 8072b01023
commit be5fd79a48
2 changed files with 116 additions and 54 deletions

View File

@@ -30,90 +30,151 @@ int main(int argc, char **argv) {
khiter_t k;
int ret;
for (int g = 0; g < NUM_DICTIONARY_TYPES; g++) {
gazetteer_t gazetteer = gazetteer_config[g];
int ret;
k = kh_put(int_uint32, dictionary_components, (uint32_t)gazetteer.type, &ret);
kh_value(dictionary_components, k) = (uint32_t) gazetteer.address_components;
}
char_array *key = char_array_new();
for (int i = 0; i < sizeof(expansion_languages) / sizeof(address_language_index_t); i++) {
size_t num_expansion_languages = sizeof(expansion_languages) / sizeof(address_language_index_t);
log_info("num_expansion_languages = %zu\n", num_expansion_languages);
khash_t(str_uint32) *phrase_address_components = kh_init(str_uint32);
for (size_t i = 0; i < num_expansion_languages; i++) {
address_language_index_t lang_index = expansion_languages[i];
char *language = lang_index.language;
kh_clear(str_uint32, phrase_address_components);
log_info("Doing language: %s\n", language);
for (int add_affixes = 0; add_affixes <= 1; add_affixes++) {
for (int j = lang_index.index; j < lang_index.index + lang_index.len; j++) {
address_expansion_rule_t expansion_rule = expansion_rules[j];
for (int j = lang_index.index; j < lang_index.index + lang_index.len; j++) {
address_expansion_rule_t expansion_rule = expansion_rules[j];
uint16_t address_components = 0;
uint16_t address_components = 0;
address_expansion_t expansion;
expansion.separable = 0;
address_expansion_t expansion;
expansion.separable = 0;
strcpy(expansion.language, language);
expansion.num_dictionaries = expansion_rule.num_dictionaries;
strcpy(expansion.language, language);
expansion.num_dictionaries = expansion_rule.num_dictionaries;
for (int d = 0; d < expansion_rule.num_dictionaries; d++) {
uint16_t dictionary_id = (uint16_t) expansion_rule.dictionaries[d];
bool is_affix = false;
expansion.dictionary_ids[d] = dictionary_id;
for (int d = 0; d < expansion_rule.num_dictionaries; d++) {
uint16_t dictionary_id = (uint16_t) expansion_rule.dictionaries[d];
if (dictionary_id == DICTIONARY_CONCATENATED_PREFIX_SEPARABLE ||
dictionary_id == DICTIONARY_CONCATENATED_SUFFIX_SEPARABLE ||
dictionary_id == DICTIONARY_ELISION) {
expansion.separable = 1;
}
expansion.dictionary_ids[d] = dictionary_id;
k = kh_get(int_uint32, dictionary_components, (uint32_t)dictionary_id);
if (k == kh_end(dictionary_components)) {
log_error("Invalid dictionary_type: %d\n", dictionary_id);
exit(EXIT_FAILURE);
}
address_components |= (uint16_t) kh_value(dictionary_components, k);
}
if (dictionary_id == DICTIONARY_CONCATENATED_PREFIX_SEPARABLE ||
dictionary_id == DICTIONARY_CONCATENATED_SUFFIX_SEPARABLE ||
dictionary_id == DICTIONARY_ELISION) {
expansion.separable = 1;
is_affix = true;
} else if (dictionary_id == DICTIONARY_CONCATENATED_SUFFIX_INSEPARABLE) {
is_affix = true;
}
expansion.address_components = address_components;
char *canonical = NULL;
if (expansion_rule.canonical_index != -1) {
canonical = canonical_strings[expansion_rule.canonical_index];
}
if (canonical == NULL) {
expansion.canonical_index = -1;
} else {
k = kh_get(str_uint32, canonical_indices, canonical);
if (k != kh_end(canonical_indices)) {
expansion.canonical_index = kh_value(canonical_indices, k);
} else {
expansion.canonical_index = address_dictionary_next_canonical_index();
if (!address_dictionary_add_canonical(canonical)) {
log_error("Error adding canonical string: %s\n", canonical);
k = kh_get(int_uint32, dictionary_components, (uint32_t)dictionary_id);
if (k == kh_end(dictionary_components)) {
log_error("Invalid dictionary_type: %d\n", dictionary_id);
exit(EXIT_FAILURE);
}
address_components |= (uint16_t) kh_value(dictionary_components, k);
}
char *canonical = NULL;
if (expansion_rule.canonical_index != -1) {
canonical = canonical_strings[expansion_rule.canonical_index];
}
if (!add_affixes || !is_affix) {
expansion.address_components = address_components;
} else {
char *phrase = canonical == NULL ? expansion_rule.phrase : canonical;
k = kh_get(str_uint32, phrase_address_components, phrase);
if (k != kh_end(phrase_address_components)) {
uint32_t val = kh_value(phrase_address_components, k);
expansion.address_components = (uint16_t)val;
} else {
expansion.address_components = address_components;
}
}
}
if (canonical == NULL) {
expansion.canonical_index = -1;
// Add the phrase itself to the base namespace for existence checks
if (!add_affixes && is_affix) {
uint32_t phrase_components = (uint32_t)address_components;
if (!address_dictionary_add_expansion(expansion_rule.phrase, NULL, expansion)) {
log_error("Could not add expansion {%s}\n", expansion_rule.phrase);
exit(EXIT_FAILURE);
}
k = kh_get(str_uint32, phrase_address_components, expansion_rule.phrase);
if (k != kh_end(phrase_address_components)) {
phrase_components |= kh_value(phrase_address_components, k);
}
// Add phrase namespaced by language for language-specific matching
k = kh_put(str_uint32, phrase_address_components, expansion_rule.phrase, &ret);
if (!address_dictionary_add_expansion(expansion_rule.phrase, language, expansion)) {
log_error("Could not add language expansion {%s, %s}\n", language, expansion_rule.phrase);
exit(EXIT_FAILURE);
if (ret < 0) {
log_error("Error in kh_put on phrase_address_components\n");
exit(EXIT_FAILURE);
}
kh_value(phrase_address_components, k) = phrase_components;
}
} else {
if (add_affixes) {
k = kh_get(str_uint32, canonical_indices, canonical);
if (k != kh_end(canonical_indices)) {
expansion.canonical_index = kh_value(canonical_indices, k);
} else {
uint32_t canonical_index = address_dictionary_next_canonical_index();
if (!address_dictionary_add_canonical(canonical)) {
log_error("Error adding canonical string: %s\n", canonical);
exit(EXIT_FAILURE);
}
expansion.canonical_index = canonical_index;
k = kh_put(str_uint32, canonical_indices, canonical, &ret);
if (ret < 0) {
log_error("Error in kh_put on canonical_indices\n");
exit(EXIT_FAILURE);
}
kh_value(canonical_indices, k) = canonical_index;
}
}
}
if (add_affixes) {
// Add the phrase itself to the base namespace for existence checks
if (!address_dictionary_add_expansion(expansion_rule.phrase, NULL, expansion)) {
log_error("Could not add expansion {%s}\n", expansion_rule.phrase);
exit(EXIT_FAILURE);
}
// Add phrase namespaced by language for language-specific matching
if (!address_dictionary_add_expansion(expansion_rule.phrase, language, expansion)) {
log_error("Could not add language expansion {%s, %s}\n", language, expansion_rule.phrase);
exit(EXIT_FAILURE);
}
}
}
}
}
address_dictionary_save(output_file);
char_array_destroy(key);
@@ -121,6 +182,7 @@ int main(int argc, char **argv) {
kh_destroy(int_uint32, dictionary_components);
kh_destroy(str_uint32, canonical_indices);
kh_destroy(str_uint32, phrase_address_components);
address_dictionary_module_teardown();
}

View File

@@ -6,9 +6,9 @@ gazetteer_t gazetteer_config[] = {
{DICTIONARY_CATEGORY, ADDRESS_CATEGORY},
{DICTIONARY_CHAIN, ADDRESS_NAME},
{DICTIONARY_COMPANY_TYPE, ADDRESS_NAME},
{DICTIONARY_CONCATENATED_PREFIX_SEPARABLE, ADDRESS_STREET},
{DICTIONARY_CONCATENATED_SUFFIX_INSEPARABLE, ADDRESS_STREET},
{DICTIONARY_CONCATENATED_SUFFIX_SEPARABLE, ADDRESS_STREET},
{DICTIONARY_CONCATENATED_PREFIX_SEPARABLE, ADDRESS_ANY},
{DICTIONARY_CONCATENATED_SUFFIX_INSEPARABLE, ADDRESS_ANY},
{DICTIONARY_CONCATENATED_SUFFIX_SEPARABLE, ADDRESS_ANY},
{DICTIONARY_CROSS_STREET, ADDRESS_STREET},
{DICTIONARY_DIRECTIONAL, ADDRESS_ANY},
{DICTIONARY_ELISION, ADDRESS_ANY},