diff --git a/src/address_dictionary_builder.c b/src/address_dictionary_builder.c index a518eeed..80c5fdf2 100644 --- a/src/address_dictionary_builder.c +++ b/src/address_dictionary_builder.c @@ -30,90 +30,151 @@ int main(int argc, char **argv) { khiter_t k; + int ret; + for (int g = 0; g < NUM_DICTIONARY_TYPES; g++) { gazetteer_t gazetteer = gazetteer_config[g]; - int ret; k = kh_put(int_uint32, dictionary_components, (uint32_t)gazetteer.type, &ret); kh_value(dictionary_components, k) = (uint32_t) gazetteer.address_components; } char_array *key = char_array_new(); - for (int i = 0; i < sizeof(expansion_languages) / sizeof(address_language_index_t); i++) { + size_t num_expansion_languages = sizeof(expansion_languages) / sizeof(address_language_index_t); + + log_info("num_expansion_languages = %zu\n", num_expansion_languages); + + khash_t(str_uint32) *phrase_address_components = kh_init(str_uint32); + + for (size_t i = 0; i < num_expansion_languages; i++) { address_language_index_t lang_index = expansion_languages[i]; char *language = lang_index.language; + kh_clear(str_uint32, phrase_address_components); + log_info("Doing language: %s\n", language); + for (int add_affixes = 0; add_affixes <= 1; add_affixes++) { - for (int j = lang_index.index; j < lang_index.index + lang_index.len; j++) { - address_expansion_rule_t expansion_rule = expansion_rules[j]; + for (int j = lang_index.index; j < lang_index.index + lang_index.len; j++) { + address_expansion_rule_t expansion_rule = expansion_rules[j]; - uint16_t address_components = 0; + uint16_t address_components = 0; - address_expansion_t expansion; - expansion.separable = 0; + address_expansion_t expansion; + expansion.separable = 0; - strcpy(expansion.language, language); - expansion.num_dictionaries = expansion_rule.num_dictionaries; + strcpy(expansion.language, language); + expansion.num_dictionaries = expansion_rule.num_dictionaries; - for (int d = 0; d < expansion_rule.num_dictionaries; d++) { - uint16_t dictionary_id = (uint16_t) expansion_rule.dictionaries[d]; + bool is_affix = false; - expansion.dictionary_ids[d] = dictionary_id; + for (int d = 0; d < expansion_rule.num_dictionaries; d++) { + uint16_t dictionary_id = (uint16_t) expansion_rule.dictionaries[d]; - if (dictionary_id == DICTIONARY_CONCATENATED_PREFIX_SEPARABLE || - dictionary_id == DICTIONARY_CONCATENATED_SUFFIX_SEPARABLE || - dictionary_id == DICTIONARY_ELISION) { - expansion.separable = 1; - } + expansion.dictionary_ids[d] = dictionary_id; - k = kh_get(int_uint32, dictionary_components, (uint32_t)dictionary_id); - if (k == kh_end(dictionary_components)) { - log_error("Invalid dictionary_type: %d\n", dictionary_id); - exit(EXIT_FAILURE); - } - address_components |= (uint16_t) kh_value(dictionary_components, k); - } + if (dictionary_id == DICTIONARY_CONCATENATED_PREFIX_SEPARABLE || + dictionary_id == DICTIONARY_CONCATENATED_SUFFIX_SEPARABLE || + dictionary_id == DICTIONARY_ELISION) { + expansion.separable = 1; + is_affix = true; + } else if (dictionary_id == DICTIONARY_CONCATENATED_SUFFIX_INSEPARABLE) { + is_affix = true; + } - expansion.address_components = address_components; - - char *canonical = NULL; - if (expansion_rule.canonical_index != -1) { - canonical = canonical_strings[expansion_rule.canonical_index]; - } - - if (canonical == NULL) { - expansion.canonical_index = -1; - } else { - k = kh_get(str_uint32, canonical_indices, canonical); - if (k != kh_end(canonical_indices)) { - expansion.canonical_index = kh_value(canonical_indices, k); - } else { - expansion.canonical_index = address_dictionary_next_canonical_index(); - if (!address_dictionary_add_canonical(canonical)) { - log_error("Error adding canonical string: %s\n", canonical); + k = kh_get(int_uint32, dictionary_components, (uint32_t)dictionary_id); + if (k == kh_end(dictionary_components)) { + log_error("Invalid dictionary_type: %d\n", dictionary_id); exit(EXIT_FAILURE); } + address_components |= (uint16_t) kh_value(dictionary_components, k); + } + + char *canonical = NULL; + if (expansion_rule.canonical_index != -1) { + canonical = canonical_strings[expansion_rule.canonical_index]; + } + + if (!add_affixes || !is_affix) { + expansion.address_components = address_components; + } else { + char *phrase = canonical == NULL ? expansion_rule.phrase : canonical; + + k = kh_get(str_uint32, phrase_address_components, phrase); + + if (k != kh_end(phrase_address_components)) { + uint32_t val = kh_value(phrase_address_components, k); + expansion.address_components = (uint16_t)val; + } else { + expansion.address_components = address_components; + } } - } + if (canonical == NULL) { + expansion.canonical_index = -1; - // Add the phrase itself to the base namespace for existence checks + if (!add_affixes && is_affix) { + uint32_t phrase_components = (uint32_t)address_components; - if (!address_dictionary_add_expansion(expansion_rule.phrase, NULL, expansion)) { - log_error("Could not add expansion {%s}\n", expansion_rule.phrase); - exit(EXIT_FAILURE); - } + k = kh_get(str_uint32, phrase_address_components, expansion_rule.phrase); + if (k != kh_end(phrase_address_components)) { + phrase_components |= kh_value(phrase_address_components, k); + } - // Add phrase namespaced by language for language-specific matching + k = kh_put(str_uint32, phrase_address_components, expansion_rule.phrase, &ret); - if (!address_dictionary_add_expansion(expansion_rule.phrase, language, expansion)) { - log_error("Could not add language expansion {%s, %s}\n", language, expansion_rule.phrase); - exit(EXIT_FAILURE); + if (ret < 0) { + log_error("Error in kh_put on phrase_address_components\n"); + exit(EXIT_FAILURE); + } + kh_value(phrase_address_components, k) = phrase_components; + } + + } else { + + if (add_affixes) { + k = kh_get(str_uint32, canonical_indices, canonical); + if (k != kh_end(canonical_indices)) { + expansion.canonical_index = kh_value(canonical_indices, k); + } else { + uint32_t canonical_index = address_dictionary_next_canonical_index(); + if (!address_dictionary_add_canonical(canonical)) { + log_error("Error adding canonical string: %s\n", canonical); + exit(EXIT_FAILURE); + } + expansion.canonical_index = canonical_index; + k = kh_put(str_uint32, canonical_indices, canonical, &ret); + if (ret < 0) { + log_error("Error in kh_put on canonical_indices\n"); + exit(EXIT_FAILURE); + } + kh_value(canonical_indices, k) = canonical_index; + } + } + } + + if (add_affixes) { + // Add the phrase itself to the base namespace for existence checks + + if (!address_dictionary_add_expansion(expansion_rule.phrase, NULL, expansion)) { + log_error("Could not add expansion {%s}\n", expansion_rule.phrase); + exit(EXIT_FAILURE); + } + + // Add phrase namespaced by language for language-specific matching + + if (!address_dictionary_add_expansion(expansion_rule.phrase, language, expansion)) { + log_error("Could not add language expansion {%s, %s}\n", language, expansion_rule.phrase); + exit(EXIT_FAILURE); + } + + } } } } + + address_dictionary_save(output_file); char_array_destroy(key); @@ -121,6 +182,7 @@ int main(int argc, char **argv) { kh_destroy(int_uint32, dictionary_components); kh_destroy(str_uint32, canonical_indices); + kh_destroy(str_uint32, phrase_address_components); address_dictionary_module_teardown(); } diff --git a/src/gazetteer_data.c b/src/gazetteer_data.c index 268d9aa7..546c17ae 100644 --- a/src/gazetteer_data.c +++ b/src/gazetteer_data.c @@ -6,9 +6,9 @@ gazetteer_t gazetteer_config[] = { {DICTIONARY_CATEGORY, ADDRESS_CATEGORY}, {DICTIONARY_CHAIN, ADDRESS_NAME}, {DICTIONARY_COMPANY_TYPE, ADDRESS_NAME}, - {DICTIONARY_CONCATENATED_PREFIX_SEPARABLE, ADDRESS_STREET}, - {DICTIONARY_CONCATENATED_SUFFIX_INSEPARABLE, ADDRESS_STREET}, - {DICTIONARY_CONCATENATED_SUFFIX_SEPARABLE, ADDRESS_STREET}, + {DICTIONARY_CONCATENATED_PREFIX_SEPARABLE, ADDRESS_ANY}, + {DICTIONARY_CONCATENATED_SUFFIX_INSEPARABLE, ADDRESS_ANY}, + {DICTIONARY_CONCATENATED_SUFFIX_SEPARABLE, ADDRESS_ANY}, {DICTIONARY_CROSS_STREET, ADDRESS_STREET}, {DICTIONARY_DIRECTIONAL, ADDRESS_ANY}, {DICTIONARY_ELISION, ADDRESS_ANY},