From de5d6945b553cdf1f9b2147d26d1c0c627e1ad07 Mon Sep 17 00:00:00 2001 From: Al Date: Mon, 10 Aug 2015 16:15:01 -0400 Subject: [PATCH] [expansion] Adding search_address_dictionaries_prefix/suffix for concatenated prefixes/suffixes e.g. in Germanic languages. Adding a flag to the address_expansion struct and trie value to denote separability, adding prefix/suffix keys during dictionary creation --- src/address_dictionary.c | 93 ++++++++++++++++++++++++++------ src/address_dictionary.h | 9 +++- src/address_dictionary_builder.c | 19 +++---- 3 files changed, 93 insertions(+), 28 deletions(-) diff --git a/src/address_dictionary.c b/src/address_dictionary.c index e0803688..ff5ad206 100644 --- a/src/address_dictionary.c +++ b/src/address_dictionary.c @@ -1,4 +1,3 @@ -#include #include #include @@ -34,11 +33,13 @@ char *address_dictionary_get_canonical(uint32_t index) { return cstring_array_get_string(address_dict->canonical, index); } -bool address_dictionary_add_expansion(char *key, address_expansion_t expansion) { +bool address_dictionary_add_expansion(char *name, char *language, address_expansion_t expansion) { + if (name == NULL) return false; + int ret; - log_debug("key=%s\n", key); - address_expansion_array *expansions = address_dictionary_get_expansions(key); + char *key; + bool free_key = false; expansion_value_t value; value.value = 0; @@ -61,6 +62,33 @@ bool address_dictionary_add_expansion(char *key, address_expansion_t expansion) } } + value.separable = expansion.separable; + + char_array *array = char_array_new_size(strlen(name)); + + if (language != NULL) { + char_array_cat(array, language); + char_array_cat(array, NAMESPACE_SEPARATOR_CHAR); + } + + size_t namespace_len = array->n; + char *trie_key = NULL; + + if (!is_suffix && !is_prefix) { + char_array_cat(array, name); + } else if (is_prefix) { + char_array_cat(array, TRIE_PREFIX_CHAR); + char_array_cat(array, name); + } else if (is_suffix) { + char_array_cat(array, TRIE_SUFFIX_CHAR); + char_array_cat_reversed(array, name); + } + + key = char_array_to_string(array); + + log_debug("key=%s\n", key); + address_expansion_array *expansions = address_dictionary_get_expansions(key); + if (expansions == NULL) { expansions = address_expansion_array_new_size(1); address_expansion_array_push(expansions, expansion); @@ -71,25 +99,17 @@ bool address_dictionary_add_expansion(char *key, address_expansion_t expansion) value.components = expansion.address_components; log_debug("value.count=%d, value.components=%d\n", value.count, value.components); - if (is_phrase) { - trie_add(address_dict->trie, key, value.value); + if (!trie_add(address_dict->trie, key, value.value)) { + log_warn("Key %s could not be added to trie\n", key); + goto exit_key_created;; } - - if (is_suffix) { - trie_add_suffix(address_dict->trie, key, value.value); - } - - if (is_prefix) { - trie_add_prefix(address_dict->trie, key, value.value); - } - } else { uint32_t node_id = trie_get(address_dict->trie, key); log_debug("node_id=%d\n", node_id); if (node_id != NULL_NODE_ID) { if (!trie_get_data_at_index(address_dict->trie, node_id, &value.value)) { log_warn("get_data_at_index returned false\n"); - return false; + goto exit_key_created; } log_debug("value.count=%d, value.components=%d\n", value.count, value.components); @@ -103,15 +123,20 @@ bool address_dictionary_add_expansion(char *key, address_expansion_t expansion) if (!trie_set_data_at_index(address_dict->trie, node_id, value.value)) { log_warn("set_data_at_index returned false for node_id=%d and value=%d\n", node_id, value.value); - return false; + goto exit_key_created; } } address_expansion_array_push(expansions, expansion); } + free(key); + return true; +exit_key_created: + free(key); + return false; } static trie_prefix_result_t get_language_prefix(char *lang) { @@ -154,6 +179,32 @@ phrase_array *search_address_dictionaries_tokens(char *str, token_array *tokens, return trie_search_tokens_from_index(address_dict->trie, str, tokens, prefix.node_id); } +phrase_t search_address_dictionaries_prefix(char *str, size_t len, char *lang) { + if (str == NULL || lang == NULL) return NULL_PHRASE; + + trie_prefix_result_t prefix = get_language_prefix(lang); + + if (prefix.node_id == NULL_NODE_ID) { + log_debug("prefix.node_id == NULL_NODE_ID\n"); + return NULL_PHRASE; + } + + return trie_search_prefixes_from_index_get_prefix_char(address_dict->trie, str, len, prefix.node_id); +} + +phrase_t search_address_dictionaries_suffix(char *str, size_t len, char *lang) { + if (str == NULL || lang == NULL) return NULL_PHRASE; + + trie_prefix_result_t prefix = get_language_prefix(lang); + + if (prefix.node_id == NULL_NODE_ID) { + log_debug("prefix.node_id == NULL_NODE_ID\n"); + return NULL_PHRASE; + } + + return trie_search_suffixes_from_index_get_suffix_char(address_dict->trie, str, len, prefix.node_id); +} + bool address_dictionary_init(void) { if (address_dict != NULL) return false; @@ -242,6 +293,10 @@ static bool address_expansion_read(FILE *f, address_expansion_t *expansion) { return false; } + if (!file_read_uint8(f, (uint8_t *)&expansion->separable)) { + return false; + } + return true; } @@ -269,6 +324,10 @@ static bool address_expansion_write(FILE *f, address_expansion_t expansion) { return false; } + if (!file_write_uint8(f, expansion.separable)) { + return false; + } + return true; } diff --git a/src/address_dictionary.h b/src/address_dictionary.h index 6b5ddcae..a5b2fc4e 100644 --- a/src/address_dictionary.h +++ b/src/address_dictionary.h @@ -29,8 +29,9 @@ typedef union expansion_value { uint32_t value; struct { uint32_t components:16; - uint32_t count:15; + uint32_t count:14; uint32_t canonical:1; + uint32_t separable:1; }; } expansion_value_t; @@ -40,6 +41,7 @@ typedef struct address_expansion { uint32_t num_dictionaries; uint16_t dictionary_ids[MAX_DICTIONARY_TYPES]; uint16_t address_components; + bool separable; } address_expansion_t; VECTOR_INIT(address_expansion_array, address_expansion_t) @@ -59,11 +61,14 @@ bool address_dictionary_init(void); phrase_array *search_address_dictionaries(char *str, char *lang); phrase_array *search_address_dictionaries_tokens(char *str, token_array *tokens, char *lang); +phrase_t search_address_dictionaries_prefix(char *str, size_t len, char *lang); +phrase_t search_address_dictionaries_suffix(char *str, size_t len, char *lang); + address_expansion_array *address_dictionary_get_expansions(char *key); char *address_dictionary_get_canonical(uint32_t index); int32_t address_dictionary_next_canonical_index(void); bool address_dictionary_add_canonical(char *canonical); -bool address_dictionary_add_expansion(char *key, address_expansion_t expansion); +bool address_dictionary_add_expansion(char *key, char *language, address_expansion_t expansion); void address_dictionary_destroy(address_dictionary_t *self); diff --git a/src/address_dictionary_builder.c b/src/address_dictionary_builder.c index 1ffd11bd..1e9bb8e5 100644 --- a/src/address_dictionary_builder.c +++ b/src/address_dictionary_builder.c @@ -51,6 +51,7 @@ int main(int argc, char **argv) { uint16_t address_components = 0; address_expansion_t expansion; + expansion.separable = 0; strcpy(expansion.language, language); expansion.num_dictionaries = expansion_rule.num_dictionaries; @@ -60,6 +61,12 @@ int main(int argc, char **argv) { expansion.dictionary_ids[d] = dictionary_id; + if (dictionary_id == DICTIONARY_CONCATENATED_PREFIX_SEPARABLE || + dictionary_id == DICTIONARY_CONCATENATED_SUFFIX_SEPARABLE || + dictionary_id == DICTIONARY_ELISION) { + expansion.separable = 1; + } + k = kh_get(int_int, dictionary_components, (uint32_t)dictionary_id); if (k == kh_end(dictionary_components)) { log_error("Invalid dictionary_type: %d\n", dictionary_id); @@ -93,20 +100,14 @@ int main(int argc, char **argv) { // Add the phrase itself to the base namespace for existence checks - if (!address_dictionary_add_expansion(expansion_rule.phrase, expansion)) { + if (!address_dictionary_add_expansion(expansion_rule.phrase, NULL, expansion)) { log_error("Could not add expansion {%s}\n", expansion_rule.phrase); exit(EXIT_FAILURE); } // Add phrase namespaced by language for language-specific matching - char_array_clear(key); - char_array_cat(key, language); - char_array_cat(key, NAMESPACE_SEPARATOR_CHAR); - char_array_cat(key, expansion_rule.phrase); - char *token = char_array_get_string(key); - - if (!address_dictionary_add_expansion(token, expansion)) { + if (!address_dictionary_add_expansion(expansion_rule.phrase, language, expansion)) { log_error("Could not add language expansion {%s, %s}\n", language, expansion_rule.phrase); exit(EXIT_FAILURE); } @@ -122,4 +123,4 @@ int main(int argc, char **argv) { kh_destroy(str_int, canonical_indices); address_dictionary_module_teardown(); -} \ No newline at end of file +}