[expansion] Adding search_address_dictionaries_prefix/suffix for concatenated prefixes/suffixes, e.g. in Germanic languages. Adding a flag to the address_expansion struct and trie value to denote separability, and adding prefix/suffix keys during dictionary creation

This commit is contained in:
Al
2015-08-10 16:15:01 -04:00
parent 0f77ca1213
commit de5d6945b5
3 changed files with 93 additions and 28 deletions

View File

@@ -1,4 +1,3 @@
#include <assert.h>
#include <dirent.h> #include <dirent.h>
#include <limits.h> #include <limits.h>
@@ -34,11 +33,13 @@ char *address_dictionary_get_canonical(uint32_t index) {
return cstring_array_get_string(address_dict->canonical, index); return cstring_array_get_string(address_dict->canonical, index);
} }
bool address_dictionary_add_expansion(char *key, address_expansion_t expansion) { bool address_dictionary_add_expansion(char *name, char *language, address_expansion_t expansion) {
if (name == NULL) return false;
int ret; int ret;
log_debug("key=%s\n", key); char *key;
address_expansion_array *expansions = address_dictionary_get_expansions(key); bool free_key = false;
expansion_value_t value; expansion_value_t value;
value.value = 0; value.value = 0;
@@ -61,6 +62,33 @@ bool address_dictionary_add_expansion(char *key, address_expansion_t expansion)
} }
} }
value.separable = expansion.separable;
char_array *array = char_array_new_size(strlen(name));
if (language != NULL) {
char_array_cat(array, language);
char_array_cat(array, NAMESPACE_SEPARATOR_CHAR);
}
size_t namespace_len = array->n;
char *trie_key = NULL;
if (!is_suffix && !is_prefix) {
char_array_cat(array, name);
} else if (is_prefix) {
char_array_cat(array, TRIE_PREFIX_CHAR);
char_array_cat(array, name);
} else if (is_suffix) {
char_array_cat(array, TRIE_SUFFIX_CHAR);
char_array_cat_reversed(array, name);
}
key = char_array_to_string(array);
log_debug("key=%s\n", key);
address_expansion_array *expansions = address_dictionary_get_expansions(key);
if (expansions == NULL) { if (expansions == NULL) {
expansions = address_expansion_array_new_size(1); expansions = address_expansion_array_new_size(1);
address_expansion_array_push(expansions, expansion); address_expansion_array_push(expansions, expansion);
@@ -71,25 +99,17 @@ bool address_dictionary_add_expansion(char *key, address_expansion_t expansion)
value.components = expansion.address_components; value.components = expansion.address_components;
log_debug("value.count=%d, value.components=%d\n", value.count, value.components); log_debug("value.count=%d, value.components=%d\n", value.count, value.components);
if (is_phrase) { if (!trie_add(address_dict->trie, key, value.value)) {
trie_add(address_dict->trie, key, value.value); log_warn("Key %s could not be added to trie\n", key);
goto exit_key_created;;
} }
if (is_suffix) {
trie_add_suffix(address_dict->trie, key, value.value);
}
if (is_prefix) {
trie_add_prefix(address_dict->trie, key, value.value);
}
} else { } else {
uint32_t node_id = trie_get(address_dict->trie, key); uint32_t node_id = trie_get(address_dict->trie, key);
log_debug("node_id=%d\n", node_id); log_debug("node_id=%d\n", node_id);
if (node_id != NULL_NODE_ID) { if (node_id != NULL_NODE_ID) {
if (!trie_get_data_at_index(address_dict->trie, node_id, &value.value)) { if (!trie_get_data_at_index(address_dict->trie, node_id, &value.value)) {
log_warn("get_data_at_index returned false\n"); log_warn("get_data_at_index returned false\n");
return false; goto exit_key_created;
} }
log_debug("value.count=%d, value.components=%d\n", value.count, value.components); log_debug("value.count=%d, value.components=%d\n", value.count, value.components);
@@ -103,15 +123,20 @@ bool address_dictionary_add_expansion(char *key, address_expansion_t expansion)
if (!trie_set_data_at_index(address_dict->trie, node_id, value.value)) { if (!trie_set_data_at_index(address_dict->trie, node_id, value.value)) {
log_warn("set_data_at_index returned false for node_id=%d and value=%d\n", node_id, value.value); log_warn("set_data_at_index returned false for node_id=%d and value=%d\n", node_id, value.value);
return false; goto exit_key_created;
} }
} }
address_expansion_array_push(expansions, expansion); address_expansion_array_push(expansions, expansion);
} }
free(key);
return true; return true;
exit_key_created:
free(key);
return false;
} }
static trie_prefix_result_t get_language_prefix(char *lang) { static trie_prefix_result_t get_language_prefix(char *lang) {
@@ -154,6 +179,32 @@ phrase_array *search_address_dictionaries_tokens(char *str, token_array *tokens,
return trie_search_tokens_from_index(address_dict->trie, str, tokens, prefix.node_id); return trie_search_tokens_from_index(address_dict->trie, str, tokens, prefix.node_id);
} }
/*
 * Look up a concatenated prefix of `str` (first `len` bytes are passed on)
 * in the address dictionary for language `lang`.
 *
 * Returns NULL_PHRASE when `str` or `lang` is NULL, or when
 * get_language_prefix(lang) yields NULL_NODE_ID. Otherwise returns whatever
 * trie_search_prefixes_from_index_get_prefix_char reports for
 * address_dict->trie, given the node id obtained for the language.
 */
phrase_t search_address_dictionaries_prefix(char *str, size_t len, char *lang) {
    if (str != NULL && lang != NULL) {
        trie_prefix_result_t lang_root = get_language_prefix(lang);

        if (lang_root.node_id != NULL_NODE_ID) {
            // Language namespace resolved: hand the search off to the trie.
            return trie_search_prefixes_from_index_get_prefix_char(address_dict->trie, str, len, lang_root.node_id);
        }

        log_debug("prefix.node_id == NULL_NODE_ID\n");
    }

    return NULL_PHRASE;
}
/*
 * Look up a concatenated suffix of `str` (`len` is passed on as its length)
 * in the address dictionary for language `lang`.
 *
 * Returns NULL_PHRASE when `str` or `lang` is NULL, or when
 * get_language_prefix(lang) yields NULL_NODE_ID. Otherwise returns whatever
 * trie_search_suffixes_from_index_get_suffix_char reports for
 * address_dict->trie, given the node id obtained for the language.
 */
phrase_t search_address_dictionaries_suffix(char *str, size_t len, char *lang) {
    if (str != NULL && lang != NULL) {
        trie_prefix_result_t lang_root = get_language_prefix(lang);

        if (lang_root.node_id != NULL_NODE_ID) {
            // Language namespace resolved: hand the search off to the trie.
            return trie_search_suffixes_from_index_get_suffix_char(address_dict->trie, str, len, lang_root.node_id);
        }

        log_debug("prefix.node_id == NULL_NODE_ID\n");
    }

    return NULL_PHRASE;
}
bool address_dictionary_init(void) { bool address_dictionary_init(void) {
if (address_dict != NULL) return false; if (address_dict != NULL) return false;
@@ -242,6 +293,10 @@ static bool address_expansion_read(FILE *f, address_expansion_t *expansion) {
return false; return false;
} }
if (!file_read_uint8(f, (uint8_t *)&expansion->separable)) {
return false;
}
return true; return true;
} }
@@ -269,6 +324,10 @@ static bool address_expansion_write(FILE *f, address_expansion_t expansion) {
return false; return false;
} }
if (!file_write_uint8(f, expansion.separable)) {
return false;
}
return true; return true;
} }

View File

@@ -29,8 +29,9 @@ typedef union expansion_value {
uint32_t value; uint32_t value;
struct { struct {
uint32_t components:16; uint32_t components:16;
uint32_t count:15; uint32_t count:14;
uint32_t canonical:1; uint32_t canonical:1;
uint32_t separable:1;
}; };
} expansion_value_t; } expansion_value_t;
@@ -40,6 +41,7 @@ typedef struct address_expansion {
uint32_t num_dictionaries; uint32_t num_dictionaries;
uint16_t dictionary_ids[MAX_DICTIONARY_TYPES]; uint16_t dictionary_ids[MAX_DICTIONARY_TYPES];
uint16_t address_components; uint16_t address_components;
bool separable;
} address_expansion_t; } address_expansion_t;
VECTOR_INIT(address_expansion_array, address_expansion_t) VECTOR_INIT(address_expansion_array, address_expansion_t)
@@ -59,11 +61,14 @@ bool address_dictionary_init(void);
phrase_array *search_address_dictionaries(char *str, char *lang); phrase_array *search_address_dictionaries(char *str, char *lang);
phrase_array *search_address_dictionaries_tokens(char *str, token_array *tokens, char *lang); phrase_array *search_address_dictionaries_tokens(char *str, token_array *tokens, char *lang);
phrase_t search_address_dictionaries_prefix(char *str, size_t len, char *lang);
phrase_t search_address_dictionaries_suffix(char *str, size_t len, char *lang);
address_expansion_array *address_dictionary_get_expansions(char *key); address_expansion_array *address_dictionary_get_expansions(char *key);
char *address_dictionary_get_canonical(uint32_t index); char *address_dictionary_get_canonical(uint32_t index);
int32_t address_dictionary_next_canonical_index(void); int32_t address_dictionary_next_canonical_index(void);
bool address_dictionary_add_canonical(char *canonical); bool address_dictionary_add_canonical(char *canonical);
bool address_dictionary_add_expansion(char *key, address_expansion_t expansion); bool address_dictionary_add_expansion(char *key, char *language, address_expansion_t expansion);
void address_dictionary_destroy(address_dictionary_t *self); void address_dictionary_destroy(address_dictionary_t *self);

View File

@@ -51,6 +51,7 @@ int main(int argc, char **argv) {
uint16_t address_components = 0; uint16_t address_components = 0;
address_expansion_t expansion; address_expansion_t expansion;
expansion.separable = 0;
strcpy(expansion.language, language); strcpy(expansion.language, language);
expansion.num_dictionaries = expansion_rule.num_dictionaries; expansion.num_dictionaries = expansion_rule.num_dictionaries;
@@ -60,6 +61,12 @@ int main(int argc, char **argv) {
expansion.dictionary_ids[d] = dictionary_id; expansion.dictionary_ids[d] = dictionary_id;
if (dictionary_id == DICTIONARY_CONCATENATED_PREFIX_SEPARABLE ||
dictionary_id == DICTIONARY_CONCATENATED_SUFFIX_SEPARABLE ||
dictionary_id == DICTIONARY_ELISION) {
expansion.separable = 1;
}
k = kh_get(int_int, dictionary_components, (uint32_t)dictionary_id); k = kh_get(int_int, dictionary_components, (uint32_t)dictionary_id);
if (k == kh_end(dictionary_components)) { if (k == kh_end(dictionary_components)) {
log_error("Invalid dictionary_type: %d\n", dictionary_id); log_error("Invalid dictionary_type: %d\n", dictionary_id);
@@ -93,20 +100,14 @@ int main(int argc, char **argv) {
// Add the phrase itself to the base namespace for existence checks // Add the phrase itself to the base namespace for existence checks
if (!address_dictionary_add_expansion(expansion_rule.phrase, expansion)) { if (!address_dictionary_add_expansion(expansion_rule.phrase, NULL, expansion)) {
log_error("Could not add expansion {%s}\n", expansion_rule.phrase); log_error("Could not add expansion {%s}\n", expansion_rule.phrase);
exit(EXIT_FAILURE); exit(EXIT_FAILURE);
} }
// Add phrase namespaced by language for language-specific matching // Add phrase namespaced by language for language-specific matching
char_array_clear(key); if (!address_dictionary_add_expansion(expansion_rule.phrase, language, expansion)) {
char_array_cat(key, language);
char_array_cat(key, NAMESPACE_SEPARATOR_CHAR);
char_array_cat(key, expansion_rule.phrase);
char *token = char_array_get_string(key);
if (!address_dictionary_add_expansion(token, expansion)) {
log_error("Could not add language expansion {%s, %s}\n", language, expansion_rule.phrase); log_error("Could not add language expansion {%s, %s}\n", language, expansion_rule.phrase);
exit(EXIT_FAILURE); exit(EXIT_FAILURE);
} }