From eea11beb6acf782fb8ab6ee80263f5f22d6dca1e Mon Sep 17 00:00:00 2001 From: Al Date: Sun, 27 Nov 2016 00:56:20 -0800 Subject: [PATCH] [expansion] using easier-to-access data structure for address dictionaries --- src/address_dictionary.c | 275 +++++++++++++++---------------- src/address_dictionary.h | 30 ++-- src/address_dictionary_builder.c | 6 +- src/libpostal.c | 39 ++--- 4 files changed, 160 insertions(+), 190 deletions(-) diff --git a/src/address_dictionary.c b/src/address_dictionary.c index b0d5e345..d0de7bba 100644 --- a/src/address_dictionary.c +++ b/src/address_dictionary.c @@ -13,13 +13,14 @@ address_dictionary_t *get_address_dictionary(void) { return address_dict; } -address_expansion_array *address_dictionary_get_expansions(char *key) { - if (address_dict == NULL || address_dict->expansions == NULL) { +address_expansion_value_t *address_dictionary_get_expansions(uint32_t i) { + if (address_dict == NULL || address_dict->values == NULL || i > address_dict->values->n) { log_error(ADDRESS_DICTIONARY_SETUP_ERROR); return NULL; } - khiter_t k = kh_get(str_expansions, address_dict->expansions, key); - return k != kh_end(address_dict->expansions) ? kh_value(address_dict->expansions, k) : NULL; + + return address_dict->values->a[i]; + } inline bool address_expansion_in_dictionary(address_expansion_t expansion, uint16_t dictionary_id) { @@ -61,22 +62,51 @@ char *address_dictionary_get_canonical(uint32_t index) { return cstring_array_get_string(address_dict->canonical, index); } +address_expansion_value_t *address_expansion_value_new(void) { + address_expansion_value_t *self = malloc(sizeof(address_expansion_value_t)); + + if (self == NULL) return NULL; + + address_expansion_array *expansions = address_expansion_array_new(); + if (expansions == NULL) { + free(self); + return NULL; + } + + self->components = 0; + self->expansions = expansions; + + return self; +} + +address_expansion_value_t *address_expansion_value_new_with_expansion(address_expansion_t expansion) { + address_expansion_value_t *self = address_expansion_value_new(); + if (self == NULL) return NULL; + + address_expansion_array_push(self->expansions, expansion); + self->components = expansion.address_components; + + return self; +} + +void address_expansion_value_destroy(address_expansion_value_t *self) { + if (self == NULL) return; + if (self->expansions != NULL) { + address_expansion_array_destroy(self->expansions); + } + + free(self); +} + bool address_dictionary_add_expansion(char *name, char *language, address_expansion_t expansion) { - if (address_dict == NULL || address_dict->expansions == NULL) { + if (address_dict == NULL || address_dict->values == NULL) { log_error(ADDRESS_DICTIONARY_SETUP_ERROR); return false; } if (name == NULL) return false; - int ret; - char *key; - bool free_key = false; - - expansion_value_t value; - value.value = 0; - value.canonical = expansion.canonical_index == -1; bool is_prefix = false; bool is_suffix = false; @@ -95,18 +125,16 @@ bool address_dictionary_add_expansion(char *name, char *language, address_expans } } - value.separable = expansion.separable; - char_array *array = char_array_new_size(strlen(name)); + if (array == NULL) { + return false; + } if (language != NULL) { char_array_cat(array, language); char_array_cat(array, NAMESPACE_SEPARATOR_CHAR); } - size_t namespace_len = array->n; - char *trie_key = NULL; - if (!is_suffix && !is_prefix) { char_array_cat(array, name); } else if (is_prefix) { @@ -120,48 +148,23 @@ bool address_dictionary_add_expansion(char *name, char *language, address_expans key = char_array_to_string(array); log_debug("key=%s\n", key); - address_expansion_array *expansions = address_dictionary_get_expansions(key); - if (expansions == NULL) { - expansions = address_expansion_array_new_size(1); - address_expansion_array_push(expansions, expansion); - khiter_t k = kh_put(str_expansions, address_dict->expansions, strdup(key), &ret); - if (ret < 0) goto exit_key_created; - kh_value(address_dict->expansions, k) = expansions; + uint32_t expansion_index; + address_expansion_value_t *value; - value.count = 1; - value.components = expansion.address_components; - log_debug("value.count=%d, value.components=%d\n", value.count, value.components); + if (trie_get_data(address_dict->trie, key, &expansion_index)) { + value = address_dict->values->a[expansion_index]; + value->components |= expansion.address_components; + address_expansion_array_push(value->expansions, expansion); + } else { + value = address_expansion_value_new_with_expansion(expansion); + expansion_index = (uint32_t)address_dict->values->n; + address_expansion_value_array_push(address_dict->values, value); - if (!trie_add(address_dict->trie, key, value.value)) { + if (!trie_add(address_dict->trie, key, expansion_index)) { log_warn("Key %s could not be added to trie\n", key); goto exit_key_created;; } - } else { - uint32_t node_id = trie_get(address_dict->trie, key); - log_debug("node_id=%d\n", node_id); - if (node_id != NULL_NODE_ID) { - if (!trie_get_data_at_index(address_dict->trie, node_id, &value.value)) { - log_warn("get_data_at_index returned false\n"); - goto exit_key_created; - } - - log_debug("value.count=%d, value.components=%d\n", value.count, value.components); - - if (value.count <= 0) { - log_warn("value.count=%d\n", value.count); - } - - value.count++; - value.components |= expansion.address_components; - - if (!trie_set_data_at_index(address_dict->trie, node_id, value.value)) { - log_warn("set_data_at_index returned false for node_id=%d and value=%d\n", node_id, value.value); - goto exit_key_created; - } - } - - address_expansion_array_push(expansions, expansion); } free(key); @@ -293,8 +296,8 @@ bool address_dictionary_init(void) { goto exit_destroy_address_dict; } - address_dict->expansions = kh_init(str_expansions); - if (address_dict->expansions == NULL) { + address_dict->values = address_expansion_value_array_new(); + if (address_dict->values == NULL) { goto exit_destroy_address_dict; } @@ -318,18 +321,10 @@ void address_dictionary_destroy(address_dictionary_t *self) { cstring_array_destroy(self->canonical); } - if (self->expansions != NULL) { - const char *key; - address_expansion_array *expansions; - kh_foreach(self->expansions, key, expansions, { - free((char *)key); - address_expansion_array_destroy(expansions); - }) + if (self->values != NULL) { + address_expansion_value_array_destroy(self->values); } - kh_destroy(str_expansions, self->expansions); - - if (self->trie != NULL) { trie_destroy(self->trie); } @@ -365,7 +360,7 @@ static bool address_expansion_read(FILE *f, address_expansion_t *expansion) { } } - if (!file_read_uint16(f, &expansion->address_components)) { + if (!file_read_uint32(f, &expansion->address_components)) { return false; } @@ -376,8 +371,39 @@ static bool address_expansion_read(FILE *f, address_expansion_t *expansion) { return true; } +static address_expansion_value_t *address_expansion_value_read(FILE *f) { + if (f == NULL) return NULL; -static bool address_expansion_write(FILE *f, address_expansion_t expansion) { + address_expansion_value_t *value = address_expansion_value_new(); + + if (!file_read_uint32(f, &value->components)) { + goto exit_expansion_value_created; + } + + uint32_t num_expansions; + + if (!file_read_uint32(f, &num_expansions)) { + goto exit_expansion_value_created; + } + + address_expansion_t expansion; + + for (size_t i = 0; i < num_expansions; i++) { + if (!address_expansion_read(f, &expansion)) { + goto exit_expansion_value_created; + } + address_expansion_array_push(value->expansions, expansion); + } + + return value; + +exit_expansion_value_created: + address_expansion_value_destroy(value); + return NULL; +} + + +static bool address_expansion_write(address_expansion_t expansion, FILE *f) { if (f == NULL) return false; uint32_t language_len = (uint32_t)strlen(expansion.language) + 1; @@ -396,7 +422,7 @@ static bool address_expansion_write(FILE *f, address_expansion_t expansion) { } } - if (!file_write_uint16(f, expansion.address_components)) { + if (!file_write_uint32(f, expansion.address_components)) { return false; } @@ -407,6 +433,29 @@ static bool address_expansion_write(FILE *f, address_expansion_t expansion) { return true; } +static bool address_expansion_value_write(address_expansion_value_t *value, FILE *f) { + if (value == NULL || value->expansions == NULL || f == NULL) return false; + if (!file_write_uint32(f, value->components)) { + return false; + } + + uint32_t num_expansions = value->expansions->n; + + if (!file_write_uint32(f, num_expansions)) { + return false; + } + + for (size_t i = 0; i < num_expansions; i++) { + address_expansion_t expansion = value->expansions->a[i]; + if (!address_expansion_write(expansion, f)) { + return false; + } + } + + return true; +} + + bool address_dictionary_write(FILE *f) { if (address_dict == NULL || f == NULL) return false; @@ -423,39 +472,18 @@ bool address_dictionary_write(FILE *f) { return false; } - uint32_t num_keys = (uint32_t) kh_size(address_dict->expansions); + uint32_t num_values = (uint32_t) address_dict->values->n; - if (!file_write_uint32(f, num_keys)) { + if (!file_write_uint32(f, num_values)) { return false; } - const char *key; - address_expansion_array *expansions; - - kh_foreach(address_dict->expansions, key, expansions, { - uint32_t key_len = (uint32_t) strlen(key) + 1; - if (!file_write_uint32(f, key_len)) { + for (size_t i = 0; i < num_values; i++) { + address_expansion_value_t *value = address_dict->values->a[i]; + if (!address_expansion_value_write(value, f)) { return false; } - - if (!file_write_chars(f, key, key_len)) { - return false; - } - - uint32_t num_expansions = expansions->n; - - if (!file_write_uint32(f, num_expansions)) { - return false; - } - - - for (size_t i = 0; i < num_expansions; i++) { - address_expansion_t expansion = expansions->a[i]; - if (!address_expansion_write(f, expansion)) { - return false; - } - } - }) + } if (!trie_write(address_dict->trie, f)) { return false; @@ -497,61 +525,20 @@ bool address_dictionary_read(FILE *f) { address_dict->canonical = cstring_array_from_char_array(array); - uint32_t num_keys; + uint32_t num_values; - if (!file_read_uint32(f, &num_keys)) { - return false; + if (!file_read_uint32(f, &num_values)) { + goto exit_address_dict_created; } - address_dict->expansions = kh_init(str_expansions); + address_dict->values = address_expansion_value_array_new_size(num_values); - uint32_t key_len; - uint32_t num_expansions; - char *key; - address_expansion_array *expansions; - - for (uint32_t i = 0; i < num_keys; i++) { - if (!file_read_uint32(f, &key_len)) { + for (uint32_t i = 0; i < num_values; i++) { + address_expansion_value_t *value = address_expansion_value_read(f); + if (value == NULL) { goto exit_address_dict_created; } - - key = malloc(key_len); - if (key == NULL) { - goto exit_address_dict_created; - } - - if (!file_read_chars(f, key, key_len)) { - free(key); - goto exit_address_dict_created; - } - - if (!file_read_uint32(f, &num_expansions)) { - free(key); - goto exit_address_dict_created; - } - - expansions = address_expansion_array_new_size(num_expansions); - if (expansions == NULL) { - free(key); - goto exit_address_dict_created; - } - - address_expansion_t expansion; - - for (uint32_t j = 0; j < num_expansions; j++) { - if (!address_expansion_read(f, &expansion)) { - free(key); - address_expansion_array_destroy(expansions); - goto exit_address_dict_created; - } - address_expansion_array_push(expansions, expansion); - } - - int ret; - - khiter_t k = kh_put(str_expansions, address_dict->expansions, key, &ret); - if (ret < 0) goto exit_address_dict_created; - kh_value(address_dict->expansions, k) = expansions; + address_expansion_value_array_push(address_dict->values, value); } address_dict->trie = trie_read(f); diff --git a/src/address_dictionary.h b/src/address_dictionary.h index c174b002..0cdbb0a3 100644 --- a/src/address_dictionary.h +++ b/src/address_dictionary.h @@ -25,32 +25,31 @@ #define NULL_CANONICAL_INDEX -1 -typedef union expansion_value { - uint32_t value; - struct { - uint32_t components:16; - uint32_t count:14; - uint32_t canonical:1; - uint32_t separable:1; - }; -} expansion_value_t; - typedef struct address_expansion { int32_t canonical_index; char language[MAX_LANGUAGE_LEN]; uint32_t num_dictionaries; uint16_t dictionary_ids[MAX_DICTIONARY_TYPES]; - uint16_t address_components; + uint32_t address_components; bool separable; } address_expansion_t; VECTOR_INIT(address_expansion_array, address_expansion_t) -KHASH_MAP_INIT_STR(str_expansions, address_expansion_array *) +typedef struct address_expansion_value { + uint32_t components; + address_expansion_array *expansions; +} address_expansion_value_t; + +address_expansion_value_t *address_expansion_value_new(void); +address_expansion_value_t *address_expansion_value_new_with_expansion(address_expansion_t expansion); +void address_expansion_value_destroy(address_expansion_value_t *self); + +VECTOR_INIT_FREE_DATA(address_expansion_value_array, address_expansion_value_t *, address_expansion_value_destroy) typedef struct address_dictionary { cstring_array *canonical; - khash_t(str_expansions) *expansions; + address_expansion_value_array *values; trie_t *trie; } address_dictionary_t; @@ -66,7 +65,7 @@ bool search_address_dictionaries_tokens_with_phrases(char *str, token_array *tok phrase_t search_address_dictionaries_prefix(char *str, size_t len, char *lang); phrase_t search_address_dictionaries_suffix(char *str, size_t len, char *lang); -address_expansion_array *address_dictionary_get_expansions(char *key); +address_expansion_value_t *address_dictionary_get_expansions(uint32_t i); bool address_expansion_in_dictionary(address_expansion_t expansion, uint16_t dictionary_id); char *address_dictionary_get_canonical(uint32_t index); int32_t address_dictionary_next_canonical_index(void); @@ -81,7 +80,4 @@ bool address_dictionary_save(char *path); bool address_dictionary_module_setup(char *filename); void address_dictionary_module_teardown(void); - - - #endif diff --git a/src/address_dictionary_builder.c b/src/address_dictionary_builder.c index 80c5fdf2..99fed2eb 100644 --- a/src/address_dictionary_builder.c +++ b/src/address_dictionary_builder.c @@ -58,7 +58,7 @@ int main(int argc, char **argv) { for (int j = lang_index.index; j < lang_index.index + lang_index.len; j++) { address_expansion_rule_t expansion_rule = expansion_rules[j]; - uint16_t address_components = 0; + uint32_t address_components = 0; address_expansion_t expansion; expansion.separable = 0; @@ -87,7 +87,7 @@ int main(int argc, char **argv) { log_error("Invalid dictionary_type: %d\n", dictionary_id); exit(EXIT_FAILURE); } - address_components |= (uint16_t) kh_value(dictionary_components, k); + address_components |= kh_value(dictionary_components, k); } char *canonical = NULL; @@ -104,7 +104,7 @@ int main(int argc, char **argv) { if (k != kh_end(phrase_address_components)) { uint32_t val = kh_value(phrase_address_components, k); - expansion.address_components = (uint16_t)val; + expansion.address_components = val; } else { expansion.address_components = address_components; } diff --git a/src/libpostal.c b/src/libpostal.c index aae348fe..43ee38f4 100644 --- a/src/libpostal.c +++ b/src/libpostal.c @@ -275,13 +275,13 @@ static string_tree_t *add_string_alternatives(char *str, normalize_options_t opt } } - expansion_value_t value; - value.value = phrase.data; + uint32_t expansion_index = phrase.data; + address_expansion_value_t *value = address_dictionary_get_expansions(expansion_index); token_t token; size_t added_expansions = 0; - if ((value.components & options.address_components) > 0) { + if ((value->components & options.address_components) > 0) { key->n = namespace_len; for (size_t j = phrase.start; j < phrase.start + phrase.len; j++) { token = tokens->a[j]; @@ -296,7 +296,7 @@ static string_tree_t *add_string_alternatives(char *str, normalize_options_t opt char *key_str = char_array_get_string(key); log_debug("key_str=%s\n", key_str); - address_expansion_array *expansions = address_dictionary_get_expansions(key_str); + address_expansion_array *expansions = value->expansions; if (expansions != NULL) { for (size_t j = 0; j < expansions->n; j++) { @@ -517,27 +517,14 @@ static void add_postprocessed_string(cstring_array *strings, char *str, normaliz -static address_expansion_array *get_affix_expansions(char_array *key, char *str, char *lang, token_t token, phrase_t phrase, bool reverse, normalize_options_t options) { - expansion_value_t value; - value.value = phrase.data; - address_expansion_array *expansions = NULL; - - if (value.components & options.address_components && (value.separable || !value.canonical)) { - char_array_clear(key); - char_array_cat(key, lang); - char_array_cat(key, NAMESPACE_SEPARATOR_CHAR); - if (reverse) { - char_array_cat(key, TRIE_SUFFIX_CHAR); - char_array_cat_reversed_len(key, str + token.offset + phrase.start, phrase.len); - } else { - char_array_cat(key, TRIE_PREFIX_CHAR); - char_array_cat_len(key, str + token.offset + phrase.start, phrase.len); - } - char *key_str = char_array_get_string(key); - log_debug("key_str=%s\n", key_str); - expansions = address_dictionary_get_expansions(key_str); +static address_expansion_array *get_affix_expansions(phrase_t phrase, normalize_options_t options) { + uint32_t expansion_index = phrase.data; + address_expansion_value_t *value = address_dictionary_get_expansions(expansion_index); + if (value != NULL && value->components & options.address_components) { + return value->expansions; } - return expansions; + + return NULL; } static inline void cat_affix_expansion(char_array *key, char *str, address_expansion_t expansion, token_t token, phrase_t phrase, normalize_options_t options) { @@ -586,12 +573,12 @@ static bool add_affix_expansions(string_tree_t *tree, char *str, char *lang, tok size_t prefix_start, prefix_end, root_end, suffix_start; if (have_prefix) { - prefix_expansions = get_affix_expansions(key, str, lang, token, prefix, false, options); + prefix_expansions = get_affix_expansions(prefix, options); if (prefix_expansions == NULL) have_prefix = false; } if (have_suffix) { - suffix_expansions = get_affix_expansions(key, str, lang, token, suffix, true, options); + suffix_expansions = get_affix_expansions(suffix, options); if (suffix_expansions == NULL) have_suffix = false; }