From 9fe22505218a3f7cb4557c58447d21867e8e7aa0 Mon Sep 17 00:00:00 2001 From: Al Date: Fri, 9 Oct 2015 15:35:50 -0400 Subject: [PATCH] [geodb] Using a trie for geo disambiguation features rather than the sparkey hashtable, sparkey simply contains the ids or code/country pairs in the case of postal codes --- src/geodb.c | 66 ++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 50 insertions(+), 16 deletions(-) diff --git a/src/geodb.c b/src/geodb.c index 6a239bde..ad731ee8 100644 --- a/src/geodb.c +++ b/src/geodb.c @@ -9,12 +9,16 @@ geodb_t *get_geodb(void) { void geodb_destroy(geodb_t *self) { if (self == NULL) return; - if (self->trie != NULL) { - trie_destroy(self->trie); + if (self->names != NULL) { + trie_destroy(self->names); } - if (self->bloom_filter != NULL) { - bloom_filter_destroy(self->bloom_filter); + if (self->features != NULL) { + trie_destroy(self->features); + } + + if (self->postal_codes != NULL) { + cstring_array_destroy(self->postal_codes); } if (self->hash_reader != NULL) { @@ -48,25 +52,55 @@ geodb_t *geodb_init(char *dir) { if (gdb == NULL) return NULL; char_array *path = char_array_new_size(strlen(dir)); - char_array_cat(path, dir); - char_array_cat(path, PATH_SEPARATOR); - char_array_cat(path, GEODB_TRIE_FILENAME); + char_array_cat_joined(path, PATH_SEPARATOR, true, 2, dir, GEODB_NAMES_TRIE_FILENAME); - char *trie_path = char_array_get_string(path); + char *names_path = char_array_get_string(path); - gdb->trie = trie_load(trie_path); - if (gdb->trie == NULL) { + gdb->names = trie_load(names_path); + if (gdb->names == NULL) { goto exit_geodb_created; } char_array_clear(path); - char_array_cat_joined(path, PATH_SEPARATOR, true, 2, dir, GEODB_BLOOM_FILTER_FILENAME); + char_array_cat_joined(path, PATH_SEPARATOR, true, 2, dir, GEODB_FEATURES_TRIE_FILENAME); - char *bloom_path = char_array_get_string(path); + char *features_path = char_array_get_string(path); - gdb->bloom_filter = bloom_filter_load(bloom_path); - if(gdb->bloom_filter == NULL) { + gdb->features = trie_load(features_path); + if(gdb->features == NULL) { + goto exit_geodb_created; + } + + char_array_clear(path); + + char_array_cat_joined(path, PATH_SEPARATOR, true, 2, dir, GEODB_POSTAL_CODES_FILENAME); + char *postal_codes_path = char_array_get_string(path); + + FILE *f = fopen(postal_codes_path, "rb"); + + uint64_t num_postal_strings = 0; + if (!file_read_uint64(f, (uint64_t *)&num_postal_strings)) { + goto exit_geodb_created; + } + + size_t postal_codes_str_len; + + if (!file_read_uint64(f, (uint64_t *)&postal_codes_str_len)) { + goto exit_geodb_created; + } + + char_array *array = char_array_new_size(postal_codes_str_len); + + if (!file_read_chars(f, array->a, postal_codes_str_len)) { + goto exit_geodb_created; + } + + array->n = postal_codes_str_len; + + gdb->postal_codes = cstring_array_from_char_array(array); + + if (cstring_array_num_strings(gdb->postal_codes) != num_postal_strings) { goto exit_geodb_created; } @@ -130,7 +164,7 @@ bool geodb_load(char *dir) { bool search_geodb_with_phrases(char *str, phrase_array **phrases) { if (str == NULL) return false; - return trie_search_with_phrases(geodb->trie, str, phrases); + return trie_search_with_phrases(geodb->names, str, phrases); } phrase_array *search_geodb(char *str) { @@ -147,7 +181,7 @@ phrase_array *search_geodb(char *str) { bool search_geodb_tokens_with_phrases(char *str, token_array *tokens, phrase_array **phrases) { if (str == NULL) return false; - return trie_search_tokens_with_phrases(geodb->trie, str, tokens, phrases); + return trie_search_tokens_with_phrases(geodb->names, str, tokens, phrases); }