From 90d4da9e72c840ca77170af1c083fc7c726abedb Mon Sep 17 00:00:00 2001 From: Al Date: Tue, 28 Jul 2015 19:08:24 -0400 Subject: [PATCH] [geodb] Adding an is_canonical bit field to geodb trie values --- src/geodb.h | 3 ++- src/geodb_builder.c | 23 ++++++++++++++++------- 2 files changed, 18 insertions(+), 8 deletions(-) diff --git a/src/geodb.h b/src/geodb.h index 740afe66..8a774cee 100644 --- a/src/geodb.h +++ b/src/geodb.h @@ -30,7 +30,8 @@ extern "C" { typedef union geodb_value { uint32_t value; struct { - uint32_t components:16; + uint32_t is_canonical:1; + uint32_t components:15; uint32_t count:16; }; } geodb_value_t; diff --git a/src/geodb_builder.c b/src/geodb_builder.c index 2d448e78..3b5aac96 100644 --- a/src/geodb_builder.c +++ b/src/geodb_builder.c @@ -360,7 +360,7 @@ uint16_t get_address_component(uint32_t boundary_type) { } } -bool geodb_builder_add_to_trie(geodb_builder_t *self, char *key, uint16_t address_components) { +bool geodb_builder_add_to_trie(geodb_builder_t *self, char *key, bool is_canonical, uint16_t address_components) { if (self == NULL || self->trie == NULL) return false; uint32_t node_id = trie_get(self->trie, key); @@ -369,6 +369,7 @@ bool geodb_builder_add_to_trie(geodb_builder_t *self, char *key, uint16_t addres if (node_id == NULL_NODE_ID) { value.components |= address_components; + value.is_canonical = is_canonical; value.count = 1; return trie_add(self->trie, key, value.value); @@ -378,6 +379,7 @@ bool geodb_builder_add_to_trie(geodb_builder_t *self, char *key, uint16_t addres } value.components |= address_components; + value.is_canonical = is_canonical; value.count++; return trie_set_data_at_index(self->trie, node_id, value.value); @@ -463,7 +465,7 @@ void import_geonames(geodb_builder_t *self, char *filename) { char id_string[INT32_MAX_STRING_SIZE]; int normalize_utf8_options = NORMALIZE_STRING_DECOMPOSE | NORMALIZE_STRING_LOWERCASE | NORMALIZE_STRING_TRIM; - int normalize_latin_options = normalize_utf8_options | NORMALIZE_STRING_LATIN_ASCII; + //int normalize_latin_options = normalize_utf8_options | NORMALIZE_STRING_LATIN_ASCII; uint32_array *ordered_ids = uint32_array_new(); char_array *ordered_ids_str = char_array_new(); @@ -479,6 +481,10 @@ void import_geonames(geodb_builder_t *self, char *filename) { read_geoname_from_line(g, line); char *name = char_array_get_string(g->name); + char *canonical = char_array_get_string(g->canonical); + + bool is_canonical = strcmp(name, canonical) == 0; + char *utf8_normalized = NULL; size_t id_len = sprintf(id_string, "%d", g->geonames_id); @@ -492,7 +498,7 @@ void import_geonames(geodb_builder_t *self, char *filename) { if (utf8_normalized != NULL && (prev_name == NULL || strcmp(utf8_normalized, prev_name) != 0)) { // New name - geodb_builder_add_to_trie(self, utf8_normalized, get_address_component(g->type)); + geodb_builder_add_to_trie(self, utf8_normalized, is_canonical, get_address_component(g->type)); cmp_write_uint_vector(&ctx, ordered_ids); @@ -509,7 +515,7 @@ void import_geonames(geodb_builder_t *self, char *filename) { } else if (utf8_normalized != NULL) { key = kh_get(int_set, distinct_ids, g->geonames_id); if (key == kh_end(distinct_ids)) { - geodb_builder_add_to_trie(self, utf8_normalized, get_address_component(g->type)); + geodb_builder_add_to_trie(self, utf8_normalized, is_canonical, get_address_component(g->type)); } } else { log_error("normalization failed for name %s\n", name); @@ -561,7 +567,7 @@ void import_geonames(geodb_builder_t *self, char *filename) { log_error("Error writing key %s to Sparkey\n", token); } - int in_bloom_filter = bloom_filter_add(self->bloom_filter, token, strlen(token)); + bloom_filter_add(self->bloom_filter, token, strlen(token)); key = kh_put(str_set, distinct_features, token, &ret); } @@ -618,6 +624,9 @@ void import_geonames_postal_codes(geodb_builder_t *self, char *filename) { int i = 0; + // Always true for postal codes + bool is_canonical = true; + while ((line = file_getline(f)) != NULL) { if (!read_gn_postal_code_from_line(pc, line)) { log_error("Error reading line: %s\n", line); @@ -636,7 +645,7 @@ void import_geonames_postal_codes(geodb_builder_t *self, char *filename) { kh_clear(str_set, distinct_features); } - geodb_builder_add_to_trie(self, utf8_normalized, ADDRESS_POSTAL_CODE); + geodb_builder_add_to_trie(self, utf8_normalized, is_canonical, ADDRESS_POSTAL_CODE); char_array_clear(serialized); if (!gn_postal_code_serialize(pc, serialized)) { @@ -663,7 +672,7 @@ void import_geonames_postal_codes(geodb_builder_t *self, char *filename) { log_error("Error writing key %s to Sparkey\n", token); } - int in_bloom_filter = bloom_filter_add(self->bloom_filter, token, strlen(token)); + bloom_filter_add(self->bloom_filter, token, strlen(token)); key = kh_put(str_set, distinct_features, token, &ret); }