diff --git a/src/geodb_builder.c b/src/geodb_builder.c new file mode 100644 index 00000000..f6b098e9 --- /dev/null +++ b/src/geodb_builder.c @@ -0,0 +1,732 @@ +#include +#include +#include +#include + +#include "log/log.h" +#include "sparkey/sparkey.h" + +#include "collections.h" +#include "config.h" +#include "file_utils.h" +#include "gazetteers.h" +#include "geonames.h" +#include "geodb.h" +#include "geo_disambiguation.h" +#include "normalize.h" +#include "string_utils.h" + +// These files are generated by create_geonames_tsv.py +#include "geonames_fields.h" +#include "postal_fields.h" + +#define DEFAULT_GEONAMES_TSV LIBPOSTAL_GEONAMES_DIR PATH_SEPARATOR "geonames.tsv"; + +static bool read_geoname_from_line(geoname_t *g, char *line) { + int token_count; + + char *token; + + geoname_clear(g); + + cstring_array *tokens = cstring_array_split(line, TAB_SEPARATOR, TAB_SEPARATOR_LEN, &token_count); + if (tokens == NULL) return false; + + if (token_count != NUM_GEONAMES_FIELDS) { + log_error("Number of fields (%d) != expected (%d)\n", token_count, NUM_GEONAMES_FIELDS); + goto exit_geoname_free_tokens; + } + + token = cstring_array_get_string(tokens, GEONAMES_ID); + if (strlen(token) == 0) { + log_error("geonames_id is required\n"); + goto exit_geoname_free_tokens; + } + + sscanf(token, "%d", &g->geonames_id); + + token = cstring_array_get_string(tokens, GEONAMES_CANONICAL); + char_array_cat(g->canonical, token); + + token = cstring_array_get_string(tokens, GEONAMES_BOUNDARY_TYPE); + + sscanf(token, "%d", &g->type); + token = cstring_array_get_string(tokens, GEONAMES_NAME); + char_array_cat(g->name, token); + + token = cstring_array_get_string(tokens, GEONAMES_ISO_LANGUAGE); + char_array_cat(g->iso_language, token); + + token = cstring_array_get_string(tokens, GEONAMES_HAS_WIKIPEDIA_ENTRY); + if (strlen(token) > 0) { + int has_wikipedia_entry; + sscanf(token, "%d", &has_wikipedia_entry); + g->has_wikipedia_entry = has_wikipedia_entry; + } else { + g->has_wikipedia_entry = false; + } + + token = cstring_array_get_string(tokens, GEONAMES_IS_PREFERRED_NAME); + if (strlen(token) > 0) { + int is_preferred_name; + sscanf(token, "%d", &is_preferred_name); + g->is_preferred_name = is_preferred_name; + } else { + g->is_preferred_name = false; + } + + token = cstring_array_get_string(tokens, GEONAMES_IS_SHORT_NAME); + if (strlen(token) > 0) { + int is_short_name; + sscanf(token, "%d", &is_short_name); + g->is_short_name = is_short_name; + } else { + g->is_short_name = false; + } + + token = cstring_array_get_string(tokens, GEONAMES_IS_COLLOQUIAL); + if (strlen(token) > 0) { + int is_colloquial; + sscanf(token, "%d", &is_colloquial); + g->is_colloquial = is_colloquial; + } else { + g->is_colloquial = false; + } + + token = cstring_array_get_string(tokens, GEONAMES_IS_HISTORICAL); + if (strlen(token) > 0) { + int is_historical; + sscanf(token, "%d", &is_historical); + g->is_historical = is_historical; + } else { + g->is_historical = false; + } + + token = cstring_array_get_string(tokens, GEONAMES_POPULATION); + if (strlen(token) > 0) { + sscanf(token, "%d", &g->population); + } else { + g->population = 0; + } + + token = cstring_array_get_string(tokens, GEONAMES_LATITUDE); + if (strlen(token) > 0) { + sscanf(token, "%lf", &g->latitude); + } else { + g->longitude = 0.0; + } + + token = cstring_array_get_string(tokens, GEONAMES_LONGITUDE); + if (strlen(token) > 0) { + sscanf(token, "%lf", &g->longitude); + } else { + g->longitude = 0.0; + } + + token = cstring_array_get_string(tokens, GEONAMES_FEATURE_CODE); + char_array_cat(g->feature_code, token); + token = cstring_array_get_string(tokens, GEONAMES_COUNTRY_CODE); + char_array_cat(g->country_code, token); + + token = cstring_array_get_string(tokens, GEONAMES_COUNTRY_ID); + if (strlen(token) > 0) { + sscanf(token, "%d", &g->country_geonames_id); + } else { + g->country_geonames_id = 0; + } + + token = cstring_array_get_string(tokens, GEONAMES_ADMIN1_CODE); + char_array_cat(g->admin1_code, token); + + token = cstring_array_get_string(tokens, GEONAMES_ADMIN1_ID); + if (strlen(token) > 0) { + sscanf(token, "%d", &g->admin1_geonames_id); + } else { + g->admin1_geonames_id = 0; + } + + token = cstring_array_get_string(tokens, GEONAMES_ADMIN2_CODE); + char_array_cat(g->admin2_code, token); + token = cstring_array_get_string(tokens, GEONAMES_ADMIN2_ID); + if (strlen(token) > 0) { + sscanf(token, "%d", &g->admin2_geonames_id); + } else { + g->admin2_geonames_id = 0; + } + + token = cstring_array_get_string(tokens, GEONAMES_ADMIN3_CODE); + + char_array_cat(g->admin3_code, token); + + cstring_array_get_string(tokens, GEONAMES_ADMIN3_ID); + if (strlen(token) > 0) { + sscanf(token, "%d", &g->admin3_geonames_id); + } else { + g->admin3_geonames_id = 0; + } + + token = cstring_array_get_string(tokens, GEONAMES_ADMIN4_CODE); + char_array_cat(g->admin4_code, token); + + token = cstring_array_get_string(tokens, GEONAMES_ADMIN4_ID); + if (strlen(token)) { + sscanf(token, "%d", &g->admin4_geonames_id); + } else { + g->admin4_geonames_id = 0; + } + + cstring_array_destroy(tokens); + return true; + +exit_geoname_free_tokens: + cstring_array_destroy(tokens); + return false; +} + +static bool read_gn_postal_code_from_line(gn_postal_code_t *postal, char *line) { + int token_count, i; + + gn_postal_code_clear(postal); + + char *token; + + cstring_array *tokens = cstring_array_split(line, TAB_SEPARATOR, TAB_SEPARATOR_LEN, &token_count); + + if (tokens == NULL) return false; + + if (token_count != NUM_POSTAL_FIELDS) { + log_error("Number of fields (%d) != expected (%d)\n", token_count, NUM_POSTAL_FIELDS); + goto exit_postal_tokens_created; + } + + token = cstring_array_get_string(tokens, GN_POSTAL_CODE); + if (strlen(token) == 0) { + log_error("postal_code field required\n"); + goto exit_postal_tokens_created; + } + + token = cstring_array_get_string(tokens, GN_POSTAL_CODE); + char_array_cat(postal->postal_code, token); + token = cstring_array_get_string(tokens, GN_POSTAL_COUNTRY_CODE); + + token = cstring_array_get_string(tokens, GN_POSTAL_COUNTRY_GEONAMES_ID); + if (strlen(token) > 0) { + sscanf(token, "%d", &postal->country_geonames_id); + } else { + postal->country_geonames_id = 0; + } + + char_array_cat(postal->country_code, token); + token = cstring_array_get_string(tokens, GN_POSTAL_CONTAINING_GEONAME_ID); + char_array_cat(postal->containing_geoname, token); + + char *admin1_field = cstring_array_get_string(tokens, GN_POSTAL_ADMIN1_IDS); + size_t admin1_field_len = strlen(admin1_field); + + if (admin1_field_len > 0) { + int admin1_token_count; + cstring_array *admin1_tokens = cstring_array_split(admin1_field, COMMA_SEPARATOR, COMMA_SEPARATOR_LEN, &admin1_token_count); + uint32_t admin1_id; + if (admin1_token_count > 0) { + for (i = 0; i < admin1_token_count; i++) { + char *admin1_token = cstring_array_get_string(tokens, i); + if (strlen(admin1_token) > 0) { + sscanf(admin1_token, "%u", &admin1_id); + uint32_array_push(postal->admin1_ids, admin1_id); + } + } + } + cstring_array_destroy(admin1_tokens); + } + + char *admin2_field = cstring_array_get_string(tokens, GN_POSTAL_ADMIN2_IDS); + size_t admin2_field_len = strlen(admin2_field); + + if (admin2_field_len > 0) { + int admin2_token_count; + cstring_array *admin2_tokens = cstring_array_split(admin2_field, COMMA_SEPARATOR, COMMA_SEPARATOR_LEN, &admin2_token_count); + uint32_t admin2_id; + if (admin2_token_count > 0) { + for (i = 0; i < admin2_token_count; i++) { + char *admin2_token = cstring_array_get_string(admin2_tokens, i); + if (strlen(admin2_token) > 0) { + sscanf(admin2_token, "%u", &admin2_id); + uint32_array_push(postal->admin2_ids, admin2_id); + } + } + } + cstring_array_destroy(admin2_tokens); + } + + char *admin3_field = cstring_array_get_string(tokens, GN_POSTAL_ADMIN3_IDS); + size_t admin3_field_len = strlen(admin3_field); + + if (admin3_field_len > 0) { + int admin3_token_count; + cstring_array *admin3_tokens = cstring_array_split(admin3_field, COMMA_SEPARATOR, COMMA_SEPARATOR_LEN, &admin3_token_count); + uint32_t admin3_id; + if (admin3_token_count > 0) { + for (i = 0; i < admin3_token_count; i++) { + char *admin3_token = cstring_array_get_string(admin3_tokens, i); + if (strlen(admin3_token) > 0) { + sscanf(admin3_token, "%u", &admin3_id); + uint32_array_push(postal->admin3_ids, admin3_id); + } + } + } + cstring_array_destroy(admin3_tokens); + } + + cstring_array_destroy(tokens); + return true; + +exit_postal_tokens_created: + cstring_array_destroy(tokens); + return false; +} + + +typedef struct geodb_builder { + trie_t *trie; + sparkey_logwriter *log_writer; + bloom_filter_t *bloom_filter; +} geodb_builder_t; + +void geodb_builder_destroy(geodb_builder_t *self) { + if (self == NULL) return; + + if (self->trie != NULL) { + trie_destroy(self->trie); + } + + if (self->bloom_filter != NULL) { + bloom_filter_destroy(self->bloom_filter); + } + + if (self->log_writer != NULL) { + sparkey_logwriter_close(&self->log_writer); + } + + free(self); + +} + +geodb_builder_t *geodb_builder_new(char *log_filename) { + geodb_builder_t *builder = malloc(sizeof(geodb_builder_t)); + + if (builder == NULL) return NULL; + + builder->trie = trie_new(); + + if (builder->trie == NULL) { + goto exit_destroy_builder; + } + + builder->bloom_filter = bloom_filter_new(GEODB_BLOOM_FILTER_SIZE, GEODB_BLOOM_FILTER_ERROR); + if (builder->bloom_filter == NULL) { + goto exit_destroy_builder; + } + + sparkey_returncode ret_code = sparkey_logwriter_create(&builder->log_writer, log_filename, SPARKEY_COMPRESSION_NONE, 0); + if (ret_code != SPARKEY_SUCCESS) { + goto exit_destroy_builder; + } + + return builder; + +exit_destroy_builder: + geodb_builder_destroy(builder); + return NULL; +} + +uint16_t get_address_component(uint32_t boundary_type) { + if (boundary_type == GEONAMES_LOCALITY) { + return ADDRESS_LOCALITY; + } else if (boundary_type == GEONAMES_NEIGHBORHOOD) { + return ADDRESS_NEIGHBORHOOD; + } else if (boundary_type == GEONAMES_ADMIN1) { + return ADDRESS_ADMIN1; + } else if (boundary_type == GEONAMES_ADMIN2) { + return ADDRESS_ADMIN2; + } else if (boundary_type == GEONAMES_ADMIN3) { + return ADDRESS_ADMIN3; + } else if (boundary_type == GEONAMES_ADMIN4) { + return ADDRESS_ADMIN4; + } else if (boundary_type == GEONAMES_ADMIN_OTHER) { + return ADDRESS_ADMIN_OTHER; + } else { + return 0; + } +} + +bool geodb_builder_add_to_trie(geodb_builder_t *self, char *key, uint16_t address_component) { + if (self == NULL || self->trie == NULL) return false; + uint32_t node_id = trie_get(self->trie, key); + + geodb_value_t value; + value.value = 0; + + if (node_id == NULL_NODE_ID) { + value.components |= address_component; + value.count = 1; + trie_add(self->trie, key, value.value); + + } else { + trie_node_t node = trie_get_node(self->trie, node_id); + trie_data_node_t data_node = trie_get_data_node(self->trie, node); + value.value = data_node.data; + value.components |= address_component; + value.count++; + + data_node.data = value.value; + trie_set_data_node(self->trie, -1 * node.base, data_node); + } + +} + +void join_path(char_array *path, char *dir, char *filename) { + char_array_clear(path); + bool strip_separator = strncmp(dir + strlen(dir) - 1, PATH_SEPARATOR, PATH_SEPARATOR_LEN) == 0; + + char_array_cat(path, dir); + if (!strip_separator) { + char_array_cat(path, PATH_SEPARATOR); + } + char_array_cat(path, filename); + +} + + +bool geodb_finalize(geodb_builder_t *self, char *output_dir) { + bool strip_output_separator = strncmp(output_dir + strlen(output_dir) - 1, PATH_SEPARATOR, PATH_SEPARATOR_LEN) == 0; + char_array *path = char_array_new_size(strlen(output_dir)); + + join_path(path, output_dir, GEODB_TRIE_FILENAME); + char *trie_path = char_array_get_string(path); + + trie_save(self->trie, trie_path); + + char *trie_filename = char_array_get_string(path); + + join_path(path, output_dir, GEODB_HASH_FILENAME); + char *hash_filename = strdup(char_array_get_string(path)); + + join_path(path, output_dir, GEODB_LOG_FILENAME); + char *log_filename = char_array_get_string(path); + + if (self->log_writer != NULL) { + sparkey_logwriter_close(&self->log_writer); + self->log_writer = NULL; + } + + if ((sparkey_hash_write(hash_filename, log_filename, 0)) != SPARKEY_SUCCESS) { + log_error("Could not write Sparkey hash file\n"); + free(hash_filename); + return false; + } + + free(hash_filename); + + join_path(path, output_dir, GEODB_BLOOM_FILTER_FILENAME); + char *bloom_filter_path = char_array_get_string(path); + if (!bloom_filter_save(self->bloom_filter, bloom_filter_path)) { + log_error("Could not save bloom filter\n"); + return false; + } + + return true; + +} + +bool name_is_iso_code(char *name) { + size_t len = strlen(name); + return (len == 2 || len == 3) && string_is_upper(name); +} + +void import_geonames(geodb_builder_t *self, char *filename) { + FILE *f = fopen(filename, "r"); + if (f == NULL) { + printf("Couldn't open file\n"); + exit(1); + } + + char *line; + char *prev_name = NULL; + geoname_t *g = geoname_new(); + + char_array *serialized = char_array_new(); + + // Just a set of all ids in GeoNames so we only add keys once, takes up < 50MB + khash_t(int_set) *all_ids = kh_init(int_set); + + khash_t(int_set) *distinct_ids = kh_init(int_set); + khash_t(str_set) *distinct_features = kh_init(str_set); + + khiter_t key; + int ret; + + cstring_array *geo_features = cstring_array_new(); + + char id_string[INT32_MAX_STRING_SIZE]; + + int normalize_utf8_options = NORMALIZE_STRING_DECOMPOSE | NORMALIZE_STRING_LOWERCASE; + int normalize_latin_options = normalize_utf8_options | NORMALIZE_STRING_LATIN_ASCII; + + int i = 0; + + while ((line = file_getline(f)) != NULL) { + read_geoname_from_line(g, line); + char *name = char_array_get_string(g->name); + + char *utf8_normalized = NULL; + char *normalized = NULL; + + size_t id_len = sprintf(id_string, "%d", g->geonames_id); + + if (g->type == GEONAMES_COUNTRY && name_is_iso_code(name)) { + utf8_normalized = strdup(name); + } else if (name != NULL) { + utf8_normalized = normalize_string_utf8(name, normalize_utf8_options); + } + + if (utf8_normalized != NULL && (prev_name == NULL || strcmp(utf8_normalized, prev_name) != 0)) { + // New name + + geodb_builder_add_to_trie(self, utf8_normalized, get_address_component(g->type)); + + kh_clear(int_set, distinct_ids); + kh_clear(str_set, distinct_features); + + } else if (utf8_normalized != NULL) { + key = kh_get(int_set, distinct_ids, g->geonames_id); + if (key == kh_end(distinct_ids)) { + geodb_builder_add_to_trie(self, utf8_normalized, get_address_component(g->type)); + } + } else { + log_error("normalization failed for name %s\n", name); + exit(EXIT_FAILURE); + } + + char_array_clear(serialized); + + if (!geoname_serialize(g, serialized)) { + log_error("geoname_serialize failed for id=%d\n", g->geonames_id); + exit(EXIT_FAILURE); + } + + key = kh_get(int_set, all_ids, g->geonames_id); + if (key == kh_end(all_ids)) { + + if ((sparkey_logwriter_put(self->log_writer, strlen(id_string), (uint8_t *)id_string, serialized->n, (uint8_t *)char_array_get_string(serialized))) != SPARKEY_SUCCESS) { + log_error("Error writing to Sparkey with id=%d\n", g->geonames_id); + exit(EXIT_FAILURE); + } + + key = kh_put(int_set, all_ids, g->geonames_id, &ret); + } + + key = kh_put(int_set, distinct_ids, g->geonames_id, &ret); + + char_array_clear(g->name); + char_array_cat(g->name, utf8_normalized); + + cstring_array_clear(geo_features); + + if (!geodisambig_add_geoname_features(geo_features, g)) { + log_error("Could not add geonames features for id=%d\n", g->geonames_id); + exit(EXIT_FAILURE); + } + + for (int i = 0; i < cstring_array_num_strings(geo_features); i++) { + char *token = cstring_array_get_string(geo_features, i); + key = kh_get(str_set, distinct_features, token); + if (key == kh_end(distinct_features)) { + // Not in set, this GeoName takes priority + if (sparkey_logwriter_put(self->log_writer, strlen(token), (uint8_t *)token, strlen(id_string), (uint8_t *)id_string) != SPARKEY_SUCCESS) { + log_error("Error writing key %s to Sparkey\n", token); + } + + int in_bloom_filter = bloom_filter_add(self->bloom_filter, token, strlen(token)); + + key = kh_put(str_set, distinct_features, token, &ret); + } + } + + if (prev_name != NULL) { + free(prev_name); + } + + if (utf8_normalized != NULL) { + prev_name = utf8_normalized; + } + + free(line); + i++; + + if (i % 1000 == 0) { + log_info("Did %d geonames\n", i); + } + } + + kh_destroy(int_set, all_ids); + kh_destroy(int_set, distinct_ids); + kh_destroy(str_set, distinct_features); + + char_array_destroy(serialized); + + cstring_array_destroy(geo_features); + + geoname_destroy(g); + fclose(f); +} + +void import_geonames_postal_codes(geodb_builder_t *self, char *filename) { + FILE *f = fopen(filename, "r"); + if (f == NULL) { + printf("Couldn't open file\n"); + exit(1); + } + + char *line; + + char *prev_code = NULL; + gn_postal_code_t *pc = gn_postal_code_new(); + + char_array *serialized = char_array_new(); + + cstring_array *postal_code_features = cstring_array_new(); + + khash_t(str_set) *distinct_features = kh_init(str_set); + + khiter_t key; + int ret; + + int i = 0; + + while ((line = file_getline(f)) != NULL) { + if (!read_gn_postal_code_from_line(pc, line)) { + log_error("Error reading line: %s\n", line); + exit(EXIT_FAILURE); + } + + char *code = char_array_get_string(pc->postal_code); + char *utf8_normalized = normalize_string_utf8(code, NORMALIZE_STRING_LOWERCASE); + + if (utf8_normalized == NULL) { + log_error("normalization failed for postal code %s\n", code); + exit(EXIT_FAILURE); + } + + if (prev_code == NULL || strcmp(utf8_normalized, prev_code) != 0) { + kh_clear(str_set, distinct_features); + } + + geodb_builder_add_to_trie(self, utf8_normalized, ADDRESS_POSTAL_CODE); + + char_array_clear(serialized); + if (!gn_postal_code_serialize(pc, serialized)) { + log_error("gn_postal_code_serialize failed for postal code=%s\n", code); + exit(EXIT_FAILURE); + } + + cstring_array_clear(postal_code_features); + + char_array_clear(pc->postal_code); + char_array_cat(pc->postal_code, utf8_normalized); + + if (!geodisambig_add_postal_code_features(postal_code_features, pc)) { + log_error("Could not add geonames features for postal code=%s\n", code); + exit(EXIT_FAILURE); + } + + for (int i = 0; i < cstring_array_num_strings(postal_code_features); i++) { + char *token = cstring_array_get_string(postal_code_features, i); + key = kh_get(str_set, distinct_features, token); + if (key == kh_end(distinct_features)) { + // Not in set, this GeoName takes priority + if (sparkey_logwriter_put(self->log_writer, strlen(token), (uint8_t *)token, serialized->n, (uint8_t *)char_array_get_string(serialized)) != SPARKEY_SUCCESS) { + log_error("Error writing key %s to Sparkey\n", token); + } + + int in_bloom_filter = bloom_filter_add(self->bloom_filter, token, strlen(token)); + + key = kh_put(str_set, distinct_features, token, &ret); + } + } + + if (prev_code != NULL) { + free(prev_code); + } + + if (utf8_normalized != NULL) { + prev_code = utf8_normalized; + } + + free(line); + i++; + + if (i % 1000 == 0) { + log_info("Did %d postal codes\n", i); + } + } + + kh_destroy(str_set, distinct_features); + char_array_destroy(serialized); + cstring_array_destroy(postal_code_features); + + gn_postal_code_destroy(pc); + + fclose(f); +} + +int main(int argc, char **argv) { + char *input_dir; + char *output_dir; + if (argc > 2) { + input_dir = argv[1]; + output_dir = argv[2]; + } else { + input_dir = LIBPOSTAL_GEONAMES_DIR; + output_dir = LIBPOSTAL_GEODB_DIR; + } + + bool strip_input_separator = strncmp(input_dir + strlen(input_dir) - 1, PATH_SEPARATOR, PATH_SEPARATOR_LEN) == 0; + bool strip_output_separator = strncmp(output_dir + strlen(output_dir) - 1, PATH_SEPARATOR, PATH_SEPARATOR_LEN) == 0; + + char *geonames_filename = "geonames.tsv"; + + char_array *path = char_array_new_size(strlen(input_dir)); + + join_path(path, input_dir, geonames_filename); + char *geonames_path = strdup(char_array_get_string(path)); + + join_path(path, output_dir, GEODB_LOG_FILENAME); + char *log_filename = char_array_get_string(path); + + geodb_builder_t *builder = geodb_builder_new(log_filename); + + import_geonames(builder, geonames_path); + + free(geonames_path); + + printf("\n\n"); + + char *postal_codes_filename = "postal_codes.tsv"; + + join_path(path, input_dir, postal_codes_filename); + char *postal_codes_path = char_array_get_string(path); + + log_info("Doing postal_codes\n"); + + import_geonames_postal_codes(builder, postal_codes_path); + + char_array_destroy(path); + + if (!geodb_finalize(builder, output_dir)) { + exit(EXIT_FAILURE); + } + geodb_builder_destroy(builder); + + exit(EXIT_SUCCESS); + +}