#include #include #include #include #include "log/log.h" #include "sparkey/sparkey.h" #include "collections.h" #include "libpostal_config.h" #include "file_utils.h" #include "gazetteers.h" #include "geonames.h" #include "geodb.h" #include "geo_disambiguation.h" #include "msgpack_utils.h" #include "normalize.h" #include "string_utils.h" // These files are generated by create_geonames_tsv.py #include "geonames_fields.h" #include "postal_fields.h" #define DEFAULT_GEONAMES_TSV LIBPOSTAL_GEONAMES_DIR PATH_SEPARATOR "geonames.tsv"; static bool read_geoname_from_line(geoname_t *g, char *line) { size_t token_count; char *token; geoname_clear(g); cstring_array *tokens = cstring_array_split(line, TAB_SEPARATOR, TAB_SEPARATOR_LEN, &token_count); if (tokens == NULL) return false; if (token_count != NUM_GEONAMES_FIELDS) { log_error("Number of fields (%d) != expected (%d)\n", token_count, NUM_GEONAMES_FIELDS); goto exit_geoname_free_tokens; } token = cstring_array_get_string(tokens, GEONAMES_ID); if (strlen(token) == 0) { log_error("geonames_id is required\n"); goto exit_geoname_free_tokens; } sscanf(token, "%d", &g->geonames_id); token = cstring_array_get_string(tokens, GEONAMES_CANONICAL); char_array_cat(g->canonical, token); token = cstring_array_get_string(tokens, GEONAMES_BOUNDARY_TYPE); sscanf(token, "%d", (int *)&g->type); token = cstring_array_get_string(tokens, GEONAMES_NAME); char_array_cat(g->name, token); token = cstring_array_get_string(tokens, GEONAMES_ISO_LANGUAGE); char_array_cat(g->iso_language, token); token = cstring_array_get_string(tokens, GEONAMES_HAS_WIKIPEDIA_ENTRY); if (strlen(token) > 0) { int has_wikipedia_entry; sscanf(token, "%d", &has_wikipedia_entry); g->has_wikipedia_entry = has_wikipedia_entry; } else { g->has_wikipedia_entry = false; } token = cstring_array_get_string(tokens, GEONAMES_IS_PREFERRED_NAME); if (strlen(token) > 0) { int is_preferred_name; sscanf(token, "%d", &is_preferred_name); g->is_preferred_name = is_preferred_name; } else { g->is_preferred_name = false; } token = cstring_array_get_string(tokens, GEONAMES_IS_SHORT_NAME); if (strlen(token) > 0) { int is_short_name; sscanf(token, "%d", &is_short_name); g->is_short_name = is_short_name; } else { g->is_short_name = false; } token = cstring_array_get_string(tokens, GEONAMES_IS_COLLOQUIAL); if (strlen(token) > 0) { int is_colloquial; sscanf(token, "%d", &is_colloquial); g->is_colloquial = is_colloquial; } else { g->is_colloquial = false; } token = cstring_array_get_string(tokens, GEONAMES_IS_HISTORICAL); if (strlen(token) > 0) { int is_historical; sscanf(token, "%d", &is_historical); g->is_historical = is_historical; } else { g->is_historical = false; } token = cstring_array_get_string(tokens, GEONAMES_POPULATION); if (strlen(token) > 0) { sscanf(token, "%d", &g->population); } else { g->population = 0; } token = cstring_array_get_string(tokens, GEONAMES_LATITUDE); if (strlen(token) > 0) { sscanf(token, "%lf", &g->latitude); } else { g->longitude = 0.0; } token = cstring_array_get_string(tokens, GEONAMES_LONGITUDE); if (strlen(token) > 0) { sscanf(token, "%lf", &g->longitude); } else { g->longitude = 0.0; } token = cstring_array_get_string(tokens, GEONAMES_FEATURE_CODE); char_array_cat(g->feature_code, token); token = cstring_array_get_string(tokens, GEONAMES_COUNTRY_CODE); char_array_cat(g->country_code, token); token = cstring_array_get_string(tokens, GEONAMES_COUNTRY_ID); if (strlen(token) > 0) { sscanf(token, "%d", &g->country_geonames_id); } else { g->country_geonames_id = 0; } token = cstring_array_get_string(tokens, GEONAMES_ADMIN1_CODE); char_array_cat(g->admin1_code, token); token = cstring_array_get_string(tokens, GEONAMES_ADMIN1_ID); if (strlen(token) > 0) { sscanf(token, "%d", &g->admin1_geonames_id); } else { g->admin1_geonames_id = 0; } token = cstring_array_get_string(tokens, GEONAMES_ADMIN2_CODE); char_array_cat(g->admin2_code, token); token = cstring_array_get_string(tokens, GEONAMES_ADMIN2_ID); if (strlen(token) > 0) { sscanf(token, "%d", &g->admin2_geonames_id); } else { g->admin2_geonames_id = 0; } token = cstring_array_get_string(tokens, GEONAMES_ADMIN3_CODE); char_array_cat(g->admin3_code, token); cstring_array_get_string(tokens, GEONAMES_ADMIN3_ID); if (strlen(token) > 0) { sscanf(token, "%d", &g->admin3_geonames_id); } else { g->admin3_geonames_id = 0; } token = cstring_array_get_string(tokens, GEONAMES_ADMIN4_CODE); char_array_cat(g->admin4_code, token); token = cstring_array_get_string(tokens, GEONAMES_ADMIN4_ID); if (strlen(token)) { sscanf(token, "%d", &g->admin4_geonames_id); } else { g->admin4_geonames_id = 0; } cstring_array_destroy(tokens); return true; exit_geoname_free_tokens: cstring_array_destroy(tokens); return false; } static bool read_gn_postal_code_from_line(gn_postal_code_t *postal, char *line) { size_t token_count; int i; gn_postal_code_clear(postal); char *token; cstring_array *tokens = cstring_array_split(line, TAB_SEPARATOR, TAB_SEPARATOR_LEN, &token_count); if (tokens == NULL) return false; if (token_count != NUM_POSTAL_FIELDS) { log_error("Number of fields (%d) != expected (%d)\n", token_count, NUM_POSTAL_FIELDS); goto exit_postal_tokens_created; } token = cstring_array_get_string(tokens, GN_POSTAL_CODE); if (strlen(token) == 0) { log_error("postal_code field required\n"); goto exit_postal_tokens_created; } token = cstring_array_get_string(tokens, GN_POSTAL_CODE); char_array_cat(postal->postal_code, token); token = cstring_array_get_string(tokens, GN_POSTAL_COUNTRY_CODE); token = cstring_array_get_string(tokens, GN_POSTAL_COUNTRY_GEONAMES_ID); if (strlen(token) > 0) { sscanf(token, "%d", &postal->country_geonames_id); } else { postal->country_geonames_id = 0; } char_array_cat(postal->country_code, token); token = cstring_array_get_string(tokens, GN_POSTAL_CONTAINING_GEONAME_ID); char_array_cat(postal->containing_geoname, token); char *admin1_field = cstring_array_get_string(tokens, GN_POSTAL_ADMIN1_IDS); size_t admin1_field_len = strlen(admin1_field); if (admin1_field_len > 0) { size_t admin1_token_count; cstring_array *admin1_tokens = cstring_array_split(admin1_field, COMMA_SEPARATOR, COMMA_SEPARATOR_LEN, &admin1_token_count); uint32_t admin1_id; if (admin1_token_count > 0) { for (i = 0; i < admin1_token_count; i++) { char *admin1_token = cstring_array_get_string(tokens, i); if (strlen(admin1_token) > 0) { sscanf(admin1_token, "%u", &admin1_id); uint32_array_push(postal->admin1_ids, admin1_id); } } } cstring_array_destroy(admin1_tokens); } char *admin2_field = cstring_array_get_string(tokens, GN_POSTAL_ADMIN2_IDS); size_t admin2_field_len = strlen(admin2_field); if (admin2_field_len > 0) { size_t admin2_token_count; cstring_array *admin2_tokens = cstring_array_split(admin2_field, COMMA_SEPARATOR, COMMA_SEPARATOR_LEN, &admin2_token_count); uint32_t admin2_id; if (admin2_token_count > 0) { for (i = 0; i < admin2_token_count; i++) { char *admin2_token = cstring_array_get_string(admin2_tokens, i); if (strlen(admin2_token) > 0) { sscanf(admin2_token, "%u", &admin2_id); uint32_array_push(postal->admin2_ids, admin2_id); } } } cstring_array_destroy(admin2_tokens); } char *admin3_field = cstring_array_get_string(tokens, GN_POSTAL_ADMIN3_IDS); size_t admin3_field_len = strlen(admin3_field); if (admin3_field_len > 0) { size_t admin3_token_count; cstring_array *admin3_tokens = cstring_array_split(admin3_field, COMMA_SEPARATOR, COMMA_SEPARATOR_LEN, &admin3_token_count); uint32_t admin3_id; if (admin3_token_count > 0) { for (i = 0; i < admin3_token_count; i++) { char *admin3_token = cstring_array_get_string(admin3_tokens, i); if (strlen(admin3_token) > 0) { sscanf(admin3_token, "%u", &admin3_id); uint32_array_push(postal->admin3_ids, admin3_id); } } } cstring_array_destroy(admin3_tokens); } cstring_array_destroy(tokens); return true; exit_postal_tokens_created: cstring_array_destroy(tokens); return false; } typedef struct geodb_builder { trie_t *trie; sparkey_logwriter *log_writer; bloom_filter_t *bloom_filter; } geodb_builder_t; void geodb_builder_destroy(geodb_builder_t *self) { if (self == NULL) return; if (self->trie != NULL) { trie_destroy(self->trie); } if (self->bloom_filter != NULL) { bloom_filter_destroy(self->bloom_filter); } if (self->log_writer != NULL) { sparkey_logwriter_close(&self->log_writer); } free(self); } geodb_builder_t *geodb_builder_new(char *log_filename) { geodb_builder_t *builder = malloc(sizeof(geodb_builder_t)); if (builder == NULL) return NULL; builder->trie = trie_new(); if (builder->trie == NULL) { goto exit_destroy_builder; } builder->bloom_filter = bloom_filter_new(GEODB_BLOOM_FILTER_SIZE, GEODB_BLOOM_FILTER_ERROR); if (builder->bloom_filter == NULL) { goto exit_destroy_builder; } sparkey_returncode ret_code = sparkey_logwriter_create(&builder->log_writer, log_filename, SPARKEY_COMPRESSION_NONE, 0); if (ret_code != SPARKEY_SUCCESS) { goto exit_destroy_builder; } return builder; exit_destroy_builder: geodb_builder_destroy(builder); return NULL; } uint16_t get_address_component(uint32_t boundary_type) { if (boundary_type == GEONAMES_LOCALITY) { return ADDRESS_LOCALITY; } else if (boundary_type == GEONAMES_NEIGHBORHOOD) { return ADDRESS_NEIGHBORHOOD; } else if (boundary_type == GEONAMES_ADMIN1) { return ADDRESS_ADMIN1; } else if (boundary_type == GEONAMES_ADMIN2) { return ADDRESS_ADMIN2; } else if (boundary_type == GEONAMES_ADMIN3) { return ADDRESS_ADMIN3; } else if (boundary_type == GEONAMES_ADMIN4) { return ADDRESS_ADMIN4; } else if (boundary_type == GEONAMES_ADMIN_OTHER) { return ADDRESS_ADMIN_OTHER; } else { return 0; } } bool geodb_builder_add_to_trie(geodb_builder_t *self, char *key, bool is_canonical, uint16_t address_components) { if (self == NULL || self->trie == NULL) return false; uint32_t node_id = trie_get(self->trie, key); geodb_value_t value; value.value = 0; if (node_id == NULL_NODE_ID) { value.components |= address_components; value.is_canonical = is_canonical; value.count = 1; return trie_add(self->trie, key, value.value); } else { if (!trie_get_data_at_index(self->trie, node_id, &value.value)) { return false; } value.components |= address_components; value.is_canonical = is_canonical; value.count++; return trie_set_data_at_index(self->trie, node_id, value.value); } } bool geodb_finalize(geodb_builder_t *self, char *output_dir) { char_array *path = char_array_new_size(strlen(output_dir)); char_array_add_joined(path, PATH_SEPARATOR, true, 2, output_dir, GEODB_TRIE_FILENAME); char *trie_path = char_array_get_string(path); trie_save(self->trie, trie_path); char_array_clear(path); char_array_add_joined(path, PATH_SEPARATOR, true, 2, output_dir, GEODB_HASH_FILENAME); char *hash_filename = strdup(char_array_get_string(path)); char_array_clear(path); char_array_add_joined(path, PATH_SEPARATOR, true, 2, output_dir, GEODB_LOG_FILENAME); char *log_filename = char_array_get_string(path); if (self->log_writer != NULL) { sparkey_logwriter_close(&self->log_writer); self->log_writer = NULL; } if ((sparkey_hash_write(hash_filename, log_filename, 0)) != SPARKEY_SUCCESS) { log_error("Could not write Sparkey hash file\n"); free(hash_filename); return false; } free(hash_filename); char_array_clear(path); char_array_add_joined(path, PATH_SEPARATOR, true, 2, output_dir, GEODB_BLOOM_FILTER_FILENAME); char *bloom_filter_path = char_array_get_string(path); if (!bloom_filter_save(self->bloom_filter, bloom_filter_path)) { log_error("Could not save bloom filter\n"); return false; } return true; } bool name_is_iso_code(char *name) { size_t len = strlen(name); return (len == 2 || len == 3) && string_is_upper(name); } void import_geonames(geodb_builder_t *self, char *filename) { FILE *f = fopen(filename, "r"); if (f == NULL) { printf("Couldn't open file\n"); exit(1); } char *line; char *prev_name = NULL; geoname_t *g = geoname_new(); char_array *serialized = char_array_new(); // Just a set of all ids in GeoNames so we only add keys once, takes up < 50MB khash_t(int_set) *all_ids = kh_init(int_set); khash_t(int_set) *distinct_ids = kh_init(int_set); khash_t(str_set) *distinct_features = kh_init(str_set); khiter_t key; int ret; cstring_array *geo_features = cstring_array_new(); char id_string[INT32_MAX_STRING_SIZE]; int normalize_utf8_options = NORMALIZE_STRING_DECOMPOSE | NORMALIZE_STRING_LOWERCASE | NORMALIZE_STRING_TRIM; //int normalize_latin_options = normalize_utf8_options | NORMALIZE_STRING_LATIN_ASCII; uint32_array *ordered_ids = uint32_array_new(); char_array *ordered_ids_str = char_array_new(); cmp_ctx_t ctx; msgpack_buffer_t buffer = (msgpack_buffer_t){ordered_ids_str, 0}; cmp_init(&ctx, &buffer, msgpack_bytes_reader, msgpack_bytes_writer); int i = 0; while ((line = file_getline(f)) != NULL) { read_geoname_from_line(g, line); char *name = char_array_get_string(g->name); char *canonical = char_array_get_string(g->canonical); bool is_canonical = strcmp(name, canonical) == 0; char *utf8_normalized = NULL; size_t id_len = sprintf(id_string, "%d", g->geonames_id); if (g->type == GEONAMES_COUNTRY && name_is_iso_code(name)) { utf8_normalized = strdup(name); } else if (name != NULL) { utf8_normalized = normalize_string_utf8(name, normalize_utf8_options); } if (utf8_normalized != NULL && (prev_name == NULL || strcmp(utf8_normalized, prev_name) != 0)) { // New name geodb_builder_add_to_trie(self, utf8_normalized, is_canonical, get_address_component(g->type)); cmp_write_uint_vector(&ctx, ordered_ids); if ((sparkey_logwriter_put(self->log_writer, strlen(utf8_normalized), (uint8_t *)utf8_normalized, ordered_ids_str->n - 1, (uint8_t *)char_array_get_string(ordered_ids_str))) != SPARKEY_SUCCESS) { log_error("Error writing ids string to Sparkey\n"); exit(EXIT_FAILURE); } uint32_array_clear(ordered_ids); char_array_clear(ordered_ids_str); kh_clear(int_set, distinct_ids); kh_clear(str_set, distinct_features); } else if (utf8_normalized != NULL) { key = kh_get(int_set, distinct_ids, g->geonames_id); if (key == kh_end(distinct_ids)) { geodb_builder_add_to_trie(self, utf8_normalized, is_canonical, get_address_component(g->type)); } } else { log_error("normalization failed for name %s\n", name); exit(EXIT_FAILURE); } char_array_clear(serialized); if (!geoname_serialize(g, serialized)) { log_error("geoname_serialize failed for id=%d\n", g->geonames_id); exit(EXIT_FAILURE); } key = kh_get(int_set, all_ids, g->geonames_id); if (key == kh_end(all_ids)) { if ((sparkey_logwriter_put(self->log_writer, strlen(id_string), (uint8_t *)id_string, serialized->n, (uint8_t *)char_array_get_string(serialized))) != SPARKEY_SUCCESS) { log_error("Error writing to Sparkey with id=%d\n", g->geonames_id); exit(EXIT_FAILURE); } key = kh_put(int_set, all_ids, g->geonames_id, &ret); } key = kh_get(int_set, distinct_ids, g->geonames_id); if (key == kh_end(distinct_ids)) { uint32_array_push(ordered_ids, g->geonames_id); } key = kh_put(int_set, distinct_ids, g->geonames_id, &ret); char_array_clear(g->name); char_array_cat(g->name, utf8_normalized); cstring_array_clear(geo_features); if (!geodisambig_add_geoname_features(geo_features, g)) { log_error("Could not add geonames features for id=%d\n", g->geonames_id); exit(EXIT_FAILURE); } for (int i = 0; i < cstring_array_num_strings(geo_features); i++) { char *token = cstring_array_get_string(geo_features, i); key = kh_get(str_set, distinct_features, token); if (key == kh_end(distinct_features)) { // Not in set, this GeoName takes priority if (sparkey_logwriter_put(self->log_writer, strlen(token), (uint8_t *)token, strlen(id_string), (uint8_t *)id_string) != SPARKEY_SUCCESS) { log_error("Error writing key %s to Sparkey\n", token); } bloom_filter_add(self->bloom_filter, token, strlen(token)); key = kh_put(str_set, distinct_features, token, &ret); } } if (prev_name != NULL) { free(prev_name); } if (utf8_normalized != NULL) { prev_name = utf8_normalized; } free(line); i++; if (i % 1000 == 0) { log_info("Did %d geonames\n", i); } } kh_destroy(int_set, all_ids); kh_destroy(int_set, distinct_ids); kh_destroy(str_set, distinct_features); char_array_destroy(serialized); cstring_array_destroy(geo_features); geoname_destroy(g); fclose(f); } void import_geonames_postal_codes(geodb_builder_t *self, char *filename) { FILE *f = fopen(filename, "r"); if (f == NULL) { printf("Couldn't open file\n"); exit(1); } char *line; char *prev_code = NULL; gn_postal_code_t *pc = gn_postal_code_new(); char_array *serialized = char_array_new(); cstring_array *postal_code_features = cstring_array_new(); khash_t(str_set) *distinct_features = kh_init(str_set); khiter_t key; int ret; int i = 0; // Always true for postal codes bool is_canonical = true; while ((line = file_getline(f)) != NULL) { if (!read_gn_postal_code_from_line(pc, line)) { log_error("Error reading line: %s\n", line); exit(EXIT_FAILURE); } char *code = char_array_get_string(pc->postal_code); char *utf8_normalized = normalize_string_utf8(code, NORMALIZE_STRING_LOWERCASE); if (utf8_normalized == NULL) { log_error("normalization failed for postal code %s\n", code); exit(EXIT_FAILURE); } if (prev_code == NULL || strcmp(utf8_normalized, prev_code) != 0) { kh_clear(str_set, distinct_features); } geodb_builder_add_to_trie(self, utf8_normalized, is_canonical, ADDRESS_POSTAL_CODE); char_array_clear(serialized); if (!gn_postal_code_serialize(pc, serialized)) { log_error("gn_postal_code_serialize failed for postal code=%s\n", code); exit(EXIT_FAILURE); } cstring_array_clear(postal_code_features); char_array_clear(pc->postal_code); char_array_cat(pc->postal_code, utf8_normalized); if (!geodisambig_add_postal_code_features(postal_code_features, pc)) { log_error("Could not add geonames features for postal code=%s\n", code); exit(EXIT_FAILURE); } for (int i = 0; i < cstring_array_num_strings(postal_code_features); i++) { char *token = cstring_array_get_string(postal_code_features, i); key = kh_get(str_set, distinct_features, token); if (key == kh_end(distinct_features)) { // Not in set, this GeoName takes priority if (sparkey_logwriter_put(self->log_writer, strlen(token), (uint8_t *)token, serialized->n, (uint8_t *)char_array_get_string(serialized)) != SPARKEY_SUCCESS) { log_error("Error writing key %s to Sparkey\n", token); } bloom_filter_add(self->bloom_filter, token, strlen(token)); key = kh_put(str_set, distinct_features, token, &ret); } } if (prev_code != NULL) { free(prev_code); } if (utf8_normalized != NULL) { prev_code = utf8_normalized; } free(line); i++; if (i % 1000 == 0) { log_info("Did %d postal codes\n", i); } } kh_destroy(str_set, distinct_features); char_array_destroy(serialized); cstring_array_destroy(postal_code_features); gn_postal_code_destroy(pc); fclose(f); } int main(int argc, char **argv) { char *input_dir; char *output_dir; if (argc > 2) { input_dir = argv[1]; output_dir = argv[2]; } else { input_dir = LIBPOSTAL_GEONAMES_DIR; output_dir = LIBPOSTAL_GEODB_DIR; } char *geonames_filename = "geonames.tsv"; char_array *path = char_array_new_size(strlen(input_dir)); char_array_add_joined(path, PATH_SEPARATOR, true, 2, input_dir, geonames_filename); char *geonames_path = strdup(char_array_get_string(path)); char_array_clear(path); char_array_add_joined(path, PATH_SEPARATOR, true, 2, output_dir, GEODB_LOG_FILENAME); char *log_filename = char_array_get_string(path); geodb_builder_t *builder = geodb_builder_new(log_filename); import_geonames(builder, geonames_path); free(geonames_path); printf("\n\n"); char *postal_codes_filename = "postal_codes.tsv"; char_array_clear(path); char_array_add_joined(path, PATH_SEPARATOR, true, 2, input_dir, postal_codes_filename); char *postal_codes_path = char_array_get_string(path); log_info("Doing postal_codes\n"); import_geonames_postal_codes(builder, postal_codes_path); char_array_destroy(path); if (!geodb_finalize(builder, output_dir)) { exit(EXIT_FAILURE); } geodb_builder_destroy(builder); exit(EXIT_SUCCESS); }