Files
libpostal/src/geodb_builder.c

733 lines
22 KiB
C

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include "log/log.h"
#include "sparkey/sparkey.h"
#include "collections.h"
#include "config.h"
#include "file_utils.h"
#include "gazetteers.h"
#include "geonames.h"
#include "geodb.h"
#include "geo_disambiguation.h"
#include "normalize.h"
#include "string_utils.h"
// These files are generated by create_geonames_tsv.py
#include "geonames_fields.h"
#include "postal_fields.h"
/* Default path of the GeoNames TSV input: LIBPOSTAL_GEONAMES_DIR, the path
 * separator, then "geonames.tsv". Relies on adjacent string-literal
 * concatenation, so both macros it references must expand to string literals.
 * Deliberately has no trailing semicolon so it can be used inside expressions
 * (function arguments, initialisers, comparisons). */
#define DEFAULT_GEONAMES_TSV LIBPOSTAL_GEONAMES_DIR PATH_SEPARATOR "geonames.tsv"
/* Parses an optional integer flag column. Returns 0 when the column is empty
 * or not numeric, so the caller never reads an uninitialised value. */
static int geoname_flag_from_token(const char *token) {
    int flag = 0;
    if (token[0] != '\0' && sscanf(token, "%d", &flag) != 1) {
        flag = 0;
    }
    return flag;
}

/* Fills `g` from one tab-separated GeoNames line.
 *
 * `g` is cleared first, then every column named by the GEONAMES_* field
 * indices is copied or parsed into it. Empty or unparsable optional numeric
 * columns are stored as 0.
 *
 * Returns true on success. Returns false (after logging) when the line cannot
 * be split, when it does not have exactly NUM_GEONAMES_FIELDS columns, or when
 * the id column is empty or not numeric.
 */
static bool read_geoname_from_line(geoname_t *g, char *line) {
    int token_count;
    char *token;

    geoname_clear(g);

    cstring_array *tokens = cstring_array_split(line, TAB_SEPARATOR, TAB_SEPARATOR_LEN, &token_count);
    if (tokens == NULL) return false;

    if (token_count != NUM_GEONAMES_FIELDS) {
        log_error("Number of fields (%d) != expected (%d)\n", token_count, NUM_GEONAMES_FIELDS);
        goto exit_geoname_free_tokens;
    }

    token = cstring_array_get_string(tokens, GEONAMES_ID);
    if (strlen(token) == 0) {
        log_error("geonames_id is required\n");
        goto exit_geoname_free_tokens;
    }
    if (sscanf(token, "%d", &g->geonames_id) != 1) {
        log_error("geonames_id is not numeric: %s\n", token);
        goto exit_geoname_free_tokens;
    }

    token = cstring_array_get_string(tokens, GEONAMES_CANONICAL);
    char_array_cat(g->canonical, token);

    token = cstring_array_get_string(tokens, GEONAMES_BOUNDARY_TYPE);
    sscanf(token, "%d", &g->type);

    token = cstring_array_get_string(tokens, GEONAMES_NAME);
    char_array_cat(g->name, token);

    token = cstring_array_get_string(tokens, GEONAMES_ISO_LANGUAGE);
    char_array_cat(g->iso_language, token);

    // Optional flag columns: an empty column means "false".
    g->has_wikipedia_entry = geoname_flag_from_token(cstring_array_get_string(tokens, GEONAMES_HAS_WIKIPEDIA_ENTRY));
    g->is_preferred_name = geoname_flag_from_token(cstring_array_get_string(tokens, GEONAMES_IS_PREFERRED_NAME));
    g->is_short_name = geoname_flag_from_token(cstring_array_get_string(tokens, GEONAMES_IS_SHORT_NAME));
    g->is_colloquial = geoname_flag_from_token(cstring_array_get_string(tokens, GEONAMES_IS_COLLOQUIAL));
    g->is_historical = geoname_flag_from_token(cstring_array_get_string(tokens, GEONAMES_IS_HISTORICAL));

    token = cstring_array_get_string(tokens, GEONAMES_POPULATION);
    if (strlen(token) == 0 || sscanf(token, "%d", &g->population) != 1) {
        g->population = 0;
    }

    // An empty latitude must reset latitude itself (the original reset
    // longitude here and left latitude untouched).
    token = cstring_array_get_string(tokens, GEONAMES_LATITUDE);
    if (strlen(token) == 0 || sscanf(token, "%lf", &g->latitude) != 1) {
        g->latitude = 0.0;
    }

    token = cstring_array_get_string(tokens, GEONAMES_LONGITUDE);
    if (strlen(token) == 0 || sscanf(token, "%lf", &g->longitude) != 1) {
        g->longitude = 0.0;
    }

    token = cstring_array_get_string(tokens, GEONAMES_FEATURE_CODE);
    char_array_cat(g->feature_code, token);

    token = cstring_array_get_string(tokens, GEONAMES_COUNTRY_CODE);
    char_array_cat(g->country_code, token);

    token = cstring_array_get_string(tokens, GEONAMES_COUNTRY_ID);
    if (strlen(token) == 0 || sscanf(token, "%d", &g->country_geonames_id) != 1) {
        g->country_geonames_id = 0;
    }

    token = cstring_array_get_string(tokens, GEONAMES_ADMIN1_CODE);
    char_array_cat(g->admin1_code, token);

    token = cstring_array_get_string(tokens, GEONAMES_ADMIN1_ID);
    if (strlen(token) == 0 || sscanf(token, "%d", &g->admin1_geonames_id) != 1) {
        g->admin1_geonames_id = 0;
    }

    token = cstring_array_get_string(tokens, GEONAMES_ADMIN2_CODE);
    char_array_cat(g->admin2_code, token);

    token = cstring_array_get_string(tokens, GEONAMES_ADMIN2_ID);
    if (strlen(token) == 0 || sscanf(token, "%d", &g->admin2_geonames_id) != 1) {
        g->admin2_geonames_id = 0;
    }

    token = cstring_array_get_string(tokens, GEONAMES_ADMIN3_CODE);
    char_array_cat(g->admin3_code, token);

    // The fetched ADMIN3_ID column must be assigned to `token`; otherwise the
    // ADMIN3 code string above is what gets parsed as the id.
    token = cstring_array_get_string(tokens, GEONAMES_ADMIN3_ID);
    if (strlen(token) == 0 || sscanf(token, "%d", &g->admin3_geonames_id) != 1) {
        g->admin3_geonames_id = 0;
    }

    token = cstring_array_get_string(tokens, GEONAMES_ADMIN4_CODE);
    char_array_cat(g->admin4_code, token);

    token = cstring_array_get_string(tokens, GEONAMES_ADMIN4_ID);
    if (strlen(token) == 0 || sscanf(token, "%d", &g->admin4_geonames_id) != 1) {
        g->admin4_geonames_id = 0;
    }

    cstring_array_destroy(tokens);
    return true;

exit_geoname_free_tokens:
    cstring_array_destroy(tokens);
    return false;
}
/* Fills `postal` from one tab-separated GeoNames postal-code line.
 *
 * `postal` is cleared first. The postal code column is mandatory; the three
 * admin id columns are comma-separated lists whose numeric entries are pushed
 * onto the matching id arrays (empty or non-numeric entries are skipped).
 *
 * Returns true on success. Returns false (after logging) when the line cannot
 * be split, when it does not have exactly NUM_POSTAL_FIELDS columns, or when
 * the postal code column is empty.
 */
static bool read_gn_postal_code_from_line(gn_postal_code_t *postal, char *line) {
    int token_count, i;
    char *token;

    gn_postal_code_clear(postal);

    cstring_array *tokens = cstring_array_split(line, TAB_SEPARATOR, TAB_SEPARATOR_LEN, &token_count);
    if (tokens == NULL) return false;

    if (token_count != NUM_POSTAL_FIELDS) {
        log_error("Number of fields (%d) != expected (%d)\n", token_count, NUM_POSTAL_FIELDS);
        goto exit_postal_tokens_created;
    }

    token = cstring_array_get_string(tokens, GN_POSTAL_CODE);
    if (strlen(token) == 0) {
        log_error("postal_code field required\n");
        goto exit_postal_tokens_created;
    }
    char_array_cat(postal->postal_code, token);

    // Copy the country code before `token` is reused for the next column;
    // the original appended the country geonames id string here instead.
    token = cstring_array_get_string(tokens, GN_POSTAL_COUNTRY_CODE);
    char_array_cat(postal->country_code, token);

    token = cstring_array_get_string(tokens, GN_POSTAL_COUNTRY_GEONAMES_ID);
    if (strlen(token) == 0 || sscanf(token, "%d", &postal->country_geonames_id) != 1) {
        postal->country_geonames_id = 0;
    }

    // NOTE(review): this stores the *_GEONAME_ID column as text in
    // containing_geoname, exactly as before; confirm that is the intended column.
    token = cstring_array_get_string(tokens, GN_POSTAL_CONTAINING_GEONAME_ID);
    char_array_cat(postal->containing_geoname, token);

    char *admin1_field = cstring_array_get_string(tokens, GN_POSTAL_ADMIN1_IDS);
    if (strlen(admin1_field) > 0) {
        int admin1_token_count = 0;
        cstring_array *admin1_tokens = cstring_array_split(admin1_field, COMMA_SEPARATOR, COMMA_SEPARATOR_LEN, &admin1_token_count);
        if (admin1_tokens != NULL) {
            for (i = 0; i < admin1_token_count; i++) {
                // Index the comma-split list, not the tab-split line.
                char *admin1_token = cstring_array_get_string(admin1_tokens, i);
                unsigned int admin1_id;
                if (sscanf(admin1_token, "%u", &admin1_id) == 1) {
                    uint32_array_push(postal->admin1_ids, (uint32_t)admin1_id);
                }
            }
            cstring_array_destroy(admin1_tokens);
        }
    }

    char *admin2_field = cstring_array_get_string(tokens, GN_POSTAL_ADMIN2_IDS);
    if (strlen(admin2_field) > 0) {
        int admin2_token_count = 0;
        cstring_array *admin2_tokens = cstring_array_split(admin2_field, COMMA_SEPARATOR, COMMA_SEPARATOR_LEN, &admin2_token_count);
        if (admin2_tokens != NULL) {
            for (i = 0; i < admin2_token_count; i++) {
                char *admin2_token = cstring_array_get_string(admin2_tokens, i);
                unsigned int admin2_id;
                if (sscanf(admin2_token, "%u", &admin2_id) == 1) {
                    uint32_array_push(postal->admin2_ids, (uint32_t)admin2_id);
                }
            }
            cstring_array_destroy(admin2_tokens);
        }
    }

    char *admin3_field = cstring_array_get_string(tokens, GN_POSTAL_ADMIN3_IDS);
    if (strlen(admin3_field) > 0) {
        int admin3_token_count = 0;
        cstring_array *admin3_tokens = cstring_array_split(admin3_field, COMMA_SEPARATOR, COMMA_SEPARATOR_LEN, &admin3_token_count);
        if (admin3_tokens != NULL) {
            for (i = 0; i < admin3_token_count; i++) {
                char *admin3_token = cstring_array_get_string(admin3_tokens, i);
                unsigned int admin3_id;
                if (sscanf(admin3_token, "%u", &admin3_id) == 1) {
                    uint32_array_push(postal->admin3_ids, (uint32_t)admin3_id);
                }
            }
            cstring_array_destroy(admin3_tokens);
        }
    }

    cstring_array_destroy(tokens);
    return true;

exit_postal_tokens_created:
    cstring_array_destroy(tokens);
    return false;
}
/* State accumulated while the GeoNames input files are imported. Created by
 * geodb_builder_new() and released by geodb_builder_destroy(). */
typedef struct geodb_builder {
// Keyed by normalized place names and postal codes; each entry packs the
// address-component bits and an occurrence count (see geodb_builder_add_to_trie).
trie_t *trie;
// Sparkey log that receives every key/value record written during import;
// closed by geodb_finalize() before the hash file is generated from it.
sparkey_logwriter *log_writer;
// Receives every feature key that is written to the Sparkey log.
bloom_filter_t *bloom_filter;
} geodb_builder_t;
/* Tears down a builder: destroys the trie and bloom filter, closes the Sparkey
 * log writer if one is still open, then frees the builder itself.
 * A NULL builder is accepted and ignored. */
void geodb_builder_destroy(geodb_builder_t *self) {
    if (!self) {
        return;
    }

    if (self->trie) {
        trie_destroy(self->trie);
    }

    if (self->bloom_filter) {
        bloom_filter_destroy(self->bloom_filter);
    }

    if (self->log_writer) {
        sparkey_logwriter_close(&self->log_writer);
    }

    free(self);
}
/* Allocates a builder with a fresh trie, a bloom filter sized by
 * GEODB_BLOOM_FILTER_SIZE / GEODB_BLOOM_FILTER_ERROR, and an uncompressed
 * Sparkey log writer on `log_filename`.
 *
 * Returns the new builder, or NULL if any step fails; on failure everything
 * created so far is released. The caller owns the result and must release it
 * with geodb_builder_destroy().
 */
geodb_builder_t *geodb_builder_new(char *log_filename) {
    // calloc so every member starts out NULL: the failure path hands the
    // partially built object to geodb_builder_destroy(), which tests each
    // member against NULL before releasing it.
    geodb_builder_t *builder = calloc(1, sizeof *builder);
    if (builder == NULL) return NULL;

    builder->trie = trie_new();
    if (builder->trie == NULL) {
        goto exit_destroy_builder;
    }

    builder->bloom_filter = bloom_filter_new(GEODB_BLOOM_FILTER_SIZE, GEODB_BLOOM_FILTER_ERROR);
    if (builder->bloom_filter == NULL) {
        goto exit_destroy_builder;
    }

    sparkey_returncode ret_code = sparkey_logwriter_create(&builder->log_writer, log_filename, SPARKEY_COMPRESSION_NONE, 0);
    if (ret_code != SPARKEY_SUCCESS) {
        // Never try to close a writer that was not successfully created.
        builder->log_writer = NULL;
        goto exit_destroy_builder;
    }

    return builder;

exit_destroy_builder:
    geodb_builder_destroy(builder);
    return NULL;
}
/* Translates a GeoNames boundary type into the matching ADDRESS_* component
 * value. Boundary types without a mapping yield 0. */
uint16_t get_address_component(uint32_t boundary_type) {
    switch (boundary_type) {
        case GEONAMES_LOCALITY:
            return ADDRESS_LOCALITY;
        case GEONAMES_NEIGHBORHOOD:
            return ADDRESS_NEIGHBORHOOD;
        case GEONAMES_ADMIN1:
            return ADDRESS_ADMIN1;
        case GEONAMES_ADMIN2:
            return ADDRESS_ADMIN2;
        case GEONAMES_ADMIN3:
            return ADDRESS_ADMIN3;
        case GEONAMES_ADMIN4:
            return ADDRESS_ADMIN4;
        case GEONAMES_ADMIN_OTHER:
            return ADDRESS_ADMIN_OTHER;
        default:
            return 0;
    }
}
/* Records one occurrence of `key` in the builder's trie.
 *
 * If the key is new it is added with `address_component` as its component
 * bits and a count of 1. Otherwise `address_component` is OR-ed into the
 * existing bits and the count is incremented in place.
 *
 * Returns false when the builder, its trie, or the key is NULL; true once the
 * trie has been updated. (The original fell off the end of this non-void
 * function on both update paths.)
 */
bool geodb_builder_add_to_trie(geodb_builder_t *self, char *key, uint16_t address_component) {
    if (self == NULL || self->trie == NULL || key == NULL) return false;

    uint32_t node_id = trie_get(self->trie, key);

    geodb_value_t value;
    value.value = 0;

    if (node_id == NULL_NODE_ID) {
        value.components |= address_component;
        value.count = 1;
        // NOTE(review): the results of trie_add/trie_set_data_node are not
        // inspected because their return types are not visible from this
        // file; propagate them here if they report failure.
        trie_add(self->trie, key, value.value);
    } else {
        trie_node_t node = trie_get_node(self->trie, node_id);
        trie_data_node_t data_node = trie_get_data_node(self->trie, node);

        // Unpack the stored value, merge the new component bit, bump the count.
        value.value = data_node.data;
        value.components |= address_component;
        value.count++;
        data_node.data = value.value;

        // The data node index is the negated base of the key's node.
        trie_set_data_node(self->trie, -1 * node.base, data_node);
    }

    return true;
}
/* Writes "<dir><PATH_SEPARATOR><filename>" into `path`, replacing its previous
 * contents. The separator is inserted only when `dir` is non-empty and does
 * not already end with it; an empty `dir` yields just `filename`. */
void join_path(char_array *path, char *dir, char *filename) {
    char_array_clear(path);

    size_t dir_len = strlen(dir);
    size_t sep_len = (size_t)PATH_SEPARATOR_LEN;

    // Compare the final sep_len bytes of dir, and only when dir is long enough
    // to hold a separator, so an empty dir never reads before the buffer.
    bool ends_with_separator = dir_len >= sep_len &&
                               strncmp(dir + dir_len - sep_len, PATH_SEPARATOR, sep_len) == 0;

    char_array_cat(path, dir);
    if (dir_len > 0 && !ends_with_separator) {
        char_array_cat(path, PATH_SEPARATOR);
    }
    char_array_cat(path, filename);
}
/* Writes the finished database files into `output_dir`:
 *   1. saves the trie to GEODB_TRIE_FILENAME,
 *   2. closes the Sparkey log writer (if still open) and builds the Sparkey
 *      hash file GEODB_HASH_FILENAME from the log GEODB_LOG_FILENAME,
 *   3. saves the bloom filter to GEODB_BLOOM_FILTER_FILENAME.
 *
 * Returns true when every step succeeds; logs and returns false at the first
 * failure. The builder itself is not destroyed.
 */
bool geodb_finalize(geodb_builder_t *self, char *output_dir) {
    if (self == NULL || output_dir == NULL) return false;

    bool ok = false;
    char *hash_filename = NULL;
    char *log_filename = NULL;

    // Scratch buffer reused for each output path; released on every exit.
    char_array *path = char_array_new_size(strlen(output_dir));
    if (path == NULL) return false;

    join_path(path, output_dir, GEODB_TRIE_FILENAME);
    // Assumes trie_save reports failure with a false result, like
    // bloom_filter_save below -- TODO confirm against trie.h.
    if (!trie_save(self->trie, char_array_get_string(path))) {
        log_error("Could not save trie\n");
        goto exit_finalize;
    }

    // The hash path must be copied: `path` is overwritten with the log path
    // next and both are needed at the same time.
    join_path(path, output_dir, GEODB_HASH_FILENAME);
    hash_filename = strdup(char_array_get_string(path));
    if (hash_filename == NULL) {
        log_error("Could not allocate hash filename\n");
        goto exit_finalize;
    }

    join_path(path, output_dir, GEODB_LOG_FILENAME);
    log_filename = char_array_get_string(path);

    // Close the log writer before the hash file is generated from the log.
    if (self->log_writer != NULL) {
        sparkey_logwriter_close(&self->log_writer);
        self->log_writer = NULL;
    }

    if (sparkey_hash_write(hash_filename, log_filename, 0) != SPARKEY_SUCCESS) {
        log_error("Could not write Sparkey hash file\n");
        goto exit_finalize;
    }

    join_path(path, output_dir, GEODB_BLOOM_FILTER_FILENAME);
    if (!bloom_filter_save(self->bloom_filter, char_array_get_string(path))) {
        log_error("Could not save bloom filter\n");
        goto exit_finalize;
    }

    ok = true;

exit_finalize:
    free(hash_filename);
    char_array_destroy(path);
    return ok;
}
/* True when `name` is two or three bytes long and string_is_upper() accepts
 * it. The upper-case check is only performed once the length has matched. */
bool name_is_iso_code(char *name) {
    size_t name_len = strlen(name);

    if (name_len != 2 && name_len != 3) {
        return false;
    }

    return string_is_upper(name);
}
void import_geonames(geodb_builder_t *self, char *filename) {
FILE *f = fopen(filename, "r");
if (f == NULL) {
printf("Couldn't open file\n");
exit(1);
}
char *line;
char *prev_name = NULL;
geoname_t *g = geoname_new();
char_array *serialized = char_array_new();
// Just a set of all ids in GeoNames so we only add keys once, takes up < 50MB
khash_t(int_set) *all_ids = kh_init(int_set);
khash_t(int_set) *distinct_ids = kh_init(int_set);
khash_t(str_set) *distinct_features = kh_init(str_set);
khiter_t key;
int ret;
cstring_array *geo_features = cstring_array_new();
char id_string[INT32_MAX_STRING_SIZE];
int normalize_utf8_options = NORMALIZE_STRING_DECOMPOSE | NORMALIZE_STRING_LOWERCASE;
int normalize_latin_options = normalize_utf8_options | NORMALIZE_STRING_LATIN_ASCII;
int i = 0;
while ((line = file_getline(f)) != NULL) {
read_geoname_from_line(g, line);
char *name = char_array_get_string(g->name);
char *utf8_normalized = NULL;
char *normalized = NULL;
size_t id_len = sprintf(id_string, "%d", g->geonames_id);
if (g->type == GEONAMES_COUNTRY && name_is_iso_code(name)) {
utf8_normalized = strdup(name);
} else if (name != NULL) {
utf8_normalized = normalize_string_utf8(name, normalize_utf8_options);
}
if (utf8_normalized != NULL && (prev_name == NULL || strcmp(utf8_normalized, prev_name) != 0)) {
// New name
geodb_builder_add_to_trie(self, utf8_normalized, get_address_component(g->type));
kh_clear(int_set, distinct_ids);
kh_clear(str_set, distinct_features);
} else if (utf8_normalized != NULL) {
key = kh_get(int_set, distinct_ids, g->geonames_id);
if (key == kh_end(distinct_ids)) {
geodb_builder_add_to_trie(self, utf8_normalized, get_address_component(g->type));
}
} else {
log_error("normalization failed for name %s\n", name);
exit(EXIT_FAILURE);
}
char_array_clear(serialized);
if (!geoname_serialize(g, serialized)) {
log_error("geoname_serialize failed for id=%d\n", g->geonames_id);
exit(EXIT_FAILURE);
}
key = kh_get(int_set, all_ids, g->geonames_id);
if (key == kh_end(all_ids)) {
if ((sparkey_logwriter_put(self->log_writer, strlen(id_string), (uint8_t *)id_string, serialized->n, (uint8_t *)char_array_get_string(serialized))) != SPARKEY_SUCCESS) {
log_error("Error writing to Sparkey with id=%d\n", g->geonames_id);
exit(EXIT_FAILURE);
}
key = kh_put(int_set, all_ids, g->geonames_id, &ret);
}
key = kh_put(int_set, distinct_ids, g->geonames_id, &ret);
char_array_clear(g->name);
char_array_cat(g->name, utf8_normalized);
cstring_array_clear(geo_features);
if (!geodisambig_add_geoname_features(geo_features, g)) {
log_error("Could not add geonames features for id=%d\n", g->geonames_id);
exit(EXIT_FAILURE);
}
for (int i = 0; i < cstring_array_num_strings(geo_features); i++) {
char *token = cstring_array_get_string(geo_features, i);
key = kh_get(str_set, distinct_features, token);
if (key == kh_end(distinct_features)) {
// Not in set, this GeoName takes priority
if (sparkey_logwriter_put(self->log_writer, strlen(token), (uint8_t *)token, strlen(id_string), (uint8_t *)id_string) != SPARKEY_SUCCESS) {
log_error("Error writing key %s to Sparkey\n", token);
}
int in_bloom_filter = bloom_filter_add(self->bloom_filter, token, strlen(token));
key = kh_put(str_set, distinct_features, token, &ret);
}
}
if (prev_name != NULL) {
free(prev_name);
}
if (utf8_normalized != NULL) {
prev_name = utf8_normalized;
}
free(line);
i++;
if (i % 1000 == 0) {
log_info("Did %d geonames\n", i);
}
}
kh_destroy(int_set, all_ids);
kh_destroy(int_set, distinct_ids);
kh_destroy(str_set, distinct_features);
char_array_destroy(serialized);
cstring_array_destroy(geo_features);
geoname_destroy(g);
fclose(f);
}
void import_geonames_postal_codes(geodb_builder_t *self, char *filename) {
FILE *f = fopen(filename, "r");
if (f == NULL) {
printf("Couldn't open file\n");
exit(1);
}
char *line;
char *prev_code = NULL;
gn_postal_code_t *pc = gn_postal_code_new();
char_array *serialized = char_array_new();
cstring_array *postal_code_features = cstring_array_new();
khash_t(str_set) *distinct_features = kh_init(str_set);
khiter_t key;
int ret;
int i = 0;
while ((line = file_getline(f)) != NULL) {
if (!read_gn_postal_code_from_line(pc, line)) {
log_error("Error reading line: %s\n", line);
exit(EXIT_FAILURE);
}
char *code = char_array_get_string(pc->postal_code);
char *utf8_normalized = normalize_string_utf8(code, NORMALIZE_STRING_LOWERCASE);
if (utf8_normalized == NULL) {
log_error("normalization failed for postal code %s\n", code);
exit(EXIT_FAILURE);
}
if (prev_code == NULL || strcmp(utf8_normalized, prev_code) != 0) {
kh_clear(str_set, distinct_features);
}
geodb_builder_add_to_trie(self, utf8_normalized, ADDRESS_POSTAL_CODE);
char_array_clear(serialized);
if (!gn_postal_code_serialize(pc, serialized)) {
log_error("gn_postal_code_serialize failed for postal code=%s\n", code);
exit(EXIT_FAILURE);
}
cstring_array_clear(postal_code_features);
char_array_clear(pc->postal_code);
char_array_cat(pc->postal_code, utf8_normalized);
if (!geodisambig_add_postal_code_features(postal_code_features, pc)) {
log_error("Could not add geonames features for postal code=%s\n", code);
exit(EXIT_FAILURE);
}
for (int i = 0; i < cstring_array_num_strings(postal_code_features); i++) {
char *token = cstring_array_get_string(postal_code_features, i);
key = kh_get(str_set, distinct_features, token);
if (key == kh_end(distinct_features)) {
// Not in set, this GeoName takes priority
if (sparkey_logwriter_put(self->log_writer, strlen(token), (uint8_t *)token, serialized->n, (uint8_t *)char_array_get_string(serialized)) != SPARKEY_SUCCESS) {
log_error("Error writing key %s to Sparkey\n", token);
}
int in_bloom_filter = bloom_filter_add(self->bloom_filter, token, strlen(token));
key = kh_put(str_set, distinct_features, token, &ret);
}
}
if (prev_code != NULL) {
free(prev_code);
}
if (utf8_normalized != NULL) {
prev_code = utf8_normalized;
}
free(line);
i++;
if (i % 1000 == 0) {
log_info("Did %d postal codes\n", i);
}
}
kh_destroy(str_set, distinct_features);
char_array_destroy(serialized);
cstring_array_destroy(postal_code_features);
gn_postal_code_destroy(pc);
fclose(f);
}
/* Builds the GeoDB files.
 *
 * Usage: <program> [input_dir output_dir]
 * With both arguments, "geonames.tsv" and "postal_codes.tsv" are read from
 * input_dir and the database is written to output_dir. With fewer than two
 * arguments, LIBPOSTAL_GEONAMES_DIR and LIBPOSTAL_GEODB_DIR are used.
 *
 * Exits with EXIT_FAILURE if the builder cannot be created or finalized.
 */
int main(int argc, char **argv) {
    char *input_dir;
    char *output_dir;

    if (argc > 2) {
        input_dir = argv[1];
        output_dir = argv[2];
    } else {
        input_dir = LIBPOSTAL_GEONAMES_DIR;
        output_dir = LIBPOSTAL_GEODB_DIR;
    }

    char *geonames_filename = "geonames.tsv";
    char *postal_codes_filename = "postal_codes.tsv";

    // Scratch buffer reused for every path built below.
    char_array *path = char_array_new_size(strlen(input_dir));
    if (path == NULL) {
        log_error("Could not allocate path buffer\n");
        exit(EXIT_FAILURE);
    }

    // Copied because `path` is overwritten with the log path before use.
    join_path(path, input_dir, geonames_filename);
    char *geonames_path = strdup(char_array_get_string(path));
    if (geonames_path == NULL) {
        log_error("Could not allocate geonames path\n");
        exit(EXIT_FAILURE);
    }

    join_path(path, output_dir, GEODB_LOG_FILENAME);
    char *log_filename = char_array_get_string(path);

    geodb_builder_t *builder = geodb_builder_new(log_filename);
    if (builder == NULL) {
        // The import functions dereference the builder unconditionally.
        log_error("Could not create geodb builder\n");
        exit(EXIT_FAILURE);
    }

    import_geonames(builder, geonames_path);
    free(geonames_path);

    printf("\n\n");

    join_path(path, input_dir, postal_codes_filename);
    char *postal_codes_path = char_array_get_string(path);

    log_info("Doing postal_codes\n");
    import_geonames_postal_codes(builder, postal_codes_path);

    char_array_destroy(path);

    if (!geodb_finalize(builder, output_dir)) {
        exit(EXIT_FAILURE);
    }

    geodb_builder_destroy(builder);
    exit(EXIT_SUCCESS);
}