759 lines
23 KiB
C
759 lines
23 KiB
C
#include <stdio.h>
|
|
#include <stdlib.h>
|
|
#include <string.h>
|
|
#include <unistd.h>
|
|
|
|
#include "log/log.h"
|
|
#include "sparkey/sparkey.h"
|
|
|
|
#include "collections.h"
|
|
#include "libpostal_config.h"
|
|
#include "file_utils.h"
|
|
#include "gazetteers.h"
|
|
#include "geonames.h"
|
|
#include "geodb.h"
|
|
#include "geo_disambiguation.h"
|
|
#include "msgpack_utils.h"
|
|
#include "normalize.h"
|
|
#include "string_utils.h"
|
|
|
|
// These files are generated by create_geonames_tsv.py
|
|
#include "geonames_fields.h"
|
|
#include "postal_fields.h"
|
|
|
|
// Default location of the GeoNames TSV: LIBPOSTAL_GEONAMES_DIR, PATH_SEPARATOR and the
// file name written as adjacent tokens (presumably all string literals, so they
// concatenate at compile time -- confirm against libpostal_config.h).
// Deliberately has no trailing semicolon so the macro can be used inside expressions
// and argument lists.
#define DEFAULT_GEONAMES_TSV LIBPOSTAL_GEONAMES_DIR PATH_SEPARATOR "geonames.tsv"
|
|
|
|
// Parse an optional integer flag column. An empty or unparseable token yields 0.
static int geoname_flag_from_token(const char *token) {
    int flag = 0;
    if (token[0] != '\0') {
        sscanf(token, "%d", &flag);
    }
    return flag;
}

/*
 * Parse one tab-separated GeoNames record from `line` into `g`.
 *
 * `g` is reset with geoname_clear() before any field is read. Columns are looked up
 * by the GEONAMES_* indices from the generated geonames_fields.h.
 *
 * Returns true on success. Returns false (after logging where applicable) when the
 * line cannot be split, when the number of columns differs from NUM_GEONAMES_FIELDS,
 * or when the id column is empty. On failure `g` is left cleared or partially filled.
 */
static bool read_geoname_from_line(geoname_t *g, char *line) {
    size_t token_count;
    char *token;

    geoname_clear(g);

    cstring_array *tokens = cstring_array_split(line, TAB_SEPARATOR, TAB_SEPARATOR_LEN, &token_count);
    if (tokens == NULL) return false;

    if (token_count != NUM_GEONAMES_FIELDS) {
        // token_count is a size_t, so it must be printed with %zu
        log_error("Number of fields (%zu) != expected (%d)\n", token_count, NUM_GEONAMES_FIELDS);
        goto exit_geoname_free_tokens;
    }

    token = cstring_array_get_string(tokens, GEONAMES_ID);
    if (strlen(token) == 0) {
        log_error("geonames_id is required\n");
        goto exit_geoname_free_tokens;
    }
    sscanf(token, "%d", &g->geonames_id);

    token = cstring_array_get_string(tokens, GEONAMES_CANONICAL);
    char_array_cat(g->canonical, token);

    token = cstring_array_get_string(tokens, GEONAMES_BOUNDARY_TYPE);
    sscanf(token, "%d", (int *)&g->type);

    token = cstring_array_get_string(tokens, GEONAMES_NAME);
    char_array_cat(g->name, token);

    token = cstring_array_get_string(tokens, GEONAMES_ISO_LANGUAGE);
    char_array_cat(g->iso_language, token);

    // Optional flag columns: empty means false
    g->has_wikipedia_entry = geoname_flag_from_token(cstring_array_get_string(tokens, GEONAMES_HAS_WIKIPEDIA_ENTRY));
    g->is_preferred_name = geoname_flag_from_token(cstring_array_get_string(tokens, GEONAMES_IS_PREFERRED_NAME));
    g->is_short_name = geoname_flag_from_token(cstring_array_get_string(tokens, GEONAMES_IS_SHORT_NAME));
    g->is_colloquial = geoname_flag_from_token(cstring_array_get_string(tokens, GEONAMES_IS_COLLOQUIAL));
    g->is_historical = geoname_flag_from_token(cstring_array_get_string(tokens, GEONAMES_IS_HISTORICAL));

    token = cstring_array_get_string(tokens, GEONAMES_POPULATION);
    if (strlen(token) > 0) {
        sscanf(token, "%d", &g->population);
    } else {
        g->population = 0;
    }

    token = cstring_array_get_string(tokens, GEONAMES_LATITUDE);
    if (strlen(token) > 0) {
        sscanf(token, "%lf", &g->latitude);
    } else {
        // An empty latitude column resets latitude itself, not longitude
        g->latitude = 0.0;
    }

    token = cstring_array_get_string(tokens, GEONAMES_LONGITUDE);
    if (strlen(token) > 0) {
        sscanf(token, "%lf", &g->longitude);
    } else {
        g->longitude = 0.0;
    }

    token = cstring_array_get_string(tokens, GEONAMES_FEATURE_CODE);
    char_array_cat(g->feature_code, token);

    token = cstring_array_get_string(tokens, GEONAMES_COUNTRY_CODE);
    char_array_cat(g->country_code, token);

    token = cstring_array_get_string(tokens, GEONAMES_COUNTRY_ID);
    if (strlen(token) > 0) {
        sscanf(token, "%d", &g->country_geonames_id);
    } else {
        g->country_geonames_id = 0;
    }

    token = cstring_array_get_string(tokens, GEONAMES_ADMIN1_CODE);
    char_array_cat(g->admin1_code, token);

    token = cstring_array_get_string(tokens, GEONAMES_ADMIN1_ID);
    if (strlen(token) > 0) {
        sscanf(token, "%d", &g->admin1_geonames_id);
    } else {
        g->admin1_geonames_id = 0;
    }

    token = cstring_array_get_string(tokens, GEONAMES_ADMIN2_CODE);
    char_array_cat(g->admin2_code, token);

    token = cstring_array_get_string(tokens, GEONAMES_ADMIN2_ID);
    if (strlen(token) > 0) {
        sscanf(token, "%d", &g->admin2_geonames_id);
    } else {
        g->admin2_geonames_id = 0;
    }

    token = cstring_array_get_string(tokens, GEONAMES_ADMIN3_CODE);
    char_array_cat(g->admin3_code, token);

    // The id must be parsed from its own column, not from the admin3 code token
    token = cstring_array_get_string(tokens, GEONAMES_ADMIN3_ID);
    if (strlen(token) > 0) {
        sscanf(token, "%d", &g->admin3_geonames_id);
    } else {
        g->admin3_geonames_id = 0;
    }

    token = cstring_array_get_string(tokens, GEONAMES_ADMIN4_CODE);
    char_array_cat(g->admin4_code, token);

    token = cstring_array_get_string(tokens, GEONAMES_ADMIN4_ID);
    if (strlen(token) > 0) {
        sscanf(token, "%d", &g->admin4_geonames_id);
    } else {
        g->admin4_geonames_id = 0;
    }

    cstring_array_destroy(tokens);
    return true;

exit_geoname_free_tokens:
    cstring_array_destroy(tokens);
    return false;
}
|
|
|
|
static bool read_gn_postal_code_from_line(gn_postal_code_t *postal, char *line) {
|
|
size_t token_count;
|
|
int i;
|
|
|
|
gn_postal_code_clear(postal);
|
|
|
|
char *token;
|
|
|
|
cstring_array *tokens = cstring_array_split(line, TAB_SEPARATOR, TAB_SEPARATOR_LEN, &token_count);
|
|
|
|
if (tokens == NULL) return false;
|
|
|
|
if (token_count != NUM_POSTAL_FIELDS) {
|
|
log_error("Number of fields (%d) != expected (%d)\n", token_count, NUM_POSTAL_FIELDS);
|
|
goto exit_postal_tokens_created;
|
|
}
|
|
|
|
token = cstring_array_get_string(tokens, GN_POSTAL_CODE);
|
|
if (strlen(token) == 0) {
|
|
log_error("postal_code field required\n");
|
|
goto exit_postal_tokens_created;
|
|
}
|
|
|
|
token = cstring_array_get_string(tokens, GN_POSTAL_CODE);
|
|
char_array_cat(postal->postal_code, token);
|
|
token = cstring_array_get_string(tokens, GN_POSTAL_COUNTRY_CODE);
|
|
|
|
token = cstring_array_get_string(tokens, GN_POSTAL_COUNTRY_GEONAMES_ID);
|
|
if (strlen(token) > 0) {
|
|
sscanf(token, "%d", &postal->country_geonames_id);
|
|
} else {
|
|
postal->country_geonames_id = 0;
|
|
}
|
|
|
|
char_array_cat(postal->country_code, token);
|
|
token = cstring_array_get_string(tokens, GN_POSTAL_CONTAINING_GEONAME_ID);
|
|
char_array_cat(postal->containing_geoname, token);
|
|
|
|
char *admin1_field = cstring_array_get_string(tokens, GN_POSTAL_ADMIN1_IDS);
|
|
size_t admin1_field_len = strlen(admin1_field);
|
|
|
|
if (admin1_field_len > 0) {
|
|
size_t admin1_token_count;
|
|
cstring_array *admin1_tokens = cstring_array_split(admin1_field, COMMA_SEPARATOR, COMMA_SEPARATOR_LEN, &admin1_token_count);
|
|
uint32_t admin1_id;
|
|
if (admin1_token_count > 0) {
|
|
for (i = 0; i < admin1_token_count; i++) {
|
|
char *admin1_token = cstring_array_get_string(tokens, i);
|
|
if (strlen(admin1_token) > 0) {
|
|
sscanf(admin1_token, "%u", &admin1_id);
|
|
uint32_array_push(postal->admin1_ids, admin1_id);
|
|
}
|
|
}
|
|
}
|
|
cstring_array_destroy(admin1_tokens);
|
|
}
|
|
|
|
char *admin2_field = cstring_array_get_string(tokens, GN_POSTAL_ADMIN2_IDS);
|
|
size_t admin2_field_len = strlen(admin2_field);
|
|
|
|
if (admin2_field_len > 0) {
|
|
size_t admin2_token_count;
|
|
cstring_array *admin2_tokens = cstring_array_split(admin2_field, COMMA_SEPARATOR, COMMA_SEPARATOR_LEN, &admin2_token_count);
|
|
uint32_t admin2_id;
|
|
if (admin2_token_count > 0) {
|
|
for (i = 0; i < admin2_token_count; i++) {
|
|
char *admin2_token = cstring_array_get_string(admin2_tokens, i);
|
|
if (strlen(admin2_token) > 0) {
|
|
sscanf(admin2_token, "%u", &admin2_id);
|
|
uint32_array_push(postal->admin2_ids, admin2_id);
|
|
}
|
|
}
|
|
}
|
|
cstring_array_destroy(admin2_tokens);
|
|
}
|
|
|
|
char *admin3_field = cstring_array_get_string(tokens, GN_POSTAL_ADMIN3_IDS);
|
|
size_t admin3_field_len = strlen(admin3_field);
|
|
|
|
if (admin3_field_len > 0) {
|
|
size_t admin3_token_count;
|
|
cstring_array *admin3_tokens = cstring_array_split(admin3_field, COMMA_SEPARATOR, COMMA_SEPARATOR_LEN, &admin3_token_count);
|
|
uint32_t admin3_id;
|
|
if (admin3_token_count > 0) {
|
|
for (i = 0; i < admin3_token_count; i++) {
|
|
char *admin3_token = cstring_array_get_string(admin3_tokens, i);
|
|
if (strlen(admin3_token) > 0) {
|
|
sscanf(admin3_token, "%u", &admin3_id);
|
|
uint32_array_push(postal->admin3_ids, admin3_id);
|
|
}
|
|
}
|
|
}
|
|
cstring_array_destroy(admin3_tokens);
|
|
}
|
|
|
|
cstring_array_destroy(tokens);
|
|
return true;
|
|
|
|
exit_postal_tokens_created:
|
|
cstring_array_destroy(tokens);
|
|
return false;
|
|
}
|
|
|
|
|
|
typedef struct geodb_builder {
|
|
trie_t *trie;
|
|
sparkey_logwriter *log_writer;
|
|
bloom_filter_t *bloom_filter;
|
|
} geodb_builder_t;
|
|
|
|
// Release a builder and whichever of its members are non-NULL. Passing NULL is a no-op.
void geodb_builder_destroy(geodb_builder_t *self) {
    if (self != NULL) {
        if (self->trie != NULL) trie_destroy(self->trie);
        if (self->bloom_filter != NULL) bloom_filter_destroy(self->bloom_filter);
        if (self->log_writer != NULL) sparkey_logwriter_close(&self->log_writer);

        free(self);
    }
}
|
|
|
|
geodb_builder_t *geodb_builder_new(char *log_filename) {
|
|
geodb_builder_t *builder = malloc(sizeof(geodb_builder_t));
|
|
|
|
if (builder == NULL) return NULL;
|
|
|
|
builder->trie = trie_new();
|
|
|
|
if (builder->trie == NULL) {
|
|
goto exit_destroy_builder;
|
|
}
|
|
|
|
builder->bloom_filter = bloom_filter_new(GEODB_BLOOM_FILTER_SIZE, GEODB_BLOOM_FILTER_ERROR);
|
|
if (builder->bloom_filter == NULL) {
|
|
goto exit_destroy_builder;
|
|
}
|
|
|
|
sparkey_returncode ret_code = sparkey_logwriter_create(&builder->log_writer, log_filename, SPARKEY_COMPRESSION_NONE, 0);
|
|
if (ret_code != SPARKEY_SUCCESS) {
|
|
goto exit_destroy_builder;
|
|
}
|
|
|
|
return builder;
|
|
|
|
exit_destroy_builder:
|
|
geodb_builder_destroy(builder);
|
|
return NULL;
|
|
}
|
|
|
|
uint16_t get_address_component(uint32_t boundary_type) {
|
|
if (boundary_type == GEONAMES_LOCALITY) {
|
|
return ADDRESS_LOCALITY;
|
|
} else if (boundary_type == GEONAMES_NEIGHBORHOOD) {
|
|
return ADDRESS_NEIGHBORHOOD;
|
|
} else if (boundary_type == GEONAMES_ADMIN1) {
|
|
return ADDRESS_ADMIN1;
|
|
} else if (boundary_type == GEONAMES_ADMIN2) {
|
|
return ADDRESS_ADMIN2;
|
|
} else if (boundary_type == GEONAMES_ADMIN3) {
|
|
return ADDRESS_ADMIN3;
|
|
} else if (boundary_type == GEONAMES_ADMIN4) {
|
|
return ADDRESS_ADMIN4;
|
|
} else if (boundary_type == GEONAMES_ADMIN_OTHER) {
|
|
return ADDRESS_ADMIN_OTHER;
|
|
} else {
|
|
return 0;
|
|
}
|
|
}
|
|
|
|
/*
 * Record `key` in the builder's trie.
 *
 * A key not yet present is added with count 1. For a key already present the stored
 * value is loaded and its count incremented. In both cases `address_components` is
 * OR-ed into the components bits and is_canonical is set to the given flag.
 *
 * Returns false if the builder or its trie is NULL, if the stored value cannot be
 * read, or if the underlying trie add/set call reports failure.
 */
bool geodb_builder_add_to_trie(geodb_builder_t *self, char *key, bool is_canonical, uint16_t address_components) {
    if (self == NULL || self->trie == NULL) {
        return false;
    }

    uint32_t existing_node = trie_get(self->trie, key);
    bool key_is_new = (existing_node == NULL_NODE_ID);

    geodb_value_t entry;
    entry.value = 0;

    // For a known key, start from the value already stored in the trie
    if (!key_is_new && !trie_get_data_at_index(self->trie, existing_node, &entry.value)) {
        return false;
    }

    entry.components |= address_components;
    entry.is_canonical = is_canonical;

    if (key_is_new) {
        entry.count = 1;
        return trie_add(self->trie, key, entry.value);
    }

    entry.count++;
    return trie_set_data_at_index(self->trie, existing_node, entry.value);
}
|
|
|
|
/*
 * Write the finished database files into `output_dir`:
 *   - the trie                (GEODB_TRIE_FILENAME)
 *   - the Sparkey hash index  (GEODB_HASH_FILENAME), built from the log file
 *     GEODB_LOG_FILENAME after the builder's log writer has been closed
 *   - the bloom filter        (GEODB_BLOOM_FILTER_FILENAME)
 *
 * Returns true only if all three are written. On any failure an error is logged (except
 * for allocation failures) and false is returned. The builder itself is not destroyed;
 * its log writer is closed and set to NULL.
 */
bool geodb_finalize(geodb_builder_t *self, char *output_dir) {
    if (self == NULL || output_dir == NULL) return false;

    bool finalized = false;
    char *hash_filename = NULL;
    char *log_filename = NULL;
    char *bloom_filter_path = NULL;

    // One scratch buffer is reused for every output path
    char_array *path = char_array_new_size(strlen(output_dir));
    if (path == NULL) return false;

    char_array_add_joined(path, PATH_SEPARATOR, true, 2, output_dir, GEODB_TRIE_FILENAME);
    if (!trie_save(self->trie, char_array_get_string(path))) {
        log_error("Could not save trie\n");
        goto exit_finalize_cleanup;
    }

    char_array_clear(path);
    char_array_add_joined(path, PATH_SEPARATOR, true, 2, output_dir, GEODB_HASH_FILENAME);

    // Copied because the scratch buffer is overwritten with the log filename below
    // while the hash filename is still needed.
    hash_filename = strdup(char_array_get_string(path));
    if (hash_filename == NULL) {
        goto exit_finalize_cleanup;
    }

    char_array_clear(path);
    char_array_add_joined(path, PATH_SEPARATOR, true, 2, output_dir, GEODB_LOG_FILENAME);
    log_filename = char_array_get_string(path);

    // Close the writer before the hash index is built from the log file
    if (self->log_writer != NULL) {
        sparkey_logwriter_close(&self->log_writer);
        self->log_writer = NULL;
    }

    if (sparkey_hash_write(hash_filename, log_filename, 0) != SPARKEY_SUCCESS) {
        log_error("Could not write Sparkey hash file\n");
        goto exit_finalize_cleanup;
    }

    char_array_clear(path);
    char_array_add_joined(path, PATH_SEPARATOR, true, 2, output_dir, GEODB_BLOOM_FILTER_FILENAME);
    bloom_filter_path = char_array_get_string(path);

    if (!bloom_filter_save(self->bloom_filter, bloom_filter_path)) {
        log_error("Could not save bloom filter\n");
        goto exit_finalize_cleanup;
    }

    finalized = true;

exit_finalize_cleanup:
    free(hash_filename);
    char_array_destroy(path);
    return finalized;
}
|
|
|
|
// True when `name` is exactly two or three bytes long and string_is_upper() accepts it.
bool name_is_iso_code(char *name) {
    switch (strlen(name)) {
        case 2:
        case 3:
            return string_is_upper(name);
        default:
            return false;
    }
}
|
|
|
|
/*
 * Read the GeoNames TSV at `filename` line by line and feed every record into the
 * builder:
 *   - the normalized name goes into the trie (country names that look like ISO codes
 *     are kept verbatim, everything else is decomposed/lowercased/trimmed);
 *   - the serialized record is written to Sparkey under its id, once per id;
 *   - every disambiguation feature is written to Sparkey and the bloom filter the
 *     first time it is seen for the current run of identical names.
 *
 * Consecutive lines with the same normalized name are treated as one group
 * (presumably the input is sorted by name -- confirm against the TSV generator).
 *
 * Any unrecoverable error (unreadable file, malformed line, failed normalization,
 * serialization or record write) terminates the process.
 */
void import_geonames(geodb_builder_t *self, char *filename) {
    FILE *f = fopen(filename, "r");
    if (f == NULL) {
        printf("Couldn't open file\n");
        exit(1);
    }

    char *line;
    char *prev_name = NULL;
    geoname_t *g = geoname_new();

    char_array *serialized = char_array_new();

    // Just a set of all ids in GeoNames so we only add keys once, takes up < 50MB
    khash_t(int_set) *all_ids = kh_init(int_set);

    // Ids and features already seen for the current name group
    khash_t(int_set) *distinct_ids = kh_init(int_set);
    khash_t(str_set) *distinct_features = kh_init(str_set);

    khiter_t key;
    int ret;

    cstring_array *geo_features = cstring_array_new();

    char id_string[INT32_MAX_STRING_SIZE];

    int normalize_utf8_options = NORMALIZE_STRING_DECOMPOSE | NORMALIZE_STRING_LOWERCASE | NORMALIZE_STRING_TRIM;

    uint32_array *ordered_ids = uint32_array_new();
    char_array *ordered_ids_str = char_array_new();

    cmp_ctx_t ctx;
    msgpack_buffer_t buffer = (msgpack_buffer_t){ordered_ids_str, 0};

    cmp_init(&ctx, &buffer, msgpack_bytes_reader, msgpack_bytes_writer);

    int i = 0;

    while ((line = file_getline(f)) != NULL) {
        // A record that fails to parse must not be imported half-filled
        if (!read_geoname_from_line(g, line)) {
            log_error("Error reading line: %s\n", line);
            exit(EXIT_FAILURE);
        }

        char *name = char_array_get_string(g->name);
        char *canonical = char_array_get_string(g->canonical);

        bool is_canonical = strcmp(name, canonical) == 0;

        char *utf8_normalized = NULL;

        snprintf(id_string, sizeof id_string, "%d", g->geonames_id);
        size_t id_len = strlen(id_string);

        if (g->type == GEONAMES_COUNTRY && name_is_iso_code(name)) {
            utf8_normalized = strdup(name);
        } else if (name != NULL) {
            utf8_normalized = normalize_string_utf8(name, normalize_utf8_options);
        }

        if (utf8_normalized != NULL && (prev_name == NULL || strcmp(utf8_normalized, prev_name) != 0)) {
            // New name
            geodb_builder_add_to_trie(self, utf8_normalized, is_canonical, get_address_component(g->type));

            // NOTE(review): ordered_ids holds the ids collected for the PREVIOUS name at
            // this point, yet they are stored under the new name, and the ids of the last
            // group in the file are never written. Confirm whether the key should be
            // prev_name and whether a final flush after the loop is needed.
            cmp_write_uint_vector(&ctx, ordered_ids);

            if ((sparkey_logwriter_put(self->log_writer, strlen(utf8_normalized), (uint8_t *)utf8_normalized, ordered_ids_str->n - 1, (uint8_t *)char_array_get_string(ordered_ids_str))) != SPARKEY_SUCCESS) {
                log_error("Error writing ids string to Sparkey\n");
                exit(EXIT_FAILURE);
            }

            uint32_array_clear(ordered_ids);
            char_array_clear(ordered_ids_str);
            kh_clear(int_set, distinct_ids);
            kh_clear(str_set, distinct_features);
        } else if (utf8_normalized != NULL) {
            // Same name as the previous line: only count ids not seen in this group yet
            key = kh_get(int_set, distinct_ids, g->geonames_id);
            if (key == kh_end(distinct_ids)) {
                geodb_builder_add_to_trie(self, utf8_normalized, is_canonical, get_address_component(g->type));
            }
        } else {
            log_error("normalization failed for name %s\n", name);
            exit(EXIT_FAILURE);
        }

        char_array_clear(serialized);

        if (!geoname_serialize(g, serialized)) {
            log_error("geoname_serialize failed for id=%d\n", g->geonames_id);
            exit(EXIT_FAILURE);
        }

        // Store the serialized record under its id the first time the id is seen
        key = kh_get(int_set, all_ids, g->geonames_id);
        if (key == kh_end(all_ids)) {
            if ((sparkey_logwriter_put(self->log_writer, id_len, (uint8_t *)id_string, serialized->n, (uint8_t *)char_array_get_string(serialized))) != SPARKEY_SUCCESS) {
                log_error("Error writing to Sparkey with id=%d\n", g->geonames_id);
                exit(EXIT_FAILURE);
            }

            key = kh_put(int_set, all_ids, g->geonames_id, &ret);
        }

        key = kh_get(int_set, distinct_ids, g->geonames_id);
        if (key == kh_end(distinct_ids)) {
            uint32_array_push(ordered_ids, g->geonames_id);
        }

        key = kh_put(int_set, distinct_ids, g->geonames_id, &ret);

        // Features are generated from the normalized name, so swap it into the record
        char_array_clear(g->name);
        char_array_cat(g->name, utf8_normalized);

        cstring_array_clear(geo_features);

        if (!geodisambig_add_geoname_features(geo_features, g)) {
            log_error("Could not add geonames features for id=%d\n", g->geonames_id);
            exit(EXIT_FAILURE);
        }

        size_t num_features = cstring_array_num_strings(geo_features);
        for (size_t j = 0; j < num_features; j++) {
            char *token = cstring_array_get_string(geo_features, j);
            key = kh_get(str_set, distinct_features, token);
            if (key == kh_end(distinct_features)) {
                // Not in set, this GeoName takes priority
                if (sparkey_logwriter_put(self->log_writer, strlen(token), (uint8_t *)token, id_len, (uint8_t *)id_string) != SPARKEY_SUCCESS) {
                    log_error("Error writing key %s to Sparkey\n", token);
                }

                bloom_filter_add(self->bloom_filter, token, strlen(token));

                // NOTE(review): the set is handed the pointer returned by
                // cstring_array_get_string, while geo_features is cleared and refilled on
                // every line. If the set does not copy its keys they may dangle within a
                // name group -- confirm.
                key = kh_put(str_set, distinct_features, token, &ret);
            }
        }

        // utf8_normalized is never NULL here (that case exited above); it becomes the
        // reference name for the next line and is freed one iteration later.
        free(prev_name);
        prev_name = utf8_normalized;

        free(line);
        i++;

        if (i % 1000 == 0) {
            log_info("Did %d geonames\n", i);
        }
    }

    free(prev_name);

    kh_destroy(int_set, all_ids);
    kh_destroy(int_set, distinct_ids);
    kh_destroy(str_set, distinct_features);

    uint32_array_destroy(ordered_ids);
    char_array_destroy(ordered_ids_str);
    char_array_destroy(serialized);

    cstring_array_destroy(geo_features);

    geoname_destroy(g);
    fclose(f);
}
|
|
|
|
/*
 * Read the postal code TSV at `filename` line by line and feed every record into the
 * builder:
 *   - the lowercased postal code goes into the trie as ADDRESS_POSTAL_CODE;
 *   - every disambiguation feature is written to Sparkey (with the serialized record
 *     as its value) and to the bloom filter, the first time it is seen for the current
 *     run of identical postal codes.
 *
 * Any unrecoverable error (unreadable file, malformed line, failed normalization,
 * serialization or feature generation) terminates the process. A failed feature write
 * is only logged.
 */
void import_geonames_postal_codes(geodb_builder_t *self, char *filename) {
    FILE *f = fopen(filename, "r");
    if (f == NULL) {
        printf("Couldn't open file\n");
        exit(1);
    }

    char *line;
    char *prev_code = NULL;
    gn_postal_code_t *pc = gn_postal_code_new();

    char_array *serialized = char_array_new();

    cstring_array *postal_code_features = cstring_array_new();

    // Features already written for the current postal code
    khash_t(str_set) *distinct_features = kh_init(str_set);

    khiter_t key;
    int ret;

    int i = 0;

    // Always true for postal codes
    bool is_canonical = true;

    while ((line = file_getline(f)) != NULL) {
        if (!read_gn_postal_code_from_line(pc, line)) {
            log_error("Error reading line: %s\n", line);
            exit(EXIT_FAILURE);
        }

        char *code = char_array_get_string(pc->postal_code);
        char *utf8_normalized = normalize_string_utf8(code, NORMALIZE_STRING_LOWERCASE);

        if (utf8_normalized == NULL) {
            log_error("normalization failed for postal code %s\n", code);
            exit(EXIT_FAILURE);
        }

        // A different code than on the previous line starts a new feature group
        if (prev_code == NULL || strcmp(utf8_normalized, prev_code) != 0) {
            kh_clear(str_set, distinct_features);
        }

        geodb_builder_add_to_trie(self, utf8_normalized, is_canonical, ADDRESS_POSTAL_CODE);

        char_array_clear(serialized);
        if (!gn_postal_code_serialize(pc, serialized)) {
            log_error("gn_postal_code_serialize failed for postal code=%s\n", code);
            exit(EXIT_FAILURE);
        }

        cstring_array_clear(postal_code_features);

        // Features are generated from the normalized code, so swap it into the record.
        // `code` points into the buffer being rewritten here and is not used afterwards.
        char_array_clear(pc->postal_code);
        char_array_cat(pc->postal_code, utf8_normalized);

        if (!geodisambig_add_postal_code_features(postal_code_features, pc)) {
            log_error("Could not add geonames features for postal code=%s\n", utf8_normalized);
            exit(EXIT_FAILURE);
        }

        size_t num_features = cstring_array_num_strings(postal_code_features);
        for (size_t j = 0; j < num_features; j++) {
            char *token = cstring_array_get_string(postal_code_features, j);
            key = kh_get(str_set, distinct_features, token);
            if (key == kh_end(distinct_features)) {
                // Not in set, this GeoName takes priority
                if (sparkey_logwriter_put(self->log_writer, strlen(token), (uint8_t *)token, serialized->n, (uint8_t *)char_array_get_string(serialized)) != SPARKEY_SUCCESS) {
                    log_error("Error writing key %s to Sparkey\n", token);
                }

                bloom_filter_add(self->bloom_filter, token, strlen(token));

                // NOTE(review): the set is handed the pointer returned by
                // cstring_array_get_string, while postal_code_features is cleared and
                // refilled on every line. If the set does not copy its keys they may
                // dangle within a group -- confirm.
                key = kh_put(str_set, distinct_features, token, &ret);
            }
        }

        // utf8_normalized is never NULL here; it becomes the reference code for the next
        // line and is freed one iteration later.
        free(prev_code);
        prev_code = utf8_normalized;

        free(line);
        i++;

        if (i % 1000 == 0) {
            log_info("Did %d postal codes\n", i);
        }
    }

    free(prev_code);

    kh_destroy(str_set, distinct_features);
    char_array_destroy(serialized);
    cstring_array_destroy(postal_code_features);

    gn_postal_code_destroy(pc);

    fclose(f);
}
|
|
|
|
int main(int argc, char **argv) {
|
|
char *input_dir;
|
|
char *output_dir;
|
|
if (argc > 2) {
|
|
input_dir = argv[1];
|
|
output_dir = argv[2];
|
|
} else {
|
|
input_dir = LIBPOSTAL_GEONAMES_DIR;
|
|
output_dir = LIBPOSTAL_GEODB_DIR;
|
|
}
|
|
|
|
char *geonames_filename = "geonames.tsv";
|
|
|
|
char_array *path = char_array_new_size(strlen(input_dir));
|
|
|
|
char_array_add_joined(path, PATH_SEPARATOR, true, 2, input_dir, geonames_filename);
|
|
char *geonames_path = strdup(char_array_get_string(path));
|
|
|
|
char_array_clear(path);
|
|
|
|
char_array_add_joined(path, PATH_SEPARATOR, true, 2, output_dir, GEODB_LOG_FILENAME);
|
|
char *log_filename = char_array_get_string(path);
|
|
|
|
geodb_builder_t *builder = geodb_builder_new(log_filename);
|
|
|
|
import_geonames(builder, geonames_path);
|
|
|
|
free(geonames_path);
|
|
|
|
printf("\n\n");
|
|
|
|
char *postal_codes_filename = "postal_codes.tsv";
|
|
|
|
char_array_clear(path);
|
|
|
|
char_array_add_joined(path, PATH_SEPARATOR, true, 2, input_dir, postal_codes_filename);
|
|
char *postal_codes_path = char_array_get_string(path);
|
|
|
|
log_info("Doing postal_codes\n");
|
|
|
|
import_geonames_postal_codes(builder, postal_codes_path);
|
|
|
|
char_array_destroy(path);
|
|
|
|
if (!geodb_finalize(builder, output_dir)) {
|
|
exit(EXIT_FAILURE);
|
|
}
|
|
geodb_builder_destroy(builder);
|
|
|
|
exit(EXIT_SUCCESS);
|
|
|
|
}
|