[geodb] Adding some logging to geodb
This commit is contained in:
@@ -578,6 +578,8 @@ void import_geonames(geodb_builder_t *self, char *filename) {
|
|||||||
//int normalize_latin_options = normalize_utf8_options | NORMALIZE_STRING_LATIN_ASCII;
|
//int normalize_latin_options = normalize_utf8_options | NORMALIZE_STRING_LATIN_ASCII;
|
||||||
|
|
||||||
int i = 0;
|
int i = 0;
|
||||||
|
int ambiguous = 0;
|
||||||
|
int disambiguations = 0;
|
||||||
|
|
||||||
while ((line = file_getline(f)) != NULL) {
|
while ((line = file_getline(f)) != NULL) {
|
||||||
read_geoname_from_line(g, line);
|
read_geoname_from_line(g, line);
|
||||||
@@ -604,6 +606,7 @@ void import_geonames(geodb_builder_t *self, char *filename) {
|
|||||||
|
|
||||||
// Only add disambiguation features if there's > 1 id for this name
|
// Only add disambiguation features if there's > 1 id for this name
|
||||||
if (kh_size(distinct_ids) > 1) {
|
if (kh_size(distinct_ids) > 1) {
|
||||||
|
ambiguous++;
|
||||||
|
|
||||||
uint32_t string_index = 0;
|
uint32_t string_index = 0;
|
||||||
uint32_t lengths_index = 0;
|
uint32_t lengths_index = 0;
|
||||||
@@ -611,6 +614,7 @@ void import_geonames(geodb_builder_t *self, char *filename) {
|
|||||||
uint32_t geonames_id;
|
uint32_t geonames_id;
|
||||||
|
|
||||||
kh_foreach_key(distinct_ids, key, {
|
kh_foreach_key(distinct_ids, key, {
|
||||||
|
disambiguations++;
|
||||||
uint32_t length = feature_lengths->a[lengths_index];
|
uint32_t length = feature_lengths->a[lengths_index];
|
||||||
for (int i = 0; i < length; i++) {
|
for (int i = 0; i < length; i++) {
|
||||||
char *token = cstring_array_get_string(geo_features, string_index);
|
char *token = cstring_array_get_string(geo_features, string_index);
|
||||||
@@ -665,19 +669,22 @@ void import_geonames(geodb_builder_t *self, char *filename) {
|
|||||||
if (key == kh_end(distinct_ids)) {
|
if (key == kh_end(distinct_ids)) {
|
||||||
key = kh_put(int_set, distinct_ids, g->geonames_id, &ret);
|
key = kh_put(int_set, distinct_ids, g->geonames_id, &ret);
|
||||||
if (ret < 0) {
|
if (ret < 0) {
|
||||||
log_error("ret < 0\n");
|
log_error("Error adding id %d to set\n", g->geonames_id);
|
||||||
|
exit(EXIT_FAILURE);
|
||||||
}
|
}
|
||||||
|
|
||||||
char_array_clear(g->name);
|
char_array_clear(g->name);
|
||||||
char_array_cat(g->name, utf8_normalized);
|
char_array_cat(g->name, utf8_normalized);
|
||||||
|
|
||||||
size_t num_geo_features = cstring_array_num_strings(geo_features);
|
size_t prev_num_geo_features = cstring_array_num_strings(geo_features);
|
||||||
|
|
||||||
if (!geodisambig_add_geoname_features(geo_features, g)) {
|
if (!geodisambig_add_geoname_features(geo_features, g)) {
|
||||||
log_error("Could not add geonames features for id=%d\n", g->geonames_id);
|
log_error("Could not add geonames features for id=%d\n", g->geonames_id);
|
||||||
exit(EXIT_FAILURE);
|
exit(EXIT_FAILURE);
|
||||||
}
|
}
|
||||||
uint32_t feature_length = (uint32_t)(cstring_array_num_strings(geo_features) - num_geo_features);
|
|
||||||
|
uint32_t num_geo_features = cstring_array_num_strings(geo_features);
|
||||||
|
uint32_t feature_length = (uint32_t)(num_geo_features - prev_num_geo_features);
|
||||||
uint32_array_push(feature_lengths, feature_length);
|
uint32_array_push(feature_lengths, feature_length);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -694,7 +701,7 @@ void import_geonames(geodb_builder_t *self, char *filename) {
|
|||||||
i++;
|
i++;
|
||||||
|
|
||||||
if (i % 1000 == 0) {
|
if (i % 1000 == 0) {
|
||||||
log_info("Did %d geonames\n", i);
|
log_info("Did %d geonames, %d ambiguous, %d disambiguations, names=%d, features=%d\n", i, ambiguous, disambiguations, self->names->num_keys, self->features->num_keys);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user