[geodb] Adding separate bitset for geonames place types and using NFC normalization instead of NFD (requires retraining)

This commit is contained in:
Al
2016-05-29 01:36:00 -04:00
parent 6c39c663ff
commit c2ee5a45b3
2 changed files with 21 additions and 10 deletions

View File

@@ -379,23 +379,24 @@ exit_destroy_builder:
/*
Map of geonames boundary types to address components
*/
uint16_t get_address_component(uint32_t boundary_type) {
if (boundary_type == GEONAMES_LOCALITY) {
return ADDRESS_LOCALITY;
return GEONAMES_ADDRESS_COMPONENT_LOCALITY;
} else if (boundary_type == GEONAMES_NEIGHBORHOOD) {
return ADDRESS_NEIGHBORHOOD;
return GEONAMES_ADDRESS_COMPONENT_NEIGHBORHOOD;
} else if (boundary_type == GEONAMES_ADMIN1) {
return ADDRESS_ADMIN1;
return GEONAMES_ADDRESS_COMPONENT_ADMIN1;
} else if (boundary_type == GEONAMES_COUNTRY) {
return ADDRESS_COUNTRY;
return GEONAMES_ADDRESS_COMPONENT_COUNTRY;
} else if (boundary_type == GEONAMES_ADMIN2) {
return ADDRESS_ADMIN2;
return GEONAMES_ADDRESS_COMPONENT_ADMIN2;
} else if (boundary_type == GEONAMES_ADMIN3) {
return ADDRESS_ADMIN3;
return GEONAMES_ADDRESS_COMPONENT_ADMIN3;
} else if (boundary_type == GEONAMES_ADMIN4) {
return ADDRESS_ADMIN4;
return GEONAMES_ADDRESS_COMPONENT_ADMIN4;
} else if (boundary_type == GEONAMES_ADMIN_OTHER) {
return ADDRESS_ADMIN_OTHER;
return GEONAMES_ADDRESS_COMPONENT_ADMIN_OTHER;
} else {
return 0;
}
@@ -574,7 +575,7 @@ void import_geonames(geodb_builder_t *self, char *filename) {
char id_string[INT32_MAX_STRING_SIZE + 1];
int normalize_utf8_options = NORMALIZE_STRING_DECOMPOSE | NORMALIZE_STRING_LOWERCASE | NORMALIZE_STRING_TRIM;
int normalize_utf8_options = NORMALIZE_STRING_COMPOSE | NORMALIZE_STRING_LOWERCASE | NORMALIZE_STRING_TRIM;
//int normalize_latin_options = normalize_utf8_options | NORMALIZE_STRING_LATIN_ASCII;
int i = 0;
@@ -764,7 +765,7 @@ void import_geonames_postal_codes(geodb_builder_t *self, char *filename) {
exit(EXIT_FAILURE);
}
geodb_builder_add_name(self, utf8_normalized, is_canonical, ADDRESS_POSTAL_CODE);
geodb_builder_add_name(self, utf8_normalized, is_canonical, GEONAMES_ADDRESS_COMPONENT_POSTCODE);
char_array_clear(serialized);
if (!gn_postal_code_serialize(pc, serialized)) {

View File

@@ -25,6 +25,16 @@ typedef enum {
NUM_BOUNDARY_TYPES
} boundary_type_t;
/*
Bit flags for geonames address component types.

Each constant sets exactly one distinct bit (bits 0 through 8), so several
components can be combined into one mask with bitwise OR and tested
individually with bitwise AND. The highest flag, POSTCODE, is 1 << 8 (256),
so a mask holding every flag needs at least 9 bits.
*/
#define GEONAMES_ADDRESS_COMPONENT_COUNTRY (1 << 0)
#define GEONAMES_ADDRESS_COMPONENT_ADMIN1 (1 << 1)
#define GEONAMES_ADDRESS_COMPONENT_ADMIN2 (1 << 2)
#define GEONAMES_ADDRESS_COMPONENT_ADMIN3 (1 << 3)
#define GEONAMES_ADDRESS_COMPONENT_ADMIN4 (1 << 4)
#define GEONAMES_ADDRESS_COMPONENT_ADMIN_OTHER (1 << 5)
#define GEONAMES_ADDRESS_COMPONENT_LOCALITY (1 << 6)
#define GEONAMES_ADDRESS_COMPONENT_NEIGHBORHOOD (1 << 7)
#define GEONAMES_ADDRESS_COMPONENT_POSTCODE (1 << 8)
typedef struct geoname {
uint32_t geonames_id;
char_array *name;