[geodb] Adding separate bitset for geonames place types and using NFC normalization instead of NFD (requires retraining)

This commit is contained in:
Al
2016-05-29 01:36:00 -04:00
parent 6c39c663ff
commit c2ee5a45b3
2 changed files with 21 additions and 10 deletions

View File

@@ -379,23 +379,24 @@ exit_destroy_builder:
/* /*
Map of geonames boundary types to address components Map of geonames boundary types to address components
*/ */
uint16_t get_address_component(uint32_t boundary_type) { uint16_t get_address_component(uint32_t boundary_type) {
if (boundary_type == GEONAMES_LOCALITY) { if (boundary_type == GEONAMES_LOCALITY) {
return ADDRESS_LOCALITY; return GEONAMES_ADDRESS_COMPONENT_LOCALITY;
} else if (boundary_type == GEONAMES_NEIGHBORHOOD) { } else if (boundary_type == GEONAMES_NEIGHBORHOOD) {
return ADDRESS_NEIGHBORHOOD; return GEONAMES_ADDRESS_COMPONENT_NEIGHBORHOOD;
} else if (boundary_type == GEONAMES_ADMIN1) { } else if (boundary_type == GEONAMES_ADMIN1) {
return ADDRESS_ADMIN1; return GEONAMES_ADDRESS_COMPONENT_ADMIN1;
} else if (boundary_type == GEONAMES_COUNTRY) { } else if (boundary_type == GEONAMES_COUNTRY) {
return ADDRESS_COUNTRY; return GEONAMES_ADDRESS_COMPONENT_COUNTRY;
} else if (boundary_type == GEONAMES_ADMIN2) { } else if (boundary_type == GEONAMES_ADMIN2) {
return ADDRESS_ADMIN2; return GEONAMES_ADDRESS_COMPONENT_ADMIN2;
} else if (boundary_type == GEONAMES_ADMIN3) { } else if (boundary_type == GEONAMES_ADMIN3) {
return ADDRESS_ADMIN3; return GEONAMES_ADDRESS_COMPONENT_ADMIN3;
} else if (boundary_type == GEONAMES_ADMIN4) { } else if (boundary_type == GEONAMES_ADMIN4) {
return ADDRESS_ADMIN4; return GEONAMES_ADDRESS_COMPONENT_ADMIN4;
} else if (boundary_type == GEONAMES_ADMIN_OTHER) { } else if (boundary_type == GEONAMES_ADMIN_OTHER) {
return ADDRESS_ADMIN_OTHER; return GEONAMES_ADDRESS_COMPONENT_ADMIN_OTHER;
} else { } else {
return 0; return 0;
} }
@@ -574,7 +575,7 @@ void import_geonames(geodb_builder_t *self, char *filename) {
char id_string[INT32_MAX_STRING_SIZE + 1]; char id_string[INT32_MAX_STRING_SIZE + 1];
int normalize_utf8_options = NORMALIZE_STRING_DECOMPOSE | NORMALIZE_STRING_LOWERCASE | NORMALIZE_STRING_TRIM; int normalize_utf8_options = NORMALIZE_STRING_COMPOSE | NORMALIZE_STRING_LOWERCASE | NORMALIZE_STRING_TRIM;
//int normalize_latin_options = normalize_utf8_options | NORMALIZE_STRING_LATIN_ASCII; //int normalize_latin_options = normalize_utf8_options | NORMALIZE_STRING_LATIN_ASCII;
int i = 0; int i = 0;
@@ -764,7 +765,7 @@ void import_geonames_postal_codes(geodb_builder_t *self, char *filename) {
exit(EXIT_FAILURE); exit(EXIT_FAILURE);
} }
geodb_builder_add_name(self, utf8_normalized, is_canonical, ADDRESS_POSTAL_CODE); geodb_builder_add_name(self, utf8_normalized, is_canonical, GEONAMES_ADDRESS_COMPONENT_POSTCODE);
char_array_clear(serialized); char_array_clear(serialized);
if (!gn_postal_code_serialize(pc, serialized)) { if (!gn_postal_code_serialize(pc, serialized)) {

View File

@@ -25,6 +25,16 @@ typedef enum {
NUM_BOUNDARY_TYPES NUM_BOUNDARY_TYPES
} boundary_type_t; } boundary_type_t;
/*
Bit flags naming the address component a geonames entry stands for.
Each constant is a distinct single bit (1 << 0 through 1 << 8), so
several can be OR'ed together into one bitset; the highest bit used
is bit 8, so the full set fits in a 16-bit value.
*/
#define GEONAMES_ADDRESS_COMPONENT_COUNTRY (1 << 0)
#define GEONAMES_ADDRESS_COMPONENT_ADMIN1 (1 << 1)
#define GEONAMES_ADDRESS_COMPONENT_ADMIN2 (1 << 2)
#define GEONAMES_ADDRESS_COMPONENT_ADMIN3 (1 << 3)
#define GEONAMES_ADDRESS_COMPONENT_ADMIN4 (1 << 4)
#define GEONAMES_ADDRESS_COMPONENT_ADMIN_OTHER (1 << 5)
#define GEONAMES_ADDRESS_COMPONENT_LOCALITY (1 << 6)
#define GEONAMES_ADDRESS_COMPONENT_NEIGHBORHOOD (1 << 7)
#define GEONAMES_ADDRESS_COMPONENT_POSTCODE (1 << 8)
typedef struct geoname { typedef struct geoname {
uint32_t geonames_id; uint32_t geonames_id;
char_array *name; char_array *name;