[geodb] Adding separate bitset for geonames place types and using NFC normalization instead of NFD (requires retraining)
This commit is contained in:
@@ -379,23 +379,24 @@ exit_destroy_builder:
|
||||
/*
|
||||
Map of geonames boundary types to address components
|
||||
*/
|
||||
|
||||
uint16_t get_address_component(uint32_t boundary_type) {
|
||||
if (boundary_type == GEONAMES_LOCALITY) {
|
||||
return ADDRESS_LOCALITY;
|
||||
return GEONAMES_ADDRESS_COMPONENT_LOCALITY;
|
||||
} else if (boundary_type == GEONAMES_NEIGHBORHOOD) {
|
||||
return ADDRESS_NEIGHBORHOOD;
|
||||
return GEONAMES_ADDRESS_COMPONENT_NEIGHBORHOOD;
|
||||
} else if (boundary_type == GEONAMES_ADMIN1) {
|
||||
return ADDRESS_ADMIN1;
|
||||
return GEONAMES_ADDRESS_COMPONENT_ADMIN1;
|
||||
} else if (boundary_type == GEONAMES_COUNTRY) {
|
||||
return ADDRESS_COUNTRY;
|
||||
return GEONAMES_ADDRESS_COMPONENT_COUNTRY;
|
||||
} else if (boundary_type == GEONAMES_ADMIN2) {
|
||||
return ADDRESS_ADMIN2;
|
||||
return GEONAMES_ADDRESS_COMPONENT_ADMIN2;
|
||||
} else if (boundary_type == GEONAMES_ADMIN3) {
|
||||
return ADDRESS_ADMIN3;
|
||||
return GEONAMES_ADDRESS_COMPONENT_ADMIN3;
|
||||
} else if (boundary_type == GEONAMES_ADMIN4) {
|
||||
return ADDRESS_ADMIN4;
|
||||
return GEONAMES_ADDRESS_COMPONENT_ADMIN4;
|
||||
} else if (boundary_type == GEONAMES_ADMIN_OTHER) {
|
||||
return ADDRESS_ADMIN_OTHER;
|
||||
return GEONAMES_ADDRESS_COMPONENT_ADMIN_OTHER;
|
||||
} else {
|
||||
return 0;
|
||||
}
|
||||
@@ -574,7 +575,7 @@ void import_geonames(geodb_builder_t *self, char *filename) {
|
||||
|
||||
char id_string[INT32_MAX_STRING_SIZE + 1];
|
||||
|
||||
int normalize_utf8_options = NORMALIZE_STRING_DECOMPOSE | NORMALIZE_STRING_LOWERCASE | NORMALIZE_STRING_TRIM;
|
||||
int normalize_utf8_options = NORMALIZE_STRING_COMPOSE | NORMALIZE_STRING_LOWERCASE | NORMALIZE_STRING_TRIM;
|
||||
//int normalize_latin_options = normalize_utf8_options | NORMALIZE_STRING_LATIN_ASCII;
|
||||
|
||||
int i = 0;
|
||||
@@ -764,7 +765,7 @@ void import_geonames_postal_codes(geodb_builder_t *self, char *filename) {
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
geodb_builder_add_name(self, utf8_normalized, is_canonical, ADDRESS_POSTAL_CODE);
|
||||
geodb_builder_add_name(self, utf8_normalized, is_canonical, GEONAMES_ADDRESS_COMPONENT_POSTCODE);
|
||||
|
||||
char_array_clear(serialized);
|
||||
if (!gn_postal_code_serialize(pc, serialized)) {
|
||||
|
||||
@@ -25,6 +25,16 @@ typedef enum {
|
||||
NUM_BOUNDARY_TYPES
|
||||
} boundary_type_t;
|
||||
|
||||
/* Bit flags for geonames address components: each constant below sets a
   single distinct bit (bits 0 through 8), so values can be OR-ed together. */
#define GEONAMES_ADDRESS_COMPONENT_COUNTRY (1 << 0)
|
||||
#define GEONAMES_ADDRESS_COMPONENT_ADMIN1 (1 << 1)
|
||||
#define GEONAMES_ADDRESS_COMPONENT_ADMIN2 (1 << 2)
|
||||
#define GEONAMES_ADDRESS_COMPONENT_ADMIN3 (1 << 3)
|
||||
#define GEONAMES_ADDRESS_COMPONENT_ADMIN4 (1 << 4)
|
||||
#define GEONAMES_ADDRESS_COMPONENT_ADMIN_OTHER (1 << 5)
|
||||
#define GEONAMES_ADDRESS_COMPONENT_LOCALITY (1 << 6)
|
||||
#define GEONAMES_ADDRESS_COMPONENT_NEIGHBORHOOD (1 << 7)
|
||||
#define GEONAMES_ADDRESS_COMPONENT_POSTCODE (1 << 8)
|
||||
|
||||
typedef struct geoname {
|
||||
uint32_t geonames_id;
|
||||
char_array *name;
|
||||
|
||||
Reference in New Issue
Block a user