[geodb] Adding separate bitset for geonames place types and using NFC normalization instead of NFD (requires retraining)
This commit is contained in:
@@ -379,23 +379,24 @@ exit_destroy_builder:
|
|||||||
/*
|
/*
|
||||||
Map of geonames boundary types to address components
|
Map of geonames boundary types to address components
|
||||||
*/
|
*/
|
||||||
|
|
||||||
uint16_t get_address_component(uint32_t boundary_type) {
|
uint16_t get_address_component(uint32_t boundary_type) {
|
||||||
if (boundary_type == GEONAMES_LOCALITY) {
|
if (boundary_type == GEONAMES_LOCALITY) {
|
||||||
return ADDRESS_LOCALITY;
|
return GEONAMES_ADDRESS_COMPONENT_LOCALITY;
|
||||||
} else if (boundary_type == GEONAMES_NEIGHBORHOOD) {
|
} else if (boundary_type == GEONAMES_NEIGHBORHOOD) {
|
||||||
return ADDRESS_NEIGHBORHOOD;
|
return GEONAMES_ADDRESS_COMPONENT_NEIGHBORHOOD;
|
||||||
} else if (boundary_type == GEONAMES_ADMIN1) {
|
} else if (boundary_type == GEONAMES_ADMIN1) {
|
||||||
return ADDRESS_ADMIN1;
|
return GEONAMES_ADDRESS_COMPONENT_ADMIN1;
|
||||||
} else if (boundary_type == GEONAMES_COUNTRY) {
|
} else if (boundary_type == GEONAMES_COUNTRY) {
|
||||||
return ADDRESS_COUNTRY;
|
return GEONAMES_ADDRESS_COMPONENT_COUNTRY;
|
||||||
} else if (boundary_type == GEONAMES_ADMIN2) {
|
} else if (boundary_type == GEONAMES_ADMIN2) {
|
||||||
return ADDRESS_ADMIN2;
|
return GEONAMES_ADDRESS_COMPONENT_ADMIN2;
|
||||||
} else if (boundary_type == GEONAMES_ADMIN3) {
|
} else if (boundary_type == GEONAMES_ADMIN3) {
|
||||||
return ADDRESS_ADMIN3;
|
return GEONAMES_ADDRESS_COMPONENT_ADMIN3;
|
||||||
} else if (boundary_type == GEONAMES_ADMIN4) {
|
} else if (boundary_type == GEONAMES_ADMIN4) {
|
||||||
return ADDRESS_ADMIN4;
|
return GEONAMES_ADDRESS_COMPONENT_ADMIN4;
|
||||||
} else if (boundary_type == GEONAMES_ADMIN_OTHER) {
|
} else if (boundary_type == GEONAMES_ADMIN_OTHER) {
|
||||||
return ADDRESS_ADMIN_OTHER;
|
return GEONAMES_ADDRESS_COMPONENT_ADMIN_OTHER;
|
||||||
} else {
|
} else {
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
@@ -574,7 +575,7 @@ void import_geonames(geodb_builder_t *self, char *filename) {
|
|||||||
|
|
||||||
char id_string[INT32_MAX_STRING_SIZE + 1];
|
char id_string[INT32_MAX_STRING_SIZE + 1];
|
||||||
|
|
||||||
int normalize_utf8_options = NORMALIZE_STRING_DECOMPOSE | NORMALIZE_STRING_LOWERCASE | NORMALIZE_STRING_TRIM;
|
int normalize_utf8_options = NORMALIZE_STRING_COMPOSE | NORMALIZE_STRING_LOWERCASE | NORMALIZE_STRING_TRIM;
|
||||||
//int normalize_latin_options = normalize_utf8_options | NORMALIZE_STRING_LATIN_ASCII;
|
//int normalize_latin_options = normalize_utf8_options | NORMALIZE_STRING_LATIN_ASCII;
|
||||||
|
|
||||||
int i = 0;
|
int i = 0;
|
||||||
@@ -764,7 +765,7 @@ void import_geonames_postal_codes(geodb_builder_t *self, char *filename) {
|
|||||||
exit(EXIT_FAILURE);
|
exit(EXIT_FAILURE);
|
||||||
}
|
}
|
||||||
|
|
||||||
geodb_builder_add_name(self, utf8_normalized, is_canonical, ADDRESS_POSTAL_CODE);
|
geodb_builder_add_name(self, utf8_normalized, is_canonical, GEONAMES_ADDRESS_COMPONENT_POSTCODE);
|
||||||
|
|
||||||
char_array_clear(serialized);
|
char_array_clear(serialized);
|
||||||
if (!gn_postal_code_serialize(pc, serialized)) {
|
if (!gn_postal_code_serialize(pc, serialized)) {
|
||||||
|
|||||||
@@ -25,6 +25,16 @@ typedef enum {
|
|||||||
NUM_BOUNDARY_TYPES
|
NUM_BOUNDARY_TYPES
|
||||||
} boundary_type_t;
|
} boundary_type_t;
|
||||||
|
|
||||||
|
#define GEONAMES_ADDRESS_COMPONENT_COUNTRY (1 << 0)
|
||||||
|
#define GEONAMES_ADDRESS_COMPONENT_ADMIN1 (1 << 1)
|
||||||
|
#define GEONAMES_ADDRESS_COMPONENT_ADMIN2 (1 << 2)
|
||||||
|
#define GEONAMES_ADDRESS_COMPONENT_ADMIN3 (1 << 3)
|
||||||
|
#define GEONAMES_ADDRESS_COMPONENT_ADMIN4 (1 << 4)
|
||||||
|
#define GEONAMES_ADDRESS_COMPONENT_ADMIN_OTHER (1 << 5)
|
||||||
|
#define GEONAMES_ADDRESS_COMPONENT_LOCALITY (1 << 6)
|
||||||
|
#define GEONAMES_ADDRESS_COMPONENT_NEIGHBORHOOD (1 << 7)
|
||||||
|
#define GEONAMES_ADDRESS_COMPONENT_POSTCODE (1 << 8)
|
||||||
|
|
||||||
typedef struct geoname {
|
typedef struct geoname {
|
||||||
uint32_t geonames_id;
|
uint32_t geonames_id;
|
||||||
char_array *name;
|
char_array *name;
|
||||||
|
|||||||
Reference in New Issue
Block a user