diff --git a/src/address_parser.h b/src/address_parser.h index 7d72b002..a00b9a8a 100644 --- a/src/address_parser.h +++ b/src/address_parser.h @@ -58,9 +58,13 @@ with the general error-driven averaged perceptron. #define NULL_PHRASE_MEMBERSHIP -1 -#define ADDRESS_PARSER_NORMALIZE_STRING_OPTIONS NORMALIZE_STRING_COMPOSE | NORMALIZE_STRING_LOWERCASE | NORMALIZE_STRING_LATIN_ASCII +#define ADDRESS_PARSER_NORMALIZE_STRING_OPTIONS NORMALIZE_STRING_COMPOSE | NORMALIZE_STRING_LOWERCASE | NORMALIZE_STRING_SIMPLE_LATIN_ASCII +#define ADDRESS_PARSER_NORMALIZE_STRING_OPTIONS_LATIN NORMALIZE_STRING_COMPOSE | NORMALIZE_STRING_LOWERCASE | NORMALIZE_STRING_LATIN_ASCII +#define ADDRESS_PARSER_NORMALIZE_STRING_OPTIONS_UTF8 NORMALIZE_STRING_COMPOSE | NORMALIZE_STRING_LOWERCASE | NORMALIZE_STRING_STRIP_ACCENTS + #define ADDRESS_PARSER_NORMALIZE_TOKEN_OPTIONS NORMALIZE_TOKEN_DELETE_FINAL_PERIOD | NORMALIZE_TOKEN_DELETE_ACRONYM_PERIODS | NORMALIZE_TOKEN_REPLACE_DIGITS -#define ADDRESS_PARSER_NORMALIZE_PHRASE_TOKEN_OPTIONS ADDRESS_PARSER_NORMALIZE_TOKEN_OPTIONS ^ NORMALIZE_TOKEN_REPLACE_DIGITS +#define ADDRESS_PARSER_NORMALIZE_ADMIN_TOKEN_OPTIONS ADDRESS_PARSER_NORMALIZE_TOKEN_OPTIONS ^ NORMALIZE_TOKEN_REPLACE_DIGITS +#define ADDRESS_PARSER_NORMALIZE_POSTAL_CODE_TOKEN_OPTIONS ADDRESS_PARSER_NORMALIZE_ADMIN_TOKEN_OPTIONS | NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC #define ADDRESS_SEPARATOR_NONE 0 #define ADDRESS_SEPARATOR_FIELD_INTERNAL 1 << 0 @@ -72,18 +76,17 @@ with the general error-driven averaged perceptron. #define SEPARATOR_LABEL "sep" #define FIELD_SEPARATOR_LABEL "fsep" -#define ADDRESS_COMPONENT_HOUSE 1 << 0 -#define ADDRESS_COMPONENT_HOUSE_NUMBER 1 << 1 -#define ADDRESS_COMPONENT_ROAD 1 << 4 -#define ADDRESS_COMPONENT_SUBURB 1 << 7 -#define ADDRESS_COMPONENT_CITY_DISTRICT 1 << 8 -#define ADDRESS_COMPONENT_CITY 1 << 9 -#define ADDRESS_COMPONENT_ISLAND 1 << 10 -#define ADDRESS_COMPONENT_STATE_DISTRICT 1 << 11 -#define ADDRESS_COMPONENT_STATE 1 << 12 -#define ADDRESS_COMPONENT_POSTAL_CODE 1 << 13 -#define ADDRESS_COMPONENT_COUNTRY_REGION 1 << 14 -#define ADDRESS_COMPONENT_COUNTRY 1 << 15 +#define ADDRESS_COMPONENT_NON_BOUNDARY 1 << 0 +#define ADDRESS_COMPONENT_SUBURB 1 << 3 +#define ADDRESS_COMPONENT_CITY_DISTRICT 1 << 4 +#define ADDRESS_COMPONENT_CITY 1 << 5 +#define ADDRESS_COMPONENT_ISLAND 1 << 7 +#define ADDRESS_COMPONENT_STATE_DISTRICT 1 << 8 +#define ADDRESS_COMPONENT_STATE 1 << 9 +#define ADDRESS_COMPONENT_POSTAL_CODE 1 << 10 +#define ADDRESS_COMPONENT_COUNTRY_REGION 1 << 11 +#define ADDRESS_COMPONENT_COUNTRY 1 << 13 +#define ADDRESS_COMPONENT_WORLD_REGION 1 << 14 typedef enum { ADDRESS_PARSER_BOUNDARY_NONE, @@ -96,6 +99,7 @@ typedef enum { ADDRESS_PARSER_BOUNDARY_POSTAL_CODE, ADDRESS_PARSER_BOUNDARY_COUNTRY_REGION, ADDRESS_PARSER_BOUNDARY_COUNTRY, + ADDRESS_PARSER_BOUNDARY_WORLD_REGION, NUM_ADDRESS_PARSER_BOUNDARY_TYPES } address_parser_boundary_components; @@ -110,8 +114,9 @@ typedef enum { #define ADDRESS_PARSER_LABEL_ISLAND "island" #define ADDRESS_PARSER_LABEL_STATE "state" #define ADDRESS_PARSER_LABEL_POSTAL_CODE "postcode" -#define ADDRESS_PARSER_LABEL_COUNTRY "country_region" +#define ADDRESS_PARSER_LABEL_COUNTRY_REGION "country_region" #define ADDRESS_PARSER_LABEL_COUNTRY "country" +#define ADDRESS_PARSER_LABEL_WORLD_REGION "world_region" typedef union address_parser_types { uint32_t value; diff --git a/src/address_parser_io.c b/src/address_parser_io.c index a09bf783..a0a6257e 100644 --- a/src/address_parser_io.c +++ b/src/address_parser_io.c @@ -10,6 +10,8 @@ address_parser_data_set_t *address_parser_data_set_init(char *filename) { data_set->tokens = token_array_new(); data_set->tokenized_str = NULL; + data_set->normalizations = cstring_array_new(); + data_set->norm = 0; data_set->labels = cstring_array_new(); data_set->separators = uint32_array_new(); data_set->language = char_array_new_size(MAX_LANGUAGE_LEN); @@ -24,6 +26,63 @@ bool address_parser_data_set_rewind(address_parser_data_set_t *self) { return (fseek(self->f, 0, SEEK_SET) == 0); } + +bool address_parser_all_normalizations(cstring_array *strings, char *str, char *language) { + if (strings == NULL) return false; + + char *lowercased = normalize_string_utf8(str, ADDRESS_PARSER_NORMALIZE_STRING_OPTIONS); + if (lowercased == NULL) { + return false; + } + + cstring_array_add_string(strings, lowercased); + + char *latin_normalized = normalize_string_latin(str, strlen(str), ADDRESS_PARSER_NORMALIZE_STRING_OPTIONS_LATIN); + if (latin_normalized != NULL) { + if (!string_equals(latin_normalized, lowercased)) { + cstring_array_add_string(strings, latin_normalized); + } + free(latin_normalized); + } + + char *trans_name = NULL; + char *transliterated = NULL; + char *transliterated_utf8_normalized = NULL; + + foreach_transliterator(SCRIPT_LATIN, language, trans_name, { + if (!string_equals(trans_name, LATIN_ASCII)) { + transliterated = transliterate(trans_name, str, strlen(str)); + if (transliterated != NULL) { + transliterated_utf8_normalized = normalize_string_utf8(transliterated, ADDRESS_PARSER_NORMALIZE_STRING_OPTIONS_UTF8); + if (transliterated_utf8_normalized != NULL) { + if (!string_equals(transliterated_utf8_normalized, lowercased)) { + cstring_array_add_string(strings, transliterated_utf8_normalized); + } + free(transliterated_utf8_normalized); + transliterated_utf8_normalized = NULL; + } else { + cstring_array_add_string(strings, transliterated); + } + + free(transliterated); + transliterated = NULL; + } + } + }) + + char *utf8_normalized = normalize_string_utf8(str, ADDRESS_PARSER_NORMALIZE_STRING_OPTIONS_UTF8); + if (utf8_normalized != NULL) { + if (!string_equals(utf8_normalized, lowercased)) { + cstring_array_add_string(strings, utf8_normalized); + } + free(utf8_normalized); + } + + free(lowercased); + + return true; +} + bool address_parser_data_set_tokenize_line(address_parser_data_set_t *self, char *input) { token_array *tokens = self->tokens; uint32_array *separators = self->separators; @@ -130,36 +189,47 @@ bool address_parser_data_set_tokenize_line(address_parser_data_set_t *self, char bool address_parser_data_set_next(address_parser_data_set_t *self) { if (self == NULL) return false; - char *line = file_getline(self->f); - if (line == NULL) { - return false; + cstring_array *fields = NULL; + + if (self->norm == 0 || self->norm >= cstring_array_num_strings(self->normalizations)) { + char *line = file_getline(self->f); + if (line == NULL) { + return false; + } + + size_t token_count; + + fields = cstring_array_split(line, TAB_SEPARATOR, TAB_SEPARATOR_LEN, &token_count); + + free(line); + + if (token_count != ADDRESS_PARSER_FILE_NUM_TOKENS) { + log_error("Token count did not match, expected %d, got %zu\n", ADDRESS_PARSER_FILE_NUM_TOKENS, token_count); + return false; + } + + char *language = cstring_array_get_string(fields, ADDRESS_PARSER_FIELD_LANGUAGE); + char *country = cstring_array_get_string(fields, ADDRESS_PARSER_FIELD_COUNTRY); + char *address = cstring_array_get_string(fields, ADDRESS_PARSER_FIELD_ADDRESS); + + char_array_clear(self->country); + char_array_add(self->country, country); + + char_array_clear(self->language); + char_array_add(self->language, language); + + log_debug("Doing: %s\n", address); + + cstring_array_clear(self->normalizations); + + if (!address_parser_all_normalizations(self->normalizations, address, language) || cstring_array_num_strings(self->normalizations) == 0) { + log_error("Error during string normalization\n"); + return false; + } + self->norm = 0; } - size_t token_count; - - cstring_array *fields = cstring_array_split(line, TAB_SEPARATOR, TAB_SEPARATOR_LEN, &token_count); - - free(line); - - if (token_count != ADDRESS_PARSER_FILE_NUM_TOKENS) { - log_error("Token count did not match, ected %d, got %zu\n", ADDRESS_PARSER_FILE_NUM_TOKENS, token_count); - return false; - } - - char *language = cstring_array_get_string(fields, ADDRESS_PARSER_FIELD_LANGUAGE); - char *country = cstring_array_get_string(fields, ADDRESS_PARSER_FIELD_COUNTRY); - char *address = cstring_array_get_string(fields, ADDRESS_PARSER_FIELD_ADDRESS); - - log_debug("Doing: %s\n", address); - - char *normalized = address_parser_normalize_string(address); - bool is_normalized = normalized != NULL; - if (!is_normalized) { - log_debug("could not normalize\n"); - normalized = strdup(address); - } - - log_debug("Normalized: %s\n", normalized); + char *normalized = cstring_array_get_string(self->normalizations, self->norm); token_array *tokens = self->tokens; cstring_array *labels = self->labels; @@ -170,12 +240,6 @@ bool address_parser_data_set_next(address_parser_data_set_t *self) { uint32_array_clear(separators); size_t len = strlen(normalized); - char_array_clear(self->country); - char_array_add(self->country, country); - - char_array_clear(self->language); - char_array_add(self->language, language); - tokenized_string_t *tokenized_str = NULL; if (address_parser_data_set_tokenize_line(self, normalized)) { @@ -186,8 +250,11 @@ bool address_parser_data_set_next(address_parser_data_set_t *self) { self->tokenized_str = tokenized_str; - free(normalized); - cstring_array_destroy(fields); + self->norm++; + + if (fields != NULL) { + cstring_array_destroy(fields); + } return tokenized_str != NULL; } @@ -204,6 +271,10 @@ void address_parser_data_set_destroy(address_parser_data_set_t *self) { token_array_destroy(self->tokens); } + if (self->normalizations != NULL) { + cstring_array_destroy(self->normalizations); + } + if (self->labels != NULL) { cstring_array_destroy(self->labels); }