From b570855b78d83150646648a80a3f63d2ef90facc Mon Sep 17 00:00:00 2001 From: Al Date: Fri, 10 Feb 2017 03:40:43 -0500 Subject: [PATCH] [parser] adding postcode context features and associated data structures to the parser. Masking digits, which should hopefully help with generalization. Creating positive/negative features for postcode with and without context support. Note: even with known postcodes in known contexts, only use the masked digits to avoid creating too many features that are redundant with the index. --- src/address_parser.c | 429 +++++++++++++++++++++++++++++++++---------- 1 file changed, 327 insertions(+), 102 deletions(-) diff --git a/src/address_parser.c b/src/address_parser.c index 61b74870..8ca1cec7 100644 --- a/src/address_parser.c +++ b/src/address_parser.c @@ -8,7 +8,8 @@ #define ADDRESS_PARSER_MODEL_FILENAME "address_parser.dat" #define ADDRESS_PARSER_VOCAB_FILENAME "address_parser_vocab.trie" -#define ADDRESS_PARSER_PHRASE_FILENAME "address_parser_phrases.trie" +#define ADDRESS_PARSER_PHRASE_FILENAME "address_parser_phrases.dat" +#define ADDRESS_PARSER_POSTAL_CODES_FILENAME "address_parser_postal_codes.dat" #define UNKNOWN_WORD "UNKNOWN" #define UNKNOWN_NUMERIC "UNKNOWN_NUMERIC" @@ -71,10 +72,65 @@ bool address_parser_save(address_parser_t *self, char *output_dir) { char_array_add_joined(path, PATH_SEPARATOR, true, 2, output_dir, ADDRESS_PARSER_PHRASE_FILENAME); char *phrases_path = char_array_get_string(path); - if (!trie_save(self->phrase_types, phrases_path)) { + FILE *phrases_file = fopen(phrases_path, "w+"); + if (phrases_file == NULL || self->phrases == NULL) { return false; } + if (!trie_write(self->phrases, phrases_file)) { + return false; + } + + if (self->phrase_types == NULL) { + return false; + } + + size_t num_phrase_types = self->phrase_types->n; + if (!file_write_uint64(phrases_file, num_phrase_types)) { + return false; + } + + for (size_t i = 0; i < self->phrase_types->n; i++) { + address_parser_types_t phrase_type_value = self->phrase_types->a[i]; + if (!file_write_uint32(phrases_file, phrase_type_value.value)) { + return false; + } + } + + fclose(phrases_file); + + char_array_clear(path); + + char_array_add_joined(path, PATH_SEPARATOR, true, 2, output_dir, ADDRESS_PARSER_POSTAL_CODES_FILENAME); + char *postal_codes_path = char_array_get_string(path); + + FILE *postal_codes_file = fopen(postal_codes_path, "w+"); + if (postal_codes_file == NULL || self->postal_codes == NULL) { + return false; + } + + if (!trie_write(self->postal_codes, postal_codes_file)) { + return false; + } + + if (self->postal_code_contexts == NULL) { + return false; + } + + size_t num_postal_code_contexts = kh_size(self->postal_code_contexts); + if (!file_write_uint64(postal_codes_file, num_postal_code_contexts)) { + return false; + } + + uint64_t postal_code_context; + kh_foreach_key(self->postal_code_contexts, postal_code_context, { + if (!file_write_uint64(postal_codes_file, postal_code_context)) { + return false; + } + }) + + fclose(postal_codes_file); + char_array_destroy(path); return true; @@ -111,9 +167,7 @@ bool address_parser_load(char *dir) { trie_t *vocab = trie_load(vocab_path); if (vocab == NULL) { - address_parser_destroy(parser); - char_array_destroy(path); - return false; + goto exit_address_parser_created; } parser->vocab = vocab; @@ -121,21 +175,98 @@ bool address_parser_load(char *dir) { char_array_clear(path); char_array_add_joined(path, PATH_SEPARATOR, true, 2, dir, ADDRESS_PARSER_PHRASE_FILENAME); - char *phrases_path = char_array_get_string(path); - trie_t *phrase_types = trie_load(phrases_path); - - if (phrase_types == NULL) { - address_parser_destroy(parser); - char_array_destroy(path); - return false; + FILE *phrases_file = fopen(phrases_path, "r"); + if (phrases_file == NULL) { + goto exit_address_parser_created; } - parser->phrase_types = phrase_types; + parser->phrases = trie_read(phrases_file); + if (parser->phrases == NULL) { + goto exit_address_parser_created; + } + + uint64_t num_phrase_types; + + if (!file_read_uint64(phrases_file, &num_phrase_types)) { + goto exit_address_parser_created; + } + + parser->phrase_types = address_parser_types_array_new_size(num_phrase_types); + + uint32_array *phrase_type_values = uint32_array_new_size(num_phrase_types); + if (!file_read_uint32_array(phrases_file, phrase_type_values->a, num_phrase_types)) { + uint32_array_destroy(phrase_type_values); + goto exit_address_parser_created; + } + phrase_type_values->n = num_phrase_types; + + for (size_t i = 0; i < phrase_type_values->n; i++) { + uint32_t phrase_type_value = phrase_type_values->a[i]; + address_parser_types_t phrase_type = {.value = phrase_type_value}; + address_parser_types_array_push(parser->phrase_types, phrase_type); + } + + uint32_array_destroy(phrase_type_values); + + fclose(phrases_file); + + char_array_clear(path); + + char_array_add_joined(path, PATH_SEPARATOR, true, 2, dir, ADDRESS_PARSER_POSTAL_CODES_FILENAME); + + char *postal_codes_path = char_array_get_string(path); + + FILE *postal_codes_file = fopen(postal_codes_path, "r"); + if (postal_codes_file == NULL) { + goto exit_address_parser_created; + } + + parser->postal_codes = trie_read(postal_codes_file); + if (parser->postal_codes == NULL) { + goto exit_address_parser_created; + } + + uint64_t num_postal_code_contexts; + + if (!file_read_uint64(postal_codes_file, &num_postal_code_contexts)) { + goto exit_address_parser_created; + } + + log_info("num_postal_code_contexts = %llu\n", num_postal_code_contexts); + + uint64_array *postal_code_context_values = uint64_array_new_size(num_postal_code_contexts); + if (!file_read_uint64_array(postal_codes_file, postal_code_context_values->a, num_postal_code_contexts)) { + uint64_array_destroy(postal_code_context_values); + goto exit_address_parser_created; + } + postal_code_context_values->n = num_postal_code_contexts; + + + parser->postal_code_contexts = kh_init(int64_set); + if (parser->postal_code_contexts == NULL) { + goto exit_address_parser_created; + } + + for (size_t i = 0; i < postal_code_context_values->n; i++) { + uint64_t context_value = postal_code_context_values->a[i]; + int ret = 0; + kh_put(int64_set, parser->postal_code_contexts, context_value, &ret); + if (ret < 0) { + goto exit_address_parser_created; + } + } + + uint64_array_destroy(postal_code_context_values); char_array_destroy(path); return true; + +exit_address_parser_created: + address_parser_destroy(parser); + char_array_destroy(path); + return false; } void address_parser_destroy(address_parser_t *self) { @@ -149,8 +280,20 @@ void address_parser_destroy(address_parser_t *self) { trie_destroy(self->vocab); } + if (self->phrases != NULL) { + trie_destroy(self->phrases); + } + if (self->phrase_types != NULL) { - trie_destroy(self->phrase_types); + address_parser_types_array_destroy(self->phrase_types); + } + + if (self->postal_codes != NULL) { + trie_destroy(self->postal_codes); + } + + if (self->postal_code_contexts != NULL) { + kh_destroy(int64_set, self->postal_code_contexts); } free(self); @@ -274,6 +417,14 @@ void address_parser_context_destroy(address_parser_context_t *self) { int64_array_destroy(self->component_phrase_memberships); } + if (self->postal_code_phrases != NULL) { + phrase_array_destroy(self->postal_code_phrases); + } + + if (self->postal_code_phrase_memberships != NULL) { + int64_array_destroy(self->postal_code_phrase_memberships); + } + if (self->prefix_phrases != NULL) { phrase_array_destroy(self->prefix_phrases); } @@ -413,6 +564,16 @@ address_parser_context_t *address_parser_context_new(void) { goto exit_address_parser_context_allocated; } + context->postal_code_phrases = phrase_array_new(); + if (context->postal_code_phrases == NULL) { + goto exit_address_parser_context_allocated; + } + + context->postal_code_phrase_memberships = int64_array_new(); + if (context->postal_code_phrase_memberships == NULL) { + goto exit_address_parser_context_allocated; + } + context->prefix_phrases = phrase_array_new(); if (context->prefix_phrases == NULL) { goto exit_address_parser_context_allocated; @@ -430,29 +591,6 @@ exit_address_parser_context_allocated: return NULL; } -inline static void fill_phrase_memberships(phrase_array *phrases, int64_array *phrase_memberships, size_t len) { - int64_t i = 0; - for (int64_t j = 0; j < phrases->n; j++) { - phrase_t phrase = phrases->a[j]; - - for (; i < phrase.start; i++) { - int64_array_push(phrase_memberships, NULL_PHRASE_MEMBERSHIP); - log_debug("token i=%lld, null phrase membership\n", i); - } - - for (i = phrase.start; i < phrase.start + phrase.len; i++) { - log_debug("token i=%lld, phrase membership=%lld\n", i, j); - int64_array_push(phrase_memberships, j); - } - } - - for (; i < len; i++) { - log_debug("token i=%lld, null phrase membership\n", i); - int64_array_push(phrase_memberships, NULL_PHRASE_MEMBERSHIP); - } -} - - void address_parser_context_fill(address_parser_context_t *context, address_parser_t *parser, tokenized_string_t *tokenized_str, char *language, char *country) { uint32_t token_index; char *word; @@ -528,8 +666,8 @@ void address_parser_context_fill(address_parser_context_t *context, address_pars size_t num_tokens = tokens->n; - bool have_address_phrases = search_address_dictionaries_tokens_with_phrases(normalized_str, normalized_tokens, context->language, &context->address_dictionary_phrases); - fill_phrase_memberships(address_dictionary_phrases, address_phrase_memberships, num_tokens); + bool have_address_phrases = search_address_dictionaries_tokens_with_phrases(normalized_str, normalized_tokens, context->language, &address_dictionary_phrases); + token_phrase_memberships(address_dictionary_phrases, address_phrase_memberships, num_tokens); for (size_t i = 0; i < num_tokens; i++) { token_t token = tokens->a[i]; @@ -560,8 +698,17 @@ void address_parser_context_fill(address_parser_context_t *context, address_pars phrase_array *component_phrases = context->component_phrases; int64_array *component_phrase_memberships = context->component_phrase_memberships; - bool have_component_phrases = trie_search_tokens_with_phrases(parser->phrase_types, normalized_str_admin, normalized_admin_tokens, &component_phrases); - fill_phrase_memberships(component_phrases, component_phrase_memberships, num_tokens); + bool have_component_phrases = trie_search_tokens_with_phrases(parser->phrases, normalized_str_admin, normalized_admin_tokens, &component_phrases); + token_phrase_memberships(component_phrases, component_phrase_memberships, num_tokens); + + phrase_array_clear(context->postal_code_phrases); + int64_array_clear(context->postal_code_phrase_memberships); + + phrase_array *postal_code_phrases = context->postal_code_phrases; + int64_array *postal_code_phrase_memberships = context->postal_code_phrase_memberships; + + bool have_postal_code_phrases = trie_search_tokens_with_phrases(parser->postal_codes, normalized_str_admin, normalized_admin_tokens, &postal_code_phrases); + token_phrase_memberships(postal_code_phrases, postal_code_phrase_memberships, num_tokens); } @@ -585,7 +732,7 @@ typedef struct address_parser_phrase { phrase_t phrase; } address_parser_phrase_t; -static inline address_parser_phrase_t word_or_phrase_at_index(tokenized_string_t *tokenized, address_parser_context_t *context, uint32_t i) { +static inline address_parser_phrase_t word_or_phrase_at_index(address_parser_t *parser, tokenized_string_t *tokenized, address_parser_context_t *context, uint32_t i) { phrase_t phrase; address_parser_phrase_t response; char *phrase_string = NULL; @@ -603,13 +750,8 @@ static inline address_parser_phrase_t word_or_phrase_at_index(tokenized_string_t return response; } - address_parser_types_t types; - phrase = phrase_at_index(context->component_phrases, context->component_phrase_memberships, i); if (phrase.len > 0) { - types.value = phrase.data; - uint32_t component_phrase_types = types.components; - phrase_string = cstring_array_get_phrase(context->normalized_admin, context->context_component_phrase, phrase); response = (address_parser_phrase_t){ @@ -815,6 +957,8 @@ bool address_parser_features(void *self, void *ctx, tokenized_string_t *tokenize int64_array *address_phrase_memberships = context->address_phrase_memberships; phrase_array *component_phrases = context->component_phrases; int64_array *component_phrase_memberships = context->component_phrase_memberships; + phrase_array *postal_code_phrases = context->postal_code_phrases; + int64_array *postal_code_phrase_memberships = context->postal_code_phrase_memberships; cstring_array *normalized = context->normalized; uint32_array *separators = context->separators; @@ -853,6 +997,8 @@ bool address_parser_features(void *self, void *ctx, tokenized_string_t *tokenize bool add_word_feature = true; + size_t num_tokens = tokenized->tokens->n; + // Address dictionary phrases if (address_phrase_index != NULL_PHRASE_MEMBERSHIP) { phrase = address_dictionary_phrases->a[address_phrase_index]; @@ -870,7 +1016,7 @@ bool address_parser_features(void *self, void *ctx, tokenized_string_t *tokenize log_warn("expansion_value is NULL. word=%s, sentence=%s\n", word, tokenized->str); } - if (address_phrase_types & (ADDRESS_STREET | ADDRESS_HOUSE_NUMBER | ADDRESS_NAME | ADDRESS_UNIT)) { + if (address_phrase_types & (ADDRESS_STREET | ADDRESS_HOUSE_NUMBER | ADDRESS_NAME | ADDRESS_UNIT | ADDRESS_POSTAL_CODE)) { phrase_string = cstring_array_get_phrase(context->normalized, phrase_tokens, phrase); add_word_feature = false; @@ -879,7 +1025,7 @@ bool address_parser_features(void *self, void *ctx, tokenized_string_t *tokenize add_phrase_features(features, address_phrase_types, ADDRESS_STREET, "street", phrase_string); add_phrase_features(features, address_phrase_types, ADDRESS_NAME, "name", phrase_string); add_phrase_features(features, address_phrase_types, ADDRESS_HOUSE_NUMBER, "house_number", phrase_string); - + add_phrase_features(features, address_phrase_types, ADDRESS_POSTAL_CODE, "postal_code", phrase_string); } } @@ -888,20 +1034,28 @@ bool address_parser_features(void *self, void *ctx, tokenized_string_t *tokenize address_parser_types_t types; - bool possible_postal_code = false; - // Component phrases if (component_phrase_index != NULL_PHRASE_MEMBERSHIP) { phrase = component_phrases->a[component_phrase_index]; component_phrase_string = cstring_array_get_phrase(context->normalized_admin, component_phrase_tokens, phrase); - types.value = phrase.data; + uint32_t component_phrase_index = phrase.data; + if (component_phrase_index > parser->phrase_types->n) { + log_error("Invalid component_phrase_index: %u (parser->phrase_types->n=%zu)\n", component_phrase_index, parser->phrase_types->n); + return false; + } + + types = parser->phrase_types->a[component_phrase_index]; + uint32_t component_phrase_types = types.components; uint32_t most_common = types.most_common; - if (last_index >= (ssize_t)phrase.start - 1 || next_index <= (ssize_t)phrase.start + phrase.len - 1) { + if (last_index >= (ssize_t)phrase.start - 1) { last_index = (ssize_t)phrase.start - 1; + } + + if (next_index < (ssize_t)phrase.start + phrase.len) { next_index = (ssize_t)phrase.start + phrase.len; } @@ -917,35 +1071,101 @@ bool address_parser_features(void *self, void *ctx, tokenized_string_t *tokenize add_phrase_features(features, component_phrase_types, ADDRESS_COMPONENT_ISLAND, "island", component_phrase_string); add_phrase_features(features, component_phrase_types, ADDRESS_COMPONENT_STATE_DISTRICT, "state_district", component_phrase_string); add_phrase_features(features, component_phrase_types, ADDRESS_COMPONENT_STATE, "state", component_phrase_string); - add_phrase_features(features, component_phrase_types, ADDRESS_COMPONENT_POSTAL_CODE, "postal_code", component_phrase_string); add_phrase_features(features, component_phrase_types, ADDRESS_COMPONENT_COUNTRY_REGION, "country_region", component_phrase_string); add_phrase_features(features, component_phrase_types, ADDRESS_COMPONENT_COUNTRY, "country", component_phrase_string); add_phrase_features(features, component_phrase_types, ADDRESS_COMPONENT_WORLD_REGION, "world_region", component_phrase_string); } - if (most_common == ADDRESS_PARSER_BOUNDARY_CITY) { - feature_array_add(features, 2, "commonly city", component_phrase_string); - } else if (most_common == ADDRESS_PARSER_BOUNDARY_STATE) { - feature_array_add(features, 2, "commonly state", component_phrase_string); - } else if (most_common == ADDRESS_PARSER_BOUNDARY_COUNTRY) { - feature_array_add(features, 2, "commonly country", component_phrase_string); - } else if (most_common == ADDRESS_PARSER_BOUNDARY_COUNTRY_REGION) { - feature_array_add(features, 2, "commonly country_region", component_phrase_string); - } else if (most_common == ADDRESS_PARSER_BOUNDARY_STATE_DISTRICT) { - feature_array_add(features, 2, "commonly state_district", component_phrase_string); - } else if (most_common == ADDRESS_PARSER_BOUNDARY_ISLAND) { - feature_array_add(features, 2, "commonly island", component_phrase_string); - } else if (most_common == ADDRESS_PARSER_BOUNDARY_SUBURB) { - feature_array_add(features, 2, "commonly suburb", component_phrase_string); - } else if (most_common == ADDRESS_PARSER_BOUNDARY_CITY_DISTRICT) { - feature_array_add(features, 2, "commonly city_district", component_phrase_string); - } else if (most_common == ADDRESS_PARSER_BOUNDARY_POSTAL_CODE) { - feature_array_add(features, 2, "commonly postal_code", component_phrase_string); - possible_postal_code = true; + if (component_phrase_types != most_common) { + if (most_common == ADDRESS_PARSER_BOUNDARY_CITY) { + feature_array_add(features, 2, "commonly city", component_phrase_string); + } else if (most_common == ADDRESS_PARSER_BOUNDARY_COUNTRY) { + feature_array_add(features, 2, "commonly country", component_phrase_string); + } else if (most_common == ADDRESS_PARSER_BOUNDARY_SUBURB) { + feature_array_add(features, 2, "commonly suburb", component_phrase_string); + } else if (most_common == ADDRESS_PARSER_BOUNDARY_CITY_DISTRICT) { + feature_array_add(features, 2, "commonly city_district", component_phrase_string); + } else if (most_common == ADDRESS_PARSER_BOUNDARY_STATE) { + feature_array_add(features, 2, "commonly state", component_phrase_string); + } else if (most_common == ADDRESS_PARSER_BOUNDARY_COUNTRY_REGION) { + feature_array_add(features, 2, "commonly country_region", component_phrase_string); + } else if (most_common == ADDRESS_PARSER_BOUNDARY_STATE_DISTRICT) { + feature_array_add(features, 2, "commonly state_district", component_phrase_string); + } else if (most_common == ADDRESS_PARSER_BOUNDARY_ISLAND) { + feature_array_add(features, 2, "commonly island", component_phrase_string); + } } } + bool possible_postal_code = false; + bool postal_code_have_admin = false; + int64_t postal_code_phrase_index = postal_code_phrase_memberships->a[idx]; + phrase_t postal_code_phrase = NULL_PHRASE; + + if (postal_code_phrase_index != NULL_PHRASE_MEMBERSHIP) { + postal_code_phrase = postal_code_phrases->a[postal_code_phrase_index]; + + uint32_t postal_code_id = postal_code_phrase.data; + + possible_postal_code = true; + + if (last_index >= (ssize_t)postal_code_phrase.start - 1) { + last_index = (ssize_t)postal_code_phrase.start - 1; + } + + if (next_index < (ssize_t)postal_code_phrase.start + postal_code_phrase.len) { + next_index = (ssize_t)postal_code_phrase.start + postal_code_phrase.len; + } + + uint32_t admin_id; + uint64_t postal_code_context; + + khiter_t k; + + if (last_index >= 0) { + int64_t last_component_phrase_index = component_phrase_memberships->a[last_index]; + if (last_component_phrase_index != NULL_PHRASE_MEMBERSHIP) { + phrase_t last_component_phrase = component_phrases->a[last_component_phrase_index]; + admin_id = last_component_phrase.data; + postal_code_context_value_t postal_code_context_value = POSTAL_CODE_CONTEXT(postal_code_id, admin_id); + postal_code_context = postal_code_context_value.value; + + k = kh_get(int64_set, parser->postal_code_contexts, postal_code_context); + + if (k != kh_end(parser->postal_code_contexts)) { + postal_code_have_admin = true; + } + } + } + + if (!postal_code_have_admin && next_index < num_tokens) { + int64_t next_component_phrase_index = component_phrase_memberships->a[next_index]; + if (next_component_phrase_index != NULL_PHRASE_MEMBERSHIP) { + phrase_t next_component_phrase = component_phrases->a[next_component_phrase_index]; + admin_id = next_component_phrase.data; + postal_code_context_value_t postal_code_context_value = POSTAL_CODE_CONTEXT(postal_code_id, admin_id); + postal_code_context = postal_code_context_value.value; + + k = kh_get(int64_set, parser->postal_code_contexts, postal_code_context); + + if (k != kh_end(parser->postal_code_contexts)) { + postal_code_have_admin = true; + } + } + } + + } + + if (possible_postal_code) { + if (postal_code_have_admin) { + feature_array_add(features, 1, "postcode have context"); + feature_array_add(features, 2, "postcode have context", word); + } else { + feature_array_add(features, 2, "postcode no context", word); + } + } + uint32_t word_freq = word_vocab_frequency(parser, word); bool is_word = is_word_token(token.type); @@ -1057,7 +1277,7 @@ bool address_parser_features(void *self, void *ctx, tokenized_string_t *tokenize hyphenated_word_offset += next_hyphen_index + 1; first_sub_token = false; - log_debug("next_hyphen_index=%d\n", next_hyphen_index); + log_debug("next_hyphen_index=%zd\n", next_hyphen_index); } while(next_hyphen_index >= 0); } @@ -1101,7 +1321,7 @@ bool address_parser_features(void *self, void *ctx, tokenized_string_t *tokenize } if (last_index >= 0) { - address_parser_phrase_t prev_word_or_phrase = word_or_phrase_at_index(tokenized, context, last_index); + address_parser_phrase_t prev_word_or_phrase = word_or_phrase_at_index(parser, tokenized, context, last_index); char *prev_word = prev_word_or_phrase.str; if (prev_word_or_phrase.type == ADDRESS_PARSER_NULL_PHRASE) { @@ -1123,10 +1343,8 @@ bool address_parser_features(void *self, void *ctx, tokenized_string_t *tokenize feature_array_add(features, 3, "prev word+word", prev_word, word); } - size_t num_tokens = tokenized->tokens->n; - if (next_index < num_tokens) { - address_parser_phrase_t next_word_or_phrase = word_or_phrase_at_index(tokenized, context, next_index); + address_parser_phrase_t next_word_or_phrase = word_or_phrase_at_index(parser, tokenized, context, next_index); char *next_word = next_word_or_phrase.str; size_t next_word_len = 1; @@ -1148,7 +1366,6 @@ bool address_parser_features(void *self, void *ctx, tokenized_string_t *tokenize // Prev tag, current word and next word //feature_array_add(features, 4, "prev tag+word+next word", prev || "START", word, next_word); - } if (parser->options.print_features) { @@ -1228,18 +1445,24 @@ address_parser_response_t *address_parser_parse(char *address, char *language, c address_parser_response_t *response = NULL; // If the whole input string is a single known phrase at the SUBURB level or higher, bypass sequence prediction altogether - + phrase_t only_phrase = NULL_PHRASE; + bool is_postal = false; if (context->component_phrases->n == 1) { - phrase_t only_phrase = context->component_phrases->a[0]; - if (only_phrase.start == 0 && only_phrase.len == tokenized_str->tokens->n) { - address_parser_types_t types; + only_phrase = context->component_phrases->a[0]; + } else if (context->postal_code_phrases->n == 1) { + only_phrase = context->postal_code_phrases->a[0]; + is_postal = true; + } - types.value = only_phrase.data; - uint32_t most_common = types.most_common; + if (only_phrase.start == 0 && only_phrase.len == tokenized_str->tokens->n && only_phrase.len > 0) { + uint32_t most_common = 0; - char *label = NULL; + char *label = NULL; - response = address_parser_response_new(); + if (!is_postal) { + uint32_t component_phrase_index = only_phrase.data; + address_parser_types_t types = parser->phrase_types->a[component_phrase_index]; + most_common = types.most_common; if (most_common == ADDRESS_PARSER_BOUNDARY_CITY) { label = strdup(ADDRESS_PARSER_LABEL_CITY); @@ -1257,25 +1480,27 @@ address_parser_response_t *address_parser_parse(char *address, char *language, c label = strdup(ADDRESS_PARSER_LABEL_CITY_DISTRICT); } else if (most_common == ADDRESS_PARSER_BOUNDARY_WORLD_REGION) { label = strdup(ADDRESS_PARSER_LABEL_WORLD_REGION); - } else if (most_common == ADDRESS_PARSER_BOUNDARY_POSTAL_CODE) { - label = strdup(ADDRESS_PARSER_LABEL_POSTAL_CODE); } + } else { + label = strdup(ADDRESS_PARSER_LABEL_POSTAL_CODE); + } - // Implicit: if most_common is not one of the above, ignore and parse regularly - if (label != NULL) { - char **single_label = malloc(sizeof(char *)); - single_label[0] = label; - char **single_component = malloc(sizeof(char *)); - single_component[0] = strdup(normalized); + response = address_parser_response_new(); - response->num_components = 1; - response->labels = single_label; - response->components = single_component; + // Implicit: if most_common is not one of the above, ignore and parse regularly + if (label != NULL) { + char **single_label = malloc(sizeof(char *)); + single_label[0] = label; + char **single_component = malloc(sizeof(char *)); + single_component[0] = strdup(normalized); - token_array_destroy(tokens); - tokenized_string_destroy(tokenized_str); - return response; - } + response->num_components = 1; + response->labels = single_label; + response->components = single_component; + + token_array_destroy(tokens); + tokenized_string_destroy(tokenized_str); + return response; } }