From ce9153d94d339fe348aec9ddcd2da9a2007cd8bf Mon Sep 17 00:00:00 2001 From: Al Date: Tue, 7 Mar 2017 17:30:53 -0500 Subject: [PATCH] [parser] fixing some issues in address_parser_features. Prefix/suffix phrases use the word before token-level normalization (but after string-level normalization like lowercasing), needed to use the same string in the feature function as in address_parser_context_fill. Affects some German suffixes like "str." where the final "." would be deleted in token normalization, but the suffix length would include it. Also, three of the new arrays used in address_parser_context (suffix_phrases, prefix_phrases, and sub_tokens) weren't being cleared per call, which means computing the wrong features at best and a segfault at worst --- src/address_parser.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/src/address_parser.c b/src/address_parser.c index c486de84..a31e1f1a 100644 --- a/src/address_parser.c +++ b/src/address_parser.c @@ -706,13 +706,17 @@ void address_parser_context_fill(address_parser_context_t *context, address_pars bool have_address_phrases = search_address_dictionaries_tokens_with_phrases(normalized_str, normalized_tokens, NULL, &address_dictionary_phrases); token_phrase_memberships(address_dictionary_phrases, address_phrase_memberships, num_tokens); + phrase_array_clear(context->prefix_phrases); + phrase_array_clear(context->suffix_phrases); + for (size_t i = 0; i < num_tokens; i++) { token_t token = tokens->a[i]; + char *word_pre_norm = tokenized_string_get_token(tokenized_str, i); - phrase_t prefix_phrase = search_address_dictionaries_prefix(str + token.offset, token.len, NULL); + phrase_t prefix_phrase = search_address_dictionaries_prefix(word_pre_norm, token.len, NULL); phrase_array_push(context->prefix_phrases, prefix_phrase); - phrase_t suffix_phrase = search_address_dictionaries_suffix(str + token.offset, token.len, NULL); + phrase_t suffix_phrase = search_address_dictionaries_suffix(word_pre_norm, token.len, NULL); phrase_array_push(context->suffix_phrases, suffix_phrase); } @@ -1050,14 +1054,14 @@ bool address_parser_features(void *self, void *ctx, tokenized_string_t *tokenize ssize_t last_index = (ssize_t)idx - 1; ssize_t next_index = (ssize_t)idx + 1; + char *word_pre_norm = tokenized_string_get_token(tokenized, idx); + char *word = cstring_array_get_string(normalized, idx); if (word == NULL) { log_error("got NULL word at %d\n", idx); return false; } - char *word_pre_norm = tokenized_string_get_token(tokenized, idx); - size_t word_len = strlen(word); log_debug("word=%s\n", word); @@ -1307,7 +1311,9 @@ bool address_parser_features(void *self, void *ctx, tokenized_string_t *tokenize known_suffix = true; char_array_clear(context->suffix_phrase); suffix_len = suffix_phrase.len; - char_array_add_len(context->suffix_phrase, word_pre_norm + (token.len - suffix_phrase.len), suffix_len); + size_t word_pre_norm_len = cstring_array_token_length(tokenized->strings, idx); + size_t suffix_offset = word_pre_norm_len - suffix_len; + char_array_add_len(context->suffix_phrase, word_pre_norm + suffix_offset, suffix_len); suffix = char_array_get_string(context->suffix_phrase); log_debug("got suffix: %s\n", suffix); feature_array_add(features, 2, "suffix", suffix); @@ -1326,6 +1332,8 @@ bool address_parser_features(void *self, void *ctx, tokenized_string_t *tokenize ssize_t next_hyphen_index; + token_array_clear(context->sub_tokens); + do { next_hyphen_index = string_next_hyphen_index(word + hyphenated_word_offset, word_len - hyphenated_word_offset); char *sub_word = word;