[parser] fixing some issues in address_parser_features. Prefix/suffix phrases use the word before token-level normalization (but after string-level normalization like lowercasing), so the feature function needed to use the same string as address_parser_context_fill. This affects some German suffixes like "str.", where the final "." would be deleted in token normalization but the suffix length would still include it. Also, three of the new arrays used in address_parser_context (suffix_phrases, prefix_phrases, and sub_tokens) weren't being cleared per call, which meant computing the wrong features at best and a segfault at worst.

Al
2017-03-07 17:30:53 -05:00
parent b6bf8da383
commit ce9153d94d

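To make the "str." case concrete, here is a minimal standalone sketch (illustrative strings, not libpostal's actual normalization pipeline) of the three views of a token the commit message distinguishes, and why a dictionary suffix length only lines up against the string before token-level normalization:

    #include <stdio.h>
    #include <string.h>

    int main(void) {
        const char *raw = "Hauptstr.";       /* token as it appears in the input */
        const char *pre_norm = "hauptstr.";  /* after string-level normalization (lowercasing) */
        const char *token_norm = "hauptstr"; /* after token-level normalization: final "." deleted */

        size_t suffix_len = strlen("str.");  /* the dictionary suffix includes the "." */

        /* offsetting into the pre-norm string recovers the suffix exactly */
        printf("%s\n", pre_norm + strlen(pre_norm) - suffix_len);     /* prints "str." */

        /* the same arithmetic on the token-normalized string is misaligned,
           because its length no longer includes the deleted period */
        printf("%s\n", token_norm + strlen(token_norm) - suffix_len); /* prints "tstr" */
        return 0;
    }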

@@ -706,13 +706,17 @@ void address_parser_context_fill(address_parser_context_t *context, address_pars
     bool have_address_phrases = search_address_dictionaries_tokens_with_phrases(normalized_str, normalized_tokens, NULL, &address_dictionary_phrases);
     token_phrase_memberships(address_dictionary_phrases, address_phrase_memberships, num_tokens);
 
+    phrase_array_clear(context->prefix_phrases);
+    phrase_array_clear(context->suffix_phrases);
+
     for (size_t i = 0; i < num_tokens; i++) {
         token_t token = tokens->a[i];
-        phrase_t prefix_phrase = search_address_dictionaries_prefix(str + token.offset, token.len, NULL);
+        char *word_pre_norm = tokenized_string_get_token(tokenized_str, i);
+        phrase_t prefix_phrase = search_address_dictionaries_prefix(word_pre_norm, token.len, NULL);
         phrase_array_push(context->prefix_phrases, prefix_phrase);
-        phrase_t suffix_phrase = search_address_dictionaries_suffix(str + token.offset, token.len, NULL);
+        phrase_t suffix_phrase = search_address_dictionaries_suffix(word_pre_norm, token.len, NULL);
         phrase_array_push(context->suffix_phrases, suffix_phrase);
     }
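The two phrase_array_clear calls exist because the context and its arrays are reused across calls: without them, entries from the previous input remain and per-token indices drift. A minimal sketch of that failure mode, using a hypothetical fixed-size array type rather than libpostal's phrase_array:

    #include <stdio.h>

    #define CAPACITY 32

    /* hypothetical per-context array, standing in for libpostal's phrase_array */
    typedef struct { int a[CAPACITY]; size_t n; } int_array;

    static void int_array_push(int_array *arr, int v) { arr->a[arr->n++] = v; }
    static void int_array_clear(int_array *arr) { arr->n = 0; }

    /* one entry is pushed per token, so arr->a[i] must correspond to token i */
    static void fill(int_array *arr, size_t num_tokens) {
        int_array_clear(arr); /* the fix: without this, entries accumulate across calls */
        for (size_t i = 0; i < num_tokens; i++) {
            int_array_push(arr, (int)i);
        }
    }

    int main(void) {
        int_array per_token = { .n = 0 };
        fill(&per_token, 5); /* first input: 5 tokens */
        fill(&per_token, 3); /* second input: 3 tokens */
        /* with the clear, n == 3; without it, n == 8, so reading entry i for the
           second input returns stale data from the first, and the array eventually
           grows past its capacity (the segfault the commit message mentions) */
        printf("entries: %zu\n", per_token.n);
        return 0;
    }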
@@ -1050,14 +1054,14 @@ bool address_parser_features(void *self, void *ctx, tokenized_string_t *tokenize
     ssize_t last_index = (ssize_t)idx - 1;
     ssize_t next_index = (ssize_t)idx + 1;
 
+    char *word_pre_norm = tokenized_string_get_token(tokenized, idx);
+
     char *word = cstring_array_get_string(normalized, idx);
     if (word == NULL) {
         log_error("got NULL word at %d\n", idx);
         return false;
     }
 
-    char *word_pre_norm = tokenized_string_get_token(tokenized, idx);
-
     size_t word_len = strlen(word);
 
     log_debug("word=%s\n", word);
@@ -1307,7 +1311,9 @@ bool address_parser_features(void *self, void *ctx, tokenized_string_t *tokenize
                 known_suffix = true;
                 char_array_clear(context->suffix_phrase);
                 suffix_len = suffix_phrase.len;
-                char_array_add_len(context->suffix_phrase, word_pre_norm + (token.len - suffix_phrase.len), suffix_len);
+                size_t word_pre_norm_len = cstring_array_token_length(tokenized->strings, idx);
+                size_t suffix_offset = word_pre_norm_len - suffix_len;
+                char_array_add_len(context->suffix_phrase, word_pre_norm + suffix_offset, suffix_len);
                 suffix = char_array_get_string(context->suffix_phrase);
                 log_debug("got suffix: %s\n", suffix);
                 feature_array_add(features, 2, "suffix", suffix);
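The replaced line computed the suffix offset from token.len, the token's byte length in the original string, while indexing into word_pre_norm, whose byte length can differ once string-level normalization has run. A sketch of the corrected arithmetic, under an assumed case where lowercasing shrinks a character (UTF-8; capital sharp s, U+1E9E, three bytes, folds to "ß", U+00DF, two bytes); the strings are hypothetical:

    #include <stdio.h>
    #include <string.h>

    int main(void) {
        const char *raw = "HAUPTSTRA\u1E9EE";           /* 13 bytes in UTF-8 */
        const char *word_pre_norm = "hauptstra\u00DFe"; /* 12 bytes */

        size_t token_len = strlen(raw);            /* token.len: length in the original string */
        size_t suffix_len = strlen("stra\u00DFe"); /* 7 bytes, as stored in the dictionary */

        /* old code: offset derived from the raw token's length */
        printf("%s\n", word_pre_norm + (token_len - suffix_len));         /* "traße" (wrong) */

        /* new code: offset derived from the string actually being indexed */
        size_t word_pre_norm_len = strlen(word_pre_norm);
        printf("%s\n", word_pre_norm + (word_pre_norm_len - suffix_len)); /* "straße" */
        return 0;
    }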
@@ -1326,6 +1332,8 @@ bool address_parser_features(void *self, void *ctx, tokenized_string_t *tokenize
             ssize_t next_hyphen_index;
 
+            token_array_clear(context->sub_tokens);
+
             do {
                 next_hyphen_index = string_next_hyphen_index(word + hyphenated_word_offset, word_len - hyphenated_word_offset);
                 char *sub_word = word;
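token_array_clear is needed here for the same reason as the phrase arrays above: context->sub_tokens accumulates across calls. A simplified sketch of the hyphen-splitting loop this hunk guards, where next_hyphen is a stand-in for string_next_hyphen_index (by its use here, returning the offset of the next hyphen or a negative value):

    #include <stdio.h>
    #include <string.h>
    #include <sys/types.h> /* ssize_t */

    /* stand-in for string_next_hyphen_index: offset of the next '-', or -1 */
    static ssize_t next_hyphen(const char *s, size_t len) {
        for (size_t i = 0; i < len; i++) {
            if (s[i] == '-') return (ssize_t)i;
        }
        return -1;
    }

    int main(void) {
        const char *word = "tee-tree-oil";
        size_t word_len = strlen(word);
        size_t offset = 0;
        ssize_t next_hyphen_index;

        /* token_array_clear(context->sub_tokens) goes here in the real code,
           so sub-tokens of the previous word don't leak into this one */
        do {
            next_hyphen_index = next_hyphen(word + offset, word_len - offset);
            size_t sub_len = next_hyphen_index >= 0 ? (size_t)next_hyphen_index
                                                    : word_len - offset;
            /* the real code pushes a token for this span onto context->sub_tokens */
            printf("sub-token: %.*s\n", (int)sub_len, word + offset);
            offset += sub_len + 1; /* skip past the hyphen */
        } while (next_hyphen_index >= 0 && offset < word_len);
        return 0;
    }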