From ce9153d94d339fe348aec9ddcd2da9a2007cd8bf Mon Sep 17 00:00:00 2001
From: Al <albarrentine@gmail.com>
Date: Tue, 7 Mar 2017 17:30:53 -0500
Subject: [PATCH] [parser] fixing some issues in address_parser_features.
 Prefix/suffix phrases use the word before token-level normalization (but
 after string-level normalization like lowercasing), so the feature function
 needed to use the same string as address_parser_context_fill. This affects
 some German suffixes like "str." where the final "." would be deleted in
 token normalization, but the suffix length would include it. Also, three of
 the new arrays used in address_parser_context (suffix_phrases,
 prefix_phrases, and sub_tokens) weren't being cleared per call, which means
 computing the wrong features at best and a segfault at worst.

---
 src/address_parser.c | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/src/address_parser.c b/src/address_parser.c
index c486de84..a31e1f1a 100644
--- a/src/address_parser.c
+++ b/src/address_parser.c
@@ -706,13 +706,17 @@ void address_parser_context_fill(address_parser_context_t *context, address_pars
     bool have_address_phrases = search_address_dictionaries_tokens_with_phrases(normalized_str, normalized_tokens, NULL, &address_dictionary_phrases);
     token_phrase_memberships(address_dictionary_phrases, address_phrase_memberships, num_tokens);
 
+    phrase_array_clear(context->prefix_phrases);
+    phrase_array_clear(context->suffix_phrases);
+
     for (size_t i = 0; i < num_tokens; i++) {
         token_t token = tokens->a[i];
+        char *word_pre_norm = tokenized_string_get_token(tokenized_str, i);
 
-        phrase_t prefix_phrase = search_address_dictionaries_prefix(str + token.offset, token.len, NULL);
+        phrase_t prefix_phrase = search_address_dictionaries_prefix(word_pre_norm, token.len, NULL);
         phrase_array_push(context->prefix_phrases, prefix_phrase);
 
-        phrase_t suffix_phrase = search_address_dictionaries_suffix(str + token.offset, token.len, NULL);
+        phrase_t suffix_phrase = search_address_dictionaries_suffix(word_pre_norm, token.len, NULL);
         phrase_array_push(context->suffix_phrases, suffix_phrase);
     }
 
@@ -1050,14 +1054,14 @@ bool address_parser_features(void *self, void *ctx, tokenized_string_t *tokenize
     ssize_t last_index = (ssize_t)idx - 1;
     ssize_t next_index = (ssize_t)idx + 1;
 
+    char *word_pre_norm = tokenized_string_get_token(tokenized, idx);
+
     char *word = cstring_array_get_string(normalized, idx);
     if (word == NULL) {
         log_error("got NULL word at %d\n", idx);
         return false;
     }
 
-    char *word_pre_norm = tokenized_string_get_token(tokenized, idx);
-
     size_t word_len = strlen(word);
 
     log_debug("word=%s\n", word);
@@ -1307,7 +1311,9 @@ bool address_parser_features(void *self, void *ctx, tokenized_string_t *tokenize
                 known_suffix = true;
                 char_array_clear(context->suffix_phrase);
                 suffix_len = suffix_phrase.len;
-                char_array_add_len(context->suffix_phrase, word_pre_norm + (token.len - suffix_phrase.len), suffix_len);
+                size_t word_pre_norm_len = cstring_array_token_length(tokenized->strings, idx);
+                size_t suffix_offset = word_pre_norm_len - suffix_len;
+                char_array_add_len(context->suffix_phrase, word_pre_norm + suffix_offset, suffix_len);
                 suffix = char_array_get_string(context->suffix_phrase);
                 log_debug("got suffix: %s\n", suffix);
                 feature_array_add(features, 2, "suffix", suffix);
@@ -1326,6 +1332,8 @@ bool address_parser_features(void *self, void *ctx, tokenized_string_t *tokenize
 
             ssize_t next_hyphen_index;
 
+            token_array_clear(context->sub_tokens);
+
             do {
                 next_hyphen_index = string_next_hyphen_index(word + hyphenated_word_offset, word_len - hyphenated_word_offset);
                 char *sub_word = word;