From 88b8023ac8b4dc5c47e2787f23365446b18790da Mon Sep 17 00:00:00 2001 From: Al Date: Thu, 10 Dec 2015 18:42:28 -0500 Subject: [PATCH] [fix] Bug in address parser feature extraction, can hold onto the wrong pointer --- src/address_parser.c | 29 +++++++++++------------------ 1 file changed, 11 insertions(+), 18 deletions(-) diff --git a/src/address_parser.c b/src/address_parser.c index fb8a4b00..2e6ee2cd 100644 --- a/src/address_parser.c +++ b/src/address_parser.c @@ -510,8 +510,6 @@ bool address_parser_features(void *self, void *ctx, tokenized_string_t *tokenize cstring_array_clear(features); - char *original_word = tokenized_string_get_token(tokenized, i); - token_t token = tokenized->tokens->a[i]; ssize_t last_index = (ssize_t)i - 1; @@ -539,6 +537,8 @@ bool address_parser_features(void *self, void *ctx, tokenized_string_t *tokenize char_array *phrase_tokens = context->phrase; + bool add_word_feature = true; + // Address dictionary phrases if (address_phrase_index != NULL_PHRASE_MEMBERSHIP) { phrase = address_dictionary_phrases->a[address_phrase_index]; @@ -555,26 +555,22 @@ bool address_parser_features(void *self, void *ctx, tokenized_string_t *tokenize if (address_phrase_types & (ADDRESS_STREET | ADDRESS_NAME)) { phrase_string = get_phrase_string(tokenized, phrase_tokens, phrase); - if (phrase_string != NULL) { - word = phrase_string; - } - + add_word_feature = false; log_debug("phrase_string=%s\n", phrase_string); add_phrase_features(features, address_phrase_types, ADDRESS_STREET, "street", phrase_string, prev2, prev); add_phrase_features(features, address_phrase_types, ADDRESS_NAME, "name", phrase_string, prev2, prev); - } } // Prefixes like hinter, etc. - phrase_t prefix_phrase = search_address_dictionaries_prefix(original_word, token.len, language); + phrase_t prefix_phrase = search_address_dictionaries_prefix(word, token.len, language); if (prefix_phrase.len > 0) { expansion.value = prefix_phrase.data; // Don't include elisions like l', d', etc. which are in the ADDRESS_ANY category if (expansion.components ^ ADDRESS_ANY) { char_array_clear(phrase_tokens); - char_array_add_len(phrase_tokens, original_word, prefix_phrase.len); + char_array_add_len(phrase_tokens, word, prefix_phrase.len); char *prefix = char_array_get_string(phrase_tokens); log_debug("got prefix: %s\n", prefix); feature_array_add(features, 2, "prefix", prefix); @@ -582,20 +578,18 @@ bool address_parser_features(void *self, void *ctx, tokenized_string_t *tokenize } // Suffixes like straße, etc. - phrase_t suffix_phrase = search_address_dictionaries_suffix(original_word, token.len, language); + phrase_t suffix_phrase = search_address_dictionaries_suffix(word, token.len, language); if (suffix_phrase.len > 0) { expansion.value = suffix_phrase.data; if (expansion.components & ADDRESS_STREET) { char_array_clear(phrase_tokens); - char_array_add_len(phrase_tokens, original_word + (token.len - suffix_phrase.len), suffix_phrase.len); + char_array_add_len(phrase_tokens, word + (token.len - suffix_phrase.len), suffix_phrase.len); char *suffix = char_array_get_string(phrase_tokens); log_debug("got suffix: %s\n", suffix); feature_array_add(features, 2, "suffix", suffix); } } - bool add_word_feature = true; - int64_t component_phrase_index = component_phrase_memberships->a[i]; phrase = NULL_PHRASE; @@ -688,7 +682,7 @@ bool address_parser_features(void *self, void *ctx, tokenized_string_t *tokenize } } - + printf("word=%s, len=%zu\n", word, strlen(word)); uint32_t word_freq = word_vocab_frequency(parser, word); if (add_word_feature) { @@ -699,17 +693,17 @@ bool address_parser_features(void *self, void *ctx, tokenized_string_t *tokenize // The individual word feature_array_add(features, 2, "word", word); } else { - log_debug("word not in vocab: %s\n", original_word); + log_debug("word not in vocab: %s\n", word); word = (token.type != NUMERIC && token.type != IDEOGRAPHIC_NUMBER) ? UNKNOWN_WORD : UNKNOWN_NUMERIC; } } else if (component_phrase_string != NULL) { word = component_phrase_string; } else if (geo_phrase_string != NULL) { word = geo_phrase_string; + } else if (phrase_string != NULL) { + word = phrase_string; } - - if (prev != NULL && last_index == i - 1) { // Previous tag and current word feature_array_add(features, 3, "i-1 tag+word", prev, word); @@ -825,7 +819,6 @@ address_parser_response_t *address_parser_parse(char *address, char *language, c averaged_perceptron_t *model = parser->model; token_array *tokens = tokenize(normalized); - char_array *token_array = char_array_new(); tokenized_string_t *tokenized_str = tokenized_string_new_from_str_size(normalized, strlen(normalized), tokens->n);