[fix] Bug in address parser feature extraction, which could hold onto the wrong pointer

This commit is contained in:
Al
2015-12-10 18:42:28 -05:00
parent 3de59506ae
commit 88b8023ac8

View File

@@ -510,8 +510,6 @@ bool address_parser_features(void *self, void *ctx, tokenized_string_t *tokenize
cstring_array_clear(features);
char *original_word = tokenized_string_get_token(tokenized, i);
token_t token = tokenized->tokens->a[i];
ssize_t last_index = (ssize_t)i - 1;
@@ -539,6 +537,8 @@ bool address_parser_features(void *self, void *ctx, tokenized_string_t *tokenize
char_array *phrase_tokens = context->phrase;
bool add_word_feature = true;
// Address dictionary phrases
if (address_phrase_index != NULL_PHRASE_MEMBERSHIP) {
phrase = address_dictionary_phrases->a[address_phrase_index];
@@ -555,26 +555,22 @@ bool address_parser_features(void *self, void *ctx, tokenized_string_t *tokenize
if (address_phrase_types & (ADDRESS_STREET | ADDRESS_NAME)) {
phrase_string = get_phrase_string(tokenized, phrase_tokens, phrase);
if (phrase_string != NULL) {
word = phrase_string;
}
add_word_feature = false;
log_debug("phrase_string=%s\n", phrase_string);
add_phrase_features(features, address_phrase_types, ADDRESS_STREET, "street", phrase_string, prev2, prev);
add_phrase_features(features, address_phrase_types, ADDRESS_NAME, "name", phrase_string, prev2, prev);
}
}
// Prefixes like hinter, etc.
phrase_t prefix_phrase = search_address_dictionaries_prefix(original_word, token.len, language);
phrase_t prefix_phrase = search_address_dictionaries_prefix(word, token.len, language);
if (prefix_phrase.len > 0) {
expansion.value = prefix_phrase.data;
// Don't include elisions like l', d', etc. which are in the ADDRESS_ANY category
if (expansion.components ^ ADDRESS_ANY) {
char_array_clear(phrase_tokens);
char_array_add_len(phrase_tokens, original_word, prefix_phrase.len);
char_array_add_len(phrase_tokens, word, prefix_phrase.len);
char *prefix = char_array_get_string(phrase_tokens);
log_debug("got prefix: %s\n", prefix);
feature_array_add(features, 2, "prefix", prefix);
@@ -582,20 +578,18 @@ bool address_parser_features(void *self, void *ctx, tokenized_string_t *tokenize
}
// Suffixes like straße, etc.
phrase_t suffix_phrase = search_address_dictionaries_suffix(original_word, token.len, language);
phrase_t suffix_phrase = search_address_dictionaries_suffix(word, token.len, language);
if (suffix_phrase.len > 0) {
expansion.value = suffix_phrase.data;
if (expansion.components & ADDRESS_STREET) {
char_array_clear(phrase_tokens);
char_array_add_len(phrase_tokens, original_word + (token.len - suffix_phrase.len), suffix_phrase.len);
char_array_add_len(phrase_tokens, word + (token.len - suffix_phrase.len), suffix_phrase.len);
char *suffix = char_array_get_string(phrase_tokens);
log_debug("got suffix: %s\n", suffix);
feature_array_add(features, 2, "suffix", suffix);
}
}
bool add_word_feature = true;
int64_t component_phrase_index = component_phrase_memberships->a[i];
phrase = NULL_PHRASE;
@@ -688,7 +682,7 @@ bool address_parser_features(void *self, void *ctx, tokenized_string_t *tokenize
}
}
printf("word=%s, len=%zu\n", word, strlen(word));
uint32_t word_freq = word_vocab_frequency(parser, word);
if (add_word_feature) {
@@ -699,17 +693,17 @@ bool address_parser_features(void *self, void *ctx, tokenized_string_t *tokenize
// The individual word
feature_array_add(features, 2, "word", word);
} else {
log_debug("word not in vocab: %s\n", original_word);
log_debug("word not in vocab: %s\n", word);
word = (token.type != NUMERIC && token.type != IDEOGRAPHIC_NUMBER) ? UNKNOWN_WORD : UNKNOWN_NUMERIC;
}
} else if (component_phrase_string != NULL) {
word = component_phrase_string;
} else if (geo_phrase_string != NULL) {
word = geo_phrase_string;
} else if (phrase_string != NULL) {
word = phrase_string;
}
if (prev != NULL && last_index == i - 1) {
// Previous tag and current word
feature_array_add(features, 3, "i-1 tag+word", prev, word);
@@ -825,7 +819,6 @@ address_parser_response_t *address_parser_parse(char *address, char *language, c
averaged_perceptron_t *model = parser->model;
token_array *tokens = tokenize(normalized);
char_array *token_array = char_array_new();
tokenized_string_t *tokenized_str = tokenized_string_new_from_str_size(normalized, strlen(normalized), tokens->n);