[parsing] Using the entire phrase as the ith word
This commit is contained in:
@@ -702,8 +702,14 @@ bool address_parser_features(void *self, void *ctx, tokenized_string_t *tokenize
|
|||||||
log_debug("word not in vocab: %s\n", original_word);
|
log_debug("word not in vocab: %s\n", original_word);
|
||||||
word = (token.type != NUMERIC && token.type != IDEOGRAPHIC_NUMBER) ? UNKNOWN_WORD : UNKNOWN_NUMERIC;
|
word = (token.type != NUMERIC && token.type != IDEOGRAPHIC_NUMBER) ? UNKNOWN_WORD : UNKNOWN_NUMERIC;
|
||||||
}
|
}
|
||||||
|
} else if (component_phrase_string != NULL) {
|
||||||
|
word = component_phrase_string;
|
||||||
|
} else if (geo_phrase_string != NULL) {
|
||||||
|
word = geo_phrase_string;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
if (prev != NULL && last_index == i - 1) {
|
if (prev != NULL && last_index == i - 1) {
|
||||||
// Previous tag and current word
|
// Previous tag and current word
|
||||||
feature_array_add(features, 3, "i-1 tag+word", prev, word);
|
feature_array_add(features, 3, "i-1 tag+word", prev, word);
|
||||||
@@ -727,7 +733,10 @@ bool address_parser_features(void *self, void *ctx, tokenized_string_t *tokenize
|
|||||||
|
|
||||||
// Previous word
|
// Previous word
|
||||||
feature_array_add(features, 2, "i-1 word", prev_word);
|
feature_array_add(features, 2, "i-1 word", prev_word);
|
||||||
feature_array_add(features, 3, "i-1 tag+i-1 word", prev, prev_word);
|
|
||||||
|
if (last_index == i - 1) {
|
||||||
|
feature_array_add(features, 3, "i-1 tag+i-1 word", prev, prev_word);
|
||||||
|
}
|
||||||
|
|
||||||
// Previous word and current word
|
// Previous word and current word
|
||||||
feature_array_add(features, 3, "i-1 word+word", prev_word, word);
|
feature_array_add(features, 3, "i-1 word+word", prev_word, word);
|
||||||
|
|||||||
Reference in New Issue
Block a user