[parser] only adding ngrams in a hyphenated word if the subword is rare
This commit is contained in:
@@ -775,7 +775,6 @@ static bool add_ngram_features(cstring_array *features, char *feature_prefix, cs
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
address_parser_features
|
address_parser_features
|
||||||
-----------------------
|
-----------------------
|
||||||
@@ -1008,6 +1007,7 @@ bool address_parser_features(void *self, void *ctx, tokenized_string_t *tokenize
|
|||||||
|
|
||||||
// For rare words and unknown words (so unknown words can benefit from statistics of known but super common words)
|
// For rare words and unknown words (so unknown words can benefit from statistics of known but super common words)
|
||||||
if (word_freq <= parser->options.rare_word_threshold && is_word) {
|
if (word_freq <= parser->options.rare_word_threshold && is_word) {
|
||||||
|
log_debug("rare word: %s\n", word);
|
||||||
bool ngrams_added = false;
|
bool ngrams_added = false;
|
||||||
size_t hyphenated_word_offset = 0;
|
size_t hyphenated_word_offset = 0;
|
||||||
bool first_sub_token = true;
|
bool first_sub_token = true;
|
||||||
@@ -1039,20 +1039,26 @@ bool address_parser_features(void *self, void *ctx, tokenized_string_t *tokenize
|
|||||||
bool add_prefix = first_sub_token && prefix_len < sub_word_len;
|
bool add_prefix = first_sub_token && prefix_len < sub_word_len;
|
||||||
bool add_suffix = last_sub_token && suffix_len < sub_word_len;
|
bool add_suffix = last_sub_token && suffix_len < sub_word_len;
|
||||||
|
|
||||||
|
uint32_t sub_word_freq = word_freq;
|
||||||
if (is_hyphenated) {
|
if (is_hyphenated) {
|
||||||
uint32_t sub_word_freq = word_vocab_frequency(parser, sub_word);
|
sub_word_freq = word_vocab_frequency(parser, sub_word);
|
||||||
if (sub_word_freq > 0) {
|
if (sub_word_freq > 0) {
|
||||||
feature_array_add(features, 2, "sub_word", sub_word);
|
feature_array_add(features, 2, "sub_word", sub_word);
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// N-gram features from 3-6 characters
|
if (sub_word_freq <= parser->options.rare_word_threshold) {
|
||||||
|
// prefix/suffix features from 3-6 characters
|
||||||
for (size_t ng = 3; ng <= 6; ng++) {
|
for (size_t ng = 3; ng <= 6; ng++) {
|
||||||
ngrams_added = add_ngram_features(features, is_hyphenated ? "sub_word" : "word", context->ngrams, sub_word, ng, add_prefix ? prefix_len : 0, add_suffix ? suffix_len : 0);
|
ngrams_added = add_ngram_features(features, is_hyphenated ? "sub_word" : "word", context->ngrams, sub_word, ng, add_prefix ? prefix_len : 0, add_suffix ? suffix_len : 0);
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
hyphenated_word_offset += next_hyphen_index + 1;
|
hyphenated_word_offset += next_hyphen_index + 1;
|
||||||
first_sub_token = false;
|
first_sub_token = false;
|
||||||
|
|
||||||
|
log_debug("next_hyphen_index=%d\n", next_hyphen_index);
|
||||||
} while(next_hyphen_index >= 0);
|
} while(next_hyphen_index >= 0);
|
||||||
|
|
||||||
}
|
}
|
||||||
@@ -1143,6 +1149,7 @@ bool address_parser_features(void *self, void *ctx, tokenized_string_t *tokenize
|
|||||||
|
|
||||||
// Prev tag, current word and next word
|
// Prev tag, current word and next word
|
||||||
//feature_array_add(features, 4, "prev tag+word+next word", prev || "START", word, next_word);
|
//feature_array_add(features, 4, "prev tag+word+next word", prev || "START", word, next_word);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (parser->options.print_features) {
|
if (parser->options.print_features) {
|
||||||
|
|||||||
Reference in New Issue
Block a user