From 7a8f94330bac1474c3852cc24531f0c84124aba7 Mon Sep 17 00:00:00 2001 From: Al Date: Mon, 9 Jan 2017 02:53:33 -0500 Subject: [PATCH] [parser] only adding ngrams in a hyphenated word if the subword is rare --- src/address_parser.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/src/address_parser.c b/src/address_parser.c index e4a98f3c..e97d5aa2 100644 --- a/src/address_parser.c +++ b/src/address_parser.c @@ -775,7 +775,6 @@ static bool add_ngram_features(cstring_array *features, char *feature_prefix, cs return true; } - /* address_parser_features ----------------------- @@ -1008,6 +1007,7 @@ bool address_parser_features(void *self, void *ctx, tokenized_string_t *tokenize // For rare words and unknown words (so unknown words can benefit from statistics of known but super common words) if (word_freq <= parser->options.rare_word_threshold && is_word) { + log_debug("rare word: %s\n", word); bool ngrams_added = false; size_t hyphenated_word_offset = 0; bool first_sub_token = true; @@ -1039,20 +1039,26 @@ bool address_parser_features(void *self, void *ctx, tokenized_string_t *tokenize bool add_prefix = first_sub_token && prefix_len < sub_word_len; bool add_suffix = last_sub_token && suffix_len < sub_word_len; + uint32_t sub_word_freq = word_freq; if (is_hyphenated) { - uint32_t sub_word_freq = word_vocab_frequency(parser, sub_word); + sub_word_freq = word_vocab_frequency(parser, sub_word); if (sub_word_freq > 0) { feature_array_add(features, 2, "sub_word", sub_word); } + } - // N-gram features from 3-6 characters - for (size_t ng = 3; ng <= 6; ng++) { - ngrams_added = add_ngram_features(features, is_hyphenated ? "sub_word" : "word", context->ngrams, sub_word, ng, add_prefix ? prefix_len : 0, add_suffix ? suffix_len : 0); + if (sub_word_freq <= parser->options.rare_word_threshold) { + // prefix/suffix features from 3-6 characters + for (size_t ng = 3; ng <= 6; ng++) { + ngrams_added = add_ngram_features(features, is_hyphenated ? 
"sub_word" : "word", context->ngrams, sub_word, ng, add_prefix ? prefix_len : 0, add_suffix ? suffix_len : 0); + } } hyphenated_word_offset += next_hyphen_index + 1; first_sub_token = false; + + log_debug("next_hyphen_index=%d\n", next_hyphen_index); } while(next_hyphen_index >= 0); } @@ -1143,6 +1149,7 @@ bool address_parser_features(void *self, void *ctx, tokenized_string_t *tokenize // Prev tag, current word and next word //feature_array_add(features, 4, "prev tag+word+next word", prev || "START", word, next_word); + } if (parser->options.print_features) {