From 0286a2fef3b51d62dbdb4a4fa8656a73a1e8f9bc Mon Sep 17 00:00:00 2001 From: Al Date: Tue, 16 Jan 2018 03:00:12 -0500 Subject: [PATCH] [expand] for root expansions, delete ambiguous tokens only when there's a non-numeric non-phrase token present. This applies to all name components, but not to components where numerics can be the root (house numbers, units, streets, etc.) --- src/expand.c | 32 +++++++++++++++++++++++--------- 1 file changed, 23 insertions(+), 9 deletions(-) diff --git a/src/expand.c b/src/expand.c index dc184bce..e1fb5bb0 100644 --- a/src/expand.c +++ b/src/expand.c @@ -623,6 +623,8 @@ static inline uint32_t gazetteer_possible_root_components(uint16_t dictionary_id } } +static const uint16_t NUMERIC_ADDRESS_COMPONENTS = (LIBPOSTAL_ADDRESS_HOUSE_NUMBER | LIBPOSTAL_ADDRESS_UNIT | LIBPOSTAL_ADDRESS_LEVEL | LIBPOSTAL_ADDRESS_PO_BOX | LIBPOSTAL_ADDRESS_POSTAL_CODE | LIBPOSTAL_ADDRESS_STAIRCASE | LIBPOSTAL_ADDRESS_ENTRANCE | LIBPOSTAL_ADDRESS_STREET); + typedef enum { GAZETTEER_MATCH_IGNORABLE, GAZETTEER_MATCH_EDGE_IGNORABLE, @@ -831,6 +833,7 @@ string_tree_t *add_string_alternatives_phrase_option(char *str, libpostal_normal size_t num_phrases = phrases->n; bool have_non_phrase_tokens = false; + bool have_non_phrase_word_tokens = false; bool have_canonical_phrases = false; bool have_ambiguous = false; bool have_possible_root = false; @@ -853,6 +856,7 @@ string_tree_t *add_string_alternatives_phrase_option(char *str, libpostal_normal if (!is_punctuation(inter_token.type) && !is_whitespace(inter_token.type)) { log_debug("have_non_phrase_tokens\n"); have_non_phrase_tokens = true; + have_non_phrase_word_tokens = have_non_phrase_word_tokens || is_word_token(inter_token.type); break; } } @@ -863,6 +867,7 @@ string_tree_t *add_string_alternatives_phrase_option(char *str, libpostal_normal inter_token = tokens[j]; if (!is_punctuation(inter_token.type) && !is_whitespace(inter_token.type)) { have_non_phrase_tokens = true; + have_non_phrase_word_tokens = 
have_non_phrase_word_tokens || is_word_token(inter_token.type); break; } } @@ -873,6 +878,19 @@ string_tree_t *add_string_alternatives_phrase_option(char *str, libpostal_normal bool phrase_is_canonical = address_phrase_has_canonical_interpretation(phrase); have_non_phrase_tokens = have_non_phrase_tokens || (!phrase_is_strictly_ignorable && !phrase_is_ambiguous); + log_debug("have_non_phrase_word_tokens = %d, phrase_is_strictly_ignorable = %d, phrase_is_ambiguous = %d\n", have_non_phrase_word_tokens, phrase_is_strictly_ignorable, phrase_is_ambiguous); + if (!have_non_phrase_word_tokens && !phrase_is_strictly_ignorable && !phrase_is_ambiguous) { + for (size_t j = phrase.start; j < phrase.start + phrase.len; j++) { + token_t pt = tokens[j]; + if (is_word_token(pt.type)) { + log_debug("have_non_phrase_word_tokens\n"); + have_non_phrase_word_tokens = true; + break; + } + } + } + + have_strictly_ignorable = have_strictly_ignorable || phrase_is_strictly_ignorable; have_strictly_ignorable_abbreviation = have_strictly_ignorable_abbreviation || (phrase_is_strictly_ignorable && !phrase_is_canonical); if (have_strictly_ignorable_abbreviation) { @@ -884,10 +902,6 @@ string_tree_t *add_string_alternatives_phrase_option(char *str, libpostal_normal have_canonical_phrases = have_canonical_phrases || (phrase_is_canonical && !phrase_is_ambiguous); have_ambiguous = have_ambiguous || phrase_is_ambiguous; - if (have_non_phrase_tokens) { - break; - } - prev_phrase_end = phrase.start + phrase.len; } @@ -962,6 +976,8 @@ string_tree_t *add_string_alternatives_phrase_option(char *str, libpostal_normal bool expansion_valid_components = (value->components & options.address_components) || address_phrase_is_valid_for_components(phrase, options.address_components); + bool is_numeric_component = (value->components & options.address_components & NUMERIC_ADDRESS_COMPONENTS); + if (expansion_valid_components) { key->n = namespace_len; for (size_t j = phrase.start; j < phrase.start + phrase.len; j++) 
{ @@ -1136,11 +1152,9 @@ string_tree_t *add_string_alternatives_phrase_option(char *str, libpostal_normal log_debug("is_ignorable && !is_canonical && !current_phrase_have_ambiguous\n"); current_phrase_ignorable = have_non_phrase_tokens || (have_possible_root && !current_phrase_have_possible_root) || string_tree_num_tokens(tree) > 0; log_debug("current_phrase_ignorable = %d\n", current_phrase_ignorable); - } else if (current_phrase_have_ambiguous && (have_non_phrase_tokens || have_canonical_phrases || have_possible_root)) { - log_debug("have_non_phrase_tokens = %d, have_canonical_phrases = %d\n", have_non_phrase_tokens, have_canonical_phrases); - current_phrase_ignorable = (is_ignorable && !(have_possible_root && !current_phrase_have_possible_root)) || (current_phrase_have_ambiguous && have_non_phrase_tokens && current_phrase_have_ignorable && current_phrase_have_unambiguous); - - log_debug("current_phrase_have_ambiguous && have_non_phrase_tokens\n"); + } else if (current_phrase_have_ambiguous && (have_non_phrase_word_tokens || is_numeric_component || have_canonical_phrases || have_possible_root)) { + log_debug("current_phrase_have_ambiguous && have_non_phrase_tokens = %d, have_canonical_phrases = %d, have_possible_root = %d, have_non_phrase_word_tokens = %d, is_numeric_component = %d, have_non_phrase_tokens = %d\n", have_non_phrase_tokens, have_canonical_phrases, have_possible_root, have_non_phrase_word_tokens, is_numeric_component, have_non_phrase_tokens); + current_phrase_ignorable = (is_ignorable && !(have_possible_root && !current_phrase_have_possible_root)) || (current_phrase_have_ambiguous && (have_non_phrase_word_tokens || (is_numeric_component && have_non_phrase_tokens)) && current_phrase_have_ignorable && current_phrase_have_unambiguous); log_debug("current_phrase_ignorable = %d\n", current_phrase_ignorable); } else if (!is_valid_for_components && !is_ambiguous) { log_debug("!is_valid_for_components\n");