[expand] In cases like "Avenue D", where there are two phrases and one of them is ambiguous (and canonical) but not necessarily edge-ignorable (pre-/post-directional), allow deletion of the other token ("Avenue" in this case). Also allow skipping in cases where the language classifier predicts a second language with some small probability, such as French (in addition to English) for a short string like "Avenue D": if the token was ignorable in the highest-probability language, ignore it in both.

This commit is contained in:
Al
2017-12-17 17:22:37 -05:00
parent 3f7abd5b24
commit 9eef46adee

View File

@@ -803,6 +803,8 @@ string_tree_t *add_string_alternatives_phrase_option(char *str, libpostal_normal
log_debug("have_strictly_ignorable_abbreviation = %d\n", have_strictly_ignorable_abbreviation); log_debug("have_strictly_ignorable_abbreviation = %d\n", have_strictly_ignorable_abbreviation);
} }
bool skipped_last_edge_phrase = false;
for (size_t i = 0; i < phrases->n; i++) { for (size_t i = 0; i < phrases->n; i++) {
phrase_lang = phrases->a[i]; phrase_lang = phrases->a[i];
@@ -882,19 +884,21 @@ string_tree_t *add_string_alternatives_phrase_option(char *str, libpostal_normal
address_expansion_array *expansions = value->expansions; address_expansion_array *expansions = value->expansions;
if (expansions != NULL) { if (expansions != NULL) {
bool current_phrase_have_ambiguous = address_phrase_in_dictionary(phrase, DICTIONARY_AMBIGUOUS_EXPANSION); bool current_phrase_have_ambiguous = delete_phrases && address_phrase_in_dictionary(phrase, DICTIONARY_AMBIGUOUS_EXPANSION);
bool added_pre_phrase_space = false; bool added_pre_phrase_space = false;
bool current_phrase_have_ignorable = delete_phrases && address_phrase_is_ignorable_for_components(phrase, options.address_components); bool current_phrase_have_ignorable = delete_phrases && address_phrase_is_ignorable_for_components(phrase, options.address_components);
bool current_phrase_have_edge_ignorable = false; bool current_phrase_have_edge_ignorable = false;
bool current_phrase_have_unambiguous = address_phrase_contains_unambiguous_expansion(phrase); bool current_phrase_have_canonical = delete_phrases && address_phrase_has_canonical_interpretation(phrase);
bool current_phrase_have_unambiguous = delete_phrases && address_phrase_contains_unambiguous_expansion(phrase);
/* /*
Edge phrase handling. This is primarily for handling pre-directionals/post-directionals Edge phrase handling. This is primarily for handling pre-directionals/post-directionals
in English and other languages. in English and other languages.
*/ */
bool skip_edge_phrase = false; bool skip_edge_phrase = false;
bool other_phrase_have_edge_ignorable = false; bool other_phrase_is_ignorable = false;
if (delete_phrases) { if (delete_phrases) {
phrase_language_t other_phrase_lang; phrase_language_t other_phrase_lang;
@@ -918,19 +922,21 @@ string_tree_t *add_string_alternatives_phrase_option(char *str, libpostal_normal
if (other_phrase.start >= phrase.start + phrase.len && string_equals(other_phrase_lang.language, phrase_lang.language)) { if (other_phrase.start >= phrase.start + phrase.len && string_equals(other_phrase_lang.language, phrase_lang.language)) {
if (other_phrase.start + other_phrase.len == num_tokens) { if (other_phrase.start + other_phrase.len == num_tokens) {
skip_edge_phrase = false; skip_edge_phrase = false;
if (current_phrase_have_edge_ignorable) { if (current_phrase_have_edge_ignorable || (current_phrase_have_ambiguous && current_phrase_have_canonical)) {
// don't delete the "E" in "E St" // don't delete the "E" in "E St"
log_debug("initial phrase is edge ignorable out of two phrases. Checking next phrase is ignorable.\n"); log_debug("initial phrase is edge ignorable out of two phrases. Checking next phrase is ignorable.\n");
skip_edge_phrase = !(address_phrase_is_ignorable_for_components(other_phrase, options.address_components) && !(address_phrase_has_canonical_interpretation(other_phrase) && address_phrase_is_possible_root_for_components(other_phrase, options.address_components))); skip_edge_phrase = !(address_phrase_is_ignorable_for_components(other_phrase, options.address_components) && !(address_phrase_has_canonical_interpretation(other_phrase) && address_phrase_is_possible_root_for_components(other_phrase, options.address_components)));
log_debug("skip_edge_phrase = %d\n", skip_edge_phrase);
} else { } else {
log_debug("initial phrase is not edge-ignorable out of two phrases. Checking next phrase is edge ignorable.\n"); log_debug("initial phrase is not edge-ignorable out of two phrases. Checking next phrase is edge ignorable.\n");
// delete "Avenue" in "Avenue E" // delete "Avenue" in "Avenue E"
other_phrase_have_edge_ignorable = address_phrase_is_edge_ignorable_for_components(other_phrase, options.address_components); other_phrase_is_ignorable = address_phrase_is_edge_ignorable_for_components(other_phrase, options.address_components) || (address_phrase_in_dictionary(other_phrase, DICTIONARY_AMBIGUOUS_EXPANSION) && address_phrase_has_canonical_interpretation(other_phrase));
skip_edge_phrase = other_phrase_have_edge_ignorable && address_phrase_is_ignorable_for_components(phrase, options.address_components) && !(address_phrase_has_canonical_interpretation(phrase) && address_phrase_is_possible_root_for_components(phrase, options.address_components)); skip_edge_phrase = other_phrase_is_ignorable && address_phrase_is_ignorable_for_components(phrase, options.address_components) && !(address_phrase_has_canonical_interpretation(phrase) && address_phrase_is_possible_root_for_components(phrase, options.address_components));
} }
} else { } else {
// If we encounter an ignorable phrase // If we encounter an ignorable phrase like St and we're _not_ the end of the string e.g. "E St SE", this is probably a legit token instead of a pre-directional
skip_edge_phrase = address_phrase_is_possible_root_for_components(other_phrase, options.address_components) && address_phrase_has_canonical_interpretation(other_phrase); skip_edge_phrase = address_phrase_is_possible_root_for_components(other_phrase, options.address_components) && address_phrase_has_canonical_interpretation(other_phrase);
log_debug("phrase is possible root = %d\n", skip_edge_phrase); log_debug("phrase is possible root = %d\n", skip_edge_phrase);
} }
@@ -956,16 +962,15 @@ string_tree_t *add_string_alternatives_phrase_option(char *str, libpostal_normal
if (other_phrase.start == 0) { if (other_phrase.start == 0) {
//other_phrase_invalid = address_phrase_is_ignorable_for_components(other_phrase, options.address_components) && !address_phrase_has_canonical_interpretation(other_phrase) && !address_phrase_is_possible_root_for_components(other_phrase, options.address_components); //other_phrase_invalid = address_phrase_is_ignorable_for_components(other_phrase, options.address_components) && !address_phrase_has_canonical_interpretation(other_phrase) && !address_phrase_is_possible_root_for_components(other_phrase, options.address_components);
skip_edge_phrase = false; skip_edge_phrase = false;
if (current_phrase_have_edge_ignorable) { if (current_phrase_have_edge_ignorable || (current_phrase_have_ambiguous && current_phrase_have_canonical)) {
// don't delete the "E" in "Avenue E" // don't delete the "E" in "Avenue E"
log_debug("final phrase is edge ignorable out of two phrases. Checking previous phrase is ignorable.\n"); log_debug("final phrase is edge ignorable out of two phrases. Checking previous phrase is ignorable.\n");
skip_edge_phrase = !(address_phrase_is_ignorable_for_components(other_phrase, options.address_components) && !(address_phrase_has_canonical_interpretation(other_phrase) && address_phrase_is_possible_root_for_components(other_phrase, options.address_components))); skip_edge_phrase = !(address_phrase_is_ignorable_for_components(other_phrase, options.address_components) && !(address_phrase_has_canonical_interpretation(other_phrase) && address_phrase_is_possible_root_for_components(other_phrase, options.address_components)));
//skip_edge_phrase = !other_phrase_invalid;
} else { } else {
log_debug("final phrase is not edge-ignorable out of two phrases. Checking previous phrase is edge ignorable.\n"); log_debug("final phrase is not edge-ignorable out of two phrases. Checking previous phrase is edge ignorable.\n");
// delete "St" in "E St" // delete "St" in "E St"
other_phrase_have_edge_ignorable = address_phrase_is_edge_ignorable_for_components(other_phrase, options.address_components); other_phrase_is_ignorable = address_phrase_is_edge_ignorable_for_components(other_phrase, options.address_components) || (address_phrase_in_dictionary(other_phrase, DICTIONARY_AMBIGUOUS_EXPANSION) && address_phrase_has_canonical_interpretation(other_phrase));
skip_edge_phrase = other_phrase_have_edge_ignorable && address_phrase_is_ignorable_for_components(phrase, options.address_components) && !(address_phrase_has_canonical_interpretation(phrase) && address_phrase_is_possible_root_for_components(phrase, options.address_components)); skip_edge_phrase = other_phrase_is_ignorable && address_phrase_is_ignorable_for_components(phrase, options.address_components) && !(address_phrase_has_canonical_interpretation(phrase) && address_phrase_is_possible_root_for_components(phrase, options.address_components));
//skip_edge_phrase = address_phrase_is_edge_ignorable_for_components(other_phrase, options.address_components); //skip_edge_phrase = address_phrase_is_edge_ignorable_for_components(other_phrase, options.address_components);
} }
} }
@@ -976,10 +981,17 @@ string_tree_t *add_string_alternatives_phrase_option(char *str, libpostal_normal
} }
} }
if (phrase.start == prev_phrase.start && phrase.len == prev_phrase.len && skipped_last_edge_phrase) {
skip_edge_phrase = true;
}
for (size_t j = 0; j < expansions->n; j++) { for (size_t j = 0; j < expansions->n; j++) {
if (skip_edge_phrase) { if (skip_edge_phrase) {
skipped_last_edge_phrase = true;
log_debug("skip edge phrase\n"); log_debug("skip edge phrase\n");
continue; continue;
} else {
skipped_last_edge_phrase = false;
} }
address_expansion_t expansion = expansions->a[j]; address_expansion_t expansion = expansions->a[j];
@@ -998,7 +1010,7 @@ string_tree_t *add_string_alternatives_phrase_option(char *str, libpostal_normal
current_phrase_expandable = current_phrase_expandable || current_phrase_have_ambiguous; current_phrase_expandable = current_phrase_expandable || current_phrase_have_ambiguous;
// Edge phrase calculations from above // Edge phrase calculations from above
if (current_phrase_have_edge_ignorable || other_phrase_have_edge_ignorable) { if (current_phrase_have_edge_ignorable || other_phrase_is_ignorable) {
log_debug("current_phrase_have_edge_ignorable\n"); log_debug("current_phrase_have_edge_ignorable\n");
log_debug("skip_edge_phrase = %d\n", skip_edge_phrase); log_debug("skip_edge_phrase = %d\n", skip_edge_phrase);
current_phrase_ignorable = skip_edge_phrase; current_phrase_ignorable = skip_edge_phrase;