[expand] In cases like "Avenue D", where there are two phrases and one of them is ambiguous (and canonical) but not necessarily edge-ignorable (i.e. not a pre/post-directional), allow deletion of the other token ("Avenue" in this case). This also allows skipping in cases where the language classifier predicts a second language with some small probability, such as French (in addition to English) for a short string like "Avenue D": if the token was ignorable in the highest-probability language, ignore it in both.

This commit is contained in:
Al
2017-12-17 17:22:37 -05:00
parent 3f7abd5b24
commit 9eef46adee

View File

@@ -803,6 +803,8 @@ string_tree_t *add_string_alternatives_phrase_option(char *str, libpostal_normal
log_debug("have_strictly_ignorable_abbreviation = %d\n", have_strictly_ignorable_abbreviation);
}
bool skipped_last_edge_phrase = false;
for (size_t i = 0; i < phrases->n; i++) {
phrase_lang = phrases->a[i];
@@ -882,19 +884,21 @@ string_tree_t *add_string_alternatives_phrase_option(char *str, libpostal_normal
address_expansion_array *expansions = value->expansions;
if (expansions != NULL) {
bool current_phrase_have_ambiguous = address_phrase_in_dictionary(phrase, DICTIONARY_AMBIGUOUS_EXPANSION);
bool current_phrase_have_ambiguous = delete_phrases && address_phrase_in_dictionary(phrase, DICTIONARY_AMBIGUOUS_EXPANSION);
bool added_pre_phrase_space = false;
bool current_phrase_have_ignorable = delete_phrases && address_phrase_is_ignorable_for_components(phrase, options.address_components);
bool current_phrase_have_edge_ignorable = false;
bool current_phrase_have_unambiguous = address_phrase_contains_unambiguous_expansion(phrase);
bool current_phrase_have_canonical = delete_phrases && address_phrase_has_canonical_interpretation(phrase);
bool current_phrase_have_unambiguous = delete_phrases && address_phrase_contains_unambiguous_expansion(phrase);
/*
Edge phrase handling. This is primarily for handling pre-directionals/post-directionals
in English and other languages.
*/
bool skip_edge_phrase = false;
bool other_phrase_have_edge_ignorable = false;
bool other_phrase_is_ignorable = false;
if (delete_phrases) {
phrase_language_t other_phrase_lang;
@@ -918,19 +922,21 @@ string_tree_t *add_string_alternatives_phrase_option(char *str, libpostal_normal
if (other_phrase.start >= phrase.start + phrase.len && string_equals(other_phrase_lang.language, phrase_lang.language)) {
if (other_phrase.start + other_phrase.len == num_tokens) {
skip_edge_phrase = false;
if (current_phrase_have_edge_ignorable) {
if (current_phrase_have_edge_ignorable || (current_phrase_have_ambiguous && current_phrase_have_canonical)) {
// don't delete the "E" in "E St"
log_debug("initial phrase is edge ignorable out of two phrases. Checking next phrase is ignorable.\n");
skip_edge_phrase = !(address_phrase_is_ignorable_for_components(other_phrase, options.address_components) && !(address_phrase_has_canonical_interpretation(other_phrase) && address_phrase_is_possible_root_for_components(other_phrase, options.address_components)));
log_debug("skip_edge_phrase = %d\n", skip_edge_phrase);
} else {
log_debug("initial phrase is not edge-ignorable out of two phrases. Checking next phrase is edge ignorable.\n");
// delete "Avenue" in "Avenue E"
other_phrase_have_edge_ignorable = address_phrase_is_edge_ignorable_for_components(other_phrase, options.address_components);
skip_edge_phrase = other_phrase_have_edge_ignorable && address_phrase_is_ignorable_for_components(phrase, options.address_components) && !(address_phrase_has_canonical_interpretation(phrase) && address_phrase_is_possible_root_for_components(phrase, options.address_components));
other_phrase_is_ignorable = address_phrase_is_edge_ignorable_for_components(other_phrase, options.address_components) || (address_phrase_in_dictionary(other_phrase, DICTIONARY_AMBIGUOUS_EXPANSION) && address_phrase_has_canonical_interpretation(other_phrase));
skip_edge_phrase = other_phrase_is_ignorable && address_phrase_is_ignorable_for_components(phrase, options.address_components) && !(address_phrase_has_canonical_interpretation(phrase) && address_phrase_is_possible_root_for_components(phrase, options.address_components));
}
} else {
// If we encounter an ignorable phrase
// If we encounter an ignorable phrase like St and we're _not_ the end of the string e.g. "E St SE", this is probably a legit token instead of a pre-directional
skip_edge_phrase = address_phrase_is_possible_root_for_components(other_phrase, options.address_components) && address_phrase_has_canonical_interpretation(other_phrase);
log_debug("phrase is possible root = %d\n", skip_edge_phrase);
}
@@ -956,16 +962,15 @@ string_tree_t *add_string_alternatives_phrase_option(char *str, libpostal_normal
if (other_phrase.start == 0) {
//other_phrase_invalid = address_phrase_is_ignorable_for_components(other_phrase, options.address_components) && !address_phrase_has_canonical_interpretation(other_phrase) && !address_phrase_is_possible_root_for_components(other_phrase, options.address_components);
skip_edge_phrase = false;
if (current_phrase_have_edge_ignorable) {
if (current_phrase_have_edge_ignorable || (current_phrase_have_ambiguous && current_phrase_have_canonical)) {
// don't delete the "E" in "Avenue E"
log_debug("final phrase is edge ignorable out of two phrases. Checking previous phrase is ignorable.\n");
skip_edge_phrase = !(address_phrase_is_ignorable_for_components(other_phrase, options.address_components) && !(address_phrase_has_canonical_interpretation(other_phrase) && address_phrase_is_possible_root_for_components(other_phrase, options.address_components)));
//skip_edge_phrase = !other_phrase_invalid;
} else {
log_debug("final phrase is not edge-ignorable out of two phrases. Checking previous phrase is edge ignorable.\n");
// delete "St" in "E St"
other_phrase_have_edge_ignorable = address_phrase_is_edge_ignorable_for_components(other_phrase, options.address_components);
skip_edge_phrase = other_phrase_have_edge_ignorable && address_phrase_is_ignorable_for_components(phrase, options.address_components) && !(address_phrase_has_canonical_interpretation(phrase) && address_phrase_is_possible_root_for_components(phrase, options.address_components));
other_phrase_is_ignorable = address_phrase_is_edge_ignorable_for_components(other_phrase, options.address_components) || (address_phrase_in_dictionary(other_phrase, DICTIONARY_AMBIGUOUS_EXPANSION) && address_phrase_has_canonical_interpretation(other_phrase));
skip_edge_phrase = other_phrase_is_ignorable && address_phrase_is_ignorable_for_components(phrase, options.address_components) && !(address_phrase_has_canonical_interpretation(phrase) && address_phrase_is_possible_root_for_components(phrase, options.address_components));
//skip_edge_phrase = address_phrase_is_edge_ignorable_for_components(other_phrase, options.address_components);
}
}
@@ -976,10 +981,17 @@ string_tree_t *add_string_alternatives_phrase_option(char *str, libpostal_normal
}
}
if (phrase.start == prev_phrase.start && phrase.len == prev_phrase.len && skipped_last_edge_phrase) {
skip_edge_phrase = true;
}
for (size_t j = 0; j < expansions->n; j++) {
if (skip_edge_phrase) {
skipped_last_edge_phrase = true;
log_debug("skip edge phrase\n");
continue;
} else {
skipped_last_edge_phrase = false;
}
address_expansion_t expansion = expansions->a[j];
@@ -998,7 +1010,7 @@ string_tree_t *add_string_alternatives_phrase_option(char *str, libpostal_normal
current_phrase_expandable = current_phrase_expandable || current_phrase_have_ambiguous;
// Edge phrase calculations from above
if (current_phrase_have_edge_ignorable || other_phrase_have_edge_ignorable) {
if (current_phrase_have_edge_ignorable || other_phrase_is_ignorable) {
log_debug("current_phrase_have_edge_ignorable\n");
log_debug("skip_edge_phrase = %d\n", skip_edge_phrase);
current_phrase_ignorable = skip_edge_phrase;